vm_page.c revision 232066
/*-
 * Copyright (c) 1991 Regents of the University of California.
 * All rights reserved.
 * Copyright (c) 1998 Matthew Dillon.  All Rights Reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)vm_page.c	7.4 (Berkeley) 5/7/91
 */

/*-
 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/*
 *			GENERAL RULES ON VM_PAGE MANIPULATION
 *
 *	- a pageq mutex is required when adding or removing a page from a
 *	  page queue (vm_page_queue[]), regardless of other mutexes or the
 *	  busy state of a page.
 *
 *	- The object mutex is held when inserting or removing
 *	  pages from an object (vm_page_insert() or vm_page_remove()).
 *
 */

/*
 *	Resident memory management module.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/9/sys/vm/vm_page.c 232066 2012-02-23 19:20:36Z kmacy $");

#include "opt_vm.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/msgbuf.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>

#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/vm_phys.h>
#include <vm/vm_reserv.h>
#include <vm/vm_extern.h>
#include <vm/uma.h>
#include <vm/uma_int.h>

#include <machine/md_var.h>

/*
 *	Associated with each page of user-allocatable memory is a
 *	page structure.
 */

struct vpgqueues vm_page_queues[PQ_COUNT];
struct vpglocks vm_page_queue_lock;
struct vpglocks vm_page_queue_free_lock;

struct vpglocks	pa_lock[PA_LOCK_COUNT];

vm_page_t vm_page_array = 0;
int vm_page_array_size = 0;
long first_page = 0;
int vm_page_zero_count = 0;

static int boot_pages = UMA_BOOT_PAGES;
TUNABLE_INT("vm.boot_pages", &boot_pages);
SYSCTL_INT(_vm, OID_AUTO, boot_pages, CTLFLAG_RD, &boot_pages, 0,
	"number of pages allocated for bootstrapping the VM system");

static int pa_tryrelock_restart;
SYSCTL_INT(_vm, OID_AUTO, tryrelock_restart, CTLFLAG_RD,
    &pa_tryrelock_restart, 0, "Number of tryrelock restarts");

static uma_zone_t fakepg_zone;

static void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits);
static void vm_page_queue_remove(int queue, vm_page_t m);
static void vm_page_enqueue(int queue, vm_page_t m);
static void vm_page_init_fakepg(void *dummy);

SYSINIT(vm_page, SI_SUB_VM, SI_ORDER_SECOND, vm_page_init_fakepg, NULL);

static void
vm_page_init_fakepg(void *dummy)
{

	fakepg_zone = uma_zcreate("fakepg", sizeof(struct vm_page), NULL, NULL,
	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE | UMA_ZONE_VM);
}

/* Make sure that u_long is at least 64 bits when PAGE_SIZE is 32K. */
#if PAGE_SIZE == 32768
#ifdef CTASSERT
CTASSERT(sizeof(u_long) >= 8);
#endif
#endif

/*
 * Try to acquire a physical address lock while a pmap is locked.  If we
 * fail to trylock we unlock and lock the pmap directly and cache the
 * locked pa in *locked.  The caller should then restart their loop in case
 * the virtual to physical mapping has changed.
 */
int
vm_page_pa_tryrelock(pmap_t pmap, vm_paddr_t pa, vm_paddr_t *locked)
{
	vm_paddr_t lockpa;

	lockpa = *locked;
	*locked = pa;
	if (lockpa) {
		PA_LOCK_ASSERT(lockpa, MA_OWNED);
		if (PA_LOCKPTR(pa) == PA_LOCKPTR(lockpa))
			return (0);
		PA_UNLOCK(lockpa);
	}
	if (PA_TRYLOCK(pa))
		return (0);
	PMAP_UNLOCK(pmap);
	atomic_add_int(&pa_tryrelock_restart, 1);
	PA_LOCK(pa);
	PMAP_LOCK(pmap);
	return (EAGAIN);
}
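
/*
 * Sketch of the expected caller pattern (illustrative; the translation step
 * shown via pmap_extract() is a stand-in for whatever MD lookup the caller
 * actually performs): when EAGAIN is returned the pmap lock was dropped and
 * retaken, so the virtual-to-physical translation must be re-derived before
 * retrying.
 *
 *	vm_paddr_t locked_pa = 0;
 * retry:
 *	pa = pmap_extract(pmap, va);
 *	if (vm_page_pa_tryrelock(pmap, pa, &locked_pa))
 *		goto retry;
 *	... operate on the page at "pa" with its PA lock held ...
 *	PA_UNLOCK_COND(locked_pa);
 */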

/*
 *	vm_set_page_size:
 *
 *	Sets the page size, perhaps based upon the memory
 *	size.  Must be called before any use of page-size
 *	dependent functions.
 */
void
vm_set_page_size(void)
{
	if (cnt.v_page_size == 0)
		cnt.v_page_size = PAGE_SIZE;
	if (((cnt.v_page_size - 1) & cnt.v_page_size) != 0)
		panic("vm_set_page_size: page size not a power of two");
}

/*
 *	vm_page_blacklist_lookup:
 *
 *	See if a physical address in this page has been listed
 *	in the blacklist tunable.  Entries in the tunable are
 *	separated by spaces or commas.  If an invalid integer is
 *	encountered then the rest of the string is skipped.
 */
static int
vm_page_blacklist_lookup(char *list, vm_paddr_t pa)
{
	vm_paddr_t bad;
	char *cp, *pos;

	for (pos = list; *pos != '\0'; pos = cp) {
		bad = strtoq(pos, &cp, 0);
		if (*cp != '\0') {
			if (*cp == ' ' || *cp == ',') {
				cp++;
				if (cp == pos)
					continue;
			} else
				break;
		}
		if (pa == trunc_page(bad))
			return (1);
	}
	return (0);
}
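
/*
 * Example of the "vm.blacklist" loader tunable that this parser consumes
 * (the addresses shown are hypothetical); entries are byte addresses,
 * separated by spaces or commas, and are truncated to page boundaries
 * before comparison:
 *
 *	vm.blacklist="0x12345678 0x23460000,0x34570000"
 */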

/*
 *	vm_page_startup:
 *
 *	Initializes the resident memory module.
 *
 *	Allocates memory for the page cells, and
 *	for the object/offset-to-page hash table headers.
 *	Each page cell is initialized and placed on the free list.
 */
vm_offset_t
vm_page_startup(vm_offset_t vaddr)
{
	vm_offset_t mapped;
	vm_paddr_t page_range;
	vm_paddr_t new_end;
	int i;
	vm_paddr_t pa;
	vm_paddr_t last_pa;
	char *list;

	/* the biggest memory array is the second group of pages */
	vm_paddr_t end;
	vm_paddr_t biggestsize;
	vm_paddr_t low_water, high_water;
	int biggestone;

	biggestsize = 0;
	biggestone = 0;
	vaddr = round_page(vaddr);

	for (i = 0; phys_avail[i + 1]; i += 2) {
		phys_avail[i] = round_page(phys_avail[i]);
		phys_avail[i + 1] = trunc_page(phys_avail[i + 1]);
	}

	low_water = phys_avail[0];
	high_water = phys_avail[1];

	for (i = 0; phys_avail[i + 1]; i += 2) {
		vm_paddr_t size = phys_avail[i + 1] - phys_avail[i];

		if (size > biggestsize) {
			biggestone = i;
			biggestsize = size;
		}
		if (phys_avail[i] < low_water)
			low_water = phys_avail[i];
		if (phys_avail[i + 1] > high_water)
			high_water = phys_avail[i + 1];
	}

#ifdef XEN
	low_water = 0;
#endif

	end = phys_avail[biggestone+1];

	/*
	 * Initialize the locks.
	 */
	mtx_init(&vm_page_queue_mtx, "vm page queue mutex", NULL, MTX_DEF |
	    MTX_RECURSE);
	mtx_init(&vm_page_queue_free_mtx, "vm page queue free mutex", NULL,
	    MTX_DEF);

	/* Setup page locks. */
	for (i = 0; i < PA_LOCK_COUNT; i++)
		mtx_init(&pa_lock[i].data, "page lock", NULL, MTX_DEF);

	/*
	 * Initialize the queue headers for the hold queue, the active queue,
	 * and the inactive queue.
	 */
	for (i = 0; i < PQ_COUNT; i++)
		TAILQ_INIT(&vm_page_queues[i].pl);
	vm_page_queues[PQ_INACTIVE].cnt = &cnt.v_inactive_count;
	vm_page_queues[PQ_ACTIVE].cnt = &cnt.v_active_count;
	vm_page_queues[PQ_HOLD].cnt = &cnt.v_active_count;

	/*
	 * Allocate memory for use when boot strapping the kernel memory
	 * allocator.
	 */
	new_end = end - (boot_pages * UMA_SLAB_SIZE);
	new_end = trunc_page(new_end);
	mapped = pmap_map(&vaddr, new_end, end,
	    VM_PROT_READ | VM_PROT_WRITE);
	bzero((void *)mapped, end - new_end);
	uma_startup((void *)mapped, boot_pages);

#if defined(__amd64__) || defined(__i386__) || defined(__arm__) || \
    defined(__mips__)
	/*
	 * Allocate a bitmap to indicate that a random physical page
	 * needs to be included in a minidump.
	 *
	 * The amd64 port needs this to indicate which direct map pages
	 * need to be dumped, via calls to dump_add_page()/dump_drop_page().
	 *
	 * However, i386 still needs this workspace internally within the
	 * minidump code.  In theory, they are not needed on i386, but are
	 * included should the sf_buf code decide to use them.
	 */
	last_pa = 0;
	for (i = 0; dump_avail[i + 1] != 0; i += 2)
		if (dump_avail[i + 1] > last_pa)
			last_pa = dump_avail[i + 1];
	page_range = last_pa / PAGE_SIZE;
	vm_page_dump_size = round_page(roundup2(page_range, NBBY) / NBBY);
	new_end -= vm_page_dump_size;
	vm_page_dump = (void *)(uintptr_t)pmap_map(&vaddr, new_end,
	    new_end + vm_page_dump_size, VM_PROT_READ | VM_PROT_WRITE);
	bzero((void *)vm_page_dump, vm_page_dump_size);
#endif
#ifdef __amd64__
	/*
	 * Request that the physical pages underlying the message buffer be
	 * included in a crash dump.  Since the message buffer is accessed
	 * through the direct map, they are not automatically included.
	 */
	pa = DMAP_TO_PHYS((vm_offset_t)msgbufp->msg_ptr);
	last_pa = pa + round_page(msgbufsize);
	while (pa < last_pa) {
		dump_add_page(pa);
		pa += PAGE_SIZE;
	}
#endif
	/*
	 * Compute the number of pages of memory that will be available for
	 * use (taking into account the overhead of a page structure per
	 * page).
	 */
	first_page = low_water / PAGE_SIZE;
#ifdef VM_PHYSSEG_SPARSE
	page_range = 0;
	for (i = 0; phys_avail[i + 1] != 0; i += 2)
		page_range += atop(phys_avail[i + 1] - phys_avail[i]);
#elif defined(VM_PHYSSEG_DENSE)
	page_range = high_water / PAGE_SIZE - first_page;
#else
#error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined."
#endif
	end = new_end;

	/*
	 * Reserve an unmapped guard page to trap access to vm_page_array[-1].
	 */
	vaddr += PAGE_SIZE;

	/*
	 * Initialize the mem entry structures now, and put them in the free
	 * queue.
	 */
	new_end = trunc_page(end - page_range * sizeof(struct vm_page));
	mapped = pmap_map(&vaddr, new_end, end,
	    VM_PROT_READ | VM_PROT_WRITE);
	vm_page_array = (vm_page_t) mapped;
#if VM_NRESERVLEVEL > 0
	/*
	 * Allocate memory for the reservation management system's data
	 * structures.
	 */
	new_end = vm_reserv_startup(&vaddr, new_end, high_water);
#endif
#if defined(__amd64__) || defined(__mips__)
	/*
	 * pmap_map on amd64 and mips can come out of the direct-map, not kvm
	 * like i386, so the pages must be tracked for a crashdump to include
	 * this data.  This includes the vm_page_array and the early UMA
	 * bootstrap pages.
	 */
	for (pa = new_end; pa < phys_avail[biggestone + 1]; pa += PAGE_SIZE)
		dump_add_page(pa);
#endif
	phys_avail[biggestone + 1] = new_end;

	/*
	 * Clear all of the page structures
	 */
	bzero((caddr_t) vm_page_array, page_range * sizeof(struct vm_page));
	for (i = 0; i < page_range; i++)
		vm_page_array[i].order = VM_NFREEORDER;
	vm_page_array_size = page_range;

	/*
	 * Initialize the physical memory allocator.
	 */
	vm_phys_init();

	/*
	 * Add every available physical page that is not blacklisted to
	 * the free lists.
	 */
	cnt.v_page_count = 0;
	cnt.v_free_count = 0;
	list = getenv("vm.blacklist");
	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
		pa = phys_avail[i];
		last_pa = phys_avail[i + 1];
		while (pa < last_pa) {
			if (list != NULL &&
			    vm_page_blacklist_lookup(list, pa))
				printf("Skipping page with pa 0x%jx\n",
				    (uintmax_t)pa);
			else
				vm_phys_add_page(pa);
			pa += PAGE_SIZE;
		}
	}
	freeenv(list);
#if VM_NRESERVLEVEL > 0
	/*
	 * Initialize the reservation management system.
	 */
	vm_reserv_init();
#endif
	return (vaddr);
}


CTASSERT(offsetof(struct vm_page, aflags) % sizeof(uint32_t) == 0);

void
vm_page_aflag_set(vm_page_t m, uint8_t bits)
{
	uint32_t *addr, val;

	/*
	 * The PGA_WRITEABLE flag can only be set if the page is managed and
	 * VPO_BUSY.  Currently, this flag is only set by pmap_enter().
	 */
	KASSERT((bits & PGA_WRITEABLE) == 0 ||
	    (m->oflags & (VPO_UNMANAGED | VPO_BUSY)) == VPO_BUSY,
	    ("PGA_WRITEABLE and !VPO_BUSY"));

	/*
	 * We want to use atomic updates for m->aflags, which is a
	 * byte wide.  Not all architectures provide atomic operations
	 * on the single-byte destination.  Punt and access the whole
	 * 4-byte word with an atomic update.  Parallel non-atomic
	 * updates to the fields included in the update by proximity
	 * are handled properly by atomics.
	 */
	addr = (void *)&m->aflags;
	MPASS(((uintptr_t)addr & (sizeof(uint32_t) - 1)) == 0);
	val = bits;
#if BYTE_ORDER == BIG_ENDIAN
	val <<= 24;
#endif
	atomic_set_32(addr, val);
}
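
/*
 * Note on the BYTE_ORDER shift above: the CTASSERT before this function
 * guarantees that "aflags" is the first byte of a naturally aligned 32-bit
 * word.  On a little-endian machine that byte is the low-order byte of the
 * word, so "bits" may be used unshifted; on a big-endian machine it is the
 * most significant byte, hence "val <<= 24" before the 32-bit atomic update
 * here and in vm_page_aflag_clear() below.
 */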

void
vm_page_aflag_clear(vm_page_t m, uint8_t bits)
{
	uint32_t *addr, val;

	/*
	 * The PGA_REFERENCED flag can only be cleared if the object
	 * containing the page is locked.
	 */
	KASSERT((bits & PGA_REFERENCED) == 0 || VM_OBJECT_LOCKED(m->object),
	    ("PGA_REFERENCED and !VM_OBJECT_LOCKED"));

	/*
	 * See the comment in vm_page_aflag_set().
	 */
	addr = (void *)&m->aflags;
	MPASS(((uintptr_t)addr & (sizeof(uint32_t) - 1)) == 0);
	val = bits;
#if BYTE_ORDER == BIG_ENDIAN
	val <<= 24;
#endif
	atomic_clear_32(addr, val);
}

void
vm_page_reference(vm_page_t m)
{

	vm_page_aflag_set(m, PGA_REFERENCED);
}

void
vm_page_busy(vm_page_t m)
{

	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
	KASSERT((m->oflags & VPO_BUSY) == 0,
	    ("vm_page_busy: page already busy!!!"));
	m->oflags |= VPO_BUSY;
}

/*
 *      vm_page_flash:
 *
 *      wakeup anyone waiting for the page.
 */
void
vm_page_flash(vm_page_t m)
{

	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
	if (m->oflags & VPO_WANTED) {
		m->oflags &= ~VPO_WANTED;
		wakeup(m);
	}
}

/*
 *      vm_page_wakeup:
 *
 *      clear the VPO_BUSY flag and wakeup anyone waiting for the
 *      page.
 *
 */
void
vm_page_wakeup(vm_page_t m)
{

	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
	KASSERT(m->oflags & VPO_BUSY, ("vm_page_wakeup: page not busy!!!"));
	m->oflags &= ~VPO_BUSY;
	vm_page_flash(m);
}

void
vm_page_io_start(vm_page_t m)
{

	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
	m->busy++;
}

void
vm_page_io_finish(vm_page_t m)
{

	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
	KASSERT(m->busy > 0, ("vm_page_io_finish: page %p is not busy", m));
	m->busy--;
	if (m->busy == 0)
		vm_page_flash(m);
}

/*
 * Keep the page from being freed by the page daemon.  This has much the
 * same effect as wiring, except with much lower overhead, and should be
 * used only for *very* temporary holding ("wiring").
 */
void
vm_page_hold(vm_page_t mem)
{

	vm_page_lock_assert(mem, MA_OWNED);
	mem->hold_count++;
}

void
vm_page_unhold(vm_page_t mem)
{

	vm_page_lock_assert(mem, MA_OWNED);
	--mem->hold_count;
	KASSERT(mem->hold_count >= 0, ("vm_page_unhold: hold count < 0!!!"));
	if (mem->hold_count == 0 && mem->queue == PQ_HOLD)
		vm_page_free_toq(mem);
}
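
/*
 * Sketch of the expected hold/unhold pairing (illustrative, not taken from
 * a particular caller); both calls require the page lock, as asserted above:
 *
 *	vm_page_lock(m);
 *	vm_page_hold(m);
 *	vm_page_unlock(m);
 *	... the page cannot be freed while the hold is in place ...
 *	vm_page_lock(m);
 *	vm_page_unhold(m);
 *	vm_page_unlock(m);
 */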

/*
 *	vm_page_unhold_pages:
 *
 *	Unhold each of the pages that is referenced by the given array.
 */
void
vm_page_unhold_pages(vm_page_t *ma, int count)
{
	struct mtx *mtx, *new_mtx;

	mtx = NULL;
	for (; count != 0; count--) {
		/*
		 * Avoid releasing and reacquiring the same page lock.
		 */
		new_mtx = vm_page_lockptr(*ma);
		if (mtx != new_mtx) {
			if (mtx != NULL)
				mtx_unlock(mtx);
			mtx = new_mtx;
			mtx_lock(mtx);
		}
		vm_page_unhold(*ma);
		ma++;
	}
	if (mtx != NULL)
		mtx_unlock(mtx);
}

/*
 *	vm_page_getfake:
 *
 *	Create a fictitious page with the specified physical address and
 *	memory attribute.  The memory attribute is the only machine-
 *	dependent aspect of a fictitious page that must be initialized.
 */
vm_page_t
vm_page_getfake(vm_paddr_t paddr, vm_memattr_t memattr)
{
	vm_page_t m;

	m = uma_zalloc(fakepg_zone, M_WAITOK | M_ZERO);
	m->phys_addr = paddr;
	m->queue = PQ_NONE;
	/* Fictitious pages don't use "segind". */
	m->flags = PG_FICTITIOUS;
	/* Fictitious pages don't use "order" or "pool". */
	m->oflags = VPO_BUSY | VPO_UNMANAGED;
	m->wire_count = 1;
	pmap_page_set_memattr(m, memattr);
	return (m);
}
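
/*
 * Illustrative pairing for fictitious pages (the physical address below is
 * a hypothetical device address): create the page, use it, then release it
 * with vm_page_putfake() below, which returns it to the fakepg zone.
 *
 *	m = vm_page_getfake(0xd0000000, VM_MEMATTR_DEFAULT);
 *	... map or hand out the fictitious page as needed ...
 *	vm_page_putfake(m);
 */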

/*
 *	vm_page_putfake:
 *
 *	Release a fictitious page.
 */
void
vm_page_putfake(vm_page_t m)
{

	KASSERT((m->flags & PG_FICTITIOUS) != 0,
	    ("vm_page_putfake: bad page %p", m));
	uma_zfree(fakepg_zone, m);
}

/*
 *	vm_page_updatefake:
 *
 *	Update the given fictitious page to the specified physical address and
 *	memory attribute.
 */
void
vm_page_updatefake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr)
{

	KASSERT((m->flags & PG_FICTITIOUS) != 0,
	    ("vm_page_updatefake: bad page %p", m));
	m->phys_addr = paddr;
	pmap_page_set_memattr(m, memattr);
}

/*
 *	vm_page_free:
 *
 *	Free a page.
 */
void
vm_page_free(vm_page_t m)
{

	m->flags &= ~PG_ZERO;
	vm_page_free_toq(m);
}

/*
 *	vm_page_free_zero:
 *
 *	Free a page to the zeroed-pages queue.
 */
void
vm_page_free_zero(vm_page_t m)
{

	m->flags |= PG_ZERO;
	vm_page_free_toq(m);
}

/*
 *	vm_page_sleep:
 *
 *	Sleep and release the page and page queues locks.
 *
 *	The object containing the given page must be locked.
 */
void
vm_page_sleep(vm_page_t m, const char *msg)
{

	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
	if (mtx_owned(&vm_page_queue_mtx))
		vm_page_unlock_queues();
	if (mtx_owned(vm_page_lockptr(m)))
		vm_page_unlock(m);

	/*
	 * It's possible that while we sleep, the page will get
	 * unbusied and freed.  If we are holding the object
	 * lock, we will assume we hold a reference to the object
	 * such that even if m->object changes, we can re-lock
	 * it.
	 */
	m->oflags |= VPO_WANTED;
	msleep(m, VM_OBJECT_MTX(m->object), PVM, msg, 0);
}
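
/*
 * Typical caller pattern (sketch; the wait-message string is arbitrary):
 * with the object locked, a busy page is waited on and the lookup is then
 * restarted, because the page may have been freed or moved while sleeping.
 *
 *	if ((m->oflags & VPO_BUSY) != 0 || m->busy != 0) {
 *		vm_page_sleep(m, "pgwait");
 *		goto retrylookup;
 *	}
 */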

/*
 *	vm_page_dirty:
 *
 *	Set all bits in the page's dirty field.
 *
 *	The object containing the specified page must be locked if the
 *	call is made from the machine-independent layer.
 *
 *	See vm_page_clear_dirty_mask().
 */
void
vm_page_dirty(vm_page_t m)
{

	KASSERT((m->flags & PG_CACHED) == 0,
	    ("vm_page_dirty: page in cache!"));
	KASSERT(!VM_PAGE_IS_FREE(m),
	    ("vm_page_dirty: page is free!"));
	KASSERT(m->valid == VM_PAGE_BITS_ALL,
	    ("vm_page_dirty: page is invalid!"));
	m->dirty = VM_PAGE_BITS_ALL;
}

/*
 *	vm_page_splay:
 *
 *	Implements Sleator and Tarjan's top-down splay algorithm.  Returns
 *	the vm_page containing the given pindex.  If, however, that
 *	pindex is not found in the vm_object, returns a vm_page that is
 *	adjacent to the pindex, coming before or after it.
 */
vm_page_t
vm_page_splay(vm_pindex_t pindex, vm_page_t root)
{
	struct vm_page dummy;
	vm_page_t lefttreemax, righttreemin, y;

	if (root == NULL)
		return (root);
	lefttreemax = righttreemin = &dummy;
	for (;; root = y) {
		if (pindex < root->pindex) {
			if ((y = root->left) == NULL)
				break;
			if (pindex < y->pindex) {
				/* Rotate right. */
				root->left = y->right;
				y->right = root;
				root = y;
				if ((y = root->left) == NULL)
					break;
			}
			/* Link into the new root's right tree. */
			righttreemin->left = root;
			righttreemin = root;
		} else if (pindex > root->pindex) {
			if ((y = root->right) == NULL)
				break;
			if (pindex > y->pindex) {
				/* Rotate left. */
				root->right = y->left;
				y->left = root;
				root = y;
				if ((y = root->right) == NULL)
					break;
			}
			/* Link into the new root's left tree. */
			lefttreemax->right = root;
			lefttreemax = root;
		} else
			break;
	}
	/* Assemble the new root. */
	lefttreemax->right = root->left;
	righttreemin->left = root->right;
	root->left = dummy.right;
	root->right = dummy.left;
	return (root);
}
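
/*
 * Because the splay moves the closest match to the root, lookups in this
 * file follow the pattern used by vm_page_lookup() below: splay on the
 * desired pindex, store the returned node back as the tree root, and then
 * test whether the new root is an exact match:
 *
 *	m = vm_page_splay(pindex, object->root);
 *	if ((object->root = m)->pindex != pindex)
 *		m = NULL;
 */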

/*
 *	vm_page_insert:		[ internal use only ]
 *
 *	Inserts the given mem entry into the object and object list.
 *
 *	The pagetables are not updated but will presumably fault the page
 *	in if necessary, or if a kernel page the caller will at some point
 *	enter the page into the kernel's pmap.  We are not allowed to block
 *	here so we *can't* do this anyway.
 *
 *	The object and page must be locked.
 *	This routine may not block.
 */
void
vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
{
	vm_page_t root;

	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
	if (m->object != NULL)
		panic("vm_page_insert: page already inserted");

	/*
	 * Record the object/offset pair in this page
	 */
	m->object = object;
	m->pindex = pindex;

	/*
	 * Now link into the object's ordered list of backed pages.
	 */
	root = object->root;
	if (root == NULL) {
		m->left = NULL;
		m->right = NULL;
		TAILQ_INSERT_TAIL(&object->memq, m, listq);
	} else {
		root = vm_page_splay(pindex, root);
		if (pindex < root->pindex) {
			m->left = root->left;
			m->right = root;
			root->left = NULL;
			TAILQ_INSERT_BEFORE(root, m, listq);
		} else if (pindex == root->pindex)
			panic("vm_page_insert: offset already allocated");
		else {
			m->right = root->right;
			m->left = root;
			root->right = NULL;
			TAILQ_INSERT_AFTER(&object->memq, root, m, listq);
		}
	}
	object->root = m;

	/*
	 * show that the object has one more resident page.
	 */
	object->resident_page_count++;
	/*
	 * Hold the vnode until the last page is released.
	 */
	if (object->resident_page_count == 1 && object->type == OBJT_VNODE)
		vhold((struct vnode *)object->handle);

	/*
	 * Since we are inserting a new and possibly dirty page,
	 * update the object's OBJ_MIGHTBEDIRTY flag.
	 */
	if (m->aflags & PGA_WRITEABLE)
		vm_object_set_writeable_dirty(object);
}

/*
 *	vm_page_remove:
 *				NOTE: used by device pager as well -wfj
 *
 *	Removes the given mem entry from the object/offset-page
 *	table and the object page list, but does not invalidate/terminate
901193326Sed *	the backing store.
902193326Sed *
903193326Sed *	The object and page must be locked.
904193326Sed *	The underlying pmap entry (if any) is NOT removed here.
905193326Sed *	This routine may not block.
906193326Sed */
907193326Sedvoid
908193326Sedvm_page_remove(vm_page_t m)
909193326Sed{
910193326Sed	vm_object_t object;
911193326Sed	vm_page_t next, prev, root;
912193326Sed
913193326Sed	if ((m->oflags & VPO_UNMANAGED) == 0)
914193326Sed		vm_page_lock_assert(m, MA_OWNED);
915212904Sdim	if ((object = m->object) == NULL)
916193326Sed		return;
917193326Sed	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
918194613Sed	if (m->oflags & VPO_BUSY) {
919194613Sed		m->oflags &= ~VPO_BUSY;
920201361Srdivacky		vm_page_flash(m);
921194613Sed	}
922201361Srdivacky
923201361Srdivacky	/*
924201361Srdivacky	 * Now remove from the object's list of backed pages.
925201361Srdivacky	 */
926201361Srdivacky	if ((next = TAILQ_NEXT(m, listq)) != NULL && next->left == m) {
927201361Srdivacky		/*
928201361Srdivacky		 * Since the page's successor in the list is also its parent
929201361Srdivacky		 * in the tree, its right subtree must be empty.
930201361Srdivacky		 */
931201361Srdivacky		next->left = m->left;
932201361Srdivacky		KASSERT(m->right == NULL,
933201361Srdivacky		    ("vm_page_remove: page %p has right child", m));
934201361Srdivacky	} else if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL &&
935201361Srdivacky	    prev->right == m) {
936201361Srdivacky		/*
937201361Srdivacky		 * Since the page's predecessor in the list is also its parent
938201361Srdivacky		 * in the tree, its left subtree must be empty.
939201361Srdivacky		 */
940201361Srdivacky		KASSERT(m->left == NULL,
941201361Srdivacky		    ("vm_page_remove: page %p has left child", m));
942201361Srdivacky		prev->right = m->right;
943201361Srdivacky	} else {
944201361Srdivacky		if (m != object->root)
945201361Srdivacky			vm_page_splay(m->pindex, object->root);
946194613Sed		if (m->left == NULL)
947201361Srdivacky			root = m->right;
948194613Sed		else if (m->right == NULL)
949194613Sed			root = m->left;
950194613Sed		else {
951201361Srdivacky			/*
952194613Sed			 * Move the page's successor to the root, because
953201361Srdivacky			 * pages are usually removed in ascending order.
954201361Srdivacky			 */
955201361Srdivacky			if (m->right != next)
956201361Srdivacky				vm_page_splay(m->pindex, m->right);
957263508Sdim			next->left = m->left;
958201361Srdivacky			root = next;
959201361Srdivacky		}
960201361Srdivacky		object->root = root;
961194613Sed	}
962194613Sed	TAILQ_REMOVE(&object->memq, m, listq);
963226633Sdim
964226633Sdim	/*
965226633Sdim	 * And show that the object has one fewer resident page.
966226633Sdim	 */
967226633Sdim	object->resident_page_count--;
968226633Sdim	/*
969226633Sdim	 * The vnode may now be recycled.
970226633Sdim	 */
971226633Sdim	if (object->resident_page_count == 0 && object->type == OBJT_VNODE)
972226633Sdim		vdrop((struct vnode *)object->handle);
973226633Sdim
974234353Sdim	m->object = NULL;
975234353Sdim}
976234353Sdim
977234353Sdim/*
978234353Sdim *	vm_page_lookup:
979234353Sdim *
980234353Sdim *	Returns the page associated with the object/offset
981226633Sdim *	pair specified; if none is found, NULL is returned.
982226633Sdim *
983226633Sdim *	The object must be locked.
984226633Sdim *	This routine may not block.
985226633Sdim *	This is a critical path routine
986226633Sdim */
987226633Sdimvm_page_t
988226633Sdimvm_page_lookup(vm_object_t object, vm_pindex_t pindex)
989226633Sdim{
990226633Sdim	vm_page_t m;
991226633Sdim
992234353Sdim	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
993234353Sdim	if ((m = object->root) != NULL && m->pindex != pindex) {
994234353Sdim		m = vm_page_splay(pindex, m);
995234353Sdim		if ((object->root = m)->pindex != pindex)
996234353Sdim			m = NULL;
997234353Sdim	}
998234353Sdim	return (m);
999234353Sdim}
1000234353Sdim
1001193326Sed/*
1002193326Sed *	vm_page_find_least:
1003193326Sed *
1004193326Sed *	Returns the page associated with the object with least pindex
1005193326Sed *	greater than or equal to the parameter pindex, or NULL.
1006193326Sed *
1007193326Sed *	The object must be locked.
1008193326Sed *	The routine may not block.
1009193326Sed */
1010199512Srdivackyvm_page_t
1011199512Srdivackyvm_page_find_least(vm_object_t object, vm_pindex_t pindex)
1012234353Sdim{
1013193326Sed	vm_page_t m;
1014193326Sed
1015199512Srdivacky	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
1016193326Sed	if ((m = TAILQ_FIRST(&object->memq)) != NULL) {
1017193326Sed		if (m->pindex < pindex) {
1018199512Srdivacky			m = vm_page_splay(pindex, object->root);
1019199512Srdivacky			if ((object->root = m)->pindex < pindex)
1020193326Sed				m = TAILQ_NEXT(m, listq);
1021193326Sed		}
1022193326Sed	}
1023198092Srdivacky	return (m);
1024193326Sed}
1025193326Sed
1026193326Sed/*
1027263508Sdim * Returns the given page's successor (by pindex) within the object if it is
1028193326Sed * resident; if none is found, NULL is returned.
1029193326Sed *
1030193326Sed * The object must be locked.
1031193326Sed */
1032193326Sedvm_page_t
1033198092Srdivackyvm_page_next(vm_page_t m)
1034198092Srdivacky{
1035193326Sed	vm_page_t next;
1036263508Sdim
1037263508Sdim	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
1038263508Sdim	if ((next = TAILQ_NEXT(m, listq)) != NULL &&
1039226633Sdim	    next->pindex != m->pindex + 1)
1040263508Sdim		next = NULL;
1041263508Sdim	return (next);
1042198092Srdivacky}
1043193326Sed
1044204643Srdivacky/*
1045204643Srdivacky * Returns the given page's predecessor (by pindex) within the object if it is
1046204643Srdivacky * resident; if none is found, NULL is returned.
1047204643Srdivacky *
1048204643Srdivacky * The object must be locked.
1049198092Srdivacky */
1050198092Srdivackyvm_page_t
1051198092Srdivackyvm_page_prev(vm_page_t m)
1052198092Srdivacky{
1053212904Sdim	vm_page_t prev;
1054212904Sdim
1055198092Srdivacky	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
1056198092Srdivacky	if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL &&
1057198092Srdivacky	    prev->pindex != m->pindex - 1)
1058198092Srdivacky		prev = NULL;
1059193326Sed	return (prev);
1060193326Sed}
1061193326Sed
1062212904Sdim/*
1063221345Sdim *	vm_page_rename:
1064221345Sdim *
1065221345Sdim *	Move the given memory entry from its
1066221345Sdim *	current object to the specified target object/offset.
1067221345Sdim *
1068221345Sdim *	The object must be locked.
1069221345Sdim *	This routine may not block.
1070221345Sdim *
1071221345Sdim *	Note: swap associated with the page must be invalidated by the move.  We
1072221345Sdim *	      have to do this for several reasons:  (1) we aren't freeing the
1073221345Sdim *	      page, (2) we are dirtying the page, (3) the VM system is probably
1074221345Sdim *	      moving the page from object A to B, and will then later move
1075221345Sdim *	      the backing store from A to B and we can't have a conflict.
1076221345Sdim *
1077221345Sdim *	Note: we *always* dirty the page.  It is necessary both for the
1078221345Sdim *	      fact that we moved it, and because we may be invalidating
1079221345Sdim *	      swap.  If the page is on the cache, we have to deactivate it
1080221345Sdim *	      or vm_page_dirty() will panic.  Dirty pages are not allowed
1081193326Sed *	      on the cache.
1082193326Sed */
1083234353Sdimvoid
1084234353Sdimvm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex)
1085234353Sdim{
1086234353Sdim
1087234353Sdim	vm_page_remove(m);
1088263508Sdim	vm_page_insert(m, new_object, new_pindex);
1089221345Sdim	vm_page_dirty(m);
1090249423Sdim}
1091221345Sdim
1092198092Srdivacky/*
1093198092Srdivacky *	Convert all of the given object's cached pages that have a
1094212904Sdim *	pindex within the given range into free pages.  If the value
1095212904Sdim *	zero is given for "end", then the range's upper bound is
1096212904Sdim *	infinity.  If the given object is backed by a vnode and it
1097263508Sdim *	transitions from having one or more cached pages to none, the
1098212904Sdim *	vnode's hold count is reduced.
1099212904Sdim */
1100212904Sdimvoid
1101212904Sdimvm_page_cache_free(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
1102212904Sdim{
1103212904Sdim	vm_page_t m, m_next;
1104212904Sdim	boolean_t empty;
1105199512Srdivacky
1106199512Srdivacky	mtx_lock(&vm_page_queue_free_mtx);
1107199512Srdivacky	if (__predict_false(object->cache == NULL)) {
1108199512Srdivacky		mtx_unlock(&vm_page_queue_free_mtx);
1109199512Srdivacky		return;
1110199512Srdivacky	}
1111223017Sdim	m = object->cache = vm_page_splay(start, object->cache);
1112223017Sdim	if (m->pindex < start) {
1113223017Sdim		if (m->right == NULL)
1114223017Sdim			m = NULL;
1115199512Srdivacky		else {
1116199512Srdivacky			m_next = vm_page_splay(start, m->right);
1117199512Srdivacky			m_next->left = m;
1118199512Srdivacky			m->right = NULL;
1119223017Sdim			m = object->cache = m_next;
1120199512Srdivacky		}
1121199512Srdivacky	}
1122199512Srdivacky
1123223017Sdim	/*
1124199512Srdivacky	 * At this point, "m" is either (1) a reference to the page
1125199512Srdivacky	 * with the least pindex that is greater than or equal to
1126199512Srdivacky	 * "start" or (2) NULL.
1127199512Srdivacky	 */
1128199512Srdivacky	for (; m != NULL && (m->pindex < end || end == 0); m = m_next) {
1129199512Srdivacky		/*
1130199512Srdivacky		 * Find "m"'s successor and remove "m" from the
1131199512Srdivacky		 * object's cache.
1132199512Srdivacky		 */
1133199512Srdivacky		if (m->right == NULL) {
1134199512Srdivacky			object->cache = m->left;
1135199512Srdivacky			m_next = NULL;
1136199512Srdivacky		} else {
1137199512Srdivacky			m_next = vm_page_splay(start, m->right);
1138199512Srdivacky			m_next->left = m->left;
1139199512Srdivacky			object->cache = m_next;
1140199512Srdivacky		}
1141199512Srdivacky		/* Convert "m" to a free page. */
1142199512Srdivacky		m->object = NULL;
1143199512Srdivacky		m->valid = 0;
1144199512Srdivacky		/* Clear PG_CACHED and set PG_FREE. */
1145199512Srdivacky		m->flags ^= PG_CACHED | PG_FREE;
1146199512Srdivacky		KASSERT((m->flags & (PG_CACHED | PG_FREE)) == PG_FREE,
1147199512Srdivacky		    ("vm_page_cache_free: page %p has inconsistent flags", m));
1148212904Sdim		cnt.v_cache_count--;
1149212904Sdim		cnt.v_free_count++;
1150212904Sdim	}
1151212904Sdim	empty = object->cache == NULL;
1152212904Sdim	mtx_unlock(&vm_page_queue_free_mtx);
1153212904Sdim	if (object->type == OBJT_VNODE && empty)
1154212904Sdim		vdrop(object->handle);
1155212904Sdim}
1156212904Sdim
1157212904Sdim/*
1158223017Sdim *	Returns the cached page that is associated with the given
1159212904Sdim *	object and offset.  If, however, none exists, returns NULL.
1160212904Sdim *
1161212904Sdim *	The free page queue must be locked.
1162212904Sdim */
1163218893Sdimstatic inline vm_page_t
1164218893Sdimvm_page_cache_lookup(vm_object_t object, vm_pindex_t pindex)
1165218893Sdim{
1166212904Sdim	vm_page_t m;
1167212904Sdim
1168249423Sdim	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
1169249423Sdim	if ((m = object->cache) != NULL && m->pindex != pindex) {
1170249423Sdim		m = vm_page_splay(pindex, m);
1171249423Sdim		if ((object->cache = m)->pindex != pindex)
1172249423Sdim			m = NULL;
1173249423Sdim	}
1174249423Sdim	return (m);
1175249423Sdim}
1176249423Sdim
1177249423Sdim/*
1178249423Sdim *	Remove the given cached page from its containing object's
1179249423Sdim *	collection of cached pages.
1180249423Sdim *
1181249423Sdim *	The free page queue must be locked.
1182249423Sdim */
1183249423Sdimvoid
1184249423Sdimvm_page_cache_remove(vm_page_t m)
1185249423Sdim{
1186249423Sdim	vm_object_t object;
1187249423Sdim	vm_page_t root;
1188249423Sdim
1189249423Sdim	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
1190263508Sdim	KASSERT((m->flags & PG_CACHED) != 0,
1191249423Sdim	    ("vm_page_cache_remove: page %p is not cached", m));
1192249423Sdim	object = m->object;
1193263508Sdim	if (m != object->cache) {
1194263508Sdim		root = vm_page_splay(m->pindex, object->cache);
1195263508Sdim		KASSERT(root == m,
1196263508Sdim		    ("vm_page_cache_remove: page %p is not cached in object %p",
1197263508Sdim		    m, object));
1198263508Sdim	}
1199263508Sdim	if (m->left == NULL)
1200263508Sdim		root = m->right;
1201212904Sdim	else if (m->right == NULL)
1202212904Sdim		root = m->left;
1203212904Sdim	else {
1204212904Sdim		root = vm_page_splay(m->pindex, m->left);
1205212904Sdim		root->right = m->right;
1206212904Sdim	}
1207212904Sdim	object->cache = root;
1208218893Sdim	m->object = NULL;
1209218893Sdim	cnt.v_cache_count--;
1210212904Sdim}
1211212904Sdim
1212212904Sdim/*
1213212904Sdim *	Transfer all of the cached pages with offset greater than or
1214212904Sdim *	equal to 'offidxstart' from the original object's cache to the
1215212904Sdim *	new object's cache.  However, any cached pages with offset
1216212904Sdim *	greater than or equal to the new object's size are kept in the
1217212904Sdim *	original object.  Initially, the new object's cache must be
1218212904Sdim *	empty.  Offset 'offidxstart' in the original object must
1219212904Sdim *	correspond to offset zero in the new object.
1220263508Sdim *
1221263508Sdim *	The new object must be locked.
1222263508Sdim */
1223212904Sdimvoid
1224212904Sdimvm_page_cache_transfer(vm_object_t orig_object, vm_pindex_t offidxstart,
1225212904Sdim    vm_object_t new_object)
1226223017Sdim{
1227218893Sdim	vm_page_t m, m_next;
1228218893Sdim
1229218893Sdim	/*
1230263508Sdim	 * Insertion into an object's collection of cached pages
1231263508Sdim	 * requires the object to be locked.  In contrast, removal does
1232263508Sdim	 * not.
1233263508Sdim	 */
1234218893Sdim	VM_OBJECT_LOCK_ASSERT(new_object, MA_OWNED);
1235218893Sdim	KASSERT(new_object->cache == NULL,
1236263508Sdim	    ("vm_page_cache_transfer: object %p has cached pages",
1237263508Sdim	    new_object));
1238263508Sdim	mtx_lock(&vm_page_queue_free_mtx);
1239212904Sdim	if ((m = orig_object->cache) != NULL) {
1240212904Sdim		/*
1241212904Sdim		 * Transfer all of the pages with offset greater than or
1242218893Sdim		 * equal to 'offidxstart' from the original object's
1243218893Sdim		 * cache to the new object's cache.
1244212904Sdim		 */
1245212904Sdim		m = vm_page_splay(offidxstart, m);
1246218893Sdim		if (m->pindex < offidxstart) {
1247249423Sdim			orig_object->cache = m;
1248218893Sdim			new_object->cache = m->right;
1249218893Sdim			m->right = NULL;
1250218893Sdim		} else {
1251212904Sdim			orig_object->cache = m->left;
1252212904Sdim			new_object->cache = m;
1253212904Sdim			m->left = NULL;
1254212904Sdim		}
1255263508Sdim		while ((m = new_object->cache) != NULL) {
1256212904Sdim			if ((m->pindex - offidxstart) >= new_object->size) {
1257212904Sdim				/*
1258212904Sdim				 * Return all of the cached pages with
1259212904Sdim				 * offset greater than or equal to the
1260212904Sdim				 * new object's size to the original
1261263508Sdim				 * object's cache.
1262212904Sdim				 */
1263212904Sdim				new_object->cache = m->left;
1264212904Sdim				m->left = orig_object->cache;
1265212904Sdim				orig_object->cache = m;
1266239462Sdim				break;
1267239462Sdim			}
1268239462Sdim			m_next = vm_page_splay(m->pindex, m->right);
1269212904Sdim			/* Update the page's object and offset. */
1270199482Srdivacky			m->object = new_object;
1271203955Srdivacky			m->pindex -= offidxstart;
1272203955Srdivacky			if (m_next == NULL)
1273203955Srdivacky				break;
1274234353Sdim			m->right = NULL;
1275199482Srdivacky			m_next->left = m;
1276203955Srdivacky			new_object->cache = m_next;
1277218893Sdim		}
1278218893Sdim		KASSERT(new_object->cache == NULL ||
1279218893Sdim		    new_object->type == OBJT_SWAP,
1280203955Srdivacky		    ("vm_page_cache_transfer: object %p's type is incompatible"
1281203955Srdivacky		    " with cached pages", new_object));
1282203955Srdivacky	}
1283203955Srdivacky	mtx_unlock(&vm_page_queue_free_mtx);
1284203955Srdivacky}
1285203955Srdivacky
1286234353Sdim/*
1287206084Srdivacky *	vm_page_alloc:
1288206084Srdivacky *
1289206084Srdivacky *	Allocate and return a memory cell associated
1290206084Srdivacky *	with this VM object/offset pair.
1291206084Srdivacky *
1292243830Sdim *	The caller must always specify an allocation class.
1293206084Srdivacky *
1294206084Srdivacky *	allocation classes:
1295206084Srdivacky *	VM_ALLOC_NORMAL		normal process request
1296206084Srdivacky *	VM_ALLOC_SYSTEM		system *really* needs a page
1297208600Srdivacky *	VM_ALLOC_INTERRUPT	interrupt time request
1298208600Srdivacky *
1299208600Srdivacky *	optional allocation flags:
1300207619Srdivacky *	VM_ALLOC_ZERO		prefer a zeroed page
1301207619Srdivacky *	VM_ALLOC_WIRED		wire the allocated page
1302206084Srdivacky *	VM_ALLOC_NOOBJ		page is not associated with a vm object
1303206084Srdivacky *	VM_ALLOC_NOBUSY		do not set the page busy
1304206084Srdivacky *	VM_ALLOC_IFCACHED	return page only if it is cached
1305206084Srdivacky *	VM_ALLOC_IFNOTCACHED	return NULL, do not reactivate if the page
1306206084Srdivacky *				is cached
1307206084Srdivacky *
1308263508Sdim *	This routine may not sleep.
1309199482Srdivacky */
1310234353Sdimvm_page_t
1311234353Sdimvm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req)
1312243830Sdim{
1313243830Sdim	struct vnode *vp = NULL;
1314234353Sdim	vm_object_t m_object;
1315234353Sdim	vm_page_t m;
1316234353Sdim	int flags, page_req;
1317234353Sdim
1318263508Sdim	if ((req & VM_ALLOC_NOOBJ) == 0) {
1319234353Sdim		KASSERT(object != NULL,
1320234353Sdim		    ("vm_page_alloc: NULL object."));
1321234353Sdim		VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
1322199482Srdivacky	}
1323199482Srdivacky
1324206084Srdivacky	page_req = req & VM_ALLOC_CLASS_MASK;
1325206084Srdivacky
1326199482Srdivacky	/*
1327199482Srdivacky	 * The pager is allowed to eat deeper into the free page list.
1328203955Srdivacky	 */
1329199482Srdivacky	if ((curproc == pageproc) && (page_req != VM_ALLOC_INTERRUPT))
1330199482Srdivacky		page_req = VM_ALLOC_SYSTEM;
1331226633Sdim
1332226633Sdim	mtx_lock(&vm_page_queue_free_mtx);
1333226633Sdim	if (cnt.v_free_count + cnt.v_cache_count > cnt.v_free_reserved ||
1334226633Sdim	    (page_req == VM_ALLOC_SYSTEM &&
1335234353Sdim	    cnt.v_free_count + cnt.v_cache_count > cnt.v_interrupt_free_min) ||
1336226633Sdim	    (page_req == VM_ALLOC_INTERRUPT &&
1337226633Sdim	    cnt.v_free_count + cnt.v_cache_count > 0)) {
1338226633Sdim		/*
1339226633Sdim		 * Allocate from the free queue if the number of free pages
1340226633Sdim		 * exceeds the minimum for the request class.
1341226633Sdim		 */
1342226633Sdim		if (object != NULL &&
1343226633Sdim		    (m = vm_page_cache_lookup(object, pindex)) != NULL) {
1344218893Sdim			if ((req & VM_ALLOC_IFNOTCACHED) != 0) {
1345218893Sdim				mtx_unlock(&vm_page_queue_free_mtx);
1346208600Srdivacky				return (NULL);
1347226633Sdim			}
1348208600Srdivacky			if (vm_phys_unfree_page(m))
1349208600Srdivacky				vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m, 0);
1350208600Srdivacky#if VM_NRESERVLEVEL > 0
1351226633Sdim			else if (!vm_reserv_reactivate_page(m))
1352226633Sdim#else
1353218893Sdim			else
1354208600Srdivacky#endif
1355218893Sdim				panic("vm_page_alloc: cache page %p is missing"
1356218893Sdim				    " from the free queue", m);
1357218893Sdim		} else if ((req & VM_ALLOC_IFCACHED) != 0) {
1358208600Srdivacky			mtx_unlock(&vm_page_queue_free_mtx);
1359218893Sdim			return (NULL);
1360218893Sdim#if VM_NRESERVLEVEL > 0
1361226633Sdim		} else if (object == NULL || object->type == OBJT_DEVICE ||
1362208600Srdivacky		    object->type == OBJT_SG ||
1363208600Srdivacky		    (object->flags & OBJ_COLORED) == 0 ||
1364218893Sdim		    (m = vm_reserv_alloc_page(object, pindex)) == NULL) {
1365218893Sdim#else
1366218893Sdim		} else {
1367218893Sdim#endif
1368218893Sdim			m = vm_phys_alloc_pages(object != NULL ?
1369218893Sdim			    VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT, 0);
1370218893Sdim#if VM_NRESERVLEVEL > 0
1371218893Sdim			if (m == NULL && vm_reserv_reclaim_inactive()) {
1372193326Sed				m = vm_phys_alloc_pages(object != NULL ?
1373193326Sed				    VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT,
1374193326Sed				    0);
1375198092Srdivacky			}
1376193326Sed#endif
1377193326Sed		}
1378193326Sed	} else {
1379212904Sdim		/*
1380193326Sed		 * Not allocatable, give up.
1381193326Sed		 */
1382193326Sed		mtx_unlock(&vm_page_queue_free_mtx);
1383193326Sed		atomic_add_int(&vm_pageout_deficit,
1384193326Sed		    MAX((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1));
1385193326Sed		pagedaemon_wakeup();
1386193326Sed		return (NULL);
1387198092Srdivacky	}
1388249423Sdim
1389208600Srdivacky	/*
1390208600Srdivacky	 *  At this point we had better have found a good page.
1391218893Sdim	 */
1392218893Sdim
1393218893Sdim	KASSERT(m != NULL, ("vm_page_alloc: missing page"));
1394218893Sdim	KASSERT(m->queue == PQ_NONE,
1395193326Sed	    ("vm_page_alloc: page %p has unexpected queue %d", m, m->queue));
1396193326Sed	KASSERT(m->wire_count == 0, ("vm_page_alloc: page %p is wired", m));
1397193326Sed	KASSERT(m->hold_count == 0, ("vm_page_alloc: page %p is held", m));
1398263508Sdim	KASSERT(m->busy == 0, ("vm_page_alloc: page %p is busy", m));
1399193326Sed	KASSERT(m->dirty == 0, ("vm_page_alloc: page %p is dirty", m));
1400193326Sed	KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT,
1401234353Sdim	    ("vm_page_alloc: page %p has unexpected memattr %d", m,
1402234353Sdim	    pmap_page_get_memattr(m)));
1403234353Sdim	if ((m->flags & PG_CACHED) != 0) {
1404234353Sdim		KASSERT(m->valid != 0,
1405234353Sdim		    ("vm_page_alloc: cached page %p is invalid", m));
1406234353Sdim		if (m->object == object && m->pindex == pindex)
1407234353Sdim	  		cnt.v_reactivated++;
1408234353Sdim		else
1409234353Sdim			m->valid = 0;
1410207619Srdivacky		m_object = m->object;
1411202379Srdivacky		vm_page_cache_remove(m);
1412207619Srdivacky		if (m_object->type == OBJT_VNODE && m_object->cache == NULL)
1413202379Srdivacky			vp = m_object->handle;
1414202379Srdivacky	} else {
1415202379Srdivacky		KASSERT(VM_PAGE_IS_FREE(m),
1416207619Srdivacky		    ("vm_page_alloc: page %p is not free", m));
1417207619Srdivacky		KASSERT(m->valid == 0,
1418239462Sdim		    ("vm_page_alloc: free page %p is valid", m));
1419207619Srdivacky		cnt.v_free_count--;
1420207619Srdivacky	}
1421207619Srdivacky
1422207619Srdivacky	/*
1423202379Srdivacky	 * Only the PG_ZERO flag is inherited.  The PG_CACHED or PG_FREE flag
1424207619Srdivacky	 * must be cleared before the free page queues lock is released.
1425224145Sdim	 */
1426193326Sed	flags = 0;
1427193326Sed	if (req & VM_ALLOC_NODUMP)
1428207619Srdivacky		flags |= PG_NODUMP;
1429198092Srdivacky	if (m->flags & PG_ZERO) {
1430224145Sdim		vm_page_zero_count--;
1431202379Srdivacky		if (req & VM_ALLOC_ZERO)
1432202379Srdivacky			flags = PG_ZERO;
1433234353Sdim	}
1434234353Sdim	m->flags = flags;
1435234353Sdim	mtx_unlock(&vm_page_queue_free_mtx);
1436234353Sdim	m->aflags = 0;
1437263508Sdim	if (object == NULL || object->type == OBJT_PHYS)
1438234353Sdim		m->oflags = VPO_UNMANAGED;
1439202379Srdivacky	else
1440202379Srdivacky		m->oflags = 0;
1441202379Srdivacky	if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ)) == 0)
1442234353Sdim		m->oflags |= VPO_BUSY;
1443234353Sdim	if (req & VM_ALLOC_WIRED) {
1444234353Sdim		/*
1445234353Sdim		 * The page lock is not required for wiring a page until that
1446234353Sdim		 * page is inserted into the object.
1447193326Sed		 */
1448193326Sed		atomic_add_int(&cnt.v_wire_count, 1);
1449193326Sed		m->wire_count = 1;
1450193326Sed	}
1451193326Sed	m->act_count = 0;
1452193326Sed
1453193326Sed	if (object != NULL) {
1454193326Sed		/* Ignore device objects; the pager sets "memattr" for them. */
1455193326Sed		if (object->memattr != VM_MEMATTR_DEFAULT &&
1456193326Sed		    object->type != OBJT_DEVICE && object->type != OBJT_SG)
1457193326Sed			pmap_page_set_memattr(m, object->memattr);
1458198092Srdivacky		vm_page_insert(m, object, pindex);
1459193326Sed	} else
1460193326Sed		m->pindex = pindex;
1461193326Sed
1462193326Sed	/*
1463193326Sed	 * The following call to vdrop() must come after the above call
1464193326Sed	 * to vm_page_insert() in case both affect the same object and
1465193326Sed	 * vnode.  Otherwise, the affected vnode's hold count could
1466193326Sed	 * temporarily become zero.
1467193326Sed	 */
1468193326Sed	if (vp != NULL)
1469193326Sed		vdrop(vp);
1470193326Sed
1471193326Sed	/*
1472193326Sed	 * Don't wake up the pageout daemon too often - only wake it up when
1473193326Sed	 * we would be nearly out of memory.
1474263508Sdim	 */
1475234353Sdim	if (vm_paging_needed())
1476193326Sed		pagedaemon_wakeup();
1477193326Sed
1478193326Sed	return (m);
1479193326Sed}
1480249423Sdim
1481249423Sdim/*
1482249423Sdim * Initialize a page that has been freshly dequeued from a freelist.
1483249423Sdim * The caller must drop the returned vnode, if it is not NULL.
1484249423Sdim *
1485249423Sdim * To be called with vm_page_queue_free_mtx held.
1486249423Sdim */
1487249423Sdimstruct vnode *
1488249423Sdimvm_page_alloc_init(vm_page_t m)
1489249423Sdim{
1490249423Sdim	struct vnode *drop;
1491249423Sdim	vm_object_t m_object;
1492249423Sdim
1493249423Sdim	KASSERT(m->queue == PQ_NONE,
1494249423Sdim	    ("vm_page_alloc_init: page %p has unexpected queue %d",
1495249423Sdim	    m, m->queue));
1496249423Sdim	KASSERT(m->wire_count == 0,
1497249423Sdim	    ("vm_page_alloc_init: page %p is wired", m));
1498193326Sed	KASSERT(m->hold_count == 0,
1499193326Sed	    ("vm_page_alloc_init: page %p is held", m));
1500193326Sed	KASSERT(m->busy == 0,
1501193326Sed	    ("vm_page_alloc_init: page %p is busy", m));
1502193326Sed	KASSERT(m->dirty == 0,
1503193326Sed	    ("vm_page_alloc_init: page %p is dirty", m));
1504193326Sed	KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT,
1505249423Sdim	    ("vm_page_alloc_init: page %p has unexpected memattr %d",
1506249423Sdim	    m, pmap_page_get_memattr(m)));
1507193326Sed	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
1508193326Sed	drop = NULL;
1509194179Sed	if ((m->flags & PG_CACHED) != 0) {
1510198092Srdivacky		m->valid = 0;
1511193326Sed		m_object = m->object;
1512194179Sed		vm_page_cache_remove(m);
1513193326Sed		if (m_object->type == OBJT_VNODE &&
1514193326Sed		    m_object->cache == NULL)
1515193326Sed			drop = m_object->handle;
1516198092Srdivacky	} else {
1517193326Sed		KASSERT(VM_PAGE_IS_FREE(m),
1518218893Sdim		    ("vm_page_alloc_init: page %p is not free", m));
1519193326Sed		KASSERT(m->valid == 0,
1520193326Sed		    ("vm_page_alloc_init: free page %p is valid", m));
1521198092Srdivacky		cnt.v_free_count--;
1522198092Srdivacky	}
1523198092Srdivacky	if (m->flags & PG_ZERO)
1524218893Sdim		vm_page_zero_count--;
1525198092Srdivacky	/* Don't clear the PG_ZERO flag; we'll need it later. */
1526198092Srdivacky	m->flags &= PG_ZERO;
1527227737Sdim	m->aflags = 0;
1528227737Sdim	m->oflags = VPO_UNMANAGED;
1529227737Sdim	/* Unmanaged pages don't use "act_count". */
1530227737Sdim	return (drop);
1531227737Sdim}
1532227737Sdim
1533193326Sed/*
1534193326Sed * 	vm_page_alloc_freelist:
1535193326Sed *
1536193326Sed *	Allocate a page from the specified freelist.
1537193326Sed *	Only the ALLOC_CLASS values in req are honored; other request flags
1538193326Sed *	are ignored.
1539193326Sed */
1540218893Sdimvm_page_t
1541226633Sdimvm_page_alloc_freelist(int flind, int req)
1542193326Sed{
1543193326Sed	struct vnode *drop;
1544193326Sed	vm_page_t m;
1545193326Sed	int page_req;
1546193326Sed
1547263508Sdim	m = NULL;
1548263508Sdim	page_req = req & VM_ALLOC_CLASS_MASK;
1549263508Sdim	mtx_lock(&vm_page_queue_free_mtx);
1550263508Sdim	/*
1551263508Sdim	 * Do not allocate reserved pages unless the request class allows it.
1552263508Sdim	 */
1553263508Sdim	if (cnt.v_free_count + cnt.v_cache_count > cnt.v_free_reserved ||
1554263508Sdim	    (page_req == VM_ALLOC_SYSTEM &&
1555263508Sdim	    cnt.v_free_count + cnt.v_cache_count > cnt.v_interrupt_free_min) ||
1556193326Sed	    (page_req == VM_ALLOC_INTERRUPT &&
1557263508Sdim	    cnt.v_free_count + cnt.v_cache_count > 0)) {
1558221345Sdim		m = vm_phys_alloc_freelist_pages(flind, VM_FREEPOOL_DIRECT, 0);
1559212904Sdim	}
1560249423Sdim	if (m == NULL) {
1561193326Sed		mtx_unlock(&vm_page_queue_free_mtx);
1562193326Sed		return (NULL);
1563193326Sed	}
1564193326Sed	drop = vm_page_alloc_init(m);
1565193326Sed	mtx_unlock(&vm_page_queue_free_mtx);
1566218893Sdim	if (drop)
1567226633Sdim		vdrop(drop);
1568221345Sdim	return (m);
1569221345Sdim}
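
/*
 * Illustrative sketch (not part of the original code): a hypothetical
 * caller that wants a single page from a specific physical freelist
 * "flind" might loop until the allocation succeeds, e.g.:
 *
 *	vm_page_t m;
 *
 *	while ((m = vm_page_alloc_freelist(flind, VM_ALLOC_NORMAL)) == NULL)
 *		VM_WAIT;
 *	pmap_zero_page(m);
 *
 * The freelist index and the zeroing step are assumptions made only for
 * this example.
 */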
1570221345Sdim
1571221345Sdim/*
1572221345Sdim *	vm_wait:	(also see VM_WAIT macro)
1573249423Sdim *
1574221345Sdim *	Block until free pages are available for allocation
1575221345Sdim *	- Called in various places before memory allocations.
1576221345Sdim */
1577226633Sdimvoid
1578193326Sedvm_wait(void)
1579198092Srdivacky{
1580198092Srdivacky
1581263508Sdim	mtx_lock(&vm_page_queue_free_mtx);
1582198092Srdivacky	if (curproc == pageproc) {
1583193326Sed		vm_pageout_pages_needed = 1;
1584193326Sed		msleep(&vm_pageout_pages_needed, &vm_page_queue_free_mtx,
1585193326Sed		    PDROP | PSWP, "VMWait", 0);
1586193326Sed	} else {
1587193326Sed		if (!vm_pages_needed) {
1588263508Sdim			vm_pages_needed = 1;
1589193326Sed			wakeup(&vm_pages_needed);
1590193326Sed		}
1591193326Sed		msleep(&cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | PVM,
1592193326Sed		    "vmwait", 0);
1593193326Sed	}
1594249423Sdim}
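
/*
 * Illustrative sketch (an assumption, not taken from the original code):
 * callers that hold an object lock normally drop it around the wait and
 * then retry their allocation, much as vm_page_grab() does later in this
 * file:
 *
 *	m = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL);
 *	if (m == NULL) {
 *		VM_OBJECT_UNLOCK(object);
 *		VM_WAIT;
 *		VM_OBJECT_LOCK(object);
 *		goto retry;
 *	}
 */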
1595249423Sdim
1596249423Sdim/*
1597249423Sdim *	vm_waitpfault:	(also see VM_WAITPFAULT macro)
1598249423Sdim *
1599249423Sdim *	Block until free pages are available for allocation
1600249423Sdim *	- Called only in vm_fault so that processes page faulting
1601249423Sdim *	  can be easily tracked.
1602249423Sdim *	- Sleeps at a lower priority than vm_wait() so that vm_wait()ing
1603249423Sdim *	  processes will be able to grab memory first.  Do not change
1604249423Sdim *	  this balance without careful testing first.
1605249423Sdim */
1606249423Sdimvoid
1607249423Sdimvm_waitpfault(void)
1608249423Sdim{
1609249423Sdim
1610249423Sdim	mtx_lock(&vm_page_queue_free_mtx);
1611249423Sdim	if (!vm_pages_needed) {
1612249423Sdim		vm_pages_needed = 1;
1613249423Sdim		wakeup(&vm_pages_needed);
1614249423Sdim	}
1615249423Sdim	msleep(&cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | PUSER,
1616249423Sdim	    "pfault", 0);
1617263508Sdim}
1618249423Sdim
1619249423Sdim/*
1620249423Sdim *	vm_page_requeue:
1621249423Sdim *
1622249423Sdim *	Move the given page to the tail of its present page queue.
1623249423Sdim *
1624234353Sdim *	The page queues must be locked.
1625234353Sdim */
1626234353Sdimvoid
1627234353Sdimvm_page_requeue(vm_page_t m)
1628234353Sdim{
1629234353Sdim	struct vpgqueues *vpq;
1630234353Sdim	int queue;
1631234353Sdim
1632234353Sdim	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1633234353Sdim	queue = m->queue;
1634234353Sdim	KASSERT(queue != PQ_NONE,
1635234353Sdim	    ("vm_page_requeue: page %p is not queued", m));
1636234353Sdim	vpq = &vm_page_queues[queue];
1637234353Sdim	TAILQ_REMOVE(&vpq->pl, m, pageq);
1638234353Sdim	TAILQ_INSERT_TAIL(&vpq->pl, m, pageq);
1639234353Sdim}
1640234353Sdim
1641234353Sdim/*
1642234353Sdim *	vm_page_queue_remove:
1643234353Sdim *
1644234353Sdim *	Remove the given page from the specified queue.
1645234353Sdim *
1646234353Sdim *	The page and page queues must be locked.
1647234353Sdim */
1648234353Sdimstatic __inline void
1649234353Sdimvm_page_queue_remove(int queue, vm_page_t m)
1650234353Sdim{
1651234353Sdim	struct vpgqueues *pq;
1652234353Sdim
1653234353Sdim	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1654234353Sdim	vm_page_lock_assert(m, MA_OWNED);
1655234353Sdim	pq = &vm_page_queues[queue];
1656234353Sdim	TAILQ_REMOVE(&pq->pl, m, pageq);
1657234353Sdim	(*pq->cnt)--;
1658221345Sdim}
1659193326Sed
1660193326Sed/*
1661193326Sed *	vm_pageq_remove:
1662193326Sed *
1663221345Sdim *	Remove a page from its queue.
1664199512Srdivacky *
1665199512Srdivacky *	The given page must be locked.
1666199512Srdivacky *	This routine may not block.
1667198092Srdivacky */
1668193326Sedvoid
1669193326Sedvm_pageq_remove(vm_page_t m)
1670234353Sdim{
1671193326Sed	int queue;
1672193326Sed
1673193326Sed	vm_page_lock_assert(m, MA_OWNED);
1674198092Srdivacky	if ((queue = m->queue) != PQ_NONE) {
1675239462Sdim		vm_page_lock_queues();
1676239462Sdim		m->queue = PQ_NONE;
1677239462Sdim		vm_page_queue_remove(queue, m);
1678239462Sdim		vm_page_unlock_queues();
1679239462Sdim	}
1680239462Sdim}
1681239462Sdim
1682239462Sdim/*
1683239462Sdim *	vm_page_enqueue:
1684239462Sdim *
1685239462Sdim *	Add the given page to the specified queue.
1686239462Sdim *
1687239462Sdim *	The page queues must be locked.
1688239462Sdim */
1689239462Sdimstatic void
1690239462Sdimvm_page_enqueue(int queue, vm_page_t m)
1691193326Sed{
1692193326Sed	struct vpgqueues *vpq;
1693193326Sed
1694226633Sdim	vpq = &vm_page_queues[queue];
1695198092Srdivacky	m->queue = queue;
1696198092Srdivacky	TAILQ_INSERT_TAIL(&vpq->pl, m, pageq);
1697193326Sed	++*vpq->cnt;
1698193326Sed}
1699193326Sed
1700193326Sed/*
1701226633Sdim *	vm_page_activate:
1702199990Srdivacky *
1703199990Srdivacky *	Put the specified page on the active list (if appropriate).
1704193326Sed *	Ensure that act_count is at least ACT_INIT but do not otherwise
1705193326Sed *	mess with it.
1706193326Sed *
1707193326Sed *	The page must be locked.
1708199512Srdivacky *	This routine may not block.
1709193326Sed */
1710201361Srdivackyvoid
1711201361Srdivackyvm_page_activate(vm_page_t m)
1712198092Srdivacky{
1713193326Sed	int queue;
1714199512Srdivacky
1715199512Srdivacky	vm_page_lock_assert(m, MA_OWNED);
1716193326Sed	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
1717193326Sed	if ((queue = m->queue) != PQ_ACTIVE) {
1718199512Srdivacky		if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) {
1719193326Sed			if (m->act_count < ACT_INIT)
1720193326Sed				m->act_count = ACT_INIT;
1721193326Sed			vm_page_lock_queues();
1722199512Srdivacky			if (queue != PQ_NONE)
1723199512Srdivacky				vm_page_queue_remove(queue, m);
1724199512Srdivacky			vm_page_enqueue(PQ_ACTIVE, m);
1725199512Srdivacky			vm_page_unlock_queues();
1726193326Sed		} else
1727193326Sed			KASSERT(queue == PQ_NONE,
1728234353Sdim			    ("vm_page_activate: wired page %p is queued", m));
1729234353Sdim	} else {
1730193326Sed		if (m->act_count < ACT_INIT)
1731263508Sdim			m->act_count = ACT_INIT;
1732263508Sdim	}
1733263508Sdim}
1734263508Sdim
1735263508Sdim/*
1736263508Sdim *	vm_page_free_wakeup:
1737201361Srdivacky *
1738234353Sdim *	Helper routine for vm_page_free_toq() and vm_page_cache().  This
1739193326Sed *	routine is called when a page has been added to the cache or free
1740193326Sed *	queues.
1741234353Sdim *
1742202379Srdivacky *	The page queues must be locked.
1743202379Srdivacky *	This routine may not block.
1744202379Srdivacky */
1745202379Srdivackystatic inline void
1746193326Sedvm_page_free_wakeup(void)
1747193326Sed{
1748202379Srdivacky
1749202379Srdivacky	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
1750202379Srdivacky	/*
1751202379Srdivacky	 * If the pageout daemon needs pages, then tell it that there are
1752202379Srdivacky	 * some free.
1753202379Srdivacky	 */
1754202379Srdivacky	if (vm_pageout_pages_needed &&
1755202379Srdivacky	    cnt.v_cache_count + cnt.v_free_count >= cnt.v_pageout_free_min) {
1756202379Srdivacky		wakeup(&vm_pageout_pages_needed);
1757202379Srdivacky		vm_pageout_pages_needed = 0;
1758202379Srdivacky	}
1759202379Srdivacky	/*
1760202379Srdivacky	 * wakeup processes that are waiting on memory if we hit a
1761202379Srdivacky	 * high water mark. And wakeup scheduler process if we have
1762202379Srdivacky	 * lots of memory. this process will swapin processes.
1763202379Srdivacky	 */
1764202379Srdivacky	if (vm_pages_needed && !vm_page_count_min()) {
1765202379Srdivacky		vm_pages_needed = 0;
1766202379Srdivacky		wakeup(&cnt.v_free_count);
1767202379Srdivacky	}
1768202379Srdivacky}
1769202379Srdivacky
1770221345Sdim/*
1771202379Srdivacky *	vm_page_free_toq:
1772202379Srdivacky *
1773193326Sed *	Returns the given page to the free list,
1774193326Sed *	disassociating it with any VM object.
1775193326Sed *
1776193326Sed *	Object and page must be locked prior to entry.
1777193326Sed *	This routine may not block.
1778193326Sed */
1779234353Sdim
1780234353Sdimvoid
1781234353Sdimvm_page_free_toq(vm_page_t m)
1782234353Sdim{
1783193326Sed
1784193326Sed	if ((m->oflags & VPO_UNMANAGED) == 0) {
1785193576Sed		vm_page_lock_assert(m, MA_OWNED);
1786193576Sed		KASSERT(!pmap_page_is_mapped(m),
1787204643Srdivacky		    ("vm_page_free_toq: freeing mapped page %p", m));
1788193576Sed	}
1789193576Sed	PCPU_INC(cnt.v_tfree);
1790193326Sed
1791198092Srdivacky	if (VM_PAGE_IS_FREE(m))
1792193326Sed		panic("vm_page_free: freeing free page %p", m);
1793193326Sed	else if (m->busy != 0)
1794193326Sed		panic("vm_page_free: freeing busy page %p", m);
1795193326Sed
1796193326Sed	/*
1797193326Sed	 * unqueue, then remove page.  Note that we cannot destroy
1798193326Sed	 * the page here because we do not want to call the pager's
1799193326Sed	 * callback routine until after we've put the page on the
1800198092Srdivacky	 * appropriate free queue.
1801212904Sdim	 */
1802239462Sdim	if ((m->oflags & VPO_UNMANAGED) == 0)
1803239462Sdim		vm_pageq_remove(m);
1804239462Sdim	vm_page_remove(m);
1805239462Sdim
1806239462Sdim	/*
1807239462Sdim	 * If the page is fictitious, remove the object association and
1808239462Sdim	 * return; otherwise, delay the removal of the object association.
1809243830Sdim	 */
1810243830Sdim	if ((m->flags & PG_FICTITIOUS) != 0) {
1811243830Sdim		return;
1812243830Sdim	}
1813243830Sdim
1814243830Sdim	m->valid = 0;
1815243830Sdim	vm_page_undirty(m);
1816243830Sdim
1817243830Sdim	if (m->wire_count != 0)
1818243830Sdim		panic("vm_page_free: freeing wired page %p", m);
1819243830Sdim	if (m->hold_count != 0) {
1820243830Sdim		m->flags &= ~PG_ZERO;
1821243830Sdim		vm_page_lock_queues();
1822243830Sdim		vm_page_enqueue(PQ_HOLD, m);
1823243830Sdim		vm_page_unlock_queues();
1824243830Sdim	} else {
1825243830Sdim		/*
1826243830Sdim		 * Restore the default memory attribute to the page.
1827243830Sdim		 */
1828212904Sdim		if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT)
1829226633Sdim			pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT);
1830212904Sdim
1831212904Sdim		/*
1832226633Sdim		 * Insert the page into the physical memory allocator's
1833226633Sdim		 * cache/free page queues.
1834226633Sdim		 */
1835226633Sdim		mtx_lock(&vm_page_queue_free_mtx);
1836226633Sdim		m->flags |= PG_FREE;
1837212904Sdim		cnt.v_free_count++;
1838212904Sdim#if VM_NRESERVLEVEL > 0
1839212904Sdim		if (!vm_reserv_free_page(m))
1840193326Sed#else
1841212904Sdim		if (TRUE)
1842193326Sed#endif
1843193326Sed			vm_phys_free_pages(m, 0);
1844193326Sed		if ((m->flags & PG_ZERO) != 0)
1845193326Sed			++vm_page_zero_count;
1846249423Sdim		else
1847249423Sdim			vm_page_zero_idle_wakeup();
1848249423Sdim		vm_page_free_wakeup();
1849249423Sdim		mtx_unlock(&vm_page_queue_free_mtx);
1850249423Sdim	}
1851249423Sdim}
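
/*
 * Illustrative note (an assumption about typical usage): most callers do
 * not invoke vm_page_free_toq() directly but reach it through the
 * vm_page_free() wrapper with the page lock held, as vm_page_cowfault()
 * does later in this file:
 *
 *	vm_page_lock(m);
 *	vm_page_free(m);
 *	vm_page_unlock(m);
 */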
1852249423Sdim
1853249423Sdim/*
1854249423Sdim *	vm_page_wire:
1855249423Sdim *
1856249423Sdim *	Mark this page as wired down by yet
1857249423Sdim *	another map, removing it from paging queues
1858249423Sdim *	as necessary.
1859249423Sdim *
1860249423Sdim *	If the page is fictitious, then its wire count must remain one.
1861249423Sdim *
1862249423Sdim *	The page must be locked.
1863249423Sdim *	This routine may not block.
1864249423Sdim */
1865249423Sdimvoid
1866249423Sdimvm_page_wire(vm_page_t m)
1867249423Sdim{
1868249423Sdim
1869249423Sdim	/*
1870249423Sdim	 * Only bump the wire statistics if the page is not already wired,
1871249423Sdim	 * and only unqueue the page if it is on some queue (if it is unmanaged
1872249423Sdim	 * it is already off the queues).
1873249423Sdim	 */
1874249423Sdim	vm_page_lock_assert(m, MA_OWNED);
1875249423Sdim	if ((m->flags & PG_FICTITIOUS) != 0) {
1876249423Sdim		KASSERT(m->wire_count == 1,
1877249423Sdim		    ("vm_page_wire: fictitious page %p's wire count isn't one",
1878249423Sdim		    m));
1879249423Sdim		return;
1880249423Sdim	}
1881249423Sdim	if (m->wire_count == 0) {
1882249423Sdim		if ((m->oflags & VPO_UNMANAGED) == 0)
1883249423Sdim			vm_pageq_remove(m);
1884249423Sdim		atomic_add_int(&cnt.v_wire_count, 1);
1885249423Sdim	}
1886249423Sdim	m->wire_count++;
1887249423Sdim	KASSERT(m->wire_count != 0, ("vm_page_wire: wire_count overflow m=%p", m));
1888249423Sdim}
1889249423Sdim
1890249423Sdim/*
1891249423Sdim * vm_page_unwire:
1892249423Sdim *
1893249423Sdim * Release one wiring of the specified page, potentially enabling it to be
1894249423Sdim * paged again.  If paging is enabled, then the value of the parameter
1895249423Sdim * "activate" determines to which queue the page is added.  If "activate" is
1896249423Sdim * non-zero, then the page is added to the active queue.  Otherwise, it is
1897249423Sdim * added to the inactive queue.
1898249423Sdim *
1899249423Sdim * However, unless the page belongs to an object, it is not enqueued because
1900249423Sdim * it cannot be paged out.
1901249423Sdim *
1902249423Sdim * If a page is fictitious, then its wire count must always be one.
1903249423Sdim *
1904249423Sdim * A managed page must be locked.
1905249423Sdim */
1906249423Sdimvoid
1907249423Sdimvm_page_unwire(vm_page_t m, int activate)
1908249423Sdim{
1909249423Sdim
1910249423Sdim	if ((m->oflags & VPO_UNMANAGED) == 0)
1911249423Sdim		vm_page_lock_assert(m, MA_OWNED);
1912249423Sdim	if ((m->flags & PG_FICTITIOUS) != 0) {
1913249423Sdim		KASSERT(m->wire_count == 1,
1914249423Sdim	    ("vm_page_unwire: fictitious page %p's wire count isn't one", m));
1915249423Sdim		return;
1916249423Sdim	}
1917249423Sdim	if (m->wire_count > 0) {
1918249423Sdim		m->wire_count--;
1919249423Sdim		if (m->wire_count == 0) {
1920249423Sdim			atomic_subtract_int(&cnt.v_wire_count, 1);
1921249423Sdim			if ((m->oflags & VPO_UNMANAGED) != 0 ||
1922249423Sdim			    m->object == NULL)
1923249423Sdim				return;
1924249423Sdim			vm_page_lock_queues();
1925249423Sdim			if (activate)
1926249423Sdim				vm_page_enqueue(PQ_ACTIVE, m);
1927249423Sdim			else {
1928249423Sdim				m->flags &= ~PG_WINATCFLS;
1929249423Sdim				vm_page_enqueue(PQ_INACTIVE, m);
1930249423Sdim			}
1931249423Sdim			vm_page_unlock_queues();
1932249423Sdim		}
1933249423Sdim	} else
1934249423Sdim		panic("vm_page_unwire: page %p's wire count is zero", m);
1935249423Sdim}
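
/*
 * Illustrative sketch (an assumption, not part of the original code): a
 * caller that temporarily pins a managed page typically brackets the
 * access with a wire/unwire pair under the page lock:
 *
 *	vm_page_lock(m);
 *	vm_page_wire(m);
 *	vm_page_unlock(m);
 *	... access the page's contents ...
 *	vm_page_lock(m);
 *	vm_page_unwire(m, 0);		(requeue on the inactive queue)
 *	vm_page_unlock(m);
 */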
1936249423Sdim
1937249423Sdim/*
1938249423Sdim * Move the specified page to the inactive queue.
1939249423Sdim *
1940249423Sdim * Many pages placed on the inactive queue should actually go
1941249423Sdim * into the cache, but it is difficult to figure out which.  What
1942249423Sdim * we do instead, if the inactive target is well met, is to put
1943249423Sdim * clean pages at the head of the inactive queue instead of the tail.
1944249423Sdim * This will cause them to be moved to the cache more quickly and
1945249423Sdim * if not actively re-referenced, reclaimed more quickly.  If we just
1946249423Sdim * stick these pages at the end of the inactive queue, heavy filesystem
1947249423Sdim * meta-data accesses can cause an unnecessary paging load on memory bound
1948249423Sdim * processes.  This optimization causes one-time-use metadata to be
1949249423Sdim * reused more quickly.
1950249423Sdim *
1951249423Sdim * Normally athead is 0 resulting in LRU operation.  athead is set
1952249423Sdim * to 1 if we want this page to be 'as if it were placed in the cache',
1953249423Sdim * except without unmapping it from the process address space.
1954249423Sdim *
1955249423Sdim * This routine may not block.
1956249423Sdim */
1957249423Sdimstatic inline void
1958249423Sdim_vm_page_deactivate(vm_page_t m, int athead)
1959249423Sdim{
1960249423Sdim	int queue;
1961249423Sdim
1962249423Sdim	vm_page_lock_assert(m, MA_OWNED);
1963249423Sdim
1964249423Sdim	/*
1965249423Sdim	 * Ignore if already inactive.
1966249423Sdim	 */
1967249423Sdim	if ((queue = m->queue) == PQ_INACTIVE)
1968249423Sdim		return;
1969249423Sdim	if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) {
1970249423Sdim		vm_page_lock_queues();
1971239462Sdim		m->flags &= ~PG_WINATCFLS;
1972249423Sdim		if (queue != PQ_NONE)
1973239462Sdim			vm_page_queue_remove(queue, m);
1974249423Sdim		if (athead)
1975249423Sdim			TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE].pl, m,
1976249423Sdim			    pageq);
1977249423Sdim		else
1978249423Sdim			TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl, m,
1979239462Sdim			    pageq);
1980249423Sdim		m->queue = PQ_INACTIVE;
1981249423Sdim		cnt.v_inactive_count++;
1982249423Sdim		vm_page_unlock_queues();
1983249423Sdim	}
1984249423Sdim}
1985239462Sdim
1986249423Sdim/*
1987249423Sdim * Move the specified page to the inactive queue.
1988239462Sdim *
1989249423Sdim * The page must be locked.
1990249423Sdim */
1991239462Sdimvoid
1992249423Sdimvm_page_deactivate(vm_page_t m)
1993249423Sdim{
1994249423Sdim
1995239462Sdim	_vm_page_deactivate(m, 0);
1996249423Sdim}
1997249423Sdim
1998249423Sdim/*
1999249423Sdim * vm_page_try_to_cache:
2000249423Sdim *
2001249423Sdim * Returns 0 on failure, 1 on success
2002239462Sdim */
2003249423Sdimint
2004239462Sdimvm_page_try_to_cache(vm_page_t m)
2005239462Sdim{
2006239462Sdim
2007239462Sdim	vm_page_lock_assert(m, MA_OWNED);
2008239462Sdim	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
2009239462Sdim	if (m->dirty || m->hold_count || m->busy || m->wire_count ||
2010239462Sdim	    (m->oflags & (VPO_BUSY | VPO_UNMANAGED)) != 0)
2011239462Sdim		return (0);
2012239462Sdim	pmap_remove_all(m);
2013239462Sdim	if (m->dirty)
2014239462Sdim		return (0);
2015239462Sdim	vm_page_cache(m);
2016239462Sdim	return (1);
2017263508Sdim}
2018263508Sdim
2019263508Sdim/*
2020263508Sdim * vm_page_try_to_free()
2021263508Sdim *
2022263508Sdim *	Attempt to free the page.  If we cannot free it, we do nothing.
2023239462Sdim *	1 is returned on success, 0 on failure.
2024239462Sdim */
2025263508Sdimint
2026239462Sdimvm_page_try_to_free(vm_page_t m)
2027239462Sdim{
2028239462Sdim
2029239462Sdim	vm_page_lock_assert(m, MA_OWNED);
2030239462Sdim	if (m->object != NULL)
2031239462Sdim		VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
2032239462Sdim	if (m->dirty || m->hold_count || m->busy || m->wire_count ||
2033239462Sdim	    (m->oflags & (VPO_BUSY | VPO_UNMANAGED)) != 0)
2034239462Sdim		return (0);
2035239462Sdim	pmap_remove_all(m);
2036239462Sdim	if (m->dirty)
2037239462Sdim		return (0);
2038239462Sdim	vm_page_free(m);
2039239462Sdim	return (1);
2040239462Sdim}
2041239462Sdim
2042239462Sdim/*
2043239462Sdim * vm_page_cache
2044239462Sdim *
2045239462Sdim * Put the specified page onto the page cache queue (if appropriate).
2046239462Sdim *
2047239462Sdim * This routine may not block.
2048239462Sdim */
2049239462Sdimvoid
2050239462Sdimvm_page_cache(vm_page_t m)
2051239462Sdim{
2052239462Sdim	vm_object_t object;
2053239462Sdim	vm_page_t next, prev, root;
2054263508Sdim
2055263508Sdim	vm_page_lock_assert(m, MA_OWNED);
2056263508Sdim	object = m->object;
2057263508Sdim	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
2058263508Sdim	if ((m->oflags & (VPO_UNMANAGED | VPO_BUSY)) || m->busy ||
2059263508Sdim	    m->hold_count || m->wire_count)
2060263508Sdim		panic("vm_page_cache: attempting to cache busy page");
2061263508Sdim	pmap_remove_all(m);
2062263508Sdim	if (m->dirty != 0)
2063263508Sdim		panic("vm_page_cache: page %p is dirty", m);
2064263508Sdim	if (m->valid == 0 || object->type == OBJT_DEFAULT ||
2065263508Sdim	    (object->type == OBJT_SWAP &&
2066263508Sdim	    !vm_pager_has_page(object, m->pindex, NULL, NULL))) {
2067263508Sdim		/*
2068263508Sdim		 * Hypothesis: A cache-eligible page belonging to a
2069263508Sdim		 * default object or swap object but without a backing
2070263508Sdim		 * store must be zero filled.
2071263508Sdim		 */
2072263508Sdim		vm_page_free(m);
2073263508Sdim		return;
2074263508Sdim	}
2075263508Sdim	KASSERT((m->flags & PG_CACHED) == 0,
2076263508Sdim	    ("vm_page_cache: page %p is already cached", m));
2077263508Sdim	PCPU_INC(cnt.v_tcached);
2078263508Sdim
2079263508Sdim	/*
2080239462Sdim	 * Remove the page from the paging queues.
2081239462Sdim	 */
2082239462Sdim	vm_pageq_remove(m);
2083239462Sdim
2084249423Sdim	/*
2085249423Sdim	 * Remove the page from the object's collection of resident
2086249423Sdim	 * pages.
2087249423Sdim	 */
2088249423Sdim	if ((next = TAILQ_NEXT(m, listq)) != NULL && next->left == m) {
2089249423Sdim		/*
2090249423Sdim		 * Since the page's successor in the list is also its parent
2091249423Sdim		 * in the tree, its right subtree must be empty.
2092249423Sdim		 */
2093249423Sdim		next->left = m->left;
2094249423Sdim		KASSERT(m->right == NULL,
2095249423Sdim		    ("vm_page_cache: page %p has right child", m));
2096249423Sdim	} else if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL &&
2097249423Sdim	    prev->right == m) {
2098249423Sdim		/*
2099249423Sdim		 * Since the page's predecessor in the list is also its parent
2100249423Sdim		 * in the tree, its left subtree must be empty.
2101249423Sdim		 */
2102249423Sdim		KASSERT(m->left == NULL,
2103249423Sdim		    ("vm_page_cache: page %p has left child", m));
2104249423Sdim		prev->right = m->right;
2105249423Sdim	} else {
2106249423Sdim		if (m != object->root)
2107249423Sdim			vm_page_splay(m->pindex, object->root);
2108249423Sdim		if (m->left == NULL)
2109239462Sdim			root = m->right;
2110239462Sdim		else if (m->right == NULL)
2111239462Sdim			root = m->left;
2112239462Sdim		else {
2113239462Sdim			/*
2114239462Sdim			 * Move the page's successor to the root, because
2115239462Sdim			 * pages are usually removed in ascending order.
2116239462Sdim			 */
2117221345Sdim			if (m->right != next)
2118249423Sdim				vm_page_splay(m->pindex, m->right);
2119249423Sdim			next->left = m->left;
2120263508Sdim			root = next;
2121263508Sdim		}
2122263508Sdim		object->root = root;
2123263508Sdim	}
2124263508Sdim	TAILQ_REMOVE(&object->memq, m, listq);
2125263508Sdim	object->resident_page_count--;
2126249423Sdim
2127249423Sdim	/*
2128249423Sdim	 * Restore the default memory attribute to the page.
2129239462Sdim	 */
2130239462Sdim	if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT)
2131239462Sdim		pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT);
2132234353Sdim
2133212904Sdim	/*
2134221345Sdim	 * Insert the page into the object's collection of cached pages
2135234353Sdim	 * and the physical memory allocator's cache/free page queues.
2136221345Sdim	 */
2137212904Sdim	m->flags &= ~PG_ZERO;
2138212904Sdim	mtx_lock(&vm_page_queue_free_mtx);
2139234353Sdim	m->flags |= PG_CACHED;
2140221345Sdim	cnt.v_cache_count++;
2141218893Sdim	root = object->cache;
2142234353Sdim	if (root == NULL) {
2143234353Sdim		m->left = NULL;
2144234353Sdim		m->right = NULL;
2145249423Sdim	} else {
2146226633Sdim		root = vm_page_splay(m->pindex, root);
2147249423Sdim		if (m->pindex < root->pindex) {
2148249423Sdim			m->left = root->left;
2149249423Sdim			m->right = root;
2150249423Sdim			root->left = NULL;
2151249423Sdim		} else if (__predict_false(m->pindex == root->pindex))
2152249423Sdim			panic("vm_page_cache: offset already cached");
2153226633Sdim		else {
2154249423Sdim			m->right = root->right;
2155249423Sdim			m->left = root;
2156249423Sdim			root->right = NULL;
2157249423Sdim		}
2158249423Sdim	}
2159249423Sdim	object->cache = m;
2160249423Sdim#if VM_NRESERVLEVEL > 0
2161249423Sdim	if (!vm_reserv_free_page(m)) {
2162249423Sdim#else
2163263508Sdim	if (TRUE) {
2164263508Sdim#endif
2165263508Sdim		vm_phys_set_pool(VM_FREEPOOL_CACHE, m, 0);
2166263508Sdim		vm_phys_free_pages(m, 0);
2167249423Sdim	}
2168221345Sdim	vm_page_free_wakeup();
2169193326Sed	mtx_unlock(&vm_page_queue_free_mtx);
2170221345Sdim
2171249423Sdim	/*
2172249423Sdim	 * Increment the vnode's hold count if this is the object's only
2173249423Sdim	 * cached page.  Decrement the vnode's hold count if this was
2174234353Sdim	 * the object's only resident page.
2175193326Sed	 */
2176193326Sed	if (object->type == OBJT_VNODE) {
2177221345Sdim		if (root == NULL && object->resident_page_count != 0)
2178221345Sdim			vhold(object->handle);
2179221345Sdim		else if (root != NULL && object->resident_page_count == 0)
2180221345Sdim			vdrop(object->handle);
2181249423Sdim	}
2182249423Sdim}
2183249423Sdim
2184249423Sdim/*
2185249423Sdim * vm_page_dontneed
2186249423Sdim *
2187249423Sdim *	Cache, deactivate, or do nothing as appropriate.  This routine
2188249423Sdim *	is typically used by madvise() MADV_DONTNEED.
2189249423Sdim *
2190249423Sdim *	Generally speaking we want to move the page into the cache so
2191249423Sdim *	it gets reused quickly.  However, this can result in a silly syndrome
2192249423Sdim *	due to the page recycling too quickly.  Small objects will not be
2193263508Sdim *	fully cached.  On the other hand, if we move the page to the inactive
2194249423Sdim *	queue we wind up with a problem whereby very large objects
2195249423Sdim *	unnecessarily blow away our inactive and cache queues.
2196249423Sdim *
2197249423Sdim *	The solution is to move the pages based on a fixed weighting.  We
2198249423Sdim *	either leave them alone, deactivate them, or move them to the cache,
2199249423Sdim *	where moving them to the cache has the highest weighting.
2200221345Sdim *	By forcing some pages into other queues we eventually force the
2201221345Sdim *	system to balance the queues, potentially recovering other unrelated
2202221345Sdim *	space from active.  The idea is to not force this to happen too
2203221345Sdim *	often.
2204221345Sdim */
2205221345Sdimvoid
2206221345Sdimvm_page_dontneed(vm_page_t m)
2207221345Sdim{
2208221345Sdim	int dnw;
2209221345Sdim	int head;
2210221345Sdim
2211221345Sdim	vm_page_lock_assert(m, MA_OWNED);
2212221345Sdim	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
2213249423Sdim	dnw = PCPU_GET(dnweight);
2214249423Sdim	PCPU_INC(dnweight);
2215221345Sdim
2216221345Sdim	/*
2217221345Sdim	 * Occasionally leave the page alone.
2218221345Sdim	 */
2219221345Sdim	if ((dnw & 0x01F0) == 0 || m->queue == PQ_INACTIVE) {
2220221345Sdim		if (m->act_count >= ACT_INIT)
2221221345Sdim			--m->act_count;
2222221345Sdim		return;
2223221345Sdim	}
2224212904Sdim
2225212904Sdim	/*
2226193326Sed	 * Clear any references to the page.  Otherwise, the page daemon will
2227193326Sed	 * immediately reactivate the page.
2228193326Sed	 *
2229193326Sed	 * Perform the pmap_clear_reference() first.  Otherwise, a concurrent
2230193326Sed	 * pmap operation, such as pmap_remove(), could clear a reference in
2231193326Sed	 * the pmap and set PGA_REFERENCED on the page before the
2232193326Sed	 * pmap_clear_reference() had completed.  Consequently, the page would
2233193326Sed	 * appear referenced based upon an old reference that occurred before
2234212904Sdim	 * this function ran.
2235200583Srdivacky	 */
2236200583Srdivacky	pmap_clear_reference(m);
2237207619Srdivacky	vm_page_aflag_clear(m, PGA_REFERENCED);
2238200583Srdivacky
2239223017Sdim	if (m->dirty == 0 && pmap_is_modified(m))
2240223017Sdim		vm_page_dirty(m);
2241223017Sdim
2242201361Srdivacky	if (m->dirty || (dnw & 0x0070) == 0) {
2243200583Srdivacky		/*
2244223017Sdim		 * Deactivate the page 3 times out of 32.
2245223017Sdim		 */
2246223017Sdim		head = 0;
2247223017Sdim	} else {
2248200583Srdivacky		/*
2249223017Sdim		 * Cache the page 28 times out of every 32.  Note that
2250223017Sdim		 * the page is deactivated instead of cached, but placed
2251226633Sdim		 * at the head of the queue instead of the tail.
2252226633Sdim		 */
2253223017Sdim		head = 1;
2254223017Sdim	}
2255223017Sdim	_vm_page_deactivate(m, head);
2256200583Srdivacky}
2257200583Srdivacky
2258210299Sed/*
2259204643Srdivacky * Grab a page, waiting until we are woken up due to the page
2260204643Srdivacky * changing state.  We keep on waiting, if the page continues
2261204643Srdivacky * to be in the object.  If the page doesn't exist, first allocate it
2262204643Srdivacky * and then conditionally zero it.
2263224145Sdim *
2264224145Sdim * The caller must always specify the VM_ALLOC_RETRY flag.  This is intended
2265204643Srdivacky * to facilitate its eventual removal.
2266212904Sdim *
2267204643Srdivacky * This routine may block.
2268204643Srdivacky */
2269263508Sdimvm_page_t
2270263508Sdimvm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags)
2271263508Sdim{
2272263508Sdim	vm_page_t m;
2273263508Sdim
2274243830Sdim	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
2275243830Sdim	KASSERT((allocflags & VM_ALLOC_RETRY) != 0,
2276249423Sdim	    ("vm_page_grab: VM_ALLOC_RETRY is required"));
2277249423Sdimretrylookup:
2278249423Sdim	if ((m = vm_page_lookup(object, pindex)) != NULL) {
2279249423Sdim		if ((m->oflags & VPO_BUSY) != 0 ||
2280249423Sdim		    ((allocflags & VM_ALLOC_IGN_SBUSY) == 0 && m->busy != 0)) {
2281249423Sdim			/*
2282249423Sdim			 * Reference the page before unlocking and
2283251662Sdim			 * sleeping so that the page daemon is less
2284249423Sdim			 * likely to reclaim it.
2285251662Sdim			 */
2286249423Sdim			vm_page_aflag_set(m, PGA_REFERENCED);
2287249423Sdim			vm_page_sleep(m, "pgrbwt");
2288249423Sdim			goto retrylookup;
2289249423Sdim		} else {
2290193326Sed			if ((allocflags & VM_ALLOC_WIRED) != 0) {
2291193326Sed				vm_page_lock(m);
2292193326Sed				vm_page_wire(m);
2293193326Sed				vm_page_unlock(m);
2294193326Sed			}
2295193326Sed			if ((allocflags & VM_ALLOC_NOBUSY) == 0)
2296193326Sed				vm_page_busy(m);
2297193326Sed			return (m);
2298193326Sed		}
2299193326Sed	}
2300193326Sed	m = vm_page_alloc(object, pindex, allocflags & ~(VM_ALLOC_RETRY |
2301263508Sdim	    VM_ALLOC_IGN_SBUSY));
2302263508Sdim	if (m == NULL) {
2303193326Sed		VM_OBJECT_UNLOCK(object);
2304195099Sed		VM_WAIT;
2305198092Srdivacky		VM_OBJECT_LOCK(object);
2306195099Sed		goto retrylookup;
2307195099Sed	} else if (m->valid != 0)
2308198092Srdivacky		return (m);
2309195099Sed	if (allocflags & VM_ALLOC_ZERO && (m->flags & PG_ZERO) == 0)
2310193326Sed		pmap_zero_page(m);
2311200583Srdivacky	return (m);
2312249423Sdim}
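
/*
 * Illustrative sketch (not part of the original code): a typical caller
 * holds the object lock and asks for a busied, optionally zeroed page:
 *
 *	VM_OBJECT_LOCK(object);
 *	m = vm_page_grab(object, pindex,
 *	    VM_ALLOC_NORMAL | VM_ALLOC_RETRY | VM_ALLOC_ZERO);
 *	... the page is returned busied unless VM_ALLOC_NOBUSY was given ...
 *	VM_OBJECT_UNLOCK(object);
 */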
2313249423Sdim
2314249423Sdim/*
2315249423Sdim * Mapping function for valid bits or for dirty bits in
2316249423Sdim * a page.  May not block.
2317249423Sdim *
2318249423Sdim * Inputs are required to range within a page.
2319249423Sdim */
2320249423Sdimvm_page_bits_t
2321200583Srdivackyvm_page_bits(int base, int size)
2322200583Srdivacky{
2323200583Srdivacky	int first_bit;
2324200583Srdivacky	int last_bit;
2325200583Srdivacky
2326200583Srdivacky	KASSERT(
2327200583Srdivacky	    base + size <= PAGE_SIZE,
2328200583Srdivacky	    ("vm_page_bits: illegal base/size %d/%d", base, size)
2329193326Sed	);
2330193326Sed
2331193326Sed	if (size == 0)		/* handle degenerate case */
2332193326Sed		return (0);
2333193326Sed
2334193326Sed	first_bit = base >> DEV_BSHIFT;
2335263508Sdim	last_bit = (base + size - 1) >> DEV_BSHIFT;
2336263508Sdim
2337263508Sdim	return (((vm_page_bits_t)2 << last_bit) -
2338263508Sdim	    ((vm_page_bits_t)1 << first_bit));
2339193326Sed}
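
/*
 * Worked example (assuming DEV_BSIZE is 512, i.e. DEV_BSHIFT is 9):
 * vm_page_bits(0, 1024) yields first_bit = 0 and last_bit = 1, so the
 * result is (2 << 1) - (1 << 0) = 3, i.e. the two low-order bits that
 * cover the first two 512-byte chunks of the page.
 */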
2340193326Sed
2341193326Sed/*
2342193326Sed *	vm_page_set_valid:
2343193326Sed *
2344193326Sed *	Sets portions of a page valid.  The arguments are expected
2345193326Sed *	to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive
2346198092Srdivacky *	of any partial chunks touched by the range.  The invalid portion of
2347193326Sed *	such chunks will be zeroed.
2348198092Srdivacky *
2349204643Srdivacky *	(base + size) must be less than or equal to PAGE_SIZE.
2350204643Srdivacky */
2351249423Sdimvoid
2352249423Sdimvm_page_set_valid(vm_page_t m, int base, int size)
2353193326Sed{
2354212904Sdim	int endoff, frag;
2355263508Sdim
2356249423Sdim	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
2357234353Sdim	if (size == 0)	/* handle degenerate case */
2358234353Sdim		return;
2359221345Sdim
2360221345Sdim	/*
2361221345Sdim	 * If the base is not DEV_BSIZE aligned and the valid
2362221345Sdim	 * bit is clear, we have to zero out a portion of the
2363221345Sdim	 * first block.
2364221345Sdim	 */
2365221345Sdim	if ((frag = base & ~(DEV_BSIZE - 1)) != base &&
2366193326Sed	    (m->valid & (1 << (base >> DEV_BSHIFT))) == 0)
2367193326Sed		pmap_zero_page_area(m, frag, base - frag);
2368263508Sdim
2369263508Sdim	/*
2370263508Sdim	 * If the ending offset is not DEV_BSIZE aligned and the
2371263508Sdim	 * valid bit is clear, we have to zero out a portion of
2372203955Srdivacky	 * the last block.
2373243830Sdim	 */
2374243830Sdim	endoff = base + size;
2375243830Sdim	if ((frag = endoff & ~(DEV_BSIZE - 1)) != endoff &&
2376263508Sdim	    (m->valid & (1 << (endoff >> DEV_BSHIFT))) == 0)
2377263508Sdim		pmap_zero_page_area(m, endoff,
2378263508Sdim		    DEV_BSIZE - (endoff & (DEV_BSIZE - 1)));
2379203955Srdivacky
2380203955Srdivacky	/*
2381203955Srdivacky	 * Assert that no previously invalid block that is now being validated
2382263508Sdim	 * is already dirty.
2383263508Sdim	 */
2384218893Sdim	KASSERT((~m->valid & vm_page_bits(base, size) & m->dirty) == 0,
2385263508Sdim	    ("vm_page_set_valid: page %p is dirty", m));
2386218893Sdim
2387218893Sdim	/*
2388218893Sdim	 * Set valid bits inclusive of any overlap.
2389243830Sdim	 */
2390263508Sdim	m->valid |= vm_page_bits(base, size);
2391263508Sdim}
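
/*
 * Worked example (assuming DEV_BSIZE is 512): vm_page_set_valid(m, 256, 512)
 * touches parts of the first two 512-byte blocks, so both of their valid
 * bits are set.  If those blocks were not already valid, the invalid
 * leading 256 bytes of the first block and the trailing 256 bytes of the
 * second block are zeroed first.
 */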
2392263508Sdim
2393263508Sdim/*
2394263508Sdim * Clear the given bits from the specified page's dirty field.
2395263508Sdim */
2396263508Sdimstatic __inline void
2397263508Sdimvm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits)
2398263508Sdim{
2399263508Sdim	uintptr_t addr;
2400263508Sdim#if PAGE_SIZE < 16384
2401263508Sdim	int shift;
2402263508Sdim#endif
2403263508Sdim
2404263508Sdim	/*
2405263508Sdim	 * If the object is locked and the page is neither VPO_BUSY nor
2406263508Sdim	 * PGA_WRITEABLE, then the page's dirty field cannot possibly be
2407263508Sdim	 * set by a concurrent pmap operation.
2408263508Sdim	 */
2409243830Sdim	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
2410263508Sdim	if ((m->oflags & VPO_BUSY) == 0 && (m->aflags & PGA_WRITEABLE) == 0)
2411263508Sdim		m->dirty &= ~pagebits;
2412263508Sdim	else {
2413263508Sdim		/*
2414203955Srdivacky		 * The pmap layer can call vm_page_dirty() without
2415203955Srdivacky		 * holding a distinguished lock.  The combination of
2416203955Srdivacky		 * the object's lock and an atomic operation suffice
2417218893Sdim		 * to guarantee consistency of the page dirty field.
2418218893Sdim		 *
2419218893Sdim		 * For PAGE_SIZE == 32768 case, compiler already
2420203955Srdivacky		 * properly aligns the dirty field, so no forcible
2421203955Srdivacky		 * alignment is needed. Only require existence of
2422210299Sed		 * atomic_clear_64 when page size is 32768.
2423221345Sdim		 */
2424221345Sdim		addr = (uintptr_t)&m->dirty;
2425221345Sdim#if PAGE_SIZE == 32768
2426210299Sed		atomic_clear_64((uint64_t *)addr, pagebits);
2427210299Sed#elif PAGE_SIZE == 16384
2428210299Sed		atomic_clear_32((uint32_t *)addr, pagebits);
2429210299Sed#else		/* PAGE_SIZE <= 8192 */
2430210299Sed		/*
2431210299Sed		 * Use a trick to perform a 32-bit atomic on the
2432218893Sdim		 * containing aligned word, to not depend on the existence
2433218893Sdim		 * of atomic_clear_{8, 16}.
2434218893Sdim		 */
2435210299Sed		shift = addr & (sizeof(uint32_t) - 1);
2436218893Sdim#if BYTE_ORDER == BIG_ENDIAN
2437226633Sdim		shift = (sizeof(uint32_t) - sizeof(m->dirty) - shift) * NBBY;
2438226633Sdim#else
2439226633Sdim		shift *= NBBY;
2440226633Sdim#endif
2441226633Sdim		addr &= ~(sizeof(uint32_t) - 1);
2442226633Sdim		atomic_clear_32((uint32_t *)addr, pagebits << shift);
2443226633Sdim#endif		/* PAGE_SIZE */
2444226633Sdim	}
2445226633Sdim}
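
/*
 * Worked example (an illustration under assumed layout): with 4096-byte
 * pages and a 512-byte DEV_BSIZE the dirty field is 8 bits wide.  If, on
 * a little-endian machine, that byte sits at offset 3 within its naturally
 * aligned 32-bit word, then shift = 3 * NBBY = 24, the address is rounded
 * down to the word boundary, and atomic_clear_32(addr, pagebits << 24)
 * clears exactly the requested dirty bits without disturbing the
 * neighboring bytes.
 */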
2446226633Sdim
2447226633Sdim/*
2448226633Sdim *	vm_page_set_validclean:
2449218893Sdim *
2450263508Sdim *	Sets portions of a page valid and clean.  The arguments are expected
2451263508Sdim *	to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive
2452263508Sdim *	of any partial chunks touched by the range.  The invalid portion of
2453218893Sdim *	such chunks will be zero'd.
2454263508Sdim *
2455218893Sdim *	This routine may not block.
2456249423Sdim *
2457249423Sdim *	(base + size) must be less than or equal to PAGE_SIZE.
2458249423Sdim */
2459249423Sdimvoid
2460249423Sdimvm_page_set_validclean(vm_page_t m, int base, int size)
2461249423Sdim{
2462249423Sdim	vm_page_bits_t oldvalid, pagebits;
2463249423Sdim	int endoff, frag;
2464249423Sdim
2465249423Sdim	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
2466249423Sdim	if (size == 0)	/* handle degenerate case */
2467249423Sdim		return;
2468249423Sdim
2469249423Sdim	/*
2470249423Sdim	 * If the base is not DEV_BSIZE aligned and the valid
2471249423Sdim	 * bit is clear, we have to zero out a portion of the
2472249423Sdim	 * first block.
2473210299Sed	 */
2474234353Sdim	if ((frag = base & ~(DEV_BSIZE - 1)) != base &&
2475193326Sed	    (m->valid & ((vm_page_bits_t)1 << (base >> DEV_BSHIFT))) == 0)
2476193326Sed		pmap_zero_page_area(m, frag, base - frag);
2477198092Srdivacky
2478193326Sed	/*
2479251662Sdim	 * If the ending offset is not DEV_BSIZE aligned and the
2480251662Sdim	 * valid bit is clear, we have to zero out a portion of
2481263508Sdim	 * the last block.
2482251662Sdim	 */
2483251662Sdim	endoff = base + size;
2484251662Sdim	if ((frag = endoff & ~(DEV_BSIZE - 1)) != endoff &&
2485251662Sdim	    (m->valid & ((vm_page_bits_t)1 << (endoff >> DEV_BSHIFT))) == 0)
2486251662Sdim		pmap_zero_page_area(m, endoff,
2487251662Sdim		    DEV_BSIZE - (endoff & (DEV_BSIZE - 1)));
2488251662Sdim
2489251662Sdim	/*
2490251662Sdim	 * Set valid, clear dirty bits.  If validating the entire
2491208600Srdivacky	 * page we can safely clear the pmap modify bit.  We also
2492263508Sdim	 * use this opportunity to clear the VPO_NOSYNC flag.  If a process
2493263508Sdim	 * takes a write fault on a MAP_NOSYNC memory area the flag will
2494263508Sdim	 * be set again.
2495251662Sdim	 *
2496251662Sdim	 * We set valid bits inclusive of any overlap, but we can only
2497208600Srdivacky	 * clear dirty bits for DEV_BSIZE chunks that are fully within
2498208600Srdivacky	 * the range.
2499218893Sdim	 */
2500218893Sdim	oldvalid = m->valid;
2501218893Sdim	pagebits = vm_page_bits(base, size);
2502218893Sdim	m->valid |= pagebits;
2503218893Sdim#if 0	/* NOT YET */
2504208600Srdivacky	if ((frag = base & (DEV_BSIZE - 1)) != 0) {
2505208600Srdivacky		frag = DEV_BSIZE - frag;
2506208600Srdivacky		base += frag;
2507208600Srdivacky		size -= frag;
2508208600Srdivacky		if (size < 0)
2509193326Sed			size = 0;
2510193326Sed	}
2511251662Sdim	pagebits = vm_page_bits(base, size & (DEV_BSIZE - 1));
2512251662Sdim#endif
2513251662Sdim	if (base == 0 && size == PAGE_SIZE) {
2514251662Sdim		/*
2515251662Sdim		 * The page can only be modified within the pmap if it is
2516251662Sdim		 * mapped, and it can only be mapped if it was previously
2517251662Sdim		 * fully valid.
2518263508Sdim		 */
2519263508Sdim		if (oldvalid == VM_PAGE_BITS_ALL)
2520263508Sdim			/*
2521263508Sdim			 * Perform the pmap_clear_modify() first.  Otherwise,
2522251662Sdim			 * a concurrent pmap operation, such as
2523263508Sdim			 * pmap_protect(), could clear a modification in the
2524263508Sdim			 * pmap and set the dirty field on the page before
2525263508Sdim			 * pmap_clear_modify() had begun and after the dirty
2526251662Sdim			 * field was cleared here.
2527251662Sdim			 */
2528251662Sdim			pmap_clear_modify(m);
2529251662Sdim		m->dirty = 0;
2530251662Sdim		m->oflags &= ~VPO_NOSYNC;
2531200583Srdivacky	} else if (oldvalid != VM_PAGE_BITS_ALL)
2532207619Srdivacky		m->dirty &= ~pagebits;
2533207619Srdivacky	else
2534207619Srdivacky		vm_page_clear_dirty_mask(m, pagebits);
2535226633Sdim}
2536226633Sdim
2537226633Sdimvoid
2538226633Sdimvm_page_clear_dirty(vm_page_t m, int base, int size)
2539226633Sdim{
2540226633Sdim
2541207619Srdivacky	vm_page_clear_dirty_mask(m, vm_page_bits(base, size));
2542207619Srdivacky}
2543226633Sdim
2544226633Sdim/*
2545200583Srdivacky *	vm_page_set_invalid:
2546200583Srdivacky *
2547200583Srdivacky *	Invalidates DEV_BSIZE'd chunks within a page.  Both the
2548263508Sdim *	valid and dirty bits for the affected areas are cleared.
2549200583Srdivacky *
2550200583Srdivacky *	May not block.
2551200583Srdivacky */
2552200583Srdivackyvoid
2553239462Sdimvm_page_set_invalid(vm_page_t m, int base, int size)
2554200583Srdivacky{
2555200583Srdivacky	vm_page_bits_t bits;
2556200583Srdivacky
2557200583Srdivacky	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
2558239462Sdim	KASSERT((m->oflags & VPO_BUSY) == 0,
2559239462Sdim	    ("vm_page_set_invalid: page %p is busy", m));
2560239462Sdim	bits = vm_page_bits(base, size);
2561239462Sdim	if (m->valid == VM_PAGE_BITS_ALL && bits != 0)
2562239462Sdim		pmap_remove_all(m);
2563239462Sdim	KASSERT(!pmap_page_is_mapped(m),
2564239462Sdim	    ("vm_page_set_invalid: page %p is mapped", m));
2565239462Sdim	m->valid &= ~bits;
2566239462Sdim	m->dirty &= ~bits;
2567239462Sdim}
2568200583Srdivacky
2569239462Sdim/*
2570239462Sdim * vm_page_zero_invalid()
2571239462Sdim *
2572239462Sdim *	The kernel assumes that the invalid portions of a page contain
2573239462Sdim *	garbage, but such pages can be mapped into memory by user code.
2574193326Sed *	When this occurs, we must zero out the non-valid portions of the
2575207619Srdivacky *	page so user code sees what it expects.
2576207619Srdivacky *
2577207619Srdivacky *	Pages are most often semi-valid when the end of a file is mapped
2578207619Srdivacky *	into memory and the file's size is not page aligned.
2579207619Srdivacky */
2580207619Srdivackyvoid
2581207619Srdivackyvm_page_zero_invalid(vm_page_t m, boolean_t setvalid)
2582207619Srdivacky{
2583207619Srdivacky	int b;
2584207619Srdivacky	int i;
2585200583Srdivacky
2586200583Srdivacky	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
2587207619Srdivacky	/*
2588200583Srdivacky	 * Scan the valid bits looking for invalid sections that
2589200583Srdivacky	 * must be zeroed.  Invalid sub-DEV_BSIZE'd areas (where the
2590239462Sdim	 * valid bit may be set) have already been zeroed by
2591223017Sdim	 * vm_page_set_validclean().
2592223017Sdim	 */
2593223017Sdim	for (b = i = 0; i <= PAGE_SIZE / DEV_BSIZE; ++i) {
2594223017Sdim		if (i == (PAGE_SIZE / DEV_BSIZE) ||
2595193326Sed		    (m->valid & ((vm_page_bits_t)1 << i))) {
2596193326Sed			if (i > b) {
2597193326Sed				pmap_zero_page_area(m,
2598249423Sdim				    b << DEV_BSHIFT, (i - b) << DEV_BSHIFT);
2599249423Sdim			}
2600249423Sdim			b = i + 1;
2601249423Sdim		}
2602249423Sdim	}
2603249423Sdim
2604249423Sdim	/*
2605249423Sdim	 * setvalid is TRUE when we can safely set the zero'd areas
2606249423Sdim	 * as being valid.  We can do this if there are no cache consistency
2607249423Sdim	 * issues.  E.g., it is ok to do with UFS, but not ok to do with NFS.
2608249423Sdim	 */
2609249423Sdim	if (setvalid)
2610249423Sdim		m->valid = VM_PAGE_BITS_ALL;
2611249423Sdim}
2612249423Sdim
2613249423Sdim/*
2614249423Sdim *	vm_page_is_valid:
2615249423Sdim *
2616249423Sdim *	Is the (partial) page valid?  Note that in the degenerate case where
2617263508Sdim *	size == 0, FALSE is returned if the page is entirely invalid and
2618249423Sdim *	TRUE otherwise.
2619249423Sdim *
2620249423Sdim *	May not block.
2621249423Sdim */
2622193326Sedint
2623193326Sedvm_page_is_valid(vm_page_t m, int base, int size)
2624193326Sed{
2625218893Sdim	vm_page_bits_t bits;
2626218893Sdim
2627218893Sdim	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
2628218893Sdim	bits = vm_page_bits(base, size);
2629218893Sdim	if (m->valid && ((m->valid & bits) == bits))
2630218893Sdim		return (1);
2631218893Sdim	else
2632218893Sdim		return (0);
2633218893Sdim}
2634218893Sdim
2635218893Sdim/*
2636218893Sdim * update dirty bits from pmap/mmu.  May not block.
2637249423Sdim */
2638263508Sdimvoid
2639263508Sdimvm_page_test_dirty(vm_page_t m)
2640263508Sdim{
2641263508Sdim
2642263508Sdim	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
2643263508Sdim	if (m->dirty != VM_PAGE_BITS_ALL && pmap_is_modified(m))
2644263508Sdim		vm_page_dirty(m);
2645263508Sdim}
2646263508Sdim
2647263508Sdimvoid
2648263508Sdimvm_page_lock_KBI(vm_page_t m, const char *file, int line)
2649263508Sdim{
2650263508Sdim
2651263508Sdim	mtx_lock_flags_(vm_page_lockptr(m), 0, file, line);
2652263508Sdim}
2653263508Sdim
2654249423Sdimvoid
2655249423Sdimvm_page_unlock_KBI(vm_page_t m, const char *file, int line)
2656218893Sdim{
2657263508Sdim
2658193326Sed	mtx_unlock_flags_(vm_page_lockptr(m), 0, file, line);
2659263508Sdim}
2660263508Sdim
2661263508Sdimint
2662263508Sdimvm_page_trylock_KBI(vm_page_t m, const char *file, int line)
2663263508Sdim{
2664263508Sdim
2665263508Sdim	return (mtx_trylock_flags_(vm_page_lockptr(m), 0, file, line));
2666263508Sdim}
2667193326Sed
2668193326Sed#if defined(INVARIANTS) || defined(INVARIANT_SUPPORT)
2669193326Sedvoid
2670193326Sedvm_page_lock_assert_KBI(vm_page_t m, int a, const char *file, int line)
2671193326Sed{
2672234353Sdim
2673193326Sed	mtx_assert_(vm_page_lockptr(m), a, file, line);
2674198092Srdivacky}
2675198092Srdivacky#endif
2676193326Sed
2677263508Sdimint so_zerocp_fullpage = 0;
2678193326Sed
2679193326Sed/*
2680193326Sed *	Replace the given page with a copy.  The copied page assumes
2681193326Sed *	the portion of the given page's "wire_count" that is not the
2682226633Sdim *	responsibility of this copy-on-write mechanism.
2683193326Sed *
2684193326Sed *	The object containing the given page must have a non-zero
2685249423Sdim *	paging-in-progress count and be locked.
2686218893Sdim */
2687193326Sedvoid
2688193326Sedvm_page_cowfault(vm_page_t m)
2689193326Sed{
2690193326Sed	vm_page_t mnew;
2691226633Sdim	vm_object_t object;
2692198092Srdivacky	vm_pindex_t pindex;
2693198092Srdivacky
2694193326Sed	mtx_assert(&vm_page_queue_mtx, MA_NOTOWNED);
2695193326Sed	vm_page_lock_assert(m, MA_OWNED);
2696193326Sed	object = m->object;
2697221345Sdim	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
2698193326Sed	KASSERT(object->paging_in_progress != 0,
2699200583Srdivacky	    ("vm_page_cowfault: object %p's paging-in-progress count is zero.",
2700249423Sdim	    object));
2701207619Srdivacky	pindex = m->pindex;
2702221345Sdim
2703193326Sed retry_alloc:
2704193326Sed	pmap_remove_all(m);
2705193326Sed	vm_page_remove(m);
2706193326Sed	mnew = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY);
2707226633Sdim	if (mnew == NULL) {
2708198092Srdivacky		vm_page_insert(m, object, pindex);
2709193326Sed		vm_page_unlock(m);
2710263508Sdim		VM_OBJECT_UNLOCK(object);
2711193326Sed		VM_WAIT;
2712193326Sed		VM_OBJECT_LOCK(object);
2713193326Sed		if (m == vm_page_lookup(object, pindex)) {
2714193326Sed			vm_page_lock(m);
2715193326Sed			goto retry_alloc;
2716193326Sed		} else {
2717193326Sed			/*
2718193326Sed			 * Page disappeared during the wait.
2719193326Sed			 */
2720193326Sed			return;
2721193326Sed		}
2722193326Sed	}
2723193326Sed
2724234353Sdim	if (m->cow == 0) {
2725193326Sed		/*
2726198092Srdivacky		 * Check to see if we raced with an xmit complete while
2727193326Sed		 * waiting to allocate a page.  If so, put things back
2728226633Sdim		 * the way they were.
2729226633Sdim		 */
2730198092Srdivacky		vm_page_unlock(m);
2731198092Srdivacky		vm_page_lock(mnew);
2732198092Srdivacky		vm_page_free(mnew);
2733198092Srdivacky		vm_page_unlock(mnew);
2734198092Srdivacky		vm_page_insert(m, object, pindex);
2735193326Sed	} else { /* clear COW & copy page */
2736193326Sed		if (!so_zerocp_fullpage)
2737193326Sed			pmap_copy_page(m, mnew);
2738193326Sed		mnew->valid = VM_PAGE_BITS_ALL;
2739198092Srdivacky		vm_page_dirty(mnew);
2740193326Sed		mnew->wire_count = m->wire_count - m->cow;
2741193326Sed		m->wire_count = m->cow;
2742193326Sed		vm_page_unlock(m);
2743198092Srdivacky	}
2744193326Sed}
2745193326Sed
2746193326Sedvoid
2747212904Sdimvm_page_cowclear(vm_page_t m)
2748212904Sdim{
2749198092Srdivacky
2750193326Sed	vm_page_lock_assert(m, MA_OWNED);
2751193326Sed	if (m->cow) {
2752193326Sed		m->cow--;
2753193326Sed		/*
2754193326Sed		 * Let vm_fault() add back write permission lazily.
2755193326Sed		 */
2756193326Sed	}
2757193326Sed	/*
2758193326Sed	 *  sf_buf_free() will free the page, so we needn't do it here
2759193326Sed	 */
2760193326Sed}
2761193326Sed
2762193326Sedint
2763212904Sdimvm_page_cowsetup(vm_page_t m)
2764212904Sdim{
2765212904Sdim
2766193326Sed	vm_page_lock_assert(m, MA_OWNED);
2767193326Sed	if ((m->flags & PG_FICTITIOUS) != 0 ||
2768263508Sdim	    (m->oflags & VPO_UNMANAGED) != 0 ||
2769263508Sdim	    m->cow == USHRT_MAX - 1 || !VM_OBJECT_TRYLOCK(m->object))
2770263508Sdim		return (EBUSY);
2771263508Sdim	m->cow++;
2772193326Sed	pmap_remove_write(m);
2773193326Sed	VM_OBJECT_UNLOCK(m->object);
2774193326Sed	return (0);
2775193326Sed}
2776193326Sed
2777251662Sdim#ifdef INVARIANTS
2778251662Sdimvoid
2779251662Sdimvm_page_object_lock_assert(vm_page_t m)
2780251662Sdim{
2781251662Sdim
2782251662Sdim	/*
2783251662Sdim	 * Certain of the page's fields may only be modified by the
2784251662Sdim	 * holder of the containing object's lock or the setter of the
2785251662Sdim	 * page's VPO_BUSY flag.  Unfortunately, the setter of the
2786193326Sed	 * VPO_BUSY flag is not recorded, and thus cannot be checked
2787193326Sed	 * here.
2788193326Sed	 */
2789193326Sed	if (m->object != NULL && (m->oflags & VPO_BUSY) == 0)
2790251662Sdim		VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
2791251662Sdim}
2792251662Sdim#endif
2793251662Sdim
2794251662Sdim#include "opt_ddb.h"
2795251662Sdim#ifdef DDB
2796251662Sdim#include <sys/kernel.h>
2797251662Sdim
2798251662Sdim#include <ddb/ddb.h>
2799263508Sdim
2800251662SdimDB_SHOW_COMMAND(page, vm_page_print_page_info)
2801251662Sdim{
2802193326Sed	db_printf("cnt.v_free_count: %d\n", cnt.v_free_count);
2803193326Sed	db_printf("cnt.v_cache_count: %d\n", cnt.v_cache_count);
2804193326Sed	db_printf("cnt.v_inactive_count: %d\n", cnt.v_inactive_count);
2805193326Sed	db_printf("cnt.v_active_count: %d\n", cnt.v_active_count);
2806193326Sed	db_printf("cnt.v_wire_count: %d\n", cnt.v_wire_count);
2807193326Sed	db_printf("cnt.v_free_reserved: %d\n", cnt.v_free_reserved);
2808193326Sed	db_printf("cnt.v_free_min: %d\n", cnt.v_free_min);
2809193326Sed	db_printf("cnt.v_free_target: %d\n", cnt.v_free_target);
2810193326Sed	db_printf("cnt.v_cache_min: %d\n", cnt.v_cache_min);
2811193326Sed	db_printf("cnt.v_inactive_target: %d\n", cnt.v_inactive_target);
2812193326Sed}
2813193326Sed
2814198092SrdivackyDB_SHOW_COMMAND(pageq, vm_page_print_pageq_info)
2815193326Sed{
2816193326Sed
2817263508Sdim	db_printf("PQ_FREE:");
2818193326Sed	db_printf(" %d", cnt.v_free_count);
2819193326Sed	db_printf("\n");
2820193326Sed
2821193326Sed	db_printf("PQ_CACHE:");
2822234353Sdim	db_printf(" %d", cnt.v_cache_count);
2823263508Sdim	db_printf("\n");
2824193326Sed
2825234353Sdim	db_printf("PQ_ACTIVE: %d, PQ_INACTIVE: %d\n",
2826193326Sed		*vm_page_queues[PQ_ACTIVE].cnt,
2827193326Sed		*vm_page_queues[PQ_INACTIVE].cnt);
2828193326Sed}
2829193326Sed#endif /* DDB */
2830193326Sed