vm_page.c revision 331550
1/*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1991 Regents of the University of California.
5 * All rights reserved.
6 * Copyright (c) 1998 Matthew Dillon.  All Rights Reserved.
7 *
8 * This code is derived from software contributed to Berkeley by
9 * The Mach Operating System project at Carnegie-Mellon University.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 *    notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 *    notice, this list of conditions and the following disclaimer in the
18 *    documentation and/or other materials provided with the distribution.
19 * 4. Neither the name of the University nor the names of its contributors
20 *    may be used to endorse or promote products derived from this software
21 *    without specific prior written permission.
22 *
23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33 * SUCH DAMAGE.
34 *
35 *	from: @(#)vm_page.c	7.4 (Berkeley) 5/7/91
36 */
37
38/*-
39 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
40 * All rights reserved.
41 *
42 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
43 *
44 * Permission to use, copy, modify and distribute this software and
45 * its documentation is hereby granted, provided that both the copyright
46 * notice and this permission notice appear in all copies of the
47 * software, derivative works or modified versions, and any portions
48 * thereof, and that both notices appear in supporting documentation.
49 *
50 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
51 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
52 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
53 *
54 * Carnegie Mellon requests users of this software to return to
55 *
56 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
57 *  School of Computer Science
58 *  Carnegie Mellon University
59 *  Pittsburgh PA 15213-3890
60 *
61 * any improvements or extensions that they make and grant Carnegie the
62 * rights to redistribute these changes.
63 */
64
65/*
66 *			GENERAL RULES ON VM_PAGE MANIPULATION
67 *
68 *	- A page queue lock is required when adding or removing a page from a
69 *	  page queue regardless of other locks or the busy state of a page.
70 *
71 *		* In general, no thread besides the page daemon can acquire or
72 *		  hold more than one page queue lock at a time.
73 *
74 *		* The page daemon can acquire and hold any pair of page queue
75 *		  locks in any order.
76 *
77 *	- The object lock is required when inserting or removing
78 *	  pages from an object (vm_page_insert() or vm_page_remove()).
79 *
80 */
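
/*
 * A minimal usage sketch of the object-lock rule above (illustrative
 * only; error handling is elided except for the radix-node allocation
 * failure that vm_page_insert() can report):
 *
 *	VM_OBJECT_WLOCK(object);
 *	if (vm_page_insert(m, object, pindex) != 0)
 *		(the page was not inserted; recover or retry);
 *	VM_OBJECT_WUNLOCK(object);
 */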
81
82/*
83 *	Resident memory management module.
84 */
85
86#include <sys/cdefs.h>
87__FBSDID("$FreeBSD: stable/11/sys/vm/vm_page.c 331550 2018-03-26 15:16:57Z markj $");
88
89#include "opt_vm.h"
90
91#include <sys/param.h>
92#include <sys/systm.h>
93#include <sys/lock.h>
94#include <sys/kernel.h>
95#include <sys/limits.h>
96#include <sys/linker.h>
97#include <sys/malloc.h>
98#include <sys/mman.h>
99#include <sys/msgbuf.h>
100#include <sys/mutex.h>
101#include <sys/proc.h>
102#include <sys/rwlock.h>
103#include <sys/sbuf.h>
104#include <sys/smp.h>
105#include <sys/sysctl.h>
106#include <sys/vmmeter.h>
107#include <sys/vnode.h>
108
109#include <vm/vm.h>
110#include <vm/pmap.h>
111#include <vm/vm_param.h>
112#include <vm/vm_kern.h>
113#include <vm/vm_object.h>
114#include <vm/vm_page.h>
115#include <vm/vm_pageout.h>
116#include <vm/vm_pager.h>
117#include <vm/vm_phys.h>
118#include <vm/vm_radix.h>
119#include <vm/vm_reserv.h>
120#include <vm/vm_extern.h>
121#include <vm/uma.h>
122#include <vm/uma_int.h>
123
124#include <machine/md_var.h>
125
126/*
127 *	Associated with each page of user-allocatable memory is a
128 *	page structure.
129 */
130
131struct vm_domain vm_dom[MAXMEMDOM];
132struct mtx_padalign __exclusive_cache_line vm_page_queue_free_mtx;
133
134struct mtx_padalign __exclusive_cache_line pa_lock[PA_LOCK_COUNT];
135
136vm_page_t vm_page_array;
137long vm_page_array_size;
138long first_page;
139int vm_page_zero_count;
140
141static int boot_pages = UMA_BOOT_PAGES;
142SYSCTL_INT(_vm, OID_AUTO, boot_pages, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
143    &boot_pages, 0,
144    "number of pages allocated for bootstrapping the VM system");
145
146static int pa_tryrelock_restart;
147SYSCTL_INT(_vm, OID_AUTO, tryrelock_restart, CTLFLAG_RD,
148    &pa_tryrelock_restart, 0, "Number of tryrelock restarts");
149
150static TAILQ_HEAD(, vm_page) blacklist_head;
151static int sysctl_vm_page_blacklist(SYSCTL_HANDLER_ARGS);
152SYSCTL_PROC(_vm, OID_AUTO, page_blacklist, CTLTYPE_STRING | CTLFLAG_RD |
153    CTLFLAG_MPSAFE, NULL, 0, sysctl_vm_page_blacklist, "A", "Blacklist pages");
154
155/* Is the page daemon waiting for free pages? */
156static int vm_pageout_pages_needed;
157
158static uma_zone_t fakepg_zone;
159
160static void vm_page_alloc_check(vm_page_t m);
161static void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits);
162static void vm_page_enqueue(uint8_t queue, vm_page_t m);
163static void vm_page_free_phys(vm_page_t m);
164static void vm_page_free_wakeup(void);
165static void vm_page_init_fakepg(void *dummy);
166static int vm_page_insert_after(vm_page_t m, vm_object_t object,
167    vm_pindex_t pindex, vm_page_t mpred);
168static void vm_page_insert_radixdone(vm_page_t m, vm_object_t object,
169    vm_page_t mpred);
170static int vm_page_reclaim_run(int req_class, u_long npages, vm_page_t m_run,
171    vm_paddr_t high);
172static int vm_page_alloc_fail(vm_object_t object, int req);
173
174SYSINIT(vm_page, SI_SUB_VM, SI_ORDER_SECOND, vm_page_init_fakepg, NULL);
175
176static void
177vm_page_init_fakepg(void *dummy)
178{
179
180	fakepg_zone = uma_zcreate("fakepg", sizeof(struct vm_page), NULL, NULL,
181	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE | UMA_ZONE_VM);
182}
183
184/* Make sure that u_long is at least 64 bits when PAGE_SIZE is 32K. */
185#if PAGE_SIZE == 32768
186#ifdef CTASSERT
187CTASSERT(sizeof(u_long) >= 8);
188#endif
189#endif
190
191/*
192 * Try to acquire a physical address lock while a pmap is locked.  If we
193 * fail to trylock we unlock and lock the pmap directly and cache the
194 * locked pa in *locked.  The caller should then restart their loop in case
195 * the virtual to physical mapping has changed.
196 */
197int
198vm_page_pa_tryrelock(pmap_t pmap, vm_paddr_t pa, vm_paddr_t *locked)
199{
200	vm_paddr_t lockpa;
201
202	lockpa = *locked;
203	*locked = pa;
204	if (lockpa) {
205		PA_LOCK_ASSERT(lockpa, MA_OWNED);
206		if (PA_LOCKPTR(pa) == PA_LOCKPTR(lockpa))
207			return (0);
208		PA_UNLOCK(lockpa);
209	}
210	if (PA_TRYLOCK(pa))
211		return (0);
212	PMAP_UNLOCK(pmap);
213	atomic_add_int(&pa_tryrelock_restart, 1);
214	PA_LOCK(pa);
215	PMAP_LOCK(pmap);
216	return (EAGAIN);
217}
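
/*
 * A sketch of the intended caller pattern (illustrative; it assumes the
 * physical address is re-derived with pmap_extract() after each restart,
 * since the mapping may have changed while the locks were dropped):
 *
 *	retry:
 *		pa = pmap_extract(pmap, va);
 *		if (vm_page_pa_tryrelock(pmap, pa, &locked_pa) == EAGAIN)
 *			goto retry;
 */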
218
219/*
220 *	vm_set_page_size:
221 *
222 *	Sets the page size, perhaps based upon the memory
223 *	size.  Must be called before any use of page-size
224 *	dependent functions.
225 */
226void
227vm_set_page_size(void)
228{
229	if (vm_cnt.v_page_size == 0)
230		vm_cnt.v_page_size = PAGE_SIZE;
231	if (((vm_cnt.v_page_size - 1) & vm_cnt.v_page_size) != 0)
232		panic("vm_set_page_size: page size not a power of two");
233}
234
235/*
236 *	vm_page_blacklist_next:
237 *
238 *	Find the next entry in the provided string of blacklist
239 *	addresses.  Entries are separated by space, comma, or newline.
240 *	If an invalid integer is encountered then the rest of the
241 *	string is skipped.  Updates the list pointer to the next
242 *	character, or NULL if the string is exhausted or invalid.
243 */
244static vm_paddr_t
245vm_page_blacklist_next(char **list, char *end)
246{
247	vm_paddr_t bad;
248	char *cp, *pos;
249
250	if (list == NULL || *list == NULL)
251		return (0);
252	if (**list == '\0') {
253		*list = NULL;
254		return (0);
255	}
256
257	/*
258	 * If there's no end pointer then the buffer is coming from
259	 * the kenv and we know it's null-terminated.
260	 */
261	if (end == NULL)
262		end = *list + strlen(*list);
263
264	/* Ensure that strtoq() won't walk off the end */
265	if (*end != '\0') {
266		if (*end == '\n' || *end == ' ' || *end  == ',')
267			*end = '\0';
268		else {
269			printf("Blacklist not terminated, skipping\n");
270			*list = NULL;
271			return (0);
272		}
273	}
274
275	for (pos = *list; *pos != '\0'; pos = cp) {
276		bad = strtoq(pos, &cp, 0);
277		if (*cp == '\0' || *cp == ' ' || *cp == ',' || *cp == '\n') {
278			if (bad == 0) {
279				if (++cp < end)
280					continue;
281				else
282					break;
283			}
284		} else
285			break;
286		if (*cp == '\0' || ++cp >= end)
287			*list = NULL;
288		else
289			*list = cp;
290		return (trunc_page(bad));
291	}
292	printf("Garbage in RAM blacklist, skipping\n");
293	*list = NULL;
294	return (0);
295}
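
/*
 * For example, the following blacklist strings (illustrative addresses)
 * are equivalent and each yields two page-aligned entries:
 *
 *	"0x12345000,0x23456000"
 *	"0x12345000 0x23456000"
 *	"0x12345000\n0x23456000"
 *
 * Values are parsed by strtoq() with base 0, so decimal, octal, and
 * hexadecimal forms are all accepted, and each entry is truncated to a
 * page boundary by trunc_page().
 */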
296
297/*
298 *	vm_page_blacklist_check:
299 *
300 *	Iterate through the provided string of blacklist addresses, pulling
301 *	each entry out of the physical allocator free list and putting it
302 *	onto a list for reporting via the vm.page_blacklist sysctl.
303 */
304static void
305vm_page_blacklist_check(char *list, char *end)
306{
307	vm_paddr_t pa;
308	vm_page_t m;
309	char *next;
310	int ret;
311
312	next = list;
313	while (next != NULL) {
314		if ((pa = vm_page_blacklist_next(&next, end)) == 0)
315			continue;
316		m = vm_phys_paddr_to_vm_page(pa);
317		if (m == NULL)
318			continue;
319		mtx_lock(&vm_page_queue_free_mtx);
320		ret = vm_phys_unfree_page(m);
321		mtx_unlock(&vm_page_queue_free_mtx);
322		if (ret == TRUE) {
323			TAILQ_INSERT_TAIL(&blacklist_head, m, listq);
324			if (bootverbose)
325				printf("Skipping page with pa 0x%jx\n",
326				    (uintmax_t)pa);
327		}
328	}
329}
330
331/*
332 *	vm_page_blacklist_load:
333 *
334 *	Search for a special module named "ram_blacklist".  It'll be a
335 *	plain text file provided by the user via the loader directive
336 *	of the same name.
337 */
338static void
339vm_page_blacklist_load(char **list, char **end)
340{
341	void *mod;
342	u_char *ptr;
343	u_int len;
344
345	mod = NULL;
346	ptr = NULL;
347
348	mod = preload_search_by_type("ram_blacklist");
349	if (mod != NULL) {
350		ptr = preload_fetch_addr(mod);
351		len = preload_fetch_size(mod);
352	}
353	*list = ptr;
354	if (ptr != NULL)
355		*end = ptr + len;
356	else
357		*end = NULL;
358	return;
359}
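
/*
 * One way such a module might be supplied is through loader.conf(5)'s
 * generic module-loading variables; the file name below is only an
 * illustrative assumption:
 *
 *	ram_blacklist_load="YES"
 *	ram_blacklist_name="/boot/blacklist.txt"
 *	ram_blacklist_type="ram_blacklist"
 */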
360
361static int
362sysctl_vm_page_blacklist(SYSCTL_HANDLER_ARGS)
363{
364	vm_page_t m;
365	struct sbuf sbuf;
366	int error, first;
367
368	first = 1;
369	error = sysctl_wire_old_buffer(req, 0);
370	if (error != 0)
371		return (error);
372	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
373	TAILQ_FOREACH(m, &blacklist_head, listq) {
374		sbuf_printf(&sbuf, "%s%#jx", first ? "" : ",",
375		    (uintmax_t)m->phys_addr);
376		first = 0;
377	}
378	error = sbuf_finish(&sbuf);
379	sbuf_delete(&sbuf);
380	return (error);
381}
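
/*
 * The resulting sysctl output is a comma-separated list of physical
 * addresses, for example (illustrative values):
 *
 *	$ sysctl vm.page_blacklist
 *	vm.page_blacklist: 0x12345000,0x23456000
 */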
382
383static void
384vm_page_domain_init(struct vm_domain *vmd)
385{
386	struct vm_pagequeue *pq;
387	int i;
388
389	*__DECONST(char **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_name) =
390	    "vm inactive pagequeue";
391	*__DECONST(u_int **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_vcnt) =
392	    &vm_cnt.v_inactive_count;
393	*__DECONST(char **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_name) =
394	    "vm active pagequeue";
395	*__DECONST(u_int **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_vcnt) =
396	    &vm_cnt.v_active_count;
397	*__DECONST(char **, &vmd->vmd_pagequeues[PQ_LAUNDRY].pq_name) =
398	    "vm laundry pagequeue";
399	*__DECONST(int **, &vmd->vmd_pagequeues[PQ_LAUNDRY].pq_vcnt) =
400	    &vm_cnt.v_laundry_count;
401	vmd->vmd_page_count = 0;
402	vmd->vmd_free_count = 0;
403	vmd->vmd_segs = 0;
404	vmd->vmd_oom = FALSE;
405	for (i = 0; i < PQ_COUNT; i++) {
406		pq = &vmd->vmd_pagequeues[i];
407		TAILQ_INIT(&pq->pq_pl);
408		mtx_init(&pq->pq_mutex, pq->pq_name, "vm pagequeue",
409		    MTX_DEF | MTX_DUPOK);
410	}
411}
412
413/*
414 * Initialize a physical page in preparation for adding it to the free
415 * lists.
416 */
417static void
418vm_page_init_page(vm_page_t m, vm_paddr_t pa, int segind)
419{
420
421	m->object = NULL;
422	m->wire_count = 0;
423	m->busy_lock = VPB_UNBUSIED;
424	m->hold_count = 0;
425	m->flags = 0;
426	m->phys_addr = pa;
427	m->queue = PQ_NONE;
428	m->psind = 0;
429	m->segind = segind;
430	m->order = VM_NFREEORDER;
431	m->pool = VM_FREEPOOL_DEFAULT;
432	m->valid = m->dirty = 0;
433	pmap_page_init(m);
434}
435
436/*
437 *	vm_page_startup:
438 *
439 *	Initializes the resident memory module.  Allocates physical memory for
440 *	bootstrapping UMA and some data structures that are used to manage
441 *	physical pages.  Initializes these structures, and populates the free
442 *	page queues.
443 */
444vm_offset_t
445vm_page_startup(vm_offset_t vaddr)
446{
447	struct vm_domain *vmd;
448	struct vm_phys_seg *seg;
449	vm_page_t m;
450	char *list, *listend;
451	vm_offset_t mapped;
452	vm_paddr_t end, high_avail, low_avail, new_end, page_range, size;
453	vm_paddr_t biggestsize, last_pa, pa;
454	u_long pagecount;
455	int biggestone, i, pages_per_zone, segind;
456
457	biggestsize = 0;
458	biggestone = 0;
459	vaddr = round_page(vaddr);
460
461	for (i = 0; phys_avail[i + 1]; i += 2) {
462		phys_avail[i] = round_page(phys_avail[i]);
463		phys_avail[i + 1] = trunc_page(phys_avail[i + 1]);
464	}
465	for (i = 0; phys_avail[i + 1]; i += 2) {
466		size = phys_avail[i + 1] - phys_avail[i];
467		if (size > biggestsize) {
468			biggestone = i;
469			biggestsize = size;
470		}
471	}
472
473	end = phys_avail[biggestone+1];
474
475	/*
476	 * Initialize the page and queue locks.
477	 */
478	mtx_init(&vm_page_queue_free_mtx, "vm page free queue", NULL, MTX_DEF);
479	for (i = 0; i < PA_LOCK_COUNT; i++)
480		mtx_init(&pa_lock[i], "vm page", NULL, MTX_DEF);
481	for (i = 0; i < vm_ndomains; i++)
482		vm_page_domain_init(&vm_dom[i]);
483
484	/*
485	 * Almost all of the pages needed for bootstrapping UMA are used
486	 * for zone structures, so if the number of CPUs results in those
487	 * structures taking more than one page each, we set aside more pages
488	 * in proportion to the zone structure size.
489	 */
490	pages_per_zone = howmany(sizeof(struct uma_zone) +
491	    sizeof(struct uma_cache) * (mp_maxid + 1) +
492	    roundup2(sizeof(struct uma_slab), sizeof(void *)), UMA_SLAB_SIZE);
493	if (pages_per_zone > 1) {
494		/* Reserve more pages so that we don't run out. */
495		boot_pages = UMA_BOOT_PAGES_ZONES * pages_per_zone;
496	}
497
498	/*
499	 * Allocate memory for use when bootstrapping the kernel memory
500	 * allocator.
501	 *
502	 * CTLFLAG_RDTUN doesn't work during the early boot process, so we must
503	 * manually fetch the value.
504	 */
505	TUNABLE_INT_FETCH("vm.boot_pages", &boot_pages);
506	new_end = end - (boot_pages * UMA_SLAB_SIZE);
507	new_end = trunc_page(new_end);
508	mapped = pmap_map(&vaddr, new_end, end,
509	    VM_PROT_READ | VM_PROT_WRITE);
510	bzero((void *)mapped, end - new_end);
511	uma_startup((void *)mapped, boot_pages);
512
513#if defined(__aarch64__) || defined(__amd64__) || defined(__arm__) || \
514    defined(__i386__) || defined(__mips__)
515	/*
516	 * Allocate a bitmap to indicate that a random physical page
517	 * needs to be included in a minidump.
518	 *
519	 * The amd64 port needs this to indicate which direct map pages
520	 * need to be dumped, via calls to dump_add_page()/dump_drop_page().
521	 *
522	 * However, i386 still needs this workspace internally within the
523	 * minidump code.  In theory, they are not needed on i386, but are
524	 * included should the sf_buf code decide to use them.
525	 */
526	last_pa = 0;
527	for (i = 0; dump_avail[i + 1] != 0; i += 2)
528		if (dump_avail[i + 1] > last_pa)
529			last_pa = dump_avail[i + 1];
530	page_range = last_pa / PAGE_SIZE;
531	vm_page_dump_size = round_page(roundup2(page_range, NBBY) / NBBY);
532	new_end -= vm_page_dump_size;
533	vm_page_dump = (void *)(uintptr_t)pmap_map(&vaddr, new_end,
534	    new_end + vm_page_dump_size, VM_PROT_READ | VM_PROT_WRITE);
535	bzero((void *)vm_page_dump, vm_page_dump_size);
536#else
537	(void)last_pa;
538#endif
539#if defined(__aarch64__) || defined(__amd64__) || defined(__mips__)
540	/*
541	 * Include the UMA bootstrap pages and vm_page_dump in a crash dump.
542	 * When pmap_map() uses the direct map, they are not automatically
543	 * included.
544	 */
545	for (pa = new_end; pa < end; pa += PAGE_SIZE)
546		dump_add_page(pa);
547#endif
548	phys_avail[biggestone + 1] = new_end;
549#ifdef __amd64__
550	/*
551	 * Request that the physical pages underlying the message buffer be
552	 * included in a crash dump.  Since the message buffer is accessed
553	 * through the direct map, they are not automatically included.
554	 */
555	pa = DMAP_TO_PHYS((vm_offset_t)msgbufp->msg_ptr);
556	last_pa = pa + round_page(msgbufsize);
557	while (pa < last_pa) {
558		dump_add_page(pa);
559		pa += PAGE_SIZE;
560	}
561#endif
562	/*
563	 * Compute the number of pages of memory that will be available for
564	 * use, taking into account the overhead of a page structure per page.
565	 * In other words, solve
566	 *	"available physical memory" - round_page(page_range *
567	 *	    sizeof(struct vm_page)) = page_range * PAGE_SIZE
568	 * for page_range.
569	 */
570	low_avail = phys_avail[0];
571	high_avail = phys_avail[1];
572	for (i = 0; i < vm_phys_nsegs; i++) {
573		if (vm_phys_segs[i].start < low_avail)
574			low_avail = vm_phys_segs[i].start;
575		if (vm_phys_segs[i].end > high_avail)
576			high_avail = vm_phys_segs[i].end;
577	}
578	/* Skip the first chunk.  It is already accounted for. */
579	for (i = 2; phys_avail[i + 1] != 0; i += 2) {
580		if (phys_avail[i] < low_avail)
581			low_avail = phys_avail[i];
582		if (phys_avail[i + 1] > high_avail)
583			high_avail = phys_avail[i + 1];
584	}
585	first_page = low_avail / PAGE_SIZE;
586#ifdef VM_PHYSSEG_SPARSE
587	size = 0;
588	for (i = 0; i < vm_phys_nsegs; i++)
589		size += vm_phys_segs[i].end - vm_phys_segs[i].start;
590	for (i = 0; phys_avail[i + 1] != 0; i += 2)
591		size += phys_avail[i + 1] - phys_avail[i];
592#elif defined(VM_PHYSSEG_DENSE)
593	size = high_avail - low_avail;
594#else
595#error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined."
596#endif
597
598#ifdef VM_PHYSSEG_DENSE
599	/*
600	 * In the VM_PHYSSEG_DENSE case, the number of pages can account for
601	 * the overhead of a page structure per page only if vm_page_array is
602	 * allocated from the last physical memory chunk.  Otherwise, we must
603	 * allocate page structures representing the physical memory
604	 * underlying vm_page_array, even though they will not be used.
605	 */
606	if (new_end != high_avail)
607		page_range = size / PAGE_SIZE;
608	else
609#endif
610	{
611		page_range = size / (PAGE_SIZE + sizeof(struct vm_page));
612
613		/*
614		 * If the partial bytes remaining are large enough for
615		 * a page (PAGE_SIZE) without a corresponding
616		 * 'struct vm_page', then new_end will contain an
617		 * extra page after subtracting the length of the VM
618		 * page array.  Compensate by subtracting an extra
619		 * page from new_end.
620		 */
621		if (size % (PAGE_SIZE + sizeof(struct vm_page)) >= PAGE_SIZE) {
622			if (new_end == high_avail)
623				high_avail -= PAGE_SIZE;
624			new_end -= PAGE_SIZE;
625		}
626	}
627	end = new_end;
628
629	/*
630	 * Reserve an unmapped guard page to trap access to vm_page_array[-1].
631	 * However, because this page is allocated from KVM, out-of-bounds
632	 * accesses using the direct map will not be trapped.
633	 */
634	vaddr += PAGE_SIZE;
635
636	/*
637	 * Allocate physical memory for the page structures, and map it.
638	 */
639	new_end = trunc_page(end - page_range * sizeof(struct vm_page));
640	mapped = pmap_map(&vaddr, new_end, end,
641	    VM_PROT_READ | VM_PROT_WRITE);
642	vm_page_array = (vm_page_t)mapped;
643	vm_page_array_size = page_range;
644
645#if VM_NRESERVLEVEL > 0
646	/*
647	 * Allocate physical memory for the reservation management system's
648	 * data structures, and map it.
649	 */
650	if (high_avail == end)
651		high_avail = new_end;
652	new_end = vm_reserv_startup(&vaddr, new_end, high_avail);
653#endif
654#if defined(__aarch64__) || defined(__amd64__) || defined(__mips__)
655	/*
656	 * Include vm_page_array and vm_reserv_array in a crash dump.
657	 */
658	for (pa = new_end; pa < end; pa += PAGE_SIZE)
659		dump_add_page(pa);
660#endif
661	phys_avail[biggestone + 1] = new_end;
662
663	/*
664	 * Add physical memory segments corresponding to the available
665	 * physical pages.
666	 */
667	for (i = 0; phys_avail[i + 1] != 0; i += 2)
668		vm_phys_add_seg(phys_avail[i], phys_avail[i + 1]);
669
670	/*
671	 * Initialize the physical memory allocator.
672	 */
673	vm_phys_init();
674
675	/*
676	 * Initialize the page structures and add every available page to the
677	 * physical memory allocator's free lists.
678	 */
679	vm_cnt.v_page_count = 0;
680	vm_cnt.v_free_count = 0;
681	for (segind = 0; segind < vm_phys_nsegs; segind++) {
682		seg = &vm_phys_segs[segind];
683		for (m = seg->first_page, pa = seg->start; pa < seg->end;
684		    m++, pa += PAGE_SIZE)
685			vm_page_init_page(m, pa, segind);
686
687		/*
688		 * Add the segment to the free lists only if it is covered by
689		 * one of the ranges in phys_avail.  Because we've added the
690		 * ranges to the vm_phys_segs array, we can assume that each
691		 * segment is either entirely contained in one of the ranges,
692		 * or doesn't overlap any of them.
693		 */
694		for (i = 0; phys_avail[i + 1] != 0; i += 2) {
695			if (seg->start < phys_avail[i] ||
696			    seg->end > phys_avail[i + 1])
697				continue;
698
699			m = seg->first_page;
700			pagecount = (u_long)atop(seg->end - seg->start);
701
702			mtx_lock(&vm_page_queue_free_mtx);
703			vm_phys_free_contig(m, pagecount);
704			vm_phys_freecnt_adj(m, (int)pagecount);
705			mtx_unlock(&vm_page_queue_free_mtx);
706			vm_cnt.v_page_count += (u_int)pagecount;
707
708			vmd = &vm_dom[seg->domain];
709			vmd->vmd_page_count += (u_int)pagecount;
710			vmd->vmd_segs |= 1UL << m->segind;
711			break;
712		}
713	}
714
715	/*
716	 * Remove blacklisted pages from the physical memory allocator.
717	 */
718	TAILQ_INIT(&blacklist_head);
719	vm_page_blacklist_load(&list, &listend);
720	vm_page_blacklist_check(list, listend);
721
722	list = kern_getenv("vm.blacklist");
723	vm_page_blacklist_check(list, NULL);
724
725	freeenv(list);
726#if VM_NRESERVLEVEL > 0
727	/*
728	 * Initialize the reservation management system.
729	 */
730	vm_reserv_init();
731#endif
732	return (vaddr);
733}
734
735void
736vm_page_reference(vm_page_t m)
737{
738
739	vm_page_aflag_set(m, PGA_REFERENCED);
740}
741
742/*
743 *	vm_page_busy_downgrade:
744 *
745 *	Downgrade an exclusive busy page into a single shared busy page.
746 */
747void
748vm_page_busy_downgrade(vm_page_t m)
749{
750	u_int x;
751	bool locked;
752
753	vm_page_assert_xbusied(m);
754	locked = mtx_owned(vm_page_lockptr(m));
755
756	for (;;) {
757		x = m->busy_lock;
758		x &= VPB_BIT_WAITERS;
759		if (x != 0 && !locked)
760			vm_page_lock(m);
761		if (atomic_cmpset_rel_int(&m->busy_lock,
762		    VPB_SINGLE_EXCLUSIVER | x, VPB_SHARERS_WORD(1)))
763			break;
764		if (x != 0 && !locked)
765			vm_page_unlock(m);
766	}
767	if (x != 0) {
768		wakeup(m);
769		if (!locked)
770			vm_page_unlock(m);
771	}
772}
773
774/*
775 *	vm_page_sbusied:
776 *
777 *	Return a positive value if the page is shared busied, 0 otherwise.
778 */
779int
780vm_page_sbusied(vm_page_t m)
781{
782	u_int x;
783
784	x = m->busy_lock;
785	return ((x & VPB_BIT_SHARED) != 0 && x != VPB_UNBUSIED);
786}
787
788/*
789 *	vm_page_sunbusy:
790 *
791 *	Shared unbusy a page.
792 */
793void
794vm_page_sunbusy(vm_page_t m)
795{
796	u_int x;
797
798	vm_page_lock_assert(m, MA_NOTOWNED);
799	vm_page_assert_sbusied(m);
800
801	for (;;) {
802		x = m->busy_lock;
803		if (VPB_SHARERS(x) > 1) {
804			if (atomic_cmpset_int(&m->busy_lock, x,
805			    x - VPB_ONE_SHARER))
806				break;
807			continue;
808		}
809		if ((x & VPB_BIT_WAITERS) == 0) {
810			KASSERT(x == VPB_SHARERS_WORD(1),
811			    ("vm_page_sunbusy: invalid lock state"));
812			if (atomic_cmpset_int(&m->busy_lock,
813			    VPB_SHARERS_WORD(1), VPB_UNBUSIED))
814				break;
815			continue;
816		}
817		KASSERT(x == (VPB_SHARERS_WORD(1) | VPB_BIT_WAITERS),
818		    ("vm_page_sunbusy: invalid lock state for waiters"));
819
820		vm_page_lock(m);
821		if (!atomic_cmpset_int(&m->busy_lock, x, VPB_UNBUSIED)) {
822			vm_page_unlock(m);
823			continue;
824		}
825		wakeup(m);
826		vm_page_unlock(m);
827		break;
828	}
829}
830
831/*
832 *	vm_page_busy_sleep:
833 *
834 *	Sleep and release the page lock, using the page pointer as wchan.
835 *	This is used to implement the hard path of the busying mechanism.
836 *
837 *	The given page must be locked.
838 *
839 *	If nonshared is true, sleep only if the page is xbusy.
840 */
841void
842vm_page_busy_sleep(vm_page_t m, const char *wmesg, bool nonshared)
843{
844	u_int x;
845
846	vm_page_assert_locked(m);
847
848	x = m->busy_lock;
849	if (x == VPB_UNBUSIED || (nonshared && (x & VPB_BIT_SHARED) != 0) ||
850	    ((x & VPB_BIT_WAITERS) == 0 &&
851	    !atomic_cmpset_int(&m->busy_lock, x, x | VPB_BIT_WAITERS))) {
852		vm_page_unlock(m);
853		return;
854	}
855	msleep(m, vm_page_lockptr(m), PVM | PDROP, wmesg, 0);
856}
857
858/*
859 *	vm_page_trysbusy:
860 *
861 *	Try to shared busy a page.
862 *	If the operation succeeds, 1 is returned; otherwise 0.
863 *	The operation never sleeps.
864 */
865int
866vm_page_trysbusy(vm_page_t m)
867{
868	u_int x;
869
870	for (;;) {
871		x = m->busy_lock;
872		if ((x & VPB_BIT_SHARED) == 0)
873			return (0);
874		if (atomic_cmpset_acq_int(&m->busy_lock, x, x + VPB_ONE_SHARER))
875			return (1);
876	}
877}
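
/*
 * A minimal sketch of pairing vm_page_trysbusy() with vm_page_sunbusy()
 * for a caller that cannot sleep (illustrative only):
 *
 *	if (vm_page_trysbusy(m)) {
 *		(read the page's contents);
 *		vm_page_sunbusy(m);
 *	}
 */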
878
879static void
880vm_page_xunbusy_locked(vm_page_t m)
881{
882
883	vm_page_assert_xbusied(m);
884	vm_page_assert_locked(m);
885
886	atomic_store_rel_int(&m->busy_lock, VPB_UNBUSIED);
887	/* There is a waiter, do wakeup() instead of vm_page_flash(). */
888	wakeup(m);
889}
890
891void
892vm_page_xunbusy_maybelocked(vm_page_t m)
893{
894	bool lockacq;
895
896	vm_page_assert_xbusied(m);
897
898	/*
899	 * Fast path for unbusy.  If it succeeds, we know that there
900	 * are no waiters, so we do not need a wakeup.
901	 */
902	if (atomic_cmpset_rel_int(&m->busy_lock, VPB_SINGLE_EXCLUSIVER,
903	    VPB_UNBUSIED))
904		return;
905
906	lockacq = !mtx_owned(vm_page_lockptr(m));
907	if (lockacq)
908		vm_page_lock(m);
909	vm_page_xunbusy_locked(m);
910	if (lockacq)
911		vm_page_unlock(m);
912}
913
914/*
915 *	vm_page_xunbusy_hard:
916 *
917 *	Called after the first attempt to exclusively unbusy a page has failed.
918 *	It is assumed that the waiters bit is on.
919 */
920void
921vm_page_xunbusy_hard(vm_page_t m)
922{
923
924	vm_page_assert_xbusied(m);
925
926	vm_page_lock(m);
927	vm_page_xunbusy_locked(m);
928	vm_page_unlock(m);
929}
930
931/*
932 *	vm_page_flash:
933 *
934 *	Wakeup anyone waiting for the page.
935 *	The ownership bits do not change.
936 *
937 *	The given page must be locked.
938 */
939void
940vm_page_flash(vm_page_t m)
941{
942	u_int x;
943
944	vm_page_lock_assert(m, MA_OWNED);
945
946	for (;;) {
947		x = m->busy_lock;
948		if ((x & VPB_BIT_WAITERS) == 0)
949			return;
950		if (atomic_cmpset_int(&m->busy_lock, x,
951		    x & (~VPB_BIT_WAITERS)))
952			break;
953	}
954	wakeup(m);
955}
956
957/*
958 * Avoid releasing and reacquiring the same page lock.
959 */
960void
961vm_page_change_lock(vm_page_t m, struct mtx **mtx)
962{
963	struct mtx *mtx1;
964
965	mtx1 = vm_page_lockptr(m);
966	if (*mtx == mtx1)
967		return;
968	if (*mtx != NULL)
969		mtx_unlock(*mtx);
970	*mtx = mtx1;
971	mtx_lock(mtx1);
972}
973
974/*
975 * Keep a page from being freed by the page daemon.  Holding a page
976 * has much the same effect as wiring it, but with much lower
977 * overhead, and it should be used only for *very* temporary
978 * holds ("wiring").
979 */
980void
981vm_page_hold(vm_page_t mem)
982{
983
984	vm_page_lock_assert(mem, MA_OWNED);
985	mem->hold_count++;
986}
987
988void
989vm_page_unhold(vm_page_t mem)
990{
991
992	vm_page_lock_assert(mem, MA_OWNED);
993	KASSERT(mem->hold_count >= 1, ("vm_page_unhold: hold count < 0!!!"));
994	--mem->hold_count;
995	if (mem->hold_count == 0 && (mem->flags & PG_UNHOLDFREE) != 0)
996		vm_page_free_toq(mem);
997}
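
/*
 * A minimal hold/unhold sketch (illustrative): holds are taken and
 * released under the page lock and only prevent the page from being
 * freed while the caller briefly works on it:
 *
 *	vm_page_lock(m);
 *	vm_page_hold(m);
 *	vm_page_unlock(m);
 *	(short-lived access to the page);
 *	vm_page_lock(m);
 *	vm_page_unhold(m);
 *	vm_page_unlock(m);
 */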
998
999/*
1000 *	vm_page_unhold_pages:
1001 *
1002 *	Unhold each of the pages that is referenced by the given array.
1003 */
1004void
1005vm_page_unhold_pages(vm_page_t *ma, int count)
1006{
1007	struct mtx *mtx;
1008
1009	mtx = NULL;
1010	for (; count != 0; count--) {
1011		vm_page_change_lock(*ma, &mtx);
1012		vm_page_unhold(*ma);
1013		ma++;
1014	}
1015	if (mtx != NULL)
1016		mtx_unlock(mtx);
1017}
1018
1019vm_page_t
1020PHYS_TO_VM_PAGE(vm_paddr_t pa)
1021{
1022	vm_page_t m;
1023
1024#ifdef VM_PHYSSEG_SPARSE
1025	m = vm_phys_paddr_to_vm_page(pa);
1026	if (m == NULL)
1027		m = vm_phys_fictitious_to_vm_page(pa);
1028	return (m);
1029#elif defined(VM_PHYSSEG_DENSE)
1030	long pi;
1031
1032	pi = atop(pa);
1033	if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
1034		m = &vm_page_array[pi - first_page];
1035		return (m);
1036	}
1037	return (vm_phys_fictitious_to_vm_page(pa));
1038#else
1039#error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined."
1040#endif
1041}
1042
1043/*
1044 *	vm_page_getfake:
1045 *
1046 *	Create a fictitious page with the specified physical address and
1047 *	memory attribute.  The memory attribute is the only machine-
1048 *	dependent aspect of a fictitious page that must be initialized.
1049 */
1050vm_page_t
1051vm_page_getfake(vm_paddr_t paddr, vm_memattr_t memattr)
1052{
1053	vm_page_t m;
1054
1055	m = uma_zalloc(fakepg_zone, M_WAITOK | M_ZERO);
1056	vm_page_initfake(m, paddr, memattr);
1057	return (m);
1058}
1059
1060void
1061vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr)
1062{
1063
1064	if ((m->flags & PG_FICTITIOUS) != 0) {
1065		/*
1066		 * The page's memattr might have changed since the
1067		 * previous initialization.  Update the pmap to the
1068		 * new memattr.
1069		 */
1070		goto memattr;
1071	}
1072	m->phys_addr = paddr;
1073	m->queue = PQ_NONE;
1074	/* Fictitious pages don't use "segind". */
1075	m->flags = PG_FICTITIOUS;
1076	/* Fictitious pages don't use "order" or "pool". */
1077	m->oflags = VPO_UNMANAGED;
1078	m->busy_lock = VPB_SINGLE_EXCLUSIVER;
1079	m->wire_count = 1;
1080	pmap_page_init(m);
1081memattr:
1082	pmap_page_set_memattr(m, memattr);
1083}
1084
1085/*
1086 *	vm_page_putfake:
1087 *
1088 *	Release a fictitious page.
1089 */
1090void
1091vm_page_putfake(vm_page_t m)
1092{
1093
1094	KASSERT((m->oflags & VPO_UNMANAGED) != 0, ("managed %p", m));
1095	KASSERT((m->flags & PG_FICTITIOUS) != 0,
1096	    ("vm_page_putfake: bad page %p", m));
1097	uma_zfree(fakepg_zone, m);
1098}
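
/*
 * A short sketch of the fictitious-page helpers above (the address and
 * memory attribute are illustrative and machine-dependent):
 *
 *	m = vm_page_getfake(0xd0000000, VM_MEMATTR_UNCACHEABLE);
 *	(hand the page to a device pager or similar consumer);
 *	vm_page_putfake(m);
 */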
1099
1100/*
1101 *	vm_page_updatefake:
1102 *
1103 *	Update the given fictitious page to the specified physical address and
1104 *	memory attribute.
1105 */
1106void
1107vm_page_updatefake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr)
1108{
1109
1110	KASSERT((m->flags & PG_FICTITIOUS) != 0,
1111	    ("vm_page_updatefake: bad page %p", m));
1112	m->phys_addr = paddr;
1113	pmap_page_set_memattr(m, memattr);
1114}
1115
1116/*
1117 *	vm_page_free:
1118 *
1119 *	Free a page.
1120 */
1121void
1122vm_page_free(vm_page_t m)
1123{
1124
1125	m->flags &= ~PG_ZERO;
1126	vm_page_free_toq(m);
1127}
1128
1129/*
1130 *	vm_page_free_zero:
1131 *
1132 *	Free a page to the zeroed-pages queue
1133 */
1134void
1135vm_page_free_zero(vm_page_t m)
1136{
1137
1138	m->flags |= PG_ZERO;
1139	vm_page_free_toq(m);
1140}
1141
1142/*
1143 * Unbusy and handle the page queueing for a page from a getpages request that
1144 * was optionally read ahead or behind.
1145 */
1146void
1147vm_page_readahead_finish(vm_page_t m)
1148{
1149
1150	/* We shouldn't put invalid pages on queues. */
1151	KASSERT(m->valid != 0, ("%s: %p is invalid", __func__, m));
1152
1153	/*
1154	 * Since the page is not the actually needed one, whether it should
1155	 * be activated or deactivated is not obvious.  Empirical results
1156	 * have shown that deactivating the page is usually the best choice,
1157	 * unless the page is wanted by another thread.
1158	 */
1159	vm_page_lock(m);
1160	if ((m->busy_lock & VPB_BIT_WAITERS) != 0)
1161		vm_page_activate(m);
1162	else
1163		vm_page_deactivate(m);
1164	vm_page_unlock(m);
1165	vm_page_xunbusy(m);
1166}
1167
1168/*
1169 *	vm_page_sleep_if_busy:
1170 *
1171 *	Sleep and release the page queues lock if the page is busied.
1172 *	Returns TRUE if the thread slept.
1173 *
1174 *	The given page must be unlocked and object containing it must
1175 *	be locked.
1176 */
1177int
1178vm_page_sleep_if_busy(vm_page_t m, const char *msg)
1179{
1180	vm_object_t obj;
1181
1182	vm_page_lock_assert(m, MA_NOTOWNED);
1183	VM_OBJECT_ASSERT_WLOCKED(m->object);
1184
1185	if (vm_page_busied(m)) {
1186		/*
1187		 * The page-specific object must be cached because page
1188		 * identity can change during the sleep; without caching, a
1189		 * different object might be re-locked afterwards.
1190		 * It is assumed that a reference to the object is already
1191		 * held by the caller.
1192		 */
1193		obj = m->object;
1194		vm_page_lock(m);
1195		VM_OBJECT_WUNLOCK(obj);
1196		vm_page_busy_sleep(m, msg, false);
1197		VM_OBJECT_WLOCK(obj);
1198		return (TRUE);
1199	}
1200	return (FALSE);
1201}
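
/*
 * A typical caller sketch (illustrative): because the page's identity
 * can change while sleeping, the page is looked up again after each
 * sleep before the operation is retried:
 *
 *	while ((m = vm_page_lookup(object, pindex)) != NULL &&
 *	    vm_page_sleep_if_busy(m, "pgbusy"))
 *		continue;
 */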
1202
1203/*
1204 *	vm_page_dirty_KBI:		[ internal use only ]
1205 *
1206 *	Set all bits in the page's dirty field.
1207 *
1208 *	The object containing the specified page must be locked if the
1209 *	call is made from the machine-independent layer.
1210 *
1211 *	See vm_page_clear_dirty_mask().
1212 *
1213 *	This function should only be called by vm_page_dirty().
1214 */
1215void
1216vm_page_dirty_KBI(vm_page_t m)
1217{
1218
1219	/* Refer to this operation by its public name. */
1220	KASSERT(m->valid == VM_PAGE_BITS_ALL,
1221	    ("vm_page_dirty: page is invalid!"));
1222	m->dirty = VM_PAGE_BITS_ALL;
1223}
1224
1225/*
1226 *	vm_page_insert:		[ internal use only ]
1227 *
1228 *	Inserts the given mem entry into the object and object list.
1229 *
1230 *	The object must be locked.
1231 */
1232int
1233vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
1234{
1235	vm_page_t mpred;
1236
1237	VM_OBJECT_ASSERT_WLOCKED(object);
1238	mpred = vm_radix_lookup_le(&object->rtree, pindex);
1239	return (vm_page_insert_after(m, object, pindex, mpred));
1240}
1241
1242/*
1243 *	vm_page_insert_after:
1244 *
1245 *	Inserts the page "m" into the specified object at offset "pindex".
1246 *
1247 *	The page "mpred" must immediately precede the offset "pindex" within
1248 *	the specified object.
1249 *
1250 *	The object must be locked.
1251 */
1252static int
1253vm_page_insert_after(vm_page_t m, vm_object_t object, vm_pindex_t pindex,
1254    vm_page_t mpred)
1255{
1256	vm_page_t msucc;
1257
1258	VM_OBJECT_ASSERT_WLOCKED(object);
1259	KASSERT(m->object == NULL,
1260	    ("vm_page_insert_after: page already inserted"));
1261	if (mpred != NULL) {
1262		KASSERT(mpred->object == object,
1263		    ("vm_page_insert_after: object doesn't contain mpred"));
1264		KASSERT(mpred->pindex < pindex,
1265		    ("vm_page_insert_after: mpred doesn't precede pindex"));
1266		msucc = TAILQ_NEXT(mpred, listq);
1267	} else
1268		msucc = TAILQ_FIRST(&object->memq);
1269	if (msucc != NULL)
1270		KASSERT(msucc->pindex > pindex,
1271		    ("vm_page_insert_after: msucc doesn't succeed pindex"));
1272
1273	/*
1274	 * Record the object/offset pair in this page
1275	 */
1276	m->object = object;
1277	m->pindex = pindex;
1278
1279	/*
1280	 * Now link into the object's ordered list of backed pages.
1281	 */
1282	if (vm_radix_insert(&object->rtree, m)) {
1283		m->object = NULL;
1284		m->pindex = 0;
1285		return (1);
1286	}
1287	vm_page_insert_radixdone(m, object, mpred);
1288	return (0);
1289}
1290
1291/*
1292 *	vm_page_insert_radixdone:
1293 *
1294 *	Complete page "m" insertion into the specified object after the
1295 *	radix trie hooking.
1296 *
1297 *	The page "mpred" must precede the offset "m->pindex" within the
1298 *	specified object.
1299 *
1300 *	The object must be locked.
1301 */
1302static void
1303vm_page_insert_radixdone(vm_page_t m, vm_object_t object, vm_page_t mpred)
1304{
1305
1306	VM_OBJECT_ASSERT_WLOCKED(object);
1307	KASSERT(object != NULL && m->object == object,
1308	    ("vm_page_insert_radixdone: page %p has inconsistent object", m));
1309	if (mpred != NULL) {
1310		KASSERT(mpred->object == object,
1311		    ("vm_page_insert_after: object doesn't contain mpred"));
1312		KASSERT(mpred->pindex < m->pindex,
1313		    ("vm_page_insert_after: mpred doesn't precede pindex"));
1314	}
1315
1316	if (mpred != NULL)
1317		TAILQ_INSERT_AFTER(&object->memq, mpred, m, listq);
1318	else
1319		TAILQ_INSERT_HEAD(&object->memq, m, listq);
1320
1321	/*
1322	 * Show that the object has one more resident page.
1323	 */
1324	object->resident_page_count++;
1325
1326	/*
1327	 * Hold the vnode until the last page is released.
1328	 */
1329	if (object->resident_page_count == 1 && object->type == OBJT_VNODE)
1330		vhold(object->handle);
1331
1332	/*
1333	 * Since we are inserting a new and possibly dirty page,
1334	 * update the object's OBJ_MIGHTBEDIRTY flag.
1335	 */
1336	if (pmap_page_is_write_mapped(m))
1337		vm_object_set_writeable_dirty(object);
1338}
1339
1340/*
1341 *	vm_page_remove:
1342 *
1343 *	Removes the specified page from its containing object, but does not
1344 *	invalidate any backing storage.
1345 *
1346 *	The object must be locked.  The page must be locked if it is managed.
1347 */
1348void
1349vm_page_remove(vm_page_t m)
1350{
1351	vm_object_t object;
1352	vm_page_t mrem;
1353
1354	if ((m->oflags & VPO_UNMANAGED) == 0)
1355		vm_page_assert_locked(m);
1356	if ((object = m->object) == NULL)
1357		return;
1358	VM_OBJECT_ASSERT_WLOCKED(object);
1359	if (vm_page_xbusied(m))
1360		vm_page_xunbusy_maybelocked(m);
1361	mrem = vm_radix_remove(&object->rtree, m->pindex);
1362	KASSERT(mrem == m, ("removed page %p, expected page %p", mrem, m));
1363
1364	/*
1365	 * Now remove from the object's list of backed pages.
1366	 */
1367	TAILQ_REMOVE(&object->memq, m, listq);
1368
1369	/*
1370	 * And show that the object has one fewer resident page.
1371	 */
1372	object->resident_page_count--;
1373
1374	/*
1375	 * The vnode may now be recycled.
1376	 */
1377	if (object->resident_page_count == 0 && object->type == OBJT_VNODE)
1378		vdrop(object->handle);
1379
1380	m->object = NULL;
1381}
1382
1383/*
1384 *	vm_page_lookup:
1385 *
1386 *	Returns the page associated with the object/offset
1387 *	pair specified; if none is found, NULL is returned.
1388 *
1389 *	The object must be locked.
1390 */
1391vm_page_t
1392vm_page_lookup(vm_object_t object, vm_pindex_t pindex)
1393{
1394
1395	VM_OBJECT_ASSERT_LOCKED(object);
1396	return (vm_radix_lookup(&object->rtree, pindex));
1397}
1398
1399/*
1400 *	vm_page_find_least:
1401 *
1402 *	Returns the page associated with the object with least pindex
1403 *	greater than or equal to the parameter pindex, or NULL.
1404 *
1405 *	The object must be locked.
1406 */
1407vm_page_t
1408vm_page_find_least(vm_object_t object, vm_pindex_t pindex)
1409{
1410	vm_page_t m;
1411
1412	VM_OBJECT_ASSERT_LOCKED(object);
1413	if ((m = TAILQ_FIRST(&object->memq)) != NULL && m->pindex < pindex)
1414		m = vm_radix_lookup_ge(&object->rtree, pindex);
1415	return (m);
1416}
1417
1418/*
1419 * Returns the given page's successor (by pindex) within the object if it is
1420 * resident; if none is found, NULL is returned.
1421 *
1422 * The object must be locked.
1423 */
1424vm_page_t
1425vm_page_next(vm_page_t m)
1426{
1427	vm_page_t next;
1428
1429	VM_OBJECT_ASSERT_LOCKED(m->object);
1430	if ((next = TAILQ_NEXT(m, listq)) != NULL) {
1431		MPASS(next->object == m->object);
1432		if (next->pindex != m->pindex + 1)
1433			next = NULL;
1434	}
1435	return (next);
1436}
1437
1438/*
1439 * Returns the given page's predecessor (by pindex) within the object if it is
1440 * resident; if none is found, NULL is returned.
1441 *
1442 * The object must be locked.
1443 */
1444vm_page_t
1445vm_page_prev(vm_page_t m)
1446{
1447	vm_page_t prev;
1448
1449	VM_OBJECT_ASSERT_LOCKED(m->object);
1450	if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL) {
1451		MPASS(prev->object == m->object);
1452		if (prev->pindex != m->pindex - 1)
1453			prev = NULL;
1454	}
1455	return (prev);
1456}
1457
1458/*
1459 * Uses the page mnew as a replacement for an existing page at index
1460 * pindex which must be already present in the object.
1461 *
1462 * The existing page must not be on a paging queue.
1463 */
1464vm_page_t
1465vm_page_replace(vm_page_t mnew, vm_object_t object, vm_pindex_t pindex)
1466{
1467	vm_page_t mold;
1468
1469	VM_OBJECT_ASSERT_WLOCKED(object);
1470	KASSERT(mnew->object == NULL,
1471	    ("vm_page_replace: page already in object"));
1472
1473	/*
1474	 * This function mostly follows vm_page_insert() and
1475	 * vm_page_remove() without the radix, object count and vnode
1476	 * dance.  Double check such functions for more comments.
1477	 */
1478
1479	mnew->object = object;
1480	mnew->pindex = pindex;
1481	mold = vm_radix_replace(&object->rtree, mnew);
1482	KASSERT(mold->queue == PQ_NONE,
1483	    ("vm_page_replace: mold is on a paging queue"));
1484
1485	/* Keep the resident page list in sorted order. */
1486	TAILQ_INSERT_AFTER(&object->memq, mold, mnew, listq);
1487	TAILQ_REMOVE(&object->memq, mold, listq);
1488
1489	mold->object = NULL;
1490	vm_page_xunbusy_maybelocked(mold);
1491
1492	/*
1493	 * The object's resident_page_count does not change because we have
1494	 * swapped one page for another, but OBJ_MIGHTBEDIRTY may need updating.
1495	 */
1496	if (pmap_page_is_write_mapped(mnew))
1497		vm_object_set_writeable_dirty(object);
1498	return (mold);
1499}
1500
1501/*
1502 *	vm_page_rename:
1503 *
1504 *	Move the given memory entry from its
1505 *	current object to the specified target object/offset.
1506 *
1507 *	Note: swap associated with the page must be invalidated by the move.  We
1508 *	      have to do this for several reasons:  (1) we aren't freeing the
1509 *	      page, (2) we are dirtying the page, (3) the VM system is probably
1510 *	      moving the page from object A to B, and will then later move
1511 *	      the backing store from A to B and we can't have a conflict.
1512 *
1513 *	Note: we *always* dirty the page.  It is necessary both for the
1514 *	      fact that we moved it, and because we may be invalidating
1515 *	      swap.
1516 *
1517 *	The objects must be locked.
1518 */
1519int
1520vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex)
1521{
1522	vm_page_t mpred;
1523	vm_pindex_t opidx;
1524
1525	VM_OBJECT_ASSERT_WLOCKED(new_object);
1526
1527	mpred = vm_radix_lookup_le(&new_object->rtree, new_pindex);
1528	KASSERT(mpred == NULL || mpred->pindex != new_pindex,
1529	    ("vm_page_rename: pindex already renamed"));
1530
1531	/*
1532	 * Create a custom version of vm_page_insert() which does not depend
1533	 * on mpred and can cheat on the implementation aspects of the
1534	 * function.
1535	 */
1536	opidx = m->pindex;
1537	m->pindex = new_pindex;
1538	if (vm_radix_insert(&new_object->rtree, m)) {
1539		m->pindex = opidx;
1540		return (1);
1541	}
1542
1543	/*
1544	 * The operation cannot fail anymore.  The removal must happen before
1545	 * the listq iterator is tainted.
1546	 */
1547	m->pindex = opidx;
1548	vm_page_lock(m);
1549	vm_page_remove(m);
1550
1551	/* Return back to the new pindex to complete vm_page_insert(). */
1552	m->pindex = new_pindex;
1553	m->object = new_object;
1554	vm_page_unlock(m);
1555	vm_page_insert_radixdone(m, new_object, mpred);
1556	vm_page_dirty(m);
1557	return (0);
1558}
1559
1560/*
1561 *	vm_page_alloc:
1562 *
1563 *	Allocate and return a page that is associated with the specified
1564 *	object and offset pair.  By default, this page is exclusive busied.
1565 *
1566 *	The caller must always specify an allocation class.
1567 *
1568 *	allocation classes:
1569 *	VM_ALLOC_NORMAL		normal process request
1570 *	VM_ALLOC_SYSTEM		system *really* needs a page
1571 *	VM_ALLOC_INTERRUPT	interrupt time request
1572 *
1573 *	optional allocation flags:
1574 *	VM_ALLOC_COUNT(number)	the number of additional pages that the caller
1575 *				intends to allocate
1576 *	VM_ALLOC_NOBUSY		do not exclusive busy the page
1577 *	VM_ALLOC_NODUMP		do not include the page in a kernel core dump
1578 *	VM_ALLOC_NOOBJ		page is not associated with an object and
1579 *				should not be exclusive busy
1580 *	VM_ALLOC_SBUSY		shared busy the allocated page
1581 *	VM_ALLOC_WIRED		wire the allocated page
1582 *	VM_ALLOC_ZERO		prefer a zeroed page
1583 *
1584 *	This routine may not sleep.
1585 */
1586vm_page_t
1587vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req)
1588{
1589
1590	return (vm_page_alloc_after(object, pindex, req, object != NULL ?
1591	    vm_radix_lookup_le(&object->rtree, pindex) : NULL));
1592}
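
/*
 * A minimal caller sketch (illustrative; not taken from this file):
 * allocate a page for "object", preferring a pre-zeroed one, and wait
 * for free memory if the allocation fails:
 *
 *	while ((m = vm_page_alloc(object, pindex,
 *	    VM_ALLOC_NORMAL | VM_ALLOC_ZERO)) == NULL) {
 *		VM_OBJECT_WUNLOCK(object);
 *		VM_WAIT;
 *		VM_OBJECT_WLOCK(object);
 *	}
 *	if ((m->flags & PG_ZERO) == 0)
 *		pmap_zero_page(m);
 */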
1593
1594/*
1595 * Allocate a page in the specified object with the given page index.  To
1596 * optimize insertion of the page into the object, the caller must also specify
1597 * the resident page in the object with largest index smaller than the given
1598 * page index, or NULL if no such page exists.
1599 */
1600vm_page_t
1601vm_page_alloc_after(vm_object_t object, vm_pindex_t pindex, int req,
1602    vm_page_t mpred)
1603{
1604	vm_page_t m;
1605	int flags, req_class;
1606	u_int free_count;
1607
1608	KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0) &&
1609	    (object != NULL || (req & VM_ALLOC_SBUSY) == 0) &&
1610	    ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) !=
1611	    (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)),
1612	    ("inconsistent object(%p)/req(%x)", object, req));
1613	KASSERT(object == NULL || (req & VM_ALLOC_WAITOK) == 0,
1614	    ("Can't sleep and retry object insertion."));
1615	KASSERT(mpred == NULL || mpred->pindex < pindex,
1616	    ("mpred %p doesn't precede pindex 0x%jx", mpred,
1617	    (uintmax_t)pindex));
1618	if (object != NULL)
1619		VM_OBJECT_ASSERT_WLOCKED(object);
1620
1621	if (__predict_false((req & VM_ALLOC_IFCACHED) != 0))
1622		return (NULL);
1623
1624	req_class = req & VM_ALLOC_CLASS_MASK;
1625
1626	/*
1627	 * The page daemon is allowed to dig deeper into the free page list.
1628	 */
1629	if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT)
1630		req_class = VM_ALLOC_SYSTEM;
1631
1632	/*
1633	 * Allocate a page if the number of free pages exceeds the minimum
1634	 * for the request class.
1635	 */
1636again:
1637	mtx_lock(&vm_page_queue_free_mtx);
1638	if (vm_cnt.v_free_count > vm_cnt.v_free_reserved ||
1639	    (req_class == VM_ALLOC_SYSTEM &&
1640	    vm_cnt.v_free_count > vm_cnt.v_interrupt_free_min) ||
1641	    (req_class == VM_ALLOC_INTERRUPT &&
1642	    vm_cnt.v_free_count > 0)) {
1643		/*
1644		 * Can we allocate the page from a reservation?
1645		 */
1646#if VM_NRESERVLEVEL > 0
1647		if (object == NULL || (object->flags & (OBJ_COLORED |
1648		    OBJ_FICTITIOUS)) != OBJ_COLORED || (m =
1649		    vm_reserv_alloc_page(object, pindex, mpred)) == NULL)
1650#endif
1651		{
1652			/*
1653			 * If not, allocate it from the free page queues.
1654			 */
1655			m = vm_phys_alloc_pages(object != NULL ?
1656			    VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT, 0);
1657#if VM_NRESERVLEVEL > 0
1658			if (m == NULL && vm_reserv_reclaim_inactive()) {
1659				m = vm_phys_alloc_pages(object != NULL ?
1660				    VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT,
1661				    0);
1662			}
1663#endif
1664		}
1665	} else {
1666		/*
1667		 * Not allocatable, give up.
1668		 */
1669		if (vm_page_alloc_fail(object, req))
1670			goto again;
1671		return (NULL);
1672	}
1673
1674	/*
1675	 *  At this point we had better have found a good page.
1676	 */
1677	KASSERT(m != NULL, ("missing page"));
1678	free_count = vm_phys_freecnt_adj(m, -1);
1679	if ((m->flags & PG_ZERO) != 0)
1680		vm_page_zero_count--;
1681	mtx_unlock(&vm_page_queue_free_mtx);
1682	vm_page_alloc_check(m);
1683
1684	/*
1685	 * Initialize the page.  Only the PG_ZERO flag is inherited.
1686	 */
1687	flags = 0;
1688	if ((req & VM_ALLOC_ZERO) != 0)
1689		flags = PG_ZERO;
1690	flags &= m->flags;
1691	if ((req & VM_ALLOC_NODUMP) != 0)
1692		flags |= PG_NODUMP;
1693	m->flags = flags;
1694	m->aflags = 0;
1695	m->oflags = object == NULL || (object->flags & OBJ_UNMANAGED) != 0 ?
1696	    VPO_UNMANAGED : 0;
1697	m->busy_lock = VPB_UNBUSIED;
1698	if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ | VM_ALLOC_SBUSY)) == 0)
1699		m->busy_lock = VPB_SINGLE_EXCLUSIVER;
1700	if ((req & VM_ALLOC_SBUSY) != 0)
1701		m->busy_lock = VPB_SHARERS_WORD(1);
1702	if (req & VM_ALLOC_WIRED) {
1703		/*
1704		 * The page lock is not required for wiring a page until that
1705		 * page is inserted into the object.
1706		 */
1707		atomic_add_int(&vm_cnt.v_wire_count, 1);
1708		m->wire_count = 1;
1709	}
1710	m->act_count = 0;
1711
1712	if (object != NULL) {
1713		if (vm_page_insert_after(m, object, pindex, mpred)) {
1714			pagedaemon_wakeup();
1715			if (req & VM_ALLOC_WIRED) {
1716				atomic_subtract_int(&vm_cnt.v_wire_count, 1);
1717				m->wire_count = 0;
1718			}
1719			KASSERT(m->object == NULL, ("page %p has object", m));
1720			m->oflags = VPO_UNMANAGED;
1721			m->busy_lock = VPB_UNBUSIED;
1722			/* Don't change PG_ZERO. */
1723			vm_page_free_toq(m);
1724			if (req & VM_ALLOC_WAITFAIL) {
1725				VM_OBJECT_WUNLOCK(object);
1726				vm_radix_wait();
1727				VM_OBJECT_WLOCK(object);
1728			}
1729			return (NULL);
1730		}
1731
1732		/* Ignore device objects; the pager sets "memattr" for them. */
1733		if (object->memattr != VM_MEMATTR_DEFAULT &&
1734		    (object->flags & OBJ_FICTITIOUS) == 0)
1735			pmap_page_set_memattr(m, object->memattr);
1736	} else
1737		m->pindex = pindex;
1738
1739	/*
1740	 * Don't wakeup too often - wakeup the pageout daemon when
1741	 * we would be nearly out of memory.
1742	 */
1743	if (vm_paging_needed(free_count))
1744		pagedaemon_wakeup();
1745
1746	return (m);
1747}
1748
1749/*
1750 *	vm_page_alloc_contig:
1751 *
1752 *	Allocate a contiguous set of physical pages of the given size "npages"
1753 *	from the free lists.  All of the physical pages must be at or above
1754 *	the given physical address "low" and below the given physical address
1755 *	"high".  The given value "alignment" determines the alignment of the
1756 *	first physical page in the set.  If the given value "boundary" is
1757 *	non-zero, then the set of physical pages cannot cross any physical
1758 *	address boundary that is a multiple of that value.  Both "alignment"
1759 *	and "boundary" must be a power of two.
1760 *
1761 *	If the specified memory attribute, "memattr", is VM_MEMATTR_DEFAULT,
1762 *	then the memory attribute setting for the physical pages is configured
1763 *	to the object's memory attribute setting.  Otherwise, the memory
1764 *	attribute setting for the physical pages is configured to "memattr",
1765 *	overriding the object's memory attribute setting.  However, if the
1766 *	object's memory attribute setting is not VM_MEMATTR_DEFAULT, then the
1767 *	memory attribute setting for the physical pages cannot be configured
1768 *	to VM_MEMATTR_DEFAULT.
1769 *
1770 *	The specified object may not contain fictitious pages.
1771 *
1772 *	The caller must always specify an allocation class.
1773 *
1774 *	allocation classes:
1775 *	VM_ALLOC_NORMAL		normal process request
1776 *	VM_ALLOC_SYSTEM		system *really* needs a page
1777 *	VM_ALLOC_INTERRUPT	interrupt time request
1778 *
1779 *	optional allocation flags:
1780 *	VM_ALLOC_NOBUSY		do not exclusive busy the page
1781 *	VM_ALLOC_NODUMP		do not include the page in a kernel core dump
1782 *	VM_ALLOC_NOOBJ		page is not associated with an object and
1783 *				should not be exclusive busy
1784 *	VM_ALLOC_SBUSY		shared busy the allocated page
1785 *	VM_ALLOC_WIRED		wire the allocated page
1786 *	VM_ALLOC_ZERO		prefer a zeroed page
1787 *
1788 *	This routine may not sleep.
1789 */
1790vm_page_t
1791vm_page_alloc_contig(vm_object_t object, vm_pindex_t pindex, int req,
1792    u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment,
1793    vm_paddr_t boundary, vm_memattr_t memattr)
1794{
1795	vm_page_t m, m_ret, mpred;
1796	u_int busy_lock, flags, oflags;
1797	int req_class;
1798
1799	mpred = NULL;	/* XXX: pacify gcc */
1800	KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0) &&
1801	    (object != NULL || (req & VM_ALLOC_SBUSY) == 0) &&
1802	    ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) !=
1803	    (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)),
1804	    ("vm_page_alloc_contig: inconsistent object(%p)/req(%x)", object,
1805	    req));
1806	KASSERT(object == NULL || (req & VM_ALLOC_WAITOK) == 0,
1807	    ("Can't sleep and retry object insertion."));
1808	if (object != NULL) {
1809		VM_OBJECT_ASSERT_WLOCKED(object);
1810		KASSERT((object->flags & OBJ_FICTITIOUS) == 0,
1811		    ("vm_page_alloc_contig: object %p has fictitious pages",
1812		    object));
1813	}
1814	KASSERT(npages > 0, ("vm_page_alloc_contig: npages is zero"));
1815	req_class = req & VM_ALLOC_CLASS_MASK;
1816
1817	/*
1818	 * The page daemon is allowed to dig deeper into the free page list.
1819	 */
1820	if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT)
1821		req_class = VM_ALLOC_SYSTEM;
1822
1823	if (object != NULL) {
1824		mpred = vm_radix_lookup_le(&object->rtree, pindex);
1825		KASSERT(mpred == NULL || mpred->pindex != pindex,
1826		    ("vm_page_alloc_contig: pindex already allocated"));
1827	}
1828
1829	/*
1830	 * Can we allocate the pages without the number of free pages falling
1831	 * below the lower bound for the allocation class?
1832	 */
1833again:
1834	mtx_lock(&vm_page_queue_free_mtx);
1835	if (vm_cnt.v_free_count >= npages + vm_cnt.v_free_reserved ||
1836	    (req_class == VM_ALLOC_SYSTEM &&
1837	    vm_cnt.v_free_count >= npages + vm_cnt.v_interrupt_free_min) ||
1838	    (req_class == VM_ALLOC_INTERRUPT &&
1839	    vm_cnt.v_free_count >= npages)) {
1840		/*
1841		 * Can we allocate the pages from a reservation?
1842		 */
1843#if VM_NRESERVLEVEL > 0
1844retry:
1845		if (object == NULL || (object->flags & OBJ_COLORED) == 0 ||
1846		    (m_ret = vm_reserv_alloc_contig(object, pindex, npages,
1847		    low, high, alignment, boundary, mpred)) == NULL)
1848#endif
1849			/*
1850			 * If not, allocate them from the free page queues.
1851			 */
1852			m_ret = vm_phys_alloc_contig(npages, low, high,
1853			    alignment, boundary);
1854	} else {
1855		if (vm_page_alloc_fail(object, req))
1856			goto again;
1857		return (NULL);
1858	}
1859	if (m_ret != NULL) {
1860		vm_phys_freecnt_adj(m_ret, -npages);
1861		for (m = m_ret; m < &m_ret[npages]; m++)
1862			if ((m->flags & PG_ZERO) != 0)
1863				vm_page_zero_count--;
1864	} else {
1865#if VM_NRESERVLEVEL > 0
1866		if (vm_reserv_reclaim_contig(npages, low, high, alignment,
1867		    boundary))
1868			goto retry;
1869#endif
1870	}
1871	mtx_unlock(&vm_page_queue_free_mtx);
1872	if (m_ret == NULL)
1873		return (NULL);
1874	for (m = m_ret; m < &m_ret[npages]; m++)
1875		vm_page_alloc_check(m);
1876
1877	/*
1878	 * Initialize the pages.  Only the PG_ZERO flag is inherited.
1879	 */
1880	flags = 0;
1881	if ((req & VM_ALLOC_ZERO) != 0)
1882		flags = PG_ZERO;
1883	if ((req & VM_ALLOC_NODUMP) != 0)
1884		flags |= PG_NODUMP;
1885	oflags = object == NULL || (object->flags & OBJ_UNMANAGED) != 0 ?
1886	    VPO_UNMANAGED : 0;
1887	busy_lock = VPB_UNBUSIED;
1888	if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ | VM_ALLOC_SBUSY)) == 0)
1889		busy_lock = VPB_SINGLE_EXCLUSIVER;
1890	if ((req & VM_ALLOC_SBUSY) != 0)
1891		busy_lock = VPB_SHARERS_WORD(1);
1892	if ((req & VM_ALLOC_WIRED) != 0)
1893		atomic_add_int(&vm_cnt.v_wire_count, npages);
1894	if (object != NULL) {
1895		if (object->memattr != VM_MEMATTR_DEFAULT &&
1896		    memattr == VM_MEMATTR_DEFAULT)
1897			memattr = object->memattr;
1898	}
1899	for (m = m_ret; m < &m_ret[npages]; m++) {
1900		m->aflags = 0;
1901		m->flags = (m->flags | PG_NODUMP) & flags;
1902		m->busy_lock = busy_lock;
1903		if ((req & VM_ALLOC_WIRED) != 0)
1904			m->wire_count = 1;
1905		m->act_count = 0;
1906		m->oflags = oflags;
1907		if (object != NULL) {
1908			if (vm_page_insert_after(m, object, pindex, mpred)) {
1909				pagedaemon_wakeup();
1910				if ((req & VM_ALLOC_WIRED) != 0)
1911					atomic_subtract_int(
1912					    &vm_cnt.v_wire_count, npages);
1913				KASSERT(m->object == NULL,
1914				    ("page %p has object", m));
1915				mpred = m;
1916				for (m = m_ret; m < &m_ret[npages]; m++) {
1917					if (m <= mpred &&
1918					    (req & VM_ALLOC_WIRED) != 0)
1919						m->wire_count = 0;
1920					m->oflags = VPO_UNMANAGED;
1921					m->busy_lock = VPB_UNBUSIED;
1922					/* Don't change PG_ZERO. */
1923					vm_page_free_toq(m);
1924				}
1925				if (req & VM_ALLOC_WAITFAIL) {
1926					VM_OBJECT_WUNLOCK(object);
1927					vm_radix_wait();
1928					VM_OBJECT_WLOCK(object);
1929				}
1930				return (NULL);
1931			}
1932			mpred = m;
1933		} else
1934			m->pindex = pindex;
1935		if (memattr != VM_MEMATTR_DEFAULT)
1936			pmap_page_set_memattr(m, memattr);
1937		pindex++;
1938	}
1939	if (vm_paging_needed(vm_cnt.v_free_count))
1940		pagedaemon_wakeup();
1941	return (m_ret);
1942}
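
/*
 * Illustrative sketch (hypothetical caller, not part of this file): with
 * "object" write-locked and the pindex range unpopulated, a caller might
 * request a wired run of "npages" contiguous pages with no particular
 * physical constraints and fail the operation if none is available.  Since
 * neither VM_ALLOC_NOBUSY, VM_ALLOC_SBUSY, nor VM_ALLOC_NOOBJ is passed, the
 * pages come back exclusive busied.  Note that VM_ALLOC_ZERO only expresses
 * a preference; each returned page must still be checked for PG_ZERO before
 * it is assumed to be zero-filled.
 *
 *	m = vm_page_alloc_contig(object, pindex, VM_ALLOC_NORMAL |
 *	    VM_ALLOC_WIRED | VM_ALLOC_ZERO, npages, 0, ~(vm_paddr_t)0,
 *	    PAGE_SIZE, 0, VM_MEMATTR_DEFAULT);
 *	if (m == NULL)
 *		return (ENOMEM);
 */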
1943
1944/*
1945 * Check a page that has been freshly dequeued from a freelist.
1946 */
1947static void
1948vm_page_alloc_check(vm_page_t m)
1949{
1950
1951	KASSERT(m->object == NULL, ("page %p has object", m));
1952	KASSERT(m->queue == PQ_NONE,
1953	    ("page %p has unexpected queue %d", m, m->queue));
1954	KASSERT(m->wire_count == 0, ("page %p is wired", m));
1955	KASSERT(m->hold_count == 0, ("page %p is held", m));
1956	KASSERT(!vm_page_busied(m), ("page %p is busy", m));
1957	KASSERT(m->dirty == 0, ("page %p is dirty", m));
1958	KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT,
1959	    ("page %p has unexpected memattr %d",
1960	    m, pmap_page_get_memattr(m)));
1961	KASSERT(m->valid == 0, ("free page %p is valid", m));
1962}
1963
1964/*
1965 * 	vm_page_alloc_freelist:
1966 *
1967 *	Allocate a physical page from the specified free page list.
1968 *
1969 *	The caller must always specify an allocation class.
1970 *
1971 *	allocation classes:
1972 *	VM_ALLOC_NORMAL		normal process request
1973 *	VM_ALLOC_SYSTEM		system *really* needs a page
1974 *	VM_ALLOC_INTERRUPT	interrupt time request
1975 *
1976 *	optional allocation flags:
1977 *	VM_ALLOC_COUNT(number)	the number of additional pages that the caller
1978 *				intends to allocate
1979 *	VM_ALLOC_WIRED		wire the allocated page
1980 *	VM_ALLOC_ZERO		prefer a zeroed page
1981 *
1982 *	This routine may not sleep.
1983 */
1984vm_page_t
1985vm_page_alloc_freelist(int flind, int req)
1986{
1987	vm_page_t m;
1988	u_int flags, free_count;
1989	int req_class;
1990
1991	req_class = req & VM_ALLOC_CLASS_MASK;
1992
1993	/*
1994	 * The page daemon is allowed to dig deeper into the free page list.
1995	 */
1996	if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT)
1997		req_class = VM_ALLOC_SYSTEM;
1998
1999	/*
2000	 * Do not allocate reserved pages unless the allocation class permits it.
2001	 */
2002again:
2003	mtx_lock(&vm_page_queue_free_mtx);
2004	if (vm_cnt.v_free_count > vm_cnt.v_free_reserved ||
2005	    (req_class == VM_ALLOC_SYSTEM &&
2006	    vm_cnt.v_free_count > vm_cnt.v_interrupt_free_min) ||
2007	    (req_class == VM_ALLOC_INTERRUPT &&
2008	    vm_cnt.v_free_count > 0)) {
2009		m = vm_phys_alloc_freelist_pages(flind, VM_FREEPOOL_DIRECT, 0);
2010	} else {
2011		if (vm_page_alloc_fail(NULL, req))
2012			goto again;
2013		return (NULL);
2014	}
2015	if (m == NULL) {
2016		mtx_unlock(&vm_page_queue_free_mtx);
2017		return (NULL);
2018	}
2019	free_count = vm_phys_freecnt_adj(m, -1);
2020	if ((m->flags & PG_ZERO) != 0)
2021		vm_page_zero_count--;
2022	mtx_unlock(&vm_page_queue_free_mtx);
2023	vm_page_alloc_check(m);
2024
2025	/*
2026	 * Initialize the page.  Only the PG_ZERO flag is inherited.
2027	 */
2028	m->aflags = 0;
2029	flags = 0;
2030	if ((req & VM_ALLOC_ZERO) != 0)
2031		flags = PG_ZERO;
2032	m->flags &= flags;
2033	if ((req & VM_ALLOC_WIRED) != 0) {
2034		/*
2035		 * The page lock is not required for wiring a page that does
2036		 * not belong to an object.
2037		 */
2038		atomic_add_int(&vm_cnt.v_wire_count, 1);
2039		m->wire_count = 1;
2040	}
2041	/* Unmanaged pages don't use "act_count". */
2042	m->oflags = VPO_UNMANAGED;
2043	if (vm_paging_needed(free_count))
2044		pagedaemon_wakeup();
2045	return (m);
2046}
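
/*
 * Illustrative sketch (hypothetical caller, not part of this file): an early
 * boot or pmap-level consumer might pull a single wired page directly from a
 * platform free list, zeroing it by hand when a pre-zeroed page was not
 * available.  VM_FREELIST_DEFAULT comes from the platform's vm_phys headers;
 * other list indices are machine-dependent.
 *
 *	m = vm_page_alloc_freelist(VM_FREELIST_DEFAULT,
 *	    VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_ZERO);
 *	if (m != NULL && (m->flags & PG_ZERO) == 0)
 *		pmap_zero_page(m);
 */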
2047
2048#define	VPSC_ANY	0	/* No restrictions. */
2049#define	VPSC_NORESERV	1	/* Skip reservations; implies VPSC_NOSUPER. */
2050#define	VPSC_NOSUPER	2	/* Skip superpages. */
2051
2052/*
2053 *	vm_page_scan_contig:
2054 *
2055 *	Scan vm_page_array[] between the specified entries "m_start" and
2056 *	"m_end" for a run of contiguous physical pages that satisfy the
2057 *	specified conditions, and return the lowest page in the run.  The
2058 *	specified "alignment" determines the alignment of the lowest physical
2059 *	page in the run.  If the specified "boundary" is non-zero, then the
2060 *	run of physical pages cannot span a physical address that is a
2061 *	multiple of "boundary".
2062 *
2063 *	"m_end" is never dereferenced, so it need not point to a vm_page
2064 *	structure within vm_page_array[].
2065 *
2066 *	"npages" must be greater than zero.  "m_start" and "m_end" must not
2067 *	span a hole (or discontiguity) in the physical address space.  Both
2068 *	"alignment" and "boundary" must be a power of two.
2069 */
2070vm_page_t
2071vm_page_scan_contig(u_long npages, vm_page_t m_start, vm_page_t m_end,
2072    u_long alignment, vm_paddr_t boundary, int options)
2073{
2074	struct mtx *m_mtx;
2075	vm_object_t object;
2076	vm_paddr_t pa;
2077	vm_page_t m, m_run;
2078#if VM_NRESERVLEVEL > 0
2079	int level;
2080#endif
2081	int m_inc, order, run_ext, run_len;
2082
2083	KASSERT(npages > 0, ("npages is 0"));
2084	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
2085	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
2086	m_run = NULL;
2087	run_len = 0;
2088	m_mtx = NULL;
2089	for (m = m_start; m < m_end && run_len < npages; m += m_inc) {
2090		KASSERT((m->flags & PG_MARKER) == 0,
2091		    ("page %p is PG_MARKER", m));
2092		KASSERT((m->flags & PG_FICTITIOUS) == 0 || m->wire_count == 1,
2093		    ("fictitious page %p has invalid wire count", m));
2094
2095		/*
2096		 * If the current page would be the start of a run, check its
2097		 * physical address against the end, alignment, and boundary
2098		 * conditions.  If it doesn't satisfy these conditions, either
2099		 * terminate the scan or advance to the next page that
2100		 * satisfies the failed condition.
2101		 */
2102		if (run_len == 0) {
2103			KASSERT(m_run == NULL, ("m_run != NULL"));
2104			if (m + npages > m_end)
2105				break;
2106			pa = VM_PAGE_TO_PHYS(m);
2107			if ((pa & (alignment - 1)) != 0) {
2108				m_inc = atop(roundup2(pa, alignment) - pa);
2109				continue;
2110			}
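			/*
			 * The XOR of the first and last physical addresses of
			 * the prospective run has a bit set in every position
			 * where the two addresses differ, so rounding it down
			 * to a multiple of "boundary" is non-zero exactly when
			 * the run would cross a "boundary"-aligned address.  A
			 * zero "boundary" imposes no restriction.
			 */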
2111			if (rounddown2(pa ^ (pa + ptoa(npages) - 1),
2112			    boundary) != 0) {
2113				m_inc = atop(roundup2(pa, boundary) - pa);
2114				continue;
2115			}
2116		} else
2117			KASSERT(m_run != NULL, ("m_run == NULL"));
2118
2119		vm_page_change_lock(m, &m_mtx);
2120		m_inc = 1;
2121retry:
2122		if (m->wire_count != 0 || m->hold_count != 0)
2123			run_ext = 0;
2124#if VM_NRESERVLEVEL > 0
2125		else if ((level = vm_reserv_level(m)) >= 0 &&
2126		    (options & VPSC_NORESERV) != 0) {
2127			run_ext = 0;
2128			/* Advance to the end of the reservation. */
2129			pa = VM_PAGE_TO_PHYS(m);
2130			m_inc = atop(roundup2(pa + 1, vm_reserv_size(level)) -
2131			    pa);
2132		}
2133#endif
2134		else if ((object = m->object) != NULL) {
2135			/*
2136			 * The page is considered eligible for relocation if
2137			 * and only if it could be laundered or reclaimed by
2138			 * the page daemon.
2139			 */
2140			if (!VM_OBJECT_TRYRLOCK(object)) {
2141				mtx_unlock(m_mtx);
2142				VM_OBJECT_RLOCK(object);
2143				mtx_lock(m_mtx);
2144				if (m->object != object) {
2145					/*
2146					 * The page may have been freed.
2147					 */
2148					VM_OBJECT_RUNLOCK(object);
2149					goto retry;
2150				} else if (m->wire_count != 0 ||
2151				    m->hold_count != 0) {
2152					run_ext = 0;
2153					goto unlock;
2154				}
2155			}
2156			KASSERT((m->flags & PG_UNHOLDFREE) == 0,
2157			    ("page %p is PG_UNHOLDFREE", m));
2158			/* Don't care: PG_NODUMP, PG_ZERO. */
2159			if (object->type != OBJT_DEFAULT &&
2160			    object->type != OBJT_SWAP &&
2161			    object->type != OBJT_VNODE) {
2162				run_ext = 0;
2163#if VM_NRESERVLEVEL > 0
2164			} else if ((options & VPSC_NOSUPER) != 0 &&
2165			    (level = vm_reserv_level_iffullpop(m)) >= 0) {
2166				run_ext = 0;
2167				/* Advance to the end of the superpage. */
2168				pa = VM_PAGE_TO_PHYS(m);
2169				m_inc = atop(roundup2(pa + 1,
2170				    vm_reserv_size(level)) - pa);
2171#endif
2172			} else if (object->memattr == VM_MEMATTR_DEFAULT &&
2173			    m->queue != PQ_NONE && !vm_page_busied(m)) {
2174				/*
2175				 * The page is allocated but eligible for
2176				 * relocation.  Extend the current run by one
2177				 * page.
2178				 */
2179				KASSERT(pmap_page_get_memattr(m) ==
2180				    VM_MEMATTR_DEFAULT,
2181				    ("page %p has an unexpected memattr", m));
2182				KASSERT((m->oflags & (VPO_SWAPINPROG |
2183				    VPO_SWAPSLEEP | VPO_UNMANAGED)) == 0,
2184				    ("page %p has unexpected oflags", m));
2185				/* Don't care: VPO_NOSYNC. */
2186				run_ext = 1;
2187			} else
2188				run_ext = 0;
2189unlock:
2190			VM_OBJECT_RUNLOCK(object);
2191#if VM_NRESERVLEVEL > 0
2192		} else if (level >= 0) {
2193			/*
2194			 * The page is reserved but not yet allocated.  In
2195			 * other words, it is still free.  Extend the current
2196			 * run by one page.
2197			 */
2198			run_ext = 1;
2199#endif
2200		} else if ((order = m->order) < VM_NFREEORDER) {
2201			/*
2202			 * The page is enqueued in the physical memory
2203			 * allocator's free page queues.  Moreover, it is the
2204			 * first page in a power-of-two-sized run of
2205			 * contiguous free pages.  Add these pages to the end
2206			 * of the current run, and jump ahead.
2207			 */
2208			run_ext = 1 << order;
2209			m_inc = 1 << order;
2210		} else {
2211			/*
2212			 * Skip the page for one of the following reasons: (1)
2213			 * It is enqueued in the physical memory allocator's
2214			 * free page queues.  However, it is not the first
2215			 * page in a run of contiguous free pages.  (This case
2216			 * rarely occurs because the scan is performed in
2217			 * ascending order.) (2) It is not reserved, and it is
2218			 * transitioning from free to allocated.  (Conversely,
2219			 * the transition from allocated to free for managed
2220			 * pages is blocked by the page lock.) (3) It is
2221			 * allocated but not contained by an object and not
2222			 * wired, e.g., allocated by Xen's balloon driver.
2223			 */
2224			run_ext = 0;
2225		}
2226
2227		/*
2228		 * Extend or reset the current run of pages.
2229		 */
2230		if (run_ext > 0) {
2231			if (run_len == 0)
2232				m_run = m;
2233			run_len += run_ext;
2234		} else {
2235			if (run_len > 0) {
2236				m_run = NULL;
2237				run_len = 0;
2238			}
2239		}
2240	}
2241	if (m_mtx != NULL)
2242		mtx_unlock(m_mtx);
2243	if (run_len >= npages)
2244		return (m_run);
2245	return (NULL);
2246}
2247
2248/*
2249 *	vm_page_reclaim_run:
2250 *
2251 *	Try to relocate each of the allocated virtual pages within the
2252 *	specified run of physical pages to a new physical address.  Free the
2253 *	physical pages underlying the relocated virtual pages.  A virtual page
2254 *	is relocatable if and only if it could be laundered or reclaimed by
2255 *	the page daemon.  Whenever possible, a virtual page is relocated to a
2256 *	physical address above "high".
2257 *
2258 *	Returns 0 if every physical page within the run was already free or
2259 *	just freed by a successful relocation.  Otherwise, returns a non-zero
2260 *	value indicating why the last attempt to relocate a virtual page was
2261 *	unsuccessful.
2262 *
2263 *	"req_class" must be an allocation class.
2264 */
2265static int
2266vm_page_reclaim_run(int req_class, u_long npages, vm_page_t m_run,
2267    vm_paddr_t high)
2268{
2269	struct mtx *m_mtx;
2270	struct spglist free;
2271	vm_object_t object;
2272	vm_paddr_t pa;
2273	vm_page_t m, m_end, m_new;
2274	int error, order, req;
2275
2276	KASSERT((req_class & VM_ALLOC_CLASS_MASK) == req_class,
2277	    ("req_class is not an allocation class"));
2278	SLIST_INIT(&free);
2279	error = 0;
2280	m = m_run;
2281	m_end = m_run + npages;
2282	m_mtx = NULL;
2283	for (; error == 0 && m < m_end; m++) {
2284		KASSERT((m->flags & (PG_FICTITIOUS | PG_MARKER)) == 0,
2285		    ("page %p is PG_FICTITIOUS or PG_MARKER", m));
2286
2287		/*
2288		 * Avoid releasing and reacquiring the same page lock.
2289		 */
2290		vm_page_change_lock(m, &m_mtx);
2291retry:
2292		if (m->wire_count != 0 || m->hold_count != 0)
2293			error = EBUSY;
2294		else if ((object = m->object) != NULL) {
2295			/*
2296			 * The page is relocated if and only if it could be
2297			 * laundered or reclaimed by the page daemon.
2298			 */
2299			if (!VM_OBJECT_TRYWLOCK(object)) {
2300				mtx_unlock(m_mtx);
2301				VM_OBJECT_WLOCK(object);
2302				mtx_lock(m_mtx);
2303				if (m->object != object) {
2304					/*
2305					 * The page may have been freed.
2306					 */
2307					VM_OBJECT_WUNLOCK(object);
2308					goto retry;
2309				} else if (m->wire_count != 0 ||
2310				    m->hold_count != 0) {
2311					error = EBUSY;
2312					goto unlock;
2313				}
2314			}
2315			KASSERT((m->flags & PG_UNHOLDFREE) == 0,
2316			    ("page %p is PG_UNHOLDFREE", m));
2317			/* Don't care: PG_NODUMP, PG_ZERO. */
2318			if (object->type != OBJT_DEFAULT &&
2319			    object->type != OBJT_SWAP &&
2320			    object->type != OBJT_VNODE)
2321				error = EINVAL;
2322			else if (object->memattr != VM_MEMATTR_DEFAULT)
2323				error = EINVAL;
2324			else if (m->queue != PQ_NONE && !vm_page_busied(m)) {
2325				KASSERT(pmap_page_get_memattr(m) ==
2326				    VM_MEMATTR_DEFAULT,
2327				    ("page %p has an unexpected memattr", m));
2328				KASSERT((m->oflags & (VPO_SWAPINPROG |
2329				    VPO_SWAPSLEEP | VPO_UNMANAGED)) == 0,
2330				    ("page %p has unexpected oflags", m));
2331				/* Don't care: VPO_NOSYNC. */
2332				if (m->valid != 0) {
2333					/*
2334					 * First, try to allocate a new page
2335					 * that is above "high".  Failing
2336					 * that, try to allocate a new page
2337					 * that is below "m_run".  Allocate
2338					 * the new page between the end of
2339					 * "m_run" and "high" only as a last
2340					 * resort.
2341					 */
2342					req = req_class | VM_ALLOC_NOOBJ;
2343					if ((m->flags & PG_NODUMP) != 0)
2344						req |= VM_ALLOC_NODUMP;
2345					if (trunc_page(high) !=
2346					    ~(vm_paddr_t)PAGE_MASK) {
2347						m_new = vm_page_alloc_contig(
2348						    NULL, 0, req, 1,
2349						    round_page(high),
2350						    ~(vm_paddr_t)0,
2351						    PAGE_SIZE, 0,
2352						    VM_MEMATTR_DEFAULT);
2353					} else
2354						m_new = NULL;
2355					if (m_new == NULL) {
2356						pa = VM_PAGE_TO_PHYS(m_run);
2357						m_new = vm_page_alloc_contig(
2358						    NULL, 0, req, 1,
2359						    0, pa - 1, PAGE_SIZE, 0,
2360						    VM_MEMATTR_DEFAULT);
2361					}
2362					if (m_new == NULL) {
2363						pa += ptoa(npages);
2364						m_new = vm_page_alloc_contig(
2365						    NULL, 0, req, 1,
2366						    pa, high, PAGE_SIZE, 0,
2367						    VM_MEMATTR_DEFAULT);
2368					}
2369					if (m_new == NULL) {
2370						error = ENOMEM;
2371						goto unlock;
2372					}
2373					KASSERT(m_new->wire_count == 0,
2374					    ("page %p is wired", m_new));
2375
2376					/*
2377					 * Replace "m" with the new page.  For
2378					 * vm_page_replace(), "m" must be busy
2379					 * and dequeued.  Finally, change "m"
2380					 * as if vm_page_free() was called.
2381					 */
2382					if (object->ref_count != 0)
2383						pmap_remove_all(m);
2384					m_new->aflags = m->aflags;
2385					KASSERT(m_new->oflags == VPO_UNMANAGED,
2386					    ("page %p is managed", m_new));
2387					m_new->oflags = m->oflags & VPO_NOSYNC;
2388					pmap_copy_page(m, m_new);
2389					m_new->valid = m->valid;
2390					m_new->dirty = m->dirty;
2391					m->flags &= ~PG_ZERO;
2392					vm_page_xbusy(m);
2393					vm_page_remque(m);
2394					vm_page_replace_checked(m_new, object,
2395					    m->pindex, m);
2396					m->valid = 0;
2397					vm_page_undirty(m);
2398
2399					/*
2400					 * The new page must be deactivated
2401					 * before the object is unlocked.
2402					 */
2403					vm_page_change_lock(m_new, &m_mtx);
2404					vm_page_deactivate(m_new);
2405				} else {
2406					m->flags &= ~PG_ZERO;
2407					vm_page_remque(m);
2408					vm_page_remove(m);
2409					KASSERT(m->dirty == 0,
2410					    ("page %p is dirty", m));
2411				}
2412				SLIST_INSERT_HEAD(&free, m, plinks.s.ss);
2413			} else
2414				error = EBUSY;
2415unlock:
2416			VM_OBJECT_WUNLOCK(object);
2417		} else {
2418			mtx_lock(&vm_page_queue_free_mtx);
2419			order = m->order;
2420			if (order < VM_NFREEORDER) {
2421				/*
2422				 * The page is enqueued in the physical memory
2423				 * allocator's free page queues.  Moreover, it
2424				 * is the first page in a power-of-two-sized
2425				 * run of contiguous free pages.  Jump ahead
2426				 * to the last page within that run, and
2427				 * continue from there.
2428				 */
2429				m += (1 << order) - 1;
2430			}
2431#if VM_NRESERVLEVEL > 0
2432			else if (vm_reserv_is_page_free(m))
2433				order = 0;
2434#endif
2435			mtx_unlock(&vm_page_queue_free_mtx);
2436			if (order == VM_NFREEORDER)
2437				error = EINVAL;
2438		}
2439	}
2440	if (m_mtx != NULL)
2441		mtx_unlock(m_mtx);
2442	if ((m = SLIST_FIRST(&free)) != NULL) {
2443		mtx_lock(&vm_page_queue_free_mtx);
2444		do {
2445			SLIST_REMOVE_HEAD(&free, plinks.s.ss);
2446			vm_page_free_phys(m);
2447		} while ((m = SLIST_FIRST(&free)) != NULL);
2448		vm_page_zero_idle_wakeup();
2449		vm_page_free_wakeup();
2450		mtx_unlock(&vm_page_queue_free_mtx);
2451	}
2452	return (error);
2453}
2454
2455#define	NRUNS	16
2456
2457CTASSERT(powerof2(NRUNS));
2458
2459#define	RUN_INDEX(count)	((count) & (NRUNS - 1))
2460
2461#define	MIN_RECLAIM	8
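
/*
 * vm_page_reclaim_contig() treats "m_runs[]" as a ring buffer: because NRUNS
 * is a power of two, RUN_INDEX() reduces the monotonically increasing run
 * count to an array index with a simple mask, so runs found later in a scan
 * (which lie at higher physical addresses) overwrite the oldest entries and
 * only the NRUNS highest runs are retained.
 */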
2462
2463/*
2464 *	vm_page_reclaim_contig:
2465 *
2466 *	Reclaim allocated, contiguous physical memory satisfying the specified
2467 *	conditions by relocating the virtual pages using that physical memory.
2468 *	Returns true if reclamation is successful and false otherwise.  Since
2469 *	relocation requires the allocation of physical pages, reclamation may
2470 *	fail due to a shortage of free pages.  When reclamation fails, callers
2471 *	are expected to perform VM_WAIT before retrying a failed allocation
2472 *	operation, e.g., vm_page_alloc_contig().
2473 *
2474 *	The caller must always specify an allocation class through "req".
2475 *
2476 *	allocation classes:
2477 *	VM_ALLOC_NORMAL		normal process request
2478 *	VM_ALLOC_SYSTEM		system *really* needs a page
2479 *	VM_ALLOC_INTERRUPT	interrupt time request
2480 *
2481 *	The optional allocation flags are ignored.
2482 *
2483 *	"npages" must be greater than zero.  Both "alignment" and "boundary"
2484 *	must be a power of two.
2485 */
2486bool
2487vm_page_reclaim_contig(int req, u_long npages, vm_paddr_t low, vm_paddr_t high,
2488    u_long alignment, vm_paddr_t boundary)
2489{
2490	vm_paddr_t curr_low;
2491	vm_page_t m_run, m_runs[NRUNS];
2492	u_long count, reclaimed;
2493	int error, i, options, req_class;
2494
2495	KASSERT(npages > 0, ("npages is 0"));
2496	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
2497	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
2498	req_class = req & VM_ALLOC_CLASS_MASK;
2499
2500	/*
2501	 * The page daemon is allowed to dig deeper into the free page list.
2502	 */
2503	if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT)
2504		req_class = VM_ALLOC_SYSTEM;
2505
2506	/*
2507	 * Return if the number of free pages cannot satisfy the requested
2508	 * allocation.
2509	 */
2510	count = vm_cnt.v_free_count;
2511	if (count < npages + vm_cnt.v_free_reserved || (count < npages +
2512	    vm_cnt.v_interrupt_free_min && req_class == VM_ALLOC_SYSTEM) ||
2513	    (count < npages && req_class == VM_ALLOC_INTERRUPT))
2514		return (false);
2515
2516	/*
2517	 * Scan up to three times, relaxing the restrictions ("options") on
2518	 * the reclamation of reservations and superpages each time.
2519	 */
2520	for (options = VPSC_NORESERV;;) {
2521		/*
2522		 * Find the highest runs that satisfy the given constraints
2523		 * and restrictions, and record them in "m_runs".
2524		 */
2525		curr_low = low;
2526		count = 0;
2527		for (;;) {
2528			m_run = vm_phys_scan_contig(npages, curr_low, high,
2529			    alignment, boundary, options);
2530			if (m_run == NULL)
2531				break;
2532			curr_low = VM_PAGE_TO_PHYS(m_run) + ptoa(npages);
2533			m_runs[RUN_INDEX(count)] = m_run;
2534			count++;
2535		}
2536
2537		/*
2538		 * Reclaim the highest runs in LIFO (descending) order until
2539		 * the number of reclaimed pages, "reclaimed", is at least
2540		 * MIN_RECLAIM.  Reset "reclaimed" each time because each
2541		 * reclamation is idempotent, and runs will (likely) recur
2542		 * from one scan to the next as restrictions are relaxed.
2543		 */
2544		reclaimed = 0;
2545		for (i = 0; count > 0 && i < NRUNS; i++) {
2546			count--;
2547			m_run = m_runs[RUN_INDEX(count)];
2548			error = vm_page_reclaim_run(req_class, npages, m_run,
2549			    high);
2550			if (error == 0) {
2551				reclaimed += npages;
2552				if (reclaimed >= MIN_RECLAIM)
2553					return (true);
2554			}
2555		}
2556
2557		/*
2558		 * Either relax the restrictions on the next scan or return if
2559		 * the last scan had no restrictions.
2560		 */
2561		if (options == VPSC_NORESERV)
2562			options = VPSC_NOSUPER;
2563		else if (options == VPSC_NOSUPER)
2564			options = VPSC_ANY;
2565		else if (options == VPSC_ANY)
2566			return (reclaimed != 0);
2567	}
2568}
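
/*
 * Illustrative sketch (hypothetical caller, not part of this file): a typical
 * contiguous allocation alternates vm_page_alloc_contig() with reclamation,
 * falling back to VM_WAIT when no suitable run could be reclaimed:
 *
 *	for (;;) {
 *		m = vm_page_alloc_contig(NULL, 0, VM_ALLOC_NORMAL |
 *		    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED, npages, low, high,
 *		    alignment, boundary, VM_MEMATTR_DEFAULT);
 *		if (m != NULL)
 *			break;
 *		if (!vm_page_reclaim_contig(VM_ALLOC_NORMAL, npages, low,
 *		    high, alignment, boundary))
 *			VM_WAIT;
 *	}
 */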
2569
2570/*
2571 *	vm_wait:	(also see VM_WAIT macro)
2572 *
2573 *	Sleep until free pages are available for allocation.
2574 *	- Called in various places before memory allocations.
2575 */
2576static void
2577_vm_wait(void)
2578{
2579
2580	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
2581	if (curproc == pageproc) {
2582		vm_pageout_pages_needed = 1;
2583		msleep(&vm_pageout_pages_needed, &vm_page_queue_free_mtx,
2584		    PDROP | PSWP, "VMWait", 0);
2585	} else {
2586		if (pageproc == NULL)
2587			panic("vm_wait in early boot");
2588		pagedaemon_wait(PVM, "vmwait");
2589	}
2590}
2591
2592void
2593vm_wait(void)
2594{
2595
2596	mtx_lock(&vm_page_queue_free_mtx);
2597	_vm_wait();
2598}
2599
2600/*
2601 *	vm_page_alloc_fail:
2602 *
2603 *	Called when a page allocation function fails.  Informs the
2604 *	pagedaemon and performs the requested wait.  Requires the
2605 *	page_queue_free and object lock on entry.  Returns with the
2606 *	object lock held and free lock released.  Returns an error when
2607 *	retry is necessary.
2608 *
2609 */
2610static int
2611vm_page_alloc_fail(vm_object_t object, int req)
2612{
2613
2614	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
2615
2616	atomic_add_int(&vm_pageout_deficit,
2617	    max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1));
2618	if (req & (VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL)) {
2619		if (object != NULL)
2620			VM_OBJECT_WUNLOCK(object);
2621		_vm_wait();
2622		if (object != NULL)
2623			VM_OBJECT_WLOCK(object);
2624		if (req & VM_ALLOC_WAITOK)
2625			return (EAGAIN);
2626	} else {
2627		mtx_unlock(&vm_page_queue_free_mtx);
2628		pagedaemon_wakeup();
2629	}
2630	return (0);
2631}
2632
2633/*
2634 *	vm_waitpfault:	(also see VM_WAITPFAULT macro)
2635 *
2636 *	Sleep until free pages are available for allocation.
2637 *	- Called only in vm_fault so that processes page faulting
2638 *	  can be easily tracked.
2639 *	- Sleeps at a lower priority than vm_wait() so that vm_wait()ing
2640 *	  processes will be able to grab memory first.  Do not change
2641 *	  this balance without careful testing first.
2642 */
2643void
2644vm_waitpfault(void)
2645{
2646
2647	mtx_lock(&vm_page_queue_free_mtx);
2648	pagedaemon_wait(PUSER, "pfault");
2649}
2650
2651struct vm_pagequeue *
2652vm_page_pagequeue(vm_page_t m)
2653{
2654
2655	if (vm_page_in_laundry(m))
2656		return (&vm_dom[0].vmd_pagequeues[m->queue]);
2657	else
2658		return (&vm_phys_domain(m)->vmd_pagequeues[m->queue]);
2659}
2660
2661/*
2662 *	vm_page_dequeue:
2663 *
2664 *	Remove the given page from its current page queue.
2665 *
2666 *	The page must be locked.
2667 */
2668void
2669vm_page_dequeue(vm_page_t m)
2670{
2671	struct vm_pagequeue *pq;
2672
2673	vm_page_assert_locked(m);
2674	KASSERT(m->queue < PQ_COUNT, ("vm_page_dequeue: page %p is not queued",
2675	    m));
2676	pq = vm_page_pagequeue(m);
2677	vm_pagequeue_lock(pq);
2678	m->queue = PQ_NONE;
2679	TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
2680	vm_pagequeue_cnt_dec(pq);
2681	vm_pagequeue_unlock(pq);
2682}
2683
2684/*
2685 *	vm_page_dequeue_locked:
2686 *
2687 *	Remove the given page from its current page queue.
2688 *
2689 *	The page and page queue must be locked.
2690 */
2691void
2692vm_page_dequeue_locked(vm_page_t m)
2693{
2694	struct vm_pagequeue *pq;
2695
2696	vm_page_lock_assert(m, MA_OWNED);
2697	pq = vm_page_pagequeue(m);
2698	vm_pagequeue_assert_locked(pq);
2699	m->queue = PQ_NONE;
2700	TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
2701	vm_pagequeue_cnt_dec(pq);
2702}
2703
2704/*
2705 *	vm_page_enqueue:
2706 *
2707 *	Add the given page to the specified page queue.
2708 *
2709 *	The page must be locked.
2710 */
2711static void
2712vm_page_enqueue(uint8_t queue, vm_page_t m)
2713{
2714	struct vm_pagequeue *pq;
2715
2716	vm_page_lock_assert(m, MA_OWNED);
2717	KASSERT(queue < PQ_COUNT,
2718	    ("vm_page_enqueue: invalid queue %u request for page %p",
2719	    queue, m));
2720	if (queue == PQ_LAUNDRY)
2721		pq = &vm_dom[0].vmd_pagequeues[queue];
2722	else
2723		pq = &vm_phys_domain(m)->vmd_pagequeues[queue];
2724	vm_pagequeue_lock(pq);
2725	m->queue = queue;
2726	TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
2727	vm_pagequeue_cnt_inc(pq);
2728	vm_pagequeue_unlock(pq);
2729}
2730
2731/*
2732 *	vm_page_requeue:
2733 *
2734 *	Move the given page to the tail of its current page queue.
2735 *
2736 *	The page must be locked.
2737 */
2738void
2739vm_page_requeue(vm_page_t m)
2740{
2741	struct vm_pagequeue *pq;
2742
2743	vm_page_lock_assert(m, MA_OWNED);
2744	KASSERT(m->queue != PQ_NONE,
2745	    ("vm_page_requeue: page %p is not queued", m));
2746	pq = vm_page_pagequeue(m);
2747	vm_pagequeue_lock(pq);
2748	TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
2749	TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
2750	vm_pagequeue_unlock(pq);
2751}
2752
2753/*
2754 *	vm_page_requeue_locked:
2755 *
2756 *	Move the given page to the tail of its current page queue.
2757 *
2758 *	The page queue must be locked.
2759 */
2760void
2761vm_page_requeue_locked(vm_page_t m)
2762{
2763	struct vm_pagequeue *pq;
2764
2765	KASSERT(m->queue != PQ_NONE,
2766	    ("vm_page_requeue_locked: page %p is not queued", m));
2767	pq = vm_page_pagequeue(m);
2768	vm_pagequeue_assert_locked(pq);
2769	TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
2770	TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
2771}
2772
2773/*
2774 *	vm_page_activate:
2775 *
2776 *	Put the specified page on the active list (if appropriate).
2777 *	Ensure that act_count is at least ACT_INIT but do not otherwise
2778 *	mess with it.
2779 *
2780 *	The page must be locked.
2781 */
2782void
2783vm_page_activate(vm_page_t m)
2784{
2785	int queue;
2786
2787	vm_page_lock_assert(m, MA_OWNED);
2788	if ((queue = m->queue) != PQ_ACTIVE) {
2789		if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) {
2790			if (m->act_count < ACT_INIT)
2791				m->act_count = ACT_INIT;
2792			if (queue != PQ_NONE)
2793				vm_page_dequeue(m);
2794			vm_page_enqueue(PQ_ACTIVE, m);
2795		} else
2796			KASSERT(queue == PQ_NONE,
2797			    ("vm_page_activate: wired page %p is queued", m));
2798	} else {
2799		if (m->act_count < ACT_INIT)
2800			m->act_count = ACT_INIT;
2801	}
2802}
2803
2804/*
2805 *	vm_page_free_wakeup:
2806 *
2807 *	Helper routine for vm_page_free_toq().  This routine is called
2808 *	when a page is added to the free queues.
2809 *
2810 *	The page queues must be locked.
2811 */
2812static void
2813vm_page_free_wakeup(void)
2814{
2815
2816	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
2817	/*
2818	 * If the pageout daemon needs pages, then tell it that there
2819	 * are some free.
2820	 */
2821	if (vm_pageout_pages_needed &&
2822	    vm_cnt.v_free_count >= vm_cnt.v_pageout_free_min) {
2823		wakeup(&vm_pageout_pages_needed);
2824		vm_pageout_pages_needed = 0;
2825	}
2826	/*
2827	 * Wake up processes that are waiting on memory if we hit a
2828	 * high water mark, and wake up the scheduler process if we have
2829	 * lots of memory; that process will swap in processes.
2830	 */
2831	if (vm_pages_needed && !vm_page_count_min()) {
2832		vm_pages_needed = false;
2833		wakeup(&vm_cnt.v_free_count);
2834	}
2835}
2836
2837/*
2838 *	vm_page_free_prep:
2839 *
2840 *	Prepares the given page to be put on the free list,
2841 *	disassociating it from any VM object. The caller may return
2842 *	the page to the free list only if this function returns true.
2843 *
2844 *	The object must be locked.  The page must be locked if it is
2845 *	managed.  For a queued managed page, the pagequeue_locked
2846 *	argument specifies whether the page queue is already locked.
2847 */
2848bool
2849vm_page_free_prep(vm_page_t m, bool pagequeue_locked)
2850{
2851
2852#if defined(DIAGNOSTIC) && defined(PHYS_TO_DMAP)
2853	if ((m->flags & PG_ZERO) != 0) {
2854		uint64_t *p;
2855		int i;
2856		p = (uint64_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
2857		for (i = 0; i < PAGE_SIZE / sizeof(uint64_t); i++, p++)
2858			KASSERT(*p == 0, ("vm_page_free_prep %p PG_ZERO %d %jx",
2859			    m, i, (uintmax_t)*p));
2860	}
2861#endif
2862	if ((m->oflags & VPO_UNMANAGED) == 0) {
2863		vm_page_lock_assert(m, MA_OWNED);
2864		KASSERT(!pmap_page_is_mapped(m),
2865		    ("vm_page_free_toq: freeing mapped page %p", m));
2866	} else
2867		KASSERT(m->queue == PQ_NONE,
2868		    ("vm_page_free_toq: unmanaged page %p is queued", m));
2869	PCPU_INC(cnt.v_tfree);
2870
2871	if (vm_page_sbusied(m))
2872		panic("vm_page_free: freeing busy page %p", m);
2873
2874	/*
2875	 * Unqueue, then remove page.  Note that we cannot destroy
2876	 * the page here because we do not want to call the pager's
2877	 * callback routine until after we've put the page on the
2878	 * appropriate free queue.
2879	 */
2880	if (m->queue != PQ_NONE) {
2881		if (pagequeue_locked)
2882			vm_page_dequeue_locked(m);
2883		else
2884			vm_page_dequeue(m);
2885	}
2886	vm_page_remove(m);
2887
2888	/*
2889	 * If the page is fictitious, removing the object association above
2890	 * is all that is needed, so return; otherwise, continue the teardown.
2891	 */
2892	if ((m->flags & PG_FICTITIOUS) != 0)
2893		return (false);
2894
2895	m->valid = 0;
2896	vm_page_undirty(m);
2897
2898	if (m->wire_count != 0)
2899		panic("vm_page_free: freeing wired page %p", m);
2900	if (m->hold_count != 0) {
2901		m->flags &= ~PG_ZERO;
2902		KASSERT((m->flags & PG_UNHOLDFREE) == 0,
2903		    ("vm_page_free: freeing PG_UNHOLDFREE page %p", m));
2904		m->flags |= PG_UNHOLDFREE;
2905		return (false);
2906	}
2907
2908	/*
2909	 * Restore the default memory attribute to the page.
2910	 */
2911	if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT)
2912		pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT);
2913
2914	return (true);
2915}
2916
2917/*
2918 * Insert the page into the physical memory allocator's free page
2919 * queues.  This is the last step to free a page.
2920 */
2921static void
2922vm_page_free_phys(vm_page_t m)
2923{
2924
2925	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
2926
2927	vm_phys_freecnt_adj(m, 1);
2928#if VM_NRESERVLEVEL > 0
2929	if (!vm_reserv_free_page(m))
2930#endif
2931		vm_phys_free_pages(m, 0);
2932	if ((m->flags & PG_ZERO) != 0)
2933		++vm_page_zero_count;
2934	else
2935		vm_page_zero_idle_wakeup();
2936}
2937
2938void
2939vm_page_free_phys_pglist(struct pglist *tq)
2940{
2941	vm_page_t m;
2942
2943	if (TAILQ_EMPTY(tq))
2944		return;
2945	mtx_lock(&vm_page_queue_free_mtx);
2946	TAILQ_FOREACH(m, tq, listq)
2947		vm_page_free_phys(m);
2948	vm_page_free_wakeup();
2949	mtx_unlock(&vm_page_queue_free_mtx);
2950}
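
/*
 * Illustrative sketch (hypothetical caller, not part of this file): pages
 * that have individually passed vm_page_free_prep() can be linked onto a
 * pglist through their "listq" entry and then returned to the physical
 * allocator with a single acquisition of the free queue lock:
 *
 *	struct pglist pgl;
 *
 *	TAILQ_INIT(&pgl);
 *	...
 *	if (vm_page_free_prep(m, false))
 *		TAILQ_INSERT_TAIL(&pgl, m, listq);
 *	...
 *	vm_page_free_phys_pglist(&pgl);
 */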
2951
2952/*
2953 *	vm_page_free_toq:
2954 *
2955 *	Returns the given page to the free list, disassociating it
2956 *	from any VM object.
2957 *
2958 *	The object must be locked.  The page must be locked if it is
2959 *	managed.
2960 */
2961void
2962vm_page_free_toq(vm_page_t m)
2963{
2964
2965	if (!vm_page_free_prep(m, false))
2966		return;
2967	mtx_lock(&vm_page_queue_free_mtx);
2968	vm_page_free_phys(m);
2969	vm_page_free_wakeup();
2970	mtx_unlock(&vm_page_queue_free_mtx);
2971}
2972
2973/*
2974 *	vm_page_wire:
2975 *
2976 *	Mark this page as wired down by yet
2977 *	another map, removing it from paging queues
2978 *	as necessary.
2979 *
2980 *	If the page is fictitious, then its wire count must remain one.
2981 *
2982 *	The page must be locked.
2983 */
2984void
2985vm_page_wire(vm_page_t m)
2986{
2987
2988	/*
2989	 * Only bump the wire statistics if the page is not already wired,
2990	 * and only unqueue the page if it is on some queue (if it is unmanaged
2991	 * it is already off the queues).
2992	 */
2993	vm_page_lock_assert(m, MA_OWNED);
2994	if ((m->flags & PG_FICTITIOUS) != 0) {
2995		KASSERT(m->wire_count == 1,
2996		    ("vm_page_wire: fictitious page %p's wire count isn't one",
2997		    m));
2998		return;
2999	}
3000	if (m->wire_count == 0) {
3001		KASSERT((m->oflags & VPO_UNMANAGED) == 0 ||
3002		    m->queue == PQ_NONE,
3003		    ("vm_page_wire: unmanaged page %p is queued", m));
3004		vm_page_remque(m);
3005		atomic_add_int(&vm_cnt.v_wire_count, 1);
3006	}
3007	m->wire_count++;
3008	KASSERT(m->wire_count != 0, ("vm_page_wire: wire_count overflow m=%p", m));
3009}
3010
3011/*
3012 * vm_page_unwire:
3013 *
3014 * Release one wiring of the specified page, potentially allowing it to be
3015 * paged out.  Returns TRUE if the number of wirings transitions to zero and
3016 * FALSE otherwise.
3017 *
3018 * Only managed pages belonging to an object can be paged out.  If the number
3019 * of wirings transitions to zero and the page is eligible for page out, then
3020 * the page is added to the specified paging queue (unless PQ_NONE is
3021 * specified).
3022 *
3023 * If a page is fictitious, then its wire count must always be one.
3024 *
3025 * A managed page must be locked.
3026 */
3027boolean_t
3028vm_page_unwire(vm_page_t m, uint8_t queue)
3029{
3030
3031	KASSERT(queue < PQ_COUNT || queue == PQ_NONE,
3032	    ("vm_page_unwire: invalid queue %u request for page %p",
3033	    queue, m));
3034	if ((m->oflags & VPO_UNMANAGED) == 0)
3035		vm_page_assert_locked(m);
3036	if ((m->flags & PG_FICTITIOUS) != 0) {
3037		KASSERT(m->wire_count == 1,
3038	    ("vm_page_unwire: fictitious page %p's wire count isn't one", m));
3039		return (FALSE);
3040	}
3041	if (m->wire_count > 0) {
3042		m->wire_count--;
3043		if (m->wire_count == 0) {
3044			atomic_subtract_int(&vm_cnt.v_wire_count, 1);
3045			if ((m->oflags & VPO_UNMANAGED) == 0 &&
3046			    m->object != NULL && queue != PQ_NONE)
3047				vm_page_enqueue(queue, m);
3048			return (TRUE);
3049		} else
3050			return (FALSE);
3051	} else
3052		panic("vm_page_unwire: page %p's wire count is zero", m);
3053}
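
/*
 * Illustrative sketch (hypothetical caller, not part of this file): a managed
 * page is wired and later unwired under the page lock; on the final unwiring
 * it is requeued (here onto the active queue) so that the page daemon may
 * consider it again:
 *
 *	vm_page_lock(m);
 *	vm_page_wire(m);
 *	vm_page_unlock(m);
 *	...
 *	vm_page_lock(m);
 *	(void)vm_page_unwire(m, PQ_ACTIVE);
 *	vm_page_unlock(m);
 */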
3054
3055/*
3056 * Move the specified page to the inactive queue.
3057 *
3058 * Normally, "noreuse" is FALSE, resulting in LRU ordering of the inactive
3059 * queue.  However, setting "noreuse" to TRUE will accelerate the specified
3060 * page's reclamation, but it will not unmap the page from any address space.
3061 * This is implemented by inserting the page near the head of the inactive
3062 * queue, using a marker page to guide FIFO insertion ordering.
3063 *
3064 * The page must be locked.
3065 */
3066static inline void
3067_vm_page_deactivate(vm_page_t m, boolean_t noreuse)
3068{
3069	struct vm_pagequeue *pq;
3070	int queue;
3071
3072	vm_page_assert_locked(m);
3073
3074	/*
3075	 * Ignore if the page is already inactive, unless it is unlikely to be
3076	 * reactivated.
3077	 */
3078	if ((queue = m->queue) == PQ_INACTIVE && !noreuse)
3079		return;
3080	if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) {
3081		pq = &vm_phys_domain(m)->vmd_pagequeues[PQ_INACTIVE];
3082		/* Avoid multiple acquisitions of the inactive queue lock. */
3083		if (queue == PQ_INACTIVE) {
3084			vm_pagequeue_lock(pq);
3085			vm_page_dequeue_locked(m);
3086		} else {
3087			if (queue != PQ_NONE)
3088				vm_page_dequeue(m);
3089			vm_pagequeue_lock(pq);
3090		}
3091		m->queue = PQ_INACTIVE;
3092		if (noreuse)
3093			TAILQ_INSERT_BEFORE(&vm_phys_domain(m)->vmd_inacthead,
3094			    m, plinks.q);
3095		else
3096			TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
3097		vm_pagequeue_cnt_inc(pq);
3098		vm_pagequeue_unlock(pq);
3099	}
3100}
3101
3102/*
3103 * Move the specified page to the inactive queue.
3104 *
3105 * The page must be locked.
3106 */
3107void
3108vm_page_deactivate(vm_page_t m)
3109{
3110
3111	_vm_page_deactivate(m, FALSE);
3112}
3113
3114/*
3115 * Move the specified page to the inactive queue with the expectation
3116 * that it is unlikely to be reused.
3117 *
3118 * The page must be locked.
3119 */
3120void
3121vm_page_deactivate_noreuse(vm_page_t m)
3122{
3123
3124	_vm_page_deactivate(m, TRUE);
3125}
3126
3127/*
3128 * vm_page_launder
3129 *
3130 * 	Put a page in the laundry.
3131 */
3132void
3133vm_page_launder(vm_page_t m)
3134{
3135	int queue;
3136
3137	vm_page_assert_locked(m);
3138	if ((queue = m->queue) != PQ_LAUNDRY) {
3139		if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) {
3140			if (queue != PQ_NONE)
3141				vm_page_dequeue(m);
3142			vm_page_enqueue(PQ_LAUNDRY, m);
3143		} else
3144			KASSERT(queue == PQ_NONE,
3145			    ("wired page %p is queued", m));
3146	}
3147}
3148
3149/*
3150 * vm_page_try_to_free()
3151 *
3152 *	Attempt to free the page.  If we cannot free it, we do nothing.
3153 *	true is returned on success, false on failure.
3154 */
3155bool
3156vm_page_try_to_free(vm_page_t m)
3157{
3158
3159	vm_page_assert_locked(m);
3160	if (m->object != NULL)
3161		VM_OBJECT_ASSERT_WLOCKED(m->object);
3162	if (m->dirty != 0 || m->hold_count != 0 || m->wire_count != 0 ||
3163	    (m->oflags & VPO_UNMANAGED) != 0 || vm_page_busied(m))
3164		return (false);
3165	if (m->object != NULL && m->object->ref_count != 0) {
3166		pmap_remove_all(m);
3167		if (m->dirty != 0)
3168			return (false);
3169	}
3170	vm_page_free(m);
3171	return (true);
3172}
3173
3174/*
3175 * vm_page_advise
3176 *
3177 * 	Apply the specified advice to the given page.
3178 *
3179 *	The object and page must be locked.
3180 */
3181void
3182vm_page_advise(vm_page_t m, int advice)
3183{
3184
3185	vm_page_assert_locked(m);
3186	VM_OBJECT_ASSERT_WLOCKED(m->object);
3187	if (advice == MADV_FREE)
3188		/*
3189		 * Mark the page clean.  This will allow the page to be freed
3190		 * without first paging it out.  MADV_FREE pages are often
3191		 * quickly reused by malloc(3), so we do not do anything that
3192		 * would result in a page fault on a later access.
3193		 */
3194		vm_page_undirty(m);
3195	else if (advice != MADV_DONTNEED) {
3196		if (advice == MADV_WILLNEED)
3197			vm_page_activate(m);
3198		return;
3199	}
3200
3201	/*
3202	 * Clear any references to the page.  Otherwise, the page daemon will
3203	 * immediately reactivate the page.
3204	 */
3205	vm_page_aflag_clear(m, PGA_REFERENCED);
3206
3207	if (advice != MADV_FREE && m->dirty == 0 && pmap_is_modified(m))
3208		vm_page_dirty(m);
3209
3210	/*
3211	 * Place clean pages near the head of the inactive queue rather than
3212	 * the tail, thus defeating the queue's LRU operation and ensuring that
3213	 * the page will be reused quickly.  Dirty pages not already in the
3214	 * laundry are moved there.
3215	 */
3216	if (m->dirty == 0)
3217		vm_page_deactivate_noreuse(m);
3218	else
3219		vm_page_launder(m);
3220}
3221
3222/*
3223 * Grab a page, waiting until we are woken up due to the page
3224 * changing state.  We keep on waiting as long as the page remains
3225 * in the object.  If the page doesn't exist, first allocate it
3226 * and then conditionally zero it.
3227 *
3228 * This routine may sleep.
3229 *
3230 * The object must be locked on entry.  The lock will, however, be released
3231 * and reacquired if the routine sleeps.
3232 */
3233vm_page_t
3234vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags)
3235{
3236	vm_page_t m;
3237	int sleep;
3238	int pflags;
3239
3240	VM_OBJECT_ASSERT_WLOCKED(object);
3241	KASSERT((allocflags & VM_ALLOC_SBUSY) == 0 ||
3242	    (allocflags & VM_ALLOC_IGN_SBUSY) != 0,
3243	    ("vm_page_grab: VM_ALLOC_SBUSY/VM_ALLOC_IGN_SBUSY mismatch"));
3244	pflags = allocflags &
3245	    ~(VM_ALLOC_NOWAIT | VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL);
3246	if ((allocflags & VM_ALLOC_NOWAIT) == 0)
3247		pflags |= VM_ALLOC_WAITFAIL;
3248retrylookup:
3249	if ((m = vm_page_lookup(object, pindex)) != NULL) {
3250		sleep = (allocflags & VM_ALLOC_IGN_SBUSY) != 0 ?
3251		    vm_page_xbusied(m) : vm_page_busied(m);
3252		if (sleep) {
3253			if ((allocflags & VM_ALLOC_NOWAIT) != 0)
3254				return (NULL);
3255			/*
3256			 * Reference the page before unlocking and
3257			 * sleeping so that the page daemon is less
3258			 * likely to reclaim it.
3259			 */
3260			vm_page_aflag_set(m, PGA_REFERENCED);
3261			vm_page_lock(m);
3262			VM_OBJECT_WUNLOCK(object);
3263			vm_page_busy_sleep(m, "pgrbwt", (allocflags &
3264			    VM_ALLOC_IGN_SBUSY) != 0);
3265			VM_OBJECT_WLOCK(object);
3266			goto retrylookup;
3267		} else {
3268			if ((allocflags & VM_ALLOC_WIRED) != 0) {
3269				vm_page_lock(m);
3270				vm_page_wire(m);
3271				vm_page_unlock(m);
3272			}
3273			if ((allocflags &
3274			    (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) == 0)
3275				vm_page_xbusy(m);
3276			if ((allocflags & VM_ALLOC_SBUSY) != 0)
3277				vm_page_sbusy(m);
3278			return (m);
3279		}
3280	}
3281	m = vm_page_alloc(object, pindex, pflags);
3282	if (m == NULL) {
3283		if ((allocflags & VM_ALLOC_NOWAIT) != 0)
3284			return (NULL);
3285		goto retrylookup;
3286	}
3287	if (allocflags & VM_ALLOC_ZERO && (m->flags & PG_ZERO) == 0)
3288		pmap_zero_page(m);
3289	return (m);
3290}
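
/*
 * Illustrative sketch (hypothetical caller, not part of this file): with
 * "object" write-locked, grab the page at "pindex" exclusive busied, zero and
 * validate it if it was not already valid, and drop the busy lock when done.
 * This mirrors the VM_ALLOC_ZERO handling in vm_page_grab_pages() below.
 *
 *	m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL | VM_ALLOC_ZERO);
 *	if (m->valid == 0) {
 *		if ((m->flags & PG_ZERO) == 0)
 *			pmap_zero_page(m);
 *		m->valid = VM_PAGE_BITS_ALL;
 *	}
 *	...
 *	vm_page_xunbusy(m);
 */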
3291
3292/*
3293 * Return the specified range of pages from the given object.  For each
3294 * page offset within the range, if a page already exists within the object
3295 * at that offset and it is busy, then wait for it to change state.  If,
3296 * instead, the page doesn't exist, then allocate it.
3297 *
3298 * The caller must always specify an allocation class.
3299 *
3300 * allocation classes:
3301 *	VM_ALLOC_NORMAL		normal process request
3302 *	VM_ALLOC_SYSTEM		system *really* needs the pages
3303 *
3304 * The caller must always specify that the pages are to be busied and/or
3305 * wired.
3306 *
3307 * optional allocation flags:
3308 *	VM_ALLOC_IGN_SBUSY	do not sleep on soft busy pages
3309 *	VM_ALLOC_NOBUSY		do not exclusive busy the page
3310 *	VM_ALLOC_NOWAIT		do not sleep
3311 *	VM_ALLOC_SBUSY		set page to sbusy state
3312 *	VM_ALLOC_WIRED		wire the pages
3313 *	VM_ALLOC_ZERO		zero and validate any invalid pages
3314 *
3315 * If VM_ALLOC_NOWAIT is not specified, this routine may sleep.  Otherwise, it
3316 * may return a partial prefix of the requested range.
3317 */
3318int
3319vm_page_grab_pages(vm_object_t object, vm_pindex_t pindex, int allocflags,
3320    vm_page_t *ma, int count)
3321{
3322	vm_page_t m, mpred;
3323	int pflags;
3324	int i;
3325	bool sleep;
3326
3327	VM_OBJECT_ASSERT_WLOCKED(object);
3328	KASSERT(((u_int)allocflags >> VM_ALLOC_COUNT_SHIFT) == 0,
3329	    ("vm_page_grab_pages: VM_ALLOC_COUNT() is not allowed"));
3330	KASSERT((allocflags & VM_ALLOC_NOBUSY) == 0 ||
3331	    (allocflags & VM_ALLOC_WIRED) != 0,
3332	    ("vm_page_grab_pages: the pages must be busied or wired"));
3333	KASSERT((allocflags & VM_ALLOC_SBUSY) == 0 ||
3334	    (allocflags & VM_ALLOC_IGN_SBUSY) != 0,
3335	    ("vm_page_grab_pages: VM_ALLOC_SBUSY/IGN_SBUSY mismatch"));
3336	if (count == 0)
3337		return (0);
3338	pflags = allocflags & ~(VM_ALLOC_NOWAIT | VM_ALLOC_WAITOK |
3339	    VM_ALLOC_WAITFAIL | VM_ALLOC_IGN_SBUSY);
3340	if ((allocflags & VM_ALLOC_NOWAIT) == 0)
3341		pflags |= VM_ALLOC_WAITFAIL;
3342	i = 0;
3343retrylookup:
3344	m = vm_radix_lookup_le(&object->rtree, pindex + i);
3345	if (m == NULL || m->pindex != pindex + i) {
3346		mpred = m;
3347		m = NULL;
3348	} else
3349		mpred = TAILQ_PREV(m, pglist, listq);
3350	for (; i < count; i++) {
3351		if (m != NULL) {
3352			sleep = (allocflags & VM_ALLOC_IGN_SBUSY) != 0 ?
3353			    vm_page_xbusied(m) : vm_page_busied(m);
3354			if (sleep) {
3355				if ((allocflags & VM_ALLOC_NOWAIT) != 0)
3356					break;
3357				/*
3358				 * Reference the page before unlocking and
3359				 * sleeping so that the page daemon is less
3360				 * likely to reclaim it.
3361				 */
3362				vm_page_aflag_set(m, PGA_REFERENCED);
3363				vm_page_lock(m);
3364				VM_OBJECT_WUNLOCK(object);
3365				vm_page_busy_sleep(m, "grbmaw", (allocflags &
3366				    VM_ALLOC_IGN_SBUSY) != 0);
3367				VM_OBJECT_WLOCK(object);
3368				goto retrylookup;
3369			}
3370			if ((allocflags & VM_ALLOC_WIRED) != 0) {
3371				vm_page_lock(m);
3372				vm_page_wire(m);
3373				vm_page_unlock(m);
3374			}
3375			if ((allocflags & (VM_ALLOC_NOBUSY |
3376			    VM_ALLOC_SBUSY)) == 0)
3377				vm_page_xbusy(m);
3378			if ((allocflags & VM_ALLOC_SBUSY) != 0)
3379				vm_page_sbusy(m);
3380		} else {
3381			m = vm_page_alloc_after(object, pindex + i,
3382			    pflags | VM_ALLOC_COUNT(count - i), mpred);
3383			if (m == NULL) {
3384				if ((allocflags & VM_ALLOC_NOWAIT) != 0)
3385					break;
3386				goto retrylookup;
3387			}
3388		}
3389		if (m->valid == 0 && (allocflags & VM_ALLOC_ZERO) != 0) {
3390			if ((m->flags & PG_ZERO) == 0)
3391				pmap_zero_page(m);
3392			m->valid = VM_PAGE_BITS_ALL;
3393		}
3394		ma[i] = mpred = m;
3395		m = vm_page_next(m);
3396	}
3397	return (i);
3398}
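
/*
 * Illustrative sketch (hypothetical caller, not part of this file): with
 * "object" write-locked, grab up to "count" consecutive, zero-filled, wired
 * pages without sleeping and operate on whatever prefix was returned:
 *
 *	got = vm_page_grab_pages(object, pindex, VM_ALLOC_NORMAL |
 *	    VM_ALLOC_WIRED | VM_ALLOC_ZERO | VM_ALLOC_NOWAIT, ma, count);
 *	for (i = 0; i < got; i++) {
 *		...
 *		vm_page_xunbusy(ma[i]);
 *	}
 */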
3399
3400/*
3401 * Mapping function for valid or dirty bits in a page.
3402 *
3403 * Inputs are required to range within a page.
3404 */
3405vm_page_bits_t
3406vm_page_bits(int base, int size)
3407{
3408	int first_bit;
3409	int last_bit;
3410
3411	KASSERT(
3412	    base + size <= PAGE_SIZE,
3413	    ("vm_page_bits: illegal base/size %d/%d", base, size)
3414	);
3415
3416	if (size == 0)		/* handle degenerate case */
3417		return (0);
3418
3419	first_bit = base >> DEV_BSHIFT;
3420	last_bit = (base + size - 1) >> DEV_BSHIFT;
3421
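	/*
	 * For example, with DEV_BSIZE == 512, vm_page_bits(512, 1024) yields
	 * first_bit == 1 and last_bit == 2, so the expression below computes
	 * (2 << 2) - (1 << 1) == 0x6: the bits for the second and third
	 * DEV_BSIZE blocks of the page.
	 */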
3422	return (((vm_page_bits_t)2 << last_bit) -
3423	    ((vm_page_bits_t)1 << first_bit));
3424}
3425
3426/*
3427 *	vm_page_set_valid_range:
3428 *
3429 *	Sets portions of a page valid.  The arguments are expected
3430 *	to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive
3431 *	of any partial chunks touched by the range.  The invalid portion of
3432 *	such chunks will be zeroed.
3433 *
3434 *	(base + size) must be less than or equal to PAGE_SIZE.
3435 */
3436void
3437vm_page_set_valid_range(vm_page_t m, int base, int size)
3438{
3439	int endoff, frag;
3440
3441	VM_OBJECT_ASSERT_WLOCKED(m->object);
3442	if (size == 0)	/* handle degenerate case */
3443		return;
3444
3445	/*
3446	 * If the base is not DEV_BSIZE aligned and the valid
3447	 * bit is clear, we have to zero out a portion of the
3448	 * first block.
3449	 */
3450	if ((frag = rounddown2(base, DEV_BSIZE)) != base &&
3451	    (m->valid & (1 << (base >> DEV_BSHIFT))) == 0)
3452		pmap_zero_page_area(m, frag, base - frag);
3453
3454	/*
3455	 * If the ending offset is not DEV_BSIZE aligned and the
3456	 * valid bit is clear, we have to zero out a portion of
3457	 * the last block.
3458	 */
3459	endoff = base + size;
3460	if ((frag = rounddown2(endoff, DEV_BSIZE)) != endoff &&
3461	    (m->valid & (1 << (endoff >> DEV_BSHIFT))) == 0)
3462		pmap_zero_page_area(m, endoff,
3463		    DEV_BSIZE - (endoff & (DEV_BSIZE - 1)));
3464
3465	/*
3466	 * Assert that no previously invalid block that is now being validated
3467	 * is already dirty.
3468	 */
3469	KASSERT((~m->valid & vm_page_bits(base, size) & m->dirty) == 0,
3470	    ("vm_page_set_valid_range: page %p is dirty", m));
3471
3472	/*
3473	 * Set valid bits inclusive of any overlap.
3474	 */
3475	m->valid |= vm_page_bits(base, size);
3476}
3477
3478/*
3479 * Clear the given bits from the specified page's dirty field.
3480 */
3481static __inline void
3482vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits)
3483{
3484	uintptr_t addr;
3485#if PAGE_SIZE < 16384
3486	int shift;
3487#endif
3488
3489	/*
3490	 * If the object is locked and the page is neither exclusive busy nor
3491	 * write mapped, then the page's dirty field cannot possibly be
3492	 * set by a concurrent pmap operation.
3493	 */
3494	VM_OBJECT_ASSERT_WLOCKED(m->object);
3495	if (!vm_page_xbusied(m) && !pmap_page_is_write_mapped(m))
3496		m->dirty &= ~pagebits;
3497	else {
3498		/*
3499		 * The pmap layer can call vm_page_dirty() without
3500		 * holding a distinguished lock.  The combination of
3501		 * the object's lock and an atomic operation suffice
3502		 * to guarantee consistency of the page dirty field.
3503		 *
3504		 * For PAGE_SIZE == 32768 case, compiler already
3505		 * properly aligns the dirty field, so no forcible
3506		 * alignment is needed. Only require existence of
3507		 * atomic_clear_64 when page size is 32768.
3508		 */
3509		addr = (uintptr_t)&m->dirty;
3510#if PAGE_SIZE == 32768
3511		atomic_clear_64((uint64_t *)addr, pagebits);
3512#elif PAGE_SIZE == 16384
3513		atomic_clear_32((uint32_t *)addr, pagebits);
3514#else		/* PAGE_SIZE <= 8192 */
3515		/*
3516		 * Use a trick to perform a 32-bit atomic on the
3517		 * containing aligned word, to not depend on the existence
3518		 * of atomic_clear_{8, 16}.
3519		 */
3520		shift = addr & (sizeof(uint32_t) - 1);
3521#if BYTE_ORDER == BIG_ENDIAN
3522		shift = (sizeof(uint32_t) - sizeof(m->dirty) - shift) * NBBY;
3523#else
3524		shift *= NBBY;
3525#endif
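		/*
		 * For example, with PAGE_SIZE == 4096 the dirty field is a
		 * single byte; if it happens to sit at byte offset 2 within
		 * its aligned 32-bit word, then on a little-endian machine
		 * "shift" is 16, which places "pagebits" over that byte in
		 * the word-sized clear below.
		 */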
3526		addr &= ~(sizeof(uint32_t) - 1);
3527		atomic_clear_32((uint32_t *)addr, pagebits << shift);
3528#endif		/* PAGE_SIZE */
3529	}
3530}
3531
3532/*
3533 *	vm_page_set_validclean:
3534 *
3535 *	Sets portions of a page valid and clean.  The arguments are expected
3536 *	to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive
3537 *	of any partial chunks touched by the range.  The invalid portion of
3538 *	such chunks will be zeroed.
3539 *
3540 *	(base + size) must be less than or equal to PAGE_SIZE.
3541 */
3542void
3543vm_page_set_validclean(vm_page_t m, int base, int size)
3544{
3545	vm_page_bits_t oldvalid, pagebits;
3546	int endoff, frag;
3547
3548	VM_OBJECT_ASSERT_WLOCKED(m->object);
3549	if (size == 0)	/* handle degenerate case */
3550		return;
3551
3552	/*
3553	 * If the base is not DEV_BSIZE aligned and the valid
3554	 * bit is clear, we have to zero out a portion of the
3555	 * first block.
3556	 */
3557	if ((frag = rounddown2(base, DEV_BSIZE)) != base &&
3558	    (m->valid & ((vm_page_bits_t)1 << (base >> DEV_BSHIFT))) == 0)
3559		pmap_zero_page_area(m, frag, base - frag);
3560
3561	/*
3562	 * If the ending offset is not DEV_BSIZE aligned and the
3563	 * valid bit is clear, we have to zero out a portion of
3564	 * the last block.
3565	 */
3566	endoff = base + size;
3567	if ((frag = rounddown2(endoff, DEV_BSIZE)) != endoff &&
3568	    (m->valid & ((vm_page_bits_t)1 << (endoff >> DEV_BSHIFT))) == 0)
3569		pmap_zero_page_area(m, endoff,
3570		    DEV_BSIZE - (endoff & (DEV_BSIZE - 1)));
3571
3572	/*
3573	 * Set valid, clear dirty bits.  If validating the entire
3574	 * page we can safely clear the pmap modify bit.  We also
3575	 * use this opportunity to clear the VPO_NOSYNC flag.  If a process
3576	 * takes a write fault on a MAP_NOSYNC memory area the flag will
3577	 * be set again.
3578	 *
3579	 * We set valid bits inclusive of any overlap, but we can only
3580	 * clear dirty bits for DEV_BSIZE chunks that are fully within
3581	 * the range.
3582	 */
3583	oldvalid = m->valid;
3584	pagebits = vm_page_bits(base, size);
3585	m->valid |= pagebits;
3586#if 0	/* NOT YET */
3587	if ((frag = base & (DEV_BSIZE - 1)) != 0) {
3588		frag = DEV_BSIZE - frag;
3589		base += frag;
3590		size -= frag;
3591		if (size < 0)
3592			size = 0;
3593	}
3594	pagebits = vm_page_bits(base, size & (DEV_BSIZE - 1));
3595#endif
3596	if (base == 0 && size == PAGE_SIZE) {
3597		/*
3598		 * The page can only be modified within the pmap if it is
3599		 * mapped, and it can only be mapped if it was previously
3600		 * fully valid.
3601		 */
3602		if (oldvalid == VM_PAGE_BITS_ALL)
3603			/*
3604			 * Perform the pmap_clear_modify() first.  Otherwise,
3605			 * a concurrent pmap operation, such as
3606			 * pmap_protect(), could clear a modification in the
3607			 * pmap and set the dirty field on the page before
3608			 * pmap_clear_modify() had begun and after the dirty
3609			 * field was cleared here.
3610			 */
3611			pmap_clear_modify(m);
3612		m->dirty = 0;
3613		m->oflags &= ~VPO_NOSYNC;
3614	} else if (oldvalid != VM_PAGE_BITS_ALL)
3615		m->dirty &= ~pagebits;
3616	else
3617		vm_page_clear_dirty_mask(m, pagebits);
3618}
3619
3620void
3621vm_page_clear_dirty(vm_page_t m, int base, int size)
3622{
3623
3624	vm_page_clear_dirty_mask(m, vm_page_bits(base, size));
3625}
3626
3627/*
3628 *	vm_page_set_invalid:
3629 *
3630 *	Invalidates DEV_BSIZE'd chunks within a page.  Both the
3631 *	valid and dirty bits for the affected areas are cleared.
3632 */
3633void
3634vm_page_set_invalid(vm_page_t m, int base, int size)
3635{
3636	vm_page_bits_t bits;
3637	vm_object_t object;
3638
3639	object = m->object;
3640	VM_OBJECT_ASSERT_WLOCKED(object);
3641	if (object->type == OBJT_VNODE && base == 0 && IDX_TO_OFF(m->pindex) +
3642	    size >= object->un_pager.vnp.vnp_size)
3643		bits = VM_PAGE_BITS_ALL;
3644	else
3645		bits = vm_page_bits(base, size);
3646	if (object->ref_count != 0 && m->valid == VM_PAGE_BITS_ALL &&
3647	    bits != 0)
3648		pmap_remove_all(m);
3649	KASSERT((bits == 0 && m->valid == VM_PAGE_BITS_ALL) ||
3650	    !pmap_page_is_mapped(m),
3651	    ("vm_page_set_invalid: page %p is mapped", m));
3652	m->valid &= ~bits;
3653	m->dirty &= ~bits;
3654}
3655
3656/*
3657 * vm_page_zero_invalid()
3658 *
3659 *	The kernel assumes that the invalid portions of a page contain
3660 *	garbage, but such pages can be mapped into memory by user code.
3661 *	When this occurs, we must zero out the non-valid portions of the
3662 *	page so user code sees what it expects.
3663 *
3664 *	Pages are most often semi-valid when the end of a file is mapped
3665 *	into memory and the file's size is not page aligned.
3666 */
3667void
3668vm_page_zero_invalid(vm_page_t m, boolean_t setvalid)
3669{
3670	int b;
3671	int i;
3672
3673	VM_OBJECT_ASSERT_WLOCKED(m->object);
3674	/*
3675	 * Scan the valid bits looking for invalid sections that
3676	 * must be zeroed.  Invalid sub-DEV_BSIZE'd areas ( where the
3677	 * valid bit may be set ) have already been zeroed by
3678	 * vm_page_set_validclean().
3679	 */
3680	for (b = i = 0; i <= PAGE_SIZE / DEV_BSIZE; ++i) {
3681		if (i == (PAGE_SIZE / DEV_BSIZE) ||
3682		    (m->valid & ((vm_page_bits_t)1 << i))) {
3683			if (i > b) {
3684				pmap_zero_page_area(m,
3685				    b << DEV_BSHIFT, (i - b) << DEV_BSHIFT);
3686			}
3687			b = i + 1;
3688		}
3689	}
3690
3691	/*
3692	 * setvalid is TRUE when we can safely set the zeroed areas
3693	 * as being valid.  We can do this if there are no cache consistency
3694	 * issues.  E.g., it is ok to do with UFS, but not ok to do with NFS.
3695	 */
3696	if (setvalid)
3697		m->valid = VM_PAGE_BITS_ALL;
3698}
3699
3700/*
3701 *	vm_page_is_valid:
3702 *
3703 *	Is (partial) page valid?  Note that the case where size == 0
3704 *	will return FALSE in the degenerate case where the page is
3705 *	entirely invalid, and TRUE otherwise.
3706 */
3707int
3708vm_page_is_valid(vm_page_t m, int base, int size)
3709{
3710	vm_page_bits_t bits;
3711
3712	VM_OBJECT_ASSERT_LOCKED(m->object);
3713	bits = vm_page_bits(base, size);
3714	return (m->valid != 0 && (m->valid & bits) == bits);
3715}
3716
3717/*
3718 * Returns true if all of the specified predicates are true for the entire
3719 * (super)page and false otherwise.
3720 */
3721bool
3722vm_page_ps_test(vm_page_t m, int flags, vm_page_t skip_m)
3723{
3724	vm_object_t object;
3725	int i, npages;
3726
3727	object = m->object;
3728	VM_OBJECT_ASSERT_LOCKED(object);
3729	npages = atop(pagesizes[m->psind]);
3730
3731	/*
3732	 * The physically contiguous pages that make up a superpage, i.e., a
3733	 * page with a page size index ("psind") greater than zero, will
3734	 * occupy adjacent entries in vm_page_array[].
3735	 */
3736	for (i = 0; i < npages; i++) {
3737		/* Always test object consistency, including "skip_m". */
3738		if (m[i].object != object)
3739			return (false);
3740		if (&m[i] == skip_m)
3741			continue;
3742		if ((flags & PS_NONE_BUSY) != 0 && vm_page_busied(&m[i]))
3743			return (false);
3744		if ((flags & PS_ALL_DIRTY) != 0) {
3745			/*
3746			 * Calling vm_page_test_dirty() or pmap_is_modified()
3747			 * might stop this case from spuriously returning
3748			 * "false".  However, that would require a write lock
3749			 * on the object containing "m[i]".
3750			 */
3751			if (m[i].dirty != VM_PAGE_BITS_ALL)
3752				return (false);
3753		}
3754		if ((flags & PS_ALL_VALID) != 0 &&
3755		    m[i].valid != VM_PAGE_BITS_ALL)
3756			return (false);
3757	}
3758	return (true);
3759}
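
/*
 * An illustrative sketch of a hypothetical promotion check (the helper
 * name and the policy are invented): only map the superpage with a single
 * large mapping if every constituent page is fully valid and unbusied.
 * "m" is assumed to be the first page of the superpage run.
 */
#if 0
static bool
example_can_map_superpage(vm_page_t m)
{

	VM_OBJECT_ASSERT_LOCKED(m->object);
	return (m->psind > 0 &&
	    vm_page_ps_test(m, PS_ALL_VALID | PS_NONE_BUSY, NULL));
}
#endif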
3760
3761/*
3762 * Set the page's dirty bits if the page is modified.
3763 */
3764void
3765vm_page_test_dirty(vm_page_t m)
3766{
3767
3768	VM_OBJECT_ASSERT_WLOCKED(m->object);
3769	if (m->dirty != VM_PAGE_BITS_ALL && pmap_is_modified(m))
3770		vm_page_dirty(m);
3771}
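
/*
 * An illustrative sketch of a hypothetical pageout-style caller (the
 * helper name and the policy are invented): fold any pmap-level modified
 * bit into m->dirty, then send modified pages to the laundry rather than
 * freeing them.  The object write lock is required by vm_page_test_dirty()
 * and the page lock by vm_page_launder().
 */
#if 0
static void
example_launder_if_modified(vm_page_t m)
{

	VM_OBJECT_ASSERT_WLOCKED(m->object);
	vm_page_test_dirty(m);
	if (m->dirty != 0) {
		vm_page_lock(m);
		vm_page_launder(m);
		vm_page_unlock(m);
	}
}
#endif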
3772
3773void
3774vm_page_lock_KBI(vm_page_t m, const char *file, int line)
3775{
3776
3777	mtx_lock_flags_(vm_page_lockptr(m), 0, file, line);
3778}
3779
3780void
3781vm_page_unlock_KBI(vm_page_t m, const char *file, int line)
3782{
3783
3784	mtx_unlock_flags_(vm_page_lockptr(m), 0, file, line);
3785}
3786
3787int
3788vm_page_trylock_KBI(vm_page_t m, const char *file, int line)
3789{
3790
3791	return (mtx_trylock_flags_(vm_page_lockptr(m), 0, file, line));
3792}
3793
3794#if defined(INVARIANTS) || defined(INVARIANT_SUPPORT)
3795void
3796vm_page_assert_locked_KBI(vm_page_t m, const char *file, int line)
3797{
3798
3799	vm_page_lock_assert_KBI(m, MA_OWNED, file, line);
3800}
3801
3802void
3803vm_page_lock_assert_KBI(vm_page_t m, int a, const char *file, int line)
3804{
3805
3806	mtx_assert_(vm_page_lockptr(m), a, file, line);
3807}
3808#endif
3809
3810#ifdef INVARIANTS
3811void
3812vm_page_object_lock_assert(vm_page_t m)
3813{
3814
3815	/*
3816	 * Certain of the page's fields may only be modified by the
3817	 * holder of the containing object's lock or the exclusive busy
3818	 * holder.  Unfortunately, the holder of the write busy is
3819	 * not recorded, and thus cannot be checked here.
3820	 */
3821	if (m->object != NULL && !vm_page_xbusied(m))
3822		VM_OBJECT_ASSERT_WLOCKED(m->object);
3823}
3824
3825void
3826vm_page_assert_pga_writeable(vm_page_t m, uint8_t bits)
3827{
3828
3829	if ((bits & PGA_WRITEABLE) == 0)
3830		return;
3831
3832	/*
3833	 * The PGA_WRITEABLE flag can only be set on a managed page
3834	 * while the page is exclusively busied or the object is locked.
3835	 * Currently, this flag is only set by pmap_enter().
3836	 */
3837	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3838	    ("PGA_WRITEABLE on unmanaged page"));
3839	if (!vm_page_xbusied(m))
3840		VM_OBJECT_ASSERT_LOCKED(m->object);
3841}
3842#endif
3843
3844#include "opt_ddb.h"
3845#ifdef DDB
3846#include <sys/kernel.h>
3847
3848#include <ddb/ddb.h>
3849
3850DB_SHOW_COMMAND(page, vm_page_print_page_info)
3851{
3852
3853	db_printf("vm_cnt.v_free_count: %d\n", vm_cnt.v_free_count);
3854	db_printf("vm_cnt.v_inactive_count: %d\n", vm_cnt.v_inactive_count);
3855	db_printf("vm_cnt.v_active_count: %d\n", vm_cnt.v_active_count);
3856	db_printf("vm_cnt.v_laundry_count: %d\n", vm_cnt.v_laundry_count);
3857	db_printf("vm_cnt.v_wire_count: %d\n", vm_cnt.v_wire_count);
3858	db_printf("vm_cnt.v_free_reserved: %d\n", vm_cnt.v_free_reserved);
3859	db_printf("vm_cnt.v_free_min: %d\n", vm_cnt.v_free_min);
3860	db_printf("vm_cnt.v_free_target: %d\n", vm_cnt.v_free_target);
3861	db_printf("vm_cnt.v_inactive_target: %d\n", vm_cnt.v_inactive_target);
3862}
3863
3864DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info)
3865{
3866	int dom;
3867
3868	db_printf("pq_free %d\n", vm_cnt.v_free_count);
3869	for (dom = 0; dom < vm_ndomains; dom++) {
3870		db_printf(
3871	    "dom %d page_cnt %d free %d pq_act %d pq_inact %d pq_laund %d\n",
3872		    dom,
3873		    vm_dom[dom].vmd_page_count,
3874		    vm_dom[dom].vmd_free_count,
3875		    vm_dom[dom].vmd_pagequeues[PQ_ACTIVE].pq_cnt,
3876		    vm_dom[dom].vmd_pagequeues[PQ_INACTIVE].pq_cnt,
3877		    vm_dom[dom].vmd_pagequeues[PQ_LAUNDRY].pq_cnt);
3878	}
3879}
3880
3881DB_SHOW_COMMAND(pginfo, vm_page_print_pginfo)
3882{
3883	vm_page_t m;
3884	boolean_t phys;
3885
3886	if (!have_addr) {
3887		db_printf("show pginfo addr\n");
3888		return;
3889	}
3890
3891	phys = strchr(modif, 'p') != NULL;
3892	if (phys)
3893		m = PHYS_TO_VM_PAGE(addr);
3894	else
3895		m = (vm_page_t)addr;
3896	db_printf(
3897    "page %p obj %p pidx 0x%jx phys 0x%jx q %d hold %d wire %d\n"
3898    "  af 0x%x of 0x%x f 0x%x act %d busy %x valid 0x%x dirty 0x%x\n",
3899	    m, m->object, (uintmax_t)m->pindex, (uintmax_t)m->phys_addr,
3900	    m->queue, m->hold_count, m->wire_count, m->aflags, m->oflags,
3901	    m->flags, m->act_count, m->busy_lock, m->valid, m->dirty);
3902}
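
/*
 * Typical invocations of the commands above from the ddb prompt (the
 * addresses shown are placeholders); the "/p" modifier makes "show pginfo"
 * treat its argument as a physical address, per the strchr(modif, 'p')
 * test above:
 *
 *	db> show page
 *	db> show pageq
 *	db> show pginfo 0xfffff80002345678
 *	db> show pginfo/p 0x12345000
 */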
3903#endif /* DDB */
3904