vm_page.c revision 327404
1/*-
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 * Copyright (c) 1998 Matthew Dillon.  All Rights Reserved.
5 *
6 * This code is derived from software contributed to Berkeley by
7 * The Mach Operating System project at Carnegie-Mellon University.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 *    notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 *    notice, this list of conditions and the following disclaimer in the
16 *    documentation and/or other materials provided with the distribution.
17 * 4. Neither the name of the University nor the names of its contributors
18 *    may be used to endorse or promote products derived from this software
19 *    without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
32 *
33 *	from: @(#)vm_page.c	7.4 (Berkeley) 5/7/91
34 */
35
36/*-
37 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
38 * All rights reserved.
39 *
40 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
41 *
42 * Permission to use, copy, modify and distribute this software and
43 * its documentation is hereby granted, provided that both the copyright
44 * notice and this permission notice appear in all copies of the
45 * software, derivative works or modified versions, and any portions
46 * thereof, and that both notices appear in supporting documentation.
47 *
48 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
49 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
50 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
51 *
52 * Carnegie Mellon requests users of this software to return to
53 *
54 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
55 *  School of Computer Science
56 *  Carnegie Mellon University
57 *  Pittsburgh PA 15213-3890
58 *
59 * any improvements or extensions that they make and grant Carnegie the
60 * rights to redistribute these changes.
61 */
62
63/*
64 *			GENERAL RULES ON VM_PAGE MANIPULATION
65 *
66 *	- A page queue lock is required when adding or removing a page from a
67 *	  page queue regardless of other locks or the busy state of a page.
68 *
69 *		* In general, no thread besides the page daemon can acquire or
70 *		  hold more than one page queue lock at a time.
71 *
72 *		* The page daemon can acquire and hold any pair of page queue
73 *		  locks in any order.
74 *
75 *	- The object lock is required when inserting or removing
76 *	  pages from an object (vm_page_insert() or vm_page_remove()).
77 *
78 */
79
80/*
81 *	Resident memory management module.
82 */
83
84#include <sys/cdefs.h>
85__FBSDID("$FreeBSD: stable/11/sys/vm/vm_page.c 327404 2017-12-31 03:06:29Z mjg $");
86
87#include "opt_vm.h"
88
89#include <sys/param.h>
90#include <sys/systm.h>
91#include <sys/lock.h>
92#include <sys/kernel.h>
93#include <sys/limits.h>
94#include <sys/linker.h>
95#include <sys/malloc.h>
96#include <sys/mman.h>
97#include <sys/msgbuf.h>
98#include <sys/mutex.h>
99#include <sys/proc.h>
100#include <sys/rwlock.h>
101#include <sys/sbuf.h>
102#include <sys/smp.h>
103#include <sys/sysctl.h>
104#include <sys/vmmeter.h>
105#include <sys/vnode.h>
106
107#include <vm/vm.h>
108#include <vm/pmap.h>
109#include <vm/vm_param.h>
110#include <vm/vm_kern.h>
111#include <vm/vm_object.h>
112#include <vm/vm_page.h>
113#include <vm/vm_pageout.h>
114#include <vm/vm_pager.h>
115#include <vm/vm_phys.h>
116#include <vm/vm_radix.h>
117#include <vm/vm_reserv.h>
118#include <vm/vm_extern.h>
119#include <vm/uma.h>
120#include <vm/uma_int.h>
121
122#include <machine/md_var.h>
123
124/*
125 *	Associated with each page of user-allocatable memory is a
126 *	page structure.
127 */
128
129struct vm_domain vm_dom[MAXMEMDOM];
130struct mtx_padalign __exclusive_cache_line vm_page_queue_free_mtx;
131
132struct mtx_padalign __exclusive_cache_line pa_lock[PA_LOCK_COUNT];
133
134vm_page_t vm_page_array;
135long vm_page_array_size;
136long first_page;
137int vm_page_zero_count;
138
139static int boot_pages = UMA_BOOT_PAGES;
140SYSCTL_INT(_vm, OID_AUTO, boot_pages, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
141    &boot_pages, 0,
142    "number of pages allocated for bootstrapping the VM system");
143
144static int pa_tryrelock_restart;
145SYSCTL_INT(_vm, OID_AUTO, tryrelock_restart, CTLFLAG_RD,
146    &pa_tryrelock_restart, 0, "Number of tryrelock restarts");
147
148static TAILQ_HEAD(, vm_page) blacklist_head;
149static int sysctl_vm_page_blacklist(SYSCTL_HANDLER_ARGS);
150SYSCTL_PROC(_vm, OID_AUTO, page_blacklist, CTLTYPE_STRING | CTLFLAG_RD |
151    CTLFLAG_MPSAFE, NULL, 0, sysctl_vm_page_blacklist, "A", "Blacklist pages");
152
153/* Is the page daemon waiting for free pages? */
154static int vm_pageout_pages_needed;
155
156static uma_zone_t fakepg_zone;
157
158static void vm_page_alloc_check(vm_page_t m);
159static void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits);
160static void vm_page_enqueue(uint8_t queue, vm_page_t m);
161static void vm_page_free_phys(vm_page_t m);
162static void vm_page_free_wakeup(void);
163static void vm_page_init_fakepg(void *dummy);
164static int vm_page_insert_after(vm_page_t m, vm_object_t object,
165    vm_pindex_t pindex, vm_page_t mpred);
166static void vm_page_insert_radixdone(vm_page_t m, vm_object_t object,
167    vm_page_t mpred);
168static int vm_page_reclaim_run(int req_class, u_long npages, vm_page_t m_run,
169    vm_paddr_t high);
170
171SYSINIT(vm_page, SI_SUB_VM, SI_ORDER_SECOND, vm_page_init_fakepg, NULL);
172
173static void
174vm_page_init_fakepg(void *dummy)
175{
176
177	fakepg_zone = uma_zcreate("fakepg", sizeof(struct vm_page), NULL, NULL,
178	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE | UMA_ZONE_VM);
179}
180
181/* Make sure that u_long is at least 64 bits when PAGE_SIZE is 32K. */
182#if PAGE_SIZE == 32768
183#ifdef CTASSERT
184CTASSERT(sizeof(u_long) >= 8);
185#endif
186#endif
187
188/*
189 * Try to acquire a physical address lock while a pmap is locked.  If we
190 * fail to trylock we unlock and lock the pmap directly and cache the
191 * locked pa in *locked.  The caller should then restart their loop in case
192 * the virtual to physical mapping has changed.
193 */
194int
195vm_page_pa_tryrelock(pmap_t pmap, vm_paddr_t pa, vm_paddr_t *locked)
196{
197	vm_paddr_t lockpa;
198
199	lockpa = *locked;
200	*locked = pa;
201	if (lockpa) {
202		PA_LOCK_ASSERT(lockpa, MA_OWNED);
203		if (PA_LOCKPTR(pa) == PA_LOCKPTR(lockpa))
204			return (0);
205		PA_UNLOCK(lockpa);
206	}
207	if (PA_TRYLOCK(pa))
208		return (0);
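	/*
	 * The trylock failed.  Drop the pmap lock so that the pa lock can be
	 * acquired without holding the pmap lock, then retake both and return
	 * EAGAIN so the caller restarts, since the virtual-to-physical
	 * mapping may have changed while the pmap was unlocked.
	 */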
209	PMAP_UNLOCK(pmap);
210	atomic_add_int(&pa_tryrelock_restart, 1);
211	PA_LOCK(pa);
212	PMAP_LOCK(pmap);
213	return (EAGAIN);
214}
215
216/*
217 *	vm_set_page_size:
218 *
219 *	Sets the page size, perhaps based upon the memory
220 *	size.  Must be called before any use of page-size
221 *	dependent functions.
222 */
223void
224vm_set_page_size(void)
225{
226	if (vm_cnt.v_page_size == 0)
227		vm_cnt.v_page_size = PAGE_SIZE;
228	if (((vm_cnt.v_page_size - 1) & vm_cnt.v_page_size) != 0)
229		panic("vm_set_page_size: page size not a power of two");
230}
231
232/*
233 *	vm_page_blacklist_next:
234 *
235 *	Find the next entry in the provided string of blacklist
236 *	addresses.  Entries are separated by space, comma, or newline.
237 *	If an invalid integer is encountered then the rest of the
238 *	string is skipped.  Updates the list pointer to the next
239 *	character, or NULL if the string is exhausted or invalid.
240 */
241static vm_paddr_t
242vm_page_blacklist_next(char **list, char *end)
243{
244	vm_paddr_t bad;
245	char *cp, *pos;
246
247	if (list == NULL || *list == NULL)
248		return (0);
249	if (**list == '\0') {
250		*list = NULL;
251		return (0);
252	}
253
254	/*
255	 * If there's no end pointer then the buffer is coming from
256	 * the kenv and we know it's null-terminated.
257	 */
258	if (end == NULL)
259		end = *list + strlen(*list);
260
261	/* Ensure that strtoq() won't walk off the end */
262	if (*end != '\0') {
263		if (*end == '\n' || *end == ' ' || *end  == ',')
264			*end = '\0';
265		else {
266			printf("Blacklist not terminated, skipping\n");
267			*list = NULL;
268			return (0);
269		}
270	}
271
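	/*
	 * Parse the entries one at a time.  A value of zero (for example, an
	 * empty field between consecutive separators) is skipped rather than
	 * treated as a blacklisted address.
	 */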
272	for (pos = *list; *pos != '\0'; pos = cp) {
273		bad = strtoq(pos, &cp, 0);
274		if (*cp == '\0' || *cp == ' ' || *cp == ',' || *cp == '\n') {
275			if (bad == 0) {
276				if (++cp < end)
277					continue;
278				else
279					break;
280			}
281		} else
282			break;
283		if (*cp == '\0' || ++cp >= end)
284			*list = NULL;
285		else
286			*list = cp;
287		return (trunc_page(bad));
288	}
289	printf("Garbage in RAM blacklist, skipping\n");
290	*list = NULL;
291	return (0);
292}
293
294/*
295 *	vm_page_blacklist_check:
296 *
297 *	Iterate through the provided string of blacklist addresses, pulling
298 *	each entry out of the physical allocator free list and putting it
299 *	onto a list for reporting via the vm.page_blacklist sysctl.
300 */
301static void
302vm_page_blacklist_check(char *list, char *end)
303{
304	vm_paddr_t pa;
305	vm_page_t m;
306	char *next;
307	int ret;
308
309	next = list;
310	while (next != NULL) {
311		if ((pa = vm_page_blacklist_next(&next, end)) == 0)
312			continue;
313		m = vm_phys_paddr_to_vm_page(pa);
314		if (m == NULL)
315			continue;
316		mtx_lock(&vm_page_queue_free_mtx);
317		ret = vm_phys_unfree_page(m);
318		mtx_unlock(&vm_page_queue_free_mtx);
319		if (ret == TRUE) {
320			TAILQ_INSERT_TAIL(&blacklist_head, m, listq);
321			if (bootverbose)
322				printf("Skipping page with pa 0x%jx\n",
323				    (uintmax_t)pa);
324		}
325	}
326}
327
328/*
329 *	vm_page_blacklist_load:
330 *
331 *	Search for a special module named "ram_blacklist".  It'll be a
332 *	plain text file provided by the user via the loader directive
333 *	of the same name.
334 */
335static void
336vm_page_blacklist_load(char **list, char **end)
337{
338	void *mod;
339	u_char *ptr;
340	u_int len;
341
342	mod = NULL;
343	ptr = NULL;
344
345	mod = preload_search_by_type("ram_blacklist");
346	if (mod != NULL) {
347		ptr = preload_fetch_addr(mod);
348		len = preload_fetch_size(mod);
349	}
350	*list = ptr;
351	if (ptr != NULL)
352		*end = ptr + len;
353	else
354		*end = NULL;
355	return;
356}
357
358static int
359sysctl_vm_page_blacklist(SYSCTL_HANDLER_ARGS)
360{
361	vm_page_t m;
362	struct sbuf sbuf;
363	int error, first;
364
365	first = 1;
366	error = sysctl_wire_old_buffer(req, 0);
367	if (error != 0)
368		return (error);
369	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
370	TAILQ_FOREACH(m, &blacklist_head, listq) {
371		sbuf_printf(&sbuf, "%s%#jx", first ? "" : ",",
372		    (uintmax_t)m->phys_addr);
373		first = 0;
374	}
375	error = sbuf_finish(&sbuf);
376	sbuf_delete(&sbuf);
377	return (error);
378}
379
380static void
381vm_page_domain_init(struct vm_domain *vmd)
382{
383	struct vm_pagequeue *pq;
384	int i;
385
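	/*
	 * The pq_name and pq_vcnt members are const-qualified in
	 * struct vm_pagequeue, so __DECONST() is used below for their
	 * one-time initialization.
	 */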
386	*__DECONST(char **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_name) =
387	    "vm inactive pagequeue";
388	*__DECONST(u_int **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_vcnt) =
389	    &vm_cnt.v_inactive_count;
390	*__DECONST(char **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_name) =
391	    "vm active pagequeue";
392	*__DECONST(u_int **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_vcnt) =
393	    &vm_cnt.v_active_count;
394	*__DECONST(char **, &vmd->vmd_pagequeues[PQ_LAUNDRY].pq_name) =
395	    "vm laundry pagequeue";
396	*__DECONST(int **, &vmd->vmd_pagequeues[PQ_LAUNDRY].pq_vcnt) =
397	    &vm_cnt.v_laundry_count;
398	vmd->vmd_page_count = 0;
399	vmd->vmd_free_count = 0;
400	vmd->vmd_segs = 0;
401	vmd->vmd_oom = FALSE;
402	for (i = 0; i < PQ_COUNT; i++) {
403		pq = &vmd->vmd_pagequeues[i];
404		TAILQ_INIT(&pq->pq_pl);
405		mtx_init(&pq->pq_mutex, pq->pq_name, "vm pagequeue",
406		    MTX_DEF | MTX_DUPOK);
407	}
408}
409
410/*
411 * Initialize a physical page in preparation for adding it to the free
412 * lists.
413 */
414static void
415vm_page_init_page(vm_page_t m, vm_paddr_t pa, int segind)
416{
417
418	m->object = NULL;
419	m->wire_count = 0;
420	m->busy_lock = VPB_UNBUSIED;
421	m->hold_count = 0;
422	m->flags = 0;
423	m->phys_addr = pa;
424	m->queue = PQ_NONE;
425	m->psind = 0;
426	m->segind = segind;
427	m->order = VM_NFREEORDER;
428	m->pool = VM_FREEPOOL_DEFAULT;
429	m->valid = m->dirty = 0;
430	pmap_page_init(m);
431}
432
433/*
434 *	vm_page_startup:
435 *
436 *	Initializes the resident memory module.  Allocates physical memory for
437 *	bootstrapping UMA and some data structures that are used to manage
438 *	physical pages.  Initializes these structures, and populates the free
439 *	page queues.
440 */
441vm_offset_t
442vm_page_startup(vm_offset_t vaddr)
443{
444	struct vm_domain *vmd;
445	struct vm_phys_seg *seg;
446	vm_page_t m;
447	char *list, *listend;
448	vm_offset_t mapped;
449	vm_paddr_t end, high_avail, low_avail, new_end, page_range, size;
450	vm_paddr_t biggestsize, last_pa, pa;
451	u_long pagecount;
452	int biggestone, i, pages_per_zone, segind;
453
454	biggestsize = 0;
455	biggestone = 0;
456	vaddr = round_page(vaddr);
457
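	/* Trim each phys_avail[] range inward to page boundaries. */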
458	for (i = 0; phys_avail[i + 1]; i += 2) {
459		phys_avail[i] = round_page(phys_avail[i]);
460		phys_avail[i + 1] = trunc_page(phys_avail[i + 1]);
461	}
462	for (i = 0; phys_avail[i + 1]; i += 2) {
463		size = phys_avail[i + 1] - phys_avail[i];
464		if (size > biggestsize) {
465			biggestone = i;
466			biggestsize = size;
467		}
468	}
469
470	end = phys_avail[biggestone+1];
471
472	/*
473	 * Initialize the page and queue locks.
474	 */
475	mtx_init(&vm_page_queue_free_mtx, "vm page free queue", NULL, MTX_DEF);
476	for (i = 0; i < PA_LOCK_COUNT; i++)
477		mtx_init(&pa_lock[i], "vm page", NULL, MTX_DEF);
478	for (i = 0; i < vm_ndomains; i++)
479		vm_page_domain_init(&vm_dom[i]);
480
481	/*
482	 * Almost all of the pages needed for bootstrapping UMA are used
483	 * for zone structures, so if the number of CPUs results in those
484	 * structures taking more than one page each, we set aside more pages
485	 * in proportion to the zone structure size.
486	 */
487	pages_per_zone = howmany(sizeof(struct uma_zone) +
488	    sizeof(struct uma_cache) * (mp_maxid + 1) +
489	    roundup2(sizeof(struct uma_slab), sizeof(void *)), UMA_SLAB_SIZE);
490	if (pages_per_zone > 1) {
491		/* Reserve more pages so that we don't run out. */
492		boot_pages = UMA_BOOT_PAGES_ZONES * pages_per_zone;
493	}
494
495	/*
496	 * Allocate memory for use when boot strapping the kernel memory
497	 * allocator.
498	 *
499 * CTLFLAG_RDTUN doesn't work during the early boot process, so we must
500	 * manually fetch the value.
501	 */
502	TUNABLE_INT_FETCH("vm.boot_pages", &boot_pages);
503	new_end = end - (boot_pages * UMA_SLAB_SIZE);
504	new_end = trunc_page(new_end);
505	mapped = pmap_map(&vaddr, new_end, end,
506	    VM_PROT_READ | VM_PROT_WRITE);
507	bzero((void *)mapped, end - new_end);
508	uma_startup((void *)mapped, boot_pages);
509
510#if defined(__aarch64__) || defined(__amd64__) || defined(__arm__) || \
511    defined(__i386__) || defined(__mips__)
512	/*
513	 * Allocate a bitmap to indicate that a random physical page
514	 * needs to be included in a minidump.
515	 *
516	 * The amd64 port needs this to indicate which direct map pages
517	 * need to be dumped, via calls to dump_add_page()/dump_drop_page().
518	 *
519	 * However, i386 still needs this workspace internally within the
520	 * minidump code.  In theory, they are not needed on i386, but are
521	 * included should the sf_buf code decide to use them.
522	 */
523	last_pa = 0;
524	for (i = 0; dump_avail[i + 1] != 0; i += 2)
525		if (dump_avail[i + 1] > last_pa)
526			last_pa = dump_avail[i + 1];
527	page_range = last_pa / PAGE_SIZE;
528	vm_page_dump_size = round_page(roundup2(page_range, NBBY) / NBBY);
529	new_end -= vm_page_dump_size;
530	vm_page_dump = (void *)(uintptr_t)pmap_map(&vaddr, new_end,
531	    new_end + vm_page_dump_size, VM_PROT_READ | VM_PROT_WRITE);
532	bzero((void *)vm_page_dump, vm_page_dump_size);
533#else
534	(void)last_pa;
535#endif
536#if defined(__aarch64__) || defined(__amd64__) || defined(__mips__)
537	/*
538	 * Include the UMA bootstrap pages and vm_page_dump in a crash dump.
539	 * When pmap_map() uses the direct map, they are not automatically
540	 * included.
541	 */
542	for (pa = new_end; pa < end; pa += PAGE_SIZE)
543		dump_add_page(pa);
544#endif
545	phys_avail[biggestone + 1] = new_end;
546#ifdef __amd64__
547	/*
548	 * Request that the physical pages underlying the message buffer be
549	 * included in a crash dump.  Since the message buffer is accessed
550	 * through the direct map, they are not automatically included.
551	 */
552	pa = DMAP_TO_PHYS((vm_offset_t)msgbufp->msg_ptr);
553	last_pa = pa + round_page(msgbufsize);
554	while (pa < last_pa) {
555		dump_add_page(pa);
556		pa += PAGE_SIZE;
557	}
558#endif
559	/*
560	 * Compute the number of pages of memory that will be available for
561	 * use, taking into account the overhead of a page structure per page.
562	 * In other words, solve
563	 *	"available physical memory" - round_page(page_range *
564	 *	    sizeof(struct vm_page)) = page_range * PAGE_SIZE
565	 * for page_range.
566	 */
567	low_avail = phys_avail[0];
568	high_avail = phys_avail[1];
569	for (i = 0; i < vm_phys_nsegs; i++) {
570		if (vm_phys_segs[i].start < low_avail)
571			low_avail = vm_phys_segs[i].start;
572		if (vm_phys_segs[i].end > high_avail)
573			high_avail = vm_phys_segs[i].end;
574	}
575	/* Skip the first chunk.  It is already accounted for. */
576	for (i = 2; phys_avail[i + 1] != 0; i += 2) {
577		if (phys_avail[i] < low_avail)
578			low_avail = phys_avail[i];
579		if (phys_avail[i + 1] > high_avail)
580			high_avail = phys_avail[i + 1];
581	}
582	first_page = low_avail / PAGE_SIZE;
583#ifdef VM_PHYSSEG_SPARSE
584	size = 0;
585	for (i = 0; i < vm_phys_nsegs; i++)
586		size += vm_phys_segs[i].end - vm_phys_segs[i].start;
587	for (i = 0; phys_avail[i + 1] != 0; i += 2)
588		size += phys_avail[i + 1] - phys_avail[i];
589#elif defined(VM_PHYSSEG_DENSE)
590	size = high_avail - low_avail;
591#else
592#error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined."
593#endif
594
595#ifdef VM_PHYSSEG_DENSE
596	/*
597	 * In the VM_PHYSSEG_DENSE case, the number of pages can account for
598	 * the overhead of a page structure per page only if vm_page_array is
599	 * allocated from the last physical memory chunk.  Otherwise, we must
600	 * allocate page structures representing the physical memory
601	 * underlying vm_page_array, even though they will not be used.
602	 */
603	if (new_end != high_avail)
604		page_range = size / PAGE_SIZE;
605	else
606#endif
607	{
608		page_range = size / (PAGE_SIZE + sizeof(struct vm_page));
609
610		/*
611		 * If the partial bytes remaining are large enough for
612		 * a page (PAGE_SIZE) without a corresponding
613		 * 'struct vm_page', then new_end will contain an
614		 * extra page after subtracting the length of the VM
615		 * page array.  Compensate by subtracting an extra
616		 * page from new_end.
617		 */
618		if (size % (PAGE_SIZE + sizeof(struct vm_page)) >= PAGE_SIZE) {
619			if (new_end == high_avail)
620				high_avail -= PAGE_SIZE;
621			new_end -= PAGE_SIZE;
622		}
623	}
624	end = new_end;
625
626	/*
627	 * Reserve an unmapped guard page to trap access to vm_page_array[-1].
628	 * However, because this page is allocated from KVM, out-of-bounds
629	 * accesses using the direct map will not be trapped.
630	 */
631	vaddr += PAGE_SIZE;
632
633	/*
634	 * Allocate physical memory for the page structures, and map it.
635	 */
636	new_end = trunc_page(end - page_range * sizeof(struct vm_page));
637	mapped = pmap_map(&vaddr, new_end, end,
638	    VM_PROT_READ | VM_PROT_WRITE);
639	vm_page_array = (vm_page_t)mapped;
640	vm_page_array_size = page_range;
641
642#if VM_NRESERVLEVEL > 0
643	/*
644	 * Allocate physical memory for the reservation management system's
645	 * data structures, and map it.
646	 */
647	if (high_avail == end)
648		high_avail = new_end;
649	new_end = vm_reserv_startup(&vaddr, new_end, high_avail);
650#endif
651#if defined(__aarch64__) || defined(__amd64__) || defined(__mips__)
652	/*
653	 * Include vm_page_array and vm_reserv_array in a crash dump.
654	 */
655	for (pa = new_end; pa < end; pa += PAGE_SIZE)
656		dump_add_page(pa);
657#endif
658	phys_avail[biggestone + 1] = new_end;
659
660	/*
661	 * Add physical memory segments corresponding to the available
662	 * physical pages.
663	 */
664	for (i = 0; phys_avail[i + 1] != 0; i += 2)
665		vm_phys_add_seg(phys_avail[i], phys_avail[i + 1]);
666
667	/*
668	 * Initialize the physical memory allocator.
669	 */
670	vm_phys_init();
671
672	/*
673	 * Initialize the page structures and add every available page to the
674	 * physical memory allocator's free lists.
675	 */
676	vm_cnt.v_page_count = 0;
677	vm_cnt.v_free_count = 0;
678	for (segind = 0; segind < vm_phys_nsegs; segind++) {
679		seg = &vm_phys_segs[segind];
680		for (m = seg->first_page, pa = seg->start; pa < seg->end;
681		    m++, pa += PAGE_SIZE)
682			vm_page_init_page(m, pa, segind);
683
684		/*
685		 * Add the segment to the free lists only if it is covered by
686		 * one of the ranges in phys_avail.  Because we've added the
687		 * ranges to the vm_phys_segs array, we can assume that each
688		 * segment is either entirely contained in one of the ranges,
689		 * or doesn't overlap any of them.
690		 */
691		for (i = 0; phys_avail[i + 1] != 0; i += 2) {
692			if (seg->start < phys_avail[i] ||
693			    seg->end > phys_avail[i + 1])
694				continue;
695
696			m = seg->first_page;
697			pagecount = (u_long)atop(seg->end - seg->start);
698
699			mtx_lock(&vm_page_queue_free_mtx);
700			vm_phys_free_contig(m, pagecount);
701			vm_phys_freecnt_adj(m, (int)pagecount);
702			mtx_unlock(&vm_page_queue_free_mtx);
703			vm_cnt.v_page_count += (u_int)pagecount;
704
705			vmd = &vm_dom[seg->domain];
706			vmd->vmd_page_count += (u_int)pagecount;
707			vmd->vmd_segs |= 1UL << m->segind;
708			break;
709		}
710	}
711
712	/*
713	 * Remove blacklisted pages from the physical memory allocator.
714	 */
715	TAILQ_INIT(&blacklist_head);
716	vm_page_blacklist_load(&list, &listend);
717	vm_page_blacklist_check(list, listend);
718
719	list = kern_getenv("vm.blacklist");
720	vm_page_blacklist_check(list, NULL);
721
722	freeenv(list);
723#if VM_NRESERVLEVEL > 0
724	/*
725	 * Initialize the reservation management system.
726	 */
727	vm_reserv_init();
728#endif
729	return (vaddr);
730}
731
732void
733vm_page_reference(vm_page_t m)
734{
735
736	vm_page_aflag_set(m, PGA_REFERENCED);
737}
738
739/*
740 *	vm_page_busy_downgrade:
741 *
742 *	Downgrade an exclusive busy page into a single shared busy page.
743 */
744void
745vm_page_busy_downgrade(vm_page_t m)
746{
747	u_int x;
748	bool locked;
749
750	vm_page_assert_xbusied(m);
751	locked = mtx_owned(vm_page_lockptr(m));
752
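	/*
	 * Swap the single exclusive holder for a single shared holder,
	 * carrying the waiters bit into the comparison.  If waiters are
	 * present and the page lock is not already held, take it so that the
	 * wakeup below pairs with sleepers in vm_page_busy_sleep(), which
	 * use the page lock as their sleep interlock.
	 */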
753	for (;;) {
754		x = m->busy_lock;
755		x &= VPB_BIT_WAITERS;
756		if (x != 0 && !locked)
757			vm_page_lock(m);
758		if (atomic_cmpset_rel_int(&m->busy_lock,
759		    VPB_SINGLE_EXCLUSIVER | x, VPB_SHARERS_WORD(1)))
760			break;
761		if (x != 0 && !locked)
762			vm_page_unlock(m);
763	}
764	if (x != 0) {
765		wakeup(m);
766		if (!locked)
767			vm_page_unlock(m);
768	}
769}
770
771/*
772 *	vm_page_sbusied:
773 *
774 *	Return a positive value if the page is shared busied, 0 otherwise.
775 */
776int
777vm_page_sbusied(vm_page_t m)
778{
779	u_int x;
780
781	x = m->busy_lock;
782	return ((x & VPB_BIT_SHARED) != 0 && x != VPB_UNBUSIED);
783}
784
785/*
786 *	vm_page_sunbusy:
787 *
788 *	Shared unbusy a page.
789 */
790void
791vm_page_sunbusy(vm_page_t m)
792{
793	u_int x;
794
795	vm_page_lock_assert(m, MA_NOTOWNED);
796	vm_page_assert_sbusied(m);
797
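	/*
	 * Release one shared-busy reference.  The last holder either clears
	 * the busy state outright or, if the waiters bit is set, takes the
	 * page lock, clears the state, and wakes up the waiters.
	 */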
798	for (;;) {
799		x = m->busy_lock;
800		if (VPB_SHARERS(x) > 1) {
801			if (atomic_cmpset_int(&m->busy_lock, x,
802			    x - VPB_ONE_SHARER))
803				break;
804			continue;
805		}
806		if ((x & VPB_BIT_WAITERS) == 0) {
807			KASSERT(x == VPB_SHARERS_WORD(1),
808			    ("vm_page_sunbusy: invalid lock state"));
809			if (atomic_cmpset_int(&m->busy_lock,
810			    VPB_SHARERS_WORD(1), VPB_UNBUSIED))
811				break;
812			continue;
813		}
814		KASSERT(x == (VPB_SHARERS_WORD(1) | VPB_BIT_WAITERS),
815		    ("vm_page_sunbusy: invalid lock state for waiters"));
816
817		vm_page_lock(m);
818		if (!atomic_cmpset_int(&m->busy_lock, x, VPB_UNBUSIED)) {
819			vm_page_unlock(m);
820			continue;
821		}
822		wakeup(m);
823		vm_page_unlock(m);
824		break;
825	}
826}
827
828/*
829 *	vm_page_busy_sleep:
830 *
831 *	Sleep and release the page lock, using the page pointer as wchan.
832 *	This is used to implement the hard path of the busying mechanism.
833 *
834 *	The given page must be locked.
835 *
836 *	If nonshared is true, sleep only if the page is xbusy.
837 */
838void
839vm_page_busy_sleep(vm_page_t m, const char *wmesg, bool nonshared)
840{
841	u_int x;
842
843	vm_page_assert_locked(m);
844
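	/*
	 * Do not sleep if the page was unbusied before the page lock was
	 * acquired, if the caller only waits for an exclusive busy and the
	 * page is merely shared busied, or if the waiters bit could not be
	 * set; in those cases drop the page lock and return.
	 */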
845	x = m->busy_lock;
846	if (x == VPB_UNBUSIED || (nonshared && (x & VPB_BIT_SHARED) != 0) ||
847	    ((x & VPB_BIT_WAITERS) == 0 &&
848	    !atomic_cmpset_int(&m->busy_lock, x, x | VPB_BIT_WAITERS))) {
849		vm_page_unlock(m);
850		return;
851	}
852	msleep(m, vm_page_lockptr(m), PVM | PDROP, wmesg, 0);
853}
854
855/*
856 *	vm_page_trysbusy:
857 *
858 *	Try to shared busy a page.
859 *	If the operation succeeds, 1 is returned; otherwise 0.
860 *	The operation never sleeps.
861 */
862int
863vm_page_trysbusy(vm_page_t m)
864{
865	u_int x;
866
867	for (;;) {
868		x = m->busy_lock;
869		if ((x & VPB_BIT_SHARED) == 0)
870			return (0);
871		if (atomic_cmpset_acq_int(&m->busy_lock, x, x + VPB_ONE_SHARER))
872			return (1);
873	}
874}
875
876static void
877vm_page_xunbusy_locked(vm_page_t m)
878{
879
880	vm_page_assert_xbusied(m);
881	vm_page_assert_locked(m);
882
883	atomic_store_rel_int(&m->busy_lock, VPB_UNBUSIED);
884	/* There is a waiter, do wakeup() instead of vm_page_flash(). */
885	wakeup(m);
886}
887
888void
889vm_page_xunbusy_maybelocked(vm_page_t m)
890{
891	bool lockacq;
892
893	vm_page_assert_xbusied(m);
894
895	/*
896	 * Fast path for unbusy.  If it succeeds, we know that there
897	 * are no waiters, so we do not need a wakeup.
898	 */
899	if (atomic_cmpset_rel_int(&m->busy_lock, VPB_SINGLE_EXCLUSIVER,
900	    VPB_UNBUSIED))
901		return;
902
903	lockacq = !mtx_owned(vm_page_lockptr(m));
904	if (lockacq)
905		vm_page_lock(m);
906	vm_page_xunbusy_locked(m);
907	if (lockacq)
908		vm_page_unlock(m);
909}
910
911/*
912 *	vm_page_xunbusy_hard:
913 *
914 *	Called after the first attempt to exclusive unbusy a page has failed.
915 *	It is assumed that the waiters bit is on.
916 */
917void
918vm_page_xunbusy_hard(vm_page_t m)
919{
920
921	vm_page_assert_xbusied(m);
922
923	vm_page_lock(m);
924	vm_page_xunbusy_locked(m);
925	vm_page_unlock(m);
926}
927
928/*
929 *	vm_page_flash:
930 *
931 *	Wake up anyone waiting for the page.
932 *	The ownership bits do not change.
933 *
934 *	The given page must be locked.
935 */
936void
937vm_page_flash(vm_page_t m)
938{
939	u_int x;
940
941	vm_page_lock_assert(m, MA_OWNED);
942
943	for (;;) {
944		x = m->busy_lock;
945		if ((x & VPB_BIT_WAITERS) == 0)
946			return;
947		if (atomic_cmpset_int(&m->busy_lock, x,
948		    x & (~VPB_BIT_WAITERS)))
949			break;
950	}
951	wakeup(m);
952}
953
954/*
955 * Avoid releasing and reacquiring the same page lock.
956 */
957void
958vm_page_change_lock(vm_page_t m, struct mtx **mtx)
959{
960	struct mtx *mtx1;
961
962	mtx1 = vm_page_lockptr(m);
963	if (*mtx == mtx1)
964		return;
965	if (*mtx != NULL)
966		mtx_unlock(*mtx);
967	*mtx = mtx1;
968	mtx_lock(mtx1);
969}
970
971/*
972 * Keep the page from being freed by the page daemon.  This has
973 * much the same effect as wiring, except with much lower
974 * overhead, and should be used only for *very* temporary
975 * holding ("wiring").
976 */
977void
978vm_page_hold(vm_page_t mem)
979{
980
981	vm_page_lock_assert(mem, MA_OWNED);
982	mem->hold_count++;
983}
984
985void
986vm_page_unhold(vm_page_t mem)
987{
988
989	vm_page_lock_assert(mem, MA_OWNED);
990	KASSERT(mem->hold_count >= 1, ("vm_page_unhold: hold count < 0!!!"));
991	--mem->hold_count;
992	if (mem->hold_count == 0 && (mem->flags & PG_UNHOLDFREE) != 0)
993		vm_page_free_toq(mem);
994}
995
996/*
997 *	vm_page_unhold_pages:
998 *
999 *	Unhold each of the pages that is referenced by the given array.
1000 */
1001void
1002vm_page_unhold_pages(vm_page_t *ma, int count)
1003{
1004	struct mtx *mtx;
1005
1006	mtx = NULL;
1007	for (; count != 0; count--) {
1008		vm_page_change_lock(*ma, &mtx);
1009		vm_page_unhold(*ma);
1010		ma++;
1011	}
1012	if (mtx != NULL)
1013		mtx_unlock(mtx);
1014}
1015
1016vm_page_t
1017PHYS_TO_VM_PAGE(vm_paddr_t pa)
1018{
1019	vm_page_t m;
1020
1021#ifdef VM_PHYSSEG_SPARSE
1022	m = vm_phys_paddr_to_vm_page(pa);
1023	if (m == NULL)
1024		m = vm_phys_fictitious_to_vm_page(pa);
1025	return (m);
1026#elif defined(VM_PHYSSEG_DENSE)
1027	long pi;
1028
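	/*
	 * With a dense vm_page_array, the page structure for a physical
	 * address within the managed range is found by direct indexing from
	 * first_page; addresses outside that range can only belong to
	 * fictitious pages.
	 */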
1029	pi = atop(pa);
1030	if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
1031		m = &vm_page_array[pi - first_page];
1032		return (m);
1033	}
1034	return (vm_phys_fictitious_to_vm_page(pa));
1035#else
1036#error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined."
1037#endif
1038}
1039
1040/*
1041 *	vm_page_getfake:
1042 *
1043 *	Create a fictitious page with the specified physical address and
1044 *	memory attribute.  The memory attribute is the only machine-
1045 *	dependent aspect of a fictitious page that must be initialized.
1046 */
1047vm_page_t
1048vm_page_getfake(vm_paddr_t paddr, vm_memattr_t memattr)
1049{
1050	vm_page_t m;
1051
1052	m = uma_zalloc(fakepg_zone, M_WAITOK | M_ZERO);
1053	vm_page_initfake(m, paddr, memattr);
1054	return (m);
1055}
1056
1057void
1058vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr)
1059{
1060
1061	if ((m->flags & PG_FICTITIOUS) != 0) {
1062		/*
1063		 * The page's memattr might have changed since the
1064		 * previous initialization.  Update the pmap to the
1065		 * new memattr.
1066		 */
1067		goto memattr;
1068	}
1069	m->phys_addr = paddr;
1070	m->queue = PQ_NONE;
1071	/* Fictitious pages don't use "segind". */
1072	m->flags = PG_FICTITIOUS;
1073	/* Fictitious pages don't use "order" or "pool". */
1074	m->oflags = VPO_UNMANAGED;
1075	m->busy_lock = VPB_SINGLE_EXCLUSIVER;
1076	m->wire_count = 1;
1077	pmap_page_init(m);
1078memattr:
1079	pmap_page_set_memattr(m, memattr);
1080}
1081
1082/*
1083 *	vm_page_putfake:
1084 *
1085 *	Release a fictitious page.
1086 */
1087void
1088vm_page_putfake(vm_page_t m)
1089{
1090
1091	KASSERT((m->oflags & VPO_UNMANAGED) != 0, ("managed %p", m));
1092	KASSERT((m->flags & PG_FICTITIOUS) != 0,
1093	    ("vm_page_putfake: bad page %p", m));
1094	uma_zfree(fakepg_zone, m);
1095}
1096
1097/*
1098 *	vm_page_updatefake:
1099 *
1100 *	Update the given fictitious page to the specified physical address and
1101 *	memory attribute.
1102 */
1103void
1104vm_page_updatefake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr)
1105{
1106
1107	KASSERT((m->flags & PG_FICTITIOUS) != 0,
1108	    ("vm_page_updatefake: bad page %p", m));
1109	m->phys_addr = paddr;
1110	pmap_page_set_memattr(m, memattr);
1111}
1112
1113/*
1114 *	vm_page_free:
1115 *
1116 *	Free a page.
1117 */
1118void
1119vm_page_free(vm_page_t m)
1120{
1121
1122	m->flags &= ~PG_ZERO;
1123	vm_page_free_toq(m);
1124}
1125
1126/*
1127 *	vm_page_free_zero:
1128 *
1129 *	Free a page whose contents are known to be zeroed (sets PG_ZERO).
1130 */
1131void
1132vm_page_free_zero(vm_page_t m)
1133{
1134
1135	m->flags |= PG_ZERO;
1136	vm_page_free_toq(m);
1137}
1138
1139/*
1140 * Unbusy and handle the page queueing for a page from a getpages request that
1141 * was optionally read ahead or behind.
1142 */
1143void
1144vm_page_readahead_finish(vm_page_t m)
1145{
1146
1147	/* We shouldn't put invalid pages on queues. */
1148	KASSERT(m->valid != 0, ("%s: %p is invalid", __func__, m));
1149
1150	/*
1151	 * Since the page is not the one actually needed, whether it should
1152	 * be activated or deactivated is not obvious.  Empirical results
1153	 * have shown that deactivating the page is usually the best choice,
1154	 * unless the page is wanted by another thread.
1155	 */
1156	vm_page_lock(m);
1157	if ((m->busy_lock & VPB_BIT_WAITERS) != 0)
1158		vm_page_activate(m);
1159	else
1160		vm_page_deactivate(m);
1161	vm_page_unlock(m);
1162	vm_page_xunbusy(m);
1163}
1164
1165/*
1166 *	vm_page_sleep_if_busy:
1167 *
1168 *	Sleep and release the page queues lock if the page is busied.
1169 *	Returns TRUE if the thread slept.
1170 *
1171 *	The given page must be unlocked and object containing it must
1172 *	be locked.
1173 */
1174int
1175vm_page_sleep_if_busy(vm_page_t m, const char *msg)
1176{
1177	vm_object_t obj;
1178
1179	vm_page_lock_assert(m, MA_NOTOWNED);
1180	VM_OBJECT_ASSERT_WLOCKED(m->object);
1181
1182	if (vm_page_busied(m)) {
1183		/*
1184		 * The page-specific object must be cached because page
1185		 * identity can change during the sleep, which could cause
1186		 * a different object to be re-locked afterwards.
1187		 * It is assumed that a reference to the object is already
1188		 * held by the caller.
1189		 */
1190		obj = m->object;
1191		vm_page_lock(m);
1192		VM_OBJECT_WUNLOCK(obj);
1193		vm_page_busy_sleep(m, msg, false);
1194		VM_OBJECT_WLOCK(obj);
1195		return (TRUE);
1196	}
1197	return (FALSE);
1198}
1199
1200/*
1201 *	vm_page_dirty_KBI:		[ internal use only ]
1202 *
1203 *	Set all bits in the page's dirty field.
1204 *
1205 *	The object containing the specified page must be locked if the
1206 *	call is made from the machine-independent layer.
1207 *
1208 *	See vm_page_clear_dirty_mask().
1209 *
1210 *	This function should only be called by vm_page_dirty().
1211 */
1212void
1213vm_page_dirty_KBI(vm_page_t m)
1214{
1215
1216	/* Refer to this operation by its public name. */
1217	KASSERT(m->valid == VM_PAGE_BITS_ALL,
1218	    ("vm_page_dirty: page is invalid!"));
1219	m->dirty = VM_PAGE_BITS_ALL;
1220}
1221
1222/*
1223 *	vm_page_insert:		[ internal use only ]
1224 *
1225 *	Inserts the given page into the object and the object's resident page list.
1226 *
1227 *	The object must be locked.
1228 */
1229int
1230vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
1231{
1232	vm_page_t mpred;
1233
1234	VM_OBJECT_ASSERT_WLOCKED(object);
1235	mpred = vm_radix_lookup_le(&object->rtree, pindex);
1236	return (vm_page_insert_after(m, object, pindex, mpred));
1237}
1238
1239/*
1240 *	vm_page_insert_after:
1241 *
1242 *	Inserts the page "m" into the specified object at offset "pindex".
1243 *
1244 *	The page "mpred" must immediately precede the offset "pindex" within
1245 *	the specified object.
1246 *
1247 *	The object must be locked.
1248 */
1249static int
1250vm_page_insert_after(vm_page_t m, vm_object_t object, vm_pindex_t pindex,
1251    vm_page_t mpred)
1252{
1253	vm_page_t msucc;
1254
1255	VM_OBJECT_ASSERT_WLOCKED(object);
1256	KASSERT(m->object == NULL,
1257	    ("vm_page_insert_after: page already inserted"));
1258	if (mpred != NULL) {
1259		KASSERT(mpred->object == object,
1260		    ("vm_page_insert_after: object doesn't contain mpred"));
1261		KASSERT(mpred->pindex < pindex,
1262		    ("vm_page_insert_after: mpred doesn't precede pindex"));
1263		msucc = TAILQ_NEXT(mpred, listq);
1264	} else
1265		msucc = TAILQ_FIRST(&object->memq);
1266	if (msucc != NULL)
1267		KASSERT(msucc->pindex > pindex,
1268		    ("vm_page_insert_after: msucc doesn't succeed pindex"));
1269
1270	/*
1271	 * Record the object/offset pair in this page
1272	 */
1273	m->object = object;
1274	m->pindex = pindex;
1275
1276	/*
1277	 * Now link into the object's ordered list of backed pages.
1278	 */
1279	if (vm_radix_insert(&object->rtree, m)) {
1280		m->object = NULL;
1281		m->pindex = 0;
1282		return (1);
1283	}
1284	vm_page_insert_radixdone(m, object, mpred);
1285	return (0);
1286}
1287
1288/*
1289 *	vm_page_insert_radixdone:
1290 *
1291 *	Complete page "m" insertion into the specified object after the
1292 *	radix trie hooking.
1293 *
1294 *	The page "mpred" must precede the offset "m->pindex" within the
1295 *	specified object.
1296 *
1297 *	The object must be locked.
1298 */
1299static void
1300vm_page_insert_radixdone(vm_page_t m, vm_object_t object, vm_page_t mpred)
1301{
1302
1303	VM_OBJECT_ASSERT_WLOCKED(object);
1304	KASSERT(object != NULL && m->object == object,
1305	    ("vm_page_insert_radixdone: page %p has inconsistent object", m));
1306	if (mpred != NULL) {
1307		KASSERT(mpred->object == object,
1308		    ("vm_page_insert_after: object doesn't contain mpred"));
1309		KASSERT(mpred->pindex < m->pindex,
1310		    ("vm_page_insert_after: mpred doesn't precede pindex"));
1311	}
1312
1313	if (mpred != NULL)
1314		TAILQ_INSERT_AFTER(&object->memq, mpred, m, listq);
1315	else
1316		TAILQ_INSERT_HEAD(&object->memq, m, listq);
1317
1318	/*
1319	 * Show that the object has one more resident page.
1320	 */
1321	object->resident_page_count++;
1322
1323	/*
1324	 * Hold the vnode until the last page is released.
1325	 */
1326	if (object->resident_page_count == 1 && object->type == OBJT_VNODE)
1327		vhold(object->handle);
1328
1329	/*
1330	 * Since we are inserting a new and possibly dirty page,
1331	 * update the object's OBJ_MIGHTBEDIRTY flag.
1332	 */
1333	if (pmap_page_is_write_mapped(m))
1334		vm_object_set_writeable_dirty(object);
1335}
1336
1337/*
1338 *	vm_page_remove:
1339 *
1340 *	Removes the specified page from its containing object, but does not
1341 *	invalidate any backing storage.
1342 *
1343 *	The object must be locked.  The page must be locked if it is managed.
1344 */
1345void
1346vm_page_remove(vm_page_t m)
1347{
1348	vm_object_t object;
1349	vm_page_t mrem;
1350
1351	if ((m->oflags & VPO_UNMANAGED) == 0)
1352		vm_page_assert_locked(m);
1353	if ((object = m->object) == NULL)
1354		return;
1355	VM_OBJECT_ASSERT_WLOCKED(object);
1356	if (vm_page_xbusied(m))
1357		vm_page_xunbusy_maybelocked(m);
1358	mrem = vm_radix_remove(&object->rtree, m->pindex);
1359	KASSERT(mrem == m, ("removed page %p, expected page %p", mrem, m));
1360
1361	/*
1362	 * Now remove from the object's list of backed pages.
1363	 */
1364	TAILQ_REMOVE(&object->memq, m, listq);
1365
1366	/*
1367	 * And show that the object has one fewer resident page.
1368	 */
1369	object->resident_page_count--;
1370
1371	/*
1372	 * The vnode may now be recycled.
1373	 */
1374	if (object->resident_page_count == 0 && object->type == OBJT_VNODE)
1375		vdrop(object->handle);
1376
1377	m->object = NULL;
1378}
1379
1380/*
1381 *	vm_page_lookup:
1382 *
1383 *	Returns the page associated with the object/offset
1384 *	pair specified; if none is found, NULL is returned.
1385 *
1386 *	The object must be locked.
1387 */
1388vm_page_t
1389vm_page_lookup(vm_object_t object, vm_pindex_t pindex)
1390{
1391
1392	VM_OBJECT_ASSERT_LOCKED(object);
1393	return (vm_radix_lookup(&object->rtree, pindex));
1394}
1395
1396/*
1397 *	vm_page_find_least:
1398 *
1399 *	Returns the page associated with the object with least pindex
1400 *	greater than or equal to the parameter pindex, or NULL.
1401 *
1402 *	The object must be locked.
1403 */
1404vm_page_t
1405vm_page_find_least(vm_object_t object, vm_pindex_t pindex)
1406{
1407	vm_page_t m;
1408
1409	VM_OBJECT_ASSERT_LOCKED(object);
1410	if ((m = TAILQ_FIRST(&object->memq)) != NULL && m->pindex < pindex)
1411		m = vm_radix_lookup_ge(&object->rtree, pindex);
1412	return (m);
1413}
1414
1415/*
1416 * Returns the given page's successor (by pindex) within the object if it is
1417 * resident; if none is found, NULL is returned.
1418 *
1419 * The object must be locked.
1420 */
1421vm_page_t
1422vm_page_next(vm_page_t m)
1423{
1424	vm_page_t next;
1425
1426	VM_OBJECT_ASSERT_LOCKED(m->object);
1427	if ((next = TAILQ_NEXT(m, listq)) != NULL) {
1428		MPASS(next->object == m->object);
1429		if (next->pindex != m->pindex + 1)
1430			next = NULL;
1431	}
1432	return (next);
1433}
1434
1435/*
1436 * Returns the given page's predecessor (by pindex) within the object if it is
1437 * resident; if none is found, NULL is returned.
1438 *
1439 * The object must be locked.
1440 */
1441vm_page_t
1442vm_page_prev(vm_page_t m)
1443{
1444	vm_page_t prev;
1445
1446	VM_OBJECT_ASSERT_LOCKED(m->object);
1447	if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL) {
1448		MPASS(prev->object == m->object);
1449		if (prev->pindex != m->pindex - 1)
1450			prev = NULL;
1451	}
1452	return (prev);
1453}
1454
1455/*
1456 * Uses the page mnew as a replacement for an existing page at index
1457 * pindex which must be already present in the object.
1458 *
1459 * The existing page must not be on a paging queue.
1460 */
1461vm_page_t
1462vm_page_replace(vm_page_t mnew, vm_object_t object, vm_pindex_t pindex)
1463{
1464	vm_page_t mold;
1465
1466	VM_OBJECT_ASSERT_WLOCKED(object);
1467	KASSERT(mnew->object == NULL,
1468	    ("vm_page_replace: page already in object"));
1469
1470	/*
1471	 * This function mostly follows vm_page_insert() and
1472	 * vm_page_remove() without the radix, object count and vnode
1473	 * dance.  Consult those functions for more detailed comments.
1474	 */
1475
1476	mnew->object = object;
1477	mnew->pindex = pindex;
1478	mold = vm_radix_replace(&object->rtree, mnew);
1479	KASSERT(mold->queue == PQ_NONE,
1480	    ("vm_page_replace: mold is on a paging queue"));
1481
1482	/* Keep the resident page list in sorted order. */
1483	TAILQ_INSERT_AFTER(&object->memq, mold, mnew, listq);
1484	TAILQ_REMOVE(&object->memq, mold, listq);
1485
1486	mold->object = NULL;
1487	vm_page_xunbusy_maybelocked(mold);
1488
1489	/*
1490	 * The object's resident_page_count does not change because we have
1491	 * swapped one page for another, but OBJ_MIGHTBEDIRTY may need updating.
1492	 */
1493	if (pmap_page_is_write_mapped(mnew))
1494		vm_object_set_writeable_dirty(object);
1495	return (mold);
1496}
1497
1498/*
1499 *	vm_page_rename:
1500 *
1501 *	Move the given memory entry from its
1502 *	current object to the specified target object/offset.
1503 *
1504 *	Note: swap associated with the page must be invalidated by the move.  We
1505 *	      have to do this for several reasons:  (1) we aren't freeing the
1506 *	      page, (2) we are dirtying the page, (3) the VM system is probably
1507 *	      moving the page from object A to B, and will then later move
1508 *	      the backing store from A to B and we can't have a conflict.
1509 *
1510 *	Note: we *always* dirty the page.  It is necessary both for the
1511 *	      fact that we moved it, and because we may be invalidating
1512 *	      swap.
1513 *
1514 *	The objects must be locked.
1515 */
1516int
1517vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex)
1518{
1519	vm_page_t mpred;
1520	vm_pindex_t opidx;
1521
1522	VM_OBJECT_ASSERT_WLOCKED(new_object);
1523
1524	mpred = vm_radix_lookup_le(&new_object->rtree, new_pindex);
1525	KASSERT(mpred == NULL || mpred->pindex != new_pindex,
1526	    ("vm_page_rename: pindex already renamed"));
1527
1528	/*
1529	 * Create a custom version of vm_page_insert() which does not depend
1530	 * on m_prev and can cheat on the implementation aspects of the
1531	 * function.
1532	 */
1533	opidx = m->pindex;
1534	m->pindex = new_pindex;
1535	if (vm_radix_insert(&new_object->rtree, m)) {
1536		m->pindex = opidx;
1537		return (1);
1538	}
1539
1540	/*
1541	 * The operation cannot fail anymore.  The removal must happen before
1542	 * the listq iterator is tainted.
1543	 */
1544	m->pindex = opidx;
1545	vm_page_lock(m);
1546	vm_page_remove(m);
1547
1548	/* Switch back to the new pindex to complete vm_page_insert(). */
1549	m->pindex = new_pindex;
1550	m->object = new_object;
1551	vm_page_unlock(m);
1552	vm_page_insert_radixdone(m, new_object, mpred);
1553	vm_page_dirty(m);
1554	return (0);
1555}
1556
1557/*
1558 *	vm_page_alloc:
1559 *
1560 *	Allocate and return a page that is associated with the specified
1561 *	object and offset pair.  By default, this page is exclusive busied.
1562 *
1563 *	The caller must always specify an allocation class.
1564 *
1565 *	allocation classes:
1566 *	VM_ALLOC_NORMAL		normal process request
1567 *	VM_ALLOC_SYSTEM		system *really* needs a page
1568 *	VM_ALLOC_INTERRUPT	interrupt time request
1569 *
1570 *	optional allocation flags:
1571 *	VM_ALLOC_COUNT(number)	the number of additional pages that the caller
1572 *				intends to allocate
1573 *	VM_ALLOC_NOBUSY		do not exclusive busy the page
1574 *	VM_ALLOC_NODUMP		do not include the page in a kernel core dump
1575 *	VM_ALLOC_NOOBJ		page is not associated with an object and
1576 *				should not be exclusive busy
1577 *	VM_ALLOC_SBUSY		shared busy the allocated page
1578 *	VM_ALLOC_WIRED		wire the allocated page
1579 *	VM_ALLOC_ZERO		prefer a zeroed page
1580 *
1581 *	This routine may not sleep.
1582 */
1583vm_page_t
1584vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req)
1585{
1586	vm_page_t m, mpred;
1587	int flags, req_class;
1588
1589	mpred = NULL;	/* XXX: pacify gcc */
1590	KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0) &&
1591	    (object != NULL || (req & VM_ALLOC_SBUSY) == 0) &&
1592	    ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) !=
1593	    (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)),
1594	    ("vm_page_alloc: inconsistent object(%p)/req(%x)", object, req));
1595	if (object != NULL)
1596		VM_OBJECT_ASSERT_WLOCKED(object);
1597
1598	if (__predict_false((req & VM_ALLOC_IFCACHED) != 0))
1599		return (NULL);
1600
1601	req_class = req & VM_ALLOC_CLASS_MASK;
1602
1603	/*
1604	 * The page daemon is allowed to dig deeper into the free page list.
1605	 */
1606	if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT)
1607		req_class = VM_ALLOC_SYSTEM;
1608
1609	if (object != NULL) {
1610		mpred = vm_radix_lookup_le(&object->rtree, pindex);
1611		KASSERT(mpred == NULL || mpred->pindex != pindex,
1612		   ("vm_page_alloc: pindex already allocated"));
1613	}
1614
1615	/*
1616	 * Allocate a page if the number of free pages exceeds the minimum
1617	 * for the request class.
1618	 */
1619	mtx_lock(&vm_page_queue_free_mtx);
1620	if (vm_cnt.v_free_count > vm_cnt.v_free_reserved ||
1621	    (req_class == VM_ALLOC_SYSTEM &&
1622	    vm_cnt.v_free_count > vm_cnt.v_interrupt_free_min) ||
1623	    (req_class == VM_ALLOC_INTERRUPT &&
1624	    vm_cnt.v_free_count > 0)) {
1625		/*
1626		 * Can we allocate the page from a reservation?
1627		 */
1628#if VM_NRESERVLEVEL > 0
1629		if (object == NULL || (object->flags & (OBJ_COLORED |
1630		    OBJ_FICTITIOUS)) != OBJ_COLORED || (m =
1631		    vm_reserv_alloc_page(object, pindex, mpred)) == NULL)
1632#endif
1633		{
1634			/*
1635			 * If not, allocate it from the free page queues.
1636			 */
1637			m = vm_phys_alloc_pages(object != NULL ?
1638			    VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT, 0);
1639#if VM_NRESERVLEVEL > 0
1640			if (m == NULL && vm_reserv_reclaim_inactive()) {
1641				m = vm_phys_alloc_pages(object != NULL ?
1642				    VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT,
1643				    0);
1644			}
1645#endif
1646		}
1647	} else {
1648		/*
1649		 * Not allocatable, give up.
1650		 */
1651		mtx_unlock(&vm_page_queue_free_mtx);
1652		atomic_add_int(&vm_pageout_deficit,
1653		    max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1));
1654		pagedaemon_wakeup();
1655		return (NULL);
1656	}
1657
1658	/*
1659	 *  At this point we had better have found a good page.
1660	 */
1661	KASSERT(m != NULL, ("vm_page_alloc: missing page"));
1662	vm_phys_freecnt_adj(m, -1);
1663	if ((m->flags & PG_ZERO) != 0)
1664		vm_page_zero_count--;
1665	mtx_unlock(&vm_page_queue_free_mtx);
1666	vm_page_alloc_check(m);
1667
1668	/*
1669	 * Initialize the page.  Only the PG_ZERO flag is inherited.
1670	 */
1671	flags = 0;
1672	if ((req & VM_ALLOC_ZERO) != 0)
1673		flags = PG_ZERO;
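	/*
	 * PG_ZERO is kept only if the caller asked for a zeroed page and the
	 * page taken from the free lists actually has PG_ZERO set.
	 */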
1674	flags &= m->flags;
1675	if ((req & VM_ALLOC_NODUMP) != 0)
1676		flags |= PG_NODUMP;
1677	m->flags = flags;
1678	m->aflags = 0;
1679	m->oflags = object == NULL || (object->flags & OBJ_UNMANAGED) != 0 ?
1680	    VPO_UNMANAGED : 0;
1681	m->busy_lock = VPB_UNBUSIED;
1682	if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ | VM_ALLOC_SBUSY)) == 0)
1683		m->busy_lock = VPB_SINGLE_EXCLUSIVER;
1684	if ((req & VM_ALLOC_SBUSY) != 0)
1685		m->busy_lock = VPB_SHARERS_WORD(1);
1686	if (req & VM_ALLOC_WIRED) {
1687		/*
1688		 * The page lock is not required for wiring a page until that
1689		 * page is inserted into the object.
1690		 */
1691		atomic_add_int(&vm_cnt.v_wire_count, 1);
1692		m->wire_count = 1;
1693	}
1694	m->act_count = 0;
1695
1696	if (object != NULL) {
1697		if (vm_page_insert_after(m, object, pindex, mpred)) {
1698			pagedaemon_wakeup();
1699			if (req & VM_ALLOC_WIRED) {
1700				atomic_subtract_int(&vm_cnt.v_wire_count, 1);
1701				m->wire_count = 0;
1702			}
1703			KASSERT(m->object == NULL, ("page %p has object", m));
1704			m->oflags = VPO_UNMANAGED;
1705			m->busy_lock = VPB_UNBUSIED;
1706			/* Don't change PG_ZERO. */
1707			vm_page_free_toq(m);
1708			return (NULL);
1709		}
1710
1711		/* Ignore device objects; the pager sets "memattr" for them. */
1712		if (object->memattr != VM_MEMATTR_DEFAULT &&
1713		    (object->flags & OBJ_FICTITIOUS) == 0)
1714			pmap_page_set_memattr(m, object->memattr);
1715	} else
1716		m->pindex = pindex;
1717
1718	/*
1719	 * Don't wake up too often - wake up the pageout daemon when
1720	 * we would be nearly out of memory.
1721	 */
1722	if (vm_paging_needed())
1723		pagedaemon_wakeup();
1724
1725	return (m);
1726}
1727
1728/*
1729 *	vm_page_alloc_contig:
1730 *
1731 *	Allocate a contiguous set of physical pages of the given size "npages"
1732 *	from the free lists.  All of the physical pages must be at or above
1733 *	the given physical address "low" and below the given physical address
1734 *	"high".  The given value "alignment" determines the alignment of the
1735 *	first physical page in the set.  If the given value "boundary" is
1736 *	non-zero, then the set of physical pages cannot cross any physical
1737 *	address boundary that is a multiple of that value.  Both "alignment"
1738 *	and "boundary" must be a power of two.
1739 *
1740 *	If the specified memory attribute, "memattr", is VM_MEMATTR_DEFAULT,
1741 *	then the memory attribute setting for the physical pages is configured
1742 *	to the object's memory attribute setting.  Otherwise, the memory
1743 *	attribute setting for the physical pages is configured to "memattr",
1744 *	overriding the object's memory attribute setting.  However, if the
1745 *	object's memory attribute setting is not VM_MEMATTR_DEFAULT, then the
1746 *	memory attribute setting for the physical pages cannot be configured
1747 *	to VM_MEMATTR_DEFAULT.
1748 *
1749 *	The specified object may not contain fictitious pages.
1750 *
1751 *	The caller must always specify an allocation class.
1752 *
1753 *	allocation classes:
1754 *	VM_ALLOC_NORMAL		normal process request
1755 *	VM_ALLOC_SYSTEM		system *really* needs a page
1756 *	VM_ALLOC_INTERRUPT	interrupt time request
1757 *
1758 *	optional allocation flags:
1759 *	VM_ALLOC_NOBUSY		do not exclusive busy the page
1760 *	VM_ALLOC_NODUMP		do not include the page in a kernel core dump
1761 *	VM_ALLOC_NOOBJ		page is not associated with an object and
1762 *				should not be exclusive busy
1763 *	VM_ALLOC_SBUSY		shared busy the allocated page
1764 *	VM_ALLOC_WIRED		wire the allocated page
1765 *	VM_ALLOC_ZERO		prefer a zeroed page
1766 *
1767 *	This routine may not sleep.
1768 */
1769vm_page_t
1770vm_page_alloc_contig(vm_object_t object, vm_pindex_t pindex, int req,
1771    u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment,
1772    vm_paddr_t boundary, vm_memattr_t memattr)
1773{
1774	vm_page_t m, m_ret, mpred;
1775	u_int busy_lock, flags, oflags;
1776	int req_class;
1777
1778	mpred = NULL;	/* XXX: pacify gcc */
1779	KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0) &&
1780	    (object != NULL || (req & VM_ALLOC_SBUSY) == 0) &&
1781	    ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) !=
1782	    (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)),
1783	    ("vm_page_alloc_contig: inconsistent object(%p)/req(%x)", object,
1784	    req));
1785	if (object != NULL) {
1786		VM_OBJECT_ASSERT_WLOCKED(object);
1787		KASSERT((object->flags & OBJ_FICTITIOUS) == 0,
1788		    ("vm_page_alloc_contig: object %p has fictitious pages",
1789		    object));
1790	}
1791	KASSERT(npages > 0, ("vm_page_alloc_contig: npages is zero"));
1792	req_class = req & VM_ALLOC_CLASS_MASK;
1793
1794	/*
1795	 * The page daemon is allowed to dig deeper into the free page list.
1796	 */
1797	if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT)
1798		req_class = VM_ALLOC_SYSTEM;
1799
1800	if (object != NULL) {
1801		mpred = vm_radix_lookup_le(&object->rtree, pindex);
1802		KASSERT(mpred == NULL || mpred->pindex != pindex,
1803		    ("vm_page_alloc_contig: pindex already allocated"));
1804	}
1805
1806	/*
1807	 * Can we allocate the pages without the number of free pages falling
1808	 * below the lower bound for the allocation class?
1809	 */
1810	mtx_lock(&vm_page_queue_free_mtx);
1811	if (vm_cnt.v_free_count >= npages + vm_cnt.v_free_reserved ||
1812	    (req_class == VM_ALLOC_SYSTEM &&
1813	    vm_cnt.v_free_count >= npages + vm_cnt.v_interrupt_free_min) ||
1814	    (req_class == VM_ALLOC_INTERRUPT &&
1815	    vm_cnt.v_free_count >= npages)) {
1816		/*
1817		 * Can we allocate the pages from a reservation?
1818		 */
1819#if VM_NRESERVLEVEL > 0
1820retry:
1821		if (object == NULL || (object->flags & OBJ_COLORED) == 0 ||
1822		    (m_ret = vm_reserv_alloc_contig(object, pindex, npages,
1823		    low, high, alignment, boundary, mpred)) == NULL)
1824#endif
1825			/*
1826			 * If not, allocate them from the free page queues.
1827			 */
1828			m_ret = vm_phys_alloc_contig(npages, low, high,
1829			    alignment, boundary);
1830	} else {
1831		mtx_unlock(&vm_page_queue_free_mtx);
1832		atomic_add_int(&vm_pageout_deficit, npages);
1833		pagedaemon_wakeup();
1834		return (NULL);
1835	}
1836	if (m_ret != NULL) {
1837		vm_phys_freecnt_adj(m_ret, -npages);
1838		for (m = m_ret; m < &m_ret[npages]; m++)
1839			if ((m->flags & PG_ZERO) != 0)
1840				vm_page_zero_count--;
1841	} else {
1842#if VM_NRESERVLEVEL > 0
1843		if (vm_reserv_reclaim_contig(npages, low, high, alignment,
1844		    boundary))
1845			goto retry;
1846#endif
1847	}
1848	mtx_unlock(&vm_page_queue_free_mtx);
1849	if (m_ret == NULL)
1850		return (NULL);
1851	for (m = m_ret; m < &m_ret[npages]; m++)
1852		vm_page_alloc_check(m);
1853
1854	/*
1855	 * Initialize the pages.  Only the PG_ZERO flag is inherited.
1856	 */
1857	flags = 0;
1858	if ((req & VM_ALLOC_ZERO) != 0)
1859		flags = PG_ZERO;
1860	if ((req & VM_ALLOC_NODUMP) != 0)
1861		flags |= PG_NODUMP;
1862	oflags = object == NULL || (object->flags & OBJ_UNMANAGED) != 0 ?
1863	    VPO_UNMANAGED : 0;
1864	busy_lock = VPB_UNBUSIED;
1865	if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ | VM_ALLOC_SBUSY)) == 0)
1866		busy_lock = VPB_SINGLE_EXCLUSIVER;
1867	if ((req & VM_ALLOC_SBUSY) != 0)
1868		busy_lock = VPB_SHARERS_WORD(1);
1869	if ((req & VM_ALLOC_WIRED) != 0)
1870		atomic_add_int(&vm_cnt.v_wire_count, npages);
1871	if (object != NULL) {
1872		if (object->memattr != VM_MEMATTR_DEFAULT &&
1873		    memattr == VM_MEMATTR_DEFAULT)
1874			memattr = object->memattr;
1875	}
1876	for (m = m_ret; m < &m_ret[npages]; m++) {
1877		m->aflags = 0;
1878		m->flags = (m->flags | PG_NODUMP) & flags;
1879		m->busy_lock = busy_lock;
1880		if ((req & VM_ALLOC_WIRED) != 0)
1881			m->wire_count = 1;
1882		m->act_count = 0;
1883		m->oflags = oflags;
1884		if (object != NULL) {
1885			if (vm_page_insert_after(m, object, pindex, mpred)) {
1886				pagedaemon_wakeup();
1887				if ((req & VM_ALLOC_WIRED) != 0)
1888					atomic_subtract_int(
1889					    &vm_cnt.v_wire_count, npages);
1890				KASSERT(m->object == NULL,
1891				    ("page %p has object", m));
1892				mpred = m;
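				/*
				 * Unwind the partially constructed run: only
				 * pages up to and including the one that
				 * failed insertion (saved in "mpred") had
				 * wire_count set above, so clear it for just
				 * those before freeing every page in the run.
				 */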
1893				for (m = m_ret; m < &m_ret[npages]; m++) {
1894					if (m <= mpred &&
1895					    (req & VM_ALLOC_WIRED) != 0)
1896						m->wire_count = 0;
1897					m->oflags = VPO_UNMANAGED;
1898					m->busy_lock = VPB_UNBUSIED;
1899					/* Don't change PG_ZERO. */
1900					vm_page_free_toq(m);
1901				}
1902				return (NULL);
1903			}
1904			mpred = m;
1905		} else
1906			m->pindex = pindex;
1907		if (memattr != VM_MEMATTR_DEFAULT)
1908			pmap_page_set_memattr(m, memattr);
1909		pindex++;
1910	}
1911	if (vm_paging_needed())
1912		pagedaemon_wakeup();
1913	return (m_ret);
1914}
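
/*
 * Usage sketch: a hypothetical caller allocating four wired, physically
 * contiguous pages (preferring pre-zeroed ones) into an object.  "obj" and
 * "pidx" are assumed to be supplied by that caller; the object must be
 * write locked around the call.  Guarded out because it is an example only.
 */
#if 0
	vm_page_t m;

	VM_OBJECT_WLOCK(obj);
	m = vm_page_alloc_contig(obj, pidx, VM_ALLOC_NORMAL | VM_ALLOC_WIRED |
	    VM_ALLOC_ZERO, 4, 0, ~(vm_paddr_t)0, PAGE_SIZE, 0,
	    VM_MEMATTR_DEFAULT);
	VM_OBJECT_WUNLOCK(obj);
	if (m == NULL) {
		/* See vm_page_reclaim_contig() and VM_WAIT further below. */
	}
#endif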
1915
1916/*
1917 * Check a page that has been freshly dequeued from a freelist.
1918 */
1919static void
1920vm_page_alloc_check(vm_page_t m)
1921{
1922
1923	KASSERT(m->object == NULL, ("page %p has object", m));
1924	KASSERT(m->queue == PQ_NONE,
1925	    ("page %p has unexpected queue %d", m, m->queue));
1926	KASSERT(m->wire_count == 0, ("page %p is wired", m));
1927	KASSERT(m->hold_count == 0, ("page %p is held", m));
1928	KASSERT(!vm_page_busied(m), ("page %p is busy", m));
1929	KASSERT(m->dirty == 0, ("page %p is dirty", m));
1930	KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT,
1931	    ("page %p has unexpected memattr %d",
1932	    m, pmap_page_get_memattr(m)));
1933	KASSERT(m->valid == 0, ("free page %p is valid", m));
1934}
1935
1936/*
1937 * 	vm_page_alloc_freelist:
1938 *
1939 *	Allocate a physical page from the specified free page list.
1940 *
1941 *	The caller must always specify an allocation class.
1942 *
1943 *	allocation classes:
1944 *	VM_ALLOC_NORMAL		normal process request
1945 *	VM_ALLOC_SYSTEM		system *really* needs a page
1946 *	VM_ALLOC_INTERRUPT	interrupt time request
1947 *
1948 *	optional allocation flags:
1949 *	VM_ALLOC_COUNT(number)	the number of additional pages that the caller
1950 *				intends to allocate
1951 *	VM_ALLOC_WIRED		wire the allocated page
1952 *	VM_ALLOC_ZERO		prefer a zeroed page
1953 *
1954 *	This routine may not sleep.
1955 */
1956vm_page_t
1957vm_page_alloc_freelist(int flind, int req)
1958{
1959	vm_page_t m;
1960	u_int flags;
1961	int req_class;
1962
1963	req_class = req & VM_ALLOC_CLASS_MASK;
1964
1965	/*
1966	 * The page daemon is allowed to dig deeper into the free page list.
1967	 */
1968	if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT)
1969		req_class = VM_ALLOC_SYSTEM;
1970
1971	/*
1972	 * Do not allocate reserved pages unless the allocation class allows it.
1973	 */
1974	mtx_lock(&vm_page_queue_free_mtx);
1975	if (vm_cnt.v_free_count > vm_cnt.v_free_reserved ||
1976	    (req_class == VM_ALLOC_SYSTEM &&
1977	    vm_cnt.v_free_count > vm_cnt.v_interrupt_free_min) ||
1978	    (req_class == VM_ALLOC_INTERRUPT &&
1979	    vm_cnt.v_free_count > 0))
1980		m = vm_phys_alloc_freelist_pages(flind, VM_FREEPOOL_DIRECT, 0);
1981	else {
1982		mtx_unlock(&vm_page_queue_free_mtx);
1983		atomic_add_int(&vm_pageout_deficit,
1984		    max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1));
1985		pagedaemon_wakeup();
1986		return (NULL);
1987	}
1988	if (m == NULL) {
1989		mtx_unlock(&vm_page_queue_free_mtx);
1990		return (NULL);
1991	}
1992	vm_phys_freecnt_adj(m, -1);
1993	if ((m->flags & PG_ZERO) != 0)
1994		vm_page_zero_count--;
1995	mtx_unlock(&vm_page_queue_free_mtx);
1996	vm_page_alloc_check(m);
1997
1998	/*
1999	 * Initialize the page.  Only the PG_ZERO flag is inherited.
2000	 */
2001	m->aflags = 0;
2002	flags = 0;
2003	if ((req & VM_ALLOC_ZERO) != 0)
2004		flags = PG_ZERO;
2005	m->flags &= flags;
2006	if ((req & VM_ALLOC_WIRED) != 0) {
2007		/*
2008		 * The page lock is not required for wiring a page that does
2009		 * not belong to an object.
2010		 */
2011		atomic_add_int(&vm_cnt.v_wire_count, 1);
2012		m->wire_count = 1;
2013	}
2014	/* Unmanaged pages don't use "act_count". */
2015	m->oflags = VPO_UNMANAGED;
2016	if (vm_paging_needed())
2017		pagedaemon_wakeup();
2018	return (m);
2019}
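
/*
 * Usage sketch: allocating a single wired page directly from one free page
 * list.  The freelist index is machine-dependent; VM_FREELIST_DEFAULT (from
 * the MD vmparam.h) is assumed here for illustration only.
 */
#if 0
	vm_page_t m;

	m = vm_page_alloc_freelist(VM_FREELIST_DEFAULT,
	    VM_ALLOC_NORMAL | VM_ALLOC_WIRED);
	if (m == NULL)
		VM_WAIT;	/* Hypothetical caller policy: wait, then retry. */
#endif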
2020
2021#define	VPSC_ANY	0	/* No restrictions. */
2022#define	VPSC_NORESERV	1	/* Skip reservations; implies VPSC_NOSUPER. */
2023#define	VPSC_NOSUPER	2	/* Skip superpages. */
2024
2025/*
2026 *	vm_page_scan_contig:
2027 *
2028 *	Scan vm_page_array[] between the specified entries "m_start" and
2029 *	"m_end" for a run of contiguous physical pages that satisfy the
2030 *	specified conditions, and return the lowest page in the run.  The
2031 *	specified "alignment" determines the alignment of the lowest physical
2032 *	page in the run.  If the specified "boundary" is non-zero, then the
2033 *	run of physical pages cannot span a physical address that is a
2034 *	multiple of "boundary".
2035 *
2036 *	"m_end" is never dereferenced, so it need not point to a vm_page
2037 *	structure within vm_page_array[].
2038 *
2039 *	"npages" must be greater than zero.  "m_start" and "m_end" must not
2040 *	span a hole (or discontiguity) in the physical address space.  Both
2041 *	"alignment" and "boundary" must be powers of two.
2042 */
2043vm_page_t
2044vm_page_scan_contig(u_long npages, vm_page_t m_start, vm_page_t m_end,
2045    u_long alignment, vm_paddr_t boundary, int options)
2046{
2047	struct mtx *m_mtx;
2048	vm_object_t object;
2049	vm_paddr_t pa;
2050	vm_page_t m, m_run;
2051#if VM_NRESERVLEVEL > 0
2052	int level;
2053#endif
2054	int m_inc, order, run_ext, run_len;
2055
2056	KASSERT(npages > 0, ("npages is 0"));
2057	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
2058	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
2059	m_run = NULL;
2060	run_len = 0;
2061	m_mtx = NULL;
2062	for (m = m_start; m < m_end && run_len < npages; m += m_inc) {
2063		KASSERT((m->flags & PG_MARKER) == 0,
2064		    ("page %p is PG_MARKER", m));
2065		KASSERT((m->flags & PG_FICTITIOUS) == 0 || m->wire_count == 1,
2066		    ("fictitious page %p has invalid wire count", m));
2067
2068		/*
2069		 * If the current page would be the start of a run, check its
2070		 * physical address against the end, alignment, and boundary
2071		 * conditions.  If it doesn't satisfy these conditions, either
2072		 * terminate the scan or advance to the next page that
2073		 * satisfies the failed condition.
2074		 */
2075		if (run_len == 0) {
2076			KASSERT(m_run == NULL, ("m_run != NULL"));
2077			if (m + npages > m_end)
2078				break;
2079			pa = VM_PAGE_TO_PHYS(m);
2080			if ((pa & (alignment - 1)) != 0) {
2081				m_inc = atop(roundup2(pa, alignment) - pa);
2082				continue;
2083			}
2084			if (rounddown2(pa ^ (pa + ptoa(npages) - 1),
2085			    boundary) != 0) {
2086				m_inc = atop(roundup2(pa, boundary) - pa);
2087				continue;
2088			}
2089		} else
2090			KASSERT(m_run != NULL, ("m_run == NULL"));
2091
2092		vm_page_change_lock(m, &m_mtx);
2093		m_inc = 1;
2094retry:
2095		if (m->wire_count != 0 || m->hold_count != 0)
2096			run_ext = 0;
2097#if VM_NRESERVLEVEL > 0
2098		else if ((level = vm_reserv_level(m)) >= 0 &&
2099		    (options & VPSC_NORESERV) != 0) {
2100			run_ext = 0;
2101			/* Advance to the end of the reservation. */
2102			pa = VM_PAGE_TO_PHYS(m);
2103			m_inc = atop(roundup2(pa + 1, vm_reserv_size(level)) -
2104			    pa);
2105		}
2106#endif
2107		else if ((object = m->object) != NULL) {
2108			/*
2109			 * The page is considered eligible for relocation if
2110			 * and only if it could be laundered or reclaimed by
2111			 * the page daemon.
2112			 */
2113			if (!VM_OBJECT_TRYRLOCK(object)) {
2114				mtx_unlock(m_mtx);
2115				VM_OBJECT_RLOCK(object);
2116				mtx_lock(m_mtx);
2117				if (m->object != object) {
2118					/*
2119					 * The page may have been freed.
2120					 */
2121					VM_OBJECT_RUNLOCK(object);
2122					goto retry;
2123				} else if (m->wire_count != 0 ||
2124				    m->hold_count != 0) {
2125					run_ext = 0;
2126					goto unlock;
2127				}
2128			}
2129			KASSERT((m->flags & PG_UNHOLDFREE) == 0,
2130			    ("page %p is PG_UNHOLDFREE", m));
2131			/* Don't care: PG_NODUMP, PG_ZERO. */
2132			if (object->type != OBJT_DEFAULT &&
2133			    object->type != OBJT_SWAP &&
2134			    object->type != OBJT_VNODE) {
2135				run_ext = 0;
2136#if VM_NRESERVLEVEL > 0
2137			} else if ((options & VPSC_NOSUPER) != 0 &&
2138			    (level = vm_reserv_level_iffullpop(m)) >= 0) {
2139				run_ext = 0;
2140				/* Advance to the end of the superpage. */
2141				pa = VM_PAGE_TO_PHYS(m);
2142				m_inc = atop(roundup2(pa + 1,
2143				    vm_reserv_size(level)) - pa);
2144#endif
2145			} else if (object->memattr == VM_MEMATTR_DEFAULT &&
2146			    m->queue != PQ_NONE && !vm_page_busied(m)) {
2147				/*
2148				 * The page is allocated but eligible for
2149				 * relocation.  Extend the current run by one
2150				 * page.
2151				 */
2152				KASSERT(pmap_page_get_memattr(m) ==
2153				    VM_MEMATTR_DEFAULT,
2154				    ("page %p has an unexpected memattr", m));
2155				KASSERT((m->oflags & (VPO_SWAPINPROG |
2156				    VPO_SWAPSLEEP | VPO_UNMANAGED)) == 0,
2157				    ("page %p has unexpected oflags", m));
2158				/* Don't care: VPO_NOSYNC. */
2159				run_ext = 1;
2160			} else
2161				run_ext = 0;
2162unlock:
2163			VM_OBJECT_RUNLOCK(object);
2164#if VM_NRESERVLEVEL > 0
2165		} else if (level >= 0) {
2166			/*
2167			 * The page is reserved but not yet allocated.  In
2168			 * other words, it is still free.  Extend the current
2169			 * run by one page.
2170			 */
2171			run_ext = 1;
2172#endif
2173		} else if ((order = m->order) < VM_NFREEORDER) {
2174			/*
2175			 * The page is enqueued in the physical memory
2176			 * allocator's free page queues.  Moreover, it is the
2177			 * first page in a power-of-two-sized run of
2178			 * contiguous free pages.  Add these pages to the end
2179			 * of the current run, and jump ahead.
2180			 */
2181			run_ext = 1 << order;
2182			m_inc = 1 << order;
2183		} else {
2184			/*
2185			 * Skip the page for one of the following reasons: (1)
2186			 * It is enqueued in the physical memory allocator's
2187			 * free page queues.  However, it is not the first
2188			 * page in a run of contiguous free pages.  (This case
2189			 * rarely occurs because the scan is performed in
2190			 * ascending order.) (2) It is not reserved, and it is
2191			 * transitioning from free to allocated.  (Conversely,
2192			 * the transition from allocated to free for managed
2193			 * pages is blocked by the page lock.) (3) It is
2194			 * allocated but not contained by an object and not
2195			 * wired, e.g., allocated by Xen's balloon driver.
2196			 */
2197			run_ext = 0;
2198		}
2199
2200		/*
2201		 * Extend or reset the current run of pages.
2202		 */
2203		if (run_ext > 0) {
2204			if (run_len == 0)
2205				m_run = m;
2206			run_len += run_ext;
2207		} else {
2208			if (run_len > 0) {
2209				m_run = NULL;
2210				run_len = 0;
2211			}
2212		}
2213	}
2214	if (m_mtx != NULL)
2215		mtx_unlock(m_mtx);
2216	if (run_len >= npages)
2217		return (m_run);
2218	return (NULL);
2219}
2220
2221/*
2222 *	vm_page_reclaim_run:
2223 *
2224 *	Try to relocate each of the allocated virtual pages within the
2225 *	specified run of physical pages to a new physical address.  Free the
2226 *	physical pages underlying the relocated virtual pages.  A virtual page
2227 *	is relocatable if and only if it could be laundered or reclaimed by
2228 *	the page daemon.  Whenever possible, a virtual page is relocated to a
2229 *	physical address above "high".
2230 *
2231 *	Returns 0 if every physical page within the run was already free or
2232 *	just freed by a successful relocation.  Otherwise, returns a non-zero
2233 *	value indicating why the last attempt to relocate a virtual page was
2234 *	unsuccessful.
2235 *
2236 *	"req_class" must be an allocation class.
2237 */
2238static int
2239vm_page_reclaim_run(int req_class, u_long npages, vm_page_t m_run,
2240    vm_paddr_t high)
2241{
2242	struct mtx *m_mtx;
2243	struct spglist free;
2244	vm_object_t object;
2245	vm_paddr_t pa;
2246	vm_page_t m, m_end, m_new;
2247	int error, order, req;
2248
2249	KASSERT((req_class & VM_ALLOC_CLASS_MASK) == req_class,
2250	    ("req_class is not an allocation class"));
2251	SLIST_INIT(&free);
2252	error = 0;
2253	m = m_run;
2254	m_end = m_run + npages;
2255	m_mtx = NULL;
2256	for (; error == 0 && m < m_end; m++) {
2257		KASSERT((m->flags & (PG_FICTITIOUS | PG_MARKER)) == 0,
2258		    ("page %p is PG_FICTITIOUS or PG_MARKER", m));
2259
2260		/*
2261		 * Avoid releasing and reacquiring the same page lock.
2262		 */
2263		vm_page_change_lock(m, &m_mtx);
2264retry:
2265		if (m->wire_count != 0 || m->hold_count != 0)
2266			error = EBUSY;
2267		else if ((object = m->object) != NULL) {
2268			/*
2269			 * The page is relocated if and only if it could be
2270			 * laundered or reclaimed by the page daemon.
2271			 */
2272			if (!VM_OBJECT_TRYWLOCK(object)) {
2273				mtx_unlock(m_mtx);
2274				VM_OBJECT_WLOCK(object);
2275				mtx_lock(m_mtx);
2276				if (m->object != object) {
2277					/*
2278					 * The page may have been freed.
2279					 */
2280					VM_OBJECT_WUNLOCK(object);
2281					goto retry;
2282				} else if (m->wire_count != 0 ||
2283				    m->hold_count != 0) {
2284					error = EBUSY;
2285					goto unlock;
2286				}
2287			}
2288			KASSERT((m->flags & PG_UNHOLDFREE) == 0,
2289			    ("page %p is PG_UNHOLDFREE", m));
2290			/* Don't care: PG_NODUMP, PG_ZERO. */
2291			if (object->type != OBJT_DEFAULT &&
2292			    object->type != OBJT_SWAP &&
2293			    object->type != OBJT_VNODE)
2294				error = EINVAL;
2295			else if (object->memattr != VM_MEMATTR_DEFAULT)
2296				error = EINVAL;
2297			else if (m->queue != PQ_NONE && !vm_page_busied(m)) {
2298				KASSERT(pmap_page_get_memattr(m) ==
2299				    VM_MEMATTR_DEFAULT,
2300				    ("page %p has an unexpected memattr", m));
2301				KASSERT((m->oflags & (VPO_SWAPINPROG |
2302				    VPO_SWAPSLEEP | VPO_UNMANAGED)) == 0,
2303				    ("page %p has unexpected oflags", m));
2304				/* Don't care: VPO_NOSYNC. */
2305				if (m->valid != 0) {
2306					/*
2307					 * First, try to allocate a new page
2308					 * that is above "high".  Failing
2309					 * that, try to allocate a new page
2310					 * that is below "m_run".  Allocate
2311					 * the new page between the end of
2312					 * "m_run" and "high" only as a last
2313					 * resort.
2314					 */
2315					req = req_class | VM_ALLOC_NOOBJ;
2316					if ((m->flags & PG_NODUMP) != 0)
2317						req |= VM_ALLOC_NODUMP;
2318					if (trunc_page(high) !=
2319					    ~(vm_paddr_t)PAGE_MASK) {
2320						m_new = vm_page_alloc_contig(
2321						    NULL, 0, req, 1,
2322						    round_page(high),
2323						    ~(vm_paddr_t)0,
2324						    PAGE_SIZE, 0,
2325						    VM_MEMATTR_DEFAULT);
2326					} else
2327						m_new = NULL;
2328					if (m_new == NULL) {
2329						pa = VM_PAGE_TO_PHYS(m_run);
2330						m_new = vm_page_alloc_contig(
2331						    NULL, 0, req, 1,
2332						    0, pa - 1, PAGE_SIZE, 0,
2333						    VM_MEMATTR_DEFAULT);
2334					}
2335					if (m_new == NULL) {
2336						pa += ptoa(npages);
2337						m_new = vm_page_alloc_contig(
2338						    NULL, 0, req, 1,
2339						    pa, high, PAGE_SIZE, 0,
2340						    VM_MEMATTR_DEFAULT);
2341					}
2342					if (m_new == NULL) {
2343						error = ENOMEM;
2344						goto unlock;
2345					}
2346					KASSERT(m_new->wire_count == 0,
2347					    ("page %p is wired", m));
2348
2349					/*
2350					 * Replace "m" with the new page.  For
2351					 * vm_page_replace(), "m" must be busy
2352					 * and dequeued.  Finally, change "m"
2353					 * as if vm_page_free() was called.
2354					 */
2355					if (object->ref_count != 0)
2356						pmap_remove_all(m);
2357					m_new->aflags = m->aflags;
2358					KASSERT(m_new->oflags == VPO_UNMANAGED,
2359					    ("page %p is managed", m));
2360					m_new->oflags = m->oflags & VPO_NOSYNC;
2361					pmap_copy_page(m, m_new);
2362					m_new->valid = m->valid;
2363					m_new->dirty = m->dirty;
2364					m->flags &= ~PG_ZERO;
2365					vm_page_xbusy(m);
2366					vm_page_remque(m);
2367					vm_page_replace_checked(m_new, object,
2368					    m->pindex, m);
2369					m->valid = 0;
2370					vm_page_undirty(m);
2371
2372					/*
2373					 * The new page must be deactivated
2374					 * before the object is unlocked.
2375					 */
2376					vm_page_change_lock(m_new, &m_mtx);
2377					vm_page_deactivate(m_new);
2378				} else {
2379					m->flags &= ~PG_ZERO;
2380					vm_page_remque(m);
2381					vm_page_remove(m);
2382					KASSERT(m->dirty == 0,
2383					    ("page %p is dirty", m));
2384				}
2385				SLIST_INSERT_HEAD(&free, m, plinks.s.ss);
2386			} else
2387				error = EBUSY;
2388unlock:
2389			VM_OBJECT_WUNLOCK(object);
2390		} else {
2391			mtx_lock(&vm_page_queue_free_mtx);
2392			order = m->order;
2393			if (order < VM_NFREEORDER) {
2394				/*
2395				 * The page is enqueued in the physical memory
2396				 * allocator's free page queues.  Moreover, it
2397				 * is the first page in a power-of-two-sized
2398				 * run of contiguous free pages.  Jump ahead
2399				 * to the last page within that run, and
2400				 * continue from there.
2401				 */
2402				m += (1 << order) - 1;
2403			}
2404#if VM_NRESERVLEVEL > 0
2405			else if (vm_reserv_is_page_free(m))
2406				order = 0;
2407#endif
2408			mtx_unlock(&vm_page_queue_free_mtx);
2409			if (order == VM_NFREEORDER)
2410				error = EINVAL;
2411		}
2412	}
2413	if (m_mtx != NULL)
2414		mtx_unlock(m_mtx);
2415	if ((m = SLIST_FIRST(&free)) != NULL) {
2416		mtx_lock(&vm_page_queue_free_mtx);
2417		do {
2418			SLIST_REMOVE_HEAD(&free, plinks.s.ss);
2419			vm_page_free_phys(m);
2420		} while ((m = SLIST_FIRST(&free)) != NULL);
2421		vm_page_zero_idle_wakeup();
2422		vm_page_free_wakeup();
2423		mtx_unlock(&vm_page_queue_free_mtx);
2424	}
2425	return (error);
2426}
2427
2428#define	NRUNS	16
2429
2430CTASSERT(powerof2(NRUNS));
2431
2432#define	RUN_INDEX(count)	((count) & (NRUNS - 1))
2433
2434#define	MIN_RECLAIM	8
2435
2436/*
2437 *	vm_page_reclaim_contig:
2438 *
2439 *	Reclaim allocated, contiguous physical memory satisfying the specified
2440 *	conditions by relocating the virtual pages using that physical memory.
2441 *	Returns true if reclamation is successful and false otherwise.  Since
2442 *	relocation requires the allocation of physical pages, reclamation may
2443 *	fail due to a shortage of free pages.  When reclamation fails, callers
2444 *	are expected to perform VM_WAIT before retrying a failed allocation
2445 *	operation, e.g., vm_page_alloc_contig().
2446 *
2447 *	The caller must always specify an allocation class through "req".
2448 *
2449 *	allocation classes:
2450 *	VM_ALLOC_NORMAL		normal process request
2451 *	VM_ALLOC_SYSTEM		system *really* needs a page
2452 *	VM_ALLOC_INTERRUPT	interrupt time request
2453 *
2454 *	The optional allocation flags are ignored.
2455 *
2456 *	"npages" must be greater than zero.  Both "alignment" and "boundary"
2457 *	must be powers of two.
2458 */
2459bool
2460vm_page_reclaim_contig(int req, u_long npages, vm_paddr_t low, vm_paddr_t high,
2461    u_long alignment, vm_paddr_t boundary)
2462{
2463	vm_paddr_t curr_low;
2464	vm_page_t m_run, m_runs[NRUNS];
2465	u_long count, reclaimed;
2466	int error, i, options, req_class;
2467
2468	KASSERT(npages > 0, ("npages is 0"));
2469	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
2470	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
2471	req_class = req & VM_ALLOC_CLASS_MASK;
2472
2473	/*
2474	 * The page daemon is allowed to dig deeper into the free page list.
2475	 */
2476	if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT)
2477		req_class = VM_ALLOC_SYSTEM;
2478
2479	/*
2480	 * Return if the number of free pages cannot satisfy the requested
2481	 * allocation.
2482	 */
2483	count = vm_cnt.v_free_count;
2484	if (count < npages + vm_cnt.v_free_reserved || (count < npages +
2485	    vm_cnt.v_interrupt_free_min && req_class == VM_ALLOC_SYSTEM) ||
2486	    (count < npages && req_class == VM_ALLOC_INTERRUPT))
2487		return (false);
2488
2489	/*
2490	 * Scan up to three times, relaxing the restrictions ("options") on
2491	 * the reclamation of reservations and superpages each time.
2492	 */
2493	for (options = VPSC_NORESERV;;) {
2494		/*
2495		 * Find the highest runs that satisfy the given constraints
2496		 * and restrictions, and record them in "m_runs".
2497		 */
2498		curr_low = low;
2499		count = 0;
2500		for (;;) {
2501			m_run = vm_phys_scan_contig(npages, curr_low, high,
2502			    alignment, boundary, options);
2503			if (m_run == NULL)
2504				break;
2505			curr_low = VM_PAGE_TO_PHYS(m_run) + ptoa(npages);
2506			m_runs[RUN_INDEX(count)] = m_run;
2507			count++;
2508		}
2509
2510		/*
2511		 * Reclaim the highest runs in LIFO (descending) order until
2512		 * the number of reclaimed pages, "reclaimed", is at least
2513		 * MIN_RECLAIM.  Reset "reclaimed" each time because each
2514		 * reclamation is idempotent, and runs will (likely) recur
2515		 * from one scan to the next as restrictions are relaxed.
2516		 */
2517		reclaimed = 0;
2518		for (i = 0; count > 0 && i < NRUNS; i++) {
2519			count--;
2520			m_run = m_runs[RUN_INDEX(count)];
2521			error = vm_page_reclaim_run(req_class, npages, m_run,
2522			    high);
2523			if (error == 0) {
2524				reclaimed += npages;
2525				if (reclaimed >= MIN_RECLAIM)
2526					return (true);
2527			}
2528		}
2529
2530		/*
2531		 * Either relax the restrictions on the next scan or return if
2532		 * the last scan had no restrictions.
2533		 */
2534		if (options == VPSC_NORESERV)
2535			options = VPSC_NOSUPER;
2536		else if (options == VPSC_NOSUPER)
2537			options = VPSC_ANY;
2538		else if (options == VPSC_ANY)
2539			return (reclaimed != 0);
2540	}
2541}
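
/*
 * Usage sketch of the retry protocol described above: a hypothetical caller
 * needing "npages" contiguous pages without a backing object, where "low",
 * "high", "alignment", and "boundary" are that caller's own constraints.
 */
#if 0
	vm_page_t m_ret;

	for (;;) {
		m_ret = vm_page_alloc_contig(NULL, 0,
		    VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ, npages, low, high,
		    alignment, boundary, VM_MEMATTR_DEFAULT);
		if (m_ret != NULL)
			break;
		if (!vm_page_reclaim_contig(VM_ALLOC_NORMAL, npages, low,
		    high, alignment, boundary))
			VM_WAIT;
	}
#endif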
2542
2543/*
2544 *	vm_wait:	(also see VM_WAIT macro)
2545 *
2546 *	Sleep until free pages are available for allocation.
2547 *	- Called in various places before memory allocations.
2548 */
2549void
2550vm_wait(void)
2551{
2552
2553	mtx_lock(&vm_page_queue_free_mtx);
2554	if (curproc == pageproc) {
2555		vm_pageout_pages_needed = 1;
2556		msleep(&vm_pageout_pages_needed, &vm_page_queue_free_mtx,
2557		    PDROP | PSWP, "VMWait", 0);
2558	} else {
2559		if (__predict_false(pageproc == NULL))
2560			panic("vm_wait in early boot");
2561		if (!vm_pageout_wanted) {
2562			vm_pageout_wanted = true;
2563			wakeup(&vm_pageout_wanted);
2564		}
2565		vm_pages_needed = true;
2566		msleep(&vm_cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | PVM,
2567		    "vmwait", 0);
2568	}
2569}
2570
2571/*
2572 *	vm_waitpfault:	(also see VM_WAITPFAULT macro)
2573 *
2574 *	Sleep until free pages are available for allocation.
2575 *	- Called only in vm_fault so that processes page faulting
2576 *	  can be easily tracked.
2577 *	- Sleeps at a lower priority than vm_wait() so that vm_wait()ing
2578 *	  processes will be able to grab memory first.  Do not change
2579 *	  this balance without careful testing first.
2580 */
2581void
2582vm_waitpfault(void)
2583{
2584
2585	mtx_lock(&vm_page_queue_free_mtx);
2586	if (!vm_pageout_wanted) {
2587		vm_pageout_wanted = true;
2588		wakeup(&vm_pageout_wanted);
2589	}
2590	vm_pages_needed = true;
2591	msleep(&vm_cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | PUSER,
2592	    "pfault", 0);
2593}
2594
2595struct vm_pagequeue *
2596vm_page_pagequeue(vm_page_t m)
2597{
2598
2599	if (vm_page_in_laundry(m))
2600		return (&vm_dom[0].vmd_pagequeues[m->queue]);
2601	else
2602		return (&vm_phys_domain(m)->vmd_pagequeues[m->queue]);
2603}
2604
2605/*
2606 *	vm_page_dequeue:
2607 *
2608 *	Remove the given page from its current page queue.
2609 *
2610 *	The page must be locked.
2611 */
2612void
2613vm_page_dequeue(vm_page_t m)
2614{
2615	struct vm_pagequeue *pq;
2616
2617	vm_page_assert_locked(m);
2618	KASSERT(m->queue < PQ_COUNT, ("vm_page_dequeue: page %p is not queued",
2619	    m));
2620	pq = vm_page_pagequeue(m);
2621	vm_pagequeue_lock(pq);
2622	m->queue = PQ_NONE;
2623	TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
2624	vm_pagequeue_cnt_dec(pq);
2625	vm_pagequeue_unlock(pq);
2626}
2627
2628/*
2629 *	vm_page_dequeue_locked:
2630 *
2631 *	Remove the given page from its current page queue.
2632 *
2633 *	The page and page queue must be locked.
2634 */
2635void
2636vm_page_dequeue_locked(vm_page_t m)
2637{
2638	struct vm_pagequeue *pq;
2639
2640	vm_page_lock_assert(m, MA_OWNED);
2641	pq = vm_page_pagequeue(m);
2642	vm_pagequeue_assert_locked(pq);
2643	m->queue = PQ_NONE;
2644	TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
2645	vm_pagequeue_cnt_dec(pq);
2646}
2647
2648/*
2649 *	vm_page_enqueue:
2650 *
2651 *	Add the given page to the specified page queue.
2652 *
2653 *	The page must be locked.
2654 */
2655static void
2656vm_page_enqueue(uint8_t queue, vm_page_t m)
2657{
2658	struct vm_pagequeue *pq;
2659
2660	vm_page_lock_assert(m, MA_OWNED);
2661	KASSERT(queue < PQ_COUNT,
2662	    ("vm_page_enqueue: invalid queue %u request for page %p",
2663	    queue, m));
2664	if (queue == PQ_LAUNDRY)
2665		pq = &vm_dom[0].vmd_pagequeues[queue];
2666	else
2667		pq = &vm_phys_domain(m)->vmd_pagequeues[queue];
2668	vm_pagequeue_lock(pq);
2669	m->queue = queue;
2670	TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
2671	vm_pagequeue_cnt_inc(pq);
2672	vm_pagequeue_unlock(pq);
2673}
2674
2675/*
2676 *	vm_page_requeue:
2677 *
2678 *	Move the given page to the tail of its current page queue.
2679 *
2680 *	The page must be locked.
2681 */
2682void
2683vm_page_requeue(vm_page_t m)
2684{
2685	struct vm_pagequeue *pq;
2686
2687	vm_page_lock_assert(m, MA_OWNED);
2688	KASSERT(m->queue != PQ_NONE,
2689	    ("vm_page_requeue: page %p is not queued", m));
2690	pq = vm_page_pagequeue(m);
2691	vm_pagequeue_lock(pq);
2692	TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
2693	TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
2694	vm_pagequeue_unlock(pq);
2695}
2696
2697/*
2698 *	vm_page_requeue_locked:
2699 *
2700 *	Move the given page to the tail of its current page queue.
2701 *
2702 *	The page queue must be locked.
2703 */
2704void
2705vm_page_requeue_locked(vm_page_t m)
2706{
2707	struct vm_pagequeue *pq;
2708
2709	KASSERT(m->queue != PQ_NONE,
2710	    ("vm_page_requeue_locked: page %p is not queued", m));
2711	pq = vm_page_pagequeue(m);
2712	vm_pagequeue_assert_locked(pq);
2713	TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
2714	TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
2715}
2716
2717/*
2718 *	vm_page_activate:
2719 *
2720 *	Put the specified page on the active list (if appropriate).
2721 *	Ensure that act_count is at least ACT_INIT but do not otherwise
2722 *	mess with it.
2723 *
2724 *	The page must be locked.
2725 */
2726void
2727vm_page_activate(vm_page_t m)
2728{
2729	int queue;
2730
2731	vm_page_lock_assert(m, MA_OWNED);
2732	if ((queue = m->queue) != PQ_ACTIVE) {
2733		if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) {
2734			if (m->act_count < ACT_INIT)
2735				m->act_count = ACT_INIT;
2736			if (queue != PQ_NONE)
2737				vm_page_dequeue(m);
2738			vm_page_enqueue(PQ_ACTIVE, m);
2739		} else
2740			KASSERT(queue == PQ_NONE,
2741			    ("vm_page_activate: wired page %p is queued", m));
2742	} else {
2743		if (m->act_count < ACT_INIT)
2744			m->act_count = ACT_INIT;
2745	}
2746}
2747
2748/*
2749 *	vm_page_free_wakeup:
2750 *
2751 *	Helper routine for vm_page_free_toq().  This routine is called
2752 *	when a page is added to the free queues.
2753 *
2754 *	The page queues must be locked.
2755 */
2756static void
2757vm_page_free_wakeup(void)
2758{
2759
2760	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
2761	/*
2762	 * If the pageout daemon needs pages, then tell it that there are
2763	 * some free.
2764	 */
2765	if (vm_pageout_pages_needed &&
2766	    vm_cnt.v_free_count >= vm_cnt.v_pageout_free_min) {
2767		wakeup(&vm_pageout_pages_needed);
2768		vm_pageout_pages_needed = 0;
2769	}
2770	/*
2771	 * Wake up processes that are waiting on memory if we hit a
2772	 * high water mark, and wake up the scheduler process if we have
2773	 * lots of memory; that process will swap in processes.
2774	 */
2775	if (vm_pages_needed && !vm_page_count_min()) {
2776		vm_pages_needed = false;
2777		wakeup(&vm_cnt.v_free_count);
2778	}
2779}
2780
2781/*
2782 *	vm_page_free_prep:
2783 *
2784 *	Prepares the given page to be put on the free list,
2785 *	disassociating it from any VM object. The caller may return
2786 *	the page to the free list only if this function returns true.
2787 *
2788 *	The object must be locked.  The page must be locked if it is
2789 *	managed.  For a queued managed page, the pagequeue_locked
2790 *	argument specifies whether the page queue is already locked.
2791 */
2792bool
2793vm_page_free_prep(vm_page_t m, bool pagequeue_locked)
2794{
2795
2796#if defined(DIAGNOSTIC) && defined(PHYS_TO_DMAP)
2797	if ((m->flags & PG_ZERO) != 0) {
2798		uint64_t *p;
2799		int i;
2800		p = (uint64_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
2801		for (i = 0; i < PAGE_SIZE / sizeof(uint64_t); i++, p++)
2802			KASSERT(*p == 0, ("vm_page_free_prep %p PG_ZERO %d %jx",
2803			    m, i, (uintmax_t)*p));
2804	}
2805#endif
2806	if ((m->oflags & VPO_UNMANAGED) == 0) {
2807		vm_page_lock_assert(m, MA_OWNED);
2808		KASSERT(!pmap_page_is_mapped(m),
2809		    ("vm_page_free_toq: freeing mapped page %p", m));
2810	} else
2811		KASSERT(m->queue == PQ_NONE,
2812		    ("vm_page_free_toq: unmanaged page %p is queued", m));
2813	PCPU_INC(cnt.v_tfree);
2814
2815	if (vm_page_sbusied(m))
2816		panic("vm_page_free: freeing busy page %p", m);
2817
2818	/*
2819	 * Unqueue, then remove page.  Note that we cannot destroy
2820	 * the page here because we do not want to call the pager's
2821	 * callback routine until after we've put the page on the
2822	 * appropriate free queue.
2823	 */
2824	if (m->queue != PQ_NONE) {
2825		if (pagequeue_locked)
2826			vm_page_dequeue_locked(m);
2827		else
2828			vm_page_dequeue(m);
2829	}
2830	vm_page_remove(m);
2831
2832	/*
2833	 * If the page is fictitious, remove the object association and
2834	 * return; otherwise, delay the removal of the object association.
2835	 */
2836	if ((m->flags & PG_FICTITIOUS) != 0)
2837		return (false);
2838
2839	m->valid = 0;
2840	vm_page_undirty(m);
2841
2842	if (m->wire_count != 0)
2843		panic("vm_page_free: freeing wired page %p", m);
2844	if (m->hold_count != 0) {
2845		m->flags &= ~PG_ZERO;
2846		KASSERT((m->flags & PG_UNHOLDFREE) == 0,
2847		    ("vm_page_free: freeing PG_UNHOLDFREE page %p", m));
2848		m->flags |= PG_UNHOLDFREE;
2849		return (false);
2850	}
2851
2852	/*
2853	 * Restore the default memory attribute to the page.
2854	 */
2855	if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT)
2856		pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT);
2857
2858	return (true);
2859}
2860
2861/*
2862 * Insert the page into the physical memory allocator's free page
2863 * queues.  This is the last step to free a page.
2864 */
2865static void
2866vm_page_free_phys(vm_page_t m)
2867{
2868
2869	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
2870
2871	vm_phys_freecnt_adj(m, 1);
2872#if VM_NRESERVLEVEL > 0
2873	if (!vm_reserv_free_page(m))
2874#endif
2875			vm_phys_free_pages(m, 0);
2876	if ((m->flags & PG_ZERO) != 0)
2877		++vm_page_zero_count;
2878	else
2879		vm_page_zero_idle_wakeup();
2880}
2881
2882void
2883vm_page_free_phys_pglist(struct pglist *tq)
2884{
2885	vm_page_t m;
2886
2887	if (TAILQ_EMPTY(tq))
2888		return;
2889	mtx_lock(&vm_page_queue_free_mtx);
2890	TAILQ_FOREACH(m, tq, listq)
2891		vm_page_free_phys(m);
2892	vm_page_free_wakeup();
2893	mtx_unlock(&vm_page_queue_free_mtx);
2894}
2895
2896/*
2897 *	vm_page_free_toq:
2898 *
2899 *	Returns the given page to the free list, disassociating it
2900 *	from any VM object.
2901 *
2902 *	The object must be locked.  The page must be locked if it is
2903 *	managed.
2904 */
2905void
2906vm_page_free_toq(vm_page_t m)
2907{
2908
2909	if (!vm_page_free_prep(m, false))
2910		return;
2911	mtx_lock(&vm_page_queue_free_mtx);
2912	vm_page_free_phys(m);
2913	vm_page_free_wakeup();
2914	mtx_unlock(&vm_page_queue_free_mtx);
2915}
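
/*
 * Usage sketch: freeing a managed page "m" through the vm_page_free()
 * wrapper, which ends up in vm_page_free_toq().  "obj" is assumed to be
 * the page's object; both the object and the page must be locked.
 */
#if 0
	VM_OBJECT_WLOCK(obj);
	vm_page_lock(m);
	vm_page_free(m);
	vm_page_unlock(m);
	VM_OBJECT_WUNLOCK(obj);
#endif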
2916
2917/*
2918 *	vm_page_wire:
2919 *
2920 *	Mark this page as wired down by yet
2921 *	another map, removing it from paging queues
2922 *	as necessary.
2923 *
2924 *	If the page is fictitious, then its wire count must remain one.
2925 *
2926 *	The page must be locked.
2927 */
2928void
2929vm_page_wire(vm_page_t m)
2930{
2931
2932	/*
2933	 * Only bump the wire statistics if the page is not already wired,
2934	 * and only unqueue the page if it is on some queue (if it is unmanaged
2935	 * it is already off the queues).
2936	 */
2937	vm_page_lock_assert(m, MA_OWNED);
2938	if ((m->flags & PG_FICTITIOUS) != 0) {
2939		KASSERT(m->wire_count == 1,
2940		    ("vm_page_wire: fictitious page %p's wire count isn't one",
2941		    m));
2942		return;
2943	}
2944	if (m->wire_count == 0) {
2945		KASSERT((m->oflags & VPO_UNMANAGED) == 0 ||
2946		    m->queue == PQ_NONE,
2947		    ("vm_page_wire: unmanaged page %p is queued", m));
2948		vm_page_remque(m);
2949		atomic_add_int(&vm_cnt.v_wire_count, 1);
2950	}
2951	m->wire_count++;
2952	KASSERT(m->wire_count != 0, ("vm_page_wire: wire_count overflow m=%p", m));
2953}
2954
2955/*
2956 * vm_page_unwire:
2957 *
2958 * Release one wiring of the specified page, potentially allowing it to be
2959 * paged out.  Returns TRUE if the number of wirings transitions to zero and
2960 * FALSE otherwise.
2961 *
2962 * Only managed pages belonging to an object can be paged out.  If the number
2963 * of wirings transitions to zero and the page is eligible for page out, then
2964 * the page is added to the specified paging queue (unless PQ_NONE is
2965 * specified).
2966 *
2967 * If a page is fictitious, then its wire count must always be one.
2968 *
2969 * A managed page must be locked.
2970 */
2971boolean_t
2972vm_page_unwire(vm_page_t m, uint8_t queue)
2973{
2974
2975	KASSERT(queue < PQ_COUNT || queue == PQ_NONE,
2976	    ("vm_page_unwire: invalid queue %u request for page %p",
2977	    queue, m));
2978	if ((m->oflags & VPO_UNMANAGED) == 0)
2979		vm_page_assert_locked(m);
2980	if ((m->flags & PG_FICTITIOUS) != 0) {
2981		KASSERT(m->wire_count == 1,
2982	    ("vm_page_unwire: fictitious page %p's wire count isn't one", m));
2983		return (FALSE);
2984	}
2985	if (m->wire_count > 0) {
2986		m->wire_count--;
2987		if (m->wire_count == 0) {
2988			atomic_subtract_int(&vm_cnt.v_wire_count, 1);
2989			if ((m->oflags & VPO_UNMANAGED) == 0 &&
2990			    m->object != NULL && queue != PQ_NONE)
2991				vm_page_enqueue(queue, m);
2992			return (TRUE);
2993		} else
2994			return (FALSE);
2995	} else
2996		panic("vm_page_unwire: page %p's wire count is zero", m);
2997}
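
/*
 * Usage sketch: a hypothetical caller pinning a managed page "m" around an
 * I/O and then releasing its wiring, letting the page return to the active
 * queue once the last wiring goes away.
 */
#if 0
	vm_page_lock(m);
	vm_page_wire(m);
	vm_page_unlock(m);
	/* ... the page cannot be paged out while it is wired ... */
	vm_page_lock(m);
	if (vm_page_unwire(m, PQ_ACTIVE)) {
		/* The last wiring was released; the page was requeued. */
	}
	vm_page_unlock(m);
#endif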
2998
2999/*
3000 * Move the specified page to the inactive queue.
3001 *
3002 * Normally, "noreuse" is FALSE, resulting in LRU ordering of the inactive
3003 * queue.  However, setting "noreuse" to TRUE will accelerate the specified
3004 * page's reclamation, but it will not unmap the page from any address space.
3005 * This is implemented by inserting the page near the head of the inactive
3006 * queue, using a marker page to guide FIFO insertion ordering.
3007 *
3008 * The page must be locked.
3009 */
3010static inline void
3011_vm_page_deactivate(vm_page_t m, boolean_t noreuse)
3012{
3013	struct vm_pagequeue *pq;
3014	int queue;
3015
3016	vm_page_assert_locked(m);
3017
3018	/*
3019	 * Ignore if the page is already inactive, unless it is unlikely to be
3020	 * reactivated.
3021	 */
3022	if ((queue = m->queue) == PQ_INACTIVE && !noreuse)
3023		return;
3024	if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) {
3025		pq = &vm_phys_domain(m)->vmd_pagequeues[PQ_INACTIVE];
3026		/* Avoid multiple acquisitions of the inactive queue lock. */
3027		if (queue == PQ_INACTIVE) {
3028			vm_pagequeue_lock(pq);
3029			vm_page_dequeue_locked(m);
3030		} else {
3031			if (queue != PQ_NONE)
3032				vm_page_dequeue(m);
3033			vm_pagequeue_lock(pq);
3034		}
3035		m->queue = PQ_INACTIVE;
3036		if (noreuse)
3037			TAILQ_INSERT_BEFORE(&vm_phys_domain(m)->vmd_inacthead,
3038			    m, plinks.q);
3039		else
3040			TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
3041		vm_pagequeue_cnt_inc(pq);
3042		vm_pagequeue_unlock(pq);
3043	}
3044}
3045
3046/*
3047 * Move the specified page to the inactive queue.
3048 *
3049 * The page must be locked.
3050 */
3051void
3052vm_page_deactivate(vm_page_t m)
3053{
3054
3055	_vm_page_deactivate(m, FALSE);
3056}
3057
3058/*
3059 * Move the specified page to the inactive queue with the expectation
3060 * that it is unlikely to be reused.
3061 *
3062 * The page must be locked.
3063 */
3064void
3065vm_page_deactivate_noreuse(vm_page_t m)
3066{
3067
3068	_vm_page_deactivate(m, TRUE);
3069}
3070
3071/*
3072 * vm_page_launder
3073 *
3074 * 	Put a page in the laundry.
3075 */
3076void
3077vm_page_launder(vm_page_t m)
3078{
3079	int queue;
3080
3081	vm_page_assert_locked(m);
3082	if ((queue = m->queue) != PQ_LAUNDRY) {
3083		if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) {
3084			if (queue != PQ_NONE)
3085				vm_page_dequeue(m);
3086			vm_page_enqueue(PQ_LAUNDRY, m);
3087		} else
3088			KASSERT(queue == PQ_NONE,
3089			    ("wired page %p is queued", m));
3090	}
3091}
3092
3093/*
3094 * vm_page_try_to_free()
3095 *
3096 *	Attempt to free the page.  If we cannot free it, we do nothing.
3097 *	Returns true on success and false on failure.
3098 */
3099bool
3100vm_page_try_to_free(vm_page_t m)
3101{
3102
3103	vm_page_assert_locked(m);
3104	if (m->object != NULL)
3105		VM_OBJECT_ASSERT_WLOCKED(m->object);
3106	if (m->dirty != 0 || m->hold_count != 0 || m->wire_count != 0 ||
3107	    (m->oflags & VPO_UNMANAGED) != 0 || vm_page_busied(m))
3108		return (false);
3109	if (m->object != NULL && m->object->ref_count != 0) {
3110		pmap_remove_all(m);
3111		if (m->dirty != 0)
3112			return (false);
3113	}
3114	vm_page_free(m);
3115	return (true);
3116}
3117
3118/*
3119 * vm_page_advise
3120 *
3121 * 	Apply the specified advice to the given page.
3122 *
3123 *	The object and page must be locked.
3124 */
3125void
3126vm_page_advise(vm_page_t m, int advice)
3127{
3128
3129	vm_page_assert_locked(m);
3130	VM_OBJECT_ASSERT_WLOCKED(m->object);
3131	if (advice == MADV_FREE)
3132		/*
3133		 * Mark the page clean.  This will allow the page to be freed
3134		 * without first paging it out.  MADV_FREE pages are often
3135		 * quickly reused by malloc(3), so we do not do anything that
3136		 * would result in a page fault on a later access.
3137		 */
3138		vm_page_undirty(m);
3139	else if (advice != MADV_DONTNEED) {
3140		if (advice == MADV_WILLNEED)
3141			vm_page_activate(m);
3142		return;
3143	}
3144
3145	/*
3146	 * Clear any references to the page.  Otherwise, the page daemon will
3147	 * immediately reactivate the page.
3148	 */
3149	vm_page_aflag_clear(m, PGA_REFERENCED);
3150
3151	if (advice != MADV_FREE && m->dirty == 0 && pmap_is_modified(m))
3152		vm_page_dirty(m);
3153
3154	/*
3155	 * Place clean pages near the head of the inactive queue rather than
3156	 * the tail, thus defeating the queue's LRU operation and ensuring that
3157	 * the page will be reused quickly.  Dirty pages not already in the
3158	 * laundry are moved there.
3159	 */
3160	if (m->dirty == 0)
3161		vm_page_deactivate_noreuse(m);
3162	else
3163		vm_page_launder(m);
3164}
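
/*
 * Usage sketch: a hypothetical madvise(2)-style path applying MADV_FREE to
 * a page "m" of object "obj", with both locks held as asserted above.
 */
#if 0
	VM_OBJECT_WLOCK(obj);
	vm_page_lock(m);
	vm_page_advise(m, MADV_FREE);
	vm_page_unlock(m);
	VM_OBJECT_WUNLOCK(obj);
#endif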
3165
3166/*
3167 * Grab a page, waiting until we are woken up due to the page
3168 * changing state.  We keep on waiting as long as the page continues
3169 * to be in the object.  If the page doesn't exist, first allocate it
3170 * and then conditionally zero it.
3171 *
3172 * This routine may sleep.
3173 *
3174 * The object must be locked on entry.  The lock will, however, be released
3175 * and reacquired if the routine sleeps.
3176 */
3177vm_page_t
3178vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags)
3179{
3180	vm_page_t m;
3181	int sleep;
3182
3183	VM_OBJECT_ASSERT_WLOCKED(object);
3184	KASSERT((allocflags & VM_ALLOC_SBUSY) == 0 ||
3185	    (allocflags & VM_ALLOC_IGN_SBUSY) != 0,
3186	    ("vm_page_grab: VM_ALLOC_SBUSY/VM_ALLOC_IGN_SBUSY mismatch"));
3187retrylookup:
3188	if ((m = vm_page_lookup(object, pindex)) != NULL) {
3189		sleep = (allocflags & VM_ALLOC_IGN_SBUSY) != 0 ?
3190		    vm_page_xbusied(m) : vm_page_busied(m);
3191		if (sleep) {
3192			if ((allocflags & VM_ALLOC_NOWAIT) != 0)
3193				return (NULL);
3194			/*
3195			 * Reference the page before unlocking and
3196			 * sleeping so that the page daemon is less
3197			 * likely to reclaim it.
3198			 */
3199			vm_page_aflag_set(m, PGA_REFERENCED);
3200			vm_page_lock(m);
3201			VM_OBJECT_WUNLOCK(object);
3202			vm_page_busy_sleep(m, "pgrbwt", (allocflags &
3203			    VM_ALLOC_IGN_SBUSY) != 0);
3204			VM_OBJECT_WLOCK(object);
3205			goto retrylookup;
3206		} else {
3207			if ((allocflags & VM_ALLOC_WIRED) != 0) {
3208				vm_page_lock(m);
3209				vm_page_wire(m);
3210				vm_page_unlock(m);
3211			}
3212			if ((allocflags &
3213			    (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) == 0)
3214				vm_page_xbusy(m);
3215			if ((allocflags & VM_ALLOC_SBUSY) != 0)
3216				vm_page_sbusy(m);
3217			return (m);
3218		}
3219	}
3220	m = vm_page_alloc(object, pindex, allocflags);
3221	if (m == NULL) {
3222		if ((allocflags & VM_ALLOC_NOWAIT) != 0)
3223			return (NULL);
3224		VM_OBJECT_WUNLOCK(object);
3225		VM_WAIT;
3226		VM_OBJECT_WLOCK(object);
3227		goto retrylookup;
3228	}
3229	if (allocflags & VM_ALLOC_ZERO && (m->flags & PG_ZERO) == 0)
3230		pmap_zero_page(m);
3231	return (m);
3232}
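
/*
 * Usage sketch: a hypothetical caller grabbing (looking up or allocating)
 * the page at "pidx" in "obj".  With these flags the returned page is
 * exclusive busy and wired, and a newly allocated page has been zeroed;
 * the call may sleep unless VM_ALLOC_NOWAIT is added.
 */
#if 0
	vm_page_t m;

	VM_OBJECT_WLOCK(obj);
	m = vm_page_grab(obj, pidx,
	    VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_ZERO);
	vm_page_xunbusy(m);
	VM_OBJECT_WUNLOCK(obj);
#endif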
3233
3234/*
3235 * Return the specified range of pages from the given object.  For each
3236 * page offset within the range, if a page already exists within the object
3237 * at that offset and it is busy, then wait for it to change state.  If,
3238 * instead, the page doesn't exist, then allocate it.
3239 *
3240 * The caller must always specify an allocation class.
3241 *
3242 * allocation classes:
3243 *	VM_ALLOC_NORMAL		normal process request
3244 *	VM_ALLOC_SYSTEM		system *really* needs the pages
3245 *
3246 * The caller must always specify that the pages are to be busied and/or
3247 * wired.
3248 *
3249 * optional allocation flags:
3250 *	VM_ALLOC_IGN_SBUSY	do not sleep on soft busy pages
3251 *	VM_ALLOC_NOBUSY		do not exclusive busy the page
3252 *	VM_ALLOC_NOWAIT		do not sleep
3253 *	VM_ALLOC_SBUSY		set page to sbusy state
3254 *	VM_ALLOC_WIRED		wire the pages
3255 *	VM_ALLOC_ZERO		zero and validate any invalid pages
3256 *
3257 * If VM_ALLOC_NOWAIT is not specified, this routine may sleep.  Otherwise, it
3258 * may return a partial prefix of the requested range.
3259 */
3260int
3261vm_page_grab_pages(vm_object_t object, vm_pindex_t pindex, int allocflags,
3262    vm_page_t *ma, int count)
3263{
3264	vm_page_t m;
3265	int i;
3266	bool sleep;
3267
3268	VM_OBJECT_ASSERT_WLOCKED(object);
3269	KASSERT(((u_int)allocflags >> VM_ALLOC_COUNT_SHIFT) == 0,
3270	    ("vm_page_grab_pages: VM_ALLOC_COUNT() is not allowed"));
3271	KASSERT((allocflags & VM_ALLOC_NOBUSY) == 0 ||
3272	    (allocflags & VM_ALLOC_WIRED) != 0,
3273	    ("vm_page_grab_pages: the pages must be busied or wired"));
3274	KASSERT((allocflags & VM_ALLOC_SBUSY) == 0 ||
3275	    (allocflags & VM_ALLOC_IGN_SBUSY) != 0,
3276	    ("vm_page_grab_pages: VM_ALLOC_SBUSY/IGN_SBUSY mismatch"));
3277	if (count == 0)
3278		return (0);
3279	i = 0;
3280retrylookup:
3281	m = vm_page_lookup(object, pindex + i);
3282	for (; i < count; i++) {
3283		if (m != NULL) {
3284			sleep = (allocflags & VM_ALLOC_IGN_SBUSY) != 0 ?
3285			    vm_page_xbusied(m) : vm_page_busied(m);
3286			if (sleep) {
3287				if ((allocflags & VM_ALLOC_NOWAIT) != 0)
3288					break;
3289				/*
3290				 * Reference the page before unlocking and
3291				 * sleeping so that the page daemon is less
3292				 * likely to reclaim it.
3293				 */
3294				vm_page_aflag_set(m, PGA_REFERENCED);
3295				vm_page_lock(m);
3296				VM_OBJECT_WUNLOCK(object);
3297				vm_page_busy_sleep(m, "grbmaw", (allocflags &
3298				    VM_ALLOC_IGN_SBUSY) != 0);
3299				VM_OBJECT_WLOCK(object);
3300				goto retrylookup;
3301			}
3302			if ((allocflags & VM_ALLOC_WIRED) != 0) {
3303				vm_page_lock(m);
3304				vm_page_wire(m);
3305				vm_page_unlock(m);
3306			}
3307			if ((allocflags & (VM_ALLOC_NOBUSY |
3308			    VM_ALLOC_SBUSY)) == 0)
3309				vm_page_xbusy(m);
3310			if ((allocflags & VM_ALLOC_SBUSY) != 0)
3311				vm_page_sbusy(m);
3312		} else {
3313			m = vm_page_alloc(object, pindex + i, (allocflags &
3314			    ~VM_ALLOC_IGN_SBUSY) | VM_ALLOC_COUNT(count - i));
3315			if (m == NULL) {
3316				if ((allocflags & VM_ALLOC_NOWAIT) != 0)
3317					break;
3318				VM_OBJECT_WUNLOCK(object);
3319				VM_WAIT;
3320				VM_OBJECT_WLOCK(object);
3321				goto retrylookup;
3322			}
3323		}
3324		if (m->valid == 0 && (allocflags & VM_ALLOC_ZERO) != 0) {
3325			if ((m->flags & PG_ZERO) == 0)
3326				pmap_zero_page(m);
3327			m->valid = VM_PAGE_BITS_ALL;
3328		}
3329		ma[i] = m;
3330		m = vm_page_next(m);
3331	}
3332	return (i);
3333}
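
/*
 * Usage sketch: grabbing a short run of pages.  With VM_ALLOC_NOWAIT the
 * routine does not sleep and may return fewer pages than requested, so the
 * hypothetical caller checks the returned count.  "obj" and "pidx" are
 * assumptions of this example.
 */
#if 0
	vm_page_t ma[4];
	int count, i;

	VM_OBJECT_WLOCK(obj);
	count = vm_page_grab_pages(obj, pidx,
	    VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_NOWAIT, ma, 4);
	for (i = 0; i < count; i++)
		vm_page_xunbusy(ma[i]);
	VM_OBJECT_WUNLOCK(obj);
#endif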
3334
3335/*
3336 * Mapping function for valid or dirty bits in a page.
3337 *
3338 * Inputs are required to range within a page.
3339 */
3340vm_page_bits_t
3341vm_page_bits(int base, int size)
3342{
3343	int first_bit;
3344	int last_bit;
3345
3346	KASSERT(
3347	    base + size <= PAGE_SIZE,
3348	    ("vm_page_bits: illegal base/size %d/%d", base, size)
3349	);
3350
3351	if (size == 0)		/* handle degenerate case */
3352		return (0);
3353
3354	first_bit = base >> DEV_BSHIFT;
3355	last_bit = (base + size - 1) >> DEV_BSHIFT;
3356
3357	return (((vm_page_bits_t)2 << last_bit) -
3358	    ((vm_page_bits_t)1 << first_bit));
3359}
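
/*
 * Worked example: with DEV_BSIZE = 512 (DEV_BSHIFT = 9), vm_page_bits(0,
 * 1024) covers the first two 512-byte blocks: first_bit = 0, last_bit =
 * (1024 - 1) >> 9 = 1, so the result is (2 << 1) - (1 << 0) = 0x3.  For a
 * 4096-byte page, vm_page_bits(0, PAGE_SIZE) yields 0xff, i.e.,
 * VM_PAGE_BITS_ALL.
 */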
3360
3361/*
3362 *	vm_page_set_valid_range:
3363 *
3364 *	Sets portions of a page valid.  The arguments are expected
3365 *	to be DEV_BSIZE aligned, but if they aren't, the bitmap is inclusive
3366 *	of any partial chunks touched by the range.  The invalid portion of
3367 *	such chunks will be zeroed.
3368 *
3369 *	(base + size) must be less than or equal to PAGE_SIZE.
3370 */
3371void
3372vm_page_set_valid_range(vm_page_t m, int base, int size)
3373{
3374	int endoff, frag;
3375
3376	VM_OBJECT_ASSERT_WLOCKED(m->object);
3377	if (size == 0)	/* handle degenerate case */
3378		return;
3379
3380	/*
3381	 * If the base is not DEV_BSIZE aligned and the valid
3382	 * bit is clear, we have to zero out a portion of the
3383	 * first block.
3384	 */
3385	if ((frag = rounddown2(base, DEV_BSIZE)) != base &&
3386	    (m->valid & (1 << (base >> DEV_BSHIFT))) == 0)
3387		pmap_zero_page_area(m, frag, base - frag);
3388
3389	/*
3390	 * If the ending offset is not DEV_BSIZE aligned and the
3391	 * valid bit is clear, we have to zero out a portion of
3392	 * the last block.
3393	 */
3394	endoff = base + size;
3395	if ((frag = rounddown2(endoff, DEV_BSIZE)) != endoff &&
3396	    (m->valid & (1 << (endoff >> DEV_BSHIFT))) == 0)
3397		pmap_zero_page_area(m, endoff,
3398		    DEV_BSIZE - (endoff & (DEV_BSIZE - 1)));
3399
3400	/*
3401	 * Assert that no previously invalid block that is now being validated
3402	 * is already dirty.
3403	 */
3404	KASSERT((~m->valid & vm_page_bits(base, size) & m->dirty) == 0,
3405	    ("vm_page_set_valid_range: page %p is dirty", m));
3406
3407	/*
3408	 * Set valid bits inclusive of any overlap.
3409	 */
3410	m->valid |= vm_page_bits(base, size);
3411}
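
/*
 * Usage sketch: a hypothetical pager completion path that read only the
 * first 512 bytes of page "m" and marks just that DEV_BSIZE block valid,
 * with the page's object "obj" write locked.
 */
#if 0
	VM_OBJECT_WLOCK(obj);
	vm_page_set_valid_range(m, 0, 512);
	VM_OBJECT_WUNLOCK(obj);
#endif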
3412
3413/*
3414 * Clear the given bits from the specified page's dirty field.
3415 */
3416static __inline void
3417vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits)
3418{
3419	uintptr_t addr;
3420#if PAGE_SIZE < 16384
3421	int shift;
3422#endif
3423
3424	/*
3425	 * If the object is locked and the page is neither exclusive busy nor
3426	 * write mapped, then the page's dirty field cannot possibly be
3427	 * set by a concurrent pmap operation.
3428	 */
3429	VM_OBJECT_ASSERT_WLOCKED(m->object);
3430	if (!vm_page_xbusied(m) && !pmap_page_is_write_mapped(m))
3431		m->dirty &= ~pagebits;
3432	else {
3433		/*
3434		 * The pmap layer can call vm_page_dirty() without
3435		 * holding a distinguished lock.  The combination of
3436		 * the object's lock and an atomic operation suffice
3437		 * to guarantee consistency of the page dirty field.
3438		 *
3439		 * For the PAGE_SIZE == 32768 case, the compiler already
3440		 * properly aligns the dirty field, so no forcible
3441		 * alignment is needed.  Only require the existence of
3442		 * atomic_clear_64 when the page size is 32768.
3443		 */
3444		addr = (uintptr_t)&m->dirty;
3445#if PAGE_SIZE == 32768
3446		atomic_clear_64((uint64_t *)addr, pagebits);
3447#elif PAGE_SIZE == 16384
3448		atomic_clear_32((uint32_t *)addr, pagebits);
3449#else		/* PAGE_SIZE <= 8192 */
3450		/*
3451		 * Use a trick to perform a 32-bit atomic on the
3452		 * containing aligned word, to not depend on the existence
3453		 * of atomic_clear_{8, 16}.
3454		 */
3455		shift = addr & (sizeof(uint32_t) - 1);
3456#if BYTE_ORDER == BIG_ENDIAN
3457		shift = (sizeof(uint32_t) - sizeof(m->dirty) - shift) * NBBY;
3458#else
3459		shift *= NBBY;
3460#endif
3461		addr &= ~(sizeof(uint32_t) - 1);
3462		atomic_clear_32((uint32_t *)addr, pagebits << shift);
3463#endif		/* PAGE_SIZE */
3464	}
3465}
3466
3467/*
3468 *	vm_page_set_validclean:
3469 *
3470 *	Sets portions of a page valid and clean.  The arguments are expected
3471 *	to be DEV_BSIZE aligned, but if they aren't, the bitmap is inclusive
3472 *	of any partial chunks touched by the range.  The invalid portion of
3473 *	such chunks will be zeroed.
3474 *
3475 *	(base + size) must be less than or equal to PAGE_SIZE.
3476 */
3477void
3478vm_page_set_validclean(vm_page_t m, int base, int size)
3479{
3480	vm_page_bits_t oldvalid, pagebits;
3481	int endoff, frag;
3482
3483	VM_OBJECT_ASSERT_WLOCKED(m->object);
3484	if (size == 0)	/* handle degenerate case */
3485		return;
3486
3487	/*
3488	 * If the base is not DEV_BSIZE aligned and the valid
3489	 * bit is clear, we have to zero out a portion of the
3490	 * first block.
3491	 */
3492	if ((frag = rounddown2(base, DEV_BSIZE)) != base &&
3493	    (m->valid & ((vm_page_bits_t)1 << (base >> DEV_BSHIFT))) == 0)
3494		pmap_zero_page_area(m, frag, base - frag);
3495
3496	/*
3497	 * If the ending offset is not DEV_BSIZE aligned and the
3498	 * valid bit is clear, we have to zero out a portion of
3499	 * the last block.
3500	 */
3501	endoff = base + size;
3502	if ((frag = rounddown2(endoff, DEV_BSIZE)) != endoff &&
3503	    (m->valid & ((vm_page_bits_t)1 << (endoff >> DEV_BSHIFT))) == 0)
3504		pmap_zero_page_area(m, endoff,
3505		    DEV_BSIZE - (endoff & (DEV_BSIZE - 1)));
3506
3507	/*
3508	 * Set valid, clear dirty bits.  If validating the entire
3509	 * page we can safely clear the pmap modify bit.  We also
3510	 * use this opportunity to clear the VPO_NOSYNC flag.  If a process
3511	 * takes a write fault on a MAP_NOSYNC memory area the flag will
3512	 * be set again.
3513	 *
3514	 * We set valid bits inclusive of any overlap, but we can only
3515	 * clear dirty bits for DEV_BSIZE chunks that are fully within
3516	 * the range.
3517	 */
3518	oldvalid = m->valid;
3519	pagebits = vm_page_bits(base, size);
3520	m->valid |= pagebits;
3521#if 0	/* NOT YET */
3522	if ((frag = base & (DEV_BSIZE - 1)) != 0) {
3523		frag = DEV_BSIZE - frag;
3524		base += frag;
3525		size -= frag;
3526		if (size < 0)
3527			size = 0;
3528	}
3529	pagebits = vm_page_bits(base, size & (DEV_BSIZE - 1));
3530#endif
3531	if (base == 0 && size == PAGE_SIZE) {
3532		/*
3533		 * The page can only be modified within the pmap if it is
3534		 * mapped, and it can only be mapped if it was previously
3535		 * fully valid.
3536		 */
3537		if (oldvalid == VM_PAGE_BITS_ALL)
3538			/*
3539			 * Perform the pmap_clear_modify() first.  Otherwise,
3540			 * a concurrent pmap operation, such as
3541			 * pmap_protect(), could clear a modification in the
3542			 * pmap and set the dirty field on the page before
3543			 * pmap_clear_modify() had begun and after the dirty
3544			 * field was cleared here.
3545			 */
3546			pmap_clear_modify(m);
3547		m->dirty = 0;
3548		m->oflags &= ~VPO_NOSYNC;
3549	} else if (oldvalid != VM_PAGE_BITS_ALL)
3550		m->dirty &= ~pagebits;
3551	else
3552		vm_page_clear_dirty_mask(m, pagebits);
3553}
3554
3555void
3556vm_page_clear_dirty(vm_page_t m, int base, int size)
3557{
3558
3559	vm_page_clear_dirty_mask(m, vm_page_bits(base, size));
3560}
3561
3562/*
3563 *	vm_page_set_invalid:
3564 *
3565 *	Invalidates DEV_BSIZE'd chunks within a page.  Both the
3566 *	valid and dirty bits for the affected areas are cleared.
3567 */
3568void
3569vm_page_set_invalid(vm_page_t m, int base, int size)
3570{
3571	vm_page_bits_t bits;
3572	vm_object_t object;
3573
3574	object = m->object;
3575	VM_OBJECT_ASSERT_WLOCKED(object);
3576	if (object->type == OBJT_VNODE && base == 0 && IDX_TO_OFF(m->pindex) +
3577	    size >= object->un_pager.vnp.vnp_size)
3578		bits = VM_PAGE_BITS_ALL;
3579	else
3580		bits = vm_page_bits(base, size);
3581	if (object->ref_count != 0 && m->valid == VM_PAGE_BITS_ALL &&
3582	    bits != 0)
3583		pmap_remove_all(m);
3584	KASSERT((bits == 0 && m->valid == VM_PAGE_BITS_ALL) ||
3585	    !pmap_page_is_mapped(m),
3586	    ("vm_page_set_invalid: page %p is mapped", m));
3587	m->valid &= ~bits;
3588	m->dirty &= ~bits;
3589}
3590
3591/*
3592 * vm_page_zero_invalid()
3593 *
3594 *	The kernel assumes that the invalid portions of a page contain
3595 *	garbage, but such pages can be mapped into memory by user code.
3596 *	When this occurs, we must zero out the non-valid portions of the
3597 *	page so user code sees what it expects.
3598 *
3599 *	Pages are most often semi-valid when the end of a file is mapped
3600 *	into memory and the file's size is not page aligned.
3601 */
3602void
3603vm_page_zero_invalid(vm_page_t m, boolean_t setvalid)
3604{
3605	int b;
3606	int i;
3607
3608	VM_OBJECT_ASSERT_WLOCKED(m->object);
3609	/*
3610	 * Scan the valid bits looking for invalid sections that
3611	 * must be zeroed.  Invalid sub-DEV_BSIZE'd areas (where the
3612	 * valid bit may be set) have already been zeroed by
3613	 * vm_page_set_validclean().
3614	 */
3615	for (b = i = 0; i <= PAGE_SIZE / DEV_BSIZE; ++i) {
3616		if (i == (PAGE_SIZE / DEV_BSIZE) ||
3617		    (m->valid & ((vm_page_bits_t)1 << i))) {
3618			if (i > b) {
3619				pmap_zero_page_area(m,
3620				    b << DEV_BSHIFT, (i - b) << DEV_BSHIFT);
3621			}
3622			b = i + 1;
3623		}
3624	}
3625
3626	/*
3627	 * setvalid is TRUE when we can safely set the zeroed areas
3628	 * as being valid.  We can do this if there are no cache consistency
3629	 * issues, e.g., it is ok to do with UFS, but not ok to do with NFS.
3630	 */
3631	if (setvalid)
3632		m->valid = VM_PAGE_BITS_ALL;
3633}
3634
3635/*
3636 *	vm_page_is_valid:
3637 *
3638 *	Is the (partial) page valid?  In the degenerate case where size == 0,
3639 *	this returns FALSE if the page is entirely invalid and TRUE
3640 *	otherwise.
3641 */
3642int
3643vm_page_is_valid(vm_page_t m, int base, int size)
3644{
3645	vm_page_bits_t bits;
3646
3647	VM_OBJECT_ASSERT_LOCKED(m->object);
3648	bits = vm_page_bits(base, size);
3649	return (m->valid != 0 && (m->valid & bits) == bits);
3650}
3651
3652/*
3653 * Returns true if all of the specified predicates are true for the entire
3654 * (super)page and false otherwise.
3655 */
3656bool
3657vm_page_ps_test(vm_page_t m, int flags, vm_page_t skip_m)
3658{
3659	vm_object_t object;
3660	int i, npages;
3661
3662	object = m->object;
3663	VM_OBJECT_ASSERT_LOCKED(object);
3664	npages = atop(pagesizes[m->psind]);
3665
3666	/*
3667	 * The physically contiguous pages that make up a superpage, i.e., a
3668	 * page with a page size index ("psind") greater than zero, will
3669	 * occupy adjacent entries in vm_page_array[].
3670	 */
3671	for (i = 0; i < npages; i++) {
3672		/* Always test object consistency, including "skip_m". */
3673		if (m[i].object != object)
3674			return (false);
3675		if (&m[i] == skip_m)
3676			continue;
3677		if ((flags & PS_NONE_BUSY) != 0 && vm_page_busied(&m[i]))
3678			return (false);
3679		if ((flags & PS_ALL_DIRTY) != 0) {
3680			/*
3681			 * Calling vm_page_test_dirty() or pmap_is_modified()
3682			 * might stop this case from spuriously returning
3683			 * "false".  However, that would require a write lock
3684			 * on the object containing "m[i]".
3685			 */
3686			if (m[i].dirty != VM_PAGE_BITS_ALL)
3687				return (false);
3688		}
3689		if ((flags & PS_ALL_VALID) != 0 &&
3690		    m[i].valid != VM_PAGE_BITS_ALL)
3691			return (false);
3692	}
3693	return (true);
3694}
3695
3696/*
3697 * Set the page's dirty bits if the page is modified.
3698 */
3699void
3700vm_page_test_dirty(vm_page_t m)
3701{
3702
3703	VM_OBJECT_ASSERT_WLOCKED(m->object);
3704	if (m->dirty != VM_PAGE_BITS_ALL && pmap_is_modified(m))
3705		vm_page_dirty(m);
3706}
3707
3708void
3709vm_page_lock_KBI(vm_page_t m, const char *file, int line)
3710{
3711
3712	mtx_lock_flags_(vm_page_lockptr(m), 0, file, line);
3713}
3714
3715void
3716vm_page_unlock_KBI(vm_page_t m, const char *file, int line)
3717{
3718
3719	mtx_unlock_flags_(vm_page_lockptr(m), 0, file, line);
3720}
3721
3722int
3723vm_page_trylock_KBI(vm_page_t m, const char *file, int line)
3724{
3725
3726	return (mtx_trylock_flags_(vm_page_lockptr(m), 0, file, line));
3727}
3728
3729#if defined(INVARIANTS) || defined(INVARIANT_SUPPORT)
3730void
3731vm_page_assert_locked_KBI(vm_page_t m, const char *file, int line)
3732{
3733
3734	vm_page_lock_assert_KBI(m, MA_OWNED, file, line);
3735}
3736
3737void
3738vm_page_lock_assert_KBI(vm_page_t m, int a, const char *file, int line)
3739{
3740
3741	mtx_assert_(vm_page_lockptr(m), a, file, line);
3742}
3743#endif
3744
3745#ifdef INVARIANTS
3746void
3747vm_page_object_lock_assert(vm_page_t m)
3748{
3749
3750	/*
3751	 * Certain of the page's fields may only be modified by the
3752	 * holder of the containing object's lock or the exclusive busy
3753	 * holder.  Unfortunately, the holder of the write busy is
3754	 * not recorded, and thus cannot be checked here.
3755	 */
3756	if (m->object != NULL && !vm_page_xbusied(m))
3757		VM_OBJECT_ASSERT_WLOCKED(m->object);
3758}
3759
3760void
3761vm_page_assert_pga_writeable(vm_page_t m, uint8_t bits)
3762{
3763
3764	if ((bits & PGA_WRITEABLE) == 0)
3765		return;
3766
3767	/*
3768	 * The PGA_WRITEABLE flag can only be set if the page is
3769	 * managed, is exclusively busied or the object is locked.
3770	 * Currently, this flag is only set by pmap_enter().
3771	 */
3772	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3773	    ("PGA_WRITEABLE on unmanaged page"));
3774	if (!vm_page_xbusied(m))
3775		VM_OBJECT_ASSERT_LOCKED(m->object);
3776}
3777#endif
3778
3779#include "opt_ddb.h"
3780#ifdef DDB
3781#include <sys/kernel.h>
3782
3783#include <ddb/ddb.h>
3784
3785DB_SHOW_COMMAND(page, vm_page_print_page_info)
3786{
3787
3788	db_printf("vm_cnt.v_free_count: %d\n", vm_cnt.v_free_count);
3789	db_printf("vm_cnt.v_inactive_count: %d\n", vm_cnt.v_inactive_count);
3790	db_printf("vm_cnt.v_active_count: %d\n", vm_cnt.v_active_count);
3791	db_printf("vm_cnt.v_laundry_count: %d\n", vm_cnt.v_laundry_count);
3792	db_printf("vm_cnt.v_wire_count: %d\n", vm_cnt.v_wire_count);
3793	db_printf("vm_cnt.v_free_reserved: %d\n", vm_cnt.v_free_reserved);
3794	db_printf("vm_cnt.v_free_min: %d\n", vm_cnt.v_free_min);
3795	db_printf("vm_cnt.v_free_target: %d\n", vm_cnt.v_free_target);
3796	db_printf("vm_cnt.v_inactive_target: %d\n", vm_cnt.v_inactive_target);
3797}
3798
3799DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info)
3800{
3801	int dom;
3802
3803	db_printf("pq_free %d\n", vm_cnt.v_free_count);
3804	for (dom = 0; dom < vm_ndomains; dom++) {
3805		db_printf(
3806	    "dom %d page_cnt %d free %d pq_act %d pq_inact %d pq_laund %d\n",
3807		    dom,
3808		    vm_dom[dom].vmd_page_count,
3809		    vm_dom[dom].vmd_free_count,
3810		    vm_dom[dom].vmd_pagequeues[PQ_ACTIVE].pq_cnt,
3811		    vm_dom[dom].vmd_pagequeues[PQ_INACTIVE].pq_cnt,
3812		    vm_dom[dom].vmd_pagequeues[PQ_LAUNDRY].pq_cnt);
3813	}
3814}
3815
3816DB_SHOW_COMMAND(pginfo, vm_page_print_pginfo)
3817{
3818	vm_page_t m;
3819	boolean_t phys;
3820
3821	if (!have_addr) {
3822		db_printf("show pginfo addr\n");
3823		return;
3824	}
3825
3826	phys = strchr(modif, 'p') != NULL;
3827	if (phys)
3828		m = PHYS_TO_VM_PAGE(addr);
3829	else
3830		m = (vm_page_t)addr;
3831	db_printf(
3832    "page %p obj %p pidx 0x%jx phys 0x%jx q %d hold %d wire %d\n"
3833    "  af 0x%x of 0x%x f 0x%x act %d busy %x valid 0x%x dirty 0x%x\n",
3834	    m, m->object, (uintmax_t)m->pindex, (uintmax_t)m->phys_addr,
3835	    m->queue, m->hold_count, m->wire_count, m->aflags, m->oflags,
3836	    m->flags, m->act_count, m->busy_lock, m->valid, m->dirty);
3837}
3838#endif /* DDB */
3839