vm_page.c revision 250849
1/*-
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 * Copyright (c) 1998 Matthew Dillon.  All Rights Reserved.
5 *
6 * This code is derived from software contributed to Berkeley by
7 * The Mach Operating System project at Carnegie-Mellon University.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 *    notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 *    notice, this list of conditions and the following disclaimer in the
16 *    documentation and/or other materials provided with the distribution.
17 * 4. Neither the name of the University nor the names of its contributors
18 *    may be used to endorse or promote products derived from this software
19 *    without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
32 *
33 *	from: @(#)vm_page.c	7.4 (Berkeley) 5/7/91
34 */
35
36/*-
37 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
38 * All rights reserved.
39 *
40 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
41 *
42 * Permission to use, copy, modify and distribute this software and
43 * its documentation is hereby granted, provided that both the copyright
44 * notice and this permission notice appear in all copies of the
45 * software, derivative works or modified versions, and any portions
46 * thereof, and that both notices appear in supporting documentation.
47 *
48 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
49 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
50 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
51 *
52 * Carnegie Mellon requests users of this software to return to
53 *
54 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
55 *  School of Computer Science
56 *  Carnegie Mellon University
57 *  Pittsburgh PA 15213-3890
58 *
59 * any improvements or extensions that they make and grant Carnegie the
60 * rights to redistribute these changes.
61 */
62
63/*
64 *			GENERAL RULES ON VM_PAGE MANIPULATION
65 *
66 *	- A page queue lock is required when adding or removing a page from a
67 *	  page queue (vm_pagequeues[]), regardless of other locks or the
68 *	  busy state of a page.
69 *
70 *		* In general, no thread besides the page daemon can acquire or
71 *		  hold more than one page queue lock at a time.
72 *
73 *		* The page daemon can acquire and hold any pair of page queue
74 *		  locks in any order.
75 *
76 *	- The object lock is required when inserting or removing
77 *	  pages from an object (vm_page_insert() or vm_page_remove()).
78 *
79 */
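
/*
 * A minimal, hypothetical sketch of the rules above; "m", "object", and
 * "pindex" are illustrative names only.  Queue membership changes take the
 * page queue lock, while object insertion or removal takes the object lock:
 *
 *	vm_pagequeue_lock(&vm_pagequeues[PQ_INACTIVE]);
 *	... add "m" to or remove "m" from that queue ...
 *	vm_pagequeue_unlock(&vm_pagequeues[PQ_INACTIVE]);
 *
 *	VM_OBJECT_WLOCK(object);
 *	vm_page_insert(m, object, pindex);
 *	VM_OBJECT_WUNLOCK(object);
 */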
80
81/*
82 *	Resident memory management module.
83 */
84
85#include <sys/cdefs.h>
86__FBSDID("$FreeBSD: head/sys/vm/vm_page.c 250849 2013-05-21 11:04:00Z kib $");
87
88#include "opt_vm.h"
89
90#include <sys/param.h>
91#include <sys/systm.h>
92#include <sys/lock.h>
93#include <sys/kernel.h>
94#include <sys/limits.h>
95#include <sys/malloc.h>
96#include <sys/msgbuf.h>
97#include <sys/mutex.h>
98#include <sys/proc.h>
99#include <sys/rwlock.h>
100#include <sys/sysctl.h>
101#include <sys/vmmeter.h>
102#include <sys/vnode.h>
103
104#include <vm/vm.h>
105#include <vm/pmap.h>
106#include <vm/vm_param.h>
107#include <vm/vm_kern.h>
108#include <vm/vm_object.h>
109#include <vm/vm_page.h>
110#include <vm/vm_pageout.h>
111#include <vm/vm_pager.h>
112#include <vm/vm_phys.h>
113#include <vm/vm_radix.h>
114#include <vm/vm_reserv.h>
115#include <vm/vm_extern.h>
116#include <vm/uma.h>
117#include <vm/uma_int.h>
118
119#include <machine/md_var.h>
120
121/*
122 *	Associated with each page of user-allocatable memory is a
123 *	page structure.
124 */
125
126struct vm_pagequeue vm_pagequeues[PQ_COUNT] = {
127	[PQ_INACTIVE] = {
128		.pq_pl = TAILQ_HEAD_INITIALIZER(
129		    vm_pagequeues[PQ_INACTIVE].pq_pl),
130		.pq_cnt = &cnt.v_inactive_count,
131		.pq_name = "vm inactive pagequeue"
132	},
133	[PQ_ACTIVE] = {
134		.pq_pl = TAILQ_HEAD_INITIALIZER(
135		    vm_pagequeues[PQ_ACTIVE].pq_pl),
136		.pq_cnt = &cnt.v_active_count,
137		.pq_name = "vm active pagequeue"
138	}
139};
140struct mtx_padalign vm_page_queue_free_mtx;
141
142struct mtx_padalign pa_lock[PA_LOCK_COUNT];
143
144vm_page_t vm_page_array;
145long vm_page_array_size;
146long first_page;
147int vm_page_zero_count;
148
149static int boot_pages = UMA_BOOT_PAGES;
150TUNABLE_INT("vm.boot_pages", &boot_pages);
151SYSCTL_INT(_vm, OID_AUTO, boot_pages, CTLFLAG_RD, &boot_pages, 0,
152	"number of pages allocated for bootstrapping the VM system");
153
154static int pa_tryrelock_restart;
155SYSCTL_INT(_vm, OID_AUTO, tryrelock_restart, CTLFLAG_RD,
156    &pa_tryrelock_restart, 0, "Number of tryrelock restarts");
157
158static uma_zone_t fakepg_zone;
159
160static struct vnode *vm_page_alloc_init(vm_page_t m);
161static void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits);
162static void vm_page_enqueue(int queue, vm_page_t m);
163static void vm_page_init_fakepg(void *dummy);
164static void vm_page_insert_after(vm_page_t m, vm_object_t object,
165    vm_pindex_t pindex, vm_page_t mpred);
166
167SYSINIT(vm_page, SI_SUB_VM, SI_ORDER_SECOND, vm_page_init_fakepg, NULL);
168
169static void
170vm_page_init_fakepg(void *dummy)
171{
172
173	fakepg_zone = uma_zcreate("fakepg", sizeof(struct vm_page), NULL, NULL,
174	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE | UMA_ZONE_VM);
175}
176
177/* Make sure that u_long is at least 64 bits when PAGE_SIZE is 32K. */
178#if PAGE_SIZE == 32768
179#ifdef CTASSERT
180CTASSERT(sizeof(u_long) >= 8);
181#endif
182#endif
183
184/*
185 * Try to acquire a physical address lock while a pmap is locked.  If we
186 * fail to trylock, unlock the pmap, acquire the physical address lock,
187 * relock the pmap, and cache the locked pa in *locked.  The caller should
188 * then restart its loop in case the virtual-to-physical mapping changed.
189 */
190int
191vm_page_pa_tryrelock(pmap_t pmap, vm_paddr_t pa, vm_paddr_t *locked)
192{
193	vm_paddr_t lockpa;
194
195	lockpa = *locked;
196	*locked = pa;
197	if (lockpa) {
198		PA_LOCK_ASSERT(lockpa, MA_OWNED);
199		if (PA_LOCKPTR(pa) == PA_LOCKPTR(lockpa))
200			return (0);
201		PA_UNLOCK(lockpa);
202	}
203	if (PA_TRYLOCK(pa))
204		return (0);
205	PMAP_UNLOCK(pmap);
206	atomic_add_int(&pa_tryrelock_restart, 1);
207	PA_LOCK(pa);
208	PMAP_LOCK(pmap);
209	return (EAGAIN);
210}
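
/*
 * A hypothetical sketch of the restart pattern that callers of
 * vm_page_pa_tryrelock() are expected to follow; "pmap", "va", and
 * "locked_pa" are assumed caller-side names used only for illustration:
 *
 *	retry:
 *		pa = pmap_extract(pmap, va);
 *		if (vm_page_pa_tryrelock(pmap, pa, &locked_pa))
 *			goto retry;	(the mapping may have changed)
 *		... "pa" is now locked along with the pmap ...
 */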
211
212/*
213 *	vm_set_page_size:
214 *
215 *	Sets the page size, perhaps based upon the memory
216 *	size.  Must be called before any use of page-size
217 *	dependent functions.
218 */
219void
220vm_set_page_size(void)
221{
222	if (cnt.v_page_size == 0)
223		cnt.v_page_size = PAGE_SIZE;
224	if (((cnt.v_page_size - 1) & cnt.v_page_size) != 0)
225		panic("vm_set_page_size: page size not a power of two");
226}
227
228/*
229 *	vm_page_blacklist_lookup:
230 *
231 *	See if a physical address in this page has been listed
232 *	in the blacklist tunable.  Entries in the tunable are
233 *	separated by spaces or commas.  If an invalid integer is
234 *	encountered then the rest of the string is skipped.
235 */
236static int
237vm_page_blacklist_lookup(char *list, vm_paddr_t pa)
238{
239	vm_paddr_t bad;
240	char *cp, *pos;
241
242	for (pos = list; *pos != '\0'; pos = cp) {
243		bad = strtoq(pos, &cp, 0);
244		if (*cp != '\0') {
245			if (*cp == ' ' || *cp == ',') {
246				cp++;
247				if (cp == pos)
248					continue;
249			} else
250				break;
251		}
252		if (pa == trunc_page(bad))
253			return (1);
254	}
255	return (0);
256}
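
/*
 * For illustration, a kernel environment setting such as the hypothetical
 *
 *	vm.blacklist="0x7dead000 0x7beef000"
 *
 * (e.g. from loader.conf(5)) would cause the pages containing those
 * physical addresses to be skipped when vm_page_startup() populates the
 * free lists below.
 */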
257
258/*
259 *	vm_page_startup:
260 *
261 *	Initializes the resident memory module.
262 *
263 *	Allocates memory for the page cells, and
264 *	for the object/offset-to-page hash table headers.
265 *	Each page cell is initialized and placed on the free list.
266 */
267vm_offset_t
268vm_page_startup(vm_offset_t vaddr)
269{
270	vm_offset_t mapped;
271	vm_paddr_t page_range;
272	vm_paddr_t new_end;
273	int i;
274	vm_paddr_t pa;
275	vm_paddr_t last_pa;
276	char *list;
277
278	/* the biggest memory array is the second group of pages */
279	vm_paddr_t end;
280	vm_paddr_t biggestsize;
281	vm_paddr_t low_water, high_water;
282	int biggestone;
283
284	biggestsize = 0;
285	biggestone = 0;
286	vaddr = round_page(vaddr);
287
288	for (i = 0; phys_avail[i + 1]; i += 2) {
289		phys_avail[i] = round_page(phys_avail[i]);
290		phys_avail[i + 1] = trunc_page(phys_avail[i + 1]);
291	}
292
293	low_water = phys_avail[0];
294	high_water = phys_avail[1];
295
296	for (i = 0; phys_avail[i + 1]; i += 2) {
297		vm_paddr_t size = phys_avail[i + 1] - phys_avail[i];
298
299		if (size > biggestsize) {
300			biggestone = i;
301			biggestsize = size;
302		}
303		if (phys_avail[i] < low_water)
304			low_water = phys_avail[i];
305		if (phys_avail[i + 1] > high_water)
306			high_water = phys_avail[i + 1];
307	}
308
309#ifdef XEN
310	low_water = 0;
311#endif
312
313	end = phys_avail[biggestone+1];
314
315	/*
316	 * Initialize the page and queue locks.
317	 */
318	mtx_init(&vm_page_queue_free_mtx, "vm page free queue", NULL, MTX_DEF);
319	for (i = 0; i < PA_LOCK_COUNT; i++)
320		mtx_init(&pa_lock[i], "vm page", NULL, MTX_DEF);
321	for (i = 0; i < PQ_COUNT; i++)
322		vm_pagequeue_init_lock(&vm_pagequeues[i]);
323
324	/*
325	 * Allocate memory for use when boot strapping the kernel memory
326	 * allocator.
327	 */
328	new_end = end - (boot_pages * UMA_SLAB_SIZE);
329	new_end = trunc_page(new_end);
330	mapped = pmap_map(&vaddr, new_end, end,
331	    VM_PROT_READ | VM_PROT_WRITE);
332	bzero((void *)mapped, end - new_end);
333	uma_startup((void *)mapped, boot_pages);
334
335#if defined(__amd64__) || defined(__i386__) || defined(__arm__) || \
336    defined(__mips__)
337	/*
338	 * Allocate a bitmap to indicate that a random physical page
339	 * needs to be included in a minidump.
340	 *
341	 * The amd64 port needs this to indicate which direct map pages
342	 * need to be dumped, via calls to dump_add_page()/dump_drop_page().
343	 *
344	 * However, i386 still needs this workspace internally within the
345	 * minidump code.  In theory, they are not needed on i386, but are
346	 * included should the sf_buf code decide to use them.
347	 */
348	last_pa = 0;
349	for (i = 0; dump_avail[i + 1] != 0; i += 2)
350		if (dump_avail[i + 1] > last_pa)
351			last_pa = dump_avail[i + 1];
352	page_range = last_pa / PAGE_SIZE;
353	vm_page_dump_size = round_page(roundup2(page_range, NBBY) / NBBY);
354	new_end -= vm_page_dump_size;
355	vm_page_dump = (void *)(uintptr_t)pmap_map(&vaddr, new_end,
356	    new_end + vm_page_dump_size, VM_PROT_READ | VM_PROT_WRITE);
357	bzero((void *)vm_page_dump, vm_page_dump_size);
358#endif
359#ifdef __amd64__
360	/*
361	 * Request that the physical pages underlying the message buffer be
362	 * included in a crash dump.  Since the message buffer is accessed
363	 * through the direct map, they are not automatically included.
364	 */
365	pa = DMAP_TO_PHYS((vm_offset_t)msgbufp->msg_ptr);
366	last_pa = pa + round_page(msgbufsize);
367	while (pa < last_pa) {
368		dump_add_page(pa);
369		pa += PAGE_SIZE;
370	}
371#endif
372	/*
373	 * Compute the number of pages of memory that will be available for
374	 * use (taking into account the overhead of a page structure per
375	 * page).
376	 */
377	first_page = low_water / PAGE_SIZE;
378#ifdef VM_PHYSSEG_SPARSE
379	page_range = 0;
380	for (i = 0; phys_avail[i + 1] != 0; i += 2)
381		page_range += atop(phys_avail[i + 1] - phys_avail[i]);
382#elif defined(VM_PHYSSEG_DENSE)
383	page_range = high_water / PAGE_SIZE - first_page;
384#else
385#error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined."
386#endif
387	end = new_end;
388
389	/*
390	 * Reserve an unmapped guard page to trap access to vm_page_array[-1].
391	 */
392	vaddr += PAGE_SIZE;
393
394	/*
395	 * Initialize the mem entry structures now, and put them in the free
396	 * queue.
397	 */
398	new_end = trunc_page(end - page_range * sizeof(struct vm_page));
399	mapped = pmap_map(&vaddr, new_end, end,
400	    VM_PROT_READ | VM_PROT_WRITE);
401	vm_page_array = (vm_page_t) mapped;
402#if VM_NRESERVLEVEL > 0
403	/*
404	 * Allocate memory for the reservation management system's data
405	 * structures.
406	 */
407	new_end = vm_reserv_startup(&vaddr, new_end, high_water);
408#endif
409#if defined(__amd64__) || defined(__mips__)
410	/*
411	 * pmap_map on amd64 and mips can come out of the direct-map, not kvm
412	 * like i386, so the pages must be tracked for a crashdump to include
413	 * this data.  This includes the vm_page_array and the early UMA
414	 * bootstrap pages.
415	 */
416	for (pa = new_end; pa < phys_avail[biggestone + 1]; pa += PAGE_SIZE)
417		dump_add_page(pa);
418#endif
419	phys_avail[biggestone + 1] = new_end;
420
421	/*
422	 * Clear all of the page structures
423	 */
424	bzero((caddr_t) vm_page_array, page_range * sizeof(struct vm_page));
425	for (i = 0; i < page_range; i++)
426		vm_page_array[i].order = VM_NFREEORDER;
427	vm_page_array_size = page_range;
428
429	/*
430	 * Initialize the physical memory allocator.
431	 */
432	vm_phys_init();
433
434	/*
435	 * Add every available physical page that is not blacklisted to
436	 * the free lists.
437	 */
438	cnt.v_page_count = 0;
439	cnt.v_free_count = 0;
440	list = getenv("vm.blacklist");
441	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
442		pa = phys_avail[i];
443		last_pa = phys_avail[i + 1];
444		while (pa < last_pa) {
445			if (list != NULL &&
446			    vm_page_blacklist_lookup(list, pa))
447				printf("Skipping page with pa 0x%jx\n",
448				    (uintmax_t)pa);
449			else
450				vm_phys_add_page(pa);
451			pa += PAGE_SIZE;
452		}
453	}
454	freeenv(list);
455#if VM_NRESERVLEVEL > 0
456	/*
457	 * Initialize the reservation management system.
458	 */
459	vm_reserv_init();
460#endif
461	return (vaddr);
462}
463
464void
465vm_page_reference(vm_page_t m)
466{
467
468	vm_page_aflag_set(m, PGA_REFERENCED);
469}
470
471void
472vm_page_busy(vm_page_t m)
473{
474
475	VM_OBJECT_ASSERT_WLOCKED(m->object);
476	KASSERT((m->oflags & VPO_BUSY) == 0,
477	    ("vm_page_busy: page already busy!!!"));
478	m->oflags |= VPO_BUSY;
479}
480
481/*
482 *      vm_page_flash:
483 *
484 *      wakeup anyone waiting for the page.
485 */
486void
487vm_page_flash(vm_page_t m)
488{
489
490	VM_OBJECT_ASSERT_WLOCKED(m->object);
491	if (m->oflags & VPO_WANTED) {
492		m->oflags &= ~VPO_WANTED;
493		wakeup(m);
494	}
495}
496
497/*
498 *      vm_page_wakeup:
499 *
500 *      clear the VPO_BUSY flag and wakeup anyone waiting for the
501 *      page.
502 *
503 */
504void
505vm_page_wakeup(vm_page_t m)
506{
507
508	VM_OBJECT_ASSERT_WLOCKED(m->object);
509	KASSERT(m->oflags & VPO_BUSY, ("vm_page_wakeup: page not busy!!!"));
510	m->oflags &= ~VPO_BUSY;
511	vm_page_flash(m);
512}
513
514void
515vm_page_io_start(vm_page_t m)
516{
517
518	VM_OBJECT_ASSERT_WLOCKED(m->object);
519	m->busy++;
520}
521
522void
523vm_page_io_finish(vm_page_t m)
524{
525
526	VM_OBJECT_ASSERT_WLOCKED(m->object);
527	KASSERT(m->busy > 0, ("vm_page_io_finish: page %p is not busy", m));
528	m->busy--;
529	if (m->busy == 0)
530		vm_page_flash(m);
531}
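
/*
 * A minimal, hypothetical sketch of how the two busy mechanisms above are
 * used under the object write lock; "object", "pindex", and the elided I/O
 * steps are illustrative only:
 *
 *	VM_OBJECT_WLOCK(object);
 *	m = vm_page_lookup(object, pindex);
 *	vm_page_busy(m);	(exclusive: sets VPO_BUSY)
 *	...
 *	vm_page_wakeup(m);	(clears VPO_BUSY and wakes waiters)
 *
 *	vm_page_io_start(m);	(shared: increments m->busy for I/O)
 *	...
 *	vm_page_io_finish(m);	(wakes waiters when m->busy drops to zero)
 *	VM_OBJECT_WUNLOCK(object);
 */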
532
533/*
534 * Keep page from being freed by the page daemon
535 * much of the same effect as wiring, except much lower
536 * overhead and should be used only for *very* temporary
537 * holding ("wiring").
538 */
539void
540vm_page_hold(vm_page_t mem)
541{
542
543	vm_page_lock_assert(mem, MA_OWNED);
544	mem->hold_count++;
545}
546
547void
548vm_page_unhold(vm_page_t mem)
549{
550
551	vm_page_lock_assert(mem, MA_OWNED);
552	--mem->hold_count;
553	KASSERT(mem->hold_count >= 0, ("vm_page_unhold: hold count < 0!!!"));
554	if (mem->hold_count == 0 && (mem->flags & PG_UNHOLDFREE) != 0)
555		vm_page_free_toq(mem);
556}
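
/*
 * A minimal, hypothetical example of a short-lived hold.  The page lock is
 * required for both vm_page_hold() and vm_page_unhold(), and the hold
 * should be dropped as soon as possible:
 *
 *	vm_page_lock(m);
 *	vm_page_hold(m);
 *	vm_page_unlock(m);
 *	... brief use of the page ...
 *	vm_page_lock(m);
 *	vm_page_unhold(m);
 *	vm_page_unlock(m);
 */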
557
558/*
559 *	vm_page_unhold_pages:
560 *
561 *	Unhold each of the pages that is referenced by the given array.
562 */
563void
564vm_page_unhold_pages(vm_page_t *ma, int count)
565{
566	struct mtx *mtx, *new_mtx;
567
568	mtx = NULL;
569	for (; count != 0; count--) {
570		/*
571		 * Avoid releasing and reacquiring the same page lock.
572		 */
573		new_mtx = vm_page_lockptr(*ma);
574		if (mtx != new_mtx) {
575			if (mtx != NULL)
576				mtx_unlock(mtx);
577			mtx = new_mtx;
578			mtx_lock(mtx);
579		}
580		vm_page_unhold(*ma);
581		ma++;
582	}
583	if (mtx != NULL)
584		mtx_unlock(mtx);
585}
586
587vm_page_t
588PHYS_TO_VM_PAGE(vm_paddr_t pa)
589{
590	vm_page_t m;
591
592#ifdef VM_PHYSSEG_SPARSE
593	m = vm_phys_paddr_to_vm_page(pa);
594	if (m == NULL)
595		m = vm_phys_fictitious_to_vm_page(pa);
596	return (m);
597#elif defined(VM_PHYSSEG_DENSE)
598	long pi;
599
600	pi = atop(pa);
601	if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
602		m = &vm_page_array[pi - first_page];
603		return (m);
604	}
605	return (vm_phys_fictitious_to_vm_page(pa));
606#else
607#error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined."
608#endif
609}
610
611/*
612 *	vm_page_getfake:
613 *
614 *	Create a fictitious page with the specified physical address and
615 *	memory attribute.  The memory attribute is the only machine-
616 *	dependent aspect of a fictitious page that must be initialized.
617 */
618vm_page_t
619vm_page_getfake(vm_paddr_t paddr, vm_memattr_t memattr)
620{
621	vm_page_t m;
622
623	m = uma_zalloc(fakepg_zone, M_WAITOK | M_ZERO);
624	vm_page_initfake(m, paddr, memattr);
625	return (m);
626}
627
628void
629vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr)
630{
631
632	if ((m->flags & PG_FICTITIOUS) != 0) {
633		/*
634		 * The page's memattr might have changed since the
635		 * previous initialization.  Update the pmap to the
636		 * new memattr.
637		 */
638		goto memattr;
639	}
640	m->phys_addr = paddr;
641	m->queue = PQ_NONE;
642	/* Fictitious pages don't use "segind". */
643	m->flags = PG_FICTITIOUS;
644	/* Fictitious pages don't use "order" or "pool". */
645	m->oflags = VPO_BUSY | VPO_UNMANAGED;
646	m->wire_count = 1;
647memattr:
648	pmap_page_set_memattr(m, memattr);
649}
650
651/*
652 *	vm_page_putfake:
653 *
654 *	Release a fictitious page.
655 */
656void
657vm_page_putfake(vm_page_t m)
658{
659
660	KASSERT((m->oflags & VPO_UNMANAGED) != 0, ("managed %p", m));
661	KASSERT((m->flags & PG_FICTITIOUS) != 0,
662	    ("vm_page_putfake: bad page %p", m));
663	uma_zfree(fakepg_zone, m);
664}
665
666/*
667 *	vm_page_updatefake:
668 *
669 *	Update the given fictitious page to the specified physical address and
670 *	memory attribute.
671 */
672void
673vm_page_updatefake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr)
674{
675
676	KASSERT((m->flags & PG_FICTITIOUS) != 0,
677	    ("vm_page_updatefake: bad page %p", m));
678	m->phys_addr = paddr;
679	pmap_page_set_memattr(m, memattr);
680}
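
/*
 * A hypothetical sketch of the fictitious page life cycle using the
 * routines above; "dev_paddr", "new_dev_paddr", and the memory attribute
 * are assumptions made for illustration (attributes are machine-dependent):
 *
 *	m = vm_page_getfake(dev_paddr, VM_MEMATTR_UNCACHEABLE);
 *	... hand "m" out, e.g. from a device pager ...
 *	vm_page_updatefake(m, new_dev_paddr, VM_MEMATTR_UNCACHEABLE);
 *	... once the last mapping is gone ...
 *	vm_page_putfake(m);
 */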
681
682/*
683 *	vm_page_free:
684 *
685 *	Free a page.
686 */
687void
688vm_page_free(vm_page_t m)
689{
690
691	m->flags &= ~PG_ZERO;
692	vm_page_free_toq(m);
693}
694
695/*
696 *	vm_page_free_zero:
697 *
698 *	Free a page to the zeroed-pages queue.
699 */
700void
701vm_page_free_zero(vm_page_t m)
702{
703
704	m->flags |= PG_ZERO;
705	vm_page_free_toq(m);
706}
707
708/*
709 * Unbusy and handle the page queueing for a page from the VOP_GETPAGES()
710 * array that is not the requested page.
711 */
712void
713vm_page_readahead_finish(vm_page_t m)
714{
715
716	if (m->valid != 0) {
717		/*
718		 * Since the page is not the requested page, whether
719		 * it should be activated or deactivated is not
720		 * obvious.  Empirical results have shown that
721		 * deactivating the page is usually the best choice,
722		 * unless the page is wanted by another thread.
723		 */
724		if (m->oflags & VPO_WANTED) {
725			vm_page_lock(m);
726			vm_page_activate(m);
727			vm_page_unlock(m);
728		} else {
729			vm_page_lock(m);
730			vm_page_deactivate(m);
731			vm_page_unlock(m);
732		}
733		vm_page_wakeup(m);
734	} else {
735		/*
736		 * Free the completely invalid page.  Such page state
737		 * occurs due to a short read operation that did
738		 * not cover our page at all, or when a read
739		 * error happens.
740		 */
741		vm_page_lock(m);
742		vm_page_free(m);
743		vm_page_unlock(m);
744	}
745}
746
747/*
748 *	vm_page_sleep:
749 *
750 *	Sleep and release the page lock.
751 *
752 *	The object containing the given page must be locked.
753 */
754void
755vm_page_sleep(vm_page_t m, const char *msg)
756{
757
758	VM_OBJECT_ASSERT_WLOCKED(m->object);
759	if (mtx_owned(vm_page_lockptr(m)))
760		vm_page_unlock(m);
761
762	/*
763	 * It's possible that while we sleep, the page will get
764	 * unbusied and freed.  If we are holding the object
765	 * lock, we will assume we hold a reference to the object
766	 * such that even if m->object changes, we can re-lock
767	 * it.
768	 */
769	m->oflags |= VPO_WANTED;
770	VM_OBJECT_SLEEP(m->object, m, PVM, msg, 0);
771}
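
/*
 * A minimal, hypothetical sketch of the lookup-and-wait loop typically
 * built on vm_page_sleep(); "object" and "pindex" are illustrative, and the
 * lookup must be repeated because the object lock is dropped while
 * sleeping:
 *
 *	VM_OBJECT_WLOCK(object);
 *	while ((m = vm_page_lookup(object, pindex)) != NULL &&
 *	    ((m->oflags & VPO_BUSY) != 0 || m->busy != 0))
 *		vm_page_sleep(m, "pgwait");
 */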
772
773/*
774 *	vm_page_dirty_KBI:		[ internal use only ]
775 *
776 *	Set all bits in the page's dirty field.
777 *
778 *	The object containing the specified page must be locked if the
779 *	call is made from the machine-independent layer.
780 *
781 *	See vm_page_clear_dirty_mask().
782 *
783 *	This function should only be called by vm_page_dirty().
784 */
785void
786vm_page_dirty_KBI(vm_page_t m)
787{
788
789	/* These assertions refer to this operation by its public name. */
790	KASSERT((m->flags & PG_CACHED) == 0,
791	    ("vm_page_dirty: page in cache!"));
792	KASSERT(!VM_PAGE_IS_FREE(m),
793	    ("vm_page_dirty: page is free!"));
794	KASSERT(m->valid == VM_PAGE_BITS_ALL,
795	    ("vm_page_dirty: page is invalid!"));
796	m->dirty = VM_PAGE_BITS_ALL;
797}
798
799/*
800 *	vm_page_insert:		[ internal use only ]
801 *
802 *	Inserts the given mem entry into the object and object list.
803 *
804 *	The pagetables are not updated, but will presumably fault the page
805 *	in if necessary; if it is a kernel page, the caller will at some
806 *	point enter the page into the kernel's pmap.  We are not allowed to
807 *	sleep here, so we *can't* do this anyway.
808 *
809 *	The object must be locked.
810 */
811void
812vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
813{
814	vm_page_t mpred;
815
816	VM_OBJECT_ASSERT_WLOCKED(object);
817	mpred = vm_radix_lookup_le(&object->rtree, pindex);
818	vm_page_insert_after(m, object, pindex, mpred);
819}
820
821/*
822 *	vm_page_insert_after:
823 *
824 *	Inserts the page "m" into the specified object at offset "pindex".
825 *
826 *	The page "mpred" must immediately precede the offset "pindex" within
827 *	the specified object.
828 *
829 *	The object must be locked.
830 */
831static void
832vm_page_insert_after(vm_page_t m, vm_object_t object, vm_pindex_t pindex,
833    vm_page_t mpred)
834{
835	vm_page_t msucc;
836
837	VM_OBJECT_ASSERT_WLOCKED(object);
838	KASSERT(m->object == NULL,
839	    ("vm_page_insert_after: page already inserted"));
840	if (mpred != NULL) {
841		KASSERT(mpred->object == object ||
842		    (mpred->flags & PG_SLAB) != 0,
843		    ("vm_page_insert_after: object doesn't contain mpred"));
844		KASSERT(mpred->pindex < pindex,
845		    ("vm_page_insert_after: mpred doesn't precede pindex"));
846		msucc = TAILQ_NEXT(mpred, listq);
847	} else
848		msucc = TAILQ_FIRST(&object->memq);
849	if (msucc != NULL)
850		KASSERT(msucc->pindex > pindex,
851		    ("vm_page_insert_after: msucc doesn't succeed pindex"));
852
853	/*
854	 * Record the object/offset pair in this page
855	 */
856	m->object = object;
857	m->pindex = pindex;
858
859	/*
860	 * Now link into the object's ordered list of backed pages.
861	 */
862	if (mpred != NULL)
863		TAILQ_INSERT_AFTER(&object->memq, mpred, m, listq);
864	else
865		TAILQ_INSERT_HEAD(&object->memq, m, listq);
866	vm_radix_insert(&object->rtree, m);
867
868	/*
869	 * Show that the object has one more resident page.
870	 */
871	object->resident_page_count++;
872
873	/*
874	 * Hold the vnode until the last page is released.
875	 */
876	if (object->resident_page_count == 1 && object->type == OBJT_VNODE)
877		vhold(object->handle);
878
879	/*
880	 * Since we are inserting a new and possibly dirty page,
881	 * update the object's OBJ_MIGHTBEDIRTY flag.
882	 */
883	if (pmap_page_is_write_mapped(m))
884		vm_object_set_writeable_dirty(object);
885}
886
887/*
888 *	vm_page_remove:
889 *
890 *	Removes the given mem entry from the object/offset-page
891 *	table and the object page list, but do not invalidate/terminate
892 *	the backing store.
893 *
894 *	The underlying pmap entry (if any) is NOT removed here.
895 *
896 *	The object must be locked.  The page must be locked if it is managed.
897 */
898void
899vm_page_remove(vm_page_t m)
900{
901	vm_object_t object;
902
903	if ((m->oflags & VPO_UNMANAGED) == 0)
904		vm_page_lock_assert(m, MA_OWNED);
905	if ((object = m->object) == NULL)
906		return;
907	VM_OBJECT_ASSERT_WLOCKED(object);
908	if (m->oflags & VPO_BUSY) {
909		m->oflags &= ~VPO_BUSY;
910		vm_page_flash(m);
911	}
912
913	/*
914	 * Now remove from the object's list of backed pages.
915	 */
916	vm_radix_remove(&object->rtree, m->pindex);
917	TAILQ_REMOVE(&object->memq, m, listq);
918
919	/*
920	 * And show that the object has one fewer resident page.
921	 */
922	object->resident_page_count--;
923
924	/*
925	 * The vnode may now be recycled.
926	 */
927	if (object->resident_page_count == 0 && object->type == OBJT_VNODE)
928		vdrop(object->handle);
929
930	m->object = NULL;
931}
932
933/*
934 *	vm_page_lookup:
935 *
936 *	Returns the page associated with the object/offset
937 *	pair specified; if none is found, NULL is returned.
938 *
939 *	The object must be locked.
940 */
941vm_page_t
942vm_page_lookup(vm_object_t object, vm_pindex_t pindex)
943{
944
945	VM_OBJECT_ASSERT_LOCKED(object);
946	return (vm_radix_lookup(&object->rtree, pindex));
947}
948
949/*
950 *	vm_page_find_least:
951 *
952 *	Returns the page associated with the object with least pindex
953 *	greater than or equal to the parameter pindex, or NULL.
954 *
955 *	The object must be locked.
956 */
957vm_page_t
958vm_page_find_least(vm_object_t object, vm_pindex_t pindex)
959{
960	vm_page_t m;
961
962	VM_OBJECT_ASSERT_WLOCKED(object);
963	if ((m = TAILQ_FIRST(&object->memq)) != NULL && m->pindex < pindex)
964		m = vm_radix_lookup_ge(&object->rtree, pindex);
965	return (m);
966}
967
968/*
969 * Returns the given page's successor (by pindex) within the object if it is
970 * resident; if none is found, NULL is returned.
971 *
972 * The object must be locked.
973 */
974vm_page_t
975vm_page_next(vm_page_t m)
976{
977	vm_page_t next;
978
979	VM_OBJECT_ASSERT_WLOCKED(m->object);
980	if ((next = TAILQ_NEXT(m, listq)) != NULL &&
981	    next->pindex != m->pindex + 1)
982		next = NULL;
983	return (next);
984}
985
986/*
987 * Returns the given page's predecessor (by pindex) within the object if it is
988 * resident; if none is found, NULL is returned.
989 *
990 * The object must be locked.
991 */
992vm_page_t
993vm_page_prev(vm_page_t m)
994{
995	vm_page_t prev;
996
997	VM_OBJECT_ASSERT_WLOCKED(m->object);
998	if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL &&
999	    prev->pindex != m->pindex - 1)
1000		prev = NULL;
1001	return (prev);
1002}
1003
1004/*
1005 *	vm_page_rename:
1006 *
1007 *	Move the given memory entry from its
1008 *	current object to the specified target object/offset.
1009 *
1010 *	Note: swap associated with the page must be invalidated by the move.  We
1011 *	      have to do this for several reasons:  (1) we aren't freeing the
1012 *	      page, (2) we are dirtying the page, (3) the VM system is probably
1013 *	      moving the page from object A to B, and will then later move
1014 *	      the backing store from A to B and we can't have a conflict.
1015 *
1016 *	Note: we *always* dirty the page.  It is necessary both for the
1017 *	      fact that we moved it, and because we may be invalidating
1018 *	      swap.  If the page is in the cache, we have to deactivate it
1019 *	      or vm_page_dirty() will panic.  Dirty pages are not allowed
1020 *	      in the cache.
1021 *
1022 *	The objects must be locked.  The page must be locked if it is managed.
1023 */
1024void
1025vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex)
1026{
1027
1028	vm_page_remove(m);
1029	vm_page_insert(m, new_object, new_pindex);
1030	vm_page_dirty(m);
1031}
1032
1033/*
1034 *	Convert all of the given object's cached pages that have a
1035 *	pindex within the given range into free pages.  If the value
1036 *	zero is given for "end", then the range's upper bound is
1037 *	infinity.  If the given object is backed by a vnode and it
1038 *	transitions from having one or more cached pages to none, the
1039 *	vnode's hold count is reduced.
1040 */
1041void
1042vm_page_cache_free(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
1043{
1044	vm_page_t m;
1045	boolean_t empty;
1046
1047	mtx_lock(&vm_page_queue_free_mtx);
1048	if (__predict_false(vm_radix_is_empty(&object->cache))) {
1049		mtx_unlock(&vm_page_queue_free_mtx);
1050		return;
1051	}
1052	while ((m = vm_radix_lookup_ge(&object->cache, start)) != NULL) {
1053		if (end != 0 && m->pindex >= end)
1054			break;
1055		vm_radix_remove(&object->cache, m->pindex);
1056		m->object = NULL;
1057		m->valid = 0;
1058		/* Clear PG_CACHED and set PG_FREE. */
1059		m->flags ^= PG_CACHED | PG_FREE;
1060		KASSERT((m->flags & (PG_CACHED | PG_FREE)) == PG_FREE,
1061		    ("vm_page_cache_free: page %p has inconsistent flags", m));
1062		cnt.v_cache_count--;
1063		cnt.v_free_count++;
1064	}
1065	empty = vm_radix_is_empty(&object->cache);
1066	mtx_unlock(&vm_page_queue_free_mtx);
1067	if (object->type == OBJT_VNODE && empty)
1068		vdrop(object->handle);
1069}
1070
1071/*
1072 *	Returns the cached page that is associated with the given
1073 *	object and offset.  If, however, none exists, returns NULL.
1074 *
1075 *	The free page queue must be locked.
1076 */
1077static inline vm_page_t
1078vm_page_cache_lookup(vm_object_t object, vm_pindex_t pindex)
1079{
1080
1081	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
1082	return (vm_radix_lookup(&object->cache, pindex));
1083}
1084
1085/*
1086 *	Remove the given cached page from its containing object's
1087 *	collection of cached pages.
1088 *
1089 *	The free page queue must be locked.
1090 */
1091static void
1092vm_page_cache_remove(vm_page_t m)
1093{
1094
1095	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
1096	KASSERT((m->flags & PG_CACHED) != 0,
1097	    ("vm_page_cache_remove: page %p is not cached", m));
1098	vm_radix_remove(&m->object->cache, m->pindex);
1099	m->object = NULL;
1100	cnt.v_cache_count--;
1101}
1102
1103/*
1104 *	Transfer all of the cached pages with offset greater than or
1105 *	equal to 'offidxstart' from the original object's cache to the
1106 *	new object's cache.  However, any cached pages with offset
1107 *	greater than or equal to the new object's size are kept in the
1108 *	original object.  Initially, the new object's cache must be
1109 *	empty.  Offset 'offidxstart' in the original object must
1110 *	correspond to offset zero in the new object.
1111 *
1112 *	The new object must be locked.
1113 */
1114void
1115vm_page_cache_transfer(vm_object_t orig_object, vm_pindex_t offidxstart,
1116    vm_object_t new_object)
1117{
1118	vm_page_t m;
1119
1120	/*
1121	 * Insertion into an object's collection of cached pages
1122	 * requires the object to be locked.  In contrast, removal does
1123	 * not.
1124	 */
1125	VM_OBJECT_ASSERT_WLOCKED(new_object);
1126	KASSERT(vm_radix_is_empty(&new_object->cache),
1127	    ("vm_page_cache_transfer: object %p has cached pages",
1128	    new_object));
1129	mtx_lock(&vm_page_queue_free_mtx);
1130	while ((m = vm_radix_lookup_ge(&orig_object->cache,
1131	    offidxstart)) != NULL) {
1132		/*
1133		 * Transfer all of the pages with offset greater than or
1134		 * equal to 'offidxstart' from the original object's
1135		 * cache to the new object's cache.
1136		 */
1137		if ((m->pindex - offidxstart) >= new_object->size)
1138			break;
1139		vm_radix_remove(&orig_object->cache, m->pindex);
1140		/* Update the page's object and offset. */
1141		m->object = new_object;
1142		m->pindex -= offidxstart;
1143		vm_radix_insert(&new_object->cache, m);
1144	}
1145	mtx_unlock(&vm_page_queue_free_mtx);
1146}
1147
1148/*
1149 *	Returns TRUE if a cached page is associated with the given object and
1150 *	offset, and FALSE otherwise.
1151 *
1152 *	The object must be locked.
1153 */
1154boolean_t
1155vm_page_is_cached(vm_object_t object, vm_pindex_t pindex)
1156{
1157	vm_page_t m;
1158
1159	/*
1160	 * Insertion into an object's collection of cached pages requires the
1161	 * object to be locked.  Therefore, if the object is locked and the
1162	 * object's collection is empty, there is no need to acquire the free
1163	 * page queues lock in order to prove that the specified page doesn't
1164	 * exist.
1165	 */
1166	VM_OBJECT_ASSERT_WLOCKED(object);
1167	if (__predict_true(vm_object_cache_is_empty(object)))
1168		return (FALSE);
1169	mtx_lock(&vm_page_queue_free_mtx);
1170	m = vm_page_cache_lookup(object, pindex);
1171	mtx_unlock(&vm_page_queue_free_mtx);
1172	return (m != NULL);
1173}
1174
1175/*
1176 *	vm_page_alloc:
1177 *
1178 *	Allocate and return a page that is associated with the specified
1179 *	object and offset pair.  By default, this page has the flag VPO_BUSY
1180 *	set.
1181 *
1182 *	The caller must always specify an allocation class.
1183 *
1184 *	allocation classes:
1185 *	VM_ALLOC_NORMAL		normal process request
1186 *	VM_ALLOC_SYSTEM		system *really* needs a page
1187 *	VM_ALLOC_INTERRUPT	interrupt time request
1188 *
1189 *	optional allocation flags:
1190 *	VM_ALLOC_COUNT(number)	the number of additional pages that the caller
1191 *				intends to allocate
1192 *	VM_ALLOC_IFCACHED	return page only if it is cached
1193 *	VM_ALLOC_IFNOTCACHED	return NULL, do not reactivate if the page
1194 *				is cached
1195 *	VM_ALLOC_NOBUSY		do not set the flag VPO_BUSY on the page
1196 *	VM_ALLOC_NODUMP		do not include the page in a kernel core dump
1197 *	VM_ALLOC_NOOBJ		page is not associated with an object and
1198 *				should not have the flag VPO_BUSY set
1199 *	VM_ALLOC_WIRED		wire the allocated page
1200 *	VM_ALLOC_ZERO		prefer a zeroed page
1201 *
1202 *	This routine may not sleep.
1203 */
1204vm_page_t
1205vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req)
1206{
1207	struct vnode *vp = NULL;
1208	vm_object_t m_object;
1209	vm_page_t m, mpred;
1210	int flags, req_class;
1211
1212	mpred = NULL;	/* XXX: pacify gcc */
1213	KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0),
1214	    ("vm_page_alloc: inconsistent object/req"));
1215	if (object != NULL)
1216		VM_OBJECT_ASSERT_WLOCKED(object);
1217
1218	req_class = req & VM_ALLOC_CLASS_MASK;
1219
1220	/*
1221	 * The page daemon is allowed to dig deeper into the free page list.
1222	 */
1223	if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT)
1224		req_class = VM_ALLOC_SYSTEM;
1225
1226	if (object != NULL) {
1227		mpred = vm_radix_lookup_le(&object->rtree, pindex);
1228		KASSERT(mpred == NULL || mpred->pindex != pindex,
1229		   ("vm_page_alloc: pindex already allocated"));
1230	}
1231	mtx_lock(&vm_page_queue_free_mtx);
1232	if (cnt.v_free_count + cnt.v_cache_count > cnt.v_free_reserved ||
1233	    (req_class == VM_ALLOC_SYSTEM &&
1234	    cnt.v_free_count + cnt.v_cache_count > cnt.v_interrupt_free_min) ||
1235	    (req_class == VM_ALLOC_INTERRUPT &&
1236	    cnt.v_free_count + cnt.v_cache_count > 0)) {
1237		/*
1238		 * Allocate from the free queue if the number of free pages
1239		 * exceeds the minimum for the request class.
1240		 */
1241		if (object != NULL &&
1242		    (m = vm_page_cache_lookup(object, pindex)) != NULL) {
1243			if ((req & VM_ALLOC_IFNOTCACHED) != 0) {
1244				mtx_unlock(&vm_page_queue_free_mtx);
1245				return (NULL);
1246			}
1247			if (vm_phys_unfree_page(m))
1248				vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m, 0);
1249#if VM_NRESERVLEVEL > 0
1250			else if (!vm_reserv_reactivate_page(m))
1251#else
1252			else
1253#endif
1254				panic("vm_page_alloc: cache page %p is missing"
1255				    " from the free queue", m);
1256		} else if ((req & VM_ALLOC_IFCACHED) != 0) {
1257			mtx_unlock(&vm_page_queue_free_mtx);
1258			return (NULL);
1259#if VM_NRESERVLEVEL > 0
1260		} else if (object == NULL || (object->flags & (OBJ_COLORED |
1261		    OBJ_FICTITIOUS)) != OBJ_COLORED || (m =
1262		    vm_reserv_alloc_page(object, pindex, mpred)) == NULL) {
1263#else
1264		} else {
1265#endif
1266			m = vm_phys_alloc_pages(object != NULL ?
1267			    VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT, 0);
1268#if VM_NRESERVLEVEL > 0
1269			if (m == NULL && vm_reserv_reclaim_inactive()) {
1270				m = vm_phys_alloc_pages(object != NULL ?
1271				    VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT,
1272				    0);
1273			}
1274#endif
1275		}
1276	} else {
1277		/*
1278		 * Not allocatable, give up.
1279		 */
1280		mtx_unlock(&vm_page_queue_free_mtx);
1281		atomic_add_int(&vm_pageout_deficit,
1282		    max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1));
1283		pagedaemon_wakeup();
1284		return (NULL);
1285	}
1286
1287	/*
1288	 *  At this point we had better have found a good page.
1289	 */
1290	KASSERT(m != NULL, ("vm_page_alloc: missing page"));
1291	KASSERT(m->queue == PQ_NONE,
1292	    ("vm_page_alloc: page %p has unexpected queue %d", m, m->queue));
1293	KASSERT(m->wire_count == 0, ("vm_page_alloc: page %p is wired", m));
1294	KASSERT(m->hold_count == 0, ("vm_page_alloc: page %p is held", m));
1295	KASSERT(m->busy == 0, ("vm_page_alloc: page %p is busy", m));
1296	KASSERT(m->dirty == 0, ("vm_page_alloc: page %p is dirty", m));
1297	KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT,
1298	    ("vm_page_alloc: page %p has unexpected memattr %d", m,
1299	    pmap_page_get_memattr(m)));
1300	if ((m->flags & PG_CACHED) != 0) {
1301		KASSERT((m->flags & PG_ZERO) == 0,
1302		    ("vm_page_alloc: cached page %p is PG_ZERO", m));
1303		KASSERT(m->valid != 0,
1304		    ("vm_page_alloc: cached page %p is invalid", m));
1305		if (m->object == object && m->pindex == pindex)
1306	  		cnt.v_reactivated++;
1307		else
1308			m->valid = 0;
1309		m_object = m->object;
1310		vm_page_cache_remove(m);
1311		if (m_object->type == OBJT_VNODE &&
1312		    vm_object_cache_is_empty(m_object))
1313			vp = m_object->handle;
1314	} else {
1315		KASSERT(VM_PAGE_IS_FREE(m),
1316		    ("vm_page_alloc: page %p is not free", m));
1317		KASSERT(m->valid == 0,
1318		    ("vm_page_alloc: free page %p is valid", m));
1319		cnt.v_free_count--;
1320	}
1321
1322	/*
1323	 * Only the PG_ZERO flag is inherited.  The PG_CACHED or PG_FREE flag
1324	 * must be cleared before the free page queues lock is released.
1325	 */
1326	flags = 0;
1327	if (m->flags & PG_ZERO) {
1328		vm_page_zero_count--;
1329		if (req & VM_ALLOC_ZERO)
1330			flags = PG_ZERO;
1331	}
1332	if (req & VM_ALLOC_NODUMP)
1333		flags |= PG_NODUMP;
1334	m->flags = flags;
1335	mtx_unlock(&vm_page_queue_free_mtx);
1336	m->aflags = 0;
1337	m->oflags = object == NULL || (object->flags & OBJ_UNMANAGED) != 0 ?
1338	    VPO_UNMANAGED : 0;
1339	if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ)) == 0)
1340		m->oflags |= VPO_BUSY;
1341	if (req & VM_ALLOC_WIRED) {
1342		/*
1343		 * The page lock is not required for wiring a page until that
1344		 * page is inserted into the object.
1345		 */
1346		atomic_add_int(&cnt.v_wire_count, 1);
1347		m->wire_count = 1;
1348	}
1349	m->act_count = 0;
1350
1351	if (object != NULL) {
1352		/* Ignore device objects; the pager sets "memattr" for them. */
1353		if (object->memattr != VM_MEMATTR_DEFAULT &&
1354		    (object->flags & OBJ_FICTITIOUS) == 0)
1355			pmap_page_set_memattr(m, object->memattr);
1356		vm_page_insert_after(m, object, pindex, mpred);
1357	} else
1358		m->pindex = pindex;
1359
1360	/*
1361	 * The following call to vdrop() must come after the above call
1362	 * to vm_page_insert() in case both affect the same object and
1363	 * vnode.  Otherwise, the affected vnode's hold count could
1364	 * temporarily become zero.
1365	 */
1366	if (vp != NULL)
1367		vdrop(vp);
1368
1369	/*
1370	 * Don't wakeup too often - wakeup the pageout daemon when
1371	 * we would be nearly out of memory.
1372	 */
1373	if (vm_paging_needed())
1374		pagedaemon_wakeup();
1375
1376	return (m);
1377}
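
/*
 * A minimal, hypothetical example of a single allocation using the classes
 * and flags documented above; "obj" and "pindex" are illustrative only:
 *
 *	VM_OBJECT_WLOCK(obj);
 *	m = vm_page_alloc(obj, pindex, VM_ALLOC_NORMAL | VM_ALLOC_WIRED |
 *	    VM_ALLOC_ZERO);
 *	VM_OBJECT_WUNLOCK(obj);
 *	if (m != NULL && (m->flags & PG_ZERO) == 0)
 *		pmap_zero_page(m);	(VM_ALLOC_ZERO only *prefers* zeroed)
 */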
1378
1379/*
1380 *	vm_page_alloc_contig:
1381 *
1382 *	Allocate a contiguous set of physical pages of the given size "npages"
1383 *	from the free lists.  All of the physical pages must be at or above
1384 *	the given physical address "low" and below the given physical address
1385 *	"high".  The given value "alignment" determines the alignment of the
1386 *	first physical page in the set.  If the given value "boundary" is
1387 *	non-zero, then the set of physical pages cannot cross any physical
1388 *	address boundary that is a multiple of that value.  Both "alignment"
1389 *	and "boundary" must be a power of two.
1390 *
1391 *	If the specified memory attribute, "memattr", is VM_MEMATTR_DEFAULT,
1392 *	then the memory attribute setting for the physical pages is configured
1393 *	to the object's memory attribute setting.  Otherwise, the memory
1394 *	attribute setting for the physical pages is configured to "memattr",
1395 *	overriding the object's memory attribute setting.  However, if the
1396 *	object's memory attribute setting is not VM_MEMATTR_DEFAULT, then the
1397 *	memory attribute setting for the physical pages cannot be configured
1398 *	to VM_MEMATTR_DEFAULT.
1399 *
1400 *	The caller must always specify an allocation class.
1401 *
1402 *	allocation classes:
1403 *	VM_ALLOC_NORMAL		normal process request
1404 *	VM_ALLOC_SYSTEM		system *really* needs a page
1405 *	VM_ALLOC_INTERRUPT	interrupt time request
1406 *
1407 *	optional allocation flags:
1408 *	VM_ALLOC_NOBUSY		do not set the flag VPO_BUSY on the page
1409 *	VM_ALLOC_NOOBJ		page is not associated with an object and
1410 *				should not have the flag VPO_BUSY set
1411 *	VM_ALLOC_WIRED		wire the allocated page
1412 *	VM_ALLOC_ZERO		prefer a zeroed page
1413 *
1414 *	This routine may not sleep.
1415 */
1416vm_page_t
1417vm_page_alloc_contig(vm_object_t object, vm_pindex_t pindex, int req,
1418    u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment,
1419    vm_paddr_t boundary, vm_memattr_t memattr)
1420{
1421	struct vnode *drop;
1422	vm_page_t deferred_vdrop_list, m, m_ret;
1423	u_int flags, oflags;
1424	int req_class;
1425
1426	KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0),
1427	    ("vm_page_alloc_contig: inconsistent object/req"));
1428	if (object != NULL) {
1429		VM_OBJECT_ASSERT_WLOCKED(object);
1430		KASSERT(object->type == OBJT_PHYS,
1431		    ("vm_page_alloc_contig: object %p isn't OBJT_PHYS",
1432		    object));
1433	}
1434	KASSERT(npages > 0, ("vm_page_alloc_contig: npages is zero"));
1435	req_class = req & VM_ALLOC_CLASS_MASK;
1436
1437	/*
1438	 * The page daemon is allowed to dig deeper into the free page list.
1439	 */
1440	if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT)
1441		req_class = VM_ALLOC_SYSTEM;
1442
1443	deferred_vdrop_list = NULL;
1444	mtx_lock(&vm_page_queue_free_mtx);
1445	if (cnt.v_free_count + cnt.v_cache_count >= npages +
1446	    cnt.v_free_reserved || (req_class == VM_ALLOC_SYSTEM &&
1447	    cnt.v_free_count + cnt.v_cache_count >= npages +
1448	    cnt.v_interrupt_free_min) || (req_class == VM_ALLOC_INTERRUPT &&
1449	    cnt.v_free_count + cnt.v_cache_count >= npages)) {
1450#if VM_NRESERVLEVEL > 0
1451retry:
1452		if (object == NULL || (object->flags & OBJ_COLORED) == 0 ||
1453		    (m_ret = vm_reserv_alloc_contig(object, pindex, npages,
1454		    low, high, alignment, boundary)) == NULL)
1455#endif
1456			m_ret = vm_phys_alloc_contig(npages, low, high,
1457			    alignment, boundary);
1458	} else {
1459		mtx_unlock(&vm_page_queue_free_mtx);
1460		atomic_add_int(&vm_pageout_deficit, npages);
1461		pagedaemon_wakeup();
1462		return (NULL);
1463	}
1464	if (m_ret != NULL)
1465		for (m = m_ret; m < &m_ret[npages]; m++) {
1466			drop = vm_page_alloc_init(m);
1467			if (drop != NULL) {
1468				/*
1469				 * Enqueue the vnode for deferred vdrop().
1470				 *
1471				 * Once the pages are removed from the free
1472				 * page list, "pageq" can be safely abused to
1473				 * construct a short-lived list of vnodes.
1474				 */
1475				m->pageq.tqe_prev = (void *)drop;
1476				m->pageq.tqe_next = deferred_vdrop_list;
1477				deferred_vdrop_list = m;
1478			}
1479		}
1480	else {
1481#if VM_NRESERVLEVEL > 0
1482		if (vm_reserv_reclaim_contig(npages, low, high, alignment,
1483		    boundary))
1484			goto retry;
1485#endif
1486	}
1487	mtx_unlock(&vm_page_queue_free_mtx);
1488	if (m_ret == NULL)
1489		return (NULL);
1490
1491	/*
1492	 * Initialize the pages.  Only the PG_ZERO flag is inherited.
1493	 */
1494	flags = 0;
1495	if ((req & VM_ALLOC_ZERO) != 0)
1496		flags = PG_ZERO;
1497	if ((req & VM_ALLOC_NODUMP) != 0)
1498		flags |= PG_NODUMP;
1499	if ((req & VM_ALLOC_WIRED) != 0)
1500		atomic_add_int(&cnt.v_wire_count, npages);
1501	oflags = VPO_UNMANAGED;
1502	if (object != NULL) {
1503		if ((req & VM_ALLOC_NOBUSY) == 0)
1504			oflags |= VPO_BUSY;
1505		if (object->memattr != VM_MEMATTR_DEFAULT &&
1506		    memattr == VM_MEMATTR_DEFAULT)
1507			memattr = object->memattr;
1508	}
1509	for (m = m_ret; m < &m_ret[npages]; m++) {
1510		m->aflags = 0;
1511		m->flags = (m->flags | PG_NODUMP) & flags;
1512		if ((req & VM_ALLOC_WIRED) != 0)
1513			m->wire_count = 1;
1514		/* Unmanaged pages don't use "act_count". */
1515		m->oflags = oflags;
1516		if (memattr != VM_MEMATTR_DEFAULT)
1517			pmap_page_set_memattr(m, memattr);
1518		if (object != NULL)
1519			vm_page_insert(m, object, pindex);
1520		else
1521			m->pindex = pindex;
1522		pindex++;
1523	}
1524	while (deferred_vdrop_list != NULL) {
1525		vdrop((struct vnode *)deferred_vdrop_list->pageq.tqe_prev);
1526		deferred_vdrop_list = deferred_vdrop_list->pageq.tqe_next;
1527	}
1528	if (vm_paging_needed())
1529		pagedaemon_wakeup();
1530	return (m_ret);
1531}
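
/*
 * A hypothetical example of a contiguous allocation as documented above:
 * sixteen wired, unmanaged pages (assuming 4KB pages) below 4GB, aligned
 * to the size of the run and not crossing a 64KB boundary; all of the
 * numbers are illustrative only:
 *
 *	m = vm_page_alloc_contig(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
 *	    VM_ALLOC_WIRED, 16, 0, (vm_paddr_t)1 << 32, 16 * PAGE_SIZE,
 *	    64 * 1024, VM_MEMATTR_DEFAULT);
 */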
1532
1533/*
1534 * Initialize a page that has been freshly dequeued from a freelist.
1535 * The caller has to drop the vnode returned, if it is not NULL.
1536 *
1537 * This function may only be used to initialize unmanaged pages.
1538 *
1539 * To be called with vm_page_queue_free_mtx held.
1540 */
1541static struct vnode *
1542vm_page_alloc_init(vm_page_t m)
1543{
1544	struct vnode *drop;
1545	vm_object_t m_object;
1546
1547	KASSERT(m->queue == PQ_NONE,
1548	    ("vm_page_alloc_init: page %p has unexpected queue %d",
1549	    m, m->queue));
1550	KASSERT(m->wire_count == 0,
1551	    ("vm_page_alloc_init: page %p is wired", m));
1552	KASSERT(m->hold_count == 0,
1553	    ("vm_page_alloc_init: page %p is held", m));
1554	KASSERT(m->busy == 0,
1555	    ("vm_page_alloc_init: page %p is busy", m));
1556	KASSERT(m->dirty == 0,
1557	    ("vm_page_alloc_init: page %p is dirty", m));
1558	KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT,
1559	    ("vm_page_alloc_init: page %p has unexpected memattr %d",
1560	    m, pmap_page_get_memattr(m)));
1561	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
1562	drop = NULL;
1563	if ((m->flags & PG_CACHED) != 0) {
1564		KASSERT((m->flags & PG_ZERO) == 0,
1565		    ("vm_page_alloc_init: cached page %p is PG_ZERO", m));
1566		m->valid = 0;
1567		m_object = m->object;
1568		vm_page_cache_remove(m);
1569		if (m_object->type == OBJT_VNODE &&
1570		    vm_object_cache_is_empty(m_object))
1571			drop = m_object->handle;
1572	} else {
1573		KASSERT(VM_PAGE_IS_FREE(m),
1574		    ("vm_page_alloc_init: page %p is not free", m));
1575		KASSERT(m->valid == 0,
1576		    ("vm_page_alloc_init: free page %p is valid", m));
1577		cnt.v_free_count--;
1578		if ((m->flags & PG_ZERO) != 0)
1579			vm_page_zero_count--;
1580	}
1581	/* Don't clear the PG_ZERO flag; we'll need it later. */
1582	m->flags &= PG_ZERO;
1583	return (drop);
1584}
1585
1586/*
1587 * 	vm_page_alloc_freelist:
1588 *
1589 *	Allocate a physical page from the specified free page list.
1590 *
1591 *	The caller must always specify an allocation class.
1592 *
1593 *	allocation classes:
1594 *	VM_ALLOC_NORMAL		normal process request
1595 *	VM_ALLOC_SYSTEM		system *really* needs a page
1596 *	VM_ALLOC_INTERRUPT	interrupt time request
1597 *
1598 *	optional allocation flags:
1599 *	VM_ALLOC_COUNT(number)	the number of additional pages that the caller
1600 *				intends to allocate
1601 *	VM_ALLOC_WIRED		wire the allocated page
1602 *	VM_ALLOC_ZERO		prefer a zeroed page
1603 *
1604 *	This routine may not sleep.
1605 */
1606vm_page_t
1607vm_page_alloc_freelist(int flind, int req)
1608{
1609	struct vnode *drop;
1610	vm_page_t m;
1611	u_int flags;
1612	int req_class;
1613
1614	req_class = req & VM_ALLOC_CLASS_MASK;
1615
1616	/*
1617	 * The page daemon is allowed to dig deeper into the free page list.
1618	 */
1619	if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT)
1620		req_class = VM_ALLOC_SYSTEM;
1621
1622	/*
1623	 * Do not allocate reserved pages unless the req has asked for it.
1624	 */
1625	mtx_lock(&vm_page_queue_free_mtx);
1626	if (cnt.v_free_count + cnt.v_cache_count > cnt.v_free_reserved ||
1627	    (req_class == VM_ALLOC_SYSTEM &&
1628	    cnt.v_free_count + cnt.v_cache_count > cnt.v_interrupt_free_min) ||
1629	    (req_class == VM_ALLOC_INTERRUPT &&
1630	    cnt.v_free_count + cnt.v_cache_count > 0))
1631		m = vm_phys_alloc_freelist_pages(flind, VM_FREEPOOL_DIRECT, 0);
1632	else {
1633		mtx_unlock(&vm_page_queue_free_mtx);
1634		atomic_add_int(&vm_pageout_deficit,
1635		    max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1));
1636		pagedaemon_wakeup();
1637		return (NULL);
1638	}
1639	if (m == NULL) {
1640		mtx_unlock(&vm_page_queue_free_mtx);
1641		return (NULL);
1642	}
1643	drop = vm_page_alloc_init(m);
1644	mtx_unlock(&vm_page_queue_free_mtx);
1645
1646	/*
1647	 * Initialize the page.  Only the PG_ZERO flag is inherited.
1648	 */
1649	m->aflags = 0;
1650	flags = 0;
1651	if ((req & VM_ALLOC_ZERO) != 0)
1652		flags = PG_ZERO;
1653	m->flags &= flags;
1654	if ((req & VM_ALLOC_WIRED) != 0) {
1655		/*
1656		 * The page lock is not required for wiring a page that does
1657		 * not belong to an object.
1658		 */
1659		atomic_add_int(&cnt.v_wire_count, 1);
1660		m->wire_count = 1;
1661	}
1662	/* Unmanaged pages don't use "act_count". */
1663	m->oflags = VPO_UNMANAGED;
1664	if (drop != NULL)
1665		vdrop(drop);
1666	if (vm_paging_needed())
1667		pagedaemon_wakeup();
1668	return (m);
1669}
1670
1671/*
1672 *	vm_wait:	(also see VM_WAIT macro)
1673 *
1674 *	Sleep until free pages are available for allocation.
1675 *	- Called in various places before memory allocations.
1676 */
1677void
1678vm_wait(void)
1679{
1680
1681	mtx_lock(&vm_page_queue_free_mtx);
1682	if (curproc == pageproc) {
1683		vm_pageout_pages_needed = 1;
1684		msleep(&vm_pageout_pages_needed, &vm_page_queue_free_mtx,
1685		    PDROP | PSWP, "VMWait", 0);
1686	} else {
1687		if (!vm_pages_needed) {
1688			vm_pages_needed = 1;
1689			wakeup(&vm_pages_needed);
1690		}
1691		msleep(&cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | PVM,
1692		    "vmwait", 0);
1693	}
1694}
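
/*
 * A hypothetical sketch of the usual retry loop built on the VM_WAIT macro
 * mentioned above; "obj" and "pindex" are illustrative, and the object lock
 * must be dropped across the sleep:
 *
 *	while ((m = vm_page_alloc(obj, pindex, VM_ALLOC_NORMAL)) == NULL) {
 *		VM_OBJECT_WUNLOCK(obj);
 *		VM_WAIT;
 *		VM_OBJECT_WLOCK(obj);
 *	}
 */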
1695
1696/*
1697 *	vm_waitpfault:	(also see VM_WAITPFAULT macro)
1698 *
1699 *	Sleep until free pages are available for allocation.
1700 *	- Called only in vm_fault so that processes page faulting
1701 *	  can be easily tracked.
1702 *	- Sleeps at a lower priority than vm_wait() so that vm_wait()ing
1703 *	  processes will be able to grab memory first.  Do not change
1704 *	  this balance without careful testing first.
1705 */
1706void
1707vm_waitpfault(void)
1708{
1709
1710	mtx_lock(&vm_page_queue_free_mtx);
1711	if (!vm_pages_needed) {
1712		vm_pages_needed = 1;
1713		wakeup(&vm_pages_needed);
1714	}
1715	msleep(&cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | PUSER,
1716	    "pfault", 0);
1717}
1718
1719/*
1720 *	vm_page_dequeue:
1721 *
1722 *	Remove the given page from its current page queue.
1723 *
1724 *	The page must be locked.
1725 */
1726void
1727vm_page_dequeue(vm_page_t m)
1728{
1729	struct vm_pagequeue *pq;
1730
1731	vm_page_lock_assert(m, MA_OWNED);
1732	KASSERT(m->queue != PQ_NONE,
1733	    ("vm_page_dequeue: page %p is not queued", m));
1734	pq = &vm_pagequeues[m->queue];
1735	vm_pagequeue_lock(pq);
1736	m->queue = PQ_NONE;
1737	TAILQ_REMOVE(&pq->pq_pl, m, pageq);
1738	(*pq->pq_cnt)--;
1739	vm_pagequeue_unlock(pq);
1740}
1741
1742/*
1743 *	vm_page_dequeue_locked:
1744 *
1745 *	Remove the given page from its current page queue.
1746 *
1747 *	The page and page queue must be locked.
1748 */
1749void
1750vm_page_dequeue_locked(vm_page_t m)
1751{
1752	struct vm_pagequeue *pq;
1753
1754	vm_page_lock_assert(m, MA_OWNED);
1755	pq = &vm_pagequeues[m->queue];
1756	vm_pagequeue_assert_locked(pq);
1757	m->queue = PQ_NONE;
1758	TAILQ_REMOVE(&pq->pq_pl, m, pageq);
1759	(*pq->pq_cnt)--;
1760}
1761
1762/*
1763 *	vm_page_enqueue:
1764 *
1765 *	Add the given page to the specified page queue.
1766 *
1767 *	The page must be locked.
1768 */
1769static void
1770vm_page_enqueue(int queue, vm_page_t m)
1771{
1772	struct vm_pagequeue *pq;
1773
1774	vm_page_lock_assert(m, MA_OWNED);
1775	pq = &vm_pagequeues[queue];
1776	vm_pagequeue_lock(pq);
1777	m->queue = queue;
1778	TAILQ_INSERT_TAIL(&pq->pq_pl, m, pageq);
1779	++*pq->pq_cnt;
1780	vm_pagequeue_unlock(pq);
1781}
1782
1783/*
1784 *	vm_page_requeue:
1785 *
1786 *	Move the given page to the tail of its current page queue.
1787 *
1788 *	The page must be locked.
1789 */
1790void
1791vm_page_requeue(vm_page_t m)
1792{
1793	struct vm_pagequeue *pq;
1794
1795	vm_page_lock_assert(m, MA_OWNED);
1796	KASSERT(m->queue != PQ_NONE,
1797	    ("vm_page_requeue: page %p is not queued", m));
1798	pq = &vm_pagequeues[m->queue];
1799	vm_pagequeue_lock(pq);
1800	TAILQ_REMOVE(&pq->pq_pl, m, pageq);
1801	TAILQ_INSERT_TAIL(&pq->pq_pl, m, pageq);
1802	vm_pagequeue_unlock(pq);
1803}
1804
1805/*
1806 *	vm_page_requeue_locked:
1807 *
1808 *	Move the given page to the tail of its current page queue.
1809 *
1810 *	The page queue must be locked.
1811 */
1812void
1813vm_page_requeue_locked(vm_page_t m)
1814{
1815	struct vm_pagequeue *pq;
1816
1817	KASSERT(m->queue != PQ_NONE,
1818	    ("vm_page_requeue_locked: page %p is not queued", m));
1819	pq = &vm_pagequeues[m->queue];
1820	vm_pagequeue_assert_locked(pq);
1821	TAILQ_REMOVE(&pq->pq_pl, m, pageq);
1822	TAILQ_INSERT_TAIL(&pq->pq_pl, m, pageq);
1823}
1824
1825/*
1826 *	vm_page_activate:
1827 *
1828 *	Put the specified page on the active list (if appropriate).
1829 *	Ensure that act_count is at least ACT_INIT but do not otherwise
1830 *	mess with it.
1831 *
1832 *	The page must be locked.
1833 */
1834void
1835vm_page_activate(vm_page_t m)
1836{
1837	int queue;
1838
1839	vm_page_lock_assert(m, MA_OWNED);
1840	VM_OBJECT_ASSERT_WLOCKED(m->object);
1841	if ((queue = m->queue) != PQ_ACTIVE) {
1842		if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) {
1843			if (m->act_count < ACT_INIT)
1844				m->act_count = ACT_INIT;
1845			if (queue != PQ_NONE)
1846				vm_page_dequeue(m);
1847			vm_page_enqueue(PQ_ACTIVE, m);
1848		} else
1849			KASSERT(queue == PQ_NONE,
1850			    ("vm_page_activate: wired page %p is queued", m));
1851	} else {
1852		if (m->act_count < ACT_INIT)
1853			m->act_count = ACT_INIT;
1854	}
1855}
1856
1857/*
1858 *	vm_page_free_wakeup:
1859 *
1860 *	Helper routine for vm_page_free_toq() and vm_page_cache().  This
1861 *	routine is called when a page has been added to the cache or free
1862 *	queues.
1863 *
1864 *	The page queues must be locked.
1865 */
1866static inline void
1867vm_page_free_wakeup(void)
1868{
1869
1870	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
1871	/*
1872	 * If the pageout daemon needs pages, then tell it that there
1873	 * are some free.
1874	 */
1875	if (vm_pageout_pages_needed &&
1876	    cnt.v_cache_count + cnt.v_free_count >= cnt.v_pageout_free_min) {
1877		wakeup(&vm_pageout_pages_needed);
1878		vm_pageout_pages_needed = 0;
1879	}
1880	/*
1881	 * wakeup processes that are waiting on memory if we hit a
1882	 * high water mark. And wakeup scheduler process if we have
1883	 * lots of memory. this process will swapin processes.
1884	 */
1885	if (vm_pages_needed && !vm_page_count_min()) {
1886		vm_pages_needed = 0;
1887		wakeup(&cnt.v_free_count);
1888	}
1889}
1890
1891/*
1892 *	vm_page_free_toq:
1893 *
1894 *	Returns the given page to the free list,
1895 *	disassociating it from any VM object.
1896 *
1897 *	The object must be locked.  The page must be locked if it is managed.
1898 */
1899void
1900vm_page_free_toq(vm_page_t m)
1901{
1902
1903	if ((m->oflags & VPO_UNMANAGED) == 0) {
1904		vm_page_lock_assert(m, MA_OWNED);
1905		KASSERT(!pmap_page_is_mapped(m),
1906		    ("vm_page_free_toq: freeing mapped page %p", m));
1907	} else
1908		KASSERT(m->queue == PQ_NONE,
1909		    ("vm_page_free_toq: unmanaged page %p is queued", m));
1910	PCPU_INC(cnt.v_tfree);
1911
1912	if (VM_PAGE_IS_FREE(m))
1913		panic("vm_page_free: freeing free page %p", m);
1914	else if (m->busy != 0)
1915		panic("vm_page_free: freeing busy page %p", m);
1916
1917	/*
1918	 * Unqueue, then remove page.  Note that we cannot destroy
1919	 * the page here because we do not want to call the pager's
1920	 * callback routine until after we've put the page on the
1921	 * appropriate free queue.
1922	 */
1923	vm_page_remque(m);
1924	vm_page_remove(m);
1925
1926	/*
1927	 * If the page is fictitious, it is never returned to the free
1928	 * queues, so there is nothing more to do.
1929	 */
1930	if ((m->flags & PG_FICTITIOUS) != 0) {
1931		return;
1932	}
1933
1934	m->valid = 0;
1935	vm_page_undirty(m);
1936
1937	if (m->wire_count != 0)
1938		panic("vm_page_free: freeing wired page %p", m);
1939	if (m->hold_count != 0) {
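		/*
		 * The page is still held: flag it so that the final
		 * vm_page_unhold() completes the free.
		 */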
1940		m->flags &= ~PG_ZERO;
1941		KASSERT((m->flags & PG_UNHOLDFREE) == 0,
1942		    ("vm_page_free: freeing PG_UNHOLDFREE page %p", m));
1943		m->flags |= PG_UNHOLDFREE;
1944	} else {
1945		/*
1946		 * Restore the default memory attribute to the page.
1947		 */
1948		if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT)
1949			pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT);
1950
1951		/*
1952		 * Insert the page into the physical memory allocator's
1953		 * cache/free page queues.
1954		 */
1955		mtx_lock(&vm_page_queue_free_mtx);
1956		m->flags |= PG_FREE;
1957		cnt.v_free_count++;
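		/*
		 * Prefer returning the page to its reservation, if any;
		 * otherwise hand it to the physical memory allocator.
		 */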
1958#if VM_NRESERVLEVEL > 0
1959		if (!vm_reserv_free_page(m))
1960#else
1961		if (TRUE)
1962#endif
1963			vm_phys_free_pages(m, 0);
1964		if ((m->flags & PG_ZERO) != 0)
1965			++vm_page_zero_count;
1966		else
1967			vm_page_zero_idle_wakeup();
1968		vm_page_free_wakeup();
1969		mtx_unlock(&vm_page_queue_free_mtx);
1970	}
1971}
1972
1973/*
1974 *	vm_page_wire:
1975 *
1976 *	Mark this page as wired down by yet
1977 *	another map, removing it from paging queues
1978 *	as necessary.
1979 *
1980 *	If the page is fictitious, then its wire count must remain one.
1981 *
1982 *	The page must be locked.
1983 */
1984void
1985vm_page_wire(vm_page_t m)
1986{
1987
1988	/*
1989	 * Only bump the wire statistics if the page is not already wired,
1990	 * and only unqueue the page if it is on some queue (if it is unmanaged
1991	 * it is already off the queues).
1992	 */
1993	vm_page_lock_assert(m, MA_OWNED);
1994	if ((m->flags & PG_FICTITIOUS) != 0) {
1995		KASSERT(m->wire_count == 1,
1996		    ("vm_page_wire: fictitious page %p's wire count isn't one",
1997		    m));
1998		return;
1999	}
2000	if (m->wire_count == 0) {
2001		KASSERT((m->oflags & VPO_UNMANAGED) == 0 ||
2002		    m->queue == PQ_NONE,
2003		    ("vm_page_wire: unmanaged page %p is queued", m));
2004		vm_page_remque(m);
2005		atomic_add_int(&cnt.v_wire_count, 1);
2006	}
2007	m->wire_count++;
2008	KASSERT(m->wire_count != 0, ("vm_page_wire: wire_count overflow m=%p", m));
2009}
2010
2011/*
2012 * vm_page_unwire:
2013 *
2014 * Release one wiring of the specified page, potentially enabling it to be
2015 * paged again.  If paging is enabled, then the value of the parameter
2016 * "activate" determines to which queue the page is added.  If "activate" is
2017 * non-zero, then the page is added to the active queue.  Otherwise, it is
2018 * added to the inactive queue.
2019 *
2020 * However, unless the page belongs to an object, it is not enqueued because
2021 * it cannot be paged out.
2022 *
2023 * If a page is fictitious, then its wire count must always be one.
2024 *
2025 * A managed page must be locked.
2026 */
2027void
2028vm_page_unwire(vm_page_t m, int activate)
2029{
2030
2031	if ((m->oflags & VPO_UNMANAGED) == 0)
2032		vm_page_lock_assert(m, MA_OWNED);
2033	if ((m->flags & PG_FICTITIOUS) != 0) {
2034		KASSERT(m->wire_count == 1,
2035	    ("vm_page_unwire: fictitious page %p's wire count isn't one", m));
2036		return;
2037	}
2038	if (m->wire_count > 0) {
2039		m->wire_count--;
2040		if (m->wire_count == 0) {
2041			atomic_subtract_int(&cnt.v_wire_count, 1);
2042			if ((m->oflags & VPO_UNMANAGED) != 0 ||
2043			    m->object == NULL)
2044				return;
2045			if (!activate)
2046				m->flags &= ~PG_WINATCFLS;
2047			vm_page_enqueue(activate ? PQ_ACTIVE : PQ_INACTIVE, m);
2048		}
2049	} else
2050		panic("vm_page_unwire: page %p's wire count is zero", m);
2051}
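
/*
 * Usage sketch (illustrative, not part of the original source): wiring a
 * managed page around a temporary access and then releasing it to the
 * inactive queue.  Here "m" stands for a managed page obtained elsewhere.
 *
 *	vm_page_lock(m);
 *	vm_page_wire(m);
 *	vm_page_unlock(m);
 *	... use the page without fear of it being paged out ...
 *	vm_page_lock(m);
 *	vm_page_unwire(m, 0);
 *	vm_page_unlock(m);
 */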
2052
2053/*
2054 * Move the specified page to the inactive queue.
2055 *
2056 * Many pages placed on the inactive queue should actually go
2057 * into the cache, but it is difficult to figure out which.  What
2058 * we do instead, if the inactive target is well met, is to put
2059 * clean pages at the head of the inactive queue instead of the tail.
2060 * This will cause them to be moved to the cache more quickly and
2061 * if not actively re-referenced, reclaimed more quickly.  If we just
2062 * stick these pages at the end of the inactive queue, heavy filesystem
2063 * meta-data accesses can cause an unnecessary paging load on memory bound
2064 * processes.  This optimization causes one-time-use metadata to be
2065 * reused more quickly.
2066 *
2067 * Normally athead is 0 resulting in LRU operation.  athead is set
2068 * to 1 if we want this page to be 'as if it were placed in the cache',
2069 * except without unmapping it from the process address space.
2070 *
2071 * The page must be locked.
2072 */
2073static inline void
2074_vm_page_deactivate(vm_page_t m, int athead)
2075{
2076	struct vm_pagequeue *pq;
2077	int queue;
2078
2079	vm_page_lock_assert(m, MA_OWNED);
2080
2081	/*
2082	 * Ignore if already inactive.
2083	 */
2084	if ((queue = m->queue) == PQ_INACTIVE)
2085		return;
2086	if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) {
2087		if (queue != PQ_NONE)
2088			vm_page_dequeue(m);
2089		m->flags &= ~PG_WINATCFLS;
2090		pq = &vm_pagequeues[PQ_INACTIVE];
2091		vm_pagequeue_lock(pq);
2092		m->queue = PQ_INACTIVE;
2093		if (athead)
2094			TAILQ_INSERT_HEAD(&pq->pq_pl, m, pageq);
2095		else
2096			TAILQ_INSERT_TAIL(&pq->pq_pl, m, pageq);
2097		cnt.v_inactive_count++;
2098		vm_pagequeue_unlock(pq);
2099	}
2100}
2101
2102/*
2103 * Move the specified page to the inactive queue.
2104 *
2105 * The page must be locked.
2106 */
2107void
2108vm_page_deactivate(vm_page_t m)
2109{
2110
2111	_vm_page_deactivate(m, 0);
2112}
2113
2114/*
2115 * vm_page_try_to_cache:
2116 *
2117 * Returns 0 on failure, 1 on success
2118 */
2119int
2120vm_page_try_to_cache(vm_page_t m)
2121{
2122
2123	vm_page_lock_assert(m, MA_OWNED);
2124	VM_OBJECT_ASSERT_WLOCKED(m->object);
2125	if (m->dirty || m->hold_count || m->busy || m->wire_count ||
2126	    (m->oflags & (VPO_BUSY | VPO_UNMANAGED)) != 0)
2127		return (0);
2128	pmap_remove_all(m);
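	/*
	 * pmap_remove_all() may have transferred modified bits from the
	 * page's mappings into its dirty field, so re-check it before
	 * caching.
	 */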
2129	if (m->dirty)
2130		return (0);
2131	vm_page_cache(m);
2132	return (1);
2133}
2134
2135/*
2136 * vm_page_try_to_free()
2137 *
2138 *	Attempt to free the page.  If we cannot free it, we do nothing.
2139 *	1 is returned on success, 0 on failure.
2140 */
2141int
2142vm_page_try_to_free(vm_page_t m)
2143{
2144
2145	vm_page_lock_assert(m, MA_OWNED);
2146	if (m->object != NULL)
2147		VM_OBJECT_ASSERT_WLOCKED(m->object);
2148	if (m->dirty || m->hold_count || m->busy || m->wire_count ||
2149	    (m->oflags & (VPO_BUSY | VPO_UNMANAGED)) != 0)
2150		return (0);
2151	pmap_remove_all(m);
2152	if (m->dirty)
2153		return (0);
2154	vm_page_free(m);
2155	return (1);
2156}
2157
2158/*
2159 * vm_page_cache
2160 *
2161 * Put the specified page onto the page cache queue (if appropriate).
2162 *
2163 * The object and page must be locked.
2164 */
2165void
2166vm_page_cache(vm_page_t m)
2167{
2168	vm_object_t object;
2169	boolean_t cache_was_empty;
2170
2171	vm_page_lock_assert(m, MA_OWNED);
2172	object = m->object;
2173	VM_OBJECT_ASSERT_WLOCKED(object);
2174	if ((m->oflags & (VPO_UNMANAGED | VPO_BUSY)) || m->busy ||
2175	    m->hold_count || m->wire_count)
2176		panic("vm_page_cache: attempting to cache busy page");
2177	KASSERT(!pmap_page_is_mapped(m),
2178	    ("vm_page_cache: page %p is mapped", m));
2179	KASSERT(m->dirty == 0, ("vm_page_cache: page %p is dirty", m));
2180	if (m->valid == 0 || object->type == OBJT_DEFAULT ||
2181	    (object->type == OBJT_SWAP &&
2182	    !vm_pager_has_page(object, m->pindex, NULL, NULL))) {
2183		/*
2184		 * Hypothesis: A cache-eligible page belonging to a
2185		 * default object or swap object but without a backing
2186		 * store must be zero filled.
2187		 */
2188		vm_page_free(m);
2189		return;
2190	}
2191	KASSERT((m->flags & PG_CACHED) == 0,
2192	    ("vm_page_cache: page %p is already cached", m));
2193	PCPU_INC(cnt.v_tcached);
2194
2195	/*
2196	 * Remove the page from the paging queues.
2197	 */
2198	vm_page_remque(m);
2199
2200	/*
2201	 * Remove the page from the object's collection of resident
2202	 * pages.
2203	 */
2204	vm_radix_remove(&object->rtree, m->pindex);
2205	TAILQ_REMOVE(&object->memq, m, listq);
2206	object->resident_page_count--;
2207
2208	/*
2209	 * Restore the default memory attribute to the page.
2210	 */
2211	if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT)
2212		pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT);
2213
2214	/*
2215	 * Insert the page into the object's collection of cached pages
2216	 * and the physical memory allocator's cache/free page queues.
2217	 */
2218	m->flags &= ~PG_ZERO;
2219	mtx_lock(&vm_page_queue_free_mtx);
2220	m->flags |= PG_CACHED;
2221	cnt.v_cache_count++;
2222	cache_was_empty = vm_radix_is_empty(&object->cache);
2223	vm_radix_insert(&object->cache, m);
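	/*
	 * As in vm_page_free_toq(), prefer freeing the page within its
	 * reservation; otherwise move it into the cache pool and free it
	 * through the physical memory allocator.
	 */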
2224#if VM_NRESERVLEVEL > 0
2225	if (!vm_reserv_free_page(m)) {
2226#else
2227	if (TRUE) {
2228#endif
2229		vm_phys_set_pool(VM_FREEPOOL_CACHE, m, 0);
2230		vm_phys_free_pages(m, 0);
2231	}
2232	vm_page_free_wakeup();
2233	mtx_unlock(&vm_page_queue_free_mtx);
2234
2235	/*
2236	 * Increment the vnode's hold count if this is the object's only
2237	 * cached page.  Decrement the vnode's hold count if this was
2238	 * the object's only resident page.
2239	 */
2240	if (object->type == OBJT_VNODE) {
2241		if (cache_was_empty && object->resident_page_count != 0)
2242			vhold(object->handle);
2243		else if (!cache_was_empty && object->resident_page_count == 0)
2244			vdrop(object->handle);
2245	}
2246}
2247
2248/*
2249 * vm_page_dontneed
2250 *
2251 *	Cache, deactivate, or do nothing as appropriate.  This routine
2252 *	is typically used by madvise() MADV_DONTNEED.
2253 *
2254 *	Generally speaking we want to move the page into the cache so
2255 *	it gets reused quickly.  However, this can result in a silly syndrome
2256 *	due to the page recycling too quickly.  Small objects will not be
2257 *	fully cached.  On the other hand, if we move the page to the inactive
2258 *	queue we wind up with a problem whereby very large objects
2259 *	unnecessarily blow away our inactive and cache queues.
2260 *
2261 *	The solution is to move the pages based on a fixed weighting.  We
2262 *	either leave them alone, deactivate them, or move them to the cache,
2263 *	where moving them to the cache has the highest weighting.
2264 *	By forcing some pages into other queues we eventually force the
2265 *	system to balance the queues, potentially recovering other unrelated
2266 *	space from active.  The idea is to not force this to happen too
2267 *	often.
2268 *
2269 *	The object and page must be locked.
2270 */
2271void
2272vm_page_dontneed(vm_page_t m)
2273{
2274	int dnw;
2275	int head;
2276
2277	vm_page_lock_assert(m, MA_OWNED);
2278	VM_OBJECT_ASSERT_WLOCKED(m->object);
2279	dnw = PCPU_GET(dnweight);
2280	PCPU_INC(dnweight);
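	/*
	 * "dnw" is a per-CPU counter that serves as a cheap weighting
	 * source: selected bit ranges of it choose among leaving the page
	 * alone, deactivating it, and pseudo-caching it below.
	 */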
2281
2282	/*
2283	 * Occasionally leave the page alone.
2284	 */
2285	if ((dnw & 0x01F0) == 0 || m->queue == PQ_INACTIVE) {
2286		if (m->act_count >= ACT_INIT)
2287			--m->act_count;
2288		return;
2289	}
2290
2291	/*
2292	 * Clear any references to the page.  Otherwise, the page daemon will
2293	 * immediately reactivate the page.
2294	 *
2295	 * Perform the pmap_clear_reference() first.  Otherwise, a concurrent
2296	 * pmap operation, such as pmap_remove(), could clear a reference in
2297	 * the pmap and set PGA_REFERENCED on the page before the
2298	 * pmap_clear_reference() had completed.  Consequently, the page would
2299	 * appear referenced based upon an old reference that occurred before
2300	 * this function ran.
2301	 */
2302	pmap_clear_reference(m);
2303	vm_page_aflag_clear(m, PGA_REFERENCED);
2304
2305	if (m->dirty == 0 && pmap_is_modified(m))
2306		vm_page_dirty(m);
2307
2308	if (m->dirty || (dnw & 0x0070) == 0) {
2309		/*
2310		 * Deactivate the page 3 times out of 32.
2311		 */
2312		head = 0;
2313	} else {
2314		/*
2315		 * Cache the page 28 times out of every 32.  Note that
2316		 * the page is deactivated instead of cached, but placed
2317		 * at the head of the queue instead of the tail.
2318		 */
2319		head = 1;
2320	}
2321	_vm_page_deactivate(m, head);
2322}
2323
2324/*
2325 * Grab a page, waiting until we are woken up due to the page
2326 * changing state.  We keep on waiting while the page remains busy.
2327 * If the page doesn't exist, first allocate it
2328 * and then conditionally zero it.
2329 *
2330 * The caller must always specify the VM_ALLOC_RETRY flag.  This is intended
2331 * to facilitate its eventual removal.
2332 *
2333 * This routine may sleep.
2334 *
2335 * The object must be locked on entry.  The lock will, however, be released
2336 * and reacquired if the routine sleeps.
2337 */
2338vm_page_t
2339vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags)
2340{
2341	vm_page_t m;
2342
2343	VM_OBJECT_ASSERT_WLOCKED(object);
2344	KASSERT((allocflags & VM_ALLOC_RETRY) != 0,
2345	    ("vm_page_grab: VM_ALLOC_RETRY is required"));
2346retrylookup:
2347	if ((m = vm_page_lookup(object, pindex)) != NULL) {
2348		if ((m->oflags & VPO_BUSY) != 0 ||
2349		    ((allocflags & VM_ALLOC_IGN_SBUSY) == 0 && m->busy != 0)) {
2350			/*
2351			 * Reference the page before unlocking and
2352			 * sleeping so that the page daemon is less
2353			 * likely to reclaim it.
2354			 */
2355			vm_page_aflag_set(m, PGA_REFERENCED);
2356			vm_page_sleep(m, "pgrbwt");
2357			goto retrylookup;
2358		} else {
2359			if ((allocflags & VM_ALLOC_WIRED) != 0) {
2360				vm_page_lock(m);
2361				vm_page_wire(m);
2362				vm_page_unlock(m);
2363			}
2364			if ((allocflags & VM_ALLOC_NOBUSY) == 0)
2365				vm_page_busy(m);
2366			return (m);
2367		}
2368	}
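	/*
	 * The page is not resident: allocate it.  VM_ALLOC_RETRY and
	 * VM_ALLOC_IGN_SBUSY only affect the grab itself and are not
	 * passed on to vm_page_alloc().
	 */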
2369	m = vm_page_alloc(object, pindex, allocflags & ~(VM_ALLOC_RETRY |
2370	    VM_ALLOC_IGN_SBUSY));
2371	if (m == NULL) {
2372		VM_OBJECT_WUNLOCK(object);
2373		VM_WAIT;
2374		VM_OBJECT_WLOCK(object);
2375		goto retrylookup;
2376	} else if (m->valid != 0)
2377		return (m);
2378	if (allocflags & VM_ALLOC_ZERO && (m->flags & PG_ZERO) == 0)
2379		pmap_zero_page(m);
2380	return (m);
2381}
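
/*
 * Usage sketch (illustrative, not part of the original source): a typical
 * caller grabs a busied page, fills it if it was not already valid, and
 * then un-busies it.  "object", "pindex", and the fill step are
 * placeholders.
 *
 *	m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
 *	if (m->valid != VM_PAGE_BITS_ALL) {
 *		... initialize the page's contents ...
 *		m->valid = VM_PAGE_BITS_ALL;
 *	}
 *	vm_page_wakeup(m);
 */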
2382
2383/*
2384 * Mapping function for valid or dirty bits in a page.
2385 *
2386 * Inputs are required to range within a page.
2387 */
2388vm_page_bits_t
2389vm_page_bits(int base, int size)
2390{
2391	int first_bit;
2392	int last_bit;
2393
2394	KASSERT(
2395	    base + size <= PAGE_SIZE,
2396	    ("vm_page_bits: illegal base/size %d/%d", base, size)
2397	);
2398
2399	if (size == 0)		/* handle degenerate case */
2400		return (0);
2401
2402	first_bit = base >> DEV_BSHIFT;
2403	last_bit = (base + size - 1) >> DEV_BSHIFT;
2404
2405	return (((vm_page_bits_t)2 << last_bit) -
2406	    ((vm_page_bits_t)1 << first_bit));
2407}
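
/*
 * Worked example (illustrative, assuming DEV_BSIZE == 512): vm_page_bits(512,
 * 1024) computes first_bit == 1 and last_bit == 2, so the returned mask is
 * 0x6, covering the second and third DEV_BSIZE chunks of the page.
 */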
2408
2409/*
2410 *	vm_page_set_valid_range:
2411 *
2412 *	Sets portions of a page valid.  The arguments are expected
2413 *	to be DEV_BSIZE aligned, but if they aren't, the bitmap is inclusive
2414 *	of any partial chunks touched by the range.  The invalid portion of
2415 *	such chunks will be zeroed.
2416 *
2417 *	(base + size) must be less than or equal to PAGE_SIZE.
2418 */
2419void
2420vm_page_set_valid_range(vm_page_t m, int base, int size)
2421{
2422	int endoff, frag;
2423
2424	VM_OBJECT_ASSERT_WLOCKED(m->object);
2425	if (size == 0)	/* handle degenerate case */
2426		return;
2427
2428	/*
2429	 * If the base is not DEV_BSIZE aligned and the valid
2430	 * bit is clear, we have to zero out a portion of the
2431	 * first block.
2432	 */
2433	if ((frag = base & ~(DEV_BSIZE - 1)) != base &&
2434	    (m->valid & ((vm_page_bits_t)1 << (base >> DEV_BSHIFT))) == 0)
2435		pmap_zero_page_area(m, frag, base - frag);
2436
2437	/*
2438	 * If the ending offset is not DEV_BSIZE aligned and the
2439	 * valid bit is clear, we have to zero out a portion of
2440	 * the last block.
2441	 */
2442	endoff = base + size;
2443	if ((frag = endoff & ~(DEV_BSIZE - 1)) != endoff &&
2444	    (m->valid & ((vm_page_bits_t)1 << (endoff >> DEV_BSHIFT))) == 0)
2445		pmap_zero_page_area(m, endoff,
2446		    DEV_BSIZE - (endoff & (DEV_BSIZE - 1)));
2447
2448	/*
2449	 * Assert that no previously invalid block that is now being validated
2450	 * is already dirty.
2451	 */
2452	KASSERT((~m->valid & vm_page_bits(base, size) & m->dirty) == 0,
2453	    ("vm_page_set_valid_range: page %p is dirty", m));
2454
2455	/*
2456	 * Set valid bits inclusive of any overlap.
2457	 */
2458	m->valid |= vm_page_bits(base, size);
2459}
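
/*
 * Worked example (illustrative, assuming DEV_BSIZE == 512): a call with
 * base == 100 and size == 200 touches only the first block.  If that block
 * was previously invalid, bytes 0-99 and 300-511 are zeroed above so that
 * the whole block can safely be marked valid.
 */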
2460
2461/*
2462 * Clear the given bits from the specified page's dirty field.
2463 */
2464static __inline void
2465vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits)
2466{
2467	uintptr_t addr;
2468#if PAGE_SIZE < 16384
2469	int shift;
2470#endif
2471
2472	/*
2473	 * If the object is locked and the page is neither VPO_BUSY nor
2474	 * write mapped, then the page's dirty field cannot possibly be
2475	 * set by a concurrent pmap operation.
2476	 */
2477	VM_OBJECT_ASSERT_WLOCKED(m->object);
2478	if ((m->oflags & VPO_BUSY) == 0 && !pmap_page_is_write_mapped(m))
2479		m->dirty &= ~pagebits;
2480	else {
2481		/*
2482		 * The pmap layer can call vm_page_dirty() without
2483		 * holding a distinguished lock.  The combination of
2484		 * the object's lock and an atomic operation suffice
2485		 * to guarantee consistency of the page dirty field.
2486		 *
2487		 * For PAGE_SIZE == 32768 case, compiler already
2488		 * properly aligns the dirty field, so no forcible
2489		 * alignment is needed. Only require existence of
2490		 * atomic_clear_64 when page size is 32768.
2491		 */
2492		addr = (uintptr_t)&m->dirty;
2493#if PAGE_SIZE == 32768
2494		atomic_clear_64((uint64_t *)addr, pagebits);
2495#elif PAGE_SIZE == 16384
2496		atomic_clear_32((uint32_t *)addr, pagebits);
2497#else		/* PAGE_SIZE <= 8192 */
2498		/*
2499		 * Use a trick to perform a 32-bit atomic on the
2500		 * containing aligned word, to not depend on the existence
2501		 * of atomic_clear_{8, 16}.
2502		 */
2503		shift = addr & (sizeof(uint32_t) - 1);
2504#if BYTE_ORDER == BIG_ENDIAN
2505		shift = (sizeof(uint32_t) - sizeof(m->dirty) - shift) * NBBY;
2506#else
2507		shift *= NBBY;
2508#endif
2509		addr &= ~(sizeof(uint32_t) - 1);
2510		atomic_clear_32((uint32_t *)addr, pagebits << shift);
2511#endif		/* PAGE_SIZE */
2512	}
2513}
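
/*
 * Worked example (illustrative, assuming PAGE_SIZE == 4096 and a
 * little-endian machine): the dirty field is one byte wide, so the atomic
 * path above clears bits within the aligned 32-bit word that contains it.
 * If that byte sits at offset 3 of the word, "shift" becomes 24 and the
 * atomic clear of "pagebits << 24" touches only that byte.
 */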
2514
2515/*
2516 *	vm_page_set_validclean:
2517 *
2518 *	Sets portions of a page valid and clean.  The arguments are expected
2519 *	to be DEV_BSIZE aligned, but if they aren't, the bitmap is inclusive
2520 *	of any partial chunks touched by the range.  The invalid portion of
2521 *	such chunks will be zeroed.
2522 *
2523 *	(base + size) must be less than or equal to PAGE_SIZE.
2524 */
2525void
2526vm_page_set_validclean(vm_page_t m, int base, int size)
2527{
2528	vm_page_bits_t oldvalid, pagebits;
2529	int endoff, frag;
2530
2531	VM_OBJECT_ASSERT_WLOCKED(m->object);
2532	if (size == 0)	/* handle degenerate case */
2533		return;
2534
2535	/*
2536	 * If the base is not DEV_BSIZE aligned and the valid
2537	 * bit is clear, we have to zero out a portion of the
2538	 * first block.
2539	 */
2540	if ((frag = base & ~(DEV_BSIZE - 1)) != base &&
2541	    (m->valid & ((vm_page_bits_t)1 << (base >> DEV_BSHIFT))) == 0)
2542		pmap_zero_page_area(m, frag, base - frag);
2543
2544	/*
2545	 * If the ending offset is not DEV_BSIZE aligned and the
2546	 * valid bit is clear, we have to zero out a portion of
2547	 * the last block.
2548	 */
2549	endoff = base + size;
2550	if ((frag = endoff & ~(DEV_BSIZE - 1)) != endoff &&
2551	    (m->valid & ((vm_page_bits_t)1 << (endoff >> DEV_BSHIFT))) == 0)
2552		pmap_zero_page_area(m, endoff,
2553		    DEV_BSIZE - (endoff & (DEV_BSIZE - 1)));
2554
2555	/*
2556	 * Set valid, clear dirty bits.  If validating the entire
2557	 * page we can safely clear the pmap modify bit.  We also
2558	 * use this opportunity to clear the VPO_NOSYNC flag.  If a process
2559	 * takes a write fault on a MAP_NOSYNC memory area the flag will
2560	 * be set again.
2561	 *
2562	 * We set valid bits inclusive of any overlap, but we can only
2563	 * clear dirty bits for DEV_BSIZE chunks that are fully within
2564	 * the range.
2565	 */
2566	oldvalid = m->valid;
2567	pagebits = vm_page_bits(base, size);
2568	m->valid |= pagebits;
2569#if 0	/* NOT YET */
2570	if ((frag = base & (DEV_BSIZE - 1)) != 0) {
2571		frag = DEV_BSIZE - frag;
2572		base += frag;
2573		size -= frag;
2574		if (size < 0)
2575			size = 0;
2576	}
2577	pagebits = vm_page_bits(base, size & (DEV_BSIZE - 1));
2578#endif
2579	if (base == 0 && size == PAGE_SIZE) {
2580		/*
2581		 * The page can only be modified within the pmap if it is
2582		 * mapped, and it can only be mapped if it was previously
2583		 * fully valid.
2584		 */
2585		if (oldvalid == VM_PAGE_BITS_ALL)
2586			/*
2587			 * Perform the pmap_clear_modify() first.  Otherwise,
2588			 * a concurrent pmap operation, such as
2589			 * pmap_protect(), could clear a modification in the
2590			 * pmap and set the dirty field on the page before
2591			 * pmap_clear_modify() had begun and after the dirty
2592			 * field was cleared here.
2593			 */
2594			pmap_clear_modify(m);
2595		m->dirty = 0;
2596		m->oflags &= ~VPO_NOSYNC;
2597	} else if (oldvalid != VM_PAGE_BITS_ALL)
2598		m->dirty &= ~pagebits;
2599	else
2600		vm_page_clear_dirty_mask(m, pagebits);
2601}
2602
2603void
2604vm_page_clear_dirty(vm_page_t m, int base, int size)
2605{
2606
2607	vm_page_clear_dirty_mask(m, vm_page_bits(base, size));
2608}
2609
2610/*
2611 *	vm_page_set_invalid:
2612 *
2613 *	Invalidates DEV_BSIZE'd chunks within a page.  Both the
2614 *	valid and dirty bits for the affected areas are cleared.
2615 */
2616void
2617vm_page_set_invalid(vm_page_t m, int base, int size)
2618{
2619	vm_page_bits_t bits;
2620
2621	VM_OBJECT_ASSERT_WLOCKED(m->object);
2622	KASSERT((m->oflags & VPO_BUSY) == 0,
2623	    ("vm_page_set_invalid: page %p is busy", m));
2624	bits = vm_page_bits(base, size);
2625	if (m->valid == VM_PAGE_BITS_ALL && bits != 0)
2626		pmap_remove_all(m);
2627	KASSERT(!pmap_page_is_mapped(m),
2628	    ("vm_page_set_invalid: page %p is mapped", m));
2629	m->valid &= ~bits;
2630	m->dirty &= ~bits;
2631}
2632
2633/*
2634 * vm_page_zero_invalid()
2635 *
2636 *	The kernel assumes that the invalid portions of a page contain
2637 *	garbage, but such pages can be mapped into memory by user code.
2638 *	When this occurs, we must zero out the non-valid portions of the
2639 *	page so user code sees what it expects.
2640 *
2641 *	Pages are most often semi-valid when the end of a file is mapped
2642 *	into memory and the file's size is not page aligned.
2643 */
2644void
2645vm_page_zero_invalid(vm_page_t m, boolean_t setvalid)
2646{
2647	int b;
2648	int i;
2649
2650	VM_OBJECT_ASSERT_WLOCKED(m->object);
2651	/*
2652	 * Scan the valid bits looking for invalid sections that
2653	 * must be zeroed.  Invalid sub-DEV_BSIZE'd areas (where the
2654	 * valid bit may be set) have already been zeroed by
2655	 * vm_page_set_validclean().
2656	 */
2657	for (b = i = 0; i <= PAGE_SIZE / DEV_BSIZE; ++i) {
2658		if (i == (PAGE_SIZE / DEV_BSIZE) ||
2659		    (m->valid & ((vm_page_bits_t)1 << i))) {
2660			if (i > b) {
2661				pmap_zero_page_area(m,
2662				    b << DEV_BSHIFT, (i - b) << DEV_BSHIFT);
2663			}
2664			b = i + 1;
2665		}
2666	}
2667
2668	/*
2669	 * setvalid is TRUE when we can safely set the zeroed areas
2670	 * as being valid.  We can do this if there are no cache consistency
2671	 * issues.  E.g., it is ok to do with UFS, but not ok to do with NFS.
2672	 */
2673	if (setvalid)
2674		m->valid = VM_PAGE_BITS_ALL;
2675}
2676
2677/*
2678 *	vm_page_is_valid:
2679 *
2680 *	Is (partial) page valid?  Note that the case where size == 0
2681 *	will return FALSE in the degenerate case where the page is
2682 *	entirely invalid, and TRUE otherwise.
2683 */
2684int
2685vm_page_is_valid(vm_page_t m, int base, int size)
2686{
2687	vm_page_bits_t bits;
2688
2689	VM_OBJECT_ASSERT_WLOCKED(m->object);
2690	bits = vm_page_bits(base, size);
2691	return (m->valid != 0 && (m->valid & bits) == bits);
2692}
2693
2694/*
2695 * Set the page's dirty bits if the page is modified.
2696 */
2697void
2698vm_page_test_dirty(vm_page_t m)
2699{
2700
2701	VM_OBJECT_ASSERT_WLOCKED(m->object);
2702	if (m->dirty != VM_PAGE_BITS_ALL && pmap_is_modified(m))
2703		vm_page_dirty(m);
2704}
2705
2706void
2707vm_page_lock_KBI(vm_page_t m, const char *file, int line)
2708{
2709
2710	mtx_lock_flags_(vm_page_lockptr(m), 0, file, line);
2711}
2712
2713void
2714vm_page_unlock_KBI(vm_page_t m, const char *file, int line)
2715{
2716
2717	mtx_unlock_flags_(vm_page_lockptr(m), 0, file, line);
2718}
2719
2720int
2721vm_page_trylock_KBI(vm_page_t m, const char *file, int line)
2722{
2723
2724	return (mtx_trylock_flags_(vm_page_lockptr(m), 0, file, line));
2725}
2726
2727#if defined(INVARIANTS) || defined(INVARIANT_SUPPORT)
2728void
2729vm_page_lock_assert_KBI(vm_page_t m, int a, const char *file, int line)
2730{
2731
2732	mtx_assert_(vm_page_lockptr(m), a, file, line);
2733}
2734#endif
2735
2736int so_zerocp_fullpage = 0;
2737
2738/*
2739 *	Replace the given page with a copy.  The copied page assumes
2740 *	the portion of the given page's "wire_count" that is not the
2741 *	responsibility of this copy-on-write mechanism.
2742 *
2743 *	The object containing the given page must have a non-zero
2744 *	paging-in-progress count and be locked.
2745 */
2746void
2747vm_page_cowfault(vm_page_t m)
2748{
2749	vm_page_t mnew;
2750	vm_object_t object;
2751	vm_pindex_t pindex;
2752
2753	vm_page_lock_assert(m, MA_OWNED);
2754	object = m->object;
2755	VM_OBJECT_ASSERT_WLOCKED(object);
2756	KASSERT(object->paging_in_progress != 0,
2757	    ("vm_page_cowfault: object %p's paging-in-progress count is zero.",
2758	    object));
2759	pindex = m->pindex;
2760
2761 retry_alloc:
2762	pmap_remove_all(m);
2763	vm_page_remove(m);
2764	mnew = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY);
2765	if (mnew == NULL) {
2766		vm_page_insert(m, object, pindex);
2767		vm_page_unlock(m);
2768		VM_OBJECT_WUNLOCK(object);
2769		VM_WAIT;
2770		VM_OBJECT_WLOCK(object);
2771		if (m == vm_page_lookup(object, pindex)) {
2772			vm_page_lock(m);
2773			goto retry_alloc;
2774		} else {
2775			/*
2776			 * Page disappeared during the wait.
2777			 */
2778			return;
2779		}
2780	}
2781
2782	if (m->cow == 0) {
2783		/*
2784		 * Check to see if we raced with an xmit completion while
2785		 * waiting to allocate a page.  If so, put things back
2786		 * the way they were.
2787		 */
2788		vm_page_unlock(m);
2789		vm_page_lock(mnew);
2790		vm_page_free(mnew);
2791		vm_page_unlock(mnew);
2792		vm_page_insert(m, object, pindex);
2793	} else { /* clear COW & copy page */
2794		if (!so_zerocp_fullpage)
2795			pmap_copy_page(m, mnew);
2796		mnew->valid = VM_PAGE_BITS_ALL;
2797		vm_page_dirty(mnew);
2798		mnew->wire_count = m->wire_count - m->cow;
2799		m->wire_count = m->cow;
2800		vm_page_unlock(m);
2801	}
2802}
2803
2804void
2805vm_page_cowclear(vm_page_t m)
2806{
2807
2808	vm_page_lock_assert(m, MA_OWNED);
2809	if (m->cow) {
2810		m->cow--;
2811		/*
2812		 * Let vm_fault() add back write permission lazily.
2813		 */
2814	}
2815	/*
2816	 * sf_buf_free() will free the page, so we needn't do it here.
2817	 */
2818}
2819
2820int
2821vm_page_cowsetup(vm_page_t m)
2822{
2823
2824	vm_page_lock_assert(m, MA_OWNED);
2825	if ((m->flags & PG_FICTITIOUS) != 0 ||
2826	    (m->oflags & VPO_UNMANAGED) != 0 ||
2827	    m->cow == USHRT_MAX - 1 || !VM_OBJECT_TRYWLOCK(m->object))
2828		return (EBUSY);
2829	m->cow++;
2830	pmap_remove_write(m);
2831	VM_OBJECT_WUNLOCK(m->object);
2832	return (0);
2833}
2834
2835#ifdef INVARIANTS
2836void
2837vm_page_object_lock_assert(vm_page_t m)
2838{
2839
2840	/*
2841	 * Certain of the page's fields may only be modified by the
2842	 * holder of the containing object's lock or the setter of the
2843	 * page's VPO_BUSY flag.  Unfortunately, the setter of the
2844	 * VPO_BUSY flag is not recorded, and thus cannot be checked
2845	 * here.
2846	 */
2847	if (m->object != NULL && (m->oflags & VPO_BUSY) == 0)
2848		VM_OBJECT_ASSERT_WLOCKED(m->object);
2849}
2850#endif
2851
2852#include "opt_ddb.h"
2853#ifdef DDB
2854#include <sys/kernel.h>
2855
2856#include <ddb/ddb.h>
2857
2858DB_SHOW_COMMAND(page, vm_page_print_page_info)
2859{
2860	db_printf("cnt.v_free_count: %d\n", cnt.v_free_count);
2861	db_printf("cnt.v_cache_count: %d\n", cnt.v_cache_count);
2862	db_printf("cnt.v_inactive_count: %d\n", cnt.v_inactive_count);
2863	db_printf("cnt.v_active_count: %d\n", cnt.v_active_count);
2864	db_printf("cnt.v_wire_count: %d\n", cnt.v_wire_count);
2865	db_printf("cnt.v_free_reserved: %d\n", cnt.v_free_reserved);
2866	db_printf("cnt.v_free_min: %d\n", cnt.v_free_min);
2867	db_printf("cnt.v_free_target: %d\n", cnt.v_free_target);
2868	db_printf("cnt.v_cache_min: %d\n", cnt.v_cache_min);
2869	db_printf("cnt.v_inactive_target: %d\n", cnt.v_inactive_target);
2870}
2871
2872DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info)
2873{
2874
2875	db_printf("PQ_FREE:");
2876	db_printf(" %d", cnt.v_free_count);
2877	db_printf("\n");
2878
2879	db_printf("PQ_CACHE:");
2880	db_printf(" %d", cnt.v_cache_count);
2881	db_printf("\n");
2882
2883	db_printf("PQ_ACTIVE: %d, PQ_INACTIVE: %d\n",
2884		*vm_pagequeues[PQ_ACTIVE].pq_cnt,
2885		*vm_pagequeues[PQ_INACTIVE].pq_cnt);
2886}
2887
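/*
 * Illustrative note (not part of the original source): "show pginfo" takes a
 * vm_page pointer by default; with the 'p' modifier, the address is instead
 * interpreted as a physical address and converted via PHYS_TO_VM_PAGE().
 */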
2888DB_SHOW_COMMAND(pginfo, vm_page_print_pginfo)
2889{
2890	vm_page_t m;
2891	boolean_t phys;
2892
2893	if (!have_addr) {
2894		db_printf("show pginfo addr\n");
2895		return;
2896	}
2897
2898	phys = strchr(modif, 'p') != NULL;
2899	if (phys)
2900		m = PHYS_TO_VM_PAGE(addr);
2901	else
2902		m = (vm_page_t)addr;
2903	db_printf(
2904    "page %p obj %p pidx 0x%jx phys 0x%jx q %d hold %d wire %d\n"
2905    "  af 0x%x of 0x%x f 0x%x act %d busy %d valid 0x%x dirty 0x%x\n",
2906	    m, m->object, (uintmax_t)m->pindex, (uintmax_t)m->phys_addr,
2907	    m->queue, m->hold_count, m->wire_count, m->aflags, m->oflags,
2908	    m->flags, m->act_count, m->busy, m->valid, m->dirty);
2909}
2910#endif /* DDB */
2911