vm_page.c revision 192034
1/*-
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 * Copyright (c) 1998 Matthew Dillon.  All Rights Reserved.
5 *
6 * This code is derived from software contributed to Berkeley by
7 * The Mach Operating System project at Carnegie-Mellon University.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 *    notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 *    notice, this list of conditions and the following disclaimer in the
16 *    documentation and/or other materials provided with the distribution.
17 * 4. Neither the name of the University nor the names of its contributors
18 *    may be used to endorse or promote products derived from this software
19 *    without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
32 *
33 *	from: @(#)vm_page.c	7.4 (Berkeley) 5/7/91
34 */
35
36/*-
37 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
38 * All rights reserved.
39 *
40 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
41 *
42 * Permission to use, copy, modify and distribute this software and
43 * its documentation is hereby granted, provided that both the copyright
44 * notice and this permission notice appear in all copies of the
45 * software, derivative works or modified versions, and any portions
46 * thereof, and that both notices appear in supporting documentation.
47 *
48 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
49 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
50 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
51 *
52 * Carnegie Mellon requests users of this software to return to
53 *
54 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
55 *  School of Computer Science
56 *  Carnegie Mellon University
57 *  Pittsburgh PA 15213-3890
58 *
59 * any improvements or extensions that they make and grant Carnegie the
60 * rights to redistribute these changes.
61 */
62
63/*
64 *			GENERAL RULES ON VM_PAGE MANIPULATION
65 *
66 *	- a pageq mutex is required when adding or removing a page from a
67 *	  page queue (vm_page_queue[]), regardless of other mutexes or the
68 *	  busy state of a page.
69 *
70 *	- a hash chain mutex is required when associating or disassociating
71 *	  a page from the VM PAGE CACHE hash table (vm_page_buckets),
72 *	  regardless of other mutexes or the busy state of a page.
73 *
74 *	- either a hash chain mutex OR a busied page is required in order
75 *	  to modify the page flags.  A hash chain mutex must be obtained in
76 *	  order to busy a page.  A page's flags cannot be modified by a
77 *	  hash chain mutex if the page is marked busy.
78 *
79 *	- The object memq mutex is held when inserting or removing
80 *	  pages from an object (vm_page_insert() or vm_page_remove()).  This
81 *	  is different from the object's main mutex.
82 *
83 *	Generally speaking, you have to be aware of side effects when running
84 *	vm_page ops.  A vm_page_lookup() will return with the hash chain
85 *	locked, whether it was able to look up the page or not.  vm_page_free(),
86 *	vm_page_cache(), vm_page_activate(), and a number of other routines
87 *	will release the hash chain mutex for you.  Intermediate manipulation
88 *	routines such as vm_page_flag_set() expect the hash chain to be held
89 *	on entry and the hash chain will remain held on return.
90 *
91 *	pageq scanning can only occur with the pageq in question locked.
92 *	We have a known bottleneck with the active queue, but the cache
93 *	and free queues are actually arrays already.
94 */
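
/*
 * Editorial sketch, not part of the original source: a minimal caller
 * freeing a page under the rules above, where "obj" and "idx" are a
 * hypothetical locked object and page index.
 *
 *	VM_OBJECT_LOCK(obj);
 *	vm_page_lock_queues();
 *	if ((m = vm_page_lookup(obj, idx)) != NULL)
 *		vm_page_free(m);
 *	vm_page_unlock_queues();
 *	VM_OBJECT_UNLOCK(obj);
 */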
95
96/*
97 *	Resident memory management module.
98 */
99
100#include <sys/cdefs.h>
101__FBSDID("$FreeBSD: head/sys/vm/vm_page.c 192034 2009-05-13 05:39:39Z alc $");
102
103#include "opt_vm.h"
104
105#include <sys/param.h>
106#include <sys/systm.h>
107#include <sys/lock.h>
108#include <sys/kernel.h>
109#include <sys/limits.h>
110#include <sys/malloc.h>
111#include <sys/mutex.h>
112#include <sys/proc.h>
113#include <sys/sysctl.h>
114#include <sys/vmmeter.h>
115#include <sys/vnode.h>
116
117#include <vm/vm.h>
118#include <vm/vm_param.h>
119#include <vm/vm_kern.h>
120#include <vm/vm_object.h>
121#include <vm/vm_page.h>
122#include <vm/vm_pageout.h>
123#include <vm/vm_pager.h>
124#include <vm/vm_phys.h>
125#include <vm/vm_reserv.h>
126#include <vm/vm_extern.h>
127#include <vm/uma.h>
128#include <vm/uma_int.h>
129
130#include <machine/md_var.h>
131
132/*
133 *	Associated with each page of user-allocatable memory is a
134 *	page structure.
135 */
136
137struct vpgqueues vm_page_queues[PQ_COUNT];
138struct mtx vm_page_queue_mtx;
139struct mtx vm_page_queue_free_mtx;
140
141vm_page_t vm_page_array = 0;
142int vm_page_array_size = 0;
143long first_page = 0;
144int vm_page_zero_count = 0;
145
146static int boot_pages = UMA_BOOT_PAGES;
147TUNABLE_INT("vm.boot_pages", &boot_pages);
148SYSCTL_INT(_vm, OID_AUTO, boot_pages, CTLFLAG_RD, &boot_pages, 0,
149	"number of pages allocated for bootstrapping the VM system");
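
/*
 * Editorial note: because boot_pages is declared with TUNABLE_INT(), it
 * can be overridden from loader.conf(5), for example (the value shown is
 * purely illustrative):
 *
 *	vm.boot_pages=128
 */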
150
151static void vm_page_enqueue(int queue, vm_page_t m);
152
153/* Make sure that u_long is at least 64 bits when PAGE_SIZE is 32K. */
154#if PAGE_SIZE == 32768
155#ifdef CTASSERT
156CTASSERT(sizeof(u_long) >= 8);
157#endif
158#endif
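
/*
 * Editorial note: with PAGE_SIZE == 32768 and DEV_BSIZE == 512 a page
 * has 64 DEV_BSIZE chunks, so the per-page valid/dirty masks
 * (VM_PAGE_BITS_ALL) need a type of at least 64 bits, hence the
 * assertion above.
 */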
159
160/*
161 *	vm_set_page_size:
162 *
163 *	Sets the page size, perhaps based upon the memory
164 *	size.  Must be called before any use of page-size
165 *	dependent functions.
166 */
167void
168vm_set_page_size(void)
169{
170	if (cnt.v_page_size == 0)
171		cnt.v_page_size = PAGE_SIZE;
172	if (((cnt.v_page_size - 1) & cnt.v_page_size) != 0)
173		panic("vm_set_page_size: page size not a power of two");
174}
175
176/*
177 *	vm_page_blacklist_lookup:
178 *
179 *	See if a physical address in this page has been listed
180 *	in the blacklist tunable.  Entries in the tunable are
181 *	separated by spaces or commas.  If an invalid integer is
182 *	encountered then the rest of the string is skipped.
183 */
184static int
185vm_page_blacklist_lookup(char *list, vm_paddr_t pa)
186{
187	vm_paddr_t bad;
188	char *cp, *pos;
189
190	for (pos = list; *pos != '\0'; pos = cp) {
191		bad = strtoq(pos, &cp, 0);
192		if (*cp != '\0') {
193			if (*cp == ' ' || *cp == ',') {
194				cp++;
195				if (cp == pos)
196					continue;
197			} else
198				break;
199		}
200		if (pa == trunc_page(bad))
201			return (1);
202	}
203	return (0);
204}
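
/*
 * Editorial example: the blacklist parsed above is normally set from
 * loader.conf(5); an entry such as the following (addresses purely
 * illustrative) keeps the pages containing those physical addresses off
 * the free lists:
 *
 *	vm.blacklist="0x7f9d8000,0x7f9db000"
 */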
205
206/*
207 *	vm_page_startup:
208 *
209 *	Initializes the resident memory module.
210 *
211 *	Allocates memory for the page cells, and
212 *	for the object/offset-to-page hash table headers.
213 *	Each page cell is initialized and placed on the free list.
214 */
215vm_offset_t
216vm_page_startup(vm_offset_t vaddr)
217{
218	vm_offset_t mapped;
219	vm_paddr_t page_range;
220	vm_paddr_t new_end;
221	int i;
222	vm_paddr_t pa;
223	int nblocks;
224	vm_paddr_t last_pa;
225	char *list;
226
227	/* the biggest memory array is the second group of pages */
228	vm_paddr_t end;
229	vm_paddr_t biggestsize;
230	vm_paddr_t low_water, high_water;
231	int biggestone;
232
233	biggestsize = 0;
234	biggestone = 0;
235	nblocks = 0;
236	vaddr = round_page(vaddr);
237
238	for (i = 0; phys_avail[i + 1]; i += 2) {
239		phys_avail[i] = round_page(phys_avail[i]);
240		phys_avail[i + 1] = trunc_page(phys_avail[i + 1]);
241	}
242
243	low_water = phys_avail[0];
244	high_water = phys_avail[1];
245
246	for (i = 0; phys_avail[i + 1]; i += 2) {
247		vm_paddr_t size = phys_avail[i + 1] - phys_avail[i];
248
249		if (size > biggestsize) {
250			biggestone = i;
251			biggestsize = size;
252		}
253		if (phys_avail[i] < low_water)
254			low_water = phys_avail[i];
255		if (phys_avail[i + 1] > high_water)
256			high_water = phys_avail[i + 1];
257		++nblocks;
258	}
259
260#ifdef XEN
261	low_water = 0;
262#endif
263
264	end = phys_avail[biggestone+1];
265
266	/*
267	 * Initialize the locks.
268	 */
269	mtx_init(&vm_page_queue_mtx, "vm page queue mutex", NULL, MTX_DEF |
270	    MTX_RECURSE);
271	mtx_init(&vm_page_queue_free_mtx, "vm page queue free mutex", NULL,
272	    MTX_DEF);
273
274	/*
275	 * Initialize the queue headers for the hold queue, the active queue,
276	 * and the inactive queue.
277	 */
278	for (i = 0; i < PQ_COUNT; i++)
279		TAILQ_INIT(&vm_page_queues[i].pl);
280	vm_page_queues[PQ_INACTIVE].cnt = &cnt.v_inactive_count;
281	vm_page_queues[PQ_ACTIVE].cnt = &cnt.v_active_count;
282	vm_page_queues[PQ_HOLD].cnt = &cnt.v_active_count;
283
284	/*
285	 * Allocate memory for use when boot strapping the kernel memory
286	 * allocator.
287	 */
288	new_end = end - (boot_pages * UMA_SLAB_SIZE);
289	new_end = trunc_page(new_end);
290	mapped = pmap_map(&vaddr, new_end, end,
291	    VM_PROT_READ | VM_PROT_WRITE);
292	bzero((void *)mapped, end - new_end);
293	uma_startup((void *)mapped, boot_pages);
294
295#if defined(__amd64__) || defined(__i386__) || defined(__arm__)
296	/*
297	 * Allocate a bitmap to indicate whether a given physical page
298	 * needs to be included in a minidump.
299	 *
300	 * The amd64 port needs this to indicate which direct map pages
301	 * need to be dumped, via calls to dump_add_page()/dump_drop_page().
302	 *
303	 * However, i386 still needs this workspace internally within the
304	 * minidump code.  In theory, they are not needed on i386, but are
305	 * included should the sf_buf code decide to use them.
306	 */
307	page_range = phys_avail[(nblocks - 1) * 2 + 1] / PAGE_SIZE;
308	vm_page_dump_size = round_page(roundup2(page_range, NBBY) / NBBY);
309	new_end -= vm_page_dump_size;
310	vm_page_dump = (void *)(uintptr_t)pmap_map(&vaddr, new_end,
311	    new_end + vm_page_dump_size, VM_PROT_READ | VM_PROT_WRITE);
312	bzero((void *)vm_page_dump, vm_page_dump_size);
313#endif
314	/*
315	 * Compute the number of pages of memory that will be available for
316	 * use (taking into account the overhead of a page structure per
317	 * page).
318	 */
319	first_page = low_water / PAGE_SIZE;
320#ifdef VM_PHYSSEG_SPARSE
321	page_range = 0;
322	for (i = 0; phys_avail[i + 1] != 0; i += 2)
323		page_range += atop(phys_avail[i + 1] - phys_avail[i]);
324#elif defined(VM_PHYSSEG_DENSE)
325	page_range = high_water / PAGE_SIZE - first_page;
326#else
327#error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined."
328#endif
329	end = new_end;
330
331	/*
332	 * Reserve an unmapped guard page to trap access to vm_page_array[-1].
333	 */
334	vaddr += PAGE_SIZE;
335
336	/*
337	 * Initialize the mem entry structures now, and put them in the free
338	 * queue.
339	 */
340	new_end = trunc_page(end - page_range * sizeof(struct vm_page));
341	mapped = pmap_map(&vaddr, new_end, end,
342	    VM_PROT_READ | VM_PROT_WRITE);
343	vm_page_array = (vm_page_t) mapped;
344#if VM_NRESERVLEVEL > 0
345	/*
346	 * Allocate memory for the reservation management system's data
347	 * structures.
348	 */
349	new_end = vm_reserv_startup(&vaddr, new_end, high_water);
350#endif
351#ifdef __amd64__
352	/*
353	 * pmap_map on amd64 comes out of the direct-map, not kvm like i386,
354	 * so the pages must be tracked for a crashdump to include this data.
355	 * This includes the vm_page_array and the early UMA bootstrap pages.
356	 */
357	for (pa = new_end; pa < phys_avail[biggestone + 1]; pa += PAGE_SIZE)
358		dump_add_page(pa);
359#endif
360	phys_avail[biggestone + 1] = new_end;
361
362	/*
363	 * Clear all of the page structures
364	 */
365	bzero((caddr_t) vm_page_array, page_range * sizeof(struct vm_page));
366	for (i = 0; i < page_range; i++)
367		vm_page_array[i].order = VM_NFREEORDER;
368	vm_page_array_size = page_range;
369
370	/*
371	 * Initialize the physical memory allocator.
372	 */
373	vm_phys_init();
374
375	/*
376	 * Add every available physical page that is not blacklisted to
377	 * the free lists.
378	 */
379	cnt.v_page_count = 0;
380	cnt.v_free_count = 0;
381	list = getenv("vm.blacklist");
382	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
383		pa = phys_avail[i];
384		last_pa = phys_avail[i + 1];
385		while (pa < last_pa) {
386			if (list != NULL &&
387			    vm_page_blacklist_lookup(list, pa))
388				printf("Skipping page with pa 0x%jx\n",
389				    (uintmax_t)pa);
390			else
391				vm_phys_add_page(pa);
392			pa += PAGE_SIZE;
393		}
394	}
395	freeenv(list);
396#if VM_NRESERVLEVEL > 0
397	/*
398	 * Initialize the reservation management system.
399	 */
400	vm_reserv_init();
401#endif
402	return (vaddr);
403}
404
405void
406vm_page_flag_set(vm_page_t m, unsigned short bits)
407{
408
409	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
410	m->flags |= bits;
411}
412
413void
414vm_page_flag_clear(vm_page_t m, unsigned short bits)
415{
416
417	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
418	m->flags &= ~bits;
419}
420
421void
422vm_page_busy(vm_page_t m)
423{
424
425	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
426	KASSERT((m->oflags & VPO_BUSY) == 0,
427	    ("vm_page_busy: page already busy!!!"));
428	m->oflags |= VPO_BUSY;
429}
430
431/*
432 *      vm_page_flash:
433 *
434 *      Wake up anyone waiting for the page.
435 */
436void
437vm_page_flash(vm_page_t m)
438{
439
440	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
441	if (m->oflags & VPO_WANTED) {
442		m->oflags &= ~VPO_WANTED;
443		wakeup(m);
444	}
445}
446
447/*
448 *      vm_page_wakeup:
449 *
450 *      Clear the VPO_BUSY flag and wake up anyone waiting for the
451 *      page.
452 *
453 */
454void
455vm_page_wakeup(vm_page_t m)
456{
457
458	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
459	KASSERT(m->oflags & VPO_BUSY, ("vm_page_wakeup: page not busy!!!"));
460	m->oflags &= ~VPO_BUSY;
461	vm_page_flash(m);
462}
463
464void
465vm_page_io_start(vm_page_t m)
466{
467
468	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
469	m->busy++;
470}
471
472void
473vm_page_io_finish(vm_page_t m)
474{
475
476	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
477	m->busy--;
478	if (m->busy == 0)
479		vm_page_flash(m);
480}
481
482/*
483 * Keep the page from being freed by the page daemon.  This has
484 * much the same effect as wiring, but with much lower overhead,
485 * and it should be used only for *very* temporary holding
486 * (a lightweight form of wiring).
487 */
488void
489vm_page_hold(vm_page_t mem)
490{
491
492	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
493	mem->hold_count++;
494}
495
496void
497vm_page_unhold(vm_page_t mem)
498{
499
500	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
501	--mem->hold_count;
502	KASSERT(mem->hold_count >= 0, ("vm_page_unhold: hold count < 0!!!"));
503	if (mem->hold_count == 0 && VM_PAGE_INQUEUE2(mem, PQ_HOLD))
504		vm_page_free_toq(mem);
505}
506
507/*
508 *	vm_page_free:
509 *
510 *	Free a page.
511 */
512void
513vm_page_free(vm_page_t m)
514{
515
516	m->flags &= ~PG_ZERO;
517	vm_page_free_toq(m);
518}
519
520/*
521 *	vm_page_free_zero:
522 *
523 *	Free a page to the zeroed-pages queue.
524 */
525void
526vm_page_free_zero(vm_page_t m)
527{
528
529	m->flags |= PG_ZERO;
530	vm_page_free_toq(m);
531}
532
533/*
534 *	vm_page_sleep:
535 *
536 *	Sleep and release the page queues lock.
537 *
538 *	The object containing the given page must be locked.
539 */
540void
541vm_page_sleep(vm_page_t m, const char *msg)
542{
543
544	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
545	if (!mtx_owned(&vm_page_queue_mtx))
546		vm_page_lock_queues();
547	vm_page_flag_set(m, PG_REFERENCED);
548	vm_page_unlock_queues();
549
550	/*
551	 * It's possible that while we sleep, the page will get
552	 * unbusied and freed.  If we are holding the object
553	 * lock, we will assume we hold a reference to the object
554	 * such that even if m->object changes, we can re-lock
555	 * it.
556	 */
557	m->oflags |= VPO_WANTED;
558	msleep(m, VM_OBJECT_MTX(m->object), PVM, msg, 0);
559}
560
561/*
562 *	vm_page_dirty:
563 *
564 *	Make the entire page dirty.
565 */
566void
567vm_page_dirty(vm_page_t m)
568{
569	KASSERT((m->flags & PG_CACHED) == 0,
570	    ("vm_page_dirty: page in cache!"));
571	KASSERT(!VM_PAGE_IS_FREE(m),
572	    ("vm_page_dirty: page is free!"));
573	m->dirty = VM_PAGE_BITS_ALL;
574}
575
576/*
577 *	vm_page_splay:
578 *
579 *	Implements Sleator and Tarjan's top-down splay algorithm.  Returns
580 *	the vm_page containing the given pindex.  If, however, that
581 *	pindex is not found in the vm_object, returns a vm_page that is
582 *	adjacent to the pindex, coming before or after it.
583 */
584vm_page_t
585vm_page_splay(vm_pindex_t pindex, vm_page_t root)
586{
587	struct vm_page dummy;
588	vm_page_t lefttreemax, righttreemin, y;
589
590	if (root == NULL)
591		return (root);
592	lefttreemax = righttreemin = &dummy;
593	for (;; root = y) {
594		if (pindex < root->pindex) {
595			if ((y = root->left) == NULL)
596				break;
597			if (pindex < y->pindex) {
598				/* Rotate right. */
599				root->left = y->right;
600				y->right = root;
601				root = y;
602				if ((y = root->left) == NULL)
603					break;
604			}
605			/* Link into the new root's right tree. */
606			righttreemin->left = root;
607			righttreemin = root;
608		} else if (pindex > root->pindex) {
609			if ((y = root->right) == NULL)
610				break;
611			if (pindex > y->pindex) {
612				/* Rotate left. */
613				root->right = y->left;
614				y->left = root;
615				root = y;
616				if ((y = root->right) == NULL)
617					break;
618			}
619			/* Link into the new root's left tree. */
620			lefttreemax->right = root;
621			lefttreemax = root;
622		} else
623			break;
624	}
625	/* Assemble the new root. */
626	lefttreemax->right = root->left;
627	righttreemin->left = root->right;
628	root->left = dummy.right;
629	root->right = dummy.left;
630	return (root);
631}
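
/*
 * Editorial usage note: the page returned by vm_page_splay() becomes the
 * new root, so callers must store it back before testing it, as in this
 * lookup-style sketch ("object" and "pindex" assumed in scope):
 *
 *	if ((m = object->root) != NULL && m->pindex != pindex) {
 *		m = vm_page_splay(pindex, m);
 *		if ((object->root = m)->pindex != pindex)
 *			m = NULL;
 *	}
 */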
632
633/*
634 *	vm_page_insert:		[ internal use only ]
635 *
636 *	Inserts the given mem entry into the object and object list.
637 *
638 *	The pagetables are not updated but will presumably fault the page
639 *	in if necessary, or if a kernel page the caller will at some point
640 *	enter the page into the kernel's pmap.  We are not allowed to block
641 *	here so we *can't* do this anyway.
642 *
643 *	The object and page must be locked.
644 *	This routine may not block.
645 */
646void
647vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
648{
649	vm_page_t root;
650
651	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
652	if (m->object != NULL)
653		panic("vm_page_insert: page already inserted");
654
655	/*
656	 * Record the object/offset pair in this page
657	 */
658	m->object = object;
659	m->pindex = pindex;
660
661	/*
662	 * Now link into the object's ordered list of backed pages.
663	 */
664	root = object->root;
665	if (root == NULL) {
666		m->left = NULL;
667		m->right = NULL;
668		TAILQ_INSERT_TAIL(&object->memq, m, listq);
669	} else {
670		root = vm_page_splay(pindex, root);
671		if (pindex < root->pindex) {
672			m->left = root->left;
673			m->right = root;
674			root->left = NULL;
675			TAILQ_INSERT_BEFORE(root, m, listq);
676		} else if (pindex == root->pindex)
677			panic("vm_page_insert: offset already allocated");
678		else {
679			m->right = root->right;
680			m->left = root;
681			root->right = NULL;
682			TAILQ_INSERT_AFTER(&object->memq, root, m, listq);
683		}
684	}
685	object->root = m;
686	object->generation++;
687
688	/*
689	 * show that the object has one more resident page.
690	 */
691	object->resident_page_count++;
692	/*
693	 * Hold the vnode until the last page is released.
694	 */
695	if (object->resident_page_count == 1 && object->type == OBJT_VNODE)
696		vhold((struct vnode *)object->handle);
697
698	/*
699	 * Since we are inserting a new and possibly dirty page,
700	 * update the object's OBJ_MIGHTBEDIRTY flag.
701	 */
702	if (m->flags & PG_WRITEABLE)
703		vm_object_set_writeable_dirty(object);
704}
705
706/*
707 *	vm_page_remove:
708 *				NOTE: used by device pager as well -wfj
709 *
710 *	Removes the given mem entry from the object/offset-page
711 *	table and the object page list, but does not invalidate/terminate
712 *	the backing store.
713 *
714 *	The object and page must be locked.
715 *	The underlying pmap entry (if any) is NOT removed here.
716 *	This routine may not block.
717 */
718void
719vm_page_remove(vm_page_t m)
720{
721	vm_object_t object;
722	vm_page_t root;
723
724	if ((object = m->object) == NULL)
725		return;
726	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
727	if (m->oflags & VPO_BUSY) {
728		m->oflags &= ~VPO_BUSY;
729		vm_page_flash(m);
730	}
731	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
732
733	/*
734	 * Now remove from the object's list of backed pages.
735	 */
736	if (m != object->root)
737		vm_page_splay(m->pindex, object->root);
738	if (m->left == NULL)
739		root = m->right;
740	else {
741		root = vm_page_splay(m->pindex, m->left);
742		root->right = m->right;
743	}
744	object->root = root;
745	TAILQ_REMOVE(&object->memq, m, listq);
746
747	/*
748	 * And show that the object has one fewer resident page.
749	 */
750	object->resident_page_count--;
751	object->generation++;
752	/*
753	 * The vnode may now be recycled.
754	 */
755	if (object->resident_page_count == 0 && object->type == OBJT_VNODE)
756		vdrop((struct vnode *)object->handle);
757
758	m->object = NULL;
759}
760
761/*
762 *	vm_page_lookup:
763 *
764 *	Returns the page associated with the object/offset
765 *	pair specified; if none is found, NULL is returned.
766 *
767 *	The object must be locked.
768 *	This routine may not block.
769 *	This is a critical path routine
770 */
771vm_page_t
772vm_page_lookup(vm_object_t object, vm_pindex_t pindex)
773{
774	vm_page_t m;
775
776	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
777	if ((m = object->root) != NULL && m->pindex != pindex) {
778		m = vm_page_splay(pindex, m);
779		if ((object->root = m)->pindex != pindex)
780			m = NULL;
781	}
782	return (m);
783}
784
785/*
786 *	vm_page_rename:
787 *
788 *	Move the given memory entry from its
789 *	current object to the specified target object/offset.
790 *
791 *	The object must be locked.
792 *	This routine may not block.
793 *
794 *	Note: swap associated with the page must be invalidated by the move.  We
795 *	      have to do this for several reasons:  (1) we aren't freeing the
796 *	      page, (2) we are dirtying the page, (3) the VM system is probably
797 *	      moving the page from object A to B, and will then later move
798 *	      the backing store from A to B and we can't have a conflict.
799 *
800 *	Note: we *always* dirty the page.  It is necessary both for the
801 *	      fact that we moved it, and because we may be invalidating
802 *	      swap.  If the page is on the cache, we have to deactivate it
803 *	      or vm_page_dirty() will panic.  Dirty pages are not allowed
804 *	      on the cache.
805 */
806void
807vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex)
808{
809
810	vm_page_remove(m);
811	vm_page_insert(m, new_object, new_pindex);
812	vm_page_dirty(m);
813}
814
815/*
816 *	Convert all of the given object's cached pages that have a
817 *	pindex within the given range into free pages.  If the value
818 *	zero is given for "end", then the range's upper bound is
819 *	infinity.  If the given object is backed by a vnode and it
820 *	transitions from having one or more cached pages to none, the
821 *	vnode's hold count is reduced.
822 */
823void
824vm_page_cache_free(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
825{
826	vm_page_t m, m_next;
827	boolean_t empty;
828
829	mtx_lock(&vm_page_queue_free_mtx);
830	if (__predict_false(object->cache == NULL)) {
831		mtx_unlock(&vm_page_queue_free_mtx);
832		return;
833	}
834	m = object->cache = vm_page_splay(start, object->cache);
835	if (m->pindex < start) {
836		if (m->right == NULL)
837			m = NULL;
838		else {
839			m_next = vm_page_splay(start, m->right);
840			m_next->left = m;
841			m->right = NULL;
842			m = object->cache = m_next;
843		}
844	}
845
846	/*
847	 * At this point, "m" is either (1) a reference to the page
848	 * with the least pindex that is greater than or equal to
849	 * "start" or (2) NULL.
850	 */
851	for (; m != NULL && (m->pindex < end || end == 0); m = m_next) {
852		/*
853		 * Find "m"'s successor and remove "m" from the
854		 * object's cache.
855		 */
856		if (m->right == NULL) {
857			object->cache = m->left;
858			m_next = NULL;
859		} else {
860			m_next = vm_page_splay(start, m->right);
861			m_next->left = m->left;
862			object->cache = m_next;
863		}
864		/* Convert "m" to a free page. */
865		m->object = NULL;
866		m->valid = 0;
867		/* Clear PG_CACHED and set PG_FREE. */
868		m->flags ^= PG_CACHED | PG_FREE;
869		KASSERT((m->flags & (PG_CACHED | PG_FREE)) == PG_FREE,
870		    ("vm_page_cache_free: page %p has inconsistent flags", m));
871		cnt.v_cache_count--;
872		cnt.v_free_count++;
873	}
874	empty = object->cache == NULL;
875	mtx_unlock(&vm_page_queue_free_mtx);
876	if (object->type == OBJT_VNODE && empty)
877		vdrop(object->handle);
878}
879
880/*
881 *	Returns the cached page that is associated with the given
882 *	object and offset.  If, however, none exists, returns NULL.
883 *
884 *	The free page queue must be locked.
885 */
886static inline vm_page_t
887vm_page_cache_lookup(vm_object_t object, vm_pindex_t pindex)
888{
889	vm_page_t m;
890
891	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
892	if ((m = object->cache) != NULL && m->pindex != pindex) {
893		m = vm_page_splay(pindex, m);
894		if ((object->cache = m)->pindex != pindex)
895			m = NULL;
896	}
897	return (m);
898}
899
900/*
901 *	Remove the given cached page from its containing object's
902 *	collection of cached pages.
903 *
904 *	The free page queue must be locked.
905 */
906void
907vm_page_cache_remove(vm_page_t m)
908{
909	vm_object_t object;
910	vm_page_t root;
911
912	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
913	KASSERT((m->flags & PG_CACHED) != 0,
914	    ("vm_page_cache_remove: page %p is not cached", m));
915	object = m->object;
916	if (m != object->cache) {
917		root = vm_page_splay(m->pindex, object->cache);
918		KASSERT(root == m,
919		    ("vm_page_cache_remove: page %p is not cached in object %p",
920		    m, object));
921	}
922	if (m->left == NULL)
923		root = m->right;
924	else if (m->right == NULL)
925		root = m->left;
926	else {
927		root = vm_page_splay(m->pindex, m->left);
928		root->right = m->right;
929	}
930	object->cache = root;
931	m->object = NULL;
932	cnt.v_cache_count--;
933}
934
935/*
936 *	Transfer all of the cached pages with offset greater than or
937 *	equal to 'offidxstart' from the original object's cache to the
938 *	new object's cache.  However, any cached pages with offset
939 *	greater than or equal to the new object's size are kept in the
940 *	original object.  Initially, the new object's cache must be
941 *	empty.  Offset 'offidxstart' in the original object must
942 *	correspond to offset zero in the new object.
943 *
944 *	The new object must be locked.
945 */
946void
947vm_page_cache_transfer(vm_object_t orig_object, vm_pindex_t offidxstart,
948    vm_object_t new_object)
949{
950	vm_page_t m, m_next;
951
952	/*
953	 * Insertion into an object's collection of cached pages
954	 * requires the object to be locked.  In contrast, removal does
955	 * not.
956	 */
957	VM_OBJECT_LOCK_ASSERT(new_object, MA_OWNED);
958	KASSERT(new_object->cache == NULL,
959	    ("vm_page_cache_transfer: object %p has cached pages",
960	    new_object));
961	mtx_lock(&vm_page_queue_free_mtx);
962	if ((m = orig_object->cache) != NULL) {
963		/*
964		 * Transfer all of the pages with offset greater than or
965		 * equal to 'offidxstart' from the original object's
966		 * cache to the new object's cache.
967		 */
968		m = vm_page_splay(offidxstart, m);
969		if (m->pindex < offidxstart) {
970			orig_object->cache = m;
971			new_object->cache = m->right;
972			m->right = NULL;
973		} else {
974			orig_object->cache = m->left;
975			new_object->cache = m;
976			m->left = NULL;
977		}
978		while ((m = new_object->cache) != NULL) {
979			if ((m->pindex - offidxstart) >= new_object->size) {
980				/*
981				 * Return all of the cached pages with
982				 * offset greater than or equal to the
983				 * new object's size to the original
984				 * object's cache.
985				 */
986				new_object->cache = m->left;
987				m->left = orig_object->cache;
988				orig_object->cache = m;
989				break;
990			}
991			m_next = vm_page_splay(m->pindex, m->right);
992			/* Update the page's object and offset. */
993			m->object = new_object;
994			m->pindex -= offidxstart;
995			if (m_next == NULL)
996				break;
997			m->right = NULL;
998			m_next->left = m;
999			new_object->cache = m_next;
1000		}
1001		KASSERT(new_object->cache == NULL ||
1002		    new_object->type == OBJT_SWAP,
1003		    ("vm_page_cache_transfer: object %p's type is incompatible"
1004		    " with cached pages", new_object));
1005	}
1006	mtx_unlock(&vm_page_queue_free_mtx);
1007}
1008
1009/*
1010 *	vm_page_alloc:
1011 *
1012 *	Allocate and return a memory cell associated
1013 *	with this VM object/offset pair.
1014 *
1015 *	page_req classes:
1016 *	VM_ALLOC_NORMAL		normal process request
1017 *	VM_ALLOC_SYSTEM		system *really* needs a page
1018 *	VM_ALLOC_INTERRUPT	interrupt time request
1019 *	VM_ALLOC_ZERO		zero page
1020 *
1021 *	This routine may not block.
1022 */
1023vm_page_t
1024vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req)
1025{
1026	struct vnode *vp = NULL;
1027	vm_object_t m_object;
1028	vm_page_t m;
1029	int flags, page_req;
1030
1031	page_req = req & VM_ALLOC_CLASS_MASK;
1032	KASSERT(curthread->td_intr_nesting_level == 0 ||
1033	    page_req == VM_ALLOC_INTERRUPT,
1034	    ("vm_page_alloc(NORMAL|SYSTEM) in interrupt context"));
1035
1036	if ((req & VM_ALLOC_NOOBJ) == 0) {
1037		KASSERT(object != NULL,
1038		    ("vm_page_alloc: NULL object."));
1039		VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
1040	}
1041
1042	/*
1043	 * The pager is allowed to eat deeper into the free page list.
1044	 */
1045	if ((curproc == pageproc) && (page_req != VM_ALLOC_INTERRUPT)) {
1046		page_req = VM_ALLOC_SYSTEM;
1047	}
1048
1049	mtx_lock(&vm_page_queue_free_mtx);
1050	if (cnt.v_free_count + cnt.v_cache_count > cnt.v_free_reserved ||
1051	    (page_req == VM_ALLOC_SYSTEM &&
1052	    cnt.v_free_count + cnt.v_cache_count > cnt.v_interrupt_free_min) ||
1053	    (page_req == VM_ALLOC_INTERRUPT &&
1054	    cnt.v_free_count + cnt.v_cache_count > 0)) {
1055		/*
1056		 * Allocate from the free queue if the number of free pages
1057		 * exceeds the minimum for the request class.
1058		 */
1059		if (object != NULL &&
1060		    (m = vm_page_cache_lookup(object, pindex)) != NULL) {
1061			if ((req & VM_ALLOC_IFNOTCACHED) != 0) {
1062				mtx_unlock(&vm_page_queue_free_mtx);
1063				return (NULL);
1064			}
1065			if (vm_phys_unfree_page(m))
1066				vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m, 0);
1067#if VM_NRESERVLEVEL > 0
1068			else if (!vm_reserv_reactivate_page(m))
1069#else
1070			else
1071#endif
1072				panic("vm_page_alloc: cache page %p is missing"
1073				    " from the free queue", m);
1074		} else if ((req & VM_ALLOC_IFCACHED) != 0) {
1075			mtx_unlock(&vm_page_queue_free_mtx);
1076			return (NULL);
1077#if VM_NRESERVLEVEL > 0
1078		} else if (object == NULL || object->type == OBJT_DEVICE ||
1079		    (object->flags & OBJ_COLORED) == 0 ||
1080		    (m = vm_reserv_alloc_page(object, pindex)) == NULL) {
1081#else
1082		} else {
1083#endif
1084			m = vm_phys_alloc_pages(object != NULL ?
1085			    VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT, 0);
1086#if VM_NRESERVLEVEL > 0
1087			if (m == NULL && vm_reserv_reclaim_inactive()) {
1088				m = vm_phys_alloc_pages(object != NULL ?
1089				    VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT,
1090				    0);
1091			}
1092#endif
1093		}
1094	} else {
1095		/*
1096		 * Not allocatable, give up.
1097		 */
1098		mtx_unlock(&vm_page_queue_free_mtx);
1099		atomic_add_int(&vm_pageout_deficit, 1);
1100		pagedaemon_wakeup();
1101		return (NULL);
1102	}
1103
1104	/*
1105	 *  At this point we had better have found a good page.
1106	 */
1107
1108	KASSERT(
1109	    m != NULL,
1110	    ("vm_page_alloc(): missing page on free queue")
1111	);
1112	if ((m->flags & PG_CACHED) != 0) {
1113		KASSERT(m->valid != 0,
1114		    ("vm_page_alloc: cached page %p is invalid", m));
1115		if (m->object == object && m->pindex == pindex)
1116	  		cnt.v_reactivated++;
1117		else
1118			m->valid = 0;
1119		m_object = m->object;
1120		vm_page_cache_remove(m);
1121		if (m_object->type == OBJT_VNODE && m_object->cache == NULL)
1122			vp = m_object->handle;
1123	} else {
1124		KASSERT(VM_PAGE_IS_FREE(m),
1125		    ("vm_page_alloc: page %p is not free", m));
1126		KASSERT(m->valid == 0,
1127		    ("vm_page_alloc: free page %p is valid", m));
1128		cnt.v_free_count--;
1129	}
1130
1131	/*
1132	 * Initialize structure.  Only the PG_ZERO flag is inherited.
1133	 */
1134	flags = 0;
1135	if (m->flags & PG_ZERO) {
1136		vm_page_zero_count--;
1137		if (req & VM_ALLOC_ZERO)
1138			flags = PG_ZERO;
1139	}
1140	if (object == NULL || object->type == OBJT_PHYS)
1141		flags |= PG_UNMANAGED;
1142	m->flags = flags;
1143	if (req & (VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ))
1144		m->oflags = 0;
1145	else
1146		m->oflags = VPO_BUSY;
1147	if (req & VM_ALLOC_WIRED) {
1148		atomic_add_int(&cnt.v_wire_count, 1);
1149		m->wire_count = 1;
1150	} else
1151		m->wire_count = 0;
1152	m->hold_count = 0;
1153	m->act_count = 0;
1154	m->busy = 0;
1155	KASSERT(m->dirty == 0, ("vm_page_alloc: free/cache page %p was dirty", m));
1156	mtx_unlock(&vm_page_queue_free_mtx);
1157
1158	if ((req & VM_ALLOC_NOOBJ) == 0)
1159		vm_page_insert(m, object, pindex);
1160	else
1161		m->pindex = pindex;
1162
1163	/*
1164	 * The following call to vdrop() must come after the above call
1165	 * to vm_page_insert() in case both affect the same object and
1166	 * vnode.  Otherwise, the affected vnode's hold count could
1167	 * temporarily become zero.
1168	 */
1169	if (vp != NULL)
1170		vdrop(vp);
1171
1172	/*
1173	 * Don't wake up the pageout daemon too often; wake it up when
1174	 * we would be nearly out of memory.
1175	 */
1176	if (vm_paging_needed())
1177		pagedaemon_wakeup();
1178
1179	return (m);
1180}
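
/*
 * Editorial sketch, not from the original source: a typical caller
 * allocates a busied page for a locked object and sleeps via VM_WAIT on
 * shortage.  "obj" and "idx" are hypothetical.
 *
 *	while ((m = vm_page_alloc(obj, idx,
 *	    VM_ALLOC_NORMAL | VM_ALLOC_ZERO)) == NULL) {
 *		VM_OBJECT_UNLOCK(obj);
 *		VM_WAIT;
 *		VM_OBJECT_LOCK(obj);
 *	}
 *	if ((m->flags & PG_ZERO) == 0)
 *		pmap_zero_page(m);
 */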
1181
1182/*
1183 *	vm_wait:	(also see VM_WAIT macro)
1184 *
1185 *	Block until free pages are available for allocation
1186 *	- Called in various places before memory allocations.
1187 */
1188void
1189vm_wait(void)
1190{
1191
1192	mtx_lock(&vm_page_queue_free_mtx);
1193	if (curproc == pageproc) {
1194		vm_pageout_pages_needed = 1;
1195		msleep(&vm_pageout_pages_needed, &vm_page_queue_free_mtx,
1196		    PDROP | PSWP, "VMWait", 0);
1197	} else {
1198		if (!vm_pages_needed) {
1199			vm_pages_needed = 1;
1200			wakeup(&vm_pages_needed);
1201		}
1202		msleep(&cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | PVM,
1203		    "vmwait", 0);
1204	}
1205}
1206
1207/*
1208 *	vm_waitpfault:	(also see VM_WAITPFAULT macro)
1209 *
1210 *	Block until free pages are available for allocation
1211 *	- Called only in vm_fault so that processes page faulting
1212 *	  can be easily tracked.
1213 *	- Sleeps at a lower priority than vm_wait() so that vm_wait()ing
1214 *	  processes will be able to grab memory first.  Do not change
1215 *	  this balance without careful testing first.
1216 */
1217void
1218vm_waitpfault(void)
1219{
1220
1221	mtx_lock(&vm_page_queue_free_mtx);
1222	if (!vm_pages_needed) {
1223		vm_pages_needed = 1;
1224		wakeup(&vm_pages_needed);
1225	}
1226	msleep(&cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | PUSER,
1227	    "pfault", 0);
1228}
1229
1230/*
1231 *	vm_page_requeue:
1232 *
1233 *	If the given page is contained within a page queue, move it to the tail
1234 *	of that queue.
1235 *
1236 *	The page queues must be locked.
1237 */
1238void
1239vm_page_requeue(vm_page_t m)
1240{
1241	int queue = VM_PAGE_GETQUEUE(m);
1242	struct vpgqueues *vpq;
1243
1244	if (queue != PQ_NONE) {
1245		vpq = &vm_page_queues[queue];
1246		TAILQ_REMOVE(&vpq->pl, m, pageq);
1247		TAILQ_INSERT_TAIL(&vpq->pl, m, pageq);
1248	}
1249}
1250
1251/*
1252 *	vm_pageq_remove:
1253 *
1254 *	Remove a page from its queue.
1255 *
1256 *	The queue containing the given page must be locked.
1257 *	This routine may not block.
1258 */
1259void
1260vm_pageq_remove(vm_page_t m)
1261{
1262	int queue = VM_PAGE_GETQUEUE(m);
1263	struct vpgqueues *pq;
1264
1265	if (queue != PQ_NONE) {
1266		VM_PAGE_SETQUEUE2(m, PQ_NONE);
1267		pq = &vm_page_queues[queue];
1268		TAILQ_REMOVE(&pq->pl, m, pageq);
1269		(*pq->cnt)--;
1270	}
1271}
1272
1273/*
1274 *	vm_page_enqueue:
1275 *
1276 *	Add the given page to the specified queue.
1277 *
1278 *	The page queues must be locked.
1279 */
1280static void
1281vm_page_enqueue(int queue, vm_page_t m)
1282{
1283	struct vpgqueues *vpq;
1284
1285	vpq = &vm_page_queues[queue];
1286	VM_PAGE_SETQUEUE2(m, queue);
1287	TAILQ_INSERT_TAIL(&vpq->pl, m, pageq);
1288	++*vpq->cnt;
1289}
1290
1291/*
1292 *	vm_page_activate:
1293 *
1294 *	Put the specified page on the active list (if appropriate).
1295 *	Ensure that act_count is at least ACT_INIT but do not otherwise
1296 *	mess with it.
1297 *
1298 *	The page queues must be locked.
1299 *	This routine may not block.
1300 */
1301void
1302vm_page_activate(vm_page_t m)
1303{
1304
1305	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1306	if (VM_PAGE_GETKNOWNQUEUE2(m) != PQ_ACTIVE) {
1307		vm_pageq_remove(m);
1308		if (m->wire_count == 0 && (m->flags & PG_UNMANAGED) == 0) {
1309			if (m->act_count < ACT_INIT)
1310				m->act_count = ACT_INIT;
1311			vm_page_enqueue(PQ_ACTIVE, m);
1312		}
1313	} else {
1314		if (m->act_count < ACT_INIT)
1315			m->act_count = ACT_INIT;
1316	}
1317}
1318
1319/*
1320 *	vm_page_free_wakeup:
1321 *
1322 *	Helper routine for vm_page_free_toq() and vm_page_cache().  This
1323 *	routine is called when a page has been added to the cache or free
1324 *	queues.
1325 *
1326 *	The page queues must be locked.
1327 *	This routine may not block.
1328 */
1329static inline void
1330vm_page_free_wakeup(void)
1331{
1332
1333	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
1334	/*
1335	 * If the pageout daemon needs pages, then tell it that there
1336	 * are some free.
1337	 */
1338	if (vm_pageout_pages_needed &&
1339	    cnt.v_cache_count + cnt.v_free_count >= cnt.v_pageout_free_min) {
1340		wakeup(&vm_pageout_pages_needed);
1341		vm_pageout_pages_needed = 0;
1342	}
1343	/*
1344	 * Wake up processes that are waiting on memory if we hit a
1345	 * high water mark, and wake up the scheduler process if we have
1346	 * lots of memory; that process will swap in other processes.
1347	 */
1348	if (vm_pages_needed && !vm_page_count_min()) {
1349		vm_pages_needed = 0;
1350		wakeup(&cnt.v_free_count);
1351	}
1352}
1353
1354/*
1355 *	vm_page_free_toq:
1356 *
1357 *	Returns the given page to the free list,
1358 *	disassociating it from any VM object.
1359 *
1360 *	Object and page must be locked prior to entry.
1361 *	This routine may not block.
1362 */
1363
1364void
1365vm_page_free_toq(vm_page_t m)
1366{
1367
1368	if (VM_PAGE_GETQUEUE(m) != PQ_NONE)
1369		mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1370	KASSERT(!pmap_page_is_mapped(m),
1371	    ("vm_page_free_toq: freeing mapped page %p", m));
1372	PCPU_INC(cnt.v_tfree);
1373
1374	if (m->busy || VM_PAGE_IS_FREE(m)) {
1375		printf(
1376		"vm_page_free: pindex(%lu), busy(%d), VPO_BUSY(%d), hold(%d)\n",
1377		    (u_long)m->pindex, m->busy, (m->oflags & VPO_BUSY) ? 1 : 0,
1378		    m->hold_count);
1379		if (VM_PAGE_IS_FREE(m))
1380			panic("vm_page_free: freeing free page");
1381		else
1382			panic("vm_page_free: freeing busy page");
1383	}
1384
1385	/*
1386	 * unqueue, then remove page.  Note that we cannot destroy
1387	 * the page here because we do not want to call the pager's
1388	 * callback routine until after we've put the page on the
1389	 * appropriate free queue.
1390	 */
1391	vm_pageq_remove(m);
1392	vm_page_remove(m);
1393
1394	/*
1395	 * If fictitious, remove the object association and return;
1396	 * otherwise, delay the object association removal.
1397	 */
1398	if ((m->flags & PG_FICTITIOUS) != 0) {
1399		return;
1400	}
1401
1402	m->valid = 0;
1403	vm_page_undirty(m);
1404
1405	if (m->wire_count != 0) {
1406		if (m->wire_count > 1) {
1407			panic("vm_page_free: invalid wire count (%d), pindex: 0x%lx",
1408				m->wire_count, (long)m->pindex);
1409		}
1410		panic("vm_page_free: freeing wired page");
1411	}
1412	if (m->hold_count != 0) {
1413		m->flags &= ~PG_ZERO;
1414		vm_page_enqueue(PQ_HOLD, m);
1415	} else {
1416		mtx_lock(&vm_page_queue_free_mtx);
1417		m->flags |= PG_FREE;
1418		cnt.v_free_count++;
1419#if VM_NRESERVLEVEL > 0
1420		if (!vm_reserv_free_page(m))
1421#else
1422		if (TRUE)
1423#endif
1424			vm_phys_free_pages(m, 0);
1425		if ((m->flags & PG_ZERO) != 0)
1426			++vm_page_zero_count;
1427		else
1428			vm_page_zero_idle_wakeup();
1429		vm_page_free_wakeup();
1430		mtx_unlock(&vm_page_queue_free_mtx);
1431	}
1432}
1433
1434/*
1435 *	vm_page_wire:
1436 *
1437 *	Mark this page as wired down by yet
1438 *	another map, removing it from paging queues
1439 *	as necessary.
1440 *
1441 *	The page queues must be locked.
1442 *	This routine may not block.
1443 */
1444void
1445vm_page_wire(vm_page_t m)
1446{
1447
1448	/*
1449	 * Only bump the wire statistics if the page is not already wired,
1450	 * and only unqueue the page if it is on some queue (if it is unmanaged
1451	 * it is already off the queues).
1452	 */
1453	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1454	if (m->flags & PG_FICTITIOUS)
1455		return;
1456	if (m->wire_count == 0) {
1457		if ((m->flags & PG_UNMANAGED) == 0)
1458			vm_pageq_remove(m);
1459		atomic_add_int(&cnt.v_wire_count, 1);
1460	}
1461	m->wire_count++;
1462	KASSERT(m->wire_count != 0, ("vm_page_wire: wire_count overflow m=%p", m));
1463}
1464
1465/*
1466 *	vm_page_unwire:
1467 *
1468 *	Release one wiring of this page, potentially
1469 *	enabling it to be paged again.
1470 *
1471 *	Many pages placed on the inactive queue should actually go
1472 *	into the cache, but it is difficult to figure out which.  What
1473 *	we do instead, if the inactive target is well met, is to put
1474 *	clean pages at the head of the inactive queue instead of the tail.
1475 *	This will cause them to be moved to the cache more quickly and
1476 *	if not actively re-referenced, freed more quickly.  If we just
1477 *	stick these pages at the end of the inactive queue, heavy filesystem
1478 *	meta-data accesses can cause an unnecessary paging load on memory bound
1479 *	processes.  This optimization causes one-time-use metadata to be
1480 *	reused more quickly.
1481 *
1482 *	BUT, if we are in a low-memory situation we have no choice but to
1483 *	put clean pages on the cache queue.
1484 *
1485 *	A number of routines use vm_page_unwire() to guarantee that the page
1486 *	will go into either the inactive or active queues, and will NEVER
1487 *	be placed in the cache - for example, just after dirtying a page.
1488 *	dirty pages in the cache are not allowed.
1489 *
1490 *	The page queues must be locked.
1491 *	This routine may not block.
1492 */
1493void
1494vm_page_unwire(vm_page_t m, int activate)
1495{
1496
1497	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1498	if (m->flags & PG_FICTITIOUS)
1499		return;
1500	if (m->wire_count > 0) {
1501		m->wire_count--;
1502		if (m->wire_count == 0) {
1503			atomic_subtract_int(&cnt.v_wire_count, 1);
1504			if (m->flags & PG_UNMANAGED) {
1505				;
1506			} else if (activate)
1507				vm_page_enqueue(PQ_ACTIVE, m);
1508			else {
1509				vm_page_flag_clear(m, PG_WINATCFLS);
1510				vm_page_enqueue(PQ_INACTIVE, m);
1511			}
1512		}
1513	} else {
1514		panic("vm_page_unwire: invalid wire count: %d", m->wire_count);
1515	}
1516}
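
/*
 * Editorial usage note: a caller typically releases a wiring with the
 * page queues locked, e.g. to drop a page to the inactive queue after
 * I/O completes:
 *
 *	vm_page_lock_queues();
 *	vm_page_unwire(m, 0);
 *	vm_page_unlock_queues();
 */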
1517
1518
1519/*
1520 * Move the specified page to the inactive queue.  If the page has
1521 * any associated swap, the swap is deallocated.
1522 *
1523 * Normally athead is 0 resulting in LRU operation.  athead is set
1524 * to 1 if we want this page to be 'as if it were placed in the cache',
1525 * except without unmapping it from the process address space.
1526 *
1527 * This routine may not block.
1528 */
1529static inline void
1530_vm_page_deactivate(vm_page_t m, int athead)
1531{
1532
1533	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1534
1535	/*
1536	 * Ignore if already inactive.
1537	 */
1538	if (VM_PAGE_INQUEUE2(m, PQ_INACTIVE))
1539		return;
1540	if (m->wire_count == 0 && (m->flags & PG_UNMANAGED) == 0) {
1541		vm_page_flag_clear(m, PG_WINATCFLS);
1542		vm_pageq_remove(m);
1543		if (athead)
1544			TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
1545		else
1546			TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
1547		VM_PAGE_SETQUEUE2(m, PQ_INACTIVE);
1548		cnt.v_inactive_count++;
1549	}
1550}
1551
1552void
1553vm_page_deactivate(vm_page_t m)
1554{
1555	_vm_page_deactivate(m, 0);
1556}
1557
1558/*
1559 * vm_page_try_to_cache:
1560 *
1561 * Returns 0 on failure, 1 on success
1562 */
1563int
1564vm_page_try_to_cache(vm_page_t m)
1565{
1566
1567	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1568	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
1569	if (m->dirty || m->hold_count || m->busy || m->wire_count ||
1570	    (m->oflags & VPO_BUSY) || (m->flags & PG_UNMANAGED)) {
1571		return (0);
1572	}
1573	pmap_remove_all(m);
1574	if (m->dirty)
1575		return (0);
1576	vm_page_cache(m);
1577	return (1);
1578}
1579
1580/*
1581 * vm_page_try_to_free()
1582 *
1583 *	Attempt to free the page.  If we cannot free it, we do nothing.
1584 *	1 is returned on success, 0 on failure.
1585 */
1586int
1587vm_page_try_to_free(vm_page_t m)
1588{
1589
1590	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1591	if (m->object != NULL)
1592		VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
1593	if (m->dirty || m->hold_count || m->busy || m->wire_count ||
1594	    (m->oflags & VPO_BUSY) || (m->flags & PG_UNMANAGED)) {
1595		return (0);
1596	}
1597	pmap_remove_all(m);
1598	if (m->dirty)
1599		return (0);
1600	vm_page_free(m);
1601	return (1);
1602}
1603
1604/*
1605 * vm_page_cache
1606 *
1607 * Put the specified page onto the page cache queue (if appropriate).
1608 *
1609 * This routine may not block.
1610 */
1611void
1612vm_page_cache(vm_page_t m)
1613{
1614	vm_object_t object;
1615	vm_page_t root;
1616
1617	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1618	object = m->object;
1619	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
1620	if ((m->flags & PG_UNMANAGED) || (m->oflags & VPO_BUSY) || m->busy ||
1621	    m->hold_count || m->wire_count) {
1622		panic("vm_page_cache: attempting to cache busy page");
1623	}
1624	pmap_remove_all(m);
1625	if (m->dirty != 0)
1626		panic("vm_page_cache: page %p is dirty", m);
1627	if (m->valid == 0 || object->type == OBJT_DEFAULT ||
1628	    (object->type == OBJT_SWAP &&
1629	    !vm_pager_has_page(object, m->pindex, NULL, NULL))) {
1630		/*
1631		 * Hypothesis: A cache-eligible page belonging to a
1632		 * default object or swap object but without a backing
1633		 * store must be zero filled.
1634		 */
1635		vm_page_free(m);
1636		return;
1637	}
1638	KASSERT((m->flags & PG_CACHED) == 0,
1639	    ("vm_page_cache: page %p is already cached", m));
1640	cnt.v_tcached++;
1641
1642	/*
1643	 * Remove the page from the paging queues.
1644	 */
1645	vm_pageq_remove(m);
1646
1647	/*
1648	 * Remove the page from the object's collection of resident
1649	 * pages.
1650	 */
1651	if (m != object->root)
1652		vm_page_splay(m->pindex, object->root);
1653	if (m->left == NULL)
1654		root = m->right;
1655	else {
1656		root = vm_page_splay(m->pindex, m->left);
1657		root->right = m->right;
1658	}
1659	object->root = root;
1660	TAILQ_REMOVE(&object->memq, m, listq);
1661	object->resident_page_count--;
1662	object->generation++;
1663
1664	/*
1665	 * Insert the page into the object's collection of cached pages
1666	 * and the physical memory allocator's cache/free page queues.
1667	 */
1668	vm_page_flag_clear(m, PG_ZERO);
1669	mtx_lock(&vm_page_queue_free_mtx);
1670	m->flags |= PG_CACHED;
1671	cnt.v_cache_count++;
1672	root = object->cache;
1673	if (root == NULL) {
1674		m->left = NULL;
1675		m->right = NULL;
1676	} else {
1677		root = vm_page_splay(m->pindex, root);
1678		if (m->pindex < root->pindex) {
1679			m->left = root->left;
1680			m->right = root;
1681			root->left = NULL;
1682		} else if (__predict_false(m->pindex == root->pindex))
1683			panic("vm_page_cache: offset already cached");
1684		else {
1685			m->right = root->right;
1686			m->left = root;
1687			root->right = NULL;
1688		}
1689	}
1690	object->cache = m;
1691#if VM_NRESERVLEVEL > 0
1692	if (!vm_reserv_free_page(m)) {
1693#else
1694	if (TRUE) {
1695#endif
1696		vm_phys_set_pool(VM_FREEPOOL_CACHE, m, 0);
1697		vm_phys_free_pages(m, 0);
1698	}
1699	vm_page_free_wakeup();
1700	mtx_unlock(&vm_page_queue_free_mtx);
1701
1702	/*
1703	 * Increment the vnode's hold count if this is the object's only
1704	 * cached page.  Decrement the vnode's hold count if this was
1705	 * the object's only resident page.
1706	 */
1707	if (object->type == OBJT_VNODE) {
1708		if (root == NULL && object->resident_page_count != 0)
1709			vhold(object->handle);
1710		else if (root != NULL && object->resident_page_count == 0)
1711			vdrop(object->handle);
1712	}
1713}
1714
1715/*
1716 * vm_page_dontneed
1717 *
1718 *	Cache, deactivate, or do nothing as appropriate.  This routine
1719 *	is typically used by madvise() MADV_DONTNEED.
1720 *
1721 *	Generally speaking we want to move the page into the cache so
1722 *	it gets reused quickly.  However, this can result in a silly syndrome
1723 *	due to the page recycling too quickly.  Small objects will not be
1724 *	fully cached.  On the other hand, if we move the page to the inactive
1725 *	queue we wind up with a problem whereby very large objects
1726 *	unnecessarily blow away our inactive and cache queues.
1727 *
1728 *	The solution is to move the pages based on a fixed weighting.  We
1729 *	either leave them alone, deactivate them, or move them to the cache,
1730 *	where moving them to the cache has the highest weighting.
1731 *	By forcing some pages into other queues we eventually force the
1732 *	system to balance the queues, potentially recovering other unrelated
1733 *	space from active.  The idea is to not force this to happen too
1734 *	often.
1735 */
1736void
1737vm_page_dontneed(vm_page_t m)
1738{
1739	static int dnweight;
1740	int dnw;
1741	int head;
1742
1743	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1744	dnw = ++dnweight;
1745
1746	/*
1747	 * Occasionally leave the page alone.
1748	 */
1749	if ((dnw & 0x01F0) == 0 ||
1750	    VM_PAGE_INQUEUE2(m, PQ_INACTIVE)) {
1751		if (m->act_count >= ACT_INIT)
1752			--m->act_count;
1753		return;
1754	}
1755
1756	/*
1757	 * Clear any references to the page.  Otherwise, the page daemon will
1758	 * immediately reactivate the page.
1759	 */
1760	vm_page_flag_clear(m, PG_REFERENCED);
1761	pmap_clear_reference(m);
1762
1763	if (m->dirty == 0 && pmap_is_modified(m))
1764		vm_page_dirty(m);
1765
1766	if (m->dirty || (dnw & 0x0070) == 0) {
1767		/*
1768		 * Deactivate the page 3 times out of 32.
1769		 */
1770		head = 0;
1771	} else {
1772		/*
1773		 * Cache the page 28 times out of every 32.  Note that
1774		 * the page is deactivated instead of cached, but placed
1775		 * at the head of the queue instead of the tail.
1776		 */
1777		head = 1;
1778	}
1779	_vm_page_deactivate(m, head);
1780}
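
/*
 * Editorial note on the weighting above: ignoring the dirty-page and
 * already-inactive shortcuts, out of every 32 calls the counter leaves
 * the page alone once ((dnw & 0x01F0) == 0), deactivates it at the tail
 * 3 times (the remaining (dnw & 0x0070) == 0 cases), and deactivates it
 * at the head of the inactive queue the other 28 times, matching the
 * 3/32 and 28/32 figures cited above.
 */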
1781
1782/*
1783 * Grab a page, waiting until we are woken up due to the page
1784 * changing state.  We keep on waiting as long as the page continues
1785 * to be in the object.  If the page doesn't exist, first allocate it
1786 * and then conditionally zero it.
1787 *
1788 * This routine may block.
1789 */
1790vm_page_t
1791vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags)
1792{
1793	vm_page_t m;
1794
1795	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
1796retrylookup:
1797	if ((m = vm_page_lookup(object, pindex)) != NULL) {
1798		if (vm_page_sleep_if_busy(m, TRUE, "pgrbwt")) {
1799			if ((allocflags & VM_ALLOC_RETRY) == 0)
1800				return (NULL);
1801			goto retrylookup;
1802		} else {
1803			if ((allocflags & VM_ALLOC_WIRED) != 0) {
1804				vm_page_lock_queues();
1805				vm_page_wire(m);
1806				vm_page_unlock_queues();
1807			}
1808			if ((allocflags & VM_ALLOC_NOBUSY) == 0)
1809				vm_page_busy(m);
1810			return (m);
1811		}
1812	}
1813	m = vm_page_alloc(object, pindex, allocflags & ~VM_ALLOC_RETRY);
1814	if (m == NULL) {
1815		VM_OBJECT_UNLOCK(object);
1816		VM_WAIT;
1817		VM_OBJECT_LOCK(object);
1818		if ((allocflags & VM_ALLOC_RETRY) == 0)
1819			return (NULL);
1820		goto retrylookup;
1821	} else if (m->valid != 0)
1822		return (m);
1823	if (allocflags & VM_ALLOC_ZERO && (m->flags & PG_ZERO) == 0)
1824		pmap_zero_page(m);
1825	return (m);
1826}
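
/*
 * Editorial sketch with hypothetical "obj" and "idx": with
 * VM_ALLOC_RETRY the routine above never returns NULL, so a caller that
 * wants the page wired and busied (and zero-filled if newly allocated)
 * can simply do:
 *
 *	VM_OBJECT_LOCK(obj);
 *	m = vm_page_grab(obj, idx, VM_ALLOC_NORMAL | VM_ALLOC_ZERO |
 *	    VM_ALLOC_WIRED | VM_ALLOC_RETRY);
 *	VM_OBJECT_UNLOCK(obj);
 */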
1827
1828/*
1829 * Mapping function for valid bits or for dirty bits in
1830 * a page.  May not block.
1831 *
1832 * Inputs are required to range within a page.
1833 */
1834int
1835vm_page_bits(int base, int size)
1836{
1837	int first_bit;
1838	int last_bit;
1839
1840	KASSERT(
1841	    base + size <= PAGE_SIZE,
1842	    ("vm_page_bits: illegal base/size %d/%d", base, size)
1843	);
1844
1845	if (size == 0)		/* handle degenerate case */
1846		return (0);
1847
1848	first_bit = base >> DEV_BSHIFT;
1849	last_bit = (base + size - 1) >> DEV_BSHIFT;
1850
1851	return ((2 << last_bit) - (1 << first_bit));
1852}
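
/*
 * Editorial worked example, assuming DEV_BSIZE == 512 (DEV_BSHIFT == 9):
 * vm_page_bits(512, 1024) covers the second and third 512-byte chunks,
 * so first_bit == 1, last_bit == (1535 >> 9) == 2, and the result is
 * (2 << 2) - (1 << 1) == 8 - 2 == 0x6, i.e. bits 1 and 2 set.
 */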
1853
1854/*
1855 *	vm_page_set_valid:
1856 *
1857 *	Sets portions of a page valid.  The arguments are expected
1858 *	to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive
1859 *	of any partial chunks touched by the range.  The invalid portion of
1860 *	such chunks will be zeroed.
1861 *
1862 *	(base + size) must be less then or equal to PAGE_SIZE.
1863 *	(base + size) must be less than or equal to PAGE_SIZE.
1864void
1865vm_page_set_valid(vm_page_t m, int base, int size)
1866{
1867	int endoff, frag;
1868
1869	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
1870	if (size == 0)	/* handle degenerate case */
1871		return;
1872
1873	/*
1874	 * If the base is not DEV_BSIZE aligned and the valid
1875	 * bit is clear, we have to zero out a portion of the
1876	 * first block.
1877	 */
1878	if ((frag = base & ~(DEV_BSIZE - 1)) != base &&
1879	    (m->valid & (1 << (base >> DEV_BSHIFT))) == 0)
1880		pmap_zero_page_area(m, frag, base - frag);
1881
1882	/*
1883	 * If the ending offset is not DEV_BSIZE aligned and the
1884	 * valid bit is clear, we have to zero out a portion of
1885	 * the last block.
1886	 */
1887	endoff = base + size;
1888	if ((frag = endoff & ~(DEV_BSIZE - 1)) != endoff &&
1889	    (m->valid & (1 << (endoff >> DEV_BSHIFT))) == 0)
1890		pmap_zero_page_area(m, endoff,
1891		    DEV_BSIZE - (endoff & (DEV_BSIZE - 1)));
1892
1893	/*
1894	 * Set valid bits inclusive of any overlap.
1895	 */
1896	m->valid |= vm_page_bits(base, size);
1897}
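
/*
 * Editorial worked example, assuming DEV_BSIZE == 512:
 * vm_page_set_valid(m, 100, 900) touches chunks 0 and 1.  If chunk 0 is
 * not yet valid, bytes 0-99 are zeroed; if chunk 1 is not yet valid,
 * bytes 1000-1023 are zeroed; finally m->valid gains
 * vm_page_bits(100, 900) == 0x3, i.e. bits 0 and 1.
 */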
1898
1899/*
1900 *	vm_page_set_validclean:
1901 *
1902 *	Sets portions of a page valid and clean.  The arguments are expected
1903 *	to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive
1904 *	of any partial chunks touched by the range.  The invalid portion of
1905 *	such chunks will be zeroed.
1906 *
1907 *	This routine may not block.
1908 *
1909 *	(base + size) must be less then or equal to PAGE_SIZE.
1910 *	(base + size) must be less than or equal to PAGE_SIZE.
1911void
1912vm_page_set_validclean(vm_page_t m, int base, int size)
1913{
1914	int pagebits;
1915	int frag;
1916	int endoff;
1917
1918	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1919	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
1920	if (size == 0)	/* handle degenerate case */
1921		return;
1922
1923	/*
1924	 * If the base is not DEV_BSIZE aligned and the valid
1925	 * bit is clear, we have to zero out a portion of the
1926	 * first block.
1927	 */
1928	if ((frag = base & ~(DEV_BSIZE - 1)) != base &&
1929	    (m->valid & (1 << (base >> DEV_BSHIFT))) == 0)
1930		pmap_zero_page_area(m, frag, base - frag);
1931
1932	/*
1933	 * If the ending offset is not DEV_BSIZE aligned and the
1934	 * valid bit is clear, we have to zero out a portion of
1935	 * the last block.
1936	 */
1937	endoff = base + size;
1938	if ((frag = endoff & ~(DEV_BSIZE - 1)) != endoff &&
1939	    (m->valid & (1 << (endoff >> DEV_BSHIFT))) == 0)
1940		pmap_zero_page_area(m, endoff,
1941		    DEV_BSIZE - (endoff & (DEV_BSIZE - 1)));
1942
1943	/*
1944	 * Set valid, clear dirty bits.  If validating the entire
1945	 * page we can safely clear the pmap modify bit.  We also
1946	 * use this opportunity to clear the VPO_NOSYNC flag.  If a process
1947	 * takes a write fault on a MAP_NOSYNC memory area the flag will
1948	 * be set again.
1949	 *
1950	 * We set valid bits inclusive of any overlap, but we can only
1951	 * clear dirty bits for DEV_BSIZE chunks that are fully within
1952	 * the range.
1953	 */
1954	pagebits = vm_page_bits(base, size);
1955	m->valid |= pagebits;
1956#if 0	/* NOT YET */
1957	if ((frag = base & (DEV_BSIZE - 1)) != 0) {
1958		frag = DEV_BSIZE - frag;
1959		base += frag;
1960		size -= frag;
1961		if (size < 0)
1962			size = 0;
1963	}
1964	pagebits = vm_page_bits(base, size & (DEV_BSIZE - 1));
1965#endif
1966	m->dirty &= ~pagebits;
1967	if (base == 0 && size == PAGE_SIZE) {
1968		pmap_clear_modify(m);
1969		m->oflags &= ~VPO_NOSYNC;
1970	}
1971}
1972
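/*
 * Illustrative example: after a full-page read completes, a caller
 * holding the page queues and object locks might do
 *
 *	vm_page_set_validclean(m, 0, PAGE_SIZE);
 *
 * which sets m->valid to VM_PAGE_BITS_ALL, clears every dirty bit, and,
 * because the entire page is covered, also clears the pmap modify bit
 * and the VPO_NOSYNC flag as described above.
 */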
1973void
1974vm_page_clear_dirty(vm_page_t m, int base, int size)
1975{
1976
1977	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1978	m->dirty &= ~vm_page_bits(base, size);
1979}
1980
1981/*
1982 *	vm_page_set_invalid:
1983 *
1984 *	Invalidates DEV_BSIZE'd chunks within a page.  Both the
1985 *	valid and dirty bits for the affected areas are cleared.
1986 *
1987 *	May not block.
1988 */
1989void
1990vm_page_set_invalid(vm_page_t m, int base, int size)
1991{
1992	int bits;
1993
1994	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
1995	bits = vm_page_bits(base, size);
1996	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1997	if (m->valid == VM_PAGE_BITS_ALL && bits != 0)
1998		pmap_remove_all(m);
1999	m->valid &= ~bits;
2000	m->dirty &= ~bits;
2001	m->object->generation++;
2002}
2003
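/*
 * Note: the pmap_remove_all() call above is made only when the page was
 * fully valid, presumably because only fully valid pages may be mapped;
 * once any part of the page becomes invalid, existing mappings must be
 * destroyed.  For example, vm_page_set_invalid(m, 0, DEV_BSIZE) clears
 * just the valid and dirty bits for block 0 and bumps the object's
 * generation count.
 */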
2004/*
2005 * vm_page_zero_invalid()
2006 *
2007 *	The kernel assumes that the invalid portions of a page contain
2008 *	garbage, but such pages can be mapped into memory by user code.
2009 *	When this occurs, we must zero out the non-valid portions of the
2010 *	page so user code sees what it expects.
2011 *
2012 *	Pages are most often semi-valid when the end of a file is mapped
2013 *	into memory and the file's size is not page aligned.
2014 */
2015void
2016vm_page_zero_invalid(vm_page_t m, boolean_t setvalid)
2017{
2018	int b;
2019	int i;
2020
2021	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
2022	/*
2023	 * Scan the valid bits looking for invalid sections that
2024	 * must be zeroed.  Invalid sub-DEV_BSIZE'd areas (where the
2025	 * valid bit may be set) have already been zeroed by
2026	 * vm_page_set_validclean().
2027	 */
2028	for (b = i = 0; i <= PAGE_SIZE / DEV_BSIZE; ++i) {
2029		if (i == (PAGE_SIZE / DEV_BSIZE) ||
2030		    (m->valid & (1 << i))
2031		) {
2032			if (i > b) {
2033				pmap_zero_page_area(m,
2034				    b << DEV_BSHIFT, (i - b) << DEV_BSHIFT);
2035			}
2036			b = i + 1;
2037		}
2038	}
2039
2040	/*
2041	 * setvalid is TRUE when we can safely set the zeroed areas
2042	 * as being valid.  We can do this if there are no cache consistency
2043	 * issues, e.g., it is OK to do so for UFS, but not for NFS.
2044	 */
2045	if (setvalid)
2046		m->valid = VM_PAGE_BITS_ALL;
2047}
2048
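/*
 * Illustrative example, assuming DEV_BSIZE is 512 and PAGE_SIZE is 4096:
 * for the last page of a 2000-byte file, only blocks 0-3 are valid
 * (m->valid == 0x0f), so the loop above issues a single
 * pmap_zero_page_area(m, 2048, 2048) call to clear the trailing half of
 * the page before user code can observe it.
 */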
2049/*
2050 *	vm_page_is_valid:
2051 *
2052 *	Is (partial) page valid?  Note that in the degenerate case
2053 *	where size == 0, this returns FALSE if the page is entirely
2054 *	invalid, and TRUE otherwise.
2055 *
2056 *	May not block.
2057 */
2058int
2059vm_page_is_valid(vm_page_t m, int base, int size)
2060{
2061	int bits = vm_page_bits(base, size);
2062
2063	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
2064	if (m->valid != 0 && (m->valid & bits) == bits)
2065		return (1);
2066	else
2067		return (0);
2068}
2069
2070/*
2071 * Update dirty bits from pmap/mmu.  May not block.
2072 */
2073void
2074vm_page_test_dirty(vm_page_t m)
2075{
2076	if ((m->dirty != VM_PAGE_BITS_ALL) && pmap_is_modified(m)) {
2077		vm_page_dirty(m);
2078	}
2079}
2080
2081int so_zerocp_fullpage = 0;
2082
2083/*
2084 *	Replace the given page with a copy.  The copied page assumes
2085 *	the portion of the given page's "wire_count" that is not the
2086 *	responsibility of this copy-on-write mechanism.
2087 *
2088 *	The object containing the given page must have a non-zero
2089 *	paging-in-progress count and be locked.
2090 */
2091void
2092vm_page_cowfault(vm_page_t m)
2093{
2094	vm_page_t mnew;
2095	vm_object_t object;
2096	vm_pindex_t pindex;
2097
2098	object = m->object;
2099	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
2100	KASSERT(object->paging_in_progress != 0,
2101	    ("vm_page_cowfault: object %p's paging-in-progress count is zero.",
2102	    object));
2103	pindex = m->pindex;
2104
2105 retry_alloc:
2106	pmap_remove_all(m);
2107	vm_page_remove(m);
2108	mnew = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY);
2109	if (mnew == NULL) {
2110		vm_page_insert(m, object, pindex);
2111		vm_page_unlock_queues();
2112		VM_OBJECT_UNLOCK(object);
2113		VM_WAIT;
2114		VM_OBJECT_LOCK(object);
2115		if (m == vm_page_lookup(object, pindex)) {
2116			vm_page_lock_queues();
2117			goto retry_alloc;
2118		} else {
2119			/*
2120			 * Page disappeared during the wait.
2121			 */
2122			vm_page_lock_queues();
2123			return;
2124		}
2125	}
2126
2127	if (m->cow == 0) {
2128		/*
2129		 * Check to see if we raced with an xmit completion while
2130		 * waiting to allocate a page.  If so, put things back
2131		 * the way they were.
2132		 */
2133		vm_page_free(mnew);
2134		vm_page_insert(m, object, pindex);
2135	} else { /* clear COW & copy page */
2136		if (!so_zerocp_fullpage)
2137			pmap_copy_page(m, mnew);
2138		mnew->valid = VM_PAGE_BITS_ALL;
2139		vm_page_dirty(mnew);
2140		mnew->wire_count = m->wire_count - m->cow;
2141		m->wire_count = m->cow;
2142	}
2143}
2144
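/*
 * Illustrative example of the wire count split above: if the faulting
 * page had, say, m->wire_count == 3 with m->cow == 2 (two outstanding
 * zero-copy sends), the copy ends up with mnew->wire_count == 1 (the
 * wirings not owed to COW) while the original keeps m->wire_count == 2,
 * matching the references still held by the copy-on-write mechanism.
 */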
2145void
2146vm_page_cowclear(vm_page_t m)
2147{
2148
2149	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2150	if (m->cow) {
2151		m->cow--;
2152		/*
2153		 * Let vm_fault add back write permission lazily.
2154		 */
2155	}
2156	/*
2157	 * sf_buf_free() will free the page, so we needn't do it here.
2158	 */
2159}
2160
2161int
2162vm_page_cowsetup(vm_page_t m)
2163{
2164
2165	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2166	if (m->cow == USHRT_MAX - 1)
2167		return (EBUSY);
2168	m->cow++;
2169	pmap_remove_write(m);
2170	return (0);
2171}
2172
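/*
 * Rough sketch of how the three routines above are intended to cooperate
 * for zero-copy transmit; consult the socket zero-copy code for the
 * authoritative sequence:
 *
 *	vm_page_cowsetup(m);		-- write-protect m and bump m->cow
 *	...page handed to the network stack for transmit...
 *	(If the process writes the page before the send completes, vm_fault()
 *	ends up in vm_page_cowfault(), which gives the process a private
 *	copy and leaves the original page to the transmit path.)
 *	vm_page_cowclear(m);		-- on transmit completion, drop m->cow
 */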
2173#include "opt_ddb.h"
2174#ifdef DDB
2175#include <sys/kernel.h>
2176
2177#include <ddb/ddb.h>
2178
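/*
 * The commands defined below are run from the in-kernel debugger prompt
 * as "show page" and "show pageq", e.g.:
 *
 *	db> show page
 *	cnt.v_free_count: ...
 */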
2179DB_SHOW_COMMAND(page, vm_page_print_page_info)
2180{
2181	db_printf("cnt.v_free_count: %d\n", cnt.v_free_count);
2182	db_printf("cnt.v_cache_count: %d\n", cnt.v_cache_count);
2183	db_printf("cnt.v_inactive_count: %d\n", cnt.v_inactive_count);
2184	db_printf("cnt.v_active_count: %d\n", cnt.v_active_count);
2185	db_printf("cnt.v_wire_count: %d\n", cnt.v_wire_count);
2186	db_printf("cnt.v_free_reserved: %d\n", cnt.v_free_reserved);
2187	db_printf("cnt.v_free_min: %d\n", cnt.v_free_min);
2188	db_printf("cnt.v_free_target: %d\n", cnt.v_free_target);
2189	db_printf("cnt.v_cache_min: %d\n", cnt.v_cache_min);
2190	db_printf("cnt.v_inactive_target: %d\n", cnt.v_inactive_target);
2191}
2192
2193DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info)
2194{
2195
2196	db_printf("PQ_FREE:");
2197	db_printf(" %d", cnt.v_free_count);
2198	db_printf("\n");
2199
2200	db_printf("PQ_CACHE:");
2201	db_printf(" %d", cnt.v_cache_count);
2202	db_printf("\n");
2203
2204	db_printf("PQ_ACTIVE: %d, PQ_INACTIVE: %d\n",
2205		*vm_page_queues[PQ_ACTIVE].cnt,
2206		*vm_page_queues[PQ_INACTIVE].cnt);
2207}
2208#endif /* DDB */
2209