vm_page.c revision 79248
1/*
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 *
5 * This code is derived from software contributed to Berkeley by
6 * The Mach Operating System project at Carnegie-Mellon University.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 * 3. All advertising materials mentioning features or use of this software
17 *    must display the following acknowledgement:
18 *	This product includes software developed by the University of
19 *	California, Berkeley and its contributors.
20 * 4. Neither the name of the University nor the names of its contributors
21 *    may be used to endorse or promote products derived from this software
22 *    without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 *
36 *	from: @(#)vm_page.c	7.4 (Berkeley) 5/7/91
37 * $FreeBSD: head/sys/vm/vm_page.c 79248 2001-07-04 20:15:18Z dillon $
38 */
39
40/*
41 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
42 * All rights reserved.
43 *
44 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
45 *
46 * Permission to use, copy, modify and distribute this software and
47 * its documentation is hereby granted, provided that both the copyright
48 * notice and this permission notice appear in all copies of the
49 * software, derivative works or modified versions, and any portions
50 * thereof, and that both notices appear in supporting documentation.
51 *
52 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
53 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
54 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
55 *
56 * Carnegie Mellon requests users of this software to return to
57 *
58 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
59 *  School of Computer Science
60 *  Carnegie Mellon University
61 *  Pittsburgh PA 15213-3890
62 *
63 * any improvements or extensions that they make and grant Carnegie the
64 * rights to redistribute these changes.
65 */
66
67/*
68 *	Resident memory management module.
69 */
70
71#include <sys/param.h>
72#include <sys/systm.h>
73#include <sys/lock.h>
74#include <sys/malloc.h>
75#include <sys/mutex.h>
76#include <sys/proc.h>
77#include <sys/vmmeter.h>
78#include <sys/vnode.h>
79
80#include <vm/vm.h>
81#include <vm/vm_param.h>
82#include <vm/vm_kern.h>
83#include <vm/vm_object.h>
84#include <vm/vm_page.h>
85#include <vm/vm_pageout.h>
86#include <vm/vm_pager.h>
87#include <vm/vm_extern.h>
88
89static void	vm_page_queue_init __P((void));
90static vm_page_t vm_page_select_cache __P((vm_object_t, vm_pindex_t));
91
92/*
93 *	Associated with each page of user-allocatable memory is a
94 *	page structure.
95 */
96
97static struct vm_page **vm_page_buckets; /* Array of buckets */
98static int vm_page_bucket_count;	/* How big is array? */
99static int vm_page_hash_mask;		/* Mask for hash function */
100static volatile int vm_page_bucket_generation;
101
102struct vpgqueues vm_page_queues[PQ_COUNT];
103
104static void
105vm_page_queue_init(void)
106{
107	int i;
108
109	for (i = 0; i < PQ_L2_SIZE; i++) {
110		vm_page_queues[PQ_FREE+i].cnt = &cnt.v_free_count;
111	}
112	for (i = 0; i < PQ_L2_SIZE; i++) {
113		vm_page_queues[PQ_CACHE+i].cnt = &cnt.v_cache_count;
114	}
115	vm_page_queues[PQ_INACTIVE].cnt = &cnt.v_inactive_count;
116	vm_page_queues[PQ_ACTIVE].cnt = &cnt.v_active_count;
117
118	for (i = 0; i < PQ_COUNT; i++) {
119		TAILQ_INIT(&vm_page_queues[i].pl);
120	}
121}
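/*
 *	Illustrative note (a sketch, not part of the original source):
 *	vm_page_queues[] is indexed as PQ_FREE + color and PQ_CACHE + color
 *	for the per-color free and cache queues, plus PQ_INACTIVE and
 *	PQ_ACTIVE.  A page records its color in m->pc, so the idiom used
 *	throughout this file to test queue membership is:
 *
 *		if ((m->queue - m->pc) == PQ_CACHE)
 *			... the page sits on one of the cache queues ...
 */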
122
123vm_page_t vm_page_array = 0;
124int vm_page_array_size = 0;
125long first_page = 0;
126int vm_page_zero_count = 0;
127
128static vm_page_t _vm_page_list_find(int basequeue, int index);
129
130/*
131 *	vm_set_page_size:
132 *
133 *	Sets the page size, perhaps based upon the memory
134 *	size.  Must be called before any use of page-size
135 *	dependent functions.
136 */
137void
138vm_set_page_size(void)
139{
140	if (cnt.v_page_size == 0)
141		cnt.v_page_size = PAGE_SIZE;
142	if (((cnt.v_page_size - 1) & cnt.v_page_size) != 0)
143		panic("vm_set_page_size: page size not a power of two");
144}
145
146/*
147 *	vm_add_new_page:
148 *
149 *	Add a new page to the freelist for use by the system.
150 *	Must be called at splhigh().
151 */
152vm_page_t
153vm_add_new_page(vm_offset_t pa)
154{
155	vm_page_t m;
156
157	GIANT_REQUIRED;
158
159	++cnt.v_page_count;
160	++cnt.v_free_count;
161	m = PHYS_TO_VM_PAGE(pa);
162	m->phys_addr = pa;
163	m->flags = 0;
164	m->pc = (pa >> PAGE_SHIFT) & PQ_L2_MASK;
165	m->queue = m->pc + PQ_FREE;
166	TAILQ_INSERT_TAIL(&vm_page_queues[m->queue].pl, m, pageq);
167	vm_page_queues[m->queue].lcnt++;
168	return (m);
169}
170
171/*
172 *	vm_page_startup:
173 *
174 *	Initializes the resident memory module.
175 *
176 *	Allocates memory for the page cells, and
177 *	for the object/offset-to-page hash table headers.
178 *	Each page cell is initialized and placed on the free list.
179 */
180
181vm_offset_t
182vm_page_startup(vm_offset_t starta, vm_offset_t enda, vm_offset_t vaddr)
183{
184	vm_offset_t mapped;
185	struct vm_page **bucket;
186	vm_size_t npages, page_range;
187	vm_offset_t new_end;
188	int i;
189	vm_offset_t pa;
190	int nblocks;
191	vm_offset_t last_pa;
192
193	/* the biggest memory array is the second group of pages */
194	vm_offset_t end;
195	vm_offset_t biggestone, biggestsize;
196
197	vm_offset_t total;
198
199	total = 0;
200	biggestsize = 0;
201	biggestone = 0;
202	nblocks = 0;
203	vaddr = round_page(vaddr);
204
205	for (i = 0; phys_avail[i + 1]; i += 2) {
206		phys_avail[i] = round_page(phys_avail[i]);
207		phys_avail[i + 1] = trunc_page(phys_avail[i + 1]);
208	}
209
210	for (i = 0; phys_avail[i + 1]; i += 2) {
211		int size = phys_avail[i + 1] - phys_avail[i];
212
213		if (size > biggestsize) {
214			biggestone = i;
215			biggestsize = size;
216		}
217		++nblocks;
218		total += size;
219	}
220
221	end = phys_avail[biggestone+1];
222
223	/*
224	 * Initialize the queue headers for the free queue, the active queue
225	 * and the inactive queue.
226	 */
227
228	vm_page_queue_init();
229
230	/*
231	 * Allocate (and initialize) the hash table buckets.
232	 *
233	 * The number of buckets MUST BE a power of 2, and the actual value is
234	 * the next power of 2 greater than the number of physical pages in
235	 * the system.
236	 *
237	 * We make the hash table approximately 2x the number of pages to
238	 * reduce the chain length.  With singly-linked buckets this takes
239	 * about the same space as the 1x TAILQ-based hash table we used
240	 * previously, but the chains will be shorter.
241	 *
242	 * Note: This computation can be tweaked if desired.
243	 */
244	if (vm_page_bucket_count == 0) {
245		vm_page_bucket_count = 1;
246		while (vm_page_bucket_count < atop(total))
247			vm_page_bucket_count <<= 1;
248	}
249	vm_page_bucket_count <<= 1;
250	vm_page_hash_mask = vm_page_bucket_count - 1;
251
252	/*
253	 * Allocate the hash buckets from the end of the largest physical chunk.
254	 */
255	new_end = end - vm_page_bucket_count * sizeof(struct vm_page *);
256	new_end = trunc_page(new_end);
257	mapped = pmap_map(&vaddr, new_end, end,
258	    VM_PROT_READ | VM_PROT_WRITE);
259	bzero((caddr_t) mapped, end - new_end);
260
261	vm_page_buckets = (struct vm_page **)mapped;
262	bucket = vm_page_buckets;
263	for (i = 0; i < vm_page_bucket_count; i++) {
264		*bucket = NULL;
265		bucket++;
266	}
267
268	/*
269	 * Compute the number of pages of memory that will be available for
270	 * use (taking into account the overhead of a page structure per
271	 * page).
272	 */
273
274	first_page = phys_avail[0] / PAGE_SIZE;
275
276	page_range = phys_avail[(nblocks - 1) * 2 + 1] / PAGE_SIZE - first_page;
277	npages = (total - (page_range * sizeof(struct vm_page)) -
278	    (end - new_end)) / PAGE_SIZE;
279
280	end = new_end;
281
282	/*
283	 * Initialize the mem entry structures now, and put them in the free
284	 * queue.
285	 */
286	new_end = trunc_page(end - page_range * sizeof(struct vm_page));
287	mapped = pmap_map(&vaddr, new_end, end,
288	    VM_PROT_READ | VM_PROT_WRITE);
289	vm_page_array = (vm_page_t) mapped;
290
291	/*
292	 * Clear all of the page structures
293	 */
294	bzero((caddr_t) vm_page_array, page_range * sizeof(struct vm_page));
295	vm_page_array_size = page_range;
296
297	/*
298	 * Construct the free queue(s) in descending order (by physical
299	 * address) so that the first 16MB of physical memory is allocated
300	 * last rather than first.  On large-memory machines, this avoids
301	 * the exhaustion of low physical memory before isa_dmainit has run.
302	 */
303	cnt.v_page_count = 0;
304	cnt.v_free_count = 0;
305	for (i = 0; phys_avail[i + 1] && npages > 0; i += 2) {
306		pa = phys_avail[i];
307		if (i == biggestone)
308			last_pa = new_end;
309		else
310			last_pa = phys_avail[i + 1];
311		while (pa < last_pa && npages-- > 0) {
312			vm_add_new_page(pa);
313			pa += PAGE_SIZE;
314		}
315	}
316	return (vaddr);
317}
318
319/*
320 *	vm_page_hash:
321 *
322 *	Distributes the object/offset key pair among hash buckets.
323 *
324 *	NOTE:  This macro depends on vm_page_bucket_count being a power of 2.
325 *	This routine may not block.
326 *
327 *	We try to randomize the hash based on the object to spread the pages
328 *	out in the hash table without it costing us too much.
329 */
330static __inline int
331vm_page_hash(vm_object_t object, vm_pindex_t pindex)
332{
333	int i = ((uintptr_t)object + pindex) ^ object->hash_rand;
334
335	return(i & vm_page_hash_mask);
336}
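/*
 *	Worked example (illustrative, assuming vm_page_bucket_count == 1024):
 *	vm_page_hash_mask is then 1023 (0x3ff), so the returned bucket
 *	index is simply the low ten bits of the mixed key:
 *
 *		i = ((uintptr_t)object + pindex) ^ object->hash_rand;
 *		bucket = &vm_page_buckets[i & 0x3ff];
 */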
337
338void
339vm_page_flag_set(vm_page_t m, unsigned short bits)
340{
341	GIANT_REQUIRED;
342	atomic_set_short(&(m)->flags, bits);
343	/* m->flags |= bits; */
344}
345
346void
347vm_page_flag_clear(vm_page_t m, unsigned short bits)
348{
349	GIANT_REQUIRED;
350	atomic_clear_short(&(m)->flags, bits);
351	/* m->flags &= ~bits; */
352}
353
354void
355vm_page_busy(vm_page_t m)
356{
357	KASSERT((m->flags & PG_BUSY) == 0,
358	    ("vm_page_busy: page already busy!!!"));
359	vm_page_flag_set(m, PG_BUSY);
360}
361
362/*
363 *      vm_page_flash:
364 *
365 *      Wake up anyone waiting for the page.
366 */
367
368void
369vm_page_flash(vm_page_t m)
370{
371	if (m->flags & PG_WANTED) {
372		vm_page_flag_clear(m, PG_WANTED);
373		wakeup(m);
374	}
375}
376
377/*
378 *      vm_page_wakeup:
379 *
380 *      Clear the PG_BUSY flag and wake up anyone waiting for the
381 *      page.
382 *
383 */
384
385void
386vm_page_wakeup(vm_page_t m)
387{
388	KASSERT(m->flags & PG_BUSY, ("vm_page_wakeup: page not busy!!!"));
389	vm_page_flag_clear(m, PG_BUSY);
390	vm_page_flash(m);
391}
392
393/*
394 *      vm_page_io_start/vm_page_io_finish:
395 *      Adjust m->busy to account for I/O; waiters are woken when it drops to 0.
396 */
397
398void
399vm_page_io_start(vm_page_t m)
400{
401	GIANT_REQUIRED;
402	atomic_add_char(&(m)->busy, 1);
403}
404
405void
406vm_page_io_finish(vm_page_t m)
407{
408	GIANT_REQUIRED;
409	atomic_subtract_char(&(m)->busy, 1);
410	if (m->busy == 0)
411		vm_page_flash(m);
412}
413
414/*
415 * Keep the page from being freed by the page daemon.  This has
416 * much the same effect as wiring, but with much lower overhead,
417 * and should be used only for *very* temporary
418 * holds ("wiring").
419 */
420void
421vm_page_hold(vm_page_t mem)
422{
423        GIANT_REQUIRED;
424        mem->hold_count++;
425}
426
427void
428vm_page_unhold(vm_page_t mem)
429{
430	GIANT_REQUIRED;
431	--mem->hold_count;
432	KASSERT(mem->hold_count >= 0, ("vm_page_unhold: hold count < 0!!!"));
433}
434
435/*
436 *	vm_page_protect:
437 *
438 *	Reduce the protection of a page.  This routine never raises the
439 *	protection and therefore can be safely called if the page is already
440 *	at VM_PROT_NONE (effectively it will be a NOP).
441 */
442
443void
444vm_page_protect(vm_page_t mem, int prot)
445{
446	if (prot == VM_PROT_NONE) {
447		if (mem->flags & (PG_WRITEABLE|PG_MAPPED)) {
448			pmap_page_protect(mem, VM_PROT_NONE);
449			vm_page_flag_clear(mem, PG_WRITEABLE|PG_MAPPED);
450		}
451	} else if ((prot == VM_PROT_READ) && (mem->flags & PG_WRITEABLE)) {
452		pmap_page_protect(mem, VM_PROT_READ);
453		vm_page_flag_clear(mem, PG_WRITEABLE);
454	}
455}
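/*
 *	Illustrative usage sketch (hypothetical caller, not from the
 *	original file): mappings are normally torn down before a page is
 *	freed or cached, as done elsewhere in this file:
 *
 *		vm_page_busy(m);
 *		vm_page_protect(m, VM_PROT_NONE);
 *		vm_page_free(m);
 */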
456/*
457 *	vm_page_zero_fill:
458 *
459 *	Zero-fill the specified page.
460 *	Written as a standard pagein routine, to
461 *	be used by the zero-fill object.
462 */
463boolean_t
464vm_page_zero_fill(vm_page_t m)
465{
466	pmap_zero_page(VM_PAGE_TO_PHYS(m));
467	return (TRUE);
468}
469
470/*
471 *	vm_page_copy:
472 *
473 *	Copy one page to another
474 */
475void
476vm_page_copy(vm_page_t src_m, vm_page_t dest_m)
477{
478	pmap_copy_page(VM_PAGE_TO_PHYS(src_m), VM_PAGE_TO_PHYS(dest_m));
479	dest_m->valid = VM_PAGE_BITS_ALL;
480}
481
482/*
483 *	vm_page_free:
484 *
485 *	Free a page
486 *
487 *	The clearing of PG_ZERO is a temporary safety until the code can be
488 *	reviewed to determine that PG_ZERO is being properly cleared on
489 *	write faults or maps.  PG_ZERO was previously cleared in
490 *	vm_page_alloc().
491 */
492void
493vm_page_free(vm_page_t m)
494{
495	vm_page_flag_clear(m, PG_ZERO);
496	vm_page_free_toq(m);
497}
498
499/*
500 *	vm_page_free_zero:
501 *
502 *	Free a page to the zeroed-pages queue.
503 */
504void
505vm_page_free_zero(vm_page_t m)
506{
507	vm_page_flag_set(m, PG_ZERO);
508	vm_page_free_toq(m);
509}
510
511/*
512 *	vm_page_sleep_busy:
513 *
514 *	Wait until page is no longer PG_BUSY or (if also_m_busy is TRUE)
515 *	m->busy is zero.  Returns TRUE if it had to sleep ( including if
516 *	it almost had to sleep and made temporary spl*() mods), FALSE
517 *	otherwise.
518 *
519 *	This routine assumes that interrupts can only remove the busy
520 *	status from a page, not set the busy status or change it from
521 * PG_BUSY to m->busy or vice versa (which would create a timing
522 *	window).
523 */
524
525int
526vm_page_sleep_busy(vm_page_t m, int also_m_busy, const char *msg)
527{
528	GIANT_REQUIRED;
529	if ((m->flags & PG_BUSY) || (also_m_busy && m->busy))  {
530		int s = splvm();
531		if ((m->flags & PG_BUSY) || (also_m_busy && m->busy)) {
532			/*
533			 * Page is busy. Wait and retry.
534			 */
535			vm_page_flag_set(m, PG_WANTED | PG_REFERENCED);
536			tsleep(m, PVM, msg, 0);
537		}
538		splx(s);
539		return(TRUE);
540		/* not reached */
541	}
542	return(FALSE);
543}
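/*
 *	Illustrative sketch of the PG_BUSY protocol (hypothetical caller,
 *	not from the original file): sleep until the page is unbusied,
 *	then busy it yourself, operate on it, and wake any waiters:
 *
 *		while (vm_page_sleep_busy(m, TRUE, "example"))
 *			;
 *		vm_page_busy(m);
 *		... operate on the page ...
 *		vm_page_wakeup(m);
 */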
544/*
545 *	vm_page_dirty:
546 *
547 *	Mark the entire page dirty.
548 */
549
550void
551vm_page_dirty(vm_page_t m)
552{
553	KASSERT(m->queue - m->pc != PQ_CACHE,
554	    ("vm_page_dirty: page in cache!"));
555	m->dirty = VM_PAGE_BITS_ALL;
556}
557
558/*
559 *	vm_page_undirty:
560 *
561 *	Set the page to not be dirty.  Note: does not clear pmap modify bits.
562 */
563
564void
565vm_page_undirty(vm_page_t m)
566{
567	m->dirty = 0;
568}
569
570/*
571 *	vm_page_insert:		[ internal use only ]
572 *
573 *	Inserts the given mem entry into the object and object list.
574 *
575 *	The pagetables are not updated but will presumably fault the page
576 *	in if necessary, or if a kernel page the caller will at some point
577 *	enter the page into the kernel's pmap.  We are not allowed to block
578 *	here so we *can't* do this anyway.
579 *
580 *	The object and page must be locked, and the caller must be at splhigh().
581 *	This routine may not block.
582 */
583
584void
585vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
586{
587	struct vm_page **bucket;
588
589	GIANT_REQUIRED;
590
591	if (m->object != NULL)
592		panic("vm_page_insert: already inserted");
593
594	/*
595	 * Record the object/offset pair in this page
596	 */
597
598	m->object = object;
599	m->pindex = pindex;
600
601	/*
602	 * Insert it into the object/offset hash table
603	 */
604
605	bucket = &vm_page_buckets[vm_page_hash(object, pindex)];
606	m->hnext = *bucket;
607	*bucket = m;
608	vm_page_bucket_generation++;
609
610	/*
611	 * Now link into the object's list of backed pages.
612	 */
613
614	TAILQ_INSERT_TAIL(&object->memq, m, listq);
615	object->generation++;
616
617	/*
618	 * Show that the object has one more resident page.
619	 */
620
621	object->resident_page_count++;
622
623	/*
624	 * Since we are inserting a new and possibly dirty page,
625	 * update the object's OBJ_WRITEABLE and OBJ_MIGHTBEDIRTY flags.
626	 */
627	if (m->flags & PG_WRITEABLE)
628	    vm_object_set_flag(object, OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
629}
630
631/*
632 *	vm_page_remove:
633 *				NOTE: used by device pager as well -wfj
634 *
635 *	Removes the given mem entry from the object/offset-page
636 *	table and the object page list, but do not invalidate/terminate
637 *	the backing store.
638 *
639 *	The object and page must be locked, and the caller must be at splhigh().
640 *	The underlying pmap entry (if any) is NOT removed here.
641 *	This routine may not block.
642 */
643
644void
645vm_page_remove(vm_page_t m)
646{
647	vm_object_t object;
648
649	GIANT_REQUIRED;
650
651	if (m->object == NULL)
652		return;
653
654	if ((m->flags & PG_BUSY) == 0) {
655		panic("vm_page_remove: page not busy");
656	}
657
658	/*
659	 * Basically destroy the page.
660	 */
661
662	vm_page_wakeup(m);
663
664	object = m->object;
665
666	/*
667	 * Remove from the object/offset hash table.  The page
668	 * must be on its hash chain; we will panic if it isn't.
669	 *
670	 * Note: we must NULL-out m->hnext to prevent loops in detached
671	 * buffers with vm_page_lookup().
672	 */
673
674	{
675		struct vm_page **bucket;
676
677		bucket = &vm_page_buckets[vm_page_hash(m->object, m->pindex)];
678		while (*bucket != m) {
679			if (*bucket == NULL)
680				panic("vm_page_remove(): page not found in hash");
681			bucket = &(*bucket)->hnext;
682		}
683		*bucket = m->hnext;
684		m->hnext = NULL;
685		vm_page_bucket_generation++;
686	}
687
688	/*
689	 * Now remove from the object's list of backed pages.
690	 */
691
692	TAILQ_REMOVE(&object->memq, m, listq);
693
694	/*
695	 * And show that the object has one fewer resident page.
696	 */
697
698	object->resident_page_count--;
699	object->generation++;
700
701	m->object = NULL;
702}
703
704/*
705 *	vm_page_lookup:
706 *
707 *	Returns the page associated with the object/offset
708 *	pair specified; if none is found, NULL is returned.
709 *
710 *	NOTE: the code below does not lock.  It will operate properly if
711 *	an interrupt makes a change, but the generation algorithm will not
712 *	operate properly in an SMP environment where both CPUs are able to run
713 *	kernel code simultaneously.
714 *
715 *	The object must be locked.  No side effects.
716 *	This routine may not block.
717 *	This is a critical path routine
718 */
719
720vm_page_t
721vm_page_lookup(vm_object_t object, vm_pindex_t pindex)
722{
723	vm_page_t m;
724	struct vm_page **bucket;
725	int generation;
726
727	/*
728	 * Search the hash table for this object/offset pair
729	 */
730
731retry:
732	generation = vm_page_bucket_generation;
733	bucket = &vm_page_buckets[vm_page_hash(object, pindex)];
734	for (m = *bucket; m != NULL; m = m->hnext) {
735		if ((m->object == object) && (m->pindex == pindex)) {
736			if (vm_page_bucket_generation != generation)
737				goto retry;
738			return (m);
739		}
740	}
741	if (vm_page_bucket_generation != generation)
742		goto retry;
743	return (NULL);
744}
745
746/*
747 *	vm_page_rename:
748 *
749 *	Move the given memory entry from its
750 *	current object to the specified target object/offset.
751 *
752 *	The object must be locked.
753 *	This routine may not block.
754 *
755 *	Note: this routine will raise itself to splvm(), the caller need not.
756 *
757 *	Note: swap associated with the page must be invalidated by the move.  We
758 *	      have to do this for several reasons:  (1) we aren't freeing the
759 *	      page, (2) we are dirtying the page, (3) the VM system is probably
760 *	      moving the page from object A to B, and will then later move
761 *	      the backing store from A to B and we can't have a conflict.
762 *
763 *	Note: we *always* dirty the page.  It is necessary both for the
764 *	      fact that we moved it, and because we may be invalidating
765 *	      swap.  If the page is in the cache, we have to deactivate it
766 *	      or vm_page_dirty() will panic.  Dirty pages are not allowed
767 *	      on the cache.
768 */
769
770void
771vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex)
772{
773	int s;
774
775	s = splvm();
776	vm_page_remove(m);
777	vm_page_insert(m, new_object, new_pindex);
778	if (m->queue - m->pc == PQ_CACHE)
779		vm_page_deactivate(m);
780	vm_page_dirty(m);
781	splx(s);
782}
783
784/*
785 * vm_page_unqueue_nowakeup:
786 *
787 * 	vm_page_unqueue() without any wakeup
788 *
789 *	This routine must be called at splhigh().
790 *	This routine may not block.
791 */
792
793void
794vm_page_unqueue_nowakeup(vm_page_t m)
795{
796	int queue = m->queue;
797	struct vpgqueues *pq;
798	if (queue != PQ_NONE) {
799		pq = &vm_page_queues[queue];
800		m->queue = PQ_NONE;
801		TAILQ_REMOVE(&pq->pl, m, pageq);
802		(*pq->cnt)--;
803		pq->lcnt--;
804	}
805}
806
807/*
808 * vm_page_unqueue:
809 *
810 *	Remove a page from its queue.
811 *
812 *	This routine must be called at splhigh().
813 *	This routine may not block.
814 */
815
816void
817vm_page_unqueue(vm_page_t m)
818{
819	int queue = m->queue;
820	struct vpgqueues *pq;
821
822	GIANT_REQUIRED;
823	if (queue != PQ_NONE) {
824		m->queue = PQ_NONE;
825		pq = &vm_page_queues[queue];
826		TAILQ_REMOVE(&pq->pl, m, pageq);
827		(*pq->cnt)--;
828		pq->lcnt--;
829		if ((queue - m->pc) == PQ_CACHE) {
830			if (vm_paging_needed())
831				pagedaemon_wakeup();
832		}
833	}
834}
835
836vm_page_t
837vm_page_list_find(int basequeue, int index, boolean_t prefer_zero)
838{
839        vm_page_t m;
840
841	GIANT_REQUIRED;
842
843#if PQ_L2_SIZE > 1
844        if (prefer_zero) {
845                m = TAILQ_LAST(&vm_page_queues[basequeue+index].pl, pglist);
846        } else {
847                m = TAILQ_FIRST(&vm_page_queues[basequeue+index].pl);
848        }
849        if (m == NULL) {
850                m = _vm_page_list_find(basequeue, index);
851	}
852#else
853        if (prefer_zero) {
854                m = TAILQ_LAST(&vm_page_queues[basequeue].pl, pglist);
855        } else {
856                m = TAILQ_FIRST(&vm_page_queues[basequeue].pl);
857        }
858#endif
859        return(m);
860}
861
862
863#if PQ_L2_SIZE > 1
864
865/*
866 *	vm_page_list_find:
867 *
868 *	Find a page on the specified queue with color optimization.
869 *
870 *	The page coloring optimization attempts to locate a page
871 *	that does not overload other nearby pages in the object in
872 *	the cpu's L1 or L2 caches.  We need this optimization because
873 *	cpu caches tend to be physical caches, while object spaces tend
874 *	to be virtual.
875 *
876 *	This routine must be called at splvm().
877 *	This routine may not block.
878 *
879 *	This routine may only be called from the vm_page_list_find() macro
880 *	in vm_page.h
881 */
882static vm_page_t
883_vm_page_list_find(int basequeue, int index)
884{
885	int i;
886	vm_page_t m = NULL;
887	struct vpgqueues *pq;
888
889	GIANT_REQUIRED;
890	pq = &vm_page_queues[basequeue];
891
892	/*
893	 * Note that for the first loop, index+i and index-i wind up at the
894	 * same place.  Even though this is not totally optimal, we've already
895	 * blown it by missing the cache case so we do not care.
896	 */
897
898	for(i = PQ_L2_SIZE / 2; i > 0; --i) {
899		if ((m = TAILQ_FIRST(&pq[(index + i) & PQ_L2_MASK].pl)) != NULL)
900			break;
901
902		if ((m = TAILQ_FIRST(&pq[(index - i) & PQ_L2_MASK].pl)) != NULL)
903			break;
904	}
905	return(m);
906}
907
908#endif
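/*
 *	Illustrative sketch (not from the original file): callers normally
 *	derive the color index from the object color and page index before
 *	searching a per-color queue, as vm_page_select_free() does below:
 *
 *		index = (pindex + object->pg_color) & PQ_L2_MASK;
 *		m = vm_page_list_find(PQ_FREE, index, FALSE);
 */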
909
910/*
911 *	vm_page_select_cache:
912 *
913 *	Find a page on the cache queue with color optimization.  Pages
914 *	that are found but not usable (busy, held or wired) are
915 *	deactivated.  This keeps us from using potentially busy cached pages.
916 *
917 *	This routine must be called at splvm().
918 *	This routine may not block.
919 */
920vm_page_t
921vm_page_select_cache(vm_object_t object, vm_pindex_t pindex)
922{
923	vm_page_t m;
924
925	GIANT_REQUIRED;
926	while (TRUE) {
927		m = vm_page_list_find(
928		    PQ_CACHE,
929		    (pindex + object->pg_color) & PQ_L2_MASK,
930		    FALSE
931		);
932		if (m && ((m->flags & (PG_BUSY|PG_UNMANAGED)) || m->busy ||
933			       m->hold_count || m->wire_count)) {
934			vm_page_deactivate(m);
935			continue;
936		}
937		return m;
938	}
939}
940
941/*
942 *	vm_page_select_free:
943 *
944 *	Find a free or zero page, with specified preference.
945 *
946 *	This routine must be called at splvm().
947 *	This routine may not block.
948 */
949
950static __inline vm_page_t
951vm_page_select_free(vm_object_t object, vm_pindex_t pindex, boolean_t prefer_zero)
952{
953	vm_page_t m;
954
955	m = vm_page_list_find(
956		PQ_FREE,
957		(pindex + object->pg_color) & PQ_L2_MASK,
958		prefer_zero
959	);
960	return(m);
961}
962
963/*
964 *	vm_page_alloc:
965 *
966 *	Allocate and return a memory cell associated
967 *	with this VM object/offset pair.
968 *
969 *	page_req classes:
970 *	VM_ALLOC_NORMAL		normal process request
971 *	VM_ALLOC_SYSTEM		system *really* needs a page
972 *	VM_ALLOC_INTERRUPT	interrupt time request
973 *	VM_ALLOC_ZERO		zero page
974 *
975 *	This routine may not block.
976 *
977 *	Additional special handling is required when called from an
978 *	interrupt (VM_ALLOC_INTERRUPT).  We are not allowed to mess with
979 *	the page cache in this case.
980 */
981
982vm_page_t
983vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int page_req)
984{
985	vm_page_t m = NULL;
986	int s;
987
988	GIANT_REQUIRED;
989
990	KASSERT(!vm_page_lookup(object, pindex),
991		("vm_page_alloc: page already allocated"));
992
993	/*
994	 * The pager is allowed to eat deeper into the free page list.
995	 */
996
997	if ((curproc == pageproc) && (page_req != VM_ALLOC_INTERRUPT)) {
998		page_req = VM_ALLOC_SYSTEM;
999	}
1000
1001	s = splvm();
1002
1003loop:
1004	if (cnt.v_free_count > cnt.v_free_reserved) {
1005		/*
1006		 * Allocate from the free queue if there are plenty of pages
1007		 * in it.
1008		 */
1009		if (page_req == VM_ALLOC_ZERO)
1010			m = vm_page_select_free(object, pindex, TRUE);
1011		else
1012			m = vm_page_select_free(object, pindex, FALSE);
1013	} else if (
1014	    (page_req == VM_ALLOC_SYSTEM &&
1015	     cnt.v_cache_count == 0 &&
1016	     cnt.v_free_count > cnt.v_interrupt_free_min) ||
1017	    (page_req == VM_ALLOC_INTERRUPT && cnt.v_free_count > 0)
1018	) {
1019		/*
1020		 * Interrupt or system, dig deeper into the free list.
1021		 */
1022		m = vm_page_select_free(object, pindex, FALSE);
1023	} else if (page_req != VM_ALLOC_INTERRUPT) {
1024		/*
1025		 * Allocatable from cache (non-interrupt only).  On success,
1026		 * we must free the page and try again, thus ensuring that
1027		 * cnt.v_*_free_min counters are replenished.
1028		 */
1029		m = vm_page_select_cache(object, pindex);
1030		if (m == NULL) {
1031			splx(s);
1032#if defined(DIAGNOSTIC)
1033			if (cnt.v_cache_count > 0)
1034				printf("vm_page_alloc(NORMAL): missing pages on cache queue: %d\n", cnt.v_cache_count);
1035#endif
1036			vm_pageout_deficit++;
1037			pagedaemon_wakeup();
1038			return (NULL);
1039		}
1040		KASSERT(m->dirty == 0, ("Found dirty cache page %p", m));
1041		vm_page_busy(m);
1042		vm_page_protect(m, VM_PROT_NONE);
1043		vm_page_free(m);
1044		goto loop;
1045	} else {
1046		/*
1047		 * Not allocatable from the cache at interrupt time, give up.
1048		 */
1049		splx(s);
1050		vm_pageout_deficit++;
1051		pagedaemon_wakeup();
1052		return (NULL);
1053	}
1054
1055	/*
1056	 *  At this point we had better have found a good page.
1057	 */
1058
1059	KASSERT(
1060	    m != NULL,
1061	    ("vm_page_alloc(): missing page on free queue\n")
1062	);
1063
1064	/*
1065	 * Remove from free queue
1066	 */
1067
1068	vm_page_unqueue_nowakeup(m);
1069
1070	/*
1071	 * Initialize structure.  Only the PG_ZERO flag is inherited.
1072	 */
1073
1074	if (m->flags & PG_ZERO) {
1075		vm_page_zero_count--;
1076		m->flags = PG_ZERO | PG_BUSY;
1077	} else {
1078		m->flags = PG_BUSY;
1079	}
1080	m->wire_count = 0;
1081	m->hold_count = 0;
1082	m->act_count = 0;
1083	m->busy = 0;
1084	m->valid = 0;
1085	KASSERT(m->dirty == 0, ("vm_page_alloc: free/cache page %p was dirty", m));
1086
1087	/*
1088	 * vm_page_insert() is safe prior to the splx().  Note also that
1089	 * inserting a page here does not insert it into the pmap (which
1090	 * could cause us to block allocating memory).  We cannot block
1091	 * anywhere.
1092	 */
1093
1094	vm_page_insert(m, object, pindex);
1095
1096	/*
1097	 * Don't wake up the pageout daemon too often - only wake it up
1098	 * when we would be nearly out of memory.
1099	 */
1100	if (vm_paging_needed())
1101		pagedaemon_wakeup();
1102
1103	splx(s);
1104
1105	return (m);
1106}
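/*
 *	Illustrative usage sketch (hypothetical caller, not from the
 *	original file): callers that are allowed to sleep typically retry
 *	with VM_WAIT when the allocation fails:
 *
 *		while ((m = vm_page_alloc(object, pindex,
 *		    VM_ALLOC_NORMAL)) == NULL)
 *			VM_WAIT;
 */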
1107
1108/*
1109 *	vm_wait:	(also see VM_WAIT macro)
1110 *
1111 *	Block until free pages are available for allocation
1112 */
1113
1114void
1115vm_wait(void)
1116{
1117	int s;
1118
1119	s = splvm();
1120	if (curproc == pageproc) {
1121		vm_pageout_pages_needed = 1;
1122		tsleep(&vm_pageout_pages_needed, PSWP, "VMWait", 0);
1123	} else {
1124		if (!vm_pages_needed) {
1125			vm_pages_needed = 1;
1126			wakeup(&vm_pages_needed);
1127		}
1128		tsleep(&cnt.v_free_count, PVM, "vmwait", 0);
1129	}
1130	splx(s);
1131}
1132
1133/*
1134 *	vm_await:	(also see VM_AWAIT macro)
1135 *
1136 *	asleep() on an event that will be signalled when free pages are
1137 *	available for allocation.
1138 */
1139
1140void
1141vm_await(void)
1142{
1143	int s;
1144
1145	s = splvm();
1146	if (curproc == pageproc) {
1147		vm_pageout_pages_needed = 1;
1148		asleep(&vm_pageout_pages_needed, PSWP, "vmwait", 0);
1149	} else {
1150		if (!vm_pages_needed) {
1151			vm_pages_needed++;
1152			wakeup(&vm_pages_needed);
1153		}
1154		asleep(&cnt.v_free_count, PVM, "vmwait", 0);
1155	}
1156	splx(s);
1157}
1158
1159/*
1160 *	vm_page_activate:
1161 *
1162 *	Put the specified page on the active list (if appropriate).
1163 *	Ensure that act_count is at least ACT_INIT but do not otherwise
1164 *	mess with it.
1165 *
1166 *	The page queues must be locked.
1167 *	This routine may not block.
1168 */
1169void
1170vm_page_activate(vm_page_t m)
1171{
1172	int s;
1173
1174	GIANT_REQUIRED;
1175	s = splvm();
1176
1177	if (m->queue != PQ_ACTIVE) {
1178		if ((m->queue - m->pc) == PQ_CACHE)
1179			cnt.v_reactivated++;
1180
1181		vm_page_unqueue(m);
1182
1183		if (m->wire_count == 0 && (m->flags & PG_UNMANAGED) == 0) {
1184			m->queue = PQ_ACTIVE;
1185			vm_page_queues[PQ_ACTIVE].lcnt++;
1186			TAILQ_INSERT_TAIL(&vm_page_queues[PQ_ACTIVE].pl, m, pageq);
1187			if (m->act_count < ACT_INIT)
1188				m->act_count = ACT_INIT;
1189			cnt.v_active_count++;
1190		}
1191	} else {
1192		if (m->act_count < ACT_INIT)
1193			m->act_count = ACT_INIT;
1194	}
1195
1196	splx(s);
1197}
1198
1199/*
1200 *	vm_page_free_wakeup:
1201 *
1202 *	Helper routine for vm_page_free_toq() and vm_page_cache().  This
1203 *	routine is called when a page has been added to the cache or free
1204 *	queues.
1205 *
1206 *	This routine may not block.
1207 *	This routine must be called at splvm()
1208 */
1209static __inline void
1210vm_page_free_wakeup(void)
1211{
1212	/*
1213	 * if pageout daemon needs pages, then tell it that there are
1214	 * some free.
1215	 */
1216	if (vm_pageout_pages_needed &&
1217	    cnt.v_cache_count + cnt.v_free_count >= cnt.v_pageout_free_min) {
1218		wakeup(&vm_pageout_pages_needed);
1219		vm_pageout_pages_needed = 0;
1220	}
1221	/*
1222	 * Wake up processes that are waiting on memory if we hit a
1223	 * high water mark, and wake up the scheduler process if we have
1224	 * lots of memory; this process will swap in processes.
1225	 */
1226	if (vm_pages_needed && !vm_page_count_min()) {
1227		vm_pages_needed = 0;
1228		wakeup(&cnt.v_free_count);
1229	}
1230}
1231
1232/*
1233 *	vm_page_free_toq:
1234 *
1235 *	Returns the given page to the PQ_FREE list,
1236 *	disassociating it from any VM object.
1237 *
1238 *	Object and page must be locked prior to entry.
1239 *	This routine may not block.
1240 */
1241
1242void
1243vm_page_free_toq(vm_page_t m)
1244{
1245	int s;
1246	struct vpgqueues *pq;
1247	vm_object_t object = m->object;
1248
1249	GIANT_REQUIRED;
1250	s = splvm();
1251	cnt.v_tfree++;
1252
1253	if (m->busy || ((m->queue - m->pc) == PQ_FREE) ||
1254		(m->hold_count != 0)) {
1255		printf(
1256		"vm_page_free: pindex(%lu), busy(%d), PG_BUSY(%d), hold(%d)\n",
1257		    (u_long)m->pindex, m->busy, (m->flags & PG_BUSY) ? 1 : 0,
1258		    m->hold_count);
1259		if ((m->queue - m->pc) == PQ_FREE)
1260			panic("vm_page_free: freeing free page");
1261		else
1262			panic("vm_page_free: freeing busy page");
1263	}
1264
1265	/*
1266	 * unqueue, then remove page.  Note that we cannot destroy
1267	 * the page here because we do not want to call the pager's
1268	 * callback routine until after we've put the page on the
1269	 * appropriate free queue.
1270	 */
1271
1272	vm_page_unqueue_nowakeup(m);
1273	vm_page_remove(m);
1274
1275	/*
1276	 * If fictitious remove object association and
1277	 * return, otherwise delay object association removal.
1278	 */
1279
1280	if ((m->flags & PG_FICTITIOUS) != 0) {
1281		splx(s);
1282		return;
1283	}
1284
1285	m->valid = 0;
1286	vm_page_undirty(m);
1287
1288	if (m->wire_count != 0) {
1289		if (m->wire_count > 1) {
1290			panic("vm_page_free: invalid wire count (%d), pindex: 0x%lx",
1291				m->wire_count, (long)m->pindex);
1292		}
1293		panic("vm_page_free: freeing wired page\n");
1294	}
1295
1296	/*
1297	 * If we've exhausted the object's resident pages we want to free
1298	 * it up.
1299	 */
1300
1301	if (object &&
1302	    (object->type == OBJT_VNODE) &&
1303	    ((object->flags & OBJ_DEAD) == 0)
1304	) {
1305		struct vnode *vp = (struct vnode *)object->handle;
1306
1307		if (vp && VSHOULDFREE(vp))
1308			vfree(vp);
1309	}
1310
1311	/*
1312	 * Clear the UNMANAGED flag when freeing an unmanaged page.
1313	 */
1314
1315	if (m->flags & PG_UNMANAGED) {
1316	    m->flags &= ~PG_UNMANAGED;
1317	} else {
1318#ifdef __alpha__
1319	    pmap_page_is_free(m);
1320#endif
1321	}
1322
1323	m->queue = PQ_FREE + m->pc;
1324	pq = &vm_page_queues[m->queue];
1325	pq->lcnt++;
1326	++(*pq->cnt);
1327
1328	/*
1329	 * Put zero'd pages at the end (where we look for zero'd pages
1330	 * first) and non-zero'd pages at the head.
1331	 */
1332
1333	if (m->flags & PG_ZERO) {
1334		TAILQ_INSERT_TAIL(&pq->pl, m, pageq);
1335		++vm_page_zero_count;
1336	} else {
1337		TAILQ_INSERT_HEAD(&pq->pl, m, pageq);
1338	}
1339
1340	vm_page_free_wakeup();
1341
1342	splx(s);
1343}
1344
1345/*
1346 *	vm_page_unmanage:
1347 *
1348 * 	Prevent PV management from being done on the page.  The page is
1349 *	removed from the paging queues as if it were wired, and as a
1350 *	consequence of no longer being managed the pageout daemon will not
1351 *	touch it (since there is no way to locate the pte mappings for the
1352 *	page).  madvise() calls that mess with the pmap will also no longer
1353 *	operate on the page.
1354 *
1355 *	Beyond that the page is still reasonably 'normal'.  Freeing the page
1356 *	will clear the flag.
1357 *
1358 *	This routine is used by OBJT_PHYS objects - objects using unswappable
1359 *	physical memory as backing store rather than swap-backed memory and
1360 *	will eventually be extended to support 4MB unmanaged physical
1361 *	mappings.
1362 */
1363
1364void
1365vm_page_unmanage(vm_page_t m)
1366{
1367	int s;
1368
1369	s = splvm();
1370	if ((m->flags & PG_UNMANAGED) == 0) {
1371		if (m->wire_count == 0)
1372			vm_page_unqueue(m);
1373	}
1374	vm_page_flag_set(m, PG_UNMANAGED);
1375	splx(s);
1376}
1377
1378/*
1379 *	vm_page_wire:
1380 *
1381 *	Mark this page as wired down by yet
1382 *	another map, removing it from paging queues
1383 *	as necessary.
1384 *
1385 *	The page queues must be locked.
1386 *	This routine may not block.
1387 */
1388void
1389vm_page_wire(vm_page_t m)
1390{
1391	int s;
1392
1393	/*
1394	 * Only bump the wire statistics if the page is not already wired,
1395	 * and only unqueue the page if it is on some queue (if it is unmanaged
1396	 * it is already off the queues).
1397	 */
1398	s = splvm();
1399	if (m->wire_count == 0) {
1400		if ((m->flags & PG_UNMANAGED) == 0)
1401			vm_page_unqueue(m);
1402		cnt.v_wire_count++;
1403	}
1404	m->wire_count++;
1405	splx(s);
1406	vm_page_flag_set(m, PG_MAPPED);
1407}
1408
1409/*
1410 *	vm_page_unwire:
1411 *
1412 *	Release one wiring of this page, potentially
1413 *	enabling it to be paged again.
1414 *
1415 *	Many pages placed on the inactive queue should actually go
1416 *	into the cache, but it is difficult to figure out which.  What
1417 *	we do instead, if the inactive target is well met, is to put
1418 *	clean pages at the head of the inactive queue instead of the tail.
1419 *	This will cause them to be moved to the cache more quickly and
1420 *	if not actively re-referenced, freed more quickly.  If we just
1421 *	stick these pages at the end of the inactive queue, heavy filesystem
1422 *	meta-data accesses can cause an unnecessary paging load on memory bound
1423 *	processes.  This optimization causes one-time-use metadata to be
1424 *	reused more quickly.
1425 *
1426 *	BUT, if we are in a low-memory situation we have no choice but to
1427 *	put clean pages on the cache queue.
1428 *
1429 *	A number of routines use vm_page_unwire() to guarantee that the page
1430 *	will go into either the inactive or active queues, and will NEVER
1431 *	be placed in the cache - for example, just after dirtying a page.
1432 *	Dirty pages are not allowed in the cache.
1433 *
1434 *	The page queues must be locked.
1435 *	This routine may not block.
1436 */
1437void
1438vm_page_unwire(vm_page_t m, int activate)
1439{
1440	int s;
1441
1442	s = splvm();
1443
1444	if (m->wire_count > 0) {
1445		m->wire_count--;
1446		if (m->wire_count == 0) {
1447			cnt.v_wire_count--;
1448			if (m->flags & PG_UNMANAGED) {
1449				;
1450			} else if (activate) {
1451				TAILQ_INSERT_TAIL(&vm_page_queues[PQ_ACTIVE].pl, m, pageq);
1452				m->queue = PQ_ACTIVE;
1453				vm_page_queues[PQ_ACTIVE].lcnt++;
1454				cnt.v_active_count++;
1455			} else {
1456				vm_page_flag_clear(m, PG_WINATCFLS);
1457				TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
1458				m->queue = PQ_INACTIVE;
1459				vm_page_queues[PQ_INACTIVE].lcnt++;
1460				cnt.v_inactive_count++;
1461			}
1462		}
1463	} else {
1464		panic("vm_page_unwire: invalid wire count: %d\n", m->wire_count);
1465	}
1466	splx(s);
1467}
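/*
 *	Illustrative sketch (hypothetical caller, not from the original
 *	file): wirings are reference counted, so every vm_page_wire() is
 *	eventually paired with a vm_page_unwire():
 *
 *		vm_page_wire(m);
 *		... the page cannot be paged out while wired ...
 *		vm_page_unwire(m, 1);	and place it on the active queue
 */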
1468
1469
1470/*
1471 * Move the specified page to the inactive queue.  If the page has
1472 * any associated swap, the swap is deallocated.
1473 *
1474 * Normally athead is 0 resulting in LRU operation.  athead is set
1475 * to 1 if we want this page to be 'as if it were placed in the cache',
1476 * except without unmapping it from the process address space.
1477 *
1478 * This routine may not block.
1479 */
1480static __inline void
1481_vm_page_deactivate(vm_page_t m, int athead)
1482{
1483	int s;
1484
1485	GIANT_REQUIRED;
1486	/*
1487	 * Ignore if already inactive.
1488	 */
1489	if (m->queue == PQ_INACTIVE)
1490		return;
1491
1492	s = splvm();
1493	if (m->wire_count == 0 && (m->flags & PG_UNMANAGED) == 0) {
1494		if ((m->queue - m->pc) == PQ_CACHE)
1495			cnt.v_reactivated++;
1496		vm_page_flag_clear(m, PG_WINATCFLS);
1497		vm_page_unqueue(m);
1498		if (athead)
1499			TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
1500		else
1501			TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
1502		m->queue = PQ_INACTIVE;
1503		vm_page_queues[PQ_INACTIVE].lcnt++;
1504		cnt.v_inactive_count++;
1505	}
1506	splx(s);
1507}
1508
1509void
1510vm_page_deactivate(vm_page_t m)
1511{
1512    _vm_page_deactivate(m, 0);
1513}
1514
1515/*
1516 * vm_page_try_to_cache:
1517 *
1518 * Returns 0 on failure, 1 on success
1519 */
1520int
1521vm_page_try_to_cache(vm_page_t m)
1522{
1523	GIANT_REQUIRED;
1524
1525	if (m->dirty || m->hold_count || m->busy || m->wire_count ||
1526	    (m->flags & (PG_BUSY|PG_UNMANAGED))) {
1527		return(0);
1528	}
1529	vm_page_test_dirty(m);
1530	if (m->dirty)
1531		return(0);
1532	vm_page_cache(m);
1533	return(1);
1534}
1535
1536/*
1537 * vm_page_try_to_free()
1538 *
1539 *	Attempt to free the page.  If we cannot free it, we do nothing.
1540 *	1 is returned on success, 0 on failure.
1541 */
1542int
1543vm_page_try_to_free(vm_page_t m)
1544{
1545	if (m->dirty || m->hold_count || m->busy || m->wire_count ||
1546	    (m->flags & (PG_BUSY|PG_UNMANAGED))) {
1547		return(0);
1548	}
1549	vm_page_test_dirty(m);
1550	if (m->dirty)
1551		return(0);
1552	vm_page_busy(m);
1553	vm_page_protect(m, VM_PROT_NONE);
1554	vm_page_free(m);
1555	return(1);
1556}
1557
1558/*
1559 * vm_page_cache
1560 *
1561 * Put the specified page onto the page cache queue (if appropriate).
1562 *
1563 * This routine may not block.
1564 */
1565void
1566vm_page_cache(vm_page_t m)
1567{
1568	int s;
1569
1570	GIANT_REQUIRED;
1571	if ((m->flags & (PG_BUSY|PG_UNMANAGED)) || m->busy || m->wire_count) {
1572		printf("vm_page_cache: attempting to cache busy page\n");
1573		return;
1574	}
1575	if ((m->queue - m->pc) == PQ_CACHE)
1576		return;
1577
1578	/*
1579	 * Remove all pmaps and indicate that the page is not
1580	 * writeable or mapped.
1581	 */
1582
1583	vm_page_protect(m, VM_PROT_NONE);
1584	if (m->dirty != 0) {
1585		panic("vm_page_cache: caching a dirty page, pindex: %ld",
1586			(long)m->pindex);
1587	}
1588	s = splvm();
1589	vm_page_unqueue_nowakeup(m);
1590	m->queue = PQ_CACHE + m->pc;
1591	vm_page_queues[m->queue].lcnt++;
1592	TAILQ_INSERT_TAIL(&vm_page_queues[m->queue].pl, m, pageq);
1593	cnt.v_cache_count++;
1594	vm_page_free_wakeup();
1595	splx(s);
1596}
1597
1598/*
1599 * vm_page_dontneed
1600 *
1601 *	Cache, deactivate, or do nothing as appropriate.  This routine
1602 *	is typically used by madvise() MADV_DONTNEED.
1603 *
1604 *	Generally speaking we want to move the page into the cache so
1605 *	it gets reused quickly.  However, this can result in a silly syndrome
1606 *	due to the page recycling too quickly.  Small objects will not be
1607 *	fully cached.  On the other hand, if we move the page to the inactive
1608 *	queue we wind up with a problem whereby very large objects
1609 *	unnecessarily blow away our inactive and cache queues.
1610 *
1611 *	The solution is to move the pages based on a fixed weighting.  We
1612 *	either leave them alone, deactivate them, or move them to the cache,
1613 *	where moving them to the cache has the highest weighting.
1614 *	By forcing some pages into other queues we eventually force the
1615 *	system to balance the queues, potentially recovering other unrelated
1616 *	space from active.  The idea is to not force this to happen too
1617 *	often.
1618 */
1619
1620void
1621vm_page_dontneed(vm_page_t m)
1622{
1623	static int dnweight;
1624	int dnw;
1625	int head;
1626
1627	GIANT_REQUIRED;
1628	dnw = ++dnweight;
1629
1630	/*
1631	 * Occasionally leave the page alone.
1632	 */
1633
1634	if ((dnw & 0x01F0) == 0 ||
1635	    m->queue == PQ_INACTIVE ||
1636	    m->queue - m->pc == PQ_CACHE
1637	) {
1638		if (m->act_count >= ACT_INIT)
1639			--m->act_count;
1640		return;
1641	}
1642
1643	if (m->dirty == 0)
1644		vm_page_test_dirty(m);
1645
1646	if (m->dirty || (dnw & 0x0070) == 0) {
1647		/*
1648		 * Deactivate the page 3 times out of 32.
1649		 */
1650		head = 0;
1651	} else {
1652		/*
1653		 * Cache the page 28 times out of every 32.  Note that
1654		 * the page is deactivated instead of cached, but placed
1655		 * at the head of the queue instead of the tail.
1656		 */
1657		head = 1;
1658	}
1659	_vm_page_deactivate(m, head);
1660}
1661
1662/*
1663 * Grab a page, waiting until we are woken up due to the page
1664 * changing state.  We keep on waiting as long as the page continues
1665 * to be in the object.  If the page doesn't exist, allocate it.
1666 *
1667 * This routine may block.
1668 */
1669vm_page_t
1670vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags)
1671{
1672	vm_page_t m;
1673	int s, generation;
1674
1675	GIANT_REQUIRED;
1676retrylookup:
1677	if ((m = vm_page_lookup(object, pindex)) != NULL) {
1678		if (m->busy || (m->flags & PG_BUSY)) {
1679			generation = object->generation;
1680
1681			s = splvm();
1682			while ((object->generation == generation) &&
1683					(m->busy || (m->flags & PG_BUSY))) {
1684				vm_page_flag_set(m, PG_WANTED | PG_REFERENCED);
1685				tsleep(m, PVM, "pgrbwt", 0);
1686				if ((allocflags & VM_ALLOC_RETRY) == 0) {
1687					splx(s);
1688					return NULL;
1689				}
1690			}
1691			splx(s);
1692			goto retrylookup;
1693		} else {
1694			vm_page_busy(m);
1695			return m;
1696		}
1697	}
1698
1699	m = vm_page_alloc(object, pindex, allocflags & ~VM_ALLOC_RETRY);
1700	if (m == NULL) {
1701		VM_WAIT;
1702		if ((allocflags & VM_ALLOC_RETRY) == 0)
1703			return NULL;
1704		goto retrylookup;
1705	}
1706
1707	return m;
1708}
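/*
 *	Illustrative usage sketch (hypothetical caller, not from the
 *	original file): with VM_ALLOC_RETRY the call does not return NULL,
 *	and the returned page is busied; the caller validates it and then
 *	wakes any waiters:
 *
 *		m = vm_page_grab(object, pindex,
 *		    VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
 *		if (m->valid != VM_PAGE_BITS_ALL)
 *			vm_page_zero_invalid(m, TRUE);
 *		... use the page, then ...
 *		vm_page_wakeup(m);
 */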
1709
1710/*
1711 * Mapping function for valid bits or for dirty bits in
1712 * a page.  May not block.
1713 *
1714 * Inputs are required to range within a page.
1715 */
1716
1717__inline int
1718vm_page_bits(int base, int size)
1719{
1720	int first_bit;
1721	int last_bit;
1722
1723	KASSERT(
1724	    base + size <= PAGE_SIZE,
1725	    ("vm_page_bits: illegal base/size %d/%d", base, size)
1726	);
1727
1728	if (size == 0)		/* handle degenerate case */
1729		return(0);
1730
1731	first_bit = base >> DEV_BSHIFT;
1732	last_bit = (base + size - 1) >> DEV_BSHIFT;
1733
1734	return ((2 << last_bit) - (1 << first_bit));
1735}
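/*
 *	Worked example (illustrative, assuming DEV_BSIZE == 512 and thus
 *	DEV_BSHIFT == 9): for base = 512 and size = 1024,
 *
 *		first_bit = 512 >> 9			= 1
 *		last_bit  = (512 + 1024 - 1) >> 9	= 2
 *		result    = (2 << 2) - (1 << 1)		= 6 (binary 0110)
 *
 *	i.e. bits 1 and 2 are set, covering the second and third
 *	DEV_BSIZE chunks of the page.
 */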
1736
1737/*
1738 *	vm_page_set_validclean:
1739 *
1740 *	Sets portions of a page valid and clean.  The arguments are expected
1741 *	to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive
1742 *	of any partial chunks touched by the range.  The invalid portion of
1743 *	such chunks will be zero'd.
1744 *
1745 *	This routine may not block.
1746 *
1747 *	(base + size) must be less than or equal to PAGE_SIZE.
1748 */
1749void
1750vm_page_set_validclean(vm_page_t m, int base, int size)
1751{
1752	int pagebits;
1753	int frag;
1754	int endoff;
1755
1756	GIANT_REQUIRED;
1757	if (size == 0)	/* handle degenerate case */
1758		return;
1759
1760	/*
1761	 * If the base is not DEV_BSIZE aligned and the valid
1762	 * bit is clear, we have to zero out a portion of the
1763	 * first block.
1764	 */
1765
1766	if ((frag = base & ~(DEV_BSIZE - 1)) != base &&
1767	    (m->valid & (1 << (base >> DEV_BSHIFT))) == 0
1768	) {
1769		pmap_zero_page_area(
1770		    VM_PAGE_TO_PHYS(m),
1771		    frag,
1772		    base - frag
1773		);
1774	}
1775
1776	/*
1777	 * If the ending offset is not DEV_BSIZE aligned and the
1778	 * valid bit is clear, we have to zero out a portion of
1779	 * the last block.
1780	 */
1781
1782	endoff = base + size;
1783
1784	if ((frag = endoff & ~(DEV_BSIZE - 1)) != endoff &&
1785	    (m->valid & (1 << (endoff >> DEV_BSHIFT))) == 0
1786	) {
1787		pmap_zero_page_area(
1788		    VM_PAGE_TO_PHYS(m),
1789		    endoff,
1790		    DEV_BSIZE - (endoff & (DEV_BSIZE - 1))
1791		);
1792	}
1793
1794	/*
1795	 * Set valid, clear dirty bits.  If validating the entire
1796	 * page we can safely clear the pmap modify bit.  We also
1797	 * use this opportunity to clear the PG_NOSYNC flag.  If a process
1798	 * takes a write fault on a MAP_NOSYNC memory area the flag will
1799	 * be set again.
1800	 */
1801
1802	pagebits = vm_page_bits(base, size);
1803	m->valid |= pagebits;
1804	m->dirty &= ~pagebits;
1805	if (base == 0 && size == PAGE_SIZE) {
1806		pmap_clear_modify(m);
1807		vm_page_flag_clear(m, PG_NOSYNC);
1808	}
1809}
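/*
 *	Illustrative usage sketch (hypothetical caller, not from the
 *	original file): after reading only the first 1024 bytes of a page
 *	from backing store, mark just that portion valid and clean:
 *
 *		vm_page_set_validclean(m, 0, 1024);
 */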
1810
1811#if 0
1812
1813void
1814vm_page_set_dirty(vm_page_t m, int base, int size)
1815{
1816	m->dirty |= vm_page_bits(base, size);
1817}
1818
1819#endif
1820
1821void
1822vm_page_clear_dirty(vm_page_t m, int base, int size)
1823{
1824	GIANT_REQUIRED;
1825	m->dirty &= ~vm_page_bits(base, size);
1826}
1827
1828/*
1829 *	vm_page_set_invalid:
1830 *
1831 *	Invalidates DEV_BSIZE'd chunks within a page.  Both the
1832 *	valid and dirty bits for the affected areas are cleared.
1833 *
1834 *	May not block.
1835 */
1836void
1837vm_page_set_invalid(vm_page_t m, int base, int size)
1838{
1839	int bits;
1840
1841	GIANT_REQUIRED;
1842	bits = vm_page_bits(base, size);
1843	m->valid &= ~bits;
1844	m->dirty &= ~bits;
1845	m->object->generation++;
1846}
1847
1848/*
1849 * vm_page_zero_invalid()
1850 *
1851 *	The kernel assumes that the invalid portions of a page contain
1852 *	garbage, but such pages can be mapped into memory by user code.
1853 *	When this occurs, we must zero out the non-valid portions of the
1854 *	page so user code sees what it expects.
1855 *
1856 *	Pages are most often semi-valid when the end of a file is mapped
1857 *	into memory and the file's size is not page aligned.
1858 */
1859
1860void
1861vm_page_zero_invalid(vm_page_t m, boolean_t setvalid)
1862{
1863	int b;
1864	int i;
1865
1866	/*
1867	 * Scan the valid bits looking for invalid sections that
1868	 * must be zeroed.  Invalid sub-DEV_BSIZE'd areas (where the
1869	 * valid bit may be set) have already been zeroed by
1870	 * vm_page_set_validclean().
1871	 */
1872
1873	for (b = i = 0; i <= PAGE_SIZE / DEV_BSIZE; ++i) {
1874		if (i == (PAGE_SIZE / DEV_BSIZE) ||
1875		    (m->valid & (1 << i))
1876		) {
1877			if (i > b) {
1878				pmap_zero_page_area(
1879				    VM_PAGE_TO_PHYS(m),
1880				    b << DEV_BSHIFT,
1881				    (i - b) << DEV_BSHIFT
1882				);
1883			}
1884			b = i + 1;
1885		}
1886	}
1887
1888	/*
1889	 * setvalid is TRUE when we can safely set the zero'd areas
1890 *	 as being valid.  We can do this if there are no cache consistency
1891	 * issues.  e.g. it is ok to do with UFS, but not ok to do with NFS.
1892	 */
1893
1894	if (setvalid)
1895		m->valid = VM_PAGE_BITS_ALL;
1896}
1897
1898/*
1899 *	vm_page_is_valid:
1900 *
1901 *	Is (partial) page valid?  Note that the case where size == 0
1902 *	will return FALSE in the degenerate case where the page is
1903 *	entirely invalid, and TRUE otherwise.
1904 *
1905 *	May not block.
1906 */
1907
1908int
1909vm_page_is_valid(vm_page_t m, int base, int size)
1910{
1911	int bits = vm_page_bits(base, size);
1912
1913	if (m->valid && ((m->valid & bits) == bits))
1914		return 1;
1915	else
1916		return 0;
1917}
1918
1919/*
1920 * update dirty bits from pmap/mmu.  May not block.
1921 */
1922
1923void
1924vm_page_test_dirty(vm_page_t m)
1925{
1926	if ((m->dirty != VM_PAGE_BITS_ALL) && pmap_is_modified(m)) {
1927		vm_page_dirty(m);
1928	}
1929}
1930
1931/*
1932 * This interface is for merging with malloc() someday.
1933 * Even if we never implement compaction so that contiguous allocation
1934 * works after initialization time, malloc()'s data structures are good
1935 * for statistics and for allocations of less than a page.
1936 */
1937void *
1938contigmalloc1(
1939	unsigned long size,	/* should be size_t here and for malloc() */
1940	struct malloc_type *type,
1941	int flags,
1942	unsigned long low,
1943	unsigned long high,
1944	unsigned long alignment,
1945	unsigned long boundary,
1946	vm_map_t map)
1947{
1948	int i, s, start;
1949	vm_offset_t addr, phys, tmp_addr;
1950	int pass;
1951	vm_page_t pga = vm_page_array;
1952
1953	size = round_page(size);
1954	if (size == 0)
1955		panic("contigmalloc1: size must not be 0");
1956	if ((alignment & (alignment - 1)) != 0)
1957		panic("contigmalloc1: alignment must be a power of 2");
1958	if ((boundary & (boundary - 1)) != 0)
1959		panic("contigmalloc1: boundary must be a power of 2");
1960
1961	start = 0;
1962	for (pass = 0; pass <= 1; pass++) {
1963		s = splvm();
1964again:
1965		/*
1966		 * Find first page in array that is free, within range, aligned, and
1967		 * such that the boundary won't be crossed.
1968		 */
1969		for (i = start; i < cnt.v_page_count; i++) {
1970			int pqtype;
1971			phys = VM_PAGE_TO_PHYS(&pga[i]);
1972			pqtype = pga[i].queue - pga[i].pc;
1973			if (((pqtype == PQ_FREE) || (pqtype == PQ_CACHE)) &&
1974			    (phys >= low) && (phys < high) &&
1975			    ((phys & (alignment - 1)) == 0) &&
1976			    (((phys ^ (phys + size - 1)) & ~(boundary - 1)) == 0))
1977				break;
1978		}
1979
1980		/*
1981		 * If the above failed or we will exceed the upper bound, fail.
1982		 */
1983		if ((i == cnt.v_page_count) ||
1984			((VM_PAGE_TO_PHYS(&pga[i]) + size) > high)) {
1985			vm_page_t m, next;
1986
1987again1:
1988			for (m = TAILQ_FIRST(&vm_page_queues[PQ_INACTIVE].pl);
1989				m != NULL;
1990				m = next) {
1991
1992				KASSERT(m->queue == PQ_INACTIVE,
1993					("contigmalloc1: page %p is not PQ_INACTIVE", m));
1994
1995				next = TAILQ_NEXT(m, pageq);
1996				if (vm_page_sleep_busy(m, TRUE, "vpctw0"))
1997					goto again1;
1998				vm_page_test_dirty(m);
1999				if (m->dirty) {
2000					if (m->object->type == OBJT_VNODE) {
2001						vn_lock(m->object->handle, LK_EXCLUSIVE | LK_RETRY, curproc);
2002						vm_object_page_clean(m->object, 0, 0, OBJPC_SYNC);
2003						VOP_UNLOCK(m->object->handle, 0, curproc);
2004						goto again1;
2005					} else if (m->object->type == OBJT_SWAP ||
2006								m->object->type == OBJT_DEFAULT) {
2007						vm_pageout_flush(&m, 1, 0);
2008						goto again1;
2009					}
2010				}
2011				if ((m->dirty == 0) && (m->busy == 0) && (m->hold_count == 0))
2012					vm_page_cache(m);
2013			}
2014
2015			for (m = TAILQ_FIRST(&vm_page_queues[PQ_ACTIVE].pl);
2016				m != NULL;
2017				m = next) {
2018
2019				KASSERT(m->queue == PQ_ACTIVE,
2020					("contigmalloc1: page %p is not PQ_ACTIVE", m));
2021
2022				next = TAILQ_NEXT(m, pageq);
2023				if (vm_page_sleep_busy(m, TRUE, "vpctw1"))
2024					goto again1;
2025				vm_page_test_dirty(m);
2026				if (m->dirty) {
2027					if (m->object->type == OBJT_VNODE) {
2028						vn_lock(m->object->handle, LK_EXCLUSIVE | LK_RETRY, curproc);
2029						vm_object_page_clean(m->object, 0, 0, OBJPC_SYNC);
2030						VOP_UNLOCK(m->object->handle, 0, curproc);
2031						goto again1;
2032					} else if (m->object->type == OBJT_SWAP ||
2033								m->object->type == OBJT_DEFAULT) {
2034						vm_pageout_flush(&m, 1, 0);
2035						goto again1;
2036					}
2037				}
2038				if ((m->dirty == 0) && (m->busy == 0) && (m->hold_count == 0))
2039					vm_page_cache(m);
2040			}
2041
2042			splx(s);
2043			continue;
2044		}
2045		start = i;
2046
2047		/*
2048		 * Check successive pages for contiguous and free.
2049		 */
2050		for (i = start + 1; i < (start + size / PAGE_SIZE); i++) {
2051			int pqtype;
2052			pqtype = pga[i].queue - pga[i].pc;
2053			if ((VM_PAGE_TO_PHYS(&pga[i]) !=
2054			    (VM_PAGE_TO_PHYS(&pga[i - 1]) + PAGE_SIZE)) ||
2055			    ((pqtype != PQ_FREE) && (pqtype != PQ_CACHE))) {
2056				start++;
2057				goto again;
2058			}
2059		}
2060
2061		for (i = start; i < (start + size / PAGE_SIZE); i++) {
2062			int pqtype;
2063			vm_page_t m = &pga[i];
2064
2065			pqtype = m->queue - m->pc;
2066			if (pqtype == PQ_CACHE) {
2067				vm_page_busy(m);
2068				vm_page_free(m);
2069			}
2070
2071			TAILQ_REMOVE(&vm_page_queues[m->queue].pl, m, pageq);
2072			vm_page_queues[m->queue].lcnt--;
2073			cnt.v_free_count--;
2074			m->valid = VM_PAGE_BITS_ALL;
2075			m->flags = 0;
2076			KASSERT(m->dirty == 0, ("contigmalloc1: page %p was dirty", m));
2077			m->wire_count = 0;
2078			m->busy = 0;
2079			m->queue = PQ_NONE;
2080			m->object = NULL;
2081			vm_page_wire(m);
2082		}
2083
2084		/*
2085		 * We've found a contiguous chunk that meets our requirements.
2086		 * Allocate kernel VM, assign the physical pages to it, and
2087		 * return the kernel VM pointer.
2088		 */
2089		tmp_addr = addr = kmem_alloc_pageable(map, size);
2090		if (addr == 0) {
2091			/*
2092			 * XXX We almost never run out of kernel virtual
2093			 * space, so we don't make the allocated memory
2094			 * above available.
2095			 */
2096			splx(s);
2097			return (NULL);
2098		}
2099
2100		for (i = start; i < (start + size / PAGE_SIZE); i++) {
2101			vm_page_t m = &pga[i];
2102			vm_page_insert(m, kernel_object,
2103				OFF_TO_IDX(tmp_addr - VM_MIN_KERNEL_ADDRESS));
2104			pmap_kenter(tmp_addr, VM_PAGE_TO_PHYS(m));
2105			tmp_addr += PAGE_SIZE;
2106		}
2107
2108		splx(s);
2109		return ((void *)addr);
2110	}
2111	return NULL;
2112}
2113
2114void *
2115contigmalloc(
2116	unsigned long size,	/* should be size_t here and for malloc() */
2117	struct malloc_type *type,
2118	int flags,
2119	unsigned long low,
2120	unsigned long high,
2121	unsigned long alignment,
2122	unsigned long boundary)
2123{
2124	void * ret;
2125
2126	GIANT_REQUIRED;
2127	ret = contigmalloc1(size, type, flags, low, high, alignment, boundary,
2128			     kernel_map);
2129	return (ret);
2130
2131}
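/*
 *	Illustrative usage sketch (hypothetical driver, not from the
 *	original file): allocate 64KB of physically contiguous memory
 *	below 16MB, aligned to 64KB, with no boundary restriction:
 *
 *		void *buf;
 *
 *		buf = contigmalloc(65536, M_DEVBUF, M_NOWAIT,
 *		    0ul, 0xfffffful, 65536ul, 0ul);
 *		if (buf == NULL)
 *			return (ENOMEM);
 *		...
 *		contigfree(buf, 65536, M_DEVBUF);
 */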
2132
2133void
2134contigfree(void *addr, unsigned long size, struct malloc_type *type)
2135{
2136	GIANT_REQUIRED;
2137	kmem_free(kernel_map, (vm_offset_t)addr, size);
2138}
2139
2140vm_offset_t
2141vm_page_alloc_contig(
2142	vm_offset_t size,
2143	vm_offset_t low,
2144	vm_offset_t high,
2145	vm_offset_t alignment)
2146{
2147	vm_offset_t ret;
2148
2149	GIANT_REQUIRED;
2150	ret = ((vm_offset_t)contigmalloc1(size, M_DEVBUF, M_NOWAIT, low, high,
2151					  alignment, 0ul, kernel_map));
2152	return (ret);
2153
2154}
2155
2156#include "opt_ddb.h"
2157#ifdef DDB
2158#include <sys/kernel.h>
2159
2160#include <ddb/ddb.h>
2161
2162DB_SHOW_COMMAND(page, vm_page_print_page_info)
2163{
2164	db_printf("cnt.v_free_count: %d\n", cnt.v_free_count);
2165	db_printf("cnt.v_cache_count: %d\n", cnt.v_cache_count);
2166	db_printf("cnt.v_inactive_count: %d\n", cnt.v_inactive_count);
2167	db_printf("cnt.v_active_count: %d\n", cnt.v_active_count);
2168	db_printf("cnt.v_wire_count: %d\n", cnt.v_wire_count);
2169	db_printf("cnt.v_free_reserved: %d\n", cnt.v_free_reserved);
2170	db_printf("cnt.v_free_min: %d\n", cnt.v_free_min);
2171	db_printf("cnt.v_free_target: %d\n", cnt.v_free_target);
2172	db_printf("cnt.v_cache_min: %d\n", cnt.v_cache_min);
2173	db_printf("cnt.v_inactive_target: %d\n", cnt.v_inactive_target);
2174}
2175
2176DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info)
2177{
2178	int i;
2179	db_printf("PQ_FREE:");
2180	for (i = 0; i < PQ_L2_SIZE; i++) {
2181		db_printf(" %d", vm_page_queues[PQ_FREE + i].lcnt);
2182	}
2183	db_printf("\n");
2184
2185	db_printf("PQ_CACHE:");
2186	for (i = 0; i < PQ_L2_SIZE; i++) {
2187		db_printf(" %d", vm_page_queues[PQ_CACHE + i].lcnt);
2188	}
2189	db_printf("\n");
2190
2191	db_printf("PQ_ACTIVE: %d, PQ_INACTIVE: %d\n",
2192		vm_page_queues[PQ_ACTIVE].lcnt,
2193		vm_page_queues[PQ_INACTIVE].lcnt);
2194}
2195#endif /* DDB */
2196