vm_page.c revision 48099
1/*
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 *
5 * This code is derived from software contributed to Berkeley by
6 * The Mach Operating System project at Carnegie-Mellon University.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 * 3. All advertising materials mentioning features or use of this software
17 *    must display the following acknowledgement:
18 *	This product includes software developed by the University of
19 *	California, Berkeley and its contributors.
20 * 4. Neither the name of the University nor the names of its contributors
21 *    may be used to endorse or promote products derived from this software
22 *    without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 *
36 *	from: @(#)vm_page.c	7.4 (Berkeley) 5/7/91
37 *	$Id: vm_page.c,v 1.132 1999/06/20 21:47:02 alc Exp $
38 */
39
40/*
41 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
42 * All rights reserved.
43 *
44 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
45 *
46 * Permission to use, copy, modify and distribute this software and
47 * its documentation is hereby granted, provided that both the copyright
48 * notice and this permission notice appear in all copies of the
49 * software, derivative works or modified versions, and any portions
50 * thereof, and that both notices appear in supporting documentation.
51 *
52 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
53 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
54 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
55 *
56 * Carnegie Mellon requests users of this software to return to
57 *
58 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
59 *  School of Computer Science
60 *  Carnegie Mellon University
61 *  Pittsburgh PA 15213-3890
62 *
63 * any improvements or extensions that they make and grant Carnegie the
64 * rights to redistribute these changes.
65 */
66
67/*
68 *	Resident memory management module.
69 */
70
71#include <sys/param.h>
72#include <sys/systm.h>
73#include <sys/malloc.h>
74#include <sys/proc.h>
75#include <sys/vmmeter.h>
76#include <sys/vnode.h>
77
78#include <vm/vm.h>
79#include <vm/vm_param.h>
80#include <vm/vm_prot.h>
81#include <sys/lock.h>
82#include <vm/vm_kern.h>
83#include <vm/vm_object.h>
84#include <vm/vm_page.h>
85#include <vm/vm_pageout.h>
86#include <vm/vm_pager.h>
87#include <vm/vm_extern.h>
88
89static void	vm_page_queue_init __P((void));
90static vm_page_t vm_page_select_cache __P((vm_object_t, vm_pindex_t));
91
92/*
93 *	Associated with each page of user-allocatable memory is a
94 *	page structure.
95 */
96
97static struct vm_page **vm_page_buckets; /* Array of buckets */
98static int vm_page_bucket_count;	/* How big is array? */
99static int vm_page_hash_mask;		/* Mask for hash function */
100static volatile int vm_page_bucket_generation;
101
102struct pglist vm_page_queue_free[PQ_L2_SIZE] = {{0}};
103struct pglist vm_page_queue_active = {0};
104struct pglist vm_page_queue_inactive = {0};
105struct pglist vm_page_queue_cache[PQ_L2_SIZE] = {{0}};
106
107static int no_queue=0;
108
109struct vpgqueues vm_page_queues[PQ_COUNT] = {{0}};
110static int pqcnt[PQ_COUNT] = {0};
111
112static void
113vm_page_queue_init(void) {
114	int i;
115
116	vm_page_queues[PQ_NONE].pl = NULL;
117	vm_page_queues[PQ_NONE].cnt = &no_queue;
118	for(i=0;i<PQ_L2_SIZE;i++) {
119		vm_page_queues[PQ_FREE+i].pl = &vm_page_queue_free[i];
120		vm_page_queues[PQ_FREE+i].cnt = &cnt.v_free_count;
121	}
122	vm_page_queues[PQ_INACTIVE].pl = &vm_page_queue_inactive;
123	vm_page_queues[PQ_INACTIVE].cnt = &cnt.v_inactive_count;
124
125	vm_page_queues[PQ_ACTIVE].pl = &vm_page_queue_active;
126	vm_page_queues[PQ_ACTIVE].cnt = &cnt.v_active_count;
127	for(i=0;i<PQ_L2_SIZE;i++) {
128		vm_page_queues[PQ_CACHE+i].pl = &vm_page_queue_cache[i];
129		vm_page_queues[PQ_CACHE+i].cnt = &cnt.v_cache_count;
130	}
131	for(i=0;i<PQ_COUNT;i++) {
132		if (vm_page_queues[i].pl) {
133			TAILQ_INIT(vm_page_queues[i].pl);
134		} else if (i != 0) {
135			panic("vm_page_queue_init: queue %d is null", i);
136		}
137		vm_page_queues[i].lcnt = &pqcnt[i];
138	}
139}
140
141vm_page_t vm_page_array = 0;
142static int vm_page_array_size = 0;
143long first_page = 0;
144int vm_page_zero_count = 0;
145
146static __inline int vm_page_hash __P((vm_object_t object, vm_pindex_t pindex));
147static void vm_page_free_wakeup __P((void));
148
149/*
150 *	vm_set_page_size:
151 *
152 *	Sets the page size, perhaps based upon the memory
153 *	size.  Must be called before any use of page-size
154 *	dependent functions.
155 */
156void
157vm_set_page_size()
158{
159	if (cnt.v_page_size == 0)
160		cnt.v_page_size = PAGE_SIZE;
161	if (((cnt.v_page_size - 1) & cnt.v_page_size) != 0)
162		panic("vm_set_page_size: page size not a power of two");
163}
164
165/*
166 *	vm_page_startup:
167 *
168 *	Initializes the resident memory module.
169 *
170 *	Allocates memory for the page cells, and
171 *	for the object/offset-to-page hash table headers.
172 *	Each page cell is initialized and placed on the free list.
173 */
174
175vm_offset_t
176vm_page_startup(starta, enda, vaddr)
177	register vm_offset_t starta;
178	vm_offset_t enda;
179	register vm_offset_t vaddr;
180{
181	register vm_offset_t mapped;
182	register vm_page_t m;
183	register struct vm_page **bucket;
184	vm_size_t npages, page_range;
185	register vm_offset_t new_start;
186	int i;
187	vm_offset_t pa;
188	int nblocks;
189	vm_offset_t first_managed_page;
190
191	/* the biggest memory array is the second group of pages */
192	vm_offset_t start;
193	vm_offset_t biggestone, biggestsize;
194
195	vm_offset_t total;
196
197	total = 0;
198	biggestsize = 0;
199	biggestone = 0;
200	nblocks = 0;
201	vaddr = round_page(vaddr);
202
203	for (i = 0; phys_avail[i + 1]; i += 2) {
204		phys_avail[i] = round_page(phys_avail[i]);
205		phys_avail[i + 1] = trunc_page(phys_avail[i + 1]);
206	}
207
208	for (i = 0; phys_avail[i + 1]; i += 2) {
209		int size = phys_avail[i + 1] - phys_avail[i];
210
211		if (size > biggestsize) {
212			biggestone = i;
213			biggestsize = size;
214		}
215		++nblocks;
216		total += size;
217	}
218
219	start = phys_avail[biggestone];
220
221	/*
222	 * Initialize the queue headers for the free queue, the active queue
223	 * and the inactive queue.
224	 */
225
226	vm_page_queue_init();
227
228	/*
229	 * Allocate (and initialize) the hash table buckets.
230	 *
231	 * The number of buckets MUST BE a power of 2, and the actual value is
232	 * the next power of 2 greater than the number of physical pages in
233	 * the system.
234	 *
235	 * We make the hash table approximately 2x the number of pages to
236	 * reduce the chain length.  With the singly-linked list this table
237	 * is about the same size as the 1x TAILQ-based hash table we were
238	 * using before, but the chains are shorter.
239	 *
240	 * Note: This computation can be tweaked if desired.
241	 */
242	vm_page_buckets = (struct vm_page **)vaddr;
243	bucket = vm_page_buckets;
244	if (vm_page_bucket_count == 0) {
245		vm_page_bucket_count = 1;
246		while (vm_page_bucket_count < atop(total))
247			vm_page_bucket_count <<= 1;
248	}
249	vm_page_bucket_count <<= 1;
250	vm_page_hash_mask = vm_page_bucket_count - 1;
251
252	/*
253	 * Validate these addresses.
254	 */
255
256	new_start = start + vm_page_bucket_count * sizeof(struct vm_page *);
257	new_start = round_page(new_start);
258	mapped = round_page(vaddr);
259	vaddr = pmap_map(mapped, start, new_start,
260	    VM_PROT_READ | VM_PROT_WRITE);
261	start = new_start;
262	vaddr = round_page(vaddr);
263	bzero((caddr_t) mapped, vaddr - mapped);
264
265	for (i = 0; i < vm_page_bucket_count; i++) {
266		*bucket = NULL;
267		bucket++;
268	}
269
270	/*
271	 * Compute the number of pages of memory that will be available for
272	 * use (taking into account the overhead of a page structure per
273	 * page).
274	 */
275
276	first_page = phys_avail[0] / PAGE_SIZE;
277
278	page_range = phys_avail[(nblocks - 1) * 2 + 1] / PAGE_SIZE - first_page;
279	npages = (total - (page_range * sizeof(struct vm_page)) -
280	    (start - phys_avail[biggestone])) / PAGE_SIZE;
281
282	/*
283	 * Initialize the mem entry structures now, and put them in the free
284	 * queue.
285	 */
286	vm_page_array = (vm_page_t) vaddr;
287	mapped = vaddr;
288
289	/*
290	 * Validate these addresses.
291	 */
292	new_start = round_page(start + page_range * sizeof(struct vm_page));
293	mapped = pmap_map(mapped, start, new_start,
294	    VM_PROT_READ | VM_PROT_WRITE);
295	start = new_start;
296
297	first_managed_page = start / PAGE_SIZE;
298
299	/*
300	 * Clear all of the page structures
301	 */
302	bzero((caddr_t) vm_page_array, page_range * sizeof(struct vm_page));
303	vm_page_array_size = page_range;
304
305	/*
306	 * Construct the free queue(s) in descending order (by physical
307	 * address) so that the first 16MB of physical memory is allocated
308	 * last rather than first.  On large-memory machines, this avoids
309	 * the exhaustion of low physical memory before isa_dmainit has run.
310	 */
311	cnt.v_page_count = 0;
312	cnt.v_free_count = 0;
313	for (i = 0; phys_avail[i + 1] && npages > 0; i += 2) {
314		if (i == biggestone)
315			pa = ptoa(first_managed_page);
316		else
317			pa = phys_avail[i];
318		while (pa < phys_avail[i + 1] && npages-- > 0) {
319			++cnt.v_page_count;
320			++cnt.v_free_count;
321			m = PHYS_TO_VM_PAGE(pa);
322			m->phys_addr = pa;
323			m->flags = 0;
324			m->pc = (pa >> PAGE_SHIFT) & PQ_L2_MASK;
325			m->queue = m->pc + PQ_FREE;
326			TAILQ_INSERT_HEAD(vm_page_queues[m->queue].pl, m, pageq);
327			++(*vm_page_queues[m->queue].lcnt);
328			pa += PAGE_SIZE;
329		}
330	}
331	return (mapped);
332}
333
334/*
335 *	vm_page_hash:
336 *
337 *	Distributes the object/offset key pair among hash buckets.
338 *
339 *	NOTE:  This routine depends on vm_page_bucket_count being a power of 2.
340 *	This routine may not block.
341 *
342 *	We try to randomize the hash based on the object to spread the pages
343 *	out in the hash table without it costing us too much.
344 */
345static __inline int
346vm_page_hash(object, pindex)
347	vm_object_t object;
348	vm_pindex_t pindex;
349{
350	int i = ((uintptr_t)object + pindex) ^ object->hash_rand;
351
352	return(i & vm_page_hash_mask);
353}
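
/*
 * Note added for documentation: because vm_page_bucket_count is a power
 * of 2, the "i & vm_page_hash_mask" above is equivalent to
 * "i % vm_page_bucket_count".  For example, with 1024 buckets the mask
 * is 0x3ff and a hash value of 0x12345 selects bucket 0x345.
 */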
354
355/*
356 *	vm_page_insert:		[ internal use only ]
357 *
358 *	Inserts the given mem entry into the object and object list.
359 *
360 *	The pagetables are not updated but will presumably fault the page
361 *	in if necessary, or if a kernel page the caller will at some point
362 *	enter the page into the kernel's pmap.  We are not allowed to block
363 *	here so we *can't* do this anyway.
364 *
365 *	The object and page must be locked, and must be splhigh.
366 *	This routine may not block.
367 */
368
369void
370vm_page_insert(m, object, pindex)
371	register vm_page_t m;
372	register vm_object_t object;
373	register vm_pindex_t pindex;
374{
375	register struct vm_page **bucket;
376
377	if (m->object != NULL)
378		panic("vm_page_insert: already inserted");
379
380	/*
381	 * Record the object/offset pair in this page
382	 */
383
384	m->object = object;
385	m->pindex = pindex;
386
387	/*
388	 * Insert it into the object_object/offset hash table
389	 */
390
391	bucket = &vm_page_buckets[vm_page_hash(object, pindex)];
392	m->hnext = *bucket;
393	*bucket = m;
394	vm_page_bucket_generation++;
395
396	/*
397	 * Now link into the object's list of backed pages.
398	 */
399
400	TAILQ_INSERT_TAIL(&object->memq, m, listq);
401	m->object->generation++;
402
403	/*
404	 * show that the object has one more resident page.
405	 */
406
407	object->resident_page_count++;
408
409	/*
410	 * Since we are inserting a new and possibly dirty page,
411	 * update the object's OBJ_WRITEABLE and OBJ_MIGHTBEDIRTY flags.
412	 */
413	if (m->flags & PG_WRITEABLE)
414	    vm_object_set_flag(object, OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
415}
416
417/*
418 *	vm_page_remove:
419 *				NOTE: used by device pager as well -wfj
420 *
421 *	Removes the given mem entry from the object/offset-page
422 *	table and the object page list, but does not invalidate/terminate
423 *	the backing store.
424 *
425 *	The object and page must be locked, and at splhigh.
426 *	The underlying pmap entry (if any) is NOT removed here.
427 *	This routine may not block.
428 */
429
430void
431vm_page_remove(m)
432	vm_page_t m;
433{
434	vm_object_t object;
435
436	if (m->object == NULL)
437		return;
438
439#if !defined(MAX_PERF)
440	if ((m->flags & PG_BUSY) == 0) {
441		panic("vm_page_remove: page not busy");
442	}
443#endif
444
445	/*
446	 * Basically destroy the page.
447	 */
448
449	vm_page_wakeup(m);
450
451	object = m->object;
452
453	/*
454	 * Remove from the object/offset hash table.  The page must be
455	 * on the hash queue; we will panic if it isn't
456	 *
457	 * Note: we must NULL-out m->hnext to prevent loops in detached
458	 * buffers with vm_page_lookup().
459	 */
460
461	{
462		struct vm_page **bucket;
463
464		bucket = &vm_page_buckets[vm_page_hash(m->object, m->pindex)];
465		while (*bucket != m) {
466#if !defined(MAX_PERF)
467			if (*bucket == NULL)
468				panic("vm_page_remove(): page not found in hash");
469#endif
470			bucket = &(*bucket)->hnext;
471		}
472		*bucket = m->hnext;
473		m->hnext = NULL;
474		vm_page_bucket_generation++;
475	}
476
477	/*
478	 * Now remove from the object's list of backed pages.
479	 */
480
481	TAILQ_REMOVE(&object->memq, m, listq);
482
483	/*
484	 * And show that the object has one fewer resident page.
485	 */
486
487	object->resident_page_count--;
488	object->generation++;
489
490	m->object = NULL;
491}
492
493/*
494 *	vm_page_lookup:
495 *
496 *	Returns the page associated with the object/offset
497 *	pair specified; if none is found, NULL is returned.
498 *
499 *	NOTE: the code below does not lock.  It will operate properly if
500 *	an interrupt makes a change, but the generation algorithm will not
501 *	operate properly in an SMP environment where both cpus are able to run
502 *	kernel code simultaneously.
503 *
504 *	The object must be locked.  No side effects.
505 *	This routine may not block.
506 *	This is a critical path routine
507 */
508
509vm_page_t
510vm_page_lookup(object, pindex)
511	register vm_object_t object;
512	register vm_pindex_t pindex;
513{
514	register vm_page_t m;
515	register struct vm_page **bucket;
516	int generation;
517
518	/*
519	 * Search the hash table for this object/offset pair
520	 */
521
522retry:
523	generation = vm_page_bucket_generation;
524	bucket = &vm_page_buckets[vm_page_hash(object, pindex)];
525	for (m = *bucket; m != NULL; m = m->hnext) {
526		if ((m->object == object) && (m->pindex == pindex)) {
527			if (vm_page_bucket_generation != generation)
528				goto retry;
529			return (m);
530		}
531	}
532	if (vm_page_bucket_generation != generation)
533		goto retry;
534	return (NULL);
535}
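
#if 0
/*
 * Illustrative sketch added for documentation (not part of the original
 * source): a hypothetical caller that looks up a resident page at splvm()
 * and busies it only if it is not already busy.  The "example_" name is
 * made up.
 */
static vm_page_t
example_lookup_and_busy(vm_object_t object, vm_pindex_t pindex)
{
	vm_page_t m;
	int s;

	s = splvm();
	m = vm_page_lookup(object, pindex);
	if (m != NULL && (m->flags & PG_BUSY) == 0 && m->busy == 0)
		vm_page_busy(m);		/* claim the page */
	else
		m = NULL;			/* absent or busy */
	splx(s);
	return (m);
}
#endif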
536
537/*
538 *	vm_page_rename:
539 *
540 *	Move the given memory entry from its
541 *	current object to the specified target object/offset.
542 *
543 *	The object must be locked.
544 *	This routine may not block.
545 *
546 *	Note: this routine will raise itself to splvm(), the caller need not.
547 *
548 *	Note: swap associated with the page must be invalidated by the move.  We
549 *	      have to do this for several reasons:  (1) we aren't freeing the
550 *	      page, (2) we are dirtying the page, (3) the VM system is probably
551 *	      moving the page from object A to B, and will then later move
552 *	      the backing store from A to B and we can't have a conflict.
553 *
554 *	Note: we *always* dirty the page.  It is necessary both for the
555 *	      fact that we moved it, and because we may be invalidating
556 *	      swap.  If the page is on the cache, we have to deactivate it
557 *	      or vm_page_dirty() will panic.  Dirty pages are not allowed
558 *	      on the cache.
559 */
560
561void
562vm_page_rename(m, new_object, new_pindex)
563	register vm_page_t m;
564	register vm_object_t new_object;
565	vm_pindex_t new_pindex;
566{
567	int s;
568
569	s = splvm();
570	vm_page_remove(m);
571	vm_page_insert(m, new_object, new_pindex);
572	if (m->queue - m->pc == PQ_CACHE)
573		vm_page_deactivate(m);
574	vm_page_dirty(m);
575	splx(s);
576}
577
578/*
579 * vm_page_unqueue_nowakeup:
580 *
581 * 	vm_page_unqueue() without any wakeup
582 *
583 *	This routine must be called at splhigh().
584 *	This routine may not block.
585 */
586
587void
588vm_page_unqueue_nowakeup(m)
589	vm_page_t m;
590{
591	int queue = m->queue;
592	struct vpgqueues *pq;
593	if (queue != PQ_NONE) {
594		pq = &vm_page_queues[queue];
595		m->queue = PQ_NONE;
596		TAILQ_REMOVE(pq->pl, m, pageq);
597		(*pq->cnt)--;
598		(*pq->lcnt)--;
599	}
600}
601
602/*
603 * vm_page_unqueue:
604 *
605 *	Remove a page from its queue.
606 *
607 *	This routine must be called at splhigh().
608 *	This routine may not block.
609 */
610
611void
612vm_page_unqueue(m)
613	vm_page_t m;
614{
615	int queue = m->queue;
616	struct vpgqueues *pq;
617	if (queue != PQ_NONE) {
618		m->queue = PQ_NONE;
619		pq = &vm_page_queues[queue];
620		TAILQ_REMOVE(pq->pl, m, pageq);
621		(*pq->cnt)--;
622		(*pq->lcnt)--;
623		if ((queue - m->pc) == PQ_CACHE) {
624			if ((cnt.v_cache_count + cnt.v_free_count) <
625				(cnt.v_free_reserved + cnt.v_cache_min))
626				pagedaemon_wakeup();
627		}
628	}
629}
630
631#if PQ_L2_SIZE > 1
632
633/*
634 *	vm_page_list_find:
635 *
636 *	Find a page on the specified queue with color optimization.
637 *
638 *	The page coloring optimization attempts to locate a page
639 *	that does not overload other nearby pages in the object in
640 *	the cpu's L1 or L2 caches.  We need this optimization because
641 *	cpu caches tend to be physical caches, while object spaces tend
642 *	to be virtual.
643 *
644 *	This routine must be called at splvm().
645 *	This routine may not block.
646 *
647 *	This routine may only be called from the vm_page_list_find() macro
648 *	in vm_page.h
649 */
650vm_page_t
651_vm_page_list_find(basequeue, index)
652	int basequeue, index;
653{
654	int i;
655	vm_page_t m = NULL;
656	struct vpgqueues *pq;
657
658	pq = &vm_page_queues[basequeue];
659
660	/*
661	 * Note that for the first loop, index+i and index-i wind up at the
662	 * same place.  Even though this is not totally optimal, we've already
663	 * blown it by missing the cache case so we do not care.
664	 */
665
666	for(i = PQ_L2_SIZE / 2; i > 0; --i) {
667		if ((m = TAILQ_FIRST(pq[(index + i) & PQ_L2_MASK].pl)) != NULL)
668			break;
669
670		if ((m = TAILQ_FIRST(pq[(index - i) & PQ_L2_MASK].pl)) != NULL)
671			break;
672	}
673	return(m);
674}
675
676#endif
677
678/*
679 *	vm_page_select_cache:
680 *
681 *	Find a page on the cache queue with color optimization.  Pages
682 *	that are found but not usable (busy, held, or wired) are
683 *	deactivated, which keeps us from using potentially busy cached pages.
684 *
685 *	This routine must be called at splvm().
686 *	This routine may not block.
687 */
688vm_page_t
689vm_page_select_cache(object, pindex)
690	vm_object_t object;
691	vm_pindex_t pindex;
692{
693	vm_page_t m;
694
695	while (TRUE) {
696		m = vm_page_list_find(
697		    PQ_CACHE,
698		    (pindex + object->pg_color) & PQ_L2_MASK,
699		    FALSE
700		);
701		if (m && ((m->flags & PG_BUSY) || m->busy ||
702			       m->hold_count || m->wire_count)) {
703			vm_page_deactivate(m);
704			continue;
705		}
706		return m;
707	}
708}
709
710/*
711 *	vm_page_select_free:
712 *
713 *	Find a free or zero page, with specified preference.  We attempt to
714 *	inline the nominal case and fall back to _vm_page_select_free()
715 *	otherwise.
716 *
717 *	This routine must be called at splvm().
718 *	This routine may not block.
719 */
720
721static __inline vm_page_t
722vm_page_select_free(vm_object_t object, vm_pindex_t pindex, boolean_t prefer_zero)
723{
724	vm_page_t m;
725
726	m = vm_page_list_find(
727		PQ_FREE,
728		(pindex + object->pg_color) & PQ_L2_MASK,
729		prefer_zero
730	);
731	return(m);
732}
733
734/*
735 *	vm_page_alloc:
736 *
737 *	Allocate and return a memory cell associated
738 *	with this VM object/offset pair.
739 *
740 *	page_req classes:
741 *	VM_ALLOC_NORMAL		normal process request
742 *	VM_ALLOC_SYSTEM		system *really* needs a page
743 *	VM_ALLOC_INTERRUPT	interrupt time request
744 *	VM_ALLOC_ZERO		zero page
745 *
746 *	Object must be locked.
747 *	This routine may not block.
748 *
749 *	Additional special handling is required when called from an
750 *	interrupt (VM_ALLOC_INTERRUPT).  We are not allowed to mess with
751 *	the page cache in this case.
752 */
753
754vm_page_t
755vm_page_alloc(object, pindex, page_req)
756	vm_object_t object;
757	vm_pindex_t pindex;
758	int page_req;
759{
760	register vm_page_t m = NULL;
761	int s;
762
763	KASSERT(!vm_page_lookup(object, pindex),
764		("vm_page_alloc: page already allocated"));
765
766	/*
767	 * The pager is allowed to eat deeper into the free page list.
768	 */
769
770	if ((curproc == pageproc) && (page_req != VM_ALLOC_INTERRUPT)) {
771		page_req = VM_ALLOC_SYSTEM;
772	}
773
774	s = splvm();
775
776loop:
777	if (cnt.v_free_count > cnt.v_free_reserved) {
778		/*
779		 * Allocate from the free queue if there are plenty of pages
780		 * in it.
781		 */
782		if (page_req == VM_ALLOC_ZERO)
783			m = vm_page_select_free(object, pindex, TRUE);
784		else
785			m = vm_page_select_free(object, pindex, FALSE);
786	} else if (
787	    (page_req == VM_ALLOC_SYSTEM &&
788	     cnt.v_cache_count == 0 &&
789	     cnt.v_free_count > cnt.v_interrupt_free_min) ||
790	    (page_req == VM_ALLOC_INTERRUPT && cnt.v_free_count > 0)
791	) {
792		/*
793		 * Interrupt or system, dig deeper into the free list.
794		 */
795		m = vm_page_select_free(object, pindex, FALSE);
796	} else if (page_req != VM_ALLOC_INTERRUPT) {
797		/*
798		 * Allocatable from cache (non-interrupt only).  On success,
799		 * we must free the page and try again, thus ensuring that
800		 * cnt.v_*_free_min counters are replenished.
801		 */
802		m = vm_page_select_cache(object, pindex);
803		if (m == NULL) {
804			splx(s);
805#if defined(DIAGNOSTIC)
806			if (cnt.v_cache_count > 0)
807				printf("vm_page_alloc(NORMAL): missing pages on cache queue: %d\n", cnt.v_cache_count);
808#endif
809			vm_pageout_deficit++;
810			pagedaemon_wakeup();
811			return (NULL);
812		}
813		KASSERT(m->dirty == 0, ("Found dirty cache page %p", m));
814		vm_page_busy(m);
815		vm_page_protect(m, VM_PROT_NONE);
816		vm_page_free(m);
817		goto loop;
818	} else {
819		/*
820		 * Not allocatable from cache at interrupt time, give up.
821		 */
822		splx(s);
823		vm_pageout_deficit++;
824		pagedaemon_wakeup();
825		return (NULL);
826	}
827
828	/*
829	 *  At this point we had better have found a good page.
830	 */
831
832	KASSERT(
833	    m != NULL,
834	    ("vm_page_alloc(): missing page on free queue\n")
835	);
836
837	/*
838	 * Remove from free queue
839	 */
840
841	{
842		struct vpgqueues *pq = &vm_page_queues[m->queue];
843
844		TAILQ_REMOVE(pq->pl, m, pageq);
845		(*pq->cnt)--;
846		(*pq->lcnt)--;
847	}
848
849	/*
850	 * Initialize structure.  Only the PG_ZERO flag is inherited.
851	 */
852
853	if (m->flags & PG_ZERO) {
854		vm_page_zero_count--;
855		m->flags = PG_ZERO | PG_BUSY;
856	} else {
857		m->flags = PG_BUSY;
858	}
859	m->wire_count = 0;
860	m->hold_count = 0;
861	m->act_count = 0;
862	m->busy = 0;
863	m->valid = 0;
864	m->dirty = 0;
865	m->queue = PQ_NONE;
866
867	/*
868	 * vm_page_insert() is safe prior to the splx().  Note also that
869	 * inserting a page here does not insert it into the pmap (which
870	 * could cause us to block allocating memory).  We cannot block
871	 * anywhere.
872	 */
873
874	vm_page_insert(m, object, pindex);
875
876	/*
877	 * Don't wakeup too often - wakeup the pageout daemon when
878	 * we would be nearly out of memory.
879	 */
880	if (((cnt.v_free_count + cnt.v_cache_count) <
881		(cnt.v_free_reserved + cnt.v_cache_min)) ||
882			(cnt.v_free_count < cnt.v_pageout_free_min))
883		pagedaemon_wakeup();
884
885	splx(s);
886
887	return (m);
888}
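
#if 0
/*
 * Illustrative sketch added for documentation (not part of the original
 * source): the common caller pattern for vm_page_alloc() -- block in
 * VM_WAIT and retry when both the free and cache queues are depleted.
 * The "example_" name is made up.
 */
static vm_page_t
example_alloc_retry(vm_object_t object, vm_pindex_t pindex)
{
	vm_page_t m;

	while ((m = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL)) == NULL) {
		VM_WAIT;	/* sleep until free pages are available */
	}
	return (m);
}
#endif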
889
890/*
891 *	vm_wait:	(also see VM_WAIT macro)
892 *
893 *	Block until free pages are available for allocation
894 */
895
896void
897vm_wait()
898{
899	int s;
900
901	s = splvm();
902	if (curproc == pageproc) {
903		vm_pageout_pages_needed = 1;
904		tsleep(&vm_pageout_pages_needed, PSWP, "vmwait", 0);
905	} else {
906		if (!vm_pages_needed) {
907			vm_pages_needed++;
908			wakeup(&vm_pages_needed);
909		}
910		tsleep(&cnt.v_free_count, PVM, "vmwait", 0);
911	}
912	splx(s);
913}
914
915/*
916 *	vm_await:	(also see VM_AWAIT macro)
917 *
918 *	asleep on an event that will signal when free pages are available
919 *	for allocation.
920 */
921
922void
923vm_await()
924{
925	int s;
926
927	s = splvm();
928	if (curproc == pageproc) {
929		vm_pageout_pages_needed = 1;
930		asleep(&vm_pageout_pages_needed, PSWP, "vmwait", 0);
931	} else {
932		if (!vm_pages_needed) {
933			vm_pages_needed++;
934			wakeup(&vm_pages_needed);
935		}
936		asleep(&cnt.v_free_count, PVM, "vmwait", 0);
937	}
938	splx(s);
939}
940
941#if 0
942/*
943 *	vm_page_sleep:
944 *
945 *	Block until page is no longer busy.
946 */
947
948int
949vm_page_sleep(vm_page_t m, char *msg, char *busy) {
950	int slept = 0;
951	if ((busy && *busy) || (m->flags & PG_BUSY)) {
952		int s;
953		s = splvm();
954		if ((busy && *busy) || (m->flags & PG_BUSY)) {
955			vm_page_flag_set(m, PG_WANTED);
956			tsleep(m, PVM, msg, 0);
957			slept = 1;
958		}
959		splx(s);
960	}
961	return slept;
962}
963
964#endif
965
966#if 0
967
968/*
969 *	vm_page_asleep:
970 *
971 *	Similar to vm_page_sleep(), but does not block.  Returns 0 if
972 *	the page is not busy, or 1 if the page is busy.
973 *
974 *	This routine has the side effect of calling asleep() if the page
975 *	was busy (1 returned).
976 */
977
978int
979vm_page_asleep(vm_page_t m, char *msg, char *busy) {
980	int slept = 0;
981	if ((busy && *busy) || (m->flags & PG_BUSY)) {
982		int s;
983		s = splvm();
984		if ((busy && *busy) || (m->flags & PG_BUSY)) {
985			vm_page_flag_set(m, PG_WANTED);
986			asleep(m, PVM, msg, 0);
987			slept = 1;
988		}
989		splx(s);
990	}
991	return slept;
992}
993
994#endif
995
996/*
997 *	vm_page_activate:
998 *
999 *	Put the specified page on the active list (if appropriate).
1000 *
1001 *	The page queues must be locked.
1002 *	This routine may not block.
1003 */
1004void
1005vm_page_activate(m)
1006	register vm_page_t m;
1007{
1008	int s;
1009
1010	s = splvm();
1011	if (m->queue != PQ_ACTIVE) {
1012		if ((m->queue - m->pc) == PQ_CACHE)
1013			cnt.v_reactivated++;
1014
1015		vm_page_unqueue(m);
1016
1017		if (m->wire_count == 0) {
1018			m->queue = PQ_ACTIVE;
1019			++(*vm_page_queues[PQ_ACTIVE].lcnt);
1020			TAILQ_INSERT_TAIL(&vm_page_queue_active, m, pageq);
1021			if (m->act_count < ACT_INIT)
1022				m->act_count = ACT_INIT;
1023			cnt.v_active_count++;
1024		}
1025	} else {
1026		if (m->act_count < ACT_INIT)
1027			m->act_count = ACT_INIT;
1028	}
1029
1030	splx(s);
1031}
1032
1033/*
1034 *	vm_page_free_wakeup:
1035 *
1036 *	Helper routine for vm_page_free_toq() and vm_page_cache().  This
1037 *	routine is called when a page has been added to the cache or free
1038 *	queues.
1039 *
1040 *	This routine may not block.
1041 *	This routine must be called at splvm()
1042 */
1043static __inline void
1044vm_page_free_wakeup()
1045{
1046	/*
1047	 * if pageout daemon needs pages, then tell it that there are
1048	 * some free.
1049	 */
1050	if (vm_pageout_pages_needed) {
1051		wakeup(&vm_pageout_pages_needed);
1052		vm_pageout_pages_needed = 0;
1053	}
1054	/*
1055	 * wakeup processes that are waiting on memory if we hit a
1056	 * high water mark, and wakeup the scheduler process if we have
1057	 * lots of memory.  That process will swap in processes.
1058	 */
1059	if (vm_pages_needed &&
1060		((cnt.v_free_count + cnt.v_cache_count) >= cnt.v_free_min)) {
1061		wakeup(&cnt.v_free_count);
1062		vm_pages_needed = 0;
1063	}
1064}
1065
1066/*
1067 *	vm_page_free_toq:
1068 *
1069 *	Returns the given page to the appropriate PQ_FREE list,
1070 *	disassociating it from any VM object.
1071 *
1072 *	Object and page must be locked prior to entry.
1073 *	This routine may not block.
1074 */
1075
1076void
1077vm_page_free_toq(vm_page_t m)
1078{
1079	int s;
1080	struct vpgqueues *pq;
1081	vm_object_t object = m->object;
1082
1083	s = splvm();
1084
1085	cnt.v_tfree++;
1086
1087#if !defined(MAX_PERF)
1088	if (m->busy || ((m->queue - m->pc) == PQ_FREE) ||
1089		(m->hold_count != 0)) {
1090		printf(
1091		"vm_page_free: pindex(%lu), busy(%d), PG_BUSY(%d), hold(%d)\n",
1092		    (u_long)m->pindex, m->busy, (m->flags & PG_BUSY) ? 1 : 0,
1093		    m->hold_count);
1094		if ((m->queue - m->pc) == PQ_FREE)
1095			panic("vm_page_free: freeing free page");
1096		else
1097			panic("vm_page_free: freeing busy page");
1098	}
1099#endif
1100
1101	/*
1102	 * unqueue, then remove page.  Note that we cannot destroy
1103	 * the page here because we do not want to call the pager's
1104	 * callback routine until after we've put the page on the
1105	 * appropriate free queue.
1106	 */
1107
1108	vm_page_unqueue_nowakeup(m);
1109	vm_page_remove(m);
1110
1111	/*
1112	 * If the page is fictitious we are done; fictitious pages are
1113	 * never placed on the free queues.
1114	 */
1115
1116	if ((m->flags & PG_FICTITIOUS) != 0) {
1117		splx(s);
1118		return;
1119	}
1120
1121	m->valid = 0;
1122
1123	if (m->wire_count != 0) {
1124#if !defined(MAX_PERF)
1125		if (m->wire_count > 1) {
1126			panic("vm_page_free: invalid wire count (%d), pindex: 0x%x",
1127				m->wire_count, m->pindex);
1128		}
1129#endif
1130		printf("vm_page_free: freeing wired page\n");
1131		m->wire_count = 0;
1132		cnt.v_wire_count--;
1133	}
1134
1135	/*
1136	 * If we've exhausted the object's resident pages we want to free
1137	 * it up.
1138	 */
1139
1140	if (object &&
1141	    (object->type == OBJT_VNODE) &&
1142	    ((object->flags & OBJ_DEAD) == 0)
1143	) {
1144		struct vnode *vp = (struct vnode *)object->handle;
1145
1146		if (vp && VSHOULDFREE(vp)) {
1147			if ((vp->v_flag & (VTBFREE|VDOOMED|VFREE)) == 0) {
1148				TAILQ_INSERT_TAIL(&vnode_tobefree_list, vp, v_freelist);
1149				vp->v_flag |= VTBFREE;
1150			}
1151		}
1152	}
1153
1154#ifdef __alpha__
1155	pmap_page_is_free(m);
1156#endif
1157
1158	m->queue = PQ_FREE + m->pc;
1159	pq = &vm_page_queues[m->queue];
1160	++(*pq->lcnt);
1161	++(*pq->cnt);
1162
1163	/*
1164	 * Put zero'd pages on the end ( where we look for zero'd pages
1165	 * first ) and non-zero'd pages at the head.
1166	 */
1167
1168	if (m->flags & PG_ZERO) {
1169		TAILQ_INSERT_TAIL(pq->pl, m, pageq);
1170		++vm_page_zero_count;
1171	} else if (curproc == pageproc) {
1172		/*
1173		 * If the pageout daemon is freeing pages, the pages are
1174		 * likely to NOT be in the L1 or L2 caches due to their age.
1175		 * For now we do not try to do anything special with this
1176		 * info.
1177		 */
1178		TAILQ_INSERT_HEAD(pq->pl, m, pageq);
1179	} else {
1180		TAILQ_INSERT_HEAD(pq->pl, m, pageq);
1181	}
1182
1183	vm_page_free_wakeup();
1184
1185	splx(s);
1186}
1187
1188/*
1189 *	vm_page_wire:
1190 *
1191 *	Mark this page as wired down by yet
1192 *	another map, removing it from paging queues
1193 *	as necessary.
1194 *
1195 *	The page queues must be locked.
1196 *	This routine may not block.
1197 */
1198void
1199vm_page_wire(m)
1200	register vm_page_t m;
1201{
1202	int s;
1203
1204	s = splvm();
1205	if (m->wire_count == 0) {
1206		vm_page_unqueue(m);
1207		cnt.v_wire_count++;
1208	}
1209	m->wire_count++;
1210	splx(s);
1211	(*vm_page_queues[PQ_NONE].lcnt)++;
1212	vm_page_flag_set(m, PG_MAPPED);
1213}
1214
1215/*
1216 *	vm_page_unwire:
1217 *
1218 *	Release one wiring of this page, potentially
1219 *	enabling it to be paged again.
1220 *
1221 *	Many pages placed on the inactive queue should actually go
1222 *	into the cache, but it is difficult to figure out which.  What
1223 *	we do instead, if the inactive target is well met, is to put
1224 *	clean pages at the head of the inactive queue instead of the tail.
1225 *	This will cause them to be moved to the cache more quickly and
1226 *	if not actively re-referenced, freed more quickly.  If we just
1227 *	stick these pages at the end of the inactive queue, heavy filesystem
1228 *	meta-data accesses can cause an unnecessary paging load on memory bound
1229 *	processes.  This optimization causes one-time-use metadata to be
1230 *	reused more quickly.
1231 *
1232 *	A number of routines use vm_page_unwire() to guarantee that the page
1233 *	will go into either the inactive or active queues, and will NEVER
1234 *	be placed in the cache - for example, just after dirtying a page.
1235 *	Dirty pages are not allowed in the cache.
1236 *
1237 *	The page queues must be locked.
1238 *	This routine may not block.
1239 */
1240void
1241vm_page_unwire(m, activate)
1242	register vm_page_t m;
1243	int activate;
1244{
1245	int s;
1246
1247	s = splvm();
1248
1249	if (m->wire_count > 0) {
1250		m->wire_count--;
1251		if (m->wire_count == 0) {
1252			cnt.v_wire_count--;
1253			if (activate) {
1254				TAILQ_INSERT_TAIL(&vm_page_queue_active, m, pageq);
1255				m->queue = PQ_ACTIVE;
1256				(*vm_page_queues[PQ_ACTIVE].lcnt)++;
1257				cnt.v_active_count++;
1258			} else {
1259				TAILQ_INSERT_TAIL(&vm_page_queue_inactive, m, pageq);
1260				m->queue = PQ_INACTIVE;
1261				(*vm_page_queues[PQ_INACTIVE].lcnt)++;
1262				cnt.v_inactive_count++;
1263			}
1264		}
1265	} else {
1266#if !defined(MAX_PERF)
1267		panic("vm_page_unwire: invalid wire count: %d\n", m->wire_count);
1268#endif
1269	}
1270	splx(s);
1271}
1272
1273
1274/*
1275 * Move the specified page to the inactive queue.  If the page has
1276 * any associated swap, the swap is deallocated.
1277 *
1278 * This routine may not block.
1279 */
1280void
1281vm_page_deactivate(m)
1282	register vm_page_t m;
1283{
1284	int s;
1285
1286	/*
1287	 * Ignore if already inactive.
1288	 */
1289	if (m->queue == PQ_INACTIVE)
1290		return;
1291
1292	s = splvm();
1293	if (m->wire_count == 0) {
1294		if ((m->queue - m->pc) == PQ_CACHE)
1295			cnt.v_reactivated++;
1296		vm_page_unqueue(m);
1297		TAILQ_INSERT_TAIL(&vm_page_queue_inactive, m, pageq);
1298		m->queue = PQ_INACTIVE;
1299		++(*vm_page_queues[PQ_INACTIVE].lcnt);
1300		cnt.v_inactive_count++;
1301	}
1302	splx(s);
1303}
1304
1305/*
1306 * vm_page_cache
1307 *
1308 * Put the specified page onto the page cache queue (if appropriate).
1309 *
1310 * This routine may not block.
1311 */
1312void
1313vm_page_cache(m)
1314	register vm_page_t m;
1315{
1316	int s;
1317
1318#if !defined(MAX_PERF)
1319	if ((m->flags & PG_BUSY) || m->busy || m->wire_count) {
1320		printf("vm_page_cache: attempting to cache busy page\n");
1321		return;
1322	}
1323#endif
1324	if ((m->queue - m->pc) == PQ_CACHE)
1325		return;
1326
1327	/*
1328	 * Remove all pmaps and indicate that the page is not
1329	 * writeable or mapped.
1330	 */
1331
1332	vm_page_protect(m, VM_PROT_NONE);
1333#if !defined(MAX_PERF)
1334	if (m->dirty != 0) {
1335		panic("vm_page_cache: caching a dirty page, pindex: %d", m->pindex);
1336	}
1337#endif
1338	s = splvm();
1339	vm_page_unqueue_nowakeup(m);
1340	m->queue = PQ_CACHE + m->pc;
1341	(*vm_page_queues[m->queue].lcnt)++;
1342	TAILQ_INSERT_TAIL(vm_page_queues[m->queue].pl, m, pageq);
1343	cnt.v_cache_count++;
1344	vm_page_free_wakeup();
1345	splx(s);
1346}
1347
1348/*
1349 * Grab a page, waiting until we are woken up due to the page
1350 * changing state.  We keep on waiting as long as the page remains
1351 * busy in the object.  If the page doesn't exist, allocate it.
1352 *
1353 * This routine may block.
1354 */
1355vm_page_t
1356vm_page_grab(object, pindex, allocflags)
1357	vm_object_t object;
1358	vm_pindex_t pindex;
1359	int allocflags;
1360{
1361
1362	vm_page_t m;
1363	int s, generation;
1364
1365retrylookup:
1366	if ((m = vm_page_lookup(object, pindex)) != NULL) {
1367		if (m->busy || (m->flags & PG_BUSY)) {
1368			generation = object->generation;
1369
1370			s = splvm();
1371			while ((object->generation == generation) &&
1372					(m->busy || (m->flags & PG_BUSY))) {
1373				vm_page_flag_set(m, PG_WANTED | PG_REFERENCED);
1374				tsleep(m, PVM, "pgrbwt", 0);
1375				if ((allocflags & VM_ALLOC_RETRY) == 0) {
1376					splx(s);
1377					return NULL;
1378				}
1379			}
1380			splx(s);
1381			goto retrylookup;
1382		} else {
1383			vm_page_busy(m);
1384			return m;
1385		}
1386	}
1387
1388	m = vm_page_alloc(object, pindex, allocflags & ~VM_ALLOC_RETRY);
1389	if (m == NULL) {
1390		VM_WAIT;
1391		if ((allocflags & VM_ALLOC_RETRY) == 0)
1392			return NULL;
1393		goto retrylookup;
1394	}
1395
1396	return m;
1397}
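
#if 0
/*
 * Illustrative sketch added for documentation (not part of the original
 * source): with VM_ALLOC_RETRY set, vm_page_grab() sleeps and retries
 * rather than returning NULL, so the caller always gets back a busied
 * page.  The "example_" name is made up.
 */
static vm_page_t
example_grab(vm_object_t object, vm_pindex_t pindex)
{
	vm_page_t m;

	m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
	if (m->valid != VM_PAGE_BITS_ALL) {
		/* caller would zero-fill or page in the data here */
	}
	return (m);
}
#endif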
1398
1399/*
1400 * Mapping function for valid bits or for dirty bits in
1401 * a page.  May not block.
1402 *
1403 * Inputs are required to range within a page.
1404 */
1405
1406__inline int
1407vm_page_bits(int base, int size)
1408{
1409	int first_bit;
1410	int last_bit;
1411
1412	KASSERT(
1413	    base + size <= PAGE_SIZE,
1414	    ("vm_page_bits: illegal base/size %d/%d", base, size)
1415	);
1416
1417	if (size == 0)		/* handle degenerate case */
1418		return(0);
1419
1420	first_bit = base >> DEV_BSHIFT;
1421	last_bit = (base + size - 1) >> DEV_BSHIFT;
1422
1423	return ((2 << last_bit) - (1 << first_bit));
1424}
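
/*
 * Worked example added for documentation, assuming DEV_BSIZE is 512
 * (DEV_BSHIFT == 9): vm_page_bits(0, 1024) gives first_bit = 0 and
 * last_bit = 1, so the result is (2 << 1) - (1 << 0) = 0x03, i.e. the
 * first two DEV_BSIZE chunks.  Likewise vm_page_bits(512, 512) gives
 * (2 << 1) - (1 << 1) = 0x02.
 */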
1425
1426/*
1427 *	vm_page_set_validclean:
1428 *
1429 *	Sets portions of a page valid and clean.  The arguments are expected
1430 *	to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive
1431 *	of any partial chunks touched by the range.  The invalid portion of
1432 *	such chunks will be zero'd.
1433 *
1434 *	This routine may not block.
1435 *
1436 *	(base + size) must be less than or equal to PAGE_SIZE.
1437 */
1438void
1439vm_page_set_validclean(m, base, size)
1440	vm_page_t m;
1441	int base;
1442	int size;
1443{
1444	int pagebits;
1445	int frag;
1446	int endoff;
1447
1448	if (size == 0)	/* handle degenerate case */
1449		return;
1450
1451	/*
1452	 * If the base is not DEV_BSIZE aligned and the valid
1453	 * bit is clear, we have to zero out a portion of the
1454	 * first block.
1455	 */
1456
1457	if ((frag = base & ~(DEV_BSIZE - 1)) != base &&
1458	    (m->valid & (1 << (base >> DEV_BSHIFT))) == 0
1459	) {
1460		pmap_zero_page_area(
1461		    VM_PAGE_TO_PHYS(m),
1462		    frag,
1463		    base - frag
1464		);
1465	}
1466
1467	/*
1468	 * If the ending offset is not DEV_BSIZE aligned and the
1469	 * valid bit is clear, we have to zero out a portion of
1470	 * the last block.
1471	 */
1472
1473	endoff = base + size;
1474
1475	if ((frag = endoff & ~(DEV_BSIZE - 1)) != endoff &&
1476	    (m->valid & (1 << (endoff >> DEV_BSHIFT))) == 0
1477	) {
1478		pmap_zero_page_area(
1479		    VM_PAGE_TO_PHYS(m),
1480		    endoff,
1481		    DEV_BSIZE - (endoff & (DEV_BSIZE - 1))
1482		);
1483	}
1484
1485	/*
1486	 * Set valid, clear dirty bits.  If validating the entire
1487	 * page we can safely clear the pmap modify bit.
1488	 */
1489
1490	pagebits = vm_page_bits(base, size);
1491	m->valid |= pagebits;
1492	m->dirty &= ~pagebits;
1493
1494	if (base == 0 && size == PAGE_SIZE)
1495		pmap_clear_modify(VM_PAGE_TO_PHYS(m));
1496}
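
/*
 * Worked example added for documentation, assuming DEV_BSIZE is 512:
 * vm_page_set_validclean(m, 100, 200) on a page whose first block is
 * currently invalid zeroes bytes [0, 100) and [300, 512), then marks
 * the first DEV_BSIZE chunk valid and clean.
 */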
1497
1498#if 0
1499
1500void
1501vm_page_set_dirty(m, base, size)
1502	vm_page_t m;
1503	int base;
1504	int size;
1505{
1506	m->dirty |= vm_page_bits(base, size);
1507}
1508
1509#endif
1510
1511void
1512vm_page_clear_dirty(m, base, size)
1513	vm_page_t m;
1514	int base;
1515	int size;
1516{
1517	m->dirty &= ~vm_page_bits(base, size);
1518}
1519
1520/*
1521 *	vm_page_set_invalid:
1522 *
1523 *	Invalidates DEV_BSIZE'd chunks within a page.  Both the
1524 *	valid and dirty bits for the affected areas are cleared.
1525 *
1526 *	May not block.
1527 */
1528void
1529vm_page_set_invalid(m, base, size)
1530	vm_page_t m;
1531	int base;
1532	int size;
1533{
1534	int bits;
1535
1536	bits = vm_page_bits(base, size);
1537	m->valid &= ~bits;
1538	m->dirty &= ~bits;
1539	m->object->generation++;
1540}
1541
1542/*
1543 * vm_page_zero_invalid()
1544 *
1545 *	The kernel assumes that the invalid portions of a page contain
1546 *	garbage, but such pages can be mapped into memory by user code.
1547 *	When this occurs, we must zero out the non-valid portions of the
1548 *	page so user code sees what it expects.
1549 *
1550 *	Pages are most often semi-valid when the end of a file is mapped
1551 *	into memory and the file's size is not page aligned.
1552 */
1553
1554void
1555vm_page_zero_invalid(vm_page_t m, boolean_t setvalid)
1556{
1557	int b;
1558	int i;
1559
1560	/*
1561	 * Scan the valid bits looking for invalid sections that
1562	 * must be zero'd.  Invalid sub-DEV_BSIZE'd areas ( where the
1563	 * valid bit may be set ) have already been zero'd by
1564	 * vm_page_set_validclean().
1565	 */
1566
1567	for (b = i = 0; i <= PAGE_SIZE / DEV_BSIZE; ++i) {
1568		if (i == (PAGE_SIZE / DEV_BSIZE) ||
1569		    (m->valid & (1 << i))
1570		) {
1571			if (i > b) {
1572				pmap_zero_page_area(
1573				    VM_PAGE_TO_PHYS(m),
1574				    b << DEV_BSHIFT,
1575				    (i - b) << DEV_BSHIFT
1576				);
1577			}
1578			b = i + 1;
1579		}
1580	}
1581
1582	/*
1583	 * setvalid is TRUE when we can safely set the zero'd areas
1584	 * as being valid.  We can do this if there are no cache consistency
1585	 * issues.  E.g. it is ok to do with UFS, but not ok to do with NFS.
1586	 */
1587
1588	if (setvalid)
1589		m->valid = VM_PAGE_BITS_ALL;
1590}
1591
1592/*
1593 *	vm_page_is_valid:
1594 *
1595 *	Is (partial) page valid?  Note that in the degenerate case where
1596 *	size == 0, this returns FALSE if the page is entirely invalid,
1597 *	and TRUE otherwise.
1598 *
1599 *	May not block.
1600 */
1601
1602int
1603vm_page_is_valid(m, base, size)
1604	vm_page_t m;
1605	int base;
1606	int size;
1607{
1608	int bits = vm_page_bits(base, size);
1609
1610	if (m->valid && ((m->valid & bits) == bits))
1611		return 1;
1612	else
1613		return 0;
1614}
1615
1616/*
1617 * update dirty bits from pmap/mmu.  May not block.
1618 */
1619
1620void
1621vm_page_test_dirty(m)
1622	vm_page_t m;
1623{
1624	if ((m->dirty != VM_PAGE_BITS_ALL) &&
1625	    pmap_is_modified(VM_PAGE_TO_PHYS(m))) {
1626		vm_page_dirty(m);
1627	}
1628}
1629
1630/*
1631 * This interface is for merging with malloc() someday.
1632 * Even if we never implement compaction so that contiguous allocation
1633 * works after initialization time, malloc()'s data structures are good
1634 * for statistics and for allocations of less than a page.
1635 */
1636void *
1637contigmalloc1(size, type, flags, low, high, alignment, boundary, map)
1638	unsigned long size;	/* should be size_t here and for malloc() */
1639	struct malloc_type *type;
1640	int flags;
1641	unsigned long low;
1642	unsigned long high;
1643	unsigned long alignment;
1644	unsigned long boundary;
1645	vm_map_t map;
1646{
1647	int i, s, start;
1648	vm_offset_t addr, phys, tmp_addr;
1649	int pass;
1650	vm_page_t pga = vm_page_array;
1651
1652	size = round_page(size);
1653#if !defined(MAX_PERF)
1654	if (size == 0)
1655		panic("contigmalloc1: size must not be 0");
1656	if ((alignment & (alignment - 1)) != 0)
1657		panic("contigmalloc1: alignment must be a power of 2");
1658	if ((boundary & (boundary - 1)) != 0)
1659		panic("contigmalloc1: boundary must be a power of 2");
1660#endif
1661
1662	start = 0;
1663	for (pass = 0; pass <= 1; pass++) {
1664		s = splvm();
1665again:
1666		/*
1667		 * Find first page in array that is free, within range, aligned, and
1668		 * such that the boundary won't be crossed.
1669		 */
1670		for (i = start; i < cnt.v_page_count; i++) {
1671			int pqtype;
1672			phys = VM_PAGE_TO_PHYS(&pga[i]);
1673			pqtype = pga[i].queue - pga[i].pc;
1674			if (((pqtype == PQ_FREE) || (pqtype == PQ_CACHE)) &&
1675			    (phys >= low) && (phys < high) &&
1676			    ((phys & (alignment - 1)) == 0) &&
1677			    (((phys ^ (phys + size - 1)) & ~(boundary - 1)) == 0))
1678				break;
1679		}
1680
1681		/*
1682		 * If the above failed or we will exceed the upper bound, fail.
1683		 */
1684		if ((i == cnt.v_page_count) ||
1685			((VM_PAGE_TO_PHYS(&pga[i]) + size) > high)) {
1686			vm_page_t m, next;
1687
1688again1:
1689			for (m = TAILQ_FIRST(&vm_page_queue_inactive);
1690				m != NULL;
1691				m = next) {
1692
1693				if (m->queue != PQ_INACTIVE) {
1694					break;
1695				}
1696
1697				next = TAILQ_NEXT(m, pageq);
1698				if (vm_page_sleep_busy(m, TRUE, "vpctw0"))
1699					goto again1;
1700				vm_page_test_dirty(m);
1701				if (m->dirty) {
1702					if (m->object->type == OBJT_VNODE) {
1703						vn_lock(m->object->handle, LK_EXCLUSIVE | LK_RETRY, curproc);
1704						vm_object_page_clean(m->object, 0, 0, OBJPC_SYNC);
1705						VOP_UNLOCK(m->object->handle, 0, curproc);
1706						goto again1;
1707					} else if (m->object->type == OBJT_SWAP ||
1708								m->object->type == OBJT_DEFAULT) {
1709						vm_pageout_flush(&m, 1, 0);
1710						goto again1;
1711					}
1712				}
1713				if ((m->dirty == 0) && (m->busy == 0) && (m->hold_count == 0))
1714					vm_page_cache(m);
1715			}
1716
1717			for (m = TAILQ_FIRST(&vm_page_queue_active);
1718				m != NULL;
1719				m = next) {
1720
1721				if (m->queue != PQ_ACTIVE) {
1722					break;
1723				}
1724
1725				next = TAILQ_NEXT(m, pageq);
1726				if (vm_page_sleep_busy(m, TRUE, "vpctw1"))
1727					goto again1;
1728				vm_page_test_dirty(m);
1729				if (m->dirty) {
1730					if (m->object->type == OBJT_VNODE) {
1731						vn_lock(m->object->handle, LK_EXCLUSIVE | LK_RETRY, curproc);
1732						vm_object_page_clean(m->object, 0, 0, OBJPC_SYNC);
1733						VOP_UNLOCK(m->object->handle, 0, curproc);
1734						goto again1;
1735					} else if (m->object->type == OBJT_SWAP ||
1736								m->object->type == OBJT_DEFAULT) {
1737						vm_pageout_flush(&m, 1, 0);
1738						goto again1;
1739					}
1740				}
1741				if ((m->dirty == 0) && (m->busy == 0) && (m->hold_count == 0))
1742					vm_page_cache(m);
1743			}
1744
1745			splx(s);
1746			continue;
1747		}
1748		start = i;
1749
1750		/*
1751		 * Check successive pages for contiguous and free.
1752		 */
1753		for (i = start + 1; i < (start + size / PAGE_SIZE); i++) {
1754			int pqtype;
1755			pqtype = pga[i].queue - pga[i].pc;
1756			if ((VM_PAGE_TO_PHYS(&pga[i]) !=
1757			    (VM_PAGE_TO_PHYS(&pga[i - 1]) + PAGE_SIZE)) ||
1758			    ((pqtype != PQ_FREE) && (pqtype != PQ_CACHE))) {
1759				start++;
1760				goto again;
1761			}
1762		}
1763
1764		for (i = start; i < (start + size / PAGE_SIZE); i++) {
1765			int pqtype;
1766			vm_page_t m = &pga[i];
1767
1768			pqtype = m->queue - m->pc;
1769			if (pqtype == PQ_CACHE) {
1770				vm_page_busy(m);
1771				vm_page_free(m);
1772			}
1773
1774			TAILQ_REMOVE(vm_page_queues[m->queue].pl, m, pageq);
1775			(*vm_page_queues[m->queue].lcnt)--;
1776			cnt.v_free_count--;
1777			m->valid = VM_PAGE_BITS_ALL;
1778			m->flags = 0;
1779			m->dirty = 0;
1780			m->wire_count = 0;
1781			m->busy = 0;
1782			m->queue = PQ_NONE;
1783			m->object = NULL;
1784			vm_page_wire(m);
1785		}
1786
1787		/*
1788		 * We've found a contiguous chunk that meets our requirements.
1789		 * Allocate kernel VM, unfree and assign the physical pages to
1790		 * it, and return the kernel VM pointer.
1791		 */
1792		tmp_addr = addr = kmem_alloc_pageable(map, size);
1793		if (addr == 0) {
1794			/*
1795			 * XXX We almost never run out of kernel virtual
1796			 * space, so we don't make the allocated memory
1797			 * above available.
1798			 */
1799			splx(s);
1800			return (NULL);
1801		}
1802
1803		for (i = start; i < (start + size / PAGE_SIZE); i++) {
1804			vm_page_t m = &pga[i];
1805			vm_page_insert(m, kernel_object,
1806				OFF_TO_IDX(tmp_addr - VM_MIN_KERNEL_ADDRESS));
1807			pmap_kenter(tmp_addr, VM_PAGE_TO_PHYS(m));
1808			tmp_addr += PAGE_SIZE;
1809		}
1810
1811		splx(s);
1812		return ((void *)addr);
1813	}
1814	return NULL;
1815}
1816
1817void *
1818contigmalloc(size, type, flags, low, high, alignment, boundary)
1819	unsigned long size;	/* should be size_t here and for malloc() */
1820	struct malloc_type *type;
1821	int flags;
1822	unsigned long low;
1823	unsigned long high;
1824	unsigned long alignment;
1825	unsigned long boundary;
1826{
1827	return contigmalloc1(size, type, flags, low, high, alignment, boundary,
1828			     kernel_map);
1829}
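
#if 0
/*
 * Illustrative sketch added for documentation (not part of the original
 * source): a hypothetical driver allocating a 64KB buffer below 16MB,
 * 64KB-aligned and not crossing a 64KB boundary, as an ISA DMA engine
 * would require.  The "example_" name is made up.
 */
static void *
example_isa_dma_buffer(void)
{
	return (contigmalloc(65536, M_DEVBUF, M_NOWAIT,
	    0ul, 0xfffffful, 65536ul, 65536ul));
}
#endif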
1830
1831vm_offset_t
1832vm_page_alloc_contig(size, low, high, alignment)
1833	vm_offset_t size;
1834	vm_offset_t low;
1835	vm_offset_t high;
1836	vm_offset_t alignment;
1837{
1838	return ((vm_offset_t)contigmalloc1(size, M_DEVBUF, M_NOWAIT, low, high,
1839					  alignment, 0ul, kernel_map));
1840}
1841
1842#include "opt_ddb.h"
1843#ifdef DDB
1844#include <sys/kernel.h>
1845
1846#include <ddb/ddb.h>
1847
1848DB_SHOW_COMMAND(page, vm_page_print_page_info)
1849{
1850	db_printf("cnt.v_free_count: %d\n", cnt.v_free_count);
1851	db_printf("cnt.v_cache_count: %d\n", cnt.v_cache_count);
1852	db_printf("cnt.v_inactive_count: %d\n", cnt.v_inactive_count);
1853	db_printf("cnt.v_active_count: %d\n", cnt.v_active_count);
1854	db_printf("cnt.v_wire_count: %d\n", cnt.v_wire_count);
1855	db_printf("cnt.v_free_reserved: %d\n", cnt.v_free_reserved);
1856	db_printf("cnt.v_free_min: %d\n", cnt.v_free_min);
1857	db_printf("cnt.v_free_target: %d\n", cnt.v_free_target);
1858	db_printf("cnt.v_cache_min: %d\n", cnt.v_cache_min);
1859	db_printf("cnt.v_inactive_target: %d\n", cnt.v_inactive_target);
1860}
1861
1862DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info)
1863{
1864	int i;
1865	db_printf("PQ_FREE:");
1866	for(i=0;i<PQ_L2_SIZE;i++) {
1867		db_printf(" %d", *vm_page_queues[PQ_FREE + i].lcnt);
1868	}
1869	db_printf("\n");
1870
1871	db_printf("PQ_CACHE:");
1872	for(i=0;i<PQ_L2_SIZE;i++) {
1873		db_printf(" %d", *vm_page_queues[PQ_CACHE + i].lcnt);
1874	}
1875	db_printf("\n");
1876
1877	db_printf("PQ_ACTIVE: %d, PQ_INACTIVE: %d\n",
1878		*vm_page_queues[PQ_ACTIVE].lcnt,
1879		*vm_page_queues[PQ_INACTIVE].lcnt);
1880}
1881#endif /* DDB */
1882