vm_page.c revision 43121
1/*
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 *
5 * This code is derived from software contributed to Berkeley by
6 * The Mach Operating System project at Carnegie-Mellon University.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 * 3. All advertising materials mentioning features or use of this software
17 *    must display the following acknowledgement:
18 *	This product includes software developed by the University of
19 *	California, Berkeley and its contributors.
20 * 4. Neither the name of the University nor the names of its contributors
21 *    may be used to endorse or promote products derived from this software
22 *    without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 *
36 *	from: @(#)vm_page.c	7.4 (Berkeley) 5/7/91
37 *	$Id: vm_page.c,v 1.118 1999/01/21 10:01:49 dillon Exp $
38 */
39
40/*
41 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
42 * All rights reserved.
43 *
44 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
45 *
46 * Permission to use, copy, modify and distribute this software and
47 * its documentation is hereby granted, provided that both the copyright
48 * notice and this permission notice appear in all copies of the
49 * software, derivative works or modified versions, and any portions
50 * thereof, and that both notices appear in supporting documentation.
51 *
52 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
53 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
54 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
55 *
56 * Carnegie Mellon requests users of this software to return to
57 *
58 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
59 *  School of Computer Science
60 *  Carnegie Mellon University
61 *  Pittsburgh PA 15213-3890
62 *
63 * any improvements or extensions that they make and grant Carnegie the
64 * rights to redistribute these changes.
65 */
66
67/*
68 *	Resident memory management module.
69 */
70
71#include <sys/param.h>
72#include <sys/systm.h>
73#include <sys/malloc.h>
74#include <sys/proc.h>
75#include <sys/vmmeter.h>
76#include <sys/vnode.h>
77
78#include <vm/vm.h>
79#include <vm/vm_param.h>
80#include <vm/vm_prot.h>
81#include <sys/lock.h>
82#include <vm/vm_kern.h>
83#include <vm/vm_object.h>
84#include <vm/vm_page.h>
85#include <vm/vm_pageout.h>
86#include <vm/vm_pager.h>
87#include <vm/vm_extern.h>
88
89static void	vm_page_queue_init __P((void));
90static vm_page_t vm_page_select_free __P((vm_object_t object,
91			vm_pindex_t pindex, int prefqueue));
92static vm_page_t vm_page_select_cache __P((vm_object_t, vm_pindex_t));
93
94/*
95 *	Associated with each page of user-allocatable memory is a
96 *	page structure.
97 */
98
99static struct vm_page **vm_page_buckets; /* Array of buckets */
100static int vm_page_bucket_count;	/* How big is array? */
101static int vm_page_hash_mask;		/* Mask for hash function */
102static volatile int vm_page_bucket_generation;
103
104struct pglist vm_page_queue_free[PQ_L2_SIZE] = {0};
105struct pglist vm_page_queue_zero[PQ_L2_SIZE] = {0};
106struct pglist vm_page_queue_active = {0};
107struct pglist vm_page_queue_inactive = {0};
108struct pglist vm_page_queue_cache[PQ_L2_SIZE] = {0};
109
110static int no_queue=0;
111
112struct vpgqueues vm_page_queues[PQ_COUNT] = {0};
113static int pqcnt[PQ_COUNT] = {0};
114
115static void
116vm_page_queue_init(void) {
117	int i;
118
119	vm_page_queues[PQ_NONE].pl = NULL;
120	vm_page_queues[PQ_NONE].cnt = &no_queue;
121	for(i=0;i<PQ_L2_SIZE;i++) {
122		vm_page_queues[PQ_FREE+i].pl = &vm_page_queue_free[i];
123		vm_page_queues[PQ_FREE+i].cnt = &cnt.v_free_count;
124	}
125	for(i=0;i<PQ_L2_SIZE;i++) {
126		vm_page_queues[PQ_ZERO+i].pl = &vm_page_queue_zero[i];
127		vm_page_queues[PQ_ZERO+i].cnt = &cnt.v_free_count;
128	}
129	vm_page_queues[PQ_INACTIVE].pl = &vm_page_queue_inactive;
130	vm_page_queues[PQ_INACTIVE].cnt = &cnt.v_inactive_count;
131
132	vm_page_queues[PQ_ACTIVE].pl = &vm_page_queue_active;
133	vm_page_queues[PQ_ACTIVE].cnt = &cnt.v_active_count;
134	for(i=0;i<PQ_L2_SIZE;i++) {
135		vm_page_queues[PQ_CACHE+i].pl = &vm_page_queue_cache[i];
136		vm_page_queues[PQ_CACHE+i].cnt = &cnt.v_cache_count;
137	}
138	for(i=0;i<PQ_COUNT;i++) {
139		if (vm_page_queues[i].pl) {
140			TAILQ_INIT(vm_page_queues[i].pl);
141		} else if (i != 0) {
142			panic("vm_page_queue_init: queue %d is null", i);
143		}
144		vm_page_queues[i].lcnt = &pqcnt[i];
145	}
146}
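
/*
 * Illustrative sketch (hypothetical helper, not part of the VM
 * interface): every colored queue is addressed as a base constant plus
 * the page's color, so a page's "home" queue for a given class is
 * simply base + m->pc, e.g. PQ_FREE + m->pc or PQ_CACHE + m->pc.
 */
#if 0
static struct vpgqueues *
demo_home_queue(vm_page_t m, int base)	/* base: PQ_FREE, PQ_ZERO, PQ_CACHE */
{
	return (&vm_page_queues[base + m->pc]);
}
#endif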
147
148vm_page_t vm_page_array = 0;
149static int vm_page_array_size = 0;
150long first_page = 0;
151static long last_page;
152static vm_size_t page_mask;
153static int page_shift;
154int vm_page_zero_count = 0;
155
156/*
157 * map of contiguous valid DEV_BSIZE chunks in a page
158 * (this list is valid for page sizes up to 16*DEV_BSIZE)
159 */
160static u_short vm_page_dev_bsize_chunks[] = {
161	0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff,
162	0x1ff, 0x3ff, 0x7ff, 0xfff, 0x1fff, 0x3fff, 0x7fff, 0xffff
163};
164
165static __inline int vm_page_hash __P((vm_object_t object, vm_pindex_t pindex));
166static void vm_page_free_wakeup __P((void));
167
168/*
169 *	vm_set_page_size:
170 *
171 *	Sets the page size, perhaps based upon the memory
172 *	size.  Must be called before any use of page-size
173 *	dependent functions.
174 *
175 *	Sets page_shift and page_mask from cnt.v_page_size.
176 */
177void
178vm_set_page_size()
179{
180
181	if (cnt.v_page_size == 0)
182		cnt.v_page_size = DEFAULT_PAGE_SIZE;
183	page_mask = cnt.v_page_size - 1;
184	if ((page_mask & cnt.v_page_size) != 0)
185		panic("vm_set_page_size: page size not a power of two");
186	for (page_shift = 0;; page_shift++)
187		if ((1 << page_shift) == cnt.v_page_size)
188			break;
189}
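
/*
 * Illustrative sketch (stand-alone, user-space): how the mask and shift
 * fall out of a power-of-two page size, mirroring the computation in
 * vm_set_page_size().  The value 4096 is an example only.
 */
#if 0
#include <assert.h>

static void
page_size_demo(void)
{
	unsigned long size = 4096;		/* example page size */
	unsigned long mask = size - 1;		/* 0xfff */
	int shift;

	assert((mask & size) == 0);		/* power-of-two check */
	for (shift = 0; (1UL << shift) != size; shift++)
		;
	assert(shift == 12);			/* log2(4096) */
}
#endif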
190
191/*
192 *	vm_page_startup:
193 *
194 *	Initializes the resident memory module.
195 *
196 *	Allocates memory for the page cells, and
197 *	for the object/offset-to-page hash table headers.
198 *	Each page cell is initialized and placed on the free list.
199 */
200
201vm_offset_t
202vm_page_startup(starta, enda, vaddr)
203	register vm_offset_t starta;
204	vm_offset_t enda;
205	register vm_offset_t vaddr;
206{
207	register vm_offset_t mapped;
208	register vm_page_t m;
209	register struct vm_page **bucket;
210	vm_size_t npages, page_range;
211	register vm_offset_t new_start;
212	int i;
213	vm_offset_t pa;
214	int nblocks;
215	vm_offset_t first_managed_page;
216
217	/* the biggest memory array is the second group of pages */
218	vm_offset_t start;
219	vm_offset_t biggestone, biggestsize;
220
221	vm_offset_t total;
222
223	total = 0;
224	biggestsize = 0;
225	biggestone = 0;
226	nblocks = 0;
227	vaddr = round_page(vaddr);
228
229	for (i = 0; phys_avail[i + 1]; i += 2) {
230		phys_avail[i] = round_page(phys_avail[i]);
231		phys_avail[i + 1] = trunc_page(phys_avail[i + 1]);
232	}
233
234	for (i = 0; phys_avail[i + 1]; i += 2) {
235		int size = phys_avail[i + 1] - phys_avail[i];
236
237		if (size > biggestsize) {
238			biggestone = i;
239			biggestsize = size;
240		}
241		++nblocks;
242		total += size;
243	}
244
245	start = phys_avail[biggestone];
246
247	/*
248	 * Initialize the queue headers for the free queue, the active queue
249	 * and the inactive queue.
250	 */
251
252	vm_page_queue_init();
253
254	/*
255	 * Allocate (and initialize) the hash table buckets.
256	 *
257	 * The number of buckets MUST BE a power of 2, and the actual value is
258	 * the next power of 2 greater than the number of physical pages in
259	 * the system.
260	 *
261	 * We make the hash table approximately 2x the number of pages to
262	 * reduce the chain length.  Because the buckets are now singly
263	 * linked rather than TAILQ-based, a 2x table takes about the same
264	 * space as the old 1x table, but the chains are shorter.
265	 *
266	 * Note: This computation can be tweaked if desired.
267	 */
268	vm_page_buckets = (struct vm_page **)vaddr;
269	bucket = vm_page_buckets;
270	if (vm_page_bucket_count == 0) {
271		vm_page_bucket_count = 1;
272		while (vm_page_bucket_count < atop(total))
273			vm_page_bucket_count <<= 1;
274	}
275	vm_page_bucket_count <<= 1;
276	vm_page_hash_mask = vm_page_bucket_count - 1;
277
278	/*
279	 * Validate these addresses.
280	 */
281
282	new_start = start + vm_page_bucket_count * sizeof(struct vm_page *);
283	new_start = round_page(new_start);
284	mapped = round_page(vaddr);
285	vaddr = pmap_map(mapped, start, new_start,
286	    VM_PROT_READ | VM_PROT_WRITE);
287	start = new_start;
288	vaddr = round_page(vaddr);
289	bzero((caddr_t) mapped, vaddr - mapped);
290
291	for (i = 0; i < vm_page_bucket_count; i++) {
292		*bucket = NULL;
293		bucket++;
294	}
295
296	/*
297	 * Compute the number of pages of memory that will be available for
298	 * use (taking into account the overhead of a page structure per
299	 * page).
300	 */
301
302	first_page = phys_avail[0] / PAGE_SIZE;
303	last_page = phys_avail[(nblocks - 1) * 2 + 1] / PAGE_SIZE;
304
305	page_range = last_page - (phys_avail[0] / PAGE_SIZE);
306	npages = (total - (page_range * sizeof(struct vm_page)) -
307	    (start - phys_avail[biggestone])) / PAGE_SIZE;
308
309	/*
310	 * Initialize the mem entry structures now, and put them in the free
311	 * queue.
312	 */
313	vm_page_array = (vm_page_t) vaddr;
314	mapped = vaddr;
315
316	/*
317	 * Validate these addresses.
318	 */
319	new_start = round_page(start + page_range * sizeof(struct vm_page));
320	mapped = pmap_map(mapped, start, new_start,
321	    VM_PROT_READ | VM_PROT_WRITE);
322	start = new_start;
323
324	first_managed_page = start / PAGE_SIZE;
325
326	/*
327	 * Clear all of the page structures
328	 */
329	bzero((caddr_t) vm_page_array, page_range * sizeof(struct vm_page));
330	vm_page_array_size = page_range;
331
332	cnt.v_page_count = 0;
333	cnt.v_free_count = 0;
334	for (i = 0; phys_avail[i + 1] && npages > 0; i += 2) {
335		if (i == biggestone)
336			pa = ptoa(first_managed_page);
337		else
338			pa = phys_avail[i];
339		while (pa < phys_avail[i + 1] && npages-- > 0) {
340			++cnt.v_page_count;
341			++cnt.v_free_count;
342			m = PHYS_TO_VM_PAGE(pa);
343			m->phys_addr = pa;
344			m->flags = 0;
345			m->pc = (pa >> PAGE_SHIFT) & PQ_L2_MASK;
346			m->queue = m->pc + PQ_FREE;
347			TAILQ_INSERT_TAIL(vm_page_queues[m->queue].pl, m, pageq);
348			++(*vm_page_queues[m->queue].lcnt);
349			pa += PAGE_SIZE;
350		}
351	}
352	return (mapped);
353}
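
/*
 * Illustrative sketch (stand-alone): the shape of the bookkeeping
 * arithmetic above.  The structure size and memory layout below are
 * hypothetical, and the hash-bucket carve-out is omitted for brevity.
 */
#if 0
#include <stdio.h>

static void
startup_arith_demo(void)
{
	unsigned long page_size = 4096;			/* hypothetical */
	unsigned long vm_page_sz = 64;			/* hypothetical sizeof(struct vm_page) */
	unsigned long first = 0x00100000 / page_size;	/* first_page */
	unsigned long last  = 0x02000000 / page_size;	/* last_page */
	unsigned long total = 0x02000000 - 0x00100000;	/* usable bytes */

	unsigned long page_range = last - first;	/* page structures needed */
	unsigned long overhead = page_range * vm_page_sz;
	unsigned long npages = (total - overhead) / page_size;

	printf("page_range=%lu npages=%lu\n", page_range, npages);
}
#endif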
354
355/*
356 *	vm_page_hash:
357 *
358 *	Distributes the object/offset key pair among hash buckets.
359 *
360 *	NOTE:  This macro depends on vm_page_bucket_count being a power of 2.
361 *	This routine may not block.
362 *
363 *	We try to randomize the hash based on the object to spread the pages
364 *	out in the hash table without it costing us too much.
365 */
366static __inline int
367vm_page_hash(object, pindex)
368	vm_object_t object;
369	vm_pindex_t pindex;
370{
371	int i = ((uintptr_t)object + pindex) ^ object->hash_rand;
372
373	return(i & vm_page_hash_mask);
374}
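
/*
 * Illustrative sketch (stand-alone, user-space): the hash above is an
 * addition and an XOR reduced by a power-of-two mask.  The mask and
 * hash_rand values here are hypothetical stand-ins.
 */
#if 0
#include <stdint.h>

static int
page_hash_demo(uintptr_t object_addr, uintptr_t pindex)
{
	int hash_mask = 1024 - 1;	/* hypothetical bucket count - 1 */
	int hash_rand = 0x5bd1;		/* hypothetical per-object salt */
	int i = (int)((object_addr + pindex) ^ hash_rand);

	return (i & hash_mask);
}
#endif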
375
376/*
377 *	vm_page_insert:		[ internal use only ]
378 *
379 *	Inserts the given mem entry into the object and object list.
380 *
381 *	The pagetables are not updated but will presumably fault the page
382 *	in if necessary, or, if it is a kernel page, the caller will at some
383 *	point enter the page into the kernel's pmap.  We are not allowed to
384 *	block here so we *can't* do this anyway.
385 *
386 *	The object and page must be locked, and must be splhigh.
387 *	This routine may not block.
388 */
389
390void
391vm_page_insert(m, object, pindex)
392	register vm_page_t m;
393	register vm_object_t object;
394	register vm_pindex_t pindex;
395{
396	register struct vm_page **bucket;
397
398	if (m->object != NULL)
399		panic("vm_page_insert: already inserted");
400
401	/*
402	 * Record the object/offset pair in this page
403	 */
404
405	m->object = object;
406	m->pindex = pindex;
407
408	/*
409	 * Insert it into the object/offset hash table
410	 */
411
412	bucket = &vm_page_buckets[vm_page_hash(object, pindex)];
413	m->hnext = *bucket;
414	*bucket = m;
415	vm_page_bucket_generation++;
416
417	/*
418	 * Now link into the object's list of backed pages.
419	 */
420
421	TAILQ_INSERT_TAIL(&object->memq, m, listq);
422#if 0
423	m->object->page_hint = m;
424#endif
425	m->object->generation++;
426
427	if (m->wire_count)
428		object->wire_count++;
429
430	if ((m->queue - m->pc) == PQ_CACHE)
431		object->cache_count++;
432
433	/*
434	 * show that the object has one more resident page.
435	 */
436
437	object->resident_page_count++;
438}
439
440/*
441 *	vm_page_remove:
442 *				NOTE: used by device pager as well -wfj
443 *
444 *	Removes the given mem entry from the object/offset-page
445 *	table and the object page list, but does not invalidate/terminate
446 *	the backing store.
447 *
448 *	The object and page must be locked, and at splhigh.
449 *	The underlying pmap entry (if any) is NOT removed here.
450 *	This routine may not block.
451 */
452
453vm_object_t
454vm_page_remove(m)
455	vm_page_t m;
456{
457	register struct vm_page **bucket;
458	vm_object_t object;
459
460	if (m->object == NULL)
461		return(NULL);
462
463#if !defined(MAX_PERF)
464	if ((m->flags & PG_BUSY) == 0) {
465		panic("vm_page_remove: page not busy");
466	}
467#endif
468
469	/*
470	 * Basically destroy the page.
471	 */
472
473	vm_page_wakeup(m);
474
475	object = m->object;
476
477	if (m->wire_count)
478		object->wire_count--;
479
480	if ((m->queue - m->pc) == PQ_CACHE)
481		object->cache_count--;
482
483	/*
484	 * Remove from the object/offset hash table.  The page must be
485	 * on the hash chain; we will panic if it isn't.
486	 *
487	 * Note: we must NULL-out m->hnext to prevent loops in detached
488	 * buffers with vm_page_lookup().
489	 */
490
491	bucket = &vm_page_buckets[vm_page_hash(m->object, m->pindex)];
492	while (*bucket != m) {
493#if !defined(MAX_PERF)
494		if (*bucket == NULL)
495			panic("vm_page_remove(): page not found in hash");
496#endif
497		bucket = &(*bucket)->hnext;
498	}
499	*bucket = m->hnext;
500	m->hnext = NULL;
501	vm_page_bucket_generation++;
502
503	/*
504	 * Now remove from the object's list of backed pages.
505	 */
506
507	TAILQ_REMOVE(&object->memq, m, listq);
508
509	/*
510	 * And show that the object has one fewer resident page.
511	 */
512
513	object->resident_page_count--;
514	object->generation++;
515
516	m->object = NULL;
517
518	return(object);
519}
520
521/*
522 *	vm_page_lookup:
523 *
524 *	Returns the page associated with the object/offset
525 *	pair specified; if none is found, NULL is returned.
526 *
527 *	NOTE: the code below does not lock.  It will operate properly if
528 *	an interrupt makes a change, but the generation algorithm will not
529 *	operate properly in an SMP environment where both cpus are able to run
530 *	kernel code simultaneously.
531 *
532 *	The object must be locked.  No side effects.
533 *	This routine may not block.
534 *	This is a critical path routine
535 */
536
537vm_page_t
538vm_page_lookup(object, pindex)
539	register vm_object_t object;
540	register vm_pindex_t pindex;
541{
542	register vm_page_t m;
543	register struct vm_page **bucket;
544	int generation;
545
546	/*
547	 * Search the hash table for this object/offset pair
548	 */
549
550#if 0
551	if (object->page_hint && (object->page_hint->pindex == pindex) &&
552		(object->page_hint->object == object))
553		return object->page_hint;
554#endif
555
556retry:
557	generation = vm_page_bucket_generation;
558	bucket = &vm_page_buckets[vm_page_hash(object, pindex)];
559	for (m = *bucket; m != NULL; m = m->hnext) {
560		if ((m->object == object) && (m->pindex == pindex)) {
561			if (vm_page_bucket_generation != generation)
562				goto retry;
563#if 0
564			m->object->page_hint = m;
565#endif
566			return (m);
567		}
568	}
569	if (vm_page_bucket_generation != generation)
570		goto retry;
571	return (NULL);
572}
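
/*
 * Illustrative sketch (stand-alone): the generation-counter retry idiom
 * used by vm_page_lookup(), reduced to its essentials.  The list type
 * and names are hypothetical; the point is that a writer bumps the
 * generation and a reader restarts whenever it notices the generation
 * changed while it was walking the chain.
 */
#if 0
struct demo_node { int key; struct demo_node *next; };

static volatile int demo_generation;
static struct demo_node *demo_head;

static struct demo_node *
demo_lookup(int key)
{
	struct demo_node *n;
	int gen;

retry:
	gen = demo_generation;
	for (n = demo_head; n != NULL; n = n->next) {
		if (n->key == key) {
			if (demo_generation != gen)
				goto retry;	/* chain changed under us */
			return (n);
		}
	}
	if (demo_generation != gen)
		goto retry;
	return (NULL);
}
#endif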
573
574/*
575 *	vm_page_rename:
576 *
577 *	Move the given memory entry from its
578 *	current object to the specified target object/offset.
579 *
580 *	The object must be locked.
581 *	This routine may not block.
582 *
583 *	Note: this routine will raise itself to splvm(), the caller need not.
584 *
585 *	Note: swap associated with the page must be invalidated by the move.  We
586 *	      have to do this for several reasons:  (1) we aren't freeing the
587 *	      page, (2) we are dirtying the page, (3) the VM system is probably
588 *	      moving the page from object A to B, and will then later move
589 *	      the backing store from A to B and we can't have a conflict.
590 *
591 *	Note: we *always* dirty the page.  It is necessary both for the
592 *	      fact that we moved it, and because we may be invalidating
593 *	      swap.
594 */
595
596void
597vm_page_rename(m, new_object, new_pindex)
598	register vm_page_t m;
599	register vm_object_t new_object;
600	vm_pindex_t new_pindex;
601{
602	int s;
603
604	s = splvm();
605	vm_page_remove(m);
606	vm_page_insert(m, new_object, new_pindex);
607	m->dirty = VM_PAGE_BITS_ALL;
608	splx(s);
609}
610
611/*
612 * vm_page_unqueue_nowakeup:
613 *
614 * 	vm_page_unqueue() without any wakeup
615 *
616 *	This routine must be called at splhigh().
617 *	This routine may not block.
618 */
619
620void
621vm_page_unqueue_nowakeup(m)
622	vm_page_t m;
623{
624	int queue = m->queue;
625	struct vpgqueues *pq;
626	if (queue != PQ_NONE) {
627		pq = &vm_page_queues[queue];
628		m->queue = PQ_NONE;
629		TAILQ_REMOVE(pq->pl, m, pageq);
630		(*pq->cnt)--;
631		(*pq->lcnt)--;
632		if ((queue - m->pc) == PQ_CACHE) {
633			if (m->object)
634				m->object->cache_count--;
635		}
636	}
637}
638
639/*
640 * vm_page_unqueue:
641 *
642 *	Remove a page from its queue.
643 *
644 *	This routine must be called at splhigh().
645 *	This routine may not block.
646 */
647
648void
649vm_page_unqueue(m)
650	vm_page_t m;
651{
652	int queue = m->queue;
653	struct vpgqueues *pq;
654	if (queue != PQ_NONE) {
655		m->queue = PQ_NONE;
656		pq = &vm_page_queues[queue];
657		TAILQ_REMOVE(pq->pl, m, pageq);
658		(*pq->cnt)--;
659		(*pq->lcnt)--;
660		if ((queue - m->pc) == PQ_CACHE) {
661			if ((cnt.v_cache_count + cnt.v_free_count) <
662				(cnt.v_free_reserved + cnt.v_cache_min))
663				pagedaemon_wakeup();
664			if (m->object)
665				m->object->cache_count--;
666		}
667	}
668}
669
670/*
671 *	vm_page_list_find:
672 *
673 *	Find a page on the specified queue with color optimization.
674 *
675 *	The page coloring optimization attempts to locate a page
676 *	that does not overload other nearby pages in the object in
677 *	the cpu's L1 or L2 caches.  We need this optimization because
678 *	cpu caches tend to be physical caches, while object spaces tend
679 *	to be virtual.
680 *
681 *	This routine must be called at splvm().
682 *	This routine may not block.
683 */
684vm_page_t
685vm_page_list_find(basequeue, index)
686	int basequeue, index;
687{
688#if PQ_L2_SIZE > 1
689
690	int i,j;
691	vm_page_t m;
692	int hindex;
693	struct vpgqueues *pq;
694
695	pq = &vm_page_queues[basequeue];
696
697	m = TAILQ_FIRST(pq[index].pl);
698	if (m)
699		return m;
700
701	for(j = 0; j < PQ_L1_SIZE; j++) {
702		int ij;
703		for(i = (PQ_L2_SIZE / 2) - PQ_L1_SIZE;
704			(ij = i + j) > 0;
705			i -= PQ_L1_SIZE) {
706
707			hindex = index + ij;
708			if (hindex >= PQ_L2_SIZE)
709				hindex -= PQ_L2_SIZE;
710			if ((m = TAILQ_FIRST(pq[hindex].pl)) != NULL)
711				return m;
712
713			hindex = index - ij;
714			if (hindex < 0)
715				hindex += PQ_L2_SIZE;
716			if ((m = TAILQ_FIRST(pq[hindex].pl)) != NULL)
717				return m;
718		}
719	}
720
721	hindex = index + PQ_L2_SIZE / 2;
722	if (hindex >= PQ_L2_SIZE)
723		hindex -= PQ_L2_SIZE;
724	m = TAILQ_FIRST(pq[hindex].pl);
725	if (m)
726		return m;
727
728	return NULL;
729#else
730	return TAILQ_FIRST(vm_page_queues[basequeue].pl);
731#endif
732
733}
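
/*
 * Illustrative sketch (stand-alone): how a starting queue index is
 * derived from an object's color and a page index, and how a probe a
 * given distance away wraps around the colored queue array.
 * PQ_L2_SIZE is taken to be 16 here purely for illustration.
 */
#if 0
#define DEMO_L2_SIZE	16
#define DEMO_L2_MASK	(DEMO_L2_SIZE - 1)

static int
color_index_demo(int pg_color, unsigned long pindex, int distance)
{
	int index = (int)((pindex + pg_color) & DEMO_L2_MASK);
	int hindex = index + distance;	/* distance in 0..DEMO_L2_SIZE-1 */

	if (hindex >= DEMO_L2_SIZE)
		hindex -= DEMO_L2_SIZE;
	return (hindex);
}
#endif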
734
735/*
736 *	vm_page_select:
737 *
738 *	Find a page on the specified queue with color optimization.
739 *
740 *	This routine must be called at splvm().
741 *	This routine may not block.
742 */
743vm_page_t
744vm_page_select(object, pindex, basequeue)
745	vm_object_t object;
746	vm_pindex_t pindex;
747	int basequeue;
748{
749
750#if PQ_L2_SIZE > 1
751	int index;
752	index = (pindex + object->pg_color) & PQ_L2_MASK;
753	return vm_page_list_find(basequeue, index);
754
755#else
756	return TAILQ_FIRST(vm_page_queues[basequeue].pl);
757#endif
758
759}
760
761/*
762 *	vm_page_select_cache:
763 *
764 *	Find a page on the cache queue with color optimization.  Pages
765 *	that are found but not usable (busy, held, or wired) are
766 *	deactivated.  This keeps us from using potentially busy cached pages.
767 *
768 *	This routine must be called at splvm().
769 *	This routine may not block.
770 */
771vm_page_t
772vm_page_select_cache(object, pindex)
773	vm_object_t object;
774	vm_pindex_t pindex;
775{
776	vm_page_t m;
777
778	while (TRUE) {
779#if PQ_L2_SIZE > 1
780		int index;
781		index = (pindex + object->pg_color) & PQ_L2_MASK;
782		m = vm_page_list_find(PQ_CACHE, index);
783
784#else
785		m = TAILQ_FIRST(vm_page_queues[PQ_CACHE].pl);
786#endif
787		if (m && ((m->flags & PG_BUSY) || m->busy ||
788			       m->hold_count || m->wire_count)) {
789			vm_page_deactivate(m);
790			continue;
791		}
792		return m;
793	}
794}
795
796/*
797 *	vm_page_select_free:
798 *
799 *	Find a free or zero page, with specified preference.
800 *
801 *	This routine must be called at splvm().
802 *	This routine may not block.
803 */
804
805static vm_page_t
806vm_page_select_free(object, pindex, prefqueue)
807	vm_object_t object;
808	vm_pindex_t pindex;
809	int prefqueue;
810{
811#if PQ_L2_SIZE > 1
812	int i,j;
813	int index, hindex;
814#endif
815	vm_page_t m;
816#if 0
817	vm_page_t mh;
818#endif
819	int oqueuediff;
820	struct vpgqueues *pq;
821
822	if (prefqueue == PQ_ZERO)
823		oqueuediff = PQ_FREE - PQ_ZERO;
824	else
825		oqueuediff = PQ_ZERO - PQ_FREE;
826
827#if 0
828	if (mh = object->page_hint) {
829		 if (mh->pindex == (pindex - 1)) {
830			if ((mh->flags & PG_FICTITIOUS) == 0) {
831				if ((mh < &vm_page_array[cnt.v_page_count-1]) &&
832					(mh >= &vm_page_array[0])) {
833					int queue;
834					m = mh + 1;
835					if (VM_PAGE_TO_PHYS(m) == (VM_PAGE_TO_PHYS(mh) + PAGE_SIZE)) {
836						queue = m->queue - m->pc;
837						if (queue == PQ_FREE || queue == PQ_ZERO) {
838							return m;
839						}
840					}
841				}
842			}
843		}
844	}
845#endif
846
847	pq = &vm_page_queues[prefqueue];
848
849#if PQ_L2_SIZE > 1
850
851	index = (pindex + object->pg_color) & PQ_L2_MASK;
852
853	if ((m = TAILQ_FIRST(pq[index].pl)) != NULL)
854		return m;
855	if ((m = TAILQ_FIRST(pq[index + oqueuediff].pl)) != NULL)
856		return m;
857
858	for(j = 0; j < PQ_L1_SIZE; j++) {
859		int ij;
860		for(i = (PQ_L2_SIZE / 2) - PQ_L1_SIZE;
861			(ij = i + j) >= 0;
862			i -= PQ_L1_SIZE) {
863
864			hindex = index + ij;
865			if (hindex >= PQ_L2_SIZE)
866				hindex -= PQ_L2_SIZE;
867			if ((m = TAILQ_FIRST(pq[hindex].pl)) != NULL)
868				return m;
869			if ((m = TAILQ_FIRST(pq[hindex + oqueuediff].pl)) != NULL)
870				return m;
871
872			hindex = index - ij;
873			if (hindex < 0)
874				hindex += PQ_L2_SIZE;
875			if ((m = TAILQ_FIRST(pq[hindex].pl)) != NULL)
876				return m;
877			if ((m = TAILQ_FIRST(pq[hindex + oqueuediff].pl)) != NULL)
878				return m;
879		}
880	}
881
882	hindex = index + PQ_L2_SIZE / 2;
883	if (hindex >= PQ_L2_SIZE)
884		hindex -= PQ_L2_SIZE;
885	if ((m = TAILQ_FIRST(pq[hindex].pl)) != NULL)
886		return m;
887	if ((m = TAILQ_FIRST(pq[hindex + oqueuediff].pl)) != NULL)
888		return m;
889
890#else
891	if ((m = TAILQ_FIRST(pq[0].pl)) != NULL)
892		return m;
893	else
894		return TAILQ_FIRST(pq[oqueuediff].pl);
895#endif
896
897	return NULL;
898}
899
900/*
901 *	vm_page_alloc:
902 *
903 *	Allocate and return a memory cell associated
904 *	with this VM object/offset pair.
905 *
906 *	page_req classes:
907 *	VM_ALLOC_NORMAL		normal process request
908 *	VM_ALLOC_SYSTEM		system *really* needs a page
909 *	VM_ALLOC_INTERRUPT	interrupt time request
910 *	VM_ALLOC_ZERO		zero page
911 *
912 *	Object must be locked.
913 *	This routine may not block.
914 *
915 *	Additional special handling is required when called from an
916 *	interrupt (VM_ALLOC_INTERRUPT).  We are not allowed to mess with
917 *	the page cache in this case.
918 *
919 *	vm_page_alloc()
920 */
921vm_page_t
922vm_page_alloc(object, pindex, page_req)
923	vm_object_t object;
924	vm_pindex_t pindex;
925	int page_req;
926{
927	register vm_page_t m = NULL;
928	struct vpgqueues *pq;
929	vm_object_t oldobject;
930	int queue, qtype;
931	int s;
932
933	KASSERT(!vm_page_lookup(object, pindex),
934		("vm_page_alloc: page already allocated"));
935
936	/*
937	 * The pager is allowed to eat deeper into the free page list.
938	 */
939
940	if ((curproc == pageproc) && (page_req != VM_ALLOC_INTERRUPT)) {
941		page_req = VM_ALLOC_SYSTEM;
942	}
943
944	s = splvm();
945
946loop:
947	switch (page_req) {
948
949	case VM_ALLOC_NORMAL:
950		if (cnt.v_free_count >= cnt.v_free_reserved) {
951			m = vm_page_select_free(object, pindex, PQ_FREE);
952			KASSERT(m != NULL, ("vm_page_alloc(NORMAL): missing page on free queue\n"));
953		} else {
954			m = vm_page_select_cache(object, pindex);
955			if (m == NULL) {
956				splx(s);
957#if defined(DIAGNOSTIC)
958				if (cnt.v_cache_count > 0)
959					printf("vm_page_alloc(NORMAL): missing pages on cache queue: %d\n", cnt.v_cache_count);
960#endif
961				vm_pageout_deficit++;
962				pagedaemon_wakeup();
963				return (NULL);
964			}
965		}
966		break;
967
968	case VM_ALLOC_ZERO:
969		if (cnt.v_free_count >= cnt.v_free_reserved) {
970			m = vm_page_select_free(object, pindex, PQ_ZERO);
971			KASSERT(m != NULL, ("vm_page_alloc(ZERO): missing page on free queue\n"));
972		} else {
973			m = vm_page_select_cache(object, pindex);
974			if (m == NULL) {
975				splx(s);
976#if defined(DIAGNOSTIC)
977				if (cnt.v_cache_count > 0)
978					printf("vm_page_alloc(ZERO): missing pages on cache queue: %d\n", cnt.v_cache_count);
979#endif
980				vm_pageout_deficit++;
981				pagedaemon_wakeup();
982				return (NULL);
983			}
984		}
985		break;
986
987	case VM_ALLOC_SYSTEM:
988		if ((cnt.v_free_count >= cnt.v_free_reserved) ||
989		    ((cnt.v_cache_count == 0) &&
990		    (cnt.v_free_count >= cnt.v_interrupt_free_min))) {
991			m = vm_page_select_free(object, pindex, PQ_FREE);
992			KASSERT(m != NULL, ("vm_page_alloc(SYSTEM): missing page on free queue\n"));
993		} else {
994			m = vm_page_select_cache(object, pindex);
995			if (m == NULL) {
996				splx(s);
997#if defined(DIAGNOSTIC)
998				if (cnt.v_cache_count > 0)
999					printf("vm_page_alloc(SYSTEM): missing pages on cache queue: %d\n", cnt.v_cache_count);
1000#endif
1001				vm_pageout_deficit++;
1002				pagedaemon_wakeup();
1003				return (NULL);
1004			}
1005		}
1006		break;
1007
1008	case VM_ALLOC_INTERRUPT:
1009		if (cnt.v_free_count > 0) {
1010			m = vm_page_select_free(object, pindex, PQ_FREE);
1011			KASSERT(m != NULL, ("vm_page_alloc(INTERRUPT): missing page on free queue\n"));
1012		} else {
1013			splx(s);
1014			vm_pageout_deficit++;
1015			pagedaemon_wakeup();
1016			return (NULL);
1017		}
1018		break;
1019
1020	default:
1021		m = NULL;
1022#if !defined(MAX_PERF)
1023		panic("vm_page_alloc: invalid allocation class");
1024#endif
1025	}
1026
1027	queue = m->queue;
1028	qtype = queue - m->pc;
1029
1030	/*
1031	 * Cache pages must be formally freed (and doubly so with the
1032	 * new pagerops functions).  We free the page and try again.
1033	 *
1034	 * This also has the side effect of ensuring that the minfreepage
1035	 * wall is held more tightly versus the old code.
1036	 */
1037
1038	if (qtype == PQ_CACHE) {
1039#if !defined(MAX_PERF)
1040		if (m->dirty)
1041			panic("found dirty cache page %p", m);
1042#endif
1043		vm_page_busy(m);
1044		vm_page_protect(m, VM_PROT_NONE);
1045		vm_page_free(m);
1046		goto loop;
1047	}
1048
1049	pq = &vm_page_queues[queue];
1050	TAILQ_REMOVE(pq->pl, m, pageq);
1051	(*pq->cnt)--;
1052	(*pq->lcnt)--;
1053	oldobject = NULL;
1054
1055	if (qtype == PQ_ZERO) {
1056		vm_page_zero_count--;
1057		m->flags = PG_ZERO | PG_BUSY;
1058	} else {
1059		m->flags = PG_BUSY;
1060	}
1061	m->wire_count = 0;
1062	m->hold_count = 0;
1063	m->act_count = 0;
1064	m->busy = 0;
1065	m->valid = 0;
1066	m->dirty = 0;
1067	m->queue = PQ_NONE;
1068
1069	/*
1070	 * vm_page_insert() is safe prior to the splx().  Note also that
1071	 * inserting a page here does not insert it into the pmap (which
1072	 * could cause us to block allocating memory).  We cannot block
1073	 * anywhere.
1074	 */
1075
1076	vm_page_insert(m, object, pindex);
1077
1078	/*
1079	 * Don't wake up too often - wake the pageout daemon only when
1080	 * we would be nearly out of memory.
1081	 */
1082	if (((cnt.v_free_count + cnt.v_cache_count) <
1083		(cnt.v_free_reserved + cnt.v_cache_min)) ||
1084			(cnt.v_free_count < cnt.v_pageout_free_min))
1085		pagedaemon_wakeup();
1086
1087#if 0
1088	/*
1089	 * (code removed - was previously a manual breakout of the act of
1090	 * freeing a page from cache.  We now just call vm_page_free() on
1091	 * a cache page and loop, so this code no longer needs to be here)
1092	 */
1093	if ((qtype == PQ_CACHE) &&
1094		((page_req == VM_ALLOC_NORMAL) || (page_req == VM_ALLOC_ZERO)) &&
1095		oldobject && (oldobject->type == OBJT_VNODE) &&
1096		((oldobject->flags & OBJ_DEAD) == 0)) {
1097		struct vnode *vp;
1098		vp = (struct vnode *) oldobject->handle;
1099		if (vp && VSHOULDFREE(vp)) {
1100			if ((vp->v_flag & (VFREE|VTBFREE|VDOOMED)) == 0) {
1101				TAILQ_INSERT_TAIL(&vnode_tobefree_list, vp, v_freelist);
1102				vp->v_flag |= VTBFREE;
1103			}
1104		}
1105	}
1106#endif
1107	splx(s);
1108
1109	return (m);
1110}
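
/*
 * Minimal caller sketch (hypothetical object/pindex, error handling
 * elided): since a NULL return only means "no page right now", the
 * usual pattern is to block in VM_WAIT and retry, much as
 * vm_page_grab() does below.
 */
#if 0
static vm_page_t
demo_alloc_page(vm_object_t object, vm_pindex_t pindex)
{
	vm_page_t m;

	while ((m = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL)) == NULL)
		VM_WAIT;		/* block until pages are freed up */
	return (m);
}
#endif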
1111
1112/*
1113 *	vm_wait:	(also see VM_WAIT macro)
1114 *
1115 *	Block until free pages are available for allocation
1116 */
1117
1118void
1119vm_wait()
1120{
1121	int s;
1122
1123	s = splvm();
1124	if (curproc == pageproc) {
1125		vm_pageout_pages_needed = 1;
1126		tsleep(&vm_pageout_pages_needed, PSWP, "vmwait", 0);
1127	} else {
1128		if (!vm_pages_needed) {
1129			vm_pages_needed++;
1130			wakeup(&vm_pages_needed);
1131		}
1132		tsleep(&cnt.v_free_count, PVM, "vmwait", 0);
1133	}
1134	splx(s);
1135}
1136
1137/*
1138 *	vm_await:	(also see VM_AWAIT macro)
1139 *
1140 *	Asynchronously sleep (via asleep()) on an event that will be
1141 *	signalled when free pages become available for allocation.
1142 */
1143
1144void
1145vm_await()
1146{
1147	int s;
1148
1149	s = splvm();
1150	if (curproc == pageproc) {
1151		vm_pageout_pages_needed = 1;
1152		asleep(&vm_pageout_pages_needed, PSWP, "vmwait", 0);
1153	} else {
1154		if (!vm_pages_needed) {
1155			vm_pages_needed++;
1156			wakeup(&vm_pages_needed);
1157		}
1158		asleep(&cnt.v_free_count, PVM, "vmwait", 0);
1159	}
1160	splx(s);
1161}
1162
1163#if 0
1164/*
1165 *	vm_page_sleep:
1166 *
1167 *	Block until page is no longer busy.
1168 */
1169
1170int
1171vm_page_sleep(vm_page_t m, char *msg, char *busy) {
1172	int slept = 0;
1173	if ((busy && *busy) || (m->flags & PG_BUSY)) {
1174		int s;
1175		s = splvm();
1176		if ((busy && *busy) || (m->flags & PG_BUSY)) {
1177			vm_page_flag_set(m, PG_WANTED);
1178			tsleep(m, PVM, msg, 0);
1179			slept = 1;
1180		}
1181		splx(s);
1182	}
1183	return slept;
1184}
1185
1186#endif
1187
1188#if 0
1189
1190/*
1191 *	vm_page_asleep:
1192 *
1193 *	Similar to vm_page_sleep(), but does not block.  Returns 0 if
1194 *	the page is not busy, or 1 if the page is busy.
1195 *
1196 *	This routine has the side effect of calling asleep() if the page
1197 *	was busy (1 returned).
1198 */
1199
1200int
1201vm_page_asleep(vm_page_t m, char *msg, char *busy) {
1202	int slept = 0;
1203	if ((busy && *busy) || (m->flags & PG_BUSY)) {
1204		int s;
1205		s = splvm();
1206		if ((busy && *busy) || (m->flags & PG_BUSY)) {
1207			vm_page_flag_set(m, PG_WANTED);
1208			asleep(m, PVM, msg, 0);
1209			slept = 1;
1210		}
1211		splx(s);
1212	}
1213	return slept;
1214}
1215
1216#endif
1217
1218/*
1219 *	vm_page_activate:
1220 *
1221 *	Put the specified page on the active list (if appropriate).
1222 *
1223 *	The page queues must be locked.
1224 *	This routine may not block.
1225 */
1226void
1227vm_page_activate(m)
1228	register vm_page_t m;
1229{
1230	int s;
1231
1232	s = splvm();
1233	if (m->queue != PQ_ACTIVE) {
1234		if ((m->queue - m->pc) == PQ_CACHE)
1235			cnt.v_reactivated++;
1236
1237		vm_page_unqueue(m);
1238
1239		if (m->wire_count == 0) {
1240			m->queue = PQ_ACTIVE;
1241			++(*vm_page_queues[PQ_ACTIVE].lcnt);
1242			TAILQ_INSERT_TAIL(&vm_page_queue_active, m, pageq);
1243			if (m->act_count < ACT_INIT)
1244				m->act_count = ACT_INIT;
1245			cnt.v_active_count++;
1246		}
1247	} else {
1248		if (m->act_count < ACT_INIT)
1249			m->act_count = ACT_INIT;
1250	}
1251
1252	splx(s);
1253}
1254
1255/*
1256 * helper routine for vm_page_free and vm_page_free_zero.
1257 *
1258 * This routine may not block.
1259 */
1260static __inline void
1261vm_page_free_wakeup()
1262{
1263	/*
1264	 * If the pageout daemon needs pages, tell it that some are now
1265	 * free.
1266	 */
1267	if (vm_pageout_pages_needed) {
1268		wakeup(&vm_pageout_pages_needed);
1269		vm_pageout_pages_needed = 0;
1270	}
1271	/*
1272	 * Wake up processes that are waiting on memory if we hit a
1273	 * high water mark, and wake up the scheduler process if we have
1274	 * lots of memory; that process will swap other processes back in.
1275	 */
1276	if (vm_pages_needed &&
1277		((cnt.v_free_count + cnt.v_cache_count) >= cnt.v_free_min)) {
1278		wakeup(&cnt.v_free_count);
1279		vm_pages_needed = 0;
1280	}
1281}
1282
1283/*
1284 *	vm_page_free_toq:
1285 *
1286 *	Returns the given page to the PQ_FREE or PQ_ZERO list,
1287 *	disassociating it from any VM object.
1288 *
1289 *	Object and page must be locked prior to entry.
1290 *	This routine may not block.
1291 */
1292
1293void
1294vm_page_free_toq(vm_page_t m, int queue)
1295{
1296	int s;
1297	struct vpgqueues *pq;
1298	vm_object_t object = m->object;
1299
1300	s = splvm();
1301
1302	cnt.v_tfree++;
1303
1304#if !defined(MAX_PERF)
1305	if (m->busy || ((m->queue - m->pc) == PQ_FREE) ||
1306		(m->hold_count != 0)) {
1307		printf(
1308		"vm_page_free: pindex(%lu), busy(%d), PG_BUSY(%d), hold(%d)\n",
1309		    (u_long)m->pindex, m->busy, (m->flags & PG_BUSY) ? 1 : 0,
1310		    m->hold_count);
1311		if ((m->queue - m->pc) == PQ_FREE)
1312			panic("vm_page_free: freeing free page");
1313		else
1314			panic("vm_page_free: freeing busy page");
1315	}
1316#endif
1317
1318	/*
1319	 * unqueue, then remove page.  Note that we cannot destroy
1320	 * the page here because we do not want to call the pager's
1321	 * callback routine until after we've put the page on the
1322	 * appropriate free queue.
1323	 */
1324
1325	vm_page_unqueue_nowakeup(m);
1326	vm_page_remove(m);
1327
1328	/*
1329	 * If the page is fictitious, remove the object association and
1330	 * return; otherwise delay the object association removal.
1331	 */
1332
1333	if ((m->flags & PG_FICTITIOUS) != 0) {
1334		splx(s);
1335		return;
1336	}
1337
1338	m->valid = 0;
1339
1340	if (m->wire_count != 0) {
1341#if !defined(MAX_PERF)
1342		if (m->wire_count > 1) {
1343			panic("vm_page_free: invalid wire count (%d), pindex: 0x%lx",
1344				m->wire_count, (u_long)m->pindex);
1345		}
1346#endif
1347		printf("vm_page_free: freeing wired page\n");
1348		m->wire_count = 0;
1349		if (m->object)
1350			m->object->wire_count--;
1351		cnt.v_wire_count--;
1352	}
1353
1354	/*
1355	 * If we've exhausted the object's resident pages we want to free
1356	 * it up.
1357	 */
1358
1359	if (object &&
1360	    (object->type == OBJT_VNODE) &&
1361	    ((object->flags & OBJ_DEAD) == 0)
1362	) {
1363		struct vnode *vp = (struct vnode *)object->handle;
1364
1365		if (vp && VSHOULDFREE(vp)) {
1366			if ((vp->v_flag & (VTBFREE|VDOOMED|VFREE)) == 0) {
1367				TAILQ_INSERT_TAIL(&vnode_tobefree_list, vp, v_freelist);
1368				vp->v_flag |= VTBFREE;
1369			}
1370		}
1371	}
1372
1373#ifdef __alpha__
1374	pmap_page_is_free(m);
1375#endif
1376
1377	m->queue = queue + m->pc;
1378	pq = &vm_page_queues[m->queue];
1379	++(*pq->lcnt);
1380	++(*pq->cnt);
1381
1382	if (queue == PQ_ZERO) {
1383		TAILQ_INSERT_HEAD(pq->pl, m, pageq);
1384		++vm_page_zero_count;
1385	} else {
1386		/*
1387		 * If the pageout process is grabbing the page, it is likely
1388		 * that the page is NOT in the CPU cache.  It is more likely
1389		 * that the page will be partially in the CPU cache if it is
1390		 * being explicitly freed.
1391		 */
1392
1393		if (curproc == pageproc) {
1394			TAILQ_INSERT_TAIL(pq->pl, m, pageq);
1395		} else {
1396			TAILQ_INSERT_HEAD(pq->pl, m, pageq);
1397		}
1398	}
1399
1400	vm_page_free_wakeup();
1401
1402	splx(s);
1403}
1404
1405/*
1406 *	vm_page_wire:
1407 *
1408 *	Mark this page as wired down by yet
1409 *	another map, removing it from paging queues
1410 *	as necessary.
1411 *
1412 *	The page queues must be locked.
1413 *	This routine may not block.
1414 */
1415void
1416vm_page_wire(m)
1417	register vm_page_t m;
1418{
1419	int s;
1420
1421	s = splvm();
1422	if (m->wire_count == 0) {
1423		vm_page_unqueue(m);
1424		cnt.v_wire_count++;
1425		if (m->object)
1426			m->object->wire_count++;
1427	}
1428	m->wire_count++;
1429	splx(s);
1430	(*vm_page_queues[PQ_NONE].lcnt)++;
1431	vm_page_flag_set(m, PG_MAPPED);
1432}
1433
1434/*
1435 *	vm_page_unwire:
1436 *
1437 *	Release one wiring of this page, potentially
1438 *	enabling it to be paged again.
1439 *
1440 *	Many pages placed on the inactive queue should actually go
1441 *	into the cache, but it is difficult to figure out which.  What
1442 *	we do instead, if the inactive target is well met, is to put
1443 *	clean pages at the head of the inactive queue instead of the tail.
1444 *	This will cause them to be moved to the cache more quickly and
1445 *	if not actively re-referenced, freed more quickly.  If we just
1446 *	stick these pages at the end of the inactive queue, heavy filesystem
1447 *	meta-data accesses can cause an unnecessary paging load on memory bound
1448 *	processes.  This optimization causes one-time-use metadata to be
1449 *	reused more quickly.
1450 *
1451 *	The page queues must be locked.
1452 *	This routine may not block.
1453 */
1454void
1455vm_page_unwire(m, activate)
1456	register vm_page_t m;
1457	int activate;
1458{
1459	int s;
1460
1461	s = splvm();
1462
1463	if (m->wire_count > 0) {
1464		m->wire_count--;
1465		if (m->wire_count == 0) {
1466			if (m->object)
1467				m->object->wire_count--;
1468			cnt.v_wire_count--;
1469			if (activate) {
1470				TAILQ_INSERT_TAIL(&vm_page_queue_active, m, pageq);
1471				m->queue = PQ_ACTIVE;
1472				(*vm_page_queues[PQ_ACTIVE].lcnt)++;
1473				cnt.v_active_count++;
1474			} else {
1475				TAILQ_INSERT_TAIL(&vm_page_queue_inactive, m, pageq);
1476				m->queue = PQ_INACTIVE;
1477				(*vm_page_queues[PQ_INACTIVE].lcnt)++;
1478				cnt.v_inactive_count++;
1479			}
1480		}
1481	} else {
1482#if !defined(MAX_PERF)
1483		panic("vm_page_unwire: invalid wire count: %d\n", m->wire_count);
1484#endif
1485	}
1486	splx(s);
1487}
1488
1489
1490/*
1491 * Move the specified page to the inactive queue.  If the page has
1492 * any associated swap, the swap is deallocated.
1493 *
1494 * This routine may not block.
1495 */
1496void
1497vm_page_deactivate(m)
1498	register vm_page_t m;
1499{
1500	int s;
1501
1502	/*
1503	 * Ignore if already inactive.
1504	 */
1505	if (m->queue == PQ_INACTIVE)
1506		return;
1507
1508	s = splvm();
1509	if (m->wire_count == 0) {
1510		if ((m->queue - m->pc) == PQ_CACHE)
1511			cnt.v_reactivated++;
1512		vm_page_unqueue(m);
1513		TAILQ_INSERT_TAIL(&vm_page_queue_inactive, m, pageq);
1514		m->queue = PQ_INACTIVE;
1515		++(*vm_page_queues[PQ_INACTIVE].lcnt);
1516		cnt.v_inactive_count++;
1517	}
1518	splx(s);
1519}
1520
1521/*
1522 * vm_page_cache
1523 *
1524 * Put the specified page onto the page cache queue (if appropriate).
1525 *
1526 * This routine may not block.
1527 */
1528void
1529vm_page_cache(m)
1530	register vm_page_t m;
1531{
1532	int s;
1533
1534#if !defined(MAX_PERF)
1535	if ((m->flags & PG_BUSY) || m->busy || m->wire_count) {
1536		printf("vm_page_cache: attempting to cache busy page\n");
1537		return;
1538	}
1539#endif
1540	if ((m->queue - m->pc) == PQ_CACHE)
1541		return;
1542
1543	/*
1544	 * Remove all pmaps and indicate that the page is not
1545	 * writeable.
1546	 */
1547
1548	vm_page_protect(m, VM_PROT_NONE);
1549	vm_page_flag_clear(m, PG_WRITEABLE);
1550#if !defined(MAX_PERF)
1551	if (m->dirty != 0) {
1552		panic("vm_page_cache: caching a dirty page, pindex: %lu", (u_long)m->pindex);
1553	}
1554#endif
1555	s = splvm();
1556	vm_page_unqueue_nowakeup(m);
1557	m->queue = PQ_CACHE + m->pc;
1558	(*vm_page_queues[m->queue].lcnt)++;
1559	TAILQ_INSERT_TAIL(vm_page_queues[m->queue].pl, m, pageq);
1560	cnt.v_cache_count++;
1561	m->object->cache_count++;
1562	vm_page_free_wakeup();
1563	splx(s);
1564}
1565
1566/*
1567 * Grab a page, waiting until we are woken up due to the page
1568 * changing state.  We keep on waiting as long as the page continues
1569 * to exist in the object.  If the page doesn't exist, allocate it.
1570 *
1571 * This routine may block.
1572 */
1573vm_page_t
1574vm_page_grab(object, pindex, allocflags)
1575	vm_object_t object;
1576	vm_pindex_t pindex;
1577	int allocflags;
1578{
1579
1580	vm_page_t m;
1581	int s, generation;
1582
1583retrylookup:
1584	if ((m = vm_page_lookup(object, pindex)) != NULL) {
1585		if (m->busy || (m->flags & PG_BUSY)) {
1586			generation = object->generation;
1587
1588			s = splvm();
1589			while ((object->generation == generation) &&
1590					(m->busy || (m->flags & PG_BUSY))) {
1591				vm_page_flag_set(m, PG_WANTED | PG_REFERENCED);
1592				tsleep(m, PVM, "pgrbwt", 0);
1593				if ((allocflags & VM_ALLOC_RETRY) == 0) {
1594					splx(s);
1595					return NULL;
1596				}
1597			}
1598			splx(s);
1599			goto retrylookup;
1600		} else {
1601			vm_page_busy(m);
1602			return m;
1603		}
1604	}
1605
1606	m = vm_page_alloc(object, pindex, allocflags & ~VM_ALLOC_RETRY);
1607	if (m == NULL) {
1608		VM_WAIT;
1609		if ((allocflags & VM_ALLOC_RETRY) == 0)
1610			return NULL;
1611		goto retrylookup;
1612	}
1613
1614	return m;
1615}
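
/*
 * Minimal caller sketch (hypothetical object/pindex): with
 * VM_ALLOC_RETRY set, vm_page_grab() keeps retrying and hands back a
 * busied page, which the caller releases with vm_page_wakeup() once
 * it has finished filling the page in.
 */
#if 0
static vm_page_t
demo_grab_page(vm_object_t object, vm_pindex_t pindex)
{
	vm_page_t m;

	m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
	/* ... initialize the page contents here ... */
	vm_page_wakeup(m);
	return (m);
}
#endif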
1616
1617/*
1618 * mapping function for valid bits or for dirty bits in
1619 * a page.  May not block.
1620 */
1621__inline int
1622vm_page_bits(int base, int size)
1623{
1624	u_short chunk;
1625
1626	if ((base == 0) && (size >= PAGE_SIZE))
1627		return VM_PAGE_BITS_ALL;
1628
1629	size = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
1630	base &= PAGE_MASK;
1631	if (size > PAGE_SIZE - base) {
1632		size = PAGE_SIZE - base;
1633	}
1634
1635	base = base / DEV_BSIZE;
1636	chunk = vm_page_dev_bsize_chunks[size / DEV_BSIZE];
1637	return (chunk << base) & VM_PAGE_BITS_ALL;
1638}
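
/*
 * Worked example (assuming PAGE_SIZE 4096 and DEV_BSIZE 512, i.e. 8
 * chunks per page): vm_page_bits(1024, 2048) covers chunks 2-5, since
 * vm_page_dev_bsize_chunks[4] == 0xf shifted left by base/DEV_BSIZE == 2
 * gives 0x3c.
 */
#if 0
#include <assert.h>

static void
page_bits_demo(void)
{
	int dev_bsize = 512, base = 1024, size = 2048;
	unsigned short chunk = 0xf;	/* vm_page_dev_bsize_chunks[size / dev_bsize] */
	int bits = (chunk << (base / dev_bsize)) & 0xff;

	assert(bits == 0x3c);		/* chunks 2, 3, 4 and 5 */
}
#endif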
1639
1640/*
1641 * set a page valid and clean.  May not block.
1642 */
1643void
1644vm_page_set_validclean(m, base, size)
1645	vm_page_t m;
1646	int base;
1647	int size;
1648{
1649	int pagebits = vm_page_bits(base, size);
1650	m->valid |= pagebits;
1651	m->dirty &= ~pagebits;
1652	if (base == 0 && size == PAGE_SIZE)
1653		pmap_clear_modify(VM_PAGE_TO_PHYS(m));
1654}
1655
1656/*
1657 * set a page (partially) invalid.  May not block.
1658 */
1659void
1660vm_page_set_invalid(m, base, size)
1661	vm_page_t m;
1662	int base;
1663	int size;
1664{
1665	int bits;
1666
1667	m->valid &= ~(bits = vm_page_bits(base, size));
1668	if (m->valid == 0)
1669		m->dirty &= ~bits;
1670	m->object->generation++;
1671}
1672
1673/*
1674 * is (partial) page valid?  May not block.
1675 */
1676int
1677vm_page_is_valid(m, base, size)
1678	vm_page_t m;
1679	int base;
1680	int size;
1681{
1682	int bits = vm_page_bits(base, size);
1683
1684	if (m->valid && ((m->valid & bits) == bits))
1685		return 1;
1686	else
1687		return 0;
1688}
1689
1690/*
1691 * update dirty bits from pmap/mmu.  May not block.
1692 */
1693
1694void
1695vm_page_test_dirty(m)
1696	vm_page_t m;
1697{
1698	if ((m->dirty != VM_PAGE_BITS_ALL) &&
1699	    pmap_is_modified(VM_PAGE_TO_PHYS(m))) {
1700		m->dirty = VM_PAGE_BITS_ALL;
1701	}
1702}
1703
1704/*
1705 * This interface is for merging with malloc() someday.
1706 * Even if we never implement compaction so that contiguous allocation
1707 * works after initialization time, malloc()'s data structures are good
1708 * for statistics and for allocations of less than a page.
1709 */
1710void *
1711contigmalloc1(size, type, flags, low, high, alignment, boundary, map)
1712	unsigned long size;	/* should be size_t here and for malloc() */
1713	struct malloc_type *type;
1714	int flags;
1715	unsigned long low;
1716	unsigned long high;
1717	unsigned long alignment;
1718	unsigned long boundary;
1719	vm_map_t map;
1720{
1721	int i, s, start;
1722	vm_offset_t addr, phys, tmp_addr;
1723	int pass;
1724	vm_page_t pga = vm_page_array;
1725
1726	size = round_page(size);
1727#if !defined(MAX_PERF)
1728	if (size == 0)
1729		panic("contigmalloc1: size must not be 0");
1730	if ((alignment & (alignment - 1)) != 0)
1731		panic("contigmalloc1: alignment must be a power of 2");
1732	if ((boundary & (boundary - 1)) != 0)
1733		panic("contigmalloc1: boundary must be a power of 2");
1734#endif
1735
1736	start = 0;
1737	for (pass = 0; pass <= 1; pass++) {
1738		s = splvm();
1739again:
1740		/*
1741		 * Find first page in array that is free, within range, aligned, and
1742		 * such that the boundary won't be crossed.
1743		 */
1744		for (i = start; i < cnt.v_page_count; i++) {
1745			int pqtype;
1746			phys = VM_PAGE_TO_PHYS(&pga[i]);
1747			pqtype = pga[i].queue - pga[i].pc;
1748			if (((pqtype == PQ_ZERO) || (pqtype == PQ_FREE) || (pqtype == PQ_CACHE)) &&
1749			    (phys >= low) && (phys < high) &&
1750			    ((phys & (alignment - 1)) == 0) &&
1751			    (((phys ^ (phys + size - 1)) & ~(boundary - 1)) == 0))
1752				break;
1753		}
1754
1755		/*
1756		 * If the above failed or we will exceed the upper bound, fail.
1757		 */
1758		if ((i == cnt.v_page_count) ||
1759			((VM_PAGE_TO_PHYS(&pga[i]) + size) > high)) {
1760			vm_page_t m, next;
1761
1762again1:
1763			for (m = TAILQ_FIRST(&vm_page_queue_inactive);
1764				m != NULL;
1765				m = next) {
1766
1767				if (m->queue != PQ_INACTIVE) {
1768					break;
1769				}
1770
1771				next = TAILQ_NEXT(m, pageq);
1772				if (vm_page_sleep_busy(m, TRUE, "vpctw0"))
1773					goto again1;
1774				vm_page_test_dirty(m);
1775				if (m->dirty) {
1776					if (m->object->type == OBJT_VNODE) {
1777						vn_lock(m->object->handle, LK_EXCLUSIVE | LK_RETRY, curproc);
1778						vm_object_page_clean(m->object, 0, 0, OBJPC_SYNC);
1779						VOP_UNLOCK(m->object->handle, 0, curproc);
1780						goto again1;
1781					} else if (m->object->type == OBJT_SWAP ||
1782								m->object->type == OBJT_DEFAULT) {
1783						vm_pageout_flush(&m, 1, 0);
1784						goto again1;
1785					}
1786				}
1787				if ((m->dirty == 0) && (m->busy == 0) && (m->hold_count == 0))
1788					vm_page_cache(m);
1789			}
1790
1791			for (m = TAILQ_FIRST(&vm_page_queue_active);
1792				m != NULL;
1793				m = next) {
1794
1795				if (m->queue != PQ_ACTIVE) {
1796					break;
1797				}
1798
1799				next = TAILQ_NEXT(m, pageq);
1800				if (vm_page_sleep_busy(m, TRUE, "vpctw1"))
1801					goto again1;
1802				vm_page_test_dirty(m);
1803				if (m->dirty) {
1804					if (m->object->type == OBJT_VNODE) {
1805						vn_lock(m->object->handle, LK_EXCLUSIVE | LK_RETRY, curproc);
1806						vm_object_page_clean(m->object, 0, 0, OBJPC_SYNC);
1807						VOP_UNLOCK(m->object->handle, 0, curproc);
1808						goto again1;
1809					} else if (m->object->type == OBJT_SWAP ||
1810								m->object->type == OBJT_DEFAULT) {
1811						vm_pageout_flush(&m, 1, 0);
1812						goto again1;
1813					}
1814				}
1815				if ((m->dirty == 0) && (m->busy == 0) && (m->hold_count == 0))
1816					vm_page_cache(m);
1817			}
1818
1819			splx(s);
1820			continue;
1821		}
1822		start = i;
1823
1824		/*
1825		 * Check successive pages for contiguous and free.
1826		 */
1827		for (i = start + 1; i < (start + size / PAGE_SIZE); i++) {
1828			int pqtype;
1829			pqtype = pga[i].queue - pga[i].pc;
1830			if ((VM_PAGE_TO_PHYS(&pga[i]) !=
1831			    (VM_PAGE_TO_PHYS(&pga[i - 1]) + PAGE_SIZE)) ||
1832			    ((pqtype != PQ_ZERO) && (pqtype != PQ_FREE) && (pqtype != PQ_CACHE))) {
1833				start++;
1834				goto again;
1835			}
1836		}
1837
1838		for (i = start; i < (start + size / PAGE_SIZE); i++) {
1839			int pqtype;
1840			vm_page_t m = &pga[i];
1841
1842			pqtype = m->queue - m->pc;
1843			if (pqtype == PQ_CACHE) {
1844				vm_page_busy(m);
1845				vm_page_free(m);
1846			}
1847
1848			TAILQ_REMOVE(vm_page_queues[m->queue].pl, m, pageq);
1849			(*vm_page_queues[m->queue].lcnt)--;
1850			cnt.v_free_count--;
1851			m->valid = VM_PAGE_BITS_ALL;
1852			m->flags = 0;
1853			m->dirty = 0;
1854			m->wire_count = 0;
1855			m->busy = 0;
1856			m->queue = PQ_NONE;
1857			m->object = NULL;
1858			vm_page_wire(m);
1859		}
1860
1861		/*
1862		 * We've found a contiguous chunk that meets our requirements.
1863		 * Allocate kernel VM, unfree and assign the physical pages to it,
1864		 * and return the kernel VM pointer.
1865		 */
1866		tmp_addr = addr = kmem_alloc_pageable(map, size);
1867		if (addr == 0) {
1868			/*
1869			 * XXX We almost never run out of kernel virtual
1870			 * space, so we don't make the allocated memory
1871			 * above available.
1872			 */
1873			splx(s);
1874			return (NULL);
1875		}
1876
1877		for (i = start; i < (start + size / PAGE_SIZE); i++) {
1878			vm_page_t m = &pga[i];
1879			vm_page_insert(m, kernel_object,
1880				OFF_TO_IDX(tmp_addr - VM_MIN_KERNEL_ADDRESS));
1881			pmap_kenter(tmp_addr, VM_PAGE_TO_PHYS(m));
1882			tmp_addr += PAGE_SIZE;
1883		}
1884
1885		splx(s);
1886		return ((void *)addr);
1887	}
1888	return NULL;
1889}
1890
1891void *
1892contigmalloc(size, type, flags, low, high, alignment, boundary)
1893	unsigned long size;	/* should be size_t here and for malloc() */
1894	struct malloc_type *type;
1895	int flags;
1896	unsigned long low;
1897	unsigned long high;
1898	unsigned long alignment;
1899	unsigned long boundary;
1900{
1901	return contigmalloc1(size, type, flags, low, high, alignment, boundary,
1902			     kernel_map);
1903}
1904
1905vm_offset_t
1906vm_page_alloc_contig(size, low, high, alignment)
1907	vm_offset_t size;
1908	vm_offset_t low;
1909	vm_offset_t high;
1910	vm_offset_t alignment;
1911{
1912	return ((vm_offset_t)contigmalloc1(size, M_DEVBUF, M_NOWAIT, low, high,
1913					  alignment, 0ul, kernel_map));
1914}
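
/*
 * Minimal caller sketch (hypothetical sizes and constraints): a driver
 * asking for a physically contiguous, 64KB-aligned buffer below 16MB,
 * e.g. for a DMA engine.  The limits and flags are illustrative only.
 */
#if 0
static void *
demo_dma_buffer(void)
{
	return contigmalloc(8192, M_DEVBUF, M_NOWAIT,
	    0ul,		/* low: no lower bound */
	    0xfffffful,		/* high: below 16MB */
	    65536ul,		/* alignment: 64KB */
	    0ul);		/* boundary: no restriction */
}
#endif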
1915
1916#include "opt_ddb.h"
1917#ifdef DDB
1918#include <sys/kernel.h>
1919
1920#include <ddb/ddb.h>
1921
1922DB_SHOW_COMMAND(page, vm_page_print_page_info)
1923{
1924	db_printf("cnt.v_free_count: %d\n", cnt.v_free_count);
1925	db_printf("cnt.v_cache_count: %d\n", cnt.v_cache_count);
1926	db_printf("cnt.v_inactive_count: %d\n", cnt.v_inactive_count);
1927	db_printf("cnt.v_active_count: %d\n", cnt.v_active_count);
1928	db_printf("cnt.v_wire_count: %d\n", cnt.v_wire_count);
1929	db_printf("cnt.v_free_reserved: %d\n", cnt.v_free_reserved);
1930	db_printf("cnt.v_free_min: %d\n", cnt.v_free_min);
1931	db_printf("cnt.v_free_target: %d\n", cnt.v_free_target);
1932	db_printf("cnt.v_cache_min: %d\n", cnt.v_cache_min);
1933	db_printf("cnt.v_inactive_target: %d\n", cnt.v_inactive_target);
1934}
1935
1936DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info)
1937{
1938	int i;
1939	db_printf("PQ_FREE:");
1940	for(i=0;i<PQ_L2_SIZE;i++) {
1941		db_printf(" %d", *vm_page_queues[PQ_FREE + i].lcnt);
1942	}
1943	db_printf("\n");
1944
1945	db_printf("PQ_CACHE:");
1946	for(i=0;i<PQ_L2_SIZE;i++) {
1947		db_printf(" %d", *vm_page_queues[PQ_CACHE + i].lcnt);
1948	}
1949	db_printf("\n");
1950
1951	db_printf("PQ_ZERO:");
1952	for(i=0;i<PQ_L2_SIZE;i++) {
1953		db_printf(" %d", *vm_page_queues[PQ_ZERO + i].lcnt);
1954	}
1955	db_printf("\n");
1956
1957	db_printf("PQ_ACTIVE: %d, PQ_INACTIVE: %d\n",
1958		*vm_page_queues[PQ_ACTIVE].lcnt,
1959		*vm_page_queues[PQ_INACTIVE].lcnt);
1960}
1961#endif /* DDB */
1962