vm_pageout.c revision 69972
/*
 * Copyright (c) 1991 Regents of the University of California.
 * All rights reserved.
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 * Copyright (c) 1994 David Greenman
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)vm_pageout.c	7.4 (Berkeley) 5/7/91
 *
 *
 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 *
 * $FreeBSD: head/sys/vm/vm_pageout.c 69972 2000-12-13 10:01:00Z tanimura $
 */

/*
 *	The proverbial page-out daemon.
 */

#include "opt_vm.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/ktr.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/vnode.h>
#include <sys/vmmeter.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <sys/lock.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/vm_zone.h>
#include <vm/swap_pager.h>
#include <vm/vm_extern.h>

#include <machine/mutex.h>

/*
 * System initialization
 */

/* the kernel process "vm_pageout" */
static void vm_pageout __P((void));
static int vm_pageout_clean __P((vm_page_t));
static int vm_pageout_scan __P((void));
static int vm_pageout_free_page_calc __P((vm_size_t count));
struct proc *pageproc;

static struct kproc_desc page_kp = {
	"pagedaemon",
	vm_pageout,
	&pageproc
};
SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, kproc_start, &page_kp)

#if !defined(NO_SWAPPING)
/* the kernel process "vm_daemon" */
static void vm_daemon __P((void));
static struct	proc *vmproc;

static struct kproc_desc vm_kp = {
	"vmdaemon",
	vm_daemon,
	&vmproc
};
SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp)
#endif


int vm_pages_needed=0;		/* Event on which pageout daemon sleeps */
int vm_pageout_deficit=0;	/* Estimated number of pages deficit */
int vm_pageout_pages_needed=0;	/* flag saying that the pageout daemon needs pages */

#if !defined(NO_SWAPPING)
static int vm_pageout_req_swapout;	/* XXX */
static int vm_daemon_needed;
#endif
extern int vm_swap_size;
static int vm_pageout_stats_max=0, vm_pageout_stats_interval = 0;
static int vm_pageout_full_stats_interval = 0;
static int vm_pageout_stats_free_max=0, vm_pageout_algorithm_lru=0;
static int defer_swap_pageouts=0;
static int disable_swap_pageouts=0;

static int max_page_launder=100;
static int vm_pageout_actcmp=0;
#if defined(NO_SWAPPING)
static int vm_swap_enabled=0;
static int vm_swap_idle_enabled=0;
#else
static int vm_swap_enabled=1;
static int vm_swap_idle_enabled=0;
#endif

SYSCTL_INT(_vm, VM_PAGEOUT_ALGORITHM, pageout_algorithm,
	CTLFLAG_RW, &vm_pageout_algorithm_lru, 0, "LRU page mgmt");

SYSCTL_INT(_vm, OID_AUTO, pageout_stats_max,
	CTLFLAG_RW, &vm_pageout_stats_max, 0, "Max pageout stats scan length");

SYSCTL_INT(_vm, OID_AUTO, pageout_full_stats_interval,
	CTLFLAG_RW, &vm_pageout_full_stats_interval, 0, "Interval for full stats scan");

SYSCTL_INT(_vm, OID_AUTO, pageout_stats_interval,
	CTLFLAG_RW, &vm_pageout_stats_interval, 0, "Interval for partial stats scan");

SYSCTL_INT(_vm, OID_AUTO, pageout_stats_free_max,
	CTLFLAG_RW, &vm_pageout_stats_free_max, 0, "Not implemented");

#if defined(NO_SWAPPING)
SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
	CTLFLAG_RD, &vm_swap_enabled, 0, "");
SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
	CTLFLAG_RD, &vm_swap_idle_enabled, 0, "");
#else
SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
	CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout");
SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
	CTLFLAG_RW, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria");
#endif

SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts,
	CTLFLAG_RW, &defer_swap_pageouts, 0, "Give preference to dirty pages in mem");

SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts,
	CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages");

SYSCTL_INT(_vm, OID_AUTO, max_page_launder,
	CTLFLAG_RW, &max_page_launder, 0, "Maximum number of pages to clean per pass");
SYSCTL_INT(_vm, OID_AUTO, vm_pageout_actcmp,
	CTLFLAG_RD, &vm_pageout_actcmp, 0, "pagedaemon aggressiveness");


#define VM_PAGEOUT_PAGE_COUNT 16
int vm_pageout_page_count = VM_PAGEOUT_PAGE_COUNT;

int vm_page_max_wired;		/* XXX max # of wired pages system-wide */

#if !defined(NO_SWAPPING)
typedef void freeer_fcn_t __P((vm_map_t, vm_object_t, vm_pindex_t, int));
static void vm_pageout_map_deactivate_pages __P((vm_map_t, vm_pindex_t));
static freeer_fcn_t vm_pageout_object_deactivate_pages;
static void vm_req_vmdaemon __P((void));
#endif
static void vm_pageout_page_stats(void);

/*
 * vm_pageout_clean:
 *
 * Clean the page and remove it from the laundry.
 *
 * We set the busy bit to cause potential page faults on this page to
 * block.  Note the careful timing, however: the busy bit isn't set until
 * late, and until then we must not do anything that will mess with the
 * page.
 */

static int
vm_pageout_clean(m)
	vm_page_t m;
{
	register vm_object_t object;
	vm_page_t mc[2*vm_pageout_page_count];
	int pageout_count;
	int ib, is, page_base;
	vm_pindex_t pindex = m->pindex;

	object = m->object;

	/*
	 * It doesn't cost us anything to pageout OBJT_DEFAULT or OBJT_SWAP
	 * with the new swapper, but we could have serious problems paging
	 * out other object types if there is insufficient memory.
	 *
	 * Unfortunately, checking free memory here is far too late, so the
	 * check has been moved up a procedural level.
	 */

	/*
	 * Don't mess with the page if it's busy, held, or special
	 */
	if ((m->hold_count != 0) ||
	    ((m->busy != 0) || (m->flags & (PG_BUSY|PG_UNMANAGED)))) {
		return 0;
	}

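	/*
	 * The cluster array mc[] is sized to grow in both directions: the
	 * target page sits at index vm_pageout_page_count so that pages can
	 * be prepended (reverse scan) as well as appended (forward scan).
	 */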
	mc[vm_pageout_page_count] = m;
	pageout_count = 1;
	page_base = vm_pageout_page_count;
	ib = 1;
	is = 1;

	/*
	 * Scan object for clusterable pages.
	 *
	 * We can cluster ONLY if: ->> the page is NOT
	 * clean, wired, busy, held, or mapped into a
	 * buffer, and one of the following:
	 * 1) The page is inactive, or a seldom used
	 *    active page.
	 * -or-
	 * 2) we force the issue.
	 *
	 * During heavy mmap/modification loads the pageout
	 * daemon can really fragment the underlying file
	 * due to flushing pages out of order and not trying
	 * to align the clusters (which leaves sporadic
	 * out-of-order holes).  To solve this problem we do
	 * the reverse scan first and attempt to align our
	 * cluster, then do a forward scan if room remains.
	 */

more:
	while (ib && pageout_count < vm_pageout_page_count) {
		vm_page_t p;

		if (ib > pindex) {
			ib = 0;
			break;
		}

		if ((p = vm_page_lookup(object, pindex - ib)) == NULL) {
			ib = 0;
			break;
		}
		if (((p->queue - p->pc) == PQ_CACHE) ||
		    (p->flags & (PG_BUSY|PG_UNMANAGED)) || p->busy) {
			ib = 0;
			break;
		}
		vm_page_test_dirty(p);
		if ((p->dirty & p->valid) == 0 ||
		    p->queue != PQ_INACTIVE ||
		    p->wire_count != 0 ||
		    p->hold_count != 0) {
			ib = 0;
			break;
		}
		mc[--page_base] = p;
		++pageout_count;
		++ib;
		/*
		 * alignment boundary, stop here and switch directions.  Do
		 * not clear ib.
		 */
		if ((pindex - (ib - 1)) % vm_pageout_page_count == 0)
			break;
	}

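	/*
	 * Forward scan: extend the cluster toward higher offsets until the
	 * cluster is full, the end of the object is reached, or a page that
	 * cannot be laundered is found.
	 */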
	while (pageout_count < vm_pageout_page_count &&
	    pindex + is < object->size) {
		vm_page_t p;

		if ((p = vm_page_lookup(object, pindex + is)) == NULL)
			break;
		if (((p->queue - p->pc) == PQ_CACHE) ||
		    (p->flags & (PG_BUSY|PG_UNMANAGED)) || p->busy) {
			break;
		}
		vm_page_test_dirty(p);
		if ((p->dirty & p->valid) == 0 ||
		    p->queue != PQ_INACTIVE ||
		    p->wire_count != 0 ||
		    p->hold_count != 0) {
			break;
		}
		mc[page_base + pageout_count] = p;
		++pageout_count;
		++is;
	}

	/*
	 * If we exhausted our forward scan, continue with the reverse scan
	 * when possible, even past a page boundary.  This catches boundary
	 * conditions.
	 */
	if (ib && pageout_count < vm_pageout_page_count)
		goto more;

	/*
	 * we allow reads during pageouts...
	 */
	return vm_pageout_flush(&mc[page_base], pageout_count, 0);
}

/*
 * vm_pageout_flush() - launder the given pages
 *
 *	The given pages are laundered.  Note that we set up for the start of
 *	I/O (i.e. busy the page), mark it read-only, and bump the object
 *	reference count all in here rather than in the parent.  If we want
 *	the parent to do more sophisticated things we may have to change
 *	the ordering.
 */

int
vm_pageout_flush(mc, count, flags)
	vm_page_t *mc;
	int count;
	int flags;
{
	register vm_object_t object;
	int pageout_status[count];
	int numpagedout = 0;
	int i;

	/*
	 * Initiate I/O.  Bump the vm_page_t->busy counter and
	 * mark the pages read-only.
	 *
	 * We do not have to fixup the clean/dirty bits here... we can
	 * allow the pager to do it after the I/O completes.
	 *
	 * NOTE! mc[i]->dirty may be partial or fragmented due to an
	 * edge case with file fragments.
	 */

	for (i = 0; i < count; i++) {
		KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL, ("vm_pageout_flush page %p index %d/%d: partially invalid page", mc[i], i, count));
		vm_page_io_start(mc[i]);
		vm_page_protect(mc[i], VM_PROT_READ);
	}

	object = mc[0]->object;
	vm_object_pip_add(object, count);

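	/*
	 * Hand the cluster to the pager.  Pageouts against the kernel_object
	 * are forced synchronous via OBJPC_SYNC; everything else may complete
	 * asynchronously and report VM_PAGER_PEND below.
	 */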
	vm_pager_put_pages(object, mc, count,
	    (flags | ((object == kernel_object) ? OBJPC_SYNC : 0)),
	    pageout_status);

	for (i = 0; i < count; i++) {
		vm_page_t mt = mc[i];

		switch (pageout_status[i]) {
		case VM_PAGER_OK:
			numpagedout++;
			break;
		case VM_PAGER_PEND:
			numpagedout++;
			break;
		case VM_PAGER_BAD:
			/*
			 * Page outside of range of object. Right now we
			 * essentially lose the changes by pretending it
			 * worked.
			 */
			pmap_clear_modify(mt);
			vm_page_undirty(mt);
			break;
		case VM_PAGER_ERROR:
		case VM_PAGER_FAIL:
			/*
			 * If the page couldn't be paged out, then reactivate
			 * it so it doesn't clog the inactive list.  (We will
			 * try paging it out again later.)
			 */
			vm_page_activate(mt);
			break;
		case VM_PAGER_AGAIN:
			break;
		}

		/*
		 * If the operation is still going, leave the page busy to
		 * block all other accesses. Also, leave the paging in
		 * progress indicator set so that we don't attempt an object
		 * collapse.
		 */
		if (pageout_status[i] != VM_PAGER_PEND) {
			vm_object_pip_wakeup(object);
			vm_page_io_finish(mt);
			if (!vm_page_count_severe() || !vm_page_try_to_cache(mt))
				vm_page_protect(mt, VM_PROT_READ);
		}
	}
	return numpagedout;
}

#if !defined(NO_SWAPPING)
/*
 *	vm_pageout_object_deactivate_pages
 *
 *	Deactivate enough pages to satisfy the inactive target
 *	requirements, or, if vm_page_proc_limit is set, deactivate
 *	all of the pages in the object and its backing_objects.
 *
 *	The object and map must be locked.
 */
static void
vm_pageout_object_deactivate_pages(map, object, desired, map_remove_only)
	vm_map_t map;
	vm_object_t object;
	vm_pindex_t desired;
	int map_remove_only;
{
	register vm_page_t p, next;
	int rcount;
	int remove_mode;
	int s;

	if (object->type == OBJT_DEVICE || object->type == OBJT_PHYS)
		return;

	while (object) {
		if (pmap_resident_count(vm_map_pmap(map)) <= desired)
			return;
		if (object->paging_in_progress)
			return;

		remove_mode = map_remove_only;
		if (object->shadow_count > 1)
			remove_mode = 1;
		/*
		 * scan the object's entire memory queue
		 */
		rcount = object->resident_page_count;
		p = TAILQ_FIRST(&object->memq);
		while (p && (rcount-- > 0)) {
			int actcount;
			if (pmap_resident_count(vm_map_pmap(map)) <= desired)
				return;
			next = TAILQ_NEXT(p, listq);
			cnt.v_pdpages++;
			if (p->wire_count != 0 ||
			    p->hold_count != 0 ||
			    p->busy != 0 ||
			    (p->flags & (PG_BUSY|PG_UNMANAGED)) ||
			    !pmap_page_exists(vm_map_pmap(map), p)) {
				p = next;
				continue;
			}

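			/*
			 * Merge the hardware reference bits with the software
			 * PG_REFERENCED flag: a reference from either source
			 * counts when deciding whether to keep the page
			 * active.
			 */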
			actcount = pmap_ts_referenced(p);
			if (actcount) {
				vm_page_flag_set(p, PG_REFERENCED);
			} else if (p->flags & PG_REFERENCED) {
				actcount = 1;
			}

			if ((p->queue != PQ_ACTIVE) &&
				(p->flags & PG_REFERENCED)) {
				vm_page_activate(p);
				p->act_count += actcount;
				vm_page_flag_clear(p, PG_REFERENCED);
			} else if (p->queue == PQ_ACTIVE) {
				if ((p->flags & PG_REFERENCED) == 0) {
					p->act_count -= min(p->act_count, ACT_DECLINE);
					if (!remove_mode && (vm_pageout_algorithm_lru || (p->act_count == 0))) {
						vm_page_protect(p, VM_PROT_NONE);
						vm_page_deactivate(p);
					} else {
						s = splvm();
						TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE].pl, p, pageq);
						TAILQ_INSERT_TAIL(&vm_page_queues[PQ_ACTIVE].pl, p, pageq);
						splx(s);
					}
				} else {
					vm_page_activate(p);
					vm_page_flag_clear(p, PG_REFERENCED);
					if (p->act_count < (ACT_MAX - ACT_ADVANCE))
						p->act_count += ACT_ADVANCE;
					s = splvm();
					TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE].pl, p, pageq);
					TAILQ_INSERT_TAIL(&vm_page_queues[PQ_ACTIVE].pl, p, pageq);
					splx(s);
				}
			} else if (p->queue == PQ_INACTIVE) {
				vm_page_protect(p, VM_PROT_NONE);
			}
			p = next;
		}
		object = object->backing_object;
	}
	return;
}

/*
 * deactivate some number of pages in a map, try to do it fairly, but
 * that is really hard to do.
 */
static void
vm_pageout_map_deactivate_pages(map, desired)
	vm_map_t map;
	vm_pindex_t desired;
{
	vm_map_entry_t tmpe;
	vm_object_t obj, bigobj;

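	/*
	 * Try the map lock without sleeping; if another thread holds it,
	 * skip this map rather than block.
	 */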
	if (lockmgr(&map->lock, LK_EXCLUSIVE | LK_NOWAIT, (void *)0, curproc)) {
		return;
	}

	bigobj = NULL;

	/*
	 * first, search out the biggest object, and try to free pages from
	 * that.
	 */
	tmpe = map->header.next;
	while (tmpe != &map->header) {
		if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
			obj = tmpe->object.vm_object;
			if ((obj != NULL) && (obj->shadow_count <= 1) &&
				((bigobj == NULL) ||
				 (bigobj->resident_page_count < obj->resident_page_count))) {
				bigobj = obj;
			}
		}
		tmpe = tmpe->next;
	}

	if (bigobj)
		vm_pageout_object_deactivate_pages(map, bigobj, desired, 0);

	/*
	 * Next, hunt around for other pages to deactivate.  We actually
	 * do this search sort of wrong -- .text first is not the best idea.
	 */
	tmpe = map->header.next;
	while (tmpe != &map->header) {
		if (pmap_resident_count(vm_map_pmap(map)) <= desired)
			break;
		if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
			obj = tmpe->object.vm_object;
			if (obj)
				vm_pageout_object_deactivate_pages(map, obj, desired, 0);
		}
		tmpe = tmpe->next;
	}

	/*
	 * Remove all mappings if a process is swapped out, this will free page
	 * table pages.
	 */
	if (desired == 0)
		pmap_remove(vm_map_pmap(map),
			VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS);
	vm_map_unlock(map);
	return;
}
#endif

/*
 * Don't try to be fancy - being fancy can lead to VOP_LOCK's and therefore
 * to vnode deadlocks.  We only do it for OBJT_DEFAULT and OBJT_SWAP objects
 * which we know can be trivially freed.
 */

void
vm_pageout_page_free(vm_page_t m) {
	vm_object_t object = m->object;
	int type = object->type;

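	/*
	 * For the trivially-freeable object types, take a temporary
	 * reference so the object cannot be torn down while we busy and
	 * free what may be its last resident page.
	 */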
	if (type == OBJT_SWAP || type == OBJT_DEFAULT)
		vm_object_reference(object);
	vm_page_busy(m);
	vm_page_protect(m, VM_PROT_NONE);
	vm_page_free(m);
	if (type == OBJT_SWAP || type == OBJT_DEFAULT)
		vm_object_deallocate(object);
}

/*
 *	vm_pageout_scan does the dirty work for the pageout daemon.
 */
static int
vm_pageout_scan()
{
	vm_page_t m, next;
	struct vm_page marker;
	int page_shortage, maxscan, pcount;
	int addl_page_shortage, addl_page_shortage_init;
	int maxlaunder;
	struct proc *p, *bigproc;
	vm_offset_t size, bigsize;
	vm_object_t object;
	int force_wakeup = 0;
	int actcount;
	int vnodes_skipped = 0;
	int s;

	/*
	 * Do whatever cleanup that the pmap code can.
	 */
	pmap_collect();

	addl_page_shortage_init = vm_pageout_deficit;
	vm_pageout_deficit = 0;

	if (max_page_launder == 0)
		max_page_launder = 1;

	/*
	 * Calculate the number of pages we want to either free or move
	 * to the cache.  Be more aggressive if we aren't making our target.
	 */

	page_shortage = vm_paging_target() +
		addl_page_shortage_init + vm_pageout_actcmp;

	/*
	 * Figure out how aggressively we should flush dirty pages.
	 */
	{
		int factor = vm_pageout_actcmp;

		maxlaunder = cnt.v_inactive_target / 3 + factor;
		if (maxlaunder > max_page_launder + factor)
			maxlaunder = max_page_launder + factor;
	}

	/*
	 * Initialize our marker
	 */
	bzero(&marker, sizeof(marker));
	marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
	marker.queue = PQ_INACTIVE;
	marker.wire_count = 1;
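
	/*
	 * The marker is a fictitious, permanently busy page used only to
	 * remember our place in the inactive queue across potential blocking
	 * in vm_pageout_clean(); PG_MARKER tells other scans to skip it.
	 */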

	/*
	 * Start scanning the inactive queue for pages we can move to the
	 * cache or free.  The scan will stop when the target is reached or
	 * we have scanned the entire inactive queue.  Note that m->act_count
	 * is not used to form decisions for the inactive queue, only for the
	 * active queue.
	 */

rescan0:
	addl_page_shortage = addl_page_shortage_init;
	maxscan = cnt.v_inactive_count;
	for (m = TAILQ_FIRST(&vm_page_queues[PQ_INACTIVE].pl);
	     m != NULL && maxscan-- > 0 && page_shortage > 0;
	     m = next) {

		cnt.v_pdpages++;

		if (m->queue != PQ_INACTIVE) {
			goto rescan0;
		}

		next = TAILQ_NEXT(m, pageq);

		/*
		 * skip marker pages
		 */
		if (m->flags & PG_MARKER)
			continue;

		if (m->hold_count) {
			s = splvm();
			TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
			TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
			splx(s);
			addl_page_shortage++;
			continue;
		}
		/*
		 * Don't mess with busy pages; keep them at the front of the
		 * queue, they are most likely being paged out.
		 */
		if (m->busy || (m->flags & PG_BUSY)) {
			addl_page_shortage++;
			continue;
		}

		/*
		 * If the object is not being used, we ignore previous
		 * references.
		 */
		if (m->object->ref_count == 0) {
			vm_page_flag_clear(m, PG_REFERENCED);
			pmap_clear_reference(m);

		/*
		 * Otherwise, if the page has been referenced while in the
		 * inactive queue, we bump the "activation count" upwards,
		 * making it less likely that the page will be added back to
		 * the inactive queue prematurely again.  Here we check the
		 * page tables (or emulated bits, if any), because the upper
		 * level VM system does not know anything about existing
		 * references.
		 */
		} else if (((m->flags & PG_REFERENCED) == 0) &&
			(actcount = pmap_ts_referenced(m))) {
			vm_page_activate(m);
			m->act_count += (actcount + ACT_ADVANCE);
			continue;
		}

		/*
		 * If the upper level VM system knows about any page
		 * references, we activate the page.  We also set the
		 * "activation count" higher than normal so that we are less
		 * likely to place the page back onto the inactive queue again.
		 */
		if ((m->flags & PG_REFERENCED) != 0) {
			vm_page_flag_clear(m, PG_REFERENCED);
			actcount = pmap_ts_referenced(m);
			vm_page_activate(m);
			m->act_count += (actcount + ACT_ADVANCE + 1);
			continue;
		}

		/*
		 * If the upper level VM system doesn't know anything about
		 * the page being dirty, we have to check for it again.  As
		 * far as the VM code knows, any partially dirty pages are
		 * fully dirty.
		 */
		if (m->dirty == 0) {
			vm_page_test_dirty(m);
		} else {
			vm_page_dirty(m);
		}

		/*
		 * Invalid pages can be easily freed
		 */
		if (m->valid == 0) {
			vm_pageout_page_free(m);
			cnt.v_dfree++;
			--page_shortage;

		/*
		 * Clean pages can be placed onto the cache queue.  This
		 * effectively frees them.
		 */
		} else if (m->dirty == 0) {
			vm_page_cache(m);
			--page_shortage;

		/*
		 * Dirty pages need to be paged out.  Note that we clean
		 * only a limited number of pages per pagedaemon pass.
		 */
		} else if (maxlaunder > 0) {
			int swap_pageouts_ok;
			struct vnode *vp = NULL;
			struct mount *mp;

			object = m->object;

			if ((object->type != OBJT_SWAP) && (object->type != OBJT_DEFAULT)) {
				swap_pageouts_ok = 1;
			} else {
				swap_pageouts_ok = !(defer_swap_pageouts || disable_swap_pageouts);
				swap_pageouts_ok |= (!disable_swap_pageouts && defer_swap_pageouts &&
				vm_page_count_min());

			}
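
			/*
			 * Dirty swap-backed pages are skipped while swap
			 * pageouts are deferred or disabled, except that a
			 * deferral is overridden once free memory becomes
			 * critically low.
			 */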

			/*
			 * We don't bother paging objects that are "dead".
			 * Those objects are in a "rundown" state.
			 */
			if (!swap_pageouts_ok || (object->flags & OBJ_DEAD)) {
				s = splvm();
				TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
				TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
				splx(s);
				continue;
			}

			/*
			 * Presumably we have sufficient free memory to do
			 * the more sophisticated checks and locking required
			 * for vnodes.
			 *
			 * The object is already known NOT to be dead.  The
			 * vget() may still block, though, because
			 * VOP_ISLOCKED() doesn't check to see if an inode
			 * (v_data) is associated with the vnode.  If it isn't,
			 * vget() will load it in from disk.  Worse, vget()
			 * may actually get stuck waiting on "inode" if another
			 * process is in the process of bringing the inode in.
			 * This is bad news for us either way.
			 *
			 * So for the moment we check v_data == NULL as a
			 * workaround.  This means that vnodes which do not
			 * use v_data in the way we expect probably will not
			 * wind up being paged out by the pager and it will be
			 * up to the syncer to get them.  That's better than
			 * us blocking here.
			 *
			 * This whole code section is bogus - we need to fix
			 * the vnode pager to handle vm_page_t's without us
			 * having to do any sophisticated VOP tests.
			 */

			if (object->type == OBJT_VNODE) {
				vp = object->handle;

				mp = NULL;
				if (vp->v_type == VREG)
					vn_start_write(vp, &mp, V_NOWAIT);
				if (VOP_ISLOCKED(vp, NULL) ||
				    vp->v_data == NULL ||
				    vget(vp, LK_EXCLUSIVE|LK_NOOBJ, curproc)) {
					vn_finished_write(mp);
					if ((m->queue == PQ_INACTIVE) &&
						(m->hold_count == 0) &&
						(m->busy == 0) &&
						(m->flags & PG_BUSY) == 0) {
						s = splvm();
						TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
						TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
						splx(s);
					}
					if (object->flags & OBJ_MIGHTBEDIRTY)
						vnodes_skipped++;
					continue;
				}

				/*
				 * The page might have been moved to another
				 * queue during potential blocking in vget()
				 * above.  The page might have been freed and
				 * reused for another vnode.  The object might
				 * have been reused for another vnode.
				 */
				if (m->queue != PQ_INACTIVE ||
				    m->object != object ||
				    object->handle != vp) {
					if (object->flags & OBJ_MIGHTBEDIRTY)
						vnodes_skipped++;
					vput(vp);
					vn_finished_write(mp);
					continue;
				}

				/*
				 * The page may have been busied during the
				 * blocking in vget().  We don't move the
				 * page back onto the end of the queue;
				 * statistics are more accurate that way.
				 */
				if (m->busy || (m->flags & PG_BUSY)) {
					vput(vp);
					vn_finished_write(mp);
					continue;
				}

				/*
				 * If the page has become held, then skip it
				 */
				if (m->hold_count) {
					s = splvm();
					TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
					TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
					splx(s);
					if (object->flags & OBJ_MIGHTBEDIRTY)
						vnodes_skipped++;
					vput(vp);
					vn_finished_write(mp);
					continue;
				}
			}

			/*
			 * If a page is dirty, then it is either being washed
			 * (but not yet cleaned) or it is still in the
			 * laundry.  If it is still in the laundry, then we
			 * start the cleaning operation.  maxlaunder nominally
			 * counts I/O cost (seeks) rather than bytes.
			 *
			 * This operation may cluster, invalidating the 'next'
			 * pointer.  To prevent an inordinate number of
			 * restarts we use our marker to remember our place.
			 */
			s = splvm();
			TAILQ_INSERT_AFTER(&vm_page_queues[PQ_INACTIVE].pl, m, &marker, pageq);
			splx(s);
			if (vm_pageout_clean(m) != 0)
				--maxlaunder;
			s = splvm();
			next = TAILQ_NEXT(&marker, pageq);
			TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl, &marker, pageq);
			splx(s);
			if (vp) {
				vput(vp);
				vn_finished_write(mp);
			}
		}
	}

	/*
	 * If we were not able to meet our target, increase actcmp
	 */

	if (vm_page_count_min()) {
		if (vm_pageout_actcmp < ACT_MAX / 2)
			vm_pageout_actcmp += ACT_ADVANCE;
	} else {
		if (vm_pageout_actcmp < ACT_DECLINE)
			vm_pageout_actcmp = 0;
		else
			vm_pageout_actcmp -= ACT_DECLINE;
	}
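
	/*
	 * vm_pageout_actcmp raises the act_count threshold used in the
	 * active-queue scan below, making the scan more willing to
	 * deactivate pages after a shortfall; it decays back toward zero
	 * once memory pressure eases.
	 */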

	/*
	 * Compute the number of pages we want to try to move from the
	 * active queue to the inactive queue.
	 */

	page_shortage = vm_paging_target() +
		cnt.v_inactive_target - cnt.v_inactive_count;
	page_shortage += addl_page_shortage;
	page_shortage += vm_pageout_actcmp;

	/*
	 * Scan the active queue for things we can deactivate. We nominally
	 * track the per-page activity counter and use it to locate
	 * deactivation candidates.
	 */

	pcount = cnt.v_active_count;
	m = TAILQ_FIRST(&vm_page_queues[PQ_ACTIVE].pl);

	while ((m != NULL) && (pcount-- > 0) && (page_shortage > 0)) {

		/*
		 * This is a consistency check, and should likely be a panic
		 * or warning.
		 */
		if (m->queue != PQ_ACTIVE) {
			break;
		}

		next = TAILQ_NEXT(m, pageq);
		/*
		 * Don't deactivate pages that are busy.
		 */
		if ((m->busy != 0) ||
		    (m->flags & PG_BUSY) ||
		    (m->hold_count != 0)) {
			s = splvm();
			TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE].pl, m, pageq);
			TAILQ_INSERT_TAIL(&vm_page_queues[PQ_ACTIVE].pl, m, pageq);
			splx(s);
			m = next;
			continue;
		}

		/*
		 * The count for pagedaemon pages is done after checking the
		 * page for eligibility...
		 */
		cnt.v_pdpages++;

		/*
		 * Check to see "how much" the page has been used.
		 */
		actcount = 0;
		if (m->object->ref_count != 0) {
			if (m->flags & PG_REFERENCED) {
				actcount += 1;
			}
			actcount += pmap_ts_referenced(m);
			if (actcount) {
				m->act_count += ACT_ADVANCE + actcount;
				if (m->act_count > ACT_MAX)
					m->act_count = ACT_MAX;
			}
		}

		/*
		 * Since we have "tested" this bit, we need to clear it now.
		 */
		vm_page_flag_clear(m, PG_REFERENCED);

		/*
		 * Only if an object is currently being used, do we use the
		 * page activation count stats.
		 */
		if (actcount && (m->object->ref_count != 0)) {
			s = splvm();
			TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE].pl, m, pageq);
			TAILQ_INSERT_TAIL(&vm_page_queues[PQ_ACTIVE].pl, m, pageq);
			splx(s);
		} else {
			m->act_count -= min(m->act_count, ACT_DECLINE);
			if (vm_pageout_algorithm_lru ||
			    (m->object->ref_count == 0) ||
			    (m->act_count <= vm_pageout_actcmp)) {
				page_shortage--;
				if (m->object->ref_count == 0) {
					vm_page_protect(m, VM_PROT_NONE);
					if (m->dirty == 0)
						vm_page_cache(m);
					else
						vm_page_deactivate(m);
				} else {
					vm_page_deactivate(m);
				}
			} else {
				s = splvm();
				TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE].pl, m, pageq);
				TAILQ_INSERT_TAIL(&vm_page_queues[PQ_ACTIVE].pl, m, pageq);
				splx(s);
			}
		}
		m = next;
	}

	s = splvm();

	/*
	 * We try to maintain some *really* free pages, this allows interrupt
	 * code to be guaranteed space.  Since both cache and free queues
	 * are considered basically 'free', moving pages from cache to free
	 * does not affect other calculations.
	 */

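	/*
	 * cache_rover walks the PQ_CACHE queues with a PQ_PRIME2 stride so
	 * that successive frees are spread across the page-coloring buckets.
	 */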
	while (cnt.v_free_count < cnt.v_free_reserved) {
		static int cache_rover = 0;
		m = vm_page_list_find(PQ_CACHE, cache_rover, FALSE);
		if (!m)
			break;
		if ((m->flags & (PG_BUSY|PG_UNMANAGED)) ||
		    m->busy ||
		    m->hold_count ||
		    m->wire_count) {
#ifdef INVARIANTS
			printf("Warning: busy page %p found in cache\n", m);
#endif
			vm_page_deactivate(m);
			continue;
		}
		cache_rover = (cache_rover + PQ_PRIME2) & PQ_L2_MASK;
		vm_pageout_page_free(m);
		cnt.v_dfree++;
	}
	splx(s);

#if !defined(NO_SWAPPING)
	/*
	 * Idle process swapout -- run once per second.
	 */
	if (vm_swap_idle_enabled) {
		static long lsec;
		if (time_second != lsec) {
			vm_pageout_req_swapout |= VM_SWAP_IDLE;
			vm_req_vmdaemon();
			lsec = time_second;
		}
	}
#endif

	/*
	 * If we didn't get enough free pages, and we have skipped a vnode
	 * in a writeable object, wakeup the sync daemon.  And kick swapout
	 * if we did not get enough free pages.
	 */
	if (vm_paging_target() > 0) {
		if (vnodes_skipped && vm_page_count_min())
			(void) speedup_syncer();
#if !defined(NO_SWAPPING)
		if (vm_swap_enabled && vm_page_count_target()) {
			vm_req_vmdaemon();
			vm_pageout_req_swapout |= VM_SWAP_NORMAL;
		}
#endif
	}

	/*
	 * Make sure that we have swap space -- if we are low on both memory
	 * and swap, kill the biggest process.
	 */
	if ((vm_swap_size < 64 || swap_pager_full) && vm_page_count_min()) {
		bigproc = NULL;
		bigsize = 0;
		ALLPROC_LOCK(AP_SHARED);
		for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) {
			/*
			 * Skip system processes, processes held in core
			 * (p_lock), init, and low-pid daemons while swap
			 * space remains.
			 */
			if ((p->p_flag & P_SYSTEM) || (p->p_lock > 0) ||
			    (p->p_pid == 1) ||
			    ((p->p_pid < 48) && (vm_swap_size != 0))) {
				continue;
			}
			/*
			 * if the process is in a non-running type state,
			 * don't touch it.
			 */
			mtx_enter(&sched_lock, MTX_SPIN);
			if (p->p_stat != SRUN && p->p_stat != SSLEEP) {
				mtx_exit(&sched_lock, MTX_SPIN);
				continue;
			}
			mtx_exit(&sched_lock, MTX_SPIN);
			/*
			 * get the process size
			 */
			size = vmspace_resident_count(p->p_vmspace);
			/*
			 * if this process is bigger than the biggest so far,
			 * remember it.
			 */
			if (size > bigsize) {
				bigproc = p;
				bigsize = size;
			}
		}
		ALLPROC_LOCK(AP_RELEASE);
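		/*
		 * Kill the largest eligible process: reset its priority so
		 * it can run and exit quickly, and wake up anyone sleeping
		 * on free pages.
		 */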
		if (bigproc != NULL) {
			killproc(bigproc, "out of swap space");
			bigproc->p_estcpu = 0;
			bigproc->p_nice = PRIO_MIN;
			resetpriority(bigproc);
			wakeup(&cnt.v_free_count);
		}
	}
	return force_wakeup;
}

/*
 * This routine tries to maintain the pseudo-LRU active queue so that
 * some statistics accumulation still occurs during long periods when
 * there is no paging.  This helps the situation where paging just
 * starts to occur.
 */
static void
vm_pageout_page_stats()
{
	int s;
	vm_page_t m,next;
	int pcount,tpcount;		/* Number of pages to check */
	static int fullintervalcount = 0;
	int page_shortage;
	int s0;

	page_shortage =
	    (cnt.v_inactive_target + cnt.v_cache_max + cnt.v_free_min) -
	    (cnt.v_free_count + cnt.v_inactive_count + cnt.v_cache_count);

	if (page_shortage <= 0)
		return;

	s0 = splvm();

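	/*
	 * Between full scans, only examine a slice of the active queue
	 * proportional to vm_pageout_stats_max, so this background scan
	 * stays cheap.
	 */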
	pcount = cnt.v_active_count;
	fullintervalcount += vm_pageout_stats_interval;
	if (fullintervalcount < vm_pageout_full_stats_interval) {
		tpcount = (vm_pageout_stats_max * cnt.v_active_count) / cnt.v_page_count;
		if (pcount > tpcount)
			pcount = tpcount;
	} else {
		fullintervalcount = 0;
	}

	m = TAILQ_FIRST(&vm_page_queues[PQ_ACTIVE].pl);
	while ((m != NULL) && (pcount-- > 0)) {
		int actcount;

		if (m->queue != PQ_ACTIVE) {
			break;
		}

		next = TAILQ_NEXT(m, pageq);
		/*
		 * Don't deactivate pages that are busy.
		 */
		if ((m->busy != 0) ||
		    (m->flags & PG_BUSY) ||
		    (m->hold_count != 0)) {
			s = splvm();
			TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE].pl, m, pageq);
			TAILQ_INSERT_TAIL(&vm_page_queues[PQ_ACTIVE].pl, m, pageq);
			splx(s);
			m = next;
			continue;
		}

		actcount = 0;
		if (m->flags & PG_REFERENCED) {
			vm_page_flag_clear(m, PG_REFERENCED);
			actcount += 1;
		}

		actcount += pmap_ts_referenced(m);
		if (actcount) {
			m->act_count += ACT_ADVANCE + actcount;
			if (m->act_count > ACT_MAX)
				m->act_count = ACT_MAX;
			s = splvm();
			TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE].pl, m, pageq);
			TAILQ_INSERT_TAIL(&vm_page_queues[PQ_ACTIVE].pl, m, pageq);
			splx(s);
		} else {
			if (m->act_count == 0) {
				/*
				 * We turn off page access, so that we have
				 * more accurate RSS stats.  We don't do this
				 * in the normal page deactivation when the
				 * system is loaded VM wise, because the cost
				 * of the large number of page protect
				 * operations would be higher than the value
				 * of doing the operation.
				 */
				vm_page_protect(m, VM_PROT_NONE);
				vm_page_deactivate(m);
			} else {
				m->act_count -= min(m->act_count, ACT_DECLINE);
				s = splvm();
				TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE].pl, m, pageq);
				TAILQ_INSERT_TAIL(&vm_page_queues[PQ_ACTIVE].pl, m, pageq);
				splx(s);
			}
		}

		m = next;
	}
	splx(s0);
}

static int
vm_pageout_free_page_calc(count)
vm_size_t count;
{
	if (count < cnt.v_page_count)
		return 0;
	/*
	 * free_reserved needs to include enough for the largest swap pager
	 * structures plus enough for any pv_entry structs when paging.
	 */
	if (cnt.v_page_count > 1024)
		cnt.v_free_min = 4 + (cnt.v_page_count - 1024) / 200;
	else
		cnt.v_free_min = 4;
	cnt.v_pageout_free_min = (2*MAXBSIZE)/PAGE_SIZE +
		cnt.v_interrupt_free_min;
	cnt.v_free_reserved = vm_pageout_page_count +
		cnt.v_pageout_free_min + (count / 768) + PQ_L2_SIZE;
	cnt.v_free_severe = cnt.v_free_min / 2;
	cnt.v_free_min += cnt.v_free_reserved;
	cnt.v_free_severe += cnt.v_free_reserved;
	return 1;
}


/*
 *	vm_pageout is the high level pageout daemon.
 */
static void
vm_pageout()
{

	mtx_enter(&Giant, MTX_DEF);

	/*
	 * Initialize some paging parameters.
	 */

	cnt.v_interrupt_free_min = 2;
	if (cnt.v_page_count < 2000)
		vm_pageout_page_count = 8;

	vm_pageout_free_page_calc(cnt.v_page_count);
	/*
	 * Scale v_free_target with available memory; the cache and inactive
	 * targets below are derived from it.
	 */
	if (cnt.v_free_count > 6144)
		cnt.v_free_target = 3 * cnt.v_free_min + cnt.v_free_reserved;
	else
		cnt.v_free_target = 2 * cnt.v_free_min + cnt.v_free_reserved;

	if (cnt.v_free_count > 2048) {
		cnt.v_cache_min = cnt.v_free_target;
		cnt.v_cache_max = 2 * cnt.v_cache_min;
		cnt.v_inactive_target = (3 * cnt.v_free_target) / 2;
	} else {
		cnt.v_cache_min = 0;
		cnt.v_cache_max = 0;
		cnt.v_inactive_target = cnt.v_free_count / 4;
	}
	if (cnt.v_inactive_target > cnt.v_free_count / 3)
		cnt.v_inactive_target = cnt.v_free_count / 3;

	/* XXX does not really belong here */
	if (vm_page_max_wired == 0)
		vm_page_max_wired = cnt.v_free_count / 3;

	if (vm_pageout_stats_max == 0)
		vm_pageout_stats_max = cnt.v_free_target;

	/*
	 * Set interval in seconds for stats scan.
	 */
	if (vm_pageout_stats_interval == 0)
		vm_pageout_stats_interval = 5;
	if (vm_pageout_full_stats_interval == 0)
		vm_pageout_full_stats_interval = vm_pageout_stats_interval * 4;


	/*
	 * Set maximum free per pass
	 */
	if (vm_pageout_stats_free_max == 0)
		vm_pageout_stats_free_max = 5;

	max_page_launder = (cnt.v_page_count > 1800 ? 32 : 16);

	curproc->p_flag |= P_BUFEXHAUST;
	swap_pager_swap_init();
	/*
	 * The pageout daemon is never done, so loop forever.
	 */
	while (TRUE) {
		int error;
		int s = splvm();

		/*
		 * If we have enough free memory, wake up waiters.  Do
		 * not clear vm_pages_needed until we reach our target,
		 * otherwise we may be woken up over and over again and
		 * waste a lot of cpu.
		 */
		if (vm_pages_needed && !vm_page_count_min()) {
			if (vm_paging_needed() <= 0)
				vm_pages_needed = 0;
			wakeup(&cnt.v_free_count);
		}
		if (vm_pages_needed) {
			/*
			 * Still not done, sleep a bit and go again
			 */
			tsleep(&vm_pages_needed, PVM, "psleep", hz/2);
		} else {
			/*
			 * Good enough, sleep & handle stats
			 */
			error = tsleep(&vm_pages_needed,
				PVM, "psleep", vm_pageout_stats_interval * hz);
			if (error && !vm_pages_needed) {
				if (vm_pageout_actcmp > 0)
					--vm_pageout_actcmp;
				splx(s);
				vm_pageout_page_stats();
				continue;
			}
		}

		if (vm_pages_needed)
			cnt.v_pdwakeups++;
		splx(s);
		vm_pageout_scan();
		vm_pageout_deficit = 0;
	}
}

void
pagedaemon_wakeup()
{
	if (!vm_pages_needed && curproc != pageproc) {
		vm_pages_needed++;
		wakeup(&vm_pages_needed);
	}
}

#if !defined(NO_SWAPPING)
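/*
 * Rate-limit wakeups of the vm daemon to roughly once per second; the
 * second comparison handles the ticks counter wrapping around.
 */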
static void
vm_req_vmdaemon()
{
	static int lastrun = 0;

	if ((ticks > (lastrun + hz)) || (ticks < lastrun)) {
		wakeup(&vm_daemon_needed);
		lastrun = ticks;
	}
}

static void
vm_daemon()
{
	struct proc *p;

	mtx_enter(&Giant, MTX_DEF);

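	/*
	 * Sleep until vm_req_vmdaemon() asks for help, then perform any
	 * requested whole-process swapouts and enforce per-process RSS
	 * limits by deactivating pages beyond each limit.
	 */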
	while (TRUE) {
		tsleep(&vm_daemon_needed, PPAUSE, "psleep", 0);
		if (vm_pageout_req_swapout) {
			swapout_procs(vm_pageout_req_swapout);
			vm_pageout_req_swapout = 0;
		}
		/*
		 * Scan the processes: deactivate pages for any process that
		 * exceeds its RSS rlimit or is swapped out.
		 */

		ALLPROC_LOCK(AP_SHARED);
		for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) {
			vm_pindex_t limit, size;

			/*
			 * if this is a system process or a process that is
			 * exiting, skip it.
			 */
			if (p->p_flag & (P_SYSTEM | P_WEXIT)) {
				continue;
			}
			/*
			 * if the process is in a non-running type state,
			 * don't touch it.
			 */
			mtx_enter(&sched_lock, MTX_SPIN);
			if (p->p_stat != SRUN && p->p_stat != SSLEEP) {
				mtx_exit(&sched_lock, MTX_SPIN);
				continue;
			}
			mtx_exit(&sched_lock, MTX_SPIN);
			/*
			 * get a limit
			 */
			limit = OFF_TO_IDX(
			    qmin(p->p_rlimit[RLIMIT_RSS].rlim_cur,
				p->p_rlimit[RLIMIT_RSS].rlim_max));

			/*
			 * Let processes that are swapped out really be
			 * swapped out: set the limit to nothing to force a
			 * swap-out.
			 */
			if ((p->p_flag & P_INMEM) == 0)
				limit = 0;	/* XXX */

			size = vmspace_resident_count(p->p_vmspace);
			if (limit >= 0 && size >= limit) {
				vm_pageout_map_deactivate_pages(
				    &p->p_vmspace->vm_map, limit);
			}
		}
		ALLPROC_LOCK(AP_RELEASE);
	}
}
#endif
