/*-
 * Copyright (c) 1991 Regents of the University of California.
 * All rights reserved.
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 * Copyright (c) 1994 David Greenman
 * All rights reserved.
 * Copyright (c) 2005 Yahoo! Technologies Norway AS
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)vm_pageout.c	7.4 (Berkeley) 5/7/91
 *
 *
 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/*
 *	The proverbial page-out daemon.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/vm/vm_pageout.c 212360 2010-09-09 13:32:58Z nwhitehorn $");

#include "opt_vm.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/eventhandler.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/ktr.h>
#include <sys/mount.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#include <sys/vnode.h>
#include <sys/vmmeter.h>
#include <sys/sx.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/swap_pager.h>
#include <vm/vm_extern.h>
#include <vm/uma.h>

/*
 * System initialization
 */

/* the kernel process "vm_pageout" */
static void vm_pageout(void);
static int vm_pageout_clean(vm_page_t);
static void vm_pageout_scan(int pass);

struct proc *pageproc;

static struct kproc_desc page_kp = {
	"pagedaemon",
	vm_pageout,
	&pageproc
};
SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, kproc_start,
    &page_kp);

#if !defined(NO_SWAPPING)
/* the kernel process "vm_daemon" */
static void vm_daemon(void);
static struct	proc *vmproc;

static struct kproc_desc vm_kp = {
	"vmdaemon",
	vm_daemon,
	&vmproc
};
SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp);
#endif

int vm_pages_needed;		/* Event on which pageout daemon sleeps */
int vm_pageout_deficit;		/* Estimated number of pages deficit */
int vm_pageout_pages_needed;	/* flag saying that the pageout daemon needs pages */

#if !defined(NO_SWAPPING)
static int vm_pageout_req_swapout;	/* XXX */
static int vm_daemon_needed;
static struct mtx vm_daemon_mtx;
/* Allow for use by vm_pageout before vm_daemon is initialized. */
MTX_SYSINIT(vm_daemon, &vm_daemon_mtx, "vm daemon", MTX_DEF);
#endif
static int vm_max_launder = 32;
static int vm_pageout_stats_max = 0, vm_pageout_stats_interval = 0;
static int vm_pageout_full_stats_interval = 0;
static int vm_pageout_algorithm = 0;
static int defer_swap_pageouts = 0;
static int disable_swap_pageouts = 0;

#if defined(NO_SWAPPING)
static int vm_swap_enabled = 0;
static int vm_swap_idle_enabled = 0;
#else
static int vm_swap_enabled = 1;
static int vm_swap_idle_enabled = 0;
#endif

SYSCTL_INT(_vm, VM_PAGEOUT_ALGORITHM, pageout_algorithm,
	CTLFLAG_RW, &vm_pageout_algorithm, 0, "LRU page mgmt");

SYSCTL_INT(_vm, OID_AUTO, max_launder,
	CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout");

SYSCTL_INT(_vm, OID_AUTO, pageout_stats_max,
	CTLFLAG_RW, &vm_pageout_stats_max, 0, "Max pageout stats scan length");

SYSCTL_INT(_vm, OID_AUTO, pageout_full_stats_interval,
	CTLFLAG_RW, &vm_pageout_full_stats_interval, 0, "Interval for full stats scan");

SYSCTL_INT(_vm, OID_AUTO, pageout_stats_interval,
	CTLFLAG_RW, &vm_pageout_stats_interval, 0, "Interval for partial stats scan");

#if defined(NO_SWAPPING)
SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
	CTLFLAG_RD, &vm_swap_enabled, 0, "Enable entire process swapout");
SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
	CTLFLAG_RD, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria");
#else
SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
	CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout");
SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
	CTLFLAG_RW, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria");
#endif

SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts,
	CTLFLAG_RW, &defer_swap_pageouts, 0, "Give preference to dirty pages in mem");

SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts,
	CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages");

static int pageout_lock_miss;
SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss,
	CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout");

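/*
 * Maximum number of pages that vm_pageout_clean() will gather into a
 * single clustered pageout request.
 */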
20552558Sobrienint vm_pageout_page_count = VM_PAGEOUT_PAGE_COUNT;
20652558Sobrien
20752558Sobrienint vm_page_max_wired;		/* XXX max # of wired pages system-wide */
20852558SobrienSYSCTL_INT(_vm, OID_AUTO, max_wired,
20952558Sobrien	CTLFLAG_RW, &vm_page_max_wired, 0, "System-wide limit to wired page count");
21052558Sobrien
21152558Sobrien#if !defined(NO_SWAPPING)
21252558Sobrienstatic void vm_pageout_map_deactivate_pages(vm_map_t, long);
21352558Sobrienstatic void vm_pageout_object_deactivate_pages(pmap_t, vm_object_t, long);
21452558Sobrienstatic void vm_req_vmdaemon(int req);
21552558Sobrien#endif
21652558Sobrienstatic void vm_pageout_page_stats(void);
21752558Sobrien
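
/*
 * Initialize a dummy page used only as a queue marker: it is never mapped
 * or paged out, it simply records a position in a page queue while that
 * queue's lock is temporarily dropped.
 */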
static void
vm_pageout_init_marker(vm_page_t marker, u_short queue)
{

	bzero(marker, sizeof(*marker));
	marker->flags = PG_FICTITIOUS | PG_MARKER;
	marker->oflags = VPO_BUSY;
	marker->queue = queue;
	marker->wire_count = 1;
}

/*
 * vm_pageout_fallback_object_lock:
 *
 * Lock vm object currently associated with `m'. VM_OBJECT_TRYLOCK is
 * known to have failed and page queue must be either PQ_ACTIVE or
 * PQ_INACTIVE.  To avoid lock order violation, unlock the page queues
 * while locking the vm object.  Use marker page to detect page queue
 * changes and maintain notion of next page on page queue.  Return
 * TRUE if no changes were detected, FALSE otherwise.  vm object is
 * locked on return.
 *
 * This function depends on both the lock portion of struct vm_object
 * and normal struct vm_page being type stable.
 */
boolean_t
vm_pageout_fallback_object_lock(vm_page_t m, vm_page_t *next)
{
	struct vm_page marker;
	boolean_t unchanged;
	u_short queue;
	vm_object_t object;

	queue = m->queue;
	vm_pageout_init_marker(&marker, queue);
	object = m->object;

	TAILQ_INSERT_AFTER(&vm_page_queues[queue].pl,
			   m, &marker, pageq);
	vm_page_unlock_queues();
	vm_page_unlock(m);
	VM_OBJECT_LOCK(object);
	vm_page_lock(m);
	vm_page_lock_queues();

	/* Page queue might have changed. */
	*next = TAILQ_NEXT(&marker, pageq);
	unchanged = (m->queue == queue &&
		     m->object == object &&
		     &marker == TAILQ_NEXT(m, pageq));
	TAILQ_REMOVE(&vm_page_queues[queue].pl,
		     &marker, pageq);
	return (unchanged);
}

/*
 * Lock the page while holding the page queue lock.  Use marker page
 * to detect page queue changes and maintain notion of next page on
 * page queue.  Return TRUE if no changes were detected, FALSE
 * otherwise.  The page is locked on return. The page queue lock might
 * be dropped and reacquired.
 *
 * This function depends on normal struct vm_page being type stable.
 */
boolean_t
vm_pageout_page_lock(vm_page_t m, vm_page_t *next)
{
	struct vm_page marker;
	boolean_t unchanged;
	u_short queue;

	vm_page_lock_assert(m, MA_NOTOWNED);
	mtx_assert(&vm_page_queue_mtx, MA_OWNED);

	if (vm_page_trylock(m))
		return (TRUE);

	queue = m->queue;
	vm_pageout_init_marker(&marker, queue);

	TAILQ_INSERT_AFTER(&vm_page_queues[queue].pl, m, &marker, pageq);
	vm_page_unlock_queues();
	vm_page_lock(m);
	vm_page_lock_queues();

	/* Page queue might have changed. */
	*next = TAILQ_NEXT(&marker, pageq);
	unchanged = (m->queue == queue && &marker == TAILQ_NEXT(m, pageq));
	TAILQ_REMOVE(&vm_page_queues[queue].pl, &marker, pageq);
	return (unchanged);
}

/*
 * vm_pageout_clean:
 *
 * Clean the page and remove it from the laundry.
 *
 * We set the busy bit to cause potential page faults on this page to
 * block.  Note the careful timing, however, the busy bit isn't set till
 * late and we cannot do anything that will mess with the page.
 */
static int
vm_pageout_clean(vm_page_t m)
{
	vm_object_t object;
	vm_page_t mc[2*vm_pageout_page_count], pb, ps;
	int pageout_count;
	int ib, is, page_base;
	vm_pindex_t pindex = m->pindex;

	vm_page_lock_assert(m, MA_OWNED);
	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);

	/*
	 * It doesn't cost us anything to pageout OBJT_DEFAULT or OBJT_SWAP
	 * with the new swapper, but we could have serious problems paging
	 * out other object types if there is insufficient memory.
	 *
	 * Unfortunately, checking free memory here is far too late, so the
	 * check has been moved up a procedural level.
	 */

	/*
	 * Can't clean the page if it's busy or held.
	 */
	KASSERT(m->busy == 0 && (m->oflags & VPO_BUSY) == 0,
	    ("vm_pageout_clean: page %p is busy", m));
	KASSERT(m->hold_count == 0, ("vm_pageout_clean: page %p is held", m));

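	/*
	 * The page starts at the midpoint of mc[] so the cluster can grow
	 * backwards (via pb and page_base) as well as forwards (via ps),
	 * up to vm_pageout_page_count pages in total.
	 */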
	mc[vm_pageout_page_count] = pb = ps = m;
	pageout_count = 1;
	page_base = vm_pageout_page_count;
	ib = 1;
	is = 1;

	/*
	 * Scan object for clusterable pages.
	 *
	 * We can cluster ONLY if: ->> the page is NOT
	 * clean, wired, busy, held, or mapped into a
	 * buffer, and one of the following:
	 * 1) The page is inactive, or a seldom used
	 *    active page.
	 * -or-
	 * 2) we force the issue.
	 *
	 * During heavy mmap/modification loads the pageout
	 * daemon can really fragment the underlying file
	 * due to flushing pages out of order and not trying
	 * to align the clusters (which leaves sporadic out-of-order
	 * holes).  To solve this problem we do the reverse scan
	 * first and attempt to align our cluster, then do a
	 * forward scan if room remains.
	 */
	object = m->object;
more:
	while (ib && pageout_count < vm_pageout_page_count) {
		vm_page_t p;

		if (ib > pindex) {
			ib = 0;
			break;
		}

		if ((p = vm_page_prev(pb)) == NULL ||
		    (p->oflags & VPO_BUSY) != 0 || p->busy != 0) {
			ib = 0;
			break;
		}
		vm_page_lock(p);
		vm_page_test_dirty(p);
		if (p->dirty == 0 ||
		    p->queue != PQ_INACTIVE ||
		    p->hold_count != 0) {	/* may be undergoing I/O */
			vm_page_unlock(p);
			ib = 0;
			break;
		}
		vm_page_unlock(p);
		mc[--page_base] = pb = p;
		++pageout_count;
		++ib;
		/*
		 * alignment boundary, stop here and switch directions.  Do
		 * not clear ib.
		 */
		if ((pindex - (ib - 1)) % vm_pageout_page_count == 0)
			break;
	}

	while (pageout_count < vm_pageout_page_count &&
	    pindex + is < object->size) {
		vm_page_t p;

		if ((p = vm_page_next(ps)) == NULL ||
		    (p->oflags & VPO_BUSY) != 0 || p->busy != 0)
			break;
		vm_page_lock(p);
		vm_page_test_dirty(p);
		if (p->dirty == 0 ||
		    p->queue != PQ_INACTIVE ||
		    p->hold_count != 0) {	/* may be undergoing I/O */
			vm_page_unlock(p);
			break;
		}
		vm_page_unlock(p);
		mc[page_base + pageout_count] = ps = p;
		++pageout_count;
		++is;
	}

	/*
	 * If we exhausted our forward scan, continue with the reverse scan
	 * when possible, even past a page boundary.  This catches boundary
	 * conditions.
	 */
	if (ib && pageout_count < vm_pageout_page_count)
		goto more;

	vm_page_unlock(m);
	/*
	 * we allow reads during pageouts...
	 */
	return (vm_pageout_flush(&mc[page_base], pageout_count, 0));
}

/*
 * vm_pageout_flush() - launder the given pages
 *
 *	The given pages are laundered.  Note that we setup for the start of
 *	I/O ( i.e. busy the page ), mark it read-only, and bump the object
 *	reference count all in here rather than in the parent.  If we want
 *	the parent to do more sophisticated things we may have to change
 *	the ordering.
 */
int
vm_pageout_flush(vm_page_t *mc, int count, int flags)
{
	vm_object_t object = mc[0]->object;
	int pageout_status[count];
	int numpagedout = 0;
	int i;

	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
	mtx_assert(&vm_page_queue_mtx, MA_NOTOWNED);

	/*
	 * Initiate I/O.  Bump the vm_page_t->busy counter and
	 * mark the pages read-only.
	 *
	 * We do not have to fixup the clean/dirty bits here... we can
	 * allow the pager to do it after the I/O completes.
	 *
	 * NOTE! mc[i]->dirty may be partial or fragmented due to an
	 * edge case with file fragments.
	 */
	for (i = 0; i < count; i++) {
		KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL,
		    ("vm_pageout_flush: partially invalid page %p index %d/%d",
			mc[i], i, count));
		vm_page_io_start(mc[i]);
		pmap_remove_write(mc[i]);
	}
	vm_object_pip_add(object, count);

	vm_pager_put_pages(object, mc, count, flags, pageout_status);

	for (i = 0; i < count; i++) {
		vm_page_t mt = mc[i];

		KASSERT(pageout_status[i] == VM_PAGER_PEND ||
		    (mt->flags & PG_WRITEABLE) == 0,
		    ("vm_pageout_flush: page %p is not write protected", mt));
		switch (pageout_status[i]) {
		case VM_PAGER_OK:
		case VM_PAGER_PEND:
			numpagedout++;
			break;
		case VM_PAGER_BAD:
			/*
			 * Page outside of range of object. Right now we
			 * essentially lose the changes by pretending it
			 * worked.
			 */
			vm_page_undirty(mt);
			break;
		case VM_PAGER_ERROR:
		case VM_PAGER_FAIL:
			/*
			 * If page couldn't be paged out, then reactivate the
			 * page so it doesn't clog the inactive list.  (We
			 * will try paging it out again later.)
			 */
			vm_page_lock(mt);
			vm_page_activate(mt);
			vm_page_unlock(mt);
			break;
		case VM_PAGER_AGAIN:
			break;
		}

		/*
		 * If the operation is still going, leave the page busy to
		 * block all other accesses. Also, leave the paging in
		 * progress indicator set so that we don't attempt an object
		 * collapse.
		 */
		if (pageout_status[i] != VM_PAGER_PEND) {
			vm_object_pip_wakeup(object);
			vm_page_io_finish(mt);
			if (vm_page_count_severe()) {
				vm_page_lock(mt);
				vm_page_try_to_cache(mt);
				vm_page_unlock(mt);
			}
		}
	}
	return (numpagedout);
}

53918334Speter/*
54018334Speter *	vm_pageout_object_deactivate_pages
54118334Speter *
54218334Speter *	Deactivate enough pages to satisfy the inactive target
54318334Speter *	requirements.
54418334Speter *
54550615Sobrien *	The object and map must be locked.
54650615Sobrien */
54750615Sobrienstatic void
54850615Sobrienvm_pageout_object_deactivate_pages(pmap_t pmap, vm_object_t first_object,
54950615Sobrien    long desired)
55052558Sobrien{
55152558Sobrien	vm_object_t backing_object, object;
55252558Sobrien	vm_page_t p;
55352558Sobrien	int actcount, remove_mode;
55418334Speter
55518334Speter	VM_OBJECT_LOCK_ASSERT(first_object, MA_OWNED);
55618334Speter	if (first_object->type == OBJT_DEVICE ||
55718334Speter	    first_object->type == OBJT_SG)
55818334Speter		return;
55918334Speter	for (object = first_object;; object = backing_object) {
56018334Speter		if (pmap_resident_count(pmap) <= desired)
56150615Sobrien			goto unlock_return;
56218334Speter		VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
56318334Speter		if (object->type == OBJT_PHYS || object->paging_in_progress)
56418334Speter			goto unlock_return;
56552558Sobrien
56652558Sobrien		remove_mode = 0;
56752558Sobrien		if (object->shadow_count > 1)
56852558Sobrien			remove_mode = 1;
56952558Sobrien		/*
57052558Sobrien		 * Scan the object's entire memory queue.
57152558Sobrien		 */
57252558Sobrien		TAILQ_FOREACH(p, &object->memq, listq) {
57352558Sobrien			if (pmap_resident_count(pmap) <= desired)
57452558Sobrien				goto unlock_return;
57552558Sobrien			if ((p->oflags & VPO_BUSY) != 0 || p->busy != 0)
57618334Speter				continue;
57718334Speter			PCPU_INC(cnt.v_pdpages);
57818334Speter			vm_page_lock(p);
57918334Speter			if (p->wire_count != 0 || p->hold_count != 0 ||
58052558Sobrien			    !pmap_page_exists_quick(pmap, p)) {
58118334Speter				vm_page_unlock(p);
58218334Speter				continue;
58318334Speter			}
58452558Sobrien			actcount = pmap_ts_referenced(p);
58552558Sobrien			if ((p->flags & PG_REFERENCED) != 0) {
58652558Sobrien				if (actcount == 0)
58752558Sobrien					actcount = 1;
58818334Speter				vm_page_lock_queues();
58918334Speter				vm_page_flag_clear(p, PG_REFERENCED);
59018334Speter				vm_page_unlock_queues();
59118334Speter			}
59250615Sobrien			if (p->queue != PQ_ACTIVE && actcount != 0) {
59350615Sobrien				vm_page_activate(p);
59450615Sobrien				p->act_count += actcount;
59550615Sobrien			} else if (p->queue == PQ_ACTIVE) {
59618334Speter				if (actcount == 0) {
59718334Speter					p->act_count -= min(p->act_count,
59818334Speter					    ACT_DECLINE);
59918334Speter					if (!remove_mode &&
60018334Speter					    (vm_pageout_algorithm ||
60150615Sobrien					    p->act_count == 0)) {
60250615Sobrien						pmap_remove_all(p);
60350615Sobrien						vm_page_deactivate(p);
60450615Sobrien					} else {
60518334Speter						vm_page_lock_queues();
60618334Speter						vm_page_requeue(p);
60718334Speter						vm_page_unlock_queues();
60818334Speter					}
60918334Speter				} else {
61018334Speter					vm_page_activate(p);
61118334Speter					if (p->act_count < ACT_MAX -
61218334Speter					    ACT_ADVANCE)
61318334Speter						p->act_count += ACT_ADVANCE;
61418334Speter					vm_page_lock_queues();
61518334Speter					vm_page_requeue(p);
61618334Speter					vm_page_unlock_queues();
61718334Speter				}
61818334Speter			} else if (p->queue == PQ_INACTIVE)
61918334Speter				pmap_remove_all(p);
62050615Sobrien			vm_page_unlock(p);
62150615Sobrien		}
62250615Sobrien		if ((backing_object = object->backing_object) == NULL)
62350615Sobrien			goto unlock_return;
62450615Sobrien		VM_OBJECT_LOCK(backing_object);
62518334Speter		if (object != first_object)
62618334Speter			VM_OBJECT_UNLOCK(object);
62718334Speter	}
62818334Speterunlock_return:
62918334Speter	if (object != first_object)
63018334Speter		VM_OBJECT_UNLOCK(object);
63118334Speter}
63218334Speter
63318334Speter/*
/*
 * Deactivate some number of pages in a map; try to do it fairly, but
 * that is really hard to do.
 */
static void
vm_pageout_map_deactivate_pages(vm_map_t map, long desired)
{
	vm_map_entry_t tmpe;
	vm_object_t obj, bigobj;
	int nothingwired;

	if (!vm_map_trylock(map))
		return;

	bigobj = NULL;
	nothingwired = TRUE;

	/*
	 * First, search out the biggest object, and try to free pages from
	 * that.
	 */
	tmpe = map->header.next;
	while (tmpe != &map->header) {
		if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
			obj = tmpe->object.vm_object;
			if (obj != NULL && VM_OBJECT_TRYLOCK(obj)) {
				if (obj->shadow_count <= 1 &&
				    (bigobj == NULL ||
				     bigobj->resident_page_count < obj->resident_page_count)) {
					if (bigobj != NULL)
						VM_OBJECT_UNLOCK(bigobj);
					bigobj = obj;
				} else
					VM_OBJECT_UNLOCK(obj);
			}
		}
		if (tmpe->wired_count > 0)
			nothingwired = FALSE;
		tmpe = tmpe->next;
	}

	if (bigobj != NULL) {
		vm_pageout_object_deactivate_pages(map->pmap, bigobj, desired);
		VM_OBJECT_UNLOCK(bigobj);
	}
	/*
	 * Next, hunt around for other pages to deactivate.  We actually
	 * do this search sort of wrong -- .text first is not the best idea.
	 */
	tmpe = map->header.next;
	while (tmpe != &map->header) {
		if (pmap_resident_count(vm_map_pmap(map)) <= desired)
			break;
		if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
			obj = tmpe->object.vm_object;
			if (obj != NULL) {
				VM_OBJECT_LOCK(obj);
				vm_pageout_object_deactivate_pages(map->pmap, obj, desired);
				VM_OBJECT_UNLOCK(obj);
			}
		}
		tmpe = tmpe->next;
	}

	/*
	 * Remove all mappings if a process is swapped out; this will free
	 * page table pages.
	 */
	if (desired == 0 && nothingwired) {
		tmpe = map->header.next;
		while (tmpe != &map->header) {
			pmap_remove(vm_map_pmap(map), tmpe->start, tmpe->end);
			tmpe = tmpe->next;
		}
	}
	vm_map_unlock(map);
}
#endif		/* !defined(NO_SWAPPING) */

/*
 *	vm_pageout_scan does the dirty work for the pageout daemon.
 */
static void
vm_pageout_scan(int pass)
{
	vm_page_t m, next;
	struct vm_page marker;
	int page_shortage, maxscan, pcount;
	int addl_page_shortage, addl_page_shortage_init;
	vm_object_t object;
	int actcount;
	int vnodes_skipped = 0;
	int maxlaunder;

	/*
	 * Decrease registered cache sizes.
	 */
	EVENTHANDLER_INVOKE(vm_lowmem, 0);
	/*
	 * We do this explicitly after the caches have been drained above.
	 */
	uma_reclaim();

	addl_page_shortage_init = atomic_readandclear_int(&vm_pageout_deficit);

	/*
	 * Calculate the number of pages we want to either free or move
	 * to the cache.
	 */
	page_shortage = vm_paging_target() + addl_page_shortage_init;

	vm_pageout_init_marker(&marker, PQ_INACTIVE);

	/*
	 * Start scanning the inactive queue for pages we can move to the
	 * cache or free.  The scan will stop when the target is reached or
	 * we have scanned the entire inactive queue.  Note that m->act_count
	 * is not used to form decisions for the inactive queue, only for the
	 * active queue.
	 *
	 * maxlaunder limits the number of dirty pages we flush per scan.
	 * For most systems a smaller value (16 or 32) is more robust under
	 * extreme memory and disk pressure because any unnecessary writes
	 * to disk can result in extreme performance degradation.  However,
	 * systems with excessive dirty pages (especially when MAP_NOSYNC is
	 * used) will die horribly with limited laundering.  If the pageout
	 * daemon cannot clean enough pages in the first pass, we let it go
	 * all out in succeeding passes.
	 */
	if ((maxlaunder = vm_max_launder) <= 1)
		maxlaunder = 1;
	if (pass)
		maxlaunder = 10000;
	vm_page_lock_queues();
rescan0:
	addl_page_shortage = addl_page_shortage_init;
	maxscan = cnt.v_inactive_count;

	for (m = TAILQ_FIRST(&vm_page_queues[PQ_INACTIVE].pl);
	     m != NULL && maxscan-- > 0 && page_shortage > 0;
	     m = next) {

		cnt.v_pdpages++;

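		/*
		 * The page may have been moved off the inactive queue
		 * while the page queue lock was dropped during the
		 * previous iteration; if so, restart the scan.
		 */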
		if (m->queue != PQ_INACTIVE)
			goto rescan0;

		next = TAILQ_NEXT(m, pageq);

		/*
		 * skip marker pages
		 */
		if (m->flags & PG_MARKER)
			continue;

		/*
		 * Lock the page.
		 */
		if (!vm_pageout_page_lock(m, &next)) {
			vm_page_unlock(m);
			addl_page_shortage++;
			continue;
		}

		/*
		 * A held page may be undergoing I/O, so skip it.
		 */
		if (m->hold_count) {
			vm_page_unlock(m);
			vm_page_requeue(m);
			addl_page_shortage++;
			continue;
		}

		/*
		 * Don't mess with busy pages; keep them at the front of
		 * the queue, as they are most likely being paged out.
		 */
		object = m->object;
		if (!VM_OBJECT_TRYLOCK(object) &&
		    (!vm_pageout_fallback_object_lock(m, &next) ||
			m->hold_count != 0)) {
			VM_OBJECT_UNLOCK(object);
			vm_page_unlock(m);
			addl_page_shortage++;
			continue;
		}
		if (m->busy || (m->oflags & VPO_BUSY)) {
			vm_page_unlock(m);
			VM_OBJECT_UNLOCK(object);
			addl_page_shortage++;
			continue;
		}

		/*
		 * If the object is not being used, we ignore previous
		 * references.
		 */
		if (object->ref_count == 0) {
			vm_page_flag_clear(m, PG_REFERENCED);
			KASSERT(!pmap_page_is_mapped(m),
			    ("vm_pageout_scan: page %p is mapped", m));

		/*
		 * Otherwise, if the page has been referenced while in the
		 * inactive queue, we bump the "activation count" upwards,
		 * making it less likely that the page will be added back to
		 * the inactive queue prematurely again.  Here we check the
		 * page tables (or emulated bits, if any), given the upper
		 * level VM system not knowing anything about existing
		 * references.
		 */
		} else if (((m->flags & PG_REFERENCED) == 0) &&
			(actcount = pmap_ts_referenced(m))) {
			vm_page_activate(m);
			VM_OBJECT_UNLOCK(object);
			m->act_count += (actcount + ACT_ADVANCE);
			vm_page_unlock(m);
			continue;
		}

		/*
		 * If the upper level VM system knows about any page
		 * references, we activate the page.  We also set the
		 * "activation count" higher than normal so that we will less
		 * likely place pages back onto the inactive queue again.
		 */
		if ((m->flags & PG_REFERENCED) != 0) {
			vm_page_flag_clear(m, PG_REFERENCED);
			actcount = pmap_ts_referenced(m);
			vm_page_activate(m);
			VM_OBJECT_UNLOCK(object);
			m->act_count += (actcount + ACT_ADVANCE + 1);
			vm_page_unlock(m);
			continue;
		}

		/*
		 * If the upper level VM system does not believe that the page
		 * is fully dirty, but it is mapped for write access, then we
		 * consult the pmap to see if the page's dirty status should
		 * be updated.
		 */
		if (m->dirty != VM_PAGE_BITS_ALL &&
		    (m->flags & PG_WRITEABLE) != 0) {
			/*
			 * Avoid a race condition: Unless write access is
			 * removed from the page, another processor could
			 * modify it before all access is removed by the call
			 * to vm_page_cache() below.  If vm_page_cache() finds
			 * that the page has been modified when it removes all
			 * access, it panics because it cannot cache dirty
			 * pages.  In principle, we could eliminate just write
			 * access here rather than all access.  In the expected
			 * case, when there are no last instant modifications
			 * to the page, removing all access will be cheaper
			 * overall.
			 */
			if (pmap_is_modified(m))
				vm_page_dirty(m);
			else if (m->dirty == 0)
				pmap_remove_all(m);
		}

		if (m->valid == 0) {
			/*
			 * Invalid pages can be easily freed
			 */
			vm_page_free(m);
			cnt.v_dfree++;
			--page_shortage;
		} else if (m->dirty == 0) {
			/*
			 * Clean pages can be placed onto the cache queue.
			 * This effectively frees them.
			 */
			vm_page_cache(m);
			--page_shortage;
		} else if ((m->flags & PG_WINATCFLS) == 0 && pass == 0) {
			/*
			 * Dirty pages need to be paged out, but flushing
			 * a page is extremely expensive versus freeing
			 * a clean page.  Rather than artificially limiting
			 * the number of pages we can flush, we instead give
			 * dirty pages extra priority on the inactive queue
			 * by forcing them to be cycled through the queue
			 * twice before being flushed, after which the
			 * (now clean) page will cycle through once more
			 * before being freed.  This significantly extends
			 * the thrash point for a heavily loaded machine.
			 */
			vm_page_flag_set(m, PG_WINATCFLS);
			vm_page_requeue(m);
		} else if (maxlaunder > 0) {
			/*
			 * We always want to try to flush some dirty pages if
			 * we encounter them, to keep the system stable.
			 * Normally this number is small, but under extreme
			 * pressure where there are insufficient clean pages
			 * on the inactive queue, we may have to go all out.
			 */
			int swap_pageouts_ok, vfslocked = 0;
			struct vnode *vp = NULL;
			struct mount *mp = NULL;

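			/*
			 * Pageouts to swap are allowed only when they are
			 * neither deferred nor disabled; if they are merely
			 * deferred, fall back to swap once free memory
			 * reaches the minimum threshold.
			 */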
			if ((object->type != OBJT_SWAP) && (object->type != OBJT_DEFAULT)) {
				swap_pageouts_ok = 1;
			} else {
				swap_pageouts_ok = !(defer_swap_pageouts || disable_swap_pageouts);
				swap_pageouts_ok |= (!disable_swap_pageouts && defer_swap_pageouts &&
				vm_page_count_min());

			}

			/*
			 * We don't bother paging objects that are "dead".
			 * Those objects are in a "rundown" state.
			 */
			if (!swap_pageouts_ok || (object->flags & OBJ_DEAD)) {
				vm_page_unlock(m);
				VM_OBJECT_UNLOCK(object);
				vm_page_requeue(m);
				continue;
			}

			/*
			 * Following operations may unlock
			 * vm_page_queue_mtx, invalidating the 'next'
			 * pointer.  To prevent an inordinate number
			 * of restarts we use our marker to remember
			 * our place.
			 *
			 */
			TAILQ_INSERT_AFTER(&vm_page_queues[PQ_INACTIVE].pl,
					   m, &marker, pageq);
			/*
			 * The object is already known NOT to be dead.   It
			 * is possible for the vget() to block the whole
			 * pageout daemon, but the new low-memory handling
			 * code should prevent it.
			 *
			 * The previous code skipped locked vnodes and, worse,
			 * reordered pages in the queue.  This results in
			 * completely non-deterministic operation and, on a
			 * busy system, can lead to extremely non-optimal
			 * pageouts.  For example, it can cause clean pages
			 * to be freed and dirty pages to be moved to the end
			 * of the queue.  Since dirty pages are also moved to
			 * the end of the queue once-cleaned, this gives
			 * way too large a weighting to deferring the freeing
			 * of dirty pages.
			 *
			 * We can't wait forever for the vnode lock; we might
			 * deadlock due to a vn_read() getting stuck in
			 * vm_wait while holding this vnode.  We skip the
			 * vnode if we can't get it in a reasonable amount
			 * of time.
			 */
			if (object->type == OBJT_VNODE) {
				vm_page_unlock_queues();
				vm_page_unlock(m);
				vp = object->handle;
				if (vp->v_type == VREG &&
				    vn_start_write(vp, &mp, V_NOWAIT) != 0) {
					mp = NULL;
					++pageout_lock_miss;
					if (object->flags & OBJ_MIGHTBEDIRTY)
						vnodes_skipped++;
					vm_page_lock_queues();
					goto unlock_and_continue;
				}
				KASSERT(mp != NULL,
				    ("vp %p with NULL v_mount", vp));
				vm_object_reference_locked(object);
				VM_OBJECT_UNLOCK(object);
				vfslocked = VFS_LOCK_GIANT(vp->v_mount);
				if (vget(vp, LK_EXCLUSIVE | LK_TIMELOCK,
				    curthread)) {
					VM_OBJECT_LOCK(object);
					vm_page_lock_queues();
					++pageout_lock_miss;
					if (object->flags & OBJ_MIGHTBEDIRTY)
						vnodes_skipped++;
					vp = NULL;
					goto unlock_and_continue;
				}
				VM_OBJECT_LOCK(object);
				vm_page_lock(m);
				vm_page_lock_queues();
				/*
				 * The page might have been moved to another
				 * queue during potential blocking in vget()
				 * above.  The page might have been freed and
				 * reused for another vnode.
				 */
				if (m->queue != PQ_INACTIVE ||
				    m->object != object ||
				    TAILQ_NEXT(m, pageq) != &marker) {
					vm_page_unlock(m);
					if (object->flags & OBJ_MIGHTBEDIRTY)
						vnodes_skipped++;
					goto unlock_and_continue;
				}

				/*
				 * The page may have been busied during the
				 * blocking in vget().  We don't move the
				 * page back onto the end of the queue;
				 * the statistics are more accurate that way.
				 */
				if (m->busy || (m->oflags & VPO_BUSY)) {
					vm_page_unlock(m);
					goto unlock_and_continue;
				}

				/*
				 * If the page has become held, it might be
				 * undergoing I/O, so skip it.
				 */
				if (m->hold_count) {
					vm_page_unlock(m);
					vm_page_requeue(m);
					if (object->flags & OBJ_MIGHTBEDIRTY)
						vnodes_skipped++;
					goto unlock_and_continue;
				}
			}

			/*
			 * If a page is dirty, then it is either being washed
			 * (but not yet cleaned) or it is still in the
			 * laundry.  If it is still in the laundry, then we
			 * start the cleaning operation.
			 *
			 * Decrement page_shortage on success to account for
			 * the (future) cleaned page.  Otherwise we could wind
			 * up laundering or cleaning too many pages.
			 */
			vm_page_unlock_queues();
			if (vm_pageout_clean(m) != 0) {
				--page_shortage;
				--maxlaunder;
			}
			vm_page_lock_queues();
unlock_and_continue:
			vm_page_lock_assert(m, MA_NOTOWNED);
			VM_OBJECT_UNLOCK(object);
			if (mp != NULL) {
				vm_page_unlock_queues();
				if (vp != NULL)
					vput(vp);
				VFS_UNLOCK_GIANT(vfslocked);
				vm_object_deallocate(object);
				vn_finished_write(mp);
				vm_page_lock_queues();
			}
			next = TAILQ_NEXT(&marker, pageq);
			TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl,
				     &marker, pageq);
			vm_page_lock_assert(m, MA_NOTOWNED);
			continue;
		}
		vm_page_unlock(m);
		VM_OBJECT_UNLOCK(object);
	}

	/*
	 * Compute the number of pages we want to try to move from the
	 * active queue to the inactive queue.
	 */
	page_shortage = vm_paging_target() +
		cnt.v_inactive_target - cnt.v_inactive_count;
	page_shortage += addl_page_shortage;

	/*
	 * Scan the active queue for things we can deactivate. We nominally
	 * track the per-page activity counter and use it to locate
	 * deactivation candidates.
	 */
	pcount = cnt.v_active_count;
	m = TAILQ_FIRST(&vm_page_queues[PQ_ACTIVE].pl);
	mtx_assert(&vm_page_queue_mtx, MA_OWNED);

	while ((m != NULL) && (pcount-- > 0) && (page_shortage > 0)) {

		KASSERT(m->queue == PQ_ACTIVE,
		    ("vm_pageout_scan: page %p isn't active", m));

		next = TAILQ_NEXT(m, pageq);
		if ((m->flags & PG_MARKER) != 0) {
			m = next;
			continue;
		}
		if (!vm_pageout_page_lock(m, &next)) {
			vm_page_unlock(m);
			m = next;
			continue;
		}
		object = m->object;
		if (!VM_OBJECT_TRYLOCK(object) &&
		    !vm_pageout_fallback_object_lock(m, &next)) {
			VM_OBJECT_UNLOCK(object);
			vm_page_unlock(m);
			m = next;
			continue;
		}

		/*
		 * Don't deactivate pages that are busy.
		 */
		if ((m->busy != 0) ||
		    (m->oflags & VPO_BUSY) ||
		    (m->hold_count != 0)) {
			vm_page_unlock(m);
			VM_OBJECT_UNLOCK(object);
			vm_page_requeue(m);
			m = next;
			continue;
		}

		/*
		 * The count for pagedaemon pages is done after checking the
		 * page for eligibility...
		 */
		cnt.v_pdpages++;

		/*
		 * Check to see "how much" the page has been used.
		 */
		actcount = 0;
		if (object->ref_count != 0) {
			if (m->flags & PG_REFERENCED) {
				actcount += 1;
			}
			actcount += pmap_ts_referenced(m);
			if (actcount) {
				m->act_count += ACT_ADVANCE + actcount;
				if (m->act_count > ACT_MAX)
					m->act_count = ACT_MAX;
			}
		}

		/*
		 * Since we have "tested" this bit, we need to clear it now.
		 */
		vm_page_flag_clear(m, PG_REFERENCED);

		/*
		 * Only if the object is currently being used do we use the
		 * page activation count stats.
		 */
		if (actcount && (object->ref_count != 0)) {
			vm_page_requeue(m);
		} else {
			m->act_count -= min(m->act_count, ACT_DECLINE);
			if (vm_pageout_algorithm ||
			    object->ref_count == 0 ||
			    m->act_count == 0) {
				page_shortage--;
				if (object->ref_count == 0) {
					KASSERT(!pmap_page_is_mapped(m),
				    ("vm_pageout_scan: page %p is mapped", m));
					if (m->dirty == 0)
						vm_page_cache(m);
					else
						vm_page_deactivate(m);
				} else {
					vm_page_deactivate(m);
				}
			} else {
				vm_page_requeue(m);
			}
		}
		vm_page_unlock(m);
		VM_OBJECT_UNLOCK(object);
		m = next;
	}
	vm_page_unlock_queues();
#if !defined(NO_SWAPPING)
	/*
	 * Idle process swapout -- run once per second.
	 */
	if (vm_swap_idle_enabled) {
		static long lsec;
		if (time_second != lsec) {
			vm_req_vmdaemon(VM_SWAP_IDLE);
			lsec = time_second;
		}
	}
#endif

	/*
	 * If we didn't get enough free pages, and we have skipped a vnode
	 * in a writeable object, wake up the sync daemon.  And kick swapout
	 * if we did not get enough free pages.
	 */
	if (vm_paging_target() > 0) {
		if (vnodes_skipped && vm_page_count_min())
			(void) speedup_syncer();
#if !defined(NO_SWAPPING)
		if (vm_swap_enabled && vm_page_count_target())
			vm_req_vmdaemon(VM_SWAP_NORMAL);
#endif
	}

	/*
	 * If we are critically low on one of RAM or swap and low on
	 * the other, kill the largest process.  However, we avoid
	 * doing this on the first pass in order to give ourselves a
	 * chance to flush out dirty vnode-backed pages and to allow
	 * active pages to be moved to the inactive queue and reclaimed.
	 */
	if (pass != 0 &&
	    ((swap_pager_avail < 64 && vm_page_count_min()) ||
	     (swap_pager_full && vm_paging_target() > 0)))
		vm_pageout_oom(VM_OOM_MEM);
}

125350615Sobrien
125418334Spetervoid
125550615Sobrienvm_pageout_oom(int shortage)
125650615Sobrien{
125750615Sobrien	struct proc *p, *bigproc;
125850615Sobrien	vm_offset_t size, bigsize;
125950615Sobrien	struct thread *td;
126050615Sobrien	struct vmspace *vm;
126150615Sobrien
126250615Sobrien	/*
126350615Sobrien	 * We keep the process bigproc locked once we find it to keep anyone
126450615Sobrien	 * from messing with it; however, there is a possibility of
126550615Sobrien	 * deadlock if process B is bigproc and one of it's child processes
126650615Sobrien	 * attempts to propagate a signal to B while we are waiting for A's
126750615Sobrien	 * lock while walking this list.  To avoid this, we don't block on
126818334Speter	 * the process lock but just skip a process if it is already locked.
126918334Speter	 */
127018334Speter	bigproc = NULL;
127118334Speter	bigsize = 0;
127218334Speter	sx_slock(&allproc_lock);
127318334Speter	FOREACH_PROC_IN_SYSTEM(p) {
127418334Speter		int breakout;
127550615Sobrien
127618334Speter		if (PROC_TRYLOCK(p) == 0)
127752558Sobrien			continue;
127852558Sobrien		/*
127952558Sobrien		 * If this is a system, protected or killed process, skip it.
128052558Sobrien		 */
		if ((p->p_flag & (P_INEXEC | P_PROTECTED | P_SYSTEM)) ||
		    (p->p_pid == 1) || P_KILLED(p) ||
		    ((p->p_pid < 48) && (swap_pager_avail != 0))) {
			PROC_UNLOCK(p);
			continue;
		}
		/*
		 * If the process is in a non-running type state,
		 * don't touch it.  Check all the threads individually.
		 */
		breakout = 0;
		FOREACH_THREAD_IN_PROC(p, td) {
			thread_lock(td);
			if (!TD_ON_RUNQ(td) &&
			    !TD_IS_RUNNING(td) &&
			    !TD_IS_SLEEPING(td)) {
				thread_unlock(td);
				breakout = 1;
				break;
			}
			thread_unlock(td);
		}
		if (breakout) {
			PROC_UNLOCK(p);
			continue;
		}
		/*
		 * get the process size
		 */
		vm = vmspace_acquire_ref(p);
		if (vm == NULL) {
			PROC_UNLOCK(p);
			continue;
		}
		if (!vm_map_trylock_read(&vm->vm_map)) {
			vmspace_free(vm);
			PROC_UNLOCK(p);
			continue;
		}
		size = vmspace_swap_count(vm);
		vm_map_unlock_read(&vm->vm_map);
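		/*
		 * For a RAM shortage the resident set size is charged to
		 * the process as well; for a swap shortage only the swap
		 * usage counts.
		 */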
		if (shortage == VM_OOM_MEM)
			size += vmspace_resident_count(vm);
		vmspace_free(vm);
		/*
		 * If this process is bigger than the biggest one,
		 * remember it.
		 */
		if (size > bigsize) {
			if (bigproc != NULL)
				PROC_UNLOCK(bigproc);
			bigproc = p;
			bigsize = size;
		} else
			PROC_UNLOCK(p);
	}
	sx_sunlock(&allproc_lock);
	if (bigproc != NULL) {
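		/*
		 * Kill the selected process and give it the highest
		 * scheduling priority so that it can exit and release its
		 * memory quickly, then wake up threads waiting on the
		 * free page count.
		 */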
		killproc(bigproc, "out of swap space");
		sched_nice(bigproc, PRIO_MIN);
		PROC_UNLOCK(bigproc);
		wakeup(&cnt.v_free_count);
	}
}

/*
 * This routine tries to maintain the pseudo-LRU active queue, so that
 * some statistic accumulation still occurs during long periods when
 * there is no paging.  This helps when paging first starts to occur.
 */
static void
vm_pageout_page_stats()
{
	vm_object_t object;
	vm_page_t m, next;
	int pcount, tpcount;		/* Number of pages to check */
	static int fullintervalcount = 0;
	int page_shortage;

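	/*
	 * Only scan when the sum of the free, cache and inactive page
	 * counts has fallen below the combined inactive target, cache
	 * maximum and free minimum; otherwise there is no shortage and
	 * nothing to do.
	 */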
	page_shortage =
	    (cnt.v_inactive_target + cnt.v_cache_max + cnt.v_free_min) -
	    (cnt.v_free_count + cnt.v_inactive_count + cnt.v_cache_count);

	if (page_shortage <= 0)
		return;

	vm_page_lock_queues();
	pcount = cnt.v_active_count;
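	/*
	 * Between full scans, limit the number of active pages visited
	 * in proportion to vm_pageout_stats_max; once every
	 * vm_pageout_full_stats_interval seconds the entire active
	 * queue is examined.
	 */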
	fullintervalcount += vm_pageout_stats_interval;
	if (fullintervalcount < vm_pageout_full_stats_interval) {
		tpcount = (int64_t)vm_pageout_stats_max * cnt.v_active_count /
		    cnt.v_page_count;
		if (pcount > tpcount)
			pcount = tpcount;
	} else {
		fullintervalcount = 0;
	}

	m = TAILQ_FIRST(&vm_page_queues[PQ_ACTIVE].pl);
	while ((m != NULL) && (pcount-- > 0)) {
		int actcount;

		KASSERT(m->queue == PQ_ACTIVE,
		    ("vm_pageout_page_stats: page %p isn't active", m));

		next = TAILQ_NEXT(m, pageq);
		if ((m->flags & PG_MARKER) != 0) {
			m = next;
			continue;
		}
		vm_page_lock_assert(m, MA_NOTOWNED);
		if (!vm_pageout_page_lock(m, &next)) {
			vm_page_unlock(m);
			m = next;
			continue;
		}
		object = m->object;
		if (!VM_OBJECT_TRYLOCK(object) &&
		    !vm_pageout_fallback_object_lock(m, &next)) {
			VM_OBJECT_UNLOCK(object);
			vm_page_unlock(m);
			m = next;
			continue;
		}

		/*
		 * Don't deactivate pages that are busy.
		 */
		if ((m->busy != 0) ||
		    (m->oflags & VPO_BUSY) ||
		    (m->hold_count != 0)) {
			vm_page_unlock(m);
			VM_OBJECT_UNLOCK(object);
			vm_page_requeue(m);
			m = next;
			continue;
		}

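		/*
		 * Gather the page's recent reference history from both the
		 * PG_REFERENCED flag and the pmap reference bits collected
		 * by pmap_ts_referenced().
		 */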
		actcount = 0;
		if (m->flags & PG_REFERENCED) {
			vm_page_flag_clear(m, PG_REFERENCED);
			actcount += 1;
		}

		actcount += pmap_ts_referenced(m);
		if (actcount) {
			m->act_count += ACT_ADVANCE + actcount;
			if (m->act_count > ACT_MAX)
				m->act_count = ACT_MAX;
			vm_page_requeue(m);
		} else {
			if (m->act_count == 0) {
				/*
				 * We turn off page access, so that we have
				 * more accurate RSS stats.  We don't do this
				 * in the normal page deactivation when the
				 * system is under VM load, because the cost
				 * of the large number of page protect
				 * operations would outweigh the benefit.
				 */
				pmap_remove_all(m);
				vm_page_deactivate(m);
			} else {
				m->act_count -= min(m->act_count, ACT_DECLINE);
				vm_page_requeue(m);
			}
		}
		vm_page_unlock(m);
		VM_OBJECT_UNLOCK(object);
		m = next;
	}
	vm_page_unlock_queues();
}

/*
 *	vm_pageout is the high level pageout daemon.
 */
static void
vm_pageout()
{
	int error, pass;

	/*
	 * Initialize some paging parameters.
	 */
	cnt.v_interrupt_free_min = 2;
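	/*
	 * Use a smaller pageout I/O cluster size on machines with very
	 * little physical memory.
	 */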
	if (cnt.v_page_count < 2000)
		vm_pageout_page_count = 8;

	/*
	 * v_free_reserved needs to include enough for the largest
	 * swap pager structures plus enough for any pv_entry structs
	 * when paging.
	 */
	if (cnt.v_page_count > 1024)
		cnt.v_free_min = 4 + (cnt.v_page_count - 1024) / 200;
	else
		cnt.v_free_min = 4;
	cnt.v_pageout_free_min = (2*MAXBSIZE)/PAGE_SIZE +
	    cnt.v_interrupt_free_min;
	cnt.v_free_reserved = vm_pageout_page_count +
	    cnt.v_pageout_free_min + (cnt.v_page_count / 768);
	cnt.v_free_severe = cnt.v_free_min / 2;
	cnt.v_free_min += cnt.v_free_reserved;
	cnt.v_free_severe += cnt.v_free_reserved;

	/*
	 * v_free_target and v_cache_min control pageout hysteresis.  Note
	 * that these are more a measure of the VM cache queue hysteresis
	 * than of the VM free queue.  Specifically, v_free_target is the
	 * high water mark (free+cache pages).
	 *
	 * v_free_reserved + v_cache_min (mostly means v_cache_min) is the
	 * low water mark, while v_free_min is the stop.  v_cache_min must
	 * be big enough to handle memory needs while the pageout daemon
	 * is signalled and run to free more pages.
	 */
	if (cnt.v_free_count > 6144)
		cnt.v_free_target = 4 * cnt.v_free_min + cnt.v_free_reserved;
	else
		cnt.v_free_target = 2 * cnt.v_free_min + cnt.v_free_reserved;

	if (cnt.v_free_count > 2048) {
		cnt.v_cache_min = cnt.v_free_target;
		cnt.v_cache_max = 2 * cnt.v_cache_min;
		cnt.v_inactive_target = (3 * cnt.v_free_target) / 2;
	} else {
		cnt.v_cache_min = 0;
		cnt.v_cache_max = 0;
		cnt.v_inactive_target = cnt.v_free_count / 4;
	}
	if (cnt.v_inactive_target > cnt.v_free_count / 3)
		cnt.v_inactive_target = cnt.v_free_count / 3;

	/* XXX does not really belong here */
	if (vm_page_max_wired == 0)
		vm_page_max_wired = cnt.v_free_count / 3;

	if (vm_pageout_stats_max == 0)
		vm_pageout_stats_max = cnt.v_free_target;

	/*
	 * Set interval in seconds for stats scan.
	 */
	if (vm_pageout_stats_interval == 0)
		vm_pageout_stats_interval = 5;
	if (vm_pageout_full_stats_interval == 0)
		vm_pageout_full_stats_interval = vm_pageout_stats_interval * 4;

	swap_pager_swap_init();
	pass = 0;
	/*
	 * The pageout daemon is never done, so loop forever.
	 */
	while (TRUE) {
		/*
		 * If we have enough free memory, wakeup waiters.  Do
		 * not clear vm_pages_needed until we reach our target,
		 * otherwise we may be woken up over and over again and
		 * waste a lot of cpu.
		 */
		mtx_lock(&vm_page_queue_free_mtx);
		if (vm_pages_needed && !vm_page_count_min()) {
			if (!vm_paging_needed())
				vm_pages_needed = 0;
			wakeup(&cnt.v_free_count);
		}
		if (vm_pages_needed) {
			/*
			 * Still not done, take a second pass without waiting
			 * (unlimited dirty cleaning), otherwise sleep a bit
			 * and try again.
			 */
			++pass;
			if (pass > 1)
				msleep(&vm_pages_needed,
				    &vm_page_queue_free_mtx, PVM, "psleep",
				    hz / 2);
		} else {
			/*
			 * Good enough, sleep & handle stats.  Prime the pass
			 * for the next run.
			 */
			if (pass > 1)
				pass = 1;
			else
				pass = 0;
			error = msleep(&vm_pages_needed,
			    &vm_page_queue_free_mtx, PVM, "psleep",
			    vm_pageout_stats_interval * hz);
			if (error && !vm_pages_needed) {
				mtx_unlock(&vm_page_queue_free_mtx);
				pass = 0;
				vm_pageout_page_stats();
				continue;
			}
		}
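		/*
		 * If a shortage is still pending, count this as a pageout
		 * daemon wakeup before running the scan pass.
		 */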
		if (vm_pages_needed)
			cnt.v_pdwakeups++;
		mtx_unlock(&vm_page_queue_free_mtx);
		vm_pageout_scan(pass);
	}
}

/*
 * Unless the free page queue lock is held by the caller, this function
 * should be regarded as advisory.  Specifically, the caller should
 * not msleep() on &cnt.v_free_count following this function unless
 * the free page queue lock is held until the msleep() is performed.
 */
void
pagedaemon_wakeup()
{

	if (!vm_pages_needed && curthread->td_proc != pageproc) {
		vm_pages_needed = 1;
		wakeup(&vm_pages_needed);
	}
}

#if !defined(NO_SWAPPING)
static void
vm_req_vmdaemon(int req)
{
	static int lastrun = 0;

	mtx_lock(&vm_daemon_mtx);
	vm_pageout_req_swapout |= req;
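	/*
	 * Wake the vm daemon at most once per second; the second test
	 * handles wraparound of the ticks counter.
	 */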
	if ((ticks > (lastrun + hz)) || (ticks < lastrun)) {
		wakeup(&vm_daemon_needed);
		lastrun = ticks;
	}
	mtx_unlock(&vm_daemon_mtx);
}

static void
vm_daemon()
{
	struct rlimit rsslim;
	struct proc *p;
	struct thread *td;
	struct vmspace *vm;
	int breakout, swapout_flags;

	while (TRUE) {
		mtx_lock(&vm_daemon_mtx);
		msleep(&vm_daemon_needed, &vm_daemon_mtx, PPAUSE, "psleep", 0);
		swapout_flags = vm_pageout_req_swapout;
		vm_pageout_req_swapout = 0;
		mtx_unlock(&vm_daemon_mtx);
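		/*
		 * If the pageout daemon requested it, try to swap out
		 * eligible processes.
		 */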
		if (swapout_flags)
			swapout_procs(swapout_flags);

		/*
		 * Scan the processes; deactivate pages of any process
		 * that exceeds its RSS rlimit or is swapped out.
		 */
		sx_slock(&allproc_lock);
		FOREACH_PROC_IN_SYSTEM(p) {
			vm_pindex_t limit, size;

			/*
			 * If this is a system process, or the process is
			 * exiting or execing, skip it.
			 */
			PROC_LOCK(p);
			if (p->p_flag & (P_INEXEC | P_SYSTEM | P_WEXIT)) {
				PROC_UNLOCK(p);
				continue;
			}
			/*
			 * if the process is in a non-running type state,
			 * don't touch it.
			 */
			breakout = 0;
			FOREACH_THREAD_IN_PROC(p, td) {
				thread_lock(td);
				if (!TD_ON_RUNQ(td) &&
				    !TD_IS_RUNNING(td) &&
				    !TD_IS_SLEEPING(td)) {
					thread_unlock(td);
					breakout = 1;
					break;
				}
				thread_unlock(td);
			}
			if (breakout) {
				PROC_UNLOCK(p);
				continue;
			}
			/*
			 * get a limit
			 */
			lim_rlimit(p, RLIMIT_RSS, &rsslim);
			limit = OFF_TO_IDX(
			    qmin(rsslim.rlim_cur, rsslim.rlim_max));

			/*
			 * Let processes that are swapped out really be
			 * swapped out: set the limit to nothing (this
			 * will force a swap-out.)
			 */
			if ((p->p_flag & P_INMEM) == 0)
				limit = 0;	/* XXX */
			vm = vmspace_acquire_ref(p);
			PROC_UNLOCK(p);
			if (vm == NULL)
				continue;

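			/*
			 * If the resident set size exceeds the limit,
			 * deactivate pages to bring it back under.
			 */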
			size = vmspace_resident_count(vm);
			if (limit >= 0 && size >= limit) {
				vm_pageout_map_deactivate_pages(
				    &vm->vm_map, limit);
			}
			vmspace_free(vm);
		}
		sx_sunlock(&allproc_lock);
	}
}
#endif			/* !defined(NO_SWAPPING) */