1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
27/*	  All Rights Reserved  	*/
28
29/*
30 * University Copyright- Copyright (c) 1982, 1986, 1988
31 * The Regents of the University of California
32 * All Rights Reserved
33 *
34 * University Acknowledgment- Portions of this document are derived from
35 * software developed by the University of California, Berkeley, and its
36 * contributors.
37 */
38
39#include <sys/types.h>
40#include <sys/t_lock.h>
41#include <sys/param.h>
42#include <sys/buf.h>
43#include <sys/uio.h>
44#include <sys/proc.h>
45#include <sys/systm.h>
46#include <sys/mman.h>
47#include <sys/cred.h>
48#include <sys/vnode.h>
49#include <sys/vm.h>
50#include <sys/vmparam.h>
51#include <sys/vtrace.h>
52#include <sys/cmn_err.h>
53#include <sys/cpuvar.h>
54#include <sys/user.h>
55#include <sys/kmem.h>
56#include <sys/debug.h>
57#include <sys/callb.h>
58#include <sys/tnf_probe.h>
59#include <sys/mem_cage.h>
60#include <sys/time.h>
61
62#include <vm/hat.h>
63#include <vm/as.h>
64#include <vm/seg.h>
65#include <vm/page.h>
66#include <vm/pvn.h>
67#include <vm/seg_kmem.h>
68
69static int checkpage(page_t *, int);
70
71/*
72 * The following parameters control operation of the page replacement
73 * algorithm.  They are initialized to 0, and then computed at boot time
74 * based on the size of the system.  If they are patched non-zero in
75 * a loaded vmunix they are left alone and may thus be changed per system
76 * using adb on the loaded system.
77 */
78pgcnt_t		slowscan = 0;
79pgcnt_t		fastscan = 0;
80
81static pgcnt_t	handspreadpages = 0;
82static int	loopfraction = 2;
83static pgcnt_t	looppages;
84static int	min_percent_cpu = 4;
85static int	max_percent_cpu = 80;
86static pgcnt_t	maxfastscan = 0;
87static pgcnt_t	maxslowscan = 100;
88
89pgcnt_t	maxpgio = 0;
90pgcnt_t	minfree = 0;
91pgcnt_t	desfree = 0;
92pgcnt_t	lotsfree = 0;
93pgcnt_t	needfree = 0;
94pgcnt_t	throttlefree = 0;
95pgcnt_t	pageout_reserve = 0;
96
97pgcnt_t	deficit;
98pgcnt_t	nscan;
99pgcnt_t	desscan;
100
101/*
102 * Values for min_pageout_ticks, max_pageout_ticks and pageout_ticks
103 * are the number of ticks in each wakeup cycle that gives the
104 * equivalent of some underlying %CPU duty cycle.
105 * When RATETOSCHEDPAGING is 4,  and hz is 100, pageout_scanner is
106 * awakened every 25 clock ticks.  So, converting from %CPU to ticks
107 * per wakeup cycle would be x% of 25, that is (x * 100) / 25.
108 * So, for example, 4% == 1 tick and 80% == 20 ticks.
109 *
110 * min_pageout_ticks:
111 *     ticks/wakeup equivalent of min_percent_cpu.
112 *
113 * max_pageout_ticks:
114 *     ticks/wakeup equivalent of max_percent_cpu.
115 *
116 * pageout_ticks:
117 *     Number of clock ticks budgeted for each wakeup cycle.
118 *     Computed each time around by schedpaging().
119 *     Varies between min_pageout_ticks .. max_pageout_ticks,
120 *     depending on memory pressure.
121 *
122 * pageout_lbolt:
123 *     Timestamp of the last time pageout_scanner woke up and started
124 *     (or resumed) scanning for not recently referenced pages.
125 */
126
127static clock_t	min_pageout_ticks;
128static clock_t	max_pageout_ticks;
129static clock_t	pageout_ticks;
130static clock_t	pageout_lbolt;
131
132static uint_t	reset_hands;
133
134#define	PAGES_POLL_MASK	1023
135
136/*
137 * pageout_sample_lim:
138 *     The limit on the number of samples needed to establish a value
139 *     for new pageout parameters, fastscan, slowscan, and handspreadpages.
140 *
141 * pageout_sample_cnt:
142 *     Current sample number.  Once the sample gets large enough,
143 *     set new values for handspreadpages, fastscan and slowscan.
144 *
145 * pageout_sample_pages:
146 *     The accumulated number of pages scanned during sampling.
147 *
148 * pageout_sample_ticks:
149 *     The accumulated clock ticks for the sample.
150 *
151 * pageout_rate:
152 *     Rate in pages/nanosecond, computed at the end of sampling.
153 *
154 * pageout_new_spread:
155 *     The new value to use for fastscan and handspreadpages.
156 *     Calculated after enough samples have been taken.
157 */
158
159typedef hrtime_t hrrate_t;
160
161static uint64_t	pageout_sample_lim = 4;
162static uint64_t	pageout_sample_cnt = 0;
163static pgcnt_t	pageout_sample_pages = 0;
164static hrrate_t	pageout_rate = 0;
165static pgcnt_t	pageout_new_spread = 0;
166
167static clock_t	pageout_cycle_ticks;
168static hrtime_t	sample_start, sample_end;
169static hrtime_t	pageout_sample_etime = 0;
170
171/*
172 * Record number of times a pageout_scanner wakeup cycle finished because it
173 * timed out (exceeded its CPU budget), rather than because it visited
174 * its budgeted number of pages.
175 */
176uint64_t pageout_timeouts = 0;
177
178#ifdef VM_STATS
179static struct pageoutvmstats_str {
180	ulong_t	checkpage[3];
181} pageoutvmstats;
182#endif /* VM_STATS */
183
184/*
185 * Threads waiting for free memory use this condition variable and lock until
186 * memory becomes available.
187 */
188kmutex_t	memavail_lock;
189kcondvar_t	memavail_cv;
190
191/*
192 * The size of the clock loop.
193 */
194#define	LOOPPAGES	total_pages
195
196/*
197 * Set up the paging constants for the clock algorithm.
198 * Called after the system is initialized and the amount of memory
199 * and number of paging devices is known.
200 *
201 * lotsfree is 1/64 of memory, but at least 512K.
202 * desfree is 1/2 of lotsfree.
203 * minfree is 1/2 of desfree.
204 *
205 * Note: to revert to the paging algorithm of Solaris 2.4/2.5, set:
206 *
207 *	lotsfree = btop(512K)
208 *	desfree = btop(200K)
209 *	minfree = btop(100K)
210 *	throttlefree = INT_MIN
211 *	max_percent_cpu = 4
212 */
void
setupclock(int recalc)
{

	static spgcnt_t init_lfree, init_dfree, init_mfree;
	static spgcnt_t init_tfree, init_preserve, init_mpgio;
	static spgcnt_t init_mfscan, init_fscan, init_sscan, init_hspages;

	/* Snapshot the size of the clock loop (all of physical memory). */
	looppages = LOOPPAGES;

	/*
	 * setupclock can now be called to recalculate the paging
	 * parameters in the case of dynamic addition of memory.
	 * So to make sure we make the proper calculations, if such a
	 * situation should arise, we save away the initial values
	 * of each parameter so we can recall them when needed. This
	 * way we don't lose the settings an admin might have made
	 * through the /etc/system file.
	 */

	if (!recalc) {
		/*
		 * First call (boot): record the values as they stand,
		 * which reflect any /etc/system overrides.  A saved
		 * value of 0 means "no override; compute the default".
		 */
		init_lfree = lotsfree;
		init_dfree = desfree;
		init_mfree = minfree;
		init_tfree = throttlefree;
		init_preserve = pageout_reserve;
		init_mpgio = maxpgio;
		init_mfscan = maxfastscan;
		init_fscan = fastscan;
		init_sscan = slowscan;
		init_hspages = handspreadpages;
	}

	/*
	 * Set up thresholds for paging:
	 */

	/*
	 * Lotsfree is threshold where paging daemon turns on.
	 * Default: 1/64 of memory, but at least 512K.  An override
	 * that is zero or >= total memory is treated as unset.
	 */
	if (init_lfree == 0 || init_lfree >= looppages)
		lotsfree = MAX(looppages / 64, btop(512 * 1024));
	else
		lotsfree = init_lfree;

	/*
	 * Desfree is amount of memory desired free.
	 * If less than this for extended period, start swapping.
	 */
	if (init_dfree == 0 || init_dfree >= lotsfree)
		desfree = lotsfree / 2;
	else
		desfree = init_dfree;

	/*
	 * Minfree is minimal amount of free memory which is tolerable.
	 */
	if (init_mfree == 0 || init_mfree >= desfree)
		minfree = desfree / 2;
	else
		minfree = init_mfree;

	/*
	 * Throttlefree is the point at which we start throttling
	 * PG_WAIT requests until enough memory becomes available.
	 *
	 * NOTE(review): the override sanity check bounds init_tfree
	 * against desfree, while the computed default is minfree; a
	 * tuned value in [minfree, desfree) is accepted as-is.
	 * Confirm this asymmetry is intentional before changing it.
	 */
	if (init_tfree == 0 || init_tfree >= desfree)
		throttlefree = minfree;
	else
		throttlefree = init_tfree;

	/*
	 * Pageout_reserve is the number of pages that we keep in
	 * stock for pageout's own use.  Having a few such pages
	 * provides insurance against system deadlock due to
	 * pageout needing pages.  When freemem < pageout_reserve,
	 * non-blocking allocations are denied to any threads
	 * other than pageout and sched.  (At some point we might
	 * want to consider a per-thread flag like T_PUSHING_PAGES
	 * to indicate that a thread is part of the page-pushing
	 * dance (e.g. an interrupt thread) and thus is entitled
	 * to the same special dispensation we accord pageout.)
	 */
	if (init_preserve == 0 || init_preserve >= throttlefree)
		pageout_reserve = throttlefree / 2;
	else
		pageout_reserve = init_preserve;

	/*
	 * Maxpgio thresholds how much paging is acceptable.
	 * This figures that 2/3 busy on an arm is all that is
	 * tolerable for paging.  We assume one operation per disk rev.
	 *
	 * XXX - Does not account for multiple swap devices.
	 */
	if (init_mpgio == 0)
		maxpgio = (DISKRPM * 2) / 3;
	else
		maxpgio = init_mpgio;

	/*
	 * The clock scan rate varies between fastscan and slowscan
	 * based on the amount of free memory available.  Fastscan
	 * rate should be set based on the number pages that can be
	 * scanned per sec using ~10% of processor time.  Since this
	 * value depends on the processor, MMU, Mhz etc., it is
	 * difficult to determine it in a generic manner for all
	 * architectures.
	 *
	 * Instead of trying to determine the number of pages scanned
	 * per sec for every processor, fastscan is set to be the smaller
	 * of 1/2 of memory or MAXHANDSPREADPAGES and the sampling
	 * time is limited to ~4% of processor time.
	 *
	 * Setting fastscan to be 1/2 of memory allows pageout to scan
	 * all of memory in ~2 secs.  This implies that user pages not
	 * accessed within 1 sec (assuming, handspreadpages == fastscan)
	 * can be reclaimed when free memory is very low.  Stealing pages
	 * not accessed within 1 sec seems reasonable and ensures that
	 * active user processes don't thrash.
	 *
	 * Smaller values of fastscan result in scanning fewer pages
	 * every second and consequently pageout may not be able to free
	 * sufficient memory to maintain the minimum threshold.  Larger
	 * values of fastscan result in scanning a lot more pages which
	 * could lead to thrashing and higher CPU usage.
	 *
	 * Fastscan needs to be limited to a maximum value and should not
	 * scale with memory to prevent pageout from consuming too much
	 * time for scanning on slow CPU's and avoid thrashing, as a
	 * result of scanning too many pages, on faster CPU's.
	 * The value of 64 Meg was chosen for MAXHANDSPREADPAGES
	 * (the upper bound for fastscan) based on the average number
	 * of pages that can potentially be scanned in ~1 sec (using ~4%
	 * of the CPU) on some of the following machines that currently
	 * run Solaris 2.x:
	 *
	 *			average memory scanned in ~1 sec
	 *
	 *	25 Mhz SS1+:		23 Meg
	 *	LX:			37 Meg
	 *	50 Mhz SC2000:		68 Meg
	 *
	 *	40 Mhz 486:		26 Meg
	 *	66 Mhz 486:		42 Meg
	 *
	 * When free memory falls just below lotsfree, the scan rate
	 * goes from 0 to slowscan (i.e., pageout starts running).  This
	 * transition needs to be smooth and is achieved by ensuring that
	 * pageout scans a small number of pages to satisfy the transient
	 * memory demand.  This is set to not exceed 100 pages/sec (25 per
	 * wakeup) since scanning that many pages has no noticible impact
	 * on system performance.
	 *
	 * In addition to setting fastscan and slowscan, pageout is
	 * limited to using ~4% of the CPU.  This results in increasing
	 * the time taken to scan all of memory, which in turn means that
	 * user processes have a better opportunity of preventing their
	 * pages from being stolen.  This has a positive effect on
	 * interactive and overall system performance when memory demand
	 * is high.
	 *
	 * Thus, the rate at which pages are scanned for replacement will
	 * vary linearly between slowscan and the number of pages that
	 * can be scanned using ~4% of processor time instead of varying
	 * linearly between slowscan and fastscan.
	 *
	 * Also, the processor time used by pageout will vary from ~1%
	 * at slowscan to ~4% at fastscan instead of varying between
	 * ~1% at slowscan and ~10% at fastscan.
	 *
	 * The values chosen for the various VM parameters (fastscan,
	 * handspreadpages, etc) are not universally true for all machines,
	 * but appear to be a good rule of thumb for the machines we've
	 * tested.  They have the following ranges:
	 *
	 *	cpu speed:	20 to 70 Mhz
	 *	page size:	4K to 8K
	 *	memory size:	16M to 5G
	 *	page scan rate:	4000 - 17400 4K pages per sec
	 *
	 * The values need to be re-examined for machines which don't
	 * fall into the various ranges (e.g., slower or faster CPUs,
	 * smaller or larger pagesizes etc) shown above.
	 *
	 * On an MP machine, pageout is often unable to maintain the
	 * minimum paging thresholds under heavy load.  This is due to
	 * the fact that user processes running on other CPU's can be
	 * dirtying memory at a much faster pace than pageout can find
	 * pages to free.  The memory demands could be met by enabling
	 * more than one CPU to run the clock algorithm in such a manner
	 * that the various clock hands don't overlap.  This also makes
	 * it more difficult to determine the values for fastscan, slowscan
	 * and handspreadpages.
	 *
	 * The swapper is currently used to free up memory when pageout
	 * is unable to meet memory demands by swapping out processes.
	 * In addition to freeing up memory, swapping also reduces the
	 * demand for memory by preventing user processes from running
	 * and thereby consuming memory.
	 */
	if (init_mfscan == 0) {
		/*
		 * Prefer the measured scanner rate (pageout_new_spread,
		 * computed after the sampling phase) over the static
		 * MAXHANDSPREADPAGES cap.
		 */
		if (pageout_new_spread != 0)
			maxfastscan = pageout_new_spread;
		else
			maxfastscan = MAXHANDSPREADPAGES;
	} else {
		maxfastscan = init_mfscan;
	}
	if (init_fscan == 0)
		fastscan = MIN(looppages / loopfraction, maxfastscan);
	else
		fastscan = init_fscan;
	/* Even an admin-supplied fastscan may not exceed 1/2 of memory. */
	if (fastscan > looppages / loopfraction)
		fastscan = looppages / loopfraction;

	/*
	 * Set slow scan time to 1/10 the fast scan time, but
	 * not to exceed maxslowscan.
	 */
	if (init_sscan == 0)
		slowscan = MIN(fastscan / 10, maxslowscan);
	else
		slowscan = init_sscan;
	/* Keep slowscan meaningfully below fastscan in all cases. */
	if (slowscan > fastscan / 2)
		slowscan = fastscan / 2;

	/*
	 * Handspreadpages is distance (in pages) between front and back
	 * pageout daemon hands.  The amount of time to reclaim a page
	 * once pageout examines it increases with this distance and
	 * decreases as the scan rate rises. It must be < the amount
	 * of pageable memory.
	 *
	 * Since pageout is limited to ~4% of the CPU, setting handspreadpages
	 * to be "fastscan" results in the front hand being a few secs
	 * (varies based on the processor speed) ahead of the back hand
	 * at fastscan rates.  This distance can be further reduced, if
	 * necessary, by increasing the processor time used by pageout
	 * to be more than ~4% and preferrably not more than ~10%.
	 *
	 * As a result, user processes have a much better chance of
	 * referencing their pages before the back hand examines them.
	 * This also significantly lowers the number of reclaims from
	 * the freelist since pageout does not end up freeing pages which
	 * may be referenced a sec later.
	 */
	if (init_hspages == 0)
		handspreadpages = fastscan;
	else
		handspreadpages = init_hspages;

	/*
	 * Make sure that back hand follows front hand by at least
	 * 1/RATETOSCHEDPAGING seconds.  Without this test, it is possible
	 * for the back hand to look at a page during the same wakeup of
	 * the pageout daemon in which the front hand cleared its ref bit.
	 */
	if (handspreadpages >= looppages)
		handspreadpages = looppages - 1;

	/*
	 * If we have been called to recalculate the parameters,
	 * set a flag to re-evaluate the clock hand pointers.
	 * (The scanner thread notices reset_hands on its next wakeup.)
	 */
	if (recalc)
		reset_hands = 1;
}
481
482/*
483 * Pageout scheduling.
484 *
485 * Schedpaging controls the rate at which the page out daemon runs by
486 * setting the global variables nscan and desscan RATETOSCHEDPAGING
487 * times a second.  Nscan records the number of pages pageout has examined
488 * in its current pass; schedpaging resets this value to zero each time
489 * it runs.  Desscan records the number of pages pageout should examine
490 * in its next pass; schedpaging sets this value based on the amount of
491 * currently available memory.
492 */
493
494#define	RATETOSCHEDPAGING	4		/* hz that is */
495
496static kmutex_t	pageout_mutex;	/* held while pageout or schedpaging running */
497
498/*
499 * Pool of available async pageout putpage requests.
500 */
501static struct async_reqs *push_req;
502static struct async_reqs *req_freelist;	/* available req structs */
503static struct async_reqs *push_list;	/* pending reqs */
504static kmutex_t push_lock;		/* protects req pool */
505static kcondvar_t push_cv;
506
507static int async_list_size = 256;	/* number of async request structs */
508
509static void pageout_scanner(void);
510
511/*
512 * If a page is being shared more than "po_share" times
513 * then leave it alone- don't page it out.
514 */
515#define	MIN_PO_SHARE	(8)
516#define	MAX_PO_SHARE	((MIN_PO_SHARE) << 24)
517ulong_t	po_share = MIN_PO_SHARE;
518
519/*
520 * Schedule rate for paging.
521 * Rate is linear interpolation between
522 * slowscan with lotsfree and fastscan when out of memory.
523 */
static void
schedpaging(void *arg)
{
	spgcnt_t vavail;

	/*
	 * Periodic pacing routine, re-armed via timeout() below so it
	 * runs RATETOSCHEDPAGING times per second.  It computes the
	 * scanner's page and CPU budgets for the next cycle and wakes
	 * the scanner when memory is short.
	 */

	/* Ask the kernel memory allocator to give back unused slabs. */
	if (freemem < lotsfree + needfree + kmem_reapahead)
		kmem_reap();

	/* Reclaim inactive pages cached by the segment drivers. */
	if (freemem < lotsfree + needfree)
		seg_preap();

	/* Wake the cageout thread if the kernel cage itself is short. */
	if (kcage_on && (kcage_freemem < kcage_desfree || kcage_needfree))
		kcage_cageout_wakeup();

	if (mutex_tryenter(&pageout_mutex)) {
		/* pageout() not running */
		nscan = 0;
		vavail = freemem - deficit;
		if (pageout_new_spread != 0)
			vavail -= needfree;
		/* Clamp vavail to [0, lotsfree] for the interpolations. */
		if (vavail < 0)
			vavail = 0;
		if (vavail > lotsfree)
			vavail = lotsfree;

		/*
		 * Fix for 1161438 (CRS SPR# 73922).  All variables
		 * in the original calculation for desscan were 32 bit signed
		 * ints.  As freemem approaches 0x0 on a system with 1 Gig or
		 * more of memory, the calculation can overflow.  When this
		 * happens, desscan becomes negative and pageout_scanner()
		 * stops paging out.
		 */
		if ((needfree) && (pageout_new_spread == 0)) {
			/*
			 * If we've not yet collected enough samples to
			 * calculate a spread, use the old logic of kicking
			 * into high gear anytime needfree is non-zero.
			 */
			desscan = fastscan / RATETOSCHEDPAGING;
		} else {
			/*
			 * Once we've calculated a spread based on system
			 * memory and usage, just treat needfree as another
			 * form of deficit.
			 */
			spgcnt_t faststmp, slowstmp, result;

			slowstmp = slowscan * vavail;
			faststmp = fastscan * (lotsfree - vavail);
			result = (slowstmp + faststmp) /
			    nz(lotsfree) / RATETOSCHEDPAGING;
			desscan = (pgcnt_t)result;
		}

		/*
		 * CPU budget for this cycle: linear interpolation from
		 * min_pageout_ticks (memory at lotsfree) up to
		 * max_pageout_ticks (no available memory).
		 */
		pageout_ticks = min_pageout_ticks + (lotsfree - vavail) *
		    (max_pageout_ticks - min_pageout_ticks) / nz(lotsfree);

		/*
		 * Wake the scanner if memory is short, or if it is still
		 * in its initial performance-sampling phase.
		 */
		if (freemem < lotsfree + needfree ||
		    pageout_sample_cnt < pageout_sample_lim) {
			TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
			    "pageout_cv_signal:freemem %ld", freemem);
			cv_signal(&proc_pageout->p_cv);
		} else {
			/*
			 * There are enough free pages, no need to
			 * kick the scanner thread.  And next time
			 * around, keep more of the `highly shared'
			 * pages.
			 */
			cv_signal_pageout();
			if (po_share > MIN_PO_SHARE) {
				po_share >>= 1;
			}
		}
		mutex_exit(&pageout_mutex);
	}

	/*
	 * Signal threads waiting for available memory.
	 * NOTE: usually we need to grab memavail_lock before cv_broadcast, but
	 * in this case it is not needed - the waiters will be waken up during
	 * the next invocation of this function.
	 */
	if (kmem_avail() > 0)
		cv_broadcast(&memavail_cv);

	/* Re-arm: run again in 1/RATETOSCHEDPAGING of a second. */
	(void) timeout(schedpaging, arg, hz / RATETOSCHEDPAGING);
}
613
614pgcnt_t		pushes;
615ulong_t		push_list_size;		/* # of requests on pageout queue */
616
617#define	FRONT	1
618#define	BACK	2
619
620int dopageout = 1;	/* must be non-zero to turn page stealing on */
621
622/*
623 * The page out daemon, which runs as process 2.
624 *
625 * As long as there are at least lotsfree pages,
626 * this process is not run.  When the number of free
627 * pages stays in the range desfree to lotsfree,
628 * this daemon runs through the pages in the loop
629 * at a rate determined in schedpaging().  Pageout manages
630 * two hands on the clock.  The front hand moves through
631 * memory, clearing the reference bit,
632 * and stealing pages from procs that are over maxrss.
633 * The back hand travels a distance behind the front hand,
634 * freeing the pages that have not been referenced in the time
635 * since the front hand passed.  If modified, they are pushed to
636 * swap before being freed.
637 *
638 * There are 2 threads that act on behalf of the pageout process.
639 * One thread scans pages (pageout_scanner) and frees them up if
640 * they don't require any VOP_PUTPAGE operation. If a page must be
641 * written back to its backing store, the request is put on a list
642 * and the other (pageout) thread is signaled. The pageout thread
643 * grabs VOP_PUTPAGE requests from the list, and processes them.
644 * Some filesystems may require resources for the VOP_PUTPAGE
645 * operations (like memory) and hence can block the pageout
646 * thread, but the scanner thread can still operate. There is still
647 * no guarantee that memory deadlocks cannot occur.
648 *
649 * For now, this thing is in very rough form.
650 */
void
pageout()
{
	struct async_reqs *arg;
	pri_t pageout_pri;
	int i;
	pgcnt_t max_pushes;
	callb_cpr_t cprinfo;

	/*
	 * Entry point for the pageout process (process 2).  After
	 * one-time setup it loops forever servicing queued async
	 * VOP_PUTPAGE requests produced by pageout_scanner().
	 */

	proc_pageout = ttoproc(curthread);
	/* Zero the accounting times for this kernel process. */
	proc_pageout->p_cstime = 0;
	proc_pageout->p_stime =  0;
	proc_pageout->p_cutime =  0;
	proc_pageout->p_utime = 0;
	/* 8 bytes copies "pageout" plus its NUL; u_comm gets 7 chars. */
	bcopy("pageout", PTOU(curproc)->u_psargs, 8);
	bcopy("pageout", PTOU(curproc)->u_comm, 7);

	/*
	 * Create pageout scanner thread
	 */
	mutex_init(&pageout_mutex, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&push_lock, NULL, MUTEX_DEFAULT, NULL);

	/*
	 * Allocate and initialize the async request structures
	 * for pageout.
	 */
	push_req = (struct async_reqs *)
	    kmem_zalloc(async_list_size * sizeof (struct async_reqs), KM_SLEEP);

	/*
	 * Chain all entries onto the free list; kmem_zalloc already
	 * left the last entry's a_next as NULL, terminating the list.
	 */
	req_freelist = push_req;
	for (i = 0; i < async_list_size - 1; i++)
		push_req[i].a_next = &push_req[i + 1];

	pageout_pri = curthread->t_pri;

	/* Create the pageout scanner thread. */
	(void) lwp_kernel_create(proc_pageout, pageout_scanner, NULL, TS_RUN,
	    pageout_pri - 1);

	/*
	 * kick off pageout scheduler.
	 */
	schedpaging(NULL);

	/*
	 * Create kernel cage thread.
	 * The kernel cage thread is started under the pageout process
	 * to take advantage of the less restricted page allocation
	 * in page_create_throttle().
	 */
	kcage_cageout_init();

	/*
	 * Limit pushes to avoid saturating pageout devices.
	 */
	max_pushes = maxpgio / RATETOSCHEDPAGING;
	CALLB_CPR_INIT(&cprinfo, &push_lock, callb_generic_cpr, "pageout");

	for (;;) {
		mutex_enter(&push_lock);

		/*
		 * Sleep (CPR-safe) until there is work queued and we are
		 * under the per-cycle push budget.  pushes is reset on
		 * each wakeup since a wakeup marks a new pacing cycle.
		 */
		while ((arg = push_list) == NULL || pushes > max_pushes) {
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			cv_wait(&push_cv, &push_lock);
			pushes = 0;
			CALLB_CPR_SAFE_END(&cprinfo, &push_lock);
		}
		/* Dequeue one request; drop the lock for the (blocking) push. */
		push_list = arg->a_next;
		arg->a_next = NULL;
		mutex_exit(&push_lock);

		/* Only successful pushes count against the budget. */
		if (VOP_PUTPAGE(arg->a_vp, (offset_t)arg->a_off,
		    arg->a_len, arg->a_flags, arg->a_cred, NULL) == 0) {
			pushes++;
		}

		/* vp held by checkpage() */
		VN_RELE(arg->a_vp);

		mutex_enter(&push_lock);
		arg->a_next = req_freelist;	/* back on freelist */
		req_freelist = arg;
		push_list_size--;
		mutex_exit(&push_lock);
	}
}
738
739/*
740 * Kernel thread that scans pages looking for ones to free
741 */
static void
pageout_scanner(void)
{
	struct page *fronthand, *backhand;
	uint_t count;
	callb_cpr_t cprinfo;
	pgcnt_t	nscan_limit;
	pgcnt_t	pcount;

	/*
	 * The two-handed clock scanner thread.  It sleeps on
	 * proc_pageout->p_cv until schedpaging() signals it, then scans
	 * up to its page budget (or until its CPU-tick budget expires),
	 * and goes back to sleep.  Runs forever; never returns.
	 */

	CALLB_CPR_INIT(&cprinfo, &pageout_mutex, callb_generic_cpr, "poscan");
	mutex_enter(&pageout_mutex);

	/*
	 * The restart case does not attempt to point the hands at roughly
	 * the right point on the assumption that after one circuit things
	 * will have settled down - and restarts shouldn't be that often.
	 */

	/*
	 * Set the two clock hands to be separated by a reasonable amount,
	 * but no more than 360 degrees apart.
	 */
	backhand = page_first();
	if (handspreadpages >= total_pages)
		fronthand = page_nextn(backhand, total_pages - 1);
	else
		fronthand = page_nextn(backhand, handspreadpages);

	/*
	 * Convert the min/max %CPU tunables into ticks per wakeup cycle
	 * (see the block comment near their declarations).  Computed once
	 * here, at thread start.
	 */
	min_pageout_ticks = MAX(1,
	    ((hz * min_percent_cpu) / 100) / RATETOSCHEDPAGING);
	max_pageout_ticks = MAX(min_pageout_ticks,
	    ((hz * max_percent_cpu) / 100) / RATETOSCHEDPAGING);

loop:
	/* Let the pushing thread run before we block. */
	cv_signal_pageout();

	/* Sleep (CPR-safe) until schedpaging() wakes us. */
	CALLB_CPR_SAFE_BEGIN(&cprinfo);
	cv_wait(&proc_pageout->p_cv, &pageout_mutex);
	CALLB_CPR_SAFE_END(&cprinfo, &pageout_mutex);

	/* Page stealing globally disabled: just keep sleeping. */
	if (!dopageout)
		goto loop;

	if (reset_hands) {
		/* setupclock(1) asked us to re-derive the hand positions. */
		reset_hands = 0;

		backhand = page_first();
		if (handspreadpages >= total_pages)
			fronthand = page_nextn(backhand, total_pages - 1);
		else
			fronthand = page_nextn(backhand, handspreadpages);
	}

	CPU_STATS_ADDQ(CPU, vm, pgrrun, 1);
	count = 0;

	TRACE_4(TR_FAC_VM, TR_PAGEOUT_START,
	    "pageout_start:freemem %ld lotsfree %ld nscan %ld desscan %ld",
	    freemem, lotsfree, nscan, desscan);

	/* Kernel probe */
	TNF_PROBE_2(pageout_scan_start, "vm pagedaemon", /* CSTYLED */,
	    tnf_ulong, pages_free, freemem, tnf_ulong, pages_needed, needfree);

	pcount = 0;
	/*
	 * While still sampling scanner performance, ignore desscan and
	 * allow a full circuit of memory.
	 */
	if (pageout_sample_cnt < pageout_sample_lim) {
		nscan_limit = total_pages;
	} else {
		nscan_limit = desscan;
	}
	pageout_lbolt = ddi_get_lbolt();
	sample_start = gethrtime();

	/*
	 * Scan the appropriate number of pages for a single duty cycle.
	 * However, stop scanning as soon as there is enough free memory.
	 * For a short while, we will be sampling the performance of the
	 * scanner and need to keep running just to get sample data, in
	 * which case we keep going and don't pay attention to whether
	 * or not there is enough free memory.
	 */

	while (nscan < nscan_limit && (freemem < lotsfree + needfree ||
	    pageout_sample_cnt < pageout_sample_lim)) {
		int rvfront, rvback;

		/*
		 * Check to see if we have exceeded our %CPU budget
		 * for this wakeup, but not on every single page visited,
		 * just every once in a while.
		 */
		if ((pcount & PAGES_POLL_MASK) == PAGES_POLL_MASK) {
			pageout_cycle_ticks = ddi_get_lbolt() - pageout_lbolt;
			if (pageout_cycle_ticks >= pageout_ticks) {
				++pageout_timeouts;
				break;
			}
		}

		/*
		 * If checkpage manages to add a page to the free list,
		 * we give ourselves another couple of trips around the loop.
		 */
		if ((rvfront = checkpage(fronthand, FRONT)) == 1)
			count = 0;
		if ((rvback = checkpage(backhand, BACK)) == 1)
			count = 0;

		++pcount;

		/*
		 * protected by pageout_mutex instead of cpu_stat_lock
		 */
		CPU_STATS_ADDQ(CPU, vm, scan, 1);

		/*
		 * Don't include ineligible pages in the number scanned.
		 */
		if (rvfront != -1 || rvback != -1)
			nscan++;

		backhand = page_next(backhand);

		/*
		 * backhand update and wraparound check are done separately
		 * because lint barks when it finds an empty "if" body
		 * (the wraparound test below applies to the front hand;
		 * the back hand simply follows it around the loop)
		 */

		if ((fronthand = page_next(fronthand)) == page_first())	{
			TRACE_2(TR_FAC_VM, TR_PAGEOUT_HAND_WRAP,
			    "pageout_hand_wrap:freemem %ld whichhand %d",
			    freemem, FRONT);

			/*
			 * protected by pageout_mutex instead of cpu_stat_lock
			 */
			CPU_STATS_ADDQ(CPU, vm, rev, 1);
			if (++count > 1) {
				/*
				 * Extremely unlikely, but it happens.
				 * We went around the loop at least once
				 * and didn't get far enough.
				 * If we are still skipping `highly shared'
				 * pages, skip fewer of them.  Otherwise,
				 * give up till the next clock tick.
				 */
				if (po_share < MAX_PO_SHARE) {
					po_share <<= 1;
				} else {
					/*
					 * Really a "goto loop", but
					 * if someone is TRACing or
					 * TNF_PROBE_ing, at least
					 * make records to show
					 * where we are.
					 */
					break;
				}
			}
		}
	}

	sample_end = gethrtime();

	TRACE_5(TR_FAC_VM, TR_PAGEOUT_END,
	    "pageout_end:freemem %ld lots %ld nscan %ld des %ld count %u",
	    freemem, lotsfree, nscan, desscan, count);

	/* Kernel probe */
	TNF_PROBE_2(pageout_scan_end, "vm pagedaemon", /* CSTYLED */,
	    tnf_ulong, pages_scanned, nscan, tnf_ulong, pages_free, freemem);

	/* Accumulate this cycle into the performance sample. */
	if (pageout_sample_cnt < pageout_sample_lim) {
		pageout_sample_pages += pcount;
		pageout_sample_etime += sample_end - sample_start;
		++pageout_sample_cnt;
	}
	/*
	 * Sampling just completed: derive the scan rate (pages/sec) and
	 * the new spread, then recompute the paging parameters once.
	 */
	if (pageout_sample_cnt >= pageout_sample_lim &&
	    pageout_new_spread == 0) {
		pageout_rate = (hrrate_t)pageout_sample_pages *
		    (hrrate_t)(NANOSEC) / pageout_sample_etime;
		pageout_new_spread = pageout_rate / 10;
		setupclock(1);
	}

	goto loop;
}
929
930/*
931 * Look at the page at hand.  If it is locked (e.g., for physical i/o),
932 * system (u., page table) or free, then leave it alone.  Otherwise,
933 * if we are running the front hand, turn off the page's reference bit.
934 * If the proc is over maxrss, we take it.  If running the back hand,
935 * check whether the page has been reclaimed.  If not, free the page,
936 * pushing it to disk first if necessary.
937 *
938 * Return values:
939 *	-1 if the page is not a candidate at all,
940 *	 0 if not freed, or
941 *	 1 if we freed it.
942 */
943static int
944checkpage(struct page *pp, int whichhand)
945{
946	int ppattr;
947	int isfs = 0;
948	int isexec = 0;
949	int pagesync_flag;
950
951	/*
952	 * Skip pages:
953	 * 	- associated with the kernel vnode since
954	 *	    they are always "exclusively" locked.
955	 *	- that are free
956	 *	- that are shared more than po_share'd times
957	 *	- its already locked
958	 *
959	 * NOTE:  These optimizations assume that reads are atomic.
960	 */
961
962	if (PP_ISKAS(pp) || PAGE_LOCKED(pp) || PP_ISFREE(pp) ||
963	    pp->p_lckcnt != 0 || pp->p_cowcnt != 0 ||
964	    hat_page_checkshare(pp, po_share)) {
965		return (-1);
966	}
967
968	if (!page_trylock(pp, SE_EXCL)) {
969		/*
970		 * Skip the page if we can't acquire the "exclusive" lock.
971		 */
972		return (-1);
973	} else if (PP_ISFREE(pp)) {
974		/*
975		 * It became free between the above check and our actually
976		 * locking the page.  Oh, well there will be other pages.
977		 */
978		page_unlock(pp);
979		return (-1);
980	}
981
982	/*
983	 * Reject pages that cannot be freed. The page_struct_lock
984	 * need not be acquired to examine these
985	 * fields since the page has an "exclusive" lock.
986	 */
987	if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
988		page_unlock(pp);
989		return (-1);
990	}
991
992	/*
993	 * Maintain statistics for what we are freeing
994	 */
995
996	if (pp->p_vnode != NULL) {
997		if (pp->p_vnode->v_flag & VVMEXEC)
998			isexec = 1;
999
1000		if (!IS_SWAPFSVP(pp->p_vnode))
1001			isfs = 1;
1002	}
1003
1004	/*
1005	 * Turn off REF and MOD bits with the front hand.
1006	 * The back hand examines the REF bit and always considers
1007	 * SHARED pages as referenced.
1008	 */
1009	if (whichhand == FRONT)
1010		pagesync_flag = HAT_SYNC_ZERORM;
1011	else
1012		pagesync_flag = HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_REF |
1013		    HAT_SYNC_STOPON_SHARED;
1014
1015	ppattr = hat_pagesync(pp, pagesync_flag);
1016
1017recheck:
1018	/*
1019	 * If page is referenced; make unreferenced but reclaimable.
1020	 * If this page is not referenced, then it must be reclaimable
1021	 * and we can add it to the free list.
1022	 */
1023	if (ppattr & P_REF) {
1024		TRACE_2(TR_FAC_VM, TR_PAGEOUT_ISREF,
1025		    "pageout_isref:pp %p whichhand %d", pp, whichhand);
1026		if (whichhand == FRONT) {
1027			/*
1028			 * Checking of rss or madvise flags needed here...
1029			 *
1030			 * If not "well-behaved", fall through into the code
1031			 * for not referenced.
1032			 */
1033			hat_clrref(pp);
1034		}
1035		/*
1036		 * Somebody referenced the page since the front
1037		 * hand went by, so it's not a candidate for
1038		 * freeing up.
1039		 */
1040		page_unlock(pp);
1041		return (0);
1042	}
1043
1044	VM_STAT_ADD(pageoutvmstats.checkpage[0]);
1045
1046	/*
1047	 * If large page, attempt to demote it. If successfully demoted,
1048	 * retry the checkpage.
1049	 */
1050	if (pp->p_szc != 0) {
1051		if (!page_try_demote_pages(pp)) {
1052			VM_STAT_ADD(pageoutvmstats.checkpage[1]);
1053			page_unlock(pp);
1054			return (-1);
1055		}
1056		ASSERT(pp->p_szc == 0);
1057		VM_STAT_ADD(pageoutvmstats.checkpage[2]);
1058		/*
1059		 * since page_try_demote_pages() could have unloaded some
1060		 * mappings it makes sense to reload ppattr.
1061		 */
1062		ppattr = hat_page_getattr(pp, P_MOD | P_REF);
1063	}
1064
1065	/*
1066	 * If the page is currently dirty, we have to arrange
1067	 * to have it cleaned before it can be freed.
1068	 *
1069	 * XXX - ASSERT(pp->p_vnode != NULL);
1070	 */
1071	if ((ppattr & P_MOD) && pp->p_vnode) {
1072		struct vnode *vp = pp->p_vnode;
1073		u_offset_t offset = pp->p_offset;
1074
1075		/*
1076		 * XXX - Test for process being swapped out or about to exit?
1077		 * [Can't get back to process(es) using the page.]
1078		 */
1079
1080		/*
1081		 * Hold the vnode before releasing the page lock to
1082		 * prevent it from being freed and re-used by some
1083		 * other thread.
1084		 */
1085		VN_HOLD(vp);
1086		page_unlock(pp);
1087
1088		/*
1089		 * Queue i/o request for the pageout thread.
1090		 */
1091		if (!queue_io_request(vp, offset)) {
1092			VN_RELE(vp);
1093			return (0);
1094		}
1095		return (1);
1096	}
1097
1098	/*
1099	 * Now we unload all the translations,
1100	 * and put the page back on to the free list.
1101	 * If the page was used (referenced or modified) after
1102	 * the pagesync but before it was unloaded we catch it
1103	 * and handle the page properly.
1104	 */
1105	TRACE_2(TR_FAC_VM, TR_PAGEOUT_FREE,
1106	    "pageout_free:pp %p whichhand %d", pp, whichhand);
1107	(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
1108	ppattr = hat_page_getattr(pp, P_MOD | P_REF);
1109	if ((ppattr & P_REF) || ((ppattr & P_MOD) && pp->p_vnode))
1110		goto recheck;
1111
1112	/*LINTED: constant in conditional context*/
1113	VN_DISPOSE(pp, B_FREE, 0, kcred);
1114
1115	CPU_STATS_ADD_K(vm, dfree, 1);
1116
1117	if (isfs) {
1118		if (isexec) {
1119			CPU_STATS_ADD_K(vm, execfree, 1);
1120		} else {
1121			CPU_STATS_ADD_K(vm, fsfree, 1);
1122		}
1123	} else {
1124		CPU_STATS_ADD_K(vm, anonfree, 1);
1125	}
1126
1127	return (1);		/* freed a page! */
1128}
1129
1130/*
1131 * Queue async i/o request from pageout_scanner and segment swapout
1132 * routines on one common list.  This ensures that pageout devices (swap)
1133 * are not saturated by pageout_scanner or swapout requests.
1134 * The pageout thread empties this list by initiating i/o operations.
1135 */
1136int
1137queue_io_request(vnode_t *vp, u_offset_t off)
1138{
1139	struct async_reqs *arg;
1140
1141	/*
1142	 * If we cannot allocate an async request struct,
1143	 * skip this page.
1144	 */
1145	mutex_enter(&push_lock);
1146	if ((arg = req_freelist) == NULL) {
1147		mutex_exit(&push_lock);
1148		return (0);
1149	}
1150	req_freelist = arg->a_next;		/* adjust freelist */
1151	push_list_size++;
1152
1153	arg->a_vp = vp;
1154	arg->a_off = off;
1155	arg->a_len = PAGESIZE;
1156	arg->a_flags = B_ASYNC | B_FREE;
1157	arg->a_cred = kcred;		/* always held */
1158
1159	/*
1160	 * Add to list of pending write requests.
1161	 */
1162	arg->a_next = push_list;
1163	push_list = arg;
1164
1165	if (req_freelist == NULL) {
1166		/*
1167		 * No free async requests left. The lock is held so we
1168		 * might as well signal the pusher thread now.
1169		 */
1170		cv_signal(&push_cv);
1171	}
1172	mutex_exit(&push_lock);
1173	return (1);
1174}
1175
1176/*
1177 * Wakeup pageout to initiate i/o if push_list is not empty.
1178 */
1179void
1180cv_signal_pageout()
1181{
1182	if (push_list != NULL) {
1183		mutex_enter(&push_lock);
1184		cv_signal(&push_cv);
1185		mutex_exit(&push_lock);
1186	}
1187}
1188