/*
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)vm_glue.c	8.6 (Berkeley) 1/5/94
 *
 *
 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/vm/vm_glue.c 137910 2004-11-20 02:29:00Z das $");

#include "opt_vm.h"
#include "opt_kstack_pages.h"
#include "opt_kstack_max_pages.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/shm.h>
#include <sys/vmmeter.h>
#include <sys/sx.h>
#include <sys/sysctl.h>

#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/unistd.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_object.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/vm_pager.h>
#include <vm/swap_pager.h>

#include <sys/user.h>

extern int maxslp;

/*
 * System initialization
 *
 * Note: proc0 from proc.h
 */
static void vm_init_limits(void *);
SYSINIT(vm_limits, SI_SUB_VM_CONF, SI_ORDER_FIRST, vm_init_limits, &proc0)

/*
 * THIS MUST BE THE LAST INITIALIZATION ITEM!!!
 *
 * Note: run scheduling should be divorced from the vm system.
 */
static void scheduler(void *);
SYSINIT(scheduler, SI_SUB_RUN_SCHEDULER, SI_ORDER_ANY, scheduler, NULL)

#ifndef NO_SWAPPING
static void swapout(struct proc *);
#endif

/*
 * MPSAFE
 *
 * WARNING!  This code calls vm_map_check_protection() which only checks
 * the associated vm_map_entry range.  It does not determine whether the
 * contents of the memory are actually readable or writable.  In most cases
 * just checking the vm_map_entry is sufficient within the kernel's address
 * space.
 */
int
kernacc(addr, len, rw)
	void *addr;
	int len, rw;
{
	boolean_t rv;
	vm_offset_t saddr, eaddr;
	vm_prot_t prot;

	KASSERT((rw & ~VM_PROT_ALL) == 0,
	    ("illegal ``rw'' argument to kernacc (%x)\n", rw));
	prot = rw;
	saddr = trunc_page((vm_offset_t)addr);
	eaddr = round_page((vm_offset_t)addr + len);
	vm_map_lock_read(kernel_map);
	rv = vm_map_check_protection(kernel_map, saddr, eaddr, prot);
	vm_map_unlock_read(kernel_map);
	return (rv == TRUE);
}

/*
 * MPSAFE
 *
 * WARNING!  This code calls vm_map_check_protection() which only checks
 * the associated vm_map_entry range.  It does not determine whether the
 * contents of the memory are actually readable or writable.  vmapbuf(),
 * vm_fault_quick(), or copyin()/copyout()/su*()/fu*() functions should be
 * used in conjunction with this call.
 */
int
useracc(addr, len, rw)
	void *addr;
	int len, rw;
{
	boolean_t rv;
	vm_prot_t prot;
	vm_map_t map;

	KASSERT((rw & ~VM_PROT_ALL) == 0,
	    ("illegal ``rw'' argument to useracc (%x)\n", rw));
	prot = rw;
	map = &curproc->p_vmspace->vm_map;
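	/*
	 * Reject ranges that wrap around or extend past the end of the
	 * user map.
	 */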
	if ((vm_offset_t)addr + len > vm_map_max(map) ||
	    (vm_offset_t)addr + len < (vm_offset_t)addr) {
		return (FALSE);
	}
	vm_map_lock_read(map);
	rv = vm_map_check_protection(map, trunc_page((vm_offset_t)addr),
	    round_page((vm_offset_t)addr + len), prot);
	vm_map_unlock_read(map);
	return (rv == TRUE);
}

int
vslock(void *addr, size_t len)
{
	vm_offset_t end, last, start;
	vm_size_t npages;
	int error;

	last = (vm_offset_t)addr + len;
	start = trunc_page((vm_offset_t)addr);
	end = round_page(last);
	if (last < (vm_offset_t)addr || end < (vm_offset_t)addr)
		return (EINVAL);
	npages = atop(end - start);
	if (npages > vm_page_max_wired)
		return (ENOMEM);
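	/*
	 * Check the request against the per-process limit on locked
	 * memory (RLIMIT_MEMLOCK), counting pages already wired by
	 * this process.
	 */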
	PROC_LOCK(curproc);
	if (ptoa(npages +
	    pmap_wired_count(vm_map_pmap(&curproc->p_vmspace->vm_map))) >
	    lim_cur(curproc, RLIMIT_MEMLOCK)) {
		PROC_UNLOCK(curproc);
		return (ENOMEM);
	}
	PROC_UNLOCK(curproc);
#if 0
	/*
	 * XXX - not yet
	 *
	 * The limit for transient usage of wired pages should be
	 * larger than for "permanent" wired pages (mlock()).
	 *
	 * Also, the sysctl code, which is the only present user
	 * of vslock(), does a hard loop on EAGAIN.
	 */
	if (npages + cnt.v_wire_count > vm_page_max_wired)
		return (EAGAIN);
#endif
	error = vm_map_wire(&curproc->p_vmspace->vm_map, start, end,
	    VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES);
	/*
	 * Return EFAULT on error to match copy{in,out}() behaviour
	 * rather than returning ENOMEM like mlock() would.
	 */
	return (error == KERN_SUCCESS ? 0 : EFAULT);
}

void
vsunlock(void *addr, size_t len)
{

	/* Rely on the parameter sanity checks performed by vslock(). */
	(void)vm_map_unwire(&curproc->p_vmspace->vm_map,
	    trunc_page((vm_offset_t)addr), round_page((vm_offset_t)addr + len),
	    VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES);
}

#ifndef KSTACK_MAX_PAGES
#define KSTACK_MAX_PAGES 32
#endif

/*
 * Create the kernel stack (including pcb for i386) for a new thread.
 * This routine directly affects fork performance for a process and
 * thread creation performance.
 */
void
vm_thread_new(struct thread *td, int pages)
{
	vm_object_t ksobj;
	vm_offset_t ks;
	vm_page_t m, ma[KSTACK_MAX_PAGES];
	int i;

	/* Bounds check */
	if (pages <= 1)
		pages = KSTACK_PAGES;
	else if (pages > KSTACK_MAX_PAGES)
		pages = KSTACK_MAX_PAGES;
	/*
	 * Allocate an object for the kstack.
	 */
	ksobj = vm_object_allocate(OBJT_DEFAULT, pages);
	td->td_kstack_obj = ksobj;
	/*
	 * Get a kernel virtual address for this thread's kstack.
	 */
	ks = kmem_alloc_nofault(kernel_map,
	   (pages + KSTACK_GUARD_PAGES) * PAGE_SIZE);
	if (ks == 0)
		panic("vm_thread_new: kstack allocation failed");
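	/*
	 * Skip over the guard pages at the base of the allocation; they
	 * are left unmapped so that a kernel stack overflow faults
	 * rather than silently corrupting adjacent memory.
	 */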
	if (KSTACK_GUARD_PAGES != 0) {
		pmap_qremove(ks, KSTACK_GUARD_PAGES);
		ks += KSTACK_GUARD_PAGES * PAGE_SIZE;
	}
	td->td_kstack = ks;
	/*
	 * Knowing the number of pages allocated is useful when you
	 * want to deallocate them.
	 */
	td->td_kstack_pages = pages;
	/*
	 * For the length of the stack, link in a real page of ram for each
	 * page of stack.
	 */
	VM_OBJECT_LOCK(ksobj);
	for (i = 0; i < pages; i++) {
		/*
		 * Get a kernel stack page.
		 */
		m = vm_page_grab(ksobj, i, VM_ALLOC_NOBUSY |
		    VM_ALLOC_NORMAL | VM_ALLOC_RETRY | VM_ALLOC_WIRED);
		ma[i] = m;
		m->valid = VM_PAGE_BITS_ALL;
	}
	VM_OBJECT_UNLOCK(ksobj);
	pmap_qenter(ks, ma, pages);
}

/*
 * Dispose of a thread's kernel stack.
 */
void
vm_thread_dispose(struct thread *td)
{
	vm_object_t ksobj;
	vm_offset_t ks;
	vm_page_t m;
	int i, pages;

	pages = td->td_kstack_pages;
	ksobj = td->td_kstack_obj;
	ks = td->td_kstack;
	pmap_qremove(ks, pages);
	VM_OBJECT_LOCK(ksobj);
	for (i = 0; i < pages; i++) {
		m = vm_page_lookup(ksobj, i);
		if (m == NULL)
			panic("vm_thread_dispose: kstack already missing?");
		vm_page_lock_queues();
		vm_page_unwire(m, 0);
		vm_page_free(m);
		vm_page_unlock_queues();
	}
	VM_OBJECT_UNLOCK(ksobj);
	vm_object_deallocate(ksobj);
	kmem_free(kernel_map, ks - (KSTACK_GUARD_PAGES * PAGE_SIZE),
	    (pages + KSTACK_GUARD_PAGES) * PAGE_SIZE);
}

/*
 * Allow a thread's kernel stack to be paged out.
 */
void
vm_thread_swapout(struct thread *td)
{
	vm_object_t ksobj;
	vm_page_t m;
	int i, pages;

	cpu_thread_swapout(td);
	pages = td->td_kstack_pages;
	ksobj = td->td_kstack_obj;
	pmap_qremove(td->td_kstack, pages);
	VM_OBJECT_LOCK(ksobj);
	for (i = 0; i < pages; i++) {
		m = vm_page_lookup(ksobj, i);
		if (m == NULL)
			panic("vm_thread_swapout: kstack already missing?");
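		/*
		 * Mark the page dirty so its contents reach swap, then
		 * drop the wiring so the pageout daemon may page it out.
		 */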
		vm_page_lock_queues();
		vm_page_dirty(m);
		vm_page_unwire(m, 0);
		vm_page_unlock_queues();
	}
	VM_OBJECT_UNLOCK(ksobj);
}

/*
 * Bring the kernel stack for a specified thread back in.
 */
void
vm_thread_swapin(struct thread *td)
{
	vm_object_t ksobj;
	vm_page_t m, ma[KSTACK_MAX_PAGES];
	int i, pages, rv;

	pages = td->td_kstack_pages;
	ksobj = td->td_kstack_obj;
	VM_OBJECT_LOCK(ksobj);
	for (i = 0; i < pages; i++) {
		m = vm_page_grab(ksobj, i, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
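		/*
		 * A page that is not fully valid must be read back in
		 * from the backing pager (swap).
		 */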
		if (m->valid != VM_PAGE_BITS_ALL) {
			rv = vm_pager_get_pages(ksobj, &m, 1, 0);
			if (rv != VM_PAGER_OK)
				panic("vm_thread_swapin: cannot get kstack for proc: %d", td->td_proc->p_pid);
			m = vm_page_lookup(ksobj, i);
			m->valid = VM_PAGE_BITS_ALL;
		}
		ma[i] = m;
		vm_page_lock_queues();
		vm_page_wire(m);
		vm_page_wakeup(m);
		vm_page_unlock_queues();
	}
	VM_OBJECT_UNLOCK(ksobj);
	pmap_qenter(td->td_kstack, ma, pages);
	cpu_thread_swapin(td);
}

/*
 * Set up a variable-sized alternate kstack.
 */
void
vm_thread_new_altkstack(struct thread *td, int pages)
{

	td->td_altkstack = td->td_kstack;
	td->td_altkstack_obj = td->td_kstack_obj;
	td->td_altkstack_pages = td->td_kstack_pages;

	vm_thread_new(td, pages);
}

/*
 * Restore the original kstack.
 */
void
vm_thread_dispose_altkstack(struct thread *td)
{

	vm_thread_dispose(td);

	td->td_kstack = td->td_altkstack;
	td->td_kstack_obj = td->td_altkstack_obj;
	td->td_kstack_pages = td->td_altkstack_pages;
	td->td_altkstack = 0;
	td->td_altkstack_obj = NULL;
	td->td_altkstack_pages = 0;
}

/*
 * Implement fork's actions on an address space.
 * Here we arrange for the address space to be copied or referenced,
 * allocate a user struct (pcb and kernel stack), then call the
 * machine-dependent layer to fill those in and make the new process
 * ready to run.  The new process is set up so that it returns directly
 * to user mode to avoid stack copying and relocation problems.
 */
void
vm_forkproc(td, p2, td2, flags)
	struct thread *td;
	struct proc *p2;
	struct thread *td2;
	int flags;
{
	struct proc *p1 = td->td_proc;

	if ((flags & RFPROC) == 0) {
		/*
		 * Divorce the memory, if it is shared, essentially
		 * turning memory shared amongst threads into COW
		 * locally.
		 */
		if ((flags & RFMEM) == 0) {
			if (p1->p_vmspace->vm_refcnt > 1) {
				vmspace_unshare(p1);
			}
		}
		cpu_fork(td, p2, td2, flags);
		return;
	}

	if (flags & RFMEM) {
		p2->p_vmspace = p1->p_vmspace;
		atomic_add_int(&p1->p_vmspace->vm_refcnt, 1);
	}

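	/*
	 * Wait for the free page count to recover from a severe
	 * shortage before setting up the child's address space.
	 */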
	while (vm_page_count_severe()) {
		VM_WAIT;
	}

	if ((flags & RFMEM) == 0) {
		p2->p_vmspace = vmspace_fork(p1->p_vmspace);
		if (p1->p_vmspace->vm_shm)
			shmfork(p1, p2);
	}

	/*
	 * cpu_fork will copy and update the pcb, set up the kernel stack,
	 * and make the child ready to run.
	 */
	cpu_fork(td, p2, td2, flags);
}

/*
 * Called after a process has been wait(2)'ed upon and is being reaped.
 * The idea is to reclaim resources that we could not reclaim while
 * the process was still executing.
 */
void
vm_waitproc(p)
	struct proc *p;
{

	vmspace_exitfree(p);		/* and clean-out the vmspace */
}

/*
 * Set default limits for VM system.
 * Called for proc 0, and then inherited by all others.
 *
 * XXX should probably act directly on proc0.
 */
static void
vm_init_limits(udata)
	void *udata;
{
	struct proc *p = udata;
	struct plimit *limp;
	int rss_limit;

	/*
	 * Set up the initial limits on process VM. Set the maximum resident
	 * set size to be half of (reasonably) available memory.  Since this
	 * is a soft limit, it comes into effect only when the system is out
	 * of memory - half of main memory helps to favor smaller processes,
	 * and reduces thrashing of the object cache.
	 */
	limp = p->p_limit;
	limp->pl_rlimit[RLIMIT_STACK].rlim_cur = dflssiz;
	limp->pl_rlimit[RLIMIT_STACK].rlim_max = maxssiz;
	limp->pl_rlimit[RLIMIT_DATA].rlim_cur = dfldsiz;
	limp->pl_rlimit[RLIMIT_DATA].rlim_max = maxdsiz;
	/* limit the limit to no less than 2MB */
	rss_limit = max(cnt.v_free_count, 512);
	limp->pl_rlimit[RLIMIT_RSS].rlim_cur = ptoa(rss_limit);
	limp->pl_rlimit[RLIMIT_RSS].rlim_max = RLIM_INFINITY;
}

void
faultin(p)
	struct proc *p;
{
#ifdef NO_SWAPPING

	PROC_LOCK_ASSERT(p, MA_OWNED);
	if ((p->p_sflag & PS_INMEM) == 0)
		panic("faultin: proc swapped out with NO_SWAPPING!");
#else /* !NO_SWAPPING */
	struct thread *td;

	GIANT_REQUIRED;
	PROC_LOCK_ASSERT(p, MA_OWNED);
	/*
	 * If another process is swapping in this process,
	 * just wait until it finishes.
	 */
	if (p->p_sflag & PS_SWAPPINGIN)
		msleep(&p->p_sflag, &p->p_mtx, PVM, "faultin", 0);
	else if ((p->p_sflag & PS_INMEM) == 0) {
		/*
		 * Don't let another thread swap process p out while we are
		 * busy swapping it in.
		 */
		++p->p_lock;
		mtx_lock_spin(&sched_lock);
		p->p_sflag |= PS_SWAPPINGIN;
		mtx_unlock_spin(&sched_lock);
		PROC_UNLOCK(p);

		FOREACH_THREAD_IN_PROC(p, td)
			vm_thread_swapin(td);

		PROC_LOCK(p);
		mtx_lock_spin(&sched_lock);
		p->p_sflag &= ~PS_SWAPPINGIN;
		p->p_sflag |= PS_INMEM;
		FOREACH_THREAD_IN_PROC(p, td) {
			TD_CLR_SWAPPED(td);
			if (TD_CAN_RUN(td))
				setrunnable(td);
		}
		mtx_unlock_spin(&sched_lock);

		wakeup(&p->p_sflag);

		/* Allow other threads to swap p out now. */
		--p->p_lock;
	}
#endif /* NO_SWAPPING */
}

/*
 * This swapin algorithm attempts to swap-in processes only if there
 * is enough space for them.  Of course, if a process waits for a long
 * time, it will be swapped in anyway.
 *
 *  XXXKSE - process with the thread with highest priority counts..
 *
 * Giant is still held at this point, to be released in tsleep.
 */
/* ARGSUSED*/
static void
scheduler(dummy)
	void *dummy;
{
	struct proc *p;
	struct thread *td;
	int pri;
	struct proc *pp;
	int ppri;

	mtx_assert(&Giant, MA_OWNED | MA_NOTRECURSED);
	/* GIANT_REQUIRED */

loop:
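	/*
	 * Do not bring a process in while free memory is below the
	 * minimum threshold; wait for it to recover first.
	 */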
	if (vm_page_count_min()) {
		VM_WAIT;
		goto loop;
	}

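	/*
	 * Scan the swapped-out processes and select the one with the
	 * highest swap-in priority (based on time swapped out and
	 * sleep time) as the candidate to bring back in.
	 */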
	pp = NULL;
	ppri = INT_MIN;
	sx_slock(&allproc_lock);
	FOREACH_PROC_IN_SYSTEM(p) {
		struct ksegrp *kg;
		if (p->p_sflag & (PS_INMEM | PS_SWAPPINGOUT | PS_SWAPPINGIN)) {
			continue;
		}
		mtx_lock_spin(&sched_lock);
		FOREACH_THREAD_IN_PROC(p, td) {
			/*
			 * An otherwise runnable thread of a process
			 * swapped out has only the TDI_SWAPPED bit set.
			 *
			 */
			if (td->td_inhibitors == TDI_SWAPPED) {
				kg = td->td_ksegrp;
				pri = p->p_swtime + kg->kg_slptime;
				if ((p->p_sflag & PS_SWAPINREQ) == 0) {
					pri -= p->p_nice * 8;
				}

				/*
				 * if this ksegrp is higher priority
				 * and there is enough space, then select
				 * this process instead of the previous
				 * selection.
				 */
				if (pri > ppri) {
					pp = p;
					ppri = pri;
				}
			}
		}
		mtx_unlock_spin(&sched_lock);
	}
	sx_sunlock(&allproc_lock);

	/*
	 * Nothing to do, back to sleep.
	 */
	if ((p = pp) == NULL) {
		tsleep(&proc0, PVM, "sched", maxslp * hz / 2);
		goto loop;
	}
	PROC_LOCK(p);

	/*
	 * Another process may be bringing or may have already
	 * brought this process in while we traverse all threads.
	 * Or, this process may even be being swapped out again.
	 */
	if (p->p_sflag & (PS_INMEM | PS_SWAPPINGOUT | PS_SWAPPINGIN)) {
		PROC_UNLOCK(p);
		goto loop;
	}

	mtx_lock_spin(&sched_lock);
	p->p_sflag &= ~PS_SWAPINREQ;
	mtx_unlock_spin(&sched_lock);

	/*
	 * We would like to bring someone in. (only if there is space).
	 * [What checks the space? ]
	 */
	faultin(p);
	PROC_UNLOCK(p);
	mtx_lock_spin(&sched_lock);
	p->p_swtime = 0;
	mtx_unlock_spin(&sched_lock);
	goto loop;
}

#ifndef NO_SWAPPING

/*
 * Swap_idle_threshold1 is the guaranteed swapped in time for a process
 */
static int swap_idle_threshold1 = 2;
SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold1, CTLFLAG_RW,
    &swap_idle_threshold1, 0, "Guaranteed swapped in time for a process");

/*
 * Swap_idle_threshold2 is the time that a process can be idle before
 * it will be swapped out, if idle swapping is enabled.
 */
static int swap_idle_threshold2 = 10;
SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold2, CTLFLAG_RW,
    &swap_idle_threshold2, 0, "Time before a process will be swapped out");

/*
 * Swapout is driven by the pageout daemon.  Very simple, we find eligible
 * procs and unwire their u-areas.  We try to always "swap" at least one
 * process in case we need the room for a swapin.
 * If any procs have been sleeping/stopped for at least maxslp seconds,
 * they are swapped.  Else, we swap the longest-sleeping or stopped process,
 * if any, otherwise the longest-resident process.
 */
void
swapout_procs(action)
int action;
{
	struct proc *p;
	struct thread *td;
	struct ksegrp *kg;
	int didswap = 0;

	GIANT_REQUIRED;

retry:
	sx_slock(&allproc_lock);
	FOREACH_PROC_IN_SYSTEM(p) {
		struct vmspace *vm;
		int minslptime = 100000;

		/*
		 * Watch out for a process in
		 * creation.  It may have no
		 * address space or lock yet.
		 */
		mtx_lock_spin(&sched_lock);
		if (p->p_state == PRS_NEW) {
			mtx_unlock_spin(&sched_lock);
			continue;
		}
		mtx_unlock_spin(&sched_lock);

		/*
		 * An aio daemon switches its
		 * address space while running.
		 * Perform a quick check whether
		 * a process has P_SYSTEM.
		 */
		if ((p->p_flag & P_SYSTEM) != 0)
			continue;

		/*
		 * Do not swapout a process that
		 * is waiting for VM data
		 * structures as there is a possible
		 * deadlock.  Test this first as
		 * this may block.
		 *
		 * Lock the map until swapout
		 * finishes, or a thread of this
		 * process may attempt to alter
		 * the map.
		 */
		PROC_LOCK(p);
		vm = p->p_vmspace;
		KASSERT(vm != NULL,
			("swapout_procs: a process has no address space"));
		atomic_add_int(&vm->vm_refcnt, 1);
		PROC_UNLOCK(p);
		if (!vm_map_trylock(&vm->vm_map))
			goto nextproc1;

		PROC_LOCK(p);
		if (p->p_lock != 0 ||
		    (p->p_flag & (P_STOPPED_SINGLE|P_TRACED|P_SYSTEM|P_WEXIT)
		    ) != 0) {
			goto nextproc2;
		}
		/*
		 * only aiod changes vmspace, however it will be
		 * skipped because of the if statement above checking
		 * for P_SYSTEM
		 */
		if ((p->p_sflag & (PS_INMEM|PS_SWAPPINGOUT|PS_SWAPPINGIN)) != PS_INMEM)
			goto nextproc2;

		switch (p->p_state) {
		default:
			/* Don't swap out processes in any sort
			 * of 'special' state. */
			break;

		case PRS_NORMAL:
			mtx_lock_spin(&sched_lock);
			/*
			 * Do not swap out a realtime process.
			 * Check all the thread groups.
			 */
			FOREACH_KSEGRP_IN_PROC(p, kg) {
				if (PRI_IS_REALTIME(kg->kg_pri_class))
					goto nextproc;

				/*
				 * Guarantee swap_idle_threshold1
				 * time in memory.
				 */
				if (kg->kg_slptime < swap_idle_threshold1)
					goto nextproc;

				/*
				 * Do not swapout a process if it is
				 * waiting on a critical event of some
				 * kind or there is a thread whose
				 * pageable memory may be accessed.
				 *
				 * This could be refined to support
				 * swapping out a thread.
				 */
				FOREACH_THREAD_IN_GROUP(kg, td) {
					if ((td->td_priority) < PSOCK ||
					    !thread_safetoswapout(td))
						goto nextproc;
				}
				/*
				 * If the system is under memory stress,
				 * or if we are swapping
				 * idle processes >= swap_idle_threshold2,
				 * then swap the process out.
				 */
				if (((action & VM_SWAP_NORMAL) == 0) &&
				    (((action & VM_SWAP_IDLE) == 0) ||
				    (kg->kg_slptime < swap_idle_threshold2)))
					goto nextproc;

				if (minslptime > kg->kg_slptime)
					minslptime = kg->kg_slptime;
			}

			/*
			 * If the pageout daemon didn't free enough pages,
			 * or if this process is idle and the system is
			 * configured to swap proactively, swap it out.
			 */
			if ((action & VM_SWAP_NORMAL) ||
				((action & VM_SWAP_IDLE) &&
				 (minslptime > swap_idle_threshold2))) {
				swapout(p);
				didswap++;
				mtx_unlock_spin(&sched_lock);
				PROC_UNLOCK(p);
				vm_map_unlock(&vm->vm_map);
				vmspace_free(vm);
				sx_sunlock(&allproc_lock);
				goto retry;
			}
nextproc:
			mtx_unlock_spin(&sched_lock);
		}
nextproc2:
		PROC_UNLOCK(p);
		vm_map_unlock(&vm->vm_map);
nextproc1:
		vmspace_free(vm);
		continue;
	}
	sx_sunlock(&allproc_lock);
	/*
	 * If we swapped something out, and another process needed memory,
	 * then wakeup the sched process.
	 */
	if (didswap)
		wakeup(&proc0);
}

static void
swapout(p)
	struct proc *p;
{
	struct thread *td;

	PROC_LOCK_ASSERT(p, MA_OWNED);
	mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED);
#if defined(SWAP_DEBUG)
	printf("swapping out %d\n", p->p_pid);
#endif

	/*
	 * The states of this process and its threads may have changed
	 * by now.  Assuming that there is only one pageout daemon thread,
	 * this process should still be in memory.
	 */
	KASSERT((p->p_sflag & (PS_INMEM|PS_SWAPPINGOUT|PS_SWAPPINGIN)) == PS_INMEM,
		("swapout: lost a swapout race?"));

#if defined(INVARIANTS)
	/*
	 * Make sure that all threads are safe to be swapped out.
	 *
	 * Alternatively, we could swap out only safe threads.
	 */
	FOREACH_THREAD_IN_PROC(p, td) {
		KASSERT(thread_safetoswapout(td),
			("swapout: there is a thread not safe for swapout"));
	}
#endif /* INVARIANTS */

	++p->p_stats->p_ru.ru_nswap;
	/*
	 * remember the process resident count
	 */
	p->p_vmspace->vm_swrss = vmspace_resident_count(p->p_vmspace);

	p->p_sflag &= ~PS_INMEM;
	p->p_sflag |= PS_SWAPPINGOUT;
	PROC_UNLOCK(p);
	FOREACH_THREAD_IN_PROC(p, td)
		TD_SET_SWAPPED(td);
	mtx_unlock_spin(&sched_lock);

	FOREACH_THREAD_IN_PROC(p, td)
		vm_thread_swapout(td);

	PROC_LOCK(p);
	mtx_lock_spin(&sched_lock);
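	/*
	 * Swapout is complete; reset the swap-time clock so it now
	 * measures how long the process has been swapped out.
	 */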
	p->p_sflag &= ~PS_SWAPPINGOUT;
	p->p_swtime = 0;
}
#endif /* !NO_SWAPPING */
