/*
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)vm_glue.c	8.6 (Berkeley) 1/5/94
 *
 *
 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/vm/vm_glue.c 127013 2004-03-15 09:11:23Z truckman $");

#include "opt_vm.h"
#include "opt_kstack_pages.h"
#include "opt_kstack_max_pages.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/shm.h>
#include <sys/vmmeter.h>
#include <sys/sx.h>
#include <sys/sysctl.h>

#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/unistd.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_object.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/vm_pager.h>
#include <vm/swap_pager.h>

#include <sys/user.h>

extern int maxslp;

/*
 * System initialization
 *
 * Note: proc0 from proc.h
 */
static void vm_init_limits(void *);
SYSINIT(vm_limits, SI_SUB_VM_CONF, SI_ORDER_FIRST, vm_init_limits, &proc0)

/*
 * THIS MUST BE THE LAST INITIALIZATION ITEM!!!
 *
 * Note: run scheduling should be divorced from the vm system.
 */
static void scheduler(void *);
SYSINIT(scheduler, SI_SUB_RUN_SCHEDULER, SI_ORDER_ANY, scheduler, NULL)

#ifndef NO_SWAPPING
static void swapout(struct proc *);
static void vm_proc_swapin(struct proc *p);
static void vm_proc_swapout(struct proc *p);
#endif

/*
 * MPSAFE
 *
 * WARNING!  This code calls vm_map_check_protection() which only checks
 * the associated vm_map_entry range.  It does not determine whether the
 * contents of the memory are actually readable or writable.  In most cases
 * just checking the vm_map_entry is sufficient within the kernel's address
 * space.
 */
int
kernacc(addr, len, rw)
	void *addr;
	int len, rw;
{
	boolean_t rv;
	vm_offset_t saddr, eaddr;
	vm_prot_t prot;

	KASSERT((rw & ~VM_PROT_ALL) == 0,
	    ("illegal ``rw'' argument to kernacc (%x)\n", rw));
	prot = rw;
	saddr = trunc_page((vm_offset_t)addr);
	eaddr = round_page((vm_offset_t)addr + len);
	vm_map_lock_read(kernel_map);
	rv = vm_map_check_protection(kernel_map, saddr, eaddr, prot);
	vm_map_unlock_read(kernel_map);
	return (rv == TRUE);
}

/*
 * MPSAFE
 *
 * WARNING!  This code calls vm_map_check_protection() which only checks
 * the associated vm_map_entry range.  It does not determine whether the
 * contents of the memory are actually readable or writable.  vmapbuf(),
 * vm_fault_quick(), or copyin()/copyout()/su*()/fu*() functions should be
 * used in conjunction with this call.
 */
int
useracc(addr, len, rw)
	void *addr;
	int len, rw;
{
	boolean_t rv;
	vm_prot_t prot;
	vm_map_t map;

	KASSERT((rw & ~VM_PROT_ALL) == 0,
	    ("illegal ``rw'' argument to useracc (%x)\n", rw));
	prot = rw;
	map = &curproc->p_vmspace->vm_map;
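	/*
	 * Reject ranges that extend past the end of the map or that wrap
	 * around the address space.
	 */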
	if ((vm_offset_t)addr + len > vm_map_max(map) ||
	    (vm_offset_t)addr + len < (vm_offset_t)addr) {
		return (FALSE);
	}
	vm_map_lock_read(map);
	rv = vm_map_check_protection(map, trunc_page((vm_offset_t)addr),
	    round_page((vm_offset_t)addr + len), prot);
	vm_map_unlock_read(map);
	return (rv == TRUE);
}

int
vslock(void *addr, size_t len)
{
	vm_offset_t end, last, start;
	vm_size_t npages;
	int error;

	last = (vm_offset_t)addr + len;
	start = trunc_page((vm_offset_t)addr);
	end = round_page(last);
	if (last < (vm_offset_t)addr || end < (vm_offset_t)addr)
		return (EINVAL);
	npages = atop(end - start);
	if (npages > vm_page_max_wired)
		return (ENOMEM);
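	/*
	 * Refuse the request if wiring these pages would push the process
	 * past its RLIMIT_MEMLOCK locked-memory limit.
	 */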
	PROC_LOCK(curproc);
	if (ptoa(npages +
	    pmap_wired_count(vm_map_pmap(&curproc->p_vmspace->vm_map))) >
	    lim_cur(curproc, RLIMIT_MEMLOCK)) {
		PROC_UNLOCK(curproc);
		return (ENOMEM);
	}
	PROC_UNLOCK(curproc);
#if 0
	/*
	 * XXX - not yet
	 *
	 * The limit for transient usage of wired pages should be
	 * larger than for "permanent" wired pages (mlock()).
	 *
	 * Also, the sysctl code, which is the only present user
	 * of vslock(), does a hard loop on EAGAIN.
	 */
	if (npages + cnt.v_wire_count > vm_page_max_wired)
		return (EAGAIN);
#endif
	error = vm_map_wire(&curproc->p_vmspace->vm_map, start, end,
	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
	/*
	 * Return EFAULT on error to match copy{in,out}() behaviour
	 * rather than returning ENOMEM like mlock() would.
	 */
	return (error == KERN_SUCCESS ? 0 : EFAULT);
}

void
vsunlock(void *addr, size_t len)
{

	/* Rely on the parameter sanity checks performed by vslock(). */
	(void)vm_map_unwire(&curproc->p_vmspace->vm_map,
	    trunc_page((vm_offset_t)addr), round_page((vm_offset_t)addr + len),
	    VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES);
}

/*
 * Create the U area for a new process.
 * This routine directly affects the fork perf for a process.
 */
void
vm_proc_new(struct proc *p)
{
	vm_page_t ma[UAREA_PAGES];
	vm_object_t upobj;
	vm_offset_t up;
	vm_page_t m;
	u_int i;

	/*
	 * Get a kernel virtual address for the U area for this process.
	 */
	up = kmem_alloc_nofault(kernel_map, UAREA_PAGES * PAGE_SIZE);
	if (up == 0)
		panic("vm_proc_new: upage allocation failed");
	p->p_uarea = (struct user *)up;

	/*
	 * Allocate object and page(s) for the U area.
	 */
	upobj = vm_object_allocate(OBJT_DEFAULT, UAREA_PAGES);
	p->p_upages_obj = upobj;
	VM_OBJECT_LOCK(upobj);
	for (i = 0; i < UAREA_PAGES; i++) {
		m = vm_page_grab(upobj, i,
		    VM_ALLOC_NORMAL | VM_ALLOC_RETRY | VM_ALLOC_WIRED);
		ma[i] = m;

		vm_page_lock_queues();
		vm_page_wakeup(m);
		m->valid = VM_PAGE_BITS_ALL;
		vm_page_unlock_queues();
	}
	VM_OBJECT_UNLOCK(upobj);

	/*
	 * Enter the pages into the kernel address space.
	 */
	pmap_qenter(up, ma, UAREA_PAGES);
}

/*
 * Dispose of the U area for a process that has exited.
 * This routine directly impacts the exit perf of a process.
 * XXX proc_zone is marked UMA_ZONE_NOFREE, so this should never be called.
 */
void
vm_proc_dispose(struct proc *p)
{
	vm_object_t upobj;
	vm_offset_t up;
	vm_page_t m;

	upobj = p->p_upages_obj;
	VM_OBJECT_LOCK(upobj);
	if (upobj->resident_page_count != UAREA_PAGES)
		panic("vm_proc_dispose: incorrect number of pages in upobj");
	vm_page_lock_queues();
	while ((m = TAILQ_FIRST(&upobj->memq)) != NULL) {
		vm_page_busy(m);
		vm_page_unwire(m, 0);
		vm_page_free(m);
	}
	vm_page_unlock_queues();
	VM_OBJECT_UNLOCK(upobj);
	up = (vm_offset_t)p->p_uarea;
	pmap_qremove(up, UAREA_PAGES);
	kmem_free(kernel_map, up, UAREA_PAGES * PAGE_SIZE);
	vm_object_deallocate(upobj);
}

#ifndef NO_SWAPPING
/*
 * Allow the U area for a process to be prejudicially paged out.
 */
static void
vm_proc_swapout(struct proc *p)
{
	vm_object_t upobj;
	vm_offset_t up;
	vm_page_t m;

	upobj = p->p_upages_obj;
	VM_OBJECT_LOCK(upobj);
	if (upobj->resident_page_count != UAREA_PAGES)
		panic("vm_proc_swapout: incorrect number of pages in upobj");
	vm_page_lock_queues();
	TAILQ_FOREACH(m, &upobj->memq, listq) {
		vm_page_dirty(m);
		vm_page_unwire(m, 0);
	}
	vm_page_unlock_queues();
	VM_OBJECT_UNLOCK(upobj);
	up = (vm_offset_t)p->p_uarea;
	pmap_qremove(up, UAREA_PAGES);
}

/*
 * Bring the U area for a specified process back in.
 */
static void
vm_proc_swapin(struct proc *p)
{
	vm_page_t ma[UAREA_PAGES];
	vm_object_t upobj;
	vm_offset_t up;
	vm_page_t m;
	int rv;
	int i;

	upobj = p->p_upages_obj;
	VM_OBJECT_LOCK(upobj);
	for (i = 0; i < UAREA_PAGES; i++) {
		m = vm_page_grab(upobj, i, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
		if (m->valid != VM_PAGE_BITS_ALL) {
			rv = vm_pager_get_pages(upobj, &m, 1, 0);
			if (rv != VM_PAGER_OK)
				panic("vm_proc_swapin: cannot get upage");
		}
		ma[i] = m;
	}
	if (upobj->resident_page_count != UAREA_PAGES)
		panic("vm_proc_swapin: lost pages from upobj");
	vm_page_lock_queues();
	TAILQ_FOREACH(m, &upobj->memq, listq) {
		m->valid = VM_PAGE_BITS_ALL;
		vm_page_wire(m);
		vm_page_wakeup(m);
	}
	vm_page_unlock_queues();
	VM_OBJECT_UNLOCK(upobj);
	up = (vm_offset_t)p->p_uarea;
	pmap_qenter(up, ma, UAREA_PAGES);
}

/*
 * Swap in the UAREAs of all processes swapped out to the given device.
 * The pages in the UAREA are marked dirty and their swap metadata is freed.
 */
void
vm_proc_swapin_all(struct swdevt *devidx)
{
	struct proc *p;
	vm_object_t object;
	vm_page_t m;

retry:
	sx_slock(&allproc_lock);
	FOREACH_PROC_IN_SYSTEM(p) {
		PROC_LOCK(p);
		object = p->p_upages_obj;
		if (object != NULL) {
			VM_OBJECT_LOCK(object);
			if (swap_pager_isswapped(object, devidx)) {
				VM_OBJECT_UNLOCK(object);
				sx_sunlock(&allproc_lock);
				faultin(p);
				PROC_UNLOCK(p);
				VM_OBJECT_LOCK(object);
				vm_page_lock_queues();
				TAILQ_FOREACH(m, &object->memq, listq)
					vm_page_dirty(m);
				vm_page_unlock_queues();
				swap_pager_freespace(object, 0,
				    object->un_pager.swp.swp_bcount);
				VM_OBJECT_UNLOCK(object);
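				/*
				 * The allproc lock was dropped above, so
				 * rescan the process list from the start.
				 */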
				goto retry;
			}
			VM_OBJECT_UNLOCK(object);
		}
		PROC_UNLOCK(p);
	}
	sx_sunlock(&allproc_lock);
}
#endif

#ifndef KSTACK_MAX_PAGES
#define KSTACK_MAX_PAGES 32
#endif

/*
 * Create the kernel stack (including pcb for i386) for a new thread.
 * This routine directly affects the fork perf for a process and the
 * creation performance for a thread.
 */
void
vm_thread_new(struct thread *td, int pages)
{
	vm_object_t ksobj;
	vm_offset_t ks;
	vm_page_t m, ma[KSTACK_MAX_PAGES];
	int i;

	/* Bounds check */
	if (pages <= 1)
		pages = KSTACK_PAGES;
	else if (pages > KSTACK_MAX_PAGES)
		pages = KSTACK_MAX_PAGES;
	/*
	 * Allocate an object for the kstack.
	 */
	ksobj = vm_object_allocate(OBJT_DEFAULT, pages);
	td->td_kstack_obj = ksobj;
	/*
	 * Get a kernel virtual address for this thread's kstack.
	 */
	ks = kmem_alloc_nofault(kernel_map,
	    (pages + KSTACK_GUARD_PAGES) * PAGE_SIZE);
	if (ks == 0)
		panic("vm_thread_new: kstack allocation failed");
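	/*
	 * Leave the guard pages at the bottom of the range unmapped so that
	 * running off the end of the kernel stack faults instead of silently
	 * corrupting adjacent memory.
	 */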
	if (KSTACK_GUARD_PAGES != 0) {
		pmap_qremove(ks, KSTACK_GUARD_PAGES);
		ks += KSTACK_GUARD_PAGES * PAGE_SIZE;
	}
	td->td_kstack = ks;
	/*
	 * Knowing the number of pages allocated is useful when you
	 * want to deallocate them.
	 */
	td->td_kstack_pages = pages;
	/*
	 * For the length of the stack, link in a real page of ram for each
	 * page of stack.
	 */
	VM_OBJECT_LOCK(ksobj);
	for (i = 0; i < pages; i++) {
		/*
		 * Get a kernel stack page.
		 */
		m = vm_page_grab(ksobj, i,
		    VM_ALLOC_NORMAL | VM_ALLOC_RETRY | VM_ALLOC_WIRED);
		ma[i] = m;
		vm_page_lock_queues();
		vm_page_wakeup(m);
		m->valid = VM_PAGE_BITS_ALL;
		vm_page_unlock_queues();
	}
	VM_OBJECT_UNLOCK(ksobj);
	pmap_qenter(ks, ma, pages);
}

/*
 * Dispose of a thread's kernel stack.
 */
void
vm_thread_dispose(struct thread *td)
{
	vm_object_t ksobj;
	vm_offset_t ks;
	vm_page_t m;
	int i, pages;

	pages = td->td_kstack_pages;
	ksobj = td->td_kstack_obj;
	ks = td->td_kstack;
	pmap_qremove(ks, pages);
	VM_OBJECT_LOCK(ksobj);
	for (i = 0; i < pages; i++) {
		m = vm_page_lookup(ksobj, i);
		if (m == NULL)
			panic("vm_thread_dispose: kstack already missing?");
		vm_page_lock_queues();
		vm_page_busy(m);
		vm_page_unwire(m, 0);
		vm_page_free(m);
		vm_page_unlock_queues();
	}
	VM_OBJECT_UNLOCK(ksobj);
	vm_object_deallocate(ksobj);
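	/*
	 * Free the kernel virtual address range, including the guard pages
	 * that precede the stack proper.
	 */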
	kmem_free(kernel_map, ks - (KSTACK_GUARD_PAGES * PAGE_SIZE),
	    (pages + KSTACK_GUARD_PAGES) * PAGE_SIZE);
}

/*
 * Allow a thread's kernel stack to be paged out.
 */
void
vm_thread_swapout(struct thread *td)
{
	vm_object_t ksobj;
	vm_page_t m;
	int i, pages;

	cpu_thread_swapout(td);
	pages = td->td_kstack_pages;
	ksobj = td->td_kstack_obj;
	pmap_qremove(td->td_kstack, pages);
	VM_OBJECT_LOCK(ksobj);
	for (i = 0; i < pages; i++) {
		m = vm_page_lookup(ksobj, i);
		if (m == NULL)
			panic("vm_thread_swapout: kstack already missing?");
		vm_page_lock_queues();
		vm_page_dirty(m);
		vm_page_unwire(m, 0);
		vm_page_unlock_queues();
	}
	VM_OBJECT_UNLOCK(ksobj);
}

/*
 * Bring the kernel stack for a specified thread back in.
 */
void
vm_thread_swapin(struct thread *td)
{
	vm_object_t ksobj;
	vm_page_t m, ma[KSTACK_MAX_PAGES];
	int i, pages, rv;

	pages = td->td_kstack_pages;
	ksobj = td->td_kstack_obj;
	VM_OBJECT_LOCK(ksobj);
	for (i = 0; i < pages; i++) {
		m = vm_page_grab(ksobj, i, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
		if (m->valid != VM_PAGE_BITS_ALL) {
			rv = vm_pager_get_pages(ksobj, &m, 1, 0);
			if (rv != VM_PAGER_OK)
				panic("vm_thread_swapin: cannot get kstack for proc: %d", td->td_proc->p_pid);
			m = vm_page_lookup(ksobj, i);
			m->valid = VM_PAGE_BITS_ALL;
		}
		ma[i] = m;
		vm_page_lock_queues();
		vm_page_wire(m);
		vm_page_wakeup(m);
		vm_page_unlock_queues();
	}
	VM_OBJECT_UNLOCK(ksobj);
	pmap_qenter(td->td_kstack, ma, pages);
	cpu_thread_swapin(td);
}

/*
 * Set up a variable-sized alternate kstack.
 */
void
vm_thread_new_altkstack(struct thread *td, int pages)
{

	td->td_altkstack = td->td_kstack;
	td->td_altkstack_obj = td->td_kstack_obj;
	td->td_altkstack_pages = td->td_kstack_pages;

	vm_thread_new(td, pages);
}

/*
 * Restore the original kstack.
 */
void
vm_thread_dispose_altkstack(struct thread *td)
{

	vm_thread_dispose(td);

	td->td_kstack = td->td_altkstack;
	td->td_kstack_obj = td->td_altkstack_obj;
	td->td_kstack_pages = td->td_altkstack_pages;
	td->td_altkstack = 0;
	td->td_altkstack_obj = NULL;
	td->td_altkstack_pages = 0;
}

/*
 * Implement fork's actions on an address space.
 * Here we arrange for the address space to be copied or referenced,
 * allocate a user struct (pcb and kernel stack), then call the
 * machine-dependent layer to fill those in and make the new process
 * ready to run.  The new process is set up so that it returns directly
 * to user mode to avoid stack copying and relocation problems.
 */
void
vm_forkproc(td, p2, td2, flags)
	struct thread *td;
	struct proc *p2;
	struct thread *td2;
	int flags;
{
	struct proc *p1 = td->td_proc;
	struct user *up;

	GIANT_REQUIRED;

	if ((flags & RFPROC) == 0) {
		/*
		 * Divorce the memory, if it is shared; essentially this
		 * changes memory shared amongst threads into COW locally.
		 */
		if ((flags & RFMEM) == 0) {
			if (p1->p_vmspace->vm_refcnt > 1) {
				vmspace_unshare(p1);
			}
		}
		cpu_fork(td, p2, td2, flags);
		return;
	}

	if (flags & RFMEM) {
		p2->p_vmspace = p1->p_vmspace;
		p1->p_vmspace->vm_refcnt++;
	}

	while (vm_page_count_severe()) {
		VM_WAIT;
	}

	if ((flags & RFMEM) == 0) {
		p2->p_vmspace = vmspace_fork(p1->p_vmspace);
		if (p1->p_vmspace->vm_shm)
			shmfork(p1, p2);
	}

	/* XXXKSE this is unsatisfactory but should be adequate */
	up = p2->p_uarea;
	MPASS(p2->p_sigacts != NULL);

	/*
	 * p_stats currently points at fields in the user struct, reached
	 * via p_addr rather than &u.  Copy parts of p_stats and zero the
	 * rest of it (statistics).
	 */
	p2->p_stats = &up->u_stats;
	bzero(&up->u_stats.pstat_startzero,
	    (unsigned) ((caddr_t) &up->u_stats.pstat_endzero -
		(caddr_t) &up->u_stats.pstat_startzero));
	bcopy(&p1->p_stats->pstat_startcopy, &up->u_stats.pstat_startcopy,
	    ((caddr_t) &up->u_stats.pstat_endcopy -
		(caddr_t) &up->u_stats.pstat_startcopy));

	/*
	 * cpu_fork will copy and update the pcb, set up the kernel stack,
	 * and make the child ready to run.
	 */
	cpu_fork(td, p2, td2, flags);
}

/*
 * Called after the process has been wait(2)'ed upon and is being reaped.
 * The idea is to reclaim resources that we could not reclaim while
 * the process was still executing.
 */
void
vm_waitproc(p)
	struct proc *p;
{

	GIANT_REQUIRED;
	vmspace_exitfree(p);		/* and clean-out the vmspace */
}

/*
 * Set default limits for VM system.
 * Called for proc 0, and then inherited by all others.
 *
 * XXX should probably act directly on proc0.
 */
static void
vm_init_limits(udata)
	void *udata;
{
	struct proc *p = udata;
	struct plimit *limp;
	int rss_limit;

	/*
	 * Set up the initial limits on process VM. Set the maximum resident
	 * set size to be half of (reasonably) available memory.  Since this
	 * is a soft limit, it comes into effect only when the system is out
	 * of memory - half of main memory helps to favor smaller processes,
	 * and reduces thrashing of the object cache.
	 */
	limp = p->p_limit;
	limp->pl_rlimit[RLIMIT_STACK].rlim_cur = dflssiz;
	limp->pl_rlimit[RLIMIT_STACK].rlim_max = maxssiz;
	limp->pl_rlimit[RLIMIT_DATA].rlim_cur = dfldsiz;
	limp->pl_rlimit[RLIMIT_DATA].rlim_max = maxdsiz;
	/* limit the limit to no less than 2MB */
	rss_limit = max(cnt.v_free_count, 512);
	limp->pl_rlimit[RLIMIT_RSS].rlim_cur = ptoa(rss_limit);
	limp->pl_rlimit[RLIMIT_RSS].rlim_max = RLIM_INFINITY;
}

void
faultin(p)
	struct proc *p;
{
#ifdef NO_SWAPPING

	PROC_LOCK_ASSERT(p, MA_OWNED);
	if ((p->p_sflag & PS_INMEM) == 0)
		panic("faultin: proc swapped out with NO_SWAPPING!");
#else /* !NO_SWAPPING */
	struct thread *td;

	GIANT_REQUIRED;
	PROC_LOCK_ASSERT(p, MA_OWNED);
	/*
	 * If another process is swapping in this process,
	 * just wait until it finishes.
	 */
	if (p->p_sflag & PS_SWAPPINGIN)
		msleep(&p->p_sflag, &p->p_mtx, PVM, "faultin", 0);
	else if ((p->p_sflag & PS_INMEM) == 0) {
		/*
		 * Don't let another thread swap process p out while we are
		 * busy swapping it in.
		 */
		++p->p_lock;
		mtx_lock_spin(&sched_lock);
		p->p_sflag |= PS_SWAPPINGIN;
		mtx_unlock_spin(&sched_lock);
		PROC_UNLOCK(p);

		vm_proc_swapin(p);
		FOREACH_THREAD_IN_PROC(p, td)
			vm_thread_swapin(td);

		PROC_LOCK(p);
		mtx_lock_spin(&sched_lock);
		p->p_sflag &= ~PS_SWAPPINGIN;
		p->p_sflag |= PS_INMEM;
		FOREACH_THREAD_IN_PROC(p, td) {
			TD_CLR_SWAPPED(td);
			if (TD_CAN_RUN(td))
				setrunnable(td);
		}
		mtx_unlock_spin(&sched_lock);

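		/*
		 * Wake up any threads sleeping above in the PS_SWAPPINGIN
		 * msleep(), now that the swapin has completed.
		 */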
		wakeup(&p->p_sflag);

		/* Allow other threads to swap p out now. */
		--p->p_lock;
	}
#endif /* NO_SWAPPING */
}

/*
 * This swapin algorithm attempts to swap-in processes only if there
 * is enough space for them.  Of course, if a process waits for a long
 * time, it will be swapped in anyway.
 *
 * XXXKSE - the process with the highest-priority thread counts.
 *
 * Giant is still held at this point, to be released in tsleep.
 */
/* ARGSUSED*/
static void
scheduler(dummy)
	void *dummy;
{
	struct proc *p;
	struct thread *td;
	int pri;
	struct proc *pp;
	int ppri;

	mtx_assert(&Giant, MA_OWNED | MA_NOTRECURSED);
	/* GIANT_REQUIRED */

loop:
	if (vm_page_count_min()) {
		VM_WAIT;
		goto loop;
	}

	pp = NULL;
	ppri = INT_MIN;
	sx_slock(&allproc_lock);
	FOREACH_PROC_IN_SYSTEM(p) {
		struct ksegrp *kg;
		if (p->p_sflag & (PS_INMEM | PS_SWAPPINGOUT | PS_SWAPPINGIN)) {
			continue;
		}
		mtx_lock_spin(&sched_lock);
		FOREACH_THREAD_IN_PROC(p, td) {
			/*
			 * An otherwise runnable thread of a process
			 * swapped out has only the TDI_SWAPPED bit set.
			 *
			 */
			if (td->td_inhibitors == TDI_SWAPPED) {
				kg = td->td_ksegrp;
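				/*
				 * Rank the candidate by how long it has been
				 * swapped out plus how long it has slept;
				 * factor in the nice value unless a swapin
				 * was explicitly requested.
				 */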
				pri = p->p_swtime + kg->kg_slptime;
				if ((p->p_sflag & PS_SWAPINREQ) == 0) {
					pri -= kg->kg_nice * 8;
				}

				/*
				 * if this ksegrp is higher priority
				 * and there is enough space, then select
				 * this process instead of the previous
				 * selection.
				 */
				if (pri > ppri) {
					pp = p;
					ppri = pri;
				}
			}
		}
		mtx_unlock_spin(&sched_lock);
	}
	sx_sunlock(&allproc_lock);

	/*
	 * Nothing to do, back to sleep.
	 */
	if ((p = pp) == NULL) {
		tsleep(&proc0, PVM, "sched", maxslp * hz / 2);
		goto loop;
	}
	PROC_LOCK(p);

	/*
	 * Another process may be bringing or may have already
	 * brought this process in while we traverse all threads.
	 * Or, this process may even be being swapped out again.
	 */
	if (p->p_sflag & (PS_INMEM | PS_SWAPPINGOUT | PS_SWAPPINGIN)) {
		PROC_UNLOCK(p);
		goto loop;
	}

	mtx_lock_spin(&sched_lock);
	p->p_sflag &= ~PS_SWAPINREQ;
	mtx_unlock_spin(&sched_lock);

	/*
	 * We would like to bring someone in. (only if there is space).
	 * [What checks the space? ]
	 */
	faultin(p);
	PROC_UNLOCK(p);
	mtx_lock_spin(&sched_lock);
	p->p_swtime = 0;
	mtx_unlock_spin(&sched_lock);
	goto loop;
}

#ifndef NO_SWAPPING

/*
 * Swap_idle_threshold1 is the guaranteed swapped in time for a process
 */
static int swap_idle_threshold1 = 2;
SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold1, CTLFLAG_RW,
    &swap_idle_threshold1, 0, "Guaranteed swapped in time for a process");

/*
 * Swap_idle_threshold2 is the time that a process can be idle before
 * it will be swapped out, if idle swapping is enabled.
 */
static int swap_idle_threshold2 = 10;
SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold2, CTLFLAG_RW,
    &swap_idle_threshold2, 0, "Time before a process will be swapped out");

/*
 * Swapout is driven by the pageout daemon.  Very simple: we find eligible
 * procs and unwire their u-areas.  We try to always "swap" at least one
 * process in case we need the room for a swapin.
 * If any procs have been sleeping/stopped for at least maxslp seconds,
 * they are swapped.  Else, we swap the longest-sleeping or stopped process,
 * if any, otherwise the longest-resident process.
 */
void
swapout_procs(action)
int action;
{
	struct proc *p;
	struct thread *td;
	struct ksegrp *kg;
	int didswap = 0;

	GIANT_REQUIRED;

retry:
	sx_slock(&allproc_lock);
	FOREACH_PROC_IN_SYSTEM(p) {
		struct vmspace *vm;
		int minslptime = 100000;

		/*
		 * Watch out for a process in
		 * creation.  It may have no
		 * address space or lock yet.
		 */
		mtx_lock_spin(&sched_lock);
		if (p->p_state == PRS_NEW) {
			mtx_unlock_spin(&sched_lock);
			continue;
		}
		mtx_unlock_spin(&sched_lock);

		/*
		 * An aio daemon switches its
		 * address space while running.
		 * Perform a quick check whether
		 * a process has P_SYSTEM.
		 */
		if ((p->p_flag & P_SYSTEM) != 0)
			continue;

		/*
		 * Do not swapout a process that
		 * is waiting for VM data
		 * structures as there is a possible
		 * deadlock.  Test this first as
		 * this may block.
		 *
		 * Lock the map until swapout
		 * finishes, or a thread of this
		 * process may attempt to alter
		 * the map.
		 */
		PROC_LOCK(p);
		vm = p->p_vmspace;
		KASSERT(vm != NULL,
			("swapout_procs: a process has no address space"));
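		/*
		 * Take a reference on the vmspace so that it is not
		 * destroyed out from under us after the proc lock is
		 * dropped below.
		 */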
		++vm->vm_refcnt;
		PROC_UNLOCK(p);
		if (!vm_map_trylock(&vm->vm_map))
			goto nextproc1;

		PROC_LOCK(p);
		if (p->p_lock != 0 ||
		    (p->p_flag & (P_STOPPED_SINGLE|P_TRACED|P_SYSTEM|P_WEXIT)
		    ) != 0) {
			goto nextproc2;
		}
		/*
		 * only aiod changes vmspace, however it will be
		 * skipped because of the if statement above checking
		 * for P_SYSTEM
		 */
		if ((p->p_sflag & (PS_INMEM|PS_SWAPPINGOUT|PS_SWAPPINGIN)) != PS_INMEM)
			goto nextproc2;

		switch (p->p_state) {
		default:
			/* Don't swap out processes in any sort
			 * of 'special' state. */
			break;

		case PRS_NORMAL:
			mtx_lock_spin(&sched_lock);
			/*
			 * do not swapout a realtime process
			 * Check all the thread groups..
			 */
			FOREACH_KSEGRP_IN_PROC(p, kg) {
				if (PRI_IS_REALTIME(kg->kg_pri_class))
					goto nextproc;

				/*
				 * Guarantee swap_idle_threshold1
				 * time in memory.
				 */
				if (kg->kg_slptime < swap_idle_threshold1)
					goto nextproc;

				/*
				 * Do not swapout a process if it is
				 * waiting on a critical event of some
				 * kind or there is a thread whose
				 * pageable memory may be accessed.
				 *
				 * This could be refined to support
				 * swapping out a thread.
				 */
				FOREACH_THREAD_IN_GROUP(kg, td) {
					if ((td->td_priority) < PSOCK ||
					    !thread_safetoswapout(td))
						goto nextproc;
				}
				/*
				 * If the system is under memory stress,
				 * or if we are swapping
				 * idle processes >= swap_idle_threshold2,
				 * then swap the process out.
				 */
				if (((action & VM_SWAP_NORMAL) == 0) &&
				    (((action & VM_SWAP_IDLE) == 0) ||
				    (kg->kg_slptime < swap_idle_threshold2)))
					goto nextproc;

				if (minslptime > kg->kg_slptime)
					minslptime = kg->kg_slptime;
			}

			/*
			 * If the process has been asleep for a while and had
			 * most of its pages taken away already, swap it out.
			 */
			if ((action & VM_SWAP_NORMAL) ||
				((action & VM_SWAP_IDLE) &&
				 (minslptime > swap_idle_threshold2))) {
				swapout(p);
				didswap++;
				mtx_unlock_spin(&sched_lock);
				PROC_UNLOCK(p);
				vm_map_unlock(&vm->vm_map);
				vmspace_free(vm);
				sx_sunlock(&allproc_lock);
				goto retry;
			}
nextproc:
			mtx_unlock_spin(&sched_lock);
		}
nextproc2:
		PROC_UNLOCK(p);
		vm_map_unlock(&vm->vm_map);
nextproc1:
		vmspace_free(vm);
		continue;
	}
	sx_sunlock(&allproc_lock);
	/*
	 * If we swapped something out, and another process needed memory,
	 * then wake up the sched process.
	 */
	if (didswap)
		wakeup(&proc0);
}

static void
swapout(p)
	struct proc *p;
{
	struct thread *td;

	PROC_LOCK_ASSERT(p, MA_OWNED);
	mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED);
#if defined(SWAP_DEBUG)
	printf("swapping out %d\n", p->p_pid);
#endif

	/*
	 * The states of this process and its threads may have changed
	 * by now.  Assuming that there is only one pageout daemon thread,
	 * this process should still be in memory.
	 */
	KASSERT((p->p_sflag & (PS_INMEM|PS_SWAPPINGOUT|PS_SWAPPINGIN)) == PS_INMEM,
		("swapout: lost a swapout race?"));

#if defined(INVARIANTS)
	/*
	 * Make sure that all threads are safe to be swapped out.
	 *
	 * Alternatively, we could swap out only safe threads.
	 */
	FOREACH_THREAD_IN_PROC(p, td) {
		KASSERT(thread_safetoswapout(td),
			("swapout: there is a thread not safe for swapout"));
	}
#endif /* INVARIANTS */

	++p->p_stats->p_ru.ru_nswap;
	/*
	 * remember the process resident count
	 */
	p->p_vmspace->vm_swrss = vmspace_resident_count(p->p_vmspace);

	p->p_sflag &= ~PS_INMEM;
	p->p_sflag |= PS_SWAPPINGOUT;
	PROC_UNLOCK(p);
	FOREACH_THREAD_IN_PROC(p, td)
		TD_SET_SWAPPED(td);
	mtx_unlock_spin(&sched_lock);

	vm_proc_swapout(p);
	FOREACH_THREAD_IN_PROC(p, td)
		vm_thread_swapout(td);

	PROC_LOCK(p);
	mtx_lock_spin(&sched_lock);
	p->p_sflag &= ~PS_SWAPPINGOUT;
	p->p_swtime = 0;
}
#endif /* !NO_SWAPPING */