vm_glue.c revision 31667
/*
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)vm_glue.c	8.6 (Berkeley) 1/5/94
 *
 *
 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
61173412Skevlo * 62173412Skevlo * $Id: vm_glue.c,v 1.68 1997/12/06 02:23:30 dyson Exp $ 63173412Skevlo */ 64173412Skevlo 65173412Skevlo#include "opt_rlimit.h" 66173412Skevlo 67173412Skevlo#include <sys/param.h> 6862583Sitojun#include <sys/systm.h> 6955505Sshin#include <sys/proc.h> 7055505Sshin#include <sys/resourcevar.h> 7155505Sshin#include <sys/buf.h> 7262583Sitojun#include <sys/shm.h> 7355505Sshin#include <sys/vmmeter.h> 7455505Sshin#include <sys/sysctl.h> 7555505Sshin 7662583Sitojun#include <sys/kernel.h> 7762583Sitojun#include <sys/unistd.h> 7862583Sitojun 7962583Sitojun#include <machine/limits.h> 8062583Sitojun 8162583Sitojun#include <vm/vm.h> 8262583Sitojun#include <vm/vm_param.h> 8378064Sume#include <vm/vm_prot.h> 8478064Sume#include <sys/lock.h> 8578064Sume#include <vm/pmap.h> 8662583Sitojun#include <vm/vm_map.h> 8755505Sshin#include <vm/vm_page.h> 88173412Skevlo#include <vm/vm_pageout.h> 8955505Sshin#include <vm/vm_kern.h> 9055505Sshin#include <vm/vm_extern.h> 91121155Sume 9255505Sshin#include <sys/user.h> 93122108Sume 94122108Sume/* 95122108Sume * System initialization 96122108Sume * 97122108Sume * Note: proc0 from proc.h 98122108Sume */ 9962583Sitojun 10055505Sshinstatic void vm_init_limits __P((void *)); 10155505SshinSYSINIT(vm_limits, SI_SUB_VM_CONF, SI_ORDER_FIRST, vm_init_limits, &proc0) 10255505Sshin 10355505Sshin/* 10455505Sshin * THIS MUST BE THE LAST INITIALIZATION ITEM!!! 10555505Sshin * 10655505Sshin * Note: run scheduling should be divorced from the vm system. 
10755505Sshin */ 10855505Sshinstatic void scheduler __P((void *)); 10955505SshinSYSINIT(scheduler, SI_SUB_RUN_SCHEDULER, SI_ORDER_FIRST, scheduler, NULL) 110121155Sume 111121155Sume 112121155Sumestatic void swapout __P((struct proc *)); 113121155Sume 11455505Sshinextern char kstack[]; 11578064Sume 11678064Sume/* vm_map_t upages_map; */ 11778064Sume 11855505Sshinint 11955505Sshinkernacc(addr, len, rw) 12055505Sshin caddr_t addr; 12155505Sshin int len, rw; 12255505Sshin{ 12355505Sshin boolean_t rv; 12455505Sshin vm_offset_t saddr, eaddr; 12555505Sshin vm_prot_t prot = rw == B_READ ? VM_PROT_READ : VM_PROT_WRITE; 12655505Sshin 12755505Sshin saddr = trunc_page(addr); 12855505Sshin eaddr = round_page(addr + len); 12955505Sshin vm_map_lock_read(kernel_map); 13055505Sshin rv = vm_map_check_protection(kernel_map, saddr, eaddr, prot); 13155505Sshin vm_map_unlock_read(kernel_map); 13255505Sshin return (rv == TRUE); 13355505Sshin} 13455505Sshin 13555505Sshinint 13655505Sshinuseracc(addr, len, rw) 13755505Sshin caddr_t addr; 13855505Sshin int len, rw; 13955505Sshin{ 14055505Sshin boolean_t rv; 14155505Sshin vm_prot_t prot = rw == B_READ ? VM_PROT_READ : VM_PROT_WRITE; 14255505Sshin vm_map_t map; 14355505Sshin vm_map_entry_t save_hint; 14455505Sshin 14555505Sshin /* 14662583Sitojun * XXX - check separately to disallow access to user area and user 14778064Sume * page tables - they are in the map. 14862583Sitojun * 14955505Sshin * XXX - VM_MAXUSER_ADDRESS is an end address, not a max. It was once 15055505Sshin * only used (as an end address) in trap.c. Use it as an end address 15155505Sshin * here too. This bogusness has spread. I just fixed where it was 15255505Sshin * used as a max in vm_mmap.c. 
15355505Sshin */ 15455505Sshin if ((vm_offset_t) addr + len > /* XXX */ VM_MAXUSER_ADDRESS 15555505Sshin || (vm_offset_t) addr + len < (vm_offset_t) addr) { 156121155Sume return (FALSE); 15755505Sshin } 15855505Sshin map = &curproc->p_vmspace->vm_map; 15955505Sshin vm_map_lock_read(map); 16055505Sshin /* 161122108Sume * We save the map hint, and restore it. Useracc appears to distort 162122108Sume * the map hint unnecessarily. 163122108Sume */ 164122108Sume save_hint = map->hint; 165122108Sume rv = vm_map_check_protection(map, 166122108Sume trunc_page(addr), round_page(addr + len), prot); 16755505Sshin map->hint = save_hint; 16855505Sshin vm_map_unlock_read(map); 16955505Sshin 17055505Sshin return (rv == TRUE); 17155505Sshin} 17255505Sshin 17355505Sshinvoid 17455505Sshinvslock(addr, len) 17555505Sshin caddr_t addr; 17655505Sshin u_int len; 17755505Sshin{ 17855505Sshin vm_map_pageable(&curproc->p_vmspace->vm_map, trunc_page(addr), 17962583Sitojun round_page(addr + len), FALSE); 18062583Sitojun} 18155505Sshin 18262583Sitojunvoid 18362583Sitojunvsunlock(addr, len, dirtied) 18462583Sitojun caddr_t addr; 18555505Sshin u_int len; 186121155Sume int dirtied; 18762583Sitojun{ 18855505Sshin#ifdef lint 18955505Sshin dirtied++; 19055505Sshin#endif /* lint */ 19155505Sshin vm_map_pageable(&curproc->p_vmspace->vm_map, trunc_page(addr), 19255505Sshin round_page(addr + len), TRUE); 19355505Sshin} 19455505Sshin 19555505Sshin/* 19655505Sshin * Implement fork's actions on an address space. 19778064Sume * Here we arrange for the address space to be copied or referenced, 19855505Sshin * allocate a user struct (pcb and kernel stack), then call the 19955505Sshin * machine-dependent layer to fill those in and make the new process 20055505Sshin * ready to run. The new process is set up so that it returns directly 20155505Sshin * to user mode to avoid stack copying and relocation problems. 
20255505Sshin */ 20355505Sshinvoid 20455505Sshinvm_fork(p1, p2, flags) 20555505Sshin register struct proc *p1, *p2; 20655505Sshin int flags; 20755505Sshin{ 20855505Sshin register struct user *up; 20955505Sshin 210122108Sume if (flags & RFMEM) { 21155505Sshin p2->p_vmspace = p1->p_vmspace; 212122108Sume p1->p_vmspace->vm_refcnt++; 213122108Sume } 214122108Sume 215122108Sume while ((cnt.v_free_count + cnt.v_cache_count) < cnt.v_free_min) { 216122108Sume VM_WAIT; 217122108Sume } 218122108Sume 219122108Sume if ((flags & RFMEM) == 0) { 22055505Sshin p2->p_vmspace = vmspace_fork(p1->p_vmspace); 221122108Sume 22255505Sshin if (p1->p_vmspace->vm_shm) 22355505Sshin shmfork(p1, p2); 22455505Sshin } 22555505Sshin 22655505Sshin pmap_new_proc(p2); 22755505Sshin 22855505Sshin up = p2->p_addr; 229122108Sume 23055505Sshin /* 231122108Sume * p_stats and p_sigacts currently point at fields in the user struct 23255505Sshin * but not at &u, instead at p_addr. Copy p_sigacts and parts of 233122108Sume * p_stats; zero the rest of p_stats (statistics). 234122108Sume */ 235122108Sume p2->p_stats = &up->u_stats; 236122108Sume p2->p_sigacts = &up->u_sigacts; 237122108Sume up->u_sigacts = *p1->p_sigacts; 238122108Sume bzero(&up->u_stats.pstat_startzero, 239122108Sume (unsigned) ((caddr_t) &up->u_stats.pstat_endzero - 240122108Sume (caddr_t) &up->u_stats.pstat_startzero)); 24155505Sshin bcopy(&p1->p_stats->pstat_startcopy, &up->u_stats.pstat_startcopy, 242122108Sume ((caddr_t) &up->u_stats.pstat_endcopy - 24355505Sshin (caddr_t) &up->u_stats.pstat_startcopy)); 24455505Sshin 24555505Sshin 24655505Sshin /* 24755505Sshin * cpu_fork will copy and update the pcb, set up the kernel stack, 24855505Sshin * and make the child ready to run. 24955505Sshin */ 250121155Sume cpu_fork(p1, p2); 25155505Sshin} 25255505Sshin 25355505Sshin/* 25455505Sshin * Set default limits for VM system. 255121155Sume * Called for proc 0, and then inherited by all others. 
25655505Sshin * 25755505Sshin * XXX should probably act directly on proc0. 25855505Sshin */ 259121155Sumestatic void 26055505Sshinvm_init_limits(udata) 26155505Sshin void *udata; 26255505Sshin{ 26355505Sshin register struct proc *p = udata; 26478064Sume int rss_limit; 26555505Sshin 26655505Sshin /* 267121155Sume * Set up the initial limits on process VM. Set the maximum resident 26855505Sshin * set size to be half of (reasonably) available memory. Since this 26955505Sshin * is a soft limit, it comes into effect only when the system is out 27055505Sshin * of memory - half of main memory helps to favor smaller processes, 27155505Sshin * and reduces thrashing of the object cache. 27255505Sshin */ 27355505Sshin p->p_rlimit[RLIMIT_STACK].rlim_cur = DFLSSIZ; 274121155Sume p->p_rlimit[RLIMIT_STACK].rlim_max = MAXSSIZ; 27555505Sshin p->p_rlimit[RLIMIT_DATA].rlim_cur = DFLDSIZ; 27655505Sshin p->p_rlimit[RLIMIT_DATA].rlim_max = MAXDSIZ; 27755505Sshin /* limit the limit to no less than 2MB */ 27855505Sshin rss_limit = max(cnt.v_free_count, 512); 279122108Sume p->p_rlimit[RLIMIT_RSS].rlim_cur = ptoa(rss_limit); 28055505Sshin p->p_rlimit[RLIMIT_RSS].rlim_max = RLIM_INFINITY; 28155505Sshin} 28255505Sshin 28355505Sshinvoid 28455505Sshinfaultin(p) 28555505Sshin struct proc *p; 28655505Sshin{ 28755505Sshin int s; 28855505Sshin 28955505Sshin if ((p->p_flag & P_INMEM) == 0) { 29055505Sshin 29155505Sshin ++p->p_lock; 29255505Sshin 293122108Sume pmap_swapin_proc(p); 294122108Sume 295122108Sume s = splhigh(); 29655505Sshin 29755505Sshin if (p->p_stat == SRUN) 298122108Sume setrunqueue(p); 29955505Sshin 30055505Sshin p->p_flag |= P_INMEM; 30155505Sshin 30255505Sshin /* undo the effect of setting SLOCK above */ 30355505Sshin --p->p_lock; 30455505Sshin splx(s); 30555505Sshin 30655505Sshin } 30755505Sshin} 30855505Sshin 30955505Sshin/* 31055505Sshin * This swapin algorithm attempts to swap-in processes only if there 31155505Sshin * is enough space for them. 
Of course, if a process waits for a long 31255505Sshin * time, it will be swapped in anyway. 31355505Sshin */ 31462583Sitojun/* ARGSUSED*/ 315122108Sumestatic void 31662583Sitojunscheduler(dummy) 31762583Sitojun void *dummy; 318122108Sume{ 319122108Sume register struct proc *p; 320122108Sume register int pri; 321122108Sume struct proc *pp; 322122108Sume int ppri; 323122108Sume 324122108Sumeloop: 325122108Sume while ((cnt.v_free_count + cnt.v_cache_count) < cnt.v_free_min) { 326122108Sume VM_WAIT; 327122108Sume } 328122108Sume 329122108Sume pp = NULL; 33055505Sshin ppri = INT_MIN; 331122108Sume for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) { 33255505Sshin if (p->p_stat == SRUN && 33355505Sshin (p->p_flag & (P_INMEM | P_SWAPPING)) == 0) { 33455505Sshin int mempri; 33555505Sshin 33655505Sshin pri = p->p_swtime + p->p_slptime; 33755505Sshin if ((p->p_flag & P_SWAPINREQ) == 0) { 338121155Sume pri -= p->p_nice * 8; 33955505Sshin } 34055505Sshin mempri = pri > 0 ? pri : 0; 34155505Sshin /* 34255505Sshin * if this process is higher priority and there is 343121155Sume * enough space, then select this process instead of 34455505Sshin * the previous selection. 34555505Sshin */ 34655505Sshin if (pri > ppri) { 34755505Sshin pp = p; 34862583Sitojun ppri = pri; 34955505Sshin } 35062583Sitojun } 35162583Sitojun } 352121155Sume 35355505Sshin /* 35455505Sshin * Nothing to do, back to sleep. 35555505Sshin */ 35655505Sshin if ((p = pp) == NULL) { 35755505Sshin tsleep(&proc0, PVM, "sched", 0); 35855505Sshin goto loop; 35955505Sshin } 36055505Sshin p->p_flag &= ~P_SWAPINREQ; 36155505Sshin 36255505Sshin /* 36355505Sshin * We would like to bring someone in. (only if there is space). 
36455505Sshin */ 36555505Sshin faultin(p); 36655505Sshin p->p_swtime = 0; 36755505Sshin goto loop; 36855505Sshin} 36955505Sshin 37055505Sshin#ifndef NO_SWAPPING 37155505Sshin 37255505Sshin#define swappable(p) \ 37355505Sshin (((p)->p_lock == 0) && \ 374121155Sume ((p)->p_flag & (P_TRACED|P_NOSWAP|P_SYSTEM|P_INMEM|P_WEXIT|P_PHYSIO|P_SWAPPING)) == P_INMEM) 37555505Sshin 37655505Sshin 37755505Sshin/* 37855505Sshin * Swap_idle_threshold1 is the guaranteed swapped in time for a process 37955505Sshin */ 38055505Sshinint swap_idle_threshold1 = 2; 38155505SshinSYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold1, 38255505Sshin CTLFLAG_RW, &swap_idle_threshold1, 0, ""); 38355505Sshin 38455505Sshin/* 38555505Sshin * Swap_idle_threshold2 is the time that a process can be idle before 38655505Sshin * it will be swapped out, if idle swapping is enabled. 38755505Sshin */ 38855505Sshinint swap_idle_threshold2 = 10; 38955505SshinSYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold2, 39055505Sshin CTLFLAG_RW, &swap_idle_threshold2, 0, ""); 39155505Sshin 39255505Sshin/* 39355505Sshin * Swapout is driven by the pageout daemon. Very simple, we find eligible 39455505Sshin * procs and unwire their u-areas. We try to always "swap" at least one 39555505Sshin * process in case we need the room for a swapin. 39655505Sshin * If any procs have been sleeping/stopped for at least maxslp seconds, 39755505Sshin * they are swapped. Else, we swap the longest-sleeping or stopped process, 39855505Sshin * if any, otherwise the longest-resident process. 
39955505Sshin */ 40055505Sshinvoid 40155505Sshinswapout_procs(action) 40255505Sshinint action; 40355505Sshin{ 40455505Sshin register struct proc *p; 40555505Sshin struct proc *outp, *outp2; 40655505Sshin int outpri, outpri2; 40755505Sshin int didswap = 0; 40855505Sshin 40955505Sshin outp = outp2 = NULL; 41055505Sshin outpri = outpri2 = INT_MIN; 41155505Sshinretry: 41255505Sshin for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) { 41355505Sshin struct vmspace *vm; 41455505Sshin if (!swappable(p)) 41555505Sshin continue; 41655505Sshin 41755505Sshin vm = p->p_vmspace; 41855505Sshin 41955505Sshin switch (p->p_stat) { 42055505Sshin default: 42155505Sshin continue; 42255505Sshin 42355505Sshin case SSLEEP: 42455505Sshin case SSTOP: 42555505Sshin /* 42655505Sshin * do not swapout a realtime process 42762583Sitojun */ 42855505Sshin if (p->p_rtprio.type == RTP_PRIO_REALTIME) 42962583Sitojun continue; 43062583Sitojun 43155505Sshin /* 43255505Sshin * Do not swapout a process waiting on a critical 43355505Sshin * event of some kind. Also guarantee swap_idle_threshold1 43455505Sshin * time in memory. 43555505Sshin */ 43655505Sshin if (((p->p_priority & 0x7f) < PSOCK) || 43755505Sshin (p->p_slptime < swap_idle_threshold1)) 43862583Sitojun continue; 43955505Sshin 44062583Sitojun /* 44162583Sitojun * If the system is under memory stress, or if we are swapping 44255505Sshin * idle processes >= swap_idle_threshold2, then swap the process 44355505Sshin * out. 44455505Sshin */ 44555505Sshin if (((action & VM_SWAP_NORMAL) == 0) && 44655505Sshin (((action & VM_SWAP_IDLE) == 0) || 44755505Sshin (p->p_slptime < swap_idle_threshold2))) 44855505Sshin continue; 449121155Sume 45055505Sshin ++vm->vm_refcnt; 45155505Sshin vm_map_reference(&vm->vm_map); 452121155Sume /* 45355505Sshin * do not swapout a process that is waiting for VM 45455505Sshin * data structures there is a possible deadlock. 
455121155Sume */ 45655505Sshin if (lockmgr(&vm->vm_map.lock, 45755505Sshin LK_EXCLUSIVE | LK_NOWAIT, 45855505Sshin (void *)0, curproc)) { 45955505Sshin vm_map_deallocate(&vm->vm_map); 46055505Sshin vmspace_free(vm); 46155505Sshin continue; 46255505Sshin } 46355505Sshin vm_map_unlock(&vm->vm_map); 46455505Sshin /* 46555505Sshin * If the process has been asleep for awhile and had 46655505Sshin * most of its pages taken away already, swap it out. 46755505Sshin */ 46855505Sshin if ((action & VM_SWAP_NORMAL) || 46955505Sshin ((action & VM_SWAP_IDLE) && 47055505Sshin (p->p_slptime > swap_idle_threshold2))) { 47155505Sshin swapout(p); 47255505Sshin vm_map_deallocate(&vm->vm_map); 47355505Sshin vmspace_free(vm); 47455505Sshin didswap++; 47555505Sshin goto retry; 47655505Sshin } 47755505Sshin } 47855505Sshin } 47955505Sshin /* 48055505Sshin * If we swapped something out, and another process needed memory, 48155505Sshin * then wakeup the sched process. 48255505Sshin */ 48355505Sshin if (didswap) 48455505Sshin wakeup(&proc0); 48555505Sshin} 48655505Sshin 48755505Sshinstatic void 48855505Sshinswapout(p) 48955505Sshin register struct proc *p; 49055505Sshin{ 49155505Sshin 49255505Sshin#if defined(SWAP_DEBUG) 49355505Sshin printf("swapping out %d\n", p->p_pid); 49455505Sshin#endif 495113552Ssumikawa ++p->p_stats->p_ru.ru_nswap; 49655505Sshin /* 49755505Sshin * remember the process resident count 49855505Sshin */ 49955505Sshin p->p_vmspace->vm_swrss = 50055505Sshin p->p_vmspace->vm_pmap.pm_stats.resident_count; 50155505Sshin 50255505Sshin (void) splhigh(); 50355505Sshin p->p_flag &= ~P_INMEM; 50455505Sshin p->p_flag |= P_SWAPPING; 50555505Sshin if (p->p_stat == SRUN) 50655505Sshin remrq(p); 50755505Sshin (void) spl0(); 50855505Sshin 50955505Sshin pmap_swapout_proc(p); 51055505Sshin 51155505Sshin p->p_flag &= ~P_SWAPPING; 51255505Sshin p->p_swtime = 0; 51355505Sshin} 51455505Sshin#endif /* !NO_SWAPPING */ 51555505Sshin