vm_glue.c revision 99072
/*
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)vm_glue.c	8.6 (Berkeley) 1/5/94
 *
 *
 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 *
 * $FreeBSD: head/sys/vm/vm_glue.c 99072 2002-06-29 17:26:22Z julian $
 */

#include "opt_vm.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/shm.h>
#include <sys/vmmeter.h>
#include <sys/sx.h>
#include <sys/sysctl.h>

#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/unistd.h>

#include <machine/limits.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>

#include <sys/user.h>

extern int maxslp;

/*
 * System initialization
 *
 * Note: proc0 from proc.h
 */
static void vm_init_limits(void *);
SYSINIT(vm_limits, SI_SUB_VM_CONF, SI_ORDER_FIRST, vm_init_limits, &proc0)

/*
 * THIS MUST BE THE LAST INITIALIZATION ITEM!!!
 *
 * Note: run scheduling should be divorced from the vm system.
 */
static void scheduler(void *);
SYSINIT(scheduler, SI_SUB_RUN_SCHEDULER, SI_ORDER_FIRST, scheduler, NULL)

#ifndef NO_SWAPPING
static void swapout(struct proc *);
#endif

/*
 * MPSAFE
 */
int
kernacc(addr, len, rw)
        caddr_t addr;
        int len, rw;
{
        boolean_t rv;
        vm_offset_t saddr, eaddr;
        vm_prot_t prot;

        KASSERT((rw & ~VM_PROT_ALL) == 0,
            ("illegal ``rw'' argument to kernacc (%x)\n", rw));
        prot = rw;
        saddr = trunc_page((vm_offset_t)addr);
        eaddr = round_page((vm_offset_t)addr + len);
        rv = vm_map_check_protection(kernel_map, saddr, eaddr, prot);
        return (rv == TRUE);
}

/*
 * MPSAFE
 */
int
useracc(addr, len, rw)
        caddr_t addr;
        int len, rw;
{
        boolean_t rv;
        vm_prot_t prot;

        KASSERT((rw & ~VM_PROT_ALL) == 0,
            ("illegal ``rw'' argument to useracc (%x)\n", rw));
        prot = rw;
        /*
         * XXX - check separately to disallow access to user area and user
         * page tables - they are in the map.
         *
         * XXX - VM_MAXUSER_ADDRESS is an end address, not a max.  It was once
         * only used (as an end address) in trap.c.  Use it as an end address
         * here too.  This bogusness has spread.  I just fixed where it was
         * used as a max in vm_mmap.c.
         */
        if ((vm_offset_t) addr + len > /* XXX */ VM_MAXUSER_ADDRESS
            || (vm_offset_t) addr + len < (vm_offset_t) addr) {
                return (FALSE);
        }
        rv = vm_map_check_protection(&curproc->p_vmspace->vm_map,
            trunc_page((vm_offset_t)addr), round_page((vm_offset_t)addr + len),
            prot);
        return (rv == TRUE);
}

/*
 * MPSAFE
 */
void
vslock(addr, len)
        caddr_t addr;
        u_int len;
{

        vm_map_wire(&curproc->p_vmspace->vm_map, trunc_page((vm_offset_t)addr),
            round_page((vm_offset_t)addr + len), FALSE);
}

/*
 * MPSAFE
 */
void
vsunlock(addr, len)
        caddr_t addr;
        u_int len;
{

        vm_map_unwire(&curproc->p_vmspace->vm_map,
            trunc_page((vm_offset_t)addr),
            round_page((vm_offset_t)addr + len), FALSE);
}

/*
 * Implement fork's actions on an address space.
 * Here we arrange for the address space to be copied or referenced,
 * allocate a user struct (pcb and kernel stack), then call the
 * machine-dependent layer to fill those in and make the new process
 * ready to run.  The new process is set up so that it returns directly
 * to user mode to avoid stack copying and relocation problems.
 */
void
vm_forkproc(td, p2, td2, flags)
        struct thread *td;
        struct proc *p2;
        struct thread *td2;
        int flags;
{
        struct proc *p1 = td->td_proc;
        struct user *up;

        GIANT_REQUIRED;

        if ((flags & RFPROC) == 0) {
                /*
                 * Divorce the memory, if it is shared; essentially
                 * this changes shared memory amongst threads into
                 * COW locally.
                 */
                if ((flags & RFMEM) == 0) {
                        if (p1->p_vmspace->vm_refcnt > 1) {
                                vmspace_unshare(p1);
                        }
                }
                cpu_fork(td, p2, td2, flags);
                return;
        }

        if (flags & RFMEM) {
                p2->p_vmspace = p1->p_vmspace;
                p1->p_vmspace->vm_refcnt++;
        }

        while (vm_page_count_severe()) {
                VM_WAIT;
        }

        if ((flags & RFMEM) == 0) {
                p2->p_vmspace = vmspace_fork(p1->p_vmspace);

                pmap_pinit2(vmspace_pmap(p2->p_vmspace));

                if (p1->p_vmspace->vm_shm)
                        shmfork(p1, p2);
        }

        pmap_new_proc(p2);
        pmap_new_thread(td2);           /* Initial thread */

        /* XXXKSE this is unsatisfactory but should be adequate */
        up = p2->p_uarea;

        /*
         * p_stats currently points at fields in the user struct
         * but not at &u, instead at p_addr.  Copy parts of
         * p_stats; zero the rest of p_stats (statistics).
         *
         * If procsig->ps_refcnt is 1 and p2->p_sigacts is NULL we don't need
         * to share sigacts, so we use up->u_sigacts.
         */
        p2->p_stats = &up->u_stats;
        if (p2->p_sigacts == NULL) {
                if (p2->p_procsig->ps_refcnt != 1)
                        printf ("PID:%d NULL sigacts with refcnt not 1!\n",p2->p_pid);
                p2->p_sigacts = &up->u_sigacts;
                up->u_sigacts = *p1->p_sigacts;
        }

        bzero(&up->u_stats.pstat_startzero,
            (unsigned) ((caddr_t) &up->u_stats.pstat_endzero -
            (caddr_t) &up->u_stats.pstat_startzero));
        bcopy(&p1->p_stats->pstat_startcopy, &up->u_stats.pstat_startcopy,
            ((caddr_t) &up->u_stats.pstat_endcopy -
            (caddr_t) &up->u_stats.pstat_startcopy));

        /*
         * cpu_fork will copy and update the pcb, set up the kernel stack,
         * and make the child ready to run.
         */
        cpu_fork(td, p2, td2, flags);
}

/*
 * Called after a process has been wait(2)'ed upon and is being reaped.
 * The idea is to reclaim resources that we could not reclaim while
 * the process was still executing.
 */
void
vm_waitproc(p)
        struct proc *p;
{
        struct thread *td;

        GIANT_REQUIRED;
        cpu_wait(p);
        pmap_dispose_proc(p);           /* drop per-process resources */
/* XXXKSE by here there should not be any threads left! */
        FOREACH_THREAD_IN_PROC(p, td) {
                panic("vm_waitproc: Survivor thread!");
                pmap_dispose_thread(td);
        }
        vmspace_exitfree(p);            /* and clean out the vmspace */
}

/*
 * Set default limits for VM system.
 * Called for proc 0, and then inherited by all others.
 *
 * XXX should probably act directly on proc0.
 */
static void
vm_init_limits(udata)
        void *udata;
{
        struct proc *p = udata;
        int rss_limit;

        /*
         * Set up the initial limits on process VM.  Set the maximum resident
         * set size to be half of (reasonably) available memory.  Since this
         * is a soft limit, it comes into effect only when the system is out
         * of memory - half of main memory helps to favor smaller processes,
         * and reduces thrashing of the object cache.
         */
        p->p_rlimit[RLIMIT_STACK].rlim_cur = dflssiz;
        p->p_rlimit[RLIMIT_STACK].rlim_max = maxssiz;
        p->p_rlimit[RLIMIT_DATA].rlim_cur = dfldsiz;
        p->p_rlimit[RLIMIT_DATA].rlim_max = maxdsiz;
        /* limit the limit to no less than 2MB */
        rss_limit = max(cnt.v_free_count, 512);
        p->p_rlimit[RLIMIT_RSS].rlim_cur = ptoa(rss_limit);
        p->p_rlimit[RLIMIT_RSS].rlim_max = RLIM_INFINITY;
}

void
faultin(p)
        struct proc *p;
{
        struct thread *td;
        GIANT_REQUIRED;

        PROC_LOCK_ASSERT(p, MA_OWNED);
        mtx_lock_spin(&sched_lock);
        if ((p->p_sflag & PS_INMEM) == 0) {
                ++p->p_lock;
                mtx_unlock_spin(&sched_lock);
                PROC_UNLOCK(p);

                pmap_swapin_proc(p);
                FOREACH_THREAD_IN_PROC (p, td)
                        pmap_swapin_thread(td);

                PROC_LOCK(p);
                mtx_lock_spin(&sched_lock);
                FOREACH_THREAD_IN_PROC (p, td)
                        if (td->td_state == TDS_RUNQ)   /* XXXKSE */
                                setrunqueue(td);

                p->p_sflag |= PS_INMEM;

                /* undo the effect of setting SLOCK above */
                --p->p_lock;
        }
        mtx_unlock_spin(&sched_lock);
}

/*
 * This swapin algorithm attempts to swap in processes only if there
 * is enough space for them.  Of course, if a process waits for a long
 * time, it will be swapped in anyway.
 *
 * XXXKSE - process with the thread with highest priority counts..
 *
 * Giant is still held at this point, to be released in tsleep.
 */
/* ARGSUSED*/
static void
scheduler(dummy)
        void *dummy;
{
        struct proc *p;
        struct thread *td;
        int pri;
        struct proc *pp;
        int ppri;

        mtx_assert(&Giant, MA_OWNED | MA_NOTRECURSED);
        /* GIANT_REQUIRED */

loop:
        if (vm_page_count_min()) {
                VM_WAIT;
                goto loop;
        }

        pp = NULL;
        ppri = INT_MIN;
        sx_slock(&allproc_lock);
        FOREACH_PROC_IN_SYSTEM(p) {
                struct ksegrp *kg;
                if (p->p_sflag & (PS_INMEM | PS_SWAPPING)) {
                        continue;
                }
                mtx_lock_spin(&sched_lock);
                FOREACH_THREAD_IN_PROC(p, td) {
                        /* Only consider runnable threads */
                        if (td->td_state == TDS_RUNQ) {
                                kg = td->td_ksegrp;
                                pri = p->p_swtime + kg->kg_slptime;
                                if ((p->p_sflag & PS_SWAPINREQ) == 0) {
                                        pri -= kg->kg_nice * 8;
                                }

                                /*
                                 * If this ksegrp is higher priority
                                 * and there is enough space, then select
                                 * this process instead of the previous
                                 * selection.
                                 */
                                if (pri > ppri) {
                                        pp = p;
                                        ppri = pri;
                                }
                        }
                }
                mtx_unlock_spin(&sched_lock);
        }
        sx_sunlock(&allproc_lock);

        /*
         * Nothing to do, back to sleep.
         */
        if ((p = pp) == NULL) {
                tsleep(&proc0, PVM, "sched", maxslp * hz / 2);
                goto loop;
        }
        mtx_lock_spin(&sched_lock);
        p->p_sflag &= ~PS_SWAPINREQ;
        mtx_unlock_spin(&sched_lock);

        /*
         * We would like to bring someone in. (only if there is space).
         * [What checks the space? ]
         */
        PROC_LOCK(p);
        faultin(p);
        PROC_UNLOCK(p);
        mtx_lock_spin(&sched_lock);
        p->p_swtime = 0;
        mtx_unlock_spin(&sched_lock);
        goto loop;
}

#ifndef NO_SWAPPING

/*
 * Swap_idle_threshold1 is the guaranteed swapped-in time for a process.
 */
static int swap_idle_threshold1 = 2;
SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold1,
        CTLFLAG_RW, &swap_idle_threshold1, 0, "");

/*
 * Swap_idle_threshold2 is the time that a process can be idle before
 * it will be swapped out, if idle swapping is enabled.
 */
static int swap_idle_threshold2 = 10;
SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold2,
        CTLFLAG_RW, &swap_idle_threshold2, 0, "");

/*
 * Swapout is driven by the pageout daemon.  Very simple: we find eligible
 * procs and unwire their u-areas.  We try to always "swap" at least one
 * process in case we need the room for a swapin.
 * If any procs have been sleeping/stopped for at least maxslp seconds,
 * they are swapped.  Else, we swap the longest-sleeping or stopped process,
 * if any, otherwise the longest-resident process.
 */
void
swapout_procs(action)
int action;
{
        struct proc *p;
        struct thread *td;
        struct ksegrp *kg;
        struct proc *outp, *outp2;
        int outpri, outpri2;
        int didswap = 0;

        GIANT_REQUIRED;

        outp = outp2 = NULL;
        outpri = outpri2 = INT_MIN;
retry:
        sx_slock(&allproc_lock);
        FOREACH_PROC_IN_SYSTEM(p) {
                struct vmspace *vm;
                int minslptime = 100000;

                PROC_LOCK(p);
                if (p->p_lock != 0 ||
                    (p->p_flag & (P_STOPPED_SNGL|P_TRACED|P_SYSTEM|P_WEXIT)) != 0) {
                        PROC_UNLOCK(p);
                        continue;
                }
                /*
                 * Only aiod changes the vmspace; however, it will be
                 * skipped because of the if statement above checking
                 * for P_SYSTEM.
                 */
                vm = p->p_vmspace;
                mtx_lock_spin(&sched_lock);
                if ((p->p_sflag & (PS_INMEM|PS_SWAPPING)) != PS_INMEM) {
                        mtx_unlock_spin(&sched_lock);
                        PROC_UNLOCK(p);
                        continue;
                }

                switch (p->p_state) {
                default:
                        /* Don't swap out processes in any sort
                         * of 'special' state. */
                        mtx_unlock_spin(&sched_lock);
                        PROC_UNLOCK(p);
                        continue;

                case PRS_NORMAL:
                        /*
                         * Do not swap out a realtime process.
                         * Check all the thread groups..
                         */
                        FOREACH_KSEGRP_IN_PROC(p, kg) {
                                if (PRI_IS_REALTIME(kg->kg_pri_class)) {
                                        mtx_unlock_spin(&sched_lock);
                                        PROC_UNLOCK(p);
                                        goto nextproc;
                                }

                                /*
                                 * Do not swap out a process waiting
                                 * on a critical event of some kind.
                                 * Also guarantee swap_idle_threshold1
                                 * time in memory.
                                 */
                                if (kg->kg_slptime < swap_idle_threshold1) {
                                        mtx_unlock_spin(&sched_lock);
                                        PROC_UNLOCK(p);
                                        goto nextproc;
                                }
                                FOREACH_THREAD_IN_PROC(p, td) {
                                        if ((td->td_priority) < PSOCK) {
                                                mtx_unlock_spin(&sched_lock);
                                                PROC_UNLOCK(p);
                                                goto nextproc;
                                        }
                                }
                                /*
                                 * If the system is under memory stress,
                                 * or if we are swapping
                                 * idle processes >= swap_idle_threshold2,
                                 * then swap the process out.
                                 */
                                if (((action & VM_SWAP_NORMAL) == 0) &&
                                    (((action & VM_SWAP_IDLE) == 0) ||
                                    (kg->kg_slptime < swap_idle_threshold2))) {
                                        mtx_unlock_spin(&sched_lock);
                                        PROC_UNLOCK(p);
                                        goto nextproc;
                                }
                                if (minslptime > kg->kg_slptime)
                                        minslptime = kg->kg_slptime;
                        }

                        mtx_unlock_spin(&sched_lock);
                        ++vm->vm_refcnt;
                        /*
                         * Do not swap out a process that
                         * is waiting for VM
                         * data structures; there is a
                         * possible deadlock.
                         */
                        if (!vm_map_trylock(&vm->vm_map)) {
                                vmspace_free(vm);
                                PROC_UNLOCK(p);
                                goto nextproc;
                        }
                        vm_map_unlock(&vm->vm_map);
                        /*
                         * If the process has been asleep for a while and had
                         * most of its pages taken away already, swap it out.
                         */
                        if ((action & VM_SWAP_NORMAL) ||
                                ((action & VM_SWAP_IDLE) &&
                                (minslptime > swap_idle_threshold2))) {
                                sx_sunlock(&allproc_lock);
                                swapout(p);
                                vmspace_free(vm);
                                didswap++;
                                goto retry;
                        }
                        PROC_UNLOCK(p);
                        vmspace_free(vm);
                }
nextproc:
                continue;
        }
        sx_sunlock(&allproc_lock);
        /*
         * If we swapped something out, and another process needed memory,
         * then wakeup the sched process.
         */
        if (didswap)
                wakeup(&proc0);
}

static void
swapout(p)
        struct proc *p;
{
        struct thread *td;

        PROC_LOCK_ASSERT(p, MA_OWNED);
#if defined(SWAP_DEBUG)
        printf("swapping out %d\n", p->p_pid);
#endif
        ++p->p_stats->p_ru.ru_nswap;
        /*
         * remember the process resident count
         */
        p->p_vmspace->vm_swrss = vmspace_resident_count(p->p_vmspace);

        mtx_lock_spin(&sched_lock);
        p->p_sflag &= ~PS_INMEM;
        p->p_sflag |= PS_SWAPPING;
        PROC_UNLOCK(p);
        FOREACH_THREAD_IN_PROC (p, td)
                if (td->td_state == TDS_RUNQ)   /* XXXKSE */
                        remrunqueue(td);        /* XXXKSE */
        mtx_unlock_spin(&sched_lock);

        pmap_swapout_proc(p);
        FOREACH_THREAD_IN_PROC(p, td)
                pmap_swapout_thread(td);
        mtx_lock_spin(&sched_lock);
        p->p_sflag &= ~PS_SWAPPING;
        p->p_swtime = 0;
        mtx_unlock_spin(&sched_lock);
}
#endif /* !NO_SWAPPING */
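
/*
 * Illustrative sketch (editor's addition, not part of revision 99072):
 * callers of vslock()/vsunlock() above typically wire the user buffer
 * around the access so its pages cannot be paged out while the kernel
 * touches them.  The identifiers uaddr, kbuf and len below are
 * placeholders, not names taken from this file.
 */
#if 0
static int
example_copyin_wired(caddr_t uaddr, caddr_t kbuf, u_int len)
{
        int error;

        vslock(uaddr, len);                     /* wire the user pages */
        error = copyin(uaddr, kbuf, len);       /* copy while they are resident */
        vsunlock(uaddr, len);                   /* unwire them again */
        return (error);
}
#endif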