/*-
 * Copyright (c) 1982, 1986 The Regents of the University of California.
 * Copyright (c) 1989, 1990 William Jolitz
 * Copyright (c) 1994 John Dyson
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department, and William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)vm_machdep.c	7.3 (Berkeley) 5/13/91
 *	Utah $Hdr: vm_machdep.c 1.16.1.1 89/06/23$
 * $FreeBSD: head/sys/powerpc/aim/vm_machdep.c 65557 2000-09-07 01:33:02Z jasone $
 */
/*
 * Copyright (c) 1994, 1995, 1996 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Author: Chris G. Demetriou
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
5896205Sjwd * 5945108Sobrien * Carnegie Mellon requests users of this software to return to 6042128Speter * 6142128Speter * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU 6238666Sjb * School of Computer Science 6351361Sjb * Carnegie Mellon University 6438666Sjb * Pittsburgh PA 15213-3890 6517308Speter * 6698775Sdillon * any improvements or extensions that they make and grant Carnegie the 6798723Sdillon * rights to redistribute these changes. 6898723Sdillon */ 6998723Sdillon 7098723Sdillon#include <sys/param.h> 7138666Sjb#include <sys/systm.h> 7217308Speter#include <sys/proc.h> 7338666Sjb#include <sys/malloc.h> 7417308Speter#include <sys/bio.h> 7527910Sasami#include <sys/buf.h> 7643226Sjkh#include <sys/vnode.h> 7743226Sjkh#include <sys/vmmeter.h> 7843226Sjkh#include <sys/kernel.h> 7938666Sjb#include <sys/sysctl.h> 8027910Sasami#include <sys/unistd.h> 8138666Sjb 8238666Sjb#include <machine/clock.h> 8338666Sjb#include <machine/cpu.h> 8427910Sasami#include <machine/fpu.h> 8538666Sjb#include <machine/md_var.h> 8638666Sjb#include <machine/prom.h> 8743226Sjkh#include <machine/mutex.h> 8843226Sjkh 8927910Sasami#include <vm/vm.h> 9038666Sjb#include <vm/vm_param.h> 9138666Sjb#include <sys/lock.h> 9227910Sasami#include <vm/vm_kern.h> 9338666Sjb#include <vm/vm_page.h> 9427910Sasami#include <vm/vm_map.h> 9517308Speter#include <vm/vm_extern.h> 9638666Sjb 9738666Sjb#include <sys/user.h> 9817308Speter 9995509Sru/* 10095793Sru * quick version of vm_fault 10197252Sru */ 10296164Sruint 10395146Sgshapirovm_fault_quick(v, prot) 1042061Sjkh caddr_t v; 10597769Sru int prot; 10697252Sru{ 10797252Sru int r; 10895730Sru if (prot & VM_PROT_WRITE) 10995793Sru r = subyte(v, fubyte(v)); 11095730Sru else 11195730Sru r = fubyte(v); 11295730Sru return(r); 11355026Smarcel} 11455026Smarcel 11554324Smarcel/* 11617308Speter * Finish a fork operation, with process p2 nearly set up. 
11738666Sjb * Copy and update the pcb, set up the stack so that the child 11817308Speter * ready to run and return to user mode. 11997252Sru */ 12038666Sjbvoid 12154324Smarcelcpu_fork(p1, p2, flags) 1222302Spaul register struct proc *p1, *p2; 12339206Sjkh int flags; 12439206Sjkh{ 12539206Sjkh if ((flags & RFPROC) == 0) 12673349Sru return; 12717308Speter 12854324Smarcel p2->p_md.md_tf = p1->p_md.md_tf; 12954324Smarcel p2->p_md.md_flags = p1->p_md.md_flags & (MDP_FPUSED | MDP_UAC_MASK); 13054324Smarcel 13154324Smarcel /* 13254324Smarcel * Cache the physical address of the pcb, so we can 13354324Smarcel * swap to it easily. 13454324Smarcel */ 13569659Sobrien p2->p_md.md_pcbpaddr = (void*)vtophys((vm_offset_t)&p2->p_addr->u_pcb); 13654324Smarcel 13754324Smarcel /* 13854324Smarcel * Copy floating point state from the FP chip to the PCB 13954324Smarcel * if this process has state stored there. 14054324Smarcel */ 14154324Smarcel alpha_fpstate_save(p1, 0); 14254324Smarcel 14354324Smarcel /* 14454324Smarcel * Copy pcb and stack from proc p1 to p2. We do this as 14554324Smarcel * cheaply as possible, copying only the active part of the 14654324Smarcel * stack. The stack and pcb need to agree. Make sure that the 14754324Smarcel * new process has FEN disabled. 14854324Smarcel */ 14954324Smarcel p2->p_addr->u_pcb = p1->p_addr->u_pcb; 15054324Smarcel p2->p_addr->u_pcb.pcb_hw.apcb_usp = alpha_pal_rdusp(); 15154324Smarcel p2->p_addr->u_pcb.pcb_hw.apcb_flags &= ~ALPHA_PCB_FLAGS_FEN; 15254324Smarcel 15354324Smarcel /* 15454324Smarcel * Set the floating point state. 
15597536Sobrien */ 15654324Smarcel if ((p2->p_addr->u_pcb.pcb_fp_control & IEEE_INHERIT) == 0) { 15754324Smarcel p2->p_addr->u_pcb.pcb_fp_control = 0; 15854324Smarcel p2->p_addr->u_pcb.pcb_fp.fpr_cr = (FPCR_DYN_NORMAL 15995730Sru | FPCR_INVD | FPCR_DZED 16095730Sru | FPCR_OVFD | FPCR_INED 16195730Sru | FPCR_UNFD); 16295730Sru } 16395730Sru 16495730Sru /* 16595730Sru * Arrange for a non-local goto when the new process 16638666Sjb * is started, to resume here, returning nonzero from setjmp. 16738666Sjb */ 16817308Speter#ifdef DIAGNOSTIC 16938666Sjb if (p1 != curproc) 17038666Sjb panic("cpu_fork: curproc"); 17138666Sjb alpha_fpstate_check(p1); 17217308Speter#endif 17355678Smarcel 17455678Smarcel /* 17555678Smarcel * create the child's kernel stack, from scratch. 17655678Smarcel */ 17755678Smarcel { 17890395Sru struct user *up = p2->p_addr; 17990395Sru struct trapframe *p2tf; 18090395Sru 18190395Sru /* 18290395Sru * Pick a stack pointer, leaving room for a trapframe; 1832061Sjkh * copy trapframe from parent so return to user mode 18417308Speter * will be to right address, with correct registers. 18538666Sjb */ 18638666Sjb p2tf = p2->p_md.md_tf = (struct trapframe *) 18717308Speter ((char *)p2->p_addr + USPACE - sizeof(struct trapframe)); 18855678Smarcel bcopy(p1->p_md.md_tf, p2->p_md.md_tf, 1893626Swollman sizeof(struct trapframe)); 19017308Speter 19155678Smarcel /* 19255678Smarcel * Set up return-value registers as fork() libc stub expects. 19355678Smarcel */ 19455678Smarcel p2tf->tf_regs[FRAME_V0] = 0; /* child's pid (linux) */ 19555678Smarcel p2tf->tf_regs[FRAME_A3] = 0; /* no error */ 19655678Smarcel p2tf->tf_regs[FRAME_A4] = 1; /* is child (FreeBSD) */ 19755678Smarcel 19855678Smarcel /* 19955678Smarcel * Arrange for continuation at child_return(), which 20055678Smarcel * will return to exception_return(). Note that the child 20155678Smarcel * process doesn't stay in the kernel for long! 20255678Smarcel * 20355678Smarcel * This is an inlined version of cpu_set_kpc. 
20438666Sjb */ 20538666Sjb up->u_pcb.pcb_hw.apcb_ksp = (u_int64_t)p2tf; 20617308Speter up->u_pcb.pcb_context[0] = 20755678Smarcel (u_int64_t)child_return; /* s0: pc */ 20838978Sjb up->u_pcb.pcb_context[1] = 2093626Swollman (u_int64_t)exception_return; /* s1: ra */ 21017308Speter up->u_pcb.pcb_context[2] = (u_long) p2; /* s2: a0 */ 21138666Sjb up->u_pcb.pcb_context[7] = 21217308Speter (u_int64_t)switch_trampoline; /* ra: assembly magic */ 21343226Sjkh } 21443226Sjkh} 21543226Sjkh 21638666Sjb/* 21738666Sjb * Intercept the return address from a freshly forked process that has NOT 21844103Smsmith * been scheduled yet. 219 * 220 * This is needed to make kernel threads stay in kernel mode. 221 */ 222void 223cpu_set_fork_handler(p, func, arg) 224 struct proc *p; 225 void (*func) __P((void *)); 226 void *arg; 227{ 228 /* 229 * Note that the trap frame follows the args, so the function 230 * is really called like this: func(arg, frame); 231 */ 232 p->p_addr->u_pcb.pcb_context[0] = (u_long) func; 233 p->p_addr->u_pcb.pcb_context[2] = (u_long) arg; 234} 235 236/* 237 * cpu_exit is called as the last action during exit. 238 * We release the address space of the process, block interrupts, 239 * and call switch_exit. switch_exit switches to proc0's PCB and stack, 240 * then jumps into the middle of cpu_switch, as if it were switching 241 * from proc0. 242 */ 243void 244cpu_exit(p) 245 register struct proc *p; 246{ 247 alpha_fpstate_drop(p); 248 249 (void) splhigh(); 250 mtx_enter(&sched_lock, MTX_SPIN); 251 mtx_exit(&Giant, MTX_DEF); 252 cnt.v_swtch++; 253 cpu_switch(); 254 panic("cpu_exit"); 255} 256 257void 258cpu_wait(p) 259 struct proc *p; 260{ 261 /* drop per-process resources */ 262 pmap_dispose_proc(p); 263 264 /* and clean-out the vmspace */ 265 vmspace_free(p->p_vmspace); 266} 267 268/* 269 * Dump the machine specific header information at the start of a core dump. 
270 */ 271int 272cpu_coredump(p, vp, cred) 273 struct proc *p; 274 struct vnode *vp; 275 struct ucred *cred; 276{ 277 278 return (vn_rdwr(UIO_WRITE, vp, (caddr_t) p->p_addr, ctob(UPAGES), 279 (off_t)0, UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, cred, (int *)NULL, 280 p)); 281} 282 283#ifdef notyet 284static void 285setredzone(pte, vaddr) 286 u_short *pte; 287 caddr_t vaddr; 288{ 289/* eventually do this by setting up an expand-down stack segment 290 for ss0: selector, allowing stack access down to top of u. 291 this means though that protection violations need to be handled 292 thru a double fault exception that must do an integral task 293 switch to a known good context, within which a dump can be 294 taken. a sensible scheme might be to save the initial context 295 used by sched (that has physical memory mapped 1:1 at bottom) 296 and take the dump while still in mapped mode */ 297} 298#endif 299 300/* 301 * Map an IO request into kernel virtual address space. 302 * 303 * All requests are (re)mapped into kernel VA space. 304 * Notice that we use b_bufsize for the size of the buffer 305 * to be mapped. b_bcount might be modified by the driver. 306 */ 307void 308vmapbuf(bp) 309 register struct buf *bp; 310{ 311 register caddr_t addr, v, kva; 312 vm_offset_t pa; 313 314 if ((bp->b_flags & B_PHYS) == 0) 315 panic("vmapbuf"); 316 317 for (v = bp->b_saveaddr, addr = (caddr_t)trunc_page(bp->b_data); 318 addr < bp->b_data + bp->b_bufsize; 319 addr += PAGE_SIZE, v += PAGE_SIZE) { 320 /* 321 * Do the vm_fault if needed; do the copy-on-write thing 322 * when reading stuff off device into memory. 
323 */ 324 vm_fault_quick(addr, 325 (bp->b_iocmd == BIO_READ)?(VM_PROT_READ|VM_PROT_WRITE):VM_PROT_READ); 326 pa = trunc_page(pmap_kextract((vm_offset_t) addr)); 327 if (pa == 0) 328 panic("vmapbuf: page not present"); 329 vm_page_hold(PHYS_TO_VM_PAGE(pa)); 330 pmap_kenter((vm_offset_t) v, pa); 331 } 332 333 kva = bp->b_saveaddr; 334 bp->b_saveaddr = bp->b_data; 335 bp->b_data = kva + (((vm_offset_t) bp->b_data) & PAGE_MASK); 336} 337 338/* 339 * Free the io map PTEs associated with this IO operation. 340 * We also invalidate the TLB entries and restore the original b_addr. 341 */ 342void 343vunmapbuf(bp) 344 register struct buf *bp; 345{ 346 register caddr_t addr; 347 vm_offset_t pa; 348 349 if ((bp->b_flags & B_PHYS) == 0) 350 panic("vunmapbuf"); 351 352 for (addr = (caddr_t)trunc_page(bp->b_data); 353 addr < bp->b_data + bp->b_bufsize; 354 addr += PAGE_SIZE) { 355 pa = trunc_page(pmap_kextract((vm_offset_t) addr)); 356 pmap_kremove((vm_offset_t) addr); 357 vm_page_unhold(PHYS_TO_VM_PAGE(pa)); 358 } 359 360 bp->b_data = bp->b_saveaddr; 361} 362 363/* 364 * Reset back to firmware. 365 */ 366void 367cpu_reset() 368{ 369 prom_halt(0); 370} 371 372int 373grow_stack(p, sp) 374 struct proc *p; 375 size_t sp; 376{ 377 int rv; 378 379 rv = vm_map_growstack (p, sp); 380 if (rv != KERN_SUCCESS) 381 return (0); 382 383 return (1); 384} 385 386 387static int cnt_prezero; 388 389SYSCTL_INT(_machdep, OID_AUTO, cnt_prezero, CTLFLAG_RD, &cnt_prezero, 0, ""); 390 391/* 392 * Implement the pre-zeroed page mechanism. 393 * This routine is called from the idle loop. 394 */ 395 396#define ZIDLE_LO(v) ((v) * 2 / 3) 397#define ZIDLE_HI(v) ((v) * 4 / 5) 398 399int 400vm_page_zero_idle() 401{ 402 static int free_rover; 403 static int zero_state; 404 vm_page_t m; 405 int s; 406 407 /* 408 * Attempt to maintain approximately 1/2 of our free pages in a 409 * PG_ZERO'd state. Add some hysteresis to (attempt to) avoid 410 * generally zeroing a page when the system is near steady-state. 
411 * Otherwise we might get 'flutter' during disk I/O / IPC or 412 * fast sleeps. We also do not want to be continuously zeroing 413 * pages because doing so may flush our L1 and L2 caches too much. 414 */ 415 416 if (zero_state && vm_page_zero_count >= ZIDLE_LO(cnt.v_free_count)) 417 return(0); 418 if (vm_page_zero_count >= ZIDLE_HI(cnt.v_free_count)) 419 return(0); 420 421#ifdef SMP 422 if (KLOCK_ENTER(M_TRY)) { 423#endif 424 s = splvm(); 425 m = vm_page_list_find(PQ_FREE, free_rover, FALSE); 426 zero_state = 0; 427 if (m != NULL && (m->flags & PG_ZERO) == 0) { 428 vm_page_queues[m->queue].lcnt--; 429 TAILQ_REMOVE(&vm_page_queues[m->queue].pl, m, pageq); 430 m->queue = PQ_NONE; 431 splx(s); 432#if 0 433 rel_mplock(); 434#endif 435 pmap_zero_page(VM_PAGE_TO_PHYS(m)); 436#if 0 437 get_mplock(); 438#endif 439 (void)splvm(); 440 vm_page_flag_set(m, PG_ZERO); 441 m->queue = PQ_FREE + m->pc; 442 vm_page_queues[m->queue].lcnt++; 443 TAILQ_INSERT_TAIL(&vm_page_queues[m->queue].pl, m, 444 pageq); 445 ++vm_page_zero_count; 446 ++cnt_prezero; 447 if (vm_page_zero_count >= ZIDLE_HI(cnt.v_free_count)) 448 zero_state = 1; 449 } 450 free_rover = (free_rover + PQ_PRIME2) & PQ_L2_MASK; 451 splx(s); 452#ifdef SMP 453 KLOCK_EXIT; 454#endif 455 return (1); 456#ifdef SMP 457 } 458#endif 459 return (0); 460} 461 462/* 463 * Software interrupt handler for queued VM system processing. 464 */ 465void 466swi_vm() 467{ 468 if (busdma_swi_pending != 0) 469 busdma_swi(); 470} 471 472/* 473 * Tell whether this address is in some physical memory region. 474 * Currently used by the kernel coredump code in order to avoid 475 * dumping the ``ISA memory hole'' which could cause indefinite hangs, 476 * or other unpredictable behaviour. 477 */ 478 479 480int 481is_physical_memory(addr) 482 vm_offset_t addr; 483{ 484 /* 485 * stuff other tests for known memory-mapped devices (PCI?) 486 * here 487 */ 488 489 return 1; 490} 491