vm_machdep.c revision 42175
/*-
 * Copyright (c) 1982, 1986 The Regents of the University of California.
 * Copyright (c) 1989, 1990 William Jolitz
 * Copyright (c) 1994 John Dyson
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department, and William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)vm_machdep.c	7.3 (Berkeley) 5/13/91
 *	Utah $Hdr: vm_machdep.c 1.16.1.1 89/06/23$
 *	$Id: vm_machdep.c,v 1.6 1998/12/16 15:21:50 bde Exp $
 */
/*
 * Copyright (c) 1994, 1995, 1996 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Author: Chris G. Demetriou
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/malloc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/vmmeter.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>

#include <machine/clock.h>
#include <machine/cpu.h>
#include <machine/fpu.h>
#include <machine/md_var.h>
#include <machine/prom.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_prot.h>
#include <sys/lock.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_extern.h>

#include <sys/user.h>

/*
 * quick version of vm_fault
 */
void
vm_fault_quick(v, prot)
	caddr_t v;
	int prot;
{
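	/*
	 * Touch the byte through the user address space so that any
	 * fault needed to make the page resident (including a
	 * copy-on-write fault for the write case) is taken here,
	 * cheaply, instead of through a full vm_fault() setup later.
	 * fubyte()/subyte() simply return failure if the address is
	 * bogus, which is fine for this purpose.
	 */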
	if (prot & VM_PROT_WRITE)
		subyte(v, fubyte(v));
	else
		fubyte(v);
}

/*
 * Finish a fork operation, with process p2 nearly set up.
 * Copy and update the pcb, and set up the stack so that the child
 * is ready to run and return to user mode.
 */
void
cpu_fork(p1, p2)
	register struct proc *p1, *p2;
{
	struct user *up = p2->p_addr;
	int i;

	p2->p_md.md_tf = p1->p_md.md_tf;
	p2->p_md.md_flags = p1->p_md.md_flags & MDP_FPUSED;

	/*
	 * Cache the physical address of the pcb, so we can
	 * swap to it easily.
	 */
	p2->p_md.md_pcbpaddr = (void*) vtophys((vm_offset_t) &up->u_pcb);

	/*
	 * Copy floating point state from the FP chip to the PCB
	 * if this process has state stored there.
	 */
	if (p1 == fpcurproc) {
		alpha_pal_wrfen(1);
		savefpstate(&fpcurproc->p_addr->u_pcb.pcb_fp);
		alpha_pal_wrfen(0);
	}

	/*
	 * Copy the pcb from proc p1 to p2 and snapshot p1's current
	 * user stack pointer.  (The child's kernel stack is created
	 * from scratch below; the stack and pcb need to agree.)
	 */
	p2->p_addr->u_pcb = p1->p_addr->u_pcb;
	p2->p_addr->u_pcb.pcb_hw.apcb_usp = alpha_pal_rdusp();

	/*
	 * Set the floating point state.
	 */
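	/*
	 * Unless the parent asked (via IEEE_INHERIT) for its FP control
	 * word to be passed on, the child presumably starts with the
	 * default FPCR: dynamic rounding to nearest, with the invalid,
	 * divide-by-zero, overflow, inexact and underflow traps disabled.
	 */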
	if ((p2->p_addr->u_pcb.pcb_fp_control & IEEE_INHERIT) == 0) {
		p2->p_addr->u_pcb.pcb_fp_control = 0;
		p2->p_addr->u_pcb.pcb_fp.fpr_cr = (FPCR_DYN_NORMAL
						   | FPCR_INVD | FPCR_DZED
						   | FPCR_OVFD | FPCR_INED
						   | FPCR_UNFD);
	}

	/*
	 * Arrange for a non-local goto when the new process
	 * is started: rather than returning from here, the child
	 * resumes in child_return(), set up below.
	 */
#ifdef DIAGNOSTIC
	if (p1 != curproc)
		panic("cpu_fork: curproc");
	if ((up->u_pcb.pcb_hw.apcb_flags & ALPHA_PCB_FLAGS_FEN) != 0)
		printf("DANGER WILL ROBINSON: FEN SET IN cpu_fork!\n");
#endif

	/*
	 * Create the child's kernel stack from scratch.
	 */
	{
		struct trapframe *p2tf;

		/*
		 * Pick a stack pointer, leaving room for a trapframe;
		 * copy the trapframe from the parent so the return to
		 * user mode will be to the right address, with the
		 * correct registers.
		 */
		p2tf = p2->p_md.md_tf = (struct trapframe *)
		    ((char *)p2->p_addr + USPACE - sizeof(struct trapframe));
		bcopy(p1->p_md.md_tf, p2->p_md.md_tf,
		    sizeof(struct trapframe));

		/*
		 * Set up return-value registers as the fork() libc stub expects.
		 */
		p2tf->tf_regs[FRAME_V0] = p1->p_pid;	/* parent's pid */
		p2tf->tf_regs[FRAME_A3] = 0;		/* no error */
		p2tf->tf_regs[FRAME_A4] = 1;		/* is child */

		/*
		 * Arrange for continuation at child_return(), which
		 * will return to exception_return().  Note that the child
		 * process doesn't stay in the kernel for long!
		 *
		 * This is an inlined version of cpu_set_kpc.
		 */
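		/*
		 * pcb_context[] holds the callee-saved registers that
		 * cpu_switch() restores.  switch_trampoline (assembly)
		 * presumably moves s2 into a0 and s1 into ra and then
		 * jumps to the address in s0, so the child wakes up in
		 * child_return(p2) and "returns" to exception_return().
		 */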
		up->u_pcb.pcb_hw.apcb_ksp = (u_int64_t)p2tf;
		up->u_pcb.pcb_context[0] =
		    (u_int64_t)child_return;		/* s0: pc */
		up->u_pcb.pcb_context[1] =
		    (u_int64_t)exception_return;	/* s1: ra */
		up->u_pcb.pcb_context[2] = (u_long) p2;	/* s2: a0 */
		up->u_pcb.pcb_context[7] =
		    (u_int64_t)switch_trampoline;	/* ra: assembly magic */
	}
}

/*
 * Intercept the return address from a freshly forked process that has NOT
 * been scheduled yet.
 *
 * This is needed to make kernel threads stay in kernel mode.
 */
void
cpu_set_fork_handler(p, func, arg)
	struct proc *p;
	void (*func) __P((void *));
	void *arg;
{
	/*
	 * Note that the trap frame follows the args, so the function
	 * is really called like this:  func(arg, frame);
	 */
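	/*
	 * Overwrite the child_return()/child-proc values that cpu_fork()
	 * stashed in pcb_context[0] and pcb_context[2], so the new
	 * process starts life in func(arg) instead of heading straight
	 * back to user mode.
	 */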
	p->p_addr->u_pcb.pcb_context[0] = (u_long) func;
	p->p_addr->u_pcb.pcb_context[2] = (u_long) arg;
}

/*
 * cpu_exit is called as the last action during exit.
 * We block interrupts and switch away for the last time via cpu_switch().
 * The dead process's address space and per-process resources are
 * reclaimed later, in cpu_wait().  We should never be rescheduled,
 * so panic if we ever get back here.
 */
void
cpu_exit(p)
	register struct proc *p;
{
	if (p == fpcurproc)
		fpcurproc = NULL;

	(void) splhigh();
	cnt.v_swtch++;
	cpu_switch(p);
	panic("cpu_exit");
}

void
cpu_wait(p)
	struct proc *p;
{
	/* drop per-process resources */
	pmap_dispose_proc(p);

	/* and clean out the vmspace */
	vmspace_free(p->p_vmspace);
}

/*
 * Dump the machine specific header information at the start of a core dump.
 */
int
cpu_coredump(p, vp, cred)
	struct proc *p;
	struct vnode *vp;
	struct ucred *cred;
{

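	/*
	 * Write the U-area (the struct user, UPAGES pages' worth, holding
	 * the pcb and the kernel stack) to the front of the core file;
	 * userland debuggers presumably dig the register state out of it.
	 */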
	return (vn_rdwr(UIO_WRITE, vp, (caddr_t) p->p_addr, ctob(UPAGES),
	    (off_t)0, UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, cred, (int *)NULL,
	    p));
}

#ifdef notyet
static void
setredzone(pte, vaddr)
	u_short *pte;
	caddr_t vaddr;
{
/* eventually do this by setting up an expand-down stack segment
   for ss0: selector, allowing stack access down to top of u.
   this means though that protection violations need to be handled
   thru a double fault exception that must do an integral task
   switch to a known good context, within which a dump can be
   taken. a sensible scheme might be to save the initial context
   used by sched (that has physical memory mapped 1:1 at bottom)
   and take the dump while still in mapped mode */
}
#endif

/*
 * Map an IO request into kernel virtual address space.
 *
 * All requests are (re)mapped into kernel VA space.
 * Notice that we use b_bufsize for the size of the buffer
 * to be mapped.  b_bcount might be modified by the driver.
 */
void
vmapbuf(bp)
	register struct buf *bp;
{
	register caddr_t addr, v, kva;
	vm_offset_t pa;

	if ((bp->b_flags & B_PHYS) == 0)
		panic("vmapbuf");

	for (v = bp->b_saveaddr, addr = (caddr_t)trunc_page(bp->b_data);
	    addr < bp->b_data + bp->b_bufsize;
	    addr += PAGE_SIZE, v += PAGE_SIZE) {
		/*
		 * Do the vm_fault if needed; do the copy-on-write thing
		 * when reading stuff off device into memory.
		 */
		vm_fault_quick(addr,
			(bp->b_flags&B_READ)?(VM_PROT_READ|VM_PROT_WRITE):VM_PROT_READ);
		pa = trunc_page(pmap_kextract((vm_offset_t) addr));
		if (pa == 0)
			panic("vmapbuf: page not present");
		vm_page_hold(PHYS_TO_VM_PAGE(pa));
		pmap_kenter((vm_offset_t) v, pa);
	}

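	/*
	 * On entry b_saveaddr is assumed to hold the kernel va reserved
	 * by the caller (e.g. physio's pbuf); swap it with the user
	 * address so that b_data points at the kernel alias of the
	 * buffer while the original user address is kept for
	 * vunmapbuf() to restore.
	 */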
	kva = bp->b_saveaddr;
	bp->b_saveaddr = bp->b_data;
	bp->b_data = kva + (((vm_offset_t) bp->b_data) & PAGE_MASK);
}

/*
 * Free the io map PTEs associated with this IO operation.
 * We also invalidate the TLB entries and restore the original b_data.
 */
void
vunmapbuf(bp)
	register struct buf *bp;
{
	register caddr_t addr;
	vm_offset_t pa;

	if ((bp->b_flags & B_PHYS) == 0)
		panic("vunmapbuf");

	for (addr = (caddr_t)trunc_page(bp->b_data);
	    addr < bp->b_data + bp->b_bufsize;
	    addr += PAGE_SIZE) {
		pa = trunc_page(pmap_kextract((vm_offset_t) addr));
		pmap_kremove((vm_offset_t) addr);
		vm_page_unhold(PHYS_TO_VM_PAGE(pa));
	}

	bp->b_data = bp->b_saveaddr;
}

/*
 * Force a reset of the processor by handing control back to the
 * console firmware.
 */
void
cpu_reset()
{
	prom_halt(0);
}

/*
 * Grow the user stack to allow for 'sp'. This version grows the stack in
 *	chunks of SGROWSIZ.
 */
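/*
 * Returns 1 if the access can proceed (the stack already covers 'sp',
 * 'sp' lies outside the stack range, or the stack was grown successfully);
 * returns 0 if growing the stack would exceed the resource limit or the
 * map operation fails.
 */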
int
grow(p, sp)
	struct proc *p;
	size_t sp;
{
	unsigned int nss;
	caddr_t v;
	struct vmspace *vm = p->p_vmspace;

	if ((caddr_t)sp <= vm->vm_maxsaddr || sp >= USRSTACK)
		return (1);

	nss = roundup(USRSTACK - sp, PAGE_SIZE);

	if (nss > p->p_rlimit[RLIMIT_STACK].rlim_cur)
		return (0);

	if (vm->vm_ssize && roundup(vm->vm_ssize << PAGE_SHIFT,
	    SGROWSIZ) < nss) {
		int grow_amount;
		/*
		 * If necessary, grow the VM that the stack occupies
		 * to allow for the rlimit. This allows us to not have
		 * to allocate all of the VM up-front in execve (which
		 * is expensive).
		 * Grow the VM by the amount requested rounded up to
		 * the nearest SGROWSIZ to provide for some hysteresis.
		 */
		grow_amount = roundup((nss - (vm->vm_ssize << PAGE_SHIFT)), SGROWSIZ);
		v = (char *)USRSTACK - roundup(vm->vm_ssize << PAGE_SHIFT,
		    SGROWSIZ) - grow_amount;
		/*
		 * If there isn't enough room to extend by SGROWSIZ, then
		 * just extend to the maximum size
		 */
		if (v < vm->vm_maxsaddr) {
			v = vm->vm_maxsaddr;
			grow_amount = MAXSSIZ - (vm->vm_ssize << PAGE_SHIFT);
		}
		if ((grow_amount == 0) || (vm_map_find(&vm->vm_map, NULL, 0, (vm_offset_t *)&v,
		    grow_amount, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0) != KERN_SUCCESS)) {
			return (0);
		}
		vm->vm_ssize += grow_amount >> PAGE_SHIFT;
	}

	return (1);
}

static int cnt_prezero;

SYSCTL_INT(_machdep, OID_AUTO, cnt_prezero, CTLFLAG_RD, &cnt_prezero, 0, "");

/*
 * Implement the pre-zeroed page mechanism.
 * This routine is called from the idle loop.
 */
int
vm_page_zero_idle()
{
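	/*
	 * free_rover remembers where the previous scan left off and is
	 * advanced by PQ_PRIME3 below, so successive calls presumably
	 * walk the PQ_FREE queues across all page colors instead of
	 * hammering the same cache color every time.
	 */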
	static int free_rover;
	vm_page_t m;
	int s;

	/*
	 * XXX
	 * We stop zeroing pages when there are sufficient prezeroed pages.
	 * This threshold isn't really needed, except we want to
	 * bypass unneeded calls to vm_page_list_find, and the
	 * associated cache flush and latency.  The pre-zero will
	 * still be called when there are significantly more
	 * non-prezeroed pages than zeroed pages.  The threshold
	 * of half the number of reserved pages is arbitrary, but
	 * approximately the right amount.  Eventually, we should
	 * perhaps interrupt the zero operation when a process
	 * is found to be ready to run.
	 */
	if (cnt.v_free_count - vm_page_zero_count <= cnt.v_free_reserved / 2)
		return (0);
#ifdef SMP
	if (try_mplock()) {
#endif
		s = splvm();
		m = vm_page_list_find(PQ_FREE, free_rover);
		if (m != NULL) {
			--(*vm_page_queues[m->queue].lcnt);
			TAILQ_REMOVE(vm_page_queues[m->queue].pl, m, pageq);
			m->queue = PQ_NONE;
			splx(s);
#if 0
			rel_mplock();
#endif
			pmap_zero_page(VM_PAGE_TO_PHYS(m));
#if 0
			get_mplock();
#endif
			(void)splvm();
			m->queue = PQ_ZERO + m->pc;
			++(*vm_page_queues[m->queue].lcnt);
			TAILQ_INSERT_HEAD(vm_page_queues[m->queue].pl, m,
			    pageq);
			free_rover = (free_rover + PQ_PRIME3) & PQ_L2_MASK;
			++vm_page_zero_count;
			++cnt_prezero;
		}
		splx(s);
#ifdef SMP
		rel_mplock();
#endif
		return (1);
#ifdef SMP
	}
#endif
	return (0);
}

/*
 * Software interrupt handler for queued VM system processing.
 */
void
swi_vm()
{
#if 0
	if (busdma_swi_pending != 0)
		busdma_swi();
#endif
}

/*
 * Tell whether this address is in some physical memory region.
 * Currently used by the kernel coredump code in order to avoid
 * dumping the ``ISA memory hole'' which could cause indefinite hangs,
 * or other unpredictable behaviour.
 */

int
is_physical_memory(addr)
	vm_offset_t addr;
{
	/*
	 * stuff other tests for known memory-mapped devices (PCI?)
	 * here
	 */

	return 1;
}