vm_machdep.c revision 39648
/*-
 * Copyright (c) 1982, 1986 The Regents of the University of California.
 * Copyright (c) 1989, 1990 William Jolitz
 * Copyright (c) 1994 John Dyson
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department, and William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)vm_machdep.c	7.3 (Berkeley) 5/13/91
 *	Utah $Hdr: vm_machdep.c 1.16.1.1 89/06/23$
 *	$Id: vm_machdep.c,v 1.109 1998/08/18 07:46:58 msmith Exp $
 */

#include "npx.h"
#include "opt_user_ldt.h"
#include "opt_vm86.h"
#ifdef PC98
#include "opt_pc98.h"
#endif

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/malloc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/vmmeter.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>

#include <machine/clock.h>
#include <machine/cpu.h>
#include <machine/md_var.h>
#ifdef SMP
#include <machine/smp.h>
#endif
#ifdef VM86
#include <machine/pcb_ext.h>
#include <machine/vm86.h>
#endif

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_prot.h>
#include <sys/lock.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_extern.h>

#include <sys/user.h>

#ifdef PC98
#include <pc98/pc98/pc98.h>
#else
#include <i386/isa/isa.h>
#endif

static void	cpu_reset_real __P((void));
#ifdef SMP
static void	cpu_reset_proxy __P((void));
static u_int	cpu_reset_proxyid;
static volatile u_int	cpu_reset_proxy_active;
#endif

/*
 * quick version of vm_fault
 */
void
vm_fault_quick(v, prot)
	caddr_t v;
	int prot;
{
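	/*
	 * fubyte() forces the page to be faulted in for read access;
	 * writing the same byte back with subyte() additionally forces
	 * the write fault (and any copy-on-write) to happen now.
	 */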
	if (prot & VM_PROT_WRITE)
		subyte(v, fubyte(v));
	else
		fubyte(v);
}

/*
 * Finish a fork operation, with process p2 nearly set up.
 * Copy and update the pcb, set up the stack so that the child is
 * ready to run and return to user mode.
 */
void
cpu_fork(p1, p2)
	register struct proc *p1, *p2;
{
	struct pcb *pcb2 = &p2->p_addr->u_pcb;

#if NNPX > 0
	/* Ensure that p1's pcb is up to date. */
	if (npxproc == p1)
		npxsave(&p1->p_addr->u_pcb.pcb_savefpu);
#endif

	/* Copy p1's pcb. */
	p2->p_addr->u_pcb = p1->p_addr->u_pcb;

	/*
	 * Create a fresh stack for the new process.
	 * Copy the trap frame for the return to user mode as if from a
	 * syscall.  This copies the user mode register values.
	 */
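	/*
	 * With VM86, 16 bytes at the top of the kernel stack are left
	 * free for the extra vm86 segment register slots (es, ds, fs, gs)
	 * that the processor pushes on a trap from vm86 mode.
	 */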
	p2->p_md.md_regs = (struct trapframe *)
#ifdef VM86
			   ((int)p2->p_addr + UPAGES * PAGE_SIZE - 16) - 1;
#else
			   ((int)p2->p_addr + UPAGES * PAGE_SIZE) - 1;
#endif /* VM86 */
	*p2->p_md.md_regs = *p1->p_md.md_regs;

	/*
	 * Set registers for trampoline to user mode.  Leave space for the
	 * return address on stack.  These are the kernel mode register values.
	 */
	pcb2->pcb_cr3 = vtophys(p2->p_vmspace->vm_pmap.pm_pdir);
	pcb2->pcb_edi = p2->p_md.md_regs->tf_edi;
	pcb2->pcb_esi = (int)fork_return;
	pcb2->pcb_ebp = p2->p_md.md_regs->tf_ebp;
	pcb2->pcb_esp = (int)p2->p_md.md_regs - sizeof(void *);
	pcb2->pcb_ebx = (int)p2;
	pcb2->pcb_eip = (int)fork_trampoline;
	/*
	 * pcb2->pcb_ldt:	duplicated below, if necessary.
	 * pcb2->pcb_ldt_len:	cloned above.
	 * pcb2->pcb_savefpu:	cloned above.
	 * pcb2->pcb_flags:	cloned above (always 0 here?).
	 * pcb2->pcb_onfault:	cloned above (always NULL here?).
	 */

#ifdef VM86
	/*
	 * XXX don't copy the i/o pages.  this should probably be fixed.
	 */
	pcb2->pcb_ext = 0;
#endif

#ifdef USER_LDT
        /* Copy the LDT, if necessary. */
        if (pcb2->pcb_ldt != 0) {
                union descriptor *new_ldt;
                size_t len = pcb2->pcb_ldt_len * sizeof(union descriptor);

                new_ldt = (union descriptor *)kmem_alloc(kernel_map, len);
                bcopy(pcb2->pcb_ldt, new_ldt, len);
                pcb2->pcb_ldt = (caddr_t)new_ldt;
        }
#endif

	/*
	 * Now, cpu_switch() can schedule the new process.
	 * pcb_esp is loaded pointing to the cpu_switch() stack frame
	 * containing the return address when exiting cpu_switch.
	 * This will normally be to fork_trampoline(), which will have
	 * %ebx loaded with the new proc's pointer.  fork_trampoline()
	 * will set up a stack to call fork_return(p, frame); to complete
	 * the return to user-mode.
	 */
}

/*
 * Intercept the return address from a freshly forked process that has NOT
 * been scheduled yet.
 *
 * This is needed to make kernel threads stay in kernel mode.
 */
void
cpu_set_fork_handler(p, func, arg)
	struct proc *p;
	void (*func) __P((void *));
	void *arg;
{
	/*
	 * Note that the trap frame follows the args, so the function
	 * is really called like this:  func(arg, frame);
	 */
	p->p_addr->u_pcb.pcb_esi = (int) func;	/* function */
	p->p_addr->u_pcb.pcb_ebx = (int) arg;	/* first arg */
}

void
cpu_exit(p)
	register struct proc *p;
{
#if defined(USER_LDT) || defined(VM86)
	struct pcb *pcb = &p->p_addr->u_pcb;
#endif

#if NNPX > 0
	npxexit(p);
#endif	/* NNPX */
#ifdef VM86
	if (pcb->pcb_ext != 0) {
		/*
		 * XXX do we need to move the TSS off the allocated pages
		 * before freeing them?  (not done here)
		 */
		kmem_free(kernel_map, (vm_offset_t)pcb->pcb_ext,
		    ctob(IOPAGES + 1));
		pcb->pcb_ext = 0;
	}
#endif
#ifdef USER_LDT
	if (pcb->pcb_ldt != 0) {
		if (pcb == curpcb) {
			lldt(_default_ldt);
			currentldt = _default_ldt;
		}
		kmem_free(kernel_map, (vm_offset_t)pcb->pcb_ldt,
			pcb->pcb_ldt_len * sizeof(union descriptor));
		pcb->pcb_ldt_len = (int)pcb->pcb_ldt = 0;
	}
#endif
	cnt.v_swtch++;
	cpu_switch(p);
	panic("cpu_exit");
}

void
cpu_wait(p)
	struct proc *p;
{
	/* drop per-process resources */
	pmap_dispose_proc(p);

	/* and clean-out the vmspace */
	vmspace_free(p->p_vmspace);
}

/*
 * Dump the machine specific header information at the start of a core dump.
 */
int
cpu_coredump(p, vp, cred)
	struct proc *p;
	struct vnode *vp;
	struct ucred *cred;
{
	int error;
	caddr_t tempuser;

	tempuser = malloc(ctob(UPAGES), M_TEMP, M_WAITOK);
	if (!tempuser)
		return EINVAL;

	bzero(tempuser, ctob(UPAGES));
	bcopy(p->p_addr, tempuser, sizeof(struct user));
	bcopy(p->p_md.md_regs,
	      tempuser + ((caddr_t) p->p_md.md_regs - (caddr_t) p->p_addr),
	      sizeof(struct trapframe));

	error = vn_rdwr(UIO_WRITE, vp, (caddr_t) tempuser,
			ctob(UPAGES),
			(off_t)0, UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT,
			cred, (int *)NULL, p);

	free(tempuser, M_TEMP);

	return error;
}

#ifdef notyet
static void
setredzone(pte, vaddr)
	u_short *pte;
	caddr_t vaddr;
{
/* eventually do this by setting up an expand-down stack segment
   for ss0: selector, allowing stack access down to top of u.
   this means though that protection violations need to be handled
   thru a double fault exception that must do an integral task
   switch to a known good context, within which a dump can be
   taken. a sensible scheme might be to save the initial context
   used by sched (that has physical memory mapped 1:1 at bottom)
   and take the dump while still in mapped mode */
}
#endif

/*
 * Convert kernel VA to physical address
 */
u_long
kvtop(void *addr)
{
	vm_offset_t va;

	va = pmap_kextract((vm_offset_t)addr);
	if (va == 0)
		panic("kvtop: zero page frame");
	return((int)va);
}

/*
 * Map an IO request into kernel virtual address space.
 *
 * All requests are (re)mapped into kernel VA space.
 * Notice that we use b_bufsize for the size of the buffer
 * to be mapped.  b_bcount might be modified by the driver.
 */
void
vmapbuf(bp)
	register struct buf *bp;
{
	register caddr_t addr, v, kva;
	vm_offset_t pa;

	if ((bp->b_flags & B_PHYS) == 0)
		panic("vmapbuf");

	for (v = bp->b_saveaddr, addr = (caddr_t)trunc_page(bp->b_data);
	    addr < bp->b_data + bp->b_bufsize;
	    addr += PAGE_SIZE, v += PAGE_SIZE) {
		/*
		 * Do the vm_fault if needed; do the copy-on-write thing
		 * when reading stuff off device into memory.
		 */
		vm_fault_quick(addr,
			(bp->b_flags&B_READ)?(VM_PROT_READ|VM_PROT_WRITE):VM_PROT_READ);
		pa = trunc_page(pmap_kextract((vm_offset_t) addr));
		if (pa == 0)
			panic("vmapbuf: page not present");
		vm_page_hold(PHYS_TO_VM_PAGE(pa));
		pmap_kenter((vm_offset_t) v, pa);
	}

	kva = bp->b_saveaddr;
	bp->b_saveaddr = bp->b_data;
	bp->b_data = kva + (((vm_offset_t) bp->b_data) & PAGE_MASK);
}

/*
 * Free the io map PTEs associated with this IO operation.
 * We also invalidate the TLB entries and restore the original b_addr.
 */
void
vunmapbuf(bp)
	register struct buf *bp;
{
	register caddr_t addr;
	vm_offset_t pa;

	if ((bp->b_flags & B_PHYS) == 0)
		panic("vunmapbuf");

	for (addr = (caddr_t)trunc_page(bp->b_data);
	    addr < bp->b_data + bp->b_bufsize;
	    addr += PAGE_SIZE) {
		pa = trunc_page(pmap_kextract((vm_offset_t) addr));
		pmap_kremove((vm_offset_t) addr);
		vm_page_unhold(PHYS_TO_VM_PAGE(pa));
	}

	bp->b_data = bp->b_saveaddr;
}

/*
 * Force reset the processor by invalidating the entire address space!
 */

#ifdef SMP
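/*
 * When cpu_reset() is called on an AP, the work is handed off to the
 * BSP: the AP records its id in cpu_reset_proxyid, restarts the (already
 * stopped) BSP with cpu_reset_proxy() as the restart function, and the
 * two CPUs step through the cpu_reset_proxy_active handshake below
 * (1: proxy running, 2: requester has disabled interrupts, 3: proxy
 * holds the mp lock, 4: requester acknowledged) before the BSP stops
 * the requesting CPU and performs the real reset.
 */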
static void
cpu_reset_proxy()
{
	u_int saved_mp_lock;

	cpu_reset_proxy_active = 1;
	while (cpu_reset_proxy_active == 1)
		;	/* Wait for other cpu to disable interrupts */
	saved_mp_lock = mp_lock;
	mp_lock = 1;
	printf("cpu_reset_proxy: Grabbed mp lock for BSP\n");
	cpu_reset_proxy_active = 3;
	while (cpu_reset_proxy_active == 3)
		;	/* Wait for other cpu to enable interrupts */
	stop_cpus((1<<cpu_reset_proxyid));
	printf("cpu_reset_proxy: Stopped CPU %d\n", cpu_reset_proxyid);
	DELAY(1000000);
	cpu_reset_real();
}
#endif

void
cpu_reset()
{
#ifdef SMP
	if (smp_active == 0) {
		cpu_reset_real();
		/* NOTREACHED */
	} else {

		u_int map;
		int cnt;
		printf("cpu_reset called on cpu#%d\n",cpuid);

		map = other_cpus & ~ stopped_cpus;

		if (map != 0) {
			printf("cpu_reset: Stopping other CPUs\n");
			stop_cpus(map);		/* Stop all other CPUs */
		}

		if (cpuid == 0) {
			DELAY(1000000);
			cpu_reset_real();
			/* NOTREACHED */
		} else {
			/* We are not BSP (CPU #0) */

			cpu_reset_proxyid = cpuid;
			cpustop_restartfunc = cpu_reset_proxy;
			printf("cpu_reset: Restarting BSP\n");
			started_cpus = (1<<0);		/* Restart CPU #0 */

			cnt = 0;
			while (cpu_reset_proxy_active == 0 && cnt < 10000000)
				cnt++;	/* Wait for BSP to announce restart */
			if (cpu_reset_proxy_active == 0)
				printf("cpu_reset: Failed to restart BSP\n");
			__asm __volatile("cli" : : : "memory");
			cpu_reset_proxy_active = 2;
			cnt = 0;
			while (cpu_reset_proxy_active == 2 && cnt < 10000000)
				cnt++;	/* Do nothing */
			if (cpu_reset_proxy_active == 2) {
				printf("cpu_reset: BSP did not grab mp lock\n");
				cpu_reset_real();	/* XXX: Bogus ? */
			}
			cpu_reset_proxy_active = 4;
			__asm __volatile("sti" : : : "memory");
			while (1);
			/* NOTREACHED */
		}
	}
#else
	cpu_reset_real();
#endif
}

static void
cpu_reset_real()
{

#ifdef PC98
	/*
	 * Attempt to do a CPU reset via CPU reset port.
	 */
	disable_intr();
	if ((inb(0x35) & 0xa0) != 0xa0) {
		outb(0x37, 0x0f);		/* SHUT0 = 0. */
		outb(0x37, 0x0b);		/* SHUT1 = 0. */
	}
	outb(0xf0, 0x00);		/* Reset. */
#else
	/*
	 * Attempt to do a CPU reset via the keyboard controller,
	 * do not turn off the GateA20, as any machine that fails
	 * to do the reset here would then end up in no man's land.
	 */

#if !defined(BROKEN_KEYBOARD_RESET)
	outb(IO_KBD + 4, 0xFE);
	DELAY(500000);	/* wait 0.5 sec to see if that did it */
	printf("Keyboard reset did not work, attempting CPU shutdown\n");
	DELAY(1000000);	/* wait 1 sec for printf to complete */
#endif
#endif /* PC98 */
	/* force a shutdown by unmapping entire address space ! */
	bzero((caddr_t) PTD, PAGE_SIZE);

	/* "good night, sweet prince .... <THUNK!>" */
	invltlb();
	/* NOTREACHED */
	while(1);
}

/*
 * Grow the user stack to allow for 'sp'. This version grows the stack in
 *	chunks of SGROWSIZ.
 */
int
grow(p, sp)
	struct proc *p;
	u_int sp;
{
	unsigned int nss;
	caddr_t v;
	struct vmspace *vm = p->p_vmspace;

	if ((caddr_t)sp <= vm->vm_maxsaddr || (unsigned)sp >= (unsigned)USRSTACK)
	    return (1);

	nss = roundup(USRSTACK - (unsigned)sp, PAGE_SIZE);

	if (nss > p->p_rlimit[RLIMIT_STACK].rlim_cur)
		return (0);

	if (vm->vm_ssize && roundup(vm->vm_ssize << PAGE_SHIFT,
	    SGROWSIZ) < nss) {
		int grow_amount;
		/*
		 * If necessary, grow the VM that the stack occupies
		 * to allow for the rlimit. This allows us to not have
		 * to allocate all of the VM up-front in execve (which
		 * is expensive).
		 * Grow the VM by the amount requested rounded up to
		 * the nearest SGROWSIZ to provide for some hysteresis.
		 */
		grow_amount = roundup((nss - (vm->vm_ssize << PAGE_SHIFT)), SGROWSIZ);
		v = (char *)USRSTACK - roundup(vm->vm_ssize << PAGE_SHIFT,
		    SGROWSIZ) - grow_amount;
		/*
		 * If there isn't enough room to extend by SGROWSIZ, then
		 * just extend to the maximum size
		 */
		if (v < vm->vm_maxsaddr) {
			v = vm->vm_maxsaddr;
			grow_amount = MAXSSIZ - (vm->vm_ssize << PAGE_SHIFT);
		}
		if ((grow_amount == 0) || (vm_map_find(&vm->vm_map, NULL, 0, (vm_offset_t *)&v,
		    grow_amount, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0) != KERN_SUCCESS)) {
			return (0);
		}
		vm->vm_ssize += grow_amount >> PAGE_SHIFT;
	}

	return (1);
}

static int cnt_prezero;

SYSCTL_INT(_machdep, OID_AUTO, cnt_prezero, CTLFLAG_RD, &cnt_prezero, 0, "");

/*
 * Implement the pre-zeroed page mechanism.
 * This routine is called from the idle loop.
 */
int
vm_page_zero_idle()
{
	static int free_rover;
	vm_page_t m;
	int s;

	/*
	 * XXX
	 * We stop zeroing pages when there are sufficient prezeroed pages.
	 * This threshold isn't really needed, except we want to
	 * bypass unneeded calls to vm_page_list_find, and the
	 * associated cache flush and latency.  The pre-zero will
	 * still be called when there are significantly more
	 * non-prezeroed pages than zeroed pages.  The threshold
	 * of half the number of reserved pages is arbitrary, but
	 * approximately the right amount.  Eventually, we should
	 * perhaps interrupt the zero operation when a process
	 * is found to be ready to run.
	 */
	if (cnt.v_free_count - vm_page_zero_count <= cnt.v_free_reserved / 2)
		return (0);
#ifdef SMP
	if (try_mplock()) {
#endif
		s = splvm();
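		/*
		 * The idle loop calls this with interrupts disabled, so
		 * enable them while the page is found and zeroed, and
		 * disable them again before returning.
		 */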
		__asm __volatile("sti" : : : "memory");
		m = vm_page_list_find(PQ_FREE, free_rover);
		if (m != NULL) {
			--(*vm_page_queues[m->queue].lcnt);
			TAILQ_REMOVE(vm_page_queues[m->queue].pl, m, pageq);
			m->queue = PQ_NONE;
			splx(s);
#if 0
			rel_mplock();
#endif
			pmap_zero_page(VM_PAGE_TO_PHYS(m));
#if 0
			get_mplock();
#endif
			(void)splvm();
			m->queue = PQ_ZERO + m->pc;
			++(*vm_page_queues[m->queue].lcnt);
			TAILQ_INSERT_HEAD(vm_page_queues[m->queue].pl, m,
			    pageq);
			free_rover = (free_rover + PQ_PRIME3) & PQ_L2_MASK;
			++vm_page_zero_count;
			++cnt_prezero;
		}
		splx(s);
		__asm __volatile("cli" : : : "memory");
#ifdef SMP
		rel_mplock();
#endif
		return (1);
#ifdef SMP
	}
#endif
	return (0);
}

/*
 * Software interrupt handler for queued VM system processing.
 */
void
swi_vm()
{
	if (busdma_swi_pending != 0)
		busdma_swi();
}

/*
 * Tell whether this address is in some physical memory region.
 * Currently used by the kernel coredump code in order to avoid
 * dumping the ``ISA memory hole'' which could cause indefinite hangs,
 * or other unpredictable behaviour.
 */

#include "isa.h"

int
is_physical_memory(addr)
	vm_offset_t addr;
{

#if NISA > 0
	/* The ISA ``memory hole''. */
	if (addr >= 0xa0000 && addr < 0x100000)
		return 0;
#endif

	/*
	 * stuff other tests for known memory-mapped devices (PCI?)
	 * here
	 */

	return 1;
}
