/*-
 * Copyright (C) 1994, David Greenman
 * Copyright (c) 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the University of Utah, and William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)trap.c	7.4 (Berkeley) 5/13/91
 * $FreeBSD: head/sys/kern/subr_trap.c 77097 2001-05-23 22:58:09Z jhb $
 */

/*
 * 386 Trap and System call handling
 */

#include "opt_clock.h"
#include "opt_cpu.h"
#include "opt_ddb.h"
#include "opt_isa.h"
#include "opt_ktrace.h"
#include "opt_npx.h"
#include "opt_trap.h"

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/pioctl.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/mutex.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/syscall.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/uio.h>
#include <sys/vmmeter.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <sys/lock.h>
#include <vm/pmap.h>
#include <vm/vm_kern.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_extern.h>

#include <machine/cpu.h>
#include <machine/md_var.h>
#include <machine/pcb.h>
#ifdef SMP
#include <machine/smp.h>
#endif
#include <machine/tss.h>

#include <i386/isa/icu.h>
#include <i386/isa/intr_machdep.h>

#ifdef POWERFAIL_NMI
#include <sys/syslog.h>
#include <machine/clock.h>
#endif

#include <machine/vm86.h>

#include <ddb/ddb.h>

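/*
 * Optional hook for an external floating point emulator: when a T_DNA
 * trap from user mode is not handled by the hardware FPU, trap() calls
 * through this pointer if it has been set, and sends SIGFPE otherwise.
 */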
int (*pmath_emulate) __P((struct trapframe *));

extern void trap __P((struct trapframe frame));
extern int trapwrite __P((unsigned addr));
extern void syscall __P((struct trapframe frame));
extern void ast __P((struct trapframe *framep));

static int trap_pfault __P((struct trapframe *, int, vm_offset_t));
static void trap_fatal __P((struct trapframe *, vm_offset_t));
void dblfault_handler __P((void));

extern inthand_t IDTVEC(lcall_syscall);

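/*
 * Printable names for the T_* trap types defined in <machine/trap.h>,
 * indexed by trap number.
 */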
#define MAX_TRAP_MSG		28
static char *trap_msg[] = {
	"",					/*  0 unused */
	"privileged instruction fault",		/*  1 T_PRIVINFLT */
	"",					/*  2 unused */
	"breakpoint instruction fault",		/*  3 T_BPTFLT */
	"",					/*  4 unused */
	"",					/*  5 unused */
	"arithmetic trap",			/*  6 T_ARITHTRAP */
	"",					/*  7 unused */
	"",					/*  8 unused */
	"general protection fault",		/*  9 T_PROTFLT */
	"trace trap",				/* 10 T_TRCTRAP */
	"",					/* 11 unused */
	"page fault",				/* 12 T_PAGEFLT */
	"",					/* 13 unused */
	"alignment fault",			/* 14 T_ALIGNFLT */
	"",					/* 15 unused */
	"",					/* 16 unused */
	"",					/* 17 unused */
	"integer divide fault",			/* 18 T_DIVIDE */
	"non-maskable interrupt trap",		/* 19 T_NMI */
	"overflow trap",			/* 20 T_OFLOW */
	"FPU bounds check fault",		/* 21 T_BOUND */
	"FPU device not available",		/* 22 T_DNA */
	"double fault",				/* 23 T_DOUBLEFLT */
	"FPU operand fetch fault",		/* 24 T_FPOPFLT */
	"invalid TSS fault",			/* 25 T_TSSFLT */
	"segment not present fault",		/* 26 T_SEGNPFLT */
	"stack fault",				/* 27 T_STKFLT */
	"machine check trap",			/* 28 T_MCHK */
};

#if defined(I586_CPU) && !defined(NO_F00F_HACK)
extern int has_f00f_bug;
#endif

#ifdef DDB
static int ddb_on_nmi = 1;
SYSCTL_INT(_machdep, OID_AUTO, ddb_on_nmi, CTLFLAG_RW,
	&ddb_on_nmi, 0, "Go to DDB on NMI");
#endif
static int panic_on_nmi = 1;
SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RW,
	&panic_on_nmi, 0, "Panic on NMI");

#ifdef WITNESS
extern char *syscallnames[];
#endif

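/*
 * Common code for returning to user mode: deliver any pending signals,
 * drop the priority back to the user priority, switch away if a
 * reschedule was requested, and charge system time to the profile if
 * the process is being profiled.
 */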
void
userret(p, frame, oticks)
	struct proc *p;
	struct trapframe *frame;
	u_quad_t oticks;
{
	int sig;

	while ((sig = CURSIG(p)) != 0)
		postsig(sig);

	mtx_lock_spin(&sched_lock);
	p->p_pri.pri_level = p->p_pri.pri_user;
	if (resched_wanted(p)) {
		/*
		 * Since we are curproc, clock will normally just change
		 * our priority without moving us from one queue to another
		 * (since the running process is not on a queue.)
		 * If that happened after we setrunqueue ourselves but before we
		 * mi_switch()'ed, we might not be on the queue indicated by
		 * our priority.
		 */
		DROP_GIANT_NOSWITCH();
		setrunqueue(p);
		p->p_stats->p_ru.ru_nivcsw++;
		mi_switch();
		mtx_unlock_spin(&sched_lock);
		PICKUP_GIANT();
		while ((sig = CURSIG(p)) != 0)
			postsig(sig);
		mtx_lock_spin(&sched_lock);
	}

	/*
	 * Charge system time if profiling.
	 */
	if (p->p_sflag & PS_PROFIL) {
		mtx_unlock_spin(&sched_lock);
		/* XXX - do we need Giant? */
		if (!mtx_owned(&Giant))
			mtx_lock(&Giant);
		addupc_task(p, TRAPF_PC(frame),
			    (u_int)(p->p_sticks - oticks) * psratio);
	} else
		mtx_unlock_spin(&sched_lock);
}

/*
 * Exception, fault, and trap interface to the FreeBSD kernel.
 * This common code is called from assembly language IDT gate entry
 * routines that prepare a suitable stack frame, and restore this
 * frame after the exception has been processed.
 */

void
trap(frame)
	struct trapframe frame;
{
	struct proc *p = curproc;
	u_quad_t sticks = 0;
	int i = 0, ucode = 0, type, code;
	vm_offset_t eva;
#ifdef POWERFAIL_NMI
	static int lastalert = 0;
#endif

	atomic_add_int(&cnt.v_trap, 1);

	if ((frame.tf_eflags & PSL_I) == 0) {
		/*
		 * Buggy application or kernel code has disabled
		 * interrupts and then trapped.  Enabling interrupts
		 * now is wrong, but it is better than running with
		 * interrupts disabled until they are accidentally
		 * enabled later.  XXX This is really bad if we trap
		 * while holding a spin lock.
		 */
		type = frame.tf_trapno;
		if (ISPL(frame.tf_cs) == SEL_UPL || (frame.tf_eflags & PSL_VM))
			printf(
			    "pid %ld (%s): trap %d with interrupts disabled\n",
			    (long)curproc->p_pid, curproc->p_comm, type);
		else if (type != T_BPTFLT && type != T_TRCTRAP) {
			/*
			 * XXX not quite right, since this may be for a
			 * multiple fault in user mode.
			 */
			printf("kernel trap %d with interrupts disabled\n",
			    type);
			/*
			 * We should walk p_heldmtx here and see if any are
			 * spin mutexes, and not do this if so.
			 */
			enable_intr();
		}
	}

	eva = 0;

#if defined(I586_CPU) && !defined(NO_F00F_HACK)
restart:
#endif

	type = frame.tf_trapno;
	code = frame.tf_err;

	if ((ISPL(frame.tf_cs) == SEL_UPL) ||
	    ((frame.tf_eflags & PSL_VM) && !in_vm86call)) {
		/* user trap */

		mtx_lock_spin(&sched_lock);
		sticks = p->p_sticks;
		mtx_unlock_spin(&sched_lock);
		p->p_md.md_regs = &frame;

		switch (type) {
		case T_PRIVINFLT:	/* privileged instruction fault */
			ucode = type;
			i = SIGILL;
			break;

		case T_BPTFLT:		/* bpt instruction fault */
		case T_TRCTRAP:		/* trace trap */
			frame.tf_eflags &= ~PSL_T;
			i = SIGTRAP;
			break;

		case T_ARITHTRAP:	/* arithmetic trap */
#ifdef DEV_NPX
			ucode = npxtrap();
			if (ucode == -1)
				return;
#else
			ucode = code;
#endif
			i = SIGFPE;
			break;

			/*
			 * The following two traps can happen in
			 * vm86 mode, and, if so, we want to handle
			 * them specially.
			 */
		case T_PROTFLT:		/* general protection fault */
		case T_STKFLT:		/* stack fault */
			if (frame.tf_eflags & PSL_VM) {
				mtx_lock(&Giant);
				i = vm86_emulate((struct vm86frame *)&frame);
				mtx_unlock(&Giant);
				if (i == 0)
					goto user;
				break;
			}
			/* FALL THROUGH */

		case T_SEGNPFLT:	/* segment not present fault */
		case T_TSSFLT:		/* invalid TSS fault */
		case T_DOUBLEFLT:	/* double fault */
		default:
			ucode = code + BUS_SEGM_FAULT;
			i = SIGBUS;
			break;

		case T_PAGEFLT:		/* page fault */
			/*
			 * For some Cyrix CPUs, %cr2 is clobbered by
			 * interrupts.  This problem is worked around by using
			 * an interrupt gate for the pagefault handler.  We
			 * are finally ready to read %cr2 and then must
			 * reenable interrupts.
			 */
			eva = rcr2();
			enable_intr();
			i = trap_pfault(&frame, TRUE, eva);
#if defined(I586_CPU) && !defined(NO_F00F_HACK)
			if (i == -2) {
				/*
				 * f00f hack workaround has triggered, treat
				 * as illegal instruction not page fault.
				 */
				frame.tf_trapno = T_PRIVINFLT;
				goto restart;
			}
#endif
			if (i == -1)
				goto out;
			if (i == 0)
				goto user;

			ucode = T_PAGEFLT;
			break;

		case T_DIVIDE:		/* integer divide fault */
			ucode = FPE_INTDIV;
			i = SIGFPE;
			break;

#ifdef DEV_ISA
		case T_NMI:
#ifdef POWERFAIL_NMI
#ifndef TIMER_FREQ
#  define TIMER_FREQ 1193182
#endif
			mtx_lock(&Giant);
			if (time_second - lastalert > 10) {
				log(LOG_WARNING, "NMI: power fail\n");
				sysbeep(TIMER_FREQ/880, hz);
				lastalert = time_second;
			}
			mtx_unlock(&Giant);
			goto out;
#else /* !POWERFAIL_NMI */
			/* machine/parity/power fail/"kitchen sink" faults */
			/* XXX Giant */
			if (isa_nmi(code) == 0) {
#ifdef DDB
				/*
				 * NMI can be hooked up to a pushbutton
				 * for debugging.
				 */
				if (ddb_on_nmi) {
					printf ("NMI ... going to debugger\n");
					kdb_trap (type, 0, &frame);
				}
#endif /* DDB */
				goto out;
			} else if (panic_on_nmi)
				panic("NMI indicates hardware failure");
			break;
#endif /* POWERFAIL_NMI */
#endif /* DEV_ISA */

		case T_OFLOW:		/* integer overflow fault */
			ucode = FPE_INTOVF;
			i = SIGFPE;
			break;

		case T_BOUND:		/* bounds check fault */
			ucode = FPE_FLTSUB;
			i = SIGFPE;
			break;

		case T_DNA:
#ifdef DEV_NPX
			/* transparent fault (due to context switch "late") */
			if (npxdna())
				goto out;
#endif
			if (!pmath_emulate) {
				i = SIGFPE;
				ucode = FPE_FPU_NP_TRAP;
				break;
			}
			mtx_lock(&Giant);
			i = (*pmath_emulate)(&frame);
			mtx_unlock(&Giant);
			if (i == 0) {
				if (!(frame.tf_eflags & PSL_T))
					goto out;
				frame.tf_eflags &= ~PSL_T;
				i = SIGTRAP;
			}
			/* else ucode = emulator_only_knows() XXX */
			break;

		case T_FPOPFLT:		/* FPU operand fetch fault */
			ucode = T_FPOPFLT;
			i = SIGILL;
			break;
		}
	} else {
		/* kernel trap */

		switch (type) {
		case T_PAGEFLT:			/* page fault */
			/*
			 * For some Cyrix CPUs, %cr2 is clobbered by
			 * interrupts.  This problem is worked around by using
			 * an interrupt gate for the pagefault handler.  We
			 * are finally ready to read %cr2 and then must
			 * reenable interrupts.
			 */
			eva = rcr2();
			enable_intr();
			(void) trap_pfault(&frame, FALSE, eva);
			goto out;

		case T_DNA:
#ifdef DEV_NPX
			/*
			 * The kernel is apparently using npx for copying.
			 * XXX this should be fatal unless the kernel has
			 * registered such use.
			 */
			if (npxdna())
				goto out;
#endif
			break;

			/*
			 * The following two traps can happen in
			 * vm86 mode, and, if so, we want to handle
			 * them specially.
			 */
		case T_PROTFLT:		/* general protection fault */
		case T_STKFLT:		/* stack fault */
			if (frame.tf_eflags & PSL_VM) {
				mtx_lock(&Giant);
				i = vm86_emulate((struct vm86frame *)&frame);
				mtx_unlock(&Giant);
				if (i != 0)
					/*
					 * returns to original process
					 */
					vm86_trap((struct vm86frame *)&frame);
				goto out;
			}
			if (type == T_STKFLT)
				break;

			/* FALL THROUGH */

		case T_SEGNPFLT:	/* segment not present fault */
			if (in_vm86call)
				break;

			if (p->p_intr_nesting_level != 0)
				break;

			/*
			 * Invalid %fs's and %gs's can be created using
			 * procfs or PT_SETREGS or by invalidating the
			 * underlying LDT entry.  This causes a fault
			 * in kernel mode when the kernel attempts to
			 * switch contexts.  Lose the bad context
			 * (XXX) so that we can continue, and generate
			 * a signal.
			 */
			if (frame.tf_eip == (int)cpu_switch_load_gs) {
				PCPU_GET(curpcb)->pcb_gs = 0;
				PROC_LOCK(p);
				psignal(p, SIGBUS);
				PROC_UNLOCK(p);
				goto out;
			}

			/*
			 * Invalid segment selectors and out of bounds
			 * %eip's and %esp's can be set up in user mode.
			 * This causes a fault in kernel mode when the
			 * kernel tries to return to user mode.  We want
			 * to get this fault so that we can fix the
			 * problem here and not have to check all the
			 * selectors and pointers when the user changes
			 * them.
			 */
			if (frame.tf_eip == (int)doreti_iret) {
				frame.tf_eip = (int)doreti_iret_fault;
				goto out;
			}
			if (frame.tf_eip == (int)doreti_popl_ds) {
				frame.tf_eip = (int)doreti_popl_ds_fault;
				goto out;
			}
			if (frame.tf_eip == (int)doreti_popl_es) {
				frame.tf_eip = (int)doreti_popl_es_fault;
				goto out;
			}
			if (frame.tf_eip == (int)doreti_popl_fs) {
				frame.tf_eip = (int)doreti_popl_fs_fault;
				goto out;
			}
			if (PCPU_GET(curpcb) != NULL &&
			    PCPU_GET(curpcb)->pcb_onfault != NULL) {
				frame.tf_eip =
				    (int)PCPU_GET(curpcb)->pcb_onfault;
				goto out;
			}
			break;

		case T_TSSFLT:
			/*
			 * PSL_NT can be set in user mode and isn't cleared
			 * automatically when the kernel is entered.  This
			 * causes a TSS fault when the kernel attempts to
			 * `iret' because the TSS link is uninitialized.  We
			 * want to get this fault so that we can fix the
			 * problem here and not every time the kernel is
			 * entered.
			 */
			if (frame.tf_eflags & PSL_NT) {
				frame.tf_eflags &= ~PSL_NT;
				goto out;
			}
			break;

		case T_TRCTRAP:	 /* trace trap */
			if (frame.tf_eip == (int)IDTVEC(lcall_syscall)) {
				/*
				 * We've just entered system mode via the
				 * syscall lcall.  Continue single stepping
				 * silently until the syscall handler has
				 * saved the flags.
				 */
				goto out;
			}
			if (frame.tf_eip == (int)IDTVEC(lcall_syscall) + 1) {
				/*
				 * The syscall handler has now saved the
				 * flags.  Stop single stepping it.
				 */
				frame.tf_eflags &= ~PSL_T;
				goto out;
			}
			/*
			 * Ignore debug register trace traps due to
			 * accesses in the user's address space, which
			 * can happen under several conditions such as
			 * if a user sets a watchpoint on a buffer and
			 * then passes that buffer to a system call.
			 * We still want to get TRCTRAPS for addresses
			 * in kernel space because that is useful when
			 * debugging the kernel.
			 */
			/* XXX Giant */
			if (user_dbreg_trap() && !in_vm86call) {
				/*
				 * Reset the breakpoint status bits in %dr6;
				 * the processor doesn't clear them for us.
				 */
				load_dr6(rdr6() & 0xfffffff0);
				goto out;
			}
			/*
			 * Fall through (TRCTRAP kernel mode, kernel address)
			 */
		case T_BPTFLT:
			/*
			 * If DDB is enabled, let it handle the debugger trap.
			 * Otherwise, debugger traps "can't happen".
			 */
#ifdef DDB
			/* XXX Giant */
			if (kdb_trap (type, 0, &frame))
				goto out;
#endif
			break;

#ifdef DEV_ISA
		case T_NMI:
#ifdef POWERFAIL_NMI
			mtx_lock(&Giant);
			if (time_second - lastalert > 10) {
				log(LOG_WARNING, "NMI: power fail\n");
				sysbeep(TIMER_FREQ/880, hz);
				lastalert = time_second;
			}
			mtx_unlock(&Giant);
			goto out;
#else /* !POWERFAIL_NMI */
			/* XXX Giant */
			/* machine/parity/power fail/"kitchen sink" faults */
			if (isa_nmi(code) == 0) {
#ifdef DDB
				/*
				 * NMI can be hooked up to a pushbutton
				 * for debugging.
				 */
				if (ddb_on_nmi) {
					printf ("NMI ... going to debugger\n");
					kdb_trap (type, 0, &frame);
				}
#endif /* DDB */
				goto out;
			} else if (panic_on_nmi == 0)
				goto out;
			/* FALL THROUGH */
#endif /* POWERFAIL_NMI */
#endif /* DEV_ISA */
		}

		trap_fatal(&frame, eva);
		goto out;
	}

	mtx_lock(&Giant);
	/* Translate fault for emulators (e.g. Linux) */
	if (*p->p_sysent->sv_transtrap)
		i = (*p->p_sysent->sv_transtrap)(i, type);

	trapsignal(p, i, ucode);

#ifdef DEBUG
	if (type <= MAX_TRAP_MSG) {
		uprintf("fatal process exception: %s",
			trap_msg[type]);
		if ((type == T_PAGEFLT) || (type == T_PROTFLT))
			uprintf(", fault VA = 0x%lx", (u_long)eva);
		uprintf("\n");
	}
#endif
	mtx_unlock(&Giant);

user:
	userret(p, &frame, sticks);
	if (mtx_owned(&Giant))
		mtx_unlock(&Giant);
out:
	return;
}

#ifdef notyet
/*
 * This version doesn't allow a page fault to user space while
 * in the kernel. The rest of the kernel needs to be made "safe"
 * before this can be used. I think the only things remaining
 * to be made safe are the iBCS2 code and the process tracing/
 * debugging code.
 */
static int
trap_pfault(frame, usermode, eva)
	struct trapframe *frame;
	int usermode;
	vm_offset_t eva;
{
	vm_offset_t va;
	struct vmspace *vm = NULL;
	vm_map_t map = 0;
	int rv = 0;
	vm_prot_t ftype;
	struct proc *p = curproc;

	if (frame->tf_err & PGEX_W)
		ftype = VM_PROT_WRITE;
	else
		ftype = VM_PROT_READ;

	va = trunc_page(eva);
	if (va < VM_MIN_KERNEL_ADDRESS) {
		vm_offset_t v;
		vm_page_t mpte;

		if (p == NULL ||
		    (!usermode && va < VM_MAXUSER_ADDRESS &&
		     (p->p_intr_nesting_level != 0 ||
		      PCPU_GET(curpcb) == NULL ||
		      PCPU_GET(curpcb)->pcb_onfault == NULL))) {
			trap_fatal(frame, eva);
			return (-1);
		}

		/*
		 * This is a fault on non-kernel virtual memory.
		 * vm is initialized above to NULL. If curproc is NULL
		 * or curproc->p_vmspace is NULL the fault is fatal.
		 */
		vm = p->p_vmspace;
		if (vm == NULL)
			goto nogo;

		map = &vm->vm_map;

		/*
		 * Keep swapout from messing with us during this
		 *	critical time.
		 */
		PROC_LOCK(p);
		++p->p_lock;
		PROC_UNLOCK(p);

		/*
		 * Grow the stack if necessary
		 */
		/* grow_stack returns false only if va falls into
		 * a growable stack region and the stack growth
		 * fails.  It returns true if va was not within
		 * a growable stack region, or if the stack
		 * growth succeeded.
		 */
		if (!grow_stack (p, va))
			rv = KERN_FAILURE;
		else
			/* Fault in the user page: */
			rv = vm_fault(map, va, ftype,
			      (ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY
						      : VM_FAULT_NORMAL);

		PROC_LOCK(p);
		--p->p_lock;
		PROC_UNLOCK(p);
	} else {
		/*
		 * Don't allow user-mode faults in kernel address space.
		 */
		if (usermode)
			goto nogo;

		/*
		 * Since we know that kernel virtual addresses always
		 * have pte pages mapped, we just have to fault the page.
		 */
		rv = vm_fault(kernel_map, va, ftype, VM_FAULT_NORMAL);
	}

	if (rv == KERN_SUCCESS)
		return (0);
nogo:
	if (!usermode) {
		if (p->p_intr_nesting_level == 0 &&
		    PCPU_GET(curpcb) != NULL &&
		    PCPU_GET(curpcb)->pcb_onfault != NULL) {
			frame->tf_eip = (int)PCPU_GET(curpcb)->pcb_onfault;
			return (0);
		}
		trap_fatal(frame, eva);
		return (-1);
	}

	/* kludge to pass faulting virtual address to sendsig */
	frame->tf_err = eva;

	return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV);
}
#endif

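/*
 * Handle a page fault by calling vm_fault() on the kernel map or on the
 * faulting process's map, growing the user stack first when the address
 * lies in a growable stack region.  Returns 0 on success, -1 after a
 * fatal fault (trap_fatal() has been called), -2 when the Pentium F00F
 * workaround fires (the caller retries the trap as T_PRIVINFLT), or the
 * signal (SIGBUS/SIGSEGV) to deliver for a failed user-mode fault.
 */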
int
trap_pfault(frame, usermode, eva)
	struct trapframe *frame;
	int usermode;
	vm_offset_t eva;
{
	vm_offset_t va;
	struct vmspace *vm = NULL;
	vm_map_t map = 0;
	int rv = 0;
	vm_prot_t ftype;
	struct proc *p = curproc;

	va = trunc_page(eva);
	if (va >= KERNBASE) {
		/*
		 * Don't allow user-mode faults in kernel address space.
		 * An exception:  if the faulting address is the invalid
		 * instruction entry in the IDT, then the Intel Pentium
		 * F00F bug workaround was triggered, and we need to
		 * treat it as an illegal instruction, and not a page
		 * fault.
		 */
#if defined(I586_CPU) && !defined(NO_F00F_HACK)
		if ((eva == (unsigned int)&idt[6]) && has_f00f_bug)
			return -2;
#endif
		if (usermode)
			goto nogo;

		map = kernel_map;
	} else {
		/*
		 * This is a fault on non-kernel virtual memory.
		 * vm is initialized above to NULL. If curproc is NULL
		 * or curproc->p_vmspace is NULL the fault is fatal.
		 */
		if (p != NULL)
			vm = p->p_vmspace;

		if (vm == NULL)
			goto nogo;

		map = &vm->vm_map;
	}

	if (frame->tf_err & PGEX_W)
		ftype = VM_PROT_WRITE;
	else
		ftype = VM_PROT_READ;

	if (map != kernel_map) {
		/*
		 * Keep swapout from messing with us during this
		 *	critical time.
		 */
		PROC_LOCK(p);
		++p->p_lock;
		PROC_UNLOCK(p);

		/*
		 * Grow the stack if necessary
		 */
		/* grow_stack returns false only if va falls into
		 * a growable stack region and the stack growth
		 * fails.  It returns true if va was not within
		 * a growable stack region, or if the stack
		 * growth succeeded.
		 */
		if (!grow_stack (p, va))
			rv = KERN_FAILURE;
		else
			/* Fault in the user page: */
			rv = vm_fault(map, va, ftype,
			      (ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY
						      : VM_FAULT_NORMAL);

		PROC_LOCK(p);
		--p->p_lock;
		PROC_UNLOCK(p);
	} else {
		/*
		 * Don't have to worry about process locking or stacks in the
		 * kernel.
		 */
		rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL);
	}

	if (rv == KERN_SUCCESS)
		return (0);
nogo:
	if (!usermode) {
		if (p->p_intr_nesting_level == 0 &&
		    PCPU_GET(curpcb) != NULL &&
		    PCPU_GET(curpcb)->pcb_onfault != NULL) {
			frame->tf_eip = (int)PCPU_GET(curpcb)->pcb_onfault;
			return (0);
		}
		trap_fatal(frame, eva);
		return (-1);
	}

	/* kludge to pass faulting virtual address to sendsig */
	frame->tf_err = eva;

	return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV);
}

static void
trap_fatal(frame, eva)
	struct trapframe *frame;
	vm_offset_t eva;
{
	int code, type, ss, esp;
	struct soft_segment_descriptor softseg;

	code = frame->tf_err;
	type = frame->tf_trapno;
	sdtossd(&gdt[IDXSEL(frame->tf_cs & 0xffff)].sd, &softseg);

	if (type <= MAX_TRAP_MSG)
		printf("\n\nFatal trap %d: %s while in %s mode\n",
			type, trap_msg[type],
			frame->tf_eflags & PSL_VM ? "vm86" :
			ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel");
#ifdef SMP
	/* two separate prints in case of a trap on an unmapped page */
	printf("cpuid = %d; ", PCPU_GET(cpuid));
	printf("lapic.id = %08x\n", lapic.id);
#endif
	if (type == T_PAGEFLT) {
		printf("fault virtual address	= 0x%x\n", eva);
		printf("fault code		= %s %s, %s\n",
			code & PGEX_U ? "user" : "supervisor",
			code & PGEX_W ? "write" : "read",
			code & PGEX_P ? "protection violation" : "page not present");
	}
	printf("instruction pointer	= 0x%x:0x%x\n",
	       frame->tf_cs & 0xffff, frame->tf_eip);
	if ((ISPL(frame->tf_cs) == SEL_UPL) || (frame->tf_eflags & PSL_VM)) {
		ss = frame->tf_ss & 0xffff;
		esp = frame->tf_esp;
	} else {
		ss = GSEL(GDATA_SEL, SEL_KPL);
		esp = (int)&frame->tf_esp;
	}
	printf("stack pointer	        = 0x%x:0x%x\n", ss, esp);
	printf("frame pointer	        = 0x%x:0x%x\n", ss, frame->tf_ebp);
	printf("code segment		= base 0x%x, limit 0x%x, type 0x%x\n",
	       softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type);
	printf("			= DPL %d, pres %d, def32 %d, gran %d\n",
	       softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_def32,
	       softseg.ssd_gran);
	printf("processor eflags	= ");
	if (frame->tf_eflags & PSL_T)
		printf("trace trap, ");
	if (frame->tf_eflags & PSL_I)
		printf("interrupt enabled, ");
	if (frame->tf_eflags & PSL_NT)
		printf("nested task, ");
	if (frame->tf_eflags & PSL_RF)
		printf("resume, ");
	if (frame->tf_eflags & PSL_VM)
		printf("vm86, ");
	printf("IOPL = %d\n", (frame->tf_eflags & PSL_IOPL) >> 12);
	printf("current process		= ");
	if (curproc) {
		printf("%lu (%s)\n",
		    (u_long)curproc->p_pid, curproc->p_comm ?
		    curproc->p_comm : "");
	} else {
		printf("Idle\n");
	}

#ifdef KDB
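	/*
	 * XXX dead code: `psl' is not declared in this function, so this
	 * block cannot compile if KDB is ever defined.
	 */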
	if (kdb_trap(&psl))
		return;
#endif
#ifdef DDB
	if ((debugger_on_panic || db_active) && kdb_trap(type, 0, frame))
		return;
#endif
	printf("trap number		= %d\n", type);
	if (type <= MAX_TRAP_MSG)
		panic(trap_msg[type]);
	else
		panic("unknown/reserved trap");
}

/*
 * Double fault handler. Called when a fault occurs while writing
 * a frame for a trap/exception onto the stack. This usually occurs
 * when the stack overflows (such is the case with infinite recursion,
 * for example).
 *
 * XXX Note that the current PTD gets replaced by IdlePTD when the
 * task switch occurs. This means that the stack that was active at
 * the time of the double fault is not available at <kstack> unless
 * the machine was idle when the double fault occurred. The downside
 * of this is that "trace <ebp>" in ddb won't work.
 */
void
dblfault_handler()
{
	printf("\nFatal double fault:\n");
	printf("eip = 0x%x\n", PCPU_GET(common_tss.tss_eip));
	printf("esp = 0x%x\n", PCPU_GET(common_tss.tss_esp));
	printf("ebp = 0x%x\n", PCPU_GET(common_tss.tss_ebp));
#ifdef SMP
	/* two separate prints in case of a trap on an unmapped page */
	printf("cpuid = %d; ", PCPU_GET(cpuid));
	printf("lapic.id = %08x\n", lapic.id);
#endif
	panic("double fault");
}

/*
 * Compensate for 386 brain damage (missing URKR).
 * This is a little simpler than the pagefault handler in trap()
 * because the page tables have already been faulted in and high
 * addresses are thrown out early for other reasons.
 */
int trapwrite(addr)
	unsigned addr;
{
	struct proc *p;
	vm_offset_t va;
	struct vmspace *vm;
	int rv;

	va = trunc_page((vm_offset_t)addr);
	/*
	 * XXX - MAX is END.  Changed > to >= for temp. fix.
	 */
	if (va >= VM_MAXUSER_ADDRESS)
		return (1);

	p = curproc;
	vm = p->p_vmspace;

	PROC_LOCK(p);
	++p->p_lock;
	PROC_UNLOCK(p);

	if (!grow_stack (p, va))
		rv = KERN_FAILURE;
	else
		/*
		 * fault the data page
		 */
		rv = vm_fault(&vm->vm_map, va, VM_PROT_WRITE, VM_FAULT_DIRTY);

	PROC_LOCK(p);
	--p->p_lock;
	PROC_UNLOCK(p);

	if (rv != KERN_SUCCESS)
		return 1;

	return (0);
}

/*
 *	syscall -	MP aware system call request C handler
 *
 *	A system call is essentially treated as a trap except that the
 *	MP lock is not held on entry or return.  We are responsible for
 *	obtaining the MP lock if necessary and for handling ASTs
 *	(e.g. a task switch) prior to return.
 *
 *	In general, only simple access and manipulation of curproc and
 *	the current stack is allowed without having to hold MP lock.
 */
void
syscall(frame)
	struct trapframe frame;
{
	caddr_t params;
	int i;
	struct sysent *callp;
	struct proc *p = curproc;
	u_quad_t sticks;
	int error;
	int narg;
	int args[8];
	u_int code;

	atomic_add_int(&cnt.v_syscall, 1);

#ifdef DIAGNOSTIC
	if (ISPL(frame.tf_cs) != SEL_UPL) {
		mtx_lock(&Giant);
		panic("syscall");
		/* NOT REACHED */
	}
#endif

	mtx_lock_spin(&sched_lock);
	sticks = p->p_sticks;
	mtx_unlock_spin(&sched_lock);

	p->p_md.md_regs = &frame;
	params = (caddr_t)frame.tf_esp + sizeof(int);
	code = frame.tf_eax;

	if (p->p_sysent->sv_prepsyscall) {
		/*
		 * The prep code is not MP aware.
		 */
		mtx_lock(&Giant);
		(*p->p_sysent->sv_prepsyscall)(&frame, args, &code, &params);
		mtx_unlock(&Giant);
	} else {
		/*
		 * Need to check if this is a 32 bit or 64 bit syscall.
		 * fuword is MP aware.
		 */
		if (code == SYS_syscall) {
			/*
			 * Code is first argument, followed by actual args.
			 */
			code = fuword(params);
			params += sizeof(int);
		} else if (code == SYS___syscall) {
			/*
			 * Like syscall, but code is a quad, so as to maintain
			 * quad alignment for the rest of the arguments.
			 */
			code = fuword(params);
			params += sizeof(quad_t);
		}
	}

	if (p->p_sysent->sv_mask)
		code &= p->p_sysent->sv_mask;

	if (code >= p->p_sysent->sv_size)
		callp = &p->p_sysent->sv_table[0];
	else
		callp = &p->p_sysent->sv_table[code];

	narg = callp->sy_narg & SYF_ARGMASK;

	/*
	 * copyin is MP aware, but the tracing code is not
	 */
	if (params && (i = narg * sizeof(int)) &&
	    (error = copyin(params, (caddr_t)args, (u_int)i))) {
		mtx_lock(&Giant);
#ifdef KTRACE
		if (KTRPOINT(p, KTR_SYSCALL))
			ktrsyscall(p->p_tracep, code, narg, args);
#endif
		goto bad;
	}

	/*
	 * Try to run the syscall without the MP lock if the syscall
	 * is MP safe.
	 */
	if ((callp->sy_narg & SYF_MPSAFE) == 0) {
		mtx_lock(&Giant);
	}

#ifdef KTRACE
	/*
	 * We have to obtain the MP lock no matter what if
	 * we are ktracing
	 */
	if (KTRPOINT(p, KTR_SYSCALL)) {
		if (!mtx_owned(&Giant))
			mtx_lock(&Giant);
		ktrsyscall(p->p_tracep, code, narg, args);
	}
#endif
	p->p_retval[0] = 0;
	p->p_retval[1] = frame.tf_edx;

	STOPEVENT(p, S_SCE, narg);	/* MP aware */

	error = (*callp->sy_call)(p, args);

	/*
	 * MP SAFE (we may or may not have the MP lock at this point)
	 */
	switch (error) {
	case 0:
		frame.tf_eax = p->p_retval[0];
		frame.tf_edx = p->p_retval[1];
		frame.tf_eflags &= ~PSL_C;
		break;

	case ERESTART:
		/*
		 * Reconstruct pc, assuming lcall $X,y is 7 bytes,
		 * int 0x80 is 2 bytes. We saved this in tf_err.
		 */
		frame.tf_eip -= frame.tf_err;
		break;

	case EJUSTRETURN:
		break;

	default:
bad:
		if (p->p_sysent->sv_errsize) {
			if (error >= p->p_sysent->sv_errsize)
				error = -1;	/* XXX */
			else
				error = p->p_sysent->sv_errtbl[error];
		}
		frame.tf_eax = error;
		frame.tf_eflags |= PSL_C;
		break;
	}

	/*
	 * Traced syscall.  trapsignal() is not MP aware.
	 */
	if ((frame.tf_eflags & PSL_T) && !(frame.tf_eflags & PSL_VM)) {
		if (!mtx_owned(&Giant))
			mtx_lock(&Giant);
		frame.tf_eflags &= ~PSL_T;
		trapsignal(p, SIGTRAP, 0);
	}

	/*
	 * Handle reschedule and other end-of-syscall issues
	 */
	userret(p, &frame, sticks);

#ifdef KTRACE
	if (KTRPOINT(p, KTR_SYSRET)) {
		if (!mtx_owned(&Giant))
			mtx_lock(&Giant);
		ktrsysret(p->p_tracep, code, error, p->p_retval[0]);
	}
#endif

	/*
	 * Release Giant if we had to get it
	 */
	if (mtx_owned(&Giant))
		mtx_unlock(&Giant);

	/*
	 * This works because errno is findable through the
	 * register set.  If we ever support an emulation where this
	 * is not the case, this code will need to be revisited.
	 */
	STOPEVENT(p, S_SCX, code);

#ifdef WITNESS
	if (witness_list(p)) {
		panic("system call %s returning with mutex(s) held\n",
		    syscallnames[code]);
	}
#endif
	mtx_assert(&sched_lock, MA_NOTOWNED);
	mtx_assert(&Giant, MA_NOTOWNED);
}

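/*
 * Handle an asynchronous system trap on return to user mode: post any
 * profiling ticks that are owed (PS_OWEUPC), deliver SIGVTALRM and
 * SIGPROF for expired interval timers, pick up a pending FPU exception,
 * and finish with the usual userret() processing.
 */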
void
ast(framep)
	struct trapframe *framep;
{
	struct proc *p = CURPROC;
	u_quad_t sticks;
#if defined(DEV_NPX) && !defined(SMP)
	int ucode;
#endif

	KASSERT(TRAPF_USERMODE(framep), ("ast in kernel mode"));

	/*
	 * We check for a pending AST here rather than in the assembly as
	 * acquiring and releasing mutexes in assembly is not fun.
	 */
	mtx_lock_spin(&sched_lock);
	if (!(astpending(p) || resched_wanted(p))) {
		mtx_unlock_spin(&sched_lock);
		return;
	}

	sticks = p->p_sticks;
	p->p_md.md_regs = framep;

	astoff(p);
	cnt.v_soft++;
	mtx_intr_enable(&sched_lock);
	if (p->p_sflag & PS_OWEUPC) {
		p->p_sflag &= ~PS_OWEUPC;
		mtx_unlock_spin(&sched_lock);
		mtx_lock(&Giant);
		mtx_lock_spin(&sched_lock);
		addupc_task(p, p->p_stats->p_prof.pr_addr,
			    p->p_stats->p_prof.pr_ticks);
	}
	if (p->p_sflag & PS_ALRMPEND) {
		p->p_sflag &= ~PS_ALRMPEND;
		mtx_unlock_spin(&sched_lock);
		PROC_LOCK(p);
		psignal(p, SIGVTALRM);
		PROC_UNLOCK(p);
		mtx_lock_spin(&sched_lock);
	}
#if defined(DEV_NPX) && !defined(SMP)
	if (PCPU_GET(curpcb)->pcb_flags & PCB_NPXTRAP) {
		PCPU_GET(curpcb)->pcb_flags &= ~PCB_NPXTRAP;
		mtx_unlock_spin(&sched_lock);
		ucode = npxtrap();
		if (ucode != -1) {
			if (!mtx_owned(&Giant))
				mtx_lock(&Giant);
			trapsignal(p, SIGFPE, ucode);
		}
		mtx_lock_spin(&sched_lock);
	}
#endif
	if (p->p_sflag & PS_PROFPEND) {
		p->p_sflag &= ~PS_PROFPEND;
		mtx_unlock_spin(&sched_lock);
		PROC_LOCK(p);
		psignal(p, SIGPROF);
		PROC_UNLOCK(p);
	} else
		mtx_unlock_spin(&sched_lock);

	userret(p, framep, sticks);

	if (mtx_owned(&Giant))
		mtx_unlock(&Giant);
}
1325