subr_syscall.c revision 76650
/*-
 * Copyright (C) 1994, David Greenman
 * Copyright (c) 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the University of Utah, and William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)trap.c	7.4 (Berkeley) 5/13/91
 * $FreeBSD: head/sys/kern/subr_trap.c 76650 2001-05-15 23:22:29Z jhb $
 */

/*
 * 386 Trap and System call handling
 */

#include "opt_clock.h"
#include "opt_cpu.h"
#include "opt_ddb.h"
#include "opt_isa.h"
#include "opt_ktrace.h"
#include "opt_npx.h"
#include "opt_trap.h"

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/pioctl.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/mutex.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/syscall.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/uio.h>
#include <sys/vmmeter.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <sys/lock.h>
#include <vm/pmap.h>
#include <vm/vm_kern.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_extern.h>

#include <machine/cpu.h>
#include <machine/md_var.h>
#include <machine/pcb.h>
#ifdef SMP
#include <machine/smp.h>
#endif
#include <machine/tss.h>

#include <i386/isa/icu.h>
#include <i386/isa/intr_machdep.h>

#ifdef POWERFAIL_NMI
#include <sys/syslog.h>
#include <machine/clock.h>
#endif

#include <machine/vm86.h>

#include <ddb/ddb.h>

int (*pmath_emulate) __P((struct trapframe *));

extern void trap __P((struct trapframe frame));
extern int trapwrite __P((unsigned addr));
extern void syscall __P((struct trapframe frame));
extern void ast __P((struct trapframe *framep));

static int trap_pfault __P((struct trapframe *, int, vm_offset_t));
static void trap_fatal __P((struct trapframe *, vm_offset_t));
void dblfault_handler __P((void));

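/*
 * Entry point of the lcall-based syscall gate; the trace trap handling
 * in trap() below compares faulting %eip values against it.
 */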
extern inthand_t IDTVEC(lcall_syscall);

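/*
 * Human-readable trap names, indexed by trap number; used by
 * trap_fatal() and the DEBUG trap reporting in trap().
 */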
#define MAX_TRAP_MSG		28
static char *trap_msg[] = {
	"",					/*  0 unused */
	"privileged instruction fault",		/*  1 T_PRIVINFLT */
	"",					/*  2 unused */
	"breakpoint instruction fault",		/*  3 T_BPTFLT */
	"",					/*  4 unused */
	"",					/*  5 unused */
	"arithmetic trap",			/*  6 T_ARITHTRAP */
	"",					/*  7 unused */
	"",					/*  8 unused */
	"general protection fault",		/*  9 T_PROTFLT */
	"trace trap",				/* 10 T_TRCTRAP */
	"",					/* 11 unused */
	"page fault",				/* 12 T_PAGEFLT */
	"",					/* 13 unused */
	"alignment fault",			/* 14 T_ALIGNFLT */
	"",					/* 15 unused */
	"",					/* 16 unused */
	"",					/* 17 unused */
	"integer divide fault",			/* 18 T_DIVIDE */
	"non-maskable interrupt trap",		/* 19 T_NMI */
	"overflow trap",			/* 20 T_OFLOW */
	"FPU bounds check fault",		/* 21 T_BOUND */
	"FPU device not available",		/* 22 T_DNA */
	"double fault",				/* 23 T_DOUBLEFLT */
	"FPU operand fetch fault",		/* 24 T_FPOPFLT */
	"invalid TSS fault",			/* 25 T_TSSFLT */
	"segment not present fault",		/* 26 T_SEGNPFLT */
	"stack fault",				/* 27 T_STKFLT */
	"machine check trap",			/* 28 T_MCHK */
};

#if defined(I586_CPU) && !defined(NO_F00F_HACK)
extern int has_f00f_bug;
#endif

#ifdef DDB
static int ddb_on_nmi = 1;
SYSCTL_INT(_machdep, OID_AUTO, ddb_on_nmi, CTLFLAG_RW,
	&ddb_on_nmi, 0, "Go to DDB on NMI");
#endif
static int panic_on_nmi = 1;
SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RW,
	&panic_on_nmi, 0, "Panic on NMI");

#ifdef WITNESS
extern char *syscallnames[];
#endif

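/*
 * Common code for returning to user mode after a trap or syscall:
 * deliver pending signals, reset the priority to the user level,
 * reschedule if requested, and charge system time to profiling.
 */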
void
userret(p, frame, oticks)
	struct proc *p;
	struct trapframe *frame;
	u_quad_t oticks;
{
	int sig;

	while ((sig = CURSIG(p)) != 0)
		postsig(sig);

	mtx_lock_spin(&sched_lock);
	p->p_pri.pri_level = p->p_pri.pri_user;
	if (resched_wanted(p)) {
		/*
		 * Since we are curproc, clock will normally just change
		 * our priority without moving us from one queue to another
		 * (since the running process is not on a queue).  If that
		 * happened after we put ourselves on the run queue but
		 * before we mi_switch()'ed, we might not be on the queue
		 * indicated by our priority.
		 */
		DROP_GIANT_NOSWITCH();
		setrunqueue(p);
		p->p_stats->p_ru.ru_nivcsw++;
		mi_switch();
		mtx_unlock_spin(&sched_lock);
		PICKUP_GIANT();
		while ((sig = CURSIG(p)) != 0)
			postsig(sig);
		mtx_lock_spin(&sched_lock);
	}

	/*
	 * Charge system time if profiling.
	 */
	if (p->p_sflag & PS_PROFIL) {
		mtx_unlock_spin(&sched_lock);
		/* XXX - do we need Giant? */
		if (!mtx_owned(&Giant))
			mtx_lock(&Giant);
		addupc_task(p, TRAPF_PC(frame),
		    (u_int)(p->p_sticks - oticks) * psratio);
	} else
		mtx_unlock_spin(&sched_lock);
}

/*
 * Exception, fault, and trap interface to the FreeBSD kernel.
 * This common code is called from assembly language IDT gate entry
 * routines that prepare a suitable stack frame, and restore this
 * frame after the exception has been processed.
 */

void
trap(frame)
	struct trapframe frame;
{
	struct proc *p = curproc;
	u_quad_t sticks = 0;
	int i = 0, ucode = 0, type, code;
	vm_offset_t eva;
#ifdef POWERFAIL_NMI
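	/* Time of the last power-fail alert; used to rate-limit warnings. */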
	static int lastalert = 0;
#endif

	atomic_add_int(&cnt.v_trap, 1);

	if ((frame.tf_eflags & PSL_I) == 0) {
		/*
		 * Buggy application or kernel code has disabled
		 * interrupts and then trapped.  Enabling interrupts
		 * now is wrong, but it is better than running with
		 * interrupts disabled until they are accidentally
		 * enabled later.  XXX This is really bad if we trap
		 * while holding a spin lock.
		 */
		type = frame.tf_trapno;
		if (ISPL(frame.tf_cs) == SEL_UPL || (frame.tf_eflags & PSL_VM))
			printf(
			    "pid %ld (%s): trap %d with interrupts disabled\n",
			    (long)curproc->p_pid, curproc->p_comm, type);
		else if (type != T_BPTFLT && type != T_TRCTRAP) {
			/*
			 * XXX not quite right, since this may be for a
			 * multiple fault in user mode.
			 */
			printf("kernel trap %d with interrupts disabled\n",
			    type);
			/*
			 * We should walk p_heldmtx here and see if any are
			 * spin mutexes, and not do this if so.
			 */
			enable_intr();
		}
	}

	eva = 0;

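	/*
	 * Page faults diverted by the Pentium F00F workaround restart
	 * here after being reclassified as privileged instruction
	 * faults; see the T_PAGEFLT case below.
	 */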
#if defined(I586_CPU) && !defined(NO_F00F_HACK)
restart:
#endif

	type = frame.tf_trapno;
	code = frame.tf_err;

	if ((ISPL(frame.tf_cs) == SEL_UPL) ||
	    ((frame.tf_eflags & PSL_VM) && !in_vm86call)) {
		/* user trap */

		mtx_lock_spin(&sched_lock);
		sticks = p->p_sticks;
		mtx_unlock_spin(&sched_lock);
		p->p_md.md_regs = &frame;

		switch (type) {
		case T_PRIVINFLT:	/* privileged instruction fault */
			ucode = type;
			i = SIGILL;
			break;

		case T_BPTFLT:		/* bpt instruction fault */
		case T_TRCTRAP:		/* trace trap */
			frame.tf_eflags &= ~PSL_T;
			i = SIGTRAP;
			break;

		case T_ARITHTRAP:	/* arithmetic trap */
			ucode = code;
			i = SIGFPE;
			break;

			/*
			 * The following two traps can happen in
			 * vm86 mode, and, if so, we want to handle
			 * them specially.
			 */
		case T_PROTFLT:		/* general protection fault */
		case T_STKFLT:		/* stack fault */
			if (frame.tf_eflags & PSL_VM) {
				mtx_lock(&Giant);
				i = vm86_emulate((struct vm86frame *)&frame);
				mtx_unlock(&Giant);
				if (i == 0)
					goto user;
				break;
			}
			/* FALL THROUGH */

		case T_SEGNPFLT:	/* segment not present fault */
		case T_TSSFLT:		/* invalid TSS fault */
		case T_DOUBLEFLT:	/* double fault */
		default:
			ucode = code + BUS_SEGM_FAULT;
			i = SIGBUS;
			break;

		case T_PAGEFLT:		/* page fault */
			/*
			 * For some Cyrix CPUs, %cr2 is clobbered by
			 * interrupts.  This problem is worked around by using
			 * an interrupt gate for the pagefault handler.  We
			 * are finally ready to read %cr2 and then must
			 * reenable interrupts.
			 */
			eva = rcr2();
			enable_intr();
			mtx_lock(&Giant);
			i = trap_pfault(&frame, TRUE, eva);
			mtx_unlock(&Giant);
#if defined(I586_CPU) && !defined(NO_F00F_HACK)
			if (i == -2) {
				/*
				 * The f00f hack workaround has triggered, so
				 * treat the fault as an illegal instruction
				 * instead of a page fault.
				 */
				frame.tf_trapno = T_PRIVINFLT;
				goto restart;
			}
#endif
			if (i == -1)
				goto out;
			if (i == 0)
				goto user;

			ucode = T_PAGEFLT;
			break;

		case T_DIVIDE:		/* integer divide fault */
			ucode = FPE_INTDIV;
			i = SIGFPE;
			break;

#ifdef DEV_ISA
		case T_NMI:
#ifdef POWERFAIL_NMI
#ifndef TIMER_FREQ
#  define TIMER_FREQ 1193182
#endif
			mtx_lock(&Giant);
			if (time_second - lastalert > 10) {
				log(LOG_WARNING, "NMI: power fail\n");
				sysbeep(TIMER_FREQ/880, hz);
				lastalert = time_second;
			}
			mtx_unlock(&Giant);
			goto out;
#else /* !POWERFAIL_NMI */
			/* machine/parity/power fail/"kitchen sink" faults */
			/* XXX Giant */
			if (isa_nmi(code) == 0) {
#ifdef DDB
				/*
				 * NMI can be hooked up to a pushbutton
				 * for debugging.
				 */
				if (ddb_on_nmi) {
					printf("NMI ... going to debugger\n");
					kdb_trap(type, 0, &frame);
				}
#endif /* DDB */
				goto out;
			} else if (panic_on_nmi)
				panic("NMI indicates hardware failure");
			break;
#endif /* POWERFAIL_NMI */
#endif /* DEV_ISA */

		case T_OFLOW:		/* integer overflow fault */
			ucode = FPE_INTOVF;
			i = SIGFPE;
			break;

		case T_BOUND:		/* bounds check fault */
			ucode = FPE_FLTSUB;
			i = SIGFPE;
			break;

		case T_DNA:
#ifdef DEV_NPX
			/* transparent fault (due to context switch "late") */
			if (npxdna())
				goto out;
#endif
			if (!pmath_emulate) {
				i = SIGFPE;
				ucode = FPE_FPU_NP_TRAP;
				break;
			}
			mtx_lock(&Giant);
			i = (*pmath_emulate)(&frame);
			mtx_unlock(&Giant);
			if (i == 0) {
				if (!(frame.tf_eflags & PSL_T))
					goto out;
				frame.tf_eflags &= ~PSL_T;
				i = SIGTRAP;
			}
			/* else ucode = emulator_only_knows() XXX */
			break;

		case T_FPOPFLT:		/* FPU operand fetch fault */
			ucode = T_FPOPFLT;
			i = SIGILL;
			break;
		}
	} else {
		/* kernel trap */

		switch (type) {
		case T_PAGEFLT:			/* page fault */
			/*
			 * For some Cyrix CPUs, %cr2 is clobbered by
			 * interrupts.  This problem is worked around by using
			 * an interrupt gate for the pagefault handler.  We
			 * are finally ready to read %cr2 and then must
			 * reenable interrupts.
			 */
			eva = rcr2();
			enable_intr();
			mtx_lock(&Giant);
			(void) trap_pfault(&frame, FALSE, eva);
			mtx_unlock(&Giant);
			goto out;

		case T_DNA:
#ifdef DEV_NPX
			/*
			 * The kernel is apparently using npx for copying.
			 * XXX this should be fatal unless the kernel has
			 * registered such use.
			 */
			if (npxdna())
				goto out;
#endif
			break;

			/*
			 * The following two traps can happen in
			 * vm86 mode, and, if so, we want to handle
			 * them specially.
			 */
		case T_PROTFLT:		/* general protection fault */
		case T_STKFLT:		/* stack fault */
			if (frame.tf_eflags & PSL_VM) {
				mtx_lock(&Giant);
				i = vm86_emulate((struct vm86frame *)&frame);
				mtx_unlock(&Giant);
				if (i != 0)
					/*
					 * returns to original process
					 */
					vm86_trap((struct vm86frame *)&frame);
				goto out;
			}
			if (type == T_STKFLT)
				break;

			/* FALL THROUGH */

		case T_SEGNPFLT:	/* segment not present fault */
			if (in_vm86call)
				break;

			if (p->p_intr_nesting_level != 0)
				break;

			/*
			 * Invalid %fs's and %gs's can be created using
			 * procfs or PT_SETREGS or by invalidating the
			 * underlying LDT entry.  This causes a fault
			 * in kernel mode when the kernel attempts to
			 * switch contexts.  Lose the bad context
			 * (XXX) so that we can continue, and generate
			 * a signal.
			 */
			if (frame.tf_eip == (int)cpu_switch_load_gs) {
				PCPU_GET(curpcb)->pcb_gs = 0;
				PROC_LOCK(p);
				psignal(p, SIGBUS);
				PROC_UNLOCK(p);
				goto out;
			}

			/*
			 * Invalid segment selectors and out of bounds
			 * %eip's and %esp's can be set up in user mode.
			 * This causes a fault in kernel mode when the
			 * kernel tries to return to user mode.  We want
			 * to get this fault so that we can fix the
			 * problem here and not have to check all the
			 * selectors and pointers when the user changes
			 * them.
			 */
			if (frame.tf_eip == (int)doreti_iret) {
				frame.tf_eip = (int)doreti_iret_fault;
				goto out;
			}
			if (frame.tf_eip == (int)doreti_popl_ds) {
				frame.tf_eip = (int)doreti_popl_ds_fault;
				goto out;
			}
			if (frame.tf_eip == (int)doreti_popl_es) {
				frame.tf_eip = (int)doreti_popl_es_fault;
				goto out;
			}
			if (frame.tf_eip == (int)doreti_popl_fs) {
				frame.tf_eip = (int)doreti_popl_fs_fault;
				goto out;
			}
			if (PCPU_GET(curpcb) != NULL &&
			    PCPU_GET(curpcb)->pcb_onfault != NULL) {
				frame.tf_eip =
				    (int)PCPU_GET(curpcb)->pcb_onfault;
				goto out;
			}
			break;

		case T_TSSFLT:
			/*
			 * PSL_NT can be set in user mode and isn't cleared
			 * automatically when the kernel is entered.  This
			 * causes a TSS fault when the kernel attempts to
			 * `iret' because the TSS link is uninitialized.  We
			 * want to get this fault so that we can fix the
			 * problem here and not every time the kernel is
			 * entered.
			 */
			if (frame.tf_eflags & PSL_NT) {
				frame.tf_eflags &= ~PSL_NT;
				goto out;
			}
			break;

		case T_TRCTRAP:		/* trace trap */
			if (frame.tf_eip == (int)IDTVEC(lcall_syscall)) {
				/*
				 * We've just entered system mode via the
				 * syscall lcall.  Continue single stepping
				 * silently until the syscall handler has
				 * saved the flags.
				 */
				goto out;
			}
			if (frame.tf_eip == (int)IDTVEC(lcall_syscall) + 1) {
				/*
				 * The syscall handler has now saved the
				 * flags.  Stop single stepping it.
				 */
				frame.tf_eflags &= ~PSL_T;
				goto out;
			}
			/*
			 * Ignore debug register trace traps due to
			 * accesses in the user's address space, which
			 * can happen under several conditions such as
			 * if a user sets a watchpoint on a buffer and
			 * then passes that buffer to a system call.
			 * We still want to get TRCTRAPS for addresses
			 * in kernel space because that is useful when
			 * debugging the kernel.
			 */
			/* XXX Giant */
			if (user_dbreg_trap() && !in_vm86call) {
				/*
				 * Reset breakpoint bits because the
				 * processor doesn't clear them itself.
				 */
				load_dr6(rdr6() & 0xfffffff0);
				goto out;
			}
			/*
			 * Fall through (TRCTRAP kernel mode, kernel address)
			 */
		case T_BPTFLT:
			/*
			 * If DDB is enabled, let it handle the debugger trap.
			 * Otherwise, debugger traps "can't happen".
			 */
#ifdef DDB
			/* XXX Giant */
			if (kdb_trap(type, 0, &frame))
				goto out;
#endif
			break;

#ifdef DEV_ISA
		case T_NMI:
#ifdef POWERFAIL_NMI
			mtx_lock(&Giant);
			if (time_second - lastalert > 10) {
				log(LOG_WARNING, "NMI: power fail\n");
				sysbeep(TIMER_FREQ/880, hz);
				lastalert = time_second;
			}
			mtx_unlock(&Giant);
			goto out;
#else /* !POWERFAIL_NMI */
			/* XXX Giant */
			/* machine/parity/power fail/"kitchen sink" faults */
			if (isa_nmi(code) == 0) {
#ifdef DDB
				/*
				 * NMI can be hooked up to a pushbutton
				 * for debugging.
				 */
				if (ddb_on_nmi) {
					printf("NMI ... going to debugger\n");
					kdb_trap(type, 0, &frame);
				}
#endif /* DDB */
				goto out;
			} else if (panic_on_nmi == 0)
				goto out;
			/* FALL THROUGH */
#endif /* POWERFAIL_NMI */
#endif /* DEV_ISA */
		}

		mtx_lock(&Giant);
		trap_fatal(&frame, eva);
		mtx_unlock(&Giant);
		goto out;
	}

	mtx_lock(&Giant);
	/* Translate fault for emulators (e.g. Linux) */
	if (*p->p_sysent->sv_transtrap)
		i = (*p->p_sysent->sv_transtrap)(i, type);

	trapsignal(p, i, ucode);

#ifdef DEBUG
	if (type <= MAX_TRAP_MSG) {
		uprintf("fatal process exception: %s",
			trap_msg[type]);
		if ((type == T_PAGEFLT) || (type == T_PROTFLT))
			uprintf(", fault VA = 0x%lx", (u_long)eva);
		uprintf("\n");
	}
#endif
	mtx_unlock(&Giant);

user:
	userret(p, &frame, sticks);
	if (mtx_owned(&Giant))
		mtx_unlock(&Giant);
out:
	return;
}

#ifdef notyet
/*
 * This version doesn't allow a page fault to user space while
 * in the kernel. The rest of the kernel needs to be made "safe"
 * before this can be used. I think the only things remaining
 * to be made safe are the iBCS2 code and the process tracing/
 * debugging code.
 */
static int
trap_pfault(frame, usermode, eva)
	struct trapframe *frame;
	int usermode;
	vm_offset_t eva;
{
	vm_offset_t va;
	struct vmspace *vm = NULL;
	vm_map_t map = 0;
	int rv = 0;
	vm_prot_t ftype;
	struct proc *p = curproc;

	if (frame->tf_err & PGEX_W)
		ftype = VM_PROT_WRITE;
	else
		ftype = VM_PROT_READ;

	va = trunc_page(eva);
	if (va < VM_MIN_KERNEL_ADDRESS) {
		vm_offset_t v;
		vm_page_t mpte;

		if (p == NULL ||
		    (!usermode && va < VM_MAXUSER_ADDRESS &&
		     (p->p_intr_nesting_level != 0 ||
		      PCPU_GET(curpcb) == NULL ||
		      PCPU_GET(curpcb)->pcb_onfault == NULL))) {
			trap_fatal(frame, eva);
			return (-1);
		}

		/*
		 * This is a fault on non-kernel virtual memory.
		 * vm is initialized above to NULL. If curproc is NULL
		 * or curproc->p_vmspace is NULL the fault is fatal.
		 */
		vm = p->p_vmspace;
		if (vm == NULL)
			goto nogo;

		map = &vm->vm_map;

		/*
		 * Keep swapout from messing with us during this
		 * critical time.
		 */
		PROC_LOCK(p);
		++p->p_lock;
		PROC_UNLOCK(p);

		/*
		 * Grow the stack if necessary.  grow_stack() returns
		 * false only if va falls into a growable stack region
		 * and the stack growth fails.  It returns true if va
		 * was not within a growable stack region, or if the
		 * stack growth succeeded.
		 */
		if (!grow_stack(p, va))
			rv = KERN_FAILURE;
		else
			/* Fault in the user page: */
			rv = vm_fault(map, va, ftype,
			    (ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY
						    : VM_FAULT_NORMAL);

		PROC_LOCK(p);
		--p->p_lock;
		PROC_UNLOCK(p);
	} else {
		/*
		 * Don't allow user-mode faults in kernel address space.
		 */
		if (usermode)
			goto nogo;

		/*
		 * Since we know that kernel virtual addresses always
		 * have pte pages mapped, we just have to fault in
		 * the page.
		 */
		rv = vm_fault(kernel_map, va, ftype, VM_FAULT_NORMAL);
	}

	if (rv == KERN_SUCCESS)
		return (0);
nogo:
	if (!usermode) {
		if (p->p_intr_nesting_level == 0 &&
		    PCPU_GET(curpcb) != NULL &&
		    PCPU_GET(curpcb)->pcb_onfault != NULL) {
			frame->tf_eip = (int)PCPU_GET(curpcb)->pcb_onfault;
			return (0);
		}
		trap_fatal(frame, eva);
		return (-1);
	}

	/* kludge to pass faulting virtual address to sendsig */
	frame->tf_err = eva;

	return ((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV);
}
#endif

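/*
 * Handle a page fault.  Returns 0 if the fault was resolved, -1 if
 * trap_fatal() was called, -2 if the Pentium F00F workaround was
 * triggered, or a signal number (SIGBUS or SIGSEGV) for an
 * unresolved user-mode fault.
 */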
static int
trap_pfault(frame, usermode, eva)
	struct trapframe *frame;
	int usermode;
	vm_offset_t eva;
{
	vm_offset_t va;
	struct vmspace *vm = NULL;
	vm_map_t map = 0;
	int rv = 0;
	vm_prot_t ftype;
	struct proc *p = curproc;

	va = trunc_page(eva);
	if (va >= KERNBASE) {
		/*
		 * Don't allow user-mode faults in kernel address space.
		 * An exception:  if the faulting address is the invalid
		 * instruction entry in the IDT, then the Intel Pentium
		 * F00F bug workaround was triggered, and we need to
		 * treat it as an illegal instruction, and not a page
		 * fault.
		 */
#if defined(I586_CPU) && !defined(NO_F00F_HACK)
		if ((eva == (unsigned int)&idt[6]) && has_f00f_bug)
			return (-2);
#endif
		if (usermode)
			goto nogo;

		map = kernel_map;
	} else {
		/*
		 * This is a fault on non-kernel virtual memory.
		 * vm is initialized above to NULL. If curproc is NULL
		 * or curproc->p_vmspace is NULL the fault is fatal.
		 */
		if (p != NULL)
			vm = p->p_vmspace;

		if (vm == NULL)
			goto nogo;

		map = &vm->vm_map;
	}

	if (frame->tf_err & PGEX_W)
		ftype = VM_PROT_WRITE;
	else
		ftype = VM_PROT_READ;

	if (map != kernel_map) {
		/*
		 * Keep swapout from messing with us during this
		 * critical time.
		 */
		PROC_LOCK(p);
		++p->p_lock;
		PROC_UNLOCK(p);

		/*
		 * Grow the stack if necessary.  grow_stack() returns
		 * false only if va falls into a growable stack region
		 * and the stack growth fails.  It returns true if va
		 * was not within a growable stack region, or if the
		 * stack growth succeeded.
		 */
		if (!grow_stack(p, va))
			rv = KERN_FAILURE;
		else
			/* Fault in the user page: */
			rv = vm_fault(map, va, ftype,
			    (ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY
						    : VM_FAULT_NORMAL);

		PROC_LOCK(p);
		--p->p_lock;
		PROC_UNLOCK(p);
	} else {
		/*
		 * Don't have to worry about process locking or stacks in the
		 * kernel.
		 */
		rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL);
	}

	if (rv == KERN_SUCCESS)
		return (0);
nogo:
	if (!usermode) {
		if (p->p_intr_nesting_level == 0 &&
		    PCPU_GET(curpcb) != NULL &&
		    PCPU_GET(curpcb)->pcb_onfault != NULL) {
			frame->tf_eip = (int)PCPU_GET(curpcb)->pcb_onfault;
			return (0);
		}
		trap_fatal(frame, eva);
		return (-1);
	}

	/* kludge to pass faulting virtual address to sendsig */
	frame->tf_err = eva;

	return ((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV);
}

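/*
 * Report a fatal trap in detail on the console, give the debugger a
 * chance to take over if one is configured, and panic.
 */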
static void
trap_fatal(frame, eva)
	struct trapframe *frame;
	vm_offset_t eva;
{
	int code, type, ss, esp;
	struct soft_segment_descriptor softseg;

	code = frame->tf_err;
	type = frame->tf_trapno;
	sdtossd(&gdt[IDXSEL(frame->tf_cs & 0xffff)].sd, &softseg);

	if (type <= MAX_TRAP_MSG)
		printf("\n\nFatal trap %d: %s while in %s mode\n",
		    type, trap_msg[type],
		    frame->tf_eflags & PSL_VM ? "vm86" :
		    ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel");
#ifdef SMP
	/* two separate prints in case of a trap on an unmapped page */
	printf("cpuid = %d; ", PCPU_GET(cpuid));
	printf("lapic.id = %08x\n", lapic.id);
#endif
	if (type == T_PAGEFLT) {
		printf("fault virtual address	= 0x%x\n", eva);
		printf("fault code		= %s %s, %s\n",
		    code & PGEX_U ? "user" : "supervisor",
		    code & PGEX_W ? "write" : "read",
		    code & PGEX_P ? "protection violation" : "page not present");
	}
	printf("instruction pointer	= 0x%x:0x%x\n",
	    frame->tf_cs & 0xffff, frame->tf_eip);
	if ((ISPL(frame->tf_cs) == SEL_UPL) || (frame->tf_eflags & PSL_VM)) {
		ss = frame->tf_ss & 0xffff;
		esp = frame->tf_esp;
	} else {
		ss = GSEL(GDATA_SEL, SEL_KPL);
		esp = (int)&frame->tf_esp;
	}
	printf("stack pointer		= 0x%x:0x%x\n", ss, esp);
	printf("frame pointer		= 0x%x:0x%x\n", ss, frame->tf_ebp);
	printf("code segment		= base 0x%x, limit 0x%x, type 0x%x\n",
	    softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type);
	printf("			= DPL %d, pres %d, def32 %d, gran %d\n",
	    softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_def32,
	    softseg.ssd_gran);
	printf("processor eflags	= ");
	if (frame->tf_eflags & PSL_T)
		printf("trace trap, ");
	if (frame->tf_eflags & PSL_I)
		printf("interrupt enabled, ");
	if (frame->tf_eflags & PSL_NT)
		printf("nested task, ");
	if (frame->tf_eflags & PSL_RF)
		printf("resume, ");
	if (frame->tf_eflags & PSL_VM)
		printf("vm86, ");
	printf("IOPL = %d\n", (frame->tf_eflags & PSL_IOPL) >> 12);
	printf("current process		= ");
	if (curproc) {
		printf("%lu (%s)\n",
		    (u_long)curproc->p_pid, curproc->p_comm ?
		    curproc->p_comm : "");
	} else {
		printf("Idle\n");
	}

#ifdef KDB
	if (kdb_trap(&psl))
		return;
#endif
#ifdef DDB
	if ((debugger_on_panic || db_active) && kdb_trap(type, 0, frame))
		return;
#endif
	printf("trap number		= %d\n", type);
	if (type <= MAX_TRAP_MSG)
		panic(trap_msg[type]);
	else
		panic("unknown/reserved trap");
}

/*
 * Double fault handler. Called when a fault occurs while writing
 * a frame for a trap/exception onto the stack. This usually occurs
 * when the stack overflows (as is the case with infinite recursion,
 * for example).
 *
 * XXX Note that the current PTD gets replaced by IdlePTD when the
 * task switch occurs. This means that the stack that was active at
 * the time of the double fault is not available at <kstack> unless
 * the machine was idle when the double fault occurred. The downside
 * of this is that "trace <ebp>" in ddb won't work.
 */
void
dblfault_handler()
{
	printf("\nFatal double fault:\n");
	printf("eip = 0x%x\n", PCPU_GET(common_tss.tss_eip));
	printf("esp = 0x%x\n", PCPU_GET(common_tss.tss_esp));
	printf("ebp = 0x%x\n", PCPU_GET(common_tss.tss_ebp));
#ifdef SMP
	/* two separate prints in case of a trap on an unmapped page */
	printf("cpuid = %d; ", PCPU_GET(cpuid));
	printf("lapic.id = %08x\n", lapic.id);
#endif
	panic("double fault");
}

/*
 * Compensate for 386 brain damage (missing URKR).
 * This is a little simpler than the pagefault handler in trap()
 * because the page tables have already been faulted in and high
 * addresses are thrown out early for other reasons.  Returns 1 on
 * failure and 0 on success.
 */
int
trapwrite(addr)
	unsigned addr;
{
	struct proc *p;
	vm_offset_t va;
	struct vmspace *vm;
	int rv;

	va = trunc_page((vm_offset_t)addr);
	/*
	 * XXX - MAX is END.  Changed > to >= for temp. fix.
	 */
	if (va >= VM_MAXUSER_ADDRESS)
		return (1);

	p = curproc;
	vm = p->p_vmspace;

	PROC_LOCK(p);
	++p->p_lock;
	PROC_UNLOCK(p);

	if (!grow_stack(p, va))
		rv = KERN_FAILURE;
	else
		/*
		 * fault the data page
		 */
		rv = vm_fault(&vm->vm_map, va, VM_PROT_WRITE, VM_FAULT_DIRTY);

	PROC_LOCK(p);
	--p->p_lock;
	PROC_UNLOCK(p);

	if (rv != KERN_SUCCESS)
		return (1);

	return (0);
}

/*
 *	syscall -	MP aware system call request C handler
 *
 *	A system call is essentially treated as a trap except that the
 *	MP lock is not held on entry or return.  We are responsible for
 *	obtaining the MP lock if necessary and for handling ASTs
 *	(e.g. a task switch) prior to return.
 *
 *	In general, only simple access and manipulation of curproc and
 *	the current stack is allowed without having to hold MP lock.
 */
void
syscall(frame)
	struct trapframe frame;
{
	caddr_t params;
	int i;
	struct sysent *callp;
	struct proc *p = curproc;
	u_quad_t sticks;
	int error;
	int narg;
	int args[8];
	u_int code;

	atomic_add_int(&cnt.v_syscall, 1);

#ifdef DIAGNOSTIC
	if (ISPL(frame.tf_cs) != SEL_UPL) {
		mtx_lock(&Giant);
		panic("syscall");
		/* NOT REACHED */
	}
#endif

	mtx_lock_spin(&sched_lock);
	sticks = p->p_sticks;
	mtx_unlock_spin(&sched_lock);

	p->p_md.md_regs = &frame;
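	/* The arguments sit on the user stack, just above the return address. */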
	params = (caddr_t)frame.tf_esp + sizeof(int);
	code = frame.tf_eax;

	if (p->p_sysent->sv_prepsyscall) {
		/*
		 * The prep code is not MP aware.
		 */
		mtx_lock(&Giant);
		(*p->p_sysent->sv_prepsyscall)(&frame, args, &code, &params);
		mtx_unlock(&Giant);
	} else {
		/*
		 * Need to check if this is a 32 bit or 64 bit syscall.
		 * fuword is MP aware.
		 */
		if (code == SYS_syscall) {
			/*
			 * Code is first argument, followed by actual args.
			 */
			code = fuword(params);
			params += sizeof(int);
		} else if (code == SYS___syscall) {
			/*
			 * Like syscall, but code is a quad, so as to maintain
			 * quad alignment for the rest of the arguments.
			 */
			code = fuword(params);
			params += sizeof(quad_t);
		}
	}

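	/* The emulation may restrict the range of syscall numbers with a mask. */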
	if (p->p_sysent->sv_mask)
		code &= p->p_sysent->sv_mask;

	if (code >= p->p_sysent->sv_size)
		callp = &p->p_sysent->sv_table[0];
	else
		callp = &p->p_sysent->sv_table[code];

	narg = callp->sy_narg & SYF_ARGMASK;

	/*
	 * copyin is MP aware, but the tracing code is not.
	 */
	if (params && (i = narg * sizeof(int)) &&
	    (error = copyin(params, (caddr_t)args, (u_int)i))) {
		mtx_lock(&Giant);
#ifdef KTRACE
		if (KTRPOINT(p, KTR_SYSCALL))
			ktrsyscall(p->p_tracep, code, narg, args);
#endif
		goto bad;
	}

	/*
	 * Try to run the syscall without the MP lock if the syscall
	 * is MP safe.  We have to obtain the MP lock no matter what
	 * if we are ktracing.
	 */
	if ((callp->sy_narg & SYF_MPSAFE) == 0) {
		mtx_lock(&Giant);
	}

#ifdef KTRACE
	if (KTRPOINT(p, KTR_SYSCALL)) {
		if (!mtx_owned(&Giant))
			mtx_lock(&Giant);
		ktrsyscall(p->p_tracep, code, narg, args);
	}
#endif
	p->p_retval[0] = 0;
	p->p_retval[1] = frame.tf_edx;

	STOPEVENT(p, S_SCE, narg);	/* MP aware */

	error = (*callp->sy_call)(p, args);

	/*
	 * MP SAFE (we may or may not have the MP lock at this point)
	 */
	switch (error) {
	case 0:
		frame.tf_eax = p->p_retval[0];
		frame.tf_edx = p->p_retval[1];
		frame.tf_eflags &= ~PSL_C;
		break;

	case ERESTART:
		/*
		 * Reconstruct pc, assuming lcall $X,y is 7 bytes,
		 * int 0x80 is 2 bytes.  We saved this in tf_err.
		 */
		frame.tf_eip -= frame.tf_err;
		break;

	case EJUSTRETURN:
		break;

	default:
bad:
		if (p->p_sysent->sv_errsize) {
			if (error >= p->p_sysent->sv_errsize)
				error = -1;	/* XXX */
			else
				error = p->p_sysent->sv_errtbl[error];
		}
		frame.tf_eax = error;
		frame.tf_eflags |= PSL_C;
		break;
	}

	/*
	 * Traced syscall.  trapsignal() is not MP aware.
	 */
	if ((frame.tf_eflags & PSL_T) && !(frame.tf_eflags & PSL_VM)) {
		if (!mtx_owned(&Giant))
			mtx_lock(&Giant);
		frame.tf_eflags &= ~PSL_T;
		trapsignal(p, SIGTRAP, 0);
	}

	/*
	 * Handle reschedule and other end-of-syscall issues
	 */
	userret(p, &frame, sticks);

#ifdef KTRACE
	if (KTRPOINT(p, KTR_SYSRET)) {
		if (!mtx_owned(&Giant))
			mtx_lock(&Giant);
		ktrsysret(p->p_tracep, code, error, p->p_retval[0]);
	}
#endif

	/*
	 * Release Giant if we had to get it
	 */
	if (mtx_owned(&Giant))
		mtx_unlock(&Giant);

	/*
	 * This works because errno is findable through the
	 * register set.  If we ever support an emulation where this
	 * is not the case, this code will need to be revisited.
	 */
	STOPEVENT(p, S_SCX, code);

#ifdef WITNESS
	if (witness_list(p)) {
		panic("system call %s returning with mutex(s) held\n",
		    syscallnames[code]);
	}
#endif
	mtx_assert(&sched_lock, MA_NOTOWNED);
	mtx_assert(&Giant, MA_NOTOWNED);
}

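/*
 * Handle an asynchronous software trap (AST): charge any owed
 * profiling ticks and post any pending SIGVTALRM/SIGPROF signals,
 * then finish the return to user mode via userret().
 */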
void
ast(framep)
	struct trapframe *framep;
{
	struct proc *p = CURPROC;
	u_quad_t sticks;

	KASSERT(TRAPF_USERMODE(framep), ("ast in kernel mode"));

	/*
	 * We check for a pending AST here rather than in the assembly as
	 * acquiring and releasing mutexes in assembly is not fun.
	 */
	mtx_lock_spin(&sched_lock);
	if (!(astpending(p) || resched_wanted(p))) {
		mtx_unlock_spin(&sched_lock);
		return;
	}

	sticks = p->p_sticks;
	p->p_md.md_regs = framep;

	astoff(p);
	cnt.v_soft++;
	mtx_intr_enable(&sched_lock);
	if (p->p_sflag & PS_OWEUPC) {
		p->p_sflag &= ~PS_OWEUPC;
		mtx_unlock_spin(&sched_lock);
		mtx_lock(&Giant);
		mtx_lock_spin(&sched_lock);
		addupc_task(p, p->p_stats->p_prof.pr_addr,
		    p->p_stats->p_prof.pr_ticks);
	}
	if (p->p_sflag & PS_ALRMPEND) {
		p->p_sflag &= ~PS_ALRMPEND;
		mtx_unlock_spin(&sched_lock);
		PROC_LOCK(p);
		psignal(p, SIGVTALRM);
		PROC_UNLOCK(p);
		mtx_lock_spin(&sched_lock);
	}
	if (p->p_sflag & PS_PROFPEND) {
		p->p_sflag &= ~PS_PROFPEND;
		mtx_unlock_spin(&sched_lock);
		PROC_LOCK(p);
		psignal(p, SIGPROF);
		PROC_UNLOCK(p);
	} else
		mtx_unlock_spin(&sched_lock);

	userret(p, framep, sticks);

	if (mtx_owned(&Giant))
		mtx_unlock(&Giant);
}