trap.c revision 323145
/*-
 * Copyright (C) 1994, David Greenman
 * Copyright (c) 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the University of Utah, and William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)trap.c	7.4 (Berkeley) 5/13/91
 */
39
40#include <sys/cdefs.h>
41__FBSDID("$FreeBSD: stable/11/sys/i386/i386/trap.c 323145 2017-09-03 09:16:23Z kib $");
42
/*
 * 386 Trap and System call handling
 */
46
47#include "opt_clock.h"
48#include "opt_cpu.h"
49#include "opt_hwpmc_hooks.h"
50#include "opt_isa.h"
51#include "opt_kdb.h"
52#include "opt_stack.h"
53#include "opt_trap.h"
54
55#include <sys/param.h>
56#include <sys/bus.h>
57#include <sys/systm.h>
58#include <sys/proc.h>
59#include <sys/pioctl.h>
60#include <sys/ptrace.h>
61#include <sys/kdb.h>
62#include <sys/kernel.h>
63#include <sys/ktr.h>
64#include <sys/lock.h>
65#include <sys/mutex.h>
66#include <sys/resourcevar.h>
67#include <sys/signalvar.h>
68#include <sys/syscall.h>
69#include <sys/sysctl.h>
70#include <sys/sysent.h>
71#include <sys/uio.h>
72#include <sys/vmmeter.h>
73#ifdef HWPMC_HOOKS
74#include <sys/pmckern.h>
75PMC_SOFT_DEFINE( , , page_fault, all);
76PMC_SOFT_DEFINE( , , page_fault, read);
77PMC_SOFT_DEFINE( , , page_fault, write);
78#endif
79#include <security/audit/audit.h>
80
81#include <vm/vm.h>
82#include <vm/vm_param.h>
83#include <vm/pmap.h>
84#include <vm/vm_kern.h>
85#include <vm/vm_map.h>
86#include <vm/vm_page.h>
87#include <vm/vm_extern.h>
88
89#include <machine/cpu.h>
90#include <machine/intr_machdep.h>
91#include <x86/mca.h>
92#include <machine/md_var.h>
93#include <machine/pcb.h>
94#ifdef SMP
95#include <machine/smp.h>
96#endif
97#include <machine/stack.h>
98#include <machine/tss.h>
99#include <machine/vm86.h>
100
101#ifdef POWERFAIL_NMI
102#include <sys/syslog.h>
103#include <machine/clock.h>
104#endif
105
106#ifdef KDTRACE_HOOKS
107#include <sys/dtrace_bsd.h>
108#endif
109
110void trap(struct trapframe *frame);
111void syscall(struct trapframe *frame);
112
113static int trap_pfault(struct trapframe *, int, vm_offset_t);
114static void trap_fatal(struct trapframe *, vm_offset_t);
115void dblfault_handler(void);
116
117extern inthand_t IDTVEC(lcall_syscall);
118
/*
 * Human-readable trap names, indexed by trap number (T_*).  Unused
 * trap numbers map to the empty string.  MAX_TRAP_MSG is the highest
 * valid index; trap_fatal() relies on it to bounds-check the lookup.
 */
#define MAX_TRAP_MSG		32
static char *trap_msg[] = {
	"",					/*  0 unused */
	"privileged instruction fault",		/*  1 T_PRIVINFLT */
	"",					/*  2 unused */
	"breakpoint instruction fault",		/*  3 T_BPTFLT */
	"",					/*  4 unused */
	"",					/*  5 unused */
	"arithmetic trap",			/*  6 T_ARITHTRAP */
	"",					/*  7 unused */
	"",					/*  8 unused */
	"general protection fault",		/*  9 T_PROTFLT */
	"trace trap",				/* 10 T_TRCTRAP */
	"",					/* 11 unused */
	"page fault",				/* 12 T_PAGEFLT */
	"",					/* 13 unused */
	"alignment fault",			/* 14 T_ALIGNFLT */
	"",					/* 15 unused */
	"",					/* 16 unused */
	"",					/* 17 unused */
	"integer divide fault",			/* 18 T_DIVIDE */
	"non-maskable interrupt trap",		/* 19 T_NMI */
	"overflow trap",			/* 20 T_OFLOW */
	"FPU bounds check fault",		/* 21 T_BOUND */
	"FPU device not available",		/* 22 T_DNA */
	"double fault",				/* 23 T_DOUBLEFLT */
	"FPU operand fetch fault",		/* 24 T_FPOPFLT */
	"invalid TSS fault",			/* 25 T_TSSFLT */
	"segment not present fault",		/* 26 T_SEGNPFLT */
	"stack fault",				/* 27 T_STKFLT */
	"machine check trap",			/* 28 T_MCHK */
	"SIMD floating-point exception",	/* 29 T_XMMFLT */
	"reserved (unknown) fault",		/* 30 T_RESERVED */
	"",					/* 31 unused (reserved) */
	"DTrace pid return trap",		/* 32 T_DTRACE_RET */
};

/*
 * The table and MAX_TRAP_MSG are maintained by hand; fail the build if
 * they drift apart instead of reading past the end of the array.
 */
_Static_assert(sizeof(trap_msg) / sizeof(trap_msg[0]) == MAX_TRAP_MSG + 1,
    "trap_msg[] must have exactly MAX_TRAP_MSG + 1 entries");
155
156#if defined(I586_CPU) && !defined(NO_F00F_HACK)
157int has_f00f_bug = 0;		/* Initialized so that it can be patched. */
158#endif
159
160static int prot_fault_translation = 0;
161SYSCTL_INT(_machdep, OID_AUTO, prot_fault_translation, CTLFLAG_RW,
162	&prot_fault_translation, 0, "Select signal to deliver on protection fault");
163static int uprintf_signal;
164SYSCTL_INT(_machdep, OID_AUTO, uprintf_signal, CTLFLAG_RW,
165    &uprintf_signal, 0,
166    "Print debugging information on trap signal to ctty");
167
168/*
169 * Exception, fault, and trap interface to the FreeBSD kernel.
170 * This common code is called from assembly language IDT gate entry
171 * routines that prepare a suitable stack frame, and restore this
172 * frame after the exception has been processed.
173 */
174
175void
176trap(struct trapframe *frame)
177{
178#ifdef KDTRACE_HOOKS
179	struct reg regs;
180#endif
181	struct thread *td = curthread;
182	struct proc *p = td->td_proc;
183#ifdef KDB
184	register_t dr6;
185#endif
186	int i = 0, ucode = 0;
187	u_int type;
188	register_t addr = 0;
189	vm_offset_t eva;
190	ksiginfo_t ksi;
191#ifdef POWERFAIL_NMI
192	static int lastalert = 0;
193#endif
194
195	PCPU_INC(cnt.v_trap);
196	type = frame->tf_trapno;
197
198#ifdef SMP
199	/* Handler for NMI IPIs used for stopping CPUs. */
200	if (type == T_NMI) {
201	         if (ipi_nmi_handler() == 0)
202	                   goto out;
203	}
204#endif /* SMP */
205
206#ifdef KDB
207	if (kdb_active) {
208		kdb_reenter();
209		goto out;
210	}
211#endif
212
213	if (type == T_RESERVED) {
214		trap_fatal(frame, 0);
215		goto out;
216	}
217
218	if (type == T_NMI) {
219#ifdef HWPMC_HOOKS
220		/*
221		 * CPU PMCs interrupt using an NMI so we check for that first.
222		 * If the HWPMC module is active, 'pmc_hook' will point to
223		 * the function to be called.  A non-zero return value from the
224		 * hook means that the NMI was consumed by it and that we can
225		 * return immediately.
226		 */
227		if (pmc_intr != NULL &&
228		    (*pmc_intr)(PCPU_GET(cpuid), frame) != 0)
229			goto out;
230#endif
231
232#ifdef STACK
233		if (stack_nmi_handler(frame) != 0)
234			goto out;
235#endif
236	}
237
238	if (type == T_MCHK) {
239		mca_intr();
240		goto out;
241	}
242
243#ifdef KDTRACE_HOOKS
244	/*
245	 * A trap can occur while DTrace executes a probe. Before
246	 * executing the probe, DTrace blocks re-scheduling and sets
247	 * a flag in its per-cpu flags to indicate that it doesn't
248	 * want to fault. On returning from the probe, the no-fault
249	 * flag is cleared and finally re-scheduling is enabled.
250	 */
251	if ((type == T_PROTFLT || type == T_PAGEFLT) &&
252	    dtrace_trap_func != NULL && (*dtrace_trap_func)(frame, type))
253		goto out;
254#endif
255
256	if ((frame->tf_eflags & PSL_I) == 0) {
257		/*
258		 * Buggy application or kernel code has disabled
259		 * interrupts and then trapped.  Enabling interrupts
260		 * now is wrong, but it is better than running with
261		 * interrupts disabled until they are accidentally
262		 * enabled later.
263		 */
264		if (TRAPF_USERMODE(frame) &&
265		    (curpcb->pcb_flags & PCB_VM86CALL) == 0)
266			uprintf(
267			    "pid %ld (%s): trap %d with interrupts disabled\n",
268			    (long)curproc->p_pid, curthread->td_name, type);
269		else if (type != T_NMI && type != T_BPTFLT &&
270		    type != T_TRCTRAP &&
271		    frame->tf_eip != (int)cpu_switch_load_gs) {
272			/*
273			 * XXX not quite right, since this may be for a
274			 * multiple fault in user mode.
275			 */
276			printf("kernel trap %d with interrupts disabled\n",
277			    type);
278			/*
279			 * Page faults need interrupts disabled until later,
280			 * and we shouldn't enable interrupts while holding
281			 * a spin lock.
282			 */
283			if (type != T_PAGEFLT &&
284			    td->td_md.md_spinlock_count == 0)
285				enable_intr();
286		}
287	}
288	eva = 0;
289	if (type == T_PAGEFLT) {
290		/*
291		 * For some Cyrix CPUs, %cr2 is clobbered by
292		 * interrupts.  This problem is worked around by using
293		 * an interrupt gate for the pagefault handler.  We
294		 * are finally ready to read %cr2 and conditionally
295		 * reenable interrupts.  If we hold a spin lock, then
296		 * we must not reenable interrupts.  This might be a
297		 * spurious page fault.
298		 */
299		eva = rcr2();
300		if (td->td_md.md_spinlock_count == 0)
301			enable_intr();
302	}
303
304        if (TRAPF_USERMODE(frame) && (curpcb->pcb_flags & PCB_VM86CALL) == 0) {
305		/* user trap */
306
307		td->td_pticks = 0;
308		td->td_frame = frame;
309		addr = frame->tf_eip;
310		if (td->td_cowgen != p->p_cowgen)
311			thread_cow_update(td);
312
313		switch (type) {
314		case T_PRIVINFLT:	/* privileged instruction fault */
315			i = SIGILL;
316			ucode = ILL_PRVOPC;
317			break;
318
319		case T_BPTFLT:		/* bpt instruction fault */
320		case T_TRCTRAP:		/* trace trap */
321			enable_intr();
322#ifdef KDTRACE_HOOKS
323			if (type == T_BPTFLT) {
324				fill_frame_regs(frame, &regs);
325				if (dtrace_pid_probe_ptr != NULL &&
326				    dtrace_pid_probe_ptr(&regs) == 0)
327					goto out;
328			}
329#endif
330user_trctrap_out:
331			frame->tf_eflags &= ~PSL_T;
332			i = SIGTRAP;
333			ucode = (type == T_TRCTRAP ? TRAP_TRACE : TRAP_BRKPT);
334			break;
335
336		case T_ARITHTRAP:	/* arithmetic trap */
337			ucode = npxtrap_x87();
338			if (ucode == -1)
339				goto userout;
340			i = SIGFPE;
341			break;
342
343			/*
344			 * The following two traps can happen in
345			 * vm86 mode, and, if so, we want to handle
346			 * them specially.
347			 */
348		case T_PROTFLT:		/* general protection fault */
349		case T_STKFLT:		/* stack fault */
350			if (frame->tf_eflags & PSL_VM) {
351				i = vm86_emulate((struct vm86frame *)frame);
352				if (i == SIGTRAP) {
353					type = T_TRCTRAP;
354					load_dr6(rdr6() | 0x4000);
355					goto user_trctrap_out;
356				}
357				if (i == 0)
358					goto user;
359				break;
360			}
361			i = SIGBUS;
362			ucode = (type == T_PROTFLT) ? BUS_OBJERR : BUS_ADRERR;
363			break;
364		case T_SEGNPFLT:	/* segment not present fault */
365			i = SIGBUS;
366			ucode = BUS_ADRERR;
367			break;
368		case T_TSSFLT:		/* invalid TSS fault */
369			i = SIGBUS;
370			ucode = BUS_OBJERR;
371			break;
372		case T_ALIGNFLT:
373			i = SIGBUS;
374			ucode = BUS_ADRALN;
375			break;
376		case T_DOUBLEFLT:	/* double fault */
377		default:
378			i = SIGBUS;
379			ucode = BUS_OBJERR;
380			break;
381
382		case T_PAGEFLT:		/* page fault */
383
384			i = trap_pfault(frame, TRUE, eva);
385#if defined(I586_CPU) && !defined(NO_F00F_HACK)
386			if (i == -2) {
387				/*
388				 * The f00f hack workaround has triggered, so
389				 * treat the fault as an illegal instruction
390				 * (T_PRIVINFLT) instead of a page fault.
391				 */
392				type = frame->tf_trapno = T_PRIVINFLT;
393
394				/* Proceed as in that case. */
395				ucode = ILL_PRVOPC;
396				i = SIGILL;
397				break;
398			}
399#endif
400			if (i == -1)
401				goto userout;
402			if (i == 0)
403				goto user;
404
405			if (i == SIGSEGV)
406				ucode = SEGV_MAPERR;
407			else {
408				if (prot_fault_translation == 0) {
409					/*
410					 * Autodetect.
411					 * This check also covers the images
412					 * without the ABI-tag ELF note.
413					 */
414					if (SV_CURPROC_ABI() == SV_ABI_FREEBSD
415					    && p->p_osrel >= P_OSREL_SIGSEGV) {
416						i = SIGSEGV;
417						ucode = SEGV_ACCERR;
418					} else {
419						i = SIGBUS;
420						ucode = BUS_PAGE_FAULT;
421					}
422				} else if (prot_fault_translation == 1) {
423					/*
424					 * Always compat mode.
425					 */
426					i = SIGBUS;
427					ucode = BUS_PAGE_FAULT;
428				} else {
429					/*
430					 * Always SIGSEGV mode.
431					 */
432					i = SIGSEGV;
433					ucode = SEGV_ACCERR;
434				}
435			}
436			addr = eva;
437			break;
438
439		case T_DIVIDE:		/* integer divide fault */
440			ucode = FPE_INTDIV;
441			i = SIGFPE;
442			break;
443
444#ifdef DEV_ISA
445		case T_NMI:
446#ifdef POWERFAIL_NMI
447#ifndef TIMER_FREQ
448#  define TIMER_FREQ 1193182
449#endif
450			if (time_second - lastalert > 10) {
451				log(LOG_WARNING, "NMI: power fail\n");
452				sysbeep(880, hz);
453				lastalert = time_second;
454			}
455			goto userout;
456#else /* !POWERFAIL_NMI */
457			nmi_handle_intr(type, frame);
458			goto out;
459#endif /* POWERFAIL_NMI */
460#endif /* DEV_ISA */
461
462		case T_OFLOW:		/* integer overflow fault */
463			ucode = FPE_INTOVF;
464			i = SIGFPE;
465			break;
466
467		case T_BOUND:		/* bounds check fault */
468			ucode = FPE_FLTSUB;
469			i = SIGFPE;
470			break;
471
472		case T_DNA:
473			KASSERT(PCB_USER_FPU(td->td_pcb),
474			    ("kernel FPU ctx has leaked"));
475			/* transparent fault (due to context switch "late") */
476			if (npxdna())
477				goto userout;
478			uprintf("pid %d killed due to lack of floating point\n",
479				p->p_pid);
480			i = SIGKILL;
481			ucode = 0;
482			break;
483
484		case T_FPOPFLT:		/* FPU operand fetch fault */
485			ucode = ILL_COPROC;
486			i = SIGILL;
487			break;
488
489		case T_XMMFLT:		/* SIMD floating-point exception */
490			ucode = npxtrap_sse();
491			if (ucode == -1)
492				goto userout;
493			i = SIGFPE;
494			break;
495#ifdef KDTRACE_HOOKS
496		case T_DTRACE_RET:
497			enable_intr();
498			fill_frame_regs(frame, &regs);
499			if (dtrace_return_probe_ptr != NULL &&
500			    dtrace_return_probe_ptr(&regs) == 0)
501				goto out;
502			goto userout;
503#endif
504		}
505	} else {
506		/* kernel trap */
507
508		KASSERT(cold || td->td_ucred != NULL,
509		    ("kernel trap doesn't have ucred"));
510		switch (type) {
511		case T_PAGEFLT:			/* page fault */
512			(void) trap_pfault(frame, FALSE, eva);
513			goto out;
514
515		case T_DNA:
516			if (PCB_USER_FPU(td->td_pcb))
517				panic("Unregistered use of FPU in kernel");
518			if (npxdna())
519				goto out;
520			break;
521
522		case T_ARITHTRAP:	/* arithmetic trap */
523		case T_XMMFLT:		/* SIMD floating-point exception */
524		case T_FPOPFLT:		/* FPU operand fetch fault */
525			/*
526			 * XXXKIB for now disable any FPU traps in kernel
527			 * handler registration seems to be overkill
528			 */
529			trap_fatal(frame, 0);
530			goto out;
531
532			/*
533			 * The following two traps can happen in
534			 * vm86 mode, and, if so, we want to handle
535			 * them specially.
536			 */
537		case T_PROTFLT:		/* general protection fault */
538		case T_STKFLT:		/* stack fault */
539			if (frame->tf_eflags & PSL_VM) {
540				i = vm86_emulate((struct vm86frame *)frame);
541				if (i == SIGTRAP) {
542					type = T_TRCTRAP;
543					load_dr6(rdr6() | 0x4000);
544					goto kernel_trctrap;
545				}
546				if (i != 0)
547					/*
548					 * returns to original process
549					 */
550					vm86_trap((struct vm86frame *)frame);
551				goto out;
552			}
553			/* FALL THROUGH */
554		case T_SEGNPFLT:	/* segment not present fault */
555			if (curpcb->pcb_flags & PCB_VM86CALL)
556				break;
557
558			/*
559			 * Invalid %fs's and %gs's can be created using
560			 * procfs or PT_SETREGS or by invalidating the
561			 * underlying LDT entry.  This causes a fault
562			 * in kernel mode when the kernel attempts to
563			 * switch contexts.  Lose the bad context
564			 * (XXX) so that we can continue, and generate
565			 * a signal.
566			 */
567			if (frame->tf_eip == (int)cpu_switch_load_gs) {
568				curpcb->pcb_gs = 0;
569#if 0
570				PROC_LOCK(p);
571				kern_psignal(p, SIGBUS);
572				PROC_UNLOCK(p);
573#endif
574				goto out;
575			}
576
577			if (td->td_intr_nesting_level != 0)
578				break;
579
580			/*
581			 * Invalid segment selectors and out of bounds
582			 * %eip's and %esp's can be set up in user mode.
583			 * This causes a fault in kernel mode when the
584			 * kernel tries to return to user mode.  We want
585			 * to get this fault so that we can fix the
586			 * problem here and not have to check all the
587			 * selectors and pointers when the user changes
588			 * them.
589			 */
590			if (frame->tf_eip == (int)doreti_iret) {
591				frame->tf_eip = (int)doreti_iret_fault;
592				goto out;
593			}
594			if (type == T_STKFLT)
595				break;
596
597			if (frame->tf_eip == (int)doreti_popl_ds) {
598				frame->tf_eip = (int)doreti_popl_ds_fault;
599				goto out;
600			}
601			if (frame->tf_eip == (int)doreti_popl_es) {
602				frame->tf_eip = (int)doreti_popl_es_fault;
603				goto out;
604			}
605			if (frame->tf_eip == (int)doreti_popl_fs) {
606				frame->tf_eip = (int)doreti_popl_fs_fault;
607				goto out;
608			}
609			if (curpcb->pcb_onfault != NULL) {
610				frame->tf_eip =
611				    (int)curpcb->pcb_onfault;
612				goto out;
613			}
614			break;
615
616		case T_TSSFLT:
617			/*
618			 * PSL_NT can be set in user mode and isn't cleared
619			 * automatically when the kernel is entered.  This
620			 * causes a TSS fault when the kernel attempts to
621			 * `iret' because the TSS link is uninitialized.  We
622			 * want to get this fault so that we can fix the
623			 * problem here and not every time the kernel is
624			 * entered.
625			 */
626			if (frame->tf_eflags & PSL_NT) {
627				frame->tf_eflags &= ~PSL_NT;
628				goto out;
629			}
630			break;
631
632		case T_TRCTRAP:	 /* trace trap */
633kernel_trctrap:
634			if (frame->tf_eip == (int)IDTVEC(lcall_syscall)) {
635				/*
636				 * We've just entered system mode via the
637				 * syscall lcall.  Continue single stepping
638				 * silently until the syscall handler has
639				 * saved the flags.
640				 */
641				goto out;
642			}
643			if (frame->tf_eip == (int)IDTVEC(lcall_syscall) + 1) {
644				/*
645				 * The syscall handler has now saved the
646				 * flags.  Stop single stepping it.
647				 */
648				frame->tf_eflags &= ~PSL_T;
649				goto out;
650			}
651			/*
652			 * Ignore debug register trace traps due to
653			 * accesses in the user's address space, which
654			 * can happen under several conditions such as
655			 * if a user sets a watchpoint on a buffer and
656			 * then passes that buffer to a system call.
657			 * We still want to get TRCTRAPS for addresses
658			 * in kernel space because that is useful when
659			 * debugging the kernel.
660			 */
661			if (user_dbreg_trap() &&
662			   !(curpcb->pcb_flags & PCB_VM86CALL)) {
663				/*
664				 * Reset breakpoint bits because the
665				 * processor doesn't
666				 */
667				load_dr6(rdr6() & ~0xf);
668				goto out;
669			}
670			/*
671			 * FALLTHROUGH (TRCTRAP kernel mode, kernel address)
672			 */
673		case T_BPTFLT:
674			/*
675			 * If KDB is enabled, let it handle the debugger trap.
676			 * Otherwise, debugger traps "can't happen".
677			 */
678#ifdef KDB
679			/* XXX %dr6 is not quite reentrant. */
680			dr6 = rdr6();
681			load_dr6(dr6 & ~0x4000);
682			if (kdb_trap(type, dr6, frame))
683				goto out;
684#endif
685			break;
686
687#ifdef DEV_ISA
688		case T_NMI:
689#ifdef POWERFAIL_NMI
690			if (time_second - lastalert > 10) {
691				log(LOG_WARNING, "NMI: power fail\n");
692				sysbeep(880, hz);
693				lastalert = time_second;
694			}
695			goto out;
696#else /* !POWERFAIL_NMI */
697			nmi_handle_intr(type, frame);
698			goto out;
699#endif /* POWERFAIL_NMI */
700#endif /* DEV_ISA */
701		}
702
703		trap_fatal(frame, eva);
704		goto out;
705	}
706
707	/* Translate fault for emulators (e.g. Linux) */
708	if (*p->p_sysent->sv_transtrap)
709		i = (*p->p_sysent->sv_transtrap)(i, type);
710
711	ksiginfo_init_trap(&ksi);
712	ksi.ksi_signo = i;
713	ksi.ksi_code = ucode;
714	ksi.ksi_addr = (void *)addr;
715	ksi.ksi_trapno = type;
716	if (uprintf_signal) {
717		uprintf("pid %d comm %s: signal %d err %x code %d type %d "
718		    "addr 0x%x esp 0x%08x eip 0x%08x "
719		    "<%02x %02x %02x %02x %02x %02x %02x %02x>\n",
720		    p->p_pid, p->p_comm, i, frame->tf_err, ucode, type, addr,
721		    frame->tf_esp, frame->tf_eip,
722		    fubyte((void *)(frame->tf_eip + 0)),
723		    fubyte((void *)(frame->tf_eip + 1)),
724		    fubyte((void *)(frame->tf_eip + 2)),
725		    fubyte((void *)(frame->tf_eip + 3)),
726		    fubyte((void *)(frame->tf_eip + 4)),
727		    fubyte((void *)(frame->tf_eip + 5)),
728		    fubyte((void *)(frame->tf_eip + 6)),
729		    fubyte((void *)(frame->tf_eip + 7)));
730	}
731	KASSERT((read_eflags() & PSL_I) != 0, ("interrupts disabled"));
732	trapsignal(td, &ksi);
733
734user:
735	userret(td, frame);
736	KASSERT(PCB_USER_FPU(td->td_pcb),
737	    ("Return from trap with kernel FPU ctx leaked"));
738userout:
739out:
740	return;
741}
742
743static int
744trap_pfault(struct trapframe *frame, int usermode, vm_offset_t eva)
745{
746	struct thread *td;
747	struct proc *p;
748	vm_offset_t va;
749	vm_map_t map;
750	int rv;
751	vm_prot_t ftype;
752
753	td = curthread;
754	p = td->td_proc;
755	rv = 0;
756
757	if (__predict_false((td->td_pflags & TDP_NOFAULTING) != 0)) {
758		/*
759		 * Due to both processor errata and lazy TLB invalidation when
760		 * access restrictions are removed from virtual pages, memory
761		 * accesses that are allowed by the physical mapping layer may
762		 * nonetheless cause one spurious page fault per virtual page.
763		 * When the thread is executing a "no faulting" section that
764		 * is bracketed by vm_fault_{disable,enable}_pagefaults(),
765		 * every page fault is treated as a spurious page fault,
766		 * unless it accesses the same virtual address as the most
767		 * recent page fault within the same "no faulting" section.
768		 */
769		if (td->td_md.md_spurflt_addr != eva ||
770		    (td->td_pflags & TDP_RESETSPUR) != 0) {
771			/*
772			 * Do nothing to the TLB.  A stale TLB entry is
773			 * flushed automatically by a page fault.
774			 */
775			td->td_md.md_spurflt_addr = eva;
776			td->td_pflags &= ~TDP_RESETSPUR;
777			return (0);
778		}
779	} else {
780		/*
781		 * If we get a page fault while in a critical section, then
782		 * it is most likely a fatal kernel page fault.  The kernel
783		 * is already going to panic trying to get a sleep lock to
784		 * do the VM lookup, so just consider it a fatal trap so the
785		 * kernel can print out a useful trap message and even get
786		 * to the debugger.
787		 *
788		 * If we get a page fault while holding a non-sleepable
789		 * lock, then it is most likely a fatal kernel page fault.
790		 * If WITNESS is enabled, then it's going to whine about
791		 * bogus LORs with various VM locks, so just skip to the
792		 * fatal trap handling directly.
793		 */
794		if (td->td_critnest != 0 ||
795		    WITNESS_CHECK(WARN_SLEEPOK | WARN_GIANTOK, NULL,
796		    "Kernel page fault") != 0) {
797			trap_fatal(frame, eva);
798			return (-1);
799		}
800	}
801	va = trunc_page(eva);
802	if (va >= KERNBASE) {
803		/*
804		 * Don't allow user-mode faults in kernel address space.
805		 * An exception:  if the faulting address is the invalid
806		 * instruction entry in the IDT, then the Intel Pentium
807		 * F00F bug workaround was triggered, and we need to
808		 * treat it is as an illegal instruction, and not a page
809		 * fault.
810		 */
811#if defined(I586_CPU) && !defined(NO_F00F_HACK)
812		if ((eva == (unsigned int)&idt[6]) && has_f00f_bug)
813			return (-2);
814#endif
815		if (usermode)
816			goto nogo;
817
818		map = kernel_map;
819	} else {
820		map = &p->p_vmspace->vm_map;
821
822		/*
823		 * When accessing a user-space address, kernel must be
824		 * ready to accept the page fault, and provide a
825		 * handling routine.  Since accessing the address
826		 * without the handler is a bug, do not try to handle
827		 * it normally, and panic immediately.
828		 */
829		if (!usermode && (td->td_intr_nesting_level != 0 ||
830		    curpcb->pcb_onfault == NULL)) {
831			trap_fatal(frame, eva);
832			return (-1);
833		}
834	}
835
836	/*
837	 * If the trap was caused by errant bits in the PTE then panic.
838	 */
839	if (frame->tf_err & PGEX_RSV) {
840		trap_fatal(frame, eva);
841		return (-1);
842	}
843
844	/*
845	 * PGEX_I is defined only if the execute disable bit capability is
846	 * supported and enabled.
847	 */
848	if (frame->tf_err & PGEX_W)
849		ftype = VM_PROT_WRITE;
850#if defined(PAE) || defined(PAE_TABLES)
851	else if ((frame->tf_err & PGEX_I) && pg_nx != 0)
852		ftype = VM_PROT_EXECUTE;
853#endif
854	else
855		ftype = VM_PROT_READ;
856
857	/* Fault in the page. */
858	rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL);
859	if (rv == KERN_SUCCESS) {
860#ifdef HWPMC_HOOKS
861		if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
862			PMC_SOFT_CALL_TF( , , page_fault, all, frame);
863			if (ftype == VM_PROT_READ)
864				PMC_SOFT_CALL_TF( , , page_fault, read,
865				    frame);
866			else
867				PMC_SOFT_CALL_TF( , , page_fault, write,
868				    frame);
869		}
870#endif
871		return (0);
872	}
873nogo:
874	if (!usermode) {
875		if (td->td_intr_nesting_level == 0 &&
876		    curpcb->pcb_onfault != NULL) {
877			frame->tf_eip = (int)curpcb->pcb_onfault;
878			return (0);
879		}
880		trap_fatal(frame, eva);
881		return (-1);
882	}
883	return ((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV);
884}
885
886static void
887trap_fatal(frame, eva)
888	struct trapframe *frame;
889	vm_offset_t eva;
890{
891	int code, ss, esp;
892	u_int type;
893	struct soft_segment_descriptor softseg;
894	char *msg;
895
896	code = frame->tf_err;
897	type = frame->tf_trapno;
898	sdtossd(&gdt[IDXSEL(frame->tf_cs & 0xffff)].sd, &softseg);
899
900	if (type <= MAX_TRAP_MSG)
901		msg = trap_msg[type];
902	else
903		msg = "UNKNOWN";
904	printf("\n\nFatal trap %d: %s while in %s mode\n", type, msg,
905	    frame->tf_eflags & PSL_VM ? "vm86" :
906	    ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel");
907#ifdef SMP
908	/* two separate prints in case of a trap on an unmapped page */
909	printf("cpuid = %d; ", PCPU_GET(cpuid));
910	printf("apic id = %02x\n", PCPU_GET(apic_id));
911#endif
912	if (type == T_PAGEFLT) {
913		printf("fault virtual address	= 0x%x\n", eva);
914		printf("fault code		= %s %s%s, %s\n",
915			code & PGEX_U ? "user" : "supervisor",
916			code & PGEX_W ? "write" : "read",
917#if defined(PAE) || defined(PAE_TABLES)
918			pg_nx != 0 ?
919			(code & PGEX_I ? " instruction" : " data") :
920#endif
921			"",
922			code & PGEX_RSV ? "reserved bits in PTE" :
923			code & PGEX_P ? "protection violation" : "page not present");
924	}
925	printf("instruction pointer	= 0x%x:0x%x\n",
926	       frame->tf_cs & 0xffff, frame->tf_eip);
927        if (TF_HAS_STACKREGS(frame)) {
928		ss = frame->tf_ss & 0xffff;
929		esp = frame->tf_esp;
930	} else {
931		ss = GSEL(GDATA_SEL, SEL_KPL);
932		esp = (int)&frame->tf_esp;
933	}
934	printf("stack pointer	        = 0x%x:0x%x\n", ss, esp);
935	printf("frame pointer	        = 0x%x:0x%x\n", ss, frame->tf_ebp);
936	printf("code segment		= base 0x%x, limit 0x%x, type 0x%x\n",
937	       softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type);
938	printf("			= DPL %d, pres %d, def32 %d, gran %d\n",
939	       softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_def32,
940	       softseg.ssd_gran);
941	printf("processor eflags	= ");
942	if (frame->tf_eflags & PSL_T)
943		printf("trace trap, ");
944	if (frame->tf_eflags & PSL_I)
945		printf("interrupt enabled, ");
946	if (frame->tf_eflags & PSL_NT)
947		printf("nested task, ");
948	if (frame->tf_eflags & PSL_RF)
949		printf("resume, ");
950	if (frame->tf_eflags & PSL_VM)
951		printf("vm86, ");
952	printf("IOPL = %d\n", (frame->tf_eflags & PSL_IOPL) >> 12);
953	printf("current process		= %d (%s)\n",
954	    curproc->p_pid, curthread->td_name);
955
956#ifdef KDB
957	if (debugger_on_panic || kdb_active) {
958		frame->tf_err = eva;	/* smuggle fault address to ddb */
959		if (kdb_trap(type, 0, frame)) {
960			frame->tf_err = code;	/* restore error code */
961			return;
962		}
963		frame->tf_err = code;		/* restore error code */
964	}
965#endif
966	printf("trap number		= %d\n", type);
967	if (type <= MAX_TRAP_MSG)
968		panic("%s", trap_msg[type]);
969	else
970		panic("unknown/reserved trap");
971}
972
973/*
974 * Double fault handler. Called when a fault occurs while writing
975 * a frame for a trap/exception onto the stack. This usually occurs
976 * when the stack overflows (such is the case with infinite recursion,
977 * for example).
978 *
979 * XXX Note that the current PTD gets replaced by IdlePTD when the
980 * task switch occurs. This means that the stack that was active at
981 * the time of the double fault is not available at <kstack> unless
982 * the machine was idle when the double fault occurred. The downside
983 * of this is that "trace <ebp>" in ddb won't work.
984 */
985void
986dblfault_handler()
987{
988#ifdef KDTRACE_HOOKS
989	if (dtrace_doubletrap_func != NULL)
990		(*dtrace_doubletrap_func)();
991#endif
992	printf("\nFatal double fault:\n");
993	printf("eip = 0x%x\n", PCPU_GET(common_tss.tss_eip));
994	printf("esp = 0x%x\n", PCPU_GET(common_tss.tss_esp));
995	printf("ebp = 0x%x\n", PCPU_GET(common_tss.tss_ebp));
996#ifdef SMP
997	/* two separate prints in case of a trap on an unmapped page */
998	printf("cpuid = %d; ", PCPU_GET(cpuid));
999	printf("apic id = %02x\n", PCPU_GET(apic_id));
1000#endif
1001	panic("double fault");
1002}
1003
1004int
1005cpu_fetch_syscall_args(struct thread *td)
1006{
1007	struct proc *p;
1008	struct trapframe *frame;
1009	struct syscall_args *sa;
1010	caddr_t params;
1011	long tmp;
1012	int error;
1013
1014	p = td->td_proc;
1015	frame = td->td_frame;
1016	sa = &td->td_sa;
1017
1018	params = (caddr_t)frame->tf_esp + sizeof(int);
1019	sa->code = frame->tf_eax;
1020
1021	/*
1022	 * Need to check if this is a 32 bit or 64 bit syscall.
1023	 */
1024	if (sa->code == SYS_syscall) {
1025		/*
1026		 * Code is first argument, followed by actual args.
1027		 */
1028		error = fueword(params, &tmp);
1029		if (error == -1)
1030			return (EFAULT);
1031		sa->code = tmp;
1032		params += sizeof(int);
1033	} else if (sa->code == SYS___syscall) {
1034		/*
1035		 * Like syscall, but code is a quad, so as to maintain
1036		 * quad alignment for the rest of the arguments.
1037		 */
1038		error = fueword(params, &tmp);
1039		if (error == -1)
1040			return (EFAULT);
1041		sa->code = tmp;
1042		params += sizeof(quad_t);
1043	}
1044
1045 	if (p->p_sysent->sv_mask)
1046 		sa->code &= p->p_sysent->sv_mask;
1047 	if (sa->code >= p->p_sysent->sv_size)
1048 		sa->callp = &p->p_sysent->sv_table[0];
1049  	else
1050 		sa->callp = &p->p_sysent->sv_table[sa->code];
1051	sa->narg = sa->callp->sy_narg;
1052
1053	if (params != NULL && sa->narg != 0)
1054		error = copyin(params, (caddr_t)sa->args,
1055		    (u_int)(sa->narg * sizeof(int)));
1056	else
1057		error = 0;
1058
1059	if (error == 0) {
1060		td->td_retval[0] = 0;
1061		td->td_retval[1] = frame->tf_edx;
1062	}
1063
1064	return (error);
1065}
1066
1067#include "../../kern/subr_syscall.c"
1068
1069/*
1070 * syscall - system call request C handler.  A system call is
1071 * essentially treated as a trap by reusing the frame layout.
1072 */
1073void
1074syscall(struct trapframe *frame)
1075{
1076	struct thread *td;
1077	register_t orig_tf_eflags;
1078	int error;
1079	ksiginfo_t ksi;
1080
1081#ifdef DIAGNOSTIC
1082	if (!(TRAPF_USERMODE(frame) &&
1083	    (curpcb->pcb_flags & PCB_VM86CALL) == 0)) {
1084		panic("syscall");
1085		/* NOT REACHED */
1086	}
1087#endif
1088	orig_tf_eflags = frame->tf_eflags;
1089
1090	td = curthread;
1091	td->td_frame = frame;
1092
1093	error = syscallenter(td);
1094
1095	/*
1096	 * Traced syscall.
1097	 */
1098	if ((orig_tf_eflags & PSL_T) && !(orig_tf_eflags & PSL_VM)) {
1099		frame->tf_eflags &= ~PSL_T;
1100		ksiginfo_init_trap(&ksi);
1101		ksi.ksi_signo = SIGTRAP;
1102		ksi.ksi_code = TRAP_TRACE;
1103		ksi.ksi_addr = (void *)frame->tf_eip;
1104		trapsignal(td, &ksi);
1105	}
1106
1107	KASSERT(PCB_USER_FPU(td->td_pcb),
1108	    ("System call %s returning with kernel FPU ctx leaked",
1109	     syscallname(td->td_proc, td->td_sa.code)));
1110	KASSERT(td->td_pcb->pcb_save == get_pcb_user_save_td(td),
1111	    ("System call %s returning with mangled pcb_save",
1112	     syscallname(td->td_proc, td->td_sa.code)));
1113
1114	syscallret(td, error);
1115}
1116