1/*-
2 * Copyright (C) 1994, David Greenman
3 * Copyright (c) 1990, 1993
4 *	The Regents of the University of California.  All rights reserved.
5 *
6 * This code is derived from software contributed to Berkeley by
7 * the University of Utah, and William Jolitz.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 *    notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 *    notice, this list of conditions and the following disclaimer in the
16 *    documentation and/or other materials provided with the distribution.
17 * 3. All advertising materials mentioning features or use of this software
18 *    must display the following acknowledgement:
19 *	This product includes software developed by the University of
20 *	California, Berkeley and its contributors.
21 * 4. Neither the name of the University nor the names of its contributors
22 *    may be used to endorse or promote products derived from this software
23 *    without specific prior written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 * SUCH DAMAGE.
36 *
37 *	from: @(#)trap.c	7.4 (Berkeley) 5/13/91
38 */
39
40#include <sys/cdefs.h>
41__FBSDID("$FreeBSD: stable/11/sys/i386/i386/trap.c 344905 2019-03-08 00:20:37Z jhb $");
42
43/*
44 * 386 Trap and System call handling
45 */
46
47#include "opt_clock.h"
48#include "opt_cpu.h"
49#include "opt_hwpmc_hooks.h"
50#include "opt_isa.h"
51#include "opt_kdb.h"
52#include "opt_stack.h"
53#include "opt_trap.h"
54
55#include <sys/param.h>
56#include <sys/bus.h>
57#include <sys/systm.h>
58#include <sys/proc.h>
59#include <sys/pioctl.h>
60#include <sys/ptrace.h>
61#include <sys/kdb.h>
62#include <sys/kernel.h>
63#include <sys/ktr.h>
64#include <sys/lock.h>
65#include <sys/mutex.h>
66#include <sys/resourcevar.h>
67#include <sys/signalvar.h>
68#include <sys/syscall.h>
69#include <sys/sysctl.h>
70#include <sys/sysent.h>
71#include <sys/uio.h>
72#include <sys/vmmeter.h>
73#ifdef HWPMC_HOOKS
74#include <sys/pmckern.h>
75PMC_SOFT_DEFINE( , , page_fault, all);
76PMC_SOFT_DEFINE( , , page_fault, read);
77PMC_SOFT_DEFINE( , , page_fault, write);
78#endif
79#include <security/audit/audit.h>
80
81#include <vm/vm.h>
82#include <vm/vm_param.h>
83#include <vm/pmap.h>
84#include <vm/vm_kern.h>
85#include <vm/vm_map.h>
86#include <vm/vm_page.h>
87#include <vm/vm_extern.h>
88
89#include <machine/cpu.h>
90#include <machine/intr_machdep.h>
91#include <x86/mca.h>
92#include <machine/md_var.h>
93#include <machine/pcb.h>
94#ifdef SMP
95#include <machine/smp.h>
96#endif
97#include <machine/stack.h>
98#include <machine/tss.h>
99#include <machine/vm86.h>
100
101#ifdef POWERFAIL_NMI
102#include <sys/syslog.h>
103#include <machine/clock.h>
104#endif
105
106#ifdef KDTRACE_HOOKS
107#include <sys/dtrace_bsd.h>
108#endif
109
110void trap(struct trapframe *frame);
111void syscall(struct trapframe *frame);
112
113static int trap_pfault(struct trapframe *, int, vm_offset_t);
114static void trap_fatal(struct trapframe *, vm_offset_t);
115void dblfault_handler(void);
116
117extern inthand_t IDTVEC(lcall_syscall);
118
119extern inthand_t IDTVEC(bpt), IDTVEC(dbg), IDTVEC(int0x80_syscall);
120
#define MAX_TRAP_MSG		32
/*
 * Human-readable trap descriptions, indexed by the T_* trap number
 * delivered in tf_trapno.  Slots that are unused on i386 hold empty
 * strings.  MAX_TRAP_MSG must equal the index of the last entry; it is
 * used as the bounds check in trap_fatal().
 */
static char *trap_msg[] = {
	"",					/*  0 unused */
	"privileged instruction fault",		/*  1 T_PRIVINFLT */
	"",					/*  2 unused */
	"breakpoint instruction fault",		/*  3 T_BPTFLT */
	"",					/*  4 unused */
	"",					/*  5 unused */
	"arithmetic trap",			/*  6 T_ARITHTRAP */
	"",					/*  7 unused */
	"",					/*  8 unused */
	"general protection fault",		/*  9 T_PROTFLT */
	"debug exception",			/* 10 T_TRCTRAP */
	"",					/* 11 unused */
	"page fault",				/* 12 T_PAGEFLT */
	"",					/* 13 unused */
	"alignment fault",			/* 14 T_ALIGNFLT */
	"",					/* 15 unused */
	"",					/* 16 unused */
	"",					/* 17 unused */
	"integer divide fault",			/* 18 T_DIVIDE */
	"non-maskable interrupt trap",		/* 19 T_NMI */
	"overflow trap",			/* 20 T_OFLOW */
	"FPU bounds check fault",		/* 21 T_BOUND */
	"FPU device not available",		/* 22 T_DNA */
	"double fault",				/* 23 T_DOUBLEFLT */
	"FPU operand fetch fault",		/* 24 T_FPOPFLT */
	"invalid TSS fault",			/* 25 T_TSSFLT */
	"segment not present fault",		/* 26 T_SEGNPFLT */
	"stack fault",				/* 27 T_STKFLT */
	"machine check trap",			/* 28 T_MCHK */
	"SIMD floating-point exception",	/* 29 T_XMMFLT */
	"reserved (unknown) fault",		/* 30 T_RESERVED */
	"",					/* 31 unused (reserved) */
	"DTrace pid return trap",               /* 32 T_DTRACE_RET */
};
157
#if defined(I586_CPU) && !defined(NO_F00F_HACK)
int has_f00f_bug = 0;		/* Initialized so that it can be patched. */
#endif

/*
 * machdep.prot_fault_translation: selects the signal delivered for a
 * protection-mode page fault in trap()'s T_PAGEFLT handling:
 * 0 = autodetect from the process ABI/osrel, 1 = always SIGBUS
 * (compat), any other value = always SIGSEGV.
 */
static int prot_fault_translation = 0;
SYSCTL_INT(_machdep, OID_AUTO, prot_fault_translation, CTLFLAG_RW,
	&prot_fault_translation, 0, "Select signal to deliver on protection fault");
/*
 * machdep.uprintf_signal: when non-zero, trap() dumps the faulting
 * state (registers and leading instruction bytes) to the controlling
 * tty before delivering a trap signal.
 */
static int uprintf_signal;
SYSCTL_INT(_machdep, OID_AUTO, uprintf_signal, CTLFLAG_RW,
    &uprintf_signal, 0,
    "Print debugging information on trap signal to ctty");
169
170/*
171 * Exception, fault, and trap interface to the FreeBSD kernel.
172 * This common code is called from assembly language IDT gate entry
173 * routines that prepare a suitable stack frame, and restore this
174 * frame after the exception has been processed.
175 */
176
177void
178trap(struct trapframe *frame)
179{
180	ksiginfo_t ksi;
181	struct thread *td;
182	struct proc *p;
183	int signo, ucode;
184	u_int type;
185	register_t addr, dr6;
186	vm_offset_t eva;
187#ifdef POWERFAIL_NMI
188	static int lastalert = 0;
189#endif
190
191	td = curthread;
192	p = td->td_proc;
193	signo = 0;
194	ucode = 0;
195	addr = 0;
196	dr6 = 0;
197
198	PCPU_INC(cnt.v_trap);
199	type = frame->tf_trapno;
200
201#ifdef SMP
202	/* Handler for NMI IPIs used for stopping CPUs. */
203	if (type == T_NMI && ipi_nmi_handler() == 0)
204		return;
205#endif /* SMP */
206
207#ifdef KDB
208	if (kdb_active) {
209		kdb_reenter();
210		return;
211	}
212#endif
213
214	if (type == T_RESERVED) {
215		trap_fatal(frame, 0);
216		return;
217	}
218
219	if (type == T_NMI) {
220#ifdef HWPMC_HOOKS
221		/*
222		 * CPU PMCs interrupt using an NMI so we check for that first.
223		 * If the HWPMC module is active, 'pmc_hook' will point to
224		 * the function to be called.  A non-zero return value from the
225		 * hook means that the NMI was consumed by it and that we can
226		 * return immediately.
227		 */
228		if (pmc_intr != NULL &&
229		    (*pmc_intr)(PCPU_GET(cpuid), frame) != 0)
230			return;
231#endif
232
233#ifdef STACK
234		if (stack_nmi_handler(frame) != 0)
235			return;
236#endif
237	}
238
239	if (type == T_MCHK) {
240		mca_intr();
241		return;
242	}
243
244#ifdef KDTRACE_HOOKS
245	/*
246	 * A trap can occur while DTrace executes a probe. Before
247	 * executing the probe, DTrace blocks re-scheduling and sets
248	 * a flag in its per-cpu flags to indicate that it doesn't
249	 * want to fault. On returning from the probe, the no-fault
250	 * flag is cleared and finally re-scheduling is enabled.
251	 */
252	if ((type == T_PROTFLT || type == T_PAGEFLT) &&
253	    dtrace_trap_func != NULL && (*dtrace_trap_func)(frame, type))
254		return;
255#endif
256
257	if ((frame->tf_eflags & PSL_I) == 0) {
258		/*
259		 * Buggy application or kernel code has disabled
260		 * interrupts and then trapped.  Enabling interrupts
261		 * now is wrong, but it is better than running with
262		 * interrupts disabled until they are accidentally
263		 * enabled later.
264		 */
265		if (TRAPF_USERMODE(frame) &&
266		    (curpcb->pcb_flags & PCB_VM86CALL) == 0)
267			uprintf(
268			    "pid %ld (%s): trap %d with interrupts disabled\n",
269			    (long)curproc->p_pid, curthread->td_name, type);
270		else if (type != T_NMI && type != T_BPTFLT &&
271		    type != T_TRCTRAP &&
272		    frame->tf_eip != (int)cpu_switch_load_gs) {
273			/*
274			 * XXX not quite right, since this may be for a
275			 * multiple fault in user mode.
276			 */
277			printf("kernel trap %d with interrupts disabled\n",
278			    type);
279			/*
280			 * Page faults need interrupts disabled until later,
281			 * and we shouldn't enable interrupts while holding
282			 * a spin lock.
283			 */
284			if (type != T_PAGEFLT &&
285			    td->td_md.md_spinlock_count == 0)
286				enable_intr();
287		}
288	}
289	eva = 0;
290	if (type == T_PAGEFLT) {
291		/*
292		 * For some Cyrix CPUs, %cr2 is clobbered by
293		 * interrupts.  This problem is worked around by using
294		 * an interrupt gate for the pagefault handler.  We
295		 * are finally ready to read %cr2 and conditionally
296		 * reenable interrupts.  If we hold a spin lock, then
297		 * we must not reenable interrupts.  This might be a
298		 * spurious page fault.
299		 */
300		eva = rcr2();
301		if (td->td_md.md_spinlock_count == 0)
302			enable_intr();
303	}
304
305        if (TRAPF_USERMODE(frame) && (curpcb->pcb_flags & PCB_VM86CALL) == 0) {
306		/* user trap */
307
308		td->td_pticks = 0;
309		td->td_frame = frame;
310		addr = frame->tf_eip;
311		if (td->td_cowgen != p->p_cowgen)
312			thread_cow_update(td);
313
314		switch (type) {
315		case T_PRIVINFLT:	/* privileged instruction fault */
316			signo = SIGILL;
317			ucode = ILL_PRVOPC;
318			break;
319
320		case T_BPTFLT:		/* bpt instruction fault */
321			enable_intr();
322#ifdef KDTRACE_HOOKS
323			if (dtrace_pid_probe_ptr != NULL &&
324			    dtrace_pid_probe_ptr(frame) == 0)
325				return;
326#endif
327			signo = SIGTRAP;
328			ucode = TRAP_BRKPT;
329			break;
330
331		case T_TRCTRAP:		/* debug exception */
332			enable_intr();
333user_trctrap_out:
334			signo = SIGTRAP;
335			ucode = TRAP_TRACE;
336			dr6 = rdr6();
337			if ((dr6 & DBREG_DR6_BS) != 0) {
338				PROC_LOCK(td->td_proc);
339				if ((td->td_dbgflags & TDB_STEP) != 0) {
340					td->td_frame->tf_eflags &= ~PSL_T;
341					td->td_dbgflags &= ~TDB_STEP;
342				}
343				PROC_UNLOCK(td->td_proc);
344			}
345			break;
346
347		case T_ARITHTRAP:	/* arithmetic trap */
348			ucode = npxtrap_x87();
349			if (ucode == -1)
350				return;
351			signo = SIGFPE;
352			break;
353
354		/*
355		 * The following two traps can happen in vm86 mode,
356		 * and, if so, we want to handle them specially.
357		 */
358		case T_PROTFLT:		/* general protection fault */
359		case T_STKFLT:		/* stack fault */
360			if (frame->tf_eflags & PSL_VM) {
361				signo = vm86_emulate((struct vm86frame *)frame);
362				if (signo == SIGTRAP) {
363					type = T_TRCTRAP;
364					load_dr6(rdr6() | 0x4000);
365					goto user_trctrap_out;
366				}
367				if (signo == 0)
368					goto user;
369				break;
370			}
371			signo = SIGBUS;
372			ucode = (type == T_PROTFLT) ? BUS_OBJERR : BUS_ADRERR;
373			break;
374		case T_SEGNPFLT:	/* segment not present fault */
375			signo = SIGBUS;
376			ucode = BUS_ADRERR;
377			break;
378		case T_TSSFLT:		/* invalid TSS fault */
379			signo = SIGBUS;
380			ucode = BUS_OBJERR;
381			break;
382		case T_ALIGNFLT:
383			signo = SIGBUS;
384			ucode = BUS_ADRALN;
385			break;
386		case T_DOUBLEFLT:	/* double fault */
387		default:
388			signo = SIGBUS;
389			ucode = BUS_OBJERR;
390			break;
391
392		case T_PAGEFLT:		/* page fault */
393			signo = trap_pfault(frame, TRUE, eva);
394#if defined(I586_CPU) && !defined(NO_F00F_HACK)
395			if (signo == -2) {
396				/*
397				 * The f00f hack workaround has triggered, so
398				 * treat the fault as an illegal instruction
399				 * (T_PRIVINFLT) instead of a page fault.
400				 */
401				type = frame->tf_trapno = T_PRIVINFLT;
402
403				/* Proceed as in that case. */
404				ucode = ILL_PRVOPC;
405				signo = SIGILL;
406				break;
407			}
408#endif
409			if (signo == -1)
410				return;
411			if (signo == 0)
412				goto user;
413
414			if (signo == SIGSEGV)
415				ucode = SEGV_MAPERR;
416			else if (prot_fault_translation == 0) {
417				/*
418				 * Autodetect.  This check also covers
419				 * the images without the ABI-tag ELF
420				 * note.
421				 */
422				if (SV_CURPROC_ABI() == SV_ABI_FREEBSD &&
423				    p->p_osrel >= P_OSREL_SIGSEGV) {
424					signo = SIGSEGV;
425					ucode = SEGV_ACCERR;
426				} else {
427					signo = SIGBUS;
428					ucode = BUS_PAGE_FAULT;
429				}
430			} else if (prot_fault_translation == 1) {
431				/*
432				 * Always compat mode.
433				 */
434				signo = SIGBUS;
435				ucode = BUS_PAGE_FAULT;
436			} else {
437				/*
438				 * Always SIGSEGV mode.
439				 */
440				signo = SIGSEGV;
441				ucode = SEGV_ACCERR;
442			}
443			addr = eva;
444			break;
445
446		case T_DIVIDE:		/* integer divide fault */
447			ucode = FPE_INTDIV;
448			signo = SIGFPE;
449			break;
450
451#ifdef DEV_ISA
452		case T_NMI:
453#ifdef POWERFAIL_NMI
454#ifndef TIMER_FREQ
455#  define TIMER_FREQ 1193182
456#endif
457			if (time_second - lastalert > 10) {
458				log(LOG_WARNING, "NMI: power fail\n");
459				sysbeep(880, hz);
460				lastalert = time_second;
461			}
462			return;
463#else /* !POWERFAIL_NMI */
464			nmi_handle_intr(type, frame);
465			return;
466#endif /* POWERFAIL_NMI */
467#endif /* DEV_ISA */
468
469		case T_OFLOW:		/* integer overflow fault */
470			ucode = FPE_INTOVF;
471			signo = SIGFPE;
472			break;
473
474		case T_BOUND:		/* bounds check fault */
475			ucode = FPE_FLTSUB;
476			signo = SIGFPE;
477			break;
478
479		case T_DNA:
480			KASSERT(PCB_USER_FPU(td->td_pcb),
481			    ("kernel FPU ctx has leaked"));
482			/* transparent fault (due to context switch "late") */
483			if (npxdna())
484				return;
485			uprintf("pid %d killed due to lack of floating point\n",
486				p->p_pid);
487			signo = SIGKILL;
488			ucode = 0;
489			break;
490
491		case T_FPOPFLT:		/* FPU operand fetch fault */
492			ucode = ILL_COPROC;
493			signo = SIGILL;
494			break;
495
496		case T_XMMFLT:		/* SIMD floating-point exception */
497			ucode = npxtrap_sse();
498			if (ucode == -1)
499				return;
500			signo = SIGFPE;
501			break;
502#ifdef KDTRACE_HOOKS
503		case T_DTRACE_RET:
504			enable_intr();
505			if (dtrace_return_probe_ptr != NULL)
506				dtrace_return_probe_ptr(frame);
507			return;
508#endif
509		}
510	} else {
511		/* kernel trap */
512
513		KASSERT(cold || td->td_ucred != NULL,
514		    ("kernel trap doesn't have ucred"));
515		switch (type) {
516		case T_PAGEFLT:			/* page fault */
517			(void) trap_pfault(frame, FALSE, eva);
518			return;
519
520		case T_DNA:
521			if (PCB_USER_FPU(td->td_pcb))
522				panic("Unregistered use of FPU in kernel");
523			if (npxdna())
524				return;
525			break;
526
527		case T_ARITHTRAP:	/* arithmetic trap */
528		case T_XMMFLT:		/* SIMD floating-point exception */
529		case T_FPOPFLT:		/* FPU operand fetch fault */
530			/*
531			 * XXXKIB for now disable any FPU traps in kernel
532			 * handler registration seems to be overkill
533			 */
534			trap_fatal(frame, 0);
535			return;
536
537			/*
538			 * The following two traps can happen in
539			 * vm86 mode, and, if so, we want to handle
540			 * them specially.
541			 */
542		case T_PROTFLT:		/* general protection fault */
543		case T_STKFLT:		/* stack fault */
544			if (frame->tf_eflags & PSL_VM) {
545				signo = vm86_emulate((struct vm86frame *)frame);
546				if (signo == SIGTRAP) {
547					type = T_TRCTRAP;
548					load_dr6(rdr6() | 0x4000);
549					goto kernel_trctrap;
550				}
551				if (signo != 0)
552					/*
553					 * returns to original process
554					 */
555					vm86_trap((struct vm86frame *)frame);
556				return;
557			}
558			/* FALL THROUGH */
559		case T_SEGNPFLT:	/* segment not present fault */
560			if (curpcb->pcb_flags & PCB_VM86CALL)
561				break;
562
563			/*
564			 * Invalid %fs's and %gs's can be created using
565			 * procfs or PT_SETREGS or by invalidating the
566			 * underlying LDT entry.  This causes a fault
567			 * in kernel mode when the kernel attempts to
568			 * switch contexts.  Lose the bad context
569			 * (XXX) so that we can continue, and generate
570			 * a signal.
571			 */
572			if (frame->tf_eip == (int)cpu_switch_load_gs) {
573				curpcb->pcb_gs = 0;
574#if 0
575				PROC_LOCK(p);
576				kern_psignal(p, SIGBUS);
577				PROC_UNLOCK(p);
578#endif
579				return;
580			}
581
582			if (td->td_intr_nesting_level != 0)
583				break;
584
585			/*
586			 * Invalid segment selectors and out of bounds
587			 * %eip's and %esp's can be set up in user mode.
588			 * This causes a fault in kernel mode when the
589			 * kernel tries to return to user mode.  We want
590			 * to get this fault so that we can fix the
591			 * problem here and not have to check all the
592			 * selectors and pointers when the user changes
593			 * them.
594			 */
595			if (frame->tf_eip == (int)doreti_iret) {
596				frame->tf_eip = (int)doreti_iret_fault;
597				return;
598			}
599			if (type == T_STKFLT)
600				break;
601
602			if (frame->tf_eip == (int)doreti_popl_ds) {
603				frame->tf_eip = (int)doreti_popl_ds_fault;
604				return;
605			}
606			if (frame->tf_eip == (int)doreti_popl_es) {
607				frame->tf_eip = (int)doreti_popl_es_fault;
608				return;
609			}
610			if (frame->tf_eip == (int)doreti_popl_fs) {
611				frame->tf_eip = (int)doreti_popl_fs_fault;
612				return;
613			}
614			if (curpcb->pcb_onfault != NULL) {
615				frame->tf_eip = (int)curpcb->pcb_onfault;
616				return;
617			}
618			break;
619
620		case T_TSSFLT:
621			/*
622			 * PSL_NT can be set in user mode and isn't cleared
623			 * automatically when the kernel is entered.  This
624			 * causes a TSS fault when the kernel attempts to
625			 * `iret' because the TSS link is uninitialized.  We
626			 * want to get this fault so that we can fix the
627			 * problem here and not every time the kernel is
628			 * entered.
629			 */
630			if (frame->tf_eflags & PSL_NT) {
631				frame->tf_eflags &= ~PSL_NT;
632				return;
633			}
634			break;
635
636		case T_TRCTRAP:	 /* debug exception */
637kernel_trctrap:
638			/* Clear any pending debug events. */
639			dr6 = rdr6();
640			load_dr6(0);
641
642			if (frame->tf_eip == (int)IDTVEC(lcall_syscall)) {
643				/*
644				 * We've just entered system mode via the
645				 * syscall lcall.  Continue single stepping
646				 * silently until the syscall handler has
647				 * saved the flags.
648				 */
649				return;
650			}
651			if (frame->tf_eip == (int)IDTVEC(lcall_syscall) + 1) {
652				/*
653				 * The syscall handler has now saved the
654				 * flags.  Stop single stepping it.
655				 */
656				frame->tf_eflags &= ~PSL_T;
657				return;
658			}
659			/*
660			 * Ignore debug register exceptions due to
661			 * accesses in the user's address space, which
662			 * can happen under several conditions such as
663			 * if a user sets a watchpoint on a buffer and
664			 * then passes that buffer to a system call.
665			 * We still want to get TRCTRAPS for addresses
666			 * in kernel space because that is useful when
667			 * debugging the kernel.
668			 */
669			if (user_dbreg_trap(dr6) &&
670			   !(curpcb->pcb_flags & PCB_VM86CALL))
671				return;
672
673			/*
674			 * Malicious user code can configure a debug
675			 * register watchpoint to trap on data access
676			 * to the top of stack and then execute 'pop
677			 * %ss; int 3'.  Due to exception deferral for
678			 * 'pop %ss', the CPU will not interrupt 'int
679			 * 3' to raise the DB# exception for the debug
680			 * register but will postpone the DB# until
681			 * execution of the first instruction of the
682			 * BP# handler (in kernel mode).  Normally the
683			 * previous check would ignore DB# exceptions
684			 * for watchpoints on user addresses raised in
685			 * kernel mode.  However, some CPU errata
686			 * include cases where DB# exceptions do not
687			 * properly set bits in %dr6, e.g. Haswell
688			 * HSD23 and Skylake-X SKZ24.
689			 *
690			 * A deferred DB# can also be raised on the
691			 * first instructions of system call entry
692			 * points or single-step traps via similar use
693			 * of 'pop %ss' or 'mov xxx, %ss'.
694			 */
695			if (frame->tf_eip ==
696			    (uintptr_t)IDTVEC(int0x80_syscall) ||
697			    frame->tf_eip == (uintptr_t)IDTVEC(bpt) ||
698			    frame->tf_eip == (uintptr_t)IDTVEC(dbg))
699				return;
700			/*
701			 * FALLTHROUGH (TRCTRAP kernel mode, kernel address)
702			 */
703		case T_BPTFLT:
704			/*
705			 * If KDB is enabled, let it handle the debugger trap.
706			 * Otherwise, debugger traps "can't happen".
707			 */
708#ifdef KDB
709			if (kdb_trap(type, dr6, frame))
710				return;
711#endif
712			break;
713
714#ifdef DEV_ISA
715		case T_NMI:
716#ifdef POWERFAIL_NMI
717			if (time_second - lastalert > 10) {
718				log(LOG_WARNING, "NMI: power fail\n");
719				sysbeep(880, hz);
720				lastalert = time_second;
721			}
722			return;
723#else /* !POWERFAIL_NMI */
724			nmi_handle_intr(type, frame);
725			return;
726#endif /* POWERFAIL_NMI */
727#endif /* DEV_ISA */
728		}
729
730		trap_fatal(frame, eva);
731		return;
732	}
733
734	/* Translate fault for emulators (e.g. Linux) */
735	if (*p->p_sysent->sv_transtrap != NULL)
736		signo = (*p->p_sysent->sv_transtrap)(signo, type);
737
738	ksiginfo_init_trap(&ksi);
739	ksi.ksi_signo = signo;
740	ksi.ksi_code = ucode;
741	ksi.ksi_addr = (void *)addr;
742	ksi.ksi_trapno = type;
743	if (uprintf_signal) {
744		uprintf("pid %d comm %s: signal %d err %x code %d type %d "
745		    "addr 0x%x esp 0x%08x eip 0x%08x "
746		    "<%02x %02x %02x %02x %02x %02x %02x %02x>\n",
747		    p->p_pid, p->p_comm, signo, frame->tf_err, ucode, type,
748		    addr, frame->tf_esp, frame->tf_eip,
749		    fubyte((void *)(frame->tf_eip + 0)),
750		    fubyte((void *)(frame->tf_eip + 1)),
751		    fubyte((void *)(frame->tf_eip + 2)),
752		    fubyte((void *)(frame->tf_eip + 3)),
753		    fubyte((void *)(frame->tf_eip + 4)),
754		    fubyte((void *)(frame->tf_eip + 5)),
755		    fubyte((void *)(frame->tf_eip + 6)),
756		    fubyte((void *)(frame->tf_eip + 7)));
757	}
758	KASSERT((read_eflags() & PSL_I) != 0, ("interrupts disabled"));
759	trapsignal(td, &ksi);
760
761user:
762	userret(td, frame);
763	KASSERT(PCB_USER_FPU(td->td_pcb),
764	    ("Return from trap with kernel FPU ctx leaked"));
765}
766
/*
 * Resolve a page fault (T_PAGEFLT) by calling into the VM system.
 *
 * usermode is TRUE for faults taken in user mode; eva is the faulting
 * virtual address (read from %cr2 by the caller).
 *
 * Return values (interpreted by trap()):
 *    0  - fault resolved or treated as spurious; resume execution
 *   -1  - fatal fault; trap_fatal() has already been called
 *   -2  - Pentium F00F workaround triggered (only when I586_CPU and
 *         !NO_F00F_HACK); caller re-classifies the trap as T_PRIVINFLT
 *   SIGSEGV/SIGBUS - unresolved user fault; caller delivers the signal
 */
static int
trap_pfault(struct trapframe *frame, int usermode, vm_offset_t eva)
{
	struct thread *td;
	struct proc *p;
	vm_offset_t va;
	vm_map_t map;
	int rv;
	vm_prot_t ftype;

	td = curthread;
	p = td->td_proc;

	if (__predict_false((td->td_pflags & TDP_NOFAULTING) != 0)) {
		/*
		 * Due to both processor errata and lazy TLB invalidation when
		 * access restrictions are removed from virtual pages, memory
		 * accesses that are allowed by the physical mapping layer may
		 * nonetheless cause one spurious page fault per virtual page.
		 * When the thread is executing a "no faulting" section that
		 * is bracketed by vm_fault_{disable,enable}_pagefaults(),
		 * every page fault is treated as a spurious page fault,
		 * unless it accesses the same virtual address as the most
		 * recent page fault within the same "no faulting" section.
		 */
		if (td->td_md.md_spurflt_addr != eva ||
		    (td->td_pflags & TDP_RESETSPUR) != 0) {
			/*
			 * Do nothing to the TLB.  A stale TLB entry is
			 * flushed automatically by a page fault.
			 */
			td->td_md.md_spurflt_addr = eva;
			td->td_pflags &= ~TDP_RESETSPUR;
			return (0);
		}
	} else {
		/*
		 * If we get a page fault while in a critical section, then
		 * it is most likely a fatal kernel page fault.  The kernel
		 * is already going to panic trying to get a sleep lock to
		 * do the VM lookup, so just consider it a fatal trap so the
		 * kernel can print out a useful trap message and even get
		 * to the debugger.
		 *
		 * If we get a page fault while holding a non-sleepable
		 * lock, then it is most likely a fatal kernel page fault.
		 * If WITNESS is enabled, then it's going to whine about
		 * bogus LORs with various VM locks, so just skip to the
		 * fatal trap handling directly.
		 */
		if (td->td_critnest != 0 ||
		    WITNESS_CHECK(WARN_SLEEPOK | WARN_GIANTOK, NULL,
		    "Kernel page fault") != 0) {
			trap_fatal(frame, eva);
			return (-1);
		}
	}
	va = trunc_page(eva);
	if (va >= KERNBASE) {
		/*
		 * Don't allow user-mode faults in kernel address space.
		 * An exception:  if the faulting address is the invalid
		 * instruction entry in the IDT, then the Intel Pentium
		 * F00F bug workaround was triggered, and we need to
		 * treat it is as an illegal instruction, and not a page
		 * fault.
		 */
#if defined(I586_CPU) && !defined(NO_F00F_HACK)
		if ((eva == (unsigned int)&idt[6]) && has_f00f_bug)
			return (-2);
#endif
		if (usermode)
			return (SIGSEGV);

		map = kernel_map;
	} else {
		map = &p->p_vmspace->vm_map;

		/*
		 * When accessing a user-space address, kernel must be
		 * ready to accept the page fault, and provide a
		 * handling routine.  Since accessing the address
		 * without the handler is a bug, do not try to handle
		 * it normally, and panic immediately.
		 */
		if (!usermode && (td->td_intr_nesting_level != 0 ||
		    curpcb->pcb_onfault == NULL)) {
			trap_fatal(frame, eva);
			return (-1);
		}
	}

	/*
	 * If the trap was caused by errant bits in the PTE then panic.
	 */
	if (frame->tf_err & PGEX_RSV) {
		trap_fatal(frame, eva);
		return (-1);
	}

	/*
	 * PGEX_I is defined only if the execute disable bit capability is
	 * supported and enabled.
	 */
	if (frame->tf_err & PGEX_W)
		ftype = VM_PROT_WRITE;
#if defined(PAE) || defined(PAE_TABLES)
	else if ((frame->tf_err & PGEX_I) && pg_nx != 0)
		ftype = VM_PROT_EXECUTE;
#endif
	else
		ftype = VM_PROT_READ;

	/* Fault in the page. */
	rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL);
	if (rv == KERN_SUCCESS) {
#ifdef HWPMC_HOOKS
		if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
			PMC_SOFT_CALL_TF( , , page_fault, all, frame);
			if (ftype == VM_PROT_READ)
				PMC_SOFT_CALL_TF( , , page_fault, read,
				    frame);
			else
				PMC_SOFT_CALL_TF( , , page_fault, write,
				    frame);
		}
#endif
		return (0);
	}
	if (!usermode) {
		/* Kernel fault: try the registered copyin/copyout handler. */
		if (td->td_intr_nesting_level == 0 &&
		    curpcb->pcb_onfault != NULL) {
			frame->tf_eip = (int)curpcb->pcb_onfault;
			return (0);
		}
		trap_fatal(frame, eva);
		return (-1);
	}
	return ((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV);
}
907
908static void
909trap_fatal(frame, eva)
910	struct trapframe *frame;
911	vm_offset_t eva;
912{
913	int code, ss, esp;
914	u_int type;
915	struct soft_segment_descriptor softseg;
916	char *msg;
917#ifdef KDB
918	bool handled;
919#endif
920
921	code = frame->tf_err;
922	type = frame->tf_trapno;
923	sdtossd(&gdt[IDXSEL(frame->tf_cs & 0xffff)].sd, &softseg);
924
925	if (type <= MAX_TRAP_MSG)
926		msg = trap_msg[type];
927	else
928		msg = "UNKNOWN";
929	printf("\n\nFatal trap %d: %s while in %s mode\n", type, msg,
930	    frame->tf_eflags & PSL_VM ? "vm86" :
931	    ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel");
932#ifdef SMP
933	/* two separate prints in case of a trap on an unmapped page */
934	printf("cpuid = %d; ", PCPU_GET(cpuid));
935	printf("apic id = %02x\n", PCPU_GET(apic_id));
936#endif
937	if (type == T_PAGEFLT) {
938		printf("fault virtual address	= 0x%x\n", eva);
939		printf("fault code		= %s %s%s, %s\n",
940			code & PGEX_U ? "user" : "supervisor",
941			code & PGEX_W ? "write" : "read",
942#if defined(PAE) || defined(PAE_TABLES)
943			pg_nx != 0 ?
944			(code & PGEX_I ? " instruction" : " data") :
945#endif
946			"",
947			code & PGEX_RSV ? "reserved bits in PTE" :
948			code & PGEX_P ? "protection violation" : "page not present");
949	} else {
950		printf("error code		= %#x\n", code);
951	}
952	printf("instruction pointer	= 0x%x:0x%x\n",
953	       frame->tf_cs & 0xffff, frame->tf_eip);
954        if (TF_HAS_STACKREGS(frame)) {
955		ss = frame->tf_ss & 0xffff;
956		esp = frame->tf_esp;
957	} else {
958		ss = GSEL(GDATA_SEL, SEL_KPL);
959		esp = (int)&frame->tf_esp;
960	}
961	printf("stack pointer	        = 0x%x:0x%x\n", ss, esp);
962	printf("frame pointer	        = 0x%x:0x%x\n", ss, frame->tf_ebp);
963	printf("code segment		= base 0x%x, limit 0x%x, type 0x%x\n",
964	       softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type);
965	printf("			= DPL %d, pres %d, def32 %d, gran %d\n",
966	       softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_def32,
967	       softseg.ssd_gran);
968	printf("processor eflags	= ");
969	if (frame->tf_eflags & PSL_T)
970		printf("trace trap, ");
971	if (frame->tf_eflags & PSL_I)
972		printf("interrupt enabled, ");
973	if (frame->tf_eflags & PSL_NT)
974		printf("nested task, ");
975	if (frame->tf_eflags & PSL_RF)
976		printf("resume, ");
977	if (frame->tf_eflags & PSL_VM)
978		printf("vm86, ");
979	printf("IOPL = %d\n", (frame->tf_eflags & PSL_IOPL) >> 12);
980	printf("current process		= %d (%s)\n",
981	    curproc->p_pid, curthread->td_name);
982
983#ifdef KDB
984	if (debugger_on_trap) {
985		kdb_why = KDB_WHY_TRAP;
986		frame->tf_err = eva;	/* smuggle fault address to ddb */
987		handled = kdb_trap(type, 0, frame);
988		frame->tf_err = code;	/* restore error code */
989		kdb_why = KDB_WHY_UNSET;
990		if (handled)
991			return;
992	}
993#endif
994	printf("trap number		= %d\n", type);
995	if (type <= MAX_TRAP_MSG)
996		panic("%s", trap_msg[type]);
997	else
998		panic("unknown/reserved trap");
999}
1000
1001/*
1002 * Double fault handler. Called when a fault occurs while writing
1003 * a frame for a trap/exception onto the stack. This usually occurs
1004 * when the stack overflows (such is the case with infinite recursion,
1005 * for example).
1006 *
1007 * XXX Note that the current PTD gets replaced by IdlePTD when the
1008 * task switch occurs. This means that the stack that was active at
1009 * the time of the double fault is not available at <kstack> unless
1010 * the machine was idle when the double fault occurred. The downside
1011 * of this is that "trace <ebp>" in ddb won't work.
1012 */
1013void
1014dblfault_handler()
1015{
1016#ifdef KDTRACE_HOOKS
1017	if (dtrace_doubletrap_func != NULL)
1018		(*dtrace_doubletrap_func)();
1019#endif
1020	printf("\nFatal double fault:\n");
1021	printf("eip = 0x%x\n", PCPU_GET(common_tss.tss_eip));
1022	printf("esp = 0x%x\n", PCPU_GET(common_tss.tss_esp));
1023	printf("ebp = 0x%x\n", PCPU_GET(common_tss.tss_ebp));
1024#ifdef SMP
1025	/* two separate prints in case of a trap on an unmapped page */
1026	printf("cpuid = %d; ", PCPU_GET(cpuid));
1027	printf("apic id = %02x\n", PCPU_GET(apic_id));
1028#endif
1029	panic("double fault");
1030}
1031
1032int
1033cpu_fetch_syscall_args(struct thread *td)
1034{
1035	struct proc *p;
1036	struct trapframe *frame;
1037	struct syscall_args *sa;
1038	caddr_t params;
1039	long tmp;
1040	int error;
1041
1042	p = td->td_proc;
1043	frame = td->td_frame;
1044	sa = &td->td_sa;
1045
1046	params = (caddr_t)frame->tf_esp + sizeof(int);
1047	sa->code = frame->tf_eax;
1048
1049	/*
1050	 * Need to check if this is a 32 bit or 64 bit syscall.
1051	 */
1052	if (sa->code == SYS_syscall) {
1053		/*
1054		 * Code is first argument, followed by actual args.
1055		 */
1056		error = fueword(params, &tmp);
1057		if (error == -1)
1058			return (EFAULT);
1059		sa->code = tmp;
1060		params += sizeof(int);
1061	} else if (sa->code == SYS___syscall) {
1062		/*
1063		 * Like syscall, but code is a quad, so as to maintain
1064		 * quad alignment for the rest of the arguments.
1065		 */
1066		error = fueword(params, &tmp);
1067		if (error == -1)
1068			return (EFAULT);
1069		sa->code = tmp;
1070		params += sizeof(quad_t);
1071	}
1072
1073 	if (p->p_sysent->sv_mask)
1074 		sa->code &= p->p_sysent->sv_mask;
1075 	if (sa->code >= p->p_sysent->sv_size)
1076 		sa->callp = &p->p_sysent->sv_table[0];
1077  	else
1078 		sa->callp = &p->p_sysent->sv_table[sa->code];
1079	sa->narg = sa->callp->sy_narg;
1080
1081	if (params != NULL && sa->narg != 0)
1082		error = copyin(params, (caddr_t)sa->args,
1083		    (u_int)(sa->narg * sizeof(int)));
1084	else
1085		error = 0;
1086
1087	if (error == 0) {
1088		td->td_retval[0] = 0;
1089		td->td_retval[1] = frame->tf_edx;
1090	}
1091
1092	return (error);
1093}
1094
1095#include "../../kern/subr_syscall.c"
1096
1097/*
1098 * syscall - system call request C handler.  A system call is
1099 * essentially treated as a trap by reusing the frame layout.
1100 */
void
syscall(struct trapframe *frame)
{
	struct thread *td;
	register_t orig_tf_eflags;
	int error;
	ksiginfo_t ksi;

#ifdef DIAGNOSTIC
	/* System calls are only legitimate from user, non-vm86 mode. */
	if (!(TRAPF_USERMODE(frame) &&
	    (curpcb->pcb_flags & PCB_VM86CALL) == 0)) {
		panic("syscall");
		/* NOT REACHED */
	}
#endif
	/* Save entry eflags; PSL_T/PSL_VM are examined after the call. */
	orig_tf_eflags = frame->tf_eflags;

	td = curthread;
	td->td_frame = frame;

	error = syscallenter(td);

	/*
	 * Traced syscall.
	 */
	if ((orig_tf_eflags & PSL_T) && !(orig_tf_eflags & PSL_VM)) {
		frame->tf_eflags &= ~PSL_T;
		ksiginfo_init_trap(&ksi);
		ksi.ksi_signo = SIGTRAP;
		ksi.ksi_code = TRAP_TRACE;
		ksi.ksi_addr = (void *)frame->tf_eip;
		trapsignal(td, &ksi);
	}

	KASSERT(PCB_USER_FPU(td->td_pcb),
	    ("System call %s returning with kernel FPU ctx leaked",
	     syscallname(td->td_proc, td->td_sa.code)));
	KASSERT(td->td_pcb->pcb_save == get_pcb_user_save_td(td),
	    ("System call %s returning with mangled pcb_save",
	     syscallname(td->td_proc, td->td_sa.code)));

	syscallret(td, error);
}
1144