1/*-
2 * SPDX-License-Identifier: BSD-4-Clause
3 *
4 * Copyright (C) 1994, David Greenman
5 * Copyright (c) 1990, 1993
6 *	The Regents of the University of California.  All rights reserved.
7 *
8 * This code is derived from software contributed to Berkeley by
9 * the University of Utah, and William Jolitz.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 *    notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 *    notice, this list of conditions and the following disclaimer in the
18 *    documentation and/or other materials provided with the distribution.
19 * 3. All advertising materials mentioning features or use of this software
20 *    must display the following acknowledgement:
21 *	This product includes software developed by the University of
22 *	California, Berkeley and its contributors.
23 * 4. Neither the name of the University nor the names of its contributors
24 *    may be used to endorse or promote products derived from this software
25 *    without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
28 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
32 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
33 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
34 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
35 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
36 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37 * SUCH DAMAGE.
38 *
39 *	from: @(#)trap.c	7.4 (Berkeley) 5/13/91
40 */
41
42#include <sys/cdefs.h>
43__FBSDID("$FreeBSD$");
44
45/*
46 * 386 Trap and System call handling
47 */
48
49#include "opt_clock.h"
50#include "opt_compat.h"
51#include "opt_cpu.h"
52#include "opt_hwpmc_hooks.h"
53#include "opt_isa.h"
54#include "opt_kdb.h"
55#include "opt_stack.h"
56#include "opt_trap.h"
57
58#include <sys/param.h>
59#include <sys/bus.h>
60#include <sys/systm.h>
61#include <sys/proc.h>
62#include <sys/pioctl.h>
63#include <sys/ptrace.h>
64#include <sys/kdb.h>
65#include <sys/kernel.h>
66#include <sys/ktr.h>
67#include <sys/lock.h>
68#include <sys/mutex.h>
69#include <sys/resourcevar.h>
70#include <sys/signalvar.h>
71#include <sys/syscall.h>
72#include <sys/sysctl.h>
73#include <sys/sysent.h>
74#include <sys/uio.h>
75#include <sys/vmmeter.h>
76#ifdef HWPMC_HOOKS
77#include <sys/pmckern.h>
78PMC_SOFT_DEFINE( , , page_fault, all);
79PMC_SOFT_DEFINE( , , page_fault, read);
80PMC_SOFT_DEFINE( , , page_fault, write);
81#endif
82#include <security/audit/audit.h>
83
84#include <vm/vm.h>
85#include <vm/vm_param.h>
86#include <vm/pmap.h>
87#include <vm/vm_kern.h>
88#include <vm/vm_map.h>
89#include <vm/vm_page.h>
90#include <vm/vm_extern.h>
91
92#include <machine/cpu.h>
93#include <machine/intr_machdep.h>
94#include <x86/mca.h>
95#include <machine/md_var.h>
96#include <machine/pcb.h>
97#ifdef SMP
98#include <machine/smp.h>
99#endif
100#include <machine/stack.h>
101#include <machine/trap.h>
102#include <machine/tss.h>
103#include <machine/vm86.h>
104
105#ifdef POWERFAIL_NMI
106#include <sys/syslog.h>
107#include <machine/clock.h>
108#endif
109
110#ifdef KDTRACE_HOOKS
111#include <sys/dtrace_bsd.h>
112#endif
113
/* C-level trap and system call handlers defined in this file. */
void trap(struct trapframe *frame);
void syscall(struct trapframe *frame);

/* File-local helpers used by trap(). */
static int trap_pfault(struct trapframe *, bool, vm_offset_t, int *, int *);
static void trap_fatal(struct trapframe *, vm_offset_t);
#ifdef KDTRACE_HOOKS
static bool trap_user_dtrace(struct trapframe *,
    int (**hook)(struct trapframe *));
#endif
void dblfault_handler(void);

/*
 * Entry points named through IDTVEC(); trap() compares their addresses
 * against the faulting %eip when filtering kernel-mode debug exceptions.
 */
extern inthand_t IDTVEC(bpt), IDTVEC(dbg), IDTVEC(int0x80_syscall);
126
/*
 * Per-trap-number attributes, looked up by trap_enable_intr() and
 * trap_msg().
 */
struct trap_data {
	bool		ei;	/* value reported by trap_enable_intr() */
	const char	*msg;	/* text reported by trap_msg(); NULL if unset */
};
131
132static const struct trap_data trap_data[] = {
133	[T_PRIVINFLT] =	{ .ei = true,	.msg = "privileged instruction fault" },
134	[T_BPTFLT] =	{ .ei = false,	.msg = "breakpoint instruction fault" },
135	[T_ARITHTRAP] =	{ .ei = true,	.msg = "arithmetic trap" },
136	[T_PROTFLT] =	{ .ei = true,	.msg = "general protection fault" },
137	[T_TRCTRAP] =	{ .ei = false,	.msg = "debug exception" },
138	[T_PAGEFLT] =	{ .ei = true,	.msg = "page fault" },
139	[T_ALIGNFLT] = 	{ .ei = true,	.msg = "alignment fault" },
140	[T_DIVIDE] =	{ .ei = true,	.msg = "integer divide fault" },
141	[T_NMI] =	{ .ei = false,	.msg = "non-maskable interrupt trap" },
142	[T_OFLOW] =	{ .ei = true,	.msg = "overflow trap" },
143	[T_BOUND] =	{ .ei = true,	.msg = "FPU bounds check fault" },
144	[T_DNA] =	{ .ei = true,	.msg = "FPU device not available" },
145	[T_DOUBLEFLT] =	{ .ei = false,	.msg = "double fault" },
146	[T_FPOPFLT] =	{ .ei = true,	.msg = "FPU operand fetch fault" },
147	[T_TSSFLT] =	{ .ei = true,	.msg = "invalid TSS fault" },
148	[T_SEGNPFLT] =	{ .ei = true,	.msg = "segment not present fault" },
149	[T_STKFLT] =	{ .ei = true,	.msg = "stack fault" },
150	[T_MCHK] =	{ .ei = true,	.msg = "machine check trap" },
151	[T_XMMFLT] =	{ .ei = true,	.msg = "SIMD floating-point exception" },
152	[T_DTRACE_RET] ={ .ei = true,	.msg = "DTrace pid return trap" },
153};
154
155static bool
156trap_enable_intr(int trapno)
157{
158
159	MPASS(trapno > 0);
160	if (trapno < nitems(trap_data) && trap_data[trapno].msg != NULL)
161		return (trap_data[trapno].ei);
162	return (false);
163}
164
165static const char *
166trap_msg(int trapno)
167{
168	const char *res;
169	static const char unkn[] = "UNKNOWN";
170
171	res = NULL;
172	if (trapno < nitems(trap_data))
173		res = trap_data[trapno].msg;
174	if (res == NULL)
175		res = unkn;
176	return (res);
177}
178
#if defined(I586_CPU) && !defined(NO_F00F_HACK)
/*
 * Tested by trap_pfault() when a page fault hits the IDT slot used for
 * the Pentium F00F workaround.
 */
int has_f00f_bug = 0;		/* Initialized so that it can be patched. */
#endif

/*
 * machdep.uprintf_signal: when nonzero, trap() prints the details of a
 * trap-generated signal with uprintf() before delivering it.
 */
static int uprintf_signal;
SYSCTL_INT(_machdep, OID_AUTO, uprintf_signal, CTLFLAG_RW,
    &uprintf_signal, 0,
    "Print debugging information on trap signal to ctty");
187
188/*
189 * Exception, fault, and trap interface to the FreeBSD kernel.
190 * This common code is called from assembly language IDT gate entry
191 * routines that prepare a suitable stack frame, and restore this
192 * frame after the exception has been processed.
193 */
194
195void
196trap(struct trapframe *frame)
197{
198	ksiginfo_t ksi;
199	struct thread *td;
200	struct proc *p;
201	int pf, signo, ucode;
202	u_int type;
203	register_t addr, dr6;
204	vm_offset_t eva;
205#ifdef POWERFAIL_NMI
206	static int lastalert = 0;
207#endif
208
209	td = curthread;
210	p = td->td_proc;
211	dr6 = 0;
212
213	VM_CNT_INC(v_trap);
214	type = frame->tf_trapno;
215
216	KASSERT((read_eflags() & PSL_I) == 0,
217	    ("trap: interrupts enabled, type %d frame %p", type, frame));
218
219#ifdef SMP
220	/* Handler for NMI IPIs used for stopping CPUs. */
221	if (type == T_NMI && ipi_nmi_handler() == 0)
222		return;
223#endif /* SMP */
224
225#ifdef KDB
226	if (kdb_active) {
227		kdb_reenter();
228		return;
229	}
230#endif
231
232	if (type == T_RESERVED) {
233		trap_fatal(frame, 0);
234		return;
235	}
236
237	if (type == T_NMI) {
238#ifdef HWPMC_HOOKS
239		/*
240		 * CPU PMCs interrupt using an NMI so we check for that first.
241		 * If the HWPMC module is active, 'pmc_hook' will point to
242		 * the function to be called.  A non-zero return value from the
243		 * hook means that the NMI was consumed by it and that we can
244		 * return immediately.
245		 */
246		if (pmc_intr != NULL &&
247		    (*pmc_intr)(frame) != 0)
248			return;
249#endif
250
251#ifdef STACK
252		if (stack_nmi_handler(frame) != 0)
253			return;
254#endif
255	}
256
257	if (type == T_MCHK) {
258		mca_intr();
259		return;
260	}
261
262#ifdef KDTRACE_HOOKS
263	/*
264	 * A trap can occur while DTrace executes a probe. Before
265	 * executing the probe, DTrace blocks re-scheduling and sets
266	 * a flag in its per-cpu flags to indicate that it doesn't
267	 * want to fault. On returning from the probe, the no-fault
268	 * flag is cleared and finally re-scheduling is enabled.
269	 */
270	if ((type == T_PROTFLT || type == T_PAGEFLT) &&
271	    dtrace_trap_func != NULL && (*dtrace_trap_func)(frame, type))
272		return;
273#endif
274
275	/*
276	 * We must not allow context switches until %cr2 is read.
277	 * Also, for some Cyrix CPUs, %cr2 is clobbered by interrupts.
278	 * All faults use interrupt gates, so %cr2 can be safely read
279	 * now, before optional enable of the interrupts below.
280	 */
281	if (type == T_PAGEFLT)
282		eva = rcr2();
283
284	/*
285	 * Buggy application or kernel code has disabled interrupts
286	 * and then trapped.  Enabling interrupts now is wrong, but it
287	 * is better than running with interrupts disabled until they
288	 * are accidentally enabled later.
289	 */
290	if ((frame->tf_eflags & PSL_I) == 0 && TRAPF_USERMODE(frame) &&
291	    (curpcb->pcb_flags & PCB_VM86CALL) == 0)
292		uprintf("pid %ld (%s): trap %d with interrupts disabled\n",
293		    (long)curproc->p_pid, curthread->td_name, type);
294
295	/*
296	 * Conditionally reenable interrupts.  If we hold a spin lock,
297	 * then we must not reenable interrupts.  This might be a
298	 * spurious page fault.
299	 */
300	if (trap_enable_intr(type) && td->td_md.md_spinlock_count == 0 &&
301	    frame->tf_eip != (int)cpu_switch_load_gs)
302		enable_intr();
303
304        if (TRAPF_USERMODE(frame) && (curpcb->pcb_flags & PCB_VM86CALL) == 0) {
305		/* user trap */
306
307		td->td_pticks = 0;
308		td->td_frame = frame;
309		addr = frame->tf_eip;
310		if (td->td_cowgen != p->p_cowgen)
311			thread_cow_update(td);
312
313		switch (type) {
314		case T_PRIVINFLT:	/* privileged instruction fault */
315			signo = SIGILL;
316			ucode = ILL_PRVOPC;
317			break;
318
319		case T_BPTFLT:		/* bpt instruction fault */
320#ifdef KDTRACE_HOOKS
321			if (trap_user_dtrace(frame, &dtrace_pid_probe_ptr))
322				return;
323#else
324			enable_intr();
325#endif
326			signo = SIGTRAP;
327			ucode = TRAP_BRKPT;
328			break;
329
330		case T_TRCTRAP:		/* debug exception */
331			enable_intr();
332user_trctrap_out:
333			signo = SIGTRAP;
334			ucode = TRAP_TRACE;
335			dr6 = rdr6();
336			if ((dr6 & DBREG_DR6_BS) != 0) {
337				PROC_LOCK(td->td_proc);
338				if ((td->td_dbgflags & TDB_STEP) != 0) {
339					td->td_frame->tf_eflags &= ~PSL_T;
340					td->td_dbgflags &= ~TDB_STEP;
341				}
342				PROC_UNLOCK(td->td_proc);
343			}
344			break;
345
346		case T_ARITHTRAP:	/* arithmetic trap */
347			ucode = npxtrap_x87();
348			if (ucode == -1)
349				return;
350			signo = SIGFPE;
351			break;
352
353		/*
354		 * The following two traps can happen in vm86 mode,
355		 * and, if so, we want to handle them specially.
356		 */
357		case T_PROTFLT:		/* general protection fault */
358		case T_STKFLT:		/* stack fault */
359			if (frame->tf_eflags & PSL_VM) {
360				signo = vm86_emulate((struct vm86frame *)frame);
361				ucode = 0;	/* XXXKIB: better code ? */
362				if (signo == SIGTRAP) {
363					load_dr6(rdr6() | 0x4000);
364					goto user_trctrap_out;
365				}
366				if (signo == 0)
367					goto user;
368				break;
369			}
370			signo = SIGBUS;
371			ucode = (type == T_PROTFLT) ? BUS_OBJERR : BUS_ADRERR;
372			break;
373		case T_SEGNPFLT:	/* segment not present fault */
374			signo = SIGBUS;
375			ucode = BUS_ADRERR;
376			break;
377		case T_TSSFLT:		/* invalid TSS fault */
378			signo = SIGBUS;
379			ucode = BUS_OBJERR;
380			break;
381		case T_ALIGNFLT:
382			signo = SIGBUS;
383			ucode = BUS_ADRALN;
384			break;
385		case T_DOUBLEFLT:	/* double fault */
386		default:
387			signo = SIGBUS;
388			ucode = BUS_OBJERR;
389			break;
390
391		case T_PAGEFLT:		/* page fault */
392			addr = eva;
393			pf = trap_pfault(frame, true, eva, &signo, &ucode);
394#if defined(I586_CPU) && !defined(NO_F00F_HACK)
395			if (pf == -2) {
396				/*
397				 * The f00f hack workaround has triggered, so
398				 * treat the fault as an illegal instruction
399				 * (T_PRIVINFLT) instead of a page fault.
400				 */
401				type = frame->tf_trapno = T_PRIVINFLT;
402				break;
403			}
404#endif
405			if (pf == -1)
406				return;
407			if (pf == 0)
408				goto user;
409			break;
410
411		case T_DIVIDE:		/* integer divide fault */
412			ucode = FPE_INTDIV;
413			signo = SIGFPE;
414			break;
415
416		case T_NMI:
417#ifdef POWERFAIL_NMI
418#ifndef TIMER_FREQ
419#  define TIMER_FREQ 1193182
420#endif
421			if (time_second - lastalert > 10) {
422				log(LOG_WARNING, "NMI: power fail\n");
423				sysbeep(880, hz);
424				lastalert = time_second;
425			}
426			return;
427#else /* !POWERFAIL_NMI */
428			nmi_handle_intr(type, frame);
429			return;
430#endif /* POWERFAIL_NMI */
431
432		case T_OFLOW:		/* integer overflow fault */
433			ucode = FPE_INTOVF;
434			signo = SIGFPE;
435			break;
436
437		case T_BOUND:		/* bounds check fault */
438			ucode = FPE_FLTSUB;
439			signo = SIGFPE;
440			break;
441
442		case T_DNA:
443			KASSERT(PCB_USER_FPU(td->td_pcb),
444			    ("kernel FPU ctx has leaked"));
445			/* transparent fault (due to context switch "late") */
446			if (npxdna())
447				return;
448			uprintf("pid %d killed due to lack of floating point\n",
449				p->p_pid);
450			signo = SIGKILL;
451			ucode = 0;
452			break;
453
454		case T_FPOPFLT:		/* FPU operand fetch fault */
455			ucode = ILL_COPROC;
456			signo = SIGILL;
457			break;
458
459		case T_XMMFLT:		/* SIMD floating-point exception */
460			ucode = npxtrap_sse();
461			if (ucode == -1)
462				return;
463			signo = SIGFPE;
464			break;
465#ifdef KDTRACE_HOOKS
466		case T_DTRACE_RET:
467			(void)trap_user_dtrace(frame, &dtrace_return_probe_ptr);
468			return;
469#endif
470		}
471	} else {
472		/* kernel trap */
473
474		KASSERT(cold || td->td_ucred != NULL,
475		    ("kernel trap doesn't have ucred"));
476		switch (type) {
477		case T_PAGEFLT:			/* page fault */
478			(void)trap_pfault(frame, false, eva, NULL, NULL);
479			return;
480
481		case T_DNA:
482			if (PCB_USER_FPU(td->td_pcb))
483				panic("Unregistered use of FPU in kernel");
484			if (npxdna())
485				return;
486			break;
487
488		case T_ARITHTRAP:	/* arithmetic trap */
489		case T_XMMFLT:		/* SIMD floating-point exception */
490		case T_FPOPFLT:		/* FPU operand fetch fault */
491			/*
492			 * XXXKIB for now disable any FPU traps in kernel
493			 * handler registration seems to be overkill
494			 */
495			trap_fatal(frame, 0);
496			return;
497
498			/*
499			 * The following two traps can happen in
500			 * vm86 mode, and, if so, we want to handle
501			 * them specially.
502			 */
503		case T_PROTFLT:		/* general protection fault */
504		case T_STKFLT:		/* stack fault */
505			if (frame->tf_eflags & PSL_VM) {
506				signo = vm86_emulate((struct vm86frame *)frame);
507				if (signo == SIGTRAP) {
508					type = T_TRCTRAP;
509					load_dr6(rdr6() | 0x4000);
510					goto kernel_trctrap;
511				}
512				if (signo != 0)
513					/*
514					 * returns to original process
515					 */
516					vm86_trap((struct vm86frame *)frame);
517				return;
518			}
519			/* FALL THROUGH */
520		case T_SEGNPFLT:	/* segment not present fault */
521			if (curpcb->pcb_flags & PCB_VM86CALL)
522				break;
523
524			/*
525			 * Invalid %fs's and %gs's can be created using
526			 * procfs or PT_SETREGS or by invalidating the
527			 * underlying LDT entry.  This causes a fault
528			 * in kernel mode when the kernel attempts to
529			 * switch contexts.  Lose the bad context
530			 * (XXX) so that we can continue, and generate
531			 * a signal.
532			 */
533			if (frame->tf_eip == (int)cpu_switch_load_gs) {
534				curpcb->pcb_gs = 0;
535#if 0
536				PROC_LOCK(p);
537				kern_psignal(p, SIGBUS);
538				PROC_UNLOCK(p);
539#endif
540				return;
541			}
542
543			if (td->td_intr_nesting_level != 0)
544				break;
545
546			/*
547			 * Invalid segment selectors and out of bounds
548			 * %eip's and %esp's can be set up in user mode.
549			 * This causes a fault in kernel mode when the
550			 * kernel tries to return to user mode.  We want
551			 * to get this fault so that we can fix the
552			 * problem here and not have to check all the
553			 * selectors and pointers when the user changes
554			 * them.
555			 *
556			 * N.B. Comparing to long mode, 32-bit mode
557			 * does not push %esp on the trap frame,
558			 * because iretl faulted while in ring 0.  As
559			 * the consequence, there is no need to fixup
560			 * the stack pointer for doreti_iret_fault,
561			 * the fixup and the complimentary trap() call
562			 * are executed on the main thread stack, not
563			 * on the trampoline stack.
564			 */
565			if (frame->tf_eip == (int)doreti_iret + setidt_disp) {
566				frame->tf_eip = (int)doreti_iret_fault +
567				    setidt_disp;
568				return;
569			}
570			if (type == T_STKFLT)
571				break;
572
573			if (frame->tf_eip == (int)doreti_popl_ds +
574			    setidt_disp) {
575				frame->tf_eip = (int)doreti_popl_ds_fault +
576				    setidt_disp;
577				return;
578			}
579			if (frame->tf_eip == (int)doreti_popl_es +
580			    setidt_disp) {
581				frame->tf_eip = (int)doreti_popl_es_fault +
582				    setidt_disp;
583				return;
584			}
585			if (frame->tf_eip == (int)doreti_popl_fs +
586			    setidt_disp) {
587				frame->tf_eip = (int)doreti_popl_fs_fault +
588				    setidt_disp;
589				return;
590			}
591			if (curpcb->pcb_onfault != NULL) {
592				frame->tf_eip = (int)curpcb->pcb_onfault;
593				return;
594			}
595			break;
596
597		case T_TSSFLT:
598			/*
599			 * PSL_NT can be set in user mode and isn't cleared
600			 * automatically when the kernel is entered.  This
601			 * causes a TSS fault when the kernel attempts to
602			 * `iret' because the TSS link is uninitialized.  We
603			 * want to get this fault so that we can fix the
604			 * problem here and not every time the kernel is
605			 * entered.
606			 */
607			if (frame->tf_eflags & PSL_NT) {
608				frame->tf_eflags &= ~PSL_NT;
609				return;
610			}
611			break;
612
613		case T_TRCTRAP:	 /* debug exception */
614kernel_trctrap:
615			/* Clear any pending debug events. */
616			dr6 = rdr6();
617			load_dr6(0);
618
619			/*
620			 * Ignore debug register exceptions due to
621			 * accesses in the user's address space, which
622			 * can happen under several conditions such as
623			 * if a user sets a watchpoint on a buffer and
624			 * then passes that buffer to a system call.
625			 * We still want to get TRCTRAPS for addresses
626			 * in kernel space because that is useful when
627			 * debugging the kernel.
628			 */
629			if (user_dbreg_trap(dr6) &&
630			   !(curpcb->pcb_flags & PCB_VM86CALL))
631				return;
632
633			/*
634			 * Malicious user code can configure a debug
635			 * register watchpoint to trap on data access
636			 * to the top of stack and then execute 'pop
637			 * %ss; int 3'.  Due to exception deferral for
638			 * 'pop %ss', the CPU will not interrupt 'int
639			 * 3' to raise the DB# exception for the debug
640			 * register but will postpone the DB# until
641			 * execution of the first instruction of the
642			 * BP# handler (in kernel mode).  Normally the
643			 * previous check would ignore DB# exceptions
644			 * for watchpoints on user addresses raised in
645			 * kernel mode.  However, some CPU errata
646			 * include cases where DB# exceptions do not
647			 * properly set bits in %dr6, e.g. Haswell
648			 * HSD23 and Skylake-X SKZ24.
649			 *
650			 * A deferred DB# can also be raised on the
651			 * first instructions of system call entry
652			 * points or single-step traps via similar use
653			 * of 'pop %ss' or 'mov xxx, %ss'.
654			 */
655			if (frame->tf_eip ==
656			    (uintptr_t)IDTVEC(int0x80_syscall) + setidt_disp ||
657			    frame->tf_eip == (uintptr_t)IDTVEC(bpt) +
658			    setidt_disp ||
659			    frame->tf_eip == (uintptr_t)IDTVEC(dbg) +
660			    setidt_disp)
661				return;
662			/*
663			 * FALLTHROUGH (TRCTRAP kernel mode, kernel address)
664			 */
665		case T_BPTFLT:
666			/*
667			 * If KDB is enabled, let it handle the debugger trap.
668			 * Otherwise, debugger traps "can't happen".
669			 */
670#ifdef KDB
671			if (kdb_trap(type, dr6, frame))
672				return;
673#endif
674			break;
675
676		case T_NMI:
677#ifdef POWERFAIL_NMI
678			if (time_second - lastalert > 10) {
679				log(LOG_WARNING, "NMI: power fail\n");
680				sysbeep(880, hz);
681				lastalert = time_second;
682			}
683			return;
684#else /* !POWERFAIL_NMI */
685			nmi_handle_intr(type, frame);
686			return;
687#endif /* POWERFAIL_NMI */
688		}
689
690		trap_fatal(frame, eva);
691		return;
692	}
693
694	/* Translate fault for emulators (e.g. Linux) */
695	if (*p->p_sysent->sv_transtrap != NULL)
696		signo = (*p->p_sysent->sv_transtrap)(signo, type);
697
698	ksiginfo_init_trap(&ksi);
699	ksi.ksi_signo = signo;
700	ksi.ksi_code = ucode;
701	ksi.ksi_addr = (void *)addr;
702	ksi.ksi_trapno = type;
703	if (uprintf_signal) {
704		uprintf("pid %d comm %s: signal %d err %x code %d type %d "
705		    "addr 0x%x ss 0x%04x esp 0x%08x cs 0x%04x eip 0x%08x "
706		    "<%02x %02x %02x %02x %02x %02x %02x %02x>\n",
707		    p->p_pid, p->p_comm, signo, frame->tf_err, ucode, type,
708		    addr, frame->tf_ss, frame->tf_esp, frame->tf_cs,
709		    frame->tf_eip,
710		    fubyte((void *)(frame->tf_eip + 0)),
711		    fubyte((void *)(frame->tf_eip + 1)),
712		    fubyte((void *)(frame->tf_eip + 2)),
713		    fubyte((void *)(frame->tf_eip + 3)),
714		    fubyte((void *)(frame->tf_eip + 4)),
715		    fubyte((void *)(frame->tf_eip + 5)),
716		    fubyte((void *)(frame->tf_eip + 6)),
717		    fubyte((void *)(frame->tf_eip + 7)));
718	}
719	KASSERT((read_eflags() & PSL_I) != 0, ("interrupts disabled"));
720	trapsignal(td, &ksi);
721
722user:
723	userret(td, frame);
724	KASSERT(PCB_USER_FPU(td->td_pcb),
725	    ("Return from trap with kernel FPU ctx leaked"));
726}
727
728/*
729 * Handle all details of a page fault.
730 * Returns:
731 * -2 if the fault was caused by triggered workaround for Intel Pentium
732 *    0xf00f bug.
733 * -1 if this fault was fatal, typically from kernel mode
734 *    (cannot happen, but we need to return something).
735 * 0  if this fault was handled by updating either the user or kernel
736 *    page table, execution can continue.
737 * 1  if this fault was from usermode and it was not handled, a synchronous
738 *    signal should be delivered to the thread.  *signo returns the signal
739 *    number, *ucode gives si_code.
740 */
741static int
742trap_pfault(struct trapframe *frame, bool usermode, vm_offset_t eva,
743    int *signo, int *ucode)
744{
745	struct thread *td;
746	struct proc *p;
747	vm_map_t map;
748	int rv;
749	vm_prot_t ftype;
750
751	MPASS(!usermode || (signo != NULL && ucode != NULL));
752
753	td = curthread;
754	p = td->td_proc;
755
756	if (__predict_false((td->td_pflags & TDP_NOFAULTING) != 0)) {
757		/*
758		 * Due to both processor errata and lazy TLB invalidation when
759		 * access restrictions are removed from virtual pages, memory
760		 * accesses that are allowed by the physical mapping layer may
761		 * nonetheless cause one spurious page fault per virtual page.
762		 * When the thread is executing a "no faulting" section that
763		 * is bracketed by vm_fault_{disable,enable}_pagefaults(),
764		 * every page fault is treated as a spurious page fault,
765		 * unless it accesses the same virtual address as the most
766		 * recent page fault within the same "no faulting" section.
767		 */
768		if (td->td_md.md_spurflt_addr != eva ||
769		    (td->td_pflags & TDP_RESETSPUR) != 0) {
770			/*
771			 * Do nothing to the TLB.  A stale TLB entry is
772			 * flushed automatically by a page fault.
773			 */
774			td->td_md.md_spurflt_addr = eva;
775			td->td_pflags &= ~TDP_RESETSPUR;
776			return (0);
777		}
778	} else {
779		/*
780		 * If we get a page fault while in a critical section, then
781		 * it is most likely a fatal kernel page fault.  The kernel
782		 * is already going to panic trying to get a sleep lock to
783		 * do the VM lookup, so just consider it a fatal trap so the
784		 * kernel can print out a useful trap message and even get
785		 * to the debugger.
786		 *
787		 * If we get a page fault while holding a non-sleepable
788		 * lock, then it is most likely a fatal kernel page fault.
789		 * If WITNESS is enabled, then it's going to whine about
790		 * bogus LORs with various VM locks, so just skip to the
791		 * fatal trap handling directly.
792		 */
793		if (td->td_critnest != 0 ||
794		    WITNESS_CHECK(WARN_SLEEPOK | WARN_GIANTOK, NULL,
795		    "Kernel page fault") != 0) {
796			trap_fatal(frame, eva);
797			return (-1);
798		}
799	}
800	if (eva >= PMAP_TRM_MIN_ADDRESS) {
801		/*
802		 * Don't allow user-mode faults in kernel address space.
803		 * An exception:  if the faulting address is the invalid
804		 * instruction entry in the IDT, then the Intel Pentium
805		 * F00F bug workaround was triggered, and we need to
806		 * treat it is as an illegal instruction, and not a page
807		 * fault.
808		 */
809#if defined(I586_CPU) && !defined(NO_F00F_HACK)
810		if ((eva == (unsigned int)&idt[6]) && has_f00f_bug) {
811			*ucode = ILL_PRVOPC;
812			*signo = SIGILL;
813			return (-2);
814		}
815#endif
816		if (usermode) {
817			*signo = SIGSEGV;
818			*ucode = SEGV_MAPERR;
819			return (1);
820		}
821		trap_fatal(frame, eva);
822		return (-1);
823	} else {
824		map = usermode ? &p->p_vmspace->vm_map : kernel_map;
825
826		/*
827		 * Kernel cannot access a user-space address directly
828		 * because user pages are not mapped.  Also, page
829		 * faults must not be caused during the interrupts.
830		 */
831		if (!usermode && td->td_intr_nesting_level != 0) {
832			trap_fatal(frame, eva);
833			return (-1);
834		}
835	}
836
837	/*
838	 * If the trap was caused by errant bits in the PTE then panic.
839	 */
840	if (frame->tf_err & PGEX_RSV) {
841		trap_fatal(frame, eva);
842		return (-1);
843	}
844
845	/*
846	 * PGEX_I is defined only if the execute disable bit capability is
847	 * supported and enabled.
848	 */
849	if (frame->tf_err & PGEX_W)
850		ftype = VM_PROT_WRITE;
851#if defined(PAE) || defined(PAE_TABLES)
852	else if ((frame->tf_err & PGEX_I) && pg_nx != 0)
853		ftype = VM_PROT_EXECUTE;
854#endif
855	else
856		ftype = VM_PROT_READ;
857
858	/* Fault in the page. */
859	rv = vm_fault_trap(map, eva, ftype, VM_FAULT_NORMAL, signo, ucode);
860	if (rv == KERN_SUCCESS) {
861#ifdef HWPMC_HOOKS
862		if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
863			PMC_SOFT_CALL_TF( , , page_fault, all, frame);
864			if (ftype == VM_PROT_READ)
865				PMC_SOFT_CALL_TF( , , page_fault, read,
866				    frame);
867			else
868				PMC_SOFT_CALL_TF( , , page_fault, write,
869				    frame);
870		}
871#endif
872		return (0);
873	}
874	if (usermode)
875		return (1);
876	if (td->td_intr_nesting_level == 0 &&
877	    curpcb->pcb_onfault != NULL) {
878		frame->tf_eip = (int)curpcb->pcb_onfault;
879		return (0);
880	}
881	trap_fatal(frame, eva);
882	return (-1);
883}
884
885static void
886trap_fatal(frame, eva)
887	struct trapframe *frame;
888	vm_offset_t eva;
889{
890	int code, ss, esp;
891	u_int type;
892	struct soft_segment_descriptor softseg;
893#ifdef KDB
894	bool handled;
895#endif
896
897	code = frame->tf_err;
898	type = frame->tf_trapno;
899	sdtossd(&gdt[IDXSEL(frame->tf_cs & 0xffff)].sd, &softseg);
900
901	printf("\n\nFatal trap %d: %s while in %s mode\n", type, trap_msg(type),
902	    frame->tf_eflags & PSL_VM ? "vm86" :
903	    ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel");
904#ifdef SMP
905	/* two separate prints in case of a trap on an unmapped page */
906	printf("cpuid = %d; ", PCPU_GET(cpuid));
907	printf("apic id = %02x\n", PCPU_GET(apic_id));
908#endif
909	if (type == T_PAGEFLT) {
910		printf("fault virtual address	= 0x%x\n", eva);
911		printf("fault code		= %s %s%s, %s\n",
912			code & PGEX_U ? "user" : "supervisor",
913			code & PGEX_W ? "write" : "read",
914#if defined(PAE) || defined(PAE_TABLES)
915			pg_nx != 0 ?
916			(code & PGEX_I ? " instruction" : " data") :
917#endif
918			"",
919			code & PGEX_RSV ? "reserved bits in PTE" :
920			code & PGEX_P ? "protection violation" : "page not present");
921	} else {
922		printf("error code		= %#x\n", code);
923	}
924	printf("instruction pointer	= 0x%x:0x%x\n",
925	       frame->tf_cs & 0xffff, frame->tf_eip);
926        if (TF_HAS_STACKREGS(frame)) {
927		ss = frame->tf_ss & 0xffff;
928		esp = frame->tf_esp;
929	} else {
930		ss = GSEL(GDATA_SEL, SEL_KPL);
931		esp = (int)&frame->tf_esp;
932	}
933	printf("stack pointer	        = 0x%x:0x%x\n", ss, esp);
934	printf("frame pointer	        = 0x%x:0x%x\n", ss, frame->tf_ebp);
935	printf("code segment		= base 0x%x, limit 0x%x, type 0x%x\n",
936	       softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type);
937	printf("			= DPL %d, pres %d, def32 %d, gran %d\n",
938	       softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_def32,
939	       softseg.ssd_gran);
940	printf("processor eflags	= ");
941	if (frame->tf_eflags & PSL_T)
942		printf("trace trap, ");
943	if (frame->tf_eflags & PSL_I)
944		printf("interrupt enabled, ");
945	if (frame->tf_eflags & PSL_NT)
946		printf("nested task, ");
947	if (frame->tf_eflags & PSL_RF)
948		printf("resume, ");
949	if (frame->tf_eflags & PSL_VM)
950		printf("vm86, ");
951	printf("IOPL = %d\n", (frame->tf_eflags & PSL_IOPL) >> 12);
952	printf("current process		= %d (%s)\n",
953	    curproc->p_pid, curthread->td_name);
954
955#ifdef KDB
956	if (debugger_on_trap) {
957		kdb_why = KDB_WHY_TRAP;
958		frame->tf_err = eva;	/* smuggle fault address to ddb */
959		handled = kdb_trap(type, 0, frame);
960		frame->tf_err = code;	/* restore error code */
961		kdb_why = KDB_WHY_UNSET;
962		if (handled)
963			return;
964	}
965#endif
966	printf("trap number		= %d\n", type);
967	if (trap_msg(type) != NULL)
968		panic("%s", trap_msg(type));
969	else
970		panic("unknown/reserved trap");
971}
972
973#ifdef KDTRACE_HOOKS
974/*
975 * Invoke a userspace DTrace hook.  The hook pointer is cleared when no
976 * userspace probes are enabled, so we must synchronize with DTrace to ensure
977 * that a trapping thread is able to call the hook before it is cleared.
978 */
979static bool
980trap_user_dtrace(struct trapframe *frame, int (**hookp)(struct trapframe *))
981{
982	int (*hook)(struct trapframe *);
983
984	hook = (int (*)(struct trapframe *))atomic_load_ptr(hookp);
985	enable_intr();
986	if (hook != NULL)
987		return ((hook)(frame) == 0);
988	return (false);
989}
990#endif
991
/*
 * Double fault handler. Called when a fault occurs while writing
 * a frame for a trap/exception onto the stack. This usually occurs
 * when the stack overflows (such is the case with infinite recursion,
 * for example).
 *
 * The handler prints the %eip, %esp and %ebp values recorded in the
 * per-CPU common TSS and then panics; it does not return.
 *
 * XXX Note that the current PTD gets replaced by IdlePTD when the
 * task switch occurs. This means that the stack that was active at
 * the time of the double fault is not available at <kstack> unless
 * the machine was idle when the double fault occurred. The downside
 * of this is that "trace <ebp>" in ddb won't work.
 */
void
dblfault_handler(void)
{
#ifdef KDTRACE_HOOKS
	/* Give DTrace's double-trap hook, if registered, the first call. */
	if (dtrace_doubletrap_func != NULL)
		(*dtrace_doubletrap_func)();
#endif
	printf("\nFatal double fault:\n");
	printf("eip = 0x%x\n", PCPU_GET(common_tssp)->tss_eip);
	printf("esp = 0x%x\n", PCPU_GET(common_tssp)->tss_esp);
	printf("ebp = 0x%x\n", PCPU_GET(common_tssp)->tss_ebp);
#ifdef SMP
	/* two separate prints in case of a trap on an unmapped page */
	printf("cpuid = %d; ", PCPU_GET(cpuid));
	printf("apic id = %02x\n", PCPU_GET(apic_id));
#endif
	panic("double fault");
}
1022
1023int
1024cpu_fetch_syscall_args(struct thread *td)
1025{
1026	struct proc *p;
1027	struct trapframe *frame;
1028	struct syscall_args *sa;
1029	caddr_t params;
1030	long tmp;
1031	int error;
1032#ifdef COMPAT_43
1033	u_int32_t eip;
1034	int cs;
1035#endif
1036
1037	p = td->td_proc;
1038	frame = td->td_frame;
1039	sa = &td->td_sa;
1040
1041#ifdef COMPAT_43
1042	if (__predict_false(frame->tf_cs == 7 && frame->tf_eip == 2)) {
1043		/*
1044		 * In lcall $7,$0 after int $0x80.  Convert the user
1045		 * frame to what it would be for a direct int 0x80 instead
1046		 * of lcall $7,$0, by popping the lcall return address.
1047		 */
1048		error = fueword32((void *)frame->tf_esp, &eip);
1049		if (error == -1)
1050			return (EFAULT);
1051		cs = fuword16((void *)(frame->tf_esp + sizeof(u_int32_t)));
1052		if (cs == -1)
1053			return (EFAULT);
1054
1055		/*
1056		 * Unwind in-kernel frame after all stack frame pieces
1057		 * were successfully read.
1058		 */
1059		frame->tf_eip = eip;
1060		frame->tf_cs = cs;
1061		frame->tf_esp += 2 * sizeof(u_int32_t);
1062		frame->tf_err = 7;	/* size of lcall $7,$0 */
1063	}
1064#endif
1065
1066	sa->code = frame->tf_eax;
1067	params = (caddr_t)frame->tf_esp + sizeof(uint32_t);
1068
1069	/*
1070	 * Need to check if this is a 32 bit or 64 bit syscall.
1071	 */
1072	if (sa->code == SYS_syscall) {
1073		/*
1074		 * Code is first argument, followed by actual args.
1075		 */
1076		error = fueword(params, &tmp);
1077		if (error == -1)
1078			return (EFAULT);
1079		sa->code = tmp;
1080		params += sizeof(uint32_t);
1081	} else if (sa->code == SYS___syscall) {
1082		/*
1083		 * Like syscall, but code is a quad, so as to maintain
1084		 * quad alignment for the rest of the arguments.
1085		 */
1086		error = fueword(params, &tmp);
1087		if (error == -1)
1088			return (EFAULT);
1089		sa->code = tmp;
1090		params += sizeof(quad_t);
1091	}
1092
1093 	if (p->p_sysent->sv_mask)
1094 		sa->code &= p->p_sysent->sv_mask;
1095 	if (sa->code >= p->p_sysent->sv_size)
1096 		sa->callp = &p->p_sysent->sv_table[0];
1097  	else
1098 		sa->callp = &p->p_sysent->sv_table[sa->code];
1099	sa->narg = sa->callp->sy_narg;
1100
1101	if (params != NULL && sa->narg != 0)
1102		error = copyin(params, (caddr_t)sa->args,
1103		    (u_int)(sa->narg * sizeof(uint32_t)));
1104	else
1105		error = 0;
1106
1107	if (error == 0) {
1108		td->td_retval[0] = 0;
1109		td->td_retval[1] = frame->tf_edx;
1110	}
1111
1112	return (error);
1113}
1114
1115#include "../../kern/subr_syscall.c"
1116
1117/*
1118 * syscall - system call request C handler.  A system call is
1119 * essentially treated as a trap by reusing the frame layout.
1120 */
1121void
1122syscall(struct trapframe *frame)
1123{
1124	struct thread *td;
1125	register_t orig_tf_eflags;
1126	ksiginfo_t ksi;
1127
1128#ifdef DIAGNOSTIC
1129	if (!(TRAPF_USERMODE(frame) &&
1130	    (curpcb->pcb_flags & PCB_VM86CALL) == 0)) {
1131		panic("syscall");
1132		/* NOT REACHED */
1133	}
1134#endif
1135	orig_tf_eflags = frame->tf_eflags;
1136
1137	td = curthread;
1138	td->td_frame = frame;
1139
1140	syscallenter(td);
1141
1142	/*
1143	 * Traced syscall.
1144	 */
1145	if ((orig_tf_eflags & PSL_T) && !(orig_tf_eflags & PSL_VM)) {
1146		frame->tf_eflags &= ~PSL_T;
1147		ksiginfo_init_trap(&ksi);
1148		ksi.ksi_signo = SIGTRAP;
1149		ksi.ksi_code = TRAP_TRACE;
1150		ksi.ksi_addr = (void *)frame->tf_eip;
1151		trapsignal(td, &ksi);
1152	}
1153
1154	KASSERT(PCB_USER_FPU(td->td_pcb),
1155	    ("System call %s returning with kernel FPU ctx leaked",
1156	     syscallname(td->td_proc, td->td_sa.code)));
1157	KASSERT(td->td_pcb->pcb_save == get_pcb_user_save_td(td),
1158	    ("System call %s returning with mangled pcb_save",
1159	     syscallname(td->td_proc, td->td_sa.code)));
1160
1161	syscallret(td);
1162}
1163