1
2/*	$NetBSD: trap.c,v 1.309 2023/10/05 19:41:04 ad Exp $	*/
3
4/*-
5 * Copyright (c) 1998, 2000, 2005, 2006, 2007, 2008 The NetBSD Foundation, Inc.
6 * All rights reserved.
7 *
8 * This code is derived from software contributed to The NetBSD Foundation
9 * by Charles M. Hannum.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 *    notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 *    notice, this list of conditions and the following disclaimer in the
18 *    documentation and/or other materials provided with the distribution.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
24 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 * POSSIBILITY OF SUCH DAMAGE.
31 */
32
33/*-
34 * Copyright (c) 1990 The Regents of the University of California.
35 * All rights reserved.
36 *
37 * This code is derived from software contributed to Berkeley by
38 * the University of Utah, and William Jolitz.
39 *
40 * Redistribution and use in source and binary forms, with or without
41 * modification, are permitted provided that the following conditions
42 * are met:
43 * 1. Redistributions of source code must retain the above copyright
44 *    notice, this list of conditions and the following disclaimer.
45 * 2. Redistributions in binary form must reproduce the above copyright
46 *    notice, this list of conditions and the following disclaimer in the
47 *    documentation and/or other materials provided with the distribution.
48 * 3. Neither the name of the University nor the names of its contributors
49 *    may be used to endorse or promote products derived from this software
50 *    without specific prior written permission.
51 *
52 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
53 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
54 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
55 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
56 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
57 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
58 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
59 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
60 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
61 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
62 * SUCH DAMAGE.
63 *
64 *	@(#)trap.c	7.4 (Berkeley) 5/13/91
65 */
66
67/*
68 * 386 Trap and System call handling
69 */
70
71#include <sys/cdefs.h>
72__KERNEL_RCSID(0, "$NetBSD: trap.c,v 1.309 2023/10/05 19:41:04 ad Exp $");
73
74#include "opt_ddb.h"
75#include "opt_kgdb.h"
76#include "opt_lockdebug.h"
77#include "opt_multiprocessor.h"
78#include "opt_xen.h"
79#include "opt_dtrace.h"
80#include "opt_compat_netbsd.h"
81
82#include <sys/param.h>
83#include <sys/systm.h>
84#include <sys/proc.h>
85#include <sys/acct.h>
86#include <sys/kauth.h>
87#include <sys/kernel.h>
88#include <sys/kmem.h>
89#include <sys/ras.h>
90#include <sys/signal.h>
91#include <sys/syscall.h>
92#include <sys/cpu.h>
93#include <sys/ucontext.h>
94
95#include <uvm/uvm_extern.h>
96
97#include <machine/cpufunc.h>
98#include <machine/psl.h>
99#include <machine/reg.h>
100#include <machine/trap.h>
101#include <machine/userret.h>
102#include <machine/db_machdep.h>
103#include <machine/pmap_private.h>
104
105#include "mca.h"
106#if NMCA > 0
107#include <machine/mca_machdep.h>
108#endif
109
110#include <x86/dbregs.h>
111#include <x86/nmi.h>
112
113#include "isa.h"
114
115#include <sys/kgdb.h>
116
117#ifdef KDTRACE_HOOKS
118#include <sys/dtrace_bsd.h>
119
120/*
121 * This is a hook which is initialized by the dtrace module
122 * to handle traps which might occur during DTrace probe
123 * execution.
124 */
125dtrace_trap_func_t	dtrace_trap_func = NULL;
126
127dtrace_doubletrap_func_t	dtrace_doubletrap_func = NULL;
128#endif
129
130void trap(struct trapframe *);
131void trap_tss(struct i386tss *, int, int);
132void trap_return_fault_return(struct trapframe *) __dead;
133#ifndef XENPV
134int ss_shadow(struct trapframe *tf);
135#endif
136
/*
 * Human-readable names for the machine trap types, indexed by trap
 * number (the T_* value noted beside each entry).  trap_types holds the
 * number of entries so that code indexing the table can range-check a
 * trap number first.
 */
const char * const trap_type[] = {
	"privileged instruction fault",		/*  0 T_PRIVINFLT */
	"breakpoint trap",			/*  1 T_BPTFLT */
	"arithmetic trap",			/*  2 T_ARITHTRAP */
	"asynchronous system trap",		/*  3 T_ASTFLT */
	"protection fault",			/*  4 T_PROTFLT */
	"trace trap",				/*  5 T_TRCTRAP */
	"page fault",				/*  6 T_PAGEFLT */
	"alignment fault",			/*  7 T_ALIGNFLT */
	"integer divide fault",			/*  8 T_DIVIDE */
	"non-maskable interrupt",		/*  9 T_NMI */
	"overflow trap",			/* 10 T_OFLOW */
	"bounds check fault",			/* 11 T_BOUND */
	"FPU not available fault",		/* 12 T_DNA */
	"double fault",				/* 13 T_DOUBLEFLT */
	"FPU operand fetch fault",		/* 14 T_FPOPFLT */
	"invalid TSS fault",			/* 15 T_TSSFLT */
	"segment not present fault",		/* 16 T_SEGNPFLT */
	"stack fault",				/* 17 T_STKFLT */
	"machine check fault",			/* 18 T_MCA */
	"SSE FP exception",			/* 19 T_XMM */
	"reserved trap",			/* 20 T_RESERVED */
};
int	trap_types = __arraycount(trap_type);
161
162#ifdef DEBUG
163int	trapdebug = 0;
164#endif
165
166#define	IDTVEC(name)	__CONCAT(X, name)
167
168#ifdef TRAP_SIGDEBUG
169static void sigdebug(const struct trapframe *, const ksiginfo_t *, int);
170#define SIGDEBUG(a, b, c) sigdebug(a, b, c)
171#else
172#define SIGDEBUG(a, b, c)
173#endif
174
175void
176trap_tss(struct i386tss *tss, int trapno, int code)
177{
178	struct trapframe tf;
179
180	tf.tf_gs = tss->tss_gs;
181	tf.tf_fs = tss->tss_fs;
182	tf.tf_es = tss->__tss_es;
183	tf.tf_ds = tss->__tss_ds;
184	tf.tf_edi = tss->__tss_edi;
185	tf.tf_esi = tss->__tss_esi;
186	tf.tf_ebp = tss->tss_ebp;
187	tf.tf_ebx = tss->__tss_ebx;
188	tf.tf_edx = tss->__tss_edx;
189	tf.tf_ecx = tss->__tss_ecx;
190	tf.tf_eax = tss->__tss_eax;
191	tf.tf_trapno = trapno;
192	tf.tf_err = code | TC_TSS;
193	tf.tf_eip = tss->__tss_eip;
194	tf.tf_cs = tss->__tss_cs;
195	tf.tf_eflags = tss->__tss_eflags;
196	tf.tf_esp = tss->tss_esp;
197	tf.tf_ss = tss->__tss_ss;
198	trap(&tf);
199}
200
201static void *
202onfault_handler(const struct pcb *pcb, const struct trapframe *tf)
203{
204	struct onfault_table {
205		uintptr_t start;
206		uintptr_t end;
207		void *handler;
208	};
209	extern const struct onfault_table onfault_table[];
210	const struct onfault_table *p;
211	uintptr_t pc;
212
213	if (pcb->pcb_onfault != NULL) {
214		return pcb->pcb_onfault;
215	}
216
217	pc = tf->tf_eip;
218	for (p = onfault_table; p->start; p++) {
219		if (p->start <= pc && pc < p->end) {
220			return p->handler;
221		}
222	}
223	return NULL;
224}
225
226static void
227trap_print(const struct trapframe *frame, const lwp_t *l)
228{
229	const int type = frame->tf_trapno;
230
231	if (frame->tf_trapno < trap_types) {
232		printf("fatal %s", trap_type[type]);
233	} else {
234		printf("unknown trap %d", type);
235	}
236	printf(" in %s mode\n", (type & T_USER) ? "user" : "supervisor");
237
238	printf("trap type %d code %#x eip %#x cs %#x eflags %#x cr2 %#lx "
239	    "ilevel %#x esp %#x\n",
240	    type, frame->tf_err, frame->tf_eip, frame->tf_cs, frame->tf_eflags,
241	    (long)rcr2(), curcpu()->ci_ilevel, frame->tf_esp);
242
243	printf("curlwp %p pid %d lid %d lowest kstack %p\n",
244	    l, l->l_proc->p_pid, l->l_lid, KSTACK_LOWEST_ADDR(l));
245}
246
247#ifndef XENPV
248int
249ss_shadow(struct trapframe *tf)
250{
251	struct gate_descriptor *gd;
252	struct cpu_info *ci;
253	struct idt_vec *iv;
254	idt_descriptor_t *idt;
255	uintptr_t eip, func;
256	size_t i;
257
258	eip = tf->tf_eip;
259	ci = curcpu();
260	iv = idt_vec_ref(&ci->ci_idtvec);
261	idt = iv->iv_idt;
262
263	for (i = 0; i < 256; i++) {
264		gd = &idt[i];
265		func = (gd->gd_hioffset << 16) | gd->gd_looffset;
266		if (eip == func)
267			return 1;
268	}
269
270	return 0;
271}
272#endif
273
/*
 * trap(frame): exception, fault, and trap interface to BSD kernel.
 *
 * This common code is called from assembly language IDT gate entry routines
 * that prepare a suitable stack frame, and restore this frame after the
 * exception has been processed. Note that the effect is as if the arguments
 * were passed call by reference.
 *
 * Outline:
 *  - The trap number from the frame is copied into the local 'type'.  If
 *    the trap came from user mode (and is not an NMI) T_USER is or'ed in,
 *    so the switch below has distinct cases for kernel and user traps.
 *  - With KDTRACE_HOOKS, kernel-mode protection and page faults are first
 *    offered to the DTrace trap hook.
 *  - Traps that cannot be handled end up at 'we_re_toast', which prints
 *    the trap, offers it to the kernel debuggers and otherwise panics.
 *  - Kernel-mode faults that have a recovery handler (see
 *    onfault_handler()) are resumed at that handler with the error code
 *    in %eax.
 *  - User-mode traps either post a signal through the emulation's
 *    e_trapsignal hook or are resolved silently, and leave via userret().
 */
void
trap(struct trapframe *frame)
{
	struct lwp *l = curlwp;
	struct proc *p;
	struct pcb *pcb;
	extern char kcopy_fault[], return_address_fault[];
	struct trapframe *vframe;
	ksiginfo_t ksi;
	void *onfault;
	int type, error = 0;
	uint32_t cr2;
	bool pfail;

	if (__predict_true(l != NULL)) {
		pcb = lwp_getpcb(l);
		p = l->l_proc;
	} else {
		/*
		 * This can happen e.g. on breakpoints early in boot.
		 */
		pcb = NULL;
		p = NULL;
	}
	type = frame->tf_trapno;

#ifdef DEBUG
	if (trapdebug) {
		trap_print(frame, l);
	}
#endif
	/*
	 * Trap from user mode: tag the type with T_USER, remember this frame
	 * as the LWP's user register frame and reset the saved fault address.
	 * NMIs are never tagged, so the single T_NMI case below handles them
	 * whichever mode was interrupted.
	 */
	if (type != T_NMI && !KERNELMODE(frame->tf_cs)) {
		type |= T_USER;
		l->l_md.md_regs = frame;
		pcb->pcb_cr2 = 0;
	}

#ifdef KDTRACE_HOOKS
	/*
	 * A trap can occur while DTrace executes a probe. Before
	 * executing the probe, DTrace blocks re-scheduling and sets
	 * a flag in its per-cpu flags to indicate that it doesn't
	 * want to fault. On returning from the probe, the no-fault
	 * flag is cleared and finally re-scheduling is enabled.
	 *
	 * If the DTrace kernel module has registered a trap handler,
	 * call it and if it returns non-zero, assume that it has
	 * handled the trap and modified the trap frame so that this
	 * function can return normally.
	 */
	if ((type == T_PROTFLT || type == T_PAGEFLT) &&
	    dtrace_trap_func != NULL) {
		if ((*dtrace_trap_func)(frame, type)) {
			return;
		}
	}
#endif

	switch (type) {

	default:
	we_re_toast:
		/*
		 * Unhandled trap type, or a handled one that turned out to
		 * be unrecoverable: report it, give the debuggers a chance
		 * to take it, and panic if neither does.
		 */
		trap_print(frame, l);

		if (kdb_trap(type, 0, frame))
			return;
		if (kgdb_trap(type, frame))
			return;
		/*
		 * If this is a breakpoint, don't panic if we're not connected.
		 */
		if (type == T_BPTFLT && kgdb_disconnected()) {
			printf("kgdb: ignored %s\n", trap_type[type]);
			return;
		}
		panic("trap");
		/*NOTREACHED*/

	case T_PROTFLT:
	case T_SEGNPFLT:
	case T_ALIGNFLT:
	case T_STKFLT:
	case T_TSSFLT:
		if (p == NULL)
			goto we_re_toast;
		/* Check for copyin/copyout fault. */
		onfault = onfault_handler(pcb, frame);
		if (onfault != NULL) {
			/*
			 * Resume at the recovery handler, which receives the
			 * error in %eax.  Entered at 'copyefault' to force
			 * EFAULT, or at 'copyfault' with 'error' already set.
			 */
copyefault:
			error = EFAULT;
copyfault:
			frame->tf_eip = (uintptr_t)onfault;
			frame->tf_eax = error;
			return;
		}

		/*
		 * Check for failure during return to user mode.
		 * This can happen loading invalid values into the segment
		 * registers, or during the 'iret' itself.
		 *
		 * We do this by looking at the instruction we faulted on.
		 * The specific instructions we recognize only happen when
		 * returning from a trap, syscall, or interrupt.
		 */

kernelfault:
		KSI_INIT_TRAP(&ksi);
		ksi.ksi_signo = SIGSEGV;
		ksi.ksi_code = SEGV_ACCERR;
		ksi.ksi_trap = type;

		switch (*(u_char *)frame->tf_eip) {
		case 0xcf:	/* iret */
			/*
			 * The 'iret' instruction faulted, so we have the
			 * 'user' registers saved after the kernel %eip:%cs:%fl
			 * of the 'iret' and below that the user %eip:%cs:%fl
			 * the 'iret' was processing.
			 * We must delete the 3 words of kernel return address
			 * from the stack to generate a normal stack frame
			 * (eg for sending a SIGSEGV).
			 */
			vframe = (void *)((int *)frame + 3);
			if (KERNELMODE(vframe->tf_cs))
				goto we_re_toast;
			memmove(vframe, frame,
			    offsetof(struct trapframe, tf_eip));
			/* Set the faulting address to the user %eip */
			ksi.ksi_addr = (void *)vframe->tf_eip;
			break;
		case 0x8e:
			switch (*(uint32_t *)frame->tf_eip) {
			case 0x8e242c8e:	/* mov (%esp,%gs), then */
			case 0x0424648e:	/* mov 0x4(%esp),%fs */
			case 0x0824448e:	/* mov 0x8(%esp),%es */
			case 0x0c245c8e:	/* mov 0xc(%esp),%ds */
				break;
			default:
				goto we_re_toast;
			}
			/*
			 * We faulted loading one of the user segment registers.
			 * The stack frame containing the user registers is
			 * still valid and is just below the %eip:%cs:%fl of
			 * the kernel fault frame.
			 */
			vframe = (void *)(&frame->tf_eflags + 1);
			if (KERNELMODE(vframe->tf_cs))
				goto we_re_toast;
			/* There is no valid address for the fault */
			break;
		default:
			goto we_re_toast;
		}
		/*
		 * We might have faulted trying to execute the
		 * trampoline for a local (nested) signal handler.
		 * Only generate SIGSEGV if the user %cs isn't changed.
		 * (This is only strictly necessary in the 'iret' case.)
		 */
		if (!pmap_exec_fixup(&p->p_vmspace->vm_map, vframe, pcb)) {
			/* Save outer frame for any signal return */
			l->l_md.md_regs = vframe;
			SIGDEBUG(vframe, &ksi, error);
			(*p->p_emul->e_trapsignal)(l, &ksi);
		}
		/* Return to user by reloading the user frame */
		trap_return_fault_return(vframe);
		/* NOTREACHED */

	case T_PROTFLT|T_USER:		/* protection fault */
#if defined(COMPAT_10) || defined(COMPAT_NOMID)
	{
#define LCALLSZ 7
		/* Check for the osyscall lcall instruction. */
		if (frame->tf_eip < VM_MAXUSER_ADDRESS - LCALLSZ &&
		    x86_cpu_is_lcall((const void *)frame->tf_eip) == 0) {

			/* Advance past the lcall. */
			frame->tf_eip += LCALLSZ;

			/* Do the syscall. */
			p->p_md.md_syscall(frame);
			goto out;
		}
	}
#endif
		/* FALLTHROUGH */
	case T_TSSFLT|T_USER:
	case T_SEGNPFLT|T_USER:
	case T_STKFLT|T_USER:
	case T_ALIGNFLT|T_USER:
		KSI_INIT_TRAP(&ksi);

		ksi.ksi_addr = (void *)rcr2();
		/* Pick the signal number and code for the specific fault. */
		switch (type) {
		case T_SEGNPFLT|T_USER:
		case T_STKFLT|T_USER:
			ksi.ksi_signo = SIGBUS;
			ksi.ksi_code = BUS_ADRERR;
			break;
		case T_TSSFLT|T_USER:
			ksi.ksi_signo = SIGBUS;
			ksi.ksi_code = BUS_OBJERR;
			break;
		case T_ALIGNFLT|T_USER:
			ksi.ksi_signo = SIGBUS;
			ksi.ksi_code = BUS_ADRALN;
			break;
		case T_PROTFLT|T_USER:
			/*
			 * If pmap_exec_fixup does something,
			 * let's retry the trap.
			 */
			if (pmap_exec_fixup(&p->p_vmspace->vm_map, frame, pcb)){
				goto out;
			}
			ksi.ksi_signo = SIGSEGV;
			ksi.ksi_code = SEGV_ACCERR;
			break;
		default:
			KASSERT(0);
			break;
		}
		goto trapsignal;

	case T_PRIVINFLT|T_USER:	/* privileged instruction fault */
	case T_FPOPFLT|T_USER:		/* coprocessor operand fault */
		KSI_INIT_TRAP(&ksi);
		ksi.ksi_signo = SIGILL;
		ksi.ksi_addr = (void *) frame->tf_eip;
		switch (type) {
		case T_PRIVINFLT|T_USER:
			ksi.ksi_code = ILL_PRVOPC;
			break;
		case T_FPOPFLT|T_USER:
			ksi.ksi_code = ILL_COPROC;
			break;
		default:
			ksi.ksi_code = 0;
			break;
		}
		goto trapsignal;

	case T_ASTFLT|T_USER:
		/* Allow process switch. */
		//curcpu()->ci_data.cpu_nast++;
		if (l->l_pflag & LP_OWEUPC) {
			l->l_pflag &= ~LP_OWEUPC;
			ADDUPROF(l);
		}
		goto out;

	case T_BOUND|T_USER:
	case T_OFLOW|T_USER:
	case T_DIVIDE|T_USER:
		KSI_INIT_TRAP(&ksi);
		ksi.ksi_signo = SIGFPE;
		ksi.ksi_addr = (void *)frame->tf_eip;
		switch (type) {
		case T_BOUND|T_USER:
			ksi.ksi_code = FPE_FLTSUB;
			break;
		case T_OFLOW|T_USER:
			ksi.ksi_code = FPE_INTOVF;
			break;
		case T_DIVIDE|T_USER:
			ksi.ksi_code = FPE_INTDIV;
			break;
		default:
			ksi.ksi_code = 0;
			break;
		}
		goto trapsignal;

	case T_PAGEFLT:
		/* Allow page faults in kernel mode. */
		if (__predict_false(l == NULL))
			goto we_re_toast;

		/*
		 * When the registered handler is return_address_fault, fail
		 * at once with EFAULT without trying to service the fault.
		 */
		onfault = pcb->pcb_onfault;
		if (onfault == return_address_fault) {
			goto copyefault;
		}
		/*
		 * Page faults taken in interrupt context (cpu_intr_p()) or
		 * by an LWP flagged LP_INTR are fatal.
		 */
		if (cpu_intr_p() || (l->l_pflag & LP_INTR) != 0) {
			goto we_re_toast;
		}

		cr2 = rcr2();

		if (frame->tf_err & PGEX_I) {
			/* SMEP might have brought us here */
			if (cr2 > VM_MIN_ADDRESS && cr2 <= VM_MAXUSER_ADDRESS) {
				printf("prevented execution of %p (SMEP)\n",
				    (void *)cr2);
				goto we_re_toast;
			}
		}

		if ((frame->tf_err & PGEX_P) &&
		    cr2 < VM_MAXUSER_ADDRESS) {
			/* SMAP might have brought us here */
			if (onfault_handler(pcb, frame) == NULL) {
				printf("prevented access to %p (SMAP)\n",
				    (void *)cr2);
				goto we_re_toast;
			}
		}

		goto faultcommon;

	case T_PAGEFLT|T_USER: {	/* page fault */
		register vaddr_t va;
		register struct vmspace *vm;
		register struct vm_map *map;
		vm_prot_t ftype;
		extern struct vm_map *kernel_map;

		cr2 = rcr2();
		/* The kernel-mode T_PAGEFLT case joins here with cr2 set. */
faultcommon:
		vm = p->p_vmspace;
		if (__predict_false(vm == NULL)) {
			goto we_re_toast;
		}
		pcb->pcb_cr2 = cr2;
		va = trunc_page((vaddr_t)cr2);
		/*
		 * It is only a kernel address space fault iff:
		 *	1. (type & T_USER) == 0  and
		 *	2. pcb_onfault not set or
		 *	3. pcb_onfault set but supervisor space fault
		 * The last can occur during an exec() copyin where the
		 * argument space is lazy-allocated.
		 */
		if (type == T_PAGEFLT && va >= VM_MIN_KERNEL_ADDRESS)
			map = kernel_map;
		else
			map = &vm->vm_map;
		/* Derive the access type from the page fault error code. */
		if (frame->tf_err & PGEX_W)
			ftype = VM_PROT_WRITE;
		else if (frame->tf_err & PGEX_I)
			ftype = VM_PROT_EXECUTE;
		else
			ftype = VM_PROT_READ;

#ifdef DIAGNOSTIC
		if (map == kernel_map && va == 0) {
			printf("trap: bad kernel access at %lx\n", va);
			goto we_re_toast;
		}
#endif
		/*
		 * Fault the original page in.  pcb_onfault is cleared while
		 * uvm_fault() runs and put back afterwards.
		 */
		onfault = pcb->pcb_onfault;
		pcb->pcb_onfault = NULL;
		error = uvm_fault(map, va, ftype);
		pcb->pcb_onfault = onfault;
		if (error == 0) {
			if (map != kernel_map && (void *)va >= vm->vm_maxsaddr)
				uvm_grow(p, va);

			pfail = false;
			/*
			 * Only kernel-mode faults enter this loop; for a
			 * user-mode fault the condition is false at once and
			 * control goes straight to 'out'.
			 */
			while (type == T_PAGEFLT) {
				/*
				 * we need to switch pmap now if we're in
				 * the middle of copyin/out.
				 *
				 * but we don't need to do so for kcopy as
				 * it never touches userspace.
				 */
				kpreempt_disable();
				if (curcpu()->ci_want_pmapload) {
					onfault = onfault_handler(pcb, frame);
					if (onfault != kcopy_fault) {
						pmap_load();
					}
				}
				/*
				 * We need to keep the pmap loaded and
				 * so avoid being preempted until back
				 * into the copy functions.  Disable
				 * interrupts at the hardware level before
				 * re-enabling preemption.  Interrupts
				 * will be re-enabled by 'iret' when
				 * returning back out of the trap stub.
				 * They'll only be re-enabled when the
				 * program counter is once again in
				 * the copy functions, and so visible
				 * to cpu_kpreempt_exit().
				 */
#ifndef XENPV
				x86_disable_intr();
#endif
				l->l_nopreempt--;
				if (l->l_nopreempt > 0 || !l->l_dopreempt ||
				    pfail) {
					return;
				}
#ifndef XENPV
				x86_enable_intr();
#endif
				/*
				 * If preemption fails for some reason,
				 * don't retry it.  The conditions won't
				 * change under our nose.
				 */
				pfail = kpreempt(0);
			}
			goto out;
		}

		/*
		 * uvm_fault() failed.  In kernel mode either resume at the
		 * recovery handler with the uvm_fault() error, or treat it
		 * as a possible fault on the return-to-user path.
		 */
		if (type == T_PAGEFLT) {
			onfault = onfault_handler(pcb, frame);
			if (onfault != NULL)
				goto copyfault;
			printf("uvm_fault(%p, %#lx, %d) -> %#x\n",
			    map, va, ftype, error);
			goto kernelfault;
		}

		/* User mode: translate the uvm_fault() error into a signal. */
		KSI_INIT_TRAP(&ksi);
		ksi.ksi_trap = type & ~T_USER;
		ksi.ksi_addr = (void *)cr2;
		switch (error) {
		case EINVAL:
			ksi.ksi_signo = SIGBUS;
			ksi.ksi_code = BUS_ADRERR;
			break;
		case EACCES:
			ksi.ksi_signo = SIGSEGV;
			ksi.ksi_code = SEGV_ACCERR;
			error = EFAULT;
			break;
		case ENOMEM:
			ksi.ksi_signo = SIGKILL;
			printf("UVM: pid %d.%d (%s), uid %d killed: "
			    "out of swap\n", p->p_pid, l->l_lid, p->p_comm,
			    l->l_cred ?  kauth_cred_geteuid(l->l_cred) : -1);
			break;
		default:
			ksi.ksi_signo = SIGSEGV;
			ksi.ksi_code = SEGV_MAPERR;
			break;
		}

		SIGDEBUG(frame, &ksi, error);
		(*p->p_emul->e_trapsignal)(l, &ksi);
		break;
	}

	case T_TRCTRAP:
		/*
		 * Ignore debug register trace traps due to
		 * accesses in the user's address space, which
		 * can happen under several conditions such as
		 * if a user sets a watchpoint on a buffer and
		 * then passes that buffer to a system call.
		 * We still want to get TRCTRAPS for addresses
		 * in kernel space because that is useful when
		 * debugging the kernel.
		 */
		if (x86_dbregs_user_trap())
			break;

		goto we_re_toast;

	case T_BPTFLT|T_USER:		/* bpt instruction fault */
	case T_TRCTRAP|T_USER:		/* trace trap */
		/*
		 * Don't go single-stepping into a RAS.
		 */
		if (p->p_raslist == NULL ||
		    (ras_lookup(p, (void *)frame->tf_eip) == (void *)-1)) {
			KSI_INIT_TRAP(&ksi);
			ksi.ksi_signo = SIGTRAP;
			ksi.ksi_trap = type & ~T_USER;
			if (x86_dbregs_user_trap()) {
				x86_dbregs_store_dr6(l);
				ksi.ksi_code = TRAP_DBREG;
			} else if (type == (T_BPTFLT|T_USER))
				ksi.ksi_code = TRAP_BRKPT;
			else
				ksi.ksi_code = TRAP_TRACE;
			ksi.ksi_addr = (void *)frame->tf_eip;
			SIGDEBUG(frame, &ksi, error);
			(*p->p_emul->e_trapsignal)(l, &ksi);
		}
		break;

	case T_NMI:
		if (nmi_dispatch(frame))
			return;
		/* NMI can be hooked up to a pushbutton for debugging */
		if (kgdb_trap(type, frame))
			return;
		if (kdb_trap(type, 0, frame))
			return;
		/* machine/parity/power fail/"kitchen sink" faults */
#if NMCA > 0
		mca_nmi();
#endif
		x86_nmi();
	}

	/*
	 * Cases that leave the switch normally land here: kernel-mode
	 * traps return directly, user-mode traps pass through userret().
	 */
	if ((type & T_USER) == 0)
		return;
out:
	userret(l);
	return;
	/*
	 * Common tail for the user-mode cases that filled in 'ksi': record
	 * the trap number, post the signal and return to user mode.
	 */
trapsignal:
	ksi.ksi_trap = type & ~T_USER;
	SIGDEBUG(frame, &ksi, error);
	(*p->p_emul->e_trapsignal)(l, &ksi);
	userret(l);
}
797
798/*
799 * startlwp: start of a new LWP.
800 */
801void
802startlwp(void *arg)
803{
804	ucontext_t *uc = arg;
805	lwp_t *l = curlwp;
806	int error __diagused;
807
808	error = cpu_setmcontext(l, &uc->uc_mcontext, uc->uc_flags);
809	KASSERT(error == 0);
810
811	kmem_free(uc, sizeof(ucontext_t));
812	userret(l);
813}
814
815#ifdef TRAP_SIGDEBUG
816static void
817frame_dump(const struct trapframe *tf, const struct pcb *pcb)
818{
819	uint64_t fsd, gsd;
820
821	printf("trapframe %p\n", tf);
822	printf("eip 0x%08x  esp 0x%08x  efl 0x%08x\n",
823	    tf->tf_eip, tf->tf_esp, tf->tf_eflags);
824	printf("edi 0x%08x  esi 0x%08x  edx 0x%08x\n",
825	    tf->tf_edi, tf->tf_esi, tf->tf_edx);
826	printf("ecx 0x%08x\n",
827	    tf->tf_ecx);
828	printf("ebp 0x%08x  ebx 0x%08x  eax 0x%08x\n",
829	    tf->tf_ebp, tf->tf_ebx, tf->tf_eax);
830	printf("cs 0x%04x  ds 0x%04x  es 0x%04x  "
831	       "fs 0x%04x  gs 0x%04x  ss 0x%04x\n",
832		tf->tf_cs & 0xffff, tf->tf_ds & 0xffff, tf->tf_es & 0xffff,
833		tf->tf_fs & 0xffff, tf->tf_gs & 0xffff, tf->tf_ss & 0xffff);
834	memcpy(&fsd, &pcb->pcb_fsd, sizeof(fsd));
835	memcpy(&gsd, &pcb->pcb_gsd, sizeof(gsd));
836	printf("fsbase 0x%016llx gsbase 0x%016llx\n", fsd, gsd);
837	printf("\n");
838	hexdump(printf, "Stack dump", tf, 256);
839}
840
841static void
842sigdebug(const struct trapframe *tf, const ksiginfo_t *ksi, int e)
843{
844	struct lwp *l = curlwp;
845	struct proc *p = l->l_proc;
846
847	printf("pid %d.%d (%s): signal %d code=%d (trap %x) "
848	    "@eip %#x addr %#x error=%d\n",
849	    p->p_pid, l->l_lid, p->p_comm, ksi->ksi_signo, ksi->ksi_code,
850	    tf->tf_trapno, tf->tf_eip, rcr2(), e);
851	frame_dump(tf, lwp_getpcb(l));
852}
853#endif
854