subr_syscall.c revision 78946
/*-
 * Copyright (C) 1994, David Greenman
 * Copyright (c) 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the University of Utah, and William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)trap.c	7.4 (Berkeley) 5/13/91
 * $FreeBSD: head/sys/kern/subr_trap.c 78946 2001-06-29 04:18:10Z jhb $
 */

/*
 * 386 Trap and System call handling
 */

#include "opt_clock.h"
#include "opt_cpu.h"
#include "opt_ddb.h"
#include "opt_isa.h"
#include "opt_ktrace.h"
#include "opt_npx.h"
#include "opt_trap.h"

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/pioctl.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/mutex.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/syscall.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/uio.h>
#include <sys/vmmeter.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <sys/lock.h>
#include <vm/pmap.h>
#include <vm/vm_kern.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_extern.h>

#include <machine/cpu.h>
#include <machine/md_var.h>
#include <machine/pcb.h>
#ifdef SMP
#include <machine/smp.h>
#endif
#include <machine/tss.h>

#include <i386/isa/icu.h>
#include <i386/isa/intr_machdep.h>

#ifdef POWERFAIL_NMI
#include <sys/syslog.h>
#include <machine/clock.h>
#endif

#include <machine/vm86.h>

#include <ddb/ddb.h>

int (*pmath_emulate) __P((struct trapframe *));

extern void trap __P((struct trapframe frame));
extern int trapwrite __P((unsigned addr));
extern void syscall __P((struct trapframe frame));
extern void ast __P((struct trapframe *framep));

static int trap_pfault __P((struct trapframe *, int, vm_offset_t));
static void trap_fatal __P((struct trapframe *, vm_offset_t));
void dblfault_handler __P((void));

extern inthand_t IDTVEC(lcall_syscall);

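/*
 * Messages for the trap numbers (T_*) defined in <machine/trap.h>;
 * trap_msg is indexed by trap type, and MAX_TRAP_MSG must match the
 * last entry in the table.
 */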
#define MAX_TRAP_MSG		28
static char *trap_msg[] = {
	"",					/*  0 unused */
	"privileged instruction fault",		/*  1 T_PRIVINFLT */
	"",					/*  2 unused */
	"breakpoint instruction fault",		/*  3 T_BPTFLT */
	"",					/*  4 unused */
	"",					/*  5 unused */
	"arithmetic trap",			/*  6 T_ARITHTRAP */
	"",					/*  7 unused */
	"",					/*  8 unused */
	"general protection fault",		/*  9 T_PROTFLT */
	"trace trap",				/* 10 T_TRCTRAP */
	"",					/* 11 unused */
	"page fault",				/* 12 T_PAGEFLT */
	"",					/* 13 unused */
	"alignment fault",			/* 14 T_ALIGNFLT */
	"",					/* 15 unused */
	"",					/* 16 unused */
	"",					/* 17 unused */
	"integer divide fault",			/* 18 T_DIVIDE */
	"non-maskable interrupt trap",		/* 19 T_NMI */
	"overflow trap",			/* 20 T_OFLOW */
	"FPU bounds check fault",		/* 21 T_BOUND */
	"FPU device not available",		/* 22 T_DNA */
	"double fault",				/* 23 T_DOUBLEFLT */
	"FPU operand fetch fault",		/* 24 T_FPOPFLT */
	"invalid TSS fault",			/* 25 T_TSSFLT */
	"segment not present fault",		/* 26 T_SEGNPFLT */
	"stack fault",				/* 27 T_STKFLT */
	"machine check trap",			/* 28 T_MCHK */
};

#if defined(I586_CPU) && !defined(NO_F00F_HACK)
extern int has_f00f_bug;
#endif

#ifdef DDB
static int ddb_on_nmi = 1;
SYSCTL_INT(_machdep, OID_AUTO, ddb_on_nmi, CTLFLAG_RW,
	&ddb_on_nmi, 0, "Go to DDB on NMI");
#endif
static int panic_on_nmi = 1;
SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RW,
	&panic_on_nmi, 0, "Panic on NMI");

#ifdef WITNESS
extern char *syscallnames[];
#endif

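/*
 * userret() is called on each return to user mode: deliver any signals
 * that became pending while in the kernel, drop the priority back to
 * the user-mode level, switch if a reschedule was requested, and charge
 * system time to the profile if the process is being profiled.
 */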
void
userret(p, frame, oticks)
	struct proc *p;
	struct trapframe *frame;
	u_quad_t oticks;
{
	int sig;

	PROC_LOCK(p);
	while ((sig = CURSIG(p)) != 0)
		postsig(sig);

	mtx_lock_spin(&sched_lock);
	PROC_UNLOCK_NOSWITCH(p);
	p->p_pri.pri_level = p->p_pri.pri_user;
	if (resched_wanted(p)) {
		/*
		 * Since we are curproc, clock will normally just change
		 * our priority without moving us from one queue to another
		 * (since the running process is not on a queue.)
		 * If that happened after we setrunqueue ourselves but before we
		 * mi_switch()'ed, we might not be on the queue indicated by
		 * our priority.
		 */
		DROP_GIANT_NOSWITCH();
		setrunqueue(p);
		p->p_stats->p_ru.ru_nivcsw++;
		mi_switch();
		mtx_unlock_spin(&sched_lock);
		PICKUP_GIANT();
		PROC_LOCK(p);
		while ((sig = CURSIG(p)) != 0)
			postsig(sig);
		mtx_lock_spin(&sched_lock);
		PROC_UNLOCK_NOSWITCH(p);
	}

	/*
	 * Charge system time if profiling.
	 */
	if (p->p_sflag & PS_PROFIL) {
		mtx_unlock_spin(&sched_lock);
		addupc_task(p, TRAPF_PC(frame),
			    (u_int)(p->p_sticks - oticks) * psratio);
	} else
		mtx_unlock_spin(&sched_lock);
}

/*
 * Exception, fault, and trap interface to the FreeBSD kernel.
 * This common code is called from assembly language IDT gate entry
 * routines that prepare a suitable stack frame, and restore this
 * frame after the exception has been processed.
 */

void
trap(frame)
	struct trapframe frame;
{
	struct proc *p = curproc;
	u_quad_t sticks = 0;
	int i = 0, ucode = 0, type, code;
	vm_offset_t eva;
#ifdef POWERFAIL_NMI
	static int lastalert = 0;
#endif

	atomic_add_int(&cnt.v_trap, 1);

	if ((frame.tf_eflags & PSL_I) == 0) {
		/*
		 * Buggy application or kernel code has disabled
		 * interrupts and then trapped.  Enabling interrupts
		 * now is wrong, but it is better than running with
		 * interrupts disabled until they are accidentally
		 * enabled later.  XXX This is really bad if we trap
		 * while holding a spin lock.
		 */
		type = frame.tf_trapno;
		if (ISPL(frame.tf_cs) == SEL_UPL || (frame.tf_eflags & PSL_VM))
			printf(
			    "pid %ld (%s): trap %d with interrupts disabled\n",
			    (long)curproc->p_pid, curproc->p_comm, type);
		else if (type != T_BPTFLT && type != T_TRCTRAP) {
			/*
			 * XXX not quite right, since this may be for a
			 * multiple fault in user mode.
			 */
			printf("kernel trap %d with interrupts disabled\n",
			    type);
			/*
			 * We should walk p_heldmtx here and see if any are
			 * spin mutexes, and not do this if so.
			 */
			enable_intr();
		}
	}

	eva = 0;

#if defined(I586_CPU) && !defined(NO_F00F_HACK)
restart:
#endif

	type = frame.tf_trapno;
	code = frame.tf_err;

	if ((ISPL(frame.tf_cs) == SEL_UPL) ||
	    ((frame.tf_eflags & PSL_VM) && !in_vm86call)) {
		/* user trap */

		mtx_lock_spin(&sched_lock);
		sticks = p->p_sticks;
		mtx_unlock_spin(&sched_lock);
		p->p_md.md_regs = &frame;

		switch (type) {
		case T_PRIVINFLT:	/* privileged instruction fault */
			ucode = type;
			i = SIGILL;
			break;

		case T_BPTFLT:		/* bpt instruction fault */
		case T_TRCTRAP:		/* trace trap */
			frame.tf_eflags &= ~PSL_T;
			i = SIGTRAP;
			break;

		case T_ARITHTRAP:	/* arithmetic trap */
#ifdef DEV_NPX
			ucode = npxtrap();
			if (ucode == -1)
				return;
#else
			ucode = code;
#endif
			i = SIGFPE;
			break;

			/*
			 * The following two traps can happen in
			 * vm86 mode, and, if so, we want to handle
			 * them specially.
			 */
		case T_PROTFLT:		/* general protection fault */
		case T_STKFLT:		/* stack fault */
			if (frame.tf_eflags & PSL_VM) {
				mtx_lock(&Giant);
				i = vm86_emulate((struct vm86frame *)&frame);
				mtx_unlock(&Giant);
				if (i == 0)
					goto user;
				break;
			}
			/* FALL THROUGH */

		case T_SEGNPFLT:	/* segment not present fault */
		case T_TSSFLT:		/* invalid TSS fault */
		case T_DOUBLEFLT:	/* double fault */
		default:
			ucode = code + BUS_SEGM_FAULT;
			i = SIGBUS;
			break;

		case T_PAGEFLT:		/* page fault */
			/*
			 * For some Cyrix CPUs, %cr2 is clobbered by
			 * interrupts.  This problem is worked around by using
			 * an interrupt gate for the pagefault handler.  We
			 * are finally ready to read %cr2 and then must
			 * reenable interrupts.
			 */
			eva = rcr2();
			enable_intr();
			mtx_lock(&Giant);
			i = trap_pfault(&frame, TRUE, eva);
			mtx_unlock(&Giant);
#if defined(I586_CPU) && !defined(NO_F00F_HACK)
			if (i == -2) {
				/*
				 * f00f hack workaround has triggered, treat
				 * as illegal instruction not page fault.
				 */
				frame.tf_trapno = T_PRIVINFLT;
				goto restart;
			}
#endif
			if (i == -1)
				goto out;
			if (i == 0)
				goto user;

			ucode = T_PAGEFLT;
			break;

		case T_DIVIDE:		/* integer divide fault */
			ucode = FPE_INTDIV;
			i = SIGFPE;
			break;

#ifdef DEV_ISA
		case T_NMI:
#ifdef POWERFAIL_NMI
#ifndef TIMER_FREQ
#  define TIMER_FREQ 1193182
#endif
			mtx_lock(&Giant);
			if (time_second - lastalert > 10) {
				log(LOG_WARNING, "NMI: power fail\n");
				sysbeep(TIMER_FREQ/880, hz);
				lastalert = time_second;
			}
			mtx_unlock(&Giant);
			goto out;
#else /* !POWERFAIL_NMI */
			/* machine/parity/power fail/"kitchen sink" faults */
			/* XXX Giant */
			if (isa_nmi(code) == 0) {
#ifdef DDB
				/*
				 * NMI can be hooked up to a pushbutton
				 * for debugging.
				 */
				if (ddb_on_nmi) {
					printf ("NMI ... going to debugger\n");
					kdb_trap (type, 0, &frame);
				}
#endif /* DDB */
				goto out;
			} else if (panic_on_nmi)
				panic("NMI indicates hardware failure");
			break;
#endif /* POWERFAIL_NMI */
#endif /* DEV_ISA */

		case T_OFLOW:		/* integer overflow fault */
			ucode = FPE_INTOVF;
			i = SIGFPE;
			break;

		case T_BOUND:		/* bounds check fault */
			ucode = FPE_FLTSUB;
			i = SIGFPE;
			break;

		case T_DNA:
#ifdef DEV_NPX
			/* transparent fault (due to context switch "late") */
			if (npxdna())
				goto out;
#endif
			if (!pmath_emulate) {
				i = SIGFPE;
				ucode = FPE_FPU_NP_TRAP;
				break;
			}
			mtx_lock(&Giant);
			i = (*pmath_emulate)(&frame);
			mtx_unlock(&Giant);
			if (i == 0) {
				if (!(frame.tf_eflags & PSL_T))
					goto out;
				frame.tf_eflags &= ~PSL_T;
				i = SIGTRAP;
			}
			/* else ucode = emulator_only_knows() XXX */
			break;

		case T_FPOPFLT:		/* FPU operand fetch fault */
			ucode = T_FPOPFLT;
			i = SIGILL;
			break;
		}
	} else {
		/* kernel trap */

		switch (type) {
		case T_PAGEFLT:			/* page fault */
			/*
			 * For some Cyrix CPUs, %cr2 is clobbered by
			 * interrupts.  This problem is worked around by using
			 * an interrupt gate for the pagefault handler.  We
			 * are finally ready to read %cr2 and then must
			 * reenable interrupts.
			 */
			eva = rcr2();
			enable_intr();
			mtx_lock(&Giant);
			(void) trap_pfault(&frame, FALSE, eva);
			mtx_unlock(&Giant);
			goto out;

		case T_DNA:
#ifdef DEV_NPX
			/*
			 * The kernel is apparently using npx for copying.
			 * XXX this should be fatal unless the kernel has
			 * registered such use.
			 */
			if (npxdna())
				goto out;
#endif
			break;

			/*
			 * The following two traps can happen in
			 * vm86 mode, and, if so, we want to handle
			 * them specially.
			 */
		case T_PROTFLT:		/* general protection fault */
		case T_STKFLT:		/* stack fault */
			if (frame.tf_eflags & PSL_VM) {
				mtx_lock(&Giant);
				i = vm86_emulate((struct vm86frame *)&frame);
				mtx_unlock(&Giant);
				if (i != 0)
					/*
					 * returns to original process
					 */
					vm86_trap((struct vm86frame *)&frame);
				goto out;
			}
			if (type == T_STKFLT)
				break;

			/* FALL THROUGH */

		case T_SEGNPFLT:	/* segment not present fault */
			if (in_vm86call)
				break;

			if (p->p_intr_nesting_level != 0)
				break;

			/*
			 * Invalid %fs's and %gs's can be created using
			 * procfs or PT_SETREGS or by invalidating the
			 * underlying LDT entry.  This causes a fault
			 * in kernel mode when the kernel attempts to
			 * switch contexts.  Lose the bad context
			 * (XXX) so that we can continue, and generate
			 * a signal.
			 */
			if (frame.tf_eip == (int)cpu_switch_load_gs) {
				PCPU_GET(curpcb)->pcb_gs = 0;
				PROC_LOCK(p);
				psignal(p, SIGBUS);
				PROC_UNLOCK(p);
				goto out;
			}

			/*
			 * Invalid segment selectors and out of bounds
			 * %eip's and %esp's can be set up in user mode.
			 * This causes a fault in kernel mode when the
			 * kernel tries to return to user mode.  We want
			 * to get this fault so that we can fix the
			 * problem here and not have to check all the
			 * selectors and pointers when the user changes
			 * them.
			 */
			if (frame.tf_eip == (int)doreti_iret) {
				frame.tf_eip = (int)doreti_iret_fault;
				goto out;
			}
			if (frame.tf_eip == (int)doreti_popl_ds) {
				frame.tf_eip = (int)doreti_popl_ds_fault;
				goto out;
			}
			if (frame.tf_eip == (int)doreti_popl_es) {
				frame.tf_eip = (int)doreti_popl_es_fault;
				goto out;
			}
			if (frame.tf_eip == (int)doreti_popl_fs) {
				frame.tf_eip = (int)doreti_popl_fs_fault;
				goto out;
			}
			if (PCPU_GET(curpcb) != NULL &&
			    PCPU_GET(curpcb)->pcb_onfault != NULL) {
				frame.tf_eip =
				    (int)PCPU_GET(curpcb)->pcb_onfault;
				goto out;
			}
			break;

		case T_TSSFLT:
			/*
			 * PSL_NT can be set in user mode and isn't cleared
			 * automatically when the kernel is entered.  This
			 * causes a TSS fault when the kernel attempts to
			 * `iret' because the TSS link is uninitialized.  We
			 * want to get this fault so that we can fix the
			 * problem here and not every time the kernel is
			 * entered.
			 */
			if (frame.tf_eflags & PSL_NT) {
				frame.tf_eflags &= ~PSL_NT;
				goto out;
			}
			break;

		case T_TRCTRAP:	 /* trace trap */
			if (frame.tf_eip == (int)IDTVEC(lcall_syscall)) {
				/*
				 * We've just entered system mode via the
				 * syscall lcall.  Continue single stepping
				 * silently until the syscall handler has
				 * saved the flags.
				 */
				goto out;
			}
			if (frame.tf_eip == (int)IDTVEC(lcall_syscall) + 1) {
				/*
				 * The syscall handler has now saved the
				 * flags.  Stop single stepping it.
				 */
				frame.tf_eflags &= ~PSL_T;
				goto out;
			}
			/*
			 * Ignore debug register trace traps due to
			 * accesses in the user's address space, which
			 * can happen under several conditions such as
			 * if a user sets a watchpoint on a buffer and
			 * then passes that buffer to a system call.
			 * We still want to get TRCTRAPS for addresses
			 * in kernel space because that is useful when
			 * debugging the kernel.
			 */
			/* XXX Giant */
			if (user_dbreg_trap() && !in_vm86call) {
				/*
				 * Reset breakpoint bits because the
				 * processor doesn't do it for us.
				 */
				load_dr6(rdr6() & 0xfffffff0);
				goto out;
			}
			/*
			 * Fall through (TRCTRAP kernel mode, kernel address)
			 */
		case T_BPTFLT:
			/*
			 * If DDB is enabled, let it handle the debugger trap.
			 * Otherwise, debugger traps "can't happen".
			 */
#ifdef DDB
			/* XXX Giant */
			if (kdb_trap (type, 0, &frame))
				goto out;
#endif
			break;

#ifdef DEV_ISA
		case T_NMI:
#ifdef POWERFAIL_NMI
			mtx_lock(&Giant);
			if (time_second - lastalert > 10) {
				log(LOG_WARNING, "NMI: power fail\n");
				sysbeep(TIMER_FREQ/880, hz);
				lastalert = time_second;
			}
			mtx_unlock(&Giant);
			goto out;
#else /* !POWERFAIL_NMI */
			/* XXX Giant */
			/* machine/parity/power fail/"kitchen sink" faults */
			if (isa_nmi(code) == 0) {
#ifdef DDB
				/*
				 * NMI can be hooked up to a pushbutton
				 * for debugging.
				 */
				if (ddb_on_nmi) {
					printf ("NMI ... going to debugger\n");
					kdb_trap (type, 0, &frame);
				}
#endif /* DDB */
				goto out;
			} else if (panic_on_nmi == 0)
				goto out;
			/* FALL THROUGH */
#endif /* POWERFAIL_NMI */
#endif /* DEV_ISA */
		}

		trap_fatal(&frame, eva);
		goto out;
	}

	mtx_lock(&Giant);
	/* Translate fault for emulators (e.g. Linux) */
	if (*p->p_sysent->sv_transtrap)
		i = (*p->p_sysent->sv_transtrap)(i, type);

	trapsignal(p, i, ucode);

#ifdef DEBUG
	if (type <= MAX_TRAP_MSG) {
		uprintf("fatal process exception: %s",
			trap_msg[type]);
		if ((type == T_PAGEFLT) || (type == T_PROTFLT))
			uprintf(", fault VA = 0x%lx", (u_long)eva);
		uprintf("\n");
	}
#endif
	mtx_unlock(&Giant);

user:
	userret(p, &frame, sticks);
	if (mtx_owned(&Giant))
		mtx_unlock(&Giant);
out:
	return;
}

#ifdef notyet
/*
 * This version doesn't allow a page fault to user space while
 * in the kernel. The rest of the kernel needs to be made "safe"
 * before this can be used. I think the only things remaining
 * to be made safe are the iBCS2 code and the process tracing/
 * debugging code.
 */
static int
trap_pfault(frame, usermode, eva)
	struct trapframe *frame;
	int usermode;
	vm_offset_t eva;
{
	vm_offset_t va;
	struct vmspace *vm = NULL;
	vm_map_t map = 0;
	int rv = 0;
	vm_prot_t ftype;
	struct proc *p = curproc;

	if (frame->tf_err & PGEX_W)
		ftype = VM_PROT_WRITE;
	else
		ftype = VM_PROT_READ;

	va = trunc_page(eva);
	if (va < VM_MIN_KERNEL_ADDRESS) {
		vm_offset_t v;
		vm_page_t mpte;

		if (p == NULL ||
		    (!usermode && va < VM_MAXUSER_ADDRESS &&
		     (p->p_intr_nesting_level != 0 ||
		      PCPU_GET(curpcb) == NULL ||
		      PCPU_GET(curpcb)->pcb_onfault == NULL))) {
			trap_fatal(frame, eva);
			return (-1);
		}

		/*
		 * This is a fault on non-kernel virtual memory.
		 * vm is initialized above to NULL. If curproc is NULL
		 * or curproc->p_vmspace is NULL the fault is fatal.
		 */
		vm = p->p_vmspace;
		if (vm == NULL)
			goto nogo;

		map = &vm->vm_map;

		/*
		 * Keep swapout from messing with us during this
		 *	critical time.
		 */
		PROC_LOCK(p);
		++p->p_lock;
		PROC_UNLOCK(p);

		/*
		 * Grow the stack if necessary.
		 * grow_stack() returns false only if va falls into a
		 * growable stack region and the stack growth fails.
		 * It returns true if va was not within a growable
		 * stack region, or if the stack growth succeeded.
		 */
		if (!grow_stack (p, va))
			rv = KERN_FAILURE;
		else
			/* Fault in the user page: */
			rv = vm_fault(map, va, ftype,
			      (ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY
						      : VM_FAULT_NORMAL);

		PROC_LOCK(p);
		--p->p_lock;
		PROC_UNLOCK(p);
	} else {
		/*
		 * Don't allow user-mode faults in kernel address space.
		 */
		if (usermode)
			goto nogo;

		/*
		 * Since we know that kernel virtual addresses always
		 * have pte pages mapped, we just have to fault the
		 * page.
		 */
		rv = vm_fault(kernel_map, va, ftype, VM_FAULT_NORMAL);
	}

	if (rv == KERN_SUCCESS)
		return (0);
nogo:
	if (!usermode) {
		if (p->p_intr_nesting_level == 0 &&
		    PCPU_GET(curpcb) != NULL &&
		    PCPU_GET(curpcb)->pcb_onfault != NULL) {
			frame->tf_eip = (int)PCPU_GET(curpcb)->pcb_onfault;
			return (0);
		}
		trap_fatal(frame, eva);
		return (-1);
	}

	/* kludge to pass faulting virtual address to sendsig */
	frame->tf_err = eva;

	return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV);
}
#endif

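/*
 * Handle a page fault.  Returns 0 if the fault was resolved, -1 if it
 * was fatal (trap_fatal() has already been called), -2 if the Pentium
 * F00F workaround fired, or a signal number (SIGBUS/SIGSEGV) for the
 * caller to deliver.
 */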
int
trap_pfault(frame, usermode, eva)
	struct trapframe *frame;
	int usermode;
	vm_offset_t eva;
{
	vm_offset_t va;
	struct vmspace *vm = NULL;
	vm_map_t map = 0;
	int rv = 0;
	vm_prot_t ftype;
	struct proc *p = curproc;

	va = trunc_page(eva);
	if (va >= KERNBASE) {
		/*
		 * Don't allow user-mode faults in kernel address space.
		 * An exception:  if the faulting address is the invalid
		 * instruction entry in the IDT, then the Intel Pentium
		 * F00F bug workaround was triggered, and we need to
		 * treat it as an illegal instruction, and not a page
		 * fault.
		 */
#if defined(I586_CPU) && !defined(NO_F00F_HACK)
		if ((eva == (unsigned int)&idt[6]) && has_f00f_bug)
			return -2;
#endif
		if (usermode)
			goto nogo;

		map = kernel_map;
	} else {
		/*
		 * This is a fault on non-kernel virtual memory.
		 * vm is initialized above to NULL. If curproc is NULL
		 * or curproc->p_vmspace is NULL the fault is fatal.
		 */
		if (p != NULL)
			vm = p->p_vmspace;

		if (vm == NULL)
			goto nogo;

		map = &vm->vm_map;
	}

	if (frame->tf_err & PGEX_W)
		ftype = VM_PROT_WRITE;
	else
		ftype = VM_PROT_READ;

	if (map != kernel_map) {
		/*
		 * Keep swapout from messing with us during this
		 *	critical time.
		 */
		PROC_LOCK(p);
		++p->p_lock;
		PROC_UNLOCK(p);

		/*
		 * Grow the stack if necessary.
		 * grow_stack() returns false only if va falls into a
		 * growable stack region and the stack growth fails.
		 * It returns true if va was not within a growable
		 * stack region, or if the stack growth succeeded.
		 */
		if (!grow_stack (p, va))
			rv = KERN_FAILURE;
		else
			/* Fault in the user page: */
			rv = vm_fault(map, va, ftype,
			      (ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY
						      : VM_FAULT_NORMAL);

		PROC_LOCK(p);
		--p->p_lock;
		PROC_UNLOCK(p);
	} else {
		/*
		 * Don't have to worry about process locking or stacks in the
		 * kernel.
		 */
		rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL);
	}

	if (rv == KERN_SUCCESS)
		return (0);
nogo:
	if (!usermode) {
		if (p->p_intr_nesting_level == 0 &&
		    PCPU_GET(curpcb) != NULL &&
		    PCPU_GET(curpcb)->pcb_onfault != NULL) {
			frame->tf_eip = (int)PCPU_GET(curpcb)->pcb_onfault;
			return (0);
		}
		trap_fatal(frame, eva);
		return (-1);
	}

	/* kludge to pass faulting virtual address to sendsig */
	frame->tf_err = eva;

	return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV);
}

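/*
 * Dump the register and segment state for an unrecoverable trap, hand
 * control to the kernel debugger if one is configured, and otherwise
 * panic.
 */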
static void
trap_fatal(frame, eva)
	struct trapframe *frame;
	vm_offset_t eva;
{
	int code, type, ss, esp;
	struct soft_segment_descriptor softseg;

	code = frame->tf_err;
	type = frame->tf_trapno;
	sdtossd(&gdt[IDXSEL(frame->tf_cs & 0xffff)].sd, &softseg);

	if (type <= MAX_TRAP_MSG)
		printf("\n\nFatal trap %d: %s while in %s mode\n",
			type, trap_msg[type],
			frame->tf_eflags & PSL_VM ? "vm86" :
			ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel");
#ifdef SMP
	/* two separate prints in case of a trap on an unmapped page */
	printf("cpuid = %d; ", PCPU_GET(cpuid));
	printf("lapic.id = %08x\n", lapic.id);
#endif
	if (type == T_PAGEFLT) {
		printf("fault virtual address	= 0x%x\n", eva);
		printf("fault code		= %s %s, %s\n",
			code & PGEX_U ? "user" : "supervisor",
			code & PGEX_W ? "write" : "read",
			code & PGEX_P ? "protection violation" : "page not present");
	}
	printf("instruction pointer	= 0x%x:0x%x\n",
	       frame->tf_cs & 0xffff, frame->tf_eip);
	if ((ISPL(frame->tf_cs) == SEL_UPL) || (frame->tf_eflags & PSL_VM)) {
		ss = frame->tf_ss & 0xffff;
		esp = frame->tf_esp;
	} else {
		ss = GSEL(GDATA_SEL, SEL_KPL);
		esp = (int)&frame->tf_esp;
	}
	printf("stack pointer	        = 0x%x:0x%x\n", ss, esp);
	printf("frame pointer	        = 0x%x:0x%x\n", ss, frame->tf_ebp);
	printf("code segment		= base 0x%x, limit 0x%x, type 0x%x\n",
	       softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type);
	printf("			= DPL %d, pres %d, def32 %d, gran %d\n",
	       softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_def32,
	       softseg.ssd_gran);
	printf("processor eflags	= ");
	if (frame->tf_eflags & PSL_T)
		printf("trace trap, ");
	if (frame->tf_eflags & PSL_I)
		printf("interrupt enabled, ");
	if (frame->tf_eflags & PSL_NT)
		printf("nested task, ");
	if (frame->tf_eflags & PSL_RF)
		printf("resume, ");
	if (frame->tf_eflags & PSL_VM)
		printf("vm86, ");
	printf("IOPL = %d\n", (frame->tf_eflags & PSL_IOPL) >> 12);
	printf("current process		= ");
	if (curproc) {
		printf("%lu (%s)\n",
		    (u_long)curproc->p_pid, curproc->p_comm ?
		    curproc->p_comm : "");
	} else {
		printf("Idle\n");
	}

#ifdef DDB
	if ((debugger_on_panic || db_active) && kdb_trap(type, 0, frame))
		return;
#endif
	printf("trap number		= %d\n", type);
	if (type <= MAX_TRAP_MSG)
		panic(trap_msg[type]);
	else
		panic("unknown/reserved trap");
}

/*
 * Double fault handler. Called when a fault occurs while writing
 * a frame for a trap/exception onto the stack. This usually occurs
 * when the stack overflows (such is the case with infinite recursion,
 * for example).
 *
 * XXX Note that the current PTD gets replaced by IdlePTD when the
 * task switch occurs. This means that the stack that was active at
 * the time of the double fault is not available at <kstack> unless
 * the machine was idle when the double fault occurred. The downside
 * of this is that "trace <ebp>" in ddb won't work.
 */
void
dblfault_handler()
{
	printf("\nFatal double fault:\n");
	printf("eip = 0x%x\n", PCPU_GET(common_tss.tss_eip));
	printf("esp = 0x%x\n", PCPU_GET(common_tss.tss_esp));
	printf("ebp = 0x%x\n", PCPU_GET(common_tss.tss_ebp));
#ifdef SMP
	/* two separate prints in case of a trap on an unmapped page */
	printf("cpuid = %d; ", PCPU_GET(cpuid));
	printf("lapic.id = %08x\n", lapic.id);
#endif
	panic("double fault");
}

/*
 * Compensate for 386 brain damage (missing URKR).
 * This is a little simpler than the pagefault handler in trap() because
 * the page tables have already been faulted in and high addresses
 * are thrown out early for other reasons.
 */
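/* Returns 0 on success, 1 on failure. */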
int trapwrite(addr)
	unsigned addr;
{
	struct proc *p;
	vm_offset_t va;
	struct vmspace *vm;
	int rv;

	va = trunc_page((vm_offset_t)addr);
	/*
	 * XXX - MAX is END.  Changed > to >= for temp. fix.
	 */
	if (va >= VM_MAXUSER_ADDRESS)
		return (1);

	p = curproc;
	vm = p->p_vmspace;

	PROC_LOCK(p);
	++p->p_lock;
	PROC_UNLOCK(p);

	if (!grow_stack (p, va))
		rv = KERN_FAILURE;
	else
		/*
		 * fault the data page
		 */
		rv = vm_fault(&vm->vm_map, va, VM_PROT_WRITE, VM_FAULT_DIRTY);

	PROC_LOCK(p);
	--p->p_lock;
	PROC_UNLOCK(p);

	if (rv != KERN_SUCCESS)
		return 1;

	return (0);
}

/*
 *	syscall -	MP aware system call request C handler
 *
 *	A system call is essentially treated as a trap except that the
 *	MP lock is not held on entry or return.  We are responsible for
 *	obtaining the MP lock if necessary and for handling ASTs
 *	(e.g. a task switch) prior to return.
 *
 *	In general, only simple access and manipulation of curproc and
 *	the current stack is allowed without having to hold MP lock.
 */
void
syscall(frame)
	struct trapframe frame;
{
	caddr_t params;
	int i;
	struct sysent *callp;
	struct proc *p = curproc;
	u_quad_t sticks;
	int error;
	int narg;
	int args[8];
	u_int code;

	atomic_add_int(&cnt.v_syscall, 1);

#ifdef DIAGNOSTIC
	if (ISPL(frame.tf_cs) != SEL_UPL) {
		mtx_lock(&Giant);
		panic("syscall");
		/* NOT REACHED */
	}
#endif

	mtx_lock_spin(&sched_lock);
	sticks = p->p_sticks;
	mtx_unlock_spin(&sched_lock);

	p->p_md.md_regs = &frame;
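	/*
	 * The arguments sit on the user stack just above the return
	 * address of the call into the libc system call stub, so skip
	 * one int to reach them.
	 */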
	params = (caddr_t)frame.tf_esp + sizeof(int);
	code = frame.tf_eax;

	if (p->p_sysent->sv_prepsyscall) {
		/*
		 * The prep code is not MP aware.
		 */
		mtx_lock(&Giant);
		(*p->p_sysent->sv_prepsyscall)(&frame, args, &code, &params);
		mtx_unlock(&Giant);
	} else {
		/*
		 * Need to check if this is a 32 bit or 64 bit syscall.
		 * fuword is MP aware.
		 */
		if (code == SYS_syscall) {
			/*
			 * Code is first argument, followed by actual args.
			 */
			code = fuword(params);
			params += sizeof(int);
		} else if (code == SYS___syscall) {
			/*
			 * Like syscall, but code is a quad, so as to maintain
			 * quad alignment for the rest of the arguments.
			 */
			code = fuword(params);
			params += sizeof(quad_t);
		}
	}

	if (p->p_sysent->sv_mask)
		code &= p->p_sysent->sv_mask;

	if (code >= p->p_sysent->sv_size)
		callp = &p->p_sysent->sv_table[0];
	else
		callp = &p->p_sysent->sv_table[code];

	narg = callp->sy_narg & SYF_ARGMASK;

	/*
	 * copyin is MP aware, but the tracing code is not
	 */
	if (params && (i = narg * sizeof(int)) &&
	    (error = copyin(params, (caddr_t)args, (u_int)i))) {
		mtx_lock(&Giant);
#ifdef KTRACE
		if (KTRPOINT(p, KTR_SYSCALL))
			ktrsyscall(p->p_tracep, code, narg, args);
#endif
		goto bad;
	}

	/*
	 * Try to run the syscall without the MP lock if the syscall
	 * is MP safe.
	 */
	if ((callp->sy_narg & SYF_MPSAFE) == 0) {
		mtx_lock(&Giant);
	}

#ifdef KTRACE
	/*
	 * We have to obtain the MP lock no matter what if
	 * we are ktracing
	 */
	if (KTRPOINT(p, KTR_SYSCALL)) {
		if (!mtx_owned(&Giant))
			mtx_lock(&Giant);
		ktrsyscall(p->p_tracep, code, narg, args);
	}
#endif
	p->p_retval[0] = 0;
	p->p_retval[1] = frame.tf_edx;

	STOPEVENT(p, S_SCE, narg);	/* MP aware */

	error = (*callp->sy_call)(p, args);

	/*
	 * MP SAFE (we may or may not have the MP lock at this point)
	 */
	switch (error) {
	case 0:
		frame.tf_eax = p->p_retval[0];
		frame.tf_edx = p->p_retval[1];
		frame.tf_eflags &= ~PSL_C;
		break;

	case ERESTART:
		/*
		 * Reconstruct pc, assuming lcall $X,y is 7 bytes,
		 * int 0x80 is 2 bytes. We saved this in tf_err.
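		 * For example, after "int $0x80" the entry stub stored 2
		 * in tf_err, so subtracting it backs eip up to re-execute
		 * the instruction when the process resumes.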
		 */
		frame.tf_eip -= frame.tf_err;
		break;

	case EJUSTRETURN:
		break;

	default:
bad:
		if (p->p_sysent->sv_errsize) {
			if (error >= p->p_sysent->sv_errsize)
				error = -1;	/* XXX */
			else
				error = p->p_sysent->sv_errtbl[error];
		}
		frame.tf_eax = error;
		frame.tf_eflags |= PSL_C;
		break;
	}

	/*
	 * Traced syscall.  trapsignal() is not MP aware.
	 */
	if ((frame.tf_eflags & PSL_T) && !(frame.tf_eflags & PSL_VM)) {
		if (!mtx_owned(&Giant))
			mtx_lock(&Giant);
		frame.tf_eflags &= ~PSL_T;
		trapsignal(p, SIGTRAP, 0);
	}

	/*
	 * Handle reschedule and other end-of-syscall issues
	 */
	userret(p, &frame, sticks);

#ifdef KTRACE
	if (KTRPOINT(p, KTR_SYSRET)) {
		if (!mtx_owned(&Giant))
			mtx_lock(&Giant);
		ktrsysret(p->p_tracep, code, error, p->p_retval[0]);
	}
#endif

	/*
	 * Release Giant if we had to get it
	 */
	if (mtx_owned(&Giant))
		mtx_unlock(&Giant);

	/*
	 * This works because errno is findable through the
	 * register set.  If we ever support an emulation where this
	 * is not the case, this code will need to be revisited.
	 */
	STOPEVENT(p, S_SCX, code);

#ifdef WITNESS
	if (witness_list(p)) {
		panic("system call %s returning with mutex(s) held\n",
		    syscallnames[code]);
	}
#endif
	mtx_assert(&sched_lock, MA_NOTOWNED);
	mtx_assert(&Giant, MA_NOTOWNED);
}

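/*
 * ast() handles asynchronous system traps on the way back to user mode:
 * deferred profiling charges, pending SIGVTALRM/SIGPROF signals, and
 * (on UP with DEV_NPX) FPU exceptions postponed by the low-level code,
 * followed by the usual userret() processing.
 */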
void
ast(framep)
	struct trapframe *framep;
{
	struct proc *p = CURPROC;
	u_quad_t sticks;
#if defined(DEV_NPX) && !defined(SMP)
	int ucode;
#endif

	KASSERT(TRAPF_USERMODE(framep), ("ast in kernel mode"));

	/*
	 * We check for a pending AST here rather than in the assembly as
	 * acquiring and releasing mutexes in assembly is not fun.
	 */
	mtx_lock_spin(&sched_lock);
	if (!(astpending(p) || resched_wanted(p))) {
		mtx_unlock_spin(&sched_lock);
		return;
	}

	sticks = p->p_sticks;
	p->p_md.md_regs = framep;

	astoff(p);
	cnt.v_soft++;
	mtx_intr_enable(&sched_lock);
	if (p->p_sflag & PS_OWEUPC) {
		p->p_sflag &= ~PS_OWEUPC;
		mtx_unlock_spin(&sched_lock);
		mtx_lock(&Giant);
		addupc_task(p, p->p_stats->p_prof.pr_addr,
			    p->p_stats->p_prof.pr_ticks);
		mtx_lock_spin(&sched_lock);
	}
	if (p->p_sflag & PS_ALRMPEND) {
		p->p_sflag &= ~PS_ALRMPEND;
		mtx_unlock_spin(&sched_lock);
		PROC_LOCK(p);
		psignal(p, SIGVTALRM);
		PROC_UNLOCK(p);
		mtx_lock_spin(&sched_lock);
	}
#if defined(DEV_NPX) && !defined(SMP)
	if (PCPU_GET(curpcb)->pcb_flags & PCB_NPXTRAP) {
		PCPU_GET(curpcb)->pcb_flags &= ~PCB_NPXTRAP;
		mtx_unlock_spin(&sched_lock);
		ucode = npxtrap();
		if (ucode != -1) {
			if (!mtx_owned(&Giant))
				mtx_lock(&Giant);
			trapsignal(p, SIGFPE, ucode);
		}
		mtx_lock_spin(&sched_lock);
	}
#endif
	if (p->p_sflag & PS_PROFPEND) {
		p->p_sflag &= ~PS_PROFPEND;
		mtx_unlock_spin(&sched_lock);
		PROC_LOCK(p);
		psignal(p, SIGPROF);
		PROC_UNLOCK(p);
	} else
		mtx_unlock_spin(&sched_lock);

	userret(p, framep, sticks);

	if (mtx_owned(&Giant))
		mtx_unlock(&Giant);
}