subr_syscall.c revision 41547
1/*-
2 * Copyright (C) 1994, David Greenman
3 * Copyright (c) 1990, 1993
4 *	The Regents of the University of California.  All rights reserved.
5 *
6 * This code is derived from software contributed to Berkeley by
7 * the University of Utah, and William Jolitz.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 *    notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 *    notice, this list of conditions and the following disclaimer in the
16 *    documentation and/or other materials provided with the distribution.
17 * 3. All advertising materials mentioning features or use of this software
18 *    must display the following acknowledgement:
19 *	This product includes software developed by the University of
20 *	California, Berkeley and its contributors.
21 * 4. Neither the name of the University nor the names of its contributors
22 *    may be used to endorse or promote products derived from this software
23 *    without specific prior written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 * SUCH DAMAGE.
36 *
37 *	from: @(#)trap.c	7.4 (Berkeley) 5/13/91
38 *	$Id: trap.c,v 1.129 1998/12/02 08:15:16 kato Exp $
39 */
40
41/*
42 * 386 Trap and System call handling
43 */
44
45#include "opt_cpu.h"
46#include "opt_ddb.h"
47#include "opt_ktrace.h"
48#include "opt_trap.h"
49#include "opt_vm86.h"
50
51#include <sys/param.h>
52#include <sys/systm.h>
53#include <sys/proc.h>
54#include <sys/pioctl.h>
55#include <sys/kernel.h>
56#include <sys/resourcevar.h>
57#include <sys/signalvar.h>
58#include <sys/syscall.h>
59#include <sys/sysent.h>
60#include <sys/uio.h>
61#include <sys/vmmeter.h>
62#ifdef KTRACE
63#include <sys/ktrace.h>
64#endif
65
66#include <vm/vm.h>
67#include <vm/vm_param.h>
68#include <vm/vm_prot.h>
69#include <sys/lock.h>
70#include <vm/pmap.h>
71#include <vm/vm_kern.h>
72#include <vm/vm_map.h>
73#include <vm/vm_page.h>
74#include <vm/vm_extern.h>
75
76#include <machine/cpu.h>
77#include <machine/ipl.h>
78#include <machine/md_var.h>
79#include <machine/pcb.h>
80#ifdef SMP
81#include <machine/smp.h>
82#endif
83#include <machine/tss.h>
84
85#include <i386/isa/intr_machdep.h>
86
87#ifdef POWERFAIL_NMI
88#include <sys/syslog.h>
89#include <machine/clock.h>
90#endif
91
92#ifdef VM86
93#include <machine/vm86.h>
94#endif
95
96#include "isa.h"
97#include "npx.h"
98
/* TSS whose saved eip/esp/ebp are printed by dblfault_handler(). */
extern struct i386tss common_tss;

/*
 * Optional hook called from trap()'s user-mode T_DNA case when non-NULL;
 * its return value is used as the signal to post (0 means no signal).
 * NOTE(review): presumably a math-coprocessor emulator -- confirm.
 */
int (*pmath_emulate) __P((struct trapframe *));

/* Entry points defined in this file. */
extern void trap __P((struct trapframe frame));
extern int trapwrite __P((unsigned addr));
extern void syscall __P((struct trapframe frame));

/* File-local helpers used by trap(). */
static int trap_pfault __P((struct trapframe *, int, vm_offset_t));
static void trap_fatal __P((struct trapframe *, vm_offset_t));
void dblfault_handler __P((void));

/*
 * Address of the system call entry stub; trap()'s kernel-mode T_TRCTRAP
 * case compares tf_eip against it.
 */
extern inthand_t IDTVEC(syscall);
112
/*
 * Printable descriptions of trap types, indexed by trap number (the
 * T_* value noted beside each entry).  MAX_TRAP_MSG is the highest
 * valid index and must be updated whenever an entry is appended;
 * users of the table check `type <= MAX_TRAP_MSG' before indexing.
 */
#define MAX_TRAP_MSG		28
static char *trap_msg[] = {
	"",					/*  0 unused */
	"privileged instruction fault",		/*  1 T_PRIVINFLT */
	"",					/*  2 unused */
	"breakpoint instruction fault",		/*  3 T_BPTFLT */
	"",					/*  4 unused */
	"",					/*  5 unused */
	"arithmetic trap",			/*  6 T_ARITHTRAP */
	"system forced exception",		/*  7 T_ASTFLT */
	"",					/*  8 unused */
	"general protection fault",		/*  9 T_PROTFLT */
	"trace trap",				/* 10 T_TRCTRAP */
	"",					/* 11 unused */
	"page fault",				/* 12 T_PAGEFLT */
	"",					/* 13 unused */
	"alignment fault",			/* 14 T_ALIGNFLT */
	"",					/* 15 unused */
	"",					/* 16 unused */
	"",					/* 17 unused */
	"integer divide fault",			/* 18 T_DIVIDE */
	"non-maskable interrupt trap",		/* 19 T_NMI */
	"overflow trap",			/* 20 T_OFLOW */
	"FPU bounds check fault",		/* 21 T_BOUND */
	"FPU device not available",		/* 22 T_DNA */
	"double fault",				/* 23 T_DOUBLEFLT */
	"FPU operand fetch fault",		/* 24 T_FPOPFLT */
	"invalid TSS fault",			/* 25 T_TSSFLT */
	"segment not present fault",		/* 26 T_SEGNPFLT */
	"stack fault",				/* 27 T_STKFLT */
	"machine check trap",			/* 28 T_MCHK */
};
145
/*
 * Forward declaration of the return-to-user-mode helper that trap(),
 * syscall() and fork_return() all finish with.
 */
static __inline void userret __P((struct proc *p, struct trapframe *frame,
				  u_quad_t oticks));

#if defined(I586_CPU) && !defined(NO_F00F_HACK)
/*
 * Consulted by trap_pfault(): a fault whose address is &t_idt[6] while
 * has_f00f_bug is non-zero is treated as the Pentium F00F workaround
 * having fired rather than as an ordinary page fault.
 */
extern struct gate_descriptor *t_idt;
extern int has_f00f_bug;
#endif
153
154static __inline void
155userret(p, frame, oticks)
156	struct proc *p;
157	struct trapframe *frame;
158	u_quad_t oticks;
159{
160	int sig, s;
161
162	while ((sig = CURSIG(p)) != 0)
163		postsig(sig);
164
165#if 0
166	if (!want_resched &&
167		(p->p_priority <= p->p_usrpri) &&
168		(p->p_rtprio.type == RTP_PRIO_NORMAL)) {
169		 int newpriority;
170		 p->p_estcpu += 1;
171		 newpriority = PUSER + p->p_estcpu / 4 + 2 * p->p_nice;
172		 newpriority = min(newpriority, MAXPRI);
173		 p->p_usrpri = newpriority;
174	}
175#endif
176
177	p->p_priority = p->p_usrpri;
178	if (want_resched) {
179		/*
180		 * Since we are curproc, clock will normally just change
181		 * our priority without moving us from one queue to another
182		 * (since the running process is not on a queue.)
183		 * If that happened after we setrunqueue ourselves but before we
184		 * mi_switch()'ed, we might not be on the queue indicated by
185		 * our priority.
186		 */
187		s = splhigh();
188		setrunqueue(p);
189		p->p_stats->p_ru.ru_nivcsw++;
190		mi_switch();
191		splx(s);
192		while ((sig = CURSIG(p)) != 0)
193			postsig(sig);
194	}
195	/*
196	 * Charge system time if profiling.
197	 */
198	if (p->p_flag & P_PROFIL)
199		addupc_task(p, frame->tf_eip,
200			    (u_int)(p->p_sticks - oticks) * psratio);
201
202	curpriority = p->p_priority;
203}
204
/*
 * Exception, fault, and trap interface to the FreeBSD kernel.
 * This common code is called from assembly language IDT gate entry
 * routines that prepare a suitable stack frame, and restore this
 * frame after the exception has been processed.
 *
 * `frame' is modified in place (tf_eip, tf_eflags, ...) where the
 * resumed context needs to differ from the faulting one.
 *
 * Traps taken from user mode (or vm86 mode) are translated into a
 * signal number `i' and signal code `ucode', posted with trapsignal(),
 * and finish through userret().  Traps taken from kernel mode are
 * either recovered from by patching the frame and returning, or end
 * in trap_fatal().
 */

void
trap(frame)
	struct trapframe frame;
{
	struct proc *p = curproc;
	u_quad_t sticks = 0;		/* p_sticks on entry, for userret() */
	/*
	 * i:     signal to post (also receives helper return values)
	 * ucode: code handed to trapsignal() along with the signal
	 * type:  trap number, code: hardware error code from the frame
	 */
	int i = 0, ucode = 0, type, code;
	vm_offset_t eva;		/* faulting address, page faults only */

	if (!(frame.tf_eflags & PSL_I)) {
		/*
		 * Buggy application or kernel code has disabled interrupts
		 * and then trapped.  Enabling interrupts now is wrong, but
		 * it is better than running with interrupts disabled until
		 * they are accidentally enabled later.
		 */
		type = frame.tf_trapno;
		if (ISPL(frame.tf_cs) == SEL_UPL || (frame.tf_eflags & PSL_VM))
			printf(
			    "pid %ld (%s): trap %d with interrupts disabled\n",
			    (long)curproc->p_pid, curproc->p_comm, type);
		else if (type != T_BPTFLT && type != T_TRCTRAP)
			/*
			 * XXX not quite right, since this may be for a
			 * multiple fault in user mode.
			 */
			printf("kernel trap %d with interrupts disabled\n",
			    type);
		enable_intr();
	}

	eva = 0;
	if (frame.tf_trapno == T_PAGEFLT) {
		/*
		 * For some Cyrix CPUs, %cr2 is clobbered by interrupts.
		 * This problem is worked around by using an interrupt
		 * gate for the pagefault handler.  We are finally ready
		 * to read %cr2 and then must reenable interrupts.
		 *
		 * XXX this should be in the switch statement, but the
		 * NO_FOOF_HACK and VM86 goto and ifdefs obfuscate the
		 * flow of control too much for this to be obviously
		 * correct.
		 */
		eva = rcr2();
		enable_intr();
	}

#if defined(I586_CPU) && !defined(NO_F00F_HACK)
	/*
	 * Re-entered after trap_pfault() returns -2, i.e. after it has
	 * rewritten frame.tf_trapno; type and code are reloaded below.
	 */
restart:
#endif
	type = frame.tf_trapno;
	code = frame.tf_err;

#ifdef VM86
	if (in_vm86call) {
		if (frame.tf_eflags & PSL_VM &&
		    (type == T_PROTFLT || type == T_STKFLT)) {
			i = vm86_emulate((struct vm86frame *)&frame);
			if (i != 0)
				/*
				 * returns to original process
				 */
				vm86_trap((struct vm86frame *)&frame);
			return;
		}
		switch (type) {
			/*
			 * these traps want either a process context, or
			 * assume a normal userspace trap.
			 */
		case T_PROTFLT:
		case T_SEGNPFLT:
			trap_fatal(&frame, eva);
			return;
		case T_TRCTRAP:
			type = T_BPTFLT;	/* kernel breakpoint */
			/* FALL THROUGH */
		}
		goto kernel_trap;	/* normal kernel trap handling */
	}
#endif

        if ((ISPL(frame.tf_cs) == SEL_UPL) || (frame.tf_eflags & PSL_VM)) {
		/* user trap */

		sticks = p->p_sticks;
		p->p_md.md_regs = &frame;

		switch (type) {
		case T_PRIVINFLT:	/* privileged instruction fault */
			ucode = type;
			i = SIGILL;
			break;

		case T_BPTFLT:		/* bpt instruction fault */
		case T_TRCTRAP:		/* trace trap */
			frame.tf_eflags &= ~PSL_T;
			i = SIGTRAP;
			break;

		case T_ARITHTRAP:	/* arithmetic trap */
			ucode = code;
			i = SIGFPE;
			break;

		case T_ASTFLT:		/* Allow process switch */
			astoff();
			cnt.v_soft++;
			if (p->p_flag & P_OWEUPC) {
				p->p_flag &= ~P_OWEUPC;
				addupc_task(p, p->p_stats->p_prof.pr_addr,
					    p->p_stats->p_prof.pr_ticks);
			}
			/* No signal to post; go straight to userret(). */
			goto out;

			/*
			 * The following two traps can happen in
			 * vm86 mode, and, if so, we want to handle
			 * them specially.
			 */
		case T_PROTFLT:		/* general protection fault */
		case T_STKFLT:		/* stack fault */
#ifdef VM86
			if (frame.tf_eflags & PSL_VM) {
				/*
				 * 0 means fully emulated; any other value
				 * is posted as the signal below.
				 */
				i = vm86_emulate((struct vm86frame *)&frame);
				if (i == 0)
					goto out;
				break;
			}
#endif /* VM86 */
			/* FALL THROUGH */

		case T_SEGNPFLT:	/* segment not present fault */
		case T_TSSFLT:		/* invalid TSS fault */
		case T_DOUBLEFLT:	/* double fault */
		default:
			ucode = code + BUS_SEGM_FAULT ;
			i = SIGBUS;
			break;

		case T_PAGEFLT:		/* page fault */
			/*
			 * trap_pfault() results:
			 *	-1	trap_fatal() has already been called
			 *	-2	tf_trapno was rewritten; dispatch again
			 *	 0	fault resolved, no signal needed
			 *	else	signal number to post
			 */
			i = trap_pfault(&frame, TRUE, eva);
			if (i == -1)
				return;
#if defined(I586_CPU) && !defined(NO_F00F_HACK)
			if (i == -2)
				goto restart;
#endif
			if (i == 0)
				goto out;

			ucode = T_PAGEFLT;
			break;

		case T_DIVIDE:		/* integer divide fault */
			ucode = FPE_INTDIV_TRAP;
			i = SIGFPE;
			break;

#if NISA > 0
		case T_NMI:
#ifdef POWERFAIL_NMI
			/* Shares the handler inside the kernel-trap switch. */
			goto handle_powerfail;
#else /* !POWERFAIL_NMI */
#ifdef DDB
			/* NMI can be hooked up to a pushbutton for debugging */
			printf ("NMI ... going to debugger\n");
			if (kdb_trap (type, 0, &frame))
				return;
#endif /* DDB */
			/* machine/parity/power fail/"kitchen sink" faults */
			if (isa_nmi(code) == 0) return;
			panic("NMI indicates hardware failure");
#endif /* POWERFAIL_NMI */
#endif /* NISA > 0 */

		case T_OFLOW:		/* integer overflow fault */
			ucode = FPE_INTOVF_TRAP;
			i = SIGFPE;
			break;

		case T_BOUND:		/* bounds check fault */
			ucode = FPE_SUBRNG_TRAP;
			i = SIGFPE;
			break;

		case T_DNA:
#if NNPX > 0
			/* if a transparent fault (due to context switch "late") */
			if (npxdna())
				return;
#endif
			if (!pmath_emulate) {
				i = SIGFPE;
				ucode = FPE_FPU_NP_TRAP;
				break;
			}
			i = (*pmath_emulate)(&frame);
			if (i == 0) {
				/*
				 * Emulation succeeded; only a pending
				 * single-step (PSL_T) still needs a signal.
				 */
				if (!(frame.tf_eflags & PSL_T))
					return;
				frame.tf_eflags &= ~PSL_T;
				i = SIGTRAP;
			}
			/* else ucode = emulator_only_knows() XXX */
			break;

		case T_FPOPFLT:		/* FPU operand fetch fault */
			ucode = T_FPOPFLT;
			i = SIGILL;
			break;
		}
	} else {
#ifdef VM86
kernel_trap:
#endif
		/* kernel trap */

		/*
		 * Every case either returns after recovering or breaks out
		 * of the switch to reach trap_fatal() below.
		 */
		switch (type) {
		case T_PAGEFLT:			/* page fault */
			(void) trap_pfault(&frame, FALSE, eva);
			return;

		case T_DNA:
#if NNPX > 0
			/*
			 * The kernel is apparently using npx for copying.
			 * XXX this should be fatal unless the kernel has
			 * registered such use.
			 */
			if (npxdna())
				return;
#endif
			break;

		case T_PROTFLT:		/* general protection fault */
		case T_SEGNPFLT:	/* segment not present fault */
			/*
			 * Invalid segment selectors and out of bounds
			 * %eip's and %esp's can be set up in user mode.
			 * This causes a fault in kernel mode when the
			 * kernel tries to return to user mode.  We want
			 * to get this fault so that we can fix the
			 * problem here and not have to check all the
			 * selectors and pointers when the user changes
			 * them.
			 */
			/*
			 * If the fault hit exactly at `where', resume at
			 * `whereto' instead and return from trap().
			 */
#define	MAYBE_DORETI_FAULT(where, whereto)				\
	do {								\
		if (frame.tf_eip == (int)where) {			\
			frame.tf_eip = (int)whereto;			\
			return;						\
		}							\
	} while (0)

			if (intr_nesting_level == 0) {
				/*
				 * Invalid %fs's and %gs's can be created using
				 * procfs or PT_SETREGS or by invalidating the
				 * underlying LDT entry.  This causes a fault
				 * in kernel mode when the kernel attempts to
				 * switch contexts.  Lose the bad context
				 * (XXX) so that we can continue, and generate
				 * a signal.
				 */
				if (frame.tf_eip == (int)cpu_switch_load_fs) {
					curpcb->pcb_fs = 0;
					psignal(p, SIGBUS);
					return;
				}
				if (frame.tf_eip == (int)cpu_switch_load_gs) {
					curpcb->pcb_gs = 0;
					psignal(p, SIGBUS);
					return;
				}
				MAYBE_DORETI_FAULT(doreti_iret,
						   doreti_iret_fault);
				MAYBE_DORETI_FAULT(doreti_popl_ds,
						   doreti_popl_ds_fault);
				MAYBE_DORETI_FAULT(doreti_popl_es,
						   doreti_popl_es_fault);
				/* Resume at the registered fault handler. */
				if (curpcb && curpcb->pcb_onfault) {
					frame.tf_eip = (int)curpcb->pcb_onfault;
					return;
				}
			}
			break;

		case T_TSSFLT:
			/*
			 * PSL_NT can be set in user mode and isn't cleared
			 * automatically when the kernel is entered.  This
			 * causes a TSS fault when the kernel attempts to
			 * `iret' because the TSS link is uninitialized.  We
			 * want to get this fault so that we can fix the
			 * problem here and not every time the kernel is
			 * entered.
			 */
			if (frame.tf_eflags & PSL_NT) {
				frame.tf_eflags &= ~PSL_NT;
				return;
			}
			break;

		case T_TRCTRAP:	 /* trace trap */
			if (frame.tf_eip == (int)IDTVEC(syscall)) {
				/*
				 * We've just entered system mode via the
				 * syscall lcall.  Continue single stepping
				 * silently until the syscall handler has
				 * saved the flags.
				 */
				return;
			}
			if (frame.tf_eip == (int)IDTVEC(syscall) + 1) {
				/*
				 * The syscall handler has now saved the
				 * flags.  Stop single stepping it.
				 */
				frame.tf_eflags &= ~PSL_T;
				return;
			}
			/*
			 * Fall through.
			 */
		case T_BPTFLT:
			/*
			 * If DDB is enabled, let it handle the debugger trap.
			 * Otherwise, debugger traps "can't happen".
			 */
#ifdef DDB
			if (kdb_trap (type, 0, &frame))
				return;
#endif
			break;

#if NISA > 0
		case T_NMI:
#ifdef POWERFAIL_NMI
#ifndef TIMER_FREQ
#  define TIMER_FREQ 1193182
#endif
	/* Also the target of the goto in the user-mode T_NMI case. */
	handle_powerfail:
		{
		  /* time_second of the last warning; limits alerts to one per 10s */
		  static unsigned lastalert = 0;

		  if(time_second - lastalert > 10)
		    {
		      log(LOG_WARNING, "NMI: power fail\n");
		      sysbeep(TIMER_FREQ/880, hz);
		      lastalert = time_second;
		    }
		  return;
		}
#else /* !POWERFAIL_NMI */
#ifdef DDB
			/* NMI can be hooked up to a pushbutton for debugging */
			printf ("NMI ... going to debugger\n");
			if (kdb_trap (type, 0, &frame))
				return;
#endif /* DDB */
			/* machine/parity/power fail/"kitchen sink" faults */
			if (isa_nmi(code) == 0) return;
			/* FALL THROUGH */
#endif /* POWERFAIL_NMI */
#endif /* NISA > 0 */
		}

		trap_fatal(&frame, eva);
		return;
	}

	/* Translate fault for emulators (e.g. Linux) */
	if (*p->p_sysent->sv_transtrap)
		i = (*p->p_sysent->sv_transtrap)(i, type);

	trapsignal(p, i, ucode);

#ifdef DEBUG
	if (type <= MAX_TRAP_MSG) {
		uprintf("fatal process exception: %s",
			trap_msg[type]);
		if ((type == T_PAGEFLT) || (type == T_PROTFLT))
			uprintf(", fault VA = 0x%lx", (u_long)eva);
		uprintf("\n");
	}
#endif

	/* Common exit for every user-mode trap that returns to user mode. */
out:
	userret(p, &frame, sticks);
}
604
#ifdef notyet
/*
 * This version doesn't allow a page fault to user space while
 * in the kernel. The rest of the kernel needs to be made "safe"
 * before this can be used. I think the only things remaining
 * to be made safe are the iBCS2 code and the process tracing/
 * debugging code.
 *
 * Compiled only when `notyet' is defined.  It has the same name and
 * parameters as the active trap_pfault() that follows, which is not
 * conditional, so defining `notyet' as the file stands would yield
 * two definitions.
 *
 * Returns 0 when the fault was resolved (or redirected to
 * pcb_onfault), -1 after trap_fatal(), otherwise SIGBUS or SIGSEGV
 * for a user-mode fault that could not be satisfied.
 */
static int
trap_pfault(frame, usermode, eva)
	struct trapframe *frame;
	int usermode;
	vm_offset_t eva;
{
	vm_offset_t va;
	struct vmspace *vm = NULL;
	vm_map_t map = 0;
	int rv = 0;
	vm_prot_t ftype;
	struct proc *p = curproc;

	/* A write fault needs write permission as well as read. */
	if (frame->tf_err & PGEX_W)
		ftype = VM_PROT_READ | VM_PROT_WRITE;
	else
		ftype = VM_PROT_READ;

	va = trunc_page(eva);
	if (va < VM_MIN_KERNEL_ADDRESS) {
		/* NOTE(review): v and mpte are declared but never used. */
		vm_offset_t v;
		vm_page_t mpte;

		/*
		 * Fatal if there is no process, or if the kernel touched
		 * a user address from interrupt context or without a
		 * pcb_onfault handler registered.
		 */
		if (p == NULL ||
		    (!usermode && va < VM_MAXUSER_ADDRESS &&
		     (intr_nesting_level != 0 || curpcb == NULL ||
		      curpcb->pcb_onfault == NULL))) {
			trap_fatal(frame, eva);
			return (-1);
		}

		/*
		 * This is a fault on non-kernel virtual memory.
		 * vm is initialized above to NULL. If curproc is NULL
		 * or curproc->p_vmspace is NULL the fault is fatal.
		 */
		vm = p->p_vmspace;
		if (vm == NULL)
			goto nogo;

		map = &vm->vm_map;

		/*
		 * Keep swapout from messing with us during this
		 *	critical time.
		 */
		++p->p_lock;

		/*
		 * Grow the stack if necessary
		 */
		if ((caddr_t)va > vm->vm_maxsaddr
		    && (caddr_t)va < (caddr_t)USRSTACK) {
			if (!grow(p, va)) {
				rv = KERN_FAILURE;
				--p->p_lock;
				goto nogo;
			}
		}

		/* Fault in the user page: */
		rv = vm_fault(map, va, ftype,
			(ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY : 0);

		--p->p_lock;
	} else {
		/*
		 * Don't allow user-mode faults in kernel address space.
		 */
		if (usermode)
			goto nogo;

		/*
		 * Since we know that kernel virtual address addresses
		 * always have pte pages mapped, we just have to fault
		 * the page.
		 */
		rv = vm_fault(kernel_map, va, ftype, FALSE);
	}

	if (rv == KERN_SUCCESS)
		return (0);
nogo:
	if (!usermode) {
		/* Kernel fault: use the recovery handler if one is set. */
		if (intr_nesting_level == 0 && curpcb && curpcb->pcb_onfault) {
			frame->tf_eip = (int)curpcb->pcb_onfault;
			return (0);
		}
		trap_fatal(frame, eva);
		return (-1);
	}

	/* kludge to pass faulting virtual address to sendsig */
	frame->tf_err = eva;

	return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV);
}
#endif
711
712int
713trap_pfault(frame, usermode, eva)
714	struct trapframe *frame;
715	int usermode;
716	vm_offset_t eva;
717{
718	vm_offset_t va;
719	struct vmspace *vm = NULL;
720	vm_map_t map = 0;
721	int rv = 0;
722	vm_prot_t ftype;
723	struct proc *p = curproc;
724
725	va = trunc_page(eva);
726	if (va >= KERNBASE) {
727		/*
728		 * Don't allow user-mode faults in kernel address space.
729		 * An exception:  if the faulting address is the invalid
730		 * instruction entry in the IDT, then the Intel Pentium
731		 * F00F bug workaround was triggered, and we need to
732		 * treat it is as an illegal instruction, and not a page
733		 * fault.
734		 */
735#if defined(I586_CPU) && !defined(NO_F00F_HACK)
736		if ((eva == (unsigned int)&t_idt[6]) && has_f00f_bug) {
737			frame->tf_trapno = T_PRIVINFLT;
738			return -2;
739		}
740#endif
741		if (usermode)
742			goto nogo;
743
744		map = kernel_map;
745	} else {
746		/*
747		 * This is a fault on non-kernel virtual memory.
748		 * vm is initialized above to NULL. If curproc is NULL
749		 * or curproc->p_vmspace is NULL the fault is fatal.
750		 */
751		if (p != NULL)
752			vm = p->p_vmspace;
753
754		if (vm == NULL)
755			goto nogo;
756
757		map = &vm->vm_map;
758	}
759
760	if (frame->tf_err & PGEX_W)
761		ftype = VM_PROT_READ | VM_PROT_WRITE;
762	else
763		ftype = VM_PROT_READ;
764
765	if (map != kernel_map) {
766		/*
767		 * Keep swapout from messing with us during this
768		 *	critical time.
769		 */
770		++p->p_lock;
771
772		/*
773		 * Grow the stack if necessary
774		 */
775		if ((caddr_t)va > vm->vm_maxsaddr
776		    && (caddr_t)va < (caddr_t)USRSTACK) {
777			if (!grow(p, va)) {
778				rv = KERN_FAILURE;
779				--p->p_lock;
780				goto nogo;
781			}
782		}
783
784		/* Fault in the user page: */
785		rv = vm_fault(map, va, ftype,
786			(ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY : 0);
787
788		--p->p_lock;
789	} else {
790		/*
791		 * Don't have to worry about process locking or stacks in the kernel.
792		 */
793		rv = vm_fault(map, va, ftype, FALSE);
794	}
795
796	if (rv == KERN_SUCCESS)
797		return (0);
798nogo:
799	if (!usermode) {
800		if (intr_nesting_level == 0 && curpcb && curpcb->pcb_onfault) {
801			frame->tf_eip = (int)curpcb->pcb_onfault;
802			return (0);
803		}
804		trap_fatal(frame, eva);
805		return (-1);
806	}
807
808	/* kludge to pass faulting virtual address to sendsig */
809	frame->tf_err = eva;
810
811	return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV);
812}
813
814static void
815trap_fatal(frame, eva)
816	struct trapframe *frame;
817	vm_offset_t eva;
818{
819	int code, type, ss, esp;
820	struct soft_segment_descriptor softseg;
821
822	code = frame->tf_err;
823	type = frame->tf_trapno;
824	sdtossd(&gdt[IDXSEL(frame->tf_cs & 0xffff)].sd, &softseg);
825
826	if (type <= MAX_TRAP_MSG)
827		printf("\n\nFatal trap %d: %s while in %s mode\n",
828			type, trap_msg[type],
829        		frame->tf_eflags & PSL_VM ? "vm86" :
830			ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel");
831#ifdef SMP
832	/* three seperate prints in case of a trap on an unmapped page */
833	printf("mp_lock = %08x; ", mp_lock);
834	printf("cpuid = %d; ", cpuid);
835	printf("lapic.id = %08x\n", lapic.id);
836#endif
837	if (type == T_PAGEFLT) {
838		printf("fault virtual address	= 0x%x\n", eva);
839		printf("fault code		= %s %s, %s\n",
840			code & PGEX_U ? "user" : "supervisor",
841			code & PGEX_W ? "write" : "read",
842			code & PGEX_P ? "protection violation" : "page not present");
843	}
844	printf("instruction pointer	= 0x%x:0x%x\n",
845	       frame->tf_cs & 0xffff, frame->tf_eip);
846        if ((ISPL(frame->tf_cs) == SEL_UPL) || (frame->tf_eflags & PSL_VM)) {
847		ss = frame->tf_ss & 0xffff;
848		esp = frame->tf_esp;
849	} else {
850		ss = GSEL(GDATA_SEL, SEL_KPL);
851		esp = (int)&frame->tf_esp;
852	}
853	printf("stack pointer	        = 0x%x:0x%x\n", ss, esp);
854	printf("frame pointer	        = 0x%x:0x%x\n", ss, frame->tf_ebp);
855	printf("code segment		= base 0x%x, limit 0x%x, type 0x%x\n",
856	       softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type);
857	printf("			= DPL %d, pres %d, def32 %d, gran %d\n",
858	       softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_def32,
859	       softseg.ssd_gran);
860	printf("processor eflags	= ");
861	if (frame->tf_eflags & PSL_T)
862		printf("trace trap, ");
863	if (frame->tf_eflags & PSL_I)
864		printf("interrupt enabled, ");
865	if (frame->tf_eflags & PSL_NT)
866		printf("nested task, ");
867	if (frame->tf_eflags & PSL_RF)
868		printf("resume, ");
869	if (frame->tf_eflags & PSL_VM)
870		printf("vm86, ");
871	printf("IOPL = %d\n", (frame->tf_eflags & PSL_IOPL) >> 12);
872	printf("current process		= ");
873	if (curproc) {
874		printf("%lu (%s)\n",
875		    (u_long)curproc->p_pid, curproc->p_comm ?
876		    curproc->p_comm : "");
877	} else {
878		printf("Idle\n");
879	}
880	printf("interrupt mask		= ");
881	if ((cpl & net_imask) == net_imask)
882		printf("net ");
883	if ((cpl & tty_imask) == tty_imask)
884		printf("tty ");
885	if ((cpl & bio_imask) == bio_imask)
886		printf("bio ");
887	if ((cpl & cam_imask) == cam_imask)
888		printf("cam ");
889	if (cpl == 0)
890		printf("none");
891#ifdef SMP
892/**
893 *  XXX FIXME:
894 *	we probably SHOULD have stopped the other CPUs before now!
895 *	another CPU COULD have been touching cpl at this moment...
896 */
897	printf(" <- SMP: XXX");
898#endif
899	printf("\n");
900
901#ifdef KDB
902	if (kdb_trap(&psl))
903		return;
904#endif
905#ifdef DDB
906	if (kdb_trap (type, 0, frame))
907		return;
908#endif
909	printf("trap number		= %d\n", type);
910	if (type <= MAX_TRAP_MSG)
911		panic(trap_msg[type]);
912	else
913		panic("unknown/reserved trap");
914}
915
/*
 * Double fault handler. Called when a fault occurs while writing
 * a frame for a trap/exception onto the stack. This usually occurs
 * when the stack overflows (such is the case with infinite recursion,
 * for example).
 *
 * XXX Note that the current PTD gets replaced by IdlePTD when the
 * task switch occurs. This means that the stack that was active at
 * the time of the double fault is not available at <kstack> unless
 * the machine was idle when the double fault occurred. The downside
 * of this is that "trace <ebp>" in ddb won't work.
 *
 * Prints the eip, esp and ebp saved in common_tss (plus SMP state
 * when configured) and then panics; it never returns normally.
 */
void
dblfault_handler()
{
	printf("\nFatal double fault:\n");
	printf("eip = 0x%x\n", common_tss.tss_eip);
	printf("esp = 0x%x\n", common_tss.tss_esp);
	printf("ebp = 0x%x\n", common_tss.tss_ebp);
#ifdef SMP
	/* three separate prints in case of a trap on an unmapped page */
	printf("mp_lock = %08x; ", mp_lock);
	printf("cpuid = %d; ", cpuid);
	printf("lapic.id = %08x\n", lapic.id);
#endif
	panic("double fault");
}
943
944/*
945 * Compensate for 386 brain damage (missing URKR).
946 * This is a little simpler than the pagefault handler in trap() because
947 * it the page tables have already been faulted in and high addresses
948 * are thrown out early for other reasons.
949 */
950int trapwrite(addr)
951	unsigned addr;
952{
953	struct proc *p;
954	vm_offset_t va;
955	struct vmspace *vm;
956	int rv;
957
958	va = trunc_page((vm_offset_t)addr);
959	/*
960	 * XXX - MAX is END.  Changed > to >= for temp. fix.
961	 */
962	if (va >= VM_MAXUSER_ADDRESS)
963		return (1);
964
965	p = curproc;
966	vm = p->p_vmspace;
967
968	++p->p_lock;
969
970	if ((caddr_t)va >= vm->vm_maxsaddr
971	    && (caddr_t)va < (caddr_t)USRSTACK) {
972		if (!grow(p, va)) {
973			--p->p_lock;
974			return (1);
975		}
976	}
977
978	/*
979	 * fault the data page
980	 */
981	rv = vm_fault(&vm->vm_map, va, VM_PROT_READ|VM_PROT_WRITE, VM_FAULT_DIRTY);
982
983	--p->p_lock;
984
985	if (rv != KERN_SUCCESS)
986		return 1;
987
988	return (0);
989}
990
991/*
992 * System call request from POSIX system call gate interface to kernel.
993 * Like trap(), argument is call by reference.
994 */
995void
996syscall(frame)
997	struct trapframe frame;
998{
999	caddr_t params;
1000	int i;
1001	struct sysent *callp;
1002	struct proc *p = curproc;
1003	u_quad_t sticks;
1004	int error;
1005	int args[8];
1006	u_int code;
1007
1008#ifdef DIAGNOSTIC
1009	if (ISPL(frame.tf_cs) != SEL_UPL)
1010		panic("syscall");
1011#endif
1012	sticks = p->p_sticks;
1013	p->p_md.md_regs = &frame;
1014	params = (caddr_t)frame.tf_esp + sizeof(int);
1015	code = frame.tf_eax;
1016	if (p->p_sysent->sv_prepsyscall) {
1017		(*p->p_sysent->sv_prepsyscall)(&frame, args, &code, &params);
1018	} else {
1019		/*
1020		 * Need to check if this is a 32 bit or 64 bit syscall.
1021		 */
1022		if (code == SYS_syscall) {
1023			/*
1024			 * Code is first argument, followed by actual args.
1025			 */
1026			code = fuword(params);
1027			params += sizeof(int);
1028		} else if (code == SYS___syscall) {
1029			/*
1030			 * Like syscall, but code is a quad, so as to maintain
1031			 * quad alignment for the rest of the arguments.
1032			 */
1033			code = fuword(params);
1034			params += sizeof(quad_t);
1035		}
1036	}
1037
1038 	if (p->p_sysent->sv_mask)
1039 		code &= p->p_sysent->sv_mask;
1040
1041 	if (code >= p->p_sysent->sv_size)
1042 		callp = &p->p_sysent->sv_table[0];
1043  	else
1044 		callp = &p->p_sysent->sv_table[code];
1045
1046	if (params && (i = callp->sy_narg * sizeof(int)) &&
1047	    (error = copyin(params, (caddr_t)args, (u_int)i))) {
1048#ifdef KTRACE
1049		if (KTRPOINT(p, KTR_SYSCALL))
1050			ktrsyscall(p->p_tracep, code, callp->sy_narg, args);
1051#endif
1052		goto bad;
1053	}
1054#ifdef KTRACE
1055	if (KTRPOINT(p, KTR_SYSCALL))
1056		ktrsyscall(p->p_tracep, code, callp->sy_narg, args);
1057#endif
1058	p->p_retval[0] = 0;
1059	p->p_retval[1] = frame.tf_edx;
1060
1061	STOPEVENT(p, S_SCE, callp->sy_narg);
1062
1063	error = (*callp->sy_call)(p, args);
1064
1065	switch (error) {
1066
1067	case 0:
1068		/*
1069		 * Reinitialize proc pointer `p' as it may be different
1070		 * if this is a child returning from fork syscall.
1071		 */
1072		p = curproc;
1073		frame.tf_eax = p->p_retval[0];
1074		frame.tf_edx = p->p_retval[1];
1075		frame.tf_eflags &= ~PSL_C;
1076		break;
1077
1078	case ERESTART:
1079		/*
1080		 * Reconstruct pc, assuming lcall $X,y is 7 bytes,
1081		 * int 0x80 is 2 bytes. We saved this in tf_err.
1082		 */
1083		frame.tf_eip -= frame.tf_err;
1084		break;
1085
1086	case EJUSTRETURN:
1087		break;
1088
1089	default:
1090bad:
1091 		if (p->p_sysent->sv_errsize)
1092 			if (error >= p->p_sysent->sv_errsize)
1093  				error = -1;	/* XXX */
1094   			else
1095  				error = p->p_sysent->sv_errtbl[error];
1096		frame.tf_eax = error;
1097		frame.tf_eflags |= PSL_C;
1098		break;
1099	}
1100
1101	if ((frame.tf_eflags & PSL_T) && !(frame.tf_eflags & PSL_VM)) {
1102		/* Traced syscall. */
1103		frame.tf_eflags &= ~PSL_T;
1104		trapsignal(p, SIGTRAP, 0);
1105	}
1106
1107	userret(p, &frame, sticks);
1108
1109#ifdef KTRACE
1110	if (KTRPOINT(p, KTR_SYSRET))
1111		ktrsysret(p->p_tracep, code, error, p->p_retval[0]);
1112#endif
1113
1114	/*
1115	 * This works because errno is findable through the
1116	 * register set.  If we ever support an emulation where this
1117	 * is not the case, this code will need to be revisited.
1118	 */
1119	STOPEVENT(p, S_SCX, code);
1120
1121}
1122
1123/*
1124 * Simplified back end of syscall(), used when returning from fork()
1125 * directly into user mode.
1126 */
1127void
1128fork_return(p, frame)
1129	struct proc *p;
1130	struct trapframe frame;
1131{
1132	frame.tf_eax = 0;		/* Child returns zero */
1133	frame.tf_eflags &= ~PSL_C;	/* success */
1134	frame.tf_edx = 1;
1135
1136	userret(p, &frame, 0);
1137#ifdef KTRACE
1138	if (KTRPOINT(p, KTR_SYSRET))
1139		ktrsysret(p->p_tracep, SYS_fork, 0, 0);
1140#endif
1141}
1142