subr_syscall.c revision 47678
1/*-
2 * Copyright (C) 1994, David Greenman
3 * Copyright (c) 1990, 1993
4 *	The Regents of the University of California.  All rights reserved.
5 *
6 * This code is derived from software contributed to Berkeley by
7 * the University of Utah, and William Jolitz.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 *    notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 *    notice, this list of conditions and the following disclaimer in the
16 *    documentation and/or other materials provided with the distribution.
17 * 3. All advertising materials mentioning features or use of this software
18 *    must display the following acknowledgement:
19 *	This product includes software developed by the University of
20 *	California, Berkeley and its contributors.
21 * 4. Neither the name of the University nor the names of its contributors
22 *    may be used to endorse or promote products derived from this software
23 *    without specific prior written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 * SUCH DAMAGE.
36 *
37 *	from: @(#)trap.c	7.4 (Berkeley) 5/13/91
38 *	$Id: trap.c,v 1.137 1999/05/06 18:12:17 peter Exp $
39 */
40
41/*
42 * 386 Trap and System call handling
43 */
44
45#include "opt_cpu.h"
46#include "opt_ddb.h"
47#include "opt_ktrace.h"
48#include "opt_clock.h"
49#include "opt_trap.h"
50
51#include <sys/param.h>
52#include <sys/systm.h>
53#include <sys/proc.h>
54#include <sys/pioctl.h>
55#include <sys/kernel.h>
56#include <sys/resourcevar.h>
57#include <sys/signalvar.h>
58#include <sys/syscall.h>
59#include <sys/sysent.h>
60#include <sys/uio.h>
61#include <sys/vmmeter.h>
62#ifdef KTRACE
63#include <sys/ktrace.h>
64#endif
65
66#include <vm/vm.h>
67#include <vm/vm_param.h>
68#include <vm/vm_prot.h>
69#include <sys/lock.h>
70#include <vm/pmap.h>
71#include <vm/vm_kern.h>
72#include <vm/vm_map.h>
73#include <vm/vm_page.h>
74#include <vm/vm_extern.h>
75
76#include <machine/cpu.h>
77#include <machine/ipl.h>
78#include <machine/md_var.h>
79#include <machine/pcb.h>
80#ifdef SMP
81#include <machine/smp.h>
82#endif
83#include <machine/tss.h>
84
85#include <i386/isa/intr_machdep.h>
86
87#ifdef POWERFAIL_NMI
88#include <sys/syslog.h>
89#include <machine/clock.h>
90#endif
91
92#include <machine/vm86.h>
93
94#ifdef DDB
95	extern int in_Debugger, debugger_on_panic;
96#endif
97
98#include "isa.h"
99#include "npx.h"
100
101int (*pmath_emulate) __P((struct trapframe *));
102
103extern void trap __P((struct trapframe frame));
104extern int trapwrite __P((unsigned addr));
105extern void syscall __P((struct trapframe frame));
106
107static int trap_pfault __P((struct trapframe *, int, vm_offset_t));
108static void trap_fatal __P((struct trapframe *, vm_offset_t));
109void dblfault_handler __P((void));
110
111extern inthand_t IDTVEC(syscall);
112
/*
 * Human-readable names for trap numbers, indexed by trap type (T_*).
 * MAX_TRAP_MSG is the highest valid index; callers check
 * "type <= MAX_TRAP_MSG" before indexing.  Unused trap numbers map to "".
 * Keep MAX_TRAP_MSG in step with the table when adding entries.
 */
#define MAX_TRAP_MSG		28
static char *trap_msg[] = {
	"",					/*  0 unused */
	"privileged instruction fault",		/*  1 T_PRIVINFLT */
	"",					/*  2 unused */
	"breakpoint instruction fault",		/*  3 T_BPTFLT */
	"",					/*  4 unused */
	"",					/*  5 unused */
	"arithmetic trap",			/*  6 T_ARITHTRAP */
	"system forced exception",		/*  7 T_ASTFLT */
	"",					/*  8 unused */
	"general protection fault",		/*  9 T_PROTFLT */
	"trace trap",				/* 10 T_TRCTRAP */
	"",					/* 11 unused */
	"page fault",				/* 12 T_PAGEFLT */
	"",					/* 13 unused */
	"alignment fault",			/* 14 T_ALIGNFLT */
	"",					/* 15 unused */
	"",					/* 16 unused */
	"",					/* 17 unused */
	"integer divide fault",			/* 18 T_DIVIDE */
	"non-maskable interrupt trap",		/* 19 T_NMI */
	"overflow trap",			/* 20 T_OFLOW */
	"FPU bounds check fault",		/* 21 T_BOUND */
	"FPU device not available",		/* 22 T_DNA */
	"double fault",				/* 23 T_DOUBLEFLT */
	"FPU operand fetch fault",		/* 24 T_FPOPFLT */
	"invalid TSS fault",			/* 25 T_TSSFLT */
	"segment not present fault",		/* 26 T_SEGNPFLT */
	"stack fault",				/* 27 T_STKFLT */
	"machine check trap",			/* 28 T_MCHK */
};
145
146static __inline void userret __P((struct proc *p, struct trapframe *frame,
147				  u_quad_t oticks));
148
149#if defined(I586_CPU) && !defined(NO_F00F_HACK)
150extern struct gate_descriptor *t_idt;
151extern int has_f00f_bug;
152#endif
153
154static __inline void
155userret(p, frame, oticks)
156	struct proc *p;
157	struct trapframe *frame;
158	u_quad_t oticks;
159{
160	int sig, s;
161
162	while ((sig = CURSIG(p)) != 0)
163		postsig(sig);
164
165#if 0
166	if (!want_resched &&
167		(p->p_priority <= p->p_usrpri) &&
168		(p->p_rtprio.type == RTP_PRIO_NORMAL)) {
169		 int newpriority;
170		 p->p_estcpu += 1;
171		 newpriority = PUSER + p->p_estcpu / 4 + 2 * p->p_nice;
172		 newpriority = min(newpriority, MAXPRI);
173		 p->p_usrpri = newpriority;
174	}
175#endif
176
177	p->p_priority = p->p_usrpri;
178	if (want_resched) {
179		/*
180		 * Since we are curproc, clock will normally just change
181		 * our priority without moving us from one queue to another
182		 * (since the running process is not on a queue.)
183		 * If that happened after we setrunqueue ourselves but before we
184		 * mi_switch()'ed, we might not be on the queue indicated by
185		 * our priority.
186		 */
187		s = splhigh();
188		setrunqueue(p);
189		p->p_stats->p_ru.ru_nivcsw++;
190		mi_switch();
191		splx(s);
192		while ((sig = CURSIG(p)) != 0)
193			postsig(sig);
194	}
195	/*
196	 * Charge system time if profiling.
197	 */
198	if (p->p_flag & P_PROFIL)
199		addupc_task(p, frame->tf_eip,
200			    (u_int)(p->p_sticks - oticks) * psratio);
201
202	curpriority = p->p_priority;
203}
204
205/*
206 * Exception, fault, and trap interface to the FreeBSD kernel.
207 * This common code is called from assembly language IDT gate entry
208 * routines that prepare a suitable stack frame, and restore this
209 * frame after the exception has been processed.
210 */
211
212void
213trap(frame)
214	struct trapframe frame;
215{
216	struct proc *p = curproc;
217	u_quad_t sticks = 0;
218	int i = 0, ucode = 0, type, code;
219	vm_offset_t eva;
220
221	if (!(frame.tf_eflags & PSL_I)) {
222		/*
223		 * Buggy application or kernel code has disabled interrupts
224		 * and then trapped.  Enabling interrupts now is wrong, but
225		 * it is better than running with interrupts disabled until
226		 * they are accidentally enabled later.
227		 */
228		type = frame.tf_trapno;
229		if (ISPL(frame.tf_cs) == SEL_UPL || (frame.tf_eflags & PSL_VM))
230			printf(
231			    "pid %ld (%s): trap %d with interrupts disabled\n",
232			    (long)curproc->p_pid, curproc->p_comm, type);
233		else if (type != T_BPTFLT && type != T_TRCTRAP)
234			/*
235			 * XXX not quite right, since this may be for a
236			 * multiple fault in user mode.
237			 */
238			printf("kernel trap %d with interrupts disabled\n",
239			    type);
240		enable_intr();
241	}
242
243	eva = 0;
244	if (frame.tf_trapno == T_PAGEFLT) {
245		/*
246		 * For some Cyrix CPUs, %cr2 is clobbered by interrupts.
247		 * This problem is worked around by using an interrupt
248		 * gate for the pagefault handler.  We are finally ready
249		 * to read %cr2 and then must reenable interrupts.
250		 *
251		 * XXX this should be in the switch statement, but the
252		 * NO_FOOF_HACK and VM86 goto and ifdefs obfuscate the
253		 * flow of control too much for this to be obviously
254		 * correct.
255		 */
256		eva = rcr2();
257		enable_intr();
258	}
259
260#if defined(I586_CPU) && !defined(NO_F00F_HACK)
261restart:
262#endif
263	type = frame.tf_trapno;
264	code = frame.tf_err;
265
266	if (in_vm86call) {
267		if (frame.tf_eflags & PSL_VM &&
268		    (type == T_PROTFLT || type == T_STKFLT)) {
269			i = vm86_emulate((struct vm86frame *)&frame);
270			if (i != 0)
271				/*
272				 * returns to original process
273				 */
274				vm86_trap((struct vm86frame *)&frame);
275			return;
276		}
277		switch (type) {
278			/*
279			 * these traps want either a process context, or
280			 * assume a normal userspace trap.
281			 */
282		case T_PROTFLT:
283		case T_SEGNPFLT:
284			trap_fatal(&frame, eva);
285			return;
286		case T_TRCTRAP:
287			type = T_BPTFLT;	/* kernel breakpoint */
288			/* FALL THROUGH */
289		}
290		goto kernel_trap;	/* normal kernel trap handling */
291	}
292
293        if ((ISPL(frame.tf_cs) == SEL_UPL) || (frame.tf_eflags & PSL_VM)) {
294		/* user trap */
295
296		sticks = p->p_sticks;
297		p->p_md.md_regs = &frame;
298
299		switch (type) {
300		case T_PRIVINFLT:	/* privileged instruction fault */
301			ucode = type;
302			i = SIGILL;
303			break;
304
305		case T_BPTFLT:		/* bpt instruction fault */
306		case T_TRCTRAP:		/* trace trap */
307			frame.tf_eflags &= ~PSL_T;
308			i = SIGTRAP;
309			break;
310
311		case T_ARITHTRAP:	/* arithmetic trap */
312			ucode = code;
313			i = SIGFPE;
314			break;
315
316		case T_ASTFLT:		/* Allow process switch */
317			astoff();
318			cnt.v_soft++;
319			if (p->p_flag & P_OWEUPC) {
320				p->p_flag &= ~P_OWEUPC;
321				addupc_task(p, p->p_stats->p_prof.pr_addr,
322					    p->p_stats->p_prof.pr_ticks);
323			}
324			goto out;
325
326			/*
327			 * The following two traps can happen in
328			 * vm86 mode, and, if so, we want to handle
329			 * them specially.
330			 */
331		case T_PROTFLT:		/* general protection fault */
332		case T_STKFLT:		/* stack fault */
333			if (frame.tf_eflags & PSL_VM) {
334				i = vm86_emulate((struct vm86frame *)&frame);
335				if (i == 0)
336					goto out;
337				break;
338			}
339			/* FALL THROUGH */
340
341		case T_SEGNPFLT:	/* segment not present fault */
342		case T_TSSFLT:		/* invalid TSS fault */
343		case T_DOUBLEFLT:	/* double fault */
344		default:
345			ucode = code + BUS_SEGM_FAULT ;
346			i = SIGBUS;
347			break;
348
349		case T_PAGEFLT:		/* page fault */
350			i = trap_pfault(&frame, TRUE, eva);
351			if (i == -1)
352				return;
353#if defined(I586_CPU) && !defined(NO_F00F_HACK)
354			if (i == -2)
355				goto restart;
356#endif
357			if (i == 0)
358				goto out;
359
360			ucode = T_PAGEFLT;
361			break;
362
363		case T_DIVIDE:		/* integer divide fault */
364			ucode = FPE_INTDIV_TRAP;
365			i = SIGFPE;
366			break;
367
368#if NISA > 0
369		case T_NMI:
370#ifdef POWERFAIL_NMI
371			goto handle_powerfail;
372#else /* !POWERFAIL_NMI */
373#ifdef DDB
374			/* NMI can be hooked up to a pushbutton for debugging */
375			printf ("NMI ... going to debugger\n");
376			if (kdb_trap (type, 0, &frame))
377				return;
378#endif /* DDB */
379			/* machine/parity/power fail/"kitchen sink" faults */
380			if (isa_nmi(code) == 0) return;
381			panic("NMI indicates hardware failure");
382#endif /* POWERFAIL_NMI */
383#endif /* NISA > 0 */
384
385		case T_OFLOW:		/* integer overflow fault */
386			ucode = FPE_INTOVF_TRAP;
387			i = SIGFPE;
388			break;
389
390		case T_BOUND:		/* bounds check fault */
391			ucode = FPE_SUBRNG_TRAP;
392			i = SIGFPE;
393			break;
394
395		case T_DNA:
396#if NNPX > 0
397			/* if a transparent fault (due to context switch "late") */
398			if (npxdna())
399				return;
400#endif
401			if (!pmath_emulate) {
402				i = SIGFPE;
403				ucode = FPE_FPU_NP_TRAP;
404				break;
405			}
406			i = (*pmath_emulate)(&frame);
407			if (i == 0) {
408				if (!(frame.tf_eflags & PSL_T))
409					return;
410				frame.tf_eflags &= ~PSL_T;
411				i = SIGTRAP;
412			}
413			/* else ucode = emulator_only_knows() XXX */
414			break;
415
416		case T_FPOPFLT:		/* FPU operand fetch fault */
417			ucode = T_FPOPFLT;
418			i = SIGILL;
419			break;
420		}
421	} else {
422kernel_trap:
423		/* kernel trap */
424
425		switch (type) {
426		case T_PAGEFLT:			/* page fault */
427			(void) trap_pfault(&frame, FALSE, eva);
428			return;
429
430		case T_DNA:
431#if NNPX > 0
432			/*
433			 * The kernel is apparently using npx for copying.
434			 * XXX this should be fatal unless the kernel has
435			 * registered such use.
436			 */
437			if (npxdna())
438				return;
439#endif
440			break;
441
442		case T_PROTFLT:		/* general protection fault */
443		case T_SEGNPFLT:	/* segment not present fault */
444			/*
445			 * Invalid segment selectors and out of bounds
446			 * %eip's and %esp's can be set up in user mode.
447			 * This causes a fault in kernel mode when the
448			 * kernel tries to return to user mode.  We want
449			 * to get this fault so that we can fix the
450			 * problem here and not have to check all the
451			 * selectors and pointers when the user changes
452			 * them.
453			 */
454#define	MAYBE_DORETI_FAULT(where, whereto)				\
455	do {								\
456		if (frame.tf_eip == (int)where) {			\
457			frame.tf_eip = (int)whereto;			\
458			return;						\
459		}							\
460	} while (0)
461
462			if (intr_nesting_level == 0) {
463				/*
464				 * Invalid %fs's and %gs's can be created using
465				 * procfs or PT_SETREGS or by invalidating the
466				 * underlying LDT entry.  This causes a fault
467				 * in kernel mode when the kernel attempts to
468				 * switch contexts.  Lose the bad context
469				 * (XXX) so that we can continue, and generate
470				 * a signal.
471				 */
472				if (frame.tf_eip == (int)cpu_switch_load_gs) {
473					curpcb->pcb_gs = 0;
474					psignal(p, SIGBUS);
475					return;
476				}
477				MAYBE_DORETI_FAULT(doreti_iret,
478						   doreti_iret_fault);
479				MAYBE_DORETI_FAULT(doreti_popl_ds,
480						   doreti_popl_ds_fault);
481				MAYBE_DORETI_FAULT(doreti_popl_es,
482						   doreti_popl_es_fault);
483				MAYBE_DORETI_FAULT(doreti_popl_fs,
484						   doreti_popl_fs_fault);
485				if (curpcb && curpcb->pcb_onfault) {
486					frame.tf_eip = (int)curpcb->pcb_onfault;
487					return;
488				}
489			}
490			break;
491
492		case T_TSSFLT:
493			/*
494			 * PSL_NT can be set in user mode and isn't cleared
495			 * automatically when the kernel is entered.  This
496			 * causes a TSS fault when the kernel attempts to
497			 * `iret' because the TSS link is uninitialized.  We
498			 * want to get this fault so that we can fix the
499			 * problem here and not every time the kernel is
500			 * entered.
501			 */
502			if (frame.tf_eflags & PSL_NT) {
503				frame.tf_eflags &= ~PSL_NT;
504				return;
505			}
506			break;
507
508		case T_TRCTRAP:	 /* trace trap */
509			if (frame.tf_eip == (int)IDTVEC(syscall)) {
510				/*
511				 * We've just entered system mode via the
512				 * syscall lcall.  Continue single stepping
513				 * silently until the syscall handler has
514				 * saved the flags.
515				 */
516				return;
517			}
518			if (frame.tf_eip == (int)IDTVEC(syscall) + 1) {
519				/*
520				 * The syscall handler has now saved the
521				 * flags.  Stop single stepping it.
522				 */
523				frame.tf_eflags &= ~PSL_T;
524				return;
525			}
526			/*
527			 * Fall through.
528			 */
529		case T_BPTFLT:
530			/*
531			 * If DDB is enabled, let it handle the debugger trap.
532			 * Otherwise, debugger traps "can't happen".
533			 */
534#ifdef DDB
535			if (kdb_trap (type, 0, &frame))
536				return;
537#endif
538			break;
539
540#if NISA > 0
541		case T_NMI:
542#ifdef POWERFAIL_NMI
543#ifndef TIMER_FREQ
544#  define TIMER_FREQ 1193182
545#endif
546	handle_powerfail:
547		{
548		  static unsigned lastalert = 0;
549
550		  if(time_second - lastalert > 10)
551		    {
552		      log(LOG_WARNING, "NMI: power fail\n");
553		      sysbeep(TIMER_FREQ/880, hz);
554		      lastalert = time_second;
555		    }
556		  return;
557		}
558#else /* !POWERFAIL_NMI */
559#ifdef DDB
560			/* NMI can be hooked up to a pushbutton for debugging */
561			printf ("NMI ... going to debugger\n");
562			if (kdb_trap (type, 0, &frame))
563				return;
564#endif /* DDB */
565			/* machine/parity/power fail/"kitchen sink" faults */
566			if (isa_nmi(code) == 0) return;
567			/* FALL THROUGH */
568#endif /* POWERFAIL_NMI */
569#endif /* NISA > 0 */
570		}
571
572		trap_fatal(&frame, eva);
573		return;
574	}
575
576	/* Translate fault for emulators (e.g. Linux) */
577	if (*p->p_sysent->sv_transtrap)
578		i = (*p->p_sysent->sv_transtrap)(i, type);
579
580	trapsignal(p, i, ucode);
581
582#ifdef DEBUG
583	if (type <= MAX_TRAP_MSG) {
584		uprintf("fatal process exception: %s",
585			trap_msg[type]);
586		if ((type == T_PAGEFLT) || (type == T_PROTFLT))
587			uprintf(", fault VA = 0x%lx", (u_long)eva);
588		uprintf("\n");
589	}
590#endif
591
592out:
593	userret(p, &frame, sticks);
594}
595
#ifdef notyet
/*
 * This version doesn't allow a page fault to user space while
 * in the kernel. The rest of the kernel needs to be made "safe"
 * before this can be used. I think the only things remaining
 * to be made safe are the iBCS2 code and the process tracing/
 * debugging code.
 *
 * Compiled only when `notyet' is defined.
 *
 *	frame		trap frame of the faulting context
 *	usermode	non-zero if the fault was taken in user mode
 *	eva		faulting virtual address
 *
 * Returns 0 if the fault was resolved (or execution was redirected to
 * pcb_onfault), -1 after calling trap_fatal(), or the signal number
 * (SIGBUS or SIGSEGV) to deliver for an unresolved user-mode fault.
 */
static int
trap_pfault(frame, usermode, eva)
	struct trapframe *frame;
	int usermode;
	vm_offset_t eva;
{
	vm_offset_t va;
	struct vmspace *vm = NULL;
	vm_map_t map = 0;
	int rv = 0;
	vm_prot_t ftype;
	struct proc *p = curproc;

	if (frame->tf_err & PGEX_W)
		ftype = VM_PROT_READ | VM_PROT_WRITE;
	else
		ftype = VM_PROT_READ;

	va = trunc_page(eva);
	if (va < VM_MIN_KERNEL_ADDRESS) {
		/*
		 * A kernel-mode access to a user address is only tolerated
		 * outside interrupt context with a pcb_onfault handler set.
		 */
		if (p == NULL ||
		    (!usermode && va < VM_MAXUSER_ADDRESS &&
		     (intr_nesting_level != 0 || curpcb == NULL ||
		      curpcb->pcb_onfault == NULL))) {
			trap_fatal(frame, eva);
			return (-1);
		}

		/*
		 * This is a fault on non-kernel virtual memory.
		 * vm is initialized above to NULL. If curproc is NULL
		 * or curproc->p_vmspace is NULL the fault is fatal.
		 */
		vm = p->p_vmspace;
		if (vm == NULL)
			goto nogo;

		map = &vm->vm_map;

		/*
		 * Keep swapout from messing with us during this
		 *	critical time.
		 */
		++p->p_lock;

		/*
		 * Grow the stack if necessary
		 */
		/* grow_stack returns false only if va falls into
		 * a growable stack region and the stack growth
		 * fails.  It returns true if va was not within
		 * a growable stack region, or if the stack
		 * growth succeeded.
		 */
		if (!grow_stack (p, va)) {
			rv = KERN_FAILURE;
			--p->p_lock;
			goto nogo;
		}

		/* Fault in the user page: */
		rv = vm_fault(map, va, ftype,
			(ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY : 0);

		--p->p_lock;
	} else {
		/*
		 * Don't allow user-mode faults in kernel address space.
		 */
		if (usermode)
			goto nogo;

		/*
		 * Since we know that kernel virtual address addresses
		 * always have pte pages mapped, we just have to fault
		 * the page.
		 */
		rv = vm_fault(kernel_map, va, ftype, FALSE);
	}

	if (rv == KERN_SUCCESS)
		return (0);
nogo:
	if (!usermode) {
		if (intr_nesting_level == 0 && curpcb && curpcb->pcb_onfault) {
			frame->tf_eip = (int)curpcb->pcb_onfault;
			return (0);
		}
		trap_fatal(frame, eva);
		return (-1);
	}

	/* kludge to pass faulting virtual address to sendsig */
	frame->tf_err = eva;

	return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV);
}
#endif
705
706int
707trap_pfault(frame, usermode, eva)
708	struct trapframe *frame;
709	int usermode;
710	vm_offset_t eva;
711{
712	vm_offset_t va;
713	struct vmspace *vm = NULL;
714	vm_map_t map = 0;
715	int rv = 0;
716	vm_prot_t ftype;
717	struct proc *p = curproc;
718
719	va = trunc_page(eva);
720	if (va >= KERNBASE) {
721		/*
722		 * Don't allow user-mode faults in kernel address space.
723		 * An exception:  if the faulting address is the invalid
724		 * instruction entry in the IDT, then the Intel Pentium
725		 * F00F bug workaround was triggered, and we need to
726		 * treat it is as an illegal instruction, and not a page
727		 * fault.
728		 */
729#if defined(I586_CPU) && !defined(NO_F00F_HACK)
730		if ((eva == (unsigned int)&t_idt[6]) && has_f00f_bug) {
731			frame->tf_trapno = T_PRIVINFLT;
732			return -2;
733		}
734#endif
735		if (usermode)
736			goto nogo;
737
738		map = kernel_map;
739	} else {
740		/*
741		 * This is a fault on non-kernel virtual memory.
742		 * vm is initialized above to NULL. If curproc is NULL
743		 * or curproc->p_vmspace is NULL the fault is fatal.
744		 */
745		if (p != NULL)
746			vm = p->p_vmspace;
747
748		if (vm == NULL)
749			goto nogo;
750
751		map = &vm->vm_map;
752	}
753
754	if (frame->tf_err & PGEX_W)
755		ftype = VM_PROT_READ | VM_PROT_WRITE;
756	else
757		ftype = VM_PROT_READ;
758
759	if (map != kernel_map) {
760		/*
761		 * Keep swapout from messing with us during this
762		 *	critical time.
763		 */
764		++p->p_lock;
765
766		/*
767		 * Grow the stack if necessary
768		 */
769		/* grow_stack returns false only if va falls into
770		 * a growable stack region and the stack growth
771		 * fails.  It returns true if va was not within
772		 * a growable stack region, or if the stack
773		 * growth succeeded.
774		 */
775		if (!grow_stack (p, va)) {
776			rv = KERN_FAILURE;
777			--p->p_lock;
778			goto nogo;
779		}
780
781		/* Fault in the user page: */
782		rv = vm_fault(map, va, ftype,
783			(ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY : 0);
784
785		--p->p_lock;
786	} else {
787		/*
788		 * Don't have to worry about process locking or stacks in the kernel.
789		 */
790		rv = vm_fault(map, va, ftype, FALSE);
791	}
792
793	if (rv == KERN_SUCCESS)
794		return (0);
795nogo:
796	if (!usermode) {
797		if (intr_nesting_level == 0 && curpcb && curpcb->pcb_onfault) {
798			frame->tf_eip = (int)curpcb->pcb_onfault;
799			return (0);
800		}
801		trap_fatal(frame, eva);
802		return (-1);
803	}
804
805	/* kludge to pass faulting virtual address to sendsig */
806	frame->tf_err = eva;
807
808	return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV);
809}
810
811static void
812trap_fatal(frame, eva)
813	struct trapframe *frame;
814	vm_offset_t eva;
815{
816	int code, type, ss, esp;
817	struct soft_segment_descriptor softseg;
818
819	code = frame->tf_err;
820	type = frame->tf_trapno;
821	sdtossd(&gdt[IDXSEL(frame->tf_cs & 0xffff)].sd, &softseg);
822
823	if (type <= MAX_TRAP_MSG)
824		printf("\n\nFatal trap %d: %s while in %s mode\n",
825			type, trap_msg[type],
826        		frame->tf_eflags & PSL_VM ? "vm86" :
827			ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel");
828#ifdef SMP
829	/* three seperate prints in case of a trap on an unmapped page */
830	printf("mp_lock = %08x; ", mp_lock);
831	printf("cpuid = %d; ", cpuid);
832	printf("lapic.id = %08x\n", lapic.id);
833#endif
834	if (type == T_PAGEFLT) {
835		printf("fault virtual address	= 0x%x\n", eva);
836		printf("fault code		= %s %s, %s\n",
837			code & PGEX_U ? "user" : "supervisor",
838			code & PGEX_W ? "write" : "read",
839			code & PGEX_P ? "protection violation" : "page not present");
840	}
841	printf("instruction pointer	= 0x%x:0x%x\n",
842	       frame->tf_cs & 0xffff, frame->tf_eip);
843        if ((ISPL(frame->tf_cs) == SEL_UPL) || (frame->tf_eflags & PSL_VM)) {
844		ss = frame->tf_ss & 0xffff;
845		esp = frame->tf_esp;
846	} else {
847		ss = GSEL(GDATA_SEL, SEL_KPL);
848		esp = (int)&frame->tf_esp;
849	}
850	printf("stack pointer	        = 0x%x:0x%x\n", ss, esp);
851	printf("frame pointer	        = 0x%x:0x%x\n", ss, frame->tf_ebp);
852	printf("code segment		= base 0x%x, limit 0x%x, type 0x%x\n",
853	       softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type);
854	printf("			= DPL %d, pres %d, def32 %d, gran %d\n",
855	       softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_def32,
856	       softseg.ssd_gran);
857	printf("processor eflags	= ");
858	if (frame->tf_eflags & PSL_T)
859		printf("trace trap, ");
860	if (frame->tf_eflags & PSL_I)
861		printf("interrupt enabled, ");
862	if (frame->tf_eflags & PSL_NT)
863		printf("nested task, ");
864	if (frame->tf_eflags & PSL_RF)
865		printf("resume, ");
866	if (frame->tf_eflags & PSL_VM)
867		printf("vm86, ");
868	printf("IOPL = %d\n", (frame->tf_eflags & PSL_IOPL) >> 12);
869	printf("current process		= ");
870	if (curproc) {
871		printf("%lu (%s)\n",
872		    (u_long)curproc->p_pid, curproc->p_comm ?
873		    curproc->p_comm : "");
874	} else {
875		printf("Idle\n");
876	}
877	printf("interrupt mask		= ");
878	if ((cpl & net_imask) == net_imask)
879		printf("net ");
880	if ((cpl & tty_imask) == tty_imask)
881		printf("tty ");
882	if ((cpl & bio_imask) == bio_imask)
883		printf("bio ");
884	if ((cpl & cam_imask) == cam_imask)
885		printf("cam ");
886	if (cpl == 0)
887		printf("none");
888#ifdef SMP
889/**
890 *  XXX FIXME:
891 *	we probably SHOULD have stopped the other CPUs before now!
892 *	another CPU COULD have been touching cpl at this moment...
893 */
894	printf(" <- SMP: XXX");
895#endif
896	printf("\n");
897
898#ifdef KDB
899	if (kdb_trap(&psl))
900		return;
901#endif
902#ifdef DDB
903	if ((debugger_on_panic || in_Debugger) && kdb_trap(type, 0, frame))
904		return;
905#endif
906	printf("trap number		= %d\n", type);
907	if (type <= MAX_TRAP_MSG)
908		panic(trap_msg[type]);
909	else
910		panic("unknown/reserved trap");
911}
912
913/*
914 * Double fault handler. Called when a fault occurs while writing
915 * a frame for a trap/exception onto the stack. This usually occurs
916 * when the stack overflows (such is the case with infinite recursion,
917 * for example).
918 *
919 * XXX Note that the current PTD gets replaced by IdlePTD when the
920 * task switch occurs. This means that the stack that was active at
921 * the time of the double fault is not available at <kstack> unless
922 * the machine was idle when the double fault occurred. The downside
923 * of this is that "trace <ebp>" in ddb won't work.
924 */
925void
926dblfault_handler()
927{
928	printf("\nFatal double fault:\n");
929	printf("eip = 0x%x\n", common_tss.tss_eip);
930	printf("esp = 0x%x\n", common_tss.tss_esp);
931	printf("ebp = 0x%x\n", common_tss.tss_ebp);
932#ifdef SMP
933	/* three seperate prints in case of a trap on an unmapped page */
934	printf("mp_lock = %08x; ", mp_lock);
935	printf("cpuid = %d; ", cpuid);
936	printf("lapic.id = %08x\n", lapic.id);
937#endif
938	panic("double fault");
939}
940
941/*
942 * Compensate for 386 brain damage (missing URKR).
943 * This is a little simpler than the pagefault handler in trap() because
944 * it the page tables have already been faulted in and high addresses
945 * are thrown out early for other reasons.
946 */
947int trapwrite(addr)
948	unsigned addr;
949{
950	struct proc *p;
951	vm_offset_t va;
952	struct vmspace *vm;
953	int rv;
954
955	va = trunc_page((vm_offset_t)addr);
956	/*
957	 * XXX - MAX is END.  Changed > to >= for temp. fix.
958	 */
959	if (va >= VM_MAXUSER_ADDRESS)
960		return (1);
961
962	p = curproc;
963	vm = p->p_vmspace;
964
965	++p->p_lock;
966
967	if (!grow_stack (p, va)) {
968		--p->p_lock;
969		return (1);
970	}
971
972	/*
973	 * fault the data page
974	 */
975	rv = vm_fault(&vm->vm_map, va, VM_PROT_READ|VM_PROT_WRITE, VM_FAULT_DIRTY);
976
977	--p->p_lock;
978
979	if (rv != KERN_SUCCESS)
980		return 1;
981
982	return (0);
983}
984
985/*
986 * System call request from POSIX system call gate interface to kernel.
987 * Like trap(), argument is call by reference.
988 */
989void
990syscall(frame)
991	struct trapframe frame;
992{
993	caddr_t params;
994	int i;
995	struct sysent *callp;
996	struct proc *p = curproc;
997	u_quad_t sticks;
998	int error;
999	int args[8];
1000	u_int code;
1001
1002#ifdef DIAGNOSTIC
1003	if (ISPL(frame.tf_cs) != SEL_UPL)
1004		panic("syscall");
1005#endif
1006	sticks = p->p_sticks;
1007	p->p_md.md_regs = &frame;
1008	params = (caddr_t)frame.tf_esp + sizeof(int);
1009	code = frame.tf_eax;
1010	if (p->p_sysent->sv_prepsyscall) {
1011		(*p->p_sysent->sv_prepsyscall)(&frame, args, &code, &params);
1012	} else {
1013		/*
1014		 * Need to check if this is a 32 bit or 64 bit syscall.
1015		 */
1016		if (code == SYS_syscall) {
1017			/*
1018			 * Code is first argument, followed by actual args.
1019			 */
1020			code = fuword(params);
1021			params += sizeof(int);
1022		} else if (code == SYS___syscall) {
1023			/*
1024			 * Like syscall, but code is a quad, so as to maintain
1025			 * quad alignment for the rest of the arguments.
1026			 */
1027			code = fuword(params);
1028			params += sizeof(quad_t);
1029		}
1030	}
1031
1032 	if (p->p_sysent->sv_mask)
1033 		code &= p->p_sysent->sv_mask;
1034
1035 	if (code >= p->p_sysent->sv_size)
1036 		callp = &p->p_sysent->sv_table[0];
1037  	else
1038 		callp = &p->p_sysent->sv_table[code];
1039
1040	if (params && (i = callp->sy_narg * sizeof(int)) &&
1041	    (error = copyin(params, (caddr_t)args, (u_int)i))) {
1042#ifdef KTRACE
1043		if (KTRPOINT(p, KTR_SYSCALL))
1044			ktrsyscall(p->p_tracep, code, callp->sy_narg, args);
1045#endif
1046		goto bad;
1047	}
1048#ifdef KTRACE
1049	if (KTRPOINT(p, KTR_SYSCALL))
1050		ktrsyscall(p->p_tracep, code, callp->sy_narg, args);
1051#endif
1052	p->p_retval[0] = 0;
1053	p->p_retval[1] = frame.tf_edx;
1054
1055	STOPEVENT(p, S_SCE, callp->sy_narg);
1056
1057	error = (*callp->sy_call)(p, args);
1058
1059	switch (error) {
1060
1061	case 0:
1062		/*
1063		 * Reinitialize proc pointer `p' as it may be different
1064		 * if this is a child returning from fork syscall.
1065		 */
1066		p = curproc;
1067		frame.tf_eax = p->p_retval[0];
1068		frame.tf_edx = p->p_retval[1];
1069		frame.tf_eflags &= ~PSL_C;
1070		break;
1071
1072	case ERESTART:
1073		/*
1074		 * Reconstruct pc, assuming lcall $X,y is 7 bytes,
1075		 * int 0x80 is 2 bytes. We saved this in tf_err.
1076		 */
1077		frame.tf_eip -= frame.tf_err;
1078		break;
1079
1080	case EJUSTRETURN:
1081		break;
1082
1083	default:
1084bad:
1085 		if (p->p_sysent->sv_errsize) {
1086 			if (error >= p->p_sysent->sv_errsize)
1087  				error = -1;	/* XXX */
1088   			else
1089  				error = p->p_sysent->sv_errtbl[error];
1090		}
1091		frame.tf_eax = error;
1092		frame.tf_eflags |= PSL_C;
1093		break;
1094	}
1095
1096	if ((frame.tf_eflags & PSL_T) && !(frame.tf_eflags & PSL_VM)) {
1097		/* Traced syscall. */
1098		frame.tf_eflags &= ~PSL_T;
1099		trapsignal(p, SIGTRAP, 0);
1100	}
1101
1102	userret(p, &frame, sticks);
1103
1104#ifdef KTRACE
1105	if (KTRPOINT(p, KTR_SYSRET))
1106		ktrsysret(p->p_tracep, code, error, p->p_retval[0]);
1107#endif
1108
1109	/*
1110	 * This works because errno is findable through the
1111	 * register set.  If we ever support an emulation where this
1112	 * is not the case, this code will need to be revisited.
1113	 */
1114	STOPEVENT(p, S_SCX, code);
1115
1116}
1117
1118/*
1119 * Simplified back end of syscall(), used when returning from fork()
1120 * directly into user mode.
1121 */
1122void
1123fork_return(p, frame)
1124	struct proc *p;
1125	struct trapframe frame;
1126{
1127	frame.tf_eax = 0;		/* Child returns zero */
1128	frame.tf_eflags &= ~PSL_C;	/* success */
1129	frame.tf_edx = 1;
1130
1131	userret(p, &frame, 0);
1132#ifdef KTRACE
1133	if (KTRPOINT(p, KTR_SYSRET))
1134		ktrsysret(p->p_tracep, SYS_fork, 0, 0);
1135#endif
1136}
1137