/*-
 * Copyright (C) 1994, David Greenman
 * Copyright (c) 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the University of Utah, and William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)trap.c	7.4 (Berkeley) 5/13/91
 * $FreeBSD: head/sys/kern/subr_trap.c 58717 2000-03-28 07:16:37Z dillon $
 */

/*
 * 386 Trap and System call handling
 */

#include "opt_cpu.h"
#include "opt_ddb.h"
#include "opt_ktrace.h"
#include "opt_clock.h"
#include "opt_trap.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/pioctl.h>
#include <sys/kernel.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/syscall.h>
#include <sys/sysent.h>
#include <sys/uio.h>
#include <sys/vmmeter.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <sys/lock.h>
#include <vm/pmap.h>
#include <vm/vm_kern.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_extern.h>

#include <machine/cpu.h>
#include <machine/ipl.h>
#include <machine/md_var.h>
#include <machine/pcb.h>
#ifdef SMP
#include <machine/smp.h>
#endif
#include <machine/tss.h>

#include <i386/isa/intr_machdep.h>

#ifdef POWERFAIL_NMI
#include <sys/syslog.h>
#include <machine/clock.h>
#endif

#include <machine/vm86.h>

#include <ddb/ddb.h>

#include "isa.h"
#include "npx.h"

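/*
 * Hook for an optional floating point emulator; left NULL unless an
 * emulator (e.g. one configured via the math emulation options) has
 * registered itself.
 */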
int (*pmath_emulate) __P((struct trapframe *));

extern void trap __P((struct trapframe frame));
extern int trapwrite __P((unsigned addr));
extern void syscall2 __P((struct trapframe frame));

static int trap_pfault __P((struct trapframe *, int, vm_offset_t));
static void trap_fatal __P((struct trapframe *, vm_offset_t));
void dblfault_handler __P((void));

extern inthand_t IDTVEC(syscall);

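/*
 * Human-readable names for each T_* trap number, indexed by trapno;
 * empty strings mark unused vectors.
 */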
#define MAX_TRAP_MSG		28
static char *trap_msg[] = {
	"",					/*  0 unused */
	"privileged instruction fault",		/*  1 T_PRIVINFLT */
	"",					/*  2 unused */
	"breakpoint instruction fault",		/*  3 T_BPTFLT */
	"",					/*  4 unused */
	"",					/*  5 unused */
	"arithmetic trap",			/*  6 T_ARITHTRAP */
	"system forced exception",		/*  7 T_ASTFLT */
	"",					/*  8 unused */
	"general protection fault",		/*  9 T_PROTFLT */
	"trace trap",				/* 10 T_TRCTRAP */
	"",					/* 11 unused */
	"page fault",				/* 12 T_PAGEFLT */
	"",					/* 13 unused */
	"alignment fault",			/* 14 T_ALIGNFLT */
	"",					/* 15 unused */
	"",					/* 16 unused */
	"",					/* 17 unused */
	"integer divide fault",			/* 18 T_DIVIDE */
	"non-maskable interrupt trap",		/* 19 T_NMI */
	"overflow trap",			/* 20 T_OFLOW */
	"FPU bounds check fault",		/* 21 T_BOUND */
	"FPU device not available",		/* 22 T_DNA */
	"double fault",				/* 23 T_DOUBLEFLT */
	"FPU operand fetch fault",		/* 24 T_FPOPFLT */
	"invalid TSS fault",			/* 25 T_TSSFLT */
	"segment not present fault",		/* 26 T_SEGNPFLT */
	"stack fault",				/* 27 T_STKFLT */
	"machine check trap",			/* 28 T_MCHK */
};

static __inline int userret __P((struct proc *p, struct trapframe *frame,
				  u_quad_t oticks, int have_mplock));

#if defined(I586_CPU) && !defined(NO_F00F_HACK)
extern int has_f00f_bug;
#endif

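/*
 * userret() finishes every return to user mode: deliver any pending
 * signals, reschedule if the kernel asked for it, and charge profiling
 * time.  The MP lock ownership flag is threaded through and returned so
 * the caller knows whether it still holds the lock on exit.
 */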
static __inline int
userret(p, frame, oticks, have_mplock)
	struct proc *p;
	struct trapframe *frame;
	u_quad_t oticks;
	int have_mplock;
{
	int sig, s;

	while ((sig = CURSIG(p)) != 0) {
		if (have_mplock == 0) {
			get_mplock();
			have_mplock = 1;
		}
		postsig(sig);
	}

	p->p_priority = p->p_usrpri;
	if (resched_wanted()) {
		/*
		 * Since we are curproc, clock will normally just change
		 * our priority without moving us from one queue to another
		 * (since the running process is not on a queue.)
		 * If that happened after we setrunqueue ourselves but before we
		 * mi_switch()'ed, we might not be on the queue indicated by
		 * our priority.
		 */
		if (have_mplock == 0) {
			get_mplock();
			have_mplock = 1;
		}
		s = splhigh();
		setrunqueue(p);
		p->p_stats->p_ru.ru_nivcsw++;
		mi_switch();
		splx(s);
		while ((sig = CURSIG(p)) != 0)
			postsig(sig);
	}
	/*
	 * Charge system time if profiling.
	 */
	if (p->p_flag & P_PROFIL) {
		if (have_mplock == 0) {
			get_mplock();
			have_mplock = 1;
		}
		addupc_task(p, frame->tf_eip,
			    (u_int)(p->p_sticks - oticks) * psratio);
	}
	curpriority = p->p_priority;
	return(have_mplock);
}

/*
 * Exception, fault, and trap interface to the FreeBSD kernel.
 * This common code is called from assembly language IDT gate entry
 * routines that prepare a suitable stack frame, and restore this
 * frame after the exception has been processed.
 */

void
trap(frame)
	struct trapframe frame;
{
	struct proc *p = curproc;
	u_quad_t sticks = 0;
	int i = 0, ucode = 0, type, code;
	vm_offset_t eva;

	if (!(frame.tf_eflags & PSL_I)) {
		/*
		 * Buggy application or kernel code has disabled interrupts
		 * and then trapped.  Enabling interrupts now is wrong, but
		 * it is better than running with interrupts disabled until
		 * they are accidentally enabled later.
		 */
		type = frame.tf_trapno;
		if (ISPL(frame.tf_cs) == SEL_UPL || (frame.tf_eflags & PSL_VM))
			printf(
			    "pid %ld (%s): trap %d with interrupts disabled\n",
			    (long)curproc->p_pid, curproc->p_comm, type);
		else if (type != T_BPTFLT && type != T_TRCTRAP)
			/*
			 * XXX not quite right, since this may be for a
			 * multiple fault in user mode.
			 */
			printf("kernel trap %d with interrupts disabled\n",
			    type);
		enable_intr();
	}

	eva = 0;
	if (frame.tf_trapno == T_PAGEFLT) {
		/*
		 * For some Cyrix CPUs, %cr2 is clobbered by interrupts.
		 * This problem is worked around by using an interrupt
		 * gate for the pagefault handler.  We are finally ready
		 * to read %cr2 and then must reenable interrupts.
		 *
		 * XXX this should be in the switch statement, but the
		 * NO_F00F_HACK and VM86 goto and ifdefs obfuscate the
		 * flow of control too much for this to be obviously
		 * correct.
		 */
		eva = rcr2();
		enable_intr();
	}

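/*
 * trap_pfault() may rewrite the trap as T_PRIVINFLT (the Pentium F00F
 * workaround below) and return -2, in which case dispatch restarts here
 * with the updated trapno.
 */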
#if defined(I586_CPU) && !defined(NO_F00F_HACK)
restart:
#endif
	type = frame.tf_trapno;
	code = frame.tf_err;

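	/*
	 * in_vm86call is set while the kernel itself is running in vm86
	 * mode via the vm86 call interface (see i386/vm86.c); faults taken
	 * there are emulated or reflected back to the vm86 context rather
	 * than delivered to a user process.
	 */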
	if (in_vm86call) {
		if (frame.tf_eflags & PSL_VM &&
		    (type == T_PROTFLT || type == T_STKFLT)) {
			i = vm86_emulate((struct vm86frame *)&frame);
			if (i != 0)
				/*
				 * returns to original process
				 */
				vm86_trap((struct vm86frame *)&frame);
			return;
		}
		switch (type) {
			/*
			 * these traps want either a process context, or
			 * assume a normal userspace trap.
			 */
		case T_PROTFLT:
		case T_SEGNPFLT:
			trap_fatal(&frame, eva);
			return;
		case T_TRCTRAP:
			type = T_BPTFLT;	/* kernel breakpoint */
			/* FALL THROUGH */
		}
		goto kernel_trap;	/* normal kernel trap handling */
	}

	if ((ISPL(frame.tf_cs) == SEL_UPL) || (frame.tf_eflags & PSL_VM)) {
		/* user trap */

		sticks = p->p_sticks;
		p->p_md.md_regs = &frame;

		switch (type) {
		case T_PRIVINFLT:	/* privileged instruction fault */
			ucode = type;
			i = SIGILL;
			break;

		case T_BPTFLT:		/* bpt instruction fault */
		case T_TRCTRAP:		/* trace trap */
			frame.tf_eflags &= ~PSL_T;
			i = SIGTRAP;
			break;

		case T_ARITHTRAP:	/* arithmetic trap */
			ucode = code;
			i = SIGFPE;
			break;

		case T_ASTFLT:		/* Allow process switch */
			astoff();
			cnt.v_soft++;
			if (p->p_flag & P_OWEUPC) {
				p->p_flag &= ~P_OWEUPC;
				addupc_task(p, p->p_stats->p_prof.pr_addr,
					    p->p_stats->p_prof.pr_ticks);
			}
			goto out;

			/*
			 * The following two traps can happen in
			 * vm86 mode, and, if so, we want to handle
			 * them specially.
			 */
		case T_PROTFLT:		/* general protection fault */
		case T_STKFLT:		/* stack fault */
			if (frame.tf_eflags & PSL_VM) {
				i = vm86_emulate((struct vm86frame *)&frame);
				if (i == 0)
					goto out;
				break;
			}
			/* FALL THROUGH */

		case T_SEGNPFLT:	/* segment not present fault */
		case T_TSSFLT:		/* invalid TSS fault */
		case T_DOUBLEFLT:	/* double fault */
		default:
			ucode = code + BUS_SEGM_FAULT;
			i = SIGBUS;
			break;

		case T_PAGEFLT:		/* page fault */
			i = trap_pfault(&frame, TRUE, eva);
			if (i == -1)
				return;
#if defined(I586_CPU) && !defined(NO_F00F_HACK)
			if (i == -2)
				goto restart;
#endif
			if (i == 0)
				goto out;

			ucode = T_PAGEFLT;
			break;

		case T_DIVIDE:		/* integer divide fault */
			ucode = FPE_INTDIV;
			i = SIGFPE;
			break;

#if NISA > 0
		case T_NMI:
#ifdef POWERFAIL_NMI
			goto handle_powerfail;
#else /* !POWERFAIL_NMI */
#ifdef DDB
			/* NMI can be hooked up to a pushbutton for debugging */
			printf ("NMI ... going to debugger\n");
			if (kdb_trap (type, 0, &frame))
				return;
#endif /* DDB */
			/* machine/parity/power fail/"kitchen sink" faults */
			if (isa_nmi(code) == 0) return;
			panic("NMI indicates hardware failure");
#endif /* POWERFAIL_NMI */
#endif /* NISA > 0 */

		case T_OFLOW:		/* integer overflow fault */
			ucode = FPE_INTOVF;
			i = SIGFPE;
			break;

		case T_BOUND:		/* bounds check fault */
			ucode = FPE_FLTSUB;
			i = SIGFPE;
			break;

		case T_DNA:
#if NNPX > 0
			/* if a transparent fault (due to context switch "late") */
			if (npxdna())
				return;
#endif
			if (!pmath_emulate) {
				i = SIGFPE;
				ucode = FPE_FPU_NP_TRAP;
				break;
			}
			i = (*pmath_emulate)(&frame);
			if (i == 0) {
				if (!(frame.tf_eflags & PSL_T))
					return;
				frame.tf_eflags &= ~PSL_T;
				i = SIGTRAP;
			}
			/* else ucode = emulator_only_knows() XXX */
			break;

		case T_FPOPFLT:		/* FPU operand fetch fault */
			ucode = T_FPOPFLT;
			i = SIGILL;
			break;
		}
	} else {
kernel_trap:
		/* kernel trap */

		switch (type) {
		case T_PAGEFLT:			/* page fault */
			(void) trap_pfault(&frame, FALSE, eva);
			return;

		case T_DNA:
#if NNPX > 0
			/*
			 * The kernel is apparently using npx for copying.
			 * XXX this should be fatal unless the kernel has
			 * registered such use.
			 */
			if (npxdna())
				return;
#endif
			break;

		case T_PROTFLT:		/* general protection fault */
		case T_SEGNPFLT:	/* segment not present fault */
			/*
			 * Invalid segment selectors and out of bounds
			 * %eip's and %esp's can be set up in user mode.
			 * This causes a fault in kernel mode when the
			 * kernel tries to return to user mode.  We want
			 * to get this fault so that we can fix the
			 * problem here and not have to check all the
			 * selectors and pointers when the user changes
			 * them.
			 */
#define	MAYBE_DORETI_FAULT(where, whereto)				\
	do {								\
		if (frame.tf_eip == (int)where) {			\
			frame.tf_eip = (int)whereto;			\
			return;						\
		}							\
	} while (0)

			if (intr_nesting_level == 0) {
				/*
				 * Invalid %fs's and %gs's can be created using
				 * procfs or PT_SETREGS or by invalidating the
				 * underlying LDT entry.  This causes a fault
				 * in kernel mode when the kernel attempts to
				 * switch contexts.  Lose the bad context
				 * (XXX) so that we can continue, and generate
				 * a signal.
				 */
				if (frame.tf_eip == (int)cpu_switch_load_gs) {
					curpcb->pcb_gs = 0;
					psignal(p, SIGBUS);
					return;
				}
				MAYBE_DORETI_FAULT(doreti_iret,
						   doreti_iret_fault);
				MAYBE_DORETI_FAULT(doreti_popl_ds,
						   doreti_popl_ds_fault);
				MAYBE_DORETI_FAULT(doreti_popl_es,
						   doreti_popl_es_fault);
				MAYBE_DORETI_FAULT(doreti_popl_fs,
						   doreti_popl_fs_fault);
				if (curpcb && curpcb->pcb_onfault) {
					frame.tf_eip = (int)curpcb->pcb_onfault;
					return;
				}
			}
			break;

		case T_TSSFLT:
			/*
			 * PSL_NT can be set in user mode and isn't cleared
			 * automatically when the kernel is entered.  This
			 * causes a TSS fault when the kernel attempts to
			 * `iret' because the TSS link is uninitialized.  We
			 * want to get this fault so that we can fix the
			 * problem here and not every time the kernel is
			 * entered.
			 */
			if (frame.tf_eflags & PSL_NT) {
				frame.tf_eflags &= ~PSL_NT;
				return;
			}
			break;

		case T_TRCTRAP:	 /* trace trap */
			if (frame.tf_eip == (int)IDTVEC(syscall)) {
				/*
				 * We've just entered system mode via the
				 * syscall lcall.  Continue single stepping
				 * silently until the syscall handler has
				 * saved the flags.
				 */
				return;
			}
			if (frame.tf_eip == (int)IDTVEC(syscall) + 1) {
				/*
				 * The syscall handler has now saved the
				 * flags.  Stop single stepping it.
				 */
				frame.tf_eflags &= ~PSL_T;
				return;
			}
			/*
			 * Ignore debug register trace traps due to
			 * accesses in the user's address space, which
			 * can happen under several conditions such as
			 * if a user sets a watchpoint on a buffer and
			 * then passes that buffer to a system call.
			 * We still want to get TRCTRAPS for addresses
			 * in kernel space because that is useful when
			 * debugging the kernel.
			 */
			if (user_dbreg_trap()) {
				/*
				 * Reset breakpoint bits because the
				 * processor doesn't.
				 */
				load_dr6(rdr6() & 0xfffffff0);
				return;
			}
			/*
			 * Fall through (TRCTRAP kernel mode, kernel address)
			 */
		case T_BPTFLT:
			/*
			 * If DDB is enabled, let it handle the debugger trap.
			 * Otherwise, debugger traps "can't happen".
			 */
#ifdef DDB
			if (kdb_trap (type, 0, &frame))
				return;
#endif
			break;

#if NISA > 0
		case T_NMI:
#ifdef POWERFAIL_NMI
#ifndef TIMER_FREQ
#  define TIMER_FREQ 1193182
#endif
	handle_powerfail:
		{
			static unsigned lastalert = 0;

			if (time_second - lastalert > 10) {
				log(LOG_WARNING, "NMI: power fail\n");
				sysbeep(TIMER_FREQ/880, hz);
				lastalert = time_second;
			}
			return;
		}
#else /* !POWERFAIL_NMI */
#ifdef DDB
			/* NMI can be hooked up to a pushbutton for debugging */
			printf ("NMI ... going to debugger\n");
			if (kdb_trap (type, 0, &frame))
				return;
#endif /* DDB */
			/* machine/parity/power fail/"kitchen sink" faults */
			if (isa_nmi(code) == 0) return;
			/* FALL THROUGH */
#endif /* POWERFAIL_NMI */
#endif /* NISA > 0 */
		}

		trap_fatal(&frame, eva);
		return;
	}

	/* Translate fault for emulators (e.g. Linux) */
	if (*p->p_sysent->sv_transtrap)
		i = (*p->p_sysent->sv_transtrap)(i, type);

	trapsignal(p, i, ucode);

#ifdef DEBUG
	if (type <= MAX_TRAP_MSG) {
		uprintf("fatal process exception: %s",
			trap_msg[type]);
		if ((type == T_PAGEFLT) || (type == T_PROTFLT))
			uprintf(", fault VA = 0x%lx", (u_long)eva);
		uprintf("\n");
	}
#endif

out:
	userret(p, &frame, sticks, 1);
}

#ifdef notyet
/*
 * This version doesn't allow a page fault to user space while
 * in the kernel. The rest of the kernel needs to be made "safe"
 * before this can be used. I think the only things remaining
 * to be made safe are the iBCS2 code and the process tracing/
 * debugging code.
 */
static int
trap_pfault(frame, usermode, eva)
	struct trapframe *frame;
	int usermode;
	vm_offset_t eva;
{
	vm_offset_t va;
	struct vmspace *vm = NULL;
	vm_map_t map = 0;
	int rv = 0;
	vm_prot_t ftype;
	struct proc *p = curproc;

	if (frame->tf_err & PGEX_W)
		ftype = VM_PROT_READ | VM_PROT_WRITE;
	else
		ftype = VM_PROT_READ;

	va = trunc_page(eva);
	if (va < VM_MIN_KERNEL_ADDRESS) {
		vm_offset_t v;
		vm_page_t mpte;

		if (p == NULL ||
		    (!usermode && va < VM_MAXUSER_ADDRESS &&
		     (intr_nesting_level != 0 || curpcb == NULL ||
		      curpcb->pcb_onfault == NULL))) {
			trap_fatal(frame, eva);
			return (-1);
		}

		/*
		 * This is a fault on non-kernel virtual memory.
		 * vm is initialized above to NULL. If curproc is NULL
		 * or curproc->p_vmspace is NULL the fault is fatal.
		 */
		vm = p->p_vmspace;
		if (vm == NULL)
			goto nogo;

		map = &vm->vm_map;

		/*
		 * Keep swapout from messing with us during this
		 *	critical time.
		 */
		++p->p_lock;

		/*
		 * Grow the stack if necessary
		 */
		/* grow_stack returns false only if va falls into
		 * a growable stack region and the stack growth
		 * fails.  It returns true if va was not within
		 * a growable stack region, or if the stack
		 * growth succeeded.
		 */
		if (!grow_stack (p, va)) {
			rv = KERN_FAILURE;
			--p->p_lock;
			goto nogo;
		}

		/* Fault in the user page: */
		rv = vm_fault(map, va, ftype,
			      (ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY
						      : VM_FAULT_NORMAL);

		--p->p_lock;
	} else {
		/*
		 * Don't allow user-mode faults in kernel address space.
		 */
		if (usermode)
			goto nogo;

		/*
		 * Since we know that kernel virtual addresses always
		 * have pte pages mapped, we just have to fault
		 * the page.
		 */
		rv = vm_fault(kernel_map, va, ftype, VM_FAULT_NORMAL);
	}

	if (rv == KERN_SUCCESS)
		return (0);
nogo:
	if (!usermode) {
		if (intr_nesting_level == 0 && curpcb && curpcb->pcb_onfault) {
			frame->tf_eip = (int)curpcb->pcb_onfault;
			return (0);
		}
		trap_fatal(frame, eva);
		return (-1);
	}

	/* kludge to pass faulting virtual address to sendsig */
	frame->tf_err = eva;

	return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV);
}
#endif

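/*
 * Page fault handler proper.  Returns 0 if the fault was resolved, -1 if
 * it was fatal (trap_fatal() has already been called), -2 to request a
 * restart of trap dispatch (Pentium F00F workaround), or the signal
 * number to post to the process.
 */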
int
trap_pfault(frame, usermode, eva)
	struct trapframe *frame;
	int usermode;
	vm_offset_t eva;
{
	vm_offset_t va;
	struct vmspace *vm = NULL;
	vm_map_t map = 0;
	int rv = 0;
	vm_prot_t ftype;
	struct proc *p = curproc;

	va = trunc_page(eva);
	if (va >= KERNBASE) {
		/*
		 * Don't allow user-mode faults in kernel address space.
		 * An exception:  if the faulting address is the invalid
		 * instruction entry in the IDT, then the Intel Pentium
		 * F00F bug workaround was triggered, and we need to
		 * treat it as an illegal instruction, and not a page
		 * fault.
		 */
#if defined(I586_CPU) && !defined(NO_F00F_HACK)
		if ((eva == (unsigned int)&idt[6]) && has_f00f_bug) {
			frame->tf_trapno = T_PRIVINFLT;
			return -2;
		}
#endif
		if (usermode)
			goto nogo;

		map = kernel_map;
	} else {
		/*
		 * This is a fault on non-kernel virtual memory.
		 * vm is initialized above to NULL. If curproc is NULL
		 * or curproc->p_vmspace is NULL the fault is fatal.
		 */
		if (p != NULL)
			vm = p->p_vmspace;

		if (vm == NULL)
			goto nogo;

		map = &vm->vm_map;
	}

	if (frame->tf_err & PGEX_W)
		ftype = VM_PROT_READ | VM_PROT_WRITE;
	else
		ftype = VM_PROT_READ;

	if (map != kernel_map) {
		/*
		 * Keep swapout from messing with us during this
		 *	critical time.
		 */
		++p->p_lock;

		/*
		 * Grow the stack if necessary
		 */
		/* grow_stack returns false only if va falls into
		 * a growable stack region and the stack growth
		 * fails.  It returns true if va was not within
		 * a growable stack region, or if the stack
		 * growth succeeded.
		 */
		if (!grow_stack (p, va)) {
			rv = KERN_FAILURE;
			--p->p_lock;
			goto nogo;
		}

		/* Fault in the user page: */
		rv = vm_fault(map, va, ftype,
			      (ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY
						      : VM_FAULT_NORMAL);

		--p->p_lock;
	} else {
		/*
		 * Don't have to worry about process locking or stacks
		 * in the kernel.
		 */
		rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL);
	}

	if (rv == KERN_SUCCESS)
		return (0);
nogo:
	if (!usermode) {
		if (intr_nesting_level == 0 && curpcb && curpcb->pcb_onfault) {
			frame->tf_eip = (int)curpcb->pcb_onfault;
			return (0);
		}
		trap_fatal(frame, eva);
		return (-1);
	}

	/* kludge to pass faulting virtual address to sendsig */
	frame->tf_err = eva;

	return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV);
}

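/*
 * Report an unrecoverable trap: dump the registers, segment state, and
 * interrupt mask, give the kernel debugger a chance to intercept, and
 * then panic.
 */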
static void
trap_fatal(frame, eva)
	struct trapframe *frame;
	vm_offset_t eva;
{
	int code, type, ss, esp;
	struct soft_segment_descriptor softseg;

	code = frame->tf_err;
	type = frame->tf_trapno;
	sdtossd(&gdt[IDXSEL(frame->tf_cs & 0xffff)].sd, &softseg);

	if (type <= MAX_TRAP_MSG)
		printf("\n\nFatal trap %d: %s while in %s mode\n",
			type, trap_msg[type],
			frame->tf_eflags & PSL_VM ? "vm86" :
			ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel");
#ifdef SMP
	/* three separate prints in case of a trap on an unmapped page */
	printf("mp_lock = %08x; ", mp_lock);
	printf("cpuid = %d; ", cpuid);
	printf("lapic.id = %08x\n", lapic.id);
#endif
	if (type == T_PAGEFLT) {
		printf("fault virtual address	= 0x%x\n", eva);
		printf("fault code		= %s %s, %s\n",
			code & PGEX_U ? "user" : "supervisor",
			code & PGEX_W ? "write" : "read",
			code & PGEX_P ? "protection violation" : "page not present");
	}
	printf("instruction pointer	= 0x%x:0x%x\n",
	       frame->tf_cs & 0xffff, frame->tf_eip);
	if ((ISPL(frame->tf_cs) == SEL_UPL) || (frame->tf_eflags & PSL_VM)) {
		ss = frame->tf_ss & 0xffff;
		esp = frame->tf_esp;
	} else {
		ss = GSEL(GDATA_SEL, SEL_KPL);
		esp = (int)&frame->tf_esp;
	}
	printf("stack pointer	        = 0x%x:0x%x\n", ss, esp);
	printf("frame pointer	        = 0x%x:0x%x\n", ss, frame->tf_ebp);
	printf("code segment		= base 0x%x, limit 0x%x, type 0x%x\n",
	       softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type);
	printf("			= DPL %d, pres %d, def32 %d, gran %d\n",
	       softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_def32,
	       softseg.ssd_gran);
	printf("processor eflags	= ");
	if (frame->tf_eflags & PSL_T)
		printf("trace trap, ");
	if (frame->tf_eflags & PSL_I)
		printf("interrupt enabled, ");
	if (frame->tf_eflags & PSL_NT)
		printf("nested task, ");
	if (frame->tf_eflags & PSL_RF)
		printf("resume, ");
	if (frame->tf_eflags & PSL_VM)
		printf("vm86, ");
	printf("IOPL = %d\n", (frame->tf_eflags & PSL_IOPL) >> 12);
	printf("current process		= ");
	if (curproc) {
		printf("%lu (%s)\n",
		    (u_long)curproc->p_pid, curproc->p_comm ?
		    curproc->p_comm : "");
	} else {
		printf("Idle\n");
	}
	printf("interrupt mask		= ");
	if ((cpl & net_imask) == net_imask)
		printf("net ");
	if ((cpl & tty_imask) == tty_imask)
		printf("tty ");
	if ((cpl & bio_imask) == bio_imask)
		printf("bio ");
	if ((cpl & cam_imask) == cam_imask)
		printf("cam ");
	if (cpl == 0)
		printf("none");
#ifdef SMP
/**
 *  XXX FIXME:
 *	we probably SHOULD have stopped the other CPUs before now!
 *	another CPU COULD have been touching cpl at this moment...
 */
	printf(" <- SMP: XXX");
#endif
	printf("\n");

#ifdef KDB
	if (kdb_trap(&psl))
		return;
#endif
#ifdef DDB
	if ((debugger_on_panic || db_active) && kdb_trap(type, 0, frame))
		return;
#endif
	printf("trap number		= %d\n", type);
	if (type <= MAX_TRAP_MSG)
		panic(trap_msg[type]);
	else
		panic("unknown/reserved trap");
}

/*
 * Double fault handler. Called when a fault occurs while writing
 * a frame for a trap/exception onto the stack. This usually occurs
 * when the stack overflows (such is the case with infinite recursion,
 * for example).
 *
 * XXX Note that the current PTD gets replaced by IdlePTD when the
 * task switch occurs. This means that the stack that was active at
 * the time of the double fault is not available at <kstack> unless
 * the machine was idle when the double fault occurred. The downside
 * of this is that "trace <ebp>" in ddb won't work.
 */
void
dblfault_handler()
{
	printf("\nFatal double fault:\n");
	printf("eip = 0x%x\n", common_tss.tss_eip);
	printf("esp = 0x%x\n", common_tss.tss_esp);
	printf("ebp = 0x%x\n", common_tss.tss_ebp);
#ifdef SMP
	/* three separate prints in case of a trap on an unmapped page */
	printf("mp_lock = %08x; ", mp_lock);
	printf("cpuid = %d; ", cpuid);
	printf("lapic.id = %08x\n", lapic.id);
#endif
	panic("double fault");
}

/*
 * Compensate for 386 brain damage (missing URKR).
 * This is a little simpler than the pagefault handler in trap() because
 * the page tables have already been faulted in and high addresses
 * are thrown out early for other reasons.
 */
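/* Returns 0 if the write was enabled, 1 if it could not be. */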
int trapwrite(addr)
	unsigned addr;
{
	struct proc *p;
	vm_offset_t va;
	struct vmspace *vm;
	int rv;

	va = trunc_page((vm_offset_t)addr);
	/*
	 * XXX - MAX is END.  Changed > to >= for temp. fix.
	 */
	if (va >= VM_MAXUSER_ADDRESS)
		return (1);

	p = curproc;
	vm = p->p_vmspace;

	++p->p_lock;

	if (!grow_stack (p, va)) {
		--p->p_lock;
		return (1);
	}

	/*
	 * fault the data page
	 */
	rv = vm_fault(&vm->vm_map, va, VM_PROT_READ|VM_PROT_WRITE,
		      VM_FAULT_DIRTY);

	--p->p_lock;

	if (rv != KERN_SUCCESS)
		return 1;

	return (0);
}

/*
 *	syscall2 -	MP aware system call request C handler
 *
 *	A system call is essentially treated as a trap except that the
 *	MP lock is not held on entry or return.  We are responsible for
 *	obtaining the MP lock if necessary and for handling ASTs
 *	(e.g. a task switch) prior to return.
 *
 *	In general, only simple access and manipulation of curproc and
 *	the current stack is allowed without having to hold MP lock.
 */
void
syscall2(frame)
	struct trapframe frame;
{
	caddr_t params;
	int i;
	struct sysent *callp;
	struct proc *p = curproc;
	u_quad_t sticks;
	int error;
	int narg;
	int args[8];
	int have_mplock = 0;
	u_int code;

#ifdef DIAGNOSTIC
	if (ISPL(frame.tf_cs) != SEL_UPL) {
		get_mplock();
		panic("syscall");
		/* NOT REACHED */
	}
#endif

	/*
	 * Handle atomicity by looping: p_sticks is a u_quad_t and cannot
	 * be read atomically on a 32 bit CPU, and interrupts are enabled
	 * while the MP lock is not held, so reread until two consecutive
	 * reads agree.
	 */
	sticks = ((volatile struct proc *)p)->p_sticks;
	while (sticks != ((volatile struct proc *)p)->p_sticks)
		sticks = ((volatile struct proc *)p)->p_sticks;

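	/*
	 * The syscall number arrives in %eax; the arguments sit on the
	 * user stack just above the return address pushed by the user
	 * mode syscall stub, which is what the sizeof(int) skips over.
	 */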
	p->p_md.md_regs = &frame;
	params = (caddr_t)frame.tf_esp + sizeof(int);
	code = frame.tf_eax;

	if (p->p_sysent->sv_prepsyscall) {
		/*
		 * The prep code is not MP aware.
		 */
		get_mplock();
		(*p->p_sysent->sv_prepsyscall)(&frame, args, &code, &params);
		rel_mplock();
	} else {
		/*
		 * Need to check if this is a 32 bit or 64 bit syscall.
		 * fuword is MP aware.
		 */
		if (code == SYS_syscall) {
			/*
			 * Code is first argument, followed by actual args.
			 */
			code = fuword(params);
			params += sizeof(int);
		} else if (code == SYS___syscall) {
			/*
			 * Like syscall, but code is a quad, so as to maintain
			 * quad alignment for the rest of the arguments.
			 */
			code = fuword(params);
			params += sizeof(quad_t);
		}
	}

	if (p->p_sysent->sv_mask)
		code &= p->p_sysent->sv_mask;

	if (code >= p->p_sysent->sv_size)
		callp = &p->p_sysent->sv_table[0];
	else
		callp = &p->p_sysent->sv_table[code];

	narg = callp->sy_narg & SYF_ARGMASK;

	/*
	 * copyin is MP aware, but the tracing code is not.
	 */
	if (params && (i = narg * sizeof(int)) &&
	    (error = copyin(params, (caddr_t)args, (u_int)i))) {
		get_mplock();
		have_mplock = 1;
#ifdef KTRACE
		if (KTRPOINT(p, KTR_SYSCALL))
			ktrsyscall(p->p_tracep, code, narg, args);
#endif
		goto bad;
	}

	/*
	 * Try to run the syscall without the MP lock if the syscall
	 * is MP safe.  We have to obtain the MP lock no matter what if
	 * we are ktracing.
	 */
	if ((callp->sy_narg & SYF_MPSAFE) == 0) {
		get_mplock();
		have_mplock = 1;
	}

#ifdef KTRACE
	if (KTRPOINT(p, KTR_SYSCALL)) {
		if (have_mplock == 0) {
			get_mplock();
			have_mplock = 1;
		}
		ktrsyscall(p->p_tracep, code, narg, args);
	}
#endif
	p->p_retval[0] = 0;
	p->p_retval[1] = frame.tf_edx;

	STOPEVENT(p, S_SCE, narg);	/* MP aware */

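	/*
	 * Dispatch the system call; results come back in p->p_retval[]
	 * and the errno-style return value in error.
	 */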
	error = (*callp->sy_call)(p, args);

	/*
	 * MP SAFE (we may or may not have the MP lock at this point)
	 */
	switch (error) {
	case 0:
		/*
		 * Reinitialize proc pointer `p' as it may be different
		 * if this is a child returning from fork syscall.
		 */
		p = curproc;
		frame.tf_eax = p->p_retval[0];
		frame.tf_edx = p->p_retval[1];
		frame.tf_eflags &= ~PSL_C;
		break;

	case ERESTART:
		/*
		 * Reconstruct pc, assuming lcall $X,y is 7 bytes,
		 * int 0x80 is 2 bytes. We saved this in tf_err.
		 */
		frame.tf_eip -= frame.tf_err;
		break;

	case EJUSTRETURN:
		break;

	default:
bad:
		if (p->p_sysent->sv_errsize) {
			if (error >= p->p_sysent->sv_errsize)
				error = -1;	/* XXX */
			else
				error = p->p_sysent->sv_errtbl[error];
		}
		frame.tf_eax = error;
		frame.tf_eflags |= PSL_C;
		break;
	}

	/*
	 * Traced syscall.  trapsignal() is not MP aware.
	 */
	if ((frame.tf_eflags & PSL_T) && !(frame.tf_eflags & PSL_VM)) {
		if (have_mplock == 0) {
			get_mplock();
			have_mplock = 1;
		}
		frame.tf_eflags &= ~PSL_T;
		trapsignal(p, SIGTRAP, 0);
	}

	/*
	 * Handle reschedule and other end-of-syscall issues.
	 */
	have_mplock = userret(p, &frame, sticks, have_mplock);

#ifdef KTRACE
	if (KTRPOINT(p, KTR_SYSRET)) {
		if (have_mplock == 0) {
			get_mplock();
			have_mplock = 1;
		}
		ktrsysret(p->p_tracep, code, error, p->p_retval[0]);
	}
#endif

	/*
	 * This works because errno is findable through the
	 * register set.  If we ever support an emulation where this
	 * is not the case, this code will need to be revisited.
	 */
	STOPEVENT(p, S_SCX, code);

	/*
	 * Release the MP lock if we had to get it.
	 */
	if (have_mplock)
		rel_mplock();
}

/*
 * Simplified back end of syscall(), used when returning from fork()
 * directly into user mode.  MP lock is held on entry and should be
 * held on return.
 */
void
fork_return(p, frame)
	struct proc *p;
	struct trapframe frame;
{
	frame.tf_eax = 0;		/* Child returns zero */
	frame.tf_eflags &= ~PSL_C;	/* success */
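	/* Second return value: nonzero marks the child side of fork. */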
	frame.tf_edx = 1;

	userret(p, &frame, 0, 1);
#ifdef KTRACE
	if (KTRPOINT(p, KTR_SYSRET))
		ktrsysret(p->p_tracep, SYS_fork, 0, 0);
#endif
}
