subr_syscall.c revision 49081
1/*-
2 * Copyright (C) 1994, David Greenman
3 * Copyright (c) 1990, 1993
4 *	The Regents of the University of California.  All rights reserved.
5 *
6 * This code is derived from software contributed to Berkeley by
7 * the University of Utah, and William Jolitz.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 *    notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 *    notice, this list of conditions and the following disclaimer in the
16 *    documentation and/or other materials provided with the distribution.
17 * 3. All advertising materials mentioning features or use of this software
18 *    must display the following acknowledgement:
19 *	This product includes software developed by the University of
20 *	California, Berkeley and its contributors.
21 * 4. Neither the name of the University nor the names of its contributors
22 *    may be used to endorse or promote products derived from this software
23 *    without specific prior written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 * SUCH DAMAGE.
36 *
37 *	from: @(#)trap.c	7.4 (Berkeley) 5/13/91
38 *	$Id: trap.c,v 1.139 1999/06/18 14:32:16 bde Exp $
39 */
40
41/*
42 * 386 Trap and System call handling
43 */
44
45#include "opt_cpu.h"
46#include "opt_ddb.h"
47#include "opt_ktrace.h"
48#include "opt_clock.h"
49#include "opt_trap.h"
50
51#include <sys/param.h>
52#include <sys/systm.h>
53#include <sys/proc.h>
54#include <sys/pioctl.h>
55#include <sys/kernel.h>
56#include <sys/resourcevar.h>
57#include <sys/signalvar.h>
58#include <sys/syscall.h>
59#include <sys/sysent.h>
60#include <sys/uio.h>
61#include <sys/vmmeter.h>
62#ifdef KTRACE
63#include <sys/ktrace.h>
64#endif
65
66#include <vm/vm.h>
67#include <vm/vm_param.h>
68#include <vm/vm_prot.h>
69#include <sys/lock.h>
70#include <vm/pmap.h>
71#include <vm/vm_kern.h>
72#include <vm/vm_map.h>
73#include <vm/vm_page.h>
74#include <vm/vm_extern.h>
75
76#include <machine/cpu.h>
77#include <machine/ipl.h>
78#include <machine/md_var.h>
79#include <machine/pcb.h>
80#ifdef SMP
81#include <machine/smp.h>
82#endif
83#include <machine/tss.h>
84
85#include <i386/isa/intr_machdep.h>
86
87#ifdef POWERFAIL_NMI
88#include <sys/syslog.h>
89#include <machine/clock.h>
90#endif
91
92#include <machine/vm86.h>
93
94#ifdef DDB
95	extern int in_Debugger, debugger_on_panic;
96#endif
97
98#include "isa.h"
99#include "npx.h"
100
101int (*pmath_emulate) __P((struct trapframe *));
102
103extern void trap __P((struct trapframe frame));
104extern int trapwrite __P((unsigned addr));
105extern void syscall __P((struct trapframe frame));
106
107static int trap_pfault __P((struct trapframe *, int, vm_offset_t));
108static void trap_fatal __P((struct trapframe *, vm_offset_t));
109void dblfault_handler __P((void));
110
111extern inthand_t IDTVEC(syscall);
112
113#define MAX_TRAP_MSG		28
114static char *trap_msg[] = {
115	"",					/*  0 unused */
116	"privileged instruction fault",		/*  1 T_PRIVINFLT */
117	"",					/*  2 unused */
118	"breakpoint instruction fault",		/*  3 T_BPTFLT */
119	"",					/*  4 unused */
120	"",					/*  5 unused */
121	"arithmetic trap",			/*  6 T_ARITHTRAP */
122	"system forced exception",		/*  7 T_ASTFLT */
123	"",					/*  8 unused */
124	"general protection fault",		/*  9 T_PROTFLT */
125	"trace trap",				/* 10 T_TRCTRAP */
126	"",					/* 11 unused */
127	"page fault",				/* 12 T_PAGEFLT */
128	"",					/* 13 unused */
129	"alignment fault",			/* 14 T_ALIGNFLT */
130	"",					/* 15 unused */
131	"",					/* 16 unused */
132	"",					/* 17 unused */
133	"integer divide fault",			/* 18 T_DIVIDE */
134	"non-maskable interrupt trap",		/* 19 T_NMI */
135	"overflow trap",			/* 20 T_OFLOW */
136	"FPU bounds check fault",		/* 21 T_BOUND */
137	"FPU device not available",		/* 22 T_DNA */
138	"double fault",				/* 23 T_DOUBLEFLT */
139	"FPU operand fetch fault",		/* 24 T_FPOPFLT */
140	"invalid TSS fault",			/* 25 T_TSSFLT */
141	"segment not present fault",		/* 26 T_SEGNPFLT */
142	"stack fault",				/* 27 T_STKFLT */
143	"machine check trap",			/* 28 T_MCHK */
144};
145
146static __inline void userret __P((struct proc *p, struct trapframe *frame,
147				  u_quad_t oticks));
148
149#if defined(I586_CPU) && !defined(NO_F00F_HACK)
150extern int has_f00f_bug;
151#endif
152
/*
 * Common return-to-user-mode processing shared by trap() and syscall():
 * deliver pending signals, honor a pending reschedule request, and charge
 * profiling time spent in the kernel.  `oticks' is p->p_sticks as sampled
 * at kernel entry, so (p_sticks - oticks) is the system time consumed by
 * this trap/syscall.
 */
static __inline void
userret(p, frame, oticks)
	struct proc *p;
	struct trapframe *frame;
	u_quad_t oticks;
{
	int sig, s;

	/* Deliver any signals that became pending while in the kernel. */
	while ((sig = CURSIG(p)) != 0)
		postsig(sig);

#if 0
	if (!want_resched &&
		(p->p_priority <= p->p_usrpri) &&
		(p->p_rtprio.type == RTP_PRIO_NORMAL)) {
		 int newpriority;
		 p->p_estcpu += 1;
		 newpriority = PUSER + p->p_estcpu / 4 + 2 * p->p_nice;
		 newpriority = min(newpriority, MAXPRI);
		 p->p_usrpri = newpriority;
	}
#endif

	/* Revert to the user-mode priority before going back out. */
	p->p_priority = p->p_usrpri;
	if (want_resched) {
		/*
		 * Since we are curproc, clock will normally just change
		 * our priority without moving us from one queue to another
		 * (since the running process is not on a queue.)
		 * If that happened after we setrunqueue ourselves but before we
		 * mi_switch()'ed, we might not be on the queue indicated by
		 * our priority.
		 */
		s = splhigh();
		setrunqueue(p);
		p->p_stats->p_ru.ru_nivcsw++;
		mi_switch();
		splx(s);
		/* Signals may have arrived while we were switched out. */
		while ((sig = CURSIG(p)) != 0)
			postsig(sig);
	}
	/*
	 * Charge system time if profiling.
	 */
	if (p->p_flag & P_PROFIL)
		addupc_task(p, frame->tf_eip,
			    (u_int)(p->p_sticks - oticks) * psratio);

	curpriority = p->p_priority;
}
203
204/*
205 * Exception, fault, and trap interface to the FreeBSD kernel.
206 * This common code is called from assembly language IDT gate entry
207 * routines that prepare a suitable stack frame, and restore this
208 * frame after the exception has been processed.
209 */
210
/*
 * trap() dispatches every exception/fault delivered through the IDT gates.
 * User-mode traps are converted into a signal number `i' (with machine
 * code `ucode') posted via trapsignal(); kernel-mode traps are either
 * recovered from in place or escalated to trap_fatal().
 */
void
trap(frame)
	struct trapframe frame;
{
	struct proc *p = curproc;
	u_quad_t sticks = 0;
	int i = 0, ucode = 0, type, code;
	vm_offset_t eva;

	if (!(frame.tf_eflags & PSL_I)) {
		/*
		 * Buggy application or kernel code has disabled interrupts
		 * and then trapped.  Enabling interrupts now is wrong, but
		 * it is better than running with interrupts disabled until
		 * they are accidentally enabled later.
		 */
		type = frame.tf_trapno;
		if (ISPL(frame.tf_cs) == SEL_UPL || (frame.tf_eflags & PSL_VM))
			printf(
			    "pid %ld (%s): trap %d with interrupts disabled\n",
			    (long)curproc->p_pid, curproc->p_comm, type);
		else if (type != T_BPTFLT && type != T_TRCTRAP)
			/*
			 * XXX not quite right, since this may be for a
			 * multiple fault in user mode.
			 */
			printf("kernel trap %d with interrupts disabled\n",
			    type);
		enable_intr();
	}

	eva = 0;
	if (frame.tf_trapno == T_PAGEFLT) {
		/*
		 * For some Cyrix CPUs, %cr2 is clobbered by interrupts.
		 * This problem is worked around by using an interrupt
		 * gate for the pagefault handler.  We are finally ready
		 * to read %cr2 and then must reenable interrupts.
		 *
		 * XXX this should be in the switch statement, but the
		 * NO_FOOF_HACK and VM86 goto and ifdefs obfuscate the
		 * flow of control too much for this to be obviously
		 * correct.
		 */
		eva = rcr2();
		enable_intr();
	}

#if defined(I586_CPU) && !defined(NO_F00F_HACK)
restart:
	/* Re-dispatch after trap_pfault() rewrites tf_trapno (F00F case). */
#endif
	type = frame.tf_trapno;
	code = frame.tf_err;

	/* Faults taken while executing a vm86 call from the kernel. */
	if (in_vm86call) {
		if (frame.tf_eflags & PSL_VM &&
		    (type == T_PROTFLT || type == T_STKFLT)) {
			i = vm86_emulate((struct vm86frame *)&frame);
			if (i != 0)
				/*
				 * returns to original process
				 */
				vm86_trap((struct vm86frame *)&frame);
			return;
		}
		switch (type) {
			/*
			 * these traps want either a process context, or
			 * assume a normal userspace trap.
			 */
		case T_PROTFLT:
		case T_SEGNPFLT:
			trap_fatal(&frame, eva);
			return;
		case T_TRCTRAP:
			type = T_BPTFLT;	/* kernel breakpoint */
			/* FALL THROUGH */
		}
		goto kernel_trap;	/* normal kernel trap handling */
	}

	if ((ISPL(frame.tf_cs) == SEL_UPL) || (frame.tf_eflags & PSL_VM)) {
		/* user trap */

		sticks = p->p_sticks;
		p->p_md.md_regs = &frame;

		/* Each case sets `i' (signal) and usually `ucode'. */
		switch (type) {
		case T_PRIVINFLT:	/* privileged instruction fault */
			ucode = type;
			i = SIGILL;
			break;

		case T_BPTFLT:		/* bpt instruction fault */
		case T_TRCTRAP:		/* trace trap */
			frame.tf_eflags &= ~PSL_T;
			i = SIGTRAP;
			break;

		case T_ARITHTRAP:	/* arithmetic trap */
			ucode = code;
			i = SIGFPE;
			break;

		case T_ASTFLT:		/* Allow process switch */
			astoff();
			cnt.v_soft++;
			/* Post deferred profiling ticks, if any. */
			if (p->p_flag & P_OWEUPC) {
				p->p_flag &= ~P_OWEUPC;
				addupc_task(p, p->p_stats->p_prof.pr_addr,
					    p->p_stats->p_prof.pr_ticks);
			}
			goto out;

			/*
			 * The following two traps can happen in
			 * vm86 mode, and, if so, we want to handle
			 * them specially.
			 */
		case T_PROTFLT:		/* general protection fault */
		case T_STKFLT:		/* stack fault */
			if (frame.tf_eflags & PSL_VM) {
				i = vm86_emulate((struct vm86frame *)&frame);
				if (i == 0)
					goto out;
				break;
			}
			/* FALL THROUGH */

		case T_SEGNPFLT:	/* segment not present fault */
		case T_TSSFLT:		/* invalid TSS fault */
		case T_DOUBLEFLT:	/* double fault */
		default:
			ucode = code + BUS_SEGM_FAULT ;
			i = SIGBUS;
			break;

		case T_PAGEFLT:		/* page fault */
			/*
			 * trap_pfault() returns -1 (fatal, already handled),
			 * -2 (F00F workaround: retry dispatch), 0 (resolved),
			 * or a signal number.
			 */
			i = trap_pfault(&frame, TRUE, eva);
			if (i == -1)
				return;
#if defined(I586_CPU) && !defined(NO_F00F_HACK)
			if (i == -2)
				goto restart;
#endif
			if (i == 0)
				goto out;

			ucode = T_PAGEFLT;
			break;

		case T_DIVIDE:		/* integer divide fault */
			ucode = FPE_INTDIV;
			i = SIGFPE;
			break;

#if NISA > 0
		case T_NMI:
#ifdef POWERFAIL_NMI
			goto handle_powerfail;
#else /* !POWERFAIL_NMI */
#ifdef DDB
			/* NMI can be hooked up to a pushbutton for debugging */
			printf ("NMI ... going to debugger\n");
			if (kdb_trap (type, 0, &frame))
				return;
#endif /* DDB */
			/* machine/parity/power fail/"kitchen sink" faults */
			if (isa_nmi(code) == 0) return;
			panic("NMI indicates hardware failure");
#endif /* POWERFAIL_NMI */
#endif /* NISA > 0 */

		case T_OFLOW:		/* integer overflow fault */
			ucode = FPE_INTOVF;
			i = SIGFPE;
			break;

		case T_BOUND:		/* bounds check fault */
			ucode = FPE_FLTSUB;
			i = SIGFPE;
			break;

		case T_DNA:
#if NNPX > 0
			/* if a transparent fault (due to context switch "late") */
			if (npxdna())
				return;
#endif
			/* No FPU and no emulator loaded: signal the process. */
			if (!pmath_emulate) {
				i = SIGFPE;
				ucode = FPE_FPU_NP_TRAP;
				break;
			}
			i = (*pmath_emulate)(&frame);
			if (i == 0) {
				/* Emulation done; honor single-step if set. */
				if (!(frame.tf_eflags & PSL_T))
					return;
				frame.tf_eflags &= ~PSL_T;
				i = SIGTRAP;
			}
			/* else ucode = emulator_only_knows() XXX */
			break;

		case T_FPOPFLT:		/* FPU operand fetch fault */
			ucode = T_FPOPFLT;
			i = SIGILL;
			break;
		}
	} else {
kernel_trap:
		/* kernel trap */

		switch (type) {
		case T_PAGEFLT:			/* page fault */
			(void) trap_pfault(&frame, FALSE, eva);
			return;

		case T_DNA:
#if NNPX > 0
			/*
			 * The kernel is apparently using npx for copying.
			 * XXX this should be fatal unless the kernel has
			 * registered such use.
			 */
			if (npxdna())
				return;
#endif
			break;

		case T_PROTFLT:		/* general protection fault */
		case T_SEGNPFLT:	/* segment not present fault */
			/*
			 * Invalid segment selectors and out of bounds
			 * %eip's and %esp's can be set up in user mode.
			 * This causes a fault in kernel mode when the
			 * kernel tries to return to user mode.  We want
			 * to get this fault so that we can fix the
			 * problem here and not have to check all the
			 * selectors and pointers when the user changes
			 * them.
			 */
			/*
			 * If the fault hit one of the known doreti return
			 * instructions, redirect %eip to its recovery stub.
			 */
#define	MAYBE_DORETI_FAULT(where, whereto)				\
	do {								\
		if (frame.tf_eip == (int)where) {			\
			frame.tf_eip = (int)whereto;			\
			return;						\
		}							\
	} while (0)

			if (intr_nesting_level == 0) {
				/*
				 * Invalid %fs's and %gs's can be created using
				 * procfs or PT_SETREGS or by invalidating the
				 * underlying LDT entry.  This causes a fault
				 * in kernel mode when the kernel attempts to
				 * switch contexts.  Lose the bad context
				 * (XXX) so that we can continue, and generate
				 * a signal.
				 */
				if (frame.tf_eip == (int)cpu_switch_load_gs) {
					curpcb->pcb_gs = 0;
					psignal(p, SIGBUS);
					return;
				}
				MAYBE_DORETI_FAULT(doreti_iret,
						   doreti_iret_fault);
				MAYBE_DORETI_FAULT(doreti_popl_ds,
						   doreti_popl_ds_fault);
				MAYBE_DORETI_FAULT(doreti_popl_es,
						   doreti_popl_es_fault);
				MAYBE_DORETI_FAULT(doreti_popl_fs,
						   doreti_popl_fs_fault);
				/* Last resort: copyin/copyout fault hook. */
				if (curpcb && curpcb->pcb_onfault) {
					frame.tf_eip = (int)curpcb->pcb_onfault;
					return;
				}
			}
			break;

		case T_TSSFLT:
			/*
			 * PSL_NT can be set in user mode and isn't cleared
			 * automatically when the kernel is entered.  This
			 * causes a TSS fault when the kernel attempts to
			 * `iret' because the TSS link is uninitialized.  We
			 * want to get this fault so that we can fix the
			 * problem here and not every time the kernel is
			 * entered.
			 */
			if (frame.tf_eflags & PSL_NT) {
				frame.tf_eflags &= ~PSL_NT;
				return;
			}
			break;

		case T_TRCTRAP:	 /* trace trap */
			if (frame.tf_eip == (int)IDTVEC(syscall)) {
				/*
				 * We've just entered system mode via the
				 * syscall lcall.  Continue single stepping
				 * silently until the syscall handler has
				 * saved the flags.
				 */
				return;
			}
			if (frame.tf_eip == (int)IDTVEC(syscall) + 1) {
				/*
				 * The syscall handler has now saved the
				 * flags.  Stop single stepping it.
				 */
				frame.tf_eflags &= ~PSL_T;
				return;
			}
			/*
			 * Fall through.
			 */
		case T_BPTFLT:
			/*
			 * If DDB is enabled, let it handle the debugger trap.
			 * Otherwise, debugger traps "can't happen".
			 */
#ifdef DDB
			if (kdb_trap (type, 0, &frame))
				return;
#endif
			break;

#if NISA > 0
		case T_NMI:
#ifdef POWERFAIL_NMI
#ifndef TIMER_FREQ
#  define TIMER_FREQ 1193182
#endif
	handle_powerfail:
		{
		  /* Rate-limit the power-fail alert to once per 10 seconds. */
		  static unsigned lastalert = 0;

		  if(time_second - lastalert > 10)
		    {
		      log(LOG_WARNING, "NMI: power fail\n");
		      sysbeep(TIMER_FREQ/880, hz);
		      lastalert = time_second;
		    }
		  return;
		}
#else /* !POWERFAIL_NMI */
#ifdef DDB
			/* NMI can be hooked up to a pushbutton for debugging */
			printf ("NMI ... going to debugger\n");
			if (kdb_trap (type, 0, &frame))
				return;
#endif /* DDB */
			/* machine/parity/power fail/"kitchen sink" faults */
			if (isa_nmi(code) == 0) return;
			/* FALL THROUGH */
#endif /* POWERFAIL_NMI */
#endif /* NISA > 0 */
		}

		/* Unrecovered kernel trap: print state and panic. */
		trap_fatal(&frame, eva);
		return;
	}

	/* Translate fault for emulators (e.g. Linux) */
	if (*p->p_sysent->sv_transtrap)
		i = (*p->p_sysent->sv_transtrap)(i, type);

	trapsignal(p, i, ucode);

#ifdef DEBUG
	if (type <= MAX_TRAP_MSG) {
		uprintf("fatal process exception: %s",
			trap_msg[type]);
		if ((type == T_PAGEFLT) || (type == T_PROTFLT))
			uprintf(", fault VA = 0x%lx", (u_long)eva);
		uprintf("\n");
	}
#endif

out:
	userret(p, &frame, sticks);
}
594
595#ifdef notyet
596/*
597 * This version doesn't allow a page fault to user space while
598 * in the kernel. The rest of the kernel needs to be made "safe"
599 * before this can be used. I think the only things remaining
600 * to be made safe are the iBCS2 code and the process tracing/
601 * debugging code.
602 */
/*
 * Disabled (#ifdef notyet) variant of trap_pfault() that refuses page
 * faults into user space while in the kernel; see the comment above.
 * Returns 0 on success, -1 if trap_fatal() was called, or a signal
 * number (SIGBUS/SIGSEGV) for the caller to deliver.
 */
static int
trap_pfault(frame, usermode, eva)
	struct trapframe *frame;
	int usermode;
	vm_offset_t eva;
{
	vm_offset_t va;
	struct vmspace *vm = NULL;
	vm_map_t map = 0;
	int rv = 0;
	vm_prot_t ftype;
	struct proc *p = curproc;

	/* PGEX_W in the error code means the access was a write. */
	if (frame->tf_err & PGEX_W)
		ftype = VM_PROT_READ | VM_PROT_WRITE;
	else
		ftype = VM_PROT_READ;

	va = trunc_page(eva);
	if (va < VM_MIN_KERNEL_ADDRESS) {
		vm_offset_t v;
		vm_page_t mpte;

		/*
		 * Kernel-mode faults on user addresses are only allowed
		 * when an onfault recovery handler is set and we are not
		 * nested inside an interrupt.
		 */
		if (p == NULL ||
		    (!usermode && va < VM_MAXUSER_ADDRESS &&
		     (intr_nesting_level != 0 || curpcb == NULL ||
		      curpcb->pcb_onfault == NULL))) {
			trap_fatal(frame, eva);
			return (-1);
		}

		/*
		 * This is a fault on non-kernel virtual memory.
		 * vm is initialized above to NULL. If curproc is NULL
		 * or curproc->p_vmspace is NULL the fault is fatal.
		 */
		vm = p->p_vmspace;
		if (vm == NULL)
			goto nogo;

		map = &vm->vm_map;

		/*
		 * Keep swapout from messing with us during this
		 *	critical time.
		 */
		++p->p_lock;

		/*
		 * Grow the stack if necessary
		 */
		/* grow_stack returns false only if va falls into
		 * a growable stack region and the stack growth
		 * fails.  It returns true if va was not within
		 * a growable stack region, or if the stack
		 * growth succeeded.
		 */
		if (!grow_stack (p, va)) {
			rv = KERN_FAILURE;
			--p->p_lock;
			goto nogo;
		}

		/* Fault in the user page: */
		rv = vm_fault(map, va, ftype,
			(ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY : 0);

		--p->p_lock;
	} else {
		/*
		 * Don't allow user-mode faults in kernel address space.
		 */
		if (usermode)
			goto nogo;

		/*
		 * Since we know that kernel virtual address addresses
		 * always have pte pages mapped, we just have to fault
		 * the page.
		 */
		rv = vm_fault(kernel_map, va, ftype, FALSE);
	}

	if (rv == KERN_SUCCESS)
		return (0);
nogo:
	if (!usermode) {
		/* Kernel fault: try the onfault recovery hook first. */
		if (intr_nesting_level == 0 && curpcb && curpcb->pcb_onfault) {
			frame->tf_eip = (int)curpcb->pcb_onfault;
			return (0);
		}
		trap_fatal(frame, eva);
		return (-1);
	}

	/* kludge to pass faulting virtual address to sendsig */
	frame->tf_err = eva;

	return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV);
}
703#endif
704
/*
 * Resolve a page fault at `eva' (from %cr2) on behalf of trap().
 * Returns 0 if the fault was satisfied, -1 if trap_fatal() was called,
 * -2 if the Pentium F00F workaround rewrote the trap as T_PRIVINFLT
 * (caller must re-dispatch), or a signal number (SIGBUS/SIGSEGV) for
 * the caller to deliver.  `usermode' is TRUE for user-mode faults.
 */
int
trap_pfault(frame, usermode, eva)
	struct trapframe *frame;
	int usermode;
	vm_offset_t eva;
{
	vm_offset_t va;
	struct vmspace *vm = NULL;
	vm_map_t map = 0;
	int rv = 0;
	vm_prot_t ftype;
	struct proc *p = curproc;

	va = trunc_page(eva);
	if (va >= KERNBASE) {
		/*
		 * Don't allow user-mode faults in kernel address space.
		 * An exception:  if the faulting address is the invalid
		 * instruction entry in the IDT, then the Intel Pentium
		 * F00F bug workaround was triggered, and we need to
		 * treat it is as an illegal instruction, and not a page
		 * fault.
		 */
#if defined(I586_CPU) && !defined(NO_F00F_HACK)
		if ((eva == (unsigned int)&idt[6]) && has_f00f_bug) {
			frame->tf_trapno = T_PRIVINFLT;
			return -2;
		}
#endif
		if (usermode)
			goto nogo;

		map = kernel_map;
	} else {
		/*
		 * This is a fault on non-kernel virtual memory.
		 * vm is initialized above to NULL. If curproc is NULL
		 * or curproc->p_vmspace is NULL the fault is fatal.
		 */
		if (p != NULL)
			vm = p->p_vmspace;

		if (vm == NULL)
			goto nogo;

		map = &vm->vm_map;
	}

	/* PGEX_W in the error code means the access was a write. */
	if (frame->tf_err & PGEX_W)
		ftype = VM_PROT_READ | VM_PROT_WRITE;
	else
		ftype = VM_PROT_READ;

	if (map != kernel_map) {
		/*
		 * Keep swapout from messing with us during this
		 *	critical time.
		 */
		++p->p_lock;

		/*
		 * Grow the stack if necessary
		 */
		/* grow_stack returns false only if va falls into
		 * a growable stack region and the stack growth
		 * fails.  It returns true if va was not within
		 * a growable stack region, or if the stack
		 * growth succeeded.
		 */
		if (!grow_stack (p, va)) {
			rv = KERN_FAILURE;
			--p->p_lock;
			goto nogo;
		}

		/* Fault in the user page: */
		rv = vm_fault(map, va, ftype,
			(ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY : 0);

		--p->p_lock;
	} else {
		/*
		 * Don't have to worry about process locking or stacks in the kernel.
		 */
		rv = vm_fault(map, va, ftype, FALSE);
	}

	if (rv == KERN_SUCCESS)
		return (0);
nogo:
	if (!usermode) {
		/* Kernel fault: try the onfault recovery hook first. */
		if (intr_nesting_level == 0 && curpcb && curpcb->pcb_onfault) {
			frame->tf_eip = (int)curpcb->pcb_onfault;
			return (0);
		}
		trap_fatal(frame, eva);
		return (-1);
	}

	/* kludge to pass faulting virtual address to sendsig */
	frame->tf_err = eva;

	return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV);
}
809
/*
 * Print full machine state for an unrecoverable trap, give the kernel
 * debugger a chance to take over, and panic.  Does not return (unless
 * the debugger resumes execution).
 */
static void
trap_fatal(frame, eva)
	struct trapframe *frame;
	vm_offset_t eva;
{
	int code, type, ss, esp;
	struct soft_segment_descriptor softseg;

	code = frame->tf_err;
	type = frame->tf_trapno;
	/* Decode the faulting code-segment descriptor for display below. */
	sdtossd(&gdt[IDXSEL(frame->tf_cs & 0xffff)].sd, &softseg);

	if (type <= MAX_TRAP_MSG)
		printf("\n\nFatal trap %d: %s while in %s mode\n",
			type, trap_msg[type],
        		frame->tf_eflags & PSL_VM ? "vm86" :
			ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel");
#ifdef SMP
	/* three separate prints in case of a trap on an unmapped page */
	printf("mp_lock = %08x; ", mp_lock);
	printf("cpuid = %d; ", cpuid);
	printf("lapic.id = %08x\n", lapic.id);
#endif
	if (type == T_PAGEFLT) {
		printf("fault virtual address	= 0x%x\n", eva);
		printf("fault code		= %s %s, %s\n",
			code & PGEX_U ? "user" : "supervisor",
			code & PGEX_W ? "write" : "read",
			code & PGEX_P ? "protection violation" : "page not present");
	}
	printf("instruction pointer	= 0x%x:0x%x\n",
	       frame->tf_cs & 0xffff, frame->tf_eip);
	/*
	 * A trap from user mode pushed ss:esp; for a kernel-mode trap the
	 * stack pointer is the frame itself.
	 */
        if ((ISPL(frame->tf_cs) == SEL_UPL) || (frame->tf_eflags & PSL_VM)) {
		ss = frame->tf_ss & 0xffff;
		esp = frame->tf_esp;
	} else {
		ss = GSEL(GDATA_SEL, SEL_KPL);
		esp = (int)&frame->tf_esp;
	}
	printf("stack pointer	        = 0x%x:0x%x\n", ss, esp);
	printf("frame pointer	        = 0x%x:0x%x\n", ss, frame->tf_ebp);
	printf("code segment		= base 0x%x, limit 0x%x, type 0x%x\n",
	       softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type);
	printf("			= DPL %d, pres %d, def32 %d, gran %d\n",
	       softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_def32,
	       softseg.ssd_gran);
	printf("processor eflags	= ");
	if (frame->tf_eflags & PSL_T)
		printf("trace trap, ");
	if (frame->tf_eflags & PSL_I)
		printf("interrupt enabled, ");
	if (frame->tf_eflags & PSL_NT)
		printf("nested task, ");
	if (frame->tf_eflags & PSL_RF)
		printf("resume, ");
	if (frame->tf_eflags & PSL_VM)
		printf("vm86, ");
	printf("IOPL = %d\n", (frame->tf_eflags & PSL_IOPL) >> 12);
	printf("current process		= ");
	if (curproc) {
		printf("%lu (%s)\n",
		    (u_long)curproc->p_pid, curproc->p_comm ?
		    curproc->p_comm : "");
	} else {
		printf("Idle\n");
	}
	printf("interrupt mask		= ");
	if ((cpl & net_imask) == net_imask)
		printf("net ");
	if ((cpl & tty_imask) == tty_imask)
		printf("tty ");
	if ((cpl & bio_imask) == bio_imask)
		printf("bio ");
	if ((cpl & cam_imask) == cam_imask)
		printf("cam ");
	if (cpl == 0)
		printf("none");
#ifdef SMP
/**
 *  XXX FIXME:
 *	we probably SHOULD have stopped the other CPUs before now!
 *	another CPU COULD have been touching cpl at this moment...
 */
	printf(" <- SMP: XXX");
#endif
	printf("\n");

#ifdef KDB
	/* NOTE(review): `psl' is not declared anywhere in this file; this
	 * branch looks like it cannot compile if KDB is defined — verify. */
	if (kdb_trap(&psl))
		return;
#endif
#ifdef DDB
	if ((debugger_on_panic || in_Debugger) && kdb_trap(type, 0, frame))
		return;
#endif
	printf("trap number		= %d\n", type);
	if (type <= MAX_TRAP_MSG)
		panic(trap_msg[type]);
	else
		panic("unknown/reserved trap");
}
911
912/*
913 * Double fault handler. Called when a fault occurs while writing
914 * a frame for a trap/exception onto the stack. This usually occurs
915 * when the stack overflows (such is the case with infinite recursion,
916 * for example).
917 *
918 * XXX Note that the current PTD gets replaced by IdlePTD when the
919 * task switch occurs. This means that the stack that was active at
920 * the time of the double fault is not available at <kstack> unless
921 * the machine was idle when the double fault occurred. The downside
922 * of this is that "trace <ebp>" in ddb won't work.
923 */
void
dblfault_handler()
{
	/* Registers come from the double-fault TSS, not a trapframe. */
	printf("\nFatal double fault:\n");
	printf("eip = 0x%x\n", common_tss.tss_eip);
	printf("esp = 0x%x\n", common_tss.tss_esp);
	printf("ebp = 0x%x\n", common_tss.tss_ebp);
#ifdef SMP
	/* three separate prints in case of a trap on an unmapped page */
	printf("mp_lock = %08x; ", mp_lock);
	printf("cpuid = %d; ", cpuid);
	printf("lapic.id = %08x\n", lapic.id);
#endif
	panic("double fault");
}
939
940/*
941 * Compensate for 386 brain damage (missing URKR).
942 * This is a little simpler than the pagefault handler in trap() because
943 * it the page tables have already been faulted in and high addresses
944 * are thrown out early for other reasons.
945 */
946int trapwrite(addr)
947	unsigned addr;
948{
949	struct proc *p;
950	vm_offset_t va;
951	struct vmspace *vm;
952	int rv;
953
954	va = trunc_page((vm_offset_t)addr);
955	/*
956	 * XXX - MAX is END.  Changed > to >= for temp. fix.
957	 */
958	if (va >= VM_MAXUSER_ADDRESS)
959		return (1);
960
961	p = curproc;
962	vm = p->p_vmspace;
963
964	++p->p_lock;
965
966	if (!grow_stack (p, va)) {
967		--p->p_lock;
968		return (1);
969	}
970
971	/*
972	 * fault the data page
973	 */
974	rv = vm_fault(&vm->vm_map, va, VM_PROT_READ|VM_PROT_WRITE, VM_FAULT_DIRTY);
975
976	--p->p_lock;
977
978	if (rv != KERN_SUCCESS)
979		return 1;
980
981	return (0);
982}
983
984/*
985 * System call request from POSIX system call gate interface to kernel.
986 * Like trap(), argument is call by reference.
987 */
/*
 * Decode and execute one system call from user mode: fetch the syscall
 * number from %eax (or from the stack for SYS_syscall/SYS___syscall),
 * copy in the arguments, invoke the handler from the process's sysent
 * table, and encode the result back into the trapframe (eax/edx plus
 * the carry flag as the error indicator).
 */
void
syscall(frame)
	struct trapframe frame;
{
	caddr_t params;
	int i;
	struct sysent *callp;
	struct proc *p = curproc;
	u_quad_t sticks;
	int error;
	int args[8];
	u_int code;

#ifdef DIAGNOSTIC
	if (ISPL(frame.tf_cs) != SEL_UPL)
		panic("syscall");
#endif
	sticks = p->p_sticks;
	p->p_md.md_regs = &frame;
	/* Arguments start just above the return address on the user stack. */
	params = (caddr_t)frame.tf_esp + sizeof(int);
	code = frame.tf_eax;
	if (p->p_sysent->sv_prepsyscall) {
		/* Emulation hook may rewrite code/args/params entirely. */
		(*p->p_sysent->sv_prepsyscall)(&frame, args, &code, &params);
	} else {
		/*
		 * Need to check if this is a 32 bit or 64 bit syscall.
		 */
		if (code == SYS_syscall) {
			/*
			 * Code is first argument, followed by actual args.
			 */
			code = fuword(params);
			params += sizeof(int);
		} else if (code == SYS___syscall) {
			/*
			 * Like syscall, but code is a quad, so as to maintain
			 * quad alignment for the rest of the arguments.
			 */
			code = fuword(params);
			params += sizeof(quad_t);
		}
	}

 	if (p->p_sysent->sv_mask)
 		code &= p->p_sysent->sv_mask;

	/* Out-of-range numbers dispatch to entry 0 (nosys). */
 	if (code >= p->p_sysent->sv_size)
 		callp = &p->p_sysent->sv_table[0];
  	else
 		callp = &p->p_sysent->sv_table[code];

	if (params && (i = callp->sy_narg * sizeof(int)) &&
	    (error = copyin(params, (caddr_t)args, (u_int)i))) {
#ifdef KTRACE
		if (KTRPOINT(p, KTR_SYSCALL))
			ktrsyscall(p->p_tracep, code, callp->sy_narg, args);
#endif
		goto bad;
	}
#ifdef KTRACE
	if (KTRPOINT(p, KTR_SYSCALL))
		ktrsyscall(p->p_tracep, code, callp->sy_narg, args);
#endif
	p->p_retval[0] = 0;
	p->p_retval[1] = frame.tf_edx;

	STOPEVENT(p, S_SCE, callp->sy_narg);

	error = (*callp->sy_call)(p, args);

	switch (error) {

	case 0:
		/*
		 * Reinitialize proc pointer `p' as it may be different
		 * if this is a child returning from fork syscall.
		 */
		p = curproc;
		frame.tf_eax = p->p_retval[0];
		frame.tf_edx = p->p_retval[1];
		frame.tf_eflags &= ~PSL_C;
		break;

	case ERESTART:
		/*
		 * Reconstruct pc, assuming lcall $X,y is 7 bytes,
		 * int 0x80 is 2 bytes. We saved this in tf_err.
		 */
		frame.tf_eip -= frame.tf_err;
		break;

	case EJUSTRETURN:
		/* Handler already set up the return registers itself. */
		break;

	default:
bad:
		/* Map the native errno through the emulation's table. */
 		if (p->p_sysent->sv_errsize) {
 			if (error >= p->p_sysent->sv_errsize)
  				error = -1;	/* XXX */
   			else
  				error = p->p_sysent->sv_errtbl[error];
		}
		frame.tf_eax = error;
		frame.tf_eflags |= PSL_C;
		break;
	}

	if ((frame.tf_eflags & PSL_T) && !(frame.tf_eflags & PSL_VM)) {
		/* Traced syscall. */
		frame.tf_eflags &= ~PSL_T;
		trapsignal(p, SIGTRAP, 0);
	}

	userret(p, &frame, sticks);

#ifdef KTRACE
	if (KTRPOINT(p, KTR_SYSRET))
		ktrsysret(p->p_tracep, code, error, p->p_retval[0]);
#endif

	/*
	 * This works because errno is findable through the
	 * register set.  If we ever support an emulation where this
	 * is not the case, this code will need to be revisited.
	 */
	STOPEVENT(p, S_SCX, code);

}
1116
1117/*
1118 * Simplified back end of syscall(), used when returning from fork()
1119 * directly into user mode.
1120 */
1121void
1122fork_return(p, frame)
1123	struct proc *p;
1124	struct trapframe frame;
1125{
1126	frame.tf_eax = 0;		/* Child returns zero */
1127	frame.tf_eflags &= ~PSL_C;	/* success */
1128	frame.tf_edx = 1;
1129
1130	userret(p, &frame, 0);
1131#ifdef KTRACE
1132	if (KTRPOINT(p, KTR_SYSRET))
1133		ktrsysret(p->p_tracep, SYS_fork, 0, 0);
1134#endif
1135}
1136