subr_syscall.c revision 71527
14Srgrimes/*- 21690Sdg * Copyright (C) 1994, David Greenman 31690Sdg * Copyright (c) 1990, 1993 41690Sdg * The Regents of the University of California. All rights reserved. 54Srgrimes * 64Srgrimes * This code is derived from software contributed to Berkeley by 74Srgrimes * the University of Utah, and William Jolitz. 84Srgrimes * 94Srgrimes * Redistribution and use in source and binary forms, with or without 104Srgrimes * modification, are permitted provided that the following conditions 114Srgrimes * are met: 124Srgrimes * 1. Redistributions of source code must retain the above copyright 134Srgrimes * notice, this list of conditions and the following disclaimer. 144Srgrimes * 2. Redistributions in binary form must reproduce the above copyright 154Srgrimes * notice, this list of conditions and the following disclaimer in the 164Srgrimes * documentation and/or other materials provided with the distribution. 174Srgrimes * 3. All advertising materials mentioning features or use of this software 184Srgrimes * must display the following acknowledgement: 194Srgrimes * This product includes software developed by the University of 204Srgrimes * California, Berkeley and its contributors. 214Srgrimes * 4. Neither the name of the University nor the names of its contributors 224Srgrimes * may be used to endorse or promote products derived from this software 234Srgrimes * without specific prior written permission. 244Srgrimes * 254Srgrimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 264Srgrimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 274Srgrimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 284Srgrimes * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 294Srgrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 304Srgrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 314Srgrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 324Srgrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 334Srgrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 344Srgrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 354Srgrimes * SUCH DAMAGE. 364Srgrimes * 37608Srgrimes * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 3850477Speter * $FreeBSD: head/sys/kern/subr_trap.c 71527 2001-01-24 09:53:49Z jhb $ 394Srgrimes */ 404Srgrimes 414Srgrimes/* 421704Sdg * 386 Trap and System call handling 434Srgrimes */ 444Srgrimes 4571257Speter#include "opt_clock.h" 4631544Sjmg#include "opt_cpu.h" 4731544Sjmg#include "opt_ddb.h" 4813203Swollman#include "opt_ktrace.h" 4971257Speter#include "opt_npx.h" 5032925Seivind#include "opt_trap.h" 5113203Swollman 521549Srgrimes#include <sys/param.h> 5365557Sjasone#include <sys/bus.h> 541549Srgrimes#include <sys/systm.h> 551549Srgrimes#include <sys/proc.h> 5631564Ssef#include <sys/pioctl.h> 5767365Sjhb#include <sys/ipl.h> 581549Srgrimes#include <sys/kernel.h> 5965557Sjasone#include <sys/ktr.h> 6067365Sjhb#include <sys/mutex.h> 6131389Sbde#include <sys/resourcevar.h> 6231389Sbde#include <sys/signalvar.h> 631549Srgrimes#include <sys/syscall.h> 6464294Sps#include <sys/sysctl.h> 652257Ssos#include <sys/sysent.h> 6634924Sbde#include <sys/uio.h> 6712662Sdg#include <sys/vmmeter.h> 684Srgrimes#ifdef KTRACE 691549Srgrimes#include <sys/ktrace.h> 704Srgrimes#endif 714Srgrimes 7212662Sdg#include <vm/vm.h> 731549Srgrimes#include <vm/vm_param.h> 7422521Sdyson#include <sys/lock.h> 751549Srgrimes#include <vm/pmap.h> 767090Sbde#include <vm/vm_kern.h> 771549Srgrimes#include <vm/vm_map.h> 781549Srgrimes#include <vm/vm_page.h> 7912662Sdg#include <vm/vm_extern.h> 804Srgrimes 811549Srgrimes#include <machine/cpu.h> 827090Sbde#include <machine/md_var.h> 8331389Sbde#include <machine/pcb.h> 8431389Sbde#ifdef SMP 8525164Speter#include <machine/smp.h> 8631389Sbde#endif 8730275Speter#include <machine/tss.h> 881549Srgrimes 8965557Sjasone#include <i386/isa/icu.h> 9031389Sbde#include <i386/isa/intr_machdep.h> 9131389Sbde 929545Sjoerg#ifdef POWERFAIL_NMI 9318207Sbde#include <sys/syslog.h> 9418207Sbde#include <machine/clock.h> 959545Sjoerg#endif 969545Sjoerg 9730275Speter#include <machine/vm86.h> 9830275Speter 9955823Syokota#include <ddb/ddb.h> 10042135Smsmith 1011549Srgrimes#include "isa.h" 1021549Srgrimes 10365557Sjasone#include <sys/sysctl.h> 10465557Sjasone 10512817Sphkint (*pmath_emulate) __P((struct trapframe *)); 10612817Sphk 10711343Sbdeextern void trap __P((struct trapframe frame)); 10811343Sbdeextern int trapwrite __P((unsigned addr)); 10958717Sdillonextern void syscall2 __P((struct trapframe frame)); 11065557Sjasoneextern void ast __P((struct trapframe frame)); 11111343Sbde 11241454Skatostatic int trap_pfault __P((struct trapframe *, int, vm_offset_t)); 11341454Skatostatic void trap_fatal __P((struct trapframe *, vm_offset_t)); 11412929Sdgvoid dblfault_handler __P((void)); 1154Srgrimes 11611163Sjulianextern inthand_t IDTVEC(syscall); 11711163Sjulian 11817521Sdg#define MAX_TRAP_MSG 28 11912702Sphkstatic char *trap_msg[] = { 1205603Sbde "", /* 0 unused */ 121757Sdg "privileged instruction fault", /* 1 T_PRIVINFLT */ 1225603Sbde "", /* 2 unused */ 123757Sdg "breakpoint instruction fault", /* 3 T_BPTFLT */ 124757Sdg "", /* 4 unused */ 1255603Sbde "", /* 5 unused */ 126757Sdg "arithmetic trap", /* 6 T_ARITHTRAP */ 127757Sdg "system forced exception", /* 7 T_ASTFLT */ 1285603Sbde "", /* 8 unused */ 1291690Sdg "general protection fault", /* 9 T_PROTFLT */ 130757Sdg "trace trap", /* 10 T_TRCTRAP */ 131757Sdg "", /* 11 unused */ 132757Sdg "page fault", /* 12 T_PAGEFLT */ 1335603Sbde "", /* 13 unused */ 134757Sdg "alignment fault", /* 14 T_ALIGNFLT */ 1355603Sbde "", /* 15 unused */ 1365603Sbde "", /* 16 unused */ 1375603Sbde "", /* 17 unused */ 138757Sdg "integer divide fault", /* 18 T_DIVIDE */ 139757Sdg "non-maskable interrupt trap", /* 19 T_NMI */ 140757Sdg "overflow trap", /* 20 T_OFLOW */ 141757Sdg "FPU bounds check fault", /* 21 T_BOUND */ 142757Sdg "FPU device not available", /* 22 T_DNA */ 143757Sdg "double fault", /* 23 T_DOUBLEFLT */ 144757Sdg "FPU operand fetch fault", /* 24 T_FPOPFLT */ 145757Sdg "invalid TSS fault", /* 25 T_TSSFLT */ 146757Sdg "segment not present fault", /* 26 T_SEGNPFLT */ 147757Sdg "stack fault", /* 27 T_STKFLT */ 14817521Sdg "machine check trap", /* 28 T_MCHK */ 149757Sdg}; 1504Srgrimes 15131535Sjkh#if defined(I586_CPU) && !defined(NO_F00F_HACK) 15231507Ssefextern int has_f00f_bug; 15331507Ssef#endif 15431507Ssef 15564294Sps#ifdef DDB 15664294Spsstatic int ddb_on_nmi = 1; 15764294SpsSYSCTL_INT(_machdep, OID_AUTO, ddb_on_nmi, CTLFLAG_RW, 15864294Sps &ddb_on_nmi, 0, "Go to DDB on NMI"); 15964294Sps#endif 16064294Spsstatic int panic_on_nmi = 1; 16164294SpsSYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RW, 16264294Sps &panic_on_nmi, 0, "Panic on NMI"); 16364294Sps 16469881Sjake#ifdef WITNESS 16569881Sjakeextern char *syscallnames[]; 16669881Sjake#endif 16769881Sjake 16871527Sjhbvoid 16971527Sjhbuserret(p, frame, oticks) 1701690Sdg struct proc *p; 1711690Sdg struct trapframe *frame; 1721690Sdg u_quad_t oticks; 1731690Sdg{ 17471527Sjhb int sig; 175757Sdg 17658717Sdillon while ((sig = CURSIG(p)) != 0) { 17771527Sjhb if (!mtx_owned(&Giant)) 17865557Sjasone mtx_enter(&Giant, MTX_DEF); 1791690Sdg postsig(sig); 18058717Sdillon } 18128013Sdyson 18271527Sjhb mtx_enter(&sched_lock, MTX_SPIN); 1831690Sdg p->p_priority = p->p_usrpri; 18458717Sdillon if (resched_wanted()) { 1851690Sdg /* 1861690Sdg * Since we are curproc, clock will normally just change 1871690Sdg * our priority without moving us from one queue to another 1881690Sdg * (since the running process is not on a queue.) 1891690Sdg * If that happened after we setrunqueue ourselves but before we 1901690Sdg * mi_switch()'ed, we might not be on the queue indicated by 1911690Sdg * our priority. 1921690Sdg */ 19368808Sjhb DROP_GIANT_NOSWITCH(); 1941690Sdg setrunqueue(p); 1951690Sdg p->p_stats->p_ru.ru_nivcsw++; 1961690Sdg mi_switch(); 19765557Sjasone mtx_exit(&sched_lock, MTX_SPIN); 19868808Sjhb PICKUP_GIANT(); 19965557Sjasone while ((sig = CURSIG(p)) != 0) { 20071527Sjhb if (!mtx_owned(&Giant)) 20165557Sjasone mtx_enter(&Giant, MTX_DEF); 2021690Sdg postsig(sig); 20365557Sjasone } 20471527Sjhb mtx_enter(&sched_lock, MTX_SPIN); 2051690Sdg } 20671527Sjhb 2076296Sdg /* 2086296Sdg * Charge system time if profiling. 2096296Sdg */ 21071527Sjhb if (p->p_sflag & PS_PROFIL) { 21171527Sjhb mtx_exit(&sched_lock, MTX_SPIN); 21271527Sjhb /* XXX - do we need Giant? */ 21371527Sjhb if (!mtx_owned(&Giant)) 21465557Sjasone mtx_enter(&Giant, MTX_DEF); 21571527Sjhb mtx_enter(&sched_lock, MTX_SPIN); 21616725Sbde addupc_task(p, frame->tf_eip, 21716725Sbde (u_int)(p->p_sticks - oticks) * psratio); 21858717Sdillon } 2191690Sdg curpriority = p->p_priority; 22071527Sjhb mtx_exit(&sched_lock, MTX_SPIN); 2211690Sdg} 2221690Sdg 2234Srgrimes/* 22411343Sbde * Exception, fault, and trap interface to the FreeBSD kernel. 2251690Sdg * This common code is called from assembly language IDT gate entry 2264Srgrimes * routines that prepare a suitable stack frame, and restore this 2271690Sdg * frame after the exception has been processed. 2284Srgrimes */ 2294Srgrimes 230798Swollmanvoid 2314Srgrimestrap(frame) 2324Srgrimes struct trapframe frame; 2334Srgrimes{ 2341690Sdg struct proc *p = curproc; 2351549Srgrimes u_quad_t sticks = 0; 2363436Sphk int i = 0, ucode = 0, type, code; 23741454Skato vm_offset_t eva; 23865557Sjasone#ifdef POWERFAIL_NMI 23965557Sjasone static int lastalert = 0; 24065557Sjasone#endif 2414Srgrimes 24265557Sjasone atomic_add_int(&cnt.v_trap, 1); 24365557Sjasone 24465557Sjasone if ((frame.tf_eflags & PSL_I) == 0) { 24541454Skato /* 24665557Sjasone * Buggy application or kernel code has disabled 24765557Sjasone * interrupts and then trapped. Enabling interrupts 24865557Sjasone * now is wrong, but it is better than running with 24965557Sjasone * interrupts disabled until they are accidentally 25071527Sjhb * enabled later. XXX This is really bad if we trap 25171527Sjhb * while holding a spin lock. 25241454Skato */ 25341454Skato type = frame.tf_trapno; 25441454Skato if (ISPL(frame.tf_cs) == SEL_UPL || (frame.tf_eflags & PSL_VM)) 25541454Skato printf( 25641454Skato "pid %ld (%s): trap %d with interrupts disabled\n", 25741454Skato (long)curproc->p_pid, curproc->p_comm, type); 25841454Skato else if (type != T_BPTFLT && type != T_TRCTRAP) 25941454Skato /* 26041454Skato * XXX not quite right, since this may be for a 26141454Skato * multiple fault in user mode. 26241454Skato */ 26341454Skato printf("kernel trap %d with interrupts disabled\n", 26441454Skato type); 26571527Sjhb /* 26671527Sjhb * We should walk p_heldmtx here and see if any are 26771527Sjhb * spin mutexes, and not do this if so. 26871527Sjhb */ 26941454Skato enable_intr(); 27041454Skato } 27141454Skato 27241454Skato eva = 0; 27341454Skato if (frame.tf_trapno == T_PAGEFLT) { 27441454Skato /* 27565557Sjasone * For some Cyrix CPUs, %cr2 is clobbered by 27665557Sjasone * interrupts. This problem is worked around by using 27765557Sjasone * an interrupt gate for the pagefault handler. We 27865557Sjasone * are finally ready to read %cr2 and then must 27965557Sjasone * reenable interrupts. 28041454Skato */ 28141454Skato eva = rcr2(); 28241454Skato enable_intr(); 28365557Sjasone } 28441454Skato 28565811Sbde mtx_enter(&Giant, MTX_DEF); 28665557Sjasone 28731535Sjkh#if defined(I586_CPU) && !defined(NO_F00F_HACK) 28831507Ssefrestart: 28931507Ssef#endif 29065557Sjasone 2914Srgrimes type = frame.tf_trapno; 2921690Sdg code = frame.tf_err; 2938876Srgrimes 29465557Sjasone if ((ISPL(frame.tf_cs) == SEL_UPL) || 29565557Sjasone ((frame.tf_eflags & PSL_VM) && !in_vm86call)) { 2961690Sdg /* user trap */ 297200Sdg 29871527Sjhb mtx_enter(&sched_lock, MTX_SPIN); 2991690Sdg sticks = p->p_sticks; 30071527Sjhb mtx_exit(&sched_lock, MTX_SPIN); 30125555Speter p->p_md.md_regs = &frame; 3024Srgrimes 3031690Sdg switch (type) { 3041690Sdg case T_PRIVINFLT: /* privileged instruction fault */ 3051690Sdg ucode = type; 3061690Sdg i = SIGILL; 3071690Sdg break; 308974Sdg 3091690Sdg case T_BPTFLT: /* bpt instruction fault */ 3101690Sdg case T_TRCTRAP: /* trace trap */ 3111690Sdg frame.tf_eflags &= ~PSL_T; 3121690Sdg i = SIGTRAP; 3131690Sdg break; 314974Sdg 3151690Sdg case T_ARITHTRAP: /* arithmetic trap */ 3161690Sdg ucode = code; 3171690Sdg i = SIGFPE; 3181690Sdg break; 3194Srgrimes 32027993Sdyson /* 32127993Sdyson * The following two traps can happen in 32227993Sdyson * vm86 mode, and, if so, we want to handle 32327993Sdyson * them specially. 32427993Sdyson */ 3251690Sdg case T_PROTFLT: /* general protection fault */ 32627993Sdyson case T_STKFLT: /* stack fault */ 32728872Sjlemon if (frame.tf_eflags & PSL_VM) { 32828872Sjlemon i = vm86_emulate((struct vm86frame *)&frame); 32927993Sdyson if (i == 0) 33065557Sjasone goto user; 33127993Sdyson break; 33227993Sdyson } 33327993Sdyson /* FALL THROUGH */ 33427993Sdyson 3351690Sdg case T_SEGNPFLT: /* segment not present fault */ 3365603Sbde case T_TSSFLT: /* invalid TSS fault */ 3375603Sbde case T_DOUBLEFLT: /* double fault */ 3385603Sbde default: 3391690Sdg ucode = code + BUS_SEGM_FAULT ; 3401690Sdg i = SIGBUS; 3411690Sdg break; 3424Srgrimes 3431690Sdg case T_PAGEFLT: /* page fault */ 34441454Skato i = trap_pfault(&frame, TRUE, eva); 34531535Sjkh#if defined(I586_CPU) && !defined(NO_F00F_HACK) 34665557Sjasone if (i == -2) { 34765557Sjasone /* 34865557Sjasone * f00f hack workaround has triggered, treat 34965557Sjasone * as illegal instruction not page fault. 35065557Sjasone */ 35165557Sjasone frame.tf_trapno = T_PRIVINFLT; 35231507Ssef goto restart; 35365557Sjasone } 35431507Ssef#endif 35565557Sjasone if (i == -1) 35665557Sjasone goto out; 3571690Sdg if (i == 0) 35865557Sjasone goto user; 3594Srgrimes 3601690Sdg ucode = T_PAGEFLT; 3611690Sdg break; 3624Srgrimes 3631690Sdg case T_DIVIDE: /* integer divide fault */ 36449081Scracauer ucode = FPE_INTDIV; 3651690Sdg i = SIGFPE; 3661690Sdg break; 3674Srgrimes 3681690Sdg#if NISA > 0 3691690Sdg case T_NMI: 3709545Sjoerg#ifdef POWERFAIL_NMI 37165557Sjasone#ifndef TIMER_FREQ 37265557Sjasone# define TIMER_FREQ 1193182 37365557Sjasone#endif 37465557Sjasone if (time_second - lastalert > 10) { 37565557Sjasone log(LOG_WARNING, "NMI: power fail\n"); 37665557Sjasone sysbeep(TIMER_FREQ/880, hz); 37765557Sjasone lastalert = time_second; 37865557Sjasone } 37965557Sjasone goto out; 3809545Sjoerg#else /* !POWERFAIL_NMI */ 38163140Sps /* machine/parity/power fail/"kitchen sink" faults */ 38263140Sps if (isa_nmi(code) == 0) { 3832320Sdg#ifdef DDB 38464294Sps /* 38564294Sps * NMI can be hooked up to a pushbutton 38664294Sps * for debugging. 38764294Sps */ 38864294Sps if (ddb_on_nmi) { 38964294Sps printf ("NMI ... going to debugger\n"); 39064294Sps kdb_trap (type, 0, &frame); 39164294Sps } 39263140Sps#endif /* DDB */ 39365557Sjasone goto out; 39464294Sps } else if (panic_on_nmi) 39564294Sps panic("NMI indicates hardware failure"); 39664294Sps break; 3979545Sjoerg#endif /* POWERFAIL_NMI */ 3989545Sjoerg#endif /* NISA > 0 */ 3994Srgrimes 4001690Sdg case T_OFLOW: /* integer overflow fault */ 40149081Scracauer ucode = FPE_INTOVF; 4021690Sdg i = SIGFPE; 4031690Sdg break; 4044Srgrimes 4051690Sdg case T_BOUND: /* bounds check fault */ 40649081Scracauer ucode = FPE_FLTSUB; 4071690Sdg i = SIGFPE; 4081690Sdg break; 4094Srgrimes 4101690Sdg case T_DNA: 41171257Speter#ifdef DEV_NPX 41265557Sjasone /* transparent fault (due to context switch "late") */ 4131690Sdg if (npxdna()) 41465557Sjasone goto out; 41517117Sbde#endif 41612817Sphk if (!pmath_emulate) { 41712817Sphk i = SIGFPE; 41812817Sphk ucode = FPE_FPU_NP_TRAP; 41912817Sphk break; 42012817Sphk } 42112817Sphk i = (*pmath_emulate)(&frame); 4225220Sbde if (i == 0) { 4235220Sbde if (!(frame.tf_eflags & PSL_T)) 42465557Sjasone goto out; 4255220Sbde frame.tf_eflags &= ~PSL_T; 4265220Sbde i = SIGTRAP; 4275220Sbde } 4285220Sbde /* else ucode = emulator_only_knows() XXX */ 4291690Sdg break; 430974Sdg 4311690Sdg case T_FPOPFLT: /* FPU operand fetch fault */ 4321690Sdg ucode = T_FPOPFLT; 4331690Sdg i = SIGILL; 4341690Sdg break; 435974Sdg } 4361690Sdg } else { 4371690Sdg /* kernel trap */ 438974Sdg 4391690Sdg switch (type) { 4401690Sdg case T_PAGEFLT: /* page fault */ 44141454Skato (void) trap_pfault(&frame, FALSE, eva); 44265557Sjasone goto out; 4434Srgrimes 44416344Sasami case T_DNA: 44571257Speter#ifdef DEV_NPX 44617117Sbde /* 44717117Sbde * The kernel is apparently using npx for copying. 44817117Sbde * XXX this should be fatal unless the kernel has 44917117Sbde * registered such use. 45017117Sbde */ 45116344Sasami if (npxdna()) 45265557Sjasone goto out; 45317117Sbde#endif 45416344Sasami break; 45516344Sasami 4565603Sbde /* 45765557Sjasone * The following two traps can happen in 45865557Sjasone * vm86 mode, and, if so, we want to handle 45965557Sjasone * them specially. 4605603Sbde */ 46165557Sjasone case T_PROTFLT: /* general protection fault */ 46265557Sjasone case T_STKFLT: /* stack fault */ 46365557Sjasone if (frame.tf_eflags & PSL_VM) { 46465557Sjasone i = vm86_emulate((struct vm86frame *)&frame); 46565557Sjasone if (i != 0) 46665557Sjasone /* 46765557Sjasone * returns to original process 46865557Sjasone */ 46969987Sjhb mtx_exit(&Giant, MTX_DEF); 47065557Sjasone vm86_trap((struct vm86frame *)&frame); 47165557Sjasone goto out; 47265557Sjasone } 47366712Sjhb if (type == T_STKFLT) 47466712Sjhb break; 47566712Sjhb 47665557Sjasone /* FALL THROUGH */ 4775603Sbde 47865557Sjasone case T_SEGNPFLT: /* segment not present fault */ 47965557Sjasone if (in_vm86call) 48065557Sjasone break; 48165557Sjasone 48271337Sjake if (p->p_intr_nesting_level != 0) 48365557Sjasone break; 48465557Sjasone 48566713Sjhb /* 48666713Sjhb * Invalid %fs's and %gs's can be created using 48766713Sjhb * procfs or PT_SETREGS or by invalidating the 48866713Sjhb * underlying LDT entry. This causes a fault 48966713Sjhb * in kernel mode when the kernel attempts to 49066713Sjhb * switch contexts. Lose the bad context 49166713Sjhb * (XXX) so that we can continue, and generate 49266713Sjhb * a signal. 49366713Sjhb */ 49466713Sjhb if (frame.tf_eip == (int)cpu_switch_load_gs) { 49570861Sjake PCPU_GET(curpcb)->pcb_gs = 0; 49666713Sjhb psignal(p, SIGBUS); 49765557Sjasone goto out; 49865557Sjasone } 49965557Sjasone 50065557Sjasone /* 50165557Sjasone * Invalid segment selectors and out of bounds 50265557Sjasone * %eip's and %esp's can be set up in user mode. 50365557Sjasone * This causes a fault in kernel mode when the 50465557Sjasone * kernel tries to return to user mode. We want 50565557Sjasone * to get this fault so that we can fix the 50665557Sjasone * problem here and not have to check all the 50765557Sjasone * selectors and pointers when the user changes 50865557Sjasone * them. 50965557Sjasone */ 51065557Sjasone if (frame.tf_eip == (int)doreti_iret) { 51165557Sjasone frame.tf_eip = (int)doreti_iret_fault; 51265557Sjasone goto out; 51365557Sjasone } 51465557Sjasone if (frame.tf_eip == (int)doreti_popl_ds) { 51565557Sjasone frame.tf_eip = (int)doreti_popl_ds_fault; 51665557Sjasone goto out; 51765557Sjasone } 51865557Sjasone if (frame.tf_eip == (int)doreti_popl_es) { 51965557Sjasone frame.tf_eip = (int)doreti_popl_es_fault; 52065557Sjasone goto out; 52170861Sjake } 52265557Sjasone if (frame.tf_eip == (int)doreti_popl_fs) { 52365557Sjasone frame.tf_eip = (int)doreti_popl_fs_fault; 52465557Sjasone goto out; 52565557Sjasone } 52670861Sjake if (PCPU_GET(curpcb) != NULL && 52770861Sjake PCPU_GET(curpcb)->pcb_onfault != NULL) { 52870861Sjake frame.tf_eip = 52970861Sjake (int)PCPU_GET(curpcb)->pcb_onfault; 53065557Sjasone goto out; 5315603Sbde } 5321690Sdg break; 5334Srgrimes 5345603Sbde case T_TSSFLT: 5355603Sbde /* 5365603Sbde * PSL_NT can be set in user mode and isn't cleared 5375603Sbde * automatically when the kernel is entered. This 5385603Sbde * causes a TSS fault when the kernel attempts to 5395603Sbde * `iret' because the TSS link is uninitialized. We 5405603Sbde * want to get this fault so that we can fix the 5415603Sbde * problem here and not every time the kernel is 5425603Sbde * entered. 5435603Sbde */ 5445603Sbde if (frame.tf_eflags & PSL_NT) { 5455603Sbde frame.tf_eflags &= ~PSL_NT; 54665557Sjasone goto out; 5475603Sbde } 5485603Sbde break; 5495603Sbde 55011343Sbde case T_TRCTRAP: /* trace trap */ 55111343Sbde if (frame.tf_eip == (int)IDTVEC(syscall)) { 55211343Sbde /* 55311343Sbde * We've just entered system mode via the 55411343Sbde * syscall lcall. Continue single stepping 55511343Sbde * silently until the syscall handler has 55611343Sbde * saved the flags. 55711343Sbde */ 55865557Sjasone goto out; 55911343Sbde } 56011343Sbde if (frame.tf_eip == (int)IDTVEC(syscall) + 1) { 56111343Sbde /* 56211343Sbde * The syscall handler has now saved the 56311343Sbde * flags. Stop single stepping it. 56411343Sbde */ 56511343Sbde frame.tf_eflags &= ~PSL_T; 56665557Sjasone goto out; 56711343Sbde } 56811343Sbde /* 56962298Sbsd * Ignore debug register trace traps due to 57062298Sbsd * accesses in the user's address space, which 57162298Sbsd * can happen under several conditions such as 57262298Sbsd * if a user sets a watchpoint on a buffer and 57362298Sbsd * then passes that buffer to a system call. 57462298Sbsd * We still want to get TRCTRAPS for addresses 57562298Sbsd * in kernel space because that is useful when 57662298Sbsd * debugging the kernel. 57762298Sbsd */ 57865557Sjasone if (user_dbreg_trap() && !in_vm86call) { 57962298Sbsd /* 58062298Sbsd * Reset breakpoint bits because the 58162298Sbsd * processor doesn't 58262298Sbsd */ 58362298Sbsd load_dr6(rdr6() & 0xfffffff0); 58465557Sjasone goto out; 58562298Sbsd } 58662298Sbsd /* 58757362Sbsd * Fall through (TRCTRAP kernel mode, kernel address) 58811343Sbde */ 58911343Sbde case T_BPTFLT: 59011343Sbde /* 59111343Sbde * If DDB is enabled, let it handle the debugger trap. 59211343Sbde * Otherwise, debugger traps "can't happen". 59311343Sbde */ 5942320Sdg#ifdef DDB 5951690Sdg if (kdb_trap (type, 0, &frame)) 59665557Sjasone goto out; 59711343Sbde#endif 5981690Sdg break; 5998876Srgrimes 6001072Sdg#if NISA > 0 6011690Sdg case T_NMI: 6029545Sjoerg#ifdef POWERFAIL_NMI 60365557Sjasone if (time_second - lastalert > 10) { 60466713Sjhb log(LOG_WARNING, "NMI: power fail\n"); 60566713Sjhb sysbeep(TIMER_FREQ/880, hz); 60666713Sjhb lastalert = time_second; 60766713Sjhb } 60865557Sjasone goto out; 6099545Sjoerg#else /* !POWERFAIL_NMI */ 61063140Sps /* machine/parity/power fail/"kitchen sink" faults */ 61163140Sps if (isa_nmi(code) == 0) { 6122320Sdg#ifdef DDB 61364294Sps /* 61464294Sps * NMI can be hooked up to a pushbutton 61564294Sps * for debugging. 61664294Sps */ 61764294Sps if (ddb_on_nmi) { 61864294Sps printf ("NMI ... going to debugger\n"); 61964294Sps kdb_trap (type, 0, &frame); 62064294Sps } 62163140Sps#endif /* DDB */ 62265557Sjasone goto out; 62364294Sps } else if (panic_on_nmi == 0) 62465557Sjasone goto out; 6251690Sdg /* FALL THROUGH */ 6269545Sjoerg#endif /* POWERFAIL_NMI */ 6279545Sjoerg#endif /* NISA > 0 */ 6281072Sdg } 6291072Sdg 63041454Skato trap_fatal(&frame, eva); 63165557Sjasone goto out; 6324Srgrimes } 6334Srgrimes 63435496Seivind /* Translate fault for emulators (e.g. Linux) */ 63535496Seivind if (*p->p_sysent->sv_transtrap) 63635496Seivind i = (*p->p_sysent->sv_transtrap)(i, type); 63735496Seivind 6384Srgrimes trapsignal(p, i, ucode); 6391342Sdg 6407213Sdg#ifdef DEBUG 6411690Sdg if (type <= MAX_TRAP_MSG) { 6428876Srgrimes uprintf("fatal process exception: %s", 6431690Sdg trap_msg[type]); 6441690Sdg if ((type == T_PAGEFLT) || (type == T_PROTFLT)) 64541547Sarchie uprintf(", fault VA = 0x%lx", (u_long)eva); 6461342Sdg uprintf("\n"); 6471342Sdg } 6481342Sdg#endif 6491342Sdg 65065557Sjasoneuser: 65171527Sjhb userret(p, &frame, sticks); 6524Srgrimesout: 65371527Sjhb if (mtx_owned(&Giant)) 65471527Sjhb mtx_exit(&Giant, MTX_DEF); 6551690Sdg} 6561690Sdg 6577214Sdg#ifdef notyet 6587214Sdg/* 6597214Sdg * This version doesn't allow a page fault to user space while 6607214Sdg * in the kernel. The rest of the kernel needs to be made "safe" 6617214Sdg * before this can be used. I think the only things remaining 6627214Sdg * to be made safe are the iBCS2 code and the process tracing/ 6637214Sdg * debugging code. 6647214Sdg */ 66512702Sphkstatic int 66641454Skatotrap_pfault(frame, usermode, eva) 6671690Sdg struct trapframe *frame; 6681690Sdg int usermode; 66941454Skato vm_offset_t eva; 6701690Sdg{ 6711690Sdg vm_offset_t va; 6722660Sdg struct vmspace *vm = NULL; 6731690Sdg vm_map_t map = 0; 6743436Sphk int rv = 0; 6751690Sdg vm_prot_t ftype; 6761690Sdg struct proc *p = curproc; 6771690Sdg 6787214Sdg if (frame->tf_err & PGEX_W) 67964063Sluoqi ftype = VM_PROT_WRITE; 6807214Sdg else 6817214Sdg ftype = VM_PROT_READ; 6827214Sdg 68341454Skato va = trunc_page(eva); 6847214Sdg if (va < VM_MIN_KERNEL_ADDRESS) { 6857214Sdg vm_offset_t v; 68614243Sdyson vm_page_t mpte; 6877214Sdg 6889799Sdg if (p == NULL || 6897214Sdg (!usermode && va < VM_MAXUSER_ADDRESS && 69071337Sjake (p->p_intr_nesting_level != 0 || 69170861Sjake PCPU_GET(curpcb) == NULL || 69270861Sjake PCPU_GET(curpcb)->pcb_onfault == NULL))) { 69341454Skato trap_fatal(frame, eva); 6947214Sdg return (-1); 6957214Sdg } 6967214Sdg 6977214Sdg /* 6987214Sdg * This is a fault on non-kernel virtual memory. 6997214Sdg * vm is initialized above to NULL. If curproc is NULL 7007214Sdg * or curproc->p_vmspace is NULL the fault is fatal. 7017214Sdg */ 7027214Sdg vm = p->p_vmspace; 7037214Sdg if (vm == NULL) 7047214Sdg goto nogo; 7057214Sdg 7067214Sdg map = &vm->vm_map; 7077214Sdg 7087214Sdg /* 7097214Sdg * Keep swapout from messing with us during this 7107214Sdg * critical time. 7117214Sdg */ 71271527Sjhb PROC_LOCK(p); 7137214Sdg ++p->p_lock; 71471527Sjhb PROC_UNLOCK(p); 7157214Sdg 7167214Sdg /* 7177214Sdg * Grow the stack if necessary 7187214Sdg */ 71942360Sjulian /* grow_stack returns false only if va falls into 72042360Sjulian * a growable stack region and the stack growth 72142360Sjulian * fails. It returns true if va was not within 72242360Sjulian * a growable stack region, or if the stack 72342360Sjulian * growth succeeded. 72442360Sjulian */ 72542360Sjulian if (!grow_stack (p, va)) { 72642360Sjulian rv = KERN_FAILURE; 72771527Sjhb PROC_LOCK(p); 72842360Sjulian --p->p_lock; 72971527Sjhb PROC_UNLOCK(p); 73042360Sjulian goto nogo; 73142360Sjulian } 73242360Sjulian 7337214Sdg /* Fault in the user page: */ 73424666Sdyson rv = vm_fault(map, va, ftype, 73553045Salc (ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY 73653045Salc : VM_FAULT_NORMAL); 7377214Sdg 73871527Sjhb PROC_LOCK(p); 7397214Sdg --p->p_lock; 74071527Sjhb PROC_UNLOCK(p); 7417214Sdg } else { 7427214Sdg /* 7437214Sdg * Don't allow user-mode faults in kernel address space. 7447214Sdg */ 7457214Sdg if (usermode) 7467214Sdg goto nogo; 7477214Sdg 7487214Sdg /* 7497214Sdg * Since we know that kernel virtual address addresses 7507214Sdg * always have pte pages mapped, we just have to fault 7517214Sdg * the page. 7527214Sdg */ 75353045Salc rv = vm_fault(kernel_map, va, ftype, VM_FAULT_NORMAL); 7547214Sdg } 7557214Sdg 7567214Sdg if (rv == KERN_SUCCESS) 7577214Sdg return (0); 7587214Sdgnogo: 7597214Sdg if (!usermode) { 76071337Sjake if (p->p_intr_nesting_level == 0 && 76170861Sjake PCPU_GET(curpcb) != NULL && 76270861Sjake PCPU_GET(curpcb)->pcb_onfault != NULL) { 76370861Sjake frame->tf_eip = (int)PCPU_GET(curpcb)->pcb_onfault; 7647214Sdg return (0); 7657214Sdg } 76641454Skato trap_fatal(frame, eva); 7677214Sdg return (-1); 7687214Sdg } 7697214Sdg 7707214Sdg /* kludge to pass faulting virtual address to sendsig */ 7717214Sdg frame->tf_err = eva; 7727214Sdg 7737214Sdg return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV); 7747214Sdg} 7757214Sdg#endif 7767214Sdg 7777214Sdgint 77841454Skatotrap_pfault(frame, usermode, eva) 7797214Sdg struct trapframe *frame; 7807214Sdg int usermode; 78141454Skato vm_offset_t eva; 7827214Sdg{ 7837214Sdg vm_offset_t va; 7847214Sdg struct vmspace *vm = NULL; 7857214Sdg vm_map_t map = 0; 7867214Sdg int rv = 0; 7877214Sdg vm_prot_t ftype; 7887214Sdg struct proc *p = curproc; 7897214Sdg 79041454Skato va = trunc_page(eva); 7912660Sdg if (va >= KERNBASE) { 7922660Sdg /* 7932660Sdg * Don't allow user-mode faults in kernel address space. 79431507Ssef * An exception: if the faulting address is the invalid 79531507Ssef * instruction entry in the IDT, then the Intel Pentium 79631507Ssef * F00F bug workaround was triggered, and we need to 79731507Ssef * treat it is as an illegal instruction, and not a page 79831507Ssef * fault. 7992660Sdg */ 80031535Sjkh#if defined(I586_CPU) && !defined(NO_F00F_HACK) 80165557Sjasone if ((eva == (unsigned int)&idt[6]) && has_f00f_bug) 80231507Ssef return -2; 80331507Ssef#endif 8042660Sdg if (usermode) 8052660Sdg goto nogo; 8061690Sdg 8071690Sdg map = kernel_map; 8081690Sdg } else { 8092660Sdg /* 8102660Sdg * This is a fault on non-kernel virtual memory. 8112660Sdg * vm is initialized above to NULL. If curproc is NULL 8122660Sdg * or curproc->p_vmspace is NULL the fault is fatal. 8132660Sdg */ 8142660Sdg if (p != NULL) 8152660Sdg vm = p->p_vmspace; 8162660Sdg 8172660Sdg if (vm == NULL) 8182660Sdg goto nogo; 8192660Sdg 8201690Sdg map = &vm->vm_map; 8211690Sdg } 8221690Sdg 8231690Sdg if (frame->tf_err & PGEX_W) 82464063Sluoqi ftype = VM_PROT_WRITE; 8251690Sdg else 8261690Sdg ftype = VM_PROT_READ; 8271690Sdg 8281690Sdg if (map != kernel_map) { 8294Srgrimes /* 8301690Sdg * Keep swapout from messing with us during this 8311690Sdg * critical time. 8324Srgrimes */ 83371527Sjhb PROC_LOCK(p); 8341690Sdg ++p->p_lock; 83571527Sjhb PROC_UNLOCK(p); 8361690Sdg 8371690Sdg /* 8381690Sdg * Grow the stack if necessary 8391690Sdg */ 84042360Sjulian /* grow_stack returns false only if va falls into 84142360Sjulian * a growable stack region and the stack growth 84242360Sjulian * fails. It returns true if va was not within 84342360Sjulian * a growable stack region, or if the stack 84442360Sjulian * growth succeeded. 84542360Sjulian */ 84642360Sjulian if (!grow_stack (p, va)) { 84742360Sjulian rv = KERN_FAILURE; 84871527Sjhb PROC_LOCK(p); 84942360Sjulian --p->p_lock; 85071527Sjhb PROC_UNLOCK(p); 85142360Sjulian goto nogo; 85242360Sjulian } 8531690Sdg 8541690Sdg /* Fault in the user page: */ 85524666Sdyson rv = vm_fault(map, va, ftype, 85653045Salc (ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY 85753045Salc : VM_FAULT_NORMAL); 8581690Sdg 85971527Sjhb PROC_LOCK(p); 8601690Sdg --p->p_lock; 86171527Sjhb PROC_UNLOCK(p); 8621690Sdg } else { 8631690Sdg /* 86471527Sjhb * Don't have to worry about process locking or stacks in the 86571527Sjhb * kernel. 8661690Sdg */ 86753045Salc rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL); 8684Srgrimes } 8694Srgrimes 8701690Sdg if (rv == KERN_SUCCESS) 8711690Sdg return (0); 8721690Sdgnogo: 8731690Sdg if (!usermode) { 87471337Sjake if (p->p_intr_nesting_level == 0 && 87570861Sjake PCPU_GET(curpcb) != NULL && 87670861Sjake PCPU_GET(curpcb)->pcb_onfault != NULL) { 87770861Sjake frame->tf_eip = (int)PCPU_GET(curpcb)->pcb_onfault; 8781690Sdg return (0); 8794Srgrimes } 88041454Skato trap_fatal(frame, eva); 8814014Sbde return (-1); 8824Srgrimes } 8831690Sdg 8841690Sdg /* kludge to pass faulting virtual address to sendsig */ 8851690Sdg frame->tf_err = eva; 8861690Sdg 8871690Sdg return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV); 8884Srgrimes} 8894Srgrimes 89012702Sphkstatic void 89141454Skatotrap_fatal(frame, eva) 8921690Sdg struct trapframe *frame; 89341454Skato vm_offset_t eva; 8941690Sdg{ 89541454Skato int code, type, ss, esp; 8963258Sdg struct soft_segment_descriptor softseg; 8971690Sdg 8981690Sdg code = frame->tf_err; 8991690Sdg type = frame->tf_trapno; 9004014Sbde sdtossd(&gdt[IDXSEL(frame->tf_cs & 0xffff)].sd, &softseg); 9011690Sdg 9021690Sdg if (type <= MAX_TRAP_MSG) 9031690Sdg printf("\n\nFatal trap %d: %s while in %s mode\n", 9041690Sdg type, trap_msg[type], 90527993Sdyson frame->tf_eflags & PSL_VM ? "vm86" : 90628496Scharnier ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel"); 90725164Speter#ifdef SMP 90865557Sjasone /* two seperate prints in case of a trap on an unmapped page */ 90970861Sjake printf("cpuid = %d; ", PCPU_GET(cpuid)); 91029128Speter printf("lapic.id = %08x\n", lapic.id); 91125164Speter#endif 9121690Sdg if (type == T_PAGEFLT) { 9131690Sdg printf("fault virtual address = 0x%x\n", eva); 9141690Sdg printf("fault code = %s %s, %s\n", 9151690Sdg code & PGEX_U ? "user" : "supervisor", 9161690Sdg code & PGEX_W ? "write" : "read", 9171690Sdg code & PGEX_P ? "protection violation" : "page not present"); 9181690Sdg } 91914837Sbde printf("instruction pointer = 0x%x:0x%x\n", 92014837Sbde frame->tf_cs & 0xffff, frame->tf_eip); 92128496Scharnier if ((ISPL(frame->tf_cs) == SEL_UPL) || (frame->tf_eflags & PSL_VM)) { 92214837Sbde ss = frame->tf_ss & 0xffff; 92314837Sbde esp = frame->tf_esp; 92414837Sbde } else { 92514837Sbde ss = GSEL(GDATA_SEL, SEL_KPL); 92614837Sbde esp = (int)&frame->tf_esp; 92714837Sbde } 92814837Sbde printf("stack pointer = 0x%x:0x%x\n", ss, esp); 92914837Sbde printf("frame pointer = 0x%x:0x%x\n", ss, frame->tf_ebp); 9303258Sdg printf("code segment = base 0x%x, limit 0x%x, type 0x%x\n", 93114837Sbde softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type); 9323258Sdg printf(" = DPL %d, pres %d, def32 %d, gran %d\n", 93314837Sbde softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_def32, 93414837Sbde softseg.ssd_gran); 9351690Sdg printf("processor eflags = "); 9362578Sbde if (frame->tf_eflags & PSL_T) 93714837Sbde printf("trace trap, "); 9382578Sbde if (frame->tf_eflags & PSL_I) 9391690Sdg printf("interrupt enabled, "); 9402578Sbde if (frame->tf_eflags & PSL_NT) 9411690Sdg printf("nested task, "); 9422578Sbde if (frame->tf_eflags & PSL_RF) 9431690Sdg printf("resume, "); 9442578Sbde if (frame->tf_eflags & PSL_VM) 9451690Sdg printf("vm86, "); 9462578Sbde printf("IOPL = %d\n", (frame->tf_eflags & PSL_IOPL) >> 12); 9471690Sdg printf("current process = "); 9481690Sdg if (curproc) { 9493436Sphk printf("%lu (%s)\n", 9503436Sphk (u_long)curproc->p_pid, curproc->p_comm ? 9511690Sdg curproc->p_comm : ""); 9521690Sdg } else { 9531690Sdg printf("Idle\n"); 9541690Sdg } 9551690Sdg 9561690Sdg#ifdef KDB 9571690Sdg if (kdb_trap(&psl)) 9581690Sdg return; 9591690Sdg#endif 9602320Sdg#ifdef DDB 96155823Syokota if ((debugger_on_panic || db_active) && kdb_trap(type, 0, frame)) 9621690Sdg return; 9631690Sdg#endif 96425164Speter printf("trap number = %d\n", type); 9651690Sdg if (type <= MAX_TRAP_MSG) 9661690Sdg panic(trap_msg[type]); 9671690Sdg else 9681690Sdg panic("unknown/reserved trap"); 9691690Sdg} 9701690Sdg 9714Srgrimes/* 97212929Sdg * Double fault handler. Called when a fault occurs while writing 97312929Sdg * a frame for a trap/exception onto the stack. This usually occurs 97412929Sdg * when the stack overflows (such is the case with infinite recursion, 97512929Sdg * for example). 97612929Sdg * 97712929Sdg * XXX Note that the current PTD gets replaced by IdlePTD when the 97812929Sdg * task switch occurs. This means that the stack that was active at 97912929Sdg * the time of the double fault is not available at <kstack> unless 98012930Sdg * the machine was idle when the double fault occurred. The downside 98112929Sdg * of this is that "trace <ebp>" in ddb won't work. 98212929Sdg */ 98312929Sdgvoid 98412929Sdgdblfault_handler() 98512929Sdg{ 98624925Sbde printf("\nFatal double fault:\n"); 98770861Sjake printf("eip = 0x%x\n", PCPU_GET(common_tss.tss_eip)); 98870861Sjake printf("esp = 0x%x\n", PCPU_GET(common_tss.tss_esp)); 98970861Sjake printf("ebp = 0x%x\n", PCPU_GET(common_tss.tss_ebp)); 99026812Speter#ifdef SMP 99165557Sjasone /* two seperate prints in case of a trap on an unmapped page */ 99270861Sjake printf("cpuid = %d; ", PCPU_GET(cpuid)); 99329128Speter printf("lapic.id = %08x\n", lapic.id); 99425164Speter#endif 99512929Sdg panic("double fault"); 99612929Sdg} 99712929Sdg 99812929Sdg/* 999200Sdg * Compensate for 386 brain damage (missing URKR). 1000200Sdg * This is a little simpler than the pagefault handler in trap() because 1001200Sdg * it the page tables have already been faulted in and high addresses 1002200Sdg * are thrown out early for other reasons. 10034Srgrimes */ 1004200Sdgint trapwrite(addr) 1005200Sdg unsigned addr; 1006200Sdg{ 1007200Sdg struct proc *p; 100821953Sdyson vm_offset_t va; 1009200Sdg struct vmspace *vm; 1010974Sdg int rv; 10114Srgrimes 10124Srgrimes va = trunc_page((vm_offset_t)addr); 1013200Sdg /* 1014200Sdg * XXX - MAX is END. Changed > to >= for temp. fix. 1015200Sdg */ 1016200Sdg if (va >= VM_MAXUSER_ADDRESS) 1017200Sdg return (1); 10181127Sdg 1019200Sdg p = curproc; 1020200Sdg vm = p->p_vmspace; 1021974Sdg 102271527Sjhb PROC_LOCK(p); 10231549Srgrimes ++p->p_lock; 102471527Sjhb PROC_UNLOCK(p); 1025974Sdg 102642360Sjulian if (!grow_stack (p, va)) { 102771527Sjhb PROC_LOCK(p); 102842360Sjulian --p->p_lock; 102971527Sjhb PROC_UNLOCK(p); 103042360Sjulian return (1); 103142360Sjulian } 1032200Sdg 10331127Sdg /* 10341127Sdg * fault the data page 10351127Sdg */ 103664063Sluoqi rv = vm_fault(&vm->vm_map, va, VM_PROT_WRITE, VM_FAULT_DIRTY); 10371127Sdg 103871527Sjhb PROC_LOCK(p); 10391549Srgrimes --p->p_lock; 104071527Sjhb PROC_UNLOCK(p); 1041974Sdg 1042974Sdg if (rv != KERN_SUCCESS) 1043974Sdg return 1; 10441127Sdg 1045200Sdg return (0); 10464Srgrimes} 10474Srgrimes 10484Srgrimes/* 104958717Sdillon * syscall2 - MP aware system call request C handler 105058717Sdillon * 105158717Sdillon * A system call is essentially treated as a trap except that the 105258717Sdillon * MP lock is not held on entry or return. We are responsible for 105358717Sdillon * obtaining the MP lock if necessary and for handling ASTs 105458717Sdillon * (e.g. a task switch) prior to return. 105558717Sdillon * 105658717Sdillon * In general, only simple access and manipulation of curproc and 105758717Sdillon * the current stack is allowed without having to hold MP lock. 10584Srgrimes */ 1059798Swollmanvoid 106058717Sdillonsyscall2(frame) 10611690Sdg struct trapframe frame; 10624Srgrimes{ 10631690Sdg caddr_t params; 10641690Sdg int i; 10651690Sdg struct sysent *callp; 10661690Sdg struct proc *p = curproc; 10671549Srgrimes u_quad_t sticks; 106810157Sdg int error; 106958717Sdillon int narg; 107030994Sphk int args[8]; 10711549Srgrimes u_int code; 10724Srgrimes 107365557Sjasone atomic_add_int(&cnt.v_syscall, 1); 107465557Sjasone 107531389Sbde#ifdef DIAGNOSTIC 107658717Sdillon if (ISPL(frame.tf_cs) != SEL_UPL) { 107765557Sjasone mtx_enter(&Giant, MTX_DEF); 10784Srgrimes panic("syscall"); 107958717Sdillon /* NOT REACHED */ 108058717Sdillon } 108131389Sbde#endif 108258717Sdillon 108371527Sjhb mtx_enter(&sched_lock, MTX_SPIN); 108471527Sjhb sticks = p->p_sticks; 108571527Sjhb mtx_exit(&sched_lock, MTX_SPIN); 108658717Sdillon 108725555Speter p->p_md.md_regs = &frame; 108810157Sdg params = (caddr_t)frame.tf_esp + sizeof(int); 1089924Sdg code = frame.tf_eax; 109058717Sdillon 109114331Speter if (p->p_sysent->sv_prepsyscall) { 109258717Sdillon /* 109358717Sdillon * The prep code is not MP aware. 109458717Sdillon */ 109565557Sjasone mtx_enter(&Giant, MTX_DEF); 109614331Speter (*p->p_sysent->sv_prepsyscall)(&frame, args, &code, ¶ms); 109765557Sjasone mtx_exit(&Giant, MTX_DEF); 109814331Speter } else { 10991549Srgrimes /* 110014331Speter * Need to check if this is a 32 bit or 64 bit syscall. 110158717Sdillon * fuword is MP aware. 11021549Srgrimes */ 110314331Speter if (code == SYS_syscall) { 110414331Speter /* 110514331Speter * Code is first argument, followed by actual args. 110614331Speter */ 110714331Speter code = fuword(params); 110814331Speter params += sizeof(int); 110914331Speter } else if (code == SYS___syscall) { 111014331Speter /* 111114331Speter * Like syscall, but code is a quad, so as to maintain 111214331Speter * quad alignment for the rest of the arguments. 111314331Speter */ 111414331Speter code = fuword(params); 111514331Speter params += sizeof(quad_t); 111614331Speter } 11174Srgrimes } 11181549Srgrimes 11192257Ssos if (p->p_sysent->sv_mask) 112010157Sdg code &= p->p_sysent->sv_mask; 11218876Srgrimes 11222357Sbde if (code >= p->p_sysent->sv_size) 11232257Ssos callp = &p->p_sysent->sv_table[0]; 11242257Ssos else 11252257Ssos callp = &p->p_sysent->sv_table[code]; 11264Srgrimes 112758717Sdillon narg = callp->sy_narg & SYF_ARGMASK; 112858717Sdillon 112958717Sdillon /* 113058717Sdillon * copyin is MP aware, but the tracing code is not 113158717Sdillon */ 113258717Sdillon if (params && (i = narg * sizeof(int)) && 11334Srgrimes (error = copyin(params, (caddr_t)args, (u_int)i))) { 113465557Sjasone mtx_enter(&Giant, MTX_DEF); 11354Srgrimes#ifdef KTRACE 11364Srgrimes if (KTRPOINT(p, KTR_SYSCALL)) 113758717Sdillon ktrsyscall(p->p_tracep, code, narg, args); 11384Srgrimes#endif 11391690Sdg goto bad; 11404Srgrimes } 114158717Sdillon 114258717Sdillon /* 114358717Sdillon * Try to run the syscall without the MP lock if the syscall 114458717Sdillon * is MP safe. We have to obtain the MP lock no matter what if 114558717Sdillon * we are ktracing 114658717Sdillon */ 114758717Sdillon if ((callp->sy_narg & SYF_MPSAFE) == 0) { 114865557Sjasone mtx_enter(&Giant, MTX_DEF); 114958717Sdillon } 115058717Sdillon 11514Srgrimes#ifdef KTRACE 115258717Sdillon if (KTRPOINT(p, KTR_SYSCALL)) { 115371527Sjhb if (!mtx_owned(&Giant)) 115465557Sjasone mtx_enter(&Giant, MTX_DEF); 115558717Sdillon ktrsyscall(p->p_tracep, code, narg, args); 115658717Sdillon } 11574Srgrimes#endif 115830994Sphk p->p_retval[0] = 0; 115930994Sphk p->p_retval[1] = frame.tf_edx; 11601690Sdg 116158717Sdillon STOPEVENT(p, S_SCE, narg); /* MP aware */ 116231564Ssef 116330994Sphk error = (*callp->sy_call)(p, args); 11641690Sdg 116558717Sdillon /* 116658717Sdillon * MP SAFE (we may or may not have the MP lock at this point) 116758717Sdillon */ 11681690Sdg switch (error) { 11691690Sdg case 0: 117030994Sphk frame.tf_eax = p->p_retval[0]; 117130994Sphk frame.tf_edx = p->p_retval[1]; 117211343Sbde frame.tf_eflags &= ~PSL_C; 11731690Sdg break; 11741690Sdg 11751690Sdg case ERESTART: 117610157Sdg /* 117714331Speter * Reconstruct pc, assuming lcall $X,y is 7 bytes, 117814331Speter * int 0x80 is 2 bytes. We saved this in tf_err. 117910157Sdg */ 118014331Speter frame.tf_eip -= frame.tf_err; 11811690Sdg break; 11821690Sdg 11831690Sdg case EJUSTRETURN: 11841690Sdg break; 11851690Sdg 11861690Sdg default: 118710157Sdgbad: 118846568Speter if (p->p_sysent->sv_errsize) { 11893495Ssos if (error >= p->p_sysent->sv_errsize) 11903495Ssos error = -1; /* XXX */ 11918876Srgrimes else 11923495Ssos error = p->p_sysent->sv_errtbl[error]; 119346568Speter } 11941690Sdg frame.tf_eax = error; 119511343Sbde frame.tf_eflags |= PSL_C; 11961690Sdg break; 11974Srgrimes } 11984Srgrimes 119958717Sdillon /* 120058717Sdillon * Traced syscall. trapsignal() is not MP aware. 120158717Sdillon */ 120227993Sdyson if ((frame.tf_eflags & PSL_T) && !(frame.tf_eflags & PSL_VM)) { 120371527Sjhb if (!mtx_owned(&Giant)) 120465557Sjasone mtx_enter(&Giant, MTX_DEF); 120511163Sjulian frame.tf_eflags &= ~PSL_T; 120611343Sbde trapsignal(p, SIGTRAP, 0); 120711163Sjulian } 120811343Sbde 120958717Sdillon /* 121058717Sdillon * Handle reschedule and other end-of-syscall issues 121158717Sdillon */ 121271527Sjhb userret(p, &frame, sticks); 12131690Sdg 12144Srgrimes#ifdef KTRACE 121558717Sdillon if (KTRPOINT(p, KTR_SYSRET)) { 121671527Sjhb if (!mtx_owned(&Giant)) 121765557Sjasone mtx_enter(&Giant, MTX_DEF); 121830994Sphk ktrsysret(p->p_tracep, code, error, p->p_retval[0]); 121958717Sdillon } 12204Srgrimes#endif 122131564Ssef 122231564Ssef /* 122331564Ssef * This works because errno is findable through the 122431564Ssef * register set. If we ever support an emulation where this 122531564Ssef * is not the case, this code will need to be revisited. 122631564Ssef */ 122731564Ssef STOPEVENT(p, S_SCX, code); 122831564Ssef 122958717Sdillon /* 123071527Sjhb * Release Giant if we had to get it 123158717Sdillon */ 123271527Sjhb if (mtx_owned(&Giant)) 123365557Sjasone mtx_exit(&Giant, MTX_DEF); 123465557Sjasone 123569881Sjake#ifdef WITNESS 123669881Sjake if (witness_list(p)) { 123769881Sjake panic("system call %s returning with mutex(s) held\n", 123869881Sjake syscallnames[code]); 123969881Sjake } 124069881Sjake#endif 124171527Sjhb mtx_assert(&sched_lock, MA_NOTOWNED); 124271527Sjhb mtx_assert(&Giant, MA_NOTOWNED); 12434Srgrimes} 124424691Speter 124565557Sjasonevoid 124665557Sjasoneast(frame) 124765557Sjasone struct trapframe frame; 124865557Sjasone{ 124965557Sjasone struct proc *p = CURPROC; 125065557Sjasone u_quad_t sticks; 125165557Sjasone 125271527Sjhb mtx_enter(&sched_lock, MTX_SPIN); 125371527Sjhb sticks = p->p_sticks; 125465557Sjasone 125565557Sjasone astoff(); 125665557Sjasone atomic_add_int(&cnt.v_soft, 1); 125771527Sjhb if (p->p_sflag & PS_OWEUPC) { 125871527Sjhb p->p_sflag &= ~PS_OWEUPC; 125971527Sjhb mtx_exit(&sched_lock, MTX_SPIN); 126065557Sjasone mtx_enter(&Giant, MTX_DEF); 126171527Sjhb mtx_enter(&sched_lock, MTX_SPIN); 126265557Sjasone addupc_task(p, p->p_stats->p_prof.pr_addr, 126365557Sjasone p->p_stats->p_prof.pr_ticks); 126466713Sjhb } 126571527Sjhb if (p->p_sflag & PS_ALRMPEND) { 126671527Sjhb p->p_sflag &= ~PS_ALRMPEND; 126771527Sjhb mtx_exit(&sched_lock, MTX_SPIN); 126866716Sjhb if (!mtx_owned(&Giant)) 126966716Sjhb mtx_enter(&Giant, MTX_DEF); 127066716Sjhb psignal(p, SIGVTALRM); 127171527Sjhb mtx_enter(&sched_lock, MTX_SPIN); 127266716Sjhb } 127371527Sjhb if (p->p_sflag & PS_PROFPEND) { 127471527Sjhb p->p_sflag &= ~PS_PROFPEND; 127571527Sjhb mtx_exit(&sched_lock, MTX_SPIN); 127666716Sjhb if (!mtx_owned(&Giant)) 127766716Sjhb mtx_enter(&Giant, MTX_DEF); 127866716Sjhb psignal(p, SIGPROF); 127971527Sjhb } else 128071527Sjhb mtx_exit(&sched_lock, MTX_SPIN); 128171527Sjhb 128271527Sjhb userret(p, &frame, sticks); 128365557Sjasone 128471527Sjhb if (mtx_owned(&Giant)) 128565557Sjasone mtx_exit(&Giant, MTX_DEF); 128624691Speter} 1287