subr_trap.c revision 71665
1/*- 2 * Copyright (C) 1994, David Greenman 3 * Copyright (c) 1990, 1993 4 * The Regents of the University of California. All rights reserved. 5 * 6 * This code is derived from software contributed to Berkeley by 7 * the University of Utah, and William Jolitz. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 3. All advertising materials mentioning features or use of this software 18 * must display the following acknowledgement: 19 * This product includes software developed by the University of 20 * California, Berkeley and its contributors. 21 * 4. Neither the name of the University nor the names of its contributors 22 * may be used to endorse or promote products derived from this software 23 * without specific prior written permission. 24 * 25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 28 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 35 * SUCH DAMAGE. 36 * 37 * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 38 * $FreeBSD: head/sys/kern/subr_trap.c 71665 2001-01-26 04:16:16Z jake $ 39 */ 40 41/* 42 * 386 Trap and System call handling 43 */ 44 45#include "opt_clock.h" 46#include "opt_cpu.h" 47#include "opt_ddb.h" 48#include "opt_ktrace.h" 49#include "opt_npx.h" 50#include "opt_trap.h" 51 52#include <sys/param.h> 53#include <sys/bus.h> 54#include <sys/systm.h> 55#include <sys/proc.h> 56#include <sys/pioctl.h> 57#include <sys/ipl.h> 58#include <sys/kernel.h> 59#include <sys/ktr.h> 60#include <sys/mutex.h> 61#include <sys/resourcevar.h> 62#include <sys/signalvar.h> 63#include <sys/syscall.h> 64#include <sys/sysctl.h> 65#include <sys/sysent.h> 66#include <sys/uio.h> 67#include <sys/vmmeter.h> 68#ifdef KTRACE 69#include <sys/ktrace.h> 70#endif 71 72#include <vm/vm.h> 73#include <vm/vm_param.h> 74#include <sys/lock.h> 75#include <vm/pmap.h> 76#include <vm/vm_kern.h> 77#include <vm/vm_map.h> 78#include <vm/vm_page.h> 79#include <vm/vm_extern.h> 80 81#include <machine/cpu.h> 82#include <machine/md_var.h> 83#include <machine/pcb.h> 84#ifdef SMP 85#include <machine/smp.h> 86#endif 87#include <machine/tss.h> 88 89#include <i386/isa/icu.h> 90#include <i386/isa/intr_machdep.h> 91 92#ifdef POWERFAIL_NMI 93#include <sys/syslog.h> 94#include <machine/clock.h> 95#endif 96 97#include <machine/vm86.h> 98 99#include <ddb/ddb.h> 100 101#include "isa.h" 102 103#include <sys/sysctl.h> 104 105int (*pmath_emulate) 
__P((struct trapframe *)); 106 107extern void trap __P((struct trapframe frame)); 108extern int trapwrite __P((unsigned addr)); 109extern void syscall2 __P((struct trapframe frame)); 110extern void ast __P((struct trapframe frame)); 111 112static int trap_pfault __P((struct trapframe *, int, vm_offset_t)); 113static void trap_fatal __P((struct trapframe *, vm_offset_t)); 114void dblfault_handler __P((void)); 115 116extern inthand_t IDTVEC(syscall); 117 118#define MAX_TRAP_MSG 28 119static char *trap_msg[] = { 120 "", /* 0 unused */ 121 "privileged instruction fault", /* 1 T_PRIVINFLT */ 122 "", /* 2 unused */ 123 "breakpoint instruction fault", /* 3 T_BPTFLT */ 124 "", /* 4 unused */ 125 "", /* 5 unused */ 126 "arithmetic trap", /* 6 T_ARITHTRAP */ 127 "system forced exception", /* 7 T_ASTFLT */ 128 "", /* 8 unused */ 129 "general protection fault", /* 9 T_PROTFLT */ 130 "trace trap", /* 10 T_TRCTRAP */ 131 "", /* 11 unused */ 132 "page fault", /* 12 T_PAGEFLT */ 133 "", /* 13 unused */ 134 "alignment fault", /* 14 T_ALIGNFLT */ 135 "", /* 15 unused */ 136 "", /* 16 unused */ 137 "", /* 17 unused */ 138 "integer divide fault", /* 18 T_DIVIDE */ 139 "non-maskable interrupt trap", /* 19 T_NMI */ 140 "overflow trap", /* 20 T_OFLOW */ 141 "FPU bounds check fault", /* 21 T_BOUND */ 142 "FPU device not available", /* 22 T_DNA */ 143 "double fault", /* 23 T_DOUBLEFLT */ 144 "FPU operand fetch fault", /* 24 T_FPOPFLT */ 145 "invalid TSS fault", /* 25 T_TSSFLT */ 146 "segment not present fault", /* 26 T_SEGNPFLT */ 147 "stack fault", /* 27 T_STKFLT */ 148 "machine check trap", /* 28 T_MCHK */ 149}; 150 151#if defined(I586_CPU) && !defined(NO_F00F_HACK) 152extern int has_f00f_bug; 153#endif 154 155#ifdef DDB 156static int ddb_on_nmi = 1; 157SYSCTL_INT(_machdep, OID_AUTO, ddb_on_nmi, CTLFLAG_RW, 158 &ddb_on_nmi, 0, "Go to DDB on NMI"); 159#endif 160static int panic_on_nmi = 1; 161SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RW, 162 &panic_on_nmi, 0, "Panic on NMI"); 
163 164#ifdef WITNESS 165extern char *syscallnames[]; 166#endif 167 168void 169userret(p, frame, oticks) 170 struct proc *p; 171 struct trapframe *frame; 172 u_quad_t oticks; 173{ 174 int sig; 175 176 while ((sig = CURSIG(p)) != 0) { 177 if (!mtx_owned(&Giant)) 178 mtx_enter(&Giant, MTX_DEF); 179 postsig(sig); 180 } 181 182 mtx_enter(&sched_lock, MTX_SPIN); 183 p->p_priority = p->p_usrpri; 184 if (resched_wanted()) { 185 /* 186 * Since we are curproc, clock will normally just change 187 * our priority without moving us from one queue to another 188 * (since the running process is not on a queue.) 189 * If that happened after we setrunqueue ourselves but before we 190 * mi_switch()'ed, we might not be on the queue indicated by 191 * our priority. 192 */ 193 DROP_GIANT_NOSWITCH(); 194 setrunqueue(p); 195 p->p_stats->p_ru.ru_nivcsw++; 196 mi_switch(); 197 mtx_exit(&sched_lock, MTX_SPIN); 198 PICKUP_GIANT(); 199 while ((sig = CURSIG(p)) != 0) { 200 if (!mtx_owned(&Giant)) 201 mtx_enter(&Giant, MTX_DEF); 202 postsig(sig); 203 } 204 mtx_enter(&sched_lock, MTX_SPIN); 205 } 206 207 /* 208 * Charge system time if profiling. 209 */ 210 if (p->p_sflag & PS_PROFIL) { 211 mtx_exit(&sched_lock, MTX_SPIN); 212 /* XXX - do we need Giant? */ 213 if (!mtx_owned(&Giant)) 214 mtx_enter(&Giant, MTX_DEF); 215 mtx_enter(&sched_lock, MTX_SPIN); 216 addupc_task(p, frame->tf_eip, 217 (u_int)(p->p_sticks - oticks) * psratio); 218 } 219 curpriority = p->p_priority; 220 mtx_exit(&sched_lock, MTX_SPIN); 221} 222 223/* 224 * Exception, fault, and trap interface to the FreeBSD kernel. 225 * This common code is called from assembly language IDT gate entry 226 * routines that prepare a suitable stack frame, and restore this 227 * frame after the exception has been processed. 
228 */ 229 230void 231trap(frame) 232 struct trapframe frame; 233{ 234 struct proc *p = curproc; 235 u_quad_t sticks = 0; 236 int i = 0, ucode = 0, type, code; 237 vm_offset_t eva; 238#ifdef POWERFAIL_NMI 239 static int lastalert = 0; 240#endif 241 242 atomic_add_int(&cnt.v_trap, 1); 243 244 if ((frame.tf_eflags & PSL_I) == 0) { 245 /* 246 * Buggy application or kernel code has disabled 247 * interrupts and then trapped. Enabling interrupts 248 * now is wrong, but it is better than running with 249 * interrupts disabled until they are accidentally 250 * enabled later. XXX This is really bad if we trap 251 * while holding a spin lock. 252 */ 253 type = frame.tf_trapno; 254 if (ISPL(frame.tf_cs) == SEL_UPL || (frame.tf_eflags & PSL_VM)) 255 printf( 256 "pid %ld (%s): trap %d with interrupts disabled\n", 257 (long)curproc->p_pid, curproc->p_comm, type); 258 else if (type != T_BPTFLT && type != T_TRCTRAP) 259 /* 260 * XXX not quite right, since this may be for a 261 * multiple fault in user mode. 262 */ 263 printf("kernel trap %d with interrupts disabled\n", 264 type); 265 /* 266 * We should walk p_heldmtx here and see if any are 267 * spin mutexes, and not do this if so. 
268 */ 269 enable_intr(); 270 } 271 272 eva = 0; 273 274#if defined(I586_CPU) && !defined(NO_F00F_HACK) 275restart: 276#endif 277 278 type = frame.tf_trapno; 279 code = frame.tf_err; 280 281 if ((ISPL(frame.tf_cs) == SEL_UPL) || 282 ((frame.tf_eflags & PSL_VM) && !in_vm86call)) { 283 /* user trap */ 284 285 mtx_enter(&sched_lock, MTX_SPIN); 286 sticks = p->p_sticks; 287 mtx_exit(&sched_lock, MTX_SPIN); 288 p->p_md.md_regs = &frame; 289 290 switch (type) { 291 case T_PRIVINFLT: /* privileged instruction fault */ 292 ucode = type; 293 i = SIGILL; 294 break; 295 296 case T_BPTFLT: /* bpt instruction fault */ 297 case T_TRCTRAP: /* trace trap */ 298 frame.tf_eflags &= ~PSL_T; 299 i = SIGTRAP; 300 break; 301 302 case T_ARITHTRAP: /* arithmetic trap */ 303 ucode = code; 304 i = SIGFPE; 305 break; 306 307 /* 308 * The following two traps can happen in 309 * vm86 mode, and, if so, we want to handle 310 * them specially. 311 */ 312 case T_PROTFLT: /* general protection fault */ 313 case T_STKFLT: /* stack fault */ 314 if (frame.tf_eflags & PSL_VM) { 315 mtx_enter(&Giant, MTX_DEF); 316 i = vm86_emulate((struct vm86frame *)&frame); 317 mtx_exit(&Giant, MTX_DEF); 318 if (i == 0) 319 goto user; 320 break; 321 } 322 /* FALL THROUGH */ 323 324 case T_SEGNPFLT: /* segment not present fault */ 325 case T_TSSFLT: /* invalid TSS fault */ 326 case T_DOUBLEFLT: /* double fault */ 327 default: 328 ucode = code + BUS_SEGM_FAULT ; 329 i = SIGBUS; 330 break; 331 332 case T_PAGEFLT: /* page fault */ 333 /* 334 * For some Cyrix CPUs, %cr2 is clobbered by 335 * interrupts. This problem is worked around by using 336 * an interrupt gate for the pagefault handler. We 337 * are finally ready to read %cr2 and then must 338 * reenable interrupts. 
339 */ 340 eva = rcr2(); 341 enable_intr(); 342 mtx_enter(&Giant, MTX_DEF); 343 i = trap_pfault(&frame, TRUE, eva); 344 mtx_exit(&Giant, MTX_DEF); 345#if defined(I586_CPU) && !defined(NO_F00F_HACK) 346 if (i == -2) { 347 /* 348 * f00f hack workaround has triggered, treat 349 * as illegal instruction not page fault. 350 */ 351 frame.tf_trapno = T_PRIVINFLT; 352 goto restart; 353 } 354#endif 355 if (i == -1) 356 goto out; 357 if (i == 0) 358 goto user; 359 360 ucode = T_PAGEFLT; 361 break; 362 363 case T_DIVIDE: /* integer divide fault */ 364 ucode = FPE_INTDIV; 365 i = SIGFPE; 366 break; 367 368#if NISA > 0 369 case T_NMI: 370#ifdef POWERFAIL_NMI 371#ifndef TIMER_FREQ 372# define TIMER_FREQ 1193182 373#endif 374 mtx_enter(&Giant, MTX_DEF); 375 if (time_second - lastalert > 10) { 376 log(LOG_WARNING, "NMI: power fail\n"); 377 sysbeep(TIMER_FREQ/880, hz); 378 lastalert = time_second; 379 } 380 mtx_exit(&Giant, MTX_DEF); 381 goto out; 382#else /* !POWERFAIL_NMI */ 383 /* machine/parity/power fail/"kitchen sink" faults */ 384 /* XXX Giant */ 385 if (isa_nmi(code) == 0) { 386#ifdef DDB 387 /* 388 * NMI can be hooked up to a pushbutton 389 * for debugging. 390 */ 391 if (ddb_on_nmi) { 392 printf ("NMI ... 
going to debugger\n"); 393 kdb_trap (type, 0, &frame); 394 } 395#endif /* DDB */ 396 goto out; 397 } else if (panic_on_nmi) 398 panic("NMI indicates hardware failure"); 399 break; 400#endif /* POWERFAIL_NMI */ 401#endif /* NISA > 0 */ 402 403 case T_OFLOW: /* integer overflow fault */ 404 ucode = FPE_INTOVF; 405 i = SIGFPE; 406 break; 407 408 case T_BOUND: /* bounds check fault */ 409 ucode = FPE_FLTSUB; 410 i = SIGFPE; 411 break; 412 413 case T_DNA: 414#ifdef DEV_NPX 415 /* transparent fault (due to context switch "late") */ 416 if (npxdna()) 417 goto out; 418#endif 419 if (!pmath_emulate) { 420 i = SIGFPE; 421 ucode = FPE_FPU_NP_TRAP; 422 break; 423 } 424 mtx_enter(&Giant, MTX_DEF); 425 i = (*pmath_emulate)(&frame); 426 mtx_exit(&Giant, MTX_DEF); 427 if (i == 0) { 428 if (!(frame.tf_eflags & PSL_T)) 429 goto out; 430 frame.tf_eflags &= ~PSL_T; 431 i = SIGTRAP; 432 } 433 /* else ucode = emulator_only_knows() XXX */ 434 break; 435 436 case T_FPOPFLT: /* FPU operand fetch fault */ 437 ucode = T_FPOPFLT; 438 i = SIGILL; 439 break; 440 } 441 } else { 442 /* kernel trap */ 443 444 switch (type) { 445 case T_PAGEFLT: /* page fault */ 446 /* 447 * For some Cyrix CPUs, %cr2 is clobbered by 448 * interrupts. This problem is worked around by using 449 * an interrupt gate for the pagefault handler. We 450 * are finally ready to read %cr2 and then must 451 * reenable interrupts. 452 */ 453 eva = rcr2(); 454 enable_intr(); 455 mtx_enter(&Giant, MTX_DEF); 456 (void) trap_pfault(&frame, FALSE, eva); 457 mtx_exit(&Giant, MTX_DEF); 458 goto out; 459 460 case T_DNA: 461#ifdef DEV_NPX 462 /* 463 * The kernel is apparently using npx for copying. 464 * XXX this should be fatal unless the kernel has 465 * registered such use. 466 */ 467 if (npxdna()) 468 goto out; 469#endif 470 break; 471 472 /* 473 * The following two traps can happen in 474 * vm86 mode, and, if so, we want to handle 475 * them specially. 
476 */ 477 case T_PROTFLT: /* general protection fault */ 478 case T_STKFLT: /* stack fault */ 479 if (frame.tf_eflags & PSL_VM) { 480 mtx_enter(&Giant, MTX_DEF); 481 i = vm86_emulate((struct vm86frame *)&frame); 482 mtx_exit(&Giant, MTX_DEF); 483 if (i != 0) 484 /* 485 * returns to original process 486 */ 487 vm86_trap((struct vm86frame *)&frame); 488 goto out; 489 } 490 if (type == T_STKFLT) 491 break; 492 493 /* FALL THROUGH */ 494 495 case T_SEGNPFLT: /* segment not present fault */ 496 if (in_vm86call) 497 break; 498 499 if (p->p_intr_nesting_level != 0) 500 break; 501 502 /* 503 * Invalid %fs's and %gs's can be created using 504 * procfs or PT_SETREGS or by invalidating the 505 * underlying LDT entry. This causes a fault 506 * in kernel mode when the kernel attempts to 507 * switch contexts. Lose the bad context 508 * (XXX) so that we can continue, and generate 509 * a signal. 510 */ 511 if (frame.tf_eip == (int)cpu_switch_load_gs) { 512 PCPU_GET(curpcb)->pcb_gs = 0; 513 mtx_enter(&Giant, MTX_DEF); 514 psignal(p, SIGBUS); 515 mtx_exit(&Giant, MTX_DEF); 516 goto out; 517 } 518 519 /* 520 * Invalid segment selectors and out of bounds 521 * %eip's and %esp's can be set up in user mode. 522 * This causes a fault in kernel mode when the 523 * kernel tries to return to user mode. We want 524 * to get this fault so that we can fix the 525 * problem here and not have to check all the 526 * selectors and pointers when the user changes 527 * them. 
528 */ 529 if (frame.tf_eip == (int)doreti_iret) { 530 frame.tf_eip = (int)doreti_iret_fault; 531 goto out; 532 } 533 if (frame.tf_eip == (int)doreti_popl_ds) { 534 frame.tf_eip = (int)doreti_popl_ds_fault; 535 goto out; 536 } 537 if (frame.tf_eip == (int)doreti_popl_es) { 538 frame.tf_eip = (int)doreti_popl_es_fault; 539 goto out; 540 } 541 if (frame.tf_eip == (int)doreti_popl_fs) { 542 frame.tf_eip = (int)doreti_popl_fs_fault; 543 goto out; 544 } 545 if (PCPU_GET(curpcb) != NULL && 546 PCPU_GET(curpcb)->pcb_onfault != NULL) { 547 frame.tf_eip = 548 (int)PCPU_GET(curpcb)->pcb_onfault; 549 goto out; 550 } 551 break; 552 553 case T_TSSFLT: 554 /* 555 * PSL_NT can be set in user mode and isn't cleared 556 * automatically when the kernel is entered. This 557 * causes a TSS fault when the kernel attempts to 558 * `iret' because the TSS link is uninitialized. We 559 * want to get this fault so that we can fix the 560 * problem here and not every time the kernel is 561 * entered. 562 */ 563 if (frame.tf_eflags & PSL_NT) { 564 frame.tf_eflags &= ~PSL_NT; 565 goto out; 566 } 567 break; 568 569 case T_TRCTRAP: /* trace trap */ 570 if (frame.tf_eip == (int)IDTVEC(syscall)) { 571 /* 572 * We've just entered system mode via the 573 * syscall lcall. Continue single stepping 574 * silently until the syscall handler has 575 * saved the flags. 576 */ 577 goto out; 578 } 579 if (frame.tf_eip == (int)IDTVEC(syscall) + 1) { 580 /* 581 * The syscall handler has now saved the 582 * flags. Stop single stepping it. 583 */ 584 frame.tf_eflags &= ~PSL_T; 585 goto out; 586 } 587 /* 588 * Ignore debug register trace traps due to 589 * accesses in the user's address space, which 590 * can happen under several conditions such as 591 * if a user sets a watchpoint on a buffer and 592 * then passes that buffer to a system call. 593 * We still want to get TRCTRAPS for addresses 594 * in kernel space because that is useful when 595 * debugging the kernel. 
596 */ 597 /* XXX Giant */ 598 if (user_dbreg_trap() && !in_vm86call) { 599 /* 600 * Reset breakpoint bits because the 601 * processor doesn't 602 */ 603 load_dr6(rdr6() & 0xfffffff0); 604 goto out; 605 } 606 /* 607 * Fall through (TRCTRAP kernel mode, kernel address) 608 */ 609 case T_BPTFLT: 610 /* 611 * If DDB is enabled, let it handle the debugger trap. 612 * Otherwise, debugger traps "can't happen". 613 */ 614#ifdef DDB 615 /* XXX Giant */ 616 if (kdb_trap (type, 0, &frame)) 617 goto out; 618#endif 619 break; 620 621#if NISA > 0 622 case T_NMI: 623#ifdef POWERFAIL_NMI 624 mtx_enter(&Giant, MTX_DEF); 625 if (time_second - lastalert > 10) { 626 log(LOG_WARNING, "NMI: power fail\n"); 627 sysbeep(TIMER_FREQ/880, hz); 628 lastalert = time_second; 629 } 630 mtx_exit(&Giant, MTX_DEF); 631 goto out; 632#else /* !POWERFAIL_NMI */ 633 /* XXX Giant */ 634 /* machine/parity/power fail/"kitchen sink" faults */ 635 if (isa_nmi(code) == 0) { 636#ifdef DDB 637 /* 638 * NMI can be hooked up to a pushbutton 639 * for debugging. 640 */ 641 if (ddb_on_nmi) { 642 printf ("NMI ... going to debugger\n"); 643 kdb_trap (type, 0, &frame); 644 } 645#endif /* DDB */ 646 goto out; 647 } else if (panic_on_nmi == 0) 648 goto out; 649 /* FALL THROUGH */ 650#endif /* POWERFAIL_NMI */ 651#endif /* NISA > 0 */ 652 } 653 654 mtx_enter(&Giant, MTX_DEF); 655 trap_fatal(&frame, eva); 656 mtx_exit(&Giant, MTX_DEF); 657 goto out; 658 } 659 660 mtx_enter(&Giant, MTX_DEF); 661 /* Translate fault for emulators (e.g. 
Linux) */ 662 if (*p->p_sysent->sv_transtrap) 663 i = (*p->p_sysent->sv_transtrap)(i, type); 664 665 trapsignal(p, i, ucode); 666 667#ifdef DEBUG 668 if (type <= MAX_TRAP_MSG) { 669 uprintf("fatal process exception: %s", 670 trap_msg[type]); 671 if ((type == T_PAGEFLT) || (type == T_PROTFLT)) 672 uprintf(", fault VA = 0x%lx", (u_long)eva); 673 uprintf("\n"); 674 } 675#endif 676 mtx_exit(&Giant, MTX_DEF); 677 678user: 679 userret(p, &frame, sticks); 680 if (mtx_owned(&Giant)) 681 mtx_exit(&Giant, MTX_DEF); 682out: 683 return; 684} 685 686#ifdef notyet 687/* 688 * This version doesn't allow a page fault to user space while 689 * in the kernel. The rest of the kernel needs to be made "safe" 690 * before this can be used. I think the only things remaining 691 * to be made safe are the iBCS2 code and the process tracing/ 692 * debugging code. 693 */ 694static int 695trap_pfault(frame, usermode, eva) 696 struct trapframe *frame; 697 int usermode; 698 vm_offset_t eva; 699{ 700 vm_offset_t va; 701 struct vmspace *vm = NULL; 702 vm_map_t map = 0; 703 int rv = 0; 704 vm_prot_t ftype; 705 struct proc *p = curproc; 706 707 if (frame->tf_err & PGEX_W) 708 ftype = VM_PROT_WRITE; 709 else 710 ftype = VM_PROT_READ; 711 712 va = trunc_page(eva); 713 if (va < VM_MIN_KERNEL_ADDRESS) { 714 vm_offset_t v; 715 vm_page_t mpte; 716 717 if (p == NULL || 718 (!usermode && va < VM_MAXUSER_ADDRESS && 719 (p->p_intr_nesting_level != 0 || 720 PCPU_GET(curpcb) == NULL || 721 PCPU_GET(curpcb)->pcb_onfault == NULL))) { 722 trap_fatal(frame, eva); 723 return (-1); 724 } 725 726 /* 727 * This is a fault on non-kernel virtual memory. 728 * vm is initialized above to NULL. If curproc is NULL 729 * or curproc->p_vmspace is NULL the fault is fatal. 730 */ 731 vm = p->p_vmspace; 732 if (vm == NULL) 733 goto nogo; 734 735 map = &vm->vm_map; 736 737 /* 738 * Keep swapout from messing with us during this 739 * critical time. 
740 */ 741 PROC_LOCK(p); 742 ++p->p_lock; 743 PROC_UNLOCK(p); 744 745 /* 746 * Grow the stack if necessary 747 */ 748 /* grow_stack returns false only if va falls into 749 * a growable stack region and the stack growth 750 * fails. It returns true if va was not within 751 * a growable stack region, or if the stack 752 * growth succeeded. 753 */ 754 if (!grow_stack (p, va)) { 755 rv = KERN_FAILURE; 756 PROC_LOCK(p); 757 --p->p_lock; 758 PROC_UNLOCK(p); 759 goto nogo; 760 } 761 762 /* Fault in the user page: */ 763 rv = vm_fault(map, va, ftype, 764 (ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY 765 : VM_FAULT_NORMAL); 766 767 PROC_LOCK(p); 768 --p->p_lock; 769 PROC_UNLOCK(p); 770 } else { 771 /* 772 * Don't allow user-mode faults in kernel address space. 773 */ 774 if (usermode) 775 goto nogo; 776 777 /* 778 * Since we know that kernel virtual address addresses 779 * always have pte pages mapped, we just have to fault 780 * the page. 781 */ 782 rv = vm_fault(kernel_map, va, ftype, VM_FAULT_NORMAL); 783 } 784 785 if (rv == KERN_SUCCESS) 786 return (0); 787nogo: 788 if (!usermode) { 789 if (p->p_intr_nesting_level == 0 && 790 PCPU_GET(curpcb) != NULL && 791 PCPU_GET(curpcb)->pcb_onfault != NULL) { 792 frame->tf_eip = (int)PCPU_GET(curpcb)->pcb_onfault; 793 return (0); 794 } 795 trap_fatal(frame, eva); 796 return (-1); 797 } 798 799 /* kludge to pass faulting virtual address to sendsig */ 800 frame->tf_err = eva; 801 802 return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV); 803} 804#endif 805 806int 807trap_pfault(frame, usermode, eva) 808 struct trapframe *frame; 809 int usermode; 810 vm_offset_t eva; 811{ 812 vm_offset_t va; 813 struct vmspace *vm = NULL; 814 vm_map_t map = 0; 815 int rv = 0; 816 vm_prot_t ftype; 817 struct proc *p = curproc; 818 819 va = trunc_page(eva); 820 if (va >= KERNBASE) { 821 /* 822 * Don't allow user-mode faults in kernel address space. 
823 * An exception: if the faulting address is the invalid 824 * instruction entry in the IDT, then the Intel Pentium 825 * F00F bug workaround was triggered, and we need to 826 * treat it is as an illegal instruction, and not a page 827 * fault. 828 */ 829#if defined(I586_CPU) && !defined(NO_F00F_HACK) 830 if ((eva == (unsigned int)&idt[6]) && has_f00f_bug) 831 return -2; 832#endif 833 if (usermode) 834 goto nogo; 835 836 map = kernel_map; 837 } else { 838 /* 839 * This is a fault on non-kernel virtual memory. 840 * vm is initialized above to NULL. If curproc is NULL 841 * or curproc->p_vmspace is NULL the fault is fatal. 842 */ 843 if (p != NULL) 844 vm = p->p_vmspace; 845 846 if (vm == NULL) 847 goto nogo; 848 849 map = &vm->vm_map; 850 } 851 852 if (frame->tf_err & PGEX_W) 853 ftype = VM_PROT_WRITE; 854 else 855 ftype = VM_PROT_READ; 856 857 if (map != kernel_map) { 858 /* 859 * Keep swapout from messing with us during this 860 * critical time. 861 */ 862 PROC_LOCK(p); 863 ++p->p_lock; 864 PROC_UNLOCK(p); 865 866 /* 867 * Grow the stack if necessary 868 */ 869 /* grow_stack returns false only if va falls into 870 * a growable stack region and the stack growth 871 * fails. It returns true if va was not within 872 * a growable stack region, or if the stack 873 * growth succeeded. 874 */ 875 if (!grow_stack (p, va)) { 876 rv = KERN_FAILURE; 877 PROC_LOCK(p); 878 --p->p_lock; 879 PROC_UNLOCK(p); 880 goto nogo; 881 } 882 883 /* Fault in the user page: */ 884 rv = vm_fault(map, va, ftype, 885 (ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY 886 : VM_FAULT_NORMAL); 887 888 PROC_LOCK(p); 889 --p->p_lock; 890 PROC_UNLOCK(p); 891 } else { 892 /* 893 * Don't have to worry about process locking or stacks in the 894 * kernel. 
895 */ 896 rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL); 897 } 898 899 if (rv == KERN_SUCCESS) 900 return (0); 901nogo: 902 if (!usermode) { 903 if (p->p_intr_nesting_level == 0 && 904 PCPU_GET(curpcb) != NULL && 905 PCPU_GET(curpcb)->pcb_onfault != NULL) { 906 frame->tf_eip = (int)PCPU_GET(curpcb)->pcb_onfault; 907 return (0); 908 } 909 trap_fatal(frame, eva); 910 return (-1); 911 } 912 913 /* kludge to pass faulting virtual address to sendsig */ 914 frame->tf_err = eva; 915 916 return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV); 917} 918 919static void 920trap_fatal(frame, eva) 921 struct trapframe *frame; 922 vm_offset_t eva; 923{ 924 int code, type, ss, esp; 925 struct soft_segment_descriptor softseg; 926 927 code = frame->tf_err; 928 type = frame->tf_trapno; 929 sdtossd(&gdt[IDXSEL(frame->tf_cs & 0xffff)].sd, &softseg); 930 931 if (type <= MAX_TRAP_MSG) 932 printf("\n\nFatal trap %d: %s while in %s mode\n", 933 type, trap_msg[type], 934 frame->tf_eflags & PSL_VM ? "vm86" : 935 ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel"); 936#ifdef SMP 937 /* two seperate prints in case of a trap on an unmapped page */ 938 printf("cpuid = %d; ", PCPU_GET(cpuid)); 939 printf("lapic.id = %08x\n", lapic.id); 940#endif 941 if (type == T_PAGEFLT) { 942 printf("fault virtual address = 0x%x\n", eva); 943 printf("fault code = %s %s, %s\n", 944 code & PGEX_U ? "user" : "supervisor", 945 code & PGEX_W ? "write" : "read", 946 code & PGEX_P ? 
"protection violation" : "page not present"); 947 } 948 printf("instruction pointer = 0x%x:0x%x\n", 949 frame->tf_cs & 0xffff, frame->tf_eip); 950 if ((ISPL(frame->tf_cs) == SEL_UPL) || (frame->tf_eflags & PSL_VM)) { 951 ss = frame->tf_ss & 0xffff; 952 esp = frame->tf_esp; 953 } else { 954 ss = GSEL(GDATA_SEL, SEL_KPL); 955 esp = (int)&frame->tf_esp; 956 } 957 printf("stack pointer = 0x%x:0x%x\n", ss, esp); 958 printf("frame pointer = 0x%x:0x%x\n", ss, frame->tf_ebp); 959 printf("code segment = base 0x%x, limit 0x%x, type 0x%x\n", 960 softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type); 961 printf(" = DPL %d, pres %d, def32 %d, gran %d\n", 962 softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_def32, 963 softseg.ssd_gran); 964 printf("processor eflags = "); 965 if (frame->tf_eflags & PSL_T) 966 printf("trace trap, "); 967 if (frame->tf_eflags & PSL_I) 968 printf("interrupt enabled, "); 969 if (frame->tf_eflags & PSL_NT) 970 printf("nested task, "); 971 if (frame->tf_eflags & PSL_RF) 972 printf("resume, "); 973 if (frame->tf_eflags & PSL_VM) 974 printf("vm86, "); 975 printf("IOPL = %d\n", (frame->tf_eflags & PSL_IOPL) >> 12); 976 printf("current process = "); 977 if (curproc) { 978 printf("%lu (%s)\n", 979 (u_long)curproc->p_pid, curproc->p_comm ? 980 curproc->p_comm : ""); 981 } else { 982 printf("Idle\n"); 983 } 984 985#ifdef KDB 986 if (kdb_trap(&psl)) 987 return; 988#endif 989#ifdef DDB 990 if ((debugger_on_panic || db_active) && kdb_trap(type, 0, frame)) 991 return; 992#endif 993 printf("trap number = %d\n", type); 994 if (type <= MAX_TRAP_MSG) 995 panic(trap_msg[type]); 996 else 997 panic("unknown/reserved trap"); 998} 999 1000/* 1001 * Double fault handler. Called when a fault occurs while writing 1002 * a frame for a trap/exception onto the stack. This usually occurs 1003 * when the stack overflows (such is the case with infinite recursion, 1004 * for example). 
1005 * 1006 * XXX Note that the current PTD gets replaced by IdlePTD when the 1007 * task switch occurs. This means that the stack that was active at 1008 * the time of the double fault is not available at <kstack> unless 1009 * the machine was idle when the double fault occurred. The downside 1010 * of this is that "trace <ebp>" in ddb won't work. 1011 */ 1012void 1013dblfault_handler() 1014{ 1015 printf("\nFatal double fault:\n"); 1016 printf("eip = 0x%x\n", PCPU_GET(common_tss.tss_eip)); 1017 printf("esp = 0x%x\n", PCPU_GET(common_tss.tss_esp)); 1018 printf("ebp = 0x%x\n", PCPU_GET(common_tss.tss_ebp)); 1019#ifdef SMP 1020 /* two seperate prints in case of a trap on an unmapped page */ 1021 printf("cpuid = %d; ", PCPU_GET(cpuid)); 1022 printf("lapic.id = %08x\n", lapic.id); 1023#endif 1024 panic("double fault"); 1025} 1026 1027/* 1028 * Compensate for 386 brain damage (missing URKR). 1029 * This is a little simpler than the pagefault handler in trap() because 1030 * it the page tables have already been faulted in and high addresses 1031 * are thrown out early for other reasons. 1032 */ 1033int trapwrite(addr) 1034 unsigned addr; 1035{ 1036 struct proc *p; 1037 vm_offset_t va; 1038 struct vmspace *vm; 1039 int rv; 1040 1041 va = trunc_page((vm_offset_t)addr); 1042 /* 1043 * XXX - MAX is END. Changed > to >= for temp. fix. 
1044 */ 1045 if (va >= VM_MAXUSER_ADDRESS) 1046 return (1); 1047 1048 p = curproc; 1049 vm = p->p_vmspace; 1050 1051 PROC_LOCK(p); 1052 ++p->p_lock; 1053 PROC_UNLOCK(p); 1054 1055 if (!grow_stack (p, va)) { 1056 PROC_LOCK(p); 1057 --p->p_lock; 1058 PROC_UNLOCK(p); 1059 return (1); 1060 } 1061 1062 /* 1063 * fault the data page 1064 */ 1065 rv = vm_fault(&vm->vm_map, va, VM_PROT_WRITE, VM_FAULT_DIRTY); 1066 1067 PROC_LOCK(p); 1068 --p->p_lock; 1069 PROC_UNLOCK(p); 1070 1071 if (rv != KERN_SUCCESS) 1072 return 1; 1073 1074 return (0); 1075} 1076 1077/* 1078 * syscall2 - MP aware system call request C handler 1079 * 1080 * A system call is essentially treated as a trap except that the 1081 * MP lock is not held on entry or return. We are responsible for 1082 * obtaining the MP lock if necessary and for handling ASTs 1083 * (e.g. a task switch) prior to return. 1084 * 1085 * In general, only simple access and manipulation of curproc and 1086 * the current stack is allowed without having to hold MP lock. 1087 */ 1088void 1089syscall2(frame) 1090 struct trapframe frame; 1091{ 1092 caddr_t params; 1093 int i; 1094 struct sysent *callp; 1095 struct proc *p = curproc; 1096 u_quad_t sticks; 1097 int error; 1098 int narg; 1099 int args[8]; 1100 u_int code; 1101 1102 atomic_add_int(&cnt.v_syscall, 1); 1103 1104#ifdef DIAGNOSTIC 1105 if (ISPL(frame.tf_cs) != SEL_UPL) { 1106 mtx_enter(&Giant, MTX_DEF); 1107 panic("syscall"); 1108 /* NOT REACHED */ 1109 } 1110#endif 1111 1112 mtx_enter(&sched_lock, MTX_SPIN); 1113 sticks = p->p_sticks; 1114 mtx_exit(&sched_lock, MTX_SPIN); 1115 1116 p->p_md.md_regs = &frame; 1117 params = (caddr_t)frame.tf_esp + sizeof(int); 1118 code = frame.tf_eax; 1119 1120 if (p->p_sysent->sv_prepsyscall) { 1121 /* 1122 * The prep code is not MP aware. 
1123 */ 1124 mtx_enter(&Giant, MTX_DEF); 1125 (*p->p_sysent->sv_prepsyscall)(&frame, args, &code, ¶ms); 1126 mtx_exit(&Giant, MTX_DEF); 1127 } else { 1128 /* 1129 * Need to check if this is a 32 bit or 64 bit syscall. 1130 * fuword is MP aware. 1131 */ 1132 if (code == SYS_syscall) { 1133 /* 1134 * Code is first argument, followed by actual args. 1135 */ 1136 code = fuword(params); 1137 params += sizeof(int); 1138 } else if (code == SYS___syscall) { 1139 /* 1140 * Like syscall, but code is a quad, so as to maintain 1141 * quad alignment for the rest of the arguments. 1142 */ 1143 code = fuword(params); 1144 params += sizeof(quad_t); 1145 } 1146 } 1147 1148 if (p->p_sysent->sv_mask) 1149 code &= p->p_sysent->sv_mask; 1150 1151 if (code >= p->p_sysent->sv_size) 1152 callp = &p->p_sysent->sv_table[0]; 1153 else 1154 callp = &p->p_sysent->sv_table[code]; 1155 1156 narg = callp->sy_narg & SYF_ARGMASK; 1157 1158 /* 1159 * copyin is MP aware, but the tracing code is not 1160 */ 1161 if (params && (i = narg * sizeof(int)) && 1162 (error = copyin(params, (caddr_t)args, (u_int)i))) { 1163 mtx_enter(&Giant, MTX_DEF); 1164#ifdef KTRACE 1165 if (KTRPOINT(p, KTR_SYSCALL)) 1166 ktrsyscall(p->p_tracep, code, narg, args); 1167#endif 1168 goto bad; 1169 } 1170 1171 /* 1172 * Try to run the syscall without the MP lock if the syscall 1173 * is MP safe. 
We have to obtain the MP lock no matter what if 1174 * we are ktracing 1175 */ 1176 if ((callp->sy_narg & SYF_MPSAFE) == 0) { 1177 mtx_enter(&Giant, MTX_DEF); 1178 } 1179 1180#ifdef KTRACE 1181 if (KTRPOINT(p, KTR_SYSCALL)) { 1182 if (!mtx_owned(&Giant)) 1183 mtx_enter(&Giant, MTX_DEF); 1184 ktrsyscall(p->p_tracep, code, narg, args); 1185 } 1186#endif 1187 p->p_retval[0] = 0; 1188 p->p_retval[1] = frame.tf_edx; 1189 1190 STOPEVENT(p, S_SCE, narg); /* MP aware */ 1191 1192 error = (*callp->sy_call)(p, args); 1193 1194 /* 1195 * MP SAFE (we may or may not have the MP lock at this point) 1196 */ 1197 switch (error) { 1198 case 0: 1199 frame.tf_eax = p->p_retval[0]; 1200 frame.tf_edx = p->p_retval[1]; 1201 frame.tf_eflags &= ~PSL_C; 1202 break; 1203 1204 case ERESTART: 1205 /* 1206 * Reconstruct pc, assuming lcall $X,y is 7 bytes, 1207 * int 0x80 is 2 bytes. We saved this in tf_err. 1208 */ 1209 frame.tf_eip -= frame.tf_err; 1210 break; 1211 1212 case EJUSTRETURN: 1213 break; 1214 1215 default: 1216bad: 1217 if (p->p_sysent->sv_errsize) { 1218 if (error >= p->p_sysent->sv_errsize) 1219 error = -1; /* XXX */ 1220 else 1221 error = p->p_sysent->sv_errtbl[error]; 1222 } 1223 frame.tf_eax = error; 1224 frame.tf_eflags |= PSL_C; 1225 break; 1226 } 1227 1228 /* 1229 * Traced syscall. trapsignal() is not MP aware. 1230 */ 1231 if ((frame.tf_eflags & PSL_T) && !(frame.tf_eflags & PSL_VM)) { 1232 if (!mtx_owned(&Giant)) 1233 mtx_enter(&Giant, MTX_DEF); 1234 frame.tf_eflags &= ~PSL_T; 1235 trapsignal(p, SIGTRAP, 0); 1236 } 1237 1238 /* 1239 * Handle reschedule and other end-of-syscall issues 1240 */ 1241 userret(p, &frame, sticks); 1242 1243#ifdef KTRACE 1244 if (KTRPOINT(p, KTR_SYSRET)) { 1245 if (!mtx_owned(&Giant)) 1246 mtx_enter(&Giant, MTX_DEF); 1247 ktrsysret(p->p_tracep, code, error, p->p_retval[0]); 1248 } 1249#endif 1250 1251 /* 1252 * This works because errno is findable through the 1253 * register set. 
If we ever support an emulation where this 1254 * is not the case, this code will need to be revisited. 1255 */ 1256 STOPEVENT(p, S_SCX, code); 1257 1258 /* 1259 * Release Giant if we had to get it 1260 */ 1261 if (mtx_owned(&Giant)) 1262 mtx_exit(&Giant, MTX_DEF); 1263 1264#ifdef WITNESS 1265 if (witness_list(p)) { 1266 panic("system call %s returning with mutex(s) held\n", 1267 syscallnames[code]); 1268 } 1269#endif 1270 mtx_assert(&sched_lock, MA_NOTOWNED); 1271 mtx_assert(&Giant, MA_NOTOWNED); 1272} 1273 1274void 1275ast(frame) 1276 struct trapframe frame; 1277{ 1278 struct proc *p = CURPROC; 1279 u_quad_t sticks; 1280 1281 mtx_enter(&sched_lock, MTX_SPIN); 1282 sticks = p->p_sticks; 1283 1284 astoff(); 1285 atomic_add_int(&cnt.v_soft, 1); 1286 if (p->p_sflag & PS_OWEUPC) { 1287 p->p_sflag &= ~PS_OWEUPC; 1288 mtx_exit(&sched_lock, MTX_SPIN); 1289 mtx_enter(&Giant, MTX_DEF); 1290 mtx_enter(&sched_lock, MTX_SPIN); 1291 addupc_task(p, p->p_stats->p_prof.pr_addr, 1292 p->p_stats->p_prof.pr_ticks); 1293 } 1294 if (p->p_sflag & PS_ALRMPEND) { 1295 p->p_sflag &= ~PS_ALRMPEND; 1296 mtx_exit(&sched_lock, MTX_SPIN); 1297 if (!mtx_owned(&Giant)) 1298 mtx_enter(&Giant, MTX_DEF); 1299 psignal(p, SIGVTALRM); 1300 mtx_enter(&sched_lock, MTX_SPIN); 1301 } 1302 if (p->p_sflag & PS_PROFPEND) { 1303 p->p_sflag &= ~PS_PROFPEND; 1304 mtx_exit(&sched_lock, MTX_SPIN); 1305 if (!mtx_owned(&Giant)) 1306 mtx_enter(&Giant, MTX_DEF); 1307 psignal(p, SIGPROF); 1308 } else 1309 mtx_exit(&sched_lock, MTX_SPIN); 1310 1311 userret(p, &frame, sticks); 1312 1313 if (mtx_owned(&Giant)) 1314 mtx_exit(&Giant, MTX_DEF); 1315} 1316