subr_syscall.c revision 76650
1/*- 2 * Copyright (C) 1994, David Greenman 3 * Copyright (c) 1990, 1993 4 * The Regents of the University of California. All rights reserved. 5 * 6 * This code is derived from software contributed to Berkeley by 7 * the University of Utah, and William Jolitz. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 3. All advertising materials mentioning features or use of this software 18 * must display the following acknowledgement: 19 * This product includes software developed by the University of 20 * California, Berkeley and its contributors. 21 * 4. Neither the name of the University nor the names of its contributors 22 * may be used to endorse or promote products derived from this software 23 * without specific prior written permission. 24 * 25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 28 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 35 * SUCH DAMAGE. 36 * 37 * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 38 * $FreeBSD: head/sys/kern/subr_trap.c 76650 2001-05-15 23:22:29Z jhb $ 39 */ 40 41/* 42 * 386 Trap and System call handling 43 */ 44 45#include "opt_clock.h" 46#include "opt_cpu.h" 47#include "opt_ddb.h" 48#include "opt_isa.h" 49#include "opt_ktrace.h" 50#include "opt_npx.h" 51#include "opt_trap.h" 52 53#include <sys/param.h> 54#include <sys/bus.h> 55#include <sys/systm.h> 56#include <sys/proc.h> 57#include <sys/pioctl.h> 58#include <sys/kernel.h> 59#include <sys/ktr.h> 60#include <sys/mutex.h> 61#include <sys/resourcevar.h> 62#include <sys/signalvar.h> 63#include <sys/syscall.h> 64#include <sys/sysctl.h> 65#include <sys/sysent.h> 66#include <sys/uio.h> 67#include <sys/vmmeter.h> 68#ifdef KTRACE 69#include <sys/ktrace.h> 70#endif 71 72#include <vm/vm.h> 73#include <vm/vm_param.h> 74#include <sys/lock.h> 75#include <vm/pmap.h> 76#include <vm/vm_kern.h> 77#include <vm/vm_map.h> 78#include <vm/vm_page.h> 79#include <vm/vm_extern.h> 80 81#include <machine/cpu.h> 82#include <machine/md_var.h> 83#include <machine/pcb.h> 84#ifdef SMP 85#include <machine/smp.h> 86#endif 87#include <machine/tss.h> 88 89#include <i386/isa/icu.h> 90#include <i386/isa/intr_machdep.h> 91 92#ifdef POWERFAIL_NMI 93#include <sys/syslog.h> 94#include <machine/clock.h> 95#endif 96 97#include <machine/vm86.h> 98 99#include <ddb/ddb.h> 100 101#include <sys/sysctl.h> 102 103int (*pmath_emulate) __P((struct trapframe 
*));

/* Entry points reached from the assembly IDT/exception stubs (exception.s). */
extern void trap __P((struct trapframe frame));
extern int trapwrite __P((unsigned addr));
extern void syscall __P((struct trapframe frame));
extern void ast __P((struct trapframe *framep));

static int trap_pfault __P((struct trapframe *, int, vm_offset_t));
static void trap_fatal __P((struct trapframe *, vm_offset_t));
void dblfault_handler __P((void));

extern inthand_t IDTVEC(lcall_syscall);

/*
 * Human-readable names for the i386 trap numbers, indexed by T_* value.
 * Used by trap_fatal() and the DEBUG uprintf in trap().  MAX_TRAP_MSG must
 * stay in sync with the last entry of this table.
 */
#define MAX_TRAP_MSG		28
static char *trap_msg[] = {
	"",					/*  0 unused */
	"privileged instruction fault",		/*  1 T_PRIVINFLT */
	"",					/*  2 unused */
	"breakpoint instruction fault",		/*  3 T_BPTFLT */
	"",					/*  4 unused */
	"",					/*  5 unused */
	"arithmetic trap",			/*  6 T_ARITHTRAP */
	"",					/*  7 unused */
	"",					/*  8 unused */
	"general protection fault",		/*  9 T_PROTFLT */
	"trace trap",				/* 10 T_TRCTRAP */
	"",					/* 11 unused */
	"page fault",				/* 12 T_PAGEFLT */
	"",					/* 13 unused */
	"alignment fault",			/* 14 T_ALIGNFLT */
	"",					/* 15 unused */
	"",					/* 16 unused */
	"",					/* 17 unused */
	"integer divide fault",			/* 18 T_DIVIDE */
	"non-maskable interrupt trap",		/* 19 T_NMI */
	"overflow trap",			/* 20 T_OFLOW */
	"FPU bounds check fault",		/* 21 T_BOUND */
	"FPU device not available",		/* 22 T_DNA */
	"double fault",				/* 23 T_DOUBLEFLT */
	"FPU operand fetch fault",		/* 24 T_FPOPFLT */
	"invalid TSS fault",			/* 25 T_TSSFLT */
	"segment not present fault",		/* 26 T_SEGNPFLT */
	"stack fault",				/* 27 T_STKFLT */
	"machine check trap",			/* 28 T_MCHK */
};

#if defined(I586_CPU) && !defined(NO_F00F_HACK)
extern int has_f00f_bug;
#endif

/* Runtime-tunable NMI policy, exported via the machdep sysctl tree. */
#ifdef DDB
static int ddb_on_nmi = 1;
SYSCTL_INT(_machdep, OID_AUTO, ddb_on_nmi, CTLFLAG_RW,
	&ddb_on_nmi, 0, "Go to DDB on NMI");
#endif
static int panic_on_nmi = 1;
SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RW,
	&panic_on_nmi, 0, "Panic on NMI");

#ifdef WITNESS
extern char
*syscallnames[]; 164#endif 165 166void 167userret(p, frame, oticks) 168 struct proc *p; 169 struct trapframe *frame; 170 u_quad_t oticks; 171{ 172 int sig; 173 174 while ((sig = CURSIG(p)) != 0) 175 postsig(sig); 176 177 mtx_lock_spin(&sched_lock); 178 p->p_pri.pri_level = p->p_pri.pri_user; 179 if (resched_wanted(p)) { 180 /* 181 * Since we are curproc, clock will normally just change 182 * our priority without moving us from one queue to another 183 * (since the running process is not on a queue.) 184 * If that happened after we setrunqueue ourselves but before we 185 * mi_switch()'ed, we might not be on the queue indicated by 186 * our priority. 187 */ 188 DROP_GIANT_NOSWITCH(); 189 setrunqueue(p); 190 p->p_stats->p_ru.ru_nivcsw++; 191 mi_switch(); 192 mtx_unlock_spin(&sched_lock); 193 PICKUP_GIANT(); 194 while ((sig = CURSIG(p)) != 0) 195 postsig(sig); 196 mtx_lock_spin(&sched_lock); 197 } 198 199 /* 200 * Charge system time if profiling. 201 */ 202 if (p->p_sflag & PS_PROFIL) { 203 mtx_unlock_spin(&sched_lock); 204 /* XXX - do we need Giant? */ 205 if (!mtx_owned(&Giant)) 206 mtx_lock(&Giant); 207 addupc_task(p, TRAPF_PC(frame), 208 (u_int)(p->p_sticks - oticks) * psratio); 209 } else 210 mtx_unlock_spin(&sched_lock); 211} 212 213/* 214 * Exception, fault, and trap interface to the FreeBSD kernel. 215 * This common code is called from assembly language IDT gate entry 216 * routines that prepare a suitable stack frame, and restore this 217 * frame after the exception has been processed. 218 */ 219 220void 221trap(frame) 222 struct trapframe frame; 223{ 224 struct proc *p = curproc; 225 u_quad_t sticks = 0; 226 int i = 0, ucode = 0, type, code; 227 vm_offset_t eva; 228#ifdef POWERFAIL_NMI 229 static int lastalert = 0; 230#endif 231 232 atomic_add_int(&cnt.v_trap, 1); 233 234 if ((frame.tf_eflags & PSL_I) == 0) { 235 /* 236 * Buggy application or kernel code has disabled 237 * interrupts and then trapped. 
Enabling interrupts 238 * now is wrong, but it is better than running with 239 * interrupts disabled until they are accidentally 240 * enabled later. XXX This is really bad if we trap 241 * while holding a spin lock. 242 */ 243 type = frame.tf_trapno; 244 if (ISPL(frame.tf_cs) == SEL_UPL || (frame.tf_eflags & PSL_VM)) 245 printf( 246 "pid %ld (%s): trap %d with interrupts disabled\n", 247 (long)curproc->p_pid, curproc->p_comm, type); 248 else if (type != T_BPTFLT && type != T_TRCTRAP) { 249 /* 250 * XXX not quite right, since this may be for a 251 * multiple fault in user mode. 252 */ 253 printf("kernel trap %d with interrupts disabled\n", 254 type); 255 /* 256 * We should walk p_heldmtx here and see if any are 257 * spin mutexes, and not do this if so. 258 */ 259 enable_intr(); 260 } 261 } 262 263 eva = 0; 264 265#if defined(I586_CPU) && !defined(NO_F00F_HACK) 266restart: 267#endif 268 269 type = frame.tf_trapno; 270 code = frame.tf_err; 271 272 if ((ISPL(frame.tf_cs) == SEL_UPL) || 273 ((frame.tf_eflags & PSL_VM) && !in_vm86call)) { 274 /* user trap */ 275 276 mtx_lock_spin(&sched_lock); 277 sticks = p->p_sticks; 278 mtx_unlock_spin(&sched_lock); 279 p->p_md.md_regs = &frame; 280 281 switch (type) { 282 case T_PRIVINFLT: /* privileged instruction fault */ 283 ucode = type; 284 i = SIGILL; 285 break; 286 287 case T_BPTFLT: /* bpt instruction fault */ 288 case T_TRCTRAP: /* trace trap */ 289 frame.tf_eflags &= ~PSL_T; 290 i = SIGTRAP; 291 break; 292 293 case T_ARITHTRAP: /* arithmetic trap */ 294 ucode = code; 295 i = SIGFPE; 296 break; 297 298 /* 299 * The following two traps can happen in 300 * vm86 mode, and, if so, we want to handle 301 * them specially. 
302 */ 303 case T_PROTFLT: /* general protection fault */ 304 case T_STKFLT: /* stack fault */ 305 if (frame.tf_eflags & PSL_VM) { 306 mtx_lock(&Giant); 307 i = vm86_emulate((struct vm86frame *)&frame); 308 mtx_unlock(&Giant); 309 if (i == 0) 310 goto user; 311 break; 312 } 313 /* FALL THROUGH */ 314 315 case T_SEGNPFLT: /* segment not present fault */ 316 case T_TSSFLT: /* invalid TSS fault */ 317 case T_DOUBLEFLT: /* double fault */ 318 default: 319 ucode = code + BUS_SEGM_FAULT ; 320 i = SIGBUS; 321 break; 322 323 case T_PAGEFLT: /* page fault */ 324 /* 325 * For some Cyrix CPUs, %cr2 is clobbered by 326 * interrupts. This problem is worked around by using 327 * an interrupt gate for the pagefault handler. We 328 * are finally ready to read %cr2 and then must 329 * reenable interrupts. 330 */ 331 eva = rcr2(); 332 enable_intr(); 333 mtx_lock(&Giant); 334 i = trap_pfault(&frame, TRUE, eva); 335 mtx_unlock(&Giant); 336#if defined(I586_CPU) && !defined(NO_F00F_HACK) 337 if (i == -2) { 338 /* 339 * f00f hack workaround has triggered, treat 340 * as illegal instruction not page fault. 341 */ 342 frame.tf_trapno = T_PRIVINFLT; 343 goto restart; 344 } 345#endif 346 if (i == -1) 347 goto out; 348 if (i == 0) 349 goto user; 350 351 ucode = T_PAGEFLT; 352 break; 353 354 case T_DIVIDE: /* integer divide fault */ 355 ucode = FPE_INTDIV; 356 i = SIGFPE; 357 break; 358 359#ifdef DEV_ISA 360 case T_NMI: 361#ifdef POWERFAIL_NMI 362#ifndef TIMER_FREQ 363# define TIMER_FREQ 1193182 364#endif 365 mtx_lock(&Giant); 366 if (time_second - lastalert > 10) { 367 log(LOG_WARNING, "NMI: power fail\n"); 368 sysbeep(TIMER_FREQ/880, hz); 369 lastalert = time_second; 370 } 371 mtx_unlock(&Giant); 372 goto out; 373#else /* !POWERFAIL_NMI */ 374 /* machine/parity/power fail/"kitchen sink" faults */ 375 /* XXX Giant */ 376 if (isa_nmi(code) == 0) { 377#ifdef DDB 378 /* 379 * NMI can be hooked up to a pushbutton 380 * for debugging. 381 */ 382 if (ddb_on_nmi) { 383 printf ("NMI ... 
going to debugger\n"); 384 kdb_trap (type, 0, &frame); 385 } 386#endif /* DDB */ 387 goto out; 388 } else if (panic_on_nmi) 389 panic("NMI indicates hardware failure"); 390 break; 391#endif /* POWERFAIL_NMI */ 392#endif /* DEV_ISA */ 393 394 case T_OFLOW: /* integer overflow fault */ 395 ucode = FPE_INTOVF; 396 i = SIGFPE; 397 break; 398 399 case T_BOUND: /* bounds check fault */ 400 ucode = FPE_FLTSUB; 401 i = SIGFPE; 402 break; 403 404 case T_DNA: 405#ifdef DEV_NPX 406 /* transparent fault (due to context switch "late") */ 407 if (npxdna()) 408 goto out; 409#endif 410 if (!pmath_emulate) { 411 i = SIGFPE; 412 ucode = FPE_FPU_NP_TRAP; 413 break; 414 } 415 mtx_lock(&Giant); 416 i = (*pmath_emulate)(&frame); 417 mtx_unlock(&Giant); 418 if (i == 0) { 419 if (!(frame.tf_eflags & PSL_T)) 420 goto out; 421 frame.tf_eflags &= ~PSL_T; 422 i = SIGTRAP; 423 } 424 /* else ucode = emulator_only_knows() XXX */ 425 break; 426 427 case T_FPOPFLT: /* FPU operand fetch fault */ 428 ucode = T_FPOPFLT; 429 i = SIGILL; 430 break; 431 } 432 } else { 433 /* kernel trap */ 434 435 switch (type) { 436 case T_PAGEFLT: /* page fault */ 437 /* 438 * For some Cyrix CPUs, %cr2 is clobbered by 439 * interrupts. This problem is worked around by using 440 * an interrupt gate for the pagefault handler. We 441 * are finally ready to read %cr2 and then must 442 * reenable interrupts. 443 */ 444 eva = rcr2(); 445 enable_intr(); 446 mtx_lock(&Giant); 447 (void) trap_pfault(&frame, FALSE, eva); 448 mtx_unlock(&Giant); 449 goto out; 450 451 case T_DNA: 452#ifdef DEV_NPX 453 /* 454 * The kernel is apparently using npx for copying. 455 * XXX this should be fatal unless the kernel has 456 * registered such use. 457 */ 458 if (npxdna()) 459 goto out; 460#endif 461 break; 462 463 /* 464 * The following two traps can happen in 465 * vm86 mode, and, if so, we want to handle 466 * them specially. 
467 */ 468 case T_PROTFLT: /* general protection fault */ 469 case T_STKFLT: /* stack fault */ 470 if (frame.tf_eflags & PSL_VM) { 471 mtx_lock(&Giant); 472 i = vm86_emulate((struct vm86frame *)&frame); 473 mtx_unlock(&Giant); 474 if (i != 0) 475 /* 476 * returns to original process 477 */ 478 vm86_trap((struct vm86frame *)&frame); 479 goto out; 480 } 481 if (type == T_STKFLT) 482 break; 483 484 /* FALL THROUGH */ 485 486 case T_SEGNPFLT: /* segment not present fault */ 487 if (in_vm86call) 488 break; 489 490 if (p->p_intr_nesting_level != 0) 491 break; 492 493 /* 494 * Invalid %fs's and %gs's can be created using 495 * procfs or PT_SETREGS or by invalidating the 496 * underlying LDT entry. This causes a fault 497 * in kernel mode when the kernel attempts to 498 * switch contexts. Lose the bad context 499 * (XXX) so that we can continue, and generate 500 * a signal. 501 */ 502 if (frame.tf_eip == (int)cpu_switch_load_gs) { 503 PCPU_GET(curpcb)->pcb_gs = 0; 504 PROC_LOCK(p); 505 psignal(p, SIGBUS); 506 PROC_UNLOCK(p); 507 goto out; 508 } 509 510 /* 511 * Invalid segment selectors and out of bounds 512 * %eip's and %esp's can be set up in user mode. 513 * This causes a fault in kernel mode when the 514 * kernel tries to return to user mode. We want 515 * to get this fault so that we can fix the 516 * problem here and not have to check all the 517 * selectors and pointers when the user changes 518 * them. 
519 */ 520 if (frame.tf_eip == (int)doreti_iret) { 521 frame.tf_eip = (int)doreti_iret_fault; 522 goto out; 523 } 524 if (frame.tf_eip == (int)doreti_popl_ds) { 525 frame.tf_eip = (int)doreti_popl_ds_fault; 526 goto out; 527 } 528 if (frame.tf_eip == (int)doreti_popl_es) { 529 frame.tf_eip = (int)doreti_popl_es_fault; 530 goto out; 531 } 532 if (frame.tf_eip == (int)doreti_popl_fs) { 533 frame.tf_eip = (int)doreti_popl_fs_fault; 534 goto out; 535 } 536 if (PCPU_GET(curpcb) != NULL && 537 PCPU_GET(curpcb)->pcb_onfault != NULL) { 538 frame.tf_eip = 539 (int)PCPU_GET(curpcb)->pcb_onfault; 540 goto out; 541 } 542 break; 543 544 case T_TSSFLT: 545 /* 546 * PSL_NT can be set in user mode and isn't cleared 547 * automatically when the kernel is entered. This 548 * causes a TSS fault when the kernel attempts to 549 * `iret' because the TSS link is uninitialized. We 550 * want to get this fault so that we can fix the 551 * problem here and not every time the kernel is 552 * entered. 553 */ 554 if (frame.tf_eflags & PSL_NT) { 555 frame.tf_eflags &= ~PSL_NT; 556 goto out; 557 } 558 break; 559 560 case T_TRCTRAP: /* trace trap */ 561 if (frame.tf_eip == (int)IDTVEC(lcall_syscall)) { 562 /* 563 * We've just entered system mode via the 564 * syscall lcall. Continue single stepping 565 * silently until the syscall handler has 566 * saved the flags. 567 */ 568 goto out; 569 } 570 if (frame.tf_eip == (int)IDTVEC(lcall_syscall) + 1) { 571 /* 572 * The syscall handler has now saved the 573 * flags. Stop single stepping it. 574 */ 575 frame.tf_eflags &= ~PSL_T; 576 goto out; 577 } 578 /* 579 * Ignore debug register trace traps due to 580 * accesses in the user's address space, which 581 * can happen under several conditions such as 582 * if a user sets a watchpoint on a buffer and 583 * then passes that buffer to a system call. 584 * We still want to get TRCTRAPS for addresses 585 * in kernel space because that is useful when 586 * debugging the kernel. 
587 */ 588 /* XXX Giant */ 589 if (user_dbreg_trap() && !in_vm86call) { 590 /* 591 * Reset breakpoint bits because the 592 * processor doesn't 593 */ 594 load_dr6(rdr6() & 0xfffffff0); 595 goto out; 596 } 597 /* 598 * Fall through (TRCTRAP kernel mode, kernel address) 599 */ 600 case T_BPTFLT: 601 /* 602 * If DDB is enabled, let it handle the debugger trap. 603 * Otherwise, debugger traps "can't happen". 604 */ 605#ifdef DDB 606 /* XXX Giant */ 607 if (kdb_trap (type, 0, &frame)) 608 goto out; 609#endif 610 break; 611 612#ifdef DEV_ISA 613 case T_NMI: 614#ifdef POWERFAIL_NMI 615 mtx_lock(&Giant); 616 if (time_second - lastalert > 10) { 617 log(LOG_WARNING, "NMI: power fail\n"); 618 sysbeep(TIMER_FREQ/880, hz); 619 lastalert = time_second; 620 } 621 mtx_unlock(&Giant); 622 goto out; 623#else /* !POWERFAIL_NMI */ 624 /* XXX Giant */ 625 /* machine/parity/power fail/"kitchen sink" faults */ 626 if (isa_nmi(code) == 0) { 627#ifdef DDB 628 /* 629 * NMI can be hooked up to a pushbutton 630 * for debugging. 631 */ 632 if (ddb_on_nmi) { 633 printf ("NMI ... going to debugger\n"); 634 kdb_trap (type, 0, &frame); 635 } 636#endif /* DDB */ 637 goto out; 638 } else if (panic_on_nmi == 0) 639 goto out; 640 /* FALL THROUGH */ 641#endif /* POWERFAIL_NMI */ 642#endif /* DEV_ISA */ 643 } 644 645 mtx_lock(&Giant); 646 trap_fatal(&frame, eva); 647 mtx_unlock(&Giant); 648 goto out; 649 } 650 651 mtx_lock(&Giant); 652 /* Translate fault for emulators (e.g. 
Linux) */ 653 if (*p->p_sysent->sv_transtrap) 654 i = (*p->p_sysent->sv_transtrap)(i, type); 655 656 trapsignal(p, i, ucode); 657 658#ifdef DEBUG 659 if (type <= MAX_TRAP_MSG) { 660 uprintf("fatal process exception: %s", 661 trap_msg[type]); 662 if ((type == T_PAGEFLT) || (type == T_PROTFLT)) 663 uprintf(", fault VA = 0x%lx", (u_long)eva); 664 uprintf("\n"); 665 } 666#endif 667 mtx_unlock(&Giant); 668 669user: 670 userret(p, &frame, sticks); 671 if (mtx_owned(&Giant)) 672 mtx_unlock(&Giant); 673out: 674 return; 675} 676 677#ifdef notyet 678/* 679 * This version doesn't allow a page fault to user space while 680 * in the kernel. The rest of the kernel needs to be made "safe" 681 * before this can be used. I think the only things remaining 682 * to be made safe are the iBCS2 code and the process tracing/ 683 * debugging code. 684 */ 685static int 686trap_pfault(frame, usermode, eva) 687 struct trapframe *frame; 688 int usermode; 689 vm_offset_t eva; 690{ 691 vm_offset_t va; 692 struct vmspace *vm = NULL; 693 vm_map_t map = 0; 694 int rv = 0; 695 vm_prot_t ftype; 696 struct proc *p = curproc; 697 698 if (frame->tf_err & PGEX_W) 699 ftype = VM_PROT_WRITE; 700 else 701 ftype = VM_PROT_READ; 702 703 va = trunc_page(eva); 704 if (va < VM_MIN_KERNEL_ADDRESS) { 705 vm_offset_t v; 706 vm_page_t mpte; 707 708 if (p == NULL || 709 (!usermode && va < VM_MAXUSER_ADDRESS && 710 (p->p_intr_nesting_level != 0 || 711 PCPU_GET(curpcb) == NULL || 712 PCPU_GET(curpcb)->pcb_onfault == NULL))) { 713 trap_fatal(frame, eva); 714 return (-1); 715 } 716 717 /* 718 * This is a fault on non-kernel virtual memory. 719 * vm is initialized above to NULL. If curproc is NULL 720 * or curproc->p_vmspace is NULL the fault is fatal. 721 */ 722 vm = p->p_vmspace; 723 if (vm == NULL) 724 goto nogo; 725 726 map = &vm->vm_map; 727 728 /* 729 * Keep swapout from messing with us during this 730 * critical time. 
731 */ 732 PROC_LOCK(p); 733 ++p->p_lock; 734 PROC_UNLOCK(p); 735 736 /* 737 * Grow the stack if necessary 738 */ 739 /* grow_stack returns false only if va falls into 740 * a growable stack region and the stack growth 741 * fails. It returns true if va was not within 742 * a growable stack region, or if the stack 743 * growth succeeded. 744 */ 745 if (!grow_stack (p, va)) 746 rv = KERN_FAILURE; 747 else 748 /* Fault in the user page: */ 749 rv = vm_fault(map, va, ftype, 750 (ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY 751 : VM_FAULT_NORMAL); 752 753 PROC_LOCK(p); 754 --p->p_lock; 755 PROC_UNLOCK(p); 756 } else { 757 /* 758 * Don't allow user-mode faults in kernel address space. 759 */ 760 if (usermode) 761 goto nogo; 762 763 /* 764 * Since we know that kernel virtual address addresses 765 * always have pte pages mapped, we just have to fault 766 * the page. 767 */ 768 rv = vm_fault(kernel_map, va, ftype, VM_FAULT_NORMAL); 769 } 770 771 if (rv == KERN_SUCCESS) 772 return (0); 773nogo: 774 if (!usermode) { 775 if (p->p_intr_nesting_level == 0 && 776 PCPU_GET(curpcb) != NULL && 777 PCPU_GET(curpcb)->pcb_onfault != NULL) { 778 frame->tf_eip = (int)PCPU_GET(curpcb)->pcb_onfault; 779 return (0); 780 } 781 trap_fatal(frame, eva); 782 return (-1); 783 } 784 785 /* kludge to pass faulting virtual address to sendsig */ 786 frame->tf_err = eva; 787 788 return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV); 789} 790#endif 791 792int 793trap_pfault(frame, usermode, eva) 794 struct trapframe *frame; 795 int usermode; 796 vm_offset_t eva; 797{ 798 vm_offset_t va; 799 struct vmspace *vm = NULL; 800 vm_map_t map = 0; 801 int rv = 0; 802 vm_prot_t ftype; 803 struct proc *p = curproc; 804 805 va = trunc_page(eva); 806 if (va >= KERNBASE) { 807 /* 808 * Don't allow user-mode faults in kernel address space. 
809 * An exception: if the faulting address is the invalid 810 * instruction entry in the IDT, then the Intel Pentium 811 * F00F bug workaround was triggered, and we need to 812 * treat it is as an illegal instruction, and not a page 813 * fault. 814 */ 815#if defined(I586_CPU) && !defined(NO_F00F_HACK) 816 if ((eva == (unsigned int)&idt[6]) && has_f00f_bug) 817 return -2; 818#endif 819 if (usermode) 820 goto nogo; 821 822 map = kernel_map; 823 } else { 824 /* 825 * This is a fault on non-kernel virtual memory. 826 * vm is initialized above to NULL. If curproc is NULL 827 * or curproc->p_vmspace is NULL the fault is fatal. 828 */ 829 if (p != NULL) 830 vm = p->p_vmspace; 831 832 if (vm == NULL) 833 goto nogo; 834 835 map = &vm->vm_map; 836 } 837 838 if (frame->tf_err & PGEX_W) 839 ftype = VM_PROT_WRITE; 840 else 841 ftype = VM_PROT_READ; 842 843 if (map != kernel_map) { 844 /* 845 * Keep swapout from messing with us during this 846 * critical time. 847 */ 848 PROC_LOCK(p); 849 ++p->p_lock; 850 PROC_UNLOCK(p); 851 852 /* 853 * Grow the stack if necessary 854 */ 855 /* grow_stack returns false only if va falls into 856 * a growable stack region and the stack growth 857 * fails. It returns true if va was not within 858 * a growable stack region, or if the stack 859 * growth succeeded. 860 */ 861 if (!grow_stack (p, va)) 862 rv = KERN_FAILURE; 863 else 864 /* Fault in the user page: */ 865 rv = vm_fault(map, va, ftype, 866 (ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY 867 : VM_FAULT_NORMAL); 868 869 PROC_LOCK(p); 870 --p->p_lock; 871 PROC_UNLOCK(p); 872 } else { 873 /* 874 * Don't have to worry about process locking or stacks in the 875 * kernel. 
876 */ 877 rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL); 878 } 879 880 if (rv == KERN_SUCCESS) 881 return (0); 882nogo: 883 if (!usermode) { 884 if (p->p_intr_nesting_level == 0 && 885 PCPU_GET(curpcb) != NULL && 886 PCPU_GET(curpcb)->pcb_onfault != NULL) { 887 frame->tf_eip = (int)PCPU_GET(curpcb)->pcb_onfault; 888 return (0); 889 } 890 trap_fatal(frame, eva); 891 return (-1); 892 } 893 894 /* kludge to pass faulting virtual address to sendsig */ 895 frame->tf_err = eva; 896 897 return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV); 898} 899 900static void 901trap_fatal(frame, eva) 902 struct trapframe *frame; 903 vm_offset_t eva; 904{ 905 int code, type, ss, esp; 906 struct soft_segment_descriptor softseg; 907 908 code = frame->tf_err; 909 type = frame->tf_trapno; 910 sdtossd(&gdt[IDXSEL(frame->tf_cs & 0xffff)].sd, &softseg); 911 912 if (type <= MAX_TRAP_MSG) 913 printf("\n\nFatal trap %d: %s while in %s mode\n", 914 type, trap_msg[type], 915 frame->tf_eflags & PSL_VM ? "vm86" : 916 ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel"); 917#ifdef SMP 918 /* two separate prints in case of a trap on an unmapped page */ 919 printf("cpuid = %d; ", PCPU_GET(cpuid)); 920 printf("lapic.id = %08x\n", lapic.id); 921#endif 922 if (type == T_PAGEFLT) { 923 printf("fault virtual address = 0x%x\n", eva); 924 printf("fault code = %s %s, %s\n", 925 code & PGEX_U ? "user" : "supervisor", 926 code & PGEX_W ? "write" : "read", 927 code & PGEX_P ? 
"protection violation" : "page not present"); 928 } 929 printf("instruction pointer = 0x%x:0x%x\n", 930 frame->tf_cs & 0xffff, frame->tf_eip); 931 if ((ISPL(frame->tf_cs) == SEL_UPL) || (frame->tf_eflags & PSL_VM)) { 932 ss = frame->tf_ss & 0xffff; 933 esp = frame->tf_esp; 934 } else { 935 ss = GSEL(GDATA_SEL, SEL_KPL); 936 esp = (int)&frame->tf_esp; 937 } 938 printf("stack pointer = 0x%x:0x%x\n", ss, esp); 939 printf("frame pointer = 0x%x:0x%x\n", ss, frame->tf_ebp); 940 printf("code segment = base 0x%x, limit 0x%x, type 0x%x\n", 941 softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type); 942 printf(" = DPL %d, pres %d, def32 %d, gran %d\n", 943 softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_def32, 944 softseg.ssd_gran); 945 printf("processor eflags = "); 946 if (frame->tf_eflags & PSL_T) 947 printf("trace trap, "); 948 if (frame->tf_eflags & PSL_I) 949 printf("interrupt enabled, "); 950 if (frame->tf_eflags & PSL_NT) 951 printf("nested task, "); 952 if (frame->tf_eflags & PSL_RF) 953 printf("resume, "); 954 if (frame->tf_eflags & PSL_VM) 955 printf("vm86, "); 956 printf("IOPL = %d\n", (frame->tf_eflags & PSL_IOPL) >> 12); 957 printf("current process = "); 958 if (curproc) { 959 printf("%lu (%s)\n", 960 (u_long)curproc->p_pid, curproc->p_comm ? 961 curproc->p_comm : ""); 962 } else { 963 printf("Idle\n"); 964 } 965 966#ifdef KDB 967 if (kdb_trap(&psl)) 968 return; 969#endif 970#ifdef DDB 971 if ((debugger_on_panic || db_active) && kdb_trap(type, 0, frame)) 972 return; 973#endif 974 printf("trap number = %d\n", type); 975 if (type <= MAX_TRAP_MSG) 976 panic(trap_msg[type]); 977 else 978 panic("unknown/reserved trap"); 979} 980 981/* 982 * Double fault handler. Called when a fault occurs while writing 983 * a frame for a trap/exception onto the stack. This usually occurs 984 * when the stack overflows (such is the case with infinite recursion, 985 * for example). 986 * 987 * XXX Note that the current PTD gets replaced by IdlePTD when the 988 * task switch occurs. 
This means that the stack that was active at 989 * the time of the double fault is not available at <kstack> unless 990 * the machine was idle when the double fault occurred. The downside 991 * of this is that "trace <ebp>" in ddb won't work. 992 */ 993void 994dblfault_handler() 995{ 996 printf("\nFatal double fault:\n"); 997 printf("eip = 0x%x\n", PCPU_GET(common_tss.tss_eip)); 998 printf("esp = 0x%x\n", PCPU_GET(common_tss.tss_esp)); 999 printf("ebp = 0x%x\n", PCPU_GET(common_tss.tss_ebp)); 1000#ifdef SMP 1001 /* two separate prints in case of a trap on an unmapped page */ 1002 printf("cpuid = %d; ", PCPU_GET(cpuid)); 1003 printf("lapic.id = %08x\n", lapic.id); 1004#endif 1005 panic("double fault"); 1006} 1007 1008/* 1009 * Compensate for 386 brain damage (missing URKR). 1010 * This is a little simpler than the pagefault handler in trap() because 1011 * it the page tables have already been faulted in and high addresses 1012 * are thrown out early for other reasons. 1013 */ 1014int trapwrite(addr) 1015 unsigned addr; 1016{ 1017 struct proc *p; 1018 vm_offset_t va; 1019 struct vmspace *vm; 1020 int rv; 1021 1022 va = trunc_page((vm_offset_t)addr); 1023 /* 1024 * XXX - MAX is END. Changed > to >= for temp. fix. 1025 */ 1026 if (va >= VM_MAXUSER_ADDRESS) 1027 return (1); 1028 1029 p = curproc; 1030 vm = p->p_vmspace; 1031 1032 PROC_LOCK(p); 1033 ++p->p_lock; 1034 PROC_UNLOCK(p); 1035 1036 if (!grow_stack (p, va)) 1037 rv = KERN_FAILURE; 1038 else 1039 /* 1040 * fault the data page 1041 */ 1042 rv = vm_fault(&vm->vm_map, va, VM_PROT_WRITE, VM_FAULT_DIRTY); 1043 1044 PROC_LOCK(p); 1045 --p->p_lock; 1046 PROC_UNLOCK(p); 1047 1048 if (rv != KERN_SUCCESS) 1049 return 1; 1050 1051 return (0); 1052} 1053 1054/* 1055 * syscall - MP aware system call request C handler 1056 * 1057 * A system call is essentially treated as a trap except that the 1058 * MP lock is not held on entry or return. 
We are responsible for 1059 * obtaining the MP lock if necessary and for handling ASTs 1060 * (e.g. a task switch) prior to return. 1061 * 1062 * In general, only simple access and manipulation of curproc and 1063 * the current stack is allowed without having to hold MP lock. 1064 */ 1065void 1066syscall(frame) 1067 struct trapframe frame; 1068{ 1069 caddr_t params; 1070 int i; 1071 struct sysent *callp; 1072 struct proc *p = curproc; 1073 u_quad_t sticks; 1074 int error; 1075 int narg; 1076 int args[8]; 1077 u_int code; 1078 1079 atomic_add_int(&cnt.v_syscall, 1); 1080 1081#ifdef DIAGNOSTIC 1082 if (ISPL(frame.tf_cs) != SEL_UPL) { 1083 mtx_lock(&Giant); 1084 panic("syscall"); 1085 /* NOT REACHED */ 1086 } 1087#endif 1088 1089 mtx_lock_spin(&sched_lock); 1090 sticks = p->p_sticks; 1091 mtx_unlock_spin(&sched_lock); 1092 1093 p->p_md.md_regs = &frame; 1094 params = (caddr_t)frame.tf_esp + sizeof(int); 1095 code = frame.tf_eax; 1096 1097 if (p->p_sysent->sv_prepsyscall) { 1098 /* 1099 * The prep code is not MP aware. 1100 */ 1101 mtx_lock(&Giant); 1102 (*p->p_sysent->sv_prepsyscall)(&frame, args, &code, ¶ms); 1103 mtx_unlock(&Giant); 1104 } else { 1105 /* 1106 * Need to check if this is a 32 bit or 64 bit syscall. 1107 * fuword is MP aware. 1108 */ 1109 if (code == SYS_syscall) { 1110 /* 1111 * Code is first argument, followed by actual args. 1112 */ 1113 code = fuword(params); 1114 params += sizeof(int); 1115 } else if (code == SYS___syscall) { 1116 /* 1117 * Like syscall, but code is a quad, so as to maintain 1118 * quad alignment for the rest of the arguments. 
1119 */ 1120 code = fuword(params); 1121 params += sizeof(quad_t); 1122 } 1123 } 1124 1125 if (p->p_sysent->sv_mask) 1126 code &= p->p_sysent->sv_mask; 1127 1128 if (code >= p->p_sysent->sv_size) 1129 callp = &p->p_sysent->sv_table[0]; 1130 else 1131 callp = &p->p_sysent->sv_table[code]; 1132 1133 narg = callp->sy_narg & SYF_ARGMASK; 1134 1135 /* 1136 * copyin is MP aware, but the tracing code is not 1137 */ 1138 if (params && (i = narg * sizeof(int)) && 1139 (error = copyin(params, (caddr_t)args, (u_int)i))) { 1140 mtx_lock(&Giant); 1141#ifdef KTRACE 1142 if (KTRPOINT(p, KTR_SYSCALL)) 1143 ktrsyscall(p->p_tracep, code, narg, args); 1144#endif 1145 goto bad; 1146 } 1147 1148 /* 1149 * Try to run the syscall without the MP lock if the syscall 1150 * is MP safe. We have to obtain the MP lock no matter what if 1151 * we are ktracing 1152 */ 1153 if ((callp->sy_narg & SYF_MPSAFE) == 0) { 1154 mtx_lock(&Giant); 1155 } 1156 1157#ifdef KTRACE 1158 if (KTRPOINT(p, KTR_SYSCALL)) { 1159 if (!mtx_owned(&Giant)) 1160 mtx_lock(&Giant); 1161 ktrsyscall(p->p_tracep, code, narg, args); 1162 } 1163#endif 1164 p->p_retval[0] = 0; 1165 p->p_retval[1] = frame.tf_edx; 1166 1167 STOPEVENT(p, S_SCE, narg); /* MP aware */ 1168 1169 error = (*callp->sy_call)(p, args); 1170 1171 /* 1172 * MP SAFE (we may or may not have the MP lock at this point) 1173 */ 1174 switch (error) { 1175 case 0: 1176 frame.tf_eax = p->p_retval[0]; 1177 frame.tf_edx = p->p_retval[1]; 1178 frame.tf_eflags &= ~PSL_C; 1179 break; 1180 1181 case ERESTART: 1182 /* 1183 * Reconstruct pc, assuming lcall $X,y is 7 bytes, 1184 * int 0x80 is 2 bytes. We saved this in tf_err. 
1185 */ 1186 frame.tf_eip -= frame.tf_err; 1187 break; 1188 1189 case EJUSTRETURN: 1190 break; 1191 1192 default: 1193bad: 1194 if (p->p_sysent->sv_errsize) { 1195 if (error >= p->p_sysent->sv_errsize) 1196 error = -1; /* XXX */ 1197 else 1198 error = p->p_sysent->sv_errtbl[error]; 1199 } 1200 frame.tf_eax = error; 1201 frame.tf_eflags |= PSL_C; 1202 break; 1203 } 1204 1205 /* 1206 * Traced syscall. trapsignal() is not MP aware. 1207 */ 1208 if ((frame.tf_eflags & PSL_T) && !(frame.tf_eflags & PSL_VM)) { 1209 if (!mtx_owned(&Giant)) 1210 mtx_lock(&Giant); 1211 frame.tf_eflags &= ~PSL_T; 1212 trapsignal(p, SIGTRAP, 0); 1213 } 1214 1215 /* 1216 * Handle reschedule and other end-of-syscall issues 1217 */ 1218 userret(p, &frame, sticks); 1219 1220#ifdef KTRACE 1221 if (KTRPOINT(p, KTR_SYSRET)) { 1222 if (!mtx_owned(&Giant)) 1223 mtx_lock(&Giant); 1224 ktrsysret(p->p_tracep, code, error, p->p_retval[0]); 1225 } 1226#endif 1227 1228 /* 1229 * Release Giant if we had to get it 1230 */ 1231 if (mtx_owned(&Giant)) 1232 mtx_unlock(&Giant); 1233 1234 /* 1235 * This works because errno is findable through the 1236 * register set. If we ever support an emulation where this 1237 * is not the case, this code will need to be revisited. 1238 */ 1239 STOPEVENT(p, S_SCX, code); 1240 1241#ifdef WITNESS 1242 if (witness_list(p)) { 1243 panic("system call %s returning with mutex(s) held\n", 1244 syscallnames[code]); 1245 } 1246#endif 1247 mtx_assert(&sched_lock, MA_NOTOWNED); 1248 mtx_assert(&Giant, MA_NOTOWNED); 1249} 1250 1251void 1252ast(framep) 1253 struct trapframe *framep; 1254{ 1255 struct proc *p = CURPROC; 1256 u_quad_t sticks; 1257 1258 KASSERT(TRAPF_USERMODE(framep), ("ast in kernel mode")); 1259 1260 /* 1261 * We check for a pending AST here rather than in the assembly as 1262 * acquiring and releasing mutexes in assembly is not fun. 
1263 */ 1264 mtx_lock_spin(&sched_lock); 1265 if (!(astpending(p) || resched_wanted(p))) { 1266 mtx_unlock_spin(&sched_lock); 1267 return; 1268 } 1269 1270 sticks = p->p_sticks; 1271 p->p_md.md_regs = framep; 1272 1273 astoff(p); 1274 cnt.v_soft++; 1275 mtx_intr_enable(&sched_lock); 1276 if (p->p_sflag & PS_OWEUPC) { 1277 p->p_sflag &= ~PS_OWEUPC; 1278 mtx_unlock_spin(&sched_lock); 1279 mtx_lock(&Giant); 1280 mtx_lock_spin(&sched_lock); 1281 addupc_task(p, p->p_stats->p_prof.pr_addr, 1282 p->p_stats->p_prof.pr_ticks); 1283 } 1284 if (p->p_sflag & PS_ALRMPEND) { 1285 p->p_sflag &= ~PS_ALRMPEND; 1286 mtx_unlock_spin(&sched_lock); 1287 PROC_LOCK(p); 1288 psignal(p, SIGVTALRM); 1289 PROC_UNLOCK(p); 1290 mtx_lock_spin(&sched_lock); 1291 } 1292 if (p->p_sflag & PS_PROFPEND) { 1293 p->p_sflag &= ~PS_PROFPEND; 1294 mtx_unlock_spin(&sched_lock); 1295 PROC_LOCK(p); 1296 psignal(p, SIGPROF); 1297 PROC_UNLOCK(p); 1298 } else 1299 mtx_unlock_spin(&sched_lock); 1300 1301 userret(p, framep, sticks); 1302 1303 if (mtx_owned(&Giant)) 1304 mtx_unlock(&Giant); 1305} 1306