subr_syscall.c revision 77015
1/*- 2 * Copyright (C) 1994, David Greenman 3 * Copyright (c) 1990, 1993 4 * The Regents of the University of California. All rights reserved. 5 * 6 * This code is derived from software contributed to Berkeley by 7 * the University of Utah, and William Jolitz. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 3. All advertising materials mentioning features or use of this software 18 * must display the following acknowledgement: 19 * This product includes software developed by the University of 20 * California, Berkeley and its contributors. 21 * 4. Neither the name of the University nor the names of its contributors 22 * may be used to endorse or promote products derived from this software 23 * without specific prior written permission. 24 * 25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 28 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 35 * SUCH DAMAGE. 36 * 37 * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 38 * $FreeBSD: head/sys/kern/subr_trap.c 77015 2001-05-22 21:20:49Z bde $ 39 */ 40 41/* 42 * 386 Trap and System call handling 43 */ 44 45#include "opt_clock.h" 46#include "opt_cpu.h" 47#include "opt_ddb.h" 48#include "opt_isa.h" 49#include "opt_ktrace.h" 50#include "opt_npx.h" 51#include "opt_trap.h" 52 53#include <sys/param.h> 54#include <sys/bus.h> 55#include <sys/systm.h> 56#include <sys/proc.h> 57#include <sys/pioctl.h> 58#include <sys/kernel.h> 59#include <sys/ktr.h> 60#include <sys/mutex.h> 61#include <sys/resourcevar.h> 62#include <sys/signalvar.h> 63#include <sys/syscall.h> 64#include <sys/sysctl.h> 65#include <sys/sysent.h> 66#include <sys/uio.h> 67#include <sys/vmmeter.h> 68#ifdef KTRACE 69#include <sys/ktrace.h> 70#endif 71 72#include <vm/vm.h> 73#include <vm/vm_param.h> 74#include <sys/lock.h> 75#include <vm/pmap.h> 76#include <vm/vm_kern.h> 77#include <vm/vm_map.h> 78#include <vm/vm_page.h> 79#include <vm/vm_extern.h> 80 81#include <machine/cpu.h> 82#include <machine/md_var.h> 83#include <machine/pcb.h> 84#ifdef SMP 85#include <machine/smp.h> 86#endif 87#include <machine/tss.h> 88 89#include <i386/isa/icu.h> 90#include <i386/isa/intr_machdep.h> 91 92#ifdef POWERFAIL_NMI 93#include <sys/syslog.h> 94#include <machine/clock.h> 95#endif 96 97#include <machine/vm86.h> 98 99#include <ddb/ddb.h> 100 101#include <sys/sysctl.h> 102 103int (*pmath_emulate) __P((struct trapframe *)); 104 105extern void trap __P((struct trapframe frame)); 106extern int trapwrite __P((unsigned addr)); 107extern void syscall __P((struct trapframe frame)); 108extern void ast __P((struct trapframe *framep)); 109 110static int trap_pfault __P((struct trapframe *, int, vm_offset_t)); 111static void trap_fatal __P((struct trapframe *, vm_offset_t)); 112void dblfault_handler __P((void)); 113 114extern inthand_t IDTVEC(lcall_syscall); 115 116#define MAX_TRAP_MSG 28 117static char *trap_msg[] = { 118 "", /* 0 unused */ 119 "privileged instruction fault", /* 1 T_PRIVINFLT */ 120 "", /* 2 unused */ 121 "breakpoint instruction fault", /* 3 T_BPTFLT */ 122 "", /* 4 unused */ 123 "", /* 5 unused */ 124 "arithmetic trap", /* 6 T_ARITHTRAP */ 125 "", /* 7 unused */ 126 "", /* 8 unused */ 127 "general protection fault", /* 9 T_PROTFLT */ 128 "trace trap", /* 10 T_TRCTRAP */ 129 "", /* 11 unused */ 130 "page fault", /* 12 T_PAGEFLT */ 131 "", /* 13 unused */ 132 "alignment fault", /* 14 T_ALIGNFLT */ 133 "", /* 15 unused */ 134 "", /* 16 unused */ 135 "", /* 17 unused */ 136 "integer divide fault", /* 18 T_DIVIDE */ 137 "non-maskable interrupt trap", /* 19 T_NMI */ 138 "overflow trap", /* 20 T_OFLOW */ 139 "FPU bounds check fault", /* 21 T_BOUND */ 140 "FPU device not available", /* 22 T_DNA */ 141 "double fault", /* 23 T_DOUBLEFLT */ 142 "FPU operand fetch fault", /* 24 T_FPOPFLT */ 143 "invalid TSS fault", /* 25 T_TSSFLT */ 144 "segment not present fault", /* 26 T_SEGNPFLT */ 145 "stack fault", /* 27 T_STKFLT */ 146 "machine check trap", /* 28 T_MCHK */ 147}; 148 149#if defined(I586_CPU) && !defined(NO_F00F_HACK) 150extern int has_f00f_bug; 151#endif 152 153#ifdef DDB 154static int ddb_on_nmi = 1; 155SYSCTL_INT(_machdep, OID_AUTO, ddb_on_nmi, CTLFLAG_RW, 156 &ddb_on_nmi, 0, "Go to DDB on NMI"); 157#endif 158static int panic_on_nmi = 1; 159SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RW, 160 &panic_on_nmi, 0, "Panic on NMI"); 161 162#ifdef WITNESS 163extern char *syscallnames[]; 164#endif 165 166void 167userret(p, frame, oticks) 168 struct proc *p; 169 struct trapframe *frame; 170 u_quad_t oticks; 171{ 172 int sig; 173 174 while ((sig = CURSIG(p)) != 0) 175 postsig(sig); 176 177 mtx_lock_spin(&sched_lock); 178 p->p_pri.pri_level = p->p_pri.pri_user; 179 if (resched_wanted(p)) { 180 /* 181 * Since we are curproc, clock will normally just change 182 * our priority without moving us from one queue to another 183 * (since the running process is not on a queue.) 184 * If that happened after we setrunqueue ourselves but before we 185 * mi_switch()'ed, we might not be on the queue indicated by 186 * our priority. 187 */ 188 DROP_GIANT_NOSWITCH(); 189 setrunqueue(p); 190 p->p_stats->p_ru.ru_nivcsw++; 191 mi_switch(); 192 mtx_unlock_spin(&sched_lock); 193 PICKUP_GIANT(); 194 while ((sig = CURSIG(p)) != 0) 195 postsig(sig); 196 mtx_lock_spin(&sched_lock); 197 } 198 199 /* 200 * Charge system time if profiling. 201 */ 202 if (p->p_sflag & PS_PROFIL) { 203 mtx_unlock_spin(&sched_lock); 204 /* XXX - do we need Giant? */ 205 if (!mtx_owned(&Giant)) 206 mtx_lock(&Giant); 207 addupc_task(p, TRAPF_PC(frame), 208 (u_int)(p->p_sticks - oticks) * psratio); 209 } else 210 mtx_unlock_spin(&sched_lock); 211} 212 213/* 214 * Exception, fault, and trap interface to the FreeBSD kernel. 215 * This common code is called from assembly language IDT gate entry 216 * routines that prepare a suitable stack frame, and restore this 217 * frame after the exception has been processed. 218 */ 219 220void 221trap(frame) 222 struct trapframe frame; 223{ 224 struct proc *p = curproc; 225 u_quad_t sticks = 0; 226 int i = 0, ucode = 0, type, code; 227 vm_offset_t eva; 228#ifdef POWERFAIL_NMI 229 static int lastalert = 0; 230#endif 231 232 atomic_add_int(&cnt.v_trap, 1); 233 234 if ((frame.tf_eflags & PSL_I) == 0) { 235 /* 236 * Buggy application or kernel code has disabled 237 * interrupts and then trapped. Enabling interrupts 238 * now is wrong, but it is better than running with 239 * interrupts disabled until they are accidentally 240 * enabled later. XXX This is really bad if we trap 241 * while holding a spin lock. 242 */ 243 type = frame.tf_trapno; 244 if (ISPL(frame.tf_cs) == SEL_UPL || (frame.tf_eflags & PSL_VM)) 245 printf( 246 "pid %ld (%s): trap %d with interrupts disabled\n", 247 (long)curproc->p_pid, curproc->p_comm, type); 248 else if (type != T_BPTFLT && type != T_TRCTRAP) { 249 /* 250 * XXX not quite right, since this may be for a 251 * multiple fault in user mode. 252 */ 253 printf("kernel trap %d with interrupts disabled\n", 254 type); 255 /* 256 * We should walk p_heldmtx here and see if any are 257 * spin mutexes, and not do this if so. 258 */ 259 enable_intr(); 260 } 261 } 262 263 eva = 0; 264 265#if defined(I586_CPU) && !defined(NO_F00F_HACK) 266restart: 267#endif 268 269 type = frame.tf_trapno; 270 code = frame.tf_err; 271 272 if ((ISPL(frame.tf_cs) == SEL_UPL) || 273 ((frame.tf_eflags & PSL_VM) && !in_vm86call)) { 274 /* user trap */ 275 276 mtx_lock_spin(&sched_lock); 277 sticks = p->p_sticks; 278 mtx_unlock_spin(&sched_lock); 279 p->p_md.md_regs = &frame; 280 281 switch (type) { 282 case T_PRIVINFLT: /* privileged instruction fault */ 283 ucode = type; 284 i = SIGILL; 285 break; 286 287 case T_BPTFLT: /* bpt instruction fault */ 288 case T_TRCTRAP: /* trace trap */ 289 frame.tf_eflags &= ~PSL_T; 290 i = SIGTRAP; 291 break; 292 293 case T_ARITHTRAP: /* arithmetic trap */ 294#ifdef DEV_NPX 295 ucode = npxtrap(); 296 if (ucode == -1) 297 return; 298#else 299 ucode = code; 300#endif 301 i = SIGFPE; 302 break; 303 304 /* 305 * The following two traps can happen in 306 * vm86 mode, and, if so, we want to handle 307 * them specially. 308 */ 309 case T_PROTFLT: /* general protection fault */ 310 case T_STKFLT: /* stack fault */ 311 if (frame.tf_eflags & PSL_VM) { 312 mtx_lock(&Giant); 313 i = vm86_emulate((struct vm86frame *)&frame); 314 mtx_unlock(&Giant); 315 if (i == 0) 316 goto user; 317 break; 318 } 319 /* FALL THROUGH */ 320 321 case T_SEGNPFLT: /* segment not present fault */ 322 case T_TSSFLT: /* invalid TSS fault */ 323 case T_DOUBLEFLT: /* double fault */ 324 default: 325 ucode = code + BUS_SEGM_FAULT ; 326 i = SIGBUS; 327 break; 328 329 case T_PAGEFLT: /* page fault */ 330 /* 331 * For some Cyrix CPUs, %cr2 is clobbered by 332 * interrupts. This problem is worked around by using 333 * an interrupt gate for the pagefault handler. We 334 * are finally ready to read %cr2 and then must 335 * reenable interrupts. 336 */ 337 eva = rcr2(); 338 enable_intr(); 339 i = trap_pfault(&frame, TRUE, eva); 340#if defined(I586_CPU) && !defined(NO_F00F_HACK) 341 if (i == -2) { 342 /* 343 * f00f hack workaround has triggered, treat 344 * as illegal instruction not page fault. 345 */ 346 frame.tf_trapno = T_PRIVINFLT; 347 goto restart; 348 } 349#endif 350 if (i == -1) 351 goto out; 352 if (i == 0) 353 goto user; 354 355 ucode = T_PAGEFLT; 356 break; 357 358 case T_DIVIDE: /* integer divide fault */ 359 ucode = FPE_INTDIV; 360 i = SIGFPE; 361 break; 362 363#ifdef DEV_ISA 364 case T_NMI: 365#ifdef POWERFAIL_NMI 366#ifndef TIMER_FREQ 367# define TIMER_FREQ 1193182 368#endif 369 mtx_lock(&Giant); 370 if (time_second - lastalert > 10) { 371 log(LOG_WARNING, "NMI: power fail\n"); 372 sysbeep(TIMER_FREQ/880, hz); 373 lastalert = time_second; 374 } 375 mtx_unlock(&Giant); 376 goto out; 377#else /* !POWERFAIL_NMI */ 378 /* machine/parity/power fail/"kitchen sink" faults */ 379 /* XXX Giant */ 380 if (isa_nmi(code) == 0) { 381#ifdef DDB 382 /* 383 * NMI can be hooked up to a pushbutton 384 * for debugging. 385 */ 386 if (ddb_on_nmi) { 387 printf ("NMI ... going to debugger\n"); 388 kdb_trap (type, 0, &frame); 389 } 390#endif /* DDB */ 391 goto out; 392 } else if (panic_on_nmi) 393 panic("NMI indicates hardware failure"); 394 break; 395#endif /* POWERFAIL_NMI */ 396#endif /* DEV_ISA */ 397 398 case T_OFLOW: /* integer overflow fault */ 399 ucode = FPE_INTOVF; 400 i = SIGFPE; 401 break; 402 403 case T_BOUND: /* bounds check fault */ 404 ucode = FPE_FLTSUB; 405 i = SIGFPE; 406 break; 407 408 case T_DNA: 409#ifdef DEV_NPX 410 /* transparent fault (due to context switch "late") */ 411 if (npxdna()) 412 goto out; 413#endif 414 if (!pmath_emulate) { 415 i = SIGFPE; 416 ucode = FPE_FPU_NP_TRAP; 417 break; 418 } 419 mtx_lock(&Giant); 420 i = (*pmath_emulate)(&frame); 421 mtx_unlock(&Giant); 422 if (i == 0) { 423 if (!(frame.tf_eflags & PSL_T)) 424 goto out; 425 frame.tf_eflags &= ~PSL_T; 426 i = SIGTRAP; 427 } 428 /* else ucode = emulator_only_knows() XXX */ 429 break; 430 431 case T_FPOPFLT: /* FPU operand fetch fault */ 432 ucode = T_FPOPFLT; 433 i = SIGILL; 434 break; 435 } 436 } else { 437 /* kernel trap */ 438 439 switch (type) { 440 case T_PAGEFLT: /* page fault */ 441 /* 442 * For some Cyrix CPUs, %cr2 is clobbered by 443 * interrupts. This problem is worked around by using 444 * an interrupt gate for the pagefault handler. We 445 * are finally ready to read %cr2 and then must 446 * reenable interrupts. 447 */ 448 eva = rcr2(); 449 enable_intr(); 450 (void) trap_pfault(&frame, FALSE, eva); 451 goto out; 452 453 case T_DNA: 454#ifdef DEV_NPX 455 /* 456 * The kernel is apparently using npx for copying. 457 * XXX this should be fatal unless the kernel has 458 * registered such use. 459 */ 460 if (npxdna()) 461 goto out; 462#endif 463 break; 464 465 /* 466 * The following two traps can happen in 467 * vm86 mode, and, if so, we want to handle 468 * them specially. 469 */ 470 case T_PROTFLT: /* general protection fault */ 471 case T_STKFLT: /* stack fault */ 472 if (frame.tf_eflags & PSL_VM) { 473 mtx_lock(&Giant); 474 i = vm86_emulate((struct vm86frame *)&frame); 475 mtx_unlock(&Giant); 476 if (i != 0) 477 /* 478 * returns to original process 479 */ 480 vm86_trap((struct vm86frame *)&frame); 481 goto out; 482 } 483 if (type == T_STKFLT) 484 break; 485 486 /* FALL THROUGH */ 487 488 case T_SEGNPFLT: /* segment not present fault */ 489 if (in_vm86call) 490 break; 491 492 if (p->p_intr_nesting_level != 0) 493 break; 494 495 /* 496 * Invalid %fs's and %gs's can be created using 497 * procfs or PT_SETREGS or by invalidating the 498 * underlying LDT entry. This causes a fault 499 * in kernel mode when the kernel attempts to 500 * switch contexts. Lose the bad context 501 * (XXX) so that we can continue, and generate 502 * a signal. 503 */ 504 if (frame.tf_eip == (int)cpu_switch_load_gs) { 505 PCPU_GET(curpcb)->pcb_gs = 0; 506 PROC_LOCK(p); 507 psignal(p, SIGBUS); 508 PROC_UNLOCK(p); 509 goto out; 510 } 511 512 /* 513 * Invalid segment selectors and out of bounds 514 * %eip's and %esp's can be set up in user mode. 515 * This causes a fault in kernel mode when the 516 * kernel tries to return to user mode. We want 517 * to get this fault so that we can fix the 518 * problem here and not have to check all the 519 * selectors and pointers when the user changes 520 * them. 521 */ 522 if (frame.tf_eip == (int)doreti_iret) { 523 frame.tf_eip = (int)doreti_iret_fault; 524 goto out; 525 } 526 if (frame.tf_eip == (int)doreti_popl_ds) { 527 frame.tf_eip = (int)doreti_popl_ds_fault; 528 goto out; 529 } 530 if (frame.tf_eip == (int)doreti_popl_es) { 531 frame.tf_eip = (int)doreti_popl_es_fault; 532 goto out; 533 } 534 if (frame.tf_eip == (int)doreti_popl_fs) { 535 frame.tf_eip = (int)doreti_popl_fs_fault; 536 goto out; 537 } 538 if (PCPU_GET(curpcb) != NULL && 539 PCPU_GET(curpcb)->pcb_onfault != NULL) { 540 frame.tf_eip = 541 (int)PCPU_GET(curpcb)->pcb_onfault; 542 goto out; 543 } 544 break; 545 546 case T_TSSFLT: 547 /* 548 * PSL_NT can be set in user mode and isn't cleared 549 * automatically when the kernel is entered. This 550 * causes a TSS fault when the kernel attempts to 551 * `iret' because the TSS link is uninitialized. We 552 * want to get this fault so that we can fix the 553 * problem here and not every time the kernel is 554 * entered. 555 */ 556 if (frame.tf_eflags & PSL_NT) { 557 frame.tf_eflags &= ~PSL_NT; 558 goto out; 559 } 560 break; 561 562 case T_TRCTRAP: /* trace trap */ 563 if (frame.tf_eip == (int)IDTVEC(lcall_syscall)) { 564 /* 565 * We've just entered system mode via the 566 * syscall lcall. Continue single stepping 567 * silently until the syscall handler has 568 * saved the flags. 569 */ 570 goto out; 571 } 572 if (frame.tf_eip == (int)IDTVEC(lcall_syscall) + 1) { 573 /* 574 * The syscall handler has now saved the 575 * flags. Stop single stepping it. 576 */ 577 frame.tf_eflags &= ~PSL_T; 578 goto out; 579 } 580 /* 581 * Ignore debug register trace traps due to 582 * accesses in the user's address space, which 583 * can happen under several conditions such as 584 * if a user sets a watchpoint on a buffer and 585 * then passes that buffer to a system call. 586 * We still want to get TRCTRAPS for addresses 587 * in kernel space because that is useful when 588 * debugging the kernel. 589 */ 590 /* XXX Giant */ 591 if (user_dbreg_trap() && !in_vm86call) { 592 /* 593 * Reset breakpoint bits because the 594 * processor doesn't 595 */ 596 load_dr6(rdr6() & 0xfffffff0); 597 goto out; 598 } 599 /* 600 * Fall through (TRCTRAP kernel mode, kernel address) 601 */ 602 case T_BPTFLT: 603 /* 604 * If DDB is enabled, let it handle the debugger trap. 605 * Otherwise, debugger traps "can't happen". 606 */ 607#ifdef DDB 608 /* XXX Giant */ 609 if (kdb_trap (type, 0, &frame)) 610 goto out; 611#endif 612 break; 613 614#ifdef DEV_ISA 615 case T_NMI: 616#ifdef POWERFAIL_NMI 617 mtx_lock(&Giant); 618 if (time_second - lastalert > 10) { 619 log(LOG_WARNING, "NMI: power fail\n"); 620 sysbeep(TIMER_FREQ/880, hz); 621 lastalert = time_second; 622 } 623 mtx_unlock(&Giant); 624 goto out; 625#else /* !POWERFAIL_NMI */ 626 /* XXX Giant */ 627 /* machine/parity/power fail/"kitchen sink" faults */ 628 if (isa_nmi(code) == 0) { 629#ifdef DDB 630 /* 631 * NMI can be hooked up to a pushbutton 632 * for debugging. 633 */ 634 if (ddb_on_nmi) { 635 printf ("NMI ... going to debugger\n"); 636 kdb_trap (type, 0, &frame); 637 } 638#endif /* DDB */ 639 goto out; 640 } else if (panic_on_nmi == 0) 641 goto out; 642 /* FALL THROUGH */ 643#endif /* POWERFAIL_NMI */ 644#endif /* DEV_ISA */ 645 } 646 647 mtx_lock(&Giant); 648 trap_fatal(&frame, eva); 649 mtx_unlock(&Giant); 650 goto out; 651 } 652 653 mtx_lock(&Giant); 654 /* Translate fault for emulators (e.g. Linux) */ 655 if (*p->p_sysent->sv_transtrap) 656 i = (*p->p_sysent->sv_transtrap)(i, type); 657 658 trapsignal(p, i, ucode); 659 660#ifdef DEBUG 661 if (type <= MAX_TRAP_MSG) { 662 uprintf("fatal process exception: %s", 663 trap_msg[type]); 664 if ((type == T_PAGEFLT) || (type == T_PROTFLT)) 665 uprintf(", fault VA = 0x%lx", (u_long)eva); 666 uprintf("\n"); 667 } 668#endif 669 mtx_unlock(&Giant); 670 671user: 672 userret(p, &frame, sticks); 673 if (mtx_owned(&Giant)) 674 mtx_unlock(&Giant); 675out: 676 return; 677} 678 679#ifdef notyet 680/* 681 * This version doesn't allow a page fault to user space while 682 * in the kernel. The rest of the kernel needs to be made "safe" 683 * before this can be used. I think the only things remaining 684 * to be made safe are the iBCS2 code and the process tracing/ 685 * debugging code. 686 */ 687static int 688trap_pfault(frame, usermode, eva) 689 struct trapframe *frame; 690 int usermode; 691 vm_offset_t eva; 692{ 693 vm_offset_t va; 694 struct vmspace *vm = NULL; 695 vm_map_t map = 0; 696 int rv = 0; 697 vm_prot_t ftype; 698 struct proc *p = curproc; 699 700 if (frame->tf_err & PGEX_W) 701 ftype = VM_PROT_WRITE; 702 else 703 ftype = VM_PROT_READ; 704 705 va = trunc_page(eva); 706 if (va < VM_MIN_KERNEL_ADDRESS) { 707 vm_offset_t v; 708 vm_page_t mpte; 709 710 if (p == NULL || 711 (!usermode && va < VM_MAXUSER_ADDRESS && 712 (p->p_intr_nesting_level != 0 || 713 PCPU_GET(curpcb) == NULL || 714 PCPU_GET(curpcb)->pcb_onfault == NULL))) { 715 trap_fatal(frame, eva); 716 return (-1); 717 } 718 719 /* 720 * This is a fault on non-kernel virtual memory. 721 * vm is initialized above to NULL. If curproc is NULL 722 * or curproc->p_vmspace is NULL the fault is fatal. 723 */ 724 vm = p->p_vmspace; 725 if (vm == NULL) 726 goto nogo; 727 728 map = &vm->vm_map; 729 730 /* 731 * Keep swapout from messing with us during this 732 * critical time. 733 */ 734 PROC_LOCK(p); 735 ++p->p_lock; 736 PROC_UNLOCK(p); 737 738 /* 739 * Grow the stack if necessary 740 */ 741 /* grow_stack returns false only if va falls into 742 * a growable stack region and the stack growth 743 * fails. It returns true if va was not within 744 * a growable stack region, or if the stack 745 * growth succeeded. 746 */ 747 if (!grow_stack (p, va)) 748 rv = KERN_FAILURE; 749 else 750 /* Fault in the user page: */ 751 rv = vm_fault(map, va, ftype, 752 (ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY 753 : VM_FAULT_NORMAL); 754 755 PROC_LOCK(p); 756 --p->p_lock; 757 PROC_UNLOCK(p); 758 } else { 759 /* 760 * Don't allow user-mode faults in kernel address space. 761 */ 762 if (usermode) 763 goto nogo; 764 765 /* 766 * Since we know that kernel virtual address addresses 767 * always have pte pages mapped, we just have to fault 768 * the page. 769 */ 770 rv = vm_fault(kernel_map, va, ftype, VM_FAULT_NORMAL); 771 } 772 773 if (rv == KERN_SUCCESS) 774 return (0); 775nogo: 776 if (!usermode) { 777 if (p->p_intr_nesting_level == 0 && 778 PCPU_GET(curpcb) != NULL && 779 PCPU_GET(curpcb)->pcb_onfault != NULL) { 780 frame->tf_eip = (int)PCPU_GET(curpcb)->pcb_onfault; 781 return (0); 782 } 783 trap_fatal(frame, eva); 784 return (-1); 785 } 786 787 /* kludge to pass faulting virtual address to sendsig */ 788 frame->tf_err = eva; 789 790 return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV); 791} 792#endif 793 794int 795trap_pfault(frame, usermode, eva) 796 struct trapframe *frame; 797 int usermode; 798 vm_offset_t eva; 799{ 800 vm_offset_t va; 801 struct vmspace *vm = NULL; 802 vm_map_t map = 0; 803 int rv = 0; 804 vm_prot_t ftype; 805 struct proc *p = curproc; 806 807 va = trunc_page(eva); 808 if (va >= KERNBASE) { 809 /* 810 * Don't allow user-mode faults in kernel address space. 811 * An exception: if the faulting address is the invalid 812 * instruction entry in the IDT, then the Intel Pentium 813 * F00F bug workaround was triggered, and we need to 814 * treat it is as an illegal instruction, and not a page 815 * fault. 816 */ 817#if defined(I586_CPU) && !defined(NO_F00F_HACK) 818 if ((eva == (unsigned int)&idt[6]) && has_f00f_bug) 819 return -2; 820#endif 821 if (usermode) 822 goto nogo; 823 824 map = kernel_map; 825 } else { 826 /* 827 * This is a fault on non-kernel virtual memory. 828 * vm is initialized above to NULL. If curproc is NULL 829 * or curproc->p_vmspace is NULL the fault is fatal. 830 */ 831 if (p != NULL) 832 vm = p->p_vmspace; 833 834 if (vm == NULL) 835 goto nogo; 836 837 map = &vm->vm_map; 838 } 839 840 if (frame->tf_err & PGEX_W) 841 ftype = VM_PROT_WRITE; 842 else 843 ftype = VM_PROT_READ; 844 845 if (map != kernel_map) { 846 /* 847 * Keep swapout from messing with us during this 848 * critical time. 849 */ 850 PROC_LOCK(p); 851 ++p->p_lock; 852 PROC_UNLOCK(p); 853 854 /* 855 * Grow the stack if necessary 856 */ 857 /* grow_stack returns false only if va falls into 858 * a growable stack region and the stack growth 859 * fails. It returns true if va was not within 860 * a growable stack region, or if the stack 861 * growth succeeded. 862 */ 863 if (!grow_stack (p, va)) 864 rv = KERN_FAILURE; 865 else 866 /* Fault in the user page: */ 867 rv = vm_fault(map, va, ftype, 868 (ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY 869 : VM_FAULT_NORMAL); 870 871 PROC_LOCK(p); 872 --p->p_lock; 873 PROC_UNLOCK(p); 874 } else { 875 /* 876 * Don't have to worry about process locking or stacks in the 877 * kernel. 878 */ 879 rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL); 880 } 881 882 if (rv == KERN_SUCCESS) 883 return (0); 884nogo: 885 if (!usermode) { 886 if (p->p_intr_nesting_level == 0 && 887 PCPU_GET(curpcb) != NULL && 888 PCPU_GET(curpcb)->pcb_onfault != NULL) { 889 frame->tf_eip = (int)PCPU_GET(curpcb)->pcb_onfault; 890 return (0); 891 } 892 mtx_lock(&Giant); 893 trap_fatal(frame, eva); 894 mtx_unlock(&Giant); 895 return (-1); 896 } 897 898 /* kludge to pass faulting virtual address to sendsig */ 899 frame->tf_err = eva; 900 901 return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV); 902} 903 904static void 905trap_fatal(frame, eva) 906 struct trapframe *frame; 907 vm_offset_t eva; 908{ 909 int code, type, ss, esp; 910 struct soft_segment_descriptor softseg; 911 912 code = frame->tf_err; 913 type = frame->tf_trapno; 914 sdtossd(&gdt[IDXSEL(frame->tf_cs & 0xffff)].sd, &softseg); 915 916 if (type <= MAX_TRAP_MSG) 917 printf("\n\nFatal trap %d: %s while in %s mode\n", 918 type, trap_msg[type], 919 frame->tf_eflags & PSL_VM ? "vm86" : 920 ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel"); 921#ifdef SMP 922 /* two separate prints in case of a trap on an unmapped page */ 923 printf("cpuid = %d; ", PCPU_GET(cpuid)); 924 printf("lapic.id = %08x\n", lapic.id); 925#endif 926 if (type == T_PAGEFLT) { 927 printf("fault virtual address = 0x%x\n", eva); 928 printf("fault code = %s %s, %s\n", 929 code & PGEX_U ? "user" : "supervisor", 930 code & PGEX_W ? "write" : "read", 931 code & PGEX_P ? "protection violation" : "page not present"); 932 } 933 printf("instruction pointer = 0x%x:0x%x\n", 934 frame->tf_cs & 0xffff, frame->tf_eip); 935 if ((ISPL(frame->tf_cs) == SEL_UPL) || (frame->tf_eflags & PSL_VM)) { 936 ss = frame->tf_ss & 0xffff; 937 esp = frame->tf_esp; 938 } else { 939 ss = GSEL(GDATA_SEL, SEL_KPL); 940 esp = (int)&frame->tf_esp; 941 } 942 printf("stack pointer = 0x%x:0x%x\n", ss, esp); 943 printf("frame pointer = 0x%x:0x%x\n", ss, frame->tf_ebp); 944 printf("code segment = base 0x%x, limit 0x%x, type 0x%x\n", 945 softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type); 946 printf(" = DPL %d, pres %d, def32 %d, gran %d\n", 947 softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_def32, 948 softseg.ssd_gran); 949 printf("processor eflags = "); 950 if (frame->tf_eflags & PSL_T) 951 printf("trace trap, "); 952 if (frame->tf_eflags & PSL_I) 953 printf("interrupt enabled, "); 954 if (frame->tf_eflags & PSL_NT) 955 printf("nested task, "); 956 if (frame->tf_eflags & PSL_RF) 957 printf("resume, "); 958 if (frame->tf_eflags & PSL_VM) 959 printf("vm86, "); 960 printf("IOPL = %d\n", (frame->tf_eflags & PSL_IOPL) >> 12); 961 printf("current process = "); 962 if (curproc) { 963 printf("%lu (%s)\n", 964 (u_long)curproc->p_pid, curproc->p_comm ? 965 curproc->p_comm : ""); 966 } else { 967 printf("Idle\n"); 968 } 969 970#ifdef KDB 971 if (kdb_trap(&psl)) 972 return; 973#endif 974#ifdef DDB 975 if ((debugger_on_panic || db_active) && kdb_trap(type, 0, frame)) 976 return; 977#endif 978 printf("trap number = %d\n", type); 979 if (type <= MAX_TRAP_MSG) 980 panic(trap_msg[type]); 981 else 982 panic("unknown/reserved trap"); 983} 984 985/* 986 * Double fault handler. Called when a fault occurs while writing 987 * a frame for a trap/exception onto the stack. This usually occurs 988 * when the stack overflows (such is the case with infinite recursion, 989 * for example). 990 * 991 * XXX Note that the current PTD gets replaced by IdlePTD when the 992 * task switch occurs. This means that the stack that was active at 993 * the time of the double fault is not available at <kstack> unless 994 * the machine was idle when the double fault occurred. The downside 995 * of this is that "trace <ebp>" in ddb won't work. 996 */ 997void 998dblfault_handler() 999{ 1000 printf("\nFatal double fault:\n"); 1001 printf("eip = 0x%x\n", PCPU_GET(common_tss.tss_eip)); 1002 printf("esp = 0x%x\n", PCPU_GET(common_tss.tss_esp)); 1003 printf("ebp = 0x%x\n", PCPU_GET(common_tss.tss_ebp)); 1004#ifdef SMP 1005 /* two separate prints in case of a trap on an unmapped page */ 1006 printf("cpuid = %d; ", PCPU_GET(cpuid)); 1007 printf("lapic.id = %08x\n", lapic.id); 1008#endif 1009 panic("double fault"); 1010} 1011 1012/* 1013 * Compensate for 386 brain damage (missing URKR). 1014 * This is a little simpler than the pagefault handler in trap() because 1015 * it the page tables have already been faulted in and high addresses 1016 * are thrown out early for other reasons. 1017 */ 1018int trapwrite(addr) 1019 unsigned addr; 1020{ 1021 struct proc *p; 1022 vm_offset_t va; 1023 struct vmspace *vm; 1024 int rv; 1025 1026 va = trunc_page((vm_offset_t)addr); 1027 /* 1028 * XXX - MAX is END. Changed > to >= for temp. fix. 1029 */ 1030 if (va >= VM_MAXUSER_ADDRESS) 1031 return (1); 1032 1033 p = curproc; 1034 vm = p->p_vmspace; 1035 1036 PROC_LOCK(p); 1037 ++p->p_lock; 1038 PROC_UNLOCK(p); 1039 1040 if (!grow_stack (p, va)) 1041 rv = KERN_FAILURE; 1042 else 1043 /* 1044 * fault the data page 1045 */ 1046 rv = vm_fault(&vm->vm_map, va, VM_PROT_WRITE, VM_FAULT_DIRTY); 1047 1048 PROC_LOCK(p); 1049 --p->p_lock; 1050 PROC_UNLOCK(p); 1051 1052 if (rv != KERN_SUCCESS) 1053 return 1; 1054 1055 return (0); 1056} 1057 1058/* 1059 * syscall - MP aware system call request C handler 1060 * 1061 * A system call is essentially treated as a trap except that the 1062 * MP lock is not held on entry or return. We are responsible for 1063 * obtaining the MP lock if necessary and for handling ASTs 1064 * (e.g. a task switch) prior to return. 1065 * 1066 * In general, only simple access and manipulation of curproc and 1067 * the current stack is allowed without having to hold MP lock. 1068 */ 1069void 1070syscall(frame) 1071 struct trapframe frame; 1072{ 1073 caddr_t params; 1074 int i; 1075 struct sysent *callp; 1076 struct proc *p = curproc; 1077 u_quad_t sticks; 1078 int error; 1079 int narg; 1080 int args[8]; 1081 u_int code; 1082 1083 atomic_add_int(&cnt.v_syscall, 1); 1084 1085#ifdef DIAGNOSTIC 1086 if (ISPL(frame.tf_cs) != SEL_UPL) { 1087 mtx_lock(&Giant); 1088 panic("syscall"); 1089 /* NOT REACHED */ 1090 } 1091#endif 1092 1093 mtx_lock_spin(&sched_lock); 1094 sticks = p->p_sticks; 1095 mtx_unlock_spin(&sched_lock); 1096 1097 p->p_md.md_regs = &frame; 1098 params = (caddr_t)frame.tf_esp + sizeof(int); 1099 code = frame.tf_eax; 1100 1101 if (p->p_sysent->sv_prepsyscall) { 1102 /* 1103 * The prep code is not MP aware. 1104 */ 1105 mtx_lock(&Giant); 1106 (*p->p_sysent->sv_prepsyscall)(&frame, args, &code, ¶ms); 1107 mtx_unlock(&Giant); 1108 } else { 1109 /* 1110 * Need to check if this is a 32 bit or 64 bit syscall. 1111 * fuword is MP aware. 1112 */ 1113 if (code == SYS_syscall) { 1114 /* 1115 * Code is first argument, followed by actual args. 1116 */ 1117 code = fuword(params); 1118 params += sizeof(int); 1119 } else if (code == SYS___syscall) { 1120 /* 1121 * Like syscall, but code is a quad, so as to maintain 1122 * quad alignment for the rest of the arguments. 1123 */ 1124 code = fuword(params); 1125 params += sizeof(quad_t); 1126 } 1127 } 1128 1129 if (p->p_sysent->sv_mask) 1130 code &= p->p_sysent->sv_mask; 1131 1132 if (code >= p->p_sysent->sv_size) 1133 callp = &p->p_sysent->sv_table[0]; 1134 else 1135 callp = &p->p_sysent->sv_table[code]; 1136 1137 narg = callp->sy_narg & SYF_ARGMASK; 1138 1139 /* 1140 * copyin is MP aware, but the tracing code is not 1141 */ 1142 if (params && (i = narg * sizeof(int)) && 1143 (error = copyin(params, (caddr_t)args, (u_int)i))) { 1144 mtx_lock(&Giant); 1145#ifdef KTRACE 1146 if (KTRPOINT(p, KTR_SYSCALL)) 1147 ktrsyscall(p->p_tracep, code, narg, args); 1148#endif 1149 goto bad; 1150 } 1151 1152 /* 1153 * Try to run the syscall without the MP lock if the syscall 1154 * is MP safe. 1155 */ 1156 if ((callp->sy_narg & SYF_MPSAFE) == 0) { 1157 mtx_lock(&Giant); 1158 } 1159 1160#ifdef KTRACE 1161 /* 1162 * We have to obtain the MP lock no matter what if 1163 * we are ktracing 1164 */ 1165 if (KTRPOINT(p, KTR_SYSCALL)) { 1166 if (!mtx_owned(&Giant)) 1167 mtx_lock(&Giant); 1168 ktrsyscall(p->p_tracep, code, narg, args); 1169 } 1170#endif 1171 p->p_retval[0] = 0; 1172 p->p_retval[1] = frame.tf_edx; 1173 1174 STOPEVENT(p, S_SCE, narg); /* MP aware */ 1175 1176 error = (*callp->sy_call)(p, args); 1177 1178 /* 1179 * MP SAFE (we may or may not have the MP lock at this point) 1180 */ 1181 switch (error) { 1182 case 0: 1183 frame.tf_eax = p->p_retval[0]; 1184 frame.tf_edx = p->p_retval[1]; 1185 frame.tf_eflags &= ~PSL_C; 1186 break; 1187 1188 case ERESTART: 1189 /* 1190 * Reconstruct pc, assuming lcall $X,y is 7 bytes, 1191 * int 0x80 is 2 bytes. We saved this in tf_err. 1192 */ 1193 frame.tf_eip -= frame.tf_err; 1194 break; 1195 1196 case EJUSTRETURN: 1197 break; 1198 1199 default: 1200bad: 1201 if (p->p_sysent->sv_errsize) { 1202 if (error >= p->p_sysent->sv_errsize) 1203 error = -1; /* XXX */ 1204 else 1205 error = p->p_sysent->sv_errtbl[error]; 1206 } 1207 frame.tf_eax = error; 1208 frame.tf_eflags |= PSL_C; 1209 break; 1210 } 1211 1212 /* 1213 * Traced syscall. trapsignal() is not MP aware. 1214 */ 1215 if ((frame.tf_eflags & PSL_T) && !(frame.tf_eflags & PSL_VM)) { 1216 if (!mtx_owned(&Giant)) 1217 mtx_lock(&Giant); 1218 frame.tf_eflags &= ~PSL_T; 1219 trapsignal(p, SIGTRAP, 0); 1220 } 1221 1222 /* 1223 * Handle reschedule and other end-of-syscall issues 1224 */ 1225 userret(p, &frame, sticks); 1226 1227#ifdef KTRACE 1228 if (KTRPOINT(p, KTR_SYSRET)) { 1229 if (!mtx_owned(&Giant)) 1230 mtx_lock(&Giant); 1231 ktrsysret(p->p_tracep, code, error, p->p_retval[0]); 1232 } 1233#endif 1234 1235 /* 1236 * Release Giant if we had to get it 1237 */ 1238 if (mtx_owned(&Giant)) 1239 mtx_unlock(&Giant); 1240 1241 /* 1242 * This works because errno is findable through the 1243 * register set. If we ever support an emulation where this 1244 * is not the case, this code will need to be revisited. 1245 */ 1246 STOPEVENT(p, S_SCX, code); 1247 1248#ifdef WITNESS 1249 if (witness_list(p)) { 1250 panic("system call %s returning with mutex(s) held\n", 1251 syscallnames[code]); 1252 } 1253#endif 1254 mtx_assert(&sched_lock, MA_NOTOWNED); 1255 mtx_assert(&Giant, MA_NOTOWNED); 1256} 1257 1258void 1259ast(framep) 1260 struct trapframe *framep; 1261{ 1262 struct proc *p = CURPROC; 1263 u_quad_t sticks; 1264#if defined(DEV_NPX) && !defined(SMP) 1265 int ucode; 1266#endif 1267 1268 KASSERT(TRAPF_USERMODE(framep), ("ast in kernel mode")); 1269 1270 /* 1271 * We check for a pending AST here rather than in the assembly as 1272 * acquiring and releasing mutexes in assembly is not fun. 1273 */ 1274 mtx_lock_spin(&sched_lock); 1275 if (!(astpending(p) || resched_wanted(p))) { 1276 mtx_unlock_spin(&sched_lock); 1277 return; 1278 } 1279 1280 sticks = p->p_sticks; 1281 p->p_md.md_regs = framep; 1282 1283 astoff(p); 1284 cnt.v_soft++; 1285 mtx_intr_enable(&sched_lock); 1286 if (p->p_sflag & PS_OWEUPC) { 1287 p->p_sflag &= ~PS_OWEUPC; 1288 mtx_unlock_spin(&sched_lock); 1289 mtx_lock(&Giant); 1290 mtx_lock_spin(&sched_lock); 1291 addupc_task(p, p->p_stats->p_prof.pr_addr, 1292 p->p_stats->p_prof.pr_ticks); 1293 } 1294 if (p->p_sflag & PS_ALRMPEND) { 1295 p->p_sflag &= ~PS_ALRMPEND; 1296 mtx_unlock_spin(&sched_lock); 1297 PROC_LOCK(p); 1298 psignal(p, SIGVTALRM); 1299 PROC_UNLOCK(p); 1300 mtx_lock_spin(&sched_lock); 1301 } 1302#if defined(DEV_NPX) && !defined(SMP) 1303 if (PCPU_GET(curpcb)->pcb_flags & PCB_NPXTRAP) { 1304 PCPU_GET(curpcb)->pcb_flags &= ~PCB_NPXTRAP; 1305 mtx_unlock_spin(&sched_lock); 1306 ucode = npxtrap(); 1307 if (ucode != -1) { 1308 if (!mtx_owned(&Giant)) 1309 mtx_lock(&Giant); 1310 trapsignal(p, SIGFPE, ucode); 1311 } 1312 mtx_lock_spin(&sched_lock); 1313 } 1314#endif 1315 if (p->p_sflag & PS_PROFPEND) { 1316 p->p_sflag &= ~PS_PROFPEND; 1317 mtx_unlock_spin(&sched_lock); 1318 PROC_LOCK(p); 1319 psignal(p, SIGPROF); 1320 PROC_UNLOCK(p); 1321 } else 1322 mtx_unlock_spin(&sched_lock); 1323 1324 userret(p, framep, sticks); 1325 1326 if (mtx_owned(&Giant)) 1327 mtx_unlock(&Giant); 1328} 1329