subr_trap.c revision 77097 (filename per the embedded $FreeBSD: head/sys/kern/subr_trap.c $ ident; the historical "subr_syscall.c" label is incorrect)
/*-
 * Copyright (C) 1994, David Greenman
 * Copyright (c) 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the University of Utah, and William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)trap.c	7.4 (Berkeley) 5/13/91
 * $FreeBSD: head/sys/kern/subr_trap.c 77097 2001-05-23 22:58:09Z jhb $
 */

/*
 * 386 Trap and System call handling
 */

#include "opt_clock.h"
#include "opt_cpu.h"
#include "opt_ddb.h"
#include "opt_isa.h"
#include "opt_ktrace.h"
#include "opt_npx.h"
#include "opt_trap.h"

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/pioctl.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/mutex.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/syscall.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/uio.h>
#include <sys/vmmeter.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <sys/lock.h>
#include <vm/pmap.h>
#include <vm/vm_kern.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_extern.h>

#include <machine/cpu.h>
#include <machine/md_var.h>
#include <machine/pcb.h>
#ifdef SMP
#include <machine/smp.h>
#endif
#include <machine/tss.h>

#include <i386/isa/icu.h>
#include <i386/isa/intr_machdep.h>

#ifdef POWERFAIL_NMI
#include <sys/syslog.h>
#include <machine/clock.h>
#endif

#include <machine/vm86.h>

#include <ddb/ddb.h>

#include <sys/sysctl.h>

/*
 * Optional FPU emulator hook; when non-NULL it is invoked for T_DNA traps
 * that the hardware npx code does not handle.
 */
int (*pmath_emulate) __P((struct trapframe *));

extern void trap __P((struct trapframe frame));
extern int trapwrite __P((unsigned addr));
extern void syscall __P((struct trapframe frame));
extern void ast __P((struct trapframe *framep));

static int trap_pfault __P((struct trapframe *, int, vm_offset_t));
static void trap_fatal __P((struct trapframe *, vm_offset_t));
void dblfault_handler __P((void));

extern inthand_t IDTVEC(lcall_syscall);

/* Human-readable names for trap numbers, indexed by T_* values. */
#define MAX_TRAP_MSG		28
static char *trap_msg[] = {
	"",					/*  0 unused */
	"privileged instruction fault",		/*  1 T_PRIVINFLT */
	"",					/*  2 unused */
	"breakpoint instruction fault",		/*  3 T_BPTFLT */
	"",					/*  4 unused */
	"",					/*  5 unused */
	"arithmetic trap",			/*  6 T_ARITHTRAP */
	"",					/*  7 unused */
	"",					/*  8 unused */
	"general protection fault",		/*  9 T_PROTFLT */
	"trace trap",				/* 10 T_TRCTRAP */
	"",					/* 11 unused */
	"page fault",				/* 12 T_PAGEFLT */
	"",					/* 13 unused */
	"alignment fault",			/* 14 T_ALIGNFLT */
	"",					/* 15 unused */
	"",					/* 16 unused */
	"",					/* 17 unused */
	"integer divide fault",			/* 18 T_DIVIDE */
	"non-maskable interrupt trap",		/* 19 T_NMI */
	"overflow trap",			/* 20 T_OFLOW */
	"FPU bounds check fault",		/* 21 T_BOUND */
	"FPU device not available",		/* 22 T_DNA */
	"double fault",				/* 23 T_DOUBLEFLT */
	"FPU operand fetch fault",		/* 24 T_FPOPFLT */
	"invalid TSS fault",			/* 25 T_TSSFLT */
	"segment not present fault",		/* 26 T_SEGNPFLT */
	"stack fault",				/* 27 T_STKFLT */
	"machine check trap",			/* 28 T_MCHK */
};

#if defined(I586_CPU) && !defined(NO_F00F_HACK)
extern int has_f00f_bug;
#endif

#ifdef DDB
static int ddb_on_nmi = 1;
SYSCTL_INT(_machdep, OID_AUTO, ddb_on_nmi, CTLFLAG_RW,
	&ddb_on_nmi, 0, "Go to DDB on NMI");
#endif
static int panic_on_nmi = 1;
SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RW,
	&panic_on_nmi, 0, "Panic on NMI");

#ifdef WITNESS
extern char *syscallnames[];
#endif

/*
 * userret - common work performed before returning to user mode after a
 * trap, system call, or AST: deliver pending signals, honor a requested
 * reschedule by requeueing and switching, and charge system time to the
 * profile if the process is being profiled.
 *
 * p       - the current process
 * frame   - trapframe about to be returned through
 * oticks  - p->p_sticks as sampled at entry, used to compute the system
 *           time consumed while in the kernel
 */
void
userret(p, frame, oticks)
	struct proc *p;
	struct trapframe *frame;
	u_quad_t oticks;
{
	int sig;

	/* Take pending signals before going back to user mode. */
	while ((sig = CURSIG(p)) != 0)
		postsig(sig);

	mtx_lock_spin(&sched_lock);
	p->p_pri.pri_level = p->p_pri.pri_user;
	if (resched_wanted(p)) {
		/*
		 * Since we are curproc, clock will normally just change
		 * our priority without moving us from one queue to another
		 * (since the running process is not on a queue.)
		 * If that happened after we setrunqueue ourselves but before we
		 * mi_switch()'ed, we might not be on the queue indicated by
		 * our priority.
		 */
		DROP_GIANT_NOSWITCH();
		setrunqueue(p);
		p->p_stats->p_ru.ru_nivcsw++;
		mi_switch();
		mtx_unlock_spin(&sched_lock);
		PICKUP_GIANT();
		/* Signals may have arrived while we were switched out. */
		while ((sig = CURSIG(p)) != 0)
			postsig(sig);
		mtx_lock_spin(&sched_lock);
	}

	/*
	 * Charge system time if profiling.
	 */
	if (p->p_sflag & PS_PROFIL) {
		mtx_unlock_spin(&sched_lock);
		/* XXX - do we need Giant? */
		if (!mtx_owned(&Giant))
			mtx_lock(&Giant);
		addupc_task(p, TRAPF_PC(frame),
			    (u_int)(p->p_sticks - oticks) * psratio);
	} else
		mtx_unlock_spin(&sched_lock);
}

/*
 * Exception, fault, and trap interface to the FreeBSD kernel.
 * This common code is called from assembly language IDT gate entry
 * routines that prepare a suitable stack frame, and restore this
 * frame after the exception has been processed.
 *
 * Traps from user mode (or vm86 mode outside of a vm86 kernel call) are
 * translated into a signal delivered via trapsignal(); traps from kernel
 * mode are either fixed up in place (e.g. doreti faults, pcb_onfault) or
 * are fatal and end in trap_fatal().
 */

void
trap(frame)
	struct trapframe frame;
{
	struct proc *p = curproc;
	u_quad_t sticks = 0;
	int i = 0, ucode = 0, type, code;
	vm_offset_t eva;
#ifdef POWERFAIL_NMI
	static int lastalert = 0;
#endif

	atomic_add_int(&cnt.v_trap, 1);

	if ((frame.tf_eflags & PSL_I) == 0) {
		/*
		 * Buggy application or kernel code has disabled
		 * interrupts and then trapped.  Enabling interrupts
		 * now is wrong, but it is better than running with
		 * interrupts disabled until they are accidentally
		 * enabled later.  XXX This is really bad if we trap
		 * while holding a spin lock.
		 */
		type = frame.tf_trapno;
		if (ISPL(frame.tf_cs) == SEL_UPL || (frame.tf_eflags & PSL_VM))
			printf(
			    "pid %ld (%s): trap %d with interrupts disabled\n",
			    (long)curproc->p_pid, curproc->p_comm, type);
		else if (type != T_BPTFLT && type != T_TRCTRAP) {
			/*
			 * XXX not quite right, since this may be for a
			 * multiple fault in user mode.
			 */
			printf("kernel trap %d with interrupts disabled\n",
			    type);
			/*
			 * We should walk p_heldmtx here and see if any are
			 * spin mutexes, and not do this if so.
			 */
			enable_intr();
		}
	}

	eva = 0;

#if defined(I586_CPU) && !defined(NO_F00F_HACK)
restart:
#endif

	type = frame.tf_trapno;
	code = frame.tf_err;

	if ((ISPL(frame.tf_cs) == SEL_UPL) ||
	    ((frame.tf_eflags & PSL_VM) && !in_vm86call)) {
		/* user trap */

		mtx_lock_spin(&sched_lock);
		sticks = p->p_sticks;
		mtx_unlock_spin(&sched_lock);
		p->p_md.md_regs = &frame;

		switch (type) {
		case T_PRIVINFLT:	/* privileged instruction fault */
			ucode = type;
			i = SIGILL;
			break;

		case T_BPTFLT:		/* bpt instruction fault */
		case T_TRCTRAP:		/* trace trap */
			frame.tf_eflags &= ~PSL_T;
			i = SIGTRAP;
			break;

		case T_ARITHTRAP:	/* arithmetic trap */
#ifdef DEV_NPX
			/* npxtrap() returns -1 when no signal is needed. */
			ucode = npxtrap();
			if (ucode == -1)
				return;
#else
			ucode = code;
#endif
			i = SIGFPE;
			break;

			/*
			 * The following two traps can happen in
			 * vm86 mode, and, if so, we want to handle
			 * them specially.
			 */
		case T_PROTFLT:		/* general protection fault */
		case T_STKFLT:		/* stack fault */
			if (frame.tf_eflags & PSL_VM) {
				mtx_lock(&Giant);
				i = vm86_emulate((struct vm86frame *)&frame);
				mtx_unlock(&Giant);
				/* Emulation succeeded: no signal, just return. */
				if (i == 0)
					goto user;
				break;
			}
			/* FALL THROUGH */

		case T_SEGNPFLT:	/* segment not present fault */
		case T_TSSFLT:		/* invalid TSS fault */
		case T_DOUBLEFLT:	/* double fault */
		default:
			ucode = code + BUS_SEGM_FAULT ;
			i = SIGBUS;
			break;

		case T_PAGEFLT:		/* page fault */
			/*
			 * For some Cyrix CPUs, %cr2 is clobbered by
			 * interrupts.  This problem is worked around by using
			 * an interrupt gate for the pagefault handler.  We
			 * are finally ready to read %cr2 and then must
			 * reenable interrupts.
			 */
			eva = rcr2();
			enable_intr();
			i = trap_pfault(&frame, TRUE, eva);
#if defined(I586_CPU) && !defined(NO_F00F_HACK)
			if (i == -2) {
				/*
				 * f00f hack workaround has triggered, treat
				 * as illegal instruction not page fault.
				 */
				frame.tf_trapno = T_PRIVINFLT;
				goto restart;
			}
#endif
			/* -1: already handled fatally; 0: resolved. */
			if (i == -1)
				goto out;
			if (i == 0)
				goto user;

			ucode = T_PAGEFLT;
			break;

		case T_DIVIDE:		/* integer divide fault */
			ucode = FPE_INTDIV;
			i = SIGFPE;
			break;

#ifdef DEV_ISA
		case T_NMI:
#ifdef POWERFAIL_NMI
#ifndef TIMER_FREQ
#define TIMER_FREQ	1193182
#endif
			mtx_lock(&Giant);
			/* Rate-limit the power-fail alert to one per 10s. */
			if (time_second - lastalert > 10) {
				log(LOG_WARNING, "NMI: power fail\n");
				sysbeep(TIMER_FREQ/880, hz);
				lastalert = time_second;
			}
			mtx_unlock(&Giant);
			goto out;
#else /* !POWERFAIL_NMI */
			/* machine/parity/power fail/"kitchen sink" faults */
			/* XXX Giant */
			if (isa_nmi(code) == 0) {
#ifdef DDB
				/*
				 * NMI can be hooked up to a pushbutton
				 * for debugging.
				 */
				if (ddb_on_nmi) {
					printf ("NMI ... going to debugger\n");
					kdb_trap (type, 0, &frame);
				}
#endif /* DDB */
				goto out;
			} else if (panic_on_nmi)
				panic("NMI indicates hardware failure");
			break;
#endif /* POWERFAIL_NMI */
#endif /* DEV_ISA */

		case T_OFLOW:		/* integer overflow fault */
			ucode = FPE_INTOVF;
			i = SIGFPE;
			break;

		case T_BOUND:		/* bounds check fault */
			ucode = FPE_FLTSUB;
			i = SIGFPE;
			break;

		case T_DNA:
#ifdef DEV_NPX
			/* transparent fault (due to context switch "late") */
			if (npxdna())
				goto out;
#endif
			if (!pmath_emulate) {
				i = SIGFPE;
				ucode = FPE_FPU_NP_TRAP;
				break;
			}
			mtx_lock(&Giant);
			i = (*pmath_emulate)(&frame);
			mtx_unlock(&Giant);
			if (i == 0) {
				/* Emulation done; honor single-stepping. */
				if (!(frame.tf_eflags & PSL_T))
					goto out;
				frame.tf_eflags &= ~PSL_T;
				i = SIGTRAP;
			}
			/* else ucode = emulator_only_knows() XXX */
			break;

		case T_FPOPFLT:		/* FPU operand fetch fault */
			ucode = T_FPOPFLT;
			i = SIGILL;
			break;
		}
	} else {
		/* kernel trap */

		switch (type) {
		case T_PAGEFLT:			/* page fault */
			/*
			 * For some Cyrix CPUs, %cr2 is clobbered by
			 * interrupts.  This problem is worked around by using
			 * an interrupt gate for the pagefault handler.  We
			 * are finally ready to read %cr2 and then must
			 * reenable interrupts.
			 */
			eva = rcr2();
			enable_intr();
			(void) trap_pfault(&frame, FALSE, eva);
			goto out;

		case T_DNA:
#ifdef DEV_NPX
			/*
			 * The kernel is apparently using npx for copying.
			 * XXX this should be fatal unless the kernel has
			 * registered such use.
			 */
			if (npxdna())
				goto out;
#endif
			break;

			/*
			 * The following two traps can happen in
			 * vm86 mode, and, if so, we want to handle
			 * them specially.
			 */
		case T_PROTFLT:		/* general protection fault */
		case T_STKFLT:		/* stack fault */
			if (frame.tf_eflags & PSL_VM) {
				mtx_lock(&Giant);
				i = vm86_emulate((struct vm86frame *)&frame);
				mtx_unlock(&Giant);
				if (i != 0)
					/*
					 * returns to original process
					 */
					vm86_trap((struct vm86frame *)&frame);
				goto out;
			}
			if (type == T_STKFLT)
				break;

			/* FALL THROUGH */

		case T_SEGNPFLT:	/* segment not present fault */
			if (in_vm86call)
				break;

			if (p->p_intr_nesting_level != 0)
				break;

			/*
			 * Invalid %fs's and %gs's can be created using
			 * procfs or PT_SETREGS or by invalidating the
			 * underlying LDT entry.  This causes a fault
			 * in kernel mode when the kernel attempts to
			 * switch contexts.  Lose the bad context
			 * (XXX) so that we can continue, and generate
			 * a signal.
			 */
			if (frame.tf_eip == (int)cpu_switch_load_gs) {
				PCPU_GET(curpcb)->pcb_gs = 0;
				PROC_LOCK(p);
				psignal(p, SIGBUS);
				PROC_UNLOCK(p);
				goto out;
			}

			/*
			 * Invalid segment selectors and out of bounds
			 * %eip's and %esp's can be set up in user mode.
			 * This causes a fault in kernel mode when the
			 * kernel tries to return to user mode.  We want
			 * to get this fault so that we can fix the
			 * problem here and not have to check all the
			 * selectors and pointers when the user changes
			 * them.
			 */
			if (frame.tf_eip == (int)doreti_iret) {
				frame.tf_eip = (int)doreti_iret_fault;
				goto out;
			}
			if (frame.tf_eip == (int)doreti_popl_ds) {
				frame.tf_eip = (int)doreti_popl_ds_fault;
				goto out;
			}
			if (frame.tf_eip == (int)doreti_popl_es) {
				frame.tf_eip = (int)doreti_popl_es_fault;
				goto out;
			}
			if (frame.tf_eip == (int)doreti_popl_fs) {
				frame.tf_eip = (int)doreti_popl_fs_fault;
				goto out;
			}
			/* Generic copyin/copyout-style fault recovery. */
			if (PCPU_GET(curpcb) != NULL &&
			    PCPU_GET(curpcb)->pcb_onfault != NULL) {
				frame.tf_eip =
				    (int)PCPU_GET(curpcb)->pcb_onfault;
				goto out;
			}
			break;

		case T_TSSFLT:
			/*
			 * PSL_NT can be set in user mode and isn't cleared
			 * automatically when the kernel is entered.  This
			 * causes a TSS fault when the kernel attempts to
			 * `iret' because the TSS link is uninitialized.  We
			 * want to get this fault so that we can fix the
			 * problem here and not every time the kernel is
			 * entered.
			 */
			if (frame.tf_eflags & PSL_NT) {
				frame.tf_eflags &= ~PSL_NT;
				goto out;
			}
			break;

		case T_TRCTRAP:	 /* trace trap */
			if (frame.tf_eip == (int)IDTVEC(lcall_syscall)) {
				/*
				 * We've just entered system mode via the
				 * syscall lcall.  Continue single stepping
				 * silently until the syscall handler has
				 * saved the flags.
				 */
				goto out;
			}
			if (frame.tf_eip == (int)IDTVEC(lcall_syscall) + 1) {
				/*
				 * The syscall handler has now saved the
				 * flags.  Stop single stepping it.
				 */
				frame.tf_eflags &= ~PSL_T;
				goto out;
			}
			/*
			 * Ignore debug register trace traps due to
			 * accesses in the user's address space, which
			 * can happen under several conditions such as
			 * if a user sets a watchpoint on a buffer and
			 * then passes that buffer to a system call.
			 * We still want to get TRCTRAPS for addresses
			 * in kernel space because that is useful when
			 * debugging the kernel.
			 */
			/* XXX Giant */
			if (user_dbreg_trap() && !in_vm86call) {
				/*
				 * Reset breakpoint bits because the
				 * processor doesn't
				 */
				load_dr6(rdr6() & 0xfffffff0);
				goto out;
			}
			/*
			 * Fall through (TRCTRAP kernel mode, kernel address)
			 */
		case T_BPTFLT:
			/*
			 * If DDB is enabled, let it handle the debugger trap.
			 * Otherwise, debugger traps "can't happen".
			 */
#ifdef DDB
			/* XXX Giant */
			if (kdb_trap (type, 0, &frame))
				goto out;
#endif
			break;

#ifdef DEV_ISA
		case T_NMI:
#ifdef POWERFAIL_NMI
			mtx_lock(&Giant);
			/* Rate-limit the power-fail alert to one per 10s. */
			if (time_second - lastalert > 10) {
				log(LOG_WARNING, "NMI: power fail\n");
				sysbeep(TIMER_FREQ/880, hz);
				lastalert = time_second;
			}
			mtx_unlock(&Giant);
			goto out;
#else /* !POWERFAIL_NMI */
			/* XXX Giant */
			/* machine/parity/power fail/"kitchen sink" faults */
			if (isa_nmi(code) == 0) {
#ifdef DDB
				/*
				 * NMI can be hooked up to a pushbutton
				 * for debugging.
				 */
				if (ddb_on_nmi) {
					printf ("NMI ... going to debugger\n");
					kdb_trap (type, 0, &frame);
				}
#endif /* DDB */
				goto out;
			} else if (panic_on_nmi == 0)
				goto out;
			/* FALL THROUGH */
#endif /* POWERFAIL_NMI */
#endif /* DEV_ISA */
		}

		/* Unhandled kernel-mode trap: no recovery possible. */
		trap_fatal(&frame, eva);
		goto out;
	}

	mtx_lock(&Giant);
	/* Translate fault for emulators (e.g. Linux) */
	if (*p->p_sysent->sv_transtrap)
		i = (*p->p_sysent->sv_transtrap)(i, type);

	trapsignal(p, i, ucode);

#ifdef DEBUG
	if (type <= MAX_TRAP_MSG) {
		uprintf("fatal process exception: %s",
			trap_msg[type]);
		if ((type == T_PAGEFLT) || (type == T_PROTFLT))
			uprintf(", fault VA = 0x%lx", (u_long)eva);
		uprintf("\n");
	}
#endif
	mtx_unlock(&Giant);

user:
	userret(p, &frame, sticks);
	if (mtx_owned(&Giant))
		mtx_unlock(&Giant);
out:
	return;
}

#ifdef notyet
/*
 * This version doesn't allow a page fault to user space while
 * in the kernel. The rest of the kernel needs to be made "safe"
 * before this can be used. I think the only things remaining
 * to be made safe are the iBCS2 code and the process tracing/
 * debugging code.
 */
static int
trap_pfault(frame, usermode, eva)
	struct trapframe *frame;
	int usermode;
	vm_offset_t eva;
{
	vm_offset_t va;
	struct vmspace *vm = NULL;
	vm_map_t map = 0;
	int rv = 0;
	vm_prot_t ftype;
	struct proc *p = curproc;

	if (frame->tf_err & PGEX_W)
		ftype = VM_PROT_WRITE;
	else
		ftype = VM_PROT_READ;

	va = trunc_page(eva);
	if (va < VM_MIN_KERNEL_ADDRESS) {
		vm_offset_t v;
		vm_page_t mpte;

		if (p == NULL ||
		    (!usermode && va < VM_MAXUSER_ADDRESS &&
		     (p->p_intr_nesting_level != 0 ||
		      PCPU_GET(curpcb) == NULL ||
		      PCPU_GET(curpcb)->pcb_onfault == NULL))) {
			trap_fatal(frame, eva);
			return (-1);
		}

		/*
		 * This is a fault on non-kernel virtual memory.
		 * vm is initialized above to NULL. If curproc is NULL
		 * or curproc->p_vmspace is NULL the fault is fatal.
		 */
		vm = p->p_vmspace;
		if (vm == NULL)
			goto nogo;

		map = &vm->vm_map;

		/*
		 * Keep swapout from messing with us during this
		 * critical time.
		 */
		PROC_LOCK(p);
		++p->p_lock;
		PROC_UNLOCK(p);

		/*
		 * Grow the stack if necessary
		 */
		/* grow_stack returns false only if va falls into
		 * a growable stack region and the stack growth
		 * fails.  It returns true if va was not within
		 * a growable stack region, or if the stack
		 * growth succeeded.
		 */
		if (!grow_stack (p, va))
			rv = KERN_FAILURE;
		else
			/* Fault in the user page: */
			rv = vm_fault(map, va, ftype,
			    (ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY
						    : VM_FAULT_NORMAL);

		PROC_LOCK(p);
		--p->p_lock;
		PROC_UNLOCK(p);
	} else {
		/*
		 * Don't allow user-mode faults in kernel address space.
		 */
		if (usermode)
			goto nogo;

		/*
		 * Since we know that kernel virtual address addresses
		 * always have pte pages mapped, we just have to fault
		 * the page.
		 */
		rv = vm_fault(kernel_map, va, ftype, VM_FAULT_NORMAL);
	}

	if (rv == KERN_SUCCESS)
		return (0);
nogo:
	if (!usermode) {
		if (p->p_intr_nesting_level == 0 &&
		    PCPU_GET(curpcb) != NULL &&
		    PCPU_GET(curpcb)->pcb_onfault != NULL) {
			frame->tf_eip = (int)PCPU_GET(curpcb)->pcb_onfault;
			return (0);
		}
		trap_fatal(frame, eva);
		return (-1);
	}

	/* kludge to pass faulting virtual address to sendsig */
	frame->tf_err = eva;

	return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV);
}
#endif

/*
 * trap_pfault - resolve a page fault at virtual address eva.
 *
 * Returns:
 *	 0  fault resolved (or recovered via pcb_onfault in kernel mode)
 *	-1  fatal fault; trap_fatal() was already called
 *	-2  Pentium f00f workaround triggered (I586_CPU && !NO_F00F_HACK);
 *	    caller should re-dispatch as an illegal instruction
 *	otherwise a signal number (SIGBUS/SIGSEGV) to deliver, with the
 *	faulting address stashed in frame->tf_err for sendsig.
 */
int
trap_pfault(frame, usermode, eva)
	struct trapframe *frame;
	int usermode;
	vm_offset_t eva;
{
	vm_offset_t va;
	struct vmspace *vm = NULL;
	vm_map_t map = 0;
	int rv = 0;
	vm_prot_t ftype;
	struct proc *p = curproc;

	va = trunc_page(eva);
	if (va >= KERNBASE) {
		/*
		 * Don't allow user-mode faults in kernel address space.
		 * An exception: if the faulting address is the invalid
		 * instruction entry in the IDT, then the Intel Pentium
		 * F00F bug workaround was triggered, and we need to
		 * treat it is as an illegal instruction, and not a page
		 * fault.
		 */
#if defined(I586_CPU) && !defined(NO_F00F_HACK)
		if ((eva == (unsigned int)&idt[6]) && has_f00f_bug)
			return -2;
#endif
		if (usermode)
			goto nogo;

		map = kernel_map;
	} else {
		/*
		 * This is a fault on non-kernel virtual memory.
		 * vm is initialized above to NULL. If curproc is NULL
		 * or curproc->p_vmspace is NULL the fault is fatal.
		 */
		if (p != NULL)
			vm = p->p_vmspace;

		if (vm == NULL)
			goto nogo;

		map = &vm->vm_map;
	}

	if (frame->tf_err & PGEX_W)
		ftype = VM_PROT_WRITE;
	else
		ftype = VM_PROT_READ;

	if (map != kernel_map) {
		/*
		 * Keep swapout from messing with us during this
		 * critical time.
		 */
		PROC_LOCK(p);
		++p->p_lock;
		PROC_UNLOCK(p);

		/*
		 * Grow the stack if necessary
		 */
		/* grow_stack returns false only if va falls into
		 * a growable stack region and the stack growth
		 * fails.  It returns true if va was not within
		 * a growable stack region, or if the stack
		 * growth succeeded.
		 */
		if (!grow_stack (p, va))
			rv = KERN_FAILURE;
		else
			/* Fault in the user page: */
			rv = vm_fault(map, va, ftype,
			    (ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY
						    : VM_FAULT_NORMAL);

		PROC_LOCK(p);
		--p->p_lock;
		PROC_UNLOCK(p);
	} else {
		/*
		 * Don't have to worry about process locking or stacks in the
		 * kernel.
		 */
		rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL);
	}

	if (rv == KERN_SUCCESS)
		return (0);
nogo:
	if (!usermode) {
		if (p->p_intr_nesting_level == 0 &&
		    PCPU_GET(curpcb) != NULL &&
		    PCPU_GET(curpcb)->pcb_onfault != NULL) {
			frame->tf_eip = (int)PCPU_GET(curpcb)->pcb_onfault;
			return (0);
		}
		trap_fatal(frame, eva);
		return (-1);
	}

	/* kludge to pass faulting virtual address to sendsig */
	frame->tf_err = eva;

	return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV);
}

/*
 * trap_fatal - report an unrecoverable trap and panic.  Dumps the trap
 * type, fault address/code (for page faults), register and segment state
 * from the trapframe, then enters the debugger if configured and finally
 * panics with the trap's message.  Does not return.
 */
static void
trap_fatal(frame, eva)
	struct trapframe *frame;
	vm_offset_t eva;
{
	int code, type, ss, esp;
	struct soft_segment_descriptor softseg;

	code = frame->tf_err;
	type = frame->tf_trapno;
	sdtossd(&gdt[IDXSEL(frame->tf_cs & 0xffff)].sd, &softseg);

	if (type <= MAX_TRAP_MSG)
		printf("\n\nFatal trap %d: %s while in %s mode\n",
			type, trap_msg[type],
			frame->tf_eflags & PSL_VM ? "vm86" :
			ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel");
#ifdef SMP
	/* two separate prints in case of a trap on an unmapped page */
	printf("cpuid = %d; ", PCPU_GET(cpuid));
	printf("lapic.id = %08x\n", lapic.id);
#endif
	if (type == T_PAGEFLT) {
		printf("fault virtual address = 0x%x\n", eva);
		printf("fault code = %s %s, %s\n",
			code & PGEX_U ? "user" : "supervisor",
			code & PGEX_W ? "write" : "read",
			code & PGEX_P ? "protection violation" : "page not present");
	}
	printf("instruction pointer = 0x%x:0x%x\n",
	       frame->tf_cs & 0xffff, frame->tf_eip);
	if ((ISPL(frame->tf_cs) == SEL_UPL) || (frame->tf_eflags & PSL_VM)) {
		ss = frame->tf_ss & 0xffff;
		esp = frame->tf_esp;
	} else {
		/* Kernel-mode frame: tf_ss/tf_esp were not pushed. */
		ss = GSEL(GDATA_SEL, SEL_KPL);
		esp = (int)&frame->tf_esp;
	}
	printf("stack pointer = 0x%x:0x%x\n", ss, esp);
	printf("frame pointer = 0x%x:0x%x\n", ss, frame->tf_ebp);
	printf("code segment = base 0x%x, limit 0x%x, type 0x%x\n",
	       softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type);
	printf(" = DPL %d, pres %d, def32 %d, gran %d\n",
	       softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_def32,
	       softseg.ssd_gran);
	printf("processor eflags = ");
	if (frame->tf_eflags & PSL_T)
		printf("trace trap, ");
	if (frame->tf_eflags & PSL_I)
		printf("interrupt enabled, ");
	if (frame->tf_eflags & PSL_NT)
		printf("nested task, ");
	if (frame->tf_eflags & PSL_RF)
		printf("resume, ");
	if (frame->tf_eflags & PSL_VM)
		printf("vm86, ");
	printf("IOPL = %d\n", (frame->tf_eflags & PSL_IOPL) >> 12);
	printf("current process = ");
	if (curproc) {
		printf("%lu (%s)\n",
		    (u_long)curproc->p_pid, curproc->p_comm ?
		    curproc->p_comm : "");
	} else {
		printf("Idle\n");
	}

#ifdef KDB
	/*
	 * NOTE(review): `psl' is not declared anywhere in this file and
	 * this kdb_trap() signature differs from the DDB one below; this
	 * #ifdef KDB block looks stale -- verify before building with KDB.
	 */
	if (kdb_trap(&psl))
		return;
#endif
#ifdef DDB
	if ((debugger_on_panic || db_active) && kdb_trap(type, 0, frame))
		return;
#endif
	printf("trap number = %d\n", type);
	if (type <= MAX_TRAP_MSG)
		panic(trap_msg[type]);
	else
		panic("unknown/reserved trap");
}

/*
 * Double fault handler. Called when a fault occurs while writing
 * a frame for a trap/exception onto the stack. This usually occurs
 * when the stack overflows (such is the case with infinite recursion,
 * for example).
 *
 * XXX Note that the current PTD gets replaced by IdlePTD when the
 * task switch occurs. This means that the stack that was active at
 * the time of the double fault is not available at <kstack> unless
 * the machine was idle when the double fault occurred. The downside
 * of this is that "trace <ebp>" in ddb won't work.
 */
void
dblfault_handler()
{
	printf("\nFatal double fault:\n");
	printf("eip = 0x%x\n", PCPU_GET(common_tss.tss_eip));
	printf("esp = 0x%x\n", PCPU_GET(common_tss.tss_esp));
	printf("ebp = 0x%x\n", PCPU_GET(common_tss.tss_ebp));
#ifdef SMP
	/* two separate prints in case of a trap on an unmapped page */
	printf("cpuid = %d; ", PCPU_GET(cpuid));
	printf("lapic.id = %08x\n", lapic.id);
#endif
	panic("double fault");
}

/*
 * Compensate for 386 brain damage (missing URKR).
 * This is a little simpler than the pagefault handler in trap() because
 * it the page tables have already been faulted in and high addresses
 * are thrown out early for other reasons.
 *
 * Returns 0 when the page at `addr' was successfully made writable,
 * 1 on failure (address out of range or vm_fault() failed).
 */
int trapwrite(addr)
	unsigned addr;
{
	struct proc *p;
	vm_offset_t va;
	struct vmspace *vm;
	int rv;

	va = trunc_page((vm_offset_t)addr);
	/*
	 * XXX - MAX is END.  Changed > to >= for temp. fix.
	 */
	if (va >= VM_MAXUSER_ADDRESS)
		return (1);

	p = curproc;
	vm = p->p_vmspace;

	/* Hold p_lock to keep swapout away while we fault. */
	PROC_LOCK(p);
	++p->p_lock;
	PROC_UNLOCK(p);

	if (!grow_stack (p, va))
		rv = KERN_FAILURE;
	else
		/*
		 * fault the data page
		 */
		rv = vm_fault(&vm->vm_map, va, VM_PROT_WRITE, VM_FAULT_DIRTY);

	PROC_LOCK(p);
	--p->p_lock;
	PROC_UNLOCK(p);

	if (rv != KERN_SUCCESS)
		return 1;

	return (0);
}

/*
 * syscall -	MP aware system call request C handler
 *
 * A system call is essentially treated as a trap except that the
 * MP lock is not held on entry or return.  We are responsible for
 * obtaining the MP lock if necessary and for handling ASTs
 * (e.g. a task switch) prior to return.
 *
 * In general, only simple access and manipulation of curproc and
 * the current stack is allowed without having to hold MP lock.
 */
void
syscall(frame)
	struct trapframe frame;
{
	caddr_t params;
	int i;
	struct sysent *callp;
	struct proc *p = curproc;
	u_quad_t sticks;
	int error;
	int narg;
	int args[8];
	u_int code;

	atomic_add_int(&cnt.v_syscall, 1);

#ifdef DIAGNOSTIC
	if (ISPL(frame.tf_cs) != SEL_UPL) {
		mtx_lock(&Giant);
		panic("syscall");
		/* NOT REACHED */
	}
#endif

	mtx_lock_spin(&sched_lock);
	sticks = p->p_sticks;
	mtx_unlock_spin(&sched_lock);

	p->p_md.md_regs = &frame;
	/* Arguments live on the user stack just above the return address. */
	params = (caddr_t)frame.tf_esp + sizeof(int);
	code = frame.tf_eax;

	if (p->p_sysent->sv_prepsyscall) {
		/*
		 * The prep code is not MP aware.
		 */
		mtx_lock(&Giant);
		(*p->p_sysent->sv_prepsyscall)(&frame, args, &code, &params);
		mtx_unlock(&Giant);
	} else {
		/*
		 * Need to check if this is a 32 bit or 64 bit syscall.
		 * fuword is MP aware.
		 */
		if (code == SYS_syscall) {
			/*
			 * Code is first argument, followed by actual args.
			 */
			code = fuword(params);
			params += sizeof(int);
		} else if (code == SYS___syscall) {
			/*
			 * Like syscall, but code is a quad, so as to maintain
			 * quad alignment for the rest of the arguments.
			 */
			code = fuword(params);
			params += sizeof(quad_t);
		}
	}

	if (p->p_sysent->sv_mask)
		code &= p->p_sysent->sv_mask;

	/* Out-of-range codes dispatch to entry 0 (the nosys handler slot). */
	if (code >= p->p_sysent->sv_size)
		callp = &p->p_sysent->sv_table[0];
	else
		callp = &p->p_sysent->sv_table[code];

	narg = callp->sy_narg & SYF_ARGMASK;

	/*
	 * copyin is MP aware, but the tracing code is not
	 */
	if (params && (i = narg * sizeof(int)) &&
	    (error = copyin(params, (caddr_t)args, (u_int)i))) {
		mtx_lock(&Giant);
#ifdef KTRACE
		if (KTRPOINT(p, KTR_SYSCALL))
			ktrsyscall(p->p_tracep, code, narg, args);
#endif
		goto bad;
	}

	/*
	 * Try to run the syscall without the MP lock if the syscall
	 * is MP safe.
	 */
	if ((callp->sy_narg & SYF_MPSAFE) == 0) {
		mtx_lock(&Giant);
	}

#ifdef KTRACE
	/*
	 * We have to obtain the MP lock no matter what if
	 * we are ktracing
	 */
	if (KTRPOINT(p, KTR_SYSCALL)) {
		if (!mtx_owned(&Giant))
			mtx_lock(&Giant);
		ktrsyscall(p->p_tracep, code, narg, args);
	}
#endif
	p->p_retval[0] = 0;
	p->p_retval[1] = frame.tf_edx;

	STOPEVENT(p, S_SCE, narg);	/* MP aware */

	error = (*callp->sy_call)(p, args);

	/*
	 * MP SAFE (we may or may not have the MP lock at this point)
	 */
	switch (error) {
	case 0:
		frame.tf_eax = p->p_retval[0];
		frame.tf_edx = p->p_retval[1];
		frame.tf_eflags &= ~PSL_C;
		break;

	case ERESTART:
		/*
		 * Reconstruct pc, assuming lcall $X,y is 7 bytes,
		 * int 0x80 is 2 bytes. We saved this in tf_err.
		 */
		frame.tf_eip -= frame.tf_err;
		break;

	case EJUSTRETURN:
		break;

	default:
bad:
		if (p->p_sysent->sv_errsize) {
			if (error >= p->p_sysent->sv_errsize)
				error = -1;	/* XXX */
			else
				error = p->p_sysent->sv_errtbl[error];
		}
		/* Error convention: errno in %eax, carry flag set. */
		frame.tf_eax = error;
		frame.tf_eflags |= PSL_C;
		break;
	}

	/*
	 * Traced syscall.  trapsignal() is not MP aware.
	 */
	if ((frame.tf_eflags & PSL_T) && !(frame.tf_eflags & PSL_VM)) {
		if (!mtx_owned(&Giant))
			mtx_lock(&Giant);
		frame.tf_eflags &= ~PSL_T;
		trapsignal(p, SIGTRAP, 0);
	}

	/*
	 * Handle reschedule and other end-of-syscall issues
	 */
	userret(p, &frame, sticks);

#ifdef KTRACE
	if (KTRPOINT(p, KTR_SYSRET)) {
		if (!mtx_owned(&Giant))
			mtx_lock(&Giant);
		ktrsysret(p->p_tracep, code, error, p->p_retval[0]);
	}
#endif

	/*
	 * Release Giant if we had to get it
	 */
	if (mtx_owned(&Giant))
		mtx_unlock(&Giant);

	/*
	 * This works because errno is findable through the
	 * register set.  If we ever support an emulation where this
	 * is not the case, this code will need to be revisited.
	 */
	STOPEVENT(p, S_SCX, code);

#ifdef WITNESS
	if (witness_list(p)) {
		panic("system call %s returning with mutex(s) held\n",
		    syscallnames[code]);
	}
#endif
	mtx_assert(&sched_lock, MA_NOTOWNED);
	mtx_assert(&Giant, MA_NOTOWNED);
}

/*
 * ast - handle an asynchronous system trap on return to user mode:
 * pending profiling charges (PS_OWEUPC), virtual alarm (PS_ALRMPEND),
 * deferred FPU traps (PCB_NPXTRAP, UP only), profiling signal
 * (PS_PROFPEND), then the common userret() processing.
 */
void
ast(framep)
	struct trapframe *framep;
{
	struct proc *p = CURPROC;
	u_quad_t sticks;
#if defined(DEV_NPX) && !defined(SMP)
	int ucode;
#endif

	KASSERT(TRAPF_USERMODE(framep), ("ast in kernel mode"));

	/*
	 * We check for a pending AST here rather than in the assembly as
	 * acquiring and releasing mutexes in assembly is not fun.
	 */
	mtx_lock_spin(&sched_lock);
	if (!(astpending(p) || resched_wanted(p))) {
		mtx_unlock_spin(&sched_lock);
		return;
	}

	sticks = p->p_sticks;
	p->p_md.md_regs = framep;

	astoff(p);
	cnt.v_soft++;
	mtx_intr_enable(&sched_lock);
	if (p->p_sflag & PS_OWEUPC) {
		p->p_sflag &= ~PS_OWEUPC;
		mtx_unlock_spin(&sched_lock);
		mtx_lock(&Giant);
		mtx_lock_spin(&sched_lock);
		addupc_task(p, p->p_stats->p_prof.pr_addr,
			    p->p_stats->p_prof.pr_ticks);
	}
	if (p->p_sflag & PS_ALRMPEND) {
		p->p_sflag &= ~PS_ALRMPEND;
		mtx_unlock_spin(&sched_lock);
		PROC_LOCK(p);
		psignal(p, SIGVTALRM);
		PROC_UNLOCK(p);
		mtx_lock_spin(&sched_lock);
	}
#if defined(DEV_NPX) && !defined(SMP)
	if (PCPU_GET(curpcb)->pcb_flags & PCB_NPXTRAP) {
		PCPU_GET(curpcb)->pcb_flags &= ~PCB_NPXTRAP;
		mtx_unlock_spin(&sched_lock);
		/* npxtrap() returns -1 when no signal should be posted. */
		ucode = npxtrap();
		if (ucode != -1) {
			if (!mtx_owned(&Giant))
				mtx_lock(&Giant);
			trapsignal(p, SIGFPE, ucode);
		}
		mtx_lock_spin(&sched_lock);
	}
#endif
	if (p->p_sflag & PS_PROFPEND) {
		p->p_sflag &= ~PS_PROFPEND;
		mtx_unlock_spin(&sched_lock);
		PROC_LOCK(p);
		psignal(p, SIGPROF);
		PROC_UNLOCK(p);
	} else
		mtx_unlock_spin(&sched_lock);

	userret(p, framep, sticks);

	if (mtx_owned(&Giant))
		mtx_unlock(&Giant);
}