subr_syscall.c revision 65811
1/*- 2 * Copyright (C) 1994, David Greenman 3 * Copyright (c) 1990, 1993 4 * The Regents of the University of California. All rights reserved. 5 * 6 * This code is derived from software contributed to Berkeley by 7 * the University of Utah, and William Jolitz. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 3. All advertising materials mentioning features or use of this software 18 * must display the following acknowledgement: 19 * This product includes software developed by the University of 20 * California, Berkeley and its contributors. 21 * 4. Neither the name of the University nor the names of its contributors 22 * may be used to endorse or promote products derived from this software 23 * without specific prior written permission. 24 * 25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 28 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 35 * SUCH DAMAGE. 36 * 37 * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 38 * $FreeBSD: head/sys/kern/subr_trap.c 65811 2000-09-13 12:40:43Z bde $ 39 */ 40 41/* 42 * 386 Trap and System call handling 43 */ 44 45#include "opt_cpu.h" 46#include "opt_ddb.h" 47#include "opt_ktrace.h" 48#include "opt_clock.h" 49#include "opt_trap.h" 50 51#include <sys/param.h> 52#include <sys/bus.h> 53#include <sys/systm.h> 54#include <sys/proc.h> 55#include <sys/pioctl.h> 56#include <sys/kernel.h> 57#include <sys/ktr.h> 58#include <sys/resourcevar.h> 59#include <sys/signalvar.h> 60#include <sys/syscall.h> 61#include <sys/sysctl.h> 62#include <sys/sysent.h> 63#include <sys/uio.h> 64#include <sys/vmmeter.h> 65#ifdef KTRACE 66#include <sys/ktrace.h> 67#endif 68 69#include <vm/vm.h> 70#include <vm/vm_param.h> 71#include <sys/lock.h> 72#include <vm/pmap.h> 73#include <vm/vm_kern.h> 74#include <vm/vm_map.h> 75#include <vm/vm_page.h> 76#include <vm/vm_extern.h> 77 78#include <machine/cpu.h> 79#include <machine/ipl.h> 80#include <machine/md_var.h> 81#include <machine/mutex.h> 82#include <machine/pcb.h> 83#ifdef SMP 84#include <machine/smp.h> 85#endif 86#include <machine/tss.h> 87 88#include <i386/isa/icu.h> 89#include <i386/isa/intr_machdep.h> 90 91#ifdef POWERFAIL_NMI 92#include <sys/syslog.h> 93#include <machine/clock.h> 94#endif 95 96#include <machine/vm86.h> 97 98#include <ddb/ddb.h> 99 100#include "isa.h" 101#include "npx.h" 102 103#include <sys/sysctl.h> 104 105int 
(*pmath_emulate) __P((struct trapframe *));	/* FPU-emulator hook; NULL when no emulator is registered */

/* Entry points invoked from the assembly IDT/trampoline code. */
extern void trap __P((struct trapframe frame));
extern int trapwrite __P((unsigned addr));
extern void syscall2 __P((struct trapframe frame));
extern void ast __P((struct trapframe frame));

static int trap_pfault __P((struct trapframe *, int, vm_offset_t));
static void trap_fatal __P((struct trapframe *, vm_offset_t));
void dblfault_handler __P((void));

extern inthand_t IDTVEC(syscall);

#define MAX_TRAP_MSG		28
/* Human-readable trap names, indexed by the T_* trap number. */
static char *trap_msg[] = {
	"",					/*  0 unused */
	"privileged instruction fault",		/*  1 T_PRIVINFLT */
	"",					/*  2 unused */
	"breakpoint instruction fault",		/*  3 T_BPTFLT */
	"",					/*  4 unused */
	"",					/*  5 unused */
	"arithmetic trap",			/*  6 T_ARITHTRAP */
	"system forced exception",		/*  7 T_ASTFLT */
	"",					/*  8 unused */
	"general protection fault",		/*  9 T_PROTFLT */
	"trace trap",				/* 10 T_TRCTRAP */
	"",					/* 11 unused */
	"page fault",				/* 12 T_PAGEFLT */
	"",					/* 13 unused */
	"alignment fault",			/* 14 T_ALIGNFLT */
	"",					/* 15 unused */
	"",					/* 16 unused */
	"",					/* 17 unused */
	"integer divide fault",			/* 18 T_DIVIDE */
	"non-maskable interrupt trap",		/* 19 T_NMI */
	"overflow trap",			/* 20 T_OFLOW */
	"FPU bounds check fault",		/* 21 T_BOUND */
	"FPU device not available",		/* 22 T_DNA */
	"double fault",				/* 23 T_DOUBLEFLT */
	"FPU operand fetch fault",		/* 24 T_FPOPFLT */
	"invalid TSS fault",			/* 25 T_TSSFLT */
	"segment not present fault",		/* 26 T_SEGNPFLT */
	"stack fault",				/* 27 T_STKFLT */
	"machine check trap",			/* 28 T_MCHK */
};

static __inline int userret __P((struct proc *p, struct trapframe *frame,
				 u_quad_t oticks, int have_giant));

#if defined(I586_CPU) && !defined(NO_F00F_HACK)
extern int has_f00f_bug;
#endif

#ifdef DDB
static int ddb_on_nmi = 1;
SYSCTL_INT(_machdep, OID_AUTO, ddb_on_nmi, CTLFLAG_RW,
	&ddb_on_nmi, 0, "Go to DDB on NMI");
#endif
static int panic_on_nmi = 1;
SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RW,
	&panic_on_nmi, 0, "Panic on NMI");

/*
 * userret - common return-to-user-mode processing: deliver pending
 * signals, honor a requested reschedule, and charge profiling ticks.
 *
 * `oticks' is the p_sticks value sampled on entry to the kernel, used
 * to compute the profiling charge.  `have_giant' says whether the
 * caller already holds the Giant mutex; Giant is acquired on demand
 * (CURSIG/postsig and addupc_task are not MP safe) and the updated
 * ownership state is returned so the caller knows whether to release.
 */
static __inline int
userret(p, frame, oticks, have_giant)
	struct proc *p;
	struct trapframe *frame;
	u_quad_t oticks;
	int have_giant;
{
	int sig, s;

	/* Post any signals that became pending while in the kernel. */
	while ((sig = CURSIG(p)) != 0) {
		if (have_giant == 0) {
			mtx_enter(&Giant, MTX_DEF);
			have_giant = 1;
		}
		postsig(sig);
	}

	p->p_priority = p->p_usrpri;
	if (resched_wanted()) {
		/*
		 * Since we are curproc, clock will normally just change
		 * our priority without moving us from one queue to another
		 * (since the running process is not on a queue.)
		 * If that happened after we setrunqueue ourselves but before we
		 * mi_switch()'ed, we might not be on the queue indicated by
		 * our priority.
		 */
		s = splhigh();
		mtx_enter(&sched_lock, MTX_SPIN);
		setrunqueue(p);
		p->p_stats->p_ru.ru_nivcsw++;
		mi_switch();
		mtx_exit(&sched_lock, MTX_SPIN);
		splx(s);
		/* Signals may have arrived while we were switched out. */
		while ((sig = CURSIG(p)) != 0) {
			if (have_giant == 0) {
				mtx_enter(&Giant, MTX_DEF);
				have_giant = 1;
			}
			postsig(sig);
		}
	}
	/*
	 * Charge system time if profiling.
	 */
	if (p->p_flag & P_PROFIL) {
		if (have_giant == 0) {
			mtx_enter(&Giant, MTX_DEF);
			have_giant = 1;
		}
		addupc_task(p, frame->tf_eip,
			    (u_int)(p->p_sticks - oticks) * psratio);
	}
	curpriority = p->p_priority;
	return(have_giant);
}

/*
 * Exception, fault, and trap interface to the FreeBSD kernel.
 * This common code is called from assembly language IDT gate entry
 * routines that prepare a suitable stack frame, and restore this
 * frame after the exception has been processed.
 */

/*
 * trap - C-level handler for all processor exceptions and faults.
 *
 * Dispatches on frame.tf_trapno: user-mode traps are converted into a
 * signal number `i' (with signal code `ucode') and delivered via
 * trapsignal(); kernel-mode traps are either recovered in place (page
 * faults, vm86 emulation, known-bad %eip fixups, pcb_onfault) or end
 * in trap_fatal().  Giant is acquired for the duration of the handler.
 */
void
trap(frame)
	struct trapframe frame;
{
	struct proc *p = curproc;
	u_quad_t sticks = 0;
	int i = 0, ucode = 0, type, code;
	vm_offset_t eva;
#ifdef POWERFAIL_NMI
	static int lastalert = 0;
#endif

	atomic_add_int(&cnt.v_trap, 1);

	if ((frame.tf_eflags & PSL_I) == 0) {
		/*
		 * Buggy application or kernel code has disabled
		 * interrupts and then trapped.  Enabling interrupts
		 * now is wrong, but it is better than running with
		 * interrupts disabled until they are accidentally
		 * enabled later.  XXX Consider whether is this still
		 * correct.
		 */
		type = frame.tf_trapno;
		if (ISPL(frame.tf_cs) == SEL_UPL || (frame.tf_eflags & PSL_VM))
			printf(
			    "pid %ld (%s): trap %d with interrupts disabled\n",
			    (long)curproc->p_pid, curproc->p_comm, type);
		else if (type != T_BPTFLT && type != T_TRCTRAP)
			/*
			 * XXX not quite right, since this may be for a
			 * multiple fault in user mode.
			 */
			printf("kernel trap %d with interrupts disabled\n",
			    type);
		enable_intr();
	}

	eva = 0;
	if (frame.tf_trapno == T_PAGEFLT) {
		/*
		 * For some Cyrix CPUs, %cr2 is clobbered by
		 * interrupts.  This problem is worked around by using
		 * an interrupt gate for the pagefault handler.  We
		 * are finally ready to read %cr2 and then must
		 * reenable interrupts.
		 */
		eva = rcr2();
		enable_intr();
	}

	mtx_enter(&Giant, MTX_DEF);

#if defined(I586_CPU) && !defined(NO_F00F_HACK)
restart:
#endif

	type = frame.tf_trapno;
	code = frame.tf_err;

	if ((ISPL(frame.tf_cs) == SEL_UPL) ||
	    ((frame.tf_eflags & PSL_VM) && !in_vm86call)) {
		/* user trap */

		sticks = p->p_sticks;
		p->p_md.md_regs = &frame;

		switch (type) {
		case T_PRIVINFLT:	/* privileged instruction fault */
			ucode = type;
			i = SIGILL;
			break;

		case T_BPTFLT:		/* bpt instruction fault */
		case T_TRCTRAP:		/* trace trap */
			frame.tf_eflags &= ~PSL_T;
			i = SIGTRAP;
			break;

		case T_ARITHTRAP:	/* arithmetic trap */
			ucode = code;
			i = SIGFPE;
			break;

			/*
			 * The following two traps can happen in
			 * vm86 mode, and, if so, we want to handle
			 * them specially.
			 */
		case T_PROTFLT:		/* general protection fault */
		case T_STKFLT:		/* stack fault */
			if (frame.tf_eflags & PSL_VM) {
				i = vm86_emulate((struct vm86frame *)&frame);
				if (i == 0)
					goto user;
				break;
			}
			/* FALL THROUGH */

		case T_SEGNPFLT:	/* segment not present fault */
		case T_TSSFLT:		/* invalid TSS fault */
		case T_DOUBLEFLT:	/* double fault */
		default:
			ucode = code + BUS_SEGM_FAULT ;
			i = SIGBUS;
			break;

		case T_PAGEFLT:		/* page fault */
			i = trap_pfault(&frame, TRUE, eva);
#if defined(I586_CPU) && !defined(NO_F00F_HACK)
			if (i == -2) {
				/*
				 * f00f hack workaround has triggered, treat
				 * as illegal instruction not page fault.
				 */
				frame.tf_trapno = T_PRIVINFLT;
				goto restart;
			}
#endif
			if (i == -1)
				goto out;
			if (i == 0)
				goto user;

			ucode = T_PAGEFLT;
			break;

		case T_DIVIDE:		/* integer divide fault */
			ucode = FPE_INTDIV;
			i = SIGFPE;
			break;

#if NISA > 0
		case T_NMI:
#ifdef POWERFAIL_NMI
#ifndef TIMER_FREQ
#  define TIMER_FREQ	1193182
#endif
			/* Rate-limit the power-fail alert to one per 10 s. */
			if (time_second - lastalert > 10) {
				log(LOG_WARNING, "NMI: power fail\n");
				sysbeep(TIMER_FREQ/880, hz);
				lastalert = time_second;
			}
			goto out;
#else /* !POWERFAIL_NMI */
			/* machine/parity/power fail/"kitchen sink" faults */
			if (isa_nmi(code) == 0) {
#ifdef DDB
				/*
				 * NMI can be hooked up to a pushbutton
				 * for debugging.
				 */
				if (ddb_on_nmi) {
					printf ("NMI ... going to debugger\n");
					kdb_trap (type, 0, &frame);
				}
#endif /* DDB */
				goto out;
			} else if (panic_on_nmi)
				panic("NMI indicates hardware failure");
			break;
#endif /* POWERFAIL_NMI */
#endif /* NISA > 0 */

		case T_OFLOW:		/* integer overflow fault */
			ucode = FPE_INTOVF;
			i = SIGFPE;
			break;

		case T_BOUND:		/* bounds check fault */
			ucode = FPE_FLTSUB;
			i = SIGFPE;
			break;

		case T_DNA:
#if NNPX > 0
			/* transparent fault (due to context switch "late") */
			if (npxdna())
				goto out;
#endif
			if (!pmath_emulate) {
				i = SIGFPE;
				ucode = FPE_FPU_NP_TRAP;
				break;
			}
			i = (*pmath_emulate)(&frame);
			if (i == 0) {
				if (!(frame.tf_eflags & PSL_T))
					goto out;
				/* Emulated instruction was single-stepped. */
				frame.tf_eflags &= ~PSL_T;
				i = SIGTRAP;
			}
			/* else ucode = emulator_only_knows() XXX */
			break;

		case T_FPOPFLT:		/* FPU operand fetch fault */
			ucode = T_FPOPFLT;
			i = SIGILL;
			break;
		}
	} else {
		/* kernel trap */

		switch (type) {
		case T_PAGEFLT:			/* page fault */
			(void) trap_pfault(&frame, FALSE, eva);
			goto out;

		case T_DNA:
#if NNPX > 0
			/*
			 * The kernel is apparently using npx for copying.
			 * XXX this should be fatal unless the kernel has
			 * registered such use.
			 */
			if (npxdna())
				goto out;
#endif
			break;

			/*
			 * The following two traps can happen in
			 * vm86 mode, and, if so, we want to handle
			 * them specially.
			 */
		case T_PROTFLT:		/* general protection fault */
		case T_STKFLT:		/* stack fault */
			if (frame.tf_eflags & PSL_VM) {
				i = vm86_emulate((struct vm86frame *)&frame);
				if (i != 0)
					/*
					 * returns to original process
					 */
					vm86_trap((struct vm86frame *)&frame);
				goto out;
			}
			/* FALL THROUGH */

		case T_SEGNPFLT:	/* segment not present fault */
			if (in_vm86call)
				break;

			if (intr_nesting_level != 0)
				break;

			/*
			 * Invalid %fs's and %gs's can be created using
			 * procfs or PT_SETREGS or by invalidating the
			 * underlying LDT entry.  This causes a fault
			 * in kernel mode when the kernel attempts to
			 * switch contexts.  Lose the bad context
			 * (XXX) so that we can continue, and generate
			 * a signal.
			 */
			if (frame.tf_eip == (int)cpu_switch_load_gs) {
				curpcb->pcb_gs = 0;
				psignal(p, SIGBUS);
				goto out;
			}

			/*
			 * Invalid segment selectors and out of bounds
			 * %eip's and %esp's can be set up in user mode.
			 * This causes a fault in kernel mode when the
			 * kernel tries to return to user mode.  We want
			 * to get this fault so that we can fix the
			 * problem here and not have to check all the
			 * selectors and pointers when the user changes
			 * them.
			 */
			if (frame.tf_eip == (int)doreti_iret) {
				frame.tf_eip = (int)doreti_iret_fault;
				goto out;
			}
			if (frame.tf_eip == (int)doreti_popl_ds) {
				frame.tf_eip = (int)doreti_popl_ds_fault;
				goto out;
			}
			if (frame.tf_eip == (int)doreti_popl_es) {
				frame.tf_eip = (int)doreti_popl_es_fault;
				goto out;
			}
			if (frame.tf_eip == (int)doreti_popl_fs) {
				frame.tf_eip = (int)doreti_popl_fs_fault;
				goto out;
			}
			/* Last resort: copyin/copyout-style recovery hook. */
			if (curpcb && curpcb->pcb_onfault) {
				frame.tf_eip = (int)curpcb->pcb_onfault;
				goto out;
			}
			break;

		case T_TSSFLT:
			/*
			 * PSL_NT can be set in user mode and isn't cleared
			 * automatically when the kernel is entered.  This
			 * causes a TSS fault when the kernel attempts to
			 * `iret' because the TSS link is uninitialized.  We
			 * want to get this fault so that we can fix the
			 * problem here and not every time the kernel is
			 * entered.
			 */
			if (frame.tf_eflags & PSL_NT) {
				frame.tf_eflags &= ~PSL_NT;
				goto out;
			}
			break;

		case T_TRCTRAP:	 /* trace trap */
			if (frame.tf_eip == (int)IDTVEC(syscall)) {
				/*
				 * We've just entered system mode via the
				 * syscall lcall.  Continue single stepping
				 * silently until the syscall handler has
				 * saved the flags.
				 */
				goto out;
			}
			if (frame.tf_eip == (int)IDTVEC(syscall) + 1) {
				/*
				 * The syscall handler has now saved the
				 * flags.  Stop single stepping it.
				 */
				frame.tf_eflags &= ~PSL_T;
				goto out;
			}
			/*
			 * Ignore debug register trace traps due to
			 * accesses in the user's address space, which
			 * can happen under several conditions such as
			 * if a user sets a watchpoint on a buffer and
			 * then passes that buffer to a system call.
			 * We still want to get TRCTRAPS for addresses
			 * in kernel space because that is useful when
			 * debugging the kernel.
			 */
			if (user_dbreg_trap() && !in_vm86call) {
				/*
				 * Reset breakpoint bits because the
				 * processor doesn't
				 */
				load_dr6(rdr6() & 0xfffffff0);
				goto out;
			}
			/*
			 * Fall through (TRCTRAP kernel mode, kernel address)
			 */
		case T_BPTFLT:
			/*
			 * If DDB is enabled, let it handle the debugger trap.
			 * Otherwise, debugger traps "can't happen".
			 */
#ifdef DDB
			if (kdb_trap (type, 0, &frame))
				goto out;
#endif
			break;

#if NISA > 0
		case T_NMI:
#ifdef POWERFAIL_NMI
			if (time_second - lastalert > 10) {
				log(LOG_WARNING, "NMI: power fail\n");
				sysbeep(TIMER_FREQ/880, hz);
				lastalert = time_second;
			}
			goto out;
#else /* !POWERFAIL_NMI */
			/* machine/parity/power fail/"kitchen sink" faults */
			if (isa_nmi(code) == 0) {
#ifdef DDB
				/*
				 * NMI can be hooked up to a pushbutton
				 * for debugging.
				 */
				if (ddb_on_nmi) {
					printf ("NMI ... going to debugger\n");
					kdb_trap (type, 0, &frame);
				}
#endif /* DDB */
				goto out;
			} else if (panic_on_nmi == 0)
				goto out;
			/* FALL THROUGH */
#endif /* POWERFAIL_NMI */
#endif /* NISA > 0 */
		}

		/* Unrecovered kernel trap: print state and panic. */
		trap_fatal(&frame, eva);
		goto out;
	}

	/* Translate fault for emulators (e.g. Linux) */
	if (*p->p_sysent->sv_transtrap)
		i = (*p->p_sysent->sv_transtrap)(i, type);

	trapsignal(p, i, ucode);

#ifdef DEBUG
	if (type <= MAX_TRAP_MSG) {
		uprintf("fatal process exception: %s",
			trap_msg[type]);
		if ((type == T_PAGEFLT) || (type == T_PROTFLT))
			uprintf(", fault VA = 0x%lx", (u_long)eva);
		uprintf("\n");
	}
#endif

user:
	userret(p, &frame, sticks, 1);
out:
	mtx_exit(&Giant, MTX_DEF);
}

#ifdef notyet
/*
 * This version doesn't allow a page fault to user space while
 * in the kernel. The rest of the kernel needs to be made "safe"
 * before this can be used.
 I think the only things remaining
 * to be made safe are the iBCS2 code and the process tracing/
 * debugging code.
 */
static int
trap_pfault(frame, usermode, eva)
	struct trapframe *frame;
	int usermode;
	vm_offset_t eva;
{
	vm_offset_t va;
	struct vmspace *vm = NULL;
	vm_map_t map = 0;
	int rv = 0;
	vm_prot_t ftype;
	struct proc *p = curproc;

	/* PGEX_W in the hardware error code means a write access. */
	if (frame->tf_err & PGEX_W)
		ftype = VM_PROT_WRITE;
	else
		ftype = VM_PROT_READ;

	va = trunc_page(eva);
	if (va < VM_MIN_KERNEL_ADDRESS) {
		vm_offset_t v;
		vm_page_t mpte;

		if (p == NULL ||
		    (!usermode && va < VM_MAXUSER_ADDRESS &&
		     (intr_nesting_level != 0 || curpcb == NULL ||
		      curpcb->pcb_onfault == NULL))) {
			trap_fatal(frame, eva);
			return (-1);
		}

		/*
		 * This is a fault on non-kernel virtual memory.
		 * vm is initialized above to NULL. If curproc is NULL
		 * or curproc->p_vmspace is NULL the fault is fatal.
		 */
		vm = p->p_vmspace;
		if (vm == NULL)
			goto nogo;

		map = &vm->vm_map;

		/*
		 * Keep swapout from messing with us during this
		 * critical time.
		 */
		++p->p_lock;

		/*
		 * Grow the stack if necessary
		 */
		/* grow_stack returns false only if va falls into
		 * a growable stack region and the stack growth
		 * fails.  It returns true if va was not within
		 * a growable stack region, or if the stack
		 * growth succeeded.
		 */
		if (!grow_stack (p, va)) {
			rv = KERN_FAILURE;
			--p->p_lock;
			goto nogo;
		}

		/* Fault in the user page: */
		rv = vm_fault(map, va, ftype,
			      (ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY
						      : VM_FAULT_NORMAL);

		--p->p_lock;
	} else {
		/*
		 * Don't allow user-mode faults in kernel address space.
		 */
		if (usermode)
			goto nogo;

		/*
		 * Since we know that kernel virtual address addresses
		 * always have pte pages mapped, we just have to fault
		 * the page.
		 */
		rv = vm_fault(kernel_map, va, ftype, VM_FAULT_NORMAL);
	}

	if (rv == KERN_SUCCESS)
		return (0);
nogo:
	if (!usermode) {
		if (intr_nesting_level == 0 && curpcb && curpcb->pcb_onfault) {
			frame->tf_eip = (int)curpcb->pcb_onfault;
			return (0);
		}
		trap_fatal(frame, eva);
		return (-1);
	}

	/* kludge to pass faulting virtual address to sendsig */
	frame->tf_err = eva;

	return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV);
}
#endif

/*
 * trap_pfault - resolve a page fault through the VM system.
 *
 * Returns 0 when the fault was handled, -1 when it was fatal (or was
 * recovered via pcb_onfault), -2 when the Pentium f00f workaround
 * fired, or a signal number (SIGBUS/SIGSEGV) to deliver to the process.
 */
int
trap_pfault(frame, usermode, eva)
	struct trapframe *frame;
	int usermode;
	vm_offset_t eva;
{
	vm_offset_t va;
	struct vmspace *vm = NULL;
	vm_map_t map = 0;
	int rv = 0;
	vm_prot_t ftype;
	struct proc *p = curproc;

	va = trunc_page(eva);
	if (va >= KERNBASE) {
		/*
		 * Don't allow user-mode faults in kernel address space.
		 * An exception: if the faulting address is the invalid
		 * instruction entry in the IDT, then the Intel Pentium
		 * F00F bug workaround was triggered, and we need to
		 * treat it is as an illegal instruction, and not a page
		 * fault.
		 */
#if defined(I586_CPU) && !defined(NO_F00F_HACK)
		if ((eva == (unsigned int)&idt[6]) && has_f00f_bug)
			return -2;
#endif
		if (usermode)
			goto nogo;

		map = kernel_map;
	} else {
		/*
		 * This is a fault on non-kernel virtual memory.
		 * vm is initialized above to NULL. If curproc is NULL
		 * or curproc->p_vmspace is NULL the fault is fatal.
		 */
		if (p != NULL)
			vm = p->p_vmspace;

		if (vm == NULL)
			goto nogo;

		map = &vm->vm_map;
	}

	/* PGEX_W in the hardware error code means a write access. */
	if (frame->tf_err & PGEX_W)
		ftype = VM_PROT_WRITE;
	else
		ftype = VM_PROT_READ;

	if (map != kernel_map) {
		/*
		 * Keep swapout from messing with us during this
		 * critical time.
811 */ 812 ++p->p_lock; 813 814 /* 815 * Grow the stack if necessary 816 */ 817 /* grow_stack returns false only if va falls into 818 * a growable stack region and the stack growth 819 * fails. It returns true if va was not within 820 * a growable stack region, or if the stack 821 * growth succeeded. 822 */ 823 if (!grow_stack (p, va)) { 824 rv = KERN_FAILURE; 825 --p->p_lock; 826 goto nogo; 827 } 828 829 /* Fault in the user page: */ 830 rv = vm_fault(map, va, ftype, 831 (ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY 832 : VM_FAULT_NORMAL); 833 834 --p->p_lock; 835 } else { 836 /* 837 * Don't have to worry about process locking or stacks in the kernel. 838 */ 839 rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL); 840 } 841 842 if (rv == KERN_SUCCESS) 843 return (0); 844nogo: 845 if (!usermode) { 846 if (intr_nesting_level == 0 && curpcb && curpcb->pcb_onfault) { 847 frame->tf_eip = (int)curpcb->pcb_onfault; 848 return (0); 849 } 850 trap_fatal(frame, eva); 851 return (-1); 852 } 853 854 /* kludge to pass faulting virtual address to sendsig */ 855 frame->tf_err = eva; 856 857 return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV); 858} 859 860static void 861trap_fatal(frame, eva) 862 struct trapframe *frame; 863 vm_offset_t eva; 864{ 865 int code, type, ss, esp; 866 struct soft_segment_descriptor softseg; 867 868 code = frame->tf_err; 869 type = frame->tf_trapno; 870 sdtossd(&gdt[IDXSEL(frame->tf_cs & 0xffff)].sd, &softseg); 871 872 if (type <= MAX_TRAP_MSG) 873 printf("\n\nFatal trap %d: %s while in %s mode\n", 874 type, trap_msg[type], 875 frame->tf_eflags & PSL_VM ? "vm86" : 876 ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel"); 877#ifdef SMP 878 /* two seperate prints in case of a trap on an unmapped page */ 879 printf("cpuid = %d; ", cpuid); 880 printf("lapic.id = %08x\n", lapic.id); 881#endif 882 if (type == T_PAGEFLT) { 883 printf("fault virtual address = 0x%x\n", eva); 884 printf("fault code = %s %s, %s\n", 885 code & PGEX_U ? 
"user" : "supervisor", 886 code & PGEX_W ? "write" : "read", 887 code & PGEX_P ? "protection violation" : "page not present"); 888 } 889 printf("instruction pointer = 0x%x:0x%x\n", 890 frame->tf_cs & 0xffff, frame->tf_eip); 891 if ((ISPL(frame->tf_cs) == SEL_UPL) || (frame->tf_eflags & PSL_VM)) { 892 ss = frame->tf_ss & 0xffff; 893 esp = frame->tf_esp; 894 } else { 895 ss = GSEL(GDATA_SEL, SEL_KPL); 896 esp = (int)&frame->tf_esp; 897 } 898 printf("stack pointer = 0x%x:0x%x\n", ss, esp); 899 printf("frame pointer = 0x%x:0x%x\n", ss, frame->tf_ebp); 900 printf("code segment = base 0x%x, limit 0x%x, type 0x%x\n", 901 softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type); 902 printf(" = DPL %d, pres %d, def32 %d, gran %d\n", 903 softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_def32, 904 softseg.ssd_gran); 905 printf("processor eflags = "); 906 if (frame->tf_eflags & PSL_T) 907 printf("trace trap, "); 908 if (frame->tf_eflags & PSL_I) 909 printf("interrupt enabled, "); 910 if (frame->tf_eflags & PSL_NT) 911 printf("nested task, "); 912 if (frame->tf_eflags & PSL_RF) 913 printf("resume, "); 914 if (frame->tf_eflags & PSL_VM) 915 printf("vm86, "); 916 printf("IOPL = %d\n", (frame->tf_eflags & PSL_IOPL) >> 12); 917 printf("current process = "); 918 if (curproc) { 919 printf("%lu (%s)\n", 920 (u_long)curproc->p_pid, curproc->p_comm ? 921 curproc->p_comm : ""); 922 } else { 923 printf("Idle\n"); 924 } 925 926#ifdef KDB 927 if (kdb_trap(&psl)) 928 return; 929#endif 930#ifdef DDB 931 if ((debugger_on_panic || db_active) && kdb_trap(type, 0, frame)) 932 return; 933#endif 934 printf("trap number = %d\n", type); 935 if (type <= MAX_TRAP_MSG) 936 panic(trap_msg[type]); 937 else 938 panic("unknown/reserved trap"); 939} 940 941/* 942 * Double fault handler. Called when a fault occurs while writing 943 * a frame for a trap/exception onto the stack. This usually occurs 944 * when the stack overflows (such is the case with infinite recursion, 945 * for example). 
 *
 * XXX Note that the current PTD gets replaced by IdlePTD when the
 * task switch occurs. This means that the stack that was active at
 * the time of the double fault is not available at <kstack> unless
 * the machine was idle when the double fault occurred. The downside
 * of this is that "trace <ebp>" in ddb won't work.
 */
void
dblfault_handler()
{
	/* Register state is taken from the TSS saved by the task switch. */
	printf("\nFatal double fault:\n");
	printf("eip = 0x%x\n", common_tss.tss_eip);
	printf("esp = 0x%x\n", common_tss.tss_esp);
	printf("ebp = 0x%x\n", common_tss.tss_ebp);
#ifdef SMP
	/* two separate prints in case of a trap on an unmapped page */
	printf("cpuid = %d; ", cpuid);
	printf("lapic.id = %08x\n", lapic.id);
#endif
	panic("double fault");
}

/*
 * Compensate for 386 brain damage (missing URKR).
 * This is a little simpler than the pagefault handler in trap() because
 * it the page tables have already been faulted in and high addresses
 * are thrown out early for other reasons.
 *
 * Returns 0 on success, 1 when the address is out of range or the
 * write fault could not be resolved.
 */
int trapwrite(addr)
	unsigned addr;
{
	struct proc *p;
	vm_offset_t va;
	struct vmspace *vm;
	int rv;

	va = trunc_page((vm_offset_t)addr);
	/*
	 * XXX - MAX is END.  Changed > to >= for temp. fix.
	 */
	if (va >= VM_MAXUSER_ADDRESS)
		return (1);

	p = curproc;
	vm = p->p_vmspace;

	/* Hold off swapout while we touch the user map. */
	++p->p_lock;

	if (!grow_stack (p, va)) {
		--p->p_lock;
		return (1);
	}

	/*
	 * fault the data page
	 */
	rv = vm_fault(&vm->vm_map, va, VM_PROT_WRITE, VM_FAULT_DIRTY);

	--p->p_lock;

	if (rv != KERN_SUCCESS)
		return 1;

	return (0);
}

/*
 * syscall2 -	MP aware system call request C handler
 *
 * A system call is essentially treated as a trap except that the
 * MP lock is not held on entry or return.  We are responsible for
 * obtaining the MP lock if necessary and for handling ASTs
 * (e.g.
 a task switch) prior to return.
 *
 * In general, only simple access and manipulation of curproc and
 * the current stack is allowed without having to hold MP lock.
 */
void
syscall2(frame)
	struct trapframe frame;
{
	caddr_t params;
	int i;
	struct sysent *callp;
	struct proc *p = curproc;
	u_quad_t sticks;
	int error;
	int narg;
	int args[8];
	int have_giant = 0;	/* 1 once we have acquired Giant ourselves */
	u_int code;

	atomic_add_int(&cnt.v_syscall, 1);

#ifdef DIAGNOSTIC
	/* System calls can only legitimately arrive from user mode. */
	if (ISPL(frame.tf_cs) != SEL_UPL) {
		mtx_enter(&Giant, MTX_DEF);
		panic("syscall");
		/* NOT REACHED */
	}
#endif

	/*
	 * handle atomicy by looping since interrupts are enabled and the
	 * MP lock is not held.
	 */
	sticks = ((volatile struct proc *)p)->p_sticks;
	while (sticks != ((volatile struct proc *)p)->p_sticks)
		sticks = ((volatile struct proc *)p)->p_sticks;

	p->p_md.md_regs = &frame;
	/* Arguments live on the user stack just above the return address. */
	params = (caddr_t)frame.tf_esp + sizeof(int);
	code = frame.tf_eax;

	if (p->p_sysent->sv_prepsyscall) {
		/*
		 * The prep code is not MP aware.
		 */
		mtx_enter(&Giant, MTX_DEF);
		(*p->p_sysent->sv_prepsyscall)(&frame, args, &code, &params);
		mtx_exit(&Giant, MTX_DEF);
	} else {
		/*
		 * Need to check if this is a 32 bit or 64 bit syscall.
		 * fuword is MP aware.
		 */
		if (code == SYS_syscall) {
			/*
			 * Code is first argument, followed by actual args.
			 */
			code = fuword(params);
			params += sizeof(int);
		} else if (code == SYS___syscall) {
			/*
			 * Like syscall, but code is a quad, so as to maintain
			 * quad alignment for the rest of the arguments.
			 */
			code = fuword(params);
			params += sizeof(quad_t);
		}
	}

	if (p->p_sysent->sv_mask)
		code &= p->p_sysent->sv_mask;

	/* Out-of-range codes dispatch to entry 0 (the nosys handler). */
	if (code >= p->p_sysent->sv_size)
		callp = &p->p_sysent->sv_table[0];
	else
		callp = &p->p_sysent->sv_table[code];

	narg = callp->sy_narg & SYF_ARGMASK;

	/*
	 * copyin is MP aware, but the tracing code is not
	 */
	if (params && (i = narg * sizeof(int)) &&
	    (error = copyin(params, (caddr_t)args, (u_int)i))) {
		mtx_enter(&Giant, MTX_DEF);
		have_giant = 1;
#ifdef KTRACE
		if (KTRPOINT(p, KTR_SYSCALL))
			ktrsyscall(p->p_tracep, code, narg, args);
#endif
		goto bad;
	}

	/*
	 * Try to run the syscall without the MP lock if the syscall
	 * is MP safe.  We have to obtain the MP lock no matter what if
	 * we are ktracing
	 */
	if ((callp->sy_narg & SYF_MPSAFE) == 0) {
		mtx_enter(&Giant, MTX_DEF);
		have_giant = 1;
	}

#ifdef KTRACE
	if (KTRPOINT(p, KTR_SYSCALL)) {
		if (have_giant == 0) {
			mtx_enter(&Giant, MTX_DEF);
			have_giant = 1;
		}
		ktrsyscall(p->p_tracep, code, narg, args);
	}
#endif
	p->p_retval[0] = 0;
	p->p_retval[1] = frame.tf_edx;

	STOPEVENT(p, S_SCE, narg);	/* MP aware */

	error = (*callp->sy_call)(p, args);

	/*
	 * MP SAFE (we may or may not have the MP lock at this point)
	 */
	switch (error) {
	case 0:
		/*
		 * Reinitialize proc pointer `p' as it may be different
		 * if this is a child returning from fork syscall.
		 */
		p = curproc;
		frame.tf_eax = p->p_retval[0];
		frame.tf_edx = p->p_retval[1];
		frame.tf_eflags &= ~PSL_C;
		break;

	case ERESTART:
		/*
		 * Reconstruct pc, assuming lcall $X,y is 7 bytes,
		 * int 0x80 is 2 bytes. We saved this in tf_err.
		 */
		frame.tf_eip -= frame.tf_err;
		break;

	case EJUSTRETURN:
		break;

	default:
bad:
		/* Translate the errno through the emulation's table. */
		if (p->p_sysent->sv_errsize) {
			if (error >= p->p_sysent->sv_errsize)
				error = -1;	/* XXX */
			else
				error = p->p_sysent->sv_errtbl[error];
		}
		frame.tf_eax = error;
		frame.tf_eflags |= PSL_C;	/* carry flag signals failure */
		break;
	}

	/*
	 * Traced syscall.  trapsignal() is not MP aware.
	 */
	if ((frame.tf_eflags & PSL_T) && !(frame.tf_eflags & PSL_VM)) {
		if (have_giant == 0) {
			mtx_enter(&Giant, MTX_DEF);
			have_giant = 1;
		}
		frame.tf_eflags &= ~PSL_T;
		trapsignal(p, SIGTRAP, 0);
	}

	/*
	 * Handle reschedule and other end-of-syscall issues
	 */
	have_giant = userret(p, &frame, sticks, have_giant);

#ifdef KTRACE
	if (KTRPOINT(p, KTR_SYSRET)) {
		if (have_giant == 0) {
			mtx_enter(&Giant, MTX_DEF);
			have_giant = 1;
		}
		ktrsysret(p->p_tracep, code, error, p->p_retval[0]);
	}
#endif

	/*
	 * This works because errno is findable through the
	 * register set.  If we ever support an emulation where this
	 * is not the case, this code will need to be revisited.
	 */
	STOPEVENT(p, S_SCX, code);

	/*
	 * Release the MP lock if we had to get it
	 */
	if (have_giant)
		mtx_exit(&Giant, MTX_DEF);

	mtx_assert(&sched_lock, MA_NOTOWNED);
	mtx_assert(&Giant, MA_NOTOWNED);
}

/*
 * ast - handle an asynchronous system trap (forced userret processing,
 * e.g. profiling ticks and reschedule requests) on return to user mode.
 */
void
ast(frame)
	struct trapframe frame;
{
	struct proc *p = CURPROC;
	u_quad_t sticks;

	/*
	 * handle atomicy by looping since interrupts are enabled and the
	 * MP lock is not held.
	 */
	sticks = ((volatile struct proc *)p)->p_sticks;
	while (sticks != ((volatile struct proc *)p)->p_sticks)
		sticks = ((volatile struct proc *)p)->p_sticks;

	astoff();
	atomic_add_int(&cnt.v_soft, 1);
	if (p->p_flag & P_OWEUPC) {
		/* Deferred profiling charge; addupc_task needs Giant. */
		mtx_enter(&Giant, MTX_DEF);
		p->p_flag &= ~P_OWEUPC;
		addupc_task(p, p->p_stats->p_prof.pr_addr,
			    p->p_stats->p_prof.pr_ticks);
	}
	/* userret() reports whether Giant ended up held; release if so. */
	if (userret(p, &frame, sticks, mtx_owned(&Giant)) != 0)
		mtx_exit(&Giant, MTX_DEF);
}

/*
 * Simplified back end of syscall(), used when returning from fork()
 * directly into user mode.  Giant is not held on entry, and must not
 * be held on return.
 */
void
fork_return(p, frame)
	struct proc *p;
	struct trapframe frame;
{
	int have_giant;

	frame.tf_eax = 0;		/* Child returns zero */
	frame.tf_eflags &= ~PSL_C;	/* success */
	frame.tf_edx = 1;		/* in_fork flag for the libc stub */

	have_giant = userret(p, &frame, 0, mtx_owned(&Giant));
#ifdef KTRACE
	if (KTRPOINT(p, KTR_SYSRET)) {
		if (have_giant == 0) {
			mtx_enter(&Giant, MTX_DEF);
			have_giant = 1;
		}
		ktrsysret(p->p_tracep, SYS_fork, 0, 0);
	}
#endif
	if (have_giant)
		mtx_exit(&Giant, MTX_DEF);
}