/*
 * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * @OSF_COPYRIGHT@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989,1988 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */
/*
 */

/*
 * Hardware trap/fault handler.
 */

#include <mach_kdp.h>
#include <mach_ldebug.h>

#include <types.h>
#include <i386/eflags.h>
#include <i386/trap.h>
#include <i386/pmap.h>
#include <i386/fpu.h>
#include <i386/misc_protos.h> /* panic_io_port_read() */
#include <i386/lapic.h>

#include <mach/exception.h>
#include <mach/kern_return.h>
#include <mach/vm_param.h>
#include <mach/i386/thread_status.h>

#include <vm/vm_kern.h>
#include <vm/vm_fault.h>

#include <kern/kern_types.h>
#include <kern/processor.h>
#include <kern/thread.h>
#include <kern/task.h>
#include <kern/sched.h>
#include <kern/sched_prim.h>
#include <kern/exception.h>
#include <kern/spl.h>
#include <kern/misc_protos.h>
#include <kern/debug.h>
#if CONFIG_TELEMETRY
#include <kern/telemetry.h>
#endif
#include <sys/kdebug.h>

#include <string.h>

#include <i386/postcode.h>
#include <i386/mp_desc.h>
#include <i386/proc_reg.h>
#if CONFIG_MCA
#include <i386/machine_check.h>
#endif
#include <mach/i386/syscall_sw.h>

#include <libkern/OSDebug.h>
#include <i386/cpu_threads.h>
#include <machine/pal_routines.h>

extern void throttle_lowpri_io(int);
extern void kprint_state(x86_saved_state64_t *saved_state);

/*
 * Forward declarations
 */
static void user_page_fault_continue(kern_return_t kret);
static void panic_trap(x86_saved_state64_t *saved_state);
static void set_recovery_ip(x86_saved_state64_t *saved_state, vm_offset_t ip);

volatile perfCallback perfTrapHook = NULL; /* Pointer to CHUD trap hook routine */

#if CONFIG_DTRACE
/* See <rdar://problem/4613924> */
perfCallback tempDTraceTrapHook = NULL; /* Pointer to DTrace fbt trap hook routine */

extern boolean_t dtrace_tally_fault(user_addr_t);
#endif

extern boolean_t pmap_smep_enabled;

void
thread_syscall_return(
	kern_return_t ret)
{
	thread_t	thr_act = current_thread();
	boolean_t	is_mach;
	int		code;

	pal_register_cache_state(thr_act, DIRTY);

	if (thread_is_64bit(thr_act)) {
		x86_saved_state64_t	*regs;

		regs = USER_REGS64(thr_act);

		code = (int) (regs->rax & SYSCALL_NUMBER_MASK);
		is_mach = (regs->rax & SYSCALL_CLASS_MASK)
			== (SYSCALL_CLASS_MACH << SYSCALL_CLASS_SHIFT);
		if (kdebug_enable && is_mach) {
			/* Mach trap */
			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
				MACHDBG_CODE(DBG_MACH_EXCP_SC,code)|DBG_FUNC_END,
				ret, 0, 0, 0, 0);
		}
		regs->rax = ret;
#if DEBUG
		if (is_mach)
			DEBUG_KPRINT_SYSCALL_MACH(
				"thread_syscall_return: 64-bit mach ret=%u\n",
				ret);
		else
			DEBUG_KPRINT_SYSCALL_UNIX(
				"thread_syscall_return: 64-bit unix ret=%u\n",
				ret);
#endif
	} else {
		x86_saved_state32_t	*regs;

		regs = USER_REGS32(thr_act);

		code = ((int) regs->eax);
		is_mach = (code < 0);
		if (kdebug_enable && is_mach) {
			/* Mach trap */
			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
				MACHDBG_CODE(DBG_MACH_EXCP_SC,-code)|DBG_FUNC_END,
				ret, 0, 0, 0, 0);
		}
		regs->eax = ret;
#if DEBUG
		if (is_mach)
			DEBUG_KPRINT_SYSCALL_MACH(
				"thread_syscall_return: 32-bit mach ret=%u\n",
				ret);
		else
			DEBUG_KPRINT_SYSCALL_UNIX(
				"thread_syscall_return: 32-bit unix ret=%u\n",
				ret);
#endif
	}
	throttle_lowpri_io(1);

	thread_exception_return();
	/*NOTREACHED*/
}
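
/*
 * Note on the syscall-number decoding above (illustrative; the
 * authoritative encoding lives in <mach/i386/syscall_sw.h>): 64-bit
 * tasks carry the syscall class in the high bits of %rax, so a Mach
 * trap N arrives as rax = (SYSCALL_CLASS_MACH << SYSCALL_CLASS_SHIFT) | N;
 * e.g. trap 31 with a class shift of 24 would arrive as 0x0100001f.
 * 32-bit tasks use the historical convention of negated Mach trap
 * numbers, hence the (code < 0) test: trap 31 arrives as eax = -31.
 */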

static inline void
user_page_fault_continue(
	kern_return_t	kr)
{
	thread_t	thread = current_thread();
	user_addr_t	vaddr;

	if (thread_is_64bit(thread)) {
		x86_saved_state64_t	*uregs;

		uregs = USER_REGS64(thread);

		vaddr = (user_addr_t)uregs->cr2;
	} else {
		x86_saved_state32_t	*uregs;

		uregs = USER_REGS32(thread);

		vaddr = uregs->cr2;
	}


	/* PAL debug hook */
	pal_dbg_page_fault( thread, vaddr, kr );

	i386_exception(EXC_BAD_ACCESS, kr, vaddr);
	/*NOTREACHED*/
}

/*
 * Fault recovery in copyin/copyout routines.
 */
struct recovery {
	uintptr_t	fault_addr;
	uintptr_t	recover_addr;
};

extern struct recovery	recover_table[];
extern struct recovery	recover_table_end[];

const char *	trap_type[] = {TRAP_NAMES};
unsigned	TRAP_TYPES = sizeof(trap_type)/sizeof(trap_type[0]);

extern void	PE_incoming_interrupt(int interrupt);

#if defined(__x86_64__) && DEBUG
void
kprint_state(x86_saved_state64_t *saved_state)
{
	kprintf("current_cpu_datap() 0x%lx\n", (uintptr_t)current_cpu_datap());
	kprintf("Current GS base MSR 0x%llx\n", rdmsr64(MSR_IA32_GS_BASE));
	kprintf("Kernel  GS base MSR 0x%llx\n", rdmsr64(MSR_IA32_KERNEL_GS_BASE));
	kprintf("state at 0x%lx:\n", (uintptr_t) saved_state);

	kprintf("      rdi    0x%llx\n", saved_state->rdi);
	kprintf("      rsi    0x%llx\n", saved_state->rsi);
	kprintf("      rdx    0x%llx\n", saved_state->rdx);
	kprintf("      r10    0x%llx\n", saved_state->r10);
	kprintf("      r8     0x%llx\n", saved_state->r8);
	kprintf("      r9     0x%llx\n", saved_state->r9);
	kprintf("      v_arg6 0x%llx\n", saved_state->v_arg6);
	kprintf("      v_arg7 0x%llx\n", saved_state->v_arg7);
	kprintf("      v_arg8 0x%llx\n", saved_state->v_arg8);

	kprintf("      cr2    0x%llx\n", saved_state->cr2);
	kprintf("real  cr2    0x%lx\n", get_cr2());
	kprintf("      r15    0x%llx\n", saved_state->r15);
	kprintf("      r14    0x%llx\n", saved_state->r14);
	kprintf("      r13    0x%llx\n", saved_state->r13);
	kprintf("      r12    0x%llx\n", saved_state->r12);
	kprintf("      r11    0x%llx\n", saved_state->r11);
	kprintf("      rbp    0x%llx\n", saved_state->rbp);
	kprintf("      rbx    0x%llx\n", saved_state->rbx);
	kprintf("      rcx    0x%llx\n", saved_state->rcx);
	kprintf("      rax    0x%llx\n", saved_state->rax);

	kprintf("      gs     0x%x\n", saved_state->gs);
	kprintf("      fs     0x%x\n", saved_state->fs);

	kprintf("  isf.trapno  0x%x\n", saved_state->isf.trapno);
	kprintf("  isf._pad    0x%x\n", saved_state->isf._pad);
	kprintf("  isf.trapfn  0x%llx\n", saved_state->isf.trapfn);
	kprintf("  isf.err     0x%llx\n", saved_state->isf.err);
	kprintf("  isf.rip     0x%llx\n", saved_state->isf.rip);
	kprintf("  isf.cs      0x%llx\n", saved_state->isf.cs);
	kprintf("  isf.rflags  0x%llx\n", saved_state->isf.rflags);
	kprintf("  isf.rsp     0x%llx\n", saved_state->isf.rsp);
	kprintf("  isf.ss      0x%llx\n", saved_state->isf.ss);
}
#endif
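
/*
 * Usage sketch for the latency tracker below (derived from the boot-arg
 * parsing, not a separately documented interface): booting with
 *	interrupt_latency_cap_us=500 -interrupt_latency_assert_enable
 * arms a panic for any interrupt serviced for longer than 500us, while
 * without these boot-args the cap defaults to LockTimeOut and only the
 * per-CPU maximum-latency statistics are kept.
 */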

/*
 * Non-zero indicates the latency assert is enabled, capped at the given
 * value in absolute time units.
 */

uint64_t interrupt_latency_cap = 0;
boolean_t ilat_assert = FALSE;

void
interrupt_latency_tracker_setup(void) {
	uint32_t ilat_cap_us;
	if (PE_parse_boot_argn("interrupt_latency_cap_us", &ilat_cap_us, sizeof(ilat_cap_us))) {
		interrupt_latency_cap = ilat_cap_us * NSEC_PER_USEC;
		nanoseconds_to_absolutetime(interrupt_latency_cap, &interrupt_latency_cap);
	} else {
		interrupt_latency_cap = LockTimeOut;
	}
	PE_parse_boot_argn("-interrupt_latency_assert_enable", &ilat_assert, sizeof(ilat_assert));
}

void interrupt_reset_latency_stats(void) {
	uint32_t i;
	for (i = 0; i < real_ncpus; i++) {
		cpu_data_ptr[i]->cpu_max_observed_int_latency =
		    cpu_data_ptr[i]->cpu_max_observed_int_latency_vector = 0;
	}
}

void interrupt_populate_latency_stats(char *buf, unsigned bufsize) {
	uint32_t i, tcpu = ~0;
	uint64_t cur_max = 0;

	for (i = 0; i < real_ncpus; i++) {
		if (cur_max < cpu_data_ptr[i]->cpu_max_observed_int_latency) {
			cur_max = cpu_data_ptr[i]->cpu_max_observed_int_latency;
			tcpu = i;
		}
	}

	if (tcpu < real_ncpus)
		snprintf(buf, bufsize, "0x%x 0x%x 0x%llx", tcpu, cpu_data_ptr[tcpu]->cpu_max_observed_int_latency_vector, cpu_data_ptr[tcpu]->cpu_max_observed_int_latency);
}

uint32_t interrupt_timer_coalescing_enabled = 1;
uint64_t interrupt_coalesced_timers;

/*
 * Handle interrupts:
 *  - local APIC interrupts (IPIs, timers, etc) are handled by the kernel,
 *  - device interrupts go to the platform expert.
 */
void
interrupt(x86_saved_state_t *state)
{
	uint64_t	rip;
	uint64_t	rsp;
	int		interrupt_num;
	boolean_t	user_mode = FALSE;
	int		ipl;
	int		cnum = cpu_number();
	cpu_data_t	*cdp = cpu_data_ptr[cnum];
	int		itype = 0;

	if (is_saved_state64(state) == TRUE) {
		x86_saved_state64_t	*state64;

		state64 = saved_state64(state);
		rip = state64->isf.rip;
		rsp = state64->isf.rsp;
		interrupt_num = state64->isf.trapno;
#ifdef __x86_64__
		if (state64->isf.cs & 0x03)
#endif
			user_mode = TRUE;
	} else {
		x86_saved_state32_t	*state32;

		state32 = saved_state32(state);
		if (state32->cs & 0x03)
			user_mode = TRUE;
		rip = state32->eip;
		rsp = state32->uesp;
		interrupt_num = state32->trapno;
	}

	if (cpu_data_ptr[cnum]->lcpu.package->num_idle == topoParms.nLThreadsPerPackage)
		cpu_data_ptr[cnum]->cpu_hwIntpexits[interrupt_num]++;

	if (interrupt_num == (LAPIC_DEFAULT_INTERRUPT_BASE + LAPIC_INTERPROCESSOR_INTERRUPT))
		itype = 1;
	else if (interrupt_num == (LAPIC_DEFAULT_INTERRUPT_BASE + LAPIC_TIMER_INTERRUPT))
		itype = 2;
	else
		itype = 3;

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
		MACHDBG_CODE(DBG_MACH_EXCP_INTR, 0) | DBG_FUNC_START,
		interrupt_num,
		(user_mode ? rip : VM_KERNEL_UNSLIDE(rip)),
		user_mode, itype, 0);

	SCHED_STATS_INTERRUPT(current_processor());

#if CONFIG_TELEMETRY
	if (telemetry_needs_record
	    && (current_task() != kernel_task)
#if CONFIG_SCHED_IDLE_IN_PLACE
	    && ((current_thread()->state & TH_IDLE) == 0) /* idle-in-place should be treated like the idle thread */
#endif
	    ) {
		telemetry_mark_curthread(user_mode);
	}
#endif

	ipl = get_preemption_level();
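
	/*
	 * The preemption level is sampled here so that any handler
	 * dispatched below which disables preemption without re-enabling
	 * it (or vice versa) can be caught: the check after dispatch
	 * panics if the level has changed.
	 */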

	/*
	 * Handle local APIC interrupts
	 * else call platform expert for devices.
	 */
	if (!lapic_interrupt(interrupt_num, state))
		PE_incoming_interrupt(interrupt_num);

	if (__improbable(get_preemption_level() != ipl)) {
		panic("Preemption level altered by interrupt vector 0x%x: initial 0x%x, final: 0x%x\n", interrupt_num, ipl, get_preemption_level());
	}


	if (__improbable(cdp->cpu_nested_istack)) {
		cdp->cpu_nested_istack_events++;
	} else {
		uint64_t ctime = mach_absolute_time();
		uint64_t int_latency = ctime - cdp->cpu_int_event_time;
		uint64_t esdeadline, ehdeadline;
		/* Attempt to process deferred timers in the context of
		 * this interrupt, unless interrupt time has already exceeded
		 * TCOAL_ILAT_THRESHOLD.
		 */
#define TCOAL_ILAT_THRESHOLD (30000ULL)

		if ((int_latency < TCOAL_ILAT_THRESHOLD) &&
		    interrupt_timer_coalescing_enabled) {
			esdeadline = cdp->rtclock_timer.queue.earliest_soft_deadline;
			ehdeadline = cdp->rtclock_timer.deadline;
			if ((ctime >= esdeadline) && (ctime < ehdeadline)) {
				interrupt_coalesced_timers++;
				TCOAL_DEBUG(0x88880000 | DBG_FUNC_START, ctime, esdeadline, ehdeadline, interrupt_coalesced_timers, 0);
				rtclock_intr(state);
				TCOAL_DEBUG(0x88880000 | DBG_FUNC_END, ctime, esdeadline, interrupt_coalesced_timers, 0, 0);
			} else {
				TCOAL_DEBUG(0x77770000, ctime, cdp->rtclock_timer.queue.earliest_soft_deadline, cdp->rtclock_timer.deadline, interrupt_coalesced_timers, 0);
			}
		}

		if (__improbable(ilat_assert && (int_latency > interrupt_latency_cap) && !machine_timeout_suspended())) {
			panic("Interrupt vector 0x%x exceeded interrupt latency threshold, 0x%llx absolute time delta, prior signals: 0x%x, current signals: 0x%x", interrupt_num, int_latency, cdp->cpu_prior_signals, cdp->cpu_signals);
		}

		if (__improbable(int_latency > cdp->cpu_max_observed_int_latency)) {
			cdp->cpu_max_observed_int_latency = int_latency;
			cdp->cpu_max_observed_int_latency_vector = interrupt_num;
		}
	}

	/*
	 * Having serviced the interrupt first, look at the interrupted stack depth.
	 */
	if (!user_mode) {
		uint64_t depth = cdp->cpu_kernel_stack
				 + sizeof(struct x86_kernel_state)
				 + sizeof(struct i386_exception_link *)
				 - rsp;
		if (__improbable(depth > kernel_stack_depth_max)) {
			kernel_stack_depth_max = (vm_offset_t)depth;
			KERNEL_DEBUG_CONSTANT(
				MACHDBG_CODE(DBG_MACH_SCHED, MACH_STACK_DEPTH),
				(long) depth, (long) VM_KERNEL_UNSLIDE(rip), 0, 0, 0);
		}
	}

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
		MACHDBG_CODE(DBG_MACH_EXCP_INTR, 0) | DBG_FUNC_END,
		interrupt_num, 0, 0, 0, 0);

}

static inline void
reset_dr7(void)
{
	long dr7 = 0x400; /* magic dr7 reset value; 32 bit on i386, 64 bit on x86_64 */
	__asm__ volatile("mov %0,%%dr7" : : "r" (dr7));
}
#if MACH_KDP
unsigned kdp_has_active_watchpoints = 0;
#define NO_WATCHPOINTS (!kdp_has_active_watchpoints)
#else
#define NO_WATCHPOINTS 1
#endif
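
/*
 * Note on fault recovery (background, partly an assumption): the
 * recover_table consulted by kernel_trap() below is an array of
 * (fault_addr, recover_addr) pairs emitted alongside the assembly
 * copyin/copyout primitives (see the RECOVERY_SECTION/RECOVER macros
 * in locore). A kernel fault at a registered fault_addr resumes at the
 * matching recover_addr, turning what would otherwise be a panic into
 * an error return from the copy routine.
 */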

/*
 * Trap from kernel mode.  Only page-fault errors are recoverable,
 * and then only in special circumstances.  All other errors are
 * fatal: a trap that is not handled here does not return, but
 * enters the debugger and/or panics.
 */
void
kernel_trap(
	x86_saved_state_t	*state,
	uintptr_t		*lo_spp)
{
	x86_saved_state64_t	*saved_state;
	int			code;
	user_addr_t		vaddr;
	int			type;
	vm_map_t		map = 0;	/* protected by T_PAGE_FAULT */
	kern_return_t		result = KERN_FAILURE;
	thread_t		thread;
	ast_t			*myast;
	boolean_t		intr;
	vm_prot_t		prot;
	struct recovery		*rp;
	vm_offset_t		kern_ip;
#if NCOPY_WINDOWS > 0
	int			fault_in_copy_window = -1;
#endif
	int			is_user = 0;

	thread = current_thread();

	if (__improbable(is_saved_state32(state)))
		panic("kernel_trap(%p) with 32-bit state", state);
	saved_state = saved_state64(state);

	/* Record cpu where state was captured */
	saved_state->isf.cpu = cpu_number();

	vaddr = (user_addr_t)saved_state->cr2;
	type  = saved_state->isf.trapno;
	code  = (int)(saved_state->isf.err & 0xffff);
	intr  = (saved_state->isf.rflags & EFL_IF) != 0;	/* state of ints at trap */
	kern_ip = (vm_offset_t)saved_state->isf.rip;

	myast = ast_pending();

	perfASTCallback astfn = perfASTHook;
	if (__improbable(astfn != NULL)) {
		if (*myast & AST_CHUD_ALL)
			astfn(AST_CHUD_ALL, myast);
	} else
		*myast &= ~AST_CHUD_ALL;

	/*
	 * Is there a hook?
	 */
	perfCallback fn = perfTrapHook;
	if (__improbable(fn != NULL)) {
		if (fn(type, NULL, 0, 0) == KERN_SUCCESS) {
			/*
			 * If it succeeds, we are done...
			 */
			return;
		}
	}

#if CONFIG_DTRACE
	if (__improbable(tempDTraceTrapHook != NULL)) {
		if (tempDTraceTrapHook(type, state, lo_spp, 0) == KERN_SUCCESS) {
			/*
			 * If it succeeds, we are done...
			 */
			return;
		}
	}
#endif /* CONFIG_DTRACE */

	/*
	 * We come here with interrupts off, as we don't want to recurse
	 * on preemption below; but we do want to re-enable interrupts
	 * as soon as we possibly can, to hold latency down.
	 */
	if (__improbable(T_PREEMPT == type)) {
		ast_taken(AST_PREEMPTION, FALSE);

		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
			(MACHDBG_CODE(DBG_MACH_EXCP_KTRAP_x86, type)) | DBG_FUNC_NONE,
			0, 0, 0, VM_KERNEL_UNSLIDE(kern_ip), 0);
		return;
	}

	if (T_PAGE_FAULT == type) {
		/*
		 * assume we're faulting in the kernel map
		 */
		map = kernel_map;

		if (__probable(thread != THREAD_NULL && thread->map != kernel_map)) {
#if NCOPY_WINDOWS > 0
			vm_offset_t	copy_window_base;
			vm_offset_t	kvaddr;
			int		window_index;

			kvaddr = (vm_offset_t)vaddr;
			/*
			 * must determine if fault occurred in
			 * the copy window while pre-emption is
			 * disabled for this processor so that
			 * we only need to look at the window
			 * associated with this processor
			 */
			copy_window_base = current_cpu_datap()->cpu_copywindow_base;

			if (kvaddr >= copy_window_base && kvaddr < (copy_window_base + (NBPDE * NCOPY_WINDOWS)) ) {

				window_index = (int)((kvaddr - copy_window_base) / NBPDE);

				if (thread->machine.copy_window[window_index].user_base != (user_addr_t)-1) {

					kvaddr -= (copy_window_base + (NBPDE * window_index));
					vaddr = thread->machine.copy_window[window_index].user_base + kvaddr;

					map = thread->map;
					fault_in_copy_window = window_index;
				}
				is_user = -1;
			}
#else
			if (__probable(vaddr < VM_MAX_USER_PAGE_ADDRESS)) {
				/* fault occurred in userspace */
				map = thread->map;
				is_user = -1;
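
				/*
				 * Page-fault error-code bits (per the Intel
				 * SDM): T_PF_PROT means the page was present
				 * (a protection violation rather than a
				 * missing page), T_PF_WRITE means the access
				 * was a write, and T_PF_EXECUTE means an
				 * instruction fetch (meaningful with NX/SMEP).
				 * The SMEP test below keys off PROT|EXECUTE
				 * with rip equal to the faulting address.
				 */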

				/* Intercept a potential Supervisor Mode Execute
				 * Protection fault.  These criteria identify
				 * both NX faults and SMEP faults, but both
				 * are fatal.  We avoid checking PTEs (racy).
				 * (The VM could just redrive a SMEP fault, hence
				 * the intercept).
				 */
				if (__improbable((code == (T_PF_PROT | T_PF_EXECUTE)) && (pmap_smep_enabled) && (saved_state->isf.rip == vaddr))) {
					goto debugger_entry;
				}

				/*
				 * If we're not sharing cr3 with the user
				 * and we faulted in copyio,
				 * then switch cr3 here and dismiss the fault.
				 */
				if (no_shared_cr3 &&
				    (thread->machine.specFlags&CopyIOActive) &&
				    map->pmap->pm_cr3 != get_cr3_base()) {
					pmap_assert(current_cpu_datap()->cpu_pmap_pcid_enabled == FALSE);
					set_cr3_raw(map->pmap->pm_cr3);
					return;
				}
			}
#endif
		}
	}
	user_addr_t	kd_vaddr = is_user ? vaddr : VM_KERNEL_UNSLIDE(vaddr);
	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
		(MACHDBG_CODE(DBG_MACH_EXCP_KTRAP_x86, type)) | DBG_FUNC_NONE,
		(unsigned)(kd_vaddr >> 32), (unsigned)kd_vaddr, is_user,
		VM_KERNEL_UNSLIDE(kern_ip), 0);


	(void) ml_set_interrupts_enabled(intr);

	switch (type) {

	    case T_NO_FPU:
		fpnoextflt();
		return;

	    case T_FPU_FAULT:
		fpextovrflt();
		return;

	    case T_FLOATING_POINT_ERROR:
		fpexterrflt();
		return;

	    case T_SSE_FLOAT_ERROR:
		fpSSEexterrflt();
		return;

	    case T_DEBUG:
		if ((saved_state->isf.rflags & EFL_TF) == 0 && NO_WATCHPOINTS) {
			/* We've somehow encountered a debug
			 * register match that does not belong
			 * to the kernel debugger.
			 * This isn't supposed to happen.
			 */
			reset_dr7();
			return;
		}
		goto debugger_entry;
#ifdef __x86_64__
	    case T_INT3:
		goto debugger_entry;
#endif
	    case T_PAGE_FAULT:

#if CONFIG_DTRACE
		if (thread != THREAD_NULL && thread->options & TH_OPT_DTRACE) {	/* Executing under dtrace_probe? */
			if (dtrace_tally_fault(vaddr)) { /* Should a fault under dtrace be ignored? */
				/*
				 * DTrace has "anticipated" the possibility of this fault, and has
				 * established the suitable recovery state. Drop down now into the
				 * recovery handling code in "case T_GENERAL_PROTECTION:".
				 */
				goto FALL_THROUGH;
			}
		}
#endif /* CONFIG_DTRACE */

		prot = VM_PROT_READ;

		if (code & T_PF_WRITE)
			prot |= VM_PROT_WRITE;
#if PAE
		if (code & T_PF_EXECUTE)
			prot |= VM_PROT_EXECUTE;
#endif

		result = vm_fault(map,
				  vm_map_trunc_page(vaddr,
						    PAGE_MASK),
				  prot,
				  FALSE,
				  THREAD_UNINT, NULL, 0);

		if (result == KERN_SUCCESS) {
#if NCOPY_WINDOWS > 0
			if (fault_in_copy_window != -1) {
				ml_set_interrupts_enabled(FALSE);
				copy_window_fault(thread, map,
						  fault_in_copy_window);
				(void) ml_set_interrupts_enabled(intr);
			}
#endif /* NCOPY_WINDOWS > 0 */
			return;
		}
		/*
		 * fall through
		 */
#if CONFIG_DTRACE
FALL_THROUGH:
#endif /* CONFIG_DTRACE */

	    case T_GENERAL_PROTECTION:
		/*
		 * If there is a failure recovery address
		 * for this fault, go there.
		 */
		for (rp = recover_table; rp < recover_table_end; rp++) {
			if (kern_ip == rp->fault_addr) {
				set_recovery_ip(saved_state, rp->recover_addr);
				return;
			}
		}

		/*
		 * Check thread recovery address also.
		 */
		if (thread != THREAD_NULL && thread->recover) {
			set_recovery_ip(saved_state, thread->recover);
			thread->recover = 0;
			return;
		}
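
		/*
		 * (thread->recover is consumed rather than merely
		 * consulted: it is cleared above so that a second,
		 * unexpected fault in the same region is fatal instead
		 * of being silently re-vectored.)
		 */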

		/*
		 * Unanticipated page-fault errors in kernel
		 * should not happen.
		 *
		 * fall through...
		 */
	    default:
		/*
		 * Exception 15 is reserved but some chips may generate it
		 * spuriously. Seen at startup on AMD Athlon-64.
		 */
		if (type == 15) {
			kprintf("kernel_trap() ignoring spurious trap 15\n");
			return;
		}
debugger_entry:
		/* Ensure that the i386_kernel_state at the base of the
		 * current thread's stack (if any) is synchronized with the
		 * context at the moment of the trap, to facilitate
		 * access through the debugger.
		 */
		sync_iss_to_iks(state);
#if MACH_KDP
		if (current_debugger != KDB_CUR_DB) {
			if (kdp_i386_trap(type, saved_state, result, (vm_offset_t)vaddr))
				return;
		}
#endif
	}
	pal_cli();
	panic_trap(saved_state);
	/*
	 * NO RETURN
	 */
}


static void
set_recovery_ip(x86_saved_state64_t *saved_state, vm_offset_t ip)
{
	saved_state->isf.rip = ip;
}




static void
panic_trap(x86_saved_state64_t *regs)
{
	const char	*trapname = "Unknown";
	pal_cr_t	cr0, cr2, cr3, cr4;
	boolean_t	potential_smep_fault = FALSE, potential_kernel_NX_fault = FALSE;

	pal_get_control_registers( &cr0, &cr2, &cr3, &cr4 );
	assert(ml_get_interrupts_enabled() == FALSE);
	current_cpu_datap()->cpu_fatal_trap_state = regs;
	/*
	 * Issue an I/O port read if one has been requested - this is an
	 * event logic analyzers can use as a trigger point.
	 */
	panic_io_port_read();

	kprintf("panic trap number 0x%x, rip 0x%016llx\n",
		regs->isf.trapno, regs->isf.rip);
	kprintf("cr0 0x%016llx cr2 0x%016llx cr3 0x%016llx cr4 0x%016llx\n",
		cr0, cr2, cr3, cr4);

	if (regs->isf.trapno < TRAP_TYPES)
		trapname = trap_type[regs->isf.trapno];

	if ((regs->isf.trapno == T_PAGE_FAULT) && (regs->isf.err == (T_PF_PROT | T_PF_EXECUTE)) && (regs->isf.rip == regs->cr2)) {
		if (pmap_smep_enabled && (regs->isf.rip < VM_MAX_USER_PAGE_ADDRESS)) {
			potential_smep_fault = TRUE;
		} else if (regs->isf.rip >= VM_MIN_KERNEL_AND_KEXT_ADDRESS) {
			potential_kernel_NX_fault = TRUE;
		}
	}

#undef panic
	panic("Kernel trap at 0x%016llx, type %d=%s, registers:\n"
	      "CR0: 0x%016llx, CR2: 0x%016llx, CR3: 0x%016llx, CR4: 0x%016llx\n"
	      "RAX: 0x%016llx, RBX: 0x%016llx, RCX: 0x%016llx, RDX: 0x%016llx\n"
	      "RSP: 0x%016llx, RBP: 0x%016llx, RSI: 0x%016llx, RDI: 0x%016llx\n"
	      "R8:  0x%016llx, R9:  0x%016llx, R10: 0x%016llx, R11: 0x%016llx\n"
	      "R12: 0x%016llx, R13: 0x%016llx, R14: 0x%016llx, R15: 0x%016llx\n"
	      "RFL: 0x%016llx, RIP: 0x%016llx, CS:  0x%016llx, SS:  0x%016llx\n"
	      "Fault CR2: 0x%016llx, Error code: 0x%016llx, Fault CPU: 0x%x%s%s%s\n",
	      regs->isf.rip, regs->isf.trapno, trapname,
	      cr0, cr2, cr3, cr4,
	      regs->rax, regs->rbx, regs->rcx, regs->rdx,
	      regs->isf.rsp, regs->rbp, regs->rsi, regs->rdi,
	      regs->r8,  regs->r9,  regs->r10, regs->r11,
	      regs->r12, regs->r13, regs->r14, regs->r15,
	      regs->isf.rflags, regs->isf.rip, regs->isf.cs & 0xFFFF,
	      regs->isf.ss & 0xFFFF, regs->cr2, regs->isf.err, regs->isf.cpu,
	      virtualized ? " VMM" : "",
	      potential_kernel_NX_fault ? " Kernel NX fault" : "",
	      potential_smep_fault ? " SMEP/User NX fault" : "");
	/*
	 * This next statement is not executed,
	 * but it's needed to stop the compiler using tail call optimization
	 * for the panic call - which confuses the subsequent backtrace.
	 */
	cr0 = 0;
}

#if CONFIG_DTRACE
extern kern_return_t dtrace_user_probe(x86_saved_state_t *);
#endif

/*
 * Trap from user mode.
 */
void
user_trap(
	x86_saved_state_t *saved_state)
{
	int			exc;
	int			err;
	mach_exception_code_t	code;
	mach_exception_subcode_t subcode;
	int			type;
	user_addr_t		vaddr;
	vm_prot_t		prot;
	thread_t		thread = current_thread();
	ast_t			*myast;
	kern_return_t		kret;
	user_addr_t		rip;
	unsigned long		dr6 = 0; /* 32 bit for i386, 64 bit for x86_64 */

	assert((is_saved_state32(saved_state) && !thread_is_64bit(thread)) ||
	       (is_saved_state64(saved_state) &&  thread_is_64bit(thread)));

	if (is_saved_state64(saved_state)) {
		x86_saved_state64_t	*regs;

		regs = saved_state64(saved_state);

		/* Record cpu where state was captured */
		regs->isf.cpu = cpu_number();

		type  = regs->isf.trapno;
		err   = (int)regs->isf.err & 0xffff;
		vaddr = (user_addr_t)regs->cr2;
		rip   = (user_addr_t)regs->isf.rip;
	} else {
		x86_saved_state32_t	*regs;

		regs = saved_state32(saved_state);

		/* Record cpu where state was captured */
		regs->cpu = cpu_number();

		type  = regs->trapno;
		err   = regs->err & 0xffff;
		vaddr = (user_addr_t)regs->cr2;
		rip   = (user_addr_t)regs->eip;
	}

	if ((type == T_DEBUG) && thread->machine.ids) {
		unsigned long clear = 0;
		/* Stash and clear this processor's DR6 value, in the event
		 * this was a debug register match
		 */
		__asm__ volatile ("mov %%db6, %0" : "=r" (dr6));
		__asm__ volatile ("mov %0, %%db6" : : "r" (clear));
	}

	pal_sti();

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
		(MACHDBG_CODE(DBG_MACH_EXCP_UTRAP_x86, type)) | DBG_FUNC_NONE,
		(unsigned)(vaddr>>32), (unsigned)vaddr,
		(unsigned)(rip>>32), (unsigned)rip, 0);

	code = 0;
	subcode = 0;
	exc = 0;

#if DEBUG_TRACE
	kprintf("user_trap(%p) type=%d vaddr=0x%016llx\n",
		(void *)saved_state, type, vaddr);
#endif

	perfASTCallback astfn = perfASTHook;
	if (__improbable(astfn != NULL)) {
		myast = ast_pending();
		if (*myast & AST_CHUD_ALL) {
			astfn(AST_CHUD_ALL, myast);
		}
	}

	/* Is there a hook? */
	perfCallback fn = perfTrapHook;
	if (__improbable(fn != NULL)) {
		if (fn(type, saved_state, 0, 0) == KERN_SUCCESS)
			return;	/* If it succeeds, we are done... */
	}

	/*
	 * DTrace does not consume all user traps, only INT_3's for now.
	 * Avoid needlessly calling tempDTraceTrapHook here, and let the
	 * INT_3 case handle them.
	 */
	DEBUG_KPRINT_SYSCALL_MASK(1,
		"user_trap: type=0x%x(%s) err=0x%x cr2=%p rip=%p\n",
		type, trap_type[type], err, (void *)(long) vaddr, (void *)(long) rip);

	switch (type) {

	    case T_DIVIDE_ERROR:
		exc = EXC_ARITHMETIC;
		code = EXC_I386_DIV;
		break;
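
	    /*
	     * Background for the T_DEBUG case below: the processor sets
	     * status bits in DR6 on a debug exception but never clears
	     * them itself, which is why %db6 was stashed and zeroed on
	     * entry above; the stashed value is propagated to the
	     * thread's debug state here so that stale bits aren't
	     * misattributed to a later exception.
	     */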
	    case T_DEBUG:
		{
			pcb_t	pcb;
			/*
			 * Update the PCB with this processor's DR6 value
			 * in the event this was a debug register match.
			 */
			pcb = THREAD_TO_PCB(thread);
			if (pcb->ids) {
				/*
				 * We can get and set the status register
				 * in 32-bit mode even on a 64-bit thread
				 * because the high order bits are not
				 * used on x86_64.
				 */
				if (thread_is_64bit(thread)) {
					x86_debug_state64_t *ids = pcb->ids;
					ids->dr6 = dr6;
				} else { /* 32 bit thread */
					x86_debug_state32_t *ids = pcb->ids;
					ids->dr6 = (uint32_t) dr6;
				}
			}
			exc = EXC_BREAKPOINT;
			code = EXC_I386_SGL;
			break;
		}
	    case T_INT3:
#if CONFIG_DTRACE
		if (dtrace_user_probe(saved_state) == KERN_SUCCESS)
			return; /* If it succeeds, we are done... */
#endif
		exc = EXC_BREAKPOINT;
		code = EXC_I386_BPT;
		break;

	    case T_OVERFLOW:
		exc = EXC_ARITHMETIC;
		code = EXC_I386_INTO;
		break;

	    case T_OUT_OF_BOUNDS:
		exc = EXC_SOFTWARE;
		code = EXC_I386_BOUND;
		break;

	    case T_INVALID_OPCODE:
		exc = EXC_BAD_INSTRUCTION;
		code = EXC_I386_INVOP;
		break;

	    case T_NO_FPU:
		fpnoextflt();
		return;

	    case T_FPU_FAULT:
		fpextovrflt(); /* Propagates exception directly, doesn't return */
		return;

	    case T_INVALID_TSS:	/* invalid TSS == iret with NT flag set */
		exc = EXC_BAD_INSTRUCTION;
		code = EXC_I386_INVTSSFLT;
		subcode = err;
		break;

	    case T_SEGMENT_NOT_PRESENT:
		exc = EXC_BAD_INSTRUCTION;
		code = EXC_I386_SEGNPFLT;
		subcode = err;
		break;

	    case T_STACK_FAULT:
		exc = EXC_BAD_INSTRUCTION;
		code = EXC_I386_STKFLT;
		subcode = err;
		break;

	    case T_GENERAL_PROTECTION:
		/*
		 * There's a wide range of circumstances which generate this
		 * class of exception. From user-space, many involve bad
		 * addresses (such as a non-canonical 64-bit address).
		 * So we map this to EXC_BAD_ACCESS (and thereby SIGSEGV).
		 * The trouble is cr2 doesn't contain the faulting address;
		 * we'd need to decode the faulting instruction to really
		 * determine this. We'll leave that to debuggers.
		 * However, attempted execution of privileged instructions
		 * (e.g. cli) also generates GP faults and so we map these
		 * to EXC_BAD_ACCESS (and thence SIGSEGV) also - rather than
		 * EXC_BAD_INSTRUCTION which is more accurate. We just can't
		 * win!
		 */
		exc = EXC_BAD_ACCESS;
		code = EXC_I386_GPFLT;
		subcode = err;
		break;

	    case T_PAGE_FAULT:
	    {
		prot = VM_PROT_READ;

		if (err & T_PF_WRITE)
			prot |= VM_PROT_WRITE;
#if PAE
		if (__improbable(err & T_PF_EXECUTE))
			prot |= VM_PROT_EXECUTE;
#endif
		kret = vm_fault(thread->map,
				vm_map_trunc_page(vaddr,
						  PAGE_MASK),
				prot, FALSE,
				THREAD_ABORTSAFE, NULL, 0);

		if (__probable((kret == KERN_SUCCESS) || (kret == KERN_ABORTED))) {
			thread_exception_return();
			/*NOTREACHED*/
		}

		user_page_fault_continue(kret);
	    }	/* NOTREACHED */
		break;

	    case T_SSE_FLOAT_ERROR:
		fpSSEexterrflt(); /* Propagates exception directly, doesn't return */
		return;


	    case T_FLOATING_POINT_ERROR:
		fpexterrflt(); /* Propagates exception directly, doesn't return */
		return;
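
	    /*
	     * T_DTRACE_RET is the "int $0x7f" vector reserved for DTrace;
	     * the fasttrap/pid provider is believed to raise it when
	     * returning from an emulated probe (an assumption here - see
	     * dtrace_user_probe()). An unexpected 0x7f is mapped to an
	     * illegal-instruction exception below.
	     */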
	    case T_DTRACE_RET:
#if CONFIG_DTRACE
		if (dtrace_user_probe(saved_state) == KERN_SUCCESS)
			return; /* If it succeeds, we are done... */
#endif
		/*
		 * If we get an INT 0x7f when we do not expect to,
		 * treat it as an illegal instruction.
		 */
		exc = EXC_BAD_INSTRUCTION;
		code = EXC_I386_INVOP;
		break;

	    default:
		panic("Unexpected user trap, type %d", type);
		return;
	}
	/* Note: Codepaths that directly return from user_trap() have pending
	 * ASTs processed in locore
	 */
	i386_exception(exc, code, subcode);
	/* NOTREACHED */
}


/*
 * Handle AST traps for i386.
 */

extern void     log_thread_action (thread_t, char *);

void
i386_astintr(int preemption)
{
	ast_t		mask = AST_ALL;
	spl_t		s;

	if (preemption)
		mask = AST_PREEMPTION;

	s = splsched();

	ast_taken(mask, s);

	splx(s);
}

/*
 * Handle exceptions for i386.
 *
 * If we are an AT bus machine, we must turn off the AST for a
 * delayed floating-point exception.
 *
 * If we are providing floating-point emulation, we may have
 * to retrieve the real register values from the floating point
 * emulator.
 */
void
i386_exception(
	int	exc,
	mach_exception_code_t code,
	mach_exception_subcode_t subcode)
{
	mach_exception_data_type_t   codes[EXCEPTION_CODE_MAX];

	DEBUG_KPRINT_SYSCALL_MACH("i386_exception: exc=%d code=0x%llx subcode=0x%llx\n",
		exc, code, subcode);
	codes[0] = code;		/* new exception interface */
	codes[1] = subcode;
	exception_triage(exc, codes, 2);
	/*NOTREACHED*/
}


/* Synchronize a thread's i386_kernel_state (if any) with the given
 * i386_saved_state_t obtained from the trap/IPI handler; called in
 * kernel_trap() prior to entering the debugger, and when receiving
 * an "MP_KDP" IPI.
 */

void
sync_iss_to_iks(x86_saved_state_t *saved_state)
{
	struct x86_kernel_state *iks;
	vm_offset_t kstack;
	boolean_t record_active_regs = FALSE;

	/* The PAL may have a special way to sync registers */
	if (saved_state->flavor == THREAD_STATE_NONE)
		pal_get_kern_regs( saved_state );

	if ((kstack = current_thread()->kernel_stack) != 0) {
		x86_saved_state64_t *regs = saved_state64(saved_state);

		iks = STACK_IKS(kstack);
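
		/*
		 * If the saved state is the thread's user register area,
		 * the trap was taken from user mode and no kernel frame
		 * was interrupted; in that case the live registers of this
		 * very handler are captured below instead of copying from
		 * the saved state.
		 */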
		/* Did we take the trap/interrupt in kernel mode? */
		if (regs == USER_REGS64(current_thread()))
			record_active_regs = TRUE;
		else {
			iks->k_rbx = regs->rbx;
			iks->k_rsp = regs->isf.rsp;
			iks->k_rbp = regs->rbp;
			iks->k_r12 = regs->r12;
			iks->k_r13 = regs->r13;
			iks->k_r14 = regs->r14;
			iks->k_r15 = regs->r15;
			iks->k_rip = regs->isf.rip;
		}
	}

	if (record_active_regs == TRUE) {
		/* Show the trap handler path */
		__asm__ volatile("movq %%rbx, %0" : "=m" (iks->k_rbx));
		__asm__ volatile("movq %%rsp, %0" : "=m" (iks->k_rsp));
		__asm__ volatile("movq %%rbp, %0" : "=m" (iks->k_rbp));
		__asm__ volatile("movq %%r12, %0" : "=m" (iks->k_r12));
		__asm__ volatile("movq %%r13, %0" : "=m" (iks->k_r13));
		__asm__ volatile("movq %%r14, %0" : "=m" (iks->k_r14));
		__asm__ volatile("movq %%r15, %0" : "=m" (iks->k_r15));
		/* "Current" instruction pointer: the RIP-relative leaq
		 * takes the address of the local label "1" placed just
		 * after the mov, so k_rip points into this code path.
		 */
		__asm__ volatile("leaq 1f(%%rip), %%rax; mov %%rax, %0\n1:"
				 : "=m" (iks->k_rip)
				 :
				 : "rax");
	}
}

/*
 * This is used by the NMI interrupt handler (from mp.c) to
 * unconditionally sync the trap handler context to the IKS
 * irrespective of whether the NMI was fielded in kernel
 * or user space.
 */
void
sync_iss_to_iks_unconditionally(__unused x86_saved_state_t *saved_state) {
	struct x86_kernel_state *iks;
	vm_offset_t kstack;

	if ((kstack = current_thread()->kernel_stack) != 0) {
		iks = STACK_IKS(kstack);
		/* Display the trap handler path */
		__asm__ volatile("movq %%rbx, %0" : "=m" (iks->k_rbx));
		__asm__ volatile("movq %%rsp, %0" : "=m" (iks->k_rsp));
		__asm__ volatile("movq %%rbp, %0" : "=m" (iks->k_rbp));
		__asm__ volatile("movq %%r12, %0" : "=m" (iks->k_r12));
		__asm__ volatile("movq %%r13, %0" : "=m" (iks->k_r13));
		__asm__ volatile("movq %%r14, %0" : "=m" (iks->k_r14));
		__asm__ volatile("movq %%r15, %0" : "=m" (iks->k_r15));
		/* "Current" instruction pointer */
		__asm__ volatile("leaq 1f(%%rip), %%rax; mov %%rax, %0\n1:" : "=m" (iks->k_rip)::"rax");
	}
}