1/* 2 * Copyright (c) 2000-2012 Apple Inc. All rights reserved. 3 * 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ 5 * 6 * This file contains Original Code and/or Modifications of Original Code 7 * as defined in and that are subject to the Apple Public Source License 8 * Version 2.0 (the 'License'). You may not use this file except in 9 * compliance with the License. The rights granted to you under the License 10 * may not be used to create, or enable the creation or redistribution of, 11 * unlawful or unlicensed copies of an Apple operating system, or to 12 * circumvent, violate, or enable the circumvention or violation of, any 13 * terms of an Apple operating system software license agreement. 14 * 15 * Please obtain a copy of the License at 16 * http://www.opensource.apple.com/apsl/ and read it before using this file. 17 * 18 * The Original Code and all software distributed under the License are 19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 23 * Please see the License for the specific language governing rights and 24 * limitations under the License. 25 * 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ 27 */ 28/* 29* @OSF_COPYRIGHT@ 30*/ 31/* 32* Mach Operating System 33* Copyright (c) 1991,1990,1989,1988 Carnegie Mellon University 34* All Rights Reserved. 35* 36* Permission to use, copy, modify and distribute this software and its 37* documentation is hereby granted, provided that both the copyright 38* notice and this permission notice appear in all copies of the 39* software, derivative works or modified versions, and any portions 40* thereof, and that both notices appear in supporting documentation. 41* 42* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 43* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR 44* ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 45* 46* Carnegie Mellon requests users of this software to return to 47* 48* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU 49* School of Computer Science 50* Carnegie Mellon University 51* Pittsburgh PA 15213-3890 52* 53* any improvements or extensions that they make and grant Carnegie Mellon 54* the rights to redistribute these changes. 55*/ 56/* 57*/ 58 59/* 60* Hardware trap/fault handler. 61 */ 62 63#include <mach_kdp.h> 64#include <mach_ldebug.h> 65 66#include <types.h> 67#include <i386/eflags.h> 68#include <i386/trap.h> 69#include <i386/pmap.h> 70#include <i386/fpu.h> 71#include <i386/misc_protos.h> /* panic_io_port_read() */ 72#include <i386/lapic.h> 73 74#include <mach/exception.h> 75#include <mach/kern_return.h> 76#include <mach/vm_param.h> 77#include <mach/i386/thread_status.h> 78 79#include <vm/vm_kern.h> 80#include <vm/vm_fault.h> 81 82#include <kern/kern_types.h> 83#include <kern/processor.h> 84#include <kern/thread.h> 85#include <kern/task.h> 86#include <kern/sched.h> 87#include <kern/sched_prim.h> 88#include <kern/exception.h> 89#include <kern/spl.h> 90#include <kern/misc_protos.h> 91#include <kern/debug.h> 92#if CONFIG_TELEMETRY 93#include <kern/telemetry.h> 94#endif 95#include <sys/kdebug.h> 96#include <prng/random.h> 97 98#include <string.h> 99 100#include <i386/postcode.h> 101#include <i386/mp_desc.h> 102#include <i386/proc_reg.h> 103#if CONFIG_MCA 104#include <i386/machine_check.h> 105#endif 106#include <mach/i386/syscall_sw.h> 107 108#include <libkern/OSDebug.h> 109#include <i386/cpu_threads.h> 110#include <machine/pal_routines.h> 111 112extern void throttle_lowpri_io(int); 113extern void kprint_state(x86_saved_state64_t *saved_state); 114 115/* 116 * Forward declarations 117 */ 118static void user_page_fault_continue(kern_return_t kret); 119static void panic_trap(x86_saved_state64_t *saved_state); 120static void set_recovery_ip(x86_saved_state64_t *saved_state, vm_offset_t ip); 121 122volatile perfCallback perfTrapHook = NULL; /* Pointer to CHUD trap hook routine */ 123 124#if CONFIG_DTRACE 125/* See <rdar://problem/4613924> */ 126perfCallback tempDTraceTrapHook = NULL; /* Pointer to DTrace fbt trap hook routine */ 127 128extern boolean_t dtrace_tally_fault(user_addr_t); 129#endif 130 131extern boolean_t pmap_smep_enabled; 132extern boolean_t pmap_smap_enabled; 133 134void 135thread_syscall_return( 136 kern_return_t ret) 137{ 138 thread_t thr_act = current_thread(); 139 boolean_t is_mach; 140 int code; 141 142 pal_register_cache_state(thr_act, DIRTY); 143 144 if (thread_is_64bit(thr_act)) { 145 x86_saved_state64_t *regs; 146 147 regs = USER_REGS64(thr_act); 148 149 code = (int) (regs->rax & SYSCALL_NUMBER_MASK); 150 is_mach = (regs->rax & SYSCALL_CLASS_MASK) 151 == (SYSCALL_CLASS_MACH << SYSCALL_CLASS_SHIFT); 152 if (kdebug_enable && is_mach) { 153 /* Mach trap */ 154 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, 155 MACHDBG_CODE(DBG_MACH_EXCP_SC,code)|DBG_FUNC_END, 156 ret, 0, 0, 0, 0); 157 } 158 regs->rax = ret; 159#if DEBUG 160 if (is_mach) 161 DEBUG_KPRINT_SYSCALL_MACH( 162 "thread_syscall_return: 64-bit mach ret=%u\n", 163 ret); 164 else 165 DEBUG_KPRINT_SYSCALL_UNIX( 166 "thread_syscall_return: 64-bit unix ret=%u\n", 167 ret); 168#endif 169 } else { 170 x86_saved_state32_t *regs; 171 172 regs = USER_REGS32(thr_act); 173 174 code = ((int) regs->eax); 175 is_mach = (code < 0); 176 if (kdebug_enable && is_mach) { 177 /* Mach trap */ 178 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, 179 MACHDBG_CODE(DBG_MACH_EXCP_SC,-code)|DBG_FUNC_END, 180 ret, 0, 0, 0, 0); 181 } 182 regs->eax = ret; 183#if DEBUG 184 if (is_mach) 185 DEBUG_KPRINT_SYSCALL_MACH( 186 "thread_syscall_return: 32-bit mach ret=%u\n", 187 ret); 188 else 189 DEBUG_KPRINT_SYSCALL_UNIX( 190 "thread_syscall_return: 32-bit unix ret=%u\n", 191 ret); 192#endif 193 } 194 throttle_lowpri_io(1); 195 196 thread_exception_return(); 197 /*NOTREACHED*/ 198} 199 200 201static inline void 202user_page_fault_continue( 203 kern_return_t kr) 204{ 205 thread_t thread = current_thread(); 206 user_addr_t vaddr; 207 208 if (thread_is_64bit(thread)) { 209 x86_saved_state64_t *uregs; 210 211 uregs = USER_REGS64(thread); 212 213 vaddr = (user_addr_t)uregs->cr2; 214 } else { 215 x86_saved_state32_t *uregs; 216 217 uregs = USER_REGS32(thread); 218 219 vaddr = uregs->cr2; 220 } 221 222 223 /* PAL debug hook */ 224 pal_dbg_page_fault( thread, vaddr, kr ); 225 226 i386_exception(EXC_BAD_ACCESS, kr, vaddr); 227 /*NOTREACHED*/ 228} 229 230/* 231 * Fault recovery in copyin/copyout routines. 232 */ 233struct recovery { 234 uintptr_t fault_addr; 235 uintptr_t recover_addr; 236}; 237 238extern struct recovery recover_table[]; 239extern struct recovery recover_table_end[]; 240 241const char * trap_type[] = {TRAP_NAMES}; 242unsigned TRAP_TYPES = sizeof(trap_type)/sizeof(trap_type[0]); 243 244extern void PE_incoming_interrupt(int interrupt); 245 246#if defined(__x86_64__) && DEBUG 247void 248kprint_state(x86_saved_state64_t *saved_state) 249{ 250 kprintf("current_cpu_datap() 0x%lx\n", (uintptr_t)current_cpu_datap()); 251 kprintf("Current GS base MSR 0x%llx\n", rdmsr64(MSR_IA32_GS_BASE)); 252 kprintf("Kernel GS base MSR 0x%llx\n", rdmsr64(MSR_IA32_KERNEL_GS_BASE)); 253 kprintf("state at 0x%lx:\n", (uintptr_t) saved_state); 254 255 kprintf(" rdi 0x%llx\n", saved_state->rdi); 256 kprintf(" rsi 0x%llx\n", saved_state->rsi); 257 kprintf(" rdx 0x%llx\n", saved_state->rdx); 258 kprintf(" r10 0x%llx\n", saved_state->r10); 259 kprintf(" r8 0x%llx\n", saved_state->r8); 260 kprintf(" r9 0x%llx\n", saved_state->r9); 261 262 kprintf(" cr2 0x%llx\n", saved_state->cr2); 263 kprintf("real cr2 0x%lx\n", get_cr2()); 264 kprintf(" r15 0x%llx\n", saved_state->r15); 265 kprintf(" r14 0x%llx\n", saved_state->r14); 266 kprintf(" r13 0x%llx\n", saved_state->r13); 267 kprintf(" r12 0x%llx\n", saved_state->r12); 268 kprintf(" r11 0x%llx\n", saved_state->r11); 269 kprintf(" rbp 0x%llx\n", saved_state->rbp); 270 kprintf(" rbx 0x%llx\n", saved_state->rbx); 271 kprintf(" rcx 0x%llx\n", saved_state->rcx); 272 kprintf(" rax 0x%llx\n", saved_state->rax); 273 274 kprintf(" gs 0x%x\n", saved_state->gs); 275 kprintf(" fs 0x%x\n", saved_state->fs); 276 277 kprintf(" isf.trapno 0x%x\n", saved_state->isf.trapno); 278 kprintf(" isf._pad 0x%x\n", saved_state->isf._pad); 279 kprintf(" isf.trapfn 0x%llx\n", saved_state->isf.trapfn); 280 kprintf(" isf.err 0x%llx\n", saved_state->isf.err); 281 kprintf(" isf.rip 0x%llx\n", saved_state->isf.rip); 282 kprintf(" isf.cs 0x%llx\n", saved_state->isf.cs); 283 kprintf(" isf.rflags 0x%llx\n", saved_state->isf.rflags); 284 kprintf(" isf.rsp 0x%llx\n", saved_state->isf.rsp); 285 kprintf(" isf.ss 0x%llx\n", saved_state->isf.ss); 286} 287#endif 288 289 290/* 291 * Non-zero indicates latency assert is enabled and capped at valued 292 * absolute time units. 293 */ 294 295uint64_t interrupt_latency_cap = 0; 296boolean_t ilat_assert = FALSE; 297 298void 299interrupt_latency_tracker_setup(void) { 300 uint32_t ilat_cap_us; 301 if (PE_parse_boot_argn("interrupt_latency_cap_us", &ilat_cap_us, sizeof(ilat_cap_us))) { 302 interrupt_latency_cap = ilat_cap_us * NSEC_PER_USEC; 303 nanoseconds_to_absolutetime(interrupt_latency_cap, &interrupt_latency_cap); 304 } else { 305 interrupt_latency_cap = LockTimeOut; 306 } 307 PE_parse_boot_argn("-interrupt_latency_assert_enable", &ilat_assert, sizeof(ilat_assert)); 308} 309 310void interrupt_reset_latency_stats(void) { 311 uint32_t i; 312 for (i = 0; i < real_ncpus; i++) { 313 cpu_data_ptr[i]->cpu_max_observed_int_latency = 314 cpu_data_ptr[i]->cpu_max_observed_int_latency_vector = 0; 315 } 316} 317 318void interrupt_populate_latency_stats(char *buf, unsigned bufsize) { 319 uint32_t i, tcpu = ~0; 320 uint64_t cur_max = 0; 321 322 for (i = 0; i < real_ncpus; i++) { 323 if (cur_max < cpu_data_ptr[i]->cpu_max_observed_int_latency) { 324 cur_max = cpu_data_ptr[i]->cpu_max_observed_int_latency; 325 tcpu = i; 326 } 327 } 328 329 if (tcpu < real_ncpus) 330 snprintf(buf, bufsize, "0x%x 0x%x 0x%llx", tcpu, cpu_data_ptr[tcpu]->cpu_max_observed_int_latency_vector, cpu_data_ptr[tcpu]->cpu_max_observed_int_latency); 331} 332 333uint32_t interrupt_timer_coalescing_enabled = 1; 334uint64_t interrupt_coalesced_timers; 335 336/* 337 * Handle interrupts: 338 * - local APIC interrupts (IPIs, timers, etc) are handled by the kernel, 339 * - device interrupts go to the platform expert. 340 */ 341void 342interrupt(x86_saved_state_t *state) 343{ 344 uint64_t rip; 345 uint64_t rsp; 346 int interrupt_num; 347 boolean_t user_mode = FALSE; 348 int ipl; 349 int cnum = cpu_number(); 350 cpu_data_t *cdp = cpu_data_ptr[cnum]; 351 int itype = 0; 352 353 if (is_saved_state64(state) == TRUE) { 354 x86_saved_state64_t *state64; 355 356 state64 = saved_state64(state); 357 rip = state64->isf.rip; 358 rsp = state64->isf.rsp; 359 interrupt_num = state64->isf.trapno; 360#ifdef __x86_64__ 361 if(state64->isf.cs & 0x03) 362#endif 363 user_mode = TRUE; 364 } else { 365 x86_saved_state32_t *state32; 366 367 state32 = saved_state32(state); 368 if (state32->cs & 0x03) 369 user_mode = TRUE; 370 rip = state32->eip; 371 rsp = state32->uesp; 372 interrupt_num = state32->trapno; 373 } 374 375 if (cpu_data_ptr[cnum]->lcpu.package->num_idle == topoParms.nLThreadsPerPackage) 376 cpu_data_ptr[cnum]->cpu_hwIntpexits[interrupt_num]++; 377 378 if (interrupt_num == (LAPIC_DEFAULT_INTERRUPT_BASE + LAPIC_INTERPROCESSOR_INTERRUPT)) 379 itype = 1; 380 else if (interrupt_num == (LAPIC_DEFAULT_INTERRUPT_BASE + LAPIC_TIMER_INTERRUPT)) 381 itype = 2; 382 else 383 itype = 3; 384 385 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, 386 MACHDBG_CODE(DBG_MACH_EXCP_INTR, 0) | DBG_FUNC_START, 387 interrupt_num, 388 (user_mode ? rip : VM_KERNEL_UNSLIDE(rip)), 389 user_mode, itype, 0); 390 391 SCHED_STATS_INTERRUPT(current_processor()); 392 393#if CONFIG_TELEMETRY 394 if (telemetry_needs_record 395 && (current_task() != kernel_task) 396#if CONFIG_SCHED_IDLE_IN_PLACE 397 && ((current_thread()->state & TH_IDLE) == 0) /* idle-in-place should be treated like the idle thread */ 398#endif 399 ) { 400 telemetry_mark_curthread(user_mode); 401 } 402#endif 403 404 ipl = get_preemption_level(); 405 406 /* 407 * Handle local APIC interrupts 408 * else call platform expert for devices. 409 */ 410 if (!lapic_interrupt(interrupt_num, state)) { 411 PE_incoming_interrupt(interrupt_num); 412 } 413 414 if (__improbable(get_preemption_level() != ipl)) { 415 panic("Preemption level altered by interrupt vector 0x%x: initial 0x%x, final: 0x%x\n", interrupt_num, ipl, get_preemption_level()); 416 } 417 418 419 if (__improbable(cdp->cpu_nested_istack)) { 420 cdp->cpu_nested_istack_events++; 421 } 422 else { 423 uint64_t ctime = mach_absolute_time(); 424 uint64_t int_latency = ctime - cdp->cpu_int_event_time; 425 uint64_t esdeadline, ehdeadline; 426 /* Attempt to process deferred timers in the context of 427 * this interrupt, unless interrupt time has already exceeded 428 * TCOAL_ILAT_THRESHOLD. 429 */ 430#define TCOAL_ILAT_THRESHOLD (30000ULL) 431 432 if ((int_latency < TCOAL_ILAT_THRESHOLD) && 433 interrupt_timer_coalescing_enabled) { 434 esdeadline = cdp->rtclock_timer.queue.earliest_soft_deadline; 435 ehdeadline = cdp->rtclock_timer.deadline; 436 if ((ctime >= esdeadline) && (ctime < ehdeadline)) { 437 interrupt_coalesced_timers++; 438 TCOAL_DEBUG(0x88880000 | DBG_FUNC_START, ctime, esdeadline, ehdeadline, interrupt_coalesced_timers, 0); 439 rtclock_intr(state); 440 TCOAL_DEBUG(0x88880000 | DBG_FUNC_END, ctime, esdeadline, interrupt_coalesced_timers, 0, 0); 441 } else { 442 TCOAL_DEBUG(0x77770000, ctime, cdp->rtclock_timer.queue.earliest_soft_deadline, cdp->rtclock_timer.deadline, interrupt_coalesced_timers, 0); 443 } 444 } 445 446 if (__improbable(ilat_assert && (int_latency > interrupt_latency_cap) && !machine_timeout_suspended())) { 447 panic("Interrupt vector 0x%x exceeded interrupt latency threshold, 0x%llx absolute time delta, prior signals: 0x%x, current signals: 0x%x", interrupt_num, int_latency, cdp->cpu_prior_signals, cdp->cpu_signals); 448 } 449 450 if (__improbable(int_latency > cdp->cpu_max_observed_int_latency)) { 451 cdp->cpu_max_observed_int_latency = int_latency; 452 cdp->cpu_max_observed_int_latency_vector = interrupt_num; 453 } 454 } 455 456 /* 457 * Having serviced the interrupt first, look at the interrupted stack depth. 458 */ 459 if (!user_mode) { 460 uint64_t depth = cdp->cpu_kernel_stack 461 + sizeof(struct x86_kernel_state) 462 + sizeof(struct i386_exception_link *) 463 - rsp; 464 if (__improbable(depth > kernel_stack_depth_max)) { 465 kernel_stack_depth_max = (vm_offset_t)depth; 466 KERNEL_DEBUG_CONSTANT( 467 MACHDBG_CODE(DBG_MACH_SCHED, MACH_STACK_DEPTH), 468 (long) depth, (long) VM_KERNEL_UNSLIDE(rip), 0, 0, 0); 469 } 470 } 471 472 if (cnum == master_cpu) 473 ml_entropy_collect(); 474 475 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, 476 MACHDBG_CODE(DBG_MACH_EXCP_INTR, 0) | DBG_FUNC_END, 477 interrupt_num, 0, 0, 0, 0); 478 479} 480 481static inline void 482reset_dr7(void) 483{ 484 long dr7 = 0x400; /* magic dr7 reset value; 32 bit on i386, 64 bit on x86_64 */ 485 __asm__ volatile("mov %0,%%dr7" : : "r" (dr7)); 486} 487#if MACH_KDP 488unsigned kdp_has_active_watchpoints = 0; 489#define NO_WATCHPOINTS (!kdp_has_active_watchpoints) 490#else 491#define NO_WATCHPOINTS 1 492#endif 493/* 494 * Trap from kernel mode. Only page-fault errors are recoverable, 495 * and then only in special circumstances. All other errors are 496 * fatal. Return value indicates if trap was handled. 497 */ 498 499void 500kernel_trap( 501 x86_saved_state_t *state, 502 uintptr_t *lo_spp) 503{ 504 x86_saved_state64_t *saved_state; 505 int code; 506 user_addr_t vaddr; 507 int type; 508 vm_map_t map = 0; /* protected by T_PAGE_FAULT */ 509 kern_return_t result = KERN_FAILURE; 510 thread_t thread; 511 ast_t *myast; 512 boolean_t intr; 513 vm_prot_t prot; 514 struct recovery *rp; 515 vm_offset_t kern_ip; 516#if NCOPY_WINDOWS > 0 517 int fault_in_copy_window = -1; 518#endif 519 int is_user = 0; 520 521 thread = current_thread(); 522 523 if (__improbable(is_saved_state32(state))) 524 panic("kernel_trap(%p) with 32-bit state", state); 525 saved_state = saved_state64(state); 526 527 /* Record cpu where state was captured */ 528 saved_state->isf.cpu = cpu_number(); 529 530 vaddr = (user_addr_t)saved_state->cr2; 531 type = saved_state->isf.trapno; 532 code = (int)(saved_state->isf.err & 0xffff); 533 intr = (saved_state->isf.rflags & EFL_IF) != 0; /* state of ints at trap */ 534 kern_ip = (vm_offset_t)saved_state->isf.rip; 535 536 myast = ast_pending(); 537 538 perfASTCallback astfn = perfASTHook; 539 if (__improbable(astfn != NULL)) { 540 if (*myast & AST_CHUD_ALL) 541 astfn(AST_CHUD_ALL, myast); 542 } else 543 *myast &= ~AST_CHUD_ALL; 544 545 546#if CONFIG_DTRACE 547 /* 548 * Is there a DTrace hook? 549 */ 550 if (__improbable(tempDTraceTrapHook != NULL)) { 551 if (tempDTraceTrapHook(type, state, lo_spp, 0) == KERN_SUCCESS) { 552 /* 553 * If it succeeds, we are done... 554 */ 555 return; 556 } 557 } 558#endif /* CONFIG_DTRACE */ 559 560 /* 561 * we come here with interrupts off as we don't want to recurse 562 * on preemption below. but we do want to re-enable interrupts 563 * as soon we possibly can to hold latency down 564 */ 565 if (__improbable(T_PREEMPT == type)) { 566 ast_taken(AST_PREEMPTION, FALSE); 567 568 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, 569 (MACHDBG_CODE(DBG_MACH_EXCP_KTRAP_x86, type)) | DBG_FUNC_NONE, 570 0, 0, 0, VM_KERNEL_UNSLIDE(kern_ip), 0); 571 return; 572 } 573 574 if (T_PAGE_FAULT == type) { 575 /* 576 * assume we're faulting in the kernel map 577 */ 578 map = kernel_map; 579 580 if (__probable(thread != THREAD_NULL && thread->map != kernel_map)) { 581#if NCOPY_WINDOWS > 0 582 vm_offset_t copy_window_base; 583 vm_offset_t kvaddr; 584 int window_index; 585 586 kvaddr = (vm_offset_t)vaddr; 587 /* 588 * must determine if fault occurred in 589 * the copy window while pre-emption is 590 * disabled for this processor so that 591 * we only need to look at the window 592 * associated with this processor 593 */ 594 copy_window_base = current_cpu_datap()->cpu_copywindow_base; 595 596 if (kvaddr >= copy_window_base && kvaddr < (copy_window_base + (NBPDE * NCOPY_WINDOWS)) ) { 597 598 window_index = (int)((kvaddr - copy_window_base) / NBPDE); 599 600 if (thread->machine.copy_window[window_index].user_base != (user_addr_t)-1) { 601 602 kvaddr -= (copy_window_base + (NBPDE * window_index)); 603 vaddr = thread->machine.copy_window[window_index].user_base + kvaddr; 604 605 map = thread->map; 606 fault_in_copy_window = window_index; 607 } 608 is_user = -1; 609 } 610#else 611 if (__probable(vaddr < VM_MAX_USER_PAGE_ADDRESS)) { 612 /* fault occurred in userspace */ 613 map = thread->map; 614 is_user = -1; 615 616 /* Intercept a potential Supervisor Mode Execute 617 * Protection fault. These criteria identify 618 * both NX faults and SMEP faults, but both 619 * are fatal. We avoid checking PTEs (racy). 620 * (The VM could just redrive a SMEP fault, hence 621 * the intercept). 622 */ 623 if (__improbable((code == (T_PF_PROT | T_PF_EXECUTE)) && (pmap_smep_enabled) && (saved_state->isf.rip == vaddr))) { 624 goto debugger_entry; 625 } 626 627 /* 628 * If we're not sharing cr3 with the user 629 * and we faulted in copyio, 630 * then switch cr3 here and dismiss the fault. 631 */ 632 if (no_shared_cr3 && 633 (thread->machine.specFlags&CopyIOActive) && 634 map->pmap->pm_cr3 != get_cr3_base()) { 635 pmap_assert(current_cpu_datap()->cpu_pmap_pcid_enabled == FALSE); 636 set_cr3_raw(map->pmap->pm_cr3); 637 return; 638 } 639 640 } 641#endif 642 } 643 } 644 user_addr_t kd_vaddr = is_user ? vaddr : VM_KERNEL_UNSLIDE(vaddr); 645 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, 646 (MACHDBG_CODE(DBG_MACH_EXCP_KTRAP_x86, type)) | DBG_FUNC_NONE, 647 (unsigned)(kd_vaddr >> 32), (unsigned)kd_vaddr, is_user, 648 VM_KERNEL_UNSLIDE(kern_ip), 0); 649 650 651 (void) ml_set_interrupts_enabled(intr); 652 653 switch (type) { 654 655 case T_NO_FPU: 656 fpnoextflt(); 657 return; 658 659 case T_FPU_FAULT: 660 fpextovrflt(); 661 return; 662 663 case T_FLOATING_POINT_ERROR: 664 fpexterrflt(); 665 return; 666 667 case T_SSE_FLOAT_ERROR: 668 fpSSEexterrflt(); 669 return; 670 case T_DEBUG: 671 if ((saved_state->isf.rflags & EFL_TF) == 0 && NO_WATCHPOINTS) 672 { 673 /* We've somehow encountered a debug 674 * register match that does not belong 675 * to the kernel debugger. 676 * This isn't supposed to happen. 677 */ 678 reset_dr7(); 679 return; 680 } 681 goto debugger_entry; 682#ifdef __x86_64__ 683 case T_INT3: 684 goto debugger_entry; 685#endif 686 case T_PAGE_FAULT: 687 688#if CONFIG_DTRACE 689 if (thread != THREAD_NULL && thread->options & TH_OPT_DTRACE) { /* Executing under dtrace_probe? */ 690 if (dtrace_tally_fault(vaddr)) { /* Should a fault under dtrace be ignored? */ 691 /* 692 * DTrace has "anticipated" the possibility of this fault, and has 693 * established the suitable recovery state. Drop down now into the 694 * recovery handling code in "case T_GENERAL_PROTECTION:". 695 */ 696 goto FALL_THROUGH; 697 } 698 } 699#endif /* CONFIG_DTRACE */ 700 701 prot = VM_PROT_READ; 702 703 if (code & T_PF_WRITE) 704 prot |= VM_PROT_WRITE; 705 if (code & T_PF_EXECUTE) 706 prot |= VM_PROT_EXECUTE; 707 708 result = vm_fault(map, 709 vm_map_trunc_page(vaddr, 710 PAGE_MASK), 711 prot, 712 FALSE, 713 THREAD_UNINT, NULL, 0); 714 715 if (result == KERN_SUCCESS) { 716#if NCOPY_WINDOWS > 0 717 if (fault_in_copy_window != -1) { 718 ml_set_interrupts_enabled(FALSE); 719 copy_window_fault(thread, map, 720 fault_in_copy_window); 721 (void) ml_set_interrupts_enabled(intr); 722 } 723#endif /* NCOPY_WINDOWS > 0 */ 724 return; 725 } 726 /* 727 * fall through 728 */ 729#if CONFIG_DTRACE 730FALL_THROUGH: 731#endif /* CONFIG_DTRACE */ 732 733 case T_GENERAL_PROTECTION: 734 /* 735 * If there is a failure recovery address 736 * for this fault, go there. 737 */ 738 for (rp = recover_table; rp < recover_table_end; rp++) { 739 if (kern_ip == rp->fault_addr) { 740 set_recovery_ip(saved_state, rp->recover_addr); 741 return; 742 } 743 } 744 745 /* 746 * Check thread recovery address also. 747 */ 748 if (thread != THREAD_NULL && thread->recover) { 749 set_recovery_ip(saved_state, thread->recover); 750 thread->recover = 0; 751 return; 752 } 753 /* 754 * Unanticipated page-fault errors in kernel 755 * should not happen. 756 * 757 * fall through... 758 */ 759 default: 760 /* 761 * Exception 15 is reserved but some chips may generate it 762 * spuriously. Seen at startup on AMD Athlon-64. 763 */ 764 if (type == 15) { 765 kprintf("kernel_trap() ignoring spurious trap 15\n"); 766 return; 767 } 768debugger_entry: 769 /* Ensure that the i386_kernel_state at the base of the 770 * current thread's stack (if any) is synchronized with the 771 * context at the moment of the trap, to facilitate 772 * access through the debugger. 773 */ 774 sync_iss_to_iks(state); 775#if MACH_KDP 776 if (current_debugger != KDB_CUR_DB) { 777 if (kdp_i386_trap(type, saved_state, result, (vm_offset_t)vaddr)) 778 return; 779 } 780#endif 781 } 782 pal_cli(); 783 panic_trap(saved_state); 784 /* 785 * NO RETURN 786 */ 787} 788 789 790static void 791set_recovery_ip(x86_saved_state64_t *saved_state, vm_offset_t ip) 792{ 793 saved_state->isf.rip = ip; 794} 795 796 797 798 799static void 800panic_trap(x86_saved_state64_t *regs) 801{ 802 const char *trapname = "Unknown"; 803 pal_cr_t cr0, cr2, cr3, cr4; 804 boolean_t potential_smep_fault = FALSE, potential_kernel_NX_fault = FALSE; 805 806 pal_get_control_registers( &cr0, &cr2, &cr3, &cr4 ); 807 assert(ml_get_interrupts_enabled() == FALSE); 808 current_cpu_datap()->cpu_fatal_trap_state = regs; 809 /* 810 * Issue an I/O port read if one has been requested - this is an 811 * event logic analyzers can use as a trigger point. 812 */ 813 panic_io_port_read(); 814 815 kprintf("panic trap number 0x%x, rip 0x%016llx\n", 816 regs->isf.trapno, regs->isf.rip); 817 kprintf("cr0 0x%016llx cr2 0x%016llx cr3 0x%016llx cr4 0x%016llx\n", 818 cr0, cr2, cr3, cr4); 819 820 if (regs->isf.trapno < TRAP_TYPES) 821 trapname = trap_type[regs->isf.trapno]; 822 823 if ((regs->isf.trapno == T_PAGE_FAULT) && (regs->isf.err == (T_PF_PROT | T_PF_EXECUTE)) && (regs->isf.rip == regs->cr2)) { 824 if (pmap_smep_enabled && (regs->isf.rip < VM_MAX_USER_PAGE_ADDRESS)) { 825 potential_smep_fault = TRUE; 826 } else if (regs->isf.rip >= VM_MIN_KERNEL_AND_KEXT_ADDRESS) { 827 potential_kernel_NX_fault = TRUE; 828 } 829 } 830 831#undef panic 832 panic("Kernel trap at 0x%016llx, type %d=%s, registers:\n" 833 "CR0: 0x%016llx, CR2: 0x%016llx, CR3: 0x%016llx, CR4: 0x%016llx\n" 834 "RAX: 0x%016llx, RBX: 0x%016llx, RCX: 0x%016llx, RDX: 0x%016llx\n" 835 "RSP: 0x%016llx, RBP: 0x%016llx, RSI: 0x%016llx, RDI: 0x%016llx\n" 836 "R8: 0x%016llx, R9: 0x%016llx, R10: 0x%016llx, R11: 0x%016llx\n" 837 "R12: 0x%016llx, R13: 0x%016llx, R14: 0x%016llx, R15: 0x%016llx\n" 838 "RFL: 0x%016llx, RIP: 0x%016llx, CS: 0x%016llx, SS: 0x%016llx\n" 839 "Fault CR2: 0x%016llx, Error code: 0x%016llx, Fault CPU: 0x%x%s%s%s%s\n", 840 regs->isf.rip, regs->isf.trapno, trapname, 841 cr0, cr2, cr3, cr4, 842 regs->rax, regs->rbx, regs->rcx, regs->rdx, 843 regs->isf.rsp, regs->rbp, regs->rsi, regs->rdi, 844 regs->r8, regs->r9, regs->r10, regs->r11, 845 regs->r12, regs->r13, regs->r14, regs->r15, 846 regs->isf.rflags, regs->isf.rip, regs->isf.cs & 0xFFFF, 847 regs->isf.ss & 0xFFFF,regs->cr2, regs->isf.err, regs->isf.cpu, 848 virtualized ? " VMM" : "", 849 potential_kernel_NX_fault ? " Kernel NX fault" : "", 850 potential_smep_fault ? " SMEP/User NX fault" : "", 851 ""); 852 /* 853 * This next statement is not executed, 854 * but it's needed to stop the compiler using tail call optimization 855 * for the panic call - which confuses the subsequent backtrace. 856 */ 857 cr0 = 0; 858} 859 860#if CONFIG_DTRACE 861extern kern_return_t dtrace_user_probe(x86_saved_state_t *); 862#endif 863 864/* 865 * Trap from user mode. 866 */ 867void 868user_trap( 869 x86_saved_state_t *saved_state) 870{ 871 int exc; 872 int err; 873 mach_exception_code_t code; 874 mach_exception_subcode_t subcode; 875 int type; 876 user_addr_t vaddr; 877 vm_prot_t prot; 878 thread_t thread = current_thread(); 879 ast_t *myast; 880 kern_return_t kret; 881 user_addr_t rip; 882 unsigned long dr6 = 0; /* 32 bit for i386, 64 bit for x86_64 */ 883 884 assert((is_saved_state32(saved_state) && !thread_is_64bit(thread)) || 885 (is_saved_state64(saved_state) && thread_is_64bit(thread))); 886 887 if (is_saved_state64(saved_state)) { 888 x86_saved_state64_t *regs; 889 890 regs = saved_state64(saved_state); 891 892 /* Record cpu where state was captured */ 893 regs->isf.cpu = cpu_number(); 894 895 type = regs->isf.trapno; 896 err = (int)regs->isf.err & 0xffff; 897 vaddr = (user_addr_t)regs->cr2; 898 rip = (user_addr_t)regs->isf.rip; 899 } else { 900 x86_saved_state32_t *regs; 901 902 regs = saved_state32(saved_state); 903 904 /* Record cpu where state was captured */ 905 regs->cpu = cpu_number(); 906 907 type = regs->trapno; 908 err = regs->err & 0xffff; 909 vaddr = (user_addr_t)regs->cr2; 910 rip = (user_addr_t)regs->eip; 911 } 912 913 if ((type == T_DEBUG) && thread->machine.ids) { 914 unsigned long clear = 0; 915 /* Stash and clear this processor's DR6 value, in the event 916 * this was a debug register match 917 */ 918 __asm__ volatile ("mov %%db6, %0" : "=r" (dr6)); 919 __asm__ volatile ("mov %0, %%db6" : : "r" (clear)); 920 } 921 922 pal_sti(); 923 924 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, 925 (MACHDBG_CODE(DBG_MACH_EXCP_UTRAP_x86, type)) | DBG_FUNC_NONE, 926 (unsigned)(vaddr>>32), (unsigned)vaddr, 927 (unsigned)(rip>>32), (unsigned)rip, 0); 928 929 code = 0; 930 subcode = 0; 931 exc = 0; 932 933#if DEBUG_TRACE 934 kprintf("user_trap(0x%08x) type=%d vaddr=0x%016llx\n", 935 saved_state, type, vaddr); 936#endif 937 938 perfASTCallback astfn = perfASTHook; 939 if (__improbable(astfn != NULL)) { 940 myast = ast_pending(); 941 if (*myast & AST_CHUD_ALL) { 942 astfn(AST_CHUD_ALL, myast); 943 } 944 } 945 946 /* Is there a hook? */ 947 perfCallback fn = perfTrapHook; 948 if (__improbable(fn != NULL)) { 949 if (fn(type, saved_state, 0, 0) == KERN_SUCCESS) 950 return; /* If it succeeds, we are done... */ 951 } 952 953#if CONFIG_DTRACE 954 /* 955 * DTrace does not consume all user traps, only INT_3's for now. 956 * Avoid needlessly calling tempDTraceTrapHook here, and let the 957 * INT_3 case handle them. 958 */ 959#endif 960 961 DEBUG_KPRINT_SYSCALL_MASK(1, 962 "user_trap: type=0x%x(%s) err=0x%x cr2=%p rip=%p\n", 963 type, trap_type[type], err, (void *)(long) vaddr, (void *)(long) rip); 964 965 switch (type) { 966 967 case T_DIVIDE_ERROR: 968 exc = EXC_ARITHMETIC; 969 code = EXC_I386_DIV; 970 break; 971 972 case T_DEBUG: 973 { 974 pcb_t pcb; 975 /* 976 * Update the PCB with this processor's DR6 value 977 * in the event this was a debug register match. 978 */ 979 pcb = THREAD_TO_PCB(thread); 980 if (pcb->ids) { 981 /* 982 * We can get and set the status register 983 * in 32-bit mode even on a 64-bit thread 984 * because the high order bits are not 985 * used on x86_64 986 */ 987 if (thread_is_64bit(thread)) { 988 x86_debug_state64_t *ids = pcb->ids; 989 ids->dr6 = dr6; 990 } else { /* 32 bit thread */ 991 x86_debug_state32_t *ids = pcb->ids; 992 ids->dr6 = (uint32_t) dr6; 993 } 994 } 995 exc = EXC_BREAKPOINT; 996 code = EXC_I386_SGL; 997 break; 998 } 999 case T_INT3: 1000#if CONFIG_DTRACE 1001 if (dtrace_user_probe(saved_state) == KERN_SUCCESS) 1002 return; /* If it succeeds, we are done... */ 1003#endif 1004 exc = EXC_BREAKPOINT; 1005 code = EXC_I386_BPT; 1006 break; 1007 1008 case T_OVERFLOW: 1009 exc = EXC_ARITHMETIC; 1010 code = EXC_I386_INTO; 1011 break; 1012 1013 case T_OUT_OF_BOUNDS: 1014 exc = EXC_SOFTWARE; 1015 code = EXC_I386_BOUND; 1016 break; 1017 1018 case T_INVALID_OPCODE: 1019 exc = EXC_BAD_INSTRUCTION; 1020 code = EXC_I386_INVOP; 1021 break; 1022 1023 case T_NO_FPU: 1024 fpnoextflt(); 1025 return; 1026 1027 case T_FPU_FAULT: 1028 fpextovrflt(); /* Propagates exception directly, doesn't return */ 1029 return; 1030 1031 case T_INVALID_TSS: /* invalid TSS == iret with NT flag set */ 1032 exc = EXC_BAD_INSTRUCTION; 1033 code = EXC_I386_INVTSSFLT; 1034 subcode = err; 1035 break; 1036 1037 case T_SEGMENT_NOT_PRESENT: 1038 exc = EXC_BAD_INSTRUCTION; 1039 code = EXC_I386_SEGNPFLT; 1040 subcode = err; 1041 break; 1042 1043 case T_STACK_FAULT: 1044 exc = EXC_BAD_INSTRUCTION; 1045 code = EXC_I386_STKFLT; 1046 subcode = err; 1047 break; 1048 1049 case T_GENERAL_PROTECTION: 1050 /* 1051 * There's a wide range of circumstances which generate this 1052 * class of exception. From user-space, many involve bad 1053 * addresses (such as a non-canonical 64-bit address). 1054 * So we map this to EXC_BAD_ACCESS (and thereby SIGSEGV). 1055 * The trouble is cr2 doesn't contain the faulting address; 1056 * we'd need to decode the faulting instruction to really 1057 * determine this. We'll leave that to debuggers. 1058 * However, attempted execution of privileged instructions 1059 * (e.g. cli) also generate GP faults and so we map these to 1060 * to EXC_BAD_ACCESS (and thence SIGSEGV) also - rather than 1061 * EXC_BAD_INSTRUCTION which is more accurate. We just can't 1062 * win! 1063 */ 1064 exc = EXC_BAD_ACCESS; 1065 code = EXC_I386_GPFLT; 1066 subcode = err; 1067 break; 1068 1069 case T_PAGE_FAULT: 1070 { 1071 prot = VM_PROT_READ; 1072 1073 if (err & T_PF_WRITE) 1074 prot |= VM_PROT_WRITE; 1075 if (__improbable(err & T_PF_EXECUTE)) 1076 prot |= VM_PROT_EXECUTE; 1077 kret = vm_fault(thread->map, 1078 vm_map_trunc_page(vaddr, 1079 PAGE_MASK), 1080 prot, FALSE, 1081 THREAD_ABORTSAFE, NULL, 0); 1082 1083 if (__probable((kret == KERN_SUCCESS) || (kret == KERN_ABORTED))) { 1084 thread_exception_return(); 1085 /*NOTREACHED*/ 1086 } 1087 1088 user_page_fault_continue(kret); 1089 } /* NOTREACHED */ 1090 break; 1091 1092 case T_SSE_FLOAT_ERROR: 1093 fpSSEexterrflt(); /* Propagates exception directly, doesn't return */ 1094 return; 1095 1096 1097 case T_FLOATING_POINT_ERROR: 1098 fpexterrflt(); /* Propagates exception directly, doesn't return */ 1099 return; 1100 1101 case T_DTRACE_RET: 1102#if CONFIG_DTRACE 1103 if (dtrace_user_probe(saved_state) == KERN_SUCCESS) 1104 return; /* If it succeeds, we are done... */ 1105#endif 1106 /* 1107 * If we get an INT 0x7f when we do not expect to, 1108 * treat it as an illegal instruction 1109 */ 1110 exc = EXC_BAD_INSTRUCTION; 1111 code = EXC_I386_INVOP; 1112 break; 1113 1114 default: 1115 panic("Unexpected user trap, type %d", type); 1116 return; 1117 } 1118 /* Note: Codepaths that directly return from user_trap() have pending 1119 * ASTs processed in locore 1120 */ 1121 i386_exception(exc, code, subcode); 1122 /* NOTREACHED */ 1123} 1124 1125 1126/* 1127 * Handle AST traps for i386. 1128 */ 1129 1130extern void log_thread_action (thread_t, char *); 1131 1132void 1133i386_astintr(int preemption) 1134{ 1135 ast_t mask = AST_ALL; 1136 spl_t s; 1137 1138 if (preemption) 1139 mask = AST_PREEMPTION; 1140 1141 s = splsched(); 1142 1143 ast_taken(mask, s); 1144 1145 splx(s); 1146} 1147 1148/* 1149 * Handle exceptions for i386. 1150 * 1151 * If we are an AT bus machine, we must turn off the AST for a 1152 * delayed floating-point exception. 1153 * 1154 * If we are providing floating-point emulation, we may have 1155 * to retrieve the real register values from the floating point 1156 * emulator. 1157 */ 1158void 1159i386_exception( 1160 int exc, 1161 mach_exception_code_t code, 1162 mach_exception_subcode_t subcode) 1163{ 1164 mach_exception_data_type_t codes[EXCEPTION_CODE_MAX]; 1165 1166 DEBUG_KPRINT_SYSCALL_MACH("i386_exception: exc=%d code=0x%llx subcode=0x%llx\n", 1167 exc, code, subcode); 1168 codes[0] = code; /* new exception interface */ 1169 codes[1] = subcode; 1170 exception_triage(exc, codes, 2); 1171 /*NOTREACHED*/ 1172} 1173 1174 1175/* Synchronize a thread's x86_kernel_state (if any) with the given 1176 * x86_saved_state_t obtained from the trap/IPI handler; called in 1177 * kernel_trap() prior to entering the debugger, and when receiving 1178 * an "MP_KDP" IPI. Called with null saved_state if an incoming IPI 1179 * was detected from the kernel while spinning with interrupts masked. 1180 */ 1181 1182void 1183sync_iss_to_iks(x86_saved_state_t *saved_state) 1184{ 1185 struct x86_kernel_state *iks; 1186 vm_offset_t kstack; 1187 boolean_t record_active_regs = FALSE; 1188 1189 /* The PAL may have a special way to sync registers */ 1190 if (saved_state && saved_state->flavor == THREAD_STATE_NONE) 1191 pal_get_kern_regs( saved_state ); 1192 1193 if ((kstack = current_thread()->kernel_stack) != 0) { 1194 x86_saved_state64_t *regs = saved_state64(saved_state); 1195 1196 iks = STACK_IKS(kstack); 1197 1198 /* Did we take the trap/interrupt in kernel mode? */ 1199 if (saved_state == NULL || /* NULL => polling in kernel */ 1200 regs == USER_REGS64(current_thread())) 1201 record_active_regs = TRUE; 1202 else { 1203 iks->k_rbx = regs->rbx; 1204 iks->k_rsp = regs->isf.rsp; 1205 iks->k_rbp = regs->rbp; 1206 iks->k_r12 = regs->r12; 1207 iks->k_r13 = regs->r13; 1208 iks->k_r14 = regs->r14; 1209 iks->k_r15 = regs->r15; 1210 iks->k_rip = regs->isf.rip; 1211 } 1212 } 1213 1214 if (record_active_regs == TRUE) { 1215 /* Show the trap handler path */ 1216 __asm__ volatile("movq %%rbx, %0" : "=m" (iks->k_rbx)); 1217 __asm__ volatile("movq %%rsp, %0" : "=m" (iks->k_rsp)); 1218 __asm__ volatile("movq %%rbp, %0" : "=m" (iks->k_rbp)); 1219 __asm__ volatile("movq %%r12, %0" : "=m" (iks->k_r12)); 1220 __asm__ volatile("movq %%r13, %0" : "=m" (iks->k_r13)); 1221 __asm__ volatile("movq %%r14, %0" : "=m" (iks->k_r14)); 1222 __asm__ volatile("movq %%r15, %0" : "=m" (iks->k_r15)); 1223 /* "Current" instruction pointer */ 1224 __asm__ volatile("leaq 1f(%%rip), %%rax; mov %%rax, %0\n1:" 1225 : "=m" (iks->k_rip) 1226 : 1227 : "rax"); 1228 } 1229} 1230 1231/* 1232 * This is used by the NMI interrupt handler (from mp.c) to 1233 * uncondtionally sync the trap handler context to the IKS 1234 * irrespective of whether the NMI was fielded in kernel 1235 * or user space. 1236 */ 1237void 1238sync_iss_to_iks_unconditionally(__unused x86_saved_state_t *saved_state) { 1239 struct x86_kernel_state *iks; 1240 vm_offset_t kstack; 1241 1242 if ((kstack = current_thread()->kernel_stack) != 0) { 1243 iks = STACK_IKS(kstack); 1244 /* Display the trap handler path */ 1245 __asm__ volatile("movq %%rbx, %0" : "=m" (iks->k_rbx)); 1246 __asm__ volatile("movq %%rsp, %0" : "=m" (iks->k_rsp)); 1247 __asm__ volatile("movq %%rbp, %0" : "=m" (iks->k_rbp)); 1248 __asm__ volatile("movq %%r12, %0" : "=m" (iks->k_r12)); 1249 __asm__ volatile("movq %%r13, %0" : "=m" (iks->k_r13)); 1250 __asm__ volatile("movq %%r14, %0" : "=m" (iks->k_r14)); 1251 __asm__ volatile("movq %%r15, %0" : "=m" (iks->k_r15)); 1252 /* "Current" instruction pointer */ 1253 __asm__ volatile("leaq 1f(%%rip), %%rax; mov %%rax, %0\n1:" : "=m" (iks->k_rip)::"rax"); 1254 } 1255} 1256