/*
 * Copyright (C) 1995  Linus Torvalds
 * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
 * Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar
 */
#include <linux/magic.h>		/* STACK_END_MAGIC		*/
#include <linux/sched.h>		/* test_thread_flag(), ...	*/
#include <linux/kdebug.h>		/* oops_begin/end, ...		*/
#include <linux/module.h>		/* search_exception_table	*/
#include <linux/bootmem.h>		/* max_low_pfn			*/
#include <linux/kprobes.h>		/* __kprobes, ...		*/
#include <linux/mmiotrace.h>		/* kmmio_handler, ...		*/
#include <linux/perf_event.h>		/* perf_sw_event		*/

#include <asm/traps.h>			/* dotraplinkage, ...		*/
#include <asm/pgalloc.h>		/* pgd_*(), ...			*/
#include <asm/kmemcheck.h>		/* kmemcheck_*(), ...		*/

/*
 * Page fault error code bits:
 *
 *   bit 0 ==  0: no page found       1: protection fault
 *   bit 1 ==  0: read access         1: write access
 *   bit 2 ==  0: kernel-mode access  1: user-mode access
 *   bit 3 ==                         1: use of reserved bit detected
 *   bit 4 ==                         1: fault was an instruction fetch
 */
enum x86_pf_error_code {

        PF_PROT         = 1 << 0,
        PF_WRITE        = 1 << 1,
        PF_USER         = 1 << 2,
        PF_RSVD         = 1 << 3,
        PF_INSTR        = 1 << 4,
};
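/*
 * For illustration (examples, not part of the handler logic): a user-mode
 * write to a not-present page -- the common demand-fault case -- arrives
 * with error_code == (PF_USER | PF_WRITE) == 0x6, while a user-mode
 * instruction fetch from a present but NX-protected page arrives with
 * error_code == (PF_PROT | PF_USER | PF_INSTR) == 0x15.
 */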
/*
 * Returns 0 if mmiotrace is disabled, or if the fault is not
 * handled by mmiotrace:
 */
static inline int __kprobes
kmmio_fault(struct pt_regs *regs, unsigned long addr)
{
        if (unlikely(is_kmmio_active()))
                if (kmmio_handler(regs, addr) == 1)
                        return -1;
        return 0;
}

static inline int __kprobes notify_page_fault(struct pt_regs *regs)
{
        int ret = 0;

        /* kprobe_running() needs smp_processor_id() */
        if (kprobes_built_in() && !user_mode_vm(regs)) {
                preempt_disable();
                if (kprobe_running() && kprobe_fault_handler(regs, 14))
                        ret = 1;
                preempt_enable();
        }

        return ret;
}

/*
 * Prefetch quirks:
 *
 * 32-bit mode:
 *
 *   Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
 *   Check that here and ignore it.
 *
 * 64-bit mode:
 *
 *   Sometimes the CPU reports invalid exceptions on prefetch.
 *   Check that here and ignore it.
 *
 * Opcode checker based on code by Richard Brunner.
 */
static inline int
check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr,
                      unsigned char opcode, int *prefetch)
{
        unsigned char instr_hi = opcode & 0xf0;
        unsigned char instr_lo = opcode & 0x0f;

        switch (instr_hi) {
        case 0x20:
        case 0x30:
                /*
                 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
                 * In X86_64 long mode, the CPU will signal invalid
                 * opcode if some of these prefixes are present, so
                 * X86_64 will never get here anyway:
                 */
                return ((instr_lo & 7) == 0x6);
#ifdef CONFIG_X86_64
        case 0x40:
                /*
                 * In AMD64 long mode 0x40..0x4F are valid REX prefixes.
                 * We need to figure out under what instruction mode the
                 * instruction was issued. We could check the LDT for lm,
                 * but for now it's good enough to assume that long
                 * mode only uses well-known segments or kernel.
                 */
                return (!user_mode(regs)) || (regs->cs == __USER_CS);
#endif
        case 0x60:
                /* 0x64 thru 0x67 are valid prefixes in all modes. */
                return (instr_lo & 0xC) == 0x4;
        case 0xF0:
                /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
                return !instr_lo || (instr_lo>>1) == 1;
        case 0x00:
                /* Prefetch instruction is 0x0F0D or 0x0F18 */
                if (probe_kernel_address(instr, opcode))
                        return 0;

                *prefetch = (instr_lo == 0xF) &&
                        (opcode == 0x0D || opcode == 0x18);
                return 0;
        default:
                return 0;
        }
}

static int
is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
{
        unsigned char *max_instr;
        unsigned char *instr;
        int prefetch = 0;

        /*
         * If it was an exec (instruction fetch) fault on an NX page, then
         * do not ignore the fault:
         */
        if (error_code & PF_INSTR)
                return 0;

        instr = (void *)convert_ip_to_linear(current, regs);
        max_instr = instr + 15;

        if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
                return 0;

        while (instr < max_instr) {
                unsigned char opcode;

                if (probe_kernel_address(instr, opcode))
                        break;

                instr++;

                if (!check_prefetch_opcode(regs, instr, opcode, &prefetch))
                        break;
        }
        return prefetch;
}
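/*
 * Worked example (for illustration): "prefetchnta (%rax)" encodes as
 * 0F 18 00.  The scan above reads 0x0F (instr_hi 0x00, instr_lo 0xF),
 * check_prefetch_opcode() then peeks at the following byte, sees 0x18
 * and sets *prefetch.  A segment-override prefix in front (e.g. 0x2E)
 * is accepted by an earlier loop iteration and simply skipped over.
 */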
static void
force_sig_info_fault(int si_signo, int si_code, unsigned long address,
                     struct task_struct *tsk)
{
        siginfo_t info;

        info.si_signo = si_signo;
        info.si_errno = 0;
        info.si_code = si_code;
        info.si_addr = (void __user *)address;
        info.si_addr_lsb = si_code == BUS_MCEERR_AR ? PAGE_SHIFT : 0;

        force_sig_info(si_signo, &info, tsk);
}

DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);

#ifdef CONFIG_X86_32
static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
{
        unsigned index = pgd_index(address);
        pgd_t *pgd_k;
        pud_t *pud, *pud_k;
        pmd_t *pmd, *pmd_k;

        pgd += index;
        pgd_k = init_mm.pgd + index;

        if (!pgd_present(*pgd_k))
                return NULL;

        /*
         * set_pgd(pgd, *pgd_k); here would be useless on PAE
         * and redundant with the set_pmd() on non-PAE. As would
         * set_pud.
         */
        pud = pud_offset(pgd, address);
        pud_k = pud_offset(pgd_k, address);
        if (!pud_present(*pud_k))
                return NULL;

        pmd = pmd_offset(pud, address);
        pmd_k = pmd_offset(pud_k, address);
        if (!pmd_present(*pmd_k))
                return NULL;

        if (!pmd_present(*pmd))
                set_pmd(pmd, *pmd_k);
        else
                BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));

        return pmd_k;
}

void vmalloc_sync_all(void)
{
        unsigned long address;

        if (SHARED_KERNEL_PMD)
                return;

        for (address = VMALLOC_START & PMD_MASK;
             address >= TASK_SIZE && address < FIXADDR_TOP;
             address += PMD_SIZE) {

                unsigned long flags;
                struct page *page;

                spin_lock_irqsave(&pgd_lock, flags);
                list_for_each_entry(page, &pgd_list, lru) {
                        if (!vmalloc_sync_one(page_address(page), address))
                                break;
                }
                spin_unlock_irqrestore(&pgd_lock, flags);
        }
}

/*
 * 32-bit:
 *
 *   Handle a fault on the vmalloc or module mapping area
 */
static noinline __kprobes int vmalloc_fault(unsigned long address)
{
        unsigned long pgd_paddr;
        pmd_t *pmd_k;
        pte_t *pte_k;

        /* Make sure we are in vmalloc area: */
        if (!(address >= VMALLOC_START && address < VMALLOC_END))
                return -1;

        /*
         * Synchronize this task's top level page-table
         * with the 'reference' page table.
         *
         * Do _not_ use "current" here. We might be inside
         * an interrupt in the middle of a task switch..
         */
        pgd_paddr = read_cr3();
        pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
        if (!pmd_k)
                return -1;

        pte_k = pte_offset_kernel(pmd_k, address);
        if (!pte_present(*pte_k))
                return -1;

        return 0;
}
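/*
 * For illustration: after a driver vmalloc()s a buffer, the new page
 * tables are installed only in init_mm.pgd.  When a task whose pgd
 * predates the allocation first touches that buffer (possibly from an
 * interrupt), it faults, and vmalloc_fault() above copies the missing
 * pmd entry from the reference page table instead of taking any locks.
 */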
/*
 * Did it hit the DOS screen memory VA from vm86 mode?
 */
static inline void
check_v8086_mode(struct pt_regs *regs, unsigned long address,
                 struct task_struct *tsk)
{
        unsigned long bit;

        if (!v8086_mode(regs))
                return;

        bit = (address - 0xA0000) >> PAGE_SHIFT;
        if (bit < 32)
                tsk->thread.screen_bitmap |= 1 << bit;
}

static bool low_pfn(unsigned long pfn)
{
        return pfn < max_low_pfn;
}

static void dump_pagetable(unsigned long address)
{
        pgd_t *base = __va(read_cr3());
        pgd_t *pgd = &base[pgd_index(address)];
        pmd_t *pmd;
        pte_t *pte;

#ifdef CONFIG_X86_PAE
        printk("*pdpt = %016Lx ", pgd_val(*pgd));
        if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd))
                goto out;
#endif
        pmd = pmd_offset(pud_offset(pgd, address), address);
        printk(KERN_CONT "*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd));

        /*
         * We must not directly access the pte in the highpte
         * case if the page table is located in highmem.
         * And let's rather not kmap-atomic the pte, just in case
         * it's allocated already:
         */
        if (!low_pfn(pmd_pfn(*pmd)) || !pmd_present(*pmd) || pmd_large(*pmd))
                goto out;

        pte = pte_offset_kernel(pmd, address);
        printk("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte));
out:
        printk("\n");
}

#else /* CONFIG_X86_64: */

void vmalloc_sync_all(void)
{
        unsigned long address;

        for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END;
             address += PGDIR_SIZE) {

                const pgd_t *pgd_ref = pgd_offset_k(address);
                unsigned long flags;
                struct page *page;

                if (pgd_none(*pgd_ref))
                        continue;

                spin_lock_irqsave(&pgd_lock, flags);
                list_for_each_entry(page, &pgd_list, lru) {
                        pgd_t *pgd;
                        pgd = (pgd_t *)page_address(page) + pgd_index(address);
                        if (pgd_none(*pgd))
                                set_pgd(pgd, *pgd_ref);
                        else
                                BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
                }
                spin_unlock_irqrestore(&pgd_lock, flags);
        }
}

/*
 * 64-bit:
 *
 *   Handle a fault on the vmalloc area
 *
 * This assumes no large pages in there.
 */
static noinline __kprobes int vmalloc_fault(unsigned long address)
{
        pgd_t *pgd, *pgd_ref;
        pud_t *pud, *pud_ref;
        pmd_t *pmd, *pmd_ref;
        pte_t *pte, *pte_ref;

        /* Make sure we are in vmalloc area: */
        if (!(address >= VMALLOC_START && address < VMALLOC_END))
                return -1;

        /*
         * Copy kernel mappings over when needed. This can also
         * happen within a race in page table update. In the latter
         * case just flush:
         */
        pgd = pgd_offset(current->active_mm, address);
        pgd_ref = pgd_offset_k(address);
        if (pgd_none(*pgd_ref))
                return -1;

        if (pgd_none(*pgd))
                set_pgd(pgd, *pgd_ref);
        else
                BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));

        /*
         * Below here mismatches are bugs because these lower tables
         * are shared:
         */

        pud = pud_offset(pgd, address);
        pud_ref = pud_offset(pgd_ref, address);
        if (pud_none(*pud_ref))
                return -1;

        if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
                BUG();

        pmd = pmd_offset(pud, address);
        pmd_ref = pmd_offset(pud_ref, address);
        if (pmd_none(*pmd_ref))
                return -1;

        if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
                BUG();

        pte_ref = pte_offset_kernel(pmd_ref, address);
        if (!pte_present(*pte_ref))
                return -1;

        pte = pte_offset_kernel(pmd, address);

        /*
         * Don't use pte_page here, because the mappings can point
         * outside mem_map, and the NUMA hash lookup cannot handle
         * that:
         */
        if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
                BUG();

        return 0;
}

static const char errata93_warning[] =
KERN_ERR
"******* Your BIOS seems to not contain a fix for K8 errata #93\n"
"******* Working around it, but it may cause SEGVs or burn power.\n"
"******* Please consider a BIOS update.\n"
"******* Disabling USB legacy in the BIOS may also help.\n";

/*
 * No vm86 mode in 64-bit mode:
 */
static inline void
check_v8086_mode(struct pt_regs *regs, unsigned long address,
                 struct task_struct *tsk)
{
}

static int bad_address(void *p)
{
        unsigned long dummy;

        return probe_kernel_address((unsigned long *)p, dummy);
}

static void dump_pagetable(unsigned long address)
{
        pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK);
        pgd_t *pgd = base + pgd_index(address);
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;

        if (bad_address(pgd))
                goto bad;

        printk("PGD %lx ", pgd_val(*pgd));

        if (!pgd_present(*pgd))
                goto out;

        pud = pud_offset(pgd, address);
        if (bad_address(pud))
                goto bad;

        printk("PUD %lx ", pud_val(*pud));
        if (!pud_present(*pud) || pud_large(*pud))
                goto out;

        pmd = pmd_offset(pud, address);
        if (bad_address(pmd))
                goto bad;

        printk("PMD %lx ", pmd_val(*pmd));
        if (!pmd_present(*pmd) || pmd_large(*pmd))
                goto out;

        pte = pte_offset_kernel(pmd, address);
        if (bad_address(pte))
                goto bad;

        printk("PTE %lx", pte_val(*pte));
out:
        printk("\n");
        return;
bad:
        printk("BAD\n");
}

#endif /* CONFIG_X86_64 */

static int is_errata93(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
        if (address != regs->ip)
                return 0;

        if ((address >> 32) != 0)
                return 0;

        address |= 0xffffffffUL << 32;
        if ((address >= (u64)_stext && address <= (u64)_etext) ||
            (address >= MODULES_VADDR && address <= MODULES_END)) {
                printk_once(errata93_warning);
                regs->ip = address;
                return 1;
        }
#endif
        return 0;
}
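/*
 * For illustration: with the kernel text mapped at 0xffffffff80000000+,
 * an affected CPU can report a fault address/RIP of, say, 0x80212345
 * instead of 0xffffffff80212345.  is_errata93() detects the truncated
 * RIP, restores the upper 32 bits and resumes execution.
 */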
static int is_errata100(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
        if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && (address >> 32))
                return 1;
#endif
        return 0;
}

static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_F00F_BUG
        unsigned long nr;

        /*
         * Pentium F0 0F C7 C8 bug workaround:
         */
        if (boot_cpu_data.f00f_bug) {
                nr = (address - idt_descr.address) >> 3;

                if (nr == 6) {
                        do_invalid_op(regs, 0);
                        return 1;
                }
        }
#endif
        return 0;
}
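/*
 * Background, for illustration: on F00F-affected Pentiums the IDT is
 * remapped read-only, so the lockup turns into a page fault inside the
 * IDT.  An offset of 6 descriptors (nr == 6) corresponds to the
 * invalid-opcode vector -- the exception the instruction should have
 * raised in the first place -- hence the call to do_invalid_op().
 */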
static const char nx_warning[] = KERN_CRIT
"kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n";

static void
show_fault_oops(struct pt_regs *regs, unsigned long error_code,
                unsigned long address)
{
        if (!oops_may_print())
                return;

        if (error_code & PF_INSTR) {
                unsigned int level;

                pte_t *pte = lookup_address(address, &level);

                if (pte && pte_present(*pte) && !pte_exec(*pte))
                        printk(nx_warning, current_uid());
        }

        printk(KERN_ALERT "BUG: unable to handle kernel ");
        if (address < PAGE_SIZE)
                printk(KERN_CONT "NULL pointer dereference");
        else
                printk(KERN_CONT "paging request");

        printk(KERN_CONT " at %p\n", (void *) address);
        printk(KERN_ALERT "IP:");
        printk_address(regs->ip, 1);

        dump_pagetable(address);
}

static noinline void
pgtable_bad(struct pt_regs *regs, unsigned long error_code,
            unsigned long address)
{
        struct task_struct *tsk;
        unsigned long flags;
        int sig;

        flags = oops_begin();
        tsk = current;
        sig = SIGKILL;

        printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
               tsk->comm, address);
        dump_pagetable(address);

        tsk->thread.cr2 = address;
        tsk->thread.trap_no = 14;
        tsk->thread.error_code = error_code;

        if (__die("Bad pagetable", regs, error_code))
                sig = 0;

        oops_end(flags, regs, sig);
}

static noinline void
no_context(struct pt_regs *regs, unsigned long error_code,
           unsigned long address)
{
        struct task_struct *tsk = current;
        unsigned long *stackend;
        unsigned long flags;
        int sig;

        /* Are we prepared to handle this kernel fault? */
        if (fixup_exception(regs))
                return;

        /*
         * 32-bit:
         *
         *   Valid to do another page fault here, because if this fault
         *   had been triggered by is_prefetch fixup_exception would have
         *   handled it.
         *
         * 64-bit:
         *
         *   Hall of shame of CPU/BIOS bugs.
         */
        if (is_prefetch(regs, error_code, address))
                return;

        if (is_errata93(regs, address))
                return;

        /*
         * Oops. The kernel tried to access some bad page. We'll have to
         * terminate things with extreme prejudice:
         */
        flags = oops_begin();

        show_fault_oops(regs, error_code, address);

        stackend = end_of_stack(tsk);
        if (tsk != &init_task && *stackend != STACK_END_MAGIC)
                printk(KERN_ALERT "Thread overran stack, or stack corrupted\n");

        tsk->thread.cr2 = address;
        tsk->thread.trap_no = 14;
        tsk->thread.error_code = error_code;

        sig = SIGKILL;
        if (__die("Oops", regs, error_code))
                sig = 0;

        /* Executive summary in case the body of the oops scrolled away */
        printk(KERN_EMERG "CR2: %016lx\n", address);

        oops_end(flags, regs, sig);
}

/*
 * Print out info about fatal segfaults, if the show_unhandled_signals
 * sysctl is set:
 */
static inline void
show_signal_msg(struct pt_regs *regs, unsigned long error_code,
                unsigned long address, struct task_struct *tsk)
{
        if (!unhandled_signal(tsk, SIGSEGV))
                return;

        if (!printk_ratelimit())
                return;

        printk("%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
                task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
                tsk->comm, task_pid_nr(tsk), address,
                (void *)regs->ip, (void *)regs->sp, error_code);

        print_vma_addr(KERN_CONT " in ", regs->ip);

        printk(KERN_CONT "\n");
}
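/*
 * A typical line produced by the above (values made up for illustration):
 *
 *   a.out[1234]: segfault at 10 ip 00000000004004f6 sp 00007fffb3dc5d40
 *   error 6 in a.out[400000+1000]
 *
 * "error 6" is the raw error_code: PF_USER | PF_WRITE, i.e. a user-mode
 * write to a not-present page.
 */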
static void
__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
                       unsigned long address, int si_code)
{
        struct task_struct *tsk = current;

        /* User mode accesses just cause a SIGSEGV */
        if (error_code & PF_USER) {
                /*
                 * It's possible to have interrupts off here:
                 */
                local_irq_enable();

                /*
                 * Valid to do another page fault here because this one came
                 * from user space:
                 */
                if (is_prefetch(regs, error_code, address))
                        return;

                if (is_errata100(regs, address))
                        return;

                if (unlikely(show_unhandled_signals))
                        show_signal_msg(regs, error_code, address, tsk);

                /* Kernel addresses are always protection faults: */
                tsk->thread.cr2 = address;
                tsk->thread.error_code = error_code | (address >= TASK_SIZE);
                tsk->thread.trap_no = 14;

                force_sig_info_fault(SIGSEGV, si_code, address, tsk);

                return;
        }

        if (is_f00f_bug(regs, address))
                return;

        no_context(regs, error_code, address);
}

static noinline void
bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
                     unsigned long address)
{
        __bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR);
}

static void
__bad_area(struct pt_regs *regs, unsigned long error_code,
           unsigned long address, int si_code)
{
        struct mm_struct *mm = current->mm;

        /*
         * Something tried to access memory that isn't in our memory map..
         * Fix it, but check if it's kernel or user first..
         */
        up_read(&mm->mmap_sem);

        __bad_area_nosemaphore(regs, error_code, address, si_code);
}

static noinline void
bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address)
{
        __bad_area(regs, error_code, address, SEGV_MAPERR);
}

static noinline void
bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
                      unsigned long address)
{
        __bad_area(regs, error_code, address, SEGV_ACCERR);
}

/* TODO: fixup for "mm-invoke-oom-killer-from-page-fault.patch" */
static void
out_of_memory(struct pt_regs *regs, unsigned long error_code,
              unsigned long address)
{
        /*
         * We ran out of memory, call the OOM killer, and return to userspace
         * (which will retry the fault, or kill us if we got oom-killed):
         */
        up_read(&current->mm->mmap_sem);

        pagefault_out_of_memory();
}

static void
do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
          unsigned int fault)
{
        struct task_struct *tsk = current;
        struct mm_struct *mm = tsk->mm;
        int code = BUS_ADRERR;

        up_read(&mm->mmap_sem);

        /* Kernel mode? Handle exceptions or die: */
        if (!(error_code & PF_USER)) {
                no_context(regs, error_code, address);
                return;
        }

        /* User-space => ok to do another page fault: */
        if (is_prefetch(regs, error_code, address))
                return;

        tsk->thread.cr2 = address;
        tsk->thread.error_code = error_code;
        tsk->thread.trap_no = 14;

#ifdef CONFIG_MEMORY_FAILURE
        if (fault & VM_FAULT_HWPOISON) {
                printk(KERN_ERR
        "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
                        tsk->comm, tsk->pid, address);
                code = BUS_MCEERR_AR;
        }
#endif
        force_sig_info_fault(SIGBUS, code, address, tsk);
}

static noinline void
mm_fault_error(struct pt_regs *regs, unsigned long error_code,
               unsigned long address, unsigned int fault)
{
        if (fault & VM_FAULT_OOM) {
                out_of_memory(regs, error_code, address);
        } else {
                if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON))
                        do_sigbus(regs, error_code, address, fault);
                else
                        BUG();
        }
}
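/*
 * For illustration: with memory-failure handling enabled, a task that
 * touches a hardware-poisoned page gets VM_FAULT_HWPOISON back from
 * handle_mm_fault() and is sent SIGBUS with si_code BUS_MCEERR_AR;
 * force_sig_info_fault() then sets si_addr_lsb to PAGE_SHIFT to tell
 * the signal handler that the whole page around si_addr is gone.
 */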
static int spurious_fault_check(unsigned long error_code, pte_t *pte)
{
        if ((error_code & PF_WRITE) && !pte_write(*pte))
                return 0;

        if ((error_code & PF_INSTR) && !pte_exec(*pte))
                return 0;

        return 1;
}

/*
 * Handle a spurious fault caused by a stale TLB entry.
 *
 * This allows us to lazily refresh the TLB when increasing the
 * permissions of a kernel page (RO -> RW or NX -> X). Doing it
 * eagerly is very expensive since that implies doing a full
 * cross-processor TLB flush, even if no stale TLB entries exist
 * on other processors.
 *
 * There are no security implications to leaving a stale TLB when
 * increasing the permissions on a page.
 */
static noinline __kprobes int
spurious_fault(unsigned long error_code, unsigned long address)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;
        int ret;

        /* Reserved-bit violation or user access to kernel space? */
        if (error_code & (PF_USER | PF_RSVD))
                return 0;

        pgd = init_mm.pgd + pgd_index(address);
        if (!pgd_present(*pgd))
                return 0;

        pud = pud_offset(pgd, address);
        if (!pud_present(*pud))
                return 0;

        if (pud_large(*pud))
                return spurious_fault_check(error_code, (pte_t *) pud);

        pmd = pmd_offset(pud, address);
        if (!pmd_present(*pmd))
                return 0;

        if (pmd_large(*pmd))
                return spurious_fault_check(error_code, (pte_t *) pmd);

        pte = pte_offset_kernel(pmd, address);
        if (!pte_present(*pte))
                return 0;

        ret = spurious_fault_check(error_code, pte);
        if (!ret)
                return 0;

        /*
         * Make sure we have permissions in PMD.
         * If not, then there's a bug in the page tables:
         */
        ret = spurious_fault_check(error_code, (pte_t *) pmd);
        WARN_ONCE(!ret, "PMD has incorrect permission bits\n");

        return ret;
}
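/*
 * Example of the scenario above, for illustration: CPU A makes a kernel
 * page writable via set_memory_rw() but does not flush other CPUs' TLBs.
 * CPU B, still holding the stale read-only entry, writes to the page and
 * faults with PF_WRITE set.  spurious_fault() sees that the current pte
 * already permits the write, so the fault needs no handling beyond the
 * implicit TLB refill.
 */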
int show_unhandled_signals = 1;

static inline int
access_error(unsigned long error_code, int write, struct vm_area_struct *vma)
{
        if (write) {
                /* write, present and write, not present: */
                if (unlikely(!(vma->vm_flags & VM_WRITE)))
                        return 1;
                return 0;
        }

        /* read, present: */
        if (unlikely(error_code & PF_PROT))
                return 1;

        /* read, not present: */
        if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))))
                return 1;

        return 0;
}
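/*
 * For illustration: a store into a PROT_READ mapping arrives with
 * 'write' set while the vma lacks VM_WRITE, so access_error() returns 1
 * and the task gets SIGSEGV with si_code SEGV_ACCERR.  A load from a
 * PROT_NONE mapping shows up as "read, not present" and is caught by
 * the VM_READ | VM_EXEC | VM_WRITE check instead.
 */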
static int fault_in_kernel_space(unsigned long address)
{
        return address >= TASK_SIZE_MAX;
}

/*
 * This routine handles page faults. It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 */
dotraplinkage void __kprobes
do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
        struct vm_area_struct *vma;
        struct task_struct *tsk;
        unsigned long address;
        struct mm_struct *mm;
        int write;
        int fault;

        tsk = current;
        mm = tsk->mm;

        /* Get the faulting address: */
        address = read_cr2();

        /*
         * Detect and handle instructions that would cause a page fault for
         * both a tracked kernel page and a userspace page.
         */
        if (kmemcheck_active(regs))
                kmemcheck_hide(regs);
        prefetchw(&mm->mmap_sem);

        if (unlikely(kmmio_fault(regs, address)))
                return;

        /*
         * We fault-in kernel-space virtual memory on-demand. The
         * 'reference' page table is init_mm.pgd.
         *
         * NOTE! We MUST NOT take any locks for this case. We may
         * be in an interrupt or a critical region, and should
         * only copy the information from the master page table,
         * nothing more.
         *
         * This verifies that the fault happens in kernel space
         * ((error_code & PF_USER) == 0) and that the fault was not
         * a protection error ((error_code & (PF_RSVD | PF_PROT)) == 0).
         */
        if (unlikely(fault_in_kernel_space(address))) {
                if (!(error_code & (PF_RSVD | PF_USER | PF_PROT))) {
                        if (vmalloc_fault(address) >= 0)
                                return;

                        if (kmemcheck_fault(regs, address, error_code))
                                return;
                }

                /* Can handle a stale RO->RW TLB: */
                if (spurious_fault(error_code, address))
                        return;

                /* kprobes don't want to hook the spurious faults: */
                if (notify_page_fault(regs))
                        return;
                /*
                 * Don't take the mm semaphore here. If we fixup a prefetch
                 * fault we could otherwise deadlock:
                 */
                bad_area_nosemaphore(regs, error_code, address);

                return;
        }

        /* kprobes don't want to hook the spurious faults: */
        if (unlikely(notify_page_fault(regs)))
                return;
        /*
         * It's safe to allow irq's after cr2 has been saved and the
         * vmalloc fault has been handled.
         *
         * User-mode registers count as a user access even for any
         * potential system fault or CPU buglet:
         */
        if (user_mode_vm(regs)) {
                local_irq_enable();
                error_code |= PF_USER;
        } else {
                if (regs->flags & X86_EFLAGS_IF)
                        local_irq_enable();
        }

        if (unlikely(error_code & PF_RSVD))
                pgtable_bad(regs, error_code, address);

        perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);

        /*
         * If we're in an interrupt, have no user context or are running
         * in an atomic region then we must not take the fault:
         */
        if (unlikely(in_atomic() || !mm)) {
                bad_area_nosemaphore(regs, error_code, address);
                return;
        }

        /*
         * When running in the kernel we expect faults to occur only to
         * addresses in user space.  All other faults represent errors in
         * the kernel and should generate an OOPS.  Unfortunately, in the
         * case of an erroneous fault occurring in a code path which already
         * holds mmap_sem we will deadlock attempting to validate the fault
         * against the address space.  Luckily the kernel only validly
         * references user space from well defined areas of code, which are
         * listed in the exceptions table.
         *
         * As the vast majority of faults will be valid we will only perform
         * the source reference check when there is a possibility of a
         * deadlock. Attempt to lock the address space, if we cannot we then
         * validate the source. If this is invalid we can skip the address
         * space check, thus avoiding the deadlock:
         */
        if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
                if ((error_code & PF_USER) == 0 &&
                    !search_exception_tables(regs->ip)) {
                        bad_area_nosemaphore(regs, error_code, address);
                        return;
                }
                down_read(&mm->mmap_sem);
        } else {
                /*
                 * The above down_read_trylock() might have succeeded in
                 * which case we'll have missed the might_sleep() from
                 * down_read():
                 */
                might_sleep();
        }

        vma = find_vma(mm, address);
        if (unlikely(!vma)) {
                bad_area(regs, error_code, address);
                return;
        }
        if (likely(vma->vm_start <= address))
                goto good_area;
        if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
                bad_area(regs, error_code, address);
                return;
        }
        if (error_code & PF_USER) {
                /*
                 * Accessing the stack below %sp is always a bug.
                 * The large cushion allows instructions like enter
                 * and pusha to work. ("enter $65535, $31" pushes
                 * 32 pointers and then decrements %sp by 65535.)
                 */
                if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
                        bad_area(regs, error_code, address);
                        return;
                }
        }
        if (unlikely(expand_stack(vma, address))) {
                bad_area(regs, error_code, address);
                return;
        }

        /*
         * Ok, we have a good vm_area for this memory access, so
         * we can handle it..
         */
good_area:
        write = error_code & PF_WRITE;

        if (unlikely(access_error(error_code, write, vma))) {
                bad_area_access_error(regs, error_code, address);
                return;
        }

        /*
         * If for any reason at all we couldn't handle the fault,
         * make sure we exit gracefully rather than endlessly redo
         * the fault:
         */
        fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0);

        if (unlikely(fault & VM_FAULT_ERROR)) {
                mm_fault_error(regs, error_code, address, fault);
                return;
        }

        if (fault & VM_FAULT_MAJOR) {
                tsk->maj_flt++;
                perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
                              regs, address);
        } else {
                tsk->min_flt++;
                perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
                              regs, address);
        }

        check_v8086_mode(regs, address, tsk);

        up_read(&mm->mmap_sem);
}
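/*
 * End-to-end example of the common fast path, for illustration: the
 * first store into a freshly mmap()ed anonymous page traps here.
 * do_page_fault() reads the address from CR2, sees a user-space address,
 * takes mmap_sem for reading, finds the vma, passes access_error(), and
 * handle_mm_fault() allocates a zeroed page and maps it.  The task
 * resumes at the faulting instruction, with only tsk->min_flt and the
 * perf counter recording that anything happened.
 */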