/*
 * linux/arch/i386/mm/fault.c
 *
 * Copyright (C) 1995 Linus Torvalds
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/tty.h>
#include <linux/vt_kern.h>		/* For unblank_screen() */
#include <linux/highmem.h>
#include <linux/bootmem.h>		/* for max_low_pfn */
#include <linux/vmalloc.h>
#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>

#include <asm/system.h>
#include <asm/desc.h>
#include <asm/segment.h>

extern void die(const char *,struct pt_regs *,long);

static ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);

int register_page_fault_notifier(struct notifier_block *nb)
{
        vmalloc_sync_all();
        return atomic_notifier_chain_register(&notify_page_fault_chain, nb);
}
EXPORT_SYMBOL_GPL(register_page_fault_notifier);

int unregister_page_fault_notifier(struct notifier_block *nb)
{
        return atomic_notifier_chain_unregister(&notify_page_fault_chain, nb);
}
EXPORT_SYMBOL_GPL(unregister_page_fault_notifier);

static inline int notify_page_fault(struct pt_regs *regs, long err)
{
        struct die_args args = {
                .regs = regs,
                .str = "page fault",
                .err = err,
                .trapnr = 14,
                .signr = SIGSEGV
        };
        return atomic_notifier_call_chain(&notify_page_fault_chain,
                                          DIE_PAGE_FAULT, &args);
}
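
/*
 * A minimal usage sketch for the notifier API above (hypothetical client;
 * the handler name and its fixup policy are illustrative, not part of this
 * file).  Returning NOTIFY_STOP tells do_page_fault() the fault has been
 * handled; NOTIFY_DONE lets normal fault processing continue:
 *
 *	static int my_fault_handler(struct notifier_block *self,
 *				    unsigned long val, void *data)
 *	{
 *		struct die_args *args = data;
 *
 *		if (val == DIE_PAGE_FAULT && my_own_fixup(args->regs))
 *			return NOTIFY_STOP;
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block my_nb = {
 *		.notifier_call = my_fault_handler,
 *	};
 *	register_page_fault_notifier(&my_nb);
 */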

/*
 * Return EIP plus the CS segment base. The segment limit is also
 * adjusted, clamped to the kernel/user address space (whichever is
 * appropriate), and returned in *eip_limit.
 *
 * The segment is checked, because it might have been changed by another
 * task between the original faulting instruction and here.
 *
 * If CS is no longer a valid code segment, or if EIP is beyond the
 * limit, or if it is a kernel address when CS is not a kernel segment,
 * then the returned value will be greater than *eip_limit.
 *
 * This is slow, but is very rarely executed.
 */
static inline unsigned long get_segment_eip(struct pt_regs *regs,
                                            unsigned long *eip_limit)
{
        unsigned long eip = regs->eip;
        unsigned seg = regs->xcs & 0xffff;
        u32 seg_ar, seg_limit, base, *desc;

        /* Unlikely, but must come before segment checks. */
        if (unlikely(regs->eflags & VM_MASK)) {
                base = seg << 4;
                *eip_limit = base + 0xffff;
                return base + (eip & 0xffff);
        }

        /* The standard kernel/user address space limit. */
        *eip_limit = user_mode(regs) ? USER_DS.seg : KERNEL_DS.seg;

        /* By far the most common cases. */
        if (likely(SEGMENT_IS_FLAT_CODE(seg)))
                return eip;

        /* Check the segment exists, is within the current LDT/GDT size,
           that kernel/user (ring 0..3) has the appropriate privilege,
           that it's a code segment, and get the limit. */
        __asm__ ("larl %3,%0; lsll %3,%1"
                 : "=&r" (seg_ar), "=r" (seg_limit) : "0" (0), "rm" (seg));
        if ((~seg_ar & 0x9800) || eip > seg_limit) {
                *eip_limit = 0;
                return 1;	 /* So that returned eip > *eip_limit. */
        }

        /* Get the GDT/LDT descriptor base.
           When you look for races in this code remember that
           LDT and other horrors are only used in user space. */
        if (seg & (1<<2)) {
                /* Must lock the LDT while reading it. */
                down(&current->mm->context.sem);
                desc = current->mm->context.ldt;
                desc = (void *)desc + (seg & ~7);
        } else {
                /* Must disable preemption while reading the GDT. */
                desc = (u32 *)get_cpu_gdt_table(get_cpu());
                desc = (void *)desc + (seg & ~7);
        }

        /* Decode the code segment base from the descriptor */
        base = get_desc_base((unsigned long *)desc);

        if (seg & (1<<2)) {
                up(&current->mm->context.sem);
        } else
                put_cpu();

        /* Adjust EIP and segment limit, and clamp at the kernel limit.
           It's legitimate for segments to wrap at 0xffffffff. */
        seg_limit += base;
        if (seg_limit < *eip_limit && seg_limit >= base)
                *eip_limit = seg_limit;
        return eip + base;
}

/*
 * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
 * Check that here and ignore it.
 */
static int __is_prefetch(struct pt_regs *regs, unsigned long addr)
{
        unsigned long limit;
        unsigned char *instr = (unsigned char *)get_segment_eip (regs, &limit);
        int scan_more = 1;
        int prefetch = 0;
        int i;

        for (i = 0; scan_more && i < 15; i++) {
                unsigned char opcode;
                unsigned char instr_hi;
                unsigned char instr_lo;

                if (instr > (unsigned char *)limit)
                        break;
                if (probe_kernel_address(instr, opcode))
                        break;

                instr_hi = opcode & 0xf0;
                instr_lo = opcode & 0x0f;
                instr++;

                switch (instr_hi) {
                case 0x20:
                case 0x30:
                        /* Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. */
                        scan_more = ((instr_lo & 7) == 0x6);
                        break;

                case 0x60:
                        /* 0x64 thru 0x67 are valid prefixes in all modes. */
                        scan_more = (instr_lo & 0xC) == 0x4;
                        break;
                case 0xF0:
                        /* 0xF0, 0xF2, and 0xF3 are valid prefixes */
                        scan_more = !instr_lo || (instr_lo>>1) == 1;
                        break;
                case 0x00:
                        /* Prefetch instruction is 0x0F0D or 0x0F18 */
                        scan_more = 0;
                        if (instr > (unsigned char *)limit)
                                break;
                        if (probe_kernel_address(instr, opcode))
                                break;
                        prefetch = (instr_lo == 0xF) &&
                                (opcode == 0x0D || opcode == 0x18);
                        break;
                default:
                        scan_more = 0;
                        break;
                }
        }
        return prefetch;
}

static inline int is_prefetch(struct pt_regs *regs, unsigned long addr,
                              unsigned long error_code)
{
        if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
                     boot_cpu_data.x86 >= 6)) {
                /* Catch an obscure case of prefetch inside an NX page. */
                if (nx_enabled && (error_code & 16))
                        return 0;
                return __is_prefetch(regs, addr);
        }
        return 0;
}

static noinline void force_sig_info_fault(int si_signo, int si_code,
                                          unsigned long address, struct task_struct *tsk)
{
        siginfo_t info;

        info.si_signo = si_signo;
        info.si_errno = 0;
        info.si_code = si_code;
        info.si_addr = (void __user *)address;
        force_sig_info(si_signo, &info, tsk);
}

fastcall void do_invalid_op(struct pt_regs *, unsigned long);
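
/*
 * Explanatory note (no new behaviour): vmalloc()/ioremap() install new
 * kernel mappings only in the reference page table, init_mm.pgd.  A
 * per-process pgd may therefore lack the entry covering a freshly mapped
 * vmalloc address until the first fault on it.  The helper below walks
 * both page tables side by side and copies the one missing pmd-level
 * entry from init_mm; on non-PAE the pud/pmd levels are folded into the
 * pgd, so this effectively copies a top-level entry.
 */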
static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
{
        unsigned index = pgd_index(address);
        pgd_t *pgd_k;
        pud_t *pud, *pud_k;
        pmd_t *pmd, *pmd_k;

        pgd += index;
        pgd_k = init_mm.pgd + index;

        if (!pgd_present(*pgd_k))
                return NULL;

        /*
         * set_pgd(pgd, *pgd_k); here would be useless on PAE
         * and redundant with the set_pmd() on non-PAE. As would
         * set_pud.
         */

        pud = pud_offset(pgd, address);
        pud_k = pud_offset(pgd_k, address);
        if (!pud_present(*pud_k))
                return NULL;

        pmd = pmd_offset(pud, address);
        pmd_k = pmd_offset(pud_k, address);
        if (!pmd_present(*pmd_k))
                return NULL;
        if (!pmd_present(*pmd))
                set_pmd(pmd, *pmd_k);
        else
                BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
        return pmd_k;
}

/*
 * Handle a fault on the vmalloc or module mapping area
 *
 * This assumes no large pages in there.
 */
static inline int vmalloc_fault(unsigned long address)
{
        unsigned long pgd_paddr;
        pmd_t *pmd_k;
        pte_t *pte_k;
        /*
         * Synchronize this task's top level page-table
         * with the 'reference' page table.
         *
         * Do _not_ use "current" here. We might be inside
         * an interrupt in the middle of a task switch..
         */
        pgd_paddr = read_cr3();
        pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
        if (!pmd_k)
                return -1;
        pte_k = pte_offset_kernel(pmd_k, address);
        if (!pte_present(*pte_k))
                return -1;
        return 0;
}

/*
 * This routine handles page faults. It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 *
 * error_code:
 *	bit 0 == 0 means no page found, 1 means protection fault
 *	bit 1 == 0 means read, 1 means write
 *	bit 2 == 0 means kernel, 1 means user-mode
 *	bit 3 == 1 means use of reserved bit detected
 *	bit 4 == 1 means fault was an instruction fetch
 */
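/*
 * Worked example of the encoding above (illustrative only): a user-space
 * store to a present but read-only page arrives with error_code == 7
 * (0b00111: protection fault + write + user-mode), while a user-space
 * read of an unmapped address arrives with error_code == 4 (0b00100:
 * page not present + read + user-mode).
 */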
fastcall void __kprobes do_page_fault(struct pt_regs *regs,
                                      unsigned long error_code)
{
        struct task_struct *tsk;
        struct mm_struct *mm;
        struct vm_area_struct * vma;
        unsigned long address;
        int write, si_code;

        /* get the address */
        address = read_cr2();

        tsk = current;

        si_code = SEGV_MAPERR;

        /*
         * We fault-in kernel-space virtual memory on-demand. The
         * 'reference' page table is init_mm.pgd.
         *
         * NOTE! We MUST NOT take any locks for this case. We may
         * be in an interrupt or a critical region, and should
         * only copy the information from the master page table,
         * nothing more.
         *
         * This verifies that the fault happens in kernel space
         * (error_code & 4) == 0, and that the fault was not a
         * protection error (error_code & 9) == 0.
         */
        if (unlikely(address >= TASK_SIZE)) {
                if (!(error_code & 0x0000000d) && vmalloc_fault(address) >= 0)
                        return;
                if (notify_page_fault(regs, error_code) == NOTIFY_STOP)
                        return;
                /*
                 * Don't take the mm semaphore here. If we fixup a prefetch
                 * fault we could otherwise deadlock.
                 */
                goto bad_area_nosemaphore;
        }

        if (notify_page_fault(regs, error_code) == NOTIFY_STOP)
                return;

        /* It's safe to allow irq's after cr2 has been saved and the vmalloc
           fault has been handled. */
        if (regs->eflags & (X86_EFLAGS_IF|VM_MASK))
                local_irq_enable();

        mm = tsk->mm;

        /*
         * If we're in an interrupt, have no user context or are running in an
         * atomic region then we must not take the fault..
         */
        if (in_atomic() || !mm)
                goto bad_area_nosemaphore;

        /* When running in the kernel we expect faults to occur only to
         * addresses in user space. All other faults represent errors in the
         * kernel and should generate an OOPS. Unfortunately, in the case of an
         * erroneous fault occurring in a code path which already holds mmap_sem
         * we will deadlock attempting to validate the fault against the
         * address space. Luckily the kernel only validly references user
         * space from well defined areas of code, which are listed in the
         * exceptions table.
         *
         * As the vast majority of faults will be valid we will only perform
         * the source reference check when there is a possibility of a deadlock.
         * Attempt to lock the address space, if we cannot we then validate the
         * source. If this is invalid we can skip the address space check,
         * thus avoiding the deadlock.
         */
        if (!down_read_trylock(&mm->mmap_sem)) {
                if ((error_code & 4) == 0 &&
                    !search_exception_tables(regs->eip))
                        goto bad_area_nosemaphore;
                down_read(&mm->mmap_sem);
        }

        vma = find_vma(mm, address);
        if (!vma)
                goto bad_area;
        if (vma->vm_start <= address)
                goto good_area;
        if (!(vma->vm_flags & VM_GROWSDOWN))
                goto bad_area;
        if (error_code & 4) {
                /*
                 * Accessing the stack below %esp is always a bug.
                 * The large cushion allows instructions like enter
                 * and pusha to work. ("enter $65535,$31" pushes
                 * 32 pointers and then decrements %esp by 65535.)
                 */
                if (address + 65536 + 32 * sizeof(unsigned long) < regs->esp)
                        goto bad_area;
        }
        if (expand_stack(vma, address))
                goto bad_area;
/*
 * Ok, we have a good vm_area for this memory access, so
 * we can handle it..
 */
good_area:
        si_code = SEGV_ACCERR;
        write = 0;
        switch (error_code & 3) {
                default:	/* 3: write, present */
                                /* fall through */
                case 2:		/* write, not present */
                        if (!(vma->vm_flags & VM_WRITE))
                                goto bad_area;
                        write++;
                        break;
                case 1:		/* read, present */
                        goto bad_area;
                case 0:		/* read, not present */
                        if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
                                goto bad_area;
        }
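
        /*
         * Explanatory note on the handle_mm_fault() return values tested
         * below (standard semantics for this kernel, summarized here for
         * readability): VM_FAULT_MINOR means the fault was resolved without
         * blocking on I/O (e.g. the page was already in the page cache),
         * VM_FAULT_MAJOR means the page had to be read in from disk,
         * VM_FAULT_SIGBUS signals a bad access (e.g. beyond the end of a
         * mapped file), and VM_FAULT_OOM means allocation failed.
         */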
 survive:
        /*
         * If for any reason at all we couldn't handle the fault,
         * make sure we exit gracefully rather than endlessly redo
         * the fault.
         */
        switch (handle_mm_fault(mm, vma, address, write)) {
                case VM_FAULT_MINOR:
                        tsk->min_flt++;
                        break;
                case VM_FAULT_MAJOR:
                        tsk->maj_flt++;
                        break;
                case VM_FAULT_SIGBUS:
                        goto do_sigbus;
                case VM_FAULT_OOM:
                        goto out_of_memory;
                default:
                        BUG();
        }

        /*
         * Did it hit the DOS screen memory VA from vm86 mode?
         */
        if (regs->eflags & VM_MASK) {
                unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
                if (bit < 32)
                        tsk->thread.screen_bitmap |= 1 << bit;
        }
        up_read(&mm->mmap_sem);
        return;

/*
 * Something tried to access memory that isn't in our memory map..
 * Fix it, but check if it's kernel or user first..
 */
bad_area:
        up_read(&mm->mmap_sem);

bad_area_nosemaphore:
        /* User mode accesses just cause a SIGSEGV */
        if (error_code & 4) {
                /*
                 * It's possible to have interrupts off here.
                 */
                local_irq_enable();

                /*
                 * Valid to do another page fault here because this one came
                 * from user space.
                 */
                if (is_prefetch(regs, address, error_code))
                        return;

                tsk->thread.cr2 = address;
                /* Kernel addresses are always protection faults */
                tsk->thread.error_code = error_code | (address >= TASK_SIZE);
                tsk->thread.trap_no = 14;
                force_sig_info_fault(SIGSEGV, si_code, address, tsk);
                return;
        }
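
        /*
         * Explanatory note for the workaround below: the Pentium F00F
         * erratum ("invalid operand with locked CMPXCHG8B") can hang the
         * CPU while it delivers the invalid-opcode exception.  Affected
         * kernels map the IDT read-only at an aliased fixmap address, so
         * the erratum surfaces here as a page fault on the IDT instead of
         * a lockup.  IDT entries are 8 bytes, hence the >> 3; entry 6 is
         * the invalid-opcode vector, so the intended exception is then
         * delivered by calling do_invalid_op() directly.
         */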
#ifdef CONFIG_X86_F00F_BUG
        if (boot_cpu_data.f00f_bug) {
                unsigned long nr;

                nr = (address - idt_descr.address) >> 3;

                if (nr == 6) {
                        do_invalid_op(regs, 0);
                        return;
                }
        }
#endif

no_context:
        /* Are we prepared to handle this kernel fault? */
        if (fixup_exception(regs))
                return;

        /*
         * Valid to do another page fault here, because if this fault
         * had been triggered by is_prefetch fixup_exception would have
         * handled it.
         */
        if (is_prefetch(regs, address, error_code))
                return;

/*
 * Oops. The kernel tried to access some bad page. We'll have to
 * terminate things with extreme prejudice.
 */

        bust_spinlocks(1);

        if (oops_may_print()) {
                __typeof__(pte_val(__pte(0))) page;

#ifdef CONFIG_X86_PAE
                if (error_code & 16) {
                        pte_t *pte = lookup_address(address);

                        if (pte && pte_present(*pte) && !pte_exec_kernel(*pte))
                                printk(KERN_CRIT "kernel tried to execute "
                                        "NX-protected page - exploit attempt? "
                                        "(uid: %d)\n", current->uid);
                }
#endif
                if (address < PAGE_SIZE)
                        printk(KERN_ALERT "BUG: unable to handle kernel NULL "
                                        "pointer dereference");
                else
                        printk(KERN_ALERT "BUG: unable to handle kernel paging"
                                        " request");
                printk(" at virtual address %08lx\n",address);
                printk(KERN_ALERT " printing eip:\n");
                printk("%08lx\n", regs->eip);

                page = read_cr3();
                page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT];
#ifdef CONFIG_X86_PAE
                printk(KERN_ALERT "*pdpt = %016Lx\n", page);
                if ((page >> PAGE_SHIFT) < max_low_pfn
                    && page & _PAGE_PRESENT) {
                        page &= PAGE_MASK;
                        page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT)
                                                                 & (PTRS_PER_PMD - 1)];
                        printk(KERN_ALERT "*pde = %016Lx\n", page);
                        page &= ~_PAGE_NX;
                }
#else
                printk(KERN_ALERT "*pde = %08lx\n", page);
#endif

                /*
                 * We must not directly access the pte in the highpte
                 * case if the page table is located in highmem.
                 * And let's rather not kmap-atomic the pte, just in case
                 * it's allocated already.
                 */
                if ((page >> PAGE_SHIFT) < max_low_pfn
                    && (page & _PAGE_PRESENT)) {
                        page &= PAGE_MASK;
                        page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT)
                                                                 & (PTRS_PER_PTE - 1)];
                        printk(KERN_ALERT "*pte = %0*Lx\n", sizeof(page)*2, (u64)page);
                }
        }

        tsk->thread.cr2 = address;
        tsk->thread.trap_no = 14;
        tsk->thread.error_code = error_code;
        die("Oops", regs, error_code);
        bust_spinlocks(0);
        do_exit(SIGKILL);

/*
 * We ran out of memory, or some other thing happened to us that made
 * us unable to handle the page fault gracefully.
 */
out_of_memory:
        up_read(&mm->mmap_sem);
        if (is_init(tsk)) {
                yield();
                down_read(&mm->mmap_sem);
                goto survive;
        }
        printk("VM: killing process %s\n", tsk->comm);
        if (error_code & 4)
                do_exit(SIGKILL);
        goto no_context;

do_sigbus:
        up_read(&mm->mmap_sem);

        /* Kernel mode? Handle exceptions or die */
        if (!(error_code & 4))
                goto no_context;

        /* User space => ok to do another page fault */
        if (is_prefetch(regs, address, error_code))
                return;

        tsk->thread.cr2 = address;
        tsk->thread.error_code = error_code;
        tsk->thread.trap_no = 14;
        force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
}

void vmalloc_sync_all(void)
{
        /*
         * Note that races in the updates of insync and start aren't
         * problematic: insync can only get set bits added, and updates to
         * start are only improving performance (without affecting correctness
         * if undone).
         */
        static DECLARE_BITMAP(insync, PTRS_PER_PGD);
        static unsigned long start = TASK_SIZE;
        unsigned long address;

        if (SHARED_KERNEL_PMD)
                return;

        BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
        for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) {
                if (!test_bit(pgd_index(address), insync)) {
                        unsigned long flags;
                        struct page *page;

                        spin_lock_irqsave(&pgd_lock, flags);
                        for (page = pgd_list; page; page =
                                        (struct page *)page->index)
                                if (!vmalloc_sync_one(page_address(page),
                                                      address)) {
                                        BUG_ON(page != pgd_list);
                                        break;
                                }
                        spin_unlock_irqrestore(&pgd_lock, flags);
                        if (!page)
                                set_bit(pgd_index(address), insync);
                }
                if (address == start && test_bit(pgd_index(address), insync))
                        start = address + PGDIR_SIZE;
        }
}