/*
 *  linux/arch/i386/mm/fault.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/tty.h>
#include <linux/vt_kern.h>		/* For unblank_screen() */
#include <linux/highmem.h>
#include <linux/bootmem.h>		/* for max_low_pfn */
#include <linux/vmalloc.h>
#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>

#include <asm/system.h>
#include <asm/desc.h>
#include <asm/segment.h>

extern void die(const char *,struct pt_regs *,long);

static ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);

int register_page_fault_notifier(struct notifier_block *nb)
{
	vmalloc_sync_all();
	return atomic_notifier_chain_register(&notify_page_fault_chain, nb);
}
EXPORT_SYMBOL_GPL(register_page_fault_notifier);

int unregister_page_fault_notifier(struct notifier_block *nb)
{
	return atomic_notifier_chain_unregister(&notify_page_fault_chain, nb);
}
EXPORT_SYMBOL_GPL(unregister_page_fault_notifier);

static inline int notify_page_fault(struct pt_regs *regs, long err)
{
	struct die_args args = {
		.regs = regs,
		.str = "page fault",
		.err = err,
		.trapnr = 14,
		.signr = SIGSEGV
	};
	return atomic_notifier_call_chain(&notify_page_fault_chain,
	                                  DIE_PAGE_FAULT, &args);
}

/*
 * Return EIP plus the CS segment base.  The segment limit is also
 * adjusted, clamped to the kernel/user address space (whichever is
 * appropriate), and returned in *eip_limit.
 *
 * The segment is checked, because it might have been changed by another
 * task between the original faulting instruction and here.
 *
 * If CS is no longer a valid code segment, or if EIP is beyond the
 * limit, or if it is a kernel address when CS is not a kernel segment,
 * then the returned value will be greater than *eip_limit.
 *
 * This is slow, but is very rarely executed.
 */
static inline unsigned long get_segment_eip(struct pt_regs *regs,
					    unsigned long *eip_limit)
{
	unsigned long eip = regs->eip;
	unsigned seg = regs->xcs & 0xffff;
	u32 seg_ar, seg_limit, base, *desc;

	/* Unlikely, but must come before segment checks. */
	if (unlikely(regs->eflags & VM_MASK)) {
		base = seg << 4;
		*eip_limit = base + 0xffff;
		return base + (eip & 0xffff);
	}

	/* The standard kernel/user address space limit. */
	*eip_limit = user_mode(regs) ? USER_DS.seg : KERNEL_DS.seg;
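	/* (On i386, USER_DS.seg is PAGE_OFFSET and KERNEL_DS.seg is
	   0xFFFFFFFF, i.e. the respective address-space ceilings.) */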

	/* By far the most common cases. */
	if (likely(SEGMENT_IS_FLAT_CODE(seg)))
		return eip;

	/* Check the segment exists, is within the current LDT/GDT size,
	   that kernel/user (ring 0..3) has the appropriate privilege,
	   that it's a code segment, and get the limit. */
	__asm__ ("larl %3,%0; lsll %3,%1"
		 : "=&r" (seg_ar), "=r" (seg_limit) : "0" (0), "rm" (seg));
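	/* LAR leaves the access-rights bytes in seg_ar and LSL the limit;
	   0x9800 covers the present, non-system and code bits, so the
	   check below rejects missing, system and data segments alike. */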
	if ((~seg_ar & 0x9800) || eip > seg_limit) {
		*eip_limit = 0;
		return 1;	 /* So that returned eip > *eip_limit. */
	}

	/* Get the GDT/LDT descriptor base.
	   When you look for races in this code remember that
	   LDT and other horrors are only used in user space. */
	if (seg & (1<<2)) {
		/* Must lock the LDT while reading it. */
		down(&current->mm->context.sem);
		desc = current->mm->context.ldt;
		desc = (void *)desc + (seg & ~7);
	} else {
		/* Must disable preemption while reading the GDT. */
		desc = (u32 *)get_cpu_gdt_table(get_cpu());
		desc = (void *)desc + (seg & ~7);
	}

	/* Decode the code segment base from the descriptor */
	base = get_desc_base((unsigned long *)desc);

	if (seg & (1<<2)) {
		up(&current->mm->context.sem);
	} else
		put_cpu();

	/* Adjust EIP and segment limit, and clamp at the kernel limit.
	   It's legitimate for segments to wrap at 0xffffffff. */
	seg_limit += base;
	if (seg_limit < *eip_limit && seg_limit >= base)
		*eip_limit = seg_limit;
	return eip + base;
}

/*
 * AMD Athlon/Opteron CPUs sometimes report spurious page faults on
 * prefetch instructions, which should never fault.  Detect that case
 * here and ignore the fault.
 */
static int __is_prefetch(struct pt_regs *regs, unsigned long addr)
{
	unsigned long limit;
	unsigned char *instr = (unsigned char *)get_segment_eip(regs, &limit);
	int scan_more = 1;
	int prefetch = 0;
	int i;

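	/* x86 instructions are never longer than 15 bytes, hence the bound. */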
	for (i = 0; scan_more && i < 15; i++) {
		unsigned char opcode;
		unsigned char instr_hi;
		unsigned char instr_lo;

		if (instr > (unsigned char *)limit)
			break;
		if (probe_kernel_address(instr, opcode))
			break;

		instr_hi = opcode & 0xf0;
		instr_lo = opcode & 0x0f;
		instr++;

		switch (instr_hi) {
		case 0x20:
		case 0x30:
			/* Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. */
			scan_more = ((instr_lo & 7) == 0x6);
			break;

		case 0x60:
			/* 0x64 thru 0x67 are valid prefixes in all modes. */
			scan_more = (instr_lo & 0xC) == 0x4;
			break;
		case 0xF0:
			/* 0xF0, 0xF2, and 0xF3 are valid prefixes */
			scan_more = !instr_lo || (instr_lo>>1) == 1;
			break;
		case 0x00:
			/* Prefetch instruction is 0x0F0D or 0x0F18 */
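			/* (0F 0D is the 3DNow! PREFETCH/PREFETCHW pair,
			   0F 18 the SSE PREFETCHNTA/T0/T1/T2 hints.) */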
			scan_more = 0;
			if (instr > (unsigned char *)limit)
				break;
			if (probe_kernel_address(instr, opcode))
				break;
			prefetch = (instr_lo == 0xF) &&
				(opcode == 0x0D || opcode == 0x18);
			break;
		default:
			scan_more = 0;
			break;
		}
	}
	return prefetch;
}

static inline int is_prefetch(struct pt_regs *regs, unsigned long addr,
			      unsigned long error_code)
{
	if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
		     boot_cpu_data.x86 >= 6)) {
		/* Catch an obscure case of prefetch inside an NX page. */
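		/* (Error code bit 4 means the instruction fetch itself
		   faulted; decoding the opcode at that address could
		   wrongly dismiss a genuine NX fault, so bail out.) */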
		if (nx_enabled && (error_code & 16))
			return 0;
		return __is_prefetch(regs, addr);
	}
	return 0;
}

static noinline void force_sig_info_fault(int si_signo, int si_code,
	unsigned long address, struct task_struct *tsk)
{
	siginfo_t info;

	info.si_signo = si_signo;
	info.si_errno = 0;
	info.si_code = si_code;
	info.si_addr = (void __user *)address;
	force_sig_info(si_signo, &info, tsk);
}

fastcall void do_invalid_op(struct pt_regs *, unsigned long);

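/*
 * Copy the kernel pmd entry covering 'address' from the reference
 * page tables (init_mm) into the given pgd.  Returns the reference
 * pmd entry, or NULL if it is not populated yet.
 */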
static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
{
	unsigned index = pgd_index(address);
	pgd_t *pgd_k;
	pud_t *pud, *pud_k;
	pmd_t *pmd, *pmd_k;

	pgd += index;
	pgd_k = init_mm.pgd + index;

	if (!pgd_present(*pgd_k))
		return NULL;

	/*
	 * set_pgd(pgd, *pgd_k); here would be useless on PAE
	 * and redundant with the set_pmd() on non-PAE. As would
	 * set_pud.
	 */

	pud = pud_offset(pgd, address);
	pud_k = pud_offset(pgd_k, address);
	if (!pud_present(*pud_k))
		return NULL;

	pmd = pmd_offset(pud, address);
	pmd_k = pmd_offset(pud_k, address);
	if (!pmd_present(*pmd_k))
		return NULL;
	if (!pmd_present(*pmd))
		set_pmd(pmd, *pmd_k);
	else
		BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
	return pmd_k;
}

/*
 * Handle a fault on the vmalloc or module mapping area
 *
 * This assumes no large pages in there.
 */
static inline int vmalloc_fault(unsigned long address)
{
	unsigned long pgd_paddr;
	pmd_t *pmd_k;
	pte_t *pte_k;
	/*
	 * Synchronize this task's top level page-table
	 * with the 'reference' page table.
	 *
	 * Do _not_ use "current" here. We might be inside
	 * an interrupt in the middle of a task switch..
	 */
	pgd_paddr = read_cr3();
	pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
	if (!pmd_k)
		return -1;
	pte_k = pte_offset_kernel(pmd_k, address);
	if (!pte_present(*pte_k))
		return -1;
	return 0;
}

/*
 * This routine handles page faults.  It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 *
 * error_code:
 *	bit 0 == 0 means no page found, 1 means protection fault
 *	bit 1 == 0 means read, 1 means write
 *	bit 2 == 0 means kernel, 1 means user-mode
 *	bit 3 == 1 means use of reserved bit detected
 *	bit 4 == 1 means fault was an instruction fetch
 */
fastcall void __kprobes do_page_fault(struct pt_regs *regs,
				      unsigned long error_code)
{
	struct task_struct *tsk;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	unsigned long address;
	int write, si_code;

	/* get the address */
	address = read_cr2();

	tsk = current;

	si_code = SEGV_MAPERR;

	/*
	 * We fault-in kernel-space virtual memory on-demand. The
	 * 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 *
	 * This verifies that the fault happens in kernel space
	 * (error_code & 4) == 0, and that the fault was not a
	 * protection error (error_code & 9) == 0.
	 */
	if (unlikely(address >= TASK_SIZE)) {
		if (!(error_code & 0x0000000d) && vmalloc_fault(address) >= 0)
			return;
		if (notify_page_fault(regs, error_code) == NOTIFY_STOP)
			return;
		/*
		 * Don't take the mm semaphore here. If we fixup a prefetch
		 * fault we could otherwise deadlock.
		 */
		goto bad_area_nosemaphore;
	}

	if (notify_page_fault(regs, error_code) == NOTIFY_STOP)
		return;

	/* It's safe to allow irq's after cr2 has been saved and the vmalloc
	   fault has been handled. */
	if (regs->eflags & (X86_EFLAGS_IF|VM_MASK))
		local_irq_enable();

	mm = tsk->mm;

	/*
	 * If we're in an interrupt, have no user context or are running in an
	 * atomic region then we must not take the fault..
	 */
	if (in_atomic() || !mm)
		goto bad_area_nosemaphore;

	/* When running in the kernel we expect faults to occur only to
	 * addresses in user space.  All other faults represent errors in the
	 * kernel and should generate an OOPS.  Unfortunately, in the case of an
	 * erroneous fault occurring in a code path which already holds mmap_sem,
	 * we will deadlock attempting to validate the fault against the
	 * address space.  Luckily the kernel only validly references user
	 * space from well defined areas of code, which are listed in the
	 * exceptions table.
	 *
	 * As the vast majority of faults will be valid we will only perform
	 * the source reference check when there is a possibility of a deadlock.
	 * Attempt to lock the address space; if we cannot, we then validate the
	 * source.  If this is invalid we can skip the address space check,
	 * thus avoiding the deadlock.
	 */
	if (!down_read_trylock(&mm->mmap_sem)) {
		if ((error_code & 4) == 0 &&
		    !search_exception_tables(regs->eip))
			goto bad_area_nosemaphore;
		down_read(&mm->mmap_sem);
	}

	vma = find_vma(mm, address);
	if (!vma)
		goto bad_area;
	if (vma->vm_start <= address)
		goto good_area;
	if (!(vma->vm_flags & VM_GROWSDOWN))
		goto bad_area;
	if (error_code & 4) {
		/*
		 * Accessing the stack below %esp is always a bug.
		 * The large cushion allows instructions like enter
		 * and pusha to work.  ("enter $65535,$31" pushes
		 * 32 pointers and then decrements %esp by 65535.)
		 */
		if (address + 65536 + 32 * sizeof(unsigned long) < regs->esp)
			goto bad_area;
	}
	if (expand_stack(vma, address))
		goto bad_area;
/*
 * Ok, we have a good vm_area for this memory access, so
 * we can handle it..
 */
good_area:
	si_code = SEGV_ACCERR;
	write = 0;
	switch (error_code & 3) {
		default:	/* 3: write, present */
				/* fall through */
		case 2:		/* write, not present */
			if (!(vma->vm_flags & VM_WRITE))
				goto bad_area;
			write++;
			break;
		case 1:		/* read, present */
			goto bad_area;
		case 0:		/* read, not present */
			if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
				goto bad_area;
	}

 survive:
	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
	switch (handle_mm_fault(mm, vma, address, write)) {
		case VM_FAULT_MINOR:
			tsk->min_flt++;
			break;
		case VM_FAULT_MAJOR:
			tsk->maj_flt++;
			break;
		case VM_FAULT_SIGBUS:
			goto do_sigbus;
		case VM_FAULT_OOM:
			goto out_of_memory;
		default:
			BUG();
	}

	/*
	 * Did it hit the DOS screen memory VA from vm86 mode?
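	 * (0xA0000-0xBFFFF is 32 pages, one bit each in screen_bitmap.)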
	 */
	if (regs->eflags & VM_MASK) {
		unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
		if (bit < 32)
			tsk->thread.screen_bitmap |= 1 << bit;
	}
	up_read(&mm->mmap_sem);
	return;

/*
 * Something tried to access memory that isn't in our memory map..
 * Fix it, but check if it's kernel or user first..
 */
bad_area:
	up_read(&mm->mmap_sem);

bad_area_nosemaphore:
	/* User mode accesses just cause a SIGSEGV */
	if (error_code & 4) {
		/*
		 * It's possible to have interrupts off here.
		 */
		local_irq_enable();

		/*
		 * Valid to do another page fault here because this one came
		 * from user space.
		 */
		if (is_prefetch(regs, address, error_code))
			return;

		tsk->thread.cr2 = address;
		/* Kernel addresses are always protection faults */
		tsk->thread.error_code = error_code | (address >= TASK_SIZE);
		tsk->thread.trap_no = 14;
		force_sig_info_fault(SIGSEGV, si_code, address, tsk);
		return;
	}

#ifdef CONFIG_X86_F00F_BUG
	if (boot_cpu_data.f00f_bug) {
		unsigned long nr;

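		/*
		 * The F00F workaround maps the IDT read-only, so the
		 * erratum's errant locked access page-faults here instead
		 * of hanging the CPU.  Entry 6 (#UD) means it really was
		 * an invalid opcode, so hand it on to do_invalid_op().
		 */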
		nr = (address - idt_descr.address) >> 3;

		if (nr == 6) {
			do_invalid_op(regs, 0);
			return;
		}
	}
#endif

no_context:
	/* Are we prepared to handle this kernel fault?  */
	if (fixup_exception(regs))
		return;

	/*
	 * Valid to do another page fault here, because if this fault
	 * had been triggered by is_prefetch, fixup_exception() would
	 * have handled it.
	 */
	if (is_prefetch(regs, address, error_code))
		return;

/*
 * Oops. The kernel tried to access some bad page. We'll have to
 * terminate things with extreme prejudice.
 */

	bust_spinlocks(1);

	if (oops_may_print()) {
		__typeof__(pte_val(__pte(0))) page;

#ifdef CONFIG_X86_PAE
		if (error_code & 16) {
			pte_t *pte = lookup_address(address);

			if (pte && pte_present(*pte) && !pte_exec_kernel(*pte))
				printk(KERN_CRIT "kernel tried to execute "
					"NX-protected page - exploit attempt? "
					"(uid: %d)\n", current->uid);
		}
#endif
		if (address < PAGE_SIZE)
			printk(KERN_ALERT "BUG: unable to handle kernel NULL "
					"pointer dereference");
		else
			printk(KERN_ALERT "BUG: unable to handle kernel paging"
					" request");
		printk(" at virtual address %08lx\n",address);
		printk(KERN_ALERT " printing eip:\n");
		printk("%08lx\n", regs->eip);

		page = read_cr3();
		page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT];
#ifdef CONFIG_X86_PAE
		printk(KERN_ALERT "*pdpt = %016Lx\n", page);
		if ((page >> PAGE_SHIFT) < max_low_pfn
		    && page & _PAGE_PRESENT) {
			page &= PAGE_MASK;
			page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT)
			                                         & (PTRS_PER_PMD - 1)];
			printk(KERN_ALERT "*pde = %016Lx\n", page);
			page &= ~_PAGE_NX;
		}
#else
		printk(KERN_ALERT "*pde = %08lx\n", page);
#endif

		/*
		 * We must not directly access the pte in the highpte
		 * case if the page table is located in highmem.
		 * And let's rather not kmap-atomic the pte, just in case
		 * it's allocated already.
		 */
		if ((page >> PAGE_SHIFT) < max_low_pfn
		    && (page & _PAGE_PRESENT)) {
			page &= PAGE_MASK;
			page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT)
			                                         & (PTRS_PER_PTE - 1)];
			printk(KERN_ALERT "*pte = %0*Lx\n", sizeof(page)*2, (u64)page);
		}
	}

	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	die("Oops", regs, error_code);
	bust_spinlocks(0);
	do_exit(SIGKILL);

/*
 * We ran out of memory, or some other thing happened to us that made
 * us unable to handle the page fault gracefully.
 */
out_of_memory:
	up_read(&mm->mmap_sem);
	if (is_init(tsk)) {
		yield();
		down_read(&mm->mmap_sem);
		goto survive;
	}
	printk("VM: killing process %s\n", tsk->comm);
	if (error_code & 4)
		do_exit(SIGKILL);
	goto no_context;

do_sigbus:
	up_read(&mm->mmap_sem);

	/* Kernel mode? Handle exceptions or die */
	if (!(error_code & 4))
		goto no_context;

	/* User space => ok to do another page fault */
	if (is_prefetch(regs, address, error_code))
		return;

	tsk->thread.cr2 = address;
	tsk->thread.error_code = error_code;
	tsk->thread.trap_no = 14;
	force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
}

void vmalloc_sync_all(void)
{
	/*
	 * Note that races in the updates of insync and start aren't
	 * problematic: insync can only get set bits added, and updates to
	 * start are only improving performance (without affecting correctness
	 * if undone).
	 */
	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
	static unsigned long start = TASK_SIZE;
	unsigned long address;

	if (SHARED_KERNEL_PMD)
		return;

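	/* The walk below relies on TASK_SIZE being PGDIR_SIZE aligned. */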
	BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
	for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) {
		if (!test_bit(pgd_index(address), insync)) {
			unsigned long flags;
			struct page *page;

			spin_lock_irqsave(&pgd_lock, flags);
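			/* (pgd_list is threaded through page->index by the
			   pgd allocation code.) */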
			for (page = pgd_list; page; page =
					(struct page *)page->index)
				if (!vmalloc_sync_one(page_address(page),
								address)) {
					BUG_ON(page != pgd_list);
					break;
				}
			spin_unlock_irqrestore(&pgd_lock, flags);
			if (!page)
				set_bit(pgd_index(address), insync);
		}
		if (address == start && test_bit(pgd_index(address), insync))
			start = address + PGDIR_SIZE;
	}
}