/*
 *  linux/arch/x86-64/mm/fault.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/smp_lock.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/tty.h>
#include <linux/vt_kern.h>		/* For unblank_screen() */
#include <linux/compiler.h>

#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgalloc.h>
#include <asm/hardirq.h>
#include <asm/smp.h>
#include <asm/proto.h>
#include <asm/kdebug.h>

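/*
 * Serialize oops output when several CPUs fault at the same time.
 * crashing_cpu records which CPU currently holds pcrash_lock, so a
 * recursive oops on the same CPU does not deadlock on it.
 */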
spinlock_t pcrash_lock;
int crashing_cpu;

extern spinlock_t console_lock, timerlist_lock;

void bust_spinlocks(int yes)
{
	spin_lock_init(&timerlist_lock);
	if (yes) {
		oops_in_progress = 1;
#ifdef CONFIG_SMP
		global_irq_lock = 0;	/* Many serial drivers do __global_cli() */
#endif
	} else {
		int loglevel_save = console_loglevel;
#ifdef CONFIG_VT
		unblank_screen();
#endif
		oops_in_progress = 0;
		/*
		 * OK, the message is on the console.  Now we call printk()
		 * without oops_in_progress set so that printk will give klogd
		 * a poke.  Hold onto your hats...
		 */
		console_loglevel = 15;		/* NMI oopser may have shut the console up */
		printk(" ");
		console_loglevel = loglevel_save;
	}
}

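/*
 * Walk the hardware page table the CPU is currently using (taken from
 * CR3) for the given virtual address and print the entry at each level.
 * The walk stops at the first entry that is not present or that maps a
 * 2MB page.
 */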
void dump_pagetable(unsigned long address)
{
	static char *name[] = { "PML4", "PGD", "PDE", "PTE" };
	int i, shift;
	unsigned long page;

	shift = 9+9+9+12;
	address &= ~0xFFFF000000000000UL;
	asm("movq %%cr3,%0" : "=r" (page));
	for (i = 0; i < 4; i++) {
		unsigned long *padr = (unsigned long *) __va(page);
		padr += (address >> shift) & 0x1FFU;
		if (__get_user(page, padr)) {
			printk("%s: bad %p\n", name[i], padr);
			break;
		}
		printk("%s: %016lx ", name[i], page);
		if ((page & (1 | (1<<7))) != 1) /* Not present or 2MB page */
			break;
		page &= ~0xFFFUL;
		shift -= 9;	/* each page table level indexes 9 address bits */
	}
	printk("\n");
}

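/*
 * Debugging knobs: page_fault_trace logs every page fault when
 * CONFIG_CHECKING is enabled, exception_trace logs user segfaults.
 */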
int page_fault_trace;
int exception_trace = 1;

/*
 * This routine handles page faults.  It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 *
 * error_code:
 *	bit 0 == 0 means no page found, 1 means protection fault
 *	bit 1 == 0 means read, 1 means write
 *	bit 2 == 0 means kernel, 1 means user-mode
 *	bit 3 == 1 means fault was an instruction fetch
 */
asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
	struct task_struct *tsk;
	struct mm_struct *mm;
	struct vm_area_struct * vma;
	unsigned long address;
	unsigned long fixup;
	int write;
	siginfo_t info;

	/* get the address */
	__asm__("movq %%cr2,%0":"=r" (address));

#ifdef CONFIG_CHECKING
	if (page_fault_trace)
		printk("pfault %d rip:%lx rsp:%lx cs:%lu ss:%lu addr %lx error %lx\n",
		       stack_smp_processor_id(), regs->rip, regs->rsp, regs->cs,
		       regs->ss, address, error_code);

	{
		unsigned long gs;
		struct x8664_pda *pda = cpu_pda + stack_smp_processor_id();
		rdmsrl(MSR_GS_BASE, gs);
		if (gs != (unsigned long)pda) {
			wrmsrl(MSR_GS_BASE, (unsigned long)pda);
			printk("page_fault: wrong gs %lx expected %p\n", gs, pda);
		}
	}
#endif

	tsk = current;
	mm = tsk->mm;
	info.si_code = SEGV_MAPERR;

	/* 5 => page not present and from supervisor mode */
	if (unlikely(!(error_code & 5) &&
		     ((address >= VMALLOC_START && address <= VMALLOC_END) ||
		      (address >= MODULES_VADDR && address <= MODULES_END))))
		goto vmalloc_fault;

	/*
	 * If we're in an interrupt or have no user
	 * context, we must not take the fault..
	 */
	if (in_interrupt() || !mm)
		goto no_context;

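/* The out-of-memory path below jumps back here to retry the fault. */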
again:
	down_read(&mm->mmap_sem);

	vma = find_vma(mm, address);
	if (!vma)
		goto bad_area;
	if (vma->vm_start <= address)
		goto good_area;
	if (!(vma->vm_flags & VM_GROWSDOWN))
		goto bad_area;
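	/*
	 * The x86-64 ABI lets user code use up to 128 bytes below %rsp
	 * (the red zone).  A fault further below the stack pointer is a
	 * bug, not a request to grow the stack.
	 */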
	if (error_code & 4) {
		// XXX: align red zone size with ABI
		if (address + 128 < regs->rsp)
			goto bad_area;
	}
	if (expand_stack(vma, address))
		goto bad_area;
/*
 * Ok, we have a good vm_area for this memory access, so
 * we can handle it..
 */
good_area:
	info.si_code = SEGV_ACCERR;
	write = 0;
	switch (error_code & 3) {
		default:	/* 3: write, present */
			/* fall through */
		case 2:		/* write, not present */
			if (!(vma->vm_flags & VM_WRITE))
				goto bad_area;
			write++;
			break;
		case 1:		/* read, present */
			goto bad_area;
		case 0:		/* read, not present */
			if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
				goto bad_area;
	}

	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
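	/*
	 * handle_mm_fault returns 1 for a minor fault, 2 for a major fault
	 * (I/O was needed), 0 when the fault could not be handled at all
	 * (send SIGBUS) and anything else when we are out of memory.
	 */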
	switch (handle_mm_fault(mm, vma, address, write)) {
	case 1:
		tsk->min_flt++;
		break;
	case 2:
		tsk->maj_flt++;
		break;
	case 0:
		goto do_sigbus;
	default:
		goto out_of_memory;
	}

	up_read(&mm->mmap_sem);
	return;

/*
 * Something tried to access memory that isn't in our memory map..
 * Fix it, but check if it's kernel or user first..
 */
bad_area:
	up_read(&mm->mmap_sem);

bad_area_nosemaphore:

	/* User mode accesses just cause a SIGSEGV */
	if (error_code & 4) {
		if (exception_trace) {
			dump_pagetable(address);
			printk("%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n",
			       current->comm, current->pid, address, regs->rip,
			       regs->rsp, error_code);
		}
		tsk->thread.cr2 = address;
		tsk->thread.error_code = error_code;
		tsk->thread.trap_no = 14;
		info.si_signo = SIGSEGV;
		info.si_errno = 0;
		/* info.si_code has been set above */
		info.si_addr = (void *)address;
		force_sig_info(SIGSEGV, &info, tsk);
		return;
	}

no_context:

	/* Are we prepared to handle this kernel fault?  */
	if ((fixup = search_exception_table(regs->rip)) != 0) {
		regs->rip = fixup;
		if (0 && exception_trace)
			printk(KERN_ERR
			       "%s: fixed kernel exception at %lx address %lx err:%ld\n",
			       current->comm, regs->rip, address, error_code);
		return;
	}

/*
 * Oops. The kernel tried to access some bad page. We'll have to
 * terminate things with extreme prejudice.
 */

	console_verbose();
	bust_spinlocks(1);

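	/*
	 * Serialize the oops output across CPUs.  If this CPU already
	 * holds pcrash_lock we faulted recursively while oopsing, so
	 * do not deadlock on it.
	 */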
	if (!in_interrupt()) {
		if (!spin_trylock(&pcrash_lock)) {
			if (crashing_cpu != smp_processor_id())
				spin_lock(&pcrash_lock);
		}
		crashing_cpu = smp_processor_id();
	}

	if (address < PAGE_SIZE)
		printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
	else
		printk(KERN_ALERT "Unable to handle kernel paging request");
	printk(" at virtual address %016lx\n", address);
	printk(" printing rip:\n");
	printk("%016lx\n", regs->rip);
	dump_pagetable(address);

	die("Oops", regs, error_code);

	if (!in_interrupt()) {
		crashing_cpu = -1;  /* small harmless window */
		spin_unlock(&pcrash_lock);
	}

	bust_spinlocks(0);
	do_exit(SIGKILL);

/*
 * We ran out of memory, or some other thing happened to us that made
 * us unable to handle the page fault gracefully.
 */
out_of_memory:
	up_read(&mm->mmap_sem);
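	/* Never OOM-kill init; yield the CPU and retry the fault instead. */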
	if (current->pid == 1) {
		tsk->policy |= SCHED_YIELD;
		schedule();
		goto again;
	}
	printk("VM: killing process %s\n", tsk->comm);
	if (error_code & 4)
		do_exit(SIGKILL);
	goto no_context;

do_sigbus:
	up_read(&mm->mmap_sem);

	/*
	 * Send a sigbus, regardless of whether we were in kernel
	 * or user mode.
	 */
	tsk->thread.cr2 = address;
	tsk->thread.error_code = error_code;
	tsk->thread.trap_no = 14;
	info.si_signo = SIGBUS;
	info.si_errno = 0;
	info.si_code = BUS_ADRERR;
	info.si_addr = (void *)address;
	force_sig_info(SIGBUS, &info, tsk);

	/* Kernel mode? Handle exceptions or die */
	if (!(error_code & 4))
		goto no_context;
	return;


vmalloc_fault:
	{
		pgd_t *pgd;
		pmd_t *pmd;
		pte_t *pte;

		/*
		 * x86-64 has the same kernel 3rd level pages for all CPUs.
		 * But for vmalloc/modules the TLB synchronization works lazily,
		 * so it can happen that we get a page fault for something
		 * that is really already in the page table. Just check if it
		 * is really there and when yes flush the local TLB.
		 */

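		/*
		 * If the entry in the reference kernel page table does not
		 * match the one the CPU is actually using, this is not a
		 * lazily synced vmalloc mapping; treat it as a real fault.
		 */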
		pgd = pgd_offset_k(address);
		if (pgd != current_pgd_offset_k(address))
			goto bad_area_nosemaphore;
		if (!pgd_present(*pgd))
			goto bad_area_nosemaphore;
		pmd = pmd_offset(pgd, address);
		if (!pmd_present(*pmd))
			goto bad_area_nosemaphore;
		pte = pte_offset(pmd, address);
		if (!pte_present(*pte))
			goto bad_area_nosemaphore;

		__flush_tlb_all();
		return;
	}
}