/*
 *  arch/s390/mm/fault.c
 *
 *  S390 version
 *    Copyright (C) 1999 IBM Deutschland Entwicklung GmbH, IBM Corporation
 *    Author(s): Hartmut Penner (hp@de.ibm.com)
 *               Ulrich Weigand (uweigand@de.ibm.com)
 *
 *  Derived from "arch/i386/mm/fault.c"
 *    Copyright (C) 1995  Linus Torvalds
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/kdebug.h>
#include <linux/smp_lock.h>
#include <linux/init.h>
#include <linux/console.h>
#include <linux/module.h>
#include <linux/hardirq.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>

#include <asm/system.h>
#include <asm/pgtable.h>
#include <asm/s390_ext.h>

#ifndef CONFIG_64BIT
#define __FAIL_ADDR_MASK 0x7ffff000
#define __FIXUP_MASK 0x7fffffff
#define __SUBCODE_MASK 0x0200
#define __PF_RES_FIELD 0ULL
#else /* CONFIG_64BIT */
#define __FAIL_ADDR_MASK -4096L
#define __FIXUP_MASK ~0L
#define __SUBCODE_MASK 0x0600
#define __PF_RES_FIELD 0x8000000000000000ULL
#endif /* CONFIG_64BIT */
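
/*
 * A rough guide to the masks above, as they are used in this file:
 * __FAIL_ADDR_MASK extracts the page-aligned failing address from
 * S390_lowcore.trans_exc_code (in 31-bit mode only a 31-bit address
 * fits, hence 0x7ffff000).  __FIXUP_MASK strips the 31-bit
 * addressing-mode bit from the PSW address before the exception
 * table search.  __SUBCODE_MASK is the expected high byte of the
 * pfault external-interrupt subcode, and __PF_RES_FIELD is the
 * reserved-field value passed to the pfault diagnose (see
 * pfault_init() below).
 */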

#ifdef CONFIG_SYSCTL
extern int sysctl_userprocess_debug;
#endif

extern void die(const char *, struct pt_regs *, long);

#ifdef CONFIG_KPROBES
static inline int notify_page_fault(struct pt_regs *regs, long err)
{
	int ret = 0;

	/* kprobe_running() needs smp_processor_id() */
	if (!user_mode(regs)) {
		preempt_disable();
		if (kprobe_running() && kprobe_fault_handler(regs, 14))
			ret = 1;
		preempt_enable();
	}

	return ret;
}
#else
static inline int notify_page_fault(struct pt_regs *regs, long err)
{
	return 0;
}
#endif


/*
 * Unlock any spinlocks which will prevent us from getting the
 * message out.
 */
void bust_spinlocks(int yes)
{
	if (yes) {
		oops_in_progress = 1;
	} else {
		int loglevel_save = console_loglevel;
		console_unblank();
		oops_in_progress = 0;
		/*
		 * OK, the message is on the console.  Now we call printk()
		 * without oops_in_progress set so that printk will give klogd
		 * a poke.  Hold onto your hats...
		 */
		console_loglevel = 15;
		printk(" ");
		console_loglevel = loglevel_save;
	}
}

/*
 * Returns the address space associated with the fault.
 * Returns 0 for kernel space, 1 for user space and
 * 2 for code execution in user space with noexec=on.
 */
static inline int check_space(struct task_struct *tsk)
{
	/*
	 * The lowest two bits of S390_lowcore.trans_exc_code
	 * indicate which paging table was used.
	 */
	int desc = S390_lowcore.trans_exc_code & 3;

	if (desc == 3)	/* Home Segment Table Descriptor */
		return switch_amode == 0;
	if (desc == 2)	/* Secondary Segment Table Descriptor */
		return tsk->thread.mm_segment.ar4;
#ifdef CONFIG_S390_SWITCH_AMODE
	if (unlikely(desc == 1)) { /* STD determined via access register */
		/* %a0 always indicates primary space. */
		if (S390_lowcore.exc_access_id != 0) {
			save_access_regs(tsk->thread.acrs);
			/*
			 * An alet of 0 indicates primary space.
			 * An alet of 1 indicates secondary space.
			 * Any other alet values generate an
			 * alen-translation exception.
			 */
			if (tsk->thread.acrs[S390_lowcore.exc_access_id])
				return tsk->thread.mm_segment.ar4;
		}
	}
#endif
	/* Primary Segment Table Descriptor */
	return switch_amode << s390_noexec;
}
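
/*
 * For the primary-space case the return value above is computed as
 * switch_amode << s390_noexec; spelled out: 0 (kernel) if
 * switch_amode is off, 1 (user) if switch_amode is on and noexec is
 * off, and 2 (user code execution) if both are on -- matching the
 * 0/1/2 convention documented above check_space().
 */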

/*
 * Send SIGSEGV to task.  This is an external routine
 * to keep the stack usage of do_page_fault small.
 */
static void do_sigsegv(struct pt_regs *regs, unsigned long error_code,
		       int si_code, unsigned long address)
{
	struct siginfo si;

#if defined(CONFIG_SYSCTL) || defined(CONFIG_PROCESS_DEBUG)
#if defined(CONFIG_SYSCTL)
	if (sysctl_userprocess_debug)
#endif
	{
		printk("User process fault: interruption code 0x%lX\n",
		       error_code);
		printk("failing address: %lX\n", address);
		show_regs(regs);
	}
#endif
	si.si_signo = SIGSEGV;
	si.si_errno = 0;	/* don't leak uninitialized stack data */
	si.si_code = si_code;
	si.si_addr = (void __user *) address;
	force_sig_info(SIGSEGV, &si, current);
}

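/*
 * Handle a fault for which there is no user context.  First consult
 * the exception tables: if the faulting PSW address has a fixup
 * entry (installed e.g. by the uaccess inline assembly via
 * EX_TABLE), continue execution there instead of oopsing.
 */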
static void do_no_context(struct pt_regs *regs, unsigned long error_code,
			  unsigned long address)
{
	const struct exception_table_entry *fixup;

	/* Are we prepared to handle this kernel fault?  */
	fixup = search_exception_tables(regs->psw.addr & __FIXUP_MASK);
	if (fixup) {
		regs->psw.addr = fixup->fixup | PSW_ADDR_AMODE;
		return;
	}

	/*
	 * Oops. The kernel tried to access some bad page. We'll have to
	 * terminate things with extreme prejudice.
	 */
	if (check_space(current) == 0)
		printk(KERN_ALERT "Unable to handle kernel pointer dereference"
		       " at virtual kernel address %p\n", (void *)address);
	else
		printk(KERN_ALERT "Unable to handle kernel paging request"
		       " at virtual user address %p\n", (void *)address);

	die("Oops", regs, error_code);
	do_exit(SIGKILL);
}

static void do_low_address(struct pt_regs *regs, unsigned long error_code)
{
	/* A low-address protection hit in kernel mode means a NULL
	   pointer write access in kernel mode.  In user mode such a
	   hit 'cannot happen', so treat it as fatal. */
	if (regs->psw.mask & PSW_MASK_PSTATE) {
		die("Low-address protection", regs, error_code);
		do_exit(SIGKILL);
	}

	do_no_context(regs, error_code, 0);
}

/*
 * We ran out of memory, or some other thing happened to us that made
 * us unable to handle the page fault gracefully.
 */
static int do_out_of_memory(struct pt_regs *regs, unsigned long error_code,
			    unsigned long address)
{
	struct task_struct *tsk = current;
	struct mm_struct *mm = tsk->mm;

	up_read(&mm->mmap_sem);
	if (is_init(tsk)) {
		yield();
		down_read(&mm->mmap_sem);
		return 1;
	}
	printk("VM: killing process %s\n", tsk->comm);
	if (regs->psw.mask & PSW_MASK_PSTATE)
		do_exit(SIGKILL);
	do_no_context(regs, error_code, address);
	return 0;
}

static void do_sigbus(struct pt_regs *regs, unsigned long error_code,
		      unsigned long address)
{
	struct task_struct *tsk = current;
	struct mm_struct *mm = tsk->mm;

	up_read(&mm->mmap_sem);
	/*
	 * Send a sigbus, regardless of whether we were in kernel
	 * or user mode.
	 */
	tsk->thread.prot_addr = address;
	tsk->thread.trap_no = error_code;
	force_sig(SIGBUS, tsk);

	/* Kernel mode? Handle exceptions or die */
	if (!(regs->psw.mask & PSW_MASK_PSTATE))
		do_no_context(regs, error_code, address);
}

#ifdef CONFIG_S390_EXEC_PROTECT
extern long sys_sigreturn(struct pt_regs *regs);
extern long sys_rt_sigreturn(struct pt_regs *regs);
extern long sys32_sigreturn(struct pt_regs *regs);
extern long sys32_rt_sigreturn(struct pt_regs *regs);

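/*
 * The two opcodes checked below are complete instructions: 0x0a is
 * the svc opcode, so 0x0a77 is "svc 119" (__NR_sigreturn) and 0x0aad
 * is "svc 173" (__NR_rt_sigreturn) -- the instructions the signal
 * trampolines use to return from a handler.
 */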
static int signal_return(struct mm_struct *mm, struct pt_regs *regs,
			 unsigned long address, unsigned long error_code)
{
	u16 instruction;
	int rc;
#ifdef CONFIG_COMPAT
	int compat;
#endif

	pagefault_disable();
	rc = __get_user(instruction, (u16 __user *) regs->psw.addr);
	pagefault_enable();
	if (rc)
		return -EFAULT;

	up_read(&mm->mmap_sem);
	clear_tsk_thread_flag(current, TIF_SINGLE_STEP);
#ifdef CONFIG_COMPAT
	compat = test_tsk_thread_flag(current, TIF_31BIT);
	if (compat && instruction == 0x0a77)
		sys32_sigreturn(regs);
	else if (compat && instruction == 0x0aad)
		sys32_rt_sigreturn(regs);
	else
#endif
	if (instruction == 0x0a77)
		sys_sigreturn(regs);
	else if (instruction == 0x0aad)
		sys_rt_sigreturn(regs);
	else {
		current->thread.prot_addr = address;
		current->thread.trap_no = error_code;
		do_sigsegv(regs, error_code, SEGV_MAPERR, address);
	}
	return 0;
}
#endif /* CONFIG_S390_EXEC_PROTECT */

/*
 * This routine handles page faults.  It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 *
 * error_code:
 *   04       Protection           ->  Write-Protection  (suppression)
 *   10       Segment translation  ->  Not present       (nullification)
 *   11       Page translation     ->  Not present       (nullification)
 *   3b       Region third trans.  ->  Not present       (nullification)
 */
static inline void
do_exception(struct pt_regs *regs, unsigned long error_code, int write)
{
	struct task_struct *tsk;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	unsigned long address;
	int space;
	int si_code;

	if (notify_page_fault(regs, error_code))
		return;

	tsk = current;
	mm = tsk->mm;

	/* get the failing address and the affected space */
	address = S390_lowcore.trans_exc_code & __FAIL_ADDR_MASK;
	space = check_space(tsk);

	/*
	 * Verify that the fault happened in user space, that
	 * we are not in an interrupt and that there is a
	 * user context.
	 */
	if (unlikely(space == 0 || in_atomic() || !mm))
		goto no_context;

	/*
	 * When we get here, the fault happened in the current
	 * task's user address space, so we can switch on the
	 * interrupts again and then search the VMAs
	 */
	local_irq_enable();

	down_read(&mm->mmap_sem);

	si_code = SEGV_MAPERR;
	vma = find_vma(mm, address);
	if (!vma)
		goto bad_area;

#ifdef CONFIG_S390_EXEC_PROTECT
	if (unlikely((space == 2) && !(vma->vm_flags & VM_EXEC)))
		if (!signal_return(mm, regs, address, error_code))
			/*
			 * signal_return() has done an up_read(&mm->mmap_sem)
			 * if it returns 0.
			 */
			return;
#endif

	if (vma->vm_start <= address)
		goto good_area;
	if (!(vma->vm_flags & VM_GROWSDOWN))
		goto bad_area;
	if (expand_stack(vma, address))
		goto bad_area;
/*
 * Ok, we have a good vm_area for this memory access, so
 * we can handle it..
 */
good_area:
	si_code = SEGV_ACCERR;
	if (!write) {
		/* page not present, check vm flags */
		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
			goto bad_area;
	} else {
		if (!(vma->vm_flags & VM_WRITE))
			goto bad_area;
	}

survive:
	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
	switch (handle_mm_fault(mm, vma, address, write)) {
	case VM_FAULT_MINOR:
		tsk->min_flt++;
		break;
	case VM_FAULT_MAJOR:
		tsk->maj_flt++;
		break;
	case VM_FAULT_SIGBUS:
		do_sigbus(regs, error_code, address);
		return;
	case VM_FAULT_OOM:
		if (do_out_of_memory(regs, error_code, address))
			goto survive;
		return;
	default:
		BUG();
	}

	up_read(&mm->mmap_sem);
	/*
	 * The instruction that caused the program check will
	 * be repeated. Don't signal single step via SIGTRAP.
	 */
	clear_tsk_thread_flag(tsk, TIF_SINGLE_STEP);
	return;

/*
 * Something tried to access memory that isn't in our memory map..
 * Fix it, but check if it's kernel or user first..
 */
bad_area:
	up_read(&mm->mmap_sem);

	/* User mode accesses just cause a SIGSEGV */
	if (regs->psw.mask & PSW_MASK_PSTATE) {
		tsk->thread.prot_addr = address;
		tsk->thread.trap_no = error_code;
		do_sigsegv(regs, error_code, si_code, address);
		return;
	}

no_context:
	do_no_context(regs, error_code, address);
}

void __kprobes do_protection_exception(struct pt_regs *regs,
				       unsigned long error_code)
{
	/*
	 * Protection exceptions are suppressing, i.e. the PSW already
	 * points past the faulting instruction; back it up by the
	 * instruction length stored in the upper half of error_code.
	 */
	regs->psw.addr -= (error_code >> 16);
	/*
	 * Check for low-address protection.  This needs to be treated
	 * as a special case because the translation exception code
	 * field is not guaranteed to contain valid data in this case.
	 */
	if (unlikely(!(S390_lowcore.trans_exc_code & 4))) {
		do_low_address(regs, error_code);
		return;
	}
	do_exception(regs, 4, 1);
}

void __kprobes do_dat_exception(struct pt_regs *regs, unsigned long error_code)
{
	do_exception(regs, error_code & 0xff, 0);
}

#ifdef CONFIG_PFAULT
/*
 * 'pfault' pseudo page fault routines.
 */
static ext_int_info_t ext_int_pfault;
static int pfault_disable = 0;

static int __init nopfault(char *str)
{
	pfault_disable = 1;
	return 1;
}

__setup("nopfault", nopfault);

typedef struct {
	__u16 refdiagc;
	__u16 reffcode;
	__u16 refdwlen;
	__u16 refversn;
	__u64 refgaddr;
	__u64 refselmk;
	__u64 refcmpmk;
	__u64 reserved;
} __attribute__ ((packed)) pfault_refbk_t;
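
/*
 * A short sketch of the parameter block above, as used by
 * pfault_init()/pfault_fini() below: refdiagc holds the diagnose
 * code 0x258, reffcode selects the function (0 = establish pfault
 * handshaking, 1 = cancel it), refdwlen/refversn describe the block
 * itself, and refgaddr names the storage location whose contents VM
 * reports as the token with each pfault interrupt -- __LC_CURRENT,
 * so the token is the task struct pointer of the faulting task (see
 * pfault_interrupt() below).
 */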

int pfault_init(void)
{
	pfault_refbk_t refbk =
		{ 0x258, 0, 5, 2, __LC_CURRENT, 1ULL << 48, 1ULL << 48,
		  __PF_RES_FIELD };
	int rc;

	if (!MACHINE_IS_VM || pfault_disable)
		return -1;
	asm volatile(
		"	diag	%1,%0,0x258\n"
		"0:	j	2f\n"
		"1:	la	%0,8\n"
		"2:\n"
		EX_TABLE(0b,1b)
		: "=d" (rc) : "a" (&refbk), "m" (refbk) : "cc");
	__ctl_set_bit(0, 9);
	return rc;
}
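
/*
 * Note: the __ctl_set_bit(0, 9) above presumably enables the
 * external-interrupt submask bit used for the pfault signal in
 * control register 0; pfault_fini() clears the same bit again before
 * cancelling the handshake.
 */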

void pfault_fini(void)
{
	pfault_refbk_t refbk =
	{ 0x258, 1, 5, 2, 0ULL, 0ULL, 0ULL, 0ULL };

	if (!MACHINE_IS_VM || pfault_disable)
		return;
	__ctl_clear_bit(0, 9);
	asm volatile(
		"	diag	%0,0,0x258\n"
		"0:\n"
		EX_TABLE(0b,0b)
		: : "a" (&refbk), "m" (refbk) : "cc");
}

static void pfault_interrupt(__u16 error_code)
{
	struct task_struct *tsk;
	__u16 subcode;

	/*
	 * Get the external interruption subcode & pfault
	 * initial/completion signal bit. VM stores this
	 * in the 'cpu address' field associated with the
	 * external interrupt.
	 */
	subcode = S390_lowcore.cpu_addr;
	if ((subcode & 0xff00) != __SUBCODE_MASK)
		return;

	/*
	 * Get the token, i.e. the address of the task structure
	 * of the affected task.
	 */
	tsk = *(struct task_struct **) __LC_PFAULT_INTPARM;

	if (subcode & 0x0080) {
		/* signal bit is set -> a page has been swapped in by VM */
		if (xchg(&tsk->thread.pfault_wait, -1) != 0) {
			/* Initial interrupt was faster than the completion
			 * interrupt. pfault_wait is valid. Set pfault_wait
			 * back to zero and wake up the process. This can
			 * safely be done because the task is still sleeping
			 * and can't produce new pfaults. */
			tsk->thread.pfault_wait = 0;
			wake_up_process(tsk);
			put_task_struct(tsk);
		}
	} else {
		/* signal bit not set -> a real page is missing. */
		get_task_struct(tsk);
		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
		if (xchg(&tsk->thread.pfault_wait, 1) != 0) {
			/* Completion interrupt was faster than the initial
			 * interrupt (swapped in a -1 for pfault_wait). Set
			 * pfault_wait back to zero and exit. This can be
			 * done safely because tsk is running in kernel
			 * mode and can't produce new pfaults. */
			tsk->thread.pfault_wait = 0;
			set_task_state(tsk, TASK_RUNNING);
			put_task_struct(tsk);
		} else
			set_tsk_need_resched(tsk);
	}
}
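
/*
 * To summarize the races handled above: tsk->thread.pfault_wait acts
 * as a tiny state machine -- 0 means no pfault is outstanding, 1
 * means the initial interrupt has arrived and the task sleeps, and
 * -1 means the completion interrupt overtook the initial one.  The
 * two xchg() calls make each transition atomic with respect to the
 * other interrupt.
 */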

void __init pfault_irq_init(void)
{
	if (!MACHINE_IS_VM)
		return;

	/*
	 * Try to get pfault pseudo page faults going.
	 */
	if (register_early_external_interrupt(0x2603, pfault_interrupt,
					      &ext_int_pfault) != 0)
		panic("Couldn't request external interrupt 0x2603");

	if (pfault_init() == 0)
		return;

	/* Tough luck, no pfault. */
	pfault_disable = 1;
	unregister_early_external_interrupt(0x2603, pfault_interrupt,
					    &ext_int_pfault);
}
#endif