subr_trap.c revision 200
1/*-
2 * Copyright (c) 1990 The Regents of the University of California.
3 * All rights reserved.
4 *
5 * This code is derived from software contributed to Berkeley by
6 * the University of Utah, and William Jolitz.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 * 3. All advertising materials mentioning features or use of this software
17 *    must display the following acknowledgement:
18 *	This product includes software developed by the University of
19 *	California, Berkeley and its contributors.
20 * 4. Neither the name of the University nor the names of its contributors
21 *    may be used to endorse or promote products derived from this software
22 *    without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 *
36 *	@(#)trap.c	7.4 (Berkeley) 5/13/91
37 *
38 * PATCHES MAGIC                LEVEL   PATCH THAT GOT US HERE
39 * --------------------         -----   ----------------------
40 * CURRENT PATCH LEVEL:         1       00137
41 * --------------------         -----   ----------------------
42 *
43 * 08 Apr 93	Bruce Evans		Several VM system fixes
44 * 		Paul Kranenburg		Add counter for vmstat
45 */
46static char rcsid[] = "$Header: /a/cvs/386BSD/src/sys/i386/i386/trap.c,v 1.1.1.1 1993/06/12 14:58:05 rgrimes Exp $";
47
48/*
49 * 386 Trap and System call handleing
50 */
51
52#include "machine/cpu.h"
53#include "machine/psl.h"
54#include "machine/reg.h"
55
56#include "param.h"
57#include "systm.h"
58#include "proc.h"
59#include "user.h"
60#include "acct.h"
61#include "kernel.h"
62#ifdef KTRACE
63#include "ktrace.h"
64#endif
65
66#include "vm/vm_param.h"
67#include "vm/pmap.h"
68#include "vm/vm_map.h"
69#include "sys/vmmeter.h"
70
71#include "machine/trap.h"
72
73#ifdef	__GNUC__
74
75/*
76 * The "r" constraint could be "rm" except for fatal bugs in gas.  As usual,
77 * we omit the size from the mov instruction to avoid nonfatal bugs in gas.
78 */
79#define	read_gs()	({ u_short gs; __asm("mov %%gs,%0" : "=r" (gs)); gs; })
80#define	write_gs(gs)	__asm("mov %0,%%gs" : : "r" ((u_short) gs))
81
82#else	/* not __GNUC__ */
83
84u_short	read_gs		__P((void));
85void	write_gs	__P((/* promoted u_short */ int gs));
86
87#endif	/* __GNUC__ */
88
89struct	sysent sysent[];
90int	nsysent;
91int dostacklimits;
92unsigned rcr2();
93extern short cpl;
94
95
96/*
97 * trap(frame):
98 *	Exception, fault, and trap interface to BSD kernel. This
99 * common code is called from assembly language IDT gate entry
100 * routines that prepare a suitable stack frame, and restore this
101 * frame after the exception has been processed. Note that the
102 * effect is as if the arguments were passed call by reference.
103 */
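/*
 * Because the gate routine reloads the trapped context from this frame on
 * return, stores into it here take effect in that context; this is how the
 * copyfault recovery below (frame.tf_eip = pcb_onfault) and the ERESTART
 * backup of the pc in syscall() redirect execution.
 */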
104
105/*ARGSUSED*/
106trap(frame)
107	struct trapframe frame;
108{
109	register int i;
110	register struct proc *p = curproc;
111	struct timeval syst;
112	int ucode, type, code, eva;
113
114	frame.tf_eflags &= ~PSL_NT;	/* clear nested trap XXX */
115	type = frame.tf_trapno;
116#include "ddb.h"
117#if NDDB > 0
118	if (curpcb && curpcb->pcb_onfault) {
119		if (frame.tf_trapno == T_BPTFLT
120		    || frame.tf_trapno == T_TRCTRAP)
121			if (kdb_trap (type, 0, &frame))
122				return;
123	}
124#endif
125
126/*pg("trap type %d code = %x eip = %x cs = %x eva = %x esp %x",
127			frame.tf_trapno, frame.tf_err, frame.tf_eip,
128			frame.tf_cs, rcr2(), frame.tf_esp);*/
129	if (curpcb == 0 || curproc == 0) goto we_re_toast;
130	if (curpcb->pcb_onfault && frame.tf_trapno != T_PAGEFLT) {
131		extern int _udatasel;
132
133		if (read_gs() != (u_short) _udatasel)
134			/*
135			 * Some user has corrupted %gs but we depend on it in
136			 * copyout() etc.  Fix it up and retry.
137			 *
138			 * (We don't preserve %fs or %gs, so users can change
139			 * them to either _ucodesel, _udatasel or a not-present
140			 * selector, possibly ORed with 0 to 3, making them
141			 * volatile for other users.  Not preserving them saves
142			 * time and doesn't lose functionality or open security
143			 * holes.)
144			 */
145			write_gs(_udatasel);
146		else
147copyfault:
148			frame.tf_eip = (int)curpcb->pcb_onfault;
149		return;
150	}
151
152	syst = p->p_stime;
153	if (ISPL(frame.tf_cs) == SEL_UPL) {
154		type |= T_USER;
155		p->p_regs = (int *)&frame;
156		curpcb->pcb_flags |= FM_TRAP;	/* used by sendsig */
157	}
158
159	ucode=0;
160	eva = rcr2();
161	code = frame.tf_err;
162	switch (type) {
163
164	default:
165	we_re_toast:
166#ifdef KDB
167		if (kdb_trap(&psl))
168			return;
169#endif
170#if NDDB > 0
171		if (kdb_trap (type, 0, &frame))
172			return;
173#endif
174
175		printf("trap type %d code = %x eip = %x cs = %x eflags = %x ",
176			frame.tf_trapno, frame.tf_err, frame.tf_eip,
177			frame.tf_cs, frame.tf_eflags);
178		eva = rcr2();
179		printf("cr2 %x cpl %x\n", eva, cpl);
180		/* type &= ~T_USER; */ /* XXX what the hell is this */
181		panic("trap");
182		/*NOTREACHED*/
183
184	case T_SEGNPFLT|T_USER:
185	case T_STKFLT|T_USER:
186	case T_PROTFLT|T_USER:		/* protection fault */
187		ucode = code + BUS_SEGM_FAULT ;
188		i = SIGBUS;
189		break;
190
191	case T_PRIVINFLT|T_USER:	/* privileged instruction fault */
192	case T_RESADFLT|T_USER:		/* reserved addressing fault */
193	case T_RESOPFLT|T_USER:		/* reserved operand fault */
194	case T_FPOPFLT|T_USER:		/* coprocessor operand fault */
195		ucode = type &~ T_USER;
196		i = SIGILL;
197		break;
198
199	case T_ASTFLT|T_USER:		/* Allow process switch */
200		astoff();
201		cnt.v_soft++;
202		if ((p->p_flag & SOWEUPC) && p->p_stats->p_prof.pr_scale) {
203			addupc(frame.tf_eip, &p->p_stats->p_prof, 1);
204			p->p_flag &= ~SOWEUPC;
205		}
206		goto out;
207
208	case T_DNA|T_USER:
209#ifdef	NPX
210		/* if a transparent fault (due to context switch "late") */
211		if (npxdna()) return;
212#endif
213		i = math_emulate(&frame);
214		if (i == 0) return;
215		ucode = FPE_FPU_NP_TRAP;
216		break;
217
218	case T_BOUND|T_USER:
219		ucode = FPE_SUBRNG_TRAP;
220		i = SIGFPE;
221		break;
222
223	case T_OFLOW|T_USER:
224		ucode = FPE_INTOVF_TRAP;
225		i = SIGFPE;
226		break;
227
228	case T_DIVIDE|T_USER:
229		ucode = FPE_INTDIV_TRAP;
230		i = SIGFPE;
231		break;
232
233	case T_ARITHTRAP|T_USER:
234		ucode = code;
235		i = SIGFPE;
236		break;
237
238	case T_PAGEFLT:			/* allow page faults in kernel mode */
239#if 0
240		/* XXX - check only applies to 386's and 486's with WP off */
241		if (code & PGEX_P) goto we_re_toast;
242#endif
243
244		/* fall into */
245	case T_PAGEFLT|T_USER:		/* page fault */
246	    {
247		register vm_offset_t va;
248		register struct vmspace *vm = p->p_vmspace;
249		register vm_map_t map;
250		int rv;
251		vm_prot_t ftype;
252		extern vm_map_t kernel_map;
253		unsigned nss,v;
254
255		va = trunc_page((vm_offset_t)eva);
256		/*
257		 * Avoid even looking at pde_v(va) for high va's.   va's
258		 * above VM_MAX_KERNEL_ADDRESS don't correspond to normal
259		 * PDE's (half of them correspond to APDEpde and half to
260		 * an unmapped kernel PDE).  va's between 0xFEC00000 and
261		 * VM_MAX_KERNEL_ADDRESS correspond to unmapped kernel PDE's
262		 * (XXX - why are only 3 initialized when 6 are required to
263		 * reach VM_MAX_KERNEL_ADDRESS?).  Faulting in an unmapped
264		 * kernel page table would give inconsistent PTD's.
265		 *
266		 * XXX - faulting in unmapped page tables wastes a page if
267		 * va turns out to be invalid.
268		 *
269		 * XXX - should "kernel address space" cover the kernel page
270		 * tables?  Might have same problem with PDEpde as with
271		 * APDEpde (or there may be no problem with APDEpde).
272		 */
273		if (va > 0xFEBFF000) {
274			rv = KERN_FAILURE;	/* becomes SIGBUS */
275			goto nogo;
276		}
277		/*
278		 * It is only a kernel address space fault iff:
279		 * 	1. (type & T_USER) == 0  and
280		 * 	2. pcb_onfault not set or
281		 *	3. pcb_onfault set but supervisor space fault
282		 * The last can occur during an exec() copyin where the
283		 * argument space is lazy-allocated.
284		 */
285		if (type == T_PAGEFLT && va >= KERNBASE)
286			map = kernel_map;
287		else
288			map = &vm->vm_map;
289		if (code & PGEX_W)
290			ftype = VM_PROT_READ | VM_PROT_WRITE;
291		else
292			ftype = VM_PROT_READ;
293
294#ifdef DEBUG
295		if (map == kernel_map && va == 0) {
296			printf("trap: bad kernel access at %x\n", va);
297			goto we_re_toast;
298		}
299#endif
300
301		/*
302		 * XXX: rude hack to make stack limits "work"
303		 */
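		/*
		 * The user stack is taken to grow down from
		 * vm_maxsaddr + MAXSSIZ; nss is the stack size, in
		 * cluster-rounded clicks, that the stack would need in order
		 * to reach va, and is checked against RLIMIT_STACK.
		 */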
304		nss = 0;
305		if ((caddr_t)va >= vm->vm_maxsaddr && map != kernel_map
306			&& dostacklimits) {
307			nss = clrnd(btoc((unsigned)vm->vm_maxsaddr
308				+ MAXSSIZ - (unsigned)va));
309			if (nss > btoc(p->p_rlimit[RLIMIT_STACK].rlim_cur)) {
310/*pg("trap rlimit %d, maxsaddr %x va %x ", nss, vm->vm_maxsaddr, va);*/
311				rv = KERN_FAILURE;
312				goto nogo;
313			}
314		}
315
316		/* check if page table is mapped, if not, fault it first */
317#define pde_v(v) (PTD[((v)>>PD_SHIFT)&1023].pd_v)
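		/*
		 * pde_v() indexes the page directory (PTD) with the top 10
		 * bits of the address and tests the valid bit, i.e. whether
		 * the page table covering va is resident.  vtopte(va) is the
		 * virtual address of va's PTE, so faulting on it pages in
		 * the page table itself before the main fault is handled.
		 */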
318		if (!pde_v(va)) {
319			v = trunc_page(vtopte(va));
320			rv = vm_fault(map, v, ftype, FALSE);
321			if (rv != KERN_SUCCESS) goto nogo;
322			/* check if page table fault, increment wiring */
323			vm_map_pageable(map, v, round_page(v+1), FALSE);
324		} else v=0;
325		rv = vm_fault(map, va, ftype, FALSE);
326		if (rv == KERN_SUCCESS) {
327			/*
328			 * XXX: continuation of rude stack hack
329			 */
330			if (nss > vm->vm_ssize)
331				vm->vm_ssize = nss;
332			va = trunc_page(vtopte(va));
333		/* for the page table page, increment wiring,
334		   as long as this was not itself a page table fault */
335			if (!v && type != T_PAGEFLT)
336			  vm_map_pageable(map, va, round_page(va+1), FALSE);
337			if (type == T_PAGEFLT)
338				return;
339			goto out;
340		}
341nogo:
342		if (type == T_PAGEFLT) {
343			if (curpcb->pcb_onfault)
344				goto copyfault;
345			printf("vm_fault(%x, %x, %x, 0) -> %x\n",
346			       map, va, ftype, rv);
347			printf("  type %x, code %x\n",
348			       type, code);
349			goto we_re_toast;
350		}
351		i = (rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV;
352		break;
353	    }
354
355#if NDDB == 0
356	case T_TRCTRAP:	 /* trace trap -- someone single stepping lcall's */
357		frame.tf_eflags &= ~PSL_T;
358
359			/* Q: how do we turn it on again? */
360		return;
361#endif
362
363	case T_BPTFLT|T_USER:		/* bpt instruction fault */
364	case T_TRCTRAP|T_USER:		/* trace trap */
365		frame.tf_eflags &= ~PSL_T;
366		i = SIGTRAP;
367		break;
368
369#include "isa.h"
370#if	NISA > 0
371	case T_NMI:
372	case T_NMI|T_USER:
373#if NDDB > 0
374		/* NMI can be hooked up to a pushbutton for debugging */
375		printf ("NMI ... going to debugger\n");
376		if (kdb_trap (type, 0, &frame))
377			return;
378#endif
379		/* machine/parity/power fail/"kitchen sink" faults */
380		if(isa_nmi(code) == 0) return;
381		else goto we_re_toast;
382#endif
383	}
384
385	trapsignal(p, i, ucode);
386	if ((type & T_USER) == 0)
387		return;
388out:
389	while (i = CURSIG(p))
390		psig(i);
391	p->p_pri = p->p_usrpri;
392	if (want_resched) {
393		/*
394		 * Since we are curproc, clock will normally just change
395		 * our priority without moving us from one queue to another
396		 * (since the running process is not on a queue.)
397		 * If that happened after we setrq ourselves but before we
398		 * swtch()'ed, we might not be on the queue indicated by
399		 * our priority.
400		 */
401		(void) splclock();
402		setrq(p);
403		p->p_stats->p_ru.ru_nivcsw++;
404		swtch();
405		(void) splnone();
406		while (i = CURSIG(p))
407			psig(i);
408	}
409	if (p->p_stats->p_prof.pr_scale) {
410		int ticks;
411		struct timeval *tv = &p->p_stime;
412
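		/*
		 * tick is the clock period in microseconds, so this is the
		 * elapsed system time in milliseconds divided by the
		 * milliseconds per clock tick; e.g. 30 ms of system time at
		 * 10 ms/tick charges 3 ticks to the profiling buffer.
		 */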
413		ticks = ((tv->tv_sec - syst.tv_sec) * 1000 +
414			(tv->tv_usec - syst.tv_usec) / 1000) / (tick / 1000);
415		if (ticks) {
416#ifdef PROFTIMER
417			extern int profscale;
418			addupc(frame.tf_eip, &p->p_stats->p_prof,
419			    ticks * profscale);
420#else
421			addupc(frame.tf_eip, &p->p_stats->p_prof, ticks);
422#endif
423		}
424	}
425	curpri = p->p_pri;
426	curpcb->pcb_flags &= ~FM_TRAP;	/* used by sendsig */
427}
428
429/*
430 * Compensate for 386 brain damage (missing URKR).
431 * This is a little simpler than the page fault handler in trap() because
432 * the page tables have already been faulted in and high addresses
433 * are thrown out early for other reasons.
434 */
435int trapwrite(addr)
436	unsigned addr;
437{
438	unsigned nss;
439	struct proc *p;
440	vm_offset_t va;
441	struct vmspace *vm;
442
443	va = trunc_page((vm_offset_t)addr);
444	/*
445	 * XXX - MAX is END.  Changed > to >= for temp. fix.
446	 */
447	if (va >= VM_MAXUSER_ADDRESS)
448		return (1);
449	/*
450	 * XXX: rude stack hack adapted from trap().
451	 */
452	nss = 0;
453	p = curproc;
454	vm = p->p_vmspace;
455	if ((caddr_t)va >= vm->vm_maxsaddr && dostacklimits) {
456		nss = clrnd(btoc((unsigned)vm->vm_maxsaddr + MAXSSIZ
457				 - (unsigned)va));
458		if (nss > btoc(p->p_rlimit[RLIMIT_STACK].rlim_cur))
459			return (1);
460	}
461
462	if (vm_fault(&vm->vm_map, va, VM_PROT_READ | VM_PROT_WRITE, FALSE)
463	    != KERN_SUCCESS)
464		return (1);
465
466	/*
467	 * XXX: continuation of rude stack hack
468	 */
469	if (nss > vm->vm_ssize)
470		vm->vm_ssize = nss;
471
472	return (0);
473}
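
/*
 * Minimal illustrative sketch (not part of the original source) of how a
 * supervisor-mode store into user space on a WP-less 386 might use
 * trapwrite() to simulate the write fault in software, since the hardware
 * will not fault on a read-only user page.  The function name is
 * hypothetical.
 */
#if 0
int
storeword_sketch(uaddr, kval)
	int *uaddr;
	int kval;
{
	if (trapwrite((unsigned)uaddr))	/* grow stack / fault page writable */
		return (EFAULT);	/* nonzero return: not writable */
	*uaddr = kval;			/* safe to store now */
	return (0);
}
#endif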
474
475/*
476 * syscall(frame):
477 *	System call request from POSIX system call gate interface to kernel.
478 * Like trap(), argument is call by reference.
479 */
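/*
 * As implied by the code below, a user-level system call stub looks
 * roughly like (a sketch; the gate selector is an assumption):
 *
 *	movl	$SYS_write,%eax		# system call number
 *	lcall	$7,$0			# 7-byte far call through the gate
 *	jb	error			# carry set: %eax holds errno
 *
 * with the arguments already on the user stack just above the return
 * address, which is why params starts at sf_esp + sizeof (int).
 */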
480/*ARGSUSED*/
481syscall(frame)
482	volatile struct syscframe frame;
483{
484	register int *locr0 = ((int *)&frame);
485	register caddr_t params;
486	register int i;
487	register struct sysent *callp;
488	register struct proc *p = curproc;
489	struct timeval syst;
490	int error, opc;
491	int args[8], rval[2];
492	int code;
493
494#ifdef lint
495	r0 = 0; r0 = r0; r1 = 0; r1 = r1;
496#endif
497	syst = p->p_stime;
498	if (ISPL(frame.sf_cs) != SEL_UPL)
499		panic("syscall");
500
501	code = frame.sf_eax;
502	curpcb->pcb_flags &= ~FM_TRAP;	/* used by sendsig */
503	p->p_regs = (int *)&frame;
504	params = (caddr_t)frame.sf_esp + sizeof (int) ;
505
506	/*
507	 * Reconstruct pc, assuming lcall $X,y is always 7 bytes.
508	 */
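	/*
	 * (0x9a opcode + 32-bit offset + 16-bit selector = 1 + 4 + 2
	 *  = 7 bytes for a direct far call in 32-bit code.)
	 */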
509	opc = frame.sf_eip - 7;
510	callp = (code >= nsysent) ? &sysent[63] : &sysent[code];
511	if (callp == sysent) {
512		i = fuword(params);
513		params += sizeof (int);
514		callp = (code >= nsysent) ? &sysent[63] : &sysent[code];
515	}
516
517	if ((i = callp->sy_narg * sizeof (int)) &&
518	    (error = copyin(params, (caddr_t)args, (u_int)i))) {
519		frame.sf_eax = error;
520		frame.sf_eflags |= PSL_C;	/* carry bit */
521#ifdef KTRACE
522		if (KTRPOINT(p, KTR_SYSCALL))
523			ktrsyscall(p->p_tracep, code, callp->sy_narg, &args);
524#endif
525		goto done;
526	}
527#ifdef KTRACE
528	if (KTRPOINT(p, KTR_SYSCALL))
529		ktrsyscall(p->p_tracep, code, callp->sy_narg, &args);
530#endif
531	rval[0] = 0;
532	rval[1] = frame.sf_edx;
533/*pg("%d. s %d\n", p->p_pid, code);*/
534	error = (*callp->sy_call)(p, args, rval);
535	if (error == ERESTART)
536		frame.sf_eip = opc;
537	else if (error != EJUSTRETURN) {
538		if (error) {
539/*pg("error %d", error);*/
540			frame.sf_eax = error;
541			frame.sf_eflags |= PSL_C;	/* carry bit */
542		} else {
543			frame.sf_eax = rval[0];
544			frame.sf_edx = rval[1];
545			frame.sf_eflags &= ~PSL_C;	/* carry bit */
546		}
547	}
548	/* else if (error == EJUSTRETURN) */
549		/* nothing to do */
550done:
551	/*
552	 * Reinitialize proc pointer `p' as it may be different
553	 * if this is a child returning from fork syscall.
554	 */
555	p = curproc;
556	while (i = CURSIG(p))
557		psig(i);
558	p->p_pri = p->p_usrpri;
559	if (want_resched) {
560		/*
561		 * Since we are curproc, clock will normally just change
562		 * our priority without moving us from one queue to another
563		 * (since the running process is not on a queue.)
564		 * If that happened after we setrq ourselves but before we
565		 * swtch()'ed, we might not be on the queue indicated by
566		 * our priority.
567		 */
568		(void) splclock();
569		setrq(p);
570		p->p_stats->p_ru.ru_nivcsw++;
571		swtch();
572		(void) splnone();
573		while (i = CURSIG(p))
574			psig(i);
575	}
576	if (p->p_stats->p_prof.pr_scale) {
577		int ticks;
578		struct timeval *tv = &p->p_stime;
579
580		ticks = ((tv->tv_sec - syst.tv_sec) * 1000 +
581			(tv->tv_usec - syst.tv_usec) / 1000) / (tick / 1000);
582		if (ticks) {
583#ifdef PROFTIMER
584			extern int profscale;
585			addupc(frame.sf_eip, &p->p_stats->p_prof,
586			    ticks * profscale);
587#else
588			addupc(frame.sf_eip, &p->p_stats->p_prof, ticks);
589#endif
590		}
591	}
592	curpri = p->p_pri;
593#ifdef KTRACE
594	if (KTRPOINT(p, KTR_SYSRET))
595		ktrsysret(p->p_tracep, code, error, rval[0]);
596#endif
597#ifdef	DIAGNOSTICx
598{ extern int _udatasel, _ucodesel;
599	if (frame.sf_ss != _udatasel)
600		printf("ss %x call %d\n", frame.sf_ss, code);
601	if ((frame.sf_cs&0xffff) != _ucodesel)
602		printf("cs %x call %d\n", frame.sf_cs, code);
603	if (frame.sf_eip > VM_MAXUSER_ADDRESS) {
604		printf("eip %x call %d\n", frame.sf_eip, code);
605		frame.sf_eip = 0;
606	}
607}
608#endif
609}
610