subr_syscall.c revision 4
1/*-
2 * Copyright (c) 1990 The Regents of the University of California.
3 * All rights reserved.
4 *
5 * This code is derived from software contributed to Berkeley by
6 * the University of Utah, and William Jolitz.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 * 3. All advertising materials mentioning features or use of this software
17 *    must display the following acknowledgement:
18 *	This product includes software developed by the University of
19 *	California, Berkeley and its contributors.
20 * 4. Neither the name of the University nor the names of its contributors
21 *    may be used to endorse or promote products derived from this software
22 *    without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 *
36 *	@(#)trap.c	7.4 (Berkeley) 5/13/91
37 *
38 * PATCHES MAGIC                LEVEL   PATCH THAT GOT US HERE
39 * --------------------         -----   ----------------------
40 * CURRENT PATCH LEVEL:         1       00137
41 * --------------------         -----   ----------------------
42 *
43 * 08 Apr 93	Bruce Evans		Several VM system fixes
44 * 		Paul Kranenburg		Add counter for vmstat
45 */
/* RCS identification string for this source file. */
static char rcsid[] =
	"$Header: /usr/bill/working/sys/i386/i386/RCS/trap.c,v 1.2 92/01/21 14:22:13 william Exp $";
47
/*
 * 386 Trap and System call handling
 */
51
52#include "machine/cpu.h"
53#include "machine/psl.h"
54#include "machine/reg.h"
55
56#include "param.h"
57#include "systm.h"
58#include "proc.h"
59#include "user.h"
60#include "acct.h"
61#include "kernel.h"
62#ifdef KTRACE
63#include "ktrace.h"
64#endif
65
66#include "vm/vm_param.h"
67#include "vm/pmap.h"
68#include "vm/vm_map.h"
69#include "sys/vmmeter.h"
70
71#include "machine/trap.h"
72
73
74struct	sysent sysent[];
75int	nsysent;
76int dostacklimits;
77unsigned rcr2();
78extern short cpl;
79
80
81/*
82 * trap(frame):
83 *	Exception, fault, and trap interface to BSD kernel. This
84 * common code is called from assembly language IDT gate entry
85 * routines that prepare a suitable stack frame, and restore this
86 * frame after the exception has been processed. Note that the
87 * effect is as if the arguments were passed call by reference.
88 */
89
90/*ARGSUSED*/
91trap(frame)
92	struct trapframe frame;
93{
94	register int i;
95	register struct proc *p = curproc;
96	struct timeval syst;
97	int ucode, type, code, eva;
98
99	frame.tf_eflags &= ~PSL_NT;	/* clear nested trap XXX */
100	type = frame.tf_trapno;
101#include "ddb.h"
102#if NDDB > 0
103	if (curpcb && curpcb->pcb_onfault) {
104		if (frame.tf_trapno == T_BPTFLT
105		    || frame.tf_trapno == T_TRCTRAP)
106			if (kdb_trap (type, 0, &frame))
107				return;
108	}
109#endif
110
111/*pg("trap type %d code = %x eip = %x cs = %x eva = %x esp %x",
112			frame.tf_trapno, frame.tf_err, frame.tf_eip,
113			frame.tf_cs, rcr2(), frame.tf_esp);*/
114if(curpcb == 0 || curproc == 0) goto we_re_toast;
115	if (curpcb->pcb_onfault && frame.tf_trapno != 0xc) {
116copyfault:
117		frame.tf_eip = (int)curpcb->pcb_onfault;
118		return;
119	}
120
121	syst = p->p_stime;
122	if (ISPL(frame.tf_cs) == SEL_UPL) {
123		type |= T_USER;
124		p->p_regs = (int *)&frame;
125		curpcb->pcb_flags |= FM_TRAP;	/* used by sendsig */
126	}
127
128	ucode=0;
129	eva = rcr2();
130	code = frame.tf_err;
131	switch (type) {
132
133	default:
134	we_re_toast:
135#ifdef KDB
136		if (kdb_trap(&psl))
137			return;
138#endif
139#if NDDB > 0
140		if (kdb_trap (type, 0, &frame))
141			return;
142#endif
143
144		printf("trap type %d code = %x eip = %x cs = %x eflags = %x ",
145			frame.tf_trapno, frame.tf_err, frame.tf_eip,
146			frame.tf_cs, frame.tf_eflags);
147	eva = rcr2();
148		printf("cr2 %x cpl %x\n", eva, cpl);
149		/* type &= ~T_USER; */ /* XXX what the hell is this */
150		panic("trap");
151		/*NOTREACHED*/
152
153	case T_SEGNPFLT|T_USER:
154	case T_STKFLT|T_USER:
155	case T_PROTFLT|T_USER:		/* protection fault */
156		ucode = code + BUS_SEGM_FAULT ;
157		i = SIGBUS;
158		break;
159
160	case T_PRIVINFLT|T_USER:	/* privileged instruction fault */
161	case T_RESADFLT|T_USER:		/* reserved addressing fault */
162	case T_RESOPFLT|T_USER:		/* reserved operand fault */
163	case T_FPOPFLT|T_USER:		/* coprocessor operand fault */
164		ucode = type &~ T_USER;
165		i = SIGILL;
166		break;
167
168	case T_ASTFLT|T_USER:		/* Allow process switch */
169		astoff();
170		cnt.v_soft++;
171		if ((p->p_flag & SOWEUPC) && p->p_stats->p_prof.pr_scale) {
172			addupc(frame.tf_eip, &p->p_stats->p_prof, 1);
173			p->p_flag &= ~SOWEUPC;
174		}
175		goto out;
176
177	case T_DNA|T_USER:
178#ifdef	NPX
179		/* if a transparent fault (due to context switch "late") */
180		if (npxdna()) return;
181#endif
182		i = math_emulate(&frame);
183		if (i == 0) return;
184		ucode = FPE_FPU_NP_TRAP;
185		break;
186
187	case T_BOUND|T_USER:
188		ucode = FPE_SUBRNG_TRAP;
189		i = SIGFPE;
190		break;
191
192	case T_OFLOW|T_USER:
193		ucode = FPE_INTOVF_TRAP;
194		i = SIGFPE;
195		break;
196
197	case T_DIVIDE|T_USER:
198		ucode = FPE_INTDIV_TRAP;
199		i = SIGFPE;
200		break;
201
202	case T_ARITHTRAP|T_USER:
203		ucode = code;
204		i = SIGFPE;
205		break;
206
207	case T_PAGEFLT:			/* allow page faults in kernel mode */
208#if 0
209		/* XXX - check only applies to 386's and 486's with WP off */
210		if (code & PGEX_P) goto we_re_toast;
211#endif
212
213		/* fall into */
214	case T_PAGEFLT|T_USER:		/* page fault */
215	    {
216		register vm_offset_t va;
217		register struct vmspace *vm = p->p_vmspace;
218		register vm_map_t map;
219		int rv;
220		vm_prot_t ftype;
221		extern vm_map_t kernel_map;
222		unsigned nss,v;
223
224		va = trunc_page((vm_offset_t)eva);
225		/*
226		 * Avoid even looking at pde_v(va) for high va's.   va's
227		 * above VM_MAX_KERNEL_ADDRESS don't correspond to normal
228		 * PDE's (half of them correspond to APDEpde and half to
229		 * an unmapped kernel PDE).  va's betweeen 0xFEC00000 and
230		 * VM_MAX_KERNEL_ADDRESS correspond to unmapped kernel PDE's
231		 * (XXX - why are only 3 initialized when 6 are required to
232		 * reach VM_MAX_KERNEL_ADDRESS?).  Faulting in an unmapped
233		 * kernel page table would give inconsistent PTD's.
234		 *
235		 * XXX - faulting in unmapped page tables wastes a page if
236		 * va turns out to be invalid.
237		 *
238		 * XXX - should "kernel address space" cover the kernel page
239		 * tables?  Might have same problem with PDEpde as with
240		 * APDEpde (or there may be no problem with APDEpde).
241		 */
242		if (va > 0xFEBFF000) {
243			rv = KERN_FAILURE;	/* becomes SIGBUS */
244			goto nogo;
245		}
246		/*
247		 * It is only a kernel address space fault iff:
248		 * 	1. (type & T_USER) == 0  and
249		 * 	2. pcb_onfault not set or
250		 *	3. pcb_onfault set but supervisor space fault
251		 * The last can occur during an exec() copyin where the
252		 * argument space is lazy-allocated.
253		 */
254		if (type == T_PAGEFLT && va >= KERNBASE)
255			map = kernel_map;
256		else
257			map = &vm->vm_map;
258		if (code & PGEX_W)
259			ftype = VM_PROT_READ | VM_PROT_WRITE;
260		else
261			ftype = VM_PROT_READ;
262
263#ifdef DEBUG
264		if (map == kernel_map && va == 0) {
265			printf("trap: bad kernel access at %x\n", va);
266			goto we_re_toast;
267		}
268#endif
269
270		/*
271		 * XXX: rude hack to make stack limits "work"
272		 */
273		nss = 0;
274		if ((caddr_t)va >= vm->vm_maxsaddr && map != kernel_map
275			&& dostacklimits) {
276			nss = clrnd(btoc((unsigned)vm->vm_maxsaddr
277				+ MAXSSIZ - (unsigned)va));
278			if (nss > btoc(p->p_rlimit[RLIMIT_STACK].rlim_cur)) {
279/*pg("trap rlimit %d, maxsaddr %x va %x ", nss, vm->vm_maxsaddr, va);*/
280				rv = KERN_FAILURE;
281				goto nogo;
282			}
283		}
284
285		/* check if page table is mapped, if not, fault it first */
286#define pde_v(v) (PTD[((v)>>PD_SHIFT)&1023].pd_v)
287		if (!pde_v(va)) {
288			v = trunc_page(vtopte(va));
289			rv = vm_fault(map, v, ftype, FALSE);
290			if (rv != KERN_SUCCESS) goto nogo;
291			/* check if page table fault, increment wiring */
292			vm_map_pageable(map, v, round_page(v+1), FALSE);
293		} else v=0;
294		rv = vm_fault(map, va, ftype, FALSE);
295		if (rv == KERN_SUCCESS) {
296			/*
297			 * XXX: continuation of rude stack hack
298			 */
299			if (nss > vm->vm_ssize)
300				vm->vm_ssize = nss;
301			va = trunc_page(vtopte(va));
302			/* for page table, increment wiring
303			   as long as not a page table fault as well */
304			if (!v && type != T_PAGEFLT)
305			  vm_map_pageable(map, va, round_page(va+1), FALSE);
306			if (type == T_PAGEFLT)
307				return;
308			goto out;
309		}
310nogo:
311		if (type == T_PAGEFLT) {
312			if (curpcb->pcb_onfault)
313				goto copyfault;
314			printf("vm_fault(%x, %x, %x, 0) -> %x\n",
315			       map, va, ftype, rv);
316			printf("  type %x, code %x\n",
317			       type, code);
318			goto we_re_toast;
319		}
320		i = (rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV;
321		break;
322	    }
323
324#if NDDB == 0
325	case T_TRCTRAP:	 /* trace trap -- someone single stepping lcall's */
326		frame.tf_eflags &= ~PSL_T;
327
328			/* Q: how do we turn it on again? */
329		return;
330#endif
331
332	case T_BPTFLT|T_USER:		/* bpt instruction fault */
333	case T_TRCTRAP|T_USER:		/* trace trap */
334		frame.tf_eflags &= ~PSL_T;
335		i = SIGTRAP;
336		break;
337
338#include "isa.h"
339#if	NISA > 0
340	case T_NMI:
341	case T_NMI|T_USER:
342#if NDDB > 0
343		/* NMI can be hooked up to a pushbutton for debugging */
344		printf ("NMI ... going to debugger\n");
345		if (kdb_trap (type, 0, &frame))
346			return;
347#endif
348		/* machine/parity/power fail/"kitchen sink" faults */
349		if(isa_nmi(code) == 0) return;
350		else goto we_re_toast;
351#endif
352	}
353
354	trapsignal(p, i, ucode);
355	if ((type & T_USER) == 0)
356		return;
357out:
358	while (i = CURSIG(p))
359		psig(i);
360	p->p_pri = p->p_usrpri;
361	if (want_resched) {
362		/*
363		 * Since we are curproc, clock will normally just change
364		 * our priority without moving us from one queue to another
365		 * (since the running process is not on a queue.)
366		 * If that happened after we setrq ourselves but before we
367		 * swtch()'ed, we might not be on the queue indicated by
368		 * our priority.
369		 */
370		(void) splclock();
371		setrq(p);
372		p->p_stats->p_ru.ru_nivcsw++;
373		swtch();
374		(void) splnone();
375		while (i = CURSIG(p))
376			psig(i);
377	}
378	if (p->p_stats->p_prof.pr_scale) {
379		int ticks;
380		struct timeval *tv = &p->p_stime;
381
382		ticks = ((tv->tv_sec - syst.tv_sec) * 1000 +
383			(tv->tv_usec - syst.tv_usec) / 1000) / (tick / 1000);
384		if (ticks) {
385#ifdef PROFTIMER
386			extern int profscale;
387			addupc(frame.tf_eip, &p->p_stats->p_prof,
388			    ticks * profscale);
389#else
390			addupc(frame.tf_eip, &p->p_stats->p_prof, ticks);
391#endif
392		}
393	}
394	curpri = p->p_pri;
395	curpcb->pcb_flags &= ~FM_TRAP;	/* used by sendsig */
396}
397
398/*
399 * Compensate for 386 brain damage (missing URKR)
400 */
401int trapwrite(unsigned addr) {
402	int rv;
403	vm_offset_t va;
404
405	va = trunc_page((vm_offset_t)addr);
406	if (va > VM_MAXUSER_ADDRESS) return(1);
407	rv = vm_fault(&curproc->p_vmspace->vm_map, va,
408		VM_PROT_READ | VM_PROT_WRITE, FALSE);
409	if (rv == KERN_SUCCESS) return(0);
410	else return(1);
411}
412
413/*
414 * syscall(frame):
415 *	System call request from POSIX system call gate interface to kernel.
416 * Like trap(), argument is call by reference.
417 */
418/*ARGSUSED*/
419syscall(frame)
420	volatile struct syscframe frame;
421{
422	register int *locr0 = ((int *)&frame);
423	register caddr_t params;
424	register int i;
425	register struct sysent *callp;
426	register struct proc *p = curproc;
427	struct timeval syst;
428	int error, opc;
429	int args[8], rval[2];
430	int code;
431
432#ifdef lint
433	r0 = 0; r0 = r0; r1 = 0; r1 = r1;
434#endif
435	syst = p->p_stime;
436	if (ISPL(frame.sf_cs) != SEL_UPL)
437		panic("syscall");
438
439	code = frame.sf_eax;
440	curpcb->pcb_flags &= ~FM_TRAP;	/* used by sendsig */
441	p->p_regs = (int *)&frame;
442	params = (caddr_t)frame.sf_esp + sizeof (int) ;
443
444	/*
445	 * Reconstruct pc, assuming lcall $X,y is 7 bytes, as it is always.
446	 */
447	opc = frame.sf_eip - 7;
448	callp = (code >= nsysent) ? &sysent[63] : &sysent[code];
449	if (callp == sysent) {
450		i = fuword(params);
451		params += sizeof (int);
452		callp = (code >= nsysent) ? &sysent[63] : &sysent[code];
453	}
454
455	if ((i = callp->sy_narg * sizeof (int)) &&
456	    (error = copyin(params, (caddr_t)args, (u_int)i))) {
457		frame.sf_eax = error;
458		frame.sf_eflags |= PSL_C;	/* carry bit */
459#ifdef KTRACE
460		if (KTRPOINT(p, KTR_SYSCALL))
461			ktrsyscall(p->p_tracep, code, callp->sy_narg, &args);
462#endif
463		goto done;
464	}
465#ifdef KTRACE
466	if (KTRPOINT(p, KTR_SYSCALL))
467		ktrsyscall(p->p_tracep, code, callp->sy_narg, &args);
468#endif
469	rval[0] = 0;
470	rval[1] = frame.sf_edx;
471/*pg("%d. s %d\n", p->p_pid, code);*/
472	error = (*callp->sy_call)(p, args, rval);
473	if (error == ERESTART)
474		frame.sf_eip = opc;
475	else if (error != EJUSTRETURN) {
476		if (error) {
477/*pg("error %d", error);*/
478			frame.sf_eax = error;
479			frame.sf_eflags |= PSL_C;	/* carry bit */
480		} else {
481			frame.sf_eax = rval[0];
482			frame.sf_edx = rval[1];
483			frame.sf_eflags &= ~PSL_C;	/* carry bit */
484		}
485	}
486	/* else if (error == EJUSTRETURN) */
487		/* nothing to do */
488done:
489	/*
490	 * Reinitialize proc pointer `p' as it may be different
491	 * if this is a child returning from fork syscall.
492	 */
493	p = curproc;
494	while (i = CURSIG(p))
495		psig(i);
496	p->p_pri = p->p_usrpri;
497	if (want_resched) {
498		/*
499		 * Since we are curproc, clock will normally just change
500		 * our priority without moving us from one queue to another
501		 * (since the running process is not on a queue.)
502		 * If that happened after we setrq ourselves but before we
503		 * swtch()'ed, we might not be on the queue indicated by
504		 * our priority.
505		 */
506		(void) splclock();
507		setrq(p);
508		p->p_stats->p_ru.ru_nivcsw++;
509		swtch();
510		(void) splnone();
511		while (i = CURSIG(p))
512			psig(i);
513	}
514	if (p->p_stats->p_prof.pr_scale) {
515		int ticks;
516		struct timeval *tv = &p->p_stime;
517
518		ticks = ((tv->tv_sec - syst.tv_sec) * 1000 +
519			(tv->tv_usec - syst.tv_usec) / 1000) / (tick / 1000);
520		if (ticks) {
521#ifdef PROFTIMER
522			extern int profscale;
523			addupc(frame.sf_eip, &p->p_stats->p_prof,
524			    ticks * profscale);
525#else
526			addupc(frame.sf_eip, &p->p_stats->p_prof, ticks);
527#endif
528		}
529	}
530	curpri = p->p_pri;
531#ifdef KTRACE
532	if (KTRPOINT(p, KTR_SYSRET))
533		ktrsysret(p->p_tracep, code, error, rval[0]);
534#endif
535#ifdef	DIAGNOSTICx
536{ extern int _udatasel, _ucodesel;
537	if (frame.sf_ss != _udatasel)
538		printf("ss %x call %d\n", frame.sf_ss, code);
539	if ((frame.sf_cs&0xffff) != _ucodesel)
540		printf("cs %x call %d\n", frame.sf_cs, code);
541	if (frame.sf_eip > VM_MAXUSER_ADDRESS) {
542		printf("eip %x call %d\n", frame.sf_eip, code);
543		frame.sf_eip = 0;
544	}
545}
546#endif
547}
548