subr_syscall.c revision 351
1/*-
2 * Copyright (c) 1990 The Regents of the University of California.
3 * All rights reserved.
4 *
5 * This code is derived from software contributed to Berkeley by
6 * the University of Utah, and William Jolitz.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 * 3. All advertising materials mentioning features or use of this software
17 *    must display the following acknowledgement:
18 *	This product includes software developed by the University of
19 *	California, Berkeley and its contributors.
20 * 4. Neither the name of the University nor the names of its contributors
21 *    may be used to endorse or promote products derived from this software
22 *    without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 *
36 *	@(#)trap.c	7.4 (Berkeley) 5/13/91
37 *
38 * PATCHES MAGIC                LEVEL   PATCH THAT GOT US HERE
39 * --------------------         -----   ----------------------
40 * CURRENT PATCH LEVEL:         1       00137
41 * --------------------         -----   ----------------------
42 *
43 * 08 Apr 93	Bruce Evans		Several VM system fixes
44 * 		Paul Kranenburg		Add counter for vmstat
45 */
/* RCS "$Header$" keyword string identifying the revision of this file. */
46static char rcsid[] = "$Header: /a/cvs/386BSD/src/sys/i386/i386/trap.c,v 1.2 1993/07/27 10:52:20 davidg Exp $";
47
48/*
49 * 386 Trap and System call handling
50 */
51
52#include "machine/cpu.h"
53#include "machine/psl.h"
54#include "machine/reg.h"
55
56#include "param.h"
57#include "systm.h"
58#include "proc.h"
59#include "user.h"
60#include "acct.h"
61#include "kernel.h"
62#ifdef KTRACE
63#include "ktrace.h"
64#endif
65
66#include "vm/vm_param.h"
67#include "vm/pmap.h"
68#include "vm/vm_map.h"
69#include "sys/vmmeter.h"
70
71#include "machine/trap.h"
72
73#ifdef	__GNUC__
74
75/*
76 * The "r" constraint could be "rm" except for fatal bugs in gas.  As usual,
77 * we omit the size from the mov instruction to avoid nonfatal bugs in gas.
78 *
 * read_gs()	evaluates to the current contents of %gs as a u_short.
 * write_gs(gs)	loads %gs with its argument cast to u_short.
 */
79#define	read_gs()	({ u_short gs; __asm("mov %%gs,%0" : "=r" (gs)); gs; })
80#define	write_gs(gs)	__asm("mov %0,%%gs" : : "r" ((u_short) gs))
81
82#else	/* not __GNUC__ */
83
/*
 * Without GNU C inline assembly the two accessors are only declared here;
 * NOTE(review): their definitions are not visible in this file.
 */
84u_short	read_gs		__P((void));
85void	write_gs	__P((/* promoted u_short */ int gs));
86
87#endif	/* __GNUC__ */
88
89struct	sysent sysent[];	/* system call table, indexed by call number in syscall() */
90int	nsysent;		/* bound used by syscall() to range check call numbers */
91int dostacklimits;		/* nonzero: page fault paths enforce RLIMIT_STACK */
92unsigned rcr2();		/* supplies the fault address (eva) in trap(); presumably reads %cr2 */
93extern short cpl;		/* printed as "cpl" in the fatal trap message */
94
95
96/*
97 * trap(frame):
98 *	Exception, fault, and trap interface to BSD kernel. This
99 * common code is called from assembly language IDT gate entry
100 * routines that prepare a suitable stack frame, and restore this
101 * frame after the exception has been processed. Note that the
102 * effect is as if the arguments were passed call by reference.
103 */
104
105/*ARGSUSED*/
/*
 * trap: C-level handler for processor exceptions, faults and traps.
 *
 * frame	trap frame describing the interrupted context.  It is
 *		declared by value, but this function writes tf_eflags and
 *		tf_eip in it and expects those writes to take effect when
 *		the entry code restores the frame.
 *
 * `type' is tf_trapno, with T_USER or'ed in when the trap came from user
 * privilege.  Kernel-mode traps that are handled return directly.
 * User-mode traps either post a signal through trapsignal() or jump to
 * `out', which delivers pending signals, reschedules when want_resched is
 * set and charges profiling ticks.  Trap types with no case end in
 * panic("trap").
 */
106trap(frame)
107	struct trapframe frame;
108{
109	register int i;
110	register struct proc *p = curproc;
111	struct timeval syst;
112	int ucode, type, code, eva;
113
114	frame.tf_eflags &= ~PSL_NT;	/* clear nested trap XXX */
115	type = frame.tf_trapno;
116#include "ddb.h"
117#if NDDB > 0
	/*
	 * With a fault handler armed, offer breakpoint and trace traps to
	 * the debugger before the pcb_onfault recovery below can claim them.
	 */
118	if (curpcb && curpcb->pcb_onfault) {
119		if (frame.tf_trapno == T_BPTFLT
120		    || frame.tf_trapno == T_TRCTRAP)
121			if (kdb_trap (type, 0, &frame))
122				return;
123	}
124#endif
125
126/*pg("trap type %d code = %x eip = %x cs = %x eva = %x esp %x",
127			frame.tf_trapno, frame.tf_err, frame.tf_eip,
128			frame.tf_cs, rcr2(), frame.tf_esp);*/
/*
 * Without a current pcb or process nothing below can work; jump straight
 * to the fatal path.  Note that this skips the assignments to syst, ucode,
 * eva and code further down.
 */
129if(curpcb == 0 || curproc == 0) goto we_re_toast;
	/*
	 * A trap other than a page fault taken while a fault recovery
	 * address is armed: either repair %gs and retry, or resume at the
	 * recovery address.
	 */
130	if (curpcb->pcb_onfault && frame.tf_trapno != T_PAGEFLT) {
131		extern int _udatasel;
132
133		if (read_gs() != (u_short) _udatasel)
134			/*
135			 * Some user has corrupted %gs but we depend on it in
136			 * copyout() etc.  Fix it up and retry.
137			 *
138			 * (We don't preserve %fs or %gs, so users can change
139			 * them to either _ucodesel, _udatasel or a not-present
140			 * selector, possibly ORed with 0 to 3, making them
141			 * volatile for other users.  Not preserving them saves
142			 * time and doesn't lose functionality or open security
143			 * holes.)
144			 */
145			write_gs(_udatasel);
146		else
			/*
			 * Also entered by goto from `nogo' when a kernel-mode
			 * page fault fails with pcb_onfault set: continue
			 * execution at the recovery address.
			 */
147copyfault:
148			frame.tf_eip = (int)curpcb->pcb_onfault;
149		return;
150	}
151
	/* system time at entry; used for the profiling charge at `out' */
152	syst = p->p_stime;
153	if (ISPL(frame.tf_cs) == SEL_UPL) {
154		type |= T_USER;
155		p->p_regs = (int *)&frame;
156		curpcb->pcb_flags |= FM_TRAP;	/* used by sendsig */
157	}
158
159	ucode=0;
160	eva = rcr2();
161	code = frame.tf_err;
162	switch (type) {
163
164	default:
165	we_re_toast:
166#ifdef KDB
		/* NOTE(review): `psl' is not declared anywhere in this function. */
167		if (kdb_trap(&psl))
168			return;
169#endif
170#if NDDB > 0
171		if (kdb_trap (type, 0, &frame))
172			return;
173#endif
174
175		printf("trap type %d code = %x eip = %x cs = %x eflags = %x ",
176			frame.tf_trapno, frame.tf_err, frame.tf_eip,
177			frame.tf_cs, frame.tf_eflags);
	/* re-read: the early goto above arrives here before eva was assigned */
178	eva = rcr2();
179		printf("cr2 %x cpl %x\n", eva, cpl);
180		/* type &= ~T_USER; */ /* XXX what the hell is this */
181		panic("trap");
182		/*NOTREACHED*/
183
184	case T_SEGNPFLT|T_USER:
185	case T_STKFLT|T_USER:
186	case T_PROTFLT|T_USER:		/* protection fault */
187		ucode = code + BUS_SEGM_FAULT ;
188		i = SIGBUS;
189		break;
190
191	case T_PRIVINFLT|T_USER:	/* privileged instruction fault */
192	case T_RESADFLT|T_USER:		/* reserved addressing fault */
193	case T_RESOPFLT|T_USER:		/* reserved operand fault */
194	case T_FPOPFLT|T_USER:		/* coprocessor operand fault */
195		ucode = type &~ T_USER;
196		i = SIGILL;
197		break;
198
199	case T_ASTFLT|T_USER:		/* Allow process switch */
200		astoff();
201		cnt.v_soft++;
202		if ((p->p_flag & SOWEUPC) && p->p_stats->p_prof.pr_scale) {
203			addupc(frame.tf_eip, &p->p_stats->p_prof, 1);
204			p->p_flag &= ~SOWEUPC;
205		}
206		goto out;
207
208	case T_DNA|T_USER:
209#ifdef	NPX
210		/* if a transparent fault (due to context switch "late") */
211		if (npxdna()) return;
212#endif
213#ifdef	MATH_EMULATE
		/*
		 * A nonzero return from math_emulate() is kept in `i' and
		 * becomes the signal number handed to trapsignal() below.
		 */
214		i = math_emulate(&frame);
215		if (i == 0) return;
216#else	/* MATH_EMULATE */
217		panic("trap: math emulation necessary!");
218#endif	/* MATH_EMULATE */
219		ucode = FPE_FPU_NP_TRAP;
220		break;
221
222	case T_BOUND|T_USER:
223		ucode = FPE_SUBRNG_TRAP;
224		i = SIGFPE;
225		break;
226
227	case T_OFLOW|T_USER:
228		ucode = FPE_INTOVF_TRAP;
229		i = SIGFPE;
230		break;
231
232	case T_DIVIDE|T_USER:
233		ucode = FPE_INTDIV_TRAP;
234		i = SIGFPE;
235		break;
236
237	case T_ARITHTRAP|T_USER:
238		ucode = code;
239		i = SIGFPE;
240		break;
241
242	case T_PAGEFLT:			/* allow page faults in kernel mode */
243#if 0
244		/* XXX - check only applies to 386's and 486's with WP off */
245		if (code & PGEX_P) goto we_re_toast;
246#endif
247
248		/* fall into */
249	case T_PAGEFLT|T_USER:		/* page fault */
250	    {
251		register vm_offset_t va;
252		register struct vmspace *vm = p->p_vmspace;
253		register vm_map_t map;
254		int rv;
255		vm_prot_t ftype;
256		extern vm_map_t kernel_map;
257		unsigned nss,v;
258
259		va = trunc_page((vm_offset_t)eva);
260		/*
261		 * Avoid even looking at pde_v(va) for high va's.   va's
262		 * above VM_MAX_KERNEL_ADDRESS don't correspond to normal
263		 * PDE's (half of them correspond to APDEpde and half to
264		 * an unmapped kernel PDE).  va's between 0xFEC00000 and
265		 * VM_MAX_KERNEL_ADDRESS correspond to unmapped kernel PDE's
266		 * (XXX - why are only 3 initialized when 6 are required to
267		 * reach VM_MAX_KERNEL_ADDRESS?).  Faulting in an unmapped
268		 * kernel page table would give inconsistent PTD's.
269		 *
270		 * XXX - faulting in unmapped page tables wastes a page if
271		 * va turns out to be invalid.
272		 *
273		 * XXX - should "kernel address space" cover the kernel page
274		 * tables?  Might have same problem with PDEpde as with
275		 * APDEpde (or there may be no problem with APDEpde).
276		 */
277		if (va > 0xFEBFF000) {
			/*
			 * NOTE(review): this used to be commented "becomes
			 * SIGBUS", but the mapping after `nogo' produces
			 * SIGBUS only for KERN_PROTECTION_FAILURE and SIGSEGV
			 * for every other value of rv.
			 */
278			rv = KERN_FAILURE;
279			goto nogo;
280		}
281		/*
282		 * It is only a kernel address space fault iff:
283		 * 	1. (type & T_USER) == 0  and
284		 * 	2. pcb_onfault not set or
285		 *	3. pcb_onfault set but supervisor space fault
286		 * The last can occur during an exec() copyin where the
287		 * argument space is lazy-allocated.
288		 */
289		if (type == T_PAGEFLT && va >= KERNBASE)
290			map = kernel_map;
291		else
292			map = &vm->vm_map;
		/* a write fault asks for read and write access, anything else for read */
293		if (code & PGEX_W)
294			ftype = VM_PROT_READ | VM_PROT_WRITE;
295		else
296			ftype = VM_PROT_READ;
297
298#ifdef DEBUG
299		if (map == kernel_map && va == 0) {
300			printf("trap: bad kernel access at %x\n", va);
301			goto we_re_toast;
302		}
303#endif
304
305		/*
306		 * XXX: rude hack to make stack limits "work"
307		 *
		 * nss is the stack size, in clicks, that would be needed
		 * to reach va; it stays 0 when the check does not apply.
		 */
308		nss = 0;
309		if ((caddr_t)va >= vm->vm_maxsaddr && map != kernel_map
310			&& dostacklimits) {
311			nss = clrnd(btoc((unsigned)vm->vm_maxsaddr
312				+ MAXSSIZ - (unsigned)va));
313			if (nss > btoc(p->p_rlimit[RLIMIT_STACK].rlim_cur)) {
314/*pg("trap rlimit %d, maxsaddr %x va %x ", nss, vm->vm_maxsaddr, va);*/
315				rv = KERN_FAILURE;
316				goto nogo;
317			}
318		}
319
320		/* check if page table is mapped, if not, fault it first */
321#define pde_v(v) (PTD[((v)>>PD_SHIFT)&1023].pd_v)
		/*
		 * v records the page table page that had to be faulted in
		 * here, or 0 when the page table was already valid.
		 */
322		if (!pde_v(va)) {
323			v = trunc_page(vtopte(va));
324			rv = vm_fault(map, v, ftype, FALSE);
325			if (rv != KERN_SUCCESS) goto nogo;
326			/* check if page table fault, increment wiring */
327			vm_map_pageable(map, v, round_page(v+1), FALSE);
328		} else v=0;
329		rv = vm_fault(map, va, ftype, FALSE);
330		if (rv == KERN_SUCCESS) {
331			/*
332			 * XXX: continuation of rude stack hack
333			 */
334			if (nss > vm->vm_ssize)
335				vm->vm_ssize = nss;
336			va = trunc_page(vtopte(va));
337			/* for page table, increment wiring
338			   as long as not a page table fault as well */
339			if (!v && type != T_PAGEFLT)
340			  vm_map_pageable(map, va, round_page(va+1), FALSE);
			/* kernel-mode fault resolved: return without the user exit work */
341			if (type == T_PAGEFLT)
342				return;
343			goto out;
344		}
345nogo:
		/*
		 * The fault could not be resolved.  In kernel mode either
		 * resume at the armed recovery address or panic; in user
		 * mode turn rv into a signal.
		 */
346		if (type == T_PAGEFLT) {
347			if (curpcb->pcb_onfault)
348				goto copyfault;
349			printf("vm_fault(%x, %x, %x, 0) -> %x\n",
350			       map, va, ftype, rv);
351			printf("  type %x, code %x\n",
352			       type, code);
353			goto we_re_toast;
354		}
355		i = (rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV;
356		break;
357	    }
358
359#if NDDB == 0
360	case T_TRCTRAP:	 /* trace trap -- someone single stepping lcall's */
361		frame.tf_eflags &= ~PSL_T;
362
363			/* Q: how do we turn it on again? */
364		return;
365#endif
366
367	case T_BPTFLT|T_USER:		/* bpt instruction fault */
368	case T_TRCTRAP|T_USER:		/* trace trap */
369		frame.tf_eflags &= ~PSL_T;
370		i = SIGTRAP;
371		break;
372
373#include "isa.h"
374#if	NISA > 0
375	case T_NMI:
376	case T_NMI|T_USER:
377#if NDDB > 0
378		/* NMI can be hooked up to a pushbutton for debugging */
379		printf ("NMI ... going to debugger\n");
380		if (kdb_trap (type, 0, &frame))
381			return;
382#endif
383		/* machine/parity/power fail/"kitchen sink" faults */
384		if(isa_nmi(code) == 0) return;
385		else goto we_re_toast;
386#endif
387	}
388
	/* every case that reaches here by `break' has set the signal number in i */
389	trapsignal(p, i, ucode);
	/* a kernel-mode trap skips signal delivery, rescheduling and profiling */
390	if ((type & T_USER) == 0)
391		return;
392out:
	/* deliver whatever signals are pending before going back to user mode */
393	while (i = CURSIG(p))
394		psig(i);
395	p->p_pri = p->p_usrpri;
396	if (want_resched) {
397		/*
398		 * Since we are curproc, clock will normally just change
399		 * our priority without moving us from one queue to another
400		 * (since the running process is not on a queue.)
401		 * If that happened after we setrq ourselves but before we
402		 * swtch()'ed, we might not be on the queue indicated by
403		 * our priority.
404		 */
405		(void) splclock();
406		setrq(p);
407		p->p_stats->p_ru.ru_nivcsw++;
408		swtch();
409		(void) splnone();
410		while (i = CURSIG(p))
411			psig(i);
412	}
413	if (p->p_stats->p_prof.pr_scale) {
414		int ticks;
415		struct timeval *tv = &p->p_stime;
416
		/*
		 * System time accumulated since entry, in milliseconds,
		 * divided by tick / 1000 (presumably the length of a clock
		 * tick in milliseconds -- TODO confirm the units of `tick').
		 */
417		ticks = ((tv->tv_sec - syst.tv_sec) * 1000 +
418			(tv->tv_usec - syst.tv_usec) / 1000) / (tick / 1000);
419		if (ticks) {
420#ifdef PROFTIMER
421			extern int profscale;
422			addupc(frame.tf_eip, &p->p_stats->p_prof,
423			    ticks * profscale);
424#else
425			addupc(frame.tf_eip, &p->p_stats->p_prof, ticks);
426#endif
427		}
428	}
429	curpri = p->p_pri;
430	curpcb->pcb_flags &= ~FM_TRAP;	/* used by sendsig */
431}
432
433/*
434 * Compensate for 386 brain damage (missing URKR).
435 * This is a little simpler than the pagefault handler in trap() because
436 * the page tables have already been faulted in and high addresses
437 * are thrown out early for other reasons.
438 */
439int trapwrite(addr)
440	unsigned addr;
441{
442	unsigned nss;
443	struct proc *p;
444	vm_offset_t va;
445	struct vmspace *vm;
446
447	va = trunc_page((vm_offset_t)addr);
448	/*
449	 * XXX - MAX is END.  Changed > to >= for temp. fix.
450	 */
451	if (va >= VM_MAXUSER_ADDRESS)
452		return (1);
453	/*
454	 * XXX: rude stack hack adapted from trap().
455	 */
456	nss = 0;
457	p = curproc;
458	vm = p->p_vmspace;
459	if ((caddr_t)va >= vm->vm_maxsaddr && dostacklimits) {
460		nss = clrnd(btoc((unsigned)vm->vm_maxsaddr + MAXSSIZ
461				 - (unsigned)va));
462		if (nss > btoc(p->p_rlimit[RLIMIT_STACK].rlim_cur))
463			return (1);
464	}
465
466	if (vm_fault(&vm->vm_map, va, VM_PROT_READ | VM_PROT_WRITE, FALSE)
467	    != KERN_SUCCESS)
468		return (1);
469
470	/*
471	 * XXX: continuation of rude stack hack
472	 */
473	if (nss > vm->vm_ssize)
474		vm->vm_ssize = nss;
475
476	return (0);
477}
478
479/*
480 * syscall(frame):
481 *	System call request from POSIX system call gate interface to kernel.
482 * Like trap(), argument is call by reference.
483 */
484/*ARGSUSED*/
485syscall(frame)
486	volatile struct syscframe frame;
487{
488	register int *locr0 = ((int *)&frame);
489	register caddr_t params;
490	register int i;
491	register struct sysent *callp;
492	register struct proc *p = curproc;
493	struct timeval syst;
494	int error, opc;
495	int args[8], rval[2];
496	int code;
497
498#ifdef lint
499	r0 = 0; r0 = r0; r1 = 0; r1 = r1;
500#endif
501	syst = p->p_stime;
502	if (ISPL(frame.sf_cs) != SEL_UPL)
503		panic("syscall");
504
505	code = frame.sf_eax;
506	curpcb->pcb_flags &= ~FM_TRAP;	/* used by sendsig */
507	p->p_regs = (int *)&frame;
508	params = (caddr_t)frame.sf_esp + sizeof (int) ;
509
510	/*
511	 * Reconstruct pc, assuming lcall $X,y is 7 bytes, as it is always.
512	 */
513	opc = frame.sf_eip - 7;
514	callp = (code >= nsysent) ? &sysent[63] : &sysent[code];
515	if (callp == sysent) {
516		i = fuword(params);
517		params += sizeof (int);
518		callp = (code >= nsysent) ? &sysent[63] : &sysent[code];
519	}
520
521	if ((i = callp->sy_narg * sizeof (int)) &&
522	    (error = copyin(params, (caddr_t)args, (u_int)i))) {
523		frame.sf_eax = error;
524		frame.sf_eflags |= PSL_C;	/* carry bit */
525#ifdef KTRACE
526		if (KTRPOINT(p, KTR_SYSCALL))
527			ktrsyscall(p->p_tracep, code, callp->sy_narg, &args);
528#endif
529		goto done;
530	}
531#ifdef KTRACE
532	if (KTRPOINT(p, KTR_SYSCALL))
533		ktrsyscall(p->p_tracep, code, callp->sy_narg, &args);
534#endif
535	rval[0] = 0;
536	rval[1] = frame.sf_edx;
537/*pg("%d. s %d\n", p->p_pid, code);*/
538	error = (*callp->sy_call)(p, args, rval);
539	if (error == ERESTART)
540		frame.sf_eip = opc;
541	else if (error != EJUSTRETURN) {
542		if (error) {
543/*pg("error %d", error);*/
544			frame.sf_eax = error;
545			frame.sf_eflags |= PSL_C;	/* carry bit */
546		} else {
547			frame.sf_eax = rval[0];
548			frame.sf_edx = rval[1];
549			frame.sf_eflags &= ~PSL_C;	/* carry bit */
550		}
551	}
552	/* else if (error == EJUSTRETURN) */
553		/* nothing to do */
554done:
555	/*
556	 * Reinitialize proc pointer `p' as it may be different
557	 * if this is a child returning from fork syscall.
558	 */
559	p = curproc;
560	while (i = CURSIG(p))
561		psig(i);
562	p->p_pri = p->p_usrpri;
563	if (want_resched) {
564		/*
565		 * Since we are curproc, clock will normally just change
566		 * our priority without moving us from one queue to another
567		 * (since the running process is not on a queue.)
568		 * If that happened after we setrq ourselves but before we
569		 * swtch()'ed, we might not be on the queue indicated by
570		 * our priority.
571		 */
572		(void) splclock();
573		setrq(p);
574		p->p_stats->p_ru.ru_nivcsw++;
575		swtch();
576		(void) splnone();
577		while (i = CURSIG(p))
578			psig(i);
579	}
580	if (p->p_stats->p_prof.pr_scale) {
581		int ticks;
582		struct timeval *tv = &p->p_stime;
583
584		ticks = ((tv->tv_sec - syst.tv_sec) * 1000 +
585			(tv->tv_usec - syst.tv_usec) / 1000) / (tick / 1000);
586		if (ticks) {
587#ifdef PROFTIMER
588			extern int profscale;
589			addupc(frame.sf_eip, &p->p_stats->p_prof,
590			    ticks * profscale);
591#else
592			addupc(frame.sf_eip, &p->p_stats->p_prof, ticks);
593#endif
594		}
595	}
596	curpri = p->p_pri;
597#ifdef KTRACE
598	if (KTRPOINT(p, KTR_SYSRET))
599		ktrsysret(p->p_tracep, code, error, rval[0]);
600#endif
601#ifdef	DIAGNOSTICx
602{ extern int _udatasel, _ucodesel;
603	if (frame.sf_ss != _udatasel)
604		printf("ss %x call %d\n", frame.sf_ss, code);
605	if ((frame.sf_cs&0xffff) != _ucodesel)
606		printf("cs %x call %d\n", frame.sf_cs, code);
607	if (frame.sf_eip > VM_MAXUSER_ADDRESS) {
608		printf("eip %x call %d\n", frame.sf_eip, code);
609		frame.sf_eip = 0;
610	}
611}
612#endif
613}
614