subr_syscall.c revision 757
1/*-
2 * Copyright (c) 1990 The Regents of the University of California.
3 * All rights reserved.
4 *
5 * This code is derived from software contributed to Berkeley by
6 * the University of Utah, and William Jolitz.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 * 3. All advertising materials mentioning features or use of this software
17 *    must display the following acknowledgement:
18 *	This product includes software developed by the University of
19 *	California, Berkeley and its contributors.
20 * 4. Neither the name of the University nor the names of its contributors
21 *    may be used to endorse or promote products derived from this software
22 *    without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 *
36 *	from: @(#)trap.c	7.4 (Berkeley) 5/13/91
37 *	$Id: trap.c,v 1.6 1993/11/04 15:05:41 davidg Exp $
38 */
39
/*
 * 386 Trap and System call handling
 */
43
44#include "npx.h"
45#include "machine/cpu.h"
46#include "machine/psl.h"
47#include "machine/reg.h"
48
49#include "param.h"
50#include "systm.h"
51#include "proc.h"
52#include "user.h"
53#include "acct.h"
54#include "kernel.h"
55#ifdef KTRACE
56#include "ktrace.h"
57#endif
58
59#include "vm/vm_param.h"
60#include "vm/pmap.h"
61#include "vm/vm_map.h"
62#include "sys/vmmeter.h"
63
64#include "machine/trap.h"
65
#ifdef	__GNUC__

/*
 * Accessors for the %gs segment register.
 *
 * read_gs() evaluates to the current %gs selector as a u_short;
 * write_gs(gs) loads %gs with its argument truncated to u_short.
 *
 * The "r" constraint could be "rm" except for fatal bugs in gas.  As usual,
 * we omit the size from the mov instruction to avoid nonfatal bugs in gas.
 *
 * The asm statements are volatile because %gs can change behind the
 * compiler's back, so a read must never be cached or moved.  The macro
 * argument is parenthesized so that the cast applies to the whole
 * argument expression, not just its first operand.
 */
#define	read_gs()	({ u_short gs; __asm __volatile("mov %%gs,%0" : "=r" (gs)); gs; })
#define	write_gs(gs)	__asm __volatile("mov %0,%%gs" : : "r" ((u_short) (gs)))

#else	/* not __GNUC__ */

/* Out-of-line equivalents; they must be supplied elsewhere. */
u_short	read_gs		__P((void));
void	write_gs	__P((/* promoted u_short */ int gs));

#endif	/* __GNUC__ */
81
82struct	sysent sysent[];
83int	nsysent;
84int dostacklimits;
85unsigned rcr2();
86extern short cpl;
87
/*
 * Human-readable trap names, indexed by trap number with the T_USER
 * bit stripped.  trap() prints the matching entry in its fatal-trap
 * report and also hands it to panic().
 */
char *trap_msg[] = {
	"reserved addressing fault",		/*  0 T_RESADFLT */
	"privileged instruction fault",		/*  1 T_PRIVINFLT */
	"reserved operand fault",		/*  2 T_RESOPFLT */
	"breakpoint instruction fault",		/*  3 T_BPTFLT */
	"",					/*  4 unused */
	"system call trap",			/*  5 T_SYSCALL */
	"arithmetic trap",			/*  6 T_ARITHTRAP */
	"system forced exception",		/*  7 T_ASTFLT */
	"segmentation (limit) fault",		/*  8 T_SEGFLT */
	"protection fault",			/*  9 T_PROTFLT */
	"trace trap",				/* 10 T_TRCTRAP */
	"",					/* 11 unused */
	"page fault",				/* 12 T_PAGEFLT */
	"page table fault",			/* 13 T_TABLEFLT */
	"alignment fault",			/* 14 T_ALIGNFLT */
	"kernel stack pointer not valid",	/* 15 T_KSPNOTVAL */
	"bus error",				/* 16 T_BUSERR */
	"kernel debugger fault",		/* 17 T_KDBTRAP */
	"integer divide fault",			/* 18 T_DIVIDE */
	"non-maskable interrupt trap",		/* 19 T_NMI */
	"overflow trap",			/* 20 T_OFLOW */
	"FPU bounds check fault",		/* 21 T_BOUND */
	"FPU device not available",		/* 22 T_DNA */
	"double fault",				/* 23 T_DOUBLEFLT */
	"FPU operand fetch fault",		/* 24 T_FPOPFLT */
	"invalid TSS fault",			/* 25 T_TSSFLT */
	"segment not present fault",		/* 26 T_SEGNPFLT */
	"stack fault",				/* 27 T_STKFLT */
};

/*
 * Highest trap number that has an entry in trap_msg[].  Derived from
 * the table so the bound cannot drift when entries are added; kept as
 * an int so comparisons against int trap numbers behave as before.
 */
#define MAX_TRAP_MSG	((int)(sizeof(trap_msg) / sizeof(trap_msg[0])) - 1)
119
120
121/*
122 * trap(frame):
123 *	Exception, fault, and trap interface to BSD kernel. This
124 * common code is called from assembly language IDT gate entry
125 * routines that prepare a suitable stack frame, and restore this
126 * frame after the exception has been processed. Note that the
127 * effect is as if the arguments were passed call by reference.
128 */
129
130/*ARGSUSED*/
131trap(frame)
132	struct trapframe frame;
133{
134	register int i;
135	register struct proc *p = curproc;
136	struct timeval syst;
137	int ucode, type, code, eva;
138
139	frame.tf_eflags &= ~PSL_NT;	/* clear nested trap XXX */
140	type = frame.tf_trapno;
141#include "ddb.h"
142#if NDDB > 0
143	if (curpcb && curpcb->pcb_onfault) {
144		if (frame.tf_trapno == T_BPTFLT
145		    || frame.tf_trapno == T_TRCTRAP)
146			if (kdb_trap (type, 0, &frame))
147				return;
148	}
149#endif
150
151/*pg("trap type %d code = %x eip = %x cs = %x eva = %x esp %x",
152			frame.tf_trapno, frame.tf_err, frame.tf_eip,
153			frame.tf_cs, rcr2(), frame.tf_esp);*/
154if(curpcb == 0 || curproc == 0) goto we_re_toast;
155	if (curpcb->pcb_onfault && frame.tf_trapno != T_PAGEFLT) {
156		extern int _udatasel;
157
158		if (read_gs() != (u_short) _udatasel)
159			/*
160			 * Some user has corrupted %gs but we depend on it in
161			 * copyout() etc.  Fix it up and retry.
162			 *
163			 * (We don't preserve %fs or %gs, so users can change
164			 * them to either _ucodesel, _udatasel or a not-present
165			 * selector, possibly ORed with 0 to 3, making them
166			 * volatile for other users.  Not preserving them saves
167			 * time and doesn't lose functionality or open security
168			 * holes.)
169			 */
170			write_gs(_udatasel);
171		else
172copyfault:
173			frame.tf_eip = (int)curpcb->pcb_onfault;
174		return;
175	}
176
177	syst = p->p_stime;
178	if (ISPL(frame.tf_cs) == SEL_UPL) {
179		type |= T_USER;
180		p->p_regs = (int *)&frame;
181		curpcb->pcb_flags |= FM_TRAP;	/* used by sendsig */
182	}
183
184	ucode=0;
185	eva = rcr2();
186	code = frame.tf_err;
187	switch (type) {
188
189	default:
190	we_re_toast:
191#ifdef KDB
192		if (kdb_trap(&psl))
193			return;
194#endif
195#if NDDB > 0
196		if (kdb_trap (type, 0, &frame))
197			return;
198#endif
199
200		if ((type & ~T_USER) <= MAX_TRAP_MSG)
201			printf("\n\nFatal trap %d: %s while in %s mode\n",
202				type & ~T_USER, trap_msg[type & ~T_USER],
203				(type & T_USER) ? "user" : "kernel");
204
205		printf("trap type = %d, code = %x\n     eip = %x, cs = %x, eflags = %x, ",
206			frame.tf_trapno, frame.tf_err, frame.tf_eip,
207			frame.tf_cs, frame.tf_eflags);
208		eva = rcr2();
209		printf("cr2 = %x, current priority = %x\n", eva, cpl);
210
211		type &= ~T_USER;
212		if (type <= MAX_TRAP_MSG)
213			panic(trap_msg[type]);
214		else
215			panic("unknown/reserved trap");
216
217		/*NOTREACHED*/
218
219	case T_SEGNPFLT|T_USER:
220	case T_STKFLT|T_USER:
221	case T_PROTFLT|T_USER:		/* protection fault */
222		ucode = code + BUS_SEGM_FAULT ;
223		i = SIGBUS;
224		break;
225
226	case T_PRIVINFLT|T_USER:	/* privileged instruction fault */
227	case T_RESADFLT|T_USER:		/* reserved addressing fault */
228	case T_RESOPFLT|T_USER:		/* reserved operand fault */
229	case T_FPOPFLT|T_USER:		/* coprocessor operand fault */
230		ucode = type &~ T_USER;
231		i = SIGILL;
232		break;
233
234	case T_ASTFLT|T_USER:		/* Allow process switch */
235		astoff();
236		cnt.v_soft++;
237		if ((p->p_flag & SOWEUPC) && p->p_stats->p_prof.pr_scale) {
238			addupc(frame.tf_eip, &p->p_stats->p_prof, 1);
239			p->p_flag &= ~SOWEUPC;
240		}
241		goto out;
242
243	case T_DNA|T_USER:
244#if NNPX > 0
245		/* if a transparent fault (due to context switch "late") */
246		if (npxdna()) return;
247#endif	/* NNPX > 0 */
248#ifdef	MATH_EMULATE
249		i = math_emulate(&frame);
250		if (i == 0) return;
251#else	/* MATH_EMULTATE */
252		panic("trap: math emulation necessary!");
253#endif	/* MATH_EMULTATE */
254		ucode = FPE_FPU_NP_TRAP;
255		break;
256
257	case T_BOUND|T_USER:
258		ucode = FPE_SUBRNG_TRAP;
259		i = SIGFPE;
260		break;
261
262	case T_OFLOW|T_USER:
263		ucode = FPE_INTOVF_TRAP;
264		i = SIGFPE;
265		break;
266
267	case T_DIVIDE|T_USER:
268		ucode = FPE_INTDIV_TRAP;
269		i = SIGFPE;
270		break;
271
272	case T_ARITHTRAP|T_USER:
273		ucode = code;
274		i = SIGFPE;
275		break;
276
277	case T_PAGEFLT:			/* allow page faults in kernel mode */
278#if 0
279		/* XXX - check only applies to 386's and 486's with WP off */
280		if (code & PGEX_P) goto we_re_toast;
281#endif
282
283		/* fall into */
284	case T_PAGEFLT|T_USER:		/* page fault */
285	    {
286		register vm_offset_t va;
287		register struct vmspace *vm = p->p_vmspace;
288		register vm_map_t map;
289		int rv;
290		vm_prot_t ftype;
291		extern vm_map_t kernel_map;
292		unsigned nss,v;
293
294		va = trunc_page((vm_offset_t)eva);
295		/*
296		 * It is only a kernel address space fault iff:
297		 * 	1. (type & T_USER) == 0  and
298		 * 	2. pcb_onfault not set or
299		 *	3. pcb_onfault set but supervisor space fault
300		 * The last can occur during an exec() copyin where the
301		 * argument space is lazy-allocated.
302		 */
303		if (type == T_PAGEFLT && va >= KERNBASE)
304			map = kernel_map;
305		else
306			map = &vm->vm_map;
307		if (code & PGEX_W)
308			ftype = VM_PROT_READ | VM_PROT_WRITE;
309		else
310			ftype = VM_PROT_READ;
311
312#ifdef DEBUG
313		if (map == kernel_map && va == 0) {
314			printf("trap: bad kernel access at %x\n", va);
315			goto we_re_toast;
316		}
317#endif
318
319		/*
320		 * XXX: rude hack to make stack limits "work"
321		 */
322		nss = 0;
323		if ((caddr_t)va >= vm->vm_maxsaddr
324			&& (caddr_t)va < (caddr_t)VM_MAXUSER_ADDRESS
325			&& map != kernel_map
326			&& dostacklimits) {
327			nss = clrnd(btoc((unsigned)vm->vm_maxsaddr
328				+ MAXSSIZ - (unsigned)va));
329			if (nss > btoc(p->p_rlimit[RLIMIT_STACK].rlim_cur)) {
330/*pg("trap rlimit %d, maxsaddr %x va %x ", nss, vm->vm_maxsaddr, va);*/
331				rv = KERN_FAILURE;
332				goto nogo;
333			}
334		}
335
336		/* check if page table is mapped, if not, fault it first */
337#define pde_v(v) (PTD[((v)>>PD_SHIFT)&1023].pd_v)
338		if (!pde_v(va)) {
339			v = trunc_page(vtopte(va));
340			rv = vm_fault(map, v, ftype, FALSE);
341			if (rv != KERN_SUCCESS) goto nogo;
342			/* check if page table fault, increment wiring */
343			vm_map_pageable(map, v, round_page(v+1), FALSE);
344		} else v=0;
345		rv = vm_fault(map, va, ftype, FALSE);
346		if (rv == KERN_SUCCESS) {
347			/*
348			 * XXX: continuation of rude stack hack
349			 */
350			if (nss > vm->vm_ssize)
351				vm->vm_ssize = nss;
352			va = trunc_page(vtopte(va));
353			/* for page table, increment wiring
354			   as long as not a page table fault as well */
355			if (!v && type != T_PAGEFLT)
356			  vm_map_pageable(map, va, round_page(va+1), FALSE);
357			if (type == T_PAGEFLT)
358				return;
359			goto out;
360		}
361nogo:
362		if (type == T_PAGEFLT) {
363			if (curpcb->pcb_onfault)
364				goto copyfault;
365			printf("vm_fault(%x, %x, %x, 0) -> %x\n",
366			       map, va, ftype, rv);
367			printf("  type %x, code %x\n",
368			       type, code);
369			goto we_re_toast;
370		}
371		i = (rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV;
372		break;
373	    }
374
375#if NDDB == 0
376	case T_TRCTRAP:	 /* trace trap -- someone single stepping lcall's */
377		frame.tf_eflags &= ~PSL_T;
378
379			/* Q: how do we turn it on again? */
380		return;
381#endif
382
383	case T_BPTFLT|T_USER:		/* bpt instruction fault */
384	case T_TRCTRAP|T_USER:		/* trace trap */
385		frame.tf_eflags &= ~PSL_T;
386		i = SIGTRAP;
387		break;
388
389#include "isa.h"
390#if	NISA > 0
391	case T_NMI:
392	case T_NMI|T_USER:
393#if NDDB > 0
394		/* NMI can be hooked up to a pushbutton for debugging */
395		printf ("NMI ... going to debugger\n");
396		if (kdb_trap (type, 0, &frame))
397			return;
398#endif
399		/* machine/parity/power fail/"kitchen sink" faults */
400		if(isa_nmi(code) == 0) return;
401		else goto we_re_toast;
402#endif
403	}
404
405	trapsignal(p, i, ucode);
406	if ((type & T_USER) == 0)
407		return;
408out:
409	while (i = CURSIG(p))
410		psig(i);
411	p->p_pri = p->p_usrpri;
412	if (want_resched) {
413		int s;
414		/*
415		 * Since we are curproc, clock will normally just change
416		 * our priority without moving us from one queue to another
417		 * (since the running process is not on a queue.)
418		 * If that happened after we setrq ourselves but before we
419		 * swtch()'ed, we might not be on the queue indicated by
420		 * our priority.
421		 */
422		s = splclock();
423		setrq(p);
424		p->p_stats->p_ru.ru_nivcsw++;
425		swtch();
426		splx(s);
427		while (i = CURSIG(p))
428			psig(i);
429	}
430	if (p->p_stats->p_prof.pr_scale) {
431		int ticks;
432		struct timeval *tv = &p->p_stime;
433
434		ticks = ((tv->tv_sec - syst.tv_sec) * 1000 +
435			(tv->tv_usec - syst.tv_usec) / 1000) / (tick / 1000);
436		if (ticks) {
437#ifdef PROFTIMER
438			extern int profscale;
439			addupc(frame.tf_eip, &p->p_stats->p_prof,
440			    ticks * profscale);
441#else
442			addupc(frame.tf_eip, &p->p_stats->p_prof, ticks);
443#endif
444		}
445	}
446	curpri = p->p_pri;
447	curpcb->pcb_flags &= ~FM_TRAP;	/* used by sendsig */
448}
449
450/*
451 * Compensate for 386 brain damage (missing URKR).
452 * This is a little simpler than the pagefault handler in trap() because
453 * it the page tables have already been faulted in and high addresses
454 * are thrown out early for other reasons.
455 */
456int trapwrite(addr)
457	unsigned addr;
458{
459	unsigned nss;
460	struct proc *p;
461	vm_offset_t va;
462	struct vmspace *vm;
463
464	va = trunc_page((vm_offset_t)addr);
465	/*
466	 * XXX - MAX is END.  Changed > to >= for temp. fix.
467	 */
468	if (va >= VM_MAXUSER_ADDRESS)
469		return (1);
470	/*
471	 * XXX: rude stack hack adapted from trap().
472	 */
473	nss = 0;
474	p = curproc;
475	vm = p->p_vmspace;
476	if ((caddr_t)va >= vm->vm_maxsaddr && dostacklimits) {
477		nss = clrnd(btoc((unsigned)vm->vm_maxsaddr + MAXSSIZ
478				 - (unsigned)va));
479		if (nss > btoc(p->p_rlimit[RLIMIT_STACK].rlim_cur))
480			return (1);
481	}
482
483	if (vm_fault(&vm->vm_map, va, VM_PROT_READ | VM_PROT_WRITE, FALSE)
484	    != KERN_SUCCESS)
485		return (1);
486
487	/*
488	 * XXX: continuation of rude stack hack
489	 */
490	if (nss > vm->vm_ssize)
491		vm->vm_ssize = nss;
492
493	return (0);
494}
495
496/*
497 * syscall(frame):
498 *	System call request from POSIX system call gate interface to kernel.
499 * Like trap(), argument is call by reference.
500 */
501/*ARGSUSED*/
502syscall(frame)
503	volatile struct syscframe frame;
504{
505	register int *locr0 = ((int *)&frame);
506	register caddr_t params;
507	register int i;
508	register struct sysent *callp;
509	register struct proc *p = curproc;
510	struct timeval syst;
511	int error, opc;
512	int args[8], rval[2];
513	int code;
514
515#ifdef lint
516	r0 = 0; r0 = r0; r1 = 0; r1 = r1;
517#endif
518	syst = p->p_stime;
519	if (ISPL(frame.sf_cs) != SEL_UPL)
520		panic("syscall");
521
522	code = frame.sf_eax;
523	curpcb->pcb_flags &= ~FM_TRAP;	/* used by sendsig */
524	p->p_regs = (int *)&frame;
525	params = (caddr_t)frame.sf_esp + sizeof (int) ;
526
527	/*
528	 * Reconstruct pc, assuming lcall $X,y is 7 bytes, as it is always.
529	 */
530	opc = frame.sf_eip - 7;
531	callp = (code >= nsysent) ? &sysent[63] : &sysent[code];
532	if (callp == sysent) {
533		i = fuword(params);
534		params += sizeof (int);
535		callp = (code >= nsysent) ? &sysent[63] : &sysent[code];
536	}
537
538	if ((i = callp->sy_narg * sizeof (int)) &&
539	    (error = copyin(params, (caddr_t)args, (u_int)i))) {
540		frame.sf_eax = error;
541		frame.sf_eflags |= PSL_C;	/* carry bit */
542#ifdef KTRACE
543		if (KTRPOINT(p, KTR_SYSCALL))
544			ktrsyscall(p->p_tracep, code, callp->sy_narg, &args);
545#endif
546		goto done;
547	}
548#ifdef KTRACE
549	if (KTRPOINT(p, KTR_SYSCALL))
550		ktrsyscall(p->p_tracep, code, callp->sy_narg, &args);
551#endif
552	rval[0] = 0;
553	rval[1] = frame.sf_edx;
554/*pg("%d. s %d\n", p->p_pid, code);*/
555	error = (*callp->sy_call)(p, args, rval);
556	if (error == ERESTART)
557		frame.sf_eip = opc;
558	else if (error != EJUSTRETURN) {
559		if (error) {
560/*pg("error %d", error);*/
561			frame.sf_eax = error;
562			frame.sf_eflags |= PSL_C;	/* carry bit */
563		} else {
564			frame.sf_eax = rval[0];
565			frame.sf_edx = rval[1];
566			frame.sf_eflags &= ~PSL_C;	/* carry bit */
567		}
568	}
569	/* else if (error == EJUSTRETURN) */
570		/* nothing to do */
571done:
572	/*
573	 * Reinitialize proc pointer `p' as it may be different
574	 * if this is a child returning from fork syscall.
575	 */
576	p = curproc;
577	while (i = CURSIG(p))
578		psig(i);
579	p->p_pri = p->p_usrpri;
580	if (want_resched) {
581		int s;
582		/*
583		 * Since we are curproc, clock will normally just change
584		 * our priority without moving us from one queue to another
585		 * (since the running process is not on a queue.)
586		 * If that happened after we setrq ourselves but before we
587		 * swtch()'ed, we might not be on the queue indicated by
588		 * our priority.
589		 */
590		s = splclock();
591		setrq(p);
592		p->p_stats->p_ru.ru_nivcsw++;
593		swtch();
594		splx(s);
595		while (i = CURSIG(p))
596			psig(i);
597	}
598	if (p->p_stats->p_prof.pr_scale) {
599		int ticks;
600		struct timeval *tv = &p->p_stime;
601
602		ticks = ((tv->tv_sec - syst.tv_sec) * 1000 +
603			(tv->tv_usec - syst.tv_usec) / 1000) / (tick / 1000);
604		if (ticks) {
605#ifdef PROFTIMER
606			extern int profscale;
607			addupc(frame.sf_eip, &p->p_stats->p_prof,
608			    ticks * profscale);
609#else
610			addupc(frame.sf_eip, &p->p_stats->p_prof, ticks);
611#endif
612		}
613	}
614	curpri = p->p_pri;
615#ifdef KTRACE
616	if (KTRPOINT(p, KTR_SYSRET))
617		ktrsysret(p->p_tracep, code, error, rval[0]);
618#endif
619#ifdef	DIAGNOSTICx
620{ extern int _udatasel, _ucodesel;
621	if (frame.sf_ss != _udatasel)
622		printf("ss %x call %d\n", frame.sf_ss, code);
623	if ((frame.sf_cs&0xffff) != _ucodesel)
624		printf("cs %x call %d\n", frame.sf_cs, code);
625	if (frame.sf_eip > VM_MAXUSER_ADDRESS) {
626		printf("eip %x call %d\n", frame.sf_eip, code);
627		frame.sf_eip = 0;
628	}
629}
630#endif
631}
632