subr_syscall.c revision 701
1/*-
2 * Copyright (c) 1990 The Regents of the University of California.
3 * All rights reserved.
4 *
5 * This code is derived from software contributed to Berkeley by
6 * the University of Utah, and William Jolitz.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 * 3. All advertising materials mentioning features or use of this software
17 *    must display the following acknowledgement:
18 *	This product includes software developed by the University of
19 *	California, Berkeley and its contributors.
20 * 4. Neither the name of the University nor the names of its contributors
21 *    may be used to endorse or promote products derived from this software
22 *    without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 *
36 *	from: @(#)trap.c	7.4 (Berkeley) 5/13/91
37 *	$Id: trap.c,v 1.5 1993/11/01 11:51:29 chmr Exp $
38 */
39
/*
 * 386 Trap and System call handling
 */
43
44#include "npx.h"
45#include "machine/cpu.h"
46#include "machine/psl.h"
47#include "machine/reg.h"
48
49#include "param.h"
50#include "systm.h"
51#include "proc.h"
52#include "user.h"
53#include "acct.h"
54#include "kernel.h"
55#ifdef KTRACE
56#include "ktrace.h"
57#endif
58
59#include "vm/vm_param.h"
60#include "vm/pmap.h"
61#include "vm/vm_map.h"
62#include "sys/vmmeter.h"
63
64#include "machine/trap.h"
65
#ifdef	__GNUC__

/*
 * Accessors for the %gs segment register.
 *
 * read_gs() evaluates to the current %gs selector as a u_short;
 * write_gs(gs) loads %gs with its argument truncated to u_short.
 *
 * The "r" constraint could be "rm" except for fatal bugs in gas.  As usual,
 * we omit the size from the mov instruction to avoid nonfatal bugs in gas.
 *
 * The macro argument of write_gs() is parenthesized so that the u_short
 * cast applies to the whole argument expression, not just its first operand.
 */
#define	read_gs()	({ u_short gs; __asm("mov %%gs,%0" : "=r" (gs)); gs; })
#define	write_gs(gs)	__asm("mov %0,%%gs" : : "r" ((u_short) (gs)))

#else	/* not __GNUC__ */

/* Out-of-line equivalents for compilers without GNU inline assembly. */
u_short	read_gs		__P((void));
void	write_gs	__P((/* promoted u_short */ int gs));

#endif	/* __GNUC__ */
81
82struct	sysent sysent[];
83int	nsysent;
84int dostacklimits;
85unsigned rcr2();
86extern short cpl;
87
88
89/*
90 * trap(frame):
91 *	Exception, fault, and trap interface to BSD kernel. This
92 * common code is called from assembly language IDT gate entry
93 * routines that prepare a suitable stack frame, and restore this
94 * frame after the exception has been processed. Note that the
95 * effect is as if the arguments were passed call by reference.
96 */
97
98/*ARGSUSED*/
99trap(frame)
100	struct trapframe frame;
101{
102	register int i;
103	register struct proc *p = curproc;
104	struct timeval syst;
105	int ucode, type, code, eva;
106
107	frame.tf_eflags &= ~PSL_NT;	/* clear nested trap XXX */
108	type = frame.tf_trapno;
109#include "ddb.h"
110#if NDDB > 0
111	if (curpcb && curpcb->pcb_onfault) {
112		if (frame.tf_trapno == T_BPTFLT
113		    || frame.tf_trapno == T_TRCTRAP)
114			if (kdb_trap (type, 0, &frame))
115				return;
116	}
117#endif
118
119/*pg("trap type %d code = %x eip = %x cs = %x eva = %x esp %x",
120			frame.tf_trapno, frame.tf_err, frame.tf_eip,
121			frame.tf_cs, rcr2(), frame.tf_esp);*/
122if(curpcb == 0 || curproc == 0) goto we_re_toast;
123	if (curpcb->pcb_onfault && frame.tf_trapno != T_PAGEFLT) {
124		extern int _udatasel;
125
126		if (read_gs() != (u_short) _udatasel)
127			/*
128			 * Some user has corrupted %gs but we depend on it in
129			 * copyout() etc.  Fix it up and retry.
130			 *
131			 * (We don't preserve %fs or %gs, so users can change
132			 * them to either _ucodesel, _udatasel or a not-present
133			 * selector, possibly ORed with 0 to 3, making them
134			 * volatile for other users.  Not preserving them saves
135			 * time and doesn't lose functionality or open security
136			 * holes.)
137			 */
138			write_gs(_udatasel);
139		else
140copyfault:
141			frame.tf_eip = (int)curpcb->pcb_onfault;
142		return;
143	}
144
145	syst = p->p_stime;
146	if (ISPL(frame.tf_cs) == SEL_UPL) {
147		type |= T_USER;
148		p->p_regs = (int *)&frame;
149		curpcb->pcb_flags |= FM_TRAP;	/* used by sendsig */
150	}
151
152	ucode=0;
153	eva = rcr2();
154	code = frame.tf_err;
155	switch (type) {
156
157	default:
158	we_re_toast:
159#ifdef KDB
160		if (kdb_trap(&psl))
161			return;
162#endif
163#if NDDB > 0
164		if (kdb_trap (type, 0, &frame))
165			return;
166#endif
167
168		printf("trap type %d code = %x eip = %x cs = %x eflags = %x ",
169			frame.tf_trapno, frame.tf_err, frame.tf_eip,
170			frame.tf_cs, frame.tf_eflags);
171	eva = rcr2();
172		printf("cr2 %x cpl %x\n", eva, cpl);
173		/* type &= ~T_USER; */ /* XXX what the hell is this */
174		panic("trap");
175		/*NOTREACHED*/
176
177	case T_SEGNPFLT|T_USER:
178	case T_STKFLT|T_USER:
179	case T_PROTFLT|T_USER:		/* protection fault */
180		ucode = code + BUS_SEGM_FAULT ;
181		i = SIGBUS;
182		break;
183
184	case T_PRIVINFLT|T_USER:	/* privileged instruction fault */
185	case T_RESADFLT|T_USER:		/* reserved addressing fault */
186	case T_RESOPFLT|T_USER:		/* reserved operand fault */
187	case T_FPOPFLT|T_USER:		/* coprocessor operand fault */
188		ucode = type &~ T_USER;
189		i = SIGILL;
190		break;
191
192	case T_ASTFLT|T_USER:		/* Allow process switch */
193		astoff();
194		cnt.v_soft++;
195		if ((p->p_flag & SOWEUPC) && p->p_stats->p_prof.pr_scale) {
196			addupc(frame.tf_eip, &p->p_stats->p_prof, 1);
197			p->p_flag &= ~SOWEUPC;
198		}
199		goto out;
200
201	case T_DNA|T_USER:
202#if NNPX > 0
203		/* if a transparent fault (due to context switch "late") */
204		if (npxdna()) return;
205#endif	/* NNPX > 0 */
206#ifdef	MATH_EMULATE
207		i = math_emulate(&frame);
208		if (i == 0) return;
209#else	/* MATH_EMULTATE */
210		panic("trap: math emulation necessary!");
211#endif	/* MATH_EMULTATE */
212		ucode = FPE_FPU_NP_TRAP;
213		break;
214
215	case T_BOUND|T_USER:
216		ucode = FPE_SUBRNG_TRAP;
217		i = SIGFPE;
218		break;
219
220	case T_OFLOW|T_USER:
221		ucode = FPE_INTOVF_TRAP;
222		i = SIGFPE;
223		break;
224
225	case T_DIVIDE|T_USER:
226		ucode = FPE_INTDIV_TRAP;
227		i = SIGFPE;
228		break;
229
230	case T_ARITHTRAP|T_USER:
231		ucode = code;
232		i = SIGFPE;
233		break;
234
235	case T_PAGEFLT:			/* allow page faults in kernel mode */
236#if 0
237		/* XXX - check only applies to 386's and 486's with WP off */
238		if (code & PGEX_P) goto we_re_toast;
239#endif
240
241		/* fall into */
242	case T_PAGEFLT|T_USER:		/* page fault */
243	    {
244		register vm_offset_t va;
245		register struct vmspace *vm = p->p_vmspace;
246		register vm_map_t map;
247		int rv;
248		vm_prot_t ftype;
249		extern vm_map_t kernel_map;
250		unsigned nss,v;
251
252		va = trunc_page((vm_offset_t)eva);
253		/*
254		 * It is only a kernel address space fault iff:
255		 * 	1. (type & T_USER) == 0  and
256		 * 	2. pcb_onfault not set or
257		 *	3. pcb_onfault set but supervisor space fault
258		 * The last can occur during an exec() copyin where the
259		 * argument space is lazy-allocated.
260		 */
261		if (type == T_PAGEFLT && va >= KERNBASE)
262			map = kernel_map;
263		else
264			map = &vm->vm_map;
265		if (code & PGEX_W)
266			ftype = VM_PROT_READ | VM_PROT_WRITE;
267		else
268			ftype = VM_PROT_READ;
269
270#ifdef DEBUG
271		if (map == kernel_map && va == 0) {
272			printf("trap: bad kernel access at %x\n", va);
273			goto we_re_toast;
274		}
275#endif
276
277		/*
278		 * XXX: rude hack to make stack limits "work"
279		 */
280		nss = 0;
281		if ((caddr_t)va >= vm->vm_maxsaddr
282			&& (caddr_t)va < (caddr_t)VM_MAXUSER_ADDRESS
283			&& map != kernel_map
284			&& dostacklimits) {
285			nss = clrnd(btoc((unsigned)vm->vm_maxsaddr
286				+ MAXSSIZ - (unsigned)va));
287			if (nss > btoc(p->p_rlimit[RLIMIT_STACK].rlim_cur)) {
288/*pg("trap rlimit %d, maxsaddr %x va %x ", nss, vm->vm_maxsaddr, va);*/
289				rv = KERN_FAILURE;
290				goto nogo;
291			}
292		}
293
294		/* check if page table is mapped, if not, fault it first */
295#define pde_v(v) (PTD[((v)>>PD_SHIFT)&1023].pd_v)
296		if (!pde_v(va)) {
297			v = trunc_page(vtopte(va));
298			rv = vm_fault(map, v, ftype, FALSE);
299			if (rv != KERN_SUCCESS) goto nogo;
300			/* check if page table fault, increment wiring */
301			vm_map_pageable(map, v, round_page(v+1), FALSE);
302		} else v=0;
303		rv = vm_fault(map, va, ftype, FALSE);
304		if (rv == KERN_SUCCESS) {
305			/*
306			 * XXX: continuation of rude stack hack
307			 */
308			if (nss > vm->vm_ssize)
309				vm->vm_ssize = nss;
310			va = trunc_page(vtopte(va));
311			/* for page table, increment wiring
312			   as long as not a page table fault as well */
313			if (!v && type != T_PAGEFLT)
314			  vm_map_pageable(map, va, round_page(va+1), FALSE);
315			if (type == T_PAGEFLT)
316				return;
317			goto out;
318		}
319nogo:
320		if (type == T_PAGEFLT) {
321			if (curpcb->pcb_onfault)
322				goto copyfault;
323			printf("vm_fault(%x, %x, %x, 0) -> %x\n",
324			       map, va, ftype, rv);
325			printf("  type %x, code %x\n",
326			       type, code);
327			goto we_re_toast;
328		}
329		i = (rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV;
330		break;
331	    }
332
333#if NDDB == 0
334	case T_TRCTRAP:	 /* trace trap -- someone single stepping lcall's */
335		frame.tf_eflags &= ~PSL_T;
336
337			/* Q: how do we turn it on again? */
338		return;
339#endif
340
341	case T_BPTFLT|T_USER:		/* bpt instruction fault */
342	case T_TRCTRAP|T_USER:		/* trace trap */
343		frame.tf_eflags &= ~PSL_T;
344		i = SIGTRAP;
345		break;
346
347#include "isa.h"
348#if	NISA > 0
349	case T_NMI:
350	case T_NMI|T_USER:
351#if NDDB > 0
352		/* NMI can be hooked up to a pushbutton for debugging */
353		printf ("NMI ... going to debugger\n");
354		if (kdb_trap (type, 0, &frame))
355			return;
356#endif
357		/* machine/parity/power fail/"kitchen sink" faults */
358		if(isa_nmi(code) == 0) return;
359		else goto we_re_toast;
360#endif
361	}
362
363	trapsignal(p, i, ucode);
364	if ((type & T_USER) == 0)
365		return;
366out:
367	while (i = CURSIG(p))
368		psig(i);
369	p->p_pri = p->p_usrpri;
370	if (want_resched) {
371		int s;
372		/*
373		 * Since we are curproc, clock will normally just change
374		 * our priority without moving us from one queue to another
375		 * (since the running process is not on a queue.)
376		 * If that happened after we setrq ourselves but before we
377		 * swtch()'ed, we might not be on the queue indicated by
378		 * our priority.
379		 */
380		s = splclock();
381		setrq(p);
382		p->p_stats->p_ru.ru_nivcsw++;
383		swtch();
384		splx(s);
385		while (i = CURSIG(p))
386			psig(i);
387	}
388	if (p->p_stats->p_prof.pr_scale) {
389		int ticks;
390		struct timeval *tv = &p->p_stime;
391
392		ticks = ((tv->tv_sec - syst.tv_sec) * 1000 +
393			(tv->tv_usec - syst.tv_usec) / 1000) / (tick / 1000);
394		if (ticks) {
395#ifdef PROFTIMER
396			extern int profscale;
397			addupc(frame.tf_eip, &p->p_stats->p_prof,
398			    ticks * profscale);
399#else
400			addupc(frame.tf_eip, &p->p_stats->p_prof, ticks);
401#endif
402		}
403	}
404	curpri = p->p_pri;
405	curpcb->pcb_flags &= ~FM_TRAP;	/* used by sendsig */
406}
407
408/*
409 * Compensate for 386 brain damage (missing URKR).
410 * This is a little simpler than the pagefault handler in trap() because
411 * it the page tables have already been faulted in and high addresses
412 * are thrown out early for other reasons.
413 */
414int trapwrite(addr)
415	unsigned addr;
416{
417	unsigned nss;
418	struct proc *p;
419	vm_offset_t va;
420	struct vmspace *vm;
421
422	va = trunc_page((vm_offset_t)addr);
423	/*
424	 * XXX - MAX is END.  Changed > to >= for temp. fix.
425	 */
426	if (va >= VM_MAXUSER_ADDRESS)
427		return (1);
428	/*
429	 * XXX: rude stack hack adapted from trap().
430	 */
431	nss = 0;
432	p = curproc;
433	vm = p->p_vmspace;
434	if ((caddr_t)va >= vm->vm_maxsaddr && dostacklimits) {
435		nss = clrnd(btoc((unsigned)vm->vm_maxsaddr + MAXSSIZ
436				 - (unsigned)va));
437		if (nss > btoc(p->p_rlimit[RLIMIT_STACK].rlim_cur))
438			return (1);
439	}
440
441	if (vm_fault(&vm->vm_map, va, VM_PROT_READ | VM_PROT_WRITE, FALSE)
442	    != KERN_SUCCESS)
443		return (1);
444
445	/*
446	 * XXX: continuation of rude stack hack
447	 */
448	if (nss > vm->vm_ssize)
449		vm->vm_ssize = nss;
450
451	return (0);
452}
453
454/*
455 * syscall(frame):
456 *	System call request from POSIX system call gate interface to kernel.
457 * Like trap(), argument is call by reference.
458 */
459/*ARGSUSED*/
460syscall(frame)
461	volatile struct syscframe frame;
462{
463	register int *locr0 = ((int *)&frame);
464	register caddr_t params;
465	register int i;
466	register struct sysent *callp;
467	register struct proc *p = curproc;
468	struct timeval syst;
469	int error, opc;
470	int args[8], rval[2];
471	int code;
472
473#ifdef lint
474	r0 = 0; r0 = r0; r1 = 0; r1 = r1;
475#endif
476	syst = p->p_stime;
477	if (ISPL(frame.sf_cs) != SEL_UPL)
478		panic("syscall");
479
480	code = frame.sf_eax;
481	curpcb->pcb_flags &= ~FM_TRAP;	/* used by sendsig */
482	p->p_regs = (int *)&frame;
483	params = (caddr_t)frame.sf_esp + sizeof (int) ;
484
485	/*
486	 * Reconstruct pc, assuming lcall $X,y is 7 bytes, as it is always.
487	 */
488	opc = frame.sf_eip - 7;
489	callp = (code >= nsysent) ? &sysent[63] : &sysent[code];
490	if (callp == sysent) {
491		i = fuword(params);
492		params += sizeof (int);
493		callp = (code >= nsysent) ? &sysent[63] : &sysent[code];
494	}
495
496	if ((i = callp->sy_narg * sizeof (int)) &&
497	    (error = copyin(params, (caddr_t)args, (u_int)i))) {
498		frame.sf_eax = error;
499		frame.sf_eflags |= PSL_C;	/* carry bit */
500#ifdef KTRACE
501		if (KTRPOINT(p, KTR_SYSCALL))
502			ktrsyscall(p->p_tracep, code, callp->sy_narg, &args);
503#endif
504		goto done;
505	}
506#ifdef KTRACE
507	if (KTRPOINT(p, KTR_SYSCALL))
508		ktrsyscall(p->p_tracep, code, callp->sy_narg, &args);
509#endif
510	rval[0] = 0;
511	rval[1] = frame.sf_edx;
512/*pg("%d. s %d\n", p->p_pid, code);*/
513	error = (*callp->sy_call)(p, args, rval);
514	if (error == ERESTART)
515		frame.sf_eip = opc;
516	else if (error != EJUSTRETURN) {
517		if (error) {
518/*pg("error %d", error);*/
519			frame.sf_eax = error;
520			frame.sf_eflags |= PSL_C;	/* carry bit */
521		} else {
522			frame.sf_eax = rval[0];
523			frame.sf_edx = rval[1];
524			frame.sf_eflags &= ~PSL_C;	/* carry bit */
525		}
526	}
527	/* else if (error == EJUSTRETURN) */
528		/* nothing to do */
529done:
530	/*
531	 * Reinitialize proc pointer `p' as it may be different
532	 * if this is a child returning from fork syscall.
533	 */
534	p = curproc;
535	while (i = CURSIG(p))
536		psig(i);
537	p->p_pri = p->p_usrpri;
538	if (want_resched) {
539		int s;
540		/*
541		 * Since we are curproc, clock will normally just change
542		 * our priority without moving us from one queue to another
543		 * (since the running process is not on a queue.)
544		 * If that happened after we setrq ourselves but before we
545		 * swtch()'ed, we might not be on the queue indicated by
546		 * our priority.
547		 */
548		s = splclock();
549		setrq(p);
550		p->p_stats->p_ru.ru_nivcsw++;
551		swtch();
552		splx(s);
553		while (i = CURSIG(p))
554			psig(i);
555	}
556	if (p->p_stats->p_prof.pr_scale) {
557		int ticks;
558		struct timeval *tv = &p->p_stime;
559
560		ticks = ((tv->tv_sec - syst.tv_sec) * 1000 +
561			(tv->tv_usec - syst.tv_usec) / 1000) / (tick / 1000);
562		if (ticks) {
563#ifdef PROFTIMER
564			extern int profscale;
565			addupc(frame.sf_eip, &p->p_stats->p_prof,
566			    ticks * profscale);
567#else
568			addupc(frame.sf_eip, &p->p_stats->p_prof, ticks);
569#endif
570		}
571	}
572	curpri = p->p_pri;
573#ifdef KTRACE
574	if (KTRPOINT(p, KTR_SYSRET))
575		ktrsysret(p->p_tracep, code, error, rval[0]);
576#endif
577#ifdef	DIAGNOSTICx
578{ extern int _udatasel, _ucodesel;
579	if (frame.sf_ss != _udatasel)
580		printf("ss %x call %d\n", frame.sf_ss, code);
581	if ((frame.sf_cs&0xffff) != _ucodesel)
582		printf("cs %x call %d\n", frame.sf_cs, code);
583	if (frame.sf_eip > VM_MAXUSER_ADDRESS) {
584		printf("eip %x call %d\n", frame.sf_eip, code);
585		frame.sf_eip = 0;
586	}
587}
588#endif
589}
590