subr_syscall.c revision 1072
1/*-
2 * Copyright (c) 1990 The Regents of the University of California.
3 * All rights reserved.
4 *
5 * This code is derived from software contributed to Berkeley by
6 * the University of Utah, and William Jolitz.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 * 3. All advertising materials mentioning features or use of this software
17 *    must display the following acknowledgement:
18 *	This product includes software developed by the University of
19 *	California, Berkeley and its contributors.
20 * 4. Neither the name of the University nor the names of its contributors
21 *    may be used to endorse or promote products derived from this software
22 *    without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 *
36 *	from: @(#)trap.c	7.4 (Berkeley) 5/13/91
37 *	$Id: trap.c,v 1.15 1994/01/17 09:32:32 davidg Exp $
38 */
39
/*
 * 386 Trap and System call handling
 */
43
44#include "isa.h"
45#include "npx.h"
46#include "ddb.h"
47#include "machine/cpu.h"
48#include "machine/psl.h"
49#include "machine/reg.h"
50#include "machine/eflags.h"
51
52#include "param.h"
53#include "systm.h"
54#include "proc.h"
55#include "user.h"
56#include "acct.h"
57#include "kernel.h"
58#ifdef KTRACE
59#include "ktrace.h"
60#endif
61
62#include "vm/vm_param.h"
63#include "vm/pmap.h"
64#include "vm/vm_map.h"
65#include "vm/vm_user.h"
66#include "vm/vm_page.h"
67#include "sys/vmmeter.h"
68
69#include "machine/trap.h"
70
#ifdef	__GNUC__

/*
 * Accessors for the %gs segment register.
 *
 * read_gs() evaluates to the current %gs selector as a u_short.
 * write_gs(newgs) loads %gs with newgs truncated to u_short.
 *
 * The "r" constraint could be "rm" except for fatal bugs in gas.  As usual,
 * we omit the size from the mov instruction to avoid nonfatal bugs in gas.
 *
 * The macro argument is parenthesized before the cast so that an expression
 * argument (e.g. write_gs(sel | 3)) is converted as a whole rather than
 * having the cast bind to its first operand only.
 */
#define	read_gs()	({ u_short gs; __asm("mov %%gs,%0" : "=r" (gs)); gs; })
#define	write_gs(newgs)	__asm("mov %0,%%gs" : : "r" ((u_short) (newgs)))

#else	/* not __GNUC__ */

/* Out-of-line equivalents, defined elsewhere, for compilers without GNU asm. */
u_short	read_gs		__P((void));
void	write_gs	__P((/* promoted u_short */ int gs));

#endif	/* __GNUC__ */
86
87struct	sysent sysent[];
88int	nsysent;
89extern short cpl;
90extern short netmask, ttymask, biomask;
91
/*
 * Printable descriptions of the trap types, indexed by trap number with the
 * T_USER bit removed.  MAX_TRAP_MSG is the highest valid index; callers must
 * range-check against it before indexing.  Unused trap numbers map to "".
 */
#define MAX_TRAP_MSG		27
char *trap_msg[] = {
	/*  0: T_RESADFLT  */	"reserved addressing fault",
	/*  1: T_PRIVINFLT */	"privileged instruction fault",
	/*  2: T_RESOPFLT  */	"reserved operand fault",
	/*  3: T_BPTFLT    */	"breakpoint instruction fault",
	/*  4: unused      */	"",
	/*  5: T_SYSCALL   */	"system call trap",
	/*  6: T_ARITHTRAP */	"arithmetic trap",
	/*  7: T_ASTFLT    */	"system forced exception",
	/*  8: T_SEGFLT    */	"segmentation (limit) fault",
	/*  9: T_PROTFLT   */	"protection fault",
	/* 10: T_TRCTRAP   */	"trace trap",
	/* 11: unused      */	"",
	/* 12: T_PAGEFLT   */	"page fault",
	/* 13: T_TABLEFLT  */	"page table fault",
	/* 14: T_ALIGNFLT  */	"alignment fault",
	/* 15: T_KSPNOTVAL */	"kernel stack pointer not valid",
	/* 16: T_BUSERR    */	"bus error",
	/* 17: T_KDBTRAP   */	"kernel debugger fault",
	/* 18: T_DIVIDE    */	"integer divide fault",
	/* 19: T_NMI       */	"non-maskable interrupt trap",
	/* 20: T_OFLOW     */	"overflow trap",
	/* 21: T_BOUND     */	"FPU bounds check fault",
	/* 22: T_DNA       */	"FPU device not available",
	/* 23: T_DOUBLEFLT */	"double fault",
	/* 24: T_FPOPFLT   */	"FPU operand fetch fault",
	/* 25: T_TSSFLT    */	"invalid TSS fault",
	/* 26: T_SEGNPFLT  */	"segment not present fault",
	/* 27: T_STKFLT    */	"stack fault",
};
123
/*
 * pde_v(v): the pd_v field of the PTD[] entry selected by address v
 * (v shifted right by PD_SHIFT, masked to the range 0..1023).
 * presumably pd_v is the entry's "valid" bit -- confirm against the
 * page directory entry definition.
 */
#define pde_v(v) (PTD[((v)>>PD_SHIFT)&1023].pd_v)
125
126/*
127 * trap(frame):
128 *	Exception, fault, and trap interface to BSD kernel. This
129 * common code is called from assembly language IDT gate entry
130 * routines that prepare a suitable stack frame, and restore this
131 * frame after the exception has been processed. Note that the
132 * effect is as if the arguments were passed call by reference.
133 */
134
135/*ARGSUSED*/
136void
137trap(frame)
138	struct trapframe frame;
139{
140	register int i;
141	register struct proc *p = curproc;
142	struct timeval syst;
143	int ucode, type, code, eva, fault_type;
144
145	frame.tf_eflags &= ~PSL_NT;	/* clear nested trap XXX */
146	type = frame.tf_trapno;
147#if NDDB > 0
148	if (curpcb && curpcb->pcb_onfault) {
149		if (frame.tf_trapno == T_BPTFLT
150		    || frame.tf_trapno == T_TRCTRAP)
151			if (kdb_trap (type, 0, &frame))
152				return;
153	}
154#endif
155
156	if (curpcb == 0 || curproc == 0)
157		goto skiptoswitch;
158	if (curpcb->pcb_onfault && frame.tf_trapno != T_PAGEFLT) {
159		extern int _udatasel;
160
161		if (read_gs() != (u_short) _udatasel)
162			/*
163			 * Some user has corrupted %gs but we depend on it in
164			 * copyout() etc.  Fix it up and retry.
165			 *
166			 * (We don't preserve %fs or %gs, so users can change
167			 * them to either _ucodesel, _udatasel or a not-present
168			 * selector, possibly ORed with 0 to 3, making them
169			 * volatile for other users.  Not preserving them saves
170			 * time and doesn't lose functionality or open security
171			 * holes.)
172			 */
173			write_gs(_udatasel);
174		else
175copyfault:
176			frame.tf_eip = (int)curpcb->pcb_onfault;
177		return;
178	}
179
180	syst = p->p_stime;
181	if (ISPL(frame.tf_cs) == SEL_UPL) {
182		type |= T_USER;
183		p->p_regs = (int *)&frame;
184	}
185
186skiptoswitch:
187	ucode=0;
188	eva = rcr2();
189	code = frame.tf_err;
190
191	if ((type & ~T_USER) == T_PAGEFLT)
192		goto pfault;
193
194	switch (type) {
195	case T_SEGNPFLT|T_USER:
196	case T_STKFLT|T_USER:
197	case T_PROTFLT|T_USER:		/* protection fault */
198		ucode = code + BUS_SEGM_FAULT ;
199		i = SIGBUS;
200		break;
201
202	case T_PRIVINFLT|T_USER:	/* privileged instruction fault */
203	case T_RESADFLT|T_USER:		/* reserved addressing fault */
204	case T_RESOPFLT|T_USER:		/* reserved operand fault */
205	case T_FPOPFLT|T_USER:		/* coprocessor operand fault */
206		ucode = type &~ T_USER;
207		i = SIGILL;
208		break;
209
210	case T_ASTFLT|T_USER:		/* Allow process switch */
211		astoff();
212		cnt.v_soft++;
213		if ((p->p_flag & SOWEUPC) && p->p_stats->p_prof.pr_scale) {
214			addupc(frame.tf_eip, &p->p_stats->p_prof, 1);
215			p->p_flag &= ~SOWEUPC;
216		}
217		goto out;
218
219	case T_DNA|T_USER:
220#if NNPX > 0
221		/* if a transparent fault (due to context switch "late") */
222		if (npxdna()) return;
223#endif	/* NNPX > 0 */
224#ifdef	MATH_EMULATE
225		i = math_emulate(&frame);
226		if (i == 0) return;
227#else	/* MATH_EMULTATE */
228		panic("trap: math emulation necessary!");
229#endif	/* MATH_EMULTATE */
230		ucode = FPE_FPU_NP_TRAP;
231		break;
232
233	case T_BOUND|T_USER:
234		ucode = FPE_SUBRNG_TRAP;
235		i = SIGFPE;
236		break;
237
238	case T_OFLOW|T_USER:
239		ucode = FPE_INTOVF_TRAP;
240		i = SIGFPE;
241		break;
242
243	case T_DIVIDE|T_USER:
244		ucode = FPE_INTDIV_TRAP;
245		i = SIGFPE;
246		break;
247
248	case T_ARITHTRAP|T_USER:
249		ucode = code;
250		i = SIGFPE;
251		break;
252
253	case T_PAGEFLT:			/* allow page faults in kernel mode */
254#if 0
255		/* XXX - check only applies to 386's and 486's with WP off */
256		if (code & PGEX_P) goto we_re_toast;
257#endif
258
259	pfault:
260		/* fall into */
261	case T_PAGEFLT|T_USER:		/* page fault */
262	    {
263		register vm_offset_t va;
264		register struct vmspace *vm;
265		register vm_map_t map;
266		int rv=0;
267		vm_prot_t ftype;
268		extern vm_map_t kernel_map;
269		unsigned nss,v;
270		int oldflags;
271
272		va = trunc_page((vm_offset_t)eva);
273		/*
274		 * It is only a kernel address space fault iff:
275		 * 	1. (type & T_USER) == 0  and
276		 * 	2. pcb_onfault not set or
277		 *	3. pcb_onfault set but supervisor space fault
278		 * The last can occur during an exec() copyin where the
279		 * argument space is lazy-allocated.
280		 */
281
282		if ((p == 0) || (type == T_PAGEFLT && va >= KERNBASE)) {
283			vm = 0;
284			map = kernel_map;
285		} else {
286			vm = p->p_vmspace;
287			map = &vm->vm_map;
288		}
289
290		if (code & PGEX_W)
291			ftype = VM_PROT_READ | VM_PROT_WRITE;
292		else
293			ftype = VM_PROT_READ;
294
295/*
296 * keep swapout from messing with us during this
297 * critical time.
298 */
299		oldflags = p->p_flag;
300		if (map != kernel_map) {
301				p->p_flag |= SLOCK;
302		}
303		/*
304		 * XXX: rude hack to make stack limits "work"
305		 */
306
307		nss = 0;
308		if (map != kernel_map && (caddr_t)va >= vm->vm_maxsaddr
309			&& (caddr_t)va < (caddr_t)USRSTACK) {
310			caddr_t v;
311			nss = roundup(USRSTACK - (unsigned)va, PAGE_SIZE);
312			if (nss > p->p_rlimit[RLIMIT_STACK].rlim_cur) {
313				rv = KERN_FAILURE;
314				p->p_flag &= ~SLOCK;
315				p->p_flag |= (oldflags & SLOCK);
316				goto nogo;
317			}
318
319			if (vm->vm_ssize && roundup(vm->vm_ssize << PGSHIFT,
320			    DFLSSIZ) < nss) {
321				int grow_amount;
322				/*
323				 * If necessary, grow the VM that the stack occupies
324				 * to allow for the rlimit. This allows us to not have
325				 * to allocate all of the VM up-front in execve (which
326				 * is expensive).
327				 * Grow the VM by the amount requested rounded up to
328				 * the nearest DFLSSIZ to provide for some hysteresis.
329				 */
330				grow_amount = roundup((nss - (vm->vm_ssize << PGSHIFT)), DFLSSIZ);
331				v = (char *)USRSTACK - roundup(vm->vm_ssize << PGSHIFT,
332				    DFLSSIZ) - grow_amount;
333				/*
334				 * If there isn't enough room to extend by DFLSSIZ, then
335				 * just extend to the maximum size
336				 */
337				if (v < vm->vm_maxsaddr) {
338					v = vm->vm_maxsaddr;
339					grow_amount = MAXSSIZ - (vm->vm_ssize << PGSHIFT);
340				}
341				if (vm_allocate(&vm->vm_map, (vm_offset_t *)&v,
342						grow_amount, FALSE) !=
343				    KERN_SUCCESS) {
344					p->p_flag &= ~SLOCK;
345					p->p_flag |= (oldflags & SLOCK);
346					goto nogo;
347				}
348			}
349		}
350
351
352		/* check if page table is mapped, if not, fault it first */
353#define pde_v(v) (PTD[((v)>>PD_SHIFT)&1023].pd_v)
354		{
355
356			if (map != kernel_map) {
357				vm_offset_t pa;
358				vm_offset_t v = (vm_offset_t) vtopte(va);
359
360				/* Fault the pte only if needed: */
361				*(volatile char *)v += 0;
362
363				/* Get the physical address: */
364				pa = pmap_extract(vm_map_pmap(map), v);
365
366				/* And wire the pte page at system vm level: */
367				vm_page_wire(PHYS_TO_VM_PAGE(pa));
368
369				/* Fault in the user page: */
370				rv = vm_fault(map, va, ftype, FALSE);
371
372				/* Unwire the pte page: */
373				vm_page_unwire(PHYS_TO_VM_PAGE(pa));
374
375			} else {
376				/*
377				 * Since we know that kernel virtual address addresses
378				 * always have pte pages mapped, we just have to fault
379				 * the page.
380				 */
381				rv = vm_fault(map, va, ftype, FALSE);
382			}
383
384		}
385		if (map != kernel_map) {
386			p->p_flag &= ~SLOCK;
387			p->p_flag |= (oldflags & SLOCK);
388		}
389		if (rv == KERN_SUCCESS) {
390			/*
391			 * XXX: continuation of rude stack hack
392			 */
393			nss = nss >> PGSHIFT;
394			if (vm && nss > vm->vm_ssize) {
395				vm->vm_ssize = nss;
396			}
397 			/*
398 			 * va could be a page table address, if the fault
399			 */
400			if (type == T_PAGEFLT)
401				return;
402			goto out;
403		}
404nogo:
405		if (type == T_PAGEFLT) {
406			if (curpcb->pcb_onfault)
407				goto copyfault;
408
409			goto we_re_toast;
410		}
411		i = (rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV;
412
413		/* kludge to pass faulting virtual address to sendsig */
414		ucode = type &~ T_USER;
415		frame.tf_err = eva;
416
417		break;
418	    }
419
420#if NDDB == 0
421	case T_TRCTRAP:	 /* trace trap -- someone single stepping lcall's */
422		frame.tf_eflags &= ~PSL_T;
423
424			/* Q: how do we turn it on again? */
425		return;
426#endif
427
428	case T_BPTFLT|T_USER:		/* bpt instruction fault */
429	case T_TRCTRAP|T_USER:		/* trace trap */
430		frame.tf_eflags &= ~PSL_T;
431		i = SIGTRAP;
432		break;
433
434#if NISA > 0
435	case T_NMI:
436	case T_NMI|T_USER:
437#if NDDB > 0
438		/* NMI can be hooked up to a pushbutton for debugging */
439		printf ("NMI ... going to debugger\n");
440		if (kdb_trap (type, 0, &frame))
441			return;
442#endif
443		/* machine/parity/power fail/"kitchen sink" faults */
444		if (isa_nmi(code) == 0) return;
445		/* FALL THROUGH */
446#endif
447	default:
448	we_re_toast:
449
450		fault_type = type & ~T_USER;
451		if (fault_type <= MAX_TRAP_MSG)
452			printf("\n\nFatal trap %d: %s while in %s mode\n",
453				fault_type, trap_msg[fault_type],
454				ISPL(frame.tf_cs) == SEL_UPL ? "user" : "kernel");
455		if (fault_type == T_PAGEFLT) {
456			printf("fault virtual address	= 0x%x\n", eva);
457			printf("fault code		= %s %s, %s\n",
458				code & PGEX_U ? "user" : "supervisor",
459				code & PGEX_W ? "write" : "read",
460				code & PGEX_P ? "protection violation" : "page not present");
461		}
462		printf("instruction pointer	= 0x%x\n", frame.tf_eip);
463		printf("processor eflags	= ");
464		if (frame.tf_eflags & EFL_TF)
465			printf("trace/trap, ");
466		if (frame.tf_eflags & EFL_IF)
467			printf("interrupt enabled, ");
468		if (frame.tf_eflags & EFL_NT)
469			printf("nested task, ");
470		if (frame.tf_eflags & EFL_RF)
471			printf("resume, ");
472		if (frame.tf_eflags & EFL_VM)
473			printf("vm86, ");
474		printf("IOPL = %d\n", (frame.tf_eflags & EFL_IOPL) >> 12);
475		printf("current process		= ");
476		if (curproc) {
477			printf("%d (%s)\n",
478			    curproc->p_pid, curproc->p_comm ?
479			    curproc->p_comm : "");
480		} else {
481			printf("Idle\n");
482		}
483		printf("interrupt mask		= ");
484		if ((cpl & netmask) == netmask)
485			printf("net ");
486		if ((cpl & ttymask) == ttymask)
487			printf("tty ");
488		if ((cpl & biomask) == biomask)
489			printf("bio ");
490		if (cpl == 0)
491			printf("none");
492		printf("\n");
493
494#ifdef KDB
495		if (kdb_trap(&psl))
496			return;
497#endif
498#if NDDB > 0
499		if (kdb_trap (type, 0, &frame))
500			return;
501#endif
502		if (fault_type <= MAX_TRAP_MSG)
503			panic(trap_msg[fault_type]);
504		else
505			panic("unknown/reserved trap");
506
507		/* NOT REACHED */
508	}
509
510	trapsignal(p, i, ucode);
511	if ((type & T_USER) == 0)
512		return;
513out:
514	while (i = CURSIG(p))
515		psig(i);
516	p->p_pri = p->p_usrpri;
517	if (want_resched) {
518		int s;
519		/*
520		 * Since we are curproc, clock will normally just change
521		 * our priority without moving us from one queue to another
522		 * (since the running process is not on a queue.)
523		 * If that happened after we setrq ourselves but before we
524		 * swtch()'ed, we might not be on the queue indicated by
525		 * our priority.
526		 */
527		s = splclock();
528		setrq(p);
529		p->p_stats->p_ru.ru_nivcsw++;
530		swtch();
531		splx(s);
532		while (i = CURSIG(p))
533			psig(i);
534	}
535	if (p->p_stats->p_prof.pr_scale) {
536		int ticks;
537		struct timeval *tv = &p->p_stime;
538
539		ticks = ((tv->tv_sec - syst.tv_sec) * 1000 +
540			(tv->tv_usec - syst.tv_usec) / 1000) / (tick / 1000);
541		if (ticks) {
542#ifdef PROFTIMER
543			extern int profscale;
544			addupc(frame.tf_eip, &p->p_stats->p_prof,
545			    ticks * profscale);
546#else
547			addupc(frame.tf_eip, &p->p_stats->p_prof, ticks);
548#endif
549		}
550	}
551	curpri = p->p_pri;
552}
553
554/*
555 * Compensate for 386 brain damage (missing URKR).
556 * This is a little simpler than the pagefault handler in trap() because
557 * it the page tables have already been faulted in and high addresses
558 * are thrown out early for other reasons.
559 */
560int trapwrite(addr)
561	unsigned addr;
562{
563	unsigned nss;
564	struct proc *p;
565	vm_offset_t va;
566	struct vmspace *vm;
567	int oldflags;
568	int rv;
569
570	va = trunc_page((vm_offset_t)addr);
571	/*
572	 * XXX - MAX is END.  Changed > to >= for temp. fix.
573	 */
574	if (va >= VM_MAXUSER_ADDRESS)
575		return (1);
576	/*
577	 * XXX: rude stack hack adapted from trap().
578	 */
579	nss = 0;
580	p = curproc;
581	vm = p->p_vmspace;
582
583	oldflags = p->p_flag;
584	p->p_flag |= SLOCK;
585
586	if ((caddr_t)va >= vm->vm_maxsaddr
587	    && (caddr_t)va < (caddr_t)USRSTACK) {
588		nss = roundup(((unsigned)USRSTACK - (unsigned)va), PAGE_SIZE);
589		if (nss > p->p_rlimit[RLIMIT_STACK].rlim_cur) {
590			p->p_flag &= ~SLOCK;
591			p->p_flag |= (oldflags & SLOCK);
592			return (1);
593		}
594
595		if (vm->vm_ssize && roundup(vm->vm_ssize << PGSHIFT,
596			DFLSSIZ) < nss) {
597			caddr_t v;
598			int grow_amount;
599			/*
600			 * If necessary, grow the VM that the stack occupies
601			 * to allow for the rlimit. This allows us to not have
602			 * to allocate all of the VM up-front in execve (which
603			 * is expensive).
604			 * Grow the VM by the amount requested rounded up to
605			 * the nearest DFLSSIZ to provide for some hysteresis.
606			 */
607			grow_amount = roundup((nss - (vm->vm_ssize << PGSHIFT)), DFLSSIZ);
608			v = (char *)USRSTACK - roundup(vm->vm_ssize << PGSHIFT, DFLSSIZ) -
609				grow_amount;
610			/*
611			 * If there isn't enough room to extend by DFLSSIZ, then
612			 * just extend to the maximum size
613			 */
614			if (v < vm->vm_maxsaddr) {
615				v = vm->vm_maxsaddr;
616				grow_amount = MAXSSIZ - (vm->vm_ssize << PGSHIFT);
617			}
618			if (vm_allocate(&vm->vm_map, (vm_offset_t *)&v,
619					grow_amount, FALSE)
620			    != KERN_SUCCESS) {
621				p->p_flag &= ~SLOCK;
622				p->p_flag |= (oldflags & SLOCK);
623				return(1);
624			}
625				printf("new stack growth: %lx, %d\n", v, grow_amount);
626		}
627	}
628
629
630	{
631		vm_offset_t v;
632		v = trunc_page(vtopte(va));
633		/*
634		 * wire the pte page
635		 */
636		if (va < USRSTACK) {
637			vm_map_pageable(&vm->vm_map, v, round_page(v+1), FALSE);
638		}
639		/*
640		 * fault the data page
641		 */
642		rv = vm_fault(&vm->vm_map, va, VM_PROT_READ|VM_PROT_WRITE, FALSE);
643		/*
644		 * unwire the pte page
645		 */
646		if (va < USRSTACK) {
647			vm_map_pageable(&vm->vm_map, v, round_page(v+1), TRUE);
648		}
649	}
650	p->p_flag &= ~SLOCK;
651	p->p_flag |= (oldflags & SLOCK);
652
653	if (rv != KERN_SUCCESS)
654		return 1;
655	/*
656	 * XXX: continuation of rude stack hack
657	 */
658	nss >>= PGSHIFT;
659	if (nss > vm->vm_ssize) {
660		vm->vm_ssize = nss;
661	}
662	return (0);
663}
664
665/*
666 * syscall(frame):
667 *	System call request from POSIX system call gate interface to kernel.
668 * Like trap(), argument is call by reference.
669 */
670/*ARGSUSED*/
671void
672syscall(frame)
673	volatile struct trapframe frame;
674{
675	register int *locr0 = ((int *)&frame);
676	register caddr_t params;
677	register int i;
678	register struct sysent *callp;
679	register struct proc *p = curproc;
680	struct timeval syst;
681	int error, opc;
682	int args[8], rval[2];
683	int code;
684
685#ifdef lint
686	r0 = 0; r0 = r0; r1 = 0; r1 = r1;
687#endif
688	syst = p->p_stime;
689	if (ISPL(frame.tf_cs) != SEL_UPL)
690		panic("syscall");
691
692	code = frame.tf_eax;
693	p->p_regs = (int *)&frame;
694	params = (caddr_t)frame.tf_esp + sizeof (int) ;
695
696	/*
697	 * Reconstruct pc, assuming lcall $X,y is 7 bytes, as it is always.
698	 */
699	opc = frame.tf_eip - 7;
700	if (code == 0) {
701		code = fuword(params);
702		params += sizeof (int);
703	}
704	if (code < 0 || code >= nsysent)
705		callp = &sysent[0];
706	else
707		callp = &sysent[code];
708
709	if ((i = callp->sy_narg * sizeof (int)) &&
710	    (error = copyin(params, (caddr_t)args, (u_int)i))) {
711		frame.tf_eax = error;
712		frame.tf_eflags |= PSL_C;	/* carry bit */
713#ifdef KTRACE
714		if (KTRPOINT(p, KTR_SYSCALL))
715			ktrsyscall(p->p_tracep, code, callp->sy_narg, args);
716#endif
717		goto done;
718	}
719#ifdef KTRACE
720	if (KTRPOINT(p, KTR_SYSCALL))
721		ktrsyscall(p->p_tracep, code, callp->sy_narg, args);
722#endif
723	rval[0] = 0;
724	rval[1] = frame.tf_edx;
725/*pg("%d. s %d\n", p->p_pid, code);*/
726	error = (*callp->sy_call)(p, args, rval);
727	if (error == ERESTART)
728		frame.tf_eip = opc;
729	else if (error != EJUSTRETURN) {
730		if (error) {
731/*pg("error %d", error);*/
732			frame.tf_eax = error;
733			frame.tf_eflags |= PSL_C;	/* carry bit */
734		} else {
735			frame.tf_eax = rval[0];
736			frame.tf_edx = rval[1];
737			frame.tf_eflags &= ~PSL_C;	/* carry bit */
738		}
739	}
740	/* else if (error == EJUSTRETURN) */
741		/* nothing to do */
742done:
743	/*
744	 * Reinitialize proc pointer `p' as it may be different
745	 * if this is a child returning from fork syscall.
746	 */
747	p = curproc;
748	while (i = CURSIG(p))
749		psig(i);
750	p->p_pri = p->p_usrpri;
751	if (want_resched) {
752		int s;
753		/*
754		 * Since we are curproc, clock will normally just change
755		 * our priority without moving us from one queue to another
756		 * (since the running process is not on a queue.)
757		 * If that happened after we setrq ourselves but before we
758		 * swtch()'ed, we might not be on the queue indicated by
759		 * our priority.
760		 */
761		s = splclock();
762		setrq(p);
763		p->p_stats->p_ru.ru_nivcsw++;
764		swtch();
765		splx(s);
766		while (i = CURSIG(p))
767			psig(i);
768	}
769	if (p->p_stats->p_prof.pr_scale) {
770		int ticks;
771		struct timeval *tv = &p->p_stime;
772
773		ticks = ((tv->tv_sec - syst.tv_sec) * 1000 +
774			(tv->tv_usec - syst.tv_usec) / 1000) / (tick / 1000);
775		if (ticks) {
776#ifdef PROFTIMER
777			extern int profscale;
778			addupc(frame.tf_eip, &p->p_stats->p_prof,
779			    ticks * profscale);
780#else
781			addupc(frame.tf_eip, &p->p_stats->p_prof, ticks);
782#endif
783		}
784	}
785	curpri = p->p_pri;
786#ifdef KTRACE
787	if (KTRPOINT(p, KTR_SYSRET))
788		ktrsysret(p->p_tracep, code, error, rval[0]);
789#endif
790#ifdef	DIAGNOSTICx
791{ extern int _udatasel, _ucodesel;
792	if (frame.tf_ss != _udatasel)
793		printf("ss %x call %d\n", frame.tf_ss, code);
794	if ((frame.tf_cs&0xffff) != _ucodesel)
795		printf("cs %x call %d\n", frame.tf_cs, code);
796	if (frame.tf_eip > VM_MAXUSER_ADDRESS) {
797		printf("eip %x call %d\n", frame.tf_eip, code);
798		frame.tf_eip = 0;
799	}
800}
801#endif
802}
803