subr_syscall.c revision 351
1/*- 2 * Copyright (c) 1990 The Regents of the University of California. 3 * All rights reserved. 4 * 5 * This code is derived from software contributed to Berkeley by 6 * the University of Utah, and William Jolitz. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. All advertising materials mentioning features or use of this software 17 * must display the following acknowledgement: 18 * This product includes software developed by the University of 19 * California, Berkeley and its contributors. 20 * 4. Neither the name of the University nor the names of its contributors 21 * may be used to endorse or promote products derived from this software 22 * without specific prior written permission. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34 * SUCH DAMAGE. 35 * 36 * @(#)trap.c 7.4 (Berkeley) 5/13/91 37 * 38 * PATCHES MAGIC LEVEL PATCH THAT GOT US HERE 39 * -------------------- ----- ---------------------- 40 * CURRENT PATCH LEVEL: 1 00137 41 * -------------------- ----- ---------------------- 42 * 43 * 08 Apr 93 Bruce Evans Several VM system fixes 44 * Paul Kranenburg Add counter for vmstat 45 */ 46static char rcsid[] = "$Header: /a/cvs/386BSD/src/sys/i386/i386/trap.c,v 1.2 1993/07/27 10:52:20 davidg Exp $"; 47 48/* 49 * 386 Trap and System call handleing 50 */ 51 52#include "machine/cpu.h" 53#include "machine/psl.h" 54#include "machine/reg.h" 55 56#include "param.h" 57#include "systm.h" 58#include "proc.h" 59#include "user.h" 60#include "acct.h" 61#include "kernel.h" 62#ifdef KTRACE 63#include "ktrace.h" 64#endif 65 66#include "vm/vm_param.h" 67#include "vm/pmap.h" 68#include "vm/vm_map.h" 69#include "sys/vmmeter.h" 70 71#include "machine/trap.h" 72 73#ifdef __GNUC__ 74 75/* 76 * The "r" contraint could be "rm" except for fatal bugs in gas. As usual, 77 * we omit the size from the mov instruction to avoid nonfatal bugs in gas. 78 */ 79#define read_gs() ({ u_short gs; __asm("mov %%gs,%0" : "=r" (gs)); gs; }) 80#define write_gs(gs) __asm("mov %0,%%gs" : : "r" ((u_short) gs)) 81 82#else /* not __GNUC__ */ 83 84u_short read_gs __P((void)); 85void write_gs __P((/* promoted u_short */ int gs)); 86 87#endif /* __GNUC__ */ 88 89struct sysent sysent[]; 90int nsysent; 91int dostacklimits; 92unsigned rcr2(); 93extern short cpl; 94 95 96/* 97 * trap(frame): 98 * Exception, fault, and trap interface to BSD kernel. This 99 * common code is called from assembly language IDT gate entry 100 * routines that prepare a suitable stack frame, and restore this 101 * frame after the exception has been processed. Note that the 102 * effect is as if the arguments were passed call by reference. 103 */ 104 105/*ARGSUSED*/ 106trap(frame) 107 struct trapframe frame; 108{ 109 register int i; 110 register struct proc *p = curproc; 111 struct timeval syst; 112 int ucode, type, code, eva; 113 114 frame.tf_eflags &= ~PSL_NT; /* clear nested trap XXX */ 115 type = frame.tf_trapno; 116#include "ddb.h" 117#if NDDB > 0 118 if (curpcb && curpcb->pcb_onfault) { 119 if (frame.tf_trapno == T_BPTFLT 120 || frame.tf_trapno == T_TRCTRAP) 121 if (kdb_trap (type, 0, &frame)) 122 return; 123 } 124#endif 125 126/*pg("trap type %d code = %x eip = %x cs = %x eva = %x esp %x", 127 frame.tf_trapno, frame.tf_err, frame.tf_eip, 128 frame.tf_cs, rcr2(), frame.tf_esp);*/ 129if(curpcb == 0 || curproc == 0) goto we_re_toast; 130 if (curpcb->pcb_onfault && frame.tf_trapno != T_PAGEFLT) { 131 extern int _udatasel; 132 133 if (read_gs() != (u_short) _udatasel) 134 /* 135 * Some user has corrupted %gs but we depend on it in 136 * copyout() etc. Fix it up and retry. 137 * 138 * (We don't preserve %fs or %gs, so users can change 139 * them to either _ucodesel, _udatasel or a not-present 140 * selector, possibly ORed with 0 to 3, making them 141 * volatile for other users. Not preserving them saves 142 * time and doesn't lose functionality or open security 143 * holes.) 144 */ 145 write_gs(_udatasel); 146 else 147copyfault: 148 frame.tf_eip = (int)curpcb->pcb_onfault; 149 return; 150 } 151 152 syst = p->p_stime; 153 if (ISPL(frame.tf_cs) == SEL_UPL) { 154 type |= T_USER; 155 p->p_regs = (int *)&frame; 156 curpcb->pcb_flags |= FM_TRAP; /* used by sendsig */ 157 } 158 159 ucode=0; 160 eva = rcr2(); 161 code = frame.tf_err; 162 switch (type) { 163 164 default: 165 we_re_toast: 166#ifdef KDB 167 if (kdb_trap(&psl)) 168 return; 169#endif 170#if NDDB > 0 171 if (kdb_trap (type, 0, &frame)) 172 return; 173#endif 174 175 printf("trap type %d code = %x eip = %x cs = %x eflags = %x ", 176 frame.tf_trapno, frame.tf_err, frame.tf_eip, 177 frame.tf_cs, frame.tf_eflags); 178 eva = rcr2(); 179 printf("cr2 %x cpl %x\n", eva, cpl); 180 /* type &= ~T_USER; */ /* XXX what the hell is this */ 181 panic("trap"); 182 /*NOTREACHED*/ 183 184 case T_SEGNPFLT|T_USER: 185 case T_STKFLT|T_USER: 186 case T_PROTFLT|T_USER: /* protection fault */ 187 ucode = code + BUS_SEGM_FAULT ; 188 i = SIGBUS; 189 break; 190 191 case T_PRIVINFLT|T_USER: /* privileged instruction fault */ 192 case T_RESADFLT|T_USER: /* reserved addressing fault */ 193 case T_RESOPFLT|T_USER: /* reserved operand fault */ 194 case T_FPOPFLT|T_USER: /* coprocessor operand fault */ 195 ucode = type &~ T_USER; 196 i = SIGILL; 197 break; 198 199 case T_ASTFLT|T_USER: /* Allow process switch */ 200 astoff(); 201 cnt.v_soft++; 202 if ((p->p_flag & SOWEUPC) && p->p_stats->p_prof.pr_scale) { 203 addupc(frame.tf_eip, &p->p_stats->p_prof, 1); 204 p->p_flag &= ~SOWEUPC; 205 } 206 goto out; 207 208 case T_DNA|T_USER: 209#ifdef NPX 210 /* if a transparent fault (due to context switch "late") */ 211 if (npxdna()) return; 212#endif 213#ifdef MATH_EMULATE 214 i = math_emulate(&frame); 215 if (i == 0) return; 216#else /* MATH_EMULTATE */ 217 panic("trap: math emulation necessary!"); 218#endif /* MATH_EMULTATE */ 219 ucode = FPE_FPU_NP_TRAP; 220 break; 221 222 case T_BOUND|T_USER: 223 ucode = FPE_SUBRNG_TRAP; 224 i = SIGFPE; 225 break; 226 227 case T_OFLOW|T_USER: 228 ucode = FPE_INTOVF_TRAP; 229 i = SIGFPE; 230 break; 231 232 case T_DIVIDE|T_USER: 233 ucode = FPE_INTDIV_TRAP; 234 i = SIGFPE; 235 break; 236 237 case T_ARITHTRAP|T_USER: 238 ucode = code; 239 i = SIGFPE; 240 break; 241 242 case T_PAGEFLT: /* allow page faults in kernel mode */ 243#if 0 244 /* XXX - check only applies to 386's and 486's with WP off */ 245 if (code & PGEX_P) goto we_re_toast; 246#endif 247 248 /* fall into */ 249 case T_PAGEFLT|T_USER: /* page fault */ 250 { 251 register vm_offset_t va; 252 register struct vmspace *vm = p->p_vmspace; 253 register vm_map_t map; 254 int rv; 255 vm_prot_t ftype; 256 extern vm_map_t kernel_map; 257 unsigned nss,v; 258 259 va = trunc_page((vm_offset_t)eva); 260 /* 261 * Avoid even looking at pde_v(va) for high va's. va's 262 * above VM_MAX_KERNEL_ADDRESS don't correspond to normal 263 * PDE's (half of them correspond to APDEpde and half to 264 * an unmapped kernel PDE). va's betweeen 0xFEC00000 and 265 * VM_MAX_KERNEL_ADDRESS correspond to unmapped kernel PDE's 266 * (XXX - why are only 3 initialized when 6 are required to 267 * reach VM_MAX_KERNEL_ADDRESS?). Faulting in an unmapped 268 * kernel page table would give inconsistent PTD's. 269 * 270 * XXX - faulting in unmapped page tables wastes a page if 271 * va turns out to be invalid. 272 * 273 * XXX - should "kernel address space" cover the kernel page 274 * tables? Might have same problem with PDEpde as with 275 * APDEpde (or there may be no problem with APDEpde). 276 */ 277 if (va > 0xFEBFF000) { 278 rv = KERN_FAILURE; /* becomes SIGBUS */ 279 goto nogo; 280 } 281 /* 282 * It is only a kernel address space fault iff: 283 * 1. (type & T_USER) == 0 and 284 * 2. pcb_onfault not set or 285 * 3. pcb_onfault set but supervisor space fault 286 * The last can occur during an exec() copyin where the 287 * argument space is lazy-allocated. 288 */ 289 if (type == T_PAGEFLT && va >= KERNBASE) 290 map = kernel_map; 291 else 292 map = &vm->vm_map; 293 if (code & PGEX_W) 294 ftype = VM_PROT_READ | VM_PROT_WRITE; 295 else 296 ftype = VM_PROT_READ; 297 298#ifdef DEBUG 299 if (map == kernel_map && va == 0) { 300 printf("trap: bad kernel access at %x\n", va); 301 goto we_re_toast; 302 } 303#endif 304 305 /* 306 * XXX: rude hack to make stack limits "work" 307 */ 308 nss = 0; 309 if ((caddr_t)va >= vm->vm_maxsaddr && map != kernel_map 310 && dostacklimits) { 311 nss = clrnd(btoc((unsigned)vm->vm_maxsaddr 312 + MAXSSIZ - (unsigned)va)); 313 if (nss > btoc(p->p_rlimit[RLIMIT_STACK].rlim_cur)) { 314/*pg("trap rlimit %d, maxsaddr %x va %x ", nss, vm->vm_maxsaddr, va);*/ 315 rv = KERN_FAILURE; 316 goto nogo; 317 } 318 } 319 320 /* check if page table is mapped, if not, fault it first */ 321#define pde_v(v) (PTD[((v)>>PD_SHIFT)&1023].pd_v) 322 if (!pde_v(va)) { 323 v = trunc_page(vtopte(va)); 324 rv = vm_fault(map, v, ftype, FALSE); 325 if (rv != KERN_SUCCESS) goto nogo; 326 /* check if page table fault, increment wiring */ 327 vm_map_pageable(map, v, round_page(v+1), FALSE); 328 } else v=0; 329 rv = vm_fault(map, va, ftype, FALSE); 330 if (rv == KERN_SUCCESS) { 331 /* 332 * XXX: continuation of rude stack hack 333 */ 334 if (nss > vm->vm_ssize) 335 vm->vm_ssize = nss; 336 va = trunc_page(vtopte(va)); 337 /* for page table, increment wiring 338 as long as not a page table fault as well */ 339 if (!v && type != T_PAGEFLT) 340 vm_map_pageable(map, va, round_page(va+1), FALSE); 341 if (type == T_PAGEFLT) 342 return; 343 goto out; 344 } 345nogo: 346 if (type == T_PAGEFLT) { 347 if (curpcb->pcb_onfault) 348 goto copyfault; 349 printf("vm_fault(%x, %x, %x, 0) -> %x\n", 350 map, va, ftype, rv); 351 printf(" type %x, code %x\n", 352 type, code); 353 goto we_re_toast; 354 } 355 i = (rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV; 356 break; 357 } 358 359#if NDDB == 0 360 case T_TRCTRAP: /* trace trap -- someone single stepping lcall's */ 361 frame.tf_eflags &= ~PSL_T; 362 363 /* Q: how do we turn it on again? */ 364 return; 365#endif 366 367 case T_BPTFLT|T_USER: /* bpt instruction fault */ 368 case T_TRCTRAP|T_USER: /* trace trap */ 369 frame.tf_eflags &= ~PSL_T; 370 i = SIGTRAP; 371 break; 372 373#include "isa.h" 374#if NISA > 0 375 case T_NMI: 376 case T_NMI|T_USER: 377#if NDDB > 0 378 /* NMI can be hooked up to a pushbutton for debugging */ 379 printf ("NMI ... going to debugger\n"); 380 if (kdb_trap (type, 0, &frame)) 381 return; 382#endif 383 /* machine/parity/power fail/"kitchen sink" faults */ 384 if(isa_nmi(code) == 0) return; 385 else goto we_re_toast; 386#endif 387 } 388 389 trapsignal(p, i, ucode); 390 if ((type & T_USER) == 0) 391 return; 392out: 393 while (i = CURSIG(p)) 394 psig(i); 395 p->p_pri = p->p_usrpri; 396 if (want_resched) { 397 /* 398 * Since we are curproc, clock will normally just change 399 * our priority without moving us from one queue to another 400 * (since the running process is not on a queue.) 401 * If that happened after we setrq ourselves but before we 402 * swtch()'ed, we might not be on the queue indicated by 403 * our priority. 404 */ 405 (void) splclock(); 406 setrq(p); 407 p->p_stats->p_ru.ru_nivcsw++; 408 swtch(); 409 (void) splnone(); 410 while (i = CURSIG(p)) 411 psig(i); 412 } 413 if (p->p_stats->p_prof.pr_scale) { 414 int ticks; 415 struct timeval *tv = &p->p_stime; 416 417 ticks = ((tv->tv_sec - syst.tv_sec) * 1000 + 418 (tv->tv_usec - syst.tv_usec) / 1000) / (tick / 1000); 419 if (ticks) { 420#ifdef PROFTIMER 421 extern int profscale; 422 addupc(frame.tf_eip, &p->p_stats->p_prof, 423 ticks * profscale); 424#else 425 addupc(frame.tf_eip, &p->p_stats->p_prof, ticks); 426#endif 427 } 428 } 429 curpri = p->p_pri; 430 curpcb->pcb_flags &= ~FM_TRAP; /* used by sendsig */ 431} 432 433/* 434 * Compensate for 386 brain damage (missing URKR). 435 * This is a little simpler than the pagefault handler in trap() because 436 * it the page tables have already been faulted in and high addresses 437 * are thrown out early for other reasons. 438 */ 439int trapwrite(addr) 440 unsigned addr; 441{ 442 unsigned nss; 443 struct proc *p; 444 vm_offset_t va; 445 struct vmspace *vm; 446 447 va = trunc_page((vm_offset_t)addr); 448 /* 449 * XXX - MAX is END. Changed > to >= for temp. fix. 450 */ 451 if (va >= VM_MAXUSER_ADDRESS) 452 return (1); 453 /* 454 * XXX: rude stack hack adapted from trap(). 455 */ 456 nss = 0; 457 p = curproc; 458 vm = p->p_vmspace; 459 if ((caddr_t)va >= vm->vm_maxsaddr && dostacklimits) { 460 nss = clrnd(btoc((unsigned)vm->vm_maxsaddr + MAXSSIZ 461 - (unsigned)va)); 462 if (nss > btoc(p->p_rlimit[RLIMIT_STACK].rlim_cur)) 463 return (1); 464 } 465 466 if (vm_fault(&vm->vm_map, va, VM_PROT_READ | VM_PROT_WRITE, FALSE) 467 != KERN_SUCCESS) 468 return (1); 469 470 /* 471 * XXX: continuation of rude stack hack 472 */ 473 if (nss > vm->vm_ssize) 474 vm->vm_ssize = nss; 475 476 return (0); 477} 478 479/* 480 * syscall(frame): 481 * System call request from POSIX system call gate interface to kernel. 482 * Like trap(), argument is call by reference. 483 */ 484/*ARGSUSED*/ 485syscall(frame) 486 volatile struct syscframe frame; 487{ 488 register int *locr0 = ((int *)&frame); 489 register caddr_t params; 490 register int i; 491 register struct sysent *callp; 492 register struct proc *p = curproc; 493 struct timeval syst; 494 int error, opc; 495 int args[8], rval[2]; 496 int code; 497 498#ifdef lint 499 r0 = 0; r0 = r0; r1 = 0; r1 = r1; 500#endif 501 syst = p->p_stime; 502 if (ISPL(frame.sf_cs) != SEL_UPL) 503 panic("syscall"); 504 505 code = frame.sf_eax; 506 curpcb->pcb_flags &= ~FM_TRAP; /* used by sendsig */ 507 p->p_regs = (int *)&frame; 508 params = (caddr_t)frame.sf_esp + sizeof (int) ; 509 510 /* 511 * Reconstruct pc, assuming lcall $X,y is 7 bytes, as it is always. 512 */ 513 opc = frame.sf_eip - 7; 514 callp = (code >= nsysent) ? &sysent[63] : &sysent[code]; 515 if (callp == sysent) { 516 i = fuword(params); 517 params += sizeof (int); 518 callp = (code >= nsysent) ? &sysent[63] : &sysent[code]; 519 } 520 521 if ((i = callp->sy_narg * sizeof (int)) && 522 (error = copyin(params, (caddr_t)args, (u_int)i))) { 523 frame.sf_eax = error; 524 frame.sf_eflags |= PSL_C; /* carry bit */ 525#ifdef KTRACE 526 if (KTRPOINT(p, KTR_SYSCALL)) 527 ktrsyscall(p->p_tracep, code, callp->sy_narg, &args); 528#endif 529 goto done; 530 } 531#ifdef KTRACE 532 if (KTRPOINT(p, KTR_SYSCALL)) 533 ktrsyscall(p->p_tracep, code, callp->sy_narg, &args); 534#endif 535 rval[0] = 0; 536 rval[1] = frame.sf_edx; 537/*pg("%d. s %d\n", p->p_pid, code);*/ 538 error = (*callp->sy_call)(p, args, rval); 539 if (error == ERESTART) 540 frame.sf_eip = opc; 541 else if (error != EJUSTRETURN) { 542 if (error) { 543/*pg("error %d", error);*/ 544 frame.sf_eax = error; 545 frame.sf_eflags |= PSL_C; /* carry bit */ 546 } else { 547 frame.sf_eax = rval[0]; 548 frame.sf_edx = rval[1]; 549 frame.sf_eflags &= ~PSL_C; /* carry bit */ 550 } 551 } 552 /* else if (error == EJUSTRETURN) */ 553 /* nothing to do */ 554done: 555 /* 556 * Reinitialize proc pointer `p' as it may be different 557 * if this is a child returning from fork syscall. 558 */ 559 p = curproc; 560 while (i = CURSIG(p)) 561 psig(i); 562 p->p_pri = p->p_usrpri; 563 if (want_resched) { 564 /* 565 * Since we are curproc, clock will normally just change 566 * our priority without moving us from one queue to another 567 * (since the running process is not on a queue.) 568 * If that happened after we setrq ourselves but before we 569 * swtch()'ed, we might not be on the queue indicated by 570 * our priority. 571 */ 572 (void) splclock(); 573 setrq(p); 574 p->p_stats->p_ru.ru_nivcsw++; 575 swtch(); 576 (void) splnone(); 577 while (i = CURSIG(p)) 578 psig(i); 579 } 580 if (p->p_stats->p_prof.pr_scale) { 581 int ticks; 582 struct timeval *tv = &p->p_stime; 583 584 ticks = ((tv->tv_sec - syst.tv_sec) * 1000 + 585 (tv->tv_usec - syst.tv_usec) / 1000) / (tick / 1000); 586 if (ticks) { 587#ifdef PROFTIMER 588 extern int profscale; 589 addupc(frame.sf_eip, &p->p_stats->p_prof, 590 ticks * profscale); 591#else 592 addupc(frame.sf_eip, &p->p_stats->p_prof, ticks); 593#endif 594 } 595 } 596 curpri = p->p_pri; 597#ifdef KTRACE 598 if (KTRPOINT(p, KTR_SYSRET)) 599 ktrsysret(p->p_tracep, code, error, rval[0]); 600#endif 601#ifdef DIAGNOSTICx 602{ extern int _udatasel, _ucodesel; 603 if (frame.sf_ss != _udatasel) 604 printf("ss %x call %d\n", frame.sf_ss, code); 605 if ((frame.sf_cs&0xffff) != _ucodesel) 606 printf("cs %x call %d\n", frame.sf_cs, code); 607 if (frame.sf_eip > VM_MAXUSER_ADDRESS) { 608 printf("eip %x call %d\n", frame.sf_eip, code); 609 frame.sf_eip = 0; 610 } 611} 612#endif 613} 614