subr_syscall.c revision 827
1/*- 2 * Copyright (c) 1990 The Regents of the University of California. 3 * All rights reserved. 4 * 5 * This code is derived from software contributed to Berkeley by 6 * the University of Utah, and William Jolitz. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. All advertising materials mentioning features or use of this software 17 * must display the following acknowledgement: 18 * This product includes software developed by the University of 19 * California, Berkeley and its contributors. 20 * 4. Neither the name of the University nor the names of its contributors 21 * may be used to endorse or promote products derived from this software 22 * without specific prior written permission. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34 * SUCH DAMAGE. 35 * 36 * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 37 * $Id: trap.c,v 1.9 1993/11/28 09:28:54 davidg Exp $ 38 */ 39 40/* 41 * 386 Trap and System call handleing 42 */ 43 44#include "npx.h" 45#include "machine/cpu.h" 46#include "machine/psl.h" 47#include "machine/reg.h" 48 49#include "param.h" 50#include "systm.h" 51#include "proc.h" 52#include "user.h" 53#include "acct.h" 54#include "kernel.h" 55#ifdef KTRACE 56#include "ktrace.h" 57#endif 58 59#include "vm/vm_param.h" 60#include "vm/pmap.h" 61#include "vm/vm_map.h" 62#include "sys/vmmeter.h" 63 64#include "machine/trap.h" 65 66#ifdef __GNUC__ 67 68/* 69 * The "r" contraint could be "rm" except for fatal bugs in gas. As usual, 70 * we omit the size from the mov instruction to avoid nonfatal bugs in gas. 71 */ 72#define read_gs() ({ u_short gs; __asm("mov %%gs,%0" : "=r" (gs)); gs; }) 73#define write_gs(gs) __asm("mov %0,%%gs" : : "r" ((u_short) gs)) 74 75#else /* not __GNUC__ */ 76 77u_short read_gs __P((void)); 78void write_gs __P((/* promoted u_short */ int gs)); 79 80#endif /* __GNUC__ */ 81 82struct sysent sysent[]; 83int nsysent; 84int dostacklimits; 85unsigned rcr2(); 86extern short cpl; 87 88#define MAX_TRAP_MSG 27 89char *trap_msg[] = { 90 "reserved addressing fault", /* 0 T_RESADFLT */ 91 "privileged instruction fault", /* 1 T_PRIVINFLT */ 92 "reserved operand fault", /* 2 T_RESOPFLT */ 93 "breakpoint instruction fault", /* 3 T_BPTFLT */ 94 "", /* 4 unused */ 95 "system call trap", /* 5 T_SYSCALL */ 96 "arithmetic trap", /* 6 T_ARITHTRAP */ 97 "system forced exception", /* 7 T_ASTFLT */ 98 "segmentation (limit) fault", /* 8 T_SEGFLT */ 99 "protection fault", /* 9 T_PROTFLT */ 100 "trace trap", /* 10 T_TRCTRAP */ 101 "", /* 11 unused */ 102 "page fault", /* 12 T_PAGEFLT */ 103 "page table fault", /* 13 T_TABLEFLT */ 104 "alignment fault", /* 14 T_ALIGNFLT */ 105 "kernel stack pointer not valid", /* 15 T_KSPNOTVAL */ 106 "bus error", /* 16 T_BUSERR */ 107 "kernel debugger fault", /* 17 T_KDBTRAP */ 108 "integer divide fault", /* 18 T_DIVIDE */ 109 "non-maskable interrupt trap", /* 19 T_NMI */ 110 "overflow trap", /* 20 T_OFLOW */ 111 "FPU bounds check fault", /* 21 T_BOUND */ 112 "FPU device not available", /* 22 T_DNA */ 113 "double fault", /* 23 T_DOUBLEFLT */ 114 "FPU operand fetch fault", /* 24 T_FPOPFLT */ 115 "invalid TSS fault", /* 25 T_TSSFLT */ 116 "segment not present fault", /* 26 T_SEGNPFLT */ 117 "stack fault", /* 27 T_STKFLT */ 118}; 119 120 121/* 122 * trap(frame): 123 * Exception, fault, and trap interface to BSD kernel. This 124 * common code is called from assembly language IDT gate entry 125 * routines that prepare a suitable stack frame, and restore this 126 * frame after the exception has been processed. Note that the 127 * effect is as if the arguments were passed call by reference. 128 */ 129 130/*ARGSUSED*/ 131void 132trap(frame) 133 struct trapframe frame; 134{ 135 register int i; 136 register struct proc *p = curproc; 137 struct timeval syst; 138 int ucode, type, code, eva; 139 140 frame.tf_eflags &= ~PSL_NT; /* clear nested trap XXX */ 141 type = frame.tf_trapno; 142#include "ddb.h" 143#if NDDB > 0 144 if (curpcb && curpcb->pcb_onfault) { 145 if (frame.tf_trapno == T_BPTFLT 146 || frame.tf_trapno == T_TRCTRAP) 147 if (kdb_trap (type, 0, &frame)) 148 return; 149 } 150#endif 151 152/*pg("trap type %d code = %x eip = %x cs = %x eva = %x esp %x", 153 frame.tf_trapno, frame.tf_err, frame.tf_eip, 154 frame.tf_cs, rcr2(), frame.tf_esp);*/ 155if(curpcb == 0 || curproc == 0) goto we_re_toast; 156 if (curpcb->pcb_onfault && frame.tf_trapno != T_PAGEFLT) { 157 extern int _udatasel; 158 159 if (read_gs() != (u_short) _udatasel) 160 /* 161 * Some user has corrupted %gs but we depend on it in 162 * copyout() etc. Fix it up and retry. 163 * 164 * (We don't preserve %fs or %gs, so users can change 165 * them to either _ucodesel, _udatasel or a not-present 166 * selector, possibly ORed with 0 to 3, making them 167 * volatile for other users. Not preserving them saves 168 * time and doesn't lose functionality or open security 169 * holes.) 170 */ 171 write_gs(_udatasel); 172 else 173copyfault: 174 frame.tf_eip = (int)curpcb->pcb_onfault; 175 return; 176 } 177 178 syst = p->p_stime; 179 if (ISPL(frame.tf_cs) == SEL_UPL) { 180 type |= T_USER; 181 p->p_regs = (int *)&frame; 182 curpcb->pcb_flags |= FM_TRAP; /* used by sendsig */ 183 } 184 185 ucode=0; 186 eva = rcr2(); 187 code = frame.tf_err; 188 switch (type) { 189 190 default: 191 we_re_toast: 192#ifdef KDB 193 if (kdb_trap(&psl)) 194 return; 195#endif 196#if NDDB > 0 197 if (kdb_trap (type, 0, &frame)) 198 return; 199#endif 200 201 if ((type & ~T_USER) <= MAX_TRAP_MSG) 202 printf("\n\nFatal trap %d: %s while in %s mode\n", 203 type & ~T_USER, trap_msg[type & ~T_USER], 204 (type & T_USER) ? "user" : "kernel"); 205 206 printf("trap type = %d, code = %x\n eip = %x, cs = %x, eflags = %x, ", 207 frame.tf_trapno, frame.tf_err, frame.tf_eip, 208 frame.tf_cs, frame.tf_eflags); 209 eva = rcr2(); 210 printf("cr2 = %x, current priority = %x\n", eva, cpl); 211 212 type &= ~T_USER; 213 if (type <= MAX_TRAP_MSG) 214 panic(trap_msg[type]); 215 else 216 panic("unknown/reserved trap"); 217 218 /*NOTREACHED*/ 219 220 case T_SEGNPFLT|T_USER: 221 case T_STKFLT|T_USER: 222 case T_PROTFLT|T_USER: /* protection fault */ 223 ucode = code + BUS_SEGM_FAULT ; 224 i = SIGBUS; 225 break; 226 227 case T_PRIVINFLT|T_USER: /* privileged instruction fault */ 228 case T_RESADFLT|T_USER: /* reserved addressing fault */ 229 case T_RESOPFLT|T_USER: /* reserved operand fault */ 230 case T_FPOPFLT|T_USER: /* coprocessor operand fault */ 231 ucode = type &~ T_USER; 232 i = SIGILL; 233 break; 234 235 case T_ASTFLT|T_USER: /* Allow process switch */ 236 astoff(); 237 cnt.v_soft++; 238 if ((p->p_flag & SOWEUPC) && p->p_stats->p_prof.pr_scale) { 239 addupc(frame.tf_eip, &p->p_stats->p_prof, 1); 240 p->p_flag &= ~SOWEUPC; 241 } 242 goto out; 243 244 case T_DNA|T_USER: 245#if NNPX > 0 246 /* if a transparent fault (due to context switch "late") */ 247 if (npxdna()) return; 248#endif /* NNPX > 0 */ 249#ifdef MATH_EMULATE 250 i = math_emulate(&frame); 251 if (i == 0) return; 252#else /* MATH_EMULTATE */ 253 panic("trap: math emulation necessary!"); 254#endif /* MATH_EMULTATE */ 255 ucode = FPE_FPU_NP_TRAP; 256 break; 257 258 case T_BOUND|T_USER: 259 ucode = FPE_SUBRNG_TRAP; 260 i = SIGFPE; 261 break; 262 263 case T_OFLOW|T_USER: 264 ucode = FPE_INTOVF_TRAP; 265 i = SIGFPE; 266 break; 267 268 case T_DIVIDE|T_USER: 269 ucode = FPE_INTDIV_TRAP; 270 i = SIGFPE; 271 break; 272 273 case T_ARITHTRAP|T_USER: 274 ucode = code; 275 i = SIGFPE; 276 break; 277 278 case T_PAGEFLT: /* allow page faults in kernel mode */ 279#if 0 280 /* XXX - check only applies to 386's and 486's with WP off */ 281 if (code & PGEX_P) goto we_re_toast; 282#endif 283 284 /* fall into */ 285 case T_PAGEFLT|T_USER: /* page fault */ 286 { 287 register vm_offset_t va; 288 register struct vmspace *vm = p->p_vmspace; 289 register vm_map_t map; 290 int rv; 291 vm_prot_t ftype; 292 extern vm_map_t kernel_map; 293 unsigned nss,v; 294 295 va = trunc_page((vm_offset_t)eva); 296 /* 297 * It is only a kernel address space fault iff: 298 * 1. (type & T_USER) == 0 and 299 * 2. pcb_onfault not set or 300 * 3. pcb_onfault set but supervisor space fault 301 * The last can occur during an exec() copyin where the 302 * argument space is lazy-allocated. 303 */ 304 if (type == T_PAGEFLT && va >= KERNBASE) 305 map = kernel_map; 306 else 307 map = &vm->vm_map; 308 if (code & PGEX_W) 309 ftype = VM_PROT_READ | VM_PROT_WRITE; 310 else 311 ftype = VM_PROT_READ; 312 313#ifdef DEBUG 314 if (map == kernel_map && va == 0) { 315 printf("trap: bad kernel access at %x\n", va); 316 goto we_re_toast; 317 } 318#endif 319 320 /* 321 * XXX: rude hack to make stack limits "work" 322 */ 323 nss = 0; 324 if ((caddr_t)va >= vm->vm_maxsaddr 325 && (caddr_t)va < (caddr_t)USRSTACK 326 && map != kernel_map 327 && dostacklimits) { 328 nss = clrnd(btoc((unsigned)vm->vm_maxsaddr 329 + MAXSSIZ - (unsigned)va)); 330 if (nss > btoc(p->p_rlimit[RLIMIT_STACK].rlim_cur)) { 331/*pg("trap rlimit %d, maxsaddr %x va %x ", nss, vm->vm_maxsaddr, va);*/ 332 rv = KERN_FAILURE; 333 goto nogo; 334 } 335 } 336 337 /* check if page table is mapped, if not, fault it first */ 338#define pde_v(v) (PTD[((v)>>PD_SHIFT)&1023].pd_v) 339 if (!pde_v(va)) { 340 v = trunc_page(vtopte(va)); 341 rv = vm_fault(map, v, ftype, FALSE); 342 if (rv != KERN_SUCCESS) goto nogo; 343 /* check if page table fault, increment wiring */ 344 vm_map_pageable(map, v, round_page(v+1), FALSE); 345 } else v=0; 346 rv = vm_fault(map, va, ftype, FALSE); 347 if (rv == KERN_SUCCESS) { 348 /* 349 * XXX: continuation of rude stack hack 350 */ 351 if (nss > vm->vm_ssize) 352 vm->vm_ssize = nss; 353 va = trunc_page(vtopte(va)); 354 /* for page table, increment wiring 355 as long as not a page table fault as well */ 356 if (!v && type != T_PAGEFLT) 357 vm_map_pageable(map, va, round_page(va+1), FALSE); 358 if (type == T_PAGEFLT) 359 return; 360 goto out; 361 } 362nogo: 363 if (type == T_PAGEFLT) { 364 if (curpcb->pcb_onfault) 365 goto copyfault; 366 printf("vm_fault(%x, %x, %x, 0) -> %x\n", 367 map, va, ftype, rv); 368 printf(" type %x, code %x\n", 369 type, code); 370 goto we_re_toast; 371 } 372 i = (rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV; 373 ucode = type &~ T_USER; 374 frame.tf_err = eva; 375 /* kludge to pass faulting virtual address to sendsig */ 376 break; 377 } 378 379#if NDDB == 0 380 case T_TRCTRAP: /* trace trap -- someone single stepping lcall's */ 381 frame.tf_eflags &= ~PSL_T; 382 383 /* Q: how do we turn it on again? */ 384 return; 385#endif 386 387 case T_BPTFLT|T_USER: /* bpt instruction fault */ 388 case T_TRCTRAP|T_USER: /* trace trap */ 389 frame.tf_eflags &= ~PSL_T; 390 i = SIGTRAP; 391 break; 392 393#include "isa.h" 394#if NISA > 0 395 case T_NMI: 396 case T_NMI|T_USER: 397#if NDDB > 0 398 /* NMI can be hooked up to a pushbutton for debugging */ 399 printf ("NMI ... going to debugger\n"); 400 if (kdb_trap (type, 0, &frame)) 401 return; 402#endif 403 /* machine/parity/power fail/"kitchen sink" faults */ 404 if(isa_nmi(code) == 0) return; 405 else goto we_re_toast; 406#endif 407 } 408 409 trapsignal(p, i, ucode); 410 if ((type & T_USER) == 0) 411 return; 412out: 413 while (i = CURSIG(p)) 414 psig(i); 415 p->p_pri = p->p_usrpri; 416 if (want_resched) { 417 int s; 418 /* 419 * Since we are curproc, clock will normally just change 420 * our priority without moving us from one queue to another 421 * (since the running process is not on a queue.) 422 * If that happened after we setrq ourselves but before we 423 * swtch()'ed, we might not be on the queue indicated by 424 * our priority. 425 */ 426 s = splclock(); 427 setrq(p); 428 p->p_stats->p_ru.ru_nivcsw++; 429 swtch(); 430 splx(s); 431 while (i = CURSIG(p)) 432 psig(i); 433 } 434 if (p->p_stats->p_prof.pr_scale) { 435 int ticks; 436 struct timeval *tv = &p->p_stime; 437 438 ticks = ((tv->tv_sec - syst.tv_sec) * 1000 + 439 (tv->tv_usec - syst.tv_usec) / 1000) / (tick / 1000); 440 if (ticks) { 441#ifdef PROFTIMER 442 extern int profscale; 443 addupc(frame.tf_eip, &p->p_stats->p_prof, 444 ticks * profscale); 445#else 446 addupc(frame.tf_eip, &p->p_stats->p_prof, ticks); 447#endif 448 } 449 } 450 curpri = p->p_pri; 451 curpcb->pcb_flags &= ~FM_TRAP; /* used by sendsig */ 452} 453 454/* 455 * Compensate for 386 brain damage (missing URKR). 456 * This is a little simpler than the pagefault handler in trap() because 457 * it the page tables have already been faulted in and high addresses 458 * are thrown out early for other reasons. 459 */ 460int trapwrite(addr) 461 unsigned addr; 462{ 463 unsigned nss; 464 struct proc *p; 465 vm_offset_t va; 466 struct vmspace *vm; 467 468 va = trunc_page((vm_offset_t)addr); 469 /* 470 * XXX - MAX is END. Changed > to >= for temp. fix. 471 */ 472 if (va >= VM_MAXUSER_ADDRESS) 473 return (1); 474 /* 475 * XXX: rude stack hack adapted from trap(). 476 */ 477 nss = 0; 478 p = curproc; 479 vm = p->p_vmspace; 480 if ((caddr_t)va >= vm->vm_maxsaddr 481 && (caddr_t)va < (caddr_t)USRSTACK /* EWS 11/27/93 */ 482 && dostacklimits) { 483 nss = clrnd(btoc((unsigned)vm->vm_maxsaddr + MAXSSIZ 484 - (unsigned)va)); 485 if (nss > btoc(p->p_rlimit[RLIMIT_STACK].rlim_cur)) 486 return (1); 487 } 488 489 if (vm_fault(&vm->vm_map, va, VM_PROT_READ | VM_PROT_WRITE, FALSE) 490 != KERN_SUCCESS) 491 return (1); 492 493 /* 494 * XXX: continuation of rude stack hack 495 */ 496 if (nss > vm->vm_ssize) 497 vm->vm_ssize = nss; 498 499 return (0); 500} 501 502/* 503 * syscall(frame): 504 * System call request from POSIX system call gate interface to kernel. 505 * Like trap(), argument is call by reference. 506 */ 507/*ARGSUSED*/ 508void 509syscall(frame) 510 volatile struct syscframe frame; 511{ 512 register int *locr0 = ((int *)&frame); 513 register caddr_t params; 514 register int i; 515 register struct sysent *callp; 516 register struct proc *p = curproc; 517 struct timeval syst; 518 int error, opc; 519 int args[8], rval[2]; 520 int code; 521 522#ifdef lint 523 r0 = 0; r0 = r0; r1 = 0; r1 = r1; 524#endif 525 syst = p->p_stime; 526 if (ISPL(frame.sf_cs) != SEL_UPL) 527 panic("syscall"); 528 529 code = frame.sf_eax; 530 curpcb->pcb_flags &= ~FM_TRAP; /* used by sendsig */ 531 p->p_regs = (int *)&frame; 532 params = (caddr_t)frame.sf_esp + sizeof (int) ; 533 534 /* 535 * Reconstruct pc, assuming lcall $X,y is 7 bytes, as it is always. 536 */ 537 opc = frame.sf_eip - 7; 538 callp = (code >= nsysent) ? &sysent[63] : &sysent[code]; 539 if (callp == sysent) { 540 i = fuword(params); 541 params += sizeof (int); 542 callp = (code >= nsysent) ? &sysent[63] : &sysent[code]; 543 } 544 545 if ((i = callp->sy_narg * sizeof (int)) && 546 (error = copyin(params, (caddr_t)args, (u_int)i))) { 547 frame.sf_eax = error; 548 frame.sf_eflags |= PSL_C; /* carry bit */ 549#ifdef KTRACE 550 if (KTRPOINT(p, KTR_SYSCALL)) 551 ktrsyscall(p->p_tracep, code, callp->sy_narg, &args); 552#endif 553 goto done; 554 } 555#ifdef KTRACE 556 if (KTRPOINT(p, KTR_SYSCALL)) 557 ktrsyscall(p->p_tracep, code, callp->sy_narg, &args); 558#endif 559 rval[0] = 0; 560 rval[1] = frame.sf_edx; 561/*pg("%d. s %d\n", p->p_pid, code);*/ 562 error = (*callp->sy_call)(p, args, rval); 563 if (error == ERESTART) 564 frame.sf_eip = opc; 565 else if (error != EJUSTRETURN) { 566 if (error) { 567/*pg("error %d", error);*/ 568 frame.sf_eax = error; 569 frame.sf_eflags |= PSL_C; /* carry bit */ 570 } else { 571 frame.sf_eax = rval[0]; 572 frame.sf_edx = rval[1]; 573 frame.sf_eflags &= ~PSL_C; /* carry bit */ 574 } 575 } 576 /* else if (error == EJUSTRETURN) */ 577 /* nothing to do */ 578done: 579 /* 580 * Reinitialize proc pointer `p' as it may be different 581 * if this is a child returning from fork syscall. 582 */ 583 p = curproc; 584 while (i = CURSIG(p)) 585 psig(i); 586 p->p_pri = p->p_usrpri; 587 if (want_resched) { 588 int s; 589 /* 590 * Since we are curproc, clock will normally just change 591 * our priority without moving us from one queue to another 592 * (since the running process is not on a queue.) 593 * If that happened after we setrq ourselves but before we 594 * swtch()'ed, we might not be on the queue indicated by 595 * our priority. 596 */ 597 s = splclock(); 598 setrq(p); 599 p->p_stats->p_ru.ru_nivcsw++; 600 swtch(); 601 splx(s); 602 while (i = CURSIG(p)) 603 psig(i); 604 } 605 if (p->p_stats->p_prof.pr_scale) { 606 int ticks; 607 struct timeval *tv = &p->p_stime; 608 609 ticks = ((tv->tv_sec - syst.tv_sec) * 1000 + 610 (tv->tv_usec - syst.tv_usec) / 1000) / (tick / 1000); 611 if (ticks) { 612#ifdef PROFTIMER 613 extern int profscale; 614 addupc(frame.sf_eip, &p->p_stats->p_prof, 615 ticks * profscale); 616#else 617 addupc(frame.sf_eip, &p->p_stats->p_prof, ticks); 618#endif 619 } 620 } 621 curpri = p->p_pri; 622#ifdef KTRACE 623 if (KTRPOINT(p, KTR_SYSRET)) 624 ktrsysret(p->p_tracep, code, error, rval[0]); 625#endif 626#ifdef DIAGNOSTICx 627{ extern int _udatasel, _ucodesel; 628 if (frame.sf_ss != _udatasel) 629 printf("ss %x call %d\n", frame.sf_ss, code); 630 if ((frame.sf_cs&0xffff) != _ucodesel) 631 printf("cs %x call %d\n", frame.sf_cs, code); 632 if (frame.sf_eip > VM_MAXUSER_ADDRESS) { 633 printf("eip %x call %d\n", frame.sf_eip, code); 634 frame.sf_eip = 0; 635 } 636} 637#endif 638} 639