subr_syscall.c revision 757
1/*- 2 * Copyright (c) 1990 The Regents of the University of California. 3 * All rights reserved. 4 * 5 * This code is derived from software contributed to Berkeley by 6 * the University of Utah, and William Jolitz. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. All advertising materials mentioning features or use of this software 17 * must display the following acknowledgement: 18 * This product includes software developed by the University of 19 * California, Berkeley and its contributors. 20 * 4. Neither the name of the University nor the names of its contributors 21 * may be used to endorse or promote products derived from this software 22 * without specific prior written permission. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34 * SUCH DAMAGE. 35 * 36 * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 37 * $Id: trap.c,v 1.6 1993/11/04 15:05:41 davidg Exp $ 38 */ 39 40/* 41 * 386 Trap and System call handleing 42 */ 43 44#include "npx.h" 45#include "machine/cpu.h" 46#include "machine/psl.h" 47#include "machine/reg.h" 48 49#include "param.h" 50#include "systm.h" 51#include "proc.h" 52#include "user.h" 53#include "acct.h" 54#include "kernel.h" 55#ifdef KTRACE 56#include "ktrace.h" 57#endif 58 59#include "vm/vm_param.h" 60#include "vm/pmap.h" 61#include "vm/vm_map.h" 62#include "sys/vmmeter.h" 63 64#include "machine/trap.h" 65 66#ifdef __GNUC__ 67 68/* 69 * The "r" contraint could be "rm" except for fatal bugs in gas. As usual, 70 * we omit the size from the mov instruction to avoid nonfatal bugs in gas. 71 */ 72#define read_gs() ({ u_short gs; __asm("mov %%gs,%0" : "=r" (gs)); gs; }) 73#define write_gs(gs) __asm("mov %0,%%gs" : : "r" ((u_short) gs)) 74 75#else /* not __GNUC__ */ 76 77u_short read_gs __P((void)); 78void write_gs __P((/* promoted u_short */ int gs)); 79 80#endif /* __GNUC__ */ 81 82struct sysent sysent[]; 83int nsysent; 84int dostacklimits; 85unsigned rcr2(); 86extern short cpl; 87 88#define MAX_TRAP_MSG 27 89char *trap_msg[] = { 90 "reserved addressing fault", /* 0 T_RESADFLT */ 91 "privileged instruction fault", /* 1 T_PRIVINFLT */ 92 "reserved operand fault", /* 2 T_RESOPFLT */ 93 "breakpoint instruction fault", /* 3 T_BPTFLT */ 94 "", /* 4 unused */ 95 "system call trap", /* 5 T_SYSCALL */ 96 "arithmetic trap", /* 6 T_ARITHTRAP */ 97 "system forced exception", /* 7 T_ASTFLT */ 98 "segmentation (limit) fault", /* 8 T_SEGFLT */ 99 "protection fault", /* 9 T_PROTFLT */ 100 "trace trap", /* 10 T_TRCTRAP */ 101 "", /* 11 unused */ 102 "page fault", /* 12 T_PAGEFLT */ 103 "page table fault", /* 13 T_TABLEFLT */ 104 "alignment fault", /* 14 T_ALIGNFLT */ 105 "kernel stack pointer not valid", /* 15 T_KSPNOTVAL */ 106 "bus error", /* 16 T_BUSERR */ 107 "kernel debugger fault", /* 17 T_KDBTRAP */ 108 "integer divide fault", /* 18 T_DIVIDE */ 109 "non-maskable interrupt trap", /* 19 T_NMI */ 110 "overflow trap", /* 20 T_OFLOW */ 111 "FPU bounds check fault", /* 21 T_BOUND */ 112 "FPU device not available", /* 22 T_DNA */ 113 "double fault", /* 23 T_DOUBLEFLT */ 114 "FPU operand fetch fault", /* 24 T_FPOPFLT */ 115 "invalid TSS fault", /* 25 T_TSSFLT */ 116 "segment not present fault", /* 26 T_SEGNPFLT */ 117 "stack fault", /* 27 T_STKFLT */ 118}; 119 120 121/* 122 * trap(frame): 123 * Exception, fault, and trap interface to BSD kernel. This 124 * common code is called from assembly language IDT gate entry 125 * routines that prepare a suitable stack frame, and restore this 126 * frame after the exception has been processed. Note that the 127 * effect is as if the arguments were passed call by reference. 128 */ 129 130/*ARGSUSED*/ 131trap(frame) 132 struct trapframe frame; 133{ 134 register int i; 135 register struct proc *p = curproc; 136 struct timeval syst; 137 int ucode, type, code, eva; 138 139 frame.tf_eflags &= ~PSL_NT; /* clear nested trap XXX */ 140 type = frame.tf_trapno; 141#include "ddb.h" 142#if NDDB > 0 143 if (curpcb && curpcb->pcb_onfault) { 144 if (frame.tf_trapno == T_BPTFLT 145 || frame.tf_trapno == T_TRCTRAP) 146 if (kdb_trap (type, 0, &frame)) 147 return; 148 } 149#endif 150 151/*pg("trap type %d code = %x eip = %x cs = %x eva = %x esp %x", 152 frame.tf_trapno, frame.tf_err, frame.tf_eip, 153 frame.tf_cs, rcr2(), frame.tf_esp);*/ 154if(curpcb == 0 || curproc == 0) goto we_re_toast; 155 if (curpcb->pcb_onfault && frame.tf_trapno != T_PAGEFLT) { 156 extern int _udatasel; 157 158 if (read_gs() != (u_short) _udatasel) 159 /* 160 * Some user has corrupted %gs but we depend on it in 161 * copyout() etc. Fix it up and retry. 162 * 163 * (We don't preserve %fs or %gs, so users can change 164 * them to either _ucodesel, _udatasel or a not-present 165 * selector, possibly ORed with 0 to 3, making them 166 * volatile for other users. Not preserving them saves 167 * time and doesn't lose functionality or open security 168 * holes.) 169 */ 170 write_gs(_udatasel); 171 else 172copyfault: 173 frame.tf_eip = (int)curpcb->pcb_onfault; 174 return; 175 } 176 177 syst = p->p_stime; 178 if (ISPL(frame.tf_cs) == SEL_UPL) { 179 type |= T_USER; 180 p->p_regs = (int *)&frame; 181 curpcb->pcb_flags |= FM_TRAP; /* used by sendsig */ 182 } 183 184 ucode=0; 185 eva = rcr2(); 186 code = frame.tf_err; 187 switch (type) { 188 189 default: 190 we_re_toast: 191#ifdef KDB 192 if (kdb_trap(&psl)) 193 return; 194#endif 195#if NDDB > 0 196 if (kdb_trap (type, 0, &frame)) 197 return; 198#endif 199 200 if ((type & ~T_USER) <= MAX_TRAP_MSG) 201 printf("\n\nFatal trap %d: %s while in %s mode\n", 202 type & ~T_USER, trap_msg[type & ~T_USER], 203 (type & T_USER) ? "user" : "kernel"); 204 205 printf("trap type = %d, code = %x\n eip = %x, cs = %x, eflags = %x, ", 206 frame.tf_trapno, frame.tf_err, frame.tf_eip, 207 frame.tf_cs, frame.tf_eflags); 208 eva = rcr2(); 209 printf("cr2 = %x, current priority = %x\n", eva, cpl); 210 211 type &= ~T_USER; 212 if (type <= MAX_TRAP_MSG) 213 panic(trap_msg[type]); 214 else 215 panic("unknown/reserved trap"); 216 217 /*NOTREACHED*/ 218 219 case T_SEGNPFLT|T_USER: 220 case T_STKFLT|T_USER: 221 case T_PROTFLT|T_USER: /* protection fault */ 222 ucode = code + BUS_SEGM_FAULT ; 223 i = SIGBUS; 224 break; 225 226 case T_PRIVINFLT|T_USER: /* privileged instruction fault */ 227 case T_RESADFLT|T_USER: /* reserved addressing fault */ 228 case T_RESOPFLT|T_USER: /* reserved operand fault */ 229 case T_FPOPFLT|T_USER: /* coprocessor operand fault */ 230 ucode = type &~ T_USER; 231 i = SIGILL; 232 break; 233 234 case T_ASTFLT|T_USER: /* Allow process switch */ 235 astoff(); 236 cnt.v_soft++; 237 if ((p->p_flag & SOWEUPC) && p->p_stats->p_prof.pr_scale) { 238 addupc(frame.tf_eip, &p->p_stats->p_prof, 1); 239 p->p_flag &= ~SOWEUPC; 240 } 241 goto out; 242 243 case T_DNA|T_USER: 244#if NNPX > 0 245 /* if a transparent fault (due to context switch "late") */ 246 if (npxdna()) return; 247#endif /* NNPX > 0 */ 248#ifdef MATH_EMULATE 249 i = math_emulate(&frame); 250 if (i == 0) return; 251#else /* MATH_EMULTATE */ 252 panic("trap: math emulation necessary!"); 253#endif /* MATH_EMULTATE */ 254 ucode = FPE_FPU_NP_TRAP; 255 break; 256 257 case T_BOUND|T_USER: 258 ucode = FPE_SUBRNG_TRAP; 259 i = SIGFPE; 260 break; 261 262 case T_OFLOW|T_USER: 263 ucode = FPE_INTOVF_TRAP; 264 i = SIGFPE; 265 break; 266 267 case T_DIVIDE|T_USER: 268 ucode = FPE_INTDIV_TRAP; 269 i = SIGFPE; 270 break; 271 272 case T_ARITHTRAP|T_USER: 273 ucode = code; 274 i = SIGFPE; 275 break; 276 277 case T_PAGEFLT: /* allow page faults in kernel mode */ 278#if 0 279 /* XXX - check only applies to 386's and 486's with WP off */ 280 if (code & PGEX_P) goto we_re_toast; 281#endif 282 283 /* fall into */ 284 case T_PAGEFLT|T_USER: /* page fault */ 285 { 286 register vm_offset_t va; 287 register struct vmspace *vm = p->p_vmspace; 288 register vm_map_t map; 289 int rv; 290 vm_prot_t ftype; 291 extern vm_map_t kernel_map; 292 unsigned nss,v; 293 294 va = trunc_page((vm_offset_t)eva); 295 /* 296 * It is only a kernel address space fault iff: 297 * 1. (type & T_USER) == 0 and 298 * 2. pcb_onfault not set or 299 * 3. pcb_onfault set but supervisor space fault 300 * The last can occur during an exec() copyin where the 301 * argument space is lazy-allocated. 302 */ 303 if (type == T_PAGEFLT && va >= KERNBASE) 304 map = kernel_map; 305 else 306 map = &vm->vm_map; 307 if (code & PGEX_W) 308 ftype = VM_PROT_READ | VM_PROT_WRITE; 309 else 310 ftype = VM_PROT_READ; 311 312#ifdef DEBUG 313 if (map == kernel_map && va == 0) { 314 printf("trap: bad kernel access at %x\n", va); 315 goto we_re_toast; 316 } 317#endif 318 319 /* 320 * XXX: rude hack to make stack limits "work" 321 */ 322 nss = 0; 323 if ((caddr_t)va >= vm->vm_maxsaddr 324 && (caddr_t)va < (caddr_t)VM_MAXUSER_ADDRESS 325 && map != kernel_map 326 && dostacklimits) { 327 nss = clrnd(btoc((unsigned)vm->vm_maxsaddr 328 + MAXSSIZ - (unsigned)va)); 329 if (nss > btoc(p->p_rlimit[RLIMIT_STACK].rlim_cur)) { 330/*pg("trap rlimit %d, maxsaddr %x va %x ", nss, vm->vm_maxsaddr, va);*/ 331 rv = KERN_FAILURE; 332 goto nogo; 333 } 334 } 335 336 /* check if page table is mapped, if not, fault it first */ 337#define pde_v(v) (PTD[((v)>>PD_SHIFT)&1023].pd_v) 338 if (!pde_v(va)) { 339 v = trunc_page(vtopte(va)); 340 rv = vm_fault(map, v, ftype, FALSE); 341 if (rv != KERN_SUCCESS) goto nogo; 342 /* check if page table fault, increment wiring */ 343 vm_map_pageable(map, v, round_page(v+1), FALSE); 344 } else v=0; 345 rv = vm_fault(map, va, ftype, FALSE); 346 if (rv == KERN_SUCCESS) { 347 /* 348 * XXX: continuation of rude stack hack 349 */ 350 if (nss > vm->vm_ssize) 351 vm->vm_ssize = nss; 352 va = trunc_page(vtopte(va)); 353 /* for page table, increment wiring 354 as long as not a page table fault as well */ 355 if (!v && type != T_PAGEFLT) 356 vm_map_pageable(map, va, round_page(va+1), FALSE); 357 if (type == T_PAGEFLT) 358 return; 359 goto out; 360 } 361nogo: 362 if (type == T_PAGEFLT) { 363 if (curpcb->pcb_onfault) 364 goto copyfault; 365 printf("vm_fault(%x, %x, %x, 0) -> %x\n", 366 map, va, ftype, rv); 367 printf(" type %x, code %x\n", 368 type, code); 369 goto we_re_toast; 370 } 371 i = (rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV; 372 break; 373 } 374 375#if NDDB == 0 376 case T_TRCTRAP: /* trace trap -- someone single stepping lcall's */ 377 frame.tf_eflags &= ~PSL_T; 378 379 /* Q: how do we turn it on again? */ 380 return; 381#endif 382 383 case T_BPTFLT|T_USER: /* bpt instruction fault */ 384 case T_TRCTRAP|T_USER: /* trace trap */ 385 frame.tf_eflags &= ~PSL_T; 386 i = SIGTRAP; 387 break; 388 389#include "isa.h" 390#if NISA > 0 391 case T_NMI: 392 case T_NMI|T_USER: 393#if NDDB > 0 394 /* NMI can be hooked up to a pushbutton for debugging */ 395 printf ("NMI ... going to debugger\n"); 396 if (kdb_trap (type, 0, &frame)) 397 return; 398#endif 399 /* machine/parity/power fail/"kitchen sink" faults */ 400 if(isa_nmi(code) == 0) return; 401 else goto we_re_toast; 402#endif 403 } 404 405 trapsignal(p, i, ucode); 406 if ((type & T_USER) == 0) 407 return; 408out: 409 while (i = CURSIG(p)) 410 psig(i); 411 p->p_pri = p->p_usrpri; 412 if (want_resched) { 413 int s; 414 /* 415 * Since we are curproc, clock will normally just change 416 * our priority without moving us from one queue to another 417 * (since the running process is not on a queue.) 418 * If that happened after we setrq ourselves but before we 419 * swtch()'ed, we might not be on the queue indicated by 420 * our priority. 421 */ 422 s = splclock(); 423 setrq(p); 424 p->p_stats->p_ru.ru_nivcsw++; 425 swtch(); 426 splx(s); 427 while (i = CURSIG(p)) 428 psig(i); 429 } 430 if (p->p_stats->p_prof.pr_scale) { 431 int ticks; 432 struct timeval *tv = &p->p_stime; 433 434 ticks = ((tv->tv_sec - syst.tv_sec) * 1000 + 435 (tv->tv_usec - syst.tv_usec) / 1000) / (tick / 1000); 436 if (ticks) { 437#ifdef PROFTIMER 438 extern int profscale; 439 addupc(frame.tf_eip, &p->p_stats->p_prof, 440 ticks * profscale); 441#else 442 addupc(frame.tf_eip, &p->p_stats->p_prof, ticks); 443#endif 444 } 445 } 446 curpri = p->p_pri; 447 curpcb->pcb_flags &= ~FM_TRAP; /* used by sendsig */ 448} 449 450/* 451 * Compensate for 386 brain damage (missing URKR). 452 * This is a little simpler than the pagefault handler in trap() because 453 * it the page tables have already been faulted in and high addresses 454 * are thrown out early for other reasons. 455 */ 456int trapwrite(addr) 457 unsigned addr; 458{ 459 unsigned nss; 460 struct proc *p; 461 vm_offset_t va; 462 struct vmspace *vm; 463 464 va = trunc_page((vm_offset_t)addr); 465 /* 466 * XXX - MAX is END. Changed > to >= for temp. fix. 467 */ 468 if (va >= VM_MAXUSER_ADDRESS) 469 return (1); 470 /* 471 * XXX: rude stack hack adapted from trap(). 472 */ 473 nss = 0; 474 p = curproc; 475 vm = p->p_vmspace; 476 if ((caddr_t)va >= vm->vm_maxsaddr && dostacklimits) { 477 nss = clrnd(btoc((unsigned)vm->vm_maxsaddr + MAXSSIZ 478 - (unsigned)va)); 479 if (nss > btoc(p->p_rlimit[RLIMIT_STACK].rlim_cur)) 480 return (1); 481 } 482 483 if (vm_fault(&vm->vm_map, va, VM_PROT_READ | VM_PROT_WRITE, FALSE) 484 != KERN_SUCCESS) 485 return (1); 486 487 /* 488 * XXX: continuation of rude stack hack 489 */ 490 if (nss > vm->vm_ssize) 491 vm->vm_ssize = nss; 492 493 return (0); 494} 495 496/* 497 * syscall(frame): 498 * System call request from POSIX system call gate interface to kernel. 499 * Like trap(), argument is call by reference. 500 */ 501/*ARGSUSED*/ 502syscall(frame) 503 volatile struct syscframe frame; 504{ 505 register int *locr0 = ((int *)&frame); 506 register caddr_t params; 507 register int i; 508 register struct sysent *callp; 509 register struct proc *p = curproc; 510 struct timeval syst; 511 int error, opc; 512 int args[8], rval[2]; 513 int code; 514 515#ifdef lint 516 r0 = 0; r0 = r0; r1 = 0; r1 = r1; 517#endif 518 syst = p->p_stime; 519 if (ISPL(frame.sf_cs) != SEL_UPL) 520 panic("syscall"); 521 522 code = frame.sf_eax; 523 curpcb->pcb_flags &= ~FM_TRAP; /* used by sendsig */ 524 p->p_regs = (int *)&frame; 525 params = (caddr_t)frame.sf_esp + sizeof (int) ; 526 527 /* 528 * Reconstruct pc, assuming lcall $X,y is 7 bytes, as it is always. 529 */ 530 opc = frame.sf_eip - 7; 531 callp = (code >= nsysent) ? &sysent[63] : &sysent[code]; 532 if (callp == sysent) { 533 i = fuword(params); 534 params += sizeof (int); 535 callp = (code >= nsysent) ? &sysent[63] : &sysent[code]; 536 } 537 538 if ((i = callp->sy_narg * sizeof (int)) && 539 (error = copyin(params, (caddr_t)args, (u_int)i))) { 540 frame.sf_eax = error; 541 frame.sf_eflags |= PSL_C; /* carry bit */ 542#ifdef KTRACE 543 if (KTRPOINT(p, KTR_SYSCALL)) 544 ktrsyscall(p->p_tracep, code, callp->sy_narg, &args); 545#endif 546 goto done; 547 } 548#ifdef KTRACE 549 if (KTRPOINT(p, KTR_SYSCALL)) 550 ktrsyscall(p->p_tracep, code, callp->sy_narg, &args); 551#endif 552 rval[0] = 0; 553 rval[1] = frame.sf_edx; 554/*pg("%d. s %d\n", p->p_pid, code);*/ 555 error = (*callp->sy_call)(p, args, rval); 556 if (error == ERESTART) 557 frame.sf_eip = opc; 558 else if (error != EJUSTRETURN) { 559 if (error) { 560/*pg("error %d", error);*/ 561 frame.sf_eax = error; 562 frame.sf_eflags |= PSL_C; /* carry bit */ 563 } else { 564 frame.sf_eax = rval[0]; 565 frame.sf_edx = rval[1]; 566 frame.sf_eflags &= ~PSL_C; /* carry bit */ 567 } 568 } 569 /* else if (error == EJUSTRETURN) */ 570 /* nothing to do */ 571done: 572 /* 573 * Reinitialize proc pointer `p' as it may be different 574 * if this is a child returning from fork syscall. 575 */ 576 p = curproc; 577 while (i = CURSIG(p)) 578 psig(i); 579 p->p_pri = p->p_usrpri; 580 if (want_resched) { 581 int s; 582 /* 583 * Since we are curproc, clock will normally just change 584 * our priority without moving us from one queue to another 585 * (since the running process is not on a queue.) 586 * If that happened after we setrq ourselves but before we 587 * swtch()'ed, we might not be on the queue indicated by 588 * our priority. 589 */ 590 s = splclock(); 591 setrq(p); 592 p->p_stats->p_ru.ru_nivcsw++; 593 swtch(); 594 splx(s); 595 while (i = CURSIG(p)) 596 psig(i); 597 } 598 if (p->p_stats->p_prof.pr_scale) { 599 int ticks; 600 struct timeval *tv = &p->p_stime; 601 602 ticks = ((tv->tv_sec - syst.tv_sec) * 1000 + 603 (tv->tv_usec - syst.tv_usec) / 1000) / (tick / 1000); 604 if (ticks) { 605#ifdef PROFTIMER 606 extern int profscale; 607 addupc(frame.sf_eip, &p->p_stats->p_prof, 608 ticks * profscale); 609#else 610 addupc(frame.sf_eip, &p->p_stats->p_prof, ticks); 611#endif 612 } 613 } 614 curpri = p->p_pri; 615#ifdef KTRACE 616 if (KTRPOINT(p, KTR_SYSRET)) 617 ktrsysret(p->p_tracep, code, error, rval[0]); 618#endif 619#ifdef DIAGNOSTICx 620{ extern int _udatasel, _ucodesel; 621 if (frame.sf_ss != _udatasel) 622 printf("ss %x call %d\n", frame.sf_ss, code); 623 if ((frame.sf_cs&0xffff) != _ucodesel) 624 printf("cs %x call %d\n", frame.sf_cs, code); 625 if (frame.sf_eip > VM_MAXUSER_ADDRESS) { 626 printf("eip %x call %d\n", frame.sf_eip, code); 627 frame.sf_eip = 0; 628 } 629} 630#endif 631} 632