subr_syscall.c revision 1246
1/*- 2 * Copyright (c) 1990 The Regents of the University of California. 3 * All rights reserved. 4 * 5 * This code is derived from software contributed to Berkeley by 6 * the University of Utah, and William Jolitz. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. All advertising materials mentioning features or use of this software 17 * must display the following acknowledgement: 18 * This product includes software developed by the University of 19 * California, Berkeley and its contributors. 20 * 4. Neither the name of the University nor the names of its contributors 21 * may be used to endorse or promote products derived from this software 22 * without specific prior written permission. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34 * SUCH DAMAGE. 35 * 36 * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 37 * $Id: trap.c,v 1.17 1994/02/08 09:26:01 davidg Exp $ 38 */ 39 40/* 41 * 386 Trap and System call handleing 42 */ 43 44#include "isa.h" 45#include "npx.h" 46#include "ddb.h" 47#include "machine/cpu.h" 48#include "machine/psl.h" 49#include "machine/reg.h" 50#include "machine/eflags.h" 51 52#include "param.h" 53#include "systm.h" 54#include "proc.h" 55#include "user.h" 56#include "acct.h" 57#include "kernel.h" 58#ifdef KTRACE 59#include "ktrace.h" 60#endif 61 62#include "vm/vm_param.h" 63#include "vm/pmap.h" 64#include "vm/vm_map.h" 65#include "vm/vm_user.h" 66#include "vm/vm_page.h" 67#include "sys/vmmeter.h" 68 69#include "machine/trap.h" 70 71#ifdef __GNUC__ 72 73/* 74 * The "r" contraint could be "rm" except for fatal bugs in gas. As usual, 75 * we omit the size from the mov instruction to avoid nonfatal bugs in gas. 76 */ 77#define read_gs() ({ u_short gs; __asm("mov %%gs,%0" : "=r" (gs)); gs; }) 78#define write_gs(newgs) __asm("mov %0,%%gs" : : "r" ((u_short) newgs)) 79 80#else /* not __GNUC__ */ 81 82u_short read_gs __P((void)); 83void write_gs __P((/* promoted u_short */ int gs)); 84 85#endif /* __GNUC__ */ 86 87extern int grow(struct proc *,int); 88 89struct sysent sysent[]; 90int nsysent; 91extern short cpl; 92extern short netmask, ttymask, biomask; 93 94#define MAX_TRAP_MSG 27 95char *trap_msg[] = { 96 "reserved addressing fault", /* 0 T_RESADFLT */ 97 "privileged instruction fault", /* 1 T_PRIVINFLT */ 98 "reserved operand fault", /* 2 T_RESOPFLT */ 99 "breakpoint instruction fault", /* 3 T_BPTFLT */ 100 "", /* 4 unused */ 101 "system call trap", /* 5 T_SYSCALL */ 102 "arithmetic trap", /* 6 T_ARITHTRAP */ 103 "system forced exception", /* 7 T_ASTFLT */ 104 "segmentation (limit) fault", /* 8 T_SEGFLT */ 105 "protection fault", /* 9 T_PROTFLT */ 106 "trace trap", /* 10 T_TRCTRAP */ 107 "", /* 11 unused */ 108 "page fault", /* 12 T_PAGEFLT */ 109 "page table fault", /* 13 T_TABLEFLT */ 110 "alignment fault", /* 14 T_ALIGNFLT */ 111 "kernel stack pointer not valid", /* 15 T_KSPNOTVAL */ 112 "bus error", /* 16 T_BUSERR */ 113 "kernel debugger fault", /* 17 T_KDBTRAP */ 114 "integer divide fault", /* 18 T_DIVIDE */ 115 "non-maskable interrupt trap", /* 19 T_NMI */ 116 "overflow trap", /* 20 T_OFLOW */ 117 "FPU bounds check fault", /* 21 T_BOUND */ 118 "FPU device not available", /* 22 T_DNA */ 119 "double fault", /* 23 T_DOUBLEFLT */ 120 "FPU operand fetch fault", /* 24 T_FPOPFLT */ 121 "invalid TSS fault", /* 25 T_TSSFLT */ 122 "segment not present fault", /* 26 T_SEGNPFLT */ 123 "stack fault", /* 27 T_STKFLT */ 124}; 125 126#define pde_v(v) (PTD[((v)>>PD_SHIFT)&1023].pd_v) 127 128/* 129 * trap(frame): 130 * Exception, fault, and trap interface to BSD kernel. This 131 * common code is called from assembly language IDT gate entry 132 * routines that prepare a suitable stack frame, and restore this 133 * frame after the exception has been processed. Note that the 134 * effect is as if the arguments were passed call by reference. 135 */ 136 137/*ARGSUSED*/ 138void 139trap(frame) 140 struct trapframe frame; 141{ 142 register int i; 143 register struct proc *p = curproc; 144 struct timeval syst; 145 int ucode, type, code, eva, fault_type; 146 147 frame.tf_eflags &= ~PSL_NT; /* clear nested trap XXX */ 148 type = frame.tf_trapno; 149#if NDDB > 0 150 if (curpcb && curpcb->pcb_onfault) { 151 if (frame.tf_trapno == T_BPTFLT 152 || frame.tf_trapno == T_TRCTRAP) 153 if (kdb_trap (type, 0, &frame)) 154 return; 155 } 156#endif 157 158 if (curpcb == 0 || curproc == 0) 159 goto skiptoswitch; 160 if (curpcb->pcb_onfault && frame.tf_trapno != T_PAGEFLT) { 161 extern int _udatasel; 162 163 if (read_gs() != (u_short) _udatasel) 164 /* 165 * Some user has corrupted %gs but we depend on it in 166 * copyout() etc. Fix it up and retry. 167 * 168 * (We don't preserve %fs or %gs, so users can change 169 * them to either _ucodesel, _udatasel or a not-present 170 * selector, possibly ORed with 0 to 3, making them 171 * volatile for other users. Not preserving them saves 172 * time and doesn't lose functionality or open security 173 * holes.) 174 */ 175 write_gs(_udatasel); 176 else 177copyfault: 178 frame.tf_eip = (int)curpcb->pcb_onfault; 179 return; 180 } 181 182 syst = p->p_stime; 183 if (ISPL(frame.tf_cs) == SEL_UPL) { 184 type |= T_USER; 185 p->p_regs = (int *)&frame; 186 } 187 188skiptoswitch: 189 ucode=0; 190 eva = rcr2(); 191 code = frame.tf_err; 192 193 if ((type & ~T_USER) == T_PAGEFLT) 194 goto pfault; 195 196 switch (type) { 197 case T_SEGNPFLT|T_USER: 198 case T_STKFLT|T_USER: 199 case T_PROTFLT|T_USER: /* protection fault */ 200 ucode = code + BUS_SEGM_FAULT ; 201 i = SIGBUS; 202 break; 203 204 case T_PRIVINFLT|T_USER: /* privileged instruction fault */ 205 case T_RESADFLT|T_USER: /* reserved addressing fault */ 206 case T_RESOPFLT|T_USER: /* reserved operand fault */ 207 case T_FPOPFLT|T_USER: /* coprocessor operand fault */ 208 ucode = type &~ T_USER; 209 i = SIGILL; 210 break; 211 212 case T_ASTFLT|T_USER: /* Allow process switch */ 213 astoff(); 214 cnt.v_soft++; 215 if ((p->p_flag & SOWEUPC) && p->p_stats->p_prof.pr_scale) { 216 addupc(frame.tf_eip, &p->p_stats->p_prof, 1); 217 p->p_flag &= ~SOWEUPC; 218 } 219 goto out; 220 221 case T_DNA|T_USER: 222#if NNPX > 0 223 /* if a transparent fault (due to context switch "late") */ 224 if (npxdna()) return; 225#endif /* NNPX > 0 */ 226#ifdef MATH_EMULATE 227 i = math_emulate(&frame); 228 if (i == 0) return; 229#else /* MATH_EMULTATE */ 230 panic("trap: math emulation necessary!"); 231#endif /* MATH_EMULTATE */ 232 ucode = FPE_FPU_NP_TRAP; 233 break; 234 235 case T_BOUND|T_USER: 236 ucode = FPE_SUBRNG_TRAP; 237 i = SIGFPE; 238 break; 239 240 case T_OFLOW|T_USER: 241 ucode = FPE_INTOVF_TRAP; 242 i = SIGFPE; 243 break; 244 245 case T_DIVIDE|T_USER: 246 ucode = FPE_INTDIV_TRAP; 247 i = SIGFPE; 248 break; 249 250 case T_ARITHTRAP|T_USER: 251 ucode = code; 252 i = SIGFPE; 253 break; 254 255 pfault: 256 case T_PAGEFLT: /* allow page faults in kernel mode */ 257 case T_PAGEFLT|T_USER: /* page fault */ 258 { 259 vm_offset_t va; 260 struct vmspace *vm; 261 vm_map_t map = 0; 262 int rv = 0, oldflags; 263 vm_prot_t ftype; 264 unsigned nss, v; 265 extern vm_map_t kernel_map; 266 267 va = trunc_page((vm_offset_t)eva); 268 269 /* 270 * Don't allow user-mode faults in kernel address space 271 */ 272 if ((type == (T_PAGEFLT|T_USER)) && (va >= KERNBASE)) { 273 goto nogo; 274 } 275 276 if ((p == 0) || (type == T_PAGEFLT && va >= KERNBASE)) { 277 vm = 0; 278 map = kernel_map; 279 } else { 280 vm = p->p_vmspace; 281 map = &vm->vm_map; 282 } 283 284 if (code & PGEX_W) 285 ftype = VM_PROT_READ | VM_PROT_WRITE; 286 else 287 ftype = VM_PROT_READ; 288 289 oldflags = p->p_flag; 290 if (map != kernel_map) { 291 vm_offset_t pa; 292 vm_offset_t v = (vm_offset_t) vtopte(va); 293 294 /* 295 * Keep swapout from messing with us during this 296 * critical time. 297 */ 298 p->p_flag |= SLOCK; 299 300 /* 301 * Grow the stack if necessary 302 */ 303 if ((caddr_t)va > vm->vm_maxsaddr 304 && (caddr_t)va < (caddr_t)USRSTACK) { 305 if (!grow(p, va)) { 306 rv = KERN_FAILURE; 307 p->p_flag &= ~SLOCK; 308 p->p_flag |= (oldflags & SLOCK); 309 goto nogo; 310 } 311 } 312 313 /* 314 * Check if page table is mapped, if not, 315 * fault it first 316 */ 317 318 /* Fault the pte only if needed: */ 319 *(volatile char *)v += 0; 320 321 vm_page_wire(pmap_pte_vm_page(vm_map_pmap(map),v)); 322 323 /* Fault in the user page: */ 324 rv = vm_fault(map, va, ftype, FALSE); 325 326 vm_page_unwire(pmap_pte_vm_page(vm_map_pmap(map),v)); 327 328 p->p_flag &= ~SLOCK; 329 p->p_flag |= (oldflags & SLOCK); 330 } else { 331 /* 332 * Since we know that kernel virtual address addresses 333 * always have pte pages mapped, we just have to fault 334 * the page. 335 */ 336 rv = vm_fault(map, va, ftype, FALSE); 337 } 338 339 if (rv == KERN_SUCCESS) { 340 if (type == T_PAGEFLT) 341 return; 342 goto out; 343 } 344nogo: 345 if (type == T_PAGEFLT) { 346 if (curpcb->pcb_onfault) 347 goto copyfault; 348 349 goto we_re_toast; 350 } 351 i = (rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV; 352 353 /* kludge to pass faulting virtual address to sendsig */ 354 ucode = type &~ T_USER; 355 frame.tf_err = eva; 356 357 break; 358 } 359 360#if NDDB == 0 361 case T_TRCTRAP: /* trace trap -- someone single stepping lcall's */ 362 frame.tf_eflags &= ~PSL_T; 363 364 /* Q: how do we turn it on again? */ 365 return; 366#endif 367 368 case T_BPTFLT|T_USER: /* bpt instruction fault */ 369 case T_TRCTRAP|T_USER: /* trace trap */ 370 frame.tf_eflags &= ~PSL_T; 371 i = SIGTRAP; 372 break; 373 374#if NISA > 0 375 case T_NMI: 376 case T_NMI|T_USER: 377#if NDDB > 0 378 /* NMI can be hooked up to a pushbutton for debugging */ 379 printf ("NMI ... going to debugger\n"); 380 if (kdb_trap (type, 0, &frame)) 381 return; 382#endif 383 /* machine/parity/power fail/"kitchen sink" faults */ 384 if (isa_nmi(code) == 0) return; 385 /* FALL THROUGH */ 386#endif 387 default: 388 we_re_toast: 389 390 fault_type = type & ~T_USER; 391 if (fault_type <= MAX_TRAP_MSG) 392 printf("\n\nFatal trap %d: %s while in %s mode\n", 393 fault_type, trap_msg[fault_type], 394 ISPL(frame.tf_cs) == SEL_UPL ? "user" : "kernel"); 395 if (fault_type == T_PAGEFLT) { 396 printf("fault virtual address = 0x%x\n", eva); 397 printf("fault code = %s %s, %s\n", 398 code & PGEX_U ? "user" : "supervisor", 399 code & PGEX_W ? "write" : "read", 400 code & PGEX_P ? "protection violation" : "page not present"); 401 } 402 printf("instruction pointer = 0x%x\n", frame.tf_eip); 403 printf("processor eflags = "); 404 if (frame.tf_eflags & EFL_TF) 405 printf("trace/trap, "); 406 if (frame.tf_eflags & EFL_IF) 407 printf("interrupt enabled, "); 408 if (frame.tf_eflags & EFL_NT) 409 printf("nested task, "); 410 if (frame.tf_eflags & EFL_RF) 411 printf("resume, "); 412 if (frame.tf_eflags & EFL_VM) 413 printf("vm86, "); 414 printf("IOPL = %d\n", (frame.tf_eflags & EFL_IOPL) >> 12); 415 printf("current process = "); 416 if (curproc) { 417 printf("%d (%s)\n", 418 curproc->p_pid, curproc->p_comm ? 419 curproc->p_comm : ""); 420 } else { 421 printf("Idle\n"); 422 } 423 printf("interrupt mask = "); 424 if ((cpl & netmask) == netmask) 425 printf("net "); 426 if ((cpl & ttymask) == ttymask) 427 printf("tty "); 428 if ((cpl & biomask) == biomask) 429 printf("bio "); 430 if (cpl == 0) 431 printf("none"); 432 printf("\n"); 433 434#ifdef KDB 435 if (kdb_trap(&psl)) 436 return; 437#endif 438#if NDDB > 0 439 if (kdb_trap (type, 0, &frame)) 440 return; 441#endif 442 if (fault_type <= MAX_TRAP_MSG) 443 panic(trap_msg[fault_type]); 444 else 445 panic("unknown/reserved trap"); 446 447 /* NOTREACHED */ 448 } 449 450 trapsignal(p, i, ucode); 451 if ((type & T_USER) == 0) 452 return; 453out: 454 while (i = CURSIG(p)) 455 psig(i); 456 p->p_pri = p->p_usrpri; 457 if (want_resched) { 458 int s; 459 /* 460 * Since we are curproc, clock will normally just change 461 * our priority without moving us from one queue to another 462 * (since the running process is not on a queue.) 463 * If that happened after we setrq ourselves but before we 464 * swtch()'ed, we might not be on the queue indicated by 465 * our priority. 466 */ 467 s = splclock(); 468 setrq(p); 469 p->p_stats->p_ru.ru_nivcsw++; 470 swtch(); 471 splx(s); 472 while (i = CURSIG(p)) 473 psig(i); 474 } 475 if (p->p_stats->p_prof.pr_scale) { 476 int ticks; 477 struct timeval *tv = &p->p_stime; 478 479 ticks = ((tv->tv_sec - syst.tv_sec) * 1000 + 480 (tv->tv_usec - syst.tv_usec) / 1000) / (tick / 1000); 481 if (ticks) { 482#ifdef PROFTIMER 483 extern int profscale; 484 addupc(frame.tf_eip, &p->p_stats->p_prof, 485 ticks * profscale); 486#else 487 addupc(frame.tf_eip, &p->p_stats->p_prof, ticks); 488#endif 489 } 490 } 491 curpri = p->p_pri; 492} 493 494/* 495 * Compensate for 386 brain damage (missing URKR). 496 * This is a little simpler than the pagefault handler in trap() because 497 * it the page tables have already been faulted in and high addresses 498 * are thrown out early for other reasons. 499 */ 500int trapwrite(addr) 501 unsigned addr; 502{ 503 unsigned nss; 504 struct proc *p; 505 vm_offset_t va, v; 506 struct vmspace *vm; 507 int oldflags; 508 int rv; 509 510 va = trunc_page((vm_offset_t)addr); 511 /* 512 * XXX - MAX is END. Changed > to >= for temp. fix. 513 */ 514 if (va >= VM_MAXUSER_ADDRESS) 515 return (1); 516 517 p = curproc; 518 vm = p->p_vmspace; 519 520 oldflags = p->p_flag; 521 p->p_flag |= SLOCK; 522 523 if ((caddr_t)va >= vm->vm_maxsaddr 524 && (caddr_t)va < (caddr_t)USRSTACK) { 525 if (!grow(p, va)) { 526 p->p_flag &= ~SLOCK; 527 p->p_flag |= (oldflags & SLOCK); 528 return (1); 529 } 530 } 531 532 v = trunc_page(vtopte(va)); 533 534 /* 535 * wire the pte page 536 */ 537 if (va < USRSTACK) { 538 vm_map_pageable(&vm->vm_map, v, round_page(v+1), FALSE); 539 } 540 541 /* 542 * fault the data page 543 */ 544 rv = vm_fault(&vm->vm_map, va, VM_PROT_READ|VM_PROT_WRITE, FALSE); 545 546 /* 547 * unwire the pte page 548 */ 549 if (va < USRSTACK) { 550 vm_map_pageable(&vm->vm_map, v, round_page(v+1), TRUE); 551 } 552 553 p->p_flag &= ~SLOCK; 554 p->p_flag |= (oldflags & SLOCK); 555 556 if (rv != KERN_SUCCESS) 557 return 1; 558 559 return (0); 560} 561 562/* 563 * syscall(frame): 564 * System call request from POSIX system call gate interface to kernel. 565 * Like trap(), argument is call by reference. 566 */ 567/*ARGSUSED*/ 568void 569syscall(frame) 570 volatile struct trapframe frame; 571{ 572 register int *locr0 = ((int *)&frame); 573 register caddr_t params; 574 register int i; 575 register struct sysent *callp; 576 register struct proc *p = curproc; 577 struct timeval syst; 578 int error, opc; 579 int args[8], rval[2]; 580 int code; 581 582#ifdef lint 583 r0 = 0; r0 = r0; r1 = 0; r1 = r1; 584#endif 585 syst = p->p_stime; 586 if (ISPL(frame.tf_cs) != SEL_UPL) 587 panic("syscall"); 588 589 code = frame.tf_eax; 590 p->p_regs = (int *)&frame; 591 params = (caddr_t)frame.tf_esp + sizeof (int) ; 592 593 /* 594 * Reconstruct pc, assuming lcall $X,y is 7 bytes, as it is always. 595 */ 596 opc = frame.tf_eip - 7; 597 if (code == 0) { 598 code = fuword(params); 599 params += sizeof (int); 600 } 601 if (code < 0 || code >= nsysent) 602 callp = &sysent[0]; 603 else 604 callp = &sysent[code]; 605 606 if ((i = callp->sy_narg * sizeof (int)) && 607 (error = copyin(params, (caddr_t)args, (u_int)i))) { 608 frame.tf_eax = error; 609 frame.tf_eflags |= PSL_C; /* carry bit */ 610#ifdef KTRACE 611 if (KTRPOINT(p, KTR_SYSCALL)) 612 ktrsyscall(p->p_tracep, code, callp->sy_narg, args); 613#endif 614 goto done; 615 } 616#ifdef KTRACE 617 if (KTRPOINT(p, KTR_SYSCALL)) 618 ktrsyscall(p->p_tracep, code, callp->sy_narg, args); 619#endif 620 rval[0] = 0; 621 rval[1] = frame.tf_edx; 622/*pg("%d. s %d\n", p->p_pid, code);*/ 623 error = (*callp->sy_call)(p, args, rval); 624 if (error == ERESTART) 625 frame.tf_eip = opc; 626 else if (error != EJUSTRETURN) { 627 if (error) { 628/*pg("error %d", error);*/ 629 frame.tf_eax = error; 630 frame.tf_eflags |= PSL_C; /* carry bit */ 631 } else { 632 frame.tf_eax = rval[0]; 633 frame.tf_edx = rval[1]; 634 frame.tf_eflags &= ~PSL_C; /* carry bit */ 635 } 636 } 637 /* else if (error == EJUSTRETURN) */ 638 /* nothing to do */ 639done: 640 /* 641 * Reinitialize proc pointer `p' as it may be different 642 * if this is a child returning from fork syscall. 643 */ 644 p = curproc; 645 while (i = CURSIG(p)) 646 psig(i); 647 p->p_pri = p->p_usrpri; 648 if (want_resched) { 649 int s; 650 /* 651 * Since we are curproc, clock will normally just change 652 * our priority without moving us from one queue to another 653 * (since the running process is not on a queue.) 654 * If that happened after we setrq ourselves but before we 655 * swtch()'ed, we might not be on the queue indicated by 656 * our priority. 657 */ 658 s = splclock(); 659 setrq(p); 660 p->p_stats->p_ru.ru_nivcsw++; 661 swtch(); 662 splx(s); 663 while (i = CURSIG(p)) 664 psig(i); 665 } 666 if (p->p_stats->p_prof.pr_scale) { 667 int ticks; 668 struct timeval *tv = &p->p_stime; 669 670 ticks = ((tv->tv_sec - syst.tv_sec) * 1000 + 671 (tv->tv_usec - syst.tv_usec) / 1000) / (tick / 1000); 672 if (ticks) { 673#ifdef PROFTIMER 674 extern int profscale; 675 addupc(frame.tf_eip, &p->p_stats->p_prof, 676 ticks * profscale); 677#else 678 addupc(frame.tf_eip, &p->p_stats->p_prof, ticks); 679#endif 680 } 681 } 682 curpri = p->p_pri; 683#ifdef KTRACE 684 if (KTRPOINT(p, KTR_SYSRET)) 685 ktrsysret(p->p_tracep, code, error, rval[0]); 686#endif 687#ifdef DIAGNOSTICx 688{ extern int _udatasel, _ucodesel; 689 if (frame.tf_ss != _udatasel) 690 printf("ss %x call %d\n", frame.tf_ss, code); 691 if ((frame.tf_cs&0xffff) != _ucodesel) 692 printf("cs %x call %d\n", frame.tf_cs, code); 693 if (frame.tf_eip > VM_MAXUSER_ADDRESS) { 694 printf("eip %x call %d\n", frame.tf_eip, code); 695 frame.tf_eip = 0; 696 } 697} 698#endif 699} 700