141 142#ifdef DEV_APIC 143#include <machine/apicvar.h> 144#endif 145 146#ifdef DEV_ISA 147#include <x86/isa/icu.h> 148#endif 149 150#ifdef XBOX 151#include <machine/xbox.h> 152 153int arch_i386_is_xbox = 0; 154uint32_t arch_i386_xbox_memsize = 0; 155#endif 156 157#ifdef XEN 158/* XEN includes */ 159#include <machine/xen/xen-os.h> 160#include <xen/hypervisor.h> 161#include <machine/xen/xen-os.h> 162#include <machine/xen/xenvar.h> 163#include <machine/xen/xenfunc.h> 164#include <xen/xen_intr.h> 165 166void Xhypervisor_callback(void); 167void failsafe_callback(void); 168 169extern trap_info_t trap_table[]; 170struct proc_ldt default_proc_ldt; 171extern int init_first; 172int running_xen = 1; 173extern unsigned long physfree; 174#endif /* XEN */ 175 176/* Sanity check for __curthread() */ 177CTASSERT(offsetof(struct pcpu, pc_curthread) == 0); 178 179extern void init386(int first); 180extern void dblfault_handler(void); 181 182extern void printcpuinfo(void); /* XXX header file */ 183extern void finishidentcpu(void); 184extern void panicifcpuunsupported(void); 185 186#define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) 187#define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) 188 189#if !defined(CPU_DISABLE_SSE) && defined(I686_CPU) 190#define CPU_ENABLE_SSE 191#endif 192 193static void cpu_startup(void *); 194static void fpstate_drop(struct thread *td); 195static void get_fpcontext(struct thread *td, mcontext_t *mcp); 196static int set_fpcontext(struct thread *td, const mcontext_t *mcp); 197#ifdef CPU_ENABLE_SSE 198static void set_fpregs_xmm(struct save87 *, struct savexmm *); 199static void fill_fpregs_xmm(struct savexmm *, struct save87 *); 200#endif /* CPU_ENABLE_SSE */ 201SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL); 202 203#ifdef DDB 204extern vm_offset_t ksym_start, ksym_end; 205#endif 206 207/* Intel ICH registers */ 208#define ICH_PMBASE 0x400 209#define ICH_SMI_EN ICH_PMBASE + 0x30 210 211int _udatasel, _ucodesel; 212u_int basemem; 213 214int cold = 1; 215 216#ifdef COMPAT_43 217static void osendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask); 218#endif 219#ifdef COMPAT_FREEBSD4 220static void freebsd4_sendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask); 221#endif 222 223long Maxmem = 0; 224long realmem = 0; 225 226#ifdef PAE 227FEATURE(pae, "Physical Address Extensions"); 228#endif 229 230/* 231 * The number of PHYSMAP entries must be one less than the number of 232 * PHYSSEG entries because the PHYSMAP entry that spans the largest 233 * physical address that is accessible by ISA DMA is split into two 234 * PHYSSEG entries. 235 */ 236#define PHYSMAP_SIZE (2 * (VM_PHYSSEG_MAX - 1)) 237 238vm_paddr_t phys_avail[PHYSMAP_SIZE + 2]; 239vm_paddr_t dump_avail[PHYSMAP_SIZE + 2]; 240 241/* must be 2 less so 0 0 can signal end of chunks */ 242#define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(phys_avail[0])) - 2) 243#define DUMP_AVAIL_ARRAY_END ((sizeof(dump_avail) / sizeof(dump_avail[0])) - 2) 244 245struct kva_md_info kmi; 246 247static struct trapframe proc0_tf; 248struct pcpu __pcpu[MAXCPU]; 249 250struct mtx icu_lock; 251 252struct mem_range_softc mem_range_softc; 253 254static void 255cpu_startup(dummy) 256 void *dummy; 257{ 258 uintmax_t memsize; 259 char *sysenv; 260 261 /* 262 * On MacBooks, we need to disallow the legacy USB circuit to 263 * generate an SMI# because this can cause several problems, 264 * namely: incorrect CPU frequency detection and failure to 265 * start the APs. 266 * We do this by disabling a bit in the SMI_EN (SMI Control and 267 * Enable register) of the Intel ICH LPC Interface Bridge. 268 */ 269 sysenv = getenv("smbios.system.product"); 270 if (sysenv != NULL) { 271 if (strncmp(sysenv, "MacBook1,1", 10) == 0 || 272 strncmp(sysenv, "MacBook3,1", 10) == 0 || 273 strncmp(sysenv, "MacBookPro1,1", 13) == 0 || 274 strncmp(sysenv, "MacBookPro1,2", 13) == 0 || 275 strncmp(sysenv, "MacBookPro3,1", 13) == 0 || 276 strncmp(sysenv, "Macmini1,1", 10) == 0) { 277 if (bootverbose) 278 printf("Disabling LEGACY_USB_EN bit on " 279 "Intel ICH.\n"); 280 outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8); 281 } 282 freeenv(sysenv); 283 } 284 285 /* 286 * Good {morning,afternoon,evening,night}. 287 */ 288 startrtclock(); 289 printcpuinfo(); 290 panicifcpuunsupported(); 291#ifdef PERFMON 292 perfmon_init(); 293#endif 294 realmem = Maxmem; 295 296 /* 297 * Display physical memory if SMBIOS reports reasonable amount. 298 */ 299 memsize = 0; 300 sysenv = getenv("smbios.memory.enabled"); 301 if (sysenv != NULL) { 302 memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10; 303 freeenv(sysenv); 304 } 305 if (memsize < ptoa((uintmax_t)cnt.v_free_count)) 306 memsize = ptoa((uintmax_t)Maxmem); 307 printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20); 308 309 /* 310 * Display any holes after the first chunk of extended memory. 311 */ 312 if (bootverbose) { 313 int indx; 314 315 printf("Physical memory chunk(s):\n"); 316 for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { 317 vm_paddr_t size; 318 319 size = phys_avail[indx + 1] - phys_avail[indx]; 320 printf( 321 "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n", 322 (uintmax_t)phys_avail[indx], 323 (uintmax_t)phys_avail[indx + 1] - 1, 324 (uintmax_t)size, (uintmax_t)size / PAGE_SIZE); 325 } 326 } 327 328 vm_ksubmap_init(&kmi); 329 330 printf("avail memory = %ju (%ju MB)\n", 331 ptoa((uintmax_t)cnt.v_free_count), 332 ptoa((uintmax_t)cnt.v_free_count) / 1048576); 333 334 /* 335 * Set up buffers, so they can be used to read disk labels. 336 */ 337 bufinit(); 338 vm_pager_bufferinit(); 339#ifndef XEN 340 cpu_setregs(); 341#endif 342} 343 344/* 345 * Send an interrupt to process. 346 * 347 * Stack is set up to allow sigcode stored 348 * at top to call routine, followed by kcall 349 * to sigreturn routine below. After sigreturn 350 * resets the signal mask, the stack, and the 351 * frame pointer, it returns to the user 352 * specified pc, psl. 353 */ 354#ifdef COMPAT_43 355static void 356osendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) 357{ 358 struct osigframe sf, *fp; 359 struct proc *p; 360 struct thread *td; 361 struct sigacts *psp; 362 struct trapframe *regs; 363 int sig; 364 int oonstack; 365 366 td = curthread; 367 p = td->td_proc; 368 PROC_LOCK_ASSERT(p, MA_OWNED); 369 sig = ksi->ksi_signo; 370 psp = p->p_sigacts; 371 mtx_assert(&psp->ps_mtx, MA_OWNED); 372 regs = td->td_frame; 373 oonstack = sigonstack(regs->tf_esp); 374 375 /* Allocate space for the signal handler context. */ 376 if ((td->td_pflags & TDP_ALTSTACK) && !oonstack && 377 SIGISMEMBER(psp->ps_sigonstack, sig)) { 378 fp = (struct osigframe *)(td->td_sigstk.ss_sp + 379 td->td_sigstk.ss_size - sizeof(struct osigframe)); 380#if defined(COMPAT_43) 381 td->td_sigstk.ss_flags |= SS_ONSTACK; 382#endif 383 } else 384 fp = (struct osigframe *)regs->tf_esp - 1; 385 386 /* Translate the signal if appropriate. */ 387 if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize) 388 sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; 389 390 /* Build the argument list for the signal handler. */ 391 sf.sf_signum = sig; 392 sf.sf_scp = (register_t)&fp->sf_siginfo.si_sc; 393 bzero(&sf.sf_siginfo, sizeof(sf.sf_siginfo)); 394 if (SIGISMEMBER(psp->ps_siginfo, sig)) { 395 /* Signal handler installed with SA_SIGINFO. */ 396 sf.sf_arg2 = (register_t)&fp->sf_siginfo; 397 sf.sf_siginfo.si_signo = sig; 398 sf.sf_siginfo.si_code = ksi->ksi_code; 399 sf.sf_ahu.sf_action = (__osiginfohandler_t *)catcher; 400 sf.sf_addr = 0; 401 } else { 402 /* Old FreeBSD-style arguments. */ 403 sf.sf_arg2 = ksi->ksi_code; 404 sf.sf_addr = (register_t)ksi->ksi_addr; 405 sf.sf_ahu.sf_handler = catcher; 406 } 407 mtx_unlock(&psp->ps_mtx); 408 PROC_UNLOCK(p); 409 410 /* Save most if not all of trap frame. */ 411 sf.sf_siginfo.si_sc.sc_eax = regs->tf_eax; 412 sf.sf_siginfo.si_sc.sc_ebx = regs->tf_ebx; 413 sf.sf_siginfo.si_sc.sc_ecx = regs->tf_ecx; 414 sf.sf_siginfo.si_sc.sc_edx = regs->tf_edx; 415 sf.sf_siginfo.si_sc.sc_esi = regs->tf_esi; 416 sf.sf_siginfo.si_sc.sc_edi = regs->tf_edi; 417 sf.sf_siginfo.si_sc.sc_cs = regs->tf_cs; 418 sf.sf_siginfo.si_sc.sc_ds = regs->tf_ds; 419 sf.sf_siginfo.si_sc.sc_ss = regs->tf_ss; 420 sf.sf_siginfo.si_sc.sc_es = regs->tf_es; 421 sf.sf_siginfo.si_sc.sc_fs = regs->tf_fs; 422 sf.sf_siginfo.si_sc.sc_gs = rgs(); 423 sf.sf_siginfo.si_sc.sc_isp = regs->tf_isp; 424 425 /* Build the signal context to be used by osigreturn(). */ 426 sf.sf_siginfo.si_sc.sc_onstack = (oonstack) ? 1 : 0; 427 SIG2OSIG(*mask, sf.sf_siginfo.si_sc.sc_mask); 428 sf.sf_siginfo.si_sc.sc_sp = regs->tf_esp; 429 sf.sf_siginfo.si_sc.sc_fp = regs->tf_ebp; 430 sf.sf_siginfo.si_sc.sc_pc = regs->tf_eip; 431 sf.sf_siginfo.si_sc.sc_ps = regs->tf_eflags; 432 sf.sf_siginfo.si_sc.sc_trapno = regs->tf_trapno; 433 sf.sf_siginfo.si_sc.sc_err = regs->tf_err; 434 435 /* 436 * If we're a vm86 process, we want to save the segment registers. 437 * We also change eflags to be our emulated eflags, not the actual 438 * eflags. 439 */ 440 if (regs->tf_eflags & PSL_VM) { 441 /* XXX confusing names: `tf' isn't a trapframe; `regs' is. */ 442 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; 443 struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86; 444 445 sf.sf_siginfo.si_sc.sc_gs = tf->tf_vm86_gs; 446 sf.sf_siginfo.si_sc.sc_fs = tf->tf_vm86_fs; 447 sf.sf_siginfo.si_sc.sc_es = tf->tf_vm86_es; 448 sf.sf_siginfo.si_sc.sc_ds = tf->tf_vm86_ds; 449 450 if (vm86->vm86_has_vme == 0) 451 sf.sf_siginfo.si_sc.sc_ps = 452 (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | 453 (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); 454 455 /* See sendsig() for comments. */ 456 tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP); 457 } 458 459 /* 460 * Copy the sigframe out to the user's stack. 461 */ 462 if (copyout(&sf, fp, sizeof(*fp)) != 0) { 463#ifdef DEBUG 464 printf("process %ld has trashed its stack\n", (long)p->p_pid); 465#endif 466 PROC_LOCK(p); 467 sigexit(td, SIGILL); 468 } 469 470 regs->tf_esp = (int)fp; 471 if (p->p_sysent->sv_sigcode_base != 0) { 472 regs->tf_eip = p->p_sysent->sv_sigcode_base + szsigcode - 473 szosigcode; 474 } else { 475 /* a.out sysentvec does not use shared page */ 476 regs->tf_eip = p->p_sysent->sv_psstrings - szosigcode; 477 } 478 regs->tf_eflags &= ~(PSL_T | PSL_D); 479 regs->tf_cs = _ucodesel; 480 regs->tf_ds = _udatasel; 481 regs->tf_es = _udatasel; 482 regs->tf_fs = _udatasel; 483 load_gs(_udatasel); 484 regs->tf_ss = _udatasel; 485 PROC_LOCK(p); 486 mtx_lock(&psp->ps_mtx); 487} 488#endif /* COMPAT_43 */ 489 490#ifdef COMPAT_FREEBSD4 491static void 492freebsd4_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) 493{ 494 struct sigframe4 sf, *sfp; 495 struct proc *p; 496 struct thread *td; 497 struct sigacts *psp; 498 struct trapframe *regs; 499 int sig; 500 int oonstack; 501 502 td = curthread; 503 p = td->td_proc; 504 PROC_LOCK_ASSERT(p, MA_OWNED); 505 sig = ksi->ksi_signo; 506 psp = p->p_sigacts; 507 mtx_assert(&psp->ps_mtx, MA_OWNED); 508 regs = td->td_frame; 509 oonstack = sigonstack(regs->tf_esp); 510 511 /* Save user context. */ 512 bzero(&sf, sizeof(sf)); 513 sf.sf_uc.uc_sigmask = *mask; 514 sf.sf_uc.uc_stack = td->td_sigstk; 515 sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) 516 ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; 517 sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; 518 sf.sf_uc.uc_mcontext.mc_gs = rgs(); 519 bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs)); 520 bzero(sf.sf_uc.uc_mcontext.mc_fpregs, 521 sizeof(sf.sf_uc.uc_mcontext.mc_fpregs)); 522 bzero(sf.sf_uc.uc_mcontext.__spare__, 523 sizeof(sf.sf_uc.uc_mcontext.__spare__)); 524 bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__)); 525 526 /* Allocate space for the signal handler context. */ 527 if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && 528 SIGISMEMBER(psp->ps_sigonstack, sig)) { 529 sfp = (struct sigframe4 *)(td->td_sigstk.ss_sp + 530 td->td_sigstk.ss_size - sizeof(struct sigframe4)); 531#if defined(COMPAT_43) 532 td->td_sigstk.ss_flags |= SS_ONSTACK; 533#endif 534 } else 535 sfp = (struct sigframe4 *)regs->tf_esp - 1; 536 537 /* Translate the signal if appropriate. */ 538 if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize) 539 sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; 540 541 /* Build the argument list for the signal handler. */ 542 sf.sf_signum = sig; 543 sf.sf_ucontext = (register_t)&sfp->sf_uc; 544 bzero(&sf.sf_si, sizeof(sf.sf_si)); 545 if (SIGISMEMBER(psp->ps_siginfo, sig)) { 546 /* Signal handler installed with SA_SIGINFO. */ 547 sf.sf_siginfo = (register_t)&sfp->sf_si; 548 sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher; 549 550 /* Fill in POSIX parts */ 551 sf.sf_si.si_signo = sig; 552 sf.sf_si.si_code = ksi->ksi_code; 553 sf.sf_si.si_addr = ksi->ksi_addr; 554 } else { 555 /* Old FreeBSD-style arguments. */ 556 sf.sf_siginfo = ksi->ksi_code; 557 sf.sf_addr = (register_t)ksi->ksi_addr; 558 sf.sf_ahu.sf_handler = catcher; 559 } 560 mtx_unlock(&psp->ps_mtx); 561 PROC_UNLOCK(p); 562 563 /* 564 * If we're a vm86 process, we want to save the segment registers. 565 * We also change eflags to be our emulated eflags, not the actual 566 * eflags. 567 */ 568 if (regs->tf_eflags & PSL_VM) { 569 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; 570 struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86; 571 572 sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs; 573 sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs; 574 sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es; 575 sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds; 576 577 if (vm86->vm86_has_vme == 0) 578 sf.sf_uc.uc_mcontext.mc_eflags = 579 (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | 580 (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); 581 582 /* 583 * Clear PSL_NT to inhibit T_TSSFLT faults on return from 584 * syscalls made by the signal handler. This just avoids 585 * wasting time for our lazy fixup of such faults. PSL_NT 586 * does nothing in vm86 mode, but vm86 programs can set it 587 * almost legitimately in probes for old cpu types. 588 */ 589 tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP); 590 } 591 592 /* 593 * Copy the sigframe out to the user's stack. 594 */ 595 if (copyout(&sf, sfp, sizeof(*sfp)) != 0) { 596#ifdef DEBUG 597 printf("process %ld has trashed its stack\n", (long)p->p_pid); 598#endif 599 PROC_LOCK(p); 600 sigexit(td, SIGILL); 601 } 602 603 regs->tf_esp = (int)sfp; 604 regs->tf_eip = p->p_sysent->sv_sigcode_base + szsigcode - 605 szfreebsd4_sigcode; 606 regs->tf_eflags &= ~(PSL_T | PSL_D); 607 regs->tf_cs = _ucodesel; 608 regs->tf_ds = _udatasel; 609 regs->tf_es = _udatasel; 610 regs->tf_fs = _udatasel; 611 regs->tf_ss = _udatasel; 612 PROC_LOCK(p); 613 mtx_lock(&psp->ps_mtx); 614} 615#endif /* COMPAT_FREEBSD4 */ 616 617void 618sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) 619{ 620 struct sigframe sf, *sfp; 621 struct proc *p; 622 struct thread *td; 623 struct sigacts *psp; 624 char *sp; 625 struct trapframe *regs; 626 struct segment_descriptor *sdp; 627 int sig; 628 int oonstack; 629 630 td = curthread; 631 p = td->td_proc; 632 PROC_LOCK_ASSERT(p, MA_OWNED); 633 sig = ksi->ksi_signo; 634 psp = p->p_sigacts; 635 mtx_assert(&psp->ps_mtx, MA_OWNED); 636#ifdef COMPAT_FREEBSD4 637 if (SIGISMEMBER(psp->ps_freebsd4, sig)) { 638 freebsd4_sendsig(catcher, ksi, mask); 639 return; 640 } 641#endif 642#ifdef COMPAT_43 643 if (SIGISMEMBER(psp->ps_osigset, sig)) { 644 osendsig(catcher, ksi, mask); 645 return; 646 } 647#endif 648 regs = td->td_frame; 649 oonstack = sigonstack(regs->tf_esp); 650 651 /* Save user context. */ 652 bzero(&sf, sizeof(sf)); 653 sf.sf_uc.uc_sigmask = *mask; 654 sf.sf_uc.uc_stack = td->td_sigstk; 655 sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) 656 ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; 657 sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; 658 sf.sf_uc.uc_mcontext.mc_gs = rgs(); 659 bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs)); 660 sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */ 661 get_fpcontext(td, &sf.sf_uc.uc_mcontext); 662 fpstate_drop(td); 663 /* 664 * Unconditionally fill the fsbase and gsbase into the mcontext. 665 */ 666 sdp = &td->td_pcb->pcb_fsd; 667 sf.sf_uc.uc_mcontext.mc_fsbase = sdp->sd_hibase << 24 | 668 sdp->sd_lobase; 669 sdp = &td->td_pcb->pcb_gsd; 670 sf.sf_uc.uc_mcontext.mc_gsbase = sdp->sd_hibase << 24 | 671 sdp->sd_lobase; 672 sf.sf_uc.uc_mcontext.mc_flags = 0; 673 bzero(sf.sf_uc.uc_mcontext.mc_spare2, 674 sizeof(sf.sf_uc.uc_mcontext.mc_spare2)); 675 bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__)); 676 677 /* Allocate space for the signal handler context. */ 678 if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && 679 SIGISMEMBER(psp->ps_sigonstack, sig)) { 680 sp = td->td_sigstk.ss_sp + 681 td->td_sigstk.ss_size - sizeof(struct sigframe); 682#if defined(COMPAT_43) 683 td->td_sigstk.ss_flags |= SS_ONSTACK; 684#endif 685 } else 686 sp = (char *)regs->tf_esp - sizeof(struct sigframe); 687 /* Align to 16 bytes. */ 688 sfp = (struct sigframe *)((unsigned int)sp & ~0xF); 689 690 /* Translate the signal if appropriate. */ 691 if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize) 692 sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; 693 694 /* Build the argument list for the signal handler. */ 695 sf.sf_signum = sig; 696 sf.sf_ucontext = (register_t)&sfp->sf_uc; 697 bzero(&sf.sf_si, sizeof(sf.sf_si)); 698 if (SIGISMEMBER(psp->ps_siginfo, sig)) { 699 /* Signal handler installed with SA_SIGINFO. */ 700 sf.sf_siginfo = (register_t)&sfp->sf_si; 701 sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher; 702 703 /* Fill in POSIX parts */ 704 sf.sf_si = ksi->ksi_info; 705 sf.sf_si.si_signo = sig; /* maybe a translated signal */ 706 } else { 707 /* Old FreeBSD-style arguments. */ 708 sf.sf_siginfo = ksi->ksi_code; 709 sf.sf_addr = (register_t)ksi->ksi_addr; 710 sf.sf_ahu.sf_handler = catcher; 711 } 712 mtx_unlock(&psp->ps_mtx); 713 PROC_UNLOCK(p); 714 715 /* 716 * If we're a vm86 process, we want to save the segment registers. 717 * We also change eflags to be our emulated eflags, not the actual 718 * eflags. 719 */ 720 if (regs->tf_eflags & PSL_VM) { 721 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; 722 struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86; 723 724 sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs; 725 sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs; 726 sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es; 727 sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds; 728 729 if (vm86->vm86_has_vme == 0) 730 sf.sf_uc.uc_mcontext.mc_eflags = 731 (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | 732 (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); 733 734 /* 735 * Clear PSL_NT to inhibit T_TSSFLT faults on return from 736 * syscalls made by the signal handler. This just avoids 737 * wasting time for our lazy fixup of such faults. PSL_NT 738 * does nothing in vm86 mode, but vm86 programs can set it 739 * almost legitimately in probes for old cpu types. 740 */ 741 tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP); 742 } 743 744 /* 745 * Copy the sigframe out to the user's stack. 746 */ 747 if (copyout(&sf, sfp, sizeof(*sfp)) != 0) { 748#ifdef DEBUG 749 printf("process %ld has trashed its stack\n", (long)p->p_pid); 750#endif 751 PROC_LOCK(p); 752 sigexit(td, SIGILL); 753 } 754 755 regs->tf_esp = (int)sfp; 756 regs->tf_eip = p->p_sysent->sv_sigcode_base; 757 regs->tf_eflags &= ~(PSL_T | PSL_D); 758 regs->tf_cs = _ucodesel; 759 regs->tf_ds = _udatasel; 760 regs->tf_es = _udatasel; 761 regs->tf_fs = _udatasel; 762 regs->tf_ss = _udatasel; 763 PROC_LOCK(p); 764 mtx_lock(&psp->ps_mtx); 765} 766 767/* 768 * System call to cleanup state after a signal 769 * has been taken. Reset signal mask and 770 * stack state from context left by sendsig (above). 771 * Return to previous pc and psl as specified by 772 * context left by sendsig. Check carefully to 773 * make sure that the user has not modified the 774 * state to gain improper privileges. 775 * 776 * MPSAFE 777 */ 778#ifdef COMPAT_43 779int 780osigreturn(td, uap) 781 struct thread *td; 782 struct osigreturn_args /* { 783 struct osigcontext *sigcntxp; 784 } */ *uap; 785{ 786 struct osigcontext sc; 787 struct trapframe *regs; 788 struct osigcontext *scp; 789 int eflags, error; 790 ksiginfo_t ksi; 791 792 regs = td->td_frame; 793 error = copyin(uap->sigcntxp, &sc, sizeof(sc)); 794 if (error != 0) 795 return (error); 796 scp = ≻ 797 eflags = scp->sc_ps; 798 if (eflags & PSL_VM) { 799 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; 800 struct vm86_kernel *vm86; 801 802 /* 803 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't 804 * set up the vm86 area, and we can't enter vm86 mode. 805 */ 806 if (td->td_pcb->pcb_ext == 0) 807 return (EINVAL); 808 vm86 = &td->td_pcb->pcb_ext->ext_vm86; 809 if (vm86->vm86_inited == 0) 810 return (EINVAL); 811 812 /* Go back to user mode if both flags are set. */ 813 if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) { 814 ksiginfo_init_trap(&ksi); 815 ksi.ksi_signo = SIGBUS; 816 ksi.ksi_code = BUS_OBJERR; 817 ksi.ksi_addr = (void *)regs->tf_eip; 818 trapsignal(td, &ksi); 819 } 820 821 if (vm86->vm86_has_vme) { 822 eflags = (tf->tf_eflags & ~VME_USERCHANGE) | 823 (eflags & VME_USERCHANGE) | PSL_VM; 824 } else { 825 vm86->vm86_eflags = eflags; /* save VIF, VIP */ 826 eflags = (tf->tf_eflags & ~VM_USERCHANGE) | 827 (eflags & VM_USERCHANGE) | PSL_VM; 828 } 829 tf->tf_vm86_ds = scp->sc_ds; 830 tf->tf_vm86_es = scp->sc_es; 831 tf->tf_vm86_fs = scp->sc_fs; 832 tf->tf_vm86_gs = scp->sc_gs; 833 tf->tf_ds = _udatasel; 834 tf->tf_es = _udatasel; 835 tf->tf_fs = _udatasel; 836 } else { 837 /* 838 * Don't allow users to change privileged or reserved flags. 839 */ 840 /* 841 * XXX do allow users to change the privileged flag PSL_RF. 842 * The cpu sets PSL_RF in tf_eflags for faults. Debuggers 843 * should sometimes set it there too. tf_eflags is kept in 844 * the signal context during signal handling and there is no 845 * other place to remember it, so the PSL_RF bit may be 846 * corrupted by the signal handler without us knowing. 847 * Corruption of the PSL_RF bit at worst causes one more or 848 * one less debugger trap, so allowing it is fairly harmless. 849 */ 850 if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) { 851 return (EINVAL); 852 } 853 854 /* 855 * Don't allow users to load a valid privileged %cs. Let the 856 * hardware check for invalid selectors, excess privilege in 857 * other selectors, invalid %eip's and invalid %esp's. 858 */ 859 if (!CS_SECURE(scp->sc_cs)) { 860 ksiginfo_init_trap(&ksi); 861 ksi.ksi_signo = SIGBUS; 862 ksi.ksi_code = BUS_OBJERR; 863 ksi.ksi_trapno = T_PROTFLT; 864 ksi.ksi_addr = (void *)regs->tf_eip; 865 trapsignal(td, &ksi); 866 return (EINVAL); 867 } 868 regs->tf_ds = scp->sc_ds; 869 regs->tf_es = scp->sc_es; 870 regs->tf_fs = scp->sc_fs; 871 } 872 873 /* Restore remaining registers. */ 874 regs->tf_eax = scp->sc_eax; 875 regs->tf_ebx = scp->sc_ebx; 876 regs->tf_ecx = scp->sc_ecx; 877 regs->tf_edx = scp->sc_edx; 878 regs->tf_esi = scp->sc_esi; 879 regs->tf_edi = scp->sc_edi; 880 regs->tf_cs = scp->sc_cs; 881 regs->tf_ss = scp->sc_ss; 882 regs->tf_isp = scp->sc_isp; 883 regs->tf_ebp = scp->sc_fp; 884 regs->tf_esp = scp->sc_sp; 885 regs->tf_eip = scp->sc_pc; 886 regs->tf_eflags = eflags; 887 888#if defined(COMPAT_43) 889 if (scp->sc_onstack & 1) 890 td->td_sigstk.ss_flags |= SS_ONSTACK; 891 else 892 td->td_sigstk.ss_flags &= ~SS_ONSTACK; 893#endif 894 kern_sigprocmask(td, SIG_SETMASK, (sigset_t *)&scp->sc_mask, NULL, 895 SIGPROCMASK_OLD); 896 return (EJUSTRETURN); 897} 898#endif /* COMPAT_43 */ 899 900#ifdef COMPAT_FREEBSD4 901/* 902 * MPSAFE 903 */ 904int 905freebsd4_sigreturn(td, uap) 906 struct thread *td; 907 struct freebsd4_sigreturn_args /* { 908 const ucontext4 *sigcntxp; 909 } */ *uap; 910{ 911 struct ucontext4 uc; 912 struct trapframe *regs; 913 struct ucontext4 *ucp; 914 int cs, eflags, error; 915 ksiginfo_t ksi; 916 917 error = copyin(uap->sigcntxp, &uc, sizeof(uc)); 918 if (error != 0) 919 return (error); 920 ucp = &uc; 921 regs = td->td_frame; 922 eflags = ucp->uc_mcontext.mc_eflags; 923 if (eflags & PSL_VM) { 924 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; 925 struct vm86_kernel *vm86; 926 927 /* 928 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't 929 * set up the vm86 area, and we can't enter vm86 mode. 930 */ 931 if (td->td_pcb->pcb_ext == 0) 932 return (EINVAL); 933 vm86 = &td->td_pcb->pcb_ext->ext_vm86; 934 if (vm86->vm86_inited == 0) 935 return (EINVAL); 936 937 /* Go back to user mode if both flags are set. */ 938 if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) { 939 ksiginfo_init_trap(&ksi); 940 ksi.ksi_signo = SIGBUS; 941 ksi.ksi_code = BUS_OBJERR; 942 ksi.ksi_addr = (void *)regs->tf_eip; 943 trapsignal(td, &ksi); 944 } 945 if (vm86->vm86_has_vme) { 946 eflags = (tf->tf_eflags & ~VME_USERCHANGE) | 947 (eflags & VME_USERCHANGE) | PSL_VM; 948 } else { 949 vm86->vm86_eflags = eflags; /* save VIF, VIP */ 950 eflags = (tf->tf_eflags & ~VM_USERCHANGE) | 951 (eflags & VM_USERCHANGE) | PSL_VM; 952 } 953 bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe)); 954 tf->tf_eflags = eflags; 955 tf->tf_vm86_ds = tf->tf_ds; 956 tf->tf_vm86_es = tf->tf_es; 957 tf->tf_vm86_fs = tf->tf_fs; 958 tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs; 959 tf->tf_ds = _udatasel; 960 tf->tf_es = _udatasel; 961 tf->tf_fs = _udatasel; 962 } else { 963 /* 964 * Don't allow users to change privileged or reserved flags. 965 */ 966 /* 967 * XXX do allow users to change the privileged flag PSL_RF. 968 * The cpu sets PSL_RF in tf_eflags for faults. Debuggers 969 * should sometimes set it there too. tf_eflags is kept in 970 * the signal context during signal handling and there is no 971 * other place to remember it, so the PSL_RF bit may be 972 * corrupted by the signal handler without us knowing. 973 * Corruption of the PSL_RF bit at worst causes one more or 974 * one less debugger trap, so allowing it is fairly harmless. 975 */ 976 if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) { 977 uprintf("pid %d (%s): freebsd4_sigreturn eflags = 0x%x\n", 978 td->td_proc->p_pid, td->td_name, eflags); 979 return (EINVAL); 980 } 981 982 /* 983 * Don't allow users to load a valid privileged %cs. Let the 984 * hardware check for invalid selectors, excess privilege in 985 * other selectors, invalid %eip's and invalid %esp's. 986 */ 987 cs = ucp->uc_mcontext.mc_cs; 988 if (!CS_SECURE(cs)) { 989 uprintf("pid %d (%s): freebsd4_sigreturn cs = 0x%x\n", 990 td->td_proc->p_pid, td->td_name, cs); 991 ksiginfo_init_trap(&ksi); 992 ksi.ksi_signo = SIGBUS; 993 ksi.ksi_code = BUS_OBJERR; 994 ksi.ksi_trapno = T_PROTFLT; 995 ksi.ksi_addr = (void *)regs->tf_eip; 996 trapsignal(td, &ksi); 997 return (EINVAL); 998 } 999 1000 bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs)); 1001 } 1002 1003#if defined(COMPAT_43) 1004 if (ucp->uc_mcontext.mc_onstack & 1) 1005 td->td_sigstk.ss_flags |= SS_ONSTACK; 1006 else 1007 td->td_sigstk.ss_flags &= ~SS_ONSTACK; 1008#endif 1009 kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0); 1010 return (EJUSTRETURN); 1011} 1012#endif /* COMPAT_FREEBSD4 */ 1013 1014/* 1015 * MPSAFE 1016 */ 1017int 1018sys_sigreturn(td, uap) 1019 struct thread *td; 1020 struct sigreturn_args /* { 1021 const struct __ucontext *sigcntxp; 1022 } */ *uap; 1023{ 1024 ucontext_t uc; 1025 struct trapframe *regs; 1026 ucontext_t *ucp; 1027 int cs, eflags, error, ret; 1028 ksiginfo_t ksi; 1029 1030 error = copyin(uap->sigcntxp, &uc, sizeof(uc)); 1031 if (error != 0) 1032 return (error); 1033 ucp = &uc; 1034 regs = td->td_frame; 1035 eflags = ucp->uc_mcontext.mc_eflags; 1036 if (eflags & PSL_VM) { 1037 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; 1038 struct vm86_kernel *vm86; 1039 1040 /* 1041 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't 1042 * set up the vm86 area, and we can't enter vm86 mode. 1043 */ 1044 if (td->td_pcb->pcb_ext == 0) 1045 return (EINVAL); 1046 vm86 = &td->td_pcb->pcb_ext->ext_vm86; 1047 if (vm86->vm86_inited == 0) 1048 return (EINVAL); 1049 1050 /* Go back to user mode if both flags are set. */ 1051 if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) { 1052 ksiginfo_init_trap(&ksi); 1053 ksi.ksi_signo = SIGBUS; 1054 ksi.ksi_code = BUS_OBJERR; 1055 ksi.ksi_addr = (void *)regs->tf_eip; 1056 trapsignal(td, &ksi); 1057 } 1058 1059 if (vm86->vm86_has_vme) { 1060 eflags = (tf->tf_eflags & ~VME_USERCHANGE) | 1061 (eflags & VME_USERCHANGE) | PSL_VM; 1062 } else { 1063 vm86->vm86_eflags = eflags; /* save VIF, VIP */ 1064 eflags = (tf->tf_eflags & ~VM_USERCHANGE) | 1065 (eflags & VM_USERCHANGE) | PSL_VM; 1066 } 1067 bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe)); 1068 tf->tf_eflags = eflags; 1069 tf->tf_vm86_ds = tf->tf_ds; 1070 tf->tf_vm86_es = tf->tf_es; 1071 tf->tf_vm86_fs = tf->tf_fs; 1072 tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs; 1073 tf->tf_ds = _udatasel; 1074 tf->tf_es = _udatasel; 1075 tf->tf_fs = _udatasel; 1076 } else { 1077 /* 1078 * Don't allow users to change privileged or reserved flags. 1079 */ 1080 /* 1081 * XXX do allow users to change the privileged flag PSL_RF. 1082 * The cpu sets PSL_RF in tf_eflags for faults. Debuggers 1083 * should sometimes set it there too. tf_eflags is kept in 1084 * the signal context during signal handling and there is no 1085 * other place to remember it, so the PSL_RF bit may be 1086 * corrupted by the signal handler without us knowing. 1087 * Corruption of the PSL_RF bit at worst causes one more or 1088 * one less debugger trap, so allowing it is fairly harmless. 1089 */ 1090 if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) { 1091 uprintf("pid %d (%s): sigreturn eflags = 0x%x\n", 1092 td->td_proc->p_pid, td->td_name, eflags); 1093 return (EINVAL); 1094 } 1095 1096 /* 1097 * Don't allow users to load a valid privileged %cs. Let the 1098 * hardware check for invalid selectors, excess privilege in 1099 * other selectors, invalid %eip's and invalid %esp's. 1100 */ 1101 cs = ucp->uc_mcontext.mc_cs; 1102 if (!CS_SECURE(cs)) { 1103 uprintf("pid %d (%s): sigreturn cs = 0x%x\n", 1104 td->td_proc->p_pid, td->td_name, cs); 1105 ksiginfo_init_trap(&ksi); 1106 ksi.ksi_signo = SIGBUS; 1107 ksi.ksi_code = BUS_OBJERR; 1108 ksi.ksi_trapno = T_PROTFLT; 1109 ksi.ksi_addr = (void *)regs->tf_eip; 1110 trapsignal(td, &ksi); 1111 return (EINVAL); 1112 } 1113 1114 ret = set_fpcontext(td, &ucp->uc_mcontext); 1115 if (ret != 0) 1116 return (ret); 1117 bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs)); 1118 } 1119 1120#if defined(COMPAT_43) 1121 if (ucp->uc_mcontext.mc_onstack & 1) 1122 td->td_sigstk.ss_flags |= SS_ONSTACK; 1123 else 1124 td->td_sigstk.ss_flags &= ~SS_ONSTACK; 1125#endif 1126 1127 kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0); 1128 return (EJUSTRETURN); 1129} 1130 1131/* 1132 * Machine dependent boot() routine 1133 * 1134 * I haven't seen anything to put here yet 1135 * Possibly some stuff might be grafted back here from boot() 1136 */ 1137void 1138cpu_boot(int howto) 1139{ 1140} 1141 1142/* 1143 * Flush the D-cache for non-DMA I/O so that the I-cache can 1144 * be made coherent later. 1145 */ 1146void 1147cpu_flush_dcache(void *ptr, size_t len) 1148{ 1149 /* Not applicable */ 1150} 1151 1152/* Get current clock frequency for the given cpu id. */ 1153int 1154cpu_est_clockrate(int cpu_id, uint64_t *rate) 1155{ 1156 uint64_t tsc1, tsc2; 1157 uint64_t acnt, mcnt, perf; 1158 register_t reg; 1159 1160 if (pcpu_find(cpu_id) == NULL || rate == NULL) 1161 return (EINVAL); 1162 if ((cpu_feature & CPUID_TSC) == 0) 1163 return (EOPNOTSUPP); 1164 1165 /* 1166 * If TSC is P-state invariant and APERF/MPERF MSRs do not exist, 1167 * DELAY(9) based logic fails. 1168 */ 1169 if (tsc_is_invariant && !tsc_perf_stat) 1170 return (EOPNOTSUPP); 1171 1172#ifdef SMP 1173 if (smp_cpus > 1) { 1174 /* Schedule ourselves on the indicated cpu. */ 1175 thread_lock(curthread); 1176 sched_bind(curthread, cpu_id); 1177 thread_unlock(curthread); 1178 } 1179#endif 1180 1181 /* Calibrate by measuring a short delay. */ 1182 reg = intr_disable(); 1183 if (tsc_is_invariant) { 1184 wrmsr(MSR_MPERF, 0); 1185 wrmsr(MSR_APERF, 0); 1186 tsc1 = rdtsc(); 1187 DELAY(1000); 1188 mcnt = rdmsr(MSR_MPERF); 1189 acnt = rdmsr(MSR_APERF); 1190 tsc2 = rdtsc(); 1191 intr_restore(reg); 1192 perf = 1000 * acnt / mcnt; 1193 *rate = (tsc2 - tsc1) * perf; 1194 } else { 1195 tsc1 = rdtsc(); 1196 DELAY(1000); 1197 tsc2 = rdtsc(); 1198 intr_restore(reg); 1199 *rate = (tsc2 - tsc1) * 1000; 1200 } 1201 1202#ifdef SMP 1203 if (smp_cpus > 1) { 1204 thread_lock(curthread); 1205 sched_unbind(curthread); 1206 thread_unlock(curthread); 1207 } 1208#endif 1209 1210 return (0); 1211} 1212 1213#ifdef XEN 1214 1215void 1216cpu_halt(void) 1217{ 1218 HYPERVISOR_shutdown(SHUTDOWN_poweroff); 1219} 1220 1221int scheduler_running; 1222 1223static void 1224cpu_idle_hlt(sbintime_t sbt) 1225{ 1226 1227 scheduler_running = 1; 1228 enable_intr(); 1229 idle_block(); 1230} 1231 1232#else 1233/* 1234 * Shutdown the CPU as much as possible 1235 */ 1236void 1237cpu_halt(void) 1238{ 1239 for (;;) 1240 halt(); 1241} 1242 1243#endif 1244 1245void (*cpu_idle_hook)(sbintime_t) = NULL; /* ACPI idle hook. */ 1246static int cpu_ident_amdc1e = 0; /* AMD C1E supported. */ 1247static int idle_mwait = 1; /* Use MONITOR/MWAIT for short idle. */ 1248TUNABLE_INT("machdep.idle_mwait", &idle_mwait); 1249SYSCTL_INT(_machdep, OID_AUTO, idle_mwait, CTLFLAG_RW, &idle_mwait, 1250 0, "Use MONITOR/MWAIT for short idle"); 1251 1252#define STATE_RUNNING 0x0 1253#define STATE_MWAIT 0x1 1254#define STATE_SLEEPING 0x2 1255 1256static void 1257cpu_idle_acpi(sbintime_t sbt) 1258{ 1259 int *state; 1260 1261 state = (int *)PCPU_PTR(monitorbuf); 1262 *state = STATE_SLEEPING; 1263 1264 /* See comments in cpu_idle_hlt(). */ 1265 disable_intr(); 1266 if (sched_runnable()) 1267 enable_intr(); 1268 else if (cpu_idle_hook) 1269 cpu_idle_hook(sbt); 1270 else 1271 __asm __volatile("sti; hlt"); 1272 *state = STATE_RUNNING; 1273} 1274 1275#ifndef XEN 1276static void 1277cpu_idle_hlt(sbintime_t sbt) 1278{ 1279 int *state; 1280 1281 state = (int *)PCPU_PTR(monitorbuf); 1282 *state = STATE_SLEEPING; 1283 1284 /* 1285 * Since we may be in a critical section from cpu_idle(), if 1286 * an interrupt fires during that critical section we may have 1287 * a pending preemption. If the CPU halts, then that thread 1288 * may not execute until a later interrupt awakens the CPU. 1289 * To handle this race, check for a runnable thread after 1290 * disabling interrupts and immediately return if one is 1291 * found. Also, we must absolutely guarentee that hlt is 1292 * the next instruction after sti. This ensures that any 1293 * interrupt that fires after the call to disable_intr() will 1294 * immediately awaken the CPU from hlt. Finally, please note 1295 * that on x86 this works fine because of interrupts enabled only 1296 * after the instruction following sti takes place, while IF is set 1297 * to 1 immediately, allowing hlt instruction to acknowledge the 1298 * interrupt. 1299 */ 1300 disable_intr(); 1301 if (sched_runnable()) 1302 enable_intr(); 1303 else 1304 __asm __volatile("sti; hlt"); 1305 *state = STATE_RUNNING; 1306} 1307#endif 1308 1309/* 1310 * MWAIT cpu power states. Lower 4 bits are sub-states. 1311 */ 1312#define MWAIT_C0 0xf0 1313#define MWAIT_C1 0x00 1314#define MWAIT_C2 0x10 1315#define MWAIT_C3 0x20 1316#define MWAIT_C4 0x30 1317 1318static void 1319cpu_idle_mwait(sbintime_t sbt) 1320{ 1321 int *state; 1322 1323 state = (int *)PCPU_PTR(monitorbuf); 1324 *state = STATE_MWAIT; 1325 1326 /* See comments in cpu_idle_hlt(). */ 1327 disable_intr(); 1328 if (sched_runnable()) { 1329 enable_intr(); 1330 *state = STATE_RUNNING; 1331 return; 1332 } 1333 cpu_monitor(state, 0, 0); 1334 if (*state == STATE_MWAIT) 1335 __asm __volatile("sti; mwait" : : "a" (MWAIT_C1), "c" (0)); 1336 else 1337 enable_intr(); 1338 *state = STATE_RUNNING; 1339} 1340 1341static void 1342cpu_idle_spin(sbintime_t sbt) 1343{ 1344 int *state; 1345 int i; 1346 1347 state = (int *)PCPU_PTR(monitorbuf); 1348 *state = STATE_RUNNING; 1349 1350 /* 1351 * The sched_runnable() call is racy but as long as there is 1352 * a loop missing it one time will have just a little impact if any 1353 * (and it is much better than missing the check at all). 1354 */ 1355 for (i = 0; i < 1000; i++) { 1356 if (sched_runnable()) 1357 return; 1358 cpu_spinwait(); 1359 } 1360} 1361 1362/* 1363 * C1E renders the local APIC timer dead, so we disable it by 1364 * reading the Interrupt Pending Message register and clearing 1365 * both C1eOnCmpHalt (bit 28) and SmiOnCmpHalt (bit 27). 1366 * 1367 * Reference: 1368 * "BIOS and Kernel Developer's Guide for AMD NPT Family 0Fh Processors" 1369 * #32559 revision 3.00+ 1370 */ 1371#define MSR_AMDK8_IPM 0xc0010055 1372#define AMDK8_SMIONCMPHALT (1ULL << 27) 1373#define AMDK8_C1EONCMPHALT (1ULL << 28) 1374#define AMDK8_CMPHALT (AMDK8_SMIONCMPHALT | AMDK8_C1EONCMPHALT) 1375 1376static void 1377cpu_probe_amdc1e(void) 1378{ 1379 1380 /* 1381 * Detect the presence of C1E capability mostly on latest 1382 * dual-cores (or future) k8 family. 1383 */ 1384 if (cpu_vendor_id == CPU_VENDOR_AMD && 1385 (cpu_id & 0x00000f00) == 0x00000f00 && 1386 (cpu_id & 0x0fff0000) >= 0x00040000) { 1387 cpu_ident_amdc1e = 1; 1388 } 1389} 1390 1391#ifdef XEN 1392void (*cpu_idle_fn)(sbintime_t) = cpu_idle_hlt; 1393#else 1394void (*cpu_idle_fn)(sbintime_t) = cpu_idle_acpi; 1395#endif 1396 1397void 1398cpu_idle(int busy) 1399{ 1400#ifndef XEN 1401 uint64_t msr; 1402#endif 1403 sbintime_t sbt = -1; 1404 1405 CTR2(KTR_SPARE2, "cpu_idle(%d) at %d", 1406 busy, curcpu); 1407#if defined(MP_WATCHDOG) && !defined(XEN) 1408 ap_watchdog(PCPU_GET(cpuid)); 1409#endif 1410#ifndef XEN 1411 /* If we are busy - try to use fast methods. */ 1412 if (busy) { 1413 if ((cpu_feature2 & CPUID2_MON) && idle_mwait) { 1414 cpu_idle_mwait(busy); 1415 goto out; 1416 } 1417 } 1418#endif 1419 1420 /* If we have time - switch timers into idle mode. */ 1421 if (!busy) { 1422 critical_enter(); 1423 sbt = cpu_idleclock(); 1424 } 1425 1426#ifndef XEN 1427 /* Apply AMD APIC timer C1E workaround. */ 1428 if (cpu_ident_amdc1e && cpu_disable_deep_sleep) { 1429 msr = rdmsr(MSR_AMDK8_IPM); 1430 if (msr & AMDK8_CMPHALT) 1431 wrmsr(MSR_AMDK8_IPM, msr & ~AMDK8_CMPHALT); 1432 } 1433#endif 1434 1435 /* Call main idle method. */ 1436 cpu_idle_fn(sbt); 1437 1438 /* Switch timers mack into active mode. */ 1439 if (!busy) { 1440 cpu_activeclock(); 1441 critical_exit(); 1442 } 1443#ifndef XEN 1444out: 1445#endif 1446 CTR2(KTR_SPARE2, "cpu_idle(%d) at %d done", 1447 busy, curcpu); 1448} 1449 1450int 1451cpu_idle_wakeup(int cpu) 1452{ 1453 struct pcpu *pcpu; 1454 int *state; 1455 1456 pcpu = pcpu_find(cpu); 1457 state = (int *)pcpu->pc_monitorbuf; 1458 /* 1459 * This doesn't need to be atomic since missing the race will 1460 * simply result in unnecessary IPIs. 1461 */ 1462 if (*state == STATE_SLEEPING) 1463 return (0); 1464 if (*state == STATE_MWAIT) 1465 *state = STATE_RUNNING; 1466 return (1); 1467} 1468 1469/* 1470 * Ordered by speed/power consumption. 1471 */ 1472struct { 1473 void *id_fn; 1474 char *id_name; 1475} idle_tbl[] = { 1476 { cpu_idle_spin, "spin" }, 1477 { cpu_idle_mwait, "mwait" }, 1478 { cpu_idle_hlt, "hlt" }, 1479 { cpu_idle_acpi, "acpi" }, 1480 { NULL, NULL } 1481}; 1482 1483static int 1484idle_sysctl_available(SYSCTL_HANDLER_ARGS) 1485{ 1486 char *avail, *p; 1487 int error; 1488 int i; 1489 1490 avail = malloc(256, M_TEMP, M_WAITOK); 1491 p = avail; 1492 for (i = 0; idle_tbl[i].id_name != NULL; i++) { 1493 if (strstr(idle_tbl[i].id_name, "mwait") && 1494 (cpu_feature2 & CPUID2_MON) == 0) 1495 continue; 1496 if (strcmp(idle_tbl[i].id_name, "acpi") == 0 && 1497 cpu_idle_hook == NULL) 1498 continue; 1499 p += sprintf(p, "%s%s", p != avail ? ", " : "", 1500 idle_tbl[i].id_name); 1501 } 1502 error = sysctl_handle_string(oidp, avail, 0, req); 1503 free(avail, M_TEMP); 1504 return (error); 1505} 1506 1507SYSCTL_PROC(_machdep, OID_AUTO, idle_available, CTLTYPE_STRING | CTLFLAG_RD, 1508 0, 0, idle_sysctl_available, "A", "list of available idle functions"); 1509 1510static int 1511idle_sysctl(SYSCTL_HANDLER_ARGS) 1512{ 1513 char buf[16]; 1514 int error; 1515 char *p; 1516 int i; 1517 1518 p = "unknown"; 1519 for (i = 0; idle_tbl[i].id_name != NULL; i++) { 1520 if (idle_tbl[i].id_fn == cpu_idle_fn) { 1521 p = idle_tbl[i].id_name; 1522 break; 1523 } 1524 } 1525 strncpy(buf, p, sizeof(buf)); 1526 error = sysctl_handle_string(oidp, buf, sizeof(buf), req); 1527 if (error != 0 || req->newptr == NULL) 1528 return (error); 1529 for (i = 0; idle_tbl[i].id_name != NULL; i++) { 1530 if (strstr(idle_tbl[i].id_name, "mwait") && 1531 (cpu_feature2 & CPUID2_MON) == 0) 1532 continue; 1533 if (strcmp(idle_tbl[i].id_name, "acpi") == 0 && 1534 cpu_idle_hook == NULL) 1535 continue; 1536 if (strcmp(idle_tbl[i].id_name, buf)) 1537 continue; 1538 cpu_idle_fn = idle_tbl[i].id_fn; 1539 return (0); 1540 } 1541 return (EINVAL); 1542} 1543 1544SYSCTL_PROC(_machdep, OID_AUTO, idle, CTLTYPE_STRING | CTLFLAG_RW, 0, 0, 1545 idle_sysctl, "A", "currently selected idle function"); 1546 1547uint64_t (*atomic_load_acq_64)(volatile uint64_t *) = 1548 atomic_load_acq_64_i386; 1549void (*atomic_store_rel_64)(volatile uint64_t *, uint64_t) = 1550 atomic_store_rel_64_i386; 1551 1552static void 1553cpu_probe_cmpxchg8b(void) 1554{ 1555 1556 if ((cpu_feature & CPUID_CX8) != 0 || 1557 cpu_vendor_id == CPU_VENDOR_RISE) { 1558 atomic_load_acq_64 = atomic_load_acq_64_i586; 1559 atomic_store_rel_64 = atomic_store_rel_64_i586; 1560 } 1561} 1562 1563/* 1564 * Reset registers to default values on exec. 1565 */ 1566void 1567exec_setregs(struct thread *td, struct image_params *imgp, u_long stack) 1568{ 1569 struct trapframe *regs = td->td_frame; 1570 struct pcb *pcb = td->td_pcb; 1571 1572 /* Reset pc->pcb_gs and %gs before possibly invalidating it. */ 1573 pcb->pcb_gs = _udatasel; 1574 load_gs(_udatasel); 1575 1576 mtx_lock_spin(&dt_lock); 1577 if (td->td_proc->p_md.md_ldt) 1578 user_ldt_free(td); 1579 else 1580 mtx_unlock_spin(&dt_lock); 1581 1582 bzero((char *)regs, sizeof(struct trapframe)); 1583 regs->tf_eip = imgp->entry_addr; 1584 regs->tf_esp = stack; 1585 regs->tf_eflags = PSL_USER | (regs->tf_eflags & PSL_T); 1586 regs->tf_ss = _udatasel; 1587 regs->tf_ds = _udatasel; 1588 regs->tf_es = _udatasel; 1589 regs->tf_fs = _udatasel; 1590 regs->tf_cs = _ucodesel; 1591 1592 /* PS_STRINGS value for BSD/OS binaries. It is 0 for non-BSD/OS. */ 1593 regs->tf_ebx = imgp->ps_strings; 1594 1595 /* 1596 * Reset the hardware debug registers if they were in use. 1597 * They won't have any meaning for the newly exec'd process. 1598 */ 1599 if (pcb->pcb_flags & PCB_DBREGS) { 1600 pcb->pcb_dr0 = 0; 1601 pcb->pcb_dr1 = 0; 1602 pcb->pcb_dr2 = 0; 1603 pcb->pcb_dr3 = 0; 1604 pcb->pcb_dr6 = 0; 1605 pcb->pcb_dr7 = 0; 1606 if (pcb == curpcb) { 1607 /* 1608 * Clear the debug registers on the running 1609 * CPU, otherwise they will end up affecting 1610 * the next process we switch to. 1611 */ 1612 reset_dbregs(); 1613 } 1614 pcb->pcb_flags &= ~PCB_DBREGS; 1615 } 1616 1617 /* 1618 * Initialize the math emulator (if any) for the current process. 1619 * Actually, just clear the bit that says that the emulator has 1620 * been initialized. Initialization is delayed until the process 1621 * traps to the emulator (if it is done at all) mainly because 1622 * emulators don't provide an entry point for initialization. 1623 */ 1624 td->td_pcb->pcb_flags &= ~FP_SOFTFP; 1625 pcb->pcb_initial_npxcw = __INITIAL_NPXCW__; 1626 1627 /* 1628 * Drop the FP state if we hold it, so that the process gets a 1629 * clean FP state if it uses the FPU again. 1630 */ 1631 fpstate_drop(td); 1632 1633 /* 1634 * XXX - Linux emulator 1635 * Make sure sure edx is 0x0 on entry. Linux binaries depend 1636 * on it. 1637 */ 1638 td->td_retval[1] = 0; 1639} 1640 1641void 1642cpu_setregs(void) 1643{ 1644 unsigned int cr0; 1645 1646 cr0 = rcr0(); 1647 1648 /* 1649 * CR0_MP, CR0_NE and CR0_TS are set for NPX (FPU) support: 1650 * 1651 * Prepare to trap all ESC (i.e., NPX) instructions and all WAIT 1652 * instructions. We must set the CR0_MP bit and use the CR0_TS 1653 * bit to control the trap, because setting the CR0_EM bit does 1654 * not cause WAIT instructions to trap. It's important to trap 1655 * WAIT instructions - otherwise the "wait" variants of no-wait 1656 * control instructions would degenerate to the "no-wait" variants 1657 * after FP context switches but work correctly otherwise. It's 1658 * particularly important to trap WAITs when there is no NPX - 1659 * otherwise the "wait" variants would always degenerate. 1660 * 1661 * Try setting CR0_NE to get correct error reporting on 486DX's. 1662 * Setting it should fail or do nothing on lesser processors. 1663 */ 1664 cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM; 1665 load_cr0(cr0); 1666 load_gs(_udatasel); 1667} 1668 1669u_long bootdev; /* not a struct cdev *- encoding is different */ 1670SYSCTL_ULONG(_machdep, OID_AUTO, guessed_bootdev, 1671 CTLFLAG_RD, &bootdev, 0, "Maybe the Boot device (not in struct cdev *format)"); 1672 1673/* 1674 * Initialize 386 and configure to run kernel 1675 */ 1676 1677/* 1678 * Initialize segments & interrupt table 1679 */ 1680 1681int _default_ldt; 1682 1683#ifdef XEN 1684union descriptor *gdt; 1685union descriptor *ldt; 1686#else 1687union descriptor gdt[NGDT * MAXCPU]; /* global descriptor table */ 1688union descriptor ldt[NLDT]; /* local descriptor table */ 1689#endif 1690static struct gate_descriptor idt0[NIDT]; 1691struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */ 1692struct region_descriptor r_gdt, r_idt; /* table descriptors */ 1693struct mtx dt_lock; /* lock for GDT and LDT */ 1694 1695#if defined(I586_CPU) && !defined(NO_F00F_HACK) 1696extern int has_f00f_bug; 1697#endif 1698 1699static struct i386tss dblfault_tss; 1700static char dblfault_stack[PAGE_SIZE]; 1701 1702extern vm_offset_t proc0kstack; 1703 1704 1705/* 1706 * software prototypes -- in more palatable form. 1707 * 1708 * GCODE_SEL through GUDATA_SEL must be in this order for syscall/sysret 1709 * GUFS_SEL and GUGS_SEL must be in this order (swtch.s knows it) 1710 */ 1711struct soft_segment_descriptor gdt_segs[] = { 1712/* GNULL_SEL 0 Null Descriptor */ 1713{ .ssd_base = 0x0, 1714 .ssd_limit = 0x0, 1715 .ssd_type = 0, 1716 .ssd_dpl = SEL_KPL, 1717 .ssd_p = 0, 1718 .ssd_xx = 0, .ssd_xx1 = 0, 1719 .ssd_def32 = 0, 1720 .ssd_gran = 0 }, 1721/* GPRIV_SEL 1 SMP Per-Processor Private Data Descriptor */ 1722{ .ssd_base = 0x0, 1723 .ssd_limit = 0xfffff, 1724 .ssd_type = SDT_MEMRWA, 1725 .ssd_dpl = SEL_KPL, 1726 .ssd_p = 1, 1727 .ssd_xx = 0, .ssd_xx1 = 0, 1728 .ssd_def32 = 1, 1729 .ssd_gran = 1 }, 1730/* GUFS_SEL 2 %fs Descriptor for user */ 1731{ .ssd_base = 0x0, 1732 .ssd_limit = 0xfffff, 1733 .ssd_type = SDT_MEMRWA, 1734 .ssd_dpl = SEL_UPL, 1735 .ssd_p = 1, 1736 .ssd_xx = 0, .ssd_xx1 = 0, 1737 .ssd_def32 = 1, 1738 .ssd_gran = 1 }, 1739/* GUGS_SEL 3 %gs Descriptor for user */ 1740{ .ssd_base = 0x0, 1741 .ssd_limit = 0xfffff, 1742 .ssd_type = SDT_MEMRWA, 1743 .ssd_dpl = SEL_UPL, 1744 .ssd_p = 1, 1745 .ssd_xx = 0, .ssd_xx1 = 0, 1746 .ssd_def32 = 1, 1747 .ssd_gran = 1 }, 1748/* GCODE_SEL 4 Code Descriptor for kernel */ 1749{ .ssd_base = 0x0, 1750 .ssd_limit = 0xfffff, 1751 .ssd_type = SDT_MEMERA, 1752 .ssd_dpl = SEL_KPL, 1753 .ssd_p = 1, 1754 .ssd_xx = 0, .ssd_xx1 = 0, 1755 .ssd_def32 = 1, 1756 .ssd_gran = 1 }, 1757/* GDATA_SEL 5 Data Descriptor for kernel */ 1758{ .ssd_base = 0x0, 1759 .ssd_limit = 0xfffff, 1760 .ssd_type = SDT_MEMRWA, 1761 .ssd_dpl = SEL_KPL, 1762 .ssd_p = 1, 1763 .ssd_xx = 0, .ssd_xx1 = 0, 1764 .ssd_def32 = 1, 1765 .ssd_gran = 1 }, 1766/* GUCODE_SEL 6 Code Descriptor for user */ 1767{ .ssd_base = 0x0, 1768 .ssd_limit = 0xfffff, 1769 .ssd_type = SDT_MEMERA, 1770 .ssd_dpl = SEL_UPL, 1771 .ssd_p = 1, 1772 .ssd_xx = 0, .ssd_xx1 = 0, 1773 .ssd_def32 = 1, 1774 .ssd_gran = 1 }, 1775/* GUDATA_SEL 7 Data Descriptor for user */ 1776{ .ssd_base = 0x0, 1777 .ssd_limit = 0xfffff, 1778 .ssd_type = SDT_MEMRWA, 1779 .ssd_dpl = SEL_UPL, 1780 .ssd_p = 1, 1781 .ssd_xx = 0, .ssd_xx1 = 0, 1782 .ssd_def32 = 1, 1783 .ssd_gran = 1 }, 1784/* GBIOSLOWMEM_SEL 8 BIOS access to realmode segment 0x40, must be #8 in GDT */ 1785{ .ssd_base = 0x400, 1786 .ssd_limit = 0xfffff, 1787 .ssd_type = SDT_MEMRWA, 1788 .ssd_dpl = SEL_KPL, 1789 .ssd_p = 1, 1790 .ssd_xx = 0, .ssd_xx1 = 0, 1791 .ssd_def32 = 1, 1792 .ssd_gran = 1 }, 1793#ifndef XEN 1794/* GPROC0_SEL 9 Proc 0 Tss Descriptor */ 1795{ 1796 .ssd_base = 0x0, 1797 .ssd_limit = sizeof(struct i386tss)-1, 1798 .ssd_type = SDT_SYS386TSS, 1799 .ssd_dpl = 0, 1800 .ssd_p = 1, 1801 .ssd_xx = 0, .ssd_xx1 = 0, 1802 .ssd_def32 = 0, 1803 .ssd_gran = 0 }, 1804/* GLDT_SEL 10 LDT Descriptor */ 1805{ .ssd_base = (int) ldt, 1806 .ssd_limit = sizeof(ldt)-1, 1807 .ssd_type = SDT_SYSLDT, 1808 .ssd_dpl = SEL_UPL, 1809 .ssd_p = 1, 1810 .ssd_xx = 0, .ssd_xx1 = 0, 1811 .ssd_def32 = 0, 1812 .ssd_gran = 0 }, 1813/* GUSERLDT_SEL 11 User LDT Descriptor per process */ 1814{ .ssd_base = (int) ldt, 1815 .ssd_limit = (512 * sizeof(union descriptor)-1), 1816 .ssd_type = SDT_SYSLDT, 1817 .ssd_dpl = 0, 1818 .ssd_p = 1, 1819 .ssd_xx = 0, .ssd_xx1 = 0, 1820 .ssd_def32 = 0, 1821 .ssd_gran = 0 }, 1822/* GPANIC_SEL 12 Panic Tss Descriptor */ 1823{ .ssd_base = (int) &dblfault_tss, 1824 .ssd_limit = sizeof(struct i386tss)-1, 1825 .ssd_type = SDT_SYS386TSS, 1826 .ssd_dpl = 0, 1827 .ssd_p = 1, 1828 .ssd_xx = 0, .ssd_xx1 = 0, 1829 .ssd_def32 = 0, 1830 .ssd_gran = 0 }, 1831/* GBIOSCODE32_SEL 13 BIOS 32-bit interface (32bit Code) */ 1832{ .ssd_base = 0, 1833 .ssd_limit = 0xfffff, 1834 .ssd_type = SDT_MEMERA, 1835 .ssd_dpl = 0, 1836 .ssd_p = 1, 1837 .ssd_xx = 0, .ssd_xx1 = 0, 1838 .ssd_def32 = 0, 1839 .ssd_gran = 1 }, 1840/* GBIOSCODE16_SEL 14 BIOS 32-bit interface (16bit Code) */ 1841{ .ssd_base = 0, 1842 .ssd_limit = 0xfffff, 1843 .ssd_type = SDT_MEMERA, 1844 .ssd_dpl = 0, 1845 .ssd_p = 1, 1846 .ssd_xx = 0, .ssd_xx1 = 0, 1847 .ssd_def32 = 0, 1848 .ssd_gran = 1 }, 1849/* GBIOSDATA_SEL 15 BIOS 32-bit interface (Data) */ 1850{ .ssd_base = 0, 1851 .ssd_limit = 0xfffff, 1852 .ssd_type = SDT_MEMRWA, 1853 .ssd_dpl = 0, 1854 .ssd_p = 1, 1855 .ssd_xx = 0, .ssd_xx1 = 0, 1856 .ssd_def32 = 1, 1857 .ssd_gran = 1 }, 1858/* GBIOSUTIL_SEL 16 BIOS 16-bit interface (Utility) */ 1859{ .ssd_base = 0, 1860 .ssd_limit = 0xfffff, 1861 .ssd_type = SDT_MEMRWA, 1862 .ssd_dpl = 0, 1863 .ssd_p = 1, 1864 .ssd_xx = 0, .ssd_xx1 = 0, 1865 .ssd_def32 = 0, 1866 .ssd_gran = 1 }, 1867/* GBIOSARGS_SEL 17 BIOS 16-bit interface (Arguments) */ 1868{ .ssd_base = 0, 1869 .ssd_limit = 0xfffff, 1870 .ssd_type = SDT_MEMRWA, 1871 .ssd_dpl = 0, 1872 .ssd_p = 1, 1873 .ssd_xx = 0, .ssd_xx1 = 0, 1874 .ssd_def32 = 0, 1875 .ssd_gran = 1 }, 1876/* GNDIS_SEL 18 NDIS Descriptor */ 1877{ .ssd_base = 0x0, 1878 .ssd_limit = 0x0, 1879 .ssd_type = 0, 1880 .ssd_dpl = 0, 1881 .ssd_p = 0, 1882 .ssd_xx = 0, .ssd_xx1 = 0, 1883 .ssd_def32 = 0, 1884 .ssd_gran = 0 }, 1885#endif /* !XEN */ 1886}; 1887 1888static struct soft_segment_descriptor ldt_segs[] = { 1889 /* Null Descriptor - overwritten by call gate */ 1890{ .ssd_base = 0x0, 1891 .ssd_limit = 0x0, 1892 .ssd_type = 0, 1893 .ssd_dpl = 0, 1894 .ssd_p = 0, 1895 .ssd_xx = 0, .ssd_xx1 = 0, 1896 .ssd_def32 = 0, 1897 .ssd_gran = 0 }, 1898 /* Null Descriptor - overwritten by call gate */ 1899{ .ssd_base = 0x0, 1900 .ssd_limit = 0x0, 1901 .ssd_type = 0, 1902 .ssd_dpl = 0, 1903 .ssd_p = 0, 1904 .ssd_xx = 0, .ssd_xx1 = 0, 1905 .ssd_def32 = 0, 1906 .ssd_gran = 0 }, 1907 /* Null Descriptor - overwritten by call gate */ 1908{ .ssd_base = 0x0, 1909 .ssd_limit = 0x0, 1910 .ssd_type = 0, 1911 .ssd_dpl = 0, 1912 .ssd_p = 0, 1913 .ssd_xx = 0, .ssd_xx1 = 0, 1914 .ssd_def32 = 0, 1915 .ssd_gran = 0 }, 1916 /* Code Descriptor for user */ 1917{ .ssd_base = 0x0, 1918 .ssd_limit = 0xfffff, 1919 .ssd_type = SDT_MEMERA, 1920 .ssd_dpl = SEL_UPL, 1921 .ssd_p = 1, 1922 .ssd_xx = 0, .ssd_xx1 = 0, 1923 .ssd_def32 = 1, 1924 .ssd_gran = 1 }, 1925 /* Null Descriptor - overwritten by call gate */ 1926{ .ssd_base = 0x0, 1927 .ssd_limit = 0x0, 1928 .ssd_type = 0, 1929 .ssd_dpl = 0, 1930 .ssd_p = 0, 1931 .ssd_xx = 0, .ssd_xx1 = 0, 1932 .ssd_def32 = 0, 1933 .ssd_gran = 0 }, 1934 /* Data Descriptor for user */ 1935{ .ssd_base = 0x0, 1936 .ssd_limit = 0xfffff, 1937 .ssd_type = SDT_MEMRWA, 1938 .ssd_dpl = SEL_UPL, 1939 .ssd_p = 1, 1940 .ssd_xx = 0, .ssd_xx1 = 0, 1941 .ssd_def32 = 1, 1942 .ssd_gran = 1 }, 1943}; 1944 1945void 1946setidt(idx, func, typ, dpl, selec) 1947 int idx; 1948 inthand_t *func; 1949 int typ; 1950 int dpl; 1951 int selec; 1952{ 1953 struct gate_descriptor *ip; 1954 1955 ip = idt + idx; 1956 ip->gd_looffset = (int)func; 1957 ip->gd_selector = selec; 1958 ip->gd_stkcpy = 0; 1959 ip->gd_xx = 0; 1960 ip->gd_type = typ; 1961 ip->gd_dpl = dpl; 1962 ip->gd_p = 1; 1963 ip->gd_hioffset = ((int)func)>>16 ; 1964} 1965 1966extern inthand_t 1967 IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl), 1968 IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm), 1969 IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot), 1970 IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align), 1971 IDTVEC(xmm), 1972#ifdef KDTRACE_HOOKS 1973 IDTVEC(dtrace_ret), 1974#endif 1975 IDTVEC(lcall_syscall), IDTVEC(int0x80_syscall); 1976 1977#ifdef DDB 1978/* 1979 * Display the index and function name of any IDT entries that don't use 1980 * the default 'rsvd' entry point. 1981 */ 1982DB_SHOW_COMMAND(idt, db_show_idt) 1983{ 1984 struct gate_descriptor *ip; 1985 int idx; 1986 uintptr_t func; 1987 1988 ip = idt; 1989 for (idx = 0; idx < NIDT && !db_pager_quit; idx++) { 1990 func = (ip->gd_hioffset << 16 | ip->gd_looffset); 1991 if (func != (uintptr_t)&IDTVEC(rsvd)) { 1992 db_printf("%3d\t", idx); 1993 db_printsym(func, DB_STGY_PROC); 1994 db_printf("\n"); 1995 } 1996 ip++; 1997 } 1998} 1999 2000/* Show privileged registers. */ 2001DB_SHOW_COMMAND(sysregs, db_show_sysregs) 2002{ 2003 uint64_t idtr, gdtr; 2004 2005 idtr = ridt(); 2006 db_printf("idtr\t0x%08x/%04x\n", 2007 (u_int)(idtr >> 16), (u_int)idtr & 0xffff); 2008 gdtr = rgdt(); 2009 db_printf("gdtr\t0x%08x/%04x\n", 2010 (u_int)(gdtr >> 16), (u_int)gdtr & 0xffff); 2011 db_printf("ldtr\t0x%04x\n", rldt()); 2012 db_printf("tr\t0x%04x\n", rtr()); 2013 db_printf("cr0\t0x%08x\n", rcr0()); 2014 db_printf("cr2\t0x%08x\n", rcr2()); 2015 db_printf("cr3\t0x%08x\n", rcr3()); 2016 db_printf("cr4\t0x%08x\n", rcr4()); 2017} 2018#endif 2019 2020void 2021sdtossd(sd, ssd) 2022 struct segment_descriptor *sd; 2023 struct soft_segment_descriptor *ssd; 2024{ 2025 ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase; 2026 ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit; 2027 ssd->ssd_type = sd->sd_type; 2028 ssd->ssd_dpl = sd->sd_dpl; 2029 ssd->ssd_p = sd->sd_p; 2030 ssd->ssd_def32 = sd->sd_def32; 2031 ssd->ssd_gran = sd->sd_gran; 2032} 2033 2034#ifndef XEN 2035static int 2036add_smap_entry(struct bios_smap *smap, vm_paddr_t *physmap, int *physmap_idxp) 2037{ 2038 int i, insert_idx, physmap_idx; 2039 2040 physmap_idx = *physmap_idxp; 2041 2042 if (boothowto & RB_VERBOSE) 2043 printf("SMAP type=%02x base=%016llx len=%016llx\n", 2044 smap->type, smap->base, smap->length); 2045 2046 if (smap->type != SMAP_TYPE_MEMORY) 2047 return (1); 2048 2049 if (smap->length == 0) 2050 return (1); 2051 2052#ifndef PAE 2053 if (smap->base > 0xffffffff) { 2054 printf("%uK of memory above 4GB ignored\n", 2055 (u_int)(smap->length / 1024)); 2056 return (1); 2057 } 2058#endif 2059 2060 /* 2061 * Find insertion point while checking for overlap. Start off by 2062 * assuming the new entry will be added to the end. 2063 */ 2064 insert_idx = physmap_idx + 2; 2065 for (i = 0; i <= physmap_idx; i += 2) { 2066 if (smap->base < physmap[i + 1]) { 2067 if (smap->base + smap->length <= physmap[i]) { 2068 insert_idx = i; 2069 break; 2070 } 2071 if (boothowto & RB_VERBOSE) 2072 printf( 2073 "Overlapping memory regions, ignoring second region\n"); 2074 return (1); 2075 } 2076 } 2077 2078 /* See if we can prepend to the next entry. */ 2079 if (insert_idx <= physmap_idx && 2080 smap->base + smap->length == physmap[insert_idx]) { 2081 physmap[insert_idx] = smap->base; 2082 return (1); 2083 } 2084 2085 /* See if we can append to the previous entry. */ 2086 if (insert_idx > 0 && smap->base == physmap[insert_idx - 1]) { 2087 physmap[insert_idx - 1] += smap->length; 2088 return (1); 2089 } 2090 2091 physmap_idx += 2; 2092 *physmap_idxp = physmap_idx; 2093 if (physmap_idx == PHYSMAP_SIZE) { 2094 printf( 2095 "Too many segments in the physical address map, giving up\n"); 2096 return (0); 2097 } 2098 2099 /* 2100 * Move the last 'N' entries down to make room for the new 2101 * entry if needed. 2102 */ 2103 for (i = physmap_idx; i > insert_idx; i -= 2) { 2104 physmap[i] = physmap[i - 2]; 2105 physmap[i + 1] = physmap[i - 1]; 2106 } 2107 2108 /* Insert the new entry. */ 2109 physmap[insert_idx] = smap->base; 2110 physmap[insert_idx + 1] = smap->base + smap->length; 2111 return (1); 2112} 2113 2114static void 2115basemem_setup(void) 2116{ 2117 vm_paddr_t pa; 2118 pt_entry_t *pte; 2119 int i; 2120 2121 if (basemem > 640) { 2122 printf("Preposterous BIOS basemem of %uK, truncating to 640K\n", 2123 basemem); 2124 basemem = 640; 2125 } 2126 2127 /* 2128 * XXX if biosbasemem is now < 640, there is a `hole' 2129 * between the end of base memory and the start of 2130 * ISA memory. The hole may be empty or it may 2131 * contain BIOS code or data. Map it read/write so 2132 * that the BIOS can write to it. (Memory from 0 to 2133 * the physical end of the kernel is mapped read-only 2134 * to begin with and then parts of it are remapped. 2135 * The parts that aren't remapped form holes that 2136 * remain read-only and are unused by the kernel. 2137 * The base memory area is below the physical end of 2138 * the kernel and right now forms a read-only hole. 2139 * The part of it from PAGE_SIZE to 2140 * (trunc_page(biosbasemem * 1024) - 1) will be 2141 * remapped and used by the kernel later.) 2142 * 2143 * This code is similar to the code used in 2144 * pmap_mapdev, but since no memory needs to be 2145 * allocated we simply change the mapping. 2146 */ 2147 for (pa = trunc_page(basemem * 1024); 2148 pa < ISA_HOLE_START; pa += PAGE_SIZE) 2149 pmap_kenter(KERNBASE + pa, pa); 2150 2151 /* 2152 * Map pages between basemem and ISA_HOLE_START, if any, r/w into 2153 * the vm86 page table so that vm86 can scribble on them using 2154 * the vm86 map too. XXX: why 2 ways for this and only 1 way for 2155 * page 0, at least as initialized here? 2156 */ 2157 pte = (pt_entry_t *)vm86paddr; 2158 for (i = basemem / 4; i < 160; i++) 2159 pte[i] = (i << PAGE_SHIFT) | PG_V | PG_RW | PG_U; 2160} 2161#endif 2162 2163/* 2164 * Populate the (physmap) array with base/bound pairs describing the 2165 * available physical memory in the system, then test this memory and 2166 * build the phys_avail array describing the actually-available memory. 2167 * 2168 * If we cannot accurately determine the physical memory map, then use 2169 * value from the 0xE801 call, and failing that, the RTC. 2170 * 2171 * Total memory size may be set by the kernel environment variable 2172 * hw.physmem or the compile-time define MAXMEM. 2173 * 2174 * XXX first should be vm_paddr_t. 2175 */ 2176static void 2177getmemsize(int first) 2178{ 2179 int has_smap, off, physmap_idx, pa_indx, da_indx; 2180 u_long physmem_tunable, memtest; 2181 vm_paddr_t physmap[PHYSMAP_SIZE]; 2182 pt_entry_t *pte; 2183 quad_t dcons_addr, dcons_size; 2184#ifndef XEN 2185 int hasbrokenint12, i, res; 2186 u_int extmem; 2187 struct vm86frame vmf; 2188 struct vm86context vmc; 2189 vm_paddr_t pa; 2190 struct bios_smap *smap, *smapbase, *smapend; 2191 u_int32_t smapsize; 2192 caddr_t kmdp; 2193#endif 2194 2195 has_smap = 0; 2196#if defined(XEN) 2197 Maxmem = xen_start_info->nr_pages - init_first; 2198 physmem = Maxmem; 2199 basemem = 0; 2200 physmap[0] = init_first << PAGE_SHIFT; 2201 physmap[1] = ptoa(Maxmem) - round_page(msgbufsize); 2202 physmap_idx = 0; 2203#else 2204#ifdef XBOX 2205 if (arch_i386_is_xbox) { 2206 /* 2207 * We queried the memory size before, so chop off 4MB for 2208 * the framebuffer and inform the OS of this. 2209 */ 2210 physmap[0] = 0; 2211 physmap[1] = (arch_i386_xbox_memsize * 1024 * 1024) - XBOX_FB_SIZE; 2212 physmap_idx = 0; 2213 goto physmap_done; 2214 } 2215#endif 2216 bzero(&vmf, sizeof(vmf)); 2217 bzero(physmap, sizeof(physmap)); 2218 basemem = 0; 2219 2220 /* 2221 * Check if the loader supplied an SMAP memory map. If so, 2222 * use that and do not make any VM86 calls. 2223 */ 2224 physmap_idx = 0; 2225 smapbase = NULL; 2226 kmdp = preload_search_by_type("elf kernel"); 2227 if (kmdp == NULL) 2228 kmdp = preload_search_by_type("elf32 kernel"); 2229 if (kmdp != NULL) 2230 smapbase = (struct bios_smap *)preload_search_info(kmdp, 2231 MODINFO_METADATA | MODINFOMD_SMAP); 2232 if (smapbase != NULL) { 2233 /* 2234 * subr_module.c says: 2235 * "Consumer may safely assume that size value precedes data." 2236 * ie: an int32_t immediately precedes SMAP. 2237 */ 2238 smapsize = *((u_int32_t *)smapbase - 1); 2239 smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize); 2240 has_smap = 1; 2241 2242 for (smap = smapbase; smap < smapend; smap++) 2243 if (!add_smap_entry(smap, physmap, &physmap_idx)) 2244 break; 2245 goto have_smap; 2246 } 2247 2248 /* 2249 * Some newer BIOSes have a broken INT 12H implementation 2250 * which causes a kernel panic immediately. In this case, we 2251 * need use the SMAP to determine the base memory size. 2252 */ 2253 hasbrokenint12 = 0; 2254 TUNABLE_INT_FETCH("hw.hasbrokenint12", &hasbrokenint12); 2255 if (hasbrokenint12 == 0) { 2256 /* Use INT12 to determine base memory size. */ 2257 vm86_intcall(0x12, &vmf); 2258 basemem = vmf.vmf_ax; 2259 basemem_setup(); 2260 } 2261 2262 /* 2263 * Fetch the memory map with INT 15:E820. Map page 1 R/W into 2264 * the kernel page table so we can use it as a buffer. The 2265 * kernel will unmap this page later. 2266 */ 2267 pmap_kenter(KERNBASE + (1 << PAGE_SHIFT), 1 << PAGE_SHIFT); 2268 vmc.npages = 0; 2269 smap = (void *)vm86_addpage(&vmc, 1, KERNBASE + (1 << PAGE_SHIFT)); 2270 res = vm86_getptr(&vmc, (vm_offset_t)smap, &vmf.vmf_es, &vmf.vmf_di); 2271 KASSERT(res != 0, ("vm86_getptr() failed: address not found")); 2272 2273 vmf.vmf_ebx = 0; 2274 do { 2275 vmf.vmf_eax = 0xE820; 2276 vmf.vmf_edx = SMAP_SIG; 2277 vmf.vmf_ecx = sizeof(struct bios_smap); 2278 i = vm86_datacall(0x15, &vmf, &vmc); 2279 if (i || vmf.vmf_eax != SMAP_SIG) 2280 break; 2281 has_smap = 1; 2282 if (!add_smap_entry(smap, physmap, &physmap_idx)) 2283 break; 2284 } while (vmf.vmf_ebx != 0); 2285 2286have_smap: 2287 /* 2288 * If we didn't fetch the "base memory" size from INT12, 2289 * figure it out from the SMAP (or just guess). 2290 */ 2291 if (basemem == 0) { 2292 for (i = 0; i <= physmap_idx; i += 2) { 2293 if (physmap[i] == 0x00000000) { 2294 basemem = physmap[i + 1] / 1024; 2295 break; 2296 } 2297 } 2298 2299 /* XXX: If we couldn't find basemem from SMAP, just guess. */ 2300 if (basemem == 0) 2301 basemem = 640; 2302 basemem_setup(); 2303 } 2304 2305 if (physmap[1] != 0) 2306 goto physmap_done; 2307 2308 /* 2309 * If we failed to find an SMAP, figure out the extended 2310 * memory size. We will then build a simple memory map with 2311 * two segments, one for "base memory" and the second for 2312 * "extended memory". Note that "extended memory" starts at a 2313 * physical address of 1MB and that both basemem and extmem 2314 * are in units of 1KB. 2315 * 2316 * First, try to fetch the extended memory size via INT 15:E801. 2317 */ 2318 vmf.vmf_ax = 0xE801; 2319 if (vm86_intcall(0x15, &vmf) == 0) { 2320 extmem = vmf.vmf_cx + vmf.vmf_dx * 64; 2321 } else { 2322 /* 2323 * If INT15:E801 fails, this is our last ditch effort 2324 * to determine the extended memory size. Currently 2325 * we prefer the RTC value over INT15:88. 2326 */ 2327#if 0 2328 vmf.vmf_ah = 0x88; 2329 vm86_intcall(0x15, &vmf); 2330 extmem = vmf.vmf_ax; 2331#else 2332 extmem = rtcin(RTC_EXTLO) + (rtcin(RTC_EXTHI) << 8); 2333#endif 2334 } 2335 2336 /* 2337 * Special hack for chipsets that still remap the 384k hole when 2338 * there's 16MB of memory - this really confuses people that 2339 * are trying to use bus mastering ISA controllers with the 2340 * "16MB limit"; they only have 16MB, but the remapping puts 2341 * them beyond the limit. 2342 * 2343 * If extended memory is between 15-16MB (16-17MB phys address range), 2344 * chop it to 15MB. 2345 */ 2346 if ((extmem > 15 * 1024) && (extmem < 16 * 1024)) 2347 extmem = 15 * 1024; 2348 2349 physmap[0] = 0; 2350 physmap[1] = basemem * 1024; 2351 physmap_idx = 2; 2352 physmap[physmap_idx] = 0x100000; 2353 physmap[physmap_idx + 1] = physmap[physmap_idx] + extmem * 1024; 2354 2355physmap_done: 2356#endif 2357 /* 2358 * Now, physmap contains a map of physical memory. 2359 */ 2360 2361#ifdef SMP 2362 /* make hole for AP bootstrap code */ 2363 physmap[1] = mp_bootaddress(physmap[1]); 2364#endif 2365 2366 /* 2367 * Maxmem isn't the "maximum memory", it's one larger than the 2368 * highest page of the physical address space. It should be 2369 * called something like "Maxphyspage". We may adjust this 2370 * based on ``hw.physmem'' and the results of the memory test. 2371 */ 2372 Maxmem = atop(physmap[physmap_idx + 1]); 2373 2374#ifdef MAXMEM 2375 Maxmem = MAXMEM / 4; 2376#endif 2377 2378 if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable)) 2379 Maxmem = atop(physmem_tunable); 2380 2381 /* 2382 * If we have an SMAP, don't allow MAXMEM or hw.physmem to extend 2383 * the amount of memory in the system. 2384 */ 2385 if (has_smap && Maxmem > atop(physmap[physmap_idx + 1])) 2386 Maxmem = atop(physmap[physmap_idx + 1]); 2387 2388 /* 2389 * By default enable the memory test on real hardware, and disable 2390 * it if we appear to be running in a VM. This avoids touching all 2391 * pages unnecessarily, which doesn't matter on real hardware but is 2392 * bad for shared VM hosts. Use a general name so that 2393 * one could eventually do more with the code than just disable it. 2394 */ 2395 memtest = (vm_guest > VM_GUEST_NO) ? 0 : 1; 2396 TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest); 2397 2398 if (atop(physmap[physmap_idx + 1]) != Maxmem && 2399 (boothowto & RB_VERBOSE)) 2400 printf("Physical memory use set to %ldK\n", Maxmem * 4); 2401 2402 /* 2403 * If Maxmem has been increased beyond what the system has detected, 2404 * extend the last memory segment to the new limit. 2405 */ 2406 if (atop(physmap[physmap_idx + 1]) < Maxmem) 2407 physmap[physmap_idx + 1] = ptoa((vm_paddr_t)Maxmem); 2408 2409 /* call pmap initialization to make new kernel address space */ 2410 pmap_bootstrap(first); 2411 2412 /* 2413 * Size up each available chunk of physical memory. 2414 */ 2415 physmap[0] = PAGE_SIZE; /* mask off page 0 */ 2416 pa_indx = 0; 2417 da_indx = 1; 2418 phys_avail[pa_indx++] = physmap[0]; 2419 phys_avail[pa_indx] = physmap[0]; 2420 dump_avail[da_indx] = physmap[0]; 2421 pte = CMAP1; 2422 2423 /* 2424 * Get dcons buffer address 2425 */ 2426 if (getenv_quad("dcons.addr", &dcons_addr) == 0 || 2427 getenv_quad("dcons.size", &dcons_size) == 0) 2428 dcons_addr = 0; 2429 2430#ifndef XEN 2431 /* 2432 * physmap is in bytes, so when converting to page boundaries, 2433 * round up the start address and round down the end address. 2434 */ 2435 for (i = 0; i <= physmap_idx; i += 2) { 2436 vm_paddr_t end; 2437 2438 end = ptoa((vm_paddr_t)Maxmem); 2439 if (physmap[i + 1] < end) 2440 end = trunc_page(physmap[i + 1]); 2441 for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) { 2442 int tmp, page_bad, full; 2443 int *ptr = (int *)CADDR1; 2444 2445 full = FALSE; 2446 /* 2447 * block out kernel memory as not available. 2448 */ 2449 if (pa >= KERNLOAD && pa < first) 2450 goto do_dump_avail; 2451 2452 /* 2453 * block out dcons buffer 2454 */ 2455 if (dcons_addr > 0 2456 && pa >= trunc_page(dcons_addr) 2457 && pa < dcons_addr + dcons_size) 2458 goto do_dump_avail; 2459 2460 page_bad = FALSE; 2461 if (memtest == 0) 2462 goto skip_memtest; 2463 2464 /* 2465 * map page into kernel: valid, read/write,non-cacheable 2466 */ 2467 *pte = pa | PG_V | PG_RW | PG_N; 2468 invltlb(); 2469 2470 tmp = *(int *)ptr; 2471 /* 2472 * Test for alternating 1's and 0's 2473 */ 2474 *(volatile int *)ptr = 0xaaaaaaaa; 2475 if (*(volatile int *)ptr != 0xaaaaaaaa) 2476 page_bad = TRUE; 2477 /* 2478 * Test for alternating 0's and 1's 2479 */ 2480 *(volatile int *)ptr = 0x55555555; 2481 if (*(volatile int *)ptr != 0x55555555) 2482 page_bad = TRUE; 2483 /* 2484 * Test for all 1's 2485 */ 2486 *(volatile int *)ptr = 0xffffffff; 2487 if (*(volatile int *)ptr != 0xffffffff) 2488 page_bad = TRUE; 2489 /* 2490 * Test for all 0's 2491 */ 2492 *(volatile int *)ptr = 0x0; 2493 if (*(volatile int *)ptr != 0x0) 2494 page_bad = TRUE; 2495 /* 2496 * Restore original value. 2497 */ 2498 *(int *)ptr = tmp; 2499 2500skip_memtest: 2501 /* 2502 * Adjust array of valid/good pages. 2503 */ 2504 if (page_bad == TRUE) 2505 continue; 2506 /* 2507 * If this good page is a continuation of the 2508 * previous set of good pages, then just increase 2509 * the end pointer. Otherwise start a new chunk. 2510 * Note that "end" points one higher than end, 2511 * making the range >= start and < end. 2512 * If we're also doing a speculative memory 2513 * test and we at or past the end, bump up Maxmem 2514 * so that we keep going. The first bad page 2515 * will terminate the loop. 2516 */ 2517 if (phys_avail[pa_indx] == pa) { 2518 phys_avail[pa_indx] += PAGE_SIZE; 2519 } else { 2520 pa_indx++; 2521 if (pa_indx == PHYS_AVAIL_ARRAY_END) { 2522 printf( 2523 "Too many holes in the physical address space, giving up\n"); 2524 pa_indx--; 2525 full = TRUE; 2526 goto do_dump_avail; 2527 } 2528 phys_avail[pa_indx++] = pa; /* start */ 2529 phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */ 2530 } 2531 physmem++; 2532do_dump_avail: 2533 if (dump_avail[da_indx] == pa) { 2534 dump_avail[da_indx] += PAGE_SIZE; 2535 } else { 2536 da_indx++; 2537 if (da_indx == DUMP_AVAIL_ARRAY_END) { 2538 da_indx--; 2539 goto do_next; 2540 } 2541 dump_avail[da_indx++] = pa; /* start */ 2542 dump_avail[da_indx] = pa + PAGE_SIZE; /* end */ 2543 } 2544do_next: 2545 if (full) 2546 break; 2547 } 2548 } 2549 *pte = 0; 2550 invltlb(); 2551#else 2552 phys_avail[0] = physfree; 2553 phys_avail[1] = xen_start_info->nr_pages*PAGE_SIZE; 2554 dump_avail[0] = 0; 2555 dump_avail[1] = xen_start_info->nr_pages*PAGE_SIZE; 2556 2557#endif 2558 2559 /* 2560 * XXX 2561 * The last chunk must contain at least one page plus the message 2562 * buffer to avoid complicating other code (message buffer address 2563 * calculation, etc.). 2564 */ 2565 while (phys_avail[pa_indx - 1] + PAGE_SIZE + 2566 round_page(msgbufsize) >= phys_avail[pa_indx]) { 2567 physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]); 2568 phys_avail[pa_indx--] = 0; 2569 phys_avail[pa_indx--] = 0; 2570 } 2571 2572 Maxmem = atop(phys_avail[pa_indx]); 2573 2574 /* Trim off space for the message buffer. */ 2575 phys_avail[pa_indx] -= round_page(msgbufsize); 2576 2577 /* Map the message buffer. */ 2578 for (off = 0; off < round_page(msgbufsize); off += PAGE_SIZE) 2579 pmap_kenter((vm_offset_t)msgbufp + off, phys_avail[pa_indx] + 2580 off); 2581 2582 PT_UPDATES_FLUSH(); 2583} 2584 2585#ifdef XEN 2586#define MTOPSIZE (1<<(14 + PAGE_SHIFT)) 2587 2588void 2589init386(first) 2590 int first; 2591{ 2592 unsigned long gdtmachpfn; 2593 int error, gsel_tss, metadata_missing, x, pa; 2594 size_t kstack0_sz; 2595 struct pcpu *pc; 2596 struct callback_register event = { 2597 .type = CALLBACKTYPE_event, 2598 .address = {GSEL(GCODE_SEL, SEL_KPL), (unsigned long)Xhypervisor_callback }, 2599 }; 2600 struct callback_register failsafe = { 2601 .type = CALLBACKTYPE_failsafe, 2602 .address = {GSEL(GCODE_SEL, SEL_KPL), (unsigned long)failsafe_callback }, 2603 }; 2604 2605 thread0.td_kstack = proc0kstack; 2606 thread0.td_kstack_pages = KSTACK_PAGES; 2607 kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE; 2608 thread0.td_pcb = (struct pcb *)(thread0.td_kstack + kstack0_sz) - 1; 2609 2610 /* 2611 * This may be done better later if it gets more high level 2612 * components in it. If so just link td->td_proc here. 2613 */ 2614 proc_linkup0(&proc0, &thread0); 2615 2616 metadata_missing = 0; 2617 if (xen_start_info->mod_start) { 2618 preload_metadata = (caddr_t)xen_start_info->mod_start; 2619 preload_bootstrap_relocate(KERNBASE); 2620 } else { 2621 metadata_missing = 1; 2622 } 2623 if (envmode == 1) 2624 kern_envp = static_env; 2625 else if ((caddr_t)xen_start_info->cmd_line) 2626 kern_envp = xen_setbootenv((caddr_t)xen_start_info->cmd_line); 2627 2628 boothowto |= xen_boothowto(kern_envp); 2629 2630 /* Init basic tunables, hz etc */ 2631 init_param1(); 2632 2633 /* 2634 * XEN occupies a portion of the upper virtual address space 2635 * At its base it manages an array mapping machine page frames 2636 * to physical page frames - hence we need to be able to 2637 * access 4GB - (64MB - 4MB + 64k) 2638 */ 2639 gdt_segs[GPRIV_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE); 2640 gdt_segs[GUFS_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE); 2641 gdt_segs[GUGS_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE); 2642 gdt_segs[GCODE_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE); 2643 gdt_segs[GDATA_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE); 2644 gdt_segs[GUCODE_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE); 2645 gdt_segs[GUDATA_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE); 2646 gdt_segs[GBIOSLOWMEM_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE); 2647 2648 pc = &__pcpu[0]; 2649 gdt_segs[GPRIV_SEL].ssd_base = (int) pc; 2650 gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss; 2651 2652 PT_SET_MA(gdt, xpmap_ptom(VTOP(gdt)) | PG_V | PG_RW); 2653 bzero(gdt, PAGE_SIZE); 2654 for (x = 0; x < NGDT; x++) 2655 ssdtosd(&gdt_segs[x], &gdt[x].sd); 2656 2657 mtx_init(&dt_lock, "descriptor tables", NULL, MTX_SPIN); 2658 2659 gdtmachpfn = vtomach(gdt) >> PAGE_SHIFT; 2660 PT_SET_MA(gdt, xpmap_ptom(VTOP(gdt)) | PG_V); 2661 PANIC_IF(HYPERVISOR_set_gdt(&gdtmachpfn, 512) != 0); 2662 lgdt(&r_gdt); 2663 gdtset = 1; 2664 2665 if ((error = HYPERVISOR_set_trap_table(trap_table)) != 0) { 2666 panic("set_trap_table failed - error %d\n", error); 2667 } 2668 2669 error = HYPERVISOR_callback_op(CALLBACKOP_register, &event); 2670 if (error == 0) 2671 error = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe); 2672#if CONFIG_XEN_COMPAT <= 0x030002 2673 if (error == -ENOXENSYS) 2674 HYPERVISOR_set_callbacks(GSEL(GCODE_SEL, SEL_KPL), 2675 (unsigned long)Xhypervisor_callback, 2676 GSEL(GCODE_SEL, SEL_KPL), (unsigned long)failsafe_callback); 2677#endif 2678 pcpu_init(pc, 0, sizeof(struct pcpu)); 2679 for (pa = first; pa < first + DPCPU_SIZE; pa += PAGE_SIZE) 2680 pmap_kenter(pa + KERNBASE, pa); 2681 dpcpu_init((void *)(first + KERNBASE), 0); 2682 first += DPCPU_SIZE; 2683 physfree += DPCPU_SIZE; 2684 init_first += DPCPU_SIZE / PAGE_SIZE; 2685 2686 PCPU_SET(prvspace, pc); 2687 PCPU_SET(curthread, &thread0); 2688 PCPU_SET(curpcb, thread0.td_pcb); 2689 2690 /* 2691 * Initialize mutexes. 2692 * 2693 * icu_lock: in order to allow an interrupt to occur in a critical 2694 * section, to set pcpu->ipending (etc...) properly, we 2695 * must be able to get the icu lock, so it can't be 2696 * under witness. 2697 */ 2698 mutex_init(); 2699 mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS | MTX_NOPROFILE); 2700 2701 /* make ldt memory segments */ 2702 PT_SET_MA(ldt, xpmap_ptom(VTOP(ldt)) | PG_V | PG_RW); 2703 bzero(ldt, PAGE_SIZE); 2704 ldt_segs[LUCODE_SEL].ssd_limit = atop(0 - 1); 2705 ldt_segs[LUDATA_SEL].ssd_limit = atop(0 - 1); 2706 for (x = 0; x < sizeof ldt_segs / sizeof ldt_segs[0]; x++) 2707 ssdtosd(&ldt_segs[x], &ldt[x].sd); 2708 2709 default_proc_ldt.ldt_base = (caddr_t)ldt; 2710 default_proc_ldt.ldt_len = 6; 2711 _default_ldt = (int)&default_proc_ldt; 2712 PCPU_SET(currentldt, _default_ldt); 2713 PT_SET_MA(ldt, *vtopte((unsigned long)ldt) & ~PG_RW); 2714 xen_set_ldt((unsigned long) ldt, (sizeof ldt_segs / sizeof ldt_segs[0])); 2715 2716#if defined(XEN_PRIVILEGED) 2717 /* 2718 * Initialize the i8254 before the console so that console 2719 * initialization can use DELAY(). 2720 */ 2721 i8254_init(); 2722#endif 2723 2724 /* 2725 * Initialize the console before we print anything out. 2726 */ 2727 cninit(); 2728 2729 if (metadata_missing) 2730 printf("WARNING: loader(8) metadata is missing!\n"); 2731 2732#ifdef DEV_ISA 2733#ifdef DEV_ATPIC 2734 elcr_probe(); 2735 atpic_startup(); 2736#else 2737 /* Reset and mask the atpics and leave them shut down. */ 2738 atpic_reset(); 2739 2740 /* 2741 * Point the ICU spurious interrupt vectors at the APIC spurious 2742 * interrupt handler. 2743 */ 2744 setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL, 2745 GSEL(GCODE_SEL, SEL_KPL)); 2746 setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL, 2747 GSEL(GCODE_SEL, SEL_KPL)); 2748#endif 2749#endif 2750 2751#ifdef DDB 2752 ksym_start = bootinfo.bi_symtab; 2753 ksym_end = bootinfo.bi_esymtab; 2754#endif 2755 2756 kdb_init(); 2757 2758#ifdef KDB 2759 if (boothowto & RB_KDB) 2760 kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger"); 2761#endif 2762 2763 finishidentcpu(); /* Final stage of CPU initialization */ 2764 setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, 2765 GSEL(GCODE_SEL, SEL_KPL)); 2766 setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, 2767 GSEL(GCODE_SEL, SEL_KPL)); 2768 initializecpu(); /* Initialize CPU registers */ 2769 2770 /* make an initial tss so cpu can get interrupt stack on syscall! */ 2771 /* Note: -16 is so we can grow the trapframe if we came from vm86 */ 2772 PCPU_SET(common_tss.tss_esp0, thread0.td_kstack + 2773 kstack0_sz - sizeof(struct pcb) - 16); 2774 PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL)); 2775 gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); 2776 HYPERVISOR_stack_switch(GSEL(GDATA_SEL, SEL_KPL), 2777 PCPU_GET(common_tss.tss_esp0)); 2778 2779 /* pointer to selector slot for %fs/%gs */ 2780 PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd); 2781 2782 dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 = 2783 dblfault_tss.tss_esp2 = (int)&dblfault_stack[sizeof(dblfault_stack)]; 2784 dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 = 2785 dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL); 2786#ifdef PAE 2787 dblfault_tss.tss_cr3 = (int)IdlePDPT; 2788#else 2789 dblfault_tss.tss_cr3 = (int)IdlePTD; 2790#endif 2791 dblfault_tss.tss_eip = (int)dblfault_handler; 2792 dblfault_tss.tss_eflags = PSL_KERNEL; 2793 dblfault_tss.tss_ds = dblfault_tss.tss_es = 2794 dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL); 2795 dblfault_tss.tss_fs = GSEL(GPRIV_SEL, SEL_KPL); 2796 dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL); 2797 dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL); 2798 2799 vm86_initialize(); 2800 getmemsize(first); 2801 init_param2(physmem); 2802 2803 /* now running on new page tables, configured,and u/iom is accessible */ 2804 2805 msgbufinit(msgbufp, msgbufsize); 2806 /* transfer to user mode */ 2807 2808 _ucodesel = GSEL(GUCODE_SEL, SEL_UPL); 2809 _udatasel = GSEL(GUDATA_SEL, SEL_UPL); 2810 2811 /* setup proc 0's pcb */ 2812 thread0.td_pcb->pcb_flags = 0; 2813#ifdef PAE 2814 thread0.td_pcb->pcb_cr3 = (int)IdlePDPT; 2815#else 2816 thread0.td_pcb->pcb_cr3 = (int)IdlePTD; 2817#endif 2818 thread0.td_pcb->pcb_ext = 0; 2819 thread0.td_frame = &proc0_tf; 2820 thread0.td_pcb->pcb_fsd = PCPU_GET(fsgs_gdt)[0]; 2821 thread0.td_pcb->pcb_gsd = PCPU_GET(fsgs_gdt)[1]; 2822 2823 cpu_probe_amdc1e(); 2824 cpu_probe_cmpxchg8b(); 2825} 2826 2827#else 2828void 2829init386(first) 2830 int first; 2831{ 2832 struct gate_descriptor *gdp; 2833 int gsel_tss, metadata_missing, x, pa; 2834 size_t kstack0_sz; 2835 struct pcpu *pc; 2836 2837 thread0.td_kstack = proc0kstack; 2838 thread0.td_kstack_pages = KSTACK_PAGES; 2839 kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE; 2840 thread0.td_pcb = (struct pcb *)(thread0.td_kstack + kstack0_sz) - 1; 2841 2842 /* 2843 * This may be done better later if it gets more high level 2844 * components in it. If so just link td->td_proc here. 2845 */ 2846 proc_linkup0(&proc0, &thread0); 2847 2848 metadata_missing = 0; 2849 if (bootinfo.bi_modulep) { 2850 preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE; 2851 preload_bootstrap_relocate(KERNBASE); 2852 } else { 2853 metadata_missing = 1; 2854 } 2855 if (envmode == 1) 2856 kern_envp = static_env; 2857 else if (bootinfo.bi_envp) 2858 kern_envp = (caddr_t)bootinfo.bi_envp + KERNBASE; 2859 2860 /* Init basic tunables, hz etc */ 2861 init_param1(); 2862 2863 /* 2864 * Make gdt memory segments. All segments cover the full 4GB 2865 * of address space and permissions are enforced at page level. 2866 */ 2867 gdt_segs[GCODE_SEL].ssd_limit = atop(0 - 1); 2868 gdt_segs[GDATA_SEL].ssd_limit = atop(0 - 1); 2869 gdt_segs[GUCODE_SEL].ssd_limit = atop(0 - 1); 2870 gdt_segs[GUDATA_SEL].ssd_limit = atop(0 - 1); 2871 gdt_segs[GUFS_SEL].ssd_limit = atop(0 - 1); 2872 gdt_segs[GUGS_SEL].ssd_limit = atop(0 - 1); 2873 2874 pc = &__pcpu[0]; 2875 gdt_segs[GPRIV_SEL].ssd_limit = atop(0 - 1); 2876 gdt_segs[GPRIV_SEL].ssd_base = (int) pc; 2877 gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss; 2878 2879 for (x = 0; x < NGDT; x++) 2880 ssdtosd(&gdt_segs[x], &gdt[x].sd); 2881 2882 r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1; 2883 r_gdt.rd_base = (int) gdt; 2884 mtx_init(&dt_lock, "descriptor tables", NULL, MTX_SPIN); 2885 lgdt(&r_gdt); 2886 2887 pcpu_init(pc, 0, sizeof(struct pcpu)); 2888 for (pa = first; pa < first + DPCPU_SIZE; pa += PAGE_SIZE) 2889 pmap_kenter(pa + KERNBASE, pa); 2890 dpcpu_init((void *)(first + KERNBASE), 0); 2891 first += DPCPU_SIZE; 2892 PCPU_SET(prvspace, pc); 2893 PCPU_SET(curthread, &thread0); 2894 PCPU_SET(curpcb, thread0.td_pcb); 2895 2896 /* 2897 * Initialize mutexes. 2898 * 2899 * icu_lock: in order to allow an interrupt to occur in a critical 2900 * section, to set pcpu->ipending (etc...) properly, we 2901 * must be able to get the icu lock, so it can't be 2902 * under witness. 2903 */ 2904 mutex_init(); 2905 mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS | MTX_NOPROFILE); 2906 2907 /* make ldt memory segments */ 2908 ldt_segs[LUCODE_SEL].ssd_limit = atop(0 - 1); 2909 ldt_segs[LUDATA_SEL].ssd_limit = atop(0 - 1); 2910 for (x = 0; x < sizeof ldt_segs / sizeof ldt_segs[0]; x++) 2911 ssdtosd(&ldt_segs[x], &ldt[x].sd); 2912 2913 _default_ldt = GSEL(GLDT_SEL, SEL_KPL); 2914 lldt(_default_ldt); 2915 PCPU_SET(currentldt, _default_ldt); 2916 2917 /* exceptions */ 2918 for (x = 0; x < NIDT; x++) 2919 setidt(x, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL, 2920 GSEL(GCODE_SEL, SEL_KPL)); 2921 setidt(IDT_DE, &IDTVEC(div), SDT_SYS386TGT, SEL_KPL, 2922 GSEL(GCODE_SEL, SEL_KPL)); 2923 setidt(IDT_DB, &IDTVEC(dbg), SDT_SYS386IGT, SEL_KPL, 2924 GSEL(GCODE_SEL, SEL_KPL)); 2925 setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYS386IGT, SEL_KPL, 2926 GSEL(GCODE_SEL, SEL_KPL)); 2927 setidt(IDT_BP, &IDTVEC(bpt), SDT_SYS386IGT, SEL_UPL, 2928 GSEL(GCODE_SEL, SEL_KPL)); 2929 setidt(IDT_OF, &IDTVEC(ofl), SDT_SYS386TGT, SEL_UPL, 2930 GSEL(GCODE_SEL, SEL_KPL)); 2931 setidt(IDT_BR, &IDTVEC(bnd), SDT_SYS386TGT, SEL_KPL, 2932 GSEL(GCODE_SEL, SEL_KPL)); 2933 setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, 2934 GSEL(GCODE_SEL, SEL_KPL)); 2935 setidt(IDT_NM, &IDTVEC(dna), SDT_SYS386TGT, SEL_KPL 2936 , GSEL(GCODE_SEL, SEL_KPL)); 2937 setidt(IDT_DF, 0, SDT_SYSTASKGT, SEL_KPL, GSEL(GPANIC_SEL, SEL_KPL)); 2938 setidt(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYS386TGT, SEL_KPL, 2939 GSEL(GCODE_SEL, SEL_KPL)); 2940 setidt(IDT_TS, &IDTVEC(tss), SDT_SYS386TGT, SEL_KPL, 2941 GSEL(GCODE_SEL, SEL_KPL)); 2942 setidt(IDT_NP, &IDTVEC(missing), SDT_SYS386TGT, SEL_KPL, 2943 GSEL(GCODE_SEL, SEL_KPL)); 2944 setidt(IDT_SS, &IDTVEC(stk), SDT_SYS386TGT, SEL_KPL, 2945 GSEL(GCODE_SEL, SEL_KPL)); 2946 setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, 2947 GSEL(GCODE_SEL, SEL_KPL)); 2948 setidt(IDT_PF, &IDTVEC(page), SDT_SYS386IGT, SEL_KPL, 2949 GSEL(GCODE_SEL, SEL_KPL)); 2950 setidt(IDT_MF, &IDTVEC(fpu), SDT_SYS386TGT, SEL_KPL, 2951 GSEL(GCODE_SEL, SEL_KPL)); 2952 setidt(IDT_AC, &IDTVEC(align), SDT_SYS386TGT, SEL_KPL, 2953 GSEL(GCODE_SEL, SEL_KPL)); 2954 setidt(IDT_MC, &IDTVEC(mchk), SDT_SYS386TGT, SEL_KPL, 2955 GSEL(GCODE_SEL, SEL_KPL)); 2956 setidt(IDT_XF, &IDTVEC(xmm), SDT_SYS386TGT, SEL_KPL, 2957 GSEL(GCODE_SEL, SEL_KPL)); 2958 setidt(IDT_SYSCALL, &IDTVEC(int0x80_syscall), SDT_SYS386TGT, SEL_UPL, 2959 GSEL(GCODE_SEL, SEL_KPL)); 2960#ifdef KDTRACE_HOOKS 2961 setidt(IDT_DTRACE_RET, &IDTVEC(dtrace_ret), SDT_SYS386TGT, SEL_UPL, 2962 GSEL(GCODE_SEL, SEL_KPL)); 2963#endif 2964 2965 r_idt.rd_limit = sizeof(idt0) - 1; 2966 r_idt.rd_base = (int) idt; 2967 lidt(&r_idt); 2968 2969#ifdef XBOX 2970 /* 2971 * The following code queries the PCI ID of 0:0:0. For the XBOX, 2972 * This should be 0x10de / 0x02a5. 2973 * 2974 * This is exactly what Linux does. 2975 */ 2976 outl(0xcf8, 0x80000000); 2977 if (inl(0xcfc) == 0x02a510de) { 2978 arch_i386_is_xbox = 1; 2979 pic16l_setled(XBOX_LED_GREEN); 2980 2981 /* 2982 * We are an XBOX, but we may have either 64MB or 128MB of 2983 * memory. The PCI host bridge should be programmed for this, 2984 * so we just query it. 2985 */ 2986 outl(0xcf8, 0x80000084); 2987 arch_i386_xbox_memsize = (inl(0xcfc) == 0x7FFFFFF) ? 128 : 64; 2988 } 2989#endif /* XBOX */ 2990 2991 /* 2992 * Initialize the i8254 before the console so that console 2993 * initialization can use DELAY(). 2994 */ 2995 i8254_init(); 2996 2997 /* 2998 * Initialize the console before we print anything out. 2999 */ 3000 cninit(); 3001 3002 if (metadata_missing) 3003 printf("WARNING: loader(8) metadata is missing!\n"); 3004 3005#ifdef DEV_ISA 3006#ifdef DEV_ATPIC 3007 elcr_probe(); 3008 atpic_startup(); 3009#else 3010 /* Reset and mask the atpics and leave them shut down. */ 3011 atpic_reset(); 3012 3013 /* 3014 * Point the ICU spurious interrupt vectors at the APIC spurious 3015 * interrupt handler. 3016 */ 3017 setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL, 3018 GSEL(GCODE_SEL, SEL_KPL)); 3019 setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL, 3020 GSEL(GCODE_SEL, SEL_KPL)); 3021#endif 3022#endif 3023 3024#ifdef DDB 3025 ksym_start = bootinfo.bi_symtab; 3026 ksym_end = bootinfo.bi_esymtab; 3027#endif 3028 3029 kdb_init(); 3030 3031#ifdef KDB 3032 if (boothowto & RB_KDB) 3033 kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger"); 3034#endif 3035 3036 finishidentcpu(); /* Final stage of CPU initialization */ 3037 setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, 3038 GSEL(GCODE_SEL, SEL_KPL)); 3039 setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, 3040 GSEL(GCODE_SEL, SEL_KPL)); 3041 initializecpu(); /* Initialize CPU registers */ 3042 3043 /* make an initial tss so cpu can get interrupt stack on syscall! */ 3044 /* Note: -16 is so we can grow the trapframe if we came from vm86 */ 3045 PCPU_SET(common_tss.tss_esp0, thread0.td_kstack + 3046 kstack0_sz - sizeof(struct pcb) - 16); 3047 PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL)); 3048 gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); 3049 PCPU_SET(tss_gdt, &gdt[GPROC0_SEL].sd); 3050 PCPU_SET(common_tssd, *PCPU_GET(tss_gdt)); 3051 PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16); 3052 ltr(gsel_tss); 3053 3054 /* pointer to selector slot for %fs/%gs */ 3055 PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd); 3056 3057 dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 = 3058 dblfault_tss.tss_esp2 = (int)&dblfault_stack[sizeof(dblfault_stack)]; 3059 dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 = 3060 dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL); 3061#ifdef PAE 3062 dblfault_tss.tss_cr3 = (int)IdlePDPT; 3063#else 3064 dblfault_tss.tss_cr3 = (int)IdlePTD; 3065#endif 3066 dblfault_tss.tss_eip = (int)dblfault_handler; 3067 dblfault_tss.tss_eflags = PSL_KERNEL; 3068 dblfault_tss.tss_ds = dblfault_tss.tss_es = 3069 dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL); 3070 dblfault_tss.tss_fs = GSEL(GPRIV_SEL, SEL_KPL); 3071 dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL); 3072 dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL); 3073 3074 vm86_initialize(); 3075 getmemsize(first); 3076 init_param2(physmem); 3077 3078 /* now running on new page tables, configured,and u/iom is accessible */ 3079 3080 msgbufinit(msgbufp, msgbufsize); 3081 3082 /* make a call gate to reenter kernel with */ 3083 gdp = &ldt[LSYS5CALLS_SEL].gd; 3084 3085 x = (int) &IDTVEC(lcall_syscall); 3086 gdp->gd_looffset = x; 3087 gdp->gd_selector = GSEL(GCODE_SEL,SEL_KPL); 3088 gdp->gd_stkcpy = 1; 3089 gdp->gd_type = SDT_SYS386CGT; 3090 gdp->gd_dpl = SEL_UPL; 3091 gdp->gd_p = 1; 3092 gdp->gd_hioffset = x >> 16; 3093 3094 /* XXX does this work? */ 3095 /* XXX yes! */ 3096 ldt[LBSDICALLS_SEL] = ldt[LSYS5CALLS_SEL]; 3097 ldt[LSOL26CALLS_SEL] = ldt[LSYS5CALLS_SEL]; 3098 3099 /* transfer to user mode */ 3100 3101 _ucodesel = GSEL(GUCODE_SEL, SEL_UPL); 3102 _udatasel = GSEL(GUDATA_SEL, SEL_UPL); 3103 3104 /* setup proc 0's pcb */ 3105 thread0.td_pcb->pcb_flags = 0; 3106#ifdef PAE 3107 thread0.td_pcb->pcb_cr3 = (int)IdlePDPT; 3108#else 3109 thread0.td_pcb->pcb_cr3 = (int)IdlePTD; 3110#endif 3111 thread0.td_pcb->pcb_ext = 0; 3112 thread0.td_frame = &proc0_tf; 3113 3114 cpu_probe_amdc1e(); 3115 cpu_probe_cmpxchg8b();
| 145 146#ifdef DEV_APIC 147#include <machine/apicvar.h> 148#endif 149 150#ifdef DEV_ISA 151#include <x86/isa/icu.h> 152#endif 153 154#ifdef XBOX 155#include <machine/xbox.h> 156 157int arch_i386_is_xbox = 0; 158uint32_t arch_i386_xbox_memsize = 0; 159#endif 160 161#ifdef XEN 162/* XEN includes */ 163#include <machine/xen/xen-os.h> 164#include <xen/hypervisor.h> 165#include <machine/xen/xen-os.h> 166#include <machine/xen/xenvar.h> 167#include <machine/xen/xenfunc.h> 168#include <xen/xen_intr.h> 169 170void Xhypervisor_callback(void); 171void failsafe_callback(void); 172 173extern trap_info_t trap_table[]; 174struct proc_ldt default_proc_ldt; 175extern int init_first; 176int running_xen = 1; 177extern unsigned long physfree; 178#endif /* XEN */ 179 180/* Sanity check for __curthread() */ 181CTASSERT(offsetof(struct pcpu, pc_curthread) == 0); 182 183extern void init386(int first); 184extern void dblfault_handler(void); 185 186extern void printcpuinfo(void); /* XXX header file */ 187extern void finishidentcpu(void); 188extern void panicifcpuunsupported(void); 189 190#define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) 191#define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) 192 193#if !defined(CPU_DISABLE_SSE) && defined(I686_CPU) 194#define CPU_ENABLE_SSE 195#endif 196 197static void cpu_startup(void *); 198static void fpstate_drop(struct thread *td); 199static void get_fpcontext(struct thread *td, mcontext_t *mcp); 200static int set_fpcontext(struct thread *td, const mcontext_t *mcp); 201#ifdef CPU_ENABLE_SSE 202static void set_fpregs_xmm(struct save87 *, struct savexmm *); 203static void fill_fpregs_xmm(struct savexmm *, struct save87 *); 204#endif /* CPU_ENABLE_SSE */ 205SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL); 206 207#ifdef DDB 208extern vm_offset_t ksym_start, ksym_end; 209#endif 210 211/* Intel ICH registers */ 212#define ICH_PMBASE 0x400 213#define ICH_SMI_EN ICH_PMBASE + 0x30 214 215int _udatasel, _ucodesel; 216u_int basemem; 217 218int cold = 1; 219 220#ifdef COMPAT_43 221static void osendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask); 222#endif 223#ifdef COMPAT_FREEBSD4 224static void freebsd4_sendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask); 225#endif 226 227long Maxmem = 0; 228long realmem = 0; 229 230#ifdef PAE 231FEATURE(pae, "Physical Address Extensions"); 232#endif 233 234/* 235 * The number of PHYSMAP entries must be one less than the number of 236 * PHYSSEG entries because the PHYSMAP entry that spans the largest 237 * physical address that is accessible by ISA DMA is split into two 238 * PHYSSEG entries. 239 */ 240#define PHYSMAP_SIZE (2 * (VM_PHYSSEG_MAX - 1)) 241 242vm_paddr_t phys_avail[PHYSMAP_SIZE + 2]; 243vm_paddr_t dump_avail[PHYSMAP_SIZE + 2]; 244 245/* must be 2 less so 0 0 can signal end of chunks */ 246#define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(phys_avail[0])) - 2) 247#define DUMP_AVAIL_ARRAY_END ((sizeof(dump_avail) / sizeof(dump_avail[0])) - 2) 248 249struct kva_md_info kmi; 250 251static struct trapframe proc0_tf; 252struct pcpu __pcpu[MAXCPU]; 253 254struct mtx icu_lock; 255 256struct mem_range_softc mem_range_softc; 257 258static void 259cpu_startup(dummy) 260 void *dummy; 261{ 262 uintmax_t memsize; 263 char *sysenv; 264 265 /* 266 * On MacBooks, we need to disallow the legacy USB circuit to 267 * generate an SMI# because this can cause several problems, 268 * namely: incorrect CPU frequency detection and failure to 269 * start the APs. 270 * We do this by disabling a bit in the SMI_EN (SMI Control and 271 * Enable register) of the Intel ICH LPC Interface Bridge. 272 */ 273 sysenv = getenv("smbios.system.product"); 274 if (sysenv != NULL) { 275 if (strncmp(sysenv, "MacBook1,1", 10) == 0 || 276 strncmp(sysenv, "MacBook3,1", 10) == 0 || 277 strncmp(sysenv, "MacBookPro1,1", 13) == 0 || 278 strncmp(sysenv, "MacBookPro1,2", 13) == 0 || 279 strncmp(sysenv, "MacBookPro3,1", 13) == 0 || 280 strncmp(sysenv, "Macmini1,1", 10) == 0) { 281 if (bootverbose) 282 printf("Disabling LEGACY_USB_EN bit on " 283 "Intel ICH.\n"); 284 outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8); 285 } 286 freeenv(sysenv); 287 } 288 289 /* 290 * Good {morning,afternoon,evening,night}. 291 */ 292 startrtclock(); 293 printcpuinfo(); 294 panicifcpuunsupported(); 295#ifdef PERFMON 296 perfmon_init(); 297#endif 298 realmem = Maxmem; 299 300 /* 301 * Display physical memory if SMBIOS reports reasonable amount. 302 */ 303 memsize = 0; 304 sysenv = getenv("smbios.memory.enabled"); 305 if (sysenv != NULL) { 306 memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10; 307 freeenv(sysenv); 308 } 309 if (memsize < ptoa((uintmax_t)cnt.v_free_count)) 310 memsize = ptoa((uintmax_t)Maxmem); 311 printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20); 312 313 /* 314 * Display any holes after the first chunk of extended memory. 315 */ 316 if (bootverbose) { 317 int indx; 318 319 printf("Physical memory chunk(s):\n"); 320 for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { 321 vm_paddr_t size; 322 323 size = phys_avail[indx + 1] - phys_avail[indx]; 324 printf( 325 "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n", 326 (uintmax_t)phys_avail[indx], 327 (uintmax_t)phys_avail[indx + 1] - 1, 328 (uintmax_t)size, (uintmax_t)size / PAGE_SIZE); 329 } 330 } 331 332 vm_ksubmap_init(&kmi); 333 334 printf("avail memory = %ju (%ju MB)\n", 335 ptoa((uintmax_t)cnt.v_free_count), 336 ptoa((uintmax_t)cnt.v_free_count) / 1048576); 337 338 /* 339 * Set up buffers, so they can be used to read disk labels. 340 */ 341 bufinit(); 342 vm_pager_bufferinit(); 343#ifndef XEN 344 cpu_setregs(); 345#endif 346} 347 348/* 349 * Send an interrupt to process. 350 * 351 * Stack is set up to allow sigcode stored 352 * at top to call routine, followed by kcall 353 * to sigreturn routine below. After sigreturn 354 * resets the signal mask, the stack, and the 355 * frame pointer, it returns to the user 356 * specified pc, psl. 357 */ 358#ifdef COMPAT_43 359static void 360osendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) 361{ 362 struct osigframe sf, *fp; 363 struct proc *p; 364 struct thread *td; 365 struct sigacts *psp; 366 struct trapframe *regs; 367 int sig; 368 int oonstack; 369 370 td = curthread; 371 p = td->td_proc; 372 PROC_LOCK_ASSERT(p, MA_OWNED); 373 sig = ksi->ksi_signo; 374 psp = p->p_sigacts; 375 mtx_assert(&psp->ps_mtx, MA_OWNED); 376 regs = td->td_frame; 377 oonstack = sigonstack(regs->tf_esp); 378 379 /* Allocate space for the signal handler context. */ 380 if ((td->td_pflags & TDP_ALTSTACK) && !oonstack && 381 SIGISMEMBER(psp->ps_sigonstack, sig)) { 382 fp = (struct osigframe *)(td->td_sigstk.ss_sp + 383 td->td_sigstk.ss_size - sizeof(struct osigframe)); 384#if defined(COMPAT_43) 385 td->td_sigstk.ss_flags |= SS_ONSTACK; 386#endif 387 } else 388 fp = (struct osigframe *)regs->tf_esp - 1; 389 390 /* Translate the signal if appropriate. */ 391 if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize) 392 sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; 393 394 /* Build the argument list for the signal handler. */ 395 sf.sf_signum = sig; 396 sf.sf_scp = (register_t)&fp->sf_siginfo.si_sc; 397 bzero(&sf.sf_siginfo, sizeof(sf.sf_siginfo)); 398 if (SIGISMEMBER(psp->ps_siginfo, sig)) { 399 /* Signal handler installed with SA_SIGINFO. */ 400 sf.sf_arg2 = (register_t)&fp->sf_siginfo; 401 sf.sf_siginfo.si_signo = sig; 402 sf.sf_siginfo.si_code = ksi->ksi_code; 403 sf.sf_ahu.sf_action = (__osiginfohandler_t *)catcher; 404 sf.sf_addr = 0; 405 } else { 406 /* Old FreeBSD-style arguments. */ 407 sf.sf_arg2 = ksi->ksi_code; 408 sf.sf_addr = (register_t)ksi->ksi_addr; 409 sf.sf_ahu.sf_handler = catcher; 410 } 411 mtx_unlock(&psp->ps_mtx); 412 PROC_UNLOCK(p); 413 414 /* Save most if not all of trap frame. */ 415 sf.sf_siginfo.si_sc.sc_eax = regs->tf_eax; 416 sf.sf_siginfo.si_sc.sc_ebx = regs->tf_ebx; 417 sf.sf_siginfo.si_sc.sc_ecx = regs->tf_ecx; 418 sf.sf_siginfo.si_sc.sc_edx = regs->tf_edx; 419 sf.sf_siginfo.si_sc.sc_esi = regs->tf_esi; 420 sf.sf_siginfo.si_sc.sc_edi = regs->tf_edi; 421 sf.sf_siginfo.si_sc.sc_cs = regs->tf_cs; 422 sf.sf_siginfo.si_sc.sc_ds = regs->tf_ds; 423 sf.sf_siginfo.si_sc.sc_ss = regs->tf_ss; 424 sf.sf_siginfo.si_sc.sc_es = regs->tf_es; 425 sf.sf_siginfo.si_sc.sc_fs = regs->tf_fs; 426 sf.sf_siginfo.si_sc.sc_gs = rgs(); 427 sf.sf_siginfo.si_sc.sc_isp = regs->tf_isp; 428 429 /* Build the signal context to be used by osigreturn(). */ 430 sf.sf_siginfo.si_sc.sc_onstack = (oonstack) ? 1 : 0; 431 SIG2OSIG(*mask, sf.sf_siginfo.si_sc.sc_mask); 432 sf.sf_siginfo.si_sc.sc_sp = regs->tf_esp; 433 sf.sf_siginfo.si_sc.sc_fp = regs->tf_ebp; 434 sf.sf_siginfo.si_sc.sc_pc = regs->tf_eip; 435 sf.sf_siginfo.si_sc.sc_ps = regs->tf_eflags; 436 sf.sf_siginfo.si_sc.sc_trapno = regs->tf_trapno; 437 sf.sf_siginfo.si_sc.sc_err = regs->tf_err; 438 439 /* 440 * If we're a vm86 process, we want to save the segment registers. 441 * We also change eflags to be our emulated eflags, not the actual 442 * eflags. 443 */ 444 if (regs->tf_eflags & PSL_VM) { 445 /* XXX confusing names: `tf' isn't a trapframe; `regs' is. */ 446 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; 447 struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86; 448 449 sf.sf_siginfo.si_sc.sc_gs = tf->tf_vm86_gs; 450 sf.sf_siginfo.si_sc.sc_fs = tf->tf_vm86_fs; 451 sf.sf_siginfo.si_sc.sc_es = tf->tf_vm86_es; 452 sf.sf_siginfo.si_sc.sc_ds = tf->tf_vm86_ds; 453 454 if (vm86->vm86_has_vme == 0) 455 sf.sf_siginfo.si_sc.sc_ps = 456 (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | 457 (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); 458 459 /* See sendsig() for comments. */ 460 tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP); 461 } 462 463 /* 464 * Copy the sigframe out to the user's stack. 465 */ 466 if (copyout(&sf, fp, sizeof(*fp)) != 0) { 467#ifdef DEBUG 468 printf("process %ld has trashed its stack\n", (long)p->p_pid); 469#endif 470 PROC_LOCK(p); 471 sigexit(td, SIGILL); 472 } 473 474 regs->tf_esp = (int)fp; 475 if (p->p_sysent->sv_sigcode_base != 0) { 476 regs->tf_eip = p->p_sysent->sv_sigcode_base + szsigcode - 477 szosigcode; 478 } else { 479 /* a.out sysentvec does not use shared page */ 480 regs->tf_eip = p->p_sysent->sv_psstrings - szosigcode; 481 } 482 regs->tf_eflags &= ~(PSL_T | PSL_D); 483 regs->tf_cs = _ucodesel; 484 regs->tf_ds = _udatasel; 485 regs->tf_es = _udatasel; 486 regs->tf_fs = _udatasel; 487 load_gs(_udatasel); 488 regs->tf_ss = _udatasel; 489 PROC_LOCK(p); 490 mtx_lock(&psp->ps_mtx); 491} 492#endif /* COMPAT_43 */ 493 494#ifdef COMPAT_FREEBSD4 495static void 496freebsd4_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) 497{ 498 struct sigframe4 sf, *sfp; 499 struct proc *p; 500 struct thread *td; 501 struct sigacts *psp; 502 struct trapframe *regs; 503 int sig; 504 int oonstack; 505 506 td = curthread; 507 p = td->td_proc; 508 PROC_LOCK_ASSERT(p, MA_OWNED); 509 sig = ksi->ksi_signo; 510 psp = p->p_sigacts; 511 mtx_assert(&psp->ps_mtx, MA_OWNED); 512 regs = td->td_frame; 513 oonstack = sigonstack(regs->tf_esp); 514 515 /* Save user context. */ 516 bzero(&sf, sizeof(sf)); 517 sf.sf_uc.uc_sigmask = *mask; 518 sf.sf_uc.uc_stack = td->td_sigstk; 519 sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) 520 ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; 521 sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; 522 sf.sf_uc.uc_mcontext.mc_gs = rgs(); 523 bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs)); 524 bzero(sf.sf_uc.uc_mcontext.mc_fpregs, 525 sizeof(sf.sf_uc.uc_mcontext.mc_fpregs)); 526 bzero(sf.sf_uc.uc_mcontext.__spare__, 527 sizeof(sf.sf_uc.uc_mcontext.__spare__)); 528 bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__)); 529 530 /* Allocate space for the signal handler context. */ 531 if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && 532 SIGISMEMBER(psp->ps_sigonstack, sig)) { 533 sfp = (struct sigframe4 *)(td->td_sigstk.ss_sp + 534 td->td_sigstk.ss_size - sizeof(struct sigframe4)); 535#if defined(COMPAT_43) 536 td->td_sigstk.ss_flags |= SS_ONSTACK; 537#endif 538 } else 539 sfp = (struct sigframe4 *)regs->tf_esp - 1; 540 541 /* Translate the signal if appropriate. */ 542 if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize) 543 sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; 544 545 /* Build the argument list for the signal handler. */ 546 sf.sf_signum = sig; 547 sf.sf_ucontext = (register_t)&sfp->sf_uc; 548 bzero(&sf.sf_si, sizeof(sf.sf_si)); 549 if (SIGISMEMBER(psp->ps_siginfo, sig)) { 550 /* Signal handler installed with SA_SIGINFO. */ 551 sf.sf_siginfo = (register_t)&sfp->sf_si; 552 sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher; 553 554 /* Fill in POSIX parts */ 555 sf.sf_si.si_signo = sig; 556 sf.sf_si.si_code = ksi->ksi_code; 557 sf.sf_si.si_addr = ksi->ksi_addr; 558 } else { 559 /* Old FreeBSD-style arguments. */ 560 sf.sf_siginfo = ksi->ksi_code; 561 sf.sf_addr = (register_t)ksi->ksi_addr; 562 sf.sf_ahu.sf_handler = catcher; 563 } 564 mtx_unlock(&psp->ps_mtx); 565 PROC_UNLOCK(p); 566 567 /* 568 * If we're a vm86 process, we want to save the segment registers. 569 * We also change eflags to be our emulated eflags, not the actual 570 * eflags. 571 */ 572 if (regs->tf_eflags & PSL_VM) { 573 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; 574 struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86; 575 576 sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs; 577 sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs; 578 sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es; 579 sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds; 580 581 if (vm86->vm86_has_vme == 0) 582 sf.sf_uc.uc_mcontext.mc_eflags = 583 (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | 584 (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); 585 586 /* 587 * Clear PSL_NT to inhibit T_TSSFLT faults on return from 588 * syscalls made by the signal handler. This just avoids 589 * wasting time for our lazy fixup of such faults. PSL_NT 590 * does nothing in vm86 mode, but vm86 programs can set it 591 * almost legitimately in probes for old cpu types. 592 */ 593 tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP); 594 } 595 596 /* 597 * Copy the sigframe out to the user's stack. 598 */ 599 if (copyout(&sf, sfp, sizeof(*sfp)) != 0) { 600#ifdef DEBUG 601 printf("process %ld has trashed its stack\n", (long)p->p_pid); 602#endif 603 PROC_LOCK(p); 604 sigexit(td, SIGILL); 605 } 606 607 regs->tf_esp = (int)sfp; 608 regs->tf_eip = p->p_sysent->sv_sigcode_base + szsigcode - 609 szfreebsd4_sigcode; 610 regs->tf_eflags &= ~(PSL_T | PSL_D); 611 regs->tf_cs = _ucodesel; 612 regs->tf_ds = _udatasel; 613 regs->tf_es = _udatasel; 614 regs->tf_fs = _udatasel; 615 regs->tf_ss = _udatasel; 616 PROC_LOCK(p); 617 mtx_lock(&psp->ps_mtx); 618} 619#endif /* COMPAT_FREEBSD4 */ 620 621void 622sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) 623{ 624 struct sigframe sf, *sfp; 625 struct proc *p; 626 struct thread *td; 627 struct sigacts *psp; 628 char *sp; 629 struct trapframe *regs; 630 struct segment_descriptor *sdp; 631 int sig; 632 int oonstack; 633 634 td = curthread; 635 p = td->td_proc; 636 PROC_LOCK_ASSERT(p, MA_OWNED); 637 sig = ksi->ksi_signo; 638 psp = p->p_sigacts; 639 mtx_assert(&psp->ps_mtx, MA_OWNED); 640#ifdef COMPAT_FREEBSD4 641 if (SIGISMEMBER(psp->ps_freebsd4, sig)) { 642 freebsd4_sendsig(catcher, ksi, mask); 643 return; 644 } 645#endif 646#ifdef COMPAT_43 647 if (SIGISMEMBER(psp->ps_osigset, sig)) { 648 osendsig(catcher, ksi, mask); 649 return; 650 } 651#endif 652 regs = td->td_frame; 653 oonstack = sigonstack(regs->tf_esp); 654 655 /* Save user context. */ 656 bzero(&sf, sizeof(sf)); 657 sf.sf_uc.uc_sigmask = *mask; 658 sf.sf_uc.uc_stack = td->td_sigstk; 659 sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) 660 ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; 661 sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; 662 sf.sf_uc.uc_mcontext.mc_gs = rgs(); 663 bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs)); 664 sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */ 665 get_fpcontext(td, &sf.sf_uc.uc_mcontext); 666 fpstate_drop(td); 667 /* 668 * Unconditionally fill the fsbase and gsbase into the mcontext. 669 */ 670 sdp = &td->td_pcb->pcb_fsd; 671 sf.sf_uc.uc_mcontext.mc_fsbase = sdp->sd_hibase << 24 | 672 sdp->sd_lobase; 673 sdp = &td->td_pcb->pcb_gsd; 674 sf.sf_uc.uc_mcontext.mc_gsbase = sdp->sd_hibase << 24 | 675 sdp->sd_lobase; 676 sf.sf_uc.uc_mcontext.mc_flags = 0; 677 bzero(sf.sf_uc.uc_mcontext.mc_spare2, 678 sizeof(sf.sf_uc.uc_mcontext.mc_spare2)); 679 bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__)); 680 681 /* Allocate space for the signal handler context. */ 682 if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && 683 SIGISMEMBER(psp->ps_sigonstack, sig)) { 684 sp = td->td_sigstk.ss_sp + 685 td->td_sigstk.ss_size - sizeof(struct sigframe); 686#if defined(COMPAT_43) 687 td->td_sigstk.ss_flags |= SS_ONSTACK; 688#endif 689 } else 690 sp = (char *)regs->tf_esp - sizeof(struct sigframe); 691 /* Align to 16 bytes. */ 692 sfp = (struct sigframe *)((unsigned int)sp & ~0xF); 693 694 /* Translate the signal if appropriate. */ 695 if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize) 696 sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; 697 698 /* Build the argument list for the signal handler. */ 699 sf.sf_signum = sig; 700 sf.sf_ucontext = (register_t)&sfp->sf_uc; 701 bzero(&sf.sf_si, sizeof(sf.sf_si)); 702 if (SIGISMEMBER(psp->ps_siginfo, sig)) { 703 /* Signal handler installed with SA_SIGINFO. */ 704 sf.sf_siginfo = (register_t)&sfp->sf_si; 705 sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher; 706 707 /* Fill in POSIX parts */ 708 sf.sf_si = ksi->ksi_info; 709 sf.sf_si.si_signo = sig; /* maybe a translated signal */ 710 } else { 711 /* Old FreeBSD-style arguments. */ 712 sf.sf_siginfo = ksi->ksi_code; 713 sf.sf_addr = (register_t)ksi->ksi_addr; 714 sf.sf_ahu.sf_handler = catcher; 715 } 716 mtx_unlock(&psp->ps_mtx); 717 PROC_UNLOCK(p); 718 719 /* 720 * If we're a vm86 process, we want to save the segment registers. 721 * We also change eflags to be our emulated eflags, not the actual 722 * eflags. 723 */ 724 if (regs->tf_eflags & PSL_VM) { 725 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; 726 struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86; 727 728 sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs; 729 sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs; 730 sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es; 731 sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds; 732 733 if (vm86->vm86_has_vme == 0) 734 sf.sf_uc.uc_mcontext.mc_eflags = 735 (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | 736 (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); 737 738 /* 739 * Clear PSL_NT to inhibit T_TSSFLT faults on return from 740 * syscalls made by the signal handler. This just avoids 741 * wasting time for our lazy fixup of such faults. PSL_NT 742 * does nothing in vm86 mode, but vm86 programs can set it 743 * almost legitimately in probes for old cpu types. 744 */ 745 tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP); 746 } 747 748 /* 749 * Copy the sigframe out to the user's stack. 750 */ 751 if (copyout(&sf, sfp, sizeof(*sfp)) != 0) { 752#ifdef DEBUG 753 printf("process %ld has trashed its stack\n", (long)p->p_pid); 754#endif 755 PROC_LOCK(p); 756 sigexit(td, SIGILL); 757 } 758 759 regs->tf_esp = (int)sfp; 760 regs->tf_eip = p->p_sysent->sv_sigcode_base; 761 regs->tf_eflags &= ~(PSL_T | PSL_D); 762 regs->tf_cs = _ucodesel; 763 regs->tf_ds = _udatasel; 764 regs->tf_es = _udatasel; 765 regs->tf_fs = _udatasel; 766 regs->tf_ss = _udatasel; 767 PROC_LOCK(p); 768 mtx_lock(&psp->ps_mtx); 769} 770 771/* 772 * System call to cleanup state after a signal 773 * has been taken. Reset signal mask and 774 * stack state from context left by sendsig (above). 775 * Return to previous pc and psl as specified by 776 * context left by sendsig. Check carefully to 777 * make sure that the user has not modified the 778 * state to gain improper privileges. 779 * 780 * MPSAFE 781 */ 782#ifdef COMPAT_43 783int 784osigreturn(td, uap) 785 struct thread *td; 786 struct osigreturn_args /* { 787 struct osigcontext *sigcntxp; 788 } */ *uap; 789{ 790 struct osigcontext sc; 791 struct trapframe *regs; 792 struct osigcontext *scp; 793 int eflags, error; 794 ksiginfo_t ksi; 795 796 regs = td->td_frame; 797 error = copyin(uap->sigcntxp, &sc, sizeof(sc)); 798 if (error != 0) 799 return (error); 800 scp = ≻ 801 eflags = scp->sc_ps; 802 if (eflags & PSL_VM) { 803 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; 804 struct vm86_kernel *vm86; 805 806 /* 807 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't 808 * set up the vm86 area, and we can't enter vm86 mode. 809 */ 810 if (td->td_pcb->pcb_ext == 0) 811 return (EINVAL); 812 vm86 = &td->td_pcb->pcb_ext->ext_vm86; 813 if (vm86->vm86_inited == 0) 814 return (EINVAL); 815 816 /* Go back to user mode if both flags are set. */ 817 if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) { 818 ksiginfo_init_trap(&ksi); 819 ksi.ksi_signo = SIGBUS; 820 ksi.ksi_code = BUS_OBJERR; 821 ksi.ksi_addr = (void *)regs->tf_eip; 822 trapsignal(td, &ksi); 823 } 824 825 if (vm86->vm86_has_vme) { 826 eflags = (tf->tf_eflags & ~VME_USERCHANGE) | 827 (eflags & VME_USERCHANGE) | PSL_VM; 828 } else { 829 vm86->vm86_eflags = eflags; /* save VIF, VIP */ 830 eflags = (tf->tf_eflags & ~VM_USERCHANGE) | 831 (eflags & VM_USERCHANGE) | PSL_VM; 832 } 833 tf->tf_vm86_ds = scp->sc_ds; 834 tf->tf_vm86_es = scp->sc_es; 835 tf->tf_vm86_fs = scp->sc_fs; 836 tf->tf_vm86_gs = scp->sc_gs; 837 tf->tf_ds = _udatasel; 838 tf->tf_es = _udatasel; 839 tf->tf_fs = _udatasel; 840 } else { 841 /* 842 * Don't allow users to change privileged or reserved flags. 843 */ 844 /* 845 * XXX do allow users to change the privileged flag PSL_RF. 846 * The cpu sets PSL_RF in tf_eflags for faults. Debuggers 847 * should sometimes set it there too. tf_eflags is kept in 848 * the signal context during signal handling and there is no 849 * other place to remember it, so the PSL_RF bit may be 850 * corrupted by the signal handler without us knowing. 851 * Corruption of the PSL_RF bit at worst causes one more or 852 * one less debugger trap, so allowing it is fairly harmless. 853 */ 854 if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) { 855 return (EINVAL); 856 } 857 858 /* 859 * Don't allow users to load a valid privileged %cs. Let the 860 * hardware check for invalid selectors, excess privilege in 861 * other selectors, invalid %eip's and invalid %esp's. 862 */ 863 if (!CS_SECURE(scp->sc_cs)) { 864 ksiginfo_init_trap(&ksi); 865 ksi.ksi_signo = SIGBUS; 866 ksi.ksi_code = BUS_OBJERR; 867 ksi.ksi_trapno = T_PROTFLT; 868 ksi.ksi_addr = (void *)regs->tf_eip; 869 trapsignal(td, &ksi); 870 return (EINVAL); 871 } 872 regs->tf_ds = scp->sc_ds; 873 regs->tf_es = scp->sc_es; 874 regs->tf_fs = scp->sc_fs; 875 } 876 877 /* Restore remaining registers. */ 878 regs->tf_eax = scp->sc_eax; 879 regs->tf_ebx = scp->sc_ebx; 880 regs->tf_ecx = scp->sc_ecx; 881 regs->tf_edx = scp->sc_edx; 882 regs->tf_esi = scp->sc_esi; 883 regs->tf_edi = scp->sc_edi; 884 regs->tf_cs = scp->sc_cs; 885 regs->tf_ss = scp->sc_ss; 886 regs->tf_isp = scp->sc_isp; 887 regs->tf_ebp = scp->sc_fp; 888 regs->tf_esp = scp->sc_sp; 889 regs->tf_eip = scp->sc_pc; 890 regs->tf_eflags = eflags; 891 892#if defined(COMPAT_43) 893 if (scp->sc_onstack & 1) 894 td->td_sigstk.ss_flags |= SS_ONSTACK; 895 else 896 td->td_sigstk.ss_flags &= ~SS_ONSTACK; 897#endif 898 kern_sigprocmask(td, SIG_SETMASK, (sigset_t *)&scp->sc_mask, NULL, 899 SIGPROCMASK_OLD); 900 return (EJUSTRETURN); 901} 902#endif /* COMPAT_43 */ 903 904#ifdef COMPAT_FREEBSD4 905/* 906 * MPSAFE 907 */ 908int 909freebsd4_sigreturn(td, uap) 910 struct thread *td; 911 struct freebsd4_sigreturn_args /* { 912 const ucontext4 *sigcntxp; 913 } */ *uap; 914{ 915 struct ucontext4 uc; 916 struct trapframe *regs; 917 struct ucontext4 *ucp; 918 int cs, eflags, error; 919 ksiginfo_t ksi; 920 921 error = copyin(uap->sigcntxp, &uc, sizeof(uc)); 922 if (error != 0) 923 return (error); 924 ucp = &uc; 925 regs = td->td_frame; 926 eflags = ucp->uc_mcontext.mc_eflags; 927 if (eflags & PSL_VM) { 928 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; 929 struct vm86_kernel *vm86; 930 931 /* 932 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't 933 * set up the vm86 area, and we can't enter vm86 mode. 934 */ 935 if (td->td_pcb->pcb_ext == 0) 936 return (EINVAL); 937 vm86 = &td->td_pcb->pcb_ext->ext_vm86; 938 if (vm86->vm86_inited == 0) 939 return (EINVAL); 940 941 /* Go back to user mode if both flags are set. */ 942 if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) { 943 ksiginfo_init_trap(&ksi); 944 ksi.ksi_signo = SIGBUS; 945 ksi.ksi_code = BUS_OBJERR; 946 ksi.ksi_addr = (void *)regs->tf_eip; 947 trapsignal(td, &ksi); 948 } 949 if (vm86->vm86_has_vme) { 950 eflags = (tf->tf_eflags & ~VME_USERCHANGE) | 951 (eflags & VME_USERCHANGE) | PSL_VM; 952 } else { 953 vm86->vm86_eflags = eflags; /* save VIF, VIP */ 954 eflags = (tf->tf_eflags & ~VM_USERCHANGE) | 955 (eflags & VM_USERCHANGE) | PSL_VM; 956 } 957 bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe)); 958 tf->tf_eflags = eflags; 959 tf->tf_vm86_ds = tf->tf_ds; 960 tf->tf_vm86_es = tf->tf_es; 961 tf->tf_vm86_fs = tf->tf_fs; 962 tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs; 963 tf->tf_ds = _udatasel; 964 tf->tf_es = _udatasel; 965 tf->tf_fs = _udatasel; 966 } else { 967 /* 968 * Don't allow users to change privileged or reserved flags. 969 */ 970 /* 971 * XXX do allow users to change the privileged flag PSL_RF. 972 * The cpu sets PSL_RF in tf_eflags for faults. Debuggers 973 * should sometimes set it there too. tf_eflags is kept in 974 * the signal context during signal handling and there is no 975 * other place to remember it, so the PSL_RF bit may be 976 * corrupted by the signal handler without us knowing. 977 * Corruption of the PSL_RF bit at worst causes one more or 978 * one less debugger trap, so allowing it is fairly harmless. 979 */ 980 if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) { 981 uprintf("pid %d (%s): freebsd4_sigreturn eflags = 0x%x\n", 982 td->td_proc->p_pid, td->td_name, eflags); 983 return (EINVAL); 984 } 985 986 /* 987 * Don't allow users to load a valid privileged %cs. Let the 988 * hardware check for invalid selectors, excess privilege in 989 * other selectors, invalid %eip's and invalid %esp's. 990 */ 991 cs = ucp->uc_mcontext.mc_cs; 992 if (!CS_SECURE(cs)) { 993 uprintf("pid %d (%s): freebsd4_sigreturn cs = 0x%x\n", 994 td->td_proc->p_pid, td->td_name, cs); 995 ksiginfo_init_trap(&ksi); 996 ksi.ksi_signo = SIGBUS; 997 ksi.ksi_code = BUS_OBJERR; 998 ksi.ksi_trapno = T_PROTFLT; 999 ksi.ksi_addr = (void *)regs->tf_eip; 1000 trapsignal(td, &ksi); 1001 return (EINVAL); 1002 } 1003 1004 bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs)); 1005 } 1006 1007#if defined(COMPAT_43) 1008 if (ucp->uc_mcontext.mc_onstack & 1) 1009 td->td_sigstk.ss_flags |= SS_ONSTACK; 1010 else 1011 td->td_sigstk.ss_flags &= ~SS_ONSTACK; 1012#endif 1013 kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0); 1014 return (EJUSTRETURN); 1015} 1016#endif /* COMPAT_FREEBSD4 */ 1017 1018/* 1019 * MPSAFE 1020 */ 1021int 1022sys_sigreturn(td, uap) 1023 struct thread *td; 1024 struct sigreturn_args /* { 1025 const struct __ucontext *sigcntxp; 1026 } */ *uap; 1027{ 1028 ucontext_t uc; 1029 struct trapframe *regs; 1030 ucontext_t *ucp; 1031 int cs, eflags, error, ret; 1032 ksiginfo_t ksi; 1033 1034 error = copyin(uap->sigcntxp, &uc, sizeof(uc)); 1035 if (error != 0) 1036 return (error); 1037 ucp = &uc; 1038 regs = td->td_frame; 1039 eflags = ucp->uc_mcontext.mc_eflags; 1040 if (eflags & PSL_VM) { 1041 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; 1042 struct vm86_kernel *vm86; 1043 1044 /* 1045 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't 1046 * set up the vm86 area, and we can't enter vm86 mode. 1047 */ 1048 if (td->td_pcb->pcb_ext == 0) 1049 return (EINVAL); 1050 vm86 = &td->td_pcb->pcb_ext->ext_vm86; 1051 if (vm86->vm86_inited == 0) 1052 return (EINVAL); 1053 1054 /* Go back to user mode if both flags are set. */ 1055 if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) { 1056 ksiginfo_init_trap(&ksi); 1057 ksi.ksi_signo = SIGBUS; 1058 ksi.ksi_code = BUS_OBJERR; 1059 ksi.ksi_addr = (void *)regs->tf_eip; 1060 trapsignal(td, &ksi); 1061 } 1062 1063 if (vm86->vm86_has_vme) { 1064 eflags = (tf->tf_eflags & ~VME_USERCHANGE) | 1065 (eflags & VME_USERCHANGE) | PSL_VM; 1066 } else { 1067 vm86->vm86_eflags = eflags; /* save VIF, VIP */ 1068 eflags = (tf->tf_eflags & ~VM_USERCHANGE) | 1069 (eflags & VM_USERCHANGE) | PSL_VM; 1070 } 1071 bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe)); 1072 tf->tf_eflags = eflags; 1073 tf->tf_vm86_ds = tf->tf_ds; 1074 tf->tf_vm86_es = tf->tf_es; 1075 tf->tf_vm86_fs = tf->tf_fs; 1076 tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs; 1077 tf->tf_ds = _udatasel; 1078 tf->tf_es = _udatasel; 1079 tf->tf_fs = _udatasel; 1080 } else { 1081 /* 1082 * Don't allow users to change privileged or reserved flags. 1083 */ 1084 /* 1085 * XXX do allow users to change the privileged flag PSL_RF. 1086 * The cpu sets PSL_RF in tf_eflags for faults. Debuggers 1087 * should sometimes set it there too. tf_eflags is kept in 1088 * the signal context during signal handling and there is no 1089 * other place to remember it, so the PSL_RF bit may be 1090 * corrupted by the signal handler without us knowing. 1091 * Corruption of the PSL_RF bit at worst causes one more or 1092 * one less debugger trap, so allowing it is fairly harmless. 1093 */ 1094 if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) { 1095 uprintf("pid %d (%s): sigreturn eflags = 0x%x\n", 1096 td->td_proc->p_pid, td->td_name, eflags); 1097 return (EINVAL); 1098 } 1099 1100 /* 1101 * Don't allow users to load a valid privileged %cs. Let the 1102 * hardware check for invalid selectors, excess privilege in 1103 * other selectors, invalid %eip's and invalid %esp's. 1104 */ 1105 cs = ucp->uc_mcontext.mc_cs; 1106 if (!CS_SECURE(cs)) { 1107 uprintf("pid %d (%s): sigreturn cs = 0x%x\n", 1108 td->td_proc->p_pid, td->td_name, cs); 1109 ksiginfo_init_trap(&ksi); 1110 ksi.ksi_signo = SIGBUS; 1111 ksi.ksi_code = BUS_OBJERR; 1112 ksi.ksi_trapno = T_PROTFLT; 1113 ksi.ksi_addr = (void *)regs->tf_eip; 1114 trapsignal(td, &ksi); 1115 return (EINVAL); 1116 } 1117 1118 ret = set_fpcontext(td, &ucp->uc_mcontext); 1119 if (ret != 0) 1120 return (ret); 1121 bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs)); 1122 } 1123 1124#if defined(COMPAT_43) 1125 if (ucp->uc_mcontext.mc_onstack & 1) 1126 td->td_sigstk.ss_flags |= SS_ONSTACK; 1127 else 1128 td->td_sigstk.ss_flags &= ~SS_ONSTACK; 1129#endif 1130 1131 kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0); 1132 return (EJUSTRETURN); 1133} 1134 1135/* 1136 * Machine dependent boot() routine 1137 * 1138 * I haven't seen anything to put here yet 1139 * Possibly some stuff might be grafted back here from boot() 1140 */ 1141void 1142cpu_boot(int howto) 1143{ 1144} 1145 1146/* 1147 * Flush the D-cache for non-DMA I/O so that the I-cache can 1148 * be made coherent later. 1149 */ 1150void 1151cpu_flush_dcache(void *ptr, size_t len) 1152{ 1153 /* Not applicable */ 1154} 1155 1156/* Get current clock frequency for the given cpu id. */ 1157int 1158cpu_est_clockrate(int cpu_id, uint64_t *rate) 1159{ 1160 uint64_t tsc1, tsc2; 1161 uint64_t acnt, mcnt, perf; 1162 register_t reg; 1163 1164 if (pcpu_find(cpu_id) == NULL || rate == NULL) 1165 return (EINVAL); 1166 if ((cpu_feature & CPUID_TSC) == 0) 1167 return (EOPNOTSUPP); 1168 1169 /* 1170 * If TSC is P-state invariant and APERF/MPERF MSRs do not exist, 1171 * DELAY(9) based logic fails. 1172 */ 1173 if (tsc_is_invariant && !tsc_perf_stat) 1174 return (EOPNOTSUPP); 1175 1176#ifdef SMP 1177 if (smp_cpus > 1) { 1178 /* Schedule ourselves on the indicated cpu. */ 1179 thread_lock(curthread); 1180 sched_bind(curthread, cpu_id); 1181 thread_unlock(curthread); 1182 } 1183#endif 1184 1185 /* Calibrate by measuring a short delay. */ 1186 reg = intr_disable(); 1187 if (tsc_is_invariant) { 1188 wrmsr(MSR_MPERF, 0); 1189 wrmsr(MSR_APERF, 0); 1190 tsc1 = rdtsc(); 1191 DELAY(1000); 1192 mcnt = rdmsr(MSR_MPERF); 1193 acnt = rdmsr(MSR_APERF); 1194 tsc2 = rdtsc(); 1195 intr_restore(reg); 1196 perf = 1000 * acnt / mcnt; 1197 *rate = (tsc2 - tsc1) * perf; 1198 } else { 1199 tsc1 = rdtsc(); 1200 DELAY(1000); 1201 tsc2 = rdtsc(); 1202 intr_restore(reg); 1203 *rate = (tsc2 - tsc1) * 1000; 1204 } 1205 1206#ifdef SMP 1207 if (smp_cpus > 1) { 1208 thread_lock(curthread); 1209 sched_unbind(curthread); 1210 thread_unlock(curthread); 1211 } 1212#endif 1213 1214 return (0); 1215} 1216 1217#ifdef XEN 1218 1219void 1220cpu_halt(void) 1221{ 1222 HYPERVISOR_shutdown(SHUTDOWN_poweroff); 1223} 1224 1225int scheduler_running; 1226 1227static void 1228cpu_idle_hlt(sbintime_t sbt) 1229{ 1230 1231 scheduler_running = 1; 1232 enable_intr(); 1233 idle_block(); 1234} 1235 1236#else 1237/* 1238 * Shutdown the CPU as much as possible 1239 */ 1240void 1241cpu_halt(void) 1242{ 1243 for (;;) 1244 halt(); 1245} 1246 1247#endif 1248 1249void (*cpu_idle_hook)(sbintime_t) = NULL; /* ACPI idle hook. */ 1250static int cpu_ident_amdc1e = 0; /* AMD C1E supported. */ 1251static int idle_mwait = 1; /* Use MONITOR/MWAIT for short idle. */ 1252TUNABLE_INT("machdep.idle_mwait", &idle_mwait); 1253SYSCTL_INT(_machdep, OID_AUTO, idle_mwait, CTLFLAG_RW, &idle_mwait, 1254 0, "Use MONITOR/MWAIT for short idle"); 1255 1256#define STATE_RUNNING 0x0 1257#define STATE_MWAIT 0x1 1258#define STATE_SLEEPING 0x2 1259 1260static void 1261cpu_idle_acpi(sbintime_t sbt) 1262{ 1263 int *state; 1264 1265 state = (int *)PCPU_PTR(monitorbuf); 1266 *state = STATE_SLEEPING; 1267 1268 /* See comments in cpu_idle_hlt(). */ 1269 disable_intr(); 1270 if (sched_runnable()) 1271 enable_intr(); 1272 else if (cpu_idle_hook) 1273 cpu_idle_hook(sbt); 1274 else 1275 __asm __volatile("sti; hlt"); 1276 *state = STATE_RUNNING; 1277} 1278 1279#ifndef XEN 1280static void 1281cpu_idle_hlt(sbintime_t sbt) 1282{ 1283 int *state; 1284 1285 state = (int *)PCPU_PTR(monitorbuf); 1286 *state = STATE_SLEEPING; 1287 1288 /* 1289 * Since we may be in a critical section from cpu_idle(), if 1290 * an interrupt fires during that critical section we may have 1291 * a pending preemption. If the CPU halts, then that thread 1292 * may not execute until a later interrupt awakens the CPU. 1293 * To handle this race, check for a runnable thread after 1294 * disabling interrupts and immediately return if one is 1295 * found. Also, we must absolutely guarentee that hlt is 1296 * the next instruction after sti. This ensures that any 1297 * interrupt that fires after the call to disable_intr() will 1298 * immediately awaken the CPU from hlt. Finally, please note 1299 * that on x86 this works fine because of interrupts enabled only 1300 * after the instruction following sti takes place, while IF is set 1301 * to 1 immediately, allowing hlt instruction to acknowledge the 1302 * interrupt. 1303 */ 1304 disable_intr(); 1305 if (sched_runnable()) 1306 enable_intr(); 1307 else 1308 __asm __volatile("sti; hlt"); 1309 *state = STATE_RUNNING; 1310} 1311#endif 1312 1313/* 1314 * MWAIT cpu power states. Lower 4 bits are sub-states. 1315 */ 1316#define MWAIT_C0 0xf0 1317#define MWAIT_C1 0x00 1318#define MWAIT_C2 0x10 1319#define MWAIT_C3 0x20 1320#define MWAIT_C4 0x30 1321 1322static void 1323cpu_idle_mwait(sbintime_t sbt) 1324{ 1325 int *state; 1326 1327 state = (int *)PCPU_PTR(monitorbuf); 1328 *state = STATE_MWAIT; 1329 1330 /* See comments in cpu_idle_hlt(). */ 1331 disable_intr(); 1332 if (sched_runnable()) { 1333 enable_intr(); 1334 *state = STATE_RUNNING; 1335 return; 1336 } 1337 cpu_monitor(state, 0, 0); 1338 if (*state == STATE_MWAIT) 1339 __asm __volatile("sti; mwait" : : "a" (MWAIT_C1), "c" (0)); 1340 else 1341 enable_intr(); 1342 *state = STATE_RUNNING; 1343} 1344 1345static void 1346cpu_idle_spin(sbintime_t sbt) 1347{ 1348 int *state; 1349 int i; 1350 1351 state = (int *)PCPU_PTR(monitorbuf); 1352 *state = STATE_RUNNING; 1353 1354 /* 1355 * The sched_runnable() call is racy but as long as there is 1356 * a loop missing it one time will have just a little impact if any 1357 * (and it is much better than missing the check at all). 1358 */ 1359 for (i = 0; i < 1000; i++) { 1360 if (sched_runnable()) 1361 return; 1362 cpu_spinwait(); 1363 } 1364} 1365 1366/* 1367 * C1E renders the local APIC timer dead, so we disable it by 1368 * reading the Interrupt Pending Message register and clearing 1369 * both C1eOnCmpHalt (bit 28) and SmiOnCmpHalt (bit 27). 1370 * 1371 * Reference: 1372 * "BIOS and Kernel Developer's Guide for AMD NPT Family 0Fh Processors" 1373 * #32559 revision 3.00+ 1374 */ 1375#define MSR_AMDK8_IPM 0xc0010055 1376#define AMDK8_SMIONCMPHALT (1ULL << 27) 1377#define AMDK8_C1EONCMPHALT (1ULL << 28) 1378#define AMDK8_CMPHALT (AMDK8_SMIONCMPHALT | AMDK8_C1EONCMPHALT) 1379 1380static void 1381cpu_probe_amdc1e(void) 1382{ 1383 1384 /* 1385 * Detect the presence of C1E capability mostly on latest 1386 * dual-cores (or future) k8 family. 1387 */ 1388 if (cpu_vendor_id == CPU_VENDOR_AMD && 1389 (cpu_id & 0x00000f00) == 0x00000f00 && 1390 (cpu_id & 0x0fff0000) >= 0x00040000) { 1391 cpu_ident_amdc1e = 1; 1392 } 1393} 1394 1395#ifdef XEN 1396void (*cpu_idle_fn)(sbintime_t) = cpu_idle_hlt; 1397#else 1398void (*cpu_idle_fn)(sbintime_t) = cpu_idle_acpi; 1399#endif 1400 1401void 1402cpu_idle(int busy) 1403{ 1404#ifndef XEN 1405 uint64_t msr; 1406#endif 1407 sbintime_t sbt = -1; 1408 1409 CTR2(KTR_SPARE2, "cpu_idle(%d) at %d", 1410 busy, curcpu); 1411#if defined(MP_WATCHDOG) && !defined(XEN) 1412 ap_watchdog(PCPU_GET(cpuid)); 1413#endif 1414#ifndef XEN 1415 /* If we are busy - try to use fast methods. */ 1416 if (busy) { 1417 if ((cpu_feature2 & CPUID2_MON) && idle_mwait) { 1418 cpu_idle_mwait(busy); 1419 goto out; 1420 } 1421 } 1422#endif 1423 1424 /* If we have time - switch timers into idle mode. */ 1425 if (!busy) { 1426 critical_enter(); 1427 sbt = cpu_idleclock(); 1428 } 1429 1430#ifndef XEN 1431 /* Apply AMD APIC timer C1E workaround. */ 1432 if (cpu_ident_amdc1e && cpu_disable_deep_sleep) { 1433 msr = rdmsr(MSR_AMDK8_IPM); 1434 if (msr & AMDK8_CMPHALT) 1435 wrmsr(MSR_AMDK8_IPM, msr & ~AMDK8_CMPHALT); 1436 } 1437#endif 1438 1439 /* Call main idle method. */ 1440 cpu_idle_fn(sbt); 1441 1442 /* Switch timers mack into active mode. */ 1443 if (!busy) { 1444 cpu_activeclock(); 1445 critical_exit(); 1446 } 1447#ifndef XEN 1448out: 1449#endif 1450 CTR2(KTR_SPARE2, "cpu_idle(%d) at %d done", 1451 busy, curcpu); 1452} 1453 1454int 1455cpu_idle_wakeup(int cpu) 1456{ 1457 struct pcpu *pcpu; 1458 int *state; 1459 1460 pcpu = pcpu_find(cpu); 1461 state = (int *)pcpu->pc_monitorbuf; 1462 /* 1463 * This doesn't need to be atomic since missing the race will 1464 * simply result in unnecessary IPIs. 1465 */ 1466 if (*state == STATE_SLEEPING) 1467 return (0); 1468 if (*state == STATE_MWAIT) 1469 *state = STATE_RUNNING; 1470 return (1); 1471} 1472 1473/* 1474 * Ordered by speed/power consumption. 1475 */ 1476struct { 1477 void *id_fn; 1478 char *id_name; 1479} idle_tbl[] = { 1480 { cpu_idle_spin, "spin" }, 1481 { cpu_idle_mwait, "mwait" }, 1482 { cpu_idle_hlt, "hlt" }, 1483 { cpu_idle_acpi, "acpi" }, 1484 { NULL, NULL } 1485}; 1486 1487static int 1488idle_sysctl_available(SYSCTL_HANDLER_ARGS) 1489{ 1490 char *avail, *p; 1491 int error; 1492 int i; 1493 1494 avail = malloc(256, M_TEMP, M_WAITOK); 1495 p = avail; 1496 for (i = 0; idle_tbl[i].id_name != NULL; i++) { 1497 if (strstr(idle_tbl[i].id_name, "mwait") && 1498 (cpu_feature2 & CPUID2_MON) == 0) 1499 continue; 1500 if (strcmp(idle_tbl[i].id_name, "acpi") == 0 && 1501 cpu_idle_hook == NULL) 1502 continue; 1503 p += sprintf(p, "%s%s", p != avail ? ", " : "", 1504 idle_tbl[i].id_name); 1505 } 1506 error = sysctl_handle_string(oidp, avail, 0, req); 1507 free(avail, M_TEMP); 1508 return (error); 1509} 1510 1511SYSCTL_PROC(_machdep, OID_AUTO, idle_available, CTLTYPE_STRING | CTLFLAG_RD, 1512 0, 0, idle_sysctl_available, "A", "list of available idle functions"); 1513 1514static int 1515idle_sysctl(SYSCTL_HANDLER_ARGS) 1516{ 1517 char buf[16]; 1518 int error; 1519 char *p; 1520 int i; 1521 1522 p = "unknown"; 1523 for (i = 0; idle_tbl[i].id_name != NULL; i++) { 1524 if (idle_tbl[i].id_fn == cpu_idle_fn) { 1525 p = idle_tbl[i].id_name; 1526 break; 1527 } 1528 } 1529 strncpy(buf, p, sizeof(buf)); 1530 error = sysctl_handle_string(oidp, buf, sizeof(buf), req); 1531 if (error != 0 || req->newptr == NULL) 1532 return (error); 1533 for (i = 0; idle_tbl[i].id_name != NULL; i++) { 1534 if (strstr(idle_tbl[i].id_name, "mwait") && 1535 (cpu_feature2 & CPUID2_MON) == 0) 1536 continue; 1537 if (strcmp(idle_tbl[i].id_name, "acpi") == 0 && 1538 cpu_idle_hook == NULL) 1539 continue; 1540 if (strcmp(idle_tbl[i].id_name, buf)) 1541 continue; 1542 cpu_idle_fn = idle_tbl[i].id_fn; 1543 return (0); 1544 } 1545 return (EINVAL); 1546} 1547 1548SYSCTL_PROC(_machdep, OID_AUTO, idle, CTLTYPE_STRING | CTLFLAG_RW, 0, 0, 1549 idle_sysctl, "A", "currently selected idle function"); 1550 1551uint64_t (*atomic_load_acq_64)(volatile uint64_t *) = 1552 atomic_load_acq_64_i386; 1553void (*atomic_store_rel_64)(volatile uint64_t *, uint64_t) = 1554 atomic_store_rel_64_i386; 1555 1556static void 1557cpu_probe_cmpxchg8b(void) 1558{ 1559 1560 if ((cpu_feature & CPUID_CX8) != 0 || 1561 cpu_vendor_id == CPU_VENDOR_RISE) { 1562 atomic_load_acq_64 = atomic_load_acq_64_i586; 1563 atomic_store_rel_64 = atomic_store_rel_64_i586; 1564 } 1565} 1566 1567/* 1568 * Reset registers to default values on exec. 1569 */ 1570void 1571exec_setregs(struct thread *td, struct image_params *imgp, u_long stack) 1572{ 1573 struct trapframe *regs = td->td_frame; 1574 struct pcb *pcb = td->td_pcb; 1575 1576 /* Reset pc->pcb_gs and %gs before possibly invalidating it. */ 1577 pcb->pcb_gs = _udatasel; 1578 load_gs(_udatasel); 1579 1580 mtx_lock_spin(&dt_lock); 1581 if (td->td_proc->p_md.md_ldt) 1582 user_ldt_free(td); 1583 else 1584 mtx_unlock_spin(&dt_lock); 1585 1586 bzero((char *)regs, sizeof(struct trapframe)); 1587 regs->tf_eip = imgp->entry_addr; 1588 regs->tf_esp = stack; 1589 regs->tf_eflags = PSL_USER | (regs->tf_eflags & PSL_T); 1590 regs->tf_ss = _udatasel; 1591 regs->tf_ds = _udatasel; 1592 regs->tf_es = _udatasel; 1593 regs->tf_fs = _udatasel; 1594 regs->tf_cs = _ucodesel; 1595 1596 /* PS_STRINGS value for BSD/OS binaries. It is 0 for non-BSD/OS. */ 1597 regs->tf_ebx = imgp->ps_strings; 1598 1599 /* 1600 * Reset the hardware debug registers if they were in use. 1601 * They won't have any meaning for the newly exec'd process. 1602 */ 1603 if (pcb->pcb_flags & PCB_DBREGS) { 1604 pcb->pcb_dr0 = 0; 1605 pcb->pcb_dr1 = 0; 1606 pcb->pcb_dr2 = 0; 1607 pcb->pcb_dr3 = 0; 1608 pcb->pcb_dr6 = 0; 1609 pcb->pcb_dr7 = 0; 1610 if (pcb == curpcb) { 1611 /* 1612 * Clear the debug registers on the running 1613 * CPU, otherwise they will end up affecting 1614 * the next process we switch to. 1615 */ 1616 reset_dbregs(); 1617 } 1618 pcb->pcb_flags &= ~PCB_DBREGS; 1619 } 1620 1621 /* 1622 * Initialize the math emulator (if any) for the current process. 1623 * Actually, just clear the bit that says that the emulator has 1624 * been initialized. Initialization is delayed until the process 1625 * traps to the emulator (if it is done at all) mainly because 1626 * emulators don't provide an entry point for initialization. 1627 */ 1628 td->td_pcb->pcb_flags &= ~FP_SOFTFP; 1629 pcb->pcb_initial_npxcw = __INITIAL_NPXCW__; 1630 1631 /* 1632 * Drop the FP state if we hold it, so that the process gets a 1633 * clean FP state if it uses the FPU again. 1634 */ 1635 fpstate_drop(td); 1636 1637 /* 1638 * XXX - Linux emulator 1639 * Make sure sure edx is 0x0 on entry. Linux binaries depend 1640 * on it. 1641 */ 1642 td->td_retval[1] = 0; 1643} 1644 1645void 1646cpu_setregs(void) 1647{ 1648 unsigned int cr0; 1649 1650 cr0 = rcr0(); 1651 1652 /* 1653 * CR0_MP, CR0_NE and CR0_TS are set for NPX (FPU) support: 1654 * 1655 * Prepare to trap all ESC (i.e., NPX) instructions and all WAIT 1656 * instructions. We must set the CR0_MP bit and use the CR0_TS 1657 * bit to control the trap, because setting the CR0_EM bit does 1658 * not cause WAIT instructions to trap. It's important to trap 1659 * WAIT instructions - otherwise the "wait" variants of no-wait 1660 * control instructions would degenerate to the "no-wait" variants 1661 * after FP context switches but work correctly otherwise. It's 1662 * particularly important to trap WAITs when there is no NPX - 1663 * otherwise the "wait" variants would always degenerate. 1664 * 1665 * Try setting CR0_NE to get correct error reporting on 486DX's. 1666 * Setting it should fail or do nothing on lesser processors. 1667 */ 1668 cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM; 1669 load_cr0(cr0); 1670 load_gs(_udatasel); 1671} 1672 1673u_long bootdev; /* not a struct cdev *- encoding is different */ 1674SYSCTL_ULONG(_machdep, OID_AUTO, guessed_bootdev, 1675 CTLFLAG_RD, &bootdev, 0, "Maybe the Boot device (not in struct cdev *format)"); 1676 1677/* 1678 * Initialize 386 and configure to run kernel 1679 */ 1680 1681/* 1682 * Initialize segments & interrupt table 1683 */ 1684 1685int _default_ldt; 1686 1687#ifdef XEN 1688union descriptor *gdt; 1689union descriptor *ldt; 1690#else 1691union descriptor gdt[NGDT * MAXCPU]; /* global descriptor table */ 1692union descriptor ldt[NLDT]; /* local descriptor table */ 1693#endif 1694static struct gate_descriptor idt0[NIDT]; 1695struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */ 1696struct region_descriptor r_gdt, r_idt; /* table descriptors */ 1697struct mtx dt_lock; /* lock for GDT and LDT */ 1698 1699#if defined(I586_CPU) && !defined(NO_F00F_HACK) 1700extern int has_f00f_bug; 1701#endif 1702 1703static struct i386tss dblfault_tss; 1704static char dblfault_stack[PAGE_SIZE]; 1705 1706extern vm_offset_t proc0kstack; 1707 1708 1709/* 1710 * software prototypes -- in more palatable form. 1711 * 1712 * GCODE_SEL through GUDATA_SEL must be in this order for syscall/sysret 1713 * GUFS_SEL and GUGS_SEL must be in this order (swtch.s knows it) 1714 */ 1715struct soft_segment_descriptor gdt_segs[] = { 1716/* GNULL_SEL 0 Null Descriptor */ 1717{ .ssd_base = 0x0, 1718 .ssd_limit = 0x0, 1719 .ssd_type = 0, 1720 .ssd_dpl = SEL_KPL, 1721 .ssd_p = 0, 1722 .ssd_xx = 0, .ssd_xx1 = 0, 1723 .ssd_def32 = 0, 1724 .ssd_gran = 0 }, 1725/* GPRIV_SEL 1 SMP Per-Processor Private Data Descriptor */ 1726{ .ssd_base = 0x0, 1727 .ssd_limit = 0xfffff, 1728 .ssd_type = SDT_MEMRWA, 1729 .ssd_dpl = SEL_KPL, 1730 .ssd_p = 1, 1731 .ssd_xx = 0, .ssd_xx1 = 0, 1732 .ssd_def32 = 1, 1733 .ssd_gran = 1 }, 1734/* GUFS_SEL 2 %fs Descriptor for user */ 1735{ .ssd_base = 0x0, 1736 .ssd_limit = 0xfffff, 1737 .ssd_type = SDT_MEMRWA, 1738 .ssd_dpl = SEL_UPL, 1739 .ssd_p = 1, 1740 .ssd_xx = 0, .ssd_xx1 = 0, 1741 .ssd_def32 = 1, 1742 .ssd_gran = 1 }, 1743/* GUGS_SEL 3 %gs Descriptor for user */ 1744{ .ssd_base = 0x0, 1745 .ssd_limit = 0xfffff, 1746 .ssd_type = SDT_MEMRWA, 1747 .ssd_dpl = SEL_UPL, 1748 .ssd_p = 1, 1749 .ssd_xx = 0, .ssd_xx1 = 0, 1750 .ssd_def32 = 1, 1751 .ssd_gran = 1 }, 1752/* GCODE_SEL 4 Code Descriptor for kernel */ 1753{ .ssd_base = 0x0, 1754 .ssd_limit = 0xfffff, 1755 .ssd_type = SDT_MEMERA, 1756 .ssd_dpl = SEL_KPL, 1757 .ssd_p = 1, 1758 .ssd_xx = 0, .ssd_xx1 = 0, 1759 .ssd_def32 = 1, 1760 .ssd_gran = 1 }, 1761/* GDATA_SEL 5 Data Descriptor for kernel */ 1762{ .ssd_base = 0x0, 1763 .ssd_limit = 0xfffff, 1764 .ssd_type = SDT_MEMRWA, 1765 .ssd_dpl = SEL_KPL, 1766 .ssd_p = 1, 1767 .ssd_xx = 0, .ssd_xx1 = 0, 1768 .ssd_def32 = 1, 1769 .ssd_gran = 1 }, 1770/* GUCODE_SEL 6 Code Descriptor for user */ 1771{ .ssd_base = 0x0, 1772 .ssd_limit = 0xfffff, 1773 .ssd_type = SDT_MEMERA, 1774 .ssd_dpl = SEL_UPL, 1775 .ssd_p = 1, 1776 .ssd_xx = 0, .ssd_xx1 = 0, 1777 .ssd_def32 = 1, 1778 .ssd_gran = 1 }, 1779/* GUDATA_SEL 7 Data Descriptor for user */ 1780{ .ssd_base = 0x0, 1781 .ssd_limit = 0xfffff, 1782 .ssd_type = SDT_MEMRWA, 1783 .ssd_dpl = SEL_UPL, 1784 .ssd_p = 1, 1785 .ssd_xx = 0, .ssd_xx1 = 0, 1786 .ssd_def32 = 1, 1787 .ssd_gran = 1 }, 1788/* GBIOSLOWMEM_SEL 8 BIOS access to realmode segment 0x40, must be #8 in GDT */ 1789{ .ssd_base = 0x400, 1790 .ssd_limit = 0xfffff, 1791 .ssd_type = SDT_MEMRWA, 1792 .ssd_dpl = SEL_KPL, 1793 .ssd_p = 1, 1794 .ssd_xx = 0, .ssd_xx1 = 0, 1795 .ssd_def32 = 1, 1796 .ssd_gran = 1 }, 1797#ifndef XEN 1798/* GPROC0_SEL 9 Proc 0 Tss Descriptor */ 1799{ 1800 .ssd_base = 0x0, 1801 .ssd_limit = sizeof(struct i386tss)-1, 1802 .ssd_type = SDT_SYS386TSS, 1803 .ssd_dpl = 0, 1804 .ssd_p = 1, 1805 .ssd_xx = 0, .ssd_xx1 = 0, 1806 .ssd_def32 = 0, 1807 .ssd_gran = 0 }, 1808/* GLDT_SEL 10 LDT Descriptor */ 1809{ .ssd_base = (int) ldt, 1810 .ssd_limit = sizeof(ldt)-1, 1811 .ssd_type = SDT_SYSLDT, 1812 .ssd_dpl = SEL_UPL, 1813 .ssd_p = 1, 1814 .ssd_xx = 0, .ssd_xx1 = 0, 1815 .ssd_def32 = 0, 1816 .ssd_gran = 0 }, 1817/* GUSERLDT_SEL 11 User LDT Descriptor per process */ 1818{ .ssd_base = (int) ldt, 1819 .ssd_limit = (512 * sizeof(union descriptor)-1), 1820 .ssd_type = SDT_SYSLDT, 1821 .ssd_dpl = 0, 1822 .ssd_p = 1, 1823 .ssd_xx = 0, .ssd_xx1 = 0, 1824 .ssd_def32 = 0, 1825 .ssd_gran = 0 }, 1826/* GPANIC_SEL 12 Panic Tss Descriptor */ 1827{ .ssd_base = (int) &dblfault_tss, 1828 .ssd_limit = sizeof(struct i386tss)-1, 1829 .ssd_type = SDT_SYS386TSS, 1830 .ssd_dpl = 0, 1831 .ssd_p = 1, 1832 .ssd_xx = 0, .ssd_xx1 = 0, 1833 .ssd_def32 = 0, 1834 .ssd_gran = 0 }, 1835/* GBIOSCODE32_SEL 13 BIOS 32-bit interface (32bit Code) */ 1836{ .ssd_base = 0, 1837 .ssd_limit = 0xfffff, 1838 .ssd_type = SDT_MEMERA, 1839 .ssd_dpl = 0, 1840 .ssd_p = 1, 1841 .ssd_xx = 0, .ssd_xx1 = 0, 1842 .ssd_def32 = 0, 1843 .ssd_gran = 1 }, 1844/* GBIOSCODE16_SEL 14 BIOS 32-bit interface (16bit Code) */ 1845{ .ssd_base = 0, 1846 .ssd_limit = 0xfffff, 1847 .ssd_type = SDT_MEMERA, 1848 .ssd_dpl = 0, 1849 .ssd_p = 1, 1850 .ssd_xx = 0, .ssd_xx1 = 0, 1851 .ssd_def32 = 0, 1852 .ssd_gran = 1 }, 1853/* GBIOSDATA_SEL 15 BIOS 32-bit interface (Data) */ 1854{ .ssd_base = 0, 1855 .ssd_limit = 0xfffff, 1856 .ssd_type = SDT_MEMRWA, 1857 .ssd_dpl = 0, 1858 .ssd_p = 1, 1859 .ssd_xx = 0, .ssd_xx1 = 0, 1860 .ssd_def32 = 1, 1861 .ssd_gran = 1 }, 1862/* GBIOSUTIL_SEL 16 BIOS 16-bit interface (Utility) */ 1863{ .ssd_base = 0, 1864 .ssd_limit = 0xfffff, 1865 .ssd_type = SDT_MEMRWA, 1866 .ssd_dpl = 0, 1867 .ssd_p = 1, 1868 .ssd_xx = 0, .ssd_xx1 = 0, 1869 .ssd_def32 = 0, 1870 .ssd_gran = 1 }, 1871/* GBIOSARGS_SEL 17 BIOS 16-bit interface (Arguments) */ 1872{ .ssd_base = 0, 1873 .ssd_limit = 0xfffff, 1874 .ssd_type = SDT_MEMRWA, 1875 .ssd_dpl = 0, 1876 .ssd_p = 1, 1877 .ssd_xx = 0, .ssd_xx1 = 0, 1878 .ssd_def32 = 0, 1879 .ssd_gran = 1 }, 1880/* GNDIS_SEL 18 NDIS Descriptor */ 1881{ .ssd_base = 0x0, 1882 .ssd_limit = 0x0, 1883 .ssd_type = 0, 1884 .ssd_dpl = 0, 1885 .ssd_p = 0, 1886 .ssd_xx = 0, .ssd_xx1 = 0, 1887 .ssd_def32 = 0, 1888 .ssd_gran = 0 }, 1889#endif /* !XEN */ 1890}; 1891 1892static struct soft_segment_descriptor ldt_segs[] = { 1893 /* Null Descriptor - overwritten by call gate */ 1894{ .ssd_base = 0x0, 1895 .ssd_limit = 0x0, 1896 .ssd_type = 0, 1897 .ssd_dpl = 0, 1898 .ssd_p = 0, 1899 .ssd_xx = 0, .ssd_xx1 = 0, 1900 .ssd_def32 = 0, 1901 .ssd_gran = 0 }, 1902 /* Null Descriptor - overwritten by call gate */ 1903{ .ssd_base = 0x0, 1904 .ssd_limit = 0x0, 1905 .ssd_type = 0, 1906 .ssd_dpl = 0, 1907 .ssd_p = 0, 1908 .ssd_xx = 0, .ssd_xx1 = 0, 1909 .ssd_def32 = 0, 1910 .ssd_gran = 0 }, 1911 /* Null Descriptor - overwritten by call gate */ 1912{ .ssd_base = 0x0, 1913 .ssd_limit = 0x0, 1914 .ssd_type = 0, 1915 .ssd_dpl = 0, 1916 .ssd_p = 0, 1917 .ssd_xx = 0, .ssd_xx1 = 0, 1918 .ssd_def32 = 0, 1919 .ssd_gran = 0 }, 1920 /* Code Descriptor for user */ 1921{ .ssd_base = 0x0, 1922 .ssd_limit = 0xfffff, 1923 .ssd_type = SDT_MEMERA, 1924 .ssd_dpl = SEL_UPL, 1925 .ssd_p = 1, 1926 .ssd_xx = 0, .ssd_xx1 = 0, 1927 .ssd_def32 = 1, 1928 .ssd_gran = 1 }, 1929 /* Null Descriptor - overwritten by call gate */ 1930{ .ssd_base = 0x0, 1931 .ssd_limit = 0x0, 1932 .ssd_type = 0, 1933 .ssd_dpl = 0, 1934 .ssd_p = 0, 1935 .ssd_xx = 0, .ssd_xx1 = 0, 1936 .ssd_def32 = 0, 1937 .ssd_gran = 0 }, 1938 /* Data Descriptor for user */ 1939{ .ssd_base = 0x0, 1940 .ssd_limit = 0xfffff, 1941 .ssd_type = SDT_MEMRWA, 1942 .ssd_dpl = SEL_UPL, 1943 .ssd_p = 1, 1944 .ssd_xx = 0, .ssd_xx1 = 0, 1945 .ssd_def32 = 1, 1946 .ssd_gran = 1 }, 1947}; 1948 1949void 1950setidt(idx, func, typ, dpl, selec) 1951 int idx; 1952 inthand_t *func; 1953 int typ; 1954 int dpl; 1955 int selec; 1956{ 1957 struct gate_descriptor *ip; 1958 1959 ip = idt + idx; 1960 ip->gd_looffset = (int)func; 1961 ip->gd_selector = selec; 1962 ip->gd_stkcpy = 0; 1963 ip->gd_xx = 0; 1964 ip->gd_type = typ; 1965 ip->gd_dpl = dpl; 1966 ip->gd_p = 1; 1967 ip->gd_hioffset = ((int)func)>>16 ; 1968} 1969 1970extern inthand_t 1971 IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl), 1972 IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm), 1973 IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot), 1974 IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align), 1975 IDTVEC(xmm), 1976#ifdef KDTRACE_HOOKS 1977 IDTVEC(dtrace_ret), 1978#endif 1979 IDTVEC(lcall_syscall), IDTVEC(int0x80_syscall); 1980 1981#ifdef DDB 1982/* 1983 * Display the index and function name of any IDT entries that don't use 1984 * the default 'rsvd' entry point. 1985 */ 1986DB_SHOW_COMMAND(idt, db_show_idt) 1987{ 1988 struct gate_descriptor *ip; 1989 int idx; 1990 uintptr_t func; 1991 1992 ip = idt; 1993 for (idx = 0; idx < NIDT && !db_pager_quit; idx++) { 1994 func = (ip->gd_hioffset << 16 | ip->gd_looffset); 1995 if (func != (uintptr_t)&IDTVEC(rsvd)) { 1996 db_printf("%3d\t", idx); 1997 db_printsym(func, DB_STGY_PROC); 1998 db_printf("\n"); 1999 } 2000 ip++; 2001 } 2002} 2003 2004/* Show privileged registers. */ 2005DB_SHOW_COMMAND(sysregs, db_show_sysregs) 2006{ 2007 uint64_t idtr, gdtr; 2008 2009 idtr = ridt(); 2010 db_printf("idtr\t0x%08x/%04x\n", 2011 (u_int)(idtr >> 16), (u_int)idtr & 0xffff); 2012 gdtr = rgdt(); 2013 db_printf("gdtr\t0x%08x/%04x\n", 2014 (u_int)(gdtr >> 16), (u_int)gdtr & 0xffff); 2015 db_printf("ldtr\t0x%04x\n", rldt()); 2016 db_printf("tr\t0x%04x\n", rtr()); 2017 db_printf("cr0\t0x%08x\n", rcr0()); 2018 db_printf("cr2\t0x%08x\n", rcr2()); 2019 db_printf("cr3\t0x%08x\n", rcr3()); 2020 db_printf("cr4\t0x%08x\n", rcr4()); 2021} 2022#endif 2023 2024void 2025sdtossd(sd, ssd) 2026 struct segment_descriptor *sd; 2027 struct soft_segment_descriptor *ssd; 2028{ 2029 ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase; 2030 ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit; 2031 ssd->ssd_type = sd->sd_type; 2032 ssd->ssd_dpl = sd->sd_dpl; 2033 ssd->ssd_p = sd->sd_p; 2034 ssd->ssd_def32 = sd->sd_def32; 2035 ssd->ssd_gran = sd->sd_gran; 2036} 2037 2038#ifndef XEN 2039static int 2040add_smap_entry(struct bios_smap *smap, vm_paddr_t *physmap, int *physmap_idxp) 2041{ 2042 int i, insert_idx, physmap_idx; 2043 2044 physmap_idx = *physmap_idxp; 2045 2046 if (boothowto & RB_VERBOSE) 2047 printf("SMAP type=%02x base=%016llx len=%016llx\n", 2048 smap->type, smap->base, smap->length); 2049 2050 if (smap->type != SMAP_TYPE_MEMORY) 2051 return (1); 2052 2053 if (smap->length == 0) 2054 return (1); 2055 2056#ifndef PAE 2057 if (smap->base > 0xffffffff) { 2058 printf("%uK of memory above 4GB ignored\n", 2059 (u_int)(smap->length / 1024)); 2060 return (1); 2061 } 2062#endif 2063 2064 /* 2065 * Find insertion point while checking for overlap. Start off by 2066 * assuming the new entry will be added to the end. 2067 */ 2068 insert_idx = physmap_idx + 2; 2069 for (i = 0; i <= physmap_idx; i += 2) { 2070 if (smap->base < physmap[i + 1]) { 2071 if (smap->base + smap->length <= physmap[i]) { 2072 insert_idx = i; 2073 break; 2074 } 2075 if (boothowto & RB_VERBOSE) 2076 printf( 2077 "Overlapping memory regions, ignoring second region\n"); 2078 return (1); 2079 } 2080 } 2081 2082 /* See if we can prepend to the next entry. */ 2083 if (insert_idx <= physmap_idx && 2084 smap->base + smap->length == physmap[insert_idx]) { 2085 physmap[insert_idx] = smap->base; 2086 return (1); 2087 } 2088 2089 /* See if we can append to the previous entry. */ 2090 if (insert_idx > 0 && smap->base == physmap[insert_idx - 1]) { 2091 physmap[insert_idx - 1] += smap->length; 2092 return (1); 2093 } 2094 2095 physmap_idx += 2; 2096 *physmap_idxp = physmap_idx; 2097 if (physmap_idx == PHYSMAP_SIZE) { 2098 printf( 2099 "Too many segments in the physical address map, giving up\n"); 2100 return (0); 2101 } 2102 2103 /* 2104 * Move the last 'N' entries down to make room for the new 2105 * entry if needed. 2106 */ 2107 for (i = physmap_idx; i > insert_idx; i -= 2) { 2108 physmap[i] = physmap[i - 2]; 2109 physmap[i + 1] = physmap[i - 1]; 2110 } 2111 2112 /* Insert the new entry. */ 2113 physmap[insert_idx] = smap->base; 2114 physmap[insert_idx + 1] = smap->base + smap->length; 2115 return (1); 2116} 2117 2118static void 2119basemem_setup(void) 2120{ 2121 vm_paddr_t pa; 2122 pt_entry_t *pte; 2123 int i; 2124 2125 if (basemem > 640) { 2126 printf("Preposterous BIOS basemem of %uK, truncating to 640K\n", 2127 basemem); 2128 basemem = 640; 2129 } 2130 2131 /* 2132 * XXX if biosbasemem is now < 640, there is a `hole' 2133 * between the end of base memory and the start of 2134 * ISA memory. The hole may be empty or it may 2135 * contain BIOS code or data. Map it read/write so 2136 * that the BIOS can write to it. (Memory from 0 to 2137 * the physical end of the kernel is mapped read-only 2138 * to begin with and then parts of it are remapped. 2139 * The parts that aren't remapped form holes that 2140 * remain read-only and are unused by the kernel. 2141 * The base memory area is below the physical end of 2142 * the kernel and right now forms a read-only hole. 2143 * The part of it from PAGE_SIZE to 2144 * (trunc_page(biosbasemem * 1024) - 1) will be 2145 * remapped and used by the kernel later.) 2146 * 2147 * This code is similar to the code used in 2148 * pmap_mapdev, but since no memory needs to be 2149 * allocated we simply change the mapping. 2150 */ 2151 for (pa = trunc_page(basemem * 1024); 2152 pa < ISA_HOLE_START; pa += PAGE_SIZE) 2153 pmap_kenter(KERNBASE + pa, pa); 2154 2155 /* 2156 * Map pages between basemem and ISA_HOLE_START, if any, r/w into 2157 * the vm86 page table so that vm86 can scribble on them using 2158 * the vm86 map too. XXX: why 2 ways for this and only 1 way for 2159 * page 0, at least as initialized here? 2160 */ 2161 pte = (pt_entry_t *)vm86paddr; 2162 for (i = basemem / 4; i < 160; i++) 2163 pte[i] = (i << PAGE_SHIFT) | PG_V | PG_RW | PG_U; 2164} 2165#endif 2166 2167/* 2168 * Populate the (physmap) array with base/bound pairs describing the 2169 * available physical memory in the system, then test this memory and 2170 * build the phys_avail array describing the actually-available memory. 2171 * 2172 * If we cannot accurately determine the physical memory map, then use 2173 * value from the 0xE801 call, and failing that, the RTC. 2174 * 2175 * Total memory size may be set by the kernel environment variable 2176 * hw.physmem or the compile-time define MAXMEM. 2177 * 2178 * XXX first should be vm_paddr_t. 2179 */ 2180static void 2181getmemsize(int first) 2182{ 2183 int has_smap, off, physmap_idx, pa_indx, da_indx; 2184 u_long physmem_tunable, memtest; 2185 vm_paddr_t physmap[PHYSMAP_SIZE]; 2186 pt_entry_t *pte; 2187 quad_t dcons_addr, dcons_size; 2188#ifndef XEN 2189 int hasbrokenint12, i, res; 2190 u_int extmem; 2191 struct vm86frame vmf; 2192 struct vm86context vmc; 2193 vm_paddr_t pa; 2194 struct bios_smap *smap, *smapbase, *smapend; 2195 u_int32_t smapsize; 2196 caddr_t kmdp; 2197#endif 2198 2199 has_smap = 0; 2200#if defined(XEN) 2201 Maxmem = xen_start_info->nr_pages - init_first; 2202 physmem = Maxmem; 2203 basemem = 0; 2204 physmap[0] = init_first << PAGE_SHIFT; 2205 physmap[1] = ptoa(Maxmem) - round_page(msgbufsize); 2206 physmap_idx = 0; 2207#else 2208#ifdef XBOX 2209 if (arch_i386_is_xbox) { 2210 /* 2211 * We queried the memory size before, so chop off 4MB for 2212 * the framebuffer and inform the OS of this. 2213 */ 2214 physmap[0] = 0; 2215 physmap[1] = (arch_i386_xbox_memsize * 1024 * 1024) - XBOX_FB_SIZE; 2216 physmap_idx = 0; 2217 goto physmap_done; 2218 } 2219#endif 2220 bzero(&vmf, sizeof(vmf)); 2221 bzero(physmap, sizeof(physmap)); 2222 basemem = 0; 2223 2224 /* 2225 * Check if the loader supplied an SMAP memory map. If so, 2226 * use that and do not make any VM86 calls. 2227 */ 2228 physmap_idx = 0; 2229 smapbase = NULL; 2230 kmdp = preload_search_by_type("elf kernel"); 2231 if (kmdp == NULL) 2232 kmdp = preload_search_by_type("elf32 kernel"); 2233 if (kmdp != NULL) 2234 smapbase = (struct bios_smap *)preload_search_info(kmdp, 2235 MODINFO_METADATA | MODINFOMD_SMAP); 2236 if (smapbase != NULL) { 2237 /* 2238 * subr_module.c says: 2239 * "Consumer may safely assume that size value precedes data." 2240 * ie: an int32_t immediately precedes SMAP. 2241 */ 2242 smapsize = *((u_int32_t *)smapbase - 1); 2243 smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize); 2244 has_smap = 1; 2245 2246 for (smap = smapbase; smap < smapend; smap++) 2247 if (!add_smap_entry(smap, physmap, &physmap_idx)) 2248 break; 2249 goto have_smap; 2250 } 2251 2252 /* 2253 * Some newer BIOSes have a broken INT 12H implementation 2254 * which causes a kernel panic immediately. In this case, we 2255 * need use the SMAP to determine the base memory size. 2256 */ 2257 hasbrokenint12 = 0; 2258 TUNABLE_INT_FETCH("hw.hasbrokenint12", &hasbrokenint12); 2259 if (hasbrokenint12 == 0) { 2260 /* Use INT12 to determine base memory size. */ 2261 vm86_intcall(0x12, &vmf); 2262 basemem = vmf.vmf_ax; 2263 basemem_setup(); 2264 } 2265 2266 /* 2267 * Fetch the memory map with INT 15:E820. Map page 1 R/W into 2268 * the kernel page table so we can use it as a buffer. The 2269 * kernel will unmap this page later. 2270 */ 2271 pmap_kenter(KERNBASE + (1 << PAGE_SHIFT), 1 << PAGE_SHIFT); 2272 vmc.npages = 0; 2273 smap = (void *)vm86_addpage(&vmc, 1, KERNBASE + (1 << PAGE_SHIFT)); 2274 res = vm86_getptr(&vmc, (vm_offset_t)smap, &vmf.vmf_es, &vmf.vmf_di); 2275 KASSERT(res != 0, ("vm86_getptr() failed: address not found")); 2276 2277 vmf.vmf_ebx = 0; 2278 do { 2279 vmf.vmf_eax = 0xE820; 2280 vmf.vmf_edx = SMAP_SIG; 2281 vmf.vmf_ecx = sizeof(struct bios_smap); 2282 i = vm86_datacall(0x15, &vmf, &vmc); 2283 if (i || vmf.vmf_eax != SMAP_SIG) 2284 break; 2285 has_smap = 1; 2286 if (!add_smap_entry(smap, physmap, &physmap_idx)) 2287 break; 2288 } while (vmf.vmf_ebx != 0); 2289 2290have_smap: 2291 /* 2292 * If we didn't fetch the "base memory" size from INT12, 2293 * figure it out from the SMAP (or just guess). 2294 */ 2295 if (basemem == 0) { 2296 for (i = 0; i <= physmap_idx; i += 2) { 2297 if (physmap[i] == 0x00000000) { 2298 basemem = physmap[i + 1] / 1024; 2299 break; 2300 } 2301 } 2302 2303 /* XXX: If we couldn't find basemem from SMAP, just guess. */ 2304 if (basemem == 0) 2305 basemem = 640; 2306 basemem_setup(); 2307 } 2308 2309 if (physmap[1] != 0) 2310 goto physmap_done; 2311 2312 /* 2313 * If we failed to find an SMAP, figure out the extended 2314 * memory size. We will then build a simple memory map with 2315 * two segments, one for "base memory" and the second for 2316 * "extended memory". Note that "extended memory" starts at a 2317 * physical address of 1MB and that both basemem and extmem 2318 * are in units of 1KB. 2319 * 2320 * First, try to fetch the extended memory size via INT 15:E801. 2321 */ 2322 vmf.vmf_ax = 0xE801; 2323 if (vm86_intcall(0x15, &vmf) == 0) { 2324 extmem = vmf.vmf_cx + vmf.vmf_dx * 64; 2325 } else { 2326 /* 2327 * If INT15:E801 fails, this is our last ditch effort 2328 * to determine the extended memory size. Currently 2329 * we prefer the RTC value over INT15:88. 2330 */ 2331#if 0 2332 vmf.vmf_ah = 0x88; 2333 vm86_intcall(0x15, &vmf); 2334 extmem = vmf.vmf_ax; 2335#else 2336 extmem = rtcin(RTC_EXTLO) + (rtcin(RTC_EXTHI) << 8); 2337#endif 2338 } 2339 2340 /* 2341 * Special hack for chipsets that still remap the 384k hole when 2342 * there's 16MB of memory - this really confuses people that 2343 * are trying to use bus mastering ISA controllers with the 2344 * "16MB limit"; they only have 16MB, but the remapping puts 2345 * them beyond the limit. 2346 * 2347 * If extended memory is between 15-16MB (16-17MB phys address range), 2348 * chop it to 15MB. 2349 */ 2350 if ((extmem > 15 * 1024) && (extmem < 16 * 1024)) 2351 extmem = 15 * 1024; 2352 2353 physmap[0] = 0; 2354 physmap[1] = basemem * 1024; 2355 physmap_idx = 2; 2356 physmap[physmap_idx] = 0x100000; 2357 physmap[physmap_idx + 1] = physmap[physmap_idx] + extmem * 1024; 2358 2359physmap_done: 2360#endif 2361 /* 2362 * Now, physmap contains a map of physical memory. 2363 */ 2364 2365#ifdef SMP 2366 /* make hole for AP bootstrap code */ 2367 physmap[1] = mp_bootaddress(physmap[1]); 2368#endif 2369 2370 /* 2371 * Maxmem isn't the "maximum memory", it's one larger than the 2372 * highest page of the physical address space. It should be 2373 * called something like "Maxphyspage". We may adjust this 2374 * based on ``hw.physmem'' and the results of the memory test. 2375 */ 2376 Maxmem = atop(physmap[physmap_idx + 1]); 2377 2378#ifdef MAXMEM 2379 Maxmem = MAXMEM / 4; 2380#endif 2381 2382 if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable)) 2383 Maxmem = atop(physmem_tunable); 2384 2385 /* 2386 * If we have an SMAP, don't allow MAXMEM or hw.physmem to extend 2387 * the amount of memory in the system. 2388 */ 2389 if (has_smap && Maxmem > atop(physmap[physmap_idx + 1])) 2390 Maxmem = atop(physmap[physmap_idx + 1]); 2391 2392 /* 2393 * By default enable the memory test on real hardware, and disable 2394 * it if we appear to be running in a VM. This avoids touching all 2395 * pages unnecessarily, which doesn't matter on real hardware but is 2396 * bad for shared VM hosts. Use a general name so that 2397 * one could eventually do more with the code than just disable it. 2398 */ 2399 memtest = (vm_guest > VM_GUEST_NO) ? 0 : 1; 2400 TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest); 2401 2402 if (atop(physmap[physmap_idx + 1]) != Maxmem && 2403 (boothowto & RB_VERBOSE)) 2404 printf("Physical memory use set to %ldK\n", Maxmem * 4); 2405 2406 /* 2407 * If Maxmem has been increased beyond what the system has detected, 2408 * extend the last memory segment to the new limit. 2409 */ 2410 if (atop(physmap[physmap_idx + 1]) < Maxmem) 2411 physmap[physmap_idx + 1] = ptoa((vm_paddr_t)Maxmem); 2412 2413 /* call pmap initialization to make new kernel address space */ 2414 pmap_bootstrap(first); 2415 2416 /* 2417 * Size up each available chunk of physical memory. 2418 */ 2419 physmap[0] = PAGE_SIZE; /* mask off page 0 */ 2420 pa_indx = 0; 2421 da_indx = 1; 2422 phys_avail[pa_indx++] = physmap[0]; 2423 phys_avail[pa_indx] = physmap[0]; 2424 dump_avail[da_indx] = physmap[0]; 2425 pte = CMAP1; 2426 2427 /* 2428 * Get dcons buffer address 2429 */ 2430 if (getenv_quad("dcons.addr", &dcons_addr) == 0 || 2431 getenv_quad("dcons.size", &dcons_size) == 0) 2432 dcons_addr = 0; 2433 2434#ifndef XEN 2435 /* 2436 * physmap is in bytes, so when converting to page boundaries, 2437 * round up the start address and round down the end address. 2438 */ 2439 for (i = 0; i <= physmap_idx; i += 2) { 2440 vm_paddr_t end; 2441 2442 end = ptoa((vm_paddr_t)Maxmem); 2443 if (physmap[i + 1] < end) 2444 end = trunc_page(physmap[i + 1]); 2445 for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) { 2446 int tmp, page_bad, full; 2447 int *ptr = (int *)CADDR1; 2448 2449 full = FALSE; 2450 /* 2451 * block out kernel memory as not available. 2452 */ 2453 if (pa >= KERNLOAD && pa < first) 2454 goto do_dump_avail; 2455 2456 /* 2457 * block out dcons buffer 2458 */ 2459 if (dcons_addr > 0 2460 && pa >= trunc_page(dcons_addr) 2461 && pa < dcons_addr + dcons_size) 2462 goto do_dump_avail; 2463 2464 page_bad = FALSE; 2465 if (memtest == 0) 2466 goto skip_memtest; 2467 2468 /* 2469 * map page into kernel: valid, read/write,non-cacheable 2470 */ 2471 *pte = pa | PG_V | PG_RW | PG_N; 2472 invltlb(); 2473 2474 tmp = *(int *)ptr; 2475 /* 2476 * Test for alternating 1's and 0's 2477 */ 2478 *(volatile int *)ptr = 0xaaaaaaaa; 2479 if (*(volatile int *)ptr != 0xaaaaaaaa) 2480 page_bad = TRUE; 2481 /* 2482 * Test for alternating 0's and 1's 2483 */ 2484 *(volatile int *)ptr = 0x55555555; 2485 if (*(volatile int *)ptr != 0x55555555) 2486 page_bad = TRUE; 2487 /* 2488 * Test for all 1's 2489 */ 2490 *(volatile int *)ptr = 0xffffffff; 2491 if (*(volatile int *)ptr != 0xffffffff) 2492 page_bad = TRUE; 2493 /* 2494 * Test for all 0's 2495 */ 2496 *(volatile int *)ptr = 0x0; 2497 if (*(volatile int *)ptr != 0x0) 2498 page_bad = TRUE; 2499 /* 2500 * Restore original value. 2501 */ 2502 *(int *)ptr = tmp; 2503 2504skip_memtest: 2505 /* 2506 * Adjust array of valid/good pages. 2507 */ 2508 if (page_bad == TRUE) 2509 continue; 2510 /* 2511 * If this good page is a continuation of the 2512 * previous set of good pages, then just increase 2513 * the end pointer. Otherwise start a new chunk. 2514 * Note that "end" points one higher than end, 2515 * making the range >= start and < end. 2516 * If we're also doing a speculative memory 2517 * test and we at or past the end, bump up Maxmem 2518 * so that we keep going. The first bad page 2519 * will terminate the loop. 2520 */ 2521 if (phys_avail[pa_indx] == pa) { 2522 phys_avail[pa_indx] += PAGE_SIZE; 2523 } else { 2524 pa_indx++; 2525 if (pa_indx == PHYS_AVAIL_ARRAY_END) { 2526 printf( 2527 "Too many holes in the physical address space, giving up\n"); 2528 pa_indx--; 2529 full = TRUE; 2530 goto do_dump_avail; 2531 } 2532 phys_avail[pa_indx++] = pa; /* start */ 2533 phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */ 2534 } 2535 physmem++; 2536do_dump_avail: 2537 if (dump_avail[da_indx] == pa) { 2538 dump_avail[da_indx] += PAGE_SIZE; 2539 } else { 2540 da_indx++; 2541 if (da_indx == DUMP_AVAIL_ARRAY_END) { 2542 da_indx--; 2543 goto do_next; 2544 } 2545 dump_avail[da_indx++] = pa; /* start */ 2546 dump_avail[da_indx] = pa + PAGE_SIZE; /* end */ 2547 } 2548do_next: 2549 if (full) 2550 break; 2551 } 2552 } 2553 *pte = 0; 2554 invltlb(); 2555#else 2556 phys_avail[0] = physfree; 2557 phys_avail[1] = xen_start_info->nr_pages*PAGE_SIZE; 2558 dump_avail[0] = 0; 2559 dump_avail[1] = xen_start_info->nr_pages*PAGE_SIZE; 2560 2561#endif 2562 2563 /* 2564 * XXX 2565 * The last chunk must contain at least one page plus the message 2566 * buffer to avoid complicating other code (message buffer address 2567 * calculation, etc.). 2568 */ 2569 while (phys_avail[pa_indx - 1] + PAGE_SIZE + 2570 round_page(msgbufsize) >= phys_avail[pa_indx]) { 2571 physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]); 2572 phys_avail[pa_indx--] = 0; 2573 phys_avail[pa_indx--] = 0; 2574 } 2575 2576 Maxmem = atop(phys_avail[pa_indx]); 2577 2578 /* Trim off space for the message buffer. */ 2579 phys_avail[pa_indx] -= round_page(msgbufsize); 2580 2581 /* Map the message buffer. */ 2582 for (off = 0; off < round_page(msgbufsize); off += PAGE_SIZE) 2583 pmap_kenter((vm_offset_t)msgbufp + off, phys_avail[pa_indx] + 2584 off); 2585 2586 PT_UPDATES_FLUSH(); 2587} 2588 2589#ifdef XEN 2590#define MTOPSIZE (1<<(14 + PAGE_SHIFT)) 2591 2592void 2593init386(first) 2594 int first; 2595{ 2596 unsigned long gdtmachpfn; 2597 int error, gsel_tss, metadata_missing, x, pa; 2598 size_t kstack0_sz; 2599 struct pcpu *pc; 2600 struct callback_register event = { 2601 .type = CALLBACKTYPE_event, 2602 .address = {GSEL(GCODE_SEL, SEL_KPL), (unsigned long)Xhypervisor_callback }, 2603 }; 2604 struct callback_register failsafe = { 2605 .type = CALLBACKTYPE_failsafe, 2606 .address = {GSEL(GCODE_SEL, SEL_KPL), (unsigned long)failsafe_callback }, 2607 }; 2608 2609 thread0.td_kstack = proc0kstack; 2610 thread0.td_kstack_pages = KSTACK_PAGES; 2611 kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE; 2612 thread0.td_pcb = (struct pcb *)(thread0.td_kstack + kstack0_sz) - 1; 2613 2614 /* 2615 * This may be done better later if it gets more high level 2616 * components in it. If so just link td->td_proc here. 2617 */ 2618 proc_linkup0(&proc0, &thread0); 2619 2620 metadata_missing = 0; 2621 if (xen_start_info->mod_start) { 2622 preload_metadata = (caddr_t)xen_start_info->mod_start; 2623 preload_bootstrap_relocate(KERNBASE); 2624 } else { 2625 metadata_missing = 1; 2626 } 2627 if (envmode == 1) 2628 kern_envp = static_env; 2629 else if ((caddr_t)xen_start_info->cmd_line) 2630 kern_envp = xen_setbootenv((caddr_t)xen_start_info->cmd_line); 2631 2632 boothowto |= xen_boothowto(kern_envp); 2633 2634 /* Init basic tunables, hz etc */ 2635 init_param1(); 2636 2637 /* 2638 * XEN occupies a portion of the upper virtual address space 2639 * At its base it manages an array mapping machine page frames 2640 * to physical page frames - hence we need to be able to 2641 * access 4GB - (64MB - 4MB + 64k) 2642 */ 2643 gdt_segs[GPRIV_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE); 2644 gdt_segs[GUFS_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE); 2645 gdt_segs[GUGS_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE); 2646 gdt_segs[GCODE_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE); 2647 gdt_segs[GDATA_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE); 2648 gdt_segs[GUCODE_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE); 2649 gdt_segs[GUDATA_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE); 2650 gdt_segs[GBIOSLOWMEM_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE); 2651 2652 pc = &__pcpu[0]; 2653 gdt_segs[GPRIV_SEL].ssd_base = (int) pc; 2654 gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss; 2655 2656 PT_SET_MA(gdt, xpmap_ptom(VTOP(gdt)) | PG_V | PG_RW); 2657 bzero(gdt, PAGE_SIZE); 2658 for (x = 0; x < NGDT; x++) 2659 ssdtosd(&gdt_segs[x], &gdt[x].sd); 2660 2661 mtx_init(&dt_lock, "descriptor tables", NULL, MTX_SPIN); 2662 2663 gdtmachpfn = vtomach(gdt) >> PAGE_SHIFT; 2664 PT_SET_MA(gdt, xpmap_ptom(VTOP(gdt)) | PG_V); 2665 PANIC_IF(HYPERVISOR_set_gdt(&gdtmachpfn, 512) != 0); 2666 lgdt(&r_gdt); 2667 gdtset = 1; 2668 2669 if ((error = HYPERVISOR_set_trap_table(trap_table)) != 0) { 2670 panic("set_trap_table failed - error %d\n", error); 2671 } 2672 2673 error = HYPERVISOR_callback_op(CALLBACKOP_register, &event); 2674 if (error == 0) 2675 error = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe); 2676#if CONFIG_XEN_COMPAT <= 0x030002 2677 if (error == -ENOXENSYS) 2678 HYPERVISOR_set_callbacks(GSEL(GCODE_SEL, SEL_KPL), 2679 (unsigned long)Xhypervisor_callback, 2680 GSEL(GCODE_SEL, SEL_KPL), (unsigned long)failsafe_callback); 2681#endif 2682 pcpu_init(pc, 0, sizeof(struct pcpu)); 2683 for (pa = first; pa < first + DPCPU_SIZE; pa += PAGE_SIZE) 2684 pmap_kenter(pa + KERNBASE, pa); 2685 dpcpu_init((void *)(first + KERNBASE), 0); 2686 first += DPCPU_SIZE; 2687 physfree += DPCPU_SIZE; 2688 init_first += DPCPU_SIZE / PAGE_SIZE; 2689 2690 PCPU_SET(prvspace, pc); 2691 PCPU_SET(curthread, &thread0); 2692 PCPU_SET(curpcb, thread0.td_pcb); 2693 2694 /* 2695 * Initialize mutexes. 2696 * 2697 * icu_lock: in order to allow an interrupt to occur in a critical 2698 * section, to set pcpu->ipending (etc...) properly, we 2699 * must be able to get the icu lock, so it can't be 2700 * under witness. 2701 */ 2702 mutex_init(); 2703 mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS | MTX_NOPROFILE); 2704 2705 /* make ldt memory segments */ 2706 PT_SET_MA(ldt, xpmap_ptom(VTOP(ldt)) | PG_V | PG_RW); 2707 bzero(ldt, PAGE_SIZE); 2708 ldt_segs[LUCODE_SEL].ssd_limit = atop(0 - 1); 2709 ldt_segs[LUDATA_SEL].ssd_limit = atop(0 - 1); 2710 for (x = 0; x < sizeof ldt_segs / sizeof ldt_segs[0]; x++) 2711 ssdtosd(&ldt_segs[x], &ldt[x].sd); 2712 2713 default_proc_ldt.ldt_base = (caddr_t)ldt; 2714 default_proc_ldt.ldt_len = 6; 2715 _default_ldt = (int)&default_proc_ldt; 2716 PCPU_SET(currentldt, _default_ldt); 2717 PT_SET_MA(ldt, *vtopte((unsigned long)ldt) & ~PG_RW); 2718 xen_set_ldt((unsigned long) ldt, (sizeof ldt_segs / sizeof ldt_segs[0])); 2719 2720#if defined(XEN_PRIVILEGED) 2721 /* 2722 * Initialize the i8254 before the console so that console 2723 * initialization can use DELAY(). 2724 */ 2725 i8254_init(); 2726#endif 2727 2728 /* 2729 * Initialize the console before we print anything out. 2730 */ 2731 cninit(); 2732 2733 if (metadata_missing) 2734 printf("WARNING: loader(8) metadata is missing!\n"); 2735 2736#ifdef DEV_ISA 2737#ifdef DEV_ATPIC 2738 elcr_probe(); 2739 atpic_startup(); 2740#else 2741 /* Reset and mask the atpics and leave them shut down. */ 2742 atpic_reset(); 2743 2744 /* 2745 * Point the ICU spurious interrupt vectors at the APIC spurious 2746 * interrupt handler. 2747 */ 2748 setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL, 2749 GSEL(GCODE_SEL, SEL_KPL)); 2750 setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL, 2751 GSEL(GCODE_SEL, SEL_KPL)); 2752#endif 2753#endif 2754 2755#ifdef DDB 2756 ksym_start = bootinfo.bi_symtab; 2757 ksym_end = bootinfo.bi_esymtab; 2758#endif 2759 2760 kdb_init(); 2761 2762#ifdef KDB 2763 if (boothowto & RB_KDB) 2764 kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger"); 2765#endif 2766 2767 finishidentcpu(); /* Final stage of CPU initialization */ 2768 setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, 2769 GSEL(GCODE_SEL, SEL_KPL)); 2770 setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, 2771 GSEL(GCODE_SEL, SEL_KPL)); 2772 initializecpu(); /* Initialize CPU registers */ 2773 2774 /* make an initial tss so cpu can get interrupt stack on syscall! */ 2775 /* Note: -16 is so we can grow the trapframe if we came from vm86 */ 2776 PCPU_SET(common_tss.tss_esp0, thread0.td_kstack + 2777 kstack0_sz - sizeof(struct pcb) - 16); 2778 PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL)); 2779 gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); 2780 HYPERVISOR_stack_switch(GSEL(GDATA_SEL, SEL_KPL), 2781 PCPU_GET(common_tss.tss_esp0)); 2782 2783 /* pointer to selector slot for %fs/%gs */ 2784 PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd); 2785 2786 dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 = 2787 dblfault_tss.tss_esp2 = (int)&dblfault_stack[sizeof(dblfault_stack)]; 2788 dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 = 2789 dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL); 2790#ifdef PAE 2791 dblfault_tss.tss_cr3 = (int)IdlePDPT; 2792#else 2793 dblfault_tss.tss_cr3 = (int)IdlePTD; 2794#endif 2795 dblfault_tss.tss_eip = (int)dblfault_handler; 2796 dblfault_tss.tss_eflags = PSL_KERNEL; 2797 dblfault_tss.tss_ds = dblfault_tss.tss_es = 2798 dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL); 2799 dblfault_tss.tss_fs = GSEL(GPRIV_SEL, SEL_KPL); 2800 dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL); 2801 dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL); 2802 2803 vm86_initialize(); 2804 getmemsize(first); 2805 init_param2(physmem); 2806 2807 /* now running on new page tables, configured,and u/iom is accessible */ 2808 2809 msgbufinit(msgbufp, msgbufsize); 2810 /* transfer to user mode */ 2811 2812 _ucodesel = GSEL(GUCODE_SEL, SEL_UPL); 2813 _udatasel = GSEL(GUDATA_SEL, SEL_UPL); 2814 2815 /* setup proc 0's pcb */ 2816 thread0.td_pcb->pcb_flags = 0; 2817#ifdef PAE 2818 thread0.td_pcb->pcb_cr3 = (int)IdlePDPT; 2819#else 2820 thread0.td_pcb->pcb_cr3 = (int)IdlePTD; 2821#endif 2822 thread0.td_pcb->pcb_ext = 0; 2823 thread0.td_frame = &proc0_tf; 2824 thread0.td_pcb->pcb_fsd = PCPU_GET(fsgs_gdt)[0]; 2825 thread0.td_pcb->pcb_gsd = PCPU_GET(fsgs_gdt)[1]; 2826 2827 cpu_probe_amdc1e(); 2828 cpu_probe_cmpxchg8b(); 2829} 2830 2831#else 2832void 2833init386(first) 2834 int first; 2835{ 2836 struct gate_descriptor *gdp; 2837 int gsel_tss, metadata_missing, x, pa; 2838 size_t kstack0_sz; 2839 struct pcpu *pc; 2840 2841 thread0.td_kstack = proc0kstack; 2842 thread0.td_kstack_pages = KSTACK_PAGES; 2843 kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE; 2844 thread0.td_pcb = (struct pcb *)(thread0.td_kstack + kstack0_sz) - 1; 2845 2846 /* 2847 * This may be done better later if it gets more high level 2848 * components in it. If so just link td->td_proc here. 2849 */ 2850 proc_linkup0(&proc0, &thread0); 2851 2852 metadata_missing = 0; 2853 if (bootinfo.bi_modulep) { 2854 preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE; 2855 preload_bootstrap_relocate(KERNBASE); 2856 } else { 2857 metadata_missing = 1; 2858 } 2859 if (envmode == 1) 2860 kern_envp = static_env; 2861 else if (bootinfo.bi_envp) 2862 kern_envp = (caddr_t)bootinfo.bi_envp + KERNBASE; 2863 2864 /* Init basic tunables, hz etc */ 2865 init_param1(); 2866 2867 /* 2868 * Make gdt memory segments. All segments cover the full 4GB 2869 * of address space and permissions are enforced at page level. 2870 */ 2871 gdt_segs[GCODE_SEL].ssd_limit = atop(0 - 1); 2872 gdt_segs[GDATA_SEL].ssd_limit = atop(0 - 1); 2873 gdt_segs[GUCODE_SEL].ssd_limit = atop(0 - 1); 2874 gdt_segs[GUDATA_SEL].ssd_limit = atop(0 - 1); 2875 gdt_segs[GUFS_SEL].ssd_limit = atop(0 - 1); 2876 gdt_segs[GUGS_SEL].ssd_limit = atop(0 - 1); 2877 2878 pc = &__pcpu[0]; 2879 gdt_segs[GPRIV_SEL].ssd_limit = atop(0 - 1); 2880 gdt_segs[GPRIV_SEL].ssd_base = (int) pc; 2881 gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss; 2882 2883 for (x = 0; x < NGDT; x++) 2884 ssdtosd(&gdt_segs[x], &gdt[x].sd); 2885 2886 r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1; 2887 r_gdt.rd_base = (int) gdt; 2888 mtx_init(&dt_lock, "descriptor tables", NULL, MTX_SPIN); 2889 lgdt(&r_gdt); 2890 2891 pcpu_init(pc, 0, sizeof(struct pcpu)); 2892 for (pa = first; pa < first + DPCPU_SIZE; pa += PAGE_SIZE) 2893 pmap_kenter(pa + KERNBASE, pa); 2894 dpcpu_init((void *)(first + KERNBASE), 0); 2895 first += DPCPU_SIZE; 2896 PCPU_SET(prvspace, pc); 2897 PCPU_SET(curthread, &thread0); 2898 PCPU_SET(curpcb, thread0.td_pcb); 2899 2900 /* 2901 * Initialize mutexes. 2902 * 2903 * icu_lock: in order to allow an interrupt to occur in a critical 2904 * section, to set pcpu->ipending (etc...) properly, we 2905 * must be able to get the icu lock, so it can't be 2906 * under witness. 2907 */ 2908 mutex_init(); 2909 mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS | MTX_NOPROFILE); 2910 2911 /* make ldt memory segments */ 2912 ldt_segs[LUCODE_SEL].ssd_limit = atop(0 - 1); 2913 ldt_segs[LUDATA_SEL].ssd_limit = atop(0 - 1); 2914 for (x = 0; x < sizeof ldt_segs / sizeof ldt_segs[0]; x++) 2915 ssdtosd(&ldt_segs[x], &ldt[x].sd); 2916 2917 _default_ldt = GSEL(GLDT_SEL, SEL_KPL); 2918 lldt(_default_ldt); 2919 PCPU_SET(currentldt, _default_ldt); 2920 2921 /* exceptions */ 2922 for (x = 0; x < NIDT; x++) 2923 setidt(x, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL, 2924 GSEL(GCODE_SEL, SEL_KPL)); 2925 setidt(IDT_DE, &IDTVEC(div), SDT_SYS386TGT, SEL_KPL, 2926 GSEL(GCODE_SEL, SEL_KPL)); 2927 setidt(IDT_DB, &IDTVEC(dbg), SDT_SYS386IGT, SEL_KPL, 2928 GSEL(GCODE_SEL, SEL_KPL)); 2929 setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYS386IGT, SEL_KPL, 2930 GSEL(GCODE_SEL, SEL_KPL)); 2931 setidt(IDT_BP, &IDTVEC(bpt), SDT_SYS386IGT, SEL_UPL, 2932 GSEL(GCODE_SEL, SEL_KPL)); 2933 setidt(IDT_OF, &IDTVEC(ofl), SDT_SYS386TGT, SEL_UPL, 2934 GSEL(GCODE_SEL, SEL_KPL)); 2935 setidt(IDT_BR, &IDTVEC(bnd), SDT_SYS386TGT, SEL_KPL, 2936 GSEL(GCODE_SEL, SEL_KPL)); 2937 setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, 2938 GSEL(GCODE_SEL, SEL_KPL)); 2939 setidt(IDT_NM, &IDTVEC(dna), SDT_SYS386TGT, SEL_KPL 2940 , GSEL(GCODE_SEL, SEL_KPL)); 2941 setidt(IDT_DF, 0, SDT_SYSTASKGT, SEL_KPL, GSEL(GPANIC_SEL, SEL_KPL)); 2942 setidt(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYS386TGT, SEL_KPL, 2943 GSEL(GCODE_SEL, SEL_KPL)); 2944 setidt(IDT_TS, &IDTVEC(tss), SDT_SYS386TGT, SEL_KPL, 2945 GSEL(GCODE_SEL, SEL_KPL)); 2946 setidt(IDT_NP, &IDTVEC(missing), SDT_SYS386TGT, SEL_KPL, 2947 GSEL(GCODE_SEL, SEL_KPL)); 2948 setidt(IDT_SS, &IDTVEC(stk), SDT_SYS386TGT, SEL_KPL, 2949 GSEL(GCODE_SEL, SEL_KPL)); 2950 setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, 2951 GSEL(GCODE_SEL, SEL_KPL)); 2952 setidt(IDT_PF, &IDTVEC(page), SDT_SYS386IGT, SEL_KPL, 2953 GSEL(GCODE_SEL, SEL_KPL)); 2954 setidt(IDT_MF, &IDTVEC(fpu), SDT_SYS386TGT, SEL_KPL, 2955 GSEL(GCODE_SEL, SEL_KPL)); 2956 setidt(IDT_AC, &IDTVEC(align), SDT_SYS386TGT, SEL_KPL, 2957 GSEL(GCODE_SEL, SEL_KPL)); 2958 setidt(IDT_MC, &IDTVEC(mchk), SDT_SYS386TGT, SEL_KPL, 2959 GSEL(GCODE_SEL, SEL_KPL)); 2960 setidt(IDT_XF, &IDTVEC(xmm), SDT_SYS386TGT, SEL_KPL, 2961 GSEL(GCODE_SEL, SEL_KPL)); 2962 setidt(IDT_SYSCALL, &IDTVEC(int0x80_syscall), SDT_SYS386TGT, SEL_UPL, 2963 GSEL(GCODE_SEL, SEL_KPL)); 2964#ifdef KDTRACE_HOOKS 2965 setidt(IDT_DTRACE_RET, &IDTVEC(dtrace_ret), SDT_SYS386TGT, SEL_UPL, 2966 GSEL(GCODE_SEL, SEL_KPL)); 2967#endif 2968 2969 r_idt.rd_limit = sizeof(idt0) - 1; 2970 r_idt.rd_base = (int) idt; 2971 lidt(&r_idt); 2972 2973#ifdef XBOX 2974 /* 2975 * The following code queries the PCI ID of 0:0:0. For the XBOX, 2976 * This should be 0x10de / 0x02a5. 2977 * 2978 * This is exactly what Linux does. 2979 */ 2980 outl(0xcf8, 0x80000000); 2981 if (inl(0xcfc) == 0x02a510de) { 2982 arch_i386_is_xbox = 1; 2983 pic16l_setled(XBOX_LED_GREEN); 2984 2985 /* 2986 * We are an XBOX, but we may have either 64MB or 128MB of 2987 * memory. The PCI host bridge should be programmed for this, 2988 * so we just query it. 2989 */ 2990 outl(0xcf8, 0x80000084); 2991 arch_i386_xbox_memsize = (inl(0xcfc) == 0x7FFFFFF) ? 128 : 64; 2992 } 2993#endif /* XBOX */ 2994 2995 /* 2996 * Initialize the i8254 before the console so that console 2997 * initialization can use DELAY(). 2998 */ 2999 i8254_init(); 3000 3001 /* 3002 * Initialize the console before we print anything out. 3003 */ 3004 cninit(); 3005 3006 if (metadata_missing) 3007 printf("WARNING: loader(8) metadata is missing!\n"); 3008 3009#ifdef DEV_ISA 3010#ifdef DEV_ATPIC 3011 elcr_probe(); 3012 atpic_startup(); 3013#else 3014 /* Reset and mask the atpics and leave them shut down. */ 3015 atpic_reset(); 3016 3017 /* 3018 * Point the ICU spurious interrupt vectors at the APIC spurious 3019 * interrupt handler. 3020 */ 3021 setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL, 3022 GSEL(GCODE_SEL, SEL_KPL)); 3023 setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL, 3024 GSEL(GCODE_SEL, SEL_KPL)); 3025#endif 3026#endif 3027 3028#ifdef DDB 3029 ksym_start = bootinfo.bi_symtab; 3030 ksym_end = bootinfo.bi_esymtab; 3031#endif 3032 3033 kdb_init(); 3034 3035#ifdef KDB 3036 if (boothowto & RB_KDB) 3037 kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger"); 3038#endif 3039 3040 finishidentcpu(); /* Final stage of CPU initialization */ 3041 setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, 3042 GSEL(GCODE_SEL, SEL_KPL)); 3043 setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, 3044 GSEL(GCODE_SEL, SEL_KPL)); 3045 initializecpu(); /* Initialize CPU registers */ 3046 3047 /* make an initial tss so cpu can get interrupt stack on syscall! */ 3048 /* Note: -16 is so we can grow the trapframe if we came from vm86 */ 3049 PCPU_SET(common_tss.tss_esp0, thread0.td_kstack + 3050 kstack0_sz - sizeof(struct pcb) - 16); 3051 PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL)); 3052 gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); 3053 PCPU_SET(tss_gdt, &gdt[GPROC0_SEL].sd); 3054 PCPU_SET(common_tssd, *PCPU_GET(tss_gdt)); 3055 PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16); 3056 ltr(gsel_tss); 3057 3058 /* pointer to selector slot for %fs/%gs */ 3059 PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd); 3060 3061 dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 = 3062 dblfault_tss.tss_esp2 = (int)&dblfault_stack[sizeof(dblfault_stack)]; 3063 dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 = 3064 dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL); 3065#ifdef PAE 3066 dblfault_tss.tss_cr3 = (int)IdlePDPT; 3067#else 3068 dblfault_tss.tss_cr3 = (int)IdlePTD; 3069#endif 3070 dblfault_tss.tss_eip = (int)dblfault_handler; 3071 dblfault_tss.tss_eflags = PSL_KERNEL; 3072 dblfault_tss.tss_ds = dblfault_tss.tss_es = 3073 dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL); 3074 dblfault_tss.tss_fs = GSEL(GPRIV_SEL, SEL_KPL); 3075 dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL); 3076 dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL); 3077 3078 vm86_initialize(); 3079 getmemsize(first); 3080 init_param2(physmem); 3081 3082 /* now running on new page tables, configured,and u/iom is accessible */ 3083 3084 msgbufinit(msgbufp, msgbufsize); 3085 3086 /* make a call gate to reenter kernel with */ 3087 gdp = &ldt[LSYS5CALLS_SEL].gd; 3088 3089 x = (int) &IDTVEC(lcall_syscall); 3090 gdp->gd_looffset = x; 3091 gdp->gd_selector = GSEL(GCODE_SEL,SEL_KPL); 3092 gdp->gd_stkcpy = 1; 3093 gdp->gd_type = SDT_SYS386CGT; 3094 gdp->gd_dpl = SEL_UPL; 3095 gdp->gd_p = 1; 3096 gdp->gd_hioffset = x >> 16; 3097 3098 /* XXX does this work? */ 3099 /* XXX yes! */ 3100 ldt[LBSDICALLS_SEL] = ldt[LSYS5CALLS_SEL]; 3101 ldt[LSOL26CALLS_SEL] = ldt[LSYS5CALLS_SEL]; 3102 3103 /* transfer to user mode */ 3104 3105 _ucodesel = GSEL(GUCODE_SEL, SEL_UPL); 3106 _udatasel = GSEL(GUDATA_SEL, SEL_UPL); 3107 3108 /* setup proc 0's pcb */ 3109 thread0.td_pcb->pcb_flags = 0; 3110#ifdef PAE 3111 thread0.td_pcb->pcb_cr3 = (int)IdlePDPT; 3112#else 3113 thread0.td_pcb->pcb_cr3 = (int)IdlePTD; 3114#endif 3115 thread0.td_pcb->pcb_ext = 0; 3116 thread0.td_frame = &proc0_tf; 3117 3118 cpu_probe_amdc1e(); 3119 cpu_probe_cmpxchg8b();
|