machdep.c revision 114987
1/*- 2 * Copyright (c) 1992 Terrence R. Lambert. 3 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. 4 * All rights reserved. 5 * 6 * This code is derived from software contributed to Berkeley by 7 * William Jolitz. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 3. All advertising materials mentioning features or use of this software 18 * must display the following acknowledgement: 19 * This product includes software developed by the University of 20 * California, Berkeley and its contributors. 21 * 4. Neither the name of the University nor the names of its contributors 22 * may be used to endorse or promote products derived from this software 23 * without specific prior written permission. 24 * 25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 28 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 35 * SUCH DAMAGE. 36 * 37 * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 38 * $FreeBSD: head/sys/amd64/amd64/machdep.c 114987 2003-05-14 04:10:49Z peter $ 39 */ 40 41#include "opt_atalk.h" 42#include "opt_compat.h" 43#include "opt_cpu.h" 44#include "opt_ddb.h" 45#include "opt_inet.h" 46#include "opt_ipx.h" 47#include "opt_isa.h" 48#include "opt_maxmem.h" 49#include "opt_msgbuf.h" 50#include "opt_perfmon.h" 51#include "opt_kstack_pages.h" 52 53#include <sys/param.h> 54#include <sys/systm.h> 55#include <sys/sysproto.h> 56#include <sys/signalvar.h> 57#include <sys/imgact.h> 58#include <sys/kernel.h> 59#include <sys/ktr.h> 60#include <sys/linker.h> 61#include <sys/lock.h> 62#include <sys/malloc.h> 63#include <sys/mutex.h> 64#include <sys/pcpu.h> 65#include <sys/proc.h> 66#include <sys/bio.h> 67#include <sys/buf.h> 68#include <sys/reboot.h> 69#include <sys/callout.h> 70#include <sys/msgbuf.h> 71#include <sys/sched.h> 72#include <sys/sysent.h> 73#include <sys/sysctl.h> 74#include <sys/ucontext.h> 75#include <sys/vmmeter.h> 76#include <sys/bus.h> 77#include <sys/eventhandler.h> 78 79#include <vm/vm.h> 80#include <vm/vm_param.h> 81#include <vm/vm_kern.h> 82#include <vm/vm_object.h> 83#include <vm/vm_page.h> 84#include <vm/vm_map.h> 85#include <vm/vm_pager.h> 86#include <vm/vm_extern.h> 87 88#include <sys/user.h> 89#include <sys/exec.h> 90#include <sys/cons.h> 91 92#include <ddb/ddb.h> 93 94#include <net/netisr.h> 95 96#include <machine/cpu.h> 97#include 
<machine/cputypes.h> 98#include <machine/reg.h> 99#include <machine/clock.h> 100#include <machine/specialreg.h> 101#include <machine/md_var.h> 102#include <machine/metadata.h> 103#include <machine/proc.h> 104#ifdef PERFMON 105#include <machine/perfmon.h> 106#endif 107#include <machine/tss.h> 108 109#include <amd64/isa/icu.h> 110#include <amd64/isa/intr_machdep.h> 111#include <isa/rtc.h> 112#include <sys/ptrace.h> 113#include <machine/sigframe.h> 114 115extern void hammer_time(void); 116extern void dblfault_handler(void); 117 118extern void printcpuinfo(void); /* XXX header file */ 119extern void identify_cpu(void); 120extern void panicifcpuunsupported(void); 121extern void initializecpu(void); 122 123#define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) 124#define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) 125 126static void cpu_startup(void *); 127static void get_fpcontext(struct thread *td, mcontext_t *mcp); 128static int set_fpcontext(struct thread *td, const mcontext_t *mcp); 129SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL) 130 131int _udatasel, _ucodesel, _ucode32sel; 132u_long atdevbase; 133 134u_int64_t modulep; /* phys addr of metadata table */ 135u_int64_t physfree; /* first free page after kernel */ 136u_int64_t IdlePTD; /* phys addr of kernel PTD */ 137u_int64_t IdlePDP; /* phys addr of kernel level 3 */ 138u_int64_t IdlePML4; /* phys addr of kernel level 4 */ 139struct user *proc0uarea; /* address of proc 0 uarea space */ 140vm_offset_t proc0kstack; /* address of proc 0 kstack space */ 141 142int cold = 1; 143 144long Maxmem = 0; 145 146vm_paddr_t phys_avail[10]; 147 148/* must be 2 less so 0 0 can signal end of chunks */ 149#define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(vm_offset_t)) - 2) 150 151struct kva_md_info kmi; 152 153static struct trapframe proc0_tf; 154static struct pcpu __pcpu; 155 156struct mtx icu_lock; 157 158static void 159cpu_startup(dummy) 160 void *dummy; 161{ 162 /* 163 * Good 
{morning,afternoon,evening,night}. 164 */ 165 startrtclock(); 166 printcpuinfo(); 167 panicifcpuunsupported(); 168#ifdef PERFMON 169 perfmon_init(); 170#endif 171 printf("real memory = %ju (%ju MB)\n", ptoa((uintmax_t)Maxmem), 172 ptoa((uintmax_t)Maxmem) / 1048576); 173 /* 174 * Display any holes after the first chunk of extended memory. 175 */ 176 if (bootverbose) { 177 int indx; 178 179 printf("Physical memory chunk(s):\n"); 180 for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { 181 vm_paddr_t size; 182 183 size = phys_avail[indx + 1] - phys_avail[indx]; 184 printf( 185 "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n", 186 (uintmax_t)phys_avail[indx], 187 (uintmax_t)phys_avail[indx + 1] - 1, 188 (uintmax_t)size, (uintmax_t)size / PAGE_SIZE); 189 } 190 } 191 192 vm_ksubmap_init(&kmi); 193 194 printf("avail memory = %ju (%ju MB)\n", 195 ptoa((uintmax_t)cnt.v_free_count), 196 ptoa((uintmax_t)cnt.v_free_count) / 1048576); 197 198 /* 199 * Set up buffers, so they can be used to read disk labels. 200 */ 201 bufinit(); 202 vm_pager_bufferinit(); 203 204 /* For SMP, we delay the cpu_setregs() until after SMP startup. */ 205 cpu_setregs(); 206} 207 208/* 209 * Send an interrupt to process. 210 * 211 * Stack is set up to allow sigcode stored 212 * at top to call routine, followed by kcall 213 * to sigreturn routine below. After sigreturn 214 * resets the signal mask, the stack, and the 215 * frame pointer, it returns to the user 216 * specified pc, psl. 217 */ 218void 219sendsig(catcher, sig, mask, code) 220 sig_t catcher; 221 int sig; 222 sigset_t *mask; 223 u_long code; 224{ 225 struct sigframe sf, *sfp; 226 struct proc *p; 227 struct thread *td; 228 struct sigacts *psp; 229 char *sp; 230 struct trapframe *regs; 231 int oonstack; 232 233 td = curthread; 234 p = td->td_proc; 235 PROC_LOCK_ASSERT(p, MA_OWNED); 236 psp = p->p_sigacts; 237 mtx_assert(&psp->ps_mtx, MA_OWNED); 238 regs = td->td_frame; 239 oonstack = sigonstack(regs->tf_rsp); 240 241 /* Save user context. 
*/ 242 bzero(&sf, sizeof(sf)); 243 sf.sf_uc.uc_sigmask = *mask; 244 sf.sf_uc.uc_stack = p->p_sigstk; 245 sf.sf_uc.uc_stack.ss_flags = (p->p_flag & P_ALTSTACK) 246 ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; 247 sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; 248 bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(*regs)); 249 sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */ 250 get_fpcontext(td, &sf.sf_uc.uc_mcontext); 251 fpstate_drop(td); 252 253 /* Allocate space for the signal handler context. */ 254 if ((p->p_flag & P_ALTSTACK) != 0 && !oonstack && 255 SIGISMEMBER(psp->ps_sigonstack, sig)) { 256 sp = p->p_sigstk.ss_sp + 257 p->p_sigstk.ss_size - sizeof(struct sigframe); 258#if defined(COMPAT_43) || defined(COMPAT_SUNOS) 259 p->p_sigstk.ss_flags |= SS_ONSTACK; 260#endif 261 } else 262 sp = (char *)regs->tf_rsp - sizeof(struct sigframe) - 128; 263 /* Align to 16 bytes. */ 264 sfp = (struct sigframe *)((unsigned long)sp & ~0xF); 265 266 /* Translate the signal if appropriate. */ 267 if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize) 268 sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; 269 270 /* Build the argument list for the signal handler. */ 271 regs->tf_rdi = sig; /* arg 1 in %rdi */ 272 regs->tf_rdx = (register_t)&sfp->sf_uc; /* arg 3 in %rdx */ 273 if (SIGISMEMBER(psp->ps_siginfo, sig)) { 274 /* Signal handler installed with SA_SIGINFO. */ 275 regs->tf_rsi = (register_t)&sfp->sf_si; /* arg 2 in %rsi */ 276 sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher; 277 278 /* Fill in POSIX parts */ 279 sf.sf_si.si_signo = sig; 280 sf.sf_si.si_code = code; 281 regs->tf_rcx = regs->tf_addr; /* arg 4 in %rcx */ 282 } else { 283 /* Old FreeBSD-style arguments. */ 284 regs->tf_rsi = code; /* arg 2 in %rsi */ 285 regs->tf_rcx = regs->tf_addr; /* arg 4 in %rcx */ 286 sf.sf_ahu.sf_handler = catcher; 287 } 288 mtx_unlock(&psp->ps_mtx); 289 PROC_UNLOCK(p); 290 291 /* 292 * Copy the sigframe out to the user's stack. 
293 */ 294 if (copyout(&sf, sfp, sizeof(*sfp)) != 0) { 295#ifdef DEBUG 296 printf("process %ld has trashed its stack\n", (long)p->p_pid); 297#endif 298 PROC_LOCK(p); 299 sigexit(td, SIGILL); 300 } 301 302 regs->tf_rsp = (long)sfp; 303 regs->tf_rip = PS_STRINGS - *(p->p_sysent->sv_szsigcode); 304 regs->tf_rflags &= ~PSL_T; 305 regs->tf_cs = _ucodesel; 306 PROC_LOCK(p); 307 mtx_lock(&psp->ps_mtx); 308} 309 310/* 311 * System call to cleanup state after a signal 312 * has been taken. Reset signal mask and 313 * stack state from context left by sendsig (above). 314 * Return to previous pc and psl as specified by 315 * context left by sendsig. Check carefully to 316 * make sure that the user has not modified the 317 * state to gain improper privileges. 318 * 319 * MPSAFE 320 */ 321int 322sigreturn(td, uap) 323 struct thread *td; 324 struct sigreturn_args /* { 325 const __ucontext *sigcntxp; 326 } */ *uap; 327{ 328 ucontext_t uc; 329 struct proc *p = td->td_proc; 330 struct trapframe *regs; 331 const ucontext_t *ucp; 332 long rflags; 333 int cs, error, ret; 334 335 error = copyin(uap->sigcntxp, &uc, sizeof(uc)); 336 if (error != 0) 337 return (error); 338 ucp = &uc; 339 regs = td->td_frame; 340 rflags = ucp->uc_mcontext.mc_rflags; 341 /* 342 * Don't allow users to change privileged or reserved flags. 343 */ 344 /* 345 * XXX do allow users to change the privileged flag PSL_RF. 346 * The cpu sets PSL_RF in tf_rflags for faults. Debuggers 347 * should sometimes set it there too. tf_rflags is kept in 348 * the signal context during signal handling and there is no 349 * other place to remember it, so the PSL_RF bit may be 350 * corrupted by the signal handler without us knowing. 351 * Corruption of the PSL_RF bit at worst causes one more or 352 * one less debugger trap, so allowing it is fairly harmless. 
353 */ 354 if (!EFL_SECURE(rflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF)) { 355 printf("sigreturn: rflags = 0x%lx\n", rflags); 356 return (EINVAL); 357 } 358 359 /* 360 * Don't allow users to load a valid privileged %cs. Let the 361 * hardware check for invalid selectors, excess privilege in 362 * other selectors, invalid %eip's and invalid %esp's. 363 */ 364 cs = ucp->uc_mcontext.mc_cs; 365 if (!CS_SECURE(cs)) { 366 printf("sigreturn: cs = 0x%x\n", cs); 367 trapsignal(td, SIGBUS, T_PROTFLT); 368 return (EINVAL); 369 } 370 371 ret = set_fpcontext(td, &ucp->uc_mcontext); 372 if (ret != 0) 373 return (ret); 374 bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(*regs)); 375 376 PROC_LOCK(p); 377#if defined(COMPAT_43) || defined(COMPAT_SUNOS) 378 if (ucp->uc_mcontext.mc_onstack & 1) 379 p->p_sigstk.ss_flags |= SS_ONSTACK; 380 else 381 p->p_sigstk.ss_flags &= ~SS_ONSTACK; 382#endif 383 384 td->td_sigmask = ucp->uc_sigmask; 385 SIG_CANTMASK(td->td_sigmask); 386 signotify(td); 387 PROC_UNLOCK(p); 388 td->td_pcb->pcb_flags |= PCB_FULLCTX; 389 return (EJUSTRETURN); 390} 391 392#ifdef COMPAT_FREEBSD4 393int 394freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap) 395{ 396 397 return sigreturn(td, (struct sigreturn_args *)uap); 398} 399#endif 400 401 402/* 403 * Machine dependent boot() routine 404 * 405 * I haven't seen anything to put here yet 406 * Possibly some stuff might be grafted back here from boot() 407 */ 408void 409cpu_boot(int howto) 410{ 411} 412 413/* 414 * Shutdown the CPU as much as possible 415 */ 416void 417cpu_halt(void) 418{ 419 for (;;) 420 __asm__ ("hlt"); 421} 422 423/* 424 * Hook to idle the CPU when possible. In the SMP case we default to 425 * off because a halted cpu will not currently pick up a new thread in the 426 * run queue until the next timer tick. 
If turned on this will result in 427 * approximately a 4.2% loss in real time performance in buildworld tests 428 * (but improves user and sys times oddly enough), and saves approximately 429 * 5% in power consumption on an idle machine (tests w/2xCPU 1.1GHz P3). 430 * 431 * XXX we need to have a cpu mask of idle cpus and generate an IPI or 432 * otherwise generate some sort of interrupt to wake up cpus sitting in HLT. 433 * Then we can have our cake and eat it too. 434 * 435 * XXX I'm turning it on for SMP as well by default for now. It seems to 436 * help lock contention somewhat, and this is critical for HTT. -Peter 437 */ 438static int cpu_idle_hlt = 1; 439SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hlt, CTLFLAG_RW, 440 &cpu_idle_hlt, 0, "Idle loop HLT enable"); 441 442/* 443 * Note that we have to be careful here to avoid a race between checking 444 * sched_runnable() and actually halting. If we don't do this, we may waste 445 * the time between calling hlt and the next interrupt even though there 446 * is a runnable process. 447 */ 448void 449cpu_idle(void) 450{ 451 452 if (cpu_idle_hlt) { 453 disable_intr(); 454 if (sched_runnable()) { 455 enable_intr(); 456 } else { 457 /* 458 * we must absolutely guarentee that hlt is the 459 * absolute next instruction after sti or we 460 * introduce a timing window. 
461 */ 462 __asm __volatile("sti; hlt"); 463 } 464 } 465} 466 467/* 468 * Clear registers on exec 469 */ 470void 471exec_setregs(td, entry, stack, ps_strings) 472 struct thread *td; 473 u_long entry; 474 u_long stack; 475 u_long ps_strings; 476{ 477 struct trapframe *regs = td->td_frame; 478 struct pcb *pcb = td->td_pcb; 479 u_int64_t pc; 480 481 wrmsr(MSR_FSBASE, 0); 482 wrmsr(MSR_KGSBASE, 0); /* User value while we're in the kernel */ 483 pcb->pcb_fsbase = 0; 484 pcb->pcb_gsbase = 0; 485 pcb->pcb_kgsbase = rdmsr(MSR_GSBASE); 486 load_ds(_udatasel); 487 load_es(_udatasel); 488 load_fs(_udatasel); 489 critical_enter(); 490 pc = rdmsr(MSR_GSBASE); 491 load_gs(_udatasel); /* Clobbers kernel %GS.base */ 492 wrmsr(MSR_GSBASE, pc); 493 critical_exit(); 494 pcb->pcb_ds = _udatasel; 495 pcb->pcb_es = _udatasel; 496 pcb->pcb_fs = _udatasel; 497 pcb->pcb_gs = _udatasel; 498 499 bzero((char *)regs, sizeof(struct trapframe)); 500 regs->tf_rip = entry; 501 /* This strangeness is to ensure alignment after the implied return address */ 502 regs->tf_rsp = ((stack - 8) & ~0xF) + 8; 503 regs->tf_rdi = stack; /* argv */ 504 regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T); 505 regs->tf_ss = _udatasel; 506 regs->tf_cs = _ucodesel; 507 508 /* 509 * Arrange to trap the next npx or `fwait' instruction (see npx.c 510 * for why fwait must be trapped at least if there is an npx or an 511 * emulator). This is mainly to handle the case where npx0 is not 512 * configured, since the npx routines normally set up the trap 513 * otherwise. It should be done only at boot time, but doing it 514 * here allows modifying `npx_exists' for testing the emulator on 515 * systems with an npx. 516 */ 517 load_cr0(rcr0() | CR0_MP | CR0_TS); 518 519 /* Initialize the npx (if any) for the current process. */ 520 /* 521 * XXX the above load_cr0() also initializes it and is a layering 522 * violation if NPX is configured. 
It drops the npx partially 523 * and this would be fatal if we were interrupted now, and decided 524 * to force the state to the pcb, and checked the invariant 525 * (CR0_TS clear) if and only if PCPU_GET(fpcurthread) != NULL). 526 * ALL of this can happen except the check. The check used to 527 * happen and be fatal later when we didn't complete the drop 528 * before returning to user mode. This should be fixed properly 529 * soon. 530 */ 531 fpstate_drop(td); 532} 533 534void 535cpu_setregs(void) 536{ 537 register_t cr0; 538 539 cr0 = rcr0(); 540 cr0 |= CR0_NE; /* Done by npxinit() */ 541 cr0 |= CR0_MP | CR0_TS; /* Done at every execve() too. */ 542 cr0 |= CR0_WP | CR0_AM; 543 load_cr0(cr0); 544} 545 546static int 547sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS) 548{ 549 int error; 550 error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, 551 req); 552 if (!error && req->newptr) 553 resettodr(); 554 return (error); 555} 556 557SYSCTL_PROC(_machdep, CPU_ADJKERNTZ, adjkerntz, CTLTYPE_INT|CTLFLAG_RW, 558 &adjkerntz, 0, sysctl_machdep_adjkerntz, "I", ""); 559 560SYSCTL_INT(_machdep, CPU_DISRTCSET, disable_rtc_set, 561 CTLFLAG_RW, &disable_rtc_set, 0, ""); 562 563SYSCTL_INT(_machdep, CPU_WALLCLOCK, wall_cmos_clock, 564 CTLFLAG_RW, &wall_cmos_clock, 0, ""); 565 566/* 567 * Initialize 386 and configure to run kernel 568 */ 569 570/* 571 * Initialize segments & interrupt table 572 */ 573 574struct user_segment_descriptor gdt[NGDT];/* global descriptor table */ 575static struct gate_descriptor idt0[NIDT]; 576struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */ 577 578static char dblfault_stack[PAGE_SIZE] __aligned(16); 579 580struct amd64tss common_tss; 581 582/* software prototypes -- in more palatable form */ 583struct soft_segment_descriptor gdt_segs[] = { 584/* GNULL_SEL 0 Null Descriptor */ 585{ 0x0, /* segment base address */ 586 0x0, /* length */ 587 0, /* segment type */ 588 0, /* segment descriptor priority level */ 589 0, /* segment 
descriptor present */ 590 0, /* long */ 591 0, /* default 32 vs 16 bit size */ 592 0 /* limit granularity (byte/page units)*/ }, 593/* GCODE_SEL 1 Code Descriptor for kernel */ 594{ 0x0, /* segment base address */ 595 0xfffff, /* length - all address space */ 596 SDT_MEMERA, /* segment type */ 597 SEL_KPL, /* segment descriptor priority level */ 598 1, /* segment descriptor present */ 599 1, /* long */ 600 0, /* default 32 vs 16 bit size */ 601 1 /* limit granularity (byte/page units)*/ }, 602/* GDATA_SEL 2 Data Descriptor for kernel */ 603{ 0x0, /* segment base address */ 604 0xfffff, /* length - all address space */ 605 SDT_MEMRWA, /* segment type */ 606 SEL_KPL, /* segment descriptor priority level */ 607 1, /* segment descriptor present */ 608 1, /* long */ 609 0, /* default 32 vs 16 bit size */ 610 1 /* limit granularity (byte/page units)*/ }, 611/* GUCODE32_SEL 3 32 bit Code Descriptor for user */ 612{ 0x0, /* segment base address */ 613 0xfffff, /* length - all address space */ 614 SDT_MEMERA, /* segment type */ 615 SEL_UPL, /* segment descriptor priority level */ 616 1, /* segment descriptor present */ 617 0, /* long */ 618 1, /* default 32 vs 16 bit size */ 619 1 /* limit granularity (byte/page units)*/ }, 620/* GUDATA_SEL 4 32/64 bit Data Descriptor for user */ 621{ 0x0, /* segment base address */ 622 0xfffff, /* length - all address space */ 623 SDT_MEMRWA, /* segment type */ 624 SEL_UPL, /* segment descriptor priority level */ 625 1, /* segment descriptor present */ 626 0, /* long */ 627 1, /* default 32 vs 16 bit size */ 628 1 /* limit granularity (byte/page units)*/ }, 629/* GUCODE_SEL 5 64 bit Code Descriptor for user */ 630{ 0x0, /* segment base address */ 631 0xfffff, /* length - all address space */ 632 SDT_MEMERA, /* segment type */ 633 SEL_UPL, /* segment descriptor priority level */ 634 1, /* segment descriptor present */ 635 1, /* long */ 636 0, /* default 32 vs 16 bit size */ 637 1 /* limit granularity (byte/page units)*/ }, 638/* GPROC0_SEL 
6 Proc 0 Tss Descriptor */ 639{ 640 0x0, /* segment base address */ 641 sizeof(struct amd64tss)-1,/* length - all address space */ 642 SDT_SYSTSS, /* segment type */ 643 SEL_KPL, /* segment descriptor priority level */ 644 1, /* segment descriptor present */ 645 0, /* long */ 646 0, /* unused - default 32 vs 16 bit size */ 647 0 /* limit granularity (byte/page units)*/ }, 648/* Actually, the TSS is a system descriptor which is double size */ 649{ 0x0, /* segment base address */ 650 0x0, /* length */ 651 0, /* segment type */ 652 0, /* segment descriptor priority level */ 653 0, /* segment descriptor present */ 654 0, /* long */ 655 0, /* default 32 vs 16 bit size */ 656 0 /* limit granularity (byte/page units)*/ }, 657}; 658 659void 660setidt(idx, func, typ, dpl, ist) 661 int idx; 662 inthand_t *func; 663 int typ; 664 int dpl; 665 int ist; 666{ 667 struct gate_descriptor *ip; 668 669 ip = idt + idx; 670 ip->gd_looffset = (uintptr_t)func; 671 ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL); 672 ip->gd_ist = ist; 673 ip->gd_xx = 0; 674 ip->gd_type = typ; 675 ip->gd_dpl = dpl; 676 ip->gd_p = 1; 677 ip->gd_hioffset = ((uintptr_t)func)>>16 ; 678} 679 680#define IDTVEC(name) __CONCAT(X,name) 681 682extern inthand_t 683 IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl), 684 IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm), 685 IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot), 686 IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align), 687 IDTVEC(xmm), IDTVEC(dblfault), 688 IDTVEC(fast_syscall), IDTVEC(fast_syscall32); 689 690void 691sdtossd(sd, ssd) 692 struct user_segment_descriptor *sd; 693 struct soft_segment_descriptor *ssd; 694{ 695 696 ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase; 697 ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit; 698 ssd->ssd_type = sd->sd_type; 699 ssd->ssd_dpl = sd->sd_dpl; 700 ssd->ssd_p = sd->sd_p; 701 ssd->ssd_long = sd->sd_long; 702 ssd->ssd_def32 = sd->sd_def32; 703 ssd->ssd_gran = 
sd->sd_gran; 704} 705 706void 707ssdtosd(ssd, sd) 708 struct soft_segment_descriptor *ssd; 709 struct user_segment_descriptor *sd; 710{ 711 712 sd->sd_lobase = (ssd->ssd_base) & 0xffffff; 713 sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff; 714 sd->sd_lolimit = (ssd->ssd_limit) & 0xffff; 715 sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf; 716 sd->sd_type = ssd->ssd_type; 717 sd->sd_dpl = ssd->ssd_dpl; 718 sd->sd_p = ssd->ssd_p; 719 sd->sd_long = ssd->ssd_long; 720 sd->sd_def32 = ssd->ssd_def32; 721 sd->sd_gran = ssd->ssd_gran; 722} 723 724void 725ssdtosyssd(ssd, sd) 726 struct soft_segment_descriptor *ssd; 727 struct system_segment_descriptor *sd; 728{ 729 730 sd->sd_lobase = (ssd->ssd_base) & 0xffffff; 731 sd->sd_hibase = (ssd->ssd_base >> 24) & 0xffffff; 732 sd->sd_lolimit = (ssd->ssd_limit) & 0xffff; 733 sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf; 734 sd->sd_type = ssd->ssd_type; 735 sd->sd_dpl = ssd->ssd_dpl; 736 sd->sd_p = ssd->ssd_p; 737 sd->sd_gran = ssd->ssd_gran; 738} 739 740 741#define PHYSMAP_SIZE (2 * 8) 742 743struct bios_smap { 744 u_int64_t base; 745 u_int64_t length; 746 u_int32_t type; 747} __packed; 748 749/* 750 * Populate the (physmap) array with base/bound pairs describing the 751 * available physical memory in the system, then test this memory and 752 * build the phys_avail array describing the actually-available memory. 753 * 754 * If we cannot accurately determine the physical memory map, then use 755 * value from the 0xE801 call, and failing that, the RTC. 756 * 757 * Total memory size may be set by the kernel environment variable 758 * hw.physmem or the compile-time define MAXMEM. 759 * 760 * XXX first should be vm_paddr_t. 
761 */ 762static void 763getmemsize(caddr_t kmdp, u_int64_t first) 764{ 765 int i, physmap_idx, pa_indx; 766 u_int basemem, extmem; 767 vm_paddr_t pa, physmap[PHYSMAP_SIZE]; 768 pt_entry_t *pte; 769 char *cp; 770 struct bios_smap *smapbase, *smap, *smapend; 771 u_int32_t smapsize; 772 773 bzero(physmap, sizeof(physmap)); 774 basemem = 0; 775 physmap_idx = 0; 776 777 /* 778 * get memory map from INT 15:E820, kindly supplied by the loader. 779 * 780 * subr_module.c says: 781 * "Consumer may safely assume that size value precedes data." 782 * ie: an int32_t immediately precedes smap. 783 */ 784 smapbase = (struct bios_smap *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_SMAP); 785 if (smapbase == 0) 786 smapbase = (struct bios_smap *)preload_search_info(kmdp, MODINFO_METADATA | 0x0009); /* Old value for MODINFOMD_SMAP */ 787 if (smapbase == 0) { 788 panic("No BIOS smap info from loader!"); 789 goto deep_shit; 790 } 791 smapsize = *((u_int32_t *)smapbase - 1); 792 smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize); 793 794 for (smap = smapbase; smap < smapend; smap++) { 795 if (boothowto & RB_VERBOSE) 796 printf("SMAP type=%02x base=%016lx len=%016lx\n", 797 smap->type, smap->base, smap->length); 798 799 if (smap->type != 0x01) { 800 continue; 801 } 802 803 if (smap->length == 0) { 804next_run: 805 continue; 806 } 807 808 for (i = 0; i <= physmap_idx; i += 2) { 809 if (smap->base < physmap[i + 1]) { 810 if (boothowto & RB_VERBOSE) 811 printf( 812 "Overlapping or non-montonic memory region, ignoring second region\n"); 813 goto next_run; 814 } 815 } 816 817 if (smap->base == physmap[physmap_idx + 1]) { 818 physmap[physmap_idx + 1] += smap->length; 819 continue; 820 } 821 822 physmap_idx += 2; 823 if (physmap_idx == PHYSMAP_SIZE) { 824 printf( 825 "Too many segments in the physical address map, giving up\n"); 826 break; 827 } 828 physmap[physmap_idx] = smap->base; 829 physmap[physmap_idx + 1] = smap->base + smap->length; 830 } 831 832 /* 833 * Perform 
"base memory" related probes & setup based on SMAP 834 */ 835deep_shit: 836 if (basemem == 0) { 837 for (i = 0; i <= physmap_idx; i += 2) { 838 if (physmap[i] == 0x00000000) { 839 basemem = physmap[i + 1] / 1024; 840 break; 841 } 842 } 843 844 if (basemem == 0) { 845 basemem = rtcin(RTC_BASELO) + (rtcin(RTC_BASEHI) << 8); 846 } 847 848 if (basemem == 0) { 849 basemem = 640; 850 } 851 852 if (basemem > 640) { 853 printf("Preposterous BIOS basemem of %uK, truncating to 640K\n", 854 basemem); 855 basemem = 640; 856 } 857 858#if 0 859 for (pa = trunc_page(basemem * 1024); 860 pa < ISA_HOLE_START; pa += PAGE_SIZE) 861 pmap_kenter(KERNBASE + pa, pa); 862#endif 863 } 864 865 if (physmap[1] != 0) 866 goto physmap_done; 867 868 /* 869 * Prefer the RTC value for extended memory. 870 */ 871 extmem = rtcin(RTC_EXTLO) + (rtcin(RTC_EXTHI) << 8); 872 873 /* 874 * Special hack for chipsets that still remap the 384k hole when 875 * there's 16MB of memory - this really confuses people that 876 * are trying to use bus mastering ISA controllers with the 877 * "16MB limit"; they only have 16MB, but the remapping puts 878 * them beyond the limit. 879 * 880 * If extended memory is between 15-16MB (16-17MB phys address range), 881 * chop it to 15MB. 882 */ 883 if ((extmem > 15 * 1024) && (extmem < 16 * 1024)) 884 extmem = 15 * 1024; 885 886 physmap[0] = 0; 887 physmap[1] = basemem * 1024; 888 physmap_idx = 2; 889 physmap[physmap_idx] = 0x100000; 890 physmap[physmap_idx + 1] = physmap[physmap_idx] + extmem * 1024; 891 892physmap_done: 893 /* 894 * Now, physmap contains a map of physical memory. 895 */ 896 /* 897 * Maxmem isn't the "maximum memory", it's one larger than the 898 * highest page of the physical address space. It should be 899 * called something like "Maxphyspage". We may adjust this 900 * based on ``hw.physmem'' and the results of the memory test. 
901 */ 902 Maxmem = atop(physmap[physmap_idx + 1]); 903 904#ifdef MAXMEM 905 Maxmem = MAXMEM / 4; 906#endif 907 908 /* 909 * hw.physmem is a size in bytes; we also allow k, m, and g suffixes 910 * for the appropriate modifiers. This overrides MAXMEM. 911 */ 912 if ((cp = getenv("hw.physmem")) != NULL) { 913 u_int64_t AllowMem, sanity; 914 char *ep; 915 916 sanity = AllowMem = strtouq(cp, &ep, 0); 917 if ((ep != cp) && (*ep != 0)) { 918 switch(*ep) { 919 case 'g': 920 case 'G': 921 AllowMem <<= 10; 922 case 'm': 923 case 'M': 924 AllowMem <<= 10; 925 case 'k': 926 case 'K': 927 AllowMem <<= 10; 928 break; 929 default: 930 AllowMem = sanity = 0; 931 } 932 if (AllowMem < sanity) 933 AllowMem = 0; 934 } 935 if (AllowMem == 0) 936 printf("Ignoring invalid memory size of '%s'\n", cp); 937 else 938 Maxmem = atop(AllowMem); 939 freeenv(cp); 940 } 941 942 if (atop(physmap[physmap_idx + 1]) != Maxmem && 943 (boothowto & RB_VERBOSE)) 944 printf("Physical memory use set to %ldK\n", Maxmem * 4); 945 946 /* 947 * If Maxmem has been increased beyond what the system has detected, 948 * extend the last memory segment to the new limit. 949 */ 950 if (atop(physmap[physmap_idx + 1]) < Maxmem) 951 physmap[physmap_idx + 1] = ptoa((vm_paddr_t)Maxmem); 952 953 /* call pmap initialization to make new kernel address space */ 954 pmap_bootstrap(first, 0); 955 956 /* 957 * Size up each available chunk of physical memory. 958 */ 959 physmap[0] = PAGE_SIZE; /* mask off page 0 */ 960 pa_indx = 0; 961 phys_avail[pa_indx++] = physmap[0]; 962 phys_avail[pa_indx] = physmap[0]; 963 pte = CMAP1; 964 965 /* 966 * physmap is in bytes, so when converting to page boundaries, 967 * round up the start address and round down the end address. 
968 */ 969 for (i = 0; i <= physmap_idx; i += 2) { 970 vm_paddr_t end; 971 972 end = ptoa((vm_paddr_t)Maxmem); 973 if (physmap[i + 1] < end) 974 end = trunc_page(physmap[i + 1]); 975 for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) { 976 int tmp, page_bad; 977 int *ptr = (int *)CADDR1; 978 979 /* 980 * block out kernel memory as not available. 981 */ 982 if (pa >= 0x100000 && pa < first) 983 continue; 984 985 page_bad = FALSE; 986 987 /* 988 * map page into kernel: valid, read/write,non-cacheable 989 */ 990 *pte = pa | PG_V | PG_RW | PG_N; 991 invltlb(); 992 993 tmp = *(int *)ptr; 994 /* 995 * Test for alternating 1's and 0's 996 */ 997 *(volatile int *)ptr = 0xaaaaaaaa; 998 if (*(volatile int *)ptr != 0xaaaaaaaa) { 999 page_bad = TRUE; 1000 } 1001 /* 1002 * Test for alternating 0's and 1's 1003 */ 1004 *(volatile int *)ptr = 0x55555555; 1005 if (*(volatile int *)ptr != 0x55555555) { 1006 page_bad = TRUE; 1007 } 1008 /* 1009 * Test for all 1's 1010 */ 1011 *(volatile int *)ptr = 0xffffffff; 1012 if (*(volatile int *)ptr != 0xffffffff) { 1013 page_bad = TRUE; 1014 } 1015 /* 1016 * Test for all 0's 1017 */ 1018 *(volatile int *)ptr = 0x0; 1019 if (*(volatile int *)ptr != 0x0) { 1020 page_bad = TRUE; 1021 } 1022 /* 1023 * Restore original value. 1024 */ 1025 *(int *)ptr = tmp; 1026 1027 /* 1028 * Adjust array of valid/good pages. 1029 */ 1030 if (page_bad == TRUE) { 1031 continue; 1032 } 1033 /* 1034 * If this good page is a continuation of the 1035 * previous set of good pages, then just increase 1036 * the end pointer. Otherwise start a new chunk. 1037 * Note that "end" points one higher than end, 1038 * making the range >= start and < end. 1039 * If we're also doing a speculative memory 1040 * test and we at or past the end, bump up Maxmem 1041 * so that we keep going. The first bad page 1042 * will terminate the loop. 
1043 */ 1044 if (phys_avail[pa_indx] == pa) { 1045 phys_avail[pa_indx] += PAGE_SIZE; 1046 } else { 1047 pa_indx++; 1048 if (pa_indx == PHYS_AVAIL_ARRAY_END) { 1049 printf( 1050 "Too many holes in the physical address space, giving up\n"); 1051 pa_indx--; 1052 break; 1053 } 1054 phys_avail[pa_indx++] = pa; /* start */ 1055 phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */ 1056 } 1057 physmem++; 1058 } 1059 } 1060 *pte = 0; 1061 invltlb(); 1062 1063 /* 1064 * XXX 1065 * The last chunk must contain at least one page plus the message 1066 * buffer to avoid complicating other code (message buffer address 1067 * calculation, etc.). 1068 */ 1069 while (phys_avail[pa_indx - 1] + PAGE_SIZE + 1070 round_page(MSGBUF_SIZE) >= phys_avail[pa_indx]) { 1071 physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]); 1072 phys_avail[pa_indx--] = 0; 1073 phys_avail[pa_indx--] = 0; 1074 } 1075 1076 Maxmem = atop(phys_avail[pa_indx]); 1077 1078 /* Trim off space for the message buffer. */ 1079 phys_avail[pa_indx] -= round_page(MSGBUF_SIZE); 1080 1081 avail_end = phys_avail[pa_indx]; 1082} 1083

/*
 * Carve n pages off the front of the boot-time free region.
 *
 * Zeroes n * PAGE_SIZE bytes starting at the current value of physfree,
 * advances physfree past them and returns the old value.  The address is
 * handed straight to bzero(), so the region has to be addressable as-is
 * when this runs.  There is no bounds check and no way to give pages back.
 */
static u_int64_t
allocpages(int n)
{
	u_int64_t ret;

	ret = physfree;
	bzero((void *)ret, n * PAGE_SIZE);
	physfree += n * PAGE_SIZE;
	return (ret);
}

/*
 * Build the initial page tables.
 *
 * Allocates the page table pages (KPTphys), the PML4, PDP and PTD pages
 * and the proc0 u-area and kernel stack with allocpages(), records the
 * KERNBASE-relative addresses of the latter two in proc0uarea and
 * proc0kstack, and then fills in the tables from the bottom level up.
 * The statement order matters: later loops deliberately overwrite
 * entries stored by earlier ones.
 */
static void
create_pagetables(void)
{
	u_int64_t p0kpa;
	u_int64_t p0upa;
	u_int64_t KPTphys;
	int i;

	/* Allocate pages */
	KPTphys = allocpages(NKPT);
	IdlePML4 = allocpages(NKPML4E);
	IdlePDP = allocpages(NKPDPE);
	IdlePTD = allocpages(NPGPTD);
	p0upa = allocpages(UAREA_PAGES);
	p0kpa = allocpages(KSTACK_PAGES);

	proc0uarea = (struct user *)(p0upa + KERNBASE);
	proc0kstack = p0kpa + KERNBASE;

	/* Fill in the underlying page table pages */
	/* Identity-mapped, valid and writable (PG_RW), from zero to physfree */
	/* XXX not fully used, underneath 2M pages */
	for (i = 0; (i << PAGE_SHIFT) < physfree; i++) {
		((pt_entry_t *)KPTphys)[i] = i << PAGE_SHIFT;
		((pt_entry_t *)KPTphys)[i] |= PG_RW | PG_V;
	}

	/* Now map the page tables at their location within PTmap */
	for (i = 0; i < NKPT; i++) {
		((pd_entry_t *)IdlePTD)[i + KPTDI] = KPTphys + (i << PAGE_SHIFT);
		((pd_entry_t *)IdlePTD)[i + KPTDI] |= PG_RW | PG_V;
	}

	/* Map from zero to end of allocations under 2M pages */
	/* This replaces some of the PTD entries above */
	for (i = 0; (i << PDRSHIFT) < physfree; i++) {
		((pd_entry_t *)IdlePTD)[i] = i << PDRSHIFT;
		((pd_entry_t *)IdlePTD)[i] |= PG_RW | PG_V | PG_PS;
	}

	/*
	 * Point the first NKPT PTD slots at the page table pages.  These
	 * are the same slots the 2M loop above started filling at index 0,
	 * so any PG_PS entries it stored in slots below NKPT are replaced
	 * here.
	 */
	for (i = 0; i < NKPT; i++) {
		((pd_entry_t *)IdlePTD)[i] = KPTphys + (i << PAGE_SHIFT);
		((pd_entry_t *)IdlePTD)[i] |= PG_RW | PG_V;
	}

	/* Now map the PTD at the top of the PTmap (ie: PTD[]) */
	for (i = 0; i < NPGPTD; i++) {
		((pd_entry_t *)IdlePTD)[i + PTDPTDI] = IdlePTD + (i << PAGE_SHIFT);
		((pd_entry_t *)IdlePTD)[i + PTDPTDI] |= PG_RW | PG_V;
	}

	/* And connect up the PTD to the PDP */
	for (i = 0; i < NPGPTD; i++) {
		((pdp_entry_t *)IdlePDP)[i] = IdlePTD + (i << PAGE_SHIFT);
		((pdp_entry_t *)IdlePDP)[i] |= PG_RW | PG_V | PG_U;
	}

	/* And connect up the PDP to the PML4 */
	((pdp_entry_t *)IdlePML4)[0] = IdlePDP;
	((pdp_entry_t *)IdlePML4)[0] |= PG_RW | PG_V | PG_U;
}

/*
 * Machine-dependent early initialization.
 *
 * In order: enables the NX bit in EFER, builds and loads the initial page
 * tables, wires up proc0/thread0, locates the preloaded kernel metadata
 * and pulls boothowto and the kernel environment out of it, builds and
 * loads the GDT, sets the FS/GS base MSRs and the per-CPU area,
 * initializes the early mutexes, builds and loads the IDT, brings up the
 * console (and DDB when configured), identifies the CPU, fills in the TSS
 * and loads the task register, programs the fast-syscall MSRs, sizes
 * memory, maps and initializes the message buffer and finally fills in
 * thread0's pcb.
 */
void
hammer_time(void)
{
	caddr_t kmdp;
	int gsel_tss, off, x;
	struct region_descriptor r_gdt, r_idt;
	struct pcpu *pc;
	u_int64_t msr;

	/* Turn on PTE NX (no execute) bit */
	msr = rdmsr(MSR_EFER) | EFER_NXE;
	wrmsr(MSR_EFER, msr);
	create_pagetables();

	/* XXX do %cr0 as well */
	load_cr4(rcr4() | CR4_PGE | CR4_PSE);
	load_cr3(IdlePML4);

	proc0.p_uarea = proc0uarea;
	thread0.td_kstack = proc0kstack;
	/* The pcb occupies the last sizeof(struct pcb) bytes of the kstack. */
	thread0.td_pcb = (struct pcb *)
	    (thread0.td_kstack + KSTACK_PAGES * PAGE_SIZE) - 1;
	atdevbase = ISA_HOLE_START + KERNBASE;

	/*
	 * This may be done better later if it gets more high level
	 * components in it. If so just link td->td_proc here.
	 */
	proc_linkup(&proc0, &ksegrp0, &kse0, &thread0);

	preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
	preload_bootstrap_relocate(KERNBASE);
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
	kern_envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *) + KERNBASE;

	/* Init basic tunables, hz etc */
	init_param1();

	/*
	 * make gdt memory segments
	 */
	gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&common_tss;

	/*
	 * GPROC0_SEL and the slot after it are skipped here and filled in
	 * by the single ssdtosyssd() call below.
	 */
	for (x = 0; x < NGDT; x++) {
		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1))
			ssdtosd(&gdt_segs[x], &gdt[x]);
	}
	ssdtosyssd(&gdt_segs[GPROC0_SEL],
	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);

	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
	r_gdt.rd_base = (long) gdt;
	lgdt(&r_gdt);
	pc = &__pcpu;

	wrmsr(MSR_FSBASE, 0);		/* User value */
	wrmsr(MSR_GSBASE, (u_int64_t)pc);
	wrmsr(MSR_KGSBASE, 0);		/* User value while we're in the kernel */

	pcpu_init(pc, 0, sizeof(struct pcpu));
	PCPU_SET(prvspace, pc);
	PCPU_SET(curthread, &thread0);

	/*
	 * Initialize mutexes.
	 *
	 * icu_lock: in order to allow an interrupt to occur in a critical
	 *	     section, to set pcpu->ipending (etc...) properly, we
	 *	     must be able to get the icu lock, so it can't be
	 *	     under witness.
	 */
	mutex_init();
	mtx_init(&clock_lock, "clk", NULL, MTX_SPIN | MTX_RECURSE);
	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);

	/* exceptions */
	/* Default every vector to the reserved handler, then override. */
	for (x = 0; x < NIDT; x++)
		setidt(x, &IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0);
	setidt(0, &IDTVEC(div), SDT_SYSIGT, SEL_KPL, 0);
	setidt(1, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 0);
	setidt(2, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 0);
	setidt(3, &IDTVEC(bpt), SDT_SYSIGT, SEL_KPL, 0);
	setidt(4, &IDTVEC(ofl), SDT_SYSIGT, SEL_KPL, 0);
	setidt(5, &IDTVEC(bnd), SDT_SYSIGT, SEL_KPL, 0);
	setidt(6, &IDTVEC(ill), SDT_SYSIGT, SEL_KPL, 0);
	setidt(7, &IDTVEC(dna), SDT_SYSIGT, SEL_KPL, 0);
	/* The double fault vector is the only one passing 1 (see ist1 below). */
	setidt(8, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
	setidt(9, &IDTVEC(fpusegm), SDT_SYSIGT, SEL_KPL, 0);
	setidt(10, &IDTVEC(tss), SDT_SYSIGT, SEL_KPL, 0);
	setidt(11, &IDTVEC(missing), SDT_SYSIGT, SEL_KPL, 0);
	setidt(12, &IDTVEC(stk), SDT_SYSIGT, SEL_KPL, 0);
	setidt(13, &IDTVEC(prot), SDT_SYSIGT, SEL_KPL, 0);
	setidt(14, &IDTVEC(page), SDT_SYSIGT, SEL_KPL, 0);
	setidt(15, &IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0);
	setidt(16, &IDTVEC(fpu), SDT_SYSIGT, SEL_KPL, 0);
	setidt(17, &IDTVEC(align), SDT_SYSIGT, SEL_KPL, 0);
	setidt(18, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 0);
	setidt(19, &IDTVEC(xmm), SDT_SYSIGT, SEL_KPL, 0);

	r_idt.rd_limit = sizeof(idt0) - 1;
	r_idt.rd_base = (long) idt;
	lidt(&r_idt);

	/*
	 * Initialize the console before we print anything out.
	 */
	cninit();

#ifdef DEV_ISA
	isa_defaultirq();
#endif

#ifdef DDB
	kdb_init();
	if (boothowto & RB_KDB)
		Debugger("Boot flags requested debugger");
#endif

	identify_cpu();		/* Final stage of CPU initialization */
	initializecpu();	/* Initialize CPU registers */

	/* make an initial tss so cpu can get interrupt stack on syscall! */
	common_tss.tss_rsp0 = thread0.td_kstack + KSTACK_PAGES * PAGE_SIZE -
	    sizeof(struct pcb);

	/* doublefault stack space, runs on ist1 */
	common_tss.tss_ist1 = (long)&dblfault_stack[sizeof(dblfault_stack)];

	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	ltr(gsel_tss);

	/* Set up the fast syscall stuff */
	msr = rdmsr(MSR_EFER) | EFER_SCE;
	wrmsr(MSR_EFER, msr);
	wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall));
	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
	    ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
	wrmsr(MSR_STAR, msr);
	wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D);

	getmemsize(kmdp, physfree);
	init_param2(physmem);

	/* now running on new page tables, configured,and u/iom is accessible */

	/* Map the message buffer. */
	for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE)
		pmap_kenter((vm_offset_t)msgbufp + off, avail_end + off);

	msgbufinit(msgbufp, MSGBUF_SIZE);

	/* transfer to user mode */

	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);

	/* setup proc 0's pcb */
	thread0.td_pcb->pcb_flags = 0; /* XXXKSE */
	thread0.td_pcb->pcb_cr3 = IdlePML4;
	thread0.td_pcb->pcb_kgsbase = (u_int64_t)pc;
	thread0.td_frame = &proc0_tf;
}

/*
 * Machine-dependent per-CPU data initialization hook.  Nothing to do here;
 * all three arguments are ignored.
 */
void
cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
{
}

/*
 * Set the saved instruction pointer in td's trap frame to addr.
 * Always returns 0; addr is not validated.
 */
int
ptrace_set_pc(struct thread *td, unsigned long addr)
{
	td->td_frame->tf_rip = addr;
	return (0);
}

/*
 * Arrange for td to single-step by setting the trace flag (PSL_T) in the
 * saved rflags of its trap frame.  Always returns 0.
 */
int
ptrace_single_step(struct thread *td)
{
	td->td_frame->tf_rflags |= PSL_T;
	return (0);
}

/*
 * Copy the general purpose registers, rip, cs, rflags, rsp and ss out of
 * td's trap frame into *regs.  Always returns 0.
 */
int
fill_regs(struct thread *td, struct reg *regs)
{
	struct trapframe *tp;

	tp = td->td_frame;
	regs->r_r15 = tp->tf_r15;
	regs->r_r14 = tp->tf_r14;
	regs->r_r13 = tp->tf_r13;
	regs->r_r12 = tp->tf_r12;
	regs->r_r11 = tp->tf_r11;
	regs->r_r10 = tp->tf_r10;
	regs->r_r9 = tp->tf_r9;
	regs->r_r8 = tp->tf_r8;
	regs->r_rdi = tp->tf_rdi;
	regs->r_rsi = tp->tf_rsi;
	regs->r_rbp = tp->tf_rbp;
	regs->r_rbx = tp->tf_rbx;
	regs->r_rdx = tp->tf_rdx;
	regs->r_rcx = tp->tf_rcx;
	regs->r_rax = tp->tf_rax;
	regs->r_rip = tp->tf_rip;
	regs->r_cs = tp->tf_cs;
	regs->r_rflags = tp->tf_rflags;
	regs->r_rsp = tp->tf_rsp;
	regs->r_ss = tp->tf_ss;
	return (0);
}

/*
 * Load td's trap frame from *regs.
 *
 * Returns EINVAL, leaving the frame untouched, when the new rflags fail
 * EFL_SECURE() against the current rflags or the new cs fails
 * CS_SECURE(); otherwise copies every register and returns 0.
 */
int
set_regs(struct thread *td, struct reg *regs)
{
	struct trapframe *tp;

	tp = td->td_frame;
	if (!EFL_SECURE(regs->r_rflags, tp->tf_rflags) ||
	    !CS_SECURE(regs->r_cs))
		return (EINVAL);
	tp->tf_r15 = regs->r_r15;
	tp->tf_r14 = regs->r_r14;
	tp->tf_r13 = regs->r_r13;
	tp->tf_r12 = regs->r_r12;
	tp->tf_r11 = regs->r_r11;
	tp->tf_r10 = regs->r_r10;
	tp->tf_r9 = regs->r_r9;
	tp->tf_r8 = regs->r_r8;
	tp->tf_rdi = regs->r_rdi;
	tp->tf_rsi = regs->r_rsi;
	tp->tf_rbp = regs->r_rbp;
	tp->tf_rbx = regs->r_rbx;
	tp->tf_rdx = regs->r_rdx;
	tp->tf_rcx = regs->r_rcx;
	tp->tf_rax = regs->r_rax;
	tp->tf_rip = regs->r_rip;
	tp->tf_cs = regs->r_cs;
	tp->tf_rflags = regs->r_rflags;
	tp->tf_rsp = regs->r_rsp;
	tp->tf_ss = regs->r_ss;
	return (0);
}

/* XXX check all this stuff! */
/*
 * externalize from sv_xmm
 *
 * Zeroes *fpregs, then copies the control/status environment field by
 * field, 10 bytes of each of the 8 FPU registers and 16 bytes of each of
 * the 16 SSE registers out of *sv_xmm.  fpregs->fpr_env is accessed
 * through a struct envxmm pointer.
 */
static void
fill_fpregs_xmm(struct savefpu *sv_xmm, struct fpreg *fpregs)
{
	struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
	struct envxmm *penv_xmm = &sv_xmm->sv_env;
	int i;

	/* pcb -> fpregs */
	bzero(fpregs, sizeof(*fpregs));

	/* FPU control/status */
	penv_fpreg->en_cw = penv_xmm->en_cw;
	penv_fpreg->en_sw = penv_xmm->en_sw;
	penv_fpreg->en_tw = penv_xmm->en_tw;
	penv_fpreg->en_opcode = penv_xmm->en_opcode;
	penv_fpreg->en_rip = penv_xmm->en_rip;
	penv_fpreg->en_rdp = penv_xmm->en_rdp;
	penv_fpreg->en_mxcsr = penv_xmm->en_mxcsr;
	penv_fpreg->en_mxcsr_mask = penv_xmm->en_mxcsr_mask;

	/* FPU registers */
	for (i = 0; i < 8; ++i)
		bcopy(sv_xmm->sv_fp[i].fp_acc.fp_bytes, fpregs->fpr_acc[i], 10);

	/* SSE registers */
	for (i = 0; i < 16; ++i)
		bcopy(sv_xmm->sv_xmm[i].xmm_bytes, fpregs->fpr_xacc[i], 16);
}

/*
 * internalize from fpregs into sv_xmm
 *
 * The inverse of fill_fpregs_xmm(): copies the same environment fields
 * and register bytes from *fpregs into *sv_xmm.  Nothing is validated and
 * the parts of *sv_xmm not listed here are left as they were.
 */
static void
set_fpregs_xmm(struct fpreg *fpregs, struct savefpu *sv_xmm)
{
	struct envxmm *penv_xmm = &sv_xmm->sv_env;
	struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
	int i;

	/* fpregs -> pcb */
	/* FPU control/status */
	penv_xmm->en_cw = penv_fpreg->en_cw;
	penv_xmm->en_sw = penv_fpreg->en_sw;
	penv_xmm->en_tw = penv_fpreg->en_tw;
	penv_xmm->en_opcode = penv_fpreg->en_opcode;
	penv_xmm->en_rip = penv_fpreg->en_rip;
	penv_xmm->en_rdp = penv_fpreg->en_rdp;
	penv_xmm->en_mxcsr = penv_fpreg->en_mxcsr;
	penv_xmm->en_mxcsr_mask = penv_fpreg->en_mxcsr_mask;

	/* FPU registers */
	for (i = 0; i < 8; ++i)
		bcopy(fpregs->fpr_acc[i], sv_xmm->sv_fp[i].fp_acc.fp_bytes, 10);

	/* SSE registers */
	for (i = 0; i < 16; ++i)
		bcopy(fpregs->fpr_xacc[i], sv_xmm->sv_xmm[i].xmm_bytes, 16);
}

/*
 * externalize from td->pcb
 *
 * Fills *fpregs from the save area in td's pcb.  Always returns 0.
 */
int
fill_fpregs(struct thread *td, struct fpreg *fpregs)
{

	fill_fpregs_xmm(&td->td_pcb->pcb_save, fpregs);
	return (0);
}

/*
 * internalize to td->pcb
 *
 * Loads the save area in td's pcb from *fpregs.  Always returns 0.
 */
int
set_fpregs(struct thread *td, struct fpreg *fpregs)
{

	set_fpregs_xmm(fpregs, &td->td_pcb->pcb_save);
	return (0);
}

/*
 * Get machine context.
 *
 * Copies td's trap frame into *mcp, records whether the saved stack
 * pointer is on the signal stack, stores sizeof(*mcp) in mc_len and
 * appends the FPU state via get_fpcontext().  When clear_ret is non-zero,
 * rax and rdx are reported as 0 instead of their saved values.
 * Always returns 0.
 *
 * NOTE(review): the proc lock is taken on curthread->td_proc while the
 * frame comes from td; confirm callers only pass td == curthread.
 */
int
get_mcontext(struct thread *td, mcontext_t *mcp, int clear_ret)
{
	struct trapframe *tp;

	tp = td->td_frame;

	PROC_LOCK(curthread->td_proc);
	mcp->mc_onstack = sigonstack(tp->tf_rsp);
	PROC_UNLOCK(curthread->td_proc);
	mcp->mc_r15 = tp->tf_r15;
	mcp->mc_r14 = tp->tf_r14;
	mcp->mc_r13 = tp->tf_r13;
	mcp->mc_r12 = tp->tf_r12;
	mcp->mc_r11 = tp->tf_r11;
	mcp->mc_r10 = tp->tf_r10;
	mcp->mc_r9 = tp->tf_r9;
	mcp->mc_r8 = tp->tf_r8;
	mcp->mc_rdi = tp->tf_rdi;
	mcp->mc_rsi = tp->tf_rsi;
	mcp->mc_rbp = tp->tf_rbp;
	mcp->mc_rbx = tp->tf_rbx;
	mcp->mc_rcx = tp->tf_rcx;
	if (clear_ret != 0) {
		mcp->mc_rax = 0;
		mcp->mc_rdx = 0;
	} else {
		mcp->mc_rax = tp->tf_rax;
		mcp->mc_rdx = tp->tf_rdx;
	}
	mcp->mc_rip = tp->tf_rip;
	mcp->mc_cs = tp->tf_cs;
	mcp->mc_rflags = tp->tf_rflags;
	mcp->mc_rsp = tp->tf_rsp;
	mcp->mc_ss = tp->tf_ss;
	mcp->mc_len = sizeof(*mcp);
	get_fpcontext(td, mcp);
	return (0);
}

/*
 * Set machine context.
 *
 * However, we don't set any but the user modifiable flags, and we won't
 * touch the cs selector.
 *
 * Returns EINVAL when mc_len is not sizeof(*mcp), or whatever error
 * set_fpcontext() reports; in both cases the trap frame is left
 * unmodified.  Returns 0 once the frame has been loaded.
 */
int
set_mcontext(struct thread *td, const mcontext_t *mcp)
{
	struct trapframe *tp;
	long rflags;
	int ret;

	tp = td->td_frame;
	if (mcp->mc_len != sizeof(*mcp))
		return (EINVAL);
	/* Take only the PSL_USERCHANGE bits from the caller. */
	rflags = (mcp->mc_rflags & PSL_USERCHANGE) |
	    (tp->tf_rflags & ~PSL_USERCHANGE);
	ret = set_fpcontext(td, mcp);
	if (ret != 0)
		return (ret);
	tp->tf_r15 = mcp->mc_r15;
	tp->tf_r14 = mcp->mc_r14;
	tp->tf_r13 = mcp->mc_r13;
	tp->tf_r12 = mcp->mc_r12;
	tp->tf_r11 = mcp->mc_r11;
	tp->tf_r10 = mcp->mc_r10;
	tp->tf_r9 = mcp->mc_r9;
	tp->tf_r8 = mcp->mc_r8;
	tp->tf_rdi = mcp->mc_rdi;
	tp->tf_rsi = mcp->mc_rsi;
	tp->tf_rbp = mcp->mc_rbp;
	tp->tf_rbx = mcp->mc_rbx;
	tp->tf_rdx = mcp->mc_rdx;
	tp->tf_rcx = mcp->mc_rcx;
	tp->tf_rax = mcp->mc_rax;
	tp->tf_rip = mcp->mc_rip;
	tp->tf_rflags = rflags;
	tp->tf_rsp = mcp->mc_rsp;
	tp->tf_ss = mcp->mc_ss;
	return (0);
}

/*
 * Store td's FPU state in mcp->mc_fpstate, record in mc_ownedfp what
 * npxgetregs() returned and in mc_fpformat what npxformat() returned.
 */
static void
get_fpcontext(struct thread *td, mcontext_t *mcp)
{
	struct savefpu *addr;

	/*
	 * XXX mc_fpstate might be misaligned, since its declaration is not
	 * unportabilized using __attribute__((aligned(16))) like the
	 * declaration of struct savemm, and anyway, alignment doesn't work
	 * for auto variables since we don't use gcc's pessimal stack
	 * alignment.  Work around this by abusing the spare fields after
	 * mcp->mc_fpstate.
	 *
	 * XXX unpessimize most cases by only aligning when fxsave might be
	 * called, although this requires knowing too much about
	 * npxgetregs()'s internals.
	 */
	addr = (struct savefpu *)&mcp->mc_fpstate;
	if (td == PCPU_GET(fpcurthread) && ((uintptr_t)(void *)addr & 0xF)) {
		/* Step forward 4 bytes at a time to a 16-byte boundary. */
		do
			addr = (void *)((char *)addr + 4);
		while ((uintptr_t)(void *)addr & 0xF);
	}
	mcp->mc_ownedfp = npxgetregs(td, addr);
	if (addr != (struct savefpu *)&mcp->mc_fpstate) {
		/* Slide the state back into place and clear the scratch. */
		bcopy(addr, &mcp->mc_fpstate, sizeof(mcp->mc_fpstate));
		bzero(&mcp->mc_spare2, sizeof(mcp->mc_spare2));
	}
	mcp->mc_fpformat = npxformat();
}

/*
 * Install the FPU state described by *mcp into td.
 *
 * Returns 0 without touching anything when mc_fpformat is
 * _MC_FPFMT_NODEV, EINVAL for any format other than _MC_FPFMT_XMM and
 * EINVAL for an unrecognized mc_ownedfp.  _MC_FPOWNED_NONE drops the
 * current state via fpstate_drop(); _MC_FPOWNED_FPU and _MC_FPOWNED_PCB
 * load mc_fpstate via npxsetregs().
 *
 * NOTE(review): mcp is const, but the cast below discards that and the
 * misaligned path bcopy()s into the caller's mcontext past mc_fpstate.
 */
static int
set_fpcontext(struct thread *td, const mcontext_t *mcp)
{
	struct savefpu *addr;

	if (mcp->mc_fpformat == _MC_FPFMT_NODEV)
		return (0);
	else if (mcp->mc_fpformat != _MC_FPFMT_XMM)
		return (EINVAL);
	else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE)
		/* We don't care what state is left in the FPU or PCB. */
		fpstate_drop(td);
	else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU ||
	    mcp->mc_ownedfp == _MC_FPOWNED_PCB) {
		/* XXX align as above. */
		addr = (struct savefpu *)&mcp->mc_fpstate;
		if (td == PCPU_GET(fpcurthread) &&
		    ((uintptr_t)(void *)addr & 0xF)) {
			do
				addr = (void *)((char *)addr + 4);
			while ((uintptr_t)(void *)addr & 0xF);
			bcopy(&mcp->mc_fpstate, addr, sizeof(mcp->mc_fpstate));
		}
		/*
		 * XXX we violate the dubious requirement that npxsetregs()
		 * be called with interrupts disabled.
		 */
		npxsetregs(td, addr);
		/*
		 * Don't bother putting things back where they were in the
		 * misaligned case, since we know that the caller won't use
		 * them again.
		 */
	} else
		return (EINVAL);
	return (0);
}

/*
 * Discard FPU state.  With interrupts disabled, calls npxdrop() if td is
 * the per-CPU fpcurthread and then clears PCB_NPXINITDONE.
 *
 * NOTE(review): the flag is cleared in curthread's pcb, not td's; the two
 * only coincide when td == curthread.
 */
void
fpstate_drop(struct thread *td)
{
	register_t s;

	s = intr_disable();
	if (PCPU_GET(fpcurthread) == td)
		npxdrop();
	/*
	 * XXX force a full drop of the npx.  The above only drops it if we
	 * owned it.
	 *
	 * XXX I don't much like npxgetregs()'s semantics of doing a full
	 * drop.  Dropping only to the pcb matches fnsave's behaviour.
	 * We only need to drop to !PCB_INITDONE in sendsig().  But
	 * sendsig() is the only caller of npxgetregs()... perhaps we just
	 * have too many layers.
	 */
	curthread->td_pcb->pcb_flags &= ~PCB_NPXINITDONE;
	intr_restore(s);
}

/*
 * Debug register read stub: leaves *dbregs untouched and returns 0.
 */
int
fill_dbregs(struct thread *td, struct dbreg *dbregs)
{

	return (0);
}

/*
 * Debug register write stub: ignores *dbregs and returns 0.
 */
int
set_dbregs(struct thread *td, struct dbreg *dbregs)
{

	return (0);
}

#ifndef DDB
/*
 * Stand-in used when the kernel is built without DDB: just reports that
 * the debugger was requested, together with msg.
 */
void
Debugger(const char *msg)
{
	printf("Debugger(\"%s\") called.\n", msg);
}
#endif /* no DDB */

#ifdef DDB

/*
 * Provide inb() and outb() as functions.  They are normally only
 * available as macros calling inlined functions, thus cannot be
 * called inside DDB.
 *
 * The actual code is stolen from <machine/cpufunc.h>, and de-inlined.
 */

#undef inb
#undef outb

/* silence compiler warnings */
u_char inb(u_int);
void outb(u_int, u_char);

/* Read one byte from I/O port `port'. */
u_char
inb(u_int port)
{
	u_char	data;
	/*
	 * We use %%dx and not %1 here because i/o is done at %dx and not at
	 * %edx, while gcc generates inferior code (movw instead of movl)
	 * if we tell it to load (u_short) port.
	 */
	__asm __volatile("inb %%dx,%0" : "=a" (data) : "d" (port));
	return (data);
}

/* Write the byte `data' to I/O port `port'. */
void
outb(u_int port, u_char data)
{
	u_char	al;
	/*
	 * Use an unnecessary assignment to help gcc's register allocator.
	 * This make a large difference for gcc-1.40 and a tiny difference
	 * for gcc-2.6.0.  For gcc-1.40, al had to be ``asm("ax")'' for
	 * best results.  gcc-2.6.0 can't handle this.
	 */
	al = data;
	__asm __volatile("outb %0,%%dx" : : "a" (al), "d" (port));
}

#endif /* DDB */

MODULE_VERSION(acpi, 100);