/*-
 * Copyright (c) 2003 Peter Wemm.
 * Copyright (c) 1992 Terrence R. Lambert.
 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/11/sys/amd64/amd64/machdep.c 347700 2019-05-16 14:42:16Z markj $");

#include "opt_atpic.h"
#include "opt_compat.h"
#include "opt_cpu.h"
#include "opt_ddb.h"
#include "opt_inet.h"
#include "opt_isa.h"
#include "opt_kstack_pages.h"
#include "opt_maxmem.h"
#include "opt_mp_watchdog.h"
#include "opt_perfmon.h"
#include "opt_platform.h"
#include "opt_sched.h"

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/bus.h>
#include <sys/callout.h>
#include <sys/cons.h>
#include <sys/cpu.h>
#include <sys/efi.h>
#include <sys/eventhandler.h>
#include <sys/exec.h>
#include <sys/imgact.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/linker.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/memrange.h>
#include <sys/msgbuf.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/ptrace.h>
#include <sys/reboot.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#ifdef SMP
#include <sys/smp.h>
#endif
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/ucontext.h>
#include <sys/vmmeter.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#include <vm/vm_param.h>
#include <vm/vm_phys.h>

#ifdef DDB
#ifndef KDB
#error KDB must be enabled in order for DDB to work!
#endif
#include <ddb/ddb.h>
#include <ddb/db_sym.h>
#endif

#include <net/netisr.h>

#include <machine/clock.h>
#include <machine/cpu.h>
#include <machine/cputypes.h>
#include <machine/frame.h>
#include <machine/intr_machdep.h>
#include <x86/mca.h>
#include <machine/md_var.h>
#include <machine/metadata.h>
#include <machine/mp_watchdog.h>
#include <machine/pc/bios.h>
#include <machine/pcb.h>
#include <machine/proc.h>
#include <machine/reg.h>
#include <machine/sigframe.h>
#include <machine/specialreg.h>
#ifdef PERFMON
#include <machine/perfmon.h>
#endif
#include <machine/tss.h>
#include <x86/ucode.h>
#ifdef SMP
#include <machine/smp.h>
#endif
#ifdef FDT
#include <x86/fdt.h>
#endif

#ifdef DEV_ATPIC
#include <x86/isa/icu.h>
#else
#include <x86/apicvar.h>
#endif

#include <isa/isareg.h>
#include <isa/rtc.h>
#include <x86/init.h>

/* Sanity check for __curthread() */
CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
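
/*
 * Keeping pc_curthread at offset zero lets the __curthread() inline in
 * <machine/pcpu.h> fetch the current thread with a single %gs-relative
 * load, conceptually (a paraphrase for orientation, not the literal
 * inline):
 *
 *	static __inline struct thread *
 *	__curthread(void)
 *	{
 *		struct thread *td;
 *
 *		__asm("movq %%gs:0, %0" : "=r" (td));
 *		return (td);
 *	}
 */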

/*
 * The PTI trampoline stack needs enough space for a hardware trapframe and a
 * couple of scratch registers, as well as the trapframe left behind after an
 * iret fault.
 */
CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) -
    offsetof(struct pti_frame, pti_rip));

extern u_int64_t hammer_time(u_int64_t, u_int64_t);

#define	CS_SECURE(cs)		(ISPL(cs) == SEL_UPL)
#define	EFL_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)

static void cpu_startup(void *);
static void get_fpcontext(struct thread *td, mcontext_t *mcp,
    char *xfpusave, size_t xfpusave_len);
static int  set_fpcontext(struct thread *td, mcontext_t *mcp,
    char *xfpustate, size_t xfpustate_len);
SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);

/* Preload data parse function */
static caddr_t native_parse_preload_data(u_int64_t);

/* Native function to fetch and parse the e820 map */
static void native_parse_memmap(caddr_t, vm_paddr_t *, int *);

/* Default init_ops implementation. */
struct init_ops init_ops = {
	.parse_preload_data =		native_parse_preload_data,
	.early_clock_source_init =	i8254_init,
	.early_delay =			i8254_delay,
	.parse_memmap =			native_parse_memmap,
#ifdef SMP
	.mp_bootaddress =		mp_bootaddress,
	.start_all_aps =		native_start_all_aps,
#endif
	.msi_init =			msi_init,
};
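
/*
 * Platforms with their own early bootstrap path can substitute these hooks
 * before hammer_time() consumes them; the Xen PVH entry point in
 * sys/x86/xen/pv.c is the in-tree example.  A minimal sketch (the myplat_*
 * names are hypothetical placeholders, not symbols defined anywhere):
 *
 *	static struct init_ops myplat_init_ops = {
 *		.parse_preload_data =		myplat_parse_preload_data,
 *		.early_clock_source_init =	i8254_init,
 *		.early_delay =			i8254_delay,
 *		.parse_memmap =			myplat_parse_memmap,
 *	};
 *
 * with "init_ops = myplat_init_ops;" executed before hammer_time() runs.
 */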

struct msgbuf *msgbufp;

/*
 * Physical address of the EFI System Table. Stashed from the metadata hints
 * passed into the kernel and used by the EFI code to call runtime services.
 */
vm_paddr_t efi_systbl_phys;

/* Intel ICH registers */
#define ICH_PMBASE	0x400
#define ICH_SMI_EN	ICH_PMBASE + 0x30

int	_udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel;

int cold = 1;

long Maxmem = 0;
long realmem = 0;

/*
 * The number of PHYSMAP entries must be one less than the number of
 * PHYSSEG entries because the PHYSMAP entry that spans the largest
 * physical address that is accessible by ISA DMA is split into two
 * PHYSSEG entries.
 */
#define	PHYSMAP_SIZE	(2 * (VM_PHYSSEG_MAX - 1))

vm_paddr_t phys_avail[PHYSMAP_SIZE + 2];
vm_paddr_t dump_avail[PHYSMAP_SIZE + 2];

/* must be 2 less so 0 0 can signal end of chunks */
#define	PHYS_AVAIL_ARRAY_END	(nitems(phys_avail) - 2)
#define	DUMP_AVAIL_ARRAY_END	(nitems(dump_avail) - 2)
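
/*
 * Layout reminder (values below are illustrative): phys_avail[] holds
 * ascending [start, end) pairs terminated by a zeroed pair, e.g.
 *
 *	phys_avail[0] = 0x0000000000001000	first chunk start
 *	phys_avail[1] = 0x000000000009f000	first chunk end
 *	phys_avail[2] = 0x0000000000100000	second chunk start
 *	phys_avail[3] = 0x00000000bfff0000	second chunk end
 *	phys_avail[4] = 0			terminator
 *	phys_avail[5] = 0
 *
 * which is why the *_ARRAY_END markers above leave two slots free.
 */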

struct kva_md_info kmi;

static struct trapframe proc0_tf;
struct region_descriptor r_gdt, r_idt;

struct pcpu __pcpu[MAXCPU];

struct mtx icu_lock;

struct mem_range_softc mem_range_softc;

struct mtx dt_lock;	/* lock for GDT and LDT */

void (*vmm_resume_p)(void);

static void
cpu_startup(dummy)
	void *dummy;
{
	uintmax_t memsize;
	char *sysenv;

	/*
	 * On MacBooks, we need to disallow the legacy USB circuit to
	 * generate an SMI# because this can cause several problems,
	 * namely: incorrect CPU frequency detection and failure to
	 * start the APs.
	 * We do this by disabling a bit in the SMI_EN (SMI Control and
	 * Enable register) of the Intel ICH LPC Interface Bridge.
	 */
	sysenv = kern_getenv("smbios.system.product");
	if (sysenv != NULL) {
		if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
		    strncmp(sysenv, "MacBook3,1", 10) == 0 ||
		    strncmp(sysenv, "MacBook4,1", 10) == 0 ||
		    strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
		    strncmp(sysenv, "Macmini1,1", 10) == 0) {
			if (bootverbose)
				printf("Disabling LEGACY_USB_EN bit on "
				    "Intel ICH.\n");
			outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
		}
		freeenv(sysenv);
	}

	/*
	 * Good {morning,afternoon,evening,night}.
	 */
	startrtclock();
	printcpuinfo();
#ifdef PERFMON
	perfmon_init();
#endif

	/*
	 * Display physical memory if SMBIOS reports reasonable amount.
	 */
	memsize = 0;
	sysenv = kern_getenv("smbios.memory.enabled");
	if (sysenv != NULL) {
		memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
		freeenv(sysenv);
	}
	if (memsize < ptoa((uintmax_t)vm_cnt.v_free_count))
		memsize = ptoa((uintmax_t)Maxmem);
	printf("real memory  = %ju (%ju MB)\n", memsize, memsize >> 20);
	realmem = atop(memsize);

	/*
	 * Display any holes after the first chunk of extended memory.
	 */
	if (bootverbose) {
		int indx;

		printf("Physical memory chunk(s):\n");
		for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
			vm_paddr_t size;

			size = phys_avail[indx + 1] - phys_avail[indx];
			printf(
			    "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
			    (uintmax_t)phys_avail[indx],
			    (uintmax_t)phys_avail[indx + 1] - 1,
			    (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
		}
	}

	vm_ksubmap_init(&kmi);

	printf("avail memory = %ju (%ju MB)\n",
	    ptoa((uintmax_t)vm_cnt.v_free_count),
	    ptoa((uintmax_t)vm_cnt.v_free_count) / 1048576);

	/*
	 * Set up buffers, so they can be used to read disk labels.
	 */
	bufinit();
	vm_pager_bufferinit();

	cpu_setregs();
}

/*
 * Send an interrupt to process.
 *
 * Stack is set up to allow sigcode stored
 * at top to call routine, followed by call
 * to sigreturn routine below.  After sigreturn
 * resets the signal mask, the stack, and the
 * frame pointer, it returns to the user
 * specified pc, psl.
 */
void
sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
	struct sigframe sf, *sfp;
	struct pcb *pcb;
	struct proc *p;
	struct thread *td;
	struct sigacts *psp;
	char *sp;
	struct trapframe *regs;
	char *xfpusave;
	size_t xfpusave_len;
	int sig;
	int oonstack;

	td = curthread;
	pcb = td->td_pcb;
	p = td->td_proc;
	PROC_LOCK_ASSERT(p, MA_OWNED);
	sig = ksi->ksi_signo;
	psp = p->p_sigacts;
	mtx_assert(&psp->ps_mtx, MA_OWNED);
	regs = td->td_frame;
	oonstack = sigonstack(regs->tf_rsp);

	if (cpu_max_ext_state_size > sizeof(struct savefpu) && use_xsave) {
		xfpusave_len = cpu_max_ext_state_size - sizeof(struct savefpu);
		xfpusave = __builtin_alloca(xfpusave_len);
	} else {
		xfpusave_len = 0;
		xfpusave = NULL;
	}

	/* Save user context. */
	bzero(&sf, sizeof(sf));
	sf.sf_uc.uc_sigmask = *mask;
	sf.sf_uc.uc_stack = td->td_sigstk;
	sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
	    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
	sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
	bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(*regs));
	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
	get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len);
	fpstate_drop(td);
	update_pcb_bases(pcb);
	sf.sf_uc.uc_mcontext.mc_fsbase = pcb->pcb_fsbase;
	sf.sf_uc.uc_mcontext.mc_gsbase = pcb->pcb_gsbase;
	bzero(sf.sf_uc.uc_mcontext.mc_spare,
	    sizeof(sf.sf_uc.uc_mcontext.mc_spare));
	bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__));

	/* Allocate space for the signal handler context. */
	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		sp = (char *)td->td_sigstk.ss_sp + td->td_sigstk.ss_size;
#if defined(COMPAT_43)
		td->td_sigstk.ss_flags |= SS_ONSTACK;
#endif
	} else
		sp = (char *)regs->tf_rsp - 128;
	if (xfpusave != NULL) {
		sp -= xfpusave_len;
		sp = (char *)((unsigned long)sp & ~0x3Ful);
		sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp;
	}
	sp -= sizeof(struct sigframe);
	/* Align to 16 bytes. */
	sfp = (struct sigframe *)((unsigned long)sp & ~0xFul);

	/* Build the argument list for the signal handler. */
	regs->tf_rdi = sig;			/* arg 1 in %rdi */
	regs->tf_rdx = (register_t)&sfp->sf_uc;	/* arg 3 in %rdx */
	bzero(&sf.sf_si, sizeof(sf.sf_si));
	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
		/* Signal handler installed with SA_SIGINFO. */
		regs->tf_rsi = (register_t)&sfp->sf_si;	/* arg 2 in %rsi */
		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;

		/* Fill in POSIX parts */
		sf.sf_si = ksi->ksi_info;
		sf.sf_si.si_signo = sig; /* maybe a translated signal */
		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
	} else {
		/* Old FreeBSD-style arguments. */
		regs->tf_rsi = ksi->ksi_code;	/* arg 2 in %rsi */
		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
		sf.sf_ahu.sf_handler = catcher;
	}
	mtx_unlock(&psp->ps_mtx);
	PROC_UNLOCK(p);

	/*
	 * Copy the sigframe out to the user's stack.
	 */
	if (copyout(&sf, sfp, sizeof(*sfp)) != 0 ||
	    (xfpusave != NULL && copyout(xfpusave,
	    (void *)sf.sf_uc.uc_mcontext.mc_xfpustate, xfpusave_len)
	    != 0)) {
#ifdef DEBUG
		printf("process %ld has trashed its stack\n", (long)p->p_pid);
#endif
		PROC_LOCK(p);
		sigexit(td, SIGILL);
	}

	regs->tf_rsp = (long)sfp;
	regs->tf_rip = p->p_sysent->sv_sigcode_base;
	regs->tf_rflags &= ~(PSL_T | PSL_D);
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_ss = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _ufssel;
	regs->tf_gs = _ugssel;
	regs->tf_flags = TF_HASSEGS;
	PROC_LOCK(p);
	mtx_lock(&psp->ps_mtx);
}
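
/*
 * Resulting user stack layout (a recap of the code above, from higher to
 * lower addresses):
 *
 *	old %rsp
 *	 -128			the amd64 ABI red zone, which leaf code may
 *				use below %rsp and the kernel must not touch
 *	 xfpusave_len bytes	extended FPU state, aligned down to 64 bytes
 *				as XSAVE requires
 *	 struct sigframe	aligned down to 16 bytes; the new %rsp points
 *				here and the signal trampoline calls the
 *				handler
 */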

/*
 * System call to cleanup state after a signal
 * has been taken.  Reset signal mask and
 * stack state from context left by sendsig (above).
 * Return to previous pc and psl as specified by
 * context left by sendsig.  Check carefully to
 * make sure that the user has not modified the
 * state to gain improper privileges.
 *
 * MPSAFE
 */
int
sys_sigreturn(td, uap)
	struct thread *td;
	struct sigreturn_args /* {
		const struct __ucontext *sigcntxp;
	} */ *uap;
{
	ucontext_t uc;
	struct pcb *pcb;
	struct proc *p;
	struct trapframe *regs;
	ucontext_t *ucp;
	char *xfpustate;
	size_t xfpustate_len;
	long rflags;
	int cs, error, ret;
	ksiginfo_t ksi;

	pcb = td->td_pcb;
	p = td->td_proc;

	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
	if (error != 0) {
		uprintf("pid %d (%s): sigreturn copyin failed\n",
		    p->p_pid, td->td_name);
		return (error);
	}
	ucp = &uc;
	if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) {
		uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid,
		    td->td_name, ucp->uc_mcontext.mc_flags);
		return (EINVAL);
	}
	regs = td->td_frame;
	rflags = ucp->uc_mcontext.mc_rflags;
	/*
	 * Don't allow users to change privileged or reserved flags.
	 */
	if (!EFL_SECURE(rflags, regs->tf_rflags)) {
		uprintf("pid %d (%s): sigreturn rflags = 0x%lx\n", p->p_pid,
		    td->td_name, rflags);
		return (EINVAL);
	}

	/*
	 * Don't allow users to load a valid privileged %cs.  Let the
	 * hardware check for invalid selectors, excess privilege in
	 * other selectors, invalid %eip's and invalid %esp's.
	 */
	cs = ucp->uc_mcontext.mc_cs;
	if (!CS_SECURE(cs)) {
		uprintf("pid %d (%s): sigreturn cs = 0x%x\n", p->p_pid,
		    td->td_name, cs);
		ksiginfo_init_trap(&ksi);
		ksi.ksi_signo = SIGBUS;
		ksi.ksi_code = BUS_OBJERR;
		ksi.ksi_trapno = T_PROTFLT;
		ksi.ksi_addr = (void *)regs->tf_rip;
		trapsignal(td, &ksi);
		return (EINVAL);
	}

	if ((uc.uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) {
		xfpustate_len = uc.uc_mcontext.mc_xfpustate_len;
		if (xfpustate_len > cpu_max_ext_state_size -
		    sizeof(struct savefpu)) {
			uprintf("pid %d (%s): sigreturn xfpusave_len = 0x%zx\n",
			    p->p_pid, td->td_name, xfpustate_len);
			return (EINVAL);
		}
		xfpustate = __builtin_alloca(xfpustate_len);
		error = copyin((const void *)uc.uc_mcontext.mc_xfpustate,
		    xfpustate, xfpustate_len);
		if (error != 0) {
			uprintf(
		    "pid %d (%s): sigreturn copying xfpustate failed\n",
			    p->p_pid, td->td_name);
			return (error);
		}
	} else {
		xfpustate = NULL;
		xfpustate_len = 0;
	}
	ret = set_fpcontext(td, &ucp->uc_mcontext, xfpustate, xfpustate_len);
	if (ret != 0) {
		uprintf("pid %d (%s): sigreturn set_fpcontext err %d\n",
		    p->p_pid, td->td_name, ret);
		return (ret);
	}
	bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(*regs));
	update_pcb_bases(pcb);
	pcb->pcb_fsbase = ucp->uc_mcontext.mc_fsbase;
	pcb->pcb_gsbase = ucp->uc_mcontext.mc_gsbase;

#if defined(COMPAT_43)
	if (ucp->uc_mcontext.mc_onstack & 1)
		td->td_sigstk.ss_flags |= SS_ONSTACK;
	else
		td->td_sigstk.ss_flags &= ~SS_ONSTACK;
#endif

	kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
	return (EJUSTRETURN);
}

#ifdef COMPAT_FREEBSD4
int
freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap)
{

	return sys_sigreturn(td, (struct sigreturn_args *)uap);
}
#endif
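
/*
 * Note on the checks above: EFL_SECURE() accepts a new %rflags only if
 * every bit outside PSL_USERCHANGE matches the current frame, so a process
 * may toggle the user-modifiable arithmetic and control flags but not,
 * e.g., IOPL.  CS_SECURE() insists on a ring-3 RPL so sigreturn cannot be
 * abused to iret into a privileged code segment.
 */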

/*
 * Reset registers to default values on exec.
 */
void
exec_setregs(struct thread *td, struct image_params *imgp, u_long stack)
{
	struct trapframe *regs;
	struct pcb *pcb;
	register_t saved_rflags;

	regs = td->td_frame;
	pcb = td->td_pcb;

	mtx_lock(&dt_lock);
	if (td->td_proc->p_md.md_ldt != NULL)
		user_ldt_free(td);
	else
		mtx_unlock(&dt_lock);

	update_pcb_bases(pcb);
	pcb->pcb_fsbase = 0;
	pcb->pcb_gsbase = 0;
	clear_pcb_flags(pcb, PCB_32BIT);
	pcb->pcb_initial_fpucw = __INITIAL_FPUCW__;

	saved_rflags = regs->tf_rflags & PSL_T;
	bzero((char *)regs, sizeof(struct trapframe));
	regs->tf_rip = imgp->entry_addr;
	regs->tf_rsp = ((stack - 8) & ~0xFul) + 8;
	regs->tf_rdi = stack;		/* argv */
	regs->tf_rflags = PSL_USER | saved_rflags;
	regs->tf_ss = _udatasel;
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _ufssel;
	regs->tf_gs = _ugssel;
	regs->tf_flags = TF_HASSEGS;
	td->td_retval[1] = 0;

	/*
	 * Reset the hardware debug registers if they were in use.
	 * They won't have any meaning for the newly exec'd process.
	 */
	if (pcb->pcb_flags & PCB_DBREGS) {
		pcb->pcb_dr0 = 0;
		pcb->pcb_dr1 = 0;
		pcb->pcb_dr2 = 0;
		pcb->pcb_dr3 = 0;
		pcb->pcb_dr6 = 0;
		pcb->pcb_dr7 = 0;
		if (pcb == curpcb) {
			/*
			 * Clear the debug registers on the running
			 * CPU, otherwise they will end up affecting
			 * the next process we switch to.
			 */
			reset_dbregs();
		}
		clear_pcb_flags(pcb, PCB_DBREGS);
	}

	/*
	 * Drop the FP state if we hold it, so that the process gets a
	 * clean FP state if it uses the FPU again.
	 */
	fpstate_drop(td);
}

void
cpu_setregs(void)
{
	register_t cr0;

	cr0 = rcr0();
	/*
	 * CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the
	 * BSP.  See the comments there about why we set them.
	 */
	cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
	load_cr0(cr0);
}

/*
 * Initialize amd64 and configure to run kernel
 */

/*
 * Initialize segments & interrupt table
 */

struct user_segment_descriptor gdt[NGDT * MAXCPU];/* global descriptor tables */
static struct gate_descriptor idt0[NIDT];
struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */

static char dblfault_stack[PAGE_SIZE] __aligned(16);
static char mce0_stack[PAGE_SIZE] __aligned(16);
static char nmi0_stack[PAGE_SIZE] __aligned(16);
static char dbg0_stack[PAGE_SIZE] __aligned(16);
CTASSERT(sizeof(struct nmi_pcpu) == 16);

struct amd64tss common_tss[MAXCPU];

/*
 * Software prototypes -- in more palatable form.
 *
 * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
 * slots as corresponding segments for i386 kernel.
 */
struct soft_segment_descriptor gdt_segs[] = {
/* GNULL_SEL	0 Null Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GNULL2_SEL	1 Null Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUFS32_SEL	2 32 bit %gs Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUGS32_SEL	3 32 bit %fs Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GCODE_SEL	4 Code Descriptor for kernel */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GDATA_SEL	5 Data Descriptor for kernel */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GUCODE32_SEL	6 32 bit Code Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUDATA_SEL	7 32/64 bit Data Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUCODE_SEL	8 64 bit Code Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GPROC0_SEL	9 Proc 0 Tss Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
	.ssd_type = SDT_SYSTSS,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* Actually, the TSS is a system descriptor which is double size */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUSERLDT_SEL	11 LDT Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUSERLDT_SEL	12 LDT Descriptor, double size */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
};
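
/*
 * In long mode the CPU ignores the base and limit of code and data
 * descriptors (%fs/%gs bases come from MSRs instead), so only the L, DPL
 * and P bits of the 64-bit entries above actually matter; the 32-bit user
 * descriptors keep real bases and limits for compat-mode processes.
 * System descriptors (TSS, LDT) are the exception: in long mode they grow
 * to 16 bytes, hence the zeroed double-size slots that follow GPROC0_SEL
 * and GUSERLDT_SEL.
 */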

void
setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
{
	struct gate_descriptor *ip;

	ip = idt + idx;
	ip->gd_looffset = (uintptr_t)func;
	ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
	ip->gd_ist = ist;
	ip->gd_xx = 0;
	ip->gd_type = typ;
	ip->gd_dpl = dpl;
	ip->gd_p = 1;
	ip->gd_hioffset = ((uintptr_t)func) >> 16;
}
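
/*
 * The ist argument selects an Interrupt Stack Table slot in the TSS: a
 * nonzero value makes the CPU unconditionally switch to the corresponding
 * known-good stack when the vector is delivered.  This file wires ist1 to
 * the double fault stack, ist2 to the NMI stack, ist3 to the MC# stack and
 * ist4 to the DB# stack; see hammer_time() below.
 */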

extern inthand_t
	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
	IDTVEC(xmm), IDTVEC(dblfault),
	IDTVEC(div_pti), IDTVEC(bpt_pti),
	IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
	IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
	IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
	IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
	IDTVEC(xmm_pti),
#ifdef KDTRACE_HOOKS
	IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
#endif
#ifdef XENHVM
	IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
#endif
	IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
	IDTVEC(fast_syscall_pti);

#ifdef DDB
/*
 * Display the index and function name of any IDT entries that don't use
 * the default 'rsvd' entry point.
 */
DB_SHOW_COMMAND(idt, db_show_idt)
{
	struct gate_descriptor *ip;
	int idx;
	uintptr_t func;

	ip = idt;
	for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
		func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
		if (func != (uintptr_t)&IDTVEC(rsvd)) {
			db_printf("%3d\t", idx);
			db_printsym(func, DB_STGY_PROC);
			db_printf("\n");
		}
		ip++;
	}
}

/* Show privileged registers. */
DB_SHOW_COMMAND(sysregs, db_show_sysregs)
{
	struct {
		uint16_t limit;
		uint64_t base;
	} __packed idtr, gdtr;
	uint16_t ldt, tr;

	__asm __volatile("sidt %0" : "=m" (idtr));
	db_printf("idtr\t0x%016lx/%04x\n",
	    (u_long)idtr.base, (u_int)idtr.limit);
	__asm __volatile("sgdt %0" : "=m" (gdtr));
	db_printf("gdtr\t0x%016lx/%04x\n",
	    (u_long)gdtr.base, (u_int)gdtr.limit);
	__asm __volatile("sldt %0" : "=r" (ldt));
	db_printf("ldtr\t0x%04x\n", ldt);
	__asm __volatile("str %0" : "=r" (tr));
	db_printf("tr\t0x%04x\n", tr);
	db_printf("cr0\t0x%016lx\n", rcr0());
	db_printf("cr2\t0x%016lx\n", rcr2());
	db_printf("cr3\t0x%016lx\n", rcr3());
	db_printf("cr4\t0x%016lx\n", rcr4());
	if (rcr4() & CR4_XSAVE)
		db_printf("xcr0\t0x%016lx\n", rxcr(0));
	db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER));
	if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
		db_printf("FEATURES_CTL\t%016lx\n",
		    rdmsr(MSR_IA32_FEATURE_CONTROL));
	db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR));
	db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT));
	db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE));
}

DB_SHOW_COMMAND(dbregs, db_show_dbregs)
{

	db_printf("dr0\t0x%016lx\n", rdr0());
	db_printf("dr1\t0x%016lx\n", rdr1());
	db_printf("dr2\t0x%016lx\n", rdr2());
	db_printf("dr3\t0x%016lx\n", rdr3());
	db_printf("dr6\t0x%016lx\n", rdr6());
	db_printf("dr7\t0x%016lx\n", rdr7());
}
#endif
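
/*
 * From the DDB prompt the commands above are invoked as "show idt",
 * "show sysregs" and "show dbregs" respectively; DB_SHOW_COMMAND places
 * its first argument under the "show" prefix.
 */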

void
sdtossd(sd, ssd)
	struct user_segment_descriptor *sd;
	struct soft_segment_descriptor *ssd;
{

	ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
	ssd->ssd_type  = sd->sd_type;
	ssd->ssd_dpl   = sd->sd_dpl;
	ssd->ssd_p     = sd->sd_p;
	ssd->ssd_long  = sd->sd_long;
	ssd->ssd_def32 = sd->sd_def32;
	ssd->ssd_gran  = sd->sd_gran;
}

void
ssdtosd(ssd, sd)
	struct soft_segment_descriptor *ssd;
	struct user_segment_descriptor *sd;
{

	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type  = ssd->ssd_type;
	sd->sd_dpl   = ssd->ssd_dpl;
	sd->sd_p     = ssd->ssd_p;
	sd->sd_long  = ssd->ssd_long;
	sd->sd_def32 = ssd->ssd_def32;
	sd->sd_gran  = ssd->ssd_gran;
}

void
ssdtosyssd(ssd, sd)
	struct soft_segment_descriptor *ssd;
	struct system_segment_descriptor *sd;
{

	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type  = ssd->ssd_type;
	sd->sd_dpl   = ssd->ssd_dpl;
	sd->sd_p     = ssd->ssd_p;
	sd->sd_gran  = ssd->ssd_gran;
}

#if !defined(DEV_ATPIC) && defined(DEV_ISA)
#include <isa/isavar.h>
#include <isa/isareg.h>
/*
 * Return a bitmap of the current interrupt requests.  This is 8259-specific
 * and is only suitable for use at probe time.
 * This is only here to pacify sio.  It is NOT FATAL if this doesn't work.
 * It shouldn't be here.  There should probably be an APIC centric
 * implementation in the apic driver code, if at all.
 */
intrmask_t
isa_irq_pending(void)
{
	u_char irr1;
	u_char irr2;

	irr1 = inb(IO_ICU1);
	irr2 = inb(IO_ICU2);
	return ((irr2 << 8) | irr1);
}
#endif

u_int basemem;

static int
add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
    int *physmap_idxp)
{
	int i, insert_idx, physmap_idx;

	physmap_idx = *physmap_idxp;

	if (length == 0)
		return (1);

	/*
	 * Find insertion point while checking for overlap.  Start off by
	 * assuming the new entry will be added to the end.
	 *
	 * NB: physmap_idx points to the next free slot.
	 */
	insert_idx = physmap_idx;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (base < physmap[i + 1]) {
			if (base + length <= physmap[i]) {
				insert_idx = i;
				break;
			}
			if (boothowto & RB_VERBOSE)
				printf(
		    "Overlapping memory regions, ignoring second region\n");
			return (1);
		}
	}

	/* See if we can prepend to the next entry. */
	if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) {
		physmap[insert_idx] = base;
		return (1);
	}

	/* See if we can append to the previous entry. */
	if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
		physmap[insert_idx - 1] += length;
		return (1);
	}

	physmap_idx += 2;
	*physmap_idxp = physmap_idx;
	if (physmap_idx == PHYSMAP_SIZE) {
		printf(
		"Too many segments in the physical address map, giving up\n");
		return (0);
	}

	/*
	 * Move the last 'N' entries down to make room for the new
	 * entry if needed.
	 */
	for (i = (physmap_idx - 2); i > insert_idx; i -= 2) {
		physmap[i] = physmap[i - 2];
		physmap[i + 1] = physmap[i - 1];
	}

	/* Insert the new entry. */
	physmap[insert_idx] = base;
	physmap[insert_idx + 1] = base + length;
	return (1);
}
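
/*
 * Example of the merge behaviour (illustrative values only): with
 * physmap[] = { 0x1000, 0x9f000 }, adding base 0x9f000/length 0x1000
 * extends the existing entry to { 0x1000, 0xa0000 } through the "append"
 * case above, while adding base 0x8000/length 0x1000 is rejected as an
 * overlap and silently ignored (loudly, with RB_VERBOSE).
 */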

void
bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize,
    vm_paddr_t *physmap, int *physmap_idx)
{
	struct bios_smap *smap, *smapend;

	smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);

	for (smap = smapbase; smap < smapend; smap++) {
		if (boothowto & RB_VERBOSE)
			printf("SMAP type=%02x base=%016lx len=%016lx\n",
			    smap->type, smap->base, smap->length);

		if (smap->type != SMAP_TYPE_MEMORY)
			continue;

		if (!add_physmap_entry(smap->base, smap->length, physmap,
		    physmap_idx))
			break;
	}
}

static void
add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
    int *physmap_idx)
{
	struct efi_md *map, *p;
	const char *type;
	size_t efisz;
	int ndesc, i;

	static const char *types[] = {
		"Reserved",
		"LoaderCode",
		"LoaderData",
		"BootServicesCode",
		"BootServicesData",
		"RuntimeServicesCode",
		"RuntimeServicesData",
		"ConventionalMemory",
		"UnusableMemory",
		"ACPIReclaimMemory",
		"ACPIMemoryNVS",
		"MemoryMappedIO",
		"MemoryMappedIOPortSpace",
		"PalCode",
		"PersistentMemory"
	};

	/*
	 * Memory map data provided by UEFI via the GetMemoryMap
	 * Boot Services API.
	 */
	efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
	map = (struct efi_md *)((uint8_t *)efihdr + efisz);

	if (efihdr->descriptor_size == 0)
		return;
	ndesc = efihdr->memory_size / efihdr->descriptor_size;

	if (boothowto & RB_VERBOSE)
		printf("%23s %12s %12s %8s %4s\n",
		    "Type", "Physical", "Virtual", "#Pages", "Attr");

	for (i = 0, p = map; i < ndesc; i++,
	    p = efi_next_descriptor(p, efihdr->descriptor_size)) {
		if (boothowto & RB_VERBOSE) {
			if (p->md_type < nitems(types))
				type = types[p->md_type];
			else
				type = "<INVALID>";
			printf("%23s %012lx %12p %08lx ", type, p->md_phys,
			    p->md_virt, p->md_pages);
			if (p->md_attr & EFI_MD_ATTR_UC)
				printf("UC ");
			if (p->md_attr & EFI_MD_ATTR_WC)
				printf("WC ");
			if (p->md_attr & EFI_MD_ATTR_WT)
				printf("WT ");
			if (p->md_attr & EFI_MD_ATTR_WB)
				printf("WB ");
			if (p->md_attr & EFI_MD_ATTR_UCE)
				printf("UCE ");
			if (p->md_attr & EFI_MD_ATTR_WP)
				printf("WP ");
			if (p->md_attr & EFI_MD_ATTR_RP)
				printf("RP ");
			if (p->md_attr & EFI_MD_ATTR_XP)
				printf("XP ");
			if (p->md_attr & EFI_MD_ATTR_NV)
				printf("NV ");
			if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE)
				printf("MORE_RELIABLE ");
			if (p->md_attr & EFI_MD_ATTR_RO)
				printf("RO ");
			if (p->md_attr & EFI_MD_ATTR_RT)
				printf("RUNTIME");
			printf("\n");
		}

		switch (p->md_type) {
		case EFI_MD_TYPE_CODE:
		case EFI_MD_TYPE_DATA:
		case EFI_MD_TYPE_BS_CODE:
		case EFI_MD_TYPE_BS_DATA:
		case EFI_MD_TYPE_FREE:
			/*
			 * We're allowed to use any entry with these types.
			 */
			break;
		default:
			continue;
		}

		if (!add_physmap_entry(p->md_phys, (p->md_pages * PAGE_SIZE),
		    physmap, physmap_idx))
			break;
	}
}

static char bootmethod[16] = "";
SYSCTL_STRING(_machdep, OID_AUTO, bootmethod, CTLFLAG_RD, bootmethod, 0,
    "System firmware boot method");

static void
native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx)
{
	struct bios_smap *smap;
	struct efi_map_header *efihdr;
	u_int32_t size;

	/*
	 * Memory map from INT 15:E820.
	 *
	 * subr_module.c says:
	 * "Consumer may safely assume that size value precedes data."
	 * ie: an int32_t immediately precedes smap.
	 */

	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
	smap = (struct bios_smap *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);
	if (efihdr == NULL && smap == NULL)
		panic("No BIOS smap or EFI map info from loader!");

	if (efihdr != NULL) {
		add_efi_map_entries(efihdr, physmap, physmap_idx);
		strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
	} else {
		size = *((u_int32_t *)smap - 1);
		bios_add_smap_entries(smap, size, physmap, physmap_idx);
		strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
	}
}

#define	PAGES_PER_GB	(1024 * 1024 * 1024 / PAGE_SIZE)

/*
 * Populate the (physmap) array with base/bound pairs describing the
 * available physical memory in the system, then test this memory and
 * build the phys_avail array describing the actually-available memory.
 *
 * Total memory size may be set by the kernel environment variable
 * hw.physmem or the compile-time define MAXMEM.
 *
 * XXX first should be vm_paddr_t.
 */
static void
getmemsize(caddr_t kmdp, u_int64_t first)
{
	int i, physmap_idx, pa_indx, da_indx;
	vm_paddr_t pa, physmap[PHYSMAP_SIZE];
	u_long physmem_start, physmem_tunable, memtest;
	pt_entry_t *pte;
	quad_t dcons_addr, dcons_size;
	int page_counter;

	/*
	 * Tell the physical memory allocator about pages used to store
	 * the kernel and preloaded data.  See kmem_bootstrap_free().
	 */
	vm_phys_add_seg((vm_paddr_t)kernphys, trunc_page(first));

	bzero(physmap, sizeof(physmap));
	physmap_idx = 0;

	init_ops.parse_memmap(kmdp, physmap, &physmap_idx);
	physmap_idx -= 2;

	/*
	 * Find the 'base memory' segment for SMP
	 */
	basemem = 0;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (physmap[i] <= 0xA0000) {
			basemem = physmap[i + 1] / 1024;
			break;
		}
	}
	if (basemem == 0 || basemem > 640) {
		if (bootverbose)
			printf(
		"Memory map doesn't contain a basemem segment, faking it");
		basemem = 640;
	}

	/*
	 * Make hole for "AP -> long mode" bootstrap code.  The
	 * mp_bootaddress vector is only available when the kernel
	 * is configured to support APs and APs for the system start
	 * in 32bit mode (e.g. SMP bare metal).
	 */
	if (init_ops.mp_bootaddress) {
		if (physmap[1] >= 0x100000000)
			panic(
	"Basemem segment is not suitable for AP bootstrap code!");
		physmap[1] = init_ops.mp_bootaddress(physmap[1] / 1024);
	}

	/*
	 * Maxmem isn't the "maximum memory", it's one larger than the
	 * highest page of the physical address space.  It should be
	 * called something like "Maxphyspage".  We may adjust this
	 * based on ``hw.physmem'' and the results of the memory test.
	 */
	Maxmem = atop(physmap[physmap_idx + 1]);

#ifdef MAXMEM
	Maxmem = MAXMEM / 4;
#endif

	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
		Maxmem = atop(physmem_tunable);

	/*
	 * The boot memory test is disabled by default, as it takes a
	 * significant amount of time on large-memory systems, and is
	 * unfriendly to virtual machines as it unnecessarily touches all
	 * pages.
	 *
	 * A general name is used as the code may be extended to support
	 * additional tests beyond the current "page present" test.
	 */
	memtest = 0;
	TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);

	/*
	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
	 * in the system.
	 */
	if (Maxmem > atop(physmap[physmap_idx + 1]))
		Maxmem = atop(physmap[physmap_idx + 1]);

	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
	    (boothowto & RB_VERBOSE))
		printf("Physical memory use set to %ldK\n", Maxmem * 4);

	/* call pmap initialization to make new kernel address space */
	pmap_bootstrap(&first);

	/*
	 * Size up each available chunk of physical memory.
	 *
	 * XXX Some BIOSes corrupt low 64KB between suspend and resume.
	 * By default, mask off the first 16 pages unless we appear to be
	 * running in a VM.
	 */
	physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
	TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
	if (physmap[0] < physmem_start) {
		if (physmem_start < PAGE_SIZE)
			physmap[0] = PAGE_SIZE;
		else if (physmem_start >= physmap[1])
			physmap[0] = round_page(physmap[1] - PAGE_SIZE);
		else
			physmap[0] = round_page(physmem_start);
	}
	pa_indx = 0;
	da_indx = 1;
	phys_avail[pa_indx++] = physmap[0];
	phys_avail[pa_indx] = physmap[0];
	dump_avail[da_indx] = physmap[0];
	pte = CMAP1;

	/*
	 * Get dcons buffer address
	 */
	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
	    getenv_quad("dcons.size", &dcons_size) == 0)
		dcons_addr = 0;

	/*
	 * physmap is in bytes, so when converting to page boundaries,
	 * round up the start address and round down the end address.
	 */
	page_counter = 0;
	if (memtest != 0)
		printf("Testing system memory");
	for (i = 0; i <= physmap_idx; i += 2) {
		vm_paddr_t end;

		end = ptoa((vm_paddr_t)Maxmem);
		if (physmap[i + 1] < end)
			end = trunc_page(physmap[i + 1]);
		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
			int tmp, page_bad, full;
			int *ptr = (int *)CADDR1;

			full = FALSE;
			/*
			 * block out kernel memory as not available.
			 */
			if (pa >= (vm_paddr_t)kernphys && pa < first)
				goto do_dump_avail;

			/*
			 * block out dcons buffer
			 */
			if (dcons_addr > 0
			    && pa >= trunc_page(dcons_addr)
			    && pa < dcons_addr + dcons_size)
				goto do_dump_avail;

			page_bad = FALSE;
			if (memtest == 0)
				goto skip_memtest;

			/*
			 * Print a "." every GB to show we're making
			 * progress.
			 */
			page_counter++;
			if ((page_counter % PAGES_PER_GB) == 0)
				printf(".");

			/*
			 * map page into kernel: valid, read/write,non-cacheable
			 */
			*pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
			invltlb();

			tmp = *(int *)ptr;
			/*
			 * Test for alternating 1's and 0's
			 */
			*(volatile int *)ptr = 0xaaaaaaaa;
			if (*(volatile int *)ptr != 0xaaaaaaaa)
				page_bad = TRUE;
			/*
			 * Test for alternating 0's and 1's
			 */
			*(volatile int *)ptr = 0x55555555;
			if (*(volatile int *)ptr != 0x55555555)
				page_bad = TRUE;
			/*
			 * Test for all 1's
			 */
			*(volatile int *)ptr = 0xffffffff;
			if (*(volatile int *)ptr != 0xffffffff)
				page_bad = TRUE;
			/*
			 * Test for all 0's
			 */
			*(volatile int *)ptr = 0x0;
			if (*(volatile int *)ptr != 0x0)
				page_bad = TRUE;
			/*
			 * Restore original value.
			 */
			*(int *)ptr = tmp;

skip_memtest:
			/*
			 * Adjust array of valid/good pages.
			 */
			if (page_bad == TRUE)
				continue;
			/*
			 * If this good page is a continuation of the
			 * previous set of good pages, then just increase
			 * the end pointer.  Otherwise start a new chunk.
			 * Note that "end" points one higher than end,
			 * making the range >= start and < end.
			 * If we're also doing a speculative memory
			 * test and we're at or past the end, bump up Maxmem
			 * so that we keep going.  The first bad page
			 * will terminate the loop.
			 */
			if (phys_avail[pa_indx] == pa) {
				phys_avail[pa_indx] += PAGE_SIZE;
			} else {
				pa_indx++;
				if (pa_indx == PHYS_AVAIL_ARRAY_END) {
					printf(
		"Too many holes in the physical address space, giving up\n");
					pa_indx--;
					full = TRUE;
					goto do_dump_avail;
				}
				phys_avail[pa_indx++] = pa;	/* start */
				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
			}
			physmem++;
do_dump_avail:
			if (dump_avail[da_indx] == pa) {
				dump_avail[da_indx] += PAGE_SIZE;
			} else {
				da_indx++;
				if (da_indx == DUMP_AVAIL_ARRAY_END) {
					da_indx--;
					goto do_next;
				}
				dump_avail[da_indx++] = pa;	/* start */
				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
			}
do_next:
			if (full)
				break;
		}
	}
	*pte = 0;
	invltlb();
	if (memtest != 0)
		printf("\n");

	/*
	 * XXX
	 * The last chunk must contain at least one page plus the message
	 * buffer to avoid complicating other code (message buffer address
	 * calculation, etc.).
	 */
	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
	    round_page(msgbufsize) >= phys_avail[pa_indx]) {
		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
		phys_avail[pa_indx--] = 0;
		phys_avail[pa_indx--] = 0;
	}

	Maxmem = atop(phys_avail[pa_indx]);

	/* Trim off space for the message buffer. */
	phys_avail[pa_indx] -= round_page(msgbufsize);

	/* Map the message buffer. */
	msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
}
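
/*
 * To enable the (slow) boot-time memory test described above, set the
 * loader tunable, e.g. in /boot/loader.conf:
 *
 *	hw.memtest.tests="1"
 *
 * Any nonzero value currently selects the page-present pattern test.
 */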

static caddr_t
native_parse_preload_data(u_int64_t modulep)
{
	caddr_t kmdp;
	char *envp;
#ifdef DDB
	vm_offset_t ksym_start;
	vm_offset_t ksym_end;
#endif

	preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
	preload_bootstrap_relocate(KERNBASE);
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
	envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *);
	if (envp != NULL)
		envp += KERNBASE;
	init_static_kenv(envp, 0);
#ifdef DDB
	ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
	ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
	db_fetch_ksymtab(ksym_start, ksym_end);
#endif
	efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);

	return (kmdp);
}

static void
amd64_kdb_init(void)
{
	kdb_init();
#ifdef KDB
	if (boothowto & RB_KDB)
		kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
#endif
}

/* Set up the fast syscall stuff */
void
amd64_conf_fast_syscall(void)
{
	uint64_t msr;

	msr = rdmsr(MSR_EFER) | EFER_SCE;
	wrmsr(MSR_EFER, msr);
	wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) :
	    (u_int64_t)IDTVEC(fast_syscall));
	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
	    ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
	wrmsr(MSR_STAR, msr);
	wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D);
}
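
/*
 * How the MSR programming above plays out (per the AMD64 architecture
 * manuals): SYSCALL loads %cs from STAR[47:32] and %ss from that selector
 * plus 8, and clears the %rflags bits set in SF_MASK, so interrupts are
 * off on kernel entry; SYSRET loads a 32-bit %cs from STAR[63:48], or that
 * selector plus 16 for a 64-bit return, with %ss again plus 8.  That is
 * why GUCODE32_SEL goes in bits 63:48 and why GUDATA_SEL and GUCODE_SEL
 * must sit at the next two GDT slots after it.
 */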

u_int64_t
hammer_time(u_int64_t modulep, u_int64_t physfree)
{
	caddr_t kmdp;
	int gsel_tss, x;
	struct pcpu *pc;
	struct nmi_pcpu *np;
	struct xstate_hdr *xhdr;
	u_int64_t rsp0;
	char *env;
	size_t kstack0_sz;
	int late_console;

	kmdp = init_ops.parse_preload_data(modulep);

	physfree += ucode_load_bsp(physfree + KERNBASE);
	physfree = roundup2(physfree, PAGE_SIZE);

	identify_cpu1();
	identify_hypervisor();
	/*
	 * hw.cpu_stdext_disable is ignored by the call, it will be
	 * re-evaluated by the below call to finishidentcpu().
	 */
	identify_cpu2();

	link_elf_ireloc(kmdp);

	/*
	 * This may be done better later if it gets more high level
	 * components in it. If so just link td->td_proc here.
	 */
	proc_linkup0(&proc0, &thread0);

	/* Init basic tunables, hz etc */
	init_param1();

	thread0.td_kstack = physfree + KERNBASE;
	thread0.td_kstack_pages = kstack_pages;
	kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
	bzero((void *)thread0.td_kstack, kstack0_sz);
	physfree += kstack0_sz;

	/*
	 * make gdt memory segments
	 */
	for (x = 0; x < NGDT; x++) {
		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
		    x != GUSERLDT_SEL && x != (GUSERLDT_SEL) + 1)
			ssdtosd(&gdt_segs[x], &gdt[x]);
	}
	gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&common_tss[0];
	ssdtosyssd(&gdt_segs[GPROC0_SEL],
	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);

	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
	r_gdt.rd_base = (long) gdt;
	lgdt(&r_gdt);
	pc = &__pcpu[0];

	wrmsr(MSR_FSBASE, 0);		/* User value */
	wrmsr(MSR_GSBASE, (u_int64_t)pc);
	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */

	pcpu_init(pc, 0, sizeof(struct pcpu));
	dpcpu_init((void *)(physfree + KERNBASE), 0);
	physfree += DPCPU_SIZE;
	PCPU_SET(prvspace, pc);
	PCPU_SET(curthread, &thread0);
	/* Non-late cninit() and printf() can be moved up to here. */
	PCPU_SET(tssp, &common_tss[0]);
	PCPU_SET(commontssp, &common_tss[0]);
	PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
	PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
	PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
	PCPU_SET(gs32p, &gdt[GUGS32_SEL]);

	/*
	 * Initialize mutexes.
	 *
	 * icu_lock: in order to allow an interrupt to occur in a critical
	 *	     section, to set pcpu->ipending (etc...) properly, we
	 *	     must be able to get the icu lock, so it can't be
	 *	     under witness.
	 */
	mutex_init();
	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);

	/* exceptions */
	pti = pti_get_default();
	TUNABLE_INT_FETCH("vm.pmap.pti", &pti);

	for (x = 0; x < NIDT; x++)
		setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT,
		    SEL_KPL, 0);
	setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
	setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 2);
	setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
	    SEL_UPL, 0);
	setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT,
	    SEL_UPL, 0);
	setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
	setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm),
	    SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing),
	    SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3);
	setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
	    SEL_KPL, 0);
#ifdef KDTRACE_HOOKS
	setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) :
	    &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
#endif
#ifdef XENHVM
	setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) :
	    &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0);
#endif
	r_idt.rd_limit = sizeof(idt0) - 1;
	r_idt.rd_base = (long) idt;
	lidt(&r_idt);

	/*
	 * Initialize the clock before the console so that console
	 * initialization can use DELAY().
	 */
	clock_init();

	/*
	 * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4)
	 * transition).
	 * Once bootblocks have updated, we can test directly for
	 * efi_systbl != NULL here...
	 */
	if (preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP)
	    != NULL)
		vty_set_preferred(VTY_VT);

	finishidentcpu();	/* Final stage of CPU initialization */
	initializecpu();	/* Initialize CPU registers */
	initializecpucache();

	/* doublefault stack space, runs on ist1 */
	common_tss[0].tss_ist1 = (long)&dblfault_stack[sizeof(dblfault_stack)];

	/*
	 * NMI stack, runs on ist2.  The pcpu pointer is stored just
	 * above the start of the ist2 stack.
	 */
	np = ((struct nmi_pcpu *) &nmi0_stack[sizeof(nmi0_stack)]) - 1;
	np->np_pcpu = (register_t) pc;
	common_tss[0].tss_ist2 = (long) np;

	/*
	 * MC# stack, runs on ist3.  The pcpu pointer is stored just
	 * above the start of the ist3 stack.
	 */
	np = ((struct nmi_pcpu *) &mce0_stack[sizeof(mce0_stack)]) - 1;
	np->np_pcpu = (register_t) pc;
	common_tss[0].tss_ist3 = (long) np;

	/*
	 * DB# stack, runs on ist4.
	 */
	np = ((struct nmi_pcpu *) &dbg0_stack[sizeof(dbg0_stack)]) - 1;
	np->np_pcpu = (register_t) pc;
	common_tss[0].tss_ist4 = (long) np;

	/* Set the IO permission bitmap (empty due to tss seg limit) */
	common_tss[0].tss_iobase = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE;

	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	ltr(gsel_tss);

	amd64_conf_fast_syscall();

	/*
	 * Temporarily forge some valid pointer to PCB, for exception
	 * handlers.  It is reinitialized properly below after FPU is
	 * set up.  Also set up td_critnest to short-cut the page
	 * fault handler.
	 */
	cpu_max_ext_state_size = sizeof(struct savefpu);
	thread0.td_pcb = get_pcb_td(&thread0);
	thread0.td_critnest = 1;

	/*
	 * The console and kdb should be initialized even earlier than here,
	 * but some console drivers don't work until after getmemsize().
	 * Default to late console initialization to support these drivers.
	 * This loses mainly printf()s in getmemsize() and early debugging.
	 */
	late_console = 1;
	TUNABLE_INT_FETCH("debug.late_console", &late_console);
	if (!late_console) {
		cninit();
		amd64_kdb_init();
	}

	getmemsize(kmdp, physfree);
	init_param2(physmem);

	/* now running on new page tables, configured, and u/iom is accessible */

	if (late_console)
		cninit();

#ifdef DEV_ISA
#ifdef DEV_ATPIC
	elcr_probe();
	atpic_startup();
#else
	/* Reset and mask the atpics and leave them shut down. */
	atpic_reset();

	/*
	 * Point the ICU spurious interrupt vectors at the APIC spurious
	 * interrupt handler.
	 */
	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
#endif
#else
#error "have you forgotten the isa device?";
#endif

	if (late_console)
		amd64_kdb_init();

	msgbufinit(msgbufp, msgbufsize);
	fpuinit();

	/*
	 * Set up thread0 pcb after fpuinit calculated pcb + fpu save
	 * area size.  Zero out the extended state header in fpu save
	 * area.
	 */
	thread0.td_pcb = get_pcb_td(&thread0);
	thread0.td_pcb->pcb_save = get_pcb_user_save_td(&thread0);
	bzero(get_pcb_user_save_td(&thread0), cpu_max_ext_state_size);
	if (use_xsave) {
		xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) +
		    1);
		xhdr->xstate_bv = xsave_mask;
	}
	/* make an initial tss so cpu can get interrupt stack on syscall! */
	rsp0 = (vm_offset_t)thread0.td_pcb;
	/* Ensure the stack is aligned to 16 bytes */
	rsp0 &= ~0xFul;
	common_tss[0].tss_rsp0 = rsp0;
	PCPU_SET(rsp0, rsp0);
	PCPU_SET(pti_rsp0, ((vm_offset_t)PCPU_PTR(pti_stack) +
	    PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful);
	PCPU_SET(curpcb, thread0.td_pcb);

	/* transfer to user mode */

	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
	_ufssel = GSEL(GUFS32_SEL, SEL_UPL);
	_ugssel = GSEL(GUGS32_SEL, SEL_UPL);

	load_ds(_udatasel);
	load_es(_udatasel);
	load_fs(_ufssel);

	/* setup proc 0's pcb */
	thread0.td_pcb->pcb_flags = 0;
	thread0.td_frame = &proc0_tf;

	env = kern_getenv("kernelname");
	if (env != NULL)
		strlcpy(kernelname, env, sizeof(kernelname));

	cpu_probe_amdc1e();

#ifdef FDT
	x86_init_fdt();
#endif
	thread0.td_critnest = 0;

	TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
	TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable);
	TUNABLE_INT_FETCH("hw.mds_disable", &hw_mds_disable);

	/* Location of kernel stack for locore */
	return ((u_int64_t)thread0.td_pcb);
}

void
cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
{

	pcpu->pc_acpi_id = 0xffffffff;
}

static int
smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	struct bios_smap *smapbase;
	struct bios_smap_xattr smap;
	caddr_t kmdp;
	uint32_t *smapattr;
	int count, error, i;

	/* Retrieve the system memory map from the loader. */
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	smapbase = (struct bios_smap *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);
	if (smapbase == NULL)
		return (0);
	smapattr = (uint32_t *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
	count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase);
	error = 0;
	for (i = 0; i < count; i++) {
		smap.base = smapbase[i].base;
		smap.length = smapbase[i].length;
		smap.type = smapbase[i].type;
		if (smapattr != NULL)
			smap.xattr = smapattr[i];
		else
			smap.xattr = 0;
		error = SYSCTL_OUT(req, &smap, sizeof(smap));
	}
	return (error);
}
SYSCTL_PROC(_machdep, OID_AUTO, smap, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
    smap_sysctl_handler, "S,bios_smap_xattr", "Raw BIOS SMAP data");

static int
efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	struct efi_map_header *efihdr;
	caddr_t kmdp;
	uint32_t efisize;

	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
	if (efihdr == NULL)
		return (0);
	efisize = *((uint32_t *)efihdr - 1);
	return (SYSCTL_OUT(req, efihdr, efisize));
}
SYSCTL_PROC(_machdep, OID_AUTO, efi_map, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
    efi_map_sysctl_handler, "S,efi_map_header", "Raw EFI Memory Map");
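
/*
 * Both OIDs are read-only opaque blobs.  From userland they can be dumped
 * with, e.g., "sysctl -x machdep.smap" or "sysctl -x machdep.efi_map";
 * sysctl(8) also knows how to pretty-print these particular formats.
 */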

void
spinlock_enter(void)
{
	struct thread *td;
	register_t flags;

	td = curthread;
	if (td->td_md.md_spinlock_count == 0) {
		flags = intr_disable();
		td->td_md.md_spinlock_count = 1;
		td->td_md.md_saved_flags = flags;
	} else
		td->td_md.md_spinlock_count++;
	critical_enter();
}

void
spinlock_exit(void)
{
	struct thread *td;
	register_t flags;

	td = curthread;
	critical_exit();
	flags = td->td_md.md_saved_flags;
	td->td_md.md_spinlock_count--;
	if (td->td_md.md_spinlock_count == 0)
		intr_restore(flags);
}

/*
 * Construct a PCB from a trapframe. This is called from kdb_trap() where
 * we want to start a backtrace from the function that caused us to enter
 * the debugger. We have the context in the trapframe, but base the trace
 * on the PCB. The PCB doesn't have to be perfect, as long as it contains
 * enough for a backtrace.
 */
void
makectx(struct trapframe *tf, struct pcb *pcb)
{

	pcb->pcb_r12 = tf->tf_r12;
	pcb->pcb_r13 = tf->tf_r13;
	pcb->pcb_r14 = tf->tf_r14;
	pcb->pcb_r15 = tf->tf_r15;
	pcb->pcb_rbp = tf->tf_rbp;
	pcb->pcb_rbx = tf->tf_rbx;
	pcb->pcb_rip = tf->tf_rip;
	pcb->pcb_rsp = tf->tf_rsp;
}
1976 */ 1977void 1978makectx(struct trapframe *tf, struct pcb *pcb) 1979{ 1980 1981 pcb->pcb_r12 = tf->tf_r12; 1982 pcb->pcb_r13 = tf->tf_r13; 1983 pcb->pcb_r14 = tf->tf_r14; 1984 pcb->pcb_r15 = tf->tf_r15; 1985 pcb->pcb_rbp = tf->tf_rbp; 1986 pcb->pcb_rbx = tf->tf_rbx; 1987 pcb->pcb_rip = tf->tf_rip; 1988 pcb->pcb_rsp = tf->tf_rsp; 1989} 1990 1991int 1992ptrace_set_pc(struct thread *td, unsigned long addr) 1993{ 1994 1995 td->td_frame->tf_rip = addr; 1996 set_pcb_flags(td->td_pcb, PCB_FULL_IRET); 1997 return (0); 1998} 1999 2000int 2001ptrace_single_step(struct thread *td) 2002{ 2003 2004 PROC_LOCK_ASSERT(td->td_proc, MA_OWNED); 2005 if ((td->td_frame->tf_rflags & PSL_T) == 0) { 2006 td->td_frame->tf_rflags |= PSL_T; 2007 td->td_dbgflags |= TDB_STEP; 2008 } 2009 return (0); 2010} 2011 2012int 2013ptrace_clear_single_step(struct thread *td) 2014{ 2015 PROC_LOCK_ASSERT(td->td_proc, MA_OWNED); 2016 td->td_frame->tf_rflags &= ~PSL_T; 2017 td->td_dbgflags &= ~TDB_STEP; 2018 return (0); 2019} 2020 2021int 2022fill_regs(struct thread *td, struct reg *regs) 2023{ 2024 struct trapframe *tp; 2025 2026 tp = td->td_frame; 2027 return (fill_frame_regs(tp, regs)); 2028} 2029 2030int 2031fill_frame_regs(struct trapframe *tp, struct reg *regs) 2032{ 2033 2034 regs->r_r15 = tp->tf_r15; 2035 regs->r_r14 = tp->tf_r14; 2036 regs->r_r13 = tp->tf_r13; 2037 regs->r_r12 = tp->tf_r12; 2038 regs->r_r11 = tp->tf_r11; 2039 regs->r_r10 = tp->tf_r10; 2040 regs->r_r9 = tp->tf_r9; 2041 regs->r_r8 = tp->tf_r8; 2042 regs->r_rdi = tp->tf_rdi; 2043 regs->r_rsi = tp->tf_rsi; 2044 regs->r_rbp = tp->tf_rbp; 2045 regs->r_rbx = tp->tf_rbx; 2046 regs->r_rdx = tp->tf_rdx; 2047 regs->r_rcx = tp->tf_rcx; 2048 regs->r_rax = tp->tf_rax; 2049 regs->r_rip = tp->tf_rip; 2050 regs->r_cs = tp->tf_cs; 2051 regs->r_rflags = tp->tf_rflags; 2052 regs->r_rsp = tp->tf_rsp; 2053 regs->r_ss = tp->tf_ss; 2054 if (tp->tf_flags & TF_HASSEGS) { 2055 regs->r_ds = tp->tf_ds; 2056 regs->r_es = tp->tf_es; 2057 regs->r_fs = tp->tf_fs; 2058 regs->r_gs = tp->tf_gs; 2059 } else { 2060 regs->r_ds = 0; 2061 regs->r_es = 0; 2062 regs->r_fs = 0; 2063 regs->r_gs = 0; 2064 } 2065 regs->r_err = 0; 2066 regs->r_trapno = 0; 2067 return (0); 2068} 2069 2070int 2071set_regs(struct thread *td, struct reg *regs) 2072{ 2073 struct trapframe *tp; 2074 register_t rflags; 2075 2076 tp = td->td_frame; 2077 rflags = regs->r_rflags & 0xffffffff; 2078 if (!EFL_SECURE(rflags, tp->tf_rflags) || !CS_SECURE(regs->r_cs)) 2079 return (EINVAL); 2080 tp->tf_r15 = regs->r_r15; 2081 tp->tf_r14 = regs->r_r14; 2082 tp->tf_r13 = regs->r_r13; 2083 tp->tf_r12 = regs->r_r12; 2084 tp->tf_r11 = regs->r_r11; 2085 tp->tf_r10 = regs->r_r10; 2086 tp->tf_r9 = regs->r_r9; 2087 tp->tf_r8 = regs->r_r8; 2088 tp->tf_rdi = regs->r_rdi; 2089 tp->tf_rsi = regs->r_rsi; 2090 tp->tf_rbp = regs->r_rbp; 2091 tp->tf_rbx = regs->r_rbx; 2092 tp->tf_rdx = regs->r_rdx; 2093 tp->tf_rcx = regs->r_rcx; 2094 tp->tf_rax = regs->r_rax; 2095 tp->tf_rip = regs->r_rip; 2096 tp->tf_cs = regs->r_cs; 2097 tp->tf_rflags = rflags; 2098 tp->tf_rsp = regs->r_rsp; 2099 tp->tf_ss = regs->r_ss; 2100 if (0) { /* XXXKIB */ 2101 tp->tf_ds = regs->r_ds; 2102 tp->tf_es = regs->r_es; 2103 tp->tf_fs = regs->r_fs; 2104 tp->tf_gs = regs->r_gs; 2105 tp->tf_flags = TF_HASSEGS; 2106 } 2107 set_pcb_flags(td->td_pcb, PCB_FULL_IRET); 2108 return (0); 2109} 2110 2111/* XXX check all this stuff! 
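 * The two helpers below translate between the kernel's fxsave/xsave
 * image (struct savefpu) and the ptrace(2) view (struct fpreg): eight
 * 10-byte x87 accumulators and sixteen 16-byte XMM registers, plus the
 * control and status words in the environment header.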
*/ 2112/* externalize from sv_xmm */ 2113static void 2114fill_fpregs_xmm(struct savefpu *sv_xmm, struct fpreg *fpregs) 2115{ 2116 struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env; 2117 struct envxmm *penv_xmm = &sv_xmm->sv_env; 2118 int i; 2119 2120 /* pcb -> fpregs */ 2121 bzero(fpregs, sizeof(*fpregs)); 2122 2123 /* FPU control/status */ 2124 penv_fpreg->en_cw = penv_xmm->en_cw; 2125 penv_fpreg->en_sw = penv_xmm->en_sw; 2126 penv_fpreg->en_tw = penv_xmm->en_tw; 2127 penv_fpreg->en_opcode = penv_xmm->en_opcode; 2128 penv_fpreg->en_rip = penv_xmm->en_rip; 2129 penv_fpreg->en_rdp = penv_xmm->en_rdp; 2130 penv_fpreg->en_mxcsr = penv_xmm->en_mxcsr; 2131 penv_fpreg->en_mxcsr_mask = penv_xmm->en_mxcsr_mask; 2132 2133 /* FPU registers */ 2134 for (i = 0; i < 8; ++i) 2135 bcopy(sv_xmm->sv_fp[i].fp_acc.fp_bytes, fpregs->fpr_acc[i], 10); 2136 2137 /* SSE registers */ 2138 for (i = 0; i < 16; ++i) 2139 bcopy(sv_xmm->sv_xmm[i].xmm_bytes, fpregs->fpr_xacc[i], 16); 2140} 2141 2142/* internalize from fpregs into sv_xmm */ 2143static void 2144set_fpregs_xmm(struct fpreg *fpregs, struct savefpu *sv_xmm) 2145{ 2146 struct envxmm *penv_xmm = &sv_xmm->sv_env; 2147 struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env; 2148 int i; 2149 2150 /* fpregs -> pcb */ 2151 /* FPU control/status */ 2152 penv_xmm->en_cw = penv_fpreg->en_cw; 2153 penv_xmm->en_sw = penv_fpreg->en_sw; 2154 penv_xmm->en_tw = penv_fpreg->en_tw; 2155 penv_xmm->en_opcode = penv_fpreg->en_opcode; 2156 penv_xmm->en_rip = penv_fpreg->en_rip; 2157 penv_xmm->en_rdp = penv_fpreg->en_rdp; 2158 penv_xmm->en_mxcsr = penv_fpreg->en_mxcsr; 2159 penv_xmm->en_mxcsr_mask = penv_fpreg->en_mxcsr_mask & cpu_mxcsr_mask; 2160 2161 /* FPU registers */ 2162 for (i = 0; i < 8; ++i) 2163 bcopy(fpregs->fpr_acc[i], sv_xmm->sv_fp[i].fp_acc.fp_bytes, 10); 2164 2165 /* SSE registers */ 2166 for (i = 0; i < 16; ++i) 2167 bcopy(fpregs->fpr_xacc[i], sv_xmm->sv_xmm[i].xmm_bytes, 16); 2168} 2169 2170/* externalize from td->pcb */ 2171int 2172fill_fpregs(struct thread *td, struct fpreg *fpregs) 2173{ 2174 2175 KASSERT(td == curthread || TD_IS_SUSPENDED(td) || 2176 P_SHOULDSTOP(td->td_proc), 2177 ("not suspended thread %p", td)); 2178 fpugetregs(td); 2179 fill_fpregs_xmm(get_pcb_user_save_td(td), fpregs); 2180 return (0); 2181} 2182 2183/* internalize to td->pcb */ 2184int 2185set_fpregs(struct thread *td, struct fpreg *fpregs) 2186{ 2187 2188 critical_enter(); 2189 set_fpregs_xmm(fpregs, get_pcb_user_save_td(td)); 2190 fpuuserinited(td); 2191 critical_exit(); 2192 return (0); 2193} 2194 2195/* 2196 * Get machine context. 
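 * This backs getcontext(2) and friends.  When GET_MC_CLEAR_RET is
 * passed, %rax, %rdx and PSL_C are cleared in the saved copy, so a
 * context resumed via setcontext(2) observes a successful (zero)
 * return from the original system call.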
2197 */ 2198int 2199get_mcontext(struct thread *td, mcontext_t *mcp, int flags) 2200{ 2201 struct pcb *pcb; 2202 struct trapframe *tp; 2203 2204 pcb = td->td_pcb; 2205 tp = td->td_frame; 2206 PROC_LOCK(curthread->td_proc); 2207 mcp->mc_onstack = sigonstack(tp->tf_rsp); 2208 PROC_UNLOCK(curthread->td_proc); 2209 mcp->mc_r15 = tp->tf_r15; 2210 mcp->mc_r14 = tp->tf_r14; 2211 mcp->mc_r13 = tp->tf_r13; 2212 mcp->mc_r12 = tp->tf_r12; 2213 mcp->mc_r11 = tp->tf_r11; 2214 mcp->mc_r10 = tp->tf_r10; 2215 mcp->mc_r9 = tp->tf_r9; 2216 mcp->mc_r8 = tp->tf_r8; 2217 mcp->mc_rdi = tp->tf_rdi; 2218 mcp->mc_rsi = tp->tf_rsi; 2219 mcp->mc_rbp = tp->tf_rbp; 2220 mcp->mc_rbx = tp->tf_rbx; 2221 mcp->mc_rcx = tp->tf_rcx; 2222 mcp->mc_rflags = tp->tf_rflags; 2223 if (flags & GET_MC_CLEAR_RET) { 2224 mcp->mc_rax = 0; 2225 mcp->mc_rdx = 0; 2226 mcp->mc_rflags &= ~PSL_C; 2227 } else { 2228 mcp->mc_rax = tp->tf_rax; 2229 mcp->mc_rdx = tp->tf_rdx; 2230 } 2231 mcp->mc_rip = tp->tf_rip; 2232 mcp->mc_cs = tp->tf_cs; 2233 mcp->mc_rsp = tp->tf_rsp; 2234 mcp->mc_ss = tp->tf_ss; 2235 mcp->mc_ds = tp->tf_ds; 2236 mcp->mc_es = tp->tf_es; 2237 mcp->mc_fs = tp->tf_fs; 2238 mcp->mc_gs = tp->tf_gs; 2239 mcp->mc_flags = tp->tf_flags; 2240 mcp->mc_len = sizeof(*mcp); 2241 get_fpcontext(td, mcp, NULL, 0); 2242 update_pcb_bases(pcb); 2243 mcp->mc_fsbase = pcb->pcb_fsbase; 2244 mcp->mc_gsbase = pcb->pcb_gsbase; 2245 mcp->mc_xfpustate = 0; 2246 mcp->mc_xfpustate_len = 0; 2247 bzero(mcp->mc_spare, sizeof(mcp->mc_spare)); 2248 return (0); 2249} 2250 2251/* 2252 * Set machine context. 2253 * 2254 * However, we don't set any but the user modifiable flags, and we won't 2255 * touch the cs selector. 2256 */ 2257int 2258set_mcontext(struct thread *td, mcontext_t *mcp) 2259{ 2260 struct pcb *pcb; 2261 struct trapframe *tp; 2262 char *xfpustate; 2263 long rflags; 2264 int ret; 2265 2266 pcb = td->td_pcb; 2267 tp = td->td_frame; 2268 if (mcp->mc_len != sizeof(*mcp) || 2269 (mcp->mc_flags & ~_MC_FLAG_MASK) != 0) 2270 return (EINVAL); 2271 rflags = (mcp->mc_rflags & PSL_USERCHANGE) | 2272 (tp->tf_rflags & ~PSL_USERCHANGE); 2273 if (mcp->mc_flags & _MC_HASFPXSTATE) { 2274 if (mcp->mc_xfpustate_len > cpu_max_ext_state_size - 2275 sizeof(struct savefpu)) 2276 return (EINVAL); 2277 xfpustate = __builtin_alloca(mcp->mc_xfpustate_len); 2278 ret = copyin((void *)mcp->mc_xfpustate, xfpustate, 2279 mcp->mc_xfpustate_len); 2280 if (ret != 0) 2281 return (ret); 2282 } else 2283 xfpustate = NULL; 2284 ret = set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len); 2285 if (ret != 0) 2286 return (ret); 2287 tp->tf_r15 = mcp->mc_r15; 2288 tp->tf_r14 = mcp->mc_r14; 2289 tp->tf_r13 = mcp->mc_r13; 2290 tp->tf_r12 = mcp->mc_r12; 2291 tp->tf_r11 = mcp->mc_r11; 2292 tp->tf_r10 = mcp->mc_r10; 2293 tp->tf_r9 = mcp->mc_r9; 2294 tp->tf_r8 = mcp->mc_r8; 2295 tp->tf_rdi = mcp->mc_rdi; 2296 tp->tf_rsi = mcp->mc_rsi; 2297 tp->tf_rbp = mcp->mc_rbp; 2298 tp->tf_rbx = mcp->mc_rbx; 2299 tp->tf_rdx = mcp->mc_rdx; 2300 tp->tf_rcx = mcp->mc_rcx; 2301 tp->tf_rax = mcp->mc_rax; 2302 tp->tf_rip = mcp->mc_rip; 2303 tp->tf_rflags = rflags; 2304 tp->tf_rsp = mcp->mc_rsp; 2305 tp->tf_ss = mcp->mc_ss; 2306 tp->tf_flags = mcp->mc_flags; 2307 if (tp->tf_flags & TF_HASSEGS) { 2308 tp->tf_ds = mcp->mc_ds; 2309 tp->tf_es = mcp->mc_es; 2310 tp->tf_fs = mcp->mc_fs; 2311 tp->tf_gs = mcp->mc_gs; 2312 } 2313 set_pcb_flags(pcb, PCB_FULL_IRET); 2314 if (mcp->mc_flags & _MC_HASBASES) { 2315 pcb->pcb_fsbase = mcp->mc_fsbase; 2316 pcb->pcb_gsbase = mcp->mc_gsbase; 2317 } 2318 return (0); 2319} 2320 
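/*
 * Illustration (a hypothetical userland sketch, not kernel code): the
 * two routines above are the machine-dependent halves of getcontext(2)
 * and setcontext(2).  A minimal round-trip through them looks like:
 *
 *	#include <ucontext.h>
 *
 *	ucontext_t uc;
 *	volatile int resumed = 0;
 *
 *	getcontext(&uc);		-- get_mcontext() fills uc
 *	if (!resumed) {
 *		resumed = 1;
 *		setcontext(&uc);	-- set_mcontext(); control resumes
 *	}				-- right after getcontext()
 */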
2321static void
2322get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave,
2323 size_t xfpusave_len)
2324{
2325 size_t max_len, len;
2326
2327 mcp->mc_ownedfp = fpugetregs(td);
2328 bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate[0],
2329 sizeof(mcp->mc_fpstate));
2330 mcp->mc_fpformat = fpuformat();
2331 if (!use_xsave || xfpusave_len == 0)
2332 return;
2333 max_len = cpu_max_ext_state_size - sizeof(struct savefpu);
2334 len = xfpusave_len;
2335 if (len > max_len) {
2336 len = max_len;
2337 bzero(xfpusave + max_len, xfpusave_len - max_len);
2338 }
2339 mcp->mc_flags |= _MC_HASFPXSTATE;
2340 mcp->mc_xfpustate_len = len;
2341 bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len);
2342}
2343
2344static int
2345set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate,
2346 size_t xfpustate_len)
2347{
2348 int error;
2349
2350 if (mcp->mc_fpformat == _MC_FPFMT_NODEV)
2351 return (0);
2352 else if (mcp->mc_fpformat != _MC_FPFMT_XMM)
2353 return (EINVAL);
2354 else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) {
2355 /* We don't care what state is left in the FPU or PCB. */
2356 fpstate_drop(td);
2357 error = 0;
2358 } else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU ||
2359 mcp->mc_ownedfp == _MC_FPOWNED_PCB) {
2360 error = fpusetregs(td, (struct savefpu *)&mcp->mc_fpstate,
2361 xfpustate, xfpustate_len);
2362 } else
2363 return (EINVAL);
2364 return (error);
2365}
2366
2367void
2368fpstate_drop(struct thread *td)
2369{
2370
2371 KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu"));
2372 critical_enter();
2373 if (PCPU_GET(fpcurthread) == td)
2374 fpudrop();
2375 /*
2376 * XXX force a full drop of the fpu.  The above only drops it if we
2377 * owned it.
2378 *
2379 * XXX I don't much like fpugetuserregs()'s semantics of doing a full
2380 * drop.  Dropping only to the pcb matches fnsave's behaviour.
2381 * We only need to drop to !PCB_INITDONE in sendsig().  But
2382 * sendsig() is the only caller of fpugetuserregs()... perhaps we just
2383 * have too many layers.
2384 */
2385 clear_pcb_flags(curthread->td_pcb,
2386 PCB_FPUINITDONE | PCB_USERFPUINITDONE);
2387 critical_exit();
2388}
2389
2390int
2391fill_dbregs(struct thread *td, struct dbreg *dbregs)
2392{
2393 struct pcb *pcb;
2394
2395 if (td == NULL) {
2396 dbregs->dr[0] = rdr0();
2397 dbregs->dr[1] = rdr1();
2398 dbregs->dr[2] = rdr2();
2399 dbregs->dr[3] = rdr3();
2400 dbregs->dr[6] = rdr6();
2401 dbregs->dr[7] = rdr7();
2402 } else {
2403 pcb = td->td_pcb;
2404 dbregs->dr[0] = pcb->pcb_dr0;
2405 dbregs->dr[1] = pcb->pcb_dr1;
2406 dbregs->dr[2] = pcb->pcb_dr2;
2407 dbregs->dr[3] = pcb->pcb_dr3;
2408 dbregs->dr[6] = pcb->pcb_dr6;
2409 dbregs->dr[7] = pcb->pcb_dr7;
2410 }
2411 dbregs->dr[4] = 0;
2412 dbregs->dr[5] = 0;
2413 dbregs->dr[8] = 0;
2414 dbregs->dr[9] = 0;
2415 dbregs->dr[10] = 0;
2416 dbregs->dr[11] = 0;
2417 dbregs->dr[12] = 0;
2418 dbregs->dr[13] = 0;
2419 dbregs->dr[14] = 0;
2420 dbregs->dr[15] = 0;
2421 return (0);
2422}
2423
2424int
2425set_dbregs(struct thread *td, struct dbreg *dbregs)
2426{
2427 struct pcb *pcb;
2428 int i;
2429
2430 if (td == NULL) {
2431 load_dr0(dbregs->dr[0]);
2432 load_dr1(dbregs->dr[1]);
2433 load_dr2(dbregs->dr[2]);
2434 load_dr3(dbregs->dr[3]);
2435 load_dr6(dbregs->dr[6]);
2436 load_dr7(dbregs->dr[7]);
2437 } else {
2438 /*
2439 * Don't let an illegal value for dr7 get set.  Specifically,
2440 * check for undefined settings.  Setting these bit patterns
2441 * results in undefined behaviour and can lead to an unexpected
2442 * TRCTRAP or a general protection fault right here.
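 * (The access-type encoding 0x02 is undefined unless CR4.DE is
 * enabled, and the 8-byte length encoding is only valid for 64-bit
 * code, hence the per-breakpoint checks below.)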
2443 * Upper bits of dr6 and dr7 must not be set.
2444 */
2445 for (i = 0; i < 4; i++) {
2446 if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02)
2447 return (EINVAL);
2448 if (td->td_frame->tf_cs == _ucode32sel &&
2449 DBREG_DR7_LEN(dbregs->dr[7], i) == DBREG_DR7_LEN_8)
2450 return (EINVAL);
2451 }
2452 if ((dbregs->dr[6] & 0xffffffff00000000ul) != 0 ||
2453 (dbregs->dr[7] & 0xffffffff00000000ul) != 0)
2454 return (EINVAL);
2455
2456 pcb = td->td_pcb;
2457
2458 /*
2459 * Don't let a process set a breakpoint that is not within the
2460 * process's address space.  If a process could do this, it
2461 * could halt the system by setting a breakpoint in the kernel
2462 * (if ddb was enabled).  Thus, we need to check to make sure
2463 * that no breakpoints are being enabled for addresses outside
2464 * the process's address space.
2465 *
2466 * XXX - what about when the watched area of the user's
2467 * address space is written into from within the kernel
2468 * ... wouldn't that still cause a breakpoint to be generated
2469 * from within kernel mode?
2470 */
2471
2472 if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) {
2473 /* dr0 is enabled */
2474 if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS)
2475 return (EINVAL);
2476 }
2477 if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) {
2478 /* dr1 is enabled */
2479 if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS)
2480 return (EINVAL);
2481 }
2482 if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) {
2483 /* dr2 is enabled */
2484 if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS)
2485 return (EINVAL);
2486 }
2487 if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) {
2488 /* dr3 is enabled */
2489 if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS)
2490 return (EINVAL);
2491 }
2492
2493 pcb->pcb_dr0 = dbregs->dr[0];
2494 pcb->pcb_dr1 = dbregs->dr[1];
2495 pcb->pcb_dr2 = dbregs->dr[2];
2496 pcb->pcb_dr3 = dbregs->dr[3];
2497 pcb->pcb_dr6 = dbregs->dr[6];
2498 pcb->pcb_dr7 = dbregs->dr[7];
2499
2500 set_pcb_flags(pcb, PCB_DBREGS);
2501 }
2502
2503 return (0);
2504}
2505
2506void
2507reset_dbregs(void)
2508{
2509
2510 load_dr7(0);	/* Turn off the control bits first */
2511 load_dr0(0);
2512 load_dr1(0);
2513 load_dr2(0);
2514 load_dr3(0);
2515 load_dr6(0);
2516}
2517
2518/*
2519 * Return > 0 if a hardware breakpoint has been hit, and the
2520 * breakpoint was in user space.  Return 0, otherwise.
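 * This is called from the T_TRCTRAP handler; DR6 bits B0-B3 record
 * which of DR0-DR3 fired, and the corresponding breakpoint addresses
 * are then classified against VM_MAXUSER_ADDRESS.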
2521 */
2522int
2523user_dbreg_trap(register_t dr6)
2524{
2525 u_int64_t dr7;
2526 u_int64_t bp;	/* breakpoint bits extracted from dr6 */
2527 int nbp;	/* number of breakpoints that triggered */
2528 caddr_t addr[4];	/* breakpoint addresses */
2529 int i;
2530
2531 bp = dr6 & DBREG_DR6_BMASK;
2532 if (bp == 0) {
2533 /*
2534 * None of the breakpoint bits are set, meaning this
2535 * trap was not caused by any of the debug registers.
2536 */
2537 return (0);
2538 }
2539
2540 dr7 = rdr7();
2541 if ((dr7 & 0x000000ff) == 0) {
2542 /*
2543 * All of the Gn/Ln enable bits in the dr7 register are zero,
2544 * thus the trap couldn't have been caused by the
2545 * hardware debug registers.
2546 */
2547 return (0);
2548 }
2549
2550 nbp = 0;
2551
2552 /*
2553 * At least one of the breakpoints was hit; check to see
2554 * which ones, and whether any of them are user space addresses.
2555 */
2556
2557 if (bp & 0x01) {
2558 addr[nbp++] = (caddr_t)rdr0();
2559 }
2560 if (bp & 0x02) {
2561 addr[nbp++] = (caddr_t)rdr1();
2562 }
2563 if (bp & 0x04) {
2564 addr[nbp++] = (caddr_t)rdr2();
2565 }
2566 if (bp & 0x08) {
2567 addr[nbp++] = (caddr_t)rdr3();
2568 }
2569
2570 for (i = 0; i < nbp; i++) {
2571 if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) {
2572 /*
2573 * addr[i] is in user space.
2574 */
2575 return (nbp);
2576 }
2577 }
2578
2579 /*
2580 * None of the breakpoints are in user space.
2581 */
2582 return (0);
2583}
2584
2585/*
2586 * The pcb_flags field is only modified by the current thread, or by other
2587 * threads when the current thread is stopped.  However, the current thread
2588 * may change it from the interrupt context in cpu_switch(), or in the trap handler.
2589 * When we read-modify-write pcb_flags from C sources, the compiler may generate
2590 * code that is not atomic with respect to the interrupt handler.  If a trap or
2591 * interrupt happens and any flag is modified from the handler, it can be
2592 * clobbered with the cached value later.  Therefore, we implement setting
2593 * and clearing flags with single-instruction functions, which do not race
2594 * with possible modification of the flags from the trap or interrupt context,
2595 * because traps and interrupts are executed only on instruction boundaries.
2596 */
2597void
2598set_pcb_flags_raw(struct pcb *pcb, const u_int flags)
2599{
2600
2601 __asm __volatile("orl %1,%0"
2602 : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags)
2603 : "cc", "memory");
2604
2605}
2606
2607/*
2608 * The support for RDFSBASE, WRFSBASE and similar instructions for %gs
2609 * base requires that the kernel save MSR_FSBASE and MSR_{K,}GSBASE into the
2610 * pcb if user space modified the bases.  We must save them on the context
2611 * switch, or when the return to usermode happens through doreti.
2612 *
2613 * Tracking of both events is performed by the pcb flag PCB_FULL_IRET,
2614 * which has the consequence that the base MSRs must be saved each time
2615 * the PCB_FULL_IRET flag is set.  We disable interrupts to sync with
2616 * context switches.
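 * On CPUs with FSGSBASE, user code can change the bases without any
 * kernel involvement, which is why set_pcb_flags() below samples them
 * with rdfsbase()/rdmsr(MSR_KGSBASE) at the moment the flag is first
 * set.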
2617 */
2618void
2619set_pcb_flags(struct pcb *pcb, const u_int flags)
2620{
2621 register_t r;
2622
2623 if (curpcb == pcb &&
2624 (flags & PCB_FULL_IRET) != 0 &&
2625 (pcb->pcb_flags & PCB_FULL_IRET) == 0 &&
2626 (cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0) {
2627 r = intr_disable();
2628 if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) {
2629 if (rfs() == _ufssel)
2630 pcb->pcb_fsbase = rdfsbase();
2631 if (rgs() == _ugssel)
2632 pcb->pcb_gsbase = rdmsr(MSR_KGSBASE);
2633 }
2634 set_pcb_flags_raw(pcb, flags);
2635 intr_restore(r);
2636 } else {
2637 set_pcb_flags_raw(pcb, flags);
2638 }
2639}
2640
2641void
2642clear_pcb_flags(struct pcb *pcb, const u_int flags)
2643{
2644
2645 __asm __volatile("andl %1,%0"
2646 : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags)
2647 : "cc", "memory");
2648}
2649
2650#ifdef KDB
2651
2652/*
2653 * Provide inb() and outb() as functions.  They are normally only available as
2654 * inline functions, thus cannot be called from the debugger.
2655 */
2656
2657/* silence compiler warnings */
2658u_char inb_(u_short);
2659void outb_(u_short, u_char);
2660
2661u_char
2662inb_(u_short port)
2663{
2664 return (inb(port));
2665}
2666
2667void
2668outb_(u_short port, u_char data)
2669{
2670 outb(port, data);
2671}
2672
2673#endif /* KDB */
2674
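/*
 * Illustration (a sketch): from the ddb(4) prompt the wrappers above
 * can be invoked directly, e.g. to read the i8042 status port:
 *
 *	db> call inb_(0x64)
 */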