machdep.c revision 336963
1/*- 2 * Copyright (c) 2003 Peter Wemm. 3 * Copyright (c) 1992 Terrence R. Lambert. 4 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to Berkeley by 8 * William Jolitz. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. All advertising materials mentioning features or use of this software 19 * must display the following acknowledgement: 20 * This product includes software developed by the University of 21 * California, Berkeley and its contributors. 22 * 4. Neither the name of the University nor the names of its contributors 23 * may be used to endorse or promote products derived from this software 24 * without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36 * SUCH DAMAGE. 
37 * 38 * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 39 */ 40 41#include <sys/cdefs.h> 42__FBSDID("$FreeBSD: stable/11/sys/amd64/amd64/machdep.c 336963 2018-07-31 10:18:30Z kib $"); 43 44#include "opt_atpic.h" 45#include "opt_compat.h" 46#include "opt_cpu.h" 47#include "opt_ddb.h" 48#include "opt_inet.h" 49#include "opt_isa.h" 50#include "opt_kstack_pages.h" 51#include "opt_maxmem.h" 52#include "opt_mp_watchdog.h" 53#include "opt_perfmon.h" 54#include "opt_platform.h" 55#include "opt_sched.h" 56 57#include <sys/param.h> 58#include <sys/proc.h> 59#include <sys/systm.h> 60#include <sys/bio.h> 61#include <sys/buf.h> 62#include <sys/bus.h> 63#include <sys/callout.h> 64#include <sys/cons.h> 65#include <sys/cpu.h> 66#include <sys/efi.h> 67#include <sys/eventhandler.h> 68#include <sys/exec.h> 69#include <sys/imgact.h> 70#include <sys/kdb.h> 71#include <sys/kernel.h> 72#include <sys/ktr.h> 73#include <sys/linker.h> 74#include <sys/lock.h> 75#include <sys/malloc.h> 76#include <sys/memrange.h> 77#include <sys/msgbuf.h> 78#include <sys/mutex.h> 79#include <sys/pcpu.h> 80#include <sys/ptrace.h> 81#include <sys/reboot.h> 82#include <sys/rwlock.h> 83#include <sys/sched.h> 84#include <sys/signalvar.h> 85#ifdef SMP 86#include <sys/smp.h> 87#endif 88#include <sys/syscallsubr.h> 89#include <sys/sysctl.h> 90#include <sys/sysent.h> 91#include <sys/sysproto.h> 92#include <sys/ucontext.h> 93#include <sys/vmmeter.h> 94 95#include <vm/vm.h> 96#include <vm/vm_extern.h> 97#include <vm/vm_kern.h> 98#include <vm/vm_page.h> 99#include <vm/vm_map.h> 100#include <vm/vm_object.h> 101#include <vm/vm_pager.h> 102#include <vm/vm_param.h> 103 104#ifdef DDB 105#ifndef KDB 106#error KDB must be enabled in order for DDB to work! 107#endif 108#include <ddb/ddb.h> 109#include <ddb/db_sym.h> 110#endif 111 112#include <net/netisr.h> 113 114#include <machine/clock.h> 115#include <machine/cpu.h> 116#include <machine/cputypes.h> 117#include <machine/frame.h> 118#include <machine/intr_machdep.h> 119#include <x86/mca.h> 120#include <machine/md_var.h> 121#include <machine/metadata.h> 122#include <machine/mp_watchdog.h> 123#include <machine/pc/bios.h> 124#include <machine/pcb.h> 125#include <machine/proc.h> 126#include <machine/reg.h> 127#include <machine/sigframe.h> 128#include <machine/specialreg.h> 129#ifdef PERFMON 130#include <machine/perfmon.h> 131#endif 132#include <machine/tss.h> 133#ifdef SMP 134#include <machine/smp.h> 135#endif 136#ifdef FDT 137#include <x86/fdt.h> 138#endif 139 140#ifdef DEV_ATPIC 141#include <x86/isa/icu.h> 142#else 143#include <x86/apicvar.h> 144#endif 145 146#include <isa/isareg.h> 147#include <isa/rtc.h> 148#include <x86/init.h> 149 150/* Sanity check for __curthread() */ 151CTASSERT(offsetof(struct pcpu, pc_curthread) == 0); 152 153/* 154 * The PTI trampoline stack needs enough space for a hardware trapframe and a 155 * couple of scratch registers, as well as the trapframe left behind after an 156 * iret fault. 
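 * In other words, the assertion below demands room for one complete
 * pti_frame (the scratch registers plus the hardware-pushed frame) and,
 * on top of that, the hardware-pushed words from pti_rip onward that a
 * faulting iret leaves behind; hence the
 * 2 * sizeof(struct pti_frame) - offsetof(struct pti_frame, pti_rip)
 * lower bound on PC_PTI_STACK_SZ * sizeof(register_t).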
157 */ 158CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) - 159 offsetof(struct pti_frame, pti_rip)); 160 161extern u_int64_t hammer_time(u_int64_t, u_int64_t); 162 163#define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) 164#define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) 165 166static void cpu_startup(void *); 167static void get_fpcontext(struct thread *td, mcontext_t *mcp, 168 char *xfpusave, size_t xfpusave_len); 169static int set_fpcontext(struct thread *td, mcontext_t *mcp, 170 char *xfpustate, size_t xfpustate_len); 171SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL); 172 173/* Preload data parse function */ 174static caddr_t native_parse_preload_data(u_int64_t); 175 176/* Native function to fetch and parse the e820 map */ 177static void native_parse_memmap(caddr_t, vm_paddr_t *, int *); 178 179/* Default init_ops implementation. */ 180struct init_ops init_ops = { 181 .parse_preload_data = native_parse_preload_data, 182 .early_clock_source_init = i8254_init, 183 .early_delay = i8254_delay, 184 .parse_memmap = native_parse_memmap, 185#ifdef SMP 186 .mp_bootaddress = mp_bootaddress, 187 .start_all_aps = native_start_all_aps, 188#endif 189 .msi_init = msi_init, 190}; 191 192struct msgbuf *msgbufp; 193 194/* 195 * Physical address of the EFI System Table. Stashed from the metadata hints 196 * passed into the kernel and used by the EFI code to call runtime services. 197 */ 198vm_paddr_t efi_systbl_phys; 199 200/* Intel ICH registers */ 201#define ICH_PMBASE 0x400 202#define ICH_SMI_EN ICH_PMBASE + 0x30 203 204int _udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel; 205 206int cold = 1; 207 208long Maxmem = 0; 209long realmem = 0; 210 211/* 212 * The number of PHYSMAP entries must be one less than the number of 213 * PHYSSEG entries because the PHYSMAP entry that spans the largest 214 * physical address that is accessible by ISA DMA is split into two 215 * PHYSSEG entries. 216 */ 217#define PHYSMAP_SIZE (2 * (VM_PHYSSEG_MAX - 1)) 218 219vm_paddr_t phys_avail[PHYSMAP_SIZE + 2]; 220vm_paddr_t dump_avail[PHYSMAP_SIZE + 2]; 221 222/* must be 2 less so 0 0 can signal end of chunks */ 223#define PHYS_AVAIL_ARRAY_END (nitems(phys_avail) - 2) 224#define DUMP_AVAIL_ARRAY_END (nitems(dump_avail) - 2) 225 226struct kva_md_info kmi; 227 228static struct trapframe proc0_tf; 229struct region_descriptor r_gdt, r_idt; 230 231struct pcpu __pcpu[MAXCPU]; 232 233struct mtx icu_lock; 234 235struct mem_range_softc mem_range_softc; 236 237struct mtx dt_lock; /* lock for GDT and LDT */ 238 239void (*vmm_resume_p)(void); 240 241static void 242cpu_startup(dummy) 243 void *dummy; 244{ 245 uintmax_t memsize; 246 char *sysenv; 247 248 /* 249 * On MacBooks, we need to disallow the legacy USB circuit to 250 * generate an SMI# because this can cause several problems, 251 * namely: incorrect CPU frequency detection and failure to 252 * start the APs. 253 * We do this by disabling a bit in the SMI_EN (SMI Control and 254 * Enable register) of the Intel ICH LPC Interface Bridge. 
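 * The product string matched below is supplied by the loader's SMBIOS
 * probe and can be checked on a running system with kenv(1), e.g.
 * "kenv smbios.system.product" prints "MacBook3,1" on one of the
 * affected machines.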
255 */ 256 sysenv = kern_getenv("smbios.system.product"); 257 if (sysenv != NULL) { 258 if (strncmp(sysenv, "MacBook1,1", 10) == 0 || 259 strncmp(sysenv, "MacBook3,1", 10) == 0 || 260 strncmp(sysenv, "MacBook4,1", 10) == 0 || 261 strncmp(sysenv, "MacBookPro1,1", 13) == 0 || 262 strncmp(sysenv, "MacBookPro1,2", 13) == 0 || 263 strncmp(sysenv, "MacBookPro3,1", 13) == 0 || 264 strncmp(sysenv, "MacBookPro4,1", 13) == 0 || 265 strncmp(sysenv, "Macmini1,1", 10) == 0) { 266 if (bootverbose) 267 printf("Disabling LEGACY_USB_EN bit on " 268 "Intel ICH.\n"); 269 outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8); 270 } 271 freeenv(sysenv); 272 } 273 274 /* 275 * Good {morning,afternoon,evening,night}. 276 */ 277 startrtclock(); 278 printcpuinfo(); 279#ifdef PERFMON 280 perfmon_init(); 281#endif 282 283 /* 284 * Display physical memory if SMBIOS reports reasonable amount. 285 */ 286 memsize = 0; 287 sysenv = kern_getenv("smbios.memory.enabled"); 288 if (sysenv != NULL) { 289 memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10; 290 freeenv(sysenv); 291 } 292 if (memsize < ptoa((uintmax_t)vm_cnt.v_free_count)) 293 memsize = ptoa((uintmax_t)Maxmem); 294 printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20); 295 realmem = atop(memsize); 296 297 /* 298 * Display any holes after the first chunk of extended memory. 299 */ 300 if (bootverbose) { 301 int indx; 302 303 printf("Physical memory chunk(s):\n"); 304 for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { 305 vm_paddr_t size; 306 307 size = phys_avail[indx + 1] - phys_avail[indx]; 308 printf( 309 "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n", 310 (uintmax_t)phys_avail[indx], 311 (uintmax_t)phys_avail[indx + 1] - 1, 312 (uintmax_t)size, (uintmax_t)size / PAGE_SIZE); 313 } 314 } 315 316 vm_ksubmap_init(&kmi); 317 318 printf("avail memory = %ju (%ju MB)\n", 319 ptoa((uintmax_t)vm_cnt.v_free_count), 320 ptoa((uintmax_t)vm_cnt.v_free_count) / 1048576); 321 322 /* 323 * Set up buffers, so they can be used to read disk labels. 324 */ 325 bufinit(); 326 vm_pager_bufferinit(); 327 328 cpu_setregs(); 329} 330 331/* 332 * Send an interrupt to process. 333 * 334 * Stack is set up to allow sigcode stored 335 * at top to call routine, followed by call 336 * to sigreturn routine below. After sigreturn 337 * resets the signal mask, the stack, and the 338 * frame pointer, it returns to the user 339 * specified pc, psl. 340 */ 341void 342sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) 343{ 344 struct sigframe sf, *sfp; 345 struct pcb *pcb; 346 struct proc *p; 347 struct thread *td; 348 struct sigacts *psp; 349 char *sp; 350 struct trapframe *regs; 351 char *xfpusave; 352 size_t xfpusave_len; 353 int sig; 354 int oonstack; 355 356 td = curthread; 357 pcb = td->td_pcb; 358 p = td->td_proc; 359 PROC_LOCK_ASSERT(p, MA_OWNED); 360 sig = ksi->ksi_signo; 361 psp = p->p_sigacts; 362 mtx_assert(&psp->ps_mtx, MA_OWNED); 363 regs = td->td_frame; 364 oonstack = sigonstack(regs->tf_rsp); 365 366 if (cpu_max_ext_state_size > sizeof(struct savefpu) && use_xsave) { 367 xfpusave_len = cpu_max_ext_state_size - sizeof(struct savefpu); 368 xfpusave = __builtin_alloca(xfpusave_len); 369 } else { 370 xfpusave_len = 0; 371 xfpusave = NULL; 372 } 373 374 /* Save user context. */ 375 bzero(&sf, sizeof(sf)); 376 sf.sf_uc.uc_sigmask = *mask; 377 sf.sf_uc.uc_stack = td->td_sigstk; 378 sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) 379 ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; 380 sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 
1 : 0; 381 bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(*regs)); 382 sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */ 383 get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len); 384 fpstate_drop(td); 385 update_pcb_bases(pcb); 386 sf.sf_uc.uc_mcontext.mc_fsbase = pcb->pcb_fsbase; 387 sf.sf_uc.uc_mcontext.mc_gsbase = pcb->pcb_gsbase; 388 bzero(sf.sf_uc.uc_mcontext.mc_spare, 389 sizeof(sf.sf_uc.uc_mcontext.mc_spare)); 390 bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__)); 391 392 /* Allocate space for the signal handler context. */ 393 if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && 394 SIGISMEMBER(psp->ps_sigonstack, sig)) { 395 sp = (char *)td->td_sigstk.ss_sp + td->td_sigstk.ss_size; 396#if defined(COMPAT_43) 397 td->td_sigstk.ss_flags |= SS_ONSTACK; 398#endif 399 } else 400 sp = (char *)regs->tf_rsp - 128; 401 if (xfpusave != NULL) { 402 sp -= xfpusave_len; 403 sp = (char *)((unsigned long)sp & ~0x3Ful); 404 sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp; 405 } 406 sp -= sizeof(struct sigframe); 407 /* Align to 16 bytes. */ 408 sfp = (struct sigframe *)((unsigned long)sp & ~0xFul); 409 410 /* Build the argument list for the signal handler. */ 411 regs->tf_rdi = sig; /* arg 1 in %rdi */ 412 regs->tf_rdx = (register_t)&sfp->sf_uc; /* arg 3 in %rdx */ 413 bzero(&sf.sf_si, sizeof(sf.sf_si)); 414 if (SIGISMEMBER(psp->ps_siginfo, sig)) { 415 /* Signal handler installed with SA_SIGINFO. */ 416 regs->tf_rsi = (register_t)&sfp->sf_si; /* arg 2 in %rsi */ 417 sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher; 418 419 /* Fill in POSIX parts */ 420 sf.sf_si = ksi->ksi_info; 421 sf.sf_si.si_signo = sig; /* maybe a translated signal */ 422 regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */ 423 } else { 424 /* Old FreeBSD-style arguments. */ 425 regs->tf_rsi = ksi->ksi_code; /* arg 2 in %rsi */ 426 regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */ 427 sf.sf_ahu.sf_handler = catcher; 428 } 429 mtx_unlock(&psp->ps_mtx); 430 PROC_UNLOCK(p); 431 432 /* 433 * Copy the sigframe out to the user's stack. 434 */ 435 if (copyout(&sf, sfp, sizeof(*sfp)) != 0 || 436 (xfpusave != NULL && copyout(xfpusave, 437 (void *)sf.sf_uc.uc_mcontext.mc_xfpustate, xfpusave_len) 438 != 0)) { 439#ifdef DEBUG 440 printf("process %ld has trashed its stack\n", (long)p->p_pid); 441#endif 442 PROC_LOCK(p); 443 sigexit(td, SIGILL); 444 } 445 446 regs->tf_rsp = (long)sfp; 447 regs->tf_rip = p->p_sysent->sv_sigcode_base; 448 regs->tf_rflags &= ~(PSL_T | PSL_D); 449 regs->tf_cs = _ucodesel; 450 regs->tf_ds = _udatasel; 451 regs->tf_ss = _udatasel; 452 regs->tf_es = _udatasel; 453 regs->tf_fs = _ufssel; 454 regs->tf_gs = _ugssel; 455 regs->tf_flags = TF_HASSEGS; 456 PROC_LOCK(p); 457 mtx_lock(&psp->ps_mtx); 458} 459 460/* 461 * System call to cleanup state after a signal 462 * has been taken. Reset signal mask and 463 * stack state from context left by sendsig (above). 464 * Return to previous pc and psl as specified by 465 * context left by sendsig. Check carefully to 466 * make sure that the user has not modified the 467 * state to gain improper privileges. 
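 *
 * As an illustration (userland code, not part of this file), a handler
 * installed with SA_SIGINFO receives a pointer to the very ucontext_t
 * that sendsig() copied out above and may patch its non-privileged
 * fields before returning through the sigreturn trampoline:
 *
 *	static void
 *	handler(int sig, siginfo_t *si, void *arg)
 *	{
 *		ucontext_t *uc = arg;
 *
 *		uc->uc_mcontext.mc_rip += 2;	// e.g. skip a faulting insn
 *	}
 *
 * Attempts to smuggle privileged state the same way (IOPL bits in
 * mc_rflags, a kernel %cs) are rejected by the EFL_SECURE()/CS_SECURE()
 * checks in sys_sigreturn() below.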
468 * 469 * MPSAFE 470 */ 471int 472sys_sigreturn(td, uap) 473 struct thread *td; 474 struct sigreturn_args /* { 475 const struct __ucontext *sigcntxp; 476 } */ *uap; 477{ 478 ucontext_t uc; 479 struct pcb *pcb; 480 struct proc *p; 481 struct trapframe *regs; 482 ucontext_t *ucp; 483 char *xfpustate; 484 size_t xfpustate_len; 485 long rflags; 486 int cs, error, ret; 487 ksiginfo_t ksi; 488 489 pcb = td->td_pcb; 490 p = td->td_proc; 491 492 error = copyin(uap->sigcntxp, &uc, sizeof(uc)); 493 if (error != 0) { 494 uprintf("pid %d (%s): sigreturn copyin failed\n", 495 p->p_pid, td->td_name); 496 return (error); 497 } 498 ucp = &uc; 499 if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) { 500 uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid, 501 td->td_name, ucp->uc_mcontext.mc_flags); 502 return (EINVAL); 503 } 504 regs = td->td_frame; 505 rflags = ucp->uc_mcontext.mc_rflags; 506 /* 507 * Don't allow users to change privileged or reserved flags. 508 */ 509 if (!EFL_SECURE(rflags, regs->tf_rflags)) { 510 uprintf("pid %d (%s): sigreturn rflags = 0x%lx\n", p->p_pid, 511 td->td_name, rflags); 512 return (EINVAL); 513 } 514 515 /* 516 * Don't allow users to load a valid privileged %cs. Let the 517 * hardware check for invalid selectors, excess privilege in 518 * other selectors, invalid %eip's and invalid %esp's. 519 */ 520 cs = ucp->uc_mcontext.mc_cs; 521 if (!CS_SECURE(cs)) { 522 uprintf("pid %d (%s): sigreturn cs = 0x%x\n", p->p_pid, 523 td->td_name, cs); 524 ksiginfo_init_trap(&ksi); 525 ksi.ksi_signo = SIGBUS; 526 ksi.ksi_code = BUS_OBJERR; 527 ksi.ksi_trapno = T_PROTFLT; 528 ksi.ksi_addr = (void *)regs->tf_rip; 529 trapsignal(td, &ksi); 530 return (EINVAL); 531 } 532 533 if ((uc.uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) { 534 xfpustate_len = uc.uc_mcontext.mc_xfpustate_len; 535 if (xfpustate_len > cpu_max_ext_state_size - 536 sizeof(struct savefpu)) { 537 uprintf("pid %d (%s): sigreturn xfpusave_len = 0x%zx\n", 538 p->p_pid, td->td_name, xfpustate_len); 539 return (EINVAL); 540 } 541 xfpustate = __builtin_alloca(xfpustate_len); 542 error = copyin((const void *)uc.uc_mcontext.mc_xfpustate, 543 xfpustate, xfpustate_len); 544 if (error != 0) { 545 uprintf( 546 "pid %d (%s): sigreturn copying xfpustate failed\n", 547 p->p_pid, td->td_name); 548 return (error); 549 } 550 } else { 551 xfpustate = NULL; 552 xfpustate_len = 0; 553 } 554 ret = set_fpcontext(td, &ucp->uc_mcontext, xfpustate, xfpustate_len); 555 if (ret != 0) { 556 uprintf("pid %d (%s): sigreturn set_fpcontext err %d\n", 557 p->p_pid, td->td_name, ret); 558 return (ret); 559 } 560 bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(*regs)); 561 update_pcb_bases(pcb); 562 pcb->pcb_fsbase = ucp->uc_mcontext.mc_fsbase; 563 pcb->pcb_gsbase = ucp->uc_mcontext.mc_gsbase; 564 565#if defined(COMPAT_43) 566 if (ucp->uc_mcontext.mc_onstack & 1) 567 td->td_sigstk.ss_flags |= SS_ONSTACK; 568 else 569 td->td_sigstk.ss_flags &= ~SS_ONSTACK; 570#endif 571 572 kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0); 573 return (EJUSTRETURN); 574} 575 576#ifdef COMPAT_FREEBSD4 577int 578freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap) 579{ 580 581 return sys_sigreturn(td, (struct sigreturn_args *)uap); 582} 583#endif 584 585/* 586 * Reset registers to default values on exec. 
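 * (A worked example of the stack setup below: with stack ==
 * 0x7fffffffe000, ((stack - 8) & ~0xFul) + 8 evaluates to
 * 0x7fffffffdff8, i.e. the new image enters user mode with %rsp
 * eight bytes below a 16-byte boundary.)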
587 */ 588void 589exec_setregs(struct thread *td, struct image_params *imgp, u_long stack) 590{ 591 struct trapframe *regs = td->td_frame; 592 struct pcb *pcb = td->td_pcb; 593 594 mtx_lock(&dt_lock); 595 if (td->td_proc->p_md.md_ldt != NULL) 596 user_ldt_free(td); 597 else 598 mtx_unlock(&dt_lock); 599 600 update_pcb_bases(pcb); 601 pcb->pcb_fsbase = 0; 602 pcb->pcb_gsbase = 0; 603 clear_pcb_flags(pcb, PCB_32BIT); 604 pcb->pcb_initial_fpucw = __INITIAL_FPUCW__; 605 606 bzero((char *)regs, sizeof(struct trapframe)); 607 regs->tf_rip = imgp->entry_addr; 608 regs->tf_rsp = ((stack - 8) & ~0xFul) + 8; 609 regs->tf_rdi = stack; /* argv */ 610 regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T); 611 regs->tf_ss = _udatasel; 612 regs->tf_cs = _ucodesel; 613 regs->tf_ds = _udatasel; 614 regs->tf_es = _udatasel; 615 regs->tf_fs = _ufssel; 616 regs->tf_gs = _ugssel; 617 regs->tf_flags = TF_HASSEGS; 618 td->td_retval[1] = 0; 619 620 /* 621 * Reset the hardware debug registers if they were in use. 622 * They won't have any meaning for the newly exec'd process. 623 */ 624 if (pcb->pcb_flags & PCB_DBREGS) { 625 pcb->pcb_dr0 = 0; 626 pcb->pcb_dr1 = 0; 627 pcb->pcb_dr2 = 0; 628 pcb->pcb_dr3 = 0; 629 pcb->pcb_dr6 = 0; 630 pcb->pcb_dr7 = 0; 631 if (pcb == curpcb) { 632 /* 633 * Clear the debug registers on the running 634 * CPU, otherwise they will end up affecting 635 * the next process we switch to. 636 */ 637 reset_dbregs(); 638 } 639 clear_pcb_flags(pcb, PCB_DBREGS); 640 } 641 642 /* 643 * Drop the FP state if we hold it, so that the process gets a 644 * clean FP state if it uses the FPU again. 645 */ 646 fpstate_drop(td); 647} 648 649void 650cpu_setregs(void) 651{ 652 register_t cr0; 653 654 cr0 = rcr0(); 655 /* 656 * CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the 657 * BSP. See the comments there about why we set them. 658 */ 659 cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM; 660 load_cr0(cr0); 661} 662 663/* 664 * Initialize amd64 and configure to run kernel 665 */ 666 667/* 668 * Initialize segments & interrupt table 669 */ 670 671struct user_segment_descriptor gdt[NGDT * MAXCPU];/* global descriptor tables */ 672static struct gate_descriptor idt0[NIDT]; 673struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */ 674 675static char dblfault_stack[PAGE_SIZE] __aligned(16); 676static char mce0_stack[PAGE_SIZE] __aligned(16); 677static char nmi0_stack[PAGE_SIZE] __aligned(16); 678static char dbg0_stack[PAGE_SIZE] __aligned(16); 679CTASSERT(sizeof(struct nmi_pcpu) == 16); 680 681struct amd64tss common_tss[MAXCPU]; 682 683/* 684 * Software prototypes -- in more palatable form. 685 * 686 * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same 687 * slots as corresponding segments for i386 kernel. 
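 * The slot numbers in the comments below are what GSEL() packs into the
 * user-visible selectors later in hammer_time(); for example
 * GSEL(GUCODE_SEL, SEL_UPL), with GUCODE_SEL in slot 8, works out to
 * (8 << 3) | 3 == 0x43 for the 64 bit user %cs.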
688 */ 689struct soft_segment_descriptor gdt_segs[] = { 690/* GNULL_SEL 0 Null Descriptor */ 691{ .ssd_base = 0x0, 692 .ssd_limit = 0x0, 693 .ssd_type = 0, 694 .ssd_dpl = 0, 695 .ssd_p = 0, 696 .ssd_long = 0, 697 .ssd_def32 = 0, 698 .ssd_gran = 0 }, 699/* GNULL2_SEL 1 Null Descriptor */ 700{ .ssd_base = 0x0, 701 .ssd_limit = 0x0, 702 .ssd_type = 0, 703 .ssd_dpl = 0, 704 .ssd_p = 0, 705 .ssd_long = 0, 706 .ssd_def32 = 0, 707 .ssd_gran = 0 }, 708/* GUFS32_SEL 2 32 bit %gs Descriptor for user */ 709{ .ssd_base = 0x0, 710 .ssd_limit = 0xfffff, 711 .ssd_type = SDT_MEMRWA, 712 .ssd_dpl = SEL_UPL, 713 .ssd_p = 1, 714 .ssd_long = 0, 715 .ssd_def32 = 1, 716 .ssd_gran = 1 }, 717/* GUGS32_SEL 3 32 bit %fs Descriptor for user */ 718{ .ssd_base = 0x0, 719 .ssd_limit = 0xfffff, 720 .ssd_type = SDT_MEMRWA, 721 .ssd_dpl = SEL_UPL, 722 .ssd_p = 1, 723 .ssd_long = 0, 724 .ssd_def32 = 1, 725 .ssd_gran = 1 }, 726/* GCODE_SEL 4 Code Descriptor for kernel */ 727{ .ssd_base = 0x0, 728 .ssd_limit = 0xfffff, 729 .ssd_type = SDT_MEMERA, 730 .ssd_dpl = SEL_KPL, 731 .ssd_p = 1, 732 .ssd_long = 1, 733 .ssd_def32 = 0, 734 .ssd_gran = 1 }, 735/* GDATA_SEL 5 Data Descriptor for kernel */ 736{ .ssd_base = 0x0, 737 .ssd_limit = 0xfffff, 738 .ssd_type = SDT_MEMRWA, 739 .ssd_dpl = SEL_KPL, 740 .ssd_p = 1, 741 .ssd_long = 1, 742 .ssd_def32 = 0, 743 .ssd_gran = 1 }, 744/* GUCODE32_SEL 6 32 bit Code Descriptor for user */ 745{ .ssd_base = 0x0, 746 .ssd_limit = 0xfffff, 747 .ssd_type = SDT_MEMERA, 748 .ssd_dpl = SEL_UPL, 749 .ssd_p = 1, 750 .ssd_long = 0, 751 .ssd_def32 = 1, 752 .ssd_gran = 1 }, 753/* GUDATA_SEL 7 32/64 bit Data Descriptor for user */ 754{ .ssd_base = 0x0, 755 .ssd_limit = 0xfffff, 756 .ssd_type = SDT_MEMRWA, 757 .ssd_dpl = SEL_UPL, 758 .ssd_p = 1, 759 .ssd_long = 0, 760 .ssd_def32 = 1, 761 .ssd_gran = 1 }, 762/* GUCODE_SEL 8 64 bit Code Descriptor for user */ 763{ .ssd_base = 0x0, 764 .ssd_limit = 0xfffff, 765 .ssd_type = SDT_MEMERA, 766 .ssd_dpl = SEL_UPL, 767 .ssd_p = 1, 768 .ssd_long = 1, 769 .ssd_def32 = 0, 770 .ssd_gran = 1 }, 771/* GPROC0_SEL 9 Proc 0 Tss Descriptor */ 772{ .ssd_base = 0x0, 773 .ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1, 774 .ssd_type = SDT_SYSTSS, 775 .ssd_dpl = SEL_KPL, 776 .ssd_p = 1, 777 .ssd_long = 0, 778 .ssd_def32 = 0, 779 .ssd_gran = 0 }, 780/* Actually, the TSS is a system descriptor which is double size */ 781{ .ssd_base = 0x0, 782 .ssd_limit = 0x0, 783 .ssd_type = 0, 784 .ssd_dpl = 0, 785 .ssd_p = 0, 786 .ssd_long = 0, 787 .ssd_def32 = 0, 788 .ssd_gran = 0 }, 789/* GUSERLDT_SEL 11 LDT Descriptor */ 790{ .ssd_base = 0x0, 791 .ssd_limit = 0x0, 792 .ssd_type = 0, 793 .ssd_dpl = 0, 794 .ssd_p = 0, 795 .ssd_long = 0, 796 .ssd_def32 = 0, 797 .ssd_gran = 0 }, 798/* GUSERLDT_SEL 12 LDT Descriptor, double size */ 799{ .ssd_base = 0x0, 800 .ssd_limit = 0x0, 801 .ssd_type = 0, 802 .ssd_dpl = 0, 803 .ssd_p = 0, 804 .ssd_long = 0, 805 .ssd_def32 = 0, 806 .ssd_gran = 0 }, 807}; 808 809void 810setidt(int idx, inthand_t *func, int typ, int dpl, int ist) 811{ 812 struct gate_descriptor *ip; 813 814 ip = idt + idx; 815 ip->gd_looffset = (uintptr_t)func; 816 ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL); 817 ip->gd_ist = ist; 818 ip->gd_xx = 0; 819 ip->gd_type = typ; 820 ip->gd_dpl = dpl; 821 ip->gd_p = 1; 822 ip->gd_hioffset = ((uintptr_t)func)>>16 ; 823} 824 825extern inthand_t 826 IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl), 827 IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm), 828 IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot), 829 
IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align), 830 IDTVEC(xmm), IDTVEC(dblfault), 831 IDTVEC(div_pti), IDTVEC(bpt_pti), 832 IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti), 833 IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti), 834 IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti), 835 IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti), 836 IDTVEC(xmm_pti), 837#ifdef KDTRACE_HOOKS 838 IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti), 839#endif 840#ifdef XENHVM 841 IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti), 842#endif 843 IDTVEC(fast_syscall), IDTVEC(fast_syscall32), 844 IDTVEC(fast_syscall_pti); 845 846#ifdef DDB 847/* 848 * Display the index and function name of any IDT entries that don't use 849 * the default 'rsvd' entry point. 850 */ 851DB_SHOW_COMMAND(idt, db_show_idt) 852{ 853 struct gate_descriptor *ip; 854 int idx; 855 uintptr_t func; 856 857 ip = idt; 858 for (idx = 0; idx < NIDT && !db_pager_quit; idx++) { 859 func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset); 860 if (func != (uintptr_t)&IDTVEC(rsvd)) { 861 db_printf("%3d\t", idx); 862 db_printsym(func, DB_STGY_PROC); 863 db_printf("\n"); 864 } 865 ip++; 866 } 867} 868 869/* Show privileged registers. */ 870DB_SHOW_COMMAND(sysregs, db_show_sysregs) 871{ 872 struct { 873 uint16_t limit; 874 uint64_t base; 875 } __packed idtr, gdtr; 876 uint16_t ldt, tr; 877 878 __asm __volatile("sidt %0" : "=m" (idtr)); 879 db_printf("idtr\t0x%016lx/%04x\n", 880 (u_long)idtr.base, (u_int)idtr.limit); 881 __asm __volatile("sgdt %0" : "=m" (gdtr)); 882 db_printf("gdtr\t0x%016lx/%04x\n", 883 (u_long)gdtr.base, (u_int)gdtr.limit); 884 __asm __volatile("sldt %0" : "=r" (ldt)); 885 db_printf("ldtr\t0x%04x\n", ldt); 886 __asm __volatile("str %0" : "=r" (tr)); 887 db_printf("tr\t0x%04x\n", tr); 888 db_printf("cr0\t0x%016lx\n", rcr0()); 889 db_printf("cr2\t0x%016lx\n", rcr2()); 890 db_printf("cr3\t0x%016lx\n", rcr3()); 891 db_printf("cr4\t0x%016lx\n", rcr4()); 892 if (rcr4() & CR4_XSAVE) 893 db_printf("xcr0\t0x%016lx\n", rxcr(0)); 894 db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER)); 895 if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX)) 896 db_printf("FEATURES_CTL\t%016lx\n", 897 rdmsr(MSR_IA32_FEATURE_CONTROL)); 898 db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR)); 899 db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT)); 900 db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE)); 901} 902 903DB_SHOW_COMMAND(dbregs, db_show_dbregs) 904{ 905 906 db_printf("dr0\t0x%016lx\n", rdr0()); 907 db_printf("dr1\t0x%016lx\n", rdr1()); 908 db_printf("dr2\t0x%016lx\n", rdr2()); 909 db_printf("dr3\t0x%016lx\n", rdr3()); 910 db_printf("dr6\t0x%016lx\n", rdr6()); 911 db_printf("dr7\t0x%016lx\n", rdr7()); 912} 913#endif 914 915void 916sdtossd(sd, ssd) 917 struct user_segment_descriptor *sd; 918 struct soft_segment_descriptor *ssd; 919{ 920 921 ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase; 922 ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit; 923 ssd->ssd_type = sd->sd_type; 924 ssd->ssd_dpl = sd->sd_dpl; 925 ssd->ssd_p = sd->sd_p; 926 ssd->ssd_long = sd->sd_long; 927 ssd->ssd_def32 = sd->sd_def32; 928 ssd->ssd_gran = sd->sd_gran; 929} 930 931void 932ssdtosd(ssd, sd) 933 struct soft_segment_descriptor *ssd; 934 struct user_segment_descriptor *sd; 935{ 936 937 sd->sd_lobase = (ssd->ssd_base) & 0xffffff; 938 sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff; 939 sd->sd_lolimit = (ssd->ssd_limit) & 0xffff; 940 sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf; 941 sd->sd_type = ssd->ssd_type; 942 sd->sd_dpl = 
ssd->ssd_dpl; 943 sd->sd_p = ssd->ssd_p; 944 sd->sd_long = ssd->ssd_long; 945 sd->sd_def32 = ssd->ssd_def32; 946 sd->sd_gran = ssd->ssd_gran; 947} 948 949void 950ssdtosyssd(ssd, sd) 951 struct soft_segment_descriptor *ssd; 952 struct system_segment_descriptor *sd; 953{ 954 955 sd->sd_lobase = (ssd->ssd_base) & 0xffffff; 956 sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful; 957 sd->sd_lolimit = (ssd->ssd_limit) & 0xffff; 958 sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf; 959 sd->sd_type = ssd->ssd_type; 960 sd->sd_dpl = ssd->ssd_dpl; 961 sd->sd_p = ssd->ssd_p; 962 sd->sd_gran = ssd->ssd_gran; 963} 964 965#if !defined(DEV_ATPIC) && defined(DEV_ISA) 966#include <isa/isavar.h> 967#include <isa/isareg.h> 968/* 969 * Return a bitmap of the current interrupt requests. This is 8259-specific 970 * and is only suitable for use at probe time. 971 * This is only here to pacify sio. It is NOT FATAL if this doesn't work. 972 * It shouldn't be here. There should probably be an APIC centric 973 * implementation in the apic driver code, if at all. 974 */ 975intrmask_t 976isa_irq_pending(void) 977{ 978 u_char irr1; 979 u_char irr2; 980 981 irr1 = inb(IO_ICU1); 982 irr2 = inb(IO_ICU2); 983 return ((irr2 << 8) | irr1); 984} 985#endif 986 987u_int basemem; 988 989static int 990add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap, 991 int *physmap_idxp) 992{ 993 int i, insert_idx, physmap_idx; 994 995 physmap_idx = *physmap_idxp; 996 997 if (length == 0) 998 return (1); 999 1000 /* 1001 * Find insertion point while checking for overlap. Start off by 1002 * assuming the new entry will be added to the end. 1003 * 1004 * NB: physmap_idx points to the next free slot. 1005 */ 1006 insert_idx = physmap_idx; 1007 for (i = 0; i <= physmap_idx; i += 2) { 1008 if (base < physmap[i + 1]) { 1009 if (base + length <= physmap[i]) { 1010 insert_idx = i; 1011 break; 1012 } 1013 if (boothowto & RB_VERBOSE) 1014 printf( 1015 "Overlapping memory regions, ignoring second region\n"); 1016 return (1); 1017 } 1018 } 1019 1020 /* See if we can prepend to the next entry. */ 1021 if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) { 1022 physmap[insert_idx] = base; 1023 return (1); 1024 } 1025 1026 /* See if we can append to the previous entry. */ 1027 if (insert_idx > 0 && base == physmap[insert_idx - 1]) { 1028 physmap[insert_idx - 1] += length; 1029 return (1); 1030 } 1031 1032 physmap_idx += 2; 1033 *physmap_idxp = physmap_idx; 1034 if (physmap_idx == PHYSMAP_SIZE) { 1035 printf( 1036 "Too many segments in the physical address map, giving up\n"); 1037 return (0); 1038 } 1039 1040 /* 1041 * Move the last 'N' entries down to make room for the new 1042 * entry if needed. 1043 */ 1044 for (i = (physmap_idx - 2); i > insert_idx; i -= 2) { 1045 physmap[i] = physmap[i - 2]; 1046 physmap[i + 1] = physmap[i - 1]; 1047 } 1048 1049 /* Insert the new entry. 
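 * For example, with physmap[] currently holding the two ranges
 * { 0x1000, 0x9f000, 0x100000, 0xc0000000 } (physmap_idx == 4), a new
 * region base 0xc0000000/length 0x40000000 is absorbed by the "append
 * to the previous entry" case above and just grows physmap[3], while a
 * region base 0xa0000/length 0x20000 falls through to this point,
 * shifts the upper pair right and is written out as a new pair at
 * index 2.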
*/ 1050 physmap[insert_idx] = base; 1051 physmap[insert_idx + 1] = base + length; 1052 return (1); 1053} 1054 1055void 1056bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize, 1057 vm_paddr_t *physmap, int *physmap_idx) 1058{ 1059 struct bios_smap *smap, *smapend; 1060 1061 smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize); 1062 1063 for (smap = smapbase; smap < smapend; smap++) { 1064 if (boothowto & RB_VERBOSE) 1065 printf("SMAP type=%02x base=%016lx len=%016lx\n", 1066 smap->type, smap->base, smap->length); 1067 1068 if (smap->type != SMAP_TYPE_MEMORY) 1069 continue; 1070 1071 if (!add_physmap_entry(smap->base, smap->length, physmap, 1072 physmap_idx)) 1073 break; 1074 } 1075} 1076 1077static void 1078add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap, 1079 int *physmap_idx) 1080{ 1081 struct efi_md *map, *p; 1082 const char *type; 1083 size_t efisz; 1084 int ndesc, i; 1085 1086 static const char *types[] = { 1087 "Reserved", 1088 "LoaderCode", 1089 "LoaderData", 1090 "BootServicesCode", 1091 "BootServicesData", 1092 "RuntimeServicesCode", 1093 "RuntimeServicesData", 1094 "ConventionalMemory", 1095 "UnusableMemory", 1096 "ACPIReclaimMemory", 1097 "ACPIMemoryNVS", 1098 "MemoryMappedIO", 1099 "MemoryMappedIOPortSpace", 1100 "PalCode", 1101 "PersistentMemory" 1102 }; 1103 1104 /* 1105 * Memory map data provided by UEFI via the GetMemoryMap 1106 * Boot Services API. 1107 */ 1108 efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf; 1109 map = (struct efi_md *)((uint8_t *)efihdr + efisz); 1110 1111 if (efihdr->descriptor_size == 0) 1112 return; 1113 ndesc = efihdr->memory_size / efihdr->descriptor_size; 1114 1115 if (boothowto & RB_VERBOSE) 1116 printf("%23s %12s %12s %8s %4s\n", 1117 "Type", "Physical", "Virtual", "#Pages", "Attr"); 1118 1119 for (i = 0, p = map; i < ndesc; i++, 1120 p = efi_next_descriptor(p, efihdr->descriptor_size)) { 1121 if (boothowto & RB_VERBOSE) { 1122 if (p->md_type < nitems(types)) 1123 type = types[p->md_type]; 1124 else 1125 type = "<INVALID>"; 1126 printf("%23s %012lx %12p %08lx ", type, p->md_phys, 1127 p->md_virt, p->md_pages); 1128 if (p->md_attr & EFI_MD_ATTR_UC) 1129 printf("UC "); 1130 if (p->md_attr & EFI_MD_ATTR_WC) 1131 printf("WC "); 1132 if (p->md_attr & EFI_MD_ATTR_WT) 1133 printf("WT "); 1134 if (p->md_attr & EFI_MD_ATTR_WB) 1135 printf("WB "); 1136 if (p->md_attr & EFI_MD_ATTR_UCE) 1137 printf("UCE "); 1138 if (p->md_attr & EFI_MD_ATTR_WP) 1139 printf("WP "); 1140 if (p->md_attr & EFI_MD_ATTR_RP) 1141 printf("RP "); 1142 if (p->md_attr & EFI_MD_ATTR_XP) 1143 printf("XP "); 1144 if (p->md_attr & EFI_MD_ATTR_NV) 1145 printf("NV "); 1146 if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE) 1147 printf("MORE_RELIABLE "); 1148 if (p->md_attr & EFI_MD_ATTR_RO) 1149 printf("RO "); 1150 if (p->md_attr & EFI_MD_ATTR_RT) 1151 printf("RUNTIME"); 1152 printf("\n"); 1153 } 1154 1155 switch (p->md_type) { 1156 case EFI_MD_TYPE_CODE: 1157 case EFI_MD_TYPE_DATA: 1158 case EFI_MD_TYPE_BS_CODE: 1159 case EFI_MD_TYPE_BS_DATA: 1160 case EFI_MD_TYPE_FREE: 1161 /* 1162 * We're allowed to use any entry with these types. 
1163 */ 1164 break; 1165 default: 1166 continue; 1167 } 1168 1169 if (!add_physmap_entry(p->md_phys, (p->md_pages * PAGE_SIZE), 1170 physmap, physmap_idx)) 1171 break; 1172 } 1173} 1174 1175static char bootmethod[16] = ""; 1176SYSCTL_STRING(_machdep, OID_AUTO, bootmethod, CTLFLAG_RD, bootmethod, 0, 1177 "System firmware boot method"); 1178 1179static void 1180native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx) 1181{ 1182 struct bios_smap *smap; 1183 struct efi_map_header *efihdr; 1184 u_int32_t size; 1185 1186 /* 1187 * Memory map from INT 15:E820. 1188 * 1189 * subr_module.c says: 1190 * "Consumer may safely assume that size value precedes data." 1191 * ie: an int32_t immediately precedes smap. 1192 */ 1193 1194 efihdr = (struct efi_map_header *)preload_search_info(kmdp, 1195 MODINFO_METADATA | MODINFOMD_EFI_MAP); 1196 smap = (struct bios_smap *)preload_search_info(kmdp, 1197 MODINFO_METADATA | MODINFOMD_SMAP); 1198 if (efihdr == NULL && smap == NULL) 1199 panic("No BIOS smap or EFI map info from loader!"); 1200 1201 if (efihdr != NULL) { 1202 add_efi_map_entries(efihdr, physmap, physmap_idx); 1203 strlcpy(bootmethod, "UEFI", sizeof(bootmethod)); 1204 } else { 1205 size = *((u_int32_t *)smap - 1); 1206 bios_add_smap_entries(smap, size, physmap, physmap_idx); 1207 strlcpy(bootmethod, "BIOS", sizeof(bootmethod)); 1208 } 1209} 1210 1211#define PAGES_PER_GB (1024 * 1024 * 1024 / PAGE_SIZE) 1212 1213/* 1214 * Populate the (physmap) array with base/bound pairs describing the 1215 * available physical memory in the system, then test this memory and 1216 * build the phys_avail array describing the actually-available memory. 1217 * 1218 * Total memory size may be set by the kernel environment variable 1219 * hw.physmem or the compile-time define MAXMEM. 1220 * 1221 * XXX first should be vm_paddr_t. 1222 */ 1223static void 1224getmemsize(caddr_t kmdp, u_int64_t first) 1225{ 1226 int i, physmap_idx, pa_indx, da_indx; 1227 vm_paddr_t pa, physmap[PHYSMAP_SIZE]; 1228 u_long physmem_start, physmem_tunable, memtest; 1229 pt_entry_t *pte; 1230 quad_t dcons_addr, dcons_size; 1231 int page_counter; 1232 1233 bzero(physmap, sizeof(physmap)); 1234 physmap_idx = 0; 1235 1236 init_ops.parse_memmap(kmdp, physmap, &physmap_idx); 1237 physmap_idx -= 2; 1238 1239 /* 1240 * Find the 'base memory' segment for SMP 1241 */ 1242 basemem = 0; 1243 for (i = 0; i <= physmap_idx; i += 2) { 1244 if (physmap[i] <= 0xA0000) { 1245 basemem = physmap[i + 1] / 1024; 1246 break; 1247 } 1248 } 1249 if (basemem == 0 || basemem > 640) { 1250 if (bootverbose) 1251 printf( 1252 "Memory map doesn't contain a basemem segment, faking it"); 1253 basemem = 640; 1254 } 1255 1256 /* 1257 * Make hole for "AP -> long mode" bootstrap code. The 1258 * mp_bootaddress vector is only available when the kernel 1259 * is configured to support APs and APs for the system start 1260 * in 32bit mode (e.g. SMP bare metal). 1261 */ 1262 if (init_ops.mp_bootaddress) { 1263 if (physmap[1] >= 0x100000000) 1264 panic( 1265 "Basemem segment is not suitable for AP bootstrap code!"); 1266 physmap[1] = init_ops.mp_bootaddress(physmap[1] / 1024); 1267 } 1268 1269 /* 1270 * Maxmem isn't the "maximum memory", it's one larger than the 1271 * highest page of the physical address space. It should be 1272 * called something like "Maxphyspage". We may adjust this 1273 * based on ``hw.physmem'' and the results of the memory test. 
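 * For example, a machine whose usable RAM tops out just below 8GB has
 * physmap[physmap_idx + 1] == 0x200000000, so Maxmem becomes
 * atop(0x200000000) == 0x200000 4KB pages; the MAXMEM option (given in
 * kilobytes, hence the /4 below) and hw.physmem can only lower that
 * value, since the code afterwards clamps them back to the physmap
 * limit.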
1274 */ 1275 Maxmem = atop(physmap[physmap_idx + 1]); 1276 1277#ifdef MAXMEM 1278 Maxmem = MAXMEM / 4; 1279#endif 1280 1281 if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable)) 1282 Maxmem = atop(physmem_tunable); 1283 1284 /* 1285 * The boot memory test is disabled by default, as it takes a 1286 * significant amount of time on large-memory systems, and is 1287 * unfriendly to virtual machines as it unnecessarily touches all 1288 * pages. 1289 * 1290 * A general name is used as the code may be extended to support 1291 * additional tests beyond the current "page present" test. 1292 */ 1293 memtest = 0; 1294 TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest); 1295 1296 /* 1297 * Don't allow MAXMEM or hw.physmem to extend the amount of memory 1298 * in the system. 1299 */ 1300 if (Maxmem > atop(physmap[physmap_idx + 1])) 1301 Maxmem = atop(physmap[physmap_idx + 1]); 1302 1303 if (atop(physmap[physmap_idx + 1]) != Maxmem && 1304 (boothowto & RB_VERBOSE)) 1305 printf("Physical memory use set to %ldK\n", Maxmem * 4); 1306 1307 /* call pmap initialization to make new kernel address space */ 1308 pmap_bootstrap(&first); 1309 1310 /* 1311 * Size up each available chunk of physical memory. 1312 * 1313 * XXX Some BIOSes corrupt low 64KB between suspend and resume. 1314 * By default, mask off the first 16 pages unless we appear to be 1315 * running in a VM. 1316 */ 1317 physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT; 1318 TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start); 1319 if (physmap[0] < physmem_start) { 1320 if (physmem_start < PAGE_SIZE) 1321 physmap[0] = PAGE_SIZE; 1322 else if (physmem_start >= physmap[1]) 1323 physmap[0] = round_page(physmap[1] - PAGE_SIZE); 1324 else 1325 physmap[0] = round_page(physmem_start); 1326 } 1327 pa_indx = 0; 1328 da_indx = 1; 1329 phys_avail[pa_indx++] = physmap[0]; 1330 phys_avail[pa_indx] = physmap[0]; 1331 dump_avail[da_indx] = physmap[0]; 1332 pte = CMAP1; 1333 1334 /* 1335 * Get dcons buffer address 1336 */ 1337 if (getenv_quad("dcons.addr", &dcons_addr) == 0 || 1338 getenv_quad("dcons.size", &dcons_size) == 0) 1339 dcons_addr = 0; 1340 1341 /* 1342 * physmap is in bytes, so when converting to page boundaries, 1343 * round up the start address and round down the end address. 1344 */ 1345 page_counter = 0; 1346 if (memtest != 0) 1347 printf("Testing system memory"); 1348 for (i = 0; i <= physmap_idx; i += 2) { 1349 vm_paddr_t end; 1350 1351 end = ptoa((vm_paddr_t)Maxmem); 1352 if (physmap[i + 1] < end) 1353 end = trunc_page(physmap[i + 1]); 1354 for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) { 1355 int tmp, page_bad, full; 1356 int *ptr = (int *)CADDR1; 1357 1358 full = FALSE; 1359 /* 1360 * block out kernel memory as not available. 1361 */ 1362 if (pa >= (vm_paddr_t)kernphys && pa < first) 1363 goto do_dump_avail; 1364 1365 /* 1366 * block out dcons buffer 1367 */ 1368 if (dcons_addr > 0 1369 && pa >= trunc_page(dcons_addr) 1370 && pa < dcons_addr + dcons_size) 1371 goto do_dump_avail; 1372 1373 page_bad = FALSE; 1374 if (memtest == 0) 1375 goto skip_memtest; 1376 1377 /* 1378 * Print a "." every GB to show we're making 1379 * progress. 
1380 */ 1381 page_counter++; 1382 if ((page_counter % PAGES_PER_GB) == 0) 1383 printf("."); 1384 1385 /* 1386 * map page into kernel: valid, read/write,non-cacheable 1387 */ 1388 *pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD; 1389 invltlb(); 1390 1391 tmp = *(int *)ptr; 1392 /* 1393 * Test for alternating 1's and 0's 1394 */ 1395 *(volatile int *)ptr = 0xaaaaaaaa; 1396 if (*(volatile int *)ptr != 0xaaaaaaaa) 1397 page_bad = TRUE; 1398 /* 1399 * Test for alternating 0's and 1's 1400 */ 1401 *(volatile int *)ptr = 0x55555555; 1402 if (*(volatile int *)ptr != 0x55555555) 1403 page_bad = TRUE; 1404 /* 1405 * Test for all 1's 1406 */ 1407 *(volatile int *)ptr = 0xffffffff; 1408 if (*(volatile int *)ptr != 0xffffffff) 1409 page_bad = TRUE; 1410 /* 1411 * Test for all 0's 1412 */ 1413 *(volatile int *)ptr = 0x0; 1414 if (*(volatile int *)ptr != 0x0) 1415 page_bad = TRUE; 1416 /* 1417 * Restore original value. 1418 */ 1419 *(int *)ptr = tmp; 1420 1421skip_memtest: 1422 /* 1423 * Adjust array of valid/good pages. 1424 */ 1425 if (page_bad == TRUE) 1426 continue; 1427 /* 1428 * If this good page is a continuation of the 1429 * previous set of good pages, then just increase 1430 * the end pointer. Otherwise start a new chunk. 1431 * Note that "end" points one higher than end, 1432 * making the range >= start and < end. 1433 * If we're also doing a speculative memory 1434 * test and we at or past the end, bump up Maxmem 1435 * so that we keep going. The first bad page 1436 * will terminate the loop. 1437 */ 1438 if (phys_avail[pa_indx] == pa) { 1439 phys_avail[pa_indx] += PAGE_SIZE; 1440 } else { 1441 pa_indx++; 1442 if (pa_indx == PHYS_AVAIL_ARRAY_END) { 1443 printf( 1444 "Too many holes in the physical address space, giving up\n"); 1445 pa_indx--; 1446 full = TRUE; 1447 goto do_dump_avail; 1448 } 1449 phys_avail[pa_indx++] = pa; /* start */ 1450 phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */ 1451 } 1452 physmem++; 1453do_dump_avail: 1454 if (dump_avail[da_indx] == pa) { 1455 dump_avail[da_indx] += PAGE_SIZE; 1456 } else { 1457 da_indx++; 1458 if (da_indx == DUMP_AVAIL_ARRAY_END) { 1459 da_indx--; 1460 goto do_next; 1461 } 1462 dump_avail[da_indx++] = pa; /* start */ 1463 dump_avail[da_indx] = pa + PAGE_SIZE; /* end */ 1464 } 1465do_next: 1466 if (full) 1467 break; 1468 } 1469 } 1470 *pte = 0; 1471 invltlb(); 1472 if (memtest != 0) 1473 printf("\n"); 1474 1475 /* 1476 * XXX 1477 * The last chunk must contain at least one page plus the message 1478 * buffer to avoid complicating other code (message buffer address 1479 * calculation, etc.). 1480 */ 1481 while (phys_avail[pa_indx - 1] + PAGE_SIZE + 1482 round_page(msgbufsize) >= phys_avail[pa_indx]) { 1483 physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]); 1484 phys_avail[pa_indx--] = 0; 1485 phys_avail[pa_indx--] = 0; 1486 } 1487 1488 Maxmem = atop(phys_avail[pa_indx]); 1489 1490 /* Trim off space for the message buffer. */ 1491 phys_avail[pa_indx] -= round_page(msgbufsize); 1492 1493 /* Map the message buffer. 
*/ 1494 msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]); 1495} 1496 1497static caddr_t 1498native_parse_preload_data(u_int64_t modulep) 1499{ 1500 caddr_t kmdp; 1501 char *envp; 1502#ifdef DDB 1503 vm_offset_t ksym_start; 1504 vm_offset_t ksym_end; 1505#endif 1506 1507 preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE); 1508 preload_bootstrap_relocate(KERNBASE); 1509 kmdp = preload_search_by_type("elf kernel"); 1510 if (kmdp == NULL) 1511 kmdp = preload_search_by_type("elf64 kernel"); 1512 boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int); 1513 envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *); 1514 if (envp != NULL) 1515 envp += KERNBASE; 1516 init_static_kenv(envp, 0); 1517#ifdef DDB 1518 ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t); 1519 ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t); 1520 db_fetch_ksymtab(ksym_start, ksym_end); 1521#endif 1522 efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t); 1523 1524 return (kmdp); 1525} 1526 1527static void 1528amd64_kdb_init(void) 1529{ 1530 kdb_init(); 1531#ifdef KDB 1532 if (boothowto & RB_KDB) 1533 kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger"); 1534#endif 1535} 1536 1537/* Set up the fast syscall stuff */ 1538void 1539amd64_conf_fast_syscall(void) 1540{ 1541 uint64_t msr; 1542 1543 msr = rdmsr(MSR_EFER) | EFER_SCE; 1544 wrmsr(MSR_EFER, msr); 1545 wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) : 1546 (u_int64_t)IDTVEC(fast_syscall)); 1547 wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32)); 1548 msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) | 1549 ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48); 1550 wrmsr(MSR_STAR, msr); 1551 wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D); 1552} 1553 1554u_int64_t 1555hammer_time(u_int64_t modulep, u_int64_t physfree) 1556{ 1557 caddr_t kmdp; 1558 int gsel_tss, x; 1559 struct pcpu *pc; 1560 struct nmi_pcpu *np; 1561 struct xstate_hdr *xhdr; 1562 u_int64_t rsp0; 1563 char *env; 1564 size_t kstack0_sz; 1565 int late_console; 1566 1567 kmdp = init_ops.parse_preload_data(modulep); 1568 1569 identify_cpu1(); 1570 identify_hypervisor(); 1571 /* 1572 * hw.cpu_stdext_disable is ignored by the call, it will be 1573 * re-evaluted by the below call to finishidentcpu(). 1574 */ 1575 identify_cpu2(); 1576 1577 link_elf_ireloc(kmdp); 1578 1579 /* 1580 * This may be done better later if it gets more high level 1581 * components in it. If so just link td->td_proc here. 
1582 */ 1583 proc_linkup0(&proc0, &thread0); 1584 1585 /* Init basic tunables, hz etc */ 1586 init_param1(); 1587 1588 thread0.td_kstack = physfree + KERNBASE; 1589 thread0.td_kstack_pages = kstack_pages; 1590 kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE; 1591 bzero((void *)thread0.td_kstack, kstack0_sz); 1592 physfree += kstack0_sz; 1593 1594 /* 1595 * make gdt memory segments 1596 */ 1597 for (x = 0; x < NGDT; x++) { 1598 if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) && 1599 x != GUSERLDT_SEL && x != (GUSERLDT_SEL) + 1) 1600 ssdtosd(&gdt_segs[x], &gdt[x]); 1601 } 1602 gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&common_tss[0]; 1603 ssdtosyssd(&gdt_segs[GPROC0_SEL], 1604 (struct system_segment_descriptor *)&gdt[GPROC0_SEL]); 1605 1606 r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1; 1607 r_gdt.rd_base = (long) gdt; 1608 lgdt(&r_gdt); 1609 pc = &__pcpu[0]; 1610 1611 wrmsr(MSR_FSBASE, 0); /* User value */ 1612 wrmsr(MSR_GSBASE, (u_int64_t)pc); 1613 wrmsr(MSR_KGSBASE, 0); /* User value while in the kernel */ 1614 1615 pcpu_init(pc, 0, sizeof(struct pcpu)); 1616 dpcpu_init((void *)(physfree + KERNBASE), 0); 1617 physfree += DPCPU_SIZE; 1618 PCPU_SET(prvspace, pc); 1619 PCPU_SET(curthread, &thread0); 1620 /* Non-late cninit() and printf() can be moved up to here. */ 1621 PCPU_SET(tssp, &common_tss[0]); 1622 PCPU_SET(commontssp, &common_tss[0]); 1623 PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]); 1624 PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]); 1625 PCPU_SET(fs32p, &gdt[GUFS32_SEL]); 1626 PCPU_SET(gs32p, &gdt[GUGS32_SEL]); 1627 1628 /* 1629 * Initialize mutexes. 1630 * 1631 * icu_lock: in order to allow an interrupt to occur in a critical 1632 * section, to set pcpu->ipending (etc...) properly, we 1633 * must be able to get the icu lock, so it can't be 1634 * under witness. 1635 */ 1636 mutex_init(); 1637 mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS); 1638 mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF); 1639 1640 /* exceptions */ 1641 pti = pti_get_default(); 1642 TUNABLE_INT_FETCH("vm.pmap.pti", &pti); 1643 1644 for (x = 0; x < NIDT; x++) 1645 setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT, 1646 SEL_KPL, 0); 1647 setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT, 1648 SEL_KPL, 0); 1649 setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4); 1650 setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 2); 1651 setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT, 1652 SEL_UPL, 0); 1653 setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT, 1654 SEL_UPL, 0); 1655 setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT, 1656 SEL_KPL, 0); 1657 setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT, 1658 SEL_KPL, 0); 1659 setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT, 1660 SEL_KPL, 0); 1661 setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1); 1662 setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm), 1663 SDT_SYSIGT, SEL_KPL, 0); 1664 setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT, 1665 SEL_KPL, 0); 1666 setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing), 1667 SDT_SYSIGT, SEL_KPL, 0); 1668 setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT, 1669 SEL_KPL, 0); 1670 setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT, 1671 SEL_KPL, 0); 1672 setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT, 1673 SEL_KPL, 0); 1674 setidt(IDT_MF, pti ? 
&IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT, 1675 SEL_KPL, 0); 1676 setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT, 1677 SEL_KPL, 0); 1678 setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3); 1679 setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT, 1680 SEL_KPL, 0); 1681#ifdef KDTRACE_HOOKS 1682 setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) : 1683 &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0); 1684#endif 1685#ifdef XENHVM 1686 setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) : 1687 &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0); 1688#endif 1689 r_idt.rd_limit = sizeof(idt0) - 1; 1690 r_idt.rd_base = (long) idt; 1691 lidt(&r_idt); 1692 1693 /* 1694 * Initialize the clock before the console so that console 1695 * initialization can use DELAY(). 1696 */ 1697 clock_init(); 1698 1699 /* 1700 * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4) 1701 * transition). 1702 * Once bootblocks have updated, we can test directly for 1703 * efi_systbl != NULL here... 1704 */ 1705 if (preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP) 1706 != NULL) 1707 vty_set_preferred(VTY_VT); 1708 1709 finishidentcpu(); /* Final stage of CPU initialization */ 1710 initializecpu(); /* Initialize CPU registers */ 1711 initializecpucache(); 1712 1713 /* doublefault stack space, runs on ist1 */ 1714 common_tss[0].tss_ist1 = (long)&dblfault_stack[sizeof(dblfault_stack)]; 1715 1716 /* 1717 * NMI stack, runs on ist2. The pcpu pointer is stored just 1718 * above the start of the ist2 stack. 1719 */ 1720 np = ((struct nmi_pcpu *) &nmi0_stack[sizeof(nmi0_stack)]) - 1; 1721 np->np_pcpu = (register_t) pc; 1722 common_tss[0].tss_ist2 = (long) np; 1723 1724 /* 1725 * MC# stack, runs on ist3. The pcpu pointer is stored just 1726 * above the start of the ist3 stack. 1727 */ 1728 np = ((struct nmi_pcpu *) &mce0_stack[sizeof(mce0_stack)]) - 1; 1729 np->np_pcpu = (register_t) pc; 1730 common_tss[0].tss_ist3 = (long) np; 1731 1732 /* 1733 * DB# stack, runs on ist4. 1734 */ 1735 np = ((struct nmi_pcpu *) &dbg0_stack[sizeof(dbg0_stack)]) - 1; 1736 np->np_pcpu = (register_t) pc; 1737 common_tss[0].tss_ist4 = (long) np; 1738 1739 /* Set the IO permission bitmap (empty due to tss seg limit) */ 1740 common_tss[0].tss_iobase = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE; 1741 1742 gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); 1743 ltr(gsel_tss); 1744 1745 amd64_conf_fast_syscall(); 1746 1747 /* 1748 * Temporary forge some valid pointer to PCB, for exception 1749 * handlers. It is reinitialized properly below after FPU is 1750 * set up. Also set up td_critnest to short-cut the page 1751 * fault handler. 1752 */ 1753 cpu_max_ext_state_size = sizeof(struct savefpu); 1754 thread0.td_pcb = get_pcb_td(&thread0); 1755 thread0.td_critnest = 1; 1756 1757 /* 1758 * The console and kdb should be initialized even earlier than here, 1759 * but some console drivers don't work until after getmemsize(). 1760 * Default to late console initialization to support these drivers. 1761 * This loses mainly printf()s in getmemsize() and early debugging. 
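 * Systems whose console driver does work early can recover those
 * messages by setting the loader tunable to zero, e.g. in
 * /boot/loader.conf:
 *
 *	debug.late_console="0"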
1762 */ 1763 late_console = 1; 1764 TUNABLE_INT_FETCH("debug.late_console", &late_console); 1765 if (!late_console) { 1766 cninit(); 1767 amd64_kdb_init(); 1768 } 1769 1770 getmemsize(kmdp, physfree); 1771 init_param2(physmem); 1772 1773 /* now running on new page tables, configured,and u/iom is accessible */ 1774 1775 if (late_console) 1776 cninit(); 1777 1778#ifdef DEV_ISA 1779#ifdef DEV_ATPIC 1780 elcr_probe(); 1781 atpic_startup(); 1782#else 1783 /* Reset and mask the atpics and leave them shut down. */ 1784 atpic_reset(); 1785 1786 /* 1787 * Point the ICU spurious interrupt vectors at the APIC spurious 1788 * interrupt handler. 1789 */ 1790 setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0); 1791 setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0); 1792#endif 1793#else 1794#error "have you forgotten the isa device?"; 1795#endif 1796 1797 if (late_console) 1798 amd64_kdb_init(); 1799 1800 msgbufinit(msgbufp, msgbufsize); 1801 fpuinit(); 1802 1803 /* 1804 * Set up thread0 pcb after fpuinit calculated pcb + fpu save 1805 * area size. Zero out the extended state header in fpu save 1806 * area. 1807 */ 1808 thread0.td_pcb = get_pcb_td(&thread0); 1809 thread0.td_pcb->pcb_save = get_pcb_user_save_td(&thread0); 1810 bzero(get_pcb_user_save_td(&thread0), cpu_max_ext_state_size); 1811 if (use_xsave) { 1812 xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) + 1813 1); 1814 xhdr->xstate_bv = xsave_mask; 1815 } 1816 /* make an initial tss so cpu can get interrupt stack on syscall! */ 1817 rsp0 = (vm_offset_t)thread0.td_pcb; 1818 /* Ensure the stack is aligned to 16 bytes */ 1819 rsp0 &= ~0xFul; 1820 common_tss[0].tss_rsp0 = rsp0; 1821 PCPU_SET(rsp0, rsp0); 1822 PCPU_SET(pti_rsp0, ((vm_offset_t)PCPU_PTR(pti_stack) + 1823 PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful); 1824 PCPU_SET(curpcb, thread0.td_pcb); 1825 1826 /* transfer to user mode */ 1827 1828 _ucodesel = GSEL(GUCODE_SEL, SEL_UPL); 1829 _udatasel = GSEL(GUDATA_SEL, SEL_UPL); 1830 _ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL); 1831 _ufssel = GSEL(GUFS32_SEL, SEL_UPL); 1832 _ugssel = GSEL(GUGS32_SEL, SEL_UPL); 1833 1834 load_ds(_udatasel); 1835 load_es(_udatasel); 1836 load_fs(_ufssel); 1837 1838 /* setup proc 0's pcb */ 1839 thread0.td_pcb->pcb_flags = 0; 1840 thread0.td_frame = &proc0_tf; 1841 1842 env = kern_getenv("kernelname"); 1843 if (env != NULL) 1844 strlcpy(kernelname, env, sizeof(kernelname)); 1845 1846 cpu_probe_amdc1e(); 1847 1848#ifdef FDT 1849 x86_init_fdt(); 1850#endif 1851 thread0.td_critnest = 0; 1852 1853 TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable); 1854 TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable); 1855 1856 /* Location of kernel stack for locore */ 1857 return ((u_int64_t)thread0.td_pcb); 1858} 1859 1860void 1861cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size) 1862{ 1863 1864 pcpu->pc_acpi_id = 0xffffffff; 1865} 1866 1867static int 1868smap_sysctl_handler(SYSCTL_HANDLER_ARGS) 1869{ 1870 struct bios_smap *smapbase; 1871 struct bios_smap_xattr smap; 1872 caddr_t kmdp; 1873 uint32_t *smapattr; 1874 int count, error, i; 1875 1876 /* Retrieve the system memory map from the loader. 
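 * The resulting machdep.smap sysctl is exported as an array of
 * struct bios_smap_xattr records; a userland consumer would typically
 * size and fetch it in two steps (illustrative sketch, error handling
 * omitted):
 *
 *	size_t len = 0;
 *	sysctlbyname("machdep.smap", NULL, &len, NULL, 0);
 *	struct bios_smap_xattr *smap = malloc(len);
 *	sysctlbyname("machdep.smap", smap, &len, NULL, 0);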
*/ 1877 kmdp = preload_search_by_type("elf kernel"); 1878 if (kmdp == NULL) 1879 kmdp = preload_search_by_type("elf64 kernel"); 1880 smapbase = (struct bios_smap *)preload_search_info(kmdp, 1881 MODINFO_METADATA | MODINFOMD_SMAP); 1882 if (smapbase == NULL) 1883 return (0); 1884 smapattr = (uint32_t *)preload_search_info(kmdp, 1885 MODINFO_METADATA | MODINFOMD_SMAP_XATTR); 1886 count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase); 1887 error = 0; 1888 for (i = 0; i < count; i++) { 1889 smap.base = smapbase[i].base; 1890 smap.length = smapbase[i].length; 1891 smap.type = smapbase[i].type; 1892 if (smapattr != NULL) 1893 smap.xattr = smapattr[i]; 1894 else 1895 smap.xattr = 0; 1896 error = SYSCTL_OUT(req, &smap, sizeof(smap)); 1897 } 1898 return (error); 1899} 1900SYSCTL_PROC(_machdep, OID_AUTO, smap, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0, 1901 smap_sysctl_handler, "S,bios_smap_xattr", "Raw BIOS SMAP data"); 1902 1903static int 1904efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS) 1905{ 1906 struct efi_map_header *efihdr; 1907 caddr_t kmdp; 1908 uint32_t efisize; 1909 1910 kmdp = preload_search_by_type("elf kernel"); 1911 if (kmdp == NULL) 1912 kmdp = preload_search_by_type("elf64 kernel"); 1913 efihdr = (struct efi_map_header *)preload_search_info(kmdp, 1914 MODINFO_METADATA | MODINFOMD_EFI_MAP); 1915 if (efihdr == NULL) 1916 return (0); 1917 efisize = *((uint32_t *)efihdr - 1); 1918 return (SYSCTL_OUT(req, efihdr, efisize)); 1919} 1920SYSCTL_PROC(_machdep, OID_AUTO, efi_map, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0, 1921 efi_map_sysctl_handler, "S,efi_map_header", "Raw EFI Memory Map"); 1922 1923void 1924spinlock_enter(void) 1925{ 1926 struct thread *td; 1927 register_t flags; 1928 1929 td = curthread; 1930 if (td->td_md.md_spinlock_count == 0) { 1931 flags = intr_disable(); 1932 td->td_md.md_spinlock_count = 1; 1933 td->td_md.md_saved_flags = flags; 1934 } else 1935 td->td_md.md_spinlock_count++; 1936 critical_enter(); 1937} 1938 1939void 1940spinlock_exit(void) 1941{ 1942 struct thread *td; 1943 register_t flags; 1944 1945 td = curthread; 1946 critical_exit(); 1947 flags = td->td_md.md_saved_flags; 1948 td->td_md.md_spinlock_count--; 1949 if (td->td_md.md_spinlock_count == 0) 1950 intr_restore(flags); 1951} 1952 1953/* 1954 * Construct a PCB from a trapframe. This is called from kdb_trap() where 1955 * we want to start a backtrace from the function that caused us to enter 1956 * the debugger. We have the context in the trapframe, but base the trace 1957 * on the PCB. The PCB doesn't have to be perfect, as long as it contains 1958 * enough for a backtrace. 
1959 */ 1960void 1961makectx(struct trapframe *tf, struct pcb *pcb) 1962{ 1963 1964 pcb->pcb_r12 = tf->tf_r12; 1965 pcb->pcb_r13 = tf->tf_r13; 1966 pcb->pcb_r14 = tf->tf_r14; 1967 pcb->pcb_r15 = tf->tf_r15; 1968 pcb->pcb_rbp = tf->tf_rbp; 1969 pcb->pcb_rbx = tf->tf_rbx; 1970 pcb->pcb_rip = tf->tf_rip; 1971 pcb->pcb_rsp = tf->tf_rsp; 1972} 1973 1974int 1975ptrace_set_pc(struct thread *td, unsigned long addr) 1976{ 1977 1978 td->td_frame->tf_rip = addr; 1979 set_pcb_flags(td->td_pcb, PCB_FULL_IRET); 1980 return (0); 1981} 1982 1983int 1984ptrace_single_step(struct thread *td) 1985{ 1986 td->td_frame->tf_rflags |= PSL_T; 1987 return (0); 1988} 1989 1990int 1991ptrace_clear_single_step(struct thread *td) 1992{ 1993 td->td_frame->tf_rflags &= ~PSL_T; 1994 return (0); 1995} 1996 1997int 1998fill_regs(struct thread *td, struct reg *regs) 1999{ 2000 struct trapframe *tp; 2001 2002 tp = td->td_frame; 2003 return (fill_frame_regs(tp, regs)); 2004} 2005 2006int 2007fill_frame_regs(struct trapframe *tp, struct reg *regs) 2008{ 2009 regs->r_r15 = tp->tf_r15; 2010 regs->r_r14 = tp->tf_r14; 2011 regs->r_r13 = tp->tf_r13; 2012 regs->r_r12 = tp->tf_r12; 2013 regs->r_r11 = tp->tf_r11; 2014 regs->r_r10 = tp->tf_r10; 2015 regs->r_r9 = tp->tf_r9; 2016 regs->r_r8 = tp->tf_r8; 2017 regs->r_rdi = tp->tf_rdi; 2018 regs->r_rsi = tp->tf_rsi; 2019 regs->r_rbp = tp->tf_rbp; 2020 regs->r_rbx = tp->tf_rbx; 2021 regs->r_rdx = tp->tf_rdx; 2022 regs->r_rcx = tp->tf_rcx; 2023 regs->r_rax = tp->tf_rax; 2024 regs->r_rip = tp->tf_rip; 2025 regs->r_cs = tp->tf_cs; 2026 regs->r_rflags = tp->tf_rflags; 2027 regs->r_rsp = tp->tf_rsp; 2028 regs->r_ss = tp->tf_ss; 2029 if (tp->tf_flags & TF_HASSEGS) { 2030 regs->r_ds = tp->tf_ds; 2031 regs->r_es = tp->tf_es; 2032 regs->r_fs = tp->tf_fs; 2033 regs->r_gs = tp->tf_gs; 2034 } else { 2035 regs->r_ds = 0; 2036 regs->r_es = 0; 2037 regs->r_fs = 0; 2038 regs->r_gs = 0; 2039 } 2040 return (0); 2041} 2042 2043int 2044set_regs(struct thread *td, struct reg *regs) 2045{ 2046 struct trapframe *tp; 2047 register_t rflags; 2048 2049 tp = td->td_frame; 2050 rflags = regs->r_rflags & 0xffffffff; 2051 if (!EFL_SECURE(rflags, tp->tf_rflags) || !CS_SECURE(regs->r_cs)) 2052 return (EINVAL); 2053 tp->tf_r15 = regs->r_r15; 2054 tp->tf_r14 = regs->r_r14; 2055 tp->tf_r13 = regs->r_r13; 2056 tp->tf_r12 = regs->r_r12; 2057 tp->tf_r11 = regs->r_r11; 2058 tp->tf_r10 = regs->r_r10; 2059 tp->tf_r9 = regs->r_r9; 2060 tp->tf_r8 = regs->r_r8; 2061 tp->tf_rdi = regs->r_rdi; 2062 tp->tf_rsi = regs->r_rsi; 2063 tp->tf_rbp = regs->r_rbp; 2064 tp->tf_rbx = regs->r_rbx; 2065 tp->tf_rdx = regs->r_rdx; 2066 tp->tf_rcx = regs->r_rcx; 2067 tp->tf_rax = regs->r_rax; 2068 tp->tf_rip = regs->r_rip; 2069 tp->tf_cs = regs->r_cs; 2070 tp->tf_rflags = rflags; 2071 tp->tf_rsp = regs->r_rsp; 2072 tp->tf_ss = regs->r_ss; 2073 if (0) { /* XXXKIB */ 2074 tp->tf_ds = regs->r_ds; 2075 tp->tf_es = regs->r_es; 2076 tp->tf_fs = regs->r_fs; 2077 tp->tf_gs = regs->r_gs; 2078 tp->tf_flags = TF_HASSEGS; 2079 } 2080 set_pcb_flags(td->td_pcb, PCB_FULL_IRET); 2081 return (0); 2082} 2083 2084/* XXX check all this stuff! 
*/ 2085/* externalize from sv_xmm */ 2086static void 2087fill_fpregs_xmm(struct savefpu *sv_xmm, struct fpreg *fpregs) 2088{ 2089 struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env; 2090 struct envxmm *penv_xmm = &sv_xmm->sv_env; 2091 int i; 2092 2093 /* pcb -> fpregs */ 2094 bzero(fpregs, sizeof(*fpregs)); 2095 2096 /* FPU control/status */ 2097 penv_fpreg->en_cw = penv_xmm->en_cw; 2098 penv_fpreg->en_sw = penv_xmm->en_sw; 2099 penv_fpreg->en_tw = penv_xmm->en_tw; 2100 penv_fpreg->en_opcode = penv_xmm->en_opcode; 2101 penv_fpreg->en_rip = penv_xmm->en_rip; 2102 penv_fpreg->en_rdp = penv_xmm->en_rdp; 2103 penv_fpreg->en_mxcsr = penv_xmm->en_mxcsr; 2104 penv_fpreg->en_mxcsr_mask = penv_xmm->en_mxcsr_mask; 2105 2106 /* FPU registers */ 2107 for (i = 0; i < 8; ++i) 2108 bcopy(sv_xmm->sv_fp[i].fp_acc.fp_bytes, fpregs->fpr_acc[i], 10); 2109 2110 /* SSE registers */ 2111 for (i = 0; i < 16; ++i) 2112 bcopy(sv_xmm->sv_xmm[i].xmm_bytes, fpregs->fpr_xacc[i], 16); 2113} 2114 2115/* internalize from fpregs into sv_xmm */ 2116static void 2117set_fpregs_xmm(struct fpreg *fpregs, struct savefpu *sv_xmm) 2118{ 2119 struct envxmm *penv_xmm = &sv_xmm->sv_env; 2120 struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env; 2121 int i; 2122 2123 /* fpregs -> pcb */ 2124 /* FPU control/status */ 2125 penv_xmm->en_cw = penv_fpreg->en_cw; 2126 penv_xmm->en_sw = penv_fpreg->en_sw; 2127 penv_xmm->en_tw = penv_fpreg->en_tw; 2128 penv_xmm->en_opcode = penv_fpreg->en_opcode; 2129 penv_xmm->en_rip = penv_fpreg->en_rip; 2130 penv_xmm->en_rdp = penv_fpreg->en_rdp; 2131 penv_xmm->en_mxcsr = penv_fpreg->en_mxcsr; 2132 penv_xmm->en_mxcsr_mask = penv_fpreg->en_mxcsr_mask & cpu_mxcsr_mask; 2133 2134 /* FPU registers */ 2135 for (i = 0; i < 8; ++i) 2136 bcopy(fpregs->fpr_acc[i], sv_xmm->sv_fp[i].fp_acc.fp_bytes, 10); 2137 2138 /* SSE registers */ 2139 for (i = 0; i < 16; ++i) 2140 bcopy(fpregs->fpr_xacc[i], sv_xmm->sv_xmm[i].xmm_bytes, 16); 2141} 2142 2143/* externalize from td->pcb */ 2144int 2145fill_fpregs(struct thread *td, struct fpreg *fpregs) 2146{ 2147 2148 KASSERT(td == curthread || TD_IS_SUSPENDED(td) || 2149 P_SHOULDSTOP(td->td_proc), 2150 ("not suspended thread %p", td)); 2151 fpugetregs(td); 2152 fill_fpregs_xmm(get_pcb_user_save_td(td), fpregs); 2153 return (0); 2154} 2155 2156/* internalize to td->pcb */ 2157int 2158set_fpregs(struct thread *td, struct fpreg *fpregs) 2159{ 2160 2161 critical_enter(); 2162 set_fpregs_xmm(fpregs, get_pcb_user_save_td(td)); 2163 fpuuserinited(td); 2164 critical_exit(); 2165 return (0); 2166} 2167 2168/* 2169 * Get machine context. 
2170 */ 2171int 2172get_mcontext(struct thread *td, mcontext_t *mcp, int flags) 2173{ 2174 struct pcb *pcb; 2175 struct trapframe *tp; 2176 2177 pcb = td->td_pcb; 2178 tp = td->td_frame; 2179 PROC_LOCK(curthread->td_proc); 2180 mcp->mc_onstack = sigonstack(tp->tf_rsp); 2181 PROC_UNLOCK(curthread->td_proc); 2182 mcp->mc_r15 = tp->tf_r15; 2183 mcp->mc_r14 = tp->tf_r14; 2184 mcp->mc_r13 = tp->tf_r13; 2185 mcp->mc_r12 = tp->tf_r12; 2186 mcp->mc_r11 = tp->tf_r11; 2187 mcp->mc_r10 = tp->tf_r10; 2188 mcp->mc_r9 = tp->tf_r9; 2189 mcp->mc_r8 = tp->tf_r8; 2190 mcp->mc_rdi = tp->tf_rdi; 2191 mcp->mc_rsi = tp->tf_rsi; 2192 mcp->mc_rbp = tp->tf_rbp; 2193 mcp->mc_rbx = tp->tf_rbx; 2194 mcp->mc_rcx = tp->tf_rcx; 2195 mcp->mc_rflags = tp->tf_rflags; 2196 if (flags & GET_MC_CLEAR_RET) { 2197 mcp->mc_rax = 0; 2198 mcp->mc_rdx = 0; 2199 mcp->mc_rflags &= ~PSL_C; 2200 } else { 2201 mcp->mc_rax = tp->tf_rax; 2202 mcp->mc_rdx = tp->tf_rdx; 2203 } 2204 mcp->mc_rip = tp->tf_rip; 2205 mcp->mc_cs = tp->tf_cs; 2206 mcp->mc_rsp = tp->tf_rsp; 2207 mcp->mc_ss = tp->tf_ss; 2208 mcp->mc_ds = tp->tf_ds; 2209 mcp->mc_es = tp->tf_es; 2210 mcp->mc_fs = tp->tf_fs; 2211 mcp->mc_gs = tp->tf_gs; 2212 mcp->mc_flags = tp->tf_flags; 2213 mcp->mc_len = sizeof(*mcp); 2214 get_fpcontext(td, mcp, NULL, 0); 2215 update_pcb_bases(pcb); 2216 mcp->mc_fsbase = pcb->pcb_fsbase; 2217 mcp->mc_gsbase = pcb->pcb_gsbase; 2218 mcp->mc_xfpustate = 0; 2219 mcp->mc_xfpustate_len = 0; 2220 bzero(mcp->mc_spare, sizeof(mcp->mc_spare)); 2221 return (0); 2222} 2223 2224/* 2225 * Set machine context. 2226 * 2227 * However, we don't set any but the user modifiable flags, and we won't 2228 * touch the cs selector. 2229 */ 2230int 2231set_mcontext(struct thread *td, mcontext_t *mcp) 2232{ 2233 struct pcb *pcb; 2234 struct trapframe *tp; 2235 char *xfpustate; 2236 long rflags; 2237 int ret; 2238 2239 pcb = td->td_pcb; 2240 tp = td->td_frame; 2241 if (mcp->mc_len != sizeof(*mcp) || 2242 (mcp->mc_flags & ~_MC_FLAG_MASK) != 0) 2243 return (EINVAL); 2244 rflags = (mcp->mc_rflags & PSL_USERCHANGE) | 2245 (tp->tf_rflags & ~PSL_USERCHANGE); 2246 if (mcp->mc_flags & _MC_HASFPXSTATE) { 2247 if (mcp->mc_xfpustate_len > cpu_max_ext_state_size - 2248 sizeof(struct savefpu)) 2249 return (EINVAL); 2250 xfpustate = __builtin_alloca(mcp->mc_xfpustate_len); 2251 ret = copyin((void *)mcp->mc_xfpustate, xfpustate, 2252 mcp->mc_xfpustate_len); 2253 if (ret != 0) 2254 return (ret); 2255 } else 2256 xfpustate = NULL; 2257 ret = set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len); 2258 if (ret != 0) 2259 return (ret); 2260 tp->tf_r15 = mcp->mc_r15; 2261 tp->tf_r14 = mcp->mc_r14; 2262 tp->tf_r13 = mcp->mc_r13; 2263 tp->tf_r12 = mcp->mc_r12; 2264 tp->tf_r11 = mcp->mc_r11; 2265 tp->tf_r10 = mcp->mc_r10; 2266 tp->tf_r9 = mcp->mc_r9; 2267 tp->tf_r8 = mcp->mc_r8; 2268 tp->tf_rdi = mcp->mc_rdi; 2269 tp->tf_rsi = mcp->mc_rsi; 2270 tp->tf_rbp = mcp->mc_rbp; 2271 tp->tf_rbx = mcp->mc_rbx; 2272 tp->tf_rdx = mcp->mc_rdx; 2273 tp->tf_rcx = mcp->mc_rcx; 2274 tp->tf_rax = mcp->mc_rax; 2275 tp->tf_rip = mcp->mc_rip; 2276 tp->tf_rflags = rflags; 2277 tp->tf_rsp = mcp->mc_rsp; 2278 tp->tf_ss = mcp->mc_ss; 2279 tp->tf_flags = mcp->mc_flags; 2280 if (tp->tf_flags & TF_HASSEGS) { 2281 tp->tf_ds = mcp->mc_ds; 2282 tp->tf_es = mcp->mc_es; 2283 tp->tf_fs = mcp->mc_fs; 2284 tp->tf_gs = mcp->mc_gs; 2285 } 2286 set_pcb_flags(pcb, PCB_FULL_IRET); 2287 if (mcp->mc_flags & _MC_HASBASES) { 2288 pcb->pcb_fsbase = mcp->mc_fsbase; 2289 pcb->pcb_gsbase = mcp->mc_gsbase; 2290 } 2291 return (0); 2292} 2293 
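/*
 * Illustrative note: get_mcontext() and set_mcontext() above back the
 * getcontext(2), setcontext(2) and swapcontext(2) system calls (via
 * sys/kern/kern_context.c), which is one way the saved machine context is
 * exercised from user space.  A minimal userland sketch, assuming a
 * standard FreeBSD/amd64 environment; the helper name and stack size are
 * arbitrary examples, not anything defined in this file:
 *
 *	#include <stdio.h>
 *	#include <ucontext.h>
 *
 *	static ucontext_t main_ctx, alt_ctx;
 *	static char alt_stack[64 * 1024];
 *
 *	static void
 *	on_alt_context(void)
 *	{
 *		printf("running on the alternate context\n");
 *		// returning here resumes uc_link, i.e. main_ctx
 *	}
 *
 *	int
 *	main(void)
 *	{
 *		if (getcontext(&alt_ctx) != 0)	// context captured by kernel
 *			return (1);
 *		alt_ctx.uc_stack.ss_sp = alt_stack;
 *		alt_ctx.uc_stack.ss_size = sizeof(alt_stack);
 *		alt_ctx.uc_link = &main_ctx;
 *		makecontext(&alt_ctx, on_alt_context, 0);
 *		// save current context, switch to the alternate one
 *		if (swapcontext(&main_ctx, &alt_ctx) != 0)
 *			return (1);
 *		return (0);
 *	}
 *
 * Contexts coming back in through setcontext(2)/swapcontext(2) pass
 * through set_mcontext() above, so a bad mc_len or unknown mc_flags bits
 * yield EINVAL, and only the PSL_USERCHANGE bits of the supplied rflags
 * are honored.
 */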
2294static void 2295get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave, 2296 size_t xfpusave_len) 2297{ 2298 size_t max_len, len; 2299 2300 mcp->mc_ownedfp = fpugetregs(td); 2301 bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate[0], 2302 sizeof(mcp->mc_fpstate)); 2303 mcp->mc_fpformat = fpuformat(); 2304 if (!use_xsave || xfpusave_len == 0) 2305 return; 2306 max_len = cpu_max_ext_state_size - sizeof(struct savefpu); 2307 len = xfpusave_len; 2308 if (len > max_len) { 2309 len = max_len; 2310 bzero(xfpusave + max_len, len - max_len); 2311 } 2312 mcp->mc_flags |= _MC_HASFPXSTATE; 2313 mcp->mc_xfpustate_len = len; 2314 bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len); 2315} 2316 2317static int 2318set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate, 2319 size_t xfpustate_len) 2320{ 2321 int error; 2322 2323 if (mcp->mc_fpformat == _MC_FPFMT_NODEV) 2324 return (0); 2325 else if (mcp->mc_fpformat != _MC_FPFMT_XMM) 2326 return (EINVAL); 2327 else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) { 2328 /* We don't care what state is left in the FPU or PCB. */ 2329 fpstate_drop(td); 2330 error = 0; 2331 } else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU || 2332 mcp->mc_ownedfp == _MC_FPOWNED_PCB) { 2333 error = fpusetregs(td, (struct savefpu *)&mcp->mc_fpstate, 2334 xfpustate, xfpustate_len); 2335 } else 2336 return (EINVAL); 2337 return (error); 2338} 2339 2340void 2341fpstate_drop(struct thread *td) 2342{ 2343 2344 KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu")); 2345 critical_enter(); 2346 if (PCPU_GET(fpcurthread) == td) 2347 fpudrop(); 2348 /* 2349 * XXX force a full drop of the fpu. The above only drops it if we 2350 * owned it. 2351 * 2352 * XXX I don't much like fpugetuserregs()'s semantics of doing a full 2353 * drop. Dropping only to the pcb matches fnsave's behaviour. 2354 * We only need to drop to !PCB_INITDONE in sendsig(). But 2355 * sendsig() is the only caller of fpugetuserregs()... perhaps we just 2356 * have too many layers. 2357 */ 2358 clear_pcb_flags(curthread->td_pcb, 2359 PCB_FPUINITDONE | PCB_USERFPUINITDONE); 2360 critical_exit(); 2361} 2362 2363int 2364fill_dbregs(struct thread *td, struct dbreg *dbregs) 2365{ 2366 struct pcb *pcb; 2367 2368 if (td == NULL) { 2369 dbregs->dr[0] = rdr0(); 2370 dbregs->dr[1] = rdr1(); 2371 dbregs->dr[2] = rdr2(); 2372 dbregs->dr[3] = rdr3(); 2373 dbregs->dr[6] = rdr6(); 2374 dbregs->dr[7] = rdr7(); 2375 } else { 2376 pcb = td->td_pcb; 2377 dbregs->dr[0] = pcb->pcb_dr0; 2378 dbregs->dr[1] = pcb->pcb_dr1; 2379 dbregs->dr[2] = pcb->pcb_dr2; 2380 dbregs->dr[3] = pcb->pcb_dr3; 2381 dbregs->dr[6] = pcb->pcb_dr6; 2382 dbregs->dr[7] = pcb->pcb_dr7; 2383 } 2384 dbregs->dr[4] = 0; 2385 dbregs->dr[5] = 0; 2386 dbregs->dr[8] = 0; 2387 dbregs->dr[9] = 0; 2388 dbregs->dr[10] = 0; 2389 dbregs->dr[11] = 0; 2390 dbregs->dr[12] = 0; 2391 dbregs->dr[13] = 0; 2392 dbregs->dr[14] = 0; 2393 dbregs->dr[15] = 0; 2394 return (0); 2395} 2396 2397int 2398set_dbregs(struct thread *td, struct dbreg *dbregs) 2399{ 2400 struct pcb *pcb; 2401 int i; 2402 2403 if (td == NULL) { 2404 load_dr0(dbregs->dr[0]); 2405 load_dr1(dbregs->dr[1]); 2406 load_dr2(dbregs->dr[2]); 2407 load_dr3(dbregs->dr[3]); 2408 load_dr6(dbregs->dr[6]); 2409 load_dr7(dbregs->dr[7]); 2410 } else { 2411 /* 2412 * Don't let an illegal value for dr7 get set. Specifically, 2413 * check for undefined settings. Setting these bit patterns 2414 * result in undefined behaviour and can lead to an unexpected 2415 * TRCTRAP or a general protection fault right here. 
2416 * Upper bits of dr6 and dr7 must not be set 2417 */ 2418 for (i = 0; i < 4; i++) { 2419 if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02) 2420 return (EINVAL); 2421 if (td->td_frame->tf_cs == _ucode32sel && 2422 DBREG_DR7_LEN(dbregs->dr[7], i) == DBREG_DR7_LEN_8) 2423 return (EINVAL); 2424 } 2425 if ((dbregs->dr[6] & 0xffffffff00000000ul) != 0 || 2426 (dbregs->dr[7] & 0xffffffff00000000ul) != 0) 2427 return (EINVAL); 2428 2429 pcb = td->td_pcb; 2430 2431 /* 2432 * Don't let a process set a breakpoint that is not within the 2433 * process's address space. If a process could do this, it 2434 * could halt the system by setting a breakpoint in the kernel 2435 * (if ddb was enabled). Thus, we need to check to make sure 2436 * that no breakpoints are being enabled for addresses outside 2437 * process's address space. 2438 * 2439 * XXX - what about when the watched area of the user's 2440 * address space is written into from within the kernel 2441 * ... wouldn't that still cause a breakpoint to be generated 2442 * from within kernel mode? 2443 */ 2444 2445 if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) { 2446 /* dr0 is enabled */ 2447 if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS) 2448 return (EINVAL); 2449 } 2450 if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) { 2451 /* dr1 is enabled */ 2452 if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS) 2453 return (EINVAL); 2454 } 2455 if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) { 2456 /* dr2 is enabled */ 2457 if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS) 2458 return (EINVAL); 2459 } 2460 if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) { 2461 /* dr3 is enabled */ 2462 if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS) 2463 return (EINVAL); 2464 } 2465 2466 pcb->pcb_dr0 = dbregs->dr[0]; 2467 pcb->pcb_dr1 = dbregs->dr[1]; 2468 pcb->pcb_dr2 = dbregs->dr[2]; 2469 pcb->pcb_dr3 = dbregs->dr[3]; 2470 pcb->pcb_dr6 = dbregs->dr[6]; 2471 pcb->pcb_dr7 = dbregs->dr[7]; 2472 2473 set_pcb_flags(pcb, PCB_DBREGS); 2474 } 2475 2476 return (0); 2477} 2478 2479void 2480reset_dbregs(void) 2481{ 2482 2483 load_dr7(0); /* Turn off the control bits first */ 2484 load_dr0(0); 2485 load_dr1(0); 2486 load_dr2(0); 2487 load_dr3(0); 2488 load_dr6(0); 2489} 2490 2491/* 2492 * Return > 0 if a hardware breakpoint has been hit, and the 2493 * breakpoint was in user space. Return 0, otherwise. 
2494 */ 2495int 2496user_dbreg_trap(void) 2497{ 2498 u_int64_t dr7, dr6; /* debug registers dr6 and dr7 */ 2499 u_int64_t bp; /* breakpoint bits extracted from dr6 */ 2500 int nbp; /* number of breakpoints that triggered */ 2501 caddr_t addr[4]; /* breakpoint addresses */ 2502 int i; 2503 2504 dr7 = rdr7(); 2505 if ((dr7 & 0x000000ff) == 0) { 2506 /* 2507 * all GE and LE bits in the dr7 register are zero, 2508 * thus the trap couldn't have been caused by the 2509 * hardware debug registers 2510 */ 2511 return 0; 2512 } 2513 2514 nbp = 0; 2515 dr6 = rdr6(); 2516 bp = dr6 & 0x0000000f; 2517 2518 if (!bp) { 2519 /* 2520 * None of the breakpoint bits are set meaning this 2521 * trap was not caused by any of the debug registers 2522 */ 2523 return 0; 2524 } 2525 2526 /* 2527 * at least one of the breakpoints were hit, check to see 2528 * which ones and if any of them are user space addresses 2529 */ 2530 2531 if (bp & 0x01) { 2532 addr[nbp++] = (caddr_t)rdr0(); 2533 } 2534 if (bp & 0x02) { 2535 addr[nbp++] = (caddr_t)rdr1(); 2536 } 2537 if (bp & 0x04) { 2538 addr[nbp++] = (caddr_t)rdr2(); 2539 } 2540 if (bp & 0x08) { 2541 addr[nbp++] = (caddr_t)rdr3(); 2542 } 2543 2544 for (i = 0; i < nbp; i++) { 2545 if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) { 2546 /* 2547 * addr[i] is in user space 2548 */ 2549 return nbp; 2550 } 2551 } 2552 2553 /* 2554 * None of the breakpoints are in user space. 2555 */ 2556 return 0; 2557} 2558 2559/* 2560 * The pcb_flags is only modified by current thread, or by other threads 2561 * when current thread is stopped. However, current thread may change it 2562 * from the interrupt context in cpu_switch(), or in the trap handler. 2563 * When we read-modify-write pcb_flags from C sources, compiler may generate 2564 * code that is not atomic regarding the interrupt handler. If a trap or 2565 * interrupt happens and any flag is modified from the handler, it can be 2566 * clobbered with the cached value later. Therefore, we implement setting 2567 * and clearing flags with single-instruction functions, which do not race 2568 * with possible modification of the flags from the trap or interrupt context, 2569 * because traps and interrupts are executed only on instruction boundary. 2570 */ 2571void 2572set_pcb_flags_raw(struct pcb *pcb, const u_int flags) 2573{ 2574 2575 __asm __volatile("orl %1,%0" 2576 : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags) 2577 : "cc", "memory"); 2578 2579} 2580 2581/* 2582 * The support for RDFSBASE, WRFSBASE and similar instructions for %gs 2583 * base requires that kernel saves MSR_FSBASE and MSR_{K,}GSBASE into 2584 * pcb if user space modified the bases. We must save on the context 2585 * switch or if the return to usermode happens through the doreti. 2586 * 2587 * Tracking of both events is performed by the pcb flag PCB_FULL_IRET, 2588 * which have a consequence that the base MSRs must be saved each time 2589 * the PCB_FULL_IRET flag is set. We disable interrupts to sync with 2590 * context switches. 
2591 */ 2592void 2593set_pcb_flags(struct pcb *pcb, const u_int flags) 2594{ 2595 register_t r; 2596 2597 if (curpcb == pcb && 2598 (flags & PCB_FULL_IRET) != 0 && 2599 (pcb->pcb_flags & PCB_FULL_IRET) == 0 && 2600 (cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0) { 2601 r = intr_disable(); 2602 if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) { 2603 if (rfs() == _ufssel) 2604 pcb->pcb_fsbase = rdfsbase(); 2605 if (rgs() == _ugssel) 2606 pcb->pcb_gsbase = rdmsr(MSR_KGSBASE); 2607 } 2608 set_pcb_flags_raw(pcb, flags); 2609 intr_restore(r); 2610 } else { 2611 set_pcb_flags_raw(pcb, flags); 2612 } 2613} 2614 2615void 2616clear_pcb_flags(struct pcb *pcb, const u_int flags) 2617{ 2618 2619 __asm __volatile("andl %1,%0" 2620 : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags) 2621 : "cc", "memory"); 2622} 2623 2624#ifdef KDB 2625 2626/* 2627 * Provide inb() and outb() as functions. They are normally only available as 2628 * inline functions, thus cannot be called from the debugger. 2629 */ 2630 2631/* silence compiler warnings */ 2632u_char inb_(u_short); 2633void outb_(u_short, u_char); 2634 2635u_char 2636inb_(u_short port) 2637{ 2638 return inb(port); 2639} 2640 2641void 2642outb_(u_short port, u_char data) 2643{ 2644 outb(port, data); 2645} 2646 2647#endif /* KDB */ 2648
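/*
 * Usage note for the wrappers above: since inb_() and outb_() are real
 * functions with external linkage, they can be invoked from the ddb(4)
 * prompt with its "call" command, for example (the port numbers here are
 * only illustrations; any ISA I/O port may be given):
 *
 *	db> call inb_(0x61)
 *	db> call outb_(0x80, 1)
 */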