machdep.c revision 337262
/*-
 * Copyright (c) 2003 Peter Wemm.
 * Copyright (c) 1992 Terrence R. Lambert.
 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/11/sys/amd64/amd64/machdep.c 337262 2018-08-03 15:42:39Z markj $");

#include "opt_atpic.h"
#include "opt_compat.h"
#include "opt_cpu.h"
#include "opt_ddb.h"
#include "opt_inet.h"
#include "opt_isa.h"
#include "opt_kstack_pages.h"
#include "opt_maxmem.h"
#include "opt_mp_watchdog.h"
#include "opt_perfmon.h"
#include "opt_platform.h"
#include "opt_sched.h"

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/bus.h>
#include <sys/callout.h>
#include <sys/cons.h>
#include <sys/cpu.h>
#include <sys/efi.h>
#include <sys/eventhandler.h>
#include <sys/exec.h>
#include <sys/imgact.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/linker.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/memrange.h>
#include <sys/msgbuf.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/ptrace.h>
#include <sys/reboot.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#ifdef SMP
#include <sys/smp.h>
#endif
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/ucontext.h>
#include <sys/vmmeter.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#include <vm/vm_param.h>
#include <vm/vm_phys.h>

#ifdef DDB
#ifndef KDB
#error KDB must be enabled in order for DDB to work!
#endif
#include <ddb/ddb.h>
#include <ddb/db_sym.h>
#endif

#include <net/netisr.h>

#include <machine/clock.h>
#include <machine/cpu.h>
#include <machine/cputypes.h>
#include <machine/frame.h>
#include <machine/intr_machdep.h>
#include <x86/mca.h>
#include <machine/md_var.h>
#include <machine/metadata.h>
#include <machine/mp_watchdog.h>
#include <machine/pc/bios.h>
#include <machine/pcb.h>
#include <machine/proc.h>
#include <machine/reg.h>
#include <machine/sigframe.h>
#include <machine/specialreg.h>
#ifdef PERFMON
#include <machine/perfmon.h>
#endif
#include <machine/tss.h>
#ifdef SMP
#include <machine/smp.h>
#endif
#ifdef FDT
#include <x86/fdt.h>
#endif

#ifdef DEV_ATPIC
#include <x86/isa/icu.h>
#else
#include <x86/apicvar.h>
#endif

#include <isa/isareg.h>
#include <isa/rtc.h>
#include <x86/init.h>

/* Sanity check for __curthread() */
CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);

/*
 * The PTI trampoline stack needs enough space for a hardware trapframe and a
 * couple of scratch registers, as well as the trapframe left behind after an
 * iret fault.
 */
CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) -
    offsetof(struct pti_frame, pti_rip));

extern u_int64_t hammer_time(u_int64_t, u_int64_t);

#define	CS_SECURE(cs)		(ISPL(cs) == SEL_UPL)
#define	EFL_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)

static void cpu_startup(void *);
static void get_fpcontext(struct thread *td, mcontext_t *mcp,
    char *xfpusave, size_t xfpusave_len);
static int set_fpcontext(struct thread *td, mcontext_t *mcp,
    char *xfpustate, size_t xfpustate_len);
SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);

/* Preload data parse function */
static caddr_t native_parse_preload_data(u_int64_t);

/* Native function to fetch and parse the e820 map */
static void native_parse_memmap(caddr_t, vm_paddr_t *, int *);

/* Default init_ops implementation. */
struct init_ops init_ops = {
	.parse_preload_data =		native_parse_preload_data,
	.early_clock_source_init =	i8254_init,
	.early_delay =			i8254_delay,
	.parse_memmap =			native_parse_memmap,
#ifdef SMP
	.mp_bootaddress =		mp_bootaddress,
	.start_all_aps =		native_start_all_aps,
#endif
	.msi_init =			msi_init,
};

struct msgbuf *msgbufp;

/*
 * Physical address of the EFI System Table. Stashed from the metadata hints
 * passed into the kernel and used by the EFI code to call runtime services.
 */
vm_paddr_t efi_systbl_phys;

/* Intel ICH registers */
#define ICH_PMBASE	0x400
#define ICH_SMI_EN	ICH_PMBASE + 0x30

int	_udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel;

int cold = 1;

long Maxmem = 0;
long realmem = 0;

/*
 * The number of PHYSMAP entries must be one less than the number of
 * PHYSSEG entries because the PHYSMAP entry that spans the largest
 * physical address that is accessible by ISA DMA is split into two
 * PHYSSEG entries.
 */
#define PHYSMAP_SIZE	(2 * (VM_PHYSSEG_MAX - 1))

vm_paddr_t phys_avail[PHYSMAP_SIZE + 2];
vm_paddr_t dump_avail[PHYSMAP_SIZE + 2];

/* must be 2 less so 0 0 can signal end of chunks */
#define PHYS_AVAIL_ARRAY_END (nitems(phys_avail) - 2)
#define DUMP_AVAIL_ARRAY_END (nitems(dump_avail) - 2)

struct kva_md_info kmi;

static struct trapframe proc0_tf;
struct region_descriptor r_gdt, r_idt;

struct pcpu __pcpu[MAXCPU];

struct mtx icu_lock;

struct mem_range_softc mem_range_softc;

struct mtx dt_lock;	/* lock for GDT and LDT */

void (*vmm_resume_p)(void);

static void
cpu_startup(dummy)
	void *dummy;
{
	uintmax_t memsize;
	char *sysenv;

	/*
	 * On MacBooks, we need to prevent the legacy USB circuit from
	 * generating an SMI#, because this can cause several problems,
	 * namely: incorrect CPU frequency detection and failure to
	 * start the APs.
	 * We do this by disabling a bit in the SMI_EN (SMI Control and
	 * Enable register) of the Intel ICH LPC Interface Bridge.
	 */
	sysenv = kern_getenv("smbios.system.product");
	if (sysenv != NULL) {
		if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
		    strncmp(sysenv, "MacBook3,1", 10) == 0 ||
		    strncmp(sysenv, "MacBook4,1", 10) == 0 ||
		    strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
		    strncmp(sysenv, "Macmini1,1", 10) == 0) {
			if (bootverbose)
				printf("Disabling LEGACY_USB_EN bit on "
				    "Intel ICH.\n");
			outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
		}
		freeenv(sysenv);
	}

	/*
	 * Good {morning,afternoon,evening,night}.
	 */
	startrtclock();
	printcpuinfo();
#ifdef PERFMON
	perfmon_init();
#endif

	/*
	 * Display physical memory if SMBIOS reports reasonable amount.
	 */
	memsize = 0;
	sysenv = kern_getenv("smbios.memory.enabled");
	if (sysenv != NULL) {
		memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
		freeenv(sysenv);
	}
	if (memsize < ptoa((uintmax_t)vm_cnt.v_free_count))
		memsize = ptoa((uintmax_t)Maxmem);
	printf("real memory  = %ju (%ju MB)\n", memsize, memsize >> 20);
	realmem = atop(memsize);

	/*
	 * Display any holes after the first chunk of extended memory.
	 */
	if (bootverbose) {
		int indx;

		printf("Physical memory chunk(s):\n");
		for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
			vm_paddr_t size;

			size = phys_avail[indx + 1] - phys_avail[indx];
			printf(
			    "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
			    (uintmax_t)phys_avail[indx],
			    (uintmax_t)phys_avail[indx + 1] - 1,
			    (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
		}
	}

	vm_ksubmap_init(&kmi);

	printf("avail memory = %ju (%ju MB)\n",
	    ptoa((uintmax_t)vm_cnt.v_free_count),
	    ptoa((uintmax_t)vm_cnt.v_free_count) / 1048576);

	/*
	 * Set up buffers, so they can be used to read disk labels.
	 */
	bufinit();
	vm_pager_bufferinit();

	cpu_setregs();
}

/*
 * Send an interrupt to process.
 *
 * Stack is set up to allow sigcode stored
 * at top to call routine, followed by call
 * to sigreturn routine below.  After sigreturn
 * resets the signal mask, the stack, and the
 * frame pointer, it returns to the user
 * specified pc, psl.
 */
void
sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
	struct sigframe sf, *sfp;
	struct pcb *pcb;
	struct proc *p;
	struct thread *td;
	struct sigacts *psp;
	char *sp;
	struct trapframe *regs;
	char *xfpusave;
	size_t xfpusave_len;
	int sig;
	int oonstack;

	td = curthread;
	pcb = td->td_pcb;
	p = td->td_proc;
	PROC_LOCK_ASSERT(p, MA_OWNED);
	sig = ksi->ksi_signo;
	psp = p->p_sigacts;
	mtx_assert(&psp->ps_mtx, MA_OWNED);
	regs = td->td_frame;
	oonstack = sigonstack(regs->tf_rsp);

	if (cpu_max_ext_state_size > sizeof(struct savefpu) && use_xsave) {
		xfpusave_len = cpu_max_ext_state_size - sizeof(struct savefpu);
		xfpusave = __builtin_alloca(xfpusave_len);
	} else {
		xfpusave_len = 0;
		xfpusave = NULL;
	}

	/* Save user context. */
	bzero(&sf, sizeof(sf));
	sf.sf_uc.uc_sigmask = *mask;
	sf.sf_uc.uc_stack = td->td_sigstk;
	sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
	    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
	sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
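	/*
	 * Note: the register block of the mcontext (mc_rdi through the
	 * hardware-saved frame) mirrors the layout of struct trapframe,
	 * which is what allows the whole frame to be saved with the
	 * single bcopy() below and restored the same way in sigreturn().
	 */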
	bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(*regs));
	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
	get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len);
	fpstate_drop(td);
	update_pcb_bases(pcb);
	sf.sf_uc.uc_mcontext.mc_fsbase = pcb->pcb_fsbase;
	sf.sf_uc.uc_mcontext.mc_gsbase = pcb->pcb_gsbase;
	bzero(sf.sf_uc.uc_mcontext.mc_spare,
	    sizeof(sf.sf_uc.uc_mcontext.mc_spare));
	bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__));

	/* Allocate space for the signal handler context. */
	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		sp = (char *)td->td_sigstk.ss_sp + td->td_sigstk.ss_size;
#if defined(COMPAT_43)
		td->td_sigstk.ss_flags |= SS_ONSTACK;
#endif
	} else
		sp = (char *)regs->tf_rsp - 128; /* skip the ABI's 128-byte red zone */
	if (xfpusave != NULL) {
		sp -= xfpusave_len;
		sp = (char *)((unsigned long)sp & ~0x3Ful);
		sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp;
	}
	sp -= sizeof(struct sigframe);
	/* Align to 16 bytes. */
	sfp = (struct sigframe *)((unsigned long)sp & ~0xFul);

	/* Build the argument list for the signal handler. */
	regs->tf_rdi = sig;			/* arg 1 in %rdi */
	regs->tf_rdx = (register_t)&sfp->sf_uc;	/* arg 3 in %rdx */
	bzero(&sf.sf_si, sizeof(sf.sf_si));
	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
		/* Signal handler installed with SA_SIGINFO. */
		regs->tf_rsi = (register_t)&sfp->sf_si;	/* arg 2 in %rsi */
		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;

		/* Fill in POSIX parts */
		sf.sf_si = ksi->ksi_info;
		sf.sf_si.si_signo = sig; /* maybe a translated signal */
		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
	} else {
		/* Old FreeBSD-style arguments. */
		regs->tf_rsi = ksi->ksi_code;	/* arg 2 in %rsi */
		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
		sf.sf_ahu.sf_handler = catcher;
	}
	mtx_unlock(&psp->ps_mtx);
	PROC_UNLOCK(p);

	/*
	 * Copy the sigframe out to the user's stack.
	 */
	if (copyout(&sf, sfp, sizeof(*sfp)) != 0 ||
	    (xfpusave != NULL && copyout(xfpusave,
	    (void *)sf.sf_uc.uc_mcontext.mc_xfpustate, xfpusave_len)
	    != 0)) {
#ifdef DEBUG
		printf("process %ld has trashed its stack\n", (long)p->p_pid);
#endif
		PROC_LOCK(p);
		sigexit(td, SIGILL);
	}

	regs->tf_rsp = (long)sfp;
	regs->tf_rip = p->p_sysent->sv_sigcode_base;
	regs->tf_rflags &= ~(PSL_T | PSL_D);
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_ss = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _ufssel;
	regs->tf_gs = _ugssel;
	regs->tf_flags = TF_HASSEGS;
	PROC_LOCK(p);
	mtx_lock(&psp->ps_mtx);
}

/*
 * System call to cleanup state after a signal
 * has been taken.  Reset signal mask and
 * stack state from context left by sendsig (above).
 * Return to previous pc and psl as specified by
 * context left by sendsig.  Check carefully to
 * make sure that the user has not modified the
 * state to gain improper privileges.
 *
 * MPSAFE
 */
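/*
 * Note: returning EJUSTRETURN (below) tells the syscall return path to
 * leave the trapframe alone, so the register state just restored from the
 * user-supplied context is not overwritten with a syscall return value.
 */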
int
sys_sigreturn(td, uap)
	struct thread *td;
	struct sigreturn_args /* {
		const struct __ucontext *sigcntxp;
	} */ *uap;
{
	ucontext_t uc;
	struct pcb *pcb;
	struct proc *p;
	struct trapframe *regs;
	ucontext_t *ucp;
	char *xfpustate;
	size_t xfpustate_len;
	long rflags;
	int cs, error, ret;
	ksiginfo_t ksi;

	pcb = td->td_pcb;
	p = td->td_proc;

	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
	if (error != 0) {
		uprintf("pid %d (%s): sigreturn copyin failed\n",
		    p->p_pid, td->td_name);
		return (error);
	}
	ucp = &uc;
	if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) {
		uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid,
		    td->td_name, ucp->uc_mcontext.mc_flags);
		return (EINVAL);
	}
	regs = td->td_frame;
	rflags = ucp->uc_mcontext.mc_rflags;
	/*
	 * Don't allow users to change privileged or reserved flags.
	 */
	if (!EFL_SECURE(rflags, regs->tf_rflags)) {
		uprintf("pid %d (%s): sigreturn rflags = 0x%lx\n", p->p_pid,
		    td->td_name, rflags);
		return (EINVAL);
	}

	/*
	 * Don't allow users to load a valid privileged %cs.  Let the
	 * hardware check for invalid selectors, excess privilege in
	 * other selectors, invalid %rip's and invalid %rsp's.
	 */
	cs = ucp->uc_mcontext.mc_cs;
	if (!CS_SECURE(cs)) {
		uprintf("pid %d (%s): sigreturn cs = 0x%x\n", p->p_pid,
		    td->td_name, cs);
		ksiginfo_init_trap(&ksi);
		ksi.ksi_signo = SIGBUS;
		ksi.ksi_code = BUS_OBJERR;
		ksi.ksi_trapno = T_PROTFLT;
		ksi.ksi_addr = (void *)regs->tf_rip;
		trapsignal(td, &ksi);
		return (EINVAL);
	}

	if ((uc.uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) {
		xfpustate_len = uc.uc_mcontext.mc_xfpustate_len;
		if (xfpustate_len > cpu_max_ext_state_size -
		    sizeof(struct savefpu)) {
			uprintf("pid %d (%s): sigreturn xfpusave_len = 0x%zx\n",
			    p->p_pid, td->td_name, xfpustate_len);
			return (EINVAL);
		}
		xfpustate = __builtin_alloca(xfpustate_len);
		error = copyin((const void *)uc.uc_mcontext.mc_xfpustate,
		    xfpustate, xfpustate_len);
		if (error != 0) {
			uprintf(
		    "pid %d (%s): sigreturn copying xfpustate failed\n",
			    p->p_pid, td->td_name);
			return (error);
		}
	} else {
		xfpustate = NULL;
		xfpustate_len = 0;
	}
	ret = set_fpcontext(td, &ucp->uc_mcontext, xfpustate, xfpustate_len);
	if (ret != 0) {
		uprintf("pid %d (%s): sigreturn set_fpcontext err %d\n",
		    p->p_pid, td->td_name, ret);
		return (ret);
	}
	bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(*regs));
	update_pcb_bases(pcb);
	pcb->pcb_fsbase = ucp->uc_mcontext.mc_fsbase;
	pcb->pcb_gsbase = ucp->uc_mcontext.mc_gsbase;

#if defined(COMPAT_43)
	if (ucp->uc_mcontext.mc_onstack & 1)
		td->td_sigstk.ss_flags |= SS_ONSTACK;
	else
		td->td_sigstk.ss_flags &= ~SS_ONSTACK;
#endif

	kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
	return (EJUSTRETURN);
}

#ifdef COMPAT_FREEBSD4
int
freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap)
{

	return sys_sigreturn(td, (struct sigreturn_args *)uap);
}
#endif

/*
 * Reset registers to default values on exec.
 */
void
exec_setregs(struct thread *td, struct image_params *imgp, u_long stack)
{
	struct trapframe *regs = td->td_frame;
	struct pcb *pcb = td->td_pcb;

	mtx_lock(&dt_lock);
	if (td->td_proc->p_md.md_ldt != NULL)
		user_ldt_free(td);
	else
		mtx_unlock(&dt_lock);

	update_pcb_bases(pcb);
	pcb->pcb_fsbase = 0;
	pcb->pcb_gsbase = 0;
	clear_pcb_flags(pcb, PCB_32BIT);
	pcb->pcb_initial_fpucw = __INITIAL_FPUCW__;

	bzero((char *)regs, sizeof(struct trapframe));
	regs->tf_rip = imgp->entry_addr;
	regs->tf_rsp = ((stack - 8) & ~0xFul) + 8;
	regs->tf_rdi = stack;		/* argv */
	regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T);
	regs->tf_ss = _udatasel;
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _ufssel;
	regs->tf_gs = _ugssel;
	regs->tf_flags = TF_HASSEGS;
	td->td_retval[1] = 0;

	/*
	 * Reset the hardware debug registers if they were in use.
	 * They won't have any meaning for the newly exec'd process.
	 */
	if (pcb->pcb_flags & PCB_DBREGS) {
		pcb->pcb_dr0 = 0;
		pcb->pcb_dr1 = 0;
		pcb->pcb_dr2 = 0;
		pcb->pcb_dr3 = 0;
		pcb->pcb_dr6 = 0;
		pcb->pcb_dr7 = 0;
		if (pcb == curpcb) {
			/*
			 * Clear the debug registers on the running
			 * CPU, otherwise they will end up affecting
			 * the next process we switch to.
			 */
			reset_dbregs();
		}
		clear_pcb_flags(pcb, PCB_DBREGS);
	}

	/*
	 * Drop the FP state if we hold it, so that the process gets a
	 * clean FP state if it uses the FPU again.
	 */
	fpstate_drop(td);
}

void
cpu_setregs(void)
{
	register_t cr0;

	cr0 = rcr0();
	/*
	 * CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the
	 * BSP.  See the comments there about why we set them.
	 */
	cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
	load_cr0(cr0);
}

/*
 * Initialize amd64 and configure to run kernel
 */

/*
 * Initialize segments & interrupt table
 */

struct user_segment_descriptor gdt[NGDT * MAXCPU];/* global descriptor tables */
static struct gate_descriptor idt0[NIDT];
struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */

static char dblfault_stack[PAGE_SIZE] __aligned(16);
static char mce0_stack[PAGE_SIZE] __aligned(16);
static char nmi0_stack[PAGE_SIZE] __aligned(16);
static char dbg0_stack[PAGE_SIZE] __aligned(16);
CTASSERT(sizeof(struct nmi_pcpu) == 16);

struct amd64tss common_tss[MAXCPU];

/*
 * Software prototypes -- in more palatable form.
 *
 * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
 * slots as corresponding segments for i386 kernel.
 */
struct soft_segment_descriptor gdt_segs[] = {
/* GNULL_SEL	0 Null Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GNULL2_SEL	1 Null Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUFS32_SEL	2 32 bit %fs Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUGS32_SEL	3 32 bit %gs Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GCODE_SEL	4 Code Descriptor for kernel */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GDATA_SEL	5 Data Descriptor for kernel */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GUCODE32_SEL	6 32 bit Code Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUDATA_SEL	7 32/64 bit Data Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUCODE_SEL	8 64 bit Code Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GPROC0_SEL	9 Proc 0 Tss Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
	.ssd_type = SDT_SYSTSS,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* Actually, the TSS is a system descriptor which is double size */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUSERLDT_SEL	11 LDT Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUSERLDT_SEL	12 LDT Descriptor, double size */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
};

void
setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
{
	struct gate_descriptor *ip;

	ip = idt + idx;
	ip->gd_looffset = (uintptr_t)func;
	ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
	ip->gd_ist = ist;
	ip->gd_xx = 0;
	ip->gd_type = typ;
	ip->gd_dpl = dpl;
	ip->gd_p = 1;
	ip->gd_hioffset = ((uintptr_t)func)>>16 ;
}

extern inthand_t
	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
	IDTVEC(xmm), IDTVEC(dblfault),
	IDTVEC(div_pti), IDTVEC(bpt_pti),
	IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
	IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
	IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
	IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
	IDTVEC(xmm_pti),
#ifdef KDTRACE_HOOKS
	IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
#endif
#ifdef XENHVM
	IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
#endif
	IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
	IDTVEC(fast_syscall_pti);

#ifdef DDB
/*
 * Display the index and function name of any IDT entries that don't use
 * the default 'rsvd' entry point.
 */
DB_SHOW_COMMAND(idt, db_show_idt)
{
	struct gate_descriptor *ip;
	int idx;
	uintptr_t func;

	ip = idt;
	for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
		func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
		if (func != (uintptr_t)&IDTVEC(rsvd)) {
			db_printf("%3d\t", idx);
			db_printsym(func, DB_STGY_PROC);
			db_printf("\n");
		}
		ip++;
	}
}

/* Show privileged registers. */
DB_SHOW_COMMAND(sysregs, db_show_sysregs)
{
	struct {
		uint16_t limit;
		uint64_t base;
	} __packed idtr, gdtr;
	uint16_t ldt, tr;

	__asm __volatile("sidt %0" : "=m" (idtr));
	db_printf("idtr\t0x%016lx/%04x\n",
	    (u_long)idtr.base, (u_int)idtr.limit);
	__asm __volatile("sgdt %0" : "=m" (gdtr));
	db_printf("gdtr\t0x%016lx/%04x\n",
	    (u_long)gdtr.base, (u_int)gdtr.limit);
	__asm __volatile("sldt %0" : "=r" (ldt));
	db_printf("ldtr\t0x%04x\n", ldt);
	__asm __volatile("str %0" : "=r" (tr));
	db_printf("tr\t0x%04x\n", tr);
	db_printf("cr0\t0x%016lx\n", rcr0());
	db_printf("cr2\t0x%016lx\n", rcr2());
	db_printf("cr3\t0x%016lx\n", rcr3());
	db_printf("cr4\t0x%016lx\n", rcr4());
	if (rcr4() & CR4_XSAVE)
		db_printf("xcr0\t0x%016lx\n", rxcr(0));
	db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER));
	if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
		db_printf("FEATURES_CTL\t%016lx\n",
		    rdmsr(MSR_IA32_FEATURE_CONTROL));
	db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR));
	db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT));
	db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE));
}

DB_SHOW_COMMAND(dbregs, db_show_dbregs)
{

	db_printf("dr0\t0x%016lx\n", rdr0());
	db_printf("dr1\t0x%016lx\n", rdr1());
	db_printf("dr2\t0x%016lx\n", rdr2());
	db_printf("dr3\t0x%016lx\n", rdr3());
	db_printf("dr6\t0x%016lx\n", rdr6());
	db_printf("dr7\t0x%016lx\n", rdr7());
}
#endif

void
sdtossd(sd, ssd)
	struct user_segment_descriptor *sd;
	struct soft_segment_descriptor *ssd;
{

	ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase;
	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
	ssd->ssd_type = sd->sd_type;
	ssd->ssd_dpl = sd->sd_dpl;
	ssd->ssd_p = sd->sd_p;
	ssd->ssd_long = sd->sd_long;
	ssd->ssd_def32 = sd->sd_def32;
	ssd->ssd_gran = sd->sd_gran;
}

void
ssdtosd(ssd, sd)
	struct soft_segment_descriptor *ssd;
	struct user_segment_descriptor *sd;
{

	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type = ssd->ssd_type;
	sd->sd_dpl = ssd->ssd_dpl;
	sd->sd_p = ssd->ssd_p;
	sd->sd_long = ssd->ssd_long;
	sd->sd_def32 = ssd->ssd_def32;
	sd->sd_gran = ssd->ssd_gran;
}
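/*
 * Note: in long mode, system segment descriptors (TSS, LDT) are 16 bytes
 * wide and so occupy two consecutive GDT slots; that is why GPROC0_SEL and
 * GUSERLDT_SEL in gdt_segs[] above are each followed by a second,
 * "double size" entry, and why ssdtosyssd() fills a wider sd_hibase.
 */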
void
ssdtosyssd(ssd, sd)
	struct soft_segment_descriptor *ssd;
	struct system_segment_descriptor *sd;
{

	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type = ssd->ssd_type;
	sd->sd_dpl = ssd->ssd_dpl;
	sd->sd_p = ssd->ssd_p;
	sd->sd_gran = ssd->ssd_gran;
}

#if !defined(DEV_ATPIC) && defined(DEV_ISA)
#include <isa/isavar.h>
#include <isa/isareg.h>
/*
 * Return a bitmap of the current interrupt requests.  This is 8259-specific
 * and is only suitable for use at probe time.
 * This is only here to pacify sio.  It is NOT FATAL if this doesn't work.
 * It shouldn't be here.  There should probably be an APIC centric
 * implementation in the apic driver code, if at all.
 */
intrmask_t
isa_irq_pending(void)
{
	u_char irr1;
	u_char irr2;

	irr1 = inb(IO_ICU1);
	irr2 = inb(IO_ICU2);
	return ((irr2 << 8) | irr1);
}
#endif

u_int basemem;

static int
add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
    int *physmap_idxp)
{
	int i, insert_idx, physmap_idx;

	physmap_idx = *physmap_idxp;

	if (length == 0)
		return (1);

	/*
	 * Find insertion point while checking for overlap.  Start off by
	 * assuming the new entry will be added to the end.
	 *
	 * NB: physmap_idx points to the next free slot.
	 */
	insert_idx = physmap_idx;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (base < physmap[i + 1]) {
			if (base + length <= physmap[i]) {
				insert_idx = i;
				break;
			}
			if (boothowto & RB_VERBOSE)
				printf(
		    "Overlapping memory regions, ignoring second region\n");
			return (1);
		}
	}

	/* See if we can prepend to the next entry. */
	if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) {
		physmap[insert_idx] = base;
		return (1);
	}

	/* See if we can append to the previous entry. */
	if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
		physmap[insert_idx - 1] += length;
		return (1);
	}

	physmap_idx += 2;
	*physmap_idxp = physmap_idx;
	if (physmap_idx == PHYSMAP_SIZE) {
		printf(
		"Too many segments in the physical address map, giving up\n");
		return (0);
	}

	/*
	 * Move the last 'N' entries down to make room for the new
	 * entry if needed.
	 */
	for (i = (physmap_idx - 2); i > insert_idx; i -= 2) {
		physmap[i] = physmap[i - 2];
		physmap[i + 1] = physmap[i - 1];
	}

	/* Insert the new entry. */
	physmap[insert_idx] = base;
	physmap[insert_idx + 1] = base + length;
	return (1);
}
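/*
 * Worked example (hypothetical values): adding [0x0, 0x9f000) and then
 * [0x100000, 0x7fff0000) above leaves physmap holding the base/bound
 * pairs { 0x0, 0x9f000, 0x100000, 0x7fff0000 } with *physmap_idxp == 4,
 * i.e. the index of the next free pair.
 */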
void
bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize,
    vm_paddr_t *physmap, int *physmap_idx)
{
	struct bios_smap *smap, *smapend;

	smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);

	for (smap = smapbase; smap < smapend; smap++) {
		if (boothowto & RB_VERBOSE)
			printf("SMAP type=%02x base=%016lx len=%016lx\n",
			    smap->type, smap->base, smap->length);

		if (smap->type != SMAP_TYPE_MEMORY)
			continue;

		if (!add_physmap_entry(smap->base, smap->length, physmap,
		    physmap_idx))
			break;
	}
}

static void
add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
    int *physmap_idx)
{
	struct efi_md *map, *p;
	const char *type;
	size_t efisz;
	int ndesc, i;

	static const char *types[] = {
		"Reserved",
		"LoaderCode",
		"LoaderData",
		"BootServicesCode",
		"BootServicesData",
		"RuntimeServicesCode",
		"RuntimeServicesData",
		"ConventionalMemory",
		"UnusableMemory",
		"ACPIReclaimMemory",
		"ACPIMemoryNVS",
		"MemoryMappedIO",
		"MemoryMappedIOPortSpace",
		"PalCode",
		"PersistentMemory"
	};

	/*
	 * Memory map data provided by UEFI via the GetMemoryMap
	 * Boot Services API.
	 */
	efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
	map = (struct efi_md *)((uint8_t *)efihdr + efisz);

	if (efihdr->descriptor_size == 0)
		return;
	ndesc = efihdr->memory_size / efihdr->descriptor_size;

	if (boothowto & RB_VERBOSE)
		printf("%23s %12s %12s %8s %4s\n",
		    "Type", "Physical", "Virtual", "#Pages", "Attr");

	for (i = 0, p = map; i < ndesc; i++,
	    p = efi_next_descriptor(p, efihdr->descriptor_size)) {
		if (boothowto & RB_VERBOSE) {
			if (p->md_type < nitems(types))
				type = types[p->md_type];
			else
				type = "<INVALID>";
			printf("%23s %012lx %12p %08lx ", type, p->md_phys,
			    p->md_virt, p->md_pages);
			if (p->md_attr & EFI_MD_ATTR_UC)
				printf("UC ");
			if (p->md_attr & EFI_MD_ATTR_WC)
				printf("WC ");
			if (p->md_attr & EFI_MD_ATTR_WT)
				printf("WT ");
			if (p->md_attr & EFI_MD_ATTR_WB)
				printf("WB ");
			if (p->md_attr & EFI_MD_ATTR_UCE)
				printf("UCE ");
			if (p->md_attr & EFI_MD_ATTR_WP)
				printf("WP ");
			if (p->md_attr & EFI_MD_ATTR_RP)
				printf("RP ");
			if (p->md_attr & EFI_MD_ATTR_XP)
				printf("XP ");
			if (p->md_attr & EFI_MD_ATTR_NV)
				printf("NV ");
			if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE)
				printf("MORE_RELIABLE ");
			if (p->md_attr & EFI_MD_ATTR_RO)
				printf("RO ");
			if (p->md_attr & EFI_MD_ATTR_RT)
				printf("RUNTIME");
			printf("\n");
		}

		switch (p->md_type) {
		case EFI_MD_TYPE_CODE:
		case EFI_MD_TYPE_DATA:
		case EFI_MD_TYPE_BS_CODE:
		case EFI_MD_TYPE_BS_DATA:
		case EFI_MD_TYPE_FREE:
			/*
			 * We're allowed to use any entry with these types.
			 * The loader has already called ExitBootServices(),
			 * so boot-services code and data are free for reuse.
			 */
			break;
		default:
			continue;
		}

		if (!add_physmap_entry(p->md_phys, (p->md_pages * PAGE_SIZE),
		    physmap, physmap_idx))
			break;
	}
}

static char bootmethod[16] = "";
SYSCTL_STRING(_machdep, OID_AUTO, bootmethod, CTLFLAG_RD, bootmethod, 0,
    "System firmware boot method");

static void
native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx)
{
	struct bios_smap *smap;
	struct efi_map_header *efihdr;
	u_int32_t size;

	/*
	 * Memory map from INT 15:E820.
	 *
	 * subr_module.c says:
	 * "Consumer may safely assume that size value precedes data."
	 * i.e.: an int32_t immediately precedes smap.
	 */

	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
	smap = (struct bios_smap *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);
	if (efihdr == NULL && smap == NULL)
		panic("No BIOS smap or EFI map info from loader!");

	if (efihdr != NULL) {
		add_efi_map_entries(efihdr, physmap, physmap_idx);
		strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
	} else {
		size = *((u_int32_t *)smap - 1);
		bios_add_smap_entries(smap, size, physmap, physmap_idx);
		strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
	}
}

#define	PAGES_PER_GB	(1024 * 1024 * 1024 / PAGE_SIZE)

/*
 * Populate the (physmap) array with base/bound pairs describing the
 * available physical memory in the system, then test this memory and
 * build the phys_avail array describing the actually-available memory.
 *
 * Total memory size may be set by the kernel environment variable
 * hw.physmem or the compile-time define MAXMEM.
 *
 * XXX first should be vm_paddr_t.
 */
static void
getmemsize(caddr_t kmdp, u_int64_t first)
{
	int i, physmap_idx, pa_indx, da_indx;
	vm_paddr_t pa, physmap[PHYSMAP_SIZE];
	u_long physmem_start, physmem_tunable, memtest;
	pt_entry_t *pte;
	quad_t dcons_addr, dcons_size;
	int page_counter;

	/*
	 * Tell the physical memory allocator about pages used to store
	 * the kernel and preloaded data.  See kmem_bootstrap_free().
	 */
	vm_phys_add_seg((vm_paddr_t)kernphys, trunc_page(first));

	bzero(physmap, sizeof(physmap));
	physmap_idx = 0;

	init_ops.parse_memmap(kmdp, physmap, &physmap_idx);
	physmap_idx -= 2;

	/*
	 * Find the 'base memory' segment for SMP
	 */
	basemem = 0;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (physmap[i] <= 0xA0000) {
			basemem = physmap[i + 1] / 1024;
			break;
		}
	}
	if (basemem == 0 || basemem > 640) {
		if (bootverbose)
			printf(
		"Memory map doesn't contain a basemem segment, faking it");
		basemem = 640;
	}

	/*
	 * Make hole for "AP -> long mode" bootstrap code.  The
	 * mp_bootaddress vector is only available when the kernel
	 * is configured to support APs and APs for the system start
	 * in 32bit mode (e.g. SMP bare metal).
	 */
	if (init_ops.mp_bootaddress) {
		if (physmap[1] >= 0x100000000)
			panic(
	"Basemem segment is not suitable for AP bootstrap code!");
		physmap[1] = init_ops.mp_bootaddress(physmap[1] / 1024);
	}

	/*
	 * Maxmem isn't the "maximum memory", it's one larger than the
	 * highest page of the physical address space.  It should be
	 * called something like "Maxphyspage".  We may adjust this
	 * based on ``hw.physmem'' and the results of the memory test.
	 */
	Maxmem = atop(physmap[physmap_idx + 1]);

#ifdef MAXMEM
	Maxmem = MAXMEM / 4;
#endif

	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
		Maxmem = atop(physmem_tunable);

	/*
	 * The boot memory test is disabled by default, as it takes a
	 * significant amount of time on large-memory systems, and is
	 * unfriendly to virtual machines as it unnecessarily touches all
	 * pages.
	 *
	 * A general name is used as the code may be extended to support
	 * additional tests beyond the current "page present" test.
	 */
	memtest = 0;
	TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);

	/*
	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
	 * in the system.
	 */
	if (Maxmem > atop(physmap[physmap_idx + 1]))
		Maxmem = atop(physmap[physmap_idx + 1]);

	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
	    (boothowto & RB_VERBOSE))
		printf("Physical memory use set to %ldK\n", Maxmem * 4);

	/* call pmap initialization to make new kernel address space */
	pmap_bootstrap(&first);

	/*
	 * Size up each available chunk of physical memory.
	 *
	 * XXX Some BIOSes corrupt low 64KB between suspend and resume.
	 * By default, mask off the first 16 pages unless we appear to be
	 * running in a VM.
	 */
	physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
	TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
	if (physmap[0] < physmem_start) {
		if (physmem_start < PAGE_SIZE)
			physmap[0] = PAGE_SIZE;
		else if (physmem_start >= physmap[1])
			physmap[0] = round_page(physmap[1] - PAGE_SIZE);
		else
			physmap[0] = round_page(physmem_start);
	}
	pa_indx = 0;
	da_indx = 1;
	phys_avail[pa_indx++] = physmap[0];
	phys_avail[pa_indx] = physmap[0];
	dump_avail[da_indx] = physmap[0];
	pte = CMAP1;

	/*
	 * Get dcons buffer address
	 */
	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
	    getenv_quad("dcons.size", &dcons_size) == 0)
		dcons_addr = 0;

	/*
	 * physmap is in bytes, so when converting to page boundaries,
	 * round up the start address and round down the end address.
	 */
	page_counter = 0;
	if (memtest != 0)
		printf("Testing system memory");
	for (i = 0; i <= physmap_idx; i += 2) {
		vm_paddr_t end;

		end = ptoa((vm_paddr_t)Maxmem);
		if (physmap[i + 1] < end)
			end = trunc_page(physmap[i + 1]);
		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
			int tmp, page_bad, full;
			int *ptr = (int *)CADDR1;

			full = FALSE;
			/*
			 * block out kernel memory as not available.
			 */
			if (pa >= (vm_paddr_t)kernphys && pa < first)
				goto do_dump_avail;

			/*
			 * block out dcons buffer
			 */
			if (dcons_addr > 0
			    && pa >= trunc_page(dcons_addr)
			    && pa < dcons_addr + dcons_size)
				goto do_dump_avail;

			page_bad = FALSE;
			if (memtest == 0)
				goto skip_memtest;

			/*
			 * Print a "." every GB to show we're making
			 * progress.
			 */
			page_counter++;
			if ((page_counter % PAGES_PER_GB) == 0)
				printf(".");

			/*
			 * map page into kernel: valid, read/write,non-cacheable
			 */
			*pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
			invltlb();

			tmp = *(int *)ptr;
			/*
			 * Test for alternating 1's and 0's
			 */
			*(volatile int *)ptr = 0xaaaaaaaa;
			if (*(volatile int *)ptr != 0xaaaaaaaa)
				page_bad = TRUE;
			/*
			 * Test for alternating 0's and 1's
			 */
			*(volatile int *)ptr = 0x55555555;
			if (*(volatile int *)ptr != 0x55555555)
				page_bad = TRUE;
			/*
			 * Test for all 1's
			 */
			*(volatile int *)ptr = 0xffffffff;
			if (*(volatile int *)ptr != 0xffffffff)
				page_bad = TRUE;
			/*
			 * Test for all 0's
			 */
			*(volatile int *)ptr = 0x0;
			if (*(volatile int *)ptr != 0x0)
				page_bad = TRUE;
			/*
			 * Restore original value.
			 */
			*(int *)ptr = tmp;

skip_memtest:
			/*
			 * Adjust array of valid/good pages.
			 */
			if (page_bad == TRUE)
				continue;
			/*
			 * If this good page is a continuation of the
			 * previous set of good pages, then just increase
			 * the end pointer.  Otherwise start a new chunk.
			 * Note that "end" points one page beyond the last
			 * valid page, making the range >= start and < end.
			 * If we're also doing a speculative memory
			 * test and we are at or past the end, bump up Maxmem
			 * so that we keep going.  The first bad page
			 * will terminate the loop.
			 */
			if (phys_avail[pa_indx] == pa) {
				phys_avail[pa_indx] += PAGE_SIZE;
			} else {
				pa_indx++;
				if (pa_indx == PHYS_AVAIL_ARRAY_END) {
					printf(
		"Too many holes in the physical address space, giving up\n");
					pa_indx--;
					full = TRUE;
					goto do_dump_avail;
				}
				phys_avail[pa_indx++] = pa;	/* start */
				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
			}
			physmem++;
do_dump_avail:
			if (dump_avail[da_indx] == pa) {
				dump_avail[da_indx] += PAGE_SIZE;
			} else {
				da_indx++;
				if (da_indx == DUMP_AVAIL_ARRAY_END) {
					da_indx--;
					goto do_next;
				}
				dump_avail[da_indx++] = pa; /* start */
				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
			}
do_next:
			if (full)
				break;
		}
	}
	*pte = 0;
	invltlb();
	if (memtest != 0)
		printf("\n");

	/*
	 * XXX
	 * The last chunk must contain at least one page plus the message
	 * buffer to avoid complicating other code (message buffer address
	 * calculation, etc.).
	 */
	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
	    round_page(msgbufsize) >= phys_avail[pa_indx]) {
		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
		phys_avail[pa_indx--] = 0;
		phys_avail[pa_indx--] = 0;
	}

	Maxmem = atop(phys_avail[pa_indx]);

	/* Trim off space for the message buffer. */
	phys_avail[pa_indx] -= round_page(msgbufsize);

	/* Map the message buffer. */
	msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
}
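/*
 * On return from getmemsize(), phys_avail[] holds zero-terminated
 * start/end pairs, e.g. (hypothetical values)
 * { 0x10000, 0x9f000, 0x100000, 0x7ffe0000, 0, 0 }; consumers walk the
 * array until they reach the 0/0 sentinel pair.
 */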
static caddr_t
native_parse_preload_data(u_int64_t modulep)
{
	caddr_t kmdp;
	char *envp;
#ifdef DDB
	vm_offset_t ksym_start;
	vm_offset_t ksym_end;
#endif

	preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
	preload_bootstrap_relocate(KERNBASE);
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
	envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *);
	if (envp != NULL)
		envp += KERNBASE;
	init_static_kenv(envp, 0);
#ifdef DDB
	ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
	ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
	db_fetch_ksymtab(ksym_start, ksym_end);
#endif
	efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);

	return (kmdp);
}

static void
amd64_kdb_init(void)
{
	kdb_init();
#ifdef KDB
	if (boothowto & RB_KDB)
		kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
#endif
}

/* Set up the fast syscall stuff */
void
amd64_conf_fast_syscall(void)
{
	uint64_t msr;

	msr = rdmsr(MSR_EFER) | EFER_SCE;
	wrmsr(MSR_EFER, msr);
	wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) :
	    (u_int64_t)IDTVEC(fast_syscall));
	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
	/*
	 * STAR[47:32] holds the selector from which SYSCALL loads the
	 * kernel %cs/%ss; STAR[63:48] is the base from which SYSRET
	 * derives the user %cs and %ss.
	 */
	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
	    ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
	wrmsr(MSR_STAR, msr);
	wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D);
}

u_int64_t
hammer_time(u_int64_t modulep, u_int64_t physfree)
{
	caddr_t kmdp;
	int gsel_tss, x;
	struct pcpu *pc;
	struct nmi_pcpu *np;
	struct xstate_hdr *xhdr;
	u_int64_t rsp0;
	char *env;
	size_t kstack0_sz;
	int late_console;

	kmdp = init_ops.parse_preload_data(modulep);

	identify_cpu1();
	identify_hypervisor();
	/*
	 * hw.cpu_stdext_disable is ignored by the call, it will be
	 * re-evaluated by the below call to finishidentcpu().
	 */
	identify_cpu2();

	link_elf_ireloc(kmdp);

	/*
	 * This may be done better later if it gets more high level
	 * components in it. If so just link td->td_proc here.
	 */
	proc_linkup0(&proc0, &thread0);

	/* Init basic tunables, hz etc */
	init_param1();

	thread0.td_kstack = physfree + KERNBASE;
	thread0.td_kstack_pages = kstack_pages;
	kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
	bzero((void *)thread0.td_kstack, kstack0_sz);
	physfree += kstack0_sz;

	/*
	 * make gdt memory segments
	 */
	for (x = 0; x < NGDT; x++) {
		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
		    x != GUSERLDT_SEL && x != (GUSERLDT_SEL) + 1)
			ssdtosd(&gdt_segs[x], &gdt[x]);
	}
	gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&common_tss[0];
	ssdtosyssd(&gdt_segs[GPROC0_SEL],
	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);

	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
	r_gdt.rd_base = (long) gdt;
	lgdt(&r_gdt);
	pc = &__pcpu[0];

	wrmsr(MSR_FSBASE, 0);		/* User value */
	wrmsr(MSR_GSBASE, (u_int64_t)pc);
	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */

	pcpu_init(pc, 0, sizeof(struct pcpu));
	dpcpu_init((void *)(physfree + KERNBASE), 0);
	physfree += DPCPU_SIZE;
	PCPU_SET(prvspace, pc);
	PCPU_SET(curthread, &thread0);
	/* Non-late cninit() and printf() can be moved up to here. */
	PCPU_SET(tssp, &common_tss[0]);
	PCPU_SET(commontssp, &common_tss[0]);
	PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
	PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
	PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
	PCPU_SET(gs32p, &gdt[GUGS32_SEL]);

	/*
	 * Initialize mutexes.
	 *
	 * icu_lock: in order to allow an interrupt to occur in a critical
	 *	     section, to set pcpu->ipending (etc...) properly, we
	 *	     must be able to get the icu lock, so it can't be
	 *	     under witness.
	 */
	mutex_init();
	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);

	/* exceptions */
	pti = pti_get_default();
	TUNABLE_INT_FETCH("vm.pmap.pti", &pti);

	for (x = 0; x < NIDT; x++)
		setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT,
		    SEL_KPL, 0);
	setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
	setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 2);
	setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
	    SEL_UPL, 0);
	setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT,
	    SEL_UPL, 0);
	setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
	setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm),
	    SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing),
	    SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3);
	setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
	    SEL_KPL, 0);
#ifdef KDTRACE_HOOKS
	setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) :
	    &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
#endif
#ifdef XENHVM
	setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) :
	    &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0);
#endif
	r_idt.rd_limit = sizeof(idt0) - 1;
	r_idt.rd_base = (long) idt;
	lidt(&r_idt);

	/*
	 * Initialize the clock before the console so that console
	 * initialization can use DELAY().
	 */
	clock_init();

	/*
	 * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4)
	 * transition).
	 * Once bootblocks have updated, we can test directly for
	 * efi_systbl != NULL here...
	 */
	if (preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP)
	    != NULL)
		vty_set_preferred(VTY_VT);

	finishidentcpu();	/* Final stage of CPU initialization */
	initializecpu();	/* Initialize CPU registers */
	initializecpucache();

	/* doublefault stack space, runs on ist1 */
	common_tss[0].tss_ist1 = (long)&dblfault_stack[sizeof(dblfault_stack)];

	/*
	 * NMI stack, runs on ist2.  The pcpu pointer is stored just
	 * above the start of the ist2 stack.
	 */
	np = ((struct nmi_pcpu *) &nmi0_stack[sizeof(nmi0_stack)]) - 1;
	np->np_pcpu = (register_t) pc;
	common_tss[0].tss_ist2 = (long) np;

	/*
	 * MC# stack, runs on ist3.  The pcpu pointer is stored just
	 * above the start of the ist3 stack.
	 */
	np = ((struct nmi_pcpu *) &mce0_stack[sizeof(mce0_stack)]) - 1;
	np->np_pcpu = (register_t) pc;
	common_tss[0].tss_ist3 = (long) np;

	/*
	 * DB# stack, runs on ist4.
	 */
	np = ((struct nmi_pcpu *) &dbg0_stack[sizeof(dbg0_stack)]) - 1;
	np->np_pcpu = (register_t) pc;
	common_tss[0].tss_ist4 = (long) np;

	/* Set the IO permission bitmap (empty due to tss seg limit) */
	common_tss[0].tss_iobase = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE;

	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	ltr(gsel_tss);

	amd64_conf_fast_syscall();

	/*
	 * Temporarily forge a valid pointer to the PCB for the exception
	 * handlers.  It is reinitialized properly below after the FPU is
	 * set up.  Also set up td_critnest to short-cut the page
	 * fault handler.
	 */
	cpu_max_ext_state_size = sizeof(struct savefpu);
	thread0.td_pcb = get_pcb_td(&thread0);
	thread0.td_critnest = 1;

	/*
	 * The console and kdb should be initialized even earlier than here,
	 * but some console drivers don't work until after getmemsize().
	 * Default to late console initialization to support these drivers.
	 * This loses mainly printf()s in getmemsize() and early debugging.
	 */
	late_console = 1;
	TUNABLE_INT_FETCH("debug.late_console", &late_console);
	if (!late_console) {
		cninit();
		amd64_kdb_init();
	}

	getmemsize(kmdp, physfree);
	init_param2(physmem);

	/* now running on new page tables, configured, and u/iom is accessible */

	if (late_console)
		cninit();

#ifdef DEV_ISA
#ifdef DEV_ATPIC
	elcr_probe();
	atpic_startup();
#else
	/* Reset and mask the atpics and leave them shut down. */
	atpic_reset();

	/*
	 * Point the ICU spurious interrupt vectors at the APIC spurious
	 * interrupt handler.
	 */
	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
#endif
#else
#error "have you forgotten the isa device?";
#endif

	if (late_console)
		amd64_kdb_init();

	msgbufinit(msgbufp, msgbufsize);
	fpuinit();

	/*
	 * Set up thread0 pcb after fpuinit calculated pcb + fpu save
	 * area size.  Zero out the extended state header in fpu save
	 * area.
	 */
	thread0.td_pcb = get_pcb_td(&thread0);
	thread0.td_pcb->pcb_save = get_pcb_user_save_td(&thread0);
	bzero(get_pcb_user_save_td(&thread0), cpu_max_ext_state_size);
	if (use_xsave) {
		xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) +
		    1);
		xhdr->xstate_bv = xsave_mask;
	}
	/* make an initial tss so cpu can get interrupt stack on syscall! */
	rsp0 = (vm_offset_t)thread0.td_pcb;
	/* Ensure the stack is aligned to 16 bytes */
	rsp0 &= ~0xFul;
	common_tss[0].tss_rsp0 = rsp0;
	PCPU_SET(rsp0, rsp0);
	PCPU_SET(pti_rsp0, ((vm_offset_t)PCPU_PTR(pti_stack) +
	    PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful);
	PCPU_SET(curpcb, thread0.td_pcb);

	/* transfer to user mode */

	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
	_ufssel = GSEL(GUFS32_SEL, SEL_UPL);
	_ugssel = GSEL(GUGS32_SEL, SEL_UPL);

	load_ds(_udatasel);
	load_es(_udatasel);
	load_fs(_ufssel);

	/* setup proc 0's pcb */
	thread0.td_pcb->pcb_flags = 0;
	thread0.td_frame = &proc0_tf;

	env = kern_getenv("kernelname");
	if (env != NULL)
		strlcpy(kernelname, env, sizeof(kernelname));

	cpu_probe_amdc1e();

#ifdef FDT
	x86_init_fdt();
#endif
	thread0.td_critnest = 0;

	TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
	TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable);

	/* Location of kernel stack for locore */
	return ((u_int64_t)thread0.td_pcb);
}

void
cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
{

	pcpu->pc_acpi_id = 0xffffffff;
}

static int
smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	struct bios_smap *smapbase;
	struct bios_smap_xattr smap;
	caddr_t kmdp;
	uint32_t *smapattr;
	int count, error, i;

	/* Retrieve the system memory map from the loader. */
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	smapbase = (struct bios_smap *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);
	if (smapbase == NULL)
		return (0);
	smapattr = (uint32_t *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
	count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase);
	error = 0;
	for (i = 0; i < count; i++) {
		smap.base = smapbase[i].base;
		smap.length = smapbase[i].length;
		smap.type = smapbase[i].type;
		if (smapattr != NULL)
			smap.xattr = smapattr[i];
		else
			smap.xattr = 0;
		error = SYSCTL_OUT(req, &smap, sizeof(smap));
	}
	return (error);
}
SYSCTL_PROC(_machdep, OID_AUTO, smap, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
    smap_sysctl_handler, "S,bios_smap_xattr", "Raw BIOS SMAP data");

static int
efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	struct efi_map_header *efihdr;
	caddr_t kmdp;
	uint32_t efisize;

	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
	if (efihdr == NULL)
		return (0);
	efisize = *((uint32_t *)efihdr - 1);
	return (SYSCTL_OUT(req, efihdr, efisize));
}
SYSCTL_PROC(_machdep, OID_AUTO, efi_map, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
    efi_map_sysctl_handler, "S,efi_map_header", "Raw EFI Memory Map");

void
spinlock_enter(void)
{
	struct thread *td;
	register_t flags;

	td = curthread;
	if (td->td_md.md_spinlock_count == 0) {
		flags = intr_disable();
		td->td_md.md_spinlock_count = 1;
		td->td_md.md_saved_flags = flags;
	} else
		td->td_md.md_spinlock_count++;
	critical_enter();
}

void
spinlock_exit(void)
{
	struct thread *td;
	register_t flags;

	td = curthread;
	critical_exit();
	flags = td->td_md.md_saved_flags;
	td->td_md.md_spinlock_count--;
	if (td->td_md.md_spinlock_count == 0)
		intr_restore(flags);
}

/*
 * Construct a PCB from a trapframe. This is called from kdb_trap() where
 * we want to start a backtrace from the function that caused us to enter
 * the debugger. We have the context in the trapframe, but base the trace
 * on the PCB. The PCB doesn't have to be perfect, as long as it contains
 * enough for a backtrace.
 */
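/*
 * Only the callee-saved registers plus %rip and %rsp are copied below;
 * that is sufficient for the debugger's unwinder, which walks the chain
 * of saved frame pointers.
 */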
/*
 * Construct a PCB from a trapframe.  This is called from kdb_trap() where
 * we want to start a backtrace from the function that caused us to enter
 * the debugger.  We have the context in the trapframe, but base the trace
 * on the PCB.  The PCB doesn't have to be perfect, as long as it contains
 * enough for a backtrace.
 */
void
makectx(struct trapframe *tf, struct pcb *pcb)
{

	pcb->pcb_r12 = tf->tf_r12;
	pcb->pcb_r13 = tf->tf_r13;
	pcb->pcb_r14 = tf->tf_r14;
	pcb->pcb_r15 = tf->tf_r15;
	pcb->pcb_rbp = tf->tf_rbp;
	pcb->pcb_rbx = tf->tf_rbx;
	pcb->pcb_rip = tf->tf_rip;
	pcb->pcb_rsp = tf->tf_rsp;
}

int
ptrace_set_pc(struct thread *td, unsigned long addr)
{

	td->td_frame->tf_rip = addr;
	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
	return (0);
}

int
ptrace_single_step(struct thread *td)
{

	td->td_frame->tf_rflags |= PSL_T;
	return (0);
}

int
ptrace_clear_single_step(struct thread *td)
{

	td->td_frame->tf_rflags &= ~PSL_T;
	return (0);
}

int
fill_regs(struct thread *td, struct reg *regs)
{
	struct trapframe *tp;

	tp = td->td_frame;
	return (fill_frame_regs(tp, regs));
}

int
fill_frame_regs(struct trapframe *tp, struct reg *regs)
{

	regs->r_r15 = tp->tf_r15;
	regs->r_r14 = tp->tf_r14;
	regs->r_r13 = tp->tf_r13;
	regs->r_r12 = tp->tf_r12;
	regs->r_r11 = tp->tf_r11;
	regs->r_r10 = tp->tf_r10;
	regs->r_r9 = tp->tf_r9;
	regs->r_r8 = tp->tf_r8;
	regs->r_rdi = tp->tf_rdi;
	regs->r_rsi = tp->tf_rsi;
	regs->r_rbp = tp->tf_rbp;
	regs->r_rbx = tp->tf_rbx;
	regs->r_rdx = tp->tf_rdx;
	regs->r_rcx = tp->tf_rcx;
	regs->r_rax = tp->tf_rax;
	regs->r_rip = tp->tf_rip;
	regs->r_cs = tp->tf_cs;
	regs->r_rflags = tp->tf_rflags;
	regs->r_rsp = tp->tf_rsp;
	regs->r_ss = tp->tf_ss;
	if (tp->tf_flags & TF_HASSEGS) {
		regs->r_ds = tp->tf_ds;
		regs->r_es = tp->tf_es;
		regs->r_fs = tp->tf_fs;
		regs->r_gs = tp->tf_gs;
	} else {
		regs->r_ds = 0;
		regs->r_es = 0;
		regs->r_fs = 0;
		regs->r_gs = 0;
	}
	return (0);
}

int
set_regs(struct thread *td, struct reg *regs)
{
	struct trapframe *tp;
	register_t rflags;

	tp = td->td_frame;
	rflags = regs->r_rflags & 0xffffffff;
	if (!EFL_SECURE(rflags, tp->tf_rflags) || !CS_SECURE(regs->r_cs))
		return (EINVAL);
	tp->tf_r15 = regs->r_r15;
	tp->tf_r14 = regs->r_r14;
	tp->tf_r13 = regs->r_r13;
	tp->tf_r12 = regs->r_r12;
	tp->tf_r11 = regs->r_r11;
	tp->tf_r10 = regs->r_r10;
	tp->tf_r9 = regs->r_r9;
	tp->tf_r8 = regs->r_r8;
	tp->tf_rdi = regs->r_rdi;
	tp->tf_rsi = regs->r_rsi;
	tp->tf_rbp = regs->r_rbp;
	tp->tf_rbx = regs->r_rbx;
	tp->tf_rdx = regs->r_rdx;
	tp->tf_rcx = regs->r_rcx;
	tp->tf_rax = regs->r_rax;
	tp->tf_rip = regs->r_rip;
	tp->tf_cs = regs->r_cs;
	tp->tf_rflags = rflags;
	tp->tf_rsp = regs->r_rsp;
	tp->tf_ss = regs->r_ss;
	if (0) {	/* XXXKIB */
		tp->tf_ds = regs->r_ds;
		tp->tf_es = regs->r_es;
		tp->tf_fs = regs->r_fs;
		tp->tf_gs = regs->r_gs;
		tp->tf_flags = TF_HASSEGS;
	}
	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
	return (0);
}
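/*
 * fill_regs()/set_regs() above are the machine-dependent backends of
 * the PT_GETREGS/PT_SETREGS ptrace(2) requests, and
 * ptrace_single_step() backs PT_STEP.  A minimal userland sketch
 * (illustrative only; "new_pc" is a hypothetical address):
 *
 *	struct reg r;
 *
 *	ptrace(PT_GETREGS, pid, (caddr_t)&r, 0);
 *	r.r_rip = new_pc;
 *	ptrace(PT_SETREGS, pid, (caddr_t)&r, 0);
 *	ptrace(PT_STEP, pid, (caddr_t)1, 0);	then waitpid() for the
 *						single-step stop
 */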
/* XXX check all this stuff! */
/* externalize from sv_xmm */
static void
fill_fpregs_xmm(struct savefpu *sv_xmm, struct fpreg *fpregs)
{
	struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
	struct envxmm *penv_xmm = &sv_xmm->sv_env;
	int i;

	/* pcb -> fpregs */
	bzero(fpregs, sizeof(*fpregs));

	/* FPU control/status */
	penv_fpreg->en_cw = penv_xmm->en_cw;
	penv_fpreg->en_sw = penv_xmm->en_sw;
	penv_fpreg->en_tw = penv_xmm->en_tw;
	penv_fpreg->en_opcode = penv_xmm->en_opcode;
	penv_fpreg->en_rip = penv_xmm->en_rip;
	penv_fpreg->en_rdp = penv_xmm->en_rdp;
	penv_fpreg->en_mxcsr = penv_xmm->en_mxcsr;
	penv_fpreg->en_mxcsr_mask = penv_xmm->en_mxcsr_mask;

	/* FPU registers */
	for (i = 0; i < 8; ++i)
		bcopy(sv_xmm->sv_fp[i].fp_acc.fp_bytes, fpregs->fpr_acc[i], 10);

	/* SSE registers */
	for (i = 0; i < 16; ++i)
		bcopy(sv_xmm->sv_xmm[i].xmm_bytes, fpregs->fpr_xacc[i], 16);
}

/* internalize from fpregs into sv_xmm */
static void
set_fpregs_xmm(struct fpreg *fpregs, struct savefpu *sv_xmm)
{
	struct envxmm *penv_xmm = &sv_xmm->sv_env;
	struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
	int i;

	/* fpregs -> pcb */
	/* FPU control/status */
	penv_xmm->en_cw = penv_fpreg->en_cw;
	penv_xmm->en_sw = penv_fpreg->en_sw;
	penv_xmm->en_tw = penv_fpreg->en_tw;
	penv_xmm->en_opcode = penv_fpreg->en_opcode;
	penv_xmm->en_rip = penv_fpreg->en_rip;
	penv_xmm->en_rdp = penv_fpreg->en_rdp;
	penv_xmm->en_mxcsr = penv_fpreg->en_mxcsr;
	penv_xmm->en_mxcsr_mask = penv_fpreg->en_mxcsr_mask & cpu_mxcsr_mask;

	/* FPU registers */
	for (i = 0; i < 8; ++i)
		bcopy(fpregs->fpr_acc[i], sv_xmm->sv_fp[i].fp_acc.fp_bytes, 10);

	/* SSE registers */
	for (i = 0; i < 16; ++i)
		bcopy(fpregs->fpr_xacc[i], sv_xmm->sv_xmm[i].xmm_bytes, 16);
}

/* externalize from td->pcb */
int
fill_fpregs(struct thread *td, struct fpreg *fpregs)
{

	KASSERT(td == curthread || TD_IS_SUSPENDED(td) ||
	    P_SHOULDSTOP(td->td_proc),
	    ("not suspended thread %p", td));
	fpugetregs(td);
	fill_fpregs_xmm(get_pcb_user_save_td(td), fpregs);
	return (0);
}

/* internalize to td->pcb */
int
set_fpregs(struct thread *td, struct fpreg *fpregs)
{

	critical_enter();
	set_fpregs_xmm(fpregs, get_pcb_user_save_td(td));
	fpuuserinited(td);
	critical_exit();
	return (0);
}
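/*
 * fill_fpregs()/set_fpregs() above back the PT_GETFPREGS and
 * PT_SETFPREGS ptrace(2) requests.  An illustrative userland sketch
 * only:
 *
 *	struct fpreg fpr;
 *
 *	ptrace(PT_GETFPREGS, pid, (caddr_t)&fpr, 0);
 *	... inspect or modify fpr.fpr_env, fpr.fpr_acc, fpr.fpr_xacc ...
 *	ptrace(PT_SETFPREGS, pid, (caddr_t)&fpr, 0);
 */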
/*
 * Get machine context.
 */
int
get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
{
	struct pcb *pcb;
	struct trapframe *tp;

	pcb = td->td_pcb;
	tp = td->td_frame;
	PROC_LOCK(curthread->td_proc);
	mcp->mc_onstack = sigonstack(tp->tf_rsp);
	PROC_UNLOCK(curthread->td_proc);
	mcp->mc_r15 = tp->tf_r15;
	mcp->mc_r14 = tp->tf_r14;
	mcp->mc_r13 = tp->tf_r13;
	mcp->mc_r12 = tp->tf_r12;
	mcp->mc_r11 = tp->tf_r11;
	mcp->mc_r10 = tp->tf_r10;
	mcp->mc_r9 = tp->tf_r9;
	mcp->mc_r8 = tp->tf_r8;
	mcp->mc_rdi = tp->tf_rdi;
	mcp->mc_rsi = tp->tf_rsi;
	mcp->mc_rbp = tp->tf_rbp;
	mcp->mc_rbx = tp->tf_rbx;
	mcp->mc_rcx = tp->tf_rcx;
	mcp->mc_rflags = tp->tf_rflags;
	if (flags & GET_MC_CLEAR_RET) {
		mcp->mc_rax = 0;
		mcp->mc_rdx = 0;
		mcp->mc_rflags &= ~PSL_C;
	} else {
		mcp->mc_rax = tp->tf_rax;
		mcp->mc_rdx = tp->tf_rdx;
	}
	mcp->mc_rip = tp->tf_rip;
	mcp->mc_cs = tp->tf_cs;
	mcp->mc_rsp = tp->tf_rsp;
	mcp->mc_ss = tp->tf_ss;
	mcp->mc_ds = tp->tf_ds;
	mcp->mc_es = tp->tf_es;
	mcp->mc_fs = tp->tf_fs;
	mcp->mc_gs = tp->tf_gs;
	mcp->mc_flags = tp->tf_flags;
	mcp->mc_len = sizeof(*mcp);
	get_fpcontext(td, mcp, NULL, 0);
	update_pcb_bases(pcb);
	mcp->mc_fsbase = pcb->pcb_fsbase;
	mcp->mc_gsbase = pcb->pcb_gsbase;
	mcp->mc_xfpustate = 0;
	mcp->mc_xfpustate_len = 0;
	bzero(mcp->mc_spare, sizeof(mcp->mc_spare));
	return (0);
}

/*
 * Set machine context.
 *
 * Note that we only install the user-modifiable rflags bits, and we
 * never touch the %cs selector.
 */
int
set_mcontext(struct thread *td, mcontext_t *mcp)
{
	struct pcb *pcb;
	struct trapframe *tp;
	char *xfpustate;
	long rflags;
	int ret;

	pcb = td->td_pcb;
	tp = td->td_frame;
	if (mcp->mc_len != sizeof(*mcp) ||
	    (mcp->mc_flags & ~_MC_FLAG_MASK) != 0)
		return (EINVAL);
	rflags = (mcp->mc_rflags & PSL_USERCHANGE) |
	    (tp->tf_rflags & ~PSL_USERCHANGE);
	if (mcp->mc_flags & _MC_HASFPXSTATE) {
		if (mcp->mc_xfpustate_len > cpu_max_ext_state_size -
		    sizeof(struct savefpu))
			return (EINVAL);
		xfpustate = __builtin_alloca(mcp->mc_xfpustate_len);
		ret = copyin((void *)mcp->mc_xfpustate, xfpustate,
		    mcp->mc_xfpustate_len);
		if (ret != 0)
			return (ret);
	} else
		xfpustate = NULL;
	ret = set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len);
	if (ret != 0)
		return (ret);
	tp->tf_r15 = mcp->mc_r15;
	tp->tf_r14 = mcp->mc_r14;
	tp->tf_r13 = mcp->mc_r13;
	tp->tf_r12 = mcp->mc_r12;
	tp->tf_r11 = mcp->mc_r11;
	tp->tf_r10 = mcp->mc_r10;
	tp->tf_r9 = mcp->mc_r9;
	tp->tf_r8 = mcp->mc_r8;
	tp->tf_rdi = mcp->mc_rdi;
	tp->tf_rsi = mcp->mc_rsi;
	tp->tf_rbp = mcp->mc_rbp;
	tp->tf_rbx = mcp->mc_rbx;
	tp->tf_rdx = mcp->mc_rdx;
	tp->tf_rcx = mcp->mc_rcx;
	tp->tf_rax = mcp->mc_rax;
	tp->tf_rip = mcp->mc_rip;
	tp->tf_rflags = rflags;
	tp->tf_rsp = mcp->mc_rsp;
	tp->tf_ss = mcp->mc_ss;
	tp->tf_flags = mcp->mc_flags;
	if (tp->tf_flags & TF_HASSEGS) {
		tp->tf_ds = mcp->mc_ds;
		tp->tf_es = mcp->mc_es;
		tp->tf_fs = mcp->mc_fs;
		tp->tf_gs = mcp->mc_gs;
	}
	set_pcb_flags(pcb, PCB_FULL_IRET);
	if (mcp->mc_flags & _MC_HASBASES) {
		pcb->pcb_fsbase = mcp->mc_fsbase;
		pcb->pcb_gsbase = mcp->mc_gsbase;
	}
	return (0);
}
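/*
 * get_mcontext() and set_mcontext() above are the machine-dependent
 * halves of getcontext(3)/setcontext(3) and swapcontext(3); the same
 * pair also builds and consumes signal contexts.  An illustrative
 * userland round trip (sketch only):
 *
 *	ucontext_t uc;
 *
 *	getcontext(&uc);		uc.uc_mcontext filled via
 *					get_mcontext()
 *	... modify uc.uc_mcontext ...
 *	setcontext(&uc);		validated and installed via
 *					set_mcontext()
 */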
static void
get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave,
    size_t xfpusave_len)
{
	size_t max_len, len;

	mcp->mc_ownedfp = fpugetregs(td);
	bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate[0],
	    sizeof(mcp->mc_fpstate));
	mcp->mc_fpformat = fpuformat();
	if (!use_xsave || xfpusave_len == 0)
		return;
	max_len = cpu_max_ext_state_size - sizeof(struct savefpu);
	len = xfpusave_len;
	if (len > max_len) {
		len = max_len;
		/* Zero the part of the caller's buffer we cannot fill. */
		bzero(xfpusave + max_len, xfpusave_len - max_len);
	}
	mcp->mc_flags |= _MC_HASFPXSTATE;
	mcp->mc_xfpustate_len = len;
	bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len);
}

static int
set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate,
    size_t xfpustate_len)
{
	int error;

	if (mcp->mc_fpformat == _MC_FPFMT_NODEV)
		return (0);
	else if (mcp->mc_fpformat != _MC_FPFMT_XMM)
		return (EINVAL);
	else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) {
		/* We don't care what state is left in the FPU or PCB. */
		fpstate_drop(td);
		error = 0;
	} else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU ||
	    mcp->mc_ownedfp == _MC_FPOWNED_PCB) {
		error = fpusetregs(td, (struct savefpu *)&mcp->mc_fpstate,
		    xfpustate, xfpustate_len);
	} else
		return (EINVAL);
	return (error);
}

void
fpstate_drop(struct thread *td)
{

	KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu"));
	critical_enter();
	if (PCPU_GET(fpcurthread) == td)
		fpudrop();
	/*
	 * XXX force a full drop of the fpu.  The above only drops it if we
	 * owned it.
	 *
	 * XXX I don't much like fpugetuserregs()'s semantics of doing a full
	 * drop.  Dropping only to the pcb matches fnsave's behaviour.
	 * We only need to drop to !PCB_INITDONE in sendsig().  But
	 * sendsig() is the only caller of fpugetuserregs()... perhaps we
	 * just have too many layers.
	 */
	clear_pcb_flags(curthread->td_pcb,
	    PCB_FPUINITDONE | PCB_USERFPUINITDONE);
	critical_exit();
}

int
fill_dbregs(struct thread *td, struct dbreg *dbregs)
{
	struct pcb *pcb;

	if (td == NULL) {
		dbregs->dr[0] = rdr0();
		dbregs->dr[1] = rdr1();
		dbregs->dr[2] = rdr2();
		dbregs->dr[3] = rdr3();
		dbregs->dr[6] = rdr6();
		dbregs->dr[7] = rdr7();
	} else {
		pcb = td->td_pcb;
		dbregs->dr[0] = pcb->pcb_dr0;
		dbregs->dr[1] = pcb->pcb_dr1;
		dbregs->dr[2] = pcb->pcb_dr2;
		dbregs->dr[3] = pcb->pcb_dr3;
		dbregs->dr[6] = pcb->pcb_dr6;
		dbregs->dr[7] = pcb->pcb_dr7;
	}
	dbregs->dr[4] = 0;
	dbregs->dr[5] = 0;
	dbregs->dr[8] = 0;
	dbregs->dr[9] = 0;
	dbregs->dr[10] = 0;
	dbregs->dr[11] = 0;
	dbregs->dr[12] = 0;
	dbregs->dr[13] = 0;
	dbregs->dr[14] = 0;
	dbregs->dr[15] = 0;
	return (0);
}
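/*
 * fill_dbregs() above and set_dbregs() below back the PT_GETDBREGS and
 * PT_SETDBREGS ptrace(2) requests.  Illustrative sketch of a debugger
 * arming hardware breakpoint 0 on a hypothetical user address "addr":
 *
 *	struct dbreg db;
 *
 *	ptrace(PT_GETDBREGS, pid, (caddr_t)&db, 0);
 *	db.dr[0] = addr;	breakpoint 0 linear address
 *	db.dr[7] |= 0x1;	set L0, locally enabling breakpoint 0
 *	ptrace(PT_SETDBREGS, pid, (caddr_t)&db, 0);
 *
 * set_dbregs() rejects enabled breakpoints outside the user address
 * space as well as undefined dr7 bit patterns.
 */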
int
set_dbregs(struct thread *td, struct dbreg *dbregs)
{
	struct pcb *pcb;
	int i;

	if (td == NULL) {
		load_dr0(dbregs->dr[0]);
		load_dr1(dbregs->dr[1]);
		load_dr2(dbregs->dr[2]);
		load_dr3(dbregs->dr[3]);
		load_dr6(dbregs->dr[6]);
		load_dr7(dbregs->dr[7]);
	} else {
		/*
		 * Don't let an illegal value for dr7 get set.  Specifically,
		 * check for undefined settings.  Setting these bit patterns
		 * results in undefined behaviour and can lead to an
		 * unexpected TRCTRAP or a general protection fault right
		 * here.  The upper bits of dr6 and dr7 must not be set.
		 */
		for (i = 0; i < 4; i++) {
			if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02)
				return (EINVAL);
			if (td->td_frame->tf_cs == _ucode32sel &&
			    DBREG_DR7_LEN(dbregs->dr[7], i) == DBREG_DR7_LEN_8)
				return (EINVAL);
		}
		if ((dbregs->dr[6] & 0xffffffff00000000ul) != 0 ||
		    (dbregs->dr[7] & 0xffffffff00000000ul) != 0)
			return (EINVAL);

		pcb = td->td_pcb;

		/*
		 * Don't let a process set a breakpoint that is not within
		 * the process's address space.  If a process could do this,
		 * it could halt the system by setting a breakpoint in the
		 * kernel (if ddb was enabled).  Thus, we need to make sure
		 * that no breakpoints are being enabled for addresses
		 * outside the process's address space.
		 *
		 * XXX - what about when the watched area of the user's
		 * address space is written into from within the kernel
		 * ... wouldn't that still cause a breakpoint to be generated
		 * from within kernel mode?
		 */

		if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) {
			/* dr0 is enabled */
			if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS)
				return (EINVAL);
		}
		if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) {
			/* dr1 is enabled */
			if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS)
				return (EINVAL);
		}
		if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) {
			/* dr2 is enabled */
			if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS)
				return (EINVAL);
		}
		if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) {
			/* dr3 is enabled */
			if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS)
				return (EINVAL);
		}

		pcb->pcb_dr0 = dbregs->dr[0];
		pcb->pcb_dr1 = dbregs->dr[1];
		pcb->pcb_dr2 = dbregs->dr[2];
		pcb->pcb_dr3 = dbregs->dr[3];
		pcb->pcb_dr6 = dbregs->dr[6];
		pcb->pcb_dr7 = dbregs->dr[7];

		set_pcb_flags(pcb, PCB_DBREGS);
	}

	return (0);
}

void
reset_dbregs(void)
{

	load_dr7(0);	/* Turn off the control bits first. */
	load_dr0(0);
	load_dr1(0);
	load_dr2(0);
	load_dr3(0);
	load_dr6(0);
}
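/*
 * For reference when reading user_dbreg_trap() below, the debug
 * register bits it tests (per the usual x86 layout):
 *
 *	%dr6 bits 0-3	B0-B3: which breakpoint(s) triggered
 *	%dr7 bits 0-7	L0/G0 .. L3/G3: per-breakpoint enable bits
 *
 * which is why the function masks dr6 with 0x0000000f and dr7 with
 * 0x000000ff.
 */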
/*
 * Return > 0 if a hardware breakpoint has been hit, and the
 * breakpoint was in user space.  Return 0, otherwise.
 */
int
user_dbreg_trap(void)
{
	u_int64_t dr7, dr6;	/* debug registers dr6 and dr7 */
	u_int64_t bp;		/* breakpoint bits extracted from dr6 */
	int nbp;		/* number of breakpoints that triggered */
	caddr_t addr[4];	/* breakpoint addresses */
	int i;

	dr7 = rdr7();
	if ((dr7 & 0x000000ff) == 0) {
		/*
		 * All GE and LE bits in the dr7 register are zero,
		 * thus the trap couldn't have been caused by the
		 * hardware debug registers.
		 */
		return (0);
	}

	nbp = 0;
	dr6 = rdr6();
	bp = dr6 & 0x0000000f;

	if (!bp) {
		/*
		 * None of the breakpoint bits are set, meaning this
		 * trap was not caused by any of the debug registers.
		 */
		return (0);
	}

	/*
	 * At least one of the breakpoints was hit; check which ones
	 * and whether any of the addresses are in user space.
	 */

	if (bp & 0x01) {
		addr[nbp++] = (caddr_t)rdr0();
	}
	if (bp & 0x02) {
		addr[nbp++] = (caddr_t)rdr1();
	}
	if (bp & 0x04) {
		addr[nbp++] = (caddr_t)rdr2();
	}
	if (bp & 0x08) {
		addr[nbp++] = (caddr_t)rdr3();
	}

	for (i = 0; i < nbp; i++) {
		if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) {
			/*
			 * addr[i] is in user space.
			 */
			return (nbp);
		}
	}

	/*
	 * None of the breakpoints are in user space.
	 */
	return (0);
}

/*
 * pcb_flags is only modified by the current thread, or by other threads
 * when the current thread is stopped.  However, the current thread may
 * change it from the interrupt context in cpu_switch(), or in the trap
 * handler.  When we read-modify-write pcb_flags from C sources, the
 * compiler may generate code that is not atomic with respect to the
 * interrupt handler.  If a trap or interrupt happens and any flag is
 * modified from the handler, it can be clobbered with the cached value
 * later.  Therefore, we implement setting and clearing flags with
 * single-instruction functions, which do not race with possible
 * modification of the flags from the trap or interrupt context, because
 * traps and interrupts are executed only on instruction boundaries.
 */
void
set_pcb_flags_raw(struct pcb *pcb, const u_int flags)
{

	__asm __volatile("orl %1,%0"
	    : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags)
	    : "cc", "memory");
}
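/*
 * Illustration of the race avoided by set_pcb_flags_raw() above: a
 * plain C read-modify-write such as
 *
 *	pcb->pcb_flags |= flags;
 *
 * may compile into separate load, or, and store instructions.  If an
 * interrupt arrives between the load and the store and its handler
 * modifies pcb_flags, the store then writes back a stale value.  The
 * single "orl" instruction cannot be split this way: an interrupt can
 * occur only before or after the whole update.
 */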
/*
 * The support for RDFSBASE, WRFSBASE and the similar instructions for
 * the %gs base requires that the kernel save MSR_FSBASE and
 * MSR_{K,}GSBASE into the pcb if user space modified the bases.  We
 * must save them on a context switch, or when the return to usermode
 * happens through doreti.
 *
 * Tracking of both events is performed by the pcb flag PCB_FULL_IRET,
 * which has the consequence that the base MSRs must be saved each time
 * the PCB_FULL_IRET flag is set.  We disable interrupts to sync with
 * context switches.
 */
void
set_pcb_flags(struct pcb *pcb, const u_int flags)
{
	register_t r;

	if (curpcb == pcb &&
	    (flags & PCB_FULL_IRET) != 0 &&
	    (pcb->pcb_flags & PCB_FULL_IRET) == 0 &&
	    (cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0) {
		r = intr_disable();
		if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) {
			if (rfs() == _ufssel)
				pcb->pcb_fsbase = rdfsbase();
			if (rgs() == _ugssel)
				pcb->pcb_gsbase = rdmsr(MSR_KGSBASE);
		}
		set_pcb_flags_raw(pcb, flags);
		intr_restore(r);
	} else {
		set_pcb_flags_raw(pcb, flags);
	}
}

void
clear_pcb_flags(struct pcb *pcb, const u_int flags)
{

	__asm __volatile("andl %1,%0"
	    : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags)
	    : "cc", "memory");
}

#ifdef KDB

/*
 * Provide inb() and outb() as functions.  They are normally only
 * available as inline functions, and thus cannot be called from the
 * debugger.
 */

/* silence compiler warnings */
u_char inb_(u_short);
void outb_(u_short, u_char);

u_char
inb_(u_short port)
{
	return (inb(port));
}

void
outb_(u_short port, u_char data)
{
	outb(port, data);
}

#endif /* KDB */
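/*
 * Illustrative use of the wrappers above from the ddb(4) prompt (a
 * sketch; port 0x21, the master 8259 interrupt-mask register, is just
 * an example):
 *
 *	db> call inb_(0x21)
 *	db> call outb_(0x21, 0xff)
 */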