machdep.c revision 323431
/*-
 * Copyright (c) 2003 Peter Wemm.
 * Copyright (c) 1992 Terrence R. Lambert.
 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/11/sys/amd64/amd64/machdep.c 323431 2017-09-11 08:48:36Z kib $");

#include "opt_atpic.h"
#include "opt_compat.h"
#include "opt_cpu.h"
#include "opt_ddb.h"
#include "opt_inet.h"
#include "opt_isa.h"
#include "opt_kstack_pages.h"
#include "opt_maxmem.h"
#include "opt_mp_watchdog.h"
#include "opt_perfmon.h"
#include "opt_platform.h"
#include "opt_sched.h"

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/bus.h>
#include <sys/callout.h>
#include <sys/cons.h>
#include <sys/cpu.h>
#include <sys/efi.h>
#include <sys/eventhandler.h>
#include <sys/exec.h>
#include <sys/imgact.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/linker.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/memrange.h>
#include <sys/msgbuf.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/ptrace.h>
#include <sys/reboot.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#ifdef SMP
#include <sys/smp.h>
#endif
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/ucontext.h>
#include <sys/vmmeter.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#include <vm/vm_param.h>

#ifdef DDB
#ifndef KDB
#error KDB must be enabled in order for DDB to work!
#endif
#include <ddb/ddb.h>
#include <ddb/db_sym.h>
#endif

#include <net/netisr.h>

#include <machine/clock.h>
#include <machine/cpu.h>
#include <machine/cputypes.h>
#include <machine/intr_machdep.h>
#include <x86/mca.h>
#include <machine/md_var.h>
#include <machine/metadata.h>
#include <machine/mp_watchdog.h>
#include <machine/pc/bios.h>
#include <machine/pcb.h>
#include <machine/proc.h>
#include <machine/reg.h>
#include <machine/sigframe.h>
#include <machine/specialreg.h>
#ifdef PERFMON
#include <machine/perfmon.h>
#endif
#include <machine/tss.h>
#ifdef SMP
#include <machine/smp.h>
#endif
#ifdef FDT
#include <x86/fdt.h>
#endif

#ifdef DEV_ATPIC
#include <x86/isa/icu.h>
#else
#include <x86/apicvar.h>
#endif

#include <isa/isareg.h>
#include <isa/rtc.h>
#include <x86/init.h>

/* Sanity check for __curthread() */
CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);

extern u_int64_t hammer_time(u_int64_t, u_int64_t);

#define	CS_SECURE(cs)		(ISPL(cs) == SEL_UPL)
#define	EFL_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)

static void cpu_startup(void *);
static void get_fpcontext(struct thread *td, mcontext_t *mcp,
    char *xfpusave, size_t xfpusave_len);
static int set_fpcontext(struct thread *td, mcontext_t *mcp,
    char *xfpustate, size_t xfpustate_len);
SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);

/* Preload data parse function */
static caddr_t native_parse_preload_data(u_int64_t);

/* Native function to fetch and parse the e820 map */
static void native_parse_memmap(caddr_t, vm_paddr_t *, int *);

/* Default init_ops implementation. */
struct init_ops init_ops = {
	.parse_preload_data =		native_parse_preload_data,
	.early_clock_source_init =	i8254_init,
	.early_delay =			i8254_delay,
	.parse_memmap =			native_parse_memmap,
#ifdef SMP
	.mp_bootaddress =		mp_bootaddress,
	.start_all_aps =		native_start_all_aps,
#endif
	.msi_init =			msi_init,
};
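
/*
 * The init_ops vector exists so that an alternative platform can
 * substitute its own early-boot hooks before hammer_time() consults
 * them.  A minimal sketch of such an override (the pv_* names are
 * hypothetical, not part of this file):
 *
 *	static struct init_ops pv_init_ops = {
 *		.parse_preload_data = pv_parse_preload_data,
 *		.early_clock_source_init = pv_clock_init,
 *		.early_delay = pv_delay,
 *		.parse_memmap = pv_parse_memmap,
 *		.msi_init = pv_msi_init,
 *	};
 *	init_ops = pv_init_ops;		// before any init_ops.* call
 */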
/*
 * The file "conf/ldscript.amd64" defines the symbol "kernphys".  Its value is
 * the physical address at which the kernel is loaded.
 */
extern char kernphys[];

struct msgbuf *msgbufp;

/*
 * Physical address of the EFI System Table.  Stashed from the metadata hints
 * passed into the kernel and used by the EFI code to call runtime services.
 */
vm_paddr_t efi_systbl_phys;

/* Intel ICH registers */
#define	ICH_PMBASE	0x400
#define	ICH_SMI_EN	(ICH_PMBASE + 0x30)

int	_udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel;

int cold = 1;

long Maxmem = 0;
long realmem = 0;

/*
 * The number of PHYSMAP entries must be one less than the number of
 * PHYSSEG entries because the PHYSMAP entry that spans the largest
 * physical address that is accessible by ISA DMA is split into two
 * PHYSSEG entries.
 */
#define	PHYSMAP_SIZE	(2 * (VM_PHYSSEG_MAX - 1))

vm_paddr_t phys_avail[PHYSMAP_SIZE + 2];
vm_paddr_t dump_avail[PHYSMAP_SIZE + 2];

/* must be 2 less so 0 0 can signal end of chunks */
#define	PHYS_AVAIL_ARRAY_END	(nitems(phys_avail) - 2)
#define	DUMP_AVAIL_ARRAY_END	(nitems(dump_avail) - 2)

struct kva_md_info kmi;

static struct trapframe proc0_tf;
struct region_descriptor r_gdt, r_idt;

struct pcpu __pcpu[MAXCPU];

struct mtx icu_lock;

struct mem_range_softc mem_range_softc;

struct mtx dt_lock;	/* lock for GDT and LDT */

void (*vmm_resume_p)(void);
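
/*
 * phys_avail[] and dump_avail[] hold base/end pairs and are terminated
 * by a zero pair, so consumers walk them two entries at a time, as in
 * this illustrative fragment (cpu_startup() below does the same):
 *
 *	vm_paddr_t base, end;
 *	int i;
 *
 *	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
 *		base = phys_avail[i];		// first usable byte
 *		end = phys_avail[i + 1];	// first byte past the chunk
 *	}
 */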
static void
cpu_startup(void *dummy)
{
	uintmax_t memsize;
	char *sysenv;

	/*
	 * On MacBooks, we need to prevent the legacy USB circuit from
	 * generating an SMI#, because this can cause several problems,
	 * namely: incorrect CPU frequency detection and failure to
	 * start the APs.
	 * We do this by disabling a bit in the SMI_EN (SMI Control and
	 * Enable register) of the Intel ICH LPC Interface Bridge.
	 */
	sysenv = kern_getenv("smbios.system.product");
	if (sysenv != NULL) {
		if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
		    strncmp(sysenv, "MacBook3,1", 10) == 0 ||
		    strncmp(sysenv, "MacBook4,1", 10) == 0 ||
		    strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
		    strncmp(sysenv, "Macmini1,1", 10) == 0) {
			if (bootverbose)
				printf("Disabling LEGACY_USB_EN bit on "
				    "Intel ICH.\n");
			outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
		}
		freeenv(sysenv);
	}

	/*
	 * Good {morning,afternoon,evening,night}.
	 */
	startrtclock();
	printcpuinfo();
#ifdef PERFMON
	perfmon_init();
#endif

	/*
	 * Display physical memory if SMBIOS reports a reasonable amount.
	 */
	memsize = 0;
	sysenv = kern_getenv("smbios.memory.enabled");
	if (sysenv != NULL) {
		memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
		freeenv(sysenv);
	}
	if (memsize < ptoa((uintmax_t)vm_cnt.v_free_count))
		memsize = ptoa((uintmax_t)Maxmem);
	printf("real memory  = %ju (%ju MB)\n", memsize, memsize >> 20);
	realmem = atop(memsize);

	/*
	 * Display any holes after the first chunk of extended memory.
	 */
	if (bootverbose) {
		int indx;

		printf("Physical memory chunk(s):\n");
		for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
			vm_paddr_t size;

			size = phys_avail[indx + 1] - phys_avail[indx];
			printf(
			    "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
			    (uintmax_t)phys_avail[indx],
			    (uintmax_t)phys_avail[indx + 1] - 1,
			    (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
		}
	}

	vm_ksubmap_init(&kmi);

	printf("avail memory = %ju (%ju MB)\n",
	    ptoa((uintmax_t)vm_cnt.v_free_count),
	    ptoa((uintmax_t)vm_cnt.v_free_count) / 1048576);

	/*
	 * Set up buffers, so they can be used to read disk labels.
	 */
	bufinit();
	vm_pager_bufferinit();

	cpu_setregs();
}

/*
 * Send an interrupt to a process.
 *
 * The stack is set up to allow the sigcode stored at its top to call
 * the handler routine, followed by a call to the sigreturn routine
 * below.  After sigreturn resets the signal mask, the stack, and the
 * frame pointer, it returns to the user-specified pc, psl.
 */
void
sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
	struct sigframe sf, *sfp;
	struct pcb *pcb;
	struct proc *p;
	struct thread *td;
	struct sigacts *psp;
	char *sp;
	struct trapframe *regs;
	char *xfpusave;
	size_t xfpusave_len;
	int sig;
	int oonstack;

	td = curthread;
	pcb = td->td_pcb;
	p = td->td_proc;
	PROC_LOCK_ASSERT(p, MA_OWNED);
	sig = ksi->ksi_signo;
	psp = p->p_sigacts;
	mtx_assert(&psp->ps_mtx, MA_OWNED);
	regs = td->td_frame;
	oonstack = sigonstack(regs->tf_rsp);

	if (cpu_max_ext_state_size > sizeof(struct savefpu) && use_xsave) {
		xfpusave_len = cpu_max_ext_state_size - sizeof(struct savefpu);
		xfpusave = __builtin_alloca(xfpusave_len);
	} else {
		xfpusave_len = 0;
		xfpusave = NULL;
	}

	/* Save user context. */
	bzero(&sf, sizeof(sf));
	sf.sf_uc.uc_sigmask = *mask;
	sf.sf_uc.uc_stack = td->td_sigstk;
	sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
	    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
	sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
	bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(*regs));
	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
	get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len);
	fpstate_drop(td);
	update_pcb_bases(pcb);
	sf.sf_uc.uc_mcontext.mc_fsbase = pcb->pcb_fsbase;
	sf.sf_uc.uc_mcontext.mc_gsbase = pcb->pcb_gsbase;
	bzero(sf.sf_uc.uc_mcontext.mc_spare,
	    sizeof(sf.sf_uc.uc_mcontext.mc_spare));
	bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__));

	/* Allocate space for the signal handler context. */
	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		sp = (char *)td->td_sigstk.ss_sp + td->td_sigstk.ss_size;
#if defined(COMPAT_43)
		td->td_sigstk.ss_flags |= SS_ONSTACK;
#endif
	} else
		/* Skip the 128 byte amd64 ABI red zone. */
		sp = (char *)regs->tf_rsp - 128;
	if (xfpusave != NULL) {
		sp -= xfpusave_len;
		sp = (char *)((unsigned long)sp & ~0x3Ful);
		sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp;
	}
	sp -= sizeof(struct sigframe);
	/* Align to 16 bytes. */
	sfp = (struct sigframe *)((unsigned long)sp & ~0xFul);

	/* Build the argument list for the signal handler. */
	regs->tf_rdi = sig;			/* arg 1 in %rdi */
	regs->tf_rdx = (register_t)&sfp->sf_uc;	/* arg 3 in %rdx */
	bzero(&sf.sf_si, sizeof(sf.sf_si));
	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
		/* Signal handler installed with SA_SIGINFO. */
		regs->tf_rsi = (register_t)&sfp->sf_si;	/* arg 2 in %rsi */
		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;

		/* Fill in POSIX parts */
		sf.sf_si = ksi->ksi_info;
		sf.sf_si.si_signo = sig; /* maybe a translated signal */
		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
	} else {
		/* Old FreeBSD-style arguments. */
		regs->tf_rsi = ksi->ksi_code;	/* arg 2 in %rsi */
		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
		sf.sf_ahu.sf_handler = catcher;
	}
	mtx_unlock(&psp->ps_mtx);
	PROC_UNLOCK(p);

	/*
	 * Copy the sigframe out to the user's stack.
	 */
	if (copyout(&sf, sfp, sizeof(*sfp)) != 0 ||
	    (xfpusave != NULL && copyout(xfpusave,
	    (void *)sf.sf_uc.uc_mcontext.mc_xfpustate, xfpusave_len)
	    != 0)) {
#ifdef DEBUG
		printf("process %ld has trashed its stack\n", (long)p->p_pid);
#endif
		PROC_LOCK(p);
		sigexit(td, SIGILL);
	}

	regs->tf_rsp = (long)sfp;
	regs->tf_rip = p->p_sysent->sv_sigcode_base;
	regs->tf_rflags &= ~(PSL_T | PSL_D);
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_ss = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _ufssel;
	regs->tf_gs = _ugssel;
	regs->tf_flags = TF_HASSEGS;
	PROC_LOCK(p);
	mtx_lock(&psp->ps_mtx);
}
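
/*
 * The register setup above matches the userland SA_SIGINFO contract:
 * %rdi, %rsi and %rdx carry the three handler arguments.  A typical
 * consumer looks like this (illustrative userland fragment only):
 *
 *	static void
 *	handler(int sig, siginfo_t *si, void *ucp)
 *	{
 *		ucontext_t *uc = ucp;	// the &sfp->sf_uc built above
 *		...
 *	}
 *	// installed with sa.sa_flags |= SA_SIGINFO
 *
 * When the handler returns, the trampoline at sv_sigcode_base invokes
 * sigreturn(2) with that ucontext, which lands in sys_sigreturn() below.
 */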

/*
 * System call to clean up state after a signal has been taken.  Reset
 * signal mask and stack state from the context left by sendsig (above).
 * Return to the previous pc and psl as specified by the context left by
 * sendsig.  Check carefully to make sure that the user has not modified
 * the state to gain improper privileges.
 *
 * MPSAFE
 */
int
sys_sigreturn(struct thread *td, struct sigreturn_args /* {
	const struct __ucontext *sigcntxp;
} */ *uap)
{
	ucontext_t uc;
	struct pcb *pcb;
	struct proc *p;
	struct trapframe *regs;
	ucontext_t *ucp;
	char *xfpustate;
	size_t xfpustate_len;
	long rflags;
	int cs, error, ret;
	ksiginfo_t ksi;

	pcb = td->td_pcb;
	p = td->td_proc;

	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
	if (error != 0) {
		uprintf("pid %d (%s): sigreturn copyin failed\n",
		    p->p_pid, td->td_name);
		return (error);
	}
	ucp = &uc;
	if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) {
		uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid,
		    td->td_name, ucp->uc_mcontext.mc_flags);
		return (EINVAL);
	}
	regs = td->td_frame;
	rflags = ucp->uc_mcontext.mc_rflags;
	/*
	 * Don't allow users to change privileged or reserved flags.
	 */
	if (!EFL_SECURE(rflags, regs->tf_rflags)) {
		uprintf("pid %d (%s): sigreturn rflags = 0x%lx\n", p->p_pid,
		    td->td_name, rflags);
		return (EINVAL);
	}

	/*
	 * Don't allow users to load a valid privileged %cs.  Let the
	 * hardware check for invalid selectors, excess privilege in
	 * other selectors, invalid %eip's and invalid %esp's.
	 */
	cs = ucp->uc_mcontext.mc_cs;
	if (!CS_SECURE(cs)) {
		uprintf("pid %d (%s): sigreturn cs = 0x%x\n", p->p_pid,
		    td->td_name, cs);
		ksiginfo_init_trap(&ksi);
		ksi.ksi_signo = SIGBUS;
		ksi.ksi_code = BUS_OBJERR;
		ksi.ksi_trapno = T_PROTFLT;
		ksi.ksi_addr = (void *)regs->tf_rip;
		trapsignal(td, &ksi);
		return (EINVAL);
	}

	if ((uc.uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) {
		xfpustate_len = uc.uc_mcontext.mc_xfpustate_len;
		if (xfpustate_len > cpu_max_ext_state_size -
		    sizeof(struct savefpu)) {
			uprintf("pid %d (%s): sigreturn xfpusave_len = 0x%zx\n",
			    p->p_pid, td->td_name, xfpustate_len);
			return (EINVAL);
		}
		xfpustate = __builtin_alloca(xfpustate_len);
		error = copyin((const void *)uc.uc_mcontext.mc_xfpustate,
		    xfpustate, xfpustate_len);
		if (error != 0) {
			uprintf(
		    "pid %d (%s): sigreturn copying xfpustate failed\n",
			    p->p_pid, td->td_name);
			return (error);
		}
	} else {
		xfpustate = NULL;
		xfpustate_len = 0;
	}
	ret = set_fpcontext(td, &ucp->uc_mcontext, xfpustate, xfpustate_len);
	if (ret != 0) {
		uprintf("pid %d (%s): sigreturn set_fpcontext err %d\n",
		    p->p_pid, td->td_name, ret);
		return (ret);
	}
	bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(*regs));
	update_pcb_bases(pcb);
	pcb->pcb_fsbase = ucp->uc_mcontext.mc_fsbase;
	pcb->pcb_gsbase = ucp->uc_mcontext.mc_gsbase;

#if defined(COMPAT_43)
	if (ucp->uc_mcontext.mc_onstack & 1)
		td->td_sigstk.ss_flags |= SS_ONSTACK;
	else
		td->td_sigstk.ss_flags &= ~SS_ONSTACK;
#endif

	kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
	return (EJUSTRETURN);
}

#ifdef COMPAT_FREEBSD4
int
freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap)
{

	return sys_sigreturn(td, (struct sigreturn_args *)uap);
}
#endif
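
/*
 * An illustrative view of the EFL_SECURE() policy applied above: only
 * bits in PSL_USERCHANGE may differ from the live rflags, so e.g.
 *
 *	EFL_SECURE(rflags | PSL_IOPL, regs->tf_rflags)	// false -> EINVAL
 *	EFL_SECURE(rflags ^ PSL_C, regs->tf_rflags)	// true, accepted
 *
 * A forged context cannot raise its I/O privilege level, while changes
 * confined to the user-modifiable arithmetic flags pass the check.
 */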

/*
 * Reset registers to default values on exec.
 */
void
exec_setregs(struct thread *td, struct image_params *imgp, u_long stack)
{
	struct trapframe *regs = td->td_frame;
	struct pcb *pcb = td->td_pcb;

	mtx_lock(&dt_lock);
	if (td->td_proc->p_md.md_ldt != NULL)
		user_ldt_free(td);
	else
		mtx_unlock(&dt_lock);

	update_pcb_bases(pcb);
	pcb->pcb_fsbase = 0;
	pcb->pcb_gsbase = 0;
	clear_pcb_flags(pcb, PCB_32BIT);
	pcb->pcb_initial_fpucw = __INITIAL_FPUCW__;

	bzero((char *)regs, sizeof(struct trapframe));
	regs->tf_rip = imgp->entry_addr;
	regs->tf_rsp = ((stack - 8) & ~0xFul) + 8;
	regs->tf_rdi = stack;		/* argv */
	regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T);
	regs->tf_ss = _udatasel;
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _ufssel;
	regs->tf_gs = _ugssel;
	regs->tf_flags = TF_HASSEGS;
	td->td_retval[1] = 0;

	/*
	 * Reset the hardware debug registers if they were in use.
	 * They won't have any meaning for the newly exec'd process.
	 */
	if (pcb->pcb_flags & PCB_DBREGS) {
		pcb->pcb_dr0 = 0;
		pcb->pcb_dr1 = 0;
		pcb->pcb_dr2 = 0;
		pcb->pcb_dr3 = 0;
		pcb->pcb_dr6 = 0;
		pcb->pcb_dr7 = 0;
		if (pcb == curpcb) {
			/*
			 * Clear the debug registers on the running
			 * CPU, otherwise they will end up affecting
			 * the next process we switch to.
			 */
			reset_dbregs();
		}
		clear_pcb_flags(pcb, PCB_DBREGS);
	}

	/*
	 * Drop the FP state if we hold it, so that the process gets a
	 * clean FP state if it uses the FPU again.
	 */
	fpstate_drop(td);
}

void
cpu_setregs(void)
{
	register_t cr0;

	cr0 = rcr0();
	/*
	 * CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the
	 * BSP.  See the comments there about why we set them.
	 */
	cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
	load_cr0(cr0);
}

/*
 * Initialize amd64 and configure to run kernel
 */

/*
 * Initialize segments & interrupt table
 */

struct user_segment_descriptor gdt[NGDT * MAXCPU];/* global descriptor tables */
static struct gate_descriptor idt0[NIDT];
struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */

static char dblfault_stack[PAGE_SIZE] __aligned(16);

static char nmi0_stack[PAGE_SIZE] __aligned(16);
CTASSERT(sizeof(struct nmi_pcpu) == 16);

struct amd64tss common_tss[MAXCPU];

/*
 * Software prototypes -- in more palatable form.
 *
 * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
 * slots as corresponding segments for i386 kernel.
 */
struct soft_segment_descriptor gdt_segs[] = {
/* GNULL_SEL	0 Null Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GNULL2_SEL	1 Null Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUFS32_SEL	2 32 bit %fs Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUGS32_SEL	3 32 bit %gs Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GCODE_SEL	4 Code Descriptor for kernel */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GDATA_SEL	5 Data Descriptor for kernel */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GUCODE32_SEL	6 32 bit Code Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUDATA_SEL	7 32/64 bit Data Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUCODE_SEL	8 64 bit Code Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GPROC0_SEL	9 Proc 0 Tss Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
	.ssd_type = SDT_SYSTSS,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* Actually, the TSS is a system descriptor which is double size */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUSERLDT_SEL	11 LDT Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUSERLDT_SEL	12 LDT Descriptor, double size */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
};

void
setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
{
	struct gate_descriptor *ip;

	ip = idt + idx;
	ip->gd_looffset = (uintptr_t)func;
	ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
	ip->gd_ist = ist;
	ip->gd_xx = 0;
	ip->gd_type = typ;
	ip->gd_dpl = dpl;
	ip->gd_p = 1;
	ip->gd_hioffset = ((uintptr_t)func) >> 16;
}

extern inthand_t
	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
	IDTVEC(xmm), IDTVEC(dblfault),
#ifdef KDTRACE_HOOKS
	IDTVEC(dtrace_ret),
#endif
#ifdef XENHVM
	IDTVEC(xen_intr_upcall),
#endif
	IDTVEC(fast_syscall), IDTVEC(fast_syscall32);

#ifdef DDB
/*
 * Display the index and function name of any IDT entries that don't use
 * the default 'rsvd' entry point.
 */
DB_SHOW_COMMAND(idt, db_show_idt)
{
	struct gate_descriptor *ip;
	int idx;
	uintptr_t func;

	ip = idt;
	for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
		func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
		if (func != (uintptr_t)&IDTVEC(rsvd)) {
			db_printf("%3d\t", idx);
			db_printsym(func, DB_STGY_PROC);
			db_printf("\n");
		}
		ip++;
	}
}

/* Show privileged registers. */
DB_SHOW_COMMAND(sysregs, db_show_sysregs)
{
	struct {
		uint16_t limit;
		uint64_t base;
	} __packed idtr, gdtr;
	uint16_t ldt, tr;

	__asm __volatile("sidt %0" : "=m" (idtr));
	db_printf("idtr\t0x%016lx/%04x\n",
	    (u_long)idtr.base, (u_int)idtr.limit);
	__asm __volatile("sgdt %0" : "=m" (gdtr));
	db_printf("gdtr\t0x%016lx/%04x\n",
	    (u_long)gdtr.base, (u_int)gdtr.limit);
	__asm __volatile("sldt %0" : "=r" (ldt));
	db_printf("ldtr\t0x%04x\n", ldt);
	__asm __volatile("str %0" : "=r" (tr));
	db_printf("tr\t0x%04x\n", tr);
	db_printf("cr0\t0x%016lx\n", rcr0());
	db_printf("cr2\t0x%016lx\n", rcr2());
	db_printf("cr3\t0x%016lx\n", rcr3());
	db_printf("cr4\t0x%016lx\n", rcr4());
	if (rcr4() & CR4_XSAVE)
		db_printf("xcr0\t0x%016lx\n", rxcr(0));
	db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER));
	if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
		db_printf("FEATURES_CTL\t%016lx\n",
		    rdmsr(MSR_IA32_FEATURE_CONTROL));
	db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR));
	db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT));
	db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE));
}

DB_SHOW_COMMAND(dbregs, db_show_dbregs)
{

	db_printf("dr0\t0x%016lx\n", rdr0());
	db_printf("dr1\t0x%016lx\n", rdr1());
	db_printf("dr2\t0x%016lx\n", rdr2());
	db_printf("dr3\t0x%016lx\n", rdr3());
	db_printf("dr6\t0x%016lx\n", rdr6());
	db_printf("dr7\t0x%016lx\n", rdr7());
}
#endif

void
sdtossd(struct user_segment_descriptor *sd,
    struct soft_segment_descriptor *ssd)
{

	ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase;
	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
	ssd->ssd_type = sd->sd_type;
	ssd->ssd_dpl = sd->sd_dpl;
	ssd->ssd_p = sd->sd_p;
	ssd->ssd_long = sd->sd_long;
	ssd->ssd_def32 = sd->sd_def32;
	ssd->ssd_gran = sd->sd_gran;
}

void
ssdtosd(struct soft_segment_descriptor *ssd,
    struct user_segment_descriptor *sd)
{

	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type = ssd->ssd_type;
	sd->sd_dpl = ssd->ssd_dpl;
	sd->sd_p = ssd->ssd_p;
	sd->sd_long = ssd->ssd_long;
	sd->sd_def32 = ssd->ssd_def32;
	sd->sd_gran = ssd->ssd_gran;
}

void
ssdtosyssd(struct soft_segment_descriptor *ssd,
    struct system_segment_descriptor *sd)
{

	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type = ssd->ssd_type;
	sd->sd_dpl = ssd->ssd_dpl;
	sd->sd_p = ssd->ssd_p;
	sd->sd_gran = ssd->ssd_gran;
}
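
/*
 * sdtossd() and ssdtosd() are inverses for user segment descriptors,
 * so a round trip reproduces the hardware encoding; e.g. (illustrative):
 *
 *	struct soft_segment_descriptor ssd;
 *	struct user_segment_descriptor sd2;
 *
 *	sdtossd(&gdt[GUDATA_SEL], &ssd);
 *	ssdtosd(&ssd, &sd2);	// sd2 now matches gdt[GUDATA_SEL]
 *
 * ssdtosyssd() handles the 16-byte system variant (TSS and LDT
 * descriptors), whose base field extends to 40 bits in sd_hibase.
 */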

#if !defined(DEV_ATPIC) && defined(DEV_ISA)
#include <isa/isavar.h>
#include <isa/isareg.h>
/*
 * Return a bitmap of the current interrupt requests.  This is 8259-specific
 * and is only suitable for use at probe time.
 * This is only here to pacify sio.  It is NOT FATAL if this doesn't work.
 * It shouldn't be here.  There should probably be an APIC centric
 * implementation in the apic driver code, if at all.
 */
intrmask_t
isa_irq_pending(void)
{
	u_char irr1;
	u_char irr2;

	irr1 = inb(IO_ICU1);
	irr2 = inb(IO_ICU2);
	return ((irr2 << 8) | irr1);
}
#endif

u_int basemem;

static int
add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
    int *physmap_idxp)
{
	int i, insert_idx, physmap_idx;

	physmap_idx = *physmap_idxp;

	if (length == 0)
		return (1);

	/*
	 * Find insertion point while checking for overlap.  Start off by
	 * assuming the new entry will be added to the end.
	 *
	 * NB: physmap_idx points to the next free slot.
	 */
	insert_idx = physmap_idx;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (base < physmap[i + 1]) {
			if (base + length <= physmap[i]) {
				insert_idx = i;
				break;
			}
			if (boothowto & RB_VERBOSE)
				printf(
		    "Overlapping memory regions, ignoring second region\n");
			return (1);
		}
	}

	/* See if we can prepend to the next entry. */
	if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) {
		physmap[insert_idx] = base;
		return (1);
	}

	/* See if we can append to the previous entry. */
	if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
		physmap[insert_idx - 1] += length;
		return (1);
	}

	physmap_idx += 2;
	*physmap_idxp = physmap_idx;
	if (physmap_idx == PHYSMAP_SIZE) {
		printf(
		"Too many segments in the physical address map, giving up\n");
		return (0);
	}

	/*
	 * Move the last 'N' entries down to make room for the new
	 * entry if needed.
	 */
	for (i = (physmap_idx - 2); i > insert_idx; i -= 2) {
		physmap[i] = physmap[i - 2];
		physmap[i + 1] = physmap[i - 1];
	}

	/* Insert the new entry. */
	physmap[insert_idx] = base;
	physmap[insert_idx + 1] = base + length;
	return (1);
}

void
bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize,
    vm_paddr_t *physmap, int *physmap_idx)
{
	struct bios_smap *smap, *smapend;

	smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);

	for (smap = smapbase; smap < smapend; smap++) {
		if (boothowto & RB_VERBOSE)
			printf("SMAP type=%02x base=%016lx len=%016lx\n",
			    smap->type, smap->base, smap->length);

		if (smap->type != SMAP_TYPE_MEMORY)
			continue;

		if (!add_physmap_entry(smap->base, smap->length, physmap,
		    physmap_idx))
			break;
	}
}

static void
add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
    int *physmap_idx)
{
	struct efi_md *map, *p;
	const char *type;
	size_t efisz;
	int ndesc, i;

	static const char *types[] = {
		"Reserved",
		"LoaderCode",
		"LoaderData",
		"BootServicesCode",
		"BootServicesData",
		"RuntimeServicesCode",
		"RuntimeServicesData",
		"ConventionalMemory",
		"UnusableMemory",
		"ACPIReclaimMemory",
		"ACPIMemoryNVS",
		"MemoryMappedIO",
		"MemoryMappedIOPortSpace",
		"PalCode",
		"PersistentMemory"
	};

	/*
	 * Memory map data provided by UEFI via the GetMemoryMap
	 * Boot Services API.
	 */
	efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
	map = (struct efi_md *)((uint8_t *)efihdr + efisz);

	if (efihdr->descriptor_size == 0)
		return;
	ndesc = efihdr->memory_size / efihdr->descriptor_size;

	if (boothowto & RB_VERBOSE)
		printf("%23s %12s %12s %8s %4s\n",
		    "Type", "Physical", "Virtual", "#Pages", "Attr");

	for (i = 0, p = map; i < ndesc; i++,
	    p = efi_next_descriptor(p, efihdr->descriptor_size)) {
		if (boothowto & RB_VERBOSE) {
			if (p->md_type < nitems(types))
				type = types[p->md_type];
			else
				type = "<INVALID>";
			printf("%23s %012lx %12p %08lx ", type, p->md_phys,
			    p->md_virt, p->md_pages);
			if (p->md_attr & EFI_MD_ATTR_UC)
				printf("UC ");
			if (p->md_attr & EFI_MD_ATTR_WC)
				printf("WC ");
			if (p->md_attr & EFI_MD_ATTR_WT)
				printf("WT ");
			if (p->md_attr & EFI_MD_ATTR_WB)
				printf("WB ");
			if (p->md_attr & EFI_MD_ATTR_UCE)
				printf("UCE ");
			if (p->md_attr & EFI_MD_ATTR_WP)
				printf("WP ");
			if (p->md_attr & EFI_MD_ATTR_RP)
				printf("RP ");
			if (p->md_attr & EFI_MD_ATTR_XP)
				printf("XP ");
			if (p->md_attr & EFI_MD_ATTR_NV)
				printf("NV ");
			if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE)
				printf("MORE_RELIABLE ");
			if (p->md_attr & EFI_MD_ATTR_RO)
				printf("RO ");
			if (p->md_attr & EFI_MD_ATTR_RT)
				printf("RUNTIME");
			printf("\n");
		}

		switch (p->md_type) {
		case EFI_MD_TYPE_CODE:
		case EFI_MD_TYPE_DATA:
		case EFI_MD_TYPE_BS_CODE:
		case EFI_MD_TYPE_BS_DATA:
		case EFI_MD_TYPE_FREE:
			/*
			 * We're allowed to use any entry with these types.
			 */
			break;
		default:
			continue;
		}

		if (!add_physmap_entry(p->md_phys, (p->md_pages * PAGE_SIZE),
		    physmap, physmap_idx))
			break;
	}
}

static char bootmethod[16] = "";
SYSCTL_STRING(_machdep, OID_AUTO, bootmethod, CTLFLAG_RD, bootmethod, 0,
    "System firmware boot method");

static void
native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx)
{
	struct bios_smap *smap;
	struct efi_map_header *efihdr;
	u_int32_t size;

	/*
	 * Memory map from INT 15:E820.
	 *
	 * subr_module.c says:
	 * "Consumer may safely assume that size value precedes data."
	 * i.e., an int32_t immediately precedes smap.
	 */

	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
	smap = (struct bios_smap *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);
	if (efihdr == NULL && smap == NULL)
		panic("No BIOS smap or EFI map info from loader!");

	if (efihdr != NULL) {
		add_efi_map_entries(efihdr, physmap, physmap_idx);
		strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
	} else {
		size = *((u_int32_t *)smap - 1);
		bios_add_smap_entries(smap, size, physmap, physmap_idx);
		strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
	}
}

#define	PAGES_PER_GB	(1024 * 1024 * 1024 / PAGE_SIZE)
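
/*
 * The loader stores each metadata blob with a 32-bit size word
 * immediately preceding the data (per the subr_module.c contract quoted
 * above), which is why native_parse_memmap() can recover the SMAP
 * length without any explicit length argument.  Illustrative layout:
 *
 *	memory:  ... | size | smap[0] | smap[1] | ... |
 *	                      ^-- pointer returned by preload_search_info()
 *
 *	u_int32_t size = *((u_int32_t *)smap - 1);
 *
 * The sysctl handlers near the end of this file use the same trick.
 */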

/*
 * Populate the (physmap) array with base/bound pairs describing the
 * available physical memory in the system, then test this memory and
 * build the phys_avail array describing the actually-available memory.
 *
 * Total memory size may be set by the kernel environment variable
 * hw.physmem or the compile-time define MAXMEM.
 *
 * XXX first should be vm_paddr_t.
 */
static void
getmemsize(caddr_t kmdp, u_int64_t first)
{
	int i, physmap_idx, pa_indx, da_indx;
	vm_paddr_t pa, physmap[PHYSMAP_SIZE];
	u_long physmem_start, physmem_tunable, memtest;
	pt_entry_t *pte;
	quad_t dcons_addr, dcons_size;
	int page_counter;

	bzero(physmap, sizeof(physmap));
	physmap_idx = 0;

	init_ops.parse_memmap(kmdp, physmap, &physmap_idx);
	physmap_idx -= 2;

	/*
	 * Find the 'base memory' segment for SMP
	 */
	basemem = 0;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (physmap[i] <= 0xA0000) {
			basemem = physmap[i + 1] / 1024;
			break;
		}
	}
	if (basemem == 0 || basemem > 640) {
		if (bootverbose)
			printf(
		"Memory map doesn't contain a basemem segment, faking it\n");
		basemem = 640;
	}

	/*
	 * Make hole for "AP -> long mode" bootstrap code.  The
	 * mp_bootaddress vector is only available when the kernel
	 * is configured to support APs and APs for the system start
	 * in 32-bit mode (e.g. SMP bare metal).
	 */
	if (init_ops.mp_bootaddress) {
		if (physmap[1] >= 0x100000000)
			panic(
	"Basemem segment is not suitable for AP bootstrap code!");
		physmap[1] = init_ops.mp_bootaddress(physmap[1] / 1024);
	}

	/*
	 * Maxmem isn't the "maximum memory", it's one larger than the
	 * highest page of the physical address space.  It should be
	 * called something like "Maxphyspage".  We may adjust this
	 * based on ``hw.physmem'' and the results of the memory test.
	 */
	Maxmem = atop(physmap[physmap_idx + 1]);

#ifdef MAXMEM
	Maxmem = MAXMEM / 4;
#endif

	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
		Maxmem = atop(physmem_tunable);

	/*
	 * The boot memory test is disabled by default, as it takes a
	 * significant amount of time on large-memory systems, and is
	 * unfriendly to virtual machines as it unnecessarily touches all
	 * pages.
	 *
	 * A general name is used as the code may be extended to support
	 * additional tests beyond the current "page present" test.
	 */
	memtest = 0;
	TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);

	/*
	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
	 * in the system.
	 */
	if (Maxmem > atop(physmap[physmap_idx + 1]))
		Maxmem = atop(physmap[physmap_idx + 1]);

	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
	    (boothowto & RB_VERBOSE))
		printf("Physical memory use set to %ldK\n", Maxmem * 4);

	/* call pmap initialization to make new kernel address space */
	pmap_bootstrap(&first);

	/*
	 * Size up each available chunk of physical memory.
	 *
	 * XXX Some BIOSes corrupt low 64KB between suspend and resume.
	 * By default, mask off the first 16 pages unless we appear to be
	 * running in a VM.
	 */
	physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
	TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
	if (physmap[0] < physmem_start) {
		if (physmem_start < PAGE_SIZE)
			physmap[0] = PAGE_SIZE;
		else if (physmem_start >= physmap[1])
			physmap[0] = round_page(physmap[1] - PAGE_SIZE);
		else
			physmap[0] = round_page(physmem_start);
	}
	pa_indx = 0;
	da_indx = 1;
	phys_avail[pa_indx++] = physmap[0];
	phys_avail[pa_indx] = physmap[0];
	dump_avail[da_indx] = physmap[0];
	pte = CMAP1;

	/*
	 * Get dcons buffer address
	 */
	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
	    getenv_quad("dcons.size", &dcons_size) == 0)
		dcons_addr = 0;

	/*
	 * physmap is in bytes, so when converting to page boundaries,
	 * round up the start address and round down the end address.
	 */
	page_counter = 0;
	if (memtest != 0)
		printf("Testing system memory");
	for (i = 0; i <= physmap_idx; i += 2) {
		vm_paddr_t end;

		end = ptoa((vm_paddr_t)Maxmem);
		if (physmap[i + 1] < end)
			end = trunc_page(physmap[i + 1]);
		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
			int tmp, page_bad, full;
			int *ptr = (int *)CADDR1;

			full = FALSE;
			/*
			 * block out kernel memory as not available.
			 */
			if (pa >= (vm_paddr_t)kernphys && pa < first)
				goto do_dump_avail;

			/*
			 * block out dcons buffer
			 */
			if (dcons_addr > 0
			    && pa >= trunc_page(dcons_addr)
			    && pa < dcons_addr + dcons_size)
				goto do_dump_avail;

			page_bad = FALSE;
			if (memtest == 0)
				goto skip_memtest;

			/*
			 * Print a "." every GB to show we're making
			 * progress.
			 */
			page_counter++;
			if ((page_counter % PAGES_PER_GB) == 0)
				printf(".");

			/*
			 * map page into kernel: valid, read/write,
			 * non-cacheable
			 */
			*pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
			invltlb();

			tmp = *(int *)ptr;
			/*
			 * Test for alternating 1's and 0's
			 */
			*(volatile int *)ptr = 0xaaaaaaaa;
			if (*(volatile int *)ptr != 0xaaaaaaaa)
				page_bad = TRUE;
			/*
			 * Test for alternating 0's and 1's
			 */
			*(volatile int *)ptr = 0x55555555;
			if (*(volatile int *)ptr != 0x55555555)
				page_bad = TRUE;
			/*
			 * Test for all 1's
			 */
			*(volatile int *)ptr = 0xffffffff;
			if (*(volatile int *)ptr != 0xffffffff)
				page_bad = TRUE;
			/*
			 * Test for all 0's
			 */
			*(volatile int *)ptr = 0x0;
			if (*(volatile int *)ptr != 0x0)
				page_bad = TRUE;
			/*
			 * Restore original value.
			 */
			*(int *)ptr = tmp;

skip_memtest:
			/*
			 * Adjust array of valid/good pages.
			 */
			if (page_bad == TRUE)
				continue;
			/*
			 * If this good page is a continuation of the
			 * previous set of good pages, then just increase
			 * the end pointer.  Otherwise start a new chunk.
			 * Note that "end" points one higher than end,
			 * making the range >= start and < end.
			 * If we're also doing a speculative memory
			 * test and we are at or past the end, bump up
			 * Maxmem so that we keep going.  The first bad
			 * page will terminate the loop.
			 */
			if (phys_avail[pa_indx] == pa) {
				phys_avail[pa_indx] += PAGE_SIZE;
			} else {
				pa_indx++;
				if (pa_indx == PHYS_AVAIL_ARRAY_END) {
					printf(
		"Too many holes in the physical address space, giving up\n");
					pa_indx--;
					full = TRUE;
					goto do_dump_avail;
				}
				phys_avail[pa_indx++] = pa;	/* start */
				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
			}
			physmem++;
do_dump_avail:
			if (dump_avail[da_indx] == pa) {
				dump_avail[da_indx] += PAGE_SIZE;
			} else {
				da_indx++;
				if (da_indx == DUMP_AVAIL_ARRAY_END) {
					da_indx--;
					goto do_next;
				}
				dump_avail[da_indx++] = pa;	/* start */
				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
			}
do_next:
			if (full)
				break;
		}
	}
	*pte = 0;
	invltlb();
	if (memtest != 0)
		printf("\n");

	/*
	 * XXX
	 * The last chunk must contain at least one page plus the message
	 * buffer to avoid complicating other code (message buffer address
	 * calculation, etc.).
	 */
	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
	    round_page(msgbufsize) >= phys_avail[pa_indx]) {
		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
		phys_avail[pa_indx--] = 0;
		phys_avail[pa_indx--] = 0;
	}

	Maxmem = atop(phys_avail[pa_indx]);

	/* Trim off space for the message buffer. */
	phys_avail[pa_indx] -= round_page(msgbufsize);

	/* Map the message buffer. */
	msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
}

static caddr_t
native_parse_preload_data(u_int64_t modulep)
{
	caddr_t kmdp;
	char *envp;
#ifdef DDB
	vm_offset_t ksym_start;
	vm_offset_t ksym_end;
#endif

	preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
	preload_bootstrap_relocate(KERNBASE);
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
	envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *);
	if (envp != NULL)
		envp += KERNBASE;
	init_static_kenv(envp, 0);
#ifdef DDB
	ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
	ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
	db_fetch_ksymtab(ksym_start, ksym_end);
#endif
	efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);

	return (kmdp);
}

static void
amd64_kdb_init(void)
{
	kdb_init();
#ifdef KDB
	if (boothowto & RB_KDB)
		kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
#endif
}
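
/*
 * Rough boot chain for context (a sketch; see locore.S for the exact
 * sequence): the loader transfers control to the kernel's entry point
 * in locore, which calls hammer_time(modulep, physfree) below.  The
 * returned value is used as the initial kernel stack pointer, after
 * which the MI mi_startup() runs the SYSINITs, including the
 * cpu_startup() routine defined earlier in this file.
 */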
u_int64_t
hammer_time(u_int64_t modulep, u_int64_t physfree)
{
	caddr_t kmdp;
	int gsel_tss, x;
	struct pcpu *pc;
	struct nmi_pcpu *np;
	struct xstate_hdr *xhdr;
	u_int64_t msr;
	char *env;
	size_t kstack0_sz;
	int late_console;

	/*
	 * This may be done better later if it gets more high level
	 * components in it.  If so just link td->td_proc here.
	 */
	proc_linkup0(&proc0, &thread0);

	kmdp = init_ops.parse_preload_data(modulep);

	identify_cpu();
	identify_hypervisor();

	/* Init basic tunables, hz etc */
	init_param1();

	thread0.td_kstack = physfree + KERNBASE;
	thread0.td_kstack_pages = kstack_pages;
	kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
	bzero((void *)thread0.td_kstack, kstack0_sz);
	physfree += kstack0_sz;

	/*
	 * make gdt memory segments
	 */
	for (x = 0; x < NGDT; x++) {
		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
		    x != GUSERLDT_SEL && x != (GUSERLDT_SEL) + 1)
			ssdtosd(&gdt_segs[x], &gdt[x]);
	}
	gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&common_tss[0];
	ssdtosyssd(&gdt_segs[GPROC0_SEL],
	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);

	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
	r_gdt.rd_base = (long) gdt;
	lgdt(&r_gdt);
	pc = &__pcpu[0];

	wrmsr(MSR_FSBASE, 0);		/* User value */
	wrmsr(MSR_GSBASE, (u_int64_t)pc);
	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */

	pcpu_init(pc, 0, sizeof(struct pcpu));
	dpcpu_init((void *)(physfree + KERNBASE), 0);
	physfree += DPCPU_SIZE;
	PCPU_SET(prvspace, pc);
	PCPU_SET(curthread, &thread0);
	/* Non-late cninit() and printf() can be moved up to here. */
	PCPU_SET(tssp, &common_tss[0]);
	PCPU_SET(commontssp, &common_tss[0]);
	PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
	PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
	PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
	PCPU_SET(gs32p, &gdt[GUGS32_SEL]);

	/*
	 * Initialize mutexes.
	 *
	 * icu_lock: in order to allow an interrupt to occur in a critical
	 *	     section, to set pcpu->ipending (etc...) properly, we
	 *	     must be able to get the icu lock, so it can't be
	 *	     under witness.
	 */
	mutex_init();
	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);

	/* exceptions */
	for (x = 0; x < NIDT; x++)
		setidt(x, &IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_DE, &IDTVEC(div), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 2);
	setidt(IDT_BP, &IDTVEC(bpt), SDT_SYSIGT, SEL_UPL, 0);
	setidt(IDT_OF, &IDTVEC(ofl), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_BR, &IDTVEC(bnd), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_UD, &IDTVEC(ill), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_NM, &IDTVEC(dna), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
	setidt(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_TS, &IDTVEC(tss), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_NP, &IDTVEC(missing), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_SS, &IDTVEC(stk), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_GP, &IDTVEC(prot), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_PF, &IDTVEC(page), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_MF, &IDTVEC(fpu), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_AC, &IDTVEC(align), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_XF, &IDTVEC(xmm), SDT_SYSIGT, SEL_KPL, 0);
#ifdef KDTRACE_HOOKS
	setidt(IDT_DTRACE_RET, &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
#endif
#ifdef XENHVM
	setidt(IDT_EVTCHN, &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_UPL, 0);
#endif

	r_idt.rd_limit = sizeof(idt0) - 1;
	r_idt.rd_base = (long) idt;
	lidt(&r_idt);

	/*
	 * Initialize the clock before the console so that console
	 * initialization can use DELAY().
	 */
	clock_init();

	/*
	 * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4)
	 * transition).
	 * Once bootblocks have updated, we can test directly for
	 * efi_systbl != NULL here...
	 */
	if (preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP)
	    != NULL)
		vty_set_preferred(VTY_VT);

	finishidentcpu();	/* Final stage of CPU initialization */
	initializecpu();	/* Initialize CPU registers */
	initializecpucache();

	/* doublefault stack space, runs on ist1 */
	common_tss[0].tss_ist1 = (long)&dblfault_stack[sizeof(dblfault_stack)];

	/*
	 * NMI stack, runs on ist2.  The pcpu pointer is stored just
	 * above the start of the ist2 stack.
	 */
	np = ((struct nmi_pcpu *) &nmi0_stack[sizeof(nmi0_stack)]) - 1;
	np->np_pcpu = (register_t) pc;
	common_tss[0].tss_ist2 = (long) np;

	/* Set the IO permission bitmap (empty due to tss seg limit) */
	common_tss[0].tss_iobase = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE;

	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	ltr(gsel_tss);

	/* Set up the fast syscall stuff */
	msr = rdmsr(MSR_EFER) | EFER_SCE;
	wrmsr(MSR_EFER, msr);
	wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall));
	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
	    ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
	wrmsr(MSR_STAR, msr);
	wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D);

	/*
	 * Temporarily forge a valid pointer to the PCB for the exception
	 * handlers.  It is reinitialized properly below after the FPU is
	 * set up.  Also set up td_critnest to short-cut the page
	 * fault handler.
	 */
	cpu_max_ext_state_size = sizeof(struct savefpu);
	thread0.td_pcb = get_pcb_td(&thread0);
	thread0.td_critnest = 1;

	/*
	 * The console and kdb should be initialized even earlier than here,
	 * but some console drivers don't work until after getmemsize().
	 * Default to late console initialization to support these drivers.
	 * This loses mainly printf()s in getmemsize() and early debugging.
	 */
	late_console = 1;
	TUNABLE_INT_FETCH("debug.late_console", &late_console);
	if (!late_console) {
		cninit();
		amd64_kdb_init();
	}

	getmemsize(kmdp, physfree);
	init_param2(physmem);

	/* now running on new page tables, configured, and u/iom is accessible */

	if (late_console)
		cninit();

#ifdef DEV_ISA
#ifdef DEV_ATPIC
	elcr_probe();
	atpic_startup();
#else
	/* Reset and mask the atpics and leave them shut down. */
	atpic_reset();

	/*
	 * Point the ICU spurious interrupt vectors at the APIC spurious
	 * interrupt handler.
	 */
	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
#endif
#else
#error "have you forgotten the isa device?";
#endif

	if (late_console)
		amd64_kdb_init();

	msgbufinit(msgbufp, msgbufsize);
	fpuinit();

	/*
	 * Set up thread0 pcb after fpuinit calculated pcb + fpu save
	 * area size.  Zero out the extended state header in fpu save
	 * area.
	 */
	thread0.td_pcb = get_pcb_td(&thread0);
	thread0.td_pcb->pcb_save = get_pcb_user_save_td(&thread0);
	bzero(get_pcb_user_save_td(&thread0), cpu_max_ext_state_size);
	if (use_xsave) {
		xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) +
		    1);
		xhdr->xstate_bv = xsave_mask;
	}
	/* make an initial tss so cpu can get interrupt stack on syscall! */
	common_tss[0].tss_rsp0 = (vm_offset_t)thread0.td_pcb;
	/* Ensure the stack is aligned to 16 bytes */
	common_tss[0].tss_rsp0 &= ~0xFul;
	PCPU_SET(rsp0, common_tss[0].tss_rsp0);
	PCPU_SET(curpcb, thread0.td_pcb);

	/* transfer to user mode */

	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
	_ufssel = GSEL(GUFS32_SEL, SEL_UPL);
	_ugssel = GSEL(GUGS32_SEL, SEL_UPL);

	load_ds(_udatasel);
	load_es(_udatasel);
	load_fs(_ufssel);

	/* setup proc 0's pcb */
	thread0.td_pcb->pcb_flags = 0;
	thread0.td_frame = &proc0_tf;

	env = kern_getenv("kernelname");
	if (env != NULL)
		strlcpy(kernelname, env, sizeof(kernelname));

	cpu_probe_amdc1e();

#ifdef FDT
	x86_init_fdt();
#endif
	thread0.td_critnest = 0;

	/* Location of kernel stack for locore */
	return ((u_int64_t)thread0.td_pcb);
}

void
cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
{

	pcpu->pc_acpi_id = 0xffffffff;
}

static int
smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	struct bios_smap *smapbase;
	struct bios_smap_xattr smap;
	caddr_t kmdp;
	uint32_t *smapattr;
	int count, error, i;

	/* Retrieve the system memory map from the loader. */
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	smapbase = (struct bios_smap *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);
	if (smapbase == NULL)
		return (0);
	smapattr = (uint32_t *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
	count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase);
	error = 0;
	for (i = 0; i < count; i++) {
		smap.base = smapbase[i].base;
		smap.length = smapbase[i].length;
		smap.type = smapbase[i].type;
		if (smapattr != NULL)
			smap.xattr = smapattr[i];
		else
			smap.xattr = 0;
		error = SYSCTL_OUT(req, &smap, sizeof(smap));
	}
	return (error);
}
SYSCTL_PROC(_machdep, OID_AUTO, smap, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
    smap_sysctl_handler, "S,bios_smap_xattr", "Raw BIOS SMAP data");

static int
efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	struct efi_map_header *efihdr;
	caddr_t kmdp;
	uint32_t efisize;

	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
	if (efihdr == NULL)
		return (0);
	efisize = *((uint32_t *)efihdr - 1);
	return (SYSCTL_OUT(req, efihdr, efisize));
}
SYSCTL_PROC(_machdep, OID_AUTO, efi_map, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
    efi_map_sysctl_handler, "S,efi_map_header", "Raw EFI Memory Map");

void
spinlock_enter(void)
{
	struct thread *td;
	register_t flags;

	td = curthread;
	if (td->td_md.md_spinlock_count == 0) {
		flags = intr_disable();
		td->td_md.md_spinlock_count = 1;
		td->td_md.md_saved_flags = flags;
	} else
		td->td_md.md_spinlock_count++;
	critical_enter();
}

void
spinlock_exit(void)
{
	struct thread *td;
	register_t flags;

	td = curthread;
	critical_exit();
	flags = td->td_md.md_saved_flags;
	td->td_md.md_spinlock_count--;
	if (td->td_md.md_spinlock_count == 0)
		intr_restore(flags);
}
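
/*
 * spinlock_enter()/spinlock_exit() nest per-thread: interrupts are
 * disabled at the first enter and restored only by the matching
 * outermost exit.  Illustrative use:
 *
 *	spinlock_enter();	// intr_disable(), count 0 -> 1
 *	spinlock_enter();	// count 1 -> 2, flags untouched
 *	spinlock_exit();	// count 2 -> 1, still disabled
 *	spinlock_exit();	// count 1 -> 0, intr_restore(saved flags)
 */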

/*
 * Construct a PCB from a trapframe.  This is called from kdb_trap() where
 * we want to start a backtrace from the function that caused us to enter
 * the debugger.  We have the context in the trapframe, but base the trace
 * on the PCB.  The PCB doesn't have to be perfect, as long as it contains
 * enough for a backtrace.
 */
void
makectx(struct trapframe *tf, struct pcb *pcb)
{

	pcb->pcb_r12 = tf->tf_r12;
	pcb->pcb_r13 = tf->tf_r13;
	pcb->pcb_r14 = tf->tf_r14;
	pcb->pcb_r15 = tf->tf_r15;
	pcb->pcb_rbp = tf->tf_rbp;
	pcb->pcb_rbx = tf->tf_rbx;
	pcb->pcb_rip = tf->tf_rip;
	pcb->pcb_rsp = tf->tf_rsp;
}

int
ptrace_set_pc(struct thread *td, unsigned long addr)
{

	td->td_frame->tf_rip = addr;
	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
	return (0);
}

int
ptrace_single_step(struct thread *td)
{

	td->td_frame->tf_rflags |= PSL_T;
	return (0);
}

int
ptrace_clear_single_step(struct thread *td)
{

	td->td_frame->tf_rflags &= ~PSL_T;
	return (0);
}

int
fill_regs(struct thread *td, struct reg *regs)
{
	struct trapframe *tp;

	tp = td->td_frame;
	return (fill_frame_regs(tp, regs));
}

int
fill_frame_regs(struct trapframe *tp, struct reg *regs)
{

	regs->r_r15 = tp->tf_r15;
	regs->r_r14 = tp->tf_r14;
	regs->r_r13 = tp->tf_r13;
	regs->r_r12 = tp->tf_r12;
	regs->r_r11 = tp->tf_r11;
	regs->r_r10 = tp->tf_r10;
	regs->r_r9 = tp->tf_r9;
	regs->r_r8 = tp->tf_r8;
	regs->r_rdi = tp->tf_rdi;
	regs->r_rsi = tp->tf_rsi;
	regs->r_rbp = tp->tf_rbp;
	regs->r_rbx = tp->tf_rbx;
	regs->r_rdx = tp->tf_rdx;
	regs->r_rcx = tp->tf_rcx;
	regs->r_rax = tp->tf_rax;
	regs->r_rip = tp->tf_rip;
	regs->r_cs = tp->tf_cs;
	regs->r_rflags = tp->tf_rflags;
	regs->r_rsp = tp->tf_rsp;
	regs->r_ss = tp->tf_ss;
	if (tp->tf_flags & TF_HASSEGS) {
		regs->r_ds = tp->tf_ds;
		regs->r_es = tp->tf_es;
		regs->r_fs = tp->tf_fs;
		regs->r_gs = tp->tf_gs;
	} else {
		regs->r_ds = 0;
		regs->r_es = 0;
		regs->r_fs = 0;
		regs->r_gs = 0;
	}
	return (0);
}

int
set_regs(struct thread *td, struct reg *regs)
{
	struct trapframe *tp;
	register_t rflags;

	tp = td->td_frame;
	rflags = regs->r_rflags & 0xffffffff;
	if (!EFL_SECURE(rflags, tp->tf_rflags) || !CS_SECURE(regs->r_cs))
		return (EINVAL);
	tp->tf_r15 = regs->r_r15;
	tp->tf_r14 = regs->r_r14;
	tp->tf_r13 = regs->r_r13;
	tp->tf_r12 = regs->r_r12;
	tp->tf_r11 = regs->r_r11;
	tp->tf_r10 = regs->r_r10;
	tp->tf_r9 = regs->r_r9;
	tp->tf_r8 = regs->r_r8;
	tp->tf_rdi = regs->r_rdi;
	tp->tf_rsi = regs->r_rsi;
	tp->tf_rbp = regs->r_rbp;
	tp->tf_rbx = regs->r_rbx;
	tp->tf_rdx = regs->r_rdx;
	tp->tf_rcx = regs->r_rcx;
	tp->tf_rax = regs->r_rax;
	tp->tf_rip = regs->r_rip;
	tp->tf_cs = regs->r_cs;
	tp->tf_rflags = rflags;
	tp->tf_rsp = regs->r_rsp;
	tp->tf_ss = regs->r_ss;
	if (0) {	/* XXXKIB */
		tp->tf_ds = regs->r_ds;
		tp->tf_es = regs->r_es;
		tp->tf_fs = regs->r_fs;
		tp->tf_gs = regs->r_gs;
		tp->tf_flags = TF_HASSEGS;
	}
	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
	return (0);
}
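
/*
 * fill_regs()/set_regs() back the ptrace(2) register requests; a
 * debugger typically reaches them as in this illustrative userland
 * fragment:
 *
 *	struct reg r;
 *
 *	ptrace(PT_GETREGS, pid, (caddr_t)&r, 0);
 *	r.r_rip = new_pc;	// new_pc is hypothetical
 *	ptrace(PT_SETREGS, pid, (caddr_t)&r, 0);
 *
 * set_regs() applies the same CS_SECURE()/EFL_SECURE() checks as
 * sigreturn(), so a traced process cannot be handed privileged state.
 */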
*/ 2017/* externalize from sv_xmm */ 2018static void 2019fill_fpregs_xmm(struct savefpu *sv_xmm, struct fpreg *fpregs) 2020{ 2021 struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env; 2022 struct envxmm *penv_xmm = &sv_xmm->sv_env; 2023 int i; 2024 2025 /* pcb -> fpregs */ 2026 bzero(fpregs, sizeof(*fpregs)); 2027 2028 /* FPU control/status */ 2029 penv_fpreg->en_cw = penv_xmm->en_cw; 2030 penv_fpreg->en_sw = penv_xmm->en_sw; 2031 penv_fpreg->en_tw = penv_xmm->en_tw; 2032 penv_fpreg->en_opcode = penv_xmm->en_opcode; 2033 penv_fpreg->en_rip = penv_xmm->en_rip; 2034 penv_fpreg->en_rdp = penv_xmm->en_rdp; 2035 penv_fpreg->en_mxcsr = penv_xmm->en_mxcsr; 2036 penv_fpreg->en_mxcsr_mask = penv_xmm->en_mxcsr_mask; 2037 2038 /* FPU registers */ 2039 for (i = 0; i < 8; ++i) 2040 bcopy(sv_xmm->sv_fp[i].fp_acc.fp_bytes, fpregs->fpr_acc[i], 10); 2041 2042 /* SSE registers */ 2043 for (i = 0; i < 16; ++i) 2044 bcopy(sv_xmm->sv_xmm[i].xmm_bytes, fpregs->fpr_xacc[i], 16); 2045} 2046 2047/* internalize from fpregs into sv_xmm */ 2048static void 2049set_fpregs_xmm(struct fpreg *fpregs, struct savefpu *sv_xmm) 2050{ 2051 struct envxmm *penv_xmm = &sv_xmm->sv_env; 2052 struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env; 2053 int i; 2054 2055 /* fpregs -> pcb */ 2056 /* FPU control/status */ 2057 penv_xmm->en_cw = penv_fpreg->en_cw; 2058 penv_xmm->en_sw = penv_fpreg->en_sw; 2059 penv_xmm->en_tw = penv_fpreg->en_tw; 2060 penv_xmm->en_opcode = penv_fpreg->en_opcode; 2061 penv_xmm->en_rip = penv_fpreg->en_rip; 2062 penv_xmm->en_rdp = penv_fpreg->en_rdp; 2063 penv_xmm->en_mxcsr = penv_fpreg->en_mxcsr; 2064 penv_xmm->en_mxcsr_mask = penv_fpreg->en_mxcsr_mask & cpu_mxcsr_mask; 2065 2066 /* FPU registers */ 2067 for (i = 0; i < 8; ++i) 2068 bcopy(fpregs->fpr_acc[i], sv_xmm->sv_fp[i].fp_acc.fp_bytes, 10); 2069 2070 /* SSE registers */ 2071 for (i = 0; i < 16; ++i) 2072 bcopy(fpregs->fpr_xacc[i], sv_xmm->sv_xmm[i].xmm_bytes, 16); 2073} 2074 2075/* externalize from td->pcb */ 2076int 2077fill_fpregs(struct thread *td, struct fpreg *fpregs) 2078{ 2079 2080 KASSERT(td == curthread || TD_IS_SUSPENDED(td) || 2081 P_SHOULDSTOP(td->td_proc), 2082 ("not suspended thread %p", td)); 2083 fpugetregs(td); 2084 fill_fpregs_xmm(get_pcb_user_save_td(td), fpregs); 2085 return (0); 2086} 2087 2088/* internalize to td->pcb */ 2089int 2090set_fpregs(struct thread *td, struct fpreg *fpregs) 2091{ 2092 2093 set_fpregs_xmm(fpregs, get_pcb_user_save_td(td)); 2094 fpuuserinited(td); 2095 return (0); 2096} 2097 2098/* 2099 * Get machine context. 
/*
 * Get machine context.
 */
int
get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
{
	struct pcb *pcb;
	struct trapframe *tp;

	pcb = td->td_pcb;
	tp = td->td_frame;
	PROC_LOCK(curthread->td_proc);
	mcp->mc_onstack = sigonstack(tp->tf_rsp);
	PROC_UNLOCK(curthread->td_proc);
	mcp->mc_r15 = tp->tf_r15;
	mcp->mc_r14 = tp->tf_r14;
	mcp->mc_r13 = tp->tf_r13;
	mcp->mc_r12 = tp->tf_r12;
	mcp->mc_r11 = tp->tf_r11;
	mcp->mc_r10 = tp->tf_r10;
	mcp->mc_r9 = tp->tf_r9;
	mcp->mc_r8 = tp->tf_r8;
	mcp->mc_rdi = tp->tf_rdi;
	mcp->mc_rsi = tp->tf_rsi;
	mcp->mc_rbp = tp->tf_rbp;
	mcp->mc_rbx = tp->tf_rbx;
	mcp->mc_rcx = tp->tf_rcx;
	mcp->mc_rflags = tp->tf_rflags;
	if (flags & GET_MC_CLEAR_RET) {
		mcp->mc_rax = 0;
		mcp->mc_rdx = 0;
		mcp->mc_rflags &= ~PSL_C;
	} else {
		mcp->mc_rax = tp->tf_rax;
		mcp->mc_rdx = tp->tf_rdx;
	}
	mcp->mc_rip = tp->tf_rip;
	mcp->mc_cs = tp->tf_cs;
	mcp->mc_rsp = tp->tf_rsp;
	mcp->mc_ss = tp->tf_ss;
	mcp->mc_ds = tp->tf_ds;
	mcp->mc_es = tp->tf_es;
	mcp->mc_fs = tp->tf_fs;
	mcp->mc_gs = tp->tf_gs;
	mcp->mc_flags = tp->tf_flags;
	mcp->mc_len = sizeof(*mcp);
	get_fpcontext(td, mcp, NULL, 0);
	update_pcb_bases(pcb);
	mcp->mc_fsbase = pcb->pcb_fsbase;
	mcp->mc_gsbase = pcb->pcb_gsbase;
	mcp->mc_xfpustate = 0;
	mcp->mc_xfpustate_len = 0;
	bzero(mcp->mc_spare, sizeof(mcp->mc_spare));
	return (0);
}

/*
 * Set machine context.
 *
 * We only install the user-modifiable rflags bits, and we never touch
 * the %cs selector.
 */
int
set_mcontext(struct thread *td, mcontext_t *mcp)
{
	struct pcb *pcb;
	struct trapframe *tp;
	char *xfpustate;
	long rflags;
	int ret;

	pcb = td->td_pcb;
	tp = td->td_frame;
	if (mcp->mc_len != sizeof(*mcp) ||
	    (mcp->mc_flags & ~_MC_FLAG_MASK) != 0)
		return (EINVAL);
	rflags = (mcp->mc_rflags & PSL_USERCHANGE) |
	    (tp->tf_rflags & ~PSL_USERCHANGE);
	if (mcp->mc_flags & _MC_HASFPXSTATE) {
		if (mcp->mc_xfpustate_len > cpu_max_ext_state_size -
		    sizeof(struct savefpu))
			return (EINVAL);
		xfpustate = __builtin_alloca(mcp->mc_xfpustate_len);
		ret = copyin((void *)mcp->mc_xfpustate, xfpustate,
		    mcp->mc_xfpustate_len);
		if (ret != 0)
			return (ret);
	} else
		xfpustate = NULL;
	ret = set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len);
	if (ret != 0)
		return (ret);
	tp->tf_r15 = mcp->mc_r15;
	tp->tf_r14 = mcp->mc_r14;
	tp->tf_r13 = mcp->mc_r13;
	tp->tf_r12 = mcp->mc_r12;
	tp->tf_r11 = mcp->mc_r11;
	tp->tf_r10 = mcp->mc_r10;
	tp->tf_r9 = mcp->mc_r9;
	tp->tf_r8 = mcp->mc_r8;
	tp->tf_rdi = mcp->mc_rdi;
	tp->tf_rsi = mcp->mc_rsi;
	tp->tf_rbp = mcp->mc_rbp;
	tp->tf_rbx = mcp->mc_rbx;
	tp->tf_rdx = mcp->mc_rdx;
	tp->tf_rcx = mcp->mc_rcx;
	tp->tf_rax = mcp->mc_rax;
	tp->tf_rip = mcp->mc_rip;
	tp->tf_rflags = rflags;
	tp->tf_rsp = mcp->mc_rsp;
	tp->tf_ss = mcp->mc_ss;
	tp->tf_flags = mcp->mc_flags;
	if (tp->tf_flags & TF_HASSEGS) {
		tp->tf_ds = mcp->mc_ds;
		tp->tf_es = mcp->mc_es;
		tp->tf_fs = mcp->mc_fs;
		tp->tf_gs = mcp->mc_gs;
	}
	set_pcb_flags(pcb, PCB_FULL_IRET);
	if (mcp->mc_flags & _MC_HASBASES) {
		pcb->pcb_fsbase = mcp->mc_fsbase;
		pcb->pcb_gsbase = mcp->mc_gsbase;
	}
	return (0);
}
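
/*
 * Illustrative sketch (userland code, not part of this file):
 * get_mcontext() and set_mcontext() do the machine-dependent work for
 * getcontext(2)/setcontext(2).  The classic idiom below saves a context
 * and resumes it once; note the volatile flag, since non-volatile
 * locals are indeterminate after the jump back.  set_mcontext() returns
 * EINVAL if uc_mcontext.mc_len or the flags were corrupted in between.
 */
#if 0
#include <stdio.h>
#include <ucontext.h>

static void
save_and_resume_once(void)
{
	volatile int resumed = 0;
	ucontext_t uc;

	getcontext(&uc);		/* get_mcontext() fills uc_mcontext */
	if (resumed == 0) {
		resumed = 1;
		setcontext(&uc);	/* set_mcontext() validates and loads it */
	}
	printf("context restored\n");	/* reached after the resume */
}
#endif
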
static void
get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave,
    size_t xfpusave_len)
{
	size_t max_len, len;

	mcp->mc_ownedfp = fpugetregs(td);
	bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate[0],
	    sizeof(mcp->mc_fpstate));
	mcp->mc_fpformat = fpuformat();
	if (!use_xsave || xfpusave_len == 0)
		return;
	max_len = cpu_max_ext_state_size - sizeof(struct savefpu);
	len = xfpusave_len;
	if (len > max_len) {
		/*
		 * Zero the caller's space beyond the extended state
		 * before clamping the copy length to it.
		 */
		bzero(xfpusave + max_len, len - max_len);
		len = max_len;
	}
	mcp->mc_flags |= _MC_HASFPXSTATE;
	mcp->mc_xfpustate_len = len;
	bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len);
}

static int
set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate,
    size_t xfpustate_len)
{
	struct savefpu *fpstate;
	int error;

	if (mcp->mc_fpformat == _MC_FPFMT_NODEV)
		return (0);
	else if (mcp->mc_fpformat != _MC_FPFMT_XMM)
		return (EINVAL);
	else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) {
		/* We don't care what state is left in the FPU or PCB. */
		fpstate_drop(td);
		error = 0;
	} else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU ||
	    mcp->mc_ownedfp == _MC_FPOWNED_PCB) {
		fpstate = (struct savefpu *)&mcp->mc_fpstate;
		fpstate->sv_env.en_mxcsr &= cpu_mxcsr_mask;
		error = fpusetregs(td, fpstate, xfpustate, xfpustate_len);
	} else
		return (EINVAL);
	return (error);
}

void
fpstate_drop(struct thread *td)
{

	KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu"));
	critical_enter();
	if (PCPU_GET(fpcurthread) == td)
		fpudrop();
	/*
	 * XXX force a full drop of the fpu.  The above only drops it if we
	 * owned it.
	 *
	 * XXX I don't much like fpugetuserregs()'s semantics of doing a full
	 * drop.  Dropping only to the pcb matches fnsave's behaviour.
	 * We only need to drop to !PCB_INITDONE in sendsig().  But
	 * sendsig() is the only caller of fpugetuserregs()... perhaps we
	 * just have too many layers.
	 */
	clear_pcb_flags(curthread->td_pcb,
	    PCB_FPUINITDONE | PCB_USERFPUINITDONE);
	critical_exit();
}

int
fill_dbregs(struct thread *td, struct dbreg *dbregs)
{
	struct pcb *pcb;

	if (td == NULL) {
		dbregs->dr[0] = rdr0();
		dbregs->dr[1] = rdr1();
		dbregs->dr[2] = rdr2();
		dbregs->dr[3] = rdr3();
		dbregs->dr[6] = rdr6();
		dbregs->dr[7] = rdr7();
	} else {
		pcb = td->td_pcb;
		dbregs->dr[0] = pcb->pcb_dr0;
		dbregs->dr[1] = pcb->pcb_dr1;
		dbregs->dr[2] = pcb->pcb_dr2;
		dbregs->dr[3] = pcb->pcb_dr3;
		dbregs->dr[6] = pcb->pcb_dr6;
		dbregs->dr[7] = pcb->pcb_dr7;
	}
	dbregs->dr[4] = 0;
	dbregs->dr[5] = 0;
	dbregs->dr[8] = 0;
	dbregs->dr[9] = 0;
	dbregs->dr[10] = 0;
	dbregs->dr[11] = 0;
	dbregs->dr[12] = 0;
	dbregs->dr[13] = 0;
	dbregs->dr[14] = 0;
	dbregs->dr[15] = 0;
	return (0);
}
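
/*
 * Illustrative sketch (userland code, not part of this file):
 * fill_dbregs() backs the PT_GETDBREGS ptrace(2) request.  After a
 * debug trap, a tracer can inspect the %dr6 status bits to learn which
 * of dr0-dr3 fired; which_watchpoint_fired() is a hypothetical name.
 */
#if 0
#include <sys/types.h>
#include <sys/ptrace.h>
#include <machine/reg.h>

static int
which_watchpoint_fired(pid_t pid)
{
	struct dbreg d;
	int i;

	if (ptrace(PT_GETDBREGS, pid, (caddr_t)&d, 0) == -1)
		return (-1);
	for (i = 0; i < 4; i++)
		if (d.dr[6] & (1 << i))		/* B0..B3 status bits */
			return (i);
	return (-1);		/* no hardware breakpoint fired */
}
#endif
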
int
set_dbregs(struct thread *td, struct dbreg *dbregs)
{
	struct pcb *pcb;
	int i;

	if (td == NULL) {
		load_dr0(dbregs->dr[0]);
		load_dr1(dbregs->dr[1]);
		load_dr2(dbregs->dr[2]);
		load_dr3(dbregs->dr[3]);
		load_dr6(dbregs->dr[6]);
		load_dr7(dbregs->dr[7]);
	} else {
		/*
		 * Don't let an illegal value for dr7 get set.  Specifically,
		 * check for undefined settings.  Setting these bit patterns
		 * results in undefined behaviour and can lead to an
		 * unexpected TRCTRAP or a general protection fault right
		 * here.  The upper 32 bits of dr6 and dr7 must not be set.
		 */
		for (i = 0; i < 4; i++) {
			if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02)
				return (EINVAL);
			if (td->td_frame->tf_cs == _ucode32sel &&
			    DBREG_DR7_LEN(dbregs->dr[7], i) == DBREG_DR7_LEN_8)
				return (EINVAL);
		}
		if ((dbregs->dr[6] & 0xffffffff00000000ul) != 0 ||
		    (dbregs->dr[7] & 0xffffffff00000000ul) != 0)
			return (EINVAL);

		pcb = td->td_pcb;

		/*
		 * Don't let a process set a breakpoint that is not within
		 * the process's address space.  If a process could do this,
		 * it could halt the system by setting a breakpoint in the
		 * kernel (if ddb was enabled).  Thus, we need to make sure
		 * that no breakpoints are being enabled for addresses
		 * outside the process's address space.
		 *
		 * XXX - what about when the watched area of the user's
		 * address space is written into from within the kernel
		 * ... wouldn't that still cause a breakpoint to be generated
		 * from within kernel mode?
		 */

		if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) {
			/* dr0 is enabled */
			if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS)
				return (EINVAL);
		}
		if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) {
			/* dr1 is enabled */
			if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS)
				return (EINVAL);
		}
		if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) {
			/* dr2 is enabled */
			if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS)
				return (EINVAL);
		}
		if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) {
			/* dr3 is enabled */
			if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS)
				return (EINVAL);
		}

		pcb->pcb_dr0 = dbregs->dr[0];
		pcb->pcb_dr1 = dbregs->dr[1];
		pcb->pcb_dr2 = dbregs->dr[2];
		pcb->pcb_dr3 = dbregs->dr[3];
		pcb->pcb_dr6 = dbregs->dr[6];
		pcb->pcb_dr7 = dbregs->dr[7];

		set_pcb_flags(pcb, PCB_DBREGS);
	}

	return (0);
}

void
reset_dbregs(void)
{

	load_dr7(0);	/* Turn off the control bits first */
	load_dr0(0);
	load_dr1(0);
	load_dr2(0);
	load_dr3(0);
	load_dr6(0);
}
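
/*
 * Illustrative sketch (userland code, not part of this file): arming a
 * 4-byte write watchpoint in dr0 through PT_SETDBREGS, which is
 * validated by set_dbregs() above (no 0x02 access mode, address below
 * VM_MAXUSER_ADDRESS, upper 32 bits of dr6/dr7 clear).  The DBREG_DR7_*
 * macro names are taken from <machine/reg.h>; addr is assumed to be
 * 4-byte aligned, as the hardware expects for this length.
 */
#if 0
#include <sys/types.h>
#include <sys/ptrace.h>
#include <machine/reg.h>
#include <string.h>

static int
set_write_watchpoint(pid_t pid, unsigned long addr)
{
	struct dbreg d;

	memset(&d, 0, sizeof(d));
	d.dr[0] = addr;
	d.dr[7] = DBREG_DR7_SET(0, DBREG_DR7_LEN_4, DBREG_DR7_WRONLY,
	    DBREG_DR7_LOCAL_ENABLE);
	return (ptrace(PT_SETDBREGS, pid, (caddr_t)&d, 0));
}
#endif
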
/*
 * Return > 0 if a hardware breakpoint has been hit, and the
 * breakpoint was in user space.  Return 0, otherwise.
 */
int
user_dbreg_trap(void)
{
	u_int64_t dr7, dr6;	/* debug registers dr6 and dr7 */
	u_int64_t bp;		/* breakpoint bits extracted from dr6 */
	int nbp;		/* number of breakpoints that triggered */
	caddr_t addr[4];	/* breakpoint addresses */
	int i;

	dr7 = rdr7();
	if ((dr7 & 0x000000ff) == 0) {
		/*
		 * None of the local or global enable bits in dr7 is set,
		 * so the trap cannot have been caused by the hardware
		 * debug registers.
		 */
		return (0);
	}

	nbp = 0;
	dr6 = rdr6();
	bp = dr6 & 0x0000000f;

	if (!bp) {
		/*
		 * None of the breakpoint status bits is set, so this
		 * trap was not caused by any of the debug registers.
		 */
		return (0);
	}

	/*
	 * At least one of the breakpoints was hit; check which ones,
	 * and whether any of the addresses are in user space.
	 */
	if (bp & 0x01)
		addr[nbp++] = (caddr_t)rdr0();
	if (bp & 0x02)
		addr[nbp++] = (caddr_t)rdr1();
	if (bp & 0x04)
		addr[nbp++] = (caddr_t)rdr2();
	if (bp & 0x08)
		addr[nbp++] = (caddr_t)rdr3();

	for (i = 0; i < nbp; i++) {
		if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) {
			/* addr[i] is in user space. */
			return (nbp);
		}
	}

	/* None of the breakpoints is in user space. */
	return (0);
}

/*
 * pcb_flags is only modified by the current thread, or by other threads
 * when the current thread is stopped.  However, the current thread may
 * change it from interrupt context in cpu_switch(), or in the trap
 * handler.  When we read-modify-write pcb_flags from C sources, the
 * compiler may generate code that is not atomic with respect to the
 * interrupt handler.  If a trap or interrupt occurs while a cached value
 * is live, a flag set from the handler can be clobbered by the later
 * store of that cached value.  Therefore, we set and clear the flags
 * with single-instruction functions, which cannot race with
 * modifications of the flags from trap or interrupt context, because
 * traps and interrupts are delivered only on instruction boundaries.
 */
void
set_pcb_flags_raw(struct pcb *pcb, const u_int flags)
{

	__asm __volatile("orl %1,%0"
	    : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags)
	    : "cc", "memory");
}
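
/*
 * Illustrative sketch (not compiled): the race the preceding comment
 * describes.  A plain C read-modify-write can be split by the compiler
 * into a load and a store; a trap handler running between the two loses
 * its update.  The single "orl" in set_pcb_flags_raw() cannot be split
 * that way, since traps are delivered only on instruction boundaries.
 */
#if 0
static void
racy_set_pcb_flags(struct pcb *pcb, u_int flags)
{
	u_int old;

	old = pcb->pcb_flags;		/* load */
	/* A trap here may set a flag in pcb_flags ... */
	pcb->pcb_flags = old | flags;	/* ... which this store clobbers */
}
#endif
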
/*
 * Support for the RDFSBASE/WRFSBASE (and the analogous %gs base)
 * instructions requires that the kernel save MSR_FSBASE and
 * MSR_{K,}GSBASE into the pcb when user space may have modified the
 * bases.  The save must happen on a context switch or when the return
 * to usermode goes through doreti.
 *
 * Tracking of both events is performed by the pcb flag PCB_FULL_IRET;
 * as a consequence, the base MSRs must be saved each time the
 * PCB_FULL_IRET flag is set.  We disable interrupts to synchronize with
 * context switches.
 */
void
set_pcb_flags(struct pcb *pcb, const u_int flags)
{
	register_t r;

	if (curpcb == pcb &&
	    (flags & PCB_FULL_IRET) != 0 &&
	    (pcb->pcb_flags & PCB_FULL_IRET) == 0 &&
	    (cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0) {
		r = intr_disable();
		if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) {
			if (rfs() == _ufssel)
				pcb->pcb_fsbase = rdfsbase();
			if (rgs() == _ugssel)
				pcb->pcb_gsbase = rdmsr(MSR_KGSBASE);
		}
		set_pcb_flags_raw(pcb, flags);
		intr_restore(r);
	} else {
		set_pcb_flags_raw(pcb, flags);
	}
}

void
clear_pcb_flags(struct pcb *pcb, const u_int flags)
{

	__asm __volatile("andl %1,%0"
	    : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags)
	    : "cc", "memory");
}

#ifdef KDB

/*
 * Provide inb() and outb() as functions.  They are normally only available
 * as inline functions, and thus cannot be called from the debugger.
 */

/* silence compiler warnings */
u_char inb_(u_short);
void outb_(u_short, u_char);

u_char
inb_(u_short port)
{
	return (inb(port));
}

void
outb_(u_short port, u_char data)
{
	outb(port, data);
}

#endif /* KDB */
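
/*
 * Illustrative ddb(4) usage (hypothetical port numbers): on a kernel
 * built with KDB, the wrappers above are real functions with external
 * linkage, so they can be invoked from the debugger prompt, e.g.:
 *
 *	db> call inb_(0x64)
 *	db> call outb_(0x80, 0)
 */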