/* machdep.c revision 347700 */
1/*- 2 * Copyright (c) 1992 Terrence R. Lambert. 3 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. 4 * All rights reserved. 5 * 6 * This code is derived from software contributed to Berkeley by 7 * William Jolitz. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 3. All advertising materials mentioning features or use of this software 18 * must display the following acknowledgement: 19 * This product includes software developed by the University of 20 * California, Berkeley and its contributors. 21 * 4. Neither the name of the University nor the names of its contributors 22 * may be used to endorse or promote products derived from this software 23 * without specific prior written permission. 24 * 25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 28 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 35 * SUCH DAMAGE. 36 * 37 * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 38 */ 39 40#include <sys/cdefs.h> 41__FBSDID("$FreeBSD: stable/11/sys/i386/i386/machdep.c 347700 2019-05-16 14:42:16Z markj $"); 42 43#include "opt_apic.h" 44#include "opt_atpic.h" 45#include "opt_compat.h" 46#include "opt_cpu.h" 47#include "opt_ddb.h" 48#include "opt_inet.h" 49#include "opt_isa.h" 50#include "opt_kstack_pages.h" 51#include "opt_maxmem.h" 52#include "opt_mp_watchdog.h" 53#include "opt_perfmon.h" 54#include "opt_platform.h" 55#include "opt_xbox.h" 56 57#include <sys/param.h> 58#include <sys/proc.h> 59#include <sys/systm.h> 60#include <sys/bio.h> 61#include <sys/buf.h> 62#include <sys/bus.h> 63#include <sys/callout.h> 64#include <sys/cons.h> 65#include <sys/cpu.h> 66#include <sys/eventhandler.h> 67#include <sys/exec.h> 68#include <sys/imgact.h> 69#include <sys/kdb.h> 70#include <sys/kernel.h> 71#include <sys/ktr.h> 72#include <sys/linker.h> 73#include <sys/lock.h> 74#include <sys/malloc.h> 75#include <sys/memrange.h> 76#include <sys/msgbuf.h> 77#include <sys/mutex.h> 78#include <sys/pcpu.h> 79#include <sys/ptrace.h> 80#include <sys/reboot.h> 81#include <sys/rwlock.h> 82#include <sys/sched.h> 83#include <sys/signalvar.h> 84#ifdef SMP 85#include <sys/smp.h> 86#endif 87#include <sys/syscallsubr.h> 88#include <sys/sysctl.h> 89#include <sys/sysent.h> 90#include <sys/sysproto.h> 91#include <sys/ucontext.h> 92#include <sys/vmmeter.h> 93 94#include <vm/vm.h> 95#include 
<vm/vm_extern.h> 96#include <vm/vm_kern.h> 97#include <vm/vm_page.h> 98#include <vm/vm_map.h> 99#include <vm/vm_object.h> 100#include <vm/vm_pager.h> 101#include <vm/vm_param.h> 102#include <vm/vm_phys.h> 103 104#ifdef DDB 105#ifndef KDB 106#error KDB must be enabled in order for DDB to work! 107#endif 108#include <ddb/ddb.h> 109#include <ddb/db_sym.h> 110#endif 111 112#ifdef PC98 113#include <pc98/pc98/pc98_machdep.h> 114#else 115#include <isa/rtc.h> 116#endif 117 118#include <net/netisr.h> 119 120#include <machine/bootinfo.h> 121#include <machine/clock.h> 122#include <machine/cpu.h> 123#include <machine/cputypes.h> 124#include <machine/intr_machdep.h> 125#include <x86/mca.h> 126#include <machine/md_var.h> 127#include <machine/metadata.h> 128#include <machine/mp_watchdog.h> 129#include <machine/pc/bios.h> 130#include <machine/pcb.h> 131#include <machine/pcb_ext.h> 132#include <machine/proc.h> 133#include <machine/reg.h> 134#include <machine/sigframe.h> 135#include <machine/specialreg.h> 136#include <x86/ucode.h> 137#include <machine/vm86.h> 138#include <x86/init.h> 139#ifdef PERFMON 140#include <machine/perfmon.h> 141#endif 142#ifdef SMP 143#include <machine/smp.h> 144#endif 145#ifdef FDT 146#include <x86/fdt.h> 147#endif 148 149#ifdef DEV_APIC 150#include <x86/apicvar.h> 151#endif 152 153#ifdef DEV_ISA 154#include <x86/isa/icu.h> 155#endif 156 157#ifdef XBOX 158#include <machine/xbox.h> 159 160int arch_i386_is_xbox = 0; 161uint32_t arch_i386_xbox_memsize = 0; 162#endif 163 164/* Sanity check for __curthread() */ 165CTASSERT(offsetof(struct pcpu, pc_curthread) == 0); 166 167extern register_t init386(int first); 168extern void dblfault_handler(void); 169void identify_cpu(void); 170 171static void cpu_startup(void *); 172static void fpstate_drop(struct thread *td); 173static void get_fpcontext(struct thread *td, mcontext_t *mcp, 174 char *xfpusave, size_t xfpusave_len); 175static int set_fpcontext(struct thread *td, mcontext_t *mcp, 176 char *xfpustate, size_t 
xfpustate_len); 177SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL); 178 179/* Intel ICH registers */ 180#define ICH_PMBASE 0x400 181#define ICH_SMI_EN ICH_PMBASE + 0x30 182 183int _udatasel, _ucodesel; 184u_int basemem; 185 186#ifdef PC98 187int need_pre_dma_flush; /* If 1, use wbinvd befor DMA transfer. */ 188int need_post_dma_flush; /* If 1, use invd after DMA transfer. */ 189 190static int ispc98 = 1; 191SYSCTL_INT(_machdep, OID_AUTO, ispc98, CTLFLAG_RD, &ispc98, 0, ""); 192#endif 193 194int cold = 1; 195 196#ifdef COMPAT_43 197static void osendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask); 198#endif 199#ifdef COMPAT_FREEBSD4 200static void freebsd4_sendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask); 201#endif 202 203long Maxmem = 0; 204long realmem = 0; 205 206#ifdef PAE 207FEATURE(pae, "Physical Address Extensions"); 208#endif 209 210/* 211 * The number of PHYSMAP entries must be one less than the number of 212 * PHYSSEG entries because the PHYSMAP entry that spans the largest 213 * physical address that is accessible by ISA DMA is split into two 214 * PHYSSEG entries. 215 */ 216#define PHYSMAP_SIZE (2 * (VM_PHYSSEG_MAX - 1)) 217 218vm_paddr_t phys_avail[PHYSMAP_SIZE + 2]; 219vm_paddr_t dump_avail[PHYSMAP_SIZE + 2]; 220 221/* must be 2 less so 0 0 can signal end of chunks */ 222#define PHYS_AVAIL_ARRAY_END (nitems(phys_avail) - 2) 223#define DUMP_AVAIL_ARRAY_END (nitems(dump_avail) - 2) 224 225struct kva_md_info kmi; 226 227static struct trapframe proc0_tf; 228struct pcpu __pcpu[MAXCPU]; 229 230struct mtx icu_lock; 231 232struct mem_range_softc mem_range_softc; 233 234 /* Default init_ops implementation. 
*/ 235 struct init_ops init_ops = { 236 .early_clock_source_init = i8254_init, 237 .early_delay = i8254_delay, 238#ifdef DEV_APIC 239 .msi_init = msi_init, 240#endif 241 }; 242 243static void 244cpu_startup(dummy) 245 void *dummy; 246{ 247 uintmax_t memsize; 248 char *sysenv; 249 250#ifndef PC98 251 /* 252 * On MacBooks, we need to disallow the legacy USB circuit to 253 * generate an SMI# because this can cause several problems, 254 * namely: incorrect CPU frequency detection and failure to 255 * start the APs. 256 * We do this by disabling a bit in the SMI_EN (SMI Control and 257 * Enable register) of the Intel ICH LPC Interface Bridge. 258 */ 259 sysenv = kern_getenv("smbios.system.product"); 260 if (sysenv != NULL) { 261 if (strncmp(sysenv, "MacBook1,1", 10) == 0 || 262 strncmp(sysenv, "MacBook3,1", 10) == 0 || 263 strncmp(sysenv, "MacBook4,1", 10) == 0 || 264 strncmp(sysenv, "MacBookPro1,1", 13) == 0 || 265 strncmp(sysenv, "MacBookPro1,2", 13) == 0 || 266 strncmp(sysenv, "MacBookPro3,1", 13) == 0 || 267 strncmp(sysenv, "MacBookPro4,1", 13) == 0 || 268 strncmp(sysenv, "Macmini1,1", 10) == 0) { 269 if (bootverbose) 270 printf("Disabling LEGACY_USB_EN bit on " 271 "Intel ICH.\n"); 272 outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8); 273 } 274 freeenv(sysenv); 275 } 276#endif /* !PC98 */ 277 278 /* 279 * Good {morning,afternoon,evening,night}. 280 */ 281 startrtclock(); 282 printcpuinfo(); 283 panicifcpuunsupported(); 284#ifdef PERFMON 285 perfmon_init(); 286#endif 287 288 /* 289 * Display physical memory if SMBIOS reports reasonable amount. 
290 */ 291 memsize = 0; 292 sysenv = kern_getenv("smbios.memory.enabled"); 293 if (sysenv != NULL) { 294 memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10; 295 freeenv(sysenv); 296 } 297 if (memsize < ptoa((uintmax_t)vm_cnt.v_free_count)) 298 memsize = ptoa((uintmax_t)Maxmem); 299 printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20); 300 realmem = atop(memsize); 301 302 /* 303 * Display any holes after the first chunk of extended memory. 304 */ 305 if (bootverbose) { 306 int indx; 307 308 printf("Physical memory chunk(s):\n"); 309 for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { 310 vm_paddr_t size; 311 312 size = phys_avail[indx + 1] - phys_avail[indx]; 313 printf( 314 "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n", 315 (uintmax_t)phys_avail[indx], 316 (uintmax_t)phys_avail[indx + 1] - 1, 317 (uintmax_t)size, (uintmax_t)size / PAGE_SIZE); 318 } 319 } 320 321 vm_ksubmap_init(&kmi); 322 323 printf("avail memory = %ju (%ju MB)\n", 324 ptoa((uintmax_t)vm_cnt.v_free_count), 325 ptoa((uintmax_t)vm_cnt.v_free_count) / 1048576); 326 327 /* 328 * Set up buffers, so they can be used to read disk labels. 329 */ 330 bufinit(); 331 vm_pager_bufferinit(); 332 cpu_setregs(); 333} 334 335/* 336 * Send an interrupt to process. 337 * 338 * Stack is set up to allow sigcode stored 339 * at top to call routine, followed by call 340 * to sigreturn routine below. After sigreturn 341 * resets the signal mask, the stack, and the 342 * frame pointer, it returns to the user 343 * specified pc, psl. 
344 */ 345#ifdef COMPAT_43 346static void 347osendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) 348{ 349 struct osigframe sf, *fp; 350 struct proc *p; 351 struct thread *td; 352 struct sigacts *psp; 353 struct trapframe *regs; 354 int sig; 355 int oonstack; 356 357 td = curthread; 358 p = td->td_proc; 359 PROC_LOCK_ASSERT(p, MA_OWNED); 360 sig = ksi->ksi_signo; 361 psp = p->p_sigacts; 362 mtx_assert(&psp->ps_mtx, MA_OWNED); 363 regs = td->td_frame; 364 oonstack = sigonstack(regs->tf_esp); 365 366 /* Allocate space for the signal handler context. */ 367 if ((td->td_pflags & TDP_ALTSTACK) && !oonstack && 368 SIGISMEMBER(psp->ps_sigonstack, sig)) { 369 fp = (struct osigframe *)((uintptr_t)td->td_sigstk.ss_sp + 370 td->td_sigstk.ss_size - sizeof(struct osigframe)); 371#if defined(COMPAT_43) 372 td->td_sigstk.ss_flags |= SS_ONSTACK; 373#endif 374 } else 375 fp = (struct osigframe *)regs->tf_esp - 1; 376 377 /* Build the argument list for the signal handler. */ 378 sf.sf_signum = sig; 379 sf.sf_scp = (register_t)&fp->sf_siginfo.si_sc; 380 bzero(&sf.sf_siginfo, sizeof(sf.sf_siginfo)); 381 if (SIGISMEMBER(psp->ps_siginfo, sig)) { 382 /* Signal handler installed with SA_SIGINFO. */ 383 sf.sf_arg2 = (register_t)&fp->sf_siginfo; 384 sf.sf_siginfo.si_signo = sig; 385 sf.sf_siginfo.si_code = ksi->ksi_code; 386 sf.sf_ahu.sf_action = (__osiginfohandler_t *)catcher; 387 sf.sf_addr = 0; 388 } else { 389 /* Old FreeBSD-style arguments. */ 390 sf.sf_arg2 = ksi->ksi_code; 391 sf.sf_addr = (register_t)ksi->ksi_addr; 392 sf.sf_ahu.sf_handler = catcher; 393 } 394 mtx_unlock(&psp->ps_mtx); 395 PROC_UNLOCK(p); 396 397 /* Save most if not all of trap frame. 
*/ 398 sf.sf_siginfo.si_sc.sc_eax = regs->tf_eax; 399 sf.sf_siginfo.si_sc.sc_ebx = regs->tf_ebx; 400 sf.sf_siginfo.si_sc.sc_ecx = regs->tf_ecx; 401 sf.sf_siginfo.si_sc.sc_edx = regs->tf_edx; 402 sf.sf_siginfo.si_sc.sc_esi = regs->tf_esi; 403 sf.sf_siginfo.si_sc.sc_edi = regs->tf_edi; 404 sf.sf_siginfo.si_sc.sc_cs = regs->tf_cs; 405 sf.sf_siginfo.si_sc.sc_ds = regs->tf_ds; 406 sf.sf_siginfo.si_sc.sc_ss = regs->tf_ss; 407 sf.sf_siginfo.si_sc.sc_es = regs->tf_es; 408 sf.sf_siginfo.si_sc.sc_fs = regs->tf_fs; 409 sf.sf_siginfo.si_sc.sc_gs = rgs(); 410 sf.sf_siginfo.si_sc.sc_isp = regs->tf_isp; 411 412 /* Build the signal context to be used by osigreturn(). */ 413 sf.sf_siginfo.si_sc.sc_onstack = (oonstack) ? 1 : 0; 414 SIG2OSIG(*mask, sf.sf_siginfo.si_sc.sc_mask); 415 sf.sf_siginfo.si_sc.sc_sp = regs->tf_esp; 416 sf.sf_siginfo.si_sc.sc_fp = regs->tf_ebp; 417 sf.sf_siginfo.si_sc.sc_pc = regs->tf_eip; 418 sf.sf_siginfo.si_sc.sc_ps = regs->tf_eflags; 419 sf.sf_siginfo.si_sc.sc_trapno = regs->tf_trapno; 420 sf.sf_siginfo.si_sc.sc_err = regs->tf_err; 421 422 /* 423 * If we're a vm86 process, we want to save the segment registers. 424 * We also change eflags to be our emulated eflags, not the actual 425 * eflags. 426 */ 427 if (regs->tf_eflags & PSL_VM) { 428 /* XXX confusing names: `tf' isn't a trapframe; `regs' is. */ 429 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; 430 struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86; 431 432 sf.sf_siginfo.si_sc.sc_gs = tf->tf_vm86_gs; 433 sf.sf_siginfo.si_sc.sc_fs = tf->tf_vm86_fs; 434 sf.sf_siginfo.si_sc.sc_es = tf->tf_vm86_es; 435 sf.sf_siginfo.si_sc.sc_ds = tf->tf_vm86_ds; 436 437 if (vm86->vm86_has_vme == 0) 438 sf.sf_siginfo.si_sc.sc_ps = 439 (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | 440 (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); 441 442 /* See sendsig() for comments. */ 443 tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP); 444 } 445 446 /* 447 * Copy the sigframe out to the user's stack. 
448 */ 449 if (copyout(&sf, fp, sizeof(*fp)) != 0) { 450 PROC_LOCK(p); 451 sigexit(td, SIGILL); 452 } 453 454 regs->tf_esp = (int)fp; 455 if (p->p_sysent->sv_sigcode_base != 0) { 456 regs->tf_eip = p->p_sysent->sv_sigcode_base + szsigcode - 457 szosigcode; 458 } else { 459 /* a.out sysentvec does not use shared page */ 460 regs->tf_eip = p->p_sysent->sv_psstrings - szosigcode; 461 } 462 regs->tf_eflags &= ~(PSL_T | PSL_D); 463 regs->tf_cs = _ucodesel; 464 regs->tf_ds = _udatasel; 465 regs->tf_es = _udatasel; 466 regs->tf_fs = _udatasel; 467 load_gs(_udatasel); 468 regs->tf_ss = _udatasel; 469 PROC_LOCK(p); 470 mtx_lock(&psp->ps_mtx); 471} 472#endif /* COMPAT_43 */ 473 474#ifdef COMPAT_FREEBSD4 475static void 476freebsd4_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) 477{ 478 struct sigframe4 sf, *sfp; 479 struct proc *p; 480 struct thread *td; 481 struct sigacts *psp; 482 struct trapframe *regs; 483 int sig; 484 int oonstack; 485 486 td = curthread; 487 p = td->td_proc; 488 PROC_LOCK_ASSERT(p, MA_OWNED); 489 sig = ksi->ksi_signo; 490 psp = p->p_sigacts; 491 mtx_assert(&psp->ps_mtx, MA_OWNED); 492 regs = td->td_frame; 493 oonstack = sigonstack(regs->tf_esp); 494 495 /* Save user context. */ 496 bzero(&sf, sizeof(sf)); 497 sf.sf_uc.uc_sigmask = *mask; 498 sf.sf_uc.uc_stack = td->td_sigstk; 499 sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) 500 ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; 501 sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; 502 sf.sf_uc.uc_mcontext.mc_gs = rgs(); 503 bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs)); 504 bzero(sf.sf_uc.uc_mcontext.mc_fpregs, 505 sizeof(sf.sf_uc.uc_mcontext.mc_fpregs)); 506 bzero(sf.sf_uc.uc_mcontext.__spare__, 507 sizeof(sf.sf_uc.uc_mcontext.__spare__)); 508 bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__)); 509 510 /* Allocate space for the signal handler context. 
*/ 511 if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && 512 SIGISMEMBER(psp->ps_sigonstack, sig)) { 513 sfp = (struct sigframe4 *)((uintptr_t)td->td_sigstk.ss_sp + 514 td->td_sigstk.ss_size - sizeof(struct sigframe4)); 515#if defined(COMPAT_43) 516 td->td_sigstk.ss_flags |= SS_ONSTACK; 517#endif 518 } else 519 sfp = (struct sigframe4 *)regs->tf_esp - 1; 520 521 /* Build the argument list for the signal handler. */ 522 sf.sf_signum = sig; 523 sf.sf_ucontext = (register_t)&sfp->sf_uc; 524 bzero(&sf.sf_si, sizeof(sf.sf_si)); 525 if (SIGISMEMBER(psp->ps_siginfo, sig)) { 526 /* Signal handler installed with SA_SIGINFO. */ 527 sf.sf_siginfo = (register_t)&sfp->sf_si; 528 sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher; 529 530 /* Fill in POSIX parts */ 531 sf.sf_si.si_signo = sig; 532 sf.sf_si.si_code = ksi->ksi_code; 533 sf.sf_si.si_addr = ksi->ksi_addr; 534 } else { 535 /* Old FreeBSD-style arguments. */ 536 sf.sf_siginfo = ksi->ksi_code; 537 sf.sf_addr = (register_t)ksi->ksi_addr; 538 sf.sf_ahu.sf_handler = catcher; 539 } 540 mtx_unlock(&psp->ps_mtx); 541 PROC_UNLOCK(p); 542 543 /* 544 * If we're a vm86 process, we want to save the segment registers. 545 * We also change eflags to be our emulated eflags, not the actual 546 * eflags. 547 */ 548 if (regs->tf_eflags & PSL_VM) { 549 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; 550 struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86; 551 552 sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs; 553 sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs; 554 sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es; 555 sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds; 556 557 if (vm86->vm86_has_vme == 0) 558 sf.sf_uc.uc_mcontext.mc_eflags = 559 (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | 560 (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); 561 562 /* 563 * Clear PSL_NT to inhibit T_TSSFLT faults on return from 564 * syscalls made by the signal handler. This just avoids 565 * wasting time for our lazy fixup of such faults. 
PSL_NT 566 * does nothing in vm86 mode, but vm86 programs can set it 567 * almost legitimately in probes for old cpu types. 568 */ 569 tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP); 570 } 571 572 /* 573 * Copy the sigframe out to the user's stack. 574 */ 575 if (copyout(&sf, sfp, sizeof(*sfp)) != 0) { 576 PROC_LOCK(p); 577 sigexit(td, SIGILL); 578 } 579 580 regs->tf_esp = (int)sfp; 581 regs->tf_eip = p->p_sysent->sv_sigcode_base + szsigcode - 582 szfreebsd4_sigcode; 583 regs->tf_eflags &= ~(PSL_T | PSL_D); 584 regs->tf_cs = _ucodesel; 585 regs->tf_ds = _udatasel; 586 regs->tf_es = _udatasel; 587 regs->tf_fs = _udatasel; 588 regs->tf_ss = _udatasel; 589 PROC_LOCK(p); 590 mtx_lock(&psp->ps_mtx); 591} 592#endif /* COMPAT_FREEBSD4 */ 593 594void 595sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) 596{ 597 struct sigframe sf, *sfp; 598 struct proc *p; 599 struct thread *td; 600 struct sigacts *psp; 601 char *sp; 602 struct trapframe *regs; 603 struct segment_descriptor *sdp; 604 char *xfpusave; 605 size_t xfpusave_len; 606 int sig; 607 int oonstack; 608 609 td = curthread; 610 p = td->td_proc; 611 PROC_LOCK_ASSERT(p, MA_OWNED); 612 sig = ksi->ksi_signo; 613 psp = p->p_sigacts; 614 mtx_assert(&psp->ps_mtx, MA_OWNED); 615#ifdef COMPAT_FREEBSD4 616 if (SIGISMEMBER(psp->ps_freebsd4, sig)) { 617 freebsd4_sendsig(catcher, ksi, mask); 618 return; 619 } 620#endif 621#ifdef COMPAT_43 622 if (SIGISMEMBER(psp->ps_osigset, sig)) { 623 osendsig(catcher, ksi, mask); 624 return; 625 } 626#endif 627 regs = td->td_frame; 628 oonstack = sigonstack(regs->tf_esp); 629 630 if (cpu_max_ext_state_size > sizeof(union savefpu) && use_xsave) { 631 xfpusave_len = cpu_max_ext_state_size - sizeof(union savefpu); 632 xfpusave = __builtin_alloca(xfpusave_len); 633 } else { 634 xfpusave_len = 0; 635 xfpusave = NULL; 636 } 637 638 /* Save user context. 
*/ 639 bzero(&sf, sizeof(sf)); 640 sf.sf_uc.uc_sigmask = *mask; 641 sf.sf_uc.uc_stack = td->td_sigstk; 642 sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) 643 ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; 644 sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; 645 sf.sf_uc.uc_mcontext.mc_gs = rgs(); 646 bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs)); 647 sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */ 648 get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len); 649 fpstate_drop(td); 650 /* 651 * Unconditionally fill the fsbase and gsbase into the mcontext. 652 */ 653 sdp = &td->td_pcb->pcb_fsd; 654 sf.sf_uc.uc_mcontext.mc_fsbase = sdp->sd_hibase << 24 | 655 sdp->sd_lobase; 656 sdp = &td->td_pcb->pcb_gsd; 657 sf.sf_uc.uc_mcontext.mc_gsbase = sdp->sd_hibase << 24 | 658 sdp->sd_lobase; 659 bzero(sf.sf_uc.uc_mcontext.mc_spare2, 660 sizeof(sf.sf_uc.uc_mcontext.mc_spare2)); 661 bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__)); 662 663 /* Allocate space for the signal handler context. */ 664 if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && 665 SIGISMEMBER(psp->ps_sigonstack, sig)) { 666 sp = (char *)td->td_sigstk.ss_sp + td->td_sigstk.ss_size; 667#if defined(COMPAT_43) 668 td->td_sigstk.ss_flags |= SS_ONSTACK; 669#endif 670 } else 671 sp = (char *)regs->tf_esp - 128; 672 if (xfpusave != NULL) { 673 sp -= xfpusave_len; 674 sp = (char *)((unsigned int)sp & ~0x3F); 675 sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp; 676 } 677 sp -= sizeof(struct sigframe); 678 679 /* Align to 16 bytes. */ 680 sfp = (struct sigframe *)((unsigned int)sp & ~0xF); 681 682 /* Build the argument list for the signal handler. */ 683 sf.sf_signum = sig; 684 sf.sf_ucontext = (register_t)&sfp->sf_uc; 685 bzero(&sf.sf_si, sizeof(sf.sf_si)); 686 if (SIGISMEMBER(psp->ps_siginfo, sig)) { 687 /* Signal handler installed with SA_SIGINFO. 
*/ 688 sf.sf_siginfo = (register_t)&sfp->sf_si; 689 sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher; 690 691 /* Fill in POSIX parts */ 692 sf.sf_si = ksi->ksi_info; 693 sf.sf_si.si_signo = sig; /* maybe a translated signal */ 694 } else { 695 /* Old FreeBSD-style arguments. */ 696 sf.sf_siginfo = ksi->ksi_code; 697 sf.sf_addr = (register_t)ksi->ksi_addr; 698 sf.sf_ahu.sf_handler = catcher; 699 } 700 mtx_unlock(&psp->ps_mtx); 701 PROC_UNLOCK(p); 702 703 /* 704 * If we're a vm86 process, we want to save the segment registers. 705 * We also change eflags to be our emulated eflags, not the actual 706 * eflags. 707 */ 708 if (regs->tf_eflags & PSL_VM) { 709 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; 710 struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86; 711 712 sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs; 713 sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs; 714 sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es; 715 sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds; 716 717 if (vm86->vm86_has_vme == 0) 718 sf.sf_uc.uc_mcontext.mc_eflags = 719 (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | 720 (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); 721 722 /* 723 * Clear PSL_NT to inhibit T_TSSFLT faults on return from 724 * syscalls made by the signal handler. This just avoids 725 * wasting time for our lazy fixup of such faults. PSL_NT 726 * does nothing in vm86 mode, but vm86 programs can set it 727 * almost legitimately in probes for old cpu types. 728 */ 729 tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP); 730 } 731 732 /* 733 * Copy the sigframe out to the user's stack. 
734 */ 735 if (copyout(&sf, sfp, sizeof(*sfp)) != 0 || 736 (xfpusave != NULL && copyout(xfpusave, 737 (void *)sf.sf_uc.uc_mcontext.mc_xfpustate, xfpusave_len) 738 != 0)) { 739 PROC_LOCK(p); 740 sigexit(td, SIGILL); 741 } 742 743 regs->tf_esp = (int)sfp; 744 regs->tf_eip = p->p_sysent->sv_sigcode_base; 745 if (regs->tf_eip == 0) 746 regs->tf_eip = p->p_sysent->sv_psstrings - szsigcode; 747 regs->tf_eflags &= ~(PSL_T | PSL_D); 748 regs->tf_cs = _ucodesel; 749 regs->tf_ds = _udatasel; 750 regs->tf_es = _udatasel; 751 regs->tf_fs = _udatasel; 752 regs->tf_ss = _udatasel; 753 PROC_LOCK(p); 754 mtx_lock(&psp->ps_mtx); 755} 756 757/* 758 * System call to cleanup state after a signal 759 * has been taken. Reset signal mask and 760 * stack state from context left by sendsig (above). 761 * Return to previous pc and psl as specified by 762 * context left by sendsig. Check carefully to 763 * make sure that the user has not modified the 764 * state to gain improper privileges. 765 * 766 * MPSAFE 767 */ 768#ifdef COMPAT_43 769int 770osigreturn(td, uap) 771 struct thread *td; 772 struct osigreturn_args /* { 773 struct osigcontext *sigcntxp; 774 } */ *uap; 775{ 776 struct osigcontext sc; 777 struct trapframe *regs; 778 struct osigcontext *scp; 779 int eflags, error; 780 ksiginfo_t ksi; 781 782 regs = td->td_frame; 783 error = copyin(uap->sigcntxp, &sc, sizeof(sc)); 784 if (error != 0) 785 return (error); 786 scp = ≻ 787 eflags = scp->sc_ps; 788 if (eflags & PSL_VM) { 789 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; 790 struct vm86_kernel *vm86; 791 792 /* 793 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't 794 * set up the vm86 area, and we can't enter vm86 mode. 795 */ 796 if (td->td_pcb->pcb_ext == 0) 797 return (EINVAL); 798 vm86 = &td->td_pcb->pcb_ext->ext_vm86; 799 if (vm86->vm86_inited == 0) 800 return (EINVAL); 801 802 /* Go back to user mode if both flags are set. 
*/ 803 if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) { 804 ksiginfo_init_trap(&ksi); 805 ksi.ksi_signo = SIGBUS; 806 ksi.ksi_code = BUS_OBJERR; 807 ksi.ksi_addr = (void *)regs->tf_eip; 808 trapsignal(td, &ksi); 809 } 810 811 if (vm86->vm86_has_vme) { 812 eflags = (tf->tf_eflags & ~VME_USERCHANGE) | 813 (eflags & VME_USERCHANGE) | PSL_VM; 814 } else { 815 vm86->vm86_eflags = eflags; /* save VIF, VIP */ 816 eflags = (tf->tf_eflags & ~VM_USERCHANGE) | 817 (eflags & VM_USERCHANGE) | PSL_VM; 818 } 819 tf->tf_vm86_ds = scp->sc_ds; 820 tf->tf_vm86_es = scp->sc_es; 821 tf->tf_vm86_fs = scp->sc_fs; 822 tf->tf_vm86_gs = scp->sc_gs; 823 tf->tf_ds = _udatasel; 824 tf->tf_es = _udatasel; 825 tf->tf_fs = _udatasel; 826 } else { 827 /* 828 * Don't allow users to change privileged or reserved flags. 829 */ 830 if (!EFL_SECURE(eflags, regs->tf_eflags)) { 831 return (EINVAL); 832 } 833 834 /* 835 * Don't allow users to load a valid privileged %cs. Let the 836 * hardware check for invalid selectors, excess privilege in 837 * other selectors, invalid %eip's and invalid %esp's. 838 */ 839 if (!CS_SECURE(scp->sc_cs)) { 840 ksiginfo_init_trap(&ksi); 841 ksi.ksi_signo = SIGBUS; 842 ksi.ksi_code = BUS_OBJERR; 843 ksi.ksi_trapno = T_PROTFLT; 844 ksi.ksi_addr = (void *)regs->tf_eip; 845 trapsignal(td, &ksi); 846 return (EINVAL); 847 } 848 regs->tf_ds = scp->sc_ds; 849 regs->tf_es = scp->sc_es; 850 regs->tf_fs = scp->sc_fs; 851 } 852 853 /* Restore remaining registers. 
*/ 854 regs->tf_eax = scp->sc_eax; 855 regs->tf_ebx = scp->sc_ebx; 856 regs->tf_ecx = scp->sc_ecx; 857 regs->tf_edx = scp->sc_edx; 858 regs->tf_esi = scp->sc_esi; 859 regs->tf_edi = scp->sc_edi; 860 regs->tf_cs = scp->sc_cs; 861 regs->tf_ss = scp->sc_ss; 862 regs->tf_isp = scp->sc_isp; 863 regs->tf_ebp = scp->sc_fp; 864 regs->tf_esp = scp->sc_sp; 865 regs->tf_eip = scp->sc_pc; 866 regs->tf_eflags = eflags; 867 868#if defined(COMPAT_43) 869 if (scp->sc_onstack & 1) 870 td->td_sigstk.ss_flags |= SS_ONSTACK; 871 else 872 td->td_sigstk.ss_flags &= ~SS_ONSTACK; 873#endif 874 kern_sigprocmask(td, SIG_SETMASK, (sigset_t *)&scp->sc_mask, NULL, 875 SIGPROCMASK_OLD); 876 return (EJUSTRETURN); 877} 878#endif /* COMPAT_43 */ 879 880#ifdef COMPAT_FREEBSD4 881/* 882 * MPSAFE 883 */ 884int 885freebsd4_sigreturn(td, uap) 886 struct thread *td; 887 struct freebsd4_sigreturn_args /* { 888 const ucontext4 *sigcntxp; 889 } */ *uap; 890{ 891 struct ucontext4 uc; 892 struct trapframe *regs; 893 struct ucontext4 *ucp; 894 int cs, eflags, error; 895 ksiginfo_t ksi; 896 897 error = copyin(uap->sigcntxp, &uc, sizeof(uc)); 898 if (error != 0) 899 return (error); 900 ucp = &uc; 901 regs = td->td_frame; 902 eflags = ucp->uc_mcontext.mc_eflags; 903 if (eflags & PSL_VM) { 904 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; 905 struct vm86_kernel *vm86; 906 907 /* 908 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't 909 * set up the vm86 area, and we can't enter vm86 mode. 910 */ 911 if (td->td_pcb->pcb_ext == 0) 912 return (EINVAL); 913 vm86 = &td->td_pcb->pcb_ext->ext_vm86; 914 if (vm86->vm86_inited == 0) 915 return (EINVAL); 916 917 /* Go back to user mode if both flags are set. 
*/ 918 if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) { 919 ksiginfo_init_trap(&ksi); 920 ksi.ksi_signo = SIGBUS; 921 ksi.ksi_code = BUS_OBJERR; 922 ksi.ksi_addr = (void *)regs->tf_eip; 923 trapsignal(td, &ksi); 924 } 925 if (vm86->vm86_has_vme) { 926 eflags = (tf->tf_eflags & ~VME_USERCHANGE) | 927 (eflags & VME_USERCHANGE) | PSL_VM; 928 } else { 929 vm86->vm86_eflags = eflags; /* save VIF, VIP */ 930 eflags = (tf->tf_eflags & ~VM_USERCHANGE) | 931 (eflags & VM_USERCHANGE) | PSL_VM; 932 } 933 bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe)); 934 tf->tf_eflags = eflags; 935 tf->tf_vm86_ds = tf->tf_ds; 936 tf->tf_vm86_es = tf->tf_es; 937 tf->tf_vm86_fs = tf->tf_fs; 938 tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs; 939 tf->tf_ds = _udatasel; 940 tf->tf_es = _udatasel; 941 tf->tf_fs = _udatasel; 942 } else { 943 /* 944 * Don't allow users to change privileged or reserved flags. 945 */ 946 if (!EFL_SECURE(eflags, regs->tf_eflags)) { 947 uprintf("pid %d (%s): freebsd4_sigreturn eflags = 0x%x\n", 948 td->td_proc->p_pid, td->td_name, eflags); 949 return (EINVAL); 950 } 951 952 /* 953 * Don't allow users to load a valid privileged %cs. Let the 954 * hardware check for invalid selectors, excess privilege in 955 * other selectors, invalid %eip's and invalid %esp's. 
956 */ 957 cs = ucp->uc_mcontext.mc_cs; 958 if (!CS_SECURE(cs)) { 959 uprintf("pid %d (%s): freebsd4_sigreturn cs = 0x%x\n", 960 td->td_proc->p_pid, td->td_name, cs); 961 ksiginfo_init_trap(&ksi); 962 ksi.ksi_signo = SIGBUS; 963 ksi.ksi_code = BUS_OBJERR; 964 ksi.ksi_trapno = T_PROTFLT; 965 ksi.ksi_addr = (void *)regs->tf_eip; 966 trapsignal(td, &ksi); 967 return (EINVAL); 968 } 969 970 bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs)); 971 } 972 973#if defined(COMPAT_43) 974 if (ucp->uc_mcontext.mc_onstack & 1) 975 td->td_sigstk.ss_flags |= SS_ONSTACK; 976 else 977 td->td_sigstk.ss_flags &= ~SS_ONSTACK; 978#endif 979 kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0); 980 return (EJUSTRETURN); 981} 982#endif /* COMPAT_FREEBSD4 */ 983 984/* 985 * MPSAFE 986 */ 987int 988sys_sigreturn(td, uap) 989 struct thread *td; 990 struct sigreturn_args /* { 991 const struct __ucontext *sigcntxp; 992 } */ *uap; 993{ 994 ucontext_t uc; 995 struct proc *p; 996 struct trapframe *regs; 997 ucontext_t *ucp; 998 char *xfpustate; 999 size_t xfpustate_len; 1000 int cs, eflags, error, ret; 1001 ksiginfo_t ksi; 1002 1003 p = td->td_proc; 1004 1005 error = copyin(uap->sigcntxp, &uc, sizeof(uc)); 1006 if (error != 0) 1007 return (error); 1008 ucp = &uc; 1009 if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) { 1010 uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid, 1011 td->td_name, ucp->uc_mcontext.mc_flags); 1012 return (EINVAL); 1013 } 1014 regs = td->td_frame; 1015 eflags = ucp->uc_mcontext.mc_eflags; 1016 if (eflags & PSL_VM) { 1017 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; 1018 struct vm86_kernel *vm86; 1019 1020 /* 1021 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't 1022 * set up the vm86 area, and we can't enter vm86 mode. 
1023 */ 1024 if (td->td_pcb->pcb_ext == 0) 1025 return (EINVAL); 1026 vm86 = &td->td_pcb->pcb_ext->ext_vm86; 1027 if (vm86->vm86_inited == 0) 1028 return (EINVAL); 1029 1030 /* Go back to user mode if both flags are set. */ 1031 if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) { 1032 ksiginfo_init_trap(&ksi); 1033 ksi.ksi_signo = SIGBUS; 1034 ksi.ksi_code = BUS_OBJERR; 1035 ksi.ksi_addr = (void *)regs->tf_eip; 1036 trapsignal(td, &ksi); 1037 } 1038 1039 if (vm86->vm86_has_vme) { 1040 eflags = (tf->tf_eflags & ~VME_USERCHANGE) | 1041 (eflags & VME_USERCHANGE) | PSL_VM; 1042 } else { 1043 vm86->vm86_eflags = eflags; /* save VIF, VIP */ 1044 eflags = (tf->tf_eflags & ~VM_USERCHANGE) | 1045 (eflags & VM_USERCHANGE) | PSL_VM; 1046 } 1047 bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe)); 1048 tf->tf_eflags = eflags; 1049 tf->tf_vm86_ds = tf->tf_ds; 1050 tf->tf_vm86_es = tf->tf_es; 1051 tf->tf_vm86_fs = tf->tf_fs; 1052 tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs; 1053 tf->tf_ds = _udatasel; 1054 tf->tf_es = _udatasel; 1055 tf->tf_fs = _udatasel; 1056 } else { 1057 /* 1058 * Don't allow users to change privileged or reserved flags. 1059 */ 1060 if (!EFL_SECURE(eflags, regs->tf_eflags)) { 1061 uprintf("pid %d (%s): sigreturn eflags = 0x%x\n", 1062 td->td_proc->p_pid, td->td_name, eflags); 1063 return (EINVAL); 1064 } 1065 1066 /* 1067 * Don't allow users to load a valid privileged %cs. Let the 1068 * hardware check for invalid selectors, excess privilege in 1069 * other selectors, invalid %eip's and invalid %esp's. 
1070 */ 1071 cs = ucp->uc_mcontext.mc_cs; 1072 if (!CS_SECURE(cs)) { 1073 uprintf("pid %d (%s): sigreturn cs = 0x%x\n", 1074 td->td_proc->p_pid, td->td_name, cs); 1075 ksiginfo_init_trap(&ksi); 1076 ksi.ksi_signo = SIGBUS; 1077 ksi.ksi_code = BUS_OBJERR; 1078 ksi.ksi_trapno = T_PROTFLT; 1079 ksi.ksi_addr = (void *)regs->tf_eip; 1080 trapsignal(td, &ksi); 1081 return (EINVAL); 1082 } 1083 1084 if ((uc.uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) { 1085 xfpustate_len = uc.uc_mcontext.mc_xfpustate_len; 1086 if (xfpustate_len > cpu_max_ext_state_size - 1087 sizeof(union savefpu)) { 1088 uprintf( 1089 "pid %d (%s): sigreturn xfpusave_len = 0x%zx\n", 1090 p->p_pid, td->td_name, xfpustate_len); 1091 return (EINVAL); 1092 } 1093 xfpustate = __builtin_alloca(xfpustate_len); 1094 error = copyin((const void *)uc.uc_mcontext.mc_xfpustate, 1095 xfpustate, xfpustate_len); 1096 if (error != 0) { 1097 uprintf( 1098 "pid %d (%s): sigreturn copying xfpustate failed\n", 1099 p->p_pid, td->td_name); 1100 return (error); 1101 } 1102 } else { 1103 xfpustate = NULL; 1104 xfpustate_len = 0; 1105 } 1106 ret = set_fpcontext(td, &ucp->uc_mcontext, xfpustate, 1107 xfpustate_len); 1108 if (ret != 0) 1109 return (ret); 1110 bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs)); 1111 } 1112 1113#if defined(COMPAT_43) 1114 if (ucp->uc_mcontext.mc_onstack & 1) 1115 td->td_sigstk.ss_flags |= SS_ONSTACK; 1116 else 1117 td->td_sigstk.ss_flags &= ~SS_ONSTACK; 1118#endif 1119 1120 kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0); 1121 return (EJUSTRETURN); 1122} 1123 1124/* 1125 * Reset registers to default values on exec. 1126 */ 1127void 1128exec_setregs(struct thread *td, struct image_params *imgp, u_long stack) 1129{ 1130 struct trapframe *regs; 1131 struct pcb *pcb; 1132 register_t saved_eflags; 1133 1134 regs = td->td_frame; 1135 pcb = td->td_pcb; 1136 1137 /* Reset pc->pcb_gs and %gs before possibly invalidating it. 
*/ 1138 pcb->pcb_gs = _udatasel; 1139 load_gs(_udatasel); 1140 1141 mtx_lock_spin(&dt_lock); 1142 if (td->td_proc->p_md.md_ldt) 1143 user_ldt_free(td); 1144 else 1145 mtx_unlock_spin(&dt_lock); 1146 1147 /* 1148 * Reset the fs and gs bases. The values from the old address 1149 * space do not make sense for the new program. In particular, 1150 * gsbase might be the TLS base for the old program but the new 1151 * program has no TLS now. 1152 */ 1153 set_fsbase(td, 0); 1154 set_gsbase(td, 0); 1155 1156 saved_eflags = regs->tf_eflags & PSL_T; 1157 bzero((char *)regs, sizeof(struct trapframe)); 1158 regs->tf_eip = imgp->entry_addr; 1159 regs->tf_esp = stack; 1160 regs->tf_eflags = PSL_USER | saved_eflags; 1161 regs->tf_ss = _udatasel; 1162 regs->tf_ds = _udatasel; 1163 regs->tf_es = _udatasel; 1164 regs->tf_fs = _udatasel; 1165 regs->tf_cs = _ucodesel; 1166 1167 /* PS_STRINGS value for BSD/OS binaries. It is 0 for non-BSD/OS. */ 1168 regs->tf_ebx = imgp->ps_strings; 1169 1170 /* 1171 * Reset the hardware debug registers if they were in use. 1172 * They won't have any meaning for the newly exec'd process. 1173 */ 1174 if (pcb->pcb_flags & PCB_DBREGS) { 1175 pcb->pcb_dr0 = 0; 1176 pcb->pcb_dr1 = 0; 1177 pcb->pcb_dr2 = 0; 1178 pcb->pcb_dr3 = 0; 1179 pcb->pcb_dr6 = 0; 1180 pcb->pcb_dr7 = 0; 1181 if (pcb == curpcb) { 1182 /* 1183 * Clear the debug registers on the running 1184 * CPU, otherwise they will end up affecting 1185 * the next process we switch to. 1186 */ 1187 reset_dbregs(); 1188 } 1189 pcb->pcb_flags &= ~PCB_DBREGS; 1190 } 1191 1192 pcb->pcb_initial_npxcw = __INITIAL_NPXCW__; 1193 1194 /* 1195 * Drop the FP state if we hold it, so that the process gets a 1196 * clean FP state if it uses the FPU again. 1197 */ 1198 fpstate_drop(td); 1199 1200 /* 1201 * XXX - Linux emulator 1202 * Make sure sure edx is 0x0 on entry. Linux binaries depend 1203 * on it. 
1204 */ 1205 td->td_retval[1] = 0; 1206} 1207 1208void 1209cpu_setregs(void) 1210{ 1211 unsigned int cr0; 1212 1213 cr0 = rcr0(); 1214 1215 /* 1216 * CR0_MP, CR0_NE and CR0_TS are set for NPX (FPU) support: 1217 * 1218 * Prepare to trap all ESC (i.e., NPX) instructions and all WAIT 1219 * instructions. We must set the CR0_MP bit and use the CR0_TS 1220 * bit to control the trap, because setting the CR0_EM bit does 1221 * not cause WAIT instructions to trap. It's important to trap 1222 * WAIT instructions - otherwise the "wait" variants of no-wait 1223 * control instructions would degenerate to the "no-wait" variants 1224 * after FP context switches but work correctly otherwise. It's 1225 * particularly important to trap WAITs when there is no NPX - 1226 * otherwise the "wait" variants would always degenerate. 1227 * 1228 * Try setting CR0_NE to get correct error reporting on 486DX's. 1229 * Setting it should fail or do nothing on lesser processors. 1230 */ 1231 cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM; 1232 load_cr0(cr0); 1233 load_gs(_udatasel); 1234} 1235 1236u_long bootdev; /* not a struct cdev *- encoding is different */ 1237SYSCTL_ULONG(_machdep, OID_AUTO, guessed_bootdev, 1238 CTLFLAG_RD, &bootdev, 0, "Maybe the Boot device (not in struct cdev *format)"); 1239 1240static char bootmethod[16] = "BIOS"; 1241SYSCTL_STRING(_machdep, OID_AUTO, bootmethod, CTLFLAG_RD, bootmethod, 0, 1242 "System firmware boot method"); 1243 1244/* 1245 * Initialize 386 and configure to run kernel 1246 */ 1247 1248/* 1249 * Initialize segments & interrupt table 1250 */ 1251 1252int _default_ldt; 1253 1254union descriptor gdt[NGDT * MAXCPU]; /* global descriptor table */ 1255union descriptor ldt[NLDT]; /* local descriptor table */ 1256static struct gate_descriptor idt0[NIDT]; 1257struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */ 1258struct region_descriptor r_gdt, r_idt; /* table descriptors */ 1259struct mtx dt_lock; /* lock for GDT and LDT */ 1260 
1261static struct i386tss dblfault_tss; 1262static char dblfault_stack[PAGE_SIZE]; 1263 1264extern vm_offset_t proc0kstack; 1265 1266 1267/* 1268 * software prototypes -- in more palatable form. 1269 * 1270 * GCODE_SEL through GUDATA_SEL must be in this order for syscall/sysret 1271 * GUFS_SEL and GUGS_SEL must be in this order (swtch.s knows it) 1272 */ 1273struct soft_segment_descriptor gdt_segs[] = { 1274/* GNULL_SEL 0 Null Descriptor */ 1275{ .ssd_base = 0x0, 1276 .ssd_limit = 0x0, 1277 .ssd_type = 0, 1278 .ssd_dpl = SEL_KPL, 1279 .ssd_p = 0, 1280 .ssd_xx = 0, .ssd_xx1 = 0, 1281 .ssd_def32 = 0, 1282 .ssd_gran = 0 }, 1283/* GPRIV_SEL 1 SMP Per-Processor Private Data Descriptor */ 1284{ .ssd_base = 0x0, 1285 .ssd_limit = 0xfffff, 1286 .ssd_type = SDT_MEMRWA, 1287 .ssd_dpl = SEL_KPL, 1288 .ssd_p = 1, 1289 .ssd_xx = 0, .ssd_xx1 = 0, 1290 .ssd_def32 = 1, 1291 .ssd_gran = 1 }, 1292/* GUFS_SEL 2 %fs Descriptor for user */ 1293{ .ssd_base = 0x0, 1294 .ssd_limit = 0xfffff, 1295 .ssd_type = SDT_MEMRWA, 1296 .ssd_dpl = SEL_UPL, 1297 .ssd_p = 1, 1298 .ssd_xx = 0, .ssd_xx1 = 0, 1299 .ssd_def32 = 1, 1300 .ssd_gran = 1 }, 1301/* GUGS_SEL 3 %gs Descriptor for user */ 1302{ .ssd_base = 0x0, 1303 .ssd_limit = 0xfffff, 1304 .ssd_type = SDT_MEMRWA, 1305 .ssd_dpl = SEL_UPL, 1306 .ssd_p = 1, 1307 .ssd_xx = 0, .ssd_xx1 = 0, 1308 .ssd_def32 = 1, 1309 .ssd_gran = 1 }, 1310/* GCODE_SEL 4 Code Descriptor for kernel */ 1311{ .ssd_base = 0x0, 1312 .ssd_limit = 0xfffff, 1313 .ssd_type = SDT_MEMERA, 1314 .ssd_dpl = SEL_KPL, 1315 .ssd_p = 1, 1316 .ssd_xx = 0, .ssd_xx1 = 0, 1317 .ssd_def32 = 1, 1318 .ssd_gran = 1 }, 1319/* GDATA_SEL 5 Data Descriptor for kernel */ 1320{ .ssd_base = 0x0, 1321 .ssd_limit = 0xfffff, 1322 .ssd_type = SDT_MEMRWA, 1323 .ssd_dpl = SEL_KPL, 1324 .ssd_p = 1, 1325 .ssd_xx = 0, .ssd_xx1 = 0, 1326 .ssd_def32 = 1, 1327 .ssd_gran = 1 }, 1328/* GUCODE_SEL 6 Code Descriptor for user */ 1329{ .ssd_base = 0x0, 1330 .ssd_limit = 0xfffff, 1331 .ssd_type = SDT_MEMERA, 1332 
.ssd_dpl = SEL_UPL, 1333 .ssd_p = 1, 1334 .ssd_xx = 0, .ssd_xx1 = 0, 1335 .ssd_def32 = 1, 1336 .ssd_gran = 1 }, 1337/* GUDATA_SEL 7 Data Descriptor for user */ 1338{ .ssd_base = 0x0, 1339 .ssd_limit = 0xfffff, 1340 .ssd_type = SDT_MEMRWA, 1341 .ssd_dpl = SEL_UPL, 1342 .ssd_p = 1, 1343 .ssd_xx = 0, .ssd_xx1 = 0, 1344 .ssd_def32 = 1, 1345 .ssd_gran = 1 }, 1346/* GBIOSLOWMEM_SEL 8 BIOS access to realmode segment 0x40, must be #8 in GDT */ 1347{ .ssd_base = 0x400, 1348 .ssd_limit = 0xfffff, 1349 .ssd_type = SDT_MEMRWA, 1350 .ssd_dpl = SEL_KPL, 1351 .ssd_p = 1, 1352 .ssd_xx = 0, .ssd_xx1 = 0, 1353 .ssd_def32 = 1, 1354 .ssd_gran = 1 }, 1355/* GPROC0_SEL 9 Proc 0 Tss Descriptor */ 1356{ 1357 .ssd_base = 0x0, 1358 .ssd_limit = sizeof(struct i386tss)-1, 1359 .ssd_type = SDT_SYS386TSS, 1360 .ssd_dpl = 0, 1361 .ssd_p = 1, 1362 .ssd_xx = 0, .ssd_xx1 = 0, 1363 .ssd_def32 = 0, 1364 .ssd_gran = 0 }, 1365/* GLDT_SEL 10 LDT Descriptor */ 1366{ .ssd_base = (int) ldt, 1367 .ssd_limit = sizeof(ldt)-1, 1368 .ssd_type = SDT_SYSLDT, 1369 .ssd_dpl = SEL_UPL, 1370 .ssd_p = 1, 1371 .ssd_xx = 0, .ssd_xx1 = 0, 1372 .ssd_def32 = 0, 1373 .ssd_gran = 0 }, 1374/* GUSERLDT_SEL 11 User LDT Descriptor per process */ 1375{ .ssd_base = (int) ldt, 1376 .ssd_limit = (512 * sizeof(union descriptor)-1), 1377 .ssd_type = SDT_SYSLDT, 1378 .ssd_dpl = 0, 1379 .ssd_p = 1, 1380 .ssd_xx = 0, .ssd_xx1 = 0, 1381 .ssd_def32 = 0, 1382 .ssd_gran = 0 }, 1383/* GPANIC_SEL 12 Panic Tss Descriptor */ 1384{ .ssd_base = (int) &dblfault_tss, 1385 .ssd_limit = sizeof(struct i386tss)-1, 1386 .ssd_type = SDT_SYS386TSS, 1387 .ssd_dpl = 0, 1388 .ssd_p = 1, 1389 .ssd_xx = 0, .ssd_xx1 = 0, 1390 .ssd_def32 = 0, 1391 .ssd_gran = 0 }, 1392/* GBIOSCODE32_SEL 13 BIOS 32-bit interface (32bit Code) */ 1393{ .ssd_base = 0, 1394 .ssd_limit = 0xfffff, 1395 .ssd_type = SDT_MEMERA, 1396 .ssd_dpl = 0, 1397 .ssd_p = 1, 1398 .ssd_xx = 0, .ssd_xx1 = 0, 1399 .ssd_def32 = 0, 1400 .ssd_gran = 1 }, 1401/* GBIOSCODE16_SEL 14 BIOS 32-bit interface 
(16bit Code) */ 1402{ .ssd_base = 0, 1403 .ssd_limit = 0xfffff, 1404 .ssd_type = SDT_MEMERA, 1405 .ssd_dpl = 0, 1406 .ssd_p = 1, 1407 .ssd_xx = 0, .ssd_xx1 = 0, 1408 .ssd_def32 = 0, 1409 .ssd_gran = 1 }, 1410/* GBIOSDATA_SEL 15 BIOS 32-bit interface (Data) */ 1411{ .ssd_base = 0, 1412 .ssd_limit = 0xfffff, 1413 .ssd_type = SDT_MEMRWA, 1414 .ssd_dpl = 0, 1415 .ssd_p = 1, 1416 .ssd_xx = 0, .ssd_xx1 = 0, 1417 .ssd_def32 = 1, 1418 .ssd_gran = 1 }, 1419/* GBIOSUTIL_SEL 16 BIOS 16-bit interface (Utility) */ 1420{ .ssd_base = 0, 1421 .ssd_limit = 0xfffff, 1422 .ssd_type = SDT_MEMRWA, 1423 .ssd_dpl = 0, 1424 .ssd_p = 1, 1425 .ssd_xx = 0, .ssd_xx1 = 0, 1426 .ssd_def32 = 0, 1427 .ssd_gran = 1 }, 1428/* GBIOSARGS_SEL 17 BIOS 16-bit interface (Arguments) */ 1429{ .ssd_base = 0, 1430 .ssd_limit = 0xfffff, 1431 .ssd_type = SDT_MEMRWA, 1432 .ssd_dpl = 0, 1433 .ssd_p = 1, 1434 .ssd_xx = 0, .ssd_xx1 = 0, 1435 .ssd_def32 = 0, 1436 .ssd_gran = 1 }, 1437/* GNDIS_SEL 18 NDIS Descriptor */ 1438{ .ssd_base = 0x0, 1439 .ssd_limit = 0x0, 1440 .ssd_type = 0, 1441 .ssd_dpl = 0, 1442 .ssd_p = 0, 1443 .ssd_xx = 0, .ssd_xx1 = 0, 1444 .ssd_def32 = 0, 1445 .ssd_gran = 0 }, 1446}; 1447 1448static struct soft_segment_descriptor ldt_segs[] = { 1449 /* Null Descriptor - overwritten by call gate */ 1450{ .ssd_base = 0x0, 1451 .ssd_limit = 0x0, 1452 .ssd_type = 0, 1453 .ssd_dpl = 0, 1454 .ssd_p = 0, 1455 .ssd_xx = 0, .ssd_xx1 = 0, 1456 .ssd_def32 = 0, 1457 .ssd_gran = 0 }, 1458 /* Null Descriptor - overwritten by call gate */ 1459{ .ssd_base = 0x0, 1460 .ssd_limit = 0x0, 1461 .ssd_type = 0, 1462 .ssd_dpl = 0, 1463 .ssd_p = 0, 1464 .ssd_xx = 0, .ssd_xx1 = 0, 1465 .ssd_def32 = 0, 1466 .ssd_gran = 0 }, 1467 /* Null Descriptor - overwritten by call gate */ 1468{ .ssd_base = 0x0, 1469 .ssd_limit = 0x0, 1470 .ssd_type = 0, 1471 .ssd_dpl = 0, 1472 .ssd_p = 0, 1473 .ssd_xx = 0, .ssd_xx1 = 0, 1474 .ssd_def32 = 0, 1475 .ssd_gran = 0 }, 1476 /* Code Descriptor for user */ 1477{ .ssd_base = 0x0, 1478 .ssd_limit = 
0xfffff, 1479 .ssd_type = SDT_MEMERA, 1480 .ssd_dpl = SEL_UPL, 1481 .ssd_p = 1, 1482 .ssd_xx = 0, .ssd_xx1 = 0, 1483 .ssd_def32 = 1, 1484 .ssd_gran = 1 }, 1485 /* Null Descriptor - overwritten by call gate */ 1486{ .ssd_base = 0x0, 1487 .ssd_limit = 0x0, 1488 .ssd_type = 0, 1489 .ssd_dpl = 0, 1490 .ssd_p = 0, 1491 .ssd_xx = 0, .ssd_xx1 = 0, 1492 .ssd_def32 = 0, 1493 .ssd_gran = 0 }, 1494 /* Data Descriptor for user */ 1495{ .ssd_base = 0x0, 1496 .ssd_limit = 0xfffff, 1497 .ssd_type = SDT_MEMRWA, 1498 .ssd_dpl = SEL_UPL, 1499 .ssd_p = 1, 1500 .ssd_xx = 0, .ssd_xx1 = 0, 1501 .ssd_def32 = 1, 1502 .ssd_gran = 1 }, 1503}; 1504 1505void 1506setidt(idx, func, typ, dpl, selec) 1507 int idx; 1508 inthand_t *func; 1509 int typ; 1510 int dpl; 1511 int selec; 1512{ 1513 struct gate_descriptor *ip; 1514 1515 ip = idt + idx; 1516 ip->gd_looffset = (int)func; 1517 ip->gd_selector = selec; 1518 ip->gd_stkcpy = 0; 1519 ip->gd_xx = 0; 1520 ip->gd_type = typ; 1521 ip->gd_dpl = dpl; 1522 ip->gd_p = 1; 1523 ip->gd_hioffset = ((int)func)>>16 ; 1524} 1525 1526extern inthand_t 1527 IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl), 1528 IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm), 1529 IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot), 1530 IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align), 1531 IDTVEC(xmm), 1532#ifdef KDTRACE_HOOKS 1533 IDTVEC(dtrace_ret), 1534#endif 1535#ifdef XENHVM 1536 IDTVEC(xen_intr_upcall), 1537#endif 1538 IDTVEC(lcall_syscall), IDTVEC(int0x80_syscall); 1539 1540#ifdef DDB 1541/* 1542 * Display the index and function name of any IDT entries that don't use 1543 * the default 'rsvd' entry point. 
1544 */ 1545DB_SHOW_COMMAND(idt, db_show_idt) 1546{ 1547 struct gate_descriptor *ip; 1548 int idx; 1549 uintptr_t func; 1550 1551 ip = idt; 1552 for (idx = 0; idx < NIDT && !db_pager_quit; idx++) { 1553 func = (ip->gd_hioffset << 16 | ip->gd_looffset); 1554 if (func != (uintptr_t)&IDTVEC(rsvd)) { 1555 db_printf("%3d\t", idx); 1556 db_printsym(func, DB_STGY_PROC); 1557 db_printf("\n"); 1558 } 1559 ip++; 1560 } 1561} 1562 1563/* Show privileged registers. */ 1564DB_SHOW_COMMAND(sysregs, db_show_sysregs) 1565{ 1566 uint64_t idtr, gdtr; 1567 1568 idtr = ridt(); 1569 db_printf("idtr\t0x%08x/%04x\n", 1570 (u_int)(idtr >> 16), (u_int)idtr & 0xffff); 1571 gdtr = rgdt(); 1572 db_printf("gdtr\t0x%08x/%04x\n", 1573 (u_int)(gdtr >> 16), (u_int)gdtr & 0xffff); 1574 db_printf("ldtr\t0x%04x\n", rldt()); 1575 db_printf("tr\t0x%04x\n", rtr()); 1576 db_printf("cr0\t0x%08x\n", rcr0()); 1577 db_printf("cr2\t0x%08x\n", rcr2()); 1578 db_printf("cr3\t0x%08x\n", rcr3()); 1579 db_printf("cr4\t0x%08x\n", rcr4()); 1580 if (rcr4() & CR4_XSAVE) 1581 db_printf("xcr0\t0x%016llx\n", rxcr(0)); 1582 if (amd_feature & (AMDID_NX | AMDID_LM)) 1583 db_printf("EFER\t0x%016llx\n", rdmsr(MSR_EFER)); 1584 if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX)) 1585 db_printf("FEATURES_CTL\t0x%016llx\n", 1586 rdmsr(MSR_IA32_FEATURE_CONTROL)); 1587 if ((cpu_vendor_id == CPU_VENDOR_INTEL || 1588 cpu_vendor_id == CPU_VENDOR_AMD) && CPUID_TO_FAMILY(cpu_id) >= 6) 1589 db_printf("DEBUG_CTL\t0x%016llx\n", rdmsr(MSR_DEBUGCTLMSR)); 1590 if (cpu_feature & CPUID_PAT) 1591 db_printf("PAT\t0x%016llx\n", rdmsr(MSR_PAT)); 1592} 1593 1594DB_SHOW_COMMAND(dbregs, db_show_dbregs) 1595{ 1596 1597 db_printf("dr0\t0x%08x\n", rdr0()); 1598 db_printf("dr1\t0x%08x\n", rdr1()); 1599 db_printf("dr2\t0x%08x\n", rdr2()); 1600 db_printf("dr3\t0x%08x\n", rdr3()); 1601 db_printf("dr6\t0x%08x\n", rdr6()); 1602 db_printf("dr7\t0x%08x\n", rdr7()); 1603} 1604#endif 1605 1606void 1607sdtossd(sd, ssd) 1608 struct segment_descriptor *sd; 1609 struct 
soft_segment_descriptor *ssd; 1610{ 1611 ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase; 1612 ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit; 1613 ssd->ssd_type = sd->sd_type; 1614 ssd->ssd_dpl = sd->sd_dpl; 1615 ssd->ssd_p = sd->sd_p; 1616 ssd->ssd_def32 = sd->sd_def32; 1617 ssd->ssd_gran = sd->sd_gran; 1618} 1619 1620#if !defined(PC98) 1621static int 1622add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap, 1623 int *physmap_idxp) 1624{ 1625 int i, insert_idx, physmap_idx; 1626 1627 physmap_idx = *physmap_idxp; 1628 1629 if (length == 0) 1630 return (1); 1631 1632#ifndef PAE 1633 if (base > 0xffffffff) { 1634 printf("%uK of memory above 4GB ignored\n", 1635 (u_int)(length / 1024)); 1636 return (1); 1637 } 1638#endif 1639 1640 /* 1641 * Find insertion point while checking for overlap. Start off by 1642 * assuming the new entry will be added to the end. 1643 */ 1644 insert_idx = physmap_idx + 2; 1645 for (i = 0; i <= physmap_idx; i += 2) { 1646 if (base < physmap[i + 1]) { 1647 if (base + length <= physmap[i]) { 1648 insert_idx = i; 1649 break; 1650 } 1651 if (boothowto & RB_VERBOSE) 1652 printf( 1653 "Overlapping memory regions, ignoring second region\n"); 1654 return (1); 1655 } 1656 } 1657 1658 /* See if we can prepend to the next entry. */ 1659 if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) { 1660 physmap[insert_idx] = base; 1661 return (1); 1662 } 1663 1664 /* See if we can append to the previous entry. */ 1665 if (insert_idx > 0 && base == physmap[insert_idx - 1]) { 1666 physmap[insert_idx - 1] += length; 1667 return (1); 1668 } 1669 1670 physmap_idx += 2; 1671 *physmap_idxp = physmap_idx; 1672 if (physmap_idx == PHYSMAP_SIZE) { 1673 printf( 1674 "Too many segments in the physical address map, giving up\n"); 1675 return (0); 1676 } 1677 1678 /* 1679 * Move the last 'N' entries down to make room for the new 1680 * entry if needed. 
1681 */ 1682 for (i = physmap_idx; i > insert_idx; i -= 2) { 1683 physmap[i] = physmap[i - 2]; 1684 physmap[i + 1] = physmap[i - 1]; 1685 } 1686 1687 /* Insert the new entry. */ 1688 physmap[insert_idx] = base; 1689 physmap[insert_idx + 1] = base + length; 1690 return (1); 1691} 1692 1693static int 1694add_smap_entry(struct bios_smap *smap, vm_paddr_t *physmap, int *physmap_idxp) 1695{ 1696 if (boothowto & RB_VERBOSE) 1697 printf("SMAP type=%02x base=%016llx len=%016llx\n", 1698 smap->type, smap->base, smap->length); 1699 1700 if (smap->type != SMAP_TYPE_MEMORY) 1701 return (1); 1702 1703 return (add_physmap_entry(smap->base, smap->length, physmap, 1704 physmap_idxp)); 1705} 1706 1707static void 1708add_smap_entries(struct bios_smap *smapbase, vm_paddr_t *physmap, 1709 int *physmap_idxp) 1710{ 1711 struct bios_smap *smap, *smapend; 1712 u_int32_t smapsize; 1713 /* 1714 * Memory map from INT 15:E820. 1715 * 1716 * subr_module.c says: 1717 * "Consumer may safely assume that size value precedes data." 1718 * ie: an int32_t immediately precedes SMAP. 1719 */ 1720 smapsize = *((u_int32_t *)smapbase - 1); 1721 smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize); 1722 1723 for (smap = smapbase; smap < smapend; smap++) 1724 if (!add_smap_entry(smap, physmap, physmap_idxp)) 1725 break; 1726} 1727#endif /* !PC98 */ 1728 1729static void 1730basemem_setup(void) 1731{ 1732 vm_paddr_t pa; 1733 pt_entry_t *pte; 1734 int i; 1735 1736 if (basemem > 640) { 1737 printf("Preposterous BIOS basemem of %uK, truncating to 640K\n", 1738 basemem); 1739 basemem = 640; 1740 } 1741 1742 /* 1743 * XXX if biosbasemem is now < 640, there is a `hole' 1744 * between the end of base memory and the start of 1745 * ISA memory. The hole may be empty or it may 1746 * contain BIOS code or data. Map it read/write so 1747 * that the BIOS can write to it. (Memory from 0 to 1748 * the physical end of the kernel is mapped read-only 1749 * to begin with and then parts of it are remapped. 
1750 * The parts that aren't remapped form holes that 1751 * remain read-only and are unused by the kernel. 1752 * The base memory area is below the physical end of 1753 * the kernel and right now forms a read-only hole. 1754 * The part of it from PAGE_SIZE to 1755 * (trunc_page(biosbasemem * 1024) - 1) will be 1756 * remapped and used by the kernel later.) 1757 * 1758 * This code is similar to the code used in 1759 * pmap_mapdev, but since no memory needs to be 1760 * allocated we simply change the mapping. 1761 */ 1762 for (pa = trunc_page(basemem * 1024); 1763 pa < ISA_HOLE_START; pa += PAGE_SIZE) 1764 pmap_kenter(KERNBASE + pa, pa); 1765 1766 /* 1767 * Map pages between basemem and ISA_HOLE_START, if any, r/w into 1768 * the vm86 page table so that vm86 can scribble on them using 1769 * the vm86 map too. XXX: why 2 ways for this and only 1 way for 1770 * page 0, at least as initialized here? 1771 */ 1772 pte = (pt_entry_t *)vm86paddr; 1773 for (i = basemem / 4; i < 160; i++) 1774 pte[i] = (i << PAGE_SHIFT) | PG_V | PG_RW | PG_U; 1775} 1776 1777/* 1778 * Populate the (physmap) array with base/bound pairs describing the 1779 * available physical memory in the system, then test this memory and 1780 * build the phys_avail array describing the actually-available memory. 1781 * 1782 * If we cannot accurately determine the physical memory map, then use 1783 * value from the 0xE801 call, and failing that, the RTC. 1784 * 1785 * Total memory size may be set by the kernel environment variable 1786 * hw.physmem or the compile-time define MAXMEM. 1787 * 1788 * XXX first should be vm_paddr_t. 
1789 */ 1790#ifdef PC98 1791static void 1792getmemsize(int first) 1793{ 1794 int off, physmap_idx, pa_indx, da_indx; 1795 u_long physmem_tunable, memtest; 1796 vm_paddr_t physmap[PHYSMAP_SIZE]; 1797 pt_entry_t *pte; 1798 quad_t dcons_addr, dcons_size; 1799 int i; 1800 int pg_n; 1801 u_int extmem; 1802 u_int under16; 1803 vm_paddr_t pa; 1804 1805 bzero(physmap, sizeof(physmap)); 1806 1807 /* XXX - some of EPSON machines can't use PG_N */ 1808 pg_n = PG_N; 1809 if (pc98_machine_type & M_EPSON_PC98) { 1810 switch (epson_machine_id) { 1811#ifdef WB_CACHE 1812 default: 1813#endif 1814 case EPSON_PC486_HX: 1815 case EPSON_PC486_HG: 1816 case EPSON_PC486_HA: 1817 pg_n = 0; 1818 break; 1819 } 1820 } 1821 1822 under16 = pc98_getmemsize(&basemem, &extmem); 1823 basemem_setup(); 1824 1825 physmap[0] = 0; 1826 physmap[1] = basemem * 1024; 1827 physmap_idx = 2; 1828 physmap[physmap_idx] = 0x100000; 1829 physmap[physmap_idx + 1] = physmap[physmap_idx] + extmem * 1024; 1830 1831 /* 1832 * Now, physmap contains a map of physical memory. 1833 */ 1834 1835#ifdef SMP 1836 /* make hole for AP bootstrap code */ 1837 physmap[1] = mp_bootaddress(physmap[1]); 1838#endif 1839 1840 /* 1841 * Maxmem isn't the "maximum memory", it's one larger than the 1842 * highest page of the physical address space. It should be 1843 * called something like "Maxphyspage". We may adjust this 1844 * based on ``hw.physmem'' and the results of the memory test. 1845 */ 1846 Maxmem = atop(physmap[physmap_idx + 1]); 1847 1848#ifdef MAXMEM 1849 Maxmem = MAXMEM / 4; 1850#endif 1851 1852 if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable)) 1853 Maxmem = atop(physmem_tunable); 1854 1855 /* 1856 * By default keep the memtest enabled. Use a general name so that 1857 * one could eventually do more with the code than just disable it. 
1858 */ 1859 memtest = 1; 1860 TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest); 1861 1862 if (atop(physmap[physmap_idx + 1]) != Maxmem && 1863 (boothowto & RB_VERBOSE)) 1864 printf("Physical memory use set to %ldK\n", Maxmem * 4); 1865 1866 /* 1867 * If Maxmem has been increased beyond what the system has detected, 1868 * extend the last memory segment to the new limit. 1869 */ 1870 if (atop(physmap[physmap_idx + 1]) < Maxmem) 1871 physmap[physmap_idx + 1] = ptoa((vm_paddr_t)Maxmem); 1872 1873 /* 1874 * We need to divide chunk if Maxmem is larger than 16MB and 1875 * under 16MB area is not full of memory. 1876 * (1) system area (15-16MB region) is cut off 1877 * (2) extended memory is only over 16MB area (ex. Melco "HYPERMEMORY") 1878 */ 1879 if ((under16 != 16 * 1024) && (extmem > 15 * 1024)) { 1880 /* 15M - 16M region is cut off, so need to divide chunk */ 1881 physmap[physmap_idx + 1] = under16 * 1024; 1882 physmap_idx += 2; 1883 physmap[physmap_idx] = 0x1000000; 1884 physmap[physmap_idx + 1] = physmap[2] + extmem * 1024; 1885 } 1886 1887 /* call pmap initialization to make new kernel address space */ 1888 pmap_bootstrap(first); 1889 1890 /* 1891 * Size up each available chunk of physical memory. 1892 */ 1893 physmap[0] = PAGE_SIZE; /* mask off page 0 */ 1894 pa_indx = 0; 1895 da_indx = 1; 1896 phys_avail[pa_indx++] = physmap[0]; 1897 phys_avail[pa_indx] = physmap[0]; 1898 dump_avail[da_indx] = physmap[0]; 1899 pte = CMAP3; 1900 1901 /* 1902 * Get dcons buffer address 1903 */ 1904 if (getenv_quad("dcons.addr", &dcons_addr) == 0 || 1905 getenv_quad("dcons.size", &dcons_size) == 0) 1906 dcons_addr = 0; 1907 1908 /* 1909 * physmap is in bytes, so when converting to page boundaries, 1910 * round up the start address and round down the end address. 
1911 */ 1912 for (i = 0; i <= physmap_idx; i += 2) { 1913 vm_paddr_t end; 1914 1915 end = ptoa((vm_paddr_t)Maxmem); 1916 if (physmap[i + 1] < end) 1917 end = trunc_page(physmap[i + 1]); 1918 for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) { 1919 int tmp, page_bad, full; 1920 int *ptr = (int *)CADDR3; 1921 1922 full = FALSE; 1923 /* 1924 * block out kernel memory as not available. 1925 */ 1926 if (pa >= KERNLOAD && pa < first) 1927 goto do_dump_avail; 1928 1929 /* 1930 * block out dcons buffer 1931 */ 1932 if (dcons_addr > 0 1933 && pa >= trunc_page(dcons_addr) 1934 && pa < dcons_addr + dcons_size) 1935 goto do_dump_avail; 1936 1937 page_bad = FALSE; 1938 if (memtest == 0) 1939 goto skip_memtest; 1940 1941 /* 1942 * map page into kernel: valid, read/write,non-cacheable 1943 */ 1944 *pte = pa | PG_V | PG_RW | pg_n; 1945 invltlb(); 1946 1947 tmp = *(int *)ptr; 1948 /* 1949 * Test for alternating 1's and 0's 1950 */ 1951 *(volatile int *)ptr = 0xaaaaaaaa; 1952 if (*(volatile int *)ptr != 0xaaaaaaaa) 1953 page_bad = TRUE; 1954 /* 1955 * Test for alternating 0's and 1's 1956 */ 1957 *(volatile int *)ptr = 0x55555555; 1958 if (*(volatile int *)ptr != 0x55555555) 1959 page_bad = TRUE; 1960 /* 1961 * Test for all 1's 1962 */ 1963 *(volatile int *)ptr = 0xffffffff; 1964 if (*(volatile int *)ptr != 0xffffffff) 1965 page_bad = TRUE; 1966 /* 1967 * Test for all 0's 1968 */ 1969 *(volatile int *)ptr = 0x0; 1970 if (*(volatile int *)ptr != 0x0) 1971 page_bad = TRUE; 1972 /* 1973 * Restore original value. 1974 */ 1975 *(int *)ptr = tmp; 1976 1977skip_memtest: 1978 /* 1979 * Adjust array of valid/good pages. 1980 */ 1981 if (page_bad == TRUE) 1982 continue; 1983 /* 1984 * If this good page is a continuation of the 1985 * previous set of good pages, then just increase 1986 * the end pointer. Otherwise start a new chunk. 1987 * Note that "end" points one higher than end, 1988 * making the range >= start and < end. 
1989 * If we're also doing a speculative memory 1990 * test and we at or past the end, bump up Maxmem 1991 * so that we keep going. The first bad page 1992 * will terminate the loop. 1993 */ 1994 if (phys_avail[pa_indx] == pa) { 1995 phys_avail[pa_indx] += PAGE_SIZE; 1996 } else { 1997 pa_indx++; 1998 if (pa_indx == PHYS_AVAIL_ARRAY_END) { 1999 printf( 2000 "Too many holes in the physical address space, giving up\n"); 2001 pa_indx--; 2002 full = TRUE; 2003 goto do_dump_avail; 2004 } 2005 phys_avail[pa_indx++] = pa; /* start */ 2006 phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */ 2007 } 2008 physmem++; 2009do_dump_avail: 2010 if (dump_avail[da_indx] == pa) { 2011 dump_avail[da_indx] += PAGE_SIZE; 2012 } else { 2013 da_indx++; 2014 if (da_indx == DUMP_AVAIL_ARRAY_END) { 2015 da_indx--; 2016 goto do_next; 2017 } 2018 dump_avail[da_indx++] = pa; /* start */ 2019 dump_avail[da_indx] = pa + PAGE_SIZE; /* end */ 2020 } 2021do_next: 2022 if (full) 2023 break; 2024 } 2025 } 2026 *pte = 0; 2027 invltlb(); 2028 2029 /* 2030 * XXX 2031 * The last chunk must contain at least one page plus the message 2032 * buffer to avoid complicating other code (message buffer address 2033 * calculation, etc.). 2034 */ 2035 while (phys_avail[pa_indx - 1] + PAGE_SIZE + 2036 round_page(msgbufsize) >= phys_avail[pa_indx]) { 2037 physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]); 2038 phys_avail[pa_indx--] = 0; 2039 phys_avail[pa_indx--] = 0; 2040 } 2041 2042 Maxmem = atop(phys_avail[pa_indx]); 2043 2044 /* Trim off space for the message buffer. */ 2045 phys_avail[pa_indx] -= round_page(msgbufsize); 2046 2047 /* Map the message buffer. 
*/ 2048 for (off = 0; off < round_page(msgbufsize); off += PAGE_SIZE) 2049 pmap_kenter((vm_offset_t)msgbufp + off, phys_avail[pa_indx] + 2050 off); 2051} 2052#else /* PC98 */ 2053static void 2054getmemsize(int first) 2055{ 2056 int has_smap, off, physmap_idx, pa_indx, da_indx; 2057 u_long memtest; 2058 vm_paddr_t physmap[PHYSMAP_SIZE]; 2059 pt_entry_t *pte; 2060 quad_t dcons_addr, dcons_size, physmem_tunable; 2061 int hasbrokenint12, i, res; 2062 u_int extmem; 2063 struct vm86frame vmf; 2064 struct vm86context vmc; 2065 vm_paddr_t pa; 2066 struct bios_smap *smap, *smapbase; 2067 caddr_t kmdp; 2068 2069 has_smap = 0; 2070#ifdef XBOX 2071 if (arch_i386_is_xbox) { 2072 /* 2073 * We queried the memory size before, so chop off 4MB for 2074 * the framebuffer and inform the OS of this. 2075 */ 2076 physmap[0] = 0; 2077 physmap[1] = (arch_i386_xbox_memsize * 1024 * 1024) - XBOX_FB_SIZE; 2078 physmap_idx = 0; 2079 goto physmap_done; 2080 } 2081#endif 2082 bzero(&vmf, sizeof(vmf)); 2083 bzero(physmap, sizeof(physmap)); 2084 basemem = 0; 2085 2086 /* 2087 * Tell the physical memory allocator about pages used to store 2088 * the kernel and preloaded data. See kmem_bootstrap_free(). 2089 */ 2090 vm_phys_add_seg((vm_paddr_t)KERNLOAD, trunc_page(first)); 2091 2092 /* 2093 * Check if the loader supplied an SMAP memory map. If so, 2094 * use that and do not make any VM86 calls. 2095 */ 2096 physmap_idx = 0; 2097 kmdp = preload_search_by_type("elf kernel"); 2098 if (kmdp == NULL) 2099 kmdp = preload_search_by_type("elf32 kernel"); 2100 smapbase = (struct bios_smap *)preload_search_info(kmdp, 2101 MODINFO_METADATA | MODINFOMD_SMAP); 2102 if (smapbase != NULL) { 2103 add_smap_entries(smapbase, physmap, &physmap_idx); 2104 has_smap = 1; 2105 goto have_smap; 2106 } 2107 2108 /* 2109 * Some newer BIOSes have a broken INT 12H implementation 2110 * which causes a kernel panic immediately. In this case, we 2111 * need use the SMAP to determine the base memory size. 
2112 */ 2113 hasbrokenint12 = 0; 2114 TUNABLE_INT_FETCH("hw.hasbrokenint12", &hasbrokenint12); 2115 if (hasbrokenint12 == 0) { 2116 /* Use INT12 to determine base memory size. */ 2117 vm86_intcall(0x12, &vmf); 2118 basemem = vmf.vmf_ax; 2119 basemem_setup(); 2120 } 2121 2122 /* 2123 * Fetch the memory map with INT 15:E820. Map page 1 R/W into 2124 * the kernel page table so we can use it as a buffer. The 2125 * kernel will unmap this page later. 2126 */ 2127 pmap_kenter(KERNBASE + (1 << PAGE_SHIFT), 1 << PAGE_SHIFT); 2128 vmc.npages = 0; 2129 smap = (void *)vm86_addpage(&vmc, 1, KERNBASE + (1 << PAGE_SHIFT)); 2130 res = vm86_getptr(&vmc, (vm_offset_t)smap, &vmf.vmf_es, &vmf.vmf_di); 2131 KASSERT(res != 0, ("vm86_getptr() failed: address not found")); 2132 2133 vmf.vmf_ebx = 0; 2134 do { 2135 vmf.vmf_eax = 0xE820; 2136 vmf.vmf_edx = SMAP_SIG; 2137 vmf.vmf_ecx = sizeof(struct bios_smap); 2138 i = vm86_datacall(0x15, &vmf, &vmc); 2139 if (i || vmf.vmf_eax != SMAP_SIG) 2140 break; 2141 has_smap = 1; 2142 if (!add_smap_entry(smap, physmap, &physmap_idx)) 2143 break; 2144 } while (vmf.vmf_ebx != 0); 2145 2146have_smap: 2147 /* 2148 * If we didn't fetch the "base memory" size from INT12, 2149 * figure it out from the SMAP (or just guess). 2150 */ 2151 if (basemem == 0) { 2152 for (i = 0; i <= physmap_idx; i += 2) { 2153 if (physmap[i] == 0x00000000) { 2154 basemem = physmap[i + 1] / 1024; 2155 break; 2156 } 2157 } 2158 2159 /* XXX: If we couldn't find basemem from SMAP, just guess. */ 2160 if (basemem == 0) 2161 basemem = 640; 2162 basemem_setup(); 2163 } 2164 2165 if (physmap[1] != 0) 2166 goto physmap_done; 2167 2168 /* 2169 * If we failed to find an SMAP, figure out the extended 2170 * memory size. We will then build a simple memory map with 2171 * two segments, one for "base memory" and the second for 2172 * "extended memory". Note that "extended memory" starts at a 2173 * physical address of 1MB and that both basemem and extmem 2174 * are in units of 1KB. 
2175 * 2176 * First, try to fetch the extended memory size via INT 15:E801. 2177 */ 2178 vmf.vmf_ax = 0xE801; 2179 if (vm86_intcall(0x15, &vmf) == 0) { 2180 extmem = vmf.vmf_cx + vmf.vmf_dx * 64; 2181 } else { 2182 /* 2183 * If INT15:E801 fails, this is our last ditch effort 2184 * to determine the extended memory size. Currently 2185 * we prefer the RTC value over INT15:88. 2186 */ 2187#if 0 2188 vmf.vmf_ah = 0x88; 2189 vm86_intcall(0x15, &vmf); 2190 extmem = vmf.vmf_ax; 2191#else 2192 extmem = rtcin(RTC_EXTLO) + (rtcin(RTC_EXTHI) << 8); 2193#endif 2194 } 2195 2196 /* 2197 * Special hack for chipsets that still remap the 384k hole when 2198 * there's 16MB of memory - this really confuses people that 2199 * are trying to use bus mastering ISA controllers with the 2200 * "16MB limit"; they only have 16MB, but the remapping puts 2201 * them beyond the limit. 2202 * 2203 * If extended memory is between 15-16MB (16-17MB phys address range), 2204 * chop it to 15MB. 2205 */ 2206 if ((extmem > 15 * 1024) && (extmem < 16 * 1024)) 2207 extmem = 15 * 1024; 2208 2209 physmap[0] = 0; 2210 physmap[1] = basemem * 1024; 2211 physmap_idx = 2; 2212 physmap[physmap_idx] = 0x100000; 2213 physmap[physmap_idx + 1] = physmap[physmap_idx] + extmem * 1024; 2214 2215physmap_done: 2216 /* 2217 * Now, physmap contains a map of physical memory. 2218 */ 2219 2220#ifdef SMP 2221 /* make hole for AP bootstrap code */ 2222 physmap[1] = mp_bootaddress(physmap[1]); 2223#endif 2224 2225 /* 2226 * Maxmem isn't the "maximum memory", it's one larger than the 2227 * highest page of the physical address space. It should be 2228 * called something like "Maxphyspage". We may adjust this 2229 * based on ``hw.physmem'' and the results of the memory test. 2230 * 2231 * This is especially confusing when it is much larger than the 2232 * memory size and is displayed as "realmem". 
2233 */ 2234 Maxmem = atop(physmap[physmap_idx + 1]); 2235 2236#ifdef MAXMEM 2237 Maxmem = MAXMEM / 4; 2238#endif 2239 2240 if (TUNABLE_QUAD_FETCH("hw.physmem", &physmem_tunable)) 2241 Maxmem = atop(physmem_tunable); 2242 2243 /* 2244 * If we have an SMAP, don't allow MAXMEM or hw.physmem to extend 2245 * the amount of memory in the system. 2246 */ 2247 if (has_smap && Maxmem > atop(physmap[physmap_idx + 1])) 2248 Maxmem = atop(physmap[physmap_idx + 1]); 2249 2250 /* 2251 * By default enable the memory test on real hardware, and disable 2252 * it if we appear to be running in a VM. This avoids touching all 2253 * pages unnecessarily, which doesn't matter on real hardware but is 2254 * bad for shared VM hosts. Use a general name so that 2255 * one could eventually do more with the code than just disable it. 2256 */ 2257 memtest = (vm_guest > VM_GUEST_NO) ? 0 : 1; 2258 TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest); 2259 2260 if (atop(physmap[physmap_idx + 1]) != Maxmem && 2261 (boothowto & RB_VERBOSE)) 2262 printf("Physical memory use set to %ldK\n", Maxmem * 4); 2263 2264 /* 2265 * If Maxmem has been increased beyond what the system has detected, 2266 * extend the last memory segment to the new limit. 2267 */ 2268 if (atop(physmap[physmap_idx + 1]) < Maxmem) 2269 physmap[physmap_idx + 1] = ptoa((vm_paddr_t)Maxmem); 2270 2271 /* call pmap initialization to make new kernel address space */ 2272 pmap_bootstrap(first); 2273 2274 /* 2275 * Size up each available chunk of physical memory. 
2276 */ 2277 physmap[0] = PAGE_SIZE; /* mask off page 0 */ 2278 pa_indx = 0; 2279 da_indx = 1; 2280 phys_avail[pa_indx++] = physmap[0]; 2281 phys_avail[pa_indx] = physmap[0]; 2282 dump_avail[da_indx] = physmap[0]; 2283 pte = CMAP3; 2284 2285 /* 2286 * Get dcons buffer address 2287 */ 2288 if (getenv_quad("dcons.addr", &dcons_addr) == 0 || 2289 getenv_quad("dcons.size", &dcons_size) == 0) 2290 dcons_addr = 0; 2291 2292 /* 2293 * physmap is in bytes, so when converting to page boundaries, 2294 * round up the start address and round down the end address. 2295 */ 2296 for (i = 0; i <= physmap_idx; i += 2) { 2297 vm_paddr_t end; 2298 2299 end = ptoa((vm_paddr_t)Maxmem); 2300 if (physmap[i + 1] < end) 2301 end = trunc_page(physmap[i + 1]); 2302 for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) { 2303 int tmp, page_bad, full; 2304 int *ptr = (int *)CADDR3; 2305 2306 full = FALSE; 2307 /* 2308 * block out kernel memory as not available. 2309 */ 2310 if (pa >= KERNLOAD && pa < first) 2311 goto do_dump_avail; 2312 2313 /* 2314 * block out dcons buffer 2315 */ 2316 if (dcons_addr > 0 2317 && pa >= trunc_page(dcons_addr) 2318 && pa < dcons_addr + dcons_size) 2319 goto do_dump_avail; 2320 2321 page_bad = FALSE; 2322 if (memtest == 0) 2323 goto skip_memtest; 2324 2325 /* 2326 * map page into kernel: valid, read/write,non-cacheable 2327 */ 2328 *pte = pa | PG_V | PG_RW | PG_N; 2329 invltlb(); 2330 2331 tmp = *(int *)ptr; 2332 /* 2333 * Test for alternating 1's and 0's 2334 */ 2335 *(volatile int *)ptr = 0xaaaaaaaa; 2336 if (*(volatile int *)ptr != 0xaaaaaaaa) 2337 page_bad = TRUE; 2338 /* 2339 * Test for alternating 0's and 1's 2340 */ 2341 *(volatile int *)ptr = 0x55555555; 2342 if (*(volatile int *)ptr != 0x55555555) 2343 page_bad = TRUE; 2344 /* 2345 * Test for all 1's 2346 */ 2347 *(volatile int *)ptr = 0xffffffff; 2348 if (*(volatile int *)ptr != 0xffffffff) 2349 page_bad = TRUE; 2350 /* 2351 * Test for all 0's 2352 */ 2353 *(volatile int *)ptr = 0x0; 2354 if 
(*(volatile int *)ptr != 0x0) 2355 page_bad = TRUE; 2356 /* 2357 * Restore original value. 2358 */ 2359 *(int *)ptr = tmp; 2360 2361skip_memtest: 2362 /* 2363 * Adjust array of valid/good pages. 2364 */ 2365 if (page_bad == TRUE) 2366 continue; 2367 /* 2368 * If this good page is a continuation of the 2369 * previous set of good pages, then just increase 2370 * the end pointer. Otherwise start a new chunk. 2371 * Note that "end" points one higher than end, 2372 * making the range >= start and < end. 2373 * If we're also doing a speculative memory 2374 * test and we at or past the end, bump up Maxmem 2375 * so that we keep going. The first bad page 2376 * will terminate the loop. 2377 */ 2378 if (phys_avail[pa_indx] == pa) { 2379 phys_avail[pa_indx] += PAGE_SIZE; 2380 } else { 2381 pa_indx++; 2382 if (pa_indx == PHYS_AVAIL_ARRAY_END) { 2383 printf( 2384 "Too many holes in the physical address space, giving up\n"); 2385 pa_indx--; 2386 full = TRUE; 2387 goto do_dump_avail; 2388 } 2389 phys_avail[pa_indx++] = pa; /* start */ 2390 phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */ 2391 } 2392 physmem++; 2393do_dump_avail: 2394 if (dump_avail[da_indx] == pa) { 2395 dump_avail[da_indx] += PAGE_SIZE; 2396 } else { 2397 da_indx++; 2398 if (da_indx == DUMP_AVAIL_ARRAY_END) { 2399 da_indx--; 2400 goto do_next; 2401 } 2402 dump_avail[da_indx++] = pa; /* start */ 2403 dump_avail[da_indx] = pa + PAGE_SIZE; /* end */ 2404 } 2405do_next: 2406 if (full) 2407 break; 2408 } 2409 } 2410 *pte = 0; 2411 invltlb(); 2412 2413 /* 2414 * XXX 2415 * The last chunk must contain at least one page plus the message 2416 * buffer to avoid complicating other code (message buffer address 2417 * calculation, etc.). 
2418 */ 2419 while (phys_avail[pa_indx - 1] + PAGE_SIZE + 2420 round_page(msgbufsize) >= phys_avail[pa_indx]) { 2421 physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]); 2422 phys_avail[pa_indx--] = 0; 2423 phys_avail[pa_indx--] = 0; 2424 } 2425 2426 Maxmem = atop(phys_avail[pa_indx]); 2427 2428 /* Trim off space for the message buffer. */ 2429 phys_avail[pa_indx] -= round_page(msgbufsize); 2430 2431 /* Map the message buffer. */ 2432 for (off = 0; off < round_page(msgbufsize); off += PAGE_SIZE) 2433 pmap_kenter((vm_offset_t)msgbufp + off, phys_avail[pa_indx] + 2434 off); 2435} 2436#endif /* PC98 */ 2437 2438static void 2439i386_kdb_init(void) 2440{ 2441#ifdef DDB 2442 db_fetch_ksymtab(bootinfo.bi_symtab, bootinfo.bi_esymtab); 2443#endif 2444 kdb_init(); 2445#ifdef KDB 2446 if (boothowto & RB_KDB) 2447 kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger"); 2448#endif 2449} 2450 2451register_t 2452init386(int first) 2453{ 2454 struct gate_descriptor *gdp; 2455 int gsel_tss, metadata_missing, x, pa; 2456 struct pcpu *pc; 2457 struct xstate_hdr *xhdr; 2458 caddr_t kmdp; 2459 size_t ucode_len; 2460 int late_console; 2461 2462 thread0.td_kstack = proc0kstack; 2463 thread0.td_kstack_pages = TD0_KSTACK_PAGES; 2464 2465 /* 2466 * This may be done better later if it gets more high level 2467 * components in it. If so just link td->td_proc here. 2468 */ 2469 proc_linkup0(&proc0, &thread0); 2470 2471#ifdef PC98 2472 /* 2473 * Initialize DMAC 2474 */ 2475 pc98_init_dmac(); 2476#endif 2477 2478 metadata_missing = 0; 2479 if (bootinfo.bi_modulep) { 2480 preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE; 2481 preload_bootstrap_relocate(KERNBASE); 2482 } else { 2483 metadata_missing = 1; 2484 } 2485 2486 if (bootinfo.bi_envp != 0) 2487 init_static_kenv((char *)bootinfo.bi_envp + KERNBASE, 0); 2488 else 2489 init_static_kenv(NULL, 0); 2490 2491 /* 2492 * Re-evaluate CPU features if we loaded a microcode update. 
2493 */ 2494 ucode_len = ucode_load_bsp(first); 2495 if (ucode_len != 0) { 2496 identify_cpu(); 2497 first = roundup2(first + ucode_len, PAGE_SIZE); 2498 } 2499 2500 identify_hypervisor(); 2501 2502 /* Init basic tunables, hz etc */ 2503 init_param1(); 2504 2505 /* 2506 * Make gdt memory segments. All segments cover the full 4GB 2507 * of address space and permissions are enforced at page level. 2508 */ 2509 gdt_segs[GCODE_SEL].ssd_limit = atop(0 - 1); 2510 gdt_segs[GDATA_SEL].ssd_limit = atop(0 - 1); 2511 gdt_segs[GUCODE_SEL].ssd_limit = atop(0 - 1); 2512 gdt_segs[GUDATA_SEL].ssd_limit = atop(0 - 1); 2513 gdt_segs[GUFS_SEL].ssd_limit = atop(0 - 1); 2514 gdt_segs[GUGS_SEL].ssd_limit = atop(0 - 1); 2515 2516 pc = &__pcpu[0]; 2517 gdt_segs[GPRIV_SEL].ssd_limit = atop(0 - 1); 2518 gdt_segs[GPRIV_SEL].ssd_base = (int) pc; 2519 gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss; 2520 2521 for (x = 0; x < NGDT; x++) 2522 ssdtosd(&gdt_segs[x], &gdt[x].sd); 2523 2524 r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1; 2525 r_gdt.rd_base = (int) gdt; 2526 mtx_init(&dt_lock, "descriptor tables", NULL, MTX_SPIN); 2527 lgdt(&r_gdt); 2528 2529 pcpu_init(pc, 0, sizeof(struct pcpu)); 2530 for (pa = first; pa < first + DPCPU_SIZE; pa += PAGE_SIZE) 2531 pmap_kenter(pa + KERNBASE, pa); 2532 dpcpu_init((void *)(first + KERNBASE), 0); 2533 first += DPCPU_SIZE; 2534 PCPU_SET(prvspace, pc); 2535 PCPU_SET(curthread, &thread0); 2536 /* Non-late cninit() and printf() can be moved up to here. */ 2537 2538 /* 2539 * Initialize mutexes. 2540 * 2541 * icu_lock: in order to allow an interrupt to occur in a critical 2542 * section, to set pcpu->ipending (etc...) properly, we 2543 * must be able to get the icu lock, so it can't be 2544 * under witness. 
2545 */ 2546 mutex_init(); 2547 mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS | MTX_NOPROFILE); 2548 2549 /* make ldt memory segments */ 2550 ldt_segs[LUCODE_SEL].ssd_limit = atop(0 - 1); 2551 ldt_segs[LUDATA_SEL].ssd_limit = atop(0 - 1); 2552 for (x = 0; x < nitems(ldt_segs); x++) 2553 ssdtosd(&ldt_segs[x], &ldt[x].sd); 2554 2555 _default_ldt = GSEL(GLDT_SEL, SEL_KPL); 2556 lldt(_default_ldt); 2557 PCPU_SET(currentldt, _default_ldt); 2558 2559 /* exceptions */ 2560 for (x = 0; x < NIDT; x++) 2561 setidt(x, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL, 2562 GSEL(GCODE_SEL, SEL_KPL)); 2563 setidt(IDT_DE, &IDTVEC(div), SDT_SYS386TGT, SEL_KPL, 2564 GSEL(GCODE_SEL, SEL_KPL)); 2565 setidt(IDT_DB, &IDTVEC(dbg), SDT_SYS386IGT, SEL_KPL, 2566 GSEL(GCODE_SEL, SEL_KPL)); 2567 setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYS386IGT, SEL_KPL, 2568 GSEL(GCODE_SEL, SEL_KPL)); 2569 setidt(IDT_BP, &IDTVEC(bpt), SDT_SYS386IGT, SEL_UPL, 2570 GSEL(GCODE_SEL, SEL_KPL)); 2571 setidt(IDT_OF, &IDTVEC(ofl), SDT_SYS386TGT, SEL_UPL, 2572 GSEL(GCODE_SEL, SEL_KPL)); 2573 setidt(IDT_BR, &IDTVEC(bnd), SDT_SYS386TGT, SEL_KPL, 2574 GSEL(GCODE_SEL, SEL_KPL)); 2575 setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, 2576 GSEL(GCODE_SEL, SEL_KPL)); 2577 setidt(IDT_NM, &IDTVEC(dna), SDT_SYS386TGT, SEL_KPL 2578 , GSEL(GCODE_SEL, SEL_KPL)); 2579 setidt(IDT_DF, 0, SDT_SYSTASKGT, SEL_KPL, GSEL(GPANIC_SEL, SEL_KPL)); 2580 setidt(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYS386TGT, SEL_KPL, 2581 GSEL(GCODE_SEL, SEL_KPL)); 2582 setidt(IDT_TS, &IDTVEC(tss), SDT_SYS386TGT, SEL_KPL, 2583 GSEL(GCODE_SEL, SEL_KPL)); 2584 setidt(IDT_NP, &IDTVEC(missing), SDT_SYS386TGT, SEL_KPL, 2585 GSEL(GCODE_SEL, SEL_KPL)); 2586 setidt(IDT_SS, &IDTVEC(stk), SDT_SYS386TGT, SEL_KPL, 2587 GSEL(GCODE_SEL, SEL_KPL)); 2588 setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, 2589 GSEL(GCODE_SEL, SEL_KPL)); 2590 setidt(IDT_PF, &IDTVEC(page), SDT_SYS386IGT, SEL_KPL, 2591 GSEL(GCODE_SEL, SEL_KPL)); 2592 setidt(IDT_MF, &IDTVEC(fpu), SDT_SYS386TGT, 
SEL_KPL, 2593 GSEL(GCODE_SEL, SEL_KPL)); 2594 setidt(IDT_AC, &IDTVEC(align), SDT_SYS386TGT, SEL_KPL, 2595 GSEL(GCODE_SEL, SEL_KPL)); 2596 setidt(IDT_MC, &IDTVEC(mchk), SDT_SYS386TGT, SEL_KPL, 2597 GSEL(GCODE_SEL, SEL_KPL)); 2598 setidt(IDT_XF, &IDTVEC(xmm), SDT_SYS386TGT, SEL_KPL, 2599 GSEL(GCODE_SEL, SEL_KPL)); 2600 setidt(IDT_SYSCALL, &IDTVEC(int0x80_syscall), SDT_SYS386TGT, SEL_UPL, 2601 GSEL(GCODE_SEL, SEL_KPL)); 2602#ifdef KDTRACE_HOOKS 2603 setidt(IDT_DTRACE_RET, &IDTVEC(dtrace_ret), SDT_SYS386TGT, SEL_UPL, 2604 GSEL(GCODE_SEL, SEL_KPL)); 2605#endif 2606#ifdef XENHVM 2607 setidt(IDT_EVTCHN, &IDTVEC(xen_intr_upcall), SDT_SYS386IGT, SEL_KPL, 2608 GSEL(GCODE_SEL, SEL_KPL)); 2609#endif 2610 2611 r_idt.rd_limit = sizeof(idt0) - 1; 2612 r_idt.rd_base = (int) idt; 2613 lidt(&r_idt); 2614 2615#ifdef XBOX 2616 /* 2617 * The following code queries the PCI ID of 0:0:0. For the XBOX, 2618 * This should be 0x10de / 0x02a5. 2619 * 2620 * This is exactly what Linux does. 2621 */ 2622 outl(0xcf8, 0x80000000); 2623 if (inl(0xcfc) == 0x02a510de) { 2624 arch_i386_is_xbox = 1; 2625 pic16l_setled(XBOX_LED_GREEN); 2626 2627 /* 2628 * We are an XBOX, but we may have either 64MB or 128MB of 2629 * memory. The PCI host bridge should be programmed for this, 2630 * so we just query it. 2631 */ 2632 outl(0xcf8, 0x80000084); 2633 arch_i386_xbox_memsize = (inl(0xcfc) == 0x7FFFFFF) ? 128 : 64; 2634 } 2635#endif /* XBOX */ 2636 2637 /* 2638 * Initialize the clock before the console so that console 2639 * initialization can use DELAY(). 
2640 */ 2641 clock_init(); 2642 2643 finishidentcpu(); /* Final stage of CPU initialization */ 2644 setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, 2645 GSEL(GCODE_SEL, SEL_KPL)); 2646 setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, 2647 GSEL(GCODE_SEL, SEL_KPL)); 2648 initializecpu(); /* Initialize CPU registers */ 2649 initializecpucache(); 2650 2651 /* pointer to selector slot for %fs/%gs */ 2652 PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd); 2653 2654 dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 = 2655 dblfault_tss.tss_esp2 = (int)&dblfault_stack[sizeof(dblfault_stack)]; 2656 dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 = 2657 dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL); 2658#if defined(PAE) || defined(PAE_TABLES) 2659 dblfault_tss.tss_cr3 = (int)IdlePDPT; 2660#else 2661 dblfault_tss.tss_cr3 = (int)IdlePTD; 2662#endif 2663 dblfault_tss.tss_eip = (int)dblfault_handler; 2664 dblfault_tss.tss_eflags = PSL_KERNEL; 2665 dblfault_tss.tss_ds = dblfault_tss.tss_es = 2666 dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL); 2667 dblfault_tss.tss_fs = GSEL(GPRIV_SEL, SEL_KPL); 2668 dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL); 2669 dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL); 2670 2671 /* Initialize the tss (except for the final esp0) early for vm86. */ 2672 PCPU_SET(common_tss.tss_esp0, thread0.td_kstack + 2673 thread0.td_kstack_pages * PAGE_SIZE - 16); 2674 PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL)); 2675 gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); 2676 PCPU_SET(tss_gdt, &gdt[GPROC0_SEL].sd); 2677 PCPU_SET(common_tssd, *PCPU_GET(tss_gdt)); 2678 PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16); 2679 ltr(gsel_tss); 2680 2681 /* Initialize the PIC early for vm86 calls. */ 2682#ifdef DEV_ISA 2683#ifdef DEV_ATPIC 2684#ifndef PC98 2685 elcr_probe(); 2686#endif 2687 atpic_startup(); 2688#else 2689 /* Reset and mask the atpics and leave them shut down. 
*/ 2690 atpic_reset(); 2691 2692 /* 2693 * Point the ICU spurious interrupt vectors at the APIC spurious 2694 * interrupt handler. 2695 */ 2696 setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL, 2697 GSEL(GCODE_SEL, SEL_KPL)); 2698 setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL, 2699 GSEL(GCODE_SEL, SEL_KPL)); 2700#endif 2701#endif 2702 2703 /* 2704 * The console and kdb should be initialized even earlier than here, 2705 * but some console drivers don't work until after getmemsize(). 2706 * Default to late console initialization to support these drivers. 2707 * This loses mainly printf()s in getmemsize() and early debugging. 2708 */ 2709 late_console = 1; 2710 TUNABLE_INT_FETCH("debug.late_console", &late_console); 2711 if (!late_console) { 2712 cninit(); 2713 i386_kdb_init(); 2714 } 2715 2716 kmdp = preload_search_by_type("elf kernel"); 2717 link_elf_ireloc(kmdp); 2718 2719 vm86_initialize(); 2720 getmemsize(first); 2721 init_param2(physmem); 2722 2723 /* now running on new page tables, configured,and u/iom is accessible */ 2724 2725 if (late_console) 2726 cninit(); 2727 2728 if (metadata_missing) 2729 printf("WARNING: loader(8) metadata is missing!\n"); 2730 2731 if (late_console) 2732 i386_kdb_init(); 2733 2734 msgbufinit(msgbufp, msgbufsize); 2735 npxinit(true); 2736 /* 2737 * Set up thread0 pcb after npxinit calculated pcb + fpu save 2738 * area size. Zero out the extended state header in fpu save 2739 * area. 2740 */ 2741 thread0.td_pcb = get_pcb_td(&thread0); 2742 thread0.td_pcb->pcb_save = get_pcb_user_save_td(&thread0); 2743 bzero(get_pcb_user_save_td(&thread0), cpu_max_ext_state_size); 2744 if (use_xsave) { 2745 xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) + 2746 1); 2747 xhdr->xstate_bv = xsave_mask; 2748 } 2749 PCPU_SET(curpcb, thread0.td_pcb); 2750 /* Move esp0 in the tss to its final place. 
*/ 2751 /* Note: -16 is so we can grow the trapframe if we came from vm86 */ 2752 PCPU_SET(common_tss.tss_esp0, (vm_offset_t)thread0.td_pcb - 16); 2753 gdt[GPROC0_SEL].sd.sd_type = SDT_SYS386TSS; /* clear busy bit */ 2754 ltr(gsel_tss); 2755 2756 /* make a call gate to reenter kernel with */ 2757 gdp = &ldt[LSYS5CALLS_SEL].gd; 2758 2759 x = (int) &IDTVEC(lcall_syscall); 2760 gdp->gd_looffset = x; 2761 gdp->gd_selector = GSEL(GCODE_SEL,SEL_KPL); 2762 gdp->gd_stkcpy = 1; 2763 gdp->gd_type = SDT_SYS386CGT; 2764 gdp->gd_dpl = SEL_UPL; 2765 gdp->gd_p = 1; 2766 gdp->gd_hioffset = x >> 16; 2767 2768 /* XXX does this work? */ 2769 /* XXX yes! */ 2770 ldt[LBSDICALLS_SEL] = ldt[LSYS5CALLS_SEL]; 2771 ldt[LSOL26CALLS_SEL] = ldt[LSYS5CALLS_SEL]; 2772 2773 /* transfer to user mode */ 2774 2775 _ucodesel = GSEL(GUCODE_SEL, SEL_UPL); 2776 _udatasel = GSEL(GUDATA_SEL, SEL_UPL); 2777 2778 /* setup proc 0's pcb */ 2779 thread0.td_pcb->pcb_flags = 0; 2780#if defined(PAE) || defined(PAE_TABLES) 2781 thread0.td_pcb->pcb_cr3 = (int)IdlePDPT; 2782#else 2783 thread0.td_pcb->pcb_cr3 = (int)IdlePTD; 2784#endif 2785 thread0.td_pcb->pcb_ext = 0; 2786 thread0.td_frame = &proc0_tf; 2787 2788 cpu_probe_amdc1e(); 2789 2790#ifdef FDT 2791 x86_init_fdt(); 2792#endif 2793 2794 /* Location of kernel stack for locore */ 2795 return ((register_t)thread0.td_pcb); 2796} 2797 2798void 2799cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size) 2800{ 2801 2802 pcpu->pc_acpi_id = 0xffffffff; 2803} 2804 2805#ifndef PC98 2806static int 2807smap_sysctl_handler(SYSCTL_HANDLER_ARGS) 2808{ 2809 struct bios_smap *smapbase; 2810 struct bios_smap_xattr smap; 2811 caddr_t kmdp; 2812 uint32_t *smapattr; 2813 int count, error, i; 2814 2815 /* Retrieve the system memory map from the loader. 
*/ 2816 kmdp = preload_search_by_type("elf kernel"); 2817 if (kmdp == NULL) 2818 kmdp = preload_search_by_type("elf32 kernel"); 2819 smapbase = (struct bios_smap *)preload_search_info(kmdp, 2820 MODINFO_METADATA | MODINFOMD_SMAP); 2821 if (smapbase == NULL) 2822 return (0); 2823 smapattr = (uint32_t *)preload_search_info(kmdp, 2824 MODINFO_METADATA | MODINFOMD_SMAP_XATTR); 2825 count = *((u_int32_t *)smapbase - 1) / sizeof(*smapbase); 2826 error = 0; 2827 for (i = 0; i < count; i++) { 2828 smap.base = smapbase[i].base; 2829 smap.length = smapbase[i].length; 2830 smap.type = smapbase[i].type; 2831 if (smapattr != NULL) 2832 smap.xattr = smapattr[i]; 2833 else 2834 smap.xattr = 0; 2835 error = SYSCTL_OUT(req, &smap, sizeof(smap)); 2836 } 2837 return (error); 2838} 2839SYSCTL_PROC(_machdep, OID_AUTO, smap, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0, 2840 smap_sysctl_handler, "S,bios_smap_xattr", "Raw BIOS SMAP data"); 2841#endif /* !PC98 */ 2842 2843void 2844spinlock_enter(void) 2845{ 2846 struct thread *td; 2847 register_t flags; 2848 2849 td = curthread; 2850 if (td->td_md.md_spinlock_count == 0) { 2851 flags = intr_disable(); 2852 td->td_md.md_spinlock_count = 1; 2853 td->td_md.md_saved_flags = flags; 2854 } else 2855 td->td_md.md_spinlock_count++; 2856 critical_enter(); 2857} 2858 2859void 2860spinlock_exit(void) 2861{ 2862 struct thread *td; 2863 register_t flags; 2864 2865 td = curthread; 2866 critical_exit(); 2867 flags = td->td_md.md_saved_flags; 2868 td->td_md.md_spinlock_count--; 2869 if (td->td_md.md_spinlock_count == 0) 2870 intr_restore(flags); 2871} 2872 2873#if defined(I586_CPU) && !defined(NO_F00F_HACK) 2874static void f00f_hack(void *unused); 2875SYSINIT(f00f_hack, SI_SUB_INTRINSIC, SI_ORDER_FIRST, f00f_hack, NULL); 2876 2877static void 2878f00f_hack(void *unused) 2879{ 2880 struct gate_descriptor *new_idt; 2881 vm_offset_t tmp; 2882 2883 if (!has_f00f_bug) 2884 return; 2885 2886 GIANT_REQUIRED; 2887 2888 printf("Intel Pentium detected, installing workaround 
for F00F bug\n"); 2889 2890 tmp = kmem_malloc(kernel_arena, PAGE_SIZE * 2, M_WAITOK | M_ZERO); 2891 if (tmp == 0) 2892 panic("kmem_malloc returned 0"); 2893 2894 /* Put the problematic entry (#6) at the end of the lower page. */ 2895 new_idt = (struct gate_descriptor*) 2896 (tmp + PAGE_SIZE - 7 * sizeof(struct gate_descriptor)); 2897 bcopy(idt, new_idt, sizeof(idt0)); 2898 r_idt.rd_base = (u_int)new_idt; 2899 lidt(&r_idt); 2900 idt = new_idt; 2901 pmap_protect(kernel_pmap, tmp, tmp + PAGE_SIZE, VM_PROT_READ); 2902} 2903#endif /* defined(I586_CPU) && !NO_F00F_HACK */ 2904 2905/* 2906 * Construct a PCB from a trapframe. This is called from kdb_trap() where 2907 * we want to start a backtrace from the function that caused us to enter 2908 * the debugger. We have the context in the trapframe, but base the trace 2909 * on the PCB. The PCB doesn't have to be perfect, as long as it contains 2910 * enough for a backtrace. 2911 */ 2912void 2913makectx(struct trapframe *tf, struct pcb *pcb) 2914{ 2915 2916 pcb->pcb_edi = tf->tf_edi; 2917 pcb->pcb_esi = tf->tf_esi; 2918 pcb->pcb_ebp = tf->tf_ebp; 2919 pcb->pcb_ebx = tf->tf_ebx; 2920 pcb->pcb_eip = tf->tf_eip; 2921 pcb->pcb_esp = (ISPL(tf->tf_cs)) ? 
tf->tf_esp : (int)(tf + 1) - 8; 2922 pcb->pcb_gs = rgs(); 2923} 2924 2925int 2926ptrace_set_pc(struct thread *td, u_long addr) 2927{ 2928 2929 td->td_frame->tf_eip = addr; 2930 return (0); 2931} 2932 2933int 2934ptrace_single_step(struct thread *td) 2935{ 2936 2937 PROC_LOCK_ASSERT(td->td_proc, MA_OWNED); 2938 if ((td->td_frame->tf_eflags & PSL_T) == 0) { 2939 td->td_frame->tf_eflags |= PSL_T; 2940 td->td_dbgflags |= TDB_STEP; 2941 } 2942 return (0); 2943} 2944 2945int 2946ptrace_clear_single_step(struct thread *td) 2947{ 2948 PROC_LOCK_ASSERT(td->td_proc, MA_OWNED); 2949 td->td_frame->tf_eflags &= ~PSL_T; 2950 td->td_dbgflags &= ~TDB_STEP; 2951 return (0); 2952} 2953 2954int 2955fill_regs(struct thread *td, struct reg *regs) 2956{ 2957 struct pcb *pcb; 2958 struct trapframe *tp; 2959 2960 tp = td->td_frame; 2961 pcb = td->td_pcb; 2962 regs->r_gs = pcb->pcb_gs; 2963 return (fill_frame_regs(tp, regs)); 2964} 2965 2966int 2967fill_frame_regs(struct trapframe *tp, struct reg *regs) 2968{ 2969 2970 regs->r_fs = tp->tf_fs; 2971 regs->r_es = tp->tf_es; 2972 regs->r_ds = tp->tf_ds; 2973 regs->r_edi = tp->tf_edi; 2974 regs->r_esi = tp->tf_esi; 2975 regs->r_ebp = tp->tf_ebp; 2976 regs->r_ebx = tp->tf_ebx; 2977 regs->r_edx = tp->tf_edx; 2978 regs->r_ecx = tp->tf_ecx; 2979 regs->r_eax = tp->tf_eax; 2980 regs->r_eip = tp->tf_eip; 2981 regs->r_cs = tp->tf_cs; 2982 regs->r_eflags = tp->tf_eflags; 2983 regs->r_esp = tp->tf_esp; 2984 regs->r_ss = tp->tf_ss; 2985 regs->r_err = 0; 2986 regs->r_trapno = 0; 2987 return (0); 2988} 2989 2990int 2991set_regs(struct thread *td, struct reg *regs) 2992{ 2993 struct pcb *pcb; 2994 struct trapframe *tp; 2995 2996 tp = td->td_frame; 2997 if (!EFL_SECURE(regs->r_eflags, tp->tf_eflags) || 2998 !CS_SECURE(regs->r_cs)) 2999 return (EINVAL); 3000 pcb = td->td_pcb; 3001 tp->tf_fs = regs->r_fs; 3002 tp->tf_es = regs->r_es; 3003 tp->tf_ds = regs->r_ds; 3004 tp->tf_edi = regs->r_edi; 3005 tp->tf_esi = regs->r_esi; 3006 tp->tf_ebp = regs->r_ebp; 3007 
tp->tf_ebx = regs->r_ebx; 3008 tp->tf_edx = regs->r_edx; 3009 tp->tf_ecx = regs->r_ecx; 3010 tp->tf_eax = regs->r_eax; 3011 tp->tf_eip = regs->r_eip; 3012 tp->tf_cs = regs->r_cs; 3013 tp->tf_eflags = regs->r_eflags; 3014 tp->tf_esp = regs->r_esp; 3015 tp->tf_ss = regs->r_ss; 3016 pcb->pcb_gs = regs->r_gs; 3017 return (0); 3018} 3019 3020int 3021fill_fpregs(struct thread *td, struct fpreg *fpregs) 3022{ 3023 3024 KASSERT(td == curthread || TD_IS_SUSPENDED(td) || 3025 P_SHOULDSTOP(td->td_proc), 3026 ("not suspended thread %p", td)); 3027 npxgetregs(td); 3028 if (cpu_fxsr) 3029 npx_fill_fpregs_xmm(&get_pcb_user_save_td(td)->sv_xmm, 3030 (struct save87 *)fpregs); 3031 else 3032 bcopy(&get_pcb_user_save_td(td)->sv_87, fpregs, 3033 sizeof(*fpregs)); 3034 return (0); 3035} 3036 3037int 3038set_fpregs(struct thread *td, struct fpreg *fpregs) 3039{ 3040 3041 critical_enter(); 3042 if (cpu_fxsr) 3043 npx_set_fpregs_xmm((struct save87 *)fpregs, 3044 &get_pcb_user_save_td(td)->sv_xmm); 3045 else 3046 bcopy(fpregs, &get_pcb_user_save_td(td)->sv_87, 3047 sizeof(*fpregs)); 3048 npxuserinited(td); 3049 critical_exit(); 3050 return (0); 3051} 3052 3053/* 3054 * Get machine context. 
3055 */ 3056int 3057get_mcontext(struct thread *td, mcontext_t *mcp, int flags) 3058{ 3059 struct trapframe *tp; 3060 struct segment_descriptor *sdp; 3061 3062 tp = td->td_frame; 3063 3064 PROC_LOCK(curthread->td_proc); 3065 mcp->mc_onstack = sigonstack(tp->tf_esp); 3066 PROC_UNLOCK(curthread->td_proc); 3067 mcp->mc_gs = td->td_pcb->pcb_gs; 3068 mcp->mc_fs = tp->tf_fs; 3069 mcp->mc_es = tp->tf_es; 3070 mcp->mc_ds = tp->tf_ds; 3071 mcp->mc_edi = tp->tf_edi; 3072 mcp->mc_esi = tp->tf_esi; 3073 mcp->mc_ebp = tp->tf_ebp; 3074 mcp->mc_isp = tp->tf_isp; 3075 mcp->mc_eflags = tp->tf_eflags; 3076 if (flags & GET_MC_CLEAR_RET) { 3077 mcp->mc_eax = 0; 3078 mcp->mc_edx = 0; 3079 mcp->mc_eflags &= ~PSL_C; 3080 } else { 3081 mcp->mc_eax = tp->tf_eax; 3082 mcp->mc_edx = tp->tf_edx; 3083 } 3084 mcp->mc_ebx = tp->tf_ebx; 3085 mcp->mc_ecx = tp->tf_ecx; 3086 mcp->mc_eip = tp->tf_eip; 3087 mcp->mc_cs = tp->tf_cs; 3088 mcp->mc_esp = tp->tf_esp; 3089 mcp->mc_ss = tp->tf_ss; 3090 mcp->mc_len = sizeof(*mcp); 3091 get_fpcontext(td, mcp, NULL, 0); 3092 sdp = &td->td_pcb->pcb_fsd; 3093 mcp->mc_fsbase = sdp->sd_hibase << 24 | sdp->sd_lobase; 3094 sdp = &td->td_pcb->pcb_gsd; 3095 mcp->mc_gsbase = sdp->sd_hibase << 24 | sdp->sd_lobase; 3096 mcp->mc_flags = 0; 3097 mcp->mc_xfpustate = 0; 3098 mcp->mc_xfpustate_len = 0; 3099 bzero(mcp->mc_spare2, sizeof(mcp->mc_spare2)); 3100 return (0); 3101} 3102 3103/* 3104 * Set machine context. 3105 * 3106 * However, we don't set any but the user modifiable flags, and we won't 3107 * touch the cs selector. 
3108 */ 3109int 3110set_mcontext(struct thread *td, mcontext_t *mcp) 3111{ 3112 struct trapframe *tp; 3113 char *xfpustate; 3114 int eflags, ret; 3115 3116 tp = td->td_frame; 3117 if (mcp->mc_len != sizeof(*mcp) || 3118 (mcp->mc_flags & ~_MC_FLAG_MASK) != 0) 3119 return (EINVAL); 3120 eflags = (mcp->mc_eflags & PSL_USERCHANGE) | 3121 (tp->tf_eflags & ~PSL_USERCHANGE); 3122 if (mcp->mc_flags & _MC_HASFPXSTATE) { 3123 if (mcp->mc_xfpustate_len > cpu_max_ext_state_size - 3124 sizeof(union savefpu)) 3125 return (EINVAL); 3126 xfpustate = __builtin_alloca(mcp->mc_xfpustate_len); 3127 ret = copyin((void *)mcp->mc_xfpustate, xfpustate, 3128 mcp->mc_xfpustate_len); 3129 if (ret != 0) 3130 return (ret); 3131 } else 3132 xfpustate = NULL; 3133 ret = set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len); 3134 if (ret != 0) 3135 return (ret); 3136 tp->tf_fs = mcp->mc_fs; 3137 tp->tf_es = mcp->mc_es; 3138 tp->tf_ds = mcp->mc_ds; 3139 tp->tf_edi = mcp->mc_edi; 3140 tp->tf_esi = mcp->mc_esi; 3141 tp->tf_ebp = mcp->mc_ebp; 3142 tp->tf_ebx = mcp->mc_ebx; 3143 tp->tf_edx = mcp->mc_edx; 3144 tp->tf_ecx = mcp->mc_ecx; 3145 tp->tf_eax = mcp->mc_eax; 3146 tp->tf_eip = mcp->mc_eip; 3147 tp->tf_eflags = eflags; 3148 tp->tf_esp = mcp->mc_esp; 3149 tp->tf_ss = mcp->mc_ss; 3150 td->td_pcb->pcb_gs = mcp->mc_gs; 3151 return (0); 3152} 3153 3154static void 3155get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave, 3156 size_t xfpusave_len) 3157{ 3158 size_t max_len, len; 3159 3160 mcp->mc_ownedfp = npxgetregs(td); 3161 bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate[0], 3162 sizeof(mcp->mc_fpstate)); 3163 mcp->mc_fpformat = npxformat(); 3164 if (!use_xsave || xfpusave_len == 0) 3165 return; 3166 max_len = cpu_max_ext_state_size - sizeof(union savefpu); 3167 len = xfpusave_len; 3168 if (len > max_len) { 3169 len = max_len; 3170 bzero(xfpusave + max_len, len - max_len); 3171 } 3172 mcp->mc_flags |= _MC_HASFPXSTATE; 3173 mcp->mc_xfpustate_len = len; 3174 
bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len); 3175} 3176 3177static int 3178set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate, 3179 size_t xfpustate_len) 3180{ 3181 int error; 3182 3183 if (mcp->mc_fpformat == _MC_FPFMT_NODEV) 3184 return (0); 3185 else if (mcp->mc_fpformat != _MC_FPFMT_387 && 3186 mcp->mc_fpformat != _MC_FPFMT_XMM) 3187 return (EINVAL); 3188 else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) { 3189 /* We don't care what state is left in the FPU or PCB. */ 3190 fpstate_drop(td); 3191 error = 0; 3192 } else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU || 3193 mcp->mc_ownedfp == _MC_FPOWNED_PCB) { 3194 error = npxsetregs(td, (union savefpu *)&mcp->mc_fpstate, 3195 xfpustate, xfpustate_len); 3196 } else 3197 return (EINVAL); 3198 return (error); 3199} 3200 3201static void 3202fpstate_drop(struct thread *td) 3203{ 3204 3205 KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu")); 3206 critical_enter(); 3207 if (PCPU_GET(fpcurthread) == td) 3208 npxdrop(); 3209 /* 3210 * XXX force a full drop of the npx. The above only drops it if we 3211 * owned it. npxgetregs() has the same bug in the !cpu_fxsr case. 3212 * 3213 * XXX I don't much like npxgetregs()'s semantics of doing a full 3214 * drop. Dropping only to the pcb matches fnsave's behaviour. 3215 * We only need to drop to !PCB_INITDONE in sendsig(). But 3216 * sendsig() is the only caller of npxgetregs()... perhaps we just 3217 * have too many layers. 
3218 */ 3219 curthread->td_pcb->pcb_flags &= ~(PCB_NPXINITDONE | 3220 PCB_NPXUSERINITDONE); 3221 critical_exit(); 3222} 3223 3224int 3225fill_dbregs(struct thread *td, struct dbreg *dbregs) 3226{ 3227 struct pcb *pcb; 3228 3229 if (td == NULL) { 3230 dbregs->dr[0] = rdr0(); 3231 dbregs->dr[1] = rdr1(); 3232 dbregs->dr[2] = rdr2(); 3233 dbregs->dr[3] = rdr3(); 3234 dbregs->dr[4] = rdr4(); 3235 dbregs->dr[5] = rdr5(); 3236 dbregs->dr[6] = rdr6(); 3237 dbregs->dr[7] = rdr7(); 3238 } else { 3239 pcb = td->td_pcb; 3240 dbregs->dr[0] = pcb->pcb_dr0; 3241 dbregs->dr[1] = pcb->pcb_dr1; 3242 dbregs->dr[2] = pcb->pcb_dr2; 3243 dbregs->dr[3] = pcb->pcb_dr3; 3244 dbregs->dr[4] = 0; 3245 dbregs->dr[5] = 0; 3246 dbregs->dr[6] = pcb->pcb_dr6; 3247 dbregs->dr[7] = pcb->pcb_dr7; 3248 } 3249 return (0); 3250} 3251 3252int 3253set_dbregs(struct thread *td, struct dbreg *dbregs) 3254{ 3255 struct pcb *pcb; 3256 int i; 3257 3258 if (td == NULL) { 3259 load_dr0(dbregs->dr[0]); 3260 load_dr1(dbregs->dr[1]); 3261 load_dr2(dbregs->dr[2]); 3262 load_dr3(dbregs->dr[3]); 3263 load_dr4(dbregs->dr[4]); 3264 load_dr5(dbregs->dr[5]); 3265 load_dr6(dbregs->dr[6]); 3266 load_dr7(dbregs->dr[7]); 3267 } else { 3268 /* 3269 * Don't let an illegal value for dr7 get set. Specifically, 3270 * check for undefined settings. Setting these bit patterns 3271 * result in undefined behaviour and can lead to an unexpected 3272 * TRCTRAP. 3273 */ 3274 for (i = 0; i < 4; i++) { 3275 if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02) 3276 return (EINVAL); 3277 if (DBREG_DR7_LEN(dbregs->dr[7], i) == 0x02) 3278 return (EINVAL); 3279 } 3280 3281 pcb = td->td_pcb; 3282 3283 /* 3284 * Don't let a process set a breakpoint that is not within the 3285 * process's address space. If a process could do this, it 3286 * could halt the system by setting a breakpoint in the kernel 3287 * (if ddb was enabled). 
Thus, we need to check to make sure 3288 * that no breakpoints are being enabled for addresses outside 3289 * process's address space. 3290 * 3291 * XXX - what about when the watched area of the user's 3292 * address space is written into from within the kernel 3293 * ... wouldn't that still cause a breakpoint to be generated 3294 * from within kernel mode? 3295 */ 3296 3297 if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) { 3298 /* dr0 is enabled */ 3299 if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS) 3300 return (EINVAL); 3301 } 3302 3303 if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) { 3304 /* dr1 is enabled */ 3305 if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS) 3306 return (EINVAL); 3307 } 3308 3309 if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) { 3310 /* dr2 is enabled */ 3311 if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS) 3312 return (EINVAL); 3313 } 3314 3315 if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) { 3316 /* dr3 is enabled */ 3317 if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS) 3318 return (EINVAL); 3319 } 3320 3321 pcb->pcb_dr0 = dbregs->dr[0]; 3322 pcb->pcb_dr1 = dbregs->dr[1]; 3323 pcb->pcb_dr2 = dbregs->dr[2]; 3324 pcb->pcb_dr3 = dbregs->dr[3]; 3325 pcb->pcb_dr6 = dbregs->dr[6]; 3326 pcb->pcb_dr7 = dbregs->dr[7]; 3327 3328 pcb->pcb_flags |= PCB_DBREGS; 3329 } 3330 3331 return (0); 3332} 3333 3334/* 3335 * Return > 0 if a hardware breakpoint has been hit, and the 3336 * breakpoint was in user space. Return 0, otherwise. 
3337 */ 3338int 3339user_dbreg_trap(register_t dr6) 3340{ 3341 u_int32_t dr7; 3342 u_int32_t bp; /* breakpoint bits extracted from dr6 */ 3343 int nbp; /* number of breakpoints that triggered */ 3344 caddr_t addr[4]; /* breakpoint addresses */ 3345 int i; 3346 3347 bp = dr6 & DBREG_DR6_BMASK; 3348 if (bp == 0) { 3349 /* 3350 * None of the breakpoint bits are set meaning this 3351 * trap was not caused by any of the debug registers 3352 */ 3353 return 0; 3354 } 3355 3356 dr7 = rdr7(); 3357 if ((dr7 & 0x000000ff) == 0) { 3358 /* 3359 * all GE and LE bits in the dr7 register are zero, 3360 * thus the trap couldn't have been caused by the 3361 * hardware debug registers 3362 */ 3363 return 0; 3364 } 3365 3366 nbp = 0; 3367 3368 /* 3369 * at least one of the breakpoints were hit, check to see 3370 * which ones and if any of them are user space addresses 3371 */ 3372 3373 if (bp & 0x01) { 3374 addr[nbp++] = (caddr_t)rdr0(); 3375 } 3376 if (bp & 0x02) { 3377 addr[nbp++] = (caddr_t)rdr1(); 3378 } 3379 if (bp & 0x04) { 3380 addr[nbp++] = (caddr_t)rdr2(); 3381 } 3382 if (bp & 0x08) { 3383 addr[nbp++] = (caddr_t)rdr3(); 3384 } 3385 3386 for (i = 0; i < nbp; i++) { 3387 if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) { 3388 /* 3389 * addr[i] is in user space 3390 */ 3391 return nbp; 3392 } 3393 } 3394 3395 /* 3396 * None of the breakpoints are in user space. 3397 */ 3398 return 0; 3399} 3400 3401#ifdef KDB 3402 3403/* 3404 * Provide inb() and outb() as functions. They are normally only available as 3405 * inline functions, thus cannot be called from the debugger. 3406 */ 3407 3408/* silence compiler warnings */ 3409u_char inb_(u_short); 3410void outb_(u_short, u_char); 3411 3412u_char 3413inb_(u_short port) 3414{ 3415 return inb(port); 3416} 3417 3418void 3419outb_(u_short port, u_char data) 3420{ 3421 outb(port, data); 3422} 3423 3424#endif /* KDB */ 3425