machdep.c revision 338691
1/*- 2 * Copyright (c) 1992 Terrence R. Lambert. 3 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. 4 * All rights reserved. 5 * 6 * This code is derived from software contributed to Berkeley by 7 * William Jolitz. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 3. All advertising materials mentioning features or use of this software 18 * must display the following acknowledgement: 19 * This product includes software developed by the University of 20 * California, Berkeley and its contributors. 21 * 4. Neither the name of the University nor the names of its contributors 22 * may be used to endorse or promote products derived from this software 23 * without specific prior written permission. 24 * 25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 28 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 35 * SUCH DAMAGE. 36 * 37 * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 38 */ 39 40#include <sys/cdefs.h> 41__FBSDID("$FreeBSD: stable/11/sys/i386/i386/machdep.c 338691 2018-09-14 23:21:52Z jhb $"); 42 43#include "opt_apic.h" 44#include "opt_atpic.h" 45#include "opt_compat.h" 46#include "opt_cpu.h" 47#include "opt_ddb.h" 48#include "opt_inet.h" 49#include "opt_isa.h" 50#include "opt_kstack_pages.h" 51#include "opt_maxmem.h" 52#include "opt_mp_watchdog.h" 53#include "opt_perfmon.h" 54#include "opt_platform.h" 55#include "opt_xbox.h" 56 57#include <sys/param.h> 58#include <sys/proc.h> 59#include <sys/systm.h> 60#include <sys/bio.h> 61#include <sys/buf.h> 62#include <sys/bus.h> 63#include <sys/callout.h> 64#include <sys/cons.h> 65#include <sys/cpu.h> 66#include <sys/eventhandler.h> 67#include <sys/exec.h> 68#include <sys/imgact.h> 69#include <sys/kdb.h> 70#include <sys/kernel.h> 71#include <sys/ktr.h> 72#include <sys/linker.h> 73#include <sys/lock.h> 74#include <sys/malloc.h> 75#include <sys/memrange.h> 76#include <sys/msgbuf.h> 77#include <sys/mutex.h> 78#include <sys/pcpu.h> 79#include <sys/ptrace.h> 80#include <sys/reboot.h> 81#include <sys/rwlock.h> 82#include <sys/sched.h> 83#include <sys/signalvar.h> 84#ifdef SMP 85#include <sys/smp.h> 86#endif 87#include <sys/syscallsubr.h> 88#include <sys/sysctl.h> 89#include <sys/sysent.h> 90#include <sys/sysproto.h> 91#include <sys/ucontext.h> 92#include <sys/vmmeter.h> 93 94#include <vm/vm.h> 95#include 
<vm/vm_extern.h> 96#include <vm/vm_kern.h> 97#include <vm/vm_page.h> 98#include <vm/vm_map.h> 99#include <vm/vm_object.h> 100#include <vm/vm_pager.h> 101#include <vm/vm_param.h> 102#include <vm/vm_phys.h> 103 104#ifdef DDB 105#ifndef KDB 106#error KDB must be enabled in order for DDB to work! 107#endif 108#include <ddb/ddb.h> 109#include <ddb/db_sym.h> 110#endif 111 112#ifdef PC98 113#include <pc98/pc98/pc98_machdep.h> 114#else 115#include <isa/rtc.h> 116#endif 117 118#include <net/netisr.h> 119 120#include <machine/bootinfo.h> 121#include <machine/clock.h> 122#include <machine/cpu.h> 123#include <machine/cputypes.h> 124#include <machine/intr_machdep.h> 125#include <x86/mca.h> 126#include <machine/md_var.h> 127#include <machine/metadata.h> 128#include <machine/mp_watchdog.h> 129#include <machine/pc/bios.h> 130#include <machine/pcb.h> 131#include <machine/pcb_ext.h> 132#include <machine/proc.h> 133#include <machine/reg.h> 134#include <machine/sigframe.h> 135#include <machine/specialreg.h> 136#include <machine/vm86.h> 137#include <x86/init.h> 138#ifdef PERFMON 139#include <machine/perfmon.h> 140#endif 141#ifdef SMP 142#include <machine/smp.h> 143#endif 144#ifdef FDT 145#include <x86/fdt.h> 146#endif 147 148#ifdef DEV_APIC 149#include <x86/apicvar.h> 150#endif 151 152#ifdef DEV_ISA 153#include <x86/isa/icu.h> 154#endif 155 156#ifdef XBOX 157#include <machine/xbox.h> 158 159int arch_i386_is_xbox = 0; 160uint32_t arch_i386_xbox_memsize = 0; 161#endif 162 163/* Sanity check for __curthread() */ 164CTASSERT(offsetof(struct pcpu, pc_curthread) == 0); 165 166extern register_t init386(int first); 167extern void dblfault_handler(void); 168 169static void cpu_startup(void *); 170static void fpstate_drop(struct thread *td); 171static void get_fpcontext(struct thread *td, mcontext_t *mcp, 172 char *xfpusave, size_t xfpusave_len); 173static int set_fpcontext(struct thread *td, mcontext_t *mcp, 174 char *xfpustate, size_t xfpustate_len); 175SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, 
cpu_startup, NULL); 176 177/* Intel ICH registers */ 178#define ICH_PMBASE 0x400 179#define ICH_SMI_EN ICH_PMBASE + 0x30 180 181int _udatasel, _ucodesel; 182u_int basemem; 183 184#ifdef PC98 185int need_pre_dma_flush; /* If 1, use wbinvd befor DMA transfer. */ 186int need_post_dma_flush; /* If 1, use invd after DMA transfer. */ 187 188static int ispc98 = 1; 189SYSCTL_INT(_machdep, OID_AUTO, ispc98, CTLFLAG_RD, &ispc98, 0, ""); 190#endif 191 192int cold = 1; 193 194#ifdef COMPAT_43 195static void osendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask); 196#endif 197#ifdef COMPAT_FREEBSD4 198static void freebsd4_sendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask); 199#endif 200 201long Maxmem = 0; 202long realmem = 0; 203 204#ifdef PAE 205FEATURE(pae, "Physical Address Extensions"); 206#endif 207 208/* 209 * The number of PHYSMAP entries must be one less than the number of 210 * PHYSSEG entries because the PHYSMAP entry that spans the largest 211 * physical address that is accessible by ISA DMA is split into two 212 * PHYSSEG entries. 213 */ 214#define PHYSMAP_SIZE (2 * (VM_PHYSSEG_MAX - 1)) 215 216vm_paddr_t phys_avail[PHYSMAP_SIZE + 2]; 217vm_paddr_t dump_avail[PHYSMAP_SIZE + 2]; 218 219/* must be 2 less so 0 0 can signal end of chunks */ 220#define PHYS_AVAIL_ARRAY_END (nitems(phys_avail) - 2) 221#define DUMP_AVAIL_ARRAY_END (nitems(dump_avail) - 2) 222 223struct kva_md_info kmi; 224 225static struct trapframe proc0_tf; 226struct pcpu __pcpu[MAXCPU]; 227 228struct mtx icu_lock; 229 230struct mem_range_softc mem_range_softc; 231 232 /* Default init_ops implementation. 
*/ 233 struct init_ops init_ops = { 234 .early_clock_source_init = i8254_init, 235 .early_delay = i8254_delay, 236#ifdef DEV_APIC 237 .msi_init = msi_init, 238#endif 239 }; 240 241static void 242cpu_startup(dummy) 243 void *dummy; 244{ 245 uintmax_t memsize; 246 char *sysenv; 247 248#ifndef PC98 249 /* 250 * On MacBooks, we need to disallow the legacy USB circuit to 251 * generate an SMI# because this can cause several problems, 252 * namely: incorrect CPU frequency detection and failure to 253 * start the APs. 254 * We do this by disabling a bit in the SMI_EN (SMI Control and 255 * Enable register) of the Intel ICH LPC Interface Bridge. 256 */ 257 sysenv = kern_getenv("smbios.system.product"); 258 if (sysenv != NULL) { 259 if (strncmp(sysenv, "MacBook1,1", 10) == 0 || 260 strncmp(sysenv, "MacBook3,1", 10) == 0 || 261 strncmp(sysenv, "MacBook4,1", 10) == 0 || 262 strncmp(sysenv, "MacBookPro1,1", 13) == 0 || 263 strncmp(sysenv, "MacBookPro1,2", 13) == 0 || 264 strncmp(sysenv, "MacBookPro3,1", 13) == 0 || 265 strncmp(sysenv, "MacBookPro4,1", 13) == 0 || 266 strncmp(sysenv, "Macmini1,1", 10) == 0) { 267 if (bootverbose) 268 printf("Disabling LEGACY_USB_EN bit on " 269 "Intel ICH.\n"); 270 outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8); 271 } 272 freeenv(sysenv); 273 } 274#endif /* !PC98 */ 275 276 /* 277 * Good {morning,afternoon,evening,night}. 278 */ 279 startrtclock(); 280 printcpuinfo(); 281 panicifcpuunsupported(); 282#ifdef PERFMON 283 perfmon_init(); 284#endif 285 286 /* 287 * Display physical memory if SMBIOS reports reasonable amount. 
288 */ 289 memsize = 0; 290 sysenv = kern_getenv("smbios.memory.enabled"); 291 if (sysenv != NULL) { 292 memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10; 293 freeenv(sysenv); 294 } 295 if (memsize < ptoa((uintmax_t)vm_cnt.v_free_count)) 296 memsize = ptoa((uintmax_t)Maxmem); 297 printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20); 298 realmem = atop(memsize); 299 300 /* 301 * Display any holes after the first chunk of extended memory. 302 */ 303 if (bootverbose) { 304 int indx; 305 306 printf("Physical memory chunk(s):\n"); 307 for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { 308 vm_paddr_t size; 309 310 size = phys_avail[indx + 1] - phys_avail[indx]; 311 printf( 312 "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n", 313 (uintmax_t)phys_avail[indx], 314 (uintmax_t)phys_avail[indx + 1] - 1, 315 (uintmax_t)size, (uintmax_t)size / PAGE_SIZE); 316 } 317 } 318 319 vm_ksubmap_init(&kmi); 320 321 printf("avail memory = %ju (%ju MB)\n", 322 ptoa((uintmax_t)vm_cnt.v_free_count), 323 ptoa((uintmax_t)vm_cnt.v_free_count) / 1048576); 324 325 /* 326 * Set up buffers, so they can be used to read disk labels. 327 */ 328 bufinit(); 329 vm_pager_bufferinit(); 330 cpu_setregs(); 331} 332 333/* 334 * Send an interrupt to process. 335 * 336 * Stack is set up to allow sigcode stored 337 * at top to call routine, followed by call 338 * to sigreturn routine below. After sigreturn 339 * resets the signal mask, the stack, and the 340 * frame pointer, it returns to the user 341 * specified pc, psl. 
342 */ 343#ifdef COMPAT_43 344static void 345osendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) 346{ 347 struct osigframe sf, *fp; 348 struct proc *p; 349 struct thread *td; 350 struct sigacts *psp; 351 struct trapframe *regs; 352 int sig; 353 int oonstack; 354 355 td = curthread; 356 p = td->td_proc; 357 PROC_LOCK_ASSERT(p, MA_OWNED); 358 sig = ksi->ksi_signo; 359 psp = p->p_sigacts; 360 mtx_assert(&psp->ps_mtx, MA_OWNED); 361 regs = td->td_frame; 362 oonstack = sigonstack(regs->tf_esp); 363 364 /* Allocate space for the signal handler context. */ 365 if ((td->td_pflags & TDP_ALTSTACK) && !oonstack && 366 SIGISMEMBER(psp->ps_sigonstack, sig)) { 367 fp = (struct osigframe *)((uintptr_t)td->td_sigstk.ss_sp + 368 td->td_sigstk.ss_size - sizeof(struct osigframe)); 369#if defined(COMPAT_43) 370 td->td_sigstk.ss_flags |= SS_ONSTACK; 371#endif 372 } else 373 fp = (struct osigframe *)regs->tf_esp - 1; 374 375 /* Build the argument list for the signal handler. */ 376 sf.sf_signum = sig; 377 sf.sf_scp = (register_t)&fp->sf_siginfo.si_sc; 378 bzero(&sf.sf_siginfo, sizeof(sf.sf_siginfo)); 379 if (SIGISMEMBER(psp->ps_siginfo, sig)) { 380 /* Signal handler installed with SA_SIGINFO. */ 381 sf.sf_arg2 = (register_t)&fp->sf_siginfo; 382 sf.sf_siginfo.si_signo = sig; 383 sf.sf_siginfo.si_code = ksi->ksi_code; 384 sf.sf_ahu.sf_action = (__osiginfohandler_t *)catcher; 385 sf.sf_addr = 0; 386 } else { 387 /* Old FreeBSD-style arguments. */ 388 sf.sf_arg2 = ksi->ksi_code; 389 sf.sf_addr = (register_t)ksi->ksi_addr; 390 sf.sf_ahu.sf_handler = catcher; 391 } 392 mtx_unlock(&psp->ps_mtx); 393 PROC_UNLOCK(p); 394 395 /* Save most if not all of trap frame. 
*/ 396 sf.sf_siginfo.si_sc.sc_eax = regs->tf_eax; 397 sf.sf_siginfo.si_sc.sc_ebx = regs->tf_ebx; 398 sf.sf_siginfo.si_sc.sc_ecx = regs->tf_ecx; 399 sf.sf_siginfo.si_sc.sc_edx = regs->tf_edx; 400 sf.sf_siginfo.si_sc.sc_esi = regs->tf_esi; 401 sf.sf_siginfo.si_sc.sc_edi = regs->tf_edi; 402 sf.sf_siginfo.si_sc.sc_cs = regs->tf_cs; 403 sf.sf_siginfo.si_sc.sc_ds = regs->tf_ds; 404 sf.sf_siginfo.si_sc.sc_ss = regs->tf_ss; 405 sf.sf_siginfo.si_sc.sc_es = regs->tf_es; 406 sf.sf_siginfo.si_sc.sc_fs = regs->tf_fs; 407 sf.sf_siginfo.si_sc.sc_gs = rgs(); 408 sf.sf_siginfo.si_sc.sc_isp = regs->tf_isp; 409 410 /* Build the signal context to be used by osigreturn(). */ 411 sf.sf_siginfo.si_sc.sc_onstack = (oonstack) ? 1 : 0; 412 SIG2OSIG(*mask, sf.sf_siginfo.si_sc.sc_mask); 413 sf.sf_siginfo.si_sc.sc_sp = regs->tf_esp; 414 sf.sf_siginfo.si_sc.sc_fp = regs->tf_ebp; 415 sf.sf_siginfo.si_sc.sc_pc = regs->tf_eip; 416 sf.sf_siginfo.si_sc.sc_ps = regs->tf_eflags; 417 sf.sf_siginfo.si_sc.sc_trapno = regs->tf_trapno; 418 sf.sf_siginfo.si_sc.sc_err = regs->tf_err; 419 420 /* 421 * If we're a vm86 process, we want to save the segment registers. 422 * We also change eflags to be our emulated eflags, not the actual 423 * eflags. 424 */ 425 if (regs->tf_eflags & PSL_VM) { 426 /* XXX confusing names: `tf' isn't a trapframe; `regs' is. */ 427 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; 428 struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86; 429 430 sf.sf_siginfo.si_sc.sc_gs = tf->tf_vm86_gs; 431 sf.sf_siginfo.si_sc.sc_fs = tf->tf_vm86_fs; 432 sf.sf_siginfo.si_sc.sc_es = tf->tf_vm86_es; 433 sf.sf_siginfo.si_sc.sc_ds = tf->tf_vm86_ds; 434 435 if (vm86->vm86_has_vme == 0) 436 sf.sf_siginfo.si_sc.sc_ps = 437 (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | 438 (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); 439 440 /* See sendsig() for comments. */ 441 tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP); 442 } 443 444 /* 445 * Copy the sigframe out to the user's stack. 
446 */ 447 if (copyout(&sf, fp, sizeof(*fp)) != 0) { 448 PROC_LOCK(p); 449 sigexit(td, SIGILL); 450 } 451 452 regs->tf_esp = (int)fp; 453 if (p->p_sysent->sv_sigcode_base != 0) { 454 regs->tf_eip = p->p_sysent->sv_sigcode_base + szsigcode - 455 szosigcode; 456 } else { 457 /* a.out sysentvec does not use shared page */ 458 regs->tf_eip = p->p_sysent->sv_psstrings - szosigcode; 459 } 460 regs->tf_eflags &= ~(PSL_T | PSL_D); 461 regs->tf_cs = _ucodesel; 462 regs->tf_ds = _udatasel; 463 regs->tf_es = _udatasel; 464 regs->tf_fs = _udatasel; 465 load_gs(_udatasel); 466 regs->tf_ss = _udatasel; 467 PROC_LOCK(p); 468 mtx_lock(&psp->ps_mtx); 469} 470#endif /* COMPAT_43 */ 471 472#ifdef COMPAT_FREEBSD4 473static void 474freebsd4_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) 475{ 476 struct sigframe4 sf, *sfp; 477 struct proc *p; 478 struct thread *td; 479 struct sigacts *psp; 480 struct trapframe *regs; 481 int sig; 482 int oonstack; 483 484 td = curthread; 485 p = td->td_proc; 486 PROC_LOCK_ASSERT(p, MA_OWNED); 487 sig = ksi->ksi_signo; 488 psp = p->p_sigacts; 489 mtx_assert(&psp->ps_mtx, MA_OWNED); 490 regs = td->td_frame; 491 oonstack = sigonstack(regs->tf_esp); 492 493 /* Save user context. */ 494 bzero(&sf, sizeof(sf)); 495 sf.sf_uc.uc_sigmask = *mask; 496 sf.sf_uc.uc_stack = td->td_sigstk; 497 sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) 498 ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; 499 sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; 500 sf.sf_uc.uc_mcontext.mc_gs = rgs(); 501 bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs)); 502 bzero(sf.sf_uc.uc_mcontext.mc_fpregs, 503 sizeof(sf.sf_uc.uc_mcontext.mc_fpregs)); 504 bzero(sf.sf_uc.uc_mcontext.__spare__, 505 sizeof(sf.sf_uc.uc_mcontext.__spare__)); 506 bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__)); 507 508 /* Allocate space for the signal handler context. 
*/ 509 if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && 510 SIGISMEMBER(psp->ps_sigonstack, sig)) { 511 sfp = (struct sigframe4 *)((uintptr_t)td->td_sigstk.ss_sp + 512 td->td_sigstk.ss_size - sizeof(struct sigframe4)); 513#if defined(COMPAT_43) 514 td->td_sigstk.ss_flags |= SS_ONSTACK; 515#endif 516 } else 517 sfp = (struct sigframe4 *)regs->tf_esp - 1; 518 519 /* Build the argument list for the signal handler. */ 520 sf.sf_signum = sig; 521 sf.sf_ucontext = (register_t)&sfp->sf_uc; 522 bzero(&sf.sf_si, sizeof(sf.sf_si)); 523 if (SIGISMEMBER(psp->ps_siginfo, sig)) { 524 /* Signal handler installed with SA_SIGINFO. */ 525 sf.sf_siginfo = (register_t)&sfp->sf_si; 526 sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher; 527 528 /* Fill in POSIX parts */ 529 sf.sf_si.si_signo = sig; 530 sf.sf_si.si_code = ksi->ksi_code; 531 sf.sf_si.si_addr = ksi->ksi_addr; 532 } else { 533 /* Old FreeBSD-style arguments. */ 534 sf.sf_siginfo = ksi->ksi_code; 535 sf.sf_addr = (register_t)ksi->ksi_addr; 536 sf.sf_ahu.sf_handler = catcher; 537 } 538 mtx_unlock(&psp->ps_mtx); 539 PROC_UNLOCK(p); 540 541 /* 542 * If we're a vm86 process, we want to save the segment registers. 543 * We also change eflags to be our emulated eflags, not the actual 544 * eflags. 545 */ 546 if (regs->tf_eflags & PSL_VM) { 547 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; 548 struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86; 549 550 sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs; 551 sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs; 552 sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es; 553 sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds; 554 555 if (vm86->vm86_has_vme == 0) 556 sf.sf_uc.uc_mcontext.mc_eflags = 557 (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | 558 (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); 559 560 /* 561 * Clear PSL_NT to inhibit T_TSSFLT faults on return from 562 * syscalls made by the signal handler. This just avoids 563 * wasting time for our lazy fixup of such faults. 
PSL_NT 564 * does nothing in vm86 mode, but vm86 programs can set it 565 * almost legitimately in probes for old cpu types. 566 */ 567 tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP); 568 } 569 570 /* 571 * Copy the sigframe out to the user's stack. 572 */ 573 if (copyout(&sf, sfp, sizeof(*sfp)) != 0) { 574 PROC_LOCK(p); 575 sigexit(td, SIGILL); 576 } 577 578 regs->tf_esp = (int)sfp; 579 regs->tf_eip = p->p_sysent->sv_sigcode_base + szsigcode - 580 szfreebsd4_sigcode; 581 regs->tf_eflags &= ~(PSL_T | PSL_D); 582 regs->tf_cs = _ucodesel; 583 regs->tf_ds = _udatasel; 584 regs->tf_es = _udatasel; 585 regs->tf_fs = _udatasel; 586 regs->tf_ss = _udatasel; 587 PROC_LOCK(p); 588 mtx_lock(&psp->ps_mtx); 589} 590#endif /* COMPAT_FREEBSD4 */ 591 592void 593sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) 594{ 595 struct sigframe sf, *sfp; 596 struct proc *p; 597 struct thread *td; 598 struct sigacts *psp; 599 char *sp; 600 struct trapframe *regs; 601 struct segment_descriptor *sdp; 602 char *xfpusave; 603 size_t xfpusave_len; 604 int sig; 605 int oonstack; 606 607 td = curthread; 608 p = td->td_proc; 609 PROC_LOCK_ASSERT(p, MA_OWNED); 610 sig = ksi->ksi_signo; 611 psp = p->p_sigacts; 612 mtx_assert(&psp->ps_mtx, MA_OWNED); 613#ifdef COMPAT_FREEBSD4 614 if (SIGISMEMBER(psp->ps_freebsd4, sig)) { 615 freebsd4_sendsig(catcher, ksi, mask); 616 return; 617 } 618#endif 619#ifdef COMPAT_43 620 if (SIGISMEMBER(psp->ps_osigset, sig)) { 621 osendsig(catcher, ksi, mask); 622 return; 623 } 624#endif 625 regs = td->td_frame; 626 oonstack = sigonstack(regs->tf_esp); 627 628 if (cpu_max_ext_state_size > sizeof(union savefpu) && use_xsave) { 629 xfpusave_len = cpu_max_ext_state_size - sizeof(union savefpu); 630 xfpusave = __builtin_alloca(xfpusave_len); 631 } else { 632 xfpusave_len = 0; 633 xfpusave = NULL; 634 } 635 636 /* Save user context. 
*/ 637 bzero(&sf, sizeof(sf)); 638 sf.sf_uc.uc_sigmask = *mask; 639 sf.sf_uc.uc_stack = td->td_sigstk; 640 sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) 641 ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; 642 sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; 643 sf.sf_uc.uc_mcontext.mc_gs = rgs(); 644 bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs)); 645 sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */ 646 get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len); 647 fpstate_drop(td); 648 /* 649 * Unconditionally fill the fsbase and gsbase into the mcontext. 650 */ 651 sdp = &td->td_pcb->pcb_fsd; 652 sf.sf_uc.uc_mcontext.mc_fsbase = sdp->sd_hibase << 24 | 653 sdp->sd_lobase; 654 sdp = &td->td_pcb->pcb_gsd; 655 sf.sf_uc.uc_mcontext.mc_gsbase = sdp->sd_hibase << 24 | 656 sdp->sd_lobase; 657 bzero(sf.sf_uc.uc_mcontext.mc_spare2, 658 sizeof(sf.sf_uc.uc_mcontext.mc_spare2)); 659 bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__)); 660 661 /* Allocate space for the signal handler context. */ 662 if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && 663 SIGISMEMBER(psp->ps_sigonstack, sig)) { 664 sp = (char *)td->td_sigstk.ss_sp + td->td_sigstk.ss_size; 665#if defined(COMPAT_43) 666 td->td_sigstk.ss_flags |= SS_ONSTACK; 667#endif 668 } else 669 sp = (char *)regs->tf_esp - 128; 670 if (xfpusave != NULL) { 671 sp -= xfpusave_len; 672 sp = (char *)((unsigned int)sp & ~0x3F); 673 sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp; 674 } 675 sp -= sizeof(struct sigframe); 676 677 /* Align to 16 bytes. */ 678 sfp = (struct sigframe *)((unsigned int)sp & ~0xF); 679 680 /* Build the argument list for the signal handler. */ 681 sf.sf_signum = sig; 682 sf.sf_ucontext = (register_t)&sfp->sf_uc; 683 bzero(&sf.sf_si, sizeof(sf.sf_si)); 684 if (SIGISMEMBER(psp->ps_siginfo, sig)) { 685 /* Signal handler installed with SA_SIGINFO. 
*/ 686 sf.sf_siginfo = (register_t)&sfp->sf_si; 687 sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher; 688 689 /* Fill in POSIX parts */ 690 sf.sf_si = ksi->ksi_info; 691 sf.sf_si.si_signo = sig; /* maybe a translated signal */ 692 } else { 693 /* Old FreeBSD-style arguments. */ 694 sf.sf_siginfo = ksi->ksi_code; 695 sf.sf_addr = (register_t)ksi->ksi_addr; 696 sf.sf_ahu.sf_handler = catcher; 697 } 698 mtx_unlock(&psp->ps_mtx); 699 PROC_UNLOCK(p); 700 701 /* 702 * If we're a vm86 process, we want to save the segment registers. 703 * We also change eflags to be our emulated eflags, not the actual 704 * eflags. 705 */ 706 if (regs->tf_eflags & PSL_VM) { 707 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; 708 struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86; 709 710 sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs; 711 sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs; 712 sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es; 713 sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds; 714 715 if (vm86->vm86_has_vme == 0) 716 sf.sf_uc.uc_mcontext.mc_eflags = 717 (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | 718 (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); 719 720 /* 721 * Clear PSL_NT to inhibit T_TSSFLT faults on return from 722 * syscalls made by the signal handler. This just avoids 723 * wasting time for our lazy fixup of such faults. PSL_NT 724 * does nothing in vm86 mode, but vm86 programs can set it 725 * almost legitimately in probes for old cpu types. 726 */ 727 tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP); 728 } 729 730 /* 731 * Copy the sigframe out to the user's stack. 
732 */ 733 if (copyout(&sf, sfp, sizeof(*sfp)) != 0 || 734 (xfpusave != NULL && copyout(xfpusave, 735 (void *)sf.sf_uc.uc_mcontext.mc_xfpustate, xfpusave_len) 736 != 0)) { 737 PROC_LOCK(p); 738 sigexit(td, SIGILL); 739 } 740 741 regs->tf_esp = (int)sfp; 742 regs->tf_eip = p->p_sysent->sv_sigcode_base; 743 if (regs->tf_eip == 0) 744 regs->tf_eip = p->p_sysent->sv_psstrings - szsigcode; 745 regs->tf_eflags &= ~(PSL_T | PSL_D); 746 regs->tf_cs = _ucodesel; 747 regs->tf_ds = _udatasel; 748 regs->tf_es = _udatasel; 749 regs->tf_fs = _udatasel; 750 regs->tf_ss = _udatasel; 751 PROC_LOCK(p); 752 mtx_lock(&psp->ps_mtx); 753} 754 755/* 756 * System call to cleanup state after a signal 757 * has been taken. Reset signal mask and 758 * stack state from context left by sendsig (above). 759 * Return to previous pc and psl as specified by 760 * context left by sendsig. Check carefully to 761 * make sure that the user has not modified the 762 * state to gain improper privileges. 763 * 764 * MPSAFE 765 */ 766#ifdef COMPAT_43 767int 768osigreturn(td, uap) 769 struct thread *td; 770 struct osigreturn_args /* { 771 struct osigcontext *sigcntxp; 772 } */ *uap; 773{ 774 struct osigcontext sc; 775 struct trapframe *regs; 776 struct osigcontext *scp; 777 int eflags, error; 778 ksiginfo_t ksi; 779 780 regs = td->td_frame; 781 error = copyin(uap->sigcntxp, &sc, sizeof(sc)); 782 if (error != 0) 783 return (error); 784 scp = ≻ 785 eflags = scp->sc_ps; 786 if (eflags & PSL_VM) { 787 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; 788 struct vm86_kernel *vm86; 789 790 /* 791 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't 792 * set up the vm86 area, and we can't enter vm86 mode. 793 */ 794 if (td->td_pcb->pcb_ext == 0) 795 return (EINVAL); 796 vm86 = &td->td_pcb->pcb_ext->ext_vm86; 797 if (vm86->vm86_inited == 0) 798 return (EINVAL); 799 800 /* Go back to user mode if both flags are set. 
*/ 801 if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) { 802 ksiginfo_init_trap(&ksi); 803 ksi.ksi_signo = SIGBUS; 804 ksi.ksi_code = BUS_OBJERR; 805 ksi.ksi_addr = (void *)regs->tf_eip; 806 trapsignal(td, &ksi); 807 } 808 809 if (vm86->vm86_has_vme) { 810 eflags = (tf->tf_eflags & ~VME_USERCHANGE) | 811 (eflags & VME_USERCHANGE) | PSL_VM; 812 } else { 813 vm86->vm86_eflags = eflags; /* save VIF, VIP */ 814 eflags = (tf->tf_eflags & ~VM_USERCHANGE) | 815 (eflags & VM_USERCHANGE) | PSL_VM; 816 } 817 tf->tf_vm86_ds = scp->sc_ds; 818 tf->tf_vm86_es = scp->sc_es; 819 tf->tf_vm86_fs = scp->sc_fs; 820 tf->tf_vm86_gs = scp->sc_gs; 821 tf->tf_ds = _udatasel; 822 tf->tf_es = _udatasel; 823 tf->tf_fs = _udatasel; 824 } else { 825 /* 826 * Don't allow users to change privileged or reserved flags. 827 */ 828 if (!EFL_SECURE(eflags, regs->tf_eflags)) { 829 return (EINVAL); 830 } 831 832 /* 833 * Don't allow users to load a valid privileged %cs. Let the 834 * hardware check for invalid selectors, excess privilege in 835 * other selectors, invalid %eip's and invalid %esp's. 836 */ 837 if (!CS_SECURE(scp->sc_cs)) { 838 ksiginfo_init_trap(&ksi); 839 ksi.ksi_signo = SIGBUS; 840 ksi.ksi_code = BUS_OBJERR; 841 ksi.ksi_trapno = T_PROTFLT; 842 ksi.ksi_addr = (void *)regs->tf_eip; 843 trapsignal(td, &ksi); 844 return (EINVAL); 845 } 846 regs->tf_ds = scp->sc_ds; 847 regs->tf_es = scp->sc_es; 848 regs->tf_fs = scp->sc_fs; 849 } 850 851 /* Restore remaining registers. 
*/ 852 regs->tf_eax = scp->sc_eax; 853 regs->tf_ebx = scp->sc_ebx; 854 regs->tf_ecx = scp->sc_ecx; 855 regs->tf_edx = scp->sc_edx; 856 regs->tf_esi = scp->sc_esi; 857 regs->tf_edi = scp->sc_edi; 858 regs->tf_cs = scp->sc_cs; 859 regs->tf_ss = scp->sc_ss; 860 regs->tf_isp = scp->sc_isp; 861 regs->tf_ebp = scp->sc_fp; 862 regs->tf_esp = scp->sc_sp; 863 regs->tf_eip = scp->sc_pc; 864 regs->tf_eflags = eflags; 865 866#if defined(COMPAT_43) 867 if (scp->sc_onstack & 1) 868 td->td_sigstk.ss_flags |= SS_ONSTACK; 869 else 870 td->td_sigstk.ss_flags &= ~SS_ONSTACK; 871#endif 872 kern_sigprocmask(td, SIG_SETMASK, (sigset_t *)&scp->sc_mask, NULL, 873 SIGPROCMASK_OLD); 874 return (EJUSTRETURN); 875} 876#endif /* COMPAT_43 */ 877 878#ifdef COMPAT_FREEBSD4 879/* 880 * MPSAFE 881 */ 882int 883freebsd4_sigreturn(td, uap) 884 struct thread *td; 885 struct freebsd4_sigreturn_args /* { 886 const ucontext4 *sigcntxp; 887 } */ *uap; 888{ 889 struct ucontext4 uc; 890 struct trapframe *regs; 891 struct ucontext4 *ucp; 892 int cs, eflags, error; 893 ksiginfo_t ksi; 894 895 error = copyin(uap->sigcntxp, &uc, sizeof(uc)); 896 if (error != 0) 897 return (error); 898 ucp = &uc; 899 regs = td->td_frame; 900 eflags = ucp->uc_mcontext.mc_eflags; 901 if (eflags & PSL_VM) { 902 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; 903 struct vm86_kernel *vm86; 904 905 /* 906 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't 907 * set up the vm86 area, and we can't enter vm86 mode. 908 */ 909 if (td->td_pcb->pcb_ext == 0) 910 return (EINVAL); 911 vm86 = &td->td_pcb->pcb_ext->ext_vm86; 912 if (vm86->vm86_inited == 0) 913 return (EINVAL); 914 915 /* Go back to user mode if both flags are set. 
*/ 916 if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) { 917 ksiginfo_init_trap(&ksi); 918 ksi.ksi_signo = SIGBUS; 919 ksi.ksi_code = BUS_OBJERR; 920 ksi.ksi_addr = (void *)regs->tf_eip; 921 trapsignal(td, &ksi); 922 } 923 if (vm86->vm86_has_vme) { 924 eflags = (tf->tf_eflags & ~VME_USERCHANGE) | 925 (eflags & VME_USERCHANGE) | PSL_VM; 926 } else { 927 vm86->vm86_eflags = eflags; /* save VIF, VIP */ 928 eflags = (tf->tf_eflags & ~VM_USERCHANGE) | 929 (eflags & VM_USERCHANGE) | PSL_VM; 930 } 931 bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe)); 932 tf->tf_eflags = eflags; 933 tf->tf_vm86_ds = tf->tf_ds; 934 tf->tf_vm86_es = tf->tf_es; 935 tf->tf_vm86_fs = tf->tf_fs; 936 tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs; 937 tf->tf_ds = _udatasel; 938 tf->tf_es = _udatasel; 939 tf->tf_fs = _udatasel; 940 } else { 941 /* 942 * Don't allow users to change privileged or reserved flags. 943 */ 944 if (!EFL_SECURE(eflags, regs->tf_eflags)) { 945 uprintf("pid %d (%s): freebsd4_sigreturn eflags = 0x%x\n", 946 td->td_proc->p_pid, td->td_name, eflags); 947 return (EINVAL); 948 } 949 950 /* 951 * Don't allow users to load a valid privileged %cs. Let the 952 * hardware check for invalid selectors, excess privilege in 953 * other selectors, invalid %eip's and invalid %esp's. 
954 */ 955 cs = ucp->uc_mcontext.mc_cs; 956 if (!CS_SECURE(cs)) { 957 uprintf("pid %d (%s): freebsd4_sigreturn cs = 0x%x\n", 958 td->td_proc->p_pid, td->td_name, cs); 959 ksiginfo_init_trap(&ksi); 960 ksi.ksi_signo = SIGBUS; 961 ksi.ksi_code = BUS_OBJERR; 962 ksi.ksi_trapno = T_PROTFLT; 963 ksi.ksi_addr = (void *)regs->tf_eip; 964 trapsignal(td, &ksi); 965 return (EINVAL); 966 } 967 968 bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs)); 969 } 970 971#if defined(COMPAT_43) 972 if (ucp->uc_mcontext.mc_onstack & 1) 973 td->td_sigstk.ss_flags |= SS_ONSTACK; 974 else 975 td->td_sigstk.ss_flags &= ~SS_ONSTACK; 976#endif 977 kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0); 978 return (EJUSTRETURN); 979} 980#endif /* COMPAT_FREEBSD4 */ 981 982/* 983 * MPSAFE 984 */ 985int 986sys_sigreturn(td, uap) 987 struct thread *td; 988 struct sigreturn_args /* { 989 const struct __ucontext *sigcntxp; 990 } */ *uap; 991{ 992 ucontext_t uc; 993 struct proc *p; 994 struct trapframe *regs; 995 ucontext_t *ucp; 996 char *xfpustate; 997 size_t xfpustate_len; 998 int cs, eflags, error, ret; 999 ksiginfo_t ksi; 1000 1001 p = td->td_proc; 1002 1003 error = copyin(uap->sigcntxp, &uc, sizeof(uc)); 1004 if (error != 0) 1005 return (error); 1006 ucp = &uc; 1007 if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) { 1008 uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid, 1009 td->td_name, ucp->uc_mcontext.mc_flags); 1010 return (EINVAL); 1011 } 1012 regs = td->td_frame; 1013 eflags = ucp->uc_mcontext.mc_eflags; 1014 if (eflags & PSL_VM) { 1015 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; 1016 struct vm86_kernel *vm86; 1017 1018 /* 1019 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't 1020 * set up the vm86 area, and we can't enter vm86 mode. 
1021 */ 1022 if (td->td_pcb->pcb_ext == 0) 1023 return (EINVAL); 1024 vm86 = &td->td_pcb->pcb_ext->ext_vm86; 1025 if (vm86->vm86_inited == 0) 1026 return (EINVAL); 1027 1028 /* Go back to user mode if both flags are set. */ 1029 if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) { 1030 ksiginfo_init_trap(&ksi); 1031 ksi.ksi_signo = SIGBUS; 1032 ksi.ksi_code = BUS_OBJERR; 1033 ksi.ksi_addr = (void *)regs->tf_eip; 1034 trapsignal(td, &ksi); 1035 } 1036 1037 if (vm86->vm86_has_vme) { 1038 eflags = (tf->tf_eflags & ~VME_USERCHANGE) | 1039 (eflags & VME_USERCHANGE) | PSL_VM; 1040 } else { 1041 vm86->vm86_eflags = eflags; /* save VIF, VIP */ 1042 eflags = (tf->tf_eflags & ~VM_USERCHANGE) | 1043 (eflags & VM_USERCHANGE) | PSL_VM; 1044 } 1045 bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe)); 1046 tf->tf_eflags = eflags; 1047 tf->tf_vm86_ds = tf->tf_ds; 1048 tf->tf_vm86_es = tf->tf_es; 1049 tf->tf_vm86_fs = tf->tf_fs; 1050 tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs; 1051 tf->tf_ds = _udatasel; 1052 tf->tf_es = _udatasel; 1053 tf->tf_fs = _udatasel; 1054 } else { 1055 /* 1056 * Don't allow users to change privileged or reserved flags. 1057 */ 1058 if (!EFL_SECURE(eflags, regs->tf_eflags)) { 1059 uprintf("pid %d (%s): sigreturn eflags = 0x%x\n", 1060 td->td_proc->p_pid, td->td_name, eflags); 1061 return (EINVAL); 1062 } 1063 1064 /* 1065 * Don't allow users to load a valid privileged %cs. Let the 1066 * hardware check for invalid selectors, excess privilege in 1067 * other selectors, invalid %eip's and invalid %esp's. 
1068 */ 1069 cs = ucp->uc_mcontext.mc_cs; 1070 if (!CS_SECURE(cs)) { 1071 uprintf("pid %d (%s): sigreturn cs = 0x%x\n", 1072 td->td_proc->p_pid, td->td_name, cs); 1073 ksiginfo_init_trap(&ksi); 1074 ksi.ksi_signo = SIGBUS; 1075 ksi.ksi_code = BUS_OBJERR; 1076 ksi.ksi_trapno = T_PROTFLT; 1077 ksi.ksi_addr = (void *)regs->tf_eip; 1078 trapsignal(td, &ksi); 1079 return (EINVAL); 1080 } 1081 1082 if ((uc.uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) { 1083 xfpustate_len = uc.uc_mcontext.mc_xfpustate_len; 1084 if (xfpustate_len > cpu_max_ext_state_size - 1085 sizeof(union savefpu)) { 1086 uprintf( 1087 "pid %d (%s): sigreturn xfpusave_len = 0x%zx\n", 1088 p->p_pid, td->td_name, xfpustate_len); 1089 return (EINVAL); 1090 } 1091 xfpustate = __builtin_alloca(xfpustate_len); 1092 error = copyin((const void *)uc.uc_mcontext.mc_xfpustate, 1093 xfpustate, xfpustate_len); 1094 if (error != 0) { 1095 uprintf( 1096 "pid %d (%s): sigreturn copying xfpustate failed\n", 1097 p->p_pid, td->td_name); 1098 return (error); 1099 } 1100 } else { 1101 xfpustate = NULL; 1102 xfpustate_len = 0; 1103 } 1104 ret = set_fpcontext(td, &ucp->uc_mcontext, xfpustate, 1105 xfpustate_len); 1106 if (ret != 0) 1107 return (ret); 1108 bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs)); 1109 } 1110 1111#if defined(COMPAT_43) 1112 if (ucp->uc_mcontext.mc_onstack & 1) 1113 td->td_sigstk.ss_flags |= SS_ONSTACK; 1114 else 1115 td->td_sigstk.ss_flags &= ~SS_ONSTACK; 1116#endif 1117 1118 kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0); 1119 return (EJUSTRETURN); 1120} 1121 1122/* 1123 * Reset registers to default values on exec. 1124 */ 1125void 1126exec_setregs(struct thread *td, struct image_params *imgp, u_long stack) 1127{ 1128 struct trapframe *regs; 1129 struct pcb *pcb; 1130 register_t saved_eflags; 1131 1132 regs = td->td_frame; 1133 pcb = td->td_pcb; 1134 1135 /* Reset pc->pcb_gs and %gs before possibly invalidating it. 
*/ 1136 pcb->pcb_gs = _udatasel; 1137 load_gs(_udatasel); 1138 1139 mtx_lock_spin(&dt_lock); 1140 if (td->td_proc->p_md.md_ldt) 1141 user_ldt_free(td); 1142 else 1143 mtx_unlock_spin(&dt_lock); 1144 1145 /* 1146 * Reset the fs and gs bases. The values from the old address 1147 * space do not make sense for the new program. In particular, 1148 * gsbase might be the TLS base for the old program but the new 1149 * program has no TLS now. 1150 */ 1151 set_fsbase(td, 0); 1152 set_gsbase(td, 0); 1153 1154 saved_eflags = regs->tf_eflags & PSL_T; 1155 bzero((char *)regs, sizeof(struct trapframe)); 1156 regs->tf_eip = imgp->entry_addr; 1157 regs->tf_esp = stack; 1158 regs->tf_eflags = PSL_USER | saved_eflags; 1159 regs->tf_ss = _udatasel; 1160 regs->tf_ds = _udatasel; 1161 regs->tf_es = _udatasel; 1162 regs->tf_fs = _udatasel; 1163 regs->tf_cs = _ucodesel; 1164 1165 /* PS_STRINGS value for BSD/OS binaries. It is 0 for non-BSD/OS. */ 1166 regs->tf_ebx = imgp->ps_strings; 1167 1168 /* 1169 * Reset the hardware debug registers if they were in use. 1170 * They won't have any meaning for the newly exec'd process. 1171 */ 1172 if (pcb->pcb_flags & PCB_DBREGS) { 1173 pcb->pcb_dr0 = 0; 1174 pcb->pcb_dr1 = 0; 1175 pcb->pcb_dr2 = 0; 1176 pcb->pcb_dr3 = 0; 1177 pcb->pcb_dr6 = 0; 1178 pcb->pcb_dr7 = 0; 1179 if (pcb == curpcb) { 1180 /* 1181 * Clear the debug registers on the running 1182 * CPU, otherwise they will end up affecting 1183 * the next process we switch to. 1184 */ 1185 reset_dbregs(); 1186 } 1187 pcb->pcb_flags &= ~PCB_DBREGS; 1188 } 1189 1190 pcb->pcb_initial_npxcw = __INITIAL_NPXCW__; 1191 1192 /* 1193 * Drop the FP state if we hold it, so that the process gets a 1194 * clean FP state if it uses the FPU again. 1195 */ 1196 fpstate_drop(td); 1197 1198 /* 1199 * XXX - Linux emulator 1200 * Make sure sure edx is 0x0 on entry. Linux binaries depend 1201 * on it. 
1202 */ 1203 td->td_retval[1] = 0; 1204} 1205 1206void 1207cpu_setregs(void) 1208{ 1209 unsigned int cr0; 1210 1211 cr0 = rcr0(); 1212 1213 /* 1214 * CR0_MP, CR0_NE and CR0_TS are set for NPX (FPU) support: 1215 * 1216 * Prepare to trap all ESC (i.e., NPX) instructions and all WAIT 1217 * instructions. We must set the CR0_MP bit and use the CR0_TS 1218 * bit to control the trap, because setting the CR0_EM bit does 1219 * not cause WAIT instructions to trap. It's important to trap 1220 * WAIT instructions - otherwise the "wait" variants of no-wait 1221 * control instructions would degenerate to the "no-wait" variants 1222 * after FP context switches but work correctly otherwise. It's 1223 * particularly important to trap WAITs when there is no NPX - 1224 * otherwise the "wait" variants would always degenerate. 1225 * 1226 * Try setting CR0_NE to get correct error reporting on 486DX's. 1227 * Setting it should fail or do nothing on lesser processors. 1228 */ 1229 cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM; 1230 load_cr0(cr0); 1231 load_gs(_udatasel); 1232} 1233 1234u_long bootdev; /* not a struct cdev *- encoding is different */ 1235SYSCTL_ULONG(_machdep, OID_AUTO, guessed_bootdev, 1236 CTLFLAG_RD, &bootdev, 0, "Maybe the Boot device (not in struct cdev *format)"); 1237 1238static char bootmethod[16] = "BIOS"; 1239SYSCTL_STRING(_machdep, OID_AUTO, bootmethod, CTLFLAG_RD, bootmethod, 0, 1240 "System firmware boot method"); 1241 1242/* 1243 * Initialize 386 and configure to run kernel 1244 */ 1245 1246/* 1247 * Initialize segments & interrupt table 1248 */ 1249 1250int _default_ldt; 1251 1252union descriptor gdt[NGDT * MAXCPU]; /* global descriptor table */ 1253union descriptor ldt[NLDT]; /* local descriptor table */ 1254static struct gate_descriptor idt0[NIDT]; 1255struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */ 1256struct region_descriptor r_gdt, r_idt; /* table descriptors */ 1257struct mtx dt_lock; /* lock for GDT and LDT */ 1258 
1259static struct i386tss dblfault_tss; 1260static char dblfault_stack[PAGE_SIZE]; 1261 1262extern vm_offset_t proc0kstack; 1263 1264 1265/* 1266 * software prototypes -- in more palatable form. 1267 * 1268 * GCODE_SEL through GUDATA_SEL must be in this order for syscall/sysret 1269 * GUFS_SEL and GUGS_SEL must be in this order (swtch.s knows it) 1270 */ 1271struct soft_segment_descriptor gdt_segs[] = { 1272/* GNULL_SEL 0 Null Descriptor */ 1273{ .ssd_base = 0x0, 1274 .ssd_limit = 0x0, 1275 .ssd_type = 0, 1276 .ssd_dpl = SEL_KPL, 1277 .ssd_p = 0, 1278 .ssd_xx = 0, .ssd_xx1 = 0, 1279 .ssd_def32 = 0, 1280 .ssd_gran = 0 }, 1281/* GPRIV_SEL 1 SMP Per-Processor Private Data Descriptor */ 1282{ .ssd_base = 0x0, 1283 .ssd_limit = 0xfffff, 1284 .ssd_type = SDT_MEMRWA, 1285 .ssd_dpl = SEL_KPL, 1286 .ssd_p = 1, 1287 .ssd_xx = 0, .ssd_xx1 = 0, 1288 .ssd_def32 = 1, 1289 .ssd_gran = 1 }, 1290/* GUFS_SEL 2 %fs Descriptor for user */ 1291{ .ssd_base = 0x0, 1292 .ssd_limit = 0xfffff, 1293 .ssd_type = SDT_MEMRWA, 1294 .ssd_dpl = SEL_UPL, 1295 .ssd_p = 1, 1296 .ssd_xx = 0, .ssd_xx1 = 0, 1297 .ssd_def32 = 1, 1298 .ssd_gran = 1 }, 1299/* GUGS_SEL 3 %gs Descriptor for user */ 1300{ .ssd_base = 0x0, 1301 .ssd_limit = 0xfffff, 1302 .ssd_type = SDT_MEMRWA, 1303 .ssd_dpl = SEL_UPL, 1304 .ssd_p = 1, 1305 .ssd_xx = 0, .ssd_xx1 = 0, 1306 .ssd_def32 = 1, 1307 .ssd_gran = 1 }, 1308/* GCODE_SEL 4 Code Descriptor for kernel */ 1309{ .ssd_base = 0x0, 1310 .ssd_limit = 0xfffff, 1311 .ssd_type = SDT_MEMERA, 1312 .ssd_dpl = SEL_KPL, 1313 .ssd_p = 1, 1314 .ssd_xx = 0, .ssd_xx1 = 0, 1315 .ssd_def32 = 1, 1316 .ssd_gran = 1 }, 1317/* GDATA_SEL 5 Data Descriptor for kernel */ 1318{ .ssd_base = 0x0, 1319 .ssd_limit = 0xfffff, 1320 .ssd_type = SDT_MEMRWA, 1321 .ssd_dpl = SEL_KPL, 1322 .ssd_p = 1, 1323 .ssd_xx = 0, .ssd_xx1 = 0, 1324 .ssd_def32 = 1, 1325 .ssd_gran = 1 }, 1326/* GUCODE_SEL 6 Code Descriptor for user */ 1327{ .ssd_base = 0x0, 1328 .ssd_limit = 0xfffff, 1329 .ssd_type = SDT_MEMERA, 1330 
.ssd_dpl = SEL_UPL, 1331 .ssd_p = 1, 1332 .ssd_xx = 0, .ssd_xx1 = 0, 1333 .ssd_def32 = 1, 1334 .ssd_gran = 1 }, 1335/* GUDATA_SEL 7 Data Descriptor for user */ 1336{ .ssd_base = 0x0, 1337 .ssd_limit = 0xfffff, 1338 .ssd_type = SDT_MEMRWA, 1339 .ssd_dpl = SEL_UPL, 1340 .ssd_p = 1, 1341 .ssd_xx = 0, .ssd_xx1 = 0, 1342 .ssd_def32 = 1, 1343 .ssd_gran = 1 }, 1344/* GBIOSLOWMEM_SEL 8 BIOS access to realmode segment 0x40, must be #8 in GDT */ 1345{ .ssd_base = 0x400, 1346 .ssd_limit = 0xfffff, 1347 .ssd_type = SDT_MEMRWA, 1348 .ssd_dpl = SEL_KPL, 1349 .ssd_p = 1, 1350 .ssd_xx = 0, .ssd_xx1 = 0, 1351 .ssd_def32 = 1, 1352 .ssd_gran = 1 }, 1353/* GPROC0_SEL 9 Proc 0 Tss Descriptor */ 1354{ 1355 .ssd_base = 0x0, 1356 .ssd_limit = sizeof(struct i386tss)-1, 1357 .ssd_type = SDT_SYS386TSS, 1358 .ssd_dpl = 0, 1359 .ssd_p = 1, 1360 .ssd_xx = 0, .ssd_xx1 = 0, 1361 .ssd_def32 = 0, 1362 .ssd_gran = 0 }, 1363/* GLDT_SEL 10 LDT Descriptor */ 1364{ .ssd_base = (int) ldt, 1365 .ssd_limit = sizeof(ldt)-1, 1366 .ssd_type = SDT_SYSLDT, 1367 .ssd_dpl = SEL_UPL, 1368 .ssd_p = 1, 1369 .ssd_xx = 0, .ssd_xx1 = 0, 1370 .ssd_def32 = 0, 1371 .ssd_gran = 0 }, 1372/* GUSERLDT_SEL 11 User LDT Descriptor per process */ 1373{ .ssd_base = (int) ldt, 1374 .ssd_limit = (512 * sizeof(union descriptor)-1), 1375 .ssd_type = SDT_SYSLDT, 1376 .ssd_dpl = 0, 1377 .ssd_p = 1, 1378 .ssd_xx = 0, .ssd_xx1 = 0, 1379 .ssd_def32 = 0, 1380 .ssd_gran = 0 }, 1381/* GPANIC_SEL 12 Panic Tss Descriptor */ 1382{ .ssd_base = (int) &dblfault_tss, 1383 .ssd_limit = sizeof(struct i386tss)-1, 1384 .ssd_type = SDT_SYS386TSS, 1385 .ssd_dpl = 0, 1386 .ssd_p = 1, 1387 .ssd_xx = 0, .ssd_xx1 = 0, 1388 .ssd_def32 = 0, 1389 .ssd_gran = 0 }, 1390/* GBIOSCODE32_SEL 13 BIOS 32-bit interface (32bit Code) */ 1391{ .ssd_base = 0, 1392 .ssd_limit = 0xfffff, 1393 .ssd_type = SDT_MEMERA, 1394 .ssd_dpl = 0, 1395 .ssd_p = 1, 1396 .ssd_xx = 0, .ssd_xx1 = 0, 1397 .ssd_def32 = 0, 1398 .ssd_gran = 1 }, 1399/* GBIOSCODE16_SEL 14 BIOS 32-bit interface 
(16bit Code) */ 1400{ .ssd_base = 0, 1401 .ssd_limit = 0xfffff, 1402 .ssd_type = SDT_MEMERA, 1403 .ssd_dpl = 0, 1404 .ssd_p = 1, 1405 .ssd_xx = 0, .ssd_xx1 = 0, 1406 .ssd_def32 = 0, 1407 .ssd_gran = 1 }, 1408/* GBIOSDATA_SEL 15 BIOS 32-bit interface (Data) */ 1409{ .ssd_base = 0, 1410 .ssd_limit = 0xfffff, 1411 .ssd_type = SDT_MEMRWA, 1412 .ssd_dpl = 0, 1413 .ssd_p = 1, 1414 .ssd_xx = 0, .ssd_xx1 = 0, 1415 .ssd_def32 = 1, 1416 .ssd_gran = 1 }, 1417/* GBIOSUTIL_SEL 16 BIOS 16-bit interface (Utility) */ 1418{ .ssd_base = 0, 1419 .ssd_limit = 0xfffff, 1420 .ssd_type = SDT_MEMRWA, 1421 .ssd_dpl = 0, 1422 .ssd_p = 1, 1423 .ssd_xx = 0, .ssd_xx1 = 0, 1424 .ssd_def32 = 0, 1425 .ssd_gran = 1 }, 1426/* GBIOSARGS_SEL 17 BIOS 16-bit interface (Arguments) */ 1427{ .ssd_base = 0, 1428 .ssd_limit = 0xfffff, 1429 .ssd_type = SDT_MEMRWA, 1430 .ssd_dpl = 0, 1431 .ssd_p = 1, 1432 .ssd_xx = 0, .ssd_xx1 = 0, 1433 .ssd_def32 = 0, 1434 .ssd_gran = 1 }, 1435/* GNDIS_SEL 18 NDIS Descriptor */ 1436{ .ssd_base = 0x0, 1437 .ssd_limit = 0x0, 1438 .ssd_type = 0, 1439 .ssd_dpl = 0, 1440 .ssd_p = 0, 1441 .ssd_xx = 0, .ssd_xx1 = 0, 1442 .ssd_def32 = 0, 1443 .ssd_gran = 0 }, 1444}; 1445 1446static struct soft_segment_descriptor ldt_segs[] = { 1447 /* Null Descriptor - overwritten by call gate */ 1448{ .ssd_base = 0x0, 1449 .ssd_limit = 0x0, 1450 .ssd_type = 0, 1451 .ssd_dpl = 0, 1452 .ssd_p = 0, 1453 .ssd_xx = 0, .ssd_xx1 = 0, 1454 .ssd_def32 = 0, 1455 .ssd_gran = 0 }, 1456 /* Null Descriptor - overwritten by call gate */ 1457{ .ssd_base = 0x0, 1458 .ssd_limit = 0x0, 1459 .ssd_type = 0, 1460 .ssd_dpl = 0, 1461 .ssd_p = 0, 1462 .ssd_xx = 0, .ssd_xx1 = 0, 1463 .ssd_def32 = 0, 1464 .ssd_gran = 0 }, 1465 /* Null Descriptor - overwritten by call gate */ 1466{ .ssd_base = 0x0, 1467 .ssd_limit = 0x0, 1468 .ssd_type = 0, 1469 .ssd_dpl = 0, 1470 .ssd_p = 0, 1471 .ssd_xx = 0, .ssd_xx1 = 0, 1472 .ssd_def32 = 0, 1473 .ssd_gran = 0 }, 1474 /* Code Descriptor for user */ 1475{ .ssd_base = 0x0, 1476 .ssd_limit = 
0xfffff, 1477 .ssd_type = SDT_MEMERA, 1478 .ssd_dpl = SEL_UPL, 1479 .ssd_p = 1, 1480 .ssd_xx = 0, .ssd_xx1 = 0, 1481 .ssd_def32 = 1, 1482 .ssd_gran = 1 }, 1483 /* Null Descriptor - overwritten by call gate */ 1484{ .ssd_base = 0x0, 1485 .ssd_limit = 0x0, 1486 .ssd_type = 0, 1487 .ssd_dpl = 0, 1488 .ssd_p = 0, 1489 .ssd_xx = 0, .ssd_xx1 = 0, 1490 .ssd_def32 = 0, 1491 .ssd_gran = 0 }, 1492 /* Data Descriptor for user */ 1493{ .ssd_base = 0x0, 1494 .ssd_limit = 0xfffff, 1495 .ssd_type = SDT_MEMRWA, 1496 .ssd_dpl = SEL_UPL, 1497 .ssd_p = 1, 1498 .ssd_xx = 0, .ssd_xx1 = 0, 1499 .ssd_def32 = 1, 1500 .ssd_gran = 1 }, 1501}; 1502 1503void 1504setidt(idx, func, typ, dpl, selec) 1505 int idx; 1506 inthand_t *func; 1507 int typ; 1508 int dpl; 1509 int selec; 1510{ 1511 struct gate_descriptor *ip; 1512 1513 ip = idt + idx; 1514 ip->gd_looffset = (int)func; 1515 ip->gd_selector = selec; 1516 ip->gd_stkcpy = 0; 1517 ip->gd_xx = 0; 1518 ip->gd_type = typ; 1519 ip->gd_dpl = dpl; 1520 ip->gd_p = 1; 1521 ip->gd_hioffset = ((int)func)>>16 ; 1522} 1523 1524extern inthand_t 1525 IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl), 1526 IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm), 1527 IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot), 1528 IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align), 1529 IDTVEC(xmm), 1530#ifdef KDTRACE_HOOKS 1531 IDTVEC(dtrace_ret), 1532#endif 1533#ifdef XENHVM 1534 IDTVEC(xen_intr_upcall), 1535#endif 1536 IDTVEC(lcall_syscall), IDTVEC(int0x80_syscall); 1537 1538#ifdef DDB 1539/* 1540 * Display the index and function name of any IDT entries that don't use 1541 * the default 'rsvd' entry point. 
1542 */ 1543DB_SHOW_COMMAND(idt, db_show_idt) 1544{ 1545 struct gate_descriptor *ip; 1546 int idx; 1547 uintptr_t func; 1548 1549 ip = idt; 1550 for (idx = 0; idx < NIDT && !db_pager_quit; idx++) { 1551 func = (ip->gd_hioffset << 16 | ip->gd_looffset); 1552 if (func != (uintptr_t)&IDTVEC(rsvd)) { 1553 db_printf("%3d\t", idx); 1554 db_printsym(func, DB_STGY_PROC); 1555 db_printf("\n"); 1556 } 1557 ip++; 1558 } 1559} 1560 1561/* Show privileged registers. */ 1562DB_SHOW_COMMAND(sysregs, db_show_sysregs) 1563{ 1564 uint64_t idtr, gdtr; 1565 1566 idtr = ridt(); 1567 db_printf("idtr\t0x%08x/%04x\n", 1568 (u_int)(idtr >> 16), (u_int)idtr & 0xffff); 1569 gdtr = rgdt(); 1570 db_printf("gdtr\t0x%08x/%04x\n", 1571 (u_int)(gdtr >> 16), (u_int)gdtr & 0xffff); 1572 db_printf("ldtr\t0x%04x\n", rldt()); 1573 db_printf("tr\t0x%04x\n", rtr()); 1574 db_printf("cr0\t0x%08x\n", rcr0()); 1575 db_printf("cr2\t0x%08x\n", rcr2()); 1576 db_printf("cr3\t0x%08x\n", rcr3()); 1577 db_printf("cr4\t0x%08x\n", rcr4()); 1578 if (rcr4() & CR4_XSAVE) 1579 db_printf("xcr0\t0x%016llx\n", rxcr(0)); 1580 if (amd_feature & (AMDID_NX | AMDID_LM)) 1581 db_printf("EFER\t0x%016llx\n", rdmsr(MSR_EFER)); 1582 if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX)) 1583 db_printf("FEATURES_CTL\t0x%016llx\n", 1584 rdmsr(MSR_IA32_FEATURE_CONTROL)); 1585 if ((cpu_vendor_id == CPU_VENDOR_INTEL || 1586 cpu_vendor_id == CPU_VENDOR_AMD) && CPUID_TO_FAMILY(cpu_id) >= 6) 1587 db_printf("DEBUG_CTL\t0x%016llx\n", rdmsr(MSR_DEBUGCTLMSR)); 1588 if (cpu_feature & CPUID_PAT) 1589 db_printf("PAT\t0x%016llx\n", rdmsr(MSR_PAT)); 1590} 1591 1592DB_SHOW_COMMAND(dbregs, db_show_dbregs) 1593{ 1594 1595 db_printf("dr0\t0x%08x\n", rdr0()); 1596 db_printf("dr1\t0x%08x\n", rdr1()); 1597 db_printf("dr2\t0x%08x\n", rdr2()); 1598 db_printf("dr3\t0x%08x\n", rdr3()); 1599 db_printf("dr6\t0x%08x\n", rdr6()); 1600 db_printf("dr7\t0x%08x\n", rdr7()); 1601} 1602#endif 1603 1604void 1605sdtossd(sd, ssd) 1606 struct segment_descriptor *sd; 1607 struct 
soft_segment_descriptor *ssd; 1608{ 1609 ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase; 1610 ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit; 1611 ssd->ssd_type = sd->sd_type; 1612 ssd->ssd_dpl = sd->sd_dpl; 1613 ssd->ssd_p = sd->sd_p; 1614 ssd->ssd_def32 = sd->sd_def32; 1615 ssd->ssd_gran = sd->sd_gran; 1616} 1617 1618#if !defined(PC98) 1619static int 1620add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap, 1621 int *physmap_idxp) 1622{ 1623 int i, insert_idx, physmap_idx; 1624 1625 physmap_idx = *physmap_idxp; 1626 1627 if (length == 0) 1628 return (1); 1629 1630#ifndef PAE 1631 if (base > 0xffffffff) { 1632 printf("%uK of memory above 4GB ignored\n", 1633 (u_int)(length / 1024)); 1634 return (1); 1635 } 1636#endif 1637 1638 /* 1639 * Find insertion point while checking for overlap. Start off by 1640 * assuming the new entry will be added to the end. 1641 */ 1642 insert_idx = physmap_idx + 2; 1643 for (i = 0; i <= physmap_idx; i += 2) { 1644 if (base < physmap[i + 1]) { 1645 if (base + length <= physmap[i]) { 1646 insert_idx = i; 1647 break; 1648 } 1649 if (boothowto & RB_VERBOSE) 1650 printf( 1651 "Overlapping memory regions, ignoring second region\n"); 1652 return (1); 1653 } 1654 } 1655 1656 /* See if we can prepend to the next entry. */ 1657 if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) { 1658 physmap[insert_idx] = base; 1659 return (1); 1660 } 1661 1662 /* See if we can append to the previous entry. */ 1663 if (insert_idx > 0 && base == physmap[insert_idx - 1]) { 1664 physmap[insert_idx - 1] += length; 1665 return (1); 1666 } 1667 1668 physmap_idx += 2; 1669 *physmap_idxp = physmap_idx; 1670 if (physmap_idx == PHYSMAP_SIZE) { 1671 printf( 1672 "Too many segments in the physical address map, giving up\n"); 1673 return (0); 1674 } 1675 1676 /* 1677 * Move the last 'N' entries down to make room for the new 1678 * entry if needed. 
1679 */ 1680 for (i = physmap_idx; i > insert_idx; i -= 2) { 1681 physmap[i] = physmap[i - 2]; 1682 physmap[i + 1] = physmap[i - 1]; 1683 } 1684 1685 /* Insert the new entry. */ 1686 physmap[insert_idx] = base; 1687 physmap[insert_idx + 1] = base + length; 1688 return (1); 1689} 1690 1691static int 1692add_smap_entry(struct bios_smap *smap, vm_paddr_t *physmap, int *physmap_idxp) 1693{ 1694 if (boothowto & RB_VERBOSE) 1695 printf("SMAP type=%02x base=%016llx len=%016llx\n", 1696 smap->type, smap->base, smap->length); 1697 1698 if (smap->type != SMAP_TYPE_MEMORY) 1699 return (1); 1700 1701 return (add_physmap_entry(smap->base, smap->length, physmap, 1702 physmap_idxp)); 1703} 1704 1705static void 1706add_smap_entries(struct bios_smap *smapbase, vm_paddr_t *physmap, 1707 int *physmap_idxp) 1708{ 1709 struct bios_smap *smap, *smapend; 1710 u_int32_t smapsize; 1711 /* 1712 * Memory map from INT 15:E820. 1713 * 1714 * subr_module.c says: 1715 * "Consumer may safely assume that size value precedes data." 1716 * ie: an int32_t immediately precedes SMAP. 1717 */ 1718 smapsize = *((u_int32_t *)smapbase - 1); 1719 smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize); 1720 1721 for (smap = smapbase; smap < smapend; smap++) 1722 if (!add_smap_entry(smap, physmap, physmap_idxp)) 1723 break; 1724} 1725#endif /* !PC98 */ 1726 1727static void 1728basemem_setup(void) 1729{ 1730 vm_paddr_t pa; 1731 pt_entry_t *pte; 1732 int i; 1733 1734 if (basemem > 640) { 1735 printf("Preposterous BIOS basemem of %uK, truncating to 640K\n", 1736 basemem); 1737 basemem = 640; 1738 } 1739 1740 /* 1741 * XXX if biosbasemem is now < 640, there is a `hole' 1742 * between the end of base memory and the start of 1743 * ISA memory. The hole may be empty or it may 1744 * contain BIOS code or data. Map it read/write so 1745 * that the BIOS can write to it. (Memory from 0 to 1746 * the physical end of the kernel is mapped read-only 1747 * to begin with and then parts of it are remapped. 
1748 * The parts that aren't remapped form holes that 1749 * remain read-only and are unused by the kernel. 1750 * The base memory area is below the physical end of 1751 * the kernel and right now forms a read-only hole. 1752 * The part of it from PAGE_SIZE to 1753 * (trunc_page(biosbasemem * 1024) - 1) will be 1754 * remapped and used by the kernel later.) 1755 * 1756 * This code is similar to the code used in 1757 * pmap_mapdev, but since no memory needs to be 1758 * allocated we simply change the mapping. 1759 */ 1760 for (pa = trunc_page(basemem * 1024); 1761 pa < ISA_HOLE_START; pa += PAGE_SIZE) 1762 pmap_kenter(KERNBASE + pa, pa); 1763 1764 /* 1765 * Map pages between basemem and ISA_HOLE_START, if any, r/w into 1766 * the vm86 page table so that vm86 can scribble on them using 1767 * the vm86 map too. XXX: why 2 ways for this and only 1 way for 1768 * page 0, at least as initialized here? 1769 */ 1770 pte = (pt_entry_t *)vm86paddr; 1771 for (i = basemem / 4; i < 160; i++) 1772 pte[i] = (i << PAGE_SHIFT) | PG_V | PG_RW | PG_U; 1773} 1774 1775/* 1776 * Populate the (physmap) array with base/bound pairs describing the 1777 * available physical memory in the system, then test this memory and 1778 * build the phys_avail array describing the actually-available memory. 1779 * 1780 * If we cannot accurately determine the physical memory map, then use 1781 * value from the 0xE801 call, and failing that, the RTC. 1782 * 1783 * Total memory size may be set by the kernel environment variable 1784 * hw.physmem or the compile-time define MAXMEM. 1785 * 1786 * XXX first should be vm_paddr_t. 
*/
#ifdef PC98
static void
getmemsize(int first)
{
	int off, physmap_idx, pa_indx, da_indx;
	u_long physmem_tunable, memtest;
	vm_paddr_t physmap[PHYSMAP_SIZE];
	pt_entry_t *pte;
	quad_t dcons_addr, dcons_size;
	int i;
	int pg_n;
	u_int extmem;
	u_int under16;
	vm_paddr_t pa;

	bzero(physmap, sizeof(physmap));

	/* XXX - some of EPSON machines can't use PG_N */
	pg_n = PG_N;
	if (pc98_machine_type & M_EPSON_PC98) {
		switch (epson_machine_id) {
#ifdef WB_CACHE
		default:
#endif
		case EPSON_PC486_HX:
		case EPSON_PC486_HG:
		case EPSON_PC486_HA:
			pg_n = 0;
			break;
		}
	}

	under16 = pc98_getmemsize(&basemem, &extmem);
	basemem_setup();

	/*
	 * Build a two-segment map: base memory starting at 0 and extended
	 * memory starting at 1MB.  basemem and extmem are in units of 1KB.
	 */
	physmap[0] = 0;
	physmap[1] = basemem * 1024;
	physmap_idx = 2;
	physmap[physmap_idx] = 0x100000;
	physmap[physmap_idx + 1] = physmap[physmap_idx] + extmem * 1024;

	/*
	 * Now, physmap contains a map of physical memory.
	 */

#ifdef SMP
	/* make hole for AP bootstrap code */
	physmap[1] = mp_bootaddress(physmap[1]);
#endif

	/*
	 * Maxmem isn't the "maximum memory", it's one larger than the
	 * highest page of the physical address space.  It should be
	 * called something like "Maxphyspage".  We may adjust this
	 * based on ``hw.physmem'' and the results of the memory test.
	 */
	Maxmem = atop(physmap[physmap_idx + 1]);

#ifdef MAXMEM
	Maxmem = MAXMEM / 4;
#endif

	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
		Maxmem = atop(physmem_tunable);

	/*
	 * By default keep the memtest enabled.  Use a general name so that
	 * one could eventually do more with the code than just disable it.
	 */
	memtest = 1;
	TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);

	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
	    (boothowto & RB_VERBOSE))
		printf("Physical memory use set to %ldK\n", Maxmem * 4);

	/*
	 * If Maxmem has been increased beyond what the system has detected,
	 * extend the last memory segment to the new limit.
	 */
	if (atop(physmap[physmap_idx + 1]) < Maxmem)
		physmap[physmap_idx + 1] = ptoa((vm_paddr_t)Maxmem);

	/*
	 * We need to divide chunk if Maxmem is larger than 16MB and
	 * under 16MB area is not full of memory.
	 * (1) system area (15-16MB region) is cut off
	 * (2) extended memory is only over 16MB area (ex. Melco "HYPERMEMORY")
	 */
	if ((under16 != 16 * 1024) && (extmem > 15 * 1024)) {
		/* 15M - 16M region is cut off, so need to divide chunk */
		physmap[physmap_idx + 1] = under16 * 1024;
		physmap_idx += 2;
		physmap[physmap_idx] = 0x1000000;
		physmap[physmap_idx + 1] = physmap[2] + extmem * 1024;
	}

	/* call pmap initialization to make new kernel address space */
	pmap_bootstrap(first);

	/*
	 * Size up each available chunk of physical memory.
	 */
	physmap[0] = PAGE_SIZE;		/* mask off page 0 */
	pa_indx = 0;
	da_indx = 1;
	phys_avail[pa_indx++] = physmap[0];
	phys_avail[pa_indx] = physmap[0];
	dump_avail[da_indx] = physmap[0];
	pte = CMAP3;

	/*
	 * Get dcons buffer address
	 */
	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
	    getenv_quad("dcons.size", &dcons_size) == 0)
		dcons_addr = 0;

	/*
	 * physmap is in bytes, so when converting to page boundaries,
	 * round up the start address and round down the end address.
	 */
	for (i = 0; i <= physmap_idx; i += 2) {
		vm_paddr_t end;

		end = ptoa((vm_paddr_t)Maxmem);
		if (physmap[i + 1] < end)
			end = trunc_page(physmap[i + 1]);
		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
			int tmp, page_bad, full;
			int *ptr = (int *)CADDR3;

			full = FALSE;
			/*
			 * block out kernel memory as not available.
			 */
			if (pa >= KERNLOAD && pa < first)
				goto do_dump_avail;

			/*
			 * block out dcons buffer
			 */
			if (dcons_addr > 0
			    && pa >= trunc_page(dcons_addr)
			    && pa < dcons_addr + dcons_size)
				goto do_dump_avail;

			page_bad = FALSE;
			if (memtest == 0)
				goto skip_memtest;

			/*
			 * map page into kernel: valid, read/write,non-cacheable
			 */
			*pte = pa | PG_V | PG_RW | pg_n;
			invltlb();

			/* Save the first word so the test is non-destructive. */
			tmp = *(int *)ptr;
			/*
			 * Test for alternating 1's and 0's
			 */
			*(volatile int *)ptr = 0xaaaaaaaa;
			if (*(volatile int *)ptr != 0xaaaaaaaa)
				page_bad = TRUE;
			/*
			 * Test for alternating 0's and 1's
			 */
			*(volatile int *)ptr = 0x55555555;
			if (*(volatile int *)ptr != 0x55555555)
				page_bad = TRUE;
			/*
			 * Test for all 1's
			 */
			*(volatile int *)ptr = 0xffffffff;
			if (*(volatile int *)ptr != 0xffffffff)
				page_bad = TRUE;
			/*
			 * Test for all 0's
			 */
			*(volatile int *)ptr = 0x0;
			if (*(volatile int *)ptr != 0x0)
				page_bad = TRUE;
			/*
			 * Restore original value.
			 */
			*(int *)ptr = tmp;

skip_memtest:
			/*
			 * Adjust array of valid/good pages.
			 */
			if (page_bad == TRUE)
				continue;
			/*
			 * If this good page is a continuation of the
			 * previous set of good pages, then just increase
			 * the end pointer. Otherwise start a new chunk.
			 * Note that "end" points one higher than end,
			 * making the range >= start and < end.
			 * If we're also doing a speculative memory
			 * test and we are at or past the end, bump up Maxmem
			 * so that we keep going. The first bad page
			 * will terminate the loop.
			 */
			if (phys_avail[pa_indx] == pa) {
				phys_avail[pa_indx] += PAGE_SIZE;
			} else {
				pa_indx++;
				if (pa_indx == PHYS_AVAIL_ARRAY_END) {
					printf(
		"Too many holes in the physical address space, giving up\n");
					pa_indx--;
					full = TRUE;
					goto do_dump_avail;
				}
				phys_avail[pa_indx++] = pa;	/* start */
				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
			}
			physmem++;
do_dump_avail:
			/* dump_avail also records pages skipped or rejected above. */
			if (dump_avail[da_indx] == pa) {
				dump_avail[da_indx] += PAGE_SIZE;
			} else {
				da_indx++;
				if (da_indx == DUMP_AVAIL_ARRAY_END) {
					da_indx--;
					goto do_next;
				}
				dump_avail[da_indx++] = pa;	/* start */
				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
			}
do_next:
			/* Leaves only the inner loop once phys_avail is full. */
			if (full)
				break;
		}
	}
	/* Tear down the temporary test mapping. */
	*pte = 0;
	invltlb();

	/*
	 * XXX
	 * The last chunk must contain at least one page plus the message
	 * buffer to avoid complicating other code (message buffer address
	 * calculation, etc.).
	 */
	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
	    round_page(msgbufsize) >= phys_avail[pa_indx]) {
		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
		phys_avail[pa_indx--] = 0;
		phys_avail[pa_indx--] = 0;
	}

	Maxmem = atop(phys_avail[pa_indx]);

	/* Trim off space for the message buffer. */
	phys_avail[pa_indx] -= round_page(msgbufsize);

	/* Map the message buffer. */
	for (off = 0; off < round_page(msgbufsize); off += PAGE_SIZE)
		pmap_kenter((vm_offset_t)msgbufp + off, phys_avail[pa_indx] +
		    off);
}
#else /* PC98 */
static void
getmemsize(int first)
{
	int has_smap, off, physmap_idx, pa_indx, da_indx;
	u_long memtest;
	vm_paddr_t physmap[PHYSMAP_SIZE];
	pt_entry_t *pte;
	quad_t dcons_addr, dcons_size, physmem_tunable;
	int hasbrokenint12, i, res;
	u_int extmem;
	struct vm86frame vmf;
	struct vm86context vmc;
	vm_paddr_t pa;
	struct bios_smap *smap, *smapbase;
	caddr_t kmdp;

	has_smap = 0;
#ifdef XBOX
	if (arch_i386_is_xbox) {
		/*
		 * We queried the memory size before, so chop off 4MB for
		 * the framebuffer and inform the OS of this.
		 */
		physmap[0] = 0;
		physmap[1] = (arch_i386_xbox_memsize * 1024 * 1024) - XBOX_FB_SIZE;
		physmap_idx = 0;
		goto physmap_done;
	}
#endif
	bzero(&vmf, sizeof(vmf));
	bzero(physmap, sizeof(physmap));
	basemem = 0;

	/*
	 * Tell the physical memory allocator about pages used to store
	 * the kernel and preloaded data.  See kmem_bootstrap_free().
	 */
	vm_phys_add_seg((vm_paddr_t)KERNLOAD, trunc_page(first));

	/*
	 * Check if the loader supplied an SMAP memory map.  If so,
	 * use that and do not make any VM86 calls.
	 */
	physmap_idx = 0;
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf32 kernel");
	smapbase = (struct bios_smap *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);
	if (smapbase != NULL) {
		add_smap_entries(smapbase, physmap, &physmap_idx);
		has_smap = 1;
		goto have_smap;
	}

	/*
	 * Some newer BIOSes have a broken INT 12H implementation
	 * which causes a kernel panic immediately.  In this case, we
	 * need to use the SMAP to determine the base memory size.
2110 */ 2111 hasbrokenint12 = 0; 2112 TUNABLE_INT_FETCH("hw.hasbrokenint12", &hasbrokenint12); 2113 if (hasbrokenint12 == 0) { 2114 /* Use INT12 to determine base memory size. */ 2115 vm86_intcall(0x12, &vmf); 2116 basemem = vmf.vmf_ax; 2117 basemem_setup(); 2118 } 2119 2120 /* 2121 * Fetch the memory map with INT 15:E820. Map page 1 R/W into 2122 * the kernel page table so we can use it as a buffer. The 2123 * kernel will unmap this page later. 2124 */ 2125 pmap_kenter(KERNBASE + (1 << PAGE_SHIFT), 1 << PAGE_SHIFT); 2126 vmc.npages = 0; 2127 smap = (void *)vm86_addpage(&vmc, 1, KERNBASE + (1 << PAGE_SHIFT)); 2128 res = vm86_getptr(&vmc, (vm_offset_t)smap, &vmf.vmf_es, &vmf.vmf_di); 2129 KASSERT(res != 0, ("vm86_getptr() failed: address not found")); 2130 2131 vmf.vmf_ebx = 0; 2132 do { 2133 vmf.vmf_eax = 0xE820; 2134 vmf.vmf_edx = SMAP_SIG; 2135 vmf.vmf_ecx = sizeof(struct bios_smap); 2136 i = vm86_datacall(0x15, &vmf, &vmc); 2137 if (i || vmf.vmf_eax != SMAP_SIG) 2138 break; 2139 has_smap = 1; 2140 if (!add_smap_entry(smap, physmap, &physmap_idx)) 2141 break; 2142 } while (vmf.vmf_ebx != 0); 2143 2144have_smap: 2145 /* 2146 * If we didn't fetch the "base memory" size from INT12, 2147 * figure it out from the SMAP (or just guess). 2148 */ 2149 if (basemem == 0) { 2150 for (i = 0; i <= physmap_idx; i += 2) { 2151 if (physmap[i] == 0x00000000) { 2152 basemem = physmap[i + 1] / 1024; 2153 break; 2154 } 2155 } 2156 2157 /* XXX: If we couldn't find basemem from SMAP, just guess. */ 2158 if (basemem == 0) 2159 basemem = 640; 2160 basemem_setup(); 2161 } 2162 2163 if (physmap[1] != 0) 2164 goto physmap_done; 2165 2166 /* 2167 * If we failed to find an SMAP, figure out the extended 2168 * memory size. We will then build a simple memory map with 2169 * two segments, one for "base memory" and the second for 2170 * "extended memory". Note that "extended memory" starts at a 2171 * physical address of 1MB and that both basemem and extmem 2172 * are in units of 1KB. 
2173 * 2174 * First, try to fetch the extended memory size via INT 15:E801. 2175 */ 2176 vmf.vmf_ax = 0xE801; 2177 if (vm86_intcall(0x15, &vmf) == 0) { 2178 extmem = vmf.vmf_cx + vmf.vmf_dx * 64; 2179 } else { 2180 /* 2181 * If INT15:E801 fails, this is our last ditch effort 2182 * to determine the extended memory size. Currently 2183 * we prefer the RTC value over INT15:88. 2184 */ 2185#if 0 2186 vmf.vmf_ah = 0x88; 2187 vm86_intcall(0x15, &vmf); 2188 extmem = vmf.vmf_ax; 2189#else 2190 extmem = rtcin(RTC_EXTLO) + (rtcin(RTC_EXTHI) << 8); 2191#endif 2192 } 2193 2194 /* 2195 * Special hack for chipsets that still remap the 384k hole when 2196 * there's 16MB of memory - this really confuses people that 2197 * are trying to use bus mastering ISA controllers with the 2198 * "16MB limit"; they only have 16MB, but the remapping puts 2199 * them beyond the limit. 2200 * 2201 * If extended memory is between 15-16MB (16-17MB phys address range), 2202 * chop it to 15MB. 2203 */ 2204 if ((extmem > 15 * 1024) && (extmem < 16 * 1024)) 2205 extmem = 15 * 1024; 2206 2207 physmap[0] = 0; 2208 physmap[1] = basemem * 1024; 2209 physmap_idx = 2; 2210 physmap[physmap_idx] = 0x100000; 2211 physmap[physmap_idx + 1] = physmap[physmap_idx] + extmem * 1024; 2212 2213physmap_done: 2214 /* 2215 * Now, physmap contains a map of physical memory. 2216 */ 2217 2218#ifdef SMP 2219 /* make hole for AP bootstrap code */ 2220 physmap[1] = mp_bootaddress(physmap[1]); 2221#endif 2222 2223 /* 2224 * Maxmem isn't the "maximum memory", it's one larger than the 2225 * highest page of the physical address space. It should be 2226 * called something like "Maxphyspage". We may adjust this 2227 * based on ``hw.physmem'' and the results of the memory test. 2228 * 2229 * This is especially confusing when it is much larger than the 2230 * memory size and is displayed as "realmem". 
2231 */ 2232 Maxmem = atop(physmap[physmap_idx + 1]); 2233 2234#ifdef MAXMEM 2235 Maxmem = MAXMEM / 4; 2236#endif 2237 2238 if (TUNABLE_QUAD_FETCH("hw.physmem", &physmem_tunable)) 2239 Maxmem = atop(physmem_tunable); 2240 2241 /* 2242 * If we have an SMAP, don't allow MAXMEM or hw.physmem to extend 2243 * the amount of memory in the system. 2244 */ 2245 if (has_smap && Maxmem > atop(physmap[physmap_idx + 1])) 2246 Maxmem = atop(physmap[physmap_idx + 1]); 2247 2248 /* 2249 * By default enable the memory test on real hardware, and disable 2250 * it if we appear to be running in a VM. This avoids touching all 2251 * pages unnecessarily, which doesn't matter on real hardware but is 2252 * bad for shared VM hosts. Use a general name so that 2253 * one could eventually do more with the code than just disable it. 2254 */ 2255 memtest = (vm_guest > VM_GUEST_NO) ? 0 : 1; 2256 TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest); 2257 2258 if (atop(physmap[physmap_idx + 1]) != Maxmem && 2259 (boothowto & RB_VERBOSE)) 2260 printf("Physical memory use set to %ldK\n", Maxmem * 4); 2261 2262 /* 2263 * If Maxmem has been increased beyond what the system has detected, 2264 * extend the last memory segment to the new limit. 2265 */ 2266 if (atop(physmap[physmap_idx + 1]) < Maxmem) 2267 physmap[physmap_idx + 1] = ptoa((vm_paddr_t)Maxmem); 2268 2269 /* call pmap initialization to make new kernel address space */ 2270 pmap_bootstrap(first); 2271 2272 /* 2273 * Size up each available chunk of physical memory. 
2274 */ 2275 physmap[0] = PAGE_SIZE; /* mask off page 0 */ 2276 pa_indx = 0; 2277 da_indx = 1; 2278 phys_avail[pa_indx++] = physmap[0]; 2279 phys_avail[pa_indx] = physmap[0]; 2280 dump_avail[da_indx] = physmap[0]; 2281 pte = CMAP3; 2282 2283 /* 2284 * Get dcons buffer address 2285 */ 2286 if (getenv_quad("dcons.addr", &dcons_addr) == 0 || 2287 getenv_quad("dcons.size", &dcons_size) == 0) 2288 dcons_addr = 0; 2289 2290 /* 2291 * physmap is in bytes, so when converting to page boundaries, 2292 * round up the start address and round down the end address. 2293 */ 2294 for (i = 0; i <= physmap_idx; i += 2) { 2295 vm_paddr_t end; 2296 2297 end = ptoa((vm_paddr_t)Maxmem); 2298 if (physmap[i + 1] < end) 2299 end = trunc_page(physmap[i + 1]); 2300 for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) { 2301 int tmp, page_bad, full; 2302 int *ptr = (int *)CADDR3; 2303 2304 full = FALSE; 2305 /* 2306 * block out kernel memory as not available. 2307 */ 2308 if (pa >= KERNLOAD && pa < first) 2309 goto do_dump_avail; 2310 2311 /* 2312 * block out dcons buffer 2313 */ 2314 if (dcons_addr > 0 2315 && pa >= trunc_page(dcons_addr) 2316 && pa < dcons_addr + dcons_size) 2317 goto do_dump_avail; 2318 2319 page_bad = FALSE; 2320 if (memtest == 0) 2321 goto skip_memtest; 2322 2323 /* 2324 * map page into kernel: valid, read/write,non-cacheable 2325 */ 2326 *pte = pa | PG_V | PG_RW | PG_N; 2327 invltlb(); 2328 2329 tmp = *(int *)ptr; 2330 /* 2331 * Test for alternating 1's and 0's 2332 */ 2333 *(volatile int *)ptr = 0xaaaaaaaa; 2334 if (*(volatile int *)ptr != 0xaaaaaaaa) 2335 page_bad = TRUE; 2336 /* 2337 * Test for alternating 0's and 1's 2338 */ 2339 *(volatile int *)ptr = 0x55555555; 2340 if (*(volatile int *)ptr != 0x55555555) 2341 page_bad = TRUE; 2342 /* 2343 * Test for all 1's 2344 */ 2345 *(volatile int *)ptr = 0xffffffff; 2346 if (*(volatile int *)ptr != 0xffffffff) 2347 page_bad = TRUE; 2348 /* 2349 * Test for all 0's 2350 */ 2351 *(volatile int *)ptr = 0x0; 2352 if 
(*(volatile int *)ptr != 0x0) 2353 page_bad = TRUE; 2354 /* 2355 * Restore original value. 2356 */ 2357 *(int *)ptr = tmp; 2358 2359skip_memtest: 2360 /* 2361 * Adjust array of valid/good pages. 2362 */ 2363 if (page_bad == TRUE) 2364 continue; 2365 /* 2366 * If this good page is a continuation of the 2367 * previous set of good pages, then just increase 2368 * the end pointer. Otherwise start a new chunk. 2369 * Note that "end" points one higher than end, 2370 * making the range >= start and < end. 2371 * If we're also doing a speculative memory 2372 * test and we at or past the end, bump up Maxmem 2373 * so that we keep going. The first bad page 2374 * will terminate the loop. 2375 */ 2376 if (phys_avail[pa_indx] == pa) { 2377 phys_avail[pa_indx] += PAGE_SIZE; 2378 } else { 2379 pa_indx++; 2380 if (pa_indx == PHYS_AVAIL_ARRAY_END) { 2381 printf( 2382 "Too many holes in the physical address space, giving up\n"); 2383 pa_indx--; 2384 full = TRUE; 2385 goto do_dump_avail; 2386 } 2387 phys_avail[pa_indx++] = pa; /* start */ 2388 phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */ 2389 } 2390 physmem++; 2391do_dump_avail: 2392 if (dump_avail[da_indx] == pa) { 2393 dump_avail[da_indx] += PAGE_SIZE; 2394 } else { 2395 da_indx++; 2396 if (da_indx == DUMP_AVAIL_ARRAY_END) { 2397 da_indx--; 2398 goto do_next; 2399 } 2400 dump_avail[da_indx++] = pa; /* start */ 2401 dump_avail[da_indx] = pa + PAGE_SIZE; /* end */ 2402 } 2403do_next: 2404 if (full) 2405 break; 2406 } 2407 } 2408 *pte = 0; 2409 invltlb(); 2410 2411 /* 2412 * XXX 2413 * The last chunk must contain at least one page plus the message 2414 * buffer to avoid complicating other code (message buffer address 2415 * calculation, etc.). 
2416 */ 2417 while (phys_avail[pa_indx - 1] + PAGE_SIZE + 2418 round_page(msgbufsize) >= phys_avail[pa_indx]) { 2419 physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]); 2420 phys_avail[pa_indx--] = 0; 2421 phys_avail[pa_indx--] = 0; 2422 } 2423 2424 Maxmem = atop(phys_avail[pa_indx]); 2425 2426 /* Trim off space for the message buffer. */ 2427 phys_avail[pa_indx] -= round_page(msgbufsize); 2428 2429 /* Map the message buffer. */ 2430 for (off = 0; off < round_page(msgbufsize); off += PAGE_SIZE) 2431 pmap_kenter((vm_offset_t)msgbufp + off, phys_avail[pa_indx] + 2432 off); 2433} 2434#endif /* PC98 */ 2435 2436static void 2437i386_kdb_init(void) 2438{ 2439#ifdef DDB 2440 db_fetch_ksymtab(bootinfo.bi_symtab, bootinfo.bi_esymtab); 2441#endif 2442 kdb_init(); 2443#ifdef KDB 2444 if (boothowto & RB_KDB) 2445 kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger"); 2446#endif 2447} 2448 2449register_t 2450init386(int first) 2451{ 2452 struct gate_descriptor *gdp; 2453 int gsel_tss, metadata_missing, x, pa; 2454 struct pcpu *pc; 2455 struct xstate_hdr *xhdr; 2456 caddr_t kmdp; 2457 int late_console; 2458 2459 thread0.td_kstack = proc0kstack; 2460 thread0.td_kstack_pages = TD0_KSTACK_PAGES; 2461 2462 /* 2463 * This may be done better later if it gets more high level 2464 * components in it. If so just link td->td_proc here. 2465 */ 2466 proc_linkup0(&proc0, &thread0); 2467 2468#ifdef PC98 2469 /* 2470 * Initialize DMAC 2471 */ 2472 pc98_init_dmac(); 2473#endif 2474 2475 metadata_missing = 0; 2476 if (bootinfo.bi_modulep) { 2477 preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE; 2478 preload_bootstrap_relocate(KERNBASE); 2479 } else { 2480 metadata_missing = 1; 2481 } 2482 2483 if (bootinfo.bi_envp != 0) 2484 init_static_kenv((char *)bootinfo.bi_envp + KERNBASE, 0); 2485 else 2486 init_static_kenv(NULL, 0); 2487 2488 identify_hypervisor(); 2489 2490 /* Init basic tunables, hz etc */ 2491 init_param1(); 2492 2493 /* 2494 * Make gdt memory segments. 
All segments cover the full 4GB 2495 * of address space and permissions are enforced at page level. 2496 */ 2497 gdt_segs[GCODE_SEL].ssd_limit = atop(0 - 1); 2498 gdt_segs[GDATA_SEL].ssd_limit = atop(0 - 1); 2499 gdt_segs[GUCODE_SEL].ssd_limit = atop(0 - 1); 2500 gdt_segs[GUDATA_SEL].ssd_limit = atop(0 - 1); 2501 gdt_segs[GUFS_SEL].ssd_limit = atop(0 - 1); 2502 gdt_segs[GUGS_SEL].ssd_limit = atop(0 - 1); 2503 2504 pc = &__pcpu[0]; 2505 gdt_segs[GPRIV_SEL].ssd_limit = atop(0 - 1); 2506 gdt_segs[GPRIV_SEL].ssd_base = (int) pc; 2507 gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss; 2508 2509 for (x = 0; x < NGDT; x++) 2510 ssdtosd(&gdt_segs[x], &gdt[x].sd); 2511 2512 r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1; 2513 r_gdt.rd_base = (int) gdt; 2514 mtx_init(&dt_lock, "descriptor tables", NULL, MTX_SPIN); 2515 lgdt(&r_gdt); 2516 2517 pcpu_init(pc, 0, sizeof(struct pcpu)); 2518 for (pa = first; pa < first + DPCPU_SIZE; pa += PAGE_SIZE) 2519 pmap_kenter(pa + KERNBASE, pa); 2520 dpcpu_init((void *)(first + KERNBASE), 0); 2521 first += DPCPU_SIZE; 2522 PCPU_SET(prvspace, pc); 2523 PCPU_SET(curthread, &thread0); 2524 /* Non-late cninit() and printf() can be moved up to here. */ 2525 2526 /* 2527 * Initialize mutexes. 2528 * 2529 * icu_lock: in order to allow an interrupt to occur in a critical 2530 * section, to set pcpu->ipending (etc...) properly, we 2531 * must be able to get the icu lock, so it can't be 2532 * under witness. 
2533 */ 2534 mutex_init(); 2535 mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS | MTX_NOPROFILE); 2536 2537 /* make ldt memory segments */ 2538 ldt_segs[LUCODE_SEL].ssd_limit = atop(0 - 1); 2539 ldt_segs[LUDATA_SEL].ssd_limit = atop(0 - 1); 2540 for (x = 0; x < nitems(ldt_segs); x++) 2541 ssdtosd(&ldt_segs[x], &ldt[x].sd); 2542 2543 _default_ldt = GSEL(GLDT_SEL, SEL_KPL); 2544 lldt(_default_ldt); 2545 PCPU_SET(currentldt, _default_ldt); 2546 2547 /* exceptions */ 2548 for (x = 0; x < NIDT; x++) 2549 setidt(x, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL, 2550 GSEL(GCODE_SEL, SEL_KPL)); 2551 setidt(IDT_DE, &IDTVEC(div), SDT_SYS386TGT, SEL_KPL, 2552 GSEL(GCODE_SEL, SEL_KPL)); 2553 setidt(IDT_DB, &IDTVEC(dbg), SDT_SYS386IGT, SEL_KPL, 2554 GSEL(GCODE_SEL, SEL_KPL)); 2555 setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYS386IGT, SEL_KPL, 2556 GSEL(GCODE_SEL, SEL_KPL)); 2557 setidt(IDT_BP, &IDTVEC(bpt), SDT_SYS386IGT, SEL_UPL, 2558 GSEL(GCODE_SEL, SEL_KPL)); 2559 setidt(IDT_OF, &IDTVEC(ofl), SDT_SYS386TGT, SEL_UPL, 2560 GSEL(GCODE_SEL, SEL_KPL)); 2561 setidt(IDT_BR, &IDTVEC(bnd), SDT_SYS386TGT, SEL_KPL, 2562 GSEL(GCODE_SEL, SEL_KPL)); 2563 setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, 2564 GSEL(GCODE_SEL, SEL_KPL)); 2565 setidt(IDT_NM, &IDTVEC(dna), SDT_SYS386TGT, SEL_KPL 2566 , GSEL(GCODE_SEL, SEL_KPL)); 2567 setidt(IDT_DF, 0, SDT_SYSTASKGT, SEL_KPL, GSEL(GPANIC_SEL, SEL_KPL)); 2568 setidt(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYS386TGT, SEL_KPL, 2569 GSEL(GCODE_SEL, SEL_KPL)); 2570 setidt(IDT_TS, &IDTVEC(tss), SDT_SYS386TGT, SEL_KPL, 2571 GSEL(GCODE_SEL, SEL_KPL)); 2572 setidt(IDT_NP, &IDTVEC(missing), SDT_SYS386TGT, SEL_KPL, 2573 GSEL(GCODE_SEL, SEL_KPL)); 2574 setidt(IDT_SS, &IDTVEC(stk), SDT_SYS386TGT, SEL_KPL, 2575 GSEL(GCODE_SEL, SEL_KPL)); 2576 setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, 2577 GSEL(GCODE_SEL, SEL_KPL)); 2578 setidt(IDT_PF, &IDTVEC(page), SDT_SYS386IGT, SEL_KPL, 2579 GSEL(GCODE_SEL, SEL_KPL)); 2580 setidt(IDT_MF, &IDTVEC(fpu), SDT_SYS386TGT, 
SEL_KPL, 2581 GSEL(GCODE_SEL, SEL_KPL)); 2582 setidt(IDT_AC, &IDTVEC(align), SDT_SYS386TGT, SEL_KPL, 2583 GSEL(GCODE_SEL, SEL_KPL)); 2584 setidt(IDT_MC, &IDTVEC(mchk), SDT_SYS386TGT, SEL_KPL, 2585 GSEL(GCODE_SEL, SEL_KPL)); 2586 setidt(IDT_XF, &IDTVEC(xmm), SDT_SYS386TGT, SEL_KPL, 2587 GSEL(GCODE_SEL, SEL_KPL)); 2588 setidt(IDT_SYSCALL, &IDTVEC(int0x80_syscall), SDT_SYS386TGT, SEL_UPL, 2589 GSEL(GCODE_SEL, SEL_KPL)); 2590#ifdef KDTRACE_HOOKS 2591 setidt(IDT_DTRACE_RET, &IDTVEC(dtrace_ret), SDT_SYS386TGT, SEL_UPL, 2592 GSEL(GCODE_SEL, SEL_KPL)); 2593#endif 2594#ifdef XENHVM 2595 setidt(IDT_EVTCHN, &IDTVEC(xen_intr_upcall), SDT_SYS386IGT, SEL_KPL, 2596 GSEL(GCODE_SEL, SEL_KPL)); 2597#endif 2598 2599 r_idt.rd_limit = sizeof(idt0) - 1; 2600 r_idt.rd_base = (int) idt; 2601 lidt(&r_idt); 2602 2603#ifdef XBOX 2604 /* 2605 * The following code queries the PCI ID of 0:0:0. For the XBOX, 2606 * This should be 0x10de / 0x02a5. 2607 * 2608 * This is exactly what Linux does. 2609 */ 2610 outl(0xcf8, 0x80000000); 2611 if (inl(0xcfc) == 0x02a510de) { 2612 arch_i386_is_xbox = 1; 2613 pic16l_setled(XBOX_LED_GREEN); 2614 2615 /* 2616 * We are an XBOX, but we may have either 64MB or 128MB of 2617 * memory. The PCI host bridge should be programmed for this, 2618 * so we just query it. 2619 */ 2620 outl(0xcf8, 0x80000084); 2621 arch_i386_xbox_memsize = (inl(0xcfc) == 0x7FFFFFF) ? 128 : 64; 2622 } 2623#endif /* XBOX */ 2624 2625 /* 2626 * Initialize the clock before the console so that console 2627 * initialization can use DELAY(). 
2628 */ 2629 clock_init(); 2630 2631 finishidentcpu(); /* Final stage of CPU initialization */ 2632 setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, 2633 GSEL(GCODE_SEL, SEL_KPL)); 2634 setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, 2635 GSEL(GCODE_SEL, SEL_KPL)); 2636 initializecpu(); /* Initialize CPU registers */ 2637 initializecpucache(); 2638 2639 /* pointer to selector slot for %fs/%gs */ 2640 PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd); 2641 2642 dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 = 2643 dblfault_tss.tss_esp2 = (int)&dblfault_stack[sizeof(dblfault_stack)]; 2644 dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 = 2645 dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL); 2646#if defined(PAE) || defined(PAE_TABLES) 2647 dblfault_tss.tss_cr3 = (int)IdlePDPT; 2648#else 2649 dblfault_tss.tss_cr3 = (int)IdlePTD; 2650#endif 2651 dblfault_tss.tss_eip = (int)dblfault_handler; 2652 dblfault_tss.tss_eflags = PSL_KERNEL; 2653 dblfault_tss.tss_ds = dblfault_tss.tss_es = 2654 dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL); 2655 dblfault_tss.tss_fs = GSEL(GPRIV_SEL, SEL_KPL); 2656 dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL); 2657 dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL); 2658 2659 /* Initialize the tss (except for the final esp0) early for vm86. */ 2660 PCPU_SET(common_tss.tss_esp0, thread0.td_kstack + 2661 thread0.td_kstack_pages * PAGE_SIZE - 16); 2662 PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL)); 2663 gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); 2664 PCPU_SET(tss_gdt, &gdt[GPROC0_SEL].sd); 2665 PCPU_SET(common_tssd, *PCPU_GET(tss_gdt)); 2666 PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16); 2667 ltr(gsel_tss); 2668 2669 /* Initialize the PIC early for vm86 calls. */ 2670#ifdef DEV_ISA 2671#ifdef DEV_ATPIC 2672#ifndef PC98 2673 elcr_probe(); 2674#endif 2675 atpic_startup(); 2676#else 2677 /* Reset and mask the atpics and leave them shut down. 
*/ 2678 atpic_reset(); 2679 2680 /* 2681 * Point the ICU spurious interrupt vectors at the APIC spurious 2682 * interrupt handler. 2683 */ 2684 setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL, 2685 GSEL(GCODE_SEL, SEL_KPL)); 2686 setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL, 2687 GSEL(GCODE_SEL, SEL_KPL)); 2688#endif 2689#endif 2690 2691 /* 2692 * The console and kdb should be initialized even earlier than here, 2693 * but some console drivers don't work until after getmemsize(). 2694 * Default to late console initialization to support these drivers. 2695 * This loses mainly printf()s in getmemsize() and early debugging. 2696 */ 2697 late_console = 1; 2698 TUNABLE_INT_FETCH("debug.late_console", &late_console); 2699 if (!late_console) { 2700 cninit(); 2701 i386_kdb_init(); 2702 } 2703 2704 kmdp = preload_search_by_type("elf kernel"); 2705 link_elf_ireloc(kmdp); 2706 2707 vm86_initialize(); 2708 getmemsize(first); 2709 init_param2(physmem); 2710 2711 /* now running on new page tables, configured,and u/iom is accessible */ 2712 2713 if (late_console) 2714 cninit(); 2715 2716 if (metadata_missing) 2717 printf("WARNING: loader(8) metadata is missing!\n"); 2718 2719 if (late_console) 2720 i386_kdb_init(); 2721 2722 msgbufinit(msgbufp, msgbufsize); 2723 npxinit(true); 2724 /* 2725 * Set up thread0 pcb after npxinit calculated pcb + fpu save 2726 * area size. Zero out the extended state header in fpu save 2727 * area. 2728 */ 2729 thread0.td_pcb = get_pcb_td(&thread0); 2730 thread0.td_pcb->pcb_save = get_pcb_user_save_td(&thread0); 2731 bzero(get_pcb_user_save_td(&thread0), cpu_max_ext_state_size); 2732 if (use_xsave) { 2733 xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) + 2734 1); 2735 xhdr->xstate_bv = xsave_mask; 2736 } 2737 PCPU_SET(curpcb, thread0.td_pcb); 2738 /* Move esp0 in the tss to its final place. 
*/ 2739 /* Note: -16 is so we can grow the trapframe if we came from vm86 */ 2740 PCPU_SET(common_tss.tss_esp0, (vm_offset_t)thread0.td_pcb - 16); 2741 gdt[GPROC0_SEL].sd.sd_type = SDT_SYS386TSS; /* clear busy bit */ 2742 ltr(gsel_tss); 2743 2744 /* make a call gate to reenter kernel with */ 2745 gdp = &ldt[LSYS5CALLS_SEL].gd; 2746 2747 x = (int) &IDTVEC(lcall_syscall); 2748 gdp->gd_looffset = x; 2749 gdp->gd_selector = GSEL(GCODE_SEL,SEL_KPL); 2750 gdp->gd_stkcpy = 1; 2751 gdp->gd_type = SDT_SYS386CGT; 2752 gdp->gd_dpl = SEL_UPL; 2753 gdp->gd_p = 1; 2754 gdp->gd_hioffset = x >> 16; 2755 2756 /* XXX does this work? */ 2757 /* XXX yes! */ 2758 ldt[LBSDICALLS_SEL] = ldt[LSYS5CALLS_SEL]; 2759 ldt[LSOL26CALLS_SEL] = ldt[LSYS5CALLS_SEL]; 2760 2761 /* transfer to user mode */ 2762 2763 _ucodesel = GSEL(GUCODE_SEL, SEL_UPL); 2764 _udatasel = GSEL(GUDATA_SEL, SEL_UPL); 2765 2766 /* setup proc 0's pcb */ 2767 thread0.td_pcb->pcb_flags = 0; 2768#if defined(PAE) || defined(PAE_TABLES) 2769 thread0.td_pcb->pcb_cr3 = (int)IdlePDPT; 2770#else 2771 thread0.td_pcb->pcb_cr3 = (int)IdlePTD; 2772#endif 2773 thread0.td_pcb->pcb_ext = 0; 2774 thread0.td_frame = &proc0_tf; 2775 2776 cpu_probe_amdc1e(); 2777 2778#ifdef FDT 2779 x86_init_fdt(); 2780#endif 2781 2782 /* Location of kernel stack for locore */ 2783 return ((register_t)thread0.td_pcb); 2784} 2785 2786void 2787cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size) 2788{ 2789 2790 pcpu->pc_acpi_id = 0xffffffff; 2791} 2792 2793#ifndef PC98 2794static int 2795smap_sysctl_handler(SYSCTL_HANDLER_ARGS) 2796{ 2797 struct bios_smap *smapbase; 2798 struct bios_smap_xattr smap; 2799 caddr_t kmdp; 2800 uint32_t *smapattr; 2801 int count, error, i; 2802 2803 /* Retrieve the system memory map from the loader. 
*/ 2804 kmdp = preload_search_by_type("elf kernel"); 2805 if (kmdp == NULL) 2806 kmdp = preload_search_by_type("elf32 kernel"); 2807 smapbase = (struct bios_smap *)preload_search_info(kmdp, 2808 MODINFO_METADATA | MODINFOMD_SMAP); 2809 if (smapbase == NULL) 2810 return (0); 2811 smapattr = (uint32_t *)preload_search_info(kmdp, 2812 MODINFO_METADATA | MODINFOMD_SMAP_XATTR); 2813 count = *((u_int32_t *)smapbase - 1) / sizeof(*smapbase); 2814 error = 0; 2815 for (i = 0; i < count; i++) { 2816 smap.base = smapbase[i].base; 2817 smap.length = smapbase[i].length; 2818 smap.type = smapbase[i].type; 2819 if (smapattr != NULL) 2820 smap.xattr = smapattr[i]; 2821 else 2822 smap.xattr = 0; 2823 error = SYSCTL_OUT(req, &smap, sizeof(smap)); 2824 } 2825 return (error); 2826} 2827SYSCTL_PROC(_machdep, OID_AUTO, smap, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0, 2828 smap_sysctl_handler, "S,bios_smap_xattr", "Raw BIOS SMAP data"); 2829#endif /* !PC98 */ 2830 2831void 2832spinlock_enter(void) 2833{ 2834 struct thread *td; 2835 register_t flags; 2836 2837 td = curthread; 2838 if (td->td_md.md_spinlock_count == 0) { 2839 flags = intr_disable(); 2840 td->td_md.md_spinlock_count = 1; 2841 td->td_md.md_saved_flags = flags; 2842 } else 2843 td->td_md.md_spinlock_count++; 2844 critical_enter(); 2845} 2846 2847void 2848spinlock_exit(void) 2849{ 2850 struct thread *td; 2851 register_t flags; 2852 2853 td = curthread; 2854 critical_exit(); 2855 flags = td->td_md.md_saved_flags; 2856 td->td_md.md_spinlock_count--; 2857 if (td->td_md.md_spinlock_count == 0) 2858 intr_restore(flags); 2859} 2860 2861#if defined(I586_CPU) && !defined(NO_F00F_HACK) 2862static void f00f_hack(void *unused); 2863SYSINIT(f00f_hack, SI_SUB_INTRINSIC, SI_ORDER_FIRST, f00f_hack, NULL); 2864 2865static void 2866f00f_hack(void *unused) 2867{ 2868 struct gate_descriptor *new_idt; 2869 vm_offset_t tmp; 2870 2871 if (!has_f00f_bug) 2872 return; 2873 2874 GIANT_REQUIRED; 2875 2876 printf("Intel Pentium detected, installing workaround 
for F00F bug\n"); 2877 2878 tmp = kmem_malloc(kernel_arena, PAGE_SIZE * 2, M_WAITOK | M_ZERO); 2879 if (tmp == 0) 2880 panic("kmem_malloc returned 0"); 2881 2882 /* Put the problematic entry (#6) at the end of the lower page. */ 2883 new_idt = (struct gate_descriptor*) 2884 (tmp + PAGE_SIZE - 7 * sizeof(struct gate_descriptor)); 2885 bcopy(idt, new_idt, sizeof(idt0)); 2886 r_idt.rd_base = (u_int)new_idt; 2887 lidt(&r_idt); 2888 idt = new_idt; 2889 pmap_protect(kernel_pmap, tmp, tmp + PAGE_SIZE, VM_PROT_READ); 2890} 2891#endif /* defined(I586_CPU) && !NO_F00F_HACK */ 2892 2893/* 2894 * Construct a PCB from a trapframe. This is called from kdb_trap() where 2895 * we want to start a backtrace from the function that caused us to enter 2896 * the debugger. We have the context in the trapframe, but base the trace 2897 * on the PCB. The PCB doesn't have to be perfect, as long as it contains 2898 * enough for a backtrace. 2899 */ 2900void 2901makectx(struct trapframe *tf, struct pcb *pcb) 2902{ 2903 2904 pcb->pcb_edi = tf->tf_edi; 2905 pcb->pcb_esi = tf->tf_esi; 2906 pcb->pcb_ebp = tf->tf_ebp; 2907 pcb->pcb_ebx = tf->tf_ebx; 2908 pcb->pcb_eip = tf->tf_eip; 2909 pcb->pcb_esp = (ISPL(tf->tf_cs)) ? 
tf->tf_esp : (int)(tf + 1) - 8; 2910 pcb->pcb_gs = rgs(); 2911} 2912 2913int 2914ptrace_set_pc(struct thread *td, u_long addr) 2915{ 2916 2917 td->td_frame->tf_eip = addr; 2918 return (0); 2919} 2920 2921int 2922ptrace_single_step(struct thread *td) 2923{ 2924 2925 PROC_LOCK_ASSERT(td->td_proc, MA_OWNED); 2926 if ((td->td_frame->tf_eflags & PSL_T) == 0) { 2927 td->td_frame->tf_eflags |= PSL_T; 2928 td->td_dbgflags |= TDB_STEP; 2929 } 2930 return (0); 2931} 2932 2933int 2934ptrace_clear_single_step(struct thread *td) 2935{ 2936 PROC_LOCK_ASSERT(td->td_proc, MA_OWNED); 2937 td->td_frame->tf_eflags &= ~PSL_T; 2938 td->td_dbgflags &= ~TDB_STEP; 2939 return (0); 2940} 2941 2942int 2943fill_regs(struct thread *td, struct reg *regs) 2944{ 2945 struct pcb *pcb; 2946 struct trapframe *tp; 2947 2948 tp = td->td_frame; 2949 pcb = td->td_pcb; 2950 regs->r_gs = pcb->pcb_gs; 2951 return (fill_frame_regs(tp, regs)); 2952} 2953 2954int 2955fill_frame_regs(struct trapframe *tp, struct reg *regs) 2956{ 2957 regs->r_fs = tp->tf_fs; 2958 regs->r_es = tp->tf_es; 2959 regs->r_ds = tp->tf_ds; 2960 regs->r_edi = tp->tf_edi; 2961 regs->r_esi = tp->tf_esi; 2962 regs->r_ebp = tp->tf_ebp; 2963 regs->r_ebx = tp->tf_ebx; 2964 regs->r_edx = tp->tf_edx; 2965 regs->r_ecx = tp->tf_ecx; 2966 regs->r_eax = tp->tf_eax; 2967 regs->r_eip = tp->tf_eip; 2968 regs->r_cs = tp->tf_cs; 2969 regs->r_eflags = tp->tf_eflags; 2970 regs->r_esp = tp->tf_esp; 2971 regs->r_ss = tp->tf_ss; 2972 return (0); 2973} 2974 2975int 2976set_regs(struct thread *td, struct reg *regs) 2977{ 2978 struct pcb *pcb; 2979 struct trapframe *tp; 2980 2981 tp = td->td_frame; 2982 if (!EFL_SECURE(regs->r_eflags, tp->tf_eflags) || 2983 !CS_SECURE(regs->r_cs)) 2984 return (EINVAL); 2985 pcb = td->td_pcb; 2986 tp->tf_fs = regs->r_fs; 2987 tp->tf_es = regs->r_es; 2988 tp->tf_ds = regs->r_ds; 2989 tp->tf_edi = regs->r_edi; 2990 tp->tf_esi = regs->r_esi; 2991 tp->tf_ebp = regs->r_ebp; 2992 tp->tf_ebx = regs->r_ebx; 2993 tp->tf_edx = 
regs->r_edx; 2994 tp->tf_ecx = regs->r_ecx; 2995 tp->tf_eax = regs->r_eax; 2996 tp->tf_eip = regs->r_eip; 2997 tp->tf_cs = regs->r_cs; 2998 tp->tf_eflags = regs->r_eflags; 2999 tp->tf_esp = regs->r_esp; 3000 tp->tf_ss = regs->r_ss; 3001 pcb->pcb_gs = regs->r_gs; 3002 return (0); 3003} 3004 3005int 3006fill_fpregs(struct thread *td, struct fpreg *fpregs) 3007{ 3008 3009 KASSERT(td == curthread || TD_IS_SUSPENDED(td) || 3010 P_SHOULDSTOP(td->td_proc), 3011 ("not suspended thread %p", td)); 3012 npxgetregs(td); 3013 if (cpu_fxsr) 3014 npx_fill_fpregs_xmm(&get_pcb_user_save_td(td)->sv_xmm, 3015 (struct save87 *)fpregs); 3016 else 3017 bcopy(&get_pcb_user_save_td(td)->sv_87, fpregs, 3018 sizeof(*fpregs)); 3019 return (0); 3020} 3021 3022int 3023set_fpregs(struct thread *td, struct fpreg *fpregs) 3024{ 3025 3026 critical_enter(); 3027 if (cpu_fxsr) 3028 npx_set_fpregs_xmm((struct save87 *)fpregs, 3029 &get_pcb_user_save_td(td)->sv_xmm); 3030 else 3031 bcopy(fpregs, &get_pcb_user_save_td(td)->sv_87, 3032 sizeof(*fpregs)); 3033 npxuserinited(td); 3034 critical_exit(); 3035 return (0); 3036} 3037 3038/* 3039 * Get machine context. 
3040 */ 3041int 3042get_mcontext(struct thread *td, mcontext_t *mcp, int flags) 3043{ 3044 struct trapframe *tp; 3045 struct segment_descriptor *sdp; 3046 3047 tp = td->td_frame; 3048 3049 PROC_LOCK(curthread->td_proc); 3050 mcp->mc_onstack = sigonstack(tp->tf_esp); 3051 PROC_UNLOCK(curthread->td_proc); 3052 mcp->mc_gs = td->td_pcb->pcb_gs; 3053 mcp->mc_fs = tp->tf_fs; 3054 mcp->mc_es = tp->tf_es; 3055 mcp->mc_ds = tp->tf_ds; 3056 mcp->mc_edi = tp->tf_edi; 3057 mcp->mc_esi = tp->tf_esi; 3058 mcp->mc_ebp = tp->tf_ebp; 3059 mcp->mc_isp = tp->tf_isp; 3060 mcp->mc_eflags = tp->tf_eflags; 3061 if (flags & GET_MC_CLEAR_RET) { 3062 mcp->mc_eax = 0; 3063 mcp->mc_edx = 0; 3064 mcp->mc_eflags &= ~PSL_C; 3065 } else { 3066 mcp->mc_eax = tp->tf_eax; 3067 mcp->mc_edx = tp->tf_edx; 3068 } 3069 mcp->mc_ebx = tp->tf_ebx; 3070 mcp->mc_ecx = tp->tf_ecx; 3071 mcp->mc_eip = tp->tf_eip; 3072 mcp->mc_cs = tp->tf_cs; 3073 mcp->mc_esp = tp->tf_esp; 3074 mcp->mc_ss = tp->tf_ss; 3075 mcp->mc_len = sizeof(*mcp); 3076 get_fpcontext(td, mcp, NULL, 0); 3077 sdp = &td->td_pcb->pcb_fsd; 3078 mcp->mc_fsbase = sdp->sd_hibase << 24 | sdp->sd_lobase; 3079 sdp = &td->td_pcb->pcb_gsd; 3080 mcp->mc_gsbase = sdp->sd_hibase << 24 | sdp->sd_lobase; 3081 mcp->mc_flags = 0; 3082 mcp->mc_xfpustate = 0; 3083 mcp->mc_xfpustate_len = 0; 3084 bzero(mcp->mc_spare2, sizeof(mcp->mc_spare2)); 3085 return (0); 3086} 3087 3088/* 3089 * Set machine context. 3090 * 3091 * However, we don't set any but the user modifiable flags, and we won't 3092 * touch the cs selector. 
3093 */ 3094int 3095set_mcontext(struct thread *td, mcontext_t *mcp) 3096{ 3097 struct trapframe *tp; 3098 char *xfpustate; 3099 int eflags, ret; 3100 3101 tp = td->td_frame; 3102 if (mcp->mc_len != sizeof(*mcp) || 3103 (mcp->mc_flags & ~_MC_FLAG_MASK) != 0) 3104 return (EINVAL); 3105 eflags = (mcp->mc_eflags & PSL_USERCHANGE) | 3106 (tp->tf_eflags & ~PSL_USERCHANGE); 3107 if (mcp->mc_flags & _MC_HASFPXSTATE) { 3108 if (mcp->mc_xfpustate_len > cpu_max_ext_state_size - 3109 sizeof(union savefpu)) 3110 return (EINVAL); 3111 xfpustate = __builtin_alloca(mcp->mc_xfpustate_len); 3112 ret = copyin((void *)mcp->mc_xfpustate, xfpustate, 3113 mcp->mc_xfpustate_len); 3114 if (ret != 0) 3115 return (ret); 3116 } else 3117 xfpustate = NULL; 3118 ret = set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len); 3119 if (ret != 0) 3120 return (ret); 3121 tp->tf_fs = mcp->mc_fs; 3122 tp->tf_es = mcp->mc_es; 3123 tp->tf_ds = mcp->mc_ds; 3124 tp->tf_edi = mcp->mc_edi; 3125 tp->tf_esi = mcp->mc_esi; 3126 tp->tf_ebp = mcp->mc_ebp; 3127 tp->tf_ebx = mcp->mc_ebx; 3128 tp->tf_edx = mcp->mc_edx; 3129 tp->tf_ecx = mcp->mc_ecx; 3130 tp->tf_eax = mcp->mc_eax; 3131 tp->tf_eip = mcp->mc_eip; 3132 tp->tf_eflags = eflags; 3133 tp->tf_esp = mcp->mc_esp; 3134 tp->tf_ss = mcp->mc_ss; 3135 td->td_pcb->pcb_gs = mcp->mc_gs; 3136 return (0); 3137} 3138 3139static void 3140get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave, 3141 size_t xfpusave_len) 3142{ 3143 size_t max_len, len; 3144 3145 mcp->mc_ownedfp = npxgetregs(td); 3146 bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate[0], 3147 sizeof(mcp->mc_fpstate)); 3148 mcp->mc_fpformat = npxformat(); 3149 if (!use_xsave || xfpusave_len == 0) 3150 return; 3151 max_len = cpu_max_ext_state_size - sizeof(union savefpu); 3152 len = xfpusave_len; 3153 if (len > max_len) { 3154 len = max_len; 3155 bzero(xfpusave + max_len, len - max_len); 3156 } 3157 mcp->mc_flags |= _MC_HASFPXSTATE; 3158 mcp->mc_xfpustate_len = len; 3159 
bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len); 3160} 3161 3162static int 3163set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate, 3164 size_t xfpustate_len) 3165{ 3166 int error; 3167 3168 if (mcp->mc_fpformat == _MC_FPFMT_NODEV) 3169 return (0); 3170 else if (mcp->mc_fpformat != _MC_FPFMT_387 && 3171 mcp->mc_fpformat != _MC_FPFMT_XMM) 3172 return (EINVAL); 3173 else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) { 3174 /* We don't care what state is left in the FPU or PCB. */ 3175 fpstate_drop(td); 3176 error = 0; 3177 } else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU || 3178 mcp->mc_ownedfp == _MC_FPOWNED_PCB) { 3179 error = npxsetregs(td, (union savefpu *)&mcp->mc_fpstate, 3180 xfpustate, xfpustate_len); 3181 } else 3182 return (EINVAL); 3183 return (error); 3184} 3185 3186static void 3187fpstate_drop(struct thread *td) 3188{ 3189 3190 KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu")); 3191 critical_enter(); 3192 if (PCPU_GET(fpcurthread) == td) 3193 npxdrop(); 3194 /* 3195 * XXX force a full drop of the npx. The above only drops it if we 3196 * owned it. npxgetregs() has the same bug in the !cpu_fxsr case. 3197 * 3198 * XXX I don't much like npxgetregs()'s semantics of doing a full 3199 * drop. Dropping only to the pcb matches fnsave's behaviour. 3200 * We only need to drop to !PCB_INITDONE in sendsig(). But 3201 * sendsig() is the only caller of npxgetregs()... perhaps we just 3202 * have too many layers. 
3203 */ 3204 curthread->td_pcb->pcb_flags &= ~(PCB_NPXINITDONE | 3205 PCB_NPXUSERINITDONE); 3206 critical_exit(); 3207} 3208 3209int 3210fill_dbregs(struct thread *td, struct dbreg *dbregs) 3211{ 3212 struct pcb *pcb; 3213 3214 if (td == NULL) { 3215 dbregs->dr[0] = rdr0(); 3216 dbregs->dr[1] = rdr1(); 3217 dbregs->dr[2] = rdr2(); 3218 dbregs->dr[3] = rdr3(); 3219 dbregs->dr[4] = rdr4(); 3220 dbregs->dr[5] = rdr5(); 3221 dbregs->dr[6] = rdr6(); 3222 dbregs->dr[7] = rdr7(); 3223 } else { 3224 pcb = td->td_pcb; 3225 dbregs->dr[0] = pcb->pcb_dr0; 3226 dbregs->dr[1] = pcb->pcb_dr1; 3227 dbregs->dr[2] = pcb->pcb_dr2; 3228 dbregs->dr[3] = pcb->pcb_dr3; 3229 dbregs->dr[4] = 0; 3230 dbregs->dr[5] = 0; 3231 dbregs->dr[6] = pcb->pcb_dr6; 3232 dbregs->dr[7] = pcb->pcb_dr7; 3233 } 3234 return (0); 3235} 3236 3237int 3238set_dbregs(struct thread *td, struct dbreg *dbregs) 3239{ 3240 struct pcb *pcb; 3241 int i; 3242 3243 if (td == NULL) { 3244 load_dr0(dbregs->dr[0]); 3245 load_dr1(dbregs->dr[1]); 3246 load_dr2(dbregs->dr[2]); 3247 load_dr3(dbregs->dr[3]); 3248 load_dr4(dbregs->dr[4]); 3249 load_dr5(dbregs->dr[5]); 3250 load_dr6(dbregs->dr[6]); 3251 load_dr7(dbregs->dr[7]); 3252 } else { 3253 /* 3254 * Don't let an illegal value for dr7 get set. Specifically, 3255 * check for undefined settings. Setting these bit patterns 3256 * result in undefined behaviour and can lead to an unexpected 3257 * TRCTRAP. 3258 */ 3259 for (i = 0; i < 4; i++) { 3260 if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02) 3261 return (EINVAL); 3262 if (DBREG_DR7_LEN(dbregs->dr[7], i) == 0x02) 3263 return (EINVAL); 3264 } 3265 3266 pcb = td->td_pcb; 3267 3268 /* 3269 * Don't let a process set a breakpoint that is not within the 3270 * process's address space. If a process could do this, it 3271 * could halt the system by setting a breakpoint in the kernel 3272 * (if ddb was enabled). 
Thus, we need to check to make sure 3273 * that no breakpoints are being enabled for addresses outside 3274 * process's address space. 3275 * 3276 * XXX - what about when the watched area of the user's 3277 * address space is written into from within the kernel 3278 * ... wouldn't that still cause a breakpoint to be generated 3279 * from within kernel mode? 3280 */ 3281 3282 if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) { 3283 /* dr0 is enabled */ 3284 if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS) 3285 return (EINVAL); 3286 } 3287 3288 if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) { 3289 /* dr1 is enabled */ 3290 if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS) 3291 return (EINVAL); 3292 } 3293 3294 if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) { 3295 /* dr2 is enabled */ 3296 if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS) 3297 return (EINVAL); 3298 } 3299 3300 if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) { 3301 /* dr3 is enabled */ 3302 if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS) 3303 return (EINVAL); 3304 } 3305 3306 pcb->pcb_dr0 = dbregs->dr[0]; 3307 pcb->pcb_dr1 = dbregs->dr[1]; 3308 pcb->pcb_dr2 = dbregs->dr[2]; 3309 pcb->pcb_dr3 = dbregs->dr[3]; 3310 pcb->pcb_dr6 = dbregs->dr[6]; 3311 pcb->pcb_dr7 = dbregs->dr[7]; 3312 3313 pcb->pcb_flags |= PCB_DBREGS; 3314 } 3315 3316 return (0); 3317} 3318 3319/* 3320 * Return > 0 if a hardware breakpoint has been hit, and the 3321 * breakpoint was in user space. Return 0, otherwise. 
3322 */ 3323int 3324user_dbreg_trap(register_t dr6) 3325{ 3326 u_int32_t dr7; 3327 u_int32_t bp; /* breakpoint bits extracted from dr6 */ 3328 int nbp; /* number of breakpoints that triggered */ 3329 caddr_t addr[4]; /* breakpoint addresses */ 3330 int i; 3331 3332 bp = dr6 & DBREG_DR6_BMASK; 3333 if (bp == 0) { 3334 /* 3335 * None of the breakpoint bits are set meaning this 3336 * trap was not caused by any of the debug registers 3337 */ 3338 return 0; 3339 } 3340 3341 dr7 = rdr7(); 3342 if ((dr7 & 0x000000ff) == 0) { 3343 /* 3344 * all GE and LE bits in the dr7 register are zero, 3345 * thus the trap couldn't have been caused by the 3346 * hardware debug registers 3347 */ 3348 return 0; 3349 } 3350 3351 nbp = 0; 3352 3353 /* 3354 * at least one of the breakpoints were hit, check to see 3355 * which ones and if any of them are user space addresses 3356 */ 3357 3358 if (bp & 0x01) { 3359 addr[nbp++] = (caddr_t)rdr0(); 3360 } 3361 if (bp & 0x02) { 3362 addr[nbp++] = (caddr_t)rdr1(); 3363 } 3364 if (bp & 0x04) { 3365 addr[nbp++] = (caddr_t)rdr2(); 3366 } 3367 if (bp & 0x08) { 3368 addr[nbp++] = (caddr_t)rdr3(); 3369 } 3370 3371 for (i = 0; i < nbp; i++) { 3372 if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) { 3373 /* 3374 * addr[i] is in user space 3375 */ 3376 return nbp; 3377 } 3378 } 3379 3380 /* 3381 * None of the breakpoints are in user space. 3382 */ 3383 return 0; 3384} 3385 3386#ifdef KDB 3387 3388/* 3389 * Provide inb() and outb() as functions. They are normally only available as 3390 * inline functions, thus cannot be called from the debugger. 3391 */ 3392 3393/* silence compiler warnings */ 3394u_char inb_(u_short); 3395void outb_(u_short, u_char); 3396 3397u_char 3398inb_(u_short port) 3399{ 3400 return inb(port); 3401} 3402 3403void 3404outb_(u_short port, u_char data) 3405{ 3406 outb(port, data); 3407} 3408 3409#endif /* KDB */ 3410