machdep.c revision 324855
1/*- 2 * Copyright (c) 1992 Terrence R. Lambert. 3 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. 4 * All rights reserved. 5 * 6 * This code is derived from software contributed to Berkeley by 7 * William Jolitz. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 3. All advertising materials mentioning features or use of this software 18 * must display the following acknowledgement: 19 * This product includes software developed by the University of 20 * California, Berkeley and its contributors. 21 * 4. Neither the name of the University nor the names of its contributors 22 * may be used to endorse or promote products derived from this software 23 * without specific prior written permission. 24 * 25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 28 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 35 * SUCH DAMAGE. 36 * 37 * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 38 */ 39 40#include <sys/cdefs.h> 41__FBSDID("$FreeBSD: stable/11/sys/i386/i386/machdep.c 324855 2017-10-22 08:47:13Z kib $"); 42 43#include "opt_apic.h" 44#include "opt_atpic.h" 45#include "opt_compat.h" 46#include "opt_cpu.h" 47#include "opt_ddb.h" 48#include "opt_inet.h" 49#include "opt_isa.h" 50#include "opt_kstack_pages.h" 51#include "opt_maxmem.h" 52#include "opt_mp_watchdog.h" 53#include "opt_perfmon.h" 54#include "opt_platform.h" 55#include "opt_xbox.h" 56 57#include <sys/param.h> 58#include <sys/proc.h> 59#include <sys/systm.h> 60#include <sys/bio.h> 61#include <sys/buf.h> 62#include <sys/bus.h> 63#include <sys/callout.h> 64#include <sys/cons.h> 65#include <sys/cpu.h> 66#include <sys/eventhandler.h> 67#include <sys/exec.h> 68#include <sys/imgact.h> 69#include <sys/kdb.h> 70#include <sys/kernel.h> 71#include <sys/ktr.h> 72#include <sys/linker.h> 73#include <sys/lock.h> 74#include <sys/malloc.h> 75#include <sys/memrange.h> 76#include <sys/msgbuf.h> 77#include <sys/mutex.h> 78#include <sys/pcpu.h> 79#include <sys/ptrace.h> 80#include <sys/reboot.h> 81#include <sys/rwlock.h> 82#include <sys/sched.h> 83#include <sys/signalvar.h> 84#ifdef SMP 85#include <sys/smp.h> 86#endif 87#include <sys/syscallsubr.h> 88#include <sys/sysctl.h> 89#include <sys/sysent.h> 90#include <sys/sysproto.h> 91#include <sys/ucontext.h> 92#include <sys/vmmeter.h> 93 94#include <vm/vm.h> 95#include 
<vm/vm_extern.h> 96#include <vm/vm_kern.h> 97#include <vm/vm_page.h> 98#include <vm/vm_map.h> 99#include <vm/vm_object.h> 100#include <vm/vm_pager.h> 101#include <vm/vm_param.h> 102 103#ifdef DDB 104#ifndef KDB 105#error KDB must be enabled in order for DDB to work! 106#endif 107#include <ddb/ddb.h> 108#include <ddb/db_sym.h> 109#endif 110 111#ifdef PC98 112#include <pc98/pc98/pc98_machdep.h> 113#else 114#include <isa/rtc.h> 115#endif 116 117#include <net/netisr.h> 118 119#include <machine/bootinfo.h> 120#include <machine/clock.h> 121#include <machine/cpu.h> 122#include <machine/cputypes.h> 123#include <machine/intr_machdep.h> 124#include <x86/mca.h> 125#include <machine/md_var.h> 126#include <machine/metadata.h> 127#include <machine/mp_watchdog.h> 128#include <machine/pc/bios.h> 129#include <machine/pcb.h> 130#include <machine/pcb_ext.h> 131#include <machine/proc.h> 132#include <machine/reg.h> 133#include <machine/sigframe.h> 134#include <machine/specialreg.h> 135#include <machine/vm86.h> 136#include <x86/init.h> 137#ifdef PERFMON 138#include <machine/perfmon.h> 139#endif 140#ifdef SMP 141#include <machine/smp.h> 142#endif 143#ifdef FDT 144#include <x86/fdt.h> 145#endif 146 147#ifdef DEV_APIC 148#include <x86/apicvar.h> 149#endif 150 151#ifdef DEV_ISA 152#include <x86/isa/icu.h> 153#endif 154 155#ifdef XBOX 156#include <machine/xbox.h> 157 158int arch_i386_is_xbox = 0; 159uint32_t arch_i386_xbox_memsize = 0; 160#endif 161 162/* Sanity check for __curthread() */ 163CTASSERT(offsetof(struct pcpu, pc_curthread) == 0); 164 165extern register_t init386(int first); 166extern void dblfault_handler(void); 167 168static void cpu_startup(void *); 169static void fpstate_drop(struct thread *td); 170static void get_fpcontext(struct thread *td, mcontext_t *mcp, 171 char *xfpusave, size_t xfpusave_len); 172static int set_fpcontext(struct thread *td, mcontext_t *mcp, 173 char *xfpustate, size_t xfpustate_len); 174SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL); 175 
176/* Intel ICH registers */ 177#define ICH_PMBASE 0x400 178#define ICH_SMI_EN ICH_PMBASE + 0x30 179 180int _udatasel, _ucodesel; 181u_int basemem; 182 183#ifdef PC98 184int need_pre_dma_flush; /* If 1, use wbinvd befor DMA transfer. */ 185int need_post_dma_flush; /* If 1, use invd after DMA transfer. */ 186 187static int ispc98 = 1; 188SYSCTL_INT(_machdep, OID_AUTO, ispc98, CTLFLAG_RD, &ispc98, 0, ""); 189#endif 190 191int cold = 1; 192 193#ifdef COMPAT_43 194static void osendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask); 195#endif 196#ifdef COMPAT_FREEBSD4 197static void freebsd4_sendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask); 198#endif 199 200long Maxmem = 0; 201long realmem = 0; 202 203#ifdef PAE 204FEATURE(pae, "Physical Address Extensions"); 205#endif 206 207/* 208 * The number of PHYSMAP entries must be one less than the number of 209 * PHYSSEG entries because the PHYSMAP entry that spans the largest 210 * physical address that is accessible by ISA DMA is split into two 211 * PHYSSEG entries. 212 */ 213#define PHYSMAP_SIZE (2 * (VM_PHYSSEG_MAX - 1)) 214 215vm_paddr_t phys_avail[PHYSMAP_SIZE + 2]; 216vm_paddr_t dump_avail[PHYSMAP_SIZE + 2]; 217 218/* must be 2 less so 0 0 can signal end of chunks */ 219#define PHYS_AVAIL_ARRAY_END (nitems(phys_avail) - 2) 220#define DUMP_AVAIL_ARRAY_END (nitems(dump_avail) - 2) 221 222struct kva_md_info kmi; 223 224static struct trapframe proc0_tf; 225struct pcpu __pcpu[MAXCPU]; 226 227struct mtx icu_lock; 228 229struct mem_range_softc mem_range_softc; 230 231 /* Default init_ops implementation. 
*/ 232 struct init_ops init_ops = { 233 .early_clock_source_init = i8254_init, 234 .early_delay = i8254_delay, 235#ifdef DEV_APIC 236 .msi_init = msi_init, 237#endif 238 }; 239 240static void 241cpu_startup(dummy) 242 void *dummy; 243{ 244 uintmax_t memsize; 245 char *sysenv; 246 247#ifndef PC98 248 /* 249 * On MacBooks, we need to disallow the legacy USB circuit to 250 * generate an SMI# because this can cause several problems, 251 * namely: incorrect CPU frequency detection and failure to 252 * start the APs. 253 * We do this by disabling a bit in the SMI_EN (SMI Control and 254 * Enable register) of the Intel ICH LPC Interface Bridge. 255 */ 256 sysenv = kern_getenv("smbios.system.product"); 257 if (sysenv != NULL) { 258 if (strncmp(sysenv, "MacBook1,1", 10) == 0 || 259 strncmp(sysenv, "MacBook3,1", 10) == 0 || 260 strncmp(sysenv, "MacBook4,1", 10) == 0 || 261 strncmp(sysenv, "MacBookPro1,1", 13) == 0 || 262 strncmp(sysenv, "MacBookPro1,2", 13) == 0 || 263 strncmp(sysenv, "MacBookPro3,1", 13) == 0 || 264 strncmp(sysenv, "MacBookPro4,1", 13) == 0 || 265 strncmp(sysenv, "Macmini1,1", 10) == 0) { 266 if (bootverbose) 267 printf("Disabling LEGACY_USB_EN bit on " 268 "Intel ICH.\n"); 269 outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8); 270 } 271 freeenv(sysenv); 272 } 273#endif /* !PC98 */ 274 275 /* 276 * Good {morning,afternoon,evening,night}. 277 */ 278 startrtclock(); 279 printcpuinfo(); 280 panicifcpuunsupported(); 281#ifdef PERFMON 282 perfmon_init(); 283#endif 284 285 /* 286 * Display physical memory if SMBIOS reports reasonable amount. 
287 */ 288 memsize = 0; 289 sysenv = kern_getenv("smbios.memory.enabled"); 290 if (sysenv != NULL) { 291 memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10; 292 freeenv(sysenv); 293 } 294 if (memsize < ptoa((uintmax_t)vm_cnt.v_free_count)) 295 memsize = ptoa((uintmax_t)Maxmem); 296 printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20); 297 realmem = atop(memsize); 298 299 /* 300 * Display any holes after the first chunk of extended memory. 301 */ 302 if (bootverbose) { 303 int indx; 304 305 printf("Physical memory chunk(s):\n"); 306 for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { 307 vm_paddr_t size; 308 309 size = phys_avail[indx + 1] - phys_avail[indx]; 310 printf( 311 "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n", 312 (uintmax_t)phys_avail[indx], 313 (uintmax_t)phys_avail[indx + 1] - 1, 314 (uintmax_t)size, (uintmax_t)size / PAGE_SIZE); 315 } 316 } 317 318 vm_ksubmap_init(&kmi); 319 320 printf("avail memory = %ju (%ju MB)\n", 321 ptoa((uintmax_t)vm_cnt.v_free_count), 322 ptoa((uintmax_t)vm_cnt.v_free_count) / 1048576); 323 324 /* 325 * Set up buffers, so they can be used to read disk labels. 326 */ 327 bufinit(); 328 vm_pager_bufferinit(); 329 cpu_setregs(); 330} 331 332/* 333 * Send an interrupt to process. 334 * 335 * Stack is set up to allow sigcode stored 336 * at top to call routine, followed by call 337 * to sigreturn routine below. After sigreturn 338 * resets the signal mask, the stack, and the 339 * frame pointer, it returns to the user 340 * specified pc, psl. 
 */
#ifdef COMPAT_43
/*
 * Deliver a signal to the current process using the historic (4.3BSD)
 * osigframe layout.  Called from sendsig() with the proc lock and the
 * sigacts mutex held; both are dropped around the copyout() and
 * re-acquired before returning.
 */
static void
osendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
	struct osigframe sf, *fp;
	struct proc *p;
	struct thread *td;
	struct sigacts *psp;
	struct trapframe *regs;
	int sig;
	int oonstack;

	td = curthread;
	p = td->td_proc;
	PROC_LOCK_ASSERT(p, MA_OWNED);
	sig = ksi->ksi_signo;
	psp = p->p_sigacts;
	mtx_assert(&psp->ps_mtx, MA_OWNED);
	regs = td->td_frame;
	oonstack = sigonstack(regs->tf_esp);

	/* Allocate space for the signal handler context. */
	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		/* Place the frame at the top of the alternate stack. */
		fp = (struct osigframe *)((uintptr_t)td->td_sigstk.ss_sp +
		    td->td_sigstk.ss_size - sizeof(struct osigframe));
#if defined(COMPAT_43)
		td->td_sigstk.ss_flags |= SS_ONSTACK;
#endif
	} else
		fp = (struct osigframe *)regs->tf_esp - 1;

	/* Build the argument list for the signal handler. */
	sf.sf_signum = sig;
	sf.sf_scp = (register_t)&fp->sf_siginfo.si_sc;
	bzero(&sf.sf_siginfo, sizeof(sf.sf_siginfo));
	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
		/* Signal handler installed with SA_SIGINFO. */
		sf.sf_arg2 = (register_t)&fp->sf_siginfo;
		sf.sf_siginfo.si_signo = sig;
		sf.sf_siginfo.si_code = ksi->ksi_code;
		sf.sf_ahu.sf_action = (__osiginfohandler_t *)catcher;
		sf.sf_addr = 0;
	} else {
		/* Old FreeBSD-style arguments. */
		sf.sf_arg2 = ksi->ksi_code;
		sf.sf_addr = (register_t)ksi->ksi_addr;
		sf.sf_ahu.sf_handler = catcher;
	}
	/* Drop the locks for the (possibly faulting) copyout below. */
	mtx_unlock(&psp->ps_mtx);
	PROC_UNLOCK(p);

	/* Save most if not all of trap frame. */
	sf.sf_siginfo.si_sc.sc_eax = regs->tf_eax;
	sf.sf_siginfo.si_sc.sc_ebx = regs->tf_ebx;
	sf.sf_siginfo.si_sc.sc_ecx = regs->tf_ecx;
	sf.sf_siginfo.si_sc.sc_edx = regs->tf_edx;
	sf.sf_siginfo.si_sc.sc_esi = regs->tf_esi;
	sf.sf_siginfo.si_sc.sc_edi = regs->tf_edi;
	sf.sf_siginfo.si_sc.sc_cs = regs->tf_cs;
	sf.sf_siginfo.si_sc.sc_ds = regs->tf_ds;
	sf.sf_siginfo.si_sc.sc_ss = regs->tf_ss;
	sf.sf_siginfo.si_sc.sc_es = regs->tf_es;
	sf.sf_siginfo.si_sc.sc_fs = regs->tf_fs;
	sf.sf_siginfo.si_sc.sc_gs = rgs();
	sf.sf_siginfo.si_sc.sc_isp = regs->tf_isp;

	/* Build the signal context to be used by osigreturn(). */
	sf.sf_siginfo.si_sc.sc_onstack = (oonstack) ? 1 : 0;
	SIG2OSIG(*mask, sf.sf_siginfo.si_sc.sc_mask);
	sf.sf_siginfo.si_sc.sc_sp = regs->tf_esp;
	sf.sf_siginfo.si_sc.sc_fp = regs->tf_ebp;
	sf.sf_siginfo.si_sc.sc_pc = regs->tf_eip;
	sf.sf_siginfo.si_sc.sc_ps = regs->tf_eflags;
	sf.sf_siginfo.si_sc.sc_trapno = regs->tf_trapno;
	sf.sf_siginfo.si_sc.sc_err = regs->tf_err;

	/*
	 * If we're a vm86 process, we want to save the segment registers.
	 * We also change eflags to be our emulated eflags, not the actual
	 * eflags.
	 */
	if (regs->tf_eflags & PSL_VM) {
		/* XXX confusing names: `tf' isn't a trapframe; `regs' is. */
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86;

		sf.sf_siginfo.si_sc.sc_gs = tf->tf_vm86_gs;
		sf.sf_siginfo.si_sc.sc_fs = tf->tf_vm86_fs;
		sf.sf_siginfo.si_sc.sc_es = tf->tf_vm86_es;
		sf.sf_siginfo.si_sc.sc_ds = tf->tf_vm86_ds;

		if (vm86->vm86_has_vme == 0)
			sf.sf_siginfo.si_sc.sc_ps =
			    (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
			    (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));

		/* See sendsig() for comments. */
		tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
	}

	/*
	 * Copy the sigframe out to the user's stack.
	 */
	if (copyout(&sf, fp, sizeof(*fp)) != 0) {
#ifdef DEBUG
		printf("process %ld has trashed its stack\n", (long)p->p_pid);
#endif
		/* Unwritable stack: kill the process instead of delivering. */
		PROC_LOCK(p);
		sigexit(td, SIGILL);
	}

	regs->tf_esp = (int)fp;
	if (p->p_sysent->sv_sigcode_base != 0) {
		regs->tf_eip = p->p_sysent->sv_sigcode_base + szsigcode -
		    szosigcode;
	} else {
		/* a.out sysentvec does not use shared page */
		regs->tf_eip = p->p_sysent->sv_psstrings - szosigcode;
	}
	regs->tf_eflags &= ~(PSL_T | PSL_D);
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _udatasel;
	load_gs(_udatasel);
	regs->tf_ss = _udatasel;
	/* Re-acquire the locks the caller expects to still hold. */
	PROC_LOCK(p);
	mtx_lock(&psp->ps_mtx);
}
#endif /* COMPAT_43 */

#ifdef COMPAT_FREEBSD4
/*
 * Deliver a signal using the FreeBSD 4.x sigframe4 layout.  Same
 * locking contract as osendsig(): entered with proc lock and sigacts
 * mutex held, dropped around copyout, re-taken before return.
 */
static void
freebsd4_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
	struct sigframe4 sf, *sfp;
	struct proc *p;
	struct thread *td;
	struct sigacts *psp;
	struct trapframe *regs;
	int sig;
	int oonstack;

	td = curthread;
	p = td->td_proc;
	PROC_LOCK_ASSERT(p, MA_OWNED);
	sig = ksi->ksi_signo;
	psp = p->p_sigacts;
	mtx_assert(&psp->ps_mtx, MA_OWNED);
	regs = td->td_frame;
	oonstack = sigonstack(regs->tf_esp);

	/* Save user context. */
	bzero(&sf, sizeof(sf));
	sf.sf_uc.uc_sigmask = *mask;
	sf.sf_uc.uc_stack = td->td_sigstk;
	sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
	    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
	sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
	sf.sf_uc.uc_mcontext.mc_gs = rgs();
	bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs));
	bzero(sf.sf_uc.uc_mcontext.mc_fpregs,
	    sizeof(sf.sf_uc.uc_mcontext.mc_fpregs));
	bzero(sf.sf_uc.uc_mcontext.__spare__,
	    sizeof(sf.sf_uc.uc_mcontext.__spare__));
	bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__));

	/* Allocate space for the signal handler context.
 */
	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		/* Place the frame at the top of the alternate stack. */
		sfp = (struct sigframe4 *)((uintptr_t)td->td_sigstk.ss_sp +
		    td->td_sigstk.ss_size - sizeof(struct sigframe4));
#if defined(COMPAT_43)
		td->td_sigstk.ss_flags |= SS_ONSTACK;
#endif
	} else
		sfp = (struct sigframe4 *)regs->tf_esp - 1;

	/* Build the argument list for the signal handler. */
	sf.sf_signum = sig;
	sf.sf_ucontext = (register_t)&sfp->sf_uc;
	bzero(&sf.sf_si, sizeof(sf.sf_si));
	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
		/* Signal handler installed with SA_SIGINFO. */
		sf.sf_siginfo = (register_t)&sfp->sf_si;
		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;

		/* Fill in POSIX parts */
		sf.sf_si.si_signo = sig;
		sf.sf_si.si_code = ksi->ksi_code;
		sf.sf_si.si_addr = ksi->ksi_addr;
	} else {
		/* Old FreeBSD-style arguments. */
		sf.sf_siginfo = ksi->ksi_code;
		sf.sf_addr = (register_t)ksi->ksi_addr;
		sf.sf_ahu.sf_handler = catcher;
	}
	/* Drop the locks for the (possibly faulting) copyout below. */
	mtx_unlock(&psp->ps_mtx);
	PROC_UNLOCK(p);

	/*
	 * If we're a vm86 process, we want to save the segment registers.
	 * We also change eflags to be our emulated eflags, not the actual
	 * eflags.
	 */
	if (regs->tf_eflags & PSL_VM) {
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86;

		sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs;
		sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs;
		sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es;
		sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds;

		if (vm86->vm86_has_vme == 0)
			sf.sf_uc.uc_mcontext.mc_eflags =
			    (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
			    (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));

		/*
		 * Clear PSL_NT to inhibit T_TSSFLT faults on return from
		 * syscalls made by the signal handler.  This just avoids
		 * wasting time for our lazy fixup of such faults.  PSL_NT
		 * does nothing in vm86 mode, but vm86 programs can set it
		 * almost legitimately in probes for old cpu types.
		 */
		tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
	}

	/*
	 * Copy the sigframe out to the user's stack.
	 */
	if (copyout(&sf, sfp, sizeof(*sfp)) != 0) {
#ifdef DEBUG
		printf("process %ld has trashed its stack\n", (long)p->p_pid);
#endif
		/* Unwritable stack: kill the process instead of delivering. */
		PROC_LOCK(p);
		sigexit(td, SIGILL);
	}

	regs->tf_esp = (int)sfp;
	regs->tf_eip = p->p_sysent->sv_sigcode_base + szsigcode -
	    szfreebsd4_sigcode;
	regs->tf_eflags &= ~(PSL_T | PSL_D);
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _udatasel;
	regs->tf_ss = _udatasel;
	/* Re-acquire the locks the caller expects to still hold. */
	PROC_LOCK(p);
	mtx_lock(&psp->ps_mtx);
}
#endif /* COMPAT_FREEBSD4 */

/*
 * Machine-dependent signal delivery entry point.  Dispatches to the
 * COMPAT_FREEBSD4/COMPAT_43 variants when the handler was installed
 * through the corresponding old interface, otherwise builds a native
 * sigframe (optionally with extended FPU state) on the user stack.
 * Called with the proc lock and sigacts mutex held.
 */
void
sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
	struct sigframe sf, *sfp;
	struct proc *p;
	struct thread *td;
	struct sigacts *psp;
	char *sp;
	struct trapframe *regs;
	struct segment_descriptor *sdp;
	char *xfpusave;
	size_t xfpusave_len;
	int sig;
	int oonstack;

	td = curthread;
	p = td->td_proc;
	PROC_LOCK_ASSERT(p, MA_OWNED);
	sig = ksi->ksi_signo;
	psp = p->p_sigacts;
	mtx_assert(&psp->ps_mtx, MA_OWNED);
#ifdef COMPAT_FREEBSD4
	if (SIGISMEMBER(psp->ps_freebsd4, sig)) {
		freebsd4_sendsig(catcher, ksi, mask);
		return;
	}
#endif
#ifdef COMPAT_43
	if (SIGISMEMBER(psp->ps_osigset, sig)) {
		osendsig(catcher, ksi, mask);
		return;
	}
#endif
	regs = td->td_frame;
	oonstack = sigonstack(regs->tf_esp);

	/* Reserve a kernel-stack scratch buffer for extended FPU state. */
	if (cpu_max_ext_state_size > sizeof(union savefpu) && use_xsave) {
		xfpusave_len = cpu_max_ext_state_size - sizeof(union savefpu);
		xfpusave = __builtin_alloca(xfpusave_len);
	} else {
		xfpusave_len = 0;
		xfpusave = NULL;
	}

	/* Save user context.
 */
	bzero(&sf, sizeof(sf));
	sf.sf_uc.uc_sigmask = *mask;
	sf.sf_uc.uc_stack = td->td_sigstk;
	sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
	    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
	sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
	sf.sf_uc.uc_mcontext.mc_gs = rgs();
	bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs));
	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
	get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len);
	fpstate_drop(td);
	/*
	 * Unconditionally fill the fsbase and gsbase into the mcontext.
	 */
	sdp = &td->td_pcb->pcb_fsd;
	/* Reassemble the split base field of the segment descriptor. */
	sf.sf_uc.uc_mcontext.mc_fsbase = sdp->sd_hibase << 24 |
	    sdp->sd_lobase;
	sdp = &td->td_pcb->pcb_gsd;
	sf.sf_uc.uc_mcontext.mc_gsbase = sdp->sd_hibase << 24 |
	    sdp->sd_lobase;
	bzero(sf.sf_uc.uc_mcontext.mc_spare2,
	    sizeof(sf.sf_uc.uc_mcontext.mc_spare2));
	bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__));

	/* Allocate space for the signal handler context. */
	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		sp = (char *)td->td_sigstk.ss_sp + td->td_sigstk.ss_size;
#if defined(COMPAT_43)
		td->td_sigstk.ss_flags |= SS_ONSTACK;
#endif
	} else
		/* 128-byte gap below %esp (red zone style cushion). */
		sp = (char *)regs->tf_esp - 128;
	if (xfpusave != NULL) {
		/* XSAVE area must be 64-byte aligned. */
		sp -= xfpusave_len;
		sp = (char *)((unsigned int)sp & ~0x3F);
		sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp;
	}
	sp -= sizeof(struct sigframe);

	/* Align to 16 bytes. */
	sfp = (struct sigframe *)((unsigned int)sp & ~0xF);

	/* Build the argument list for the signal handler. */
	sf.sf_signum = sig;
	sf.sf_ucontext = (register_t)&sfp->sf_uc;
	bzero(&sf.sf_si, sizeof(sf.sf_si));
	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
		/* Signal handler installed with SA_SIGINFO. */
		sf.sf_siginfo = (register_t)&sfp->sf_si;
		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;

		/* Fill in POSIX parts */
		sf.sf_si = ksi->ksi_info;
		sf.sf_si.si_signo = sig; /* maybe a translated signal */
	} else {
		/* Old FreeBSD-style arguments. */
		sf.sf_siginfo = ksi->ksi_code;
		sf.sf_addr = (register_t)ksi->ksi_addr;
		sf.sf_ahu.sf_handler = catcher;
	}
	/* Drop the locks for the (possibly faulting) copyout below. */
	mtx_unlock(&psp->ps_mtx);
	PROC_UNLOCK(p);

	/*
	 * If we're a vm86 process, we want to save the segment registers.
	 * We also change eflags to be our emulated eflags, not the actual
	 * eflags.
	 */
	if (regs->tf_eflags & PSL_VM) {
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86;

		sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs;
		sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs;
		sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es;
		sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds;

		if (vm86->vm86_has_vme == 0)
			sf.sf_uc.uc_mcontext.mc_eflags =
			    (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
			    (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));

		/*
		 * Clear PSL_NT to inhibit T_TSSFLT faults on return from
		 * syscalls made by the signal handler.  This just avoids
		 * wasting time for our lazy fixup of such faults.  PSL_NT
		 * does nothing in vm86 mode, but vm86 programs can set it
		 * almost legitimately in probes for old cpu types.
		 */
		tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
	}

	/*
	 * Copy the sigframe out to the user's stack.
737 */ 738 if (copyout(&sf, sfp, sizeof(*sfp)) != 0 || 739 (xfpusave != NULL && copyout(xfpusave, 740 (void *)sf.sf_uc.uc_mcontext.mc_xfpustate, xfpusave_len) 741 != 0)) { 742#ifdef DEBUG 743 printf("process %ld has trashed its stack\n", (long)p->p_pid); 744#endif 745 PROC_LOCK(p); 746 sigexit(td, SIGILL); 747 } 748 749 regs->tf_esp = (int)sfp; 750 regs->tf_eip = p->p_sysent->sv_sigcode_base; 751 if (regs->tf_eip == 0) 752 regs->tf_eip = p->p_sysent->sv_psstrings - szsigcode; 753 regs->tf_eflags &= ~(PSL_T | PSL_D); 754 regs->tf_cs = _ucodesel; 755 regs->tf_ds = _udatasel; 756 regs->tf_es = _udatasel; 757 regs->tf_fs = _udatasel; 758 regs->tf_ss = _udatasel; 759 PROC_LOCK(p); 760 mtx_lock(&psp->ps_mtx); 761} 762 763/* 764 * System call to cleanup state after a signal 765 * has been taken. Reset signal mask and 766 * stack state from context left by sendsig (above). 767 * Return to previous pc and psl as specified by 768 * context left by sendsig. Check carefully to 769 * make sure that the user has not modified the 770 * state to gain improper privileges. 771 * 772 * MPSAFE 773 */ 774#ifdef COMPAT_43 775int 776osigreturn(td, uap) 777 struct thread *td; 778 struct osigreturn_args /* { 779 struct osigcontext *sigcntxp; 780 } */ *uap; 781{ 782 struct osigcontext sc; 783 struct trapframe *regs; 784 struct osigcontext *scp; 785 int eflags, error; 786 ksiginfo_t ksi; 787 788 regs = td->td_frame; 789 error = copyin(uap->sigcntxp, &sc, sizeof(sc)); 790 if (error != 0) 791 return (error); 792 scp = ≻ 793 eflags = scp->sc_ps; 794 if (eflags & PSL_VM) { 795 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; 796 struct vm86_kernel *vm86; 797 798 /* 799 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't 800 * set up the vm86 area, and we can't enter vm86 mode. 
801 */ 802 if (td->td_pcb->pcb_ext == 0) 803 return (EINVAL); 804 vm86 = &td->td_pcb->pcb_ext->ext_vm86; 805 if (vm86->vm86_inited == 0) 806 return (EINVAL); 807 808 /* Go back to user mode if both flags are set. */ 809 if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) { 810 ksiginfo_init_trap(&ksi); 811 ksi.ksi_signo = SIGBUS; 812 ksi.ksi_code = BUS_OBJERR; 813 ksi.ksi_addr = (void *)regs->tf_eip; 814 trapsignal(td, &ksi); 815 } 816 817 if (vm86->vm86_has_vme) { 818 eflags = (tf->tf_eflags & ~VME_USERCHANGE) | 819 (eflags & VME_USERCHANGE) | PSL_VM; 820 } else { 821 vm86->vm86_eflags = eflags; /* save VIF, VIP */ 822 eflags = (tf->tf_eflags & ~VM_USERCHANGE) | 823 (eflags & VM_USERCHANGE) | PSL_VM; 824 } 825 tf->tf_vm86_ds = scp->sc_ds; 826 tf->tf_vm86_es = scp->sc_es; 827 tf->tf_vm86_fs = scp->sc_fs; 828 tf->tf_vm86_gs = scp->sc_gs; 829 tf->tf_ds = _udatasel; 830 tf->tf_es = _udatasel; 831 tf->tf_fs = _udatasel; 832 } else { 833 /* 834 * Don't allow users to change privileged or reserved flags. 835 */ 836 if (!EFL_SECURE(eflags, regs->tf_eflags)) { 837 return (EINVAL); 838 } 839 840 /* 841 * Don't allow users to load a valid privileged %cs. Let the 842 * hardware check for invalid selectors, excess privilege in 843 * other selectors, invalid %eip's and invalid %esp's. 844 */ 845 if (!CS_SECURE(scp->sc_cs)) { 846 ksiginfo_init_trap(&ksi); 847 ksi.ksi_signo = SIGBUS; 848 ksi.ksi_code = BUS_OBJERR; 849 ksi.ksi_trapno = T_PROTFLT; 850 ksi.ksi_addr = (void *)regs->tf_eip; 851 trapsignal(td, &ksi); 852 return (EINVAL); 853 } 854 regs->tf_ds = scp->sc_ds; 855 regs->tf_es = scp->sc_es; 856 regs->tf_fs = scp->sc_fs; 857 } 858 859 /* Restore remaining registers. 
*/ 860 regs->tf_eax = scp->sc_eax; 861 regs->tf_ebx = scp->sc_ebx; 862 regs->tf_ecx = scp->sc_ecx; 863 regs->tf_edx = scp->sc_edx; 864 regs->tf_esi = scp->sc_esi; 865 regs->tf_edi = scp->sc_edi; 866 regs->tf_cs = scp->sc_cs; 867 regs->tf_ss = scp->sc_ss; 868 regs->tf_isp = scp->sc_isp; 869 regs->tf_ebp = scp->sc_fp; 870 regs->tf_esp = scp->sc_sp; 871 regs->tf_eip = scp->sc_pc; 872 regs->tf_eflags = eflags; 873 874#if defined(COMPAT_43) 875 if (scp->sc_onstack & 1) 876 td->td_sigstk.ss_flags |= SS_ONSTACK; 877 else 878 td->td_sigstk.ss_flags &= ~SS_ONSTACK; 879#endif 880 kern_sigprocmask(td, SIG_SETMASK, (sigset_t *)&scp->sc_mask, NULL, 881 SIGPROCMASK_OLD); 882 return (EJUSTRETURN); 883} 884#endif /* COMPAT_43 */ 885 886#ifdef COMPAT_FREEBSD4 887/* 888 * MPSAFE 889 */ 890int 891freebsd4_sigreturn(td, uap) 892 struct thread *td; 893 struct freebsd4_sigreturn_args /* { 894 const ucontext4 *sigcntxp; 895 } */ *uap; 896{ 897 struct ucontext4 uc; 898 struct trapframe *regs; 899 struct ucontext4 *ucp; 900 int cs, eflags, error; 901 ksiginfo_t ksi; 902 903 error = copyin(uap->sigcntxp, &uc, sizeof(uc)); 904 if (error != 0) 905 return (error); 906 ucp = &uc; 907 regs = td->td_frame; 908 eflags = ucp->uc_mcontext.mc_eflags; 909 if (eflags & PSL_VM) { 910 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; 911 struct vm86_kernel *vm86; 912 913 /* 914 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't 915 * set up the vm86 area, and we can't enter vm86 mode. 916 */ 917 if (td->td_pcb->pcb_ext == 0) 918 return (EINVAL); 919 vm86 = &td->td_pcb->pcb_ext->ext_vm86; 920 if (vm86->vm86_inited == 0) 921 return (EINVAL); 922 923 /* Go back to user mode if both flags are set. 
*/ 924 if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) { 925 ksiginfo_init_trap(&ksi); 926 ksi.ksi_signo = SIGBUS; 927 ksi.ksi_code = BUS_OBJERR; 928 ksi.ksi_addr = (void *)regs->tf_eip; 929 trapsignal(td, &ksi); 930 } 931 if (vm86->vm86_has_vme) { 932 eflags = (tf->tf_eflags & ~VME_USERCHANGE) | 933 (eflags & VME_USERCHANGE) | PSL_VM; 934 } else { 935 vm86->vm86_eflags = eflags; /* save VIF, VIP */ 936 eflags = (tf->tf_eflags & ~VM_USERCHANGE) | 937 (eflags & VM_USERCHANGE) | PSL_VM; 938 } 939 bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe)); 940 tf->tf_eflags = eflags; 941 tf->tf_vm86_ds = tf->tf_ds; 942 tf->tf_vm86_es = tf->tf_es; 943 tf->tf_vm86_fs = tf->tf_fs; 944 tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs; 945 tf->tf_ds = _udatasel; 946 tf->tf_es = _udatasel; 947 tf->tf_fs = _udatasel; 948 } else { 949 /* 950 * Don't allow users to change privileged or reserved flags. 951 */ 952 if (!EFL_SECURE(eflags, regs->tf_eflags)) { 953 uprintf("pid %d (%s): freebsd4_sigreturn eflags = 0x%x\n", 954 td->td_proc->p_pid, td->td_name, eflags); 955 return (EINVAL); 956 } 957 958 /* 959 * Don't allow users to load a valid privileged %cs. Let the 960 * hardware check for invalid selectors, excess privilege in 961 * other selectors, invalid %eip's and invalid %esp's. 
 */
	cs = ucp->uc_mcontext.mc_cs;
	if (!CS_SECURE(cs)) {
		/*
		 * Reject a context whose %cs is not an unprivileged
		 * selector; raise SIGBUS as a protection fault would.
		 */
		uprintf("pid %d (%s): freebsd4_sigreturn cs = 0x%x\n",
		    td->td_proc->p_pid, td->td_name, cs);
		ksiginfo_init_trap(&ksi);
		ksi.ksi_signo = SIGBUS;
		ksi.ksi_code = BUS_OBJERR;
		ksi.ksi_trapno = T_PROTFLT;
		ksi.ksi_addr = (void *)regs->tf_eip;
		trapsignal(td, &ksi);
		return (EINVAL);
	}

	/* Install the saved register state into the current trapframe. */
	bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs));
	}

#if defined(COMPAT_43)
	/* Propagate the saved on-signal-stack flag for old binaries. */
	if (ucp->uc_mcontext.mc_onstack & 1)
		td->td_sigstk.ss_flags |= SS_ONSTACK;
	else
		td->td_sigstk.ss_flags &= ~SS_ONSTACK;
#endif
	kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
	return (EJUSTRETURN);
}
#endif /* COMPAT_FREEBSD4 */

/*
 * MPSAFE
 *
 * sigreturn(2): restore the machine context saved when a signal was
 * delivered, validating every piece of user-supplied state before it
 * is allowed back into the trapframe.  Returns EJUSTRETURN on success
 * so that syscall return does not clobber the restored registers.
 */
int
sys_sigreturn(td, uap)
	struct thread *td;
	struct sigreturn_args /* {
		const struct __ucontext *sigcntxp;
	} */ *uap;
{
	ucontext_t uc;
	struct proc *p;
	struct trapframe *regs;
	ucontext_t *ucp;
	char *xfpustate;
	size_t xfpustate_len;
	int cs, eflags, error, ret;
	ksiginfo_t ksi;

	p = td->td_proc;

	/* Copy the whole user-supplied context in before validating it. */
	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
	if (error != 0)
		return (error);
	ucp = &uc;
	/* Unknown mcontext flag bits are an error, not ignored. */
	if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) {
		uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid,
		    td->td_name, ucp->uc_mcontext.mc_flags);
		return (EINVAL);
	}
	regs = td->td_frame;
	eflags = ucp->uc_mcontext.mc_eflags;
	if (eflags & PSL_VM) {
		/* Returning into virtual-8086 mode. */
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86;

		/*
		 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
		 * set up the vm86 area, and we can't enter vm86 mode.
		 */
		if (td->td_pcb->pcb_ext == 0)
			return (EINVAL);
		vm86 = &td->td_pcb->pcb_ext->ext_vm86;
		if (vm86->vm86_inited == 0)
			return (EINVAL);

		/* Go back to user mode if both flags are set. */
		if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) {
			ksiginfo_init_trap(&ksi);
			ksi.ksi_signo = SIGBUS;
			ksi.ksi_code = BUS_OBJERR;
			ksi.ksi_addr = (void *)regs->tf_eip;
			trapsignal(td, &ksi);
		}

		/*
		 * Only let the user change the bits in VME_USERCHANGE /
		 * VM_USERCHANGE; everything else comes from the current
		 * trapframe, and PSL_VM is forced on.
		 */
		if (vm86->vm86_has_vme) {
			eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
			    (eflags & VME_USERCHANGE) | PSL_VM;
		} else {
			vm86->vm86_eflags = eflags;	/* save VIF, VIP */
			eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
			    (eflags & VM_USERCHANGE) | PSL_VM;
		}
		bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe));
		tf->tf_eflags = eflags;
		/* vm86 keeps its own copies of the data segment registers. */
		tf->tf_vm86_ds = tf->tf_ds;
		tf->tf_vm86_es = tf->tf_es;
		tf->tf_vm86_fs = tf->tf_fs;
		tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs;
		tf->tf_ds = _udatasel;
		tf->tf_es = _udatasel;
		tf->tf_fs = _udatasel;
	} else {
		/*
		 * Don't allow users to change privileged or reserved flags.
		 */
		if (!EFL_SECURE(eflags, regs->tf_eflags)) {
			uprintf("pid %d (%s): sigreturn eflags = 0x%x\n",
			    td->td_proc->p_pid, td->td_name, eflags);
			return (EINVAL);
		}

		/*
		 * Don't allow users to load a valid privileged %cs.  Let the
		 * hardware check for invalid selectors, excess privilege in
		 * other selectors, invalid %eip's and invalid %esp's.
		 */
		cs = ucp->uc_mcontext.mc_cs;
		if (!CS_SECURE(cs)) {
			uprintf("pid %d (%s): sigreturn cs = 0x%x\n",
			    td->td_proc->p_pid, td->td_name, cs);
			ksiginfo_init_trap(&ksi);
			ksi.ksi_signo = SIGBUS;
			ksi.ksi_code = BUS_OBJERR;
			ksi.ksi_trapno = T_PROTFLT;
			ksi.ksi_addr = (void *)regs->tf_eip;
			trapsignal(td, &ksi);
			return (EINVAL);
		}

		if ((uc.uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) {
			/*
			 * Extended FPU (xsave) state follows the context;
			 * bound its length before copying it onto the
			 * kernel stack via alloca.
			 */
			xfpustate_len = uc.uc_mcontext.mc_xfpustate_len;
			if (xfpustate_len > cpu_max_ext_state_size -
			    sizeof(union savefpu)) {
				uprintf(
				    "pid %d (%s): sigreturn xfpusave_len = 0x%zx\n",
				    p->p_pid, td->td_name, xfpustate_len);
				return (EINVAL);
			}
			xfpustate = __builtin_alloca(xfpustate_len);
			error = copyin((const void *)uc.uc_mcontext.mc_xfpustate,
			    xfpustate, xfpustate_len);
			if (error != 0) {
				uprintf(
				    "pid %d (%s): sigreturn copying xfpustate failed\n",
				    p->p_pid, td->td_name);
				return (error);
			}
		} else {
			xfpustate = NULL;
			xfpustate_len = 0;
		}
		ret = set_fpcontext(td, &ucp->uc_mcontext, xfpustate,
		    xfpustate_len);
		if (ret != 0)
			return (ret);
		/* Only now is the validated register state installed. */
		bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs));
	}

#if defined(COMPAT_43)
	if (ucp->uc_mcontext.mc_onstack & 1)
		td->td_sigstk.ss_flags |= SS_ONSTACK;
	else
		td->td_sigstk.ss_flags &= ~SS_ONSTACK;
#endif

	kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
	return (EJUSTRETURN);
}

/*
 * Reset registers to default values on exec.
 */
void
exec_setregs(struct thread *td, struct image_params *imgp, u_long stack)
{
	struct trapframe *regs = td->td_frame;
	struct pcb *pcb = td->td_pcb;

	/* Reset pc->pcb_gs and %gs before possibly invalidating it.
 */
	pcb->pcb_gs = _udatasel;
	load_gs(_udatasel);

	/*
	 * Drop any per-process LDT; user_ldt_free() releases dt_lock
	 * itself, hence the asymmetric unlock.
	 */
	mtx_lock_spin(&dt_lock);
	if (td->td_proc->p_md.md_ldt)
		user_ldt_free(td);
	else
		mtx_unlock_spin(&dt_lock);

	/*
	 * Reset the fs and gs bases.  The values from the old address
	 * space do not make sense for the new program.  In particular,
	 * gsbase might be the TLS base for the old program but the new
	 * program has no TLS now.
	 */
	set_fsbase(td, 0);
	set_gsbase(td, 0);

	/* Start the new image from a clean trapframe. */
	bzero((char *)regs, sizeof(struct trapframe));
	regs->tf_eip = imgp->entry_addr;
	regs->tf_esp = stack;
	/* Preserve only the trace flag across exec. */
	regs->tf_eflags = PSL_USER | (regs->tf_eflags & PSL_T);
	regs->tf_ss = _udatasel;
	regs->tf_ds = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _udatasel;
	regs->tf_cs = _ucodesel;

	/* PS_STRINGS value for BSD/OS binaries.  It is 0 for non-BSD/OS. */
	regs->tf_ebx = imgp->ps_strings;

	/*
	 * Reset the hardware debug registers if they were in use.
	 * They won't have any meaning for the newly exec'd process.
	 */
	if (pcb->pcb_flags & PCB_DBREGS) {
		pcb->pcb_dr0 = 0;
		pcb->pcb_dr1 = 0;
		pcb->pcb_dr2 = 0;
		pcb->pcb_dr3 = 0;
		pcb->pcb_dr6 = 0;
		pcb->pcb_dr7 = 0;
		if (pcb == curpcb) {
			/*
			 * Clear the debug registers on the running
			 * CPU, otherwise they will end up affecting
			 * the next process we switch to.
			 */
			reset_dbregs();
		}
		pcb->pcb_flags &= ~PCB_DBREGS;
	}

	pcb->pcb_initial_npxcw = __INITIAL_NPXCW__;

	/*
	 * Drop the FP state if we hold it, so that the process gets a
	 * clean FP state if it uses the FPU again.
	 */
	fpstate_drop(td);

	/*
	 * XXX - Linux emulator
	 * Make sure sure edx is 0x0 on entry. Linux binaries depend
	 * on it.
	 */
	td->td_retval[1] = 0;
}

/*
 * Set the machine-dependent control-register bits that every CPU needs:
 * FPU trapping (MP/NE/TS), kernel write protection (WP) and alignment
 * checking support (AM), plus a user %gs.
 */
void
cpu_setregs(void)
{
	unsigned int cr0;

	cr0 = rcr0();

	/*
	 * CR0_MP, CR0_NE and CR0_TS are set for NPX (FPU) support:
	 *
	 * Prepare to trap all ESC (i.e., NPX) instructions and all WAIT
	 * instructions.  We must set the CR0_MP bit and use the CR0_TS
	 * bit to control the trap, because setting the CR0_EM bit does
	 * not cause WAIT instructions to trap.  It's important to trap
	 * WAIT instructions - otherwise the "wait" variants of no-wait
	 * control instructions would degenerate to the "no-wait" variants
	 * after FP context switches but work correctly otherwise.  It's
	 * particularly important to trap WAITs when there is no NPX -
	 * otherwise the "wait" variants would always degenerate.
	 *
	 * Try setting CR0_NE to get correct error reporting on 486DX's.
	 * Setting it should fail or do nothing on lesser processors.
	 */
	cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
	load_cr0(cr0);
	load_gs(_udatasel);
}

u_long bootdev;		/* not a struct cdev *- encoding is different */
SYSCTL_ULONG(_machdep, OID_AUTO, guessed_bootdev,
	CTLFLAG_RD, &bootdev, 0, "Maybe the Boot device (not in struct cdev *format)");

static char bootmethod[16] = "BIOS";
SYSCTL_STRING(_machdep, OID_AUTO, bootmethod, CTLFLAG_RD, bootmethod, 0,
    "System firmware boot method");

/*
 * Initialize 386 and configure to run kernel
 */

/*
 * Initialize segments & interrupt table
 */

int _default_ldt;

union descriptor gdt[NGDT * MAXCPU];	/* global descriptor table */
union descriptor ldt[NLDT];		/* local descriptor table */
static struct gate_descriptor idt0[NIDT];
struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */
struct region_descriptor r_gdt, r_idt;	/* table descriptors */
struct mtx dt_lock;			/* lock for GDT and LDT */
/* TSS and private stack used to field a double fault via a task gate. */
static struct i386tss dblfault_tss;
static char dblfault_stack[PAGE_SIZE];

extern vm_offset_t proc0kstack;


/*
 * software prototypes -- in more palatable form.
 *
 * GCODE_SEL through GUDATA_SEL must be in this order for syscall/sysret
 * GUFS_SEL and GUGS_SEL must be in this order (swtch.s knows it)
 */
struct soft_segment_descriptor gdt_segs[] = {
/* GNULL_SEL	0 Null Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 0,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GPRIV_SEL	1 SMP Per-Processor Private Data Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUFS_SEL	2 %fs Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUGS_SEL	3 %gs Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GCODE_SEL	4 Code Descriptor for kernel */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GDATA_SEL	5 Data Descriptor for kernel */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUCODE_SEL	6 Code Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUDATA_SEL	7 Data Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GBIOSLOWMEM_SEL 8 BIOS access to realmode segment 0x40, must be #8 in GDT */
{	.ssd_base = 0x400,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GPROC0_SEL	9 Proc 0 Tss Descriptor */
{
	.ssd_base = 0x0,
	.ssd_limit = sizeof(struct i386tss)-1,
	.ssd_type = SDT_SYS386TSS,
	.ssd_dpl = 0,
	.ssd_p = 1,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GLDT_SEL	10 LDT Descriptor */
{	.ssd_base = (int) ldt,
	.ssd_limit = sizeof(ldt)-1,
	.ssd_type = SDT_SYSLDT,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUSERLDT_SEL	11 User LDT Descriptor per process */
{	.ssd_base = (int) ldt,
	.ssd_limit = (512 * sizeof(union descriptor)-1),
	.ssd_type = SDT_SYSLDT,
	.ssd_dpl = 0,
	.ssd_p = 1,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GPANIC_SEL	12 Panic Tss Descriptor */
{	.ssd_base = (int) &dblfault_tss,
	.ssd_limit = sizeof(struct i386tss)-1,
	.ssd_type = SDT_SYS386TSS,
	.ssd_dpl = 0,
	.ssd_p = 1,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GBIOSCODE32_SEL 13 BIOS 32-bit interface (32bit Code) */
{	.ssd_base = 0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = 0,
	.ssd_p = 1,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GBIOSCODE16_SEL 14 BIOS 32-bit interface (16bit Code) */
{	.ssd_base = 0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = 0,
	.ssd_p = 1,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GBIOSDATA_SEL 15 BIOS 32-bit interface (Data) */
{	.ssd_base = 0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = 0,
	.ssd_p = 1,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GBIOSUTIL_SEL 16 BIOS 16-bit interface (Utility) */
{	.ssd_base = 0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = 0,
	.ssd_p = 1,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GBIOSARGS_SEL 17 BIOS 16-bit interface (Arguments) */
{	.ssd_base = 0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = 0,
	.ssd_p = 1,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GNDIS_SEL	18 NDIS Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
};

static struct soft_segment_descriptor ldt_segs[] = {
	/* Null Descriptor - overwritten by call gate */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
	/* Null Descriptor - overwritten by call gate */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
	/* Null Descriptor - overwritten by call gate */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
	/* Code Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
	/* Null Descriptor - overwritten by call gate */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
	/* Data Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
};

/*
 * Install an interrupt/trap gate: point IDT slot 'idx' at handler
 * 'func' with gate type 'typ', privilege 'dpl' and code selector
 * 'selec'.  The 32-bit handler offset is split across the low and
 * high halves of the gate descriptor.
 */
void
setidt(idx, func, typ, dpl, selec)
	int idx;
	inthand_t *func;
	int typ;
	int dpl;
	int selec;
{
	struct gate_descriptor *ip;

	ip = idt + idx;
	ip->gd_looffset = (int)func;
	ip->gd_selector = selec;
	ip->gd_stkcpy = 0;
	ip->gd_xx = 0;
	ip->gd_type = typ;
	ip->gd_dpl = dpl;
	ip->gd_p = 1;
	ip->gd_hioffset = ((int)func)>>16 ;
}

/* Low-level exception/interrupt entry points, defined in assembly. */
extern inthand_t
	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
	IDTVEC(xmm),
#ifdef KDTRACE_HOOKS
	IDTVEC(dtrace_ret),
#endif
#ifdef XENHVM
	IDTVEC(xen_intr_upcall),
#endif
	IDTVEC(lcall_syscall), IDTVEC(int0x80_syscall);

#ifdef DDB
/*
 * Display the index and function name of any IDT entries that don't use
 * the default 'rsvd' entry point.
 */
DB_SHOW_COMMAND(idt, db_show_idt)
{
	struct gate_descriptor *ip;
	int idx;
	uintptr_t func;

	ip = idt;
	for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
		/* Reassemble the 32-bit handler address from the gate. */
		func = (ip->gd_hioffset << 16 | ip->gd_looffset);
		if (func != (uintptr_t)&IDTVEC(rsvd)) {
			db_printf("%3d\t", idx);
			db_printsym(func, DB_STGY_PROC);
			db_printf("\n");
		}
		ip++;
	}
}

/* Show privileged registers. */
DB_SHOW_COMMAND(sysregs, db_show_sysregs)
{
	uint64_t idtr, gdtr;

	idtr = ridt();
	db_printf("idtr\t0x%08x/%04x\n",
	    (u_int)(idtr >> 16), (u_int)idtr & 0xffff);
	gdtr = rgdt();
	db_printf("gdtr\t0x%08x/%04x\n",
	    (u_int)(gdtr >> 16), (u_int)gdtr & 0xffff);
	db_printf("ldtr\t0x%04x\n", rldt());
	db_printf("tr\t0x%04x\n", rtr());
	db_printf("cr0\t0x%08x\n", rcr0());
	db_printf("cr2\t0x%08x\n", rcr2());
	db_printf("cr3\t0x%08x\n", rcr3());
	db_printf("cr4\t0x%08x\n", rcr4());
	/* MSRs are only read when CPUID says they exist. */
	if (rcr4() & CR4_XSAVE)
		db_printf("xcr0\t0x%016llx\n", rxcr(0));
	if (amd_feature & (AMDID_NX | AMDID_LM))
		db_printf("EFER\t0x%016llx\n", rdmsr(MSR_EFER));
	if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
		db_printf("FEATURES_CTL\t0x%016llx\n",
		    rdmsr(MSR_IA32_FEATURE_CONTROL));
	if ((cpu_vendor_id == CPU_VENDOR_INTEL ||
	    cpu_vendor_id == CPU_VENDOR_AMD) && CPUID_TO_FAMILY(cpu_id) >= 6)
		db_printf("DEBUG_CTL\t0x%016llx\n", rdmsr(MSR_DEBUGCTLMSR));
	if (cpu_feature & CPUID_PAT)
		db_printf("PAT\t0x%016llx\n", rdmsr(MSR_PAT));
}

/* Show the hardware debug registers. */
DB_SHOW_COMMAND(dbregs, db_show_dbregs)
{

	db_printf("dr0\t0x%08x\n", rdr0());
	db_printf("dr1\t0x%08x\n", rdr1());
	db_printf("dr2\t0x%08x\n", rdr2());
	db_printf("dr3\t0x%08x\n", rdr3());
	db_printf("dr6\t0x%08x\n", rdr6());
	db_printf("dr7\t0x%08x\n", rdr7());
}
#endif

/*
 * Unpack a hardware segment descriptor into the software
 * soft_segment_descriptor form.
 */
void
sdtossd(sd, ssd)
	struct segment_descriptor *sd;
	struct soft_segment_descriptor *ssd;
{
	ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
	ssd->ssd_type  = sd->sd_type;
	ssd->ssd_dpl   = sd->sd_dpl;
	ssd->ssd_p     = sd->sd_p;
	ssd->ssd_def32 = sd->sd_def32;
	ssd->ssd_gran  = sd->sd_gran;
}

#if !defined(PC98)
/*
 * Insert a [base, base+length) range into the physmap base/bound-pair
 * array, coalescing with adjacent entries and rejecting overlaps.
 * Returns 1 to continue processing entries, 0 when the map is full.
 */
static int
add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
    int *physmap_idxp)
{
	int i, insert_idx, physmap_idx;

	physmap_idx = *physmap_idxp;

	if (length == 0)
		return (1);

#ifndef PAE
	/* Without PAE the kernel cannot address memory above 4GB. */
	if (base > 0xffffffff) {
		printf("%uK of memory above 4GB ignored\n",
		    (u_int)(length / 1024));
		return (1);
	}
#endif

	/*
	 * Find insertion point while checking for overlap.  Start off by
	 * assuming the new entry will be added to the end.
	 */
	insert_idx = physmap_idx + 2;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (base < physmap[i + 1]) {
			if (base + length <= physmap[i]) {
				insert_idx = i;
				break;
			}
			if (boothowto & RB_VERBOSE)
				printf(
		    "Overlapping memory regions, ignoring second region\n");
			return (1);
		}
	}

	/* See if we can prepend to the next entry. */
	if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) {
		physmap[insert_idx] = base;
		return (1);
	}

	/* See if we can append to the previous entry. */
	if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
		physmap[insert_idx - 1] += length;
		return (1);
	}

	physmap_idx += 2;
	*physmap_idxp = physmap_idx;
	if (physmap_idx == PHYSMAP_SIZE) {
		printf(
		"Too many segments in the physical address map, giving up\n");
		return (0);
	}

	/*
	 * Move the last 'N' entries down to make room for the new
	 * entry if needed.
	 */
	for (i = physmap_idx; i > insert_idx; i -= 2) {
		physmap[i] = physmap[i - 2];
		physmap[i + 1] = physmap[i - 1];
	}

	/* Insert the new entry. */
	physmap[insert_idx] = base;
	physmap[insert_idx + 1] = base + length;
	return (1);
}

/*
 * Feed one BIOS SMAP (INT 15h E820) entry into the physmap; only
 * plain-memory entries are added, everything else is skipped.
 */
static int
add_smap_entry(struct bios_smap *smap, vm_paddr_t *physmap, int *physmap_idxp)
{
	if (boothowto & RB_VERBOSE)
		printf("SMAP type=%02x base=%016llx len=%016llx\n",
		    smap->type, smap->base, smap->length);

	if (smap->type != SMAP_TYPE_MEMORY)
		return (1);

	return (add_physmap_entry(smap->base, smap->length, physmap,
	    physmap_idxp));
}

/*
 * Walk a loader-supplied SMAP array and add every entry, stopping
 * early if the physmap fills up.
 */
static void
add_smap_entries(struct bios_smap *smapbase, vm_paddr_t *physmap,
    int *physmap_idxp)
{
	struct bios_smap *smap, *smapend;
	u_int32_t smapsize;
	/*
	 * Memory map from INT 15:E820.
	 *
	 * subr_module.c says:
	 * "Consumer may safely assume that size value precedes data."
	 * ie: an int32_t immediately precedes SMAP.
	 */
	smapsize = *((u_int32_t *)smapbase - 1);
	smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);

	for (smap = smapbase; smap < smapend; smap++)
		if (!add_smap_entry(smap, physmap, physmap_idxp))
			break;
}
#endif /* !PC98 */

/*
 * Sanity-check the BIOS-reported base memory size and make the region
 * between the end of base memory and the ISA hole writable for both
 * the kernel and vm86 mode.
 */
static void
basemem_setup(void)
{
	vm_paddr_t pa;
	pt_entry_t *pte;
	int i;

	if (basemem > 640) {
		printf("Preposterous BIOS basemem of %uK, truncating to 640K\n",
		    basemem);
		basemem = 640;
	}

	/*
	 * XXX if biosbasemem is now < 640, there is a `hole'
	 * between the end of base memory and the start of
	 * ISA memory.  The hole may be empty or it may
	 * contain BIOS code or data.  Map it read/write so
	 * that the BIOS can write to it.  (Memory from 0 to
	 * the physical end of the kernel is mapped read-only
	 * to begin with and then parts of it are remapped.
	 * The parts that aren't remapped form holes that
	 * remain read-only and are unused by the kernel.
	 * The base memory area is below the physical end of
	 * the kernel and right now forms a read-only hole.
	 * The part of it from PAGE_SIZE to
	 * (trunc_page(biosbasemem * 1024) - 1) will be
	 * remapped and used by the kernel later.)
	 *
	 * This code is similar to the code used in
	 * pmap_mapdev, but since no memory needs to be
	 * allocated we simply change the mapping.
	 */
	for (pa = trunc_page(basemem * 1024);
	    pa < ISA_HOLE_START; pa += PAGE_SIZE)
		pmap_kenter(KERNBASE + pa, pa);

	/*
	 * Map pages between basemem and ISA_HOLE_START, if any, r/w into
	 * the vm86 page table so that vm86 can scribble on them using
	 * the vm86 map too.  XXX: why 2 ways for this and only 1 way for
	 * page 0, at least as initialized here?
	 */
	pte = (pt_entry_t *)vm86paddr;
	for (i = basemem / 4; i < 160; i++)
		pte[i] = (i << PAGE_SHIFT) | PG_V | PG_RW | PG_U;
}

/*
 * Populate the (physmap) array with base/bound pairs describing the
 * available physical memory in the system, then test this memory and
 * build the phys_avail array describing the actually-available memory.
 *
 * If we cannot accurately determine the physical memory map, then use
 * value from the 0xE801 call, and failing that, the RTC.
 *
 * Total memory size may be set by the kernel environment variable
 * hw.physmem or the compile-time define MAXMEM.
 *
 * XXX first should be vm_paddr_t.
 */
#ifdef PC98
/* PC-98 variant: memory sizes come from pc98_getmemsize(), not the BIOS SMAP. */
static void
getmemsize(int first)
{
	int off, physmap_idx, pa_indx, da_indx;
	u_long physmem_tunable, memtest;
	vm_paddr_t physmap[PHYSMAP_SIZE];
	pt_entry_t *pte;
	quad_t dcons_addr, dcons_size;
	int i;
	int pg_n;
	u_int extmem;
	u_int under16;
	vm_paddr_t pa;

	bzero(physmap, sizeof(physmap));

	/* XXX - some of EPSON machines can't use PG_N */
	pg_n = PG_N;
	if (pc98_machine_type & M_EPSON_PC98) {
		switch (epson_machine_id) {
#ifdef WB_CACHE
		default:
#endif
		case EPSON_PC486_HX:
		case EPSON_PC486_HG:
		case EPSON_PC486_HA:
			pg_n = 0;
			break;
		}
	}

	under16 = pc98_getmemsize(&basemem, &extmem);
	basemem_setup();

	/* Two segments: base memory at 0 and extended memory at 1MB. */
	physmap[0] = 0;
	physmap[1] = basemem * 1024;
	physmap_idx = 2;
	physmap[physmap_idx] = 0x100000;
	physmap[physmap_idx + 1] = physmap[physmap_idx] + extmem * 1024;

	/*
	 * Now, physmap contains a map of physical memory.
	 */

#ifdef SMP
	/* make hole for AP bootstrap code */
	physmap[1] = mp_bootaddress(physmap[1]);
#endif

	/*
	 * Maxmem isn't the "maximum memory", it's one larger than the
	 * highest page of the physical address space.  It should be
	 * called something like "Maxphyspage".  We may adjust this
	 * based on ``hw.physmem'' and the results of the memory test.
	 */
	Maxmem = atop(physmap[physmap_idx + 1]);

#ifdef MAXMEM
	Maxmem = MAXMEM / 4;
#endif

	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
		Maxmem = atop(physmem_tunable);

	/*
	 * By default keep the memtest enabled.  Use a general name so that
	 * one could eventually do more with the code than just disable it.
	 */
	memtest = 1;
	TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);

	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
	    (boothowto & RB_VERBOSE))
		printf("Physical memory use set to %ldK\n", Maxmem * 4);

	/*
	 * If Maxmem has been increased beyond what the system has detected,
	 * extend the last memory segment to the new limit.
	 */
	if (atop(physmap[physmap_idx + 1]) < Maxmem)
		physmap[physmap_idx + 1] = ptoa((vm_paddr_t)Maxmem);

	/*
	 * We need to divide chunk if Maxmem is larger than 16MB and
	 * under 16MB area is not full of memory.
	 * (1) system area (15-16MB region) is cut off
	 * (2) extended memory is only over 16MB area (ex. Melco "HYPERMEMORY")
	 */
	if ((under16 != 16 * 1024) && (extmem > 15 * 1024)) {
		/* 15M - 16M region is cut off, so need to divide chunk */
		physmap[physmap_idx + 1] = under16 * 1024;
		physmap_idx += 2;
		physmap[physmap_idx] = 0x1000000;
		physmap[physmap_idx + 1] = physmap[2] + extmem * 1024;
	}

	/* call pmap initialization to make new kernel address space */
	pmap_bootstrap(first);

	/*
	 * Size up each available chunk of physical memory.
	 */
	physmap[0] = PAGE_SIZE;		/* mask off page 0 */
	pa_indx = 0;
	da_indx = 1;
	phys_avail[pa_indx++] = physmap[0];
	phys_avail[pa_indx] = physmap[0];
	dump_avail[da_indx] = physmap[0];
	pte = CMAP3;

	/*
	 * Get dcons buffer address
	 */
	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
	    getenv_quad("dcons.size", &dcons_size) == 0)
		dcons_addr = 0;

	/*
	 * physmap is in bytes, so when converting to page boundaries,
	 * round up the start address and round down the end address.
	 */
	for (i = 0; i <= physmap_idx; i += 2) {
		vm_paddr_t end;

		end = ptoa((vm_paddr_t)Maxmem);
		if (physmap[i + 1] < end)
			end = trunc_page(physmap[i + 1]);
		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
			int tmp, page_bad, full;
			int *ptr = (int *)CADDR3;

			full = FALSE;
			/*
			 * block out kernel memory as not available.
			 */
			if (pa >= KERNLOAD && pa < first)
				goto do_dump_avail;

			/*
			 * block out dcons buffer
			 */
			if (dcons_addr > 0
			    && pa >= trunc_page(dcons_addr)
			    && pa < dcons_addr + dcons_size)
				goto do_dump_avail;

			page_bad = FALSE;
			if (memtest == 0)
				goto skip_memtest;

			/*
			 * map page into kernel: valid, read/write,non-cacheable
			 */
			*pte = pa | PG_V | PG_RW | pg_n;
			invltlb();

			tmp = *(int *)ptr;
			/*
			 * Test for alternating 1's and 0's
			 */
			*(volatile int *)ptr = 0xaaaaaaaa;
			if (*(volatile int *)ptr != 0xaaaaaaaa)
				page_bad = TRUE;
			/*
			 * Test for alternating 0's and 1's
			 */
			*(volatile int *)ptr = 0x55555555;
			if (*(volatile int *)ptr != 0x55555555)
				page_bad = TRUE;
			/*
			 * Test for all 1's
			 */
			*(volatile int *)ptr = 0xffffffff;
			if (*(volatile int *)ptr != 0xffffffff)
				page_bad = TRUE;
			/*
			 * Test for all 0's
			 */
			*(volatile int *)ptr = 0x0;
			if (*(volatile int *)ptr != 0x0)
				page_bad = TRUE;
			/*
			 * Restore original value.
			 */
			*(int *)ptr = tmp;

skip_memtest:
			/*
			 * Adjust array of valid/good pages.
			 */
			if (page_bad == TRUE)
				continue;
			/*
			 * If this good page is a continuation of the
			 * previous set of good pages, then just increase
			 * the end pointer. Otherwise start a new chunk.
			 * Note that "end" points one higher than end,
			 * making the range >= start and < end.
			 * If we're also doing a speculative memory
			 * test and we at or past the end, bump up Maxmem
			 * so that we keep going. The first bad page
			 * will terminate the loop.
			 */
			if (phys_avail[pa_indx] == pa) {
				phys_avail[pa_indx] += PAGE_SIZE;
			} else {
				pa_indx++;
				if (pa_indx == PHYS_AVAIL_ARRAY_END) {
					printf(
		"Too many holes in the physical address space, giving up\n");
					pa_indx--;
					full = TRUE;
					goto do_dump_avail;
				}
				phys_avail[pa_indx++] = pa;	/* start */
				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
			}
			physmem++;
do_dump_avail:
			/* dump_avail tracks dumpable pages, holes included. */
			if (dump_avail[da_indx] == pa) {
				dump_avail[da_indx] += PAGE_SIZE;
			} else {
				da_indx++;
				if (da_indx == DUMP_AVAIL_ARRAY_END) {
					da_indx--;
					goto do_next;
				}
				dump_avail[da_indx++] = pa;	/* start */
				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
			}
do_next:
			if (full)
				break;
		}
	}
	*pte = 0;
	invltlb();

	/*
	 * XXX
	 * The last chunk must contain at least one page plus the message
	 * buffer to avoid complicating other code (message buffer address
	 * calculation, etc.).
	 */
	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
	    round_page(msgbufsize) >= phys_avail[pa_indx]) {
		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
		phys_avail[pa_indx--] = 0;
		phys_avail[pa_indx--] = 0;
	}

	Maxmem = atop(phys_avail[pa_indx]);

	/* Trim off space for the message buffer. */
	phys_avail[pa_indx] -= round_page(msgbufsize);

	/* Map the message buffer. */
	for (off = 0; off < round_page(msgbufsize); off += PAGE_SIZE)
		pmap_kenter((vm_offset_t)msgbufp + off, phys_avail[pa_indx] +
		    off);
}
#else /* PC98 */
/* Generic PC variant: prefer the loader/BIOS SMAP, fall back to vm86 calls. */
static void
getmemsize(int first)
{
	int has_smap, off, physmap_idx, pa_indx, da_indx;
	u_long memtest;
	vm_paddr_t physmap[PHYSMAP_SIZE];
	pt_entry_t *pte;
	quad_t dcons_addr, dcons_size, physmem_tunable;
	int hasbrokenint12, i, res;
	u_int extmem;
	struct vm86frame vmf;
	struct vm86context vmc;
	vm_paddr_t pa;
	struct bios_smap *smap, *smapbase;
	caddr_t kmdp;

	has_smap = 0;
#ifdef XBOX
	if (arch_i386_is_xbox) {
		/*
		 * We queried the memory size before, so chop off 4MB for
		 * the framebuffer and inform the OS of this.
		 */
		physmap[0] = 0;
		physmap[1] = (arch_i386_xbox_memsize * 1024 * 1024) - XBOX_FB_SIZE;
		physmap_idx = 0;
		goto physmap_done;
	}
#endif
	bzero(&vmf, sizeof(vmf));
	bzero(physmap, sizeof(physmap));
	basemem = 0;

	/*
	 * Check if the loader supplied an SMAP memory map.  If so,
	 * use that and do not make any VM86 calls.
	 */
	physmap_idx = 0;
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf32 kernel");
	smapbase = (struct bios_smap *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);
	if (smapbase != NULL) {
		add_smap_entries(smapbase, physmap, &physmap_idx);
		has_smap = 1;
		goto have_smap;
	}

	/*
	 * Some newer BIOSes have a broken INT 12H implementation
	 * which causes a kernel panic immediately.  In this case, we
	 * need use the SMAP to determine the base memory size.
	 */
	hasbrokenint12 = 0;
	TUNABLE_INT_FETCH("hw.hasbrokenint12", &hasbrokenint12);
	if (hasbrokenint12 == 0) {
		/* Use INT12 to determine base memory size.
*/ 2112 vm86_intcall(0x12, &vmf); 2113 basemem = vmf.vmf_ax; 2114 basemem_setup(); 2115 } 2116 2117 /* 2118 * Fetch the memory map with INT 15:E820. Map page 1 R/W into 2119 * the kernel page table so we can use it as a buffer. The 2120 * kernel will unmap this page later. 2121 */ 2122 pmap_kenter(KERNBASE + (1 << PAGE_SHIFT), 1 << PAGE_SHIFT); 2123 vmc.npages = 0; 2124 smap = (void *)vm86_addpage(&vmc, 1, KERNBASE + (1 << PAGE_SHIFT)); 2125 res = vm86_getptr(&vmc, (vm_offset_t)smap, &vmf.vmf_es, &vmf.vmf_di); 2126 KASSERT(res != 0, ("vm86_getptr() failed: address not found")); 2127 2128 vmf.vmf_ebx = 0; 2129 do { 2130 vmf.vmf_eax = 0xE820; 2131 vmf.vmf_edx = SMAP_SIG; 2132 vmf.vmf_ecx = sizeof(struct bios_smap); 2133 i = vm86_datacall(0x15, &vmf, &vmc); 2134 if (i || vmf.vmf_eax != SMAP_SIG) 2135 break; 2136 has_smap = 1; 2137 if (!add_smap_entry(smap, physmap, &physmap_idx)) 2138 break; 2139 } while (vmf.vmf_ebx != 0); 2140 2141have_smap: 2142 /* 2143 * If we didn't fetch the "base memory" size from INT12, 2144 * figure it out from the SMAP (or just guess). 2145 */ 2146 if (basemem == 0) { 2147 for (i = 0; i <= physmap_idx; i += 2) { 2148 if (physmap[i] == 0x00000000) { 2149 basemem = physmap[i + 1] / 1024; 2150 break; 2151 } 2152 } 2153 2154 /* XXX: If we couldn't find basemem from SMAP, just guess. */ 2155 if (basemem == 0) 2156 basemem = 640; 2157 basemem_setup(); 2158 } 2159 2160 if (physmap[1] != 0) 2161 goto physmap_done; 2162 2163 /* 2164 * If we failed to find an SMAP, figure out the extended 2165 * memory size. We will then build a simple memory map with 2166 * two segments, one for "base memory" and the second for 2167 * "extended memory". Note that "extended memory" starts at a 2168 * physical address of 1MB and that both basemem and extmem 2169 * are in units of 1KB. 2170 * 2171 * First, try to fetch the extended memory size via INT 15:E801. 
2172 */ 2173 vmf.vmf_ax = 0xE801; 2174 if (vm86_intcall(0x15, &vmf) == 0) { 2175 extmem = vmf.vmf_cx + vmf.vmf_dx * 64; 2176 } else { 2177 /* 2178 * If INT15:E801 fails, this is our last ditch effort 2179 * to determine the extended memory size. Currently 2180 * we prefer the RTC value over INT15:88. 2181 */ 2182#if 0 2183 vmf.vmf_ah = 0x88; 2184 vm86_intcall(0x15, &vmf); 2185 extmem = vmf.vmf_ax; 2186#else 2187 extmem = rtcin(RTC_EXTLO) + (rtcin(RTC_EXTHI) << 8); 2188#endif 2189 } 2190 2191 /* 2192 * Special hack for chipsets that still remap the 384k hole when 2193 * there's 16MB of memory - this really confuses people that 2194 * are trying to use bus mastering ISA controllers with the 2195 * "16MB limit"; they only have 16MB, but the remapping puts 2196 * them beyond the limit. 2197 * 2198 * If extended memory is between 15-16MB (16-17MB phys address range), 2199 * chop it to 15MB. 2200 */ 2201 if ((extmem > 15 * 1024) && (extmem < 16 * 1024)) 2202 extmem = 15 * 1024; 2203 2204 physmap[0] = 0; 2205 physmap[1] = basemem * 1024; 2206 physmap_idx = 2; 2207 physmap[physmap_idx] = 0x100000; 2208 physmap[physmap_idx + 1] = physmap[physmap_idx] + extmem * 1024; 2209 2210physmap_done: 2211 /* 2212 * Now, physmap contains a map of physical memory. 2213 */ 2214 2215#ifdef SMP 2216 /* make hole for AP bootstrap code */ 2217 physmap[1] = mp_bootaddress(physmap[1]); 2218#endif 2219 2220 /* 2221 * Maxmem isn't the "maximum memory", it's one larger than the 2222 * highest page of the physical address space. It should be 2223 * called something like "Maxphyspage". We may adjust this 2224 * based on ``hw.physmem'' and the results of the memory test. 2225 * 2226 * This is especially confusing when it is much larger than the 2227 * memory size and is displayed as "realmem". 
2228 */ 2229 Maxmem = atop(physmap[physmap_idx + 1]); 2230 2231#ifdef MAXMEM 2232 Maxmem = MAXMEM / 4; 2233#endif 2234 2235 if (TUNABLE_QUAD_FETCH("hw.physmem", &physmem_tunable)) 2236 Maxmem = atop(physmem_tunable); 2237 2238 /* 2239 * If we have an SMAP, don't allow MAXMEM or hw.physmem to extend 2240 * the amount of memory in the system. 2241 */ 2242 if (has_smap && Maxmem > atop(physmap[physmap_idx + 1])) 2243 Maxmem = atop(physmap[physmap_idx + 1]); 2244 2245 /* 2246 * By default enable the memory test on real hardware, and disable 2247 * it if we appear to be running in a VM. This avoids touching all 2248 * pages unnecessarily, which doesn't matter on real hardware but is 2249 * bad for shared VM hosts. Use a general name so that 2250 * one could eventually do more with the code than just disable it. 2251 */ 2252 memtest = (vm_guest > VM_GUEST_NO) ? 0 : 1; 2253 TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest); 2254 2255 if (atop(physmap[physmap_idx + 1]) != Maxmem && 2256 (boothowto & RB_VERBOSE)) 2257 printf("Physical memory use set to %ldK\n", Maxmem * 4); 2258 2259 /* 2260 * If Maxmem has been increased beyond what the system has detected, 2261 * extend the last memory segment to the new limit. 2262 */ 2263 if (atop(physmap[physmap_idx + 1]) < Maxmem) 2264 physmap[physmap_idx + 1] = ptoa((vm_paddr_t)Maxmem); 2265 2266 /* call pmap initialization to make new kernel address space */ 2267 pmap_bootstrap(first); 2268 2269 /* 2270 * Size up each available chunk of physical memory. 
2271 */ 2272 physmap[0] = PAGE_SIZE; /* mask off page 0 */ 2273 pa_indx = 0; 2274 da_indx = 1; 2275 phys_avail[pa_indx++] = physmap[0]; 2276 phys_avail[pa_indx] = physmap[0]; 2277 dump_avail[da_indx] = physmap[0]; 2278 pte = CMAP3; 2279 2280 /* 2281 * Get dcons buffer address 2282 */ 2283 if (getenv_quad("dcons.addr", &dcons_addr) == 0 || 2284 getenv_quad("dcons.size", &dcons_size) == 0) 2285 dcons_addr = 0; 2286 2287 /* 2288 * physmap is in bytes, so when converting to page boundaries, 2289 * round up the start address and round down the end address. 2290 */ 2291 for (i = 0; i <= physmap_idx; i += 2) { 2292 vm_paddr_t end; 2293 2294 end = ptoa((vm_paddr_t)Maxmem); 2295 if (physmap[i + 1] < end) 2296 end = trunc_page(physmap[i + 1]); 2297 for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) { 2298 int tmp, page_bad, full; 2299 int *ptr = (int *)CADDR3; 2300 2301 full = FALSE; 2302 /* 2303 * block out kernel memory as not available. 2304 */ 2305 if (pa >= KERNLOAD && pa < first) 2306 goto do_dump_avail; 2307 2308 /* 2309 * block out dcons buffer 2310 */ 2311 if (dcons_addr > 0 2312 && pa >= trunc_page(dcons_addr) 2313 && pa < dcons_addr + dcons_size) 2314 goto do_dump_avail; 2315 2316 page_bad = FALSE; 2317 if (memtest == 0) 2318 goto skip_memtest; 2319 2320 /* 2321 * map page into kernel: valid, read/write,non-cacheable 2322 */ 2323 *pte = pa | PG_V | PG_RW | PG_N; 2324 invltlb(); 2325 2326 tmp = *(int *)ptr; 2327 /* 2328 * Test for alternating 1's and 0's 2329 */ 2330 *(volatile int *)ptr = 0xaaaaaaaa; 2331 if (*(volatile int *)ptr != 0xaaaaaaaa) 2332 page_bad = TRUE; 2333 /* 2334 * Test for alternating 0's and 1's 2335 */ 2336 *(volatile int *)ptr = 0x55555555; 2337 if (*(volatile int *)ptr != 0x55555555) 2338 page_bad = TRUE; 2339 /* 2340 * Test for all 1's 2341 */ 2342 *(volatile int *)ptr = 0xffffffff; 2343 if (*(volatile int *)ptr != 0xffffffff) 2344 page_bad = TRUE; 2345 /* 2346 * Test for all 0's 2347 */ 2348 *(volatile int *)ptr = 0x0; 2349 if 
(*(volatile int *)ptr != 0x0) 2350 page_bad = TRUE; 2351 /* 2352 * Restore original value. 2353 */ 2354 *(int *)ptr = tmp; 2355 2356skip_memtest: 2357 /* 2358 * Adjust array of valid/good pages. 2359 */ 2360 if (page_bad == TRUE) 2361 continue; 2362 /* 2363 * If this good page is a continuation of the 2364 * previous set of good pages, then just increase 2365 * the end pointer. Otherwise start a new chunk. 2366 * Note that "end" points one higher than end, 2367 * making the range >= start and < end. 2368 * If we're also doing a speculative memory 2369 * test and we at or past the end, bump up Maxmem 2370 * so that we keep going. The first bad page 2371 * will terminate the loop. 2372 */ 2373 if (phys_avail[pa_indx] == pa) { 2374 phys_avail[pa_indx] += PAGE_SIZE; 2375 } else { 2376 pa_indx++; 2377 if (pa_indx == PHYS_AVAIL_ARRAY_END) { 2378 printf( 2379 "Too many holes in the physical address space, giving up\n"); 2380 pa_indx--; 2381 full = TRUE; 2382 goto do_dump_avail; 2383 } 2384 phys_avail[pa_indx++] = pa; /* start */ 2385 phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */ 2386 } 2387 physmem++; 2388do_dump_avail: 2389 if (dump_avail[da_indx] == pa) { 2390 dump_avail[da_indx] += PAGE_SIZE; 2391 } else { 2392 da_indx++; 2393 if (da_indx == DUMP_AVAIL_ARRAY_END) { 2394 da_indx--; 2395 goto do_next; 2396 } 2397 dump_avail[da_indx++] = pa; /* start */ 2398 dump_avail[da_indx] = pa + PAGE_SIZE; /* end */ 2399 } 2400do_next: 2401 if (full) 2402 break; 2403 } 2404 } 2405 *pte = 0; 2406 invltlb(); 2407 2408 /* 2409 * XXX 2410 * The last chunk must contain at least one page plus the message 2411 * buffer to avoid complicating other code (message buffer address 2412 * calculation, etc.). 
2413 */ 2414 while (phys_avail[pa_indx - 1] + PAGE_SIZE + 2415 round_page(msgbufsize) >= phys_avail[pa_indx]) { 2416 physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]); 2417 phys_avail[pa_indx--] = 0; 2418 phys_avail[pa_indx--] = 0; 2419 } 2420 2421 Maxmem = atop(phys_avail[pa_indx]); 2422 2423 /* Trim off space for the message buffer. */ 2424 phys_avail[pa_indx] -= round_page(msgbufsize); 2425 2426 /* Map the message buffer. */ 2427 for (off = 0; off < round_page(msgbufsize); off += PAGE_SIZE) 2428 pmap_kenter((vm_offset_t)msgbufp + off, phys_avail[pa_indx] + 2429 off); 2430} 2431#endif /* PC98 */ 2432 2433static void 2434i386_kdb_init(void) 2435{ 2436#ifdef DDB 2437 db_fetch_ksymtab(bootinfo.bi_symtab, bootinfo.bi_esymtab); 2438#endif 2439 kdb_init(); 2440#ifdef KDB 2441 if (boothowto & RB_KDB) 2442 kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger"); 2443#endif 2444} 2445 2446register_t 2447init386(int first) 2448{ 2449 struct gate_descriptor *gdp; 2450 int gsel_tss, metadata_missing, x, pa; 2451 struct pcpu *pc; 2452 struct xstate_hdr *xhdr; 2453 int late_console; 2454 2455 thread0.td_kstack = proc0kstack; 2456 thread0.td_kstack_pages = TD0_KSTACK_PAGES; 2457 2458 /* 2459 * This may be done better later if it gets more high level 2460 * components in it. If so just link td->td_proc here. 2461 */ 2462 proc_linkup0(&proc0, &thread0); 2463 2464#ifdef PC98 2465 /* 2466 * Initialize DMAC 2467 */ 2468 pc98_init_dmac(); 2469#endif 2470 2471 metadata_missing = 0; 2472 if (bootinfo.bi_modulep) { 2473 preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE; 2474 preload_bootstrap_relocate(KERNBASE); 2475 } else { 2476 metadata_missing = 1; 2477 } 2478 2479 if (bootinfo.bi_envp != 0) 2480 init_static_kenv((char *)bootinfo.bi_envp + KERNBASE, 0); 2481 else 2482 init_static_kenv(NULL, 0); 2483 2484 identify_hypervisor(); 2485 2486 /* Init basic tunables, hz etc */ 2487 init_param1(); 2488 2489 /* 2490 * Make gdt memory segments. 
All segments cover the full 4GB 2491 * of address space and permissions are enforced at page level. 2492 */ 2493 gdt_segs[GCODE_SEL].ssd_limit = atop(0 - 1); 2494 gdt_segs[GDATA_SEL].ssd_limit = atop(0 - 1); 2495 gdt_segs[GUCODE_SEL].ssd_limit = atop(0 - 1); 2496 gdt_segs[GUDATA_SEL].ssd_limit = atop(0 - 1); 2497 gdt_segs[GUFS_SEL].ssd_limit = atop(0 - 1); 2498 gdt_segs[GUGS_SEL].ssd_limit = atop(0 - 1); 2499 2500 pc = &__pcpu[0]; 2501 gdt_segs[GPRIV_SEL].ssd_limit = atop(0 - 1); 2502 gdt_segs[GPRIV_SEL].ssd_base = (int) pc; 2503 gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss; 2504 2505 for (x = 0; x < NGDT; x++) 2506 ssdtosd(&gdt_segs[x], &gdt[x].sd); 2507 2508 r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1; 2509 r_gdt.rd_base = (int) gdt; 2510 mtx_init(&dt_lock, "descriptor tables", NULL, MTX_SPIN); 2511 lgdt(&r_gdt); 2512 2513 pcpu_init(pc, 0, sizeof(struct pcpu)); 2514 for (pa = first; pa < first + DPCPU_SIZE; pa += PAGE_SIZE) 2515 pmap_kenter(pa + KERNBASE, pa); 2516 dpcpu_init((void *)(first + KERNBASE), 0); 2517 first += DPCPU_SIZE; 2518 PCPU_SET(prvspace, pc); 2519 PCPU_SET(curthread, &thread0); 2520 /* Non-late cninit() and printf() can be moved up to here. */ 2521 2522 /* 2523 * Initialize mutexes. 2524 * 2525 * icu_lock: in order to allow an interrupt to occur in a critical 2526 * section, to set pcpu->ipending (etc...) properly, we 2527 * must be able to get the icu lock, so it can't be 2528 * under witness. 
2529 */ 2530 mutex_init(); 2531 mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS | MTX_NOPROFILE); 2532 2533 /* make ldt memory segments */ 2534 ldt_segs[LUCODE_SEL].ssd_limit = atop(0 - 1); 2535 ldt_segs[LUDATA_SEL].ssd_limit = atop(0 - 1); 2536 for (x = 0; x < nitems(ldt_segs); x++) 2537 ssdtosd(&ldt_segs[x], &ldt[x].sd); 2538 2539 _default_ldt = GSEL(GLDT_SEL, SEL_KPL); 2540 lldt(_default_ldt); 2541 PCPU_SET(currentldt, _default_ldt); 2542 2543 /* exceptions */ 2544 for (x = 0; x < NIDT; x++) 2545 setidt(x, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL, 2546 GSEL(GCODE_SEL, SEL_KPL)); 2547 setidt(IDT_DE, &IDTVEC(div), SDT_SYS386TGT, SEL_KPL, 2548 GSEL(GCODE_SEL, SEL_KPL)); 2549 setidt(IDT_DB, &IDTVEC(dbg), SDT_SYS386IGT, SEL_KPL, 2550 GSEL(GCODE_SEL, SEL_KPL)); 2551 setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYS386IGT, SEL_KPL, 2552 GSEL(GCODE_SEL, SEL_KPL)); 2553 setidt(IDT_BP, &IDTVEC(bpt), SDT_SYS386IGT, SEL_UPL, 2554 GSEL(GCODE_SEL, SEL_KPL)); 2555 setidt(IDT_OF, &IDTVEC(ofl), SDT_SYS386TGT, SEL_UPL, 2556 GSEL(GCODE_SEL, SEL_KPL)); 2557 setidt(IDT_BR, &IDTVEC(bnd), SDT_SYS386TGT, SEL_KPL, 2558 GSEL(GCODE_SEL, SEL_KPL)); 2559 setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, 2560 GSEL(GCODE_SEL, SEL_KPL)); 2561 setidt(IDT_NM, &IDTVEC(dna), SDT_SYS386TGT, SEL_KPL 2562 , GSEL(GCODE_SEL, SEL_KPL)); 2563 setidt(IDT_DF, 0, SDT_SYSTASKGT, SEL_KPL, GSEL(GPANIC_SEL, SEL_KPL)); 2564 setidt(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYS386TGT, SEL_KPL, 2565 GSEL(GCODE_SEL, SEL_KPL)); 2566 setidt(IDT_TS, &IDTVEC(tss), SDT_SYS386TGT, SEL_KPL, 2567 GSEL(GCODE_SEL, SEL_KPL)); 2568 setidt(IDT_NP, &IDTVEC(missing), SDT_SYS386TGT, SEL_KPL, 2569 GSEL(GCODE_SEL, SEL_KPL)); 2570 setidt(IDT_SS, &IDTVEC(stk), SDT_SYS386TGT, SEL_KPL, 2571 GSEL(GCODE_SEL, SEL_KPL)); 2572 setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, 2573 GSEL(GCODE_SEL, SEL_KPL)); 2574 setidt(IDT_PF, &IDTVEC(page), SDT_SYS386IGT, SEL_KPL, 2575 GSEL(GCODE_SEL, SEL_KPL)); 2576 setidt(IDT_MF, &IDTVEC(fpu), SDT_SYS386TGT, 
SEL_KPL, 2577 GSEL(GCODE_SEL, SEL_KPL)); 2578 setidt(IDT_AC, &IDTVEC(align), SDT_SYS386TGT, SEL_KPL, 2579 GSEL(GCODE_SEL, SEL_KPL)); 2580 setidt(IDT_MC, &IDTVEC(mchk), SDT_SYS386TGT, SEL_KPL, 2581 GSEL(GCODE_SEL, SEL_KPL)); 2582 setidt(IDT_XF, &IDTVEC(xmm), SDT_SYS386TGT, SEL_KPL, 2583 GSEL(GCODE_SEL, SEL_KPL)); 2584 setidt(IDT_SYSCALL, &IDTVEC(int0x80_syscall), SDT_SYS386TGT, SEL_UPL, 2585 GSEL(GCODE_SEL, SEL_KPL)); 2586#ifdef KDTRACE_HOOKS 2587 setidt(IDT_DTRACE_RET, &IDTVEC(dtrace_ret), SDT_SYS386TGT, SEL_UPL, 2588 GSEL(GCODE_SEL, SEL_KPL)); 2589#endif 2590#ifdef XENHVM 2591 setidt(IDT_EVTCHN, &IDTVEC(xen_intr_upcall), SDT_SYS386IGT, SEL_UPL, 2592 GSEL(GCODE_SEL, SEL_KPL)); 2593#endif 2594 2595 r_idt.rd_limit = sizeof(idt0) - 1; 2596 r_idt.rd_base = (int) idt; 2597 lidt(&r_idt); 2598 2599#ifdef XBOX 2600 /* 2601 * The following code queries the PCI ID of 0:0:0. For the XBOX, 2602 * This should be 0x10de / 0x02a5. 2603 * 2604 * This is exactly what Linux does. 2605 */ 2606 outl(0xcf8, 0x80000000); 2607 if (inl(0xcfc) == 0x02a510de) { 2608 arch_i386_is_xbox = 1; 2609 pic16l_setled(XBOX_LED_GREEN); 2610 2611 /* 2612 * We are an XBOX, but we may have either 64MB or 128MB of 2613 * memory. The PCI host bridge should be programmed for this, 2614 * so we just query it. 2615 */ 2616 outl(0xcf8, 0x80000084); 2617 arch_i386_xbox_memsize = (inl(0xcfc) == 0x7FFFFFF) ? 128 : 64; 2618 } 2619#endif /* XBOX */ 2620 2621 /* 2622 * Initialize the clock before the console so that console 2623 * initialization can use DELAY(). 
2624 */ 2625 clock_init(); 2626 2627 finishidentcpu(); /* Final stage of CPU initialization */ 2628 setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, 2629 GSEL(GCODE_SEL, SEL_KPL)); 2630 setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, 2631 GSEL(GCODE_SEL, SEL_KPL)); 2632 initializecpu(); /* Initialize CPU registers */ 2633 initializecpucache(); 2634 2635 /* pointer to selector slot for %fs/%gs */ 2636 PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd); 2637 2638 dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 = 2639 dblfault_tss.tss_esp2 = (int)&dblfault_stack[sizeof(dblfault_stack)]; 2640 dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 = 2641 dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL); 2642#if defined(PAE) || defined(PAE_TABLES) 2643 dblfault_tss.tss_cr3 = (int)IdlePDPT; 2644#else 2645 dblfault_tss.tss_cr3 = (int)IdlePTD; 2646#endif 2647 dblfault_tss.tss_eip = (int)dblfault_handler; 2648 dblfault_tss.tss_eflags = PSL_KERNEL; 2649 dblfault_tss.tss_ds = dblfault_tss.tss_es = 2650 dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL); 2651 dblfault_tss.tss_fs = GSEL(GPRIV_SEL, SEL_KPL); 2652 dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL); 2653 dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL); 2654 2655 /* Initialize the tss (except for the final esp0) early for vm86. */ 2656 PCPU_SET(common_tss.tss_esp0, thread0.td_kstack + 2657 thread0.td_kstack_pages * PAGE_SIZE - 16); 2658 PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL)); 2659 gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); 2660 PCPU_SET(tss_gdt, &gdt[GPROC0_SEL].sd); 2661 PCPU_SET(common_tssd, *PCPU_GET(tss_gdt)); 2662 PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16); 2663 ltr(gsel_tss); 2664 2665 /* Initialize the PIC early for vm86 calls. */ 2666#ifdef DEV_ISA 2667#ifdef DEV_ATPIC 2668#ifndef PC98 2669 elcr_probe(); 2670#endif 2671 atpic_startup(); 2672#else 2673 /* Reset and mask the atpics and leave them shut down. 
*/ 2674 atpic_reset(); 2675 2676 /* 2677 * Point the ICU spurious interrupt vectors at the APIC spurious 2678 * interrupt handler. 2679 */ 2680 setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL, 2681 GSEL(GCODE_SEL, SEL_KPL)); 2682 setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL, 2683 GSEL(GCODE_SEL, SEL_KPL)); 2684#endif 2685#endif 2686 2687 /* 2688 * The console and kdb should be initialized even earlier than here, 2689 * but some console drivers don't work until after getmemsize(). 2690 * Default to late console initialization to support these drivers. 2691 * This loses mainly printf()s in getmemsize() and early debugging. 2692 */ 2693 late_console = 1; 2694 TUNABLE_INT_FETCH("debug.late_console", &late_console); 2695 if (!late_console) { 2696 cninit(); 2697 i386_kdb_init(); 2698 } 2699 2700 vm86_initialize(); 2701 getmemsize(first); 2702 init_param2(physmem); 2703 2704 /* now running on new page tables, configured,and u/iom is accessible */ 2705 2706 if (late_console) 2707 cninit(); 2708 2709 if (metadata_missing) 2710 printf("WARNING: loader(8) metadata is missing!\n"); 2711 2712 if (late_console) 2713 i386_kdb_init(); 2714 2715 msgbufinit(msgbufp, msgbufsize); 2716 npxinit(true); 2717 /* 2718 * Set up thread0 pcb after npxinit calculated pcb + fpu save 2719 * area size. Zero out the extended state header in fpu save 2720 * area. 2721 */ 2722 thread0.td_pcb = get_pcb_td(&thread0); 2723 thread0.td_pcb->pcb_save = get_pcb_user_save_td(&thread0); 2724 bzero(get_pcb_user_save_td(&thread0), cpu_max_ext_state_size); 2725 if (use_xsave) { 2726 xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) + 2727 1); 2728 xhdr->xstate_bv = xsave_mask; 2729 } 2730 PCPU_SET(curpcb, thread0.td_pcb); 2731 /* Move esp0 in the tss to its final place. 
*/ 2732 /* Note: -16 is so we can grow the trapframe if we came from vm86 */ 2733 PCPU_SET(common_tss.tss_esp0, (vm_offset_t)thread0.td_pcb - 16); 2734 gdt[GPROC0_SEL].sd.sd_type = SDT_SYS386TSS; /* clear busy bit */ 2735 ltr(gsel_tss); 2736 2737 /* make a call gate to reenter kernel with */ 2738 gdp = &ldt[LSYS5CALLS_SEL].gd; 2739 2740 x = (int) &IDTVEC(lcall_syscall); 2741 gdp->gd_looffset = x; 2742 gdp->gd_selector = GSEL(GCODE_SEL,SEL_KPL); 2743 gdp->gd_stkcpy = 1; 2744 gdp->gd_type = SDT_SYS386CGT; 2745 gdp->gd_dpl = SEL_UPL; 2746 gdp->gd_p = 1; 2747 gdp->gd_hioffset = x >> 16; 2748 2749 /* XXX does this work? */ 2750 /* XXX yes! */ 2751 ldt[LBSDICALLS_SEL] = ldt[LSYS5CALLS_SEL]; 2752 ldt[LSOL26CALLS_SEL] = ldt[LSYS5CALLS_SEL]; 2753 2754 /* transfer to user mode */ 2755 2756 _ucodesel = GSEL(GUCODE_SEL, SEL_UPL); 2757 _udatasel = GSEL(GUDATA_SEL, SEL_UPL); 2758 2759 /* setup proc 0's pcb */ 2760 thread0.td_pcb->pcb_flags = 0; 2761#if defined(PAE) || defined(PAE_TABLES) 2762 thread0.td_pcb->pcb_cr3 = (int)IdlePDPT; 2763#else 2764 thread0.td_pcb->pcb_cr3 = (int)IdlePTD; 2765#endif 2766 thread0.td_pcb->pcb_ext = 0; 2767 thread0.td_frame = &proc0_tf; 2768 2769 cpu_probe_amdc1e(); 2770 2771#ifdef FDT 2772 x86_init_fdt(); 2773#endif 2774 2775 /* Location of kernel stack for locore */ 2776 return ((register_t)thread0.td_pcb); 2777} 2778 2779void 2780cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size) 2781{ 2782 2783 pcpu->pc_acpi_id = 0xffffffff; 2784} 2785 2786#ifndef PC98 2787static int 2788smap_sysctl_handler(SYSCTL_HANDLER_ARGS) 2789{ 2790 struct bios_smap *smapbase; 2791 struct bios_smap_xattr smap; 2792 caddr_t kmdp; 2793 uint32_t *smapattr; 2794 int count, error, i; 2795 2796 /* Retrieve the system memory map from the loader. 
 */
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf32 kernel");
	smapbase = (struct bios_smap *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);
	if (smapbase == NULL)
		return (0);
	smapattr = (uint32_t *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
	/* The entry count is stored in the word just before the table. */
	count = *((u_int32_t *)smapbase - 1) / sizeof(*smapbase);
	error = 0;
	for (i = 0; i < count; i++) {
		smap.base = smapbase[i].base;
		smap.length = smapbase[i].length;
		smap.type = smapbase[i].type;
		if (smapattr != NULL)
			smap.xattr = smapattr[i];
		else
			smap.xattr = 0;
		error = SYSCTL_OUT(req, &smap, sizeof(smap));
	}
	return (error);
}
SYSCTL_PROC(_machdep, OID_AUTO, smap, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
    smap_sysctl_handler, "S,bios_smap_xattr", "Raw BIOS SMAP data");
#endif /* !PC98 */

/*
 * Enter a spinlock section: on the outermost entry, disable interrupts
 * and remember the previous interrupt state; nested entries only bump
 * the per-thread count.  Always enters a critical section.
 */
void
spinlock_enter(void)
{
	struct thread *td;
	register_t flags;

	td = curthread;
	if (td->td_md.md_spinlock_count == 0) {
		flags = intr_disable();
		td->td_md.md_spinlock_count = 1;
		td->td_md.md_saved_flags = flags;
	} else
		td->td_md.md_spinlock_count++;
	critical_enter();
}

/*
 * Leave a spinlock section; the interrupt state saved by the outermost
 * spinlock_enter() is restored when the count drops back to zero.
 */
void
spinlock_exit(void)
{
	struct thread *td;
	register_t flags;

	td = curthread;
	critical_exit();
	flags = td->td_md.md_saved_flags;
	td->td_md.md_spinlock_count--;
	if (td->td_md.md_spinlock_count == 0)
		intr_restore(flags);
}

#if defined(I586_CPU) && !defined(NO_F00F_HACK)
static void f00f_hack(void *unused);
SYSINIT(f00f_hack, SI_SUB_INTRINSIC, SI_ORDER_FIRST, f00f_hack, NULL);

/*
 * Pentium F00F workaround: relocate the IDT so that the problematic
 * entry (#6) sits at the end of a page that is then made read-only,
 * and point the IDTR at the copy.
 */
static void
f00f_hack(void *unused)
{
	struct gate_descriptor *new_idt;
	vm_offset_t tmp;

	if (!has_f00f_bug)
		return;

	GIANT_REQUIRED;

	printf("Intel Pentium detected, installing workaround for F00F bug\n");

	tmp = kmem_malloc(kernel_arena, PAGE_SIZE * 2, M_WAITOK | M_ZERO);
	if (tmp == 0)
		panic("kmem_malloc returned 0");

	/* Put the problematic entry (#6) at the end of the lower page. */
	new_idt = (struct gate_descriptor*)
	    (tmp + PAGE_SIZE - 7 * sizeof(struct gate_descriptor));
	bcopy(idt, new_idt, sizeof(idt0));
	r_idt.rd_base = (u_int)new_idt;
	lidt(&r_idt);
	idt = new_idt;
	pmap_protect(kernel_pmap, tmp, tmp + PAGE_SIZE, VM_PROT_READ);
}
#endif /* defined(I586_CPU) && !NO_F00F_HACK */

/*
 * Construct a PCB from a trapframe. This is called from kdb_trap() where
 * we want to start a backtrace from the function that caused us to enter
 * the debugger. We have the context in the trapframe, but base the trace
 * on the PCB. The PCB doesn't have to be perfect, as long as it contains
 * enough for a backtrace.
 */
void
makectx(struct trapframe *tf, struct pcb *pcb)
{

	pcb->pcb_edi = tf->tf_edi;
	pcb->pcb_esi = tf->tf_esi;
	pcb->pcb_ebp = tf->tf_ebp;
	pcb->pcb_ebx = tf->tf_ebx;
	pcb->pcb_eip = tf->tf_eip;
	/*
	 * For a kernel-mode frame (ISPL == 0) the esp slot of the frame
	 * is not pushed by the CPU; use the address just past the
	 * truncated frame instead.
	 */
	pcb->pcb_esp = (ISPL(tf->tf_cs)) ? tf->tf_esp : (int)(tf + 1) - 8;
	pcb->pcb_gs = rgs();
}

/* Set the program counter (eip) of a thread being debugged. */
int
ptrace_set_pc(struct thread *td, u_long addr)
{

	td->td_frame->tf_eip = addr;
	return (0);
}

/* Arm the trap flag so the thread single-steps on return to user mode. */
int
ptrace_single_step(struct thread *td)
{
	td->td_frame->tf_eflags |= PSL_T;
	return (0);
}

/* Disarm single-stepping for the thread. */
int
ptrace_clear_single_step(struct thread *td)
{
	td->td_frame->tf_eflags &= ~PSL_T;
	return (0);
}

/*
 * Fill a struct reg from a thread's trapframe; %gs comes from the PCB
 * rather than the frame.
 */
int
fill_regs(struct thread *td, struct reg *regs)
{
	struct pcb *pcb;
	struct trapframe *tp;

	tp = td->td_frame;
	pcb = td->td_pcb;
	regs->r_gs = pcb->pcb_gs;
	return (fill_frame_regs(tp, regs));
}

/* Copy the general registers of a trapframe into a struct reg. */
int
fill_frame_regs(struct trapframe *tp, struct reg *regs)
{
	regs->r_fs = tp->tf_fs;
	regs->r_es = tp->tf_es;
	regs->r_ds = tp->tf_ds;
	regs->r_edi = tp->tf_edi;
	regs->r_esi = tp->tf_esi;
	regs->r_ebp = tp->tf_ebp;
	regs->r_ebx = tp->tf_ebx;
	regs->r_edx = tp->tf_edx;
	regs->r_ecx = tp->tf_ecx;
	regs->r_eax = tp->tf_eax;
	regs->r_eip = tp->tf_eip;
	regs->r_cs = tp->tf_cs;
	regs->r_eflags = tp->tf_eflags;
	regs->r_esp = tp->tf_esp;
	regs->r_ss = tp->tf_ss;
	return (0);
}

/*
 * Install a struct reg into a thread's trapframe, refusing eflags/cs
 * values that fail the EFL_SECURE/CS_SECURE checks.
 */
int
set_regs(struct thread *td, struct reg *regs)
{
	struct pcb *pcb;
	struct trapframe *tp;

	tp = td->td_frame;
	if (!EFL_SECURE(regs->r_eflags, tp->tf_eflags) ||
	    !CS_SECURE(regs->r_cs))
		return (EINVAL);
	pcb = td->td_pcb;
	tp->tf_fs = regs->r_fs;
	tp->tf_es = regs->r_es;
	tp->tf_ds = regs->r_ds;
	tp->tf_edi = regs->r_edi;
	tp->tf_esi = regs->r_esi;
	tp->tf_ebp = regs->r_ebp;
	tp->tf_ebx = regs->r_ebx;
	tp->tf_edx = regs->r_edx;
	tp->tf_ecx = regs->r_ecx;
	tp->tf_eax = regs->r_eax;
	tp->tf_eip = regs->r_eip;
	tp->tf_cs = regs->r_cs;
	tp->tf_eflags = regs->r_eflags;
	tp->tf_esp = regs->r_esp;
	tp->tf_ss = regs->r_ss;
	pcb->pcb_gs = regs->r_gs;
	return (0);
}

/*
 * Fill a struct fpreg with the thread's FPU state.  The target must be
 * the current thread or stopped/suspended, as asserted below.
 */
int
fill_fpregs(struct thread *td, struct fpreg *fpregs)
{

	KASSERT(td == curthread || TD_IS_SUSPENDED(td) ||
	    P_SHOULDSTOP(td->td_proc),
	    ("not suspended thread %p", td));
	npxgetregs(td);
	if (cpu_fxsr)
		/* Convert the FXSAVE (xmm) layout to the legacy save87. */
		npx_fill_fpregs_xmm(&get_pcb_user_save_td(td)->sv_xmm,
		    (struct save87 *)fpregs);
	else
		bcopy(&get_pcb_user_save_td(td)->sv_87, fpregs,
		    sizeof(*fpregs));
	return (0);
}

/*
 * Install a struct fpreg as the thread's FPU state, converting to the
 * FXSAVE layout when the CPU uses it.
 */
int
set_fpregs(struct thread *td, struct fpreg *fpregs)
{

	if (cpu_fxsr)
		npx_set_fpregs_xmm((struct save87 *)fpregs,
		    &get_pcb_user_save_td(td)->sv_xmm);
	else
		bcopy(fpregs, &get_pcb_user_save_td(td)->sv_87,
		    sizeof(*fpregs));
	npxuserinited(td);
	return (0);
}

/*
 * Get machine context.
 */
int
get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
{
	struct trapframe *tp;
	struct segment_descriptor *sdp;

	tp = td->td_frame;

	PROC_LOCK(curthread->td_proc);
	mcp->mc_onstack = sigonstack(tp->tf_esp);
	PROC_UNLOCK(curthread->td_proc);
	mcp->mc_gs = td->td_pcb->pcb_gs;
	mcp->mc_fs = tp->tf_fs;
	mcp->mc_es = tp->tf_es;
	mcp->mc_ds = tp->tf_ds;
	mcp->mc_edi = tp->tf_edi;
	mcp->mc_esi = tp->tf_esi;
	mcp->mc_ebp = tp->tf_ebp;
	mcp->mc_isp = tp->tf_isp;
	mcp->mc_eflags = tp->tf_eflags;
	if (flags & GET_MC_CLEAR_RET) {
		/* Hide the syscall return values from the context. */
		mcp->mc_eax = 0;
		mcp->mc_edx = 0;
		mcp->mc_eflags &= ~PSL_C;
	} else {
		mcp->mc_eax = tp->tf_eax;
		mcp->mc_edx = tp->tf_edx;
	}
	mcp->mc_ebx = tp->tf_ebx;
	mcp->mc_ecx = tp->tf_ecx;
	mcp->mc_eip = tp->tf_eip;
	mcp->mc_cs = tp->tf_cs;
	mcp->mc_esp = tp->tf_esp;
	mcp->mc_ss = tp->tf_ss;
	mcp->mc_len = sizeof(*mcp);
	get_fpcontext(td, mcp, NULL, 0);
	/* Reassemble the 32-bit segment base from the descriptor fields. */
	sdp = &td->td_pcb->pcb_fsd;
	mcp->mc_fsbase = sdp->sd_hibase << 24 | sdp->sd_lobase;
3063 sdp = &td->td_pcb->pcb_gsd; 3064 mcp->mc_gsbase = sdp->sd_hibase << 24 | sdp->sd_lobase; 3065 mcp->mc_flags = 0; 3066 mcp->mc_xfpustate = 0; 3067 mcp->mc_xfpustate_len = 0; 3068 bzero(mcp->mc_spare2, sizeof(mcp->mc_spare2)); 3069 return (0); 3070} 3071 3072/* 3073 * Set machine context. 3074 * 3075 * However, we don't set any but the user modifiable flags, and we won't 3076 * touch the cs selector. 3077 */ 3078int 3079set_mcontext(struct thread *td, mcontext_t *mcp) 3080{ 3081 struct trapframe *tp; 3082 char *xfpustate; 3083 int eflags, ret; 3084 3085 tp = td->td_frame; 3086 if (mcp->mc_len != sizeof(*mcp) || 3087 (mcp->mc_flags & ~_MC_FLAG_MASK) != 0) 3088 return (EINVAL); 3089 eflags = (mcp->mc_eflags & PSL_USERCHANGE) | 3090 (tp->tf_eflags & ~PSL_USERCHANGE); 3091 if (mcp->mc_flags & _MC_HASFPXSTATE) { 3092 if (mcp->mc_xfpustate_len > cpu_max_ext_state_size - 3093 sizeof(union savefpu)) 3094 return (EINVAL); 3095 xfpustate = __builtin_alloca(mcp->mc_xfpustate_len); 3096 ret = copyin((void *)mcp->mc_xfpustate, xfpustate, 3097 mcp->mc_xfpustate_len); 3098 if (ret != 0) 3099 return (ret); 3100 } else 3101 xfpustate = NULL; 3102 ret = set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len); 3103 if (ret != 0) 3104 return (ret); 3105 tp->tf_fs = mcp->mc_fs; 3106 tp->tf_es = mcp->mc_es; 3107 tp->tf_ds = mcp->mc_ds; 3108 tp->tf_edi = mcp->mc_edi; 3109 tp->tf_esi = mcp->mc_esi; 3110 tp->tf_ebp = mcp->mc_ebp; 3111 tp->tf_ebx = mcp->mc_ebx; 3112 tp->tf_edx = mcp->mc_edx; 3113 tp->tf_ecx = mcp->mc_ecx; 3114 tp->tf_eax = mcp->mc_eax; 3115 tp->tf_eip = mcp->mc_eip; 3116 tp->tf_eflags = eflags; 3117 tp->tf_esp = mcp->mc_esp; 3118 tp->tf_ss = mcp->mc_ss; 3119 td->td_pcb->pcb_gs = mcp->mc_gs; 3120 return (0); 3121} 3122 3123static void 3124get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave, 3125 size_t xfpusave_len) 3126{ 3127 size_t max_len, len; 3128 3129 mcp->mc_ownedfp = npxgetregs(td); 3130 bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate[0], 3131 
sizeof(mcp->mc_fpstate)); 3132 mcp->mc_fpformat = npxformat(); 3133 if (!use_xsave || xfpusave_len == 0) 3134 return; 3135 max_len = cpu_max_ext_state_size - sizeof(union savefpu); 3136 len = xfpusave_len; 3137 if (len > max_len) { 3138 len = max_len; 3139 bzero(xfpusave + max_len, len - max_len); 3140 } 3141 mcp->mc_flags |= _MC_HASFPXSTATE; 3142 mcp->mc_xfpustate_len = len; 3143 bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len); 3144} 3145 3146static int 3147set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate, 3148 size_t xfpustate_len) 3149{ 3150 union savefpu *fpstate; 3151 int error; 3152 3153 if (mcp->mc_fpformat == _MC_FPFMT_NODEV) 3154 return (0); 3155 else if (mcp->mc_fpformat != _MC_FPFMT_387 && 3156 mcp->mc_fpformat != _MC_FPFMT_XMM) 3157 return (EINVAL); 3158 else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) { 3159 /* We don't care what state is left in the FPU or PCB. */ 3160 fpstate_drop(td); 3161 error = 0; 3162 } else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU || 3163 mcp->mc_ownedfp == _MC_FPOWNED_PCB) { 3164 fpstate = (union savefpu *)&mcp->mc_fpstate; 3165 if (cpu_fxsr) 3166 fpstate->sv_xmm.sv_env.en_mxcsr &= cpu_mxcsr_mask; 3167 error = npxsetregs(td, fpstate, xfpustate, xfpustate_len); 3168 } else 3169 return (EINVAL); 3170 return (error); 3171} 3172 3173static void 3174fpstate_drop(struct thread *td) 3175{ 3176 3177 KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu")); 3178 critical_enter(); 3179 if (PCPU_GET(fpcurthread) == td) 3180 npxdrop(); 3181 /* 3182 * XXX force a full drop of the npx. The above only drops it if we 3183 * owned it. npxgetregs() has the same bug in the !cpu_fxsr case. 3184 * 3185 * XXX I don't much like npxgetregs()'s semantics of doing a full 3186 * drop. Dropping only to the pcb matches fnsave's behaviour. 3187 * We only need to drop to !PCB_INITDONE in sendsig(). But 3188 * sendsig() is the only caller of npxgetregs()... perhaps we just 3189 * have too many layers. 
3190 */ 3191 curthread->td_pcb->pcb_flags &= ~(PCB_NPXINITDONE | 3192 PCB_NPXUSERINITDONE); 3193 critical_exit(); 3194} 3195 3196int 3197fill_dbregs(struct thread *td, struct dbreg *dbregs) 3198{ 3199 struct pcb *pcb; 3200 3201 if (td == NULL) { 3202 dbregs->dr[0] = rdr0(); 3203 dbregs->dr[1] = rdr1(); 3204 dbregs->dr[2] = rdr2(); 3205 dbregs->dr[3] = rdr3(); 3206 dbregs->dr[4] = rdr4(); 3207 dbregs->dr[5] = rdr5(); 3208 dbregs->dr[6] = rdr6(); 3209 dbregs->dr[7] = rdr7(); 3210 } else { 3211 pcb = td->td_pcb; 3212 dbregs->dr[0] = pcb->pcb_dr0; 3213 dbregs->dr[1] = pcb->pcb_dr1; 3214 dbregs->dr[2] = pcb->pcb_dr2; 3215 dbregs->dr[3] = pcb->pcb_dr3; 3216 dbregs->dr[4] = 0; 3217 dbregs->dr[5] = 0; 3218 dbregs->dr[6] = pcb->pcb_dr6; 3219 dbregs->dr[7] = pcb->pcb_dr7; 3220 } 3221 return (0); 3222} 3223 3224int 3225set_dbregs(struct thread *td, struct dbreg *dbregs) 3226{ 3227 struct pcb *pcb; 3228 int i; 3229 3230 if (td == NULL) { 3231 load_dr0(dbregs->dr[0]); 3232 load_dr1(dbregs->dr[1]); 3233 load_dr2(dbregs->dr[2]); 3234 load_dr3(dbregs->dr[3]); 3235 load_dr4(dbregs->dr[4]); 3236 load_dr5(dbregs->dr[5]); 3237 load_dr6(dbregs->dr[6]); 3238 load_dr7(dbregs->dr[7]); 3239 } else { 3240 /* 3241 * Don't let an illegal value for dr7 get set. Specifically, 3242 * check for undefined settings. Setting these bit patterns 3243 * result in undefined behaviour and can lead to an unexpected 3244 * TRCTRAP. 3245 */ 3246 for (i = 0; i < 4; i++) { 3247 if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02) 3248 return (EINVAL); 3249 if (DBREG_DR7_LEN(dbregs->dr[7], i) == 0x02) 3250 return (EINVAL); 3251 } 3252 3253 pcb = td->td_pcb; 3254 3255 /* 3256 * Don't let a process set a breakpoint that is not within the 3257 * process's address space. If a process could do this, it 3258 * could halt the system by setting a breakpoint in the kernel 3259 * (if ddb was enabled). 
Thus, we need to check to make sure 3260 * that no breakpoints are being enabled for addresses outside 3261 * process's address space. 3262 * 3263 * XXX - what about when the watched area of the user's 3264 * address space is written into from within the kernel 3265 * ... wouldn't that still cause a breakpoint to be generated 3266 * from within kernel mode? 3267 */ 3268 3269 if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) { 3270 /* dr0 is enabled */ 3271 if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS) 3272 return (EINVAL); 3273 } 3274 3275 if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) { 3276 /* dr1 is enabled */ 3277 if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS) 3278 return (EINVAL); 3279 } 3280 3281 if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) { 3282 /* dr2 is enabled */ 3283 if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS) 3284 return (EINVAL); 3285 } 3286 3287 if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) { 3288 /* dr3 is enabled */ 3289 if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS) 3290 return (EINVAL); 3291 } 3292 3293 pcb->pcb_dr0 = dbregs->dr[0]; 3294 pcb->pcb_dr1 = dbregs->dr[1]; 3295 pcb->pcb_dr2 = dbregs->dr[2]; 3296 pcb->pcb_dr3 = dbregs->dr[3]; 3297 pcb->pcb_dr6 = dbregs->dr[6]; 3298 pcb->pcb_dr7 = dbregs->dr[7]; 3299 3300 pcb->pcb_flags |= PCB_DBREGS; 3301 } 3302 3303 return (0); 3304} 3305 3306/* 3307 * Return > 0 if a hardware breakpoint has been hit, and the 3308 * breakpoint was in user space. Return 0, otherwise. 
3309 */ 3310int 3311user_dbreg_trap(void) 3312{ 3313 u_int32_t dr7, dr6; /* debug registers dr6 and dr7 */ 3314 u_int32_t bp; /* breakpoint bits extracted from dr6 */ 3315 int nbp; /* number of breakpoints that triggered */ 3316 caddr_t addr[4]; /* breakpoint addresses */ 3317 int i; 3318 3319 dr7 = rdr7(); 3320 if ((dr7 & 0x000000ff) == 0) { 3321 /* 3322 * all GE and LE bits in the dr7 register are zero, 3323 * thus the trap couldn't have been caused by the 3324 * hardware debug registers 3325 */ 3326 return 0; 3327 } 3328 3329 nbp = 0; 3330 dr6 = rdr6(); 3331 bp = dr6 & 0x0000000f; 3332 3333 if (!bp) { 3334 /* 3335 * None of the breakpoint bits are set meaning this 3336 * trap was not caused by any of the debug registers 3337 */ 3338 return 0; 3339 } 3340 3341 /* 3342 * at least one of the breakpoints were hit, check to see 3343 * which ones and if any of them are user space addresses 3344 */ 3345 3346 if (bp & 0x01) { 3347 addr[nbp++] = (caddr_t)rdr0(); 3348 } 3349 if (bp & 0x02) { 3350 addr[nbp++] = (caddr_t)rdr1(); 3351 } 3352 if (bp & 0x04) { 3353 addr[nbp++] = (caddr_t)rdr2(); 3354 } 3355 if (bp & 0x08) { 3356 addr[nbp++] = (caddr_t)rdr3(); 3357 } 3358 3359 for (i = 0; i < nbp; i++) { 3360 if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) { 3361 /* 3362 * addr[i] is in user space 3363 */ 3364 return nbp; 3365 } 3366 } 3367 3368 /* 3369 * None of the breakpoints are in user space. 3370 */ 3371 return 0; 3372} 3373 3374#ifdef KDB 3375 3376/* 3377 * Provide inb() and outb() as functions. They are normally only available as 3378 * inline functions, thus cannot be called from the debugger. 3379 */ 3380 3381/* silence compiler warnings */ 3382u_char inb_(u_short); 3383void outb_(u_short, u_char); 3384 3385u_char 3386inb_(u_short port) 3387{ 3388 return inb(port); 3389} 3390 3391void 3392outb_(u_short port, u_char data) 3393{ 3394 outb(port, data); 3395} 3396 3397#endif /* KDB */ 3398