machdep.c: FreeBSD stable/11 sys/amd64/amd64/machdep.c, revision 341491 (old) vs. revision 347568 (new)
1/*-
2 * Copyright (c) 2003 Peter Wemm.
3 * Copyright (c) 1992 Terrence R. Lambert.
4 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to Berkeley by
8 * William Jolitz.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 * must display the following acknowledgement:
20 * This product includes software developed by the University of
21 * California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 * may be used to endorse or promote products derived from this software
24 * without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91
39 */
40
41#include <sys/cdefs.h>
42__FBSDID("$FreeBSD: stable/11/sys/amd64/amd64/machdep.c 341491 2018-12-04 19:07:10Z markj $");
42__FBSDID("$FreeBSD: stable/11/sys/amd64/amd64/machdep.c 347568 2019-05-14 17:05:02Z kib $");
43
44#include "opt_atpic.h"
45#include "opt_compat.h"
46#include "opt_cpu.h"
47#include "opt_ddb.h"
48#include "opt_inet.h"
49#include "opt_isa.h"
50#include "opt_kstack_pages.h"
51#include "opt_maxmem.h"
52#include "opt_mp_watchdog.h"
53#include "opt_perfmon.h"
54#include "opt_platform.h"
55#include "opt_sched.h"
56
57#include <sys/param.h>
58#include <sys/proc.h>
59#include <sys/systm.h>
60#include <sys/bio.h>
61#include <sys/buf.h>
62#include <sys/bus.h>
63#include <sys/callout.h>
64#include <sys/cons.h>
65#include <sys/cpu.h>
66#include <sys/efi.h>
67#include <sys/eventhandler.h>
68#include <sys/exec.h>
69#include <sys/imgact.h>
70#include <sys/kdb.h>
71#include <sys/kernel.h>
72#include <sys/ktr.h>
73#include <sys/linker.h>
74#include <sys/lock.h>
75#include <sys/malloc.h>
76#include <sys/memrange.h>
77#include <sys/msgbuf.h>
78#include <sys/mutex.h>
79#include <sys/pcpu.h>
80#include <sys/ptrace.h>
81#include <sys/reboot.h>
82#include <sys/rwlock.h>
83#include <sys/sched.h>
84#include <sys/signalvar.h>
85#ifdef SMP
86#include <sys/smp.h>
87#endif
88#include <sys/syscallsubr.h>
89#include <sys/sysctl.h>
90#include <sys/sysent.h>
91#include <sys/sysproto.h>
92#include <sys/ucontext.h>
93#include <sys/vmmeter.h>
94
95#include <vm/vm.h>
96#include <vm/vm_extern.h>
97#include <vm/vm_kern.h>
98#include <vm/vm_page.h>
99#include <vm/vm_map.h>
100#include <vm/vm_object.h>
101#include <vm/vm_pager.h>
102#include <vm/vm_param.h>
103#include <vm/vm_phys.h>
104
105#ifdef DDB
106#ifndef KDB
107#error KDB must be enabled in order for DDB to work!
108#endif
109#include <ddb/ddb.h>
110#include <ddb/db_sym.h>
111#endif
112
113#include <net/netisr.h>
114
115#include <machine/clock.h>
116#include <machine/cpu.h>
117#include <machine/cputypes.h>
118#include <machine/frame.h>
119#include <machine/intr_machdep.h>
120#include <x86/mca.h>
121#include <machine/md_var.h>
122#include <machine/metadata.h>
123#include <machine/mp_watchdog.h>
124#include <machine/pc/bios.h>
125#include <machine/pcb.h>
126#include <machine/proc.h>
127#include <machine/reg.h>
128#include <machine/sigframe.h>
129#include <machine/specialreg.h>
130#ifdef PERFMON
131#include <machine/perfmon.h>
132#endif
133#include <machine/tss.h>
134#ifdef SMP
135#include <machine/smp.h>
136#endif
137#ifdef FDT
138#include <x86/fdt.h>
139#endif
140
141#ifdef DEV_ATPIC
142#include <x86/isa/icu.h>
143#else
144#include <x86/apicvar.h>
145#endif
146
147#include <isa/isareg.h>
148#include <isa/rtc.h>
149#include <x86/init.h>
150
151/* Sanity check for __curthread() */
152CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
153
154/*
155 * The PTI trampoline stack needs enough space for a hardware trapframe and a
156 * couple of scratch registers, as well as the trapframe left behind after an
157 * iret fault.
158 */
159CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) -
160 offsetof(struct pti_frame, pti_rip));
161
162extern u_int64_t hammer_time(u_int64_t, u_int64_t);
163
164#define CS_SECURE(cs) (ISPL(cs) == SEL_UPL)
165#define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
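/*
 * CS_SECURE() accepts a code selector only if its privilege level is
 * user (RPL == SEL_UPL); EFL_SECURE() accepts a new rflags value only
 * if it differs from the old one exclusively in the PSL_USERCHANGE
 * bits, i.e. the flags that user code is permitted to modify.
 */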
166
167static void cpu_startup(void *);
168static void get_fpcontext(struct thread *td, mcontext_t *mcp,
169 char *xfpusave, size_t xfpusave_len);
170static int set_fpcontext(struct thread *td, mcontext_t *mcp,
171 char *xfpustate, size_t xfpustate_len);
172SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
173
174/* Preload data parse function */
175static caddr_t native_parse_preload_data(u_int64_t);
176
177/* Native function to fetch and parse the e820 map */
178static void native_parse_memmap(caddr_t, vm_paddr_t *, int *);
179
180/* Default init_ops implementation. */
181struct init_ops init_ops = {
182 .parse_preload_data = native_parse_preload_data,
183 .early_clock_source_init = i8254_init,
184 .early_delay = i8254_delay,
185 .parse_memmap = native_parse_memmap,
186#ifdef SMP
187 .mp_bootaddress = mp_bootaddress,
188 .start_all_aps = native_start_all_aps,
189#endif
190 .msi_init = msi_init,
191};
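/*
 * The indirection through init_ops lets alternative boot environments
 * substitute their own hooks for these early steps; the Xen PVH entry
 * path is believed to install its own table (see <x86/init.h>).
 */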
192
193struct msgbuf *msgbufp;
194
195/*
196 * Physical address of the EFI System Table. Stashed from the metadata hints
197 * passed into the kernel and used by the EFI code to call runtime services.
198 */
199vm_paddr_t efi_systbl_phys;
200
201/* Intel ICH registers */
202#define ICH_PMBASE 0x400
203#define ICH_SMI_EN (ICH_PMBASE + 0x30)
204
205int _udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel;
206
207int cold = 1;
208
209long Maxmem = 0;
210long realmem = 0;
211
212/*
213 * The number of PHYSMAP entries must be one less than the number of
214 * PHYSSEG entries because the PHYSMAP entry that spans the largest
215 * physical address that is accessible by ISA DMA is split into two
216 * PHYSSEG entries.
217 */
218#define PHYSMAP_SIZE (2 * (VM_PHYSSEG_MAX - 1))
219
220vm_paddr_t phys_avail[PHYSMAP_SIZE + 2];
221vm_paddr_t dump_avail[PHYSMAP_SIZE + 2];
222
223/* must be 2 less so 0 0 can signal end of chunks */
224#define PHYS_AVAIL_ARRAY_END (nitems(phys_avail) - 2)
225#define DUMP_AVAIL_ARRAY_END (nitems(dump_avail) - 2)
226
227struct kva_md_info kmi;
228
229static struct trapframe proc0_tf;
230struct region_descriptor r_gdt, r_idt;
231
232struct pcpu __pcpu[MAXCPU];
233
234struct mtx icu_lock;
235
236struct mem_range_softc mem_range_softc;
237
238struct mtx dt_lock; /* lock for GDT and LDT */
239
240void (*vmm_resume_p)(void);
241
242static void
243cpu_startup(dummy)
244 void *dummy;
245{
246 uintmax_t memsize;
247 char *sysenv;
248
249 /*
250 * On MacBooks, we must prevent the legacy USB circuit from
251 * generating an SMI#, because this can cause several problems,
252 * namely: incorrect CPU frequency detection and failure to
253 * start the APs.
254 * We do this by disabling a bit in the SMI_EN (SMI Control and
255 * Enable) register of the Intel ICH LPC Interface Bridge.
256 */
257 sysenv = kern_getenv("smbios.system.product");
258 if (sysenv != NULL) {
259 if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
260 strncmp(sysenv, "MacBook3,1", 10) == 0 ||
261 strncmp(sysenv, "MacBook4,1", 10) == 0 ||
262 strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
263 strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
264 strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
265 strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
266 strncmp(sysenv, "Macmini1,1", 10) == 0) {
267 if (bootverbose)
268 printf("Disabling LEGACY_USB_EN bit on "
269 "Intel ICH.\n");
270 outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
271 }
272 freeenv(sysenv);
273 }
274
275 /*
276 * Good {morning,afternoon,evening,night}.
277 */
278 startrtclock();
279 printcpuinfo();
280#ifdef PERFMON
281 perfmon_init();
282#endif
283
284 /*
285 * Display physical memory if SMBIOS reports reasonable amount.
286 */
287 memsize = 0;
288 sysenv = kern_getenv("smbios.memory.enabled");
289 if (sysenv != NULL) {
290 memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
291 freeenv(sysenv);
292 }
293 if (memsize < ptoa((uintmax_t)vm_cnt.v_free_count))
294 memsize = ptoa((uintmax_t)Maxmem);
295 printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20);
296 realmem = atop(memsize);
297
298 /*
299 * Display any holes after the first chunk of extended memory.
300 */
301 if (bootverbose) {
302 int indx;
303
304 printf("Physical memory chunk(s):\n");
305 for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
306 vm_paddr_t size;
307
308 size = phys_avail[indx + 1] - phys_avail[indx];
309 printf(
310 "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
311 (uintmax_t)phys_avail[indx],
312 (uintmax_t)phys_avail[indx + 1] - 1,
313 (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
314 }
315 }
316
317 vm_ksubmap_init(&kmi);
318
319 printf("avail memory = %ju (%ju MB)\n",
320 ptoa((uintmax_t)vm_cnt.v_free_count),
321 ptoa((uintmax_t)vm_cnt.v_free_count) / 1048576);
322
323 /*
324 * Set up buffers, so they can be used to read disk labels.
325 */
326 bufinit();
327 vm_pager_bufferinit();
328
329 cpu_setregs();
330}
331
332/*
333 * Send an interrupt (signal) to a process.
334 *
335 * The stack is set up so that the sigcode stored at the top of the
336 * stack calls the handler, followed by a call to the sigreturn
337 * routine below.  After sigreturn resets the signal mask, the stack,
338 * and the frame pointer, it returns to the pc and psl specified by
339 * the user context.
340 */
342void
343sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
344{
345 struct sigframe sf, *sfp;
346 struct pcb *pcb;
347 struct proc *p;
348 struct thread *td;
349 struct sigacts *psp;
350 char *sp;
351 struct trapframe *regs;
352 char *xfpusave;
353 size_t xfpusave_len;
354 int sig;
355 int oonstack;
356
357 td = curthread;
358 pcb = td->td_pcb;
359 p = td->td_proc;
360 PROC_LOCK_ASSERT(p, MA_OWNED);
361 sig = ksi->ksi_signo;
362 psp = p->p_sigacts;
363 mtx_assert(&psp->ps_mtx, MA_OWNED);
364 regs = td->td_frame;
365 oonstack = sigonstack(regs->tf_rsp);
366
367 if (cpu_max_ext_state_size > sizeof(struct savefpu) && use_xsave) {
368 xfpusave_len = cpu_max_ext_state_size - sizeof(struct savefpu);
369 xfpusave = __builtin_alloca(xfpusave_len);
370 } else {
371 xfpusave_len = 0;
372 xfpusave = NULL;
373 }
374
375 /* Save user context. */
376 bzero(&sf, sizeof(sf));
377 sf.sf_uc.uc_sigmask = *mask;
378 sf.sf_uc.uc_stack = td->td_sigstk;
379 sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
380 ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
381 sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
382 bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(*regs));
383 sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
384 get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len);
385 fpstate_drop(td);
386 update_pcb_bases(pcb);
387 sf.sf_uc.uc_mcontext.mc_fsbase = pcb->pcb_fsbase;
388 sf.sf_uc.uc_mcontext.mc_gsbase = pcb->pcb_gsbase;
389 bzero(sf.sf_uc.uc_mcontext.mc_spare,
390 sizeof(sf.sf_uc.uc_mcontext.mc_spare));
391 bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__));
392
393 /* Allocate space for the signal handler context. */
394 if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
395 SIGISMEMBER(psp->ps_sigonstack, sig)) {
396 sp = (char *)td->td_sigstk.ss_sp + td->td_sigstk.ss_size;
397#if defined(COMPAT_43)
398 td->td_sigstk.ss_flags |= SS_ONSTACK;
399#endif
400 } else
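		/*
		 * Leave 128 bytes of headroom below the stack pointer:
		 * the amd64 ABI reserves that region (the "red zone")
		 * for the interrupted code, so the signal frame must
		 * not overwrite it.
		 */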
401 sp = (char *)regs->tf_rsp - 128;
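	/*
	 * Any extended FPU state is placed first, aligned down to a
	 * 64-byte boundary (~0x3Ful) as XSAVE/XRSTOR require.
	 */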
402 if (xfpusave != NULL) {
403 sp -= xfpusave_len;
404 sp = (char *)((unsigned long)sp & ~0x3Ful);
405 sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp;
406 }
407 sp -= sizeof(struct sigframe);
408 /* Align to 16 bytes. */
409 sfp = (struct sigframe *)((unsigned long)sp & ~0xFul);
410
411 /* Build the argument list for the signal handler. */
412 regs->tf_rdi = sig; /* arg 1 in %rdi */
413 regs->tf_rdx = (register_t)&sfp->sf_uc; /* arg 3 in %rdx */
414 bzero(&sf.sf_si, sizeof(sf.sf_si));
415 if (SIGISMEMBER(psp->ps_siginfo, sig)) {
416 /* Signal handler installed with SA_SIGINFO. */
417 regs->tf_rsi = (register_t)&sfp->sf_si; /* arg 2 in %rsi */
418 sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;
419
420 /* Fill in POSIX parts */
421 sf.sf_si = ksi->ksi_info;
422 sf.sf_si.si_signo = sig; /* maybe a translated signal */
423 regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
424 } else {
425 /* Old FreeBSD-style arguments. */
426 regs->tf_rsi = ksi->ksi_code; /* arg 2 in %rsi */
427 regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
428 sf.sf_ahu.sf_handler = catcher;
429 }
430 mtx_unlock(&psp->ps_mtx);
431 PROC_UNLOCK(p);
432
433 /*
434 * Copy the sigframe out to the user's stack.
435 */
436 if (copyout(&sf, sfp, sizeof(*sfp)) != 0 ||
437 (xfpusave != NULL && copyout(xfpusave,
438 (void *)sf.sf_uc.uc_mcontext.mc_xfpustate, xfpusave_len)
439 != 0)) {
440#ifdef DEBUG
441 printf("process %ld has trashed its stack\n", (long)p->p_pid);
442#endif
443 PROC_LOCK(p);
444 sigexit(td, SIGILL);
445 }
446
447 regs->tf_rsp = (long)sfp;
448 regs->tf_rip = p->p_sysent->sv_sigcode_base;
449 regs->tf_rflags &= ~(PSL_T | PSL_D);
450 regs->tf_cs = _ucodesel;
451 regs->tf_ds = _udatasel;
452 regs->tf_ss = _udatasel;
453 regs->tf_es = _udatasel;
454 regs->tf_fs = _ufssel;
455 regs->tf_gs = _ugssel;
456 regs->tf_flags = TF_HASSEGS;
457 PROC_LOCK(p);
458 mtx_lock(&psp->ps_mtx);
459}
460
461/*
462 * System call to clean up state after a signal has been taken.
463 * Resets the signal mask and stack state from the context left by
464 * sendsig (above) and returns to the previous pc and psl as
465 * specified by that context.  Checks carefully to make sure that
466 * the user has not modified the state to gain improper privileges.
467 *
468 * MPSAFE
469 */
472int
473sys_sigreturn(td, uap)
474 struct thread *td;
475 struct sigreturn_args /* {
476 const struct __ucontext *sigcntxp;
477 } */ *uap;
478{
479 ucontext_t uc;
480 struct pcb *pcb;
481 struct proc *p;
482 struct trapframe *regs;
483 ucontext_t *ucp;
484 char *xfpustate;
485 size_t xfpustate_len;
486 long rflags;
487 int cs, error, ret;
488 ksiginfo_t ksi;
489
490 pcb = td->td_pcb;
491 p = td->td_proc;
492
493 error = copyin(uap->sigcntxp, &uc, sizeof(uc));
494 if (error != 0) {
495 uprintf("pid %d (%s): sigreturn copyin failed\n",
496 p->p_pid, td->td_name);
497 return (error);
498 }
499 ucp = &uc;
500 if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) {
501 uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid,
502 td->td_name, ucp->uc_mcontext.mc_flags);
503 return (EINVAL);
504 }
505 regs = td->td_frame;
506 rflags = ucp->uc_mcontext.mc_rflags;
507 /*
508 * Don't allow users to change privileged or reserved flags.
509 */
510 if (!EFL_SECURE(rflags, regs->tf_rflags)) {
511 uprintf("pid %d (%s): sigreturn rflags = 0x%lx\n", p->p_pid,
512 td->td_name, rflags);
513 return (EINVAL);
514 }
515
516 /*
517 * Don't allow users to load a valid privileged %cs. Let the
518 * hardware check for invalid selectors, excess privilege in
519 * other selectors, invalid %eip's and invalid %esp's.
520 */
521 cs = ucp->uc_mcontext.mc_cs;
522 if (!CS_SECURE(cs)) {
523 uprintf("pid %d (%s): sigreturn cs = 0x%x\n", p->p_pid,
524 td->td_name, cs);
525 ksiginfo_init_trap(&ksi);
526 ksi.ksi_signo = SIGBUS;
527 ksi.ksi_code = BUS_OBJERR;
528 ksi.ksi_trapno = T_PROTFLT;
529 ksi.ksi_addr = (void *)regs->tf_rip;
530 trapsignal(td, &ksi);
531 return (EINVAL);
532 }
533
534 if ((uc.uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) {
535 xfpustate_len = uc.uc_mcontext.mc_xfpustate_len;
536 if (xfpustate_len > cpu_max_ext_state_size -
537 sizeof(struct savefpu)) {
538 uprintf("pid %d (%s): sigreturn xfpusave_len = 0x%zx\n",
539 p->p_pid, td->td_name, xfpustate_len);
540 return (EINVAL);
541 }
542 xfpustate = __builtin_alloca(xfpustate_len);
543 error = copyin((const void *)uc.uc_mcontext.mc_xfpustate,
544 xfpustate, xfpustate_len);
545 if (error != 0) {
546 uprintf(
547 "pid %d (%s): sigreturn copying xfpustate failed\n",
548 p->p_pid, td->td_name);
549 return (error);
550 }
551 } else {
552 xfpustate = NULL;
553 xfpustate_len = 0;
554 }
555 ret = set_fpcontext(td, &ucp->uc_mcontext, xfpustate, xfpustate_len);
556 if (ret != 0) {
557 uprintf("pid %d (%s): sigreturn set_fpcontext err %d\n",
558 p->p_pid, td->td_name, ret);
559 return (ret);
560 }
561 bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(*regs));
562 update_pcb_bases(pcb);
563 pcb->pcb_fsbase = ucp->uc_mcontext.mc_fsbase;
564 pcb->pcb_gsbase = ucp->uc_mcontext.mc_gsbase;
565
566#if defined(COMPAT_43)
567 if (ucp->uc_mcontext.mc_onstack & 1)
568 td->td_sigstk.ss_flags |= SS_ONSTACK;
569 else
570 td->td_sigstk.ss_flags &= ~SS_ONSTACK;
571#endif
572
573 kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
574 return (EJUSTRETURN);
575}
576
577#ifdef COMPAT_FREEBSD4
578int
579freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap)
580{
581
582 return sys_sigreturn(td, (struct sigreturn_args *)uap);
583}
584#endif
585
586/*
587 * Reset registers to default values on exec.
588 */
589void
590exec_setregs(struct thread *td, struct image_params *imgp, u_long stack)
591{
592 struct trapframe *regs;
593 struct pcb *pcb;
594 register_t saved_rflags;
595
596 regs = td->td_frame;
597 pcb = td->td_pcb;
598
599 mtx_lock(&dt_lock);
600 if (td->td_proc->p_md.md_ldt != NULL)
601 user_ldt_free(td);
602 else
603 mtx_unlock(&dt_lock);
604
605 update_pcb_bases(pcb);
606 pcb->pcb_fsbase = 0;
607 pcb->pcb_gsbase = 0;
608 clear_pcb_flags(pcb, PCB_32BIT);
609 pcb->pcb_initial_fpucw = __INITIAL_FPUCW__;
610
611 saved_rflags = regs->tf_rflags & PSL_T;
612 bzero((char *)regs, sizeof(struct trapframe));
613 regs->tf_rip = imgp->entry_addr;
614 regs->tf_rsp = ((stack - 8) & ~0xFul) + 8;
615 regs->tf_rdi = stack; /* argv */
616 regs->tf_rflags = PSL_USER | saved_rflags;
617 regs->tf_ss = _udatasel;
618 regs->tf_cs = _ucodesel;
619 regs->tf_ds = _udatasel;
620 regs->tf_es = _udatasel;
621 regs->tf_fs = _ufssel;
622 regs->tf_gs = _ugssel;
623 regs->tf_flags = TF_HASSEGS;
624 td->td_retval[1] = 0;
625
626 /*
627 * Reset the hardware debug registers if they were in use.
628 * They won't have any meaning for the newly exec'd process.
629 */
630 if (pcb->pcb_flags & PCB_DBREGS) {
631 pcb->pcb_dr0 = 0;
632 pcb->pcb_dr1 = 0;
633 pcb->pcb_dr2 = 0;
634 pcb->pcb_dr3 = 0;
635 pcb->pcb_dr6 = 0;
636 pcb->pcb_dr7 = 0;
637 if (pcb == curpcb) {
638 /*
639 * Clear the debug registers on the running
640 * CPU, otherwise they will end up affecting
641 * the next process we switch to.
642 */
643 reset_dbregs();
644 }
645 clear_pcb_flags(pcb, PCB_DBREGS);
646 }
647
648 /*
649 * Drop the FP state if we hold it, so that the process gets a
650 * clean FP state if it uses the FPU again.
651 */
652 fpstate_drop(td);
653}
654
655void
656cpu_setregs(void)
657{
658 register_t cr0;
659
660 cr0 = rcr0();
661 /*
662 * CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the
663 * BSP. See the comments there about why we set them.
664 */
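	/*
	 * CR0_WP makes supervisor-mode writes honor page-level write
	 * protection, and CR0_AM permits user-mode alignment checking
	 * (in combination with PSL_AC).
	 */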
665 cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
666 load_cr0(cr0);
667}
668
669/*
670 * Initialize amd64 and configure to run kernel
671 */
672
673/*
674 * Initialize segments & interrupt table
675 */
676
677struct user_segment_descriptor gdt[NGDT * MAXCPU];/* global descriptor tables */
678static struct gate_descriptor idt0[NIDT];
679struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */
680
681static char dblfault_stack[PAGE_SIZE] __aligned(16);
682static char mce0_stack[PAGE_SIZE] __aligned(16);
683static char nmi0_stack[PAGE_SIZE] __aligned(16);
684static char dbg0_stack[PAGE_SIZE] __aligned(16);
685CTASSERT(sizeof(struct nmi_pcpu) == 16);
686
687struct amd64tss common_tss[MAXCPU];
688
689/*
690 * Software prototypes -- in more palatable form.
691 *
692 * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
693 * slots as corresponding segments for i386 kernel.
694 */
695struct soft_segment_descriptor gdt_segs[] = {
696/* GNULL_SEL 0 Null Descriptor */
697{ .ssd_base = 0x0,
698 .ssd_limit = 0x0,
699 .ssd_type = 0,
700 .ssd_dpl = 0,
701 .ssd_p = 0,
702 .ssd_long = 0,
703 .ssd_def32 = 0,
704 .ssd_gran = 0 },
705/* GNULL2_SEL 1 Null Descriptor */
706{ .ssd_base = 0x0,
707 .ssd_limit = 0x0,
708 .ssd_type = 0,
709 .ssd_dpl = 0,
710 .ssd_p = 0,
711 .ssd_long = 0,
712 .ssd_def32 = 0,
713 .ssd_gran = 0 },
714/* GUFS32_SEL 2 32 bit %fs Descriptor for user */
715{ .ssd_base = 0x0,
716 .ssd_limit = 0xfffff,
717 .ssd_type = SDT_MEMRWA,
718 .ssd_dpl = SEL_UPL,
719 .ssd_p = 1,
720 .ssd_long = 0,
721 .ssd_def32 = 1,
722 .ssd_gran = 1 },
723/* GUGS32_SEL 3 32 bit %gs Descriptor for user */
724{ .ssd_base = 0x0,
725 .ssd_limit = 0xfffff,
726 .ssd_type = SDT_MEMRWA,
727 .ssd_dpl = SEL_UPL,
728 .ssd_p = 1,
729 .ssd_long = 0,
730 .ssd_def32 = 1,
731 .ssd_gran = 1 },
732/* GCODE_SEL 4 Code Descriptor for kernel */
733{ .ssd_base = 0x0,
734 .ssd_limit = 0xfffff,
735 .ssd_type = SDT_MEMERA,
736 .ssd_dpl = SEL_KPL,
737 .ssd_p = 1,
738 .ssd_long = 1,
739 .ssd_def32 = 0,
740 .ssd_gran = 1 },
741/* GDATA_SEL 5 Data Descriptor for kernel */
742{ .ssd_base = 0x0,
743 .ssd_limit = 0xfffff,
744 .ssd_type = SDT_MEMRWA,
745 .ssd_dpl = SEL_KPL,
746 .ssd_p = 1,
747 .ssd_long = 1,
748 .ssd_def32 = 0,
749 .ssd_gran = 1 },
750/* GUCODE32_SEL 6 32 bit Code Descriptor for user */
751{ .ssd_base = 0x0,
752 .ssd_limit = 0xfffff,
753 .ssd_type = SDT_MEMERA,
754 .ssd_dpl = SEL_UPL,
755 .ssd_p = 1,
756 .ssd_long = 0,
757 .ssd_def32 = 1,
758 .ssd_gran = 1 },
759/* GUDATA_SEL 7 32/64 bit Data Descriptor for user */
760{ .ssd_base = 0x0,
761 .ssd_limit = 0xfffff,
762 .ssd_type = SDT_MEMRWA,
763 .ssd_dpl = SEL_UPL,
764 .ssd_p = 1,
765 .ssd_long = 0,
766 .ssd_def32 = 1,
767 .ssd_gran = 1 },
768/* GUCODE_SEL 8 64 bit Code Descriptor for user */
769{ .ssd_base = 0x0,
770 .ssd_limit = 0xfffff,
771 .ssd_type = SDT_MEMERA,
772 .ssd_dpl = SEL_UPL,
773 .ssd_p = 1,
774 .ssd_long = 1,
775 .ssd_def32 = 0,
776 .ssd_gran = 1 },
777/* GPROC0_SEL 9 Proc 0 Tss Descriptor */
778{ .ssd_base = 0x0,
779 .ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
780 .ssd_type = SDT_SYSTSS,
781 .ssd_dpl = SEL_KPL,
782 .ssd_p = 1,
783 .ssd_long = 0,
784 .ssd_def32 = 0,
785 .ssd_gran = 0 },
786/* The TSS is actually a double-size system descriptor; this slot holds its upper half */
787{ .ssd_base = 0x0,
788 .ssd_limit = 0x0,
789 .ssd_type = 0,
790 .ssd_dpl = 0,
791 .ssd_p = 0,
792 .ssd_long = 0,
793 .ssd_def32 = 0,
794 .ssd_gran = 0 },
795/* GUSERLDT_SEL 11 LDT Descriptor */
796{ .ssd_base = 0x0,
797 .ssd_limit = 0x0,
798 .ssd_type = 0,
799 .ssd_dpl = 0,
800 .ssd_p = 0,
801 .ssd_long = 0,
802 .ssd_def32 = 0,
803 .ssd_gran = 0 },
804/* GUSERLDT_SEL 12 LDT Descriptor, double size */
805{ .ssd_base = 0x0,
806 .ssd_limit = 0x0,
807 .ssd_type = 0,
808 .ssd_dpl = 0,
809 .ssd_p = 0,
810 .ssd_long = 0,
811 .ssd_def32 = 0,
812 .ssd_gran = 0 },
813};
814
815void
816setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
817{
818 struct gate_descriptor *ip;
819
820 ip = idt + idx;
821 ip->gd_looffset = (uintptr_t)func;
822 ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
823 ip->gd_ist = ist;
824 ip->gd_xx = 0;
825 ip->gd_type = typ;
826 ip->gd_dpl = dpl;
827 ip->gd_p = 1;
828 ip->gd_hioffset = ((uintptr_t)func)>>16 ;
829}
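/*
 * The 'ist' argument selects an Interrupt Stack Table slot (1-7) in
 * the TSS; a non-zero value forces the CPU onto that known-good stack
 * when the gate is taken, while 0 keeps the normal stack-switching
 * behavior.  The DF#, NMI, MC# and DB# gates set up in hammer_time()
 * below use dedicated IST stacks for exactly this reason.
 */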
830
831extern inthand_t
832 IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
833 IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
834 IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
835 IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
836 IDTVEC(xmm), IDTVEC(dblfault),
837 IDTVEC(div_pti), IDTVEC(bpt_pti),
838 IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
839 IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
840 IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
841 IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
842 IDTVEC(xmm_pti),
843#ifdef KDTRACE_HOOKS
844 IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
845#endif
846#ifdef XENHVM
847 IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
848#endif
849 IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
850 IDTVEC(fast_syscall_pti);
851
852#ifdef DDB
853/*
854 * Display the index and function name of any IDT entries that don't use
855 * the default 'rsvd' entry point.
856 */
857DB_SHOW_COMMAND(idt, db_show_idt)
858{
859 struct gate_descriptor *ip;
860 int idx;
861 uintptr_t func;
862
863 ip = idt;
864 for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
865 func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
866 if (func != (uintptr_t)&IDTVEC(rsvd)) {
867 db_printf("%3d\t", idx);
868 db_printsym(func, DB_STGY_PROC);
869 db_printf("\n");
870 }
871 ip++;
872 }
873}
874
875/* Show privileged registers. */
876DB_SHOW_COMMAND(sysregs, db_show_sysregs)
877{
878 struct {
879 uint16_t limit;
880 uint64_t base;
881 } __packed idtr, gdtr;
882 uint16_t ldt, tr;
883
884 __asm __volatile("sidt %0" : "=m" (idtr));
885 db_printf("idtr\t0x%016lx/%04x\n",
886 (u_long)idtr.base, (u_int)idtr.limit);
887 __asm __volatile("sgdt %0" : "=m" (gdtr));
888 db_printf("gdtr\t0x%016lx/%04x\n",
889 (u_long)gdtr.base, (u_int)gdtr.limit);
890 __asm __volatile("sldt %0" : "=r" (ldt));
891 db_printf("ldtr\t0x%04x\n", ldt);
892 __asm __volatile("str %0" : "=r" (tr));
893 db_printf("tr\t0x%04x\n", tr);
894 db_printf("cr0\t0x%016lx\n", rcr0());
895 db_printf("cr2\t0x%016lx\n", rcr2());
896 db_printf("cr3\t0x%016lx\n", rcr3());
897 db_printf("cr4\t0x%016lx\n", rcr4());
898 if (rcr4() & CR4_XSAVE)
899 db_printf("xcr0\t0x%016lx\n", rxcr(0));
900 db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER));
901 if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
902 db_printf("FEATURES_CTL\t%016lx\n",
903 rdmsr(MSR_IA32_FEATURE_CONTROL));
904 db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR));
905 db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT));
906 db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE));
907}
908
909DB_SHOW_COMMAND(dbregs, db_show_dbregs)
910{
911
912 db_printf("dr0\t0x%016lx\n", rdr0());
913 db_printf("dr1\t0x%016lx\n", rdr1());
914 db_printf("dr2\t0x%016lx\n", rdr2());
915 db_printf("dr3\t0x%016lx\n", rdr3());
916 db_printf("dr6\t0x%016lx\n", rdr6());
917 db_printf("dr7\t0x%016lx\n", rdr7());
918}
919#endif
920
921void
922sdtossd(sd, ssd)
923 struct user_segment_descriptor *sd;
924 struct soft_segment_descriptor *ssd;
925{
926
927 ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase;
928 ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
929 ssd->ssd_type = sd->sd_type;
930 ssd->ssd_dpl = sd->sd_dpl;
931 ssd->ssd_p = sd->sd_p;
932 ssd->ssd_long = sd->sd_long;
933 ssd->ssd_def32 = sd->sd_def32;
934 ssd->ssd_gran = sd->sd_gran;
935}
936
937void
938ssdtosd(ssd, sd)
939 struct soft_segment_descriptor *ssd;
940 struct user_segment_descriptor *sd;
941{
942
943 sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
944 sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
945 sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
946 sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
947 sd->sd_type = ssd->ssd_type;
948 sd->sd_dpl = ssd->ssd_dpl;
949 sd->sd_p = ssd->ssd_p;
950 sd->sd_long = ssd->ssd_long;
951 sd->sd_def32 = ssd->ssd_def32;
952 sd->sd_gran = ssd->ssd_gran;
953}
954
955void
956ssdtosyssd(ssd, sd)
957 struct soft_segment_descriptor *ssd;
958 struct system_segment_descriptor *sd;
959{
960
961 sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
962 sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
963 sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
964 sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
965 sd->sd_type = ssd->ssd_type;
966 sd->sd_dpl = ssd->ssd_dpl;
967 sd->sd_p = ssd->ssd_p;
968 sd->sd_gran = ssd->ssd_gran;
969}
970
971#if !defined(DEV_ATPIC) && defined(DEV_ISA)
972#include <isa/isavar.h>
973#include <isa/isareg.h>
974/*
975 * Return a bitmap of the current interrupt requests. This is 8259-specific
976 * and is only suitable for use at probe time.
977 * This is only here to pacify sio. It is NOT FATAL if this doesn't work.
978 * It shouldn't be here. There should probably be an APIC centric
979 * implementation in the apic driver code, if at all.
980 */
981intrmask_t
982isa_irq_pending(void)
983{
984 u_char irr1;
985 u_char irr2;
986
987 irr1 = inb(IO_ICU1);
988 irr2 = inb(IO_ICU2);
989 return ((irr2 << 8) | irr1);
990}
991#endif
992
993u_int basemem;
994
995static int
996add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
997 int *physmap_idxp)
998{
999 int i, insert_idx, physmap_idx;
1000
1001 physmap_idx = *physmap_idxp;
1002
1003 if (length == 0)
1004 return (1);
1005
1006 /*
1007 * Find insertion point while checking for overlap. Start off by
1008 * assuming the new entry will be added to the end.
1009 *
1010 * NB: physmap_idx points to the next free slot.
1011 */
1012 insert_idx = physmap_idx;
1013 for (i = 0; i <= physmap_idx; i += 2) {
1014 if (base < physmap[i + 1]) {
1015 if (base + length <= physmap[i]) {
1016 insert_idx = i;
1017 break;
1018 }
1019 if (boothowto & RB_VERBOSE)
1020 printf(
1021 "Overlapping memory regions, ignoring second region\n");
1022 return (1);
1023 }
1024 }
1025
1026 /* See if we can prepend to the next entry. */
1027 if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) {
1028 physmap[insert_idx] = base;
1029 return (1);
1030 }
1031
1032 /* See if we can append to the previous entry. */
1033 if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
1034 physmap[insert_idx - 1] += length;
1035 return (1);
1036 }
1037
1038 physmap_idx += 2;
1039 *physmap_idxp = physmap_idx;
1040 if (physmap_idx == PHYSMAP_SIZE) {
1041 printf(
1042 "Too many segments in the physical address map, giving up\n");
1043 return (0);
1044 }
1045
1046 /*
1047 * Move the last 'N' entries down to make room for the new
1048 * entry if needed.
1049 */
1050 for (i = (physmap_idx - 2); i > insert_idx; i -= 2) {
1051 physmap[i] = physmap[i - 2];
1052 physmap[i + 1] = physmap[i - 1];
1053 }
1054
1055 /* Insert the new entry. */
1056 physmap[insert_idx] = base;
1057 physmap[insert_idx + 1] = base + length;
1058 return (1);
1059}
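/*
 * A hypothetical example of the merge logic above: with two entries
 * physmap[] = { 0x1000, 0x9f000, 0x100000, 0x40000000 }, adding a
 * region with base 0x40000000 and length 0x1000 matches the "append
 * to the previous entry" case and simply extends the second entry to
 * { 0x100000, 0x40001000 } without consuming a new slot.
 */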
1060
1061void
1062bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize,
1063 vm_paddr_t *physmap, int *physmap_idx)
1064{
1065 struct bios_smap *smap, *smapend;
1066
1067 smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
1068
1069 for (smap = smapbase; smap < smapend; smap++) {
1070 if (boothowto & RB_VERBOSE)
1071 printf("SMAP type=%02x base=%016lx len=%016lx\n",
1072 smap->type, smap->base, smap->length);
1073
1074 if (smap->type != SMAP_TYPE_MEMORY)
1075 continue;
1076
1077 if (!add_physmap_entry(smap->base, smap->length, physmap,
1078 physmap_idx))
1079 break;
1080 }
1081}
1082
1083static void
1084add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
1085 int *physmap_idx)
1086{
1087 struct efi_md *map, *p;
1088 const char *type;
1089 size_t efisz;
1090 int ndesc, i;
1091
1092 static const char *types[] = {
1093 "Reserved",
1094 "LoaderCode",
1095 "LoaderData",
1096 "BootServicesCode",
1097 "BootServicesData",
1098 "RuntimeServicesCode",
1099 "RuntimeServicesData",
1100 "ConventionalMemory",
1101 "UnusableMemory",
1102 "ACPIReclaimMemory",
1103 "ACPIMemoryNVS",
1104 "MemoryMappedIO",
1105 "MemoryMappedIOPortSpace",
1106 "PalCode",
1107 "PersistentMemory"
1108 };
1109
1110 /*
1111 * Memory map data provided by UEFI via the GetMemoryMap
1112 * Boot Services API.
1113 */
1114 efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
1115 map = (struct efi_md *)((uint8_t *)efihdr + efisz);
1116
1117 if (efihdr->descriptor_size == 0)
1118 return;
1119 ndesc = efihdr->memory_size / efihdr->descriptor_size;
1120
1121 if (boothowto & RB_VERBOSE)
1122 printf("%23s %12s %12s %8s %4s\n",
1123 "Type", "Physical", "Virtual", "#Pages", "Attr");
1124
1125 for (i = 0, p = map; i < ndesc; i++,
1126 p = efi_next_descriptor(p, efihdr->descriptor_size)) {
1127 if (boothowto & RB_VERBOSE) {
1128 if (p->md_type < nitems(types))
1129 type = types[p->md_type];
1130 else
1131 type = "<INVALID>";
1132 printf("%23s %012lx %12p %08lx ", type, p->md_phys,
1133 p->md_virt, p->md_pages);
1134 if (p->md_attr & EFI_MD_ATTR_UC)
1135 printf("UC ");
1136 if (p->md_attr & EFI_MD_ATTR_WC)
1137 printf("WC ");
1138 if (p->md_attr & EFI_MD_ATTR_WT)
1139 printf("WT ");
1140 if (p->md_attr & EFI_MD_ATTR_WB)
1141 printf("WB ");
1142 if (p->md_attr & EFI_MD_ATTR_UCE)
1143 printf("UCE ");
1144 if (p->md_attr & EFI_MD_ATTR_WP)
1145 printf("WP ");
1146 if (p->md_attr & EFI_MD_ATTR_RP)
1147 printf("RP ");
1148 if (p->md_attr & EFI_MD_ATTR_XP)
1149 printf("XP ");
1150 if (p->md_attr & EFI_MD_ATTR_NV)
1151 printf("NV ");
1152 if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE)
1153 printf("MORE_RELIABLE ");
1154 if (p->md_attr & EFI_MD_ATTR_RO)
1155 printf("RO ");
1156 if (p->md_attr & EFI_MD_ATTR_RT)
1157 printf("RUNTIME");
1158 printf("\n");
1159 }
1160
1161 switch (p->md_type) {
1162 case EFI_MD_TYPE_CODE:
1163 case EFI_MD_TYPE_DATA:
1164 case EFI_MD_TYPE_BS_CODE:
1165 case EFI_MD_TYPE_BS_DATA:
1166 case EFI_MD_TYPE_FREE:
1167 /*
1168 * We're allowed to use any entry with these types.
1169 */
1170 break;
1171 default:
1172 continue;
1173 }
1174
1175 if (!add_physmap_entry(p->md_phys, (p->md_pages * PAGE_SIZE),
1176 physmap, physmap_idx))
1177 break;
1178 }
1179}
1180
1181static char bootmethod[16] = "";
1182SYSCTL_STRING(_machdep, OID_AUTO, bootmethod, CTLFLAG_RD, bootmethod, 0,
1183 "System firmware boot method");
1184
1185static void
1186native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx)
1187{
1188 struct bios_smap *smap;
1189 struct efi_map_header *efihdr;
1190 u_int32_t size;
1191
1192 /*
1193 * Memory map from INT 15:E820.
1194 *
1195 * subr_module.c says:
1196 * "Consumer may safely assume that size value precedes data."
1197 * i.e., an int32_t immediately precedes smap.
1198 */
1199
1200 efihdr = (struct efi_map_header *)preload_search_info(kmdp,
1201 MODINFO_METADATA | MODINFOMD_EFI_MAP);
1202 smap = (struct bios_smap *)preload_search_info(kmdp,
1203 MODINFO_METADATA | MODINFOMD_SMAP);
1204 if (efihdr == NULL && smap == NULL)
1205 panic("No BIOS smap or EFI map info from loader!");
1206
1207 if (efihdr != NULL) {
1208 add_efi_map_entries(efihdr, physmap, physmap_idx);
1209 strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
1210 } else {
1211 size = *((u_int32_t *)smap - 1);
1212 bios_add_smap_entries(smap, size, physmap, physmap_idx);
1213 strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
1214 }
1215}
1216
1217#define PAGES_PER_GB (1024 * 1024 * 1024 / PAGE_SIZE)
1218
1219/*
1220 * Populate the (physmap) array with base/bound pairs describing the
1221 * available physical memory in the system, then test this memory and
1222 * build the phys_avail array describing the actually-available memory.
1223 *
1224 * Total memory size may be set by the kernel environment variable
1225 * hw.physmem or the compile-time define MAXMEM.
1226 *
1227 * XXX first should be vm_paddr_t.
1228 */
1229static void
1230getmemsize(caddr_t kmdp, u_int64_t first)
1231{
1232 int i, physmap_idx, pa_indx, da_indx;
1233 vm_paddr_t pa, physmap[PHYSMAP_SIZE];
1234 u_long physmem_start, physmem_tunable, memtest;
1235 pt_entry_t *pte;
1236 quad_t dcons_addr, dcons_size;
1237 int page_counter;
1238
1239 /*
1240 * Tell the physical memory allocator about pages used to store
1241 * the kernel and preloaded data. See kmem_bootstrap_free().
1242 */
1243 vm_phys_add_seg((vm_paddr_t)kernphys, trunc_page(first));
1244
1245 bzero(physmap, sizeof(physmap));
1246 physmap_idx = 0;
1247
1248 init_ops.parse_memmap(kmdp, physmap, &physmap_idx);
1249 physmap_idx -= 2;
1250
1251 /*
1252 * Find the 'base memory' segment for SMP
1253 */
1254 basemem = 0;
1255 for (i = 0; i <= physmap_idx; i += 2) {
1256 if (physmap[i] <= 0xA0000) {
1257 basemem = physmap[i + 1] / 1024;
1258 break;
1259 }
1260 }
1261 if (basemem == 0 || basemem > 640) {
1262 if (bootverbose)
1263 printf(
1264 "Memory map doesn't contain a basemem segment, faking it");
1265 basemem = 640;
1266 }
1267
1268 /*
1269 * Make a hole for the "AP -> long mode" bootstrap code.  The
1270 * mp_bootaddress vector is only available when the kernel is
1271 * configured to support APs and the system's APs start in
1272 * 32-bit mode (e.g. SMP on bare metal).
1273 */
1274 if (init_ops.mp_bootaddress) {
1275 if (physmap[1] >= 0x100000000)
1276 panic(
1277 "Basemem segment is not suitable for AP bootstrap code!");
1278 physmap[1] = init_ops.mp_bootaddress(physmap[1] / 1024);
1279 }
1280
1281 /*
1282 * Maxmem isn't the "maximum memory", it's one larger than the
1283 * highest page of the physical address space. It should be
1284 * called something like "Maxphyspage". We may adjust this
1285 * based on ``hw.physmem'' and the results of the memory test.
1286 */
1287 Maxmem = atop(physmap[physmap_idx + 1]);
1288
1289#ifdef MAXMEM
1290 Maxmem = MAXMEM / 4;
1291#endif
1292
1293 if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
1294 Maxmem = atop(physmem_tunable);
1295
1296 /*
1297 * The boot memory test is disabled by default, as it takes a
1298 * significant amount of time on large-memory systems, and is
1299 * unfriendly to virtual machines as it unnecessarily touches all
1300 * pages.
1301 *
1302 * A general name is used as the code may be extended to support
1303 * additional tests beyond the current "page present" test.
1304 */
1305 memtest = 0;
1306 TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);
1307
1308 /*
1309 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
1310 * in the system.
1311 */
1312 if (Maxmem > atop(physmap[physmap_idx + 1]))
1313 Maxmem = atop(physmap[physmap_idx + 1]);
1314
1315 if (atop(physmap[physmap_idx + 1]) != Maxmem &&
1316 (boothowto & RB_VERBOSE))
1317 printf("Physical memory use set to %ldK\n", Maxmem * 4);
1318
1319 /* call pmap initialization to make new kernel address space */
1320 pmap_bootstrap(&first);
1321
1322 /*
1323 * Size up each available chunk of physical memory.
1324 *
1325 * XXX Some BIOSes corrupt low 64KB between suspend and resume.
1326 * By default, mask off the first 16 pages unless we appear to be
1327 * running in a VM.
1328 */
1329 physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
1330 TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
1331 if (physmap[0] < physmem_start) {
1332 if (physmem_start < PAGE_SIZE)
1333 physmap[0] = PAGE_SIZE;
1334 else if (physmem_start >= physmap[1])
1335 physmap[0] = round_page(physmap[1] - PAGE_SIZE);
1336 else
1337 physmap[0] = round_page(physmem_start);
1338 }
1339 pa_indx = 0;
1340 da_indx = 1;
1341 phys_avail[pa_indx++] = physmap[0];
1342 phys_avail[pa_indx] = physmap[0];
1343 dump_avail[da_indx] = physmap[0];
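	/*
	 * CMAP1/CADDR1 are a PTE and virtual address pair reserved by
	 * pmap_bootstrap() and used below to temporarily map each
	 * candidate page while it is tested.
	 */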
1344 pte = CMAP1;
1345
1346 /*
1347 * Get dcons buffer address
1348 */
1349 if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
1350 getenv_quad("dcons.size", &dcons_size) == 0)
1351 dcons_addr = 0;
1352
1353 /*
1354 * physmap is in bytes, so when converting to page boundaries,
1355 * round up the start address and round down the end address.
1356 */
1357 page_counter = 0;
1358 if (memtest != 0)
1359 printf("Testing system memory");
1360 for (i = 0; i <= physmap_idx; i += 2) {
1361 vm_paddr_t end;
1362
1363 end = ptoa((vm_paddr_t)Maxmem);
1364 if (physmap[i + 1] < end)
1365 end = trunc_page(physmap[i + 1]);
1366 for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
1367 int tmp, page_bad, full;
1368 int *ptr = (int *)CADDR1;
1369
1370 full = FALSE;
1371 /*
1372 * block out kernel memory as not available.
1373 */
1374 if (pa >= (vm_paddr_t)kernphys && pa < first)
1375 goto do_dump_avail;
1376
1377 /*
1378 * block out dcons buffer
1379 */
1380 if (dcons_addr > 0
1381 && pa >= trunc_page(dcons_addr)
1382 && pa < dcons_addr + dcons_size)
1383 goto do_dump_avail;
1384
1385 page_bad = FALSE;
1386 if (memtest == 0)
1387 goto skip_memtest;
1388
1389 /*
1390 * Print a "." every GB to show we're making
1391 * progress.
1392 */
1393 page_counter++;
1394 if ((page_counter % PAGES_PER_GB) == 0)
1395 printf(".");
1396
1397 /*
1398 * map the page into the kernel: valid, read/write, non-cacheable
1399 */
1400 *pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
1401 invltlb();
1402
1403 tmp = *(int *)ptr;
1404 /*
1405 * Test for alternating 1's and 0's
1406 */
1407 *(volatile int *)ptr = 0xaaaaaaaa;
1408 if (*(volatile int *)ptr != 0xaaaaaaaa)
1409 page_bad = TRUE;
1410 /*
1411 * Test for alternating 0's and 1's
1412 */
1413 *(volatile int *)ptr = 0x55555555;
1414 if (*(volatile int *)ptr != 0x55555555)
1415 page_bad = TRUE;
1416 /*
1417 * Test for all 1's
1418 */
1419 *(volatile int *)ptr = 0xffffffff;
1420 if (*(volatile int *)ptr != 0xffffffff)
1421 page_bad = TRUE;
1422 /*
1423 * Test for all 0's
1424 */
1425 *(volatile int *)ptr = 0x0;
1426 if (*(volatile int *)ptr != 0x0)
1427 page_bad = TRUE;
1428 /*
1429 * Restore original value.
1430 */
1431 *(int *)ptr = tmp;
1432
1433skip_memtest:
1434 /*
1435 * Adjust array of valid/good pages.
1436 */
1437 if (page_bad == TRUE)
1438 continue;
1439 /*
1440 * If this good page is a continuation of the
1441 * previous set of good pages, then just increase
1442 * the end pointer.  Otherwise start a new chunk.
1443 * Note that "end" points one page past the last
1444 * valid page, making the range >= start and < end.
1445 * If we're also doing a speculative memory test
1446 * and we are at or past the end, bump up Maxmem
1447 * so that we keep going.  The first bad page
1448 * will terminate the loop.
1449 */
1450 if (phys_avail[pa_indx] == pa) {
1451 phys_avail[pa_indx] += PAGE_SIZE;
1452 } else {
1453 pa_indx++;
1454 if (pa_indx == PHYS_AVAIL_ARRAY_END) {
1455 printf(
1456 "Too many holes in the physical address space, giving up\n");
1457 pa_indx--;
1458 full = TRUE;
1459 goto do_dump_avail;
1460 }
1461 phys_avail[pa_indx++] = pa; /* start */
1462 phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
1463 }
1464 physmem++;
1465do_dump_avail:
1466 if (dump_avail[da_indx] == pa) {
1467 dump_avail[da_indx] += PAGE_SIZE;
1468 } else {
1469 da_indx++;
1470 if (da_indx == DUMP_AVAIL_ARRAY_END) {
1471 da_indx--;
1472 goto do_next;
1473 }
1474 dump_avail[da_indx++] = pa; /* start */
1475 dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
1476 }
1477do_next:
1478 if (full)
1479 break;
1480 }
1481 }
1482 *pte = 0;
1483 invltlb();
1484 if (memtest != 0)
1485 printf("\n");
1486
1487 /*
1488 * XXX
1489 * The last chunk must contain at least one page plus the message
1490 * buffer to avoid complicating other code (message buffer address
1491 * calculation, etc.).
1492 */
1493 while (phys_avail[pa_indx - 1] + PAGE_SIZE +
1494 round_page(msgbufsize) >= phys_avail[pa_indx]) {
1495 physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
1496 phys_avail[pa_indx--] = 0;
1497 phys_avail[pa_indx--] = 0;
1498 }
1499
1500 Maxmem = atop(phys_avail[pa_indx]);
1501
1502 /* Trim off space for the message buffer. */
1503 phys_avail[pa_indx] -= round_page(msgbufsize);
1504
1505 /* Map the message buffer. */
1506 msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
1507}
1508
1509static caddr_t
1510native_parse_preload_data(u_int64_t modulep)
1511{
1512 caddr_t kmdp;
1513 char *envp;
1514#ifdef DDB
1515 vm_offset_t ksym_start;
1516 vm_offset_t ksym_end;
1517#endif
1518
1519 preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
1520 preload_bootstrap_relocate(KERNBASE);
1521 kmdp = preload_search_by_type("elf kernel");
1522 if (kmdp == NULL)
1523 kmdp = preload_search_by_type("elf64 kernel");
1524 boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
1525 envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *);
1526 if (envp != NULL)
1527 envp += KERNBASE;
1528 init_static_kenv(envp, 0);
1529#ifdef DDB
1530 ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
1531 ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
1532 db_fetch_ksymtab(ksym_start, ksym_end);
1533#endif
1534 efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);
1535
1536 return (kmdp);
1537}
1538
1539static void
1540amd64_kdb_init(void)
1541{
1542 kdb_init();
1543#ifdef KDB
1544 if (boothowto & RB_KDB)
1545 kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
1546#endif
1547}
1548
1549/* Set up the fast syscall stuff */
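/*
 * Once EFER_SCE is set, SYSCALL loads %rip from MSR_LSTAR (MSR_CSTAR
 * for 32-bit compat-mode callers), clears the rflags bits named in
 * MSR_SF_MASK, and takes the kernel %cs/%ss selectors from bits 47:32
 * of MSR_STAR; SYSRET derives the user selectors from bits 63:48.
 */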
1550void
1551amd64_conf_fast_syscall(void)
1552{
1553 uint64_t msr;
1554
1555 msr = rdmsr(MSR_EFER) | EFER_SCE;
1556 wrmsr(MSR_EFER, msr);
1557 wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) :
1558 (u_int64_t)IDTVEC(fast_syscall));
1559 wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
1560 msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
1561 ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
1562 wrmsr(MSR_STAR, msr);
1563 wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D);
1564}
1565
1566u_int64_t
1567hammer_time(u_int64_t modulep, u_int64_t physfree)
1568{
1569 caddr_t kmdp;
1570 int gsel_tss, x;
1571 struct pcpu *pc;
1572 struct nmi_pcpu *np;
1573 struct xstate_hdr *xhdr;
1574 u_int64_t rsp0;
1575 char *env;
1576 size_t kstack0_sz;
1577 int late_console;
1578
1579 kmdp = init_ops.parse_preload_data(modulep);
1580
1581 identify_cpu1();
1582 identify_hypervisor();
1583 /*
1584 * hw.cpu_stdext_disable is ignored by this call; it will be
1585 * re-evaluated by the call to finishidentcpu() below.
1586 */
1587 identify_cpu2();
1588
1589 link_elf_ireloc(kmdp);
1590
1591 /*
1592 * This may be better done later if it acquires more high-level
1593 * components.  If so, just link td->td_proc here.
1594 */
1595 proc_linkup0(&proc0, &thread0);
1596
1597 /* Init basic tunables, hz etc */
1598 init_param1();
1599
1600 thread0.td_kstack = physfree + KERNBASE;
1601 thread0.td_kstack_pages = kstack_pages;
1602 kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
1603 bzero((void *)thread0.td_kstack, kstack0_sz);
1604 physfree += kstack0_sz;
1605
1606 /*
1607 * make gdt memory segments
1608 */
1609 for (x = 0; x < NGDT; x++) {
1610 if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
1611 x != GUSERLDT_SEL && x != (GUSERLDT_SEL) + 1)
1612 ssdtosd(&gdt_segs[x], &gdt[x]);
1613 }
1614 gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&common_tss[0];
1615 ssdtosyssd(&gdt_segs[GPROC0_SEL],
1616 (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
1617
1618 r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
1619 r_gdt.rd_base = (long) gdt;
1620 lgdt(&r_gdt);
1621 pc = &__pcpu[0];
1622
1623 wrmsr(MSR_FSBASE, 0); /* User value */
1624 wrmsr(MSR_GSBASE, (u_int64_t)pc);
1625 wrmsr(MSR_KGSBASE, 0); /* User value while in the kernel */
1626
1627 pcpu_init(pc, 0, sizeof(struct pcpu));
1628 dpcpu_init((void *)(physfree + KERNBASE), 0);
1629 physfree += DPCPU_SIZE;
1630 PCPU_SET(prvspace, pc);
1631 PCPU_SET(curthread, &thread0);
1632 /* Non-late cninit() and printf() can be moved up to here. */
1633 PCPU_SET(tssp, &common_tss[0]);
1634 PCPU_SET(commontssp, &common_tss[0]);
1635 PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
1636 PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
1637 PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
1638 PCPU_SET(gs32p, &gdt[GUGS32_SEL]);
1639
1640 /*
1641 * Initialize mutexes.
1642 *
1643 * icu_lock: in order to allow an interrupt to occur in a critical
1644 * section, to set pcpu->ipending (etc...) properly, we
1645 * must be able to get the icu lock, so it can't be
1646 * under witness.
1647 */
1648 mutex_init();
1649 mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
1650 mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);
1651
1652 /* exceptions */
1653 pti = pti_get_default();
1654 TUNABLE_INT_FETCH("vm.pmap.pti", &pti);
1655
1656 for (x = 0; x < NIDT; x++)
1657 setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT,
1658 SEL_KPL, 0);
1659 setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
1660 SEL_KPL, 0);
1661 setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
1662 setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 2);
1663 setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
1664 SEL_UPL, 0);
1665 setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT,
1666 SEL_UPL, 0);
1667 setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT,
1668 SEL_KPL, 0);
1669 setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT,
1670 SEL_KPL, 0);
1671 setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT,
1672 SEL_KPL, 0);
1673 setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
1674 setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm),
1675 SDT_SYSIGT, SEL_KPL, 0);
1676 setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT,
1677 SEL_KPL, 0);
1678 setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing),
1679 SDT_SYSIGT, SEL_KPL, 0);
1680 setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT,
1681 SEL_KPL, 0);
1682 setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT,
1683 SEL_KPL, 0);
1684 setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT,
1685 SEL_KPL, 0);
1686 setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT,
1687 SEL_KPL, 0);
1688 setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
1689 SEL_KPL, 0);
1690 setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3);
1691 setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
1692 SEL_KPL, 0);
1693#ifdef KDTRACE_HOOKS
1694 setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) :
1695 &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
1696#endif
1697#ifdef XENHVM
1698 setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) :
1699 &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0);
1700#endif
1701 r_idt.rd_limit = sizeof(idt0) - 1;
1702 r_idt.rd_base = (long) idt;
1703 lidt(&r_idt);
1704
1705 /*
1706 * Initialize the clock before the console so that console
1707 * initialization can use DELAY().
1708 */
1709 clock_init();
1710
1711 /*
1712 * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4)
1713 * transition).
1714 * Once the boot blocks have been updated, we will be able to
1715 * test directly for efi_systbl != NULL here...
1716 */
1717 if (preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP)
1718 != NULL)
1719 vty_set_preferred(VTY_VT);
1720
1721 finishidentcpu(); /* Final stage of CPU initialization */
1722 initializecpu(); /* Initialize CPU registers */
1723 initializecpucache();
1724
1725 /* doublefault stack space, runs on ist1 */
1726 common_tss[0].tss_ist1 = (long)&dblfault_stack[sizeof(dblfault_stack)];
1727
1728 /*
1729 * NMI stack, runs on ist2. The pcpu pointer is stored just
1730 * above the start of the ist2 stack.
1731 */
1732 np = ((struct nmi_pcpu *) &nmi0_stack[sizeof(nmi0_stack)]) - 1;
1733 np->np_pcpu = (register_t) pc;
1734 common_tss[0].tss_ist2 = (long) np;
1735
1736 /*
1737 * MC# stack, runs on ist3. The pcpu pointer is stored just
1738 * above the start of the ist3 stack.
1739 */
1740 np = ((struct nmi_pcpu *) &mce0_stack[sizeof(mce0_stack)]) - 1;
1741 np->np_pcpu = (register_t) pc;
1742 common_tss[0].tss_ist3 = (long) np;
1743
1744 /*
1745 * DB# stack, runs on ist4.
1746 */
1747 np = ((struct nmi_pcpu *) &dbg0_stack[sizeof(dbg0_stack)]) - 1;
1748 np->np_pcpu = (register_t) pc;
1749 common_tss[0].tss_ist4 = (long) np;
1750
1751 /* Set the IO permission bitmap (empty due to tss seg limit) */
1752 common_tss[0].tss_iobase = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE;
1753
1754 gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
1755 ltr(gsel_tss);
1756
1757 amd64_conf_fast_syscall();
1758
1759 /*
1760 * Temporarily forge a valid PCB pointer for the exception
1761 * handlers.  It is properly reinitialized below, once the FPU
1762 * is set up.  Also set td_critnest to short-circuit the page
1763 * fault handler.
1764 */
1765 cpu_max_ext_state_size = sizeof(struct savefpu);
1766 thread0.td_pcb = get_pcb_td(&thread0);
1767 thread0.td_critnest = 1;
1768
1769 /*
1770 * The console and kdb should be initialized even earlier than here,
1771 * but some console drivers don't work until after getmemsize().
1772 * Default to late console initialization to support these drivers.
1773 * This loses mainly printf()s in getmemsize() and early debugging.
1774 */
1775 late_console = 1;
1776 TUNABLE_INT_FETCH("debug.late_console", &late_console);
1777 if (!late_console) {
1778 cninit();
1779 amd64_kdb_init();
1780 }
1781
1782 getmemsize(kmdp, physfree);
1783 init_param2(physmem);
1784
1785 /* now running on new page tables, configured, and u/iom is accessible */
1786
1787 if (late_console)
1788 cninit();
1789
1790#ifdef DEV_ISA
1791#ifdef DEV_ATPIC
1792 elcr_probe();
1793 atpic_startup();
1794#else
1795 /* Reset and mask the atpics and leave them shut down. */
1796 atpic_reset();
1797
1798 /*
1799 * Point the ICU spurious interrupt vectors at the APIC spurious
1800 * interrupt handler.
1801 */
1802 setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
1803 setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
1804#endif
1805#else
1806#error "have you forgotten the isa device?";
1807#endif
1808
1809 if (late_console)
1810 amd64_kdb_init();
1811
1812 msgbufinit(msgbufp, msgbufsize);
1813 fpuinit();
1814
1815 /*
1816 * Set up thread0 pcb after fpuinit calculated pcb + fpu save
1817 * area size. Zero out the extended state header in fpu save
1818 * area.
1819 */
1820 thread0.td_pcb = get_pcb_td(&thread0);
1821 thread0.td_pcb->pcb_save = get_pcb_user_save_td(&thread0);
1822 bzero(get_pcb_user_save_td(&thread0), cpu_max_ext_state_size);
1823 if (use_xsave) {
1824 xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) +
1825 1);
1826 xhdr->xstate_bv = xsave_mask;
1827 }
1828 /* make an initial tss so cpu can get interrupt stack on syscall! */
1829 rsp0 = (vm_offset_t)thread0.td_pcb;
1830 /* Ensure the stack is aligned to 16 bytes */
1831 rsp0 &= ~0xFul;
1832 common_tss[0].tss_rsp0 = rsp0;
1833 PCPU_SET(rsp0, rsp0);
1834 PCPU_SET(pti_rsp0, ((vm_offset_t)PCPU_PTR(pti_stack) +
1835 PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful);
1836 PCPU_SET(curpcb, thread0.td_pcb);
1837
1838 /* transfer to user mode */
1839
1840 _ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
1841 _udatasel = GSEL(GUDATA_SEL, SEL_UPL);
1842 _ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
1843 _ufssel = GSEL(GUFS32_SEL, SEL_UPL);
1844 _ugssel = GSEL(GUGS32_SEL, SEL_UPL);
1845
1846 load_ds(_udatasel);
1847 load_es(_udatasel);
1848 load_fs(_ufssel);
1849
1850 /* setup proc 0's pcb */
1851 thread0.td_pcb->pcb_flags = 0;
1852 thread0.td_frame = &proc0_tf;
1853
1854 env = kern_getenv("kernelname");
1855 if (env != NULL)
1856 strlcpy(kernelname, env, sizeof(kernelname));
1857
1858 cpu_probe_amdc1e();
1859
1860#ifdef FDT
1861 x86_init_fdt();
1862#endif
1863 thread0.td_critnest = 0;
1864
1865 TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
1866 TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable);
43
44#include "opt_atpic.h"
45#include "opt_compat.h"
46#include "opt_cpu.h"
47#include "opt_ddb.h"
48#include "opt_inet.h"
49#include "opt_isa.h"
50#include "opt_kstack_pages.h"
51#include "opt_maxmem.h"
52#include "opt_mp_watchdog.h"
53#include "opt_perfmon.h"
54#include "opt_platform.h"
55#include "opt_sched.h"
56
57#include <sys/param.h>
58#include <sys/proc.h>
59#include <sys/systm.h>
60#include <sys/bio.h>
61#include <sys/buf.h>
62#include <sys/bus.h>
63#include <sys/callout.h>
64#include <sys/cons.h>
65#include <sys/cpu.h>
66#include <sys/efi.h>
67#include <sys/eventhandler.h>
68#include <sys/exec.h>
69#include <sys/imgact.h>
70#include <sys/kdb.h>
71#include <sys/kernel.h>
72#include <sys/ktr.h>
73#include <sys/linker.h>
74#include <sys/lock.h>
75#include <sys/malloc.h>
76#include <sys/memrange.h>
77#include <sys/msgbuf.h>
78#include <sys/mutex.h>
79#include <sys/pcpu.h>
80#include <sys/ptrace.h>
81#include <sys/reboot.h>
82#include <sys/rwlock.h>
83#include <sys/sched.h>
84#include <sys/signalvar.h>
85#ifdef SMP
86#include <sys/smp.h>
87#endif
88#include <sys/syscallsubr.h>
89#include <sys/sysctl.h>
90#include <sys/sysent.h>
91#include <sys/sysproto.h>
92#include <sys/ucontext.h>
93#include <sys/vmmeter.h>
94
95#include <vm/vm.h>
96#include <vm/vm_extern.h>
97#include <vm/vm_kern.h>
98#include <vm/vm_page.h>
99#include <vm/vm_map.h>
100#include <vm/vm_object.h>
101#include <vm/vm_pager.h>
102#include <vm/vm_param.h>
103#include <vm/vm_phys.h>
104
105#ifdef DDB
106#ifndef KDB
107#error KDB must be enabled in order for DDB to work!
108#endif
109#include <ddb/ddb.h>
110#include <ddb/db_sym.h>
111#endif
112
113#include <net/netisr.h>
114
115#include <machine/clock.h>
116#include <machine/cpu.h>
117#include <machine/cputypes.h>
118#include <machine/frame.h>
119#include <machine/intr_machdep.h>
120#include <x86/mca.h>
121#include <machine/md_var.h>
122#include <machine/metadata.h>
123#include <machine/mp_watchdog.h>
124#include <machine/pc/bios.h>
125#include <machine/pcb.h>
126#include <machine/proc.h>
127#include <machine/reg.h>
128#include <machine/sigframe.h>
129#include <machine/specialreg.h>
130#ifdef PERFMON
131#include <machine/perfmon.h>
132#endif
133#include <machine/tss.h>
134#ifdef SMP
135#include <machine/smp.h>
136#endif
137#ifdef FDT
138#include <x86/fdt.h>
139#endif
140
141#ifdef DEV_ATPIC
142#include <x86/isa/icu.h>
143#else
144#include <x86/apicvar.h>
145#endif
146
147#include <isa/isareg.h>
148#include <isa/rtc.h>
149#include <x86/init.h>
150
151/* Sanity check for __curthread() */
152CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
153
154/*
155 * The PTI trampoline stack needs enough space for a hardware trapframe and a
156 * couple of scratch registers, as well as the trapframe left behind after an
157 * iret fault.
158 */
159CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) -
160 offsetof(struct pti_frame, pti_rip));
161
162extern u_int64_t hammer_time(u_int64_t, u_int64_t);
163
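/*
 * CS_SECURE() accepts only a user-privilege %cs selector; EFL_SECURE()
 * accepts a new rflags value only if no bits outside PSL_USERCHANGE
 * differ from the old value.
 */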
164#define CS_SECURE(cs) (ISPL(cs) == SEL_UPL)
165#define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
166
167static void cpu_startup(void *);
168static void get_fpcontext(struct thread *td, mcontext_t *mcp,
169 char *xfpusave, size_t xfpusave_len);
170static int set_fpcontext(struct thread *td, mcontext_t *mcp,
171 char *xfpustate, size_t xfpustate_len);
172SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
173
174/* Preload data parse function */
175static caddr_t native_parse_preload_data(u_int64_t);
176
177/* Native function to fetch and parse the e820 map */
178static void native_parse_memmap(caddr_t, vm_paddr_t *, int *);
179
180/* Default init_ops implementation. */
181struct init_ops init_ops = {
182 .parse_preload_data = native_parse_preload_data,
183 .early_clock_source_init = i8254_init,
184 .early_delay = i8254_delay,
185 .parse_memmap = native_parse_memmap,
186#ifdef SMP
187 .mp_bootaddress = mp_bootaddress,
188 .start_all_aps = native_start_all_aps,
189#endif
190 .msi_init = msi_init,
191};
192
193struct msgbuf *msgbufp;
194
195/*
196 * Physical address of the EFI System Table. Stashed from the metadata hints
197 * passed into the kernel and used by the EFI code to call runtime services.
198 */
199vm_paddr_t efi_systbl_phys;
200
201/* Intel ICH registers */
202#define ICH_PMBASE 0x400
203#define ICH_SMI_EN	(ICH_PMBASE + 0x30)
204
205int _udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel;
206
207int cold = 1;
208
209long Maxmem = 0;
210long realmem = 0;
211
212/*
213 * The number of PHYSMAP entries must be one less than the number of
214 * PHYSSEG entries because the PHYSMAP entry that spans the largest
215 * physical address that is accessible by ISA DMA is split into two
216 * PHYSSEG entries.
217 */
218#define PHYSMAP_SIZE (2 * (VM_PHYSSEG_MAX - 1))
219
220vm_paddr_t phys_avail[PHYSMAP_SIZE + 2];
221vm_paddr_t dump_avail[PHYSMAP_SIZE + 2];
222
223/* must be 2 less so a 0/0 pair can signal the end of the chunks */
224#define PHYS_AVAIL_ARRAY_END (nitems(phys_avail) - 2)
225#define DUMP_AVAIL_ARRAY_END (nitems(dump_avail) - 2)
226
227struct kva_md_info kmi;
228
229static struct trapframe proc0_tf;
230struct region_descriptor r_gdt, r_idt;
231
232struct pcpu __pcpu[MAXCPU];
233
234struct mtx icu_lock;
235
236struct mem_range_softc mem_range_softc;
237
238struct mtx dt_lock; /* lock for GDT and LDT */
239
240void (*vmm_resume_p)(void);
241
242static void
243cpu_startup(dummy)
244 void *dummy;
245{
246 uintmax_t memsize;
247 char *sysenv;
248
249 /*
250	 * On MacBooks, we must prevent the legacy USB circuit from
251	 * generating an SMI#, because it causes several problems,
252	 * namely: incorrect CPU frequency detection and failure to
253	 * start the APs.
254	 * We do this by clearing a bit in the SMI_EN (SMI Control and
255	 * Enable) register of the Intel ICH LPC Interface Bridge.
256 */
257 sysenv = kern_getenv("smbios.system.product");
258 if (sysenv != NULL) {
259 if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
260 strncmp(sysenv, "MacBook3,1", 10) == 0 ||
261 strncmp(sysenv, "MacBook4,1", 10) == 0 ||
262 strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
263 strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
264 strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
265 strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
266 strncmp(sysenv, "Macmini1,1", 10) == 0) {
267 if (bootverbose)
268 printf("Disabling LEGACY_USB_EN bit on "
269 "Intel ICH.\n");
270 outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
271 }
272 freeenv(sysenv);
273 }
274
275 /*
276 * Good {morning,afternoon,evening,night}.
277 */
278 startrtclock();
279 printcpuinfo();
280#ifdef PERFMON
281 perfmon_init();
282#endif
283
284 /*
285	 * Display physical memory if SMBIOS reports a reasonable amount.
286 */
287 memsize = 0;
288 sysenv = kern_getenv("smbios.memory.enabled");
289 if (sysenv != NULL) {
290 memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
291 freeenv(sysenv);
292 }
293 if (memsize < ptoa((uintmax_t)vm_cnt.v_free_count))
294 memsize = ptoa((uintmax_t)Maxmem);
295 printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20);
296 realmem = atop(memsize);
297
298 /*
299 * Display any holes after the first chunk of extended memory.
300 */
301 if (bootverbose) {
302 int indx;
303
304 printf("Physical memory chunk(s):\n");
305 for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
306 vm_paddr_t size;
307
308 size = phys_avail[indx + 1] - phys_avail[indx];
309 printf(
310 "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
311 (uintmax_t)phys_avail[indx],
312 (uintmax_t)phys_avail[indx + 1] - 1,
313 (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
314 }
315 }
316
317 vm_ksubmap_init(&kmi);
318
319 printf("avail memory = %ju (%ju MB)\n",
320 ptoa((uintmax_t)vm_cnt.v_free_count),
321 ptoa((uintmax_t)vm_cnt.v_free_count) / 1048576);
322
323 /*
324 * Set up buffers, so they can be used to read disk labels.
325 */
326 bufinit();
327 vm_pager_bufferinit();
328
329 cpu_setregs();
330}
331
332/*
333 * Send an interrupt to process.
334 *
335 * The stack is set up so that the sigcode stored at the
336 * top calls the signal handler, followed by a call to
337 * the sigreturn routine below. After sigreturn resets
338 * the signal mask, the stack, and the frame pointer, it
339 * returns to the user-specified pc and psl.
340 *
341 */
342void
343sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
344{
345 struct sigframe sf, *sfp;
346 struct pcb *pcb;
347 struct proc *p;
348 struct thread *td;
349 struct sigacts *psp;
350 char *sp;
351 struct trapframe *regs;
352 char *xfpusave;
353 size_t xfpusave_len;
354 int sig;
355 int oonstack;
356
357 td = curthread;
358 pcb = td->td_pcb;
359 p = td->td_proc;
360 PROC_LOCK_ASSERT(p, MA_OWNED);
361 sig = ksi->ksi_signo;
362 psp = p->p_sigacts;
363 mtx_assert(&psp->ps_mtx, MA_OWNED);
364 regs = td->td_frame;
365 oonstack = sigonstack(regs->tf_rsp);
366
367 if (cpu_max_ext_state_size > sizeof(struct savefpu) && use_xsave) {
368 xfpusave_len = cpu_max_ext_state_size - sizeof(struct savefpu);
369 xfpusave = __builtin_alloca(xfpusave_len);
370 } else {
371 xfpusave_len = 0;
372 xfpusave = NULL;
373 }
374
375 /* Save user context. */
376 bzero(&sf, sizeof(sf));
377 sf.sf_uc.uc_sigmask = *mask;
378 sf.sf_uc.uc_stack = td->td_sigstk;
379 sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
380 ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
381 sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
382 bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(*regs));
383 sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
384 get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len);
385 fpstate_drop(td);
386 update_pcb_bases(pcb);
387 sf.sf_uc.uc_mcontext.mc_fsbase = pcb->pcb_fsbase;
388 sf.sf_uc.uc_mcontext.mc_gsbase = pcb->pcb_gsbase;
389 bzero(sf.sf_uc.uc_mcontext.mc_spare,
390 sizeof(sf.sf_uc.uc_mcontext.mc_spare));
391 bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__));
392
393 /* Allocate space for the signal handler context. */
394 if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
395 SIGISMEMBER(psp->ps_sigonstack, sig)) {
396 sp = (char *)td->td_sigstk.ss_sp + td->td_sigstk.ss_size;
397#if defined(COMPAT_43)
398 td->td_sigstk.ss_flags |= SS_ONSTACK;
399#endif
400 } else
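		/* Skip the 128-byte amd64 ABI red zone below %rsp. */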
401 sp = (char *)regs->tf_rsp - 128;
402 if (xfpusave != NULL) {
403 sp -= xfpusave_len;
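		/* The XSAVE extended state area must be 64-byte aligned. */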
404 sp = (char *)((unsigned long)sp & ~0x3Ful);
405 sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp;
406 }
407 sp -= sizeof(struct sigframe);
408 /* Align to 16 bytes. */
409 sfp = (struct sigframe *)((unsigned long)sp & ~0xFul);
410
411 /* Build the argument list for the signal handler. */
412 regs->tf_rdi = sig; /* arg 1 in %rdi */
413 regs->tf_rdx = (register_t)&sfp->sf_uc; /* arg 3 in %rdx */
414 bzero(&sf.sf_si, sizeof(sf.sf_si));
415 if (SIGISMEMBER(psp->ps_siginfo, sig)) {
416 /* Signal handler installed with SA_SIGINFO. */
417 regs->tf_rsi = (register_t)&sfp->sf_si; /* arg 2 in %rsi */
418 sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;
419
420 /* Fill in POSIX parts */
421 sf.sf_si = ksi->ksi_info;
422 sf.sf_si.si_signo = sig; /* maybe a translated signal */
423 regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
424 } else {
425 /* Old FreeBSD-style arguments. */
426 regs->tf_rsi = ksi->ksi_code; /* arg 2 in %rsi */
427 regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
428 sf.sf_ahu.sf_handler = catcher;
429 }
430 mtx_unlock(&psp->ps_mtx);
431 PROC_UNLOCK(p);
432
433 /*
434 * Copy the sigframe out to the user's stack.
435 */
436 if (copyout(&sf, sfp, sizeof(*sfp)) != 0 ||
437 (xfpusave != NULL && copyout(xfpusave,
438 (void *)sf.sf_uc.uc_mcontext.mc_xfpustate, xfpusave_len)
439 != 0)) {
440#ifdef DEBUG
441 printf("process %ld has trashed its stack\n", (long)p->p_pid);
442#endif
443 PROC_LOCK(p);
444 sigexit(td, SIGILL);
445 }
446
447 regs->tf_rsp = (long)sfp;
448 regs->tf_rip = p->p_sysent->sv_sigcode_base;
449 regs->tf_rflags &= ~(PSL_T | PSL_D);
450 regs->tf_cs = _ucodesel;
451 regs->tf_ds = _udatasel;
452 regs->tf_ss = _udatasel;
453 regs->tf_es = _udatasel;
454 regs->tf_fs = _ufssel;
455 regs->tf_gs = _ugssel;
456 regs->tf_flags = TF_HASSEGS;
457 PROC_LOCK(p);
458 mtx_lock(&psp->ps_mtx);
459}
460
461/*
462 * System call to clean up state after a signal
463 * has been taken. Reset signal mask and
464 * stack state from context left by sendsig (above).
465 * Return to previous pc and psl as specified by
466 * context left by sendsig. Check carefully to
467 * make sure that the user has not modified the
468 * state to gain improper privileges.
469 *
470 * MPSAFE
471 */
472int
473sys_sigreturn(td, uap)
474 struct thread *td;
475 struct sigreturn_args /* {
476 const struct __ucontext *sigcntxp;
477 } */ *uap;
478{
479 ucontext_t uc;
480 struct pcb *pcb;
481 struct proc *p;
482 struct trapframe *regs;
483 ucontext_t *ucp;
484 char *xfpustate;
485 size_t xfpustate_len;
486 long rflags;
487 int cs, error, ret;
488 ksiginfo_t ksi;
489
490 pcb = td->td_pcb;
491 p = td->td_proc;
492
493 error = copyin(uap->sigcntxp, &uc, sizeof(uc));
494 if (error != 0) {
495 uprintf("pid %d (%s): sigreturn copyin failed\n",
496 p->p_pid, td->td_name);
497 return (error);
498 }
499 ucp = &uc;
500 if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) {
501 uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid,
502 td->td_name, ucp->uc_mcontext.mc_flags);
503 return (EINVAL);
504 }
505 regs = td->td_frame;
506 rflags = ucp->uc_mcontext.mc_rflags;
507 /*
508 * Don't allow users to change privileged or reserved flags.
509 */
510 if (!EFL_SECURE(rflags, regs->tf_rflags)) {
511 uprintf("pid %d (%s): sigreturn rflags = 0x%lx\n", p->p_pid,
512 td->td_name, rflags);
513 return (EINVAL);
514 }
515
516 /*
517 * Don't allow users to load a valid privileged %cs. Let the
518 * hardware check for invalid selectors, excess privilege in
519	 * other selectors, invalid %rip's and invalid %rsp's.
520 */
521 cs = ucp->uc_mcontext.mc_cs;
522 if (!CS_SECURE(cs)) {
523 uprintf("pid %d (%s): sigreturn cs = 0x%x\n", p->p_pid,
524 td->td_name, cs);
525 ksiginfo_init_trap(&ksi);
526 ksi.ksi_signo = SIGBUS;
527 ksi.ksi_code = BUS_OBJERR;
528 ksi.ksi_trapno = T_PROTFLT;
529 ksi.ksi_addr = (void *)regs->tf_rip;
530 trapsignal(td, &ksi);
531 return (EINVAL);
532 }
533
534 if ((uc.uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) {
535 xfpustate_len = uc.uc_mcontext.mc_xfpustate_len;
536 if (xfpustate_len > cpu_max_ext_state_size -
537 sizeof(struct savefpu)) {
538 uprintf("pid %d (%s): sigreturn xfpusave_len = 0x%zx\n",
539 p->p_pid, td->td_name, xfpustate_len);
540 return (EINVAL);
541 }
542 xfpustate = __builtin_alloca(xfpustate_len);
543 error = copyin((const void *)uc.uc_mcontext.mc_xfpustate,
544 xfpustate, xfpustate_len);
545 if (error != 0) {
546 uprintf(
547 "pid %d (%s): sigreturn copying xfpustate failed\n",
548 p->p_pid, td->td_name);
549 return (error);
550 }
551 } else {
552 xfpustate = NULL;
553 xfpustate_len = 0;
554 }
555 ret = set_fpcontext(td, &ucp->uc_mcontext, xfpustate, xfpustate_len);
556 if (ret != 0) {
557 uprintf("pid %d (%s): sigreturn set_fpcontext err %d\n",
558 p->p_pid, td->td_name, ret);
559 return (ret);
560 }
561 bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(*regs));
562 update_pcb_bases(pcb);
563 pcb->pcb_fsbase = ucp->uc_mcontext.mc_fsbase;
564 pcb->pcb_gsbase = ucp->uc_mcontext.mc_gsbase;
565
566#if defined(COMPAT_43)
567 if (ucp->uc_mcontext.mc_onstack & 1)
568 td->td_sigstk.ss_flags |= SS_ONSTACK;
569 else
570 td->td_sigstk.ss_flags &= ~SS_ONSTACK;
571#endif
572
573 kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
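	/*
	 * EJUSTRETURN tells the syscall return path to leave the
	 * register state we just installed untouched.
	 */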
574 return (EJUSTRETURN);
575}
576
577#ifdef COMPAT_FREEBSD4
578int
579freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap)
580{
581
582 return sys_sigreturn(td, (struct sigreturn_args *)uap);
583}
584#endif
585
586/*
587 * Reset registers to default values on exec.
588 */
589void
590exec_setregs(struct thread *td, struct image_params *imgp, u_long stack)
591{
592 struct trapframe *regs;
593 struct pcb *pcb;
594 register_t saved_rflags;
595
596 regs = td->td_frame;
597 pcb = td->td_pcb;
598
599 mtx_lock(&dt_lock);
600 if (td->td_proc->p_md.md_ldt != NULL)
601 user_ldt_free(td);
602 else
603 mtx_unlock(&dt_lock);
604
605 update_pcb_bases(pcb);
606 pcb->pcb_fsbase = 0;
607 pcb->pcb_gsbase = 0;
608 clear_pcb_flags(pcb, PCB_32BIT);
609 pcb->pcb_initial_fpucw = __INITIAL_FPUCW__;
610
611 saved_rflags = regs->tf_rflags & PSL_T;
612 bzero((char *)regs, sizeof(struct trapframe));
613 regs->tf_rip = imgp->entry_addr;
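	/*
	 * Start with %rsp offset by 8 from a 16-byte boundary, the
	 * alignment a function body observes right after a CALL.
	 */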
614 regs->tf_rsp = ((stack - 8) & ~0xFul) + 8;
615 regs->tf_rdi = stack; /* argv */
616 regs->tf_rflags = PSL_USER | saved_rflags;
617 regs->tf_ss = _udatasel;
618 regs->tf_cs = _ucodesel;
619 regs->tf_ds = _udatasel;
620 regs->tf_es = _udatasel;
621 regs->tf_fs = _ufssel;
622 regs->tf_gs = _ugssel;
623 regs->tf_flags = TF_HASSEGS;
624 td->td_retval[1] = 0;
625
626 /*
627 * Reset the hardware debug registers if they were in use.
628 * They won't have any meaning for the newly exec'd process.
629 */
630 if (pcb->pcb_flags & PCB_DBREGS) {
631 pcb->pcb_dr0 = 0;
632 pcb->pcb_dr1 = 0;
633 pcb->pcb_dr2 = 0;
634 pcb->pcb_dr3 = 0;
635 pcb->pcb_dr6 = 0;
636 pcb->pcb_dr7 = 0;
637 if (pcb == curpcb) {
638 /*
639 * Clear the debug registers on the running
640 * CPU, otherwise they will end up affecting
641 * the next process we switch to.
642 */
643 reset_dbregs();
644 }
645 clear_pcb_flags(pcb, PCB_DBREGS);
646 }
647
648 /*
649 * Drop the FP state if we hold it, so that the process gets a
650 * clean FP state if it uses the FPU again.
651 */
652 fpstate_drop(td);
653}
654
655void
656cpu_setregs(void)
657{
658 register_t cr0;
659
660 cr0 = rcr0();
661 /*
662 * CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the
663 * BSP. See the comments there about why we set them.
664 */
665 cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
666 load_cr0(cr0);
667}
668
669/*
670 * Initialize amd64 and configure to run kernel
671 */
672
673/*
674 * Initialize segments & interrupt table
675 */
676
677struct user_segment_descriptor gdt[NGDT * MAXCPU];/* global descriptor tables */
678static struct gate_descriptor idt0[NIDT];
679struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */
680
681static char dblfault_stack[PAGE_SIZE] __aligned(16);
682static char mce0_stack[PAGE_SIZE] __aligned(16);
683static char nmi0_stack[PAGE_SIZE] __aligned(16);
684static char dbg0_stack[PAGE_SIZE] __aligned(16);
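/*
 * A struct nmi_pcpu is carved off the top of the nmi/mce/dbg stacks
 * above (see hammer_time()), and its address becomes the IST stack
 * top, so its size must be a multiple of 16 to keep those stacks
 * 16-byte aligned.
 */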
685CTASSERT(sizeof(struct nmi_pcpu) == 16);
686
687struct amd64tss common_tss[MAXCPU];
688
689/*
690 * Software prototypes -- in more palatable form.
691 *
692 * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
693 * slots as the corresponding segments for the i386 kernel.
694 */
695struct soft_segment_descriptor gdt_segs[] = {
696/* GNULL_SEL 0 Null Descriptor */
697{ .ssd_base = 0x0,
698 .ssd_limit = 0x0,
699 .ssd_type = 0,
700 .ssd_dpl = 0,
701 .ssd_p = 0,
702 .ssd_long = 0,
703 .ssd_def32 = 0,
704 .ssd_gran = 0 },
705/* GNULL2_SEL 1 Null Descriptor */
706{ .ssd_base = 0x0,
707 .ssd_limit = 0x0,
708 .ssd_type = 0,
709 .ssd_dpl = 0,
710 .ssd_p = 0,
711 .ssd_long = 0,
712 .ssd_def32 = 0,
713 .ssd_gran = 0 },
714/* GUFS32_SEL	2 32 bit %fs Descriptor for user */
715{ .ssd_base = 0x0,
716 .ssd_limit = 0xfffff,
717 .ssd_type = SDT_MEMRWA,
718 .ssd_dpl = SEL_UPL,
719 .ssd_p = 1,
720 .ssd_long = 0,
721 .ssd_def32 = 1,
722 .ssd_gran = 1 },
723/* GUGS32_SEL	3 32 bit %gs Descriptor for user */
724{ .ssd_base = 0x0,
725 .ssd_limit = 0xfffff,
726 .ssd_type = SDT_MEMRWA,
727 .ssd_dpl = SEL_UPL,
728 .ssd_p = 1,
729 .ssd_long = 0,
730 .ssd_def32 = 1,
731 .ssd_gran = 1 },
732/* GCODE_SEL 4 Code Descriptor for kernel */
733{ .ssd_base = 0x0,
734 .ssd_limit = 0xfffff,
735 .ssd_type = SDT_MEMERA,
736 .ssd_dpl = SEL_KPL,
737 .ssd_p = 1,
738 .ssd_long = 1,
739 .ssd_def32 = 0,
740 .ssd_gran = 1 },
741/* GDATA_SEL 5 Data Descriptor for kernel */
742{ .ssd_base = 0x0,
743 .ssd_limit = 0xfffff,
744 .ssd_type = SDT_MEMRWA,
745 .ssd_dpl = SEL_KPL,
746 .ssd_p = 1,
747 .ssd_long = 1,
748 .ssd_def32 = 0,
749 .ssd_gran = 1 },
750/* GUCODE32_SEL 6 32 bit Code Descriptor for user */
751{ .ssd_base = 0x0,
752 .ssd_limit = 0xfffff,
753 .ssd_type = SDT_MEMERA,
754 .ssd_dpl = SEL_UPL,
755 .ssd_p = 1,
756 .ssd_long = 0,
757 .ssd_def32 = 1,
758 .ssd_gran = 1 },
759/* GUDATA_SEL 7 32/64 bit Data Descriptor for user */
760{ .ssd_base = 0x0,
761 .ssd_limit = 0xfffff,
762 .ssd_type = SDT_MEMRWA,
763 .ssd_dpl = SEL_UPL,
764 .ssd_p = 1,
765 .ssd_long = 0,
766 .ssd_def32 = 1,
767 .ssd_gran = 1 },
768/* GUCODE_SEL 8 64 bit Code Descriptor for user */
769{ .ssd_base = 0x0,
770 .ssd_limit = 0xfffff,
771 .ssd_type = SDT_MEMERA,
772 .ssd_dpl = SEL_UPL,
773 .ssd_p = 1,
774 .ssd_long = 1,
775 .ssd_def32 = 0,
776 .ssd_gran = 1 },
777/* GPROC0_SEL 9 Proc 0 Tss Descriptor */
778{ .ssd_base = 0x0,
779 .ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
780 .ssd_type = SDT_SYSTSS,
781 .ssd_dpl = SEL_KPL,
782 .ssd_p = 1,
783 .ssd_long = 0,
784 .ssd_def32 = 0,
785 .ssd_gran = 0 },
786/* Actually, the TSS is a system descriptor which is double size */
787{ .ssd_base = 0x0,
788 .ssd_limit = 0x0,
789 .ssd_type = 0,
790 .ssd_dpl = 0,
791 .ssd_p = 0,
792 .ssd_long = 0,
793 .ssd_def32 = 0,
794 .ssd_gran = 0 },
795/* GUSERLDT_SEL 11 LDT Descriptor */
796{ .ssd_base = 0x0,
797 .ssd_limit = 0x0,
798 .ssd_type = 0,
799 .ssd_dpl = 0,
800 .ssd_p = 0,
801 .ssd_long = 0,
802 .ssd_def32 = 0,
803 .ssd_gran = 0 },
804/* GUSERLDT_SEL 12 LDT Descriptor, double size */
805{ .ssd_base = 0x0,
806 .ssd_limit = 0x0,
807 .ssd_type = 0,
808 .ssd_dpl = 0,
809 .ssd_p = 0,
810 .ssd_long = 0,
811 .ssd_def32 = 0,
812 .ssd_gran = 0 },
813};
814
815void
816setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
817{
818 struct gate_descriptor *ip;
819
820 ip = idt + idx;
821 ip->gd_looffset = (uintptr_t)func;
822 ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
823 ip->gd_ist = ist;
824 ip->gd_xx = 0;
825 ip->gd_type = typ;
826 ip->gd_dpl = dpl;
827 ip->gd_p = 1;
828	ip->gd_hioffset = ((uintptr_t)func) >> 16;
829}
830
831extern inthand_t
832 IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
833 IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
834 IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
835 IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
836 IDTVEC(xmm), IDTVEC(dblfault),
837 IDTVEC(div_pti), IDTVEC(bpt_pti),
838 IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
839 IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
840 IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
841 IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
842 IDTVEC(xmm_pti),
843#ifdef KDTRACE_HOOKS
844 IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
845#endif
846#ifdef XENHVM
847 IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
848#endif
849 IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
850 IDTVEC(fast_syscall_pti);
851
852#ifdef DDB
853/*
854 * Display the index and function name of any IDT entries that don't use
855 * the default 'rsvd' entry point.
856 */
857DB_SHOW_COMMAND(idt, db_show_idt)
858{
859 struct gate_descriptor *ip;
860 int idx;
861 uintptr_t func;
862
863 ip = idt;
864 for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
865 func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
866 if (func != (uintptr_t)&IDTVEC(rsvd)) {
867 db_printf("%3d\t", idx);
868 db_printsym(func, DB_STGY_PROC);
869 db_printf("\n");
870 }
871 ip++;
872 }
873}
874
875/* Show privileged registers. */
876DB_SHOW_COMMAND(sysregs, db_show_sysregs)
877{
878 struct {
879 uint16_t limit;
880 uint64_t base;
881 } __packed idtr, gdtr;
882 uint16_t ldt, tr;
883
884 __asm __volatile("sidt %0" : "=m" (idtr));
885 db_printf("idtr\t0x%016lx/%04x\n",
886 (u_long)idtr.base, (u_int)idtr.limit);
887 __asm __volatile("sgdt %0" : "=m" (gdtr));
888 db_printf("gdtr\t0x%016lx/%04x\n",
889 (u_long)gdtr.base, (u_int)gdtr.limit);
890 __asm __volatile("sldt %0" : "=r" (ldt));
891 db_printf("ldtr\t0x%04x\n", ldt);
892 __asm __volatile("str %0" : "=r" (tr));
893 db_printf("tr\t0x%04x\n", tr);
894 db_printf("cr0\t0x%016lx\n", rcr0());
895 db_printf("cr2\t0x%016lx\n", rcr2());
896 db_printf("cr3\t0x%016lx\n", rcr3());
897 db_printf("cr4\t0x%016lx\n", rcr4());
898 if (rcr4() & CR4_XSAVE)
899 db_printf("xcr0\t0x%016lx\n", rxcr(0));
900 db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER));
901 if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
902 db_printf("FEATURES_CTL\t%016lx\n",
903 rdmsr(MSR_IA32_FEATURE_CONTROL));
904 db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR));
905 db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT));
906 db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE));
907}
908
909DB_SHOW_COMMAND(dbregs, db_show_dbregs)
910{
911
912 db_printf("dr0\t0x%016lx\n", rdr0());
913 db_printf("dr1\t0x%016lx\n", rdr1());
914 db_printf("dr2\t0x%016lx\n", rdr2());
915 db_printf("dr3\t0x%016lx\n", rdr3());
916 db_printf("dr6\t0x%016lx\n", rdr6());
917 db_printf("dr7\t0x%016lx\n", rdr7());
918}
919#endif
920
921void
922sdtossd(sd, ssd)
923 struct user_segment_descriptor *sd;
924 struct soft_segment_descriptor *ssd;
925{
926
927 ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase;
928 ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
929 ssd->ssd_type = sd->sd_type;
930 ssd->ssd_dpl = sd->sd_dpl;
931 ssd->ssd_p = sd->sd_p;
932 ssd->ssd_long = sd->sd_long;
933 ssd->ssd_def32 = sd->sd_def32;
934 ssd->ssd_gran = sd->sd_gran;
935}
936
937void
938ssdtosd(ssd, sd)
939 struct soft_segment_descriptor *ssd;
940 struct user_segment_descriptor *sd;
941{
942
943 sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
944 sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
945 sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
946 sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
947 sd->sd_type = ssd->ssd_type;
948 sd->sd_dpl = ssd->ssd_dpl;
949 sd->sd_p = ssd->ssd_p;
950 sd->sd_long = ssd->ssd_long;
951 sd->sd_def32 = ssd->ssd_def32;
952 sd->sd_gran = ssd->ssd_gran;
953}
954
955void
956ssdtosyssd(ssd, sd)
957 struct soft_segment_descriptor *ssd;
958 struct system_segment_descriptor *sd;
959{
960
961 sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
962 sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
963 sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
964 sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
965 sd->sd_type = ssd->ssd_type;
966 sd->sd_dpl = ssd->ssd_dpl;
967 sd->sd_p = ssd->ssd_p;
968 sd->sd_gran = ssd->ssd_gran;
969}
970
971#if !defined(DEV_ATPIC) && defined(DEV_ISA)
972#include <isa/isavar.h>
973#include <isa/isareg.h>
974/*
975 * Return a bitmap of the current interrupt requests. This is 8259-specific
976 * and is only suitable for use at probe time.
977 * This is only here to pacify sio. It is NOT FATAL if this doesn't work.
978	 * It shouldn't be here. There should probably be an APIC-centric
979 * implementation in the apic driver code, if at all.
980 */
981intrmask_t
982isa_irq_pending(void)
983{
984 u_char irr1;
985 u_char irr2;
986
987 irr1 = inb(IO_ICU1);
988 irr2 = inb(IO_ICU2);
989 return ((irr2 << 8) | irr1);
990}
991#endif
992
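/* Amount of conventional ("base") memory below 640K, in kilobytes. */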
993u_int basemem;
994
995static int
996add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
997 int *physmap_idxp)
998{
999 int i, insert_idx, physmap_idx;
1000
1001 physmap_idx = *physmap_idxp;
1002
1003 if (length == 0)
1004 return (1);
1005
1006 /*
1007 * Find insertion point while checking for overlap. Start off by
1008 * assuming the new entry will be added to the end.
1009 *
1010 * NB: physmap_idx points to the next free slot.
1011 */
1012 insert_idx = physmap_idx;
1013 for (i = 0; i <= physmap_idx; i += 2) {
1014 if (base < physmap[i + 1]) {
1015 if (base + length <= physmap[i]) {
1016 insert_idx = i;
1017 break;
1018 }
1019 if (boothowto & RB_VERBOSE)
1020 printf(
1021 "Overlapping memory regions, ignoring second region\n");
1022 return (1);
1023 }
1024 }
1025
1026 /* See if we can prepend to the next entry. */
1027 if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) {
1028 physmap[insert_idx] = base;
1029 return (1);
1030 }
1031
1032 /* See if we can append to the previous entry. */
1033 if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
1034 physmap[insert_idx - 1] += length;
1035 return (1);
1036 }
1037
1038 physmap_idx += 2;
1039 *physmap_idxp = physmap_idx;
1040 if (physmap_idx == PHYSMAP_SIZE) {
1041 printf(
1042 "Too many segments in the physical address map, giving up\n");
1043 return (0);
1044 }
1045
1046 /*
1047 * Move the last 'N' entries down to make room for the new
1048 * entry if needed.
1049 */
1050 for (i = (physmap_idx - 2); i > insert_idx; i -= 2) {
1051 physmap[i] = physmap[i - 2];
1052 physmap[i + 1] = physmap[i - 1];
1053 }
1054
1055 /* Insert the new entry. */
1056 physmap[insert_idx] = base;
1057 physmap[insert_idx + 1] = base + length;
1058 return (1);
1059}
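/*
 * Example of add_physmap_entry() with hypothetical values: given an
 * existing map entry {0x0, 0x9fc00} and a new region base=0x100000,
 * length=0x7ff00000, the overlap scan finds no conflict, neither
 * coalescing case applies, and the pair {0x100000, 0x80000000} lands
 * in the next free slot, advancing *physmap_idxp by 2.
 */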
1060
1061void
1062bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize,
1063 vm_paddr_t *physmap, int *physmap_idx)
1064{
1065 struct bios_smap *smap, *smapend;
1066
1067 smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
1068
1069 for (smap = smapbase; smap < smapend; smap++) {
1070 if (boothowto & RB_VERBOSE)
1071 printf("SMAP type=%02x base=%016lx len=%016lx\n",
1072 smap->type, smap->base, smap->length);
1073
1074 if (smap->type != SMAP_TYPE_MEMORY)
1075 continue;
1076
1077 if (!add_physmap_entry(smap->base, smap->length, physmap,
1078 physmap_idx))
1079 break;
1080 }
1081}
1082
1083static void
1084add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
1085 int *physmap_idx)
1086{
1087 struct efi_md *map, *p;
1088 const char *type;
1089 size_t efisz;
1090 int ndesc, i;
1091
1092 static const char *types[] = {
1093 "Reserved",
1094 "LoaderCode",
1095 "LoaderData",
1096 "BootServicesCode",
1097 "BootServicesData",
1098 "RuntimeServicesCode",
1099 "RuntimeServicesData",
1100 "ConventionalMemory",
1101 "UnusableMemory",
1102 "ACPIReclaimMemory",
1103 "ACPIMemoryNVS",
1104 "MemoryMappedIO",
1105 "MemoryMappedIOPortSpace",
1106 "PalCode",
1107 "PersistentMemory"
1108 };
1109
1110 /*
1111 * Memory map data provided by UEFI via the GetMemoryMap
1112 * Boot Services API.
1113 */
1114 efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
1115 map = (struct efi_md *)((uint8_t *)efihdr + efisz);
1116
1117 if (efihdr->descriptor_size == 0)
1118 return;
1119 ndesc = efihdr->memory_size / efihdr->descriptor_size;
1120
1121 if (boothowto & RB_VERBOSE)
1122 printf("%23s %12s %12s %8s %4s\n",
1123 "Type", "Physical", "Virtual", "#Pages", "Attr");
1124
1125 for (i = 0, p = map; i < ndesc; i++,
1126 p = efi_next_descriptor(p, efihdr->descriptor_size)) {
1127 if (boothowto & RB_VERBOSE) {
1128 if (p->md_type < nitems(types))
1129 type = types[p->md_type];
1130 else
1131 type = "<INVALID>";
1132 printf("%23s %012lx %12p %08lx ", type, p->md_phys,
1133 p->md_virt, p->md_pages);
1134 if (p->md_attr & EFI_MD_ATTR_UC)
1135 printf("UC ");
1136 if (p->md_attr & EFI_MD_ATTR_WC)
1137 printf("WC ");
1138 if (p->md_attr & EFI_MD_ATTR_WT)
1139 printf("WT ");
1140 if (p->md_attr & EFI_MD_ATTR_WB)
1141 printf("WB ");
1142 if (p->md_attr & EFI_MD_ATTR_UCE)
1143 printf("UCE ");
1144 if (p->md_attr & EFI_MD_ATTR_WP)
1145 printf("WP ");
1146 if (p->md_attr & EFI_MD_ATTR_RP)
1147 printf("RP ");
1148 if (p->md_attr & EFI_MD_ATTR_XP)
1149 printf("XP ");
1150 if (p->md_attr & EFI_MD_ATTR_NV)
1151 printf("NV ");
1152 if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE)
1153 printf("MORE_RELIABLE ");
1154 if (p->md_attr & EFI_MD_ATTR_RO)
1155 printf("RO ");
1156 if (p->md_attr & EFI_MD_ATTR_RT)
1157 printf("RUNTIME");
1158 printf("\n");
1159 }
1160
1161 switch (p->md_type) {
1162 case EFI_MD_TYPE_CODE:
1163 case EFI_MD_TYPE_DATA:
1164 case EFI_MD_TYPE_BS_CODE:
1165 case EFI_MD_TYPE_BS_DATA:
1166 case EFI_MD_TYPE_FREE:
1167 /*
1168 * We're allowed to use any entry with these types.
1169 */
1170 break;
1171 default:
1172 continue;
1173 }
1174
1175 if (!add_physmap_entry(p->md_phys, (p->md_pages * PAGE_SIZE),
1176 physmap, physmap_idx))
1177 break;
1178 }
1179}
1180
1181static char bootmethod[16] = "";
1182SYSCTL_STRING(_machdep, OID_AUTO, bootmethod, CTLFLAG_RD, bootmethod, 0,
1183 "System firmware boot method");
1184
1185static void
1186native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx)
1187{
1188 struct bios_smap *smap;
1189 struct efi_map_header *efihdr;
1190 u_int32_t size;
1191
1192 /*
1193 * Memory map from INT 15:E820.
1194 *
1195 * subr_module.c says:
1196 * "Consumer may safely assume that size value precedes data."
1197	 * i.e., an int32_t immediately precedes smap.
1198 */
1199
1200 efihdr = (struct efi_map_header *)preload_search_info(kmdp,
1201 MODINFO_METADATA | MODINFOMD_EFI_MAP);
1202 smap = (struct bios_smap *)preload_search_info(kmdp,
1203 MODINFO_METADATA | MODINFOMD_SMAP);
1204 if (efihdr == NULL && smap == NULL)
1205 panic("No BIOS smap or EFI map info from loader!");
1206
1207 if (efihdr != NULL) {
1208 add_efi_map_entries(efihdr, physmap, physmap_idx);
1209 strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
1210 } else {
1211 size = *((u_int32_t *)smap - 1);
1212 bios_add_smap_entries(smap, size, physmap, physmap_idx);
1213 strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
1214 }
1215}
1216
1217#define PAGES_PER_GB (1024 * 1024 * 1024 / PAGE_SIZE)
1218
1219/*
1220 * Populate the (physmap) array with base/bound pairs describing the
1221 * available physical memory in the system, then test this memory and
1222 * build the phys_avail array describing the actually-available memory.
1223 *
1224 * Total memory size may be set by the kernel environment variable
1225 * hw.physmem or the compile-time define MAXMEM.
1226 *
1227 * XXX first should be vm_paddr_t.
1228 */
1229static void
1230getmemsize(caddr_t kmdp, u_int64_t first)
1231{
1232 int i, physmap_idx, pa_indx, da_indx;
1233 vm_paddr_t pa, physmap[PHYSMAP_SIZE];
1234 u_long physmem_start, physmem_tunable, memtest;
1235 pt_entry_t *pte;
1236 quad_t dcons_addr, dcons_size;
1237 int page_counter;
1238
1239 /*
1240 * Tell the physical memory allocator about pages used to store
1241 * the kernel and preloaded data. See kmem_bootstrap_free().
1242 */
1243 vm_phys_add_seg((vm_paddr_t)kernphys, trunc_page(first));
1244
1245 bzero(physmap, sizeof(physmap));
1246 physmap_idx = 0;
1247
1248 init_ops.parse_memmap(kmdp, physmap, &physmap_idx);
1249 physmap_idx -= 2;
1250
1251 /*
1252 * Find the 'base memory' segment for SMP
1253 */
1254 basemem = 0;
1255 for (i = 0; i <= physmap_idx; i += 2) {
1256 if (physmap[i] <= 0xA0000) {
1257 basemem = physmap[i + 1] / 1024;
1258 break;
1259 }
1260 }
1261 if (basemem == 0 || basemem > 640) {
1262 if (bootverbose)
1263 printf(
1264		"Memory map doesn't contain a basemem segment, faking it\n");
1265 basemem = 640;
1266 }
1267
1268 /*
1269 * Make hole for "AP -> long mode" bootstrap code. The
1270 * mp_bootaddress vector is only available when the kernel
1271	 * is configured to support APs and the APs start in 32-bit
1272	 * mode (e.g., SMP bare metal).
1273 */
1274 if (init_ops.mp_bootaddress) {
1275 if (physmap[1] >= 0x100000000)
1276 panic(
1277 "Basemem segment is not suitable for AP bootstrap code!");
1278 physmap[1] = init_ops.mp_bootaddress(physmap[1] / 1024);
1279 }
1280
1281 /*
1282 * Maxmem isn't the "maximum memory", it's one larger than the
1283 * highest page of the physical address space. It should be
1284 * called something like "Maxphyspage". We may adjust this
1285 * based on ``hw.physmem'' and the results of the memory test.
1286 */
1287 Maxmem = atop(physmap[physmap_idx + 1]);
1288
1289#ifdef MAXMEM
1290 Maxmem = MAXMEM / 4;
1291#endif
1292
1293 if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
1294 Maxmem = atop(physmem_tunable);
1295
1296 /*
1297 * The boot memory test is disabled by default, as it takes a
1298 * significant amount of time on large-memory systems, and is
1299 * unfriendly to virtual machines as it unnecessarily touches all
1300 * pages.
1301 *
1302 * A general name is used as the code may be extended to support
1303 * additional tests beyond the current "page present" test.
1304 */
1305 memtest = 0;
1306 TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);
1307
1308 /*
1309 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
1310 * in the system.
1311 */
1312 if (Maxmem > atop(physmap[physmap_idx + 1]))
1313 Maxmem = atop(physmap[physmap_idx + 1]);
1314
1315 if (atop(physmap[physmap_idx + 1]) != Maxmem &&
1316 (boothowto & RB_VERBOSE))
1317 printf("Physical memory use set to %ldK\n", Maxmem * 4);
1318
1319 /* call pmap initialization to make new kernel address space */
1320 pmap_bootstrap(&first);
1321
1322 /*
1323 * Size up each available chunk of physical memory.
1324 *
1325 * XXX Some BIOSes corrupt low 64KB between suspend and resume.
1326 * By default, mask off the first 16 pages unless we appear to be
1327 * running in a VM.
1328 */
1329 physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
1330 TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
1331 if (physmap[0] < physmem_start) {
1332 if (physmem_start < PAGE_SIZE)
1333 physmap[0] = PAGE_SIZE;
1334 else if (physmem_start >= physmap[1])
1335 physmap[0] = round_page(physmap[1] - PAGE_SIZE);
1336 else
1337 physmap[0] = round_page(physmem_start);
1338 }
1339 pa_indx = 0;
1340 da_indx = 1;
1341 phys_avail[pa_indx++] = physmap[0];
1342 phys_avail[pa_indx] = physmap[0];
1343 dump_avail[da_indx] = physmap[0];
1344 pte = CMAP1;
1345
1346 /*
1347 * Get dcons buffer address
1348 */
1349 if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
1350 getenv_quad("dcons.size", &dcons_size) == 0)
1351 dcons_addr = 0;
1352
1353 /*
1354 * physmap is in bytes, so when converting to page boundaries,
1355 * round up the start address and round down the end address.
1356 */
1357 page_counter = 0;
1358 if (memtest != 0)
1359 printf("Testing system memory");
1360 for (i = 0; i <= physmap_idx; i += 2) {
1361 vm_paddr_t end;
1362
1363 end = ptoa((vm_paddr_t)Maxmem);
1364 if (physmap[i + 1] < end)
1365 end = trunc_page(physmap[i + 1]);
1366 for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
1367 int tmp, page_bad, full;
1368 int *ptr = (int *)CADDR1;
1369
1370 full = FALSE;
1371 /*
1372 * block out kernel memory as not available.
1373 */
1374 if (pa >= (vm_paddr_t)kernphys && pa < first)
1375 goto do_dump_avail;
1376
1377 /*
1378 * block out dcons buffer
1379 */
1380 if (dcons_addr > 0
1381 && pa >= trunc_page(dcons_addr)
1382 && pa < dcons_addr + dcons_size)
1383 goto do_dump_avail;
1384
1385 page_bad = FALSE;
1386 if (memtest == 0)
1387 goto skip_memtest;
1388
1389 /*
1390 * Print a "." every GB to show we're making
1391 * progress.
1392 */
1393 page_counter++;
1394 if ((page_counter % PAGES_PER_GB) == 0)
1395 printf(".");
1396
1397 /*
1398 * map page into kernel: valid, read/write,non-cacheable
1399 */
1400 *pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
1401 invltlb();
1402
1403 tmp = *(int *)ptr;
1404 /*
1405 * Test for alternating 1's and 0's
1406 */
1407 *(volatile int *)ptr = 0xaaaaaaaa;
1408 if (*(volatile int *)ptr != 0xaaaaaaaa)
1409 page_bad = TRUE;
1410 /*
1411 * Test for alternating 0's and 1's
1412 */
1413 *(volatile int *)ptr = 0x55555555;
1414 if (*(volatile int *)ptr != 0x55555555)
1415 page_bad = TRUE;
1416 /*
1417 * Test for all 1's
1418 */
1419 *(volatile int *)ptr = 0xffffffff;
1420 if (*(volatile int *)ptr != 0xffffffff)
1421 page_bad = TRUE;
1422 /*
1423 * Test for all 0's
1424 */
1425 *(volatile int *)ptr = 0x0;
1426 if (*(volatile int *)ptr != 0x0)
1427 page_bad = TRUE;
1428 /*
1429 * Restore original value.
1430 */
1431 *(int *)ptr = tmp;
1432
1433skip_memtest:
1434 /*
1435 * Adjust array of valid/good pages.
1436 */
1437 if (page_bad == TRUE)
1438 continue;
1439 /*
1440 * If this good page is a continuation of the
1441 * previous set of good pages, then just increase
1442 * the end pointer. Otherwise start a new chunk.
1443			 * Note that "end" points one page beyond the last
1444			 * good page, making the range >= start and < end.
1445			 * If we're also doing a speculative memory
1446			 * test and we are at or past the end, bump up Maxmem
1447 * so that we keep going. The first bad page
1448 * will terminate the loop.
1449 */
1450 if (phys_avail[pa_indx] == pa) {
1451 phys_avail[pa_indx] += PAGE_SIZE;
1452 } else {
1453 pa_indx++;
1454 if (pa_indx == PHYS_AVAIL_ARRAY_END) {
1455 printf(
1456 "Too many holes in the physical address space, giving up\n");
1457 pa_indx--;
1458 full = TRUE;
1459 goto do_dump_avail;
1460 }
1461 phys_avail[pa_indx++] = pa; /* start */
1462 phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
1463 }
1464 physmem++;
1465do_dump_avail:
1466 if (dump_avail[da_indx] == pa) {
1467 dump_avail[da_indx] += PAGE_SIZE;
1468 } else {
1469 da_indx++;
1470 if (da_indx == DUMP_AVAIL_ARRAY_END) {
1471 da_indx--;
1472 goto do_next;
1473 }
1474 dump_avail[da_indx++] = pa; /* start */
1475 dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
1476 }
1477do_next:
1478 if (full)
1479 break;
1480 }
1481 }
1482 *pte = 0;
1483 invltlb();
1484 if (memtest != 0)
1485 printf("\n");
1486
1487 /*
1488 * XXX
1489 * The last chunk must contain at least one page plus the message
1490 * buffer to avoid complicating other code (message buffer address
1491 * calculation, etc.).
1492 */
1493 while (phys_avail[pa_indx - 1] + PAGE_SIZE +
1494 round_page(msgbufsize) >= phys_avail[pa_indx]) {
1495 physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
1496 phys_avail[pa_indx--] = 0;
1497 phys_avail[pa_indx--] = 0;
1498 }
1499
1500 Maxmem = atop(phys_avail[pa_indx]);
1501
1502 /* Trim off space for the message buffer. */
1503 phys_avail[pa_indx] -= round_page(msgbufsize);
1504
1505 /* Map the message buffer. */
1506 msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
1507}
1508
1509static caddr_t
1510native_parse_preload_data(u_int64_t modulep)
1511{
1512 caddr_t kmdp;
1513 char *envp;
1514#ifdef DDB
1515 vm_offset_t ksym_start;
1516 vm_offset_t ksym_end;
1517#endif
1518
1519 preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
1520 preload_bootstrap_relocate(KERNBASE);
1521 kmdp = preload_search_by_type("elf kernel");
1522 if (kmdp == NULL)
1523 kmdp = preload_search_by_type("elf64 kernel");
1524 boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
1525 envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *);
1526 if (envp != NULL)
1527 envp += KERNBASE;
1528 init_static_kenv(envp, 0);
1529#ifdef DDB
1530 ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
1531 ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
1532 db_fetch_ksymtab(ksym_start, ksym_end);
1533#endif
1534 efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);
1535
1536 return (kmdp);
1537}
1538
1539static void
1540amd64_kdb_init(void)
1541{
1542 kdb_init();
1543#ifdef KDB
1544 if (boothowto & RB_KDB)
1545 kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
1546#endif
1547}
1548
1549/* Set up the fast syscall stuff */
1550void
1551amd64_conf_fast_syscall(void)
1552{
1553 uint64_t msr;
1554
1555 msr = rdmsr(MSR_EFER) | EFER_SCE;
1556 wrmsr(MSR_EFER, msr);
1557 wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) :
1558 (u_int64_t)IDTVEC(fast_syscall));
1559 wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
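	/*
	 * MSR_STAR: bits 47:32 hold the kernel selector base loaded by
	 * SYSCALL (%cs = base, %ss = base + 8); bits 63:48 hold the user
	 * base used by SYSRET (32-bit %cs = base, %ss = base + 8, 64-bit
	 * %cs = base + 16), which is why GUCODE32, GUDATA and GUCODE must
	 * stay adjacent in the GDT. Bits set in MSR_SF_MASK are cleared
	 * from %rflags on SYSCALL.
	 */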
1560 msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
1561 ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
1562 wrmsr(MSR_STAR, msr);
1563 wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D);
1564}
1565
1566u_int64_t
1567hammer_time(u_int64_t modulep, u_int64_t physfree)
1568{
1569 caddr_t kmdp;
1570 int gsel_tss, x;
1571 struct pcpu *pc;
1572 struct nmi_pcpu *np;
1573 struct xstate_hdr *xhdr;
1574 u_int64_t rsp0;
1575 char *env;
1576 size_t kstack0_sz;
1577 int late_console;
1578
1579 kmdp = init_ops.parse_preload_data(modulep);
1580
1581 identify_cpu1();
1582 identify_hypervisor();
1583 /*
1584	 * hw.cpu_stdext_disable is ignored by this call; it will be
1585	 * re-evaluated by the call to finishidentcpu() below.
1586 */
1587 identify_cpu2();
1588
1589 link_elf_ireloc(kmdp);
1590
1591 /*
1592	 * This may be done better later if more high-level components
1593	 * are added. If so, just link td->td_proc here.
1594 */
1595 proc_linkup0(&proc0, &thread0);
1596
1597 /* Init basic tunables, hz etc */
1598 init_param1();
1599
1600 thread0.td_kstack = physfree + KERNBASE;
1601 thread0.td_kstack_pages = kstack_pages;
1602 kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
1603 bzero((void *)thread0.td_kstack, kstack0_sz);
1604 physfree += kstack0_sz;
1605
1606 /*
1607 * make gdt memory segments
1608 */
1609 for (x = 0; x < NGDT; x++) {
1610 if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
1611		    x != GUSERLDT_SEL && x != (GUSERLDT_SEL + 1))
1612 ssdtosd(&gdt_segs[x], &gdt[x]);
1613 }
1614 gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&common_tss[0];
1615 ssdtosyssd(&gdt_segs[GPROC0_SEL],
1616 (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
1617
1618 r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
1619 r_gdt.rd_base = (long) gdt;
1620 lgdt(&r_gdt);
1621 pc = &__pcpu[0];
1622
1623 wrmsr(MSR_FSBASE, 0); /* User value */
1624 wrmsr(MSR_GSBASE, (u_int64_t)pc);
1625 wrmsr(MSR_KGSBASE, 0); /* User value while in the kernel */
1626
1627 pcpu_init(pc, 0, sizeof(struct pcpu));
1628 dpcpu_init((void *)(physfree + KERNBASE), 0);
1629 physfree += DPCPU_SIZE;
1630 PCPU_SET(prvspace, pc);
1631 PCPU_SET(curthread, &thread0);
1632 /* Non-late cninit() and printf() can be moved up to here. */
1633 PCPU_SET(tssp, &common_tss[0]);
1634 PCPU_SET(commontssp, &common_tss[0]);
1635 PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
1636 PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
1637 PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
1638 PCPU_SET(gs32p, &gdt[GUGS32_SEL]);
1639
1640 /*
1641 * Initialize mutexes.
1642 *
1643 * icu_lock: in order to allow an interrupt to occur in a critical
1644 * section, to set pcpu->ipending (etc...) properly, we
1645 * must be able to get the icu lock, so it can't be
1646 * under witness.
1647 */
1648 mutex_init();
1649 mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
1650 mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);
1651
1652 /* exceptions */
1653 pti = pti_get_default();
1654 TUNABLE_INT_FETCH("vm.pmap.pti", &pti);
1655
1656 for (x = 0; x < NIDT; x++)
1657 setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT,
1658 SEL_KPL, 0);
1659 setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
1660 SEL_KPL, 0);
1661 setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
1662 setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 2);
1663 setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
1664 SEL_UPL, 0);
1665 setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT,
1666 SEL_UPL, 0);
1667 setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT,
1668 SEL_KPL, 0);
1669 setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT,
1670 SEL_KPL, 0);
1671 setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT,
1672 SEL_KPL, 0);
1673 setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
1674 setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm),
1675 SDT_SYSIGT, SEL_KPL, 0);
1676 setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT,
1677 SEL_KPL, 0);
1678 setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing),
1679 SDT_SYSIGT, SEL_KPL, 0);
1680 setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT,
1681 SEL_KPL, 0);
1682 setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT,
1683 SEL_KPL, 0);
1684 setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT,
1685 SEL_KPL, 0);
1686 setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT,
1687 SEL_KPL, 0);
1688 setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
1689 SEL_KPL, 0);
1690 setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3);
1691 setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
1692 SEL_KPL, 0);
1693#ifdef KDTRACE_HOOKS
1694 setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) :
1695 &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
1696#endif
1697#ifdef XENHVM
1698 setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) :
1699 &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0);
1700#endif
1701 r_idt.rd_limit = sizeof(idt0) - 1;
1702 r_idt.rd_base = (long) idt;
1703 lidt(&r_idt);
1704
1705 /*
1706 * Initialize the clock before the console so that console
1707 * initialization can use DELAY().
1708 */
1709 clock_init();
1710
1711 /*
1712 * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4)
1713 * transition).
1714	 * Once the bootblocks have been updated, we can test directly for
1715 * efi_systbl != NULL here...
1716 */
1717 if (preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP)
1718 != NULL)
1719 vty_set_preferred(VTY_VT);
1720
1721 finishidentcpu(); /* Final stage of CPU initialization */
1722 initializecpu(); /* Initialize CPU registers */
1723 initializecpucache();
1724
1725 /* doublefault stack space, runs on ist1 */
1726 common_tss[0].tss_ist1 = (long)&dblfault_stack[sizeof(dblfault_stack)];
1727
1728 /*
1729 * NMI stack, runs on ist2. The pcpu pointer is stored just
1730 * above the start of the ist2 stack.
1731 */
1732 np = ((struct nmi_pcpu *) &nmi0_stack[sizeof(nmi0_stack)]) - 1;
1733 np->np_pcpu = (register_t) pc;
1734 common_tss[0].tss_ist2 = (long) np;
1735
1736 /*
1737 * MC# stack, runs on ist3. The pcpu pointer is stored just
1738 * above the start of the ist3 stack.
1739 */
1740 np = ((struct nmi_pcpu *) &mce0_stack[sizeof(mce0_stack)]) - 1;
1741 np->np_pcpu = (register_t) pc;
1742 common_tss[0].tss_ist3 = (long) np;
1743
1744 /*
1745 * DB# stack, runs on ist4.
1746 */
1747 np = ((struct nmi_pcpu *) &dbg0_stack[sizeof(dbg0_stack)]) - 1;
1748 np->np_pcpu = (register_t) pc;
1749 common_tss[0].tss_ist4 = (long) np;
1750
1751 /* Set the IO permission bitmap (empty due to tss seg limit) */
1752 common_tss[0].tss_iobase = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE;
1753
1754 gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
1755 ltr(gsel_tss);
1756
1757 amd64_conf_fast_syscall();
1758
1759 /*
1760	 * Temporarily forge a valid pointer to the PCB for the exception
1761	 * handlers. It is reinitialized properly below, after the FPU is
1762	 * set up. Also set td_critnest to short-cut the page fault
1763	 * handler.
1764 */
1765 cpu_max_ext_state_size = sizeof(struct savefpu);
1766 thread0.td_pcb = get_pcb_td(&thread0);
1767 thread0.td_critnest = 1;
1768
1769 /*
1770 * The console and kdb should be initialized even earlier than here,
1771 * but some console drivers don't work until after getmemsize().
1772 * Default to late console initialization to support these drivers.
1773 * This loses mainly printf()s in getmemsize() and early debugging.
1774 */
1775 late_console = 1;
1776 TUNABLE_INT_FETCH("debug.late_console", &late_console);
1777 if (!late_console) {
1778 cninit();
1779 amd64_kdb_init();
1780 }
1781
1782 getmemsize(kmdp, physfree);
1783 init_param2(physmem);
1784
1785	/* now running on new page tables, configured, and u/iom is accessible */
1786
1787 if (late_console)
1788 cninit();
1789
1790#ifdef DEV_ISA
1791#ifdef DEV_ATPIC
1792 elcr_probe();
1793 atpic_startup();
1794#else
1795 /* Reset and mask the atpics and leave them shut down. */
1796 atpic_reset();
1797
1798 /*
1799 * Point the ICU spurious interrupt vectors at the APIC spurious
1800 * interrupt handler.
1801 */
1802 setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
1803 setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
1804#endif
1805#else
1806#error "have you forgotten the isa device?"
1807#endif
1808
1809 if (late_console)
1810 amd64_kdb_init();
1811
1812 msgbufinit(msgbufp, msgbufsize);
1813 fpuinit();
1814
1815 /*
1816 * Set up thread0 pcb after fpuinit calculated pcb + fpu save
1817 * area size. Zero out the extended state header in fpu save
1818 * area.
1819 */
1820 thread0.td_pcb = get_pcb_td(&thread0);
1821 thread0.td_pcb->pcb_save = get_pcb_user_save_td(&thread0);
1822 bzero(get_pcb_user_save_td(&thread0), cpu_max_ext_state_size);
1823 if (use_xsave) {
1824 xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) +
1825 1);
1826 xhdr->xstate_bv = xsave_mask;
1827 }
1828 /* make an initial tss so cpu can get interrupt stack on syscall! */
1829 rsp0 = (vm_offset_t)thread0.td_pcb;
1830 /* Ensure the stack is aligned to 16 bytes */
1831 rsp0 &= ~0xFul;
1832 common_tss[0].tss_rsp0 = rsp0;
1833 PCPU_SET(rsp0, rsp0);
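	/* Top of the per-CPU PTI trampoline stack, also 16-byte aligned. */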
1834 PCPU_SET(pti_rsp0, ((vm_offset_t)PCPU_PTR(pti_stack) +
1835 PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful);
1836 PCPU_SET(curpcb, thread0.td_pcb);
1837
1838 /* transfer to user mode */
1839
1840 _ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
1841 _udatasel = GSEL(GUDATA_SEL, SEL_UPL);
1842 _ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
1843 _ufssel = GSEL(GUFS32_SEL, SEL_UPL);
1844 _ugssel = GSEL(GUGS32_SEL, SEL_UPL);
1845
1846 load_ds(_udatasel);
1847 load_es(_udatasel);
1848 load_fs(_ufssel);
1849
1850 /* setup proc 0's pcb */
1851 thread0.td_pcb->pcb_flags = 0;
1852 thread0.td_frame = &proc0_tf;
1853
1854 env = kern_getenv("kernelname");
1855 if (env != NULL)
1856 strlcpy(kernelname, env, sizeof(kernelname));
1857
1858 cpu_probe_amdc1e();
1859
1860#ifdef FDT
1861 x86_init_fdt();
1862#endif
1863 thread0.td_critnest = 0;
1864
1865 TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
1866 TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable);
1867 TUNABLE_INT_FETCH("hw.mds_disable", &hw_mds_disable);
1868 /* Location of kernel stack for locore */
1869 return ((u_int64_t)thread0.td_pcb);
1870}
1871
1872void
1873cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
1874{
1875
1876 pcpu->pc_acpi_id = 0xffffffff;
1877}
1878
1879static int
1880smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
1881{
1882 struct bios_smap *smapbase;
1883 struct bios_smap_xattr smap;
1884 caddr_t kmdp;
1885 uint32_t *smapattr;
1886 int count, error, i;
1887
1888 /* Retrieve the system memory map from the loader. */
1889 kmdp = preload_search_by_type("elf kernel");
1890 if (kmdp == NULL)
1891 kmdp = preload_search_by_type("elf64 kernel");
1892 smapbase = (struct bios_smap *)preload_search_info(kmdp,
1893 MODINFO_METADATA | MODINFOMD_SMAP);
1894 if (smapbase == NULL)
1895 return (0);
1896 smapattr = (uint32_t *)preload_search_info(kmdp,
1897 MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
1898 count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase);
1899 error = 0;
1900 for (i = 0; i < count; i++) {
1901 smap.base = smapbase[i].base;
1902 smap.length = smapbase[i].length;
1903 smap.type = smapbase[i].type;
1904 if (smapattr != NULL)
1905 smap.xattr = smapattr[i];
1906 else
1907 smap.xattr = 0;
1908 error = SYSCTL_OUT(req, &smap, sizeof(smap));
1909 }
1910 return (error);
1911}
1912SYSCTL_PROC(_machdep, OID_AUTO, smap, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
1913 smap_sysctl_handler, "S,bios_smap_xattr", "Raw BIOS SMAP data");
1914
1915static int
1916efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
1917{
1918 struct efi_map_header *efihdr;
1919 caddr_t kmdp;
1920 uint32_t efisize;
1921
1922 kmdp = preload_search_by_type("elf kernel");
1923 if (kmdp == NULL)
1924 kmdp = preload_search_by_type("elf64 kernel");
1925 efihdr = (struct efi_map_header *)preload_search_info(kmdp,
1926 MODINFO_METADATA | MODINFOMD_EFI_MAP);
1927 if (efihdr == NULL)
1928 return (0);
1929 efisize = *((uint32_t *)efihdr - 1);
1930 return (SYSCTL_OUT(req, efihdr, efisize));
1931}
1932SYSCTL_PROC(_machdep, OID_AUTO, efi_map, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
1933 efi_map_sysctl_handler, "S,efi_map_header", "Raw EFI Memory Map");
1934
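/*
 * Spinlock sections disable interrupts on first entry and restore the
 * saved flags only when the outermost section exits; nesting is
 * tracked by a per-thread count.
 */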
1935void
1936spinlock_enter(void)
1937{
1938 struct thread *td;
1939 register_t flags;
1940
1941 td = curthread;
1942 if (td->td_md.md_spinlock_count == 0) {
1943 flags = intr_disable();
1944 td->td_md.md_spinlock_count = 1;
1945 td->td_md.md_saved_flags = flags;
1946 } else
1947 td->td_md.md_spinlock_count++;
1948 critical_enter();
1949}
1950
1951void
1952spinlock_exit(void)
1953{
1954 struct thread *td;
1955 register_t flags;
1956
1957 td = curthread;
1958 critical_exit();
1959 flags = td->td_md.md_saved_flags;
1960 td->td_md.md_spinlock_count--;
1961 if (td->td_md.md_spinlock_count == 0)
1962 intr_restore(flags);
1963}
1964
1965/*
1966 * Construct a PCB from a trapframe. This is called from kdb_trap() where
1967 * we want to start a backtrace from the function that caused us to enter
1968 * the debugger. We have the context in the trapframe, but base the trace
1969 * on the PCB. The PCB doesn't have to be perfect, as long as it contains
1970 * enough for a backtrace.
1971 */
1972void
1973makectx(struct trapframe *tf, struct pcb *pcb)
1974{
1975
1976 pcb->pcb_r12 = tf->tf_r12;
1977 pcb->pcb_r13 = tf->tf_r13;
1978 pcb->pcb_r14 = tf->tf_r14;
1979 pcb->pcb_r15 = tf->tf_r15;
1980 pcb->pcb_rbp = tf->tf_rbp;
1981 pcb->pcb_rbx = tf->tf_rbx;
1982 pcb->pcb_rip = tf->tf_rip;
1983 pcb->pcb_rsp = tf->tf_rsp;
1984}
1985
1986int
1987ptrace_set_pc(struct thread *td, unsigned long addr)
1988{
1989
1990 td->td_frame->tf_rip = addr;
1991 set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
1992 return (0);
1993}
1994
1995int
1996ptrace_single_step(struct thread *td)
1997{
1998
1999 PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
2000 if ((td->td_frame->tf_rflags & PSL_T) == 0) {
2001 td->td_frame->tf_rflags |= PSL_T;
2002 td->td_dbgflags |= TDB_STEP;
2003 }
2004 return (0);
2005}
2006
2007int
2008ptrace_clear_single_step(struct thread *td)
2009{
2010 PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
2011 td->td_frame->tf_rflags &= ~PSL_T;
2012 td->td_dbgflags &= ~TDB_STEP;
2013 return (0);
2014}
2015
2016int
2017fill_regs(struct thread *td, struct reg *regs)
2018{
2019 struct trapframe *tp;
2020
2021 tp = td->td_frame;
2022 return (fill_frame_regs(tp, regs));
2023}
2024
2025int
2026fill_frame_regs(struct trapframe *tp, struct reg *regs)
2027{
2028
2029 regs->r_r15 = tp->tf_r15;
2030 regs->r_r14 = tp->tf_r14;
2031 regs->r_r13 = tp->tf_r13;
2032 regs->r_r12 = tp->tf_r12;
2033 regs->r_r11 = tp->tf_r11;
2034 regs->r_r10 = tp->tf_r10;
2035 regs->r_r9 = tp->tf_r9;
2036 regs->r_r8 = tp->tf_r8;
2037 regs->r_rdi = tp->tf_rdi;
2038 regs->r_rsi = tp->tf_rsi;
2039 regs->r_rbp = tp->tf_rbp;
2040 regs->r_rbx = tp->tf_rbx;
2041 regs->r_rdx = tp->tf_rdx;
2042 regs->r_rcx = tp->tf_rcx;
2043 regs->r_rax = tp->tf_rax;
2044 regs->r_rip = tp->tf_rip;
2045 regs->r_cs = tp->tf_cs;
2046 regs->r_rflags = tp->tf_rflags;
2047 regs->r_rsp = tp->tf_rsp;
2048 regs->r_ss = tp->tf_ss;
2049 if (tp->tf_flags & TF_HASSEGS) {
2050 regs->r_ds = tp->tf_ds;
2051 regs->r_es = tp->tf_es;
2052 regs->r_fs = tp->tf_fs;
2053 regs->r_gs = tp->tf_gs;
2054 } else {
2055 regs->r_ds = 0;
2056 regs->r_es = 0;
2057 regs->r_fs = 0;
2058 regs->r_gs = 0;
2059 }
2060 regs->r_err = 0;
2061 regs->r_trapno = 0;
2062 return (0);
2063}
2064
2065int
2066set_regs(struct thread *td, struct reg *regs)
2067{
2068 struct trapframe *tp;
2069 register_t rflags;
2070
2071 tp = td->td_frame;
2072 rflags = regs->r_rflags & 0xffffffff;
2073 if (!EFL_SECURE(rflags, tp->tf_rflags) || !CS_SECURE(regs->r_cs))
2074 return (EINVAL);
2075 tp->tf_r15 = regs->r_r15;
2076 tp->tf_r14 = regs->r_r14;
2077 tp->tf_r13 = regs->r_r13;
2078 tp->tf_r12 = regs->r_r12;
2079 tp->tf_r11 = regs->r_r11;
2080 tp->tf_r10 = regs->r_r10;
2081 tp->tf_r9 = regs->r_r9;
2082 tp->tf_r8 = regs->r_r8;
2083 tp->tf_rdi = regs->r_rdi;
2084 tp->tf_rsi = regs->r_rsi;
2085 tp->tf_rbp = regs->r_rbp;
2086 tp->tf_rbx = regs->r_rbx;
2087 tp->tf_rdx = regs->r_rdx;
2088 tp->tf_rcx = regs->r_rcx;
2089 tp->tf_rax = regs->r_rax;
2090 tp->tf_rip = regs->r_rip;
2091 tp->tf_cs = regs->r_cs;
2092 tp->tf_rflags = rflags;
2093 tp->tf_rsp = regs->r_rsp;
2094 tp->tf_ss = regs->r_ss;
2095 if (0) { /* XXXKIB */
2096 tp->tf_ds = regs->r_ds;
2097 tp->tf_es = regs->r_es;
2098 tp->tf_fs = regs->r_fs;
2099 tp->tf_gs = regs->r_gs;
2100 tp->tf_flags = TF_HASSEGS;
2101 }
2102 set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
2103 return (0);
2104}
2105
2106/* XXX check all this stuff! */
2107/* externalize from sv_xmm */
2108static void
2109fill_fpregs_xmm(struct savefpu *sv_xmm, struct fpreg *fpregs)
2110{
2111 struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
2112 struct envxmm *penv_xmm = &sv_xmm->sv_env;
2113 int i;
2114
2115 /* pcb -> fpregs */
2116 bzero(fpregs, sizeof(*fpregs));
2117
2118 /* FPU control/status */
2119 penv_fpreg->en_cw = penv_xmm->en_cw;
2120 penv_fpreg->en_sw = penv_xmm->en_sw;
2121 penv_fpreg->en_tw = penv_xmm->en_tw;
2122 penv_fpreg->en_opcode = penv_xmm->en_opcode;
2123 penv_fpreg->en_rip = penv_xmm->en_rip;
2124 penv_fpreg->en_rdp = penv_xmm->en_rdp;
2125 penv_fpreg->en_mxcsr = penv_xmm->en_mxcsr;
2126 penv_fpreg->en_mxcsr_mask = penv_xmm->en_mxcsr_mask;
2127
2128 /* FPU registers */
2129 for (i = 0; i < 8; ++i)
2130 bcopy(sv_xmm->sv_fp[i].fp_acc.fp_bytes, fpregs->fpr_acc[i], 10);
2131
2132 /* SSE registers */
2133 for (i = 0; i < 16; ++i)
2134 bcopy(sv_xmm->sv_xmm[i].xmm_bytes, fpregs->fpr_xacc[i], 16);
2135}
2136
2137/* internalize from fpregs into sv_xmm */
2138static void
2139set_fpregs_xmm(struct fpreg *fpregs, struct savefpu *sv_xmm)
2140{
2141 struct envxmm *penv_xmm = &sv_xmm->sv_env;
2142 struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
2143 int i;
2144
2145 /* fpregs -> pcb */
2146 /* FPU control/status */
2147 penv_xmm->en_cw = penv_fpreg->en_cw;
2148 penv_xmm->en_sw = penv_fpreg->en_sw;
2149 penv_xmm->en_tw = penv_fpreg->en_tw;
2150 penv_xmm->en_opcode = penv_fpreg->en_opcode;
2151 penv_xmm->en_rip = penv_fpreg->en_rip;
2152 penv_xmm->en_rdp = penv_fpreg->en_rdp;
2153 penv_xmm->en_mxcsr = penv_fpreg->en_mxcsr;
2154 penv_xmm->en_mxcsr_mask = penv_fpreg->en_mxcsr_mask & cpu_mxcsr_mask;
2155
2156 /* FPU registers */
2157 for (i = 0; i < 8; ++i)
2158 bcopy(fpregs->fpr_acc[i], sv_xmm->sv_fp[i].fp_acc.fp_bytes, 10);
2159
2160 /* SSE registers */
2161 for (i = 0; i < 16; ++i)
2162 bcopy(fpregs->fpr_xacc[i], sv_xmm->sv_xmm[i].xmm_bytes, 16);
2163}
2164
2165/* externalize from td->pcb */
2166int
2167fill_fpregs(struct thread *td, struct fpreg *fpregs)
2168{
2169
2170 KASSERT(td == curthread || TD_IS_SUSPENDED(td) ||
2171 P_SHOULDSTOP(td->td_proc),
2172 ("not suspended thread %p", td));
2173 fpugetregs(td);
2174 fill_fpregs_xmm(get_pcb_user_save_td(td), fpregs);
2175 return (0);
2176}
2177
2178/* internalize to td->pcb */
2179int
2180set_fpregs(struct thread *td, struct fpreg *fpregs)
2181{
2182
2183 critical_enter();
2184 set_fpregs_xmm(fpregs, get_pcb_user_save_td(td));
2185 fpuuserinited(td);
2186 critical_exit();
2187 return (0);
2188}
2189
2190/*
2191 * Get machine context.
2192 */
2193int
2194get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
2195{
2196 struct pcb *pcb;
2197 struct trapframe *tp;
2198
2199 pcb = td->td_pcb;
2200 tp = td->td_frame;
2201 PROC_LOCK(curthread->td_proc);
2202 mcp->mc_onstack = sigonstack(tp->tf_rsp);
2203 PROC_UNLOCK(curthread->td_proc);
2204 mcp->mc_r15 = tp->tf_r15;
2205 mcp->mc_r14 = tp->tf_r14;
2206 mcp->mc_r13 = tp->tf_r13;
2207 mcp->mc_r12 = tp->tf_r12;
2208 mcp->mc_r11 = tp->tf_r11;
2209 mcp->mc_r10 = tp->tf_r10;
2210 mcp->mc_r9 = tp->tf_r9;
2211 mcp->mc_r8 = tp->tf_r8;
2212 mcp->mc_rdi = tp->tf_rdi;
2213 mcp->mc_rsi = tp->tf_rsi;
2214 mcp->mc_rbp = tp->tf_rbp;
2215 mcp->mc_rbx = tp->tf_rbx;
2216 mcp->mc_rcx = tp->tf_rcx;
2217 mcp->mc_rflags = tp->tf_rflags;
2218 if (flags & GET_MC_CLEAR_RET) {
2219 mcp->mc_rax = 0;
2220 mcp->mc_rdx = 0;
2221 mcp->mc_rflags &= ~PSL_C;
2222 } else {
2223 mcp->mc_rax = tp->tf_rax;
2224 mcp->mc_rdx = tp->tf_rdx;
2225 }
2226 mcp->mc_rip = tp->tf_rip;
2227 mcp->mc_cs = tp->tf_cs;
2228 mcp->mc_rsp = tp->tf_rsp;
2229 mcp->mc_ss = tp->tf_ss;
2230 mcp->mc_ds = tp->tf_ds;
2231 mcp->mc_es = tp->tf_es;
2232 mcp->mc_fs = tp->tf_fs;
2233 mcp->mc_gs = tp->tf_gs;
2234 mcp->mc_flags = tp->tf_flags;
2235 mcp->mc_len = sizeof(*mcp);
2236 get_fpcontext(td, mcp, NULL, 0);
2237 update_pcb_bases(pcb);
2238 mcp->mc_fsbase = pcb->pcb_fsbase;
2239 mcp->mc_gsbase = pcb->pcb_gsbase;
2240 mcp->mc_xfpustate = 0;
2241 mcp->mc_xfpustate_len = 0;
2242 bzero(mcp->mc_spare, sizeof(mcp->mc_spare));
2243 return (0);
2244}
2245
2246/*
2247 * Set machine context.
2248 *
2249 * However, we don't set any but the user modifiable flags, and we won't
2250 * touch the cs selector.
2251 */
2252int
2253set_mcontext(struct thread *td, mcontext_t *mcp)
2254{
2255 struct pcb *pcb;
2256 struct trapframe *tp;
2257 char *xfpustate;
2258 long rflags;
2259 int ret;
2260
2261 pcb = td->td_pcb;
2262 tp = td->td_frame;
2263 if (mcp->mc_len != sizeof(*mcp) ||
2264 (mcp->mc_flags & ~_MC_FLAG_MASK) != 0)
2265 return (EINVAL);
2266 rflags = (mcp->mc_rflags & PSL_USERCHANGE) |
2267 (tp->tf_rflags & ~PSL_USERCHANGE);
2268 if (mcp->mc_flags & _MC_HASFPXSTATE) {
2269 if (mcp->mc_xfpustate_len > cpu_max_ext_state_size -
2270 sizeof(struct savefpu))
2271 return (EINVAL);
2272 xfpustate = __builtin_alloca(mcp->mc_xfpustate_len);
2273 ret = copyin((void *)mcp->mc_xfpustate, xfpustate,
2274 mcp->mc_xfpustate_len);
2275 if (ret != 0)
2276 return (ret);
2277 } else
2278 xfpustate = NULL;
2279 ret = set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len);
2280 if (ret != 0)
2281 return (ret);
2282 tp->tf_r15 = mcp->mc_r15;
2283 tp->tf_r14 = mcp->mc_r14;
2284 tp->tf_r13 = mcp->mc_r13;
2285 tp->tf_r12 = mcp->mc_r12;
2286 tp->tf_r11 = mcp->mc_r11;
2287 tp->tf_r10 = mcp->mc_r10;
2288 tp->tf_r9 = mcp->mc_r9;
2289 tp->tf_r8 = mcp->mc_r8;
2290 tp->tf_rdi = mcp->mc_rdi;
2291 tp->tf_rsi = mcp->mc_rsi;
2292 tp->tf_rbp = mcp->mc_rbp;
2293 tp->tf_rbx = mcp->mc_rbx;
2294 tp->tf_rdx = mcp->mc_rdx;
2295 tp->tf_rcx = mcp->mc_rcx;
2296 tp->tf_rax = mcp->mc_rax;
2297 tp->tf_rip = mcp->mc_rip;
2298 tp->tf_rflags = rflags;
2299 tp->tf_rsp = mcp->mc_rsp;
2300 tp->tf_ss = mcp->mc_ss;
2301 tp->tf_flags = mcp->mc_flags;
2302 if (tp->tf_flags & TF_HASSEGS) {
2303 tp->tf_ds = mcp->mc_ds;
2304 tp->tf_es = mcp->mc_es;
2305 tp->tf_fs = mcp->mc_fs;
2306 tp->tf_gs = mcp->mc_gs;
2307 }
2308 set_pcb_flags(pcb, PCB_FULL_IRET);
2309 if (mcp->mc_flags & _MC_HASBASES) {
2310 pcb->pcb_fsbase = mcp->mc_fsbase;
2311 pcb->pcb_gsbase = mcp->mc_gsbase;
2312 }
2313 return (0);
2314}
2315
2316static void
2317get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave,
2318 size_t xfpusave_len)
2319{
2320 size_t max_len, len;
2321
2322 mcp->mc_ownedfp = fpugetregs(td);
2323 bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate[0],
2324 sizeof(mcp->mc_fpstate));
2325 mcp->mc_fpformat = fpuformat();
2326 if (!use_xsave || xfpusave_len == 0)
2327 return;
2328 max_len = cpu_max_ext_state_size - sizeof(struct savefpu);
2329 len = xfpusave_len;
2330 if (len > max_len) {
2331 len = max_len;
2332 bzero(xfpusave + max_len, len - max_len);
2333 }
2334 mcp->mc_flags |= _MC_HASFPXSTATE;
2335 mcp->mc_xfpustate_len = len;
2336 bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len);
2337}
2338
2339static int
2340set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate,
2341 size_t xfpustate_len)
2342{
2343 int error;
2344
2345 if (mcp->mc_fpformat == _MC_FPFMT_NODEV)
2346 return (0);
2347 else if (mcp->mc_fpformat != _MC_FPFMT_XMM)
2348 return (EINVAL);
2349 else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) {
2350 /* We don't care what state is left in the FPU or PCB. */
2351 fpstate_drop(td);
2352 error = 0;
2353 } else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU ||
2354 mcp->mc_ownedfp == _MC_FPOWNED_PCB) {
2355 error = fpusetregs(td, (struct savefpu *)&mcp->mc_fpstate,
2356 xfpustate, xfpustate_len);
2357 } else
2358 return (EINVAL);
2359 return (error);
2360}
2361
2362void
2363fpstate_drop(struct thread *td)
2364{
2365
2366 KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu"));
2367 critical_enter();
2368 if (PCPU_GET(fpcurthread) == td)
2369 fpudrop();
2370 /*
2371 * XXX force a full drop of the fpu. The above only drops it if we
2372 * owned it.
2373 *
2374 * XXX I don't much like fpugetuserregs()'s semantics of doing a full
2375 * drop. Dropping only to the pcb matches fnsave's behaviour.
2376 * We only need to drop to !PCB_INITDONE in sendsig(). But
2377 * sendsig() is the only caller of fpugetuserregs()... perhaps we just
2378 * have too many layers.
2379 */
2380 clear_pcb_flags(curthread->td_pcb,
2381 PCB_FPUINITDONE | PCB_USERFPUINITDONE);
2382 critical_exit();
2383}
2384
2385int
2386fill_dbregs(struct thread *td, struct dbreg *dbregs)
2387{
2388 struct pcb *pcb;
2389
2390 if (td == NULL) {
2391 dbregs->dr[0] = rdr0();
2392 dbregs->dr[1] = rdr1();
2393 dbregs->dr[2] = rdr2();
2394 dbregs->dr[3] = rdr3();
2395 dbregs->dr[6] = rdr6();
2396 dbregs->dr[7] = rdr7();
2397 } else {
2398 pcb = td->td_pcb;
2399 dbregs->dr[0] = pcb->pcb_dr0;
2400 dbregs->dr[1] = pcb->pcb_dr1;
2401 dbregs->dr[2] = pcb->pcb_dr2;
2402 dbregs->dr[3] = pcb->pcb_dr3;
2403 dbregs->dr[6] = pcb->pcb_dr6;
2404 dbregs->dr[7] = pcb->pcb_dr7;
2405 }
2406 dbregs->dr[4] = 0;
2407 dbregs->dr[5] = 0;
2408 dbregs->dr[8] = 0;
2409 dbregs->dr[9] = 0;
2410 dbregs->dr[10] = 0;
2411 dbregs->dr[11] = 0;
2412 dbregs->dr[12] = 0;
2413 dbregs->dr[13] = 0;
2414 dbregs->dr[14] = 0;
2415 dbregs->dr[15] = 0;
2416 return (0);
2417}
2418
2419int
2420set_dbregs(struct thread *td, struct dbreg *dbregs)
2421{
2422 struct pcb *pcb;
2423 int i;
2424
2425 if (td == NULL) {
2426 load_dr0(dbregs->dr[0]);
2427 load_dr1(dbregs->dr[1]);
2428 load_dr2(dbregs->dr[2]);
2429 load_dr3(dbregs->dr[3]);
2430 load_dr6(dbregs->dr[6]);
2431 load_dr7(dbregs->dr[7]);
2432 } else {
2433 /*
2434 * Don't let an illegal value for dr7 get set. Specifically,
2435 * check for undefined settings. Setting these bit patterns
2436 * result in undefined behaviour and can lead to an unexpected
2437 * TRCTRAP or a general protection fault right here.
2438 * Upper bits of dr6 and dr7 must not be set
2439 */
2440 for (i = 0; i < 4; i++) {
2441 if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02)
2442 return (EINVAL);
2443 if (td->td_frame->tf_cs == _ucode32sel &&
2444 DBREG_DR7_LEN(dbregs->dr[7], i) == DBREG_DR7_LEN_8)
2445 return (EINVAL);
2446 }
2447 if ((dbregs->dr[6] & 0xffffffff00000000ul) != 0 ||
2448 (dbregs->dr[7] & 0xffffffff00000000ul) != 0)
2449 return (EINVAL);
2450
2451 pcb = td->td_pcb;
2452
2453 /*
2454 * Don't let a process set a breakpoint that is not within the
2455 * process's address space. If a process could do this, it
2456 * could halt the system by setting a breakpoint in the kernel
2457 * (if ddb was enabled). Thus, we need to check to make sure
2458 * that no breakpoints are being enabled for addresses outside
2459 * process's address space.
2460 *
2461 * XXX - what about when the watched area of the user's
2462 * address space is written into from within the kernel
2463 * ... wouldn't that still cause a breakpoint to be generated
2464 * from within kernel mode?
2465 */
2466
2467 if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) {
2468 /* dr0 is enabled */
2469 if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS)
2470 return (EINVAL);
2471 }
2472 if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) {
2473 /* dr1 is enabled */
2474 if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS)
2475 return (EINVAL);
2476 }
2477 if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) {
2478 /* dr2 is enabled */
2479 if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS)
2480 return (EINVAL);
2481 }
2482 if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) {
2483 /* dr3 is enabled */
2484 if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS)
2485 return (EINVAL);
2486 }
2487
2488 pcb->pcb_dr0 = dbregs->dr[0];
2489 pcb->pcb_dr1 = dbregs->dr[1];
2490 pcb->pcb_dr2 = dbregs->dr[2];
2491 pcb->pcb_dr3 = dbregs->dr[3];
2492 pcb->pcb_dr6 = dbregs->dr[6];
2493 pcb->pcb_dr7 = dbregs->dr[7];
2494
2495 set_pcb_flags(pcb, PCB_DBREGS);
2496 }
2497
2498 return (0);
2499}
2500
2501void
2502reset_dbregs(void)
2503{
2504
2505 load_dr7(0); /* Turn off the control bits first */
2506 load_dr0(0);
2507 load_dr1(0);
2508 load_dr2(0);
2509 load_dr3(0);
2510 load_dr6(0);
2511}
2512
2513/*
2514 * Return > 0 if a hardware breakpoint has been hit, and the
2515 * breakpoint was in user space. Return 0, otherwise.
2516 */
2517int
2518user_dbreg_trap(register_t dr6)
2519{
2520 u_int64_t dr7;
2521 u_int64_t bp; /* breakpoint bits extracted from dr6 */
2522 int nbp; /* number of breakpoints that triggered */
2523 caddr_t addr[4]; /* breakpoint addresses */
2524 int i;
2525
2526 bp = dr6 & DBREG_DR6_BMASK;
2527 if (bp == 0) {
2528 /*
2529 * None of the breakpoint bits are set meaning this
2530 * trap was not caused by any of the debug registers
2531 */
2532 return 0;
2533 }
2534
2535 dr7 = rdr7();
2536 if ((dr7 & 0x000000ff) == 0) {
2537 /*
2538 * all GE and LE bits in the dr7 register are zero,
2539 * thus the trap couldn't have been caused by the
2540 * hardware debug registers
2541 */
2542 return 0;
2543 }
2544
2545 nbp = 0;
2546
2547 /*
2548 * at least one of the breakpoints were hit, check to see
2549 * which ones and if any of them are user space addresses
2550 */
2551
2552 if (bp & 0x01) {
2553 addr[nbp++] = (caddr_t)rdr0();
2554 }
2555 if (bp & 0x02) {
2556 addr[nbp++] = (caddr_t)rdr1();
2557 }
2558 if (bp & 0x04) {
2559 addr[nbp++] = (caddr_t)rdr2();
2560 }
2561 if (bp & 0x08) {
2562 addr[nbp++] = (caddr_t)rdr3();
2563 }
2564
2565 for (i = 0; i < nbp; i++) {
2566 if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) {
2567 /*
2568 * addr[i] is in user space
2569 */
2570 return nbp;
2571 }
2572 }
2573
2574 /*
2575 * None of the breakpoints are in user space.
2576 */
2577 return 0;
2578}
2579
2580/*
2581 * The pcb_flags is only modified by current thread, or by other threads
2582 * when current thread is stopped. However, current thread may change it
2583 * from the interrupt context in cpu_switch(), or in the trap handler.
2584 * When we read-modify-write pcb_flags from C sources, compiler may generate
2585 * code that is not atomic regarding the interrupt handler. If a trap or
2586 * interrupt happens and any flag is modified from the handler, it can be
2587 * clobbered with the cached value later. Therefore, we implement setting
2588 * and clearing flags with single-instruction functions, which do not race
2589 * with possible modification of the flags from the trap or interrupt context,
2590 * because traps and interrupts are executed only on instruction boundary.
2591 */
2592void
2593set_pcb_flags_raw(struct pcb *pcb, const u_int flags)
2594{
2595
2596 __asm __volatile("orl %1,%0"
2597 : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags)
2598 : "cc", "memory");
2599
2600}
2601
2602/*
2603 * The support for RDFSBASE, WRFSBASE and similar instructions for %gs
2604 * base requires that kernel saves MSR_FSBASE and MSR_{K,}GSBASE into
2605 * pcb if user space modified the bases. We must save on the context
2606 * switch or if the return to usermode happens through the doreti.
2607 *
2608 * Tracking of both events is performed by the pcb flag PCB_FULL_IRET,
2609 * which have a consequence that the base MSRs must be saved each time
2610 * the PCB_FULL_IRET flag is set. We disable interrupts to sync with
2611 * context switches.
2612 */
2613void
2614set_pcb_flags(struct pcb *pcb, const u_int flags)
2615{
2616 register_t r;
2617
2618 if (curpcb == pcb &&
2619 (flags & PCB_FULL_IRET) != 0 &&
2620 (pcb->pcb_flags & PCB_FULL_IRET) == 0 &&
2621 (cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0) {
2622 r = intr_disable();
2623 if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) {
2624 if (rfs() == _ufssel)
2625 pcb->pcb_fsbase = rdfsbase();
2626 if (rgs() == _ugssel)
2627 pcb->pcb_gsbase = rdmsr(MSR_KGSBASE);
2628 }
2629 set_pcb_flags_raw(pcb, flags);
2630 intr_restore(r);
2631 } else {
2632 set_pcb_flags_raw(pcb, flags);
2633 }
2634}
2635
2636void
2637clear_pcb_flags(struct pcb *pcb, const u_int flags)
2638{
2639
2640 __asm __volatile("andl %1,%0"
2641 : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags)
2642 : "cc", "memory");
2643}
2644
2645#ifdef KDB
2646
2647/*
2648 * Provide inb() and outb() as functions. They are normally only available as
2649 * inline functions, thus cannot be called from the debugger.
2650 */
2651
2652/* silence compiler warnings */
2653u_char inb_(u_short);
2654void outb_(u_short, u_char);
2655
2656u_char
2657inb_(u_short port)
2658{
2659 return inb(port);
2660}
2661
2662void
2663outb_(u_short port, u_char data)
2664{
2665 outb(port, data);
2666}
2667
2668#endif /* KDB */
1868
1869 /* Location of kernel stack for locore */
1870 return ((u_int64_t)thread0.td_pcb);
1871}
1872
1873void
1874cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
1875{
1876
1877 pcpu->pc_acpi_id = 0xffffffff;
1878}
1879
1880static int
1881smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
1882{
1883 struct bios_smap *smapbase;
1884 struct bios_smap_xattr smap;
1885 caddr_t kmdp;
1886 uint32_t *smapattr;
1887 int count, error, i;
1888
1889 /* Retrieve the system memory map from the loader. */
1890 kmdp = preload_search_by_type("elf kernel");
1891 if (kmdp == NULL)
1892 kmdp = preload_search_by_type("elf64 kernel");
1893 smapbase = (struct bios_smap *)preload_search_info(kmdp,
1894 MODINFO_METADATA | MODINFOMD_SMAP);
1895 if (smapbase == NULL)
1896 return (0);
1897 smapattr = (uint32_t *)preload_search_info(kmdp,
1898 MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
1899 count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase);
1900 error = 0;
1901 for (i = 0; i < count; i++) {
1902 smap.base = smapbase[i].base;
1903 smap.length = smapbase[i].length;
1904 smap.type = smapbase[i].type;
1905 if (smapattr != NULL)
1906 smap.xattr = smapattr[i];
1907 else
1908 smap.xattr = 0;
1909 error = SYSCTL_OUT(req, &smap, sizeof(smap));
1910 }
1911 return (error);
1912}
1913SYSCTL_PROC(_machdep, OID_AUTO, smap, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
1914 smap_sysctl_handler, "S,bios_smap_xattr", "Raw BIOS SMAP data");
1915
1916static int
1917efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
1918{
1919 struct efi_map_header *efihdr;
1920 caddr_t kmdp;
1921 uint32_t efisize;
1922
1923 kmdp = preload_search_by_type("elf kernel");
1924 if (kmdp == NULL)
1925 kmdp = preload_search_by_type("elf64 kernel");
1926 efihdr = (struct efi_map_header *)preload_search_info(kmdp,
1927 MODINFO_METADATA | MODINFOMD_EFI_MAP);
1928 if (efihdr == NULL)
1929 return (0);
1930 efisize = *((uint32_t *)efihdr - 1);
1931 return (SYSCTL_OUT(req, efihdr, efisize));
1932}
1933SYSCTL_PROC(_machdep, OID_AUTO, efi_map, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
1934 efi_map_sysctl_handler, "S,efi_map_header", "Raw EFI Memory Map");
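/*
 * Usage sketch (illustrative, not part of the original file): both OIDs
 * above are opaque and are read from userland with sysctlbyname(3),
 * probing for the required size first:
 *
 *	size_t len = 0;
 *	sysctlbyname("machdep.smap", NULL, &len, NULL, 0);
 *	struct bios_smap_xattr *buf = malloc(len);
 *	sysctlbyname("machdep.smap", buf, &len, NULL, 0);
 *	// len / sizeof(*buf) entries describe the BIOS memory map
 */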
1935
1936void
1937spinlock_enter(void)
1938{
1939 struct thread *td;
1940 register_t flags;
1941
1942 td = curthread;
1943 if (td->td_md.md_spinlock_count == 0) {
1944 flags = intr_disable();
1945 td->td_md.md_spinlock_count = 1;
1946 td->td_md.md_saved_flags = flags;
1947 } else
1948 td->td_md.md_spinlock_count++;
1949 critical_enter();
1950}
1951
1952void
1953spinlock_exit(void)
1954{
1955 struct thread *td;
1956 register_t flags;
1957
1958 td = curthread;
1959 critical_exit();
1960 flags = td->td_md.md_saved_flags;
1961 td->td_md.md_spinlock_count--;
1962 if (td->td_md.md_spinlock_count == 0)
1963 intr_restore(flags);
1964}
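/*
 * Illustrative note (added): spinlock sections nest.  Only the
 * outermost spinlock_enter() disables interrupts and records the
 * previous flags, and only the matching outermost spinlock_exit()
 * restores them:
 *
 *	spinlock_enter();	// count 0 -> 1, interrupts off, flags saved
 *	spinlock_enter();	// count 1 -> 2, flags untouched
 *	spinlock_exit();	// count 2 -> 1, interrupts stay off
 *	spinlock_exit();	// count 1 -> 0, saved flags restored
 */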
1965
1966/*
1967 * Construct a PCB from a trapframe. This is called from kdb_trap() where
1968 * we want to start a backtrace from the function that caused us to enter
1969 * the debugger. We have the context in the trapframe, but base the trace
1970 * on the PCB. The PCB doesn't have to be perfect, as long as it contains
1971 * enough for a backtrace.
1972 */
1973void
1974makectx(struct trapframe *tf, struct pcb *pcb)
1975{
1976
1977 pcb->pcb_r12 = tf->tf_r12;
1978 pcb->pcb_r13 = tf->tf_r13;
1979 pcb->pcb_r14 = tf->tf_r14;
1980 pcb->pcb_r15 = tf->tf_r15;
1981 pcb->pcb_rbp = tf->tf_rbp;
1982 pcb->pcb_rbx = tf->tf_rbx;
1983 pcb->pcb_rip = tf->tf_rip;
1984 pcb->pcb_rsp = tf->tf_rsp;
1985}
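/*
 * Usage sketch (hypothetical): the debugger entry path builds a scratch
 * PCB from the trapframe so the unwinder starts at the frame that
 * trapped rather than inside the debugger itself:
 *
 *	struct pcb pcb;
 *	makectx(tf, &pcb);
 *	// unwind from pcb.pcb_rip / pcb.pcb_rbp / pcb.pcb_rsp
 */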
1986
1987int
1988ptrace_set_pc(struct thread *td, unsigned long addr)
1989{
1990
1991 td->td_frame->tf_rip = addr;
1992 set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
1993 return (0);
1994}
1995
1996int
1997ptrace_single_step(struct thread *td)
1998{
1999
2000 PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
2001 if ((td->td_frame->tf_rflags & PSL_T) == 0) {
2002 td->td_frame->tf_rflags |= PSL_T;
2003 td->td_dbgflags |= TDB_STEP;
2004 }
2005 return (0);
2006}
2007
2008int
2009ptrace_clear_single_step(struct thread *td)
2010{
2011 PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
2012 td->td_frame->tf_rflags &= ~PSL_T;
2013 td->td_dbgflags &= ~TDB_STEP;
2014 return (0);
2015}
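/*
 * Note (added): single-stepping relies on the x86 trap flag.  With
 * PSL_T set in the saved %rflags, the CPU raises a debug exception
 * after one user instruction has executed; TDB_STEP records that the
 * flag was set by the debugger so it can be cleared when the stop is
 * reported.  A tracer drives this roughly as:
 *
 *	ptrace(PT_STEP, pid, (caddr_t)1, 0);	// kernel sets PSL_T
 *	waitpid(pid, &status, 0);		// child stops with SIGTRAP
 */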
2016
2017int
2018fill_regs(struct thread *td, struct reg *regs)
2019{
2020 struct trapframe *tp;
2021
2022 tp = td->td_frame;
2023 return (fill_frame_regs(tp, regs));
2024}
2025
2026int
2027fill_frame_regs(struct trapframe *tp, struct reg *regs)
2028{
2029
2030 regs->r_r15 = tp->tf_r15;
2031 regs->r_r14 = tp->tf_r14;
2032 regs->r_r13 = tp->tf_r13;
2033 regs->r_r12 = tp->tf_r12;
2034 regs->r_r11 = tp->tf_r11;
2035 regs->r_r10 = tp->tf_r10;
2036 regs->r_r9 = tp->tf_r9;
2037 regs->r_r8 = tp->tf_r8;
2038 regs->r_rdi = tp->tf_rdi;
2039 regs->r_rsi = tp->tf_rsi;
2040 regs->r_rbp = tp->tf_rbp;
2041 regs->r_rbx = tp->tf_rbx;
2042 regs->r_rdx = tp->tf_rdx;
2043 regs->r_rcx = tp->tf_rcx;
2044 regs->r_rax = tp->tf_rax;
2045 regs->r_rip = tp->tf_rip;
2046 regs->r_cs = tp->tf_cs;
2047 regs->r_rflags = tp->tf_rflags;
2048 regs->r_rsp = tp->tf_rsp;
2049 regs->r_ss = tp->tf_ss;
2050 if (tp->tf_flags & TF_HASSEGS) {
2051 regs->r_ds = tp->tf_ds;
2052 regs->r_es = tp->tf_es;
2053 regs->r_fs = tp->tf_fs;
2054 regs->r_gs = tp->tf_gs;
2055 } else {
2056 regs->r_ds = 0;
2057 regs->r_es = 0;
2058 regs->r_fs = 0;
2059 regs->r_gs = 0;
2060 }
2061 regs->r_err = 0;
2062 regs->r_trapno = 0;
2063 return (0);
2064}
2065
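/*
 * Set the general-purpose register state, e.g. on behalf of
 * ptrace(PT_SETREGS).  EFL_SECURE() confines %rflags changes to the
 * PSL_USERCHANGE bits (rejecting, for instance, attempts to raise IOPL
 * or toggle interrupt enable), and CS_SECURE() requires a ring-3 code
 * selector.
 */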
2066int
2067set_regs(struct thread *td, struct reg *regs)
2068{
2069 struct trapframe *tp;
2070 register_t rflags;
2071
2072 tp = td->td_frame;
2073 rflags = regs->r_rflags & 0xffffffff;
2074 if (!EFL_SECURE(rflags, tp->tf_rflags) || !CS_SECURE(regs->r_cs))
2075 return (EINVAL);
2076 tp->tf_r15 = regs->r_r15;
2077 tp->tf_r14 = regs->r_r14;
2078 tp->tf_r13 = regs->r_r13;
2079 tp->tf_r12 = regs->r_r12;
2080 tp->tf_r11 = regs->r_r11;
2081 tp->tf_r10 = regs->r_r10;
2082 tp->tf_r9 = regs->r_r9;
2083 tp->tf_r8 = regs->r_r8;
2084 tp->tf_rdi = regs->r_rdi;
2085 tp->tf_rsi = regs->r_rsi;
2086 tp->tf_rbp = regs->r_rbp;
2087 tp->tf_rbx = regs->r_rbx;
2088 tp->tf_rdx = regs->r_rdx;
2089 tp->tf_rcx = regs->r_rcx;
2090 tp->tf_rax = regs->r_rax;
2091 tp->tf_rip = regs->r_rip;
2092 tp->tf_cs = regs->r_cs;
2093 tp->tf_rflags = rflags;
2094 tp->tf_rsp = regs->r_rsp;
2095 tp->tf_ss = regs->r_ss;
2096 if (0) { /* XXXKIB */
2097 tp->tf_ds = regs->r_ds;
2098 tp->tf_es = regs->r_es;
2099 tp->tf_fs = regs->r_fs;
2100 tp->tf_gs = regs->r_gs;
2101 tp->tf_flags = TF_HASSEGS;
2102 }
2103 set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
2104 return (0);
2105}
2106
2107/* XXX check all this stuff! */
2108/* externalize from sv_xmm */
2109static void
2110fill_fpregs_xmm(struct savefpu *sv_xmm, struct fpreg *fpregs)
2111{
2112 struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
2113 struct envxmm *penv_xmm = &sv_xmm->sv_env;
2114 int i;
2115
2116 /* pcb -> fpregs */
2117 bzero(fpregs, sizeof(*fpregs));
2118
2119 /* FPU control/status */
2120 penv_fpreg->en_cw = penv_xmm->en_cw;
2121 penv_fpreg->en_sw = penv_xmm->en_sw;
2122 penv_fpreg->en_tw = penv_xmm->en_tw;
2123 penv_fpreg->en_opcode = penv_xmm->en_opcode;
2124 penv_fpreg->en_rip = penv_xmm->en_rip;
2125 penv_fpreg->en_rdp = penv_xmm->en_rdp;
2126 penv_fpreg->en_mxcsr = penv_xmm->en_mxcsr;
2127 penv_fpreg->en_mxcsr_mask = penv_xmm->en_mxcsr_mask;
2128
2129 /* FPU registers */
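	/*
	 * Each x87 register occupies a 16-byte slot in the FXSAVE image,
	 * but only the low 10 bytes (one 80-bit extended-precision value)
	 * are architecturally defined, hence the 10-byte copies below.
	 */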
2130 for (i = 0; i < 8; ++i)
2131 bcopy(sv_xmm->sv_fp[i].fp_acc.fp_bytes, fpregs->fpr_acc[i], 10);
2132
2133 /* SSE registers */
2134 for (i = 0; i < 16; ++i)
2135 bcopy(sv_xmm->sv_xmm[i].xmm_bytes, fpregs->fpr_xacc[i], 16);
2136}
2137
2138/* internalize from fpregs into sv_xmm */
2139static void
2140set_fpregs_xmm(struct fpreg *fpregs, struct savefpu *sv_xmm)
2141{
2142 struct envxmm *penv_xmm = &sv_xmm->sv_env;
2143 struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
2144 int i;
2145
2146 /* fpregs -> pcb */
2147 /* FPU control/status */
2148 penv_xmm->en_cw = penv_fpreg->en_cw;
2149 penv_xmm->en_sw = penv_fpreg->en_sw;
2150 penv_xmm->en_tw = penv_fpreg->en_tw;
2151 penv_xmm->en_opcode = penv_fpreg->en_opcode;
2152 penv_xmm->en_rip = penv_fpreg->en_rip;
2153 penv_xmm->en_rdp = penv_fpreg->en_rdp;
2154 penv_xmm->en_mxcsr = penv_fpreg->en_mxcsr;
2155 penv_xmm->en_mxcsr_mask = penv_fpreg->en_mxcsr_mask & cpu_mxcsr_mask;
2156
2157 /* FPU registers */
2158 for (i = 0; i < 8; ++i)
2159 bcopy(fpregs->fpr_acc[i], sv_xmm->sv_fp[i].fp_acc.fp_bytes, 10);
2160
2161 /* SSE registers */
2162 for (i = 0; i < 16; ++i)
2163 bcopy(fpregs->fpr_xacc[i], sv_xmm->sv_xmm[i].xmm_bytes, 16);
2164}
2165
2166/* externalize from td->pcb */
2167int
2168fill_fpregs(struct thread *td, struct fpreg *fpregs)
2169{
2170
2171 KASSERT(td == curthread || TD_IS_SUSPENDED(td) ||
2172 P_SHOULDSTOP(td->td_proc),
2173 ("not suspended thread %p", td));
2174 fpugetregs(td);
2175 fill_fpregs_xmm(get_pcb_user_save_td(td), fpregs);
2176 return (0);
2177}
2178
2179/* internalize to td->pcb */
2180int
2181set_fpregs(struct thread *td, struct fpreg *fpregs)
2182{
2183
2184 critical_enter();
2185 set_fpregs_xmm(fpregs, get_pcb_user_save_td(td));
2186 fpuuserinited(td);
2187 critical_exit();
2188 return (0);
2189}
2190
2191/*
2192 * Get machine context.
2193 */
2194int
2195get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
2196{
2197 struct pcb *pcb;
2198 struct trapframe *tp;
2199
2200 pcb = td->td_pcb;
2201 tp = td->td_frame;
2202 PROC_LOCK(curthread->td_proc);
2203 mcp->mc_onstack = sigonstack(tp->tf_rsp);
2204 PROC_UNLOCK(curthread->td_proc);
2205 mcp->mc_r15 = tp->tf_r15;
2206 mcp->mc_r14 = tp->tf_r14;
2207 mcp->mc_r13 = tp->tf_r13;
2208 mcp->mc_r12 = tp->tf_r12;
2209 mcp->mc_r11 = tp->tf_r11;
2210 mcp->mc_r10 = tp->tf_r10;
2211 mcp->mc_r9 = tp->tf_r9;
2212 mcp->mc_r8 = tp->tf_r8;
2213 mcp->mc_rdi = tp->tf_rdi;
2214 mcp->mc_rsi = tp->tf_rsi;
2215 mcp->mc_rbp = tp->tf_rbp;
2216 mcp->mc_rbx = tp->tf_rbx;
2217 mcp->mc_rcx = tp->tf_rcx;
2218 mcp->mc_rflags = tp->tf_rflags;
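	/*
	 * GET_MC_CLEAR_RET is used by getcontext()-style callers so that
	 * resuming the saved context looks like a successful syscall
	 * return: %rax and %rdx read as zero and the carry flag, the
	 * syscall error indicator, is clear.
	 */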
2219 if (flags & GET_MC_CLEAR_RET) {
2220 mcp->mc_rax = 0;
2221 mcp->mc_rdx = 0;
2222 mcp->mc_rflags &= ~PSL_C;
2223 } else {
2224 mcp->mc_rax = tp->tf_rax;
2225 mcp->mc_rdx = tp->tf_rdx;
2226 }
2227 mcp->mc_rip = tp->tf_rip;
2228 mcp->mc_cs = tp->tf_cs;
2229 mcp->mc_rsp = tp->tf_rsp;
2230 mcp->mc_ss = tp->tf_ss;
2231 mcp->mc_ds = tp->tf_ds;
2232 mcp->mc_es = tp->tf_es;
2233 mcp->mc_fs = tp->tf_fs;
2234 mcp->mc_gs = tp->tf_gs;
2235 mcp->mc_flags = tp->tf_flags;
2236 mcp->mc_len = sizeof(*mcp);
2237 get_fpcontext(td, mcp, NULL, 0);
2238 update_pcb_bases(pcb);
2239 mcp->mc_fsbase = pcb->pcb_fsbase;
2240 mcp->mc_gsbase = pcb->pcb_gsbase;
2241 mcp->mc_xfpustate = 0;
2242 mcp->mc_xfpustate_len = 0;
2243 bzero(mcp->mc_spare, sizeof(mcp->mc_spare));
2244 return (0);
2245}
2246
2247/*
2248 * Set machine context.
2249 *
2250 * We set only the user-modifiable %rflags bits here, and we never
2251 * touch the %cs selector.
2252 */
2253int
2254set_mcontext(struct thread *td, mcontext_t *mcp)
2255{
2256 struct pcb *pcb;
2257 struct trapframe *tp;
2258 char *xfpustate;
2259 long rflags;
2260 int ret;
2261
2262 pcb = td->td_pcb;
2263 tp = td->td_frame;
2264 if (mcp->mc_len != sizeof(*mcp) ||
2265 (mcp->mc_flags & ~_MC_FLAG_MASK) != 0)
2266 return (EINVAL);
2267 rflags = (mcp->mc_rflags & PSL_USERCHANGE) |
2268 (tp->tf_rflags & ~PSL_USERCHANGE);
2269 if (mcp->mc_flags & _MC_HASFPXSTATE) {
2270 if (mcp->mc_xfpustate_len > cpu_max_ext_state_size -
2271 sizeof(struct savefpu))
2272 return (EINVAL);
2273 xfpustate = __builtin_alloca(mcp->mc_xfpustate_len);
2274 ret = copyin((void *)mcp->mc_xfpustate, xfpustate,
2275 mcp->mc_xfpustate_len);
2276 if (ret != 0)
2277 return (ret);
2278 } else
2279 xfpustate = NULL;
2280 ret = set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len);
2281 if (ret != 0)
2282 return (ret);
2283 tp->tf_r15 = mcp->mc_r15;
2284 tp->tf_r14 = mcp->mc_r14;
2285 tp->tf_r13 = mcp->mc_r13;
2286 tp->tf_r12 = mcp->mc_r12;
2287 tp->tf_r11 = mcp->mc_r11;
2288 tp->tf_r10 = mcp->mc_r10;
2289 tp->tf_r9 = mcp->mc_r9;
2290 tp->tf_r8 = mcp->mc_r8;
2291 tp->tf_rdi = mcp->mc_rdi;
2292 tp->tf_rsi = mcp->mc_rsi;
2293 tp->tf_rbp = mcp->mc_rbp;
2294 tp->tf_rbx = mcp->mc_rbx;
2295 tp->tf_rdx = mcp->mc_rdx;
2296 tp->tf_rcx = mcp->mc_rcx;
2297 tp->tf_rax = mcp->mc_rax;
2298 tp->tf_rip = mcp->mc_rip;
2299 tp->tf_rflags = rflags;
2300 tp->tf_rsp = mcp->mc_rsp;
2301 tp->tf_ss = mcp->mc_ss;
2302 tp->tf_flags = mcp->mc_flags;
2303 if (tp->tf_flags & TF_HASSEGS) {
2304 tp->tf_ds = mcp->mc_ds;
2305 tp->tf_es = mcp->mc_es;
2306 tp->tf_fs = mcp->mc_fs;
2307 tp->tf_gs = mcp->mc_gs;
2308 }
2309 set_pcb_flags(pcb, PCB_FULL_IRET);
2310 if (mcp->mc_flags & _MC_HASBASES) {
2311 pcb->pcb_fsbase = mcp->mc_fsbase;
2312 pcb->pcb_gsbase = mcp->mc_gsbase;
2313 }
2314 return (0);
2315}
2316
2317static void
2318get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave,
2319 size_t xfpusave_len)
2320{
2321 size_t max_len, len;
2322
2323 mcp->mc_ownedfp = fpugetregs(td);
2324 bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate[0],
2325 sizeof(mcp->mc_fpstate));
2326 mcp->mc_fpformat = fpuformat();
2327 if (!use_xsave || xfpusave_len == 0)
2328 return;
2329 max_len = cpu_max_ext_state_size - sizeof(struct savefpu);
2330 len = xfpusave_len;
2331 if (len > max_len) {
2332 		bzero(xfpusave + max_len, len - max_len);
2333 		len = max_len;
2334 }
2335 mcp->mc_flags |= _MC_HASFPXSTATE;
2336 mcp->mc_xfpustate_len = len;
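	/*
	 * The XSAVE extended region starts immediately after the legacy
	 * struct savefpu area, hence the "+ 1" pointer arithmetic below.
	 */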
2337 bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len);
2338}
2339
2340static int
2341set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate,
2342 size_t xfpustate_len)
2343{
2344 int error;
2345
2346 if (mcp->mc_fpformat == _MC_FPFMT_NODEV)
2347 return (0);
2348 else if (mcp->mc_fpformat != _MC_FPFMT_XMM)
2349 return (EINVAL);
2350 else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) {
2351 /* We don't care what state is left in the FPU or PCB. */
2352 fpstate_drop(td);
2353 error = 0;
2354 } else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU ||
2355 mcp->mc_ownedfp == _MC_FPOWNED_PCB) {
2356 error = fpusetregs(td, (struct savefpu *)&mcp->mc_fpstate,
2357 xfpustate, xfpustate_len);
2358 } else
2359 return (EINVAL);
2360 return (error);
2361}
2362
2363void
2364fpstate_drop(struct thread *td)
2365{
2366
2367 KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu"));
2368 critical_enter();
2369 if (PCPU_GET(fpcurthread) == td)
2370 fpudrop();
2371 /*
2372 * XXX force a full drop of the fpu. The above only drops it if we
2373 * owned it.
2374 *
2375 * XXX I don't much like fpugetuserregs()'s semantics of doing a full
2376 * drop. Dropping only to the pcb matches fnsave's behaviour.
2377 * We only need to drop to !PCB_INITDONE in sendsig(). But
2378 * sendsig() is the only caller of fpugetuserregs()... perhaps we just
2379 * have too many layers.
2380 */
2381 clear_pcb_flags(curthread->td_pcb,
2382 PCB_FPUINITDONE | PCB_USERFPUINITDONE);
2383 critical_exit();
2384}
2385
2386int
2387fill_dbregs(struct thread *td, struct dbreg *dbregs)
2388{
2389 struct pcb *pcb;
2390
2391 if (td == NULL) {
2392 dbregs->dr[0] = rdr0();
2393 dbregs->dr[1] = rdr1();
2394 dbregs->dr[2] = rdr2();
2395 dbregs->dr[3] = rdr3();
2396 dbregs->dr[6] = rdr6();
2397 dbregs->dr[7] = rdr7();
2398 } else {
2399 pcb = td->td_pcb;
2400 dbregs->dr[0] = pcb->pcb_dr0;
2401 dbregs->dr[1] = pcb->pcb_dr1;
2402 dbregs->dr[2] = pcb->pcb_dr2;
2403 dbregs->dr[3] = pcb->pcb_dr3;
2404 dbregs->dr[6] = pcb->pcb_dr6;
2405 dbregs->dr[7] = pcb->pcb_dr7;
2406 }
2407 dbregs->dr[4] = 0;
2408 dbregs->dr[5] = 0;
2409 dbregs->dr[8] = 0;
2410 dbregs->dr[9] = 0;
2411 dbregs->dr[10] = 0;
2412 dbregs->dr[11] = 0;
2413 dbregs->dr[12] = 0;
2414 dbregs->dr[13] = 0;
2415 dbregs->dr[14] = 0;
2416 dbregs->dr[15] = 0;
2417 return (0);
2418}
2419
2420int
2421set_dbregs(struct thread *td, struct dbreg *dbregs)
2422{
2423 struct pcb *pcb;
2424 int i;
2425
2426 if (td == NULL) {
2427 load_dr0(dbregs->dr[0]);
2428 load_dr1(dbregs->dr[1]);
2429 load_dr2(dbregs->dr[2]);
2430 load_dr3(dbregs->dr[3]);
2431 load_dr6(dbregs->dr[6]);
2432 load_dr7(dbregs->dr[7]);
2433 } else {
2434 /*
2435 		 * Don't let an illegal value for %dr7 be set.  Specifically,
2436 		 * check for undefined settings; writing these bit patterns
2437 		 * results in undefined behaviour and can lead to an
2438 		 * unexpected TRCTRAP or a general protection fault right
2439 		 * here.  The upper 32 bits of %dr6 and %dr7 must not be set.
2440 */
2441 for (i = 0; i < 4; i++) {
2442 if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02)
2443 return (EINVAL);
2444 if (td->td_frame->tf_cs == _ucode32sel &&
2445 DBREG_DR7_LEN(dbregs->dr[7], i) == DBREG_DR7_LEN_8)
2446 return (EINVAL);
2447 }
2448 if ((dbregs->dr[6] & 0xffffffff00000000ul) != 0 ||
2449 (dbregs->dr[7] & 0xffffffff00000000ul) != 0)
2450 return (EINVAL);
2451
2452 pcb = td->td_pcb;
2453
2454 /*
2455 * Don't let a process set a breakpoint that is not within the
2456 * process's address space. If a process could do this, it
2457 * could halt the system by setting a breakpoint in the kernel
2458 * (if ddb was enabled). Thus, we need to check to make sure
2459 * that no breakpoints are being enabled for addresses outside
2460 		 * the process's address space.
2461 *
2462 * XXX - what about when the watched area of the user's
2463 * address space is written into from within the kernel
2464 * ... wouldn't that still cause a breakpoint to be generated
2465 * from within kernel mode?
2466 */
2467
2468 if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) {
2469 /* dr0 is enabled */
2470 if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS)
2471 return (EINVAL);
2472 }
2473 if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) {
2474 /* dr1 is enabled */
2475 if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS)
2476 return (EINVAL);
2477 }
2478 if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) {
2479 /* dr2 is enabled */
2480 if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS)
2481 return (EINVAL);
2482 }
2483 if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) {
2484 /* dr3 is enabled */
2485 if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS)
2486 return (EINVAL);
2487 }
2488
2489 pcb->pcb_dr0 = dbregs->dr[0];
2490 pcb->pcb_dr1 = dbregs->dr[1];
2491 pcb->pcb_dr2 = dbregs->dr[2];
2492 pcb->pcb_dr3 = dbregs->dr[3];
2493 pcb->pcb_dr6 = dbregs->dr[6];
2494 pcb->pcb_dr7 = dbregs->dr[7];
2495
2496 set_pcb_flags(pcb, PCB_DBREGS);
2497 }
2498
2499 return (0);
2500}
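/*
 * Usage sketch (hypothetical, using the DBREG_DR7_* macros from
 * machine/debugreg.h): a debugger arms an 8-byte write watchpoint in
 * slot 0 roughly as follows:
 *
 *	struct dbreg d = { 0 };
 *	d.dr[0] = (uint64_t)watch_addr;		// linear address to watch
 *	d.dr[7] = DBREG_DR7_SET(0, DBREG_DR7_LEN_8, DBREG_DR7_WRONLY,
 *	    DBREG_DR7_LOCAL_ENABLE);
 *	ptrace(PT_SETDBREGS, pid, (caddr_t)&d, 0);
 */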
2501
2502void
2503reset_dbregs(void)
2504{
2505
2506 load_dr7(0); /* Turn off the control bits first */
2507 load_dr0(0);
2508 load_dr1(0);
2509 load_dr2(0);
2510 load_dr3(0);
2511 load_dr6(0);
2512}
2513
2514/*
2515 * Return > 0 if a hardware breakpoint has been hit, and the
2516 * breakpoint was in user space. Return 0, otherwise.
2517 */
2518int
2519user_dbreg_trap(register_t dr6)
2520{
2521 u_int64_t dr7;
2522 u_int64_t bp; /* breakpoint bits extracted from dr6 */
2523 int nbp; /* number of breakpoints that triggered */
2524 caddr_t addr[4]; /* breakpoint addresses */
2525 int i;
2526
2527 bp = dr6 & DBREG_DR6_BMASK;
2528 if (bp == 0) {
2529 /*
2530 		 * None of the breakpoint bits are set, meaning this
2531 		 * trap was not caused by any of the debug registers.
2532 		 */
2533 		return (0);
2534 }
2535
2536 dr7 = rdr7();
2537 if ((dr7 & 0x000000ff) == 0) {
2538 /*
2539 		 * None of the local (L0-L3) or global (G0-G3) breakpoint
2540 		 * enable bits in %dr7 are set, so the trap could not have
2541 		 * been caused by the hardware debug registers.
2542 		 */
2543 		return (0);
2544 }
2545
2546 nbp = 0;
2547
2548 /*
2549 	 * At least one of the breakpoints was hit; check which ones,
2550 	 * and whether any of the addresses are in user space.
2551 */
2552
2553 if (bp & 0x01) {
2554 addr[nbp++] = (caddr_t)rdr0();
2555 }
2556 if (bp & 0x02) {
2557 addr[nbp++] = (caddr_t)rdr1();
2558 }
2559 if (bp & 0x04) {
2560 addr[nbp++] = (caddr_t)rdr2();
2561 }
2562 if (bp & 0x08) {
2563 addr[nbp++] = (caddr_t)rdr3();
2564 }
2565
2566 for (i = 0; i < nbp; i++) {
2567 if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) {
2568 /*
2569 * addr[i] is in user space
2570 */
2571 			return (nbp);
2572 }
2573 }
2574
2575 /*
2576 * None of the breakpoints are in user space.
2577 */
2578 	return (0);
2579}
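/*
 * Usage note (added): the kernel-mode T_TRCTRAP path in trap() uses a
 * positive return here to recognize a #DB raised by a user-space
 * watchpoint while in the kernel (e.g. a syscall writing into a watched
 * user buffer), so that the trap can be dismissed instead of being
 * handed to the kernel debugger.
 */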
2580
2581/*
2582 * The pcb_flags field is modified only by the current thread, or by
2583 * other threads when the current thread is stopped.  However, the
2584 * current thread may change it from interrupt context in cpu_switch()
2585 * or in the trap handler.  When C code read-modify-writes pcb_flags,
2586 * the compiler may generate code that is not atomic with respect to
2587 * the interrupt handler: if a trap or interrupt modifies a flag from
2588 * the handler, the change can later be clobbered by the cached value.
2589 * We therefore set and clear the flags with single-instruction
2590 * functions, which cannot race with trap or interrupt context because
2591 * traps and interrupts are taken only on instruction boundaries.
2592 */
2593void
2594set_pcb_flags_raw(struct pcb *pcb, const u_int flags)
2595{
2596
2597 __asm __volatile("orl %1,%0"
2598 : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags)
2599 : "cc", "memory");
2600
2601}
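/*
 * Sketch of the hazard avoided above (illustrative): a plain C
 * read-modify-write such as
 *
 *	pcb->pcb_flags |= flags;
 *
 * may compile into separate load, or and store instructions.  An
 * interrupt taken between the load and the store could change
 * pcb_flags, and the stale store would then erase that change.  The
 * single "orl" above cannot be split this way, because interrupts are
 * taken only on instruction boundaries.
 */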
2602
2603/*
2604 * Support for RDFSBASE, WRFSBASE and the similar instructions for the
2605 * %fs and %gs bases requires that the kernel save MSR_FSBASE and
2606 * MSR_{K,}GSBASE into the pcb if user space modified the bases.  We
2607 * must save them on a context switch and when the return to usermode
2608 * happens through doreti.
2609 *
2610 * Both events are tracked by the pcb flag PCB_FULL_IRET, which has the
2611 * consequence that the base MSRs must be saved each time PCB_FULL_IRET
2612 * is set.  We disable interrupts to synchronize with context switches.
2613 */
2614void
2615set_pcb_flags(struct pcb *pcb, const u_int flags)
2616{
2617 register_t r;
2618
2619 if (curpcb == pcb &&
2620 (flags & PCB_FULL_IRET) != 0 &&
2621 (pcb->pcb_flags & PCB_FULL_IRET) == 0 &&
2622 (cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0) {
2623 r = intr_disable();
2624 if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) {
2625 if (rfs() == _ufssel)
2626 pcb->pcb_fsbase = rdfsbase();
2627 if (rgs() == _ugssel)
2628 pcb->pcb_gsbase = rdmsr(MSR_KGSBASE);
2629 }
2630 set_pcb_flags_raw(pcb, flags);
2631 intr_restore(r);
2632 } else {
2633 set_pcb_flags_raw(pcb, flags);
2634 }
2635}
2636
2637void
2638clear_pcb_flags(struct pcb *pcb, const u_int flags)
2639{
2640
2641 __asm __volatile("andl %1,%0"
2642 : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags)
2643 : "cc", "memory");
2644}
2645
2646#ifdef KDB
2647
2648/*
2649 * Provide inb() and outb() as functions. They are normally only available as
2650 * inline functions and thus cannot be called from the debugger.
2651 */
2652
2653/* silence compiler warnings */
2654u_char inb_(u_short);
2655void outb_(u_short, u_char);
2656
2657u_char
2658inb_(u_short port)
2659{
2660 return inb(port);
2661}
2662
2663void
2664outb_(u_short port, u_char data)
2665{
2666 outb(port, data);
2667}
2668
2669#endif /* KDB */