machdep.c: stable/11 r354764 → r362383
1/*-
2 * Copyright (c) 2003 Peter Wemm.
3 * Copyright (c) 1992 Terrence R. Lambert.
4 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to Berkeley by
8 * William Jolitz.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 * must display the following acknowledgement:
20 * This product includes software developed by the University of
21 * California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 * may be used to endorse or promote products derived from this software
24 * without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91
39 */
40
41#include <sys/cdefs.h>
42__FBSDID("$FreeBSD: stable/11/sys/amd64/amd64/machdep.c 362383 2020-06-19 13:48:23Z kib $");
43
44#include "opt_atpic.h"
45#include "opt_compat.h"
46#include "opt_cpu.h"
47#include "opt_ddb.h"
48#include "opt_inet.h"
49#include "opt_isa.h"
50#include "opt_kstack_pages.h"
51#include "opt_maxmem.h"
52#include "opt_mp_watchdog.h"
53#include "opt_perfmon.h"
54#include "opt_platform.h"
55#include "opt_sched.h"
56
57#include <sys/param.h>
58#include <sys/proc.h>
59#include <sys/systm.h>
60#include <sys/bio.h>
61#include <sys/buf.h>
62#include <sys/bus.h>
63#include <sys/callout.h>
64#include <sys/cons.h>
65#include <sys/cpu.h>
66#include <sys/efi.h>
67#include <sys/eventhandler.h>
68#include <sys/exec.h>
69#include <sys/imgact.h>
70#include <sys/kdb.h>
71#include <sys/kernel.h>
72#include <sys/ktr.h>
73#include <sys/linker.h>
74#include <sys/lock.h>
75#include <sys/malloc.h>
76#include <sys/memrange.h>
77#include <sys/msgbuf.h>
78#include <sys/mutex.h>
79#include <sys/pcpu.h>
80#include <sys/ptrace.h>
81#include <sys/reboot.h>
82#include <sys/rwlock.h>
83#include <sys/sched.h>
84#include <sys/signalvar.h>
85#ifdef SMP
86#include <sys/smp.h>
87#endif
88#include <sys/syscallsubr.h>
89#include <sys/sysctl.h>
90#include <sys/sysent.h>
91#include <sys/sysproto.h>
92#include <sys/ucontext.h>
93#include <sys/vmmeter.h>
94
95#include <vm/vm.h>
96#include <vm/vm_extern.h>
97#include <vm/vm_kern.h>
98#include <vm/vm_page.h>
99#include <vm/vm_map.h>
100#include <vm/vm_object.h>
101#include <vm/vm_pager.h>
102#include <vm/vm_param.h>
103#include <vm/vm_phys.h>
104
105#ifdef DDB
106#ifndef KDB
107#error KDB must be enabled in order for DDB to work!
108#endif
109#include <ddb/ddb.h>
110#include <ddb/db_sym.h>
111#endif
112
113#include <net/netisr.h>
114
115#include <machine/clock.h>
116#include <machine/cpu.h>
117#include <machine/cputypes.h>
118#include <machine/frame.h>
119#include <machine/intr_machdep.h>
120#include <x86/mca.h>
121#include <machine/md_var.h>
122#include <machine/metadata.h>
123#include <machine/mp_watchdog.h>
124#include <machine/pc/bios.h>
125#include <machine/pcb.h>
126#include <machine/proc.h>
127#include <machine/reg.h>
128#include <machine/sigframe.h>
129#include <machine/specialreg.h>
130#ifdef PERFMON
131#include <machine/perfmon.h>
132#endif
133#include <machine/tss.h>
134#include <x86/ucode.h>
135#ifdef SMP
136#include <machine/smp.h>
137#endif
138#ifdef FDT
139#include <x86/fdt.h>
140#endif
141
142#ifdef DEV_ATPIC
143#include <x86/isa/icu.h>
144#else
145#include <x86/apicvar.h>
146#endif
147
148#include <isa/isareg.h>
149#include <isa/rtc.h>
150#include <x86/init.h>
151
152/* Sanity check for __curthread() */
153CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
154
155/*
156 * The PTI trampoline stack needs enough space for a hardware trapframe and a
157 * couple of scratch registers, as well as the trapframe left behind after an
158 * iret fault.
159 */
160CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) -
161 offsetof(struct pti_frame, pti_rip));
162
163extern u_int64_t hammer_time(u_int64_t, u_int64_t);
164
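/*
 * CS_SECURE() accepts only user-privilege %cs selectors; EFL_SECURE()
 * accepts a new rflags value only if it differs from the old one in
 * the PSL_USERCHANGE bits alone.
 */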
165#define CS_SECURE(cs) (ISPL(cs) == SEL_UPL)
166#define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
167
168static void cpu_startup(void *);
169static void get_fpcontext(struct thread *td, mcontext_t *mcp,
170 char *xfpusave, size_t xfpusave_len);
171static int set_fpcontext(struct thread *td, mcontext_t *mcp,
172 char *xfpustate, size_t xfpustate_len);
173SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
174
175/* Preload data parse function */
176static caddr_t native_parse_preload_data(u_int64_t);
177
178/* Native function to fetch and parse the e820 map */
179static void native_parse_memmap(caddr_t, vm_paddr_t *, int *);
180
181/* Default init_ops implementation. */
182struct init_ops init_ops = {
183 .parse_preload_data = native_parse_preload_data,
184 .early_clock_source_init = i8254_init,
185 .early_delay = i8254_delay,
186 .parse_memmap = native_parse_memmap,
187#ifdef SMP
188 .mp_bootaddress = mp_bootaddress,
189 .start_all_aps = native_start_all_aps,
190#endif
191 .msi_init = msi_init,
192};
193
194struct msgbuf *msgbufp;
195
196/*
197 * Physical address of the EFI System Table. Stashed from the metadata hints
198 * passed into the kernel and used by the EFI code to call runtime services.
199 */
200vm_paddr_t efi_systbl_phys;
201
202/* Intel ICH registers */
203#define ICH_PMBASE 0x400
 204#define ICH_SMI_EN	(ICH_PMBASE + 0x30)
205
206int _udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel;
207
208int cold = 1;
209
210long Maxmem = 0;
211long realmem = 0;
212
213/*
214 * The number of PHYSMAP entries must be one less than the number of
215 * PHYSSEG entries because the PHYSMAP entry that spans the largest
216 * physical address that is accessible by ISA DMA is split into two
217 * PHYSSEG entries.
218 */
219#define PHYSMAP_SIZE (2 * (VM_PHYSSEG_MAX - 1))
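/*
 * physmap[] holds (base, end) pairs: entry 2i is the base of chunk i
 * and entry 2i + 1 is the address one byte past its end.
 */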
220
221vm_paddr_t phys_avail[PHYSMAP_SIZE + 2];
222vm_paddr_t dump_avail[PHYSMAP_SIZE + 2];
223
224/* must be 2 less so 0 0 can signal end of chunks */
225#define PHYS_AVAIL_ARRAY_END (nitems(phys_avail) - 2)
226#define DUMP_AVAIL_ARRAY_END (nitems(dump_avail) - 2)
227
228struct kva_md_info kmi;
229
230static struct trapframe proc0_tf;
231struct region_descriptor r_gdt, r_idt;
232
233struct pcpu __pcpu[MAXCPU];
234
235struct mtx icu_lock;
236
237struct mem_range_softc mem_range_softc;
238
239struct mtx dt_lock; /* lock for GDT and LDT */
240
241void (*vmm_resume_p)(void);
242
243static void
 244cpu_startup(void *dummy)
246{
247 uintmax_t memsize;
248 char *sysenv;
249
250 /*
 251	 * On MacBooks, we have to prevent the legacy USB circuit from
 252	 * generating an SMI# because this can cause several problems,
 253	 * namely: incorrect CPU frequency detection and failure to
 254	 * start the APs.
255 * We do this by disabling a bit in the SMI_EN (SMI Control and
256 * Enable register) of the Intel ICH LPC Interface Bridge.
257 */
258 sysenv = kern_getenv("smbios.system.product");
259 if (sysenv != NULL) {
260 if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
261 strncmp(sysenv, "MacBook3,1", 10) == 0 ||
262 strncmp(sysenv, "MacBook4,1", 10) == 0 ||
263 strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
264 strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
265 strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
266 strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
267 strncmp(sysenv, "Macmini1,1", 10) == 0) {
268 if (bootverbose)
269 printf("Disabling LEGACY_USB_EN bit on "
270 "Intel ICH.\n");
271 outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
272 }
273 freeenv(sysenv);
274 }
275
276 /*
277 * Good {morning,afternoon,evening,night}.
278 */
279 startrtclock();
280 printcpuinfo();
281#ifdef PERFMON
282 perfmon_init();
283#endif
284
285 /*
 286	 * Display physical memory if SMBIOS reports a reasonable amount.
287 */
288 memsize = 0;
289 sysenv = kern_getenv("smbios.memory.enabled");
290 if (sysenv != NULL) {
291 memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
292 freeenv(sysenv);
293 }
294 if (memsize < ptoa((uintmax_t)vm_cnt.v_free_count))
295 memsize = ptoa((uintmax_t)Maxmem);
296 printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20);
297 realmem = atop(memsize);
298
299 /*
300 * Display any holes after the first chunk of extended memory.
301 */
302 if (bootverbose) {
303 int indx;
304
305 printf("Physical memory chunk(s):\n");
306 for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
307 vm_paddr_t size;
308
309 size = phys_avail[indx + 1] - phys_avail[indx];
310 printf(
311 "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
312 (uintmax_t)phys_avail[indx],
313 (uintmax_t)phys_avail[indx + 1] - 1,
314 (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
315 }
316 }
317
318 vm_ksubmap_init(&kmi);
319
320 printf("avail memory = %ju (%ju MB)\n",
321 ptoa((uintmax_t)vm_cnt.v_free_count),
322 ptoa((uintmax_t)vm_cnt.v_free_count) / 1048576);
323
324 /*
325 * Set up buffers, so they can be used to read disk labels.
326 */
327 bufinit();
328 vm_pager_bufferinit();
329
330 cpu_setregs();
331}
332
333/*
334 * Send an interrupt to process.
335 *
 336 * The stack is set up so that the sigcode stored at its top calls
 337 * the handler and then the sigreturn routine below.  After
 338 * sigreturn resets the signal mask, the stack, and the frame
 339 * pointer, it returns to the user-specified pc and psl.
342 */
343void
344sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
345{
346 struct sigframe sf, *sfp;
347 struct pcb *pcb;
348 struct proc *p;
349 struct thread *td;
350 struct sigacts *psp;
351 char *sp;
352 struct trapframe *regs;
353 char *xfpusave;
354 size_t xfpusave_len;
355 int sig;
356 int oonstack;
357
358 td = curthread;
359 pcb = td->td_pcb;
360 p = td->td_proc;
361 PROC_LOCK_ASSERT(p, MA_OWNED);
362 sig = ksi->ksi_signo;
363 psp = p->p_sigacts;
364 mtx_assert(&psp->ps_mtx, MA_OWNED);
365 regs = td->td_frame;
366 oonstack = sigonstack(regs->tf_rsp);
367
368 if (cpu_max_ext_state_size > sizeof(struct savefpu) && use_xsave) {
369 xfpusave_len = cpu_max_ext_state_size - sizeof(struct savefpu);
370 xfpusave = __builtin_alloca(xfpusave_len);
371 } else {
372 xfpusave_len = 0;
373 xfpusave = NULL;
374 }
375
376 /* Save user context. */
377 bzero(&sf, sizeof(sf));
378 sf.sf_uc.uc_sigmask = *mask;
379 sf.sf_uc.uc_stack = td->td_sigstk;
380 sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
381 ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
382 sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
383 bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(*regs));
384 sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
385 get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len);
386 fpstate_drop(td);
387 update_pcb_bases(pcb);
388 sf.sf_uc.uc_mcontext.mc_fsbase = pcb->pcb_fsbase;
389 sf.sf_uc.uc_mcontext.mc_gsbase = pcb->pcb_gsbase;
390 bzero(sf.sf_uc.uc_mcontext.mc_spare,
391 sizeof(sf.sf_uc.uc_mcontext.mc_spare));
392 bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__));
393
394 /* Allocate space for the signal handler context. */
395 if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
396 SIGISMEMBER(psp->ps_sigonstack, sig)) {
397 sp = (char *)td->td_sigstk.ss_sp + td->td_sigstk.ss_size;
398#if defined(COMPAT_43)
399 td->td_sigstk.ss_flags |= SS_ONSTACK;
400#endif
401 } else
402 sp = (char *)regs->tf_rsp - 128;
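	/* The 128 bytes skipped above preserve the amd64 ABI red zone. */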
403 if (xfpusave != NULL) {
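		/* The XSAVE area must be 64-byte aligned for XSAVE/XRSTOR. */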
404 sp -= xfpusave_len;
405 sp = (char *)((unsigned long)sp & ~0x3Ful);
406 sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp;
407 }
408 sp -= sizeof(struct sigframe);
409 /* Align to 16 bytes. */
410 sfp = (struct sigframe *)((unsigned long)sp & ~0xFul);
411
412 /* Build the argument list for the signal handler. */
413 regs->tf_rdi = sig; /* arg 1 in %rdi */
414 regs->tf_rdx = (register_t)&sfp->sf_uc; /* arg 3 in %rdx */
415 bzero(&sf.sf_si, sizeof(sf.sf_si));
416 if (SIGISMEMBER(psp->ps_siginfo, sig)) {
417 /* Signal handler installed with SA_SIGINFO. */
418 regs->tf_rsi = (register_t)&sfp->sf_si; /* arg 2 in %rsi */
419 sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;
420
421 /* Fill in POSIX parts */
422 sf.sf_si = ksi->ksi_info;
423 sf.sf_si.si_signo = sig; /* maybe a translated signal */
424 regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
425 } else {
426 /* Old FreeBSD-style arguments. */
427 regs->tf_rsi = ksi->ksi_code; /* arg 2 in %rsi */
428 regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
429 sf.sf_ahu.sf_handler = catcher;
430 }
431 mtx_unlock(&psp->ps_mtx);
432 PROC_UNLOCK(p);
433
434 /*
435 * Copy the sigframe out to the user's stack.
436 */
437 if (copyout(&sf, sfp, sizeof(*sfp)) != 0 ||
438 (xfpusave != NULL && copyout(xfpusave,
439 (void *)sf.sf_uc.uc_mcontext.mc_xfpustate, xfpusave_len)
440 != 0)) {
441#ifdef DEBUG
442 printf("process %ld has trashed its stack\n", (long)p->p_pid);
443#endif
444 PROC_LOCK(p);
445 sigexit(td, SIGILL);
446 }
447
448 regs->tf_rsp = (long)sfp;
449 regs->tf_rip = p->p_sysent->sv_sigcode_base;
450 regs->tf_rflags &= ~(PSL_T | PSL_D);
451 regs->tf_cs = _ucodesel;
452 regs->tf_ds = _udatasel;
453 regs->tf_ss = _udatasel;
454 regs->tf_es = _udatasel;
455 regs->tf_fs = _ufssel;
456 regs->tf_gs = _ugssel;
457 regs->tf_flags = TF_HASSEGS;
458 PROC_LOCK(p);
459 mtx_lock(&psp->ps_mtx);
460}
461
462/*
 463 * System call to clean up state after a signal
464 * has been taken. Reset signal mask and
465 * stack state from context left by sendsig (above).
466 * Return to previous pc and psl as specified by
467 * context left by sendsig. Check carefully to
468 * make sure that the user has not modified the
469 * state to gain improper privileges.
470 *
471 * MPSAFE
472 */
473int
 474sys_sigreturn(struct thread *td,
 475    struct sigreturn_args /* { const struct __ucontext *sigcntxp; } */ *uap)
479{
480 ucontext_t uc;
481 struct pcb *pcb;
482 struct proc *p;
483 struct trapframe *regs;
484 ucontext_t *ucp;
485 char *xfpustate;
486 size_t xfpustate_len;
487 long rflags;
488 int cs, error, ret;
489 ksiginfo_t ksi;
490
491 pcb = td->td_pcb;
492 p = td->td_proc;
493
494 error = copyin(uap->sigcntxp, &uc, sizeof(uc));
495 if (error != 0) {
496 uprintf("pid %d (%s): sigreturn copyin failed\n",
497 p->p_pid, td->td_name);
498 return (error);
499 }
500 ucp = &uc;
501 if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) {
502 uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid,
503 td->td_name, ucp->uc_mcontext.mc_flags);
504 return (EINVAL);
505 }
506 regs = td->td_frame;
507 rflags = ucp->uc_mcontext.mc_rflags;
508 /*
509 * Don't allow users to change privileged or reserved flags.
510 */
511 if (!EFL_SECURE(rflags, regs->tf_rflags)) {
512 uprintf("pid %d (%s): sigreturn rflags = 0x%lx\n", p->p_pid,
513 td->td_name, rflags);
514 return (EINVAL);
515 }
516
517 /*
518 * Don't allow users to load a valid privileged %cs. Let the
519 * hardware check for invalid selectors, excess privilege in
 520	 * other selectors, and invalid %rip and %rsp values.
521 */
522 cs = ucp->uc_mcontext.mc_cs;
523 if (!CS_SECURE(cs)) {
524 uprintf("pid %d (%s): sigreturn cs = 0x%x\n", p->p_pid,
525 td->td_name, cs);
526 ksiginfo_init_trap(&ksi);
527 ksi.ksi_signo = SIGBUS;
528 ksi.ksi_code = BUS_OBJERR;
529 ksi.ksi_trapno = T_PROTFLT;
530 ksi.ksi_addr = (void *)regs->tf_rip;
531 trapsignal(td, &ksi);
532 return (EINVAL);
533 }
534
535 if ((uc.uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) {
536 xfpustate_len = uc.uc_mcontext.mc_xfpustate_len;
537 if (xfpustate_len > cpu_max_ext_state_size -
538 sizeof(struct savefpu)) {
 539			uprintf("pid %d (%s): sigreturn xfpustate_len = 0x%zx\n",
540 p->p_pid, td->td_name, xfpustate_len);
541 return (EINVAL);
542 }
543 xfpustate = __builtin_alloca(xfpustate_len);
544 error = copyin((const void *)uc.uc_mcontext.mc_xfpustate,
545 xfpustate, xfpustate_len);
546 if (error != 0) {
547 uprintf(
548 "pid %d (%s): sigreturn copying xfpustate failed\n",
549 p->p_pid, td->td_name);
550 return (error);
551 }
552 } else {
553 xfpustate = NULL;
554 xfpustate_len = 0;
555 }
556 ret = set_fpcontext(td, &ucp->uc_mcontext, xfpustate, xfpustate_len);
557 if (ret != 0) {
558 uprintf("pid %d (%s): sigreturn set_fpcontext err %d\n",
559 p->p_pid, td->td_name, ret);
560 return (ret);
561 }
562 bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(*regs));
563 update_pcb_bases(pcb);
564 pcb->pcb_fsbase = ucp->uc_mcontext.mc_fsbase;
565 pcb->pcb_gsbase = ucp->uc_mcontext.mc_gsbase;
566
567#if defined(COMPAT_43)
568 if (ucp->uc_mcontext.mc_onstack & 1)
569 td->td_sigstk.ss_flags |= SS_ONSTACK;
570 else
571 td->td_sigstk.ss_flags &= ~SS_ONSTACK;
572#endif
573
574 kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
575 return (EJUSTRETURN);
576}
577
578#ifdef COMPAT_FREEBSD4
579int
580freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap)
581{
582
583 return sys_sigreturn(td, (struct sigreturn_args *)uap);
584}
585#endif
586
587/*
588 * Reset registers to default values on exec.
589 */
590void
591exec_setregs(struct thread *td, struct image_params *imgp, u_long stack)
592{
593 struct trapframe *regs;
594 struct pcb *pcb;
595 register_t saved_rflags;
596
597 regs = td->td_frame;
598 pcb = td->td_pcb;
599
600 mtx_lock(&dt_lock);
601 if (td->td_proc->p_md.md_ldt != NULL)
602 user_ldt_free(td);
603 else
604 mtx_unlock(&dt_lock);
605
606 update_pcb_bases(pcb);
607 pcb->pcb_fsbase = 0;
608 pcb->pcb_gsbase = 0;
609 clear_pcb_flags(pcb, PCB_32BIT);
610 pcb->pcb_initial_fpucw = __INITIAL_FPUCW__;
611
612 saved_rflags = regs->tf_rflags & PSL_T;
613 bzero((char *)regs, sizeof(struct trapframe));
614 regs->tf_rip = imgp->entry_addr;
615 regs->tf_rsp = ((stack - 8) & ~0xFul) + 8;
616 regs->tf_rdi = stack; /* argv */
617 regs->tf_rflags = PSL_USER | saved_rflags;
618 regs->tf_ss = _udatasel;
619 regs->tf_cs = _ucodesel;
620 regs->tf_ds = _udatasel;
621 regs->tf_es = _udatasel;
622 regs->tf_fs = _ufssel;
623 regs->tf_gs = _ugssel;
624 regs->tf_flags = TF_HASSEGS;
625 td->td_retval[1] = 0;
626
627 /*
628 * Reset the hardware debug registers if they were in use.
629 * They won't have any meaning for the newly exec'd process.
630 */
631 if (pcb->pcb_flags & PCB_DBREGS) {
632 pcb->pcb_dr0 = 0;
633 pcb->pcb_dr1 = 0;
634 pcb->pcb_dr2 = 0;
635 pcb->pcb_dr3 = 0;
636 pcb->pcb_dr6 = 0;
637 pcb->pcb_dr7 = 0;
638 if (pcb == curpcb) {
639 /*
640 * Clear the debug registers on the running
641 * CPU, otherwise they will end up affecting
642 * the next process we switch to.
643 */
644 reset_dbregs();
645 }
646 clear_pcb_flags(pcb, PCB_DBREGS);
647 }
648
649 /*
650 * Drop the FP state if we hold it, so that the process gets a
651 * clean FP state if it uses the FPU again.
652 */
653 fpstate_drop(td);
654}
655
656void
657cpu_setregs(void)
658{
659 register_t cr0;
660
661 cr0 = rcr0();
662 /*
663 * CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the
664 * BSP. See the comments there about why we set them.
665 */
666 cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
667 load_cr0(cr0);
668}
669
670/*
671 * Initialize amd64 and configure to run kernel
672 */
673
674/*
675 * Initialize segments & interrupt table
676 */
677
 678struct user_segment_descriptor gdt[NGDT * MAXCPU];	/* global descriptor tables */
679static struct gate_descriptor idt0[NIDT];
680struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */
681
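/*
 * Dedicated IST stacks for the double fault, machine check, NMI and
 * debug exception handlers; see the corresponding setidt() calls in
 * hammer_time().
 */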
682static char dblfault_stack[PAGE_SIZE] __aligned(16);
683static char mce0_stack[PAGE_SIZE] __aligned(16);
684static char nmi0_stack[PAGE_SIZE] __aligned(16);
685static char dbg0_stack[PAGE_SIZE] __aligned(16);
686CTASSERT(sizeof(struct nmi_pcpu) == 16);
687
688struct amd64tss common_tss[MAXCPU];
689
690/*
691 * Software prototypes -- in more palatable form.
692 *
693 * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
694 * slots as corresponding segments for i386 kernel.
695 */
696struct soft_segment_descriptor gdt_segs[] = {
697/* GNULL_SEL 0 Null Descriptor */
698{ .ssd_base = 0x0,
699 .ssd_limit = 0x0,
700 .ssd_type = 0,
701 .ssd_dpl = 0,
702 .ssd_p = 0,
703 .ssd_long = 0,
704 .ssd_def32 = 0,
705 .ssd_gran = 0 },
706/* GNULL2_SEL 1 Null Descriptor */
707{ .ssd_base = 0x0,
708 .ssd_limit = 0x0,
709 .ssd_type = 0,
710 .ssd_dpl = 0,
711 .ssd_p = 0,
712 .ssd_long = 0,
713 .ssd_def32 = 0,
714 .ssd_gran = 0 },
 715/* GUFS32_SEL	2 32 bit %fs Descriptor for user */
716{ .ssd_base = 0x0,
717 .ssd_limit = 0xfffff,
718 .ssd_type = SDT_MEMRWA,
719 .ssd_dpl = SEL_UPL,
720 .ssd_p = 1,
721 .ssd_long = 0,
722 .ssd_def32 = 1,
723 .ssd_gran = 1 },
 724/* GUGS32_SEL	3 32 bit %gs Descriptor for user */
725{ .ssd_base = 0x0,
726 .ssd_limit = 0xfffff,
727 .ssd_type = SDT_MEMRWA,
728 .ssd_dpl = SEL_UPL,
729 .ssd_p = 1,
730 .ssd_long = 0,
731 .ssd_def32 = 1,
732 .ssd_gran = 1 },
733/* GCODE_SEL 4 Code Descriptor for kernel */
734{ .ssd_base = 0x0,
735 .ssd_limit = 0xfffff,
736 .ssd_type = SDT_MEMERA,
737 .ssd_dpl = SEL_KPL,
738 .ssd_p = 1,
739 .ssd_long = 1,
740 .ssd_def32 = 0,
741 .ssd_gran = 1 },
742/* GDATA_SEL 5 Data Descriptor for kernel */
743{ .ssd_base = 0x0,
744 .ssd_limit = 0xfffff,
745 .ssd_type = SDT_MEMRWA,
746 .ssd_dpl = SEL_KPL,
747 .ssd_p = 1,
748 .ssd_long = 1,
749 .ssd_def32 = 0,
750 .ssd_gran = 1 },
751/* GUCODE32_SEL 6 32 bit Code Descriptor for user */
752{ .ssd_base = 0x0,
753 .ssd_limit = 0xfffff,
754 .ssd_type = SDT_MEMERA,
755 .ssd_dpl = SEL_UPL,
756 .ssd_p = 1,
757 .ssd_long = 0,
758 .ssd_def32 = 1,
759 .ssd_gran = 1 },
760/* GUDATA_SEL 7 32/64 bit Data Descriptor for user */
761{ .ssd_base = 0x0,
762 .ssd_limit = 0xfffff,
763 .ssd_type = SDT_MEMRWA,
764 .ssd_dpl = SEL_UPL,
765 .ssd_p = 1,
766 .ssd_long = 0,
767 .ssd_def32 = 1,
768 .ssd_gran = 1 },
769/* GUCODE_SEL 8 64 bit Code Descriptor for user */
770{ .ssd_base = 0x0,
771 .ssd_limit = 0xfffff,
772 .ssd_type = SDT_MEMERA,
773 .ssd_dpl = SEL_UPL,
774 .ssd_p = 1,
775 .ssd_long = 1,
776 .ssd_def32 = 0,
777 .ssd_gran = 1 },
778/* GPROC0_SEL 9 Proc 0 Tss Descriptor */
779{ .ssd_base = 0x0,
780 .ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
781 .ssd_type = SDT_SYSTSS,
782 .ssd_dpl = SEL_KPL,
783 .ssd_p = 1,
784 .ssd_long = 0,
785 .ssd_def32 = 0,
786 .ssd_gran = 0 },
787/* Actually, the TSS is a system descriptor which is double size */
788{ .ssd_base = 0x0,
789 .ssd_limit = 0x0,
790 .ssd_type = 0,
791 .ssd_dpl = 0,
792 .ssd_p = 0,
793 .ssd_long = 0,
794 .ssd_def32 = 0,
795 .ssd_gran = 0 },
796/* GUSERLDT_SEL 11 LDT Descriptor */
797{ .ssd_base = 0x0,
798 .ssd_limit = 0x0,
799 .ssd_type = 0,
800 .ssd_dpl = 0,
801 .ssd_p = 0,
802 .ssd_long = 0,
803 .ssd_def32 = 0,
804 .ssd_gran = 0 },
805/* GUSERLDT_SEL 12 LDT Descriptor, double size */
806{ .ssd_base = 0x0,
807 .ssd_limit = 0x0,
808 .ssd_type = 0,
809 .ssd_dpl = 0,
810 .ssd_p = 0,
811 .ssd_long = 0,
812 .ssd_def32 = 0,
813 .ssd_gran = 0 },
814};
815
816void
817setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
818{
819 struct gate_descriptor *ip;
820
821 ip = idt + idx;
822 ip->gd_looffset = (uintptr_t)func;
823 ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
824 ip->gd_ist = ist;
825 ip->gd_xx = 0;
826 ip->gd_type = typ;
827 ip->gd_dpl = dpl;
828 ip->gd_p = 1;
 829	ip->gd_hioffset = ((uintptr_t)func) >> 16;
830}
831
832extern inthand_t
833 IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
834 IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
835 IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
836 IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
837 IDTVEC(xmm), IDTVEC(dblfault),
838 IDTVEC(div_pti), IDTVEC(bpt_pti),
839 IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
840 IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
841 IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
842 IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
843 IDTVEC(xmm_pti),
844#ifdef KDTRACE_HOOKS
845 IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
846#endif
847#ifdef XENHVM
848 IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
849#endif
850 IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
851 IDTVEC(fast_syscall_pti);
852
853#ifdef DDB
854/*
855 * Display the index and function name of any IDT entries that don't use
856 * the default 'rsvd' entry point.
857 */
858DB_SHOW_COMMAND(idt, db_show_idt)
859{
860 struct gate_descriptor *ip;
861 int idx;
862 uintptr_t func;
863
864 ip = idt;
865 for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
866 func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
867 if (func != (uintptr_t)&IDTVEC(rsvd)) {
868 db_printf("%3d\t", idx);
869 db_printsym(func, DB_STGY_PROC);
870 db_printf("\n");
871 }
872 ip++;
873 }
874}
875
876/* Show privileged registers. */
877DB_SHOW_COMMAND(sysregs, db_show_sysregs)
878{
879 struct {
880 uint16_t limit;
881 uint64_t base;
882 } __packed idtr, gdtr;
883 uint16_t ldt, tr;
884
885 __asm __volatile("sidt %0" : "=m" (idtr));
886 db_printf("idtr\t0x%016lx/%04x\n",
887 (u_long)idtr.base, (u_int)idtr.limit);
888 __asm __volatile("sgdt %0" : "=m" (gdtr));
889 db_printf("gdtr\t0x%016lx/%04x\n",
890 (u_long)gdtr.base, (u_int)gdtr.limit);
891 __asm __volatile("sldt %0" : "=r" (ldt));
892 db_printf("ldtr\t0x%04x\n", ldt);
893 __asm __volatile("str %0" : "=r" (tr));
894 db_printf("tr\t0x%04x\n", tr);
895 db_printf("cr0\t0x%016lx\n", rcr0());
896 db_printf("cr2\t0x%016lx\n", rcr2());
897 db_printf("cr3\t0x%016lx\n", rcr3());
898 db_printf("cr4\t0x%016lx\n", rcr4());
899 if (rcr4() & CR4_XSAVE)
900 db_printf("xcr0\t0x%016lx\n", rxcr(0));
901 db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER));
902 if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
903 db_printf("FEATURES_CTL\t%016lx\n",
904 rdmsr(MSR_IA32_FEATURE_CONTROL));
905 db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR));
906 db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT));
907 db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE));
908}
909
910DB_SHOW_COMMAND(dbregs, db_show_dbregs)
911{
912
913 db_printf("dr0\t0x%016lx\n", rdr0());
914 db_printf("dr1\t0x%016lx\n", rdr1());
915 db_printf("dr2\t0x%016lx\n", rdr2());
916 db_printf("dr3\t0x%016lx\n", rdr3());
917 db_printf("dr6\t0x%016lx\n", rdr6());
918 db_printf("dr7\t0x%016lx\n", rdr7());
919}
920#endif
921
922void
 923sdtossd(struct user_segment_descriptor *sd,
 924    struct soft_segment_descriptor *ssd)
926{
927
928 ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase;
929 ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
930 ssd->ssd_type = sd->sd_type;
931 ssd->ssd_dpl = sd->sd_dpl;
932 ssd->ssd_p = sd->sd_p;
933 ssd->ssd_long = sd->sd_long;
934 ssd->ssd_def32 = sd->sd_def32;
935 ssd->ssd_gran = sd->sd_gran;
936}
937
938void
 939ssdtosd(struct soft_segment_descriptor *ssd,
 940    struct user_segment_descriptor *sd)
942{
943
944 sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
945 sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
946 sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
947 sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
948 sd->sd_type = ssd->ssd_type;
949 sd->sd_dpl = ssd->ssd_dpl;
950 sd->sd_p = ssd->ssd_p;
951 sd->sd_long = ssd->ssd_long;
952 sd->sd_def32 = ssd->ssd_def32;
953 sd->sd_gran = ssd->ssd_gran;
954}
955
956void
 957ssdtosyssd(struct soft_segment_descriptor *ssd,
 958    struct system_segment_descriptor *sd)
960{
961
962 sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
963 sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
964 sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
965 sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
966 sd->sd_type = ssd->ssd_type;
967 sd->sd_dpl = ssd->ssd_dpl;
968 sd->sd_p = ssd->ssd_p;
969 sd->sd_gran = ssd->ssd_gran;
970}
971
972#if !defined(DEV_ATPIC) && defined(DEV_ISA)
973#include <isa/isavar.h>
974#include <isa/isareg.h>
975/*
976 * Return a bitmap of the current interrupt requests. This is 8259-specific
977 * and is only suitable for use at probe time.
978 * This is only here to pacify sio. It is NOT FATAL if this doesn't work.
 979 * It shouldn't be here.  There should probably be an APIC-centric
980 * implementation in the apic driver code, if at all.
981 */
982intrmask_t
983isa_irq_pending(void)
984{
985 u_char irr1;
986 u_char irr2;
987
988 irr1 = inb(IO_ICU1);
989 irr2 = inb(IO_ICU2);
990 return ((irr2 << 8) | irr1);
991}
992#endif
993
994u_int basemem;
995
996static int
997add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
998 int *physmap_idxp)
999{
1000 int i, insert_idx, physmap_idx;
1001
1002 physmap_idx = *physmap_idxp;
1003
1004 if (length == 0)
1005 return (1);
1006
1007 /*
1008 * Find insertion point while checking for overlap. Start off by
1009 * assuming the new entry will be added to the end.
1010 *
1011 * NB: physmap_idx points to the next free slot.
1012 */
1013 insert_idx = physmap_idx;
1014 for (i = 0; i <= physmap_idx; i += 2) {
1015 if (base < physmap[i + 1]) {
1016 if (base + length <= physmap[i]) {
1017 insert_idx = i;
1018 break;
1019 }
1020 if (boothowto & RB_VERBOSE)
1021 printf(
1022 "Overlapping memory regions, ignoring second region\n");
1023 return (1);
1024 }
1025 }
1026
1027 /* See if we can prepend to the next entry. */
1028 if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) {
1029 physmap[insert_idx] = base;
1030 return (1);
1031 }
1032
1033 /* See if we can append to the previous entry. */
1034 if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
1035 physmap[insert_idx - 1] += length;
1036 return (1);
1037 }
1038
1039 physmap_idx += 2;
1040 *physmap_idxp = physmap_idx;
1041 if (physmap_idx == PHYSMAP_SIZE) {
1042 printf(
1043 "Too many segments in the physical address map, giving up\n");
1044 return (0);
1045 }
1046
1047 /*
1048 * Move the last 'N' entries down to make room for the new
1049 * entry if needed.
1050 */
1051 for (i = (physmap_idx - 2); i > insert_idx; i -= 2) {
1052 physmap[i] = physmap[i - 2];
1053 physmap[i + 1] = physmap[i - 1];
1054 }
1055
1056 /* Insert the new entry. */
1057 physmap[insert_idx] = base;
1058 physmap[insert_idx + 1] = base + length;
1059 return (1);
1060}
1061
1062void
1063bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize,
1064 vm_paddr_t *physmap, int *physmap_idx)
1065{
1066 struct bios_smap *smap, *smapend;
1067
1068 smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
1069
1070 for (smap = smapbase; smap < smapend; smap++) {
1071 if (boothowto & RB_VERBOSE)
1072 printf("SMAP type=%02x base=%016lx len=%016lx\n",
1073 smap->type, smap->base, smap->length);
1074
1075 if (smap->type != SMAP_TYPE_MEMORY)
1076 continue;
1077
1078 if (!add_physmap_entry(smap->base, smap->length, physmap,
1079 physmap_idx))
1080 break;
1081 }
1082}
1083
1084static void
1085add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
1086 int *physmap_idx)
1087{
1088 struct efi_md *map, *p;
1089 const char *type;
1090 size_t efisz;
1091 int ndesc, i;
1092
1093 static const char *types[] = {
1094 "Reserved",
1095 "LoaderCode",
1096 "LoaderData",
1097 "BootServicesCode",
1098 "BootServicesData",
1099 "RuntimeServicesCode",
1100 "RuntimeServicesData",
1101 "ConventionalMemory",
1102 "UnusableMemory",
1103 "ACPIReclaimMemory",
1104 "ACPIMemoryNVS",
1105 "MemoryMappedIO",
1106 "MemoryMappedIOPortSpace",
1107 "PalCode",
1108 "PersistentMemory"
1109 };
1110
1111 /*
1112 * Memory map data provided by UEFI via the GetMemoryMap
1113 * Boot Services API.
1114 */
1115 efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
1116 map = (struct efi_md *)((uint8_t *)efihdr + efisz);
1117
1118 if (efihdr->descriptor_size == 0)
1119 return;
1120 ndesc = efihdr->memory_size / efihdr->descriptor_size;
1121
1122 if (boothowto & RB_VERBOSE)
1123 printf("%23s %12s %12s %8s %4s\n",
1124 "Type", "Physical", "Virtual", "#Pages", "Attr");
1125
1126 for (i = 0, p = map; i < ndesc; i++,
1127 p = efi_next_descriptor(p, efihdr->descriptor_size)) {
1128 if (boothowto & RB_VERBOSE) {
1129 if (p->md_type < nitems(types))
1130 type = types[p->md_type];
1131 else
1132 type = "<INVALID>";
1133 printf("%23s %012lx %12p %08lx ", type, p->md_phys,
1134 p->md_virt, p->md_pages);
1135 if (p->md_attr & EFI_MD_ATTR_UC)
1136 printf("UC ");
1137 if (p->md_attr & EFI_MD_ATTR_WC)
1138 printf("WC ");
1139 if (p->md_attr & EFI_MD_ATTR_WT)
1140 printf("WT ");
1141 if (p->md_attr & EFI_MD_ATTR_WB)
1142 printf("WB ");
1143 if (p->md_attr & EFI_MD_ATTR_UCE)
1144 printf("UCE ");
1145 if (p->md_attr & EFI_MD_ATTR_WP)
1146 printf("WP ");
1147 if (p->md_attr & EFI_MD_ATTR_RP)
1148 printf("RP ");
1149 if (p->md_attr & EFI_MD_ATTR_XP)
1150 printf("XP ");
1151 if (p->md_attr & EFI_MD_ATTR_NV)
1152 printf("NV ");
1153 if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE)
1154 printf("MORE_RELIABLE ");
1155 if (p->md_attr & EFI_MD_ATTR_RO)
1156 printf("RO ");
1157 if (p->md_attr & EFI_MD_ATTR_RT)
1158 printf("RUNTIME");
1159 printf("\n");
1160 }
1161
1162 switch (p->md_type) {
1163 case EFI_MD_TYPE_CODE:
1164 case EFI_MD_TYPE_DATA:
1165 case EFI_MD_TYPE_BS_CODE:
1166 case EFI_MD_TYPE_BS_DATA:
1167 case EFI_MD_TYPE_FREE:
1168 /*
1169 * We're allowed to use any entry with these types.
1170 */
1171 break;
1172 default:
1173 continue;
1174 }
1175
1176 if (!add_physmap_entry(p->md_phys, (p->md_pages * PAGE_SIZE),
1177 physmap, physmap_idx))
1178 break;
1179 }
1180}
1181
1182static char bootmethod[16] = "";
1183SYSCTL_STRING(_machdep, OID_AUTO, bootmethod, CTLFLAG_RD, bootmethod, 0,
1184 "System firmware boot method");
1185
1186static void
1187native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx)
1188{
1189 struct bios_smap *smap;
1190 struct efi_map_header *efihdr;
1191 u_int32_t size;
1192
1193 /*
1194 * Memory map from INT 15:E820.
1195 *
1196 * subr_module.c says:
1197 * "Consumer may safely assume that size value precedes data."
 1198 * i.e., a u_int32_t immediately precedes smap.
1199 */
1200
1201 efihdr = (struct efi_map_header *)preload_search_info(kmdp,
1202 MODINFO_METADATA | MODINFOMD_EFI_MAP);
1203 smap = (struct bios_smap *)preload_search_info(kmdp,
1204 MODINFO_METADATA | MODINFOMD_SMAP);
1205 if (efihdr == NULL && smap == NULL)
1206 panic("No BIOS smap or EFI map info from loader!");
1207
1208 if (efihdr != NULL) {
1209 add_efi_map_entries(efihdr, physmap, physmap_idx);
1210 strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
1211 } else {
1212 size = *((u_int32_t *)smap - 1);
1213 bios_add_smap_entries(smap, size, physmap, physmap_idx);
1214 strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
1215 }
1216}
1217
1218#define PAGES_PER_GB (1024 * 1024 * 1024 / PAGE_SIZE)
1219
1220/*
1221 * Populate the (physmap) array with base/bound pairs describing the
1222 * available physical memory in the system, then test this memory and
1223 * build the phys_avail array describing the actually-available memory.
1224 *
1225 * Total memory size may be set by the kernel environment variable
1226 * hw.physmem or the compile-time define MAXMEM.
1227 *
1228 * XXX first should be vm_paddr_t.
1229 */
1230static void
1231getmemsize(caddr_t kmdp, u_int64_t first)
1232{
1233 int i, physmap_idx, pa_indx, da_indx;
1234 vm_paddr_t pa, physmap[PHYSMAP_SIZE];
1235 u_long physmem_start, physmem_tunable, memtest;
1236 pt_entry_t *pte;
1237 quad_t dcons_addr, dcons_size;
1238 int page_counter;
1239
1240 /*
1241 * Tell the physical memory allocator about pages used to store
1242 * the kernel and preloaded data. See kmem_bootstrap_free().
1243 */
1244 vm_phys_add_seg((vm_paddr_t)kernphys, trunc_page(first));
1245
1246 bzero(physmap, sizeof(physmap));
1247 physmap_idx = 0;
1248
1249 init_ops.parse_memmap(kmdp, physmap, &physmap_idx);
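	/*
	 * parse_memmap leaves physmap_idx at the next free slot; step
	 * back so it indexes the base of the last (base, end) pair.
	 */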
1250 physmap_idx -= 2;
1251
1252 /*
1253 * Find the 'base memory' segment for SMP
1254 */
1255 basemem = 0;
1256 for (i = 0; i <= physmap_idx; i += 2) {
1257 if (physmap[i] <= 0xA0000) {
1258 basemem = physmap[i + 1] / 1024;
1259 break;
1260 }
1261 }
1262 if (basemem == 0 || basemem > 640) {
1263 if (bootverbose)
1264 printf(
 1265		"Memory map doesn't contain a basemem segment, faking it\n");
1266 basemem = 640;
1267 }
1268
1269 /*
1270 * Make hole for "AP -> long mode" bootstrap code. The
 1271	 * mp_bootaddress vector is only available when the kernel is
 1272	 * configured to support APs and the system's APs start in 32-bit
 1273	 * mode (e.g. SMP on bare metal).
1274 */
1275 if (init_ops.mp_bootaddress) {
1276 if (physmap[1] >= 0x100000000)
1277 panic(
1278 "Basemem segment is not suitable for AP bootstrap code!");
1279 physmap[1] = init_ops.mp_bootaddress(physmap[1] / 1024);
1280 }
1281
1282 /*
1283 * Maxmem isn't the "maximum memory", it's one larger than the
1284 * highest page of the physical address space. It should be
1285 * called something like "Maxphyspage". We may adjust this
1286 * based on ``hw.physmem'' and the results of the memory test.
1287 */
1288 Maxmem = atop(physmap[physmap_idx + 1]);
1289
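	/* MAXMEM is specified in kilobytes; convert it to 4KB pages. */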
1290#ifdef MAXMEM
1291 Maxmem = MAXMEM / 4;
1292#endif
1293
1294 if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
1295 Maxmem = atop(physmem_tunable);
1296
1297 /*
1298 * The boot memory test is disabled by default, as it takes a
1299 * significant amount of time on large-memory systems, and is
1300 * unfriendly to virtual machines as it unnecessarily touches all
1301 * pages.
1302 *
1303 * A general name is used as the code may be extended to support
1304 * additional tests beyond the current "page present" test.
1305 */
1306 memtest = 0;
1307 TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);
1308
1309 /*
1310 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
1311 * in the system.
1312 */
1313 if (Maxmem > atop(physmap[physmap_idx + 1]))
1314 Maxmem = atop(physmap[physmap_idx + 1]);
1315
1316 if (atop(physmap[physmap_idx + 1]) != Maxmem &&
1317 (boothowto & RB_VERBOSE))
1318 printf("Physical memory use set to %ldK\n", Maxmem * 4);
1319
1320 /* call pmap initialization to make new kernel address space */
1321 pmap_bootstrap(&first);
1322
1323 /*
1324 * Size up each available chunk of physical memory.
1325 *
1326 * XXX Some BIOSes corrupt low 64KB between suspend and resume.
1327 * By default, mask off the first 16 pages unless we appear to be
1328 * running in a VM.
1329 */
1330 physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
1331 TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
1332 if (physmap[0] < physmem_start) {
1333 if (physmem_start < PAGE_SIZE)
1334 physmap[0] = PAGE_SIZE;
1335 else if (physmem_start >= physmap[1])
1336 physmap[0] = round_page(physmap[1] - PAGE_SIZE);
1337 else
1338 physmap[0] = round_page(physmem_start);
1339 }
1340 pa_indx = 0;
1341 da_indx = 1;
1342 phys_avail[pa_indx++] = physmap[0];
1343 phys_avail[pa_indx] = physmap[0];
1344 dump_avail[da_indx] = physmap[0];
1345 pte = CMAP1;
1346
1347 /*
1348 * Get dcons buffer address
1349 */
1350 if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
1351 getenv_quad("dcons.size", &dcons_size) == 0)
1352 dcons_addr = 0;
1353
1354 /*
1355 * physmap is in bytes, so when converting to page boundaries,
1356 * round up the start address and round down the end address.
1357 */
1358 page_counter = 0;
1359 if (memtest != 0)
1360 printf("Testing system memory");
1361 for (i = 0; i <= physmap_idx; i += 2) {
1362 vm_paddr_t end;
1363
1364 end = ptoa((vm_paddr_t)Maxmem);
1365 if (physmap[i + 1] < end)
1366 end = trunc_page(physmap[i + 1]);
1367 for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
1368 int tmp, page_bad, full;
1369 int *ptr = (int *)CADDR1;
1370
1371 full = FALSE;
1372 /*
1373 * block out kernel memory as not available.
1374 */
1375 if (pa >= (vm_paddr_t)kernphys && pa < first)
1376 goto do_dump_avail;
1377
1378 /*
1379 * block out dcons buffer
1380 */
1381 if (dcons_addr > 0
1382 && pa >= trunc_page(dcons_addr)
1383 && pa < dcons_addr + dcons_size)
1384 goto do_dump_avail;
1385
1386 page_bad = FALSE;
1387 if (memtest == 0)
1388 goto skip_memtest;
1389
1390 /*
1391 * Print a "." every GB to show we're making
1392 * progress.
1393 */
1394 page_counter++;
1395 if ((page_counter % PAGES_PER_GB) == 0)
1396 printf(".");
1397
1398 /*
 1399			 * map page into kernel: valid, read/write, non-cacheable
1400 */
1401 *pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
1402 invltlb();
1403
1404 tmp = *(int *)ptr;
1405 /*
1406 * Test for alternating 1's and 0's
1407 */
1408 *(volatile int *)ptr = 0xaaaaaaaa;
1409 if (*(volatile int *)ptr != 0xaaaaaaaa)
1410 page_bad = TRUE;
1411 /*
1412 * Test for alternating 0's and 1's
1413 */
1414 *(volatile int *)ptr = 0x55555555;
1415 if (*(volatile int *)ptr != 0x55555555)
1416 page_bad = TRUE;
1417 /*
1418 * Test for all 1's
1419 */
1420 *(volatile int *)ptr = 0xffffffff;
1421 if (*(volatile int *)ptr != 0xffffffff)
1422 page_bad = TRUE;
1423 /*
1424 * Test for all 0's
1425 */
1426 *(volatile int *)ptr = 0x0;
1427 if (*(volatile int *)ptr != 0x0)
1428 page_bad = TRUE;
1429 /*
1430 * Restore original value.
1431 */
1432 *(int *)ptr = tmp;
1433
1434skip_memtest:
1435 /*
1436 * Adjust array of valid/good pages.
1437 */
1438 if (page_bad == TRUE)
1439 continue;
1440 /*
1441 * If this good page is a continuation of the
1442 * previous set of good pages, then just increase
1443 * the end pointer. Otherwise start a new chunk.
 1444			 * Note that "end" is exclusive: it points one page
 1445			 * past the last page of the chunk, so the range is
 1446			 * >= start and < end.  If we're also doing a
 1447			 * speculative memory test and we are at or past the
 1448			 * end, bump up Maxmem so that we keep going.  The
 1449			 * first bad page will terminate the loop.
1450 */
1451 if (phys_avail[pa_indx] == pa) {
1452 phys_avail[pa_indx] += PAGE_SIZE;
1453 } else {
1454 pa_indx++;
1455 if (pa_indx == PHYS_AVAIL_ARRAY_END) {
1456 printf(
1457 "Too many holes in the physical address space, giving up\n");
1458 pa_indx--;
1459 full = TRUE;
1460 goto do_dump_avail;
1461 }
1462 phys_avail[pa_indx++] = pa; /* start */
1463 phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
1464 }
1465 physmem++;
1466do_dump_avail:
1467 if (dump_avail[da_indx] == pa) {
1468 dump_avail[da_indx] += PAGE_SIZE;
1469 } else {
1470 da_indx++;
1471 if (da_indx == DUMP_AVAIL_ARRAY_END) {
1472 da_indx--;
1473 goto do_next;
1474 }
1475 dump_avail[da_indx++] = pa; /* start */
1476 dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
1477 }
1478do_next:
1479 if (full)
1480 break;
1481 }
1482 }
1483 *pte = 0;
1484 invltlb();
1485 if (memtest != 0)
1486 printf("\n");
1487
1488 /*
1489 * XXX
1490 * The last chunk must contain at least one page plus the message
1491 * buffer to avoid complicating other code (message buffer address
1492 * calculation, etc.).
1493 */
1494 while (phys_avail[pa_indx - 1] + PAGE_SIZE +
1495 round_page(msgbufsize) >= phys_avail[pa_indx]) {
1496 physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
1497 phys_avail[pa_indx--] = 0;
1498 phys_avail[pa_indx--] = 0;
1499 }
1500
1501 Maxmem = atop(phys_avail[pa_indx]);
1502
1503 /* Trim off space for the message buffer. */
1504 phys_avail[pa_indx] -= round_page(msgbufsize);
1505
1506 /* Map the message buffer. */
1507 msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
1508}
1509
1510static caddr_t
1511native_parse_preload_data(u_int64_t modulep)
1512{
1513 caddr_t kmdp;
1514 char *envp;
1515#ifdef DDB
1516 vm_offset_t ksym_start;
1517 vm_offset_t ksym_end;
1518#endif
1519
1520 preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
1521 preload_bootstrap_relocate(KERNBASE);
1522 kmdp = preload_search_by_type("elf kernel");
1523 if (kmdp == NULL)
1524 kmdp = preload_search_by_type("elf64 kernel");
1525 boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
1526 envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *);
1527 if (envp != NULL)
1528 envp += KERNBASE;
1529 init_static_kenv(envp, 0);
1530#ifdef DDB
1531 ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
1532 ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
1533 db_fetch_ksymtab(ksym_start, ksym_end);
1534#endif
1535 efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);
1536
1537 return (kmdp);
1538}
1539
1540static void
1541amd64_kdb_init(void)
1542{
1543 kdb_init();
1544#ifdef KDB
1545 if (boothowto & RB_KDB)
1546 kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
1547#endif
1548}
1549
1550/* Set up the fast syscall stuff */
1551void
1552amd64_conf_fast_syscall(void)
1553{
1554 uint64_t msr;
1555
1556 msr = rdmsr(MSR_EFER) | EFER_SCE;
1557 wrmsr(MSR_EFER, msr);
1558 wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) :
1559 (u_int64_t)IDTVEC(fast_syscall));
1560 wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
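	/*
	 * MSR_STAR bits 47:32 hold the kernel %cs loaded by SYSCALL
	 * (%ss is that value + 8); bits 63:48 hold the selector base
	 * from which SYSRET derives the user %cs and %ss.
	 */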
1561 msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
1562 ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
1563 wrmsr(MSR_STAR, msr);
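	/* rflags bits set in MSR_SF_MASK are cleared on SYSCALL entry. */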
1564 wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D);
1565}
1566
1567u_int64_t
1568hammer_time(u_int64_t modulep, u_int64_t physfree)
1569{
1570 caddr_t kmdp;
1571 int gsel_tss, x;
1572 struct pcpu *pc;
1573 struct nmi_pcpu *np;
1574 struct xstate_hdr *xhdr;
1575 u_int64_t rsp0;
1576 char *env;
1577 size_t kstack0_sz;
1578 int late_console;
1579
1580 kmdp = init_ops.parse_preload_data(modulep);
1581
1582 physfree += ucode_load_bsp(physfree + KERNBASE);
1583 physfree = roundup2(physfree, PAGE_SIZE);
1584
1585 identify_cpu1();
1586 identify_hypervisor();
1587 /*
 1588	 * hw.cpu_stdext_disable is ignored by the call; it will be
 1589	 * re-evaluated by the call to finishidentcpu() below.
1590 */
1591 identify_cpu2();
1592
1593 link_elf_ireloc(kmdp);
1594
1595 /*
1596 * This may be done better later if it gets more high level
1597 * components in it. If so just link td->td_proc here.
1598 */
1599 proc_linkup0(&proc0, &thread0);
1600
 1601	/* Init basic tunables, hz, etc. */
1602 init_param1();
1603
1604 thread0.td_kstack = physfree + KERNBASE;
1605 thread0.td_kstack_pages = kstack_pages;
1606 kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
1607 bzero((void *)thread0.td_kstack, kstack0_sz);
1608 physfree += kstack0_sz;
1609
1610 /*
1611 * make gdt memory segments
1612 */
1613 for (x = 0; x < NGDT; x++) {
1614 if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
 1615		    x != GUSERLDT_SEL && x != GUSERLDT_SEL + 1)
1616 ssdtosd(&gdt_segs[x], &gdt[x]);
1617 }
1618 gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&common_tss[0];
1619 ssdtosyssd(&gdt_segs[GPROC0_SEL],
1620 (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
1621
1622 r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
1623 r_gdt.rd_base = (long) gdt;
1624 lgdt(&r_gdt);
1625 pc = &__pcpu[0];
1626
1627 wrmsr(MSR_FSBASE, 0); /* User value */
1628 wrmsr(MSR_GSBASE, (u_int64_t)pc);
1629 wrmsr(MSR_KGSBASE, 0); /* User value while in the kernel */
1630
1631 pcpu_init(pc, 0, sizeof(struct pcpu));
1632 dpcpu_init((void *)(physfree + KERNBASE), 0);
1633 physfree += DPCPU_SIZE;
1634 PCPU_SET(prvspace, pc);
1635 PCPU_SET(curthread, &thread0);
1636 /* Non-late cninit() and printf() can be moved up to here. */
1637 PCPU_SET(tssp, &common_tss[0]);
1638 PCPU_SET(commontssp, &common_tss[0]);
1639 PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
1640 PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
1641 PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
1642 PCPU_SET(gs32p, &gdt[GUGS32_SEL]);
1643
1644 /*
1645 * Initialize mutexes.
1646 *
1647 * icu_lock: in order to allow an interrupt to occur in a critical
1648 * section, to set pcpu->ipending (etc...) properly, we
1649 * must be able to get the icu lock, so it can't be
1650 * under witness.
1651 */
1652 mutex_init();
1653 mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
1654 mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);
1655
1656 /* exceptions */
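	/*
	 * pti controls the page-table isolation (Meltdown) workaround:
	 * pti_get_default() picks a CPU-based default, which the
	 * vm.pmap.pti tunable may override.
	 */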
1657 pti = pti_get_default();
1658 TUNABLE_INT_FETCH("vm.pmap.pti", &pti);
1659
1660 for (x = 0; x < NIDT; x++)
1661 setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT,
1662 SEL_KPL, 0);
1663 setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
1664 SEL_KPL, 0);
1665 setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
1666 setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 2);
1667 setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
1668 SEL_UPL, 0);
1669 setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT,
1670 SEL_UPL, 0);
1671 setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT,
1672 SEL_KPL, 0);
1673 setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT,
1674 SEL_KPL, 0);
1675 setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT,
1676 SEL_KPL, 0);
1677 setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
1678 setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm),
1679 SDT_SYSIGT, SEL_KPL, 0);
1680 setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT,
1681 SEL_KPL, 0);
1682 setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing),
1683 SDT_SYSIGT, SEL_KPL, 0);
1684 setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT,
1685 SEL_KPL, 0);
1686 setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT,
1687 SEL_KPL, 0);
1688 setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT,
1689 SEL_KPL, 0);
1690 setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT,
1691 SEL_KPL, 0);
1692 setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
1693 SEL_KPL, 0);
1694 setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3);
1695 setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
1696 SEL_KPL, 0);
1697#ifdef KDTRACE_HOOKS
1698 setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) :
1699 &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
1700#endif
1701#ifdef XENHVM
1702 setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) :
1703 &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0);
1704#endif
1705 r_idt.rd_limit = sizeof(idt0) - 1;
1706 r_idt.rd_base = (long) idt;
1707 lidt(&r_idt);
1708
1709 /*
1710 * Initialize the clock before the console so that console
1711 * initialization can use DELAY().
1712 */
1713 clock_init();
1714
1715 /*
1716 * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4)
1717 * transition).
 1718	 * Once the bootblocks have been updated, we can test directly for
1719 * efi_systbl != NULL here...
1720 */
1721 if (preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP)
1722 != NULL)
1723 vty_set_preferred(VTY_VT);
1724
1725 TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
1726 TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable);
1727 TUNABLE_INT_FETCH("hw.mds_disable", &hw_mds_disable);
1728 TUNABLE_INT_FETCH("machdep.mitigations.taa.enable", &x86_taa_enable);
1729
43
44#include "opt_atpic.h"
45#include "opt_compat.h"
46#include "opt_cpu.h"
47#include "opt_ddb.h"
48#include "opt_inet.h"
49#include "opt_isa.h"
50#include "opt_kstack_pages.h"
51#include "opt_maxmem.h"
52#include "opt_mp_watchdog.h"
53#include "opt_perfmon.h"
54#include "opt_platform.h"
55#include "opt_sched.h"
56
57#include <sys/param.h>
58#include <sys/proc.h>
59#include <sys/systm.h>
60#include <sys/bio.h>
61#include <sys/buf.h>
62#include <sys/bus.h>
63#include <sys/callout.h>
64#include <sys/cons.h>
65#include <sys/cpu.h>
66#include <sys/efi.h>
67#include <sys/eventhandler.h>
68#include <sys/exec.h>
69#include <sys/imgact.h>
70#include <sys/kdb.h>
71#include <sys/kernel.h>
72#include <sys/ktr.h>
73#include <sys/linker.h>
74#include <sys/lock.h>
75#include <sys/malloc.h>
76#include <sys/memrange.h>
77#include <sys/msgbuf.h>
78#include <sys/mutex.h>
79#include <sys/pcpu.h>
80#include <sys/ptrace.h>
81#include <sys/reboot.h>
82#include <sys/rwlock.h>
83#include <sys/sched.h>
84#include <sys/signalvar.h>
85#ifdef SMP
86#include <sys/smp.h>
87#endif
88#include <sys/syscallsubr.h>
89#include <sys/sysctl.h>
90#include <sys/sysent.h>
91#include <sys/sysproto.h>
92#include <sys/ucontext.h>
93#include <sys/vmmeter.h>
94
95#include <vm/vm.h>
96#include <vm/vm_extern.h>
97#include <vm/vm_kern.h>
98#include <vm/vm_page.h>
99#include <vm/vm_map.h>
100#include <vm/vm_object.h>
101#include <vm/vm_pager.h>
102#include <vm/vm_param.h>
103#include <vm/vm_phys.h>
104
105#ifdef DDB
106#ifndef KDB
107#error KDB must be enabled in order for DDB to work!
108#endif
109#include <ddb/ddb.h>
110#include <ddb/db_sym.h>
111#endif
112
113#include <net/netisr.h>
114
115#include <machine/clock.h>
116#include <machine/cpu.h>
117#include <machine/cputypes.h>
118#include <machine/frame.h>
119#include <machine/intr_machdep.h>
120#include <x86/mca.h>
121#include <machine/md_var.h>
122#include <machine/metadata.h>
123#include <machine/mp_watchdog.h>
124#include <machine/pc/bios.h>
125#include <machine/pcb.h>
126#include <machine/proc.h>
127#include <machine/reg.h>
128#include <machine/sigframe.h>
129#include <machine/specialreg.h>
130#ifdef PERFMON
131#include <machine/perfmon.h>
132#endif
133#include <machine/tss.h>
134#include <x86/ucode.h>
135#ifdef SMP
136#include <machine/smp.h>
137#endif
138#ifdef FDT
139#include <x86/fdt.h>
140#endif
141
142#ifdef DEV_ATPIC
143#include <x86/isa/icu.h>
144#else
145#include <x86/apicvar.h>
146#endif
147
148#include <isa/isareg.h>
149#include <isa/rtc.h>
150#include <x86/init.h>
151
152/* Sanity check for __curthread() */
153CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
154
155/*
156 * The PTI trampoline stack needs enough space for a hardware trapframe and a
157 * couple of scratch registers, as well as the trapframe left behind after an
158 * iret fault.
159 */
160CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) -
161 offsetof(struct pti_frame, pti_rip));
162
163extern u_int64_t hammer_time(u_int64_t, u_int64_t);
164
165#define CS_SECURE(cs) (ISPL(cs) == SEL_UPL)
166#define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
167
168static void cpu_startup(void *);
169static void get_fpcontext(struct thread *td, mcontext_t *mcp,
170 char *xfpusave, size_t xfpusave_len);
171static int set_fpcontext(struct thread *td, mcontext_t *mcp,
172 char *xfpustate, size_t xfpustate_len);
173SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
174
175/* Preload data parse function */
176static caddr_t native_parse_preload_data(u_int64_t);
177
178/* Native function to fetch and parse the e820 map */
179static void native_parse_memmap(caddr_t, vm_paddr_t *, int *);
180
181/* Default init_ops implementation. */
182struct init_ops init_ops = {
183 .parse_preload_data = native_parse_preload_data,
184 .early_clock_source_init = i8254_init,
185 .early_delay = i8254_delay,
186 .parse_memmap = native_parse_memmap,
187#ifdef SMP
188 .mp_bootaddress = mp_bootaddress,
189 .start_all_aps = native_start_all_aps,
190#endif
191 .msi_init = msi_init,
192};
193
194struct msgbuf *msgbufp;
195
196/*
197 * Physical address of the EFI System Table. Stashed from the metadata hints
198 * passed into the kernel and used by the EFI code to call runtime services.
199 */
200vm_paddr_t efi_systbl_phys;
201
202/* Intel ICH registers */
203#define ICH_PMBASE 0x400
204#define ICH_SMI_EN	(ICH_PMBASE + 0x30)
205
206int _udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel;
207
208int cold = 1;
209
210long Maxmem = 0;
211long realmem = 0;
212
213/*
214 * The number of PHYSMAP entries must be one less than the number of
215 * PHYSSEG entries because the PHYSMAP entry that spans the largest
216 * physical address that is accessible by ISA DMA is split into two
217 * PHYSSEG entries.
218 */
219#define PHYSMAP_SIZE (2 * (VM_PHYSSEG_MAX - 1))
220
221vm_paddr_t phys_avail[PHYSMAP_SIZE + 2];
222vm_paddr_t dump_avail[PHYSMAP_SIZE + 2];
223
224/* Must be 2 less so that a terminating 0/0 pair can mark the end of chunks. */
225#define PHYS_AVAIL_ARRAY_END (nitems(phys_avail) - 2)
226#define DUMP_AVAIL_ARRAY_END (nitems(dump_avail) - 2)
227
228struct kva_md_info kmi;
229
230static struct trapframe proc0_tf;
231struct region_descriptor r_gdt, r_idt;
232
233struct pcpu __pcpu[MAXCPU];
234
235struct mtx icu_lock;
236
237struct mem_range_softc mem_range_softc;
238
239struct mtx dt_lock; /* lock for GDT and LDT */
240
241void (*vmm_resume_p)(void);
242
243static void
244cpu_startup(void *dummy)
246{
247 uintmax_t memsize;
248 char *sysenv;
249
250 /*
251 * On MacBooks, we need to disallow the legacy USB circuit to
252 * generate an SMI# because this can cause several problems,
253 * namely: incorrect CPU frequency detection and failure to
254 * start the APs.
255 * We do this by disabling a bit in the SMI_EN (SMI Control and
256 * Enable register) of the Intel ICH LPC Interface Bridge.
257 */
258 sysenv = kern_getenv("smbios.system.product");
259 if (sysenv != NULL) {
260 if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
261 strncmp(sysenv, "MacBook3,1", 10) == 0 ||
262 strncmp(sysenv, "MacBook4,1", 10) == 0 ||
263 strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
264 strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
265 strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
266 strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
267 strncmp(sysenv, "Macmini1,1", 10) == 0) {
268 if (bootverbose)
269 printf("Disabling LEGACY_USB_EN bit on "
270 "Intel ICH.\n");
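   /* Clear bit 3 (LEGACY_USB_EN) of the SMI_EN register. */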
271 outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
272 }
273 freeenv(sysenv);
274 }
275
276 /*
277 * Good {morning,afternoon,evening,night}.
278 */
279 startrtclock();
280 printcpuinfo();
281#ifdef PERFMON
282 perfmon_init();
283#endif
284
285 /*
286 * Display physical memory if SMBIOS reports reasonable amount.
287 */
288 memsize = 0;
289 sysenv = kern_getenv("smbios.memory.enabled");
290 if (sysenv != NULL) {
291 memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
292 freeenv(sysenv);
293 }
294 if (memsize < ptoa((uintmax_t)vm_cnt.v_free_count))
295 memsize = ptoa((uintmax_t)Maxmem);
296 printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20);
297 realmem = atop(memsize);
298
299 /*
300 * Display any holes after the first chunk of extended memory.
301 */
302 if (bootverbose) {
303 int indx;
304
305 printf("Physical memory chunk(s):\n");
306 for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
307 vm_paddr_t size;
308
309 size = phys_avail[indx + 1] - phys_avail[indx];
310 printf(
311 "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
312 (uintmax_t)phys_avail[indx],
313 (uintmax_t)phys_avail[indx + 1] - 1,
314 (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
315 }
316 }
317
318 vm_ksubmap_init(&kmi);
319
320 printf("avail memory = %ju (%ju MB)\n",
321 ptoa((uintmax_t)vm_cnt.v_free_count),
322 ptoa((uintmax_t)vm_cnt.v_free_count) / 1048576);
323
324 /*
325 * Set up buffers, so they can be used to read disk labels.
326 */
327 bufinit();
328 vm_pager_bufferinit();
329
330 cpu_setregs();
331}
332
333/*
334 * Send a signal to the process.
335 *
336 * The stack is set up so that the signal trampoline (sigcode) at the
337 * top can call the handler, followed by a call to the sigreturn
338 * routine below.  After sigreturn restores the signal mask, the
339 * stack and the frame pointer, it returns to the user-specified pc
340 * and psl.
342 */
343void
344sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
345{
346 struct sigframe sf, *sfp;
347 struct pcb *pcb;
348 struct proc *p;
349 struct thread *td;
350 struct sigacts *psp;
351 char *sp;
352 struct trapframe *regs;
353 char *xfpusave;
354 size_t xfpusave_len;
355 int sig;
356 int oonstack;
357
358 td = curthread;
359 pcb = td->td_pcb;
360 p = td->td_proc;
361 PROC_LOCK_ASSERT(p, MA_OWNED);
362 sig = ksi->ksi_signo;
363 psp = p->p_sigacts;
364 mtx_assert(&psp->ps_mtx, MA_OWNED);
365 regs = td->td_frame;
366 oonstack = sigonstack(regs->tf_rsp);
367
368 if (cpu_max_ext_state_size > sizeof(struct savefpu) && use_xsave) {
369 xfpusave_len = cpu_max_ext_state_size - sizeof(struct savefpu);
370 xfpusave = __builtin_alloca(xfpusave_len);
371 } else {
372 xfpusave_len = 0;
373 xfpusave = NULL;
374 }
375
376 /* Save user context. */
377 bzero(&sf, sizeof(sf));
378 sf.sf_uc.uc_sigmask = *mask;
379 sf.sf_uc.uc_stack = td->td_sigstk;
380 sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
381 ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
382 sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
383 bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(*regs));
384 sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
385 get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len);
386 fpstate_drop(td);
387 update_pcb_bases(pcb);
388 sf.sf_uc.uc_mcontext.mc_fsbase = pcb->pcb_fsbase;
389 sf.sf_uc.uc_mcontext.mc_gsbase = pcb->pcb_gsbase;
390 bzero(sf.sf_uc.uc_mcontext.mc_spare,
391 sizeof(sf.sf_uc.uc_mcontext.mc_spare));
392 bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__));
393
394 /* Allocate space for the signal handler context. */
395 if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
396 SIGISMEMBER(psp->ps_sigonstack, sig)) {
397 sp = (char *)td->td_sigstk.ss_sp + td->td_sigstk.ss_size;
398#if defined(COMPAT_43)
399 td->td_sigstk.ss_flags |= SS_ONSTACK;
400#endif
401 } else
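  /*
   * Step over the 128-byte red zone that the amd64 ABI allows
   * user code to use below %rsp.
   */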
402 sp = (char *)regs->tf_rsp - 128;
403 if (xfpusave != NULL) {
404 sp -= xfpusave_len;
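  /*
   * Align the extended FPU state save area to a 64-byte
   * boundary, as required for XSAVE.
   */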
405 sp = (char *)((unsigned long)sp & ~0x3Ful);
406 sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp;
407 }
408 sp -= sizeof(struct sigframe);
409 /* Align to 16 bytes. */
410 sfp = (struct sigframe *)((unsigned long)sp & ~0xFul);
411
412 /* Build the argument list for the signal handler. */
413 regs->tf_rdi = sig; /* arg 1 in %rdi */
414 regs->tf_rdx = (register_t)&sfp->sf_uc; /* arg 3 in %rdx */
415 bzero(&sf.sf_si, sizeof(sf.sf_si));
416 if (SIGISMEMBER(psp->ps_siginfo, sig)) {
417 /* Signal handler installed with SA_SIGINFO. */
418 regs->tf_rsi = (register_t)&sfp->sf_si; /* arg 2 in %rsi */
419 sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;
420
421 /* Fill in POSIX parts */
422 sf.sf_si = ksi->ksi_info;
423 sf.sf_si.si_signo = sig; /* maybe a translated signal */
424 regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
425 } else {
426 /* Old FreeBSD-style arguments. */
427 regs->tf_rsi = ksi->ksi_code; /* arg 2 in %rsi */
428 regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
429 sf.sf_ahu.sf_handler = catcher;
430 }
431 mtx_unlock(&psp->ps_mtx);
432 PROC_UNLOCK(p);
433
434 /*
435 * Copy the sigframe out to the user's stack.
436 */
437 if (copyout(&sf, sfp, sizeof(*sfp)) != 0 ||
438 (xfpusave != NULL && copyout(xfpusave,
439 (void *)sf.sf_uc.uc_mcontext.mc_xfpustate, xfpusave_len)
440 != 0)) {
441#ifdef DEBUG
442 printf("process %ld has trashed its stack\n", (long)p->p_pid);
443#endif
444 PROC_LOCK(p);
445 sigexit(td, SIGILL);
446 }
447
448 regs->tf_rsp = (long)sfp;
449 regs->tf_rip = p->p_sysent->sv_sigcode_base;
450 regs->tf_rflags &= ~(PSL_T | PSL_D);
451 regs->tf_cs = _ucodesel;
452 regs->tf_ds = _udatasel;
453 regs->tf_ss = _udatasel;
454 regs->tf_es = _udatasel;
455 regs->tf_fs = _ufssel;
456 regs->tf_gs = _ugssel;
457 regs->tf_flags = TF_HASSEGS;
458 PROC_LOCK(p);
459 mtx_lock(&psp->ps_mtx);
460}
461
462/*
463 * System call to clean up state after a signal
464 * has been taken.  Restores the signal mask and stack state from the
465 * context left by sendsig (above) and returns to the pc and psl
466 * recorded there.  Check carefully to make sure that the user has
467 * not modified the state to gain improper privileges.
470 *
471 * MPSAFE
472 */
473int
474sys_sigreturn(struct thread *td, struct sigreturn_args *uap)
479{
480 ucontext_t uc;
481 struct pcb *pcb;
482 struct proc *p;
483 struct trapframe *regs;
484 ucontext_t *ucp;
485 char *xfpustate;
486 size_t xfpustate_len;
487 long rflags;
488 int cs, error, ret;
489 ksiginfo_t ksi;
490
491 pcb = td->td_pcb;
492 p = td->td_proc;
493
494 error = copyin(uap->sigcntxp, &uc, sizeof(uc));
495 if (error != 0) {
496 uprintf("pid %d (%s): sigreturn copyin failed\n",
497 p->p_pid, td->td_name);
498 return (error);
499 }
500 ucp = &uc;
501 if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) {
502 uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid,
503 td->td_name, ucp->uc_mcontext.mc_flags);
504 return (EINVAL);
505 }
506 regs = td->td_frame;
507 rflags = ucp->uc_mcontext.mc_rflags;
508 /*
509 * Don't allow users to change privileged or reserved flags.
510 */
511 if (!EFL_SECURE(rflags, regs->tf_rflags)) {
512 uprintf("pid %d (%s): sigreturn rflags = 0x%lx\n", p->p_pid,
513 td->td_name, rflags);
514 return (EINVAL);
515 }
516
517 /*
518 * Don't allow users to load a valid privileged %cs. Let the
519 * hardware check for invalid selectors, excess privilege in
520 * other selectors, invalid %eip's and invalid %esp's.
521 */
522 cs = ucp->uc_mcontext.mc_cs;
523 if (!CS_SECURE(cs)) {
524 uprintf("pid %d (%s): sigreturn cs = 0x%x\n", p->p_pid,
525 td->td_name, cs);
526 ksiginfo_init_trap(&ksi);
527 ksi.ksi_signo = SIGBUS;
528 ksi.ksi_code = BUS_OBJERR;
529 ksi.ksi_trapno = T_PROTFLT;
530 ksi.ksi_addr = (void *)regs->tf_rip;
531 trapsignal(td, &ksi);
532 return (EINVAL);
533 }
534
535 if ((uc.uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) {
536 xfpustate_len = uc.uc_mcontext.mc_xfpustate_len;
537 if (xfpustate_len > cpu_max_ext_state_size -
538 sizeof(struct savefpu)) {
539 uprintf("pid %d (%s): sigreturn xfpusave_len = 0x%zx\n",
540 p->p_pid, td->td_name, xfpustate_len);
541 return (EINVAL);
542 }
543 xfpustate = __builtin_alloca(xfpustate_len);
544 error = copyin((const void *)uc.uc_mcontext.mc_xfpustate,
545 xfpustate, xfpustate_len);
546 if (error != 0) {
547 uprintf(
548 "pid %d (%s): sigreturn copying xfpustate failed\n",
549 p->p_pid, td->td_name);
550 return (error);
551 }
552 } else {
553 xfpustate = NULL;
554 xfpustate_len = 0;
555 }
556 ret = set_fpcontext(td, &ucp->uc_mcontext, xfpustate, xfpustate_len);
557 if (ret != 0) {
558 uprintf("pid %d (%s): sigreturn set_fpcontext err %d\n",
559 p->p_pid, td->td_name, ret);
560 return (ret);
561 }
562 bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(*regs));
563 update_pcb_bases(pcb);
564 pcb->pcb_fsbase = ucp->uc_mcontext.mc_fsbase;
565 pcb->pcb_gsbase = ucp->uc_mcontext.mc_gsbase;
566
567#if defined(COMPAT_43)
568 if (ucp->uc_mcontext.mc_onstack & 1)
569 td->td_sigstk.ss_flags |= SS_ONSTACK;
570 else
571 td->td_sigstk.ss_flags &= ~SS_ONSTACK;
572#endif
573
574 kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
575 return (EJUSTRETURN);
576}
577
578#ifdef COMPAT_FREEBSD4
579int
580freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap)
581{
582
583 return (sys_sigreturn(td, (struct sigreturn_args *)uap));
584}
585#endif
586
587/*
588 * Reset registers to default values on exec.
589 */
590void
591exec_setregs(struct thread *td, struct image_params *imgp, u_long stack)
592{
593 struct trapframe *regs;
594 struct pcb *pcb;
595 register_t saved_rflags;
596
597 regs = td->td_frame;
598 pcb = td->td_pcb;
599
600 mtx_lock(&dt_lock);
601 if (td->td_proc->p_md.md_ldt != NULL)
602 user_ldt_free(td);
603 else
604 mtx_unlock(&dt_lock);
605
606 update_pcb_bases(pcb);
607 pcb->pcb_fsbase = 0;
608 pcb->pcb_gsbase = 0;
609 clear_pcb_flags(pcb, PCB_32BIT);
610 pcb->pcb_initial_fpucw = __INITIAL_FPUCW__;
611
612 saved_rflags = regs->tf_rflags & PSL_T;
613 bzero((char *)regs, sizeof(struct trapframe));
614 regs->tf_rip = imgp->entry_addr;
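 /*
  * Align the stack so that %rsp % 16 == 8, the alignment a function
  * sees immediately after a call instruction, which is what compiled
  * prologues expect.
  */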
615 regs->tf_rsp = ((stack - 8) & ~0xFul) + 8;
616 regs->tf_rdi = stack; /* argv */
617 regs->tf_rflags = PSL_USER | saved_rflags;
618 regs->tf_ss = _udatasel;
619 regs->tf_cs = _ucodesel;
620 regs->tf_ds = _udatasel;
621 regs->tf_es = _udatasel;
622 regs->tf_fs = _ufssel;
623 regs->tf_gs = _ugssel;
624 regs->tf_flags = TF_HASSEGS;
625 td->td_retval[1] = 0;
626
627 /*
628 * Reset the hardware debug registers if they were in use.
629 * They won't have any meaning for the newly exec'd process.
630 */
631 if (pcb->pcb_flags & PCB_DBREGS) {
632 pcb->pcb_dr0 = 0;
633 pcb->pcb_dr1 = 0;
634 pcb->pcb_dr2 = 0;
635 pcb->pcb_dr3 = 0;
636 pcb->pcb_dr6 = 0;
637 pcb->pcb_dr7 = 0;
638 if (pcb == curpcb) {
639 /*
640 * Clear the debug registers on the running
641 * CPU, otherwise they will end up affecting
642 * the next process we switch to.
643 */
644 reset_dbregs();
645 }
646 clear_pcb_flags(pcb, PCB_DBREGS);
647 }
648
649 /*
650 * Drop the FP state if we hold it, so that the process gets a
651 * clean FP state if it uses the FPU again.
652 */
653 fpstate_drop(td);
654}
655
656void
657cpu_setregs(void)
658{
659 register_t cr0;
660
661 cr0 = rcr0();
662 /*
663 * CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the
664 * BSP. See the comments there about why we set them.
665 */
666 cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
667 load_cr0(cr0);
668}
669
670/*
671 * Initialize amd64 and configure to run kernel
672 */
673
674/*
675 * Initialize segments & interrupt table
676 */
677
678struct user_segment_descriptor gdt[NGDT * MAXCPU];/* global descriptor tables */
679static struct gate_descriptor idt0[NIDT];
680struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */
681
682static char dblfault_stack[PAGE_SIZE] __aligned(16);
683static char mce0_stack[PAGE_SIZE] __aligned(16);
684static char nmi0_stack[PAGE_SIZE] __aligned(16);
685static char dbg0_stack[PAGE_SIZE] __aligned(16);
686CTASSERT(sizeof(struct nmi_pcpu) == 16);
687
688struct amd64tss common_tss[MAXCPU];
689
690/*
691 * Software prototypes -- in more palatable form.
692 *
693 * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
694 * slots as corresponding segments for i386 kernel.
695 */
696struct soft_segment_descriptor gdt_segs[] = {
697/* GNULL_SEL 0 Null Descriptor */
698{ .ssd_base = 0x0,
699 .ssd_limit = 0x0,
700 .ssd_type = 0,
701 .ssd_dpl = 0,
702 .ssd_p = 0,
703 .ssd_long = 0,
704 .ssd_def32 = 0,
705 .ssd_gran = 0 },
706/* GNULL2_SEL 1 Null Descriptor */
707{ .ssd_base = 0x0,
708 .ssd_limit = 0x0,
709 .ssd_type = 0,
710 .ssd_dpl = 0,
711 .ssd_p = 0,
712 .ssd_long = 0,
713 .ssd_def32 = 0,
714 .ssd_gran = 0 },
715/* GUFS32_SEL	2 32 bit %fs Descriptor for user */
716{ .ssd_base = 0x0,
717 .ssd_limit = 0xfffff,
718 .ssd_type = SDT_MEMRWA,
719 .ssd_dpl = SEL_UPL,
720 .ssd_p = 1,
721 .ssd_long = 0,
722 .ssd_def32 = 1,
723 .ssd_gran = 1 },
724/* GUGS32_SEL	3 32 bit %gs Descriptor for user */
725{ .ssd_base = 0x0,
726 .ssd_limit = 0xfffff,
727 .ssd_type = SDT_MEMRWA,
728 .ssd_dpl = SEL_UPL,
729 .ssd_p = 1,
730 .ssd_long = 0,
731 .ssd_def32 = 1,
732 .ssd_gran = 1 },
733/* GCODE_SEL 4 Code Descriptor for kernel */
734{ .ssd_base = 0x0,
735 .ssd_limit = 0xfffff,
736 .ssd_type = SDT_MEMERA,
737 .ssd_dpl = SEL_KPL,
738 .ssd_p = 1,
739 .ssd_long = 1,
740 .ssd_def32 = 0,
741 .ssd_gran = 1 },
742/* GDATA_SEL 5 Data Descriptor for kernel */
743{ .ssd_base = 0x0,
744 .ssd_limit = 0xfffff,
745 .ssd_type = SDT_MEMRWA,
746 .ssd_dpl = SEL_KPL,
747 .ssd_p = 1,
748 .ssd_long = 1,
749 .ssd_def32 = 0,
750 .ssd_gran = 1 },
751/* GUCODE32_SEL 6 32 bit Code Descriptor for user */
752{ .ssd_base = 0x0,
753 .ssd_limit = 0xfffff,
754 .ssd_type = SDT_MEMERA,
755 .ssd_dpl = SEL_UPL,
756 .ssd_p = 1,
757 .ssd_long = 0,
758 .ssd_def32 = 1,
759 .ssd_gran = 1 },
760/* GUDATA_SEL 7 32/64 bit Data Descriptor for user */
761{ .ssd_base = 0x0,
762 .ssd_limit = 0xfffff,
763 .ssd_type = SDT_MEMRWA,
764 .ssd_dpl = SEL_UPL,
765 .ssd_p = 1,
766 .ssd_long = 0,
767 .ssd_def32 = 1,
768 .ssd_gran = 1 },
769/* GUCODE_SEL 8 64 bit Code Descriptor for user */
770{ .ssd_base = 0x0,
771 .ssd_limit = 0xfffff,
772 .ssd_type = SDT_MEMERA,
773 .ssd_dpl = SEL_UPL,
774 .ssd_p = 1,
775 .ssd_long = 1,
776 .ssd_def32 = 0,
777 .ssd_gran = 1 },
778/* GPROC0_SEL 9 Proc 0 Tss Descriptor */
779{ .ssd_base = 0x0,
780 .ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
781 .ssd_type = SDT_SYSTSS,
782 .ssd_dpl = SEL_KPL,
783 .ssd_p = 1,
784 .ssd_long = 0,
785 .ssd_def32 = 0,
786 .ssd_gran = 0 },
787/* Actually, the TSS is a system descriptor which is double size */
788{ .ssd_base = 0x0,
789 .ssd_limit = 0x0,
790 .ssd_type = 0,
791 .ssd_dpl = 0,
792 .ssd_p = 0,
793 .ssd_long = 0,
794 .ssd_def32 = 0,
795 .ssd_gran = 0 },
796/* GUSERLDT_SEL 11 LDT Descriptor */
797{ .ssd_base = 0x0,
798 .ssd_limit = 0x0,
799 .ssd_type = 0,
800 .ssd_dpl = 0,
801 .ssd_p = 0,
802 .ssd_long = 0,
803 .ssd_def32 = 0,
804 .ssd_gran = 0 },
805/* GUSERLDT_SEL 12 LDT Descriptor, double size */
806{ .ssd_base = 0x0,
807 .ssd_limit = 0x0,
808 .ssd_type = 0,
809 .ssd_dpl = 0,
810 .ssd_p = 0,
811 .ssd_long = 0,
812 .ssd_def32 = 0,
813 .ssd_gran = 0 },
814};
815
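/*
 * Fill in IDT slot 'idx' with an interrupt gate for handler 'func',
 * using gate type 'typ', privilege level 'dpl' and interrupt stack
 * table index 'ist'.
 */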
816void
817setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
818{
819 struct gate_descriptor *ip;
820
821 ip = idt + idx;
822 ip->gd_looffset = (uintptr_t)func;
823 ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
824 ip->gd_ist = ist;
825 ip->gd_xx = 0;
826 ip->gd_type = typ;
827 ip->gd_dpl = dpl;
828 ip->gd_p = 1;
829 ip->gd_hioffset = ((uintptr_t)func) >> 16;
830}
831
832extern inthand_t
833 IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
834 IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
835 IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
836 IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
837 IDTVEC(xmm), IDTVEC(dblfault),
838 IDTVEC(div_pti), IDTVEC(bpt_pti),
839 IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
840 IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
841 IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
842 IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
843 IDTVEC(xmm_pti),
844#ifdef KDTRACE_HOOKS
845 IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
846#endif
847#ifdef XENHVM
848 IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
849#endif
850 IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
851 IDTVEC(fast_syscall_pti);
852
853#ifdef DDB
854/*
855 * Display the index and function name of any IDT entries that don't use
856 * the default 'rsvd' entry point.
857 */
858DB_SHOW_COMMAND(idt, db_show_idt)
859{
860 struct gate_descriptor *ip;
861 int idx;
862 uintptr_t func;
863
864 ip = idt;
865 for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
866 func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
867 if (func != (uintptr_t)&IDTVEC(rsvd)) {
868 db_printf("%3d\t", idx);
869 db_printsym(func, DB_STGY_PROC);
870 db_printf("\n");
871 }
872 ip++;
873 }
874}
875
876/* Show privileged registers. */
877DB_SHOW_COMMAND(sysregs, db_show_sysregs)
878{
879 struct {
880 uint16_t limit;
881 uint64_t base;
882 } __packed idtr, gdtr;
883 uint16_t ldt, tr;
884
885 __asm __volatile("sidt %0" : "=m" (idtr));
886 db_printf("idtr\t0x%016lx/%04x\n",
887 (u_long)idtr.base, (u_int)idtr.limit);
888 __asm __volatile("sgdt %0" : "=m" (gdtr));
889 db_printf("gdtr\t0x%016lx/%04x\n",
890 (u_long)gdtr.base, (u_int)gdtr.limit);
891 __asm __volatile("sldt %0" : "=r" (ldt));
892 db_printf("ldtr\t0x%04x\n", ldt);
893 __asm __volatile("str %0" : "=r" (tr));
894 db_printf("tr\t0x%04x\n", tr);
895 db_printf("cr0\t0x%016lx\n", rcr0());
896 db_printf("cr2\t0x%016lx\n", rcr2());
897 db_printf("cr3\t0x%016lx\n", rcr3());
898 db_printf("cr4\t0x%016lx\n", rcr4());
899 if (rcr4() & CR4_XSAVE)
900 db_printf("xcr0\t0x%016lx\n", rxcr(0));
901 db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER));
902 if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
903 db_printf("FEATURES_CTL\t%016lx\n",
904 rdmsr(MSR_IA32_FEATURE_CONTROL));
905 db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR));
906 db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT));
907 db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE));
908}
909
910DB_SHOW_COMMAND(dbregs, db_show_dbregs)
911{
912
913 db_printf("dr0\t0x%016lx\n", rdr0());
914 db_printf("dr1\t0x%016lx\n", rdr1());
915 db_printf("dr2\t0x%016lx\n", rdr2());
916 db_printf("dr3\t0x%016lx\n", rdr3());
917 db_printf("dr6\t0x%016lx\n", rdr6());
918 db_printf("dr7\t0x%016lx\n", rdr7());
919}
920#endif
921
922void
923sdtossd(struct user_segment_descriptor *sd,
924    struct soft_segment_descriptor *ssd)
926{
927
928 ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase;
929 ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
930 ssd->ssd_type = sd->sd_type;
931 ssd->ssd_dpl = sd->sd_dpl;
932 ssd->ssd_p = sd->sd_p;
933 ssd->ssd_long = sd->sd_long;
934 ssd->ssd_def32 = sd->sd_def32;
935 ssd->ssd_gran = sd->sd_gran;
936}
937
938void
939ssdtosd(struct soft_segment_descriptor *ssd,
940    struct user_segment_descriptor *sd)
942{
943
944 sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
945 sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
946 sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
947 sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
948 sd->sd_type = ssd->ssd_type;
949 sd->sd_dpl = ssd->ssd_dpl;
950 sd->sd_p = ssd->ssd_p;
951 sd->sd_long = ssd->ssd_long;
952 sd->sd_def32 = ssd->ssd_def32;
953 sd->sd_gran = ssd->ssd_gran;
954}
955
956void
957ssdtosyssd(struct soft_segment_descriptor *ssd,
958    struct system_segment_descriptor *sd)
960{
961
962 sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
963 sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
964 sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
965 sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
966 sd->sd_type = ssd->ssd_type;
967 sd->sd_dpl = ssd->ssd_dpl;
968 sd->sd_p = ssd->ssd_p;
969 sd->sd_gran = ssd->ssd_gran;
970}
971
972#if !defined(DEV_ATPIC) && defined(DEV_ISA)
973#include <isa/isavar.h>
974#include <isa/isareg.h>
975/*
976 * Return a bitmap of the current interrupt requests. This is 8259-specific
977 * and is only suitable for use at probe time.
978 * This is only here to pacify sio. It is NOT FATAL if this doesn't work.
979 * It shouldn't be here.  There should probably be an APIC-centric
980 * implementation in the apic driver code, if at all.
981 */
982intrmask_t
983isa_irq_pending(void)
984{
985 u_char irr1;
986 u_char irr2;
987
988 irr1 = inb(IO_ICU1);
989 irr2 = inb(IO_ICU2);
990 return ((irr2 << 8) | irr1);
991}
992#endif
993
994u_int basemem;
995
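/*
 * Insert the range [base, base + length) into the physmap array of
 * (start, end) address pairs, keeping the array sorted and merging
 * with an adjacent entry when possible.  Returns 0 if the array is
 * full, 1 otherwise.
 */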
996static int
997add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
998 int *physmap_idxp)
999{
1000 int i, insert_idx, physmap_idx;
1001
1002 physmap_idx = *physmap_idxp;
1003
1004 if (length == 0)
1005 return (1);
1006
1007 /*
1008 * Find insertion point while checking for overlap. Start off by
1009 * assuming the new entry will be added to the end.
1010 *
1011 * NB: physmap_idx points to the next free slot.
1012 */
1013 insert_idx = physmap_idx;
1014 for (i = 0; i <= physmap_idx; i += 2) {
1015 if (base < physmap[i + 1]) {
1016 if (base + length <= physmap[i]) {
1017 insert_idx = i;
1018 break;
1019 }
1020 if (boothowto & RB_VERBOSE)
1021 printf(
1022 "Overlapping memory regions, ignoring second region\n");
1023 return (1);
1024 }
1025 }
1026
1027 /* See if we can prepend to the next entry. */
1028 if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) {
1029 physmap[insert_idx] = base;
1030 return (1);
1031 }
1032
1033 /* See if we can append to the previous entry. */
1034 if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
1035 physmap[insert_idx - 1] += length;
1036 return (1);
1037 }
1038
1039 physmap_idx += 2;
1040 *physmap_idxp = physmap_idx;
1041 if (physmap_idx == PHYSMAP_SIZE) {
1042 printf(
1043 "Too many segments in the physical address map, giving up\n");
1044 return (0);
1045 }
1046
1047 /*
1048 * Move the last 'N' entries down to make room for the new
1049 * entry if needed.
1050 */
1051 for (i = (physmap_idx - 2); i > insert_idx; i -= 2) {
1052 physmap[i] = physmap[i - 2];
1053 physmap[i + 1] = physmap[i - 1];
1054 }
1055
1056 /* Insert the new entry. */
1057 physmap[insert_idx] = base;
1058 physmap[insert_idx + 1] = base + length;
1059 return (1);
1060}
1061
1062void
1063bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize,
1064 vm_paddr_t *physmap, int *physmap_idx)
1065{
1066 struct bios_smap *smap, *smapend;
1067
1068 smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
1069
1070 for (smap = smapbase; smap < smapend; smap++) {
1071 if (boothowto & RB_VERBOSE)
1072 printf("SMAP type=%02x base=%016lx len=%016lx\n",
1073 smap->type, smap->base, smap->length);
1074
1075 if (smap->type != SMAP_TYPE_MEMORY)
1076 continue;
1077
1078 if (!add_physmap_entry(smap->base, smap->length, physmap,
1079 physmap_idx))
1080 break;
1081 }
1082}
1083
1084static void
1085add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
1086 int *physmap_idx)
1087{
1088 struct efi_md *map, *p;
1089 const char *type;
1090 size_t efisz;
1091 int ndesc, i;
1092
1093 static const char *types[] = {
1094 "Reserved",
1095 "LoaderCode",
1096 "LoaderData",
1097 "BootServicesCode",
1098 "BootServicesData",
1099 "RuntimeServicesCode",
1100 "RuntimeServicesData",
1101 "ConventionalMemory",
1102 "UnusableMemory",
1103 "ACPIReclaimMemory",
1104 "ACPIMemoryNVS",
1105 "MemoryMappedIO",
1106 "MemoryMappedIOPortSpace",
1107 "PalCode",
1108 "PersistentMemory"
1109 };
1110
1111 /*
1112 * Memory map data provided by UEFI via the GetMemoryMap
1113 * Boot Services API.
1114 */
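 /* The descriptors follow the header, padded to a 16-byte boundary. */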
1115 efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
1116 map = (struct efi_md *)((uint8_t *)efihdr + efisz);
1117
1118 if (efihdr->descriptor_size == 0)
1119 return;
1120 ndesc = efihdr->memory_size / efihdr->descriptor_size;
1121
1122 if (boothowto & RB_VERBOSE)
1123 printf("%23s %12s %12s %8s %4s\n",
1124 "Type", "Physical", "Virtual", "#Pages", "Attr");
1125
1126 for (i = 0, p = map; i < ndesc; i++,
1127 p = efi_next_descriptor(p, efihdr->descriptor_size)) {
1128 if (boothowto & RB_VERBOSE) {
1129 if (p->md_type < nitems(types))
1130 type = types[p->md_type];
1131 else
1132 type = "<INVALID>";
1133 printf("%23s %012lx %12p %08lx ", type, p->md_phys,
1134 p->md_virt, p->md_pages);
1135 if (p->md_attr & EFI_MD_ATTR_UC)
1136 printf("UC ");
1137 if (p->md_attr & EFI_MD_ATTR_WC)
1138 printf("WC ");
1139 if (p->md_attr & EFI_MD_ATTR_WT)
1140 printf("WT ");
1141 if (p->md_attr & EFI_MD_ATTR_WB)
1142 printf("WB ");
1143 if (p->md_attr & EFI_MD_ATTR_UCE)
1144 printf("UCE ");
1145 if (p->md_attr & EFI_MD_ATTR_WP)
1146 printf("WP ");
1147 if (p->md_attr & EFI_MD_ATTR_RP)
1148 printf("RP ");
1149 if (p->md_attr & EFI_MD_ATTR_XP)
1150 printf("XP ");
1151 if (p->md_attr & EFI_MD_ATTR_NV)
1152 printf("NV ");
1153 if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE)
1154 printf("MORE_RELIABLE ");
1155 if (p->md_attr & EFI_MD_ATTR_RO)
1156 printf("RO ");
1157 if (p->md_attr & EFI_MD_ATTR_RT)
1158 printf("RUNTIME");
1159 printf("\n");
1160 }
1161
1162 switch (p->md_type) {
1163 case EFI_MD_TYPE_CODE:
1164 case EFI_MD_TYPE_DATA:
1165 case EFI_MD_TYPE_BS_CODE:
1166 case EFI_MD_TYPE_BS_DATA:
1167 case EFI_MD_TYPE_FREE:
1168 /*
1169 * We're allowed to use any entry with these types.
1170 */
1171 break;
1172 default:
1173 continue;
1174 }
1175
1176 if (!add_physmap_entry(p->md_phys, (p->md_pages * PAGE_SIZE),
1177 physmap, physmap_idx))
1178 break;
1179 }
1180}
1181
1182static char bootmethod[16] = "";
1183SYSCTL_STRING(_machdep, OID_AUTO, bootmethod, CTLFLAG_RD, bootmethod, 0,
1184 "System firmware boot method");
1185
1186static void
1187native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx)
1188{
1189 struct bios_smap *smap;
1190 struct efi_map_header *efihdr;
1191 u_int32_t size;
1192
1193 /*
1194 * Memory map from INT 15:E820.
1195 *
1196 * subr_module.c says:
1197 * "Consumer may safely assume that size value precedes data."
1198 * ie: an int32_t immediately precedes smap.
1199 */
1200
1201 efihdr = (struct efi_map_header *)preload_search_info(kmdp,
1202 MODINFO_METADATA | MODINFOMD_EFI_MAP);
1203 smap = (struct bios_smap *)preload_search_info(kmdp,
1204 MODINFO_METADATA | MODINFOMD_SMAP);
1205 if (efihdr == NULL && smap == NULL)
1206 panic("No BIOS smap or EFI map info from loader!");
1207
1208 if (efihdr != NULL) {
1209 add_efi_map_entries(efihdr, physmap, physmap_idx);
1210 strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
1211 } else {
1212 size = *((u_int32_t *)smap - 1);
1213 bios_add_smap_entries(smap, size, physmap, physmap_idx);
1214 strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
1215 }
1216}
1217
1218#define PAGES_PER_GB (1024 * 1024 * 1024 / PAGE_SIZE)
1219
1220/*
1221 * Populate the (physmap) array with base/bound pairs describing the
1222 * available physical memory in the system, then test this memory and
1223 * build the phys_avail array describing the actually-available memory.
1224 *
1225 * Total memory size may be set by the kernel environment variable
1226 * hw.physmem or the compile-time define MAXMEM.
1227 *
1228 * XXX first should be vm_paddr_t.
1229 */
1230static void
1231getmemsize(caddr_t kmdp, u_int64_t first)
1232{
1233 int i, physmap_idx, pa_indx, da_indx;
1234 vm_paddr_t pa, physmap[PHYSMAP_SIZE];
1235 u_long physmem_start, physmem_tunable, memtest;
1236 pt_entry_t *pte;
1237 quad_t dcons_addr, dcons_size;
1238 int page_counter;
1239
1240 /*
1241 * Tell the physical memory allocator about pages used to store
1242 * the kernel and preloaded data. See kmem_bootstrap_free().
1243 */
1244 vm_phys_add_seg((vm_paddr_t)kernphys, trunc_page(first));
1245
1246 bzero(physmap, sizeof(physmap));
1247 physmap_idx = 0;
1248
1249 init_ops.parse_memmap(kmdp, physmap, &physmap_idx);
1250 physmap_idx -= 2;
1251
1252 /*
1253 * Find the 'base memory' segment for SMP
1254 */
1255 basemem = 0;
1256 for (i = 0; i <= physmap_idx; i += 2) {
1257 if (physmap[i] <= 0xA0000) {
1258 basemem = physmap[i + 1] / 1024;
1259 break;
1260 }
1261 }
1262 if (basemem == 0 || basemem > 640) {
1263 if (bootverbose)
1264 printf(
1265	"Memory map doesn't contain a basemem segment, faking it\n");
1266 basemem = 640;
1267 }
1268
1269 /*
1270	 * Make hole for "AP -> long mode" bootstrap code.  The
1271	 * mp_bootaddress vector is only available when the kernel
1272	 * is configured to support APs, and the APs start in 32-bit
1273	 * mode (e.g. SMP bare metal).
1274 */
1275 if (init_ops.mp_bootaddress) {
1276 if (physmap[1] >= 0x100000000)
1277 panic(
1278 "Basemem segment is not suitable for AP bootstrap code!");
1279 physmap[1] = init_ops.mp_bootaddress(physmap[1] / 1024);
1280 }
1281
1282 /*
1283 * Maxmem isn't the "maximum memory", it's one larger than the
1284 * highest page of the physical address space. It should be
1285 * called something like "Maxphyspage". We may adjust this
1286 * based on ``hw.physmem'' and the results of the memory test.
1287 */
1288 Maxmem = atop(physmap[physmap_idx + 1]);
1289
1290#ifdef MAXMEM
1291 Maxmem = MAXMEM / 4;
1292#endif
1293
1294 if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
1295 Maxmem = atop(physmem_tunable);
1296
1297 /*
1298 * The boot memory test is disabled by default, as it takes a
1299 * significant amount of time on large-memory systems, and is
1300 * unfriendly to virtual machines as it unnecessarily touches all
1301 * pages.
1302 *
1303 * A general name is used as the code may be extended to support
1304 * additional tests beyond the current "page present" test.
1305 */
1306 memtest = 0;
1307 TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);
1308
1309 /*
1310 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
1311 * in the system.
1312 */
1313 if (Maxmem > atop(physmap[physmap_idx + 1]))
1314 Maxmem = atop(physmap[physmap_idx + 1]);
1315
1316 if (atop(physmap[physmap_idx + 1]) != Maxmem &&
1317 (boothowto & RB_VERBOSE))
1318 printf("Physical memory use set to %ldK\n", Maxmem * 4);
1319
1320 /* call pmap initialization to make new kernel address space */
1321 pmap_bootstrap(&first);
1322
1323 /*
1324 * Size up each available chunk of physical memory.
1325 *
1326 * XXX Some BIOSes corrupt low 64KB between suspend and resume.
1327 * By default, mask off the first 16 pages unless we appear to be
1328 * running in a VM.
1329 */
1330 physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
1331 TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
1332 if (physmap[0] < physmem_start) {
1333 if (physmem_start < PAGE_SIZE)
1334 physmap[0] = PAGE_SIZE;
1335 else if (physmem_start >= physmap[1])
1336 physmap[0] = round_page(physmap[1] - PAGE_SIZE);
1337 else
1338 physmap[0] = round_page(physmem_start);
1339 }
1340 pa_indx = 0;
1341 da_indx = 1;
1342 phys_avail[pa_indx++] = physmap[0];
1343 phys_avail[pa_indx] = physmap[0];
1344 dump_avail[da_indx] = physmap[0];
1345 pte = CMAP1;
1346
1347 /*
1348 * Get dcons buffer address
1349 */
1350 if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
1351 getenv_quad("dcons.size", &dcons_size) == 0)
1352 dcons_addr = 0;
1353
1354 /*
1355 * physmap is in bytes, so when converting to page boundaries,
1356 * round up the start address and round down the end address.
1357 */
1358 page_counter = 0;
1359 if (memtest != 0)
1360 printf("Testing system memory");
1361 for (i = 0; i <= physmap_idx; i += 2) {
1362 vm_paddr_t end;
1363
1364 end = ptoa((vm_paddr_t)Maxmem);
1365 if (physmap[i + 1] < end)
1366 end = trunc_page(physmap[i + 1]);
1367 for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
1368 int tmp, page_bad, full;
1369 int *ptr = (int *)CADDR1;
1370
1371 full = FALSE;
1372 /*
1373 * block out kernel memory as not available.
1374 */
1375 if (pa >= (vm_paddr_t)kernphys && pa < first)
1376 goto do_dump_avail;
1377
1378 /*
1379 * block out dcons buffer
1380 */
1381 if (dcons_addr > 0
1382 && pa >= trunc_page(dcons_addr)
1383 && pa < dcons_addr + dcons_size)
1384 goto do_dump_avail;
1385
1386 page_bad = FALSE;
1387 if (memtest == 0)
1388 goto skip_memtest;
1389
1390 /*
1391 * Print a "." every GB to show we're making
1392 * progress.
1393 */
1394 page_counter++;
1395 if ((page_counter % PAGES_PER_GB) == 0)
1396 printf(".");
1397
1398 /*
1399 * map page into kernel: valid, read/write,non-cacheable
1400 */
1401 *pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
1402 invltlb();
1403
1404 tmp = *(int *)ptr;
1405 /*
1406 * Test for alternating 1's and 0's
1407 */
1408 *(volatile int *)ptr = 0xaaaaaaaa;
1409 if (*(volatile int *)ptr != 0xaaaaaaaa)
1410 page_bad = TRUE;
1411 /*
1412 * Test for alternating 0's and 1's
1413 */
1414 *(volatile int *)ptr = 0x55555555;
1415 if (*(volatile int *)ptr != 0x55555555)
1416 page_bad = TRUE;
1417 /*
1418 * Test for all 1's
1419 */
1420 *(volatile int *)ptr = 0xffffffff;
1421 if (*(volatile int *)ptr != 0xffffffff)
1422 page_bad = TRUE;
1423 /*
1424 * Test for all 0's
1425 */
1426 *(volatile int *)ptr = 0x0;
1427 if (*(volatile int *)ptr != 0x0)
1428 page_bad = TRUE;
1429 /*
1430 * Restore original value.
1431 */
1432 *(int *)ptr = tmp;
1433
1434skip_memtest:
1435 /*
1436 * Adjust array of valid/good pages.
1437 */
1438 if (page_bad == TRUE)
1439 continue;
1440 /*
1441 * If this good page is a continuation of the
1442 * previous set of good pages, then just increase
1443			 * Note that the recorded "end" points one past the
1444			 * last valid address, keeping the range >= start and
1445			 * < end.  If we're also doing a speculative memory
1446			 * test and we are at or past the end, bump up Maxmem
1447			 * so that we keep going.  The first bad page will
1448			 * terminate the loop.
1449 * will terminate the loop.
1450 */
1451 if (phys_avail[pa_indx] == pa) {
1452 phys_avail[pa_indx] += PAGE_SIZE;
1453 } else {
1454 pa_indx++;
1455 if (pa_indx == PHYS_AVAIL_ARRAY_END) {
1456 printf(
1457 "Too many holes in the physical address space, giving up\n");
1458 pa_indx--;
1459 full = TRUE;
1460 goto do_dump_avail;
1461 }
1462 phys_avail[pa_indx++] = pa; /* start */
1463 phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
1464 }
1465 physmem++;
1466do_dump_avail:
1467 if (dump_avail[da_indx] == pa) {
1468 dump_avail[da_indx] += PAGE_SIZE;
1469 } else {
1470 da_indx++;
1471 if (da_indx == DUMP_AVAIL_ARRAY_END) {
1472 da_indx--;
1473 goto do_next;
1474 }
1475 dump_avail[da_indx++] = pa; /* start */
1476 dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
1477 }
1478do_next:
1479 if (full)
1480 break;
1481 }
1482 }
1483 *pte = 0;
1484 invltlb();
1485 if (memtest != 0)
1486 printf("\n");
1487
1488 /*
1489 * XXX
1490 * The last chunk must contain at least one page plus the message
1491 * buffer to avoid complicating other code (message buffer address
1492 * calculation, etc.).
1493 */
1494 while (phys_avail[pa_indx - 1] + PAGE_SIZE +
1495 round_page(msgbufsize) >= phys_avail[pa_indx]) {
1496 physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
1497 phys_avail[pa_indx--] = 0;
1498 phys_avail[pa_indx--] = 0;
1499 }
1500
1501 Maxmem = atop(phys_avail[pa_indx]);
1502
1503 /* Trim off space for the message buffer. */
1504 phys_avail[pa_indx] -= round_page(msgbufsize);
1505
1506 /* Map the message buffer. */
1507 msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
1508}
1509
1510static caddr_t
1511native_parse_preload_data(u_int64_t modulep)
1512{
1513 caddr_t kmdp;
1514 char *envp;
1515#ifdef DDB
1516 vm_offset_t ksym_start;
1517 vm_offset_t ksym_end;
1518#endif
1519
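 /*
  * modulep is a physical address; the preloaded metadata is
  * reachable through the kernel map at KERNBASE.
  */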
1520 preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
1521 preload_bootstrap_relocate(KERNBASE);
1522 kmdp = preload_search_by_type("elf kernel");
1523 if (kmdp == NULL)
1524 kmdp = preload_search_by_type("elf64 kernel");
1525 boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
1526 envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *);
1527 if (envp != NULL)
1528 envp += KERNBASE;
1529 init_static_kenv(envp, 0);
1530#ifdef DDB
1531 ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
1532 ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
1533 db_fetch_ksymtab(ksym_start, ksym_end);
1534#endif
1535 efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);
1536
1537 return (kmdp);
1538}
1539
1540static void
1541amd64_kdb_init(void)
1542{
1543 kdb_init();
1544#ifdef KDB
1545 if (boothowto & RB_KDB)
1546 kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
1547#endif
1548}
1549
1550/* Set up the fast syscall stuff */
1551void
1552amd64_conf_fast_syscall(void)
1553{
1554 uint64_t msr;
1555
1556 msr = rdmsr(MSR_EFER) | EFER_SCE;
1557 wrmsr(MSR_EFER, msr);
1558 wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) :
1559 (u_int64_t)IDTVEC(fast_syscall));
1560 wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
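 /*
  * MSR_STAR holds the selector bases used by SYSCALL and SYSRET:
  * bits 47:32 give the kernel %cs/%ss pair installed on SYSCALL,
  * and bits 63:48 the user base from which SYSRET rebuilds %cs
  * and %ss.
  */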
1561 msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
1562 ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
1563 wrmsr(MSR_STAR, msr);
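 /* rflags bits to be cleared on SYSCALL entry to the kernel. */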
1564 wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D);
1565}
1566
1567u_int64_t
1568hammer_time(u_int64_t modulep, u_int64_t physfree)
1569{
1570 caddr_t kmdp;
1571 int gsel_tss, x;
1572 struct pcpu *pc;
1573 struct nmi_pcpu *np;
1574 struct xstate_hdr *xhdr;
1575 u_int64_t rsp0;
1576 char *env;
1577 size_t kstack0_sz;
1578 int late_console;
1579
1580 kmdp = init_ops.parse_preload_data(modulep);
1581
1582 physfree += ucode_load_bsp(physfree + KERNBASE);
1583 physfree = roundup2(physfree, PAGE_SIZE);
1584
1585 identify_cpu1();
1586 identify_hypervisor();
1587 /*
1588	 * hw.cpu_stdext_disable is ignored by this call; it will be
1589	 * re-evaluated by the call to finishidentcpu() below.
1590 */
1591 identify_cpu2();
1592
1593 link_elf_ireloc(kmdp);
1594
1595 /*
1596 * This may be done better later if it gets more high level
1597 * components in it. If so just link td->td_proc here.
1598 */
1599 proc_linkup0(&proc0, &thread0);
1600
1601 /* Init basic tunables, hz etc */
1602 init_param1();
1603
1604 thread0.td_kstack = physfree + KERNBASE;
1605 thread0.td_kstack_pages = kstack_pages;
1606 kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
1607 bzero((void *)thread0.td_kstack, kstack0_sz);
1608 physfree += kstack0_sz;
1609
1610 /*
1611 * make gdt memory segments
1612 */
1613 for (x = 0; x < NGDT; x++) {
1614 if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
1615 x != GUSERLDT_SEL && x != (GUSERLDT_SEL) + 1)
1616 ssdtosd(&gdt_segs[x], &gdt[x]);
1617 }
1618 gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&common_tss[0];
1619 ssdtosyssd(&gdt_segs[GPROC0_SEL],
1620 (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
1621
1622 r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
1623 r_gdt.rd_base = (long) gdt;
1624 lgdt(&r_gdt);
1625 pc = &__pcpu[0];
1626
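 /*
  * Point the kernel %gs base at the BSP's pcpu region so that the
  * PCPU_*() accessors work from here on.
  */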
1627 wrmsr(MSR_FSBASE, 0); /* User value */
1628 wrmsr(MSR_GSBASE, (u_int64_t)pc);
1629 wrmsr(MSR_KGSBASE, 0); /* User value while in the kernel */
1630
1631 pcpu_init(pc, 0, sizeof(struct pcpu));
1632 dpcpu_init((void *)(physfree + KERNBASE), 0);
1633 physfree += DPCPU_SIZE;
1634 PCPU_SET(prvspace, pc);
1635 PCPU_SET(curthread, &thread0);
1636 /* Non-late cninit() and printf() can be moved up to here. */
1637 PCPU_SET(tssp, &common_tss[0]);
1638 PCPU_SET(commontssp, &common_tss[0]);
1639 PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
1640 PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
1641 PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
1642 PCPU_SET(gs32p, &gdt[GUGS32_SEL]);
1643
1644 /*
1645 * Initialize mutexes.
1646 *
1647 * icu_lock: in order to allow an interrupt to occur in a critical
1648 * section, to set pcpu->ipending (etc...) properly, we
1649 * must be able to get the icu lock, so it can't be
1650 * under witness.
1651 */
1652 mutex_init();
1653 mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
1654 mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);
1655
1656 /* exceptions */
1657 pti = pti_get_default();
1658 TUNABLE_INT_FETCH("vm.pmap.pti", &pti);
1659
1660 for (x = 0; x < NIDT; x++)
1661 setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT,
1662 SEL_KPL, 0);
1663 setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
1664 SEL_KPL, 0);
1665 setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
1666 setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 2);
1667 setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
1668 SEL_UPL, 0);
1669 setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT,
1670 SEL_UPL, 0);
1671 setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT,
1672 SEL_KPL, 0);
1673 setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT,
1674 SEL_KPL, 0);
1675 setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT,
1676 SEL_KPL, 0);
1677 setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
1678 setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm),
1679 SDT_SYSIGT, SEL_KPL, 0);
1680 setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT,
1681 SEL_KPL, 0);
1682 setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing),
1683 SDT_SYSIGT, SEL_KPL, 0);
1684 setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT,
1685 SEL_KPL, 0);
1686 setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT,
1687 SEL_KPL, 0);
1688 setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT,
1689 SEL_KPL, 0);
1690 setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT,
1691 SEL_KPL, 0);
1692 setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
1693 SEL_KPL, 0);
1694 setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3);
1695 setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
1696 SEL_KPL, 0);
1697#ifdef KDTRACE_HOOKS
1698 setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) :
1699 &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
1700#endif
1701#ifdef XENHVM
1702 setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) :
1703 &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0);
1704#endif
1705 r_idt.rd_limit = sizeof(idt0) - 1;
1706 r_idt.rd_base = (long) idt;
1707 lidt(&r_idt);
1708
1709 /*
1710 * Initialize the clock before the console so that console
1711 * initialization can use DELAY().
1712 */
1713 clock_init();
1714
1715 /*
1716 * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4)
1717 * transition).
1718 * Once bootblocks have updated, we can test directly for
1719 * efi_systbl != NULL here...
1720 */
1721 if (preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP)
1722 != NULL)
1723 vty_set_preferred(VTY_VT);
1724
1725 TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
1726 TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable);
1727 TUNABLE_INT_FETCH("hw.mds_disable", &hw_mds_disable);
1728 TUNABLE_INT_FETCH("machdep.mitigations.taa.enable", &x86_taa_enable);
1729
 TUNABLE_INT_FETCH("machdep.mitigations.rndgs.enable",
     &x86_rngds_mitg_enable);

 finishidentcpu();	/* Final stage of CPU initialization */
 initializecpu();	/* Initialize CPU registers */
 initializecpucache();
1733
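 /*
  * Exceptions that may arrive with an unusable kernel stack (double
  * faults, NMIs, MC# and DB#) run on dedicated IST stacks so that
  * the CPU always switches to a known-good %rsp.
  */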
1734 /* doublefault stack space, runs on ist1 */
1735 common_tss[0].tss_ist1 = (long)&dblfault_stack[sizeof(dblfault_stack)];
1736
1737 /*
1738 * NMI stack, runs on ist2. The pcpu pointer is stored just
1739 * above the start of the ist2 stack.
1740 */
1741 np = ((struct nmi_pcpu *) &nmi0_stack[sizeof(nmi0_stack)]) - 1;
1742 np->np_pcpu = (register_t) pc;
1743 common_tss[0].tss_ist2 = (long) np;
1744
1745 /*
1746 * MC# stack, runs on ist3. The pcpu pointer is stored just
1747 * above the start of the ist3 stack.
1748 */
1749 np = ((struct nmi_pcpu *) &mce0_stack[sizeof(mce0_stack)]) - 1;
1750 np->np_pcpu = (register_t) pc;
1751 common_tss[0].tss_ist3 = (long) np;
1752
1753 /*
1754 * DB# stack, runs on ist4.
1755 */
1756 np = ((struct nmi_pcpu *) &dbg0_stack[sizeof(dbg0_stack)]) - 1;
1757 np->np_pcpu = (register_t) pc;
1758 common_tss[0].tss_ist4 = (long) np;
1759
1760 /* Set the IO permission bitmap (empty due to tss seg limit) */
1761 common_tss[0].tss_iobase = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE;
1762
1763 gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
1764 ltr(gsel_tss);
1765
1766 amd64_conf_fast_syscall();
1767
1768 /*
1769	 * Temporarily forge a valid pointer to the PCB for the exception
1770	 * handlers.  It is reinitialized properly below after the FPU is
1771	 * set up.  Also set up td_critnest to short-cut the page
1772	 * fault handler.
1773 */
1774 cpu_max_ext_state_size = sizeof(struct savefpu);
1775 thread0.td_pcb = get_pcb_td(&thread0);
1776 thread0.td_critnest = 1;
1777
1778 /*
1779 * The console and kdb should be initialized even earlier than here,
1780 * but some console drivers don't work until after getmemsize().
1781 * Default to late console initialization to support these drivers.
1782 * This loses mainly printf()s in getmemsize() and early debugging.
1783 */
1784 late_console = 1;
1785 TUNABLE_INT_FETCH("debug.late_console", &late_console);
1786 if (!late_console) {
1787 cninit();
1788 amd64_kdb_init();
1789 }
1790
1791 getmemsize(kmdp, physfree);
1792 init_param2(physmem);
1793
1794	/* now running on new page tables, configured, and u/iom is accessible */
1795
1796 if (late_console)
1797 cninit();
1798
1799#ifdef DEV_ISA
1800#ifdef DEV_ATPIC
1801 elcr_probe();
1802 atpic_startup();
1803#else
1804 /* Reset and mask the atpics and leave them shut down. */
1805 atpic_reset();
1806
1807 /*
1808 * Point the ICU spurious interrupt vectors at the APIC spurious
1809 * interrupt handler.
1810 */
1811 setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
1812 setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
1813#endif
1814#else
1815#error "have you forgotten the isa device?";
1816#endif
1817
1818 if (late_console)
1819 amd64_kdb_init();
1820
1821 msgbufinit(msgbufp, msgbufsize);
1822 fpuinit();
1823
1824 /*
1825 * Set up thread0 pcb after fpuinit calculated pcb + fpu save
1826 * area size. Zero out the extended state header in fpu save
1827 * area.
1828 */
1829 thread0.td_pcb = get_pcb_td(&thread0);
1830 thread0.td_pcb->pcb_save = get_pcb_user_save_td(&thread0);
1831 bzero(get_pcb_user_save_td(&thread0), cpu_max_ext_state_size);
1832 if (use_xsave) {
1833 xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) +
1834 1);
1835 xhdr->xstate_bv = xsave_mask;
1836 }
1837 /* make an initial tss so cpu can get interrupt stack on syscall! */
1838 rsp0 = (vm_offset_t)thread0.td_pcb;
1839 /* Ensure the stack is aligned to 16 bytes */
1840 rsp0 &= ~0xFul;
1841 common_tss[0].tss_rsp0 = rsp0;
1842 PCPU_SET(rsp0, rsp0);
1843 PCPU_SET(pti_rsp0, ((vm_offset_t)PCPU_PTR(pti_stack) +
1844 PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful);
1845 PCPU_SET(curpcb, thread0.td_pcb);
1846
1847 /* transfer to user mode */
1848
1849 _ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
1850 _udatasel = GSEL(GUDATA_SEL, SEL_UPL);
1851 _ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
1852 _ufssel = GSEL(GUFS32_SEL, SEL_UPL);
1853 _ugssel = GSEL(GUGS32_SEL, SEL_UPL);
1854
1855 load_ds(_udatasel);
1856 load_es(_udatasel);
1857 load_fs(_ufssel);
1858
1859 /* setup proc 0's pcb */
1860 thread0.td_pcb->pcb_flags = 0;
1861 thread0.td_frame = &proc0_tf;
1862
1863 env = kern_getenv("kernelname");
1864 if (env != NULL)
1865 strlcpy(kernelname, env, sizeof(kernelname));
1866
1867 cpu_probe_amdc1e();
1868
1869#ifdef FDT
1870 x86_init_fdt();
1871#endif
1872 thread0.td_critnest = 0;
1873
1874 TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
1875 TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable);
1876 TUNABLE_INT_FETCH("hw.mds_disable", &hw_mds_disable);
1877
1878 /* Location of kernel stack for locore */
1879 return ((u_int64_t)thread0.td_pcb);
1880}
1881
1882void
1883cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
1884{
1885
1886 pcpu->pc_acpi_id = 0xffffffff;
1887}
1888
1889static int
1890smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
1891{
1892 struct bios_smap *smapbase;
1893 struct bios_smap_xattr smap;
1894 caddr_t kmdp;
1895 uint32_t *smapattr;
1896 int count, error, i;
1897
1898 /* Retrieve the system memory map from the loader. */
1899 kmdp = preload_search_by_type("elf kernel");
1900 if (kmdp == NULL)
1901 kmdp = preload_search_by_type("elf64 kernel");
1902 smapbase = (struct bios_smap *)preload_search_info(kmdp,
1903 MODINFO_METADATA | MODINFOMD_SMAP);
1904 if (smapbase == NULL)
1905 return (0);
1906 smapattr = (uint32_t *)preload_search_info(kmdp,
1907 MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
1908 count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase);
1909 error = 0;
1910 for (i = 0; i < count; i++) {
1911 smap.base = smapbase[i].base;
1912 smap.length = smapbase[i].length;
1913 smap.type = smapbase[i].type;
1914 if (smapattr != NULL)
1915 smap.xattr = smapattr[i];
1916 else
1917 smap.xattr = 0;
1918 error = SYSCTL_OUT(req, &smap, sizeof(smap));
1919 }
1920 return (error);
1921}
1922SYSCTL_PROC(_machdep, OID_AUTO, smap, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
1923 smap_sysctl_handler, "S,bios_smap_xattr", "Raw BIOS SMAP data");
1924
1925static int
1926efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
1927{
1928 struct efi_map_header *efihdr;
1929 caddr_t kmdp;
1930 uint32_t efisize;
1931
1932 kmdp = preload_search_by_type("elf kernel");
1933 if (kmdp == NULL)
1934 kmdp = preload_search_by_type("elf64 kernel");
1935 efihdr = (struct efi_map_header *)preload_search_info(kmdp,
1936 MODINFO_METADATA | MODINFOMD_EFI_MAP);
1937 if (efihdr == NULL)
1938 return (0);
1939 efisize = *((uint32_t *)efihdr - 1);
1940 return (SYSCTL_OUT(req, efihdr, efisize));
1941}
1942SYSCTL_PROC(_machdep, OID_AUTO, efi_map, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
1943 efi_map_sysctl_handler, "S,efi_map_header", "Raw EFI Memory Map");
1944
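/*
 * Spinlock sections disable interrupts on first entry and save the
 * previous interrupt state; nested entries only bump a per-thread
 * count, and spinlock_exit() restores the saved state when the count
 * drops back to zero.
 */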
1945void
1946spinlock_enter(void)
1947{
1948 struct thread *td;
1949 register_t flags;
1950
1951 td = curthread;
1952 if (td->td_md.md_spinlock_count == 0) {
1953 flags = intr_disable();
1954 td->td_md.md_spinlock_count = 1;
1955 td->td_md.md_saved_flags = flags;
1956 } else
1957 td->td_md.md_spinlock_count++;
1958 critical_enter();
1959}
1960
1961void
1962spinlock_exit(void)
1963{
1964 struct thread *td;
1965 register_t flags;
1966
1967 td = curthread;
1968 critical_exit();
1733 finishidentcpu(); /* Final stage of CPU initialization */
1734 initializecpu(); /* Initialize CPU registers */
1735 initializecpucache();
1736
1737 /* doublefault stack space, runs on ist1 */
1738 common_tss[0].tss_ist1 = (long)&dblfault_stack[sizeof(dblfault_stack)];
1739
1740 /*
1741 * NMI stack, runs on ist2. The pcpu pointer is stored just
1742 * above the start of the ist2 stack.
1743 */
1744 np = ((struct nmi_pcpu *) &nmi0_stack[sizeof(nmi0_stack)]) - 1;
1745 np->np_pcpu = (register_t) pc;
1746 common_tss[0].tss_ist2 = (long) np;
1747
1748 /*
1749 * MC# stack, runs on ist3. The pcpu pointer is stored just
1750 * above the start of the ist3 stack.
1751 */
1752 np = ((struct nmi_pcpu *) &mce0_stack[sizeof(mce0_stack)]) - 1;
1753 np->np_pcpu = (register_t) pc;
1754 common_tss[0].tss_ist3 = (long) np;
1755
1756 /*
1757 * DB# stack, runs on ist4.
1758 */
1759 np = ((struct nmi_pcpu *) &dbg0_stack[sizeof(dbg0_stack)]) - 1;
1760 np->np_pcpu = (register_t) pc;
1761 common_tss[0].tss_ist4 = (long) np;
1762
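	/*
	 * Explanatory aside (not part of the original source): NMI, MC#
	 * and DB# events can arrive at any point, including while the
	 * swapgs state is in flux, so their handlers cannot trust %gs to
	 * locate per-CPU data. Storing the pcpu pointer in a struct
	 * nmi_pcpu at the known top of each IST stack lets a handler
	 * recover its pcpu without relying on the GS base.
	 */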
1763 /* Set the IO permission bitmap (empty due to tss seg limit) */
1764 common_tss[0].tss_iobase = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE;
1765
1766 gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
1767 ltr(gsel_tss);
1768
1769 amd64_conf_fast_syscall();
1770
1771 /*
 1772	 * Temporarily forge a valid pointer to the PCB for exception
 1773	 * handlers. It is reinitialized properly below after the FPU is
1774 * set up. Also set up td_critnest to short-cut the page
1775 * fault handler.
1776 */
1777 cpu_max_ext_state_size = sizeof(struct savefpu);
1778 thread0.td_pcb = get_pcb_td(&thread0);
1779 thread0.td_critnest = 1;
1780
1781 /*
1782 * The console and kdb should be initialized even earlier than here,
1783 * but some console drivers don't work until after getmemsize().
1784 * Default to late console initialization to support these drivers.
1785 * This loses mainly printf()s in getmemsize() and early debugging.
1786 */
1787 late_console = 1;
1788 TUNABLE_INT_FETCH("debug.late_console", &late_console);
1789 if (!late_console) {
1790 cninit();
1791 amd64_kdb_init();
1792 }
1793
1794 getmemsize(kmdp, physfree);
1795 init_param2(physmem);
1796
 1797	/* now running on new page tables, configured, and u/iom is accessible */
1798
1799 if (late_console)
1800 cninit();
1801
1802#ifdef DEV_ISA
1803#ifdef DEV_ATPIC
1804 elcr_probe();
1805 atpic_startup();
1806#else
1807 /* Reset and mask the atpics and leave them shut down. */
1808 atpic_reset();
1809
1810 /*
1811 * Point the ICU spurious interrupt vectors at the APIC spurious
1812 * interrupt handler.
1813 */
1814 setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
1815 setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
1816#endif
1817#else
 1818#error "have you forgotten the isa device?"
1819#endif
1820
1821 if (late_console)
1822 amd64_kdb_init();
1823
1824 msgbufinit(msgbufp, msgbufsize);
1825 fpuinit();
1826
1827 /*
 1828	 * Set up the thread0 pcb after fpuinit has calculated the pcb +
 1829	 * fpu save area size. Zero out the extended state header in the
 1830	 * fpu save area.
1831 */
1832 thread0.td_pcb = get_pcb_td(&thread0);
1833 thread0.td_pcb->pcb_save = get_pcb_user_save_td(&thread0);
1834 bzero(get_pcb_user_save_td(&thread0), cpu_max_ext_state_size);
1835 if (use_xsave) {
1836 xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) +
1837 1);
1838 xhdr->xstate_bv = xsave_mask;
1839 }
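	/*
	 * Explanatory aside (not part of the original source): with XSAVE
	 * in use, the extended-state header sits immediately after the
	 * legacy struct savefpu area, which is why xhdr is computed as
	 * "user save area + 1". Presetting xstate_bv to the full
	 * xsave_mask marks every enabled component as present in the
	 * (zeroed) save area for the first XRSTOR.
	 */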
1840 /* make an initial tss so cpu can get interrupt stack on syscall! */
1841 rsp0 = (vm_offset_t)thread0.td_pcb;
1842 /* Ensure the stack is aligned to 16 bytes */
1843 rsp0 &= ~0xFul;
1844 common_tss[0].tss_rsp0 = rsp0;
1845 PCPU_SET(rsp0, rsp0);
1846 PCPU_SET(pti_rsp0, ((vm_offset_t)PCPU_PTR(pti_stack) +
1847 PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful);
1848 PCPU_SET(curpcb, thread0.td_pcb);
1849
1850 /* transfer to user mode */
1851
1852 _ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
1853 _udatasel = GSEL(GUDATA_SEL, SEL_UPL);
1854 _ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
1855 _ufssel = GSEL(GUFS32_SEL, SEL_UPL);
1856 _ugssel = GSEL(GUGS32_SEL, SEL_UPL);
1857
1858 load_ds(_udatasel);
1859 load_es(_udatasel);
1860 load_fs(_ufssel);
1861
1862 /* setup proc 0's pcb */
1863 thread0.td_pcb->pcb_flags = 0;
1864 thread0.td_frame = &proc0_tf;
1865
1866 env = kern_getenv("kernelname");
1867 if (env != NULL)
1868 strlcpy(kernelname, env, sizeof(kernelname));
1869
1870 cpu_probe_amdc1e();
1871
1872#ifdef FDT
1873 x86_init_fdt();
1874#endif
1875 thread0.td_critnest = 0;
1876
1877 TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
1878 TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable);
1879 TUNABLE_INT_FETCH("hw.mds_disable", &hw_mds_disable);
1880
1881 /* Location of kernel stack for locore */
1882 return ((u_int64_t)thread0.td_pcb);
1883}
1884
1885void
1886cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
1887{
1888
1889 pcpu->pc_acpi_id = 0xffffffff;
1890}
1891
1892static int
1893smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
1894{
1895 struct bios_smap *smapbase;
1896 struct bios_smap_xattr smap;
1897 caddr_t kmdp;
1898 uint32_t *smapattr;
1899 int count, error, i;
1900
1901 /* Retrieve the system memory map from the loader. */
1902 kmdp = preload_search_by_type("elf kernel");
1903 if (kmdp == NULL)
1904 kmdp = preload_search_by_type("elf64 kernel");
1905 smapbase = (struct bios_smap *)preload_search_info(kmdp,
1906 MODINFO_METADATA | MODINFOMD_SMAP);
1907 if (smapbase == NULL)
1908 return (0);
1909 smapattr = (uint32_t *)preload_search_info(kmdp,
1910 MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
1911 count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase);
1912 error = 0;
1913 for (i = 0; i < count; i++) {
1914 smap.base = smapbase[i].base;
1915 smap.length = smapbase[i].length;
1916 smap.type = smapbase[i].type;
1917 if (smapattr != NULL)
1918 smap.xattr = smapattr[i];
1919 else
1920 smap.xattr = 0;
1921 error = SYSCTL_OUT(req, &smap, sizeof(smap));
1922 }
1923 return (error);
1924}
1925SYSCTL_PROC(_machdep, OID_AUTO, smap, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
1926 smap_sysctl_handler, "S,bios_smap_xattr", "Raw BIOS SMAP data");
1927
1928static int
1929efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
1930{
1931 struct efi_map_header *efihdr;
1932 caddr_t kmdp;
1933 uint32_t efisize;
1934
1935 kmdp = preload_search_by_type("elf kernel");
1936 if (kmdp == NULL)
1937 kmdp = preload_search_by_type("elf64 kernel");
1938 efihdr = (struct efi_map_header *)preload_search_info(kmdp,
1939 MODINFO_METADATA | MODINFOMD_EFI_MAP);
1940 if (efihdr == NULL)
1941 return (0);
1942 efisize = *((uint32_t *)efihdr - 1);
1943 return (SYSCTL_OUT(req, efihdr, efisize));
1944}
1945SYSCTL_PROC(_machdep, OID_AUTO, efi_map, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
1946 efi_map_sysctl_handler, "S,efi_map_header", "Raw EFI Memory Map");
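/*
 * Usage note (not part of the original source): both OIDs above export
 * opaque binary blobs and are most easily inspected with the raw/hex
 * output of sysctl(8), e.g.:
 *
 *	sysctl -x machdep.smap
 *	sysctl -x machdep.efi_map
 */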
1947
1948void
1949spinlock_enter(void)
1950{
1951 struct thread *td;
1952 register_t flags;
1953
1954 td = curthread;
1955 if (td->td_md.md_spinlock_count == 0) {
1956 flags = intr_disable();
1957 td->td_md.md_spinlock_count = 1;
1958 td->td_md.md_saved_flags = flags;
1959 } else
1960 td->td_md.md_spinlock_count++;
1961 critical_enter();
1962}
1963
1964void
1965spinlock_exit(void)
1966{
1967 struct thread *td;
1968 register_t flags;
1969
1970 td = curthread;
1971 critical_exit();
1972 flags = td->td_md.md_saved_flags;
1973 td->td_md.md_spinlock_count--;
1974 if (td->td_md.md_spinlock_count == 0)
1975 intr_restore(flags);
1976}
1977
1978/*
1979 * Construct a PCB from a trapframe. This is called from kdb_trap() where
1980 * we want to start a backtrace from the function that caused us to enter
1981 * the debugger. We have the context in the trapframe, but base the trace
1982 * on the PCB. The PCB doesn't have to be perfect, as long as it contains
1983 * enough for a backtrace.
1984 */
1985void
1986makectx(struct trapframe *tf, struct pcb *pcb)
1987{
1988
1989 pcb->pcb_r12 = tf->tf_r12;
1990 pcb->pcb_r13 = tf->tf_r13;
1991 pcb->pcb_r14 = tf->tf_r14;
1992 pcb->pcb_r15 = tf->tf_r15;
1993 pcb->pcb_rbp = tf->tf_rbp;
1994 pcb->pcb_rbx = tf->tf_rbx;
1995 pcb->pcb_rip = tf->tf_rip;
1996 pcb->pcb_rsp = tf->tf_rsp;
1997}
1998
1999int
2000ptrace_set_pc(struct thread *td, unsigned long addr)
2001{
2002
2003 td->td_frame->tf_rip = addr;
2004 set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
2005 return (0);
2006}
2007
2008int
2009ptrace_single_step(struct thread *td)
2010{
2011
2012 PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
2013 if ((td->td_frame->tf_rflags & PSL_T) == 0) {
2014 td->td_frame->tf_rflags |= PSL_T;
2015 td->td_dbgflags |= TDB_STEP;
2016 }
2017 return (0);
2018}
2019
2020int
2021ptrace_clear_single_step(struct thread *td)
2022{
2023 PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
2024 td->td_frame->tf_rflags &= ~PSL_T;
2025 td->td_dbgflags &= ~TDB_STEP;
2026 return (0);
2027}
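/*
 * Illustrative aside (not part of the original source): these hooks
 * back the single-step requests of ptrace(2). A debugger typically
 * issues:
 *
 *	ptrace(PT_STEP, pid, (caddr_t)1, 0);	maps to ptrace_single_step()
 *	waitpid(pid, &status, 0);		child stops with SIGTRAP
 *						after one instruction
 *
 * and a later PT_CONTINUE or PT_DETACH ends up clearing PSL_T again
 * through ptrace_clear_single_step(). The addr value (caddr_t)1 means
 * "resume where the child stopped".
 */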
2028
2029int
2030fill_regs(struct thread *td, struct reg *regs)
2031{
2032 struct trapframe *tp;
2033
2034 tp = td->td_frame;
2035 return (fill_frame_regs(tp, regs));
2036}
2037
2038int
2039fill_frame_regs(struct trapframe *tp, struct reg *regs)
2040{
2041
2042 regs->r_r15 = tp->tf_r15;
2043 regs->r_r14 = tp->tf_r14;
2044 regs->r_r13 = tp->tf_r13;
2045 regs->r_r12 = tp->tf_r12;
2046 regs->r_r11 = tp->tf_r11;
2047 regs->r_r10 = tp->tf_r10;
2048 regs->r_r9 = tp->tf_r9;
2049 regs->r_r8 = tp->tf_r8;
2050 regs->r_rdi = tp->tf_rdi;
2051 regs->r_rsi = tp->tf_rsi;
2052 regs->r_rbp = tp->tf_rbp;
2053 regs->r_rbx = tp->tf_rbx;
2054 regs->r_rdx = tp->tf_rdx;
2055 regs->r_rcx = tp->tf_rcx;
2056 regs->r_rax = tp->tf_rax;
2057 regs->r_rip = tp->tf_rip;
2058 regs->r_cs = tp->tf_cs;
2059 regs->r_rflags = tp->tf_rflags;
2060 regs->r_rsp = tp->tf_rsp;
2061 regs->r_ss = tp->tf_ss;
2062 if (tp->tf_flags & TF_HASSEGS) {
2063 regs->r_ds = tp->tf_ds;
2064 regs->r_es = tp->tf_es;
2065 regs->r_fs = tp->tf_fs;
2066 regs->r_gs = tp->tf_gs;
2067 } else {
2068 regs->r_ds = 0;
2069 regs->r_es = 0;
2070 regs->r_fs = 0;
2071 regs->r_gs = 0;
2072 }
2073 regs->r_err = 0;
2074 regs->r_trapno = 0;
2075 return (0);
2076}
2077
2078int
2079set_regs(struct thread *td, struct reg *regs)
2080{
2081 struct trapframe *tp;
2082 register_t rflags;
2083
2084 tp = td->td_frame;
2085 rflags = regs->r_rflags & 0xffffffff;
2086 if (!EFL_SECURE(rflags, tp->tf_rflags) || !CS_SECURE(regs->r_cs))
2087 return (EINVAL);
2088 tp->tf_r15 = regs->r_r15;
2089 tp->tf_r14 = regs->r_r14;
2090 tp->tf_r13 = regs->r_r13;
2091 tp->tf_r12 = regs->r_r12;
2092 tp->tf_r11 = regs->r_r11;
2093 tp->tf_r10 = regs->r_r10;
2094 tp->tf_r9 = regs->r_r9;
2095 tp->tf_r8 = regs->r_r8;
2096 tp->tf_rdi = regs->r_rdi;
2097 tp->tf_rsi = regs->r_rsi;
2098 tp->tf_rbp = regs->r_rbp;
2099 tp->tf_rbx = regs->r_rbx;
2100 tp->tf_rdx = regs->r_rdx;
2101 tp->tf_rcx = regs->r_rcx;
2102 tp->tf_rax = regs->r_rax;
2103 tp->tf_rip = regs->r_rip;
2104 tp->tf_cs = regs->r_cs;
2105 tp->tf_rflags = rflags;
2106 tp->tf_rsp = regs->r_rsp;
2107 tp->tf_ss = regs->r_ss;
2108 if (0) { /* XXXKIB */
2109 tp->tf_ds = regs->r_ds;
2110 tp->tf_es = regs->r_es;
2111 tp->tf_fs = regs->r_fs;
2112 tp->tf_gs = regs->r_gs;
2113 tp->tf_flags = TF_HASSEGS;
2114 }
2115 set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
2116 return (0);
2117}
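/*
 * Explanatory aside (not part of the original source): EFL_SECURE()
 * accepts the new rflags value only if it differs from the current one
 * in user-changeable bits, so a debugger cannot hand a traced process a
 * raised IOPL, and CS_SECURE() requires the new %cs selector to have
 * user privilege, so PT_SETREGS cannot redirect a process into a kernel
 * code segment.
 */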
2118
2119/* XXX check all this stuff! */
2120/* externalize from sv_xmm */
2121static void
2122fill_fpregs_xmm(struct savefpu *sv_xmm, struct fpreg *fpregs)
2123{
2124 struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
2125 struct envxmm *penv_xmm = &sv_xmm->sv_env;
2126 int i;
2127
2128 /* pcb -> fpregs */
2129 bzero(fpregs, sizeof(*fpregs));
2130
2131 /* FPU control/status */
2132 penv_fpreg->en_cw = penv_xmm->en_cw;
2133 penv_fpreg->en_sw = penv_xmm->en_sw;
2134 penv_fpreg->en_tw = penv_xmm->en_tw;
2135 penv_fpreg->en_opcode = penv_xmm->en_opcode;
2136 penv_fpreg->en_rip = penv_xmm->en_rip;
2137 penv_fpreg->en_rdp = penv_xmm->en_rdp;
2138 penv_fpreg->en_mxcsr = penv_xmm->en_mxcsr;
2139 penv_fpreg->en_mxcsr_mask = penv_xmm->en_mxcsr_mask;
2140
2141 /* FPU registers */
2142 for (i = 0; i < 8; ++i)
2143 bcopy(sv_xmm->sv_fp[i].fp_acc.fp_bytes, fpregs->fpr_acc[i], 10);
2144
2145 /* SSE registers */
2146 for (i = 0; i < 16; ++i)
2147 bcopy(sv_xmm->sv_xmm[i].xmm_bytes, fpregs->fpr_xacc[i], 16);
2148}
2149
2150/* internalize from fpregs into sv_xmm */
2151static void
2152set_fpregs_xmm(struct fpreg *fpregs, struct savefpu *sv_xmm)
2153{
2154 struct envxmm *penv_xmm = &sv_xmm->sv_env;
2155 struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
2156 int i;
2157
2158 /* fpregs -> pcb */
2159 /* FPU control/status */
2160 penv_xmm->en_cw = penv_fpreg->en_cw;
2161 penv_xmm->en_sw = penv_fpreg->en_sw;
2162 penv_xmm->en_tw = penv_fpreg->en_tw;
2163 penv_xmm->en_opcode = penv_fpreg->en_opcode;
2164 penv_xmm->en_rip = penv_fpreg->en_rip;
2165 penv_xmm->en_rdp = penv_fpreg->en_rdp;
2166 penv_xmm->en_mxcsr = penv_fpreg->en_mxcsr;
2167 penv_xmm->en_mxcsr_mask = penv_fpreg->en_mxcsr_mask & cpu_mxcsr_mask;
2168
2169 /* FPU registers */
2170 for (i = 0; i < 8; ++i)
2171 bcopy(fpregs->fpr_acc[i], sv_xmm->sv_fp[i].fp_acc.fp_bytes, 10);
2172
2173 /* SSE registers */
2174 for (i = 0; i < 16; ++i)
2175 bcopy(fpregs->fpr_xacc[i], sv_xmm->sv_xmm[i].xmm_bytes, 16);
2176}
2177
2178/* externalize from td->pcb */
2179int
2180fill_fpregs(struct thread *td, struct fpreg *fpregs)
2181{
2182
2183 KASSERT(td == curthread || TD_IS_SUSPENDED(td) ||
2184 P_SHOULDSTOP(td->td_proc),
2185 ("not suspended thread %p", td));
2186 fpugetregs(td);
2187 fill_fpregs_xmm(get_pcb_user_save_td(td), fpregs);
2188 return (0);
2189}
2190
2191/* internalize to td->pcb */
2192int
2193set_fpregs(struct thread *td, struct fpreg *fpregs)
2194{
2195
2196 critical_enter();
2197 set_fpregs_xmm(fpregs, get_pcb_user_save_td(td));
2198 fpuuserinited(td);
2199 critical_exit();
2200 return (0);
2201}
2202
2203/*
2204 * Get machine context.
2205 */
2206int
2207get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
2208{
2209 struct pcb *pcb;
2210 struct trapframe *tp;
2211
2212 pcb = td->td_pcb;
2213 tp = td->td_frame;
2214 PROC_LOCK(curthread->td_proc);
2215 mcp->mc_onstack = sigonstack(tp->tf_rsp);
2216 PROC_UNLOCK(curthread->td_proc);
2217 mcp->mc_r15 = tp->tf_r15;
2218 mcp->mc_r14 = tp->tf_r14;
2219 mcp->mc_r13 = tp->tf_r13;
2220 mcp->mc_r12 = tp->tf_r12;
2221 mcp->mc_r11 = tp->tf_r11;
2222 mcp->mc_r10 = tp->tf_r10;
2223 mcp->mc_r9 = tp->tf_r9;
2224 mcp->mc_r8 = tp->tf_r8;
2225 mcp->mc_rdi = tp->tf_rdi;
2226 mcp->mc_rsi = tp->tf_rsi;
2227 mcp->mc_rbp = tp->tf_rbp;
2228 mcp->mc_rbx = tp->tf_rbx;
2229 mcp->mc_rcx = tp->tf_rcx;
2230 mcp->mc_rflags = tp->tf_rflags;
2231 if (flags & GET_MC_CLEAR_RET) {
2232 mcp->mc_rax = 0;
2233 mcp->mc_rdx = 0;
2234 mcp->mc_rflags &= ~PSL_C;
2235 } else {
2236 mcp->mc_rax = tp->tf_rax;
2237 mcp->mc_rdx = tp->tf_rdx;
2238 }
2239 mcp->mc_rip = tp->tf_rip;
2240 mcp->mc_cs = tp->tf_cs;
2241 mcp->mc_rsp = tp->tf_rsp;
2242 mcp->mc_ss = tp->tf_ss;
2243 mcp->mc_ds = tp->tf_ds;
2244 mcp->mc_es = tp->tf_es;
2245 mcp->mc_fs = tp->tf_fs;
2246 mcp->mc_gs = tp->tf_gs;
2247 mcp->mc_flags = tp->tf_flags;
2248 mcp->mc_len = sizeof(*mcp);
2249 get_fpcontext(td, mcp, NULL, 0);
2250 update_pcb_bases(pcb);
2251 mcp->mc_fsbase = pcb->pcb_fsbase;
2252 mcp->mc_gsbase = pcb->pcb_gsbase;
2253 mcp->mc_xfpustate = 0;
2254 mcp->mc_xfpustate_len = 0;
2255 bzero(mcp->mc_spare, sizeof(mcp->mc_spare));
2256 return (0);
2257}
2258
2259/*
2260 * Set machine context.
2261 *
 2262 * However, we only set the user-modifiable flags, and we won't
2263 * touch the cs selector.
2264 */
2265int
2266set_mcontext(struct thread *td, mcontext_t *mcp)
2267{
2268 struct pcb *pcb;
2269 struct trapframe *tp;
2270 char *xfpustate;
2271 long rflags;
2272 int ret;
2273
2274 pcb = td->td_pcb;
2275 tp = td->td_frame;
2276 if (mcp->mc_len != sizeof(*mcp) ||
2277 (mcp->mc_flags & ~_MC_FLAG_MASK) != 0)
2278 return (EINVAL);
2279 rflags = (mcp->mc_rflags & PSL_USERCHANGE) |
2280 (tp->tf_rflags & ~PSL_USERCHANGE);
2281 if (mcp->mc_flags & _MC_HASFPXSTATE) {
2282 if (mcp->mc_xfpustate_len > cpu_max_ext_state_size -
2283 sizeof(struct savefpu))
2284 return (EINVAL);
2285 xfpustate = __builtin_alloca(mcp->mc_xfpustate_len);
2286 ret = copyin((void *)mcp->mc_xfpustate, xfpustate,
2287 mcp->mc_xfpustate_len);
2288 if (ret != 0)
2289 return (ret);
2290 } else
2291 xfpustate = NULL;
2292 ret = set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len);
2293 if (ret != 0)
2294 return (ret);
2295 tp->tf_r15 = mcp->mc_r15;
2296 tp->tf_r14 = mcp->mc_r14;
2297 tp->tf_r13 = mcp->mc_r13;
2298 tp->tf_r12 = mcp->mc_r12;
2299 tp->tf_r11 = mcp->mc_r11;
2300 tp->tf_r10 = mcp->mc_r10;
2301 tp->tf_r9 = mcp->mc_r9;
2302 tp->tf_r8 = mcp->mc_r8;
2303 tp->tf_rdi = mcp->mc_rdi;
2304 tp->tf_rsi = mcp->mc_rsi;
2305 tp->tf_rbp = mcp->mc_rbp;
2306 tp->tf_rbx = mcp->mc_rbx;
2307 tp->tf_rdx = mcp->mc_rdx;
2308 tp->tf_rcx = mcp->mc_rcx;
2309 tp->tf_rax = mcp->mc_rax;
2310 tp->tf_rip = mcp->mc_rip;
2311 tp->tf_rflags = rflags;
2312 tp->tf_rsp = mcp->mc_rsp;
2313 tp->tf_ss = mcp->mc_ss;
2314 tp->tf_flags = mcp->mc_flags;
2315 if (tp->tf_flags & TF_HASSEGS) {
2316 tp->tf_ds = mcp->mc_ds;
2317 tp->tf_es = mcp->mc_es;
2318 tp->tf_fs = mcp->mc_fs;
2319 tp->tf_gs = mcp->mc_gs;
2320 }
2321 set_pcb_flags(pcb, PCB_FULL_IRET);
2322 if (mcp->mc_flags & _MC_HASBASES) {
2323 pcb->pcb_fsbase = mcp->mc_fsbase;
2324 pcb->pcb_gsbase = mcp->mc_gsbase;
2325 }
2326 return (0);
2327}
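/*
 * Usage note (not part of the original source): get_mcontext() and
 * set_mcontext() are the machine-dependent halves of getcontext(2),
 * setcontext(2) and swapcontext(2), as well as of signal delivery and
 * sigreturn. A minimal userland round trip looks like:
 *
 *	ucontext_t uc;
 *	getcontext(&uc);	(get_mcontext() fills uc.uc_mcontext)
 *	...
 *	setcontext(&uc);	(set_mcontext() validates and installs it)
 */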
2328
2329static void
2330get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave,
2331 size_t xfpusave_len)
2332{
2333 size_t max_len, len;
2334
2335 mcp->mc_ownedfp = fpugetregs(td);
2336 bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate[0],
2337 sizeof(mcp->mc_fpstate));
2338 mcp->mc_fpformat = fpuformat();
2339 if (!use_xsave || xfpusave_len == 0)
2340 return;
2341 max_len = cpu_max_ext_state_size - sizeof(struct savefpu);
2342 len = xfpusave_len;
2343 if (len > max_len) {
2344 len = max_len;
 2345		bzero(xfpusave + max_len, xfpusave_len - max_len);
2346 }
2347 mcp->mc_flags |= _MC_HASFPXSTATE;
2348 mcp->mc_xfpustate_len = len;
2349 bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len);
2350}
2351
2352static int
2353set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate,
2354 size_t xfpustate_len)
2355{
2356 int error;
2357
2358 if (mcp->mc_fpformat == _MC_FPFMT_NODEV)
2359 return (0);
2360 else if (mcp->mc_fpformat != _MC_FPFMT_XMM)
2361 return (EINVAL);
2362 else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) {
2363 /* We don't care what state is left in the FPU or PCB. */
2364 fpstate_drop(td);
2365 error = 0;
2366 } else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU ||
2367 mcp->mc_ownedfp == _MC_FPOWNED_PCB) {
2368 error = fpusetregs(td, (struct savefpu *)&mcp->mc_fpstate,
2369 xfpustate, xfpustate_len);
2370 } else
2371 return (EINVAL);
2372 return (error);
2373}
2374
2375void
2376fpstate_drop(struct thread *td)
2377{
2378
2379 KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu"));
2380 critical_enter();
2381 if (PCPU_GET(fpcurthread) == td)
2382 fpudrop();
2383 /*
2384 * XXX force a full drop of the fpu. The above only drops it if we
2385 * owned it.
2386 *
2387 * XXX I don't much like fpugetuserregs()'s semantics of doing a full
2388 * drop. Dropping only to the pcb matches fnsave's behaviour.
2389 * We only need to drop to !PCB_INITDONE in sendsig(). But
2390 * sendsig() is the only caller of fpugetuserregs()... perhaps we just
2391 * have too many layers.
2392 */
2393 clear_pcb_flags(curthread->td_pcb,
2394 PCB_FPUINITDONE | PCB_USERFPUINITDONE);
2395 critical_exit();
2396}
2397
2398int
2399fill_dbregs(struct thread *td, struct dbreg *dbregs)
2400{
2401 struct pcb *pcb;
2402
2403 if (td == NULL) {
2404 dbregs->dr[0] = rdr0();
2405 dbregs->dr[1] = rdr1();
2406 dbregs->dr[2] = rdr2();
2407 dbregs->dr[3] = rdr3();
2408 dbregs->dr[6] = rdr6();
2409 dbregs->dr[7] = rdr7();
2410 } else {
2411 pcb = td->td_pcb;
2412 dbregs->dr[0] = pcb->pcb_dr0;
2413 dbregs->dr[1] = pcb->pcb_dr1;
2414 dbregs->dr[2] = pcb->pcb_dr2;
2415 dbregs->dr[3] = pcb->pcb_dr3;
2416 dbregs->dr[6] = pcb->pcb_dr6;
2417 dbregs->dr[7] = pcb->pcb_dr7;
2418 }
2419 dbregs->dr[4] = 0;
2420 dbregs->dr[5] = 0;
2421 dbregs->dr[8] = 0;
2422 dbregs->dr[9] = 0;
2423 dbregs->dr[10] = 0;
2424 dbregs->dr[11] = 0;
2425 dbregs->dr[12] = 0;
2426 dbregs->dr[13] = 0;
2427 dbregs->dr[14] = 0;
2428 dbregs->dr[15] = 0;
2429 return (0);
2430}
2431
2432int
2433set_dbregs(struct thread *td, struct dbreg *dbregs)
2434{
2435 struct pcb *pcb;
2436 int i;
2437
2438 if (td == NULL) {
2439 load_dr0(dbregs->dr[0]);
2440 load_dr1(dbregs->dr[1]);
2441 load_dr2(dbregs->dr[2]);
2442 load_dr3(dbregs->dr[3]);
2443 load_dr6(dbregs->dr[6]);
2444 load_dr7(dbregs->dr[7]);
2445 } else {
2446 /*
2447 * Don't let an illegal value for dr7 get set. Specifically,
2448 * check for undefined settings. Setting these bit patterns
 2449	 * results in undefined behaviour and can lead to an unexpected
 2450	 * TRCTRAP or a general protection fault right here.
 2451	 * The upper 32 bits of dr6 and dr7 must not be set.
2452 */
2453 for (i = 0; i < 4; i++) {
2454 if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02)
2455 return (EINVAL);
2456 if (td->td_frame->tf_cs == _ucode32sel &&
2457 DBREG_DR7_LEN(dbregs->dr[7], i) == DBREG_DR7_LEN_8)
2458 return (EINVAL);
2459 }
2460 if ((dbregs->dr[6] & 0xffffffff00000000ul) != 0 ||
2461 (dbregs->dr[7] & 0xffffffff00000000ul) != 0)
2462 return (EINVAL);
2463
2464 pcb = td->td_pcb;
2465
2466 /*
2467 * Don't let a process set a breakpoint that is not within the
2468 * process's address space. If a process could do this, it
2469 * could halt the system by setting a breakpoint in the kernel
2470 * (if ddb was enabled). Thus, we need to check to make sure
2471 * that no breakpoints are being enabled for addresses outside
 2472	 * the process's address space.
2473 *
2474 * XXX - what about when the watched area of the user's
2475 * address space is written into from within the kernel
2476 * ... wouldn't that still cause a breakpoint to be generated
2477 * from within kernel mode?
2478 */
2479
2480 if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) {
2481 /* dr0 is enabled */
2482 if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS)
2483 return (EINVAL);
2484 }
2485 if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) {
2486 /* dr1 is enabled */
2487 if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS)
2488 return (EINVAL);
2489 }
2490 if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) {
2491 /* dr2 is enabled */
2492 if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS)
2493 return (EINVAL);
2494 }
2495 if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) {
2496 /* dr3 is enabled */
2497 if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS)
2498 return (EINVAL);
2499 }
2500
2501 pcb->pcb_dr0 = dbregs->dr[0];
2502 pcb->pcb_dr1 = dbregs->dr[1];
2503 pcb->pcb_dr2 = dbregs->dr[2];
2504 pcb->pcb_dr3 = dbregs->dr[3];
2505 pcb->pcb_dr6 = dbregs->dr[6];
2506 pcb->pcb_dr7 = dbregs->dr[7];
2507
2508 set_pcb_flags(pcb, PCB_DBREGS);
2509 }
2510
2511 return (0);
2512}
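/*
 * Illustrative sketch (not part of the original source; the raw DR7 bit
 * values below are spelled out from the architecturally defined layout
 * rather than taken from this file): debuggers reach fill_dbregs() and
 * set_dbregs() through ptrace(PT_GETDBREGS)/ptrace(PT_SETDBREGS).
 * Arming a 4-byte write watchpoint in dr0 might look like:
 *
 *	struct dbreg db;
 *	ptrace(PT_GETDBREGS, pid, (caddr_t)&db, 0);
 *	db.dr[0] = (unsigned long)watch_addr;
 *	db.dr[7] |= 0x1;		(L0: locally enable dr0)
 *	db.dr[7] |= 0x1 << 16;		(R/W0 = 01: break on data writes)
 *	db.dr[7] |= 0x3 << 18;		(LEN0 = 11: 4-byte watchpoint)
 *	ptrace(PT_SETDBREGS, pid, (caddr_t)&db, 0);
 */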
2513
2514void
2515reset_dbregs(void)
2516{
2517
2518 load_dr7(0); /* Turn off the control bits first */
2519 load_dr0(0);
2520 load_dr1(0);
2521 load_dr2(0);
2522 load_dr3(0);
2523 load_dr6(0);
2524}
2525
2526/*
2527 * Return > 0 if a hardware breakpoint has been hit, and the
2528 * breakpoint was in user space. Return 0, otherwise.
2529 */
2530int
2531user_dbreg_trap(register_t dr6)
2532{
2533 u_int64_t dr7;
2534 u_int64_t bp; /* breakpoint bits extracted from dr6 */
2535 int nbp; /* number of breakpoints that triggered */
2536 caddr_t addr[4]; /* breakpoint addresses */
2537 int i;
2538
2539 bp = dr6 & DBREG_DR6_BMASK;
2540 if (bp == 0) {
 2541	 * None of the breakpoint bits are set, meaning this
 2542	 * trap was not caused by any of the debug registers.
 2543	 */
 2544		return (0);
2545 return 0;
2546 }
2547
2548 dr7 = rdr7();
2549 if ((dr7 & 0x000000ff) == 0) {
 2550	 * All local and global enable bits in the dr7 register
 2551	 * are zero, thus the trap couldn't have been caused by
 2552	 * the hardware debug registers.
 2553	 */
 2554		return (0);
2555 return 0;
2556 }
2557
2558 nbp = 0;
2559
 2560	 * At least one of the breakpoints was hit; check to see
 2561	 * which ones, and whether any of them are user space addresses.
2562 * which ones and if any of them are user space addresses
2563 */
2564
2565 if (bp & 0x01) {
2566 addr[nbp++] = (caddr_t)rdr0();
2567 }
2568 if (bp & 0x02) {
2569 addr[nbp++] = (caddr_t)rdr1();
2570 }
2571 if (bp & 0x04) {
2572 addr[nbp++] = (caddr_t)rdr2();
2573 }
2574 if (bp & 0x08) {
2575 addr[nbp++] = (caddr_t)rdr3();
2576 }
2577
2578 for (i = 0; i < nbp; i++) {
2579 if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) {
2580 /*
2581 * addr[i] is in user space
2582 */
 2583			return (nbp);
2584 }
2585 }
2586
2587 /*
2588 * None of the breakpoints are in user space.
2589 */
 2590	return (0);
2591}
2592
2593/*
 2594 * pcb_flags is only modified by the current thread, or by other threads
 2595 * when the current thread is stopped. However, the current thread may
 2596 * change it from interrupt context in cpu_switch(), or in the trap handler.
 2597 * When we read-modify-write pcb_flags from C sources, the compiler may
 2598 * generate code that is not atomic with respect to the interrupt handler.
 2599 * If a trap or interrupt happens and any flag is modified from the handler,
 2600 * it can be clobbered with the cached value later. Therefore, we implement
 2601 * setting and clearing flags with single-instruction functions, which do
 2602 * not race with modification of the flags from trap or interrupt context,
 2603 * because traps and interrupts are executed only on instruction boundaries.
2604 */
2605void
2606set_pcb_flags_raw(struct pcb *pcb, const u_int flags)
2607{
2608
2609 __asm __volatile("orl %1,%0"
2610 : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags)
2611 : "cc", "memory");
2612
2613}
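/*
 * Explanatory aside (not part of the original source): the single
 * or-to-memory instruction matters because the plain C equivalent,
 *
 *	pcb->pcb_flags |= flags;
 *
 * may be compiled into separate load, or and store instructions. An
 * interrupt taken between the load and the store could modify
 * pcb_flags, and the store would then silently overwrite that update;
 * a single instruction cannot be split by an interrupt, so no such
 * window exists.
 */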
2614
2615/*
 2616 * Support for the RDFSBASE, WRFSBASE and similar instructions for the
 2617 * %fs and %gs bases requires that the kernel save MSR_FSBASE and
 2618 * MSR_{K,}GSBASE into the pcb if user space modified the bases. We
 2619 * must save them on a context switch or if the return to usermode
 2620 * happens through doreti.
 2621 *
 2622 * Tracking of both events is performed by the pcb flag PCB_FULL_IRET,
 2623 * which has the consequence that the base MSRs must be saved each time
 2624 * the flag is set. We disable interrupts to sync with context switches.
2625 */
2626void
2627set_pcb_flags(struct pcb *pcb, const u_int flags)
2628{
2629 register_t r;
2630
2631 if (curpcb == pcb &&
2632 (flags & PCB_FULL_IRET) != 0 &&
2633 (pcb->pcb_flags & PCB_FULL_IRET) == 0 &&
2634 (cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0) {
2635 r = intr_disable();
2636 if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) {
2637 if (rfs() == _ufssel)
2638 pcb->pcb_fsbase = rdfsbase();
2639 if (rgs() == _ugssel)
2640 pcb->pcb_gsbase = rdmsr(MSR_KGSBASE);
2641 }
2642 set_pcb_flags_raw(pcb, flags);
2643 intr_restore(r);
2644 } else {
2645 set_pcb_flags_raw(pcb, flags);
2646 }
2647}
2648
2649void
2650clear_pcb_flags(struct pcb *pcb, const u_int flags)
2651{
2652
2653 __asm __volatile("andl %1,%0"
2654 : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags)
2655 : "cc", "memory");
2656}
2657
2658#ifdef KDB
2659
2660/*
2661 * Provide inb() and outb() as functions. They are normally only available as
 2662 * inline functions, and thus cannot be called from the debugger.
2663 */
2664
2665/* silence compiler warnings */
2666u_char inb_(u_short);
2667void outb_(u_short, u_char);
2668
2669u_char
2670inb_(u_short port)
2671{
2672 return inb(port);
2673}
2674
2675void
2676outb_(u_short port, u_char data)
2677{
2678 outb(port, data);
2679}
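/*
 * Usage note (not part of the original source): these wrappers can be
 * invoked directly from the ddb(4) prompt, e.g.:
 *
 *	db> call inb_(0x61)
 *	db> call outb_(0x80, 1)
 */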
2680
2681#endif /* KDB */