1/*-
2 * Copyright (c) 2003 Peter Wemm.
3 * Copyright (c) 1992 Terrence R. Lambert.
4 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to Berkeley by
8 * William Jolitz.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 *    must display the following acknowledgement:
20 *	This product includes software developed by the University of
21 *	California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 *    may be used to endorse or promote products derived from this software
24 *    without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 *	from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
39 */
40
41#include <sys/cdefs.h>
42__FBSDID("$FreeBSD: stable/11/sys/amd64/amd64/machdep.c 362383 2020-06-19 13:48:23Z kib $");
43
44#include "opt_atpic.h"
45#include "opt_compat.h"
46#include "opt_cpu.h"
47#include "opt_ddb.h"
48#include "opt_inet.h"
49#include "opt_isa.h"
50#include "opt_kstack_pages.h"
51#include "opt_maxmem.h"
52#include "opt_mp_watchdog.h"
53#include "opt_perfmon.h"
54#include "opt_platform.h"
55#include "opt_sched.h"
56
57#include <sys/param.h>
58#include <sys/proc.h>
59#include <sys/systm.h>
60#include <sys/bio.h>
61#include <sys/buf.h>
62#include <sys/bus.h>
63#include <sys/callout.h>
64#include <sys/cons.h>
65#include <sys/cpu.h>
66#include <sys/efi.h>
67#include <sys/eventhandler.h>
68#include <sys/exec.h>
69#include <sys/imgact.h>
70#include <sys/kdb.h>
71#include <sys/kernel.h>
72#include <sys/ktr.h>
73#include <sys/linker.h>
74#include <sys/lock.h>
75#include <sys/malloc.h>
76#include <sys/memrange.h>
77#include <sys/msgbuf.h>
78#include <sys/mutex.h>
79#include <sys/pcpu.h>
80#include <sys/ptrace.h>
81#include <sys/reboot.h>
82#include <sys/rwlock.h>
83#include <sys/sched.h>
84#include <sys/signalvar.h>
85#ifdef SMP
86#include <sys/smp.h>
87#endif
88#include <sys/syscallsubr.h>
89#include <sys/sysctl.h>
90#include <sys/sysent.h>
91#include <sys/sysproto.h>
92#include <sys/ucontext.h>
93#include <sys/vmmeter.h>
94
95#include <vm/vm.h>
96#include <vm/vm_extern.h>
97#include <vm/vm_kern.h>
98#include <vm/vm_page.h>
99#include <vm/vm_map.h>
100#include <vm/vm_object.h>
101#include <vm/vm_pager.h>
102#include <vm/vm_param.h>
103#include <vm/vm_phys.h>
104
105#ifdef DDB
106#ifndef KDB
107#error KDB must be enabled in order for DDB to work!
108#endif
109#include <ddb/ddb.h>
110#include <ddb/db_sym.h>
111#endif
112
113#include <net/netisr.h>
114
115#include <machine/clock.h>
116#include <machine/cpu.h>
117#include <machine/cputypes.h>
118#include <machine/frame.h>
119#include <machine/intr_machdep.h>
120#include <x86/mca.h>
121#include <machine/md_var.h>
122#include <machine/metadata.h>
123#include <machine/mp_watchdog.h>
124#include <machine/pc/bios.h>
125#include <machine/pcb.h>
126#include <machine/proc.h>
127#include <machine/reg.h>
128#include <machine/sigframe.h>
129#include <machine/specialreg.h>
130#ifdef PERFMON
131#include <machine/perfmon.h>
132#endif
133#include <machine/tss.h>
134#include <x86/ucode.h>
135#ifdef SMP
136#include <machine/smp.h>
137#endif
138#ifdef FDT
139#include <x86/fdt.h>
140#endif
141
142#ifdef DEV_ATPIC
143#include <x86/isa/icu.h>
144#else
145#include <x86/apicvar.h>
146#endif
147
148#include <isa/isareg.h>
149#include <isa/rtc.h>
150#include <x86/init.h>
151
152/* Sanity check for __curthread() */
153CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
154
155/*
156 * The PTI trampoline stack needs enough space for a hardware trapframe and a
157 * couple of scratch registers, as well as the trapframe left behind after an
158 * iret fault.
159 */
160CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) -
161    offsetof(struct pti_frame, pti_rip));
162
163extern u_int64_t hammer_time(u_int64_t, u_int64_t);
164
165#define	CS_SECURE(cs)		(ISPL(cs) == SEL_UPL)
166#define	EFL_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
167
168static void cpu_startup(void *);
169static void get_fpcontext(struct thread *td, mcontext_t *mcp,
170    char *xfpusave, size_t xfpusave_len);
171static int  set_fpcontext(struct thread *td, mcontext_t *mcp,
172    char *xfpustate, size_t xfpustate_len);
173SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
174
175/* Preload data parse function */
176static caddr_t native_parse_preload_data(u_int64_t);
177
178/* Native function to fetch and parse the e820 map */
179static void native_parse_memmap(caddr_t, vm_paddr_t *, int *);
180
181/* Default init_ops implementation. */
182struct init_ops init_ops = {
183	.parse_preload_data =	native_parse_preload_data,
184	.early_clock_source_init =	i8254_init,
185	.early_delay =			i8254_delay,
186	.parse_memmap =			native_parse_memmap,
187#ifdef SMP
188	.mp_bootaddress =		mp_bootaddress,
189	.start_all_aps =		native_start_all_aps,
190#endif
191	.msi_init =			msi_init,
192};
193
194struct msgbuf *msgbufp;
195
196/*
197 * Physical address of the EFI System Table. Stashed from the metadata hints
198 * passed into the kernel and used by the EFI code to call runtime services.
199 */
200vm_paddr_t efi_systbl_phys;
201
202/* Intel ICH registers */
203#define ICH_PMBASE	0x400
204#define ICH_SMI_EN	ICH_PMBASE + 0x30
205
206int	_udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel;
207
208int cold = 1;
209
210long Maxmem = 0;
211long realmem = 0;
212
213/*
214 * The number of PHYSMAP entries must be one less than the number of
215 * PHYSSEG entries because the PHYSMAP entry that spans the largest
216 * physical address that is accessible by ISA DMA is split into two
217 * PHYSSEG entries.
218 */
219#define	PHYSMAP_SIZE	(2 * (VM_PHYSSEG_MAX - 1))
220
221vm_paddr_t phys_avail[PHYSMAP_SIZE + 2];
222vm_paddr_t dump_avail[PHYSMAP_SIZE + 2];
223
224/* must be 2 less so 0 0 can signal end of chunks */
225#define	PHYS_AVAIL_ARRAY_END (nitems(phys_avail) - 2)
226#define	DUMP_AVAIL_ARRAY_END (nitems(dump_avail) - 2)
227
228struct kva_md_info kmi;
229
230static struct trapframe proc0_tf;
231struct region_descriptor r_gdt, r_idt;
232
233struct pcpu __pcpu[MAXCPU];
234
235struct mtx icu_lock;
236
237struct mem_range_softc mem_range_softc;
238
239struct mtx dt_lock;	/* lock for GDT and LDT */
240
241void (*vmm_resume_p)(void);
242
243static void
244cpu_startup(dummy)
245	void *dummy;
246{
247	uintmax_t memsize;
248	char *sysenv;
249
250	/*
251	 * On MacBooks, we need to disallow the legacy USB circuit to
252	 * generate an SMI# because this can cause several problems,
253	 * namely: incorrect CPU frequency detection and failure to
254	 * start the APs.
255	 * We do this by disabling a bit in the SMI_EN (SMI Control and
256	 * Enable register) of the Intel ICH LPC Interface Bridge.
257	 */
258	sysenv = kern_getenv("smbios.system.product");
259	if (sysenv != NULL) {
260		if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
261		    strncmp(sysenv, "MacBook3,1", 10) == 0 ||
262		    strncmp(sysenv, "MacBook4,1", 10) == 0 ||
263		    strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
264		    strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
265		    strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
266		    strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
267		    strncmp(sysenv, "Macmini1,1", 10) == 0) {
268			if (bootverbose)
269				printf("Disabling LEGACY_USB_EN bit on "
270				    "Intel ICH.\n");
271			outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
272		}
273		freeenv(sysenv);
274	}
275
276	/*
277	 * Good {morning,afternoon,evening,night}.
278	 */
279	startrtclock();
280	printcpuinfo();
281#ifdef PERFMON
282	perfmon_init();
283#endif
284
285	/*
286	 * Display physical memory if SMBIOS reports reasonable amount.
287	 */
288	memsize = 0;
289	sysenv = kern_getenv("smbios.memory.enabled");
290	if (sysenv != NULL) {
291		memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
292		freeenv(sysenv);
293	}
294	if (memsize < ptoa((uintmax_t)vm_cnt.v_free_count))
295		memsize = ptoa((uintmax_t)Maxmem);
296	printf("real memory  = %ju (%ju MB)\n", memsize, memsize >> 20);
297	realmem = atop(memsize);
298
299	/*
300	 * Display any holes after the first chunk of extended memory.
301	 */
302	if (bootverbose) {
303		int indx;
304
305		printf("Physical memory chunk(s):\n");
306		for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
307			vm_paddr_t size;
308
309			size = phys_avail[indx + 1] - phys_avail[indx];
310			printf(
311			    "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
312			    (uintmax_t)phys_avail[indx],
313			    (uintmax_t)phys_avail[indx + 1] - 1,
314			    (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
315		}
316	}
317
318	vm_ksubmap_init(&kmi);
319
320	printf("avail memory = %ju (%ju MB)\n",
321	    ptoa((uintmax_t)vm_cnt.v_free_count),
322	    ptoa((uintmax_t)vm_cnt.v_free_count) / 1048576);
323
324	/*
325	 * Set up buffers, so they can be used to read disk labels.
326	 */
327	bufinit();
328	vm_pager_bufferinit();
329
330	cpu_setregs();
331}
332
333/*
334 * Send an interrupt to process.
335 *
336 * Stack is set up to allow sigcode stored
337 * at top to call routine, followed by call
338 * to sigreturn routine below.  After sigreturn
339 * resets the signal mask, the stack, and the
340 * frame pointer, it returns to the user
341 * specified pc, psl.
342 */
343void
344sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
345{
346	struct sigframe sf, *sfp;
347	struct pcb *pcb;
348	struct proc *p;
349	struct thread *td;
350	struct sigacts *psp;
351	char *sp;
352	struct trapframe *regs;
353	char *xfpusave;
354	size_t xfpusave_len;
355	int sig;
356	int oonstack;
357
358	td = curthread;
359	pcb = td->td_pcb;
360	p = td->td_proc;
361	PROC_LOCK_ASSERT(p, MA_OWNED);
362	sig = ksi->ksi_signo;
363	psp = p->p_sigacts;
364	mtx_assert(&psp->ps_mtx, MA_OWNED);
365	regs = td->td_frame;
366	oonstack = sigonstack(regs->tf_rsp);
367
368	if (cpu_max_ext_state_size > sizeof(struct savefpu) && use_xsave) {
369		xfpusave_len = cpu_max_ext_state_size - sizeof(struct savefpu);
370		xfpusave = __builtin_alloca(xfpusave_len);
371	} else {
372		xfpusave_len = 0;
373		xfpusave = NULL;
374	}
375
376	/* Save user context. */
377	bzero(&sf, sizeof(sf));
378	sf.sf_uc.uc_sigmask = *mask;
379	sf.sf_uc.uc_stack = td->td_sigstk;
380	sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
381	    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
382	sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
383	bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(*regs));
384	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
385	get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len);
386	fpstate_drop(td);
387	update_pcb_bases(pcb);
388	sf.sf_uc.uc_mcontext.mc_fsbase = pcb->pcb_fsbase;
389	sf.sf_uc.uc_mcontext.mc_gsbase = pcb->pcb_gsbase;
390	bzero(sf.sf_uc.uc_mcontext.mc_spare,
391	    sizeof(sf.sf_uc.uc_mcontext.mc_spare));
392	bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__));
393
394	/* Allocate space for the signal handler context. */
395	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
396	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
397		sp = (char *)td->td_sigstk.ss_sp + td->td_sigstk.ss_size;
398#if defined(COMPAT_43)
399		td->td_sigstk.ss_flags |= SS_ONSTACK;
400#endif
401	} else
402		sp = (char *)regs->tf_rsp - 128;
403	if (xfpusave != NULL) {
404		sp -= xfpusave_len;
405		sp = (char *)((unsigned long)sp & ~0x3Ful);
406		sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp;
407	}
408	sp -= sizeof(struct sigframe);
409	/* Align to 16 bytes. */
410	sfp = (struct sigframe *)((unsigned long)sp & ~0xFul);
411
412	/* Build the argument list for the signal handler. */
413	regs->tf_rdi = sig;			/* arg 1 in %rdi */
414	regs->tf_rdx = (register_t)&sfp->sf_uc;	/* arg 3 in %rdx */
415	bzero(&sf.sf_si, sizeof(sf.sf_si));
416	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
417		/* Signal handler installed with SA_SIGINFO. */
418		regs->tf_rsi = (register_t)&sfp->sf_si;	/* arg 2 in %rsi */
419		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;
420
421		/* Fill in POSIX parts */
422		sf.sf_si = ksi->ksi_info;
423		sf.sf_si.si_signo = sig; /* maybe a translated signal */
424		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
425	} else {
426		/* Old FreeBSD-style arguments. */
427		regs->tf_rsi = ksi->ksi_code;	/* arg 2 in %rsi */
428		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
429		sf.sf_ahu.sf_handler = catcher;
430	}
431	mtx_unlock(&psp->ps_mtx);
432	PROC_UNLOCK(p);
433
434	/*
435	 * Copy the sigframe out to the user's stack.
436	 */
437	if (copyout(&sf, sfp, sizeof(*sfp)) != 0 ||
438	    (xfpusave != NULL && copyout(xfpusave,
439	    (void *)sf.sf_uc.uc_mcontext.mc_xfpustate, xfpusave_len)
440	    != 0)) {
441#ifdef DEBUG
442		printf("process %ld has trashed its stack\n", (long)p->p_pid);
443#endif
444		PROC_LOCK(p);
445		sigexit(td, SIGILL);
446	}
447
448	regs->tf_rsp = (long)sfp;
449	regs->tf_rip = p->p_sysent->sv_sigcode_base;
450	regs->tf_rflags &= ~(PSL_T | PSL_D);
451	regs->tf_cs = _ucodesel;
452	regs->tf_ds = _udatasel;
453	regs->tf_ss = _udatasel;
454	regs->tf_es = _udatasel;
455	regs->tf_fs = _ufssel;
456	regs->tf_gs = _ugssel;
457	regs->tf_flags = TF_HASSEGS;
458	PROC_LOCK(p);
459	mtx_lock(&psp->ps_mtx);
460}
461
462/*
463 * System call to cleanup state after a signal
464 * has been taken.  Reset signal mask and
465 * stack state from context left by sendsig (above).
466 * Return to previous pc and psl as specified by
467 * context left by sendsig. Check carefully to
468 * make sure that the user has not modified the
469 * state to gain improper privileges.
470 *
471 * MPSAFE
472 */
473int
474sys_sigreturn(td, uap)
475	struct thread *td;
476	struct sigreturn_args /* {
477		const struct __ucontext *sigcntxp;
478	} */ *uap;
479{
480	ucontext_t uc;
481	struct pcb *pcb;
482	struct proc *p;
483	struct trapframe *regs;
484	ucontext_t *ucp;
485	char *xfpustate;
486	size_t xfpustate_len;
487	long rflags;
488	int cs, error, ret;
489	ksiginfo_t ksi;
490
491	pcb = td->td_pcb;
492	p = td->td_proc;
493
494	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
495	if (error != 0) {
496		uprintf("pid %d (%s): sigreturn copyin failed\n",
497		    p->p_pid, td->td_name);
498		return (error);
499	}
500	ucp = &uc;
501	if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) {
502		uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid,
503		    td->td_name, ucp->uc_mcontext.mc_flags);
504		return (EINVAL);
505	}
506	regs = td->td_frame;
507	rflags = ucp->uc_mcontext.mc_rflags;
508	/*
509	 * Don't allow users to change privileged or reserved flags.
510	 */
511	if (!EFL_SECURE(rflags, regs->tf_rflags)) {
512		uprintf("pid %d (%s): sigreturn rflags = 0x%lx\n", p->p_pid,
513		    td->td_name, rflags);
514		return (EINVAL);
515	}
516
517	/*
518	 * Don't allow users to load a valid privileged %cs.  Let the
519	 * hardware check for invalid selectors, excess privilege in
520	 * other selectors, invalid %eip's and invalid %esp's.
521	 */
522	cs = ucp->uc_mcontext.mc_cs;
523	if (!CS_SECURE(cs)) {
524		uprintf("pid %d (%s): sigreturn cs = 0x%x\n", p->p_pid,
525		    td->td_name, cs);
526		ksiginfo_init_trap(&ksi);
527		ksi.ksi_signo = SIGBUS;
528		ksi.ksi_code = BUS_OBJERR;
529		ksi.ksi_trapno = T_PROTFLT;
530		ksi.ksi_addr = (void *)regs->tf_rip;
531		trapsignal(td, &ksi);
532		return (EINVAL);
533	}
534
535	if ((uc.uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) {
536		xfpustate_len = uc.uc_mcontext.mc_xfpustate_len;
537		if (xfpustate_len > cpu_max_ext_state_size -
538		    sizeof(struct savefpu)) {
539			uprintf("pid %d (%s): sigreturn xfpusave_len = 0x%zx\n",
540			    p->p_pid, td->td_name, xfpustate_len);
541			return (EINVAL);
542		}
543		xfpustate = __builtin_alloca(xfpustate_len);
544		error = copyin((const void *)uc.uc_mcontext.mc_xfpustate,
545		    xfpustate, xfpustate_len);
546		if (error != 0) {
547			uprintf(
548	"pid %d (%s): sigreturn copying xfpustate failed\n",
549			    p->p_pid, td->td_name);
550			return (error);
551		}
552	} else {
553		xfpustate = NULL;
554		xfpustate_len = 0;
555	}
556	ret = set_fpcontext(td, &ucp->uc_mcontext, xfpustate, xfpustate_len);
557	if (ret != 0) {
558		uprintf("pid %d (%s): sigreturn set_fpcontext err %d\n",
559		    p->p_pid, td->td_name, ret);
560		return (ret);
561	}
562	bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(*regs));
563	update_pcb_bases(pcb);
564	pcb->pcb_fsbase = ucp->uc_mcontext.mc_fsbase;
565	pcb->pcb_gsbase = ucp->uc_mcontext.mc_gsbase;
566
567#if defined(COMPAT_43)
568	if (ucp->uc_mcontext.mc_onstack & 1)
569		td->td_sigstk.ss_flags |= SS_ONSTACK;
570	else
571		td->td_sigstk.ss_flags &= ~SS_ONSTACK;
572#endif
573
574	kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
575	return (EJUSTRETURN);
576}
577
578#ifdef COMPAT_FREEBSD4
579int
580freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap)
581{
582
583	return sys_sigreturn(td, (struct sigreturn_args *)uap);
584}
585#endif
586
587/*
588 * Reset registers to default values on exec.
589 */
590void
591exec_setregs(struct thread *td, struct image_params *imgp, u_long stack)
592{
593	struct trapframe *regs;
594	struct pcb *pcb;
595	register_t saved_rflags;
596
597	regs = td->td_frame;
598	pcb = td->td_pcb;
599
600	mtx_lock(&dt_lock);
601	if (td->td_proc->p_md.md_ldt != NULL)
602		user_ldt_free(td);
603	else
604		mtx_unlock(&dt_lock);
605
606	update_pcb_bases(pcb);
607	pcb->pcb_fsbase = 0;
608	pcb->pcb_gsbase = 0;
609	clear_pcb_flags(pcb, PCB_32BIT);
610	pcb->pcb_initial_fpucw = __INITIAL_FPUCW__;
611
612	saved_rflags = regs->tf_rflags & PSL_T;
613	bzero((char *)regs, sizeof(struct trapframe));
614	regs->tf_rip = imgp->entry_addr;
615	regs->tf_rsp = ((stack - 8) & ~0xFul) + 8;
616	regs->tf_rdi = stack;		/* argv */
617	regs->tf_rflags = PSL_USER | saved_rflags;
618	regs->tf_ss = _udatasel;
619	regs->tf_cs = _ucodesel;
620	regs->tf_ds = _udatasel;
621	regs->tf_es = _udatasel;
622	regs->tf_fs = _ufssel;
623	regs->tf_gs = _ugssel;
624	regs->tf_flags = TF_HASSEGS;
625	td->td_retval[1] = 0;
626
627	/*
628	 * Reset the hardware debug registers if they were in use.
629	 * They won't have any meaning for the newly exec'd process.
630	 */
631	if (pcb->pcb_flags & PCB_DBREGS) {
632		pcb->pcb_dr0 = 0;
633		pcb->pcb_dr1 = 0;
634		pcb->pcb_dr2 = 0;
635		pcb->pcb_dr3 = 0;
636		pcb->pcb_dr6 = 0;
637		pcb->pcb_dr7 = 0;
638		if (pcb == curpcb) {
639			/*
640			 * Clear the debug registers on the running
641			 * CPU, otherwise they will end up affecting
642			 * the next process we switch to.
643			 */
644			reset_dbregs();
645		}
646		clear_pcb_flags(pcb, PCB_DBREGS);
647	}
648
649	/*
650	 * Drop the FP state if we hold it, so that the process gets a
651	 * clean FP state if it uses the FPU again.
652	 */
653	fpstate_drop(td);
654}
655
656void
657cpu_setregs(void)
658{
659	register_t cr0;
660
661	cr0 = rcr0();
662	/*
663	 * CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the
664	 * BSP.  See the comments there about why we set them.
665	 */
666	cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
667	load_cr0(cr0);
668}
669
670/*
671 * Initialize amd64 and configure to run kernel
672 */
673
674/*
675 * Initialize segments & interrupt table
676 */
677
678struct user_segment_descriptor gdt[NGDT * MAXCPU];/* global descriptor tables */
679static struct gate_descriptor idt0[NIDT];
680struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */
681
682static char dblfault_stack[PAGE_SIZE] __aligned(16);
683static char mce0_stack[PAGE_SIZE] __aligned(16);
684static char nmi0_stack[PAGE_SIZE] __aligned(16);
685static char dbg0_stack[PAGE_SIZE] __aligned(16);
686CTASSERT(sizeof(struct nmi_pcpu) == 16);
687
688struct amd64tss common_tss[MAXCPU];
689
690/*
691 * Software prototypes -- in more palatable form.
692 *
693 * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
694 * slots as corresponding segments for i386 kernel.
695 */
696struct soft_segment_descriptor gdt_segs[] = {
697/* GNULL_SEL	0 Null Descriptor */
698{	.ssd_base = 0x0,
699	.ssd_limit = 0x0,
700	.ssd_type = 0,
701	.ssd_dpl = 0,
702	.ssd_p = 0,
703	.ssd_long = 0,
704	.ssd_def32 = 0,
705	.ssd_gran = 0		},
706/* GNULL2_SEL	1 Null Descriptor */
707{	.ssd_base = 0x0,
708	.ssd_limit = 0x0,
709	.ssd_type = 0,
710	.ssd_dpl = 0,
711	.ssd_p = 0,
712	.ssd_long = 0,
713	.ssd_def32 = 0,
714	.ssd_gran = 0		},
715/* GUFS32_SEL	2 32 bit %gs Descriptor for user */
716{	.ssd_base = 0x0,
717	.ssd_limit = 0xfffff,
718	.ssd_type = SDT_MEMRWA,
719	.ssd_dpl = SEL_UPL,
720	.ssd_p = 1,
721	.ssd_long = 0,
722	.ssd_def32 = 1,
723	.ssd_gran = 1		},
724/* GUGS32_SEL	3 32 bit %fs Descriptor for user */
725{	.ssd_base = 0x0,
726	.ssd_limit = 0xfffff,
727	.ssd_type = SDT_MEMRWA,
728	.ssd_dpl = SEL_UPL,
729	.ssd_p = 1,
730	.ssd_long = 0,
731	.ssd_def32 = 1,
732	.ssd_gran = 1		},
733/* GCODE_SEL	4 Code Descriptor for kernel */
734{	.ssd_base = 0x0,
735	.ssd_limit = 0xfffff,
736	.ssd_type = SDT_MEMERA,
737	.ssd_dpl = SEL_KPL,
738	.ssd_p = 1,
739	.ssd_long = 1,
740	.ssd_def32 = 0,
741	.ssd_gran = 1		},
742/* GDATA_SEL	5 Data Descriptor for kernel */
743{	.ssd_base = 0x0,
744	.ssd_limit = 0xfffff,
745	.ssd_type = SDT_MEMRWA,
746	.ssd_dpl = SEL_KPL,
747	.ssd_p = 1,
748	.ssd_long = 1,
749	.ssd_def32 = 0,
750	.ssd_gran = 1		},
751/* GUCODE32_SEL	6 32 bit Code Descriptor for user */
752{	.ssd_base = 0x0,
753	.ssd_limit = 0xfffff,
754	.ssd_type = SDT_MEMERA,
755	.ssd_dpl = SEL_UPL,
756	.ssd_p = 1,
757	.ssd_long = 0,
758	.ssd_def32 = 1,
759	.ssd_gran = 1		},
760/* GUDATA_SEL	7 32/64 bit Data Descriptor for user */
761{	.ssd_base = 0x0,
762	.ssd_limit = 0xfffff,
763	.ssd_type = SDT_MEMRWA,
764	.ssd_dpl = SEL_UPL,
765	.ssd_p = 1,
766	.ssd_long = 0,
767	.ssd_def32 = 1,
768	.ssd_gran = 1		},
769/* GUCODE_SEL	8 64 bit Code Descriptor for user */
770{	.ssd_base = 0x0,
771	.ssd_limit = 0xfffff,
772	.ssd_type = SDT_MEMERA,
773	.ssd_dpl = SEL_UPL,
774	.ssd_p = 1,
775	.ssd_long = 1,
776	.ssd_def32 = 0,
777	.ssd_gran = 1		},
778/* GPROC0_SEL	9 Proc 0 Tss Descriptor */
779{	.ssd_base = 0x0,
780	.ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
781	.ssd_type = SDT_SYSTSS,
782	.ssd_dpl = SEL_KPL,
783	.ssd_p = 1,
784	.ssd_long = 0,
785	.ssd_def32 = 0,
786	.ssd_gran = 0		},
787/* Actually, the TSS is a system descriptor which is double size */
788{	.ssd_base = 0x0,
789	.ssd_limit = 0x0,
790	.ssd_type = 0,
791	.ssd_dpl = 0,
792	.ssd_p = 0,
793	.ssd_long = 0,
794	.ssd_def32 = 0,
795	.ssd_gran = 0		},
796/* GUSERLDT_SEL	11 LDT Descriptor */
797{	.ssd_base = 0x0,
798	.ssd_limit = 0x0,
799	.ssd_type = 0,
800	.ssd_dpl = 0,
801	.ssd_p = 0,
802	.ssd_long = 0,
803	.ssd_def32 = 0,
804	.ssd_gran = 0		},
805/* GUSERLDT_SEL	12 LDT Descriptor, double size */
806{	.ssd_base = 0x0,
807	.ssd_limit = 0x0,
808	.ssd_type = 0,
809	.ssd_dpl = 0,
810	.ssd_p = 0,
811	.ssd_long = 0,
812	.ssd_def32 = 0,
813	.ssd_gran = 0		},
814};
815
816void
817setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
818{
819	struct gate_descriptor *ip;
820
821	ip = idt + idx;
822	ip->gd_looffset = (uintptr_t)func;
823	ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
824	ip->gd_ist = ist;
825	ip->gd_xx = 0;
826	ip->gd_type = typ;
827	ip->gd_dpl = dpl;
828	ip->gd_p = 1;
829	ip->gd_hioffset = ((uintptr_t)func)>>16 ;
830}
831
832extern inthand_t
833	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
834	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
835	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
836	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
837	IDTVEC(xmm), IDTVEC(dblfault),
838	IDTVEC(div_pti), IDTVEC(bpt_pti),
839	IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
840	IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
841	IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
842	IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
843	IDTVEC(xmm_pti),
844#ifdef KDTRACE_HOOKS
845	IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
846#endif
847#ifdef XENHVM
848	IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
849#endif
850	IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
851	IDTVEC(fast_syscall_pti);
852
853#ifdef DDB
854/*
855 * Display the index and function name of any IDT entries that don't use
856 * the default 'rsvd' entry point.
857 */
858DB_SHOW_COMMAND(idt, db_show_idt)
859{
860	struct gate_descriptor *ip;
861	int idx;
862	uintptr_t func;
863
864	ip = idt;
865	for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
866		func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
867		if (func != (uintptr_t)&IDTVEC(rsvd)) {
868			db_printf("%3d\t", idx);
869			db_printsym(func, DB_STGY_PROC);
870			db_printf("\n");
871		}
872		ip++;
873	}
874}
875
876/* Show privileged registers. */
877DB_SHOW_COMMAND(sysregs, db_show_sysregs)
878{
879	struct {
880		uint16_t limit;
881		uint64_t base;
882	} __packed idtr, gdtr;
883	uint16_t ldt, tr;
884
885	__asm __volatile("sidt %0" : "=m" (idtr));
886	db_printf("idtr\t0x%016lx/%04x\n",
887	    (u_long)idtr.base, (u_int)idtr.limit);
888	__asm __volatile("sgdt %0" : "=m" (gdtr));
889	db_printf("gdtr\t0x%016lx/%04x\n",
890	    (u_long)gdtr.base, (u_int)gdtr.limit);
891	__asm __volatile("sldt %0" : "=r" (ldt));
892	db_printf("ldtr\t0x%04x\n", ldt);
893	__asm __volatile("str %0" : "=r" (tr));
894	db_printf("tr\t0x%04x\n", tr);
895	db_printf("cr0\t0x%016lx\n", rcr0());
896	db_printf("cr2\t0x%016lx\n", rcr2());
897	db_printf("cr3\t0x%016lx\n", rcr3());
898	db_printf("cr4\t0x%016lx\n", rcr4());
899	if (rcr4() & CR4_XSAVE)
900		db_printf("xcr0\t0x%016lx\n", rxcr(0));
901	db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER));
902	if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
903		db_printf("FEATURES_CTL\t%016lx\n",
904		    rdmsr(MSR_IA32_FEATURE_CONTROL));
905	db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR));
906	db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT));
907	db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE));
908}
909
910DB_SHOW_COMMAND(dbregs, db_show_dbregs)
911{
912
913	db_printf("dr0\t0x%016lx\n", rdr0());
914	db_printf("dr1\t0x%016lx\n", rdr1());
915	db_printf("dr2\t0x%016lx\n", rdr2());
916	db_printf("dr3\t0x%016lx\n", rdr3());
917	db_printf("dr6\t0x%016lx\n", rdr6());
918	db_printf("dr7\t0x%016lx\n", rdr7());
919}
920#endif
921
922void
923sdtossd(sd, ssd)
924	struct user_segment_descriptor *sd;
925	struct soft_segment_descriptor *ssd;
926{
927
928	ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
929	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
930	ssd->ssd_type  = sd->sd_type;
931	ssd->ssd_dpl   = sd->sd_dpl;
932	ssd->ssd_p     = sd->sd_p;
933	ssd->ssd_long  = sd->sd_long;
934	ssd->ssd_def32 = sd->sd_def32;
935	ssd->ssd_gran  = sd->sd_gran;
936}
937
938void
939ssdtosd(ssd, sd)
940	struct soft_segment_descriptor *ssd;
941	struct user_segment_descriptor *sd;
942{
943
944	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
945	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
946	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
947	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
948	sd->sd_type  = ssd->ssd_type;
949	sd->sd_dpl   = ssd->ssd_dpl;
950	sd->sd_p     = ssd->ssd_p;
951	sd->sd_long  = ssd->ssd_long;
952	sd->sd_def32 = ssd->ssd_def32;
953	sd->sd_gran  = ssd->ssd_gran;
954}
955
956void
957ssdtosyssd(ssd, sd)
958	struct soft_segment_descriptor *ssd;
959	struct system_segment_descriptor *sd;
960{
961
962	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
963	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
964	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
965	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
966	sd->sd_type  = ssd->ssd_type;
967	sd->sd_dpl   = ssd->ssd_dpl;
968	sd->sd_p     = ssd->ssd_p;
969	sd->sd_gran  = ssd->ssd_gran;
970}
971
972#if !defined(DEV_ATPIC) && defined(DEV_ISA)
973#include <isa/isavar.h>
974#include <isa/isareg.h>
975/*
976 * Return a bitmap of the current interrupt requests.  This is 8259-specific
977 * and is only suitable for use at probe time.
978 * This is only here to pacify sio.  It is NOT FATAL if this doesn't work.
979 * It shouldn't be here.  There should probably be an APIC centric
980 * implementation in the apic driver code, if at all.
981 */
982intrmask_t
983isa_irq_pending(void)
984{
985	u_char irr1;
986	u_char irr2;
987
988	irr1 = inb(IO_ICU1);
989	irr2 = inb(IO_ICU2);
990	return ((irr2 << 8) | irr1);
991}
992#endif
993
994u_int basemem;
995
996static int
997add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
998    int *physmap_idxp)
999{
1000	int i, insert_idx, physmap_idx;
1001
1002	physmap_idx = *physmap_idxp;
1003
1004	if (length == 0)
1005		return (1);
1006
1007	/*
1008	 * Find insertion point while checking for overlap.  Start off by
1009	 * assuming the new entry will be added to the end.
1010	 *
1011	 * NB: physmap_idx points to the next free slot.
1012	 */
1013	insert_idx = physmap_idx;
1014	for (i = 0; i <= physmap_idx; i += 2) {
1015		if (base < physmap[i + 1]) {
1016			if (base + length <= physmap[i]) {
1017				insert_idx = i;
1018				break;
1019			}
1020			if (boothowto & RB_VERBOSE)
1021				printf(
1022		    "Overlapping memory regions, ignoring second region\n");
1023			return (1);
1024		}
1025	}
1026
1027	/* See if we can prepend to the next entry. */
1028	if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) {
1029		physmap[insert_idx] = base;
1030		return (1);
1031	}
1032
1033	/* See if we can append to the previous entry. */
1034	if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
1035		physmap[insert_idx - 1] += length;
1036		return (1);
1037	}
1038
1039	physmap_idx += 2;
1040	*physmap_idxp = physmap_idx;
1041	if (physmap_idx == PHYSMAP_SIZE) {
1042		printf(
1043		"Too many segments in the physical address map, giving up\n");
1044		return (0);
1045	}
1046
1047	/*
1048	 * Move the last 'N' entries down to make room for the new
1049	 * entry if needed.
1050	 */
1051	for (i = (physmap_idx - 2); i > insert_idx; i -= 2) {
1052		physmap[i] = physmap[i - 2];
1053		physmap[i + 1] = physmap[i - 1];
1054	}
1055
1056	/* Insert the new entry. */
1057	physmap[insert_idx] = base;
1058	physmap[insert_idx + 1] = base + length;
1059	return (1);
1060}
1061
1062void
1063bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize,
1064                      vm_paddr_t *physmap, int *physmap_idx)
1065{
1066	struct bios_smap *smap, *smapend;
1067
1068	smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
1069
1070	for (smap = smapbase; smap < smapend; smap++) {
1071		if (boothowto & RB_VERBOSE)
1072			printf("SMAP type=%02x base=%016lx len=%016lx\n",
1073			    smap->type, smap->base, smap->length);
1074
1075		if (smap->type != SMAP_TYPE_MEMORY)
1076			continue;
1077
1078		if (!add_physmap_entry(smap->base, smap->length, physmap,
1079		    physmap_idx))
1080			break;
1081	}
1082}
1083
1084static void
1085add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
1086    int *physmap_idx)
1087{
1088	struct efi_md *map, *p;
1089	const char *type;
1090	size_t efisz;
1091	int ndesc, i;
1092
1093	static const char *types[] = {
1094		"Reserved",
1095		"LoaderCode",
1096		"LoaderData",
1097		"BootServicesCode",
1098		"BootServicesData",
1099		"RuntimeServicesCode",
1100		"RuntimeServicesData",
1101		"ConventionalMemory",
1102		"UnusableMemory",
1103		"ACPIReclaimMemory",
1104		"ACPIMemoryNVS",
1105		"MemoryMappedIO",
1106		"MemoryMappedIOPortSpace",
1107		"PalCode",
1108		"PersistentMemory"
1109	};
1110
1111	/*
1112	 * Memory map data provided by UEFI via the GetMemoryMap
1113	 * Boot Services API.
1114	 */
1115	efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
1116	map = (struct efi_md *)((uint8_t *)efihdr + efisz);
1117
1118	if (efihdr->descriptor_size == 0)
1119		return;
1120	ndesc = efihdr->memory_size / efihdr->descriptor_size;
1121
1122	if (boothowto & RB_VERBOSE)
1123		printf("%23s %12s %12s %8s %4s\n",
1124		    "Type", "Physical", "Virtual", "#Pages", "Attr");
1125
1126	for (i = 0, p = map; i < ndesc; i++,
1127	    p = efi_next_descriptor(p, efihdr->descriptor_size)) {
1128		if (boothowto & RB_VERBOSE) {
1129			if (p->md_type < nitems(types))
1130				type = types[p->md_type];
1131			else
1132				type = "<INVALID>";
1133			printf("%23s %012lx %12p %08lx ", type, p->md_phys,
1134			    p->md_virt, p->md_pages);
1135			if (p->md_attr & EFI_MD_ATTR_UC)
1136				printf("UC ");
1137			if (p->md_attr & EFI_MD_ATTR_WC)
1138				printf("WC ");
1139			if (p->md_attr & EFI_MD_ATTR_WT)
1140				printf("WT ");
1141			if (p->md_attr & EFI_MD_ATTR_WB)
1142				printf("WB ");
1143			if (p->md_attr & EFI_MD_ATTR_UCE)
1144				printf("UCE ");
1145			if (p->md_attr & EFI_MD_ATTR_WP)
1146				printf("WP ");
1147			if (p->md_attr & EFI_MD_ATTR_RP)
1148				printf("RP ");
1149			if (p->md_attr & EFI_MD_ATTR_XP)
1150				printf("XP ");
1151			if (p->md_attr & EFI_MD_ATTR_NV)
1152				printf("NV ");
1153			if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE)
1154				printf("MORE_RELIABLE ");
1155			if (p->md_attr & EFI_MD_ATTR_RO)
1156				printf("RO ");
1157			if (p->md_attr & EFI_MD_ATTR_RT)
1158				printf("RUNTIME");
1159			printf("\n");
1160		}
1161
1162		switch (p->md_type) {
1163		case EFI_MD_TYPE_CODE:
1164		case EFI_MD_TYPE_DATA:
1165		case EFI_MD_TYPE_BS_CODE:
1166		case EFI_MD_TYPE_BS_DATA:
1167		case EFI_MD_TYPE_FREE:
1168			/*
1169			 * We're allowed to use any entry with these types.
1170			 */
1171			break;
1172		default:
1173			continue;
1174		}
1175
1176		if (!add_physmap_entry(p->md_phys, (p->md_pages * PAGE_SIZE),
1177		    physmap, physmap_idx))
1178			break;
1179	}
1180}
1181
1182static char bootmethod[16] = "";
1183SYSCTL_STRING(_machdep, OID_AUTO, bootmethod, CTLFLAG_RD, bootmethod, 0,
1184    "System firmware boot method");
1185
1186static void
1187native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx)
1188{
1189	struct bios_smap *smap;
1190	struct efi_map_header *efihdr;
1191	u_int32_t size;
1192
1193	/*
1194	 * Memory map from INT 15:E820.
1195	 *
1196	 * subr_module.c says:
1197	 * "Consumer may safely assume that size value precedes data."
1198	 * ie: an int32_t immediately precedes smap.
1199	 */
1200
1201	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
1202	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
1203	smap = (struct bios_smap *)preload_search_info(kmdp,
1204	    MODINFO_METADATA | MODINFOMD_SMAP);
1205	if (efihdr == NULL && smap == NULL)
1206		panic("No BIOS smap or EFI map info from loader!");
1207
1208	if (efihdr != NULL) {
1209		add_efi_map_entries(efihdr, physmap, physmap_idx);
1210		strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
1211	} else {
1212		size = *((u_int32_t *)smap - 1);
1213		bios_add_smap_entries(smap, size, physmap, physmap_idx);
1214		strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
1215	}
1216}
1217
1218#define	PAGES_PER_GB	(1024 * 1024 * 1024 / PAGE_SIZE)
1219
1220/*
1221 * Populate the (physmap) array with base/bound pairs describing the
1222 * available physical memory in the system, then test this memory and
1223 * build the phys_avail array describing the actually-available memory.
1224 *
1225 * Total memory size may be set by the kernel environment variable
1226 * hw.physmem or the compile-time define MAXMEM.
1227 *
1228 * XXX first should be vm_paddr_t.
1229 */
1230static void
1231getmemsize(caddr_t kmdp, u_int64_t first)
1232{
1233	int i, physmap_idx, pa_indx, da_indx;
1234	vm_paddr_t pa, physmap[PHYSMAP_SIZE];
1235	u_long physmem_start, physmem_tunable, memtest;
1236	pt_entry_t *pte;
1237	quad_t dcons_addr, dcons_size;
1238	int page_counter;
1239
1240	/*
1241	 * Tell the physical memory allocator about pages used to store
1242	 * the kernel and preloaded data.  See kmem_bootstrap_free().
1243	 */
1244	vm_phys_add_seg((vm_paddr_t)kernphys, trunc_page(first));
1245
1246	bzero(physmap, sizeof(physmap));
1247	physmap_idx = 0;
1248
1249	init_ops.parse_memmap(kmdp, physmap, &physmap_idx);
1250	physmap_idx -= 2;
1251
1252	/*
1253	 * Find the 'base memory' segment for SMP
1254	 */
1255	basemem = 0;
1256	for (i = 0; i <= physmap_idx; i += 2) {
1257		if (physmap[i] <= 0xA0000) {
1258			basemem = physmap[i + 1] / 1024;
1259			break;
1260		}
1261	}
1262	if (basemem == 0 || basemem > 640) {
1263		if (bootverbose)
1264			printf(
1265		"Memory map doesn't contain a basemem segment, faking it");
1266		basemem = 640;
1267	}
1268
1269	/*
1270	 * Make hole for "AP -> long mode" bootstrap code.  The
1271	 * mp_bootaddress vector is only available when the kernel
1272	 * is configured to support APs and APs for the system start
1273	 * in 32bit mode (e.g. SMP bare metal).
1274	 */
1275	if (init_ops.mp_bootaddress) {
1276		if (physmap[1] >= 0x100000000)
1277			panic(
1278	"Basemem segment is not suitable for AP bootstrap code!");
1279		physmap[1] = init_ops.mp_bootaddress(physmap[1] / 1024);
1280	}
1281
1282	/*
1283	 * Maxmem isn't the "maximum memory", it's one larger than the
1284	 * highest page of the physical address space.  It should be
1285	 * called something like "Maxphyspage".  We may adjust this
1286	 * based on ``hw.physmem'' and the results of the memory test.
1287	 */
1288	Maxmem = atop(physmap[physmap_idx + 1]);
1289
1290#ifdef MAXMEM
1291	Maxmem = MAXMEM / 4;
1292#endif
1293
1294	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
1295		Maxmem = atop(physmem_tunable);
1296
1297	/*
1298	 * The boot memory test is disabled by default, as it takes a
1299	 * significant amount of time on large-memory systems, and is
1300	 * unfriendly to virtual machines as it unnecessarily touches all
1301	 * pages.
1302	 *
1303	 * A general name is used as the code may be extended to support
1304	 * additional tests beyond the current "page present" test.
1305	 */
1306	memtest = 0;
1307	TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);
1308
1309	/*
1310	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
1311	 * in the system.
1312	 */
1313	if (Maxmem > atop(physmap[physmap_idx + 1]))
1314		Maxmem = atop(physmap[physmap_idx + 1]);
1315
1316	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
1317	    (boothowto & RB_VERBOSE))
1318		printf("Physical memory use set to %ldK\n", Maxmem * 4);
1319
1320	/* call pmap initialization to make new kernel address space */
1321	pmap_bootstrap(&first);
1322
1323	/*
1324	 * Size up each available chunk of physical memory.
1325	 *
1326	 * XXX Some BIOSes corrupt low 64KB between suspend and resume.
1327	 * By default, mask off the first 16 pages unless we appear to be
1328	 * running in a VM.
1329	 */
1330	physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
1331	TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
1332	if (physmap[0] < physmem_start) {
1333		if (physmem_start < PAGE_SIZE)
1334			physmap[0] = PAGE_SIZE;
1335		else if (physmem_start >= physmap[1])
1336			physmap[0] = round_page(physmap[1] - PAGE_SIZE);
1337		else
1338			physmap[0] = round_page(physmem_start);
1339	}
1340	pa_indx = 0;
1341	da_indx = 1;
1342	phys_avail[pa_indx++] = physmap[0];
1343	phys_avail[pa_indx] = physmap[0];
1344	dump_avail[da_indx] = physmap[0];
1345	pte = CMAP1;
1346
1347	/*
1348	 * Get dcons buffer address
1349	 */
1350	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
1351	    getenv_quad("dcons.size", &dcons_size) == 0)
1352		dcons_addr = 0;
1353
1354	/*
1355	 * physmap is in bytes, so when converting to page boundaries,
1356	 * round up the start address and round down the end address.
1357	 */
1358	page_counter = 0;
1359	if (memtest != 0)
1360		printf("Testing system memory");
1361	for (i = 0; i <= physmap_idx; i += 2) {
1362		vm_paddr_t end;
1363
1364		end = ptoa((vm_paddr_t)Maxmem);
1365		if (physmap[i + 1] < end)
1366			end = trunc_page(physmap[i + 1]);
1367		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
1368			int tmp, page_bad, full;
1369			int *ptr = (int *)CADDR1;
1370
1371			full = FALSE;
1372			/*
1373			 * block out kernel memory as not available.
1374			 */
1375			if (pa >= (vm_paddr_t)kernphys && pa < first)
1376				goto do_dump_avail;
1377
1378			/*
1379			 * block out dcons buffer
1380			 */
1381			if (dcons_addr > 0
1382			    && pa >= trunc_page(dcons_addr)
1383			    && pa < dcons_addr + dcons_size)
1384				goto do_dump_avail;
1385
1386			page_bad = FALSE;
1387			if (memtest == 0)
1388				goto skip_memtest;
1389
1390			/*
1391			 * Print a "." every GB to show we're making
1392			 * progress.
1393			 */
1394			page_counter++;
1395			if ((page_counter % PAGES_PER_GB) == 0)
1396				printf(".");
1397
1398			/*
1399			 * map page into kernel: valid, read/write,non-cacheable
1400			 */
1401			*pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
1402			invltlb();
1403
1404			tmp = *(int *)ptr;
1405			/*
1406			 * Test for alternating 1's and 0's
1407			 */
1408			*(volatile int *)ptr = 0xaaaaaaaa;
1409			if (*(volatile int *)ptr != 0xaaaaaaaa)
1410				page_bad = TRUE;
1411			/*
1412			 * Test for alternating 0's and 1's
1413			 */
1414			*(volatile int *)ptr = 0x55555555;
1415			if (*(volatile int *)ptr != 0x55555555)
1416				page_bad = TRUE;
1417			/*
1418			 * Test for all 1's
1419			 */
1420			*(volatile int *)ptr = 0xffffffff;
1421			if (*(volatile int *)ptr != 0xffffffff)
1422				page_bad = TRUE;
1423			/*
1424			 * Test for all 0's
1425			 */
1426			*(volatile int *)ptr = 0x0;
1427			if (*(volatile int *)ptr != 0x0)
1428				page_bad = TRUE;
1429			/*
1430			 * Restore original value.
1431			 */
1432			*(int *)ptr = tmp;
1433
1434skip_memtest:
1435			/*
1436			 * Adjust array of valid/good pages.
1437			 */
1438			if (page_bad == TRUE)
1439				continue;
1440			/*
1441			 * If this good page is a continuation of the
1442			 * previous set of good pages, then just increase
1443			 * the end pointer. Otherwise start a new chunk.
1444			 * Note that "end" points one higher than end,
1445			 * making the range >= start and < end.
1446			 * If we're also doing a speculative memory
1447			 * test and we at or past the end, bump up Maxmem
1448			 * so that we keep going. The first bad page
1449			 * will terminate the loop.
1450			 */
1451			if (phys_avail[pa_indx] == pa) {
1452				phys_avail[pa_indx] += PAGE_SIZE;
1453			} else {
1454				pa_indx++;
1455				if (pa_indx == PHYS_AVAIL_ARRAY_END) {
1456					printf(
1457		"Too many holes in the physical address space, giving up\n");
1458					pa_indx--;
1459					full = TRUE;
1460					goto do_dump_avail;
1461				}
1462				phys_avail[pa_indx++] = pa;	/* start */
1463				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
1464			}
1465			physmem++;
1466do_dump_avail:
1467			if (dump_avail[da_indx] == pa) {
1468				dump_avail[da_indx] += PAGE_SIZE;
1469			} else {
1470				da_indx++;
1471				if (da_indx == DUMP_AVAIL_ARRAY_END) {
1472					da_indx--;
1473					goto do_next;
1474				}
1475				dump_avail[da_indx++] = pa; /* start */
1476				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
1477			}
1478do_next:
1479			if (full)
1480				break;
1481		}
1482	}
1483	*pte = 0;
1484	invltlb();
1485	if (memtest != 0)
1486		printf("\n");
1487
1488	/*
1489	 * XXX
1490	 * The last chunk must contain at least one page plus the message
1491	 * buffer to avoid complicating other code (message buffer address
1492	 * calculation, etc.).
1493	 */
1494	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
1495	    round_page(msgbufsize) >= phys_avail[pa_indx]) {
1496		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
1497		phys_avail[pa_indx--] = 0;
1498		phys_avail[pa_indx--] = 0;
1499	}
1500
1501	Maxmem = atop(phys_avail[pa_indx]);
1502
1503	/* Trim off space for the message buffer. */
1504	phys_avail[pa_indx] -= round_page(msgbufsize);
1505
1506	/* Map the message buffer. */
1507	msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
1508}
1509
1510static caddr_t
1511native_parse_preload_data(u_int64_t modulep)
1512{
1513	caddr_t kmdp;
1514	char *envp;
1515#ifdef DDB
1516	vm_offset_t ksym_start;
1517	vm_offset_t ksym_end;
1518#endif
1519
1520	preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
1521	preload_bootstrap_relocate(KERNBASE);
1522	kmdp = preload_search_by_type("elf kernel");
1523	if (kmdp == NULL)
1524		kmdp = preload_search_by_type("elf64 kernel");
1525	boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
1526	envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *);
1527	if (envp != NULL)
1528		envp += KERNBASE;
1529	init_static_kenv(envp, 0);
1530#ifdef DDB
1531	ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
1532	ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
1533	db_fetch_ksymtab(ksym_start, ksym_end);
1534#endif
1535	efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);
1536
1537	return (kmdp);
1538}
1539
1540static void
1541amd64_kdb_init(void)
1542{
1543	kdb_init();
1544#ifdef KDB
1545	if (boothowto & RB_KDB)
1546		kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
1547#endif
1548}
1549
1550/* Set up the fast syscall stuff */
1551void
1552amd64_conf_fast_syscall(void)
1553{
1554	uint64_t msr;
1555
1556	msr = rdmsr(MSR_EFER) | EFER_SCE;
1557	wrmsr(MSR_EFER, msr);
1558	wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) :
1559	    (u_int64_t)IDTVEC(fast_syscall));
1560	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
1561	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
1562	    ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
1563	wrmsr(MSR_STAR, msr);
1564	wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D);
1565}
1566
1567u_int64_t
1568hammer_time(u_int64_t modulep, u_int64_t physfree)
1569{
1570	caddr_t kmdp;
1571	int gsel_tss, x;
1572	struct pcpu *pc;
1573	struct nmi_pcpu *np;
1574	struct xstate_hdr *xhdr;
1575	u_int64_t rsp0;
1576	char *env;
1577	size_t kstack0_sz;
1578	int late_console;
1579
1580	kmdp = init_ops.parse_preload_data(modulep);
1581
1582	physfree += ucode_load_bsp(physfree + KERNBASE);
1583	physfree = roundup2(physfree, PAGE_SIZE);
1584
1585	identify_cpu1();
1586	identify_hypervisor();
1587	/*
1588	 * hw.cpu_stdext_disable is ignored by the call, it will be
1589	 * re-evaluted by the below call to finishidentcpu().
1590	 */
1591	identify_cpu2();
1592
1593	link_elf_ireloc(kmdp);
1594
1595	/*
1596	 * This may be done better later if it gets more high level
1597	 * components in it. If so just link td->td_proc here.
1598	 */
1599	proc_linkup0(&proc0, &thread0);
1600
1601	/* Init basic tunables, hz etc */
1602	init_param1();
1603
1604	thread0.td_kstack = physfree + KERNBASE;
1605	thread0.td_kstack_pages = kstack_pages;
1606	kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
1607	bzero((void *)thread0.td_kstack, kstack0_sz);
1608	physfree += kstack0_sz;
1609
1610	/*
1611	 * make gdt memory segments
1612	 */
1613	for (x = 0; x < NGDT; x++) {
1614		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
1615		    x != GUSERLDT_SEL && x != (GUSERLDT_SEL) + 1)
1616			ssdtosd(&gdt_segs[x], &gdt[x]);
1617	}
1618	gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&common_tss[0];
1619	ssdtosyssd(&gdt_segs[GPROC0_SEL],
1620	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
1621
1622	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
1623	r_gdt.rd_base =  (long) gdt;
1624	lgdt(&r_gdt);
1625	pc = &__pcpu[0];
1626
1627	wrmsr(MSR_FSBASE, 0);		/* User value */
1628	wrmsr(MSR_GSBASE, (u_int64_t)pc);
1629	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */
1630
1631	pcpu_init(pc, 0, sizeof(struct pcpu));
1632	dpcpu_init((void *)(physfree + KERNBASE), 0);
1633	physfree += DPCPU_SIZE;
1634	PCPU_SET(prvspace, pc);
1635	PCPU_SET(curthread, &thread0);
1636	/* Non-late cninit() and printf() can be moved up to here. */
1637	PCPU_SET(tssp, &common_tss[0]);
1638	PCPU_SET(commontssp, &common_tss[0]);
1639	PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
1640	PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
1641	PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
1642	PCPU_SET(gs32p, &gdt[GUGS32_SEL]);
1643
1644	/*
1645	 * Initialize mutexes.
1646	 *
1647	 * icu_lock: in order to allow an interrupt to occur in a critical
1648	 * 	     section, to set pcpu->ipending (etc...) properly, we
1649	 *	     must be able to get the icu lock, so it can't be
1650	 *	     under witness.
1651	 */
1652	mutex_init();
1653	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
1654	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);
1655
1656	/* exceptions */
1657	pti = pti_get_default();
1658	TUNABLE_INT_FETCH("vm.pmap.pti", &pti);
1659
1660	for (x = 0; x < NIDT; x++)
1661		setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT,
1662		    SEL_KPL, 0);
1663	setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
1664	    SEL_KPL, 0);
1665	setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
1666	setidt(IDT_NMI, &IDTVEC(nmi),  SDT_SYSIGT, SEL_KPL, 2);
1667	setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
1668	    SEL_UPL, 0);
1669	setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT,
1670	    SEL_UPL, 0);
1671	setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT,
1672	    SEL_KPL, 0);
1673	setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT,
1674	    SEL_KPL, 0);
1675	setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT,
1676	    SEL_KPL, 0);
1677	setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
1678	setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm),
1679	    SDT_SYSIGT, SEL_KPL, 0);
1680	setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT,
1681	    SEL_KPL, 0);
1682	setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing),
1683	    SDT_SYSIGT, SEL_KPL, 0);
1684	setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT,
1685	    SEL_KPL, 0);
1686	setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT,
1687	    SEL_KPL, 0);
1688	setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT,
1689	    SEL_KPL, 0);
1690	setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT,
1691	    SEL_KPL, 0);
1692	setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
1693	    SEL_KPL, 0);
1694	setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3);
1695	setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
1696	    SEL_KPL, 0);
1697#ifdef KDTRACE_HOOKS
1698	setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) :
1699	    &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
1700#endif
1701#ifdef XENHVM
1702	setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) :
1703	    &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0);
1704#endif
1705	r_idt.rd_limit = sizeof(idt0) - 1;
1706	r_idt.rd_base = (long) idt;
1707	lidt(&r_idt);
1708
1709	/*
1710	 * Initialize the clock before the console so that console
1711	 * initialization can use DELAY().
1712	 */
1713	clock_init();
1714
1715	/*
1716	 * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4)
1717	 * transition).
1718	 * Once bootblocks have updated, we can test directly for
1719	 * efi_systbl != NULL here...
1720	 */
1721	if (preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP)
1722	    != NULL)
1723		vty_set_preferred(VTY_VT);
1724
1725	TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
1726	TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable);
1727	TUNABLE_INT_FETCH("hw.mds_disable", &hw_mds_disable);
1728	TUNABLE_INT_FETCH("machdep.mitigations.taa.enable", &x86_taa_enable);
1729
1730	TUNABLE_INT_FETCH("machdep.mitigations.rndgs.enable",
1731	    &x86_rngds_mitg_enable);
1732
1733	finishidentcpu();	/* Final stage of CPU initialization */
1734	initializecpu();	/* Initialize CPU registers */
1735	initializecpucache();
1736
1737	/* doublefault stack space, runs on ist1 */
1738	common_tss[0].tss_ist1 = (long)&dblfault_stack[sizeof(dblfault_stack)];
1739
1740	/*
1741	 * NMI stack, runs on ist2.  The pcpu pointer is stored just
1742	 * above the start of the ist2 stack.
1743	 */
1744	np = ((struct nmi_pcpu *) &nmi0_stack[sizeof(nmi0_stack)]) - 1;
1745	np->np_pcpu = (register_t) pc;
1746	common_tss[0].tss_ist2 = (long) np;
1747
1748	/*
1749	 * MC# stack, runs on ist3.  The pcpu pointer is stored just
1750	 * above the start of the ist3 stack.
1751	 */
1752	np = ((struct nmi_pcpu *) &mce0_stack[sizeof(mce0_stack)]) - 1;
1753	np->np_pcpu = (register_t) pc;
1754	common_tss[0].tss_ist3 = (long) np;
1755
1756	/*
1757	 * DB# stack, runs on ist4.
1758	 */
1759	np = ((struct nmi_pcpu *) &dbg0_stack[sizeof(dbg0_stack)]) - 1;
1760	np->np_pcpu = (register_t) pc;
1761	common_tss[0].tss_ist4 = (long) np;
1762
1763	/* Set the IO permission bitmap (empty due to tss seg limit) */
1764	common_tss[0].tss_iobase = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE;
1765
1766	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
1767	ltr(gsel_tss);
1768
1769	amd64_conf_fast_syscall();
1770
1771	/*
1772	 * Temporary forge some valid pointer to PCB, for exception
1773	 * handlers.  It is reinitialized properly below after FPU is
1774	 * set up.  Also set up td_critnest to short-cut the page
1775	 * fault handler.
1776	 */
1777	cpu_max_ext_state_size = sizeof(struct savefpu);
1778	thread0.td_pcb = get_pcb_td(&thread0);
1779	thread0.td_critnest = 1;
1780
1781	/*
1782	 * The console and kdb should be initialized even earlier than here,
1783	 * but some console drivers don't work until after getmemsize().
1784	 * Default to late console initialization to support these drivers.
1785	 * This loses mainly printf()s in getmemsize() and early debugging.
1786	 */
1787	late_console = 1;
1788	TUNABLE_INT_FETCH("debug.late_console", &late_console);
1789	if (!late_console) {
1790		cninit();
1791		amd64_kdb_init();
1792	}
1793
1794	getmemsize(kmdp, physfree);
1795	init_param2(physmem);
1796
1797	/* now running on new page tables, configured,and u/iom is accessible */
1798
1799	if (late_console)
1800		cninit();
1801
1802#ifdef DEV_ISA
1803#ifdef DEV_ATPIC
1804	elcr_probe();
1805	atpic_startup();
1806#else
1807	/* Reset and mask the atpics and leave them shut down. */
1808	atpic_reset();
1809
1810	/*
1811	 * Point the ICU spurious interrupt vectors at the APIC spurious
1812	 * interrupt handler.
1813	 */
1814	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
1815	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
1816#endif
1817#else
1818#error "have you forgotten the isa device?";
1819#endif
1820
1821	if (late_console)
1822		amd64_kdb_init();
1823
1824	msgbufinit(msgbufp, msgbufsize);
1825	fpuinit();
1826
1827	/*
1828	 * Set up thread0 pcb after fpuinit calculated pcb + fpu save
1829	 * area size.  Zero out the extended state header in fpu save
1830	 * area.
1831	 */
1832	thread0.td_pcb = get_pcb_td(&thread0);
1833	thread0.td_pcb->pcb_save = get_pcb_user_save_td(&thread0);
1834	bzero(get_pcb_user_save_td(&thread0), cpu_max_ext_state_size);
1835	if (use_xsave) {
1836		xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) +
1837		    1);
1838		xhdr->xstate_bv = xsave_mask;
1839	}
1840	/* make an initial tss so cpu can get interrupt stack on syscall! */
1841	rsp0 = (vm_offset_t)thread0.td_pcb;
1842	/* Ensure the stack is aligned to 16 bytes */
1843	rsp0 &= ~0xFul;
1844	common_tss[0].tss_rsp0 = rsp0;
1845	PCPU_SET(rsp0, rsp0);
1846	PCPU_SET(pti_rsp0, ((vm_offset_t)PCPU_PTR(pti_stack) +
1847	    PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful);
1848	PCPU_SET(curpcb, thread0.td_pcb);
1849
1850	/* transfer to user mode */
1851
1852	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
1853	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
1854	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
1855	_ufssel = GSEL(GUFS32_SEL, SEL_UPL);
1856	_ugssel = GSEL(GUGS32_SEL, SEL_UPL);
1857
1858	load_ds(_udatasel);
1859	load_es(_udatasel);
1860	load_fs(_ufssel);
1861
1862	/* setup proc 0's pcb */
1863	thread0.td_pcb->pcb_flags = 0;
1864	thread0.td_frame = &proc0_tf;
1865
1866        env = kern_getenv("kernelname");
1867	if (env != NULL)
1868		strlcpy(kernelname, env, sizeof(kernelname));
1869
1870	cpu_probe_amdc1e();
1871
1872#ifdef FDT
1873	x86_init_fdt();
1874#endif
1875	thread0.td_critnest = 0;
1876
1877	TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
1878	TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable);
1879	TUNABLE_INT_FETCH("hw.mds_disable", &hw_mds_disable);
1880
1881	/* Location of kernel stack for locore */
1882	return ((u_int64_t)thread0.td_pcb);
1883}
1884
1885void
1886cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
1887{
1888
1889	pcpu->pc_acpi_id = 0xffffffff;
1890}
1891
1892static int
1893smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
1894{
1895	struct bios_smap *smapbase;
1896	struct bios_smap_xattr smap;
1897	caddr_t kmdp;
1898	uint32_t *smapattr;
1899	int count, error, i;
1900
1901	/* Retrieve the system memory map from the loader. */
1902	kmdp = preload_search_by_type("elf kernel");
1903	if (kmdp == NULL)
1904		kmdp = preload_search_by_type("elf64 kernel");
1905	smapbase = (struct bios_smap *)preload_search_info(kmdp,
1906	    MODINFO_METADATA | MODINFOMD_SMAP);
1907	if (smapbase == NULL)
1908		return (0);
1909	smapattr = (uint32_t *)preload_search_info(kmdp,
1910	    MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
1911	count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase);
1912	error = 0;
1913	for (i = 0; i < count; i++) {
1914		smap.base = smapbase[i].base;
1915		smap.length = smapbase[i].length;
1916		smap.type = smapbase[i].type;
1917		if (smapattr != NULL)
1918			smap.xattr = smapattr[i];
1919		else
1920			smap.xattr = 0;
1921		error = SYSCTL_OUT(req, &smap, sizeof(smap));
1922	}
1923	return (error);
1924}
1925SYSCTL_PROC(_machdep, OID_AUTO, smap, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
1926    smap_sysctl_handler, "S,bios_smap_xattr", "Raw BIOS SMAP data");
1927
1928static int
1929efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
1930{
1931	struct efi_map_header *efihdr;
1932	caddr_t kmdp;
1933	uint32_t efisize;
1934
1935	kmdp = preload_search_by_type("elf kernel");
1936	if (kmdp == NULL)
1937		kmdp = preload_search_by_type("elf64 kernel");
1938	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
1939	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
1940	if (efihdr == NULL)
1941		return (0);
1942	efisize = *((uint32_t *)efihdr - 1);
1943	return (SYSCTL_OUT(req, efihdr, efisize));
1944}
1945SYSCTL_PROC(_machdep, OID_AUTO, efi_map, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
1946    efi_map_sysctl_handler, "S,efi_map_header", "Raw EFI Memory Map");
1947
1948void
1949spinlock_enter(void)
1950{
1951	struct thread *td;
1952	register_t flags;
1953
1954	td = curthread;
1955	if (td->td_md.md_spinlock_count == 0) {
1956		flags = intr_disable();
1957		td->td_md.md_spinlock_count = 1;
1958		td->td_md.md_saved_flags = flags;
1959	} else
1960		td->td_md.md_spinlock_count++;
1961	critical_enter();
1962}
1963
1964void
1965spinlock_exit(void)
1966{
1967	struct thread *td;
1968	register_t flags;
1969
1970	td = curthread;
1971	critical_exit();
1972	flags = td->td_md.md_saved_flags;
1973	td->td_md.md_spinlock_count--;
1974	if (td->td_md.md_spinlock_count == 0)
1975		intr_restore(flags);
1976}
1977
1978/*
1979 * Construct a PCB from a trapframe. This is called from kdb_trap() where
1980 * we want to start a backtrace from the function that caused us to enter
1981 * the debugger. We have the context in the trapframe, but base the trace
1982 * on the PCB. The PCB doesn't have to be perfect, as long as it contains
1983 * enough for a backtrace.
1984 */
1985void
1986makectx(struct trapframe *tf, struct pcb *pcb)
1987{
1988
1989	pcb->pcb_r12 = tf->tf_r12;
1990	pcb->pcb_r13 = tf->tf_r13;
1991	pcb->pcb_r14 = tf->tf_r14;
1992	pcb->pcb_r15 = tf->tf_r15;
1993	pcb->pcb_rbp = tf->tf_rbp;
1994	pcb->pcb_rbx = tf->tf_rbx;
1995	pcb->pcb_rip = tf->tf_rip;
1996	pcb->pcb_rsp = tf->tf_rsp;
1997}
1998
1999int
2000ptrace_set_pc(struct thread *td, unsigned long addr)
2001{
2002
2003	td->td_frame->tf_rip = addr;
2004	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
2005	return (0);
2006}
2007
2008int
2009ptrace_single_step(struct thread *td)
2010{
2011
2012	PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
2013	if ((td->td_frame->tf_rflags & PSL_T) == 0) {
2014		td->td_frame->tf_rflags |= PSL_T;
2015		td->td_dbgflags |= TDB_STEP;
2016	}
2017	return (0);
2018}
2019
2020int
2021ptrace_clear_single_step(struct thread *td)
2022{
2023	PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
2024	td->td_frame->tf_rflags &= ~PSL_T;
2025	td->td_dbgflags &= ~TDB_STEP;
2026	return (0);
2027}
2028
2029int
2030fill_regs(struct thread *td, struct reg *regs)
2031{
2032	struct trapframe *tp;
2033
2034	tp = td->td_frame;
2035	return (fill_frame_regs(tp, regs));
2036}
2037
2038int
2039fill_frame_regs(struct trapframe *tp, struct reg *regs)
2040{
2041
2042	regs->r_r15 = tp->tf_r15;
2043	regs->r_r14 = tp->tf_r14;
2044	regs->r_r13 = tp->tf_r13;
2045	regs->r_r12 = tp->tf_r12;
2046	regs->r_r11 = tp->tf_r11;
2047	regs->r_r10 = tp->tf_r10;
2048	regs->r_r9  = tp->tf_r9;
2049	regs->r_r8  = tp->tf_r8;
2050	regs->r_rdi = tp->tf_rdi;
2051	regs->r_rsi = tp->tf_rsi;
2052	regs->r_rbp = tp->tf_rbp;
2053	regs->r_rbx = tp->tf_rbx;
2054	regs->r_rdx = tp->tf_rdx;
2055	regs->r_rcx = tp->tf_rcx;
2056	regs->r_rax = tp->tf_rax;
2057	regs->r_rip = tp->tf_rip;
2058	regs->r_cs = tp->tf_cs;
2059	regs->r_rflags = tp->tf_rflags;
2060	regs->r_rsp = tp->tf_rsp;
2061	regs->r_ss = tp->tf_ss;
2062	if (tp->tf_flags & TF_HASSEGS) {
2063		regs->r_ds = tp->tf_ds;
2064		regs->r_es = tp->tf_es;
2065		regs->r_fs = tp->tf_fs;
2066		regs->r_gs = tp->tf_gs;
2067	} else {
2068		regs->r_ds = 0;
2069		regs->r_es = 0;
2070		regs->r_fs = 0;
2071		regs->r_gs = 0;
2072	}
2073	regs->r_err = 0;
2074	regs->r_trapno = 0;
2075	return (0);
2076}
2077
2078int
2079set_regs(struct thread *td, struct reg *regs)
2080{
2081	struct trapframe *tp;
2082	register_t rflags;
2083
2084	tp = td->td_frame;
2085	rflags = regs->r_rflags & 0xffffffff;
2086	if (!EFL_SECURE(rflags, tp->tf_rflags) || !CS_SECURE(regs->r_cs))
2087		return (EINVAL);
2088	tp->tf_r15 = regs->r_r15;
2089	tp->tf_r14 = regs->r_r14;
2090	tp->tf_r13 = regs->r_r13;
2091	tp->tf_r12 = regs->r_r12;
2092	tp->tf_r11 = regs->r_r11;
2093	tp->tf_r10 = regs->r_r10;
2094	tp->tf_r9  = regs->r_r9;
2095	tp->tf_r8  = regs->r_r8;
2096	tp->tf_rdi = regs->r_rdi;
2097	tp->tf_rsi = regs->r_rsi;
2098	tp->tf_rbp = regs->r_rbp;
2099	tp->tf_rbx = regs->r_rbx;
2100	tp->tf_rdx = regs->r_rdx;
2101	tp->tf_rcx = regs->r_rcx;
2102	tp->tf_rax = regs->r_rax;
2103	tp->tf_rip = regs->r_rip;
2104	tp->tf_cs = regs->r_cs;
2105	tp->tf_rflags = rflags;
2106	tp->tf_rsp = regs->r_rsp;
2107	tp->tf_ss = regs->r_ss;
2108	if (0) {	/* XXXKIB */
2109		tp->tf_ds = regs->r_ds;
2110		tp->tf_es = regs->r_es;
2111		tp->tf_fs = regs->r_fs;
2112		tp->tf_gs = regs->r_gs;
2113		tp->tf_flags = TF_HASSEGS;
2114	}
2115	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
2116	return (0);
2117}
2118
2119/* XXX check all this stuff! */
2120/* externalize from sv_xmm */
2121static void
2122fill_fpregs_xmm(struct savefpu *sv_xmm, struct fpreg *fpregs)
2123{
2124	struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
2125	struct envxmm *penv_xmm = &sv_xmm->sv_env;
2126	int i;
2127
2128	/* pcb -> fpregs */
2129	bzero(fpregs, sizeof(*fpregs));
2130
2131	/* FPU control/status */
2132	penv_fpreg->en_cw = penv_xmm->en_cw;
2133	penv_fpreg->en_sw = penv_xmm->en_sw;
2134	penv_fpreg->en_tw = penv_xmm->en_tw;
2135	penv_fpreg->en_opcode = penv_xmm->en_opcode;
2136	penv_fpreg->en_rip = penv_xmm->en_rip;
2137	penv_fpreg->en_rdp = penv_xmm->en_rdp;
2138	penv_fpreg->en_mxcsr = penv_xmm->en_mxcsr;
2139	penv_fpreg->en_mxcsr_mask = penv_xmm->en_mxcsr_mask;
2140
2141	/* FPU registers */
2142	for (i = 0; i < 8; ++i)
2143		bcopy(sv_xmm->sv_fp[i].fp_acc.fp_bytes, fpregs->fpr_acc[i], 10);
2144
2145	/* SSE registers */
2146	for (i = 0; i < 16; ++i)
2147		bcopy(sv_xmm->sv_xmm[i].xmm_bytes, fpregs->fpr_xacc[i], 16);
2148}
2149
2150/* internalize from fpregs into sv_xmm */
2151static void
2152set_fpregs_xmm(struct fpreg *fpregs, struct savefpu *sv_xmm)
2153{
2154	struct envxmm *penv_xmm = &sv_xmm->sv_env;
2155	struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
2156	int i;
2157
2158	/* fpregs -> pcb */
2159	/* FPU control/status */
2160	penv_xmm->en_cw = penv_fpreg->en_cw;
2161	penv_xmm->en_sw = penv_fpreg->en_sw;
2162	penv_xmm->en_tw = penv_fpreg->en_tw;
2163	penv_xmm->en_opcode = penv_fpreg->en_opcode;
2164	penv_xmm->en_rip = penv_fpreg->en_rip;
2165	penv_xmm->en_rdp = penv_fpreg->en_rdp;
2166	penv_xmm->en_mxcsr = penv_fpreg->en_mxcsr;
2167	penv_xmm->en_mxcsr_mask = penv_fpreg->en_mxcsr_mask & cpu_mxcsr_mask;
2168
2169	/* FPU registers */
2170	for (i = 0; i < 8; ++i)
2171		bcopy(fpregs->fpr_acc[i], sv_xmm->sv_fp[i].fp_acc.fp_bytes, 10);
2172
2173	/* SSE registers */
2174	for (i = 0; i < 16; ++i)
2175		bcopy(fpregs->fpr_xacc[i], sv_xmm->sv_xmm[i].xmm_bytes, 16);
2176}
2177
2178/* externalize from td->pcb */
2179int
2180fill_fpregs(struct thread *td, struct fpreg *fpregs)
2181{
2182
2183	KASSERT(td == curthread || TD_IS_SUSPENDED(td) ||
2184	    P_SHOULDSTOP(td->td_proc),
2185	    ("not suspended thread %p", td));
2186	fpugetregs(td);
2187	fill_fpregs_xmm(get_pcb_user_save_td(td), fpregs);
2188	return (0);
2189}
2190
2191/* internalize to td->pcb */
2192int
2193set_fpregs(struct thread *td, struct fpreg *fpregs)
2194{
2195
2196	critical_enter();
2197	set_fpregs_xmm(fpregs, get_pcb_user_save_td(td));
2198	fpuuserinited(td);
2199	critical_exit();
2200	return (0);
2201}
2202
2203/*
2204 * Get machine context.
2205 */
2206int
2207get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
2208{
2209	struct pcb *pcb;
2210	struct trapframe *tp;
2211
2212	pcb = td->td_pcb;
2213	tp = td->td_frame;
2214	PROC_LOCK(curthread->td_proc);
2215	mcp->mc_onstack = sigonstack(tp->tf_rsp);
2216	PROC_UNLOCK(curthread->td_proc);
2217	mcp->mc_r15 = tp->tf_r15;
2218	mcp->mc_r14 = tp->tf_r14;
2219	mcp->mc_r13 = tp->tf_r13;
2220	mcp->mc_r12 = tp->tf_r12;
2221	mcp->mc_r11 = tp->tf_r11;
2222	mcp->mc_r10 = tp->tf_r10;
2223	mcp->mc_r9  = tp->tf_r9;
2224	mcp->mc_r8  = tp->tf_r8;
2225	mcp->mc_rdi = tp->tf_rdi;
2226	mcp->mc_rsi = tp->tf_rsi;
2227	mcp->mc_rbp = tp->tf_rbp;
2228	mcp->mc_rbx = tp->tf_rbx;
2229	mcp->mc_rcx = tp->tf_rcx;
2230	mcp->mc_rflags = tp->tf_rflags;
2231	if (flags & GET_MC_CLEAR_RET) {
2232		mcp->mc_rax = 0;
2233		mcp->mc_rdx = 0;
2234		mcp->mc_rflags &= ~PSL_C;
2235	} else {
2236		mcp->mc_rax = tp->tf_rax;
2237		mcp->mc_rdx = tp->tf_rdx;
2238	}
2239	mcp->mc_rip = tp->tf_rip;
2240	mcp->mc_cs = tp->tf_cs;
2241	mcp->mc_rsp = tp->tf_rsp;
2242	mcp->mc_ss = tp->tf_ss;
2243	mcp->mc_ds = tp->tf_ds;
2244	mcp->mc_es = tp->tf_es;
2245	mcp->mc_fs = tp->tf_fs;
2246	mcp->mc_gs = tp->tf_gs;
2247	mcp->mc_flags = tp->tf_flags;
2248	mcp->mc_len = sizeof(*mcp);
2249	get_fpcontext(td, mcp, NULL, 0);
2250	update_pcb_bases(pcb);
2251	mcp->mc_fsbase = pcb->pcb_fsbase;
2252	mcp->mc_gsbase = pcb->pcb_gsbase;
2253	mcp->mc_xfpustate = 0;
2254	mcp->mc_xfpustate_len = 0;
2255	bzero(mcp->mc_spare, sizeof(mcp->mc_spare));
2256	return (0);
2257}
2258
2259/*
2260 * Set machine context.
2261 *
2262 * However, we don't set any but the user modifiable flags, and we won't
2263 * touch the cs selector.
2264 */
2265int
2266set_mcontext(struct thread *td, mcontext_t *mcp)
2267{
2268	struct pcb *pcb;
2269	struct trapframe *tp;
2270	char *xfpustate;
2271	long rflags;
2272	int ret;
2273
2274	pcb = td->td_pcb;
2275	tp = td->td_frame;
2276	if (mcp->mc_len != sizeof(*mcp) ||
2277	    (mcp->mc_flags & ~_MC_FLAG_MASK) != 0)
2278		return (EINVAL);
2279	rflags = (mcp->mc_rflags & PSL_USERCHANGE) |
2280	    (tp->tf_rflags & ~PSL_USERCHANGE);
2281	if (mcp->mc_flags & _MC_HASFPXSTATE) {
2282		if (mcp->mc_xfpustate_len > cpu_max_ext_state_size -
2283		    sizeof(struct savefpu))
2284			return (EINVAL);
2285		xfpustate = __builtin_alloca(mcp->mc_xfpustate_len);
2286		ret = copyin((void *)mcp->mc_xfpustate, xfpustate,
2287		    mcp->mc_xfpustate_len);
2288		if (ret != 0)
2289			return (ret);
2290	} else
2291		xfpustate = NULL;
2292	ret = set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len);
2293	if (ret != 0)
2294		return (ret);
2295	tp->tf_r15 = mcp->mc_r15;
2296	tp->tf_r14 = mcp->mc_r14;
2297	tp->tf_r13 = mcp->mc_r13;
2298	tp->tf_r12 = mcp->mc_r12;
2299	tp->tf_r11 = mcp->mc_r11;
2300	tp->tf_r10 = mcp->mc_r10;
2301	tp->tf_r9  = mcp->mc_r9;
2302	tp->tf_r8  = mcp->mc_r8;
2303	tp->tf_rdi = mcp->mc_rdi;
2304	tp->tf_rsi = mcp->mc_rsi;
2305	tp->tf_rbp = mcp->mc_rbp;
2306	tp->tf_rbx = mcp->mc_rbx;
2307	tp->tf_rdx = mcp->mc_rdx;
2308	tp->tf_rcx = mcp->mc_rcx;
2309	tp->tf_rax = mcp->mc_rax;
2310	tp->tf_rip = mcp->mc_rip;
2311	tp->tf_rflags = rflags;
2312	tp->tf_rsp = mcp->mc_rsp;
2313	tp->tf_ss = mcp->mc_ss;
2314	tp->tf_flags = mcp->mc_flags;
2315	if (tp->tf_flags & TF_HASSEGS) {
2316		tp->tf_ds = mcp->mc_ds;
2317		tp->tf_es = mcp->mc_es;
2318		tp->tf_fs = mcp->mc_fs;
2319		tp->tf_gs = mcp->mc_gs;
2320	}
2321	set_pcb_flags(pcb, PCB_FULL_IRET);
2322	if (mcp->mc_flags & _MC_HASBASES) {
2323		pcb->pcb_fsbase = mcp->mc_fsbase;
2324		pcb->pcb_gsbase = mcp->mc_gsbase;
2325	}
2326	return (0);
2327}
2328
2329static void
2330get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave,
2331    size_t xfpusave_len)
2332{
2333	size_t max_len, len;
2334
2335	mcp->mc_ownedfp = fpugetregs(td);
2336	bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate[0],
2337	    sizeof(mcp->mc_fpstate));
2338	mcp->mc_fpformat = fpuformat();
2339	if (!use_xsave || xfpusave_len == 0)
2340		return;
2341	max_len = cpu_max_ext_state_size - sizeof(struct savefpu);
2342	len = xfpusave_len;
2343	if (len > max_len) {
2344		len = max_len;
2345		bzero(xfpusave + max_len, len - max_len);
2346	}
2347	mcp->mc_flags |= _MC_HASFPXSTATE;
2348	mcp->mc_xfpustate_len = len;
2349	bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len);
2350}
2351
2352static int
2353set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate,
2354    size_t xfpustate_len)
2355{
2356	int error;
2357
2358	if (mcp->mc_fpformat == _MC_FPFMT_NODEV)
2359		return (0);
2360	else if (mcp->mc_fpformat != _MC_FPFMT_XMM)
2361		return (EINVAL);
2362	else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) {
2363		/* We don't care what state is left in the FPU or PCB. */
2364		fpstate_drop(td);
2365		error = 0;
2366	} else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU ||
2367	    mcp->mc_ownedfp == _MC_FPOWNED_PCB) {
2368		error = fpusetregs(td, (struct savefpu *)&mcp->mc_fpstate,
2369		    xfpustate, xfpustate_len);
2370	} else
2371		return (EINVAL);
2372	return (error);
2373}
2374
2375void
2376fpstate_drop(struct thread *td)
2377{
2378
2379	KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu"));
2380	critical_enter();
2381	if (PCPU_GET(fpcurthread) == td)
2382		fpudrop();
2383	/*
2384	 * XXX force a full drop of the fpu.  The above only drops it if we
2385	 * owned it.
2386	 *
2387	 * XXX I don't much like fpugetuserregs()'s semantics of doing a full
2388	 * drop.  Dropping only to the pcb matches fnsave's behaviour.
2389	 * We only need to drop to !PCB_INITDONE in sendsig().  But
2390	 * sendsig() is the only caller of fpugetuserregs()... perhaps we just
2391	 * have too many layers.
2392	 */
2393	clear_pcb_flags(curthread->td_pcb,
2394	    PCB_FPUINITDONE | PCB_USERFPUINITDONE);
2395	critical_exit();
2396}
2397
2398int
2399fill_dbregs(struct thread *td, struct dbreg *dbregs)
2400{
2401	struct pcb *pcb;
2402
2403	if (td == NULL) {
2404		dbregs->dr[0] = rdr0();
2405		dbregs->dr[1] = rdr1();
2406		dbregs->dr[2] = rdr2();
2407		dbregs->dr[3] = rdr3();
2408		dbregs->dr[6] = rdr6();
2409		dbregs->dr[7] = rdr7();
2410	} else {
2411		pcb = td->td_pcb;
2412		dbregs->dr[0] = pcb->pcb_dr0;
2413		dbregs->dr[1] = pcb->pcb_dr1;
2414		dbregs->dr[2] = pcb->pcb_dr2;
2415		dbregs->dr[3] = pcb->pcb_dr3;
2416		dbregs->dr[6] = pcb->pcb_dr6;
2417		dbregs->dr[7] = pcb->pcb_dr7;
2418	}
2419	dbregs->dr[4] = 0;
2420	dbregs->dr[5] = 0;
2421	dbregs->dr[8] = 0;
2422	dbregs->dr[9] = 0;
2423	dbregs->dr[10] = 0;
2424	dbregs->dr[11] = 0;
2425	dbregs->dr[12] = 0;
2426	dbregs->dr[13] = 0;
2427	dbregs->dr[14] = 0;
2428	dbregs->dr[15] = 0;
2429	return (0);
2430}
2431
2432int
2433set_dbregs(struct thread *td, struct dbreg *dbregs)
2434{
2435	struct pcb *pcb;
2436	int i;
2437
2438	if (td == NULL) {
2439		load_dr0(dbregs->dr[0]);
2440		load_dr1(dbregs->dr[1]);
2441		load_dr2(dbregs->dr[2]);
2442		load_dr3(dbregs->dr[3]);
2443		load_dr6(dbregs->dr[6]);
2444		load_dr7(dbregs->dr[7]);
2445	} else {
2446		/*
2447		 * Don't let an illegal value for dr7 get set.  Specifically,
2448		 * check for undefined settings.  Setting these bit patterns
2449		 * result in undefined behaviour and can lead to an unexpected
2450		 * TRCTRAP or a general protection fault right here.
2451		 * Upper bits of dr6 and dr7 must not be set
2452		 */
2453		for (i = 0; i < 4; i++) {
2454			if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02)
2455				return (EINVAL);
2456			if (td->td_frame->tf_cs == _ucode32sel &&
2457			    DBREG_DR7_LEN(dbregs->dr[7], i) == DBREG_DR7_LEN_8)
2458				return (EINVAL);
2459		}
2460		if ((dbregs->dr[6] & 0xffffffff00000000ul) != 0 ||
2461		    (dbregs->dr[7] & 0xffffffff00000000ul) != 0)
2462			return (EINVAL);
2463
2464		pcb = td->td_pcb;
2465
2466		/*
2467		 * Don't let a process set a breakpoint that is not within the
2468		 * process's address space.  If a process could do this, it
2469		 * could halt the system by setting a breakpoint in the kernel
2470		 * (if ddb was enabled).  Thus, we need to check to make sure
2471		 * that no breakpoints are being enabled for addresses outside
2472		 * process's address space.
2473		 *
2474		 * XXX - what about when the watched area of the user's
2475		 * address space is written into from within the kernel
2476		 * ... wouldn't that still cause a breakpoint to be generated
2477		 * from within kernel mode?
2478		 */
2479
2480		if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) {
2481			/* dr0 is enabled */
2482			if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS)
2483				return (EINVAL);
2484		}
2485		if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) {
2486			/* dr1 is enabled */
2487			if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS)
2488				return (EINVAL);
2489		}
2490		if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) {
2491			/* dr2 is enabled */
2492			if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS)
2493				return (EINVAL);
2494		}
2495		if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) {
2496			/* dr3 is enabled */
2497			if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS)
2498				return (EINVAL);
2499		}
2500
2501		pcb->pcb_dr0 = dbregs->dr[0];
2502		pcb->pcb_dr1 = dbregs->dr[1];
2503		pcb->pcb_dr2 = dbregs->dr[2];
2504		pcb->pcb_dr3 = dbregs->dr[3];
2505		pcb->pcb_dr6 = dbregs->dr[6];
2506		pcb->pcb_dr7 = dbregs->dr[7];
2507
2508		set_pcb_flags(pcb, PCB_DBREGS);
2509	}
2510
2511	return (0);
2512}
2513
2514void
2515reset_dbregs(void)
2516{
2517
2518	load_dr7(0);	/* Turn off the control bits first */
2519	load_dr0(0);
2520	load_dr1(0);
2521	load_dr2(0);
2522	load_dr3(0);
2523	load_dr6(0);
2524}
2525
2526/*
2527 * Return > 0 if a hardware breakpoint has been hit, and the
2528 * breakpoint was in user space.  Return 0, otherwise.
2529 */
2530int
2531user_dbreg_trap(register_t dr6)
2532{
2533        u_int64_t dr7;
2534        u_int64_t bp;       /* breakpoint bits extracted from dr6 */
2535        int nbp;            /* number of breakpoints that triggered */
2536        caddr_t addr[4];    /* breakpoint addresses */
2537        int i;
2538
2539        bp = dr6 & DBREG_DR6_BMASK;
2540        if (bp == 0) {
2541                /*
2542                 * None of the breakpoint bits are set meaning this
2543                 * trap was not caused by any of the debug registers
2544                 */
2545                return 0;
2546        }
2547
2548        dr7 = rdr7();
2549        if ((dr7 & 0x000000ff) == 0) {
2550                /*
2551                 * all GE and LE bits in the dr7 register are zero,
2552                 * thus the trap couldn't have been caused by the
2553                 * hardware debug registers
2554                 */
2555                return 0;
2556        }
2557
2558        nbp = 0;
2559
2560        /*
2561         * at least one of the breakpoints were hit, check to see
2562         * which ones and if any of them are user space addresses
2563         */
2564
2565        if (bp & 0x01) {
2566                addr[nbp++] = (caddr_t)rdr0();
2567        }
2568        if (bp & 0x02) {
2569                addr[nbp++] = (caddr_t)rdr1();
2570        }
2571        if (bp & 0x04) {
2572                addr[nbp++] = (caddr_t)rdr2();
2573        }
2574        if (bp & 0x08) {
2575                addr[nbp++] = (caddr_t)rdr3();
2576        }
2577
2578        for (i = 0; i < nbp; i++) {
2579                if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) {
2580                        /*
2581                         * addr[i] is in user space
2582                         */
2583                        return nbp;
2584                }
2585        }
2586
2587        /*
2588         * None of the breakpoints are in user space.
2589         */
2590        return 0;
2591}
2592
2593/*
2594 * The pcb_flags is only modified by current thread, or by other threads
2595 * when current thread is stopped.  However, current thread may change it
2596 * from the interrupt context in cpu_switch(), or in the trap handler.
2597 * When we read-modify-write pcb_flags from C sources, compiler may generate
2598 * code that is not atomic regarding the interrupt handler.  If a trap or
2599 * interrupt happens and any flag is modified from the handler, it can be
2600 * clobbered with the cached value later.  Therefore, we implement setting
2601 * and clearing flags with single-instruction functions, which do not race
2602 * with possible modification of the flags from the trap or interrupt context,
2603 * because traps and interrupts are executed only on instruction boundary.
2604 */
2605void
2606set_pcb_flags_raw(struct pcb *pcb, const u_int flags)
2607{
2608
2609	__asm __volatile("orl %1,%0"
2610	    : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags)
2611	    : "cc", "memory");
2612
2613}
2614
2615/*
2616 * The support for RDFSBASE, WRFSBASE and similar instructions for %gs
2617 * base requires that kernel saves MSR_FSBASE and MSR_{K,}GSBASE into
2618 * pcb if user space modified the bases.  We must save on the context
2619 * switch or if the return to usermode happens through the doreti.
2620 *
2621 * Tracking of both events is performed by the pcb flag PCB_FULL_IRET,
2622 * which have a consequence that the base MSRs must be saved each time
2623 * the PCB_FULL_IRET flag is set.  We disable interrupts to sync with
2624 * context switches.
2625 */
2626void
2627set_pcb_flags(struct pcb *pcb, const u_int flags)
2628{
2629	register_t r;
2630
2631	if (curpcb == pcb &&
2632	    (flags & PCB_FULL_IRET) != 0 &&
2633	    (pcb->pcb_flags & PCB_FULL_IRET) == 0 &&
2634	    (cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0) {
2635		r = intr_disable();
2636		if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) {
2637			if (rfs() == _ufssel)
2638				pcb->pcb_fsbase = rdfsbase();
2639			if (rgs() == _ugssel)
2640				pcb->pcb_gsbase = rdmsr(MSR_KGSBASE);
2641		}
2642		set_pcb_flags_raw(pcb, flags);
2643		intr_restore(r);
2644	} else {
2645		set_pcb_flags_raw(pcb, flags);
2646	}
2647}
2648
2649void
2650clear_pcb_flags(struct pcb *pcb, const u_int flags)
2651{
2652
2653	__asm __volatile("andl %1,%0"
2654	    : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags)
2655	    : "cc", "memory");
2656}
2657
2658#ifdef KDB
2659
2660/*
2661 * Provide inb() and outb() as functions.  They are normally only available as
2662 * inline functions, thus cannot be called from the debugger.
2663 */
2664
2665/* silence compiler warnings */
2666u_char inb_(u_short);
2667void outb_(u_short, u_char);
2668
2669u_char
2670inb_(u_short port)
2671{
2672	return inb(port);
2673}
2674
2675void
2676outb_(u_short port, u_char data)
2677{
2678	outb(port, data);
2679}
2680
2681#endif /* KDB */
2682