/*-
 * Copyright (c) 2003 Peter Wemm.
 * Copyright (c) 1992 Terrence R. Lambert.
 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/11/sys/amd64/amd64/machdep.c 336963 2018-07-31 10:18:30Z kib $");

#include "opt_atpic.h"
#include "opt_compat.h"
#include "opt_cpu.h"
#include "opt_ddb.h"
#include "opt_inet.h"
#include "opt_isa.h"
#include "opt_kstack_pages.h"
#include "opt_maxmem.h"
#include "opt_mp_watchdog.h"
#include "opt_perfmon.h"
#include "opt_platform.h"
#include "opt_sched.h"

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/bus.h>
#include <sys/callout.h>
#include <sys/cons.h>
#include <sys/cpu.h>
#include <sys/efi.h>
#include <sys/eventhandler.h>
#include <sys/exec.h>
#include <sys/imgact.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/linker.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/memrange.h>
#include <sys/msgbuf.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/ptrace.h>
#include <sys/reboot.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#ifdef SMP
#include <sys/smp.h>
#endif
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/ucontext.h>
#include <sys/vmmeter.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#include <vm/vm_param.h>

#ifdef DDB
#ifndef KDB
#error KDB must be enabled in order for DDB to work!
#endif
#include <ddb/ddb.h>
#include <ddb/db_sym.h>
#endif

#include <net/netisr.h>

#include <machine/clock.h>
#include <machine/cpu.h>
#include <machine/cputypes.h>
#include <machine/frame.h>
#include <machine/intr_machdep.h>
#include <x86/mca.h>
#include <machine/md_var.h>
#include <machine/metadata.h>
#include <machine/mp_watchdog.h>
#include <machine/pc/bios.h>
#include <machine/pcb.h>
#include <machine/proc.h>
#include <machine/reg.h>
#include <machine/sigframe.h>
#include <machine/specialreg.h>
#ifdef PERFMON
#include <machine/perfmon.h>
#endif
#include <machine/tss.h>
#ifdef SMP
#include <machine/smp.h>
#endif
#ifdef FDT
#include <x86/fdt.h>
#endif

#ifdef DEV_ATPIC
#include <x86/isa/icu.h>
#else
#include <x86/apicvar.h>
#endif

#include <isa/isareg.h>
#include <isa/rtc.h>
#include <x86/init.h>

/* Sanity check for __curthread() */
CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);

/*
 * The PTI trampoline stack needs enough space for a hardware trapframe and a
 * couple of scratch registers, as well as the trapframe left behind after an
 * iret fault.
 */
CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) -
    offsetof(struct pti_frame, pti_rip));

extern u_int64_t hammer_time(u_int64_t, u_int64_t);

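/*
 * CS_SECURE() accepts only plain user-privilege code selectors, and
 * EFL_SECURE() accepts only rflags changes confined to PSL_USERCHANGE;
 * both vet user-supplied signal contexts before sigreturn reloads them.
 */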
#define	CS_SECURE(cs)		(ISPL(cs) == SEL_UPL)
#define	EFL_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)

static void cpu_startup(void *);
static void get_fpcontext(struct thread *td, mcontext_t *mcp,
    char *xfpusave, size_t xfpusave_len);
static int  set_fpcontext(struct thread *td, mcontext_t *mcp,
    char *xfpustate, size_t xfpustate_len);
SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);

/* Preload data parse function */
static caddr_t native_parse_preload_data(u_int64_t);

/* Native function to fetch and parse the e820 map */
static void native_parse_memmap(caddr_t, vm_paddr_t *, int *);

/* Default init_ops implementation. */
struct init_ops init_ops = {
	.parse_preload_data =	native_parse_preload_data,
	.early_clock_source_init =	i8254_init,
	.early_delay =			i8254_delay,
	.parse_memmap =			native_parse_memmap,
#ifdef SMP
	.mp_bootaddress =		mp_bootaddress,
	.start_all_aps =		native_start_all_aps,
#endif
	.msi_init =			msi_init,
};

struct msgbuf *msgbufp;

/*
 * Physical address of the EFI System Table. Stashed from the metadata hints
 * passed into the kernel and used by the EFI code to call runtime services.
 */
vm_paddr_t efi_systbl_phys;

/* Intel ICH registers */
#define ICH_PMBASE	0x400
#define ICH_SMI_EN	(ICH_PMBASE + 0x30)

int	_udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel;

int cold = 1;

long Maxmem = 0;
long realmem = 0;

/*
 * The number of PHYSMAP entries must be one less than the number of
 * PHYSSEG entries because the PHYSMAP entry that spans the largest
 * physical address that is accessible by ISA DMA is split into two
 * PHYSSEG entries.
 */
#define	PHYSMAP_SIZE	(2 * (VM_PHYSSEG_MAX - 1))

vm_paddr_t phys_avail[PHYSMAP_SIZE + 2];
vm_paddr_t dump_avail[PHYSMAP_SIZE + 2];

/* must be 2 less so 0 0 can signal end of chunks */
#define	PHYS_AVAIL_ARRAY_END (nitems(phys_avail) - 2)
#define	DUMP_AVAIL_ARRAY_END (nitems(dump_avail) - 2)

struct kva_md_info kmi;

static struct trapframe proc0_tf;
struct region_descriptor r_gdt, r_idt;

struct pcpu __pcpu[MAXCPU];

struct mtx icu_lock;

struct mem_range_softc mem_range_softc;

struct mtx dt_lock;	/* lock for GDT and LDT */

void (*vmm_resume_p)(void);

static void
cpu_startup(void *dummy)
{
	uintmax_t memsize;
	char *sysenv;

	/*
	 * On MacBooks, we have to prevent the legacy USB circuit from
	 * generating an SMI#, which causes several problems, namely:
	 * incorrect CPU frequency detection and failure to start the
	 * APs.
	 * We do this by disabling a bit in the SMI_EN (SMI Control and
	 * Enable register) of the Intel ICH LPC Interface Bridge.
	 */
	sysenv = kern_getenv("smbios.system.product");
	if (sysenv != NULL) {
		if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
		    strncmp(sysenv, "MacBook3,1", 10) == 0 ||
		    strncmp(sysenv, "MacBook4,1", 10) == 0 ||
		    strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
		    strncmp(sysenv, "Macmini1,1", 10) == 0) {
			if (bootverbose)
				printf("Disabling LEGACY_USB_EN bit on "
				    "Intel ICH.\n");
			outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
		}
		freeenv(sysenv);
	}

	/*
	 * Good {morning,afternoon,evening,night}.
	 */
	startrtclock();
	printcpuinfo();
#ifdef PERFMON
	perfmon_init();
#endif

	/*
	 * Display physical memory if SMBIOS reports reasonable amount.
	 */
	memsize = 0;
	sysenv = kern_getenv("smbios.memory.enabled");
	if (sysenv != NULL) {
		memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
		freeenv(sysenv);
	}
	if (memsize < ptoa((uintmax_t)vm_cnt.v_free_count))
		memsize = ptoa((uintmax_t)Maxmem);
	printf("real memory  = %ju (%ju MB)\n", memsize, memsize >> 20);
	realmem = atop(memsize);

	/*
	 * Display any holes after the first chunk of extended memory.
	 */
	if (bootverbose) {
		int indx;

		printf("Physical memory chunk(s):\n");
		for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
			vm_paddr_t size;

			size = phys_avail[indx + 1] - phys_avail[indx];
			printf(
			    "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
			    (uintmax_t)phys_avail[indx],
			    (uintmax_t)phys_avail[indx + 1] - 1,
			    (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
		}
	}

	vm_ksubmap_init(&kmi);

	printf("avail memory = %ju (%ju MB)\n",
	    ptoa((uintmax_t)vm_cnt.v_free_count),
	    ptoa((uintmax_t)vm_cnt.v_free_count) / 1048576);

	/*
	 * Set up buffers, so they can be used to read disk labels.
	 */
	bufinit();
	vm_pager_bufferinit();

	cpu_setregs();
}

/*
 * Send a signal to the process.
 *
 * The stack is set up so that the sigcode stored at its top invokes
 * the handler and then calls the sigreturn routine below.  After
 * sigreturn resets the signal mask, the stack, and the frame pointer,
 * it returns to the user-specified pc and psl.
 */
void
sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
	struct sigframe sf, *sfp;
	struct pcb *pcb;
	struct proc *p;
	struct thread *td;
	struct sigacts *psp;
	char *sp;
	struct trapframe *regs;
	char *xfpusave;
	size_t xfpusave_len;
	int sig;
	int oonstack;

	td = curthread;
	pcb = td->td_pcb;
	p = td->td_proc;
	PROC_LOCK_ASSERT(p, MA_OWNED);
	sig = ksi->ksi_signo;
	psp = p->p_sigacts;
	mtx_assert(&psp->ps_mtx, MA_OWNED);
	regs = td->td_frame;
	oonstack = sigonstack(regs->tf_rsp);

	if (cpu_max_ext_state_size > sizeof(struct savefpu) && use_xsave) {
		xfpusave_len = cpu_max_ext_state_size - sizeof(struct savefpu);
		xfpusave = __builtin_alloca(xfpusave_len);
	} else {
		xfpusave_len = 0;
		xfpusave = NULL;
	}

	/* Save user context. */
	bzero(&sf, sizeof(sf));
	sf.sf_uc.uc_sigmask = *mask;
	sf.sf_uc.uc_stack = td->td_sigstk;
	sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
	    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
	sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
	bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(*regs));
	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
	get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len);
	fpstate_drop(td);
	update_pcb_bases(pcb);
	sf.sf_uc.uc_mcontext.mc_fsbase = pcb->pcb_fsbase;
	sf.sf_uc.uc_mcontext.mc_gsbase = pcb->pcb_gsbase;
	bzero(sf.sf_uc.uc_mcontext.mc_spare,
	    sizeof(sf.sf_uc.uc_mcontext.mc_spare));
	bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__));

	/* Allocate space for the signal handler context. */
	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		sp = (char *)td->td_sigstk.ss_sp + td->td_sigstk.ss_size;
#if defined(COMPAT_43)
		td->td_sigstk.ss_flags |= SS_ONSTACK;
#endif
	} else
		sp = (char *)regs->tf_rsp - 128;
	if (xfpusave != NULL) {
		sp -= xfpusave_len;
		sp = (char *)((unsigned long)sp & ~0x3Ful);
		sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp;
	}
	sp -= sizeof(struct sigframe);
	/* Align to 16 bytes. */
	sfp = (struct sigframe *)((unsigned long)sp & ~0xFul);

	/* Build the argument list for the signal handler. */
	regs->tf_rdi = sig;			/* arg 1 in %rdi */
	regs->tf_rdx = (register_t)&sfp->sf_uc;	/* arg 3 in %rdx */
	bzero(&sf.sf_si, sizeof(sf.sf_si));
	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
		/* Signal handler installed with SA_SIGINFO. */
		regs->tf_rsi = (register_t)&sfp->sf_si;	/* arg 2 in %rsi */
		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;

		/* Fill in POSIX parts */
		sf.sf_si = ksi->ksi_info;
		sf.sf_si.si_signo = sig; /* maybe a translated signal */
		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
	} else {
		/* Old FreeBSD-style arguments. */
		regs->tf_rsi = ksi->ksi_code;	/* arg 2 in %rsi */
		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
		sf.sf_ahu.sf_handler = catcher;
	}
	mtx_unlock(&psp->ps_mtx);
	PROC_UNLOCK(p);

	/*
	 * Copy the sigframe out to the user's stack.
	 */
	if (copyout(&sf, sfp, sizeof(*sfp)) != 0 ||
	    (xfpusave != NULL && copyout(xfpusave,
	    (void *)sf.sf_uc.uc_mcontext.mc_xfpustate, xfpusave_len)
	    != 0)) {
#ifdef DEBUG
		printf("process %ld has trashed its stack\n", (long)p->p_pid);
#endif
		PROC_LOCK(p);
		sigexit(td, SIGILL);
	}

	regs->tf_rsp = (long)sfp;
	regs->tf_rip = p->p_sysent->sv_sigcode_base;
	regs->tf_rflags &= ~(PSL_T | PSL_D);
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_ss = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _ufssel;
	regs->tf_gs = _ugssel;
	regs->tf_flags = TF_HASSEGS;
	PROC_LOCK(p);
	mtx_lock(&psp->ps_mtx);
}

/*
 * System call to cleanup state after a signal
 * has been taken.  Reset signal mask and
 * stack state from context left by sendsig (above).
 * Return to previous pc and psl as specified by
 * context left by sendsig. Check carefully to
 * make sure that the user has not modified the
 * state to gain improper privileges.
 *
 * MPSAFE
 */
int
sys_sigreturn(struct thread *td, struct sigreturn_args *uap)
{
	ucontext_t uc;
	struct pcb *pcb;
	struct proc *p;
	struct trapframe *regs;
	ucontext_t *ucp;
	char *xfpustate;
	size_t xfpustate_len;
	long rflags;
	int cs, error, ret;
	ksiginfo_t ksi;

	pcb = td->td_pcb;
	p = td->td_proc;

	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
	if (error != 0) {
		uprintf("pid %d (%s): sigreturn copyin failed\n",
		    p->p_pid, td->td_name);
		return (error);
	}
	ucp = &uc;
	if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) {
		uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid,
		    td->td_name, ucp->uc_mcontext.mc_flags);
		return (EINVAL);
	}
	regs = td->td_frame;
	rflags = ucp->uc_mcontext.mc_rflags;
	/*
	 * Don't allow users to change privileged or reserved flags.
	 */
	if (!EFL_SECURE(rflags, regs->tf_rflags)) {
		uprintf("pid %d (%s): sigreturn rflags = 0x%lx\n", p->p_pid,
		    td->td_name, rflags);
		return (EINVAL);
	}

	/*
	 * Don't allow users to load a valid privileged %cs.  Let the
	 * hardware check for invalid selectors, excess privilege in
	 * other selectors, invalid %eip's and invalid %esp's.
	 */
	cs = ucp->uc_mcontext.mc_cs;
	if (!CS_SECURE(cs)) {
		uprintf("pid %d (%s): sigreturn cs = 0x%x\n", p->p_pid,
		    td->td_name, cs);
		ksiginfo_init_trap(&ksi);
		ksi.ksi_signo = SIGBUS;
		ksi.ksi_code = BUS_OBJERR;
		ksi.ksi_trapno = T_PROTFLT;
		ksi.ksi_addr = (void *)regs->tf_rip;
		trapsignal(td, &ksi);
		return (EINVAL);
	}

	if ((uc.uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) {
		xfpustate_len = uc.uc_mcontext.mc_xfpustate_len;
		if (xfpustate_len > cpu_max_ext_state_size -
		    sizeof(struct savefpu)) {
			uprintf("pid %d (%s): sigreturn xfpusave_len = 0x%zx\n",
			    p->p_pid, td->td_name, xfpustate_len);
			return (EINVAL);
		}
		xfpustate = __builtin_alloca(xfpustate_len);
		error = copyin((const void *)uc.uc_mcontext.mc_xfpustate,
		    xfpustate, xfpustate_len);
		if (error != 0) {
			uprintf(
	"pid %d (%s): sigreturn copying xfpustate failed\n",
			    p->p_pid, td->td_name);
			return (error);
		}
	} else {
		xfpustate = NULL;
		xfpustate_len = 0;
	}
	ret = set_fpcontext(td, &ucp->uc_mcontext, xfpustate, xfpustate_len);
	if (ret != 0) {
		uprintf("pid %d (%s): sigreturn set_fpcontext err %d\n",
		    p->p_pid, td->td_name, ret);
		return (ret);
	}
	bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(*regs));
	update_pcb_bases(pcb);
	pcb->pcb_fsbase = ucp->uc_mcontext.mc_fsbase;
	pcb->pcb_gsbase = ucp->uc_mcontext.mc_gsbase;

#if defined(COMPAT_43)
	if (ucp->uc_mcontext.mc_onstack & 1)
		td->td_sigstk.ss_flags |= SS_ONSTACK;
	else
		td->td_sigstk.ss_flags &= ~SS_ONSTACK;
#endif

	kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
	return (EJUSTRETURN);
}

#ifdef COMPAT_FREEBSD4
int
freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap)
{

	return (sys_sigreturn(td, (struct sigreturn_args *)uap));
}
#endif

/*
 * Reset registers to default values on exec.
 */
void
exec_setregs(struct thread *td, struct image_params *imgp, u_long stack)
{
	struct trapframe *regs = td->td_frame;
	struct pcb *pcb = td->td_pcb;

	mtx_lock(&dt_lock);
	if (td->td_proc->p_md.md_ldt != NULL)
		user_ldt_free(td);
	else
		mtx_unlock(&dt_lock);

	update_pcb_bases(pcb);
	pcb->pcb_fsbase = 0;
	pcb->pcb_gsbase = 0;
	clear_pcb_flags(pcb, PCB_32BIT);
	pcb->pcb_initial_fpucw = __INITIAL_FPUCW__;

	bzero((char *)regs, sizeof(struct trapframe));
	regs->tf_rip = imgp->entry_addr;
	regs->tf_rsp = ((stack - 8) & ~0xFul) + 8;
	regs->tf_rdi = stack;		/* argv */
	regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T);
	regs->tf_ss = _udatasel;
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _ufssel;
	regs->tf_gs = _ugssel;
	regs->tf_flags = TF_HASSEGS;
	td->td_retval[1] = 0;

	/*
	 * Reset the hardware debug registers if they were in use.
	 * They won't have any meaning for the newly exec'd process.
	 */
	if (pcb->pcb_flags & PCB_DBREGS) {
		pcb->pcb_dr0 = 0;
		pcb->pcb_dr1 = 0;
		pcb->pcb_dr2 = 0;
		pcb->pcb_dr3 = 0;
		pcb->pcb_dr6 = 0;
		pcb->pcb_dr7 = 0;
		if (pcb == curpcb) {
			/*
			 * Clear the debug registers on the running
			 * CPU, otherwise they will end up affecting
			 * the next process we switch to.
			 */
			reset_dbregs();
		}
		clear_pcb_flags(pcb, PCB_DBREGS);
	}

	/*
	 * Drop the FP state if we hold it, so that the process gets a
	 * clean FP state if it uses the FPU again.
	 */
	fpstate_drop(td);
}

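/* Set the CR0 bits (FPU handling, write protect, alignment mask) we rely on. */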
void
cpu_setregs(void)
{
	register_t cr0;

	cr0 = rcr0();
	/*
	 * CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the
	 * BSP.  See the comments there about why we set them.
	 */
	cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
	load_cr0(cr0);
}

/*
 * Initialize amd64 and configure to run kernel
 */

/*
 * Initialize segments & interrupt table
 */

struct user_segment_descriptor gdt[NGDT * MAXCPU];/* global descriptor tables */
static struct gate_descriptor idt0[NIDT];
struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */

static char dblfault_stack[PAGE_SIZE] __aligned(16);
static char mce0_stack[PAGE_SIZE] __aligned(16);
static char nmi0_stack[PAGE_SIZE] __aligned(16);
static char dbg0_stack[PAGE_SIZE] __aligned(16);
CTASSERT(sizeof(struct nmi_pcpu) == 16);

struct amd64tss common_tss[MAXCPU];

/*
 * Software prototypes -- in more palatable form.
 *
 * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
 * slots as corresponding segments for i386 kernel.
 */
struct soft_segment_descriptor gdt_segs[] = {
/* GNULL_SEL	0 Null Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GNULL2_SEL	1 Null Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUFS32_SEL	2 32 bit %fs Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUGS32_SEL	3 32 bit %gs Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GCODE_SEL	4 Code Descriptor for kernel */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GDATA_SEL	5 Data Descriptor for kernel */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GUCODE32_SEL	6 32 bit Code Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUDATA_SEL	7 32/64 bit Data Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUCODE_SEL	8 64 bit Code Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GPROC0_SEL	9 Proc 0 Tss Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
	.ssd_type = SDT_SYSTSS,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* Actually, the TSS is a system descriptor which is double size */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUSERLDT_SEL	11 LDT Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUSERLDT_SEL	12 LDT Descriptor, double size */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
};

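/*
 * Install one IDT gate: point entry 'idx' at handler 'func' with the
 * given gate type, descriptor privilege level, and IST stack index.
 */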
void
setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
{
	struct gate_descriptor *ip;

	ip = idt + idx;
	ip->gd_looffset = (uintptr_t)func;
	ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
	ip->gd_ist = ist;
	ip->gd_xx = 0;
	ip->gd_type = typ;
	ip->gd_dpl = dpl;
	ip->gd_p = 1;
	ip->gd_hioffset = ((uintptr_t)func) >> 16;
}

extern inthand_t
	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
	IDTVEC(xmm), IDTVEC(dblfault),
	IDTVEC(div_pti), IDTVEC(bpt_pti),
	IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
	IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
	IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
	IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
	IDTVEC(xmm_pti),
#ifdef KDTRACE_HOOKS
	IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
#endif
#ifdef XENHVM
	IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
#endif
	IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
	IDTVEC(fast_syscall_pti);

#ifdef DDB
/*
 * Display the index and function name of any IDT entries that don't use
 * the default 'rsvd' entry point.
 */
DB_SHOW_COMMAND(idt, db_show_idt)
{
	struct gate_descriptor *ip;
	int idx;
	uintptr_t func;

	ip = idt;
	for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
		func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
		if (func != (uintptr_t)&IDTVEC(rsvd)) {
			db_printf("%3d\t", idx);
			db_printsym(func, DB_STGY_PROC);
			db_printf("\n");
		}
		ip++;
	}
}

/* Show privileged registers. */
DB_SHOW_COMMAND(sysregs, db_show_sysregs)
{
	struct {
		uint16_t limit;
		uint64_t base;
	} __packed idtr, gdtr;
	uint16_t ldt, tr;

	__asm __volatile("sidt %0" : "=m" (idtr));
	db_printf("idtr\t0x%016lx/%04x\n",
	    (u_long)idtr.base, (u_int)idtr.limit);
	__asm __volatile("sgdt %0" : "=m" (gdtr));
	db_printf("gdtr\t0x%016lx/%04x\n",
	    (u_long)gdtr.base, (u_int)gdtr.limit);
	__asm __volatile("sldt %0" : "=r" (ldt));
	db_printf("ldtr\t0x%04x\n", ldt);
	__asm __volatile("str %0" : "=r" (tr));
	db_printf("tr\t0x%04x\n", tr);
	db_printf("cr0\t0x%016lx\n", rcr0());
	db_printf("cr2\t0x%016lx\n", rcr2());
	db_printf("cr3\t0x%016lx\n", rcr3());
	db_printf("cr4\t0x%016lx\n", rcr4());
	if (rcr4() & CR4_XSAVE)
		db_printf("xcr0\t0x%016lx\n", rxcr(0));
	db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER));
	if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
		db_printf("FEATURES_CTL\t%016lx\n",
		    rdmsr(MSR_IA32_FEATURE_CONTROL));
	db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR));
	db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT));
	db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE));
}

DB_SHOW_COMMAND(dbregs, db_show_dbregs)
{

	db_printf("dr0\t0x%016lx\n", rdr0());
	db_printf("dr1\t0x%016lx\n", rdr1());
	db_printf("dr2\t0x%016lx\n", rdr2());
	db_printf("dr3\t0x%016lx\n", rdr3());
	db_printf("dr6\t0x%016lx\n", rdr6());
	db_printf("dr7\t0x%016lx\n", rdr7());
}
#endif

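/*
 * Convert a hardware user segment descriptor to its software
 * (soft_segment_descriptor) representation.
 */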
void
sdtossd(struct user_segment_descriptor *sd,
    struct soft_segment_descriptor *ssd)
{

	ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
	ssd->ssd_type  = sd->sd_type;
	ssd->ssd_dpl   = sd->sd_dpl;
	ssd->ssd_p     = sd->sd_p;
	ssd->ssd_long  = sd->sd_long;
	ssd->ssd_def32 = sd->sd_def32;
	ssd->ssd_gran  = sd->sd_gran;
}

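/* Convert a software segment descriptor back to the hardware user form. */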
void
ssdtosd(struct soft_segment_descriptor *ssd,
    struct user_segment_descriptor *sd)
{

	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type  = ssd->ssd_type;
	sd->sd_dpl   = ssd->ssd_dpl;
	sd->sd_p     = ssd->ssd_p;
	sd->sd_long  = ssd->ssd_long;
	sd->sd_def32 = ssd->ssd_def32;
	sd->sd_gran  = ssd->ssd_gran;
}

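/* Convert a software segment descriptor to the hardware system (TSS/LDT) form. */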
void
ssdtosyssd(struct soft_segment_descriptor *ssd,
    struct system_segment_descriptor *sd)
{

	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type  = ssd->ssd_type;
	sd->sd_dpl   = ssd->ssd_dpl;
	sd->sd_p     = ssd->ssd_p;
	sd->sd_gran  = ssd->ssd_gran;
}

#if !defined(DEV_ATPIC) && defined(DEV_ISA)
#include <isa/isavar.h>
#include <isa/isareg.h>
/*
 * Return a bitmap of the current interrupt requests.  This is 8259-specific
 * and is only suitable for use at probe time.
 * This is only here to pacify sio.  It is NOT FATAL if this doesn't work.
 * It shouldn't be here.  There should probably be an APIC centric
 * implementation in the apic driver code, if at all.
 */
intrmask_t
isa_irq_pending(void)
{
	u_char irr1;
	u_char irr2;

	irr1 = inb(IO_ICU1);
	irr2 = inb(IO_ICU2);
	return ((irr2 << 8) | irr1);
}
#endif

u_int basemem;

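/*
 * Insert the region [base, base + length) into the sorted physmap
 * array, coalescing it with adjacent entries where possible.  Returns
 * 0 only when the array is full; overlapping regions are ignored.
 */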
static int
add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
    int *physmap_idxp)
{
	int i, insert_idx, physmap_idx;

	physmap_idx = *physmap_idxp;

	if (length == 0)
		return (1);

	/*
	 * Find insertion point while checking for overlap.  Start off by
	 * assuming the new entry will be added to the end.
	 *
	 * NB: physmap_idx points to the next free slot.
	 */
	insert_idx = physmap_idx;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (base < physmap[i + 1]) {
			if (base + length <= physmap[i]) {
				insert_idx = i;
				break;
			}
			if (boothowto & RB_VERBOSE)
				printf(
		    "Overlapping memory regions, ignoring second region\n");
			return (1);
		}
	}

	/* See if we can prepend to the next entry. */
	if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) {
		physmap[insert_idx] = base;
		return (1);
	}

	/* See if we can append to the previous entry. */
	if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
		physmap[insert_idx - 1] += length;
		return (1);
	}

	physmap_idx += 2;
	*physmap_idxp = physmap_idx;
	if (physmap_idx == PHYSMAP_SIZE) {
		printf(
		"Too many segments in the physical address map, giving up\n");
		return (0);
	}

	/*
	 * Move the last 'N' entries down to make room for the new
	 * entry if needed.
	 */
	for (i = (physmap_idx - 2); i > insert_idx; i -= 2) {
		physmap[i] = physmap[i - 2];
		physmap[i + 1] = physmap[i - 1];
	}

	/* Insert the new entry. */
	physmap[insert_idx] = base;
	physmap[insert_idx + 1] = base + length;
	return (1);
}

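/*
 * Walk the BIOS SMAP supplied by the loader and add every usable RAM
 * (SMAP_TYPE_MEMORY) range to the physmap array.
 */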
void
bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize,
                      vm_paddr_t *physmap, int *physmap_idx)
{
	struct bios_smap *smap, *smapend;

	smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);

	for (smap = smapbase; smap < smapend; smap++) {
		if (boothowto & RB_VERBOSE)
			printf("SMAP type=%02x base=%016lx len=%016lx\n",
			    smap->type, smap->base, smap->length);

		if (smap->type != SMAP_TYPE_MEMORY)
			continue;

		if (!add_physmap_entry(smap->base, smap->length, physmap,
		    physmap_idx))
			break;
	}
}

static void
add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
    int *physmap_idx)
{
	struct efi_md *map, *p;
	const char *type;
	size_t efisz;
	int ndesc, i;

	static const char *types[] = {
		"Reserved",
		"LoaderCode",
		"LoaderData",
		"BootServicesCode",
		"BootServicesData",
		"RuntimeServicesCode",
		"RuntimeServicesData",
		"ConventionalMemory",
		"UnusableMemory",
		"ACPIReclaimMemory",
		"ACPIMemoryNVS",
		"MemoryMappedIO",
		"MemoryMappedIOPortSpace",
		"PalCode",
		"PersistentMemory"
	};

	/*
	 * Memory map data provided by UEFI via the GetMemoryMap
	 * Boot Services API.
	 */
	efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
	map = (struct efi_md *)((uint8_t *)efihdr + efisz);

	if (efihdr->descriptor_size == 0)
		return;
	ndesc = efihdr->memory_size / efihdr->descriptor_size;

	if (boothowto & RB_VERBOSE)
		printf("%23s %12s %12s %8s %4s\n",
		    "Type", "Physical", "Virtual", "#Pages", "Attr");

	for (i = 0, p = map; i < ndesc; i++,
	    p = efi_next_descriptor(p, efihdr->descriptor_size)) {
		if (boothowto & RB_VERBOSE) {
			if (p->md_type < nitems(types))
				type = types[p->md_type];
			else
				type = "<INVALID>";
			printf("%23s %012lx %12p %08lx ", type, p->md_phys,
			    p->md_virt, p->md_pages);
			if (p->md_attr & EFI_MD_ATTR_UC)
				printf("UC ");
			if (p->md_attr & EFI_MD_ATTR_WC)
				printf("WC ");
			if (p->md_attr & EFI_MD_ATTR_WT)
				printf("WT ");
			if (p->md_attr & EFI_MD_ATTR_WB)
				printf("WB ");
			if (p->md_attr & EFI_MD_ATTR_UCE)
				printf("UCE ");
			if (p->md_attr & EFI_MD_ATTR_WP)
				printf("WP ");
			if (p->md_attr & EFI_MD_ATTR_RP)
				printf("RP ");
			if (p->md_attr & EFI_MD_ATTR_XP)
				printf("XP ");
			if (p->md_attr & EFI_MD_ATTR_NV)
				printf("NV ");
			if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE)
				printf("MORE_RELIABLE ");
			if (p->md_attr & EFI_MD_ATTR_RO)
				printf("RO ");
			if (p->md_attr & EFI_MD_ATTR_RT)
				printf("RUNTIME");
			printf("\n");
		}

		switch (p->md_type) {
		case EFI_MD_TYPE_CODE:
		case EFI_MD_TYPE_DATA:
		case EFI_MD_TYPE_BS_CODE:
		case EFI_MD_TYPE_BS_DATA:
		case EFI_MD_TYPE_FREE:
			/*
			 * We're allowed to use any entry with these types.
			 */
			break;
		default:
			continue;
		}

		if (!add_physmap_entry(p->md_phys, (p->md_pages * PAGE_SIZE),
		    physmap, physmap_idx))
			break;
	}
}

static char bootmethod[16] = "";
SYSCTL_STRING(_machdep, OID_AUTO, bootmethod, CTLFLAG_RD, bootmethod, 0,
    "System firmware boot method");

static void
native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx)
{
	struct bios_smap *smap;
	struct efi_map_header *efihdr;
	u_int32_t size;

	/*
	 * Memory map from INT 15:E820.
	 *
	 * subr_module.c says:
	 * "Consumer may safely assume that size value precedes data."
	 * ie: an int32_t immediately precedes smap.
	 */

	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
	smap = (struct bios_smap *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);
	if (efihdr == NULL && smap == NULL)
		panic("No BIOS smap or EFI map info from loader!");

	if (efihdr != NULL) {
		add_efi_map_entries(efihdr, physmap, physmap_idx);
		strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
	} else {
		size = *((u_int32_t *)smap - 1);
		bios_add_smap_entries(smap, size, physmap, physmap_idx);
		strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
	}
}

#define	PAGES_PER_GB	(1024 * 1024 * 1024 / PAGE_SIZE)

/*
 * Populate the (physmap) array with base/bound pairs describing the
 * available physical memory in the system, then test this memory and
 * build the phys_avail array describing the actually-available memory.
 *
 * Total memory size may be set by the kernel environment variable
 * hw.physmem or the compile-time define MAXMEM.
 *
 * XXX first should be vm_paddr_t.
 */
static void
getmemsize(caddr_t kmdp, u_int64_t first)
{
	int i, physmap_idx, pa_indx, da_indx;
	vm_paddr_t pa, physmap[PHYSMAP_SIZE];
	u_long physmem_start, physmem_tunable, memtest;
	pt_entry_t *pte;
	quad_t dcons_addr, dcons_size;
	int page_counter;

	bzero(physmap, sizeof(physmap));
	physmap_idx = 0;

	init_ops.parse_memmap(kmdp, physmap, &physmap_idx);
	physmap_idx -= 2;

	/*
	 * Find the 'base memory' segment for SMP
	 */
	basemem = 0;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (physmap[i] <= 0xA0000) {
			basemem = physmap[i + 1] / 1024;
			break;
		}
	}
	if (basemem == 0 || basemem > 640) {
		if (bootverbose)
			printf(
		"Memory map doesn't contain a basemem segment, faking it");
		basemem = 640;
	}

	/*
	 * Make hole for "AP -> long mode" bootstrap code.  The
	 * mp_bootaddress vector is only available when the kernel
	 * is configured to support APs and APs for the system start
	 * in 32bit mode (e.g. SMP bare metal).
	 */
	if (init_ops.mp_bootaddress) {
		if (physmap[1] >= 0x100000000)
			panic(
	"Basemem segment is not suitable for AP bootstrap code!");
		physmap[1] = init_ops.mp_bootaddress(physmap[1] / 1024);
	}

	/*
	 * Maxmem isn't the "maximum memory", it's one larger than the
	 * highest page of the physical address space.  It should be
	 * called something like "Maxphyspage".  We may adjust this
	 * based on ``hw.physmem'' and the results of the memory test.
	 */
	Maxmem = atop(physmap[physmap_idx + 1]);

#ifdef MAXMEM
	Maxmem = MAXMEM / 4;
#endif

	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
		Maxmem = atop(physmem_tunable);

	/*
	 * The boot memory test is disabled by default, as it takes a
	 * significant amount of time on large-memory systems, and is
	 * unfriendly to virtual machines as it unnecessarily touches all
	 * pages.
	 *
	 * A general name is used as the code may be extended to support
	 * additional tests beyond the current "page present" test.
	 */
	memtest = 0;
	TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);

	/*
	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
	 * in the system.
	 */
	if (Maxmem > atop(physmap[physmap_idx + 1]))
		Maxmem = atop(physmap[physmap_idx + 1]);

	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
	    (boothowto & RB_VERBOSE))
		printf("Physical memory use set to %ldK\n", Maxmem * 4);

	/* call pmap initialization to make new kernel address space */
	pmap_bootstrap(&first);

	/*
	 * Size up each available chunk of physical memory.
	 *
	 * XXX Some BIOSes corrupt low 64KB between suspend and resume.
	 * By default, mask off the first 16 pages unless we appear to be
	 * running in a VM.
	 */
	physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
	TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
	if (physmap[0] < physmem_start) {
		if (physmem_start < PAGE_SIZE)
			physmap[0] = PAGE_SIZE;
		else if (physmem_start >= physmap[1])
			physmap[0] = round_page(physmap[1] - PAGE_SIZE);
		else
			physmap[0] = round_page(physmem_start);
	}
	pa_indx = 0;
	da_indx = 1;
	phys_avail[pa_indx++] = physmap[0];
	phys_avail[pa_indx] = physmap[0];
	dump_avail[da_indx] = physmap[0];
	pte = CMAP1;

	/*
	 * Get dcons buffer address
	 */
	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
	    getenv_quad("dcons.size", &dcons_size) == 0)
		dcons_addr = 0;

	/*
	 * physmap is in bytes, so when converting to page boundaries,
	 * round up the start address and round down the end address.
	 */
	page_counter = 0;
	if (memtest != 0)
		printf("Testing system memory");
	for (i = 0; i <= physmap_idx; i += 2) {
		vm_paddr_t end;

		end = ptoa((vm_paddr_t)Maxmem);
		if (physmap[i + 1] < end)
			end = trunc_page(physmap[i + 1]);
		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
			int tmp, page_bad, full;
			int *ptr = (int *)CADDR1;

			full = FALSE;
			/*
			 * block out kernel memory as not available.
			 */
			if (pa >= (vm_paddr_t)kernphys && pa < first)
				goto do_dump_avail;

			/*
			 * block out dcons buffer
			 */
			if (dcons_addr > 0
			    && pa >= trunc_page(dcons_addr)
			    && pa < dcons_addr + dcons_size)
				goto do_dump_avail;

			page_bad = FALSE;
			if (memtest == 0)
				goto skip_memtest;

			/*
			 * Print a "." every GB to show we're making
			 * progress.
			 */
			page_counter++;
			if ((page_counter % PAGES_PER_GB) == 0)
				printf(".");

			/*
			 * map page into kernel: valid, read/write,non-cacheable
			 */
			*pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
			invltlb();

			tmp = *(int *)ptr;
			/*
			 * Test for alternating 1's and 0's
			 */
			*(volatile int *)ptr = 0xaaaaaaaa;
			if (*(volatile int *)ptr != 0xaaaaaaaa)
				page_bad = TRUE;
			/*
			 * Test for alternating 0's and 1's
			 */
			*(volatile int *)ptr = 0x55555555;
			if (*(volatile int *)ptr != 0x55555555)
				page_bad = TRUE;
			/*
			 * Test for all 1's
			 */
			*(volatile int *)ptr = 0xffffffff;
			if (*(volatile int *)ptr != 0xffffffff)
				page_bad = TRUE;
			/*
			 * Test for all 0's
			 */
			*(volatile int *)ptr = 0x0;
			if (*(volatile int *)ptr != 0x0)
				page_bad = TRUE;
			/*
			 * Restore original value.
			 */
			*(int *)ptr = tmp;

skip_memtest:
			/*
			 * Adjust array of valid/good pages.
			 */
			if (page_bad == TRUE)
				continue;
			/*
			 * If this good page is a continuation of the
			 * previous set of good pages, then just increase
			 * the end pointer. Otherwise start a new chunk.
			 * Note that "end" points one page past the last
			 * valid page, making the range >= start and < end.
			 * If we're also doing a speculative memory
			 * test and we are at or past the end, bump up Maxmem
			 * so that we keep going. The first bad page
			 * will terminate the loop.
			 */
			if (phys_avail[pa_indx] == pa) {
				phys_avail[pa_indx] += PAGE_SIZE;
			} else {
				pa_indx++;
				if (pa_indx == PHYS_AVAIL_ARRAY_END) {
					printf(
		"Too many holes in the physical address space, giving up\n");
					pa_indx--;
					full = TRUE;
					goto do_dump_avail;
				}
				phys_avail[pa_indx++] = pa;	/* start */
				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
			}
			physmem++;
do_dump_avail:
			if (dump_avail[da_indx] == pa) {
				dump_avail[da_indx] += PAGE_SIZE;
			} else {
				da_indx++;
				if (da_indx == DUMP_AVAIL_ARRAY_END) {
					da_indx--;
					goto do_next;
				}
				dump_avail[da_indx++] = pa; /* start */
				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
			}
do_next:
			if (full)
				break;
		}
	}
	*pte = 0;
	invltlb();
	if (memtest != 0)
		printf("\n");

	/*
	 * XXX
	 * The last chunk must contain at least one page plus the message
	 * buffer to avoid complicating other code (message buffer address
	 * calculation, etc.).
	 */
	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
	    round_page(msgbufsize) >= phys_avail[pa_indx]) {
		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
		phys_avail[pa_indx--] = 0;
		phys_avail[pa_indx--] = 0;
	}

	Maxmem = atop(phys_avail[pa_indx]);

	/* Trim off space for the message buffer. */
	phys_avail[pa_indx] -= round_page(msgbufsize);

	/* Map the message buffer. */
	msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
}

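/*
 * Locate the preloaded module metadata passed in by the loader and
 * initialize the static kernel environment and, with DDB, the kernel
 * symbol table from it.
 */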
static caddr_t
native_parse_preload_data(u_int64_t modulep)
{
	caddr_t kmdp;
	char *envp;
#ifdef DDB
	vm_offset_t ksym_start;
	vm_offset_t ksym_end;
#endif

	preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
	preload_bootstrap_relocate(KERNBASE);
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
	envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *);
	if (envp != NULL)
		envp += KERNBASE;
	init_static_kenv(envp, 0);
#ifdef DDB
	ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
	ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
	db_fetch_ksymtab(ksym_start, ksym_end);
#endif
	efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);

	return (kmdp);
}

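/* Initialize the kernel debugger and enter it now if the boot flags ask for it. */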
static void
amd64_kdb_init(void)
{
	kdb_init();
#ifdef KDB
	if (boothowto & RB_KDB)
		kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
#endif
}

/* Set up the fast syscall stuff */
void
amd64_conf_fast_syscall(void)
{
	uint64_t msr;

	msr = rdmsr(MSR_EFER) | EFER_SCE;
	wrmsr(MSR_EFER, msr);
	wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) :
	    (u_int64_t)IDTVEC(fast_syscall));
	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
	    ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
	wrmsr(MSR_STAR, msr);
	wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D);
}

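/*
 * hammer_time() is the machine-dependent boot entry point, called from
 * locore with the preloaded module pointer and the first free physical
 * address.  It returns the initial kernel stack pointer for proc0.
 */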
u_int64_t
hammer_time(u_int64_t modulep, u_int64_t physfree)
{
	caddr_t kmdp;
	int gsel_tss, x;
	struct pcpu *pc;
	struct nmi_pcpu *np;
	struct xstate_hdr *xhdr;
	u_int64_t rsp0;
	char *env;
	size_t kstack0_sz;
	int late_console;

	kmdp = init_ops.parse_preload_data(modulep);

	identify_cpu1();
	identify_hypervisor();
	/*
	 * hw.cpu_stdext_disable is ignored by this call; it is
	 * re-evaluated by the call to finishidentcpu() below.
	 */
	identify_cpu2();

	link_elf_ireloc(kmdp);

	/*
	 * This may be done better later if it gets more high level
	 * components in it. If so just link td->td_proc here.
	 */
	proc_linkup0(&proc0, &thread0);

	/* Init basic tunables, hz etc */
	init_param1();

	thread0.td_kstack = physfree + KERNBASE;
	thread0.td_kstack_pages = kstack_pages;
	kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
	bzero((void *)thread0.td_kstack, kstack0_sz);
	physfree += kstack0_sz;

	/*
	 * make gdt memory segments
	 */
	for (x = 0; x < NGDT; x++) {
		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
		    x != GUSERLDT_SEL && x != (GUSERLDT_SEL + 1))
			ssdtosd(&gdt_segs[x], &gdt[x]);
	}
	gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&common_tss[0];
	ssdtosyssd(&gdt_segs[GPROC0_SEL],
	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);

	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
	r_gdt.rd_base =  (long) gdt;
	lgdt(&r_gdt);
	pc = &__pcpu[0];

	wrmsr(MSR_FSBASE, 0);		/* User value */
	wrmsr(MSR_GSBASE, (u_int64_t)pc);
	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */

	pcpu_init(pc, 0, sizeof(struct pcpu));
	dpcpu_init((void *)(physfree + KERNBASE), 0);
	physfree += DPCPU_SIZE;
	PCPU_SET(prvspace, pc);
	PCPU_SET(curthread, &thread0);
	/* Non-late cninit() and printf() can be moved up to here. */
	PCPU_SET(tssp, &common_tss[0]);
	PCPU_SET(commontssp, &common_tss[0]);
	PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
	PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
	PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
	PCPU_SET(gs32p, &gdt[GUGS32_SEL]);

	/*
	 * Initialize mutexes.
	 *
	 * icu_lock: in order to allow an interrupt to occur in a critical
	 * 	     section, to set pcpu->ipending (etc...) properly, we
	 *	     must be able to get the icu lock, so it can't be
	 *	     under witness.
	 */
	mutex_init();
	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);

	/* exceptions */
	pti = pti_get_default();
	TUNABLE_INT_FETCH("vm.pmap.pti", &pti);

	for (x = 0; x < NIDT; x++)
		setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT,
		    SEL_KPL, 0);
	setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
	setidt(IDT_NMI, &IDTVEC(nmi),  SDT_SYSIGT, SEL_KPL, 2);
	setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
	    SEL_UPL, 0);
	setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT,
	    SEL_UPL, 0);
	setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
	setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm),
	    SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing),
	    SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3);
	setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
	    SEL_KPL, 0);
#ifdef KDTRACE_HOOKS
	setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) :
	    &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
#endif
#ifdef XENHVM
	setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) :
	    &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0);
#endif
	r_idt.rd_limit = sizeof(idt0) - 1;
	r_idt.rd_base = (long) idt;
	lidt(&r_idt);

	/*
	 * Initialize the clock before the console so that console
	 * initialization can use DELAY().
	 */
	clock_init();

	/*
	 * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4)
	 * transition).
	 * Once bootblocks have updated, we can test directly for
	 * efi_systbl != NULL here...
	 */
	if (preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP)
	    != NULL)
		vty_set_preferred(VTY_VT);

	finishidentcpu();	/* Final stage of CPU initialization */
	initializecpu();	/* Initialize CPU registers */
	initializecpucache();

	/* doublefault stack space, runs on ist1 */
	common_tss[0].tss_ist1 = (long)&dblfault_stack[sizeof(dblfault_stack)];

	/*
	 * NMI stack, runs on ist2.  The pcpu pointer is stored just
	 * above the start of the ist2 stack.
	 */
	np = ((struct nmi_pcpu *) &nmi0_stack[sizeof(nmi0_stack)]) - 1;
	np->np_pcpu = (register_t) pc;
	common_tss[0].tss_ist2 = (long) np;

	/*
	 * MC# stack, runs on ist3.  The pcpu pointer is stored just
	 * above the start of the ist3 stack.
	 */
	np = ((struct nmi_pcpu *) &mce0_stack[sizeof(mce0_stack)]) - 1;
	np->np_pcpu = (register_t) pc;
	common_tss[0].tss_ist3 = (long) np;

	/*
	 * DB# stack, runs on ist4.
	 */
	np = ((struct nmi_pcpu *) &dbg0_stack[sizeof(dbg0_stack)]) - 1;
	np->np_pcpu = (register_t) pc;
	common_tss[0].tss_ist4 = (long) np;

	/* Set the IO permission bitmap (empty due to tss seg limit) */
	common_tss[0].tss_iobase = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE;

	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	ltr(gsel_tss);

	amd64_conf_fast_syscall();

	/*
	 * Temporarily forge a valid pointer to the PCB for the exception
	 * handlers.  It is reinitialized properly below after the FPU is
	 * set up.  Also set up td_critnest to short-cut the page
	 * fault handler.
	 */
	cpu_max_ext_state_size = sizeof(struct savefpu);
	thread0.td_pcb = get_pcb_td(&thread0);
	thread0.td_critnest = 1;

	/*
	 * The console and kdb should be initialized even earlier than here,
	 * but some console drivers don't work until after getmemsize().
	 * Default to late console initialization to support these drivers.
	 * This loses mainly printf()s in getmemsize() and early debugging.
	 */
	late_console = 1;
	TUNABLE_INT_FETCH("debug.late_console", &late_console);
	if (!late_console) {
		cninit();
		amd64_kdb_init();
	}

	getmemsize(kmdp, physfree);
	init_param2(physmem);

	/* Now running on new page tables, configured, and u/iom is accessible. */

	if (late_console)
		cninit();

#ifdef DEV_ISA
#ifdef DEV_ATPIC
	elcr_probe();
	atpic_startup();
#else
	/* Reset and mask the atpics and leave them shut down. */
	atpic_reset();

	/*
	 * Point the ICU spurious interrupt vectors at the APIC spurious
	 * interrupt handler.
	 */
	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
#endif
#else
#error "have you forgotten the isa device?";
#endif

	if (late_console)
		amd64_kdb_init();

	msgbufinit(msgbufp, msgbufsize);
	fpuinit();

	/*
	 * Set up thread0 pcb after fpuinit calculated pcb + fpu save
	 * area size.  Zero out the extended state header in fpu save
	 * area.
	 */
	thread0.td_pcb = get_pcb_td(&thread0);
	thread0.td_pcb->pcb_save = get_pcb_user_save_td(&thread0);
	bzero(get_pcb_user_save_td(&thread0), cpu_max_ext_state_size);
	if (use_xsave) {
		xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) +
		    1);
		xhdr->xstate_bv = xsave_mask;
	}
	/* make an initial tss so cpu can get interrupt stack on syscall! */
	rsp0 = (vm_offset_t)thread0.td_pcb;
	/* Ensure the stack is aligned to 16 bytes */
	rsp0 &= ~0xFul;
	common_tss[0].tss_rsp0 = rsp0;
	PCPU_SET(rsp0, rsp0);
	PCPU_SET(pti_rsp0, ((vm_offset_t)PCPU_PTR(pti_stack) +
	    PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful);
	PCPU_SET(curpcb, thread0.td_pcb);

	/* transfer to user mode */

	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
	_ufssel = GSEL(GUFS32_SEL, SEL_UPL);
	_ugssel = GSEL(GUGS32_SEL, SEL_UPL);

	load_ds(_udatasel);
	load_es(_udatasel);
	load_fs(_ufssel);

	/* setup proc 0's pcb */
	thread0.td_pcb->pcb_flags = 0;
	thread0.td_frame = &proc0_tf;

	env = kern_getenv("kernelname");
	if (env != NULL)
		strlcpy(kernelname, env, sizeof(kernelname));

	cpu_probe_amdc1e();

#ifdef FDT
	x86_init_fdt();
#endif
	thread0.td_critnest = 0;

	TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
	TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable);

	/* Location of kernel stack for locore */
	return ((u_int64_t)thread0.td_pcb);
}

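/* Machine-dependent per-CPU initialization; the ACPI id starts out invalid. */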
1860void
1861cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
1862{
1863
1864	pcpu->pc_acpi_id = 0xffffffff;
1865}

static int
smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	struct bios_smap *smapbase;
	struct bios_smap_xattr smap;
	caddr_t kmdp;
	uint32_t *smapattr;
	int count, error, i;

	/* Retrieve the system memory map from the loader. */
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	smapbase = (struct bios_smap *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);
	if (smapbase == NULL)
		return (0);
	smapattr = (uint32_t *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
	/* The size of the metadata blob is stored in the word before it. */
	count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase);
	error = 0;
	for (i = 0; i < count; i++) {
		smap.base = smapbase[i].base;
		smap.length = smapbase[i].length;
		smap.type = smapbase[i].type;
		if (smapattr != NULL)
			smap.xattr = smapattr[i];
		else
			smap.xattr = 0;
		error = SYSCTL_OUT(req, &smap, sizeof(smap));
	}
	return (error);
}
SYSCTL_PROC(_machdep, OID_AUTO, smap, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
    smap_sysctl_handler, "S,bios_smap_xattr", "Raw BIOS SMAP data");
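/*
 * Example (a sketch): userland can fetch the table above with
 * sysctlbyname(3), sizing the buffer first:
 *
 *	size_t len;
 *	sysctlbyname("machdep.smap", NULL, &len, NULL, 0);
 *	struct bios_smap_xattr *smap = malloc(len);
 *	if (smap != NULL)
 *		sysctlbyname("machdep.smap", smap, &len, NULL, 0);
 */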

static int
efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	struct efi_map_header *efihdr;
	caddr_t kmdp;
	uint32_t efisize;

	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
	if (efihdr == NULL)
		return (0);
	efisize = *((uint32_t *)efihdr - 1);
	return (SYSCTL_OUT(req, efihdr, efisize));
}
SYSCTL_PROC(_machdep, OID_AUTO, efi_map, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
    efi_map_sysctl_handler, "S,efi_map_header", "Raw EFI Memory Map");
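/*
 * As with machdep.smap above, this opaque node can be inspected from
 * userland, e.g. (a sketch) with "sysctl -x machdep.efi_map".
 */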

/*
 * MD spinlock support: the first spinlock_enter() on a thread disables
 * interrupts and records the previous interrupt state; nested calls
 * only bump the count.  The matching spinlock_exit() restores the saved
 * state once the count drops back to zero.
 */
void
spinlock_enter(void)
{
	struct thread *td;
	register_t flags;

	td = curthread;
	if (td->td_md.md_spinlock_count == 0) {
		flags = intr_disable();
		td->td_md.md_spinlock_count = 1;
		td->td_md.md_saved_flags = flags;
	} else
		td->td_md.md_spinlock_count++;
	critical_enter();
}

void
spinlock_exit(void)
{
	struct thread *td;
	register_t flags;

	td = curthread;
	critical_exit();
	flags = td->td_md.md_saved_flags;
	td->td_md.md_spinlock_count--;
	if (td->td_md.md_spinlock_count == 0)
		intr_restore(flags);
}

/*
 * Construct a PCB from a trapframe. This is called from kdb_trap() where
 * we want to start a backtrace from the function that caused us to enter
 * the debugger. We have the context in the trapframe, but base the trace
 * on the PCB. The PCB doesn't have to be perfect, as long as it contains
 * enough for a backtrace.
 */
void
makectx(struct trapframe *tf, struct pcb *pcb)
{

	pcb->pcb_r12 = tf->tf_r12;
	pcb->pcb_r13 = tf->tf_r13;
	pcb->pcb_r14 = tf->tf_r14;
	pcb->pcb_r15 = tf->tf_r15;
	pcb->pcb_rbp = tf->tf_rbp;
	pcb->pcb_rbx = tf->tf_rbx;
	pcb->pcb_rip = tf->tf_rip;
	pcb->pcb_rsp = tf->tf_rsp;
}
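
/*
 * The ptrace(2) back-ends below adjust a stopped thread's user
 * registers.  For instance (a sketch), a debugger single-steps a
 * traced child with
 *
 *	ptrace(PT_STEP, pid, (caddr_t)1, 0);
 *
 * which lands in ptrace_single_step() and sets PSL_T, the trap flag,
 * so the child takes a debug trap after one instruction.
 */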

int
ptrace_set_pc(struct thread *td, unsigned long addr)
{

	td->td_frame->tf_rip = addr;
	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
	return (0);
}

int
ptrace_single_step(struct thread *td)
{
	td->td_frame->tf_rflags |= PSL_T;
	return (0);
}

int
ptrace_clear_single_step(struct thread *td)
{
	td->td_frame->tf_rflags &= ~PSL_T;
	return (0);
}

int
fill_regs(struct thread *td, struct reg *regs)
{
	struct trapframe *tp;

	tp = td->td_frame;
	return (fill_frame_regs(tp, regs));
}

int
fill_frame_regs(struct trapframe *tp, struct reg *regs)
{
	regs->r_r15 = tp->tf_r15;
	regs->r_r14 = tp->tf_r14;
	regs->r_r13 = tp->tf_r13;
	regs->r_r12 = tp->tf_r12;
	regs->r_r11 = tp->tf_r11;
	regs->r_r10 = tp->tf_r10;
	regs->r_r9  = tp->tf_r9;
	regs->r_r8  = tp->tf_r8;
	regs->r_rdi = tp->tf_rdi;
	regs->r_rsi = tp->tf_rsi;
	regs->r_rbp = tp->tf_rbp;
	regs->r_rbx = tp->tf_rbx;
	regs->r_rdx = tp->tf_rdx;
	regs->r_rcx = tp->tf_rcx;
	regs->r_rax = tp->tf_rax;
	regs->r_rip = tp->tf_rip;
	regs->r_cs = tp->tf_cs;
	regs->r_rflags = tp->tf_rflags;
	regs->r_rsp = tp->tf_rsp;
	regs->r_ss = tp->tf_ss;
	if (tp->tf_flags & TF_HASSEGS) {
		regs->r_ds = tp->tf_ds;
		regs->r_es = tp->tf_es;
		regs->r_fs = tp->tf_fs;
		regs->r_gs = tp->tf_gs;
	} else {
		regs->r_ds = 0;
		regs->r_es = 0;
		regs->r_fs = 0;
		regs->r_gs = 0;
	}
	return (0);
}

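/*
 * Note: set_regs() below deliberately rejects register sets that fail
 * EFL_SECURE() or CS_SECURE(), so a debugger cannot hand a traced
 * process privileged rflags bits or a non-user %cs selector.
 */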
int
set_regs(struct thread *td, struct reg *regs)
{
	struct trapframe *tp;
	register_t rflags;

	tp = td->td_frame;
	rflags = regs->r_rflags & 0xffffffff;
	if (!EFL_SECURE(rflags, tp->tf_rflags) || !CS_SECURE(regs->r_cs))
		return (EINVAL);
	tp->tf_r15 = regs->r_r15;
	tp->tf_r14 = regs->r_r14;
	tp->tf_r13 = regs->r_r13;
	tp->tf_r12 = regs->r_r12;
	tp->tf_r11 = regs->r_r11;
	tp->tf_r10 = regs->r_r10;
	tp->tf_r9  = regs->r_r9;
	tp->tf_r8  = regs->r_r8;
	tp->tf_rdi = regs->r_rdi;
	tp->tf_rsi = regs->r_rsi;
	tp->tf_rbp = regs->r_rbp;
	tp->tf_rbx = regs->r_rbx;
	tp->tf_rdx = regs->r_rdx;
	tp->tf_rcx = regs->r_rcx;
	tp->tf_rax = regs->r_rax;
	tp->tf_rip = regs->r_rip;
	tp->tf_cs = regs->r_cs;
	tp->tf_rflags = rflags;
	tp->tf_rsp = regs->r_rsp;
	tp->tf_ss = regs->r_ss;
	if (0) {	/* XXXKIB */
		tp->tf_ds = regs->r_ds;
		tp->tf_es = regs->r_es;
		tp->tf_fs = regs->r_fs;
		tp->tf_gs = regs->r_gs;
		tp->tf_flags = TF_HASSEGS;
	}
	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
	return (0);
}

/* XXX check all this stuff! */
/* externalize from sv_xmm */
static void
fill_fpregs_xmm(struct savefpu *sv_xmm, struct fpreg *fpregs)
{
	struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
	struct envxmm *penv_xmm = &sv_xmm->sv_env;
	int i;

	/* pcb -> fpregs */
	bzero(fpregs, sizeof(*fpregs));

	/* FPU control/status */
	penv_fpreg->en_cw = penv_xmm->en_cw;
	penv_fpreg->en_sw = penv_xmm->en_sw;
	penv_fpreg->en_tw = penv_xmm->en_tw;
	penv_fpreg->en_opcode = penv_xmm->en_opcode;
	penv_fpreg->en_rip = penv_xmm->en_rip;
	penv_fpreg->en_rdp = penv_xmm->en_rdp;
	penv_fpreg->en_mxcsr = penv_xmm->en_mxcsr;
	penv_fpreg->en_mxcsr_mask = penv_xmm->en_mxcsr_mask;

	/* FPU registers (each x87 accumulator is 10 bytes, i.e. 80 bits) */
	for (i = 0; i < 8; ++i)
		bcopy(sv_xmm->sv_fp[i].fp_acc.fp_bytes, fpregs->fpr_acc[i], 10);

	/* SSE registers (16 bytes each) */
	for (i = 0; i < 16; ++i)
		bcopy(sv_xmm->sv_xmm[i].xmm_bytes, fpregs->fpr_xacc[i], 16);
}

/* internalize from fpregs into sv_xmm */
static void
set_fpregs_xmm(struct fpreg *fpregs, struct savefpu *sv_xmm)
{
	struct envxmm *penv_xmm = &sv_xmm->sv_env;
	struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
	int i;

	/* fpregs -> pcb */
	/* FPU control/status */
	penv_xmm->en_cw = penv_fpreg->en_cw;
	penv_xmm->en_sw = penv_fpreg->en_sw;
	penv_xmm->en_tw = penv_fpreg->en_tw;
	penv_xmm->en_opcode = penv_fpreg->en_opcode;
	penv_xmm->en_rip = penv_fpreg->en_rip;
	penv_xmm->en_rdp = penv_fpreg->en_rdp;
	penv_xmm->en_mxcsr = penv_fpreg->en_mxcsr;
	penv_xmm->en_mxcsr_mask = penv_fpreg->en_mxcsr_mask & cpu_mxcsr_mask;

	/* FPU registers */
	for (i = 0; i < 8; ++i)
		bcopy(fpregs->fpr_acc[i], sv_xmm->sv_fp[i].fp_acc.fp_bytes, 10);

	/* SSE registers */
	for (i = 0; i < 16; ++i)
		bcopy(fpregs->fpr_xacc[i], sv_xmm->sv_xmm[i].xmm_bytes, 16);
}

/* externalize from td->pcb */
int
fill_fpregs(struct thread *td, struct fpreg *fpregs)
{

	KASSERT(td == curthread || TD_IS_SUSPENDED(td) ||
	    P_SHOULDSTOP(td->td_proc),
	    ("not suspended thread %p", td));
	fpugetregs(td);
	fill_fpregs_xmm(get_pcb_user_save_td(td), fpregs);
	return (0);
}

/* internalize to td->pcb */
int
set_fpregs(struct thread *td, struct fpreg *fpregs)
{

	critical_enter();
	set_fpregs_xmm(fpregs, get_pcb_user_save_td(td));
	fpuuserinited(td);
	critical_exit();
	return (0);
}

/*
 * Get machine context.  This snapshots the thread's user register
 * state; getcontext(2) and friends end up here.
 */
int
get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
{
	struct pcb *pcb;
	struct trapframe *tp;

	pcb = td->td_pcb;
	tp = td->td_frame;
	PROC_LOCK(curthread->td_proc);
	mcp->mc_onstack = sigonstack(tp->tf_rsp);
	PROC_UNLOCK(curthread->td_proc);
	mcp->mc_r15 = tp->tf_r15;
	mcp->mc_r14 = tp->tf_r14;
	mcp->mc_r13 = tp->tf_r13;
	mcp->mc_r12 = tp->tf_r12;
	mcp->mc_r11 = tp->tf_r11;
	mcp->mc_r10 = tp->tf_r10;
	mcp->mc_r9  = tp->tf_r9;
	mcp->mc_r8  = tp->tf_r8;
	mcp->mc_rdi = tp->tf_rdi;
	mcp->mc_rsi = tp->tf_rsi;
	mcp->mc_rbp = tp->tf_rbp;
	mcp->mc_rbx = tp->tf_rbx;
	mcp->mc_rcx = tp->tf_rcx;
	mcp->mc_rflags = tp->tf_rflags;
	if (flags & GET_MC_CLEAR_RET) {
		mcp->mc_rax = 0;
		mcp->mc_rdx = 0;
		mcp->mc_rflags &= ~PSL_C;
	} else {
		mcp->mc_rax = tp->tf_rax;
		mcp->mc_rdx = tp->tf_rdx;
	}
	mcp->mc_rip = tp->tf_rip;
	mcp->mc_cs = tp->tf_cs;
	mcp->mc_rsp = tp->tf_rsp;
	mcp->mc_ss = tp->tf_ss;
	mcp->mc_ds = tp->tf_ds;
	mcp->mc_es = tp->tf_es;
	mcp->mc_fs = tp->tf_fs;
	mcp->mc_gs = tp->tf_gs;
	mcp->mc_flags = tp->tf_flags;
	mcp->mc_len = sizeof(*mcp);
	get_fpcontext(td, mcp, NULL, 0);
	update_pcb_bases(pcb);
	mcp->mc_fsbase = pcb->pcb_fsbase;
	mcp->mc_gsbase = pcb->pcb_gsbase;
	mcp->mc_xfpustate = 0;
	mcp->mc_xfpustate_len = 0;
	bzero(mcp->mc_spare, sizeof(mcp->mc_spare));
	return (0);
}

/*
 * Set machine context.
 *
 * Only the user-modifiable rflags bits are honored here, and the %cs
 * selector is never touched.
 */
int
set_mcontext(struct thread *td, mcontext_t *mcp)
{
	struct pcb *pcb;
	struct trapframe *tp;
	char *xfpustate;
	long rflags;
	int ret;

	pcb = td->td_pcb;
	tp = td->td_frame;
	if (mcp->mc_len != sizeof(*mcp) ||
	    (mcp->mc_flags & ~_MC_FLAG_MASK) != 0)
		return (EINVAL);
	rflags = (mcp->mc_rflags & PSL_USERCHANGE) |
	    (tp->tf_rflags & ~PSL_USERCHANGE);
	if (mcp->mc_flags & _MC_HASFPXSTATE) {
		if (mcp->mc_xfpustate_len > cpu_max_ext_state_size -
		    sizeof(struct savefpu))
			return (EINVAL);
		xfpustate = __builtin_alloca(mcp->mc_xfpustate_len);
		ret = copyin((void *)mcp->mc_xfpustate, xfpustate,
		    mcp->mc_xfpustate_len);
		if (ret != 0)
			return (ret);
	} else
		xfpustate = NULL;
	ret = set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len);
	if (ret != 0)
		return (ret);
	tp->tf_r15 = mcp->mc_r15;
	tp->tf_r14 = mcp->mc_r14;
	tp->tf_r13 = mcp->mc_r13;
	tp->tf_r12 = mcp->mc_r12;
	tp->tf_r11 = mcp->mc_r11;
	tp->tf_r10 = mcp->mc_r10;
	tp->tf_r9  = mcp->mc_r9;
	tp->tf_r8  = mcp->mc_r8;
	tp->tf_rdi = mcp->mc_rdi;
	tp->tf_rsi = mcp->mc_rsi;
	tp->tf_rbp = mcp->mc_rbp;
	tp->tf_rbx = mcp->mc_rbx;
	tp->tf_rdx = mcp->mc_rdx;
	tp->tf_rcx = mcp->mc_rcx;
	tp->tf_rax = mcp->mc_rax;
	tp->tf_rip = mcp->mc_rip;
	tp->tf_rflags = rflags;
	tp->tf_rsp = mcp->mc_rsp;
	tp->tf_ss = mcp->mc_ss;
	tp->tf_flags = mcp->mc_flags;
	if (tp->tf_flags & TF_HASSEGS) {
		tp->tf_ds = mcp->mc_ds;
		tp->tf_es = mcp->mc_es;
		tp->tf_fs = mcp->mc_fs;
		tp->tf_gs = mcp->mc_gs;
	}
	set_pcb_flags(pcb, PCB_FULL_IRET);
	if (mcp->mc_flags & _MC_HASBASES) {
		pcb->pcb_fsbase = mcp->mc_fsbase;
		pcb->pcb_gsbase = mcp->mc_gsbase;
	}
	return (0);
}

static void
get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave,
    size_t xfpusave_len)
{
	size_t max_len, len;

	mcp->mc_ownedfp = fpugetregs(td);
	bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate[0],
	    sizeof(mcp->mc_fpstate));
	mcp->mc_fpformat = fpuformat();
	if (!use_xsave || xfpusave_len == 0)
		return;
	max_len = cpu_max_ext_state_size - sizeof(struct savefpu);
	len = xfpusave_len;
	if (len > max_len) {
		/* Zero the tail of the buffer that we cannot fill. */
		bzero(xfpusave + max_len, len - max_len);
		len = max_len;
	}
	mcp->mc_flags |= _MC_HASFPXSTATE;
	mcp->mc_xfpustate_len = len;
	/* The extended state area follows the legacy savefpu, hence +1. */
	bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len);
}

static int
set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate,
    size_t xfpustate_len)
{
	int error;

	if (mcp->mc_fpformat == _MC_FPFMT_NODEV)
		return (0);
	else if (mcp->mc_fpformat != _MC_FPFMT_XMM)
		return (EINVAL);
	else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) {
		/* We don't care what state is left in the FPU or PCB. */
		fpstate_drop(td);
		error = 0;
	} else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU ||
	    mcp->mc_ownedfp == _MC_FPOWNED_PCB) {
		error = fpusetregs(td, (struct savefpu *)&mcp->mc_fpstate,
		    xfpustate, xfpustate_len);
	} else
		return (EINVAL);
	return (error);
}

void
fpstate_drop(struct thread *td)
{

	KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu"));
	critical_enter();
	if (PCPU_GET(fpcurthread) == td)
		fpudrop();
	/*
	 * XXX force a full drop of the fpu.  The above only drops it if we
	 * owned it.
	 *
	 * XXX I don't much like fpugetuserregs()'s semantics of doing a full
	 * drop.  Dropping only to the pcb matches fnsave's behaviour.
	 * We only need to drop to !PCB_INITDONE in sendsig().  But
	 * sendsig() is the only caller of fpugetuserregs()... perhaps we just
	 * have too many layers.
	 */
	clear_pcb_flags(curthread->td_pcb,
	    PCB_FPUINITDONE | PCB_USERFPUINITDONE);
	critical_exit();
}

int
fill_dbregs(struct thread *td, struct dbreg *dbregs)
{
	struct pcb *pcb;

	if (td == NULL) {
		dbregs->dr[0] = rdr0();
		dbregs->dr[1] = rdr1();
		dbregs->dr[2] = rdr2();
		dbregs->dr[3] = rdr3();
		dbregs->dr[6] = rdr6();
		dbregs->dr[7] = rdr7();
	} else {
		pcb = td->td_pcb;
		dbregs->dr[0] = pcb->pcb_dr0;
		dbregs->dr[1] = pcb->pcb_dr1;
		dbregs->dr[2] = pcb->pcb_dr2;
		dbregs->dr[3] = pcb->pcb_dr3;
		dbregs->dr[6] = pcb->pcb_dr6;
		dbregs->dr[7] = pcb->pcb_dr7;
	}
	dbregs->dr[4] = 0;
	dbregs->dr[5] = 0;
	dbregs->dr[8] = 0;
	dbregs->dr[9] = 0;
	dbregs->dr[10] = 0;
	dbregs->dr[11] = 0;
	dbregs->dr[12] = 0;
	dbregs->dr[13] = 0;
	dbregs->dr[14] = 0;
	dbregs->dr[15] = 0;
	return (0);
}

int
set_dbregs(struct thread *td, struct dbreg *dbregs)
{
	struct pcb *pcb;
	int i;

	if (td == NULL) {
		load_dr0(dbregs->dr[0]);
		load_dr1(dbregs->dr[1]);
		load_dr2(dbregs->dr[2]);
		load_dr3(dbregs->dr[3]);
		load_dr6(dbregs->dr[6]);
		load_dr7(dbregs->dr[7]);
	} else {
		/*
		 * Don't let an illegal value for dr7 get set.  Specifically,
		 * check for undefined settings.  Setting these bit patterns
		 * results in undefined behaviour and can lead to an
		 * unexpected TRCTRAP or a general protection fault right
		 * here.  The upper 32 bits of dr6 and dr7 must not be set.
		 */
		for (i = 0; i < 4; i++) {
			if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02)
				return (EINVAL);
			if (td->td_frame->tf_cs == _ucode32sel &&
			    DBREG_DR7_LEN(dbregs->dr[7], i) == DBREG_DR7_LEN_8)
				return (EINVAL);
		}
		if ((dbregs->dr[6] & 0xffffffff00000000ul) != 0 ||
		    (dbregs->dr[7] & 0xffffffff00000000ul) != 0)
			return (EINVAL);

		pcb = td->td_pcb;

		/*
		 * Don't let a process set a breakpoint that is not within
		 * the process's address space.  If a process could do this,
		 * it could halt the system by setting a breakpoint in the
		 * kernel (if ddb was enabled).  Thus, we need to check to
		 * make sure that no breakpoints are being enabled for
		 * addresses outside the process's address space.
		 *
		 * XXX - what about when the watched area of the user's
		 * address space is written into from within the kernel
		 * ... wouldn't that still cause a breakpoint to be generated
		 * from within kernel mode?
		 */

		if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) {
			/* dr0 is enabled */
			if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS)
				return (EINVAL);
		}
		if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) {
			/* dr1 is enabled */
			if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS)
				return (EINVAL);
		}
		if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) {
			/* dr2 is enabled */
			if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS)
				return (EINVAL);
		}
		if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) {
			/* dr3 is enabled */
			if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS)
				return (EINVAL);
		}

		pcb->pcb_dr0 = dbregs->dr[0];
		pcb->pcb_dr1 = dbregs->dr[1];
		pcb->pcb_dr2 = dbregs->dr[2];
		pcb->pcb_dr3 = dbregs->dr[3];
		pcb->pcb_dr6 = dbregs->dr[6];
		pcb->pcb_dr7 = dbregs->dr[7];

		set_pcb_flags(pcb, PCB_DBREGS);
	}

	return (0);
}

void
reset_dbregs(void)
{

	load_dr7(0);	/* Turn off the control bits first */
	load_dr0(0);
	load_dr1(0);
	load_dr2(0);
	load_dr3(0);
	load_dr6(0);
}

/*
 * Return > 0 if a hardware breakpoint has been hit, and the
 * breakpoint was in user space.  Return 0, otherwise.
 */
int
user_dbreg_trap(void)
{
	u_int64_t dr7, dr6; /* debug registers dr6 and dr7 */
	u_int64_t bp;       /* breakpoint bits extracted from dr6 */
	int nbp;            /* number of breakpoints that triggered */
	caddr_t addr[4];    /* breakpoint addresses */
	int i;

	dr7 = rdr7();
	if ((dr7 & 0x000000ff) == 0) {
		/*
		 * All local and global enable bits in the dr7 register
		 * are zero, thus the trap couldn't have been caused by
		 * the hardware debug registers.
		 */
		return (0);
	}

	nbp = 0;
	dr6 = rdr6();
	bp = dr6 & 0x0000000f;

	if (!bp) {
		/*
		 * None of the breakpoint bits are set, meaning this
		 * trap was not caused by any of the debug registers.
		 */
		return (0);
	}

	/*
	 * At least one of the breakpoints was hit; check which ones
	 * and whether any of them are user space addresses.
	 */

	if (bp & 0x01)
		addr[nbp++] = (caddr_t)rdr0();
	if (bp & 0x02)
		addr[nbp++] = (caddr_t)rdr1();
	if (bp & 0x04)
		addr[nbp++] = (caddr_t)rdr2();
	if (bp & 0x08)
		addr[nbp++] = (caddr_t)rdr3();

	for (i = 0; i < nbp; i++) {
		if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) {
			/* addr[i] is in user space. */
			return (nbp);
		}
	}

	/* None of the breakpoints are in user space. */
	return (0);
}

/*
 * pcb_flags is only modified by the current thread, or by other threads
 * when the current thread is stopped.  However, the current thread may
 * change it from the interrupt context in cpu_switch(), or in the trap
 * handler.  When we read-modify-write pcb_flags from C sources, the
 * compiler may generate code that is not atomic with respect to the
 * interrupt handler.  If a trap or interrupt happens and any flag is
 * modified from the handler, it can be clobbered with the cached value
 * later.  Therefore, we implement setting and clearing flags with
 * single-instruction functions, which do not race with possible
 * modification of the flags from the trap or interrupt context, because
 * traps and interrupts are executed only on instruction boundaries.
 */
void
set_pcb_flags_raw(struct pcb *pcb, const u_int flags)
{

	__asm __volatile("orl %1,%0"
	    : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags)
	    : "cc", "memory");
}
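
/*
 * For illustration (a sketch): a plain C read-modify-write such as
 *
 *	pcb->pcb_flags |= flags;
 *
 * may compile into separate load, modify and store instructions; an
 * interrupt taken between the load and the store could update the
 * flags, only to have its update overwritten by the stale store.  The
 * single "orl" with a memory operand above cannot be split that way.
 */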

/*
 * Support for RDFSBASE, WRFSBASE and similar instructions for the %gs
 * base requires that the kernel save MSR_FSBASE and MSR_{K,}GSBASE
 * into the pcb if user space modified the bases.  We must save them
 * on the context switch or if the return to usermode happens through
 * doreti.
 *
 * Tracking of both events is performed by the pcb flag PCB_FULL_IRET,
 * which has the consequence that the base MSRs must be saved each
 * time the PCB_FULL_IRET flag is set.  We disable interrupts to sync
 * with context switches.
 */
void
set_pcb_flags(struct pcb *pcb, const u_int flags)
{
	register_t r;

	if (curpcb == pcb &&
	    (flags & PCB_FULL_IRET) != 0 &&
	    (pcb->pcb_flags & PCB_FULL_IRET) == 0 &&
	    (cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0) {
		r = intr_disable();
		if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) {
			if (rfs() == _ufssel)
				pcb->pcb_fsbase = rdfsbase();
			if (rgs() == _ugssel)
				pcb->pcb_gsbase = rdmsr(MSR_KGSBASE);
		}
		set_pcb_flags_raw(pcb, flags);
		intr_restore(r);
	} else {
		set_pcb_flags_raw(pcb, flags);
	}
}

void
clear_pcb_flags(struct pcb *pcb, const u_int flags)
{

	__asm __volatile("andl %1,%0"
	    : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags)
	    : "cc", "memory");
}

#ifdef KDB

/*
 * Provide inb() and outb() as functions.  They are normally only available as
 * inline functions, thus cannot be called from the debugger.
 */

/* silence compiler warnings */
u_char inb_(u_short);
void outb_(u_short, u_char);

u_char
inb_(u_short port)
{
	return (inb(port));
}

void
outb_(u_short port, u_char data)
{
	outb(port, data);
}

#endif /* KDB */
