/*-
 * Copyright (c) 2003 Peter Wemm.
 * Copyright (c) 1992 Terrence R. Lambert.
 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/11/sys/amd64/amd64/machdep.c 341491 2018-12-04 19:07:10Z markj $");

#include "opt_atpic.h"
#include "opt_compat.h"
#include "opt_cpu.h"
#include "opt_ddb.h"
#include "opt_inet.h"
#include "opt_isa.h"
#include "opt_kstack_pages.h"
#include "opt_maxmem.h"
#include "opt_mp_watchdog.h"
#include "opt_perfmon.h"
#include "opt_platform.h"
#include "opt_sched.h"

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/bus.h>
#include <sys/callout.h>
#include <sys/cons.h>
#include <sys/cpu.h>
#include <sys/efi.h>
#include <sys/eventhandler.h>
#include <sys/exec.h>
#include <sys/imgact.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/linker.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/memrange.h>
#include <sys/msgbuf.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/ptrace.h>
#include <sys/reboot.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#ifdef SMP
#include <sys/smp.h>
#endif
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/ucontext.h>
#include <sys/vmmeter.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#include <vm/vm_param.h>
#include <vm/vm_phys.h>

#ifdef DDB
#ifndef KDB
#error KDB must be enabled in order for DDB to work!
#endif
#include <ddb/ddb.h>
#include <ddb/db_sym.h>
#endif

#include <net/netisr.h>

#include <machine/clock.h>
#include <machine/cpu.h>
#include <machine/cputypes.h>
#include <machine/frame.h>
#include <machine/intr_machdep.h>
#include <x86/mca.h>
#include <machine/md_var.h>
#include <machine/metadata.h>
#include <machine/mp_watchdog.h>
#include <machine/pc/bios.h>
#include <machine/pcb.h>
#include <machine/proc.h>
#include <machine/reg.h>
#include <machine/sigframe.h>
#include <machine/specialreg.h>
#ifdef PERFMON
#include <machine/perfmon.h>
#endif
#include <machine/tss.h>
#ifdef SMP
#include <machine/smp.h>
#endif
#ifdef FDT
#include <x86/fdt.h>
#endif

#ifdef DEV_ATPIC
#include <x86/isa/icu.h>
#else
#include <x86/apicvar.h>
#endif

#include <isa/isareg.h>
#include <isa/rtc.h>
#include <x86/init.h>

/* Sanity check for __curthread() */
CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);

/*
 * The PTI trampoline stack needs enough space for a hardware trapframe and a
 * couple of scratch registers, as well as the trapframe left behind after an
 * iret fault.
 */
CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) -
    offsetof(struct pti_frame, pti_rip));

extern u_int64_t hammer_time(u_int64_t, u_int64_t);

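/*
 * CS_SECURE() checks that a user-supplied %cs selector requests user
 * privilege (RPL == 3); EFL_SECURE() checks that only the user-changeable
 * rflags bits differ from the current value.
 */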
#define	CS_SECURE(cs)		(ISPL(cs) == SEL_UPL)
#define	EFL_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)

static void cpu_startup(void *);
static void get_fpcontext(struct thread *td, mcontext_t *mcp,
    char *xfpusave, size_t xfpusave_len);
static int  set_fpcontext(struct thread *td, mcontext_t *mcp,
    char *xfpustate, size_t xfpustate_len);
SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);

/* Preload data parse function */
static caddr_t native_parse_preload_data(u_int64_t);

/* Native function to fetch and parse the e820 map */
static void native_parse_memmap(caddr_t, vm_paddr_t *, int *);

/* Default init_ops implementation. */
struct init_ops init_ops = {
	.parse_preload_data =	native_parse_preload_data,
	.early_clock_source_init =	i8254_init,
	.early_delay =			i8254_delay,
	.parse_memmap =			native_parse_memmap,
#ifdef SMP
	.mp_bootaddress =		mp_bootaddress,
	.start_all_aps =		native_start_all_aps,
#endif
	.msi_init =			msi_init,
};

struct msgbuf *msgbufp;

/*
 * Physical address of the EFI System Table. Stashed from the metadata hints
 * passed into the kernel and used by the EFI code to call runtime services.
 */
vm_paddr_t efi_systbl_phys;

/* Intel ICH registers */
#define ICH_PMBASE	0x400
#define ICH_SMI_EN	(ICH_PMBASE + 0x30)

int	_udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel;

int cold = 1;

long Maxmem = 0;
long realmem = 0;

/*
 * The number of PHYSMAP entries must be one less than the number of
 * PHYSSEG entries because the PHYSMAP entry that spans the largest
 * physical address that is accessible by ISA DMA is split into two
 * PHYSSEG entries.
 */
#define	PHYSMAP_SIZE	(2 * (VM_PHYSSEG_MAX - 1))

vm_paddr_t phys_avail[PHYSMAP_SIZE + 2];
vm_paddr_t dump_avail[PHYSMAP_SIZE + 2];

/* must be 2 less so 0 0 can signal end of chunks */
#define	PHYS_AVAIL_ARRAY_END (nitems(phys_avail) - 2)
#define	DUMP_AVAIL_ARRAY_END (nitems(dump_avail) - 2)

struct kva_md_info kmi;

static struct trapframe proc0_tf;
struct region_descriptor r_gdt, r_idt;

struct pcpu __pcpu[MAXCPU];

struct mtx icu_lock;

struct mem_range_softc mem_range_softc;

struct mtx dt_lock;	/* lock for GDT and LDT */

void (*vmm_resume_p)(void);

static void
cpu_startup(void *dummy)
{
	uintmax_t memsize;
	char *sysenv;

	/*
	 * On MacBooks, we must prevent the legacy USB circuit from
	 * generating an SMI#, which causes several problems, namely:
	 * incorrect CPU frequency detection and failure to start the APs.
	 * We do this by clearing a bit in the SMI_EN (SMI Control and
	 * Enable) register of the Intel ICH LPC Interface Bridge.
	 */
	sysenv = kern_getenv("smbios.system.product");
	if (sysenv != NULL) {
		if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
		    strncmp(sysenv, "MacBook3,1", 10) == 0 ||
		    strncmp(sysenv, "MacBook4,1", 10) == 0 ||
		    strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
		    strncmp(sysenv, "Macmini1,1", 10) == 0) {
			if (bootverbose)
				printf("Disabling LEGACY_USB_EN bit on "
				    "Intel ICH.\n");
			outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
		}
		freeenv(sysenv);
	}

	/*
	 * Good {morning,afternoon,evening,night}.
	 */
	startrtclock();
	printcpuinfo();
#ifdef PERFMON
	perfmon_init();
#endif

	/*
	 * Display physical memory if SMBIOS reports reasonable amount.
	 */
	memsize = 0;
	sysenv = kern_getenv("smbios.memory.enabled");
	if (sysenv != NULL) {
		memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
		freeenv(sysenv);
	}
	if (memsize < ptoa((uintmax_t)vm_cnt.v_free_count))
		memsize = ptoa((uintmax_t)Maxmem);
	printf("real memory  = %ju (%ju MB)\n", memsize, memsize >> 20);
	realmem = atop(memsize);

	/*
	 * Display any holes after the first chunk of extended memory.
	 */
	if (bootverbose) {
		int indx;

		printf("Physical memory chunk(s):\n");
		for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
			vm_paddr_t size;

			size = phys_avail[indx + 1] - phys_avail[indx];
			printf(
			    "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
			    (uintmax_t)phys_avail[indx],
			    (uintmax_t)phys_avail[indx + 1] - 1,
			    (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
		}
	}

	vm_ksubmap_init(&kmi);

	printf("avail memory = %ju (%ju MB)\n",
	    ptoa((uintmax_t)vm_cnt.v_free_count),
	    ptoa((uintmax_t)vm_cnt.v_free_count) / 1048576);

	/*
	 * Set up buffers, so they can be used to read disk labels.
	 */
	bufinit();
	vm_pager_bufferinit();

	cpu_setregs();
}

/*
 * Send an interrupt to process.
 *
 * Stack is set up to allow sigcode stored
 * at top to call routine, followed by call
 * to sigreturn routine below.  After sigreturn
 * resets the signal mask, the stack, and the
 * frame pointer, it returns to the user
 * specified pc, psl.
 */
void
sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
	struct sigframe sf, *sfp;
	struct pcb *pcb;
	struct proc *p;
	struct thread *td;
	struct sigacts *psp;
	char *sp;
	struct trapframe *regs;
	char *xfpusave;
	size_t xfpusave_len;
	int sig;
	int oonstack;

	td = curthread;
	pcb = td->td_pcb;
	p = td->td_proc;
	PROC_LOCK_ASSERT(p, MA_OWNED);
	sig = ksi->ksi_signo;
	psp = p->p_sigacts;
	mtx_assert(&psp->ps_mtx, MA_OWNED);
	regs = td->td_frame;
	oonstack = sigonstack(regs->tf_rsp);

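	/*
	 * Stage any extended FPU state on the kernel stack; it is copied
	 * out to user space together with the sigframe below.
	 */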
	if (cpu_max_ext_state_size > sizeof(struct savefpu) && use_xsave) {
		xfpusave_len = cpu_max_ext_state_size - sizeof(struct savefpu);
		xfpusave = __builtin_alloca(xfpusave_len);
	} else {
		xfpusave_len = 0;
		xfpusave = NULL;
	}

	/* Save user context. */
	bzero(&sf, sizeof(sf));
	sf.sf_uc.uc_sigmask = *mask;
	sf.sf_uc.uc_stack = td->td_sigstk;
	sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
	    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
	sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
	bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(*regs));
	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
	get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len);
	fpstate_drop(td);
	update_pcb_bases(pcb);
	sf.sf_uc.uc_mcontext.mc_fsbase = pcb->pcb_fsbase;
	sf.sf_uc.uc_mcontext.mc_gsbase = pcb->pcb_gsbase;
	bzero(sf.sf_uc.uc_mcontext.mc_spare,
	    sizeof(sf.sf_uc.uc_mcontext.mc_spare));
	bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__));

	/* Allocate space for the signal handler context. */
	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		sp = (char *)td->td_sigstk.ss_sp + td->td_sigstk.ss_size;
#if defined(COMPAT_43)
		td->td_sigstk.ss_flags |= SS_ONSTACK;
#endif
	} else
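		/* Skip the 128-byte red zone reserved below %rsp by the ABI. */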
		sp = (char *)regs->tf_rsp - 128;
	if (xfpusave != NULL) {
		sp -= xfpusave_len;
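		/* The XSAVE area must be 64-byte aligned. */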
		sp = (char *)((unsigned long)sp & ~0x3Ful);
		sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp;
	}
	sp -= sizeof(struct sigframe);
	/* Align to 16 bytes. */
	sfp = (struct sigframe *)((unsigned long)sp & ~0xFul);

	/* Build the argument list for the signal handler. */
	regs->tf_rdi = sig;			/* arg 1 in %rdi */
	regs->tf_rdx = (register_t)&sfp->sf_uc;	/* arg 3 in %rdx */
	bzero(&sf.sf_si, sizeof(sf.sf_si));
	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
		/* Signal handler installed with SA_SIGINFO. */
		regs->tf_rsi = (register_t)&sfp->sf_si;	/* arg 2 in %rsi */
		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;

		/* Fill in POSIX parts */
		sf.sf_si = ksi->ksi_info;
		sf.sf_si.si_signo = sig; /* maybe a translated signal */
		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
	} else {
		/* Old FreeBSD-style arguments. */
		regs->tf_rsi = ksi->ksi_code;	/* arg 2 in %rsi */
		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
		sf.sf_ahu.sf_handler = catcher;
	}
	mtx_unlock(&psp->ps_mtx);
	PROC_UNLOCK(p);

	/*
	 * Copy the sigframe out to the user's stack.
	 */
	if (copyout(&sf, sfp, sizeof(*sfp)) != 0 ||
	    (xfpusave != NULL && copyout(xfpusave,
	    (void *)sf.sf_uc.uc_mcontext.mc_xfpustate, xfpusave_len)
	    != 0)) {
#ifdef DEBUG
		printf("process %ld has trashed its stack\n", (long)p->p_pid);
#endif
		PROC_LOCK(p);
		sigexit(td, SIGILL);
	}

	regs->tf_rsp = (long)sfp;
	regs->tf_rip = p->p_sysent->sv_sigcode_base;
	regs->tf_rflags &= ~(PSL_T | PSL_D);
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_ss = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _ufssel;
	regs->tf_gs = _ugssel;
	regs->tf_flags = TF_HASSEGS;
	PROC_LOCK(p);
	mtx_lock(&psp->ps_mtx);
}

/*
 * System call to clean up state after a signal
 * has been taken.  Reset signal mask and
 * stack state from context left by sendsig (above).
 * Return to previous pc and psl as specified by
 * context left by sendsig. Check carefully to
 * make sure that the user has not modified the
 * state to gain improper privileges.
 *
 * MPSAFE
 */
int
sys_sigreturn(struct thread *td,
    struct sigreturn_args /* { const struct __ucontext *sigcntxp; } */ *uap)
{
	ucontext_t uc;
	struct pcb *pcb;
	struct proc *p;
	struct trapframe *regs;
	ucontext_t *ucp;
	char *xfpustate;
	size_t xfpustate_len;
	long rflags;
	int cs, error, ret;
	ksiginfo_t ksi;

	pcb = td->td_pcb;
	p = td->td_proc;

	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
	if (error != 0) {
		uprintf("pid %d (%s): sigreturn copyin failed\n",
		    p->p_pid, td->td_name);
		return (error);
	}
	ucp = &uc;
	if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) {
		uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid,
		    td->td_name, ucp->uc_mcontext.mc_flags);
		return (EINVAL);
	}
	regs = td->td_frame;
	rflags = ucp->uc_mcontext.mc_rflags;
	/*
	 * Don't allow users to change privileged or reserved flags.
	 */
	if (!EFL_SECURE(rflags, regs->tf_rflags)) {
		uprintf("pid %d (%s): sigreturn rflags = 0x%lx\n", p->p_pid,
		    td->td_name, rflags);
		return (EINVAL);
	}

	/*
	 * Don't allow users to load a valid privileged %cs.  Let the
	 * hardware check for invalid selectors, excess privilege in
	 * other selectors, invalid %eip's and invalid %esp's.
	 */
	cs = ucp->uc_mcontext.mc_cs;
	if (!CS_SECURE(cs)) {
		uprintf("pid %d (%s): sigreturn cs = 0x%x\n", p->p_pid,
		    td->td_name, cs);
		ksiginfo_init_trap(&ksi);
		ksi.ksi_signo = SIGBUS;
		ksi.ksi_code = BUS_OBJERR;
		ksi.ksi_trapno = T_PROTFLT;
		ksi.ksi_addr = (void *)regs->tf_rip;
		trapsignal(td, &ksi);
		return (EINVAL);
	}

	if ((uc.uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) {
		xfpustate_len = uc.uc_mcontext.mc_xfpustate_len;
		if (xfpustate_len > cpu_max_ext_state_size -
		    sizeof(struct savefpu)) {
			uprintf("pid %d (%s): sigreturn xfpusave_len = 0x%zx\n",
			    p->p_pid, td->td_name, xfpustate_len);
			return (EINVAL);
		}
		xfpustate = __builtin_alloca(xfpustate_len);
		error = copyin((const void *)uc.uc_mcontext.mc_xfpustate,
		    xfpustate, xfpustate_len);
		if (error != 0) {
			uprintf(
	"pid %d (%s): sigreturn copying xfpustate failed\n",
			    p->p_pid, td->td_name);
			return (error);
		}
	} else {
		xfpustate = NULL;
		xfpustate_len = 0;
	}
	ret = set_fpcontext(td, &ucp->uc_mcontext, xfpustate, xfpustate_len);
	if (ret != 0) {
		uprintf("pid %d (%s): sigreturn set_fpcontext err %d\n",
		    p->p_pid, td->td_name, ret);
		return (ret);
	}
	bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(*regs));
	update_pcb_bases(pcb);
	pcb->pcb_fsbase = ucp->uc_mcontext.mc_fsbase;
	pcb->pcb_gsbase = ucp->uc_mcontext.mc_gsbase;

#if defined(COMPAT_43)
	if (ucp->uc_mcontext.mc_onstack & 1)
		td->td_sigstk.ss_flags |= SS_ONSTACK;
	else
		td->td_sigstk.ss_flags &= ~SS_ONSTACK;
#endif

	kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
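	/*
	 * EJUSTRETURN keeps the syscall return path from overwriting the
	 * register state that was just restored from the ucontext.
	 */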
	return (EJUSTRETURN);
}

#ifdef COMPAT_FREEBSD4
int
freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap)
{

	return sys_sigreturn(td, (struct sigreturn_args *)uap);
}
#endif

/*
 * Reset registers to default values on exec.
 */
void
exec_setregs(struct thread *td, struct image_params *imgp, u_long stack)
{
	struct trapframe *regs;
	struct pcb *pcb;
	register_t saved_rflags;

	regs = td->td_frame;
	pcb = td->td_pcb;

	mtx_lock(&dt_lock);
	if (td->td_proc->p_md.md_ldt != NULL)
		user_ldt_free(td);
	else
		mtx_unlock(&dt_lock);

	update_pcb_bases(pcb);
	pcb->pcb_fsbase = 0;
	pcb->pcb_gsbase = 0;
	clear_pcb_flags(pcb, PCB_32BIT);
	pcb->pcb_initial_fpucw = __INITIAL_FPUCW__;

	saved_rflags = regs->tf_rflags & PSL_T;
	bzero((char *)regs, sizeof(struct trapframe));
	regs->tf_rip = imgp->entry_addr;
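	/* Align so that %rsp + 8 is 16-byte aligned, as after a call. */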
	regs->tf_rsp = ((stack - 8) & ~0xFul) + 8;
	regs->tf_rdi = stack;		/* argv */
	regs->tf_rflags = PSL_USER | saved_rflags;
	regs->tf_ss = _udatasel;
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _ufssel;
	regs->tf_gs = _ugssel;
	regs->tf_flags = TF_HASSEGS;
	td->td_retval[1] = 0;

	/*
	 * Reset the hardware debug registers if they were in use.
	 * They won't have any meaning for the newly exec'd process.
	 */
	if (pcb->pcb_flags & PCB_DBREGS) {
		pcb->pcb_dr0 = 0;
		pcb->pcb_dr1 = 0;
		pcb->pcb_dr2 = 0;
		pcb->pcb_dr3 = 0;
		pcb->pcb_dr6 = 0;
		pcb->pcb_dr7 = 0;
		if (pcb == curpcb) {
			/*
			 * Clear the debug registers on the running
			 * CPU, otherwise they will end up affecting
			 * the next process we switch to.
			 */
			reset_dbregs();
		}
		clear_pcb_flags(pcb, PCB_DBREGS);
	}

	/*
	 * Drop the FP state if we hold it, so that the process gets a
	 * clean FP state if it uses the FPU again.
	 */
	fpstate_drop(td);
}

void
cpu_setregs(void)
{
	register_t cr0;

	cr0 = rcr0();
	/*
	 * CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the
	 * BSP.  See the comments there about why we set them.
	 */
	cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
	load_cr0(cr0);
}

/*
 * Initialize amd64 and configure to run kernel
 */

/*
 * Initialize segments & interrupt table
 */

struct user_segment_descriptor gdt[NGDT * MAXCPU];/* global descriptor tables */
static struct gate_descriptor idt0[NIDT];
struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */

static char dblfault_stack[PAGE_SIZE] __aligned(16);
static char mce0_stack[PAGE_SIZE] __aligned(16);
static char nmi0_stack[PAGE_SIZE] __aligned(16);
static char dbg0_stack[PAGE_SIZE] __aligned(16);
CTASSERT(sizeof(struct nmi_pcpu) == 16);

struct amd64tss common_tss[MAXCPU];

/*
 * Software prototypes -- in more palatable form.
 *
 * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
 * slots as corresponding segments for i386 kernel.
 */
struct soft_segment_descriptor gdt_segs[] = {
/* GNULL_SEL	0 Null Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GNULL2_SEL	1 Null Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUFS32_SEL	2 32 bit %fs Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUGS32_SEL	3 32 bit %gs Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GCODE_SEL	4 Code Descriptor for kernel */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GDATA_SEL	5 Data Descriptor for kernel */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GUCODE32_SEL	6 32 bit Code Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUDATA_SEL	7 32/64 bit Data Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUCODE_SEL	8 64 bit Code Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GPROC0_SEL	9 Proc 0 Tss Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
	.ssd_type = SDT_SYSTSS,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* Actually, the TSS is a system descriptor which is double size */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUSERLDT_SEL	11 LDT Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUSERLDT_SEL	12 LDT Descriptor, double size */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
};

void
setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
{
	struct gate_descriptor *ip;

	ip = idt + idx;
	ip->gd_looffset = (uintptr_t)func;
	ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
	ip->gd_ist = ist;
	ip->gd_xx = 0;
	ip->gd_type = typ;
	ip->gd_dpl = dpl;
	ip->gd_p = 1;
	ip->gd_hioffset = ((uintptr_t)func) >> 16;
}

extern inthand_t
	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
	IDTVEC(xmm), IDTVEC(dblfault),
	IDTVEC(div_pti), IDTVEC(bpt_pti),
	IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
	IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
	IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
	IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
	IDTVEC(xmm_pti),
#ifdef KDTRACE_HOOKS
	IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
#endif
#ifdef XENHVM
	IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
#endif
	IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
	IDTVEC(fast_syscall_pti);

#ifdef DDB
/*
 * Display the index and function name of any IDT entries that don't use
 * the default 'rsvd' entry point.
 */
DB_SHOW_COMMAND(idt, db_show_idt)
{
	struct gate_descriptor *ip;
	int idx;
	uintptr_t func;

	ip = idt;
	for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
		func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
		if (func != (uintptr_t)&IDTVEC(rsvd)) {
			db_printf("%3d\t", idx);
			db_printsym(func, DB_STGY_PROC);
			db_printf("\n");
		}
		ip++;
	}
}

/* Show privileged registers. */
DB_SHOW_COMMAND(sysregs, db_show_sysregs)
{
	struct {
		uint16_t limit;
		uint64_t base;
	} __packed idtr, gdtr;
	uint16_t ldt, tr;

	__asm __volatile("sidt %0" : "=m" (idtr));
	db_printf("idtr\t0x%016lx/%04x\n",
	    (u_long)idtr.base, (u_int)idtr.limit);
	__asm __volatile("sgdt %0" : "=m" (gdtr));
	db_printf("gdtr\t0x%016lx/%04x\n",
	    (u_long)gdtr.base, (u_int)gdtr.limit);
	__asm __volatile("sldt %0" : "=r" (ldt));
	db_printf("ldtr\t0x%04x\n", ldt);
	__asm __volatile("str %0" : "=r" (tr));
	db_printf("tr\t0x%04x\n", tr);
	db_printf("cr0\t0x%016lx\n", rcr0());
	db_printf("cr2\t0x%016lx\n", rcr2());
	db_printf("cr3\t0x%016lx\n", rcr3());
	db_printf("cr4\t0x%016lx\n", rcr4());
	if (rcr4() & CR4_XSAVE)
		db_printf("xcr0\t0x%016lx\n", rxcr(0));
	db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER));
	if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
		db_printf("FEATURES_CTL\t%016lx\n",
		    rdmsr(MSR_IA32_FEATURE_CONTROL));
	db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR));
	db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT));
	db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE));
}

DB_SHOW_COMMAND(dbregs, db_show_dbregs)
{

	db_printf("dr0\t0x%016lx\n", rdr0());
	db_printf("dr1\t0x%016lx\n", rdr1());
	db_printf("dr2\t0x%016lx\n", rdr2());
	db_printf("dr3\t0x%016lx\n", rdr3());
	db_printf("dr6\t0x%016lx\n", rdr6());
	db_printf("dr7\t0x%016lx\n", rdr7());
}
#endif

void
sdtossd(struct user_segment_descriptor *sd,
    struct soft_segment_descriptor *ssd)
{

	ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
	ssd->ssd_type  = sd->sd_type;
	ssd->ssd_dpl   = sd->sd_dpl;
	ssd->ssd_p     = sd->sd_p;
	ssd->ssd_long  = sd->sd_long;
	ssd->ssd_def32 = sd->sd_def32;
	ssd->ssd_gran  = sd->sd_gran;
}

void
ssdtosd(struct soft_segment_descriptor *ssd,
    struct user_segment_descriptor *sd)
{

	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type  = ssd->ssd_type;
	sd->sd_dpl   = ssd->ssd_dpl;
	sd->sd_p     = ssd->ssd_p;
	sd->sd_long  = ssd->ssd_long;
	sd->sd_def32 = ssd->ssd_def32;
	sd->sd_gran  = ssd->ssd_gran;
}

void
ssdtosyssd(struct soft_segment_descriptor *ssd,
    struct system_segment_descriptor *sd)
{

	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type  = ssd->ssd_type;
	sd->sd_dpl   = ssd->ssd_dpl;
	sd->sd_p     = ssd->ssd_p;
	sd->sd_gran  = ssd->ssd_gran;
}

#if !defined(DEV_ATPIC) && defined(DEV_ISA)
#include <isa/isavar.h>
#include <isa/isareg.h>
/*
 * Return a bitmap of the current interrupt requests.  This is 8259-specific
 * and is only suitable for use at probe time.
 * This is only here to pacify sio.  It is NOT FATAL if this doesn't work.
 * It shouldn't be here.  There should probably be an APIC centric
 * implementation in the apic driver code, if at all.
 */
intrmask_t
isa_irq_pending(void)
{
	u_char irr1;
	u_char irr2;

	irr1 = inb(IO_ICU1);
	irr2 = inb(IO_ICU2);
	return ((irr2 << 8) | irr1);
}
#endif

u_int basemem;

static int
add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
    int *physmap_idxp)
{
	int i, insert_idx, physmap_idx;

	physmap_idx = *physmap_idxp;

	if (length == 0)
		return (1);

	/*
	 * Find insertion point while checking for overlap.  Start off by
	 * assuming the new entry will be added to the end.
	 *
	 * NB: physmap_idx points to the next free slot.
	 */
	insert_idx = physmap_idx;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (base < physmap[i + 1]) {
			if (base + length <= physmap[i]) {
				insert_idx = i;
				break;
			}
			if (boothowto & RB_VERBOSE)
				printf(
		    "Overlapping memory regions, ignoring second region\n");
			return (1);
		}
	}

	/* See if we can prepend to the next entry. */
	if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) {
		physmap[insert_idx] = base;
		return (1);
	}

	/* See if we can append to the previous entry. */
	if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
		physmap[insert_idx - 1] += length;
		return (1);
	}

	physmap_idx += 2;
	*physmap_idxp = physmap_idx;
	if (physmap_idx == PHYSMAP_SIZE) {
		printf(
		"Too many segments in the physical address map, giving up\n");
		return (0);
	}

	/*
	 * Move the last 'N' entries down to make room for the new
	 * entry if needed.
	 */
	for (i = (physmap_idx - 2); i > insert_idx; i -= 2) {
		physmap[i] = physmap[i - 2];
		physmap[i + 1] = physmap[i - 1];
	}

	/* Insert the new entry. */
	physmap[insert_idx] = base;
	physmap[insert_idx + 1] = base + length;
	return (1);
}

void
bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize,
                      vm_paddr_t *physmap, int *physmap_idx)
{
	struct bios_smap *smap, *smapend;

	smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);

	for (smap = smapbase; smap < smapend; smap++) {
		if (boothowto & RB_VERBOSE)
			printf("SMAP type=%02x base=%016lx len=%016lx\n",
			    smap->type, smap->base, smap->length);

		if (smap->type != SMAP_TYPE_MEMORY)
			continue;

		if (!add_physmap_entry(smap->base, smap->length, physmap,
		    physmap_idx))
			break;
	}
}

static void
add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
    int *physmap_idx)
{
	struct efi_md *map, *p;
	const char *type;
	size_t efisz;
	int ndesc, i;

	static const char *types[] = {
		"Reserved",
		"LoaderCode",
		"LoaderData",
		"BootServicesCode",
		"BootServicesData",
		"RuntimeServicesCode",
		"RuntimeServicesData",
		"ConventionalMemory",
		"UnusableMemory",
		"ACPIReclaimMemory",
		"ACPIMemoryNVS",
		"MemoryMappedIO",
		"MemoryMappedIOPortSpace",
		"PalCode",
		"PersistentMemory"
	};

	/*
	 * Memory map data provided by UEFI via the GetMemoryMap
	 * Boot Services API.
	 */
	efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
	map = (struct efi_md *)((uint8_t *)efihdr + efisz);

	if (efihdr->descriptor_size == 0)
		return;
	ndesc = efihdr->memory_size / efihdr->descriptor_size;

	if (boothowto & RB_VERBOSE)
		printf("%23s %12s %12s %8s %4s\n",
		    "Type", "Physical", "Virtual", "#Pages", "Attr");

	for (i = 0, p = map; i < ndesc; i++,
	    p = efi_next_descriptor(p, efihdr->descriptor_size)) {
		if (boothowto & RB_VERBOSE) {
			if (p->md_type < nitems(types))
				type = types[p->md_type];
			else
				type = "<INVALID>";
			printf("%23s %012lx %12p %08lx ", type, p->md_phys,
			    p->md_virt, p->md_pages);
			if (p->md_attr & EFI_MD_ATTR_UC)
				printf("UC ");
			if (p->md_attr & EFI_MD_ATTR_WC)
				printf("WC ");
			if (p->md_attr & EFI_MD_ATTR_WT)
				printf("WT ");
			if (p->md_attr & EFI_MD_ATTR_WB)
				printf("WB ");
			if (p->md_attr & EFI_MD_ATTR_UCE)
				printf("UCE ");
			if (p->md_attr & EFI_MD_ATTR_WP)
				printf("WP ");
			if (p->md_attr & EFI_MD_ATTR_RP)
				printf("RP ");
			if (p->md_attr & EFI_MD_ATTR_XP)
				printf("XP ");
			if (p->md_attr & EFI_MD_ATTR_NV)
				printf("NV ");
			if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE)
				printf("MORE_RELIABLE ");
			if (p->md_attr & EFI_MD_ATTR_RO)
				printf("RO ");
			if (p->md_attr & EFI_MD_ATTR_RT)
				printf("RUNTIME");
			printf("\n");
		}

		switch (p->md_type) {
		case EFI_MD_TYPE_CODE:
		case EFI_MD_TYPE_DATA:
		case EFI_MD_TYPE_BS_CODE:
		case EFI_MD_TYPE_BS_DATA:
		case EFI_MD_TYPE_FREE:
			/*
			 * We're allowed to use any entry with these types.
			 */
			break;
		default:
			continue;
		}

		if (!add_physmap_entry(p->md_phys, (p->md_pages * PAGE_SIZE),
		    physmap, physmap_idx))
			break;
	}
}

static char bootmethod[16] = "";
SYSCTL_STRING(_machdep, OID_AUTO, bootmethod, CTLFLAG_RD, bootmethod, 0,
    "System firmware boot method");

static void
native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx)
{
	struct bios_smap *smap;
	struct efi_map_header *efihdr;
	u_int32_t size;

	/*
	 * Memory map from INT 15:E820.
	 *
	 * subr_module.c says:
	 * "Consumer may safely assume that size value precedes data."
	 * ie: an int32_t immediately precedes smap.
	 */

	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
	smap = (struct bios_smap *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);
	if (efihdr == NULL && smap == NULL)
		panic("No BIOS smap or EFI map info from loader!");

	if (efihdr != NULL) {
		add_efi_map_entries(efihdr, physmap, physmap_idx);
		strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
	} else {
		size = *((u_int32_t *)smap - 1);
		bios_add_smap_entries(smap, size, physmap, physmap_idx);
		strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
	}
}

#define	PAGES_PER_GB	(1024 * 1024 * 1024 / PAGE_SIZE)

/*
 * Populate the (physmap) array with base/bound pairs describing the
 * available physical memory in the system, then test this memory and
 * build the phys_avail array describing the actually-available memory.
 *
 * Total memory size may be set by the kernel environment variable
 * hw.physmem or the compile-time define MAXMEM.
 *
 * XXX first should be vm_paddr_t.
 */
static void
getmemsize(caddr_t kmdp, u_int64_t first)
{
	int i, physmap_idx, pa_indx, da_indx;
	vm_paddr_t pa, physmap[PHYSMAP_SIZE];
	u_long physmem_start, physmem_tunable, memtest;
	pt_entry_t *pte;
	quad_t dcons_addr, dcons_size;
	int page_counter;

	/*
	 * Tell the physical memory allocator about pages used to store
	 * the kernel and preloaded data.  See kmem_bootstrap_free().
	 */
	vm_phys_add_seg((vm_paddr_t)kernphys, trunc_page(first));

	bzero(physmap, sizeof(physmap));
	physmap_idx = 0;

	init_ops.parse_memmap(kmdp, physmap, &physmap_idx);
	physmap_idx -= 2;

	/*
	 * Find the 'base memory' segment for SMP
	 */
	basemem = 0;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (physmap[i] <= 0xA0000) {
			basemem = physmap[i + 1] / 1024;
			break;
		}
	}
	if (basemem == 0 || basemem > 640) {
		if (bootverbose)
			printf(
		"Memory map doesn't contain a basemem segment, faking it\n");
		basemem = 640;
	}

	/*
	 * Make hole for "AP -> long mode" bootstrap code.  The
	 * mp_bootaddress vector is only available when the kernel
	 * is configured to support APs, and the APs start in 32bit
	 * mode (e.g. SMP bare metal).
	 */
	if (init_ops.mp_bootaddress) {
		if (physmap[1] >= 0x100000000)
			panic(
	"Basemem segment is not suitable for AP bootstrap code!");
		physmap[1] = init_ops.mp_bootaddress(physmap[1] / 1024);
	}

	/*
	 * Maxmem isn't the "maximum memory", it's one larger than the
	 * highest page of the physical address space.  It should be
	 * called something like "Maxphyspage".  We may adjust this
	 * based on ``hw.physmem'' and the results of the memory test.
	 */
	Maxmem = atop(physmap[physmap_idx + 1]);

#ifdef MAXMEM
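	/* The MAXMEM option is given in kilobytes; convert it to 4KB pages. */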
	Maxmem = MAXMEM / 4;
#endif

	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
		Maxmem = atop(physmem_tunable);

	/*
	 * The boot memory test is disabled by default, as it takes a
	 * significant amount of time on large-memory systems, and is
	 * unfriendly to virtual machines as it unnecessarily touches all
	 * pages.
	 *
	 * A general name is used as the code may be extended to support
	 * additional tests beyond the current "page present" test.
	 */
	memtest = 0;
	TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);

	/*
	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
	 * in the system.
	 */
	if (Maxmem > atop(physmap[physmap_idx + 1]))
		Maxmem = atop(physmap[physmap_idx + 1]);

	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
	    (boothowto & RB_VERBOSE))
		printf("Physical memory use set to %ldK\n", Maxmem * 4);

	/* call pmap initialization to make new kernel address space */
	pmap_bootstrap(&first);

	/*
	 * Size up each available chunk of physical memory.
	 *
	 * XXX Some BIOSes corrupt low 64KB between suspend and resume.
	 * By default, mask off the first 16 pages unless we appear to be
	 * running in a VM.
	 */
	physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
	TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
	if (physmap[0] < physmem_start) {
		if (physmem_start < PAGE_SIZE)
			physmap[0] = PAGE_SIZE;
		else if (physmem_start >= physmap[1])
			physmap[0] = round_page(physmap[1] - PAGE_SIZE);
		else
			physmap[0] = round_page(physmem_start);
	}
	pa_indx = 0;
	da_indx = 1;
	phys_avail[pa_indx++] = physmap[0];
	phys_avail[pa_indx] = physmap[0];
	dump_avail[da_indx] = physmap[0];
	pte = CMAP1;

	/*
	 * Get dcons buffer address
	 */
	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
	    getenv_quad("dcons.size", &dcons_size) == 0)
		dcons_addr = 0;

	/*
	 * physmap is in bytes, so when converting to page boundaries,
	 * round up the start address and round down the end address.
	 */
	page_counter = 0;
	if (memtest != 0)
		printf("Testing system memory");
	for (i = 0; i <= physmap_idx; i += 2) {
		vm_paddr_t end;

		end = ptoa((vm_paddr_t)Maxmem);
		if (physmap[i + 1] < end)
			end = trunc_page(physmap[i + 1]);
		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
			int tmp, page_bad, full;
			int *ptr = (int *)CADDR1;

			full = FALSE;
			/*
			 * block out kernel memory as not available.
			 */
			if (pa >= (vm_paddr_t)kernphys && pa < first)
				goto do_dump_avail;

			/*
			 * block out dcons buffer
			 */
			if (dcons_addr > 0
			    && pa >= trunc_page(dcons_addr)
			    && pa < dcons_addr + dcons_size)
				goto do_dump_avail;

			page_bad = FALSE;
			if (memtest == 0)
				goto skip_memtest;

			/*
			 * Print a "." every GB to show we're making
			 * progress.
			 */
			page_counter++;
			if ((page_counter % PAGES_PER_GB) == 0)
				printf(".");

			/*
			 * map page into kernel: valid, read/write, non-cacheable
			 */
			*pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
			invltlb();

			tmp = *(int *)ptr;
			/*
			 * Test for alternating 1's and 0's
			 */
			*(volatile int *)ptr = 0xaaaaaaaa;
			if (*(volatile int *)ptr != 0xaaaaaaaa)
				page_bad = TRUE;
			/*
			 * Test for alternating 0's and 1's
			 */
			*(volatile int *)ptr = 0x55555555;
			if (*(volatile int *)ptr != 0x55555555)
				page_bad = TRUE;
			/*
			 * Test for all 1's
			 */
			*(volatile int *)ptr = 0xffffffff;
			if (*(volatile int *)ptr != 0xffffffff)
				page_bad = TRUE;
			/*
			 * Test for all 0's
			 */
			*(volatile int *)ptr = 0x0;
			if (*(volatile int *)ptr != 0x0)
				page_bad = TRUE;
			/*
			 * Restore original value.
			 */
			*(int *)ptr = tmp;

skip_memtest:
			/*
			 * Adjust array of valid/good pages.
			 */
			if (page_bad == TRUE)
				continue;
			/*
			 * If this good page is a continuation of the
			 * previous set of good pages, then just increase
			 * the end pointer.  Otherwise start a new chunk.
			 * Note that the recorded "end" points one page past
			 * the last good page, making the range >= start and
			 * < end.
			 * If we're also doing a speculative memory test and
			 * we're at or past the end, bump up Maxmem so that
			 * we keep going.  The first bad page will terminate
			 * the loop.
			 */
			if (phys_avail[pa_indx] == pa) {
				phys_avail[pa_indx] += PAGE_SIZE;
			} else {
				pa_indx++;
				if (pa_indx == PHYS_AVAIL_ARRAY_END) {
					printf(
		"Too many holes in the physical address space, giving up\n");
					pa_indx--;
					full = TRUE;
					goto do_dump_avail;
				}
				phys_avail[pa_indx++] = pa;	/* start */
				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
			}
			physmem++;
do_dump_avail:
			if (dump_avail[da_indx] == pa) {
				dump_avail[da_indx] += PAGE_SIZE;
			} else {
				da_indx++;
				if (da_indx == DUMP_AVAIL_ARRAY_END) {
					da_indx--;
					goto do_next;
				}
				dump_avail[da_indx++] = pa; /* start */
				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
			}
do_next:
			if (full)
				break;
		}
	}
	*pte = 0;
	invltlb();
	if (memtest != 0)
		printf("\n");

	/*
	 * XXX
	 * The last chunk must contain at least one page plus the message
	 * buffer to avoid complicating other code (message buffer address
	 * calculation, etc.).
	 */
	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
	    round_page(msgbufsize) >= phys_avail[pa_indx]) {
		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
		phys_avail[pa_indx--] = 0;
		phys_avail[pa_indx--] = 0;
	}

	Maxmem = atop(phys_avail[pa_indx]);

	/* Trim off space for the message buffer. */
	phys_avail[pa_indx] -= round_page(msgbufsize);

	/* Map the message buffer. */
	msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
}

static caddr_t
native_parse_preload_data(u_int64_t modulep)
{
	caddr_t kmdp;
	char *envp;
#ifdef DDB
	vm_offset_t ksym_start;
	vm_offset_t ksym_end;
#endif

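	/* The loader hands us a physical address; relocate it into the KVA. */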
	preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
	preload_bootstrap_relocate(KERNBASE);
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
	envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *);
	if (envp != NULL)
		envp += KERNBASE;
	init_static_kenv(envp, 0);
#ifdef DDB
	ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
	ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
	db_fetch_ksymtab(ksym_start, ksym_end);
#endif
	efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);

	return (kmdp);
}

static void
amd64_kdb_init(void)
{
	kdb_init();
#ifdef KDB
	if (boothowto & RB_KDB)
		kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
#endif
}

/* Set up the fast syscall stuff */
void
amd64_conf_fast_syscall(void)
{
	uint64_t msr;

	msr = rdmsr(MSR_EFER) | EFER_SCE;
	wrmsr(MSR_EFER, msr);
	wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) :
	    (u_int64_t)IDTVEC(fast_syscall));
	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
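	/*
	 * STAR[47:32] is the selector base that SYSCALL loads into %cs/%ss;
	 * STAR[63:48] is the base from which SYSRET derives the user
	 * selectors (%cs = base + 16 for 64-bit code, %ss = base + 8).
	 */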
	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
	    ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
	wrmsr(MSR_STAR, msr);
	wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D);
}

u_int64_t
hammer_time(u_int64_t modulep, u_int64_t physfree)
{
	caddr_t kmdp;
	int gsel_tss, x;
	struct pcpu *pc;
	struct nmi_pcpu *np;
	struct xstate_hdr *xhdr;
	u_int64_t rsp0;
	char *env;
	size_t kstack0_sz;
	int late_console;

	kmdp = init_ops.parse_preload_data(modulep);

	identify_cpu1();
	identify_hypervisor();
	/*
	 * hw.cpu_stdext_disable is ignored by this call; it will be
	 * re-evaluated by the call to finishidentcpu() below.
	 */
	identify_cpu2();

	link_elf_ireloc(kmdp);

	/*
	 * This may be done better later if it gets more high level
	 * components in it. If so just link td->td_proc here.
	 */
	proc_linkup0(&proc0, &thread0);

	/* Init basic tunables, hz etc */
	init_param1();

	thread0.td_kstack = physfree + KERNBASE;
	thread0.td_kstack_pages = kstack_pages;
	kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
	bzero((void *)thread0.td_kstack, kstack0_sz);
	physfree += kstack0_sz;

	/*
	 * make gdt memory segments
	 */
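	/*
	 * The TSS and LDT descriptors are 16-byte system descriptors that
	 * each occupy two GDT slots, so they are skipped here and installed
	 * separately below.
	 */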
	for (x = 0; x < NGDT; x++) {
		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
		    x != GUSERLDT_SEL && x != (GUSERLDT_SEL + 1))
			ssdtosd(&gdt_segs[x], &gdt[x]);
	}
	gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&common_tss[0];
	ssdtosyssd(&gdt_segs[GPROC0_SEL],
	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);

	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
	r_gdt.rd_base =  (long) gdt;
	lgdt(&r_gdt);
	pc = &__pcpu[0];

	wrmsr(MSR_FSBASE, 0);		/* User value */
	wrmsr(MSR_GSBASE, (u_int64_t)pc);
	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */

	pcpu_init(pc, 0, sizeof(struct pcpu));
	dpcpu_init((void *)(physfree + KERNBASE), 0);
	physfree += DPCPU_SIZE;
	PCPU_SET(prvspace, pc);
	PCPU_SET(curthread, &thread0);
	/* Non-late cninit() and printf() can be moved up to here. */
	PCPU_SET(tssp, &common_tss[0]);
	PCPU_SET(commontssp, &common_tss[0]);
	PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
	PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
	PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
	PCPU_SET(gs32p, &gdt[GUGS32_SEL]);

	/*
	 * Initialize mutexes.
	 *
	 * icu_lock: in order to allow an interrupt to occur in a critical
	 * 	     section, to set pcpu->ipending (etc...) properly, we
	 *	     must be able to get the icu lock, so it can't be
	 *	     under witness.
	 */
	mutex_init();
	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);

	/* exceptions */
	pti = pti_get_default();
	TUNABLE_INT_FETCH("vm.pmap.pti", &pti);

	for (x = 0; x < NIDT; x++)
		setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT,
		    SEL_KPL, 0);
	setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
	setidt(IDT_NMI, &IDTVEC(nmi),  SDT_SYSIGT, SEL_KPL, 2);
	setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
	    SEL_UPL, 0);
	setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT,
	    SEL_UPL, 0);
	setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
	setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm),
	    SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing),
	    SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3);
	setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
	    SEL_KPL, 0);
#ifdef KDTRACE_HOOKS
	setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) :
	    &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
#endif
#ifdef XENHVM
	setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) :
	    &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0);
#endif
	r_idt.rd_limit = sizeof(idt0) - 1;
	r_idt.rd_base = (long) idt;
	lidt(&r_idt);

	/*
	 * Initialize the clock before the console so that console
	 * initialization can use DELAY().
	 */
	clock_init();

	/*
	 * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4)
	 * transition).
	 * Once bootblocks have been updated, we can test directly for
	 * efi_systbl != NULL here...
	 */
	if (preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP)
	    != NULL)
		vty_set_preferred(VTY_VT);

	finishidentcpu();	/* Final stage of CPU initialization */
	initializecpu();	/* Initialize CPU registers */
	initializecpucache();

	/* doublefault stack space, runs on ist1 */
	common_tss[0].tss_ist1 = (long)&dblfault_stack[sizeof(dblfault_stack)];

	/*
	 * NMI stack, runs on ist2.  The pcpu pointer is stored just
	 * above the start of the ist2 stack.
	 */
	np = ((struct nmi_pcpu *) &nmi0_stack[sizeof(nmi0_stack)]) - 1;
	np->np_pcpu = (register_t) pc;
	common_tss[0].tss_ist2 = (long) np;

	/*
	 * MC# stack, runs on ist3.  The pcpu pointer is stored just
	 * above the start of the ist3 stack.
	 */
	np = ((struct nmi_pcpu *) &mce0_stack[sizeof(mce0_stack)]) - 1;
	np->np_pcpu = (register_t) pc;
	common_tss[0].tss_ist3 = (long) np;

	/*
	 * DB# stack, runs on ist4.
	 */
	np = ((struct nmi_pcpu *) &dbg0_stack[sizeof(dbg0_stack)]) - 1;
	np->np_pcpu = (register_t) pc;
	common_tss[0].tss_ist4 = (long) np;

	/* Set the IO permission bitmap (empty due to tss seg limit) */
	common_tss[0].tss_iobase = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE;

	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	ltr(gsel_tss);

	amd64_conf_fast_syscall();

	/*
	 * Temporarily forge a valid pointer to the PCB, for the exception
	 * handlers.  It is reinitialized properly below, after the FPU is
	 * set up.  Also set td_critnest to short-circuit the page
	 * fault handler.
	 */
	cpu_max_ext_state_size = sizeof(struct savefpu);
	thread0.td_pcb = get_pcb_td(&thread0);
	thread0.td_critnest = 1;

	/*
	 * The console and kdb should be initialized even earlier than here,
	 * but some console drivers don't work until after getmemsize().
	 * Default to late console initialization to support these drivers.
	 * This loses mainly printf()s in getmemsize() and early debugging.
	 */
	late_console = 1;
	TUNABLE_INT_FETCH("debug.late_console", &late_console);
	if (!late_console) {
		cninit();
		amd64_kdb_init();
	}

	getmemsize(kmdp, physfree);
	init_param2(physmem);

	/* now running on new page tables, configured, and u/iom is accessible */
1786
1787	if (late_console)
1788		cninit();
1789
1790#ifdef DEV_ISA
1791#ifdef DEV_ATPIC
1792	elcr_probe();
1793	atpic_startup();
1794#else
1795	/* Reset and mask the atpics and leave them shut down. */
1796	atpic_reset();
1797
1798	/*
1799	 * Point the ICU spurious interrupt vectors at the APIC spurious
1800	 * interrupt handler.
1801	 */
1802	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
1803	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
1804#endif
1805#else
1806#error "have you forgotten the isa device?";
1807#endif
1808
1809	if (late_console)
1810		amd64_kdb_init();
1811
1812	msgbufinit(msgbufp, msgbufsize);
1813	fpuinit();
1814
1815	/*
1816	 * Set up thread0 pcb after fpuinit calculated pcb + fpu save
1817	 * area size.  Zero out the extended state header in fpu save
1818	 * area.
1819	 */
1820	thread0.td_pcb = get_pcb_td(&thread0);
1821	thread0.td_pcb->pcb_save = get_pcb_user_save_td(&thread0);
1822	bzero(get_pcb_user_save_td(&thread0), cpu_max_ext_state_size);
1823	if (use_xsave) {
1824		xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) +
1825		    1);
1826		xhdr->xstate_bv = xsave_mask;
1827	}
1828	/* make an initial tss so cpu can get interrupt stack on syscall! */
1829	rsp0 = (vm_offset_t)thread0.td_pcb;
1830	/* Ensure the stack is aligned to 16 bytes */
1831	rsp0 &= ~0xFul;
1832	common_tss[0].tss_rsp0 = rsp0;
1833	PCPU_SET(rsp0, rsp0);
1834	PCPU_SET(pti_rsp0, ((vm_offset_t)PCPU_PTR(pti_stack) +
1835	    PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful);
1836	PCPU_SET(curpcb, thread0.td_pcb);
1837
1838	/* transfer to user mode */
1839
1840	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
1841	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
1842	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
1843	_ufssel = GSEL(GUFS32_SEL, SEL_UPL);
1844	_ugssel = GSEL(GUGS32_SEL, SEL_UPL);
1845
1846	load_ds(_udatasel);
1847	load_es(_udatasel);
1848	load_fs(_ufssel);
1849
1850	/* setup proc 0's pcb */
1851	thread0.td_pcb->pcb_flags = 0;
1852	thread0.td_frame = &proc0_tf;
1853
1854        env = kern_getenv("kernelname");
1855	if (env != NULL)
1856		strlcpy(kernelname, env, sizeof(kernelname));
1857
1858	cpu_probe_amdc1e();
1859
1860#ifdef FDT
1861	x86_init_fdt();
1862#endif
1863	thread0.td_critnest = 0;
1864
1865	TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
1866	TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable);
1867
1868	/* Location of kernel stack for locore */
1869	return ((u_int64_t)thread0.td_pcb);
1870}
1871
1872void
1873cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
1874{
1875
1876	pcpu->pc_acpi_id = 0xffffffff;
1877}
1878
1879static int
1880smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
1881{
1882	struct bios_smap *smapbase;
1883	struct bios_smap_xattr smap;
1884	caddr_t kmdp;
1885	uint32_t *smapattr;
1886	int count, error, i;
1887
1888	/* Retrieve the system memory map from the loader. */
1889	kmdp = preload_search_by_type("elf kernel");
1890	if (kmdp == NULL)
1891		kmdp = preload_search_by_type("elf64 kernel");
1892	smapbase = (struct bios_smap *)preload_search_info(kmdp,
1893	    MODINFO_METADATA | MODINFOMD_SMAP);
1894	if (smapbase == NULL)
1895		return (0);
1896	smapattr = (uint32_t *)preload_search_info(kmdp,
1897	    MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
1898	count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase);
1899	error = 0;
1900	for (i = 0; i < count; i++) {
1901		smap.base = smapbase[i].base;
1902		smap.length = smapbase[i].length;
1903		smap.type = smapbase[i].type;
1904		if (smapattr != NULL)
1905			smap.xattr = smapattr[i];
1906		else
1907			smap.xattr = 0;
1908		error = SYSCTL_OUT(req, &smap, sizeof(smap));
1909	}
1910	return (error);
1911}
1912SYSCTL_PROC(_machdep, OID_AUTO, smap, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
1913    smap_sysctl_handler, "S,bios_smap_xattr", "Raw BIOS SMAP data");
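
/*
 * Illustrative sketch, not part of the kernel: a userland program might
 * read the machdep.smap OID exported above with sysctlbyname(3) and walk
 * the returned array of struct bios_smap_xattr.  The header providing the
 * structure and the error handling below are assumptions of the sketch.
 */
#if 0
#include <sys/types.h>
#include <sys/sysctl.h>
#include <machine/pc/bios.h>	/* struct bios_smap_xattr (assumed) */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	struct bios_smap_xattr *smap;
	size_t i, len;

	/* The first call sizes the buffer, the second fills it. */
	if (sysctlbyname("machdep.smap", NULL, &len, NULL, 0) != 0)
		return (1);
	smap = malloc(len);
	if (smap == NULL ||
	    sysctlbyname("machdep.smap", smap, &len, NULL, 0) != 0)
		return (1);
	for (i = 0; i < len / sizeof(*smap); i++)
		printf("base %#jx len %#jx type %u\n",
		    (uintmax_t)smap[i].base, (uintmax_t)smap[i].length,
		    smap[i].type);
	free(smap);
	return (0);
}
#endif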
1914
1915static int
1916efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
1917{
1918	struct efi_map_header *efihdr;
1919	caddr_t kmdp;
1920	uint32_t efisize;
1921
1922	kmdp = preload_search_by_type("elf kernel");
1923	if (kmdp == NULL)
1924		kmdp = preload_search_by_type("elf64 kernel");
1925	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
1926	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
1927	if (efihdr == NULL)
1928		return (0);
1929	efisize = *((uint32_t *)efihdr - 1);
1930	return (SYSCTL_OUT(req, efihdr, efisize));
1931}
1932SYSCTL_PROC(_machdep, OID_AUTO, efi_map, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
1933    efi_map_sysctl_handler, "S,efi_map_header", "Raw EFI Memory Map");
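
/*
 * Similarly, the raw EFI map exported above can be dumped from userland,
 * e.g. (hypothetical usage) `sysctl -b machdep.efi_map | hexdump -C`.
 */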
1934
1935void
1936spinlock_enter(void)
1937{
1938	struct thread *td;
1939	register_t flags;
1940
1941	td = curthread;
1942	if (td->td_md.md_spinlock_count == 0) {
1943		flags = intr_disable();
1944		td->td_md.md_spinlock_count = 1;
1945		td->td_md.md_saved_flags = flags;
1946	} else
1947		td->td_md.md_spinlock_count++;
1948	critical_enter();
1949}
1950
1951void
1952spinlock_exit(void)
1953{
1954	struct thread *td;
1955	register_t flags;
1956
1957	td = curthread;
1958	critical_exit();
1959	flags = td->td_md.md_saved_flags;
1960	td->td_md.md_spinlock_count--;
1961	if (td->td_md.md_spinlock_count == 0)
1962		intr_restore(flags);
1963}
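
/*
 * Illustrative sketch of the nesting behaviour above (an assumption about
 * typical use, not code from this file): interrupts are disabled by the
 * outermost spinlock_enter() and are only restored by the matching
 * outermost spinlock_exit().
 */
#if 0
static void
nested_spinlock_sketch(void)
{

	spinlock_enter();	/* count 0 -> 1, interrupts disabled */
	spinlock_enter();	/* count 1 -> 2, nested, still disabled */
	spinlock_exit();	/* count 2 -> 1, interrupts stay off */
	spinlock_exit();	/* count 1 -> 0, saved flags restored */
}
#endif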
1964
1965/*
1966 * Construct a PCB from a trapframe. This is called from kdb_trap() where
1967 * we want to start a backtrace from the function that caused us to enter
1968 * the debugger. We have the context in the trapframe, but base the trace
1969 * on the PCB. The PCB doesn't have to be perfect, as long as it contains
1970 * enough for a backtrace.
1971 */
1972void
1973makectx(struct trapframe *tf, struct pcb *pcb)
1974{
1975
1976	pcb->pcb_r12 = tf->tf_r12;
1977	pcb->pcb_r13 = tf->tf_r13;
1978	pcb->pcb_r14 = tf->tf_r14;
1979	pcb->pcb_r15 = tf->tf_r15;
1980	pcb->pcb_rbp = tf->tf_rbp;
1981	pcb->pcb_rbx = tf->tf_rbx;
1982	pcb->pcb_rip = tf->tf_rip;
1983	pcb->pcb_rsp = tf->tf_rsp;
1984}
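
/*
 * Illustrative sketch (an assumption, not code from this file): a
 * debugger-style caller seeds a scratch PCB from a trapframe and then
 * backtraces from the recorded registers.
 */
#if 0
static void
backtrace_from_tf_sketch(struct trapframe *tf)
{
	struct pcb pcb;

	makectx(tf, &pcb);
	/* ... walk the frames starting at pcb.pcb_rip / pcb.pcb_rbp ... */
}
#endif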
1985
1986int
1987ptrace_set_pc(struct thread *td, unsigned long addr)
1988{
1989
1990	td->td_frame->tf_rip = addr;
1991	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
1992	return (0);
1993}
1994
1995int
1996ptrace_single_step(struct thread *td)
1997{
1998
1999	PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
2000	if ((td->td_frame->tf_rflags & PSL_T) == 0) {
2001		td->td_frame->tf_rflags |= PSL_T;
2002		td->td_dbgflags |= TDB_STEP;
2003	}
2004	return (0);
2005}
2006
2007int
2008ptrace_clear_single_step(struct thread *td)
2009{
2010	PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
2011	td->td_frame->tf_rflags &= ~PSL_T;
2012	td->td_dbgflags &= ~TDB_STEP;
2013	return (0);
2014}
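
/*
 * Illustrative sketch, not part of the kernel: the PSL_T machinery above
 * is driven from userland through ptrace(2).  A hypothetical tracer could
 * single-step a stopped child like this:
 */
#if 0
#include <sys/types.h>
#include <sys/ptrace.h>
#include <sys/wait.h>

static void
step_once(pid_t pid)
{
	int status;

	/* Resume the child for exactly one instruction... */
	if (ptrace(PT_STEP, pid, (caddr_t)1, 0) == -1)
		return;
	/* ...and collect the resulting SIGTRAP stop. */
	waitpid(pid, &status, 0);
}
#endif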
2015
2016int
2017fill_regs(struct thread *td, struct reg *regs)
2018{
2019	struct trapframe *tp;
2020
2021	tp = td->td_frame;
2022	return (fill_frame_regs(tp, regs));
2023}
2024
2025int
2026fill_frame_regs(struct trapframe *tp, struct reg *regs)
2027{
2028
2029	regs->r_r15 = tp->tf_r15;
2030	regs->r_r14 = tp->tf_r14;
2031	regs->r_r13 = tp->tf_r13;
2032	regs->r_r12 = tp->tf_r12;
2033	regs->r_r11 = tp->tf_r11;
2034	regs->r_r10 = tp->tf_r10;
2035	regs->r_r9  = tp->tf_r9;
2036	regs->r_r8  = tp->tf_r8;
2037	regs->r_rdi = tp->tf_rdi;
2038	regs->r_rsi = tp->tf_rsi;
2039	regs->r_rbp = tp->tf_rbp;
2040	regs->r_rbx = tp->tf_rbx;
2041	regs->r_rdx = tp->tf_rdx;
2042	regs->r_rcx = tp->tf_rcx;
2043	regs->r_rax = tp->tf_rax;
2044	regs->r_rip = tp->tf_rip;
2045	regs->r_cs = tp->tf_cs;
2046	regs->r_rflags = tp->tf_rflags;
2047	regs->r_rsp = tp->tf_rsp;
2048	regs->r_ss = tp->tf_ss;
2049	if (tp->tf_flags & TF_HASSEGS) {
2050		regs->r_ds = tp->tf_ds;
2051		regs->r_es = tp->tf_es;
2052		regs->r_fs = tp->tf_fs;
2053		regs->r_gs = tp->tf_gs;
2054	} else {
2055		regs->r_ds = 0;
2056		regs->r_es = 0;
2057		regs->r_fs = 0;
2058		regs->r_gs = 0;
2059	}
2060	regs->r_err = 0;
2061	regs->r_trapno = 0;
2062	return (0);
2063}
2064
2065int
2066set_regs(struct thread *td, struct reg *regs)
2067{
2068	struct trapframe *tp;
2069	register_t rflags;
2070
2071	tp = td->td_frame;
2072	rflags = regs->r_rflags & 0xffffffff;
2073	if (!EFL_SECURE(rflags, tp->tf_rflags) || !CS_SECURE(regs->r_cs))
2074		return (EINVAL);
2075	tp->tf_r15 = regs->r_r15;
2076	tp->tf_r14 = regs->r_r14;
2077	tp->tf_r13 = regs->r_r13;
2078	tp->tf_r12 = regs->r_r12;
2079	tp->tf_r11 = regs->r_r11;
2080	tp->tf_r10 = regs->r_r10;
2081	tp->tf_r9  = regs->r_r9;
2082	tp->tf_r8  = regs->r_r8;
2083	tp->tf_rdi = regs->r_rdi;
2084	tp->tf_rsi = regs->r_rsi;
2085	tp->tf_rbp = regs->r_rbp;
2086	tp->tf_rbx = regs->r_rbx;
2087	tp->tf_rdx = regs->r_rdx;
2088	tp->tf_rcx = regs->r_rcx;
2089	tp->tf_rax = regs->r_rax;
2090	tp->tf_rip = regs->r_rip;
2091	tp->tf_cs = regs->r_cs;
2092	tp->tf_rflags = rflags;
2093	tp->tf_rsp = regs->r_rsp;
2094	tp->tf_ss = regs->r_ss;
2095	if (0) {	/* XXXKIB */
2096		tp->tf_ds = regs->r_ds;
2097		tp->tf_es = regs->r_es;
2098		tp->tf_fs = regs->r_fs;
2099		tp->tf_gs = regs->r_gs;
2100		tp->tf_flags = TF_HASSEGS;
2101	}
2102	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
2103	return (0);
2104}
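
/*
 * Illustrative sketch, not part of the kernel: fill_regs() and set_regs()
 * back the PT_GETREGS/PT_SETREGS ptrace(2) requests.  A hypothetical
 * tracer might redirect a stopped child this way; note that set_regs()
 * rejects unsafe %cs and rflags values with EINVAL.
 */
#if 0
#include <sys/types.h>
#include <sys/ptrace.h>
#include <machine/reg.h>

static int
set_child_rip(pid_t pid, unsigned long addr)
{
	struct reg r;

	if (ptrace(PT_GETREGS, pid, (caddr_t)&r, 0) == -1)
		return (-1);
	r.r_rip = addr;		/* resume the child at addr */
	return (ptrace(PT_SETREGS, pid, (caddr_t)&r, 0));
}
#endif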
2105
2106/* XXX check all this stuff! */
2107/* externalize from sv_xmm */
2108static void
2109fill_fpregs_xmm(struct savefpu *sv_xmm, struct fpreg *fpregs)
2110{
2111	struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
2112	struct envxmm *penv_xmm = &sv_xmm->sv_env;
2113	int i;
2114
2115	/* pcb -> fpregs */
2116	bzero(fpregs, sizeof(*fpregs));
2117
2118	/* FPU control/status */
2119	penv_fpreg->en_cw = penv_xmm->en_cw;
2120	penv_fpreg->en_sw = penv_xmm->en_sw;
2121	penv_fpreg->en_tw = penv_xmm->en_tw;
2122	penv_fpreg->en_opcode = penv_xmm->en_opcode;
2123	penv_fpreg->en_rip = penv_xmm->en_rip;
2124	penv_fpreg->en_rdp = penv_xmm->en_rdp;
2125	penv_fpreg->en_mxcsr = penv_xmm->en_mxcsr;
2126	penv_fpreg->en_mxcsr_mask = penv_xmm->en_mxcsr_mask;
2127
2128	/* FPU registers */
2129	for (i = 0; i < 8; ++i)
2130		bcopy(sv_xmm->sv_fp[i].fp_acc.fp_bytes, fpregs->fpr_acc[i], 10);
2131
2132	/* SSE registers */
2133	for (i = 0; i < 16; ++i)
2134		bcopy(sv_xmm->sv_xmm[i].xmm_bytes, fpregs->fpr_xacc[i], 16);
2135}
2136
2137/* internalize from fpregs into sv_xmm */
2138static void
2139set_fpregs_xmm(struct fpreg *fpregs, struct savefpu *sv_xmm)
2140{
2141	struct envxmm *penv_xmm = &sv_xmm->sv_env;
2142	struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
2143	int i;
2144
2145	/* fpregs -> pcb */
2146	/* FPU control/status */
2147	penv_xmm->en_cw = penv_fpreg->en_cw;
2148	penv_xmm->en_sw = penv_fpreg->en_sw;
2149	penv_xmm->en_tw = penv_fpreg->en_tw;
2150	penv_xmm->en_opcode = penv_fpreg->en_opcode;
2151	penv_xmm->en_rip = penv_fpreg->en_rip;
2152	penv_xmm->en_rdp = penv_fpreg->en_rdp;
2153	penv_xmm->en_mxcsr = penv_fpreg->en_mxcsr;
2154	penv_xmm->en_mxcsr_mask = penv_fpreg->en_mxcsr_mask & cpu_mxcsr_mask;
2155
2156	/* FPU registers */
2157	for (i = 0; i < 8; ++i)
2158		bcopy(fpregs->fpr_acc[i], sv_xmm->sv_fp[i].fp_acc.fp_bytes, 10);
2159
2160	/* SSE registers */
2161	for (i = 0; i < 16; ++i)
2162		bcopy(fpregs->fpr_xacc[i], sv_xmm->sv_xmm[i].xmm_bytes, 16);
2163}
2164
2165/* externalize from td->pcb */
2166int
2167fill_fpregs(struct thread *td, struct fpreg *fpregs)
2168{
2169
2170	KASSERT(td == curthread || TD_IS_SUSPENDED(td) ||
2171	    P_SHOULDSTOP(td->td_proc),
2172	    ("not suspended thread %p", td));
2173	fpugetregs(td);
2174	fill_fpregs_xmm(get_pcb_user_save_td(td), fpregs);
2175	return (0);
2176}
2177
2178/* internalize to td->pcb */
2179int
2180set_fpregs(struct thread *td, struct fpreg *fpregs)
2181{
2182
2183	critical_enter();
2184	set_fpregs_xmm(fpregs, get_pcb_user_save_td(td));
2185	fpuuserinited(td);
2186	critical_exit();
2187	return (0);
2188}
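
/*
 * Illustrative sketch, not part of the kernel: the externalized XMM
 * layout built above is what the PT_GETFPREGS ptrace(2) request hands
 * to a tracer.  Hypothetical usage:
 */
#if 0
#include <sys/types.h>
#include <sys/ptrace.h>
#include <machine/reg.h>

static int
get_child_fpregs(pid_t pid, struct fpreg *fp)
{

	/* Fetch the stopped child's FPU/SSE state in struct fpreg format. */
	return (ptrace(PT_GETFPREGS, pid, (caddr_t)fp, 0));
}
#endif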
2189
2190/*
2191 * Get machine context.
2192 */
2193int
2194get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
2195{
2196	struct pcb *pcb;
2197	struct trapframe *tp;
2198
2199	pcb = td->td_pcb;
2200	tp = td->td_frame;
2201	PROC_LOCK(curthread->td_proc);
2202	mcp->mc_onstack = sigonstack(tp->tf_rsp);
2203	PROC_UNLOCK(curthread->td_proc);
2204	mcp->mc_r15 = tp->tf_r15;
2205	mcp->mc_r14 = tp->tf_r14;
2206	mcp->mc_r13 = tp->tf_r13;
2207	mcp->mc_r12 = tp->tf_r12;
2208	mcp->mc_r11 = tp->tf_r11;
2209	mcp->mc_r10 = tp->tf_r10;
2210	mcp->mc_r9  = tp->tf_r9;
2211	mcp->mc_r8  = tp->tf_r8;
2212	mcp->mc_rdi = tp->tf_rdi;
2213	mcp->mc_rsi = tp->tf_rsi;
2214	mcp->mc_rbp = tp->tf_rbp;
2215	mcp->mc_rbx = tp->tf_rbx;
2216	mcp->mc_rcx = tp->tf_rcx;
2217	mcp->mc_rflags = tp->tf_rflags;
2218	if (flags & GET_MC_CLEAR_RET) {
2219		mcp->mc_rax = 0;
2220		mcp->mc_rdx = 0;
2221		mcp->mc_rflags &= ~PSL_C;
2222	} else {
2223		mcp->mc_rax = tp->tf_rax;
2224		mcp->mc_rdx = tp->tf_rdx;
2225	}
2226	mcp->mc_rip = tp->tf_rip;
2227	mcp->mc_cs = tp->tf_cs;
2228	mcp->mc_rsp = tp->tf_rsp;
2229	mcp->mc_ss = tp->tf_ss;
2230	mcp->mc_ds = tp->tf_ds;
2231	mcp->mc_es = tp->tf_es;
2232	mcp->mc_fs = tp->tf_fs;
2233	mcp->mc_gs = tp->tf_gs;
2234	mcp->mc_flags = tp->tf_flags;
2235	mcp->mc_len = sizeof(*mcp);
2236	get_fpcontext(td, mcp, NULL, 0);
2237	update_pcb_bases(pcb);
2238	mcp->mc_fsbase = pcb->pcb_fsbase;
2239	mcp->mc_gsbase = pcb->pcb_gsbase;
2240	mcp->mc_xfpustate = 0;
2241	mcp->mc_xfpustate_len = 0;
2242	bzero(mcp->mc_spare, sizeof(mcp->mc_spare));
2243	return (0);
2244}
2245
2246/*
2247 * Set machine context.
2248 *
2249 * Note that we set only the user-modifiable flags, and we won't
2250 * touch the %cs selector.
2251 */
2252int
2253set_mcontext(struct thread *td, mcontext_t *mcp)
2254{
2255	struct pcb *pcb;
2256	struct trapframe *tp;
2257	char *xfpustate;
2258	long rflags;
2259	int ret;
2260
2261	pcb = td->td_pcb;
2262	tp = td->td_frame;
2263	if (mcp->mc_len != sizeof(*mcp) ||
2264	    (mcp->mc_flags & ~_MC_FLAG_MASK) != 0)
2265		return (EINVAL);
2266	rflags = (mcp->mc_rflags & PSL_USERCHANGE) |
2267	    (tp->tf_rflags & ~PSL_USERCHANGE);
2268	if (mcp->mc_flags & _MC_HASFPXSTATE) {
2269		if (mcp->mc_xfpustate_len > cpu_max_ext_state_size -
2270		    sizeof(struct savefpu))
2271			return (EINVAL);
2272		xfpustate = __builtin_alloca(mcp->mc_xfpustate_len);
2273		ret = copyin((void *)mcp->mc_xfpustate, xfpustate,
2274		    mcp->mc_xfpustate_len);
2275		if (ret != 0)
2276			return (ret);
2277	} else
2278		xfpustate = NULL;
2279	ret = set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len);
2280	if (ret != 0)
2281		return (ret);
2282	tp->tf_r15 = mcp->mc_r15;
2283	tp->tf_r14 = mcp->mc_r14;
2284	tp->tf_r13 = mcp->mc_r13;
2285	tp->tf_r12 = mcp->mc_r12;
2286	tp->tf_r11 = mcp->mc_r11;
2287	tp->tf_r10 = mcp->mc_r10;
2288	tp->tf_r9  = mcp->mc_r9;
2289	tp->tf_r8  = mcp->mc_r8;
2290	tp->tf_rdi = mcp->mc_rdi;
2291	tp->tf_rsi = mcp->mc_rsi;
2292	tp->tf_rbp = mcp->mc_rbp;
2293	tp->tf_rbx = mcp->mc_rbx;
2294	tp->tf_rdx = mcp->mc_rdx;
2295	tp->tf_rcx = mcp->mc_rcx;
2296	tp->tf_rax = mcp->mc_rax;
2297	tp->tf_rip = mcp->mc_rip;
2298	tp->tf_rflags = rflags;
2299	tp->tf_rsp = mcp->mc_rsp;
2300	tp->tf_ss = mcp->mc_ss;
2301	tp->tf_flags = mcp->mc_flags;
2302	if (tp->tf_flags & TF_HASSEGS) {
2303		tp->tf_ds = mcp->mc_ds;
2304		tp->tf_es = mcp->mc_es;
2305		tp->tf_fs = mcp->mc_fs;
2306		tp->tf_gs = mcp->mc_gs;
2307	}
2308	set_pcb_flags(pcb, PCB_FULL_IRET);
2309	if (mcp->mc_flags & _MC_HASBASES) {
2310		pcb->pcb_fsbase = mcp->mc_fsbase;
2311		pcb->pcb_gsbase = mcp->mc_gsbase;
2312	}
2313	return (0);
2314}
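
/*
 * Illustrative sketch, not part of the kernel: get_mcontext() and
 * set_mcontext() service the userland getcontext(2)/setcontext(2) pair.
 * A minimal round trip, assuming this is built as a separate userland
 * program:
 */
#if 0
#include <stdio.h>
#include <ucontext.h>

int
main(void)
{
	ucontext_t uc;
	volatile int resumed = 0;

	getcontext(&uc);	/* context filled in via get_mcontext() */
	if (!resumed) {
		resumed = 1;
		setcontext(&uc);	/* validated by set_mcontext() */
	}
	printf("context resumed\n");
	return (0);
}
#endif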
2315
2316static void
2317get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave,
2318    size_t xfpusave_len)
2319{
2320	size_t max_len, len;
2321
2322	mcp->mc_ownedfp = fpugetregs(td);
2323	bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate[0],
2324	    sizeof(mcp->mc_fpstate));
2325	mcp->mc_fpformat = fpuformat();
2326	if (!use_xsave || xfpusave_len == 0)
2327		return;
2328	max_len = cpu_max_ext_state_size - sizeof(struct savefpu);
2329	len = xfpusave_len;
2330	if (len > max_len) {
2331		len = max_len;
2332		bzero(xfpusave + max_len, xfpusave_len - max_len);
2333	}
2334	mcp->mc_flags |= _MC_HASFPXSTATE;
2335	mcp->mc_xfpustate_len = len;
2336	bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len);
2337}
2338
2339static int
2340set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate,
2341    size_t xfpustate_len)
2342{
2343	int error;
2344
2345	if (mcp->mc_fpformat == _MC_FPFMT_NODEV)
2346		return (0);
2347	else if (mcp->mc_fpformat != _MC_FPFMT_XMM)
2348		return (EINVAL);
2349	else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) {
2350		/* We don't care what state is left in the FPU or PCB. */
2351		fpstate_drop(td);
2352		error = 0;
2353	} else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU ||
2354	    mcp->mc_ownedfp == _MC_FPOWNED_PCB) {
2355		error = fpusetregs(td, (struct savefpu *)&mcp->mc_fpstate,
2356		    xfpustate, xfpustate_len);
2357	} else
2358		return (EINVAL);
2359	return (error);
2360}
2361
2362void
2363fpstate_drop(struct thread *td)
2364{
2365
2366	KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu"));
2367	critical_enter();
2368	if (PCPU_GET(fpcurthread) == td)
2369		fpudrop();
2370	/*
2371	 * XXX force a full drop of the fpu.  The above only drops it if we
2372	 * owned it.
2373	 *
2374	 * XXX I don't much like fpugetuserregs()'s semantics of doing a full
2375	 * drop.  Dropping only to the pcb matches fnsave's behaviour.
2376	 * We only need to drop to !PCB_INITDONE in sendsig().  But
2377	 * sendsig() is the only caller of fpugetuserregs()... perhaps we just
2378	 * have too many layers.
2379	 */
2380	clear_pcb_flags(curthread->td_pcb,
2381	    PCB_FPUINITDONE | PCB_USERFPUINITDONE);
2382	critical_exit();
2383}
2384
2385int
2386fill_dbregs(struct thread *td, struct dbreg *dbregs)
2387{
2388	struct pcb *pcb;
2389
2390	if (td == NULL) {
2391		dbregs->dr[0] = rdr0();
2392		dbregs->dr[1] = rdr1();
2393		dbregs->dr[2] = rdr2();
2394		dbregs->dr[3] = rdr3();
2395		dbregs->dr[6] = rdr6();
2396		dbregs->dr[7] = rdr7();
2397	} else {
2398		pcb = td->td_pcb;
2399		dbregs->dr[0] = pcb->pcb_dr0;
2400		dbregs->dr[1] = pcb->pcb_dr1;
2401		dbregs->dr[2] = pcb->pcb_dr2;
2402		dbregs->dr[3] = pcb->pcb_dr3;
2403		dbregs->dr[6] = pcb->pcb_dr6;
2404		dbregs->dr[7] = pcb->pcb_dr7;
2405	}
2406	dbregs->dr[4] = 0;
2407	dbregs->dr[5] = 0;
2408	dbregs->dr[8] = 0;
2409	dbregs->dr[9] = 0;
2410	dbregs->dr[10] = 0;
2411	dbregs->dr[11] = 0;
2412	dbregs->dr[12] = 0;
2413	dbregs->dr[13] = 0;
2414	dbregs->dr[14] = 0;
2415	dbregs->dr[15] = 0;
2416	return (0);
2417}
2418
2419int
2420set_dbregs(struct thread *td, struct dbreg *dbregs)
2421{
2422	struct pcb *pcb;
2423	int i;
2424
2425	if (td == NULL) {
2426		load_dr0(dbregs->dr[0]);
2427		load_dr1(dbregs->dr[1]);
2428		load_dr2(dbregs->dr[2]);
2429		load_dr3(dbregs->dr[3]);
2430		load_dr6(dbregs->dr[6]);
2431		load_dr7(dbregs->dr[7]);
2432	} else {
2433		/*
2434		 * Don't let an illegal value for dr7 get set.  Specifically,
2435		 * check for undefined settings.  Setting these bit patterns
2436		 * results in undefined behaviour and can lead to an unexpected
2437		 * TRCTRAP or a general protection fault right here.  The
2438		 * upper bits of dr6 and dr7 must not be set.
2439		 */
2440		for (i = 0; i < 4; i++) {
2441			if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02)
2442				return (EINVAL);
2443			if (td->td_frame->tf_cs == _ucode32sel &&
2444			    DBREG_DR7_LEN(dbregs->dr[7], i) == DBREG_DR7_LEN_8)
2445				return (EINVAL);
2446		}
2447		if ((dbregs->dr[6] & 0xffffffff00000000ul) != 0 ||
2448		    (dbregs->dr[7] & 0xffffffff00000000ul) != 0)
2449			return (EINVAL);
2450
2451		pcb = td->td_pcb;
2452
2453		/*
2454		 * Don't let a process set a breakpoint that is not within the
2455		 * process's address space.  If a process could do this, it
2456		 * could halt the system by setting a breakpoint in the kernel
2457		 * (if ddb was enabled).  Thus, we need to check to make sure
2458		 * that no breakpoints are being enabled for addresses outside
2459		 * the process's address space.
2460		 *
2461		 * XXX - what about when the watched area of the user's
2462		 * address space is written into from within the kernel
2463		 * ... wouldn't that still cause a breakpoint to be generated
2464		 * from within kernel mode?
2465		 */
2466
2467		if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) {
2468			/* dr0 is enabled */
2469			if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS)
2470				return (EINVAL);
2471		}
2472		if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) {
2473			/* dr1 is enabled */
2474			if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS)
2475				return (EINVAL);
2476		}
2477		if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) {
2478			/* dr2 is enabled */
2479			if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS)
2480				return (EINVAL);
2481		}
2482		if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) {
2483			/* dr3 is enabled */
2484			if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS)
2485				return (EINVAL);
2486		}
2487
2488		pcb->pcb_dr0 = dbregs->dr[0];
2489		pcb->pcb_dr1 = dbregs->dr[1];
2490		pcb->pcb_dr2 = dbregs->dr[2];
2491		pcb->pcb_dr3 = dbregs->dr[3];
2492		pcb->pcb_dr6 = dbregs->dr[6];
2493		pcb->pcb_dr7 = dbregs->dr[7];
2494
2495		set_pcb_flags(pcb, PCB_DBREGS);
2496	}
2497
2498	return (0);
2499}
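
/*
 * Illustrative sketch, not part of the kernel: set_dbregs() backs the
 * PT_SETDBREGS ptrace(2) request.  A hypothetical tracer could arm dr0
 * as a 4-byte write watchpoint in a stopped child, assuming the
 * DBREG_DR7_* helper macros from <machine/reg.h>:
 */
#if 0
#include <sys/types.h>
#include <sys/ptrace.h>
#include <machine/reg.h>

static int
set_write_watchpoint(pid_t pid, unsigned long addr)
{
	struct dbreg db;

	if (ptrace(PT_GETDBREGS, pid, (caddr_t)&db, 0) == -1)
		return (-1);
	db.dr[0] = addr;
	/* Clear slot 0's old settings, then enable 4-byte write traps. */
	db.dr[7] &= ~DBREG_DR7_MASK(0);
	db.dr[7] |= DBREG_DR7_SET(0, DBREG_DR7_LEN_4, DBREG_DR7_WRONLY,
	    DBREG_DR7_GLOBAL_ENABLE);
	return (ptrace(PT_SETDBREGS, pid, (caddr_t)&db, 0));
}
#endif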
2500
2501void
2502reset_dbregs(void)
2503{
2504
2505	load_dr7(0);	/* Turn off the control bits first */
2506	load_dr0(0);
2507	load_dr1(0);
2508	load_dr2(0);
2509	load_dr3(0);
2510	load_dr6(0);
2511}
2512
2513/*
2514 * Return > 0 if a hardware breakpoint has been hit, and the
2515 * breakpoint was in user space.  Return 0 otherwise.
2516 */
2517int
2518user_dbreg_trap(register_t dr6)
2519{
2520	u_int64_t dr7;
2521	u_int64_t bp;		/* breakpoint bits extracted from dr6 */
2522	int nbp;		/* number of breakpoints that triggered */
2523	caddr_t addr[4];	/* breakpoint addresses */
2524	int i;
2525
2526	bp = dr6 & DBREG_DR6_BMASK;
2527	if (bp == 0) {
2528		/*
2529		 * None of the breakpoint bits are set, meaning this
2530		 * trap was not caused by any of the debug registers.
2531		 */
2532		return (0);
2533	}
2534
2535	dr7 = rdr7();
2536	if ((dr7 & 0x000000ff) == 0) {
2537		/*
2538		 * All of the local and global enable bits in the dr7
2539		 * register are zero, thus the trap couldn't have been
2540		 * caused by the hardware debug registers.
2541		 */
2542		return (0);
2543	}
2544
2545	nbp = 0;
2546
2547	/*
2548	 * At least one of the breakpoints was hit; check to see
2549	 * which ones and whether any of them are user space addresses.
2550	 */
2551
2552	if (bp & 0x01) {
2553		addr[nbp++] = (caddr_t)rdr0();
2554	}
2555	if (bp & 0x02) {
2556		addr[nbp++] = (caddr_t)rdr1();
2557	}
2558	if (bp & 0x04) {
2559		addr[nbp++] = (caddr_t)rdr2();
2560	}
2561	if (bp & 0x08) {
2562		addr[nbp++] = (caddr_t)rdr3();
2563	}
2564
2565	for (i = 0; i < nbp; i++) {
2566		if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) {
2567			/*
2568			 * addr[i] is in user space.
2569			 */
2570			return (nbp);
2571		}
2572	}
2573
2574	/*
2575	 * None of the breakpoints are in user space.
2576	 */
2577	return (0);
2578}
2579
2580/*
2581 * pcb_flags is only modified by the current thread, or by other threads
2582 * when the current thread is stopped.  However, the current thread may
2583 * change it from the interrupt context in cpu_switch() or in the trap
2584 * handler.  When we read-modify-write pcb_flags from C sources, the
2585 * compiler may generate code that is not atomic with respect to the
2586 * interrupt handler.  If a trap or interrupt happens and any flag is
2587 * modified from the handler, it can be overwritten by the stale cached
2588 * value later.  Therefore, we set and clear the flags with
2589 * single-instruction functions; these do not race with the handlers
2590 * because traps and interrupts are taken only on instruction boundaries.
2591 */
2592void
2593set_pcb_flags_raw(struct pcb *pcb, const u_int flags)
2594{
2595
2596	__asm __volatile("orl %1,%0"
2597	    : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags)
2598	    : "cc", "memory");
2599
2600}
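
/*
 * For contrast, an illustrative sketch (not code from this file) of the
 * non-atomic read-modify-write that the comment above warns about:
 */
#if 0
static void
racy_set_pcb_flags_sketch(struct pcb *pcb, u_int flags)
{
	u_int f;

	f = pcb->pcb_flags;	/* load */
	/* An interrupt here may set PCB_FULL_IRET in pcb_flags... */
	f |= flags;		/* modify the stale copy */
	pcb->pcb_flags = f;	/* ...and this store clobbers the update. */
}
#endif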
2601
2602/*
2603 * Support for the RDFSBASE, WRFSBASE and similar instructions for the
2604 * %fs and %gs bases requires that the kernel save MSR_FSBASE and
2605 * MSR_{K,}GSBASE into the PCB if user space modified the bases.  We
2606 * must save them on the context switch, or if the return to usermode
2607 * happens through doreti.
2608 *
2609 * Both events are tracked by the pcb flag PCB_FULL_IRET, which has the
2610 * consequence that the base MSRs must be saved each time the flag is
2611 * set.  We disable interrupts to sync with context switches.
2612 */
2613void
2614set_pcb_flags(struct pcb *pcb, const u_int flags)
2615{
2616	register_t r;
2617
2618	if (curpcb == pcb &&
2619	    (flags & PCB_FULL_IRET) != 0 &&
2620	    (pcb->pcb_flags & PCB_FULL_IRET) == 0 &&
2621	    (cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0) {
2622		r = intr_disable();
2623		if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) {
2624			if (rfs() == _ufssel)
2625				pcb->pcb_fsbase = rdfsbase();
2626			if (rgs() == _ugssel)
2627				pcb->pcb_gsbase = rdmsr(MSR_KGSBASE);
2628		}
2629		set_pcb_flags_raw(pcb, flags);
2630		intr_restore(r);
2631	} else {
2632		set_pcb_flags_raw(pcb, flags);
2633	}
2634}
2635
2636void
2637clear_pcb_flags(struct pcb *pcb, const u_int flags)
2638{
2639
2640	__asm __volatile("andl %1,%0"
2641	    : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags)
2642	    : "cc", "memory");
2643}
2644
2645#ifdef KDB
2646
2647/*
2648 * Provide inb() and outb() as functions.  They are normally only available
2649 * as inline functions and thus cannot be called from the debugger.
2650 */
2651
2652/* silence compiler warnings */
2653u_char inb_(u_short);
2654void outb_(u_short, u_char);
2655
2656u_char
2657inb_(u_short port)
2658{
2659	return inb(port);
2660}
2661
2662void
2663outb_(u_short port, u_char data)
2664{
2665	outb(port, data);
2666}
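
/*
 * For example, in a hypothetical ddb(4) session the wrappers can be
 * invoked from the debugger prompt:
 *
 *	db> call inb_(0x61)
 *	db> call outb_(0x80, 0x1)
 */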
2667
2668#endif /* KDB */
2669