/*-
 * Copyright (c) 2003 Peter Wemm.
 * Copyright (c) 1992 Terrence R. Lambert.
 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/11/sys/amd64/amd64/machdep.c 323431 2017-09-11 08:48:36Z kib $");

#include "opt_atpic.h"
#include "opt_compat.h"
#include "opt_cpu.h"
#include "opt_ddb.h"
#include "opt_inet.h"
#include "opt_isa.h"
#include "opt_kstack_pages.h"
#include "opt_maxmem.h"
#include "opt_mp_watchdog.h"
#include "opt_perfmon.h"
#include "opt_platform.h"
#include "opt_sched.h"

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/bus.h>
#include <sys/callout.h>
#include <sys/cons.h>
#include <sys/cpu.h>
#include <sys/efi.h>
#include <sys/eventhandler.h>
#include <sys/exec.h>
#include <sys/imgact.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/linker.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/memrange.h>
#include <sys/msgbuf.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/ptrace.h>
#include <sys/reboot.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#ifdef SMP
#include <sys/smp.h>
#endif
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/ucontext.h>
#include <sys/vmmeter.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#include <vm/vm_param.h>

#ifdef DDB
#ifndef KDB
#error KDB must be enabled in order for DDB to work!
#endif
#include <ddb/ddb.h>
#include <ddb/db_sym.h>
#endif

#include <net/netisr.h>

#include <machine/clock.h>
#include <machine/cpu.h>
#include <machine/cputypes.h>
#include <machine/intr_machdep.h>
#include <x86/mca.h>
#include <machine/md_var.h>
#include <machine/metadata.h>
#include <machine/mp_watchdog.h>
#include <machine/pc/bios.h>
#include <machine/pcb.h>
#include <machine/proc.h>
#include <machine/reg.h>
#include <machine/sigframe.h>
#include <machine/specialreg.h>
#ifdef PERFMON
#include <machine/perfmon.h>
#endif
#include <machine/tss.h>
#ifdef SMP
#include <machine/smp.h>
#endif
#ifdef FDT
#include <x86/fdt.h>
#endif

#ifdef DEV_ATPIC
#include <x86/isa/icu.h>
#else
#include <x86/apicvar.h>
#endif

#include <isa/isareg.h>
#include <isa/rtc.h>
#include <x86/init.h>

/* Sanity check for __curthread() */
CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);

extern u_int64_t hammer_time(u_int64_t, u_int64_t);

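/*
 * CS_SECURE() and EFL_SECURE() sanity-check user-supplied context: the
 * %cs selector must have user privilege, and only the user-modifiable
 * rflags bits (PSL_USERCHANGE) may differ from the current frame.
 */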
#define	CS_SECURE(cs)		(ISPL(cs) == SEL_UPL)
#define	EFL_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)

static void cpu_startup(void *);
static void get_fpcontext(struct thread *td, mcontext_t *mcp,
    char *xfpusave, size_t xfpusave_len);
static int  set_fpcontext(struct thread *td, mcontext_t *mcp,
    char *xfpustate, size_t xfpustate_len);
SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);

/* Preload data parse function */
static caddr_t native_parse_preload_data(u_int64_t);

/* Native function to fetch and parse the e820 map */
static void native_parse_memmap(caddr_t, vm_paddr_t *, int *);

/* Default init_ops implementation. */
struct init_ops init_ops = {
	.parse_preload_data =	native_parse_preload_data,
	.early_clock_source_init =	i8254_init,
	.early_delay =			i8254_delay,
	.parse_memmap =			native_parse_memmap,
#ifdef SMP
	.mp_bootaddress =		mp_bootaddress,
	.start_all_aps =		native_start_all_aps,
#endif
	.msi_init =			msi_init,
};

/*
 * The file "conf/ldscript.amd64" defines the symbol "kernphys".  Its value is
 * the physical address at which the kernel is loaded.
 */
extern char kernphys[];

struct msgbuf *msgbufp;

/*
 * Physical address of the EFI System Table. Stashed from the metadata hints
 * passed into the kernel and used by the EFI code to call runtime services.
 */
vm_paddr_t efi_systbl_phys;

/* Intel ICH registers */
#define ICH_PMBASE	0x400
#define ICH_SMI_EN	(ICH_PMBASE + 0x30)

int	_udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel;

int cold = 1;

long Maxmem = 0;
long realmem = 0;

/*
 * The number of PHYSMAP entries must be one less than the number of
 * PHYSSEG entries because the PHYSMAP entry that spans the largest
 * physical address that is accessible by ISA DMA is split into two
 * PHYSSEG entries.
 */
#define	PHYSMAP_SIZE	(2 * (VM_PHYSSEG_MAX - 1))

vm_paddr_t phys_avail[PHYSMAP_SIZE + 2];
vm_paddr_t dump_avail[PHYSMAP_SIZE + 2];

/* must be 2 less so 0 0 can signal end of chunks */
#define	PHYS_AVAIL_ARRAY_END (nitems(phys_avail) - 2)
#define	DUMP_AVAIL_ARRAY_END (nitems(dump_avail) - 2)

struct kva_md_info kmi;

static struct trapframe proc0_tf;
struct region_descriptor r_gdt, r_idt;

struct pcpu __pcpu[MAXCPU];

struct mtx icu_lock;

struct mem_range_softc mem_range_softc;

struct mtx dt_lock;	/* lock for GDT and LDT */

void (*vmm_resume_p)(void);

static void
cpu_startup(void *dummy)
{
	uintmax_t memsize;
	char *sysenv;

	/*
	 * On MacBooks, we need to disallow the legacy USB circuit to
	 * generate an SMI# because this can cause several problems,
	 * namely: incorrect CPU frequency detection and failure to
	 * start the APs.
	 * We do this by disabling a bit in the SMI_EN (SMI Control and
	 * Enable register) of the Intel ICH LPC Interface Bridge.
	 */
	sysenv = kern_getenv("smbios.system.product");
	if (sysenv != NULL) {
		if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
		    strncmp(sysenv, "MacBook3,1", 10) == 0 ||
		    strncmp(sysenv, "MacBook4,1", 10) == 0 ||
		    strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
		    strncmp(sysenv, "Macmini1,1", 10) == 0) {
			if (bootverbose)
				printf("Disabling LEGACY_USB_EN bit on "
				    "Intel ICH.\n");
			outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
		}
		freeenv(sysenv);
	}

	/*
	 * Good {morning,afternoon,evening,night}.
	 */
	startrtclock();
	printcpuinfo();
#ifdef PERFMON
	perfmon_init();
#endif

	/*
	 * Display physical memory if SMBIOS reports reasonable amount.
	 */
	memsize = 0;
	sysenv = kern_getenv("smbios.memory.enabled");
	if (sysenv != NULL) {
		memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
		freeenv(sysenv);
	}
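	/*
	 * If SMBIOS reports less memory than is currently free, the
	 * reported value cannot be trusted; fall back to the
	 * Maxmem-derived figure instead.
	 */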
	if (memsize < ptoa((uintmax_t)vm_cnt.v_free_count))
		memsize = ptoa((uintmax_t)Maxmem);
	printf("real memory  = %ju (%ju MB)\n", memsize, memsize >> 20);
	realmem = atop(memsize);

	/*
	 * Display any holes after the first chunk of extended memory.
	 */
	if (bootverbose) {
		int indx;

		printf("Physical memory chunk(s):\n");
		for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
			vm_paddr_t size;

			size = phys_avail[indx + 1] - phys_avail[indx];
			printf(
			    "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
			    (uintmax_t)phys_avail[indx],
			    (uintmax_t)phys_avail[indx + 1] - 1,
			    (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
		}
	}

	vm_ksubmap_init(&kmi);

	printf("avail memory = %ju (%ju MB)\n",
	    ptoa((uintmax_t)vm_cnt.v_free_count),
	    ptoa((uintmax_t)vm_cnt.v_free_count) / 1048576);

	/*
	 * Set up buffers, so they can be used to read disk labels.
	 */
	bufinit();
	vm_pager_bufferinit();

	cpu_setregs();
}

/*
 * Send an interrupt to process.
 *
 * Stack is set up to allow sigcode stored
 * at top to call routine, followed by call
 * to sigreturn routine below.  After sigreturn
 * resets the signal mask, the stack, and the
 * frame pointer, it returns to the user
 * specified pc, psl.
 */
void
sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
	struct sigframe sf, *sfp;
	struct pcb *pcb;
	struct proc *p;
	struct thread *td;
	struct sigacts *psp;
	char *sp;
	struct trapframe *regs;
	char *xfpusave;
	size_t xfpusave_len;
	int sig;
	int oonstack;

	td = curthread;
	pcb = td->td_pcb;
	p = td->td_proc;
	PROC_LOCK_ASSERT(p, MA_OWNED);
	sig = ksi->ksi_signo;
	psp = p->p_sigacts;
	mtx_assert(&psp->ps_mtx, MA_OWNED);
	regs = td->td_frame;
	oonstack = sigonstack(regs->tf_rsp);

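	/*
	 * With XSAVE in use, the FPU state beyond the legacy savefpu
	 * area must be staged in a temporary buffer before it is
	 * copied out to the user stack.
	 */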
	if (cpu_max_ext_state_size > sizeof(struct savefpu) && use_xsave) {
		xfpusave_len = cpu_max_ext_state_size - sizeof(struct savefpu);
		xfpusave = __builtin_alloca(xfpusave_len);
	} else {
		xfpusave_len = 0;
		xfpusave = NULL;
	}

	/* Save user context. */
	bzero(&sf, sizeof(sf));
	sf.sf_uc.uc_sigmask = *mask;
	sf.sf_uc.uc_stack = td->td_sigstk;
	sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
	    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
	sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
	bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(*regs));
	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
	get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len);
	fpstate_drop(td);
	update_pcb_bases(pcb);
	sf.sf_uc.uc_mcontext.mc_fsbase = pcb->pcb_fsbase;
	sf.sf_uc.uc_mcontext.mc_gsbase = pcb->pcb_gsbase;
	bzero(sf.sf_uc.uc_mcontext.mc_spare,
	    sizeof(sf.sf_uc.uc_mcontext.mc_spare));
	bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__));

	/* Allocate space for the signal handler context. */
	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		sp = (char *)td->td_sigstk.ss_sp + td->td_sigstk.ss_size;
#if defined(COMPAT_43)
		td->td_sigstk.ss_flags |= SS_ONSTACK;
#endif
	} else
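		/* Skip the 128-byte red zone reserved by the amd64 ABI. */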
		sp = (char *)regs->tf_rsp - 128;
	if (xfpusave != NULL) {
		sp -= xfpusave_len;
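		/* The extended FPU state area must be 64-byte aligned. */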
		sp = (char *)((unsigned long)sp & ~0x3Ful);
		sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp;
	}
	sp -= sizeof(struct sigframe);
	/* Align to 16 bytes. */
	sfp = (struct sigframe *)((unsigned long)sp & ~0xFul);

	/* Build the argument list for the signal handler. */
	regs->tf_rdi = sig;			/* arg 1 in %rdi */
	regs->tf_rdx = (register_t)&sfp->sf_uc;	/* arg 3 in %rdx */
	bzero(&sf.sf_si, sizeof(sf.sf_si));
	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
		/* Signal handler installed with SA_SIGINFO. */
		regs->tf_rsi = (register_t)&sfp->sf_si;	/* arg 2 in %rsi */
		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;

		/* Fill in POSIX parts */
		sf.sf_si = ksi->ksi_info;
		sf.sf_si.si_signo = sig; /* maybe a translated signal */
		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
	} else {
		/* Old FreeBSD-style arguments. */
		regs->tf_rsi = ksi->ksi_code;	/* arg 2 in %rsi */
		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
		sf.sf_ahu.sf_handler = catcher;
	}
	mtx_unlock(&psp->ps_mtx);
	PROC_UNLOCK(p);

	/*
	 * Copy the sigframe out to the user's stack.
	 */
	if (copyout(&sf, sfp, sizeof(*sfp)) != 0 ||
	    (xfpusave != NULL && copyout(xfpusave,
	    (void *)sf.sf_uc.uc_mcontext.mc_xfpustate, xfpusave_len)
	    != 0)) {
#ifdef DEBUG
		printf("process %ld has trashed its stack\n", (long)p->p_pid);
#endif
		PROC_LOCK(p);
		sigexit(td, SIGILL);
	}

	regs->tf_rsp = (long)sfp;
	regs->tf_rip = p->p_sysent->sv_sigcode_base;
	regs->tf_rflags &= ~(PSL_T | PSL_D);
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_ss = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _ufssel;
	regs->tf_gs = _ugssel;
	regs->tf_flags = TF_HASSEGS;
	PROC_LOCK(p);
	mtx_lock(&psp->ps_mtx);
}

/*
 * System call to cleanup state after a signal
 * has been taken.  Reset signal mask and
 * stack state from context left by sendsig (above).
 * Return to previous pc and psl as specified by
 * context left by sendsig. Check carefully to
 * make sure that the user has not modified the
 * state to gain improper privileges.
 *
 * MPSAFE
 */
int
sys_sigreturn(struct thread *td, struct sigreturn_args *uap)
{
	ucontext_t uc;
	struct pcb *pcb;
	struct proc *p;
	struct trapframe *regs;
	ucontext_t *ucp;
	char *xfpustate;
	size_t xfpustate_len;
	long rflags;
	int cs, error, ret;
	ksiginfo_t ksi;

	pcb = td->td_pcb;
	p = td->td_proc;

	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
	if (error != 0) {
		uprintf("pid %d (%s): sigreturn copyin failed\n",
		    p->p_pid, td->td_name);
		return (error);
	}
	ucp = &uc;
	if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) {
		uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid,
		    td->td_name, ucp->uc_mcontext.mc_flags);
		return (EINVAL);
	}
	regs = td->td_frame;
	rflags = ucp->uc_mcontext.mc_rflags;
	/*
	 * Don't allow users to change privileged or reserved flags.
	 */
	if (!EFL_SECURE(rflags, regs->tf_rflags)) {
		uprintf("pid %d (%s): sigreturn rflags = 0x%lx\n", p->p_pid,
		    td->td_name, rflags);
		return (EINVAL);
	}

	/*
	 * Don't allow users to load a valid privileged %cs.  Let the
	 * hardware check for invalid selectors, excess privilege in
	 * other selectors, invalid %eip's and invalid %esp's.
	 */
	cs = ucp->uc_mcontext.mc_cs;
	if (!CS_SECURE(cs)) {
		uprintf("pid %d (%s): sigreturn cs = 0x%x\n", p->p_pid,
		    td->td_name, cs);
		ksiginfo_init_trap(&ksi);
		ksi.ksi_signo = SIGBUS;
		ksi.ksi_code = BUS_OBJERR;
		ksi.ksi_trapno = T_PROTFLT;
		ksi.ksi_addr = (void *)regs->tf_rip;
		trapsignal(td, &ksi);
		return (EINVAL);
	}

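	/*
	 * If the context carries extended FPU state, validate its
	 * length and copy it in before installing it.
	 */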
	if ((uc.uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) {
		xfpustate_len = uc.uc_mcontext.mc_xfpustate_len;
		if (xfpustate_len > cpu_max_ext_state_size -
		    sizeof(struct savefpu)) {
			uprintf("pid %d (%s): sigreturn xfpusave_len = 0x%zx\n",
			    p->p_pid, td->td_name, xfpustate_len);
			return (EINVAL);
		}
		xfpustate = __builtin_alloca(xfpustate_len);
		error = copyin((const void *)uc.uc_mcontext.mc_xfpustate,
		    xfpustate, xfpustate_len);
		if (error != 0) {
			uprintf(
	"pid %d (%s): sigreturn copying xfpustate failed\n",
			    p->p_pid, td->td_name);
			return (error);
		}
	} else {
		xfpustate = NULL;
		xfpustate_len = 0;
	}
	ret = set_fpcontext(td, &ucp->uc_mcontext, xfpustate, xfpustate_len);
	if (ret != 0) {
		uprintf("pid %d (%s): sigreturn set_fpcontext err %d\n",
		    p->p_pid, td->td_name, ret);
		return (ret);
	}
	bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(*regs));
	update_pcb_bases(pcb);
	pcb->pcb_fsbase = ucp->uc_mcontext.mc_fsbase;
	pcb->pcb_gsbase = ucp->uc_mcontext.mc_gsbase;

#if defined(COMPAT_43)
	if (ucp->uc_mcontext.mc_onstack & 1)
		td->td_sigstk.ss_flags |= SS_ONSTACK;
	else
		td->td_sigstk.ss_flags &= ~SS_ONSTACK;
#endif

	kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
	return (EJUSTRETURN);
}

#ifdef COMPAT_FREEBSD4
int
freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap)
{

	return sys_sigreturn(td, (struct sigreturn_args *)uap);
}
#endif

/*
 * Reset registers to default values on exec.
 */
void
exec_setregs(struct thread *td, struct image_params *imgp, u_long stack)
{
	struct trapframe *regs = td->td_frame;
	struct pcb *pcb = td->td_pcb;

	mtx_lock(&dt_lock);
	if (td->td_proc->p_md.md_ldt != NULL)
		user_ldt_free(td);
	else
		mtx_unlock(&dt_lock);

	update_pcb_bases(pcb);
	pcb->pcb_fsbase = 0;
	pcb->pcb_gsbase = 0;
	clear_pcb_flags(pcb, PCB_32BIT);
	pcb->pcb_initial_fpucw = __INITIAL_FPUCW__;

	bzero((char *)regs, sizeof(struct trapframe));
	regs->tf_rip = imgp->entry_addr;
	regs->tf_rsp = ((stack - 8) & ~0xFul) + 8;
	regs->tf_rdi = stack;		/* argv */
	regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T);
	regs->tf_ss = _udatasel;
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _ufssel;
	regs->tf_gs = _ugssel;
	regs->tf_flags = TF_HASSEGS;
	td->td_retval[1] = 0;

	/*
	 * Reset the hardware debug registers if they were in use.
	 * They won't have any meaning for the newly exec'd process.
	 */
	if (pcb->pcb_flags & PCB_DBREGS) {
		pcb->pcb_dr0 = 0;
		pcb->pcb_dr1 = 0;
		pcb->pcb_dr2 = 0;
		pcb->pcb_dr3 = 0;
		pcb->pcb_dr6 = 0;
		pcb->pcb_dr7 = 0;
		if (pcb == curpcb) {
			/*
			 * Clear the debug registers on the running
			 * CPU, otherwise they will end up affecting
			 * the next process we switch to.
			 */
			reset_dbregs();
		}
		clear_pcb_flags(pcb, PCB_DBREGS);
	}

	/*
	 * Drop the FP state if we hold it, so that the process gets a
	 * clean FP state if it uses the FPU again.
	 */
	fpstate_drop(td);
}

void
cpu_setregs(void)
{
	register_t cr0;

	cr0 = rcr0();
	/*
	 * CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the
	 * BSP.  See the comments there about why we set them.
	 */
	cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
	load_cr0(cr0);
}

/*
 * Initialize amd64 and configure to run kernel
 */

/*
 * Initialize segments & interrupt table
 */

struct user_segment_descriptor gdt[NGDT * MAXCPU]; /* global descriptor tables */
static struct gate_descriptor idt0[NIDT];
struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */

static char dblfault_stack[PAGE_SIZE] __aligned(16);

static char nmi0_stack[PAGE_SIZE] __aligned(16);
CTASSERT(sizeof(struct nmi_pcpu) == 16);

struct amd64tss common_tss[MAXCPU];

/*
 * Software prototypes -- in more palatable form.
 *
 * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
 * slots as corresponding segments for i386 kernel.
 */
struct soft_segment_descriptor gdt_segs[] = {
/* GNULL_SEL	0 Null Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GNULL2_SEL	1 Null Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUFS32_SEL	2 32 bit %fs Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUGS32_SEL	3 32 bit %gs Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GCODE_SEL	4 Code Descriptor for kernel */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GDATA_SEL	5 Data Descriptor for kernel */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GUCODE32_SEL	6 32 bit Code Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUDATA_SEL	7 32/64 bit Data Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUCODE_SEL	8 64 bit Code Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GPROC0_SEL	9 Proc 0 Tss Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
	.ssd_type = SDT_SYSTSS,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* Actually, the TSS is a system descriptor which is double size */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUSERLDT_SEL	11 LDT Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUSERLDT_SEL	12 LDT Descriptor, double size */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
};

void
setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
{
	struct gate_descriptor *ip;

	ip = idt + idx;
	ip->gd_looffset = (uintptr_t)func;
	ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
	ip->gd_ist = ist;
	ip->gd_xx = 0;
	ip->gd_type = typ;
	ip->gd_dpl = dpl;
	ip->gd_p = 1;
	ip->gd_hioffset = ((uintptr_t)func) >> 16;
}

extern inthand_t
	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
	IDTVEC(xmm), IDTVEC(dblfault),
#ifdef KDTRACE_HOOKS
	IDTVEC(dtrace_ret),
#endif
#ifdef XENHVM
	IDTVEC(xen_intr_upcall),
#endif
	IDTVEC(fast_syscall), IDTVEC(fast_syscall32);

#ifdef DDB
/*
 * Display the index and function name of any IDT entries that don't use
 * the default 'rsvd' entry point.
 */
DB_SHOW_COMMAND(idt, db_show_idt)
{
	struct gate_descriptor *ip;
	int idx;
	uintptr_t func;

	ip = idt;
	for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
		func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
		if (func != (uintptr_t)&IDTVEC(rsvd)) {
			db_printf("%3d\t", idx);
			db_printsym(func, DB_STGY_PROC);
			db_printf("\n");
		}
		ip++;
	}
}

/* Show privileged registers. */
DB_SHOW_COMMAND(sysregs, db_show_sysregs)
{
	struct {
		uint16_t limit;
		uint64_t base;
	} __packed idtr, gdtr;
	uint16_t ldt, tr;

	__asm __volatile("sidt %0" : "=m" (idtr));
	db_printf("idtr\t0x%016lx/%04x\n",
	    (u_long)idtr.base, (u_int)idtr.limit);
	__asm __volatile("sgdt %0" : "=m" (gdtr));
	db_printf("gdtr\t0x%016lx/%04x\n",
	    (u_long)gdtr.base, (u_int)gdtr.limit);
	__asm __volatile("sldt %0" : "=r" (ldt));
	db_printf("ldtr\t0x%04x\n", ldt);
	__asm __volatile("str %0" : "=r" (tr));
	db_printf("tr\t0x%04x\n", tr);
	db_printf("cr0\t0x%016lx\n", rcr0());
	db_printf("cr2\t0x%016lx\n", rcr2());
	db_printf("cr3\t0x%016lx\n", rcr3());
	db_printf("cr4\t0x%016lx\n", rcr4());
	if (rcr4() & CR4_XSAVE)
		db_printf("xcr0\t0x%016lx\n", rxcr(0));
	db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER));
	if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
		db_printf("FEATURES_CTL\t%016lx\n",
		    rdmsr(MSR_IA32_FEATURE_CONTROL));
	db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR));
	db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT));
	db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE));
}

DB_SHOW_COMMAND(dbregs, db_show_dbregs)
{

	db_printf("dr0\t0x%016lx\n", rdr0());
	db_printf("dr1\t0x%016lx\n", rdr1());
	db_printf("dr2\t0x%016lx\n", rdr2());
	db_printf("dr3\t0x%016lx\n", rdr3());
	db_printf("dr6\t0x%016lx\n", rdr6());
	db_printf("dr7\t0x%016lx\n", rdr7());
}
#endif

void
sdtossd(struct user_segment_descriptor *sd,
    struct soft_segment_descriptor *ssd)
{

	ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
	ssd->ssd_type  = sd->sd_type;
	ssd->ssd_dpl   = sd->sd_dpl;
	ssd->ssd_p     = sd->sd_p;
	ssd->ssd_long  = sd->sd_long;
	ssd->ssd_def32 = sd->sd_def32;
	ssd->ssd_gran  = sd->sd_gran;
}

void
ssdtosd(struct soft_segment_descriptor *ssd,
    struct user_segment_descriptor *sd)
{

	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type  = ssd->ssd_type;
	sd->sd_dpl   = ssd->ssd_dpl;
	sd->sd_p     = ssd->ssd_p;
	sd->sd_long  = ssd->ssd_long;
	sd->sd_def32 = ssd->ssd_def32;
	sd->sd_gran  = ssd->ssd_gran;
}

void
ssdtosyssd(struct soft_segment_descriptor *ssd,
    struct system_segment_descriptor *sd)
{

	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type  = ssd->ssd_type;
	sd->sd_dpl   = ssd->ssd_dpl;
	sd->sd_p     = ssd->ssd_p;
	sd->sd_gran  = ssd->ssd_gran;
}

#if !defined(DEV_ATPIC) && defined(DEV_ISA)
#include <isa/isavar.h>
#include <isa/isareg.h>
/*
 * Return a bitmap of the current interrupt requests.  This is 8259-specific
 * and is only suitable for use at probe time.
 * This is only here to pacify sio.  It is NOT FATAL if this doesn't work.
 * It shouldn't be here.  There should probably be an APIC centric
 * implementation in the apic driver code, if at all.
 */
intrmask_t
isa_irq_pending(void)
{
	u_char irr1;
	u_char irr2;

	irr1 = inb(IO_ICU1);
	irr2 = inb(IO_ICU2);
	return ((irr2 << 8) | irr1);
}
#endif

u_int basemem;

static int
add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
    int *physmap_idxp)
{
	int i, insert_idx, physmap_idx;

	physmap_idx = *physmap_idxp;

	if (length == 0)
		return (1);

	/*
	 * Find insertion point while checking for overlap.  Start off by
	 * assuming the new entry will be added to the end.
	 *
	 * NB: physmap_idx points to the next free slot.
	 */
	insert_idx = physmap_idx;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (base < physmap[i + 1]) {
			if (base + length <= physmap[i]) {
				insert_idx = i;
				break;
			}
			if (boothowto & RB_VERBOSE)
				printf(
		    "Overlapping memory regions, ignoring second region\n");
			return (1);
		}
	}

	/* See if we can prepend to the next entry. */
	if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) {
		physmap[insert_idx] = base;
		return (1);
	}

	/* See if we can append to the previous entry. */
	if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
		physmap[insert_idx - 1] += length;
		return (1);
	}

	physmap_idx += 2;
	*physmap_idxp = physmap_idx;
	if (physmap_idx == PHYSMAP_SIZE) {
		printf(
		"Too many segments in the physical address map, giving up\n");
		return (0);
	}

	/*
	 * Move the last 'N' entries down to make room for the new
	 * entry if needed.
	 */
	for (i = (physmap_idx - 2); i > insert_idx; i -= 2) {
		physmap[i] = physmap[i - 2];
		physmap[i + 1] = physmap[i - 1];
	}

	/* Insert the new entry. */
	physmap[insert_idx] = base;
	physmap[insert_idx + 1] = base + length;
	return (1);
}

void
bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize,
    vm_paddr_t *physmap, int *physmap_idx)
{
	struct bios_smap *smap, *smapend;

	smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);

	for (smap = smapbase; smap < smapend; smap++) {
		if (boothowto & RB_VERBOSE)
			printf("SMAP type=%02x base=%016lx len=%016lx\n",
			    smap->type, smap->base, smap->length);

		if (smap->type != SMAP_TYPE_MEMORY)
			continue;

		if (!add_physmap_entry(smap->base, smap->length, physmap,
		    physmap_idx))
			break;
	}
}

static void
add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
    int *physmap_idx)
{
	struct efi_md *map, *p;
	const char *type;
	size_t efisz;
	int ndesc, i;

	static const char *types[] = {
		"Reserved",
		"LoaderCode",
		"LoaderData",
		"BootServicesCode",
		"BootServicesData",
		"RuntimeServicesCode",
		"RuntimeServicesData",
		"ConventionalMemory",
		"UnusableMemory",
		"ACPIReclaimMemory",
		"ACPIMemoryNVS",
		"MemoryMappedIO",
		"MemoryMappedIOPortSpace",
		"PalCode",
		"PersistentMemory"
	};

	/*
	 * Memory map data provided by UEFI via the GetMemoryMap
	 * Boot Services API.
	 */
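	/* The descriptor array follows the header, padded to 16 bytes. */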
	efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
	map = (struct efi_md *)((uint8_t *)efihdr + efisz);

	if (efihdr->descriptor_size == 0)
		return;
	ndesc = efihdr->memory_size / efihdr->descriptor_size;

	if (boothowto & RB_VERBOSE)
		printf("%23s %12s %12s %8s %4s\n",
		    "Type", "Physical", "Virtual", "#Pages", "Attr");

	for (i = 0, p = map; i < ndesc; i++,
	    p = efi_next_descriptor(p, efihdr->descriptor_size)) {
		if (boothowto & RB_VERBOSE) {
			if (p->md_type < nitems(types))
				type = types[p->md_type];
			else
				type = "<INVALID>";
			printf("%23s %012lx %12p %08lx ", type, p->md_phys,
			    p->md_virt, p->md_pages);
			if (p->md_attr & EFI_MD_ATTR_UC)
				printf("UC ");
			if (p->md_attr & EFI_MD_ATTR_WC)
				printf("WC ");
			if (p->md_attr & EFI_MD_ATTR_WT)
				printf("WT ");
			if (p->md_attr & EFI_MD_ATTR_WB)
				printf("WB ");
			if (p->md_attr & EFI_MD_ATTR_UCE)
				printf("UCE ");
			if (p->md_attr & EFI_MD_ATTR_WP)
				printf("WP ");
			if (p->md_attr & EFI_MD_ATTR_RP)
				printf("RP ");
			if (p->md_attr & EFI_MD_ATTR_XP)
				printf("XP ");
			if (p->md_attr & EFI_MD_ATTR_NV)
				printf("NV ");
			if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE)
				printf("MORE_RELIABLE ");
			if (p->md_attr & EFI_MD_ATTR_RO)
				printf("RO ");
			if (p->md_attr & EFI_MD_ATTR_RT)
				printf("RUNTIME");
			printf("\n");
		}

		switch (p->md_type) {
		case EFI_MD_TYPE_CODE:
		case EFI_MD_TYPE_DATA:
		case EFI_MD_TYPE_BS_CODE:
		case EFI_MD_TYPE_BS_DATA:
		case EFI_MD_TYPE_FREE:
			/*
			 * We're allowed to use any entry with these types.
			 */
			break;
		default:
			continue;
		}

		if (!add_physmap_entry(p->md_phys, (p->md_pages * PAGE_SIZE),
		    physmap, physmap_idx))
			break;
	}
}

static char bootmethod[16] = "";
SYSCTL_STRING(_machdep, OID_AUTO, bootmethod, CTLFLAG_RD, bootmethod, 0,
    "System firmware boot method");

static void
native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx)
{
	struct bios_smap *smap;
	struct efi_map_header *efihdr;
	u_int32_t size;

	/*
	 * Memory map from INT 15:E820.
	 *
	 * subr_module.c says:
	 * "Consumer may safely assume that size value precedes data."
	 * ie: an int32_t immediately precedes smap.
	 */

	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
	smap = (struct bios_smap *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);
	if (efihdr == NULL && smap == NULL)
		panic("No BIOS smap or EFI map info from loader!");

	if (efihdr != NULL) {
		add_efi_map_entries(efihdr, physmap, physmap_idx);
		strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
	} else {
		size = *((u_int32_t *)smap - 1);
		bios_add_smap_entries(smap, size, physmap, physmap_idx);
		strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
	}
}

#define	PAGES_PER_GB	(1024 * 1024 * 1024 / PAGE_SIZE)

/*
 * Populate the (physmap) array with base/bound pairs describing the
 * available physical memory in the system, then test this memory and
 * build the phys_avail array describing the actually-available memory.
 *
 * Total memory size may be set by the kernel environment variable
 * hw.physmem or the compile-time define MAXMEM.
 *
 * XXX first should be vm_paddr_t.
 */
static void
getmemsize(caddr_t kmdp, u_int64_t first)
{
	int i, physmap_idx, pa_indx, da_indx;
	vm_paddr_t pa, physmap[PHYSMAP_SIZE];
	u_long physmem_start, physmem_tunable, memtest;
	pt_entry_t *pte;
	quad_t dcons_addr, dcons_size;
	int page_counter;

	bzero(physmap, sizeof(physmap));
	physmap_idx = 0;

	init_ops.parse_memmap(kmdp, physmap, &physmap_idx);
	physmap_idx -= 2;

	/*
	 * Find the 'base memory' segment for SMP
	 */
	basemem = 0;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (physmap[i] <= 0xA0000) {
			basemem = physmap[i + 1] / 1024;
			break;
		}
	}
	if (basemem == 0 || basemem > 640) {
		if (bootverbose)
			printf(
		"Memory map doesn't contain a basemem segment, faking it\n");
		basemem = 640;
	}

	/*
	 * Make a hole for the "AP -> long mode" bootstrap code.  The
	 * mp_bootaddress vector is only available when the kernel
	 * is configured to support APs and the APs start in 32-bit
	 * mode (e.g. SMP bare metal).
	 */
	if (init_ops.mp_bootaddress) {
		if (physmap[1] >= 0x100000000)
			panic(
	"Basemem segment is not suitable for AP bootstrap code!");
		physmap[1] = init_ops.mp_bootaddress(physmap[1] / 1024);
	}

	/*
	 * Maxmem isn't the "maximum memory", it's one larger than the
	 * highest page of the physical address space.  It should be
	 * called something like "Maxphyspage".  We may adjust this
	 * based on ``hw.physmem'' and the results of the memory test.
	 */
	Maxmem = atop(physmap[physmap_idx + 1]);

#ifdef MAXMEM
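	/* The MAXMEM option is in kilobytes; convert it to 4KB pages. */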
	Maxmem = MAXMEM / 4;
#endif

	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
		Maxmem = atop(physmem_tunable);

	/*
	 * The boot memory test is disabled by default, as it takes a
	 * significant amount of time on large-memory systems, and is
	 * unfriendly to virtual machines as it unnecessarily touches all
	 * pages.
	 *
	 * A general name is used as the code may be extended to support
	 * additional tests beyond the current "page present" test.
	 */
	memtest = 0;
	TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);

	/*
	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
	 * in the system.
	 */
	if (Maxmem > atop(physmap[physmap_idx + 1]))
		Maxmem = atop(physmap[physmap_idx + 1]);

	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
	    (boothowto & RB_VERBOSE))
		printf("Physical memory use set to %ldK\n", Maxmem * 4);

	/* call pmap initialization to make new kernel address space */
	pmap_bootstrap(&first);

	/*
	 * Size up each available chunk of physical memory.
	 *
	 * XXX Some BIOSes corrupt low 64KB between suspend and resume.
	 * By default, mask off the first 16 pages unless we appear to be
	 * running in a VM.
	 */
	physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
	TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
	if (physmap[0] < physmem_start) {
		if (physmem_start < PAGE_SIZE)
			physmap[0] = PAGE_SIZE;
		else if (physmem_start >= physmap[1])
			physmap[0] = round_page(physmap[1] - PAGE_SIZE);
		else
			physmap[0] = round_page(physmem_start);
	}
	pa_indx = 0;
	da_indx = 1;
	phys_avail[pa_indx++] = physmap[0];
	phys_avail[pa_indx] = physmap[0];
	dump_avail[da_indx] = physmap[0];
	pte = CMAP1;

	/*
	 * Get dcons buffer address
	 */
	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
	    getenv_quad("dcons.size", &dcons_size) == 0)
		dcons_addr = 0;

	/*
	 * physmap is in bytes, so when converting to page boundaries,
	 * round up the start address and round down the end address.
	 */
	page_counter = 0;
	if (memtest != 0)
		printf("Testing system memory");
	for (i = 0; i <= physmap_idx; i += 2) {
		vm_paddr_t end;

		end = ptoa((vm_paddr_t)Maxmem);
		if (physmap[i + 1] < end)
			end = trunc_page(physmap[i + 1]);
		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
			int tmp, page_bad, full;
			int *ptr = (int *)CADDR1;

			full = FALSE;
			/*
			 * block out kernel memory as not available.
			 */
			if (pa >= (vm_paddr_t)kernphys && pa < first)
				goto do_dump_avail;

			/*
			 * block out dcons buffer
			 */
			if (dcons_addr > 0
			    && pa >= trunc_page(dcons_addr)
			    && pa < dcons_addr + dcons_size)
				goto do_dump_avail;

			page_bad = FALSE;
			if (memtest == 0)
				goto skip_memtest;

			/*
			 * Print a "." every GB to show we're making
			 * progress.
			 */
			page_counter++;
			if ((page_counter % PAGES_PER_GB) == 0)
				printf(".");

			/*
			 * map page into kernel: valid, read/write, non-cacheable
			 */
			*pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
			invltlb();

			tmp = *(int *)ptr;
			/*
			 * Test for alternating 1's and 0's
			 */
			*(volatile int *)ptr = 0xaaaaaaaa;
			if (*(volatile int *)ptr != 0xaaaaaaaa)
				page_bad = TRUE;
			/*
			 * Test for alternating 0's and 1's
			 */
			*(volatile int *)ptr = 0x55555555;
			if (*(volatile int *)ptr != 0x55555555)
				page_bad = TRUE;
			/*
			 * Test for all 1's
			 */
			*(volatile int *)ptr = 0xffffffff;
			if (*(volatile int *)ptr != 0xffffffff)
				page_bad = TRUE;
			/*
			 * Test for all 0's
			 */
			*(volatile int *)ptr = 0x0;
			if (*(volatile int *)ptr != 0x0)
				page_bad = TRUE;
			/*
			 * Restore original value.
			 */
			*(int *)ptr = tmp;

skip_memtest:
			/*
			 * Adjust array of valid/good pages.
			 */
			if (page_bad == TRUE)
				continue;
			/*
			 * If this good page is a continuation of the
			 * previous set of good pages, then just increase
			 * the end pointer. Otherwise start a new chunk.
			 * Note that "end" points one page past the last
			 * valid page, making the range >= start and
			 * < end.
			 * If we're also doing a speculative memory
			 * test and we're at or past the end, bump up
			 * Maxmem so that we keep going. The first bad
			 * page will terminate the loop.
			 */
			if (phys_avail[pa_indx] == pa) {
				phys_avail[pa_indx] += PAGE_SIZE;
			} else {
				pa_indx++;
				if (pa_indx == PHYS_AVAIL_ARRAY_END) {
					printf(
		"Too many holes in the physical address space, giving up\n");
					pa_indx--;
					full = TRUE;
					goto do_dump_avail;
				}
				phys_avail[pa_indx++] = pa;	/* start */
				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
			}
			physmem++;
do_dump_avail:
			if (dump_avail[da_indx] == pa) {
				dump_avail[da_indx] += PAGE_SIZE;
			} else {
				da_indx++;
				if (da_indx == DUMP_AVAIL_ARRAY_END) {
					da_indx--;
					goto do_next;
				}
				dump_avail[da_indx++] = pa; /* start */
				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
			}
do_next:
			if (full)
				break;
		}
	}
	*pte = 0;
	invltlb();
	if (memtest != 0)
		printf("\n");

	/*
	 * XXX
	 * The last chunk must contain at least one page plus the message
	 * buffer to avoid complicating other code (message buffer address
	 * calculation, etc.).
	 */
	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
	    round_page(msgbufsize) >= phys_avail[pa_indx]) {
		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
		phys_avail[pa_indx--] = 0;
		phys_avail[pa_indx--] = 0;
	}

	Maxmem = atop(phys_avail[pa_indx]);

	/* Trim off space for the message buffer. */
	phys_avail[pa_indx] -= round_page(msgbufsize);

	/* Map the message buffer. */
	msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
}

static caddr_t
native_parse_preload_data(u_int64_t modulep)
{
	caddr_t kmdp;
	char *envp;
#ifdef DDB
	vm_offset_t ksym_start;
	vm_offset_t ksym_end;
#endif

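	/*
	 * modulep is a physical address; the preload metadata is
	 * reachable through the kernel map by adding KERNBASE.
	 */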
	preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
	preload_bootstrap_relocate(KERNBASE);
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
	envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *);
	if (envp != NULL)
		envp += KERNBASE;
	init_static_kenv(envp, 0);
#ifdef DDB
	ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
	ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
	db_fetch_ksymtab(ksym_start, ksym_end);
#endif
	efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);

	return (kmdp);
}

static void
amd64_kdb_init(void)
{
	kdb_init();
#ifdef KDB
	if (boothowto & RB_KDB)
		kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
#endif
}

u_int64_t
hammer_time(u_int64_t modulep, u_int64_t physfree)
{
	caddr_t kmdp;
	int gsel_tss, x;
	struct pcpu *pc;
	struct nmi_pcpu *np;
	struct xstate_hdr *xhdr;
	u_int64_t msr;
	char *env;
	size_t kstack0_sz;
	int late_console;

	/*
	 * This may be done better later if it gets more high level
	 * components in it. If so just link td->td_proc here.
	 */
	proc_linkup0(&proc0, &thread0);

	kmdp = init_ops.parse_preload_data(modulep);

	identify_cpu();
	identify_hypervisor();

	/* Init basic tunables, hz etc */
	init_param1();

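	/* Carve thread0's kernel stack from the first free physical memory. */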
	thread0.td_kstack = physfree + KERNBASE;
	thread0.td_kstack_pages = kstack_pages;
	kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
	bzero((void *)thread0.td_kstack, kstack0_sz);
	physfree += kstack0_sz;

	/*
	 * make gdt memory segments
	 */
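	/*
	 * The TSS and LDT slots hold system descriptors, each of which
	 * occupies two GDT entries; they are initialized separately.
	 */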
	for (x = 0; x < NGDT; x++) {
		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
		    x != GUSERLDT_SEL && x != (GUSERLDT_SEL + 1))
			ssdtosd(&gdt_segs[x], &gdt[x]);
	}
	gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&common_tss[0];
	ssdtosyssd(&gdt_segs[GPROC0_SEL],
	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);

	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
	r_gdt.rd_base = (long) gdt;
	lgdt(&r_gdt);
	pc = &__pcpu[0];

	wrmsr(MSR_FSBASE, 0);		/* User value */
	wrmsr(MSR_GSBASE, (u_int64_t)pc);
	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */

	pcpu_init(pc, 0, sizeof(struct pcpu));
	dpcpu_init((void *)(physfree + KERNBASE), 0);
	physfree += DPCPU_SIZE;
	PCPU_SET(prvspace, pc);
	PCPU_SET(curthread, &thread0);
	/* Non-late cninit() and printf() can be moved up to here. */
	PCPU_SET(tssp, &common_tss[0]);
	PCPU_SET(commontssp, &common_tss[0]);
	PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
	PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
	PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
	PCPU_SET(gs32p, &gdt[GUGS32_SEL]);

	/*
	 * Initialize mutexes.
	 *
	 * icu_lock: in order to allow an interrupt to occur in a critical
	 *	     section, to set pcpu->ipending (etc...) properly, we
	 *	     must be able to get the icu lock, so it can't be
	 *	     under witness.
	 */
	mutex_init();
	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);

	/* exceptions */
	for (x = 0; x < NIDT; x++)
		setidt(x, &IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_DE, &IDTVEC(div),  SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_DB, &IDTVEC(dbg),  SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_NMI, &IDTVEC(nmi),  SDT_SYSIGT, SEL_KPL, 2);
	setidt(IDT_BP, &IDTVEC(bpt),  SDT_SYSIGT, SEL_UPL, 0);
	setidt(IDT_OF, &IDTVEC(ofl),  SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_BR, &IDTVEC(bnd),  SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_UD, &IDTVEC(ill),  SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_NM, &IDTVEC(dna),  SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
	setidt(IDT_FPUGP, &IDTVEC(fpusegm),  SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_TS, &IDTVEC(tss),  SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_NP, &IDTVEC(missing),  SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_SS, &IDTVEC(stk),  SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_GP, &IDTVEC(prot),  SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_PF, &IDTVEC(page),  SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_MF, &IDTVEC(fpu),  SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_AC, &IDTVEC(align), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_MC, &IDTVEC(mchk),  SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_XF, &IDTVEC(xmm), SDT_SYSIGT, SEL_KPL, 0);
#ifdef KDTRACE_HOOKS
	setidt(IDT_DTRACE_RET, &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
#endif
#ifdef XENHVM
	setidt(IDT_EVTCHN, &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_UPL, 0);
#endif

	r_idt.rd_limit = sizeof(idt0) - 1;
	r_idt.rd_base = (long) idt;
	lidt(&r_idt);

	/*
	 * Initialize the clock before the console so that console
	 * initialization can use DELAY().
	 */
	clock_init();

	/*
	 * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4)
	 * transition).
	 * Once bootblocks have updated, we can test directly for
	 * efi_systbl != NULL here...
	 */
	if (preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP)
	    != NULL)
		vty_set_preferred(VTY_VT);

	finishidentcpu();	/* Final stage of CPU initialization */
	initializecpu();	/* Initialize CPU registers */
	initializecpucache();

	/* doublefault stack space, runs on ist1 */
	common_tss[0].tss_ist1 = (long)&dblfault_stack[sizeof(dblfault_stack)];

	/*
	 * NMI stack, runs on ist2.  The pcpu pointer is stored just
	 * above the start of the ist2 stack.
	 */
	np = ((struct nmi_pcpu *) &nmi0_stack[sizeof(nmi0_stack)]) - 1;
	np->np_pcpu = (register_t) pc;
	common_tss[0].tss_ist2 = (long) np;

	/* Set the IO permission bitmap (empty due to tss seg limit) */
	common_tss[0].tss_iobase = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE;

	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	ltr(gsel_tss);

	/* Set up the fast syscall stuff */
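	/*
	 * EFER.SCE enables the SYSCALL/SYSRET instructions.  LSTAR and
	 * CSTAR hold the 64-bit and 32-bit compat mode entry points,
	 * STAR supplies the kernel and user segment selector bases, and
	 * SF_MASK lists the rflags bits to clear on kernel entry.
	 */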
	msr = rdmsr(MSR_EFER) | EFER_SCE;
	wrmsr(MSR_EFER, msr);
	wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall));
	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
	      ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
	wrmsr(MSR_STAR, msr);
	wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D);

	/*
	 * Temporarily forge a valid pointer to the PCB, for the
	 * exception handlers.  It is reinitialized properly below after
	 * the FPU is set up.  Also set up td_critnest to short-cut the
	 * page fault handler.
	 */
	cpu_max_ext_state_size = sizeof(struct savefpu);
	thread0.td_pcb = get_pcb_td(&thread0);
	thread0.td_critnest = 1;

	/*
	 * The console and kdb should be initialized even earlier than here,
	 * but some console drivers don't work until after getmemsize().
	 * Default to late console initialization to support these drivers.
	 * This loses mainly printf()s in getmemsize() and early debugging.
	 */
	late_console = 1;
	TUNABLE_INT_FETCH("debug.late_console", &late_console);
	if (!late_console) {
		cninit();
		amd64_kdb_init();
	}

	getmemsize(kmdp, physfree);
	init_param2(physmem);

	/* now running on new page tables, configured, and u/iom is accessible */

	if (late_console)
		cninit();

#ifdef DEV_ISA
#ifdef DEV_ATPIC
	elcr_probe();
	atpic_startup();
#else
	/* Reset and mask the atpics and leave them shut down. */
	atpic_reset();

	/*
	 * Point the ICU spurious interrupt vectors at the APIC spurious
	 * interrupt handler.
	 */
	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
#endif
#else
#error "have you forgotten the isa device?"
#endif

	if (late_console)
		amd64_kdb_init();

	msgbufinit(msgbufp, msgbufsize);
	fpuinit();

	/*
	 * Set up thread0 pcb after fpuinit calculated pcb + fpu save
	 * area size.  Zero out the extended state header in fpu save
	 * area.
	 */
	thread0.td_pcb = get_pcb_td(&thread0);
	thread0.td_pcb->pcb_save = get_pcb_user_save_td(&thread0);
	bzero(get_pcb_user_save_td(&thread0), cpu_max_ext_state_size);
	if (use_xsave) {
		xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) +
		    1);
		xhdr->xstate_bv = xsave_mask;
	}
	/* make an initial tss so cpu can get interrupt stack on syscall! */
	common_tss[0].tss_rsp0 = (vm_offset_t)thread0.td_pcb;
	/* Ensure the stack is aligned to 16 bytes */
	common_tss[0].tss_rsp0 &= ~0xFul;
	PCPU_SET(rsp0, common_tss[0].tss_rsp0);
	PCPU_SET(curpcb, thread0.td_pcb);

	/* transfer to user mode */

	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
	_ufssel = GSEL(GUFS32_SEL, SEL_UPL);
	_ugssel = GSEL(GUGS32_SEL, SEL_UPL);

	load_ds(_udatasel);
	load_es(_udatasel);
	load_fs(_ufssel);

	/* setup proc 0's pcb */
	thread0.td_pcb->pcb_flags = 0;
	thread0.td_frame = &proc0_tf;

	env = kern_getenv("kernelname");
	if (env != NULL)
		strlcpy(kernelname, env, sizeof(kernelname));

	cpu_probe_amdc1e();

#ifdef FDT
	x86_init_fdt();
#endif
	thread0.td_critnest = 0;

	/* Location of kernel stack for locore */
	return ((u_int64_t)thread0.td_pcb);
}

void
cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
{

	pcpu->pc_acpi_id = 0xffffffff;
}

static int
smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	struct bios_smap *smapbase;
	struct bios_smap_xattr smap;
	caddr_t kmdp;
	uint32_t *smapattr;
	int count, error, i;

	/* Retrieve the system memory map from the loader. */
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	smapbase = (struct bios_smap *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);
	if (smapbase == NULL)
		return (0);
	smapattr = (uint32_t *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
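	/*
	 * The loader stores the size of the SMAP data in the 32-bit
	 * word immediately preceding it.
	 */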
1818	count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase);
1819	error = 0;
1820	for (i = 0; i < count; i++) {
1821		smap.base = smapbase[i].base;
1822		smap.length = smapbase[i].length;
1823		smap.type = smapbase[i].type;
1824		if (smapattr != NULL)
1825			smap.xattr = smapattr[i];
1826		else
1827			smap.xattr = 0;
1828		error = SYSCTL_OUT(req, &smap, sizeof(smap));
1829	}
1830	return (error);
1831}
1832SYSCTL_PROC(_machdep, OID_AUTO, smap, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
1833    smap_sysctl_handler, "S,bios_smap_xattr", "Raw BIOS SMAP data");
1834
1835static int
1836efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
1837{
1838	struct efi_map_header *efihdr;
1839	caddr_t kmdp;
1840	uint32_t efisize;
1841
1842	kmdp = preload_search_by_type("elf kernel");
1843	if (kmdp == NULL)
1844		kmdp = preload_search_by_type("elf64 kernel");
1845	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
1846	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
1847	if (efihdr == NULL)
1848		return (0);
1849	efisize = *((uint32_t *)efihdr - 1);
1850	return (SYSCTL_OUT(req, efihdr, efisize));
1851}
1852SYSCTL_PROC(_machdep, OID_AUTO, efi_map, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
1853    efi_map_sysctl_handler, "S,efi_map_header", "Raw EFI Memory Map");
1854
1855void
1856spinlock_enter(void)
1857{
1858	struct thread *td;
1859	register_t flags;
1860
1861	td = curthread;
1862	if (td->td_md.md_spinlock_count == 0) {
1863		flags = intr_disable();
1864		td->td_md.md_spinlock_count = 1;
1865		td->td_md.md_saved_flags = flags;
1866	} else
1867		td->td_md.md_spinlock_count++;
1868	critical_enter();
1869}
1870
1871void
1872spinlock_exit(void)
1873{
1874	struct thread *td;
1875	register_t flags;
1876
1877	td = curthread;
1878	critical_exit();
1879	flags = td->td_md.md_saved_flags;
1880	td->td_md.md_spinlock_count--;
1881	if (td->td_md.md_spinlock_count == 0)
1882		intr_restore(flags);
1883}
1884
/*
 * Construct a PCB from a trapframe. This is called from kdb_trap() where
 * we want to start a backtrace from the function that caused us to enter
 * the debugger. We have the context in the trapframe, but base the trace
 * on the PCB. The PCB doesn't have to be perfect, as long as it contains
 * enough for a backtrace.
 */
void
makectx(struct trapframe *tf, struct pcb *pcb)
{

	pcb->pcb_r12 = tf->tf_r12;
	pcb->pcb_r13 = tf->tf_r13;
	pcb->pcb_r14 = tf->tf_r14;
	pcb->pcb_r15 = tf->tf_r15;
	pcb->pcb_rbp = tf->tf_rbp;
	pcb->pcb_rbx = tf->tf_rbx;
	pcb->pcb_rip = tf->tf_rip;
	pcb->pcb_rsp = tf->tf_rsp;
}

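/*
 * ptrace(2) support: redirect a stopped thread's user-mode program
 * counter.  PCB_FULL_IRET forces the eventual return to user mode to
 * take the full iret path, so the rewritten frame is honored exactly.
 */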
int
ptrace_set_pc(struct thread *td, unsigned long addr)
{

	td->td_frame->tf_rip = addr;
	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
	return (0);
}

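/*
 * Single-stepping toggles PSL_T, the x86 trace flag in rflags; while it
 * is set the CPU delivers a debug exception after each instruction the
 * thread executes in user mode.
 */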
int
ptrace_single_step(struct thread *td)
{
	td->td_frame->tf_rflags |= PSL_T;
	return (0);
}

int
ptrace_clear_single_step(struct thread *td)
{
	td->td_frame->tf_rflags &= ~PSL_T;
	return (0);
}

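/*
 * Copy the general-purpose register set between a thread's trap frame
 * and the ptrace(2)/core-dump representation.  The data segment
 * selectors are only meaningful when the frame carries them
 * (TF_HASSEGS); otherwise they read back as zero.
 */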
int
fill_regs(struct thread *td, struct reg *regs)
{
	struct trapframe *tp;

	tp = td->td_frame;
	return (fill_frame_regs(tp, regs));
}

int
fill_frame_regs(struct trapframe *tp, struct reg *regs)
{
	regs->r_r15 = tp->tf_r15;
	regs->r_r14 = tp->tf_r14;
	regs->r_r13 = tp->tf_r13;
	regs->r_r12 = tp->tf_r12;
	regs->r_r11 = tp->tf_r11;
	regs->r_r10 = tp->tf_r10;
	regs->r_r9  = tp->tf_r9;
	regs->r_r8  = tp->tf_r8;
	regs->r_rdi = tp->tf_rdi;
	regs->r_rsi = tp->tf_rsi;
	regs->r_rbp = tp->tf_rbp;
	regs->r_rbx = tp->tf_rbx;
	regs->r_rdx = tp->tf_rdx;
	regs->r_rcx = tp->tf_rcx;
	regs->r_rax = tp->tf_rax;
	regs->r_rip = tp->tf_rip;
	regs->r_cs = tp->tf_cs;
	regs->r_rflags = tp->tf_rflags;
	regs->r_rsp = tp->tf_rsp;
	regs->r_ss = tp->tf_ss;
	if (tp->tf_flags & TF_HASSEGS) {
		regs->r_ds = tp->tf_ds;
		regs->r_es = tp->tf_es;
		regs->r_fs = tp->tf_fs;
		regs->r_gs = tp->tf_gs;
	} else {
		regs->r_ds = 0;
		regs->r_es = 0;
		regs->r_fs = 0;
		regs->r_gs = 0;
	}
	return (0);
}

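/*
 * The write direction is validated: CS_SECURE rejects any code selector
 * that is not user-privileged, and EFL_SECURE permits changes to the
 * user-modifiable rflags bits only, so a debugger cannot grant its
 * target elevated i/o privilege or a kernel code segment.
 */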
int
set_regs(struct thread *td, struct reg *regs)
{
	struct trapframe *tp;
	register_t rflags;

	tp = td->td_frame;
	rflags = regs->r_rflags & 0xffffffff;
	if (!EFL_SECURE(rflags, tp->tf_rflags) || !CS_SECURE(regs->r_cs))
		return (EINVAL);
	tp->tf_r15 = regs->r_r15;
	tp->tf_r14 = regs->r_r14;
	tp->tf_r13 = regs->r_r13;
	tp->tf_r12 = regs->r_r12;
	tp->tf_r11 = regs->r_r11;
	tp->tf_r10 = regs->r_r10;
	tp->tf_r9  = regs->r_r9;
	tp->tf_r8  = regs->r_r8;
	tp->tf_rdi = regs->r_rdi;
	tp->tf_rsi = regs->r_rsi;
	tp->tf_rbp = regs->r_rbp;
	tp->tf_rbx = regs->r_rbx;
	tp->tf_rdx = regs->r_rdx;
	tp->tf_rcx = regs->r_rcx;
	tp->tf_rax = regs->r_rax;
	tp->tf_rip = regs->r_rip;
	tp->tf_cs = regs->r_cs;
	tp->tf_rflags = rflags;
	tp->tf_rsp = regs->r_rsp;
	tp->tf_ss = regs->r_ss;
	if (0) {	/* XXXKIB */
		tp->tf_ds = regs->r_ds;
		tp->tf_es = regs->r_es;
		tp->tf_fs = regs->r_fs;
		tp->tf_gs = regs->r_gs;
		tp->tf_flags = TF_HASSEGS;
	}
	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
	return (0);
}

/* XXX check all this stuff! */
/* externalize from sv_xmm */
static void
fill_fpregs_xmm(struct savefpu *sv_xmm, struct fpreg *fpregs)
{
	struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
	struct envxmm *penv_xmm = &sv_xmm->sv_env;
	int i;

	/* pcb -> fpregs */
	bzero(fpregs, sizeof(*fpregs));

	/* FPU control/status */
	penv_fpreg->en_cw = penv_xmm->en_cw;
	penv_fpreg->en_sw = penv_xmm->en_sw;
	penv_fpreg->en_tw = penv_xmm->en_tw;
	penv_fpreg->en_opcode = penv_xmm->en_opcode;
	penv_fpreg->en_rip = penv_xmm->en_rip;
	penv_fpreg->en_rdp = penv_xmm->en_rdp;
	penv_fpreg->en_mxcsr = penv_xmm->en_mxcsr;
	penv_fpreg->en_mxcsr_mask = penv_xmm->en_mxcsr_mask;

	/* FPU registers */
	for (i = 0; i < 8; ++i)
		bcopy(sv_xmm->sv_fp[i].fp_acc.fp_bytes, fpregs->fpr_acc[i], 10);

	/* SSE registers */
	for (i = 0; i < 16; ++i)
		bcopy(sv_xmm->sv_xmm[i].xmm_bytes, fpregs->fpr_xacc[i], 16);
}

/* internalize from fpregs into sv_xmm */
static void
set_fpregs_xmm(struct fpreg *fpregs, struct savefpu *sv_xmm)
{
	struct envxmm *penv_xmm = &sv_xmm->sv_env;
	struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
	int i;

	/* fpregs -> pcb */
	/* FPU control/status */
	penv_xmm->en_cw = penv_fpreg->en_cw;
	penv_xmm->en_sw = penv_fpreg->en_sw;
	penv_xmm->en_tw = penv_fpreg->en_tw;
	penv_xmm->en_opcode = penv_fpreg->en_opcode;
	penv_xmm->en_rip = penv_fpreg->en_rip;
	penv_xmm->en_rdp = penv_fpreg->en_rdp;
	penv_xmm->en_mxcsr = penv_fpreg->en_mxcsr;
	penv_xmm->en_mxcsr_mask = penv_fpreg->en_mxcsr_mask & cpu_mxcsr_mask;

	/* FPU registers */
	for (i = 0; i < 8; ++i)
		bcopy(fpregs->fpr_acc[i], sv_xmm->sv_fp[i].fp_acc.fp_bytes, 10);

	/* SSE registers */
	for (i = 0; i < 16; ++i)
		bcopy(fpregs->fpr_xacc[i], sv_xmm->sv_xmm[i].xmm_bytes, 16);
}

/* externalize from td->pcb */
int
fill_fpregs(struct thread *td, struct fpreg *fpregs)
{

	KASSERT(td == curthread || TD_IS_SUSPENDED(td) ||
	    P_SHOULDSTOP(td->td_proc),
	    ("not suspended thread %p", td));
	fpugetregs(td);
	fill_fpregs_xmm(get_pcb_user_save_td(td), fpregs);
	return (0);
}

/* internalize to td->pcb */
int
set_fpregs(struct thread *td, struct fpreg *fpregs)
{

	set_fpregs_xmm(fpregs, get_pcb_user_save_td(td));
	fpuuserinited(td);
	return (0);
}

/*
 * Get machine context.  With GET_MC_CLEAR_RET the saved context is
 * adjusted to look like a successful syscall return (%rax and %rdx
 * zeroed, carry clear), which is what a context resumed after
 * getcontext(2) should observe.
 */
int
get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
{
	struct pcb *pcb;
	struct trapframe *tp;

	pcb = td->td_pcb;
	tp = td->td_frame;
	PROC_LOCK(curthread->td_proc);
	mcp->mc_onstack = sigonstack(tp->tf_rsp);
	PROC_UNLOCK(curthread->td_proc);
	mcp->mc_r15 = tp->tf_r15;
	mcp->mc_r14 = tp->tf_r14;
	mcp->mc_r13 = tp->tf_r13;
	mcp->mc_r12 = tp->tf_r12;
	mcp->mc_r11 = tp->tf_r11;
	mcp->mc_r10 = tp->tf_r10;
	mcp->mc_r9  = tp->tf_r9;
	mcp->mc_r8  = tp->tf_r8;
	mcp->mc_rdi = tp->tf_rdi;
	mcp->mc_rsi = tp->tf_rsi;
	mcp->mc_rbp = tp->tf_rbp;
	mcp->mc_rbx = tp->tf_rbx;
	mcp->mc_rcx = tp->tf_rcx;
	mcp->mc_rflags = tp->tf_rflags;
	if (flags & GET_MC_CLEAR_RET) {
		mcp->mc_rax = 0;
		mcp->mc_rdx = 0;
		mcp->mc_rflags &= ~PSL_C;
	} else {
		mcp->mc_rax = tp->tf_rax;
		mcp->mc_rdx = tp->tf_rdx;
	}
	mcp->mc_rip = tp->tf_rip;
	mcp->mc_cs = tp->tf_cs;
	mcp->mc_rsp = tp->tf_rsp;
	mcp->mc_ss = tp->tf_ss;
	mcp->mc_ds = tp->tf_ds;
	mcp->mc_es = tp->tf_es;
	mcp->mc_fs = tp->tf_fs;
	mcp->mc_gs = tp->tf_gs;
	mcp->mc_flags = tp->tf_flags;
	mcp->mc_len = sizeof(*mcp);
	get_fpcontext(td, mcp, NULL, 0);
	update_pcb_bases(pcb);
	mcp->mc_fsbase = pcb->pcb_fsbase;
	mcp->mc_gsbase = pcb->pcb_gsbase;
	mcp->mc_xfpustate = 0;
	mcp->mc_xfpustate_len = 0;
	bzero(mcp->mc_spare, sizeof(mcp->mc_spare));
	return (0);
}

/*
 * Set machine context.
 *
 * Only the user-modifiable rflags bits are honored, and the %cs
 * selector is left untouched.
 */
int
set_mcontext(struct thread *td, mcontext_t *mcp)
{
	struct pcb *pcb;
	struct trapframe *tp;
	char *xfpustate;
	long rflags;
	int ret;

	pcb = td->td_pcb;
	tp = td->td_frame;
	if (mcp->mc_len != sizeof(*mcp) ||
	    (mcp->mc_flags & ~_MC_FLAG_MASK) != 0)
		return (EINVAL);
	rflags = (mcp->mc_rflags & PSL_USERCHANGE) |
	    (tp->tf_rflags & ~PSL_USERCHANGE);
	if (mcp->mc_flags & _MC_HASFPXSTATE) {
		if (mcp->mc_xfpustate_len > cpu_max_ext_state_size -
		    sizeof(struct savefpu))
			return (EINVAL);
		xfpustate = __builtin_alloca(mcp->mc_xfpustate_len);
		ret = copyin((void *)mcp->mc_xfpustate, xfpustate,
		    mcp->mc_xfpustate_len);
		if (ret != 0)
			return (ret);
	} else
		xfpustate = NULL;
	ret = set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len);
	if (ret != 0)
		return (ret);
	tp->tf_r15 = mcp->mc_r15;
	tp->tf_r14 = mcp->mc_r14;
	tp->tf_r13 = mcp->mc_r13;
	tp->tf_r12 = mcp->mc_r12;
	tp->tf_r11 = mcp->mc_r11;
	tp->tf_r10 = mcp->mc_r10;
	tp->tf_r9  = mcp->mc_r9;
	tp->tf_r8  = mcp->mc_r8;
	tp->tf_rdi = mcp->mc_rdi;
	tp->tf_rsi = mcp->mc_rsi;
	tp->tf_rbp = mcp->mc_rbp;
	tp->tf_rbx = mcp->mc_rbx;
	tp->tf_rdx = mcp->mc_rdx;
	tp->tf_rcx = mcp->mc_rcx;
	tp->tf_rax = mcp->mc_rax;
	tp->tf_rip = mcp->mc_rip;
	tp->tf_rflags = rflags;
	tp->tf_rsp = mcp->mc_rsp;
	tp->tf_ss = mcp->mc_ss;
	tp->tf_flags = mcp->mc_flags;
	if (tp->tf_flags & TF_HASSEGS) {
		tp->tf_ds = mcp->mc_ds;
		tp->tf_es = mcp->mc_es;
		tp->tf_fs = mcp->mc_fs;
		tp->tf_gs = mcp->mc_gs;
	}
	set_pcb_flags(pcb, PCB_FULL_IRET);
	if (mcp->mc_flags & _MC_HASBASES) {
		pcb->pcb_fsbase = mcp->mc_fsbase;
		pcb->pcb_gsbase = mcp->mc_gsbase;
	}
	return (0);
}

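/*
 * Capture the FPU/SSE state for a context.  When XSAVE is in use, the
 * extended state beyond the legacy struct savefpu is copied into the
 * caller-supplied xfpusave buffer as well, clamped to the size the
 * hardware actually provides.
 */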
static void
get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave,
    size_t xfpusave_len)
{
	size_t max_len, len;

	mcp->mc_ownedfp = fpugetregs(td);
	bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate[0],
	    sizeof(mcp->mc_fpstate));
	mcp->mc_fpformat = fpuformat();
	if (!use_xsave || xfpusave_len == 0)
		return;
	max_len = cpu_max_ext_state_size - sizeof(struct savefpu);
	len = xfpusave_len;
	if (len > max_len) {
		/* Zero the tail of the buffer that we cannot fill. */
		bzero(xfpusave + max_len, len - max_len);
		len = max_len;
	}
	mcp->mc_flags |= _MC_HASFPXSTATE;
	mcp->mc_xfpustate_len = len;
	bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len);
}

static int
set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate,
    size_t xfpustate_len)
{
	struct savefpu *fpstate;
	int error;

	if (mcp->mc_fpformat == _MC_FPFMT_NODEV)
		return (0);
	else if (mcp->mc_fpformat != _MC_FPFMT_XMM)
		return (EINVAL);
	else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) {
		/* We don't care what state is left in the FPU or PCB. */
		fpstate_drop(td);
		error = 0;
	} else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU ||
	    mcp->mc_ownedfp == _MC_FPOWNED_PCB) {
		fpstate = (struct savefpu *)&mcp->mc_fpstate;
		fpstate->sv_env.en_mxcsr &= cpu_mxcsr_mask;
		error = fpusetregs(td, fpstate, xfpustate, xfpustate_len);
	} else
		return (EINVAL);
	return (error);
}

void
fpstate_drop(struct thread *td)
{

	KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu"));
	critical_enter();
	if (PCPU_GET(fpcurthread) == td)
		fpudrop();
	/*
	 * XXX force a full drop of the fpu.  The above only drops it if we
	 * owned it.
	 *
	 * XXX I don't much like fpugetuserregs()'s semantics of doing a full
	 * drop.  Dropping only to the pcb matches fnsave's behaviour.
	 * We only need to drop to !PCB_INITDONE in sendsig().  But
	 * sendsig() is the only caller of fpugetuserregs()... perhaps we just
	 * have too many layers.
	 */
	clear_pcb_flags(curthread->td_pcb,
	    PCB_FPUINITDONE | PCB_USERFPUINITDONE);
	critical_exit();
}

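/*
 * Debug register access for ptrace(2) and ddb: dr0-dr3 hold breakpoint
 * addresses, dr6 reports status and dr7 holds control bits.  dr4/dr5
 * are reserved aliases and dr8-dr15 do not exist architecturally, so
 * they are always reported as zero.
 */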
int
fill_dbregs(struct thread *td, struct dbreg *dbregs)
{
	struct pcb *pcb;

	if (td == NULL) {
		dbregs->dr[0] = rdr0();
		dbregs->dr[1] = rdr1();
		dbregs->dr[2] = rdr2();
		dbregs->dr[3] = rdr3();
		dbregs->dr[6] = rdr6();
		dbregs->dr[7] = rdr7();
	} else {
		pcb = td->td_pcb;
		dbregs->dr[0] = pcb->pcb_dr0;
		dbregs->dr[1] = pcb->pcb_dr1;
		dbregs->dr[2] = pcb->pcb_dr2;
		dbregs->dr[3] = pcb->pcb_dr3;
		dbregs->dr[6] = pcb->pcb_dr6;
		dbregs->dr[7] = pcb->pcb_dr7;
	}
	dbregs->dr[4] = 0;
	dbregs->dr[5] = 0;
	dbregs->dr[8] = 0;
	dbregs->dr[9] = 0;
	dbregs->dr[10] = 0;
	dbregs->dr[11] = 0;
	dbregs->dr[12] = 0;
	dbregs->dr[13] = 0;
	dbregs->dr[14] = 0;
	dbregs->dr[15] = 0;
	return (0);
}

int
set_dbregs(struct thread *td, struct dbreg *dbregs)
{
	struct pcb *pcb;
	int i;

	if (td == NULL) {
		load_dr0(dbregs->dr[0]);
		load_dr1(dbregs->dr[1]);
		load_dr2(dbregs->dr[2]);
		load_dr3(dbregs->dr[3]);
		load_dr6(dbregs->dr[6]);
		load_dr7(dbregs->dr[7]);
	} else {
		/*
		 * Don't let an illegal value for dr7 get set.  Specifically,
		 * check for undefined settings.  Setting these bit patterns
		 * results in undefined behaviour and can lead to an
		 * unexpected TRCTRAP or a general protection fault right
		 * here.  The upper bits of dr6 and dr7 must not be set.
		 */
		for (i = 0; i < 4; i++) {
			if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02)
				return (EINVAL);
			if (td->td_frame->tf_cs == _ucode32sel &&
			    DBREG_DR7_LEN(dbregs->dr[7], i) == DBREG_DR7_LEN_8)
				return (EINVAL);
		}
		if ((dbregs->dr[6] & 0xffffffff00000000ul) != 0 ||
		    (dbregs->dr[7] & 0xffffffff00000000ul) != 0)
			return (EINVAL);

		pcb = td->td_pcb;

		/*
		 * Don't let a process set a breakpoint that is not within the
		 * process's address space.  If a process could do this, it
		 * could halt the system by setting a breakpoint in the kernel
		 * (if ddb was enabled).  Thus, we need to check to make sure
		 * that no breakpoints are being enabled for addresses outside
		 * the process's address space.
		 *
		 * XXX - what about when the watched area of the user's
		 * address space is written into from within the kernel
		 * ... wouldn't that still cause a breakpoint to be generated
		 * from within kernel mode?
		 */

		if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) {
			/* dr0 is enabled */
			if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS)
				return (EINVAL);
		}
		if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) {
			/* dr1 is enabled */
			if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS)
				return (EINVAL);
		}
		if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) {
			/* dr2 is enabled */
			if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS)
				return (EINVAL);
		}
		if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) {
			/* dr3 is enabled */
			if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS)
				return (EINVAL);
		}

		pcb->pcb_dr0 = dbregs->dr[0];
		pcb->pcb_dr1 = dbregs->dr[1];
		pcb->pcb_dr2 = dbregs->dr[2];
		pcb->pcb_dr3 = dbregs->dr[3];
		pcb->pcb_dr6 = dbregs->dr[6];
		pcb->pcb_dr7 = dbregs->dr[7];

		set_pcb_flags(pcb, PCB_DBREGS);
	}

	return (0);
}

void
reset_dbregs(void)
{

	load_dr7(0);	/* Turn off the control bits first */
	load_dr0(0);
	load_dr1(0);
	load_dr2(0);
	load_dr3(0);
	load_dr6(0);
}

/*
 * Return > 0 if a hardware breakpoint has been hit, and the
 * breakpoint was in user space.  Return 0 otherwise.
 */
int
user_dbreg_trap(void)
{
	u_int64_t dr7, dr6; /* debug registers dr6 and dr7 */
	u_int64_t bp;       /* breakpoint bits extracted from dr6 */
	int nbp;            /* number of breakpoints that triggered */
	caddr_t addr[4];    /* breakpoint addresses */
	int i;

	dr7 = rdr7();
	if ((dr7 & 0x000000ff) == 0) {
		/*
		 * None of the local or global breakpoint enable bits in
		 * dr7 are set, thus the trap couldn't have been caused by
		 * the hardware debug registers.
		 */
		return (0);
	}

	nbp = 0;
	dr6 = rdr6();
	bp = dr6 & 0x0000000f;

	if (!bp) {
		/*
		 * None of the breakpoint bits are set, meaning this
		 * trap was not caused by any of the debug registers.
		 */
		return (0);
	}

	/*
	 * At least one of the breakpoints was hit; check which ones
	 * and whether any of them are user space addresses.
	 */

	if (bp & 0x01) {
		addr[nbp++] = (caddr_t)rdr0();
	}
	if (bp & 0x02) {
		addr[nbp++] = (caddr_t)rdr1();
	}
	if (bp & 0x04) {
		addr[nbp++] = (caddr_t)rdr2();
	}
	if (bp & 0x08) {
		addr[nbp++] = (caddr_t)rdr3();
	}

	for (i = 0; i < nbp; i++) {
		if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) {
			/*
			 * addr[i] is in user space
			 */
			return (nbp);
		}
	}

	/*
	 * None of the breakpoints are in user space.
	 */
	return (0);
}

/*
 * pcb_flags is only modified by the current thread, or by other threads
 * when the current thread is stopped.  However, the current thread may
 * change it from interrupt context in cpu_switch(), or in the trap
 * handler.  When we read-modify-write pcb_flags from C sources, the
 * compiler may generate code that is not atomic with respect to the
 * interrupt handler.  If a trap or interrupt happens and any flag is
 * modified from the handler, it can be clobbered with the cached value
 * later.  Therefore, we implement setting and clearing flags with
 * single-instruction functions, which do not race with possible
 * modification of the flags from the trap or interrupt context, because
 * traps and interrupts are executed only on instruction boundaries.
 */
void
set_pcb_flags_raw(struct pcb *pcb, const u_int flags)
{

	__asm __volatile("orl %1,%0"
	    : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags)
	    : "cc", "memory");
}

/*
 * The support for RDFSBASE, WRFSBASE and similar instructions for the
 * %gs base requires that the kernel save MSR_FSBASE and MSR_{K,}GSBASE
 * into the pcb if user space modified the bases.  We must save them on
 * the context switch or if the return to usermode happens through
 * doreti.
 *
 * Tracking of both events is performed by the pcb flag PCB_FULL_IRET,
 * which has the consequence that the base MSRs must be saved each time
 * the PCB_FULL_IRET flag is set.  We disable interrupts to sync with
 * context switches.
 */
void
set_pcb_flags(struct pcb *pcb, const u_int flags)
{
	register_t r;

	if (curpcb == pcb &&
	    (flags & PCB_FULL_IRET) != 0 &&
	    (pcb->pcb_flags & PCB_FULL_IRET) == 0 &&
	    (cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0) {
		r = intr_disable();
		if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) {
			if (rfs() == _ufssel)
				pcb->pcb_fsbase = rdfsbase();
			if (rgs() == _ugssel)
				pcb->pcb_gsbase = rdmsr(MSR_KGSBASE);
		}
		set_pcb_flags_raw(pcb, flags);
		intr_restore(r);
	} else {
		set_pcb_flags_raw(pcb, flags);
	}
}

void
clear_pcb_flags(struct pcb *pcb, const u_int flags)
{

	__asm __volatile("andl %1,%0"
	    : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags)
	    : "cc", "memory");
}

#ifdef KDB

/*
 * Provide inb() and outb() as functions.  They are normally only available as
 * inline functions, thus cannot be called from the debugger.
 */

/* silence compiler warnings */
u_char inb_(u_short);
void outb_(u_short, u_char);

u_char
inb_(u_short port)
{
	return (inb(port));
}

void
outb_(u_short port, u_char data)
{
	outb(port, data);
}

#endif /* KDB */
