machdep.c revision 337262
1/*-
2 * Copyright (c) 2003 Peter Wemm.
3 * Copyright (c) 1992 Terrence R. Lambert.
4 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to Berkeley by
8 * William Jolitz.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 *    must display the following acknowledgement:
20 *	This product includes software developed by the University of
21 *	California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 *    may be used to endorse or promote products derived from this software
24 *    without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 *	from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
39 */
40
41#include <sys/cdefs.h>
42__FBSDID("$FreeBSD: stable/11/sys/amd64/amd64/machdep.c 337262 2018-08-03 15:42:39Z markj $");
43
44#include "opt_atpic.h"
45#include "opt_compat.h"
46#include "opt_cpu.h"
47#include "opt_ddb.h"
48#include "opt_inet.h"
49#include "opt_isa.h"
50#include "opt_kstack_pages.h"
51#include "opt_maxmem.h"
52#include "opt_mp_watchdog.h"
53#include "opt_perfmon.h"
54#include "opt_platform.h"
55#include "opt_sched.h"
56
57#include <sys/param.h>
58#include <sys/proc.h>
59#include <sys/systm.h>
60#include <sys/bio.h>
61#include <sys/buf.h>
62#include <sys/bus.h>
63#include <sys/callout.h>
64#include <sys/cons.h>
65#include <sys/cpu.h>
66#include <sys/efi.h>
67#include <sys/eventhandler.h>
68#include <sys/exec.h>
69#include <sys/imgact.h>
70#include <sys/kdb.h>
71#include <sys/kernel.h>
72#include <sys/ktr.h>
73#include <sys/linker.h>
74#include <sys/lock.h>
75#include <sys/malloc.h>
76#include <sys/memrange.h>
77#include <sys/msgbuf.h>
78#include <sys/mutex.h>
79#include <sys/pcpu.h>
80#include <sys/ptrace.h>
81#include <sys/reboot.h>
82#include <sys/rwlock.h>
83#include <sys/sched.h>
84#include <sys/signalvar.h>
85#ifdef SMP
86#include <sys/smp.h>
87#endif
88#include <sys/syscallsubr.h>
89#include <sys/sysctl.h>
90#include <sys/sysent.h>
91#include <sys/sysproto.h>
92#include <sys/ucontext.h>
93#include <sys/vmmeter.h>
94
95#include <vm/vm.h>
96#include <vm/vm_extern.h>
97#include <vm/vm_kern.h>
98#include <vm/vm_page.h>
99#include <vm/vm_map.h>
100#include <vm/vm_object.h>
101#include <vm/vm_pager.h>
102#include <vm/vm_param.h>
103#include <vm/vm_phys.h>
104
105#ifdef DDB
106#ifndef KDB
107#error KDB must be enabled in order for DDB to work!
108#endif
109#include <ddb/ddb.h>
110#include <ddb/db_sym.h>
111#endif
112
113#include <net/netisr.h>
114
115#include <machine/clock.h>
116#include <machine/cpu.h>
117#include <machine/cputypes.h>
118#include <machine/frame.h>
119#include <machine/intr_machdep.h>
120#include <x86/mca.h>
121#include <machine/md_var.h>
122#include <machine/metadata.h>
123#include <machine/mp_watchdog.h>
124#include <machine/pc/bios.h>
125#include <machine/pcb.h>
126#include <machine/proc.h>
127#include <machine/reg.h>
128#include <machine/sigframe.h>
129#include <machine/specialreg.h>
130#ifdef PERFMON
131#include <machine/perfmon.h>
132#endif
133#include <machine/tss.h>
134#ifdef SMP
135#include <machine/smp.h>
136#endif
137#ifdef FDT
138#include <x86/fdt.h>
139#endif
140
141#ifdef DEV_ATPIC
142#include <x86/isa/icu.h>
143#else
144#include <x86/apicvar.h>
145#endif
146
147#include <isa/isareg.h>
148#include <isa/rtc.h>
149#include <x86/init.h>
150
151/* Sanity check for __curthread(), which loads pc_curthread from %gs:0. */
152CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
153
154/*
155 * The PTI trampoline stack needs enough space for a hardware trapframe and a
156 * couple of scratch registers, as well as the trapframe left behind after an
157 * iret fault.
158 */
159CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) -
160    offsetof(struct pti_frame, pti_rip));
161
162extern u_int64_t hammer_time(u_int64_t, u_int64_t);
163
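/*
 * A brief sketch of the checks applied below to user-supplied contexts:
 * CS_SECURE() accepts a code selector only when its privilege level is
 * SEL_UPL (ring 3), and EFL_SECURE() accepts a new rflags value only
 * when every bit outside PSL_USERCHANGE is unchanged, so e.g. IOPL
 * cannot be raised through a forged sigcontext.
 */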
164#define	CS_SECURE(cs)		(ISPL(cs) == SEL_UPL)
165#define	EFL_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
166
167static void cpu_startup(void *);
168static void get_fpcontext(struct thread *td, mcontext_t *mcp,
169    char *xfpusave, size_t xfpusave_len);
170static int  set_fpcontext(struct thread *td, mcontext_t *mcp,
171    char *xfpustate, size_t xfpustate_len);
172SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
173
174/* Preload data parse function */
175static caddr_t native_parse_preload_data(u_int64_t);
176
177/* Native function to fetch and parse the e820 map */
178static void native_parse_memmap(caddr_t, vm_paddr_t *, int *);
179
180/* Default init_ops implementation. */
181struct init_ops init_ops = {
182	.parse_preload_data =	native_parse_preload_data,
183	.early_clock_source_init =	i8254_init,
184	.early_delay =			i8254_delay,
185	.parse_memmap =			native_parse_memmap,
186#ifdef SMP
187	.mp_bootaddress =		mp_bootaddress,
188	.start_all_aps =		native_start_all_aps,
189#endif
190	.msi_init =			msi_init,
191};
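/*
 * Platforms with a different early environment are expected to install
 * their own table over init_ops before hammer_time() starts consuming
 * it; the Xen PVH entry path, for instance, substitutes its own
 * preload-data and AP-startup hooks.
 */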
192
193struct msgbuf *msgbufp;
194
195/*
196 * Physical address of the EFI System Table. Stashed from the metadata hints
197 * passed into the kernel and used by the EFI code to call runtime services.
198 */
199vm_paddr_t efi_systbl_phys;
200
201/* Intel ICH registers */
202#define ICH_PMBASE	0x400
203#define ICH_SMI_EN	(ICH_PMBASE + 0x30)
204
205int	_udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel;
206
207int cold = 1;
208
209long Maxmem = 0;
210long realmem = 0;
211
212/*
213 * The number of PHYSMAP entries must be one less than the number of
214 * PHYSSEG entries because the PHYSMAP entry that spans the largest
215 * physical address that is accessible by ISA DMA is split into two
216 * PHYSSEG entries.
217 */
218#define	PHYSMAP_SIZE	(2 * (VM_PHYSSEG_MAX - 1))
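/*
 * A worked example, assuming the stock amd64 VM_PHYSSEG_MAX of 63:
 * PHYSMAP_SIZE is 2 * 62 = 124 slots (62 base/bound pairs), and the
 * two extra slots in phys_avail[] and dump_avail[] leave room for the
 * terminating 0/0 sentinel pair noted below.
 */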
219
220vm_paddr_t phys_avail[PHYSMAP_SIZE + 2];
221vm_paddr_t dump_avail[PHYSMAP_SIZE + 2];
222
223/* Must be 2 less so a 0/0 pair can signal the end of chunks. */
224#define	PHYS_AVAIL_ARRAY_END (nitems(phys_avail) - 2)
225#define	DUMP_AVAIL_ARRAY_END (nitems(dump_avail) - 2)
226
227struct kva_md_info kmi;
228
229static struct trapframe proc0_tf;
230struct region_descriptor r_gdt, r_idt;
231
232struct pcpu __pcpu[MAXCPU];
233
234struct mtx icu_lock;
235
236struct mem_range_softc mem_range_softc;
237
238struct mtx dt_lock;	/* lock for GDT and LDT */
239
240void (*vmm_resume_p)(void);
241
242static void
243cpu_startup(void *dummy)
244{
246	uintmax_t memsize;
247	char *sysenv;
248
249	/*
250	 * On MacBooks, we must prevent the legacy USB circuit from
251	 * generating an SMI#, because this can cause several problems,
252	 * namely: incorrect CPU frequency detection and failure to
253	 * start the APs.
254	 * We do this by disabling a bit in the SMI_EN (SMI Control and
255	 * Enable register) of the Intel ICH LPC Interface Bridge.
256	 */
257	sysenv = kern_getenv("smbios.system.product");
258	if (sysenv != NULL) {
259		if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
260		    strncmp(sysenv, "MacBook3,1", 10) == 0 ||
261		    strncmp(sysenv, "MacBook4,1", 10) == 0 ||
262		    strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
263		    strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
264		    strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
265		    strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
266		    strncmp(sysenv, "Macmini1,1", 10) == 0) {
267			if (bootverbose)
268				printf("Disabling LEGACY_USB_EN bit on "
269				    "Intel ICH.\n");
270			outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
271		}
272		freeenv(sysenv);
273	}
274
275	/*
276	 * Good {morning,afternoon,evening,night}.
277	 */
278	startrtclock();
279	printcpuinfo();
280#ifdef PERFMON
281	perfmon_init();
282#endif
283
284	/*
285	 * Display physical memory if SMBIOS reports a reasonable amount.
286	 */
287	memsize = 0;
288	sysenv = kern_getenv("smbios.memory.enabled");
289	if (sysenv != NULL) {
290		memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
291		freeenv(sysenv);
292	}
293	if (memsize < ptoa((uintmax_t)vm_cnt.v_free_count))
294		memsize = ptoa((uintmax_t)Maxmem);
295	printf("real memory  = %ju (%ju MB)\n", memsize, memsize >> 20);
296	realmem = atop(memsize);
297
298	/*
299	 * Display any holes after the first chunk of extended memory.
300	 */
301	if (bootverbose) {
302		int indx;
303
304		printf("Physical memory chunk(s):\n");
305		for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
306			vm_paddr_t size;
307
308			size = phys_avail[indx + 1] - phys_avail[indx];
309			printf(
310			    "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
311			    (uintmax_t)phys_avail[indx],
312			    (uintmax_t)phys_avail[indx + 1] - 1,
313			    (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
314		}
315	}
316
317	vm_ksubmap_init(&kmi);
318
319	printf("avail memory = %ju (%ju MB)\n",
320	    ptoa((uintmax_t)vm_cnt.v_free_count),
321	    ptoa((uintmax_t)vm_cnt.v_free_count) / 1048576);
322
323	/*
324	 * Set up buffers, so they can be used to read disk labels.
325	 */
326	bufinit();
327	vm_pager_bufferinit();
328
329	cpu_setregs();
330}
331
332/*
333 * Send an interrupt (signal) to a process.
334 *
335 * The user stack is set up so that the sigcode trampoline
336 * stored at its top calls the signal handler, followed by
337 * a call to the sigreturn routine below.  After sigreturn
338 * resets the signal mask, the stack, and the frame
339 * pointer, it returns to the
340 * user-specified pc and psl.
341 */
342void
343sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
344{
345	struct sigframe sf, *sfp;
346	struct pcb *pcb;
347	struct proc *p;
348	struct thread *td;
349	struct sigacts *psp;
350	char *sp;
351	struct trapframe *regs;
352	char *xfpusave;
353	size_t xfpusave_len;
354	int sig;
355	int oonstack;
356
357	td = curthread;
358	pcb = td->td_pcb;
359	p = td->td_proc;
360	PROC_LOCK_ASSERT(p, MA_OWNED);
361	sig = ksi->ksi_signo;
362	psp = p->p_sigacts;
363	mtx_assert(&psp->ps_mtx, MA_OWNED);
364	regs = td->td_frame;
365	oonstack = sigonstack(regs->tf_rsp);
366
367	if (cpu_max_ext_state_size > sizeof(struct savefpu) && use_xsave) {
368		xfpusave_len = cpu_max_ext_state_size - sizeof(struct savefpu);
369		xfpusave = __builtin_alloca(xfpusave_len);
370	} else {
371		xfpusave_len = 0;
372		xfpusave = NULL;
373	}
374
375	/* Save user context. */
376	bzero(&sf, sizeof(sf));
377	sf.sf_uc.uc_sigmask = *mask;
378	sf.sf_uc.uc_stack = td->td_sigstk;
379	sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
380	    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
381	sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
382	bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(*regs));
383	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
384	get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len);
385	fpstate_drop(td);
386	update_pcb_bases(pcb);
387	sf.sf_uc.uc_mcontext.mc_fsbase = pcb->pcb_fsbase;
388	sf.sf_uc.uc_mcontext.mc_gsbase = pcb->pcb_gsbase;
389	bzero(sf.sf_uc.uc_mcontext.mc_spare,
390	    sizeof(sf.sf_uc.uc_mcontext.mc_spare));
391	bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__));
392
393	/* Allocate space for the signal handler context. */
394	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
395	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
396		sp = (char *)td->td_sigstk.ss_sp + td->td_sigstk.ss_size;
397#if defined(COMPAT_43)
398		td->td_sigstk.ss_flags |= SS_ONSTACK;
399#endif
400	} else
401		sp = (char *)regs->tf_rsp - 128;
402	if (xfpusave != NULL) {
403		sp -= xfpusave_len;
404		sp = (char *)((unsigned long)sp & ~0x3Ful);
405		sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp;
406	}
407	sp -= sizeof(struct sigframe);
408	/* Align to 16 bytes. */
409	sfp = (struct sigframe *)((unsigned long)sp & ~0xFul);
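	/*
	 * The resulting user stack layout, sketched from high to low
	 * addresses:
	 *
	 *	interrupted rsp (or the top of the alternate signal stack)
	 *	128-byte red zone, reserved only when staying on the
	 *	    normal stack
	 *	xsave extension area, 64-byte aligned (when xfpusave != NULL)
	 *	struct sigframe, 16-byte aligned	<- sfp, the new rsp
	 */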
410
411	/* Build the argument list for the signal handler. */
412	regs->tf_rdi = sig;			/* arg 1 in %rdi */
413	regs->tf_rdx = (register_t)&sfp->sf_uc;	/* arg 3 in %rdx */
414	bzero(&sf.sf_si, sizeof(sf.sf_si));
415	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
416		/* Signal handler installed with SA_SIGINFO. */
417		regs->tf_rsi = (register_t)&sfp->sf_si;	/* arg 2 in %rsi */
418		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;
419
420		/* Fill in POSIX parts */
421		sf.sf_si = ksi->ksi_info;
422		sf.sf_si.si_signo = sig; /* maybe a translated signal */
423		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
424	} else {
425		/* Old FreeBSD-style arguments. */
426		regs->tf_rsi = ksi->ksi_code;	/* arg 2 in %rsi */
427		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
428		sf.sf_ahu.sf_handler = catcher;
429	}
430	mtx_unlock(&psp->ps_mtx);
431	PROC_UNLOCK(p);
432
433	/*
434	 * Copy the sigframe out to the user's stack.
435	 */
436	if (copyout(&sf, sfp, sizeof(*sfp)) != 0 ||
437	    (xfpusave != NULL && copyout(xfpusave,
438	    (void *)sf.sf_uc.uc_mcontext.mc_xfpustate, xfpusave_len)
439	    != 0)) {
440#ifdef DEBUG
441		printf("process %ld has trashed its stack\n", (long)p->p_pid);
442#endif
443		PROC_LOCK(p);
444		sigexit(td, SIGILL);
445	}
446
447	regs->tf_rsp = (long)sfp;
448	regs->tf_rip = p->p_sysent->sv_sigcode_base;
449	regs->tf_rflags &= ~(PSL_T | PSL_D);
450	regs->tf_cs = _ucodesel;
451	regs->tf_ds = _udatasel;
452	regs->tf_ss = _udatasel;
453	regs->tf_es = _udatasel;
454	regs->tf_fs = _ufssel;
455	regs->tf_gs = _ugssel;
456	regs->tf_flags = TF_HASSEGS;
457	PROC_LOCK(p);
458	mtx_lock(&psp->ps_mtx);
459}
460
461/*
462 * System call to clean up state after a signal
463 * has been taken.  Reset signal mask and
464 * stack state from context left by sendsig (above).
465 * Return to previous pc and psl as specified by
466 * context left by sendsig. Check carefully to
467 * make sure that the user has not modified the
468 * state to gain improper privileges.
469 *
470 * MPSAFE
471 */
472int
473sys_sigreturn(struct thread *td,
474    struct sigreturn_args /* { const struct __ucontext *sigcntxp; } */ *uap)
475{
479	ucontext_t uc;
480	struct pcb *pcb;
481	struct proc *p;
482	struct trapframe *regs;
483	ucontext_t *ucp;
484	char *xfpustate;
485	size_t xfpustate_len;
486	long rflags;
487	int cs, error, ret;
488	ksiginfo_t ksi;
489
490	pcb = td->td_pcb;
491	p = td->td_proc;
492
493	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
494	if (error != 0) {
495		uprintf("pid %d (%s): sigreturn copyin failed\n",
496		    p->p_pid, td->td_name);
497		return (error);
498	}
499	ucp = &uc;
500	if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) {
501		uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid,
502		    td->td_name, ucp->uc_mcontext.mc_flags);
503		return (EINVAL);
504	}
505	regs = td->td_frame;
506	rflags = ucp->uc_mcontext.mc_rflags;
507	/*
508	 * Don't allow users to change privileged or reserved flags.
509	 */
510	if (!EFL_SECURE(rflags, regs->tf_rflags)) {
511		uprintf("pid %d (%s): sigreturn rflags = 0x%lx\n", p->p_pid,
512		    td->td_name, rflags);
513		return (EINVAL);
514	}
515
516	/*
517	 * Don't allow users to load a valid privileged %cs.  Let the
518	 * hardware check for invalid selectors, excess privilege in
519	 * other selectors, invalid %eip's and invalid %esp's.
520	 */
521	cs = ucp->uc_mcontext.mc_cs;
522	if (!CS_SECURE(cs)) {
523		uprintf("pid %d (%s): sigreturn cs = 0x%x\n", p->p_pid,
524		    td->td_name, cs);
525		ksiginfo_init_trap(&ksi);
526		ksi.ksi_signo = SIGBUS;
527		ksi.ksi_code = BUS_OBJERR;
528		ksi.ksi_trapno = T_PROTFLT;
529		ksi.ksi_addr = (void *)regs->tf_rip;
530		trapsignal(td, &ksi);
531		return (EINVAL);
532	}
533
534	if ((uc.uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) {
535		xfpustate_len = uc.uc_mcontext.mc_xfpustate_len;
536		if (xfpustate_len > cpu_max_ext_state_size -
537		    sizeof(struct savefpu)) {
538			uprintf("pid %d (%s): sigreturn xfpustate_len = 0x%zx\n",
539			    p->p_pid, td->td_name, xfpustate_len);
540			return (EINVAL);
541		}
542		xfpustate = __builtin_alloca(xfpustate_len);
543		error = copyin((const void *)uc.uc_mcontext.mc_xfpustate,
544		    xfpustate, xfpustate_len);
545		if (error != 0) {
546			uprintf(
547	"pid %d (%s): sigreturn copying xfpustate failed\n",
548			    p->p_pid, td->td_name);
549			return (error);
550		}
551	} else {
552		xfpustate = NULL;
553		xfpustate_len = 0;
554	}
555	ret = set_fpcontext(td, &ucp->uc_mcontext, xfpustate, xfpustate_len);
556	if (ret != 0) {
557		uprintf("pid %d (%s): sigreturn set_fpcontext err %d\n",
558		    p->p_pid, td->td_name, ret);
559		return (ret);
560	}
561	bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(*regs));
562	update_pcb_bases(pcb);
563	pcb->pcb_fsbase = ucp->uc_mcontext.mc_fsbase;
564	pcb->pcb_gsbase = ucp->uc_mcontext.mc_gsbase;
565
566#if defined(COMPAT_43)
567	if (ucp->uc_mcontext.mc_onstack & 1)
568		td->td_sigstk.ss_flags |= SS_ONSTACK;
569	else
570		td->td_sigstk.ss_flags &= ~SS_ONSTACK;
571#endif
572
573	kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
574	return (EJUSTRETURN);
575}
576
577#ifdef COMPAT_FREEBSD4
578int
579freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap)
580{
581
582	return (sys_sigreturn(td, (struct sigreturn_args *)uap));
583}
584#endif
585
586/*
587 * Reset registers to default values on exec.
588 */
589void
590exec_setregs(struct thread *td, struct image_params *imgp, u_long stack)
591{
592	struct trapframe *regs = td->td_frame;
593	struct pcb *pcb = td->td_pcb;
594
595	mtx_lock(&dt_lock);
596	if (td->td_proc->p_md.md_ldt != NULL)
597		user_ldt_free(td);
598	else
599		mtx_unlock(&dt_lock);
600
601	update_pcb_bases(pcb);
602	pcb->pcb_fsbase = 0;
603	pcb->pcb_gsbase = 0;
604	clear_pcb_flags(pcb, PCB_32BIT);
605	pcb->pcb_initial_fpucw = __INITIAL_FPUCW__;
606
607	bzero((char *)regs, sizeof(struct trapframe));
608	regs->tf_rip = imgp->entry_addr;
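	/*
	 * Enter userspace with rsp congruent to 8 modulo 16, i.e. as if
	 * a call had just pushed a return address onto an ABI-aligned
	 * stack.
	 */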
609	regs->tf_rsp = ((stack - 8) & ~0xFul) + 8;
610	regs->tf_rdi = stack;		/* argv */
611	regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T);
612	regs->tf_ss = _udatasel;
613	regs->tf_cs = _ucodesel;
614	regs->tf_ds = _udatasel;
615	regs->tf_es = _udatasel;
616	regs->tf_fs = _ufssel;
617	regs->tf_gs = _ugssel;
618	regs->tf_flags = TF_HASSEGS;
619	td->td_retval[1] = 0;
620
621	/*
622	 * Reset the hardware debug registers if they were in use.
623	 * They won't have any meaning for the newly exec'd process.
624	 */
625	if (pcb->pcb_flags & PCB_DBREGS) {
626		pcb->pcb_dr0 = 0;
627		pcb->pcb_dr1 = 0;
628		pcb->pcb_dr2 = 0;
629		pcb->pcb_dr3 = 0;
630		pcb->pcb_dr6 = 0;
631		pcb->pcb_dr7 = 0;
632		if (pcb == curpcb) {
633			/*
634			 * Clear the debug registers on the running
635			 * CPU, otherwise they will end up affecting
636			 * the next process we switch to.
637			 */
638			reset_dbregs();
639		}
640		clear_pcb_flags(pcb, PCB_DBREGS);
641	}
642
643	/*
644	 * Drop the FP state if we hold it, so that the process gets a
645	 * clean FP state if it uses the FPU again.
646	 */
647	fpstate_drop(td);
648}
649
650void
651cpu_setregs(void)
652{
653	register_t cr0;
654
655	cr0 = rcr0();
656	/*
657	 * CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the
658	 * BSP.  See the comments there about why we set them.
659	 */
660	cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
661	load_cr0(cr0);
662}
663
664/*
665 * Initialize amd64 and configure to run kernel
666 */
667
668/*
669 * Initialize segments & interrupt table
670 */
671
672struct user_segment_descriptor gdt[NGDT * MAXCPU];/* global descriptor tables */
673static struct gate_descriptor idt0[NIDT];
674struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */
675
676static char dblfault_stack[PAGE_SIZE] __aligned(16);
677static char mce0_stack[PAGE_SIZE] __aligned(16);
678static char nmi0_stack[PAGE_SIZE] __aligned(16);
679static char dbg0_stack[PAGE_SIZE] __aligned(16);
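/*
 * The NMI, MC# and DB# stacks each keep a struct nmi_pcpu at their top
 * so the handler can recover the CPU's pcpu pointer; the assertion
 * below pins the structure at 16 bytes so the usable stack below it
 * stays 16-byte aligned.
 */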
680CTASSERT(sizeof(struct nmi_pcpu) == 16);
681
682struct amd64tss common_tss[MAXCPU];
683
684/*
685 * Software prototypes -- in more palatable form.
686 *
687 * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
688 * slots as corresponding segments for i386 kernel.
689 */
690struct soft_segment_descriptor gdt_segs[] = {
691/* GNULL_SEL	0 Null Descriptor */
692{	.ssd_base = 0x0,
693	.ssd_limit = 0x0,
694	.ssd_type = 0,
695	.ssd_dpl = 0,
696	.ssd_p = 0,
697	.ssd_long = 0,
698	.ssd_def32 = 0,
699	.ssd_gran = 0		},
700/* GNULL2_SEL	1 Null Descriptor */
701{	.ssd_base = 0x0,
702	.ssd_limit = 0x0,
703	.ssd_type = 0,
704	.ssd_dpl = 0,
705	.ssd_p = 0,
706	.ssd_long = 0,
707	.ssd_def32 = 0,
708	.ssd_gran = 0		},
709/* GUFS32_SEL	2 32 bit %fs Descriptor for user */
710{	.ssd_base = 0x0,
711	.ssd_limit = 0xfffff,
712	.ssd_type = SDT_MEMRWA,
713	.ssd_dpl = SEL_UPL,
714	.ssd_p = 1,
715	.ssd_long = 0,
716	.ssd_def32 = 1,
717	.ssd_gran = 1		},
718/* GUGS32_SEL	3 32 bit %gs Descriptor for user */
719{	.ssd_base = 0x0,
720	.ssd_limit = 0xfffff,
721	.ssd_type = SDT_MEMRWA,
722	.ssd_dpl = SEL_UPL,
723	.ssd_p = 1,
724	.ssd_long = 0,
725	.ssd_def32 = 1,
726	.ssd_gran = 1		},
727/* GCODE_SEL	4 Code Descriptor for kernel */
728{	.ssd_base = 0x0,
729	.ssd_limit = 0xfffff,
730	.ssd_type = SDT_MEMERA,
731	.ssd_dpl = SEL_KPL,
732	.ssd_p = 1,
733	.ssd_long = 1,
734	.ssd_def32 = 0,
735	.ssd_gran = 1		},
736/* GDATA_SEL	5 Data Descriptor for kernel */
737{	.ssd_base = 0x0,
738	.ssd_limit = 0xfffff,
739	.ssd_type = SDT_MEMRWA,
740	.ssd_dpl = SEL_KPL,
741	.ssd_p = 1,
742	.ssd_long = 1,
743	.ssd_def32 = 0,
744	.ssd_gran = 1		},
745/* GUCODE32_SEL	6 32 bit Code Descriptor for user */
746{	.ssd_base = 0x0,
747	.ssd_limit = 0xfffff,
748	.ssd_type = SDT_MEMERA,
749	.ssd_dpl = SEL_UPL,
750	.ssd_p = 1,
751	.ssd_long = 0,
752	.ssd_def32 = 1,
753	.ssd_gran = 1		},
754/* GUDATA_SEL	7 32/64 bit Data Descriptor for user */
755{	.ssd_base = 0x0,
756	.ssd_limit = 0xfffff,
757	.ssd_type = SDT_MEMRWA,
758	.ssd_dpl = SEL_UPL,
759	.ssd_p = 1,
760	.ssd_long = 0,
761	.ssd_def32 = 1,
762	.ssd_gran = 1		},
763/* GUCODE_SEL	8 64 bit Code Descriptor for user */
764{	.ssd_base = 0x0,
765	.ssd_limit = 0xfffff,
766	.ssd_type = SDT_MEMERA,
767	.ssd_dpl = SEL_UPL,
768	.ssd_p = 1,
769	.ssd_long = 1,
770	.ssd_def32 = 0,
771	.ssd_gran = 1		},
772/* GPROC0_SEL	9 Proc 0 Tss Descriptor */
773{	.ssd_base = 0x0,
774	.ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
775	.ssd_type = SDT_SYSTSS,
776	.ssd_dpl = SEL_KPL,
777	.ssd_p = 1,
778	.ssd_long = 0,
779	.ssd_def32 = 0,
780	.ssd_gran = 0		},
781/* GPROC0_SEL+1	10 second half of the TSS: system descriptors are double size */
782{	.ssd_base = 0x0,
783	.ssd_limit = 0x0,
784	.ssd_type = 0,
785	.ssd_dpl = 0,
786	.ssd_p = 0,
787	.ssd_long = 0,
788	.ssd_def32 = 0,
789	.ssd_gran = 0		},
790/* GUSERLDT_SEL	11 LDT Descriptor */
791{	.ssd_base = 0x0,
792	.ssd_limit = 0x0,
793	.ssd_type = 0,
794	.ssd_dpl = 0,
795	.ssd_p = 0,
796	.ssd_long = 0,
797	.ssd_def32 = 0,
798	.ssd_gran = 0		},
799/* GUSERLDT_SEL	12 LDT Descriptor, double size */
800{	.ssd_base = 0x0,
801	.ssd_limit = 0x0,
802	.ssd_type = 0,
803	.ssd_dpl = 0,
804	.ssd_p = 0,
805	.ssd_long = 0,
806	.ssd_def32 = 0,
807	.ssd_gran = 0		},
808};
809
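/*
 * Install a single IDT entry: idx is the vector number, func the
 * handler, typ the gate type (SDT_SYSIGT here), dpl the privilege
 * level required to reach the gate with a software int, and ist the
 * interrupt-stack-table slot (0 keeps the current kernel stack).
 */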
810void
811setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
812{
813	struct gate_descriptor *ip;
814
815	ip = idt + idx;
816	ip->gd_looffset = (uintptr_t)func;
817	ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
818	ip->gd_ist = ist;
819	ip->gd_xx = 0;
820	ip->gd_type = typ;
821	ip->gd_dpl = dpl;
822	ip->gd_p = 1;
823	ip->gd_hioffset = ((uintptr_t)func) >> 16;
824}
825
826extern inthand_t
827	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
828	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
829	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
830	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
831	IDTVEC(xmm), IDTVEC(dblfault),
832	IDTVEC(div_pti), IDTVEC(bpt_pti),
833	IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
834	IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
835	IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
836	IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
837	IDTVEC(xmm_pti),
838#ifdef KDTRACE_HOOKS
839	IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
840#endif
841#ifdef XENHVM
842	IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
843#endif
844	IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
845	IDTVEC(fast_syscall_pti);
846
847#ifdef DDB
848/*
849 * Display the index and function name of any IDT entries that don't use
850 * the default 'rsvd' entry point.
851 */
852DB_SHOW_COMMAND(idt, db_show_idt)
853{
854	struct gate_descriptor *ip;
855	int idx;
856	uintptr_t func;
857
858	ip = idt;
859	for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
860		func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
861		if (func != (uintptr_t)&IDTVEC(rsvd)) {
862			db_printf("%3d\t", idx);
863			db_printsym(func, DB_STGY_PROC);
864			db_printf("\n");
865		}
866		ip++;
867	}
868}
869
870/* Show privileged registers. */
871DB_SHOW_COMMAND(sysregs, db_show_sysregs)
872{
873	struct {
874		uint16_t limit;
875		uint64_t base;
876	} __packed idtr, gdtr;
877	uint16_t ldt, tr;
878
879	__asm __volatile("sidt %0" : "=m" (idtr));
880	db_printf("idtr\t0x%016lx/%04x\n",
881	    (u_long)idtr.base, (u_int)idtr.limit);
882	__asm __volatile("sgdt %0" : "=m" (gdtr));
883	db_printf("gdtr\t0x%016lx/%04x\n",
884	    (u_long)gdtr.base, (u_int)gdtr.limit);
885	__asm __volatile("sldt %0" : "=r" (ldt));
886	db_printf("ldtr\t0x%04x\n", ldt);
887	__asm __volatile("str %0" : "=r" (tr));
888	db_printf("tr\t0x%04x\n", tr);
889	db_printf("cr0\t0x%016lx\n", rcr0());
890	db_printf("cr2\t0x%016lx\n", rcr2());
891	db_printf("cr3\t0x%016lx\n", rcr3());
892	db_printf("cr4\t0x%016lx\n", rcr4());
893	if (rcr4() & CR4_XSAVE)
894		db_printf("xcr0\t0x%016lx\n", rxcr(0));
895	db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER));
896	if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
897		db_printf("FEATURES_CTL\t0x%016lx\n",
898		    rdmsr(MSR_IA32_FEATURE_CONTROL));
899	db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR));
900	db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT));
901	db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE));
902}
903
904DB_SHOW_COMMAND(dbregs, db_show_dbregs)
905{
906
907	db_printf("dr0\t0x%016lx\n", rdr0());
908	db_printf("dr1\t0x%016lx\n", rdr1());
909	db_printf("dr2\t0x%016lx\n", rdr2());
910	db_printf("dr3\t0x%016lx\n", rdr3());
911	db_printf("dr6\t0x%016lx\n", rdr6());
912	db_printf("dr7\t0x%016lx\n", rdr7());
913}
914#endif
915
916void
917sdtossd(struct user_segment_descriptor *sd,
918    struct soft_segment_descriptor *ssd)
919{
921
922	ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
923	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
924	ssd->ssd_type  = sd->sd_type;
925	ssd->ssd_dpl   = sd->sd_dpl;
926	ssd->ssd_p     = sd->sd_p;
927	ssd->ssd_long  = sd->sd_long;
928	ssd->ssd_def32 = sd->sd_def32;
929	ssd->ssd_gran  = sd->sd_gran;
930}
931
932void
933ssdtosd(struct soft_segment_descriptor *ssd,
934    struct user_segment_descriptor *sd)
935{
937
938	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
939	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
940	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
941	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
942	sd->sd_type  = ssd->ssd_type;
943	sd->sd_dpl   = ssd->ssd_dpl;
944	sd->sd_p     = ssd->ssd_p;
945	sd->sd_long  = ssd->ssd_long;
946	sd->sd_def32 = ssd->ssd_def32;
947	sd->sd_gran  = ssd->ssd_gran;
948}
949
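/*
 * As above, but for system segments (TSS, LDT).  In long mode these
 * descriptors are 16 bytes wide, which is why such entries occupy two
 * consecutive slots in gdt_segs[] above.
 */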
950void
951ssdtosyssd(struct soft_segment_descriptor *ssd,
952    struct system_segment_descriptor *sd)
953{
955
956	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
957	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
958	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
959	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
960	sd->sd_type  = ssd->ssd_type;
961	sd->sd_dpl   = ssd->ssd_dpl;
962	sd->sd_p     = ssd->ssd_p;
963	sd->sd_gran  = ssd->ssd_gran;
964}
965
966#if !defined(DEV_ATPIC) && defined(DEV_ISA)
967#include <isa/isavar.h>
968#include <isa/isareg.h>
969/*
970 * Return a bitmap of the current interrupt requests.  This is 8259-specific
971 * and is only suitable for use at probe time.
972 * This is only here to pacify sio.  It is NOT FATAL if this doesn't work.
973 * It shouldn't be here.  There should probably be an APIC centric
974 * implementation in the apic driver code, if at all.
975 */
976intrmask_t
977isa_irq_pending(void)
978{
979	u_char irr1;
980	u_char irr2;
981
982	irr1 = inb(IO_ICU1);
983	irr2 = inb(IO_ICU2);
984	return ((irr2 << 8) | irr1);
985}
986#endif
987
988u_int basemem;
989
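/*
 * Insert the region [base, base + length) into the sorted physmap
 * array, coalescing with a neighbor when possible.  A small worked
 * example: with physmap = { 0x1000, 0x9f000 }, adding base 0x9f000
 * with length 0x1000 simply extends the existing entry to 0xa0000
 * through the "append to the previous entry" case below.
 */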
990static int
991add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
992    int *physmap_idxp)
993{
994	int i, insert_idx, physmap_idx;
995
996	physmap_idx = *physmap_idxp;
997
998	if (length == 0)
999		return (1);
1000
1001	/*
1002	 * Find insertion point while checking for overlap.  Start off by
1003	 * assuming the new entry will be added to the end.
1004	 *
1005	 * NB: physmap_idx points to the next free slot.
1006	 */
1007	insert_idx = physmap_idx;
1008	for (i = 0; i <= physmap_idx; i += 2) {
1009		if (base < physmap[i + 1]) {
1010			if (base + length <= physmap[i]) {
1011				insert_idx = i;
1012				break;
1013			}
1014			if (boothowto & RB_VERBOSE)
1015				printf(
1016		    "Overlapping memory regions, ignoring second region\n");
1017			return (1);
1018		}
1019	}
1020
1021	/* See if we can prepend to the next entry. */
1022	if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) {
1023		physmap[insert_idx] = base;
1024		return (1);
1025	}
1026
1027	/* See if we can append to the previous entry. */
1028	if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
1029		physmap[insert_idx - 1] += length;
1030		return (1);
1031	}
1032
1033	physmap_idx += 2;
1034	*physmap_idxp = physmap_idx;
1035	if (physmap_idx == PHYSMAP_SIZE) {
1036		printf(
1037		"Too many segments in the physical address map, giving up\n");
1038		return (0);
1039	}
1040
1041	/*
1042	 * Move the last 'N' entries down to make room for the new
1043	 * entry if needed.
1044	 */
1045	for (i = (physmap_idx - 2); i > insert_idx; i -= 2) {
1046		physmap[i] = physmap[i - 2];
1047		physmap[i + 1] = physmap[i - 1];
1048	}
1049
1050	/* Insert the new entry. */
1051	physmap[insert_idx] = base;
1052	physmap[insert_idx + 1] = base + length;
1053	return (1);
1054}
1055
1056void
1057bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize,
1058    vm_paddr_t *physmap, int *physmap_idx)
1059{
1060	struct bios_smap *smap, *smapend;
1061
1062	smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
1063
1064	for (smap = smapbase; smap < smapend; smap++) {
1065		if (boothowto & RB_VERBOSE)
1066			printf("SMAP type=%02x base=%016lx len=%016lx\n",
1067			    smap->type, smap->base, smap->length);
1068
1069		if (smap->type != SMAP_TYPE_MEMORY)
1070			continue;
1071
1072		if (!add_physmap_entry(smap->base, smap->length, physmap,
1073		    physmap_idx))
1074			break;
1075	}
1076}
1077
1078static void
1079add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
1080    int *physmap_idx)
1081{
1082	struct efi_md *map, *p;
1083	const char *type;
1084	size_t efisz;
1085	int ndesc, i;
1086
1087	static const char *types[] = {
1088		"Reserved",
1089		"LoaderCode",
1090		"LoaderData",
1091		"BootServicesCode",
1092		"BootServicesData",
1093		"RuntimeServicesCode",
1094		"RuntimeServicesData",
1095		"ConventionalMemory",
1096		"UnusableMemory",
1097		"ACPIReclaimMemory",
1098		"ACPIMemoryNVS",
1099		"MemoryMappedIO",
1100		"MemoryMappedIOPortSpace",
1101		"PalCode",
1102		"PersistentMemory"
1103	};
1104
1105	/*
1106	 * Memory map data provided by UEFI via the GetMemoryMap
1107	 * Boot Services API.
1108	 */
1109	efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
1110	map = (struct efi_md *)((uint8_t *)efihdr + efisz);
1111
1112	if (efihdr->descriptor_size == 0)
1113		return;
1114	ndesc = efihdr->memory_size / efihdr->descriptor_size;
1115
1116	if (boothowto & RB_VERBOSE)
1117		printf("%23s %12s %12s %8s %4s\n",
1118		    "Type", "Physical", "Virtual", "#Pages", "Attr");
1119
1120	for (i = 0, p = map; i < ndesc; i++,
1121	    p = efi_next_descriptor(p, efihdr->descriptor_size)) {
1122		if (boothowto & RB_VERBOSE) {
1123			if (p->md_type < nitems(types))
1124				type = types[p->md_type];
1125			else
1126				type = "<INVALID>";
1127			printf("%23s %012lx %12p %08lx ", type, p->md_phys,
1128			    p->md_virt, p->md_pages);
1129			if (p->md_attr & EFI_MD_ATTR_UC)
1130				printf("UC ");
1131			if (p->md_attr & EFI_MD_ATTR_WC)
1132				printf("WC ");
1133			if (p->md_attr & EFI_MD_ATTR_WT)
1134				printf("WT ");
1135			if (p->md_attr & EFI_MD_ATTR_WB)
1136				printf("WB ");
1137			if (p->md_attr & EFI_MD_ATTR_UCE)
1138				printf("UCE ");
1139			if (p->md_attr & EFI_MD_ATTR_WP)
1140				printf("WP ");
1141			if (p->md_attr & EFI_MD_ATTR_RP)
1142				printf("RP ");
1143			if (p->md_attr & EFI_MD_ATTR_XP)
1144				printf("XP ");
1145			if (p->md_attr & EFI_MD_ATTR_NV)
1146				printf("NV ");
1147			if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE)
1148				printf("MORE_RELIABLE ");
1149			if (p->md_attr & EFI_MD_ATTR_RO)
1150				printf("RO ");
1151			if (p->md_attr & EFI_MD_ATTR_RT)
1152				printf("RUNTIME");
1153			printf("\n");
1154		}
1155
1156		switch (p->md_type) {
1157		case EFI_MD_TYPE_CODE:
1158		case EFI_MD_TYPE_DATA:
1159		case EFI_MD_TYPE_BS_CODE:
1160		case EFI_MD_TYPE_BS_DATA:
1161		case EFI_MD_TYPE_FREE:
1162			/*
1163			 * We're allowed to use any entry with these types.
1164			 */
1165			break;
1166		default:
1167			continue;
1168		}
1169
1170		if (!add_physmap_entry(p->md_phys, (p->md_pages * PAGE_SIZE),
1171		    physmap, physmap_idx))
1172			break;
1173	}
1174}
1175
1176static char bootmethod[16] = "";
1177SYSCTL_STRING(_machdep, OID_AUTO, bootmethod, CTLFLAG_RD, bootmethod, 0,
1178    "System firmware boot method");
1179
1180static void
1181native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx)
1182{
1183	struct bios_smap *smap;
1184	struct efi_map_header *efihdr;
1185	u_int32_t size;
1186
1187	/*
1188	 * Memory map from INT 15:E820.
1189	 *
1190	 * subr_module.c says:
1191	 * "Consumer may safely assume that size value precedes data."
1192	 * ie: an int32_t immediately precedes smap.
1193	 */
1194
1195	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
1196	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
1197	smap = (struct bios_smap *)preload_search_info(kmdp,
1198	    MODINFO_METADATA | MODINFOMD_SMAP);
1199	if (efihdr == NULL && smap == NULL)
1200		panic("No BIOS smap or EFI map info from loader!");
1201
1202	if (efihdr != NULL) {
1203		add_efi_map_entries(efihdr, physmap, physmap_idx);
1204		strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
1205	} else {
1206		size = *((u_int32_t *)smap - 1);
1207		bios_add_smap_entries(smap, size, physmap, physmap_idx);
1208		strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
1209	}
1210}
1211
1212#define	PAGES_PER_GB	(1024 * 1024 * 1024 / PAGE_SIZE)
1213
1214/*
1215 * Populate the (physmap) array with base/bound pairs describing the
1216 * available physical memory in the system, then test this memory and
1217 * build the phys_avail array describing the actually-available memory.
1218 *
1219 * Total memory size may be set by the kernel environment variable
1220 * hw.physmem or the compile-time define MAXMEM.
1221 *
1222 * XXX first should be vm_paddr_t.
1223 */
1224static void
1225getmemsize(caddr_t kmdp, u_int64_t first)
1226{
1227	int i, physmap_idx, pa_indx, da_indx;
1228	vm_paddr_t pa, physmap[PHYSMAP_SIZE];
1229	u_long physmem_start, physmem_tunable, memtest;
1230	pt_entry_t *pte;
1231	quad_t dcons_addr, dcons_size;
1232	int page_counter;
1233
1234	/*
1235	 * Tell the physical memory allocator about pages used to store
1236	 * the kernel and preloaded data.  See kmem_bootstrap_free().
1237	 */
1238	vm_phys_add_seg((vm_paddr_t)kernphys, trunc_page(first));
1239
1240	bzero(physmap, sizeof(physmap));
1241	physmap_idx = 0;
1242
1243	init_ops.parse_memmap(kmdp, physmap, &physmap_idx);
1244	physmap_idx -= 2;
1245
1246	/*
1247	 * Find the 'base memory' segment for SMP
1248	 */
1249	basemem = 0;
1250	for (i = 0; i <= physmap_idx; i += 2) {
1251		if (physmap[i] <= 0xA0000) {
1252			basemem = physmap[i + 1] / 1024;
1253			break;
1254		}
1255	}
1256	if (basemem == 0 || basemem > 640) {
1257		if (bootverbose)
1258			printf(
1259		"Memory map doesn't contain a basemem segment, faking it\n");
1260		basemem = 640;
1261	}
1262
1263	/*
1264	 * Make hole for "AP -> long mode" bootstrap code.  The
1265	 * mp_bootaddress vector is only available when the kernel
1266	 * is configured to support APs and the APs start in 32-bit
1267	 * mode (e.g. SMP bare metal).
1268	 */
1269	if (init_ops.mp_bootaddress) {
1270		if (physmap[1] >= 0x100000000)
1271			panic(
1272	"Basemem segment is not suitable for AP bootstrap code!");
1273		physmap[1] = init_ops.mp_bootaddress(physmap[1] / 1024);
1274	}
1275
1276	/*
1277	 * Maxmem isn't the "maximum memory", it's one larger than the
1278	 * highest page of the physical address space.  It should be
1279	 * called something like "Maxphyspage".  We may adjust this
1280	 * based on ``hw.physmem'' and the results of the memory test.
1281	 */
1282	Maxmem = atop(physmap[physmap_idx + 1]);
1283
1284#ifdef MAXMEM
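	/* MAXMEM is given in kilobytes, Maxmem in 4KB pages, hence the /4. */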
1285	Maxmem = MAXMEM / 4;
1286#endif
1287
1288	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
1289		Maxmem = atop(physmem_tunable);
1290
1291	/*
1292	 * The boot memory test is disabled by default, as it takes a
1293	 * significant amount of time on large-memory systems, and is
1294	 * unfriendly to virtual machines as it unnecessarily touches all
1295	 * pages.
1296	 *
1297	 * A general name is used as the code may be extended to support
1298	 * additional tests beyond the current "page present" test.
1299	 */
1300	memtest = 0;
1301	TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);
1302
1303	/*
1304	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
1305	 * in the system.
1306	 */
1307	if (Maxmem > atop(physmap[physmap_idx + 1]))
1308		Maxmem = atop(physmap[physmap_idx + 1]);
1309
1310	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
1311	    (boothowto & RB_VERBOSE))
1312		printf("Physical memory use set to %ldK\n", Maxmem * 4);
1313
1314	/* call pmap initialization to make new kernel address space */
1315	pmap_bootstrap(&first);
1316
1317	/*
1318	 * Size up each available chunk of physical memory.
1319	 *
1320	 * XXX Some BIOSes corrupt low 64KB between suspend and resume.
1321	 * By default, mask off the first 16 pages unless we appear to be
1322	 * running in a VM.
1323	 */
1324	physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
1325	TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
1326	if (physmap[0] < physmem_start) {
1327		if (physmem_start < PAGE_SIZE)
1328			physmap[0] = PAGE_SIZE;
1329		else if (physmem_start >= physmap[1])
1330			physmap[0] = round_page(physmap[1] - PAGE_SIZE);
1331		else
1332			physmap[0] = round_page(physmem_start);
1333	}
1334	pa_indx = 0;
1335	da_indx = 1;
1336	phys_avail[pa_indx++] = physmap[0];
1337	phys_avail[pa_indx] = physmap[0];
1338	dump_avail[da_indx] = physmap[0];
1339	pte = CMAP1;
1340
1341	/*
1342	 * Get dcons buffer address
1343	 */
1344	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
1345	    getenv_quad("dcons.size", &dcons_size) == 0)
1346		dcons_addr = 0;
1347
1348	/*
1349	 * physmap is in bytes, so when converting to page boundaries,
1350	 * round up the start address and round down the end address.
1351	 */
1352	page_counter = 0;
1353	if (memtest != 0)
1354		printf("Testing system memory");
1355	for (i = 0; i <= physmap_idx; i += 2) {
1356		vm_paddr_t end;
1357
1358		end = ptoa((vm_paddr_t)Maxmem);
1359		if (physmap[i + 1] < end)
1360			end = trunc_page(physmap[i + 1]);
1361		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
1362			int tmp, page_bad, full;
1363			int *ptr = (int *)CADDR1;
1364
1365			full = FALSE;
1366			/*
1367			 * block out kernel memory as not available.
1368			 */
1369			if (pa >= (vm_paddr_t)kernphys && pa < first)
1370				goto do_dump_avail;
1371
1372			/*
1373			 * block out dcons buffer
1374			 */
1375			if (dcons_addr > 0
1376			    && pa >= trunc_page(dcons_addr)
1377			    && pa < dcons_addr + dcons_size)
1378				goto do_dump_avail;
1379
1380			page_bad = FALSE;
1381			if (memtest == 0)
1382				goto skip_memtest;
1383
1384			/*
1385			 * Print a "." every GB to show we're making
1386			 * progress.
1387			 */
1388			page_counter++;
1389			if ((page_counter % PAGES_PER_GB) == 0)
1390				printf(".");
1391
1392			/*
1393			 * map page into kernel: valid, read/write,non-cacheable
1394			 */
1395			*pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
1396			invltlb();
1397
1398			tmp = *(int *)ptr;
1399			/*
1400			 * Test for alternating 1's and 0's
1401			 */
1402			*(volatile int *)ptr = 0xaaaaaaaa;
1403			if (*(volatile int *)ptr != 0xaaaaaaaa)
1404				page_bad = TRUE;
1405			/*
1406			 * Test for alternating 0's and 1's
1407			 */
1408			*(volatile int *)ptr = 0x55555555;
1409			if (*(volatile int *)ptr != 0x55555555)
1410				page_bad = TRUE;
1411			/*
1412			 * Test for all 1's
1413			 */
1414			*(volatile int *)ptr = 0xffffffff;
1415			if (*(volatile int *)ptr != 0xffffffff)
1416				page_bad = TRUE;
1417			/*
1418			 * Test for all 0's
1419			 */
1420			*(volatile int *)ptr = 0x0;
1421			if (*(volatile int *)ptr != 0x0)
1422				page_bad = TRUE;
1423			/*
1424			 * Restore original value.
1425			 */
1426			*(int *)ptr = tmp;
1427
1428skip_memtest:
1429			/*
1430			 * Adjust array of valid/good pages.
1431			 */
1432			if (page_bad == TRUE)
1433				continue;
1434			/*
1435			 * If this good page is a continuation of the
1436			 * previous set of good pages, then just increase
1437			 * the end pointer. Otherwise start a new chunk.
1438			 * Note that "end" points one higher than end,
1439			 * Note that the recorded end address is exclusive,
1440			 * making the range >= start and < end.
1441			 * If we're also doing a speculative memory
1442			 * test and we're at or past the end, bump up Maxmem
1443			 * will terminate the loop.
1444			 */
1445			if (phys_avail[pa_indx] == pa) {
1446				phys_avail[pa_indx] += PAGE_SIZE;
1447			} else {
1448				pa_indx++;
1449				if (pa_indx == PHYS_AVAIL_ARRAY_END) {
1450					printf(
1451		"Too many holes in the physical address space, giving up\n");
1452					pa_indx--;
1453					full = TRUE;
1454					goto do_dump_avail;
1455				}
1456				phys_avail[pa_indx++] = pa;	/* start */
1457				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
1458			}
1459			physmem++;
1460do_dump_avail:
1461			if (dump_avail[da_indx] == pa) {
1462				dump_avail[da_indx] += PAGE_SIZE;
1463			} else {
1464				da_indx++;
1465				if (da_indx == DUMP_AVAIL_ARRAY_END) {
1466					da_indx--;
1467					goto do_next;
1468				}
1469				dump_avail[da_indx++] = pa; /* start */
1470				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
1471			}
1472do_next:
1473			if (full)
1474				break;
1475		}
1476	}
1477	*pte = 0;
1478	invltlb();
1479	if (memtest != 0)
1480		printf("\n");
1481
1482	/*
1483	 * XXX
1484	 * The last chunk must contain at least one page plus the message
1485	 * buffer to avoid complicating other code (message buffer address
1486	 * calculation, etc.).
1487	 */
1488	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
1489	    round_page(msgbufsize) >= phys_avail[pa_indx]) {
1490		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
1491		phys_avail[pa_indx--] = 0;
1492		phys_avail[pa_indx--] = 0;
1493	}
1494
1495	Maxmem = atop(phys_avail[pa_indx]);
1496
1497	/* Trim off space for the message buffer. */
1498	phys_avail[pa_indx] -= round_page(msgbufsize);
1499
1500	/* Map the message buffer. */
1501	msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
1502}
1503
1504static caddr_t
1505native_parse_preload_data(u_int64_t modulep)
1506{
1507	caddr_t kmdp;
1508	char *envp;
1509#ifdef DDB
1510	vm_offset_t ksym_start;
1511	vm_offset_t ksym_end;
1512#endif
1513
1514	preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
1515	preload_bootstrap_relocate(KERNBASE);
1516	kmdp = preload_search_by_type("elf kernel");
1517	if (kmdp == NULL)
1518		kmdp = preload_search_by_type("elf64 kernel");
1519	boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
1520	envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *);
1521	if (envp != NULL)
1522		envp += KERNBASE;
1523	init_static_kenv(envp, 0);
1524#ifdef DDB
1525	ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
1526	ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
1527	db_fetch_ksymtab(ksym_start, ksym_end);
1528#endif
1529	efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);
1530
1531	return (kmdp);
1532}
1533
1534static void
1535amd64_kdb_init(void)
1536{
1537	kdb_init();
1538#ifdef KDB
1539	if (boothowto & RB_KDB)
1540		kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
1541#endif
1542}
1543
1544/* Set up the fast syscall stuff */
1545void
1546amd64_conf_fast_syscall(void)
1547{
1548	uint64_t msr;
1549
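	/*
	 * A sketch of the SYSCALL MSRs programmed below: LSTAR holds the
	 * 64-bit entry point and CSTAR the entry point for SYSCALL from
	 * compat (32-bit) mode; STAR bits 47:32 give the kernel CS/SS
	 * selector base loaded on SYSCALL and bits 63:48 the base from
	 * which SYSRET derives the user selectors; SF_MASK lists the
	 * rflags bits cleared on entry, notably PSL_I, so the kernel is
	 * entered with interrupts disabled.
	 */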
1550	msr = rdmsr(MSR_EFER) | EFER_SCE;
1551	wrmsr(MSR_EFER, msr);
1552	wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) :
1553	    (u_int64_t)IDTVEC(fast_syscall));
1554	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
1555	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
1556	    ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
1557	wrmsr(MSR_STAR, msr);
1558	wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D);
1559}
1560
1561u_int64_t
1562hammer_time(u_int64_t modulep, u_int64_t physfree)
1563{
1564	caddr_t kmdp;
1565	int gsel_tss, x;
1566	struct pcpu *pc;
1567	struct nmi_pcpu *np;
1568	struct xstate_hdr *xhdr;
1569	u_int64_t rsp0;
1570	char *env;
1571	size_t kstack0_sz;
1572	int late_console;
1573
1574	kmdp = init_ops.parse_preload_data(modulep);
1575
1576	identify_cpu1();
1577	identify_hypervisor();
1578	/*
1579	 * hw.cpu_stdext_disable is ignored by the call, it will be
1580	 * re-evaluted by the below call to finishidentcpu().
1581	 */
1582	identify_cpu2();
1583
1584	link_elf_ireloc(kmdp);
1585
1586	/*
1587	 * This may be done better later if it gets more high level
1588	 * components in it. If so just link td->td_proc here.
1589	 */
1590	proc_linkup0(&proc0, &thread0);
1591
1592	/* Init basic tunables, hz etc */
1593	init_param1();
1594
1595	thread0.td_kstack = physfree + KERNBASE;
1596	thread0.td_kstack_pages = kstack_pages;
1597	kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
1598	bzero((void *)thread0.td_kstack, kstack0_sz);
1599	physfree += kstack0_sz;
1600
1601	/*
1602	 * make gdt memory segments
1603	 */
1604	for (x = 0; x < NGDT; x++) {
1605		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
1606		    x != GUSERLDT_SEL && x != (GUSERLDT_SEL + 1))
1607			ssdtosd(&gdt_segs[x], &gdt[x]);
1608	}
1609	gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&common_tss[0];
1610	ssdtosyssd(&gdt_segs[GPROC0_SEL],
1611	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
1612
1613	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
1614	r_gdt.rd_base = (long) gdt;
1615	lgdt(&r_gdt);
1616	pc = &__pcpu[0];
1617
1618	wrmsr(MSR_FSBASE, 0);		/* User value */
1619	wrmsr(MSR_GSBASE, (u_int64_t)pc);
1620	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */
1621
1622	pcpu_init(pc, 0, sizeof(struct pcpu));
1623	dpcpu_init((void *)(physfree + KERNBASE), 0);
1624	physfree += DPCPU_SIZE;
1625	PCPU_SET(prvspace, pc);
1626	PCPU_SET(curthread, &thread0);
1627	/* Non-late cninit() and printf() can be moved up to here. */
1628	PCPU_SET(tssp, &common_tss[0]);
1629	PCPU_SET(commontssp, &common_tss[0]);
1630	PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
1631	PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
1632	PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
1633	PCPU_SET(gs32p, &gdt[GUGS32_SEL]);
1634
1635	/*
1636	 * Initialize mutexes.
1637	 *
1638	 * icu_lock: in order to allow an interrupt to occur in a critical
1639	 * 	     section, to set pcpu->ipending (etc...) properly, we
1640	 *	     must be able to get the icu lock, so it can't be
1641	 *	     under witness.
1642	 */
1643	mutex_init();
1644	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
1645	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);
1646
1647	/* exceptions */
1648	pti = pti_get_default();
1649	TUNABLE_INT_FETCH("vm.pmap.pti", &pti);
1650
1651	for (x = 0; x < NIDT; x++)
1652		setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT,
1653		    SEL_KPL, 0);
1654	setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
1655	    SEL_KPL, 0);
1656	setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
1657	setidt(IDT_NMI, &IDTVEC(nmi),  SDT_SYSIGT, SEL_KPL, 2);
1658	setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
1659	    SEL_UPL, 0);
1660	setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT,
1661	    SEL_UPL, 0);
1662	setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT,
1663	    SEL_KPL, 0);
1664	setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT,
1665	    SEL_KPL, 0);
1666	setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT,
1667	    SEL_KPL, 0);
1668	setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
1669	setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm),
1670	    SDT_SYSIGT, SEL_KPL, 0);
1671	setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT,
1672	    SEL_KPL, 0);
1673	setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing),
1674	    SDT_SYSIGT, SEL_KPL, 0);
1675	setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT,
1676	    SEL_KPL, 0);
1677	setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT,
1678	    SEL_KPL, 0);
1679	setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT,
1680	    SEL_KPL, 0);
1681	setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT,
1682	    SEL_KPL, 0);
1683	setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
1684	    SEL_KPL, 0);
1685	setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3);
1686	setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
1687	    SEL_KPL, 0);
1688#ifdef KDTRACE_HOOKS
1689	setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) :
1690	    &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
1691#endif
1692#ifdef XENHVM
1693	setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) :
1694	    &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0);
1695#endif
1696	r_idt.rd_limit = sizeof(idt0) - 1;
1697	r_idt.rd_base = (long) idt;
1698	lidt(&r_idt);
1699
1700	/*
1701	 * Initialize the clock before the console so that console
1702	 * initialization can use DELAY().
1703	 */
1704	clock_init();
1705
1706	/*
1707	 * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4)
1708	 * transition).
1709	 * Once bootblocks have updated, we can test directly for
1710	 * efi_systbl != NULL here...
1711	 */
1712	if (preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP)
1713	    != NULL)
1714		vty_set_preferred(VTY_VT);
1715
1716	finishidentcpu();	/* Final stage of CPU initialization */
1717	initializecpu();	/* Initialize CPU registers */
1718	initializecpucache();
1719
1720	/* doublefault stack space, runs on ist1 */
1721	common_tss[0].tss_ist1 = (long)&dblfault_stack[sizeof(dblfault_stack)];
1722
1723	/*
1724	 * NMI stack, runs on ist2.  The pcpu pointer is stored just
1725	 * above the start of the ist2 stack.
1726	 */
1727	np = ((struct nmi_pcpu *) &nmi0_stack[sizeof(nmi0_stack)]) - 1;
1728	np->np_pcpu = (register_t) pc;
1729	common_tss[0].tss_ist2 = (long) np;
1730
1731	/*
1732	 * MC# stack, runs on ist3.  The pcpu pointer is stored just
1733	 * above the start of the ist3 stack.
1734	 */
1735	np = ((struct nmi_pcpu *) &mce0_stack[sizeof(mce0_stack)]) - 1;
1736	np->np_pcpu = (register_t) pc;
1737	common_tss[0].tss_ist3 = (long) np;
1738
1739	/*
1740	 * DB# stack, runs on ist4.
1741	 */
1742	np = ((struct nmi_pcpu *) &dbg0_stack[sizeof(dbg0_stack)]) - 1;
1743	np->np_pcpu = (register_t) pc;
1744	common_tss[0].tss_ist4 = (long) np;
1745
1746	/* Set the IO permission bitmap (empty due to tss seg limit) */
1747	common_tss[0].tss_iobase = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE;
1748
1749	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
1750	ltr(gsel_tss);
1751
1752	amd64_conf_fast_syscall();
1753
1754	/*
1755	 * Temporarily forge a valid pointer to the PCB for the exception
1756	 * handlers.  It is reinitialized properly below after the FPU is
1757	 * set up.  Also set up td_critnest to short-cut the page
1758	 * fault handler.
1759	 */
1760	cpu_max_ext_state_size = sizeof(struct savefpu);
1761	thread0.td_pcb = get_pcb_td(&thread0);
1762	thread0.td_critnest = 1;
1763
1764	/*
1765	 * The console and kdb should be initialized even earlier than here,
1766	 * but some console drivers don't work until after getmemsize().
1767	 * Default to late console initialization to support these drivers.
1768	 * This loses mainly printf()s in getmemsize() and early debugging.
1769	 */
1770	late_console = 1;
1771	TUNABLE_INT_FETCH("debug.late_console", &late_console);
1772	if (!late_console) {
1773		cninit();
1774		amd64_kdb_init();
1775	}
1776
1777	getmemsize(kmdp, physfree);
1778	init_param2(physmem);
1779
1780	/* Now running on new page tables, configured, and u/iom is accessible. */
1781
1782	if (late_console)
1783		cninit();
1784
1785#ifdef DEV_ISA
1786#ifdef DEV_ATPIC
1787	elcr_probe();
1788	atpic_startup();
1789#else
1790	/* Reset and mask the atpics and leave them shut down. */
1791	atpic_reset();
1792
1793	/*
1794	 * Point the ICU spurious interrupt vectors at the APIC spurious
1795	 * interrupt handler.
1796	 */
1797	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
1798	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
1799#endif
1800#else
1801#error "have you forgotten the isa device?"
1802#endif
1803
1804	if (late_console)
1805		amd64_kdb_init();
1806
1807	msgbufinit(msgbufp, msgbufsize);
1808	fpuinit();
1809
1810	/*
1811	 * Set up thread0 pcb after fpuinit calculated pcb + fpu save
1812	 * area size.  Zero out the extended state header in fpu save
1813	 * area.
1814	 */
1815	thread0.td_pcb = get_pcb_td(&thread0);
1816	thread0.td_pcb->pcb_save = get_pcb_user_save_td(&thread0);
1817	bzero(get_pcb_user_save_td(&thread0), cpu_max_ext_state_size);
1818	if (use_xsave) {
1819		xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) +
1820		    1);
1821		xhdr->xstate_bv = xsave_mask;
1822	}
1823	/* make an initial tss so cpu can get interrupt stack on syscall! */
1824	rsp0 = (vm_offset_t)thread0.td_pcb;
1825	/* Ensure the stack is aligned to 16 bytes */
1826	rsp0 &= ~0xFul;
1827	common_tss[0].tss_rsp0 = rsp0;
1828	PCPU_SET(rsp0, rsp0);
1829	PCPU_SET(pti_rsp0, ((vm_offset_t)PCPU_PTR(pti_stack) +
1830	    PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful);
1831	PCPU_SET(curpcb, thread0.td_pcb);
1832
1833	/* transfer to user mode */
1834
1835	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
1836	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
1837	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
1838	_ufssel = GSEL(GUFS32_SEL, SEL_UPL);
1839	_ugssel = GSEL(GUGS32_SEL, SEL_UPL);
1840
1841	load_ds(_udatasel);
1842	load_es(_udatasel);
1843	load_fs(_ufssel);
1844
1845	/* setup proc 0's pcb */
1846	thread0.td_pcb->pcb_flags = 0;
1847	thread0.td_frame = &proc0_tf;
1848
1849	env = kern_getenv("kernelname");
1850	if (env != NULL)
1851		strlcpy(kernelname, env, sizeof(kernelname));
1852
1853	cpu_probe_amdc1e();
1854
1855#ifdef FDT
1856	x86_init_fdt();
1857#endif
1858	thread0.td_critnest = 0;
1859
1860	TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
1861	TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable);
1862
1863	/* Location of kernel stack for locore */
1864	return ((u_int64_t)thread0.td_pcb);
1865}
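
/*
 * Note (informational, a reading of the return value above rather than
 * anything defined in this file): the PCB address handed back to the
 * assembly startup code doubles as the initial kernel stack pointer;
 * the PCB sits at the top of thread0's kernel stack, so the stack grows
 * down from just below it.
 */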

void
cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
{

	pcpu->pc_acpi_id = 0xffffffff;
}

static int
smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	struct bios_smap *smapbase;
	struct bios_smap_xattr smap;
	caddr_t kmdp;
	uint32_t *smapattr;
	int count, error, i;

	/* Retrieve the system memory map from the loader. */
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	smapbase = (struct bios_smap *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);
	if (smapbase == NULL)
		return (0);
	smapattr = (uint32_t *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
	count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase);
	error = 0;
	for (i = 0; i < count; i++) {
		smap.base = smapbase[i].base;
		smap.length = smapbase[i].length;
		smap.type = smapbase[i].type;
		if (smapattr != NULL)
			smap.xattr = smapattr[i];
		else
			smap.xattr = 0;
		error = SYSCTL_OUT(req, &smap, sizeof(smap));
	}
	return (error);
}
SYSCTL_PROC(_machdep, OID_AUTO, smap, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
    smap_sysctl_handler, "S,bios_smap_xattr", "Raw BIOS SMAP data");
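
/*
 * Usage note (a hedged userland sketch, not part of the kernel build):
 * this table can be read with sysctlbyname(3) and walked as an array of
 * struct bios_smap_xattr records; machdep.efi_map below is read the
 * same way with its own record type.
 *
 *	size_t len;
 *	struct bios_smap_xattr *p, *buf;
 *
 *	if (sysctlbyname("machdep.smap", NULL, &len, NULL, 0) == 0 &&
 *	    (buf = malloc(len)) != NULL &&
 *	    sysctlbyname("machdep.smap", buf, &len, NULL, 0) == 0)
 *		for (p = buf; (char *)p < (char *)buf + len; p++)
 *			printf("base %#jx len %#jx type %u\n",
 *			    (uintmax_t)p->base, (uintmax_t)p->length,
 *			    p->type);
 */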

static int
efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	struct efi_map_header *efihdr;
	caddr_t kmdp;
	uint32_t efisize;

	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
	if (efihdr == NULL)
		return (0);
	efisize = *((uint32_t *)efihdr - 1);
	return (SYSCTL_OUT(req, efihdr, efisize));
}
SYSCTL_PROC(_machdep, OID_AUTO, efi_map, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
    efi_map_sysctl_handler, "S,efi_map_header", "Raw EFI Memory Map");

void
spinlock_enter(void)
{
	struct thread *td;
	register_t flags;

	td = curthread;
	if (td->td_md.md_spinlock_count == 0) {
		flags = intr_disable();
		td->td_md.md_spinlock_count = 1;
		td->td_md.md_saved_flags = flags;
	} else
		td->td_md.md_spinlock_count++;
	critical_enter();
}

void
spinlock_exit(void)
{
	struct thread *td;
	register_t flags;

	td = curthread;
	critical_exit();
	flags = td->td_md.md_saved_flags;
	td->td_md.md_spinlock_count--;
	if (td->td_md.md_spinlock_count == 0)
		intr_restore(flags);
}
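
/*
 * Illustration (informational): these routines nest, and only the
 * outermost pair actually toggles the interrupt state.  Assuming
 * interrupts start out enabled:
 *
 *	spinlock_enter();	interrupts disabled, count 1
 *	spinlock_enter();	still disabled, count 2
 *	spinlock_exit();	still disabled, count 1
 *	spinlock_exit();	interrupts restored, count 0
 *
 * The flags are saved only on the first enter, so the last exit restores
 * exactly the state from before the outermost enter.
 */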

/*
 * Construct a PCB from a trapframe. This is called from kdb_trap() where
 * we want to start a backtrace from the function that caused us to enter
 * the debugger. We have the context in the trapframe, but base the trace
 * on the PCB. The PCB doesn't have to be perfect, as long as it contains
 * enough for a backtrace.
 */
void
makectx(struct trapframe *tf, struct pcb *pcb)
{

	pcb->pcb_r12 = tf->tf_r12;
	pcb->pcb_r13 = tf->tf_r13;
	pcb->pcb_r14 = tf->tf_r14;
	pcb->pcb_r15 = tf->tf_r15;
	pcb->pcb_rbp = tf->tf_rbp;
	pcb->pcb_rbx = tf->tf_rbx;
	pcb->pcb_rip = tf->tf_rip;
	pcb->pcb_rsp = tf->tf_rsp;
}

int
ptrace_set_pc(struct thread *td, unsigned long addr)
{

	td->td_frame->tf_rip = addr;
	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
	return (0);
}

int
ptrace_single_step(struct thread *td)
{
	td->td_frame->tf_rflags |= PSL_T;
	return (0);
}

int
ptrace_clear_single_step(struct thread *td)
{
	td->td_frame->tf_rflags &= ~PSL_T;
	return (0);
}
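
/*
 * Usage note (a hedged userland sketch, not part of this file): the two
 * hooks above back the ptrace(2) single-step requests.  PSL_T is the x86
 * trap flag; with it set, the CPU raises a debug exception after one
 * instruction, which the kernel delivers to the tracer as a stop:
 *
 *	ptrace(PT_ATTACH, pid, NULL, 0);
 *	waitpid(pid, &status, 0);
 *	ptrace(PT_STEP, pid, (caddr_t)1, 0);	sets PSL_T in the child
 *	waitpid(pid, &status, 0);		child stops after one insn
 */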

int
fill_regs(struct thread *td, struct reg *regs)
{
	struct trapframe *tp;

	tp = td->td_frame;
	return (fill_frame_regs(tp, regs));
}

int
fill_frame_regs(struct trapframe *tp, struct reg *regs)
{
	regs->r_r15 = tp->tf_r15;
	regs->r_r14 = tp->tf_r14;
	regs->r_r13 = tp->tf_r13;
	regs->r_r12 = tp->tf_r12;
	regs->r_r11 = tp->tf_r11;
	regs->r_r10 = tp->tf_r10;
	regs->r_r9  = tp->tf_r9;
	regs->r_r8  = tp->tf_r8;
	regs->r_rdi = tp->tf_rdi;
	regs->r_rsi = tp->tf_rsi;
	regs->r_rbp = tp->tf_rbp;
	regs->r_rbx = tp->tf_rbx;
	regs->r_rdx = tp->tf_rdx;
	regs->r_rcx = tp->tf_rcx;
	regs->r_rax = tp->tf_rax;
	regs->r_rip = tp->tf_rip;
	regs->r_cs = tp->tf_cs;
	regs->r_rflags = tp->tf_rflags;
	regs->r_rsp = tp->tf_rsp;
	regs->r_ss = tp->tf_ss;
	if (tp->tf_flags & TF_HASSEGS) {
		regs->r_ds = tp->tf_ds;
		regs->r_es = tp->tf_es;
		regs->r_fs = tp->tf_fs;
		regs->r_gs = tp->tf_gs;
	} else {
		regs->r_ds = 0;
		regs->r_es = 0;
		regs->r_fs = 0;
		regs->r_gs = 0;
	}
	return (0);
}

int
set_regs(struct thread *td, struct reg *regs)
{
	struct trapframe *tp;
	register_t rflags;

	tp = td->td_frame;
	rflags = regs->r_rflags & 0xffffffff;
	if (!EFL_SECURE(rflags, tp->tf_rflags) || !CS_SECURE(regs->r_cs))
		return (EINVAL);
	tp->tf_r15 = regs->r_r15;
	tp->tf_r14 = regs->r_r14;
	tp->tf_r13 = regs->r_r13;
	tp->tf_r12 = regs->r_r12;
	tp->tf_r11 = regs->r_r11;
	tp->tf_r10 = regs->r_r10;
	tp->tf_r9  = regs->r_r9;
	tp->tf_r8  = regs->r_r8;
	tp->tf_rdi = regs->r_rdi;
	tp->tf_rsi = regs->r_rsi;
	tp->tf_rbp = regs->r_rbp;
	tp->tf_rbx = regs->r_rbx;
	tp->tf_rdx = regs->r_rdx;
	tp->tf_rcx = regs->r_rcx;
	tp->tf_rax = regs->r_rax;
	tp->tf_rip = regs->r_rip;
	tp->tf_cs = regs->r_cs;
	tp->tf_rflags = rflags;
	tp->tf_rsp = regs->r_rsp;
	tp->tf_ss = regs->r_ss;
	if (0) {	/* XXXKIB */
		tp->tf_ds = regs->r_ds;
		tp->tf_es = regs->r_es;
		tp->tf_fs = regs->r_fs;
		tp->tf_gs = regs->r_gs;
		tp->tf_flags = TF_HASSEGS;
	}
	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
	return (0);
}

/* XXX check all this stuff! */
/* externalize from sv_xmm */
static void
fill_fpregs_xmm(struct savefpu *sv_xmm, struct fpreg *fpregs)
{
	struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
	struct envxmm *penv_xmm = &sv_xmm->sv_env;
	int i;

	/* pcb -> fpregs */
	bzero(fpregs, sizeof(*fpregs));

	/* FPU control/status */
	penv_fpreg->en_cw = penv_xmm->en_cw;
	penv_fpreg->en_sw = penv_xmm->en_sw;
	penv_fpreg->en_tw = penv_xmm->en_tw;
	penv_fpreg->en_opcode = penv_xmm->en_opcode;
	penv_fpreg->en_rip = penv_xmm->en_rip;
	penv_fpreg->en_rdp = penv_xmm->en_rdp;
	penv_fpreg->en_mxcsr = penv_xmm->en_mxcsr;
	penv_fpreg->en_mxcsr_mask = penv_xmm->en_mxcsr_mask;

	/* FPU registers: eight 80-bit (10-byte) x87 accumulators. */
	for (i = 0; i < 8; ++i)
		bcopy(sv_xmm->sv_fp[i].fp_acc.fp_bytes, fpregs->fpr_acc[i], 10);

	/* SSE registers: sixteen 128-bit (16-byte) XMM registers. */
	for (i = 0; i < 16; ++i)
		bcopy(sv_xmm->sv_xmm[i].xmm_bytes, fpregs->fpr_xacc[i], 16);
}

/* internalize from fpregs into sv_xmm */
static void
set_fpregs_xmm(struct fpreg *fpregs, struct savefpu *sv_xmm)
{
	struct envxmm *penv_xmm = &sv_xmm->sv_env;
	struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
	int i;

	/* fpregs -> pcb */
	/* FPU control/status */
	penv_xmm->en_cw = penv_fpreg->en_cw;
	penv_xmm->en_sw = penv_fpreg->en_sw;
	penv_xmm->en_tw = penv_fpreg->en_tw;
	penv_xmm->en_opcode = penv_fpreg->en_opcode;
	penv_xmm->en_rip = penv_fpreg->en_rip;
	penv_xmm->en_rdp = penv_fpreg->en_rdp;
	penv_xmm->en_mxcsr = penv_fpreg->en_mxcsr;
	penv_xmm->en_mxcsr_mask = penv_fpreg->en_mxcsr_mask & cpu_mxcsr_mask;

	/* FPU registers: eight 80-bit (10-byte) x87 accumulators. */
	for (i = 0; i < 8; ++i)
		bcopy(fpregs->fpr_acc[i], sv_xmm->sv_fp[i].fp_acc.fp_bytes, 10);

	/* SSE registers: sixteen 128-bit (16-byte) XMM registers. */
	for (i = 0; i < 16; ++i)
		bcopy(fpregs->fpr_xacc[i], sv_xmm->sv_xmm[i].xmm_bytes, 16);
}

/* externalize from td->pcb */
int
fill_fpregs(struct thread *td, struct fpreg *fpregs)
{

	KASSERT(td == curthread || TD_IS_SUSPENDED(td) ||
	    P_SHOULDSTOP(td->td_proc),
	    ("not suspended thread %p", td));
	fpugetregs(td);
	fill_fpregs_xmm(get_pcb_user_save_td(td), fpregs);
	return (0);
}

/* internalize to td->pcb */
int
set_fpregs(struct thread *td, struct fpreg *fpregs)
{

	critical_enter();
	set_fpregs_xmm(fpregs, get_pcb_user_save_td(td));
	fpuuserinited(td);
	critical_exit();
	return (0);
}

/*
 * Get machine context.
 */
int
get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
{
	struct pcb *pcb;
	struct trapframe *tp;

	pcb = td->td_pcb;
	tp = td->td_frame;
	PROC_LOCK(curthread->td_proc);
	mcp->mc_onstack = sigonstack(tp->tf_rsp);
	PROC_UNLOCK(curthread->td_proc);
	mcp->mc_r15 = tp->tf_r15;
	mcp->mc_r14 = tp->tf_r14;
	mcp->mc_r13 = tp->tf_r13;
	mcp->mc_r12 = tp->tf_r12;
	mcp->mc_r11 = tp->tf_r11;
	mcp->mc_r10 = tp->tf_r10;
	mcp->mc_r9  = tp->tf_r9;
	mcp->mc_r8  = tp->tf_r8;
	mcp->mc_rdi = tp->tf_rdi;
	mcp->mc_rsi = tp->tf_rsi;
	mcp->mc_rbp = tp->tf_rbp;
	mcp->mc_rbx = tp->tf_rbx;
	mcp->mc_rcx = tp->tf_rcx;
	mcp->mc_rflags = tp->tf_rflags;
	if (flags & GET_MC_CLEAR_RET) {
		mcp->mc_rax = 0;
		mcp->mc_rdx = 0;
		mcp->mc_rflags &= ~PSL_C;
	} else {
		mcp->mc_rax = tp->tf_rax;
		mcp->mc_rdx = tp->tf_rdx;
	}
	mcp->mc_rip = tp->tf_rip;
	mcp->mc_cs = tp->tf_cs;
	mcp->mc_rsp = tp->tf_rsp;
	mcp->mc_ss = tp->tf_ss;
	mcp->mc_ds = tp->tf_ds;
	mcp->mc_es = tp->tf_es;
	mcp->mc_fs = tp->tf_fs;
	mcp->mc_gs = tp->tf_gs;
	mcp->mc_flags = tp->tf_flags;
	mcp->mc_len = sizeof(*mcp);
	get_fpcontext(td, mcp, NULL, 0);
	update_pcb_bases(pcb);
	mcp->mc_fsbase = pcb->pcb_fsbase;
	mcp->mc_gsbase = pcb->pcb_gsbase;
	mcp->mc_xfpustate = 0;
	mcp->mc_xfpustate_len = 0;
	bzero(mcp->mc_spare, sizeof(mcp->mc_spare));
	return (0);
}

/*
 * Set machine context.
 *
 * Only the user-modifiable bits of rflags are set, and the cs selector
 * is left untouched.
 */
int
set_mcontext(struct thread *td, mcontext_t *mcp)
{
	struct pcb *pcb;
	struct trapframe *tp;
	char *xfpustate;
	long rflags;
	int ret;

	pcb = td->td_pcb;
	tp = td->td_frame;
	if (mcp->mc_len != sizeof(*mcp) ||
	    (mcp->mc_flags & ~_MC_FLAG_MASK) != 0)
		return (EINVAL);
	rflags = (mcp->mc_rflags & PSL_USERCHANGE) |
	    (tp->tf_rflags & ~PSL_USERCHANGE);
	if (mcp->mc_flags & _MC_HASFPXSTATE) {
		if (mcp->mc_xfpustate_len > cpu_max_ext_state_size -
		    sizeof(struct savefpu))
			return (EINVAL);
		xfpustate = __builtin_alloca(mcp->mc_xfpustate_len);
		ret = copyin((void *)mcp->mc_xfpustate, xfpustate,
		    mcp->mc_xfpustate_len);
		if (ret != 0)
			return (ret);
	} else
		xfpustate = NULL;
	ret = set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len);
	if (ret != 0)
		return (ret);
	tp->tf_r15 = mcp->mc_r15;
	tp->tf_r14 = mcp->mc_r14;
	tp->tf_r13 = mcp->mc_r13;
	tp->tf_r12 = mcp->mc_r12;
	tp->tf_r11 = mcp->mc_r11;
	tp->tf_r10 = mcp->mc_r10;
	tp->tf_r9  = mcp->mc_r9;
	tp->tf_r8  = mcp->mc_r8;
	tp->tf_rdi = mcp->mc_rdi;
	tp->tf_rsi = mcp->mc_rsi;
	tp->tf_rbp = mcp->mc_rbp;
	tp->tf_rbx = mcp->mc_rbx;
	tp->tf_rdx = mcp->mc_rdx;
	tp->tf_rcx = mcp->mc_rcx;
	tp->tf_rax = mcp->mc_rax;
	tp->tf_rip = mcp->mc_rip;
	tp->tf_rflags = rflags;
	tp->tf_rsp = mcp->mc_rsp;
	tp->tf_ss = mcp->mc_ss;
	tp->tf_flags = mcp->mc_flags;
	if (tp->tf_flags & TF_HASSEGS) {
		tp->tf_ds = mcp->mc_ds;
		tp->tf_es = mcp->mc_es;
		tp->tf_fs = mcp->mc_fs;
		tp->tf_gs = mcp->mc_gs;
	}
	set_pcb_flags(pcb, PCB_FULL_IRET);
	if (mcp->mc_flags & _MC_HASBASES) {
		pcb->pcb_fsbase = mcp->mc_fsbase;
		pcb->pcb_gsbase = mcp->mc_gsbase;
	}
	return (0);
}
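
/*
 * Usage note (a hedged sketch, not kernel code): get_mcontext() and
 * set_mcontext() back the getcontext(3)/setcontext(3) family and signal
 * return.  A userland round trip looks roughly like:
 *
 *	ucontext_t uc;
 *
 *	getcontext(&uc);	fills uc_mcontext via get_mcontext()
 *	... adjust uc.uc_mcontext.mc_rip, mc_rsp, etc. with care ...
 *	setcontext(&uc);	validated by set_mcontext()
 *
 * A wrong mc_len or a disallowed rflags bit makes setcontext() fail with
 * EINVAL rather than letting a process smuggle in privileged state.
 */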

static void
get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave,
    size_t xfpusave_len)
{
	size_t max_len, len;

	mcp->mc_ownedfp = fpugetregs(td);
	bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate[0],
	    sizeof(mcp->mc_fpstate));
	mcp->mc_fpformat = fpuformat();
	if (!use_xsave || xfpusave_len == 0)
		return;
	max_len = cpu_max_ext_state_size - sizeof(struct savefpu);
	len = xfpusave_len;
	if (len > max_len) {
		len = max_len;
		/* Zero the tail of the caller's buffer that we do not fill. */
		bzero(xfpusave + max_len, xfpusave_len - max_len);
	}
	mcp->mc_flags |= _MC_HASFPXSTATE;
	mcp->mc_xfpustate_len = len;
	bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len);
}
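
/*
 * Layout note (informational, with illustrative numbers): with XSAVE the
 * per-thread save area is a legacy struct savefpu followed immediately
 * by the extended-state header and components, so the expression
 * "get_pcb_user_save_td(td) + 1" above points at the first byte past the
 * legacy area.  If, say, cpu_max_ext_state_size were 832 bytes with a
 * 512-byte legacy area, max_len would work out to 320 bytes of extended
 * state; the sizes are an example, not a claim about any particular CPU.
 */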

static int
set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate,
    size_t xfpustate_len)
{
	int error;

	if (mcp->mc_fpformat == _MC_FPFMT_NODEV)
		return (0);
	else if (mcp->mc_fpformat != _MC_FPFMT_XMM)
		return (EINVAL);
	else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) {
		/* We don't care what state is left in the FPU or PCB. */
		fpstate_drop(td);
		error = 0;
	} else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU ||
	    mcp->mc_ownedfp == _MC_FPOWNED_PCB) {
		error = fpusetregs(td, (struct savefpu *)&mcp->mc_fpstate,
		    xfpustate, xfpustate_len);
	} else
		return (EINVAL);
	return (error);
}

void
fpstate_drop(struct thread *td)
{

	KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu"));
	critical_enter();
	if (PCPU_GET(fpcurthread) == td)
		fpudrop();
	/*
	 * XXX force a full drop of the fpu.  The above only drops it if we
	 * owned it.
	 *
	 * XXX I don't much like fpugetregs()'s semantics of doing a full
	 * drop.  Dropping only to the pcb matches fnsave's behaviour.
	 * We only need to drop to !PCB_INITDONE in sendsig().  But
	 * sendsig() is the only caller of fpugetregs()... perhaps we just
	 * have too many layers.
	 */
	clear_pcb_flags(curthread->td_pcb,
	    PCB_FPUINITDONE | PCB_USERFPUINITDONE);
	critical_exit();
}

int
fill_dbregs(struct thread *td, struct dbreg *dbregs)
{
	struct pcb *pcb;

	if (td == NULL) {
		dbregs->dr[0] = rdr0();
		dbregs->dr[1] = rdr1();
		dbregs->dr[2] = rdr2();
		dbregs->dr[3] = rdr3();
		dbregs->dr[6] = rdr6();
		dbregs->dr[7] = rdr7();
	} else {
		pcb = td->td_pcb;
		dbregs->dr[0] = pcb->pcb_dr0;
		dbregs->dr[1] = pcb->pcb_dr1;
		dbregs->dr[2] = pcb->pcb_dr2;
		dbregs->dr[3] = pcb->pcb_dr3;
		dbregs->dr[6] = pcb->pcb_dr6;
		dbregs->dr[7] = pcb->pcb_dr7;
	}
	dbregs->dr[4] = 0;
	dbregs->dr[5] = 0;
	dbregs->dr[8] = 0;
	dbregs->dr[9] = 0;
	dbregs->dr[10] = 0;
	dbregs->dr[11] = 0;
	dbregs->dr[12] = 0;
	dbregs->dr[13] = 0;
	dbregs->dr[14] = 0;
	dbregs->dr[15] = 0;
	return (0);
}

int
set_dbregs(struct thread *td, struct dbreg *dbregs)
{
	struct pcb *pcb;
	int i;

	if (td == NULL) {
		load_dr0(dbregs->dr[0]);
		load_dr1(dbregs->dr[1]);
		load_dr2(dbregs->dr[2]);
		load_dr3(dbregs->dr[3]);
		load_dr6(dbregs->dr[6]);
		load_dr7(dbregs->dr[7]);
	} else {
		/*
		 * Don't let an illegal value for dr7 get set.  Specifically,
		 * check for undefined settings.  Setting these bit patterns
		 * results in undefined behaviour and can lead to an
		 * unexpected TRCTRAP or a general protection fault right
		 * here.  The upper bits of dr6 and dr7 must not be set.
		 */
		for (i = 0; i < 4; i++) {
			if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02)
				return (EINVAL);
			if (td->td_frame->tf_cs == _ucode32sel &&
			    DBREG_DR7_LEN(dbregs->dr[7], i) == DBREG_DR7_LEN_8)
				return (EINVAL);
		}
		if ((dbregs->dr[6] & 0xffffffff00000000ul) != 0 ||
		    (dbregs->dr[7] & 0xffffffff00000000ul) != 0)
			return (EINVAL);

		pcb = td->td_pcb;

		/*
		 * Don't let a process set a breakpoint that is not within the
		 * process's address space.  If a process could do this, it
		 * could halt the system by setting a breakpoint in the kernel
		 * (if ddb was enabled).  Thus, we need to check to make sure
		 * that no breakpoints are being enabled for addresses outside
		 * the process's address space.
		 *
		 * XXX - what about when the watched area of the user's
		 * address space is written into from within the kernel
		 * ... wouldn't that still cause a breakpoint to be generated
		 * from within kernel mode?
		 */

		for (i = 0; i < 4; i++) {
			/* Reject enabled breakpoints outside user space. */
			if (DBREG_DR7_ENABLED(dbregs->dr[7], i) &&
			    dbregs->dr[i] >= VM_MAXUSER_ADDRESS)
				return (EINVAL);
		}

		pcb->pcb_dr0 = dbregs->dr[0];
		pcb->pcb_dr1 = dbregs->dr[1];
		pcb->pcb_dr2 = dbregs->dr[2];
		pcb->pcb_dr3 = dbregs->dr[3];
		pcb->pcb_dr6 = dbregs->dr[6];
		pcb->pcb_dr7 = dbregs->dr[7];

		set_pcb_flags(pcb, PCB_DBREGS);
	}

	return (0);
}
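
/*
 * Worked example (informational; the bit layout is from the x86
 * architecture manuals and shown only for illustration): DR7 packs, per
 * breakpoint i, an enable bit pair in bits 2i..2i+1 and a 4-bit R/W+LEN
 * field in bits 16+4i..19+4i.  To watch 4 bytes at an address for writes
 * using dr0, a debugger would set dr0 to the address and compose dr7 as:
 *
 *	(0x1 << 0)	local enable for breakpoint 0
 *	(0x1 << 16)	R/W = 01, break on data writes
 *	(0x3 << 18)	LEN = 11, 4-byte watchpoint
 *
 * i.e. dr7 = 0xd0001, which passes the DBREG_DR7_ACCESS() and
 * DBREG_DR7_LEN() checks above (R/W = 10 is the undefined pattern that
 * is rejected).
 */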

void
reset_dbregs(void)
{

	load_dr7(0);	/* Turn off the control bits first. */
	load_dr0(0);
	load_dr1(0);
	load_dr2(0);
	load_dr3(0);
	load_dr6(0);
}

/*
 * Return > 0 if a hardware breakpoint has been hit, and the
 * breakpoint was in user space.  Return 0 otherwise.
 */
int
user_dbreg_trap(void)
{
	u_int64_t dr7, dr6;	/* debug registers dr6 and dr7 */
	u_int64_t bp;		/* breakpoint bits extracted from dr6 */
	int nbp;		/* number of breakpoints that triggered */
	caddr_t addr[4];	/* breakpoint addresses */
	int i;

	dr7 = rdr7();
	if ((dr7 & 0x000000ff) == 0) {
		/*
		 * All local and global enable bits in the dr7 register
		 * are zero, thus the trap couldn't have been caused by
		 * the hardware debug registers.
		 */
		return (0);
	}

	nbp = 0;
	dr6 = rdr6();
	bp = dr6 & 0x0000000f;

	if (bp == 0) {
		/*
		 * None of the breakpoint bits are set, meaning this
		 * trap was not caused by any of the debug registers.
		 */
		return (0);
	}

	/*
	 * At least one of the breakpoints was hit; check to see
	 * which ones and whether any of them are user space addresses.
	 */

	if (bp & 0x01)
		addr[nbp++] = (caddr_t)rdr0();
	if (bp & 0x02)
		addr[nbp++] = (caddr_t)rdr1();
	if (bp & 0x04)
		addr[nbp++] = (caddr_t)rdr2();
	if (bp & 0x08)
		addr[nbp++] = (caddr_t)rdr3();

	for (i = 0; i < nbp; i++) {
		if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) {
			/* addr[i] is in user space. */
			return (nbp);
		}
	}

	/* None of the breakpoints are in user space. */
	return (0);
}

/*
 * pcb_flags is only modified by the current thread, or by other threads
 * when the current thread is stopped.  However, the current thread may
 * change it from the interrupt context in cpu_switch(), or in the trap
 * handler.  When we read-modify-write pcb_flags from C sources, the
 * compiler may generate code that is not atomic with respect to the
 * interrupt handler.  If a trap or interrupt happens and any flag is
 * modified from the handler, it can be clobbered with the cached value
 * later.  Therefore, we implement setting and clearing the flags with
 * single-instruction functions, which do not race with possible
 * modification of the flags from the trap or interrupt context, because
 * traps and interrupts are executed only on instruction boundaries.
 */
void
set_pcb_flags_raw(struct pcb *pcb, const u_int flags)
{

	__asm __volatile("orl %1,%0"
	    : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags)
	    : "cc", "memory");
}
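
/*
 * Illustration (informational): a plain C read-modify-write such as
 *
 *	pcb->pcb_flags |= flags;
 *
 * may compile to a separate load, or, and store.  An interrupt taken
 * between the load and the store that also changes pcb_flags would have
 * its update overwritten by the stale store.  The single "orl" with a
 * memory operand above cannot be split that way, because interrupts are
 * recognized only on instruction boundaries.
 */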

/*
 * The support for RDFSBASE, WRFSBASE and similar instructions for the
 * %gs base requires that the kernel saves MSR_FSBASE and MSR_{K,}GSBASE
 * into the pcb if user space modified the bases.  We must save them on
 * the context switch or if the return to usermode happens through
 * doreti.
 *
 * Tracking of both events is performed by the pcb flag PCB_FULL_IRET,
 * which has the consequence that the base MSRs must be saved each time
 * the PCB_FULL_IRET flag is set.  We disable interrupts to sync with
 * context switches.
 */
void
set_pcb_flags(struct pcb *pcb, const u_int flags)
{
	register_t r;

	if (curpcb == pcb &&
	    (flags & PCB_FULL_IRET) != 0 &&
	    (pcb->pcb_flags & PCB_FULL_IRET) == 0 &&
	    (cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0) {
		r = intr_disable();
		if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) {
			if (rfs() == _ufssel)
				pcb->pcb_fsbase = rdfsbase();
			if (rgs() == _ugssel)
				pcb->pcb_gsbase = rdmsr(MSR_KGSBASE);
		}
		set_pcb_flags_raw(pcb, flags);
		intr_restore(r);
	} else {
		set_pcb_flags_raw(pcb, flags);
	}
}

void
clear_pcb_flags(struct pcb *pcb, const u_int flags)
{

	__asm __volatile("andl %1,%0"
	    : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags)
	    : "cc", "memory");
}

#ifdef KDB

/*
 * Provide inb() and outb() as functions.  They are normally only available as
 * inline functions, thus cannot be called from the debugger.
 */

/* silence compiler warnings */
u_char inb_(u_short);
void outb_(u_short, u_char);

u_char
inb_(u_short port)
{
	return (inb(port));
}

void
outb_(u_short port, u_char data)
{
	outb(port, data);
}
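
/*
 * Usage note (a hedged example): from the ddb(4) prompt these wrappers
 * can be invoked directly, e.g. "call inb_(0x21)" to read the master
 * ATPIC's interrupt mask register, whereas the inline inb() has no
 * symbol to call.
 */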

#endif /* KDB */
