machdep.c (248084) vs. machdep.c (250840)
1/*-
2 * Copyright (c) 1992 Terrence R. Lambert.
3 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
4 * All rights reserved.
5 *
6 * This code is derived from software contributed to Berkeley by
7 * William Jolitz.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 * 3. All advertising materials mentioning features or use of this software
18 * must display the following acknowledgement:
19 * This product includes software developed by the University of
20 * California, Berkeley and its contributors.
21 * 4. Neither the name of the University nor the names of its contributors
22 * may be used to endorse or promote products derived from this software
23 * without specific prior written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 * SUCH DAMAGE.
36 *
37 * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91
38 */
39
40#include <sys/cdefs.h>
41__FBSDID("$FreeBSD: head/sys/i386/i386/machdep.c 248084 2013-03-09 02:32:23Z attilio $");
41__FBSDID("$FreeBSD: head/sys/i386/i386/machdep.c 250840 2013-05-21 03:05:49Z marcel $");
42
43#include "opt_apic.h"
44#include "opt_atalk.h"
45#include "opt_atpic.h"
46#include "opt_compat.h"
47#include "opt_cpu.h"
48#include "opt_ddb.h"
49#include "opt_inet.h"
50#include "opt_ipx.h"
51#include "opt_isa.h"
52#include "opt_kstack_pages.h"
53#include "opt_maxmem.h"
54#include "opt_mp_watchdog.h"
55#include "opt_npx.h"
56#include "opt_perfmon.h"
57#include "opt_platform.h"
57#include "opt_xbox.h"
58#include "opt_kdtrace.h"
59
60#include <sys/param.h>
61#include <sys/proc.h>
62#include <sys/systm.h>
63#include <sys/bio.h>
64#include <sys/buf.h>
65#include <sys/bus.h>
66#include <sys/callout.h>
67#include <sys/cons.h>
68#include <sys/cpu.h>
69#include <sys/eventhandler.h>
70#include <sys/exec.h>
71#include <sys/imgact.h>
72#include <sys/kdb.h>
73#include <sys/kernel.h>
74#include <sys/ktr.h>
75#include <sys/linker.h>
76#include <sys/lock.h>
77#include <sys/malloc.h>
78#include <sys/memrange.h>
79#include <sys/msgbuf.h>
80#include <sys/mutex.h>
81#include <sys/pcpu.h>
82#include <sys/ptrace.h>
83#include <sys/reboot.h>
84#include <sys/rwlock.h>
85#include <sys/sched.h>
86#include <sys/signalvar.h>
87#ifdef SMP
88#include <sys/smp.h>
89#endif
90#include <sys/syscallsubr.h>
91#include <sys/sysctl.h>
92#include <sys/sysent.h>
93#include <sys/sysproto.h>
94#include <sys/ucontext.h>
95#include <sys/vmmeter.h>
96
97#include <vm/vm.h>
98#include <vm/vm_extern.h>
99#include <vm/vm_kern.h>
100#include <vm/vm_page.h>
101#include <vm/vm_map.h>
102#include <vm/vm_object.h>
103#include <vm/vm_pager.h>
104#include <vm/vm_param.h>
105
106#ifdef DDB
107#ifndef KDB
108#error KDB must be enabled in order for DDB to work!
109#endif
110#include <ddb/ddb.h>
111#include <ddb/db_sym.h>
112#endif
113
114#include <isa/rtc.h>
115
116#include <net/netisr.h>
117
118#include <machine/bootinfo.h>
119#include <machine/clock.h>
120#include <machine/cpu.h>
121#include <machine/cputypes.h>
122#include <machine/intr_machdep.h>
123#include <x86/mca.h>
124#include <machine/md_var.h>
125#include <machine/metadata.h>
126#include <machine/mp_watchdog.h>
127#include <machine/pc/bios.h>
128#include <machine/pcb.h>
129#include <machine/pcb_ext.h>
130#include <machine/proc.h>
131#include <machine/reg.h>
132#include <machine/sigframe.h>
133#include <machine/specialreg.h>
134#include <machine/vm86.h>
135#ifdef PERFMON
136#include <machine/perfmon.h>
137#endif
138#ifdef SMP
139#include <machine/smp.h>
140#endif
142#ifdef FDT
143#include <x86/fdt.h>
144#endif
141
142#ifdef DEV_APIC
143#include <machine/apicvar.h>
144#endif
145
146#ifdef DEV_ISA
147#include <x86/isa/icu.h>
148#endif
149
150#ifdef XBOX
151#include <machine/xbox.h>
152
153int arch_i386_is_xbox = 0;
154uint32_t arch_i386_xbox_memsize = 0;
155#endif
156
157#ifdef XEN
158/* XEN includes */
159#include <machine/xen/xen-os.h>
160#include <xen/hypervisor.h>
161#include <machine/xen/xen-os.h>
162#include <machine/xen/xenvar.h>
163#include <machine/xen/xenfunc.h>
164#include <xen/xen_intr.h>
165
166void Xhypervisor_callback(void);
167void failsafe_callback(void);
168
169extern trap_info_t trap_table[];
170struct proc_ldt default_proc_ldt;
171extern int init_first;
172int running_xen = 1;
173extern unsigned long physfree;
174#endif /* XEN */
175
176/* Sanity check for __curthread() */
177CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
178
179extern void init386(int first);
180extern void dblfault_handler(void);
181
182extern void printcpuinfo(void); /* XXX header file */
183extern void finishidentcpu(void);
184extern void panicifcpuunsupported(void);
185
186#define CS_SECURE(cs) (ISPL(cs) == SEL_UPL)
187#define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
188
189#if !defined(CPU_DISABLE_SSE) && defined(I686_CPU)
190#define CPU_ENABLE_SSE
191#endif
192
193static void cpu_startup(void *);
194static void fpstate_drop(struct thread *td);
195static void get_fpcontext(struct thread *td, mcontext_t *mcp);
196static int set_fpcontext(struct thread *td, const mcontext_t *mcp);
197#ifdef CPU_ENABLE_SSE
198static void set_fpregs_xmm(struct save87 *, struct savexmm *);
199static void fill_fpregs_xmm(struct savexmm *, struct save87 *);
200#endif /* CPU_ENABLE_SSE */
201SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
202
203#ifdef DDB
204extern vm_offset_t ksym_start, ksym_end;
205#endif
206
207/* Intel ICH registers */
208#define ICH_PMBASE 0x400
 209#define ICH_SMI_EN		(ICH_PMBASE + 0x30)
210
211int _udatasel, _ucodesel;
212u_int basemem;
213
214int cold = 1;
215
216#ifdef COMPAT_43
217static void osendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask);
218#endif
219#ifdef COMPAT_FREEBSD4
220static void freebsd4_sendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask);
221#endif
222
223long Maxmem = 0;
224long realmem = 0;
225
226#ifdef PAE
227FEATURE(pae, "Physical Address Extensions");
228#endif
229
230/*
231 * The number of PHYSMAP entries must be one less than the number of
232 * PHYSSEG entries because the PHYSMAP entry that spans the largest
233 * physical address that is accessible by ISA DMA is split into two
234 * PHYSSEG entries.
235 */
236#define PHYSMAP_SIZE (2 * (VM_PHYSSEG_MAX - 1))
237
238vm_paddr_t phys_avail[PHYSMAP_SIZE + 2];
239vm_paddr_t dump_avail[PHYSMAP_SIZE + 2];
240
241/* must be 2 less so 0 0 can signal end of chunks */
242#define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(phys_avail[0])) - 2)
243#define DUMP_AVAIL_ARRAY_END ((sizeof(dump_avail) / sizeof(dump_avail[0])) - 2)
244
245struct kva_md_info kmi;
246
247static struct trapframe proc0_tf;
248struct pcpu __pcpu[MAXCPU];
249
250struct mtx icu_lock;
251
252struct mem_range_softc mem_range_softc;
253
254static void
255cpu_startup(dummy)
256 void *dummy;
257{
258 uintmax_t memsize;
259 char *sysenv;
260
261 /*
 262	 * On MacBooks, we need to prevent the legacy USB circuit from
 263	 * generating an SMI# because this can cause several problems,
264 * namely: incorrect CPU frequency detection and failure to
265 * start the APs.
266 * We do this by disabling a bit in the SMI_EN (SMI Control and
267 * Enable register) of the Intel ICH LPC Interface Bridge.
268 */
269 sysenv = getenv("smbios.system.product");
270 if (sysenv != NULL) {
271 if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
272 strncmp(sysenv, "MacBook3,1", 10) == 0 ||
273 strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
274 strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
275 strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
276 strncmp(sysenv, "Macmini1,1", 10) == 0) {
277 if (bootverbose)
278 printf("Disabling LEGACY_USB_EN bit on "
279 "Intel ICH.\n");
280 outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
281 }
282 freeenv(sysenv);
283 }
284
285 /*
286 * Good {morning,afternoon,evening,night}.
287 */
288 startrtclock();
289 printcpuinfo();
290 panicifcpuunsupported();
291#ifdef PERFMON
292 perfmon_init();
293#endif
294 realmem = Maxmem;
295
296 /*
297 * Display physical memory if SMBIOS reports reasonable amount.
298 */
299 memsize = 0;
300 sysenv = getenv("smbios.memory.enabled");
301 if (sysenv != NULL) {
302 memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
303 freeenv(sysenv);
304 }
305 if (memsize < ptoa((uintmax_t)cnt.v_free_count))
306 memsize = ptoa((uintmax_t)Maxmem);
307 printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20);
308
309 /*
310 * Display any holes after the first chunk of extended memory.
311 */
312 if (bootverbose) {
313 int indx;
314
315 printf("Physical memory chunk(s):\n");
316 for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
317 vm_paddr_t size;
318
319 size = phys_avail[indx + 1] - phys_avail[indx];
320 printf(
321 "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
322 (uintmax_t)phys_avail[indx],
323 (uintmax_t)phys_avail[indx + 1] - 1,
324 (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
325 }
326 }
327
328 vm_ksubmap_init(&kmi);
329
330 printf("avail memory = %ju (%ju MB)\n",
331 ptoa((uintmax_t)cnt.v_free_count),
332 ptoa((uintmax_t)cnt.v_free_count) / 1048576);
333
334 /*
335 * Set up buffers, so they can be used to read disk labels.
336 */
337 bufinit();
338 vm_pager_bufferinit();
339#ifndef XEN
340 cpu_setregs();
341#endif
342}
343
344/*
345 * Send an interrupt to process.
346 *
347 * Stack is set up to allow sigcode stored
348 * at top to call routine, followed by kcall
349 * to sigreturn routine below. After sigreturn
350 * resets the signal mask, the stack, and the
351 * frame pointer, it returns to the user
352 * specified pc, psl.
353 */
354#ifdef COMPAT_43
355static void
356osendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
357{
358 struct osigframe sf, *fp;
359 struct proc *p;
360 struct thread *td;
361 struct sigacts *psp;
362 struct trapframe *regs;
363 int sig;
364 int oonstack;
365
366 td = curthread;
367 p = td->td_proc;
368 PROC_LOCK_ASSERT(p, MA_OWNED);
369 sig = ksi->ksi_signo;
370 psp = p->p_sigacts;
371 mtx_assert(&psp->ps_mtx, MA_OWNED);
372 regs = td->td_frame;
373 oonstack = sigonstack(regs->tf_esp);
374
375 /* Allocate space for the signal handler context. */
376 if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
377 SIGISMEMBER(psp->ps_sigonstack, sig)) {
378 fp = (struct osigframe *)(td->td_sigstk.ss_sp +
379 td->td_sigstk.ss_size - sizeof(struct osigframe));
380#if defined(COMPAT_43)
381 td->td_sigstk.ss_flags |= SS_ONSTACK;
382#endif
383 } else
384 fp = (struct osigframe *)regs->tf_esp - 1;
385
386 /* Translate the signal if appropriate. */
387 if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize)
388 sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
389
390 /* Build the argument list for the signal handler. */
391 sf.sf_signum = sig;
392 sf.sf_scp = (register_t)&fp->sf_siginfo.si_sc;
393 bzero(&sf.sf_siginfo, sizeof(sf.sf_siginfo));
394 if (SIGISMEMBER(psp->ps_siginfo, sig)) {
395 /* Signal handler installed with SA_SIGINFO. */
396 sf.sf_arg2 = (register_t)&fp->sf_siginfo;
397 sf.sf_siginfo.si_signo = sig;
398 sf.sf_siginfo.si_code = ksi->ksi_code;
399 sf.sf_ahu.sf_action = (__osiginfohandler_t *)catcher;
400 sf.sf_addr = 0;
401 } else {
402 /* Old FreeBSD-style arguments. */
403 sf.sf_arg2 = ksi->ksi_code;
404 sf.sf_addr = (register_t)ksi->ksi_addr;
405 sf.sf_ahu.sf_handler = catcher;
406 }
407 mtx_unlock(&psp->ps_mtx);
408 PROC_UNLOCK(p);
409
410 /* Save most if not all of trap frame. */
411 sf.sf_siginfo.si_sc.sc_eax = regs->tf_eax;
412 sf.sf_siginfo.si_sc.sc_ebx = regs->tf_ebx;
413 sf.sf_siginfo.si_sc.sc_ecx = regs->tf_ecx;
414 sf.sf_siginfo.si_sc.sc_edx = regs->tf_edx;
415 sf.sf_siginfo.si_sc.sc_esi = regs->tf_esi;
416 sf.sf_siginfo.si_sc.sc_edi = regs->tf_edi;
417 sf.sf_siginfo.si_sc.sc_cs = regs->tf_cs;
418 sf.sf_siginfo.si_sc.sc_ds = regs->tf_ds;
419 sf.sf_siginfo.si_sc.sc_ss = regs->tf_ss;
420 sf.sf_siginfo.si_sc.sc_es = regs->tf_es;
421 sf.sf_siginfo.si_sc.sc_fs = regs->tf_fs;
422 sf.sf_siginfo.si_sc.sc_gs = rgs();
423 sf.sf_siginfo.si_sc.sc_isp = regs->tf_isp;
424
425 /* Build the signal context to be used by osigreturn(). */
426 sf.sf_siginfo.si_sc.sc_onstack = (oonstack) ? 1 : 0;
427 SIG2OSIG(*mask, sf.sf_siginfo.si_sc.sc_mask);
428 sf.sf_siginfo.si_sc.sc_sp = regs->tf_esp;
429 sf.sf_siginfo.si_sc.sc_fp = regs->tf_ebp;
430 sf.sf_siginfo.si_sc.sc_pc = regs->tf_eip;
431 sf.sf_siginfo.si_sc.sc_ps = regs->tf_eflags;
432 sf.sf_siginfo.si_sc.sc_trapno = regs->tf_trapno;
433 sf.sf_siginfo.si_sc.sc_err = regs->tf_err;
434
435 /*
436 * If we're a vm86 process, we want to save the segment registers.
437 * We also change eflags to be our emulated eflags, not the actual
438 * eflags.
439 */
440 if (regs->tf_eflags & PSL_VM) {
441 /* XXX confusing names: `tf' isn't a trapframe; `regs' is. */
442 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
443 struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86;
444
445 sf.sf_siginfo.si_sc.sc_gs = tf->tf_vm86_gs;
446 sf.sf_siginfo.si_sc.sc_fs = tf->tf_vm86_fs;
447 sf.sf_siginfo.si_sc.sc_es = tf->tf_vm86_es;
448 sf.sf_siginfo.si_sc.sc_ds = tf->tf_vm86_ds;
449
450 if (vm86->vm86_has_vme == 0)
451 sf.sf_siginfo.si_sc.sc_ps =
452 (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
453 (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));
454
455 /* See sendsig() for comments. */
456 tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
457 }
458
459 /*
460 * Copy the sigframe out to the user's stack.
461 */
462 if (copyout(&sf, fp, sizeof(*fp)) != 0) {
463#ifdef DEBUG
464 printf("process %ld has trashed its stack\n", (long)p->p_pid);
465#endif
466 PROC_LOCK(p);
467 sigexit(td, SIGILL);
468 }
469
470 regs->tf_esp = (int)fp;
471 if (p->p_sysent->sv_sigcode_base != 0) {
472 regs->tf_eip = p->p_sysent->sv_sigcode_base + szsigcode -
473 szosigcode;
474 } else {
475 /* a.out sysentvec does not use shared page */
476 regs->tf_eip = p->p_sysent->sv_psstrings - szosigcode;
477 }
478 regs->tf_eflags &= ~(PSL_T | PSL_D);
479 regs->tf_cs = _ucodesel;
480 regs->tf_ds = _udatasel;
481 regs->tf_es = _udatasel;
482 regs->tf_fs = _udatasel;
483 load_gs(_udatasel);
484 regs->tf_ss = _udatasel;
485 PROC_LOCK(p);
486 mtx_lock(&psp->ps_mtx);
487}
488#endif /* COMPAT_43 */
489
490#ifdef COMPAT_FREEBSD4
491static void
492freebsd4_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
493{
494 struct sigframe4 sf, *sfp;
495 struct proc *p;
496 struct thread *td;
497 struct sigacts *psp;
498 struct trapframe *regs;
499 int sig;
500 int oonstack;
501
502 td = curthread;
503 p = td->td_proc;
504 PROC_LOCK_ASSERT(p, MA_OWNED);
505 sig = ksi->ksi_signo;
506 psp = p->p_sigacts;
507 mtx_assert(&psp->ps_mtx, MA_OWNED);
508 regs = td->td_frame;
509 oonstack = sigonstack(regs->tf_esp);
510
511 /* Save user context. */
512 bzero(&sf, sizeof(sf));
513 sf.sf_uc.uc_sigmask = *mask;
514 sf.sf_uc.uc_stack = td->td_sigstk;
515 sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
516 ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
517 sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
518 sf.sf_uc.uc_mcontext.mc_gs = rgs();
519 bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs));
520 bzero(sf.sf_uc.uc_mcontext.mc_fpregs,
521 sizeof(sf.sf_uc.uc_mcontext.mc_fpregs));
522 bzero(sf.sf_uc.uc_mcontext.__spare__,
523 sizeof(sf.sf_uc.uc_mcontext.__spare__));
524 bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__));
525
526 /* Allocate space for the signal handler context. */
527 if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
528 SIGISMEMBER(psp->ps_sigonstack, sig)) {
529 sfp = (struct sigframe4 *)(td->td_sigstk.ss_sp +
530 td->td_sigstk.ss_size - sizeof(struct sigframe4));
531#if defined(COMPAT_43)
532 td->td_sigstk.ss_flags |= SS_ONSTACK;
533#endif
534 } else
535 sfp = (struct sigframe4 *)regs->tf_esp - 1;
536
537 /* Translate the signal if appropriate. */
538 if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize)
539 sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
540
541 /* Build the argument list for the signal handler. */
542 sf.sf_signum = sig;
543 sf.sf_ucontext = (register_t)&sfp->sf_uc;
544 bzero(&sf.sf_si, sizeof(sf.sf_si));
545 if (SIGISMEMBER(psp->ps_siginfo, sig)) {
546 /* Signal handler installed with SA_SIGINFO. */
547 sf.sf_siginfo = (register_t)&sfp->sf_si;
548 sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;
549
550 /* Fill in POSIX parts */
551 sf.sf_si.si_signo = sig;
552 sf.sf_si.si_code = ksi->ksi_code;
553 sf.sf_si.si_addr = ksi->ksi_addr;
554 } else {
555 /* Old FreeBSD-style arguments. */
556 sf.sf_siginfo = ksi->ksi_code;
557 sf.sf_addr = (register_t)ksi->ksi_addr;
558 sf.sf_ahu.sf_handler = catcher;
559 }
560 mtx_unlock(&psp->ps_mtx);
561 PROC_UNLOCK(p);
562
563 /*
564 * If we're a vm86 process, we want to save the segment registers.
565 * We also change eflags to be our emulated eflags, not the actual
566 * eflags.
567 */
568 if (regs->tf_eflags & PSL_VM) {
569 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
570 struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86;
571
572 sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs;
573 sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs;
574 sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es;
575 sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds;
576
577 if (vm86->vm86_has_vme == 0)
578 sf.sf_uc.uc_mcontext.mc_eflags =
579 (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
580 (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));
581
582 /*
583 * Clear PSL_NT to inhibit T_TSSFLT faults on return from
584 * syscalls made by the signal handler. This just avoids
585 * wasting time for our lazy fixup of such faults. PSL_NT
586 * does nothing in vm86 mode, but vm86 programs can set it
587 * almost legitimately in probes for old cpu types.
588 */
589 tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
590 }
591
592 /*
593 * Copy the sigframe out to the user's stack.
594 */
595 if (copyout(&sf, sfp, sizeof(*sfp)) != 0) {
596#ifdef DEBUG
597 printf("process %ld has trashed its stack\n", (long)p->p_pid);
598#endif
599 PROC_LOCK(p);
600 sigexit(td, SIGILL);
601 }
602
603 regs->tf_esp = (int)sfp;
604 regs->tf_eip = p->p_sysent->sv_sigcode_base + szsigcode -
605 szfreebsd4_sigcode;
606 regs->tf_eflags &= ~(PSL_T | PSL_D);
607 regs->tf_cs = _ucodesel;
608 regs->tf_ds = _udatasel;
609 regs->tf_es = _udatasel;
610 regs->tf_fs = _udatasel;
611 regs->tf_ss = _udatasel;
612 PROC_LOCK(p);
613 mtx_lock(&psp->ps_mtx);
614}
615#endif /* COMPAT_FREEBSD4 */
616
617void
618sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
619{
620 struct sigframe sf, *sfp;
621 struct proc *p;
622 struct thread *td;
623 struct sigacts *psp;
624 char *sp;
625 struct trapframe *regs;
626 struct segment_descriptor *sdp;
627 int sig;
628 int oonstack;
629
630 td = curthread;
631 p = td->td_proc;
632 PROC_LOCK_ASSERT(p, MA_OWNED);
633 sig = ksi->ksi_signo;
634 psp = p->p_sigacts;
635 mtx_assert(&psp->ps_mtx, MA_OWNED);
636#ifdef COMPAT_FREEBSD4
637 if (SIGISMEMBER(psp->ps_freebsd4, sig)) {
638 freebsd4_sendsig(catcher, ksi, mask);
639 return;
640 }
641#endif
642#ifdef COMPAT_43
643 if (SIGISMEMBER(psp->ps_osigset, sig)) {
644 osendsig(catcher, ksi, mask);
645 return;
646 }
647#endif
648 regs = td->td_frame;
649 oonstack = sigonstack(regs->tf_esp);
650
651 /* Save user context. */
652 bzero(&sf, sizeof(sf));
653 sf.sf_uc.uc_sigmask = *mask;
654 sf.sf_uc.uc_stack = td->td_sigstk;
655 sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
656 ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
657 sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
658 sf.sf_uc.uc_mcontext.mc_gs = rgs();
659 bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs));
660 sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
661 get_fpcontext(td, &sf.sf_uc.uc_mcontext);
662 fpstate_drop(td);
663 /*
664 * Unconditionally fill the fsbase and gsbase into the mcontext.
665 */
666 sdp = &td->td_pcb->pcb_fsd;
667 sf.sf_uc.uc_mcontext.mc_fsbase = sdp->sd_hibase << 24 |
668 sdp->sd_lobase;
669 sdp = &td->td_pcb->pcb_gsd;
670 sf.sf_uc.uc_mcontext.mc_gsbase = sdp->sd_hibase << 24 |
671 sdp->sd_lobase;
672 sf.sf_uc.uc_mcontext.mc_flags = 0;
673 bzero(sf.sf_uc.uc_mcontext.mc_spare2,
674 sizeof(sf.sf_uc.uc_mcontext.mc_spare2));
675 bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__));
676
677 /* Allocate space for the signal handler context. */
678 if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
679 SIGISMEMBER(psp->ps_sigonstack, sig)) {
680 sp = td->td_sigstk.ss_sp +
681 td->td_sigstk.ss_size - sizeof(struct sigframe);
682#if defined(COMPAT_43)
683 td->td_sigstk.ss_flags |= SS_ONSTACK;
684#endif
685 } else
686 sp = (char *)regs->tf_esp - sizeof(struct sigframe);
687 /* Align to 16 bytes. */
688 sfp = (struct sigframe *)((unsigned int)sp & ~0xF);
689
690 /* Translate the signal if appropriate. */
691 if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize)
692 sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
693
694 /* Build the argument list for the signal handler. */
695 sf.sf_signum = sig;
696 sf.sf_ucontext = (register_t)&sfp->sf_uc;
697 bzero(&sf.sf_si, sizeof(sf.sf_si));
698 if (SIGISMEMBER(psp->ps_siginfo, sig)) {
699 /* Signal handler installed with SA_SIGINFO. */
700 sf.sf_siginfo = (register_t)&sfp->sf_si;
701 sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;
702
703 /* Fill in POSIX parts */
704 sf.sf_si = ksi->ksi_info;
705 sf.sf_si.si_signo = sig; /* maybe a translated signal */
706 } else {
707 /* Old FreeBSD-style arguments. */
708 sf.sf_siginfo = ksi->ksi_code;
709 sf.sf_addr = (register_t)ksi->ksi_addr;
710 sf.sf_ahu.sf_handler = catcher;
711 }
712 mtx_unlock(&psp->ps_mtx);
713 PROC_UNLOCK(p);
714
715 /*
716 * If we're a vm86 process, we want to save the segment registers.
717 * We also change eflags to be our emulated eflags, not the actual
718 * eflags.
719 */
720 if (regs->tf_eflags & PSL_VM) {
721 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
722 struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86;
723
724 sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs;
725 sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs;
726 sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es;
727 sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds;
728
729 if (vm86->vm86_has_vme == 0)
730 sf.sf_uc.uc_mcontext.mc_eflags =
731 (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
732 (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));
733
734 /*
735 * Clear PSL_NT to inhibit T_TSSFLT faults on return from
736 * syscalls made by the signal handler. This just avoids
737 * wasting time for our lazy fixup of such faults. PSL_NT
738 * does nothing in vm86 mode, but vm86 programs can set it
739 * almost legitimately in probes for old cpu types.
740 */
741 tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
742 }
743
744 /*
745 * Copy the sigframe out to the user's stack.
746 */
747 if (copyout(&sf, sfp, sizeof(*sfp)) != 0) {
748#ifdef DEBUG
749 printf("process %ld has trashed its stack\n", (long)p->p_pid);
750#endif
751 PROC_LOCK(p);
752 sigexit(td, SIGILL);
753 }
754
755 regs->tf_esp = (int)sfp;
756 regs->tf_eip = p->p_sysent->sv_sigcode_base;
757 regs->tf_eflags &= ~(PSL_T | PSL_D);
758 regs->tf_cs = _ucodesel;
759 regs->tf_ds = _udatasel;
760 regs->tf_es = _udatasel;
761 regs->tf_fs = _udatasel;
762 regs->tf_ss = _udatasel;
763 PROC_LOCK(p);
764 mtx_lock(&psp->ps_mtx);
765}
766
767/*
768 * System call to cleanup state after a signal
769 * has been taken. Reset signal mask and
770 * stack state from context left by sendsig (above).
771 * Return to previous pc and psl as specified by
772 * context left by sendsig. Check carefully to
773 * make sure that the user has not modified the
774 * state to gain improper privileges.
775 *
776 * MPSAFE
777 */
778#ifdef COMPAT_43
779int
780osigreturn(td, uap)
781 struct thread *td;
782 struct osigreturn_args /* {
783 struct osigcontext *sigcntxp;
784 } */ *uap;
785{
786 struct osigcontext sc;
787 struct trapframe *regs;
788 struct osigcontext *scp;
789 int eflags, error;
790 ksiginfo_t ksi;
791
792 regs = td->td_frame;
793 error = copyin(uap->sigcntxp, &sc, sizeof(sc));
794 if (error != 0)
795 return (error);
796 scp = &sc;
797 eflags = scp->sc_ps;
798 if (eflags & PSL_VM) {
799 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
800 struct vm86_kernel *vm86;
801
802 /*
803 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
804 * set up the vm86 area, and we can't enter vm86 mode.
805 */
806 if (td->td_pcb->pcb_ext == 0)
807 return (EINVAL);
808 vm86 = &td->td_pcb->pcb_ext->ext_vm86;
809 if (vm86->vm86_inited == 0)
810 return (EINVAL);
811
812 /* Go back to user mode if both flags are set. */
813 if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) {
814 ksiginfo_init_trap(&ksi);
815 ksi.ksi_signo = SIGBUS;
816 ksi.ksi_code = BUS_OBJERR;
817 ksi.ksi_addr = (void *)regs->tf_eip;
818 trapsignal(td, &ksi);
819 }
820
821 if (vm86->vm86_has_vme) {
822 eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
823 (eflags & VME_USERCHANGE) | PSL_VM;
824 } else {
825 vm86->vm86_eflags = eflags; /* save VIF, VIP */
826 eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
827 (eflags & VM_USERCHANGE) | PSL_VM;
828 }
829 tf->tf_vm86_ds = scp->sc_ds;
830 tf->tf_vm86_es = scp->sc_es;
831 tf->tf_vm86_fs = scp->sc_fs;
832 tf->tf_vm86_gs = scp->sc_gs;
833 tf->tf_ds = _udatasel;
834 tf->tf_es = _udatasel;
835 tf->tf_fs = _udatasel;
836 } else {
837 /*
838 * Don't allow users to change privileged or reserved flags.
839 */
840 /*
841 * XXX do allow users to change the privileged flag PSL_RF.
842 * The cpu sets PSL_RF in tf_eflags for faults. Debuggers
843 * should sometimes set it there too. tf_eflags is kept in
844 * the signal context during signal handling and there is no
845 * other place to remember it, so the PSL_RF bit may be
846 * corrupted by the signal handler without us knowing.
847 * Corruption of the PSL_RF bit at worst causes one more or
848 * one less debugger trap, so allowing it is fairly harmless.
849 */
850 if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) {
851 return (EINVAL);
852 }
853
854 /*
855 * Don't allow users to load a valid privileged %cs. Let the
856 * hardware check for invalid selectors, excess privilege in
857 * other selectors, invalid %eip's and invalid %esp's.
858 */
859 if (!CS_SECURE(scp->sc_cs)) {
860 ksiginfo_init_trap(&ksi);
861 ksi.ksi_signo = SIGBUS;
862 ksi.ksi_code = BUS_OBJERR;
863 ksi.ksi_trapno = T_PROTFLT;
864 ksi.ksi_addr = (void *)regs->tf_eip;
865 trapsignal(td, &ksi);
866 return (EINVAL);
867 }
868 regs->tf_ds = scp->sc_ds;
869 regs->tf_es = scp->sc_es;
870 regs->tf_fs = scp->sc_fs;
871 }
872
873 /* Restore remaining registers. */
874 regs->tf_eax = scp->sc_eax;
875 regs->tf_ebx = scp->sc_ebx;
876 regs->tf_ecx = scp->sc_ecx;
877 regs->tf_edx = scp->sc_edx;
878 regs->tf_esi = scp->sc_esi;
879 regs->tf_edi = scp->sc_edi;
880 regs->tf_cs = scp->sc_cs;
881 regs->tf_ss = scp->sc_ss;
882 regs->tf_isp = scp->sc_isp;
883 regs->tf_ebp = scp->sc_fp;
884 regs->tf_esp = scp->sc_sp;
885 regs->tf_eip = scp->sc_pc;
886 regs->tf_eflags = eflags;
887
888#if defined(COMPAT_43)
889 if (scp->sc_onstack & 1)
890 td->td_sigstk.ss_flags |= SS_ONSTACK;
891 else
892 td->td_sigstk.ss_flags &= ~SS_ONSTACK;
893#endif
894 kern_sigprocmask(td, SIG_SETMASK, (sigset_t *)&scp->sc_mask, NULL,
895 SIGPROCMASK_OLD);
896 return (EJUSTRETURN);
897}
898#endif /* COMPAT_43 */
899
900#ifdef COMPAT_FREEBSD4
901/*
902 * MPSAFE
903 */
904int
905freebsd4_sigreturn(td, uap)
906 struct thread *td;
907 struct freebsd4_sigreturn_args /* {
908 const ucontext4 *sigcntxp;
909 } */ *uap;
910{
911 struct ucontext4 uc;
912 struct trapframe *regs;
913 struct ucontext4 *ucp;
914 int cs, eflags, error;
915 ksiginfo_t ksi;
916
917 error = copyin(uap->sigcntxp, &uc, sizeof(uc));
918 if (error != 0)
919 return (error);
920 ucp = &uc;
921 regs = td->td_frame;
922 eflags = ucp->uc_mcontext.mc_eflags;
923 if (eflags & PSL_VM) {
924 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
925 struct vm86_kernel *vm86;
926
927 /*
928 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
929 * set up the vm86 area, and we can't enter vm86 mode.
930 */
931 if (td->td_pcb->pcb_ext == 0)
932 return (EINVAL);
933 vm86 = &td->td_pcb->pcb_ext->ext_vm86;
934 if (vm86->vm86_inited == 0)
935 return (EINVAL);
936
937 /* Go back to user mode if both flags are set. */
938 if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) {
939 ksiginfo_init_trap(&ksi);
940 ksi.ksi_signo = SIGBUS;
941 ksi.ksi_code = BUS_OBJERR;
942 ksi.ksi_addr = (void *)regs->tf_eip;
943 trapsignal(td, &ksi);
944 }
945 if (vm86->vm86_has_vme) {
946 eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
947 (eflags & VME_USERCHANGE) | PSL_VM;
948 } else {
949 vm86->vm86_eflags = eflags; /* save VIF, VIP */
950 eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
951 (eflags & VM_USERCHANGE) | PSL_VM;
952 }
953 bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe));
954 tf->tf_eflags = eflags;
955 tf->tf_vm86_ds = tf->tf_ds;
956 tf->tf_vm86_es = tf->tf_es;
957 tf->tf_vm86_fs = tf->tf_fs;
958 tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs;
959 tf->tf_ds = _udatasel;
960 tf->tf_es = _udatasel;
961 tf->tf_fs = _udatasel;
962 } else {
963 /*
964 * Don't allow users to change privileged or reserved flags.
965 */
966 /*
967 * XXX do allow users to change the privileged flag PSL_RF.
968 * The cpu sets PSL_RF in tf_eflags for faults. Debuggers
969 * should sometimes set it there too. tf_eflags is kept in
970 * the signal context during signal handling and there is no
971 * other place to remember it, so the PSL_RF bit may be
972 * corrupted by the signal handler without us knowing.
973 * Corruption of the PSL_RF bit at worst causes one more or
974 * one less debugger trap, so allowing it is fairly harmless.
975 */
976 if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) {
977 uprintf("pid %d (%s): freebsd4_sigreturn eflags = 0x%x\n",
978 td->td_proc->p_pid, td->td_name, eflags);
979 return (EINVAL);
980 }
981
982 /*
983 * Don't allow users to load a valid privileged %cs. Let the
984 * hardware check for invalid selectors, excess privilege in
985 * other selectors, invalid %eip's and invalid %esp's.
986 */
987 cs = ucp->uc_mcontext.mc_cs;
988 if (!CS_SECURE(cs)) {
989 uprintf("pid %d (%s): freebsd4_sigreturn cs = 0x%x\n",
990 td->td_proc->p_pid, td->td_name, cs);
991 ksiginfo_init_trap(&ksi);
992 ksi.ksi_signo = SIGBUS;
993 ksi.ksi_code = BUS_OBJERR;
994 ksi.ksi_trapno = T_PROTFLT;
995 ksi.ksi_addr = (void *)regs->tf_eip;
996 trapsignal(td, &ksi);
997 return (EINVAL);
998 }
999
1000 bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs));
1001 }
1002
1003#if defined(COMPAT_43)
1004 if (ucp->uc_mcontext.mc_onstack & 1)
1005 td->td_sigstk.ss_flags |= SS_ONSTACK;
1006 else
1007 td->td_sigstk.ss_flags &= ~SS_ONSTACK;
1008#endif
1009 kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
1010 return (EJUSTRETURN);
1011}
1012#endif /* COMPAT_FREEBSD4 */
1013
1014/*
1015 * MPSAFE
1016 */
1017int
1018sys_sigreturn(td, uap)
1019 struct thread *td;
1020 struct sigreturn_args /* {
1021 const struct __ucontext *sigcntxp;
1022 } */ *uap;
1023{
1024 ucontext_t uc;
1025 struct trapframe *regs;
1026 ucontext_t *ucp;
1027 int cs, eflags, error, ret;
1028 ksiginfo_t ksi;
1029
1030 error = copyin(uap->sigcntxp, &uc, sizeof(uc));
1031 if (error != 0)
1032 return (error);
1033 ucp = &uc;
1034 regs = td->td_frame;
1035 eflags = ucp->uc_mcontext.mc_eflags;
1036 if (eflags & PSL_VM) {
1037 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
1038 struct vm86_kernel *vm86;
1039
1040 /*
1041 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
1042 * set up the vm86 area, and we can't enter vm86 mode.
1043 */
1044 if (td->td_pcb->pcb_ext == 0)
1045 return (EINVAL);
1046 vm86 = &td->td_pcb->pcb_ext->ext_vm86;
1047 if (vm86->vm86_inited == 0)
1048 return (EINVAL);
1049
1050 /* Go back to user mode if both flags are set. */
1051 if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) {
1052 ksiginfo_init_trap(&ksi);
1053 ksi.ksi_signo = SIGBUS;
1054 ksi.ksi_code = BUS_OBJERR;
1055 ksi.ksi_addr = (void *)regs->tf_eip;
1056 trapsignal(td, &ksi);
1057 }
1058
1059 if (vm86->vm86_has_vme) {
1060 eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
1061 (eflags & VME_USERCHANGE) | PSL_VM;
1062 } else {
1063 vm86->vm86_eflags = eflags; /* save VIF, VIP */
1064 eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
1065 (eflags & VM_USERCHANGE) | PSL_VM;
1066 }
1067 bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe));
1068 tf->tf_eflags = eflags;
1069 tf->tf_vm86_ds = tf->tf_ds;
1070 tf->tf_vm86_es = tf->tf_es;
1071 tf->tf_vm86_fs = tf->tf_fs;
1072 tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs;
1073 tf->tf_ds = _udatasel;
1074 tf->tf_es = _udatasel;
1075 tf->tf_fs = _udatasel;
1076 } else {
1077 /*
1078 * Don't allow users to change privileged or reserved flags.
1079 */
1080 /*
1081 * XXX do allow users to change the privileged flag PSL_RF.
1082 * The cpu sets PSL_RF in tf_eflags for faults. Debuggers
1083 * should sometimes set it there too. tf_eflags is kept in
1084 * the signal context during signal handling and there is no
1085 * other place to remember it, so the PSL_RF bit may be
1086 * corrupted by the signal handler without us knowing.
1087 * Corruption of the PSL_RF bit at worst causes one more or
1088 * one less debugger trap, so allowing it is fairly harmless.
1089 */
1090 if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) {
1091 uprintf("pid %d (%s): sigreturn eflags = 0x%x\n",
1092 td->td_proc->p_pid, td->td_name, eflags);
1093 return (EINVAL);
1094 }
1095
1096 /*
1097 * Don't allow users to load a valid privileged %cs. Let the
1098 * hardware check for invalid selectors, excess privilege in
1099 * other selectors, invalid %eip's and invalid %esp's.
1100 */
1101 cs = ucp->uc_mcontext.mc_cs;
1102 if (!CS_SECURE(cs)) {
1103 uprintf("pid %d (%s): sigreturn cs = 0x%x\n",
1104 td->td_proc->p_pid, td->td_name, cs);
1105 ksiginfo_init_trap(&ksi);
1106 ksi.ksi_signo = SIGBUS;
1107 ksi.ksi_code = BUS_OBJERR;
1108 ksi.ksi_trapno = T_PROTFLT;
1109 ksi.ksi_addr = (void *)regs->tf_eip;
1110 trapsignal(td, &ksi);
1111 return (EINVAL);
1112 }
1113
1114 ret = set_fpcontext(td, &ucp->uc_mcontext);
1115 if (ret != 0)
1116 return (ret);
1117 bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs));
1118 }
1119
1120#if defined(COMPAT_43)
1121 if (ucp->uc_mcontext.mc_onstack & 1)
1122 td->td_sigstk.ss_flags |= SS_ONSTACK;
1123 else
1124 td->td_sigstk.ss_flags &= ~SS_ONSTACK;
1125#endif
1126
1127 kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
1128 return (EJUSTRETURN);
1129}
1130
1131/*
1132 * Machine dependent boot() routine
1133 *
1134 * I haven't seen anything to put here yet
1135 * Possibly some stuff might be grafted back here from boot()
1136 */
1137void
1138cpu_boot(int howto)
1139{
1140}
1141
1142/*
1143 * Flush the D-cache for non-DMA I/O so that the I-cache can
1144 * be made coherent later.
1145 */
1146void
1147cpu_flush_dcache(void *ptr, size_t len)
1148{
1149 /* Not applicable */
1150}
1151
1152/* Get current clock frequency for the given cpu id. */
1153int
1154cpu_est_clockrate(int cpu_id, uint64_t *rate)
1155{
1156 uint64_t tsc1, tsc2;
1157 uint64_t acnt, mcnt, perf;
1158 register_t reg;
1159
1160 if (pcpu_find(cpu_id) == NULL || rate == NULL)
1161 return (EINVAL);
1162 if ((cpu_feature & CPUID_TSC) == 0)
1163 return (EOPNOTSUPP);
1164
1165 /*
1166 * If TSC is P-state invariant and APERF/MPERF MSRs do not exist,
1167 * DELAY(9) based logic fails.
1168 */
1169 if (tsc_is_invariant && !tsc_perf_stat)
1170 return (EOPNOTSUPP);
1171
1172#ifdef SMP
1173 if (smp_cpus > 1) {
1174 /* Schedule ourselves on the indicated cpu. */
1175 thread_lock(curthread);
1176 sched_bind(curthread, cpu_id);
1177 thread_unlock(curthread);
1178 }
1179#endif
1180
1181 /* Calibrate by measuring a short delay. */
1182 reg = intr_disable();
1183 if (tsc_is_invariant) {
1184 wrmsr(MSR_MPERF, 0);
1185 wrmsr(MSR_APERF, 0);
1186 tsc1 = rdtsc();
1187 DELAY(1000);
1188 mcnt = rdmsr(MSR_MPERF);
1189 acnt = rdmsr(MSR_APERF);
1190 tsc2 = rdtsc();
1191 intr_restore(reg);
1192 perf = 1000 * acnt / mcnt;
1193 *rate = (tsc2 - tsc1) * perf;
1194 } else {
1195 tsc1 = rdtsc();
1196 DELAY(1000);
1197 tsc2 = rdtsc();
1198 intr_restore(reg);
1199 *rate = (tsc2 - tsc1) * 1000;
1200 }
1201
1202#ifdef SMP
1203 if (smp_cpus > 1) {
1204 thread_lock(curthread);
1205 sched_unbind(curthread);
1206 thread_unlock(curthread);
1207 }
1208#endif
1209
1210 return (0);
1211}
1212
1213#ifdef XEN
1214
1215void
1216cpu_halt(void)
1217{
1218 HYPERVISOR_shutdown(SHUTDOWN_poweroff);
1219}
1220
1221int scheduler_running;
1222
1223static void
1224cpu_idle_hlt(sbintime_t sbt)
1225{
1226
1227 scheduler_running = 1;
1228 enable_intr();
1229 idle_block();
1230}
1231
1232#else
1233/*
1234 * Shutdown the CPU as much as possible
1235 */
1236void
1237cpu_halt(void)
1238{
1239 for (;;)
1240 halt();
1241}
1242
1243#endif
1244
1245void (*cpu_idle_hook)(sbintime_t) = NULL; /* ACPI idle hook. */
1246static int cpu_ident_amdc1e = 0; /* AMD C1E supported. */
1247static int idle_mwait = 1; /* Use MONITOR/MWAIT for short idle. */
1248TUNABLE_INT("machdep.idle_mwait", &idle_mwait);
1249SYSCTL_INT(_machdep, OID_AUTO, idle_mwait, CTLFLAG_RW, &idle_mwait,
1250 0, "Use MONITOR/MWAIT for short idle");
1251
1252#define STATE_RUNNING 0x0
1253#define STATE_MWAIT 0x1
1254#define STATE_SLEEPING 0x2
1255
1256static void
1257cpu_idle_acpi(sbintime_t sbt)
1258{
1259 int *state;
1260
1261 state = (int *)PCPU_PTR(monitorbuf);
1262 *state = STATE_SLEEPING;
1263
1264 /* See comments in cpu_idle_hlt(). */
1265 disable_intr();
1266 if (sched_runnable())
1267 enable_intr();
1268 else if (cpu_idle_hook)
1269 cpu_idle_hook(sbt);
1270 else
1271 __asm __volatile("sti; hlt");
1272 *state = STATE_RUNNING;
1273}
1274
1275#ifndef XEN
1276static void
1277cpu_idle_hlt(sbintime_t sbt)
1278{
1279 int *state;
1280
1281 state = (int *)PCPU_PTR(monitorbuf);
1282 *state = STATE_SLEEPING;
1283
1284 /*
1285 * Since we may be in a critical section from cpu_idle(), if
1286 * an interrupt fires during that critical section we may have
1287 * a pending preemption. If the CPU halts, then that thread
1288 * may not execute until a later interrupt awakens the CPU.
1289 * To handle this race, check for a runnable thread after
1290 * disabling interrupts and immediately return if one is
 1291	 * found. Also, we must absolutely guarantee that hlt is
1292 * the next instruction after sti. This ensures that any
1293 * interrupt that fires after the call to disable_intr() will
1294 * immediately awaken the CPU from hlt. Finally, please note
 1295	 * that on x86 this works fine because interrupts are enabled only
 1296	 * after the instruction following sti takes place, while IF is set
 1297	 * to 1 immediately, allowing the hlt instruction to acknowledge the
 1298	 * interrupt.
1299 */
1300 disable_intr();
1301 if (sched_runnable())
1302 enable_intr();
1303 else
1304 __asm __volatile("sti; hlt");
1305 *state = STATE_RUNNING;
1306}
1307#endif
1308
1309/*
1310 * MWAIT cpu power states. Lower 4 bits are sub-states.
1311 */
1312#define MWAIT_C0 0xf0
1313#define MWAIT_C1 0x00
1314#define MWAIT_C2 0x10
1315#define MWAIT_C3 0x20
1316#define MWAIT_C4 0x30
1317
1318static void
1319cpu_idle_mwait(sbintime_t sbt)
1320{
1321 int *state;
1322
1323 state = (int *)PCPU_PTR(monitorbuf);
1324 *state = STATE_MWAIT;
1325
1326 /* See comments in cpu_idle_hlt(). */
1327 disable_intr();
1328 if (sched_runnable()) {
1329 enable_intr();
1330 *state = STATE_RUNNING;
1331 return;
1332 }
1333 cpu_monitor(state, 0, 0);
1334 if (*state == STATE_MWAIT)
1335 __asm __volatile("sti; mwait" : : "a" (MWAIT_C1), "c" (0));
1336 else
1337 enable_intr();
1338 *state = STATE_RUNNING;
1339}
1340
1341static void
1342cpu_idle_spin(sbintime_t sbt)
1343{
1344 int *state;
1345 int i;
1346
1347 state = (int *)PCPU_PTR(monitorbuf);
1348 *state = STATE_RUNNING;
1349
1350 /*
 1351	 * The sched_runnable() call is racy but, since this is a loop,
 1352	 * missing the check once will have only a small impact, if any
 1353	 * (and it is much better than not checking at all).
1354 */
1355 for (i = 0; i < 1000; i++) {
1356 if (sched_runnable())
1357 return;
1358 cpu_spinwait();
1359 }
1360}
1361
1362/*
1363 * C1E renders the local APIC timer dead, so we disable it by
1364 * reading the Interrupt Pending Message register and clearing
1365 * both C1eOnCmpHalt (bit 28) and SmiOnCmpHalt (bit 27).
1366 *
1367 * Reference:
1368 * "BIOS and Kernel Developer's Guide for AMD NPT Family 0Fh Processors"
1369 * #32559 revision 3.00+
1370 */
1371#define MSR_AMDK8_IPM 0xc0010055
1372#define AMDK8_SMIONCMPHALT (1ULL << 27)
1373#define AMDK8_C1EONCMPHALT (1ULL << 28)
1374#define AMDK8_CMPHALT (AMDK8_SMIONCMPHALT | AMDK8_C1EONCMPHALT)
1375
1376static void
1377cpu_probe_amdc1e(void)
1378{
1379
1380 /*
 1381	 * Detect the presence of C1E capability, found mostly on the
 1382	 * latest dual-core (and later) K8 family processors.
1383 */
1384 if (cpu_vendor_id == CPU_VENDOR_AMD &&
1385 (cpu_id & 0x00000f00) == 0x00000f00 &&
1386 (cpu_id & 0x0fff0000) >= 0x00040000) {
1387 cpu_ident_amdc1e = 1;
1388 }
1389}
1390
1391#ifdef XEN
1392void (*cpu_idle_fn)(sbintime_t) = cpu_idle_hlt;
1393#else
1394void (*cpu_idle_fn)(sbintime_t) = cpu_idle_acpi;
1395#endif
1396
1397void
1398cpu_idle(int busy)
1399{
1400#ifndef XEN
1401 uint64_t msr;
1402#endif
1403 sbintime_t sbt = -1;
1404
1405 CTR2(KTR_SPARE2, "cpu_idle(%d) at %d",
1406 busy, curcpu);
1407#if defined(MP_WATCHDOG) && !defined(XEN)
1408 ap_watchdog(PCPU_GET(cpuid));
1409#endif
1410#ifndef XEN
1411 /* If we are busy - try to use fast methods. */
1412 if (busy) {
1413 if ((cpu_feature2 & CPUID2_MON) && idle_mwait) {
1414 cpu_idle_mwait(busy);
1415 goto out;
1416 }
1417 }
1418#endif
1419
1420 /* If we have time - switch timers into idle mode. */
1421 if (!busy) {
1422 critical_enter();
1423 sbt = cpu_idleclock();
1424 }
1425
1426#ifndef XEN
1427 /* Apply AMD APIC timer C1E workaround. */
1428 if (cpu_ident_amdc1e && cpu_disable_deep_sleep) {
1429 msr = rdmsr(MSR_AMDK8_IPM);
1430 if (msr & AMDK8_CMPHALT)
1431 wrmsr(MSR_AMDK8_IPM, msr & ~AMDK8_CMPHALT);
1432 }
1433#endif
1434
1435 /* Call main idle method. */
1436 cpu_idle_fn(sbt);
1437
 1438	/* Switch timers back into active mode. */
1439 if (!busy) {
1440 cpu_activeclock();
1441 critical_exit();
1442 }
1443#ifndef XEN
1444out:
1445#endif
1446 CTR2(KTR_SPARE2, "cpu_idle(%d) at %d done",
1447 busy, curcpu);
1448}
1449
1450int
1451cpu_idle_wakeup(int cpu)
1452{
1453 struct pcpu *pcpu;
1454 int *state;
1455
1456 pcpu = pcpu_find(cpu);
1457 state = (int *)pcpu->pc_monitorbuf;
1458 /*
1459 * This doesn't need to be atomic since missing the race will
1460 * simply result in unnecessary IPIs.
1461 */
1462 if (*state == STATE_SLEEPING)
1463 return (0);
1464 if (*state == STATE_MWAIT)
1465 *state = STATE_RUNNING;
1466 return (1);
1467}
1468
1469/*
1470 * Ordered by speed/power consumption.
1471 */
1472struct {
1473 void *id_fn;
1474 char *id_name;
1475} idle_tbl[] = {
1476 { cpu_idle_spin, "spin" },
1477 { cpu_idle_mwait, "mwait" },
1478 { cpu_idle_hlt, "hlt" },
1479 { cpu_idle_acpi, "acpi" },
1480 { NULL, NULL }
1481};
1482
1483static int
1484idle_sysctl_available(SYSCTL_HANDLER_ARGS)
1485{
1486 char *avail, *p;
1487 int error;
1488 int i;
1489
1490 avail = malloc(256, M_TEMP, M_WAITOK);
1491 p = avail;
1492 for (i = 0; idle_tbl[i].id_name != NULL; i++) {
1493 if (strstr(idle_tbl[i].id_name, "mwait") &&
1494 (cpu_feature2 & CPUID2_MON) == 0)
1495 continue;
1496 if (strcmp(idle_tbl[i].id_name, "acpi") == 0 &&
1497 cpu_idle_hook == NULL)
1498 continue;
1499 p += sprintf(p, "%s%s", p != avail ? ", " : "",
1500 idle_tbl[i].id_name);
1501 }
1502 error = sysctl_handle_string(oidp, avail, 0, req);
1503 free(avail, M_TEMP);
1504 return (error);
1505}
1506
1507SYSCTL_PROC(_machdep, OID_AUTO, idle_available, CTLTYPE_STRING | CTLFLAG_RD,
1508 0, 0, idle_sysctl_available, "A", "list of available idle functions");
1509
1510static int
1511idle_sysctl(SYSCTL_HANDLER_ARGS)
1512{
1513 char buf[16];
1514 int error;
1515 char *p;
1516 int i;
1517
1518 p = "unknown";
1519 for (i = 0; idle_tbl[i].id_name != NULL; i++) {
1520 if (idle_tbl[i].id_fn == cpu_idle_fn) {
1521 p = idle_tbl[i].id_name;
1522 break;
1523 }
1524 }
1525 strncpy(buf, p, sizeof(buf));
1526 error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
1527 if (error != 0 || req->newptr == NULL)
1528 return (error);
1529 for (i = 0; idle_tbl[i].id_name != NULL; i++) {
1530 if (strstr(idle_tbl[i].id_name, "mwait") &&
1531 (cpu_feature2 & CPUID2_MON) == 0)
1532 continue;
1533 if (strcmp(idle_tbl[i].id_name, "acpi") == 0 &&
1534 cpu_idle_hook == NULL)
1535 continue;
1536 if (strcmp(idle_tbl[i].id_name, buf))
1537 continue;
1538 cpu_idle_fn = idle_tbl[i].id_fn;
1539 return (0);
1540 }
1541 return (EINVAL);
1542}
1543
1544SYSCTL_PROC(_machdep, OID_AUTO, idle, CTLTYPE_STRING | CTLFLAG_RW, 0, 0,
1545 idle_sysctl, "A", "currently selected idle function");
1546
1547uint64_t (*atomic_load_acq_64)(volatile uint64_t *) =
1548 atomic_load_acq_64_i386;
1549void (*atomic_store_rel_64)(volatile uint64_t *, uint64_t) =
1550 atomic_store_rel_64_i386;
1551
1552static void
1553cpu_probe_cmpxchg8b(void)
1554{
1555
1556 if ((cpu_feature & CPUID_CX8) != 0 ||
1557 cpu_vendor_id == CPU_VENDOR_RISE) {
1558 atomic_load_acq_64 = atomic_load_acq_64_i586;
1559 atomic_store_rel_64 = atomic_store_rel_64_i586;
1560 }
1561}
1562
1563/*
1564 * Reset registers to default values on exec.
1565 */
1566void
1567exec_setregs(struct thread *td, struct image_params *imgp, u_long stack)
1568{
1569 struct trapframe *regs = td->td_frame;
1570 struct pcb *pcb = td->td_pcb;
1571
 1572	/* Reset pcb->pcb_gs and %gs before possibly invalidating it. */
1573 pcb->pcb_gs = _udatasel;
1574 load_gs(_udatasel);
1575
1576 mtx_lock_spin(&dt_lock);
1577 if (td->td_proc->p_md.md_ldt)
1578 user_ldt_free(td);
1579 else
1580 mtx_unlock_spin(&dt_lock);
1581
1582 bzero((char *)regs, sizeof(struct trapframe));
1583 regs->tf_eip = imgp->entry_addr;
1584 regs->tf_esp = stack;
1585 regs->tf_eflags = PSL_USER | (regs->tf_eflags & PSL_T);
1586 regs->tf_ss = _udatasel;
1587 regs->tf_ds = _udatasel;
1588 regs->tf_es = _udatasel;
1589 regs->tf_fs = _udatasel;
1590 regs->tf_cs = _ucodesel;
1591
1592 /* PS_STRINGS value for BSD/OS binaries. It is 0 for non-BSD/OS. */
1593 regs->tf_ebx = imgp->ps_strings;
1594
1595 /*
1596 * Reset the hardware debug registers if they were in use.
1597 * They won't have any meaning for the newly exec'd process.
1598 */
1599 if (pcb->pcb_flags & PCB_DBREGS) {
1600 pcb->pcb_dr0 = 0;
1601 pcb->pcb_dr1 = 0;
1602 pcb->pcb_dr2 = 0;
1603 pcb->pcb_dr3 = 0;
1604 pcb->pcb_dr6 = 0;
1605 pcb->pcb_dr7 = 0;
1606 if (pcb == curpcb) {
1607 /*
1608 * Clear the debug registers on the running
1609 * CPU, otherwise they will end up affecting
1610 * the next process we switch to.
1611 */
1612 reset_dbregs();
1613 }
1614 pcb->pcb_flags &= ~PCB_DBREGS;
1615 }
1616
1617 /*
1618 * Initialize the math emulator (if any) for the current process.
1619 * Actually, just clear the bit that says that the emulator has
1620 * been initialized. Initialization is delayed until the process
1621 * traps to the emulator (if it is done at all) mainly because
1622 * emulators don't provide an entry point for initialization.
1623 */
1624 td->td_pcb->pcb_flags &= ~FP_SOFTFP;
1625 pcb->pcb_initial_npxcw = __INITIAL_NPXCW__;
1626
1627 /*
1628 * Drop the FP state if we hold it, so that the process gets a
1629 * clean FP state if it uses the FPU again.
1630 */
1631 fpstate_drop(td);
1632
1633 /*
1634 * XXX - Linux emulator
 1635	 * Make sure edx is 0x0 on entry. Linux binaries depend
1636 * on it.
1637 */
1638 td->td_retval[1] = 0;
1639}
1640
1641void
1642cpu_setregs(void)
1643{
1644 unsigned int cr0;
1645
1646 cr0 = rcr0();
1647
1648 /*
1649 * CR0_MP, CR0_NE and CR0_TS are set for NPX (FPU) support:
1650 *
1651 * Prepare to trap all ESC (i.e., NPX) instructions and all WAIT
1652 * instructions. We must set the CR0_MP bit and use the CR0_TS
1653 * bit to control the trap, because setting the CR0_EM bit does
1654 * not cause WAIT instructions to trap. It's important to trap
1655 * WAIT instructions - otherwise the "wait" variants of no-wait
1656 * control instructions would degenerate to the "no-wait" variants
1657 * after FP context switches but work correctly otherwise. It's
1658 * particularly important to trap WAITs when there is no NPX -
1659 * otherwise the "wait" variants would always degenerate.
1660 *
1661 * Try setting CR0_NE to get correct error reporting on 486DX's.
1662 * Setting it should fail or do nothing on lesser processors.
1663 */
1664 cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
1665 load_cr0(cr0);
1666 load_gs(_udatasel);
1667}
1668
1669u_long bootdev; /* not a struct cdev *- encoding is different */
1670SYSCTL_ULONG(_machdep, OID_AUTO, guessed_bootdev,
1671 CTLFLAG_RD, &bootdev, 0, "Maybe the Boot device (not in struct cdev *format)");
1672
1673/*
1674 * Initialize 386 and configure to run kernel
1675 */
1676
1677/*
1678 * Initialize segments & interrupt table
1679 */
1680
1681int _default_ldt;
1682
1683#ifdef XEN
1684union descriptor *gdt;
1685union descriptor *ldt;
1686#else
1687union descriptor gdt[NGDT * MAXCPU]; /* global descriptor table */
1688union descriptor ldt[NLDT]; /* local descriptor table */
1689#endif
1690static struct gate_descriptor idt0[NIDT];
1691struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */
1692struct region_descriptor r_gdt, r_idt; /* table descriptors */
1693struct mtx dt_lock; /* lock for GDT and LDT */
1694
1695#if defined(I586_CPU) && !defined(NO_F00F_HACK)
1696extern int has_f00f_bug;
1697#endif
1698
1699static struct i386tss dblfault_tss;
1700static char dblfault_stack[PAGE_SIZE];
1701
1702extern vm_offset_t proc0kstack;
1703
1704
1705/*
1706 * software prototypes -- in more palatable form.
1707 *
1708 * GCODE_SEL through GUDATA_SEL must be in this order for syscall/sysret
1709 * GUFS_SEL and GUGS_SEL must be in this order (swtch.s knows it)
1710 */
1711struct soft_segment_descriptor gdt_segs[] = {
1712/* GNULL_SEL 0 Null Descriptor */
1713{ .ssd_base = 0x0,
1714 .ssd_limit = 0x0,
1715 .ssd_type = 0,
1716 .ssd_dpl = SEL_KPL,
1717 .ssd_p = 0,
1718 .ssd_xx = 0, .ssd_xx1 = 0,
1719 .ssd_def32 = 0,
1720 .ssd_gran = 0 },
1721/* GPRIV_SEL 1 SMP Per-Processor Private Data Descriptor */
1722{ .ssd_base = 0x0,
1723 .ssd_limit = 0xfffff,
1724 .ssd_type = SDT_MEMRWA,
1725 .ssd_dpl = SEL_KPL,
1726 .ssd_p = 1,
1727 .ssd_xx = 0, .ssd_xx1 = 0,
1728 .ssd_def32 = 1,
1729 .ssd_gran = 1 },
1730/* GUFS_SEL 2 %fs Descriptor for user */
1731{ .ssd_base = 0x0,
1732 .ssd_limit = 0xfffff,
1733 .ssd_type = SDT_MEMRWA,
1734 .ssd_dpl = SEL_UPL,
1735 .ssd_p = 1,
1736 .ssd_xx = 0, .ssd_xx1 = 0,
1737 .ssd_def32 = 1,
1738 .ssd_gran = 1 },
1739/* GUGS_SEL 3 %gs Descriptor for user */
1740{ .ssd_base = 0x0,
1741 .ssd_limit = 0xfffff,
1742 .ssd_type = SDT_MEMRWA,
1743 .ssd_dpl = SEL_UPL,
1744 .ssd_p = 1,
1745 .ssd_xx = 0, .ssd_xx1 = 0,
1746 .ssd_def32 = 1,
1747 .ssd_gran = 1 },
1748/* GCODE_SEL 4 Code Descriptor for kernel */
1749{ .ssd_base = 0x0,
1750 .ssd_limit = 0xfffff,
1751 .ssd_type = SDT_MEMERA,
1752 .ssd_dpl = SEL_KPL,
1753 .ssd_p = 1,
1754 .ssd_xx = 0, .ssd_xx1 = 0,
1755 .ssd_def32 = 1,
1756 .ssd_gran = 1 },
1757/* GDATA_SEL 5 Data Descriptor for kernel */
1758{ .ssd_base = 0x0,
1759 .ssd_limit = 0xfffff,
1760 .ssd_type = SDT_MEMRWA,
1761 .ssd_dpl = SEL_KPL,
1762 .ssd_p = 1,
1763 .ssd_xx = 0, .ssd_xx1 = 0,
1764 .ssd_def32 = 1,
1765 .ssd_gran = 1 },
1766/* GUCODE_SEL 6 Code Descriptor for user */
1767{ .ssd_base = 0x0,
1768 .ssd_limit = 0xfffff,
1769 .ssd_type = SDT_MEMERA,
1770 .ssd_dpl = SEL_UPL,
1771 .ssd_p = 1,
1772 .ssd_xx = 0, .ssd_xx1 = 0,
1773 .ssd_def32 = 1,
1774 .ssd_gran = 1 },
1775/* GUDATA_SEL 7 Data Descriptor for user */
1776{ .ssd_base = 0x0,
1777 .ssd_limit = 0xfffff,
1778 .ssd_type = SDT_MEMRWA,
1779 .ssd_dpl = SEL_UPL,
1780 .ssd_p = 1,
1781 .ssd_xx = 0, .ssd_xx1 = 0,
1782 .ssd_def32 = 1,
1783 .ssd_gran = 1 },
1784/* GBIOSLOWMEM_SEL 8 BIOS access to realmode segment 0x40, must be #8 in GDT */
1785{ .ssd_base = 0x400,
1786 .ssd_limit = 0xfffff,
1787 .ssd_type = SDT_MEMRWA,
1788 .ssd_dpl = SEL_KPL,
1789 .ssd_p = 1,
1790 .ssd_xx = 0, .ssd_xx1 = 0,
1791 .ssd_def32 = 1,
1792 .ssd_gran = 1 },
1793#ifndef XEN
1794/* GPROC0_SEL 9 Proc 0 Tss Descriptor */
1795{
1796 .ssd_base = 0x0,
1797 .ssd_limit = sizeof(struct i386tss)-1,
1798 .ssd_type = SDT_SYS386TSS,
1799 .ssd_dpl = 0,
1800 .ssd_p = 1,
1801 .ssd_xx = 0, .ssd_xx1 = 0,
1802 .ssd_def32 = 0,
1803 .ssd_gran = 0 },
1804/* GLDT_SEL 10 LDT Descriptor */
1805{ .ssd_base = (int) ldt,
1806 .ssd_limit = sizeof(ldt)-1,
1807 .ssd_type = SDT_SYSLDT,
1808 .ssd_dpl = SEL_UPL,
1809 .ssd_p = 1,
1810 .ssd_xx = 0, .ssd_xx1 = 0,
1811 .ssd_def32 = 0,
1812 .ssd_gran = 0 },
1813/* GUSERLDT_SEL 11 User LDT Descriptor per process */
1814{ .ssd_base = (int) ldt,
1815 .ssd_limit = (512 * sizeof(union descriptor)-1),
1816 .ssd_type = SDT_SYSLDT,
1817 .ssd_dpl = 0,
1818 .ssd_p = 1,
1819 .ssd_xx = 0, .ssd_xx1 = 0,
1820 .ssd_def32 = 0,
1821 .ssd_gran = 0 },
1822/* GPANIC_SEL 12 Panic Tss Descriptor */
1823{ .ssd_base = (int) &dblfault_tss,
1824 .ssd_limit = sizeof(struct i386tss)-1,
1825 .ssd_type = SDT_SYS386TSS,
1826 .ssd_dpl = 0,
1827 .ssd_p = 1,
1828 .ssd_xx = 0, .ssd_xx1 = 0,
1829 .ssd_def32 = 0,
1830 .ssd_gran = 0 },
1831/* GBIOSCODE32_SEL 13 BIOS 32-bit interface (32bit Code) */
1832{ .ssd_base = 0,
1833 .ssd_limit = 0xfffff,
1834 .ssd_type = SDT_MEMERA,
1835 .ssd_dpl = 0,
1836 .ssd_p = 1,
1837 .ssd_xx = 0, .ssd_xx1 = 0,
1838 .ssd_def32 = 0,
1839 .ssd_gran = 1 },
1840/* GBIOSCODE16_SEL 14 BIOS 32-bit interface (16bit Code) */
1841{ .ssd_base = 0,
1842 .ssd_limit = 0xfffff,
1843 .ssd_type = SDT_MEMERA,
1844 .ssd_dpl = 0,
1845 .ssd_p = 1,
1846 .ssd_xx = 0, .ssd_xx1 = 0,
1847 .ssd_def32 = 0,
1848 .ssd_gran = 1 },
1849/* GBIOSDATA_SEL 15 BIOS 32-bit interface (Data) */
1850{ .ssd_base = 0,
1851 .ssd_limit = 0xfffff,
1852 .ssd_type = SDT_MEMRWA,
1853 .ssd_dpl = 0,
1854 .ssd_p = 1,
1855 .ssd_xx = 0, .ssd_xx1 = 0,
1856 .ssd_def32 = 1,
1857 .ssd_gran = 1 },
1858/* GBIOSUTIL_SEL 16 BIOS 16-bit interface (Utility) */
1859{ .ssd_base = 0,
1860 .ssd_limit = 0xfffff,
1861 .ssd_type = SDT_MEMRWA,
1862 .ssd_dpl = 0,
1863 .ssd_p = 1,
1864 .ssd_xx = 0, .ssd_xx1 = 0,
1865 .ssd_def32 = 0,
1866 .ssd_gran = 1 },
1867/* GBIOSARGS_SEL 17 BIOS 16-bit interface (Arguments) */
1868{ .ssd_base = 0,
1869 .ssd_limit = 0xfffff,
1870 .ssd_type = SDT_MEMRWA,
1871 .ssd_dpl = 0,
1872 .ssd_p = 1,
1873 .ssd_xx = 0, .ssd_xx1 = 0,
1874 .ssd_def32 = 0,
1875 .ssd_gran = 1 },
1876/* GNDIS_SEL 18 NDIS Descriptor */
1877{ .ssd_base = 0x0,
1878 .ssd_limit = 0x0,
1879 .ssd_type = 0,
1880 .ssd_dpl = 0,
1881 .ssd_p = 0,
1882 .ssd_xx = 0, .ssd_xx1 = 0,
1883 .ssd_def32 = 0,
1884 .ssd_gran = 0 },
1885#endif /* !XEN */
1886};
1887
1888static struct soft_segment_descriptor ldt_segs[] = {
1889 /* Null Descriptor - overwritten by call gate */
1890{ .ssd_base = 0x0,
1891 .ssd_limit = 0x0,
1892 .ssd_type = 0,
1893 .ssd_dpl = 0,
1894 .ssd_p = 0,
1895 .ssd_xx = 0, .ssd_xx1 = 0,
1896 .ssd_def32 = 0,
1897 .ssd_gran = 0 },
1898 /* Null Descriptor - overwritten by call gate */
1899{ .ssd_base = 0x0,
1900 .ssd_limit = 0x0,
1901 .ssd_type = 0,
1902 .ssd_dpl = 0,
1903 .ssd_p = 0,
1904 .ssd_xx = 0, .ssd_xx1 = 0,
1905 .ssd_def32 = 0,
1906 .ssd_gran = 0 },
1907 /* Null Descriptor - overwritten by call gate */
1908{ .ssd_base = 0x0,
1909 .ssd_limit = 0x0,
1910 .ssd_type = 0,
1911 .ssd_dpl = 0,
1912 .ssd_p = 0,
1913 .ssd_xx = 0, .ssd_xx1 = 0,
1914 .ssd_def32 = 0,
1915 .ssd_gran = 0 },
1916 /* Code Descriptor for user */
1917{ .ssd_base = 0x0,
1918 .ssd_limit = 0xfffff,
1919 .ssd_type = SDT_MEMERA,
1920 .ssd_dpl = SEL_UPL,
1921 .ssd_p = 1,
1922 .ssd_xx = 0, .ssd_xx1 = 0,
1923 .ssd_def32 = 1,
1924 .ssd_gran = 1 },
1925 /* Null Descriptor - overwritten by call gate */
1926{ .ssd_base = 0x0,
1927 .ssd_limit = 0x0,
1928 .ssd_type = 0,
1929 .ssd_dpl = 0,
1930 .ssd_p = 0,
1931 .ssd_xx = 0, .ssd_xx1 = 0,
1932 .ssd_def32 = 0,
1933 .ssd_gran = 0 },
1934 /* Data Descriptor for user */
1935{ .ssd_base = 0x0,
1936 .ssd_limit = 0xfffff,
1937 .ssd_type = SDT_MEMRWA,
1938 .ssd_dpl = SEL_UPL,
1939 .ssd_p = 1,
1940 .ssd_xx = 0, .ssd_xx1 = 0,
1941 .ssd_def32 = 1,
1942 .ssd_gran = 1 },
1943};
1944
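/*
 * setidt() fills in one IDT gate descriptor: the 32-bit handler address
 * is split between the low and high 16-bit offset fields (the layout the
 * i386 hardware requires), and the selector, gate type, privilege level
 * and present bit are taken from the remaining arguments.
 */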
1945void
1946setidt(idx, func, typ, dpl, selec)
1947 int idx;
1948 inthand_t *func;
1949 int typ;
1950 int dpl;
1951 int selec;
1952{
1953 struct gate_descriptor *ip;
1954
1955 ip = idt + idx;
1956 ip->gd_looffset = (int)func;
1957 ip->gd_selector = selec;
1958 ip->gd_stkcpy = 0;
1959 ip->gd_xx = 0;
1960 ip->gd_type = typ;
1961 ip->gd_dpl = dpl;
1962 ip->gd_p = 1;
1963 ip->gd_hioffset = ((int)func)>>16 ;
1964}
1965
1966extern inthand_t
1967 IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
1968 IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
1969 IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
1970 IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
1971 IDTVEC(xmm),
1972#ifdef KDTRACE_HOOKS
1973 IDTVEC(dtrace_ret),
1974#endif
1975 IDTVEC(lcall_syscall), IDTVEC(int0x80_syscall);
1976
1977#ifdef DDB
1978/*
1979 * Display the index and function name of any IDT entries that don't use
1980 * the default 'rsvd' entry point.
1981 */
1982DB_SHOW_COMMAND(idt, db_show_idt)
1983{
1984 struct gate_descriptor *ip;
1985 int idx;
1986 uintptr_t func;
1987
1988 ip = idt;
1989 for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
1990 func = (ip->gd_hioffset << 16 | ip->gd_looffset);
1991 if (func != (uintptr_t)&IDTVEC(rsvd)) {
1992 db_printf("%3d\t", idx);
1993 db_printsym(func, DB_STGY_PROC);
1994 db_printf("\n");
1995 }
1996 ip++;
1997 }
1998}
1999
2000/* Show privileged registers. */
2001DB_SHOW_COMMAND(sysregs, db_show_sysregs)
2002{
2003 uint64_t idtr, gdtr;
2004
2005 idtr = ridt();
2006 db_printf("idtr\t0x%08x/%04x\n",
2007 (u_int)(idtr >> 16), (u_int)idtr & 0xffff);
2008 gdtr = rgdt();
2009 db_printf("gdtr\t0x%08x/%04x\n",
2010 (u_int)(gdtr >> 16), (u_int)gdtr & 0xffff);
2011 db_printf("ldtr\t0x%04x\n", rldt());
2012 db_printf("tr\t0x%04x\n", rtr());
2013 db_printf("cr0\t0x%08x\n", rcr0());
2014 db_printf("cr2\t0x%08x\n", rcr2());
2015 db_printf("cr3\t0x%08x\n", rcr3());
2016 db_printf("cr4\t0x%08x\n", rcr4());
2017}
2018#endif
2019
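/*
 * Convert a hardware segment descriptor into its software form: the base
 * and limit, which the hardware splits across low and high fields, are
 * reassembled into single values.
 */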
2020void
2021sdtossd(sd, ssd)
2022 struct segment_descriptor *sd;
2023 struct soft_segment_descriptor *ssd;
2024{
2025 ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase;
2026 ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
2027 ssd->ssd_type = sd->sd_type;
2028 ssd->ssd_dpl = sd->sd_dpl;
2029 ssd->ssd_p = sd->sd_p;
2030 ssd->ssd_def32 = sd->sd_def32;
2031 ssd->ssd_gran = sd->sd_gran;
2032}
2033
2034#ifndef XEN
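/*
 * Add one BIOS SMAP entry to physmap[], which stores base/end pairs at
 * even/odd indices.  Entries are kept sorted and merged with an adjacent
 * range when they abut; a return of 0 (array full) tells the caller to
 * stop scanning the SMAP.
 */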
2035static int
2036add_smap_entry(struct bios_smap *smap, vm_paddr_t *physmap, int *physmap_idxp)
2037{
2038 int i, insert_idx, physmap_idx;
2039
2040 physmap_idx = *physmap_idxp;
2041
2042 if (boothowto & RB_VERBOSE)
2043 printf("SMAP type=%02x base=%016llx len=%016llx\n",
2044 smap->type, smap->base, smap->length);
2045
2046 if (smap->type != SMAP_TYPE_MEMORY)
2047 return (1);
2048
2049 if (smap->length == 0)
2050 return (1);
2051
2052#ifndef PAE
2053 if (smap->base > 0xffffffff) {
2054 printf("%uK of memory above 4GB ignored\n",
2055 (u_int)(smap->length / 1024));
2056 return (1);
2057 }
2058#endif
2059
2060 /*
2061 * Find insertion point while checking for overlap. Start off by
2062 * assuming the new entry will be added to the end.
2063 */
2064 insert_idx = physmap_idx + 2;
2065 for (i = 0; i <= physmap_idx; i += 2) {
2066 if (smap->base < physmap[i + 1]) {
2067 if (smap->base + smap->length <= physmap[i]) {
2068 insert_idx = i;
2069 break;
2070 }
2071 if (boothowto & RB_VERBOSE)
2072 printf(
2073 "Overlapping memory regions, ignoring second region\n");
2074 return (1);
2075 }
2076 }
2077
2078 /* See if we can prepend to the next entry. */
2079 if (insert_idx <= physmap_idx &&
2080 smap->base + smap->length == physmap[insert_idx]) {
2081 physmap[insert_idx] = smap->base;
2082 return (1);
2083 }
2084
2085 /* See if we can append to the previous entry. */
2086 if (insert_idx > 0 && smap->base == physmap[insert_idx - 1]) {
2087 physmap[insert_idx - 1] += smap->length;
2088 return (1);
2089 }
2090
2091 physmap_idx += 2;
2092 *physmap_idxp = physmap_idx;
2093 if (physmap_idx == PHYSMAP_SIZE) {
2094 printf(
2095 "Too many segments in the physical address map, giving up\n");
2096 return (0);
2097 }
2098
2099 /*
2100 * Move the last 'N' entries down to make room for the new
2101 * entry if needed.
2102 */
2103 for (i = physmap_idx; i > insert_idx; i -= 2) {
2104 physmap[i] = physmap[i - 2];
2105 physmap[i + 1] = physmap[i - 1];
2106 }
2107
2108 /* Insert the new entry. */
2109 physmap[insert_idx] = smap->base;
2110 physmap[insert_idx + 1] = smap->base + smap->length;
2111 return (1);
2112}
2113
2114static void
2115basemem_setup(void)
2116{
2117 vm_paddr_t pa;
2118 pt_entry_t *pte;
2119 int i;
2120
2121 if (basemem > 640) {
2122 printf("Preposterous BIOS basemem of %uK, truncating to 640K\n",
2123 basemem);
2124 basemem = 640;
2125 }
2126
2127 /*
2128 * XXX if biosbasemem is now < 640, there is a `hole'
2129 * between the end of base memory and the start of
2130 * ISA memory. The hole may be empty or it may
2131 * contain BIOS code or data. Map it read/write so
2132 * that the BIOS can write to it. (Memory from 0 to
2133 * the physical end of the kernel is mapped read-only
2134 * to begin with and then parts of it are remapped.
2135 * The parts that aren't remapped form holes that
2136 * remain read-only and are unused by the kernel.
2137 * The base memory area is below the physical end of
2138 * the kernel and right now forms a read-only hole.
2139 * The part of it from PAGE_SIZE to
2140 * (trunc_page(biosbasemem * 1024) - 1) will be
2141 * remapped and used by the kernel later.)
2142 *
2143 * This code is similar to the code used in
2144 * pmap_mapdev, but since no memory needs to be
2145 * allocated we simply change the mapping.
2146 */
2147 for (pa = trunc_page(basemem * 1024);
2148 pa < ISA_HOLE_START; pa += PAGE_SIZE)
2149 pmap_kenter(KERNBASE + pa, pa);
2150
2151 /*
2152 * Map pages between basemem and ISA_HOLE_START, if any, r/w into
2153 * the vm86 page table so that vm86 can scribble on them using
2154 * the vm86 map too. XXX: why 2 ways for this and only 1 way for
2155 * page 0, at least as initialized here?
2156 */
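	/*
	 * basemem is in KB, so basemem / 4 is the first page index above
	 * base memory; 160 pages of 4KB == 0xa0000 == ISA_HOLE_START.
	 */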
2157 pte = (pt_entry_t *)vm86paddr;
2158 for (i = basemem / 4; i < 160; i++)
2159 pte[i] = (i << PAGE_SHIFT) | PG_V | PG_RW | PG_U;
2160}
2161#endif
2162
2163/*
2164 * Populate the (physmap) array with base/bound pairs describing the
2165 * available physical memory in the system, then test this memory and
2166 * build the phys_avail array describing the actually-available memory.
2167 *
2168 * If we cannot accurately determine the physical memory map, then use
2169 * value from the 0xE801 call, and failing that, the RTC.
2170 *
2171 * Total memory size may be set by the kernel environment variable
2172 * hw.physmem or the compile-time define MAXMEM.
2173 *
2174 * XXX first should be vm_paddr_t.
2175 */
2176static void
2177getmemsize(int first)
2178{
2179 int has_smap, off, physmap_idx, pa_indx, da_indx;
2180 u_long physmem_tunable, memtest;
2181 vm_paddr_t physmap[PHYSMAP_SIZE];
2182 pt_entry_t *pte;
2183 quad_t dcons_addr, dcons_size;
2184#ifndef XEN
2185 int hasbrokenint12, i, res;
2186 u_int extmem;
2187 struct vm86frame vmf;
2188 struct vm86context vmc;
2189 vm_paddr_t pa;
2190 struct bios_smap *smap, *smapbase, *smapend;
2191 u_int32_t smapsize;
2192 caddr_t kmdp;
2193#endif
2194
2195 has_smap = 0;
2196#if defined(XEN)
2197 Maxmem = xen_start_info->nr_pages - init_first;
2198 physmem = Maxmem;
2199 basemem = 0;
2200 physmap[0] = init_first << PAGE_SHIFT;
2201 physmap[1] = ptoa(Maxmem) - round_page(msgbufsize);
2202 physmap_idx = 0;
2203#else
2204#ifdef XBOX
2205 if (arch_i386_is_xbox) {
2206 /*
2207 * We queried the memory size before, so chop off 4MB for
2208 * the framebuffer and inform the OS of this.
2209 */
2210 physmap[0] = 0;
2211 physmap[1] = (arch_i386_xbox_memsize * 1024 * 1024) - XBOX_FB_SIZE;
2212 physmap_idx = 0;
2213 goto physmap_done;
2214 }
2215#endif
2216 bzero(&vmf, sizeof(vmf));
2217 bzero(physmap, sizeof(physmap));
2218 basemem = 0;
2219
2220 /*
2221 * Check if the loader supplied an SMAP memory map. If so,
2222 * use that and do not make any VM86 calls.
2223 */
2224 physmap_idx = 0;
2225 smapbase = NULL;
2226 kmdp = preload_search_by_type("elf kernel");
2227 if (kmdp == NULL)
2228 kmdp = preload_search_by_type("elf32 kernel");
2229 if (kmdp != NULL)
2230 smapbase = (struct bios_smap *)preload_search_info(kmdp,
2231 MODINFO_METADATA | MODINFOMD_SMAP);
2232 if (smapbase != NULL) {
2233 /*
2234 * subr_module.c says:
2235 * "Consumer may safely assume that size value precedes data."
2236	 * i.e., an int32_t immediately precedes SMAP.
2237 */
2238 smapsize = *((u_int32_t *)smapbase - 1);
2239 smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
2240 has_smap = 1;
2241
2242 for (smap = smapbase; smap < smapend; smap++)
2243 if (!add_smap_entry(smap, physmap, &physmap_idx))
2244 break;
2245 goto have_smap;
2246 }
2247
2248 /*
2249 * Some newer BIOSes have a broken INT 12H implementation
2250 * which causes a kernel panic immediately. In this case, we
2251	 * need to use the SMAP to determine the base memory size.
2252 */
2253 hasbrokenint12 = 0;
2254 TUNABLE_INT_FETCH("hw.hasbrokenint12", &hasbrokenint12);
2255 if (hasbrokenint12 == 0) {
2256 /* Use INT12 to determine base memory size. */
2257 vm86_intcall(0x12, &vmf);
2258 basemem = vmf.vmf_ax;
2259 basemem_setup();
2260 }
2261
2262 /*
2263 * Fetch the memory map with INT 15:E820. Map page 1 R/W into
2264 * the kernel page table so we can use it as a buffer. The
2265 * kernel will unmap this page later.
2266 */
2267 pmap_kenter(KERNBASE + (1 << PAGE_SHIFT), 1 << PAGE_SHIFT);
2268 vmc.npages = 0;
2269 smap = (void *)vm86_addpage(&vmc, 1, KERNBASE + (1 << PAGE_SHIFT));
2270 res = vm86_getptr(&vmc, (vm_offset_t)smap, &vmf.vmf_es, &vmf.vmf_di);
2271 KASSERT(res != 0, ("vm86_getptr() failed: address not found"));
2272
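	/*
	 * BIOS INT 15h/E820 is called repeatedly: %ebx carries a
	 * continuation value between calls (0 on the first call), %eax must
	 * return the 'SMAP' signature, and %ebx coming back as 0 marks the
	 * last descriptor.
	 */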
2273 vmf.vmf_ebx = 0;
2274 do {
2275 vmf.vmf_eax = 0xE820;
2276 vmf.vmf_edx = SMAP_SIG;
2277 vmf.vmf_ecx = sizeof(struct bios_smap);
2278 i = vm86_datacall(0x15, &vmf, &vmc);
2279 if (i || vmf.vmf_eax != SMAP_SIG)
2280 break;
2281 has_smap = 1;
2282 if (!add_smap_entry(smap, physmap, &physmap_idx))
2283 break;
2284 } while (vmf.vmf_ebx != 0);
2285
2286have_smap:
2287 /*
2288 * If we didn't fetch the "base memory" size from INT12,
2289 * figure it out from the SMAP (or just guess).
2290 */
2291 if (basemem == 0) {
2292 for (i = 0; i <= physmap_idx; i += 2) {
2293 if (physmap[i] == 0x00000000) {
2294 basemem = physmap[i + 1] / 1024;
2295 break;
2296 }
2297 }
2298
2299 /* XXX: If we couldn't find basemem from SMAP, just guess. */
2300 if (basemem == 0)
2301 basemem = 640;
2302 basemem_setup();
2303 }
2304
2305 if (physmap[1] != 0)
2306 goto physmap_done;
2307
2308 /*
2309 * If we failed to find an SMAP, figure out the extended
2310 * memory size. We will then build a simple memory map with
2311 * two segments, one for "base memory" and the second for
2312 * "extended memory". Note that "extended memory" starts at a
2313 * physical address of 1MB and that both basemem and extmem
2314 * are in units of 1KB.
2315 *
2316 * First, try to fetch the extended memory size via INT 15:E801.
2317 */
2318 vmf.vmf_ax = 0xE801;
2319 if (vm86_intcall(0x15, &vmf) == 0) {
2320 extmem = vmf.vmf_cx + vmf.vmf_dx * 64;
2321 } else {
2322 /*
2323 * If INT15:E801 fails, this is our last ditch effort
2324 * to determine the extended memory size. Currently
2325 * we prefer the RTC value over INT15:88.
2326 */
2327#if 0
2328 vmf.vmf_ah = 0x88;
2329 vm86_intcall(0x15, &vmf);
2330 extmem = vmf.vmf_ax;
2331#else
2332 extmem = rtcin(RTC_EXTLO) + (rtcin(RTC_EXTHI) << 8);
2333#endif
2334 }
2335
2336 /*
2337 * Special hack for chipsets that still remap the 384k hole when
2338 * there's 16MB of memory - this really confuses people that
2339 * are trying to use bus mastering ISA controllers with the
2340 * "16MB limit"; they only have 16MB, but the remapping puts
2341 * them beyond the limit.
2342 *
2343 * If extended memory is between 15-16MB (16-17MB phys address range),
2344 * chop it to 15MB.
2345 */
2346 if ((extmem > 15 * 1024) && (extmem < 16 * 1024))
2347 extmem = 15 * 1024;
2348
2349 physmap[0] = 0;
2350 physmap[1] = basemem * 1024;
2351 physmap_idx = 2;
2352 physmap[physmap_idx] = 0x100000;
2353 physmap[physmap_idx + 1] = physmap[physmap_idx] + extmem * 1024;
2354
2355physmap_done:
2356#endif
2357 /*
2358 * Now, physmap contains a map of physical memory.
2359 */
2360
2361#ifdef SMP
2362 /* make hole for AP bootstrap code */
2363 physmap[1] = mp_bootaddress(physmap[1]);
2364#endif
2365
2366 /*
2367 * Maxmem isn't the "maximum memory", it's one larger than the
2368 * highest page of the physical address space. It should be
2369 * called something like "Maxphyspage". We may adjust this
2370 * based on ``hw.physmem'' and the results of the memory test.
2371 */
2372 Maxmem = atop(physmap[physmap_idx + 1]);
2373
2374#ifdef MAXMEM
2375 Maxmem = MAXMEM / 4;
2376#endif
2377
2378 if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
2379 Maxmem = atop(physmem_tunable);
2380
2381 /*
2382 * If we have an SMAP, don't allow MAXMEM or hw.physmem to extend
2383 * the amount of memory in the system.
2384 */
2385 if (has_smap && Maxmem > atop(physmap[physmap_idx + 1]))
2386 Maxmem = atop(physmap[physmap_idx + 1]);
2387
2388 /*
2389 * By default enable the memory test on real hardware, and disable
2390 * it if we appear to be running in a VM. This avoids touching all
2391 * pages unnecessarily, which doesn't matter on real hardware but is
2392 * bad for shared VM hosts. Use a general name so that
2393 * one could eventually do more with the code than just disable it.
2394 */
2395 memtest = (vm_guest > VM_GUEST_NO) ? 0 : 1;
2396 TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);
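	/* A non-zero value enables the per-page pattern test run below. */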
2397
2398 if (atop(physmap[physmap_idx + 1]) != Maxmem &&
2399 (boothowto & RB_VERBOSE))
2400 printf("Physical memory use set to %ldK\n", Maxmem * 4);
2401
2402 /*
2403 * If Maxmem has been increased beyond what the system has detected,
2404 * extend the last memory segment to the new limit.
2405 */
2406 if (atop(physmap[physmap_idx + 1]) < Maxmem)
2407 physmap[physmap_idx + 1] = ptoa((vm_paddr_t)Maxmem);
2408
2409 /* call pmap initialization to make new kernel address space */
2410 pmap_bootstrap(first);
2411
2412 /*
2413 * Size up each available chunk of physical memory.
2414 */
2415 physmap[0] = PAGE_SIZE; /* mask off page 0 */
2416 pa_indx = 0;
2417 da_indx = 1;
2418 phys_avail[pa_indx++] = physmap[0];
2419 phys_avail[pa_indx] = physmap[0];
2420 dump_avail[da_indx] = physmap[0];
2421 pte = CMAP1;
2422
2423 /*
2424 * Get dcons buffer address
2425 */
2426 if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
2427 getenv_quad("dcons.size", &dcons_size) == 0)
2428 dcons_addr = 0;
2429
2430#ifndef XEN
2431 /*
2432 * physmap is in bytes, so when converting to page boundaries,
2433 * round up the start address and round down the end address.
2434 */
2435 for (i = 0; i <= physmap_idx; i += 2) {
2436 vm_paddr_t end;
2437
2438 end = ptoa((vm_paddr_t)Maxmem);
2439 if (physmap[i + 1] < end)
2440 end = trunc_page(physmap[i + 1]);
2441 for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
2442 int tmp, page_bad, full;
2443 int *ptr = (int *)CADDR1;
2444
2445 full = FALSE;
2446 /*
2447 * block out kernel memory as not available.
2448 */
2449 if (pa >= KERNLOAD && pa < first)
2450 goto do_dump_avail;
2451
2452 /*
2453 * block out dcons buffer
2454 */
2455 if (dcons_addr > 0
2456 && pa >= trunc_page(dcons_addr)
2457 && pa < dcons_addr + dcons_size)
2458 goto do_dump_avail;
2459
2460 page_bad = FALSE;
2461 if (memtest == 0)
2462 goto skip_memtest;
2463
2464 /*
2465			 * map page into kernel: valid, read/write, non-cacheable
2466 */
2467 *pte = pa | PG_V | PG_RW | PG_N;
2468 invltlb();
2469
2470 tmp = *(int *)ptr;
2471 /*
2472 * Test for alternating 1's and 0's
2473 */
2474 *(volatile int *)ptr = 0xaaaaaaaa;
2475 if (*(volatile int *)ptr != 0xaaaaaaaa)
2476 page_bad = TRUE;
2477 /*
2478 * Test for alternating 0's and 1's
2479 */
2480 *(volatile int *)ptr = 0x55555555;
2481 if (*(volatile int *)ptr != 0x55555555)
2482 page_bad = TRUE;
2483 /*
2484 * Test for all 1's
2485 */
2486 *(volatile int *)ptr = 0xffffffff;
2487 if (*(volatile int *)ptr != 0xffffffff)
2488 page_bad = TRUE;
2489 /*
2490 * Test for all 0's
2491 */
2492 *(volatile int *)ptr = 0x0;
2493 if (*(volatile int *)ptr != 0x0)
2494 page_bad = TRUE;
2495 /*
2496 * Restore original value.
2497 */
2498 *(int *)ptr = tmp;
2499
2500skip_memtest:
2501 /*
2502 * Adjust array of valid/good pages.
2503 */
2504 if (page_bad == TRUE)
2505 continue;
2506 /*
2507 * If this good page is a continuation of the
2508 * previous set of good pages, then just increase
2509 * the end pointer. Otherwise start a new chunk.
2510			 * Note that the recorded "end" points one past the
2511			 * last byte, making the range >= start and < end.
2512			 * If we're also doing a speculative memory
2513			 * test and we're at or past the end, bump up Maxmem
2514			 * so that we keep going. The first bad page
2515 * will terminate the loop.
2516 */
2517 if (phys_avail[pa_indx] == pa) {
2518 phys_avail[pa_indx] += PAGE_SIZE;
2519 } else {
2520 pa_indx++;
2521 if (pa_indx == PHYS_AVAIL_ARRAY_END) {
2522 printf(
2523 "Too many holes in the physical address space, giving up\n");
2524 pa_indx--;
2525 full = TRUE;
2526 goto do_dump_avail;
2527 }
2528 phys_avail[pa_indx++] = pa; /* start */
2529 phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
2530 }
2531 physmem++;
2532do_dump_avail:
2533 if (dump_avail[da_indx] == pa) {
2534 dump_avail[da_indx] += PAGE_SIZE;
2535 } else {
2536 da_indx++;
2537 if (da_indx == DUMP_AVAIL_ARRAY_END) {
2538 da_indx--;
2539 goto do_next;
2540 }
2541 dump_avail[da_indx++] = pa; /* start */
2542 dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
2543 }
2544do_next:
2545 if (full)
2546 break;
2547 }
2548 }
2549 *pte = 0;
2550 invltlb();
2551#else
2552 phys_avail[0] = physfree;
2553 phys_avail[1] = xen_start_info->nr_pages*PAGE_SIZE;
2554 dump_avail[0] = 0;
2555 dump_avail[1] = xen_start_info->nr_pages*PAGE_SIZE;
2556
2557#endif
2558
2559 /*
2560 * XXX
2561 * The last chunk must contain at least one page plus the message
2562 * buffer to avoid complicating other code (message buffer address
2563 * calculation, etc.).
2564 */
2565 while (phys_avail[pa_indx - 1] + PAGE_SIZE +
2566 round_page(msgbufsize) >= phys_avail[pa_indx]) {
2567 physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
2568 phys_avail[pa_indx--] = 0;
2569 phys_avail[pa_indx--] = 0;
2570 }
2571
2572 Maxmem = atop(phys_avail[pa_indx]);
2573
2574 /* Trim off space for the message buffer. */
2575 phys_avail[pa_indx] -= round_page(msgbufsize);
2576
2577 /* Map the message buffer. */
2578 for (off = 0; off < round_page(msgbufsize); off += PAGE_SIZE)
2579 pmap_kenter((vm_offset_t)msgbufp + off, phys_avail[pa_indx] +
2580 off);
2581
2582 PT_UPDATES_FLUSH();
2583}
2584
2585#ifdef XEN
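/*
 * 1 << 26 bytes == 64MB (2^14 pages of 4KB); used below to extend the
 * GDT segment limits up to HYPERVISOR_VIRT_START + MTOPSIZE.
 */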
2586#define MTOPSIZE (1<<(14 + PAGE_SHIFT))
2587
2588void
2589init386(first)
2590 int first;
2591{
2592 unsigned long gdtmachpfn;
2593 int error, gsel_tss, metadata_missing, x, pa;
2594 size_t kstack0_sz;
2595 struct pcpu *pc;
2596 struct callback_register event = {
2597 .type = CALLBACKTYPE_event,
2598 .address = {GSEL(GCODE_SEL, SEL_KPL), (unsigned long)Xhypervisor_callback },
2599 };
2600 struct callback_register failsafe = {
2601 .type = CALLBACKTYPE_failsafe,
2602 .address = {GSEL(GCODE_SEL, SEL_KPL), (unsigned long)failsafe_callback },
2603 };
2604
2605 thread0.td_kstack = proc0kstack;
2606 thread0.td_kstack_pages = KSTACK_PAGES;
2607 kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
2608 thread0.td_pcb = (struct pcb *)(thread0.td_kstack + kstack0_sz) - 1;
2609
2610 /*
2611 * This may be done better later if it gets more high level
2612 * components in it. If so just link td->td_proc here.
2613 */
2614 proc_linkup0(&proc0, &thread0);
2615
2616 metadata_missing = 0;
2617 if (xen_start_info->mod_start) {
2618 preload_metadata = (caddr_t)xen_start_info->mod_start;
2619 preload_bootstrap_relocate(KERNBASE);
2620 } else {
2621 metadata_missing = 1;
2622 }
2623 if (envmode == 1)
2624 kern_envp = static_env;
2625 else if ((caddr_t)xen_start_info->cmd_line)
2626 kern_envp = xen_setbootenv((caddr_t)xen_start_info->cmd_line);
2627
2628 boothowto |= xen_boothowto(kern_envp);
2629
2630 /* Init basic tunables, hz etc */
2631 init_param1();
2632
2633 /*
2634	 * XEN occupies a portion of the upper virtual address space.
2635	 * At its base it manages an array mapping machine page frames
2636	 * to physical page frames - hence we need to be able to
2637	 * access 4GB - (64MB - 4MB + 64k).
2638 */
2639 gdt_segs[GPRIV_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
2640 gdt_segs[GUFS_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
2641 gdt_segs[GUGS_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
2642 gdt_segs[GCODE_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
2643 gdt_segs[GDATA_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
2644 gdt_segs[GUCODE_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
2645 gdt_segs[GUDATA_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
2646 gdt_segs[GBIOSLOWMEM_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
2647
2648 pc = &__pcpu[0];
2649 gdt_segs[GPRIV_SEL].ssd_base = (int) pc;
2650 gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss;
2651
2652 PT_SET_MA(gdt, xpmap_ptom(VTOP(gdt)) | PG_V | PG_RW);
2653 bzero(gdt, PAGE_SIZE);
2654 for (x = 0; x < NGDT; x++)
2655 ssdtosd(&gdt_segs[x], &gdt[x].sd);
2656
2657 mtx_init(&dt_lock, "descriptor tables", NULL, MTX_SPIN);
2658
2659 gdtmachpfn = vtomach(gdt) >> PAGE_SHIFT;
2660 PT_SET_MA(gdt, xpmap_ptom(VTOP(gdt)) | PG_V);
2661 PANIC_IF(HYPERVISOR_set_gdt(&gdtmachpfn, 512) != 0);
2662 lgdt(&r_gdt);
2663 gdtset = 1;
2664
2665 if ((error = HYPERVISOR_set_trap_table(trap_table)) != 0) {
2666 panic("set_trap_table failed - error %d\n", error);
2667 }
2668
2669 error = HYPERVISOR_callback_op(CALLBACKOP_register, &event);
2670 if (error == 0)
2671 error = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe);
2672#if CONFIG_XEN_COMPAT <= 0x030002
2673 if (error == -ENOXENSYS)
2674 HYPERVISOR_set_callbacks(GSEL(GCODE_SEL, SEL_KPL),
2675 (unsigned long)Xhypervisor_callback,
2676 GSEL(GCODE_SEL, SEL_KPL), (unsigned long)failsafe_callback);
2677#endif
2678 pcpu_init(pc, 0, sizeof(struct pcpu));
2679 for (pa = first; pa < first + DPCPU_SIZE; pa += PAGE_SIZE)
2680 pmap_kenter(pa + KERNBASE, pa);
2681 dpcpu_init((void *)(first + KERNBASE), 0);
2682 first += DPCPU_SIZE;
2683 physfree += DPCPU_SIZE;
2684 init_first += DPCPU_SIZE / PAGE_SIZE;
2685
2686 PCPU_SET(prvspace, pc);
2687 PCPU_SET(curthread, &thread0);
2688 PCPU_SET(curpcb, thread0.td_pcb);
2689
2690 /*
2691 * Initialize mutexes.
2692 *
2693 * icu_lock: in order to allow an interrupt to occur in a critical
2694 * section, to set pcpu->ipending (etc...) properly, we
2695 * must be able to get the icu lock, so it can't be
2696 * under witness.
2697 */
2698 mutex_init();
2699 mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS | MTX_NOPROFILE);
2700
2701 /* make ldt memory segments */
2702 PT_SET_MA(ldt, xpmap_ptom(VTOP(ldt)) | PG_V | PG_RW);
2703 bzero(ldt, PAGE_SIZE);
2704 ldt_segs[LUCODE_SEL].ssd_limit = atop(0 - 1);
2705 ldt_segs[LUDATA_SEL].ssd_limit = atop(0 - 1);
2706 for (x = 0; x < sizeof ldt_segs / sizeof ldt_segs[0]; x++)
2707 ssdtosd(&ldt_segs[x], &ldt[x].sd);
2708
2709 default_proc_ldt.ldt_base = (caddr_t)ldt;
2710 default_proc_ldt.ldt_len = 6;
2711 _default_ldt = (int)&default_proc_ldt;
2712 PCPU_SET(currentldt, _default_ldt);
2713 PT_SET_MA(ldt, *vtopte((unsigned long)ldt) & ~PG_RW);
2714 xen_set_ldt((unsigned long) ldt, (sizeof ldt_segs / sizeof ldt_segs[0]));
2715
2716#if defined(XEN_PRIVILEGED)
2717 /*
2718 * Initialize the i8254 before the console so that console
2719 * initialization can use DELAY().
2720 */
2721 i8254_init();
2722#endif
2723
2724 /*
2725 * Initialize the console before we print anything out.
2726 */
2727 cninit();
2728
2729 if (metadata_missing)
2730 printf("WARNING: loader(8) metadata is missing!\n");
2731
2732#ifdef DEV_ISA
2733#ifdef DEV_ATPIC
2734 elcr_probe();
2735 atpic_startup();
2736#else
2737 /* Reset and mask the atpics and leave them shut down. */
2738 atpic_reset();
2739
2740 /*
2741 * Point the ICU spurious interrupt vectors at the APIC spurious
2742 * interrupt handler.
2743 */
2744 setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL,
2745 GSEL(GCODE_SEL, SEL_KPL));
2746 setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL,
2747 GSEL(GCODE_SEL, SEL_KPL));
2748#endif
2749#endif
2750
2751#ifdef DDB
2752 ksym_start = bootinfo.bi_symtab;
2753 ksym_end = bootinfo.bi_esymtab;
2754#endif
2755
2756 kdb_init();
2757
2758#ifdef KDB
2759 if (boothowto & RB_KDB)
2760 kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
2761#endif
2762
2763 finishidentcpu(); /* Final stage of CPU initialization */
2764 setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL,
2765 GSEL(GCODE_SEL, SEL_KPL));
2766 setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL,
2767 GSEL(GCODE_SEL, SEL_KPL));
2768 initializecpu(); /* Initialize CPU registers */
2769
2770 /* make an initial tss so cpu can get interrupt stack on syscall! */
2771 /* Note: -16 is so we can grow the trapframe if we came from vm86 */
2772 PCPU_SET(common_tss.tss_esp0, thread0.td_kstack +
2773 kstack0_sz - sizeof(struct pcb) - 16);
2774 PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL));
2775 gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
2776 HYPERVISOR_stack_switch(GSEL(GDATA_SEL, SEL_KPL),
2777 PCPU_GET(common_tss.tss_esp0));
2778
2779 /* pointer to selector slot for %fs/%gs */
2780 PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd);
2781
2782 dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 =
2783 dblfault_tss.tss_esp2 = (int)&dblfault_stack[sizeof(dblfault_stack)];
2784 dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 =
2785 dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL);
2786#ifdef PAE
2787 dblfault_tss.tss_cr3 = (int)IdlePDPT;
2788#else
2789 dblfault_tss.tss_cr3 = (int)IdlePTD;
2790#endif
2791 dblfault_tss.tss_eip = (int)dblfault_handler;
2792 dblfault_tss.tss_eflags = PSL_KERNEL;
2793 dblfault_tss.tss_ds = dblfault_tss.tss_es =
2794 dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL);
2795 dblfault_tss.tss_fs = GSEL(GPRIV_SEL, SEL_KPL);
2796 dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL);
2797 dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL);
2798
2799 vm86_initialize();
2800 getmemsize(first);
2801 init_param2(physmem);
2802
2803	/* now running on new page tables, configured, and u/iom is accessible */
2804
2805 msgbufinit(msgbufp, msgbufsize);
2806 /* transfer to user mode */
2807
2808 _ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
2809 _udatasel = GSEL(GUDATA_SEL, SEL_UPL);
2810
2811 /* setup proc 0's pcb */
2812 thread0.td_pcb->pcb_flags = 0;
2813#ifdef PAE
2814 thread0.td_pcb->pcb_cr3 = (int)IdlePDPT;
2815#else
2816 thread0.td_pcb->pcb_cr3 = (int)IdlePTD;
2817#endif
2818 thread0.td_pcb->pcb_ext = 0;
2819 thread0.td_frame = &proc0_tf;
2820 thread0.td_pcb->pcb_fsd = PCPU_GET(fsgs_gdt)[0];
2821 thread0.td_pcb->pcb_gsd = PCPU_GET(fsgs_gdt)[1];
2822
2823 cpu_probe_amdc1e();
2824 cpu_probe_cmpxchg8b();
2825}
2826
2827#else
2828void
2829init386(first)
2830 int first;
2831{
2832 struct gate_descriptor *gdp;
2833 int gsel_tss, metadata_missing, x, pa;
2834 size_t kstack0_sz;
2835 struct pcpu *pc;
2836
2837 thread0.td_kstack = proc0kstack;
2838 thread0.td_kstack_pages = KSTACK_PAGES;
2839 kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
2840 thread0.td_pcb = (struct pcb *)(thread0.td_kstack + kstack0_sz) - 1;
2841
2842 /*
2843 * This may be done better later if it gets more high level
2844 * components in it. If so just link td->td_proc here.
2845 */
2846 proc_linkup0(&proc0, &thread0);
2847
2848 metadata_missing = 0;
2849 if (bootinfo.bi_modulep) {
2850 preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE;
2851 preload_bootstrap_relocate(KERNBASE);
2852 } else {
2853 metadata_missing = 1;
2854 }
2855 if (envmode == 1)
2856 kern_envp = static_env;
2857 else if (bootinfo.bi_envp)
2858 kern_envp = (caddr_t)bootinfo.bi_envp + KERNBASE;
2859
2860 /* Init basic tunables, hz etc */
2861 init_param1();
2862
2863 /*
2864 * Make gdt memory segments. All segments cover the full 4GB
2865 * of address space and permissions are enforced at page level.
2866 */
2867 gdt_segs[GCODE_SEL].ssd_limit = atop(0 - 1);
2868 gdt_segs[GDATA_SEL].ssd_limit = atop(0 - 1);
2869 gdt_segs[GUCODE_SEL].ssd_limit = atop(0 - 1);
2870 gdt_segs[GUDATA_SEL].ssd_limit = atop(0 - 1);
2871 gdt_segs[GUFS_SEL].ssd_limit = atop(0 - 1);
2872 gdt_segs[GUGS_SEL].ssd_limit = atop(0 - 1);
2873
2874 pc = &__pcpu[0];
2875 gdt_segs[GPRIV_SEL].ssd_limit = atop(0 - 1);
2876 gdt_segs[GPRIV_SEL].ssd_base = (int) pc;
2877 gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss;
2878
2879 for (x = 0; x < NGDT; x++)
2880 ssdtosd(&gdt_segs[x], &gdt[x].sd);
2881
2882 r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
2883 r_gdt.rd_base = (int) gdt;
2884 mtx_init(&dt_lock, "descriptor tables", NULL, MTX_SPIN);
2885 lgdt(&r_gdt);
2886
2887 pcpu_init(pc, 0, sizeof(struct pcpu));
2888 for (pa = first; pa < first + DPCPU_SIZE; pa += PAGE_SIZE)
2889 pmap_kenter(pa + KERNBASE, pa);
2890 dpcpu_init((void *)(first + KERNBASE), 0);
2891 first += DPCPU_SIZE;
2892 PCPU_SET(prvspace, pc);
2893 PCPU_SET(curthread, &thread0);
2894 PCPU_SET(curpcb, thread0.td_pcb);
2895
2896 /*
2897 * Initialize mutexes.
2898 *
2899 * icu_lock: in order to allow an interrupt to occur in a critical
2900 * section, to set pcpu->ipending (etc...) properly, we
2901 * must be able to get the icu lock, so it can't be
2902 * under witness.
2903 */
2904 mutex_init();
2905 mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS | MTX_NOPROFILE);
2906
2907 /* make ldt memory segments */
2908 ldt_segs[LUCODE_SEL].ssd_limit = atop(0 - 1);
2909 ldt_segs[LUDATA_SEL].ssd_limit = atop(0 - 1);
2910 for (x = 0; x < sizeof ldt_segs / sizeof ldt_segs[0]; x++)
2911 ssdtosd(&ldt_segs[x], &ldt[x].sd);
2912
2913 _default_ldt = GSEL(GLDT_SEL, SEL_KPL);
2914 lldt(_default_ldt);
2915 PCPU_SET(currentldt, _default_ldt);
2916
2917 /* exceptions */
2918 for (x = 0; x < NIDT; x++)
2919 setidt(x, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL,
2920 GSEL(GCODE_SEL, SEL_KPL));
2921 setidt(IDT_DE, &IDTVEC(div), SDT_SYS386TGT, SEL_KPL,
2922 GSEL(GCODE_SEL, SEL_KPL));
2923 setidt(IDT_DB, &IDTVEC(dbg), SDT_SYS386IGT, SEL_KPL,
2924 GSEL(GCODE_SEL, SEL_KPL));
2925 setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYS386IGT, SEL_KPL,
2926 GSEL(GCODE_SEL, SEL_KPL));
2927 setidt(IDT_BP, &IDTVEC(bpt), SDT_SYS386IGT, SEL_UPL,
2928 GSEL(GCODE_SEL, SEL_KPL));
2929 setidt(IDT_OF, &IDTVEC(ofl), SDT_SYS386TGT, SEL_UPL,
2930 GSEL(GCODE_SEL, SEL_KPL));
2931 setidt(IDT_BR, &IDTVEC(bnd), SDT_SYS386TGT, SEL_KPL,
2932 GSEL(GCODE_SEL, SEL_KPL));
2933 setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL,
2934 GSEL(GCODE_SEL, SEL_KPL));
2935 	setidt(IDT_NM, &IDTVEC(dna), SDT_SYS386TGT, SEL_KPL,
2936 	    GSEL(GCODE_SEL, SEL_KPL));
2937 setidt(IDT_DF, 0, SDT_SYSTASKGT, SEL_KPL, GSEL(GPANIC_SEL, SEL_KPL));
2938 setidt(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYS386TGT, SEL_KPL,
2939 GSEL(GCODE_SEL, SEL_KPL));
2940 setidt(IDT_TS, &IDTVEC(tss), SDT_SYS386TGT, SEL_KPL,
2941 GSEL(GCODE_SEL, SEL_KPL));
2942 setidt(IDT_NP, &IDTVEC(missing), SDT_SYS386TGT, SEL_KPL,
2943 GSEL(GCODE_SEL, SEL_KPL));
2944 setidt(IDT_SS, &IDTVEC(stk), SDT_SYS386TGT, SEL_KPL,
2945 GSEL(GCODE_SEL, SEL_KPL));
2946 setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL,
2947 GSEL(GCODE_SEL, SEL_KPL));
2948 setidt(IDT_PF, &IDTVEC(page), SDT_SYS386IGT, SEL_KPL,
2949 GSEL(GCODE_SEL, SEL_KPL));
2950 setidt(IDT_MF, &IDTVEC(fpu), SDT_SYS386TGT, SEL_KPL,
2951 GSEL(GCODE_SEL, SEL_KPL));
2952 setidt(IDT_AC, &IDTVEC(align), SDT_SYS386TGT, SEL_KPL,
2953 GSEL(GCODE_SEL, SEL_KPL));
2954 setidt(IDT_MC, &IDTVEC(mchk), SDT_SYS386TGT, SEL_KPL,
2955 GSEL(GCODE_SEL, SEL_KPL));
2956 setidt(IDT_XF, &IDTVEC(xmm), SDT_SYS386TGT, SEL_KPL,
2957 GSEL(GCODE_SEL, SEL_KPL));
2958 setidt(IDT_SYSCALL, &IDTVEC(int0x80_syscall), SDT_SYS386TGT, SEL_UPL,
2959 GSEL(GCODE_SEL, SEL_KPL));
2960#ifdef KDTRACE_HOOKS
2961 setidt(IDT_DTRACE_RET, &IDTVEC(dtrace_ret), SDT_SYS386TGT, SEL_UPL,
2962 GSEL(GCODE_SEL, SEL_KPL));
2963#endif
2964
2965 r_idt.rd_limit = sizeof(idt0) - 1;
2966 r_idt.rd_base = (int) idt;
2967 lidt(&r_idt);
2968
2969#ifdef XBOX
2970 /*
2971 * The following code queries the PCI ID of 0:0:0. For the XBOX,
2972	 * this should be 0x10de / 0x02a5.
2973 *
2974 * This is exactly what Linux does.
2975 */
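	/*
	 * 0xcf8/0xcfc are the standard PCI configuration address and data
	 * ports; 0x80000000 selects bus 0, device 0, function 0, register 0,
	 * i.e. the vendor/device ID dword read back through 0xcfc.
	 */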
2976 outl(0xcf8, 0x80000000);
2977 if (inl(0xcfc) == 0x02a510de) {
2978 arch_i386_is_xbox = 1;
2979 pic16l_setled(XBOX_LED_GREEN);
2980
2981 /*
2982 * We are an XBOX, but we may have either 64MB or 128MB of
2983 * memory. The PCI host bridge should be programmed for this,
2984 * so we just query it.
2985 */
2986 outl(0xcf8, 0x80000084);
2987 arch_i386_xbox_memsize = (inl(0xcfc) == 0x7FFFFFF) ? 128 : 64;
2988 }
2989#endif /* XBOX */
2990
2991 /*
2992 * Initialize the i8254 before the console so that console
2993 * initialization can use DELAY().
2994 */
2995 i8254_init();
2996
2997 /*
2998 * Initialize the console before we print anything out.
2999 */
3000 cninit();
3001
3002 if (metadata_missing)
3003 printf("WARNING: loader(8) metadata is missing!\n");
3004
3005#ifdef DEV_ISA
3006#ifdef DEV_ATPIC
3007 elcr_probe();
3008 atpic_startup();
3009#else
3010 /* Reset and mask the atpics and leave them shut down. */
3011 atpic_reset();
3012
3013 /*
3014 * Point the ICU spurious interrupt vectors at the APIC spurious
3015 * interrupt handler.
3016 */
3017 setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL,
3018 GSEL(GCODE_SEL, SEL_KPL));
3019 setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL,
3020 GSEL(GCODE_SEL, SEL_KPL));
3021#endif
3022#endif
3023
3024#ifdef DDB
3025 ksym_start = bootinfo.bi_symtab;
3026 ksym_end = bootinfo.bi_esymtab;
3027#endif
3028
3029 kdb_init();
3030
3031#ifdef KDB
3032 if (boothowto & RB_KDB)
3033 kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
3034#endif
3035
3036 finishidentcpu(); /* Final stage of CPU initialization */
3037 setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL,
3038 GSEL(GCODE_SEL, SEL_KPL));
3039 setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL,
3040 GSEL(GCODE_SEL, SEL_KPL));
3041 initializecpu(); /* Initialize CPU registers */
3042
3043 /* make an initial tss so cpu can get interrupt stack on syscall! */
3044 /* Note: -16 is so we can grow the trapframe if we came from vm86 */
3045 PCPU_SET(common_tss.tss_esp0, thread0.td_kstack +
3046 kstack0_sz - sizeof(struct pcb) - 16);
3047 PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL));
3048 gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
3049 PCPU_SET(tss_gdt, &gdt[GPROC0_SEL].sd);
3050 PCPU_SET(common_tssd, *PCPU_GET(tss_gdt));
3051 PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16);
3052 ltr(gsel_tss);
3053
3054 /* pointer to selector slot for %fs/%gs */
3055 PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd);
3056
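	/*
	 * The double fault handler runs through a task gate (GPANIC_SEL,
	 * installed at IDT_DF above), so the CPU switches to this static
	 * TSS and stack even when the faulting context's stack is unusable.
	 */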
3057 dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 =
3058 dblfault_tss.tss_esp2 = (int)&dblfault_stack[sizeof(dblfault_stack)];
3059 dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 =
3060 dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL);
3061#ifdef PAE
3062 dblfault_tss.tss_cr3 = (int)IdlePDPT;
3063#else
3064 dblfault_tss.tss_cr3 = (int)IdlePTD;
3065#endif
3066 dblfault_tss.tss_eip = (int)dblfault_handler;
3067 dblfault_tss.tss_eflags = PSL_KERNEL;
3068 dblfault_tss.tss_ds = dblfault_tss.tss_es =
3069 dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL);
3070 dblfault_tss.tss_fs = GSEL(GPRIV_SEL, SEL_KPL);
3071 dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL);
3072 dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL);
3073
3074 vm86_initialize();
3075 getmemsize(first);
3076 init_param2(physmem);
3077
3078	/* now running on new page tables, configured, and u/iom is accessible */
3079
3080 msgbufinit(msgbufp, msgbufsize);
3081
3082 /* make a call gate to reenter kernel with */
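	/*
	 * LSYS5CALLS_SEL becomes a 386 call gate aimed at the lcall syscall
	 * entry point: gd_stkcpy = 1 makes the CPU copy one longword of
	 * arguments from the user stack to the kernel stack, and the user
	 * DPL lets unprivileged code call through the gate.
	 */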
3083 gdp = &ldt[LSYS5CALLS_SEL].gd;
3084
3085 x = (int) &IDTVEC(lcall_syscall);
3086 gdp->gd_looffset = x;
3087 gdp->gd_selector = GSEL(GCODE_SEL,SEL_KPL);
3088 gdp->gd_stkcpy = 1;
3089 gdp->gd_type = SDT_SYS386CGT;
3090 gdp->gd_dpl = SEL_UPL;
3091 gdp->gd_p = 1;
3092 gdp->gd_hioffset = x >> 16;
3093
3094 /* XXX does this work? */
3095 /* XXX yes! */
3096 ldt[LBSDICALLS_SEL] = ldt[LSYS5CALLS_SEL];
3097 ldt[LSOL26CALLS_SEL] = ldt[LSYS5CALLS_SEL];
3098
3099 /* transfer to user mode */
3100
3101 _ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
3102 _udatasel = GSEL(GUDATA_SEL, SEL_UPL);
3103
3104 /* setup proc 0's pcb */
3105 thread0.td_pcb->pcb_flags = 0;
3106#ifdef PAE
3107 thread0.td_pcb->pcb_cr3 = (int)IdlePDPT;
3108#else
3109 thread0.td_pcb->pcb_cr3 = (int)IdlePTD;
3110#endif
3111 thread0.td_pcb->pcb_ext = 0;
3112 thread0.td_frame = &proc0_tf;
3113
3114 cpu_probe_amdc1e();
3115 cpu_probe_cmpxchg8b();
145
146#ifdef DEV_APIC
147#include <machine/apicvar.h>
148#endif
149
150#ifdef DEV_ISA
151#include <x86/isa/icu.h>
152#endif
153
154#ifdef XBOX
155#include <machine/xbox.h>
156
157int arch_i386_is_xbox = 0;
158uint32_t arch_i386_xbox_memsize = 0;
159#endif
160
161#ifdef XEN
162/* XEN includes */
163#include <machine/xen/xen-os.h>
164#include <xen/hypervisor.h>
165#include <machine/xen/xen-os.h>
166#include <machine/xen/xenvar.h>
167#include <machine/xen/xenfunc.h>
168#include <xen/xen_intr.h>
169
170void Xhypervisor_callback(void);
171void failsafe_callback(void);
172
173extern trap_info_t trap_table[];
174struct proc_ldt default_proc_ldt;
175extern int init_first;
176int running_xen = 1;
177extern unsigned long physfree;
178#endif /* XEN */
179
180/* Sanity check for __curthread() */
181CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
182
183extern void init386(int first);
184extern void dblfault_handler(void);
185
186extern void printcpuinfo(void); /* XXX header file */
187extern void finishidentcpu(void);
188extern void panicifcpuunsupported(void);
189
190#define CS_SECURE(cs) (ISPL(cs) == SEL_UPL)
191#define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
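/*
 * CS_SECURE() checks that a code selector has user privilege and
 * EFL_SECURE() that only user-changeable EFLAGS bits differ from the old
 * value; both guard register state handed in from user mode (e.g. the
 * sigreturn() paths).
 */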
192
193#if !defined(CPU_DISABLE_SSE) && defined(I686_CPU)
194#define CPU_ENABLE_SSE
195#endif
196
197static void cpu_startup(void *);
198static void fpstate_drop(struct thread *td);
199static void get_fpcontext(struct thread *td, mcontext_t *mcp);
200static int set_fpcontext(struct thread *td, const mcontext_t *mcp);
201#ifdef CPU_ENABLE_SSE
202static void set_fpregs_xmm(struct save87 *, struct savexmm *);
203static void fill_fpregs_xmm(struct savexmm *, struct save87 *);
204#endif /* CPU_ENABLE_SSE */
205SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
206
207#ifdef DDB
208extern vm_offset_t ksym_start, ksym_end;
209#endif
210
211/* Intel ICH registers */
212#define ICH_PMBASE 0x400
213#define ICH_SMI_EN (ICH_PMBASE + 0x30)
214
215int _udatasel, _ucodesel;
216u_int basemem;
217
218int cold = 1;
219
220#ifdef COMPAT_43
221static void osendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask);
222#endif
223#ifdef COMPAT_FREEBSD4
224static void freebsd4_sendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask);
225#endif
226
227long Maxmem = 0;
228long realmem = 0;
229
230#ifdef PAE
231FEATURE(pae, "Physical Address Extensions");
232#endif
233
234/*
235 * The number of PHYSMAP entries must be one less than the number of
236 * PHYSSEG entries because the PHYSMAP entry that spans the largest
237 * physical address that is accessible by ISA DMA is split into two
238 * PHYSSEG entries.
239 */
240#define PHYSMAP_SIZE (2 * (VM_PHYSSEG_MAX - 1))
241
242vm_paddr_t phys_avail[PHYSMAP_SIZE + 2];
243vm_paddr_t dump_avail[PHYSMAP_SIZE + 2];
244
245/* must be 2 less so 0 0 can signal end of chunks */
246#define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(phys_avail[0])) - 2)
247#define DUMP_AVAIL_ARRAY_END ((sizeof(dump_avail) / sizeof(dump_avail[0])) - 2)
248
249struct kva_md_info kmi;
250
251static struct trapframe proc0_tf;
252struct pcpu __pcpu[MAXCPU];
253
254struct mtx icu_lock;
255
256struct mem_range_softc mem_range_softc;
257
258static void
259cpu_startup(dummy)
260 void *dummy;
261{
262 uintmax_t memsize;
263 char *sysenv;
264
265 /*
266	 * On MacBooks, we need to prevent the legacy USB circuit from
267	 * generating an SMI# because this can cause several problems,
268 * namely: incorrect CPU frequency detection and failure to
269 * start the APs.
270 * We do this by disabling a bit in the SMI_EN (SMI Control and
271 * Enable register) of the Intel ICH LPC Interface Bridge.
272 */
273 sysenv = getenv("smbios.system.product");
274 if (sysenv != NULL) {
275 if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
276 strncmp(sysenv, "MacBook3,1", 10) == 0 ||
277 strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
278 strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
279 strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
280 strncmp(sysenv, "Macmini1,1", 10) == 0) {
281 if (bootverbose)
282 printf("Disabling LEGACY_USB_EN bit on "
283 "Intel ICH.\n");
284 outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
285 }
286 freeenv(sysenv);
287 }
288
289 /*
290 * Good {morning,afternoon,evening,night}.
291 */
292 startrtclock();
293 printcpuinfo();
294 panicifcpuunsupported();
295#ifdef PERFMON
296 perfmon_init();
297#endif
298 realmem = Maxmem;
299
300 /*
301	 * Display physical memory if SMBIOS reports a reasonable amount.
302 */
303 memsize = 0;
304 sysenv = getenv("smbios.memory.enabled");
305 if (sysenv != NULL) {
306 memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
307 freeenv(sysenv);
308 }
309 if (memsize < ptoa((uintmax_t)cnt.v_free_count))
310 memsize = ptoa((uintmax_t)Maxmem);
311 printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20);
312
313 /*
314 * Display any holes after the first chunk of extended memory.
315 */
316 if (bootverbose) {
317 int indx;
318
319 printf("Physical memory chunk(s):\n");
320 for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
321 vm_paddr_t size;
322
323 size = phys_avail[indx + 1] - phys_avail[indx];
324 printf(
325 "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
326 (uintmax_t)phys_avail[indx],
327 (uintmax_t)phys_avail[indx + 1] - 1,
328 (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
329 }
330 }
331
332 vm_ksubmap_init(&kmi);
333
334 printf("avail memory = %ju (%ju MB)\n",
335 ptoa((uintmax_t)cnt.v_free_count),
336 ptoa((uintmax_t)cnt.v_free_count) / 1048576);
337
338 /*
339 * Set up buffers, so they can be used to read disk labels.
340 */
341 bufinit();
342 vm_pager_bufferinit();
343#ifndef XEN
344 cpu_setregs();
345#endif
346}
347
348/*
350 * Send an interrupt to a process.
350 *
351 * Stack is set up to allow sigcode stored
352 * at top to call routine, followed by kcall
353 * to sigreturn routine below. After sigreturn
354 * resets the signal mask, the stack, and the
355 * frame pointer, it returns to the user
356 * specified pc, psl.
357 */
358#ifdef COMPAT_43
359static void
360osendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
361{
362 struct osigframe sf, *fp;
363 struct proc *p;
364 struct thread *td;
365 struct sigacts *psp;
366 struct trapframe *regs;
367 int sig;
368 int oonstack;
369
370 td = curthread;
371 p = td->td_proc;
372 PROC_LOCK_ASSERT(p, MA_OWNED);
373 sig = ksi->ksi_signo;
374 psp = p->p_sigacts;
375 mtx_assert(&psp->ps_mtx, MA_OWNED);
376 regs = td->td_frame;
377 oonstack = sigonstack(regs->tf_esp);
378
379 /* Allocate space for the signal handler context. */
380 if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
381 SIGISMEMBER(psp->ps_sigonstack, sig)) {
382 fp = (struct osigframe *)(td->td_sigstk.ss_sp +
383 td->td_sigstk.ss_size - sizeof(struct osigframe));
384#if defined(COMPAT_43)
385 td->td_sigstk.ss_flags |= SS_ONSTACK;
386#endif
387 } else
388 fp = (struct osigframe *)regs->tf_esp - 1;
389
390 /* Translate the signal if appropriate. */
391 if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize)
392 sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
393
394 /* Build the argument list for the signal handler. */
395 sf.sf_signum = sig;
396 sf.sf_scp = (register_t)&fp->sf_siginfo.si_sc;
397 bzero(&sf.sf_siginfo, sizeof(sf.sf_siginfo));
398 if (SIGISMEMBER(psp->ps_siginfo, sig)) {
399 /* Signal handler installed with SA_SIGINFO. */
400 sf.sf_arg2 = (register_t)&fp->sf_siginfo;
401 sf.sf_siginfo.si_signo = sig;
402 sf.sf_siginfo.si_code = ksi->ksi_code;
403 sf.sf_ahu.sf_action = (__osiginfohandler_t *)catcher;
404 sf.sf_addr = 0;
405 } else {
406 /* Old FreeBSD-style arguments. */
407 sf.sf_arg2 = ksi->ksi_code;
408 sf.sf_addr = (register_t)ksi->ksi_addr;
409 sf.sf_ahu.sf_handler = catcher;
410 }
411 mtx_unlock(&psp->ps_mtx);
412 PROC_UNLOCK(p);
413
414 /* Save most if not all of trap frame. */
415 sf.sf_siginfo.si_sc.sc_eax = regs->tf_eax;
416 sf.sf_siginfo.si_sc.sc_ebx = regs->tf_ebx;
417 sf.sf_siginfo.si_sc.sc_ecx = regs->tf_ecx;
418 sf.sf_siginfo.si_sc.sc_edx = regs->tf_edx;
419 sf.sf_siginfo.si_sc.sc_esi = regs->tf_esi;
420 sf.sf_siginfo.si_sc.sc_edi = regs->tf_edi;
421 sf.sf_siginfo.si_sc.sc_cs = regs->tf_cs;
422 sf.sf_siginfo.si_sc.sc_ds = regs->tf_ds;
423 sf.sf_siginfo.si_sc.sc_ss = regs->tf_ss;
424 sf.sf_siginfo.si_sc.sc_es = regs->tf_es;
425 sf.sf_siginfo.si_sc.sc_fs = regs->tf_fs;
426 sf.sf_siginfo.si_sc.sc_gs = rgs();
427 sf.sf_siginfo.si_sc.sc_isp = regs->tf_isp;
428
429 /* Build the signal context to be used by osigreturn(). */
430 sf.sf_siginfo.si_sc.sc_onstack = (oonstack) ? 1 : 0;
431 SIG2OSIG(*mask, sf.sf_siginfo.si_sc.sc_mask);
432 sf.sf_siginfo.si_sc.sc_sp = regs->tf_esp;
433 sf.sf_siginfo.si_sc.sc_fp = regs->tf_ebp;
434 sf.sf_siginfo.si_sc.sc_pc = regs->tf_eip;
435 sf.sf_siginfo.si_sc.sc_ps = regs->tf_eflags;
436 sf.sf_siginfo.si_sc.sc_trapno = regs->tf_trapno;
437 sf.sf_siginfo.si_sc.sc_err = regs->tf_err;
438
439 /*
440 * If we're a vm86 process, we want to save the segment registers.
441 * We also change eflags to be our emulated eflags, not the actual
442 * eflags.
443 */
444 if (regs->tf_eflags & PSL_VM) {
445 /* XXX confusing names: `tf' isn't a trapframe; `regs' is. */
446 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
447 struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86;
448
449 sf.sf_siginfo.si_sc.sc_gs = tf->tf_vm86_gs;
450 sf.sf_siginfo.si_sc.sc_fs = tf->tf_vm86_fs;
451 sf.sf_siginfo.si_sc.sc_es = tf->tf_vm86_es;
452 sf.sf_siginfo.si_sc.sc_ds = tf->tf_vm86_ds;
453
454 if (vm86->vm86_has_vme == 0)
455 sf.sf_siginfo.si_sc.sc_ps =
456 (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
457 (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));
458
459 /* See sendsig() for comments. */
460 tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
461 }
462
463 /*
464 * Copy the sigframe out to the user's stack.
465 */
466 if (copyout(&sf, fp, sizeof(*fp)) != 0) {
467#ifdef DEBUG
468 printf("process %ld has trashed its stack\n", (long)p->p_pid);
469#endif
470 PROC_LOCK(p);
471 sigexit(td, SIGILL);
472 }
473
474 regs->tf_esp = (int)fp;
475 if (p->p_sysent->sv_sigcode_base != 0) {
476 regs->tf_eip = p->p_sysent->sv_sigcode_base + szsigcode -
477 szosigcode;
478 } else {
479 /* a.out sysentvec does not use shared page */
480 regs->tf_eip = p->p_sysent->sv_psstrings - szosigcode;
481 }
482 regs->tf_eflags &= ~(PSL_T | PSL_D);
483 regs->tf_cs = _ucodesel;
484 regs->tf_ds = _udatasel;
485 regs->tf_es = _udatasel;
486 regs->tf_fs = _udatasel;
487 load_gs(_udatasel);
488 regs->tf_ss = _udatasel;
489 PROC_LOCK(p);
490 mtx_lock(&psp->ps_mtx);
491}
492#endif /* COMPAT_43 */
493
494#ifdef COMPAT_FREEBSD4
495static void
496freebsd4_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
497{
498 struct sigframe4 sf, *sfp;
499 struct proc *p;
500 struct thread *td;
501 struct sigacts *psp;
502 struct trapframe *regs;
503 int sig;
504 int oonstack;
505
506 td = curthread;
507 p = td->td_proc;
508 PROC_LOCK_ASSERT(p, MA_OWNED);
509 sig = ksi->ksi_signo;
510 psp = p->p_sigacts;
511 mtx_assert(&psp->ps_mtx, MA_OWNED);
512 regs = td->td_frame;
513 oonstack = sigonstack(regs->tf_esp);
514
515 /* Save user context. */
516 bzero(&sf, sizeof(sf));
517 sf.sf_uc.uc_sigmask = *mask;
518 sf.sf_uc.uc_stack = td->td_sigstk;
519 sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
520 ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
521 sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
522 sf.sf_uc.uc_mcontext.mc_gs = rgs();
523 bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs));
524 bzero(sf.sf_uc.uc_mcontext.mc_fpregs,
525 sizeof(sf.sf_uc.uc_mcontext.mc_fpregs));
526 bzero(sf.sf_uc.uc_mcontext.__spare__,
527 sizeof(sf.sf_uc.uc_mcontext.__spare__));
528 bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__));
529
530 /* Allocate space for the signal handler context. */
531 if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
532 SIGISMEMBER(psp->ps_sigonstack, sig)) {
533 sfp = (struct sigframe4 *)(td->td_sigstk.ss_sp +
534 td->td_sigstk.ss_size - sizeof(struct sigframe4));
535#if defined(COMPAT_43)
536 td->td_sigstk.ss_flags |= SS_ONSTACK;
537#endif
538 } else
539 sfp = (struct sigframe4 *)regs->tf_esp - 1;
540
541 /* Translate the signal if appropriate. */
542 if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize)
543 sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
544
545 /* Build the argument list for the signal handler. */
546 sf.sf_signum = sig;
547 sf.sf_ucontext = (register_t)&sfp->sf_uc;
548 bzero(&sf.sf_si, sizeof(sf.sf_si));
549 if (SIGISMEMBER(psp->ps_siginfo, sig)) {
550 /* Signal handler installed with SA_SIGINFO. */
551 sf.sf_siginfo = (register_t)&sfp->sf_si;
552 sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;
553
554 /* Fill in POSIX parts */
555 sf.sf_si.si_signo = sig;
556 sf.sf_si.si_code = ksi->ksi_code;
557 sf.sf_si.si_addr = ksi->ksi_addr;
558 } else {
559 /* Old FreeBSD-style arguments. */
560 sf.sf_siginfo = ksi->ksi_code;
561 sf.sf_addr = (register_t)ksi->ksi_addr;
562 sf.sf_ahu.sf_handler = catcher;
563 }
564 mtx_unlock(&psp->ps_mtx);
565 PROC_UNLOCK(p);
566
567 /*
568 * If we're a vm86 process, we want to save the segment registers.
569 * We also change eflags to be our emulated eflags, not the actual
570 * eflags.
571 */
572 if (regs->tf_eflags & PSL_VM) {
573 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
574 struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86;
575
576 sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs;
577 sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs;
578 sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es;
579 sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds;
580
581 if (vm86->vm86_has_vme == 0)
582 sf.sf_uc.uc_mcontext.mc_eflags =
583 (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
584 (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));
585
586 /*
587 * Clear PSL_NT to inhibit T_TSSFLT faults on return from
588 * syscalls made by the signal handler. This just avoids
589 * wasting time for our lazy fixup of such faults. PSL_NT
590 * does nothing in vm86 mode, but vm86 programs can set it
591 * almost legitimately in probes for old cpu types.
592 */
593 tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
594 }
595
596 /*
597 * Copy the sigframe out to the user's stack.
598 */
599 if (copyout(&sf, sfp, sizeof(*sfp)) != 0) {
600#ifdef DEBUG
601 printf("process %ld has trashed its stack\n", (long)p->p_pid);
602#endif
603 PROC_LOCK(p);
604 sigexit(td, SIGILL);
605 }
606
607 regs->tf_esp = (int)sfp;
608 regs->tf_eip = p->p_sysent->sv_sigcode_base + szsigcode -
609 szfreebsd4_sigcode;
610 regs->tf_eflags &= ~(PSL_T | PSL_D);
611 regs->tf_cs = _ucodesel;
612 regs->tf_ds = _udatasel;
613 regs->tf_es = _udatasel;
614 regs->tf_fs = _udatasel;
615 regs->tf_ss = _udatasel;
616 PROC_LOCK(p);
617 mtx_lock(&psp->ps_mtx);
618}
619#endif /* COMPAT_FREEBSD4 */
620
621void
622sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
623{
624 struct sigframe sf, *sfp;
625 struct proc *p;
626 struct thread *td;
627 struct sigacts *psp;
628 char *sp;
629 struct trapframe *regs;
630 struct segment_descriptor *sdp;
631 int sig;
632 int oonstack;
633
634 td = curthread;
635 p = td->td_proc;
636 PROC_LOCK_ASSERT(p, MA_OWNED);
637 sig = ksi->ksi_signo;
638 psp = p->p_sigacts;
639 mtx_assert(&psp->ps_mtx, MA_OWNED);
640#ifdef COMPAT_FREEBSD4
641 if (SIGISMEMBER(psp->ps_freebsd4, sig)) {
642 freebsd4_sendsig(catcher, ksi, mask);
643 return;
644 }
645#endif
646#ifdef COMPAT_43
647 if (SIGISMEMBER(psp->ps_osigset, sig)) {
648 osendsig(catcher, ksi, mask);
649 return;
650 }
651#endif
652 regs = td->td_frame;
653 oonstack = sigonstack(regs->tf_esp);
654
655 /* Save user context. */
656 bzero(&sf, sizeof(sf));
657 sf.sf_uc.uc_sigmask = *mask;
658 sf.sf_uc.uc_stack = td->td_sigstk;
659 sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
660 ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
661 sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
662 sf.sf_uc.uc_mcontext.mc_gs = rgs();
663 bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs));
664 sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
665 get_fpcontext(td, &sf.sf_uc.uc_mcontext);
666 fpstate_drop(td);
667 /*
668 * Unconditionally fill the fsbase and gsbase into the mcontext.
669 */
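	/*
	 * The descriptor stores the 32-bit segment base split across two
	 * fields: sd_lobase holds bits 0-23 and sd_hibase bits 24-31, so
	 * the shift by 24 below reassembles the flat base address.
	 */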
670 sdp = &td->td_pcb->pcb_fsd;
671 sf.sf_uc.uc_mcontext.mc_fsbase = sdp->sd_hibase << 24 |
672 sdp->sd_lobase;
673 sdp = &td->td_pcb->pcb_gsd;
674 sf.sf_uc.uc_mcontext.mc_gsbase = sdp->sd_hibase << 24 |
675 sdp->sd_lobase;
676 sf.sf_uc.uc_mcontext.mc_flags = 0;
677 bzero(sf.sf_uc.uc_mcontext.mc_spare2,
678 sizeof(sf.sf_uc.uc_mcontext.mc_spare2));
679 bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__));
680
681 /* Allocate space for the signal handler context. */
682 if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
683 SIGISMEMBER(psp->ps_sigonstack, sig)) {
684 sp = td->td_sigstk.ss_sp +
685 td->td_sigstk.ss_size - sizeof(struct sigframe);
686#if defined(COMPAT_43)
687 td->td_sigstk.ss_flags |= SS_ONSTACK;
688#endif
689 } else
690 sp = (char *)regs->tf_esp - sizeof(struct sigframe);
691 /* Align to 16 bytes. */
692 sfp = (struct sigframe *)((unsigned int)sp & ~0xF);
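	/*
	 * Rounding down stays within the space reserved above and hands
	 * the handler a 16-byte aligned stack, which SSE-using code
	 * generally assumes.
	 */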
693
694 /* Translate the signal if appropriate. */
695 if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize)
696 sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
697
698 /* Build the argument list for the signal handler. */
699 sf.sf_signum = sig;
700 sf.sf_ucontext = (register_t)&sfp->sf_uc;
701 bzero(&sf.sf_si, sizeof(sf.sf_si));
702 if (SIGISMEMBER(psp->ps_siginfo, sig)) {
703 /* Signal handler installed with SA_SIGINFO. */
704 sf.sf_siginfo = (register_t)&sfp->sf_si;
705 sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;
706
707 /* Fill in POSIX parts */
708 sf.sf_si = ksi->ksi_info;
709 sf.sf_si.si_signo = sig; /* maybe a translated signal */
710 } else {
711 /* Old FreeBSD-style arguments. */
712 sf.sf_siginfo = ksi->ksi_code;
713 sf.sf_addr = (register_t)ksi->ksi_addr;
714 sf.sf_ahu.sf_handler = catcher;
715 }
716 mtx_unlock(&psp->ps_mtx);
717 PROC_UNLOCK(p);
718
719 /*
720 * If we're a vm86 process, we want to save the segment registers.
721 * We also change eflags to be our emulated eflags, not the actual
722 * eflags.
723 */
724 if (regs->tf_eflags & PSL_VM) {
725 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
726 struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86;
727
728 sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs;
729 sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs;
730 sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es;
731 sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds;
732
733 if (vm86->vm86_has_vme == 0)
734 sf.sf_uc.uc_mcontext.mc_eflags =
735 (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
736 (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));
737
738 /*
739 * Clear PSL_NT to inhibit T_TSSFLT faults on return from
740 * syscalls made by the signal handler. This just avoids
741 * wasting time for our lazy fixup of such faults. PSL_NT
742 * does nothing in vm86 mode, but vm86 programs can set it
743 * almost legitimately in probes for old cpu types.
744 */
745 tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
746 }
747
748 /*
749 * Copy the sigframe out to the user's stack.
750 */
751 if (copyout(&sf, sfp, sizeof(*sfp)) != 0) {
752#ifdef DEBUG
753 printf("process %ld has trashed its stack\n", (long)p->p_pid);
754#endif
755 PROC_LOCK(p);
756 sigexit(td, SIGILL);
757 }
758
759 regs->tf_esp = (int)sfp;
760 regs->tf_eip = p->p_sysent->sv_sigcode_base;
761 regs->tf_eflags &= ~(PSL_T | PSL_D);
762 regs->tf_cs = _ucodesel;
763 regs->tf_ds = _udatasel;
764 regs->tf_es = _udatasel;
765 regs->tf_fs = _udatasel;
766 regs->tf_ss = _udatasel;
767 PROC_LOCK(p);
768 mtx_lock(&psp->ps_mtx);
769}
770
771/*
772 * System call to cleanup state after a signal
773 * has been taken. Reset signal mask and
774 * stack state from context left by sendsig (above).
775 * Return to previous pc and psl as specified by
776 * context left by sendsig. Check carefully to
777 * make sure that the user has not modified the
778 * state to gain improper privileges.
779 *
780 * MPSAFE
781 */
782#ifdef COMPAT_43
783int
784osigreturn(td, uap)
785 struct thread *td;
786 struct osigreturn_args /* {
787 struct osigcontext *sigcntxp;
788 } */ *uap;
789{
790 struct osigcontext sc;
791 struct trapframe *regs;
792 struct osigcontext *scp;
793 int eflags, error;
794 ksiginfo_t ksi;
795
796 regs = td->td_frame;
797 error = copyin(uap->sigcntxp, &sc, sizeof(sc));
798 if (error != 0)
799 return (error);
800 scp = &sc;
801 eflags = scp->sc_ps;
802 if (eflags & PSL_VM) {
803 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
804 struct vm86_kernel *vm86;
805
806 /*
807 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
808 * set up the vm86 area, and we can't enter vm86 mode.
809 */
810 if (td->td_pcb->pcb_ext == 0)
811 return (EINVAL);
812 vm86 = &td->td_pcb->pcb_ext->ext_vm86;
813 if (vm86->vm86_inited == 0)
814 return (EINVAL);
815
816 /* Go back to user mode if both flags are set. */
817 if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) {
818 ksiginfo_init_trap(&ksi);
819 ksi.ksi_signo = SIGBUS;
820 ksi.ksi_code = BUS_OBJERR;
821 ksi.ksi_addr = (void *)regs->tf_eip;
822 trapsignal(td, &ksi);
823 }
824
825 if (vm86->vm86_has_vme) {
826 eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
827 (eflags & VME_USERCHANGE) | PSL_VM;
828 } else {
829 vm86->vm86_eflags = eflags; /* save VIF, VIP */
830 eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
831 (eflags & VM_USERCHANGE) | PSL_VM;
832 }
833 tf->tf_vm86_ds = scp->sc_ds;
834 tf->tf_vm86_es = scp->sc_es;
835 tf->tf_vm86_fs = scp->sc_fs;
836 tf->tf_vm86_gs = scp->sc_gs;
837 tf->tf_ds = _udatasel;
838 tf->tf_es = _udatasel;
839 tf->tf_fs = _udatasel;
840 } else {
841 /*
842 * Don't allow users to change privileged or reserved flags.
843 */
844 /*
845 * XXX do allow users to change the privileged flag PSL_RF.
846 * The cpu sets PSL_RF in tf_eflags for faults. Debuggers
847 * should sometimes set it there too. tf_eflags is kept in
848 * the signal context during signal handling and there is no
849 * other place to remember it, so the PSL_RF bit may be
850 * corrupted by the signal handler without us knowing.
851 * Corruption of the PSL_RF bit at worst causes one more or
852 * one less debugger trap, so allowing it is fairly harmless.
853 */
854 if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) {
855 return (EINVAL);
856 }
857
858 /*
859 * Don't allow users to load a valid privileged %cs. Let the
860 * hardware check for invalid selectors, excess privilege in
861 * other selectors, invalid %eip's and invalid %esp's.
862 */
863 if (!CS_SECURE(scp->sc_cs)) {
864 ksiginfo_init_trap(&ksi);
865 ksi.ksi_signo = SIGBUS;
866 ksi.ksi_code = BUS_OBJERR;
867 ksi.ksi_trapno = T_PROTFLT;
868 ksi.ksi_addr = (void *)regs->tf_eip;
869 trapsignal(td, &ksi);
870 return (EINVAL);
871 }
872 regs->tf_ds = scp->sc_ds;
873 regs->tf_es = scp->sc_es;
874 regs->tf_fs = scp->sc_fs;
875 }
876
877 /* Restore remaining registers. */
878 regs->tf_eax = scp->sc_eax;
879 regs->tf_ebx = scp->sc_ebx;
880 regs->tf_ecx = scp->sc_ecx;
881 regs->tf_edx = scp->sc_edx;
882 regs->tf_esi = scp->sc_esi;
883 regs->tf_edi = scp->sc_edi;
884 regs->tf_cs = scp->sc_cs;
885 regs->tf_ss = scp->sc_ss;
886 regs->tf_isp = scp->sc_isp;
887 regs->tf_ebp = scp->sc_fp;
888 regs->tf_esp = scp->sc_sp;
889 regs->tf_eip = scp->sc_pc;
890 regs->tf_eflags = eflags;
891
892#if defined(COMPAT_43)
893 if (scp->sc_onstack & 1)
894 td->td_sigstk.ss_flags |= SS_ONSTACK;
895 else
896 td->td_sigstk.ss_flags &= ~SS_ONSTACK;
897#endif
898 kern_sigprocmask(td, SIG_SETMASK, (sigset_t *)&scp->sc_mask, NULL,
899 SIGPROCMASK_OLD);
900 return (EJUSTRETURN);
901}
902#endif /* COMPAT_43 */
903
904#ifdef COMPAT_FREEBSD4
905/*
906 * MPSAFE
907 */
908int
909freebsd4_sigreturn(td, uap)
910 struct thread *td;
911 struct freebsd4_sigreturn_args /* {
912 const ucontext4 *sigcntxp;
913 } */ *uap;
914{
915 struct ucontext4 uc;
916 struct trapframe *regs;
917 struct ucontext4 *ucp;
918 int cs, eflags, error;
919 ksiginfo_t ksi;
920
921 error = copyin(uap->sigcntxp, &uc, sizeof(uc));
922 if (error != 0)
923 return (error);
924 ucp = &uc;
925 regs = td->td_frame;
926 eflags = ucp->uc_mcontext.mc_eflags;
927 if (eflags & PSL_VM) {
928 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
929 struct vm86_kernel *vm86;
930
931 /*
932 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
933 * set up the vm86 area, and we can't enter vm86 mode.
934 */
935 if (td->td_pcb->pcb_ext == 0)
936 return (EINVAL);
937 vm86 = &td->td_pcb->pcb_ext->ext_vm86;
938 if (vm86->vm86_inited == 0)
939 return (EINVAL);
940
941 /* Go back to user mode if both flags are set. */
942 if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) {
943 ksiginfo_init_trap(&ksi);
944 ksi.ksi_signo = SIGBUS;
945 ksi.ksi_code = BUS_OBJERR;
946 ksi.ksi_addr = (void *)regs->tf_eip;
947 trapsignal(td, &ksi);
948 }
949 if (vm86->vm86_has_vme) {
950 eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
951 (eflags & VME_USERCHANGE) | PSL_VM;
952 } else {
953 vm86->vm86_eflags = eflags; /* save VIF, VIP */
954 eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
955 (eflags & VM_USERCHANGE) | PSL_VM;
956 }
957 bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe));
958 tf->tf_eflags = eflags;
959 tf->tf_vm86_ds = tf->tf_ds;
960 tf->tf_vm86_es = tf->tf_es;
961 tf->tf_vm86_fs = tf->tf_fs;
962 tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs;
963 tf->tf_ds = _udatasel;
964 tf->tf_es = _udatasel;
965 tf->tf_fs = _udatasel;
966 } else {
967 /*
968 * Don't allow users to change privileged or reserved flags.
969 */
970 /*
971 * XXX do allow users to change the privileged flag PSL_RF.
972 * The cpu sets PSL_RF in tf_eflags for faults. Debuggers
973 * should sometimes set it there too. tf_eflags is kept in
974 * the signal context during signal handling and there is no
975 * other place to remember it, so the PSL_RF bit may be
976 * corrupted by the signal handler without us knowing.
977 * Corruption of the PSL_RF bit at worst causes one more or
978 * one less debugger trap, so allowing it is fairly harmless.
979 */
980 if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) {
981 uprintf("pid %d (%s): freebsd4_sigreturn eflags = 0x%x\n",
982 td->td_proc->p_pid, td->td_name, eflags);
983 return (EINVAL);
984 }
985
986 /*
987 * Don't allow users to load a valid privileged %cs. Let the
988 * hardware check for invalid selectors, excess privilege in
989 * other selectors, invalid %eip's and invalid %esp's.
990 */
991 cs = ucp->uc_mcontext.mc_cs;
992 if (!CS_SECURE(cs)) {
993 uprintf("pid %d (%s): freebsd4_sigreturn cs = 0x%x\n",
994 td->td_proc->p_pid, td->td_name, cs);
995 ksiginfo_init_trap(&ksi);
996 ksi.ksi_signo = SIGBUS;
997 ksi.ksi_code = BUS_OBJERR;
998 ksi.ksi_trapno = T_PROTFLT;
999 ksi.ksi_addr = (void *)regs->tf_eip;
1000 trapsignal(td, &ksi);
1001 return (EINVAL);
1002 }
1003
1004 bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs));
1005 }
1006
1007#if defined(COMPAT_43)
1008 if (ucp->uc_mcontext.mc_onstack & 1)
1009 td->td_sigstk.ss_flags |= SS_ONSTACK;
1010 else
1011 td->td_sigstk.ss_flags &= ~SS_ONSTACK;
1012#endif
1013 kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
1014 return (EJUSTRETURN);
1015}
1016#endif /* COMPAT_FREEBSD4 */
1017
1018/*
1019 * MPSAFE
1020 */
1021int
1022sys_sigreturn(td, uap)
1023 struct thread *td;
1024 struct sigreturn_args /* {
1025 const struct __ucontext *sigcntxp;
1026 } */ *uap;
1027{
1028 ucontext_t uc;
1029 struct trapframe *regs;
1030 ucontext_t *ucp;
1031 int cs, eflags, error, ret;
1032 ksiginfo_t ksi;
1033
1034 error = copyin(uap->sigcntxp, &uc, sizeof(uc));
1035 if (error != 0)
1036 return (error);
1037 ucp = &uc;
1038 regs = td->td_frame;
1039 eflags = ucp->uc_mcontext.mc_eflags;
1040 if (eflags & PSL_VM) {
1041 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
1042 struct vm86_kernel *vm86;
1043
1044 /*
1045 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
1046 * set up the vm86 area, and we can't enter vm86 mode.
1047 */
1048 if (td->td_pcb->pcb_ext == 0)
1049 return (EINVAL);
1050 vm86 = &td->td_pcb->pcb_ext->ext_vm86;
1051 if (vm86->vm86_inited == 0)
1052 return (EINVAL);
1053
1054 /* Go back to user mode if both flags are set. */
1055 if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) {
1056 ksiginfo_init_trap(&ksi);
1057 ksi.ksi_signo = SIGBUS;
1058 ksi.ksi_code = BUS_OBJERR;
1059 ksi.ksi_addr = (void *)regs->tf_eip;
1060 trapsignal(td, &ksi);
1061 }
1062
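		/*
		 * With VME the CPU keeps VIF/VIP in the real %eflags, so the
		 * user's values may pass through (VME_USERCHANGE); without
		 * it they are software-emulated and must be stashed in
		 * vm86_eflags instead.
		 */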
1063 if (vm86->vm86_has_vme) {
1064 eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
1065 (eflags & VME_USERCHANGE) | PSL_VM;
1066 } else {
1067 vm86->vm86_eflags = eflags; /* save VIF, VIP */
1068 eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
1069 (eflags & VM_USERCHANGE) | PSL_VM;
1070 }
1071 bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe));
1072 tf->tf_eflags = eflags;
1073 tf->tf_vm86_ds = tf->tf_ds;
1074 tf->tf_vm86_es = tf->tf_es;
1075 tf->tf_vm86_fs = tf->tf_fs;
1076 tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs;
1077 tf->tf_ds = _udatasel;
1078 tf->tf_es = _udatasel;
1079 tf->tf_fs = _udatasel;
1080 } else {
1081 /*
1082 * Don't allow users to change privileged or reserved flags.
1083 */
1084 /*
1085 * XXX do allow users to change the privileged flag PSL_RF.
1086 * The cpu sets PSL_RF in tf_eflags for faults. Debuggers
1087 * should sometimes set it there too. tf_eflags is kept in
1088 * the signal context during signal handling and there is no
1089 * other place to remember it, so the PSL_RF bit may be
1090 * corrupted by the signal handler without us knowing.
1091 * Corruption of the PSL_RF bit at worst causes one more or
1092 * one less debugger trap, so allowing it is fairly harmless.
1093 */
1094 if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) {
1095 uprintf("pid %d (%s): sigreturn eflags = 0x%x\n",
1096 td->td_proc->p_pid, td->td_name, eflags);
1097 return (EINVAL);
1098 }
1099
1100 /*
1101 * Don't allow users to load a valid privileged %cs. Let the
1102 * hardware check for invalid selectors, excess privilege in
1103 * other selectors, invalid %eip's and invalid %esp's.
1104 */
1105 cs = ucp->uc_mcontext.mc_cs;
1106 if (!CS_SECURE(cs)) {
1107 uprintf("pid %d (%s): sigreturn cs = 0x%x\n",
1108 td->td_proc->p_pid, td->td_name, cs);
1109 ksiginfo_init_trap(&ksi);
1110 ksi.ksi_signo = SIGBUS;
1111 ksi.ksi_code = BUS_OBJERR;
1112 ksi.ksi_trapno = T_PROTFLT;
1113 ksi.ksi_addr = (void *)regs->tf_eip;
1114 trapsignal(td, &ksi);
1115 return (EINVAL);
1116 }
1117
1118 ret = set_fpcontext(td, &ucp->uc_mcontext);
1119 if (ret != 0)
1120 return (ret);
1121 bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs));
1122 }
1123
1124#if defined(COMPAT_43)
1125 if (ucp->uc_mcontext.mc_onstack & 1)
1126 td->td_sigstk.ss_flags |= SS_ONSTACK;
1127 else
1128 td->td_sigstk.ss_flags &= ~SS_ONSTACK;
1129#endif
1130
1131 kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
1132 return (EJUSTRETURN);
1133}
1134
1135/*
1136 * Machine dependent boot() routine
1137 *
1138 * I haven't seen anything to put here yet
1139 * Possibly some stuff might be grafted back here from boot()
1140 */
1141void
1142cpu_boot(int howto)
1143{
1144}
1145
1146/*
1147 * Flush the D-cache for non-DMA I/O so that the I-cache can
1148 * be made coherent later.
1149 */
1150void
1151cpu_flush_dcache(void *ptr, size_t len)
1152{
1153 /* Not applicable */
1154}
1155
1156/* Get current clock frequency for the given cpu id. */
1157int
1158cpu_est_clockrate(int cpu_id, uint64_t *rate)
1159{
1160 uint64_t tsc1, tsc2;
1161 uint64_t acnt, mcnt, perf;
1162 register_t reg;
1163
1164 if (pcpu_find(cpu_id) == NULL || rate == NULL)
1165 return (EINVAL);
1166 if ((cpu_feature & CPUID_TSC) == 0)
1167 return (EOPNOTSUPP);
1168
1169 /*
1170 * If TSC is P-state invariant and APERF/MPERF MSRs do not exist,
1171 * DELAY(9) based logic fails.
1172 */
1173 if (tsc_is_invariant && !tsc_perf_stat)
1174 return (EOPNOTSUPP);
1175
1176#ifdef SMP
1177 if (smp_cpus > 1) {
1178 /* Schedule ourselves on the indicated cpu. */
1179 thread_lock(curthread);
1180 sched_bind(curthread, cpu_id);
1181 thread_unlock(curthread);
1182 }
1183#endif
1184
1185 /* Calibrate by measuring a short delay. */
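	/*
	 * DELAY(1000) spins for about 1 ms, so (tsc2 - tsc1) * 1000 gives
	 * TSC ticks per second.  An invariant TSC ticks at the nominal
	 * rate, so the APERF/MPERF ratio is folded in to report the
	 * effective core clock instead.
	 */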
1186 reg = intr_disable();
1187 if (tsc_is_invariant) {
1188 wrmsr(MSR_MPERF, 0);
1189 wrmsr(MSR_APERF, 0);
1190 tsc1 = rdtsc();
1191 DELAY(1000);
1192 mcnt = rdmsr(MSR_MPERF);
1193 acnt = rdmsr(MSR_APERF);
1194 tsc2 = rdtsc();
1195 intr_restore(reg);
1196 perf = 1000 * acnt / mcnt;
1197 *rate = (tsc2 - tsc1) * perf;
1198 } else {
1199 tsc1 = rdtsc();
1200 DELAY(1000);
1201 tsc2 = rdtsc();
1202 intr_restore(reg);
1203 *rate = (tsc2 - tsc1) * 1000;
1204 }
1205
1206#ifdef SMP
1207 if (smp_cpus > 1) {
1208 thread_lock(curthread);
1209 sched_unbind(curthread);
1210 thread_unlock(curthread);
1211 }
1212#endif
1213
1214 return (0);
1215}
1216
1217#ifdef XEN
1218
1219void
1220cpu_halt(void)
1221{
1222 HYPERVISOR_shutdown(SHUTDOWN_poweroff);
1223}
1224
1225int scheduler_running;
1226
1227static void
1228cpu_idle_hlt(sbintime_t sbt)
1229{
1230
1231 scheduler_running = 1;
1232 enable_intr();
1233 idle_block();
1234}
1235
1236#else
1237/*
1238 * Shutdown the CPU as much as possible
1239 */
1240void
1241cpu_halt(void)
1242{
1243 for (;;)
1244 halt();
1245}
1246
1247#endif
1248
1249void (*cpu_idle_hook)(sbintime_t) = NULL; /* ACPI idle hook. */
1250static int cpu_ident_amdc1e = 0; /* AMD C1E supported. */
1251static int idle_mwait = 1; /* Use MONITOR/MWAIT for short idle. */
1252TUNABLE_INT("machdep.idle_mwait", &idle_mwait);
1253SYSCTL_INT(_machdep, OID_AUTO, idle_mwait, CTLFLAG_RW, &idle_mwait,
1254 0, "Use MONITOR/MWAIT for short idle");
1255
1256#define STATE_RUNNING 0x0
1257#define STATE_MWAIT 0x1
1258#define STATE_SLEEPING 0x2
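/*
 * The idle state word lives at the start of each CPU's monitorbuf and is
 * also the MONITOR target, so a remote store to it (see cpu_idle_wakeup())
 * both records the transition and wakes a CPU parked in mwait.
 */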
1259
1260static void
1261cpu_idle_acpi(sbintime_t sbt)
1262{
1263 int *state;
1264
1265 state = (int *)PCPU_PTR(monitorbuf);
1266 *state = STATE_SLEEPING;
1267
1268 /* See comments in cpu_idle_hlt(). */
1269 disable_intr();
1270 if (sched_runnable())
1271 enable_intr();
1272 else if (cpu_idle_hook)
1273 cpu_idle_hook(sbt);
1274 else
1275 __asm __volatile("sti; hlt");
1276 *state = STATE_RUNNING;
1277}
1278
1279#ifndef XEN
1280static void
1281cpu_idle_hlt(sbintime_t sbt)
1282{
1283 int *state;
1284
1285 state = (int *)PCPU_PTR(monitorbuf);
1286 *state = STATE_SLEEPING;
1287
1288 /*
1289 * Since we may be in a critical section from cpu_idle(), if
1290 * an interrupt fires during that critical section we may have
1291 * a pending preemption. If the CPU halts, then that thread
1292 * may not execute until a later interrupt awakens the CPU.
1293 * To handle this race, check for a runnable thread after
1294 * disabling interrupts and immediately return if one is
 1295	 * found.  Also, we must absolutely guarantee that hlt is
 1296	 * the next instruction after sti.  This ensures that any
 1297	 * interrupt that fires after the call to disable_intr() will
 1298	 * immediately awaken the CPU from hlt.  Finally, note that this
 1299	 * works on x86 because interrupts are recognized only after the
 1300	 * instruction following sti completes, while IF is set
 1301	 * immediately, so the hlt can still acknowledge the pending
 1302	 * interrupt.
1303 */
1304 disable_intr();
1305 if (sched_runnable())
1306 enable_intr();
1307 else
1308 __asm __volatile("sti; hlt");
1309 *state = STATE_RUNNING;
1310}
1311#endif
1312
1313/*
1314 * MWAIT cpu power states. Lower 4 bits are sub-states.
1315 */
1316#define MWAIT_C0 0xf0
1317#define MWAIT_C1 0x00
1318#define MWAIT_C2 0x10
1319#define MWAIT_C3 0x20
1320#define MWAIT_C4 0x30
1321
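/*
 * In the MWAIT hint passed in %eax, bits 7-4 select the target C-state
 * (0 selects C1, matching the defines above) and bits 3-0 a sub-state.
 */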
1322static void
1323cpu_idle_mwait(sbintime_t sbt)
1324{
1325 int *state;
1326
1327 state = (int *)PCPU_PTR(monitorbuf);
1328 *state = STATE_MWAIT;
1329
1330 /* See comments in cpu_idle_hlt(). */
1331 disable_intr();
1332 if (sched_runnable()) {
1333 enable_intr();
1334 *state = STATE_RUNNING;
1335 return;
1336 }
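	/*
	 * Arm the monitor before the final state check: a remote store of
	 * STATE_RUNNING that slips in between the check and the mwait
	 * trips the armed monitor, so the mwait returns immediately.
	 */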
1337 cpu_monitor(state, 0, 0);
1338 if (*state == STATE_MWAIT)
1339 __asm __volatile("sti; mwait" : : "a" (MWAIT_C1), "c" (0));
1340 else
1341 enable_intr();
1342 *state = STATE_RUNNING;
1343}
1344
1345static void
1346cpu_idle_spin(sbintime_t sbt)
1347{
1348 int *state;
1349 int i;
1350
1351 state = (int *)PCPU_PTR(monitorbuf);
1352 *state = STATE_RUNNING;
1353
1354 /*
 1355	 * The sched_runnable() call is racy, but since it sits in a loop,
 1356	 * missing it one iteration has little impact, if any (and it is
 1357	 * much better than not checking at all).
1358 */
1359 for (i = 0; i < 1000; i++) {
1360 if (sched_runnable())
1361 return;
1362 cpu_spinwait();
1363 }
1364}
1365
1366/*
1367 * C1E renders the local APIC timer dead, so we disable it by
1368 * reading the Interrupt Pending Message register and clearing
1369 * both C1eOnCmpHalt (bit 28) and SmiOnCmpHalt (bit 27).
1370 *
1371 * Reference:
1372 * "BIOS and Kernel Developer's Guide for AMD NPT Family 0Fh Processors"
1373 * #32559 revision 3.00+
1374 */
1375#define MSR_AMDK8_IPM 0xc0010055
1376#define AMDK8_SMIONCMPHALT (1ULL << 27)
1377#define AMDK8_C1EONCMPHALT (1ULL << 28)
1378#define AMDK8_CMPHALT (AMDK8_SMIONCMPHALT | AMDK8_C1EONCMPHALT)
1379
1380static void
1381cpu_probe_amdc1e(void)
1382{
1383
1384 /*
1385 * Detect the presence of C1E capability mostly on latest
1386 * dual-cores (or future) k8 family.
1387 */
1388 if (cpu_vendor_id == CPU_VENDOR_AMD &&
1389 (cpu_id & 0x00000f00) == 0x00000f00 &&
1390 (cpu_id & 0x0fff0000) >= 0x00040000) {
1391 cpu_ident_amdc1e = 1;
1392 }
1393}
1394
1395#ifdef XEN
1396void (*cpu_idle_fn)(sbintime_t) = cpu_idle_hlt;
1397#else
1398void (*cpu_idle_fn)(sbintime_t) = cpu_idle_acpi;
1399#endif
1400
1401void
1402cpu_idle(int busy)
1403{
1404#ifndef XEN
1405 uint64_t msr;
1406#endif
1407 sbintime_t sbt = -1;
1408
1409 CTR2(KTR_SPARE2, "cpu_idle(%d) at %d",
1410 busy, curcpu);
1411#if defined(MP_WATCHDOG) && !defined(XEN)
1412 ap_watchdog(PCPU_GET(cpuid));
1413#endif
1414#ifndef XEN
1415 /* If we are busy - try to use fast methods. */
1416 if (busy) {
1417 if ((cpu_feature2 & CPUID2_MON) && idle_mwait) {
1418 cpu_idle_mwait(busy);
1419 goto out;
1420 }
1421 }
1422#endif
1423
1424 /* If we have time - switch timers into idle mode. */
1425 if (!busy) {
1426 critical_enter();
1427 sbt = cpu_idleclock();
1428 }
1429
1430#ifndef XEN
1431 /* Apply AMD APIC timer C1E workaround. */
1432 if (cpu_ident_amdc1e && cpu_disable_deep_sleep) {
1433 msr = rdmsr(MSR_AMDK8_IPM);
1434 if (msr & AMDK8_CMPHALT)
1435 wrmsr(MSR_AMDK8_IPM, msr & ~AMDK8_CMPHALT);
1436 }
1437#endif
1438
1439 /* Call main idle method. */
1440 cpu_idle_fn(sbt);
1441
 1442	/* Switch timers back into active mode. */
1443 if (!busy) {
1444 cpu_activeclock();
1445 critical_exit();
1446 }
1447#ifndef XEN
1448out:
1449#endif
1450 CTR2(KTR_SPARE2, "cpu_idle(%d) at %d done",
1451 busy, curcpu);
1452}
1453
1454int
1455cpu_idle_wakeup(int cpu)
1456{
1457 struct pcpu *pcpu;
1458 int *state;
1459
1460 pcpu = pcpu_find(cpu);
1461 state = (int *)pcpu->pc_monitorbuf;
1462 /*
1463 * This doesn't need to be atomic since missing the race will
1464 * simply result in unnecessary IPIs.
1465 */
1466 if (*state == STATE_SLEEPING)
1467 return (0);
1468 if (*state == STATE_MWAIT)
1469 *state = STATE_RUNNING;
1470 return (1);
1471}
1472
1473/*
1474 * Ordered by speed/power consumption.
1475 */
1476struct {
1477 void *id_fn;
1478 char *id_name;
1479} idle_tbl[] = {
1480 { cpu_idle_spin, "spin" },
1481 { cpu_idle_mwait, "mwait" },
1482 { cpu_idle_hlt, "hlt" },
1483 { cpu_idle_acpi, "acpi" },
1484 { NULL, NULL }
1485};
1486
1487static int
1488idle_sysctl_available(SYSCTL_HANDLER_ARGS)
1489{
1490 char *avail, *p;
1491 int error;
1492 int i;
1493
1494 avail = malloc(256, M_TEMP, M_WAITOK);
1495 p = avail;
1496 for (i = 0; idle_tbl[i].id_name != NULL; i++) {
1497 if (strstr(idle_tbl[i].id_name, "mwait") &&
1498 (cpu_feature2 & CPUID2_MON) == 0)
1499 continue;
1500 if (strcmp(idle_tbl[i].id_name, "acpi") == 0 &&
1501 cpu_idle_hook == NULL)
1502 continue;
1503 p += sprintf(p, "%s%s", p != avail ? ", " : "",
1504 idle_tbl[i].id_name);
1505 }
1506 error = sysctl_handle_string(oidp, avail, 0, req);
1507 free(avail, M_TEMP);
1508 return (error);
1509}
1510
1511SYSCTL_PROC(_machdep, OID_AUTO, idle_available, CTLTYPE_STRING | CTLFLAG_RD,
1512 0, 0, idle_sysctl_available, "A", "list of available idle functions");
1513
1514static int
1515idle_sysctl(SYSCTL_HANDLER_ARGS)
1516{
1517 char buf[16];
1518 int error;
1519 char *p;
1520 int i;
1521
1522 p = "unknown";
1523 for (i = 0; idle_tbl[i].id_name != NULL; i++) {
1524 if (idle_tbl[i].id_fn == cpu_idle_fn) {
1525 p = idle_tbl[i].id_name;
1526 break;
1527 }
1528 }
1529 strncpy(buf, p, sizeof(buf));
1530 error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
1531 if (error != 0 || req->newptr == NULL)
1532 return (error);
1533 for (i = 0; idle_tbl[i].id_name != NULL; i++) {
1534 if (strstr(idle_tbl[i].id_name, "mwait") &&
1535 (cpu_feature2 & CPUID2_MON) == 0)
1536 continue;
1537 if (strcmp(idle_tbl[i].id_name, "acpi") == 0 &&
1538 cpu_idle_hook == NULL)
1539 continue;
1540 if (strcmp(idle_tbl[i].id_name, buf))
1541 continue;
1542 cpu_idle_fn = idle_tbl[i].id_fn;
1543 return (0);
1544 }
1545 return (EINVAL);
1546}
1547
1548SYSCTL_PROC(_machdep, OID_AUTO, idle, CTLTYPE_STRING | CTLFLAG_RW, 0, 0,
1549 idle_sysctl, "A", "currently selected idle function");
1550
1551uint64_t (*atomic_load_acq_64)(volatile uint64_t *) =
1552 atomic_load_acq_64_i386;
1553void (*atomic_store_rel_64)(volatile uint64_t *, uint64_t) =
1554 atomic_store_rel_64_i386;
1555
1556static void
1557cpu_probe_cmpxchg8b(void)
1558{
1559
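	/*
	 * The *_i386 fallbacks installed above provide 64-bit atomics
	 * without cmpxchg8b; switch to the cmpxchg8b-backed *_i586
	 * versions when the instruction is available.  Rise CPUs
	 * implement cmpxchg8b without advertising CPUID_CX8, hence the
	 * explicit vendor check.
	 */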
1560 if ((cpu_feature & CPUID_CX8) != 0 ||
1561 cpu_vendor_id == CPU_VENDOR_RISE) {
1562 atomic_load_acq_64 = atomic_load_acq_64_i586;
1563 atomic_store_rel_64 = atomic_store_rel_64_i586;
1564 }
1565}
1566
1567/*
1568 * Reset registers to default values on exec.
1569 */
1570void
1571exec_setregs(struct thread *td, struct image_params *imgp, u_long stack)
1572{
1573 struct trapframe *regs = td->td_frame;
1574 struct pcb *pcb = td->td_pcb;
1575
1576 /* Reset pc->pcb_gs and %gs before possibly invalidating it. */
1577 pcb->pcb_gs = _udatasel;
1578 load_gs(_udatasel);
1579
1580 mtx_lock_spin(&dt_lock);
1581 if (td->td_proc->p_md.md_ldt)
1582 user_ldt_free(td);
1583 else
1584 mtx_unlock_spin(&dt_lock);
1585
1586 bzero((char *)regs, sizeof(struct trapframe));
1587 regs->tf_eip = imgp->entry_addr;
1588 regs->tf_esp = stack;
1589 regs->tf_eflags = PSL_USER | (regs->tf_eflags & PSL_T);
1590 regs->tf_ss = _udatasel;
1591 regs->tf_ds = _udatasel;
1592 regs->tf_es = _udatasel;
1593 regs->tf_fs = _udatasel;
1594 regs->tf_cs = _ucodesel;
1595
1596 /* PS_STRINGS value for BSD/OS binaries. It is 0 for non-BSD/OS. */
1597 regs->tf_ebx = imgp->ps_strings;
1598
1599 /*
1600 * Reset the hardware debug registers if they were in use.
1601 * They won't have any meaning for the newly exec'd process.
1602 */
1603 if (pcb->pcb_flags & PCB_DBREGS) {
1604 pcb->pcb_dr0 = 0;
1605 pcb->pcb_dr1 = 0;
1606 pcb->pcb_dr2 = 0;
1607 pcb->pcb_dr3 = 0;
1608 pcb->pcb_dr6 = 0;
1609 pcb->pcb_dr7 = 0;
1610 if (pcb == curpcb) {
1611 /*
1612 * Clear the debug registers on the running
1613 * CPU, otherwise they will end up affecting
1614 * the next process we switch to.
1615 */
1616 reset_dbregs();
1617 }
1618 pcb->pcb_flags &= ~PCB_DBREGS;
1619 }
1620
1621 /*
1622 * Initialize the math emulator (if any) for the current process.
1623 * Actually, just clear the bit that says that the emulator has
1624 * been initialized. Initialization is delayed until the process
1625 * traps to the emulator (if it is done at all) mainly because
1626 * emulators don't provide an entry point for initialization.
1627 */
1628 td->td_pcb->pcb_flags &= ~FP_SOFTFP;
1629 pcb->pcb_initial_npxcw = __INITIAL_NPXCW__;
1630
1631 /*
1632 * Drop the FP state if we hold it, so that the process gets a
1633 * clean FP state if it uses the FPU again.
1634 */
1635 fpstate_drop(td);
1636
1637 /*
1638 * XXX - Linux emulator
 1639	 * Make sure edx is 0x0 on entry.  Linux binaries depend
1640 * on it.
1641 */
1642 td->td_retval[1] = 0;
1643}
1644
1645void
1646cpu_setregs(void)
1647{
1648 unsigned int cr0;
1649
1650 cr0 = rcr0();
1651
1652 /*
1653 * CR0_MP, CR0_NE and CR0_TS are set for NPX (FPU) support:
1654 *
1655 * Prepare to trap all ESC (i.e., NPX) instructions and all WAIT
1656 * instructions. We must set the CR0_MP bit and use the CR0_TS
1657 * bit to control the trap, because setting the CR0_EM bit does
1658 * not cause WAIT instructions to trap. It's important to trap
1659 * WAIT instructions - otherwise the "wait" variants of no-wait
1660 * control instructions would degenerate to the "no-wait" variants
1661 * after FP context switches but work correctly otherwise. It's
1662 * particularly important to trap WAITs when there is no NPX -
1663 * otherwise the "wait" variants would always degenerate.
1664 *
1665 * Try setting CR0_NE to get correct error reporting on 486DX's.
1666 * Setting it should fail or do nothing on lesser processors.
1667 */
1668 cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
1669 load_cr0(cr0);
1670 load_gs(_udatasel);
1671}
1672
1673u_long bootdev; /* not a struct cdev *- encoding is different */
1674SYSCTL_ULONG(_machdep, OID_AUTO, guessed_bootdev,
1675 CTLFLAG_RD, &bootdev, 0, "Maybe the Boot device (not in struct cdev *format)");
1676
1677/*
1678 * Initialize 386 and configure to run kernel
1679 */
1680
1681/*
1682 * Initialize segments & interrupt table
1683 */
1684
1685int _default_ldt;
1686
1687#ifdef XEN
1688union descriptor *gdt;
1689union descriptor *ldt;
1690#else
1691union descriptor gdt[NGDT * MAXCPU]; /* global descriptor table */
1692union descriptor ldt[NLDT]; /* local descriptor table */
1693#endif
1694static struct gate_descriptor idt0[NIDT];
1695struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */
1696struct region_descriptor r_gdt, r_idt; /* table descriptors */
1697struct mtx dt_lock; /* lock for GDT and LDT */
1698
1699#if defined(I586_CPU) && !defined(NO_F00F_HACK)
1700extern int has_f00f_bug;
1701#endif
1702
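/*
 * The double fault handler runs via a task gate on this private TSS and
 * stack, since the stack that was live at fault time cannot be trusted.
 */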
1703static struct i386tss dblfault_tss;
1704static char dblfault_stack[PAGE_SIZE];
1705
1706extern vm_offset_t proc0kstack;
1707
1708
1709/*
1710 * software prototypes -- in more palatable form.
1711 *
1712 * GCODE_SEL through GUDATA_SEL must be in this order for syscall/sysret
1713 * GUFS_SEL and GUGS_SEL must be in this order (swtch.s knows it)
1714 */
1715struct soft_segment_descriptor gdt_segs[] = {
1716/* GNULL_SEL 0 Null Descriptor */
1717{ .ssd_base = 0x0,
1718 .ssd_limit = 0x0,
1719 .ssd_type = 0,
1720 .ssd_dpl = SEL_KPL,
1721 .ssd_p = 0,
1722 .ssd_xx = 0, .ssd_xx1 = 0,
1723 .ssd_def32 = 0,
1724 .ssd_gran = 0 },
1725/* GPRIV_SEL 1 SMP Per-Processor Private Data Descriptor */
1726{ .ssd_base = 0x0,
1727 .ssd_limit = 0xfffff,
1728 .ssd_type = SDT_MEMRWA,
1729 .ssd_dpl = SEL_KPL,
1730 .ssd_p = 1,
1731 .ssd_xx = 0, .ssd_xx1 = 0,
1732 .ssd_def32 = 1,
1733 .ssd_gran = 1 },
1734/* GUFS_SEL 2 %fs Descriptor for user */
1735{ .ssd_base = 0x0,
1736 .ssd_limit = 0xfffff,
1737 .ssd_type = SDT_MEMRWA,
1738 .ssd_dpl = SEL_UPL,
1739 .ssd_p = 1,
1740 .ssd_xx = 0, .ssd_xx1 = 0,
1741 .ssd_def32 = 1,
1742 .ssd_gran = 1 },
1743/* GUGS_SEL 3 %gs Descriptor for user */
1744{ .ssd_base = 0x0,
1745 .ssd_limit = 0xfffff,
1746 .ssd_type = SDT_MEMRWA,
1747 .ssd_dpl = SEL_UPL,
1748 .ssd_p = 1,
1749 .ssd_xx = 0, .ssd_xx1 = 0,
1750 .ssd_def32 = 1,
1751 .ssd_gran = 1 },
1752/* GCODE_SEL 4 Code Descriptor for kernel */
1753{ .ssd_base = 0x0,
1754 .ssd_limit = 0xfffff,
1755 .ssd_type = SDT_MEMERA,
1756 .ssd_dpl = SEL_KPL,
1757 .ssd_p = 1,
1758 .ssd_xx = 0, .ssd_xx1 = 0,
1759 .ssd_def32 = 1,
1760 .ssd_gran = 1 },
1761/* GDATA_SEL 5 Data Descriptor for kernel */
1762{ .ssd_base = 0x0,
1763 .ssd_limit = 0xfffff,
1764 .ssd_type = SDT_MEMRWA,
1765 .ssd_dpl = SEL_KPL,
1766 .ssd_p = 1,
1767 .ssd_xx = 0, .ssd_xx1 = 0,
1768 .ssd_def32 = 1,
1769 .ssd_gran = 1 },
1770/* GUCODE_SEL 6 Code Descriptor for user */
1771{ .ssd_base = 0x0,
1772 .ssd_limit = 0xfffff,
1773 .ssd_type = SDT_MEMERA,
1774 .ssd_dpl = SEL_UPL,
1775 .ssd_p = 1,
1776 .ssd_xx = 0, .ssd_xx1 = 0,
1777 .ssd_def32 = 1,
1778 .ssd_gran = 1 },
1779/* GUDATA_SEL 7 Data Descriptor for user */
1780{ .ssd_base = 0x0,
1781 .ssd_limit = 0xfffff,
1782 .ssd_type = SDT_MEMRWA,
1783 .ssd_dpl = SEL_UPL,
1784 .ssd_p = 1,
1785 .ssd_xx = 0, .ssd_xx1 = 0,
1786 .ssd_def32 = 1,
1787 .ssd_gran = 1 },
1788/* GBIOSLOWMEM_SEL 8 BIOS access to realmode segment 0x40, must be #8 in GDT */
1789{ .ssd_base = 0x400,
1790 .ssd_limit = 0xfffff,
1791 .ssd_type = SDT_MEMRWA,
1792 .ssd_dpl = SEL_KPL,
1793 .ssd_p = 1,
1794 .ssd_xx = 0, .ssd_xx1 = 0,
1795 .ssd_def32 = 1,
1796 .ssd_gran = 1 },
1797#ifndef XEN
1798/* GPROC0_SEL 9 Proc 0 Tss Descriptor */
1799{
1800 .ssd_base = 0x0,
1801 .ssd_limit = sizeof(struct i386tss)-1,
1802 .ssd_type = SDT_SYS386TSS,
1803 .ssd_dpl = 0,
1804 .ssd_p = 1,
1805 .ssd_xx = 0, .ssd_xx1 = 0,
1806 .ssd_def32 = 0,
1807 .ssd_gran = 0 },
1808/* GLDT_SEL 10 LDT Descriptor */
1809{ .ssd_base = (int) ldt,
1810 .ssd_limit = sizeof(ldt)-1,
1811 .ssd_type = SDT_SYSLDT,
1812 .ssd_dpl = SEL_UPL,
1813 .ssd_p = 1,
1814 .ssd_xx = 0, .ssd_xx1 = 0,
1815 .ssd_def32 = 0,
1816 .ssd_gran = 0 },
1817/* GUSERLDT_SEL 11 User LDT Descriptor per process */
1818{ .ssd_base = (int) ldt,
1819 .ssd_limit = (512 * sizeof(union descriptor)-1),
1820 .ssd_type = SDT_SYSLDT,
1821 .ssd_dpl = 0,
1822 .ssd_p = 1,
1823 .ssd_xx = 0, .ssd_xx1 = 0,
1824 .ssd_def32 = 0,
1825 .ssd_gran = 0 },
1826/* GPANIC_SEL 12 Panic Tss Descriptor */
1827{ .ssd_base = (int) &dblfault_tss,
1828 .ssd_limit = sizeof(struct i386tss)-1,
1829 .ssd_type = SDT_SYS386TSS,
1830 .ssd_dpl = 0,
1831 .ssd_p = 1,
1832 .ssd_xx = 0, .ssd_xx1 = 0,
1833 .ssd_def32 = 0,
1834 .ssd_gran = 0 },
1835/* GBIOSCODE32_SEL 13 BIOS 32-bit interface (32bit Code) */
1836{ .ssd_base = 0,
1837 .ssd_limit = 0xfffff,
1838 .ssd_type = SDT_MEMERA,
1839 .ssd_dpl = 0,
1840 .ssd_p = 1,
1841 .ssd_xx = 0, .ssd_xx1 = 0,
1842 .ssd_def32 = 0,
1843 .ssd_gran = 1 },
1844/* GBIOSCODE16_SEL 14 BIOS 32-bit interface (16bit Code) */
1845{ .ssd_base = 0,
1846 .ssd_limit = 0xfffff,
1847 .ssd_type = SDT_MEMERA,
1848 .ssd_dpl = 0,
1849 .ssd_p = 1,
1850 .ssd_xx = 0, .ssd_xx1 = 0,
1851 .ssd_def32 = 0,
1852 .ssd_gran = 1 },
1853/* GBIOSDATA_SEL 15 BIOS 32-bit interface (Data) */
1854{ .ssd_base = 0,
1855 .ssd_limit = 0xfffff,
1856 .ssd_type = SDT_MEMRWA,
1857 .ssd_dpl = 0,
1858 .ssd_p = 1,
1859 .ssd_xx = 0, .ssd_xx1 = 0,
1860 .ssd_def32 = 1,
1861 .ssd_gran = 1 },
1862/* GBIOSUTIL_SEL 16 BIOS 16-bit interface (Utility) */
1863{ .ssd_base = 0,
1864 .ssd_limit = 0xfffff,
1865 .ssd_type = SDT_MEMRWA,
1866 .ssd_dpl = 0,
1867 .ssd_p = 1,
1868 .ssd_xx = 0, .ssd_xx1 = 0,
1869 .ssd_def32 = 0,
1870 .ssd_gran = 1 },
1871/* GBIOSARGS_SEL 17 BIOS 16-bit interface (Arguments) */
1872{ .ssd_base = 0,
1873 .ssd_limit = 0xfffff,
1874 .ssd_type = SDT_MEMRWA,
1875 .ssd_dpl = 0,
1876 .ssd_p = 1,
1877 .ssd_xx = 0, .ssd_xx1 = 0,
1878 .ssd_def32 = 0,
1879 .ssd_gran = 1 },
1880/* GNDIS_SEL 18 NDIS Descriptor */
1881{ .ssd_base = 0x0,
1882 .ssd_limit = 0x0,
1883 .ssd_type = 0,
1884 .ssd_dpl = 0,
1885 .ssd_p = 0,
1886 .ssd_xx = 0, .ssd_xx1 = 0,
1887 .ssd_def32 = 0,
1888 .ssd_gran = 0 },
1889#endif /* !XEN */
1890};
1891
1892static struct soft_segment_descriptor ldt_segs[] = {
1893 /* Null Descriptor - overwritten by call gate */
1894{ .ssd_base = 0x0,
1895 .ssd_limit = 0x0,
1896 .ssd_type = 0,
1897 .ssd_dpl = 0,
1898 .ssd_p = 0,
1899 .ssd_xx = 0, .ssd_xx1 = 0,
1900 .ssd_def32 = 0,
1901 .ssd_gran = 0 },
1902 /* Null Descriptor - overwritten by call gate */
1903{ .ssd_base = 0x0,
1904 .ssd_limit = 0x0,
1905 .ssd_type = 0,
1906 .ssd_dpl = 0,
1907 .ssd_p = 0,
1908 .ssd_xx = 0, .ssd_xx1 = 0,
1909 .ssd_def32 = 0,
1910 .ssd_gran = 0 },
1911 /* Null Descriptor - overwritten by call gate */
1912{ .ssd_base = 0x0,
1913 .ssd_limit = 0x0,
1914 .ssd_type = 0,
1915 .ssd_dpl = 0,
1916 .ssd_p = 0,
1917 .ssd_xx = 0, .ssd_xx1 = 0,
1918 .ssd_def32 = 0,
1919 .ssd_gran = 0 },
1920 /* Code Descriptor for user */
1921{ .ssd_base = 0x0,
1922 .ssd_limit = 0xfffff,
1923 .ssd_type = SDT_MEMERA,
1924 .ssd_dpl = SEL_UPL,
1925 .ssd_p = 1,
1926 .ssd_xx = 0, .ssd_xx1 = 0,
1927 .ssd_def32 = 1,
1928 .ssd_gran = 1 },
1929 /* Null Descriptor - overwritten by call gate */
1930{ .ssd_base = 0x0,
1931 .ssd_limit = 0x0,
1932 .ssd_type = 0,
1933 .ssd_dpl = 0,
1934 .ssd_p = 0,
1935 .ssd_xx = 0, .ssd_xx1 = 0,
1936 .ssd_def32 = 0,
1937 .ssd_gran = 0 },
1938 /* Data Descriptor for user */
1939{ .ssd_base = 0x0,
1940 .ssd_limit = 0xfffff,
1941 .ssd_type = SDT_MEMRWA,
1942 .ssd_dpl = SEL_UPL,
1943 .ssd_p = 1,
1944 .ssd_xx = 0, .ssd_xx1 = 0,
1945 .ssd_def32 = 1,
1946 .ssd_gran = 1 },
1947};
1948
1949void
1950setidt(idx, func, typ, dpl, selec)
1951 int idx;
1952 inthand_t *func;
1953 int typ;
1954 int dpl;
1955 int selec;
1956{
1957 struct gate_descriptor *ip;
1958
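	/*
	 * A 386 gate descriptor splits the handler offset across two
	 * 16-bit fields, hence the separate gd_looffset (bits 0-15) and
	 * gd_hioffset (bits 16-31) stores below.
	 */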
1959 ip = idt + idx;
1960 ip->gd_looffset = (int)func;
1961 ip->gd_selector = selec;
1962 ip->gd_stkcpy = 0;
1963 ip->gd_xx = 0;
1964 ip->gd_type = typ;
1965 ip->gd_dpl = dpl;
1966 ip->gd_p = 1;
1967 ip->gd_hioffset = ((int)func)>>16 ;
1968}
1969
1970extern inthand_t
1971 IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
1972 IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
1973 IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
1974 IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
1975 IDTVEC(xmm),
1976#ifdef KDTRACE_HOOKS
1977 IDTVEC(dtrace_ret),
1978#endif
1979 IDTVEC(lcall_syscall), IDTVEC(int0x80_syscall);
1980
1981#ifdef DDB
1982/*
1983 * Display the index and function name of any IDT entries that don't use
1984 * the default 'rsvd' entry point.
1985 */
1986DB_SHOW_COMMAND(idt, db_show_idt)
1987{
1988 struct gate_descriptor *ip;
1989 int idx;
1990 uintptr_t func;
1991
1992 ip = idt;
1993 for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
1994 func = (ip->gd_hioffset << 16 | ip->gd_looffset);
1995 if (func != (uintptr_t)&IDTVEC(rsvd)) {
1996 db_printf("%3d\t", idx);
1997 db_printsym(func, DB_STGY_PROC);
1998 db_printf("\n");
1999 }
2000 ip++;
2001 }
2002}
2003
2004/* Show privileged registers. */
2005DB_SHOW_COMMAND(sysregs, db_show_sysregs)
2006{
2007 uint64_t idtr, gdtr;
2008
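	/*
	 * ridt()/rgdt() return the 48-bit pseudo-descriptor packed into a
	 * 64-bit value, limit in the low 16 bits and the linear base above
	 * it, hence the shift and mask when printing.
	 */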
2009 idtr = ridt();
2010 db_printf("idtr\t0x%08x/%04x\n",
2011 (u_int)(idtr >> 16), (u_int)idtr & 0xffff);
2012 gdtr = rgdt();
2013 db_printf("gdtr\t0x%08x/%04x\n",
2014 (u_int)(gdtr >> 16), (u_int)gdtr & 0xffff);
2015 db_printf("ldtr\t0x%04x\n", rldt());
2016 db_printf("tr\t0x%04x\n", rtr());
2017 db_printf("cr0\t0x%08x\n", rcr0());
2018 db_printf("cr2\t0x%08x\n", rcr2());
2019 db_printf("cr3\t0x%08x\n", rcr3());
2020 db_printf("cr4\t0x%08x\n", rcr4());
2021}
2022#endif
2023
2024void
2025sdtossd(sd, ssd)
2026 struct segment_descriptor *sd;
2027 struct soft_segment_descriptor *ssd;
2028{
2029 ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase;
2030 ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
2031 ssd->ssd_type = sd->sd_type;
2032 ssd->ssd_dpl = sd->sd_dpl;
2033 ssd->ssd_p = sd->sd_p;
2034 ssd->ssd_def32 = sd->sd_def32;
2035 ssd->ssd_gran = sd->sd_gran;
2036}
2037
2038#ifndef XEN
2039static int
2040add_smap_entry(struct bios_smap *smap, vm_paddr_t *physmap, int *physmap_idxp)
2041{
2042 int i, insert_idx, physmap_idx;
2043
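	/*
	 * physmap[] holds [base, end) pairs, so physmap_idx names the base
	 * slot of the last pair and all of the index math below steps by
	 * two.
	 */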
2044 physmap_idx = *physmap_idxp;
2045
2046 if (boothowto & RB_VERBOSE)
2047 printf("SMAP type=%02x base=%016llx len=%016llx\n",
2048 smap->type, smap->base, smap->length);
2049
2050 if (smap->type != SMAP_TYPE_MEMORY)
2051 return (1);
2052
2053 if (smap->length == 0)
2054 return (1);
2055
2056#ifndef PAE
2057 if (smap->base > 0xffffffff) {
2058 printf("%uK of memory above 4GB ignored\n",
2059 (u_int)(smap->length / 1024));
2060 return (1);
2061 }
2062#endif
2063
2064 /*
2065 * Find insertion point while checking for overlap. Start off by
2066 * assuming the new entry will be added to the end.
2067 */
2068 insert_idx = physmap_idx + 2;
2069 for (i = 0; i <= physmap_idx; i += 2) {
2070 if (smap->base < physmap[i + 1]) {
2071 if (smap->base + smap->length <= physmap[i]) {
2072 insert_idx = i;
2073 break;
2074 }
2075 if (boothowto & RB_VERBOSE)
2076 printf(
2077 "Overlapping memory regions, ignoring second region\n");
2078 return (1);
2079 }
2080 }
2081
2082 /* See if we can prepend to the next entry. */
2083 if (insert_idx <= physmap_idx &&
2084 smap->base + smap->length == physmap[insert_idx]) {
2085 physmap[insert_idx] = smap->base;
2086 return (1);
2087 }
2088
2089 /* See if we can append to the previous entry. */
2090 if (insert_idx > 0 && smap->base == physmap[insert_idx - 1]) {
2091 physmap[insert_idx - 1] += smap->length;
2092 return (1);
2093 }
2094
2095 physmap_idx += 2;
2096 *physmap_idxp = physmap_idx;
2097 if (physmap_idx == PHYSMAP_SIZE) {
2098 printf(
2099 "Too many segments in the physical address map, giving up\n");
2100 return (0);
2101 }
2102
2103 /*
2104 * Move the last 'N' entries down to make room for the new
2105 * entry if needed.
2106 */
2107 for (i = physmap_idx; i > insert_idx; i -= 2) {
2108 physmap[i] = physmap[i - 2];
2109 physmap[i + 1] = physmap[i - 1];
2110 }
2111
2112 /* Insert the new entry. */
2113 physmap[insert_idx] = smap->base;
2114 physmap[insert_idx + 1] = smap->base + smap->length;
2115 return (1);
2116}
2117
2118static void
2119basemem_setup(void)
2120{
2121 vm_paddr_t pa;
2122 pt_entry_t *pte;
2123 int i;
2124
2125 if (basemem > 640) {
2126 printf("Preposterous BIOS basemem of %uK, truncating to 640K\n",
2127 basemem);
2128 basemem = 640;
2129 }
2130
2131 /*
2132 * XXX if biosbasemem is now < 640, there is a `hole'
2133 * between the end of base memory and the start of
2134 * ISA memory. The hole may be empty or it may
2135 * contain BIOS code or data. Map it read/write so
2136 * that the BIOS can write to it. (Memory from 0 to
2137 * the physical end of the kernel is mapped read-only
2138 * to begin with and then parts of it are remapped.
2139 * The parts that aren't remapped form holes that
2140 * remain read-only and are unused by the kernel.
2141 * The base memory area is below the physical end of
2142 * the kernel and right now forms a read-only hole.
2143 * The part of it from PAGE_SIZE to
2144 * (trunc_page(biosbasemem * 1024) - 1) will be
2145 * remapped and used by the kernel later.)
2146 *
2147 * This code is similar to the code used in
2148 * pmap_mapdev, but since no memory needs to be
2149 * allocated we simply change the mapping.
2150 */
2151 for (pa = trunc_page(basemem * 1024);
2152 pa < ISA_HOLE_START; pa += PAGE_SIZE)
2153 pmap_kenter(KERNBASE + pa, pa);
2154
2155 /*
2156 * Map pages between basemem and ISA_HOLE_START, if any, r/w into
2157 * the vm86 page table so that vm86 can scribble on them using
2158 * the vm86 map too. XXX: why 2 ways for this and only 1 way for
2159 * page 0, at least as initialized here?
2160 */
2161 pte = (pt_entry_t *)vm86paddr;
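	/* basemem is in KB, i counts 4 KB pages, and 160 pages == 640 KB. */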
2162 for (i = basemem / 4; i < 160; i++)
2163 pte[i] = (i << PAGE_SHIFT) | PG_V | PG_RW | PG_U;
2164}
2165#endif
2166
2167/*
2168 * Populate the (physmap) array with base/bound pairs describing the
2169 * available physical memory in the system, then test this memory and
2170 * build the phys_avail array describing the actually-available memory.
2171 *
2172 * If we cannot accurately determine the physical memory map, then use
2173 * value from the 0xE801 call, and failing that, the RTC.
2174 *
2175 * Total memory size may be set by the kernel environment variable
2176 * hw.physmem or the compile-time define MAXMEM.
2177 *
2178 * XXX first should be vm_paddr_t.
2179 */
2180static void
2181getmemsize(int first)
2182{
2183 int has_smap, off, physmap_idx, pa_indx, da_indx;
2184 u_long physmem_tunable, memtest;
2185 vm_paddr_t physmap[PHYSMAP_SIZE];
2186 pt_entry_t *pte;
2187 quad_t dcons_addr, dcons_size;
2188#ifndef XEN
2189 int hasbrokenint12, i, res;
2190 u_int extmem;
2191 struct vm86frame vmf;
2192 struct vm86context vmc;
2193 vm_paddr_t pa;
2194 struct bios_smap *smap, *smapbase, *smapend;
2195 u_int32_t smapsize;
2196 caddr_t kmdp;
2197#endif
2198
2199 has_smap = 0;
2200#if defined(XEN)
2201 Maxmem = xen_start_info->nr_pages - init_first;
2202 physmem = Maxmem;
2203 basemem = 0;
2204 physmap[0] = init_first << PAGE_SHIFT;
2205 physmap[1] = ptoa(Maxmem) - round_page(msgbufsize);
2206 physmap_idx = 0;
2207#else
2208#ifdef XBOX
2209 if (arch_i386_is_xbox) {
2210 /*
2211 * We queried the memory size before, so chop off 4MB for
2212 * the framebuffer and inform the OS of this.
2213 */
2214 physmap[0] = 0;
2215 physmap[1] = (arch_i386_xbox_memsize * 1024 * 1024) - XBOX_FB_SIZE;
2216 physmap_idx = 0;
2217 goto physmap_done;
2218 }
2219#endif
2220 bzero(&vmf, sizeof(vmf));
2221 bzero(physmap, sizeof(physmap));
2222 basemem = 0;
2223
2224 /*
2225 * Check if the loader supplied an SMAP memory map. If so,
2226 * use that and do not make any VM86 calls.
2227 */
2228 physmap_idx = 0;
2229 smapbase = NULL;
2230 kmdp = preload_search_by_type("elf kernel");
2231 if (kmdp == NULL)
2232 kmdp = preload_search_by_type("elf32 kernel");
2233 if (kmdp != NULL)
2234 smapbase = (struct bios_smap *)preload_search_info(kmdp,
2235 MODINFO_METADATA | MODINFOMD_SMAP);
2236 if (smapbase != NULL) {
2237 /*
2238 * subr_module.c says:
2239 * "Consumer may safely assume that size value precedes data."
2240 * ie: an int32_t immediately precedes SMAP.
2241 */
2242 smapsize = *((u_int32_t *)smapbase - 1);
2243 smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
2244 has_smap = 1;
2245
2246 for (smap = smapbase; smap < smapend; smap++)
2247 if (!add_smap_entry(smap, physmap, &physmap_idx))
2248 break;
2249 goto have_smap;
2250 }
2251
2252 /*
2253 * Some newer BIOSes have a broken INT 12H implementation
2254 * which causes a kernel panic immediately. In this case, we
 2255	 * need to use the SMAP to determine the base memory size.
2256 */
2257 hasbrokenint12 = 0;
2258 TUNABLE_INT_FETCH("hw.hasbrokenint12", &hasbrokenint12);
2259 if (hasbrokenint12 == 0) {
2260 /* Use INT12 to determine base memory size. */
2261 vm86_intcall(0x12, &vmf);
2262 basemem = vmf.vmf_ax;
2263 basemem_setup();
2264 }
2265
2266 /*
2267 * Fetch the memory map with INT 15:E820. Map page 1 R/W into
2268 * the kernel page table so we can use it as a buffer. The
2269 * kernel will unmap this page later.
2270 */
2271 pmap_kenter(KERNBASE + (1 << PAGE_SHIFT), 1 << PAGE_SHIFT);
2272 vmc.npages = 0;
2273 smap = (void *)vm86_addpage(&vmc, 1, KERNBASE + (1 << PAGE_SHIFT));
2274 res = vm86_getptr(&vmc, (vm_offset_t)smap, &vmf.vmf_es, &vmf.vmf_di);
2275 KASSERT(res != 0, ("vm86_getptr() failed: address not found"));
2276
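	/*
	 * INT 15h/E820h convention: %eax = 0xE820, %edx = 'SMAP' signature,
	 * %ecx = buffer size, %ebx = continuation cookie (0 on the first
	 * call).  The BIOS returns the next cookie in %ebx; the map is
	 * complete once it comes back as zero.
	 */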
2277 vmf.vmf_ebx = 0;
2278 do {
2279 vmf.vmf_eax = 0xE820;
2280 vmf.vmf_edx = SMAP_SIG;
2281 vmf.vmf_ecx = sizeof(struct bios_smap);
2282 i = vm86_datacall(0x15, &vmf, &vmc);
2283 if (i || vmf.vmf_eax != SMAP_SIG)
2284 break;
2285 has_smap = 1;
2286 if (!add_smap_entry(smap, physmap, &physmap_idx))
2287 break;
2288 } while (vmf.vmf_ebx != 0);
2289
2290have_smap:
2291 /*
2292 * If we didn't fetch the "base memory" size from INT12,
2293 * figure it out from the SMAP (or just guess).
2294 */
2295 if (basemem == 0) {
2296 for (i = 0; i <= physmap_idx; i += 2) {
2297 if (physmap[i] == 0x00000000) {
2298 basemem = physmap[i + 1] / 1024;
2299 break;
2300 }
2301 }
2302
2303 /* XXX: If we couldn't find basemem from SMAP, just guess. */
2304 if (basemem == 0)
2305 basemem = 640;
2306 basemem_setup();
2307 }
2308
2309 if (physmap[1] != 0)
2310 goto physmap_done;
2311
2312 /*
2313 * If we failed to find an SMAP, figure out the extended
2314 * memory size. We will then build a simple memory map with
2315 * two segments, one for "base memory" and the second for
2316 * "extended memory". Note that "extended memory" starts at a
2317 * physical address of 1MB and that both basemem and extmem
2318 * are in units of 1KB.
2319 *
2320 * First, try to fetch the extended memory size via INT 15:E801.
2321 */
2322 vmf.vmf_ax = 0xE801;
2323 if (vm86_intcall(0x15, &vmf) == 0) {
2324 extmem = vmf.vmf_cx + vmf.vmf_dx * 64;
2325 } else {
2326 /*
2327 * If INT15:E801 fails, this is our last ditch effort
2328 * to determine the extended memory size. Currently
2329 * we prefer the RTC value over INT15:88.
2330 */
2331#if 0
2332 vmf.vmf_ah = 0x88;
2333 vm86_intcall(0x15, &vmf);
2334 extmem = vmf.vmf_ax;
2335#else
2336 extmem = rtcin(RTC_EXTLO) + (rtcin(RTC_EXTHI) << 8);
2337#endif
2338 }
2339
2340 /*
2341 * Special hack for chipsets that still remap the 384k hole when
2342 * there's 16MB of memory - this really confuses people that
2343 * are trying to use bus mastering ISA controllers with the
2344 * "16MB limit"; they only have 16MB, but the remapping puts
2345 * them beyond the limit.
2346 *
2347 * If extended memory is between 15-16MB (16-17MB phys address range),
2348 * chop it to 15MB.
2349 */
2350 if ((extmem > 15 * 1024) && (extmem < 16 * 1024))
2351 extmem = 15 * 1024;
2352
2353 physmap[0] = 0;
2354 physmap[1] = basemem * 1024;
2355 physmap_idx = 2;
2356 physmap[physmap_idx] = 0x100000;
2357 physmap[physmap_idx + 1] = physmap[physmap_idx] + extmem * 1024;
2358
2359physmap_done:
2360#endif
2361 /*
2362 * Now, physmap contains a map of physical memory.
2363 */
2364
2365#ifdef SMP
2366 /* make hole for AP bootstrap code */
2367 physmap[1] = mp_bootaddress(physmap[1]);
2368#endif
2369
2370 /*
2371 * Maxmem isn't the "maximum memory", it's one larger than the
2372 * highest page of the physical address space. It should be
2373 * called something like "Maxphyspage". We may adjust this
2374 * based on ``hw.physmem'' and the results of the memory test.
2375 */
2376 Maxmem = atop(physmap[physmap_idx + 1]);
2377
2378#ifdef MAXMEM
2379 Maxmem = MAXMEM / 4;
2380#endif
2381
2382 if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
2383 Maxmem = atop(physmem_tunable);
2384
 2385	 * Detect the presence of the C1E capability, found mostly on the
 2386	 * later dual-core (and newer) K8 family processors.
2387 * the amount of memory in the system.
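	/*
	 * cpu_id bits 11-8 hold the base family (0xf for K8) and bits
	 * 27-16 the extended family/model, so the second test restricts
	 * the match to later K8 revisions, the parts that implement C1E.
	 */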
2388 */
2389 if (has_smap && Maxmem > atop(physmap[physmap_idx + 1]))
2390 Maxmem = atop(physmap[physmap_idx + 1]);
2391
2392 /*
2393 * By default enable the memory test on real hardware, and disable
2394 * it if we appear to be running in a VM. This avoids touching all
2395 * pages unnecessarily, which doesn't matter on real hardware but is
2396 * bad for shared VM hosts. Use a general name so that
2397 * one could eventually do more with the code than just disable it.
2398 */
2399 memtest = (vm_guest > VM_GUEST_NO) ? 0 : 1;
2400 TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);
2401
2402 if (atop(physmap[physmap_idx + 1]) != Maxmem &&
2403 (boothowto & RB_VERBOSE))
2404 printf("Physical memory use set to %ldK\n", Maxmem * 4);
2405
2406 /*
2407 * If Maxmem has been increased beyond what the system has detected,
2408 * extend the last memory segment to the new limit.
2409 */
2410 if (atop(physmap[physmap_idx + 1]) < Maxmem)
2411 physmap[physmap_idx + 1] = ptoa((vm_paddr_t)Maxmem);
2412
2413 /* call pmap initialization to make new kernel address space */
2414 pmap_bootstrap(first);
2415
2416 /*
2417 * Size up each available chunk of physical memory.
2418 */
2419 physmap[0] = PAGE_SIZE; /* mask off page 0 */
2420 pa_indx = 0;
2421 da_indx = 1;
2422 phys_avail[pa_indx++] = physmap[0];
2423 phys_avail[pa_indx] = physmap[0];
2424 dump_avail[da_indx] = physmap[0];
2425 pte = CMAP1;
2426
2427 /*
2428 * Get dcons buffer address
2429 */
2430 if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
2431 getenv_quad("dcons.size", &dcons_size) == 0)
2432 dcons_addr = 0;
2433
2434#ifndef XEN
2435 /*
2436 * physmap is in bytes, so when converting to page boundaries,
2437 * round up the start address and round down the end address.
2438 */
2439 for (i = 0; i <= physmap_idx; i += 2) {
2440 vm_paddr_t end;
2441
2442 end = ptoa((vm_paddr_t)Maxmem);
2443 if (physmap[i + 1] < end)
2444 end = trunc_page(physmap[i + 1]);
2445 for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
2446 int tmp, page_bad, full;
2447 int *ptr = (int *)CADDR1;
2448
2449 full = FALSE;
2450 /*
2451 * block out kernel memory as not available.
2452 */
2453 if (pa >= KERNLOAD && pa < first)
2454 goto do_dump_avail;
2455
2456 /*
2457 * block out dcons buffer
2458 */
2459 if (dcons_addr > 0
2460 && pa >= trunc_page(dcons_addr)
2461 && pa < dcons_addr + dcons_size)
2462 goto do_dump_avail;
2463
2464 page_bad = FALSE;
2465 if (memtest == 0)
2466 goto skip_memtest;
2467
2468 /*
2469 * map page into kernel: valid, read/write,non-cacheable
2470 */
2471 *pte = pa | PG_V | PG_RW | PG_N;
2472 invltlb();
2473
2474 tmp = *(int *)ptr;
2475 /*
2476 * Test for alternating 1's and 0's
2477 */
2478 *(volatile int *)ptr = 0xaaaaaaaa;
2479 if (*(volatile int *)ptr != 0xaaaaaaaa)
2480 page_bad = TRUE;
2481 /*
2482 * Test for alternating 0's and 1's
2483 */
2484 *(volatile int *)ptr = 0x55555555;
2485 if (*(volatile int *)ptr != 0x55555555)
2486 page_bad = TRUE;
2487 /*
2488 * Test for all 1's
2489 */
2490 *(volatile int *)ptr = 0xffffffff;
2491 if (*(volatile int *)ptr != 0xffffffff)
2492 page_bad = TRUE;
2493 /*
2494 * Test for all 0's
2495 */
2496 *(volatile int *)ptr = 0x0;
2497 if (*(volatile int *)ptr != 0x0)
2498 page_bad = TRUE;
2499 /*
2500 * Restore original value.
2501 */
2502 *(int *)ptr = tmp;
2503
2504skip_memtest:
2505 /*
2506 * Adjust array of valid/good pages.
2507 */
2508 if (page_bad == TRUE)
2509 continue;
2510 /*
2511 * If this good page is a continuation of the
2512 * previous set of good pages, then just increase
2513 * the end pointer. Otherwise start a new chunk.
 2514	 * Note that the recorded "end" points one page past
 2515	 * the last good page, making the range >= start and < end.
 2516	 * If we're also doing a speculative memory
 2517	 * test and we are at or past the end, bump up Maxmem
 2518	 * so that we keep going.  The first bad page
 2519	 * will terminate the loop.
2520 */
2521 if (phys_avail[pa_indx] == pa) {
2522 phys_avail[pa_indx] += PAGE_SIZE;
2523 } else {
2524 pa_indx++;
2525 if (pa_indx == PHYS_AVAIL_ARRAY_END) {
2526 printf(
2527 "Too many holes in the physical address space, giving up\n");
2528 pa_indx--;
2529 full = TRUE;
2530 goto do_dump_avail;
2531 }
2532 phys_avail[pa_indx++] = pa; /* start */
2533 phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
2534 }
2535 physmem++;
2536do_dump_avail:
2537 if (dump_avail[da_indx] == pa) {
2538 dump_avail[da_indx] += PAGE_SIZE;
2539 } else {
2540 da_indx++;
2541 if (da_indx == DUMP_AVAIL_ARRAY_END) {
2542 da_indx--;
2543 goto do_next;
2544 }
2545 dump_avail[da_indx++] = pa; /* start */
2546 dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
2547 }
2548do_next:
2549 if (full)
2550 break;
2551 }
2552 }
2553 *pte = 0;
2554 invltlb();
2555#else
2556 phys_avail[0] = physfree;
2557 phys_avail[1] = xen_start_info->nr_pages*PAGE_SIZE;
2558 dump_avail[0] = 0;
2559 dump_avail[1] = xen_start_info->nr_pages*PAGE_SIZE;
2560
2561#endif
2562
2563 /*
2564 * XXX
2565 * The last chunk must contain at least one page plus the message
2566 * buffer to avoid complicating other code (message buffer address
2567 * calculation, etc.).
2568 */
2569 while (phys_avail[pa_indx - 1] + PAGE_SIZE +
2570 round_page(msgbufsize) >= phys_avail[pa_indx]) {
2571 physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
2572 phys_avail[pa_indx--] = 0;
2573 phys_avail[pa_indx--] = 0;
2574 }
2575
2576 Maxmem = atop(phys_avail[pa_indx]);
2577
2578 /* Trim off space for the message buffer. */
2579 phys_avail[pa_indx] -= round_page(msgbufsize);
2580
2581 /* Map the message buffer. */
2582 for (off = 0; off < round_page(msgbufsize); off += PAGE_SIZE)
2583 pmap_kenter((vm_offset_t)msgbufp + off, phys_avail[pa_indx] +
2584 off);
2585
2586 PT_UPDATES_FLUSH();
2587}
2588
2589#ifdef XEN
2590#define MTOPSIZE (1<<(14 + PAGE_SHIFT))
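/* 2^14 pages; with 4 KB pages this is the 64 MB window described below. */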
2591
2592void
2593init386(first)
2594 int first;
2595{
2596 unsigned long gdtmachpfn;
2597 int error, gsel_tss, metadata_missing, x, pa;
2598 size_t kstack0_sz;
2599 struct pcpu *pc;
2600 struct callback_register event = {
2601 .type = CALLBACKTYPE_event,
2602 .address = {GSEL(GCODE_SEL, SEL_KPL), (unsigned long)Xhypervisor_callback },
2603 };
2604 struct callback_register failsafe = {
2605 .type = CALLBACKTYPE_failsafe,
2606 .address = {GSEL(GCODE_SEL, SEL_KPL), (unsigned long)failsafe_callback },
2607 };
2608
2609 thread0.td_kstack = proc0kstack;
2610 thread0.td_kstack_pages = KSTACK_PAGES;
2611 kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
2612 thread0.td_pcb = (struct pcb *)(thread0.td_kstack + kstack0_sz) - 1;
2613
2614 /*
2615 * This may be done better later if it gets more high level
2616 * components in it. If so just link td->td_proc here.
2617 */
2618 proc_linkup0(&proc0, &thread0);
2619
2620 metadata_missing = 0;
2621 if (xen_start_info->mod_start) {
2622 preload_metadata = (caddr_t)xen_start_info->mod_start;
2623 preload_bootstrap_relocate(KERNBASE);
2624 } else {
2625 metadata_missing = 1;
2626 }
2627 if (envmode == 1)
2628 kern_envp = static_env;
2629 else if ((caddr_t)xen_start_info->cmd_line)
2630 kern_envp = xen_setbootenv((caddr_t)xen_start_info->cmd_line);
2631
2632 boothowto |= xen_boothowto(kern_envp);
2633
2634 /* Init basic tunables, hz etc */
2635 init_param1();
2636
2637 /*
2638 * XEN occupies a portion of the upper virtual address space.
2639 * At its base it manages an array mapping machine page frames
2640 * to physical page frames - hence we need to be able to
2641 * access 4GB - (64MB - 4MB + 64k).
2642 */
2643 gdt_segs[GPRIV_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
2644 gdt_segs[GUFS_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
2645 gdt_segs[GUGS_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
2646 gdt_segs[GCODE_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
2647 gdt_segs[GDATA_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
2648 gdt_segs[GUCODE_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
2649 gdt_segs[GUDATA_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
2650 gdt_segs[GBIOSLOWMEM_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
2651
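 /* Point the per-CPU private data and TSS descriptors at CPU 0's pcpu area. */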
2652 pc = &__pcpu[0];
2653 gdt_segs[GPRIV_SEL].ssd_base = (int) pc;
2654 gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss;
2655
2656 PT_SET_MA(gdt, xpmap_ptom(VTOP(gdt)) | PG_V | PG_RW);
2657 bzero(gdt, PAGE_SIZE);
2658 for (x = 0; x < NGDT; x++)
2659 ssdtosd(&gdt_segs[x], &gdt[x].sd);
2660
2661 mtx_init(&dt_lock, "descriptor tables", NULL, MTX_SPIN);
2662
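 /*
 * The GDT frame must be mapped read-only before the hypervisor
 * will accept and install it.
 */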
2663 gdtmachpfn = vtomach(gdt) >> PAGE_SHIFT;
2664 PT_SET_MA(gdt, xpmap_ptom(VTOP(gdt)) | PG_V);
2665 PANIC_IF(HYPERVISOR_set_gdt(&gdtmachpfn, 512) != 0);
2666 lgdt(&r_gdt);
2667 gdtset = 1;
2668
2669 if ((error = HYPERVISOR_set_trap_table(trap_table)) != 0) {
2670 panic("set_trap_table failed - error %d\n", error);
2671 }
2672
2673 error = HYPERVISOR_callback_op(CALLBACKOP_register, &event);
2674 if (error == 0)
2675 error = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe);
2676#if CONFIG_XEN_COMPAT <= 0x030002
2677 if (error == -ENOXENSYS)
2678 HYPERVISOR_set_callbacks(GSEL(GCODE_SEL, SEL_KPL),
2679 (unsigned long)Xhypervisor_callback,
2680 GSEL(GCODE_SEL, SEL_KPL), (unsigned long)failsafe_callback);
2681#endif
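 /* Set up the boot CPU's pcpu structure and map its dynamic per-CPU (dpcpu) area. */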
2682 pcpu_init(pc, 0, sizeof(struct pcpu));
2683 for (pa = first; pa < first + DPCPU_SIZE; pa += PAGE_SIZE)
2684 pmap_kenter(pa + KERNBASE, pa);
2685 dpcpu_init((void *)(first + KERNBASE), 0);
2686 first += DPCPU_SIZE;
2687 physfree += DPCPU_SIZE;
2688 init_first += DPCPU_SIZE / PAGE_SIZE;
2689
2690 PCPU_SET(prvspace, pc);
2691 PCPU_SET(curthread, &thread0);
2692 PCPU_SET(curpcb, thread0.td_pcb);
2693
2694 /*
2695 * Initialize mutexes.
2696 *
2697 * icu_lock: in order to allow an interrupt to occur in a critical
2698 * section, to set pcpu->ipending (etc...) properly, we
2699 * must be able to get the icu lock, so it can't be
2700 * under witness.
2701 */
2702 mutex_init();
2703 mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS | MTX_NOPROFILE);
2704
2705 /* make ldt memory segments */
2706 PT_SET_MA(ldt, xpmap_ptom(VTOP(ldt)) | PG_V | PG_RW);
2707 bzero(ldt, PAGE_SIZE);
2708 ldt_segs[LUCODE_SEL].ssd_limit = atop(0 - 1);
2709 ldt_segs[LUDATA_SEL].ssd_limit = atop(0 - 1);
2710 for (x = 0; x < sizeof ldt_segs / sizeof ldt_segs[0]; x++)
2711 ssdtosd(&ldt_segs[x], &ldt[x].sd);
2712
2713 default_proc_ldt.ldt_base = (caddr_t)ldt;
2714 default_proc_ldt.ldt_len = 6;
2715 _default_ldt = (int)&default_proc_ldt;
2716 PCPU_SET(currentldt, _default_ldt);
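 /* Like the GDT, the LDT page must be read-only before Xen will load it. */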
2717 PT_SET_MA(ldt, *vtopte((unsigned long)ldt) & ~PG_RW);
2718 xen_set_ldt((unsigned long) ldt, (sizeof ldt_segs / sizeof ldt_segs[0]));
2719
2720#if defined(XEN_PRIVILEGED)
2721 /*
2722 * Initialize the i8254 before the console so that console
2723 * initialization can use DELAY().
2724 */
2725 i8254_init();
2726#endif
2727
2728 /*
2729 * Initialize the console before we print anything out.
2730 */
2731 cninit();
2732
2733 if (metadata_missing)
2734 printf("WARNING: loader(8) metadata is missing!\n");
2735
2736#ifdef DEV_ISA
2737#ifdef DEV_ATPIC
2738 elcr_probe();
2739 atpic_startup();
2740#else
2741 /* Reset and mask the atpics and leave them shut down. */
2742 atpic_reset();
2743
2744 /*
2745 * Point the ICU spurious interrupt vectors at the APIC spurious
2746 * interrupt handler.
2747 */
2748 setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL,
2749 GSEL(GCODE_SEL, SEL_KPL));
2750 setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL,
2751 GSEL(GCODE_SEL, SEL_KPL));
2752#endif
2753#endif
2754
2755#ifdef DDB
2756 ksym_start = bootinfo.bi_symtab;
2757 ksym_end = bootinfo.bi_esymtab;
2758#endif
2759
2760 kdb_init();
2761
2762#ifdef KDB
2763 if (boothowto & RB_KDB)
2764 kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
2765#endif
2766
2767 finishidentcpu(); /* Final stage of CPU initialization */
2768 setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL,
2769 GSEL(GCODE_SEL, SEL_KPL));
2770 setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL,
2771 GSEL(GCODE_SEL, SEL_KPL));
2772 initializecpu(); /* Initialize CPU registers */
2773
2774 /* make an initial tss so cpu can get interrupt stack on syscall! */
2775 /* Note: -16 is so we can grow the trapframe if we came from vm86 */
2776 PCPU_SET(common_tss.tss_esp0, thread0.td_kstack +
2777 kstack0_sz - sizeof(struct pcb) - 16);
2778 PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL));
2779 gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
2780 HYPERVISOR_stack_switch(GSEL(GDATA_SEL, SEL_KPL),
2781 PCPU_GET(common_tss.tss_esp0));
2782
2783 /* pointer to selector slot for %fs/%gs */
2784 PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd);
2785
2786 dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 =
2787 dblfault_tss.tss_esp2 = (int)&dblfault_stack[sizeof(dblfault_stack)];
2788 dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 =
2789 dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL);
2790#ifdef PAE
2791 dblfault_tss.tss_cr3 = (int)IdlePDPT;
2792#else
2793 dblfault_tss.tss_cr3 = (int)IdlePTD;
2794#endif
2795 dblfault_tss.tss_eip = (int)dblfault_handler;
2796 dblfault_tss.tss_eflags = PSL_KERNEL;
2797 dblfault_tss.tss_ds = dblfault_tss.tss_es =
2798 dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL);
2799 dblfault_tss.tss_fs = GSEL(GPRIV_SEL, SEL_KPL);
2800 dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL);
2801 dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL);
2802
2803 vm86_initialize();
2804 getmemsize(first);
2805 init_param2(physmem);
2806
2807 /* now running on new page tables, configured, and u/iom is accessible */
2808
2809 msgbufinit(msgbufp, msgbufsize);
2810 /* transfer to user mode */
2811
2812 _ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
2813 _udatasel = GSEL(GUDATA_SEL, SEL_UPL);
2814
2815 /* setup proc 0's pcb */
2816 thread0.td_pcb->pcb_flags = 0;
2817#ifdef PAE
2818 thread0.td_pcb->pcb_cr3 = (int)IdlePDPT;
2819#else
2820 thread0.td_pcb->pcb_cr3 = (int)IdlePTD;
2821#endif
2822 thread0.td_pcb->pcb_ext = 0;
2823 thread0.td_frame = &proc0_tf;
2824 thread0.td_pcb->pcb_fsd = PCPU_GET(fsgs_gdt)[0];
2825 thread0.td_pcb->pcb_gsd = PCPU_GET(fsgs_gdt)[1];
2826
2827 cpu_probe_amdc1e();
2828 cpu_probe_cmpxchg8b();
2829}
2830
2831#else
2832void
2833init386(first)
2834 int first;
2835{
2836 struct gate_descriptor *gdp;
2837 int gsel_tss, metadata_missing, x, pa;
2838 size_t kstack0_sz;
2839 struct pcpu *pc;
2840
2841 thread0.td_kstack = proc0kstack;
2842 thread0.td_kstack_pages = KSTACK_PAGES;
2843 kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
2844 thread0.td_pcb = (struct pcb *)(thread0.td_kstack + kstack0_sz) - 1;
2845
2846 /*
2847 * This may be done better later if it gets more high level
2848 * components in it. If so just link td->td_proc here.
2849 */
2850 proc_linkup0(&proc0, &thread0);
2851
2852 metadata_missing = 0;
2853 if (bootinfo.bi_modulep) {
2854 preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE;
2855 preload_bootstrap_relocate(KERNBASE);
2856 } else {
2857 metadata_missing = 1;
2858 }
2859 if (envmode == 1)
2860 kern_envp = static_env;
2861 else if (bootinfo.bi_envp)
2862 kern_envp = (caddr_t)bootinfo.bi_envp + KERNBASE;
2863
2864 /* Init basic tunables, hz etc */
2865 init_param1();
2866
2867 /*
2868 * Make gdt memory segments. All segments cover the full 4GB
2869 * of address space and permissions are enforced at page level.
2870 */
2871 gdt_segs[GCODE_SEL].ssd_limit = atop(0 - 1);
2872 gdt_segs[GDATA_SEL].ssd_limit = atop(0 - 1);
2873 gdt_segs[GUCODE_SEL].ssd_limit = atop(0 - 1);
2874 gdt_segs[GUDATA_SEL].ssd_limit = atop(0 - 1);
2875 gdt_segs[GUFS_SEL].ssd_limit = atop(0 - 1);
2876 gdt_segs[GUGS_SEL].ssd_limit = atop(0 - 1);
2877
2878 pc = &__pcpu[0];
2879 gdt_segs[GPRIV_SEL].ssd_limit = atop(0 - 1);
2880 gdt_segs[GPRIV_SEL].ssd_base = (int) pc;
2881 gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss;
2882
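 /* Convert the software segment descriptors to hardware format and load the new GDT. */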
2883 for (x = 0; x < NGDT; x++)
2884 ssdtosd(&gdt_segs[x], &gdt[x].sd);
2885
2886 r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
2887 r_gdt.rd_base = (int) gdt;
2888 mtx_init(&dt_lock, "descriptor tables", NULL, MTX_SPIN);
2889 lgdt(&r_gdt);
2890
2891 pcpu_init(pc, 0, sizeof(struct pcpu));
2892 for (pa = first; pa < first + DPCPU_SIZE; pa += PAGE_SIZE)
2893 pmap_kenter(pa + KERNBASE, pa);
2894 dpcpu_init((void *)(first + KERNBASE), 0);
2895 first += DPCPU_SIZE;
2896 PCPU_SET(prvspace, pc);
2897 PCPU_SET(curthread, &thread0);
2898 PCPU_SET(curpcb, thread0.td_pcb);
2899
2900 /*
2901 * Initialize mutexes.
2902 *
2903 * icu_lock: in order to allow an interrupt to occur in a critical
2904 * section, to set pcpu->ipending (etc...) properly, we
2905 * must be able to get the icu lock, so it can't be
2906 * under witness.
2907 */
2908 mutex_init();
2909 mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS | MTX_NOPROFILE);
2910
2911 /* make ldt memory segments */
2912 ldt_segs[LUCODE_SEL].ssd_limit = atop(0 - 1);
2913 ldt_segs[LUDATA_SEL].ssd_limit = atop(0 - 1);
2914 for (x = 0; x < sizeof ldt_segs / sizeof ldt_segs[0]; x++)
2915 ssdtosd(&ldt_segs[x], &ldt[x].sd);
2916
2917 _default_ldt = GSEL(GLDT_SEL, SEL_KPL);
2918 lldt(_default_ldt);
2919 PCPU_SET(currentldt, _default_ldt);
2920
2921 /* exceptions */
2922 for (x = 0; x < NIDT; x++)
2923 setidt(x, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL,
2924 GSEL(GCODE_SEL, SEL_KPL));
2925 setidt(IDT_DE, &IDTVEC(div), SDT_SYS386TGT, SEL_KPL,
2926 GSEL(GCODE_SEL, SEL_KPL));
2927 setidt(IDT_DB, &IDTVEC(dbg), SDT_SYS386IGT, SEL_KPL,
2928 GSEL(GCODE_SEL, SEL_KPL));
2929 setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYS386IGT, SEL_KPL,
2930 GSEL(GCODE_SEL, SEL_KPL));
2931 setidt(IDT_BP, &IDTVEC(bpt), SDT_SYS386IGT, SEL_UPL,
2932 GSEL(GCODE_SEL, SEL_KPL));
2933 setidt(IDT_OF, &IDTVEC(ofl), SDT_SYS386TGT, SEL_UPL,
2934 GSEL(GCODE_SEL, SEL_KPL));
2935 setidt(IDT_BR, &IDTVEC(bnd), SDT_SYS386TGT, SEL_KPL,
2936 GSEL(GCODE_SEL, SEL_KPL));
2937 setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL,
2938 GSEL(GCODE_SEL, SEL_KPL));
2939 setidt(IDT_NM, &IDTVEC(dna), SDT_SYS386TGT, SEL_KPL
2940 , GSEL(GCODE_SEL, SEL_KPL));
2941 setidt(IDT_DF, 0, SDT_SYSTASKGT, SEL_KPL, GSEL(GPANIC_SEL, SEL_KPL));
2942 setidt(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYS386TGT, SEL_KPL,
2943 GSEL(GCODE_SEL, SEL_KPL));
2944 setidt(IDT_TS, &IDTVEC(tss), SDT_SYS386TGT, SEL_KPL,
2945 GSEL(GCODE_SEL, SEL_KPL));
2946 setidt(IDT_NP, &IDTVEC(missing), SDT_SYS386TGT, SEL_KPL,
2947 GSEL(GCODE_SEL, SEL_KPL));
2948 setidt(IDT_SS, &IDTVEC(stk), SDT_SYS386TGT, SEL_KPL,
2949 GSEL(GCODE_SEL, SEL_KPL));
2950 setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL,
2951 GSEL(GCODE_SEL, SEL_KPL));
2952 setidt(IDT_PF, &IDTVEC(page), SDT_SYS386IGT, SEL_KPL,
2953 GSEL(GCODE_SEL, SEL_KPL));
2954 setidt(IDT_MF, &IDTVEC(fpu), SDT_SYS386TGT, SEL_KPL,
2955 GSEL(GCODE_SEL, SEL_KPL));
2956 setidt(IDT_AC, &IDTVEC(align), SDT_SYS386TGT, SEL_KPL,
2957 GSEL(GCODE_SEL, SEL_KPL));
2958 setidt(IDT_MC, &IDTVEC(mchk), SDT_SYS386TGT, SEL_KPL,
2959 GSEL(GCODE_SEL, SEL_KPL));
2960 setidt(IDT_XF, &IDTVEC(xmm), SDT_SYS386TGT, SEL_KPL,
2961 GSEL(GCODE_SEL, SEL_KPL));
2962 setidt(IDT_SYSCALL, &IDTVEC(int0x80_syscall), SDT_SYS386TGT, SEL_UPL,
2963 GSEL(GCODE_SEL, SEL_KPL));
2964#ifdef KDTRACE_HOOKS
2965 setidt(IDT_DTRACE_RET, &IDTVEC(dtrace_ret), SDT_SYS386TGT, SEL_UPL,
2966 GSEL(GCODE_SEL, SEL_KPL));
2967#endif
2968
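 /* Load the IDT register so the vectors installed above take effect. */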
2969 r_idt.rd_limit = sizeof(idt0) - 1;
2970 r_idt.rd_base = (int) idt;
2971 lidt(&r_idt);
2972
2973#ifdef XBOX
2974 /*
2975 * The following code queries the PCI ID of 0:0:0. For the XBOX,
2976 * this should be 0x10de / 0x02a5.
2977 *
2978 * This is exactly what Linux does.
2979 */
2980 outl(0xcf8, 0x80000000);
2981 if (inl(0xcfc) == 0x02a510de) {
2982 arch_i386_is_xbox = 1;
2983 pic16l_setled(XBOX_LED_GREEN);
2984
2985 /*
2986 * We are an XBOX, but we may have either 64MB or 128MB of
2987 * memory. The PCI host bridge should be programmed for this,
2988 * so we just query it.
2989 */
2990 outl(0xcf8, 0x80000084);
2991 arch_i386_xbox_memsize = (inl(0xcfc) == 0x7FFFFFF) ? 128 : 64;
2992 }
2993#endif /* XBOX */
2994
2995 /*
2996 * Initialize the i8254 before the console so that console
2997 * initialization can use DELAY().
2998 */
2999 i8254_init();
3000
3001 /*
3002 * Initialize the console before we print anything out.
3003 */
3004 cninit();
3005
3006 if (metadata_missing)
3007 printf("WARNING: loader(8) metadata is missing!\n");
3008
3009#ifdef DEV_ISA
3010#ifdef DEV_ATPIC
3011 elcr_probe();
3012 atpic_startup();
3013#else
3014 /* Reset and mask the atpics and leave them shut down. */
3015 atpic_reset();
3016
3017 /*
3018 * Point the ICU spurious interrupt vectors at the APIC spurious
3019 * interrupt handler.
3020 */
3021 setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL,
3022 GSEL(GCODE_SEL, SEL_KPL));
3023 setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL,
3024 GSEL(GCODE_SEL, SEL_KPL));
3025#endif
3026#endif
3027
3028#ifdef DDB
3029 ksym_start = bootinfo.bi_symtab;
3030 ksym_end = bootinfo.bi_esymtab;
3031#endif
3032
3033 kdb_init();
3034
3035#ifdef KDB
3036 if (boothowto & RB_KDB)
3037 kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
3038#endif
3039
3040 finishidentcpu(); /* Final stage of CPU initialization */
3041 setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL,
3042 GSEL(GCODE_SEL, SEL_KPL));
3043 setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL,
3044 GSEL(GCODE_SEL, SEL_KPL));
3045 initializecpu(); /* Initialize CPU registers */
3046
3047 /* make an initial tss so cpu can get interrupt stack on syscall! */
3048 /* Note: -16 is so we can grow the trapframe if we came from vm86 */
3049 PCPU_SET(common_tss.tss_esp0, thread0.td_kstack +
3050 kstack0_sz - sizeof(struct pcb) - 16);
3051 PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL));
3052 gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
3053 PCPU_SET(tss_gdt, &gdt[GPROC0_SEL].sd);
3054 PCPU_SET(common_tssd, *PCPU_GET(tss_gdt));
3055 PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16);
3056 ltr(gsel_tss);
3057
3058 /* pointer to selector slot for %fs/%gs */
3059 PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd);
3060
3061 dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 =
3062 dblfault_tss.tss_esp2 = (int)&dblfault_stack[sizeof(dblfault_stack)];
3063 dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 =
3064 dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL);
3065#ifdef PAE
3066 dblfault_tss.tss_cr3 = (int)IdlePDPT;
3067#else
3068 dblfault_tss.tss_cr3 = (int)IdlePTD;
3069#endif
3070 dblfault_tss.tss_eip = (int)dblfault_handler;
3071 dblfault_tss.tss_eflags = PSL_KERNEL;
3072 dblfault_tss.tss_ds = dblfault_tss.tss_es =
3073 dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL);
3074 dblfault_tss.tss_fs = GSEL(GPRIV_SEL, SEL_KPL);
3075 dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL);
3076 dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL);
3077
3078 vm86_initialize();
3079 getmemsize(first);
3080 init_param2(physmem);
3081
3082 /* now running on new page tables, configured, and u/iom is accessible */
3083
3084 msgbufinit(msgbufp, msgbufsize);
3085
3086 /* make a call gate to reenter kernel with */
3087 gdp = &ldt[LSYS5CALLS_SEL].gd;
3088
3089 x = (int) &IDTVEC(lcall_syscall);
3090 gdp->gd_looffset = x;
3091 gdp->gd_selector = GSEL(GCODE_SEL,SEL_KPL);
3092 gdp->gd_stkcpy = 1;
3093 gdp->gd_type = SDT_SYS386CGT;
3094 gdp->gd_dpl = SEL_UPL;
3095 gdp->gd_p = 1;
3096 gdp->gd_hioffset = x >> 16;
3097
3098 /* XXX does this work? */
3099 /* XXX yes! */
3100 ldt[LBSDICALLS_SEL] = ldt[LSYS5CALLS_SEL];
3101 ldt[LSOL26CALLS_SEL] = ldt[LSYS5CALLS_SEL];
3102
3103 /* transfer to user mode */
3104
3105 _ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
3106 _udatasel = GSEL(GUDATA_SEL, SEL_UPL);
3107
3108 /* setup proc 0's pcb */
3109 thread0.td_pcb->pcb_flags = 0;
3110#ifdef PAE
3111 thread0.td_pcb->pcb_cr3 = (int)IdlePDPT;
3112#else
3113 thread0.td_pcb->pcb_cr3 = (int)IdlePTD;
3114#endif
3115 thread0.td_pcb->pcb_ext = 0;
3116 thread0.td_frame = &proc0_tf;
3117
3118 cpu_probe_amdc1e();
3119 cpu_probe_cmpxchg8b();
3120
3121#ifdef FDT
3122 x86_init_fdt();
3123#endif
3124}
3125#endif
3126
3127void
3128cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
3129{
3130
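 /* The ACPI processor id is not known this early; mark it invalid. */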
3131 pcpu->pc_acpi_id = 0xffffffff;
3132}
3133
3134void
3135spinlock_enter(void)
3136{
3137 struct thread *td;
3138 register_t flags;
3139
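 /*
 * Interrupts are disabled and the flags saved only on the outermost
 * spinlock_enter(); nested calls just bump the count.
 */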
3140 td = curthread;
3141 if (td->td_md.md_spinlock_count == 0) {
3142 flags = intr_disable();
3143 td->td_md.md_spinlock_count = 1;
3144 td->td_md.md_saved_flags = flags;
3145 } else
3146 td->td_md.md_spinlock_count++;
3147 critical_enter();
3148}
3149
3150void
3151spinlock_exit(void)
3152{
3153 struct thread *td;
3154 register_t flags;
3155
3156 td = curthread;
3157 critical_exit();
3158 flags = td->td_md.md_saved_flags;
3159 td->td_md.md_spinlock_count--;
3160 if (td->td_md.md_spinlock_count == 0)
3161 intr_restore(flags);
3162}
3163
3164#if defined(I586_CPU) && !defined(NO_F00F_HACK)
3165static void f00f_hack(void *unused);
3166SYSINIT(f00f_hack, SI_SUB_INTRINSIC, SI_ORDER_FIRST, f00f_hack, NULL);
3167
3168static void
3169f00f_hack(void *unused)
3170{
3171 struct gate_descriptor *new_idt;
3172 vm_offset_t tmp;
3173
3174 if (!has_f00f_bug)
3175 return;
3176
3177 GIANT_REQUIRED;
3178
3179 printf("Intel Pentium detected, installing workaround for F00F bug\n");
3180
3181 tmp = kmem_alloc(kernel_map, PAGE_SIZE * 2);
3182 if (tmp == 0)
3183 panic("kmem_alloc returned 0");
3184
3185 /* Put the problematic entry (#6) at the end of the lower page. */
3186 new_idt = (struct gate_descriptor*)
3187 (tmp + PAGE_SIZE - 7 * sizeof(struct gate_descriptor));
3188 bcopy(idt, new_idt, sizeof(idt0));
3189 r_idt.rd_base = (u_int)new_idt;
3190 lidt(&r_idt);
3191 idt = new_idt;
3192 if (vm_map_protect(kernel_map, tmp, tmp + PAGE_SIZE,
3193 VM_PROT_READ, FALSE) != KERN_SUCCESS)
3194 panic("vm_map_protect failed");
3195}
3196#endif /* defined(I586_CPU) && !NO_F00F_HACK */
3197
3198/*
3199 * Construct a PCB from a trapframe. This is called from kdb_trap() where
3200 * we want to start a backtrace from the function that caused us to enter
3201 * the debugger. We have the context in the trapframe, but base the trace
3202 * on the PCB. The PCB doesn't have to be perfect, as long as it contains
3203 * enough for a backtrace.
3204 */
3205void
3206makectx(struct trapframe *tf, struct pcb *pcb)
3207{
3208
3209 pcb->pcb_edi = tf->tf_edi;
3210 pcb->pcb_esi = tf->tf_esi;
3211 pcb->pcb_ebp = tf->tf_ebp;
3212 pcb->pcb_ebx = tf->tf_ebx;
3213 pcb->pcb_eip = tf->tf_eip;
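 /*
 * Traps from kernel mode do not push %esp/%ss, so derive the stack
 * pointer from the end of the trapframe instead.
 */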
3214 pcb->pcb_esp = (ISPL(tf->tf_cs)) ? tf->tf_esp : (int)(tf + 1) - 8;
3215}
3216
3217int
3218ptrace_set_pc(struct thread *td, u_long addr)
3219{
3220
3221 td->td_frame->tf_eip = addr;
3222 return (0);
3223}
3224
3225int
3226ptrace_single_step(struct thread *td)
3227{
3228 td->td_frame->tf_eflags |= PSL_T;
3229 return (0);
3230}
3231
3232int
3233ptrace_clear_single_step(struct thread *td)
3234{
3235 td->td_frame->tf_eflags &= ~PSL_T;
3236 return (0);
3237}
3238
3239int
3240fill_regs(struct thread *td, struct reg *regs)
3241{
3242 struct pcb *pcb;
3243 struct trapframe *tp;
3244
3245 tp = td->td_frame;
3246 pcb = td->td_pcb;
3247 regs->r_gs = pcb->pcb_gs;
3248 return (fill_frame_regs(tp, regs));
3249}
3250
3251int
3252fill_frame_regs(struct trapframe *tp, struct reg *regs)
3253{
3254 regs->r_fs = tp->tf_fs;
3255 regs->r_es = tp->tf_es;
3256 regs->r_ds = tp->tf_ds;
3257 regs->r_edi = tp->tf_edi;
3258 regs->r_esi = tp->tf_esi;
3259 regs->r_ebp = tp->tf_ebp;
3260 regs->r_ebx = tp->tf_ebx;
3261 regs->r_edx = tp->tf_edx;
3262 regs->r_ecx = tp->tf_ecx;
3263 regs->r_eax = tp->tf_eax;
3264 regs->r_eip = tp->tf_eip;
3265 regs->r_cs = tp->tf_cs;
3266 regs->r_eflags = tp->tf_eflags;
3267 regs->r_esp = tp->tf_esp;
3268 regs->r_ss = tp->tf_ss;
3269 return (0);
3270}
3271
3272int
3273set_regs(struct thread *td, struct reg *regs)
3274{
3275 struct pcb *pcb;
3276 struct trapframe *tp;
3277
3278 tp = td->td_frame;
3279 if (!EFL_SECURE(regs->r_eflags, tp->tf_eflags) ||
3280 !CS_SECURE(regs->r_cs))
3281 return (EINVAL);
3282 pcb = td->td_pcb;
3283 tp->tf_fs = regs->r_fs;
3284 tp->tf_es = regs->r_es;
3285 tp->tf_ds = regs->r_ds;
3286 tp->tf_edi = regs->r_edi;
3287 tp->tf_esi = regs->r_esi;
3288 tp->tf_ebp = regs->r_ebp;
3289 tp->tf_ebx = regs->r_ebx;
3290 tp->tf_edx = regs->r_edx;
3291 tp->tf_ecx = regs->r_ecx;
3292 tp->tf_eax = regs->r_eax;
3293 tp->tf_eip = regs->r_eip;
3294 tp->tf_cs = regs->r_cs;
3295 tp->tf_eflags = regs->r_eflags;
3296 tp->tf_esp = regs->r_esp;
3297 tp->tf_ss = regs->r_ss;
3298 pcb->pcb_gs = regs->r_gs;
3299 return (0);
3300}
3301
3302#ifdef CPU_ENABLE_SSE
3303static void
3304fill_fpregs_xmm(sv_xmm, sv_87)
3305 struct savexmm *sv_xmm;
3306 struct save87 *sv_87;
3307{
3308 register struct env87 *penv_87 = &sv_87->sv_env;
3309 register struct envxmm *penv_xmm = &sv_xmm->sv_env;
3310 int i;
3311
3312 bzero(sv_87, sizeof(*sv_87));
3313
3314 /* FPU control/status */
3315 penv_87->en_cw = penv_xmm->en_cw;
3316 penv_87->en_sw = penv_xmm->en_sw;
3317 penv_87->en_tw = penv_xmm->en_tw;
3318 penv_87->en_fip = penv_xmm->en_fip;
3319 penv_87->en_fcs = penv_xmm->en_fcs;
3320 penv_87->en_opcode = penv_xmm->en_opcode;
3321 penv_87->en_foo = penv_xmm->en_foo;
3322 penv_87->en_fos = penv_xmm->en_fos;
3323
3324 /* FPU registers */
3325 for (i = 0; i < 8; ++i)
3326 sv_87->sv_ac[i] = sv_xmm->sv_fp[i].fp_acc;
3327}
3328
3329static void
3330set_fpregs_xmm(sv_87, sv_xmm)
3331 struct save87 *sv_87;
3332 struct savexmm *sv_xmm;
3333{
3334 register struct env87 *penv_87 = &sv_87->sv_env;
3335 register struct envxmm *penv_xmm = &sv_xmm->sv_env;
3336 int i;
3337
3338 /* FPU control/status */
3339 penv_xmm->en_cw = penv_87->en_cw;
3340 penv_xmm->en_sw = penv_87->en_sw;
3341 penv_xmm->en_tw = penv_87->en_tw;
3342 penv_xmm->en_fip = penv_87->en_fip;
3343 penv_xmm->en_fcs = penv_87->en_fcs;
3344 penv_xmm->en_opcode = penv_87->en_opcode;
3345 penv_xmm->en_foo = penv_87->en_foo;
3346 penv_xmm->en_fos = penv_87->en_fos;
3347
3348 /* FPU registers */
3349 for (i = 0; i < 8; ++i)
3350 sv_xmm->sv_fp[i].fp_acc = sv_87->sv_ac[i];
3351}
3352#endif /* CPU_ENABLE_SSE */
3353
3354int
3355fill_fpregs(struct thread *td, struct fpreg *fpregs)
3356{
3357
3358 KASSERT(td == curthread || TD_IS_SUSPENDED(td) ||
3359 P_SHOULDSTOP(td->td_proc),
3360 ("not suspended thread %p", td));
3361#ifdef DEV_NPX
3362 npxgetregs(td);
3363#else
3364 bzero(fpregs, sizeof(*fpregs));
3365#endif
3366#ifdef CPU_ENABLE_SSE
3367 if (cpu_fxsr)
3368 fill_fpregs_xmm(&td->td_pcb->pcb_user_save.sv_xmm,
3369 (struct save87 *)fpregs);
3370 else
3371#endif /* CPU_ENABLE_SSE */
3372 bcopy(&td->td_pcb->pcb_user_save.sv_87, fpregs,
3373 sizeof(*fpregs));
3374 return (0);
3375}
3376
3377int
3378set_fpregs(struct thread *td, struct fpreg *fpregs)
3379{
3380
3381#ifdef CPU_ENABLE_SSE
3382 if (cpu_fxsr)
3383 set_fpregs_xmm((struct save87 *)fpregs,
3384 &td->td_pcb->pcb_user_save.sv_xmm);
3385 else
3386#endif /* CPU_ENABLE_SSE */
3387 bcopy(fpregs, &td->td_pcb->pcb_user_save.sv_87,
3388 sizeof(*fpregs));
3389#ifdef DEV_NPX
3390 npxuserinited(td);
3391#endif
3392 return (0);
3393}
3394
3395/*
3396 * Get machine context.
3397 */
3398int
3399get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
3400{
3401 struct trapframe *tp;
3402 struct segment_descriptor *sdp;
3403
3404 tp = td->td_frame;
3405
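 /* sigonstack() inspects the signal stack state, which the proc lock protects. */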
3406 PROC_LOCK(curthread->td_proc);
3407 mcp->mc_onstack = sigonstack(tp->tf_esp);
3408 PROC_UNLOCK(curthread->td_proc);
3409 mcp->mc_gs = td->td_pcb->pcb_gs;
3410 mcp->mc_fs = tp->tf_fs;
3411 mcp->mc_es = tp->tf_es;
3412 mcp->mc_ds = tp->tf_ds;
3413 mcp->mc_edi = tp->tf_edi;
3414 mcp->mc_esi = tp->tf_esi;
3415 mcp->mc_ebp = tp->tf_ebp;
3416 mcp->mc_isp = tp->tf_isp;
3417 mcp->mc_eflags = tp->tf_eflags;
3418 if (flags & GET_MC_CLEAR_RET) {
3419 mcp->mc_eax = 0;
3420 mcp->mc_edx = 0;
3421 mcp->mc_eflags &= ~PSL_C;
3422 } else {
3423 mcp->mc_eax = tp->tf_eax;
3424 mcp->mc_edx = tp->tf_edx;
3425 }
3426 mcp->mc_ebx = tp->tf_ebx;
3427 mcp->mc_ecx = tp->tf_ecx;
3428 mcp->mc_eip = tp->tf_eip;
3429 mcp->mc_cs = tp->tf_cs;
3430 mcp->mc_esp = tp->tf_esp;
3431 mcp->mc_ss = tp->tf_ss;
3432 mcp->mc_len = sizeof(*mcp);
3433 get_fpcontext(td, mcp);
3434 sdp = &td->td_pcb->pcb_fsd;
3435 mcp->mc_fsbase = sdp->sd_hibase << 24 | sdp->sd_lobase;
3436 sdp = &td->td_pcb->pcb_gsd;
3437 mcp->mc_gsbase = sdp->sd_hibase << 24 | sdp->sd_lobase;
3438 mcp->mc_flags = 0;
3439 bzero(mcp->mc_spare2, sizeof(mcp->mc_spare2));
3440 return (0);
3441}
3442
3443/*
3444 * Set machine context.
3445 *
3446 * However, we don't set any but the user modifiable flags, and we won't
3447 * touch the cs selector.
3448 */
3449int
3450set_mcontext(struct thread *td, const mcontext_t *mcp)
3451{
3452 struct trapframe *tp;
3453 int eflags, ret;
3454
3455 tp = td->td_frame;
3456 if (mcp->mc_len != sizeof(*mcp))
3457 return (EINVAL);
3458 eflags = (mcp->mc_eflags & PSL_USERCHANGE) |
3459 (tp->tf_eflags & ~PSL_USERCHANGE);
3460 if ((ret = set_fpcontext(td, mcp)) == 0) {
3461 tp->tf_fs = mcp->mc_fs;
3462 tp->tf_es = mcp->mc_es;
3463 tp->tf_ds = mcp->mc_ds;
3464 tp->tf_edi = mcp->mc_edi;
3465 tp->tf_esi = mcp->mc_esi;
3466 tp->tf_ebp = mcp->mc_ebp;
3467 tp->tf_ebx = mcp->mc_ebx;
3468 tp->tf_edx = mcp->mc_edx;
3469 tp->tf_ecx = mcp->mc_ecx;
3470 tp->tf_eax = mcp->mc_eax;
3471 tp->tf_eip = mcp->mc_eip;
3472 tp->tf_eflags = eflags;
3473 tp->tf_esp = mcp->mc_esp;
3474 tp->tf_ss = mcp->mc_ss;
3475 td->td_pcb->pcb_gs = mcp->mc_gs;
3476 ret = 0;
3477 }
3478 return (ret);
3479}
3480
3481static void
3482get_fpcontext(struct thread *td, mcontext_t *mcp)
3483{
3484
3485#ifndef DEV_NPX
3486 mcp->mc_fpformat = _MC_FPFMT_NODEV;
3487 mcp->mc_ownedfp = _MC_FPOWNED_NONE;
3488 bzero(mcp->mc_fpstate, sizeof(mcp->mc_fpstate));
3489#else
3490 mcp->mc_ownedfp = npxgetregs(td);
3491 bcopy(&td->td_pcb->pcb_user_save, &mcp->mc_fpstate,
3492 sizeof(mcp->mc_fpstate));
3493 mcp->mc_fpformat = npxformat();
3494#endif
3495}
3496
3497static int
3498set_fpcontext(struct thread *td, const mcontext_t *mcp)
3499{
3500
3501 if (mcp->mc_fpformat == _MC_FPFMT_NODEV)
3502 return (0);
3503 else if (mcp->mc_fpformat != _MC_FPFMT_387 &&
3504 mcp->mc_fpformat != _MC_FPFMT_XMM)
3505 return (EINVAL);
3506 else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE)
3507 /* We don't care what state is left in the FPU or PCB. */
3508 fpstate_drop(td);
3509 else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU ||
3510 mcp->mc_ownedfp == _MC_FPOWNED_PCB) {
3511#ifdef DEV_NPX
3512#ifdef CPU_ENABLE_SSE
3513 if (cpu_fxsr)
3514 ((union savefpu *)&mcp->mc_fpstate)->sv_xmm.sv_env.
3515 en_mxcsr &= cpu_mxcsr_mask;
3516#endif
3517 npxsetregs(td, (union savefpu *)&mcp->mc_fpstate);
3518#endif
3519 } else
3520 return (EINVAL);
3521 return (0);
3522}
3523
3524static void
3525fpstate_drop(struct thread *td)
3526{
3527
3528 KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu"));
3529 critical_enter();
3530#ifdef DEV_NPX
3531 if (PCPU_GET(fpcurthread) == td)
3532 npxdrop();
3533#endif
3534 /*
3535 * XXX force a full drop of the npx. The above only drops it if we
3536 * owned it. npxgetregs() has the same bug in the !cpu_fxsr case.
3537 *
3538 * XXX I don't much like npxgetregs()'s semantics of doing a full
3539 * drop. Dropping only to the pcb matches fnsave's behaviour.
3540 * We only need to drop to !PCB_INITDONE in sendsig(). But
3541 * sendsig() is the only caller of npxgetregs()... perhaps we just
3542 * have too many layers.
3543 */
3544 curthread->td_pcb->pcb_flags &= ~(PCB_NPXINITDONE |
3545 PCB_NPXUSERINITDONE);
3546 critical_exit();
3547}
3548
3549int
3550fill_dbregs(struct thread *td, struct dbreg *dbregs)
3551{
3552 struct pcb *pcb;
3553
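 /* A NULL thread means the caller wants the live hardware debug register contents. */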
3554 if (td == NULL) {
3555 dbregs->dr[0] = rdr0();
3556 dbregs->dr[1] = rdr1();
3557 dbregs->dr[2] = rdr2();
3558 dbregs->dr[3] = rdr3();
3559 dbregs->dr[4] = rdr4();
3560 dbregs->dr[5] = rdr5();
3561 dbregs->dr[6] = rdr6();
3562 dbregs->dr[7] = rdr7();
3563 } else {
3564 pcb = td->td_pcb;
3565 dbregs->dr[0] = pcb->pcb_dr0;
3566 dbregs->dr[1] = pcb->pcb_dr1;
3567 dbregs->dr[2] = pcb->pcb_dr2;
3568 dbregs->dr[3] = pcb->pcb_dr3;
3569 dbregs->dr[4] = 0;
3570 dbregs->dr[5] = 0;
3571 dbregs->dr[6] = pcb->pcb_dr6;
3572 dbregs->dr[7] = pcb->pcb_dr7;
3573 }
3574 return (0);
3575}
3576
3577int
3578set_dbregs(struct thread *td, struct dbreg *dbregs)
3579{
3580 struct pcb *pcb;
3581 int i;
3582
3583 if (td == NULL) {
3584 load_dr0(dbregs->dr[0]);
3585 load_dr1(dbregs->dr[1]);
3586 load_dr2(dbregs->dr[2]);
3587 load_dr3(dbregs->dr[3]);
3588 load_dr4(dbregs->dr[4]);
3589 load_dr5(dbregs->dr[5]);
3590 load_dr6(dbregs->dr[6]);
3591 load_dr7(dbregs->dr[7]);
3592 } else {
3593 /*
3594 * Don't let an illegal value for dr7 get set. Specifically,
3595 * check for undefined settings. Setting these bit patterns
3596 * results in undefined behaviour and can lead to an unexpected
3597 * TRCTRAP.
3598 */
3599 for (i = 0; i < 4; i++) {
3600 if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02)
3601 return (EINVAL);
3602 if (DBREG_DR7_LEN(dbregs->dr[7], i) == 0x02)
3603 return (EINVAL);
3604 }
3605
3606 pcb = td->td_pcb;
3607
3608 /*
3609 * Don't let a process set a breakpoint that is not within the
3610 * process's address space. If a process could do this, it
3611 * could halt the system by setting a breakpoint in the kernel
3612 * (if ddb was enabled). Thus, we need to check to make sure
3613 * that no breakpoints are being enabled for addresses outside
3614 * the process's address space.
3615 *
3616 * XXX - what about when the watched area of the user's
3617 * address space is written into from within the kernel
3618 * ... wouldn't that still cause a breakpoint to be generated
3619 * from within kernel mode?
3620 */
3621
3622 if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) {
3623 /* dr0 is enabled */
3624 if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS)
3625 return (EINVAL);
3626 }
3627
3628 if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) {
3629 /* dr1 is enabled */
3630 if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS)
3631 return (EINVAL);
3632 }
3633
3634 if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) {
3635 /* dr2 is enabled */
3636 if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS)
3637 return (EINVAL);
3638 }
3639
3640 if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) {
3641 /* dr3 is enabled */
3642 if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS)
3643 return (EINVAL);
3644 }
3645
3646 pcb->pcb_dr0 = dbregs->dr[0];
3647 pcb->pcb_dr1 = dbregs->dr[1];
3648 pcb->pcb_dr2 = dbregs->dr[2];
3649 pcb->pcb_dr3 = dbregs->dr[3];
3650 pcb->pcb_dr6 = dbregs->dr[6];
3651 pcb->pcb_dr7 = dbregs->dr[7];
3652
3653 pcb->pcb_flags |= PCB_DBREGS;
3654 }
3655
3656 return (0);
3657}
3658
3659/*
3660 * Return > 0 if a hardware breakpoint has been hit, and the
3661 * breakpoint was in user space. Return 0 otherwise.
3662 */
3663int
3664user_dbreg_trap(void)
3665{
3666 u_int32_t dr7, dr6; /* debug registers dr6 and dr7 */
3667 u_int32_t bp; /* breakpoint bits extracted from dr6 */
3668 int nbp; /* number of breakpoints that triggered */
3669 caddr_t addr[4]; /* breakpoint addresses */
3670 int i;
3671
3672 dr7 = rdr7();
3673 if ((dr7 & 0x000000ff) == 0) {
3674 /*
3675 * none of the breakpoint enable bits (L0-L3, G0-G3) in
3676 * the dr7 register are set, thus the trap couldn't have
3677 * been caused by the hardware debug registers
3678 */
3679 return 0;
3680 }
3681
3682 nbp = 0;
3683 dr6 = rdr6();
3684 bp = dr6 & 0x0000000f;
3685
3686 if (!bp) {
3687 /*
3688 * None of the breakpoint bits are set, meaning this
3689 * trap was not caused by any of the debug registers
3690 */
3691 return 0;
3692 }
3693
3694 /*
3695 * at least one of the breakpoints was hit; check to see
3696 * which ones and whether any of them are user space addresses
3697 */
3698
3699 if (bp & 0x01) {
3700 addr[nbp++] = (caddr_t)rdr0();
3701 }
3702 if (bp & 0x02) {
3703 addr[nbp++] = (caddr_t)rdr1();
3704 }
3705 if (bp & 0x04) {
3706 addr[nbp++] = (caddr_t)rdr2();
3707 }
3708 if (bp & 0x08) {
3709 addr[nbp++] = (caddr_t)rdr3();
3710 }
3711
3712 for (i = 0; i < nbp; i++) {
3713 if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) {
3714 /*
3715 * addr[i] is in user space
3716 */
3717 return nbp;
3718 }
3719 }
3720
3721 /*
3722 * None of the breakpoints are in user space.
3723 */
3724 return 0;
3725}
3726
3727#ifdef KDB
3728
3729/*
3730 * Provide inb() and outb() as functions. They are normally only available as
3731 * inline functions, thus cannot be called from the debugger.
3732 */
3733
3734/* silence compiler warnings */
3735u_char inb_(u_short);
3736void outb_(u_short, u_char);
3737
3738u_char
3739inb_(u_short port)
3740{
3741 return inb(port);
3742}
3743
3744void
3745outb_(u_short port, u_char data)
3746{
3747 outb(port, data);
3748}
3749
3750#endif /* KDB */