/*
 * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * @OSF_COPYRIGHT@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */

/*
 */

#include <kern/cpu_number.h>
#include <kern/kalloc.h>
#include <kern/cpu_data.h>
#include <mach/mach_types.h>
#include <mach/machine.h>
#include <mach/vm_map.h>
#include <mach/machine/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_map.h>

#include <i386/bit_routines.h>
#include <i386/mp_desc.h>
#include <i386/misc_protos.h>
#include <i386/mp.h>
#include <i386/pmap.h>
#if defined(__i386__) || defined(__x86_64__)
#include <i386/pmap_internal.h>
#endif /* defined(__i386__) || defined(__x86_64__) */
#if CONFIG_MCA
#include <i386/machine_check.h>
#endif

#include <kern/misc_protos.h>

#define K_INTR_GATE (ACC_P|ACC_PL_K|ACC_INTR_GATE)
#define U_INTR_GATE (ACC_P|ACC_PL_U|ACC_INTR_GATE)
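/*
 * Note: both patterns set the present bit and the 64-bit interrupt-gate
 * type in the access byte; they differ only in descriptor privilege level.
 * K_INTR_GATE vectors can be reached only by hardware events or kernel
 * code, while U_INTR_GATE vectors have DPL 3 and so may also be raised
 * from user mode with an explicit "int n" instruction, which is what the
 * USER_TRAP entries below rely on.
 */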

// Declare macros that will declare the externs
#define TRAP(n, name)		extern void *name ;
#define TRAP_ERR(n, name)	extern void *name ;
#define TRAP_SPC(n, name)	extern void *name ;
#define TRAP_IST1(n, name)	extern void *name ;
#define TRAP_IST2(n, name)	extern void *name ;
#define INTERRUPT(n)		extern void *_intr_ ## n ;
#define USER_TRAP(n, name)	extern void *name ;
#define USER_TRAP_SPC(n, name)	extern void *name ;

// Include the table to declare the externs
#include "../x86_64/idt_table.h"

// Undef the macros, then redefine them so we can declare the table
#undef TRAP
#undef TRAP_ERR
#undef TRAP_SPC
#undef TRAP_IST1
#undef TRAP_IST2
#undef INTERRUPT
#undef USER_TRAP
#undef USER_TRAP_SPC

#define TRAP(n, name)			\
	[n] = {				\
		(uintptr_t)&name,	\
		KERNEL64_CS,		\
		0,			\
		K_INTR_GATE,		\
		0			\
	},

#define TRAP_ERR TRAP
#define TRAP_SPC TRAP

#define TRAP_IST1(n, name) \
	[n] = {				\
		(uintptr_t)&name,	\
		KERNEL64_CS,		\
		1,			\
		K_INTR_GATE,		\
		0			\
	},

#define TRAP_IST2(n, name) \
	[n] = {				\
		(uintptr_t)&name,	\
		KERNEL64_CS,		\
		2,			\
		K_INTR_GATE,		\
		0			\
	},

#define INTERRUPT(n) \
	[n] = {				\
		(uintptr_t)&_intr_ ## n,\
		KERNEL64_CS,		\
		0,			\
		K_INTR_GATE,		\
		0			\
	},

#define USER_TRAP(n, name) \
	[n] = {				\
		(uintptr_t)&name,	\
		KERNEL64_CS,		\
		0,			\
		U_INTR_GATE,		\
		0			\
	},

#define USER_TRAP_SPC USER_TRAP

// Declare the table using the macros we just set up
struct fake_descriptor64 master_idt64[IDTSZ]
	__attribute__ ((section("__HIB,__desc")))
	__attribute__ ((aligned(PAGE_SIZE))) = {
#include "../x86_64/idt_table.h"
};

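/*
 * Illustrative sketch (not part of the build): with the macros above, a
 * hypothetical idt_table.h entry such as
 *
 *	TRAP(0x00, hndl_div_zero)
 *
 * expands to the designated initializer
 *
 *	[0x00] = { (uintptr_t)&hndl_div_zero, KERNEL64_CS, 0, K_INTR_GATE, 0 },
 *
 * i.e. vector 0 becomes a kernel-privilege interrupt gate in the 64-bit
 * kernel code segment with IST index 0 (no interrupt-stack-table switch).
 * The actual vector/handler pairs come from ../x86_64/idt_table.h.
 */
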
/*
 * First cpu's interrupt stack.
 */
extern uint32_t		low_intstack[];		/* bottom */
extern uint32_t		low_eintstack[];	/* top */

/*
 * Per-cpu data area pointers.
 * The master cpu (cpu 0) has its data area statically allocated;
 * others are allocated dynamically and this array is updated at runtime.
 */
static cpu_data_t	cpu_data_master = {
	.cpu_this = &cpu_data_master,
	.cpu_nanotime = &pal_rtc_nanotime_info,
	.cpu_int_stack_top = (vm_offset_t) low_eintstack,
};
cpu_data_t	*cpu_data_ptr[MAX_CPUS] = { [0] = &cpu_data_master };

decl_simple_lock_data(,ncpus_lock);	/* protects real_ncpus */
unsigned int	real_ncpus = 1;
unsigned int	max_ncpus = MAX_CPUS;

extern void hi64_sysenter(void);
extern void hi64_syscall(void);

/*
 * Multiprocessor i386/i486 systems use a separate copy of the
 * GDT, IDT, LDT, and kernel TSS per processor.  The first three
 * are separate to avoid lock contention: the i386 uses locked
 * memory cycles to access the descriptor tables.  The TSS is
 * separate since each processor needs its own kernel stack,
 * and since using a TSS marks it busy.
 */

/*
 * Allocate and initialize the per-processor descriptor tables.
 */

struct fake_descriptor ldt_desc_pattern = {
	(unsigned int) 0,
	LDTSZ_MIN * sizeof(struct fake_descriptor) - 1,
	0,
	ACC_P|ACC_PL_K|ACC_LDT
};

struct fake_descriptor tss_desc_pattern = {
	(unsigned int) 0,
	sizeof(struct i386_tss) - 1,
	0,
	ACC_P|ACC_PL_K|ACC_TSS
};

struct fake_descriptor cpudata_desc_pattern = {
	(unsigned int) 0,
	sizeof(cpu_data_t)-1,
	SZ_32,
	ACC_P|ACC_PL_K|ACC_DATA_W
};

#if	NCOPY_WINDOWS > 0
struct fake_descriptor userwindow_desc_pattern = {
	(unsigned int) 0,
	((NBPDE * NCOPY_WINDOWS) / PAGE_SIZE) - 1,
	SZ_32 | SZ_G,
	ACC_P|ACC_PL_U|ACC_DATA_W
};
#endif

struct fake_descriptor physwindow_desc_pattern = {
	(unsigned int) 0,
	PAGE_SIZE - 1,
	SZ_32,
	ACC_P|ACC_PL_K|ACC_DATA_W
};

/*
 * This is the expanded, 64-bit variant of the kernel LDT descriptor.
 * When switching to 64-bit mode, this replaces the KERNEL_LDT entry
 * and the following empty slot. This enables the LDT to be referenced
 * in the uber-space remapping window on the kernel.
 */
struct fake_descriptor64 kernel_ldt_desc64 = {
	0,
	LDTSZ_MIN*sizeof(struct fake_descriptor)-1,
	0,
	ACC_P|ACC_PL_K|ACC_LDT,
	0
};

/*
 * This is the expanded, 64-bit variant of the kernel TSS descriptor.
 * It follows the same pattern as the KERNEL_LDT descriptor above.
 */
struct fake_descriptor64 kernel_tss_desc64 = {
	0,
	sizeof(struct x86_64_tss)-1,
	0,
	ACC_P|ACC_PL_K|ACC_TSS,
	0
};

/*
 * Convert a descriptor from fake to real format.
 *
 * Fake descriptor format:
 *	bytes 0..3		base 31..0
 *	bytes 4..5		limit 15..0
 *	byte  6			access byte 2 | limit 19..16
 *	byte  7			access byte 1
 *
 * Real descriptor format:
 *	bytes 0..1		limit 15..0
 *	bytes 2..3		base 15..0
 *	byte  4			base 23..16
 *	byte  5			access byte 1
 *	byte  6			access byte 2 | limit 19..16
 *	byte  7			base 31..24
 *
 * Fake gate format:
 *	bytes 0..3		offset
 *	bytes 4..5		selector
 *	byte  6			word count << 4 (to match fake descriptor)
 *	byte  7			access byte 1
 *
 * Real gate format:
 *	bytes 0..1		offset 15..0
 *	bytes 2..3		selector
 *	byte  4			word count
 *	byte  5			access byte 1
 *	bytes 6..7		offset 31..16
 */
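/*
 * Worked example (illustrative values only): a fake data-segment
 * descriptor with base 0x00ABCDEF, limit 0xFFFF and access bytes
 * acc1 = 0x92, acc2 = 0xC0 is laid out as the bytes
 *
 *	EF CD AB 00  FF FF  C0 92	(base, limit, acc2, acc1)
 *
 * and fix_desc() rearranges it in place into the real format
 *
 *	FF FF  EF CD  AB  92  C0  00	(limit, base 15..0, base 23..16,
 *					 acc1, acc2, base 31..24)
 *
 * The (desc[7] & 0x14) == 0x04 test below selects the gate layout for
 * system descriptors whose type field has bit 2 set (call, interrupt and
 * trap gates); everything else takes the plain descriptor path.
 */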
void
fix_desc(void *d, int num_desc) {
	//early_kprintf("fix_desc(%x, %x)\n", d, num_desc);
	uint8_t *desc = (uint8_t*) d;

	do {
		if ((desc[7] & 0x14) == 0x04) { /* gate */
			uint32_t offset;
			uint16_t selector;
			uint8_t wordcount;
			uint8_t acc;

			offset = *((uint32_t*)(desc));
			selector = *((uint16_t*)(desc+4));
			wordcount = desc[6] >> 4;
			acc = desc[7];

			*((uint16_t*)desc) = offset & 0xFFFF;
			*((uint16_t*)(desc+2)) = selector;
			desc[4] = wordcount;
			desc[5] = acc;
			*((uint16_t*)(desc+6)) = offset >> 16;

		} else { /* descriptor */
			uint32_t base;
			uint16_t limit;
			uint8_t acc1, acc2;

			base = *((uint32_t*)(desc));
			limit = *((uint16_t*)(desc+4));
			acc2 = desc[6];
			acc1 = desc[7];

			*((uint16_t*)(desc)) = limit;
			*((uint16_t*)(desc+2)) = base & 0xFFFF;
			desc[4] = (base >> 16) & 0xFF;
			desc[5] = acc1;
			desc[6] = acc2;
			desc[7] = base >> 24;
		}
		desc += 8;
	} while (--num_desc);
}

void
fix_desc64(void *descp, int count)
{
	struct fake_descriptor64	*fakep;
	union {
		struct real_gate64		gate;
		struct real_descriptor64	desc;
	}				real;
	int				i;

	fakep = (struct fake_descriptor64 *) descp;

	for (i = 0; i < count; i++, fakep++) {
		/*
		 * Construct the real descriptor locally.
		 */

		bzero((void *) &real, sizeof(real));

		switch (fakep->access & ACC_TYPE) {
		case 0:
			break;
		case ACC_CALL_GATE:
		case ACC_INTR_GATE:
		case ACC_TRAP_GATE:
			real.gate.offset_low16 = (uint16_t)(fakep->offset64 & 0xFFFF);
			real.gate.selector16 = fakep->lim_or_seg & 0xFFFF;
			real.gate.IST = fakep->size_or_IST & 0x7;
			real.gate.access8 = fakep->access;
			real.gate.offset_high16 = (uint16_t)((fakep->offset64>>16) & 0xFFFF);
			real.gate.offset_top32 = (uint32_t)(fakep->offset64>>32);
			break;
		default:	/* Otherwise */
			real.desc.limit_low16 = fakep->lim_or_seg & 0xFFFF;
			real.desc.base_low16 = (uint16_t)(fakep->offset64 & 0xFFFF);
			real.desc.base_med8 = (uint8_t)((fakep->offset64 >> 16) & 0xFF);
			real.desc.access8 = fakep->access;
			real.desc.limit_high4 = (fakep->lim_or_seg >> 16) & 0xFF;
			real.desc.granularity4 = fakep->size_or_IST;
			real.desc.base_high8 = (uint8_t)((fakep->offset64 >> 24) & 0xFF);
			real.desc.base_top32 = (uint32_t)(fakep->offset64>>32);
		}

		/*
		 * Now copy back over the fake structure.
		 */
		bcopy((void *) &real, (void *) fakep, sizeof(real));
	}
}

static void
cpu_gdt_alias(vm_map_offset_t gdt, vm_map_offset_t alias)
{
	pt_entry_t *pte = NULL;

	/* Require page alignment */
	assert(page_aligned(gdt));
	assert(page_aligned(alias));

	pte = pmap_pte(kernel_pmap, alias);
	pmap_store_pte(pte, kvtophys(gdt) | INTEL_PTE_REF
					  | INTEL_PTE_MOD
					  | INTEL_PTE_WIRED
					  | INTEL_PTE_VALID
					  | INTEL_PTE_WRITE
					  | INTEL_PTE_NX);

	/* TLB flush unnecessary because the target processor isn't running yet */
}


void
cpu_desc_init64(cpu_data_t *cdp)
{
	cpu_desc_index_t	*cdi = &cdp->cpu_desc_index;

	if (cdp == &cpu_data_master) {
		/*
		 * Master CPU uses the tables built at boot time.
		 * Just set the index pointers to the low memory space.
		 */
		cdi->cdi_ktss = (void *)&master_ktss64;
		cdi->cdi_sstk = (vm_offset_t) &master_sstk.top;
		cdi->cdi_gdt.ptr  = (void *)MASTER_GDT_ALIAS;
		cdi->cdi_idt.ptr  = (void *)MASTER_IDT_ALIAS;
		cdi->cdi_ldt  = (struct fake_descriptor *) master_ldt;

		/* Replace the expanded LDTs and TSS slots in the GDT */
		kernel_ldt_desc64.offset64 = (uintptr_t) &master_ldt;
		*(struct fake_descriptor64 *) &master_gdt[sel_idx(KERNEL_LDT)] =
			kernel_ldt_desc64;
		*(struct fake_descriptor64 *) &master_gdt[sel_idx(USER_LDT)] =
			kernel_ldt_desc64;
		kernel_tss_desc64.offset64 = (uintptr_t) &master_ktss64;
		*(struct fake_descriptor64 *) &master_gdt[sel_idx(KERNEL_TSS)] =
			kernel_tss_desc64;

		/* Fix up the expanded descriptors for 64-bit. */
		fix_desc64((void *) &master_idt64, IDTSZ);
		fix_desc64((void *) &master_gdt[sel_idx(KERNEL_LDT)], 1);
		fix_desc64((void *) &master_gdt[sel_idx(USER_LDT)], 1);
		fix_desc64((void *) &master_gdt[sel_idx(KERNEL_TSS)], 1);

		/*
		 * Set the NMI/fault stacks as IST2/IST1 in the 64-bit TSS
		 * Note: this will be dynamically re-allocated in VM later.
		 */
		master_ktss64.ist2 = (uintptr_t) low_eintstack;
		master_ktss64.ist1 = (uintptr_t) low_eintstack
					- sizeof(x86_64_intr_stack_frame_t);

	} else if (cdi->cdi_ktss == NULL) {	/* Skipping re-init on wake */
		cpu_desc_table64_t	*cdt = (cpu_desc_table64_t *) cdp->cpu_desc_tablep;

		/*
		 * Per-cpu GDT, IDT, KTSS descriptors are allocated in kernel
		 * heap (cpu_desc_table).
		 * LDT descriptors are mapped into a separate area.
		 * GDT descriptors are addressed by alias to avoid sgdt leaks to user-space.
		 */
		cdi->cdi_idt.ptr  = (void *)MASTER_IDT_ALIAS;
		cdi->cdi_gdt.ptr  = (void *)CPU_GDT_ALIAS(cdp->cpu_number);
		cdi->cdi_ktss = (void *)&cdt->ktss;
		cdi->cdi_sstk = (vm_offset_t)&cdt->sstk.top;
		cdi->cdi_ldt  = cdp->cpu_ldtp;

		/* Make the virtual alias address for the GDT */
		cpu_gdt_alias((vm_map_offset_t) &cdt->gdt,
			      (vm_map_offset_t) cdi->cdi_gdt.ptr);

		/*
		 * Copy the tables
		 */
		bcopy((char *)master_gdt, (char *)cdt->gdt, sizeof(master_gdt));
		bcopy((char *)master_ldt, (char *)cdp->cpu_ldtp, sizeof(master_ldt));
		bcopy((char *)&master_ktss64, (char *)&cdt->ktss, sizeof(struct x86_64_tss));

		/*
		 * Fix up the entries in the GDT to point to
		 * this LDT and this TSS.
		 */
		kernel_ldt_desc64.offset64 = (uintptr_t) cdi->cdi_ldt;
		*(struct fake_descriptor64 *) &cdt->gdt[sel_idx(KERNEL_LDT)] =
			kernel_ldt_desc64;
		fix_desc64(&cdt->gdt[sel_idx(KERNEL_LDT)], 1);

		kernel_ldt_desc64.offset64 = (uintptr_t) cdi->cdi_ldt;
		*(struct fake_descriptor64 *) &cdt->gdt[sel_idx(USER_LDT)] =
			kernel_ldt_desc64;
		fix_desc64(&cdt->gdt[sel_idx(USER_LDT)], 1);

		kernel_tss_desc64.offset64 = (uintptr_t) cdi->cdi_ktss;
		*(struct fake_descriptor64 *) &cdt->gdt[sel_idx(KERNEL_TSS)] =
			kernel_tss_desc64;
		fix_desc64(&cdt->gdt[sel_idx(KERNEL_TSS)], 1);

		/* Set (zeroed) fault stack as IST1, NMI intr stack IST2 */
		bzero((void *) cdt->fstk, sizeof(cdt->fstk));
		cdt->ktss.ist2 = (unsigned long)cdt->fstk + sizeof(cdt->fstk);
		cdt->ktss.ist1 = cdt->ktss.ist2
					- sizeof(x86_64_intr_stack_frame_t);
	}

	/* Require that the top of the sysenter stack is 16-byte aligned */
	if ((cdi->cdi_sstk % 16) != 0)
		panic("cpu_desc_init64() sysenter stack not 16-byte aligned");
}


void
cpu_desc_load64(cpu_data_t *cdp)
{
	cpu_desc_index_t	*cdi = &cdp->cpu_desc_index;

	/* Stuff the kernel per-cpu data area address into the MSRs */
	wrmsr64(MSR_IA32_GS_BASE, (uintptr_t) cdp);
	wrmsr64(MSR_IA32_KERNEL_GS_BASE, (uintptr_t) cdp);

	/*
	 * Ensure the TSS segment's busy bit is clear. This is required
	 * for the case of reloading descriptors at wake to avoid
	 * their complete re-initialization.
	 */
	gdt_desc_p(KERNEL_TSS)->access &= ~ACC_TSS_BUSY;

	/* Load the GDT, LDT, IDT and TSS */
	cdi->cdi_gdt.size = sizeof(struct real_descriptor)*GDTSZ - 1;
	cdi->cdi_idt.size = 0x1000 + cdp->cpu_number;
	lgdt((uintptr_t *) &cdi->cdi_gdt);
	lidt((uintptr_t *) &cdi->cdi_idt);
	lldt(KERNEL_LDT);
	set_tr(KERNEL_TSS);

#if GPROF // Hack to enable mcount to work on K64
	__asm__ volatile("mov %0, %%gs" : : "rm" ((unsigned short)(KERNEL_DS)));
#endif
}


/*
 * Set MSRs for sysenter/sysexit and syscall/sysret for 64-bit.
 */
static void
fast_syscall_init64(__unused cpu_data_t *cdp)
{
	wrmsr64(MSR_IA32_SYSENTER_CS, SYSENTER_CS);
	wrmsr64(MSR_IA32_SYSENTER_EIP, (uintptr_t) hi64_sysenter);
	wrmsr64(MSR_IA32_SYSENTER_ESP, current_sstk());
	/* Enable syscall/sysret */
	wrmsr64(MSR_IA32_EFER, rdmsr64(MSR_IA32_EFER) | MSR_IA32_EFER_SCE);

	/*
	 * MSRs for 64-bit syscall/sysret
	 * Note USER_CS because sysret uses this + 16 when returning to
	 * 64-bit code.
	 */
	wrmsr64(MSR_IA32_LSTAR, (uintptr_t) hi64_syscall);
	wrmsr64(MSR_IA32_STAR, (((uint64_t)USER_CS) << 48) |
				(((uint64_t)KERNEL64_CS) << 32));
	/*
	 * Emulate eflags cleared by sysenter but note that
	 * we also clear the trace trap to avoid the complications
	 * of single-stepping into a syscall. The nested task bit
	 * is also cleared to avoid a spurious "task switch"
	 * should we choose to return via an IRET.
	 */
	wrmsr64(MSR_IA32_FMASK, EFL_DF|EFL_IF|EFL_TF|EFL_NT);

}
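/*
 * For reference (architectural behavior, not code in this file): syscall
 * loads CS from STAR[47:32] and SS from that value plus 8, i.e. KERNEL64_CS
 * and the kernel stack selector that follows it in the GDT; a 64-bit sysret
 * loads CS from STAR[63:48] plus 16 and SS from STAR[63:48] plus 8, with
 * RPL forced to 3.  With USER_CS in the high word as above, this presumes
 * the GDT places the user data selector at USER_CS + 8 and the 64-bit user
 * code selector at USER_CS + 16.
 */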


cpu_data_t *
cpu_data_alloc(boolean_t is_boot_cpu)
{
	int		ret;
	cpu_data_t	*cdp;

	if (is_boot_cpu) {
		assert(real_ncpus == 1);
		cdp = cpu_datap(0);
		if (cdp->cpu_processor == NULL) {
			simple_lock_init(&ncpus_lock, 0);
			cdp->cpu_processor = cpu_processor_alloc(TRUE);
#if NCOPY_WINDOWS > 0
			cdp->cpu_pmap = pmap_cpu_alloc(TRUE);
#endif
		}
		return cdp;
	}

	/*
	 * Allocate per-cpu data:
	 */
	ret = kmem_alloc(kernel_map, (vm_offset_t *) &cdp, sizeof(cpu_data_t));
	if (ret != KERN_SUCCESS) {
		printf("cpu_data_alloc() failed, ret=%d\n", ret);
		goto abort;
	}
	bzero((void*) cdp, sizeof(cpu_data_t));
	cdp->cpu_this = cdp;

	/*
	 * Allocate interrupt stack:
	 */
	ret = kmem_alloc(kernel_map,
			 (vm_offset_t *) &cdp->cpu_int_stack_top,
			 INTSTACK_SIZE);
	if (ret != KERN_SUCCESS) {
		printf("cpu_data_alloc() int stack failed, ret=%d\n", ret);
		goto abort;
	}
	bzero((void*) cdp->cpu_int_stack_top, INTSTACK_SIZE);
	cdp->cpu_int_stack_top += INTSTACK_SIZE;

	/*
	 * Allocate descriptor table:
	 */
	ret = kmem_alloc(kernel_map,
			 (vm_offset_t *) &cdp->cpu_desc_tablep,
			 sizeof(cpu_desc_table64_t));
	if (ret != KERN_SUCCESS) {
		printf("cpu_data_alloc() desc_table failed, ret=%d\n", ret);
		goto abort;
	}

	/*
	 * Allocate LDT
	 */
	ret = kmem_alloc(kernel_map,
			 (vm_offset_t *) &cdp->cpu_ldtp,
			 sizeof(struct real_descriptor) * LDTSZ);
	if (ret != KERN_SUCCESS) {
		printf("cpu_data_alloc() ldt failed, ret=%d\n", ret);
		goto abort;
	}

#if CONFIG_MCA
	/* Machine-check shadow register allocation. */
	mca_cpu_alloc(cdp);
#endif

	simple_lock(&ncpus_lock);

	cpu_data_ptr[real_ncpus] = cdp;
	cdp->cpu_number = real_ncpus;
	real_ncpus++;
	simple_unlock(&ncpus_lock);

	/*
	 * Before this cpu has been assigned a real thread context,
	 * we give it a fake, unique, non-zero thread id which the locking
	 * primitives use as their lock value.
	 * Note that this does not apply to the boot processor, cpu 0, which
	 * transitions to a thread context well before other processors are
	 * started.
	 */
	cdp->cpu_active_thread = (thread_t) (uintptr_t) cdp->cpu_number;

	cdp->cpu_nanotime = &pal_rtc_nanotime_info;

	kprintf("cpu_data_alloc(%d) %p desc_table: %p "
		"ldt: %p "
		"int_stack: 0x%lx-0x%lx\n",
		cdp->cpu_number, cdp, cdp->cpu_desc_tablep, cdp->cpu_ldtp,
		(long)(cdp->cpu_int_stack_top - INTSTACK_SIZE), (long)(cdp->cpu_int_stack_top));

	return cdp;

abort:
	if (cdp) {
		if (cdp->cpu_desc_tablep)
			kfree((void *) cdp->cpu_desc_tablep,
				sizeof(cpu_desc_table64_t));
		if (cdp->cpu_int_stack_top)
			kfree((void *) (cdp->cpu_int_stack_top - INTSTACK_SIZE),
				INTSTACK_SIZE);
		kfree((void *) cdp, sizeof(*cdp));
	}
	return NULL;
}

boolean_t
valid_user_data_selector(uint16_t selector)
{
    sel_t	sel = selector_to_sel(selector);

    if (selector == 0)
    	return (TRUE);

    if (sel.ti == SEL_LDT)
	return (TRUE);
    else if (sel.index < GDTSZ) {
	if ((gdt_desc_p(selector)->access & ACC_PL_U) == ACC_PL_U)
	    return (TRUE);
    }

    return (FALSE);
}

boolean_t
valid_user_code_selector(uint16_t selector)
{
    sel_t	sel = selector_to_sel(selector);

    if (selector == 0)
    	return (FALSE);

    if (sel.ti == SEL_LDT) {
	if (sel.rpl == USER_PRIV)
	    return (TRUE);
    }
    else if (sel.index < GDTSZ && sel.rpl == USER_PRIV) {
	if ((gdt_desc_p(selector)->access & ACC_PL_U) == ACC_PL_U)
	    return (TRUE);
    }

    return (FALSE);
}

boolean_t
valid_user_stack_selector(uint16_t selector)
{
    sel_t	sel = selector_to_sel(selector);

    if (selector == 0)
    	return (FALSE);

    if (sel.ti == SEL_LDT) {
	if (sel.rpl == USER_PRIV)
	    return (TRUE);
    }
    else if (sel.index < GDTSZ && sel.rpl == USER_PRIV) {
	if ((gdt_desc_p(selector)->access & ACC_PL_U) == ACC_PL_U)
	    return (TRUE);
    }

    return (FALSE);
}

boolean_t
valid_user_segment_selectors(uint16_t cs,
		uint16_t ss,
		uint16_t ds,
		uint16_t es,
		uint16_t fs,
		uint16_t gs)
{
	return valid_user_code_selector(cs)  &&
		valid_user_stack_selector(ss) &&
		valid_user_data_selector(ds)  &&
		valid_user_data_selector(es)  &&
		valid_user_data_selector(fs)  &&
		valid_user_data_selector(gs);
}
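
/*
 * Illustrative example: a selector value of 0x2B decomposes (per the x86
 * selector format mirrored by sel_t) into index 5, ti = SEL_LDT and
 * rpl = USER_PRIV (3), so it would be accepted by all three checks above.
 * A kernel GDT selector such as KERNEL64_CS fails them: its rpl is 0 and
 * its descriptor is not marked ACC_PL_U.
 */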

#if NCOPY_WINDOWS > 0

static vm_offset_t user_window_base = 0;

void
cpu_userwindow_init(int cpu)
{
	cpu_data_t		*cdp = cpu_data_ptr[cpu];
	vm_offset_t 		user_window;
	vm_offset_t 		vaddr;
	int			num_cpus;

	num_cpus = ml_get_max_cpus();

	if (cpu >= num_cpus)
		panic("cpu_userwindow_init: cpu >= num_cpus");

	if (user_window_base == 0) {

		if (vm_allocate(kernel_map, &vaddr,
					(NBPDE * NCOPY_WINDOWS * num_cpus) + NBPDE,
					VM_FLAGS_ANYWHERE) != KERN_SUCCESS)
			panic("cpu_userwindow_init: "
					"couldn't allocate user map window");

		/*
		 * window must start on a page table boundary
		 * in the virtual address space
		 */
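		/*
		 * For example (hypothetical numbers, assuming a 4 MB NBPDE):
		 * a vaddr of 0x00C01000 rounds up to a user_window_base of
		 * 0x01000000, the next page-directory-entry boundary.
		 */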
		user_window_base = (vaddr + (NBPDE - 1)) & ~(NBPDE - 1);

		/*
		 * get rid of any allocation leading up to our
		 * starting boundary
		 */
		vm_deallocate(kernel_map, vaddr, user_window_base - vaddr);

		/*
		 * get rid of tail that we don't need
		 */
		user_window = user_window_base +
					(NBPDE * NCOPY_WINDOWS * num_cpus);

		vm_deallocate(kernel_map, user_window,
				(vaddr +
				 ((NBPDE * NCOPY_WINDOWS * num_cpus) + NBPDE)) -
				 user_window);
	}

	user_window = user_window_base + (cpu * NCOPY_WINDOWS * NBPDE);

	cdp->cpu_copywindow_base = user_window;
	/*
	 * Abuse this pdp entry: the pdp now actually points to
	 * an array of copy window addresses.
	 */
	cdp->cpu_copywindow_pdp  = pmap_pde(kernel_pmap, user_window);

}

void
cpu_physwindow_init(int cpu)
{
	cpu_data_t		*cdp = cpu_data_ptr[cpu];
	vm_offset_t 		phys_window = cdp->cpu_physwindow_base;

	if (phys_window == 0) {
		if (vm_allocate(kernel_map, &phys_window,
				PAGE_SIZE, VM_FLAGS_ANYWHERE)
				!= KERN_SUCCESS)
			panic("cpu_physwindow_init: "
				"couldn't allocate phys map window");

		/*
		 * make sure the page that encompasses the
		 * pte pointer we're interested in actually
		 * exists in the page table
		 */
		pmap_expand(kernel_pmap, phys_window, PMAP_EXPAND_OPTIONS_NONE);

		cdp->cpu_physwindow_base = phys_window;
		cdp->cpu_physwindow_ptep = vtopte(phys_window);
	}
}
#endif /* NCOPY_WINDOWS > 0 */

/*
 * Configure fast system call handling (sysenter/syscall MSRs) for the
 * current processor.
 */
void
cpu_mode_init(cpu_data_t *cdp)
{
	fast_syscall_init64(cdp);
}

/*
 * Allocate a new interrupt stack for the boot processor from the
 * heap rather than continue to use the statically allocated space.
 * Also switch to a dynamically allocated cpu data area.
 */
void
cpu_data_realloc(void)
{
	int		ret;
	vm_offset_t	istk;
	vm_offset_t	fstk;
	cpu_data_t	*cdp;
	boolean_t	istate;

	ret = kmem_alloc(kernel_map, &istk, INTSTACK_SIZE);
	if (ret != KERN_SUCCESS) {
		panic("cpu_data_realloc() stack alloc, ret=%d\n", ret);
	}
	bzero((void*) istk, INTSTACK_SIZE);
	istk += INTSTACK_SIZE;

	ret = kmem_alloc(kernel_map, (vm_offset_t *) &cdp, sizeof(cpu_data_t));
	if (ret != KERN_SUCCESS) {
		panic("cpu_data_realloc() cpu data alloc, ret=%d\n", ret);
	}

	/* Copy old contents into new area and make fix-ups */
	assert(cpu_number() == 0);
	bcopy((void *) cpu_data_ptr[0], (void*) cdp, sizeof(cpu_data_t));
	cdp->cpu_this = cdp;
	cdp->cpu_int_stack_top = istk;
	timer_call_queue_init(&cdp->rtclock_timer.queue);

	/* Allocate the separate fault stack */
	ret = kmem_alloc(kernel_map, &fstk, PAGE_SIZE);
	if (ret != KERN_SUCCESS) {
		panic("cpu_data_realloc() fault stack alloc, ret=%d\n", ret);
	}
	bzero((void*) fstk, PAGE_SIZE);
	fstk += PAGE_SIZE;

	/*
	 * With interrupts disabled, commit the new areas.
	 */
	istate = ml_set_interrupts_enabled(FALSE);
	cpu_data_ptr[0] = cdp;
	master_ktss64.ist2 = (uintptr_t) fstk;
	master_ktss64.ist1 = (uintptr_t) fstk
				- sizeof(x86_64_intr_stack_frame_t);
	wrmsr64(MSR_IA32_GS_BASE, (uintptr_t) cdp);
	wrmsr64(MSR_IA32_KERNEL_GS_BASE, (uintptr_t) cdp);
	(void) ml_set_interrupts_enabled(istate);

	kprintf("Reallocated master cpu data: %p,"
		" interrupt stack: %p, fault stack: %p\n",
		(void *) cdp, (void *) istk, (void *) fstk);
}