/*
 * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * @OSF_COPYRIGHT@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */

/*
 */

#include <kern/cpu_number.h>
#include <kern/kalloc.h>
#include <kern/cpu_data.h>
#include <mach/mach_types.h>
#include <mach/machine.h>
#include <kern/etimer.h>
#include <mach/vm_map.h>
#include <mach/machine/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_map.h>

#include <i386/lock.h>
#include <i386/mp_desc.h>
#include <i386/misc_protos.h>
#include <i386/mp.h>
#include <i386/pmap.h>
#if defined(__i386__)
#include <i386/pmap_internal.h>
#endif /* i386 */
#if CONFIG_MCA
#include <i386/machine_check.h>
#endif

#include <kern/misc_protos.h>

#ifdef __x86_64__
#define K_INTR_GATE (ACC_P|ACC_PL_K|ACC_INTR_GATE)
#define U_INTR_GATE (ACC_P|ACC_PL_U|ACC_INTR_GATE)

// Declare macros that will declare the externs
#define TRAP(n, name)		extern void *name ;
#define TRAP_ERR(n, name)	extern void *name ;
#define TRAP_SPC(n, name)	extern void *name ;
#define TRAP_IST(n, name)	extern void *name ;
#define INTERRUPT(n)		extern void *_intr_ ## n ;
#define USER_TRAP(n, name)	extern void *name ;
#define USER_TRAP_SPC(n, name)	extern void *name ;

// Include the table to declare the externs
#include "../x86_64/idt_table.h"

// Undef the macros, then redefine them so we can declare the table
#undef TRAP
#undef TRAP_ERR
#undef TRAP_SPC
#undef TRAP_IST
#undef INTERRUPT
#undef USER_TRAP
#undef USER_TRAP_SPC

#define TRAP(n, name)			\
	[n] = {				\
		(uintptr_t)&name,	\
		KERNEL64_CS,		\
		0,			\
		K_INTR_GATE,		\
		0			\
	},

#define TRAP_ERR TRAP
#define TRAP_SPC TRAP

#define TRAP_IST(n, name) \
	[n] = {				\
		(uintptr_t)&name,	\
		KERNEL64_CS,		\
		1,			\
		K_INTR_GATE,		\
		0			\
	},

#define INTERRUPT(n) \
	[n] = {				\
		(uintptr_t)&_intr_ ## n,\
		KERNEL64_CS,		\
		0,			\
		K_INTR_GATE,		\
		0			\
	},

#define USER_TRAP(n, name) \
	[n] = {				\
		(uintptr_t)&name,	\
		KERNEL64_CS,		\
		0,			\
		U_INTR_GATE,		\
		0			\
	},

#define USER_TRAP_SPC USER_TRAP

// Declare the table using the macros we just set up
struct fake_descriptor64 master_idt64[IDTSZ]
	__attribute__ ((section("__HIB,__desc")))
	__attribute__ ((aligned(PAGE_SIZE))) = {
#include "../x86_64/idt_table.h"
};
#endif
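/*
 * Illustration only (not part of the build): with the redefinitions above,
 * an idt_table.h entry of the form TRAP(0x00, some_handler) -- the handler
 * name here is hypothetical -- expands to a designated initializer:
 *
 *	[0x00] = {
 *		(uintptr_t)&some_handler,	handler entry point
 *		KERNEL64_CS,			kernel code selector
 *		0,				IST index (0 = none)
 *		K_INTR_GATE,			present, kernel, interrupt gate
 *		0
 *	},
 *
 * so including idt_table.h twice first declares the handler externs and
 * then fills master_idt64 with one fake_descriptor64 per vector.
 */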

/*
 * The i386 needs an interrupt stack to keep the PCB stack from being
 * overrun by interrupts.  All interrupt stacks MUST lie at lower addresses
 * than any thread`s kernel stack.
 */

/*
 * First cpu`s interrupt stack.
 */
extern uint32_t		low_intstack[];		/* bottom */
extern uint32_t		low_eintstack[];	/* top */

/*
 * Per-cpu data area pointers.
 * The master cpu (cpu 0) has its data area statically allocated;
 * others are allocated dynamically and this array is updated at runtime.
 */
cpu_data_t	cpu_data_master = {
	.cpu_this = &cpu_data_master,
	.cpu_nanotime = &pal_rtc_nanotime_info,
	.cpu_int_stack_top = (vm_offset_t) low_eintstack,
#ifdef __i386__
	.cpu_is64bit = FALSE,
#else
	.cpu_is64bit = TRUE
#endif
};
cpu_data_t	*cpu_data_ptr[MAX_CPUS] = { [0] = &cpu_data_master };

decl_simple_lock_data(,ncpus_lock);	/* protects real_ncpus */
unsigned int	real_ncpus = 1;
unsigned int	max_ncpus = MAX_CPUS;

#ifdef __i386__
extern void *hi_remap_text;
#define HI_TEXT(lo_text)	\
	(((uint32_t)&lo_text - (uint32_t)&hi_remap_text) + HIGH_MEM_BASE)

extern void hi_sysenter(void);

typedef struct {
	uint16_t	length;
	uint32_t	offset[2];
} __attribute__((__packed__)) table_descriptor64_t;

extern	table_descriptor64_t	gdtptr64;
extern	table_descriptor64_t	idtptr64;
#endif
extern void hi64_sysenter(void);
extern void hi64_syscall(void);

#if defined(__x86_64__) && !defined(UBER64)
#define UBER64(x) ((uintptr_t)x)
#endif

/*
 * Multiprocessor i386/i486 systems use a separate copy of the
 * GDT, IDT, LDT, and kernel TSS per processor.  The first three
 * are separate to avoid lock contention: the i386 uses locked
 * memory cycles to access the descriptor tables.  The TSS is
 * separate since each processor needs its own kernel stack,
 * and since using a TSS marks it busy.
 */

/*
 * Allocate and initialize the per-processor descriptor tables.
 */

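/*
 * The *_desc_pattern templates below are written in "fake" descriptor
 * format with positional initializers: the base/offset first, then the
 * limit, then the size/granularity bits, then the access byte. A pattern
 * is copied, its offset filled in, and the result converted to hardware
 * layout with fix_desc() before being installed in a GDT slot (see
 * cpu_desc_init() and cpu_userwindow_init() below).
 */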
struct fake_descriptor ldt_desc_pattern = {
	(unsigned int) 0,
	LDTSZ_MIN * sizeof(struct fake_descriptor) - 1,
	0,
	ACC_P|ACC_PL_K|ACC_LDT
};

struct fake_descriptor tss_desc_pattern = {
	(unsigned int) 0,
	sizeof(struct i386_tss) - 1,
	0,
	ACC_P|ACC_PL_K|ACC_TSS
};

struct fake_descriptor cpudata_desc_pattern = {
	(unsigned int) 0,
	sizeof(cpu_data_t)-1,
	SZ_32,
	ACC_P|ACC_PL_K|ACC_DATA_W
};

#if	NCOPY_WINDOWS > 0
struct fake_descriptor userwindow_desc_pattern = {
	(unsigned int) 0,
	((NBPDE * NCOPY_WINDOWS) / PAGE_SIZE) - 1,
	SZ_32 | SZ_G,
	ACC_P|ACC_PL_U|ACC_DATA_W
};
#endif

struct fake_descriptor physwindow_desc_pattern = {
	(unsigned int) 0,
	PAGE_SIZE - 1,
	SZ_32,
	ACC_P|ACC_PL_K|ACC_DATA_W
};

/*
 * This is the expanded, 64-bit variant of the kernel LDT descriptor.
 * When switching to 64-bit mode this replaces the KERNEL_LDT entry
 * and the following empty slot. This enables the LDT to be referenced
 * in the uber-space remapping window of the kernel.
 */
struct fake_descriptor64 kernel_ldt_desc64 = {
	0,
	LDTSZ_MIN*sizeof(struct fake_descriptor)-1,
	0,
	ACC_P|ACC_PL_K|ACC_LDT,
	0
};

/*
 * This is the expanded, 64-bit variant of the kernel TSS descriptor.
 * It follows the pattern of the KERNEL_LDT.
 */
struct fake_descriptor64 kernel_tss_desc64 = {
	0,
	sizeof(struct x86_64_tss)-1,
	0,
	ACC_P|ACC_PL_K|ACC_TSS,
	0
};

/*
 * Convert a descriptor from fake to real format.
 *
 * Fake descriptor format:
 *	bytes 0..3		base 31..0
 *	bytes 4..5		limit 15..0
 *	byte  6			access byte 2 | limit 19..16
 *	byte  7			access byte 1
 *
 * Real descriptor format:
 *	bytes 0..1		limit 15..0
 *	bytes 2..3		base 15..0
 *	byte  4			base 23..16
 *	byte  5			access byte 1
 *	byte  6			access byte 2 | limit 19..16
 *	byte  7			base 31..24
 *
 * Fake gate format:
 *	bytes 0..3		offset
 *	bytes 4..5		selector
 *	byte  6			word count << 4 (to match fake descriptor)
 *	byte  7			access byte 1
 *
 * Real gate format:
 *	bytes 0..1		offset 15..0
 *	bytes 2..3		selector
 *	byte  4			word count
 *	byte  5			access byte 1
 *	bytes 6..7		offset 31..16
 */
void
fix_desc(void *d, int num_desc) {
	//early_kprintf("fix_desc(%x, %x)\n", d, num_desc);
	uint8_t *desc = (uint8_t*) d;

	do {
		if ((desc[7] & 0x14) == 0x04) { /* gate */
			uint32_t offset;
			uint16_t selector;
			uint8_t wordcount;
			uint8_t acc;

			offset = *((uint32_t*)(desc));
			selector = *((uint32_t*)(desc+4));
			wordcount = desc[6] >> 4;
			acc = desc[7];

			*((uint16_t*)desc) = offset & 0xFFFF;
			*((uint16_t*)(desc+2)) = selector;
			desc[4] = wordcount;
			desc[5] = acc;
			*((uint16_t*)(desc+6)) = offset >> 16;

		} else { /* descriptor */
			uint32_t base;
			uint16_t limit;
			uint8_t acc1, acc2;

			base = *((uint32_t*)(desc));
			limit = *((uint16_t*)(desc+4));
			acc2 = desc[6];
			acc1 = desc[7];

			*((uint16_t*)(desc)) = limit;
			*((uint16_t*)(desc+2)) = base & 0xFFFF;
			desc[4] = (base >> 16) & 0xFF;
			desc[5] = acc1;
			desc[6] = acc2;
			desc[7] = base >> 24;
		}
		desc += 8;
	} while (--num_desc);
}
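#if 0
/*
 * Worked example (illustrative sketch, never compiled): the same
 * pattern-then-fix idiom used in cpu_desc_init() below. A "fake"
 * descriptor is filled in field by field, then fix_desc() shuffles its
 * bytes in place into the hardware layout described above.
 */
static void
fix_desc_example(void)
{
	struct fake_descriptor example = cpudata_desc_pattern;

	example.offset = (vm_offset_t) &cpu_data_master;
	fix_desc(&example, 1);
	/*
	 * example is now in real format: bytes 0..1 hold the limit,
	 * bytes 2..4 and 7 the scattered base, bytes 5..6 the access
	 * bytes (with the high limit bits folded into byte 6), ready
	 * to be copied into a GDT slot.
	 */
}
#endif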

void
fix_desc64(void *descp, int count)
{
	struct fake_descriptor64	*fakep;
	union {
		struct real_gate64		gate;
		struct real_descriptor64	desc;
	}				real;
	int				i;

	fakep = (struct fake_descriptor64 *) descp;

	for (i = 0; i < count; i++, fakep++) {
		/*
		 * Construct the real descriptor locally.
		 */

		bzero((void *) &real, sizeof(real));

		switch (fakep->access & ACC_TYPE) {
		case 0:
			break;
		case ACC_CALL_GATE:
		case ACC_INTR_GATE:
		case ACC_TRAP_GATE:
			real.gate.offset_low16 = (uint16_t)(fakep->offset64 & 0xFFFF);
			real.gate.selector16 = fakep->lim_or_seg & 0xFFFF;
			real.gate.IST = fakep->size_or_IST & 0x7;
			real.gate.access8 = fakep->access;
			real.gate.offset_high16 = (uint16_t)((fakep->offset64>>16) & 0xFFFF);
			real.gate.offset_top32 = (uint32_t)(fakep->offset64>>32);
			break;
		default:	/* Otherwise */
			real.desc.limit_low16 = fakep->lim_or_seg & 0xFFFF;
			real.desc.base_low16 = (uint16_t)(fakep->offset64 & 0xFFFF);
			real.desc.base_med8 = (uint8_t)((fakep->offset64 >> 16) & 0xFF);
			real.desc.access8 = fakep->access;
			real.desc.limit_high4 = (fakep->lim_or_seg >> 16) & 0xFF;
			real.desc.granularity4 = fakep->size_or_IST;
			real.desc.base_high8 = (uint8_t)((fakep->offset64 >> 24) & 0xFF);
			real.desc.base_top32 = (uint32_t)(fakep->offset64>>32);
		}

		/*
		 * Now copy back over the fake structure.
		 */
		bcopy((void *) &real, (void *) fakep, sizeof(real));
	}
}
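#if 0
/*
 * Illustrative sketch (never compiled): expanding the 64-bit kernel TSS
 * descriptor. fix_desc64() scatters the flat offset64 into the
 * low16/high16/top32 pieces of the hardware gate format, or into the
 * base/limit pieces for ordinary descriptors such as this one.
 */
static void
fix_desc64_example(void)
{
	struct fake_descriptor64 tss = kernel_tss_desc64;

	tss.offset64 = UBER64(&master_ktss64);
	fix_desc64(&tss, 1);
	/*
	 * tss now has struct real_descriptor64 layout; cpu_desc_init64()
	 * below achieves the same effect by copying the fake descriptor
	 * into its GDT slot first and then fixing it in place there.
	 */
}
#endif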

#ifdef __i386__
void
cpu_desc_init(cpu_data_t *cdp)
{
	cpu_desc_index_t	*cdi = &cdp->cpu_desc_index;

	if (cdp == &cpu_data_master) {
		/*
		 * Fix up the entries in the GDT to point to
		 * this LDT and this TSS.
		 */
		struct fake_descriptor temp_fake_desc;
		temp_fake_desc = ldt_desc_pattern;
		temp_fake_desc.offset = (vm_offset_t) &master_ldt;
		fix_desc(&temp_fake_desc, 1);
		*(struct fake_descriptor *) &master_gdt[sel_idx(KERNEL_LDT)] =
			temp_fake_desc;
		*(struct fake_descriptor *) &master_gdt[sel_idx(USER_LDT)] =
			temp_fake_desc;

		temp_fake_desc = tss_desc_pattern;
		temp_fake_desc.offset = (vm_offset_t) &master_ktss;
		fix_desc(&temp_fake_desc, 1);
		*(struct fake_descriptor *) &master_gdt[sel_idx(KERNEL_TSS)] =
			temp_fake_desc;

		temp_fake_desc = cpudata_desc_pattern;
		temp_fake_desc.offset = (vm_offset_t) &cpu_data_master;
		fix_desc(&temp_fake_desc, 1);
		*(struct fake_descriptor *) &master_gdt[sel_idx(CPU_DATA_GS)] =
			temp_fake_desc;

		fix_desc((void *)&master_idt, IDTSZ);

		cdi->cdi_idt.ptr = master_idt;
		cdi->cdi_gdt.ptr = (void *)master_gdt;


		/*
		 * Master CPU uses the tables built at boot time.
		 * Just set the index pointers to the high shared-mapping space.
		 * Note that the sysenter stack uses empty space above the ktss
		 * in the HIGH_FIXED_KTSS page. In this case we don't map the
		 * real master_sstk in low memory.
		 */
		cdi->cdi_ktss = (struct i386_tss *)
			pmap_index_to_virt(HIGH_FIXED_KTSS);
		cdi->cdi_sstk = (vm_offset_t) (cdi->cdi_ktss + 1) +
				(vm_offset_t) &master_sstk.top -
				(vm_offset_t) &master_sstk;
	} else {
		cpu_desc_table_t	*cdt = (cpu_desc_table_t *) cdp->cpu_desc_tablep;

		vm_offset_t	cpu_hi_desc;

		cpu_hi_desc = pmap_cpu_high_shared_remap(
					cdp->cpu_number,
					HIGH_CPU_DESC,
					(vm_offset_t) cdt, 1);

		/*
		 * Per-cpu GDT, IDT, LDT, and KTSS descriptors are allocated in one
		 * block (cpu_desc_table) and double-mapped into high shared space
		 * in one page window.
		 * Also allocated there is a transient stack for the fast sysenter
		 * path, the top of which is set at context switch time to point
		 * to the PCB using the high address.
		 */
		cdi->cdi_gdt.ptr  = (struct fake_descriptor *) (cpu_hi_desc +
					offsetof(cpu_desc_table_t, gdt[0]));
		cdi->cdi_idt.ptr  = (struct fake_descriptor *) (cpu_hi_desc +
					offsetof(cpu_desc_table_t, idt[0]));
		cdi->cdi_ktss = (struct i386_tss *) (cpu_hi_desc +
					offsetof(cpu_desc_table_t, ktss));
		cdi->cdi_sstk = cpu_hi_desc + offsetof(cpu_desc_table_t, sstk.top);

		/*
		 * LDT descriptors are mapped into a separate area.
		 */
		cdi->cdi_ldt  = (struct fake_descriptor *)
				pmap_cpu_high_shared_remap(
					cdp->cpu_number,
					HIGH_CPU_LDT_BEGIN,
					(vm_offset_t) cdp->cpu_ldtp,
					HIGH_CPU_LDT_END - HIGH_CPU_LDT_BEGIN + 1);

		/*
		 * Copy the tables
		 */
		bcopy((char *)master_idt, (char *)cdt->idt, sizeof(master_idt));
		bcopy((char *)master_gdt, (char *)cdt->gdt, sizeof(master_gdt));
		bcopy((char *)master_ldt, (char *)cdp->cpu_ldtp, sizeof(master_ldt));
		bzero((char *)&cdt->ktss, sizeof(struct i386_tss));

		/*
		 * Fix up the entries in the GDT to point to
		 * this LDT and this TSS.
		 */
		struct fake_descriptor temp_ldt = ldt_desc_pattern;
		temp_ldt.offset = (vm_offset_t)cdi->cdi_ldt;
		fix_desc(&temp_ldt, 1);

		cdt->gdt[sel_idx(KERNEL_LDT)] = temp_ldt;
		cdt->gdt[sel_idx(USER_LDT)] = temp_ldt;

		cdt->gdt[sel_idx(KERNEL_TSS)] = tss_desc_pattern;
		cdt->gdt[sel_idx(KERNEL_TSS)].offset = (vm_offset_t) cdi->cdi_ktss;
		fix_desc(&cdt->gdt[sel_idx(KERNEL_TSS)], 1);

		cdt->gdt[sel_idx(CPU_DATA_GS)] = cpudata_desc_pattern;
		cdt->gdt[sel_idx(CPU_DATA_GS)].offset = (vm_offset_t) cdp;
		fix_desc(&cdt->gdt[sel_idx(CPU_DATA_GS)], 1);

		cdt->ktss.ss0 = KERNEL_DS;
		cdt->ktss.io_bit_map_offset = 0x0FFF;	/* no IO bitmap */

		cpu_userwindow_init(cdp->cpu_number);
		cpu_physwindow_init(cdp->cpu_number);

	}
}
#endif /* __i386__ */

void
cpu_desc_init64(cpu_data_t *cdp)
{
	cpu_desc_index_t	*cdi = &cdp->cpu_desc_index;

	if (cdp == &cpu_data_master) {
		/*
		 * Master CPU uses the tables built at boot time.
		 * Just set the index pointers to the low memory space.
		 */
		cdi->cdi_ktss = (void *)&master_ktss64;
		cdi->cdi_sstk = (vm_offset_t) &master_sstk.top;
#if __x86_64__
		cdi->cdi_gdt.ptr  = (void *)MASTER_GDT_ALIAS;
		cdi->cdi_idt.ptr  = (void *)MASTER_IDT_ALIAS;
#else
		cdi->cdi_gdt.ptr  = (void *)master_gdt;
		cdi->cdi_idt.ptr  = (void *)master_idt64;
#endif
		cdi->cdi_ldt  = (struct fake_descriptor *) master_ldt;

		/* Replace the expanded LDTs and TSS slots in the GDT */
		kernel_ldt_desc64.offset64 = UBER64(&master_ldt);
		*(struct fake_descriptor64 *) &master_gdt[sel_idx(KERNEL_LDT)] =
			kernel_ldt_desc64;
		*(struct fake_descriptor64 *) &master_gdt[sel_idx(USER_LDT)] =
			kernel_ldt_desc64;
		kernel_tss_desc64.offset64 = UBER64(&master_ktss64);
		*(struct fake_descriptor64 *) &master_gdt[sel_idx(KERNEL_TSS)] =
			kernel_tss_desc64;

		/* Fix up the expanded descriptors for 64-bit. */
		fix_desc64((void *) &master_idt64, IDTSZ);
		fix_desc64((void *) &master_gdt[sel_idx(KERNEL_LDT)], 1);
		fix_desc64((void *) &master_gdt[sel_idx(USER_LDT)], 1);
		fix_desc64((void *) &master_gdt[sel_idx(KERNEL_TSS)], 1);

		/*
		 * Set the double-fault stack as IST1 in the 64-bit TSS
		 */
#if __x86_64__
		master_ktss64.ist1 = (uintptr_t) low_eintstack;
#else
		master_ktss64.ist1 = UBER64((uintptr_t) df_task_stack_end);
#endif

	} else {
		cpu_desc_table64_t	*cdt = (cpu_desc_table64_t *) cdp->cpu_desc_tablep;
		/*
		 * Per-cpu GDT, IDT, KTSS descriptors are allocated in kernel
		 * heap (cpu_desc_table).
		 * LDT descriptors are mapped into a separate area.
		 */
#if __x86_64__
		cdi->cdi_idt.ptr  = (void *)MASTER_IDT_ALIAS;
#else
		cdi->cdi_idt.ptr  = (void *)cdt->idt;
#endif
		cdi->cdi_gdt.ptr  = (struct fake_descriptor *)cdt->gdt;
		cdi->cdi_ktss = (void *)&cdt->ktss;
		cdi->cdi_sstk = (vm_offset_t)&cdt->sstk.top;
		cdi->cdi_ldt  = cdp->cpu_ldtp;

		/*
		 * Copy the tables
		 */
#if !__x86_64__
		bcopy((char *)master_idt64, (char *)cdt->idt, sizeof(master_idt64));
#endif
		bcopy((char *)master_gdt, (char *)cdt->gdt, sizeof(master_gdt));
		bcopy((char *)master_ldt, (char *)cdp->cpu_ldtp, sizeof(master_ldt));
		bcopy((char *)&master_ktss64, (char *)&cdt->ktss, sizeof(struct x86_64_tss));

		/*
		 * Fix up the entries in the GDT to point to
		 * this LDT and this TSS.
		 */
		kernel_ldt_desc64.offset64 = UBER64(cdi->cdi_ldt);
		*(struct fake_descriptor64 *) &cdt->gdt[sel_idx(KERNEL_LDT)] =
			kernel_ldt_desc64;
		fix_desc64(&cdt->gdt[sel_idx(KERNEL_LDT)], 1);

		kernel_ldt_desc64.offset64 = UBER64(cdi->cdi_ldt);
		*(struct fake_descriptor64 *) &cdt->gdt[sel_idx(USER_LDT)] =
			kernel_ldt_desc64;
		fix_desc64(&cdt->gdt[sel_idx(USER_LDT)], 1);

		kernel_tss_desc64.offset64 = UBER64(cdi->cdi_ktss);
		*(struct fake_descriptor64 *) &cdt->gdt[sel_idx(KERNEL_TSS)] =
			kernel_tss_desc64;
		fix_desc64(&cdt->gdt[sel_idx(KERNEL_TSS)], 1);

		/* Set (zeroed) double-fault stack as IST1 */
		bzero((void *) cdt->dfstk, sizeof(cdt->dfstk));
		cdt->ktss.ist1 = UBER64((unsigned long)cdt->dfstk + sizeof(cdt->dfstk));
#ifdef __i386__
		cdt->gdt[sel_idx(CPU_DATA_GS)] = cpudata_desc_pattern;
		cdt->gdt[sel_idx(CPU_DATA_GS)].offset = (vm_offset_t) cdp;
		fix_desc(&cdt->gdt[sel_idx(CPU_DATA_GS)], 1);

		/* Allocate copyio windows */
		cpu_userwindow_init(cdp->cpu_number);
		cpu_physwindow_init(cdp->cpu_number);
#endif
	}

	/* Require that the top of the sysenter stack is 16-byte aligned */
	if ((cdi->cdi_sstk % 16) != 0)
		panic("cpu_desc_init64() sysenter stack not 16-byte aligned");
}

#ifdef __i386__
void
cpu_desc_load(cpu_data_t *cdp)
{
	cpu_desc_index_t	*cdi = &cdp->cpu_desc_index;

	cdi->cdi_idt.size = 0x1000 + cdp->cpu_number;
	cdi->cdi_gdt.size = sizeof(struct real_descriptor)*GDTSZ - 1;

	lgdt((uintptr_t *) &cdi->cdi_gdt);
	lidt((uintptr_t *) &cdi->cdi_idt);
	lldt(KERNEL_LDT);

	set_tr(KERNEL_TSS);

	__asm__ volatile("mov %0, %%gs" : : "rm" ((unsigned short)(CPU_DATA_GS)));
}
#endif /* __i386__ */

void
cpu_desc_load64(cpu_data_t *cdp)
{
	cpu_desc_index_t	*cdi = &cdp->cpu_desc_index;

#ifdef __i386__
	/*
	 * Load up the new descriptors etc
	 * ml_load_desc64() expects these global pseudo-descriptors:
	 *   gdtptr64 -> per-cpu gdt
	 *   idtptr64 -> per-cpu idt
	 * These are 10-byte descriptors with 64-bit addresses into
	 * uber-space.
	 *
	 * Refer to commpage/cpu_number.s for the IDT limit trick.
	 */
	gdtptr64.length = GDTSZ * sizeof(struct real_descriptor) - 1;
	gdtptr64.offset[0] = (uint32_t) cdi->cdi_gdt.ptr;
	gdtptr64.offset[1] = KERNEL_UBER_BASE_HI32;
	idtptr64.length = 0x1000 + cdp->cpu_number;
	idtptr64.offset[0] = (uint32_t) cdi->cdi_idt.ptr;
	idtptr64.offset[1] = KERNEL_UBER_BASE_HI32;

	/* Make sure busy bit is cleared in the TSS */
	gdt_desc_p(KERNEL_TSS)->access &= ~ACC_TSS_BUSY;

	ml_load_desc64();
#else
	/* Load the GDT, LDT, IDT and TSS */
	cdi->cdi_gdt.size = sizeof(struct real_descriptor)*GDTSZ - 1;
	cdi->cdi_idt.size = 0x1000 + cdp->cpu_number;
	lgdt((uintptr_t *) &cdi->cdi_gdt);
	lidt((uintptr_t *) &cdi->cdi_idt);
	lldt(KERNEL_LDT);
	set_tr(KERNEL_TSS);

	/* Stuff the kernel per-cpu data area address into the MSRs */
	wrmsr64(MSR_IA32_GS_BASE, (uintptr_t) cdp);
	wrmsr64(MSR_IA32_KERNEL_GS_BASE, (uintptr_t) cdp);

#if GPROF // Hack to enable mcount to work on K64
	__asm__ volatile("mov %0, %%gs" : : "rm" ((unsigned short)(KERNEL_DS)));
#endif
#endif
}

#ifdef __i386__
/*
 * Set MSRs for sysenter/sysexit for 32-bit.
 */
static void
fast_syscall_init(__unused cpu_data_t *cdp)
{
	wrmsr(MSR_IA32_SYSENTER_CS, SYSENTER_CS, 0);
	wrmsr(MSR_IA32_SYSENTER_EIP, HI_TEXT(hi_sysenter), 0);
	wrmsr(MSR_IA32_SYSENTER_ESP, current_sstk(), 0);
}
#endif

/*
 * Set MSRs for sysenter/sysexit and syscall/sysret for 64-bit.
 */
static void
fast_syscall_init64(__unused cpu_data_t *cdp)
{
	wrmsr64(MSR_IA32_SYSENTER_CS, SYSENTER_CS);
	wrmsr64(MSR_IA32_SYSENTER_EIP, UBER64((uintptr_t) hi64_sysenter));
	wrmsr64(MSR_IA32_SYSENTER_ESP, UBER64(current_sstk()));
	/* Enable syscall/sysret */
	wrmsr64(MSR_IA32_EFER, rdmsr64(MSR_IA32_EFER) | MSR_IA32_EFER_SCE);

	/*
	 * MSRs for 64-bit syscall/sysret
	 * Note USER_CS because sysret uses this + 16 when returning to
	 * 64-bit code.
	 */
	wrmsr64(MSR_IA32_LSTAR, UBER64((uintptr_t) hi64_syscall));
	wrmsr64(MSR_IA32_STAR, (((uint64_t)USER_CS) << 48) |
				(((uint64_t)KERNEL64_CS) << 32));
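	/*
	 * How the STAR layout above is consumed (per the architectural
	 * definition of syscall/sysret): bits 47:32 give the syscall CS,
	 * with SS taken as that selector + 8; bits 63:48 seed sysret,
	 * which loads SS from that selector + 8 and, when returning to
	 * 64-bit code, CS from that selector + 16. Placing USER_CS there
	 * therefore assumes the GDT places the user data and 64-bit user
	 * code selectors at USER_CS + 8 and USER_CS + 16 respectively.
	 */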
	/*
	 * Emulate eflags cleared by sysenter but note that
	 * we also clear the trace trap to avoid the complications
	 * of single-stepping into a syscall. The nested task bit
	 * is also cleared to avoid a spurious "task switch"
	 * should we choose to return via an IRET.
	 */
	wrmsr64(MSR_IA32_FMASK, EFL_DF|EFL_IF|EFL_TF|EFL_NT);

#ifdef __i386__
	/*
	 * Set the Kernel GS base MSR to point to per-cpu data in uber-space.
	 * The uber-space handler (hi64_syscall) uses the swapgs instruction.
	 */
	wrmsr64(MSR_IA32_KERNEL_GS_BASE, UBER64(cdp));

#if ONLY_SAFE_FOR_LINDA_SERIAL
	kprintf("fast_syscall_init64() KERNEL_GS_BASE=0x%016llx\n",
			rdmsr64(MSR_IA32_KERNEL_GS_BASE));
#endif
#endif
}


cpu_data_t *
cpu_data_alloc(boolean_t is_boot_cpu)
{
	int		ret;
	cpu_data_t	*cdp;

	if (is_boot_cpu) {
		assert(real_ncpus == 1);
		cdp = cpu_datap(0);
		if (cdp->cpu_processor == NULL) {
			simple_lock_init(&ncpus_lock, 0);
			cdp->cpu_processor = cpu_processor_alloc(TRUE);
#if NCOPY_WINDOWS > 0
			cdp->cpu_pmap = pmap_cpu_alloc(TRUE);
#endif
		}
		return cdp;
	}

	/*
	 * Allocate per-cpu data:
	 */
	ret = kmem_alloc(kernel_map, (vm_offset_t *) &cdp, sizeof(cpu_data_t));
	if (ret != KERN_SUCCESS) {
		printf("cpu_data_alloc() failed, ret=%d\n", ret);
		goto abort;
	}
	bzero((void*) cdp, sizeof(cpu_data_t));
	cdp->cpu_this = cdp;

	/* Propagate mode */
	cdp->cpu_is64bit = cpu_mode_is64bit();

	/*
	 * Allocate interrupt stack:
	 */
	ret = kmem_alloc(kernel_map,
			 (vm_offset_t *) &cdp->cpu_int_stack_top,
			 INTSTACK_SIZE);
	if (ret != KERN_SUCCESS) {
		printf("cpu_data_alloc() int stack failed, ret=%d\n", ret);
		goto abort;
	}
	bzero((void*) cdp->cpu_int_stack_top, INTSTACK_SIZE);
	cdp->cpu_int_stack_top += INTSTACK_SIZE;

	/*
	 * Allocate descriptor table:
	 * Size depends on cpu mode.
	 */

	ret = kmem_alloc(kernel_map,
			 (vm_offset_t *) &cdp->cpu_desc_tablep,
			 cdp->cpu_is64bit ? sizeof(cpu_desc_table64_t)
					  : sizeof(cpu_desc_table_t));
	if (ret != KERN_SUCCESS) {
		printf("cpu_data_alloc() desc_table failed, ret=%d\n", ret);
		goto abort;
	}

	/*
	 * Allocate LDT
	 */
	ret = kmem_alloc(kernel_map,
			 (vm_offset_t *) &cdp->cpu_ldtp,
			 sizeof(struct real_descriptor) * LDTSZ);
	if (ret != KERN_SUCCESS) {
		printf("cpu_data_alloc() ldt failed, ret=%d\n", ret);
		goto abort;
	}

#if CONFIG_MCA
	/* Machine-check shadow register allocation. */
	mca_cpu_alloc(cdp);
#endif

	simple_lock(&ncpus_lock);

	cpu_data_ptr[real_ncpus] = cdp;
	cdp->cpu_number = real_ncpus;
	real_ncpus++;
	simple_unlock(&ncpus_lock);

	cdp->cpu_nanotime = &pal_rtc_nanotime_info;

	kprintf("cpu_data_alloc(%d) %p desc_table: %p "
		"ldt: %p "
		"int_stack: 0x%lx-0x%lx\n",
		cdp->cpu_number, cdp, cdp->cpu_desc_tablep, cdp->cpu_ldtp,
		(long)(cdp->cpu_int_stack_top - INTSTACK_SIZE), (long)(cdp->cpu_int_stack_top));

	return cdp;

abort:
	if (cdp) {
		if (cdp->cpu_desc_tablep)
			kfree((void *) cdp->cpu_desc_tablep,
				sizeof(*cdp->cpu_desc_tablep));
		if (cdp->cpu_int_stack_top)
			kfree((void *) (cdp->cpu_int_stack_top - INTSTACK_SIZE),
				INTSTACK_SIZE);
		kfree((void *) cdp, sizeof(*cdp));
	}
	return NULL;
}

boolean_t
valid_user_data_selector(uint16_t selector)
{
    sel_t	sel = selector_to_sel(selector);

    if (selector == 0)
    	return (TRUE);

    if (sel.ti == SEL_LDT)
	return (TRUE);
    else if (sel.index < GDTSZ) {
	if ((gdt_desc_p(selector)->access & ACC_PL_U) == ACC_PL_U)
	    return (TRUE);
    }

    return (FALSE);
}

boolean_t
valid_user_code_selector(uint16_t selector)
{
    sel_t	sel = selector_to_sel(selector);

    if (selector == 0)
    	return (FALSE);

    if (sel.ti == SEL_LDT) {
	if (sel.rpl == USER_PRIV)
	    return (TRUE);
    }
    else if (sel.index < GDTSZ && sel.rpl == USER_PRIV) {
	if ((gdt_desc_p(selector)->access & ACC_PL_U) == ACC_PL_U)
	    return (TRUE);
    }

    return (FALSE);
}

boolean_t
valid_user_stack_selector(uint16_t selector)
{
    sel_t	sel = selector_to_sel(selector);

    if (selector == 0)
    	return (FALSE);

    if (sel.ti == SEL_LDT) {
	if (sel.rpl == USER_PRIV)
	    return (TRUE);
    }
    else if (sel.index < GDTSZ && sel.rpl == USER_PRIV) {
	if ((gdt_desc_p(selector)->access & ACC_PL_U) == ACC_PL_U)
	    return (TRUE);
    }

    return (FALSE);
}

boolean_t
valid_user_segment_selectors(uint16_t cs,
		uint16_t ss,
		uint16_t ds,
		uint16_t es,
		uint16_t fs,
		uint16_t gs)
{
	return valid_user_code_selector(cs)  &&
		valid_user_stack_selector(ss) &&
		valid_user_data_selector(ds)  &&
		valid_user_data_selector(es)  &&
		valid_user_data_selector(fs)  &&
		valid_user_data_selector(gs);
}

#if NCOPY_WINDOWS > 0

static vm_offset_t user_window_base = 0;

void
cpu_userwindow_init(int cpu)
{
	cpu_data_t		*cdp = cpu_data_ptr[cpu];
	vm_offset_t		user_window;
	vm_offset_t		vaddr;
	int			num_cpus;

	num_cpus = ml_get_max_cpus();

	if (cpu >= num_cpus)
		panic("cpu_userwindow_init: cpu > num_cpus");

	if (user_window_base == 0) {

		if (vm_allocate(kernel_map, &vaddr,
					(NBPDE * NCOPY_WINDOWS * num_cpus) + NBPDE,
					VM_FLAGS_ANYWHERE) != KERN_SUCCESS)
			panic("cpu_userwindow_init: "
					"couldn't allocate user map window");

		/*
		 * window must start on a page table boundary
		 * in the virtual address space
		 */
		user_window_base = (vaddr + (NBPDE - 1)) & ~(NBPDE - 1);
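		/*
		 * (Worked example, assuming 4MB page-directory entries so
		 * that NBPDE is 0x400000: a vaddr of 0x10234000 rounds up
		 * to 0x10400000 here. The actual NBPDE value depends on
		 * the paging mode in use.)
		 */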

		/*
		 * get rid of any allocation leading up to our
		 * starting boundary
		 */
		vm_deallocate(kernel_map, vaddr, user_window_base - vaddr);

		/*
		 * get rid of tail that we don't need
		 */
		user_window = user_window_base +
					(NBPDE * NCOPY_WINDOWS * num_cpus);

		vm_deallocate(kernel_map, user_window,
				(vaddr +
				 ((NBPDE * NCOPY_WINDOWS * num_cpus) + NBPDE)) -
				 user_window);
	}

	user_window = user_window_base + (cpu * NCOPY_WINDOWS * NBPDE);

	cdp->cpu_copywindow_base = user_window;
	/*
	 * Abuse this pdp entry, the pdp now actually points to
	 * an array of copy windows addresses.
	 */
	cdp->cpu_copywindow_pdp  = pmap_pde(kernel_pmap, user_window);

#ifdef __i386__
	cpu_desc_index_t	*cdi = &cdp->cpu_desc_index;
	cdi->cdi_gdt.ptr[sel_idx(USER_WINDOW_SEL)] = userwindow_desc_pattern;
	cdi->cdi_gdt.ptr[sel_idx(USER_WINDOW_SEL)].offset = user_window;

	fix_desc(&cdi->cdi_gdt.ptr[sel_idx(USER_WINDOW_SEL)], 1);
#endif /* __i386__ */
}

void
cpu_physwindow_init(int cpu)
{
	cpu_data_t		*cdp = cpu_data_ptr[cpu];
	vm_offset_t		phys_window = cdp->cpu_physwindow_base;

	if (phys_window == 0) {
		if (vm_allocate(kernel_map, &phys_window,
				PAGE_SIZE, VM_FLAGS_ANYWHERE)
				!= KERN_SUCCESS)
			panic("cpu_physwindow_init: "
				"couldn't allocate phys map window");

		/*
		 * make sure the page that encompasses the
		 * pte pointer we're interested in actually
		 * exists in the page table
		 */
		pmap_expand(kernel_pmap, phys_window, PMAP_EXPAND_OPTIONS_NONE);

		cdp->cpu_physwindow_base = phys_window;
		cdp->cpu_physwindow_ptep = vtopte(phys_window);
	}
#ifdef __i386__
	cpu_desc_index_t	*cdi = &cdp->cpu_desc_index;
	cdi->cdi_gdt.ptr[sel_idx(PHYS_WINDOW_SEL)] = physwindow_desc_pattern;
	cdi->cdi_gdt.ptr[sel_idx(PHYS_WINDOW_SEL)].offset = phys_window;

	fix_desc(&cdi->cdi_gdt.ptr[sel_idx(PHYS_WINDOW_SEL)], 1);
#endif /* __i386__ */
}
#endif /* NCOPY_WINDOWS > 0 */

/*
 * Load the segment descriptor tables for the current processor.
 */
void
cpu_mode_init(cpu_data_t *cdp)
{
#ifdef __i386__
	if (cdp->cpu_is64bit) {
		cpu_IA32e_enable(cdp);
		cpu_desc_load64(cdp);
		fast_syscall_init64(cdp);
	} else {
		fast_syscall_init(cdp);
	}
#else
	fast_syscall_init64(cdp);
#endif
}

#if __x86_64__
/*
 * Allocate a new interrupt stack for the boot processor from the
 * heap rather than continue to use the statically allocated space.
 * Also switch to a dynamically allocated cpu data area.
 */
void
cpu_data_realloc(void)
{
	int		ret;
	vm_offset_t	stack;
	cpu_data_t	*cdp;
	boolean_t	istate;

	ret = kmem_alloc(kernel_map, &stack, INTSTACK_SIZE);
	if (ret != KERN_SUCCESS) {
		panic("cpu_data_realloc() stack alloc, ret=%d\n", ret);
	}
	bzero((void*) stack, INTSTACK_SIZE);
	stack += INTSTACK_SIZE;

	ret = kmem_alloc(kernel_map, (vm_offset_t *) &cdp, sizeof(cpu_data_t));
	if (ret != KERN_SUCCESS) {
		panic("cpu_data_realloc() cpu data alloc, ret=%d\n", ret);
	}

	/* Copy old contents into new area and make fix-ups */
	bcopy((void *) &cpu_data_master, (void*) cdp, sizeof(cpu_data_t));
	cdp->cpu_this = cdp;
	cdp->cpu_int_stack_top = stack;
	timer_call_initialize_queue(&cdp->rtclock_timer.queue);

	kprintf("Reallocated master cpu data: %p, interrupt stack top: %p\n",
		(void *) cdp, (void *) stack);

	/*
	 * With interrupts disabled, commit the new areas.
	 */
	istate = ml_set_interrupts_enabled(FALSE);
	cpu_data_ptr[0] = cdp;
	wrmsr64(MSR_IA32_GS_BASE, (uintptr_t) cdp);
	wrmsr64(MSR_IA32_KERNEL_GS_BASE, (uintptr_t) cdp);
	(void) ml_set_interrupts_enabled(istate);
}
#endif /* __x86_64__ */