1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 1993, 2010, Oracle and/or its affiliates. All rights reserved.
23 */
24/*
25 * Copyright (c) 2010, Intel Corporation.
26 * All rights reserved.
27 */
28
29#include <sys/types.h>
30#include <sys/t_lock.h>
31#include <sys/param.h>
32#include <sys/sysmacros.h>
33#include <sys/signal.h>
34#include <sys/systm.h>
35#include <sys/user.h>
36#include <sys/mman.h>
37#include <sys/vm.h>
38#include <sys/conf.h>
39#include <sys/avintr.h>
40#include <sys/autoconf.h>
41#include <sys/disp.h>
42#include <sys/class.h>
43#include <sys/bitmap.h>
44
45#include <sys/privregs.h>
46
47#include <sys/proc.h>
48#include <sys/buf.h>
49#include <sys/kmem.h>
50#include <sys/mem.h>
51#include <sys/kstat.h>
52
53#include <sys/reboot.h>
54
55#include <sys/cred.h>
56#include <sys/vnode.h>
57#include <sys/file.h>
58
59#include <sys/procfs.h>
60
61#include <sys/vfs.h>
62#include <sys/cmn_err.h>
63#include <sys/utsname.h>
64#include <sys/debug.h>
65#include <sys/kdi.h>
66
67#include <sys/dumphdr.h>
68#include <sys/bootconf.h>
69#include <sys/memlist_plat.h>
70#include <sys/varargs.h>
71#include <sys/promif.h>
72#include <sys/modctl.h>
73
74#include <sys/sunddi.h>
75#include <sys/sunndi.h>
76#include <sys/ndi_impldefs.h>
77#include <sys/ddidmareq.h>
78#include <sys/psw.h>
79#include <sys/regset.h>
80#include <sys/clock.h>
81#include <sys/pte.h>
82#include <sys/tss.h>
83#include <sys/stack.h>
84#include <sys/trap.h>
85#include <sys/fp.h>
86#include <vm/kboot_mmu.h>
87#include <vm/anon.h>
88#include <vm/as.h>
89#include <vm/page.h>
90#include <vm/seg.h>
91#include <vm/seg_dev.h>
92#include <vm/seg_kmem.h>
93#include <vm/seg_kpm.h>
94#include <vm/seg_map.h>
95#include <vm/seg_vn.h>
96#include <vm/seg_kp.h>
97#include <sys/memnode.h>
98#include <vm/vm_dep.h>
99#include <sys/thread.h>
100#include <sys/sysconf.h>
101#include <sys/vm_machparam.h>
102#include <sys/archsystm.h>
103#include <sys/machsystm.h>
104#include <vm/hat.h>
105#include <vm/hat_i86.h>
106#include <sys/pmem.h>
107#include <sys/smp_impldefs.h>
108#include <sys/x86_archext.h>
109#include <sys/cpuvar.h>
110#include <sys/segments.h>
111#include <sys/clconf.h>
112#include <sys/kobj.h>
113#include <sys/kobj_lex.h>
114#include <sys/cpc_impl.h>
115#include <sys/cpu_module.h>
116#include <sys/smbios.h>
117#include <sys/debug_info.h>
118#include <sys/bootinfo.h>
119#include <sys/ddi_timer.h>
120#include <sys/systeminfo.h>
121#include <sys/multiboot.h>
122
123#ifdef	__xpv
124
125#include <sys/hypervisor.h>
126#include <sys/xen_mmu.h>
127#include <sys/evtchn_impl.h>
128#include <sys/gnttab.h>
129#include <sys/xpv_panic.h>
130#include <xen/sys/xenbus_comms.h>
131#include <xen/public/physdev.h>
132
133extern void xen_late_startup(void);
134
135struct xen_evt_data cpu0_evt_data;
136
137#else	/* __xpv */
138#include <sys/memlist_impl.h>
139
140extern void mem_config_init(void);
141#endif /* __xpv */
142
143extern void progressbar_init(void);
144extern void brand_init(void);
145extern void pcf_init(void);
146extern void pg_init(void);
147
148extern int size_pse_array(pgcnt_t, int);
149
150#if defined(_SOFT_HOSTID)
151
152#include <sys/rtc.h>
153
154static int32_t set_soft_hostid(void);
155static char hostid_file[] = "/etc/hostid";
156
157#endif
158
159void *gfx_devinfo_list;
160
161#if defined(__amd64) && !defined(__xpv)
162extern void immu_startup(void);
163#endif
164
165/*
166 * XXX make declaration below "static" when drivers no longer use this
167 * interface.
168 */
169extern caddr_t p0_va;	/* Virtual address for accessing physical page 0 */
170
171/*
172 * segkp
173 */
174extern int segkp_fromheap;
175
176static void kvm_init(void);
177static void startup_init(void);
178static void startup_memlist(void);
179static void startup_kmem(void);
180static void startup_modules(void);
181static void startup_vm(void);
182static void startup_end(void);
183static void layout_kernel_va(void);
184
185/*
186 * Declare these as initialized data so we can patch them.
187 */
188#ifdef __i386
189
/*
 * Due to virtual address space limitations when running in 32 bit mode,
 * restrict the amount of physical memory configured to a max of PHYSMEM
 * pages (16g).
 *
 * If the physical max memory size of 64g were allowed to be configured, the
 * size of user virtual address space would be less than 1g. A limited user
 * address space greatly reduces the range of applications that can run.
 *
 * If more physical memory than PHYSMEM is required, users should preferably
 * run in 64 bit mode, which has far looser virtual address space limitations.
 *
 * If 64 bit mode is not available (as on IA32) and/or more physical memory
 * than PHYSMEM is required in 32 bit mode, physmem can be set to the desired
 * value or to 0 (to configure all available memory) via eeprom(1M). kernelbase
 * should also be carefully tuned to balance the needs of user applications
 * against the risk of kernel heap exhaustion due to kernelbase being set
 * too high.
 */
208#define	PHYSMEM	0x400000
209
210#else /* __amd64 */
211
212/*
213 * For now we can handle memory with physical addresses up to about
214 * 64 Terabytes. This keeps the kernel above the VA hole, leaving roughly
215 * half the VA space for seg_kpm. When systems get bigger than 64TB this
216 * code will need revisiting. There is an implicit assumption that there
217 * are no *huge* holes in the physical address space too.
218 */
219#define	TERABYTE		(1ul << 40)
220#define	PHYSMEM_MAX64		mmu_btop(64 * TERABYTE)
221#define	PHYSMEM			PHYSMEM_MAX64
222#define	AMD64_VA_HOLE_END	0xFFFF800000000000ul
223
224#endif /* __amd64 */
225
226pgcnt_t physmem = PHYSMEM;
227pgcnt_t obp_pages;	/* Memory used by PROM for its text and data */
228
229char *kobj_file_buf;
230int kobj_file_bufsize;	/* set in /etc/system */
231
232/* Global variables for MP support. Used in mp_startup */
233caddr_t	rm_platter_va = 0;
234uint32_t rm_platter_pa;
235
236int	auto_lpg_disable = 1;
237
238/*
239 * Some CPUs have holes in the middle of the 64-bit virtual address range.
240 */
241uintptr_t hole_start, hole_end;
242
243/*
244 * kpm mapping window
245 */
246caddr_t kpm_vbase;
247size_t  kpm_size;
248static int kpm_desired;
249#ifdef __amd64
250static uintptr_t segkpm_base = (uintptr_t)SEGKPM_BASE;
251#endif
252
253/*
254 * Configuration parameters set at boot time.
255 */
256
257caddr_t econtig;		/* end of first block of contiguous kernel */
258
259struct bootops		*bootops = 0;	/* passed in from boot */
260struct bootops		**bootopsp;
261struct boot_syscalls	*sysp;		/* passed in from boot */
262
263char bootblock_fstype[16];
264
265char kern_bootargs[OBP_MAXPATHLEN];
266char kern_bootfile[OBP_MAXPATHLEN];
267
268/*
269 * ZFS zio segment.  This allows us to exclude large portions of ZFS data that
270 * gets cached in kmem caches on the heap.  If this is set to zero, we allocate
271 * zio buffers from their own segment, otherwise they are allocated from the
272 * heap.  The optimization of allocating zio buffers from their own segment is
273 * only valid on 64-bit kernels.
274 */
275#if defined(__amd64)
276int segzio_fromheap = 0;
277#else
278int segzio_fromheap = 1;
279#endif
280
/*
 * New memory fragments are possible in startup() due to BOP_ALLOCs. How many
 * depends on the number of BOP_ALLOC calls made, the requested sizes, the
 * memory size combination, and whether boot.bin memory needs to be freed.
 */
286#define	POSS_NEW_FRAGMENTS	12
287
288/*
289 * VM data structures
290 */
291long page_hashsz;		/* Size of page hash table (power of two) */
292unsigned int page_hashsz_shift;	/* log2(page_hashsz) */
293struct page *pp_base;		/* Base of initial system page struct array */
294struct page **page_hash;	/* Page hash table */
295pad_mutex_t *pse_mutex;		/* Locks protecting pp->p_selock */
296size_t pse_table_size;		/* Number of mutexes in pse_mutex[] */
297int pse_shift;			/* log2(pse_table_size) */
298struct seg ktextseg;		/* Segment used for kernel executable image */
299struct seg kvalloc;		/* Segment used for "valloc" mapping */
300struct seg kpseg;		/* Segment used for pageable kernel virt mem */
301struct seg kmapseg;		/* Segment used for generic kernel mappings */
302struct seg kdebugseg;		/* Segment used for the kernel debugger */
303
304struct seg *segkmap = &kmapseg;	/* Kernel generic mapping segment */
305static struct seg *segmap = &kmapseg;	/* easier to use name for in here */
306
307struct seg *segkp = &kpseg;	/* Pageable kernel virtual memory segment */
308
309#if defined(__amd64)
310struct seg kvseg_core;		/* Segment used for the core heap */
311struct seg kpmseg;		/* Segment used for physical mapping */
312struct seg *segkpm = &kpmseg;	/* 64bit kernel physical mapping segment */
313#else
314struct seg *segkpm = NULL;	/* Unused on IA32 */
315#endif
316
317caddr_t segkp_base;		/* Base address of segkp */
318caddr_t segzio_base;		/* Base address of segzio */
319#if defined(__amd64)
320pgcnt_t segkpsize = btop(SEGKPDEFSIZE);	/* size of segkp segment in pages */
321#else
322pgcnt_t segkpsize = 0;
323#endif
324pgcnt_t segziosize = 0;		/* size of zio segment in pages */
325
326/*
327 * A static DR page_t VA map is reserved that can map the page structures
328 * for a domain's entire RA space. The pages that back this space are
329 * dynamically allocated and need not be physically contiguous.  The DR
330 * map size is derived from KPM size.
331 * This mechanism isn't used by x86 yet, so just stubs here.
332 */
333int ppvm_enable = 0;		/* Static virtual map for page structs */
334page_t *ppvm_base = NULL;	/* Base of page struct map */
335pgcnt_t ppvm_size = 0;		/* Size of page struct map */
336
337/*
338 * VA range available to the debugger
339 */
340const caddr_t kdi_segdebugbase = (const caddr_t)SEGDEBUGBASE;
341const size_t kdi_segdebugsize = SEGDEBUGSIZE;
342
343struct memseg *memseg_base;
344struct vnode unused_pages_vp;
345
346#define	FOURGB	0x100000000LL
347
348struct memlist *memlist;
349
350caddr_t s_text;		/* start of kernel text segment */
351caddr_t e_text;		/* end of kernel text segment */
352caddr_t s_data;		/* start of kernel data segment */
353caddr_t e_data;		/* end of kernel data segment */
354caddr_t modtext;	/* start of loadable module text reserved */
355caddr_t e_modtext;	/* end of loadable module text reserved */
356caddr_t moddata;	/* start of loadable module data reserved */
357caddr_t e_moddata;	/* end of loadable module data reserved */
358
359struct memlist *phys_install;	/* Total installed physical memory */
360struct memlist *phys_avail;	/* Total available physical memory */
361struct memlist *bios_rsvd;	/* Bios reserved memory */
362
363/*
364 * kphysm_init returns the number of pages that were processed
365 */
366static pgcnt_t kphysm_init(page_t *, pgcnt_t);
367
368#define	IO_PROP_SIZE	64	/* device property size */
369
370/*
371 * a couple useful roundup macros
372 */
373#define	ROUND_UP_PAGE(x)	\
374	((uintptr_t)P2ROUNDUP((uintptr_t)(x), (uintptr_t)MMU_PAGESIZE))
375#define	ROUND_UP_LPAGE(x)	\
376	((uintptr_t)P2ROUNDUP((uintptr_t)(x), mmu.level_size[1]))
377#define	ROUND_UP_4MEG(x)	\
378	((uintptr_t)P2ROUNDUP((uintptr_t)(x), (uintptr_t)FOUR_MEG))
379#define	ROUND_UP_TOPLEVEL(x)	\
380	((uintptr_t)P2ROUNDUP((uintptr_t)(x), mmu.level_size[mmu.max_level]))
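
/*
 * Illustrative examples (assuming 4k base pages and a 2mb level 1 page size):
 * ROUND_UP_PAGE(0x12345) yields 0x13000 and ROUND_UP_LPAGE(0x12345) yields
 * 0x200000.  ROUND_UP_TOPLEVEL() rounds to the span covered by a single
 * top level page table entry.
 */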
381
382/*
383 *	32-bit Kernel's Virtual memory layout.
384 *		+-----------------------+
385 *		|			|
386 * 0xFFC00000  -|-----------------------|- ARGSBASE
387 *		|	debugger	|
388 * 0xFF800000  -|-----------------------|- SEGDEBUGBASE
389 *		|      Kernel Data	|
390 * 0xFEC00000  -|-----------------------|
391 *              |      Kernel Text	|
392 * 0xFE800000  -|-----------------------|- KERNEL_TEXT (0xFB400000 on Xen)
393 *		|---       GDT       ---|- GDT page (GDT_VA)
394 *		|---    debug info   ---|- debug info (DEBUG_INFO_VA)
395 *		|			|
396 * 		|   page_t structures	|
397 * 		|   memsegs, memlists, 	|
398 * 		|   page hash, etc.	|
399 * ---	       -|-----------------------|- ekernelheap, valloc_base (floating)
400 *		|			|  (segkp is just an arena in the heap)
401 *		|			|
402 *		|	kvseg		|
403 *		|			|
404 *		|			|
405 * ---         -|-----------------------|- kernelheap (floating)
406 * 		|        Segkmap	|
407 * 0xC3002000  -|-----------------------|- segmap_start (floating)
408 *		|	Red Zone	|
409 * 0xC3000000  -|-----------------------|- kernelbase / userlimit (floating)
410 *		|			|			||
411 *		|     Shared objects	|			\/
412 *		|			|
413 *		:			:
414 *		|	user data	|
415 *		|-----------------------|
416 *		|	user text	|
417 * 0x08048000  -|-----------------------|
418 *		|	user stack	|
419 *		:			:
420 *		|	invalid		|
421 * 0x00000000	+-----------------------+
422 *
423 *
424 *		64-bit Kernel's Virtual memory layout. (assuming 64 bit app)
425 *			+-----------------------+
426 *			|			|
427 * 0xFFFFFFFF.FFC00000  |-----------------------|- ARGSBASE
428 *			|	debugger (?)	|
429 * 0xFFFFFFFF.FF800000  |-----------------------|- SEGDEBUGBASE
430 *			|      unused    	|
431 *			+-----------------------+
432 *			|      Kernel Data	|
433 * 0xFFFFFFFF.FBC00000  |-----------------------|
434 *			|      Kernel Text	|
435 * 0xFFFFFFFF.FB800000  |-----------------------|- KERNEL_TEXT
436 *			|---       GDT       ---|- GDT page (GDT_VA)
437 *			|---    debug info   ---|- debug info (DEBUG_INFO_VA)
438 *			|			|
439 * 			|      Core heap	| (used for loadable modules)
440 * 0xFFFFFFFF.C0000000  |-----------------------|- core_base / ekernelheap
441 *			|	 Kernel		|
442 *			|	  heap		|
443 * 0xFFFFFXXX.XXX00000  |-----------------------|- kernelheap (floating)
444 *			|	 segmap		|
445 * 0xFFFFFXXX.XXX00000  |-----------------------|- segmap_start (floating)
446 *			|    device mappings	|
447 * 0xFFFFFXXX.XXX00000  |-----------------------|- toxic_addr (floating)
448 *			|	  segzio	|
449 * 0xFFFFFXXX.XXX00000  |-----------------------|- segzio_base (floating)
450 *			|	  segkp		|
451 * ---                  |-----------------------|- segkp_base (floating)
452 * 			|   page_t structures	|  valloc_base + valloc_sz
453 * 			|   memsegs, memlists, 	|
454 * 			|   page hash, etc.	|
455 * 0xFFFFFF00.00000000  |-----------------------|- valloc_base (lower if > 1TB)
456 *			|	 segkpm		|
457 * 0xFFFFFE00.00000000  |-----------------------|
458 *			|	Red Zone	|
459 * 0xFFFFFD80.00000000  |-----------------------|- KERNELBASE (lower if > 1TB)
460 *			|     User stack	|- User space memory
461 * 			|			|
462 * 			| shared objects, etc	|	(grows downwards)
463 *			:			:
464 * 			|			|
465 * 0xFFFF8000.00000000  |-----------------------|
466 * 			|			|
467 * 			| VA Hole / unused	|
468 * 			|			|
469 * 0x00008000.00000000  |-----------------------|
470 *			|			|
471 *			|			|
472 *			:			:
473 *			|	user heap	|	(grows upwards)
474 *			|			|
475 *			|	user data	|
476 *			|-----------------------|
477 *			|	user text	|
478 * 0x00000000.04000000  |-----------------------|
479 *			|	invalid		|
480 * 0x00000000.00000000	+-----------------------+
481 *
482 * A 32 bit app on the 64 bit kernel sees the same layout as on the 32 bit
483 * kernel, except that userlimit is raised to 0xfe000000
484 *
485 * Floating values:
486 *
487 * valloc_base: start of the kernel's memory management/tracking data
488 * structures.  This region contains page_t structures for
489 * physical memory, memsegs, memlists, and the page hash.
490 *
491 * core_base: start of the kernel's "core" heap area on 64-bit systems.
492 * This area is intended to be used for global data as well as for module
493 * text/data that does not fit into the nucleus pages.  The core heap is
494 * restricted to a 2GB range, allowing every address within it to be
495 * accessed using rip-relative addressing
496 *
497 * ekernelheap: end of kernelheap and start of segmap.
498 *
499 * kernelheap: start of kernel heap.  On 32-bit systems, this starts right
500 * above a red zone that separates the user's address space from the
501 * kernel's.  On 64-bit systems, it sits above segkp and segkpm.
502 *
503 * segmap_start: start of segmap. The length of segmap can be modified
504 * through eeprom. The default length is 16MB on 32-bit systems and 64MB
505 * on 64-bit systems.
506 *
507 * kernelbase: On a 32-bit kernel the default value of 0xd4000000 will be
508 * decreased by 2X the size required for page_t.  This allows the kernel
509 * heap to grow in size with physical memory.  With sizeof(page_t) == 80
510 * bytes, the following shows the values of kernelbase and kernel heap
511 * sizes for different memory configurations (assuming default segmap and
512 * segkp sizes).
513 *
514 *	mem	size for	kernelbase	kernel heap
515 *	size	page_t's			size
516 *	----	---------	----------	-----------
517 *	1gb	0x01400000	0xd1800000	684MB
518 *	2gb	0x02800000	0xcf000000	704MB
519 *	4gb	0x05000000	0xca000000	744MB
520 *	6gb	0x07800000	0xc5000000	784MB
521 *	8gb	0x0a000000	0xc0000000	824MB
522 *	16gb	0x14000000	0xac000000	984MB
523 *	32gb	0x28000000	0x84000000	1304MB
524 *	64gb	0x50000000	0x34000000	1944MB (*)
525 *
526 * kernelbase is less than the abi minimum of 0xc0000000 for memory
527 * configurations above 8gb.
528 *
529 * (*) support for memory configurations above 32gb will require manual tuning
530 * of kernelbase to balance out the need of user applications.
531 */
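
/*
 * Worked example for the table above (illustrative): with 8gb of memory there
 * are 0x200000 pages, needing 0x200000 * 80 = 0x0a000000 bytes of page_t's,
 * so kernelbase becomes 0xd4000000 - 2 * 0x0a000000 = 0xc0000000.
 */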
532
533/* real-time-clock initialization parameters */
534extern time_t process_rtc_config_file(void);
535
536uintptr_t	kernelbase;
537uintptr_t	postbootkernelbase;	/* not set till boot loader is gone */
538uintptr_t	eprom_kernelbase;
539size_t		segmapsize;
540uintptr_t	segmap_start;
541int		segmapfreelists;
542pgcnt_t		npages;
543pgcnt_t		orig_npages;
544size_t		core_size;		/* size of "core" heap */
545uintptr_t	core_base;		/* base address of "core" heap */
546
547/*
548 * List of bootstrap pages. We mark these as allocated in startup.
549 * release_bootstrap() will free them when we're completely done with
550 * the bootstrap.
551 */
552static page_t *bootpages;
553
554/*
555 * boot time pages that have a vnode from the ramdisk will keep that forever.
556 */
557static page_t *rd_pages;
558
559/*
560 * Lower 64K
561 */
562static page_t *lower_pages = NULL;
563static int lower_pages_count = 0;
564
565struct system_hardware system_hardware;
566
567/*
568 * Enable some debugging messages concerning memory usage...
569 */
570static void
571print_memlist(char *title, struct memlist *mp)
572{
573	prom_printf("MEMLIST: %s:\n", title);
574	while (mp != NULL)  {
575		prom_printf("\tAddress 0x%" PRIx64 ", size 0x%" PRIx64 "\n",
576		    mp->ml_address, mp->ml_size);
577		mp = mp->ml_next;
578	}
579}
580
/*
 * XX64: need a comment here.  Are these just default values?  Surely
 * we read the "cpuid" type information to figure this out.
 */
585int	l2cache_sz = 0x80000;
586int	l2cache_linesz = 0x40;
587int	l2cache_assoc = 1;
588
589static size_t	textrepl_min_gb = 10;
590
/*
 * On 64 bit we use a predefined VA range for mapping devices in the kernel;
 * on 32 bit the mappings are intermixed in the heap, so we use a bit map.
 */
595#ifdef __amd64
596
597vmem_t		*device_arena;
598uintptr_t	toxic_addr = (uintptr_t)NULL;
599size_t		toxic_size = 1024 * 1024 * 1024; /* Sparc uses 1 gig too */
600
601#else	/* __i386 */
602
603ulong_t		*toxic_bit_map;	/* one bit for each 4k of VA in heap_arena */
604size_t		toxic_bit_map_len = 0;	/* in bits */
605
606#endif	/* __i386 */
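
/*
 * On 64-bit kernels, device mappings are carved out of the dedicated
 * [toxic_addr, toxic_addr + toxic_size) range by device_arena (created in
 * startup_vm()).  On 32-bit kernels they come out of the normal heap, and
 * toxic_bit_map records which 4k chunks of heap_arena hold such mappings
 * so that dtrace/kmdb can avoid touching them.
 */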
607
608/*
609 * Simple boot time debug facilities
610 */
611static char *prm_dbg_str[] = {
612	"%s:%d: '%s' is 0x%x\n",
613	"%s:%d: '%s' is 0x%llx\n"
614};
615
616int prom_debug;
617
618#define	PRM_DEBUG(q)	if (prom_debug) 	\
619	prom_printf(prm_dbg_str[sizeof (q) >> 3], "startup.c", __LINE__, #q, q);
620#define	PRM_POINT(q)	if (prom_debug) 	\
621	prom_printf("%s:%d: %s\n", "startup.c", __LINE__, q);
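
/*
 * Example usage (illustrative): PRM_DEBUG(physmax) prints a line such as
 * "startup.c:<line>: 'physmax' is 0x..." when prom_debug has been set via
 * the boot environment.  The sizeof (q) >> 3 expression selects the 32-bit
 * or 64-bit format string based on the size of the argument.
 */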
622
/*
 * This structure is used to keep track of the initial allocations
 * done in startup_memlist(). The value of NUM_ALLOCATIONS needs to
 * be >= the number of ADD_TO_ALLOCATIONS() invocations in the code.
 */
628#define	NUM_ALLOCATIONS 8
629int num_allocations = 0;
630struct {
631	void **al_ptr;
632	size_t al_size;
633} allocations[NUM_ALLOCATIONS];
634size_t valloc_sz = 0;
635uintptr_t valloc_base;
636
637#define	ADD_TO_ALLOCATIONS(ptr, size) {					\
638		size = ROUND_UP_PAGE(size);		 		\
639		if (num_allocations == NUM_ALLOCATIONS)			\
640			panic("too many ADD_TO_ALLOCATIONS()");		\
641		allocations[num_allocations].al_ptr = (void**)&ptr;	\
642		allocations[num_allocations].al_size = size;		\
643		valloc_sz += size;					\
644		++num_allocations;				 	\
645	}
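
/*
 * Example usage (from startup_memlist() below): ADD_TO_ALLOCATIONS(page_hash,
 * pagehash_sz) records &page_hash and the page-rounded size and adds it to
 * valloc_sz; perform_allocations() then satisfies all recorded requests with
 * a single BOP_ALLOC() at valloc_base and parcels the memory back out.  Note
 * that the macro rounds its "size" argument up in place.
 */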
646
647/*
648 * Allocate all the initial memory needed by the page allocator.
649 */
650static void
651perform_allocations(void)
652{
653	caddr_t mem;
654	int i;
655	int valloc_align;
656
657	PRM_DEBUG(valloc_base);
658	PRM_DEBUG(valloc_sz);
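	/*
	 * mmu.max_page_level > 0 evaluates to 0 or 1, so the alignment below
	 * is the level 1 (large) page size when large pages are supported
	 * and the base page size otherwise.
	 */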
659	valloc_align = mmu.level_size[mmu.max_page_level > 0];
660	mem = BOP_ALLOC(bootops, (caddr_t)valloc_base, valloc_sz, valloc_align);
661	if (mem != (caddr_t)valloc_base)
662		panic("BOP_ALLOC() failed");
663	bzero(mem, valloc_sz);
664	for (i = 0; i < num_allocations; ++i) {
665		*allocations[i].al_ptr = (void *)mem;
666		mem += allocations[i].al_size;
667	}
668}
669
670/*
671 * Our world looks like this at startup time.
672 *
673 * In a 32-bit OS, boot loads the kernel text at 0xfe800000 and kernel data
674 * at 0xfec00000.  On a 64-bit OS, kernel text and data are loaded at
675 * 0xffffffff.fe800000 and 0xffffffff.fec00000 respectively.  Those
676 * addresses are fixed in the binary at link time.
677 *
678 * On the text page:
679 * unix/genunix/krtld/module text loads.
680 *
681 * On the data page:
682 * unix/genunix/krtld/module data loads.
683 *
684 * Machine-dependent startup code
685 */
686void
687startup(void)
688{
689#if !defined(__xpv)
690	extern void startup_pci_bios(void);
691#endif
692	extern cpuset_t cpu_ready_set;
693
	/*
	 * Make sure that nobody tries to use segkpm until we have
	 * initialized it properly.
	 */
698#if defined(__amd64)
699	kpm_desired = 1;
700#endif
701	kpm_enable = 0;
702	CPUSET_ONLY(cpu_ready_set, 0);	/* cpu 0 is boot cpu */
703
704#if defined(__xpv)	/* XXPV fix me! */
705	{
706		extern int segvn_use_regions;
707		segvn_use_regions = 0;
708	}
709#endif
710	progressbar_init();
711	startup_init();
712#if defined(__xpv)
713	startup_xen_version();
714#endif
715	startup_memlist();
716	startup_kmem();
717	startup_vm();
718#if !defined(__xpv)
719	/*
720	 * Note we need to do this even on fast reboot in order to access
721	 * the irq routing table (used for pci labels).
722	 */
723	startup_pci_bios();
724#endif
725#if defined(__xpv)
726	startup_xen_mca();
727#endif
728	startup_modules();
729
730	startup_end();
731}
732
733static void
734startup_init()
735{
736	PRM_POINT("startup_init() starting...");
737
738	/*
739	 * Complete the extraction of cpuid data
740	 */
741	cpuid_pass2(CPU);
742
743	(void) check_boot_version(BOP_GETVERSION(bootops));
744
745	/*
746	 * Check for prom_debug in boot environment
747	 */
748	if (BOP_GETPROPLEN(bootops, "prom_debug") >= 0) {
749		++prom_debug;
		PRM_POINT("prom_debug found in boot environment");
751	}
752
753	/*
754	 * Collect node, cpu and memory configuration information.
755	 */
756	get_system_configuration();
757
758	/*
759	 * Halt if this is an unsupported processor.
760	 */
761	if (x86_type == X86_TYPE_486 || x86_type == X86_TYPE_CYRIX_486) {
762		printf("\n486 processor (\"%s\") detected.\n",
763		    CPU->cpu_brandstr);
764		halt("This processor is not supported by this release "
765		    "of Solaris.");
766	}
767
768	PRM_POINT("startup_init() done");
769}
770
/*
 * Callback for copy_memlist_filter() to filter nucleus, kadb/kmdb (i.e.
 * everything mapped above KERNEL_TEXT) pages from phys_avail. Note it
 * also filters out physical page zero.  There is some reliance on the
 * boot loader allocating only a few contiguous physical memory chunks.
 */
777static void
778avail_filter(uint64_t *addr, uint64_t *size)
779{
780	uintptr_t va;
781	uintptr_t next_va;
782	pfn_t pfn;
783	uint64_t pfn_addr;
784	uint64_t pfn_eaddr;
785	uint_t prot;
786	size_t len;
787	uint_t change;
788
789	if (prom_debug)
790		prom_printf("\tFilter: in: a=%" PRIx64 ", s=%" PRIx64 "\n",
791		    *addr, *size);
792
793	/*
794	 * page zero is required for BIOS.. never make it available
795	 */
796	if (*addr == 0) {
797		*addr += MMU_PAGESIZE;
798		*size -= MMU_PAGESIZE;
799	}
800
	/*
	 * First we trim from the front of the range. Since kbm_probe()
	 * walks ranges in virtual order, but addr/size are physical, we need
	 * to rescan the list until no changes are seen.  This deals with the
	 * case where page "p" is mapped at v, page "p + PAGESIZE" is mapped
	 * at w, but w < v.
	 */
808	do {
809		change = 0;
810		for (va = KERNEL_TEXT;
811		    *size > 0 && kbm_probe(&va, &len, &pfn, &prot) != 0;
812		    va = next_va) {
813
814			next_va = va + len;
815			pfn_addr = pfn_to_pa(pfn);
816			pfn_eaddr = pfn_addr + len;
817
818			if (pfn_addr <= *addr && pfn_eaddr > *addr) {
819				change = 1;
820				while (*size > 0 && len > 0) {
821					*addr += MMU_PAGESIZE;
822					*size -= MMU_PAGESIZE;
823					len -= MMU_PAGESIZE;
824				}
825			}
826		}
827		if (change && prom_debug)
828			prom_printf("\t\ttrim: a=%" PRIx64 ", s=%" PRIx64 "\n",
829			    *addr, *size);
830	} while (change);
831
832	/*
833	 * Trim pages from the end of the range.
834	 */
835	for (va = KERNEL_TEXT;
836	    *size > 0 && kbm_probe(&va, &len, &pfn, &prot) != 0;
837	    va = next_va) {
838
839		next_va = va + len;
840		pfn_addr = pfn_to_pa(pfn);
841
842		if (pfn_addr >= *addr && pfn_addr < *addr + *size)
843			*size = pfn_addr - *addr;
844	}
845
846	if (prom_debug)
847		prom_printf("\tFilter out: a=%" PRIx64 ", s=%" PRIx64 "\n",
848		    *addr, *size);
849}
850
851static void
852kpm_init()
853{
854	struct segkpm_crargs b;
855
856	/*
857	 * These variables were all designed for sfmmu in which segkpm is
858	 * mapped using a single pagesize - either 8KB or 4MB.  On x86, we
859	 * might use 2+ page sizes on a single machine, so none of these
860	 * variables have a single correct value.  They are set up as if we
861	 * always use a 4KB pagesize, which should do no harm.  In the long
862	 * run, we should get rid of KPM's assumption that only a single
863	 * pagesize is used.
864	 */
865	kpm_pgshft = MMU_PAGESHIFT;
866	kpm_pgsz =  MMU_PAGESIZE;
867	kpm_pgoff = MMU_PAGEOFFSET;
868	kpmp2pshft = 0;
869	kpmpnpgs = 1;
870	ASSERT(((uintptr_t)kpm_vbase & (kpm_pgsz - 1)) == 0);
871
872	PRM_POINT("about to create segkpm");
873	rw_enter(&kas.a_lock, RW_WRITER);
874
875	if (seg_attach(&kas, kpm_vbase, kpm_size, segkpm) < 0)
876		panic("cannot attach segkpm");
877
878	b.prot = PROT_READ | PROT_WRITE;
879	b.nvcolors = 1;
880
881	if (segkpm_create(segkpm, (caddr_t)&b) != 0)
882		panic("segkpm_create segkpm");
883
884	rw_exit(&kas.a_lock);
885}
886
887/*
888 * The debug info page provides enough information to allow external
889 * inspectors (e.g. when running under a hypervisor) to bootstrap
890 * themselves into allowing full-blown kernel debugging.
891 */
892static void
893init_debug_info(void)
894{
895	caddr_t mem;
896	debug_info_t *di;
897
898#ifndef __lint
899	ASSERT(sizeof (debug_info_t) < MMU_PAGESIZE);
900#endif
901
902	mem = BOP_ALLOC(bootops, (caddr_t)DEBUG_INFO_VA, MMU_PAGESIZE,
903	    MMU_PAGESIZE);
904
905	if (mem != (caddr_t)DEBUG_INFO_VA)
906		panic("BOP_ALLOC() failed");
907	bzero(mem, MMU_PAGESIZE);
908
909	di = (debug_info_t *)mem;
910
911	di->di_magic = DEBUG_INFO_MAGIC;
912	di->di_version = DEBUG_INFO_VERSION;
913	di->di_modules = (uintptr_t)&modules;
914	di->di_s_text = (uintptr_t)s_text;
915	di->di_e_text = (uintptr_t)e_text;
916	di->di_s_data = (uintptr_t)s_data;
917	di->di_e_data = (uintptr_t)e_data;
918	di->di_hat_htable_off = offsetof(hat_t, hat_htable);
919	di->di_ht_pfn_off = offsetof(htable_t, ht_pfn);
920}
921
922/*
923 * Build the memlists and other kernel essential memory system data structures.
924 * This is everything at valloc_base.
925 */
926static void
927startup_memlist(void)
928{
929	size_t memlist_sz;
930	size_t memseg_sz;
931	size_t pagehash_sz;
932	size_t pp_sz;
933	uintptr_t va;
934	size_t len;
935	uint_t prot;
936	pfn_t pfn;
937	int memblocks;
938	pfn_t rsvd_high_pfn;
939	pgcnt_t rsvd_pgcnt;
940	size_t rsvdmemlist_sz;
941	int rsvdmemblocks;
942	caddr_t pagecolor_mem;
943	size_t pagecolor_memsz;
944	caddr_t page_ctrs_mem;
945	size_t page_ctrs_size;
946	size_t pse_table_alloc_size;
947	struct memlist *current;
948	extern void startup_build_mem_nodes(struct memlist *);
949
950	/* XX64 fix these - they should be in include files */
951	extern size_t page_coloring_init(uint_t, int, int);
952	extern void page_coloring_setup(caddr_t);
953
954	PRM_POINT("startup_memlist() starting...");
955
956	/*
957	 * Use leftover large page nucleus text/data space for loadable modules.
958	 * Use at most MODTEXT/MODDATA.
959	 */
960	len = kbm_nucleus_size;
961	ASSERT(len > MMU_PAGESIZE);
962
963	moddata = (caddr_t)ROUND_UP_PAGE(e_data);
964	e_moddata = (caddr_t)P2ROUNDUP((uintptr_t)e_data, (uintptr_t)len);
965	if (e_moddata - moddata > MODDATA)
966		e_moddata = moddata + MODDATA;
967
968	modtext = (caddr_t)ROUND_UP_PAGE(e_text);
969	e_modtext = (caddr_t)P2ROUNDUP((uintptr_t)e_text, (uintptr_t)len);
970	if (e_modtext - modtext > MODTEXT)
971		e_modtext = modtext + MODTEXT;
972
973	econtig = e_moddata;
974
975	PRM_DEBUG(modtext);
976	PRM_DEBUG(e_modtext);
977	PRM_DEBUG(moddata);
978	PRM_DEBUG(e_moddata);
979	PRM_DEBUG(econtig);
980
981	/*
982	 * Examine the boot loader physical memory map to find out:
983	 * - total memory in system - physinstalled
984	 * - the max physical address - physmax
985	 * - the number of discontiguous segments of memory.
986	 */
987	if (prom_debug)
988		print_memlist("boot physinstalled",
989		    bootops->boot_mem->physinstalled);
990	installed_top_size_ex(bootops->boot_mem->physinstalled, &physmax,
991	    &physinstalled, &memblocks);
992	PRM_DEBUG(physmax);
993	PRM_DEBUG(physinstalled);
994	PRM_DEBUG(memblocks);
995
996	/*
997	 * Compute maximum physical address for memory DR operations.
998	 * Memory DR operations are unsupported on xpv or 32bit OSes.
999	 */
1000#ifdef	__amd64
1001	if (plat_dr_support_memory()) {
1002		if (plat_dr_physmax == 0) {
1003			uint_t pabits = UINT_MAX;
1004
1005			cpuid_get_addrsize(CPU, &pabits, NULL);
1006			plat_dr_physmax = btop(1ULL << pabits);
1007		}
1008		if (plat_dr_physmax > PHYSMEM_MAX64)
1009			plat_dr_physmax = PHYSMEM_MAX64;
1010	} else
1011#endif
1012		plat_dr_physmax = 0;
1013
1014	/*
1015	 * Examine the bios reserved memory to find out:
1016	 * - the number of discontiguous segments of memory.
1017	 */
1018	if (prom_debug)
1019		print_memlist("boot reserved mem",
1020		    bootops->boot_mem->rsvdmem);
1021	installed_top_size_ex(bootops->boot_mem->rsvdmem, &rsvd_high_pfn,
1022	    &rsvd_pgcnt, &rsvdmemblocks);
1023	PRM_DEBUG(rsvd_high_pfn);
1024	PRM_DEBUG(rsvd_pgcnt);
1025	PRM_DEBUG(rsvdmemblocks);
1026
1027	/*
1028	 * Initialize hat's mmu parameters.
1029	 * Check for enforce-prot-exec in boot environment. It's used to
1030	 * enable/disable support for the page table entry NX bit.
1031	 * The default is to enforce PROT_EXEC on processors that support NX.
1032	 * Boot seems to round up the "len", but 8 seems to be big enough.
1033	 */
1034	mmu_init();
1035
1036#ifdef	__i386
1037	/*
1038	 * physmax is lowered if there is more memory than can be
1039	 * physically addressed in 32 bit (PAE/non-PAE) modes.
1040	 */
1041	if (mmu.pae_hat) {
1042		if (PFN_ABOVE64G(physmax)) {
1043			physinstalled -= (physmax - (PFN_64G - 1));
1044			physmax = PFN_64G - 1;
1045		}
1046	} else {
1047		if (PFN_ABOVE4G(physmax)) {
1048			physinstalled -= (physmax - (PFN_4G - 1));
1049			physmax = PFN_4G - 1;
1050		}
1051	}
1052#endif
1053
1054	startup_build_mem_nodes(bootops->boot_mem->physinstalled);
1055
1056	if (BOP_GETPROPLEN(bootops, "enforce-prot-exec") >= 0) {
1057		int len = BOP_GETPROPLEN(bootops, "enforce-prot-exec");
1058		char value[8];
1059
1060		if (len < 8)
1061			(void) BOP_GETPROP(bootops, "enforce-prot-exec", value);
1062		else
1063			(void) strcpy(value, "");
1064		if (strcmp(value, "off") == 0)
1065			mmu.pt_nx = 0;
1066	}
1067	PRM_DEBUG(mmu.pt_nx);
1068
	/*
	 * We will need page_t's for every page in the system, except for
	 * memory mapped at or above the start of the kernel text segment.
	 *
	 * Pages above e_modtext are attributed to the kernel debugger
	 * (obp_pages).
	 */
1075	npages = physinstalled - 1; /* avail_filter() skips page 0, so "- 1" */
1076	obp_pages = 0;
1077	va = KERNEL_TEXT;
1078	while (kbm_probe(&va, &len, &pfn, &prot) != 0) {
1079		npages -= len >> MMU_PAGESHIFT;
1080		if (va >= (uintptr_t)e_moddata)
1081			obp_pages += len >> MMU_PAGESHIFT;
1082		va += len;
1083	}
1084	PRM_DEBUG(npages);
1085	PRM_DEBUG(obp_pages);
1086
1087	/*
1088	 * If physmem is patched to be non-zero, use it instead of the computed
1089	 * value unless it is larger than the actual amount of memory on hand.
1090	 */
1091	if (physmem == 0 || physmem > npages) {
1092		physmem = npages;
1093	} else if (physmem < npages) {
1094		orig_npages = npages;
1095		npages = physmem;
1096	}
1097	PRM_DEBUG(physmem);
1098
	/*
	 * We now compute the sizes of all the initial allocations for
	 * structures the kernel needs in order to do kmem_alloc(). These
	 * include:
	 *	memsegs
	 *	memlists
	 *	page hash table
	 *	page_t's
	 *	page coloring data structs
	 */
1109	memseg_sz = sizeof (struct memseg) * (memblocks + POSS_NEW_FRAGMENTS);
1110	ADD_TO_ALLOCATIONS(memseg_base, memseg_sz);
1111	PRM_DEBUG(memseg_sz);
1112
	/*
	 * Reserve space for memlists. There's no really good way to know
	 * exactly how much room we'll need, but this should be a good
	 * upper bound.
	 */
1117	memlist_sz = ROUND_UP_PAGE(2 * sizeof (struct memlist) *
1118	    (memblocks + POSS_NEW_FRAGMENTS));
1119	ADD_TO_ALLOCATIONS(memlist, memlist_sz);
1120	PRM_DEBUG(memlist_sz);
1121
1122	/*
1123	 * Reserve space for bios reserved memlists.
1124	 */
1125	rsvdmemlist_sz = ROUND_UP_PAGE(2 * sizeof (struct memlist) *
1126	    (rsvdmemblocks + POSS_NEW_FRAGMENTS));
1127	ADD_TO_ALLOCATIONS(bios_rsvd, rsvdmemlist_sz);
1128	PRM_DEBUG(rsvdmemlist_sz);
1129
1130	/* LINTED */
1131	ASSERT(P2SAMEHIGHBIT((1 << PP_SHIFT), sizeof (struct page)));
1132	/*
1133	 * The page structure hash table size is a power of 2
1134	 * such that the average hash chain length is PAGE_HASHAVELEN.
1135	 */
1136	page_hashsz = npages / PAGE_HASHAVELEN;
1137	page_hashsz_shift = highbit(page_hashsz);
1138	page_hashsz = 1 << page_hashsz_shift;
1139	pagehash_sz = sizeof (struct page *) * page_hashsz;
1140	ADD_TO_ALLOCATIONS(page_hash, pagehash_sz);
1141	PRM_DEBUG(pagehash_sz);
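
	/*
	 * Illustrative sizing: with npages of 0x100000 (4gb of 4k pages) and
	 * a PAGE_HASHAVELEN of, say, 4, npages / PAGE_HASHAVELEN is 0x40000;
	 * highbit(0x40000) is 19, so the table gets 1 << 19 = 0x80000 buckets
	 * (4mb of pointers on a 64-bit kernel).
	 */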
1142
1143	/*
1144	 * Set aside room for the page structures themselves.
1145	 */
1146	PRM_DEBUG(npages);
1147	pp_sz = sizeof (struct page) * npages;
1148	ADD_TO_ALLOCATIONS(pp_base, pp_sz);
1149	PRM_DEBUG(pp_sz);
1150
1151	/*
1152	 * determine l2 cache info and memory size for page coloring
1153	 */
1154	(void) getl2cacheinfo(CPU,
1155	    &l2cache_sz, &l2cache_linesz, &l2cache_assoc);
1156	pagecolor_memsz =
1157	    page_coloring_init(l2cache_sz, l2cache_linesz, l2cache_assoc);
1158	ADD_TO_ALLOCATIONS(pagecolor_mem, pagecolor_memsz);
1159	PRM_DEBUG(pagecolor_memsz);
1160
1161	page_ctrs_size = page_ctrs_sz();
1162	ADD_TO_ALLOCATIONS(page_ctrs_mem, page_ctrs_size);
1163	PRM_DEBUG(page_ctrs_size);
1164
1165	/*
1166	 * Allocate the array that protects pp->p_selock.
1167	 */
1168	pse_shift = size_pse_array(physmem, max_ncpus);
1169	pse_table_size = 1 << pse_shift;
1170	pse_table_alloc_size = pse_table_size * sizeof (pad_mutex_t);
1171	ADD_TO_ALLOCATIONS(pse_mutex, pse_table_alloc_size);
1172
1173#if defined(__amd64)
1174	valloc_sz = ROUND_UP_LPAGE(valloc_sz);
1175	valloc_base = VALLOC_BASE;
1176
1177	/*
1178	 * The default values of VALLOC_BASE and SEGKPM_BASE should work
1179	 * for values of physmax up to 1 Terabyte. They need adjusting when
1180	 * memory is at addresses above 1 TB. When adjusted, segkpm_base must
1181	 * be aligned on KERNEL_REDZONE_SIZE boundary (span of top level pte).
1182	 */
1183	if (physmax + 1 > mmu_btop(TERABYTE) ||
1184	    plat_dr_physmax > mmu_btop(TERABYTE)) {
1185		uint64_t kpm_resv_amount = mmu_ptob(physmax + 1);
1186
1187		if (kpm_resv_amount < mmu_ptob(plat_dr_physmax)) {
1188			kpm_resv_amount = mmu_ptob(plat_dr_physmax);
1189		}
1190
1191		segkpm_base = -(P2ROUNDUP((2 * kpm_resv_amount),
1192		    KERNEL_REDZONE_SIZE));	/* down from top VA */
1193
1194		/* make sure we leave some space for user apps above hole */
1195		segkpm_base = MAX(segkpm_base, AMD64_VA_HOLE_END + TERABYTE);
1196		if (segkpm_base > SEGKPM_BASE)
1197			segkpm_base = SEGKPM_BASE;
1198		PRM_DEBUG(segkpm_base);
1199
1200		valloc_base = segkpm_base + P2ROUNDUP(kpm_resv_amount, ONE_GIG);
1201		if (valloc_base < segkpm_base)
1202			panic("not enough kernel VA to support memory size");
1203		PRM_DEBUG(valloc_base);
1204	}
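	/*
	 * Illustrative example, assuming KERNEL_REDZONE_SIZE is the 512gb
	 * span of a top level pte: with 4tb of physical memory,
	 * kpm_resv_amount is 4tb, segkpm_base becomes -(8tb), i.e.
	 * 0xFFFFF800.00000000 (above AMD64_VA_HOLE_END + 1tb and below the
	 * default SEGKPM_BASE), and valloc_base becomes segkpm_base + 4tb,
	 * i.e. 0xFFFFFC00.00000000.
	 */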
1205#else	/* __i386 */
1206	valloc_base = (uintptr_t)(MISC_VA_BASE - valloc_sz);
1207	valloc_base = P2ALIGN(valloc_base, mmu.level_size[1]);
1208	PRM_DEBUG(valloc_base);
1209#endif	/* __i386 */
1210
1211	/*
1212	 * do all the initial allocations
1213	 */
1214	perform_allocations();
1215
1216	/*
1217	 * Build phys_install and phys_avail in kernel memspace.
1218	 * - phys_install should be all memory in the system.
1219	 * - phys_avail is phys_install minus any memory mapped before this
1220	 *    point above KERNEL_TEXT.
1221	 */
1222	current = phys_install = memlist;
1223	copy_memlist_filter(bootops->boot_mem->physinstalled, &current, NULL);
1224	if ((caddr_t)current > (caddr_t)memlist + memlist_sz)
1225		panic("physinstalled was too big!");
1226	if (prom_debug)
1227		print_memlist("phys_install", phys_install);
1228
1229	phys_avail = current;
1230	PRM_POINT("Building phys_avail:\n");
1231	copy_memlist_filter(bootops->boot_mem->physinstalled, &current,
1232	    avail_filter);
1233	if ((caddr_t)current > (caddr_t)memlist + memlist_sz)
1234		panic("physavail was too big!");
1235	if (prom_debug)
1236		print_memlist("phys_avail", phys_avail);
1237#ifndef	__xpv
1238	/*
1239	 * Free unused memlist items, which may be used by memory DR driver
1240	 * at runtime.
1241	 */
1242	if ((caddr_t)current < (caddr_t)memlist + memlist_sz) {
1243		memlist_free_block((caddr_t)current,
1244		    (caddr_t)memlist + memlist_sz - (caddr_t)current);
1245	}
1246#endif
1247
1248	/*
1249	 * Build bios reserved memspace
1250	 */
1251	current = bios_rsvd;
1252	copy_memlist_filter(bootops->boot_mem->rsvdmem, &current, NULL);
1253	if ((caddr_t)current > (caddr_t)bios_rsvd + rsvdmemlist_sz)
1254		panic("bios_rsvd was too big!");
1255	if (prom_debug)
1256		print_memlist("bios_rsvd", bios_rsvd);
1257#ifndef	__xpv
1258	/*
1259	 * Free unused memlist items, which may be used by memory DR driver
1260	 * at runtime.
1261	 */
1262	if ((caddr_t)current < (caddr_t)bios_rsvd + rsvdmemlist_sz) {
1263		memlist_free_block((caddr_t)current,
1264		    (caddr_t)bios_rsvd + rsvdmemlist_sz - (caddr_t)current);
1265	}
1266#endif
1267
1268	/*
1269	 * setup page coloring
1270	 */
1271	page_coloring_setup(pagecolor_mem);
1272	page_lock_init();	/* currently a no-op */
1273
1274	/*
1275	 * free page list counters
1276	 */
1277	(void) page_ctrs_alloc(page_ctrs_mem);
1278
1279	/*
1280	 * Size the pcf array based on the number of cpus in the box at
1281	 * boot time.
1282	 */
1283
1284	pcf_init();
1285
1286	/*
1287	 * Initialize the page structures from the memory lists.
1288	 */
1289	availrmem_initial = availrmem = freemem = 0;
1290	PRM_POINT("Calling kphysm_init()...");
1291	npages = kphysm_init(pp_base, npages);
1292	PRM_POINT("kphysm_init() done");
1293	PRM_DEBUG(npages);
1294
1295	init_debug_info();
1296
1297	/*
1298	 * Now that page_t's have been initialized, remove all the
1299	 * initial allocation pages from the kernel free page lists.
1300	 */
1301	boot_mapin((caddr_t)valloc_base, valloc_sz);
1302	boot_mapin((caddr_t)MISC_VA_BASE, MISC_VA_SIZE);
1303	PRM_POINT("startup_memlist() done");
1304
1305	PRM_DEBUG(valloc_sz);
1306
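	/*
	 * Note: availrmem >> (30 - MMU_PAGESHIFT) converts pages to
	 * gigabytes (with 4k pages this is >> 18), so the check below fires
	 * once at least textrepl_min_gb (10) gigabytes are available and the
	 * L2 cache is no larger than 2mb.
	 */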
1307#if defined(__amd64)
1308	if ((availrmem >> (30 - MMU_PAGESHIFT)) >=
1309	    textrepl_min_gb && l2cache_sz <= 2 << 20) {
1310		extern size_t textrepl_size_thresh;
1311		textrepl_size_thresh = (16 << 20) - 1;
1312	}
1313#endif
1314}
1315
/*
 * Lay out the kernel's part of the address space and initialize the kmem
 * allocator.
 */
1319static void
1320startup_kmem(void)
1321{
1322	extern void page_set_colorequiv_arr(void);
1323
1324	PRM_POINT("startup_kmem() starting...");
1325
1326#if defined(__amd64)
1327	if (eprom_kernelbase && eprom_kernelbase != KERNELBASE)
1328		cmn_err(CE_NOTE, "!kernelbase cannot be changed on 64-bit "
1329		    "systems.");
1330	kernelbase = segkpm_base - KERNEL_REDZONE_SIZE;
1331	core_base = (uintptr_t)COREHEAP_BASE;
1332	core_size = (size_t)MISC_VA_BASE - COREHEAP_BASE;
1333#else	/* __i386 */
	/*
	 * We configure kernelbase based on:
	 *
	 * 1. User-specified kernelbase via the eeprom command. The value
	 *    cannot exceed KERNELBASE_MAX; we large-page align
	 *    eprom_kernelbase.
	 *
	 * 2. Default to KERNELBASE and adjust down by 2X the size needed
	 *    for page_t's.  On large memory systems we must lower kernelbase
	 *    to allow enough room for page_t's for all of memory.
	 *
	 * The value set here might be changed a little later.
	 */
1346	if (eprom_kernelbase) {
1347		kernelbase = eprom_kernelbase & mmu.level_mask[1];
1348		if (kernelbase > KERNELBASE_MAX)
1349			kernelbase = KERNELBASE_MAX;
1350	} else {
1351		kernelbase = (uintptr_t)KERNELBASE;
1352		kernelbase -= ROUND_UP_4MEG(2 * valloc_sz);
1353	}
1354	ASSERT((kernelbase & mmu.level_offset[1]) == 0);
1355	core_base = valloc_base;
1356	core_size = 0;
1357#endif	/* __i386 */
1358
1359	PRM_DEBUG(core_base);
1360	PRM_DEBUG(core_size);
1361	PRM_DEBUG(kernelbase);
1362
1363#if defined(__i386)
1364	segkp_fromheap = 1;
1365#endif	/* __i386 */
1366
1367	ekernelheap = (char *)core_base;
1368	PRM_DEBUG(ekernelheap);
1369
1370	/*
1371	 * Now that we know the real value of kernelbase,
1372	 * update variables that were initialized with a value of
1373	 * KERNELBASE (in common/conf/param.c).
1374	 *
1375	 * XXX	The problem with this sort of hackery is that the
1376	 *	compiler just may feel like putting the const declarations
1377	 *	(in param.c) into the .text section.  Perhaps they should
1378	 *	just be declared as variables there?
1379	 */
1380
1381	*(uintptr_t *)&_kernelbase = kernelbase;
1382	*(uintptr_t *)&_userlimit = kernelbase;
1383#if defined(__amd64)
1384	*(uintptr_t *)&_userlimit -= KERNELBASE - USERLIMIT;
1385#else
1386	*(uintptr_t *)&_userlimit32 = _userlimit;
1387#endif
1388	PRM_DEBUG(_kernelbase);
1389	PRM_DEBUG(_userlimit);
1390	PRM_DEBUG(_userlimit32);
1391
1392	layout_kernel_va();
1393
1394#if defined(__i386)
1395	/*
1396	 * If segmap is too large we can push the bottom of the kernel heap
1397	 * higher than the base.  Or worse, it could exceed the top of the
1398	 * VA space entirely, causing it to wrap around.
1399	 */
1400	if (kernelheap >= ekernelheap || (uintptr_t)kernelheap < kernelbase)
1401		panic("too little address space available for kernelheap,"
1402		    " use eeprom for lower kernelbase or smaller segmapsize");
1403#endif	/* __i386 */
1404
1405	/*
1406	 * Initialize the kernel heap. Note 3rd argument must be > 1st.
1407	 */
1408	kernelheap_init(kernelheap, ekernelheap,
1409	    kernelheap + MMU_PAGESIZE,
1410	    (void *)core_base, (void *)(core_base + core_size));
1411
1412#if defined(__xpv)
1413	/*
1414	 * Link pending events struct into cpu struct
1415	 */
1416	CPU->cpu_m.mcpu_evt_pend = &cpu0_evt_data;
1417#endif
1418	/*
1419	 * Initialize kernel memory allocator.
1420	 */
1421	kmem_init();
1422
1423	/*
1424	 * Factor in colorequiv to check additional 'equivalent' bins
1425	 */
1426	page_set_colorequiv_arr();
1427
1428	/*
1429	 * print this out early so that we know what's going on
1430	 */
1431	print_x86_featureset(x86_featureset);
1432
1433	/*
1434	 * Initialize bp_mapin().
1435	 */
1436	bp_init(MMU_PAGESIZE, HAT_STORECACHING_OK);
1437
1438	/*
1439	 * orig_npages is non-zero if physmem has been configured for less
1440	 * than the available memory.
1441	 */
1442	if (orig_npages) {
1443		cmn_err(CE_WARN, "!%slimiting physmem to 0x%lx of 0x%lx pages",
1444		    (npages == PHYSMEM ? "Due to virtual address space " : ""),
1445		    npages, orig_npages);
1446	}
1447#if defined(__i386)
1448	if (eprom_kernelbase && (eprom_kernelbase != kernelbase))
1449		cmn_err(CE_WARN, "kernelbase value, User specified 0x%lx, "
1450		    "System using 0x%lx",
1451		    (uintptr_t)eprom_kernelbase, (uintptr_t)kernelbase);
1452#endif
1453
1454#ifdef	KERNELBASE_ABI_MIN
1455	if (kernelbase < (uintptr_t)KERNELBASE_ABI_MIN) {
1456		cmn_err(CE_NOTE, "!kernelbase set to 0x%lx, system is not "
1457		    "i386 ABI compliant.", (uintptr_t)kernelbase);
1458	}
1459#endif
1460
1461#ifndef __xpv
1462	if (plat_dr_support_memory()) {
1463		mem_config_init();
1464	}
1465#else	/* __xpv */
1466	/*
1467	 * Some of the xen start information has to be relocated up
1468	 * into the kernel's permanent address space.
1469	 */
1470	PRM_POINT("calling xen_relocate_start_info()");
1471	xen_relocate_start_info();
1472	PRM_POINT("xen_relocate_start_info() done");
1473
1474	/*
1475	 * (Update the vcpu pointer in our cpu structure to point into
1476	 * the relocated shared info.)
1477	 */
1478	CPU->cpu_m.mcpu_vcpu_info =
1479	    &HYPERVISOR_shared_info->vcpu_info[CPU->cpu_id];
1480#endif	/* __xpv */
1481
1482	PRM_POINT("startup_kmem() done");
1483}
1484
1485#ifndef __xpv
1486/*
1487 * If we have detected that we are running in an HVM environment, we need
1488 * to prepend the PV driver directory to the module search path.
1489 */
1490#define	HVM_MOD_DIR "/platform/i86hvm/kernel"
1491static void
1492update_default_path()
1493{
1494	char *current, *newpath;
1495	int newlen;
1496
1497	/*
1498	 * We are about to resync with krtld.  krtld will reset its
1499	 * internal module search path iff Solaris has set default_path.
1500	 * We want to be sure we're prepending this new directory to the
1501	 * right search path.
1502	 */
1503	current = (default_path == NULL) ? kobj_module_path : default_path;
1504
1505	newlen = strlen(HVM_MOD_DIR) + strlen(current) + 2;
1506	newpath = kmem_alloc(newlen, KM_SLEEP);
1507	(void) strcpy(newpath, HVM_MOD_DIR);
1508	(void) strcat(newpath, " ");
1509	(void) strcat(newpath, current);
1510
1511	default_path = newpath;
1512}
1513#endif
1514
1515static void
1516startup_modules(void)
1517{
1518	int cnt;
1519	extern void prom_setup(void);
1520	int32_t v, h;
1521	char d[11];
1522	char *cp;
1523	cmi_hdl_t hdl;
1524
1525	PRM_POINT("startup_modules() starting...");
1526
1527#ifndef __xpv
	/*
	 * Initialize the ten-microsecond timer so that drivers will
	 * not get shortchanged in their init phase. This was
	 * not getting called until clkinit, which on fast CPUs
	 * caused drv_usecwait to be way too short.
	 */
1534	microfind();
1535
1536	if (get_hwenv() == HW_XEN_HVM)
1537		update_default_path();
1538#endif
1539
1540	/*
1541	 * Read the GMT lag from /etc/rtc_config.
1542	 */
1543	sgmtl(process_rtc_config_file());
1544
1545	/*
1546	 * Calculate default settings of system parameters based upon
1547	 * maxusers, yet allow to be overridden via the /etc/system file.
1548	 */
1549	param_calc(0);
1550
1551	mod_setup();
1552
1553	/*
1554	 * Initialize system parameters.
1555	 */
1556	param_init();
1557
1558	/*
1559	 * Initialize the default brands
1560	 */
1561	brand_init();
1562
1563	/*
1564	 * maxmem is the amount of physical memory we're playing with.
1565	 */
1566	maxmem = physmem;
1567
1568	/*
1569	 * Initialize segment management stuff.
1570	 */
1571	seg_init();
1572
1573	if (modload("fs", "specfs") == -1)
1574		halt("Can't load specfs");
1575
1576	if (modload("fs", "devfs") == -1)
1577		halt("Can't load devfs");
1578
1579	if (modload("fs", "dev") == -1)
1580		halt("Can't load dev");
1581
1582	if (modload("fs", "procfs") == -1)
1583		halt("Can't load procfs");
1584
1585	(void) modloadonly("sys", "lbl_edition");
1586
1587	dispinit();
1588
1589	/*
1590	 * This is needed here to initialize hw_serial[] for cluster booting.
1591	 */
1592	if ((h = set_soft_hostid()) == HW_INVALID_HOSTID) {
1593		cmn_err(CE_WARN, "Unable to set hostid");
1594	} else {
1595		for (v = h, cnt = 0; cnt < 10; cnt++) {
1596			d[cnt] = (char)(v % 10);
1597			v /= 10;
1598			if (v == 0)
1599				break;
1600		}
1601		for (cp = hw_serial; cnt >= 0; cnt--)
1602			*cp++ = d[cnt] + '0';
1603		*cp = 0;
1604	}
1605
1606	/* Read cluster configuration data. */
1607	clconf_init();
1608
1609#if defined(__xpv)
1610	(void) ec_init();
1611	gnttab_init();
1612	(void) xs_early_init();
1613#endif /* __xpv */
1614
1615	/*
1616	 * Create a kernel device tree. First, create rootnex and
1617	 * then invoke bus specific code to probe devices.
1618	 */
1619	setup_ddi();
1620
1621#ifdef __xpv
1622	if (DOMAIN_IS_INITDOMAIN(xen_info))
1623#endif
1624	{
1625		/*
1626		 * Load the System Management BIOS into the global ksmbios
1627		 * handle, if an SMBIOS is present on this system.
1628		 */
1629		ksmbios = smbios_open(NULL, SMB_VERSION, ksmbios_flags, NULL);
1630	}
1631
1632
	/*
	 * Set up the CPU module subsystem for the boot cpu in the native
	 * case, and for all physical cpu resources in the xpv dom0 case.
	 * This modifies the device tree, so it must be done after
	 * setup_ddi().
	 */
1639#ifdef __xpv
1640	/*
1641	 * If paravirtualized and on dom0 then we initialize all physical
1642	 * cpu handles now;  if paravirtualized on a domU then do not
1643	 * initialize.
1644	 */
1645	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
1646		xen_mc_lcpu_cookie_t cpi;
1647
1648		for (cpi = xen_physcpu_next(NULL); cpi != NULL;
1649		    cpi = xen_physcpu_next(cpi)) {
1650			if ((hdl = cmi_init(CMI_HDL_SOLARIS_xVM_MCA,
1651			    xen_physcpu_chipid(cpi), xen_physcpu_coreid(cpi),
1652			    xen_physcpu_strandid(cpi))) != NULL &&
1653			    is_x86_feature(x86_featureset, X86FSET_MCA))
1654				cmi_mca_init(hdl);
1655		}
1656	}
1657#else
	/*
	 * Initialize a handle for the boot cpu - others will initialize
	 * as they start up.  Do not do this if we know we are in an HVM domU.
	 */
1662	if ((get_hwenv() != HW_XEN_HVM) &&
1663	    (hdl = cmi_init(CMI_HDL_NATIVE, cmi_ntv_hwchipid(CPU),
1664	    cmi_ntv_hwcoreid(CPU), cmi_ntv_hwstrandid(CPU))) != NULL &&
1665	    is_x86_feature(x86_featureset, X86FSET_MCA)) {
1666			cmi_mca_init(hdl);
1667			CPU->cpu_m.mcpu_cmi_hdl = hdl;
1668	}
1669#endif	/* __xpv */
1670
1671	/*
1672	 * Fake a prom tree such that /dev/openprom continues to work
1673	 */
1674	PRM_POINT("startup_modules: calling prom_setup...");
1675	prom_setup();
1676	PRM_POINT("startup_modules: done");
1677
1678	/*
1679	 * Load all platform specific modules
1680	 */
1681	PRM_POINT("startup_modules: calling psm_modload...");
1682	psm_modload();
1683
1684	PRM_POINT("startup_modules() done");
1685}
1686
1687/*
1688 * claim a "setaside" boot page for use in the kernel
1689 */
1690page_t *
1691boot_claim_page(pfn_t pfn)
1692{
1693	page_t *pp;
1694
1695	pp = page_numtopp_nolock(pfn);
1696	ASSERT(pp != NULL);
1697
1698	if (PP_ISBOOTPAGES(pp)) {
1699		if (pp->p_next != NULL)
1700			pp->p_next->p_prev = pp->p_prev;
1701		if (pp->p_prev == NULL)
1702			bootpages = pp->p_next;
1703		else
1704			pp->p_prev->p_next = pp->p_next;
1705	} else {
1706		/*
1707		 * htable_attach() expects a base pagesize page
1708		 */
1709		if (pp->p_szc != 0)
1710			page_boot_demote(pp);
1711		pp = page_numtopp(pfn, SE_EXCL);
1712	}
1713	return (pp);
1714}
1715
1716/*
1717 * Walk through the pagetables looking for pages mapped in by boot.  If the
1718 * setaside flag is set the pages are expected to be returned to the
1719 * kernel later in boot, so we add them to the bootpages list.
1720 */
1721static void
1722protect_boot_range(uintptr_t low, uintptr_t high, int setaside)
1723{
1724	uintptr_t va = low;
1725	size_t len;
1726	uint_t prot;
1727	pfn_t pfn;
1728	page_t *pp;
1729	pgcnt_t boot_protect_cnt = 0;
1730
1731	while (kbm_probe(&va, &len, &pfn, &prot) != 0 && va < high) {
1732		if (va + len >= high)
1733			panic("0x%lx byte mapping at 0x%p exceeds boot's "
1734			    "legal range.", len, (void *)va);
1735
1736		while (len > 0) {
1737			pp = page_numtopp_alloc(pfn);
1738			if (pp != NULL) {
1739				if (setaside == 0)
1740					panic("Unexpected mapping by boot.  "
1741					    "addr=%p pfn=%lx\n",
1742					    (void *)va, pfn);
1743
1744				pp->p_next = bootpages;
1745				pp->p_prev = NULL;
1746				PP_SETBOOTPAGES(pp);
1747				if (bootpages != NULL) {
1748					bootpages->p_prev = pp;
1749				}
1750				bootpages = pp;
1751				++boot_protect_cnt;
1752			}
1753
1754			++pfn;
1755			len -= MMU_PAGESIZE;
1756			va += MMU_PAGESIZE;
1757		}
1758	}
1759	PRM_DEBUG(boot_protect_cnt);
1760}
1761
/*
 * Lay out the kernel's portion of the virtual address space.
 */
1765static void
1766layout_kernel_va(void)
1767{
1768	PRM_POINT("layout_kernel_va() starting...");
1769	/*
1770	 * Establish the final size of the kernel's heap, size of segmap,
1771	 * segkp, etc.
1772	 */
1773
1774#if defined(__amd64)
1775
1776	kpm_vbase = (caddr_t)segkpm_base;
1777	if (physmax + 1 < plat_dr_physmax) {
1778		kpm_size = ROUND_UP_LPAGE(mmu_ptob(plat_dr_physmax));
1779	} else {
1780		kpm_size = ROUND_UP_LPAGE(mmu_ptob(physmax + 1));
1781	}
1782	if ((uintptr_t)kpm_vbase + kpm_size > (uintptr_t)valloc_base)
1783		panic("not enough room for kpm!");
1784	PRM_DEBUG(kpm_size);
1785	PRM_DEBUG(kpm_vbase);
1786
	/*
	 * By default we create a seg_kp in 64 bit kernels; it's a little
	 * faster to access than embedding it in the heap.
	 */
1791	segkp_base = (caddr_t)valloc_base + valloc_sz;
1792	if (!segkp_fromheap) {
1793		size_t sz = mmu_ptob(segkpsize);
1794
1795		/*
1796		 * determine size of segkp
1797		 */
1798		if (sz < SEGKPMINSIZE || sz > SEGKPMAXSIZE) {
1799			sz = SEGKPDEFSIZE;
1800			cmn_err(CE_WARN, "!Illegal value for segkpsize. "
1801			    "segkpsize has been reset to %ld pages",
1802			    mmu_btop(sz));
1803		}
1804		sz = MIN(sz, MAX(SEGKPMINSIZE, mmu_ptob(physmem)));
1805
1806		segkpsize = mmu_btop(ROUND_UP_LPAGE(sz));
1807	}
1808	PRM_DEBUG(segkp_base);
1809	PRM_DEBUG(segkpsize);
1810
	/*
	 * segzio is used for ZFS cached data. It uses a VA segment distinct
	 * from the kernel heap so that we can easily tell not to include it
	 * in kernel crash dumps on 64 bit kernels. The trick is to give it
	 * lots of VA, but not constrain the kernel heap.
	 * We scale the size of segzio linearly with physmem up to
	 * SEGZIOMAXSIZE. Above that amount it scales at 50% of physmem.
	 */
1819	segzio_base = segkp_base + mmu_ptob(segkpsize);
1820	if (segzio_fromheap) {
1821		segziosize = 0;
1822	} else {
1823		size_t physmem_size = mmu_ptob(physmem);
1824		size_t size = (segziosize == 0) ?
1825		    physmem_size : mmu_ptob(segziosize);
1826
1827		if (size < SEGZIOMINSIZE)
1828			size = SEGZIOMINSIZE;
1829		if (size > SEGZIOMAXSIZE) {
1830			size = SEGZIOMAXSIZE;
1831			if (physmem_size > size)
1832				size += (physmem_size - size) / 2;
1833		}
1834		segziosize = mmu_btop(ROUND_UP_LPAGE(size));
1835	}
1836	PRM_DEBUG(segziosize);
1837	PRM_DEBUG(segzio_base);
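
	/*
	 * Illustrative example: if SEGZIOMAXSIZE were 512gb and physmem were
	 * 1tb, the sizing above would give segzio 512gb + (1tb - 512gb) / 2 =
	 * 768gb of VA.
	 */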
1838
1839	/*
1840	 * Put the range of VA for device mappings next, kmdb knows to not
1841	 * grep in this range of addresses.
1842	 */
1843	toxic_addr =
1844	    ROUND_UP_LPAGE((uintptr_t)segzio_base + mmu_ptob(segziosize));
1845	PRM_DEBUG(toxic_addr);
1846	segmap_start = ROUND_UP_LPAGE(toxic_addr + toxic_size);
1847#else /* __i386 */
1848	segmap_start = ROUND_UP_LPAGE(kernelbase);
1849#endif /* __i386 */
1850	PRM_DEBUG(segmap_start);
1851
1852	/*
1853	 * Users can change segmapsize through eeprom. If the variable
1854	 * is tuned through eeprom, there is no upper bound on the
1855	 * size of segmap.
1856	 */
1857	segmapsize = MAX(ROUND_UP_LPAGE(segmapsize), SEGMAPDEFAULT);
1858
1859#if defined(__i386)
1860	/*
1861	 * 32-bit systems don't have segkpm or segkp, so segmap appears at
1862	 * the bottom of the kernel's address range.  Set aside space for a
1863	 * small red zone just below the start of segmap.
1864	 */
1865	segmap_start += KERNEL_REDZONE_SIZE;
1866	segmapsize -= KERNEL_REDZONE_SIZE;
1867#endif
1868
1869	PRM_DEBUG(segmap_start);
1870	PRM_DEBUG(segmapsize);
1871	kernelheap = (caddr_t)ROUND_UP_LPAGE(segmap_start + segmapsize);
1872	PRM_DEBUG(kernelheap);
1873	PRM_POINT("layout_kernel_va() done...");
1874}
1875
1876/*
1877 * Finish initializing the VM system, now that we are no longer
1878 * relying on the boot time memory allocators.
1879 */
1880static void
1881startup_vm(void)
1882{
1883	struct segmap_crargs a;
1884
1885	extern int use_brk_lpg, use_stk_lpg;
1886
1887	PRM_POINT("startup_vm() starting...");
1888
1889	/*
1890	 * Initialize the hat layer.
1891	 */
1892	hat_init();
1893
1894	/*
1895	 * Do final allocations of HAT data structures that need to
1896	 * be allocated before quiescing the boot loader.
1897	 */
1898	PRM_POINT("Calling hat_kern_alloc()...");
1899	hat_kern_alloc((caddr_t)segmap_start, segmapsize, ekernelheap);
1900	PRM_POINT("hat_kern_alloc() done");
1901
1902#ifndef __xpv
1903	/*
1904	 * Setup Page Attribute Table
1905	 */
1906	pat_sync();
1907#endif
1908
1909	/*
1910	 * The next two loops are done in distinct steps in order
1911	 * to be sure that any page that is doubly mapped (both above
1912	 * KERNEL_TEXT and below kernelbase) is dealt with correctly.
1913	 * Note this may never happen, but it might someday.
1914	 */
1915	bootpages = NULL;
1916	PRM_POINT("Protecting boot pages");
1917
1918	/*
1919	 * Protect any pages mapped above KERNEL_TEXT that somehow have
1920	 * page_t's. This can only happen if something weird allocated
1921	 * in this range (like kadb/kmdb).
1922	 */
1923	protect_boot_range(KERNEL_TEXT, (uintptr_t)-1, 0);
1924
1925	/*
1926	 * Before we can take over memory allocation/mapping from the boot
1927	 * loader we must remove from our free page lists any boot allocated
1928	 * pages that stay mapped until release_bootstrap().
1929	 */
1930	protect_boot_range(0, kernelbase, 1);
1931
1932
1933	/*
1934	 * Switch to running on regular HAT (not boot_mmu)
1935	 */
1936	PRM_POINT("Calling hat_kern_setup()...");
1937	hat_kern_setup();
1938
1939	/*
1940	 * It is no longer safe to call BOP_ALLOC(), so make sure we don't.
1941	 */
1942	bop_no_more_mem();
1943
1944	PRM_POINT("hat_kern_setup() done");
1945
1946	hat_cpu_online(CPU);
1947
1948	/*
1949	 * Initialize VM system
1950	 */
1951	PRM_POINT("Calling kvm_init()...");
1952	kvm_init();
1953	PRM_POINT("kvm_init() done");
1954
1955	/*
1956	 * Tell kmdb that the VM system is now working
1957	 */
1958	if (boothowto & RB_DEBUG)
1959		kdi_dvec_vmready();
1960
1961#if defined(__xpv)
1962	/*
1963	 * Populate the I/O pool on domain 0
1964	 */
1965	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
1966		extern long populate_io_pool(void);
1967		long init_io_pool_cnt;
1968
1969		PRM_POINT("Populating reserve I/O page pool");
1970		init_io_pool_cnt = populate_io_pool();
1971		PRM_DEBUG(init_io_pool_cnt);
1972	}
1973#endif
1974	/*
1975	 * Mangle the brand string etc.
1976	 */
1977	cpuid_pass3(CPU);
1978
1979#if defined(__amd64)
1980
1981	/*
1982	 * Create the device arena for toxic (to dtrace/kmdb) mappings.
1983	 */
1984	device_arena = vmem_create("device", (void *)toxic_addr,
1985	    toxic_size, MMU_PAGESIZE, NULL, NULL, NULL, 0, VM_SLEEP);
1986
1987#else	/* __i386 */
1988
1989	/*
1990	 * allocate the bit map that tracks toxic pages
1991	 */
1992	toxic_bit_map_len = btop((ulong_t)(valloc_base - kernelbase));
1993	PRM_DEBUG(toxic_bit_map_len);
1994	toxic_bit_map =
1995	    kmem_zalloc(BT_SIZEOFMAP(toxic_bit_map_len), KM_NOSLEEP);
1996	ASSERT(toxic_bit_map != NULL);
1997	PRM_DEBUG(toxic_bit_map);
1998
1999#endif	/* __i386 */
2000
2001
2002	/*
2003	 * Now that we've got more VA, as well as the ability to allocate from
2004	 * it, tell the debugger.
2005	 */
2006	if (boothowto & RB_DEBUG)
2007		kdi_dvec_memavail();
2008
2009	/*
2010	 * The following code installs a special page fault handler (#pf)
2011	 * to work around a pentium bug.
2012	 */
2013#if !defined(__amd64) && !defined(__xpv)
2014	if (x86_type == X86_TYPE_P5) {
2015		desctbr_t idtr;
2016		gate_desc_t *newidt;
2017
2018		if ((newidt = kmem_zalloc(MMU_PAGESIZE, KM_NOSLEEP)) == NULL)
2019			panic("failed to install pentium_pftrap");
2020
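		/*
		 * Copy the existing IDT and point the page fault gate at the
		 * workaround handler.
		 */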
2021		bcopy(idt0, newidt, NIDT * sizeof (*idt0));
2022		set_gatesegd(&newidt[T_PGFLT], &pentium_pftrap,
2023		    KCS_SEL, SDT_SYSIGT, TRP_KPL, 0);
2024
2025		(void) as_setprot(&kas, (caddr_t)newidt, MMU_PAGESIZE,
2026		    PROT_READ | PROT_EXEC);
2027
2028		CPU->cpu_idt = newidt;
2029		idtr.dtr_base = (uintptr_t)CPU->cpu_idt;
2030		idtr.dtr_limit = (NIDT * sizeof (*idt0)) - 1;
2031		wr_idtr(&idtr);
2032	}
2033#endif	/* !__amd64 */
2034
2035#if !defined(__xpv)
2036	/*
2037	 * Map page pfn=0 for drivers, such as kd, that need to pick up
2038	 * parameters left there by controllers/BIOS.
2039	 */
	PRM_POINT("setting up p0_va");
2041	p0_va = i86devmap(0, 1, PROT_READ);
2042	PRM_DEBUG(p0_va);
2043#endif
2044
2045	cmn_err(CE_CONT, "?mem = %luK (0x%lx)\n",
2046	    physinstalled << (MMU_PAGESHIFT - 10), ptob(physinstalled));
2047
2048	/*
2049	 * disable automatic large pages for small memory systems or
2050	 * when the disable flag is set.
2051	 *
2052	 * Do not yet consider page sizes larger than 2m/4m.
2053	 */
2054	if (!auto_lpg_disable && mmu.max_page_level > 0) {
2055		max_uheap_lpsize = LEVEL_SIZE(1);
2056		max_ustack_lpsize = LEVEL_SIZE(1);
2057		max_privmap_lpsize = LEVEL_SIZE(1);
2058		max_uidata_lpsize = LEVEL_SIZE(1);
2059		max_utext_lpsize = LEVEL_SIZE(1);
2060		max_shm_lpsize = LEVEL_SIZE(1);
2061	}
2062	if (physmem < privm_lpg_min_physmem || mmu.max_page_level == 0 ||
2063	    auto_lpg_disable) {
2064		use_brk_lpg = 0;
2065		use_stk_lpg = 0;
2066	}
2067	mcntl0_lpsize = LEVEL_SIZE(mmu.umax_page_level);
2068
2069	PRM_POINT("Calling hat_init_finish()...");
2070	hat_init_finish();
2071	PRM_POINT("hat_init_finish() done");
2072
2073	/*
2074	 * Initialize the segkp segment type.
2075	 */
2076	rw_enter(&kas.a_lock, RW_WRITER);
2077	PRM_POINT("Attaching segkp");
2078	if (segkp_fromheap) {
2079		segkp->s_as = &kas;
2080	} else if (seg_attach(&kas, (caddr_t)segkp_base, mmu_ptob(segkpsize),
2081	    segkp) < 0) {
2082		panic("startup: cannot attach segkp");
2083		/*NOTREACHED*/
2084	}
2085	PRM_POINT("Doing segkp_create()");
2086	if (segkp_create(segkp) != 0) {
2087		panic("startup: segkp_create failed");
2088		/*NOTREACHED*/
2089	}
2090	PRM_DEBUG(segkp);
2091	rw_exit(&kas.a_lock);
2092
2093	/*
2094	 * kpm segment
2095	 */
2096	segmap_kpm = 0;
2097	if (kpm_desired) {
2098		kpm_init();
2099		kpm_enable = 1;
2100	}
2101
2102	/*
2103	 * Now create segmap segment.
2104	 */
2105	rw_enter(&kas.a_lock, RW_WRITER);
2106	if (seg_attach(&kas, (caddr_t)segmap_start, segmapsize, segmap) < 0) {
2107		panic("cannot attach segmap");
2108		/*NOTREACHED*/
2109	}
2110	PRM_DEBUG(segmap);
2111
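	/*
	 * segmap creation arguments: allow read/write access and use the
	 * (possibly boot-tuned) number of freelists.
	 */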
2112	a.prot = PROT_READ | PROT_WRITE;
2113	a.shmsize = 0;
2114	a.nfreelist = segmapfreelists;
2115
2116	if (segmap_create(segmap, (caddr_t)&a) != 0)
2117		panic("segmap_create segmap");
2118	rw_exit(&kas.a_lock);
2119
2120	setup_vaddr_for_ppcopy(CPU);
2121
2122	segdev_init();
2123#if defined(__xpv)
2124	if (DOMAIN_IS_INITDOMAIN(xen_info))
2125#endif
2126		pmem_init();
2127
2128	PRM_POINT("startup_vm() done");
2129}
2130
2131/*
2132 * Load a tod module for the non-standard tod part found on this system.
2133 */
2134static void
2135load_tod_module(char *todmod)
2136{
2137	if (modload("tod", todmod) == -1)
2138		halt("Can't load TOD module");
2139}
2140
2141static void
2142startup_end(void)
2143{
2144	int i;
2145	extern void setx86isalist(void);
2146	extern void cpu_event_init(void);
2147
2148	PRM_POINT("startup_end() starting...");
2149
2150	/*
2151	 * Perform tasks that get done after most of the VM
2152	 * initialization has been done but before the clock
2153	 * and other devices get started.
2154	 */
2155	kern_setup1();
2156
2157	/*
2158	 * Perform CPC initialization for this CPU.
2159	 */
2160	kcpc_hw_init(CPU);
2161
2162	/*
2163	 * Initialize cpu event framework.
2164	 */
2165	cpu_event_init();
2166
2167#if defined(OPTERON_WORKAROUND_6323525)
2168	if (opteron_workaround_6323525)
2169		patch_workaround_6323525();
2170#endif
2171	/*
2172	 * If needed, load TOD module now so that ddi_get_time(9F) etc. work
2173	 * (For now, "needed" is defined as set tod_module_name in /etc/system)
2174	 */
2175	if (tod_module_name != NULL) {
2176		PRM_POINT("load_tod_module()");
2177		load_tod_module(tod_module_name);
2178	}
2179
2180#if defined(__xpv)
2181	/*
2182	 * Forceload interposing TOD module for the hypervisor.
2183	 */
2184	PRM_POINT("load_tod_module()");
2185	load_tod_module("xpvtod");
2186#endif
2187
2188	/*
2189	 * Configure the system.
2190	 */
2191	PRM_POINT("Calling configure()...");
2192	configure();		/* set up devices */
2193	PRM_POINT("configure() done");
2194
2195	/*
2196	 * We can now setup for XSAVE because fpu_probe is done in configure().
2197	 */
2198	if (fp_save_mech == FP_XSAVE) {
2199		xsave_setup_msr(CPU);
2200	}
2201
2202	/*
2203	 * Set the isa_list string to the defined instruction sets we
2204	 * support.
2205	 */
2206	setx86isalist();
2207	cpu_intr_alloc(CPU, NINTR_THREADS);
2208	psm_install();
2209
2210	/*
2211	 * We're done with bootops.  We don't unmap the bootstrap yet because
2212	 * we're still using bootsvcs.
2213	 */
2214	PRM_POINT("NULLing out bootops");
2215	*bootopsp = (struct bootops *)NULL;
2216	bootops = (struct bootops *)NULL;
2217
2218#if defined(__xpv)
2219	ec_init_debug_irq();
2220	xs_domu_init();
2221#endif
2222
2223#if defined(__amd64) && !defined(__xpv)
2224	/*
2225	 * Intel IOMMU has been setup/initialized in ddi_impl.c
2226	 * Start it up now.
2227	 */
2228	immu_startup();
2229#endif
2230
2231	PRM_POINT("Enabling interrupts");
2232	(*picinitf)();
2233	sti();
2234#if defined(__xpv)
2235	ASSERT(CPU->cpu_m.mcpu_vcpu_info->evtchn_upcall_mask == 0);
2236	xen_late_startup();
2237#endif
2238
2239	(void) add_avsoftintr((void *)&softlevel1_hdl, 1, softlevel1,
2240	    "softlevel1", NULL, NULL); /* XXX to be moved later */
2241
2242	/*
2243	 * Register these software interrupts for ddi timer.
2244	 * Software interrupts up to the level 10 are supported.
2245	 */
2246	for (i = DDI_IPL_1; i <= DDI_IPL_10; i++) {
2247		char name[sizeof ("timer_softintr") + 2];
2248		(void) sprintf(name, "timer_softintr%02d", i);
2249		(void) add_avsoftintr((void *)&softlevel_hdl[i-1], i,
2250		    (avfunc)timer_softintr, name, (caddr_t)(uintptr_t)i, NULL);
2251	}
2252
2253#if !defined(__xpv)
2254	if (modload("drv", "amd_iommu") < 0) {
2255		PRM_POINT("No AMD IOMMU present\n");
2256	} else if (ddi_hold_installed_driver(ddi_name_to_major(
2257	    "amd_iommu")) == NULL) {
2258		prom_printf("ERROR: failed to attach AMD IOMMU\n");
2259	}
2260#endif
2261	post_startup_cpu_fixups();
2262
2263	PRM_POINT("startup_end() done");
2264}
2265
2266/*
2267 * Don't remove the following 2 variables.  They are necessary
2268 * for reading the hostid from the legacy file (/kernel/misc/sysinit).
2269 */
2270char *_hs1107 = hw_serial;
2271ulong_t  _bdhs34;
2272
2273void
2274post_startup(void)
2275{
2276	extern void cpupm_init(cpu_t *);
2277	extern void cpu_event_init_cpu(cpu_t *);
2278
2279	/*
2280	 * Set the system wide, processor-specific flags to be passed
2281	 * to userland via the aux vector for performance hints and
2282	 * instruction set extensions.
2283	 */
2284	bind_hwcap();
2285
2286#ifdef __xpv
2287	if (DOMAIN_IS_INITDOMAIN(xen_info))
2288#endif
2289	{
2290#if defined(__xpv)
2291		xpv_panic_init();
2292#else
2293		/*
2294		 * Startup the memory scrubber.
2295		 * XXPV	This should be running somewhere ..
2296		 */
2297		if (get_hwenv() != HW_XEN_HVM)
2298			memscrub_init();
2299#endif
2300	}
2301
2302	/*
2303	 * Complete CPU module initialization
2304	 */
2305	cmi_post_startup();
2306
2307	/*
2308	 * Perform forceloading tasks for /etc/system.
2309	 */
2310	(void) mod_sysctl(SYS_FORCELOAD, NULL);
2311
2312	/*
2313	 * ON4.0: Force /proc module in until clock interrupt handle fixed
2314	 * ON4.0: This must be fixed or restated in /etc/systems.
2315	 */
2316	(void) modload("fs", "procfs");
2317
2318	(void) i_ddi_attach_hw_nodes("pit_beep");
2319
2320#if defined(__i386)
2321	/*
2322	 * Check for required functional Floating Point hardware,
2323	 * unless FP hardware explicitly disabled.
2324	 */
2325	if (fpu_exists && (fpu_pentium_fdivbug || fp_kind == FP_NO))
2326		halt("No working FP hardware found");
2327#endif
2328
2329	maxmem = freemem;
2330
2331	cpu_event_init_cpu(CPU);
2332	cpupm_init(CPU);
2333	(void) mach_cpu_create_device_node(CPU, NULL);
2334
2335	pg_init();
2336}
2337
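/*
 * Return non-zero if pp's page frame number falls within the physical
 * address range [low_addr, high_addr).
 */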
2338static int
2339pp_in_range(page_t *pp, uint64_t low_addr, uint64_t high_addr)
2340{
2341	return ((pp->p_pagenum >= btop(low_addr)) &&
2342	    (pp->p_pagenum < btopr(high_addr)));
2343}
2344
2345void
2346release_bootstrap(void)
2347{
2348	int root_is_ramdisk;
2349	page_t *pp;
2350	extern void kobj_boot_unmountroot(void);
2351	extern dev_t rootdev;
2352#if !defined(__xpv)
2353	pfn_t	pfn;
2354#endif
2355
2356	/* unmount boot ramdisk and release kmem usage */
2357	kobj_boot_unmountroot();
2358
2359	/*
2360	 * We're finished using the boot loader so free its pages.
2361	 */
2362	PRM_POINT("Unmapping lower boot pages");
2363
2364	clear_boot_mappings(0, _userlimit);
2365
2366	postbootkernelbase = kernelbase;
2367
2368	/*
2369	 * If root isn't on ramdisk, destroy the hardcoded
2370	 * ramdisk node now and release the memory. Else,
2371	 * ramdisk memory is kept in rd_pages.
2372	 */
2373	root_is_ramdisk = (getmajor(rootdev) == ddi_name_to_major("ramdisk"));
2374	if (!root_is_ramdisk) {
2375		dev_info_t *dip = ddi_find_devinfo("ramdisk", -1, 0);
2376		ASSERT(dip && ddi_get_parent(dip) == ddi_root_node());
2377		ndi_rele_devi(dip);	/* held from ddi_find_devinfo */
2378		(void) ddi_remove_child(dip, 0);
2379	}
2380
2381	PRM_POINT("Releasing boot pages");
2382	while (bootpages) {
2383		extern uint64_t ramdisk_start, ramdisk_end;
2384		pp = bootpages;
2385		bootpages = pp->p_next;
2386
2387
2388		/* Keep pages for the lower 64K */
2389		if (pp_in_range(pp, 0, 0x40000)) {
2390			pp->p_next = lower_pages;
2391			lower_pages = pp;
2392			lower_pages_count++;
2393			continue;
2394		}
2395
2396
2397		if (root_is_ramdisk && pp_in_range(pp, ramdisk_start,
2398		    ramdisk_end)) {
2399			pp->p_next = rd_pages;
2400			rd_pages = pp;
2401			continue;
2402		}
2403		pp->p_next = (struct page *)0;
2404		pp->p_prev = (struct page *)0;
2405		PP_CLRBOOTPAGES(pp);
2406		page_free(pp, 1);
2407	}
2408	PRM_POINT("Boot pages released");
2409
2410#if !defined(__xpv)
2411/* XXPV -- note this following bunch of code needs to be revisited in Xen 3.0 */
2412	/*
2413	 * Find 1 page below 1 MB so that other processors can boot up or
2414	 * so that any processor can resume.
2415	 * Make sure it has a kernel VA as well as a 1:1 mapping.
2416	 * We should have just free'd one up.
2417	 */
2418
2419	/*
2420	 * 0x10 pages is 64K.  Leave the bottom 64K alone
2421	 * for BIOS.
2422	 */
2423	for (pfn = 0x10; pfn < btop(1*1024*1024); pfn++) {
2424		if (page_numtopp_alloc(pfn) == NULL)
2425			continue;
2426		rm_platter_va = i86devmap(pfn, 1,
2427		    PROT_READ | PROT_WRITE | PROT_EXEC);
2428		rm_platter_pa = ptob(pfn);
2429		break;
2430	}
2431	if (pfn == btop(1*1024*1024) && use_mp)
2432		panic("No page below 1M available for starting "
2433		    "other processors or for resuming from system-suspend");
2434#endif	/* !__xpv */
2435}
2436
2437/*
2438 * Initialize the platform-specific parts of a page_t.
2439 */
2440void
2441add_physmem_cb(page_t *pp, pfn_t pnum)
2442{
2443	pp->p_pagenum = pnum;
2444	pp->p_mapping = NULL;
2445	pp->p_embed = 0;
2446	pp->p_share = 0;
2447	pp->p_mlentry = 0;
2448}
2449
2450/*
2451 * kphysm_init() initializes physical memory.
2452 */
2453static pgcnt_t
2454kphysm_init(
2455	page_t *pp,
2456	pgcnt_t npages)
2457{
2458	struct memlist	*pmem;
2459	struct memseg	*cur_memseg;
2460	pfn_t		base_pfn;
2461	pfn_t		end_pfn;
2462	pgcnt_t		num;
2463	pgcnt_t		pages_done = 0;
2464	uint64_t	addr;
2465	uint64_t	size;
2466	extern pfn_t	ddiphysmin;
2467	extern int	mnode_xwa;
2468	int		ms = 0, me = 0;
2469
2470	ASSERT(page_hash != NULL && page_hashsz != 0);
2471
2472	cur_memseg = memseg_base;
2473	for (pmem = phys_avail; pmem && npages; pmem = pmem->ml_next) {
2474		/*
2475		 * In a 32 bit kernel can't use higher memory if we're
2476		 * not booting in PAE mode. This check takes care of that.
2477		 */
2478		addr = pmem->ml_address;
2479		size = pmem->ml_size;
2480		if (btop(addr) > physmax)
2481			continue;
2482
2483		/*
2484		 * align addr and size - they may not be at page boundaries
2485		 */
2486		if ((addr & MMU_PAGEOFFSET) != 0) {
2487			addr += MMU_PAGEOFFSET;
2488			addr &= ~(uint64_t)MMU_PAGEOFFSET;
2489			size -= addr - pmem->ml_address;
2490		}
2491
2492		/* only process pages below or equal to physmax */
2493		if ((btop(addr + size) - 1) > physmax)
2494			size = ptob(physmax - btop(addr) + 1);
2495
2496		num = btop(size);
2497		if (num == 0)
2498			continue;
2499
2500		if (num > npages)
2501			num = npages;
2502
2503		npages -= num;
2504		pages_done += num;
2505		base_pfn = btop(addr);
2506
2507		if (prom_debug)
2508			prom_printf("MEMSEG addr=0x%" PRIx64
2509			    " pgs=0x%lx pfn 0x%lx-0x%lx\n",
2510			    addr, num, base_pfn, base_pfn + num);
2511
2512		/*
2513		 * Ignore pages below ddiphysmin to simplify ddi memory
2514		 * allocation with non-zero addr_lo requests.
2515		 */
2516		if (base_pfn < ddiphysmin) {
2517			if (base_pfn + num <= ddiphysmin)
2518				continue;
2519			pp += (ddiphysmin - base_pfn);
2520			num -= (ddiphysmin - base_pfn);
2521			base_pfn = ddiphysmin;
2522		}
2523
2524		/*
2525		 * mnode_xwa is greater than 1 when large pages regions can
2526		 * cross memory node boundaries. To prevent the formation
2527		 * of these large pages, configure the memsegs based on the
2528		 * memory node ranges which had been made non-contiguous.
2529		 */
2530		if (mnode_xwa > 1) {
2531
2532			end_pfn = base_pfn + num - 1;
2533			ms = PFN_2_MEM_NODE(base_pfn);
2534			me = PFN_2_MEM_NODE(end_pfn);
2535
2536			if (ms != me) {
2537				/*
2538				 * current range spans more than 1 memory node.
2539				 * Set num to only the pfn range in the start
2540				 * memory node.
2541				 */
2542				num = mem_node_config[ms].physmax - base_pfn
2543				    + 1;
2544				ASSERT(end_pfn > mem_node_config[ms].physmax);
2545			}
2546		}
2547
2548		for (;;) {
2549			/*
2550			 * Build the memsegs entry
2551			 */
2552			cur_memseg->pages = pp;
2553			cur_memseg->epages = pp + num;
2554			cur_memseg->pages_base = base_pfn;
2555			cur_memseg->pages_end = base_pfn + num;
2556
2557			/*
2558			 * Insert into memseg list in decreasing pfn range
2559			 * order. Low memory is typically more fragmented such
2560			 * that this ordering keeps the larger ranges at the
2561			 * front of the list for code that searches memseg.
2562			 * This ASSERTS that the memsegs coming in from boot
2563			 * are in increasing physical address order and not
2564			 * contiguous.
2565			 */
2566			if (memsegs != NULL) {
2567				ASSERT(cur_memseg->pages_base >=
2568				    memsegs->pages_end);
2569				cur_memseg->next = memsegs;
2570			}
2571			memsegs = cur_memseg;
2572
2573			/*
2574			 * add_physmem() initializes the PSM part of the page
2575			 * struct by calling the PSM back with add_physmem_cb().
2576			 * In addition it coalesces pages into larger pages as
2577			 * it initializes them.
2578			 */
2579			add_physmem(pp, num, base_pfn);
2580			cur_memseg++;
2581			availrmem_initial += num;
2582			availrmem += num;
2583
2584			pp += num;
2585			if (ms >= me)
2586				break;
2587
2588			/* process next memory node range */
2589			ms++;
2590			base_pfn = mem_node_config[ms].physbase;
2591			num = MIN(mem_node_config[ms].physmax,
2592			    end_pfn) - base_pfn + 1;
2593		}
2594	}
2595
2596	PRM_DEBUG(availrmem_initial);
2597	PRM_DEBUG(availrmem);
2598	PRM_DEBUG(freemem);
2599	build_pfn_hash();
2600	return (pages_done);
2601}
2602
2603/*
2604 * Kernel VM initialization.
2605 */
2606static void
2607kvm_init(void)
2608{
2609	ASSERT((((uintptr_t)s_text) & MMU_PAGEOFFSET) == 0);
2610
2611	/*
2612	 * Put the kernel segments in kernel address space.
2613	 */
2614	rw_enter(&kas.a_lock, RW_WRITER);
2615	as_avlinit(&kas);
2616
2617	(void) seg_attach(&kas, s_text, e_moddata - s_text, &ktextseg);
2618	(void) segkmem_create(&ktextseg);
2619
2620	(void) seg_attach(&kas, (caddr_t)valloc_base, valloc_sz, &kvalloc);
2621	(void) segkmem_create(&kvalloc);
2622
2623	(void) seg_attach(&kas, kernelheap,
2624	    ekernelheap - kernelheap, &kvseg);
2625	(void) segkmem_create(&kvseg);
2626
2627	if (core_size > 0) {
2628		PRM_POINT("attaching kvseg_core");
2629		(void) seg_attach(&kas, (caddr_t)core_base, core_size,
2630		    &kvseg_core);
2631		(void) segkmem_create(&kvseg_core);
2632	}
2633
2634	if (segziosize > 0) {
2635		PRM_POINT("attaching segzio");
2636		(void) seg_attach(&kas, segzio_base, mmu_ptob(segziosize),
2637		    &kzioseg);
2638		(void) segkmem_zio_create(&kzioseg);
2639
2640		/* create zio area covering new segment */
2641		segkmem_zio_init(segzio_base, mmu_ptob(segziosize));
2642	}
2643
2644	(void) seg_attach(&kas, kdi_segdebugbase, kdi_segdebugsize, &kdebugseg);
2645	(void) segkmem_create(&kdebugseg);
2646
2647	rw_exit(&kas.a_lock);
2648
2649	/*
2650	 * Ensure that the red zone at kernelbase is never accessible.
2651	 */
2652	PRM_POINT("protecting redzone");
2653	(void) as_setprot(&kas, (caddr_t)kernelbase, KERNEL_REDZONE_SIZE, 0);
2654
2655	/*
2656	 * Make the text writable so that it can be hot patched by DTrace.
2657	 */
2658	(void) as_setprot(&kas, s_text, e_modtext - s_text,
2659	    PROT_READ | PROT_WRITE | PROT_EXEC);
2660
2661	/*
2662	 * Make data writable until end.
2663	 */
2664	(void) as_setprot(&kas, s_data, e_moddata - s_data,
2665	    PROT_READ | PROT_WRITE | PROT_EXEC);
2666}
2667
2668#ifndef __xpv
2669/*
2670 * Solaris adds an entry for Write Combining caching to the PAT
2671 */
2672static uint64_t pat_attr_reg = PAT_DEFAULT_ATTRIBUTE;
2673
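/*
 * pat_sync() programs this CPU's PAT MSR using the usual sequence: disable
 * caching, flush caches and TLBs, write the new PAT value, flush again,
 * then restore the original CR0 setting.
 */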
2674void
2675pat_sync(void)
2676{
2677	ulong_t	cr0, cr0_orig, cr4;
2678
2679	if (!is_x86_feature(x86_featureset, X86FSET_PAT))
2680		return;
2681	cr0_orig = cr0 = getcr0();
2682	cr4 = getcr4();
2683
2684	/* disable caching and flush all caches and TLBs */
2685	cr0 |= CR0_CD;
2686	cr0 &= ~CR0_NW;
2687	setcr0(cr0);
2688	invalidate_cache();
2689	if (cr4 & CR4_PGE) {
2690		setcr4(cr4 & ~(ulong_t)CR4_PGE);
2691		setcr4(cr4);
2692	} else {
2693		reload_cr3();
2694	}
2695
2696	/* add our entry to the PAT */
2697	wrmsr(REG_PAT, pat_attr_reg);
2698
2699	/* flush TLBs and cache again, then reenable cr0 caching */
2700	if (cr4 & CR4_PGE) {
2701		setcr4(cr4 & ~(ulong_t)CR4_PGE);
2702		setcr4(cr4);
2703	} else {
2704		reload_cr3();
2705	}
2706	invalidate_cache();
2707	setcr0(cr0_orig);
2708}
2709
2710#endif /* !__xpv */
2711
2712#if defined(_SOFT_HOSTID)
2713/*
2714 * On platforms that do not have a hardware serial number, attempt
2715 * to set one based on the contents of /etc/hostid.  If this file does
2716 * not exist, assume that we are to generate a new hostid and set
2717 * it in the kernel, for subsequent saving by a userland process
2718 * once the system is up and the root filesystem is mounted r/w.
2719 *
2720 * In order to gracefully support upgrade on OpenSolaris, if
2721 * /etc/hostid does not exist, we will attempt to get a serial number
2722 * using the legacy method (/kernel/misc/sysinit).
2723 *
2724 * In an attempt to make the hostid less prone to abuse
2725 * (for license circumvention, etc), we store it in /etc/hostid
2726 * in rot47 format.
2727 */
2728extern volatile unsigned long tenmicrodata;
2729static int atoi(char *);
2730
2731static int32_t
2732set_soft_hostid(void)
2733{
2734	struct _buf *file;
2735	char tokbuf[MAXNAMELEN];
2736	token_t token;
2737	int done = 0;
2738	u_longlong_t tmp;
2739	int i;
2740	int32_t hostid = (int32_t)HW_INVALID_HOSTID;
2741	unsigned char *c;
2742	hrtime_t tsc;
2743
2744	/*
2745	 * If /etc/hostid file not found, we'd like to get a pseudo
2746	 * random number to use at the hostid.  A nice way to do this
2747	 * is to read the real time clock.  To remain xen-compatible,
2748	 * we can't poke the real hardware, so we use tsc_read() to
2749	 * read the real time clock.  However, there is an ominous
2750	 * warning in tsc_read that says it can return zero, so we
2751	 * deal with that possibility by falling back to using the
2752	 * (hopefully random enough) value in tenmicrodata.
2753	 */
2754
2755	if ((file = kobj_open_file(hostid_file)) == (struct _buf *)-1) {
2756		/*
2757		 * hostid file not found - try to load sysinit module
2758		 * and see if it has a nonzero hostid value...use that
2759		 * instead of generating a new hostid here if so.
2760		 */
2761		if ((i = modload("misc", "sysinit")) != -1) {
2762			if (strlen(hw_serial) > 0)
2763				hostid = (int32_t)atoi(hw_serial);
2764			(void) modunload(i);
2765		}
2766		if (hostid == HW_INVALID_HOSTID) {
2767			tsc = tsc_read();
2768			if (tsc == 0)	/* tsc_read can return zero sometimes */
2769				hostid = (int32_t)tenmicrodata & 0x0CFFFFF;
2770			else
2771				hostid = (int32_t)tsc & 0x0CFFFFF;
2772		}
2773	} else {
2774		/* hostid file found */
2775		while (!done) {
2776			token = kobj_lex(file, tokbuf, sizeof (tokbuf));
2777
2778			switch (token) {
2779			case POUND:
2780				/*
2781				 * skip comments
2782				 */
2783				kobj_find_eol(file);
2784				break;
2785			case STRING:
2786				/*
2787				 * un-rot47 - obviously this
2788				 * nonsense is ascii-specific
2789				 */
2790				for (c = (unsigned char *)tokbuf;
2791				    *c != '\0'; c++) {
2792					*c += 47;
2793					if (*c > '~')
2794						*c -= 94;
2795					else if (*c < '!')
2796						*c += 94;
2797				}
2798				/*
2799				 * now we should have a real number
2800				 */
2801
2802				if (kobj_getvalue(tokbuf, &tmp) != 0)
2803					kobj_file_err(CE_WARN, file,
2804					    "Bad value %s for hostid",
2805					    tokbuf);
2806				else
2807					hostid = (int32_t)tmp;
2808
2809				break;
2810			case EOF:
2811				done = 1;
2812				/* FALLTHROUGH */
2813			case NEWLINE:
2814				kobj_newline(file);
2815				break;
2816			default:
2817				break;
2818
2819			}
2820		}
2821		if (hostid == HW_INVALID_HOSTID) /* didn't find a hostid */
2822			kobj_file_err(CE_WARN, file,
2823			    "hostid missing or corrupt");
2824
2825		kobj_close_file(file);
2826	}
2827	/*
2828	 * hostid is now the value read from /etc/hostid, or the
2829	 * new hostid we generated in this routine or HW_INVALID_HOSTID if not
2830	 * set.
2831	 */
2832	return (hostid);
2833}
2834
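/*
 * Minimal decimal string-to-integer conversion; assumes the string contains
 * only digits and performs no overflow checking.
 */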
2835static int
2836atoi(char *p)
2837{
2838	int i = 0;
2839
2840	while (*p != '\0')
2841		i = 10 * i + (*p++ - '0');
2842
2843	return (i);
2844}
2845
2846#endif /* _SOFT_HOSTID */
2847
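/*
 * Read tunables passed in from the boot loader: node and cpus-per-node
 * counts, kernelbase, segmapsize and segmapfreelists.  Fall back to the
 * defaults when a property is missing or malformed.
 */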
2848void
2849get_system_configuration(void)
2850{
2851	char	prop[32];
2852	u_longlong_t nodes_ll, cpus_pernode_ll, lvalue;
2853
2854	if (BOP_GETPROPLEN(bootops, "nodes") > sizeof (prop) ||
2855	    BOP_GETPROP(bootops, "nodes", prop) < 0 ||
2856	    kobj_getvalue(prop, &nodes_ll) == -1 ||
2857	    nodes_ll > MAXNODES ||
2858	    BOP_GETPROPLEN(bootops, "cpus_pernode") > sizeof (prop) ||
2859	    BOP_GETPROP(bootops, "cpus_pernode", prop) < 0 ||
2860	    kobj_getvalue(prop, &cpus_pernode_ll) == -1) {
2861		system_hardware.hd_nodes = 1;
2862		system_hardware.hd_cpus_per_node = 0;
2863	} else {
2864		system_hardware.hd_nodes = (int)nodes_ll;
2865		system_hardware.hd_cpus_per_node = (int)cpus_pernode_ll;
2866	}
2867
2868	if (BOP_GETPROPLEN(bootops, "kernelbase") > sizeof (prop) ||
2869	    BOP_GETPROP(bootops, "kernelbase", prop) < 0 ||
2870	    kobj_getvalue(prop, &lvalue) == -1)
2871		eprom_kernelbase = NULL;
2872	else
2873		eprom_kernelbase = (uintptr_t)lvalue;
2874
2875	if (BOP_GETPROPLEN(bootops, "segmapsize") > sizeof (prop) ||
2876	    BOP_GETPROP(bootops, "segmapsize", prop) < 0 ||
2877	    kobj_getvalue(prop, &lvalue) == -1)
2878		segmapsize = SEGMAPDEFAULT;
2879	else
2880		segmapsize = (uintptr_t)lvalue;
2881
2882	if (BOP_GETPROPLEN(bootops, "segmapfreelists") > sizeof (prop) ||
2883	    BOP_GETPROP(bootops, "segmapfreelists", prop) < 0 ||
2884	    kobj_getvalue(prop, &lvalue) == -1)
2885		segmapfreelists = 0;	/* use segmap driver default */
2886	else
2887		segmapfreelists = (int)lvalue;
2888
2889	/* physmem used to be here, but moved much earlier to fakebop.c */
2890}
2891
2892/*
2893 * Add to a memory list.
2894 * start = start of new memory segment
2895 * len = length of new memory segment in bytes
2896 * new = pointer to a new struct memlist
2897 * memlistp = memory list to which to add segment.
2898 */
2899void
2900memlist_add(
2901	uint64_t start,
2902	uint64_t len,
2903	struct memlist *new,
2904	struct memlist **memlistp)
2905{
2906	struct memlist *cur;
2907	uint64_t end = start + len;
2908
2909	new->ml_address = start;
2910	new->ml_size = len;
2911
2912	cur = *memlistp;
2913
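	/*
	 * Walk the list, which is kept sorted by address, and insert the new
	 * segment in front of the first entry that starts at or beyond its
	 * end; if there is no such entry, append it at the tail.
	 */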
2914	while (cur) {
2915		if (cur->ml_address >= end) {
2916			new->ml_next = cur;
2917			*memlistp = new;
2918			new->ml_prev = cur->ml_prev;
2919			cur->ml_prev = new;
2920			return;
2921		}
2922		ASSERT(cur->ml_address + cur->ml_size <= start);
2923		if (cur->ml_next == NULL) {
2924			cur->ml_next = new;
2925			new->ml_prev = cur;
2926			new->ml_next = NULL;
2927			return;
2928		}
2929		memlistp = &cur->ml_next;
2930		cur = cur->ml_next;
2931	}
2932}
2933
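/*
 * Create the vmem arenas used for module text and data, seeding each with
 * whatever space remains in the kernel's own text and data mappings.
 */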
2934void
2935kobj_vmem_init(vmem_t **text_arena, vmem_t **data_arena)
2936{
2937	size_t tsize = e_modtext - modtext;
2938	size_t dsize = e_moddata - moddata;
2939
2940	*text_arena = vmem_create("module_text", tsize ? modtext : NULL, tsize,
2941	    1, segkmem_alloc, segkmem_free, heaptext_arena, 0, VM_SLEEP);
2942	*data_arena = vmem_create("module_data", dsize ? moddata : NULL, dsize,
2943	    1, segkmem_alloc, segkmem_free, heap32_arena, 0, VM_SLEEP);
2944}
2945
2946caddr_t
2947kobj_text_alloc(vmem_t *arena, size_t size)
2948{
2949	return (vmem_alloc(arena, size, VM_SLEEP | VM_BESTFIT));
2950}
2951
2952/*ARGSUSED*/
2953caddr_t
2954kobj_texthole_alloc(caddr_t addr, size_t size)
2955{
2956	panic("unexpected call to kobj_texthole_alloc()");
2957	/*NOTREACHED*/
2958	return (0);
2959}
2960
2961/*ARGSUSED*/
2962void
2963kobj_texthole_free(caddr_t addr, size_t size)
2964{
2965	panic("unexpected call to kobj_texthole_free()");
2966}
2967
2968/*
2969 * This is called just after configure() in startup().
2970 *
2971 * The ISALIST concept is a bit hopeless on Intel, because
2972 * there's no guarantee of an ever-more-capable processor
2973 * given that various parts of the instruction set may appear
2974 * and disappear between different implementations.
2975 *
2976 * While it would be possible to correct it and even enhance
2977 * it somewhat, the explicit hardware capability bitmask allows
2978 * more flexibility.
2979 *
2980 * So, we just leave this alone.
2981 */
2982void
2983setx86isalist(void)
2984{
2985	char *tp;
2986	size_t len;
2987	extern char *isa_list;
2988
2989#define	TBUFSIZE	1024
2990
2991	tp = kmem_alloc(TBUFSIZE, KM_SLEEP);
2992	*tp = '\0';
2993
2994#if defined(__amd64)
2995	(void) strcpy(tp, "amd64 ");
2996#endif
2997
2998	switch (x86_vendor) {
2999	case X86_VENDOR_Intel:
3000	case X86_VENDOR_AMD:
3001	case X86_VENDOR_TM:
3002		if (is_x86_feature(x86_featureset, X86FSET_CMOV)) {
3003			/*
3004			 * Pentium Pro or later
3005			 */
3006			(void) strcat(tp, "pentium_pro");
3007			(void) strcat(tp,
3008			    is_x86_feature(x86_featureset, X86FSET_MMX) ?
3009			    "+mmx pentium_pro " : " ");
3010		}
3011		/*FALLTHROUGH*/
3012	case X86_VENDOR_Cyrix:
3013		/*
3014		 * The Cyrix 6x86 does not have any Pentium features
3015		 * accessible while not at privilege level 0.
3016		 */
3017		if (is_x86_feature(x86_featureset, X86FSET_CPUID)) {
3018			(void) strcat(tp, "pentium");
3019			(void) strcat(tp,
3020			    is_x86_feature(x86_featureset, X86FSET_MMX) ?
3021			    "+mmx pentium " : " ");
3022		}
3023		break;
3024	default:
3025		break;
3026	}
3027	(void) strcat(tp, "i486 i386 i86");
	len = strlen(tp) + 1;   /* account for NUL at end of string */
3029	isa_list = strcpy(kmem_alloc(len, KM_SLEEP), tp);
3030	kmem_free(tp, TBUFSIZE);
3031
3032#undef TBUFSIZE
3033}
3034
3035
3036#ifdef __amd64
3037
3038void *
3039device_arena_alloc(size_t size, int vm_flag)
3040{
3041	return (vmem_alloc(device_arena, size, vm_flag));
3042}
3043
3044void
3045device_arena_free(void *vaddr, size_t size)
3046{
3047	vmem_free(device_arena, vaddr, size);
3048}
3049
3050#else /* __i386 */
3051
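/*
 * On 32-bit kernels there is no separate device arena.  Instead we allocate
 * from the general kernel heap and mark the pages in toxic_bit_map so that
 * consumers such as dtrace and kmdb know to avoid them.
 */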
3052void *
3053device_arena_alloc(size_t size, int vm_flag)
3054{
3055	caddr_t	vaddr;
3056	uintptr_t v;
3057	size_t	start;
3058	size_t	end;
3059
3060	vaddr = vmem_alloc(heap_arena, size, vm_flag);
3061	if (vaddr == NULL)
3062		return (NULL);
3063
3064	v = (uintptr_t)vaddr;
3065	ASSERT(v >= kernelbase);
3066	ASSERT(v + size <= valloc_base);
3067
3068	start = btop(v - kernelbase);
3069	end = btop(v + size - 1 - kernelbase);
3070	ASSERT(start < toxic_bit_map_len);
3071	ASSERT(end < toxic_bit_map_len);
3072
3073	while (start <= end) {
3074		BT_ATOMIC_SET(toxic_bit_map, start);
3075		++start;
3076	}
3077	return (vaddr);
3078}
3079
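/*
 * Undo device_arena_alloc(): clear the corresponding toxic_bit_map bits and
 * return the VA range to the general kernel heap.
 */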
3080void
3081device_arena_free(void *vaddr, size_t size)
3082{
3083	uintptr_t v = (uintptr_t)vaddr;
3084	size_t	start;
3085	size_t	end;
3086
3087	ASSERT(v >= kernelbase);
3088	ASSERT(v + size <= valloc_base);
3089
3090	start = btop(v - kernelbase);
3091	end = btop(v + size - 1 - kernelbase);
3092	ASSERT(start < toxic_bit_map_len);
3093	ASSERT(end < toxic_bit_map_len);
3094
3095	while (start <= end) {
3096		ASSERT(BT_TEST(toxic_bit_map, start) != 0);
3097		BT_ATOMIC_CLEAR(toxic_bit_map, start);
3098		++start;
3099	}
3100	vmem_free(heap_arena, vaddr, size);
3101}
3102
3103/*
3104 * returns 1st address in range that is in device arena, or NULL
3105 * if len is not NULL it returns the length of the toxic range
3106 */
3107void *
3108device_arena_contains(void *vaddr, size_t size, size_t *len)
3109{
3110	uintptr_t v = (uintptr_t)vaddr;
3111	uintptr_t eaddr = v + size;
3112	size_t start;
3113	size_t end;
3114
3115	/*
3116	 * if called very early by kmdb, just return NULL
3117	 */
3118	if (toxic_bit_map == NULL)
3119		return (NULL);
3120
3121	/*
3122	 * First check if we're completely outside the bitmap range.
3123	 */
3124	if (v >= valloc_base || eaddr < kernelbase)
3125		return (NULL);
3126
3127	/*
3128	 * Trim ends of search to look at only what the bitmap covers.
3129	 */
3130	if (v < kernelbase)
3131		v = kernelbase;
3132	start = btop(v - kernelbase);
3133	end = btop(eaddr - kernelbase);
3134	if (end >= toxic_bit_map_len)
3135		end = toxic_bit_map_len;
3136
3137	if (bt_range(toxic_bit_map, &start, &end, end) == 0)
3138		return (NULL);
3139
3140	v = kernelbase + ptob(start);
3141	if (len != NULL)
3142		*len = ptob(end - start);
3143	return ((void *)v);
3144}
3145
3146#endif	/* __i386 */
3147