1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
23 */
24
25#include <sys/types.h>
26#include <sys/clock.h>
27#include <sys/psm.h>
28#include <sys/archsystm.h>
29#include <sys/machsystm.h>
30#include <sys/compress.h>
31#include <sys/modctl.h>
32#include <sys/trap.h>
33#include <sys/panic.h>
34#include <sys/regset.h>
35#include <sys/frame.h>
36#include <sys/kobj.h>
37#include <sys/apic.h>
38#include <sys/apic_timer.h>
39#include <sys/dumphdr.h>
40#include <sys/mem.h>
41#include <sys/x86_archext.h>
42#include <sys/xpv_panic.h>
43#include <sys/boot_console.h>
44#include <sys/bootsvcs.h>
45#include <sys/consdev.h>
46#include <vm/hat_pte.h>
47#include <vm/hat_i86.h>
48
/*
 * XPV_FILENAME is the path of the Xen image that init_xen_module() opens
 * to read section headers, symbols and CTF data.  XPV_MODNAME is the name
 * given to the fake module that describes the hypervisor.
 */
/* XXX: need to add a PAE version too, if we ever support both PAE and non */
#if defined(__i386)
#define	XPV_FILENAME	"/boot/xen-syms"
#else
#define	XPV_FILENAME	"/boot/amd64/xen-syms"
#endif
#define	XPV_MODNAME	"xpv"

/* Incremented on every entry to xpv_do_panic(); non-zero once it has run. */
int xpv_panicking = 0;

/*
 * The fake module/modctl pair describing Xen.  Both are set by
 * init_xen_module() on success and linked into the module list by
 * xpv_do_panic().
 */
struct module *xpv_module;
struct modctl *xpv_modctl;

/*
 * Round x up to a multiple of a.  An alignment of 0 returns x unchanged.
 * The mask arithmetic is only correct when a is a power of two.
 */
#define	ALIGN(x, a)	((a) == 0 ? (uintptr_t)(x) : \
	(((uintptr_t)(x) + (uintptr_t)(a) - 1l) & ~((uintptr_t)(a) - 1l)))
64
/* Pointer to the xpv_panic_info structure handed to us by Xen.  */
static struct panic_info *xpv_panic_info = NULL;

/* Timer support */
#define	NSEC_SHIFT 5
#define	T_XPV_TIMER	0xd1	/* IDT vector used for the APIC timer */
#define	XPV_TIMER_INTERVAL	1000	/* 1000 microseconds */
/* Local APIC register window; taken from pi_apic in xpv_do_panic(). */
static uint32_t *xpv_apicadr = NULL;
/* TSC-to-nanosecond multiplier, set up by xpv_panic_time_init(). */
static uint_t	nsec_scale;

/* IDT support */
#pragma	align	16(xpv_panic_idt)
static gate_desc_t	xpv_panic_idt[NIDT];	/* interrupt descriptor table */

/* Xen pagetables mapped into our HAT's ptable windows */
static pfn_t ptable_pfn[MAX_NUM_LEVEL];

/* Number of MMU_PAGESIZE pages we're adding to the Solaris dump */
static int xpv_dump_pages;

/*
 * There are up to two large swathes of RAM that we don't want to include
 * in the dump: those that comprise the Xen version of segkpm.  On 32-bit
 * systems there is no such region of memory.  On 64-bit systems, there
 * should be just a single contiguous region that corresponds to all of
 * physical memory.  The tricky bit is that Xen's heap sometimes lives in
 * the middle of their segkpm, and is mapped using only kpm-like addresses.
 * In that case, we need to skip the swathes before and after Xen's heap.
 *
 * These bounds are filled in by xpv_do_panic() (amd64 only) and consulted
 * by xpv_va_walk().
 */
uintptr_t kpm1_low = 0;
uintptr_t kpm1_high = 0;
uintptr_t kpm2_low = 0;
uintptr_t kpm2_high = 0;

/*
 * Some commonly used values that we don't want to recompute over and over.
 * All three are initialized in xpv_do_panic().
 */
static int xpv_panic_nptes[MAX_NUM_LEVEL];	/* PTEs per table, by level */
static ulong_t xpv_panic_cr3;			/* %cr3 at panic time */
static uintptr_t xpv_end;			/* upper bound of Xen's VA range */

/*
 * Output routine used by the code in this file.  It starts out as the
 * private console printer and is switched to printf() by xpv_do_panic()
 * just before panicsys() is entered.
 */
static void xpv_panic_console_print(const char *fmt, ...);
static void (*xpv_panic_printf)(const char *, ...) = xpv_panic_console_print;

/* Formatting buffer for xpv_panic_console_print(); longer output truncates. */
#define	CONSOLE_BUF_SIZE	256
static char console_buffer[CONSOLE_BUF_SIZE];
/* Set by xpv_do_panic() when output should go through cons_polledio. */
static boolean_t use_polledio;

/*
 * Pointers to machine check panic info (if any).
 */
xpv_mca_panic_data_t *xpv_mca_panic_data = NULL;
117
118static void
119xpv_panic_putc(int m)
120{
121	struct cons_polledio *c = cons_polledio;
122
123	/* This really shouldn't happen */
124	if (console == CONS_HYPERVISOR)
125		return;
126
127	if (use_polledio == B_TRUE)
128		c->cons_polledio_putchar(c->cons_polledio_argument, m);
129	else
130		bcons_putchar(m);
131}
132
133static void
134xpv_panic_puts(char *msg)
135{
136	char *m;
137
138	dump_timeleft = dump_timeout;
139	for (m = msg; *m; m++)
140		xpv_panic_putc((int)*m);
141}
142
143static void
144xpv_panic_console_print(const char *fmt, ...)
145{
146	va_list ap;
147
148	va_start(ap, fmt);
149	(void) vsnprintf(console_buffer, sizeof (console_buffer), fmt, ap);
150	va_end(ap);
151
152	xpv_panic_puts(console_buffer);
153}
154
155static void
156xpv_panic_map(int level, pfn_t pfn)
157{
158	x86pte_t pte, *pteptr;
159
160	/*
161	 * The provided pfn represents a level 'level' page table.  Map it
162	 * into the 'level' slot in the list of page table windows.
163	 */
164	pteptr = (x86pte_t *)PWIN_PTE_VA(level);
165	pte = pfn_to_pa(pfn) | PT_VALID;
166
167	XPV_ALLOW_PAGETABLE_UPDATES();
168	if (mmu.pae_hat)
169		*pteptr = pte;
170	else
171		*(x86pte32_t *)pteptr = pte;
172	XPV_DISALLOW_PAGETABLE_UPDATES();
173
174	mmu_tlbflush_entry(PWIN_VA(level));
175}
176
/*
 * Walk the page tables to find the pfn mapped by the given va.
 *
 * The search starts at *vaddr and moves upward until it finds a mapped
 * page, skipping the APIC page and the two kpm-like ranges.  On success
 * *vaddr is updated to the VA that was found and the page's MFN is
 * returned with PFN_IS_FOREIGN_MFN set.  PFN_INVALID is returned once the
 * search reaches xpv_end or the VA wraps below the starting address.
 *
 * The page table windows (ptable_pfn[]) are reused across calls only when
 * the caller is scanning sequentially, i.e. the new VA is exactly one
 * page past the VA returned by the previous call.
 */
static pfn_t
xpv_va_walk(uintptr_t *vaddr)
{
	int l, idx;
	pfn_t pfn;
	x86pte_t pte;
	x86pte_t *ptep;
	uintptr_t va = *vaddr;
	uintptr_t scan_va;
	caddr_t ptable_window;
	static pfn_t toplevel_pfn;
	static uintptr_t lastva;	/* VA returned by the previous call */

	/*
	 * If we do anything other than a simple scan through memory, don't
	 * trust the mapped page tables.
	 */
	if (va != lastva + MMU_PAGESIZE)
		for (l = mmu.max_level; l >= 0; l--)
			ptable_pfn[l] = PFN_INVALID;

	toplevel_pfn = mmu_btop(xpv_panic_cr3);

	/* "va >= *vaddr" fails only if va has wrapped around. */
	while (va < xpv_end && va >= *vaddr) {
		/* Find the lowest table with any entry for va */
		pfn = toplevel_pfn;
		for (l = mmu.max_level; l >= 0; l--) {
			/* Remap this level's window only if the table changed */
			if (ptable_pfn[l] != pfn) {
				xpv_panic_map(l, pfn);
				ptable_pfn[l] = pfn;
			}

			/*
			 * Search this pagetable for any mapping to an
			 * address >= va.
			 */
			ptable_window = PWIN_VA(l);
			/*
			 * For a PAE top-level table, %cr3 may point into the
			 * middle of a page; add its offset within the page.
			 */
			if (l == mmu.max_level && mmu.pae_hat)
				ptable_window +=
				    (xpv_panic_cr3 & MMU_PAGEOFFSET);

			idx = (va >> LEVEL_SHIFT(l)) & (xpv_panic_nptes[l] - 1);
			scan_va = va;
			while (idx < xpv_panic_nptes[l] && scan_va < xpv_end &&
			    scan_va >= *vaddr) {
				ptep = (x86pte_t *)(ptable_window +
				    (idx << mmu.pte_size_shift));
				pte = GET_PTE(ptep);
				if (pte & PTE_VALID)
					break;
				idx++;
				scan_va += mmu.level_size[l];
			}

			/*
			 * If there are no valid mappings in this table, we
			 * can skip to the end of the VA range it covers.
			 */
			if (idx == xpv_panic_nptes[l]) {
				va = NEXT_ENTRY_VA(va, l + 1);
				break;
			}

			va = scan_va;
			/*
			 * See if we've hit the end of the range.
			 */
			if (va >= xpv_end || va < *vaddr)
				break;

			/*
			 * If this mapping is for a pagetable, we drop down
			 * to the next level in the hierarchy and look for
			 * a mapping in it.
			 */
			pfn = PTE2MFN(pte, l);
			if (!PTE_ISPAGE(pte, l))
				continue;

			/*
			 * The APIC page is magic.  Nothing to see here;
			 * move along.
			 */
			if (((uintptr_t)xpv_apicadr & MMU_PAGEMASK) ==
			    (va & MMU_PAGEMASK)) {
				va += MMU_PAGESIZE;
				break;
			}

			/*
			 * See if the address is within one of the two
			 * kpm-like regions we want to skip.
			 */
			if (va >= kpm1_low && va < kpm1_high) {
				va = kpm1_high;
				break;
			}
			if (va >= kpm2_low && va < kpm2_high) {
				va = kpm2_high;
				break;
			}

			/*
			 * The Xen panic code only handles small pages.  If
			 * this mapping is for a large page, we need to
			 * identify the constituent page that covers the
			 * specific VA we were looking for.
			 */
			if (l > 0) {
				if (l > 1)
					panic("Xen panic can't cope with "
					    "giant pages.");
				idx = (va >> LEVEL_SHIFT(0)) &
				    (xpv_panic_nptes[0] - 1);
				pfn += idx;
			}

			/* Found a page: report it and remember where we were */
			*vaddr = va;
			lastva = va;
			return (pfn | PFN_IS_FOREIGN_MFN);
		}
	}
	return (PFN_INVALID);
}
304
305/*
306 * Walk through the Xen VA space, finding pages that are mapped in.
307 *
308 * These pages all have MFNs rather than PFNs, meaning they may be outside
309 * the physical address space the kernel knows about, or they may collide
310 * with PFNs the kernel is using.
311 *
312 * The obvious trick of just adding the PFN_IS_FOREIGN_MFN bit to the MFNs
313 * to avoid collisions doesn't work.  The pages need to be written to disk
314 * in PFN-order or savecore gets confused.  We can't allocate memory to
315 * contruct a sorted pfn->VA reverse mapping, so we have to write the pages
316 * to disk in VA order.
317 *
318 * To square this circle, we simply make up PFNs for each of Xen's pages.
319 * We assign each mapped page a fake PFN in ascending order.  These fake
320 * PFNs each have the FOREIGN bit set, ensuring that they fall outside the
321 * range of Solaris PFNs written by the kernel.
322 */
323int
324dump_xpv_addr()
325{
326	uintptr_t va;
327	mem_vtop_t mem_vtop;
328
329	xpv_dump_pages = 0;
330	va = xen_virt_start;
331
332	while (xpv_va_walk(&va) != PFN_INVALID) {
333		mem_vtop.m_as = &kas;
334		mem_vtop.m_va = (void *)va;
335		mem_vtop.m_pfn = (pfn_t)xpv_dump_pages | PFN_IS_FOREIGN_MFN;
336
337		dumpvp_write(&mem_vtop, sizeof (mem_vtop_t));
338		xpv_dump_pages++;
339
340		va += MMU_PAGESIZE;
341	}
342
343	/*
344	 * Add the shared_info page.  This page actually ends up in the
345	 * dump twice: once for the Xen va and once for the Solaris va.
346	 * This isn't ideal, but we don't know the address Xen is using for
347	 * the page, so we can't share it.
348	 */
349	mem_vtop.m_as = &kas;
350	mem_vtop.m_va = HYPERVISOR_shared_info;
351	mem_vtop.m_pfn = (pfn_t)xpv_dump_pages | PFN_IS_FOREIGN_MFN;
352	dumpvp_write(&mem_vtop, sizeof (mem_vtop_t));
353	xpv_dump_pages++;
354
355	return (xpv_dump_pages);
356}
357
358void
359dump_xpv_pfn()
360{
361	pfn_t pfn;
362	int cnt;
363
364	for (cnt = 0; cnt < xpv_dump_pages; cnt++) {
365		pfn = (pfn_t)cnt | PFN_IS_FOREIGN_MFN;
366		dumpvp_write(&pfn, sizeof (pfn));
367	}
368}
369
370int
371dump_xpv_data(void *dump_cbuf)
372{
373	uintptr_t va;
374	uint32_t csize;
375	int cnt = 0;
376
377	/*
378	 * XXX: we should probably run this data through a UE check.  The
379	 * catch is that the UE code relies on on_trap() and getpfnum()
380	 * working.
381	 */
382	va = xen_virt_start;
383
384	while (xpv_va_walk(&va) != PFN_INVALID) {
385		csize = (uint32_t)compress((void *)va, dump_cbuf, PAGESIZE);
386		dumpvp_write(&csize, sizeof (uint32_t));
387		dumpvp_write(dump_cbuf, csize);
388		if (dump_ioerr) {
389			dumphdr->dump_flags &= ~DF_COMPLETE;
390			return (cnt);
391		}
392		cnt++;
393		va += MMU_PAGESIZE;
394	}
395
396	/*
397	 * Finally, dump the shared_info page
398	 */
399	csize = (uint32_t)compress((void *)HYPERVISOR_shared_info, dump_cbuf,
400	    PAGESIZE);
401	dumpvp_write(&csize, sizeof (uint32_t));
402	dumpvp_write(dump_cbuf, csize);
403	if (dump_ioerr)
404		dumphdr->dump_flags &= ~DF_COMPLETE;
405	cnt++;
406
407	return (cnt);
408}
409
410static void *
411showstack(void *fpreg, int xpv_only)
412{
413	struct frame *fpp;
414	ulong_t off;
415	char *sym;
416	uintptr_t pc, fp, lastfp;
417	uintptr_t minaddr = min(KERNELBASE, xen_virt_start);
418
419	fp = (uintptr_t)fpreg;
420	if (fp < minaddr) {
421		xpv_panic_printf("Bad frame ptr: 0x%p\n", fpreg);
422		return (fpreg);
423	}
424
425	do {
426		fpp = (struct frame *)fp;
427		pc = fpp->fr_savpc;
428
429		if ((xpv_only != 0) &&
430		    (fp > xpv_end || fp < xen_virt_start))
431			break;
432		if ((sym = kobj_getsymname(pc, &off)) != NULL)
433			xpv_panic_printf("%08lx %s:%s+%lx\n", fp,
434			    mod_containing_pc((caddr_t)pc), sym, off);
435		else if ((pc >= xen_virt_start) && (pc <= xpv_end))
436			xpv_panic_printf("%08lx 0x%lx (in Xen)\n", fp, pc);
437		else
438			xpv_panic_printf("%08lx %lx\n", fp, pc);
439
440		lastfp = fp;
441		fp = fpp->fr_savfp;
442
443		/*
444		 * Xen marks an exception frame by inverting the frame
445		 * pointer.
446		 */
447		if (fp < lastfp) {
448			if ((~fp > minaddr) && ((~fp) ^ lastfp) < 0xfff)
449				fp = ~fp;
450		}
451	} while (fp > lastfp);
452	return ((void *)fp);
453}
454
/*
 * Print the Xen portion of a stack trace starting at 'fpreg' and return
 * the frame pointer at which showstack() stopped.
 */
void *
xpv_traceback(void *fpreg)
{
	void *stopped_at;

	stopped_at = showstack(fpreg, 1);
	return (stopped_at);
}
460
461#if defined(__amd64)
462static void
463xpv_panic_hypercall(ulong_t call)
464{
465	panic("Illegally issued hypercall %d during panic!\n", (int)call);
466}
467#endif
468
469void
470xpv_die(struct regs *rp)
471{
472	struct panic_trap_info ti;
473	struct cregs creg;
474
475	ti.trap_regs = rp;
476	ti.trap_type = rp->r_trapno;
477
478	curthread->t_panic_trap = &ti;
479	if (ti.trap_type == T_PGFLT) {
480		getcregs(&creg);
481		ti.trap_addr = (caddr_t)creg.cr_cr2;
482		panic("Fatal pagefault at 0x%lx.  fault addr=0x%p  rp=0x%p",
483		    rp->r_pc, (void *)ti.trap_addr, (void *)rp);
484	} else {
485		ti.trap_addr = (caddr_t)rp->r_pc;
486		panic("Fatal trap %ld at 0x%lx.  rp=0x%p", rp->r_trapno,
487		    rp->r_pc, (void *)rp);
488	}
489}
490
491/*
492 * Build IDT to handle a Xen panic
493 */
494static void
495switch_to_xpv_panic_idt()
496{
497	int i;
498	desctbr_t idtr;
499	gate_desc_t *idt = xpv_panic_idt;
500	selector_t cs = get_cs_register();
501
502	for (i = 0; i < 32; i++)
503		set_gatesegd(&idt[i], &xpv_invaltrap, cs, SDT_SYSIGT, TRP_XPL,
504		    0);
505
506	set_gatesegd(&idt[T_ZERODIV], &xpv_div0trap, cs, SDT_SYSIGT, TRP_XPL,
507	    0);
508	set_gatesegd(&idt[T_SGLSTP], &xpv_dbgtrap, cs, SDT_SYSIGT, TRP_XPL, 0);
509	set_gatesegd(&idt[T_NMIFLT], &xpv_nmiint, cs, SDT_SYSIGT, TRP_XPL, 0);
510	set_gatesegd(&idt[T_BOUNDFLT], &xpv_boundstrap, cs, SDT_SYSIGT,
511	    TRP_XPL, 0);
512	set_gatesegd(&idt[T_ILLINST], &xpv_invoptrap, cs, SDT_SYSIGT, TRP_XPL,
513	    0);
514	set_gatesegd(&idt[T_NOEXTFLT], &xpv_ndptrap, cs, SDT_SYSIGT, TRP_XPL,
515	    0);
516	set_gatesegd(&idt[T_TSSFLT], &xpv_invtsstrap, cs, SDT_SYSIGT, TRP_XPL,
517	    0);
518	set_gatesegd(&idt[T_SEGFLT], &xpv_segnptrap, cs, SDT_SYSIGT, TRP_XPL,
519	    0);
520	set_gatesegd(&idt[T_STKFLT], &xpv_stktrap, cs, SDT_SYSIGT, TRP_XPL, 0);
521	set_gatesegd(&idt[T_GPFLT], &xpv_gptrap, cs, SDT_SYSIGT, TRP_XPL, 0);
522	set_gatesegd(&idt[T_PGFLT], &xpv_pftrap, cs, SDT_SYSIGT, TRP_XPL, 0);
523	set_gatesegd(&idt[T_EXTERRFLT], &xpv_ndperr, cs, SDT_SYSIGT, TRP_XPL,
524	    0);
525	set_gatesegd(&idt[T_ALIGNMENT], &xpv_achktrap, cs, SDT_SYSIGT, TRP_XPL,
526	    0);
527	set_gatesegd(&idt[T_MCE], &xpv_mcetrap, cs, SDT_SYSIGT, TRP_XPL, 0);
528	set_gatesegd(&idt[T_SIMDFPE], &xpv_xmtrap, cs, SDT_SYSIGT, TRP_XPL, 0);
529
530	/*
531	 * We have no double fault handler.  Any single fault represents a
532	 * catastrophic failure for us, so there is no attempt to handle
533	 * them cleanly: we just print a message and reboot.  If we
534	 * encounter a second fault while doing that, there is nothing
535	 * else we can do.
536	 */
537
538	/*
539	 * Be prepared to absorb any stray device interrupts received
540	 * while writing the core to disk.
541	 */
542	for (i = 33; i < NIDT; i++)
543		set_gatesegd(&idt[i], &xpv_surprise_intr, cs, SDT_SYSIGT,
544		    TRP_XPL, 0);
545
546	/* The one interrupt we expect to get is from the APIC timer.  */
547	set_gatesegd(&idt[T_XPV_TIMER], &xpv_timer_trap, cs, SDT_SYSIGT,
548	    TRP_XPL, 0);
549
550	idtr.dtr_base = (uintptr_t)xpv_panic_idt;
551	idtr.dtr_limit = sizeof (xpv_panic_idt) - 1;
552	wr_idtr(&idtr);
553
554#if defined(__amd64)
555	/* Catch any hypercalls. */
556	wrmsr(MSR_AMD_LSTAR, (uintptr_t)xpv_panic_hypercall);
557	wrmsr(MSR_AMD_CSTAR, (uintptr_t)xpv_panic_hypercall);
558#endif
559}
560
561static void
562xpv_apic_clkinit()
563{
564	uint_t		apic_ticks = 0;
565
566	/*
567	 * Measure how many APIC ticks there are within a fixed time
568	 * period.  We're going to be fairly coarse here.  This timer is
569	 * just being used to detect a stalled panic, so as long as we have
570	 * the right order of magnitude, everything should be fine.
571	 */
572	xpv_apicadr[APIC_SPUR_INT_REG] = AV_UNIT_ENABLE | APIC_SPUR_INTR;
573	xpv_apicadr[APIC_LOCAL_TIMER] = AV_MASK;
574	xpv_apicadr[APIC_INT_VECT0] = AV_MASK;	/* local intr reg 0 */
575
576	xpv_apicadr[APIC_DIVIDE_REG] = 0;
577	xpv_apicadr[APIC_INIT_COUNT] = APIC_MAXVAL;
578	drv_usecwait(XPV_TIMER_INTERVAL);
579	apic_ticks = APIC_MAXVAL - xpv_apicadr[APIC_CURR_COUNT];
580
581	/*
582	 * apic_ticks now represents roughly how many apic ticks comprise
583	 * one timeout interval.  Program the timer to send us an interrupt
584	 * every time that interval expires.
585	 */
586	xpv_apicadr[APIC_LOCAL_TIMER] = T_XPV_TIMER | AV_PERIODIC;
587	xpv_apicadr[APIC_INIT_COUNT] = apic_ticks;
588	xpv_apicadr[APIC_EOI_REG] = 0;
589}
590
591void
592xpv_timer_tick(void)
593{
594	static int ticks = 0;
595
596	if (ticks++ >= MICROSEC / XPV_TIMER_INTERVAL) {
597		ticks = 0;
598		if (dump_timeleft && (--dump_timeleft == 0))
599			panic("Xen panic timeout\n");
600	}
601	xpv_apicadr[APIC_EOI_REG] = 0;
602}
603
604void
605xpv_interrupt(void)
606{
607#ifdef	DEBUG
608	static int cnt = 0;
609
610	if (cnt++ < 10)
611		xpv_panic_printf("Unexpected interrupt received.\n");
612	if ((cnt < 1000) && ((cnt % 100) == 0))
613		xpv_panic_printf("%d unexpected interrupts received.\n", cnt);
614#endif
615
616	xpv_apicadr[APIC_EOI_REG] = 0;
617}
618
619/*
620 * Managing time in panic context is trivial.  We only have a single CPU,
621 * we never get rescheduled, we never get suspended.  We just need to
622 * convert clock ticks into nanoseconds.
623 */
624static hrtime_t
625xpv_panic_gethrtime(void)
626{
627	hrtime_t tsc, hrt;
628	unsigned int *l = (unsigned int *)&(tsc);
629
630	tsc = __rdtsc_insn();
631	hrt = (mul32(l[1], nsec_scale) << NSEC_SHIFT) +
632	    (mul32(l[0], nsec_scale) >> (32 - NSEC_SHIFT));
633
634	return (hrt);
635}
636
/*
 * Set up timekeeping for panic context: derive nsec_scale from the
 * current CPU's vcpu_info tsc_to_system_mul value (shifted right by
 * NSEC_SHIFT) and install xpv_panic_gethrtime() as the gethrtimef hook.
 */
static void
xpv_panic_time_init()
{
	nsec_scale =
	    CPU->cpu_m.mcpu_vcpu_info->time.tsc_to_system_mul >> NSEC_SHIFT;

	gethrtimef = xpv_panic_gethrtime;
}
645
/*
 * Varargs front end for panicsys(): packages the format arguments into a
 * va_list and passes them on together with the saved register set 'rp'.
 */
static void
xpv_panicsys(struct regs *rp, char *fmt, ...)
{
	extern void panicsys(const char *, va_list, struct regs *, int);
	va_list args;

	va_start(args, fmt);
	panicsys(fmt, args, rp, 1);
	va_end(args);
}
656
657void
658xpv_do_panic(void *arg)
659{
660	struct panic_info *pip = (struct panic_info *)arg;
661	int l;
662	struct cregs creg;
663#if defined(__amd64)
664	extern uintptr_t postbootkernelbase;
665#endif
666
667	if (xpv_panicking++ > 0)
668		panic("multiple calls to xpv_do_panic()");
669
670	/*
671	 * Indicate to the underlying panic framework that a panic has been
672	 * initiated.  This is ordinarily done as part of vpanic().  Since
673	 * we already have all the register state saved by the hypervisor,
674	 * we skip that and jump straight into the panic processing code.
675	 *
676	 * XXX If another thread grabs and wins the panic_quiesce trigger
677	 * then we'll have two threads in panicsys believing they are in
678	 * charge of the panic attempt!
679	 */
680	(void) panic_trigger(&panic_quiesce);
681
682#if defined(__amd64)
683	/*
684	 * bzero() and bcopy() get unhappy when asked to operate on
685	 * addresses outside of the kernel.  At this point Xen is really a
686	 * part of the kernel, so we update the routines' notion of where
687	 * the kernel starts.
688	 */
689	postbootkernelbase = xen_virt_start;
690#endif
691
692#if defined(HYPERVISOR_VIRT_END)
693	xpv_end = HYPERVISOR_VIRT_END;
694#else
695	xpv_end = (uintptr_t)UINTPTR_MAX - sizeof (uintptr_t);
696#endif
697
698	/*
699	 * If we were redirecting console output to the hypervisor, we have
700	 * to stop.
701	 */
702	use_polledio = B_FALSE;
703	if (console == CONS_HYPERVISOR) {
704		bcons_device_change(CONS_HYPERVISOR);
705	} else if (cons_polledio != NULL &&
706	    cons_polledio->cons_polledio_putchar != NULL)  {
707		if (cons_polledio->cons_polledio_enter != NULL)
708			cons_polledio->cons_polledio_enter(
709			    cons_polledio->cons_polledio_argument);
710		use_polledio = 1;
711	}
712
713	/* Make sure we handle all console output from here on. */
714	sysp->bsvc_putchar = xpv_panic_putc;
715
716	/*
717	 * If we find an unsupported panic_info structure, there's not much
718	 * we can do other than complain, plow on, and hope for the best.
719	 */
720	if (pip->pi_version != PANIC_INFO_VERSION)
721		xpv_panic_printf("Warning: Xen is using an unsupported "
722		    "version of the panic_info structure.\n");
723
724	xpv_panic_info = pip;
725
726#if defined(__amd64)
727	kpm1_low = (uintptr_t)xpv_panic_info->pi_ram_start;
728	if (xpv_panic_info->pi_xen_start == NULL) {
729		kpm1_high = (uintptr_t)xpv_panic_info->pi_ram_end;
730	} else {
731		kpm1_high = (uintptr_t)xpv_panic_info->pi_xen_start;
732		kpm2_low = (uintptr_t)xpv_panic_info->pi_xen_end;
733		kpm2_high = (uintptr_t)xpv_panic_info->pi_ram_end;
734	}
735#endif
736
737	/*
738	 * Make sure we are running on the Solaris %gs.  The Xen panic code
739	 * should already have set up the GDT properly.
740	 */
741	xpv_panic_resetgs();
742#if defined(__amd64)
743	wrmsr(MSR_AMD_GSBASE, (uint64_t)&cpus[0]);
744#endif
745
746	xpv_panic_time_init();
747
748	/*
749	 * Switch to our own IDT, avoiding any accidental returns to Xen
750	 * world.
751	 */
752	switch_to_xpv_panic_idt();
753
754	/*
755	 * Initialize the APIC timer, which is used to detect a hung dump
756	 * attempt.
757	 */
758	xpv_apicadr = pip->pi_apic;
759	xpv_apic_clkinit();
760
761	/*
762	 * Set up a few values that we'll need repeatedly.
763	 */
764	getcregs(&creg);
765	xpv_panic_cr3 = creg.cr_cr3;
766	for (l = mmu.max_level; l >= 0; l--)
767		xpv_panic_nptes[l] = mmu.ptes_per_table;
768#ifdef __i386
769	if (mmu.pae_hat)
770		xpv_panic_nptes[mmu.max_level] = 4;
771#endif
772
773	/* Add the fake Xen module to the module list */
774	if (xpv_module != NULL) {
775		extern int last_module_id;
776
777		xpv_modctl->mod_id = last_module_id++;
778		xpv_modctl->mod_next = &modules;
779		xpv_modctl->mod_prev = modules.mod_prev;
780		modules.mod_prev->mod_next = xpv_modctl;
781		modules.mod_prev = xpv_modctl;
782	}
783
784	if (pip->pi_mca.mpd_magic == MCA_PANICDATA_MAGIC)
785		xpv_mca_panic_data = &pip->pi_mca;
786
787	xpv_panic_printf = printf;
788	xpv_panicsys((struct regs *)pip->pi_regs, pip->pi_panicstr);
789	xpv_panic_printf("Failed to reboot following panic.\n");
790	for (;;)
791		;
792}
793
794/*
795 * Set up the necessary data structures to pretend that the Xen hypervisor
796 * is a loadable module, allowing mdb to find the Xen symbols in a crash
797 * dump.  Since these symbols all map to VA space Solaris doesn't normally
798 * have access to, we don't link these structures into the kernel's lists
799 * until/unless we hit a Xen panic.
800 *
801 * The observant reader will note a striking amount of overlap between this
802 * code and that found in krtld.  While it would be handy if we could just
803 * ask krtld to do this work for us, it's not that simple.  Among the
804 * complications: we're not actually loading the text here (grub did it at
805 * boot), the .text section is writable, there are no relocations to do,
806 * none of the module text/data is in readable memory, etc.  Training krtld
807 * to deal with this weird module is as complicated, and more risky, than
808 * reimplementing the necessary subset of it here.
809 */
810static void
811init_xen_module()
812{
813	struct _buf *file = NULL;
814	struct module *mp;
815	struct modctl *mcp;
816	int i, shn;
817	Shdr *shp, *ctf_shp;
818	char *names = NULL;
819	size_t n, namesize, text_align, data_align;
820#if defined(__amd64)
821	const char machine = EM_AMD64;
822#else
823	const char machine = EM_386;
824#endif
825
826	/* Allocate and init the module structure */
827	mp = kmem_zalloc(sizeof (*mp), KM_SLEEP);
828	mp->filename = kobj_zalloc(strlen(XPV_FILENAME) + 1, KM_SLEEP);
829	(void) strcpy(mp->filename, XPV_FILENAME);
830
831	/* Allocate and init the modctl structure */
832	mcp = kmem_zalloc(sizeof (*mcp), KM_SLEEP);
833	mcp->mod_modname = kobj_zalloc(strlen(XPV_MODNAME) + 1, KM_SLEEP);
834	(void) strcpy(mcp->mod_modname, XPV_MODNAME);
835	mcp->mod_filename = kobj_zalloc(strlen(XPV_FILENAME) + 1, KM_SLEEP);
836	(void) strcpy(mcp->mod_filename, XPV_FILENAME);
837	mcp->mod_inprogress_thread = (kthread_id_t)-1;
838	mcp->mod_ref = 1;
839	mcp->mod_loaded = 1;
840	mcp->mod_loadcnt = 1;
841	mcp->mod_mp = mp;
842
843	/*
844	 * Try to open a Xen image that hasn't had its symbol and CTF
845	 * information stripped off.
846	 */
847	file = kobj_open_file(XPV_FILENAME);
848	if (file == (struct _buf *)-1) {
849		file = NULL;
850		goto err;
851	}
852
853	/*
854	 * Read the header and ensure that this is an ELF file for the
855	 * proper ISA.  If it's not, somebody has done something very
856	 * stupid.  Why bother?  See Mencken.
857	 */
858	if (kobj_read_file(file, (char *)&mp->hdr, sizeof (mp->hdr), 0) < 0)
859		goto err;
860	for (i = 0; i < SELFMAG; i++)
861		if (mp->hdr.e_ident[i] != ELFMAG[i])
862			goto err;
863	if ((mp->hdr.e_ident[EI_DATA] != ELFDATA2LSB) ||
864	    (mp->hdr.e_machine != machine))
865		goto err;
866
867	/* Read in the section headers */
868	n = mp->hdr.e_shentsize * mp->hdr.e_shnum;
869	mp->shdrs = kmem_zalloc(n, KM_SLEEP);
870	if (kobj_read_file(file, mp->shdrs, n, mp->hdr.e_shoff) < 0)
871		goto err;
872
873	/* Read the section names */
874	shp = (Shdr *)(mp->shdrs + mp->hdr.e_shstrndx * mp->hdr.e_shentsize);
875	namesize = shp->sh_size;
876	names = kmem_zalloc(shp->sh_size, KM_SLEEP);
877	if (kobj_read_file(file, names, shp->sh_size, shp->sh_offset) < 0)
878		goto err;
879
880	/*
881	 * Fill in the text and data size fields.
882	 */
883	ctf_shp = NULL;
884	text_align = data_align = 0;
885	for (shn = 1; shn < mp->hdr.e_shnum; shn++) {
886		shp = (Shdr *)(mp->shdrs + shn * mp->hdr.e_shentsize);
887
888		/* Sanity check the offset of the section name */
889		if (shp->sh_name >= namesize)
890			continue;
891
892		/* If we find the symtab section, remember it for later. */
893		if (shp->sh_type == SHT_SYMTAB) {
894			mp->symtbl_section = shn;
895			mp->symhdr = shp;
896			continue;
897		}
898
899		/* If we find the CTF section, remember it for later. */
900		if ((shp->sh_size != 0) &&
901		    (strcmp(names + shp->sh_name, ".SUNW_ctf") == 0)) {
902			ctf_shp = shp;
903			continue;
904		}
905
906		if (!(shp->sh_flags & SHF_ALLOC))
907			continue;
908
909		/*
910		 * Xen marks its text section as writable, so we need to
911		 * look for the name - not just the flag.
912		 */
913		if ((strcmp(&names[shp->sh_name], ".text") != NULL) &&
914		    (shp->sh_flags & SHF_WRITE) != 0) {
915			if (shp->sh_addralign > data_align)
916				data_align = shp->sh_addralign;
917			mp->data_size = ALIGN(mp->data_size, data_align);
918			mp->data_size += ALIGN(shp->sh_size, 8);
919			if (mp->data == NULL || mp->data > (char *)shp->sh_addr)
920				mp->data = (char *)shp->sh_addr;
921		} else {
922			if (shp->sh_addralign > text_align)
923				text_align = shp->sh_addralign;
924			mp->text_size = ALIGN(mp->text_size, text_align);
925			mp->text_size += ALIGN(shp->sh_size, 8);
926			if (mp->text == NULL || mp->text > (char *)shp->sh_addr)
927				mp->text = (char *)shp->sh_addr;
928		}
929	}
930	kmem_free(names, namesize);
931	names = NULL;
932	shp = NULL;
933	mcp->mod_text = mp->text;
934	mcp->mod_text_size = mp->text_size;
935
936	/*
937	 * If we have symbol table and string table sections, read them in
938	 * now.  If we don't, we just plow on.  We'll still get a valid
939	 * core dump, but finding anything useful will be just a bit
940	 * harder.
941	 *
942	 * Note: we don't bother with a hash table.  We'll never do a
943	 * symbol lookup unless we crash, and then mdb creates its own.  We
944	 * also don't try to perform any relocations.  Xen should be loaded
945	 * exactly where the ELF file indicates, and the symbol information
946	 * in the file should be complete and correct already.  Static
947	 * linking ain't all bad.
948	 */
949	if ((mp->symhdr != NULL) && (mp->symhdr->sh_link < mp->hdr.e_shnum)) {
950		mp->strhdr = (Shdr *)
951		    (mp->shdrs + mp->symhdr->sh_link * mp->hdr.e_shentsize);
952		mp->nsyms = mp->symhdr->sh_size / mp->symhdr->sh_entsize;
953
954		/* Allocate space for the symbol table and strings.  */
955		mp->symsize = mp->symhdr->sh_size +
956		    mp->nsyms * sizeof (symid_t) + mp->strhdr->sh_size;
957		mp->symspace = kmem_zalloc(mp->symsize, KM_SLEEP);
958		mp->symtbl = mp->symspace;
959		mp->strings = (char *)(mp->symtbl + mp->symhdr->sh_size);
960
961		if ((kobj_read_file(file, mp->symtbl,
962		    mp->symhdr->sh_size, mp->symhdr->sh_offset) < 0) ||
963		    (kobj_read_file(file, mp->strings,
964		    mp->strhdr->sh_size, mp->strhdr->sh_offset) < 0))
965			goto err;
966	}
967
968	/*
969	 * Read in the CTF section
970	 */
971	if ((ctf_shp != NULL) && ((moddebug & MODDEBUG_NOCTF) == 0)) {
972		mp->ctfdata = kmem_zalloc(ctf_shp->sh_size, KM_SLEEP);
973		mp->ctfsize = ctf_shp->sh_size;
974		if (kobj_read_file(file, mp->ctfdata, mp->ctfsize,
975		    ctf_shp->sh_offset) < 0)
976			goto err;
977	}
978
979	kobj_close_file(file);
980
981	xpv_module = mp;
982	xpv_modctl = mcp;
983	return;
984
985err:
986	cmn_err(CE_WARN, "Failed to initialize xpv module.");
987	if (file != NULL)
988		kobj_close_file(file);
989
990	kmem_free(mp->filename, strlen(XPV_FILENAME) + 1);
991	if (mp->shdrs != NULL)
992		kmem_free(mp->shdrs, mp->hdr.e_shentsize * mp->hdr.e_shnum);
993	if (mp->symspace != NULL)
994		kmem_free(mp->symspace, mp->symsize);
995	if (mp->ctfdata != NULL)
996		kmem_free(mp->ctfdata, mp->ctfsize);
997	kmem_free(mp, sizeof (*mp));
998	kmem_free(mcp->mod_filename, strlen(XPV_FILENAME) + 1);
999	kmem_free(mcp->mod_modname, strlen(XPV_MODNAME) + 1);
1000	kmem_free(mcp, sizeof (*mcp));
1001	if (names != NULL)
1002		kmem_free(names, namesize);
1003}
1004
1005void
1006xpv_panic_init()
1007{
1008	xen_platform_op_t op;
1009	int i;
1010
1011	ASSERT(DOMAIN_IS_INITDOMAIN(xen_info));
1012
1013	for (i = 0; i < mmu.num_level; i++)
1014		ptable_pfn[i] = PFN_INVALID;
1015
1016	/* Let Xen know where to jump if/when it panics. */
1017	op.cmd = XENPF_panic_init;
1018	op.interface_version = XENPF_INTERFACE_VERSION;
1019	op.u.panic_init.panic_addr = (unsigned long)xpv_panic_hdlr;
1020
1021	(void) HYPERVISOR_platform_op(&op);
1022
1023	init_xen_module();
1024}
1025