minidump_machdep.c revision 236503
/*-
 * Copyright (c) 2006 Peter Wemm
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
26
27#include <sys/cdefs.h>
28__FBSDID("$FreeBSD: head/sys/amd64/amd64/minidump_machdep.c 236503 2012-06-03 08:01:12Z avg $");
29
30#include "opt_pmap.h"
31#include "opt_watchdog.h"
32
33#include <sys/param.h>
34#include <sys/systm.h>
35#include <sys/conf.h>
36#include <sys/cons.h>
37#include <sys/kernel.h>
38#include <sys/kerneldump.h>
39#include <sys/msgbuf.h>
40#include <sys/watchdog.h>
41#include <vm/vm.h>
42#include <vm/vm_page.h>
43#include <vm/pmap.h>
44#include <machine/atomic.h>
45#include <machine/elf.h>
46#include <machine/md_var.h>
47#include <machine/vmparam.h>
48#include <machine/minidump.h>
49
50CTASSERT(sizeof(struct kerneldumpheader) == 512);
51
52/*
53 * Don't touch the first SIZEOF_METADATA bytes on the dump device. This
54 * is to protect us from metadata and to protect metadata from us.
55 */
56#define	SIZEOF_METADATA		(64*1024)
57
58#define	MD_ALIGN(x)	(((off_t)(x) + PAGE_MASK) & ~PAGE_MASK)
59#define	DEV_ALIGN(x)	(((off_t)(x) + (DEV_BSIZE-1)) & ~(DEV_BSIZE-1))
60
61uint64_t *vm_page_dump;
62int vm_page_dump_size;
63
64static struct kerneldumpheader kdh;
65static off_t dumplo;
66
67/* Handle chunked writes. */
68static size_t fragsz;
69static void *dump_va;
70static size_t counter, progress, dumpsize;
71
72CTASSERT(sizeof(*vm_page_dump) == 8);
73
74static int
75is_dumpable(vm_paddr_t pa)
76{
77	vm_page_t m;
78	int i;
79
80	if ((m = vm_phys_paddr_to_vm_page(pa)) != NULL)
81		return ((m->flags & PG_NODUMP) == 0);
82	for (i = 0; dump_avail[i] != 0 || dump_avail[i + 1] != 0; i += 2) {
83		if (pa >= dump_avail[i] && pa < dump_avail[i + 1])
84			return (1);
85	}
86	return (0);
87}
88
/* Convert a page count to megabytes, rounding up (256 pages per MB). */
#define PG2MB(pgs) (((pgs) + (1 << 8) - 1) >> 8)
90
91static int
92blk_flush(struct dumperinfo *di)
93{
94	int error;
95
96	if (fragsz == 0)
97		return (0);
98
99	error = dump_write(di, dump_va, 0, dumplo, fragsz);
100	dumplo += fragsz;
101	fragsz = 0;
102	return (error);
103}
104
/*
 * Progress buckets, one per 10% of the dump.  'visited' is set once a
 * percentage inside [min_per, max_per] has been printed, so that each
 * bucket is reported at most once.
 */
static struct {
	int min_per;
	int max_per;
	int visited;
} progress_track[10] = {
	{  0,  10, 0},
	{ 10,  20, 0},
	{ 20,  30, 0},
	{ 30,  40, 0},
	{ 40,  50, 0},
	{ 50,  60, 0},
	{ 60,  70, 0},
	{ 70,  80, 0},
	{ 80,  90, 0},
	{ 90, 100, 0}
};

/*
 * Print the completed percentage the first time it enters a new bucket.
 *
 * progress: number of bytes still to be written
 * dumpsize: total number of bytes in the dump
 */
static void
report_progress(size_t progress, size_t dumpsize)
{
	int sofar, i;

	/* Nothing meaningful to report, and avoid dividing by zero. */
	if (dumpsize == 0)
		return;

	sofar = 100 - ((progress * 100) / dumpsize);
	for (i = 0;
	    i < (int)(sizeof(progress_track) / sizeof(progress_track[0]));
	    i++) {
		if (sofar < progress_track[i].min_per ||
		    sofar > progress_track[i].max_per)
			continue;
		/* First matching bucket wins; boundary values belong to it. */
		if (progress_track[i].visited)
			return;
		progress_track[i].visited = 1;
		printf("..%d%%", sofar);
		return;
	}
}
138
139static int
140blk_write(struct dumperinfo *di, char *ptr, vm_paddr_t pa, size_t sz)
141{
142	size_t len;
143	int error, i, c;
144	u_int maxdumpsz;
145
146	maxdumpsz = min(di->maxiosize, MAXDUMPPGS * PAGE_SIZE);
147	if (maxdumpsz == 0)	/* seatbelt */
148		maxdumpsz = PAGE_SIZE;
149	error = 0;
150	if ((sz % PAGE_SIZE) != 0) {
151		printf("size not page aligned\n");
152		return (EINVAL);
153	}
154	if (ptr != NULL && pa != 0) {
155		printf("cant have both va and pa!\n");
156		return (EINVAL);
157	}
158	if (pa != 0 && (((uintptr_t)ptr) % PAGE_SIZE) != 0) {
159		printf("address not page aligned\n");
160		return (EINVAL);
161	}
162	if (ptr != NULL) {
163		/* If we're doing a virtual dump, flush any pre-existing pa pages */
164		error = blk_flush(di);
165		if (error)
166			return (error);
167	}
168	while (sz) {
169		len = maxdumpsz - fragsz;
170		if (len > sz)
171			len = sz;
172		counter += len;
173		progress -= len;
174		if (counter >> 24) {
175			report_progress(progress, dumpsize);
176			counter &= (1<<24) - 1;
177		}
178
179		wdog_kern_pat(WD_LASTVAL);
180
181		if (ptr) {
182			error = dump_write(di, ptr, 0, dumplo, len);
183			if (error)
184				return (error);
185			dumplo += len;
186			ptr += len;
187			sz -= len;
188		} else {
189			for (i = 0; i < len; i += PAGE_SIZE)
190				dump_va = pmap_kenter_temporary(pa + i, (i + fragsz) >> PAGE_SHIFT);
191			fragsz += len;
192			pa += len;
193			sz -= len;
194			if (fragsz == maxdumpsz) {
195				error = blk_flush(di);
196				if (error)
197					return (error);
198			}
199		}
200
201		/* Check for user abort. */
202		c = cncheckc();
203		if (c == 0x03)
204			return (ECANCELED);
205		if (c != -1)
206			printf(" (CTRL-C to abort) ");
207	}
208
209	return (0);
210}
211
212/* A fake page table page, to avoid having to handle both 4K and 2M pages */
213static pd_entry_t fakepd[NPDEPG];
214
215void
216minidumpsys(struct dumperinfo *di)
217{
218	uint32_t pmapsize;
219	vm_offset_t va;
220	int error;
221	uint64_t bits;
222	uint64_t *pdp, *pd, *pt, pa;
223	int i, j, k, n, bit;
224	int retry_count;
225	struct minidumphdr mdhdr;
226
227	retry_count = 0;
228 retry:
229	retry_count++;
230	counter = 0;
231	/* Walk page table pages, set bits in vm_page_dump */
232	pmapsize = 0;
233	pdp = (uint64_t *)PHYS_TO_DMAP(KPDPphys);
234	for (va = VM_MIN_KERNEL_ADDRESS; va < MAX(KERNBASE + NKPT * NBPDR,
235	    kernel_vm_end); ) {
236		/*
237		 * We always write a page, even if it is zero. Each
238		 * page written corresponds to 1GB of space
239		 */
240		pmapsize += PAGE_SIZE;
241		i = (va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1);
242		if ((pdp[i] & PG_V) == 0) {
243			va += NBPDP;
244			continue;
245		}
246
247		/*
248		 * 1GB page is represented as 512 2MB pages in a dump.
249		 */
250		if ((pdp[i] & PG_PS) != 0) {
251			va += NBPDP;
252			pa = pdp[i] & PG_PS_FRAME;
253			for (n = 0; n < NPDEPG * NPTEPG; n++) {
254				if (is_dumpable(pa))
255					dump_add_page(pa);
256				pa += PAGE_SIZE;
257			}
258			continue;
259		}
260
261		pd = (uint64_t *)PHYS_TO_DMAP(pdp[i] & PG_FRAME);
262		for (n = 0; n < NPDEPG; n++, va += NBPDR) {
263			j = (va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1);
264
265			if ((pd[j] & PG_V) == 0)
266				continue;
267
268			if ((pd[j] & PG_PS) != 0) {
269				/* This is an entire 2M page. */
270				pa = pd[j] & PG_PS_FRAME;
271				for (k = 0; k < NPTEPG; k++) {
272					if (is_dumpable(pa))
273						dump_add_page(pa);
274					pa += PAGE_SIZE;
275				}
276				continue;
277			}
278
279			pa = pd[j] & PG_FRAME;
280			/* set bit for this PTE page */
281			if (is_dumpable(pa))
282				dump_add_page(pa);
283			/* and for each valid page in this 2MB block */
284			pt = (uint64_t *)PHYS_TO_DMAP(pd[j] & PG_FRAME);
285			for (k = 0; k < NPTEPG; k++) {
286				if ((pt[k] & PG_V) == 0)
287					continue;
288				pa = pt[k] & PG_FRAME;
289				if (is_dumpable(pa))
290					dump_add_page(pa);
291			}
292		}
293	}
294
295	/* Calculate dump size. */
296	dumpsize = pmapsize;
297	dumpsize += round_page(msgbufp->msg_size);
298	dumpsize += round_page(vm_page_dump_size);
299	for (i = 0; i < vm_page_dump_size / sizeof(*vm_page_dump); i++) {
300		bits = vm_page_dump[i];
301		while (bits) {
302			bit = bsfq(bits);
303			pa = (((uint64_t)i * sizeof(*vm_page_dump) * NBBY) + bit) * PAGE_SIZE;
304			/* Clear out undumpable pages now if needed */
305			if (is_dumpable(pa)) {
306				dumpsize += PAGE_SIZE;
307			} else {
308				dump_drop_page(pa);
309			}
310			bits &= ~(1ul << bit);
311		}
312	}
313	dumpsize += PAGE_SIZE;
314
315	/* Determine dump offset on device. */
316	if (di->mediasize < SIZEOF_METADATA + dumpsize + sizeof(kdh) * 2) {
317		error = E2BIG;
318		goto fail;
319	}
320	dumplo = di->mediaoffset + di->mediasize - dumpsize;
321	dumplo -= sizeof(kdh) * 2;
322	progress = dumpsize;
323
324	/* Initialize mdhdr */
325	bzero(&mdhdr, sizeof(mdhdr));
326	strcpy(mdhdr.magic, MINIDUMP_MAGIC);
327	mdhdr.version = MINIDUMP_VERSION;
328	mdhdr.msgbufsize = msgbufp->msg_size;
329	mdhdr.bitmapsize = vm_page_dump_size;
330	mdhdr.pmapsize = pmapsize;
331	mdhdr.kernbase = VM_MIN_KERNEL_ADDRESS;
332	mdhdr.dmapbase = DMAP_MIN_ADDRESS;
333	mdhdr.dmapend = DMAP_MAX_ADDRESS;
334
335	mkdumpheader(&kdh, KERNELDUMPMAGIC, KERNELDUMP_AMD64_VERSION, dumpsize, di->blocksize);
336
337	printf("Dumping %llu out of %ju MB:", (long long)dumpsize >> 20,
338	    ptoa((uintmax_t)physmem) / 1048576);
339
340	/* Dump leader */
341	error = dump_write(di, &kdh, 0, dumplo, sizeof(kdh));
342	if (error)
343		goto fail;
344	dumplo += sizeof(kdh);
345
346	/* Dump my header */
347	bzero(&fakepd, sizeof(fakepd));
348	bcopy(&mdhdr, &fakepd, sizeof(mdhdr));
349	error = blk_write(di, (char *)&fakepd, 0, PAGE_SIZE);
350	if (error)
351		goto fail;
352
353	/* Dump msgbuf up front */
354	error = blk_write(di, (char *)msgbufp->msg_ptr, 0, round_page(msgbufp->msg_size));
355	if (error)
356		goto fail;
357
358	/* Dump bitmap */
359	error = blk_write(di, (char *)vm_page_dump, 0, round_page(vm_page_dump_size));
360	if (error)
361		goto fail;
362
363	/* Dump kernel page directory pages */
364	bzero(fakepd, sizeof(fakepd));
365	pdp = (uint64_t *)PHYS_TO_DMAP(KPDPphys);
366	for (va = VM_MIN_KERNEL_ADDRESS; va < MAX(KERNBASE + NKPT * NBPDR,
367	    kernel_vm_end); va += NBPDP) {
368		i = (va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1);
369
370		/* We always write a page, even if it is zero */
371		if ((pdp[i] & PG_V) == 0) {
372			error = blk_write(di, (char *)&fakepd, 0, PAGE_SIZE);
373			if (error)
374				goto fail;
375			/* flush, in case we reuse fakepd in the same block */
376			error = blk_flush(di);
377			if (error)
378				goto fail;
379			continue;
380		}
381
382		/* 1GB page is represented as 512 2MB pages in a dump */
383		if ((pdp[i] & PG_PS) != 0) {
384			/* PDPE and PDP have identical layout in this case */
385			fakepd[0] = pdp[i];
386			for (j = 1; j < NPDEPG; j++)
387				fakepd[j] = fakepd[j - 1] + NBPDR;
388			error = blk_write(di, (char *)&fakepd, 0, PAGE_SIZE);
389			if (error)
390				goto fail;
391			/* flush, in case we reuse fakepd in the same block */
392			error = blk_flush(di);
393			if (error)
394				goto fail;
395			bzero(fakepd, sizeof(fakepd));
396			continue;
397		}
398
399		pd = (uint64_t *)PHYS_TO_DMAP(pdp[i] & PG_FRAME);
400		error = blk_write(di, (char *)pd, 0, PAGE_SIZE);
401		if (error)
402			goto fail;
403		error = blk_flush(di);
404		if (error)
405			goto fail;
406	}
407
408	/* Dump memory chunks */
409	/* XXX cluster it up and use blk_dump() */
410	for (i = 0; i < vm_page_dump_size / sizeof(*vm_page_dump); i++) {
411		bits = vm_page_dump[i];
412		while (bits) {
413			bit = bsfq(bits);
414			pa = (((uint64_t)i * sizeof(*vm_page_dump) * NBBY) + bit) * PAGE_SIZE;
415			error = blk_write(di, 0, pa, PAGE_SIZE);
416			if (error)
417				goto fail;
418			bits &= ~(1ul << bit);
419		}
420	}
421
422	error = blk_flush(di);
423	if (error)
424		goto fail;
425
426	/* Dump trailer */
427	error = dump_write(di, &kdh, 0, dumplo, sizeof(kdh));
428	if (error)
429		goto fail;
430	dumplo += sizeof(kdh);
431
432	/* Signal completion, signoff and exit stage left. */
433	dump_write(di, NULL, 0, 0, 0);
434	printf("\nDump complete\n");
435	return;
436
437 fail:
438	if (error < 0)
439		error = -error;
440
441	printf("\n");
442	if (error == ENOSPC) {
443		printf("Dump map grown while dumping. ");
444		if (retry_count < 5) {
445			printf("Retrying...\n");
446			goto retry;
447		}
448		printf("Dump failed.\n");
449	}
450	else if (error == ECANCELED)
451		printf("Dump aborted\n");
452	else if (error == E2BIG)
453		printf("Dump failed. Partition too small.\n");
454	else
455		printf("** DUMP FAILED (ERROR %d) **\n", error);
456}
457
458void
459dump_add_page(vm_paddr_t pa)
460{
461	int idx, bit;
462
463	pa >>= PAGE_SHIFT;
464	idx = pa >> 6;		/* 2^6 = 64 */
465	bit = pa & 63;
466	atomic_set_long(&vm_page_dump[idx], 1ul << bit);
467}
468
469void
470dump_drop_page(vm_paddr_t pa)
471{
472	int idx, bit;
473
474	pa >>= PAGE_SHIFT;
475	idx = pa >> 6;		/* 2^6 = 64 */
476	bit = pa & 63;
477	atomic_clear_long(&vm_page_dump[idx], 1ul << bit);
478}
479