cpr_dump.c revision 7240:c4957ab6a78e
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * Fill in and write out the cpr state file
 *	1. Allocate and write headers, ELF and cpr dump header
 *	2. Allocate bitmaps according to phys_install
 *	3. Tag kernel pages into corresponding bitmap
 *	4. Write bitmaps to state file
 *	5. Write actual physical page data to state file
 */

#include <sys/types.h>
#include <sys/systm.h>
#include <sys/vm.h>
#include <sys/memlist.h>
#include <sys/kmem.h>
#include <sys/vnode.h>
#include <sys/fs/ufs_inode.h>
#include <sys/errno.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <vm/page.h>
#include <vm/seg.h>
#include <vm/seg_kmem.h>
#include <vm/seg_kpm.h>
#include <vm/hat.h>
#include <sys/cpr.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/panic.h>
#include <sys/thread.h>
#include <sys/note.h>

/* Local defines and variables */
#define	BTOb(bytes)	((bytes) << 3)		/* Bytes to bits, log2(NBBY) */
#define	bTOB(bits)	((bits) >> 3)		/* bits to Bytes, log2(NBBY) */
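/*
 * For example, BTOb(2) yields 16 bits and bTOB(16) yields 2 bytes;
 * note that bTOB() truncates, so callers converting a bit count that
 * is not a multiple of NBBY must round up themselves.
 */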

#if defined(__sparc)
static uint_t cpr_pages_tobe_dumped;
static uint_t cpr_regular_pgs_dumped;
static int cpr_dump_regular_pages(vnode_t *);
static int cpr_count_upages(int, bitfunc_t);
static int cpr_compress_and_write(vnode_t *, uint_t, pfn_t, pgcnt_t);
#endif

int cpr_flush_write(vnode_t *);

int cpr_contig_pages(vnode_t *, int);

void cpr_clear_bitmaps(void);

extern size_t cpr_get_devsize(dev_t);
extern int i_cpr_dump_setup(vnode_t *);
extern int i_cpr_blockzero(char *, char **, int *, vnode_t *);
extern int cpr_test_mode;
int cpr_setbit(pfn_t, int);
int cpr_clrbit(pfn_t, int);

ctrm_t cpr_term;

char *cpr_buf, *cpr_buf_end;
int cpr_buf_blocks;		/* size of cpr_buf in blocks */
size_t cpr_buf_size;		/* size of cpr_buf in bytes */
size_t cpr_bitmap_size;
int cpr_nbitmaps;

char *cpr_pagedata;		/* page buffer for compression / tmp copy */
size_t cpr_pagedata_size;	/* page buffer size in bytes */

#if defined(__sparc)
static char *cpr_wptr;		/* keep track of where to write to next */
static int cpr_file_bn;		/* cpr state-file block offset */
static int cpr_disk_writes_ok;
static size_t cpr_dev_space = 0;
#endif

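/*
 * cpr_pagecopy is shared scratch space: cpr_write_bitmap() merges the
 * regular and volatile bitmaps through it, and on DEBUG kernels
 * cpr_compress_pages() uses it to checksum and verify page data.
 */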
char cpr_pagecopy[CPR_MAXCONTIG * MMU_PAGESIZE];

#if defined(__sparc)
/*
 * On some platforms bcopy may modify the thread structure
 * during bcopy (e.g., to prevent cpu migration).  If the
 * range we are currently writing out includes our own
 * thread structure then it will be snapshotted by bcopy
 * including those modified members - and the updates made
 * on exit from bcopy will no longer be seen when we later
 * restore the mid-bcopy kthread_t.  So if the range we
 * need to copy overlaps with our thread structure we will
 * use a simple byte copy.
 */
void
cprbcopy(void *from, void *to, size_t bytes)
{
	extern int curthreadremapped;
	caddr_t kthrend;

	kthrend = (caddr_t)curthread + sizeof (kthread_t) - 1;
	if (curthreadremapped || (kthrend >= (caddr_t)from &&
	    kthrend < (caddr_t)from + bytes + sizeof (kthread_t) - 1)) {
		caddr_t src = from, dst = to;

		while (bytes-- > 0)
			*dst++ = *src++;
	} else {
		bcopy(from, to, bytes);
	}
}

/*
 * Allocate pages for buffers used in writing out the statefile
 */
static int
cpr_alloc_bufs(void)
{
	char *allocerr = "Unable to allocate memory for cpr buffer";
	size_t size;

	/*
	 * set the cpr write buffer size to at least the historic
	 * size (128k), or large enough to store both the early
	 * set of statefile structures (well under 0x800) plus the
	 * bitmaps, and round up to the next pagesize.
	 */
	size = PAGE_ROUNDUP(dbtob(4) + cpr_bitmap_size);
	cpr_buf_size = MAX(size, CPRBUFSZ);
	cpr_buf_blocks = btodb(cpr_buf_size);
	cpr_buf = kmem_alloc(cpr_buf_size, KM_NOSLEEP);
	if (cpr_buf == NULL) {
		cpr_err(CE_WARN, allocerr);
		return (ENOMEM);
	}
	cpr_buf_end = cpr_buf + cpr_buf_size;

	cpr_pagedata_size = mmu_ptob(CPR_MAXCONTIG + 1);
	cpr_pagedata = kmem_alloc(cpr_pagedata_size, KM_NOSLEEP);
	if (cpr_pagedata == NULL) {
		kmem_free(cpr_buf, cpr_buf_size);
		cpr_buf = NULL;
		cpr_err(CE_WARN, allocerr);
		return (ENOMEM);
	}

	return (0);
}


/*
 * Set bitmap size in bytes based on phys_install.
 */
void
cpr_set_bitmap_size(void)
{
	struct memlist *pmem;
	size_t size = 0;

	memlist_read_lock();
	for (pmem = phys_install; pmem; pmem = pmem->next)
		size += pmem->size;
	memlist_read_unlock();
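	/*
	 * One bitmap bit is needed per installed physical page;
	 * BITMAP_BYTES() is presumed to do that bytes-of-memory to
	 * bytes-of-bitmap scaling.
	 */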
	cpr_bitmap_size = BITMAP_BYTES(size);
}


/*
 * CPR dump header contains the following information:
 *	1. header magic -- unique to cpr state file
 *	2. kernel return pc & ppn for resume
 *	3. current thread info
 *	4. debug level and test mode
 *	5. number of bitmaps allocated
 *	6. number of page records
 */
static int
cpr_write_header(vnode_t *vp)
{
	extern ushort_t cpr_mach_type;
	struct cpr_dump_desc cdump;
	pgcnt_t bitmap_pages;
	pgcnt_t kpages, vpages, upages;
	pgcnt_t cpr_count_kpages(int mapflag, bitfunc_t bitfunc);

	cdump.cdd_magic = (uint_t)CPR_DUMP_MAGIC;
	cdump.cdd_version = CPR_VERSION;
	cdump.cdd_machine = cpr_mach_type;
	cdump.cdd_debug = cpr_debug;
	cdump.cdd_test_mode = cpr_test_mode;
	cdump.cdd_bitmaprec = cpr_nbitmaps;

	cpr_clear_bitmaps();

	/*
	 * Remember how many pages we plan to save to statefile.
	 * This information will be used for sanity checks.
	 * Untag those pages that will not be saved to statefile.
	 */
	kpages = cpr_count_kpages(REGULAR_BITMAP, cpr_setbit);
	vpages = cpr_count_volatile_pages(REGULAR_BITMAP, cpr_clrbit);
	upages = cpr_count_upages(REGULAR_BITMAP, cpr_setbit);
	cdump.cdd_dumppgsize = kpages - vpages + upages;
	cpr_pages_tobe_dumped = cdump.cdd_dumppgsize;
	CPR_DEBUG(CPR_DEBUG7,
	    "\ncpr_write_header: kpages %ld - vpages %ld + upages %ld = %d\n",
	    kpages, vpages, upages, cdump.cdd_dumppgsize);

	/*
	 * Some pages contain volatile data (cpr_buf and storage area for
	 * sensitive kpages), which are no longer needed after the statefile
	 * is dumped to disk.  We have already untagged them from regular
	 * bitmaps.  Now tag them into the volatile bitmaps.  The pages in
	 * volatile bitmaps will be claimed during resume, and the resumed
	 * kernel will free them.
	 */
	(void) cpr_count_volatile_pages(VOLATILE_BITMAP, cpr_setbit);

	bitmap_pages = mmu_btopr(cpr_bitmap_size);

	/*
	 * Export accurate statefile size for statefile allocation retry.
	 * statefile_size = all the headers + total pages +
	 * number of pages used by the bitmaps.
	 * Roundup will be done in the file allocation code.
	 */
	STAT->cs_nocomp_statefsz = sizeof (cdd_t) + sizeof (cmd_t) +
	    (sizeof (cbd_t) * cdump.cdd_bitmaprec) +
	    (sizeof (cpd_t) * cdump.cdd_dumppgsize) +
	    mmu_ptob(cdump.cdd_dumppgsize + bitmap_pages);

	/*
	 * If the estimated statefile is not big enough,
	 * fail now for a retry and avoid unnecessary work.
	 */
	if (!(CPR->c_flags & C_COMPRESSING) &&
	    (STAT->cs_nocomp_statefsz > STAT->cs_est_statefsz)) {
		if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG7))
			prom_printf("cpr_write_header: "
			    "STAT->cs_nocomp_statefsz > "
			    "STAT->cs_est_statefsz\n");
		return (ENOSPC);
	}

	/* now write cpr dump descriptor */
	return (cpr_write(vp, (caddr_t)&cdump, sizeof (cdd_t)));
}


/*
 * CPR dump tail record contains the following information:
 *	1. header magic -- unique to cpr state file
 *	2. all misc info that needs to be passed to cprboot or resumed kernel
 */
static int
cpr_write_terminator(vnode_t *vp)
{
	cpr_term.magic = (uint_t)CPR_TERM_MAGIC;
	cpr_term.va = (cpr_ptr)&cpr_term;
	cpr_term.pfn = (cpr_ext)va_to_pfn(&cpr_term);

	/* count the last one (flush) */
	cpr_term.real_statef_size = STAT->cs_real_statefsz +
	    btod(cpr_wptr - cpr_buf) * DEV_BSIZE;

	CPR_DEBUG(CPR_DEBUG9, "cpr_dump: Real Statefile Size: %ld\n",
	    STAT->cs_real_statefsz);

	cpr_tod_get(&cpr_term.tm_shutdown);

	return (cpr_write(vp, (caddr_t)&cpr_term, sizeof (cpr_term)));
}

/*
 * Write bitmap descriptor array, followed by merged bitmaps.
 */
static int
cpr_write_bitmap(vnode_t *vp)
{
	char *rmap, *vmap, *dst, *tail;
	size_t size, bytes;
	cbd_t *dp;
	int err;

	dp = CPR->c_bmda;
	if (err = cpr_write(vp, (caddr_t)dp, cpr_nbitmaps * sizeof (*dp)))
		return (err);

	/*
	 * merge regular and volatile bitmaps into tmp space
	 * and write to disk
	 */
	for (; dp->cbd_size; dp++) {
		rmap = (char *)dp->cbd_reg_bitmap;
		vmap = (char *)dp->cbd_vlt_bitmap;
		for (size = dp->cbd_size; size; size -= bytes) {
			bytes = min(size, sizeof (cpr_pagecopy));
			tail = &cpr_pagecopy[bytes];
			for (dst = cpr_pagecopy; dst < tail; dst++)
				*dst = *rmap++ | *vmap++;
			if (err = cpr_write(vp, cpr_pagecopy, bytes))
				break;
		}
	}

	return (err);
}


static int
cpr_write_statefile(vnode_t *vp)
{
	uint_t error = 0;
	extern	int	i_cpr_check_pgs_dumped();
	void flush_windows(void);
	pgcnt_t spages;
	char *str;

	flush_windows();

	/*
	 * to get an accurate view of kas, we need to untag sensitive
	 * pages *before* dumping them because the disk driver makes
	 * allocations and changes kas along the way.  The remaining
	 * pages referenced in the bitmaps are dumped out later as
	 * regular kpages.
	 */
	str = "cpr_write_statefile:";
	spages = i_cpr_count_sensitive_kpages(REGULAR_BITMAP, cpr_clrbit);
	CPR_DEBUG(CPR_DEBUG7, "%s untag %ld sens pages\n", str, spages);

	/*
	 * now it's OK to call a driver that makes allocations
	 */
	cpr_disk_writes_ok = 1;

	/*
	 * now write out the clean sensitive kpages
	 * according to the sensitive descriptors
	 */
	error = i_cpr_dump_sensitive_kpages(vp);
	if (error) {
		CPR_DEBUG(CPR_DEBUG7,
		    "%s cpr_dump_sensitive_kpages() failed!\n", str);
		return (error);
	}

	/*
	 * cpr_dump_regular_pages() counts cpr_regular_pgs_dumped
	 */
	error = cpr_dump_regular_pages(vp);
	if (error) {
		CPR_DEBUG(CPR_DEBUG7,
		    "%s cpr_dump_regular_pages() failed!\n", str);
		return (error);
	}

	/*
	 * sanity check to verify the right number of pages were dumped
	 */
	error = i_cpr_check_pgs_dumped(cpr_pages_tobe_dumped,
	    cpr_regular_pgs_dumped);

	if (error) {
		prom_printf("\n%s page count mismatch!\n", str);
#ifdef DEBUG
		if (cpr_test_mode)
			debug_enter(NULL);
#endif
	}

	return (error);
}
#endif


/*
 * Creates the CPR state file; the following sections are
 * written out in sequence:
 *    - writes the cpr dump header
 *    - writes the memory usage bitmaps
 *    - writes the platform dependent info
 *    - writes the remaining user pages
 *    - writes the kernel pages
 */
#if defined(__x86)
	_NOTE(ARGSUSED(0))
#endif
int
cpr_dump(vnode_t *vp)
{
#if defined(__sparc)
	int error;

	if (cpr_buf == NULL) {
		ASSERT(cpr_pagedata == NULL);
		if (error = cpr_alloc_bufs())
			return (error);
	}
	/* point to top of internal buffer */
	cpr_wptr = cpr_buf;

	/* initialize global variables used by the write operation */
	cpr_file_bn = cpr_statefile_offset();
	cpr_dev_space = 0;

	/* allocate bitmaps */
	if (CPR->c_bmda == NULL) {
		if (error = i_cpr_alloc_bitmaps()) {
			cpr_err(CE_WARN, "cannot allocate bitmaps");
			return (error);
		}
	}

	if (error = i_cpr_prom_pages(CPR_PROM_SAVE))
		return (error);

	if (error = i_cpr_dump_setup(vp))
		return (error);

	/*
	 * set internal cross checking; we don't want to call
	 * a disk driver that makes allocations until after
	 * sensitive pages are saved
	 */
	cpr_disk_writes_ok = 0;

	/*
	 * 1253112: heap corruption due to memory allocation when dumping
	 *	    statefile.
	 * Theoretically on Sun4u only the kernel data nucleus, kvalloc and
	 * kvseg segments can be contaminated should memory allocations happen
	 * during sddump, which is not supposed to happen after the system
	 * is quiesced. Let's call the kernel pages that tend to be affected
	 * 'sensitive kpages' here. To avoid saving inconsistent pages, we
	 * will allocate some storage space to save the clean sensitive pages
	 * aside before statefile dumping takes place. Since there may not be
	 * much memory left at this stage, the sensitive pages will be
	 * compressed before they are saved into the storage area.
	 */
	if (error = i_cpr_save_sensitive_kpages()) {
		CPR_DEBUG(CPR_DEBUG7,
		    "cpr_dump: save_sensitive_kpages failed!\n");
		return (error);
	}

	/*
	 * since all cpr allocations are done (space for sensitive kpages,
	 * bitmaps, cpr_buf), kas is stable, and now we can accurately
	 * count regular and sensitive kpages.
	 */
	if (error = cpr_write_header(vp)) {
		CPR_DEBUG(CPR_DEBUG7,
		    "cpr_dump: cpr_write_header() failed!\n");
		return (error);
	}

	if (error = i_cpr_write_machdep(vp))
		return (error);

	if (error = i_cpr_blockzero(cpr_buf, &cpr_wptr, NULL, NULL))
		return (error);

	if (error = cpr_write_bitmap(vp))
		return (error);

	if (error = cpr_write_statefile(vp)) {
		CPR_DEBUG(CPR_DEBUG7,
		    "cpr_dump: cpr_write_statefile() failed!\n");
		return (error);
	}

	if (error = cpr_write_terminator(vp))
		return (error);

	if (error = cpr_flush_write(vp))
		return (error);

	if (error = i_cpr_blockzero(cpr_buf, &cpr_wptr, &cpr_file_bn, vp))
		return (error);
#endif

	return (0);
}


#if defined(__sparc)
/*
 * cpr_xwalk() is called many hundreds of times with a range within
 * kvseg or kvseg_reloc; a page-count from each range is accumulated
 * at arg->pages.
 */
static void
cpr_xwalk(void *arg, void *base, size_t size)
{
	struct cpr_walkinfo *cwip = arg;

	cwip->pages += cpr_count_pages(base, size,
	    cwip->mapflag, cwip->bitfunc, DBG_DONTSHOWRANGE);
	cwip->size += size;
	cwip->ranges++;
}

/*
 * cpr_walk() is called many hundreds of times with a range within
 * kvseg or kvseg_reloc; a page-count from each range is accumulated
 * at arg->pages.
 */
static void
cpr_walk(void *arg, void *base, size_t size)
{
	caddr_t addr = base;
	caddr_t addr_end = addr + size;

	/*
	 * If we are about to walk the range of addresses we carved
	 * out of the kernel heap for the large page heap, walk
	 * heap_lp_arena instead to find which segments are actually
	 * populated.
	 */
	if (SEGKMEM_USE_LARGEPAGES &&
	    addr == heap_lp_base && addr_end == heap_lp_end &&
	    vmem_size(heap_lp_arena, VMEM_ALLOC) < size) {
		vmem_walk(heap_lp_arena, VMEM_ALLOC, cpr_xwalk, arg);
	} else {
		cpr_xwalk(arg, base, size);
	}
}


/*
 * faster scan of kvseg using vmem_walk() to visit
 * allocated ranges.
 */
pgcnt_t
cpr_scan_kvseg(int mapflag, bitfunc_t bitfunc, struct seg *seg)
{
	struct cpr_walkinfo cwinfo;

	bzero(&cwinfo, sizeof (cwinfo));
	cwinfo.mapflag = mapflag;
	cwinfo.bitfunc = bitfunc;

	vmem_walk(heap_arena, VMEM_ALLOC, cpr_walk, &cwinfo);

	if (cpr_debug & CPR_DEBUG7) {
		prom_printf("walked %d sub-ranges, total pages %ld\n",
		    cwinfo.ranges, mmu_btop(cwinfo.size));
		cpr_show_range(seg->s_base, seg->s_size,
		    mapflag, bitfunc, cwinfo.pages);
	}

	return (cwinfo.pages);
}


/*
 * cpr_walk_kpm() is called for every used area within the large
 * segkpm virtual address window. A page-count is accumulated at
 * arg->pages.
 */
static void
cpr_walk_kpm(void *arg, void *base, size_t size)
{
	struct cpr_walkinfo *cwip = arg;

	cwip->pages += cpr_count_pages(base, size,
	    cwip->mapflag, cwip->bitfunc, DBG_DONTSHOWRANGE);
	cwip->size += size;
	cwip->ranges++;
}


/*
 * faster scan of segkpm using hat_kpm_walk() to visit only used ranges.
 */
/*ARGSUSED*/
static pgcnt_t
cpr_scan_segkpm(int mapflag, bitfunc_t bitfunc, struct seg *seg)
{
	struct cpr_walkinfo cwinfo;

	if (kpm_enable == 0)
		return (0);

	bzero(&cwinfo, sizeof (cwinfo));
	cwinfo.mapflag = mapflag;
	cwinfo.bitfunc = bitfunc;
	hat_kpm_walk(cpr_walk_kpm, &cwinfo);

	if (cpr_debug & CPR_DEBUG7) {
		prom_printf("walked %d sub-ranges, total pages %ld\n",
		    cwinfo.ranges, mmu_btop(cwinfo.size));
		cpr_show_range(segkpm->s_base, segkpm->s_size,
		    mapflag, bitfunc, cwinfo.pages);
	}

	return (cwinfo.pages);
}


/*
 * Sparsely filled kernel segments are registered in kseg_table for
 * easier lookup. See also block comment for cpr_count_seg_pages.
 */

#define	KSEG_SEG_ADDR	0	/* address of struct seg */
#define	KSEG_PTR_ADDR	1	/* address of pointer to struct seg */

typedef struct {
	struct seg **st_seg;		/* segment pointer or segment address */
	pgcnt_t	(*st_fcn)(int, bitfunc_t, struct seg *); /* function to call */
	int	st_addrtype;		/* address type in st_seg */
} ksegtbl_entry_t;

ksegtbl_entry_t kseg_table[] = {
	{(struct seg **)&kvseg,		cpr_scan_kvseg,		KSEG_SEG_ADDR},
	{&segkpm,			cpr_scan_segkpm,	KSEG_PTR_ADDR},
	{NULL,				0,			0}
};
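/*
 * Note: kvseg is itself a struct seg while segkpm is a pointer to one;
 * hence the kvseg entry stores the segment's own address
 * (KSEG_SEG_ADDR) and the segkpm entry stores the address of the
 * pointer variable (KSEG_PTR_ADDR).
 */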


/*
 * Compare seg with each entry in kseg_table; when there is a match
 * return the entry pointer, otherwise return NULL.
 */
static ksegtbl_entry_t *
cpr_sparse_seg_check(struct seg *seg)
{
	ksegtbl_entry_t *ste = &kseg_table[0];
	struct seg *tseg;

	for (; ste->st_seg; ste++) {
		tseg = (ste->st_addrtype == KSEG_PTR_ADDR) ?
		    *ste->st_seg : (struct seg *)ste->st_seg;

		if (seg == tseg)
			return (ste);
	}

	return ((ksegtbl_entry_t *)NULL);
}


/*
 * Count pages within each kernel segment; call cpr_sparse_seg_check()
 * to find out whether a sparsely filled segment needs special
 * treatment (e.g. kvseg).
 * Todo: A "SEGOP_CPR" like SEGOP_DUMP should be introduced, the cpr
 *       module shouldn't need to know segment details like if it is
 *       sparsely filled or not (makes kseg_table obsolete).
 */
pgcnt_t
cpr_count_seg_pages(int mapflag, bitfunc_t bitfunc)
{
	struct seg *segp;
	pgcnt_t pages;
	ksegtbl_entry_t *ste;

	pages = 0;
	for (segp = AS_SEGFIRST(&kas); segp; segp = AS_SEGNEXT(&kas, segp)) {
		if (ste = cpr_sparse_seg_check(segp)) {
			pages += (ste->st_fcn)(mapflag, bitfunc, segp);
		} else {
			pages += cpr_count_pages(segp->s_base,
			    segp->s_size, mapflag, bitfunc, DBG_SHOWRANGE);
		}
	}

	return (pages);
}


/*
 * count kernel pages within kas and any special ranges
 */
pgcnt_t
cpr_count_kpages(int mapflag, bitfunc_t bitfunc)
{
	pgcnt_t kas_cnt;

	/*
	 * Some pages need to be taken care of differently.
	 * e.g., panicbuf pages of sun4m are not in kas but they need
	 * to be saved.  On sun4u, the physical pages of panicbuf are
	 * allocated via prom_retain().
	 */
	kas_cnt = i_cpr_count_special_kpages(mapflag, bitfunc);
	kas_cnt += cpr_count_seg_pages(mapflag, bitfunc);

	CPR_DEBUG(CPR_DEBUG9, "cpr_count_kpages: kas_cnt=%ld\n", kas_cnt);
	CPR_DEBUG(CPR_DEBUG7, "\ncpr_count_kpages: %ld pages, 0x%lx bytes\n",
	    kas_cnt, mmu_ptob(kas_cnt));

	return (kas_cnt);
}


/*
 * Set a bit corresponding to the arg phys page number;
 * returns 0 when the ppn is valid and the corresponding
 * map bit was clear, otherwise returns 1.
 */
int
cpr_setbit(pfn_t ppn, int mapflag)
{
	char *bitmap;
	cbd_t *dp;
	pfn_t rel;
	int clr;

	for (dp = CPR->c_bmda; dp->cbd_size; dp++) {
		if (PPN_IN_RANGE(ppn, dp)) {
			bitmap = DESC_TO_MAP(dp, mapflag);
			rel = ppn - dp->cbd_spfn;
			if ((clr = isclr(bitmap, rel)) != 0)
				setbit(bitmap, rel);
			return (clr == 0);
		}
	}

	return (1);
}


/*
 * Clear a bit corresponding to the arg phys page number;
 * returns 0 when the ppn is valid and the corresponding
 * map bit was set, otherwise returns 1.
 */
int
cpr_clrbit(pfn_t ppn, int mapflag)
{
	char *bitmap;
	cbd_t *dp;
	pfn_t rel;
	int set;

	for (dp = CPR->c_bmda; dp->cbd_size; dp++) {
		if (PPN_IN_RANGE(ppn, dp)) {
			bitmap = DESC_TO_MAP(dp, mapflag);
			rel = ppn - dp->cbd_spfn;
			if ((set = isset(bitmap, rel)) != 0)
				clrbit(bitmap, rel);
			return (set == 0);
		}
	}

	return (1);
}


/* ARGSUSED */
int
cpr_nobit(pfn_t ppn, int mapflag)
{
	return (0);
}


/*
 * Lookup a bit corresponding to the arg phys page number.
 */
int
cpr_isset(pfn_t ppn, int mapflag)
{
	char *bitmap;
	cbd_t *dp;
	pfn_t rel;

	for (dp = CPR->c_bmda; dp->cbd_size; dp++) {
		if (PPN_IN_RANGE(ppn, dp)) {
			bitmap = DESC_TO_MAP(dp, mapflag);
			rel = ppn - dp->cbd_spfn;
			return (isset(bitmap, rel));
		}
	}

	return (0);
}


/*
 * Go through all pages and pick up any page not caught during the
 * invalidation stage. This is also used to save pages with cow lock
 * or phys page lock held (nonzero p_lckcnt or p_cowcnt).
 */
static	int
cpr_count_upages(int mapflag, bitfunc_t bitfunc)
{
	page_t *pp, *page0;
	pgcnt_t dcnt = 0, tcnt = 0;
	pfn_t pfn;

	page0 = pp = page_first();

	do {
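		/*
		 * Skip pages that have no vnode identity, kernel pages
		 * (counted separately via kas), pages the prom has
		 * retained, and pages on the free list; everything else
		 * is a user page that survived invalidation.
		 */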
#if defined(__sparc)
		extern struct vnode prom_ppages;
		if (pp->p_vnode == NULL || PP_ISKAS(pp) ||
		    pp->p_vnode == &prom_ppages ||
		    (PP_ISFREE(pp) && PP_ISAGED(pp)))
#else
		if (pp->p_vnode == NULL || PP_ISKAS(pp) ||
		    (PP_ISFREE(pp) && PP_ISAGED(pp)))
#endif /* __sparc */
			continue;

		pfn = page_pptonum(pp);
		if (pf_is_memory(pfn)) {
			tcnt++;
			if ((*bitfunc)(pfn, mapflag) == 0)
				dcnt++; /* dirty count */
		}
	} while ((pp = page_next(pp)) != page0);

	STAT->cs_upage2statef = dcnt;
	CPR_DEBUG(CPR_DEBUG9, "cpr_count_upages: dirty=%ld total=%ld\n",
	    dcnt, tcnt);
	CPR_DEBUG(CPR_DEBUG7, "cpr_count_upages: %ld pages, 0x%lx bytes\n",
	    dcnt, mmu_ptob(dcnt));

	return (dcnt);
}


/*
 * try compressing pages based on cflag,
 * and for DEBUG kernels, verify the uncompressed data checksum;
 *
 * this routine consolidates code common to
 * i_cpr_compress_and_save() and cpr_compress_and_write()
 */
char *
cpr_compress_pages(cpd_t *dp, pgcnt_t pages, int cflag)
{
	size_t nbytes, clen, len;
	uint32_t test_sum;
	char *datap;

	nbytes = mmu_ptob(pages);

	/*
	 * set length to the original uncompressed data size;
	 * always init cpd_flag to zero
	 */
	dp->cpd_length = nbytes;
	dp->cpd_flag = 0;

#ifdef	DEBUG
	/*
	 * Make a copy of the uncompressed data so we can checksum it.
	 * Compress that copy so the checksum works at the other end.
	 */
	cprbcopy(CPR->c_mapping_area, cpr_pagecopy, nbytes);
	dp->cpd_usum = checksum32(cpr_pagecopy, nbytes);
	dp->cpd_flag |= CPD_USUM;
	datap = cpr_pagecopy;
#else
	datap = CPR->c_mapping_area;
	dp->cpd_usum = 0;
#endif

	/*
	 * try compressing the raw data to cpr_pagedata;
	 * if there was a size reduction: record the new length,
	 * flag the compression, and point to the compressed data.
	 */
	dp->cpd_csum = 0;
	if (cflag) {
		clen = compress(datap, cpr_pagedata, nbytes);
		if (clen < nbytes) {
			dp->cpd_flag |= CPD_COMPRESS;
			dp->cpd_length = clen;
			datap = cpr_pagedata;
#ifdef	DEBUG
			dp->cpd_csum = checksum32(datap, clen);
			dp->cpd_flag |= CPD_CSUM;

			/*
			 * decompress the data back to a scratch area
			 * and compare the new checksum with the original
			 * checksum to verify the compression.
			 */
			bzero(cpr_pagecopy, sizeof (cpr_pagecopy));
			len = decompress(datap, cpr_pagecopy,
			    clen, sizeof (cpr_pagecopy));
			test_sum = checksum32(cpr_pagecopy, len);
			ASSERT(test_sum == dp->cpd_usum);
#endif
		}
	}

	return (datap);
}
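/*
 * Note: cpr_compress_pages() returns a pointer into shared scratch
 * space (cpr_pagedata, cpr_pagecopy, or the mapping area), so callers
 * must consume the returned data before those buffers are reused.
 */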


/*
 * 1. Prepare cpr page descriptor and write it to file
 * 2. Compress page data and write it out
 */
static int
cpr_compress_and_write(vnode_t *vp, uint_t va, pfn_t pfn, pgcnt_t npg)
{
	int error = 0;
	char *datap;
	cpd_t cpd;	/* cpr page descriptor */
	extern void i_cpr_mapin(caddr_t, uint_t, pfn_t);
	extern void i_cpr_mapout(caddr_t, uint_t);

	i_cpr_mapin(CPR->c_mapping_area, npg, pfn);

	CPR_DEBUG(CPR_DEBUG3, "mapped-in %ld pages, vaddr 0x%p, pfn 0x%lx\n",
	    npg, (void *)CPR->c_mapping_area, pfn);

	/*
	 * Fill cpr page descriptor.
	 */
	cpd.cpd_magic = (uint_t)CPR_PAGE_MAGIC;
	cpd.cpd_pfn = pfn;
	cpd.cpd_pages = npg;

	STAT->cs_dumped_statefsz += mmu_ptob(npg);

	datap = cpr_compress_pages(&cpd, npg, CPR->c_flags & C_COMPRESSING);

	/* Write cpr page descriptor */
	error = cpr_write(vp, (caddr_t)&cpd, sizeof (cpd_t));

	/* Write compressed page data; don't clobber an earlier error */
	if (error == 0)
		error = cpr_write(vp, (caddr_t)datap, cpd.cpd_length);

	/*
	 * Unmap the pages for tlb and vac flushing
	 */
	i_cpr_mapout(CPR->c_mapping_area, npg);

	if (error) {
		CPR_DEBUG(CPR_DEBUG1,
		    "cpr_compress_and_write: vp 0x%p va 0x%x ", (void *)vp, va);
		CPR_DEBUG(CPR_DEBUG1, "pfn 0x%lx blk %d err %d\n",
		    pfn, cpr_file_bn, error);
	} else {
		cpr_regular_pgs_dumped += npg;
	}

	return (error);
}


int
cpr_write(vnode_t *vp, caddr_t buffer, size_t size)
{
	caddr_t	fromp = buffer;
	size_t bytes, wbytes;
	int error;

	if (cpr_dev_space == 0) {
		if (vp->v_type == VBLK) {
			cpr_dev_space = cpr_get_devsize(vp->v_rdev);
			ASSERT(cpr_dev_space);
		} else
			cpr_dev_space = 1;	/* not used in this case */
	}

	/*
	 * break the write into multiple parts if the request is large;
	 * calculate the count up to the buf page boundary, then write
	 * it out. repeat until done.
	 */
	while (size) {
		bytes = MIN(size, cpr_buf_end - cpr_wptr);
		cprbcopy(fromp, cpr_wptr, bytes);
		cpr_wptr += bytes;
		fromp += bytes;
		size -= bytes;
		if (cpr_wptr < cpr_buf_end)
			return (0);	/* buffer not full yet */
		ASSERT(cpr_wptr == cpr_buf_end);

		wbytes = dbtob(cpr_file_bn + cpr_buf_blocks);
		if (vp->v_type == VBLK) {
			if (wbytes > cpr_dev_space)
				return (ENOSPC);
		} else {
			if (wbytes > VTOI(vp)->i_size)
				return (ENOSPC);
		}

		CPR_DEBUG(CPR_DEBUG3,
		    "cpr_write: frmp=%p wptr=%p cnt=%lx...",
		    (void *)fromp, (void *)cpr_wptr, bytes);
		/*
		 * cross check, this should not happen!
		 */
		if (cpr_disk_writes_ok == 0) {
			prom_printf("cpr_write: disk write too early!\n");
			return (EINVAL);
		}
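		/*
		 * The system is quiesced, so force the driver down its
		 * polled (non-interrupt) dump path for this write.
		 */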
		do_polled_io = 1;
		error = VOP_DUMP(vp, cpr_buf, cpr_file_bn, cpr_buf_blocks,
		    NULL);
		do_polled_io = 0;
		CPR_DEBUG(CPR_DEBUG3, "done\n");

		STAT->cs_real_statefsz += cpr_buf_size;

		if (error) {
			cpr_err(CE_WARN, "cpr_write error %d", error);
			return (error);
		}
		cpr_file_bn += cpr_buf_blocks;	/* Increment block count */
		cpr_wptr = cpr_buf;		/* back to top of buffer */
	}
	return (0);
}
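/*
 * Note: a zero return from cpr_write() only means the data has been
 * buffered; any residue in cpr_buf reaches the device when
 * cpr_flush_write() pushes it out.
 */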


int
cpr_flush_write(vnode_t *vp)
{
	int	nblk;
	int	error;

	/*
	 * Calculate remaining blocks in buffer, rounded up to nearest
	 * disk block
	 */
	nblk = btod(cpr_wptr - cpr_buf);

	do_polled_io = 1;
	error = VOP_DUMP(vp, (caddr_t)cpr_buf, cpr_file_bn, nblk, NULL);
	do_polled_io = 0;

	cpr_file_bn += nblk;
	if (error)
		CPR_DEBUG(CPR_DEBUG2, "cpr_flush_write: error (%d)\n",
		    error);
	return (error);
}

void
cpr_clear_bitmaps(void)
{
	cbd_t *dp;

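	/*
	 * This relies on the volatile bitmap being laid out right
	 * after the regular bitmap for each descriptor: one bzero of
	 * twice cbd_size starting at cbd_reg_bitmap clears both maps.
	 */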
	for (dp = CPR->c_bmda; dp->cbd_size; dp++) {
		bzero((void *)dp->cbd_reg_bitmap,
		    (size_t)dp->cbd_size * 2);
	}
	CPR_DEBUG(CPR_DEBUG7, "\ncleared reg and vlt bitmaps\n");
}

int
cpr_contig_pages(vnode_t *vp, int flag)
{
	int chunks = 0, error = 0;
	pgcnt_t i, j, totbit;
	pfn_t spfn;
	cbd_t *dp;
	uint_t	spin_cnt = 0;
	extern	int i_cpr_compress_and_save();

	for (dp = CPR->c_bmda; dp->cbd_size; dp++) {
		spfn = dp->cbd_spfn;
		totbit = BTOb(dp->cbd_size);
		i = 0; /* Beginning of bitmap */
		j = 0;
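		/*
		 * Scan the regular bitmap for runs of consecutively
		 * tagged pages; each run, capped at CPR_MAXCONTIG
		 * pages, becomes one chunk that is counted, saved to
		 * storage, or written to the statefile per flag.
		 */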
		while (i < totbit) {
			while ((j < CPR_MAXCONTIG) && ((j + i) < totbit)) {
				if (isset((char *)dp->cbd_reg_bitmap, j+i))
					j++;
				else /* not contiguous anymore */
					break;
			}

			if (j) {
				chunks++;
				if (flag == SAVE_TO_STORAGE) {
					error = i_cpr_compress_and_save(
					    chunks, spfn + i, j);
					if (error)
						return (error);
				} else if (flag == WRITE_TO_STATEFILE) {
					error = cpr_compress_and_write(vp, 0,
					    spfn + i, j);
					if (error)
						return (error);
					else {
						spin_cnt++;
						if ((spin_cnt & 0x5F) == 1)
							cpr_spinning_bar();
					}
				}
			}

			i += j;
			if (j != CPR_MAXCONTIG) {
				/* Stopped on a non-tagged page */
				i++;
			}

			j = 0;
		}
	}

	if (flag == STORAGE_DESC_ALLOC)
		return (chunks);
	else
		return (0);
}


void
cpr_show_range(caddr_t vaddr, size_t size,
    int mapflag, bitfunc_t bitfunc, pgcnt_t count)
{
	char *action, *bname;

	bname = (mapflag == REGULAR_BITMAP) ? "regular" : "volatile";
	if (bitfunc == cpr_setbit)
		action = "tag";
	else if (bitfunc == cpr_clrbit)
		action = "untag";
	else
		action = "none";
	prom_printf("range (0x%p, 0x%p), %s bitmap, %s %ld\n",
	    (void *)vaddr, (void *)(vaddr + size), bname, action, count);
}


pgcnt_t
cpr_count_pages(caddr_t sva, size_t size,
    int mapflag, bitfunc_t bitfunc, int showrange)
{
	caddr_t	va, eva;
	pfn_t pfn;
	pgcnt_t count = 0;

	eva = sva + PAGE_ROUNDUP(size);
	for (va = sva; va < eva; va += MMU_PAGESIZE) {
		pfn = va_to_pfn(va);
		if (pfn != PFN_INVALID && pf_is_memory(pfn)) {
			if ((*bitfunc)(pfn, mapflag) == 0)
				count++;
		}
	}

	if ((cpr_debug & CPR_DEBUG7) && showrange == DBG_SHOWRANGE)
		cpr_show_range(sva, size, mapflag, bitfunc, count);

	return (count);
}


pgcnt_t
cpr_count_volatile_pages(int mapflag, bitfunc_t bitfunc)
{
	pgcnt_t count = 0;

	if (cpr_buf) {
		count += cpr_count_pages(cpr_buf, cpr_buf_size,
		    mapflag, bitfunc, DBG_SHOWRANGE);
	}
	if (cpr_pagedata) {
		count += cpr_count_pages(cpr_pagedata, cpr_pagedata_size,
		    mapflag, bitfunc, DBG_SHOWRANGE);
	}
	count += i_cpr_count_storage_pages(mapflag, bitfunc);

	CPR_DEBUG(CPR_DEBUG7, "cpr_count_vpages: %ld pages, 0x%lx bytes\n",
	    count, mmu_ptob(count));
	return (count);
}


static int
cpr_dump_regular_pages(vnode_t *vp)
{
	int error;

	cpr_regular_pgs_dumped = 0;
	error = cpr_contig_pages(vp, WRITE_TO_STATEFILE);
	if (!error)
		CPR_DEBUG(CPR_DEBUG7, "cpr_dump_regular_pages() done.\n");
	return (error);
}
#endif
