activemap.c revision 219864
1/*-
2 * Copyright (c) 2009-2010 The FreeBSD Foundation
3 * All rights reserved.
4 *
5 * This software was developed by Pawel Jakub Dawidek under sponsorship from
6 * the FreeBSD Foundation.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30#include <sys/cdefs.h>
31__FBSDID("$FreeBSD: head/sbin/hastd/activemap.c 219864 2011-03-22 10:39:34Z pjd $");
32
33#include <sys/param.h>	/* powerof2() */
34#include <sys/queue.h>
35
36#include <assert.h>
37#include <bitstring.h>
38#include <errno.h>
39#include <stdint.h>
40#include <stdio.h>
41#include <stdlib.h>
42#include <string.h>
43
44#include <activemap.h>
45
46#define	ACTIVEMAP_MAGIC	0xac71e4
47struct activemap {
48	int		 am_magic;	/* Magic value. */
49	off_t		 am_mediasize;	/* Media size in bytes. */
50	uint32_t	 am_extentsize;	/* Extent size in bytes,
51					   must be power of 2. */
52	uint8_t		 am_extentshift;/* 2 ^ extentbits == extentsize */
53	int		 am_nextents;	/* Number of extents. */
54	size_t		 am_mapsize;	/* Bitmap size in bytes. */
55	uint16_t	*am_memtab;	/* An array that holds number of pending
56					   writes per extent. */
57	bitstr_t	*am_diskmap;	/* On-disk bitmap of dirty extents. */
58	bitstr_t	*am_memmap;	/* In-memory bitmap of dirty extents. */
59	size_t		 am_diskmapsize; /* Map size rounded up to sector size. */
60	uint64_t	 am_ndirty;	/* Number of dirty regions. */
61	bitstr_t	*am_syncmap;	/* Bitmap of extents to sync. */
62	off_t		 am_syncoff;	/* Next synchronization offset. */
63	TAILQ_HEAD(skeepdirty, keepdirty) am_keepdirty; /* List of extents that
64					   we keep dirty to reduce bitmap
65					   updates. */
66	int		 am_nkeepdirty;	/* Number of am_keepdirty elements. */
67	int		 am_nkeepdirty_limit; /* Maximum number of am_keepdirty
68					         elements. */
69};
70
71struct keepdirty {
72	int	kd_extent;
73	TAILQ_ENTRY(keepdirty) kd_next;
74};
75
76/*
77 * Helper function taken from sys/systm.h to calculate extentshift.
78 */
79static uint32_t
80bitcount32(uint32_t x)
81{
82
83	x = (x & 0x55555555) + ((x & 0xaaaaaaaa) >> 1);
84	x = (x & 0x33333333) + ((x & 0xcccccccc) >> 2);
85	x = (x + (x >> 4)) & 0x0f0f0f0f;
86	x = (x + (x >> 8));
87	x = (x + (x >> 16)) & 0x000000ff;
88	return (x);
89}
90
91static __inline int
92off2ext(const struct activemap *amp, off_t offset)
93{
94	int extent;
95
96	assert(offset >= 0 && offset < amp->am_mediasize);
97	extent = (offset >> amp->am_extentshift);
98	assert(extent >= 0 && extent < amp->am_nextents);
99	return (extent);
100}
101
102static __inline off_t
103ext2off(const struct activemap *amp, int extent)
104{
105	off_t offset;
106
107	assert(extent >= 0 && extent < amp->am_nextents);
108	offset = ((off_t)extent << amp->am_extentshift);
109	assert(offset >= 0 && offset < amp->am_mediasize);
110	return (offset);
111}
112
113/*
114 * Function calculates number of requests needed to synchronize the given
115 * extent.
116 */
117static __inline int
118ext2reqs(const struct activemap *amp, int ext)
119{
120	off_t left;
121
122	if (ext < amp->am_nextents - 1)
123		return (((amp->am_extentsize - 1) / MAXPHYS) + 1);
124
125	assert(ext == amp->am_nextents - 1);
126	left = amp->am_mediasize % amp->am_extentsize;
127	if (left == 0)
128		left = amp->am_extentsize;
129	return (((left - 1) / MAXPHYS) + 1);
130}
131
132/*
133 * Initialize activemap structure and allocate memory for internal needs.
134 * Function returns 0 on success and -1 if any of the allocations failed.
135 */
136int
137activemap_init(struct activemap **ampp, uint64_t mediasize, uint32_t extentsize,
138    uint32_t sectorsize, uint32_t keepdirty)
139{
140	struct activemap *amp;
141
142	assert(ampp != NULL);
143	assert(mediasize > 0);
144	assert(extentsize > 0);
145	assert(powerof2(extentsize));
146	assert(sectorsize > 0);
147	assert(powerof2(sectorsize));
148	assert(keepdirty > 0);
149
150	amp = malloc(sizeof(*amp));
151	if (amp == NULL)
152		return (-1);
153
154	amp->am_mediasize = mediasize;
155	amp->am_nkeepdirty_limit = keepdirty;
156	amp->am_extentsize = extentsize;
157	amp->am_extentshift = bitcount32(extentsize - 1);
158	amp->am_nextents = ((mediasize - 1) / extentsize) + 1;
159	amp->am_mapsize = sizeof(bitstr_t) * bitstr_size(amp->am_nextents);
160	amp->am_diskmapsize = roundup2(amp->am_mapsize, sectorsize);
161	amp->am_ndirty = 0;
162	amp->am_syncoff = -2;
163	TAILQ_INIT(&amp->am_keepdirty);
164	amp->am_nkeepdirty = 0;
165
166	amp->am_memtab = calloc(amp->am_nextents, sizeof(amp->am_memtab[0]));
167	amp->am_diskmap = calloc(1, amp->am_diskmapsize);
168	amp->am_memmap = bit_alloc(amp->am_nextents);
169	amp->am_syncmap = bit_alloc(amp->am_nextents);
170
171	/*
172	 * Check to see if any of the allocations above failed.
173	 */
174	if (amp->am_memtab == NULL || amp->am_diskmap == NULL ||
175	    amp->am_memmap == NULL || amp->am_syncmap == NULL) {
176		if (amp->am_memtab != NULL)
177			free(amp->am_memtab);
178		if (amp->am_diskmap != NULL)
179			free(amp->am_diskmap);
180		if (amp->am_memmap != NULL)
181			free(amp->am_memmap);
182		if (amp->am_syncmap != NULL)
183			free(amp->am_syncmap);
184		amp->am_magic = 0;
185		free(amp);
186		errno = ENOMEM;
187		return (-1);
188	}
189
190	amp->am_magic = ACTIVEMAP_MAGIC;
191	*ampp = amp;
192
193	return (0);
194}
195
196static struct keepdirty *
197keepdirty_find(struct activemap *amp, int extent)
198{
199	struct keepdirty *kd;
200
201	TAILQ_FOREACH(kd, &amp->am_keepdirty, kd_next) {
202		if (kd->kd_extent == extent)
203			break;
204	}
205	return (kd);
206}
207
208static void
209keepdirty_add(struct activemap *amp, int extent)
210{
211	struct keepdirty *kd;
212
213	kd = keepdirty_find(amp, extent);
214	if (kd != NULL) {
215		/*
216		 * Only move element at the begining.
217		 */
218		TAILQ_REMOVE(&amp->am_keepdirty, kd, kd_next);
219		TAILQ_INSERT_HEAD(&amp->am_keepdirty, kd, kd_next);
220		return;
221	}
222	/*
223	 * Add new element, but first remove the most unused one if
224	 * we have too many.
225	 */
226	if (amp->am_nkeepdirty >= amp->am_nkeepdirty_limit) {
227		kd = TAILQ_LAST(&amp->am_keepdirty, skeepdirty);
228		assert(kd != NULL);
229		TAILQ_REMOVE(&amp->am_keepdirty, kd, kd_next);
230		amp->am_nkeepdirty--;
231		assert(amp->am_nkeepdirty > 0);
232	}
233	if (kd == NULL)
234		kd = malloc(sizeof(*kd));
235	/* We can ignore allocation failure. */
236	if (kd != NULL) {
237		kd->kd_extent = extent;
238		amp->am_nkeepdirty++;
239		TAILQ_INSERT_HEAD(&amp->am_keepdirty, kd, kd_next);
240	}
241}
242
243static void
244keepdirty_fill(struct activemap *amp)
245{
246	struct keepdirty *kd;
247
248	TAILQ_FOREACH(kd, &amp->am_keepdirty, kd_next)
249		bit_set(amp->am_diskmap, kd->kd_extent);
250}
251
252static void
253keepdirty_free(struct activemap *amp)
254{
255	struct keepdirty *kd;
256
257	while ((kd = TAILQ_FIRST(&amp->am_keepdirty)) != NULL) {
258		TAILQ_REMOVE(&amp->am_keepdirty, kd, kd_next);
259		amp->am_nkeepdirty--;
260		free(kd);
261	}
262	assert(amp->am_nkeepdirty == 0);
263}
264
265/*
266 * Function frees resources allocated by activemap_init() function.
267 */
268void
269activemap_free(struct activemap *amp)
270{
271
272	assert(amp->am_magic == ACTIVEMAP_MAGIC);
273
274	amp->am_magic = 0;
275
276	keepdirty_free(amp);
277	free(amp->am_memtab);
278	free(amp->am_diskmap);
279	free(amp->am_memmap);
280	free(amp->am_syncmap);
281}
282
283/*
284 * Function should be called before we handle write requests. It updates
285 * internal structures and returns true if on-disk metadata should be updated.
286 */
287bool
288activemap_write_start(struct activemap *amp, off_t offset, off_t length)
289{
290	bool modified;
291	off_t end;
292	int ext;
293
294	assert(amp->am_magic == ACTIVEMAP_MAGIC);
295	assert(length > 0);
296
297	modified = false;
298	end = offset + length - 1;
299
300	for (ext = off2ext(amp, offset); ext <= off2ext(amp, end); ext++) {
301		/*
302		 * If the number of pending writes is increased from 0,
303		 * we have to mark the extent as dirty also in on-disk bitmap.
304		 * By returning true we inform the caller that on-disk bitmap
305		 * was modified and has to be flushed to disk.
306		 */
307		if (amp->am_memtab[ext]++ == 0) {
308			assert(!bit_test(amp->am_memmap, ext));
309			bit_set(amp->am_memmap, ext);
310			amp->am_ndirty++;
311			modified = true;
312		}
313		keepdirty_add(amp, ext);
314	}
315
316	return (modified);
317}
318
319/*
320 * Function should be called after receiving write confirmation. It updates
321 * internal structures and returns true if on-disk metadata should be updated.
322 */
323bool
324activemap_write_complete(struct activemap *amp, off_t offset, off_t length)
325{
326	bool modified;
327	off_t end;
328	int ext;
329
330	assert(amp->am_magic == ACTIVEMAP_MAGIC);
331	assert(length > 0);
332
333	modified = false;
334	end = offset + length - 1;
335
336	for (ext = off2ext(amp, offset); ext <= off2ext(amp, end); ext++) {
337		/*
338		 * If the number of pending writes goes down to 0, we have to
339		 * mark the extent as clean also in on-disk bitmap.
340		 * By returning true we inform the caller that on-disk bitmap
341		 * was modified and has to be flushed to disk.
342		 */
343		assert(amp->am_memtab[ext] > 0);
344		assert(bit_test(amp->am_memmap, ext));
345		if (--amp->am_memtab[ext] == 0) {
346			bit_clear(amp->am_memmap, ext);
347			amp->am_ndirty--;
348			modified = true;
349		}
350	}
351
352	return (modified);
353}
354
355/*
356 * Function should be called after finishing synchronization of one extent.
357 * It returns true if on-disk metadata should be updated.
358 */
359bool
360activemap_extent_complete(struct activemap *amp, int extent)
361{
362	bool modified;
363	int reqs;
364
365	assert(amp->am_magic == ACTIVEMAP_MAGIC);
366	assert(extent >= 0 && extent < amp->am_nextents);
367
368	modified = false;
369
370	reqs = ext2reqs(amp, extent);
371	assert(amp->am_memtab[extent] >= reqs);
372	amp->am_memtab[extent] -= reqs;
373	assert(bit_test(amp->am_memmap, extent));
374	if (amp->am_memtab[extent] == 0) {
375		bit_clear(amp->am_memmap, extent);
376		amp->am_ndirty--;
377		modified = true;
378	}
379
380	return (modified);
381}
382
383/*
384 * Function returns number of dirty regions.
385 */
386uint64_t
387activemap_ndirty(const struct activemap *amp)
388{
389
390	assert(amp->am_magic == ACTIVEMAP_MAGIC);
391
392	return (amp->am_ndirty);
393}
394
395/*
396 * Function compare on-disk bitmap and in-memory bitmap and returns true if
397 * they differ and should be flushed to the disk.
398 */
399bool
400activemap_differ(const struct activemap *amp)
401{
402
403	assert(amp->am_magic == ACTIVEMAP_MAGIC);
404
405	return (memcmp(amp->am_diskmap, amp->am_memmap,
406	    amp->am_mapsize) != 0);
407}
408
409/*
410 * Function returns number of bytes used by bitmap.
411 */
412size_t
413activemap_size(const struct activemap *amp)
414{
415
416	assert(amp->am_magic == ACTIVEMAP_MAGIC);
417
418	return (amp->am_mapsize);
419}
420
421/*
422 * Function returns number of bytes needed for storing on-disk bitmap.
423 * This is the same as activemap_size(), but rounded up to sector size.
424 */
425size_t
426activemap_ondisk_size(const struct activemap *amp)
427{
428
429	assert(amp->am_magic == ACTIVEMAP_MAGIC);
430
431	return (amp->am_diskmapsize);
432}
433
434/*
435 * Function copies the given buffer read from disk to the internal bitmap.
436 */
437void
438activemap_copyin(struct activemap *amp, const unsigned char *buf, size_t size)
439{
440	int ext;
441
442	assert(amp->am_magic == ACTIVEMAP_MAGIC);
443	assert(size >= amp->am_mapsize);
444
445	memcpy(amp->am_diskmap, buf, amp->am_mapsize);
446	memcpy(amp->am_memmap, buf, amp->am_mapsize);
447	memcpy(amp->am_syncmap, buf, amp->am_mapsize);
448
449	bit_ffs(amp->am_memmap, amp->am_nextents, &ext);
450	if (ext == -1) {
451		/* There are no dirty extents, so we can leave now. */
452		return;
453	}
454	/*
455	 * Set synchronization offset to the first dirty extent.
456	 */
457	activemap_sync_rewind(amp);
458	/*
459	 * We have dirty extents and we want them to stay that way until
460	 * we synchronize, so we set number of pending writes to number
461	 * of requests needed to synchronize one extent.
462	 */
463	amp->am_ndirty = 0;
464	for (; ext < amp->am_nextents; ext++) {
465		if (bit_test(amp->am_memmap, ext)) {
466			amp->am_memtab[ext] = ext2reqs(amp, ext);
467			amp->am_ndirty++;
468		}
469	}
470}
471
472/*
473 * Function merges the given bitmap with existng one.
474 */
475void
476activemap_merge(struct activemap *amp, const unsigned char *buf, size_t size)
477{
478	bitstr_t *remmap = __DECONST(bitstr_t *, buf);
479	int ext;
480
481	assert(amp->am_magic == ACTIVEMAP_MAGIC);
482	assert(size >= amp->am_mapsize);
483
484	bit_ffs(remmap, amp->am_nextents, &ext);
485	if (ext == -1) {
486		/* There are no dirty extents, so we can leave now. */
487		return;
488	}
489	/*
490	 * We have dirty extents and we want them to stay that way until
491	 * we synchronize, so we set number of pending writes to number
492	 * of requests needed to synchronize one extent.
493	 */
494	for (; ext < amp->am_nextents; ext++) {
495		/* Local extent already dirty. */
496		if (bit_test(amp->am_syncmap, ext))
497			continue;
498		/* Remote extent isn't dirty. */
499		if (!bit_test(remmap, ext))
500			continue;
501		bit_set(amp->am_syncmap, ext);
502		bit_set(amp->am_memmap, ext);
503		bit_set(amp->am_diskmap, ext);
504		if (amp->am_memtab[ext] == 0)
505			amp->am_ndirty++;
506		amp->am_memtab[ext] = ext2reqs(amp, ext);
507	}
508	/*
509	 * Set synchronization offset to the first dirty extent.
510	 */
511	activemap_sync_rewind(amp);
512}
513
514/*
515 * Function returns pointer to internal bitmap that should be written to disk.
516 */
517const unsigned char *
518activemap_bitmap(struct activemap *amp, size_t *sizep)
519{
520
521	assert(amp->am_magic == ACTIVEMAP_MAGIC);
522
523	if (sizep != NULL)
524		*sizep = amp->am_diskmapsize;
525	memcpy(amp->am_diskmap, amp->am_memmap, amp->am_mapsize);
526	keepdirty_fill(amp);
527	return ((const unsigned char *)amp->am_diskmap);
528}
529
530/*
531 * Function calculates size needed to store bitmap on disk.
532 */
533size_t
534activemap_calc_ondisk_size(uint64_t mediasize, uint32_t extentsize,
535    uint32_t sectorsize)
536{
537	uint64_t nextents, mapsize;
538
539	assert(mediasize > 0);
540	assert(extentsize > 0);
541	assert(powerof2(extentsize));
542	assert(sectorsize > 0);
543	assert(powerof2(sectorsize));
544
545	nextents = ((mediasize - 1) / extentsize) + 1;
546	mapsize = sizeof(bitstr_t) * bitstr_size(nextents);
547	return (roundup2(mapsize, sectorsize));
548}
549
550/*
551 * Set synchronization offset to the first dirty extent.
552 */
553void
554activemap_sync_rewind(struct activemap *amp)
555{
556	int ext;
557
558	assert(amp->am_magic == ACTIVEMAP_MAGIC);
559
560	bit_ffs(amp->am_syncmap, amp->am_nextents, &ext);
561	if (ext == -1) {
562		/* There are no extents to synchronize. */
563		amp->am_syncoff = -2;
564		return;
565	}
566	/*
567	 * Mark that we want to start synchronization from the begining.
568	 */
569	amp->am_syncoff = -1;
570}
571
572/*
573 * Return next offset of where we should synchronize.
574 */
575off_t
576activemap_sync_offset(struct activemap *amp, off_t *lengthp, int *syncextp)
577{
578	off_t syncoff, left;
579	int ext;
580
581	assert(amp->am_magic == ACTIVEMAP_MAGIC);
582	assert(lengthp != NULL);
583	assert(syncextp != NULL);
584
585	*syncextp = -1;
586
587	if (amp->am_syncoff == -2)
588		return (-1);
589
590	if (amp->am_syncoff >= 0 &&
591	    (amp->am_syncoff + MAXPHYS >= amp->am_mediasize ||
592	     off2ext(amp, amp->am_syncoff) !=
593	     off2ext(amp, amp->am_syncoff + MAXPHYS))) {
594		/*
595		 * We are about to change extent, so mark previous one as clean.
596		 */
597		ext = off2ext(amp, amp->am_syncoff);
598		bit_clear(amp->am_syncmap, ext);
599		*syncextp = ext;
600		amp->am_syncoff = -1;
601	}
602
603	if (amp->am_syncoff == -1) {
604		/*
605		 * Let's find first extent to synchronize.
606		 */
607		bit_ffs(amp->am_syncmap, amp->am_nextents, &ext);
608		if (ext == -1) {
609			amp->am_syncoff = -2;
610			return (-1);
611		}
612		amp->am_syncoff = ext2off(amp, ext);
613	} else {
614		/*
615		 * We don't change extent, so just increase offset.
616		 */
617		amp->am_syncoff += MAXPHYS;
618		if (amp->am_syncoff >= amp->am_mediasize) {
619			amp->am_syncoff = -2;
620			return (-1);
621		}
622	}
623
624	syncoff = amp->am_syncoff;
625	left = ext2off(amp, off2ext(amp, syncoff)) +
626	    amp->am_extentsize - syncoff;
627	if (syncoff + left > amp->am_mediasize)
628		left = amp->am_mediasize - syncoff;
629	if (left > MAXPHYS)
630		left = MAXPHYS;
631
632	assert(left >= 0 && left <= MAXPHYS);
633	assert(syncoff >= 0 && syncoff < amp->am_mediasize);
634	assert(syncoff + left >= 0 && syncoff + left <= amp->am_mediasize);
635
636	*lengthp = left;
637	return (syncoff);
638}
639
640/*
641 * Mark extent(s) containing the given region for synchronization.
642 * Most likely one of the components is unavailable.
643 */
644bool
645activemap_need_sync(struct activemap *amp, off_t offset, off_t length)
646{
647	bool modified;
648	off_t end;
649	int ext;
650
651	assert(amp->am_magic == ACTIVEMAP_MAGIC);
652
653	modified = false;
654	end = offset + length - 1;
655
656	for (ext = off2ext(amp, offset); ext <= off2ext(amp, end); ext++) {
657		if (bit_test(amp->am_syncmap, ext)) {
658			/* Already marked for synchronization. */
659			assert(bit_test(amp->am_memmap, ext));
660			continue;
661		}
662		bit_set(amp->am_syncmap, ext);
663		if (!bit_test(amp->am_memmap, ext)) {
664			bit_set(amp->am_memmap, ext);
665			amp->am_ndirty++;
666		}
667		amp->am_memtab[ext] += ext2reqs(amp, ext);
668		modified = true;
669	}
670
671	return (modified);
672}
673
674void
675activemap_dump(const struct activemap *amp)
676{
677	int bit;
678
679	printf("M: ");
680	for (bit = 0; bit < amp->am_nextents; bit++)
681		printf("%d", bit_test(amp->am_memmap, bit) ? 1 : 0);
682	printf("\n");
683	printf("D: ");
684	for (bit = 0; bit < amp->am_nextents; bit++)
685		printf("%d", bit_test(amp->am_diskmap, bit) ? 1 : 0);
686	printf("\n");
687	printf("S: ");
688	for (bit = 0; bit < amp->am_nextents; bit++)
689		printf("%d", bit_test(amp->am_syncmap, bit) ? 1 : 0);
690	printf("\n");
691}
692