activemap.c revision 223654
1/*-
2 * Copyright (c) 2009-2010 The FreeBSD Foundation
3 * All rights reserved.
4 *
5 * This software was developed by Pawel Jakub Dawidek under sponsorship from
6 * the FreeBSD Foundation.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30#include <sys/cdefs.h>
31__FBSDID("$FreeBSD: head/sbin/hastd/activemap.c 223654 2011-06-28 20:57:54Z trociny $");
32
33#include <sys/param.h>	/* powerof2() */
34#include <sys/queue.h>
35
36#include <assert.h>
37#include <bitstring.h>
38#include <errno.h>
39#include <stdint.h>
40#include <stdio.h>
41#include <stdlib.h>
42#include <string.h>
43
44#include <activemap.h>
45
46#define	ACTIVEMAP_MAGIC	0xac71e4
47struct activemap {
48	int		 am_magic;	/* Magic value. */
49	off_t		 am_mediasize;	/* Media size in bytes. */
50	uint32_t	 am_extentsize;	/* Extent size in bytes,
51					   must be power of 2. */
52	uint8_t		 am_extentshift;/* 2 ^ extentbits == extentsize */
53	int		 am_nextents;	/* Number of extents. */
54	size_t		 am_mapsize;	/* Bitmap size in bytes. */
55	uint16_t	*am_memtab;	/* An array that holds number of pending
56					   writes per extent. */
57	bitstr_t	*am_diskmap;	/* On-disk bitmap of dirty extents. */
58	bitstr_t	*am_memmap;	/* In-memory bitmap of dirty extents. */
59	size_t		 am_diskmapsize; /* Map size rounded up to sector size. */
60	uint64_t	 am_ndirty;	/* Number of dirty regions. */
61	bitstr_t	*am_syncmap;	/* Bitmap of extents to sync. */
62	off_t		 am_syncoff;	/* Next synchronization offset. */
63	TAILQ_HEAD(skeepdirty, keepdirty) am_keepdirty; /* List of extents that
64					   we keep dirty to reduce bitmap
65					   updates. */
66	int		 am_nkeepdirty;	/* Number of am_keepdirty elements. */
67	int		 am_nkeepdirty_limit; /* Maximum number of am_keepdirty
68					         elements. */
69};
70
71struct keepdirty {
72	int	kd_extent;
73	TAILQ_ENTRY(keepdirty) kd_next;
74};
75
76/*
77 * Helper function taken from sys/systm.h to calculate extentshift.
78 */
79static uint32_t
80bitcount32(uint32_t x)
81{
82
83	x = (x & 0x55555555) + ((x & 0xaaaaaaaa) >> 1);
84	x = (x & 0x33333333) + ((x & 0xcccccccc) >> 2);
85	x = (x + (x >> 4)) & 0x0f0f0f0f;
86	x = (x + (x >> 8));
87	x = (x + (x >> 16)) & 0x000000ff;
88	return (x);
89}
90
91static __inline int
92off2ext(const struct activemap *amp, off_t offset)
93{
94	int extent;
95
96	assert(offset >= 0 && offset < amp->am_mediasize);
97	extent = (offset >> amp->am_extentshift);
98	assert(extent >= 0 && extent < amp->am_nextents);
99	return (extent);
100}
101
102static __inline off_t
103ext2off(const struct activemap *amp, int extent)
104{
105	off_t offset;
106
107	assert(extent >= 0 && extent < amp->am_nextents);
108	offset = ((off_t)extent << amp->am_extentshift);
109	assert(offset >= 0 && offset < amp->am_mediasize);
110	return (offset);
111}
112
113/*
114 * Function calculates number of requests needed to synchronize the given
115 * extent.
116 */
117static __inline int
118ext2reqs(const struct activemap *amp, int ext)
119{
120	off_t left;
121
122	if (ext < amp->am_nextents - 1)
123		return (((amp->am_extentsize - 1) / MAXPHYS) + 1);
124
125	assert(ext == amp->am_nextents - 1);
126	left = amp->am_mediasize % amp->am_extentsize;
127	if (left == 0)
128		left = amp->am_extentsize;
129	return (((left - 1) / MAXPHYS) + 1);
130}
131
132/*
133 * Initialize activemap structure and allocate memory for internal needs.
134 * Function returns 0 on success and -1 if any of the allocations failed.
135 */
136int
137activemap_init(struct activemap **ampp, uint64_t mediasize, uint32_t extentsize,
138    uint32_t sectorsize, uint32_t keepdirty)
139{
140	struct activemap *amp;
141
142	assert(ampp != NULL);
143	assert(mediasize > 0);
144	assert(extentsize > 0);
145	assert(powerof2(extentsize));
146	assert(sectorsize > 0);
147	assert(powerof2(sectorsize));
148	assert(keepdirty > 0);
149
150	amp = malloc(sizeof(*amp));
151	if (amp == NULL)
152		return (-1);
153
154	amp->am_mediasize = mediasize;
155	amp->am_nkeepdirty_limit = keepdirty;
156	amp->am_extentsize = extentsize;
157	amp->am_extentshift = bitcount32(extentsize - 1);
158	amp->am_nextents = ((mediasize - 1) / extentsize) + 1;
159	amp->am_mapsize = sizeof(bitstr_t) * bitstr_size(amp->am_nextents);
160	amp->am_diskmapsize = roundup2(amp->am_mapsize, sectorsize);
161	amp->am_ndirty = 0;
162	amp->am_syncoff = -2;
163	TAILQ_INIT(&amp->am_keepdirty);
164	amp->am_nkeepdirty = 0;
165
166	amp->am_memtab = calloc(amp->am_nextents, sizeof(amp->am_memtab[0]));
167	amp->am_diskmap = calloc(1, amp->am_diskmapsize);
168	amp->am_memmap = bit_alloc(amp->am_nextents);
169	amp->am_syncmap = bit_alloc(amp->am_nextents);
170
171	/*
172	 * Check to see if any of the allocations above failed.
173	 */
174	if (amp->am_memtab == NULL || amp->am_diskmap == NULL ||
175	    amp->am_memmap == NULL || amp->am_syncmap == NULL) {
176		if (amp->am_memtab != NULL)
177			free(amp->am_memtab);
178		if (amp->am_diskmap != NULL)
179			free(amp->am_diskmap);
180		if (amp->am_memmap != NULL)
181			free(amp->am_memmap);
182		if (amp->am_syncmap != NULL)
183			free(amp->am_syncmap);
184		amp->am_magic = 0;
185		free(amp);
186		errno = ENOMEM;
187		return (-1);
188	}
189
190	amp->am_magic = ACTIVEMAP_MAGIC;
191	*ampp = amp;
192
193	return (0);
194}
195
196static struct keepdirty *
197keepdirty_find(struct activemap *amp, int extent)
198{
199	struct keepdirty *kd;
200
201	TAILQ_FOREACH(kd, &amp->am_keepdirty, kd_next) {
202		if (kd->kd_extent == extent)
203			break;
204	}
205	return (kd);
206}
207
208static bool
209keepdirty_add(struct activemap *amp, int extent)
210{
211	struct keepdirty *kd;
212
213	kd = keepdirty_find(amp, extent);
214	if (kd != NULL) {
215		/*
216		 * Only move element at the begining.
217		 */
218		TAILQ_REMOVE(&amp->am_keepdirty, kd, kd_next);
219		TAILQ_INSERT_HEAD(&amp->am_keepdirty, kd, kd_next);
220		return (false);
221	}
222	/*
223	 * Add new element, but first remove the most unused one if
224	 * we have too many.
225	 */
226	if (amp->am_nkeepdirty >= amp->am_nkeepdirty_limit) {
227		kd = TAILQ_LAST(&amp->am_keepdirty, skeepdirty);
228		assert(kd != NULL);
229		TAILQ_REMOVE(&amp->am_keepdirty, kd, kd_next);
230		amp->am_nkeepdirty--;
231		assert(amp->am_nkeepdirty > 0);
232	}
233	if (kd == NULL)
234		kd = malloc(sizeof(*kd));
235	/* We can ignore allocation failure. */
236	if (kd != NULL) {
237		kd->kd_extent = extent;
238		amp->am_nkeepdirty++;
239		TAILQ_INSERT_HEAD(&amp->am_keepdirty, kd, kd_next);
240	}
241
242	return (true);
243}
244
245static void
246keepdirty_fill(struct activemap *amp)
247{
248	struct keepdirty *kd;
249
250	TAILQ_FOREACH(kd, &amp->am_keepdirty, kd_next)
251		bit_set(amp->am_diskmap, kd->kd_extent);
252}
253
254static void
255keepdirty_free(struct activemap *amp)
256{
257	struct keepdirty *kd;
258
259	while ((kd = TAILQ_FIRST(&amp->am_keepdirty)) != NULL) {
260		TAILQ_REMOVE(&amp->am_keepdirty, kd, kd_next);
261		amp->am_nkeepdirty--;
262		free(kd);
263	}
264	assert(amp->am_nkeepdirty == 0);
265}
266
267/*
268 * Function frees resources allocated by activemap_init() function.
269 */
270void
271activemap_free(struct activemap *amp)
272{
273
274	assert(amp->am_magic == ACTIVEMAP_MAGIC);
275
276	amp->am_magic = 0;
277
278	keepdirty_free(amp);
279	free(amp->am_memtab);
280	free(amp->am_diskmap);
281	free(amp->am_memmap);
282	free(amp->am_syncmap);
283}
284
285/*
286 * Function should be called before we handle write requests. It updates
287 * internal structures and returns true if on-disk metadata should be updated.
288 */
289bool
290activemap_write_start(struct activemap *amp, off_t offset, off_t length)
291{
292	bool modified;
293	off_t end;
294	int ext;
295
296	assert(amp->am_magic == ACTIVEMAP_MAGIC);
297	assert(length > 0);
298
299	modified = false;
300	end = offset + length - 1;
301
302	for (ext = off2ext(amp, offset); ext <= off2ext(amp, end); ext++) {
303		/*
304		 * If the number of pending writes is increased from 0,
305		 * we have to mark the extent as dirty also in on-disk bitmap.
306		 * By returning true we inform the caller that on-disk bitmap
307		 * was modified and has to be flushed to disk.
308		 */
309		if (amp->am_memtab[ext]++ == 0) {
310			assert(!bit_test(amp->am_memmap, ext));
311			bit_set(amp->am_memmap, ext);
312			amp->am_ndirty++;
313		}
314		if (keepdirty_add(amp, ext))
315			modified = true;
316	}
317
318	return (modified);
319}
320
321/*
322 * Function should be called after receiving write confirmation. It updates
323 * internal structures and returns true if on-disk metadata should be updated.
324 */
325bool
326activemap_write_complete(struct activemap *amp, off_t offset, off_t length)
327{
328	bool modified;
329	off_t end;
330	int ext;
331
332	assert(amp->am_magic == ACTIVEMAP_MAGIC);
333	assert(length > 0);
334
335	modified = false;
336	end = offset + length - 1;
337
338	for (ext = off2ext(amp, offset); ext <= off2ext(amp, end); ext++) {
339		/*
340		 * If the number of pending writes goes down to 0, we have to
341		 * mark the extent as clean also in on-disk bitmap.
342		 * By returning true we inform the caller that on-disk bitmap
343		 * was modified and has to be flushed to disk.
344		 */
345		assert(amp->am_memtab[ext] > 0);
346		assert(bit_test(amp->am_memmap, ext));
347		if (--amp->am_memtab[ext] == 0) {
348			bit_clear(amp->am_memmap, ext);
349			amp->am_ndirty--;
350			if (keepdirty_find(amp, ext) == NULL)
351				modified = true;
352		}
353	}
354
355	return (modified);
356}
357
358/*
359 * Function should be called after finishing synchronization of one extent.
360 * It returns true if on-disk metadata should be updated.
361 */
362bool
363activemap_extent_complete(struct activemap *amp, int extent)
364{
365	bool modified;
366	int reqs;
367
368	assert(amp->am_magic == ACTIVEMAP_MAGIC);
369	assert(extent >= 0 && extent < amp->am_nextents);
370
371	modified = false;
372
373	reqs = ext2reqs(amp, extent);
374	assert(amp->am_memtab[extent] >= reqs);
375	amp->am_memtab[extent] -= reqs;
376	assert(bit_test(amp->am_memmap, extent));
377	if (amp->am_memtab[extent] == 0) {
378		bit_clear(amp->am_memmap, extent);
379		amp->am_ndirty--;
380		modified = true;
381	}
382
383	return (modified);
384}
385
386/*
387 * Function returns number of dirty regions.
388 */
389uint64_t
390activemap_ndirty(const struct activemap *amp)
391{
392
393	assert(amp->am_magic == ACTIVEMAP_MAGIC);
394
395	return (amp->am_ndirty);
396}
397
398/*
399 * Function compare on-disk bitmap and in-memory bitmap and returns true if
400 * they differ and should be flushed to the disk.
401 */
402bool
403activemap_differ(const struct activemap *amp)
404{
405
406	assert(amp->am_magic == ACTIVEMAP_MAGIC);
407
408	return (memcmp(amp->am_diskmap, amp->am_memmap,
409	    amp->am_mapsize) != 0);
410}
411
412/*
413 * Function returns number of bytes used by bitmap.
414 */
415size_t
416activemap_size(const struct activemap *amp)
417{
418
419	assert(amp->am_magic == ACTIVEMAP_MAGIC);
420
421	return (amp->am_mapsize);
422}
423
424/*
425 * Function returns number of bytes needed for storing on-disk bitmap.
426 * This is the same as activemap_size(), but rounded up to sector size.
427 */
428size_t
429activemap_ondisk_size(const struct activemap *amp)
430{
431
432	assert(amp->am_magic == ACTIVEMAP_MAGIC);
433
434	return (amp->am_diskmapsize);
435}
436
437/*
438 * Function copies the given buffer read from disk to the internal bitmap.
439 */
440void
441activemap_copyin(struct activemap *amp, const unsigned char *buf, size_t size)
442{
443	int ext;
444
445	assert(amp->am_magic == ACTIVEMAP_MAGIC);
446	assert(size >= amp->am_mapsize);
447
448	memcpy(amp->am_diskmap, buf, amp->am_mapsize);
449	memcpy(amp->am_memmap, buf, amp->am_mapsize);
450	memcpy(amp->am_syncmap, buf, amp->am_mapsize);
451
452	bit_ffs(amp->am_memmap, amp->am_nextents, &ext);
453	if (ext == -1) {
454		/* There are no dirty extents, so we can leave now. */
455		return;
456	}
457	/*
458	 * Set synchronization offset to the first dirty extent.
459	 */
460	activemap_sync_rewind(amp);
461	/*
462	 * We have dirty extents and we want them to stay that way until
463	 * we synchronize, so we set number of pending writes to number
464	 * of requests needed to synchronize one extent.
465	 */
466	amp->am_ndirty = 0;
467	for (; ext < amp->am_nextents; ext++) {
468		if (bit_test(amp->am_memmap, ext)) {
469			amp->am_memtab[ext] = ext2reqs(amp, ext);
470			amp->am_ndirty++;
471		}
472	}
473}
474
475/*
476 * Function merges the given bitmap with existing one.
477 */
478void
479activemap_merge(struct activemap *amp, const unsigned char *buf, size_t size)
480{
481	bitstr_t *remmap = __DECONST(bitstr_t *, buf);
482	int ext;
483
484	assert(amp->am_magic == ACTIVEMAP_MAGIC);
485	assert(size >= amp->am_mapsize);
486
487	bit_ffs(remmap, amp->am_nextents, &ext);
488	if (ext == -1) {
489		/* There are no dirty extents, so we can leave now. */
490		return;
491	}
492	/*
493	 * We have dirty extents and we want them to stay that way until
494	 * we synchronize, so we set number of pending writes to number
495	 * of requests needed to synchronize one extent.
496	 */
497	for (; ext < amp->am_nextents; ext++) {
498		/* Local extent already dirty. */
499		if (bit_test(amp->am_syncmap, ext))
500			continue;
501		/* Remote extent isn't dirty. */
502		if (!bit_test(remmap, ext))
503			continue;
504		bit_set(amp->am_syncmap, ext);
505		bit_set(amp->am_memmap, ext);
506		bit_set(amp->am_diskmap, ext);
507		if (amp->am_memtab[ext] == 0)
508			amp->am_ndirty++;
509		amp->am_memtab[ext] = ext2reqs(amp, ext);
510	}
511	/*
512	 * Set synchronization offset to the first dirty extent.
513	 */
514	activemap_sync_rewind(amp);
515}
516
517/*
518 * Function returns pointer to internal bitmap that should be written to disk.
519 */
520const unsigned char *
521activemap_bitmap(struct activemap *amp, size_t *sizep)
522{
523
524	assert(amp->am_magic == ACTIVEMAP_MAGIC);
525
526	if (sizep != NULL)
527		*sizep = amp->am_diskmapsize;
528	memcpy(amp->am_diskmap, amp->am_memmap, amp->am_mapsize);
529	keepdirty_fill(amp);
530	return ((const unsigned char *)amp->am_diskmap);
531}
532
533/*
534 * Function calculates size needed to store bitmap on disk.
535 */
536size_t
537activemap_calc_ondisk_size(uint64_t mediasize, uint32_t extentsize,
538    uint32_t sectorsize)
539{
540	uint64_t nextents, mapsize;
541
542	assert(mediasize > 0);
543	assert(extentsize > 0);
544	assert(powerof2(extentsize));
545	assert(sectorsize > 0);
546	assert(powerof2(sectorsize));
547
548	nextents = ((mediasize - 1) / extentsize) + 1;
549	mapsize = sizeof(bitstr_t) * bitstr_size(nextents);
550	return (roundup2(mapsize, sectorsize));
551}
552
553/*
554 * Set synchronization offset to the first dirty extent.
555 */
556void
557activemap_sync_rewind(struct activemap *amp)
558{
559	int ext;
560
561	assert(amp->am_magic == ACTIVEMAP_MAGIC);
562
563	bit_ffs(amp->am_syncmap, amp->am_nextents, &ext);
564	if (ext == -1) {
565		/* There are no extents to synchronize. */
566		amp->am_syncoff = -2;
567		return;
568	}
569	/*
570	 * Mark that we want to start synchronization from the begining.
571	 */
572	amp->am_syncoff = -1;
573}
574
575/*
576 * Return next offset of where we should synchronize.
577 */
578off_t
579activemap_sync_offset(struct activemap *amp, off_t *lengthp, int *syncextp)
580{
581	off_t syncoff, left;
582	int ext;
583
584	assert(amp->am_magic == ACTIVEMAP_MAGIC);
585	assert(lengthp != NULL);
586	assert(syncextp != NULL);
587
588	*syncextp = -1;
589
590	if (amp->am_syncoff == -2)
591		return (-1);
592
593	if (amp->am_syncoff >= 0 &&
594	    (amp->am_syncoff + MAXPHYS >= amp->am_mediasize ||
595	     off2ext(amp, amp->am_syncoff) !=
596	     off2ext(amp, amp->am_syncoff + MAXPHYS))) {
597		/*
598		 * We are about to change extent, so mark previous one as clean.
599		 */
600		ext = off2ext(amp, amp->am_syncoff);
601		bit_clear(amp->am_syncmap, ext);
602		*syncextp = ext;
603		amp->am_syncoff = -1;
604	}
605
606	if (amp->am_syncoff == -1) {
607		/*
608		 * Let's find first extent to synchronize.
609		 */
610		bit_ffs(amp->am_syncmap, amp->am_nextents, &ext);
611		if (ext == -1) {
612			amp->am_syncoff = -2;
613			return (-1);
614		}
615		amp->am_syncoff = ext2off(amp, ext);
616	} else {
617		/*
618		 * We don't change extent, so just increase offset.
619		 */
620		amp->am_syncoff += MAXPHYS;
621		if (amp->am_syncoff >= amp->am_mediasize) {
622			amp->am_syncoff = -2;
623			return (-1);
624		}
625	}
626
627	syncoff = amp->am_syncoff;
628	left = ext2off(amp, off2ext(amp, syncoff)) +
629	    amp->am_extentsize - syncoff;
630	if (syncoff + left > amp->am_mediasize)
631		left = amp->am_mediasize - syncoff;
632	if (left > MAXPHYS)
633		left = MAXPHYS;
634
635	assert(left >= 0 && left <= MAXPHYS);
636	assert(syncoff >= 0 && syncoff < amp->am_mediasize);
637	assert(syncoff + left >= 0 && syncoff + left <= amp->am_mediasize);
638
639	*lengthp = left;
640	return (syncoff);
641}
642
643/*
644 * Mark extent(s) containing the given region for synchronization.
645 * Most likely one of the components is unavailable.
646 */
647bool
648activemap_need_sync(struct activemap *amp, off_t offset, off_t length)
649{
650	bool modified;
651	off_t end;
652	int ext;
653
654	assert(amp->am_magic == ACTIVEMAP_MAGIC);
655
656	modified = false;
657	end = offset + length - 1;
658
659	for (ext = off2ext(amp, offset); ext <= off2ext(amp, end); ext++) {
660		if (bit_test(amp->am_syncmap, ext)) {
661			/* Already marked for synchronization. */
662			assert(bit_test(amp->am_memmap, ext));
663			continue;
664		}
665		bit_set(amp->am_syncmap, ext);
666		if (!bit_test(amp->am_memmap, ext)) {
667			bit_set(amp->am_memmap, ext);
668			amp->am_ndirty++;
669		}
670		amp->am_memtab[ext] += ext2reqs(amp, ext);
671		modified = true;
672	}
673
674	return (modified);
675}
676
677void
678activemap_dump(const struct activemap *amp)
679{
680	int bit;
681
682	printf("M: ");
683	for (bit = 0; bit < amp->am_nextents; bit++)
684		printf("%d", bit_test(amp->am_memmap, bit) ? 1 : 0);
685	printf("\n");
686	printf("D: ");
687	for (bit = 0; bit < amp->am_nextents; bit++)
688		printf("%d", bit_test(amp->am_diskmap, bit) ? 1 : 0);
689	printf("\n");
690	printf("S: ");
691	for (bit = 0; bit < amp->am_nextents; bit++)
692		printf("%d", bit_test(amp->am_syncmap, bit) ? 1 : 0);
693	printf("\n");
694}
695