/* * Copyright (c) 2001-2006 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in * compliance with the License. The rights granted to you under the License * may not be used to create, or enable the creation or redistribution of, * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* * shadow.c * * Implement copy-on-write shadow map to allow a disk image to be * mounted read-only, yet be writable by transferring writes to a * "shadow" file. Subsequent reads from blocks that have been * written will then go the "shadow" file. * * The map has two parts: * 1) a bit map to track which blocks have been written * 2) a band map to map a "band" within the original file to a corresponding * "band" in the shadow file. Each band has the same size. * * The band map is used to ensure that blocks that are contiguous in the * original file will remain contiguous in the shadow file. * * For debugging purposes, this file can be compiled standalone using: * cc -o shadow shadow.c -DTEST_SHADOW */ /* * Modification History * * December 21, 2001 Dieter Siegmund (dieter@apple.com) * - initial revision */ #include #include #include #include #ifdef TEST_SHADOW #include #include #define my_malloc(a) malloc(a) #define my_free(a) free(a) #else /* !TEST_SHADOW */ #include #define my_malloc(a) _MALLOC(a, M_TEMP, M_WAITOK) #define my_free(a) FREE(a, M_TEMP) #include #endif /* TEST_SHADOW */ #include "shadow.h" #define UINT32_ALL_ONES ((uint32_t)(-1)) #define USHORT_ALL_ONES ((u_short)(-1)) #define UCHAR_ALL_ONES ((u_char)(-1)) #define my_trunc(value, divisor) ((value) / (divisor) * (divisor)) /* a band size of 128K can represent a file up to 8GB */ #define BAND_SIZE_DEFAULT_POWER_2 17 #define BAND_SIZE_DEFAULT (1 << BAND_SIZE_DEFAULT_POWER_2) typedef u_short band_number_t; #define BAND_ZERO ((band_number_t)0) #define BAND_MAX ((band_number_t)65535) struct shadow_map { uint32_t blocks_per_band;/* size in blocks */ uint32_t block_size; u_char * block_bitmap; /* 1 bit per block; 1=written */ band_number_t * bands; /* band map array */ uint32_t file_size_blocks; /* size of file in bands */ uint32_t shadow_size_bands; /* size of shadow in bands */ uint32_t next_band; /* next free band */ uint32_t zeroth_band; /* special-case 0th band */ }; typedef struct { uint32_t byte; uint32_t bit; } bitmap_offset_t; static __inline__ u_char bit(int b) { return ((u_char)(1 << b)); } /* * Function: bits_lower * Purpose: * Return a byte value in which bits numbered lower than 'b' are set. */ static __inline__ u_char bits_lower(int b) { return ((u_char)(bit(b) - 1)); } /* * Function: byte_set_bits * Purpose: * Set the given range of bits within a byte. */ static __inline__ u_char byte_set_bits(int start, int end) { return ((u_char)((~bits_lower(start)) & (bits_lower(end) | bit(end)))); } static __inline__ bitmap_offset_t bitmap_offset(off_t where) { bitmap_offset_t b; b.byte = where / NBBY; b.bit = where % NBBY; return (b); } /* * Function: bitmap_set * * Purpose: * Set the given range of bits. * * This algorithm tries to set the extents using the biggest * units, using longs, then a short, then a byte, then bits. */ static void bitmap_set(u_char * map, uint32_t start_bit, uint32_t bit_count) { bitmap_offset_t start; bitmap_offset_t end; start = bitmap_offset(start_bit); end = bitmap_offset(start_bit + bit_count); if (start.byte < end.byte) { uint32_t n_bytes; if (start.bit) { map[start.byte] |= byte_set_bits(start.bit, NBBY - 1); start.bit = 0; start.byte++; if (start.byte == end.byte) goto end; } n_bytes = end.byte - start.byte; while (n_bytes >= (sizeof(uint32_t))) { *((uint32_t *)(map + start.byte)) = UINT32_ALL_ONES; start.byte += sizeof(uint32_t); n_bytes -= sizeof(uint32_t); } if (n_bytes >= sizeof(u_short)) { *((u_short *)(map + start.byte)) = USHORT_ALL_ONES; start.byte += sizeof(u_short); n_bytes -= sizeof(u_short); } if (n_bytes == 1) { map[start.byte] = UCHAR_ALL_ONES; start.byte++; n_bytes = 0; } } end: if (end.bit > start.bit) { map[start.byte] |= byte_set_bits(start.bit, end.bit - 1); } return; } /* * Function: bitmap_get * * Purpose: * Return the number of bits in the range that are the same e.g. * 11101 returns 3 because the first 3 bits are the same (1's), whereas * 001100 returns 2 because the first 2 bits are the same. * This algorithm tries to count things in as big a chunk as possible, * first aligning to a byte offset, then trying to count longs, a short, * a byte, then any remaining bits to find the bit that is different. */ static uint32_t bitmap_get(u_char * map, uint32_t start_bit, uint32_t bit_count, boolean_t * ret_is_set) { uint32_t count; int i; boolean_t is_set; bitmap_offset_t start; bitmap_offset_t end; start = bitmap_offset(start_bit); end = bitmap_offset(start_bit + bit_count); is_set = (map[start.byte] & bit(start.bit)) ? TRUE : FALSE; count = 0; if (start.byte < end.byte) { uint32_t n_bytes; if (start.bit) { /* try to align to a byte */ for (i = start.bit; i < NBBY; i++) { boolean_t this_is_set; this_is_set = (map[start.byte] & bit(i)) ? TRUE : FALSE; if (this_is_set != is_set) { goto done; /* found bit that was different, we're done */ } count++; } start.bit = 0; /* made it to the next byte */ start.byte++; if (start.byte == end.byte) goto end; /* no more bytes, check for any leftover bits */ } /* calculate how many bytes are left in the range */ n_bytes = end.byte - start.byte; /* check for 4 bytes of the same bits */ while (n_bytes >= sizeof(uint32_t)) { uint32_t * valPtr = (uint32_t *)(map + start.byte); if ((is_set && *valPtr == UINT32_ALL_ONES) || (!is_set && *valPtr == 0)) { count += sizeof(*valPtr) * NBBY; start.byte += sizeof(*valPtr); n_bytes -= sizeof(*valPtr); } else break; /* bits differ */ } /* check for 2 bytes of the same bits */ if (n_bytes >= sizeof(u_short)) { u_short * valPtr = (u_short *)(map + start.byte); if ((is_set && *valPtr == USHORT_ALL_ONES) || (!is_set && (*valPtr == 0))) { count += sizeof(*valPtr) * NBBY; start.byte += sizeof(*valPtr); n_bytes -= sizeof(*valPtr); } } /* check for 1 byte of the same bits */ if (n_bytes) { if ((is_set && map[start.byte] == UCHAR_ALL_ONES) || (!is_set && map[start.byte] == 0)) { count += NBBY; start.byte++; n_bytes--; } /* we found bits that were different, find the first one */ if (n_bytes) { for (i = 0; i < NBBY; i++) { boolean_t this_is_set; this_is_set = (map[start.byte] & bit(i)) ? TRUE : FALSE; if (this_is_set != is_set) { break; } count++; } goto done; } } } end: for (i = start.bit; i < (int)end.bit; i++) { boolean_t this_is_set = (map[start.byte] & bit(i)) ? TRUE : FALSE; if (this_is_set != is_set) { break; } count++; } done: *ret_is_set = is_set; return (count); } static __inline__ band_number_t shadow_map_block_to_band(shadow_map_t * map, uint32_t block) { return (block / map->blocks_per_band); } /* * Function: shadow_map_mapped_band * Purpose: * Return the mapped band for the given band. * If map_it is FALSE, and the band is not mapped, return FALSE. * If map_it is TRUE, then this function will always return TRUE. */ static boolean_t shadow_map_mapped_band(shadow_map_t * map, band_number_t band, boolean_t map_it, band_number_t * mapped_band) { boolean_t is_mapped = FALSE; if (band == map->zeroth_band) { *mapped_band = BAND_ZERO; is_mapped = TRUE; } else { *mapped_band = map->bands[band]; if (*mapped_band == BAND_ZERO) { if (map_it) { /* grow the file */ if (map->next_band == 0) { /* remember the zero'th band */ map->zeroth_band = band; } *mapped_band = map->bands[band] = map->next_band++; is_mapped = TRUE; } } else { is_mapped = TRUE; } } return (is_mapped); } /* * Function: shadow_map_contiguous * * Purpose: * Return the first offset within the range position..(position + count) * that is not a contiguous mapped band. * * If called with is_write = TRUE, this function will map bands as it goes. */ static uint32_t shadow_map_contiguous(shadow_map_t * map, uint32_t start_block, uint32_t num_blocks, boolean_t is_write) { band_number_t band = shadow_map_block_to_band(map, start_block); uint32_t end_block = start_block + num_blocks; boolean_t is_mapped; band_number_t mapped_band; uint32_t ret_end_block = end_block; uint32_t p; is_mapped = shadow_map_mapped_band(map, band, is_write, &mapped_band); if (is_write == FALSE && is_mapped == FALSE) { static int happened = 0; /* this can't happen */ if (happened == 0) { printf("shadow_map_contiguous: this can't happen!\n"); happened = 1; } return (start_block); } for (p = my_trunc(start_block + map->blocks_per_band, map->blocks_per_band); p < end_block; p += map->blocks_per_band) { band_number_t next_mapped_band; band++; is_mapped = shadow_map_mapped_band(map, band, is_write, &next_mapped_band); if (is_write == FALSE && is_mapped == FALSE) { return (p); } if ((mapped_band + 1) != next_mapped_band) { /* not contiguous */ ret_end_block = p; break; } mapped_band = next_mapped_band; } return (ret_end_block); } /* * Function: block_bitmap_size * Purpose: * The number of bytes required in a block bitmap to represent a file of size * file_size. * * The bytes required is the number of blocks in the file, * divided by the number of bits per byte. * Note: * An 8GB file requires (assuming 512 byte block): * 2^33 / 2^9 / 2^3 = 2^21 = 2MB * of bitmap space. This is a non-trival amount of memory, * particularly since most of the bits will be zero. * A sparse bitmap would really help in this case. */ static __inline__ uint32_t block_bitmap_size(off_t file_size, uint32_t block_size) { off_t blocks = howmany(file_size, block_size); return (howmany(blocks, NBBY)); } /* * Function: shadow_map_read * * Purpose: * Calculate the block offset within the shadow to read, and the number * blocks to read. The input values (block_offset, block_count) refer * to the original file. * * The output values (*incr_block_offset, *incr_block_count) refer to the * shadow file if the return value is TRUE. They refer to the original * file if the return value is FALSE. * Blocks within a band may or may not have been written, in addition, * Bands are not necessarily contiguous, therefore: * *incr_block_count <= block_count * The caller must be prepared to call this function interatively * to complete the whole i/o. * Returns: * TRUE if the shadow file should be read, FALSE if the original file * should be read. */ boolean_t shadow_map_read(shadow_map_t * map, uint32_t block_offset, uint32_t block_count, uint32_t * incr_block_offset, uint32_t * incr_block_count) { boolean_t written = FALSE; uint32_t n_blocks; if (block_offset >= map->file_size_blocks || (block_offset + block_count) > map->file_size_blocks) { printf("shadow_map_read: request (%d, %d) exceeds file size %d\n", block_offset, block_count, map->file_size_blocks); *incr_block_count = 0; } n_blocks = bitmap_get(map->block_bitmap, block_offset, block_count, &written); if (written == FALSE) { *incr_block_count = n_blocks; *incr_block_offset = block_offset; } else { /* start has been written, and therefore mapped */ band_number_t mapped_band; uint32_t band_limit; mapped_band = map->bands[shadow_map_block_to_band(map, block_offset)]; *incr_block_offset = mapped_band * map->blocks_per_band + (block_offset % map->blocks_per_band); band_limit = shadow_map_contiguous(map, block_offset, block_count, FALSE); *incr_block_count = band_limit - block_offset; if (*incr_block_count > n_blocks) { *incr_block_count = n_blocks; } } return (written); } /* * Function: shadow_map_write * * Purpose: * Calculate the block offset within the shadow to write, and the number * blocks to write. The input values (block_offset, block_count) refer * to the original file. The output values * (*incr_block_offset, *incr_block_count) refer to the shadow file. * * Bands are not necessarily contiguous, therefore: * *incr_block_count <= block_count * The caller must be prepared to call this function interatively * to complete the whole i/o. * Returns: * TRUE if the shadow file was grown, FALSE otherwise. */ boolean_t shadow_map_write(shadow_map_t * map, uint32_t block_offset, uint32_t block_count, uint32_t * incr_block_offset, uint32_t * incr_block_count) { uint32_t band_limit; band_number_t mapped_band; boolean_t shadow_grew = FALSE; if (block_offset >= map->file_size_blocks || (block_offset + block_count) > map->file_size_blocks) { printf("shadow_map_write: request (%d, %d) exceeds file size %d\n", block_offset, block_count, map->file_size_blocks); *incr_block_count = 0; } band_limit = shadow_map_contiguous(map, block_offset, block_count, TRUE); mapped_band = map->bands[shadow_map_block_to_band(map, block_offset)]; *incr_block_offset = mapped_band * map->blocks_per_band + (block_offset % map->blocks_per_band); *incr_block_count = band_limit - block_offset; /* mark these blocks as written */ bitmap_set(map->block_bitmap, block_offset, *incr_block_count); if (map->next_band > map->shadow_size_bands) { map->shadow_size_bands = map->next_band; shadow_grew = TRUE; } return (shadow_grew); } boolean_t shadow_map_is_written(shadow_map_t * map, uint32_t block_offset) { bitmap_offset_t b; b = bitmap_offset(block_offset); return ((map->block_bitmap[b.byte] & bit(b.bit)) ? TRUE : FALSE); } /* * Function: shadow_map_shadow_size * * Purpose: * To return the size of the shadow file in blocks. */ uint32_t shadow_map_shadow_size(shadow_map_t * map) { return (map->shadow_size_bands * map->blocks_per_band); } /* * Function: shadow_map_create * * Purpose: * Allocate the dynamic data for keeping track of the shadow dirty blocks * and the band mapping table. * Returns: * NULL if an error occurred. */ shadow_map_t * shadow_map_create(off_t file_size, off_t shadow_size, uint32_t band_size, uint32_t block_size) { void * block_bitmap = NULL; uint32_t bitmap_size; band_number_t * bands = NULL; shadow_map_t * map; uint32_t n_bands = 0; if (band_size == 0) { band_size = BAND_SIZE_DEFAULT; } n_bands = howmany(file_size, band_size); if (n_bands > (BAND_MAX + 1)) { printf("file is too big: %d > %d\n", n_bands, BAND_MAX); goto failure; } /* create a block bitmap, one bit per block */ bitmap_size = block_bitmap_size(file_size, block_size); block_bitmap = my_malloc(bitmap_size); if (block_bitmap == NULL) { printf("failed to allocate bitmap\n"); goto failure; } bzero(block_bitmap, bitmap_size); /* get the band map */ bands = (band_number_t *)my_malloc(n_bands * sizeof(band_number_t)); if (bands == NULL) { printf("failed to allocate bands\n"); goto failure; } bzero(bands, n_bands * sizeof(band_number_t)); map = my_malloc(sizeof(*map)); if (map == NULL) { printf("failed to allocate map\n"); goto failure; } map->blocks_per_band = band_size / block_size; map->block_bitmap = block_bitmap; map->bands = bands; map->file_size_blocks = n_bands * map->blocks_per_band; map->next_band = 0; map->zeroth_band = -1; map->shadow_size_bands = howmany(shadow_size, band_size); map->block_size = block_size; return (map); failure: if (block_bitmap) my_free(block_bitmap); if (bands) my_free(bands); return (NULL); } /* * Function: shadow_map_free * Purpose: * Frees the data structure to deal with the shadow map. */ void shadow_map_free(shadow_map_t * map) { if (map->block_bitmap) my_free(map->block_bitmap); if (map->bands) my_free(map->bands); map->block_bitmap = NULL; map->bands = NULL; my_free(map); return; } #ifdef TEST_SHADOW #define BAND_SIZE_BLOCKS (BAND_SIZE_DEFAULT / 512) enum { ReadRequest, WriteRequest, }; typedef struct { int type; uint32_t offset; uint32_t count; } block_request_t; int main() { shadow_map_t * map; int i; block_request_t requests[] = { { WriteRequest, BAND_SIZE_BLOCKS * 2, 1 }, { ReadRequest, BAND_SIZE_BLOCKS / 2, BAND_SIZE_BLOCKS * 2 - 2 }, { WriteRequest, BAND_SIZE_BLOCKS * 1, 5 * BAND_SIZE_BLOCKS + 3}, { ReadRequest, 0, BAND_SIZE_BLOCKS * 10 }, { WriteRequest, BAND_SIZE_BLOCKS * (BAND_MAX - 1), BAND_SIZE_BLOCKS * 2}, { 0, 0 }, }; map = shadow_map_create(1024 * 1024 * 1024 * 8ULL, 0, 0, 512); if (map == NULL) { printf("shadow_map_create failed\n"); exit(1); } for (i = 0; TRUE; i++) { uint32_t offset; uint32_t resid; boolean_t shadow_grew; boolean_t read_shadow; if (requests[i].count == 0) { break; } offset = requests[i].offset; resid = requests[i].count; printf("\n%s REQUEST (%ld, %ld)\n", requests[i].type == WriteRequest ? "WRITE" : "READ", offset, resid); switch (requests[i].type) { case WriteRequest: while (resid > 0) { uint32_t this_offset; uint32_t this_count; shadow_grew = shadow_map_write(map, offset, resid, &this_offset, &this_count); printf("\t(%ld, %ld) => (%ld, %ld)", offset, resid, this_offset, this_count); resid -= this_count; offset += this_count; if (shadow_grew) { printf(" shadow grew to %ld", shadow_map_shadow_size(map)); } printf("\n"); } break; case ReadRequest: while (resid > 0) { uint32_t this_offset; uint32_t this_count; read_shadow = shadow_map_read(map, offset, resid, &this_offset, &this_count); printf("\t(%ld, %ld) => (%ld, %ld)%s\n", offset, resid, this_offset, this_count, read_shadow ? " from shadow" : ""); if (this_count == 0) { printf("this_count is 0, aborting\n"); break; } resid -= this_count; offset += this_count; } break; default: break; } } if (map) { shadow_map_free(map); } exit(0); return (0); } #endif