1/* 2 * Copyright (c) 2001-2006 Apple Computer, Inc. All rights reserved. 3 * 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ 5 * 6 * This file contains Original Code and/or Modifications of Original Code 7 * as defined in and that are subject to the Apple Public Source License 8 * Version 2.0 (the 'License'). You may not use this file except in 9 * compliance with the License. The rights granted to you under the License 10 * may not be used to create, or enable the creation or redistribution of, 11 * unlawful or unlicensed copies of an Apple operating system, or to 12 * circumvent, violate, or enable the circumvention or violation of, any 13 * terms of an Apple operating system software license agreement. 14 * 15 * Please obtain a copy of the License at 16 * http://www.opensource.apple.com/apsl/ and read it before using this file. 17 * 18 * The Original Code and all software distributed under the License are 19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 23 * Please see the License for the specific language governing rights and 24 * limitations under the License. 25 * 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ 27 */ 28 29/* 30 * shadow.c 31 * 32 * Implement copy-on-write shadow map to allow a disk image to be 33 * mounted read-only, yet be writable by transferring writes to a 34 * "shadow" file. Subsequent reads from blocks that have been 35 * written will then go the "shadow" file. 36 * 37 * The map has two parts: 38 * 1) a bit map to track which blocks have been written 39 * 2) a band map to map a "band" within the original file to a corresponding 40 * "band" in the shadow file. Each band has the same size. 41 * 42 * The band map is used to ensure that blocks that are contiguous in the 43 * original file will remain contiguous in the shadow file. 44 * 45 * For debugging purposes, this file can be compiled standalone using: 46 * cc -o shadow shadow.c -DTEST_SHADOW 47 */ 48 49/* 50 * Modification History 51 * 52 * December 21, 2001 Dieter Siegmund (dieter@apple.com) 53 * - initial revision 54 */ 55#include <sys/param.h> 56#include <sys/types.h> 57#include <mach/boolean.h> 58 59#include <string.h> 60 61#ifdef TEST_SHADOW 62#include <unistd.h> 63#include <stdlib.h> 64#define my_malloc(a) malloc(a) 65#define my_free(a) free(a) 66#else /* !TEST_SHADOW */ 67#include <sys/malloc.h> 68#define my_malloc(a) _MALLOC(a, M_TEMP, M_WAITOK) 69#define my_free(a) FREE(a, M_TEMP) 70#include <libkern/libkern.h> 71#endif /* TEST_SHADOW */ 72 73#include "shadow.h" 74 75#define UINT32_ALL_ONES ((uint32_t)(-1)) 76#define USHORT_ALL_ONES ((u_short)(-1)) 77#define UCHAR_ALL_ONES ((u_char)(-1)) 78 79#define my_trunc(value, divisor) ((value) / (divisor) * (divisor)) 80 81/* a band size of 128K can represent a file up to 8GB */ 82#define BAND_SIZE_DEFAULT_POWER_2 17 83#define BAND_SIZE_DEFAULT (1 << BAND_SIZE_DEFAULT_POWER_2) 84 85typedef u_short band_number_t; 86#define BAND_ZERO ((band_number_t)0) 87#define BAND_MAX ((band_number_t)65535) 88 89struct shadow_map { 90 uint32_t blocks_per_band;/* size in blocks */ 91 uint32_t block_size; 92 u_char * block_bitmap; /* 1 bit per block; 1=written */ 93 band_number_t * bands; /* band map array */ 94 uint32_t file_size_blocks; /* size of file in bands */ 95 uint32_t shadow_size_bands; /* size of shadow in bands */ 96 uint32_t next_band; /* next free band */ 97 uint32_t zeroth_band; /* special-case 0th band */ 98}; 99 100 101typedef struct { 102 uint32_t byte; 103 uint32_t bit; 104} bitmap_offset_t; 105 106static __inline__ u_char 107bit(int b) 108{ 109 return ((u_char)(1 << b)); 110} 111 112/* 113 * Function: bits_lower 114 * Purpose: 115 * Return a byte value in which bits numbered lower than 'b' are set. 116 */ 117static __inline__ u_char 118bits_lower(int b) 119{ 120 return ((u_char)(bit(b) - 1)); 121} 122 123/* 124 * Function: byte_set_bits 125 * Purpose: 126 * Set the given range of bits within a byte. 127 */ 128static __inline__ u_char 129byte_set_bits(int start, int end) 130{ 131 return ((u_char)((~bits_lower(start)) & (bits_lower(end) | bit(end)))); 132} 133 134static __inline__ bitmap_offset_t 135bitmap_offset(off_t where) 136{ 137 bitmap_offset_t b; 138 139 b.byte = where / NBBY; 140 b.bit = where % NBBY; 141 return (b); 142} 143 144/* 145 * Function: bitmap_set 146 * 147 * Purpose: 148 * Set the given range of bits. 149 * 150 * This algorithm tries to set the extents using the biggest 151 * units, using longs, then a short, then a byte, then bits. 152 */ 153static void 154bitmap_set(u_char * map, uint32_t start_bit, uint32_t bit_count) 155{ 156 bitmap_offset_t start; 157 bitmap_offset_t end; 158 159 start = bitmap_offset(start_bit); 160 end = bitmap_offset(start_bit + bit_count); 161 if (start.byte < end.byte) { 162 uint32_t n_bytes; 163 164 if (start.bit) { 165 map[start.byte] |= byte_set_bits(start.bit, NBBY - 1); 166 start.bit = 0; 167 start.byte++; 168 if (start.byte == end.byte) 169 goto end; 170 } 171 172 n_bytes = end.byte - start.byte; 173 174 while (n_bytes >= (sizeof(uint32_t))) { 175 *((uint32_t *)(map + start.byte)) = UINT32_ALL_ONES; 176 start.byte += sizeof(uint32_t); 177 n_bytes -= sizeof(uint32_t); 178 } 179 if (n_bytes >= sizeof(u_short)) { 180 *((u_short *)(map + start.byte)) = USHORT_ALL_ONES; 181 start.byte += sizeof(u_short); 182 n_bytes -= sizeof(u_short); 183 } 184 if (n_bytes == 1) { 185 map[start.byte] = UCHAR_ALL_ONES; 186 start.byte++; 187 n_bytes = 0; 188 } 189 } 190 191 end: 192 if (end.bit > start.bit) { 193 map[start.byte] |= byte_set_bits(start.bit, end.bit - 1); 194 } 195 196 return; 197} 198 199/* 200 * Function: bitmap_get 201 * 202 * Purpose: 203 * Return the number of bits in the range that are the same e.g. 204 * 11101 returns 3 because the first 3 bits are the same (1's), whereas 205 * 001100 returns 2 because the first 2 bits are the same. 206 * This algorithm tries to count things in as big a chunk as possible, 207 * first aligning to a byte offset, then trying to count longs, a short, 208 * a byte, then any remaining bits to find the bit that is different. 209 */ 210 211static uint32_t 212bitmap_get(u_char * map, uint32_t start_bit, uint32_t bit_count, 213 boolean_t * ret_is_set) 214{ 215 uint32_t count; 216 int i; 217 boolean_t is_set; 218 bitmap_offset_t start; 219 bitmap_offset_t end; 220 221 start = bitmap_offset(start_bit); 222 end = bitmap_offset(start_bit + bit_count); 223 224 is_set = (map[start.byte] & bit(start.bit)) ? TRUE : FALSE; 225 count = 0; 226 227 if (start.byte < end.byte) { 228 uint32_t n_bytes; 229 230 if (start.bit) { /* try to align to a byte */ 231 for (i = start.bit; i < NBBY; i++) { 232 boolean_t this_is_set; 233 234 this_is_set = (map[start.byte] & bit(i)) ? TRUE : FALSE; 235 if (this_is_set != is_set) { 236 goto done; /* found bit that was different, we're done */ 237 } 238 count++; 239 } 240 start.bit = 0; /* made it to the next byte */ 241 start.byte++; 242 if (start.byte == end.byte) 243 goto end; /* no more bytes, check for any leftover bits */ 244 } 245 /* calculate how many bytes are left in the range */ 246 n_bytes = end.byte - start.byte; 247 248 /* check for 4 bytes of the same bits */ 249 while (n_bytes >= sizeof(uint32_t)) { 250 uint32_t * valPtr = (uint32_t *)(map + start.byte); 251 if ((is_set && *valPtr == UINT32_ALL_ONES) 252 || (!is_set && *valPtr == 0)) { 253 count += sizeof(*valPtr) * NBBY; 254 start.byte += sizeof(*valPtr); 255 n_bytes -= sizeof(*valPtr); 256 } 257 else 258 break; /* bits differ */ 259 260 } 261 /* check for 2 bytes of the same bits */ 262 if (n_bytes >= sizeof(u_short)) { 263 u_short * valPtr = (u_short *)(map + start.byte); 264 265 if ((is_set && *valPtr == USHORT_ALL_ONES) 266 || (!is_set && (*valPtr == 0))) { 267 count += sizeof(*valPtr) * NBBY; 268 start.byte += sizeof(*valPtr); 269 n_bytes -= sizeof(*valPtr); 270 } 271 } 272 273 /* check for 1 byte of the same bits */ 274 if (n_bytes) { 275 if ((is_set && map[start.byte] == UCHAR_ALL_ONES) 276 || (!is_set && map[start.byte] == 0)) { 277 count += NBBY; 278 start.byte++; 279 n_bytes--; 280 } 281 /* we found bits that were different, find the first one */ 282 if (n_bytes) { 283 for (i = 0; i < NBBY; i++) { 284 boolean_t this_is_set; 285 286 this_is_set = (map[start.byte] & bit(i)) ? TRUE : FALSE; 287 if (this_is_set != is_set) { 288 break; 289 } 290 count++; 291 } 292 goto done; 293 } 294 } 295 } 296 297 end: 298 for (i = start.bit; i < (int)end.bit; i++) { 299 boolean_t this_is_set = (map[start.byte] & bit(i)) ? TRUE : FALSE; 300 301 if (this_is_set != is_set) { 302 break; 303 } 304 count++; 305 } 306 307 done: 308 *ret_is_set = is_set; 309 return (count); 310} 311 312static __inline__ band_number_t 313shadow_map_block_to_band(shadow_map_t * map, uint32_t block) 314{ 315 return (block / map->blocks_per_band); 316} 317 318/* 319 * Function: shadow_map_mapped_band 320 * Purpose: 321 * Return the mapped band for the given band. 322 * If map_it is FALSE, and the band is not mapped, return FALSE. 323 * If map_it is TRUE, then this function will always return TRUE. 324 */ 325static boolean_t 326shadow_map_mapped_band(shadow_map_t * map, band_number_t band, 327 boolean_t map_it, band_number_t * mapped_band) 328{ 329 boolean_t is_mapped = FALSE; 330 331 if (band == map->zeroth_band) { 332 *mapped_band = BAND_ZERO; 333 is_mapped = TRUE; 334 } 335 else { 336 *mapped_band = map->bands[band]; 337 if (*mapped_band == BAND_ZERO) { 338 if (map_it) { 339 /* grow the file */ 340 if (map->next_band == 0) { 341 /* remember the zero'th band */ 342 map->zeroth_band = band; 343 } 344 *mapped_band = map->bands[band] = map->next_band++; 345 is_mapped = TRUE; 346 } 347 } 348 else { 349 is_mapped = TRUE; 350 } 351 } 352 return (is_mapped); 353} 354 355/* 356 * Function: shadow_map_contiguous 357 * 358 * Purpose: 359 * Return the first offset within the range position..(position + count) 360 * that is not a contiguous mapped band. 361 * 362 * If called with is_write = TRUE, this function will map bands as it goes. 363 */ 364static uint32_t 365shadow_map_contiguous(shadow_map_t * map, uint32_t start_block, 366 uint32_t num_blocks, boolean_t is_write) 367{ 368 band_number_t band = shadow_map_block_to_band(map, start_block); 369 uint32_t end_block = start_block + num_blocks; 370 boolean_t is_mapped; 371 band_number_t mapped_band; 372 uint32_t ret_end_block = end_block; 373 uint32_t p; 374 375 is_mapped = shadow_map_mapped_band(map, band, is_write, &mapped_band); 376 if (is_write == FALSE && is_mapped == FALSE) { 377 static int happened = 0; 378 /* this can't happen */ 379 if (happened == 0) { 380 printf("shadow_map_contiguous: this can't happen!\n"); 381 happened = 1; 382 } 383 return (start_block); 384 } 385 for (p = my_trunc(start_block + map->blocks_per_band, 386 map->blocks_per_band); 387 p < end_block; p += map->blocks_per_band) { 388 band_number_t next_mapped_band; 389 390 band++; 391 is_mapped = shadow_map_mapped_band(map, band, is_write, 392 &next_mapped_band); 393 if (is_write == FALSE && is_mapped == FALSE) { 394 return (p); 395 } 396 if ((mapped_band + 1) != next_mapped_band) { 397 /* not contiguous */ 398 ret_end_block = p; 399 break; 400 } 401 mapped_band = next_mapped_band; 402 } 403 return (ret_end_block); 404} 405 406 407/* 408 * Function: block_bitmap_size 409 * Purpose: 410 * The number of bytes required in a block bitmap to represent a file of size 411 * file_size. 412 * 413 * The bytes required is the number of blocks in the file, 414 * divided by the number of bits per byte. 415 * Note: 416 * An 8GB file requires (assuming 512 byte block): 417 * 2^33 / 2^9 / 2^3 = 2^21 = 2MB 418 * of bitmap space. This is a non-trival amount of memory, 419 * particularly since most of the bits will be zero. 420 * A sparse bitmap would really help in this case. 421 */ 422static __inline__ uint32_t 423block_bitmap_size(off_t file_size, uint32_t block_size) 424{ 425 off_t blocks = howmany(file_size, block_size); 426 return (howmany(blocks, NBBY)); 427} 428 429/* 430 * Function: shadow_map_read 431 * 432 * Purpose: 433 * Calculate the block offset within the shadow to read, and the number 434 * blocks to read. The input values (block_offset, block_count) refer 435 * to the original file. 436 * 437 * The output values (*incr_block_offset, *incr_block_count) refer to the 438 * shadow file if the return value is TRUE. They refer to the original 439 * file if the return value is FALSE. 440 441 * Blocks within a band may or may not have been written, in addition, 442 * Bands are not necessarily contiguous, therefore: 443 * *incr_block_count <= block_count 444 * The caller must be prepared to call this function interatively 445 * to complete the whole i/o. 446 * Returns: 447 * TRUE if the shadow file should be read, FALSE if the original file 448 * should be read. 449 */ 450boolean_t 451shadow_map_read(shadow_map_t * map, uint32_t block_offset, uint32_t block_count, 452 uint32_t * incr_block_offset, uint32_t * incr_block_count) 453{ 454 boolean_t written = FALSE; 455 uint32_t n_blocks; 456 457 if (block_offset >= map->file_size_blocks 458 || (block_offset + block_count) > map->file_size_blocks) { 459 printf("shadow_map_read: request (%d, %d) exceeds file size %d\n", 460 block_offset, block_count, map->file_size_blocks); 461 *incr_block_count = 0; 462 } 463 n_blocks = bitmap_get(map->block_bitmap, block_offset, block_count, 464 &written); 465 if (written == FALSE) { 466 *incr_block_count = n_blocks; 467 *incr_block_offset = block_offset; 468 } 469 else { /* start has been written, and therefore mapped */ 470 band_number_t mapped_band; 471 uint32_t band_limit; 472 473 mapped_band = map->bands[shadow_map_block_to_band(map, block_offset)]; 474 *incr_block_offset = mapped_band * map->blocks_per_band 475 + (block_offset % map->blocks_per_band); 476 band_limit 477 = shadow_map_contiguous(map, block_offset, block_count, FALSE); 478 *incr_block_count = band_limit - block_offset; 479 if (*incr_block_count > n_blocks) { 480 *incr_block_count = n_blocks; 481 } 482 } 483 return (written); 484} 485 486/* 487 * Function: shadow_map_write 488 * 489 * Purpose: 490 * Calculate the block offset within the shadow to write, and the number 491 * blocks to write. The input values (block_offset, block_count) refer 492 * to the original file. The output values 493 * (*incr_block_offset, *incr_block_count) refer to the shadow file. 494 * 495 * Bands are not necessarily contiguous, therefore: 496 * *incr_block_count <= block_count 497 * The caller must be prepared to call this function interatively 498 * to complete the whole i/o. 499 * Returns: 500 * TRUE if the shadow file was grown, FALSE otherwise. 501 */ 502boolean_t 503shadow_map_write(shadow_map_t * map, uint32_t block_offset, 504 uint32_t block_count, uint32_t * incr_block_offset, 505 uint32_t * incr_block_count) 506{ 507 uint32_t band_limit; 508 band_number_t mapped_band; 509 boolean_t shadow_grew = FALSE; 510 511 if (block_offset >= map->file_size_blocks 512 || (block_offset + block_count) > map->file_size_blocks) { 513 printf("shadow_map_write: request (%d, %d) exceeds file size %d\n", 514 block_offset, block_count, map->file_size_blocks); 515 *incr_block_count = 0; 516 } 517 518 band_limit = shadow_map_contiguous(map, block_offset, block_count, TRUE); 519 mapped_band = map->bands[shadow_map_block_to_band(map, block_offset)]; 520 *incr_block_offset = mapped_band * map->blocks_per_band 521 + (block_offset % map->blocks_per_band); 522 *incr_block_count = band_limit - block_offset; 523 524 /* mark these blocks as written */ 525 bitmap_set(map->block_bitmap, block_offset, *incr_block_count); 526 527 if (map->next_band > map->shadow_size_bands) { 528 map->shadow_size_bands = map->next_band; 529 shadow_grew = TRUE; 530 } 531 return (shadow_grew); 532} 533 534boolean_t 535shadow_map_is_written(shadow_map_t * map, uint32_t block_offset) 536{ 537 bitmap_offset_t b; 538 539 b = bitmap_offset(block_offset); 540 return ((map->block_bitmap[b.byte] & bit(b.bit)) ? TRUE : FALSE); 541} 542 543/* 544 * Function: shadow_map_shadow_size 545 * 546 * Purpose: 547 * To return the size of the shadow file in blocks. 548 */ 549uint32_t 550shadow_map_shadow_size(shadow_map_t * map) 551{ 552 return (map->shadow_size_bands * map->blocks_per_band); 553} 554 555/* 556 * Function: shadow_map_create 557 * 558 * Purpose: 559 * Allocate the dynamic data for keeping track of the shadow dirty blocks 560 * and the band mapping table. 561 * Returns: 562 * NULL if an error occurred. 563 */ 564shadow_map_t * 565shadow_map_create(off_t file_size, off_t shadow_size, 566 uint32_t band_size, uint32_t block_size) 567{ 568 void * block_bitmap = NULL; 569 uint32_t bitmap_size; 570 band_number_t * bands = NULL; 571 shadow_map_t * map; 572 uint32_t n_bands = 0; 573 574 if (band_size == 0) { 575 band_size = BAND_SIZE_DEFAULT; 576 } 577 578 n_bands = howmany(file_size, band_size); 579 if (n_bands > (BAND_MAX + 1)) { 580 printf("file is too big: %d > %d\n", 581 n_bands, BAND_MAX); 582 goto failure; 583 } 584 585 /* create a block bitmap, one bit per block */ 586 bitmap_size = block_bitmap_size(file_size, block_size); 587 block_bitmap = my_malloc(bitmap_size); 588 if (block_bitmap == NULL) { 589 printf("failed to allocate bitmap\n"); 590 goto failure; 591 } 592 bzero(block_bitmap, bitmap_size); 593 594 /* get the band map */ 595 bands = (band_number_t *)my_malloc(n_bands * sizeof(band_number_t)); 596 if (bands == NULL) { 597 printf("failed to allocate bands\n"); 598 goto failure; 599 } 600 bzero(bands, n_bands * sizeof(band_number_t)); 601 602 map = my_malloc(sizeof(*map)); 603 if (map == NULL) { 604 printf("failed to allocate map\n"); 605 goto failure; 606 } 607 map->blocks_per_band = band_size / block_size; 608 map->block_bitmap = block_bitmap; 609 map->bands = bands; 610 map->file_size_blocks = n_bands * map->blocks_per_band; 611 map->next_band = 0; 612 map->zeroth_band = -1; 613 map->shadow_size_bands = howmany(shadow_size, band_size); 614 map->block_size = block_size; 615 return (map); 616 617 failure: 618 if (block_bitmap) 619 my_free(block_bitmap); 620 if (bands) 621 my_free(bands); 622 return (NULL); 623} 624 625/* 626 * Function: shadow_map_free 627 * Purpose: 628 * Frees the data structure to deal with the shadow map. 629 */ 630void 631shadow_map_free(shadow_map_t * map) 632{ 633 if (map->block_bitmap) 634 my_free(map->block_bitmap); 635 if (map->bands) 636 my_free(map->bands); 637 map->block_bitmap = NULL; 638 map->bands = NULL; 639 my_free(map); 640 return; 641} 642 643#ifdef TEST_SHADOW 644#define BAND_SIZE_BLOCKS (BAND_SIZE_DEFAULT / 512) 645 646enum { 647 ReadRequest, 648 WriteRequest, 649}; 650 651typedef struct { 652 int type; 653 uint32_t offset; 654 uint32_t count; 655} block_request_t; 656 657int 658main() 659{ 660 shadow_map_t * map; 661 int i; 662 block_request_t requests[] = { 663 { WriteRequest, BAND_SIZE_BLOCKS * 2, 1 }, 664 { ReadRequest, BAND_SIZE_BLOCKS / 2, BAND_SIZE_BLOCKS * 2 - 2 }, 665 { WriteRequest, BAND_SIZE_BLOCKS * 1, 5 * BAND_SIZE_BLOCKS + 3}, 666 { ReadRequest, 0, BAND_SIZE_BLOCKS * 10 }, 667 { WriteRequest, BAND_SIZE_BLOCKS * (BAND_MAX - 1), 668 BAND_SIZE_BLOCKS * 2}, 669 { 0, 0 }, 670 }; 671 672 map = shadow_map_create(1024 * 1024 * 1024 * 8ULL, 0, 0, 512); 673 if (map == NULL) { 674 printf("shadow_map_create failed\n"); 675 exit(1); 676 } 677 for (i = 0; TRUE; i++) { 678 uint32_t offset; 679 uint32_t resid; 680 boolean_t shadow_grew; 681 boolean_t read_shadow; 682 683 if (requests[i].count == 0) { 684 break; 685 } 686 offset = requests[i].offset; 687 resid = requests[i].count; 688 printf("\n%s REQUEST (%ld, %ld)\n", 689 requests[i].type == WriteRequest ? "WRITE" : "READ", 690 offset, resid); 691 switch (requests[i].type) { 692 case WriteRequest: 693 while (resid > 0) { 694 uint32_t this_offset; 695 uint32_t this_count; 696 697 shadow_grew = shadow_map_write(map, offset, 698 resid, 699 &this_offset, 700 &this_count); 701 printf("\t(%ld, %ld) => (%ld, %ld)", 702 offset, resid, this_offset, this_count); 703 resid -= this_count; 704 offset += this_count; 705 if (shadow_grew) { 706 printf(" shadow grew to %ld", shadow_map_shadow_size(map)); 707 } 708 printf("\n"); 709 } 710 break; 711 case ReadRequest: 712 while (resid > 0) { 713 uint32_t this_offset; 714 uint32_t this_count; 715 716 read_shadow = shadow_map_read(map, offset, 717 resid, 718 &this_offset, 719 &this_count); 720 printf("\t(%ld, %ld) => (%ld, %ld)%s\n", 721 offset, resid, this_offset, this_count, 722 read_shadow ? " from shadow" : ""); 723 if (this_count == 0) { 724 printf("this_count is 0, aborting\n"); 725 break; 726 } 727 resid -= this_count; 728 offset += this_count; 729 } 730 break; 731 default: 732 break; 733 } 734 } 735 if (map) { 736 shadow_map_free(map); 737 } 738 exit(0); 739 return (0); 740} 741#endif 742