// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright 2023 Red Hat
 */
#include "volume-index.h"

#include <linux/bitops.h>
#include <linux/bits.h>
#include <linux/cache.h>
#include <linux/compiler.h>
#include <linux/log2.h>

#include "errors.h"
#include "logger.h"
#include "memory-alloc.h"
#include "numeric.h"
#include "permassert.h"
#include "thread-utils.h"

#include "config.h"
#include "geometry.h"
#include "hash-utils.h"
#include "indexer.h"

/*
 * The volume index is a combination of two separate subindexes, one containing sparse hook entries
 * (retained for all chapters), and one containing the remaining entries (retained only for the
 * dense chapters). If there are no sparse chapters, only the non-hook subindex is used, and it
 * will contain all records for all chapters.
 *
 * The volume index is also divided into zones, with one thread operating on each zone. Each
 * incoming request is dispatched to the appropriate thread, and then to the appropriate subindex.
 * Each delta list is handled by a single zone. To ensure that the distribution of delta lists to
 * zones doesn't underflow (leaving some zone with no delta lists), the minimum number of delta
 * lists must be the square of the maximum zone count for both subindexes.
 *
 * Each subindex zone is a delta index where the payload is a chapter number. The volume index can
 * compute the delta list number, address, and zone number from the record name in order to
 * dispatch record handling to the correct structures.
 *
 * Most operations that use all the zones take place either before request processing is allowed,
 * or after all requests have been flushed in order to shut down. The only multi-threaded operation
 * supported during normal operation is the uds_lookup_volume_index_name() method, used to
 * determine whether a new chapter should be loaded into the sparse index cache. This operation
 * only uses the sparse hook subindex, and the zone mutexes are used to make this operation safe.
 *
 * There are three ways of expressing chapter numbers in the volume index: virtual, index, and
 * rolling. The interface to the volume index uses virtual chapter numbers, which are 64 bits long.
 * Internally the subindex stores only the minimal number of bits necessary by masking away the
 * high-order bits. When the index needs to deal with ordering of index chapter numbers, as when
 * flushing entries from older chapters, it rolls the index chapter number around so that the
 * smallest one in use is mapped to 0. See convert_index_to_virtual() or flush_invalid_entries()
 * for an example of this technique.
 *
 * For efficiency, when older chapter numbers become invalid, the index does not immediately remove
 * the invalidated entries. Instead it lazily removes them from a given delta list the next time it
 * walks that list during normal operation. Because of this, the index size must be increased
 * somewhat to accommodate all the invalid entries that have not yet been removed. For the standard
 * index sizes, this requires about 4 chapters of old entries per 1024 chapters of valid entries in
 * the index.
 */
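
/*
 * Illustrative sketch, not part of the driver: the rolling-chapter technique described
 * above. Only the low-order chapter bits of a 64-bit virtual chapter number are stored;
 * the full value is recovered from the stored bits by measuring their distance, modulo
 * the chapter mask, from the oldest chapter still indexed. The function name is local to
 * this sketch; convert_index_to_virtual() below applies the same computation per zone.
 */
static inline u64 example_unroll_chapter(u64 oldest_virtual_chapter, u32 chapter_mask,
					 u32 stored_index_chapter)
{
	/* Distance of the stored chapter from the oldest one, within the mask span. */
	u64 rolling_chapter = (stored_index_chapter - oldest_virtual_chapter) & chapter_mask;

	/* Adding that distance back yields the full 64-bit virtual chapter number. */
	return oldest_virtual_chapter + rolling_chapter;
}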

struct sub_index_parameters {
	/* The number of bits in address mask */
	u8 address_bits;
	/* The number of bits in chapter number */
	u8 chapter_bits;
	/* The mean delta */
	u32 mean_delta;
	/* The number of delta lists */
	u64 list_count;
	/* The number of chapters used */
	u32 chapter_count;
	/* The number of bits per chapter */
	size_t chapter_size_in_bits;
	/* The number of bytes of delta list memory */
	size_t memory_size;
	/* The number of bytes the index should keep free at all times */
	size_t target_free_bytes;
};

struct split_config {
	/* The hook subindex configuration */
	struct uds_configuration hook_config;
	struct index_geometry hook_geometry;

	/* The non-hook subindex configuration */
	struct uds_configuration non_hook_config;
	struct index_geometry non_hook_geometry;
};

struct chapter_range {
	u32 chapter_start;
	u32 chapter_count;
};

#define MAGIC_SIZE 8

static const char MAGIC_START_5[] = "MI5-0005";

struct sub_index_data {
	char magic[MAGIC_SIZE]; /* MAGIC_START_5 */
	u64 volume_nonce;
	u64 virtual_chapter_low;
	u64 virtual_chapter_high;
	u32 first_list;
	u32 list_count;
};

static const char MAGIC_START_6[] = "MI6-0001";

struct volume_index_data {
	char magic[MAGIC_SIZE]; /* MAGIC_START_6 */
	u32 sparse_sample_rate;
};

static inline u32 extract_address(const struct volume_sub_index *sub_index,
				  const struct uds_record_name *name)
{
	return uds_extract_volume_index_bytes(name) & sub_index->address_mask;
}

static inline u32 extract_dlist_num(const struct volume_sub_index *sub_index,
				    const struct uds_record_name *name)
{
	u64 bits = uds_extract_volume_index_bytes(name);

	return (bits >> sub_index->address_bits) % sub_index->list_count;
}

static inline const struct volume_sub_index_zone *
get_zone_for_record(const struct volume_index_record *record)
{
	return &record->sub_index->zones[record->zone_number];
}

static inline u64 convert_index_to_virtual(const struct volume_index_record *record,
					   u32 index_chapter)
{
	const struct volume_sub_index_zone *volume_index_zone = get_zone_for_record(record);
	u32 rolling_chapter = ((index_chapter - volume_index_zone->virtual_chapter_low) &
			       record->sub_index->chapter_mask);

	return volume_index_zone->virtual_chapter_low + rolling_chapter;
}

static inline u32 convert_virtual_to_index(const struct volume_sub_index *sub_index,
					   u64 virtual_chapter)
{
	return virtual_chapter & sub_index->chapter_mask;
}

static inline bool is_virtual_chapter_indexed(const struct volume_index_record *record,
					      u64 virtual_chapter)
{
	const struct volume_sub_index_zone *volume_index_zone = get_zone_for_record(record);

	return ((virtual_chapter >= volume_index_zone->virtual_chapter_low) &&
		(virtual_chapter <= volume_index_zone->virtual_chapter_high));
}

static inline bool has_sparse(const struct volume_index *volume_index)
{
	return volume_index->sparse_sample_rate > 0;
}

bool uds_is_volume_index_sample(const struct volume_index *volume_index,
				const struct uds_record_name *name)
{
	if (!has_sparse(volume_index))
		return false;

	return (uds_extract_sampling_bytes(name) % volume_index->sparse_sample_rate) == 0;
}

static inline const struct volume_sub_index *
get_volume_sub_index(const struct volume_index *volume_index,
		     const struct uds_record_name *name)
{
	return (uds_is_volume_index_sample(volume_index, name) ?
		&volume_index->vi_hook : &volume_index->vi_non_hook);
}

static unsigned int get_volume_sub_index_zone(const struct volume_sub_index *sub_index,
					      const struct uds_record_name *name)
{
	return extract_dlist_num(sub_index, name) / sub_index->delta_index.lists_per_zone;
}

unsigned int uds_get_volume_index_zone(const struct volume_index *volume_index,
				       const struct uds_record_name *name)
{
	return get_volume_sub_index_zone(get_volume_sub_index(volume_index, name), name);
}
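
/*
 * Illustrative sketch, not part of the driver: the complete routing of one record name
 * using the helpers above. The sampling bytes choose the subindex, while the volume
 * index bytes split into a delta list number (which also determines the zone) and an
 * address within that list. The struct and function names are local to this sketch.
 */
struct example_route {
	bool is_hook;
	unsigned int zone;
	u32 list_number;
	u32 address;
};

static inline struct example_route example_route_name(const struct volume_index *volume_index,
						      const struct uds_record_name *name)
{
	const struct volume_sub_index *sub_index = get_volume_sub_index(volume_index, name);

	return (struct example_route) {
		.is_hook = uds_is_volume_index_sample(volume_index, name),
		.zone = get_volume_sub_index_zone(sub_index, name),
		.list_number = extract_dlist_num(sub_index, name),
		.address = extract_address(sub_index, name),
	};
}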

#define DELTA_LIST_SIZE 256

static int compute_volume_sub_index_parameters(const struct uds_configuration *config,
					       struct sub_index_parameters *params)
{
	u64 entries_in_volume_index, address_span;
	u32 chapters_in_volume_index, invalid_chapters;
	u32 rounded_chapters;
	u64 delta_list_records;
	u32 address_count;
	u64 index_size_in_bits;
	size_t expected_index_size;
	u64 min_delta_lists = MAX_ZONES * MAX_ZONES;
	struct index_geometry *geometry = config->geometry;
	u64 records_per_chapter = geometry->records_per_chapter;

	params->chapter_count = geometry->chapters_per_volume;
	/*
	 * Make sure that the number of delta list records in the volume index does not change when
	 * the volume is reduced by one chapter. This preserves the mapping from name to volume
	 * index delta list.
	 */
	rounded_chapters = params->chapter_count;
	if (uds_is_reduced_index_geometry(geometry))
		rounded_chapters += 1;
	delta_list_records = records_per_chapter * rounded_chapters;
	address_count = config->volume_index_mean_delta * DELTA_LIST_SIZE;
	params->list_count = max(delta_list_records / DELTA_LIST_SIZE, min_delta_lists);
	params->address_bits = bits_per(address_count - 1);
	params->chapter_bits = bits_per(rounded_chapters - 1);
	if ((u32) params->list_count != params->list_count) {
		return vdo_log_warning_strerror(UDS_INVALID_ARGUMENT,
						"cannot initialize volume index with %llu delta lists",
						(unsigned long long) params->list_count);
	}

	if (params->address_bits > 31) {
		return vdo_log_warning_strerror(UDS_INVALID_ARGUMENT,
						"cannot initialize volume index with %u address bits",
						params->address_bits);
	}

	/*
	 * The probability that a given delta list is not touched during the writing of an entire
	 * chapter is:
	 *
	 * double p_not_touched = pow((double) (params->list_count - 1) / params->list_count,
	 *			      records_per_chapter);
	 *
	 * For the standard index sizes, about 78% of the delta lists are not touched, and
	 * therefore contain old index entries that have not been eliminated by the lazy LRU
	 * processing. Then the number of old index entries that accumulate over the entire index,
	 * in terms of full chapters worth of entries, is:
	 *
	 * double invalid_chapters = p_not_touched / (1.0 - p_not_touched);
	 *
	 * For the standard index sizes, the index needs about 3.5 chapters of space for the old
	 * entries in a 1024 chapter index, so round this up to use 4 chapters per 1024 chapters in
	 * the index.
	 */
	invalid_chapters = max(rounded_chapters / 256, 2U);
	chapters_in_volume_index = rounded_chapters + invalid_chapters;
	entries_in_volume_index = records_per_chapter * chapters_in_volume_index;

	address_span = params->list_count << params->address_bits;
	params->mean_delta = address_span / entries_in_volume_index;

	/*
	 * Compute the expected size of a full index, then set the total memory to be 6% larger
	 * than that expected size. This number should be large enough that there are not many
	 * rebalances when the index is full.
	 */
	params->chapter_size_in_bits = uds_compute_delta_index_size(records_per_chapter,
								    params->mean_delta,
								    params->chapter_bits);
	index_size_in_bits = params->chapter_size_in_bits * chapters_in_volume_index;
	expected_index_size = index_size_in_bits / BITS_PER_BYTE;
	params->memory_size = expected_index_size * 106 / 100;

	params->target_free_bytes = expected_index_size / 20;
	return UDS_SUCCESS;
}
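
/*
 * Illustrative worked example, not part of the driver: the stale-entry allowance
 * computed above. For a hypothetical geometry with 1024 chapters, the index provisions
 * max(1024 / 256, 2) = 4 extra chapters of space, so the delta lists are sized for 1028
 * chapters' worth of entries. The function name is local to this sketch.
 */
static inline u32 example_chapters_to_provision(u32 rounded_chapters)
{
	/* About 4 chapters of unflushed entries accumulate per 1024 valid chapters. */
	u32 invalid_chapters = max(rounded_chapters / 256, 2U);

	return rounded_chapters + invalid_chapters;
}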

static void uninitialize_volume_sub_index(struct volume_sub_index *sub_index)
{
	vdo_free(vdo_forget(sub_index->flush_chapters));
	vdo_free(vdo_forget(sub_index->zones));
	uds_uninitialize_delta_index(&sub_index->delta_index);
}

void uds_free_volume_index(struct volume_index *volume_index)
{
	if (volume_index == NULL)
		return;

	if (volume_index->zones != NULL)
		vdo_free(vdo_forget(volume_index->zones));

	uninitialize_volume_sub_index(&volume_index->vi_non_hook);
	uninitialize_volume_sub_index(&volume_index->vi_hook);
	vdo_free(volume_index);
}

static int compute_volume_sub_index_save_bytes(const struct uds_configuration *config,
					       size_t *bytes)
{
	struct sub_index_parameters params = { .address_bits = 0 };
	int result;

	result = compute_volume_sub_index_parameters(config, &params);
	if (result != UDS_SUCCESS)
		return result;

	*bytes = (sizeof(struct sub_index_data) + params.list_count * sizeof(u64) +
		  uds_compute_delta_index_save_bytes(params.list_count,
						     params.memory_size));
	return UDS_SUCCESS;
}

/* This function is only useful if the configuration includes sparse chapters. */
static void split_configuration(const struct uds_configuration *config,
				struct split_config *split)
{
	u64 sample_rate, sample_records;
	u64 dense_chapters, sparse_chapters;

	/* Start with copies of the base configuration. */
	split->hook_config = *config;
	split->hook_geometry = *config->geometry;
	split->hook_config.geometry = &split->hook_geometry;
	split->non_hook_config = *config;
	split->non_hook_geometry = *config->geometry;
	split->non_hook_config.geometry = &split->non_hook_geometry;

	sample_rate = config->sparse_sample_rate;
	sparse_chapters = config->geometry->sparse_chapters_per_volume;
	dense_chapters = config->geometry->chapters_per_volume - sparse_chapters;
	sample_records = config->geometry->records_per_chapter / sample_rate;

	/* Adjust the number of records indexed for each chapter. */
	split->hook_geometry.records_per_chapter = sample_records;
	split->non_hook_geometry.records_per_chapter -= sample_records;

	/* Adjust the number of chapters indexed. */
	split->hook_geometry.sparse_chapters_per_volume = 0;
	split->non_hook_geometry.sparse_chapters_per_volume = 0;
	split->non_hook_geometry.chapters_per_volume = dense_chapters;
}
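
/*
 * Illustrative worked example, not part of the driver: the record split performed by
 * split_configuration() above, for a hypothetical geometry with 65536 records per
 * chapter and a sparse sample rate of 32. The hook subindex is sized for 65536 / 32 =
 * 2048 records per chapter across all chapters, and the non-hook subindex gets the
 * remaining 63488 records per chapter across the dense chapters only. The function name
 * is local to this sketch.
 */
static inline void example_split_records(u64 records_per_chapter, u64 sample_rate,
					 u64 *hook_records, u64 *non_hook_records)
{
	*hook_records = records_per_chapter / sample_rate;
	*non_hook_records = records_per_chapter - *hook_records;
}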

static int compute_volume_index_save_bytes(const struct uds_configuration *config,
					   size_t *bytes)
{
	size_t hook_bytes, non_hook_bytes;
	struct split_config split;
	int result;

	if (!uds_is_sparse_index_geometry(config->geometry))
		return compute_volume_sub_index_save_bytes(config, bytes);

	split_configuration(config, &split);
	result = compute_volume_sub_index_save_bytes(&split.hook_config, &hook_bytes);
	if (result != UDS_SUCCESS)
		return result;

	result = compute_volume_sub_index_save_bytes(&split.non_hook_config,
						     &non_hook_bytes);
	if (result != UDS_SUCCESS)
		return result;

	*bytes = sizeof(struct volume_index_data) + hook_bytes + non_hook_bytes;
	return UDS_SUCCESS;
}

int uds_compute_volume_index_save_blocks(const struct uds_configuration *config,
					 size_t block_size, u64 *block_count)
{
	size_t bytes;
	int result;

	result = compute_volume_index_save_bytes(config, &bytes);
	if (result != UDS_SUCCESS)
		return result;

	bytes += sizeof(struct delta_list_save_info);
	*block_count = DIV_ROUND_UP(bytes, block_size) + MAX_ZONES;
	return UDS_SUCCESS;
}

/* Flush invalid entries while walking the delta list. */
static inline int flush_invalid_entries(struct volume_index_record *record,
					struct chapter_range *flush_range,
					u32 *next_chapter_to_invalidate)
{
	int result;

	result = uds_next_delta_index_entry(&record->delta_entry);
	if (result != UDS_SUCCESS)
		return result;

	while (!record->delta_entry.at_end) {
		u32 index_chapter = uds_get_delta_entry_value(&record->delta_entry);
		u32 relative_chapter = ((index_chapter - flush_range->chapter_start) &
					record->sub_index->chapter_mask);

		if (likely(relative_chapter >= flush_range->chapter_count)) {
			if (relative_chapter < *next_chapter_to_invalidate)
				*next_chapter_to_invalidate = relative_chapter;
			break;
		}

		result = uds_remove_delta_index_entry(&record->delta_entry);
		if (result != UDS_SUCCESS)
			return result;
	}

	return UDS_SUCCESS;
}
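
/*
 * Illustrative sketch, not part of the driver: the staleness test that
 * flush_invalid_entries() above applies to each entry it walks past. A stored chapter is
 * stale when its rolling distance from the start of the flush range is less than the
 * range length; such entries are removed lazily rather than when their chapter expired.
 * The function name is local to this sketch.
 */
static inline bool example_entry_is_stale(u32 index_chapter, u32 chapter_mask,
					  const struct chapter_range *flush_range)
{
	u32 relative_chapter = (index_chapter - flush_range->chapter_start) & chapter_mask;

	return relative_chapter < flush_range->chapter_count;
}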

/* Find the matching record, or the list offset where the record would go. */
static int get_volume_index_entry(struct volume_index_record *record, u32 list_number,
				  u32 key, struct chapter_range *flush_range)
{
	struct volume_index_record other_record;
	const struct volume_sub_index *sub_index = record->sub_index;
	u32 next_chapter_to_invalidate = sub_index->chapter_mask;
	int result;

	result = uds_start_delta_index_search(&sub_index->delta_index, list_number, 0,
					      &record->delta_entry);
	if (result != UDS_SUCCESS)
		return result;

	do {
		result = flush_invalid_entries(record, flush_range,
					       &next_chapter_to_invalidate);
		if (result != UDS_SUCCESS)
			return result;
	} while (!record->delta_entry.at_end && (key > record->delta_entry.key));

	result = uds_remember_delta_index_offset(&record->delta_entry);
	if (result != UDS_SUCCESS)
		return result;

	/* Check any collision records for a more precise match. */
	other_record = *record;
	if (!other_record.delta_entry.at_end && (key == other_record.delta_entry.key)) {
		for (;;) {
			u8 collision_name[UDS_RECORD_NAME_SIZE];

			result = flush_invalid_entries(&other_record, flush_range,
						       &next_chapter_to_invalidate);
			if (result != UDS_SUCCESS)
				return result;

			if (other_record.delta_entry.at_end ||
			    !other_record.delta_entry.is_collision)
				break;

			result = uds_get_delta_entry_collision(&other_record.delta_entry,
							       collision_name);
			if (result != UDS_SUCCESS)
				return result;

			if (memcmp(collision_name, record->name, UDS_RECORD_NAME_SIZE) == 0) {
				*record = other_record;
				break;
			}
		}
	}

	while (!other_record.delta_entry.at_end) {
		result = flush_invalid_entries(&other_record, flush_range,
					       &next_chapter_to_invalidate);
		if (result != UDS_SUCCESS)
			return result;
	}

	next_chapter_to_invalidate += flush_range->chapter_start;
	next_chapter_to_invalidate &= sub_index->chapter_mask;
	flush_range->chapter_start = next_chapter_to_invalidate;
	flush_range->chapter_count = 0;
	return UDS_SUCCESS;
}

static int get_volume_sub_index_record(struct volume_sub_index *sub_index,
				       const struct uds_record_name *name,
				       struct volume_index_record *record)
{
	int result;
	const struct volume_sub_index_zone *volume_index_zone;
	u32 address = extract_address(sub_index, name);
	u32 delta_list_number = extract_dlist_num(sub_index, name);
	u64 flush_chapter = sub_index->flush_chapters[delta_list_number];

	record->sub_index = sub_index;
	record->mutex = NULL;
	record->name = name;
	record->zone_number = delta_list_number / sub_index->delta_index.lists_per_zone;
	volume_index_zone = get_zone_for_record(record);

	if (flush_chapter < volume_index_zone->virtual_chapter_low) {
		struct chapter_range range;
		u64 flush_count = volume_index_zone->virtual_chapter_low - flush_chapter;

		range.chapter_start = convert_virtual_to_index(sub_index, flush_chapter);
		range.chapter_count = (flush_count > sub_index->chapter_mask ?
				       sub_index->chapter_mask + 1 : flush_count);
		result = get_volume_index_entry(record, delta_list_number, address,
						&range);
		flush_chapter = convert_index_to_virtual(record, range.chapter_start);
		if (flush_chapter > volume_index_zone->virtual_chapter_high)
			flush_chapter = volume_index_zone->virtual_chapter_high;
		sub_index->flush_chapters[delta_list_number] = flush_chapter;
	} else {
		result = uds_get_delta_index_entry(&sub_index->delta_index,
						   delta_list_number, address,
						   name->name, &record->delta_entry);
	}

	if (result != UDS_SUCCESS)
		return result;

	record->is_found =
		(!record->delta_entry.at_end && (record->delta_entry.key == address));
	if (record->is_found) {
		u32 index_chapter = uds_get_delta_entry_value(&record->delta_entry);

		record->virtual_chapter = convert_index_to_virtual(record, index_chapter);
	}

	record->is_collision = record->delta_entry.is_collision;
	return UDS_SUCCESS;
}

int uds_get_volume_index_record(struct volume_index *volume_index,
				const struct uds_record_name *name,
				struct volume_index_record *record)
{
	int result;

	if (uds_is_volume_index_sample(volume_index, name)) {
		/*
		 * Other threads cannot be allowed to call uds_lookup_volume_index_name() while
		 * this thread is finding the volume index record. Due to the lazy LRU flushing of
		 * the volume index, uds_get_volume_index_record() is not a read-only operation.
		 */
		unsigned int zone =
			get_volume_sub_index_zone(&volume_index->vi_hook, name);
		struct mutex *mutex = &volume_index->zones[zone].hook_mutex;

		mutex_lock(mutex);
		result = get_volume_sub_index_record(&volume_index->vi_hook, name,
						     record);
		mutex_unlock(mutex);
		/* Remember the mutex so that other operations on the index record can use it. */
		record->mutex = mutex;
	} else {
		result = get_volume_sub_index_record(&volume_index->vi_non_hook, name,
						     record);
	}

	return result;
}

int uds_put_volume_index_record(struct volume_index_record *record, u64 virtual_chapter)
{
	int result;
	u32 address;
	const struct volume_sub_index *sub_index = record->sub_index;

	if (!is_virtual_chapter_indexed(record, virtual_chapter)) {
		u64 low = get_zone_for_record(record)->virtual_chapter_low;
		u64 high = get_zone_for_record(record)->virtual_chapter_high;

		return vdo_log_warning_strerror(UDS_INVALID_ARGUMENT,
						"cannot put record into chapter number %llu that is out of the valid range %llu to %llu",
						(unsigned long long) virtual_chapter,
						(unsigned long long) low,
						(unsigned long long) high);
	}

	address = extract_address(sub_index, record->name);
	if (unlikely(record->mutex != NULL))
		mutex_lock(record->mutex);
	result = uds_put_delta_index_entry(&record->delta_entry, address,
					   convert_virtual_to_index(sub_index,
								    virtual_chapter),
					   record->is_found ? record->name->name : NULL);
	if (unlikely(record->mutex != NULL))
		mutex_unlock(record->mutex);

	switch (result) {
	case UDS_SUCCESS:
		record->virtual_chapter = virtual_chapter;
		record->is_collision = record->delta_entry.is_collision;
		record->is_found = true;
		break;
	case UDS_OVERFLOW:
		vdo_log_ratelimit(vdo_log_warning_strerror, UDS_OVERFLOW,
				  "Volume index entry dropped due to overflow condition");
		uds_log_delta_index_entry(&record->delta_entry);
		break;
	default:
		break;
	}

	return result;
}

int uds_remove_volume_index_record(struct volume_index_record *record)
{
	int result;

	if (!record->is_found)
		return vdo_log_warning_strerror(UDS_BAD_STATE,
						"illegal operation on new record");

	/* Mark the record so that it cannot be used again */
	record->is_found = false;
	if (unlikely(record->mutex != NULL))
		mutex_lock(record->mutex);
	result = uds_remove_delta_index_entry(&record->delta_entry);
	if (unlikely(record->mutex != NULL))
		mutex_unlock(record->mutex);

	return result;
}
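
/*
 * Illustrative sketch, not part of the driver: the typical lookup-then-update sequence
 * built from the record operations above. A caller fetches the record for a name, then
 * either reports the chapter already holding it (a potential deduplication hit) or files
 * the name under the chapter currently being written. The function name and the
 * open_chapter parameter are local to this sketch.
 */
static inline int example_index_name(struct volume_index *volume_index,
				     const struct uds_record_name *name,
				     u64 open_chapter, u64 *found_chapter)
{
	struct volume_index_record record;
	int result;

	result = uds_get_volume_index_record(volume_index, name, &record);
	if (result != UDS_SUCCESS)
		return result;

	if (record.is_found) {
		/* The name is already indexed; report where. */
		*found_chapter = record.virtual_chapter;
		return UDS_SUCCESS;
	}

	/* A new name: add an entry pointing at the open chapter. */
	*found_chapter = NO_CHAPTER;
	return uds_put_volume_index_record(&record, open_chapter);
}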

static void set_volume_sub_index_zone_open_chapter(struct volume_sub_index *sub_index,
						   unsigned int zone_number,
						   u64 virtual_chapter)
{
	u64 used_bits = 0;
	struct volume_sub_index_zone *zone = &sub_index->zones[zone_number];
	struct delta_zone *delta_zone;
	u32 i;

	zone->virtual_chapter_low = (virtual_chapter >= sub_index->chapter_count ?
				     virtual_chapter - sub_index->chapter_count + 1 : 0);
	zone->virtual_chapter_high = virtual_chapter;

	/* Check to see if the new zone data is too large. */
	delta_zone = &sub_index->delta_index.delta_zones[zone_number];
	for (i = 1; i <= delta_zone->list_count; i++)
		used_bits += delta_zone->delta_lists[i].size;

	if (used_bits > sub_index->max_zone_bits) {
		/* Expire enough chapters to free the desired space. */
		u64 expire_count =
			1 + (used_bits - sub_index->max_zone_bits) / sub_index->chapter_zone_bits;

		if (expire_count == 1) {
			vdo_log_ratelimit(vdo_log_info,
					  "zone %u: At chapter %llu, expiring chapter %llu early",
					  zone_number,
					  (unsigned long long) virtual_chapter,
					  (unsigned long long) zone->virtual_chapter_low);
			zone->early_flushes++;
			zone->virtual_chapter_low++;
		} else {
			u64 first_expired = zone->virtual_chapter_low;

			if (first_expired + expire_count < zone->virtual_chapter_high) {
				zone->early_flushes += expire_count;
				zone->virtual_chapter_low += expire_count;
			} else {
				zone->early_flushes +=
					zone->virtual_chapter_high - zone->virtual_chapter_low;
				zone->virtual_chapter_low = zone->virtual_chapter_high;
			}

			vdo_log_ratelimit(vdo_log_info,
					  "zone %u: At chapter %llu, expiring chapters %llu to %llu early",
					  zone_number,
					  (unsigned long long) virtual_chapter,
					  (unsigned long long) first_expired,
					  (unsigned long long) zone->virtual_chapter_low - 1);
		}
	}
}

void uds_set_volume_index_zone_open_chapter(struct volume_index *volume_index,
					    unsigned int zone_number,
					    u64 virtual_chapter)
{
	struct mutex *mutex = &volume_index->zones[zone_number].hook_mutex;

	set_volume_sub_index_zone_open_chapter(&volume_index->vi_non_hook, zone_number,
					       virtual_chapter);

	/*
	 * Other threads cannot be allowed to call uds_lookup_volume_index_name() while the open
	 * chapter number is changing.
	 */
	if (has_sparse(volume_index)) {
		mutex_lock(mutex);
		set_volume_sub_index_zone_open_chapter(&volume_index->vi_hook,
						       zone_number, virtual_chapter);
		mutex_unlock(mutex);
	}
}

/*
 * Set the newest open chapter number for the index, while also advancing the oldest valid chapter
 * number.
 */
void uds_set_volume_index_open_chapter(struct volume_index *volume_index,
				       u64 virtual_chapter)
{
	unsigned int zone;

	for (zone = 0; zone < volume_index->zone_count; zone++)
		uds_set_volume_index_zone_open_chapter(volume_index, zone, virtual_chapter);
}
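
/*
 * Illustrative sketch, not part of the driver: the early-expiry arithmetic used by
 * set_volume_sub_index_zone_open_chapter() above. When a zone's delta lists occupy more
 * bits than the zone's budget allows, enough whole chapters are expired to bring usage
 * back under budget, rounding up. The function name is local to this sketch.
 */
static inline u64 example_chapters_to_expire(u64 used_bits, u64 max_zone_bits,
					     u64 chapter_zone_bits)
{
	if (used_bits <= max_zone_bits)
		return 0;

	/* One chapter, plus one more for each full chapter's worth of excess bits. */
	return 1 + (used_bits - max_zone_bits) / chapter_zone_bits;
}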

int uds_set_volume_index_record_chapter(struct volume_index_record *record,
					u64 virtual_chapter)
{
	const struct volume_sub_index *sub_index = record->sub_index;
	int result;

	if (!record->is_found)
		return vdo_log_warning_strerror(UDS_BAD_STATE,
						"illegal operation on new record");

	if (!is_virtual_chapter_indexed(record, virtual_chapter)) {
		u64 low = get_zone_for_record(record)->virtual_chapter_low;
		u64 high = get_zone_for_record(record)->virtual_chapter_high;

		return vdo_log_warning_strerror(UDS_INVALID_ARGUMENT,
						"cannot set chapter number %llu that is out of the valid range %llu to %llu",
						(unsigned long long) virtual_chapter,
						(unsigned long long) low,
						(unsigned long long) high);
	}

	if (unlikely(record->mutex != NULL))
		mutex_lock(record->mutex);
	result = uds_set_delta_entry_value(&record->delta_entry,
					   convert_virtual_to_index(sub_index,
								    virtual_chapter));
	if (unlikely(record->mutex != NULL))
		mutex_unlock(record->mutex);

	if (result != UDS_SUCCESS)
		return result;

	record->virtual_chapter = virtual_chapter;
	return UDS_SUCCESS;
}

static u64 lookup_volume_sub_index_name(const struct volume_sub_index *sub_index,
					const struct uds_record_name *name)
{
	int result;
	u32 address = extract_address(sub_index, name);
	u32 delta_list_number = extract_dlist_num(sub_index, name);
	unsigned int zone_number = get_volume_sub_index_zone(sub_index, name);
	const struct volume_sub_index_zone *zone = &sub_index->zones[zone_number];
	u64 virtual_chapter;
	u32 index_chapter;
	u32 rolling_chapter;
	struct delta_index_entry delta_entry;

	result = uds_get_delta_index_entry(&sub_index->delta_index, delta_list_number,
					   address, name->name, &delta_entry);
	if (result != UDS_SUCCESS)
		return NO_CHAPTER;

	if (delta_entry.at_end || (delta_entry.key != address))
		return NO_CHAPTER;

	index_chapter = uds_get_delta_entry_value(&delta_entry);
	rolling_chapter = (index_chapter - zone->virtual_chapter_low) & sub_index->chapter_mask;

	virtual_chapter = zone->virtual_chapter_low + rolling_chapter;
	if (virtual_chapter > zone->virtual_chapter_high)
		return NO_CHAPTER;

	return virtual_chapter;
}

/* Do a read-only lookup of the record name for sparse cache management. */
u64 uds_lookup_volume_index_name(const struct volume_index *volume_index,
				 const struct uds_record_name *name)
{
	unsigned int zone_number = uds_get_volume_index_zone(volume_index, name);
	struct mutex *mutex = &volume_index->zones[zone_number].hook_mutex;
	u64 virtual_chapter;

	if (!uds_is_volume_index_sample(volume_index, name))
		return NO_CHAPTER;

	mutex_lock(mutex);
	virtual_chapter = lookup_volume_sub_index_name(&volume_index->vi_hook, name);
	mutex_unlock(mutex);

	return virtual_chapter;
}
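
/*
 * Illustrative sketch, not part of the driver: the intended use of
 * uds_lookup_volume_index_name() for sparse cache management. A caller deciding whether
 * to load a chapter into the sparse cache checks whether the hook name is indexed at
 * all; NO_CHAPTER means it is not. The function name is local to this sketch.
 */
static inline bool example_hook_is_indexed(const struct volume_index *volume_index,
					   const struct uds_record_name *name)
{
	return uds_lookup_volume_index_name(volume_index, name) != NO_CHAPTER;
}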

static void abort_restoring_volume_sub_index(struct volume_sub_index *sub_index)
{
	uds_reset_delta_index(&sub_index->delta_index);
}

static void abort_restoring_volume_index(struct volume_index *volume_index)
{
	abort_restoring_volume_sub_index(&volume_index->vi_non_hook);
	if (has_sparse(volume_index))
		abort_restoring_volume_sub_index(&volume_index->vi_hook);
}

static int start_restoring_volume_sub_index(struct volume_sub_index *sub_index,
					    struct buffered_reader **readers,
					    unsigned int reader_count)
{
	unsigned int z;
	int result;
	u64 virtual_chapter_low = 0, virtual_chapter_high = 0;
	unsigned int i;

	for (i = 0; i < reader_count; i++) {
		struct sub_index_data header;
		u8 buffer[sizeof(struct sub_index_data)];
		size_t offset = 0;
		u32 j;

		result = uds_read_from_buffered_reader(readers[i], buffer,
						       sizeof(buffer));
		if (result != UDS_SUCCESS) {
			return vdo_log_warning_strerror(result,
							"failed to read volume index header");
		}

		memcpy(&header.magic, buffer, MAGIC_SIZE);
		offset += MAGIC_SIZE;
		decode_u64_le(buffer, &offset, &header.volume_nonce);
		decode_u64_le(buffer, &offset, &header.virtual_chapter_low);
		decode_u64_le(buffer, &offset, &header.virtual_chapter_high);
		decode_u32_le(buffer, &offset, &header.first_list);
		decode_u32_le(buffer, &offset, &header.list_count);

		result = VDO_ASSERT(offset == sizeof(buffer),
				    "%zu bytes decoded of %zu expected", offset,
				    sizeof(buffer));
		if (result != VDO_SUCCESS)
			result = UDS_CORRUPT_DATA;

		if (memcmp(header.magic, MAGIC_START_5, MAGIC_SIZE) != 0) {
			return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
							"volume index file had bad magic number");
		}

		if (sub_index->volume_nonce == 0) {
			sub_index->volume_nonce = header.volume_nonce;
		} else if (header.volume_nonce != sub_index->volume_nonce) {
			return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
							"volume index volume nonce incorrect");
		}

		if (i == 0) {
			virtual_chapter_low = header.virtual_chapter_low;
			virtual_chapter_high = header.virtual_chapter_high;
		} else if (virtual_chapter_high != header.virtual_chapter_high) {
			u64 low = header.virtual_chapter_low;
			u64 high = header.virtual_chapter_high;

			return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
							"Inconsistent volume index zone files: Chapter range is [%llu,%llu], chapter range %d is [%llu,%llu]",
							(unsigned long long) virtual_chapter_low,
							(unsigned long long) virtual_chapter_high,
							i, (unsigned long long) low,
							(unsigned long long) high);
		} else if (virtual_chapter_low < header.virtual_chapter_low) {
			virtual_chapter_low = header.virtual_chapter_low;
		}

		for (j = 0; j < header.list_count; j++) {
			u8 decoded[sizeof(u64)];

			result = uds_read_from_buffered_reader(readers[i], decoded,
							       sizeof(u64));
			if (result != UDS_SUCCESS) {
				return vdo_log_warning_strerror(result,
								"failed to read volume index flush ranges");
			}

			sub_index->flush_chapters[header.first_list + j] =
				get_unaligned_le64(decoded);
		}
	}

	for (z = 0; z < sub_index->zone_count; z++) {
		memset(&sub_index->zones[z], 0, sizeof(struct volume_sub_index_zone));
		sub_index->zones[z].virtual_chapter_low = virtual_chapter_low;
		sub_index->zones[z].virtual_chapter_high = virtual_chapter_high;
	}

	result = uds_start_restoring_delta_index(&sub_index->delta_index, readers,
						 reader_count);
	if (result != UDS_SUCCESS)
		return vdo_log_warning_strerror(result, "restoring delta index failed");

	return UDS_SUCCESS;
}

static int start_restoring_volume_index(struct volume_index *volume_index,
					struct buffered_reader **buffered_readers,
					unsigned int reader_count)
{
	unsigned int i;
	int result;

	if (!has_sparse(volume_index)) {
		return start_restoring_volume_sub_index(&volume_index->vi_non_hook,
							buffered_readers, reader_count);
	}

	for (i = 0; i < reader_count; i++) {
		struct volume_index_data header;
		u8 buffer[sizeof(struct volume_index_data)];
		size_t offset = 0;

		result = uds_read_from_buffered_reader(buffered_readers[i], buffer,
						       sizeof(buffer));
		if (result != UDS_SUCCESS) {
			return vdo_log_warning_strerror(result,
							"failed to read volume index header");
		}

		memcpy(&header.magic, buffer, MAGIC_SIZE);
		offset += MAGIC_SIZE;
		decode_u32_le(buffer, &offset, &header.sparse_sample_rate);

		result = VDO_ASSERT(offset == sizeof(buffer),
				    "%zu bytes decoded of %zu expected", offset,
				    sizeof(buffer));
		if (result != VDO_SUCCESS)
			result = UDS_CORRUPT_DATA;

		if (memcmp(header.magic, MAGIC_START_6, MAGIC_SIZE) != 0)
			return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
							"volume index file had bad magic number");

		if (i == 0) {
			volume_index->sparse_sample_rate = header.sparse_sample_rate;
		} else if (volume_index->sparse_sample_rate != header.sparse_sample_rate) {
			vdo_log_warning_strerror(UDS_CORRUPT_DATA,
						 "Inconsistent sparse sample rate in delta index zone files: %u vs. %u",
						 volume_index->sparse_sample_rate,
						 header.sparse_sample_rate);
			return UDS_CORRUPT_DATA;
		}
	}

	result = start_restoring_volume_sub_index(&volume_index->vi_non_hook,
						  buffered_readers, reader_count);
	if (result != UDS_SUCCESS)
		return result;

	return start_restoring_volume_sub_index(&volume_index->vi_hook,
						buffered_readers, reader_count);
}

static int finish_restoring_volume_sub_index(struct volume_sub_index *sub_index,
					     struct buffered_reader **buffered_readers,
					     unsigned int reader_count)
{
	return uds_finish_restoring_delta_index(&sub_index->delta_index,
						buffered_readers, reader_count);
}

static int finish_restoring_volume_index(struct volume_index *volume_index,
					 struct buffered_reader **buffered_readers,
					 unsigned int reader_count)
{
	int result;

	result = finish_restoring_volume_sub_index(&volume_index->vi_non_hook,
						   buffered_readers, reader_count);
	if ((result == UDS_SUCCESS) && has_sparse(volume_index)) {
		result = finish_restoring_volume_sub_index(&volume_index->vi_hook,
							   buffered_readers,
							   reader_count);
	}

	return result;
}

int uds_load_volume_index(struct volume_index *volume_index,
			  struct buffered_reader **readers, unsigned int reader_count)
{
	int result;

	/* Start by reading the header section of the stream. */
	result = start_restoring_volume_index(volume_index, readers, reader_count);
	if (result != UDS_SUCCESS)
		return result;

	result = finish_restoring_volume_index(volume_index, readers, reader_count);
	if (result != UDS_SUCCESS) {
		abort_restoring_volume_index(volume_index);
		return result;
	}

	/* Check the final guard lists to make sure there is no extra data. */
	result = uds_check_guard_delta_lists(readers, reader_count);
	if (result != UDS_SUCCESS)
		abort_restoring_volume_index(volume_index);

	return result;
}

static int start_saving_volume_sub_index(const struct volume_sub_index *sub_index,
					 unsigned int zone_number,
					 struct buffered_writer *buffered_writer)
{
	int result;
	struct volume_sub_index_zone *volume_index_zone = &sub_index->zones[zone_number];
	u32 first_list = sub_index->delta_index.delta_zones[zone_number].first_list;
	u32 list_count = sub_index->delta_index.delta_zones[zone_number].list_count;
	u8 buffer[sizeof(struct sub_index_data)];
	size_t offset = 0;
	u32 i;

	memcpy(buffer, MAGIC_START_5, MAGIC_SIZE);
	offset += MAGIC_SIZE;
	encode_u64_le(buffer, &offset, sub_index->volume_nonce);
	encode_u64_le(buffer, &offset, volume_index_zone->virtual_chapter_low);
	encode_u64_le(buffer, &offset, volume_index_zone->virtual_chapter_high);
	encode_u32_le(buffer, &offset, first_list);
	encode_u32_le(buffer, &offset, list_count);

	result = VDO_ASSERT(offset == sizeof(struct sub_index_data),
			    "%zu bytes of config written, of %zu expected", offset,
			    sizeof(struct sub_index_data));
	if (result != VDO_SUCCESS)
		return result;

	result = uds_write_to_buffered_writer(buffered_writer, buffer, offset);
	if (result != UDS_SUCCESS)
		return vdo_log_warning_strerror(result,
						"failed to write volume index header");

	for (i = 0; i < list_count; i++) {
		u8 encoded[sizeof(u64)];

		put_unaligned_le64(sub_index->flush_chapters[first_list + i], &encoded);
		result = uds_write_to_buffered_writer(buffered_writer, encoded,
						      sizeof(u64));
		if (result != UDS_SUCCESS) {
			return vdo_log_warning_strerror(result,
							"failed to write volume index flush ranges");
		}
	}

	return uds_start_saving_delta_index(&sub_index->delta_index, zone_number,
					    buffered_writer);
}

static int start_saving_volume_index(const struct volume_index *volume_index,
				     unsigned int zone_number,
				     struct buffered_writer *writer)
{
	u8 buffer[sizeof(struct volume_index_data)];
	size_t offset = 0;
	int result;

	if (!has_sparse(volume_index)) {
		return start_saving_volume_sub_index(&volume_index->vi_non_hook,
						     zone_number, writer);
	}

	memcpy(buffer, MAGIC_START_6, MAGIC_SIZE);
	offset += MAGIC_SIZE;
	encode_u32_le(buffer, &offset, volume_index->sparse_sample_rate);
	result = VDO_ASSERT(offset == sizeof(struct volume_index_data),
			    "%zu bytes of header written, of %zu expected", offset,
			    sizeof(struct volume_index_data));
	if (result != VDO_SUCCESS)
		return result;

	result = uds_write_to_buffered_writer(writer, buffer, offset);
	if (result != UDS_SUCCESS) {
		vdo_log_warning_strerror(result, "failed to write volume index header");
		return result;
	}

	result = start_saving_volume_sub_index(&volume_index->vi_non_hook, zone_number,
					       writer);
	if (result != UDS_SUCCESS)
		return result;

	return start_saving_volume_sub_index(&volume_index->vi_hook, zone_number,
					     writer);
}
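
/*
 * Illustrative note, not part of the driver: the serialized zone header written by
 * start_saving_volume_sub_index() above and decoded by start_restoring_volume_sub_index().
 * The fields are little-endian and packed in declaration order, for a total of
 * 8 + 8 + 8 + 8 + 4 + 4 = 40 bytes. The enum name is local to this sketch.
 */
enum {
	EXAMPLE_SUB_INDEX_HEADER_BYTES = MAGIC_SIZE + 3 * sizeof(u64) + 2 * sizeof(u32),
};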

static int finish_saving_volume_sub_index(const struct volume_sub_index *sub_index,
					  unsigned int zone_number)
{
	return uds_finish_saving_delta_index(&sub_index->delta_index, zone_number);
}

static int finish_saving_volume_index(const struct volume_index *volume_index,
				      unsigned int zone_number)
{
	int result;

	result = finish_saving_volume_sub_index(&volume_index->vi_non_hook, zone_number);
	if ((result == UDS_SUCCESS) && has_sparse(volume_index))
		result = finish_saving_volume_sub_index(&volume_index->vi_hook,
							zone_number);

	return result;
}

int uds_save_volume_index(struct volume_index *volume_index,
			  struct buffered_writer **writers, unsigned int writer_count)
{
	int result = UDS_SUCCESS;
	unsigned int zone;

	for (zone = 0; zone < writer_count; zone++) {
		result = start_saving_volume_index(volume_index, zone, writers[zone]);
		if (result != UDS_SUCCESS)
			break;

		result = finish_saving_volume_index(volume_index, zone);
		if (result != UDS_SUCCESS)
			break;

		result = uds_write_guard_delta_list(writers[zone]);
		if (result != UDS_SUCCESS)
			break;

		result = uds_flush_buffered_writer(writers[zone]);
		if (result != UDS_SUCCESS)
			break;
	}

	return result;
}

static void get_volume_sub_index_stats(const struct volume_sub_index *sub_index,
				       struct volume_index_stats *stats)
{
	struct delta_index_stats dis;
	unsigned int z;

	uds_get_delta_index_stats(&sub_index->delta_index, &dis);
	stats->rebalance_time = dis.rebalance_time;
	stats->rebalance_count = dis.rebalance_count;
	stats->record_count = dis.record_count;
	stats->collision_count = dis.collision_count;
	stats->discard_count = dis.discard_count;
	stats->overflow_count = dis.overflow_count;
	stats->delta_lists = dis.list_count;
	stats->early_flushes = 0;
	for (z = 0; z < sub_index->zone_count; z++)
		stats->early_flushes += sub_index->zones[z].early_flushes;
}

void uds_get_volume_index_stats(const struct volume_index *volume_index,
				struct volume_index_stats *stats)
{
	struct volume_index_stats sparse_stats;

	get_volume_sub_index_stats(&volume_index->vi_non_hook, stats);
	if (!has_sparse(volume_index))
		return;

	get_volume_sub_index_stats(&volume_index->vi_hook, &sparse_stats);
	stats->rebalance_time += sparse_stats.rebalance_time;
	stats->rebalance_count += sparse_stats.rebalance_count;
	stats->record_count += sparse_stats.record_count;
	stats->collision_count += sparse_stats.collision_count;
	stats->discard_count += sparse_stats.discard_count;
	stats->overflow_count += sparse_stats.overflow_count;
	stats->delta_lists += sparse_stats.delta_lists;
	stats->early_flushes += sparse_stats.early_flushes;
}

static int initialize_volume_sub_index(const struct uds_configuration *config,
				       u64 volume_nonce, u8 tag,
				       struct volume_sub_index *sub_index)
{
	struct sub_index_parameters params = { .address_bits = 0 };
	unsigned int zone_count = config->zone_count;
	u64 available_bytes = 0;
	unsigned int z;
	int result;

	result = compute_volume_sub_index_parameters(config, &params);
	if (result != UDS_SUCCESS)
		return result;

	sub_index->address_bits = params.address_bits;
	sub_index->address_mask = (1u << params.address_bits) - 1;
	sub_index->chapter_bits = params.chapter_bits;
	sub_index->chapter_mask = (1u << params.chapter_bits) - 1;
	sub_index->chapter_count = params.chapter_count;
	sub_index->list_count = params.list_count;
	sub_index->zone_count = zone_count;
	sub_index->chapter_zone_bits = params.chapter_size_in_bits / zone_count;
	sub_index->volume_nonce = volume_nonce;

	result = uds_initialize_delta_index(&sub_index->delta_index, zone_count,
					    params.list_count, params.mean_delta,
					    params.chapter_bits, params.memory_size,
					    tag);
	if (result != UDS_SUCCESS)
		return result;

	for (z = 0; z < sub_index->delta_index.zone_count; z++)
		available_bytes += sub_index->delta_index.delta_zones[z].size;
	available_bytes -= params.target_free_bytes;
	sub_index->max_zone_bits = (available_bytes * BITS_PER_BYTE) / zone_count;
	sub_index->memory_size = (sub_index->delta_index.memory_size +
				  sizeof(struct volume_sub_index) +
				  (params.list_count * sizeof(u64)) +
				  (zone_count * sizeof(struct volume_sub_index_zone)));

	/* The following arrays are initialized to all zeros. */
	result = vdo_allocate(params.list_count, u64, "first chapter to flush",
			      &sub_index->flush_chapters);
	if (result != VDO_SUCCESS)
		return result;

	return vdo_allocate(zone_count, struct volume_sub_index_zone,
			    "volume index zones", &sub_index->zones);
}

int uds_make_volume_index(const struct uds_configuration *config, u64 volume_nonce,
			  struct volume_index **volume_index_ptr)
{
	struct split_config split;
	unsigned int zone;
	struct volume_index *volume_index;
	int result;

	result = vdo_allocate(1, struct volume_index, "volume index", &volume_index);
	if (result != VDO_SUCCESS)
		return result;

	volume_index->zone_count = config->zone_count;
	if (!uds_is_sparse_index_geometry(config->geometry)) {
		result = initialize_volume_sub_index(config, volume_nonce, 'm',
						     &volume_index->vi_non_hook);
		if (result != UDS_SUCCESS) {
			uds_free_volume_index(volume_index);
			return result;
		}

		volume_index->memory_size = volume_index->vi_non_hook.memory_size;
		*volume_index_ptr = volume_index;
		return UDS_SUCCESS;
	}

	volume_index->sparse_sample_rate = config->sparse_sample_rate;

	result = vdo_allocate(config->zone_count, struct volume_index_zone,
			      "volume index zones", &volume_index->zones);
	if (result != VDO_SUCCESS) {
		uds_free_volume_index(volume_index);
		return result;
	}

	for (zone = 0; zone < config->zone_count; zone++)
		mutex_init(&volume_index->zones[zone].hook_mutex);

	split_configuration(config, &split);
	result = initialize_volume_sub_index(&split.non_hook_config, volume_nonce, 'd',
					     &volume_index->vi_non_hook);
	if (result != UDS_SUCCESS) {
		uds_free_volume_index(volume_index);
		return vdo_log_error_strerror(result,
					      "Error creating non hook volume index");
	}

	result = initialize_volume_sub_index(&split.hook_config, volume_nonce, 's',
					     &volume_index->vi_hook);
	if (result != UDS_SUCCESS) {
		uds_free_volume_index(volume_index);
		return vdo_log_error_strerror(result,
					      "Error creating hook volume index");
	}

	volume_index->memory_size =
		volume_index->vi_non_hook.memory_size + volume_index->vi_hook.memory_size;
	*volume_index_ptr = volume_index;
	return UDS_SUCCESS;
}
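
/*
 * Illustrative sketch, not part of the driver: a minimal lifecycle for a volume index,
 * assuming a valid uds_configuration and volume nonce are already in hand. Error
 * handling is abbreviated, and the function name is local to this sketch.
 */
static inline int example_volume_index_lifecycle(const struct uds_configuration *config,
						 u64 volume_nonce)
{
	struct volume_index *volume_index;
	int result;

	result = uds_make_volume_index(config, volume_nonce, &volume_index);
	if (result != UDS_SUCCESS)
		return result;

	/* Open the first chapter in every zone before indexing any records. */
	uds_set_volume_index_open_chapter(volume_index, 0);

	/* ... uds_get_volume_index_record() / uds_put_volume_index_record() ... */

	uds_free_volume_index(volume_index);
	return UDS_SUCCESS;
}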