/*
 * CDDL HEADER START
 *
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2015, 2017 by Delphix. All rights reserved.
 */

#include <sys/dmu_tx.h>
#include <sys/dsl_pool.h>
#include <sys/spa.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_indirect_mapping.h>
#include <sys/zfeature.h>
#include <sys/dmu_objset.h>

static boolean_t
vdev_indirect_mapping_verify(vdev_indirect_mapping_t *vim)
{
	ASSERT(vim != NULL);

	ASSERT(vim->vim_object != 0);
	ASSERT(vim->vim_objset != NULL);
	ASSERT(vim->vim_phys != NULL);
	ASSERT(vim->vim_dbuf != NULL);

	EQUIV(vim->vim_phys->vimp_num_entries > 0,
	    vim->vim_entries != NULL);
	if (vim->vim_phys->vimp_num_entries > 0) {
		vdev_indirect_mapping_entry_phys_t *last_entry =
		    &vim->vim_entries[vim->vim_phys->vimp_num_entries - 1];
		uint64_t offset = DVA_MAPPING_GET_SRC_OFFSET(last_entry);
		uint64_t size = DVA_GET_ASIZE(&last_entry->vimep_dst);

		ASSERT3U(vim->vim_phys->vimp_max_offset, >=, offset + size);
	}
	if (vim->vim_havecounts) {
		ASSERT(vim->vim_phys->vimp_counts_object != 0);
	}

	return (B_TRUE);
}

uint64_t
vdev_indirect_mapping_num_entries(vdev_indirect_mapping_t *vim)
{
	ASSERT(vdev_indirect_mapping_verify(vim));

	return (vim->vim_phys->vimp_num_entries);
}

uint64_t
vdev_indirect_mapping_max_offset(vdev_indirect_mapping_t *vim)
{
	ASSERT(vdev_indirect_mapping_verify(vim));

	return (vim->vim_phys->vimp_max_offset);
}

uint64_t
vdev_indirect_mapping_object(vdev_indirect_mapping_t *vim)
{
	ASSERT(vdev_indirect_mapping_verify(vim));

	return (vim->vim_object);
}

uint64_t
vdev_indirect_mapping_bytes_mapped(vdev_indirect_mapping_t *vim)
{
	ASSERT(vdev_indirect_mapping_verify(vim));

	return (vim->vim_phys->vimp_bytes_mapped);
}

/*
 * The length (in bytes) of the mapping object array in memory and
 * (logically) on disk.
 *
 * Note that unlike most of our accessor functions,
 * we don't assert that the struct is consistent; therefore it can be
 * called while there may be concurrent changes, if we don't care about
 * the value being immediately stale (e.g. from spa_removal_get_stats()).
 */
uint64_t
vdev_indirect_mapping_size(vdev_indirect_mapping_t *vim)
{
	return (vim->vim_phys->vimp_num_entries * sizeof (*vim->vim_entries));
}

/*
 * Compare an offset with an indirect mapping entry; there are three
 * possible scenarios:
 *
 *     1. The offset is "less than" the mapping entry; meaning the
 *        offset is less than the source offset of the mapping entry. In
 *        this case, there is no overlap between the offset and the
 *        mapping entry and -1 will be returned.
 *
 *     2. The offset is "greater than" the mapping entry; meaning the
 *        offset is greater than the mapping entry's source offset plus
 *        the entry's size. In this case, there is no overlap between
 *        the offset and the mapping entry and 1 will be returned.
 *
 *        NOTE: If the offset is actually equal to the entry's offset
 *        plus size, this is considered to be "greater" than the entry,
 *        and this case applies (i.e. 1 will be returned). Thus, the
 *        entry's "range" can be considered to be inclusive at its
 *        start, but exclusive at its end: e.g. [src, src + size).
 *
 *     3. The last case to consider is if the offset actually falls
 *        within the mapping entry's range. If this is the case, the
 *        offset is considered to be "equal to" the mapping entry and
 *        0 will be returned.
 *
 *        NOTE: If the offset is equal to the entry's source offset,
 *        this case applies and 0 will be returned. If the offset is
 *        equal to the entry's source plus its size, this case does
 *        *not* apply (see "NOTE" above for scenario 2), and 1 will be
 *        returned.
 */
static int
dva_mapping_overlap_compare(const void *v_key, const void *v_array_elem)
{
	const uint64_t *key = v_key;
	const vdev_indirect_mapping_entry_phys_t *array_elem =
	    v_array_elem;
	uint64_t src_offset = DVA_MAPPING_GET_SRC_OFFSET(array_elem);

	if (*key < src_offset) {
		return (-1);
	} else if (*key < src_offset + DVA_GET_ASIZE(&array_elem->vimep_dst)) {
		return (0);
	} else {
		return (1);
	}
}
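
/*
 * Worked example (illustrative values only): for an entry whose source
 * offset is 0x1000 and whose allocated size is 0x200, the entry covers
 * the half-open range [0x1000, 0x1200), so this comparator returns:
 *
 *	key = 0x0fff  ->  -1	(before the entry)
 *	key = 0x1000  ->   0	(first byte of the entry)
 *	key = 0x11ff  ->   0	(last byte of the entry)
 *	key = 0x1200  ->   1	(one past the end, i.e. "greater than")
 *
 * This is the comparator used by the binary search in
 * vdev_indirect_mapping_entry_for_offset_impl() below.
 */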

/*
 * Returns the mapping entry for the given offset.
 *
 * It's possible that the given offset will not be in the mapping table
 * (i.e. no mapping entries contain this offset), in which case the
 * return value depends on the "next_if_missing" parameter.
 *
 * If the offset is not found in the table and "next_if_missing" is
 * B_FALSE, then NULL will always be returned. The behavior is intended
 * to allow consumers to get the entry corresponding to the offset
 * parameter, iff the offset overlaps with an entry in the table.
 *
 * If the offset is not found in the table and "next_if_missing" is
 * B_TRUE, then the entry nearest to the given offset will be returned,
 * such that the entry's source offset is greater than the offset
 * passed in (i.e. the "next" mapping entry in the table is returned, if
 * the offset is missing from the table). If there are no entries whose
 * source offset is greater than the passed in offset, NULL is returned.
 */
static vdev_indirect_mapping_entry_phys_t *
vdev_indirect_mapping_entry_for_offset_impl(vdev_indirect_mapping_t *vim,
    uint64_t offset, boolean_t next_if_missing)
{
	ASSERT(vdev_indirect_mapping_verify(vim));
	ASSERT(vim->vim_phys->vimp_num_entries > 0);

	vdev_indirect_mapping_entry_phys_t *entry = NULL;

	uint64_t last = vim->vim_phys->vimp_num_entries - 1;
	uint64_t base = 0;

	/*
	 * We don't define these inside of the while loop because we use
	 * their value in the case that offset isn't in the mapping.
	 */
	uint64_t mid;
	int result;

	while (last >= base) {
		mid = base + ((last - base) >> 1);

		result = dva_mapping_overlap_compare(&offset,
		    &vim->vim_entries[mid]);

		if (result == 0) {
			entry = &vim->vim_entries[mid];
			break;
		} else if (result < 0) {
			last = mid - 1;
		} else {
			base = mid + 1;
		}
	}

	if (entry == NULL && next_if_missing) {
		ASSERT3U(base, ==, last + 1);
		ASSERT(mid == base || mid == last);
		ASSERT3S(result, !=, 0);

		/*
		 * The offset we're looking for isn't actually contained
		 * in the mapping table, thus we need to return the
		 * closest mapping entry that is greater than the
		 * offset. We reuse the result of the last comparison,
		 * comparing the mapping entry at index "mid" and the
		 * offset. The offset is guaranteed to lie between
		 * indices one less than "mid", and one greater than
		 * "mid"; we just need to determine if offset is greater
		 * than, or less than the mapping entry contained at
		 * index "mid".
		 */

		uint64_t index;
		if (result < 0)
			index = mid;
		else
			index = mid + 1;

		ASSERT3U(index, <=, vim->vim_phys->vimp_num_entries);

		if (index == vim->vim_phys->vimp_num_entries) {
			/*
			 * If "index" is past the end of the entries
			 * array, then not only is the offset not in the
			 * mapping table, but it's actually greater than
			 * all entries in the table. In this case, we
			 * can't return a mapping entry greater than the
			 * offset (since none exist), so we return NULL.
			 */

			ASSERT3S(dva_mapping_overlap_compare(&offset,
			    &vim->vim_entries[index - 1]), >, 0);

			return (NULL);
		} else {
			/*
			 * Just to be safe, we verify the offset falls
			 * in between the mapping entries at index and
			 * one less than index. Since we know the offset
			 * doesn't overlap an entry, and we're supposed
			 * to return the entry just greater than the
			 * offset, both of the following tests must be
			 * true.
			 */
			ASSERT3S(dva_mapping_overlap_compare(&offset,
			    &vim->vim_entries[index]), <, 0);
			IMPLY(index >= 1, dva_mapping_overlap_compare(&offset,
			    &vim->vim_entries[index - 1]) > 0);

			return (&vim->vim_entries[index]);
		}
	} else {
		return (entry);
	}
}

vdev_indirect_mapping_entry_phys_t *
vdev_indirect_mapping_entry_for_offset(vdev_indirect_mapping_t *vim,
    uint64_t offset)
{
	return (vdev_indirect_mapping_entry_for_offset_impl(vim, offset,
	    B_FALSE));
}

vdev_indirect_mapping_entry_phys_t *
vdev_indirect_mapping_entry_for_offset_or_next(vdev_indirect_mapping_t *vim,
    uint64_t offset)
{
	return (vdev_indirect_mapping_entry_for_offset_impl(vim, offset,
	    B_TRUE));
}
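
/*
 * Usage sketch (hypothetical offsets, for illustration only): with a
 * mapping table whose only entries cover [0x1000, 0x1200) and
 * [0x2000, 0x2400):
 *
 *	vdev_indirect_mapping_entry_for_offset(vim, 0x1100)
 *	    returns the [0x1000, 0x1200) entry (the offset overlaps it).
 *	vdev_indirect_mapping_entry_for_offset(vim, 0x1800)
 *	    returns NULL (no entry contains 0x1800).
 *	vdev_indirect_mapping_entry_for_offset_or_next(vim, 0x1800)
 *	    returns the [0x2000, 0x2400) entry (the next entry).
 *	vdev_indirect_mapping_entry_for_offset_or_next(vim, 0x3000)
 *	    returns NULL (no entry lies beyond the offset).
 */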

void
vdev_indirect_mapping_close(vdev_indirect_mapping_t *vim)
{
	ASSERT(vdev_indirect_mapping_verify(vim));

	if (vim->vim_phys->vimp_num_entries > 0) {
		uint64_t map_size = vdev_indirect_mapping_size(vim);
		kmem_free(vim->vim_entries, map_size);
		vim->vim_entries = NULL;
	}

	dmu_buf_rele(vim->vim_dbuf, vim);

	vim->vim_objset = NULL;
	vim->vim_object = 0;
	vim->vim_dbuf = NULL;
	vim->vim_phys = NULL;

	kmem_free(vim, sizeof (*vim));
}

uint64_t
vdev_indirect_mapping_alloc(objset_t *os, dmu_tx_t *tx)
{
	uint64_t object;
	ASSERT(dmu_tx_is_syncing(tx));
	uint64_t bonus_size = VDEV_INDIRECT_MAPPING_SIZE_V0;

	if (spa_feature_is_enabled(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
		bonus_size = sizeof (vdev_indirect_mapping_phys_t);
	}

	object = dmu_object_alloc(os,
	    DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE,
	    DMU_OTN_UINT64_METADATA, bonus_size,
	    tx);

	if (spa_feature_is_enabled(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
		dmu_buf_t *dbuf;
		vdev_indirect_mapping_phys_t *vimp;

		VERIFY0(dmu_bonus_hold(os, object, FTAG, &dbuf));
		dmu_buf_will_dirty(dbuf, tx);
		vimp = dbuf->db_data;
		vimp->vimp_counts_object = dmu_object_alloc(os,
		    DMU_OTN_UINT32_METADATA, SPA_OLD_MAXBLOCKSIZE,
		    DMU_OT_NONE, 0, tx);
		spa_feature_incr(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
		dmu_buf_rele(dbuf, FTAG);
	}

	return (object);
}


vdev_indirect_mapping_t *
vdev_indirect_mapping_open(objset_t *os, uint64_t mapping_object)
{
	vdev_indirect_mapping_t *vim = kmem_zalloc(sizeof (*vim), KM_SLEEP);
	dmu_object_info_t doi;
	VERIFY0(dmu_object_info(os, mapping_object, &doi));

	vim->vim_objset = os;
	vim->vim_object = mapping_object;

	VERIFY0(dmu_bonus_hold(os, vim->vim_object, vim,
	    &vim->vim_dbuf));
	vim->vim_phys = vim->vim_dbuf->db_data;

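	/*
	 * A bonus buffer larger than the original (v0) layout means
	 * vimp_counts_object is present, i.e. this mapping was created
	 * with SPA_FEATURE_OBSOLETE_COUNTS enabled (see
	 * vdev_indirect_mapping_alloc()).
	 */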
	vim->vim_havecounts =
	    (doi.doi_bonus_size > VDEV_INDIRECT_MAPPING_SIZE_V0);

	if (vim->vim_phys->vimp_num_entries > 0) {
		uint64_t map_size = vdev_indirect_mapping_size(vim);
		vim->vim_entries = kmem_alloc(map_size, KM_SLEEP);
		VERIFY0(dmu_read(os, vim->vim_object, 0, map_size,
		    vim->vim_entries, DMU_READ_PREFETCH));
	}

	ASSERT(vdev_indirect_mapping_verify(vim));

	return (vim);
}

void
vdev_indirect_mapping_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
{
	vdev_indirect_mapping_t *vim = vdev_indirect_mapping_open(os, object);
	if (vim->vim_havecounts) {
		VERIFY0(dmu_object_free(os, vim->vim_phys->vimp_counts_object,
		    tx));
		spa_feature_decr(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
	}
	vdev_indirect_mapping_close(vim);

	VERIFY0(dmu_object_free(os, object, tx));
}
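
/*
 * Lifecycle sketch (hypothetical caller, for illustration only): a
 * mapping object is allocated once in syncing context, opened and
 * closed by its consumer, and eventually destroyed:
 *
 *	uint64_t obj = vdev_indirect_mapping_alloc(os, tx);
 *	vdev_indirect_mapping_t *vim = vdev_indirect_mapping_open(os, obj);
 *	... use the accessors and lookup functions above ...
 *	vdev_indirect_mapping_close(vim);
 *	vdev_indirect_mapping_free(os, obj, tx);
 */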

/*
 * Append the list of vdev_indirect_mapping_entry_t's to the on-disk
 * mapping object.  Also remove the entries from the list and free them.
 * This also implicitly extends the max_offset of the mapping (to the end
 * of the last entry).
 */
void
vdev_indirect_mapping_add_entries(vdev_indirect_mapping_t *vim,
    list_t *list, dmu_tx_t *tx)
{
	vdev_indirect_mapping_entry_phys_t *mapbuf;
	uint64_t old_size;
	uint32_t *countbuf = NULL;
	vdev_indirect_mapping_entry_phys_t *old_entries;
	uint64_t old_count;
	uint64_t entries_written = 0;

	ASSERT(vdev_indirect_mapping_verify(vim));
	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(dsl_pool_sync_context(dmu_tx_pool(tx)));
	ASSERT(!list_is_empty(list));

	old_size = vdev_indirect_mapping_size(vim);
	old_entries = vim->vim_entries;
	old_count = vim->vim_phys->vimp_num_entries;

	dmu_buf_will_dirty(vim->vim_dbuf, tx);

	mapbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE);
	if (vim->vim_havecounts) {
		countbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE);
		ASSERT(spa_feature_is_active(vim->vim_objset->os_spa,
		    SPA_FEATURE_OBSOLETE_COUNTS));
	}
	while (!list_is_empty(list)) {
		uint64_t i;
		/*
		 * Write entries from the list to the
		 * vdev_im_object in batches of size SPA_OLD_MAXBLOCKSIZE.
		 */
		for (i = 0; i < SPA_OLD_MAXBLOCKSIZE / sizeof (*mapbuf); i++) {
			vdev_indirect_mapping_entry_t *entry =
			    list_remove_head(list);
			if (entry == NULL)
				break;

			uint64_t size =
			    DVA_GET_ASIZE(&entry->vime_mapping.vimep_dst);
			uint64_t src_offset =
			    DVA_MAPPING_GET_SRC_OFFSET(&entry->vime_mapping);

			/*
			 * We shouldn't be adding an entry which is fully
			 * obsolete.
			 */
			ASSERT3U(entry->vime_obsolete_count, <, size);
			IMPLY(entry->vime_obsolete_count != 0,
			    vim->vim_havecounts);

			mapbuf[i] = entry->vime_mapping;
			if (vim->vim_havecounts)
				countbuf[i] = entry->vime_obsolete_count;

			vim->vim_phys->vimp_bytes_mapped += size;
			ASSERT3U(src_offset, >=,
			    vim->vim_phys->vimp_max_offset);
			vim->vim_phys->vimp_max_offset = src_offset + size;

			entries_written++;

			kmem_free(entry, sizeof (*entry));
		}
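		/*
		 * Append this batch to the on-disk array: the write
		 * offset is the current end of the array (in bytes)
		 * and the length covers the "i" entries copied into
		 * mapbuf above.
		 */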
		dmu_write(vim->vim_objset, vim->vim_object,
		    vim->vim_phys->vimp_num_entries * sizeof (*mapbuf),
		    i * sizeof (*mapbuf),
		    mapbuf, tx);
		if (vim->vim_havecounts) {
			dmu_write(vim->vim_objset,
			    vim->vim_phys->vimp_counts_object,
			    vim->vim_phys->vimp_num_entries *
			    sizeof (*countbuf),
			    i * sizeof (*countbuf), countbuf, tx);
		}
		vim->vim_phys->vimp_num_entries += i;
	}
	zio_buf_free(mapbuf, SPA_OLD_MAXBLOCKSIZE);
	if (vim->vim_havecounts)
		zio_buf_free(countbuf, SPA_OLD_MAXBLOCKSIZE);

	/*
	 * Update the entry array to reflect the new entries. First, copy
	 * over any old entries then read back the new entries we just wrote.
	 */
	uint64_t new_size = vdev_indirect_mapping_size(vim);
	ASSERT3U(new_size, >, old_size);
	ASSERT3U(new_size - old_size, ==,
	    entries_written * sizeof (vdev_indirect_mapping_entry_phys_t));
	vim->vim_entries = kmem_alloc(new_size, KM_SLEEP);
	if (old_size > 0) {
		bcopy(old_entries, vim->vim_entries, old_size);
		kmem_free(old_entries, old_size);
	}
	VERIFY0(dmu_read(vim->vim_objset, vim->vim_object, old_size,
	    new_size - old_size, &vim->vim_entries[old_count],
	    DMU_READ_PREFETCH));

	zfs_dbgmsg("txg %llu: wrote %llu entries to "
	    "indirect mapping obj %llu; max offset=0x%llx",
	    (u_longlong_t)dmu_tx_get_txg(tx),
	    (u_longlong_t)entries_written,
	    (u_longlong_t)vim->vim_object,
	    (u_longlong_t)vim->vim_phys->vimp_max_offset);
}
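
/*
 * Caller sketch (hypothetical, for illustration only; vime_node is
 * assumed to be the entry's list link as declared in
 * sys/vdev_indirect_mapping.h): entries are accumulated on a list_t of
 * vdev_indirect_mapping_entry_t and handed to
 * vdev_indirect_mapping_add_entries() in syncing context, which
 * consumes and frees them:
 *
 *	list_t new_entries;
 *	list_create(&new_entries, sizeof (vdev_indirect_mapping_entry_t),
 *	    offsetof(vdev_indirect_mapping_entry_t, vime_node));
 *
 *	vdev_indirect_mapping_entry_t *vime =
 *	    kmem_zalloc(sizeof (*vime), KM_SLEEP);
 *	... fill in vime->vime_mapping and vime->vime_obsolete_count ...
 *	list_insert_tail(&new_entries, vime);
 *
 *	vdev_indirect_mapping_add_entries(vim, &new_entries, tx);
 *	ASSERT(list_is_empty(&new_entries));
 *	list_destroy(&new_entries);
 */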

/*
 * Increment the relevant counts for the specified offset and length.
 * The counts array must be obtained from
 * vdev_indirect_mapping_load_obsolete_counts().
 */
void
vdev_indirect_mapping_increment_obsolete_count(vdev_indirect_mapping_t *vim,
    uint64_t offset, uint64_t length, uint32_t *counts)
{
	vdev_indirect_mapping_entry_phys_t *mapping;
	uint64_t index;

	mapping = vdev_indirect_mapping_entry_for_offset(vim, offset);

	ASSERT(length > 0);
	ASSERT3P(mapping, !=, NULL);

	index = mapping - vim->vim_entries;

	while (length > 0) {
		ASSERT3U(index, <, vdev_indirect_mapping_num_entries(vim));

		uint64_t size = DVA_GET_ASIZE(&mapping->vimep_dst);
		uint64_t inner_offset = offset -
		    DVA_MAPPING_GET_SRC_OFFSET(mapping);
		VERIFY3U(inner_offset, <, size);
		uint64_t inner_size = MIN(length, size - inner_offset);

		VERIFY3U(counts[index] + inner_size, <=, size);
		counts[index] += inner_size;

		offset += inner_size;
		length -= inner_size;
		mapping++;
		index++;
	}
}
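
/*
 * Worked example (hypothetical values): with two adjacent mapping
 * entries at indices n and n + 1 covering [0x1000, 0x1200) and
 * [0x1200, 0x1600) respectively, a call with offset=0x1100 and
 * length=0x200 is split across both entries:
 *
 *	entry n:	inner_offset=0x100, inner_size=0x100,
 *			counts[n] += 0x100
 *	entry n + 1:	inner_offset=0, inner_size=0x100,
 *			counts[n + 1] += 0x100
 *
 * That is, each entry's count grows by the number of obsolete bytes
 * that fall within that entry's source range.
 */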

typedef struct load_obsolete_space_map_arg {
	vdev_indirect_mapping_t	*losma_vim;
	uint32_t		*losma_counts;
} load_obsolete_space_map_arg_t;

static int
load_obsolete_sm_callback(space_map_entry_t *sme, void *arg)
{
	load_obsolete_space_map_arg_t *losma = arg;
	ASSERT3S(sme->sme_type, ==, SM_ALLOC);

	vdev_indirect_mapping_increment_obsolete_count(losma->losma_vim,
	    sme->sme_offset, sme->sme_run, losma->losma_counts);

	return (0);
}

/*
 * Modify the counts (increment them) based on the spacemap.
 */
void
vdev_indirect_mapping_load_obsolete_spacemap(vdev_indirect_mapping_t *vim,
    uint32_t *counts, space_map_t *obsolete_space_sm)
{
	load_obsolete_space_map_arg_t losma;
	losma.losma_counts = counts;
	losma.losma_vim = vim;
	VERIFY0(space_map_iterate(obsolete_space_sm,
	    space_map_length(obsolete_space_sm),
	    load_obsolete_sm_callback, &losma));
}

/*
 * Read the obsolete counts from disk, returning them in an array.
 */
uint32_t *
vdev_indirect_mapping_load_obsolete_counts(vdev_indirect_mapping_t *vim)
{
	ASSERT(vdev_indirect_mapping_verify(vim));

	uint64_t counts_size =
	    vim->vim_phys->vimp_num_entries * sizeof (uint32_t);
	uint32_t *counts = kmem_alloc(counts_size, KM_SLEEP);
	if (vim->vim_havecounts) {
		VERIFY0(dmu_read(vim->vim_objset,
		    vim->vim_phys->vimp_counts_object,
		    0, counts_size,
		    counts, DMU_READ_PREFETCH));
	} else {
		bzero(counts, counts_size);
	}
	return (counts);
}

extern void
vdev_indirect_mapping_free_obsolete_counts(vdev_indirect_mapping_t *vim,
    uint32_t *counts)
{
	ASSERT(vdev_indirect_mapping_verify(vim));

	kmem_free(counts, vim->vim_phys->vimp_num_entries * sizeof (uint32_t));
}
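
/*
 * Typical flow (hypothetical caller, for illustration only): the
 * in-core counts are loaded from the counts object, optionally
 * combined with an obsolete space map, used, and then freed:
 *
 *	uint32_t *counts = vdev_indirect_mapping_load_obsolete_counts(vim);
 *	if (obsolete_sm != NULL) {
 *		vdev_indirect_mapping_load_obsolete_spacemap(vim, counts,
 *		    obsolete_sm);
 *	}
 *	... counts[i] now holds the obsolete bytes for entry i ...
 *	vdev_indirect_mapping_free_obsolete_counts(vim, counts);
 */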