1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or https://opensource.org/licenses/CDDL-1.0.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25/*
26 * Copyright (c) 2013, 2019 by Delphix. All rights reserved.
27 * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved.
28 */
29
30#include <sys/zfs_context.h>
31#include <sys/spa.h>
32#include <sys/dmu.h>
33#include <sys/dnode.h>
34#include <sys/zio.h>
35#include <sys/range_tree.h>
36
37/*
38 * Range trees are tree-based data structures that can be used to
39 * track free space or generally any space allocation information.
40 * A range tree keeps track of individual segments and automatically
41 * provides facilities such as adjacent extent merging and extent
42 * splitting in response to range add/remove requests.
43 *
44 * A range tree starts out completely empty, with no segments in it.
45 * Adding an allocation via range_tree_add to the range tree can either:
46 * 1) create a new extent
47 * 2) extend an adjacent extent
48 * 3) merge two adjacent extents
49 * Conversely, removing an allocation via range_tree_remove can:
50 * 1) completely remove an extent
51 * 2) shorten an extent (if the allocation was near one of its ends)
52 * 3) split an extent into two extents, in effect punching a hole
53 *
54 * A range tree is also capable of 'bridging' gaps when adding
55 * allocations. This is useful for cases when close proximity of
56 * allocations is an important detail that needs to be represented
57 * in the range tree. See range_tree_set_gap(). The default behavior
58 * is not to bridge gaps (i.e. the maximum allowed gap size is 0).
59 *
60 * In order to traverse a range tree, use either the range_tree_walk()
61 * or range_tree_vacate() functions.
62 *
63 * To obtain more accurate information on individual segment
64 * operations that the range tree performs "under the hood", you can
65 * specify a set of callbacks by passing a range_tree_ops_t structure
66 * to the range_tree_create function. Any callbacks that are non-NULL
67 * are then called at the appropriate times.
68 *
69 * The range tree code also supports a special variant of range trees
70 * that can bridge small gaps between segments. This kind of tree is used
71 * by the dsl scanning code to group I/Os into mostly sequential chunks to
72 * optimize disk performance. The code here attempts to do this with as
73 * little memory and computational overhead as possible. One limitation of
74 * this implementation is that segments of range trees with gaps can only
75 * support removing complete segments.
76 */
77
78static inline void
79rs_copy(range_seg_t *src, range_seg_t *dest, range_tree_t *rt)
80{
81	ASSERT3U(rt->rt_type, <, RANGE_SEG_NUM_TYPES);
82	size_t size = 0;
83	switch (rt->rt_type) {
84	case RANGE_SEG32:
85		size = sizeof (range_seg32_t);
86		break;
87	case RANGE_SEG64:
88		size = sizeof (range_seg64_t);
89		break;
90	case RANGE_SEG_GAP:
91		size = sizeof (range_seg_gap_t);
92		break;
93	default:
94		__builtin_unreachable();
95	}
96	memcpy(dest, src, size);
97}
98
99void
100range_tree_stat_verify(range_tree_t *rt)
101{
102	range_seg_t *rs;
103	zfs_btree_index_t where;
104	uint64_t hist[RANGE_TREE_HISTOGRAM_SIZE] = { 0 };
105	int i;
106
107	for (rs = zfs_btree_first(&rt->rt_root, &where); rs != NULL;
108	    rs = zfs_btree_next(&rt->rt_root, &where, &where)) {
109		uint64_t size = rs_get_end(rs, rt) - rs_get_start(rs, rt);
110		int idx	= highbit64(size) - 1;
111
112		hist[idx]++;
113		ASSERT3U(hist[idx], !=, 0);
114	}
115
116	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
117		if (hist[i] != rt->rt_histogram[i]) {
118			zfs_dbgmsg("i=%d, hist=%px, hist=%llu, rt_hist=%llu",
119			    i, hist, (u_longlong_t)hist[i],
120			    (u_longlong_t)rt->rt_histogram[i]);
121		}
122		VERIFY3U(hist[i], ==, rt->rt_histogram[i]);
123	}
124}
125
126static void
127range_tree_stat_incr(range_tree_t *rt, range_seg_t *rs)
128{
129	uint64_t size = rs_get_end(rs, rt) - rs_get_start(rs, rt);
130	int idx = highbit64(size) - 1;
131
132	ASSERT(size != 0);
133	ASSERT3U(idx, <,
134	    sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));
135
136	rt->rt_histogram[idx]++;
137	ASSERT3U(rt->rt_histogram[idx], !=, 0);
138}
139
140static void
141range_tree_stat_decr(range_tree_t *rt, range_seg_t *rs)
142{
143	uint64_t size = rs_get_end(rs, rt) - rs_get_start(rs, rt);
144	int idx = highbit64(size) - 1;
145
146	ASSERT(size != 0);
147	ASSERT3U(idx, <,
148	    sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));
149
150	ASSERT3U(rt->rt_histogram[idx], !=, 0);
151	rt->rt_histogram[idx]--;
152}
153
154__attribute__((always_inline)) inline
155static int
156range_tree_seg32_compare(const void *x1, const void *x2)
157{
158	const range_seg32_t *r1 = x1;
159	const range_seg32_t *r2 = x2;
160
161	ASSERT3U(r1->rs_start, <=, r1->rs_end);
162	ASSERT3U(r2->rs_start, <=, r2->rs_end);
163
164	return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
165}
166
167__attribute__((always_inline)) inline
168static int
169range_tree_seg64_compare(const void *x1, const void *x2)
170{
171	const range_seg64_t *r1 = x1;
172	const range_seg64_t *r2 = x2;
173
174	ASSERT3U(r1->rs_start, <=, r1->rs_end);
175	ASSERT3U(r2->rs_start, <=, r2->rs_end);
176
177	return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
178}
179
180__attribute__((always_inline)) inline
181static int
182range_tree_seg_gap_compare(const void *x1, const void *x2)
183{
184	const range_seg_gap_t *r1 = x1;
185	const range_seg_gap_t *r2 = x2;
186
187	ASSERT3U(r1->rs_start, <=, r1->rs_end);
188	ASSERT3U(r2->rs_start, <=, r2->rs_end);
189
190	return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
191}
192
193ZFS_BTREE_FIND_IN_BUF_FUNC(range_tree_seg32_find_in_buf, range_seg32_t,
194    range_tree_seg32_compare)
195
196ZFS_BTREE_FIND_IN_BUF_FUNC(range_tree_seg64_find_in_buf, range_seg64_t,
197    range_tree_seg64_compare)
198
199ZFS_BTREE_FIND_IN_BUF_FUNC(range_tree_seg_gap_find_in_buf, range_seg_gap_t,
200    range_tree_seg_gap_compare)
201
202range_tree_t *
203range_tree_create_gap(const range_tree_ops_t *ops, range_seg_type_t type,
204    void *arg, uint64_t start, uint64_t shift, uint64_t gap)
205{
206	range_tree_t *rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP);
207
208	ASSERT3U(shift, <, 64);
209	ASSERT3U(type, <=, RANGE_SEG_NUM_TYPES);
210	size_t size;
211	int (*compare) (const void *, const void *);
212	bt_find_in_buf_f bt_find;
213	switch (type) {
214	case RANGE_SEG32:
215		size = sizeof (range_seg32_t);
216		compare = range_tree_seg32_compare;
217		bt_find = range_tree_seg32_find_in_buf;
218		break;
219	case RANGE_SEG64:
220		size = sizeof (range_seg64_t);
221		compare = range_tree_seg64_compare;
222		bt_find = range_tree_seg64_find_in_buf;
223		break;
224	case RANGE_SEG_GAP:
225		size = sizeof (range_seg_gap_t);
226		compare = range_tree_seg_gap_compare;
227		bt_find = range_tree_seg_gap_find_in_buf;
228		break;
229	default:
230		panic("Invalid range seg type %d", type);
231	}
232	zfs_btree_create(&rt->rt_root, compare, bt_find, size);
233
234	rt->rt_ops = ops;
235	rt->rt_gap = gap;
236	rt->rt_arg = arg;
237	rt->rt_type = type;
238	rt->rt_start = start;
239	rt->rt_shift = shift;
240
241	if (rt->rt_ops != NULL && rt->rt_ops->rtop_create != NULL)
242		rt->rt_ops->rtop_create(rt, rt->rt_arg);
243
244	return (rt);
245}
246
247range_tree_t *
248range_tree_create(const range_tree_ops_t *ops, range_seg_type_t type,
249    void *arg, uint64_t start, uint64_t shift)
250{
251	return (range_tree_create_gap(ops, type, arg, start, shift, 0));
252}
253
254void
255range_tree_destroy(range_tree_t *rt)
256{
257	VERIFY0(rt->rt_space);
258
259	if (rt->rt_ops != NULL && rt->rt_ops->rtop_destroy != NULL)
260		rt->rt_ops->rtop_destroy(rt, rt->rt_arg);
261
262	zfs_btree_destroy(&rt->rt_root);
263	kmem_free(rt, sizeof (*rt));
264}
265
266void
267range_tree_adjust_fill(range_tree_t *rt, range_seg_t *rs, int64_t delta)
268{
269	if (delta < 0 && delta * -1 >= rs_get_fill(rs, rt)) {
270		zfs_panic_recover("zfs: attempting to decrease fill to or "
271		    "below 0; probable double remove in segment [%llx:%llx]",
272		    (longlong_t)rs_get_start(rs, rt),
273		    (longlong_t)rs_get_end(rs, rt));
274	}
275	if (rs_get_fill(rs, rt) + delta > rs_get_end(rs, rt) -
276	    rs_get_start(rs, rt)) {
277		zfs_panic_recover("zfs: attempting to increase fill beyond "
278		    "max; probable double add in segment [%llx:%llx]",
279		    (longlong_t)rs_get_start(rs, rt),
280		    (longlong_t)rs_get_end(rs, rt));
281	}
282
283	if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
284		rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
285	rs_set_fill(rs, rt, rs_get_fill(rs, rt) + delta);
286	if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
287		rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
288}
289
290static void
291range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill)
292{
293	range_tree_t *rt = arg;
294	zfs_btree_index_t where;
295	range_seg_t *rs_before, *rs_after, *rs;
296	range_seg_max_t tmp, rsearch;
297	uint64_t end = start + size, gap = rt->rt_gap;
298	uint64_t bridge_size = 0;
299	boolean_t merge_before, merge_after;
300
301	ASSERT3U(size, !=, 0);
302	ASSERT3U(fill, <=, size);
303	ASSERT3U(start + size, >, start);
304
305	rs_set_start(&rsearch, rt, start);
306	rs_set_end(&rsearch, rt, end);
307	rs = zfs_btree_find(&rt->rt_root, &rsearch, &where);
308
309	/*
310	 * If this is a gap-supporting range tree, it is possible that we
311	 * are inserting into an existing segment. In this case simply
312	 * bump the fill count and call the remove / add callbacks. If the
313	 * new range will extend an existing segment, we remove the
314	 * existing one, apply the new extent to it and re-insert it using
315	 * the normal code paths.
316	 */
317	if (rs != NULL) {
318		if (gap == 0) {
319			zfs_panic_recover("zfs: adding existent segment to "
320			    "range tree (offset=%llx size=%llx)",
321			    (longlong_t)start, (longlong_t)size);
322			return;
323		}
324		uint64_t rstart = rs_get_start(rs, rt);
325		uint64_t rend = rs_get_end(rs, rt);
326		if (rstart <= start && rend >= end) {
327			range_tree_adjust_fill(rt, rs, fill);
328			return;
329		}
330
331		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
332			rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
333
334		range_tree_stat_decr(rt, rs);
335		rt->rt_space -= rend - rstart;
336
337		fill += rs_get_fill(rs, rt);
338		start = MIN(start, rstart);
339		end = MAX(end, rend);
340		size = end - start;
341
342		zfs_btree_remove(&rt->rt_root, rs);
343		range_tree_add_impl(rt, start, size, fill);
344		return;
345	}
346
347	ASSERT3P(rs, ==, NULL);
348
349	/*
350	 * Determine whether or not we will have to merge with our neighbors.
351	 * If gap != 0, we might need to merge with our neighbors even if we
352	 * aren't directly touching.
353	 */
354	zfs_btree_index_t where_before, where_after;
355	rs_before = zfs_btree_prev(&rt->rt_root, &where, &where_before);
356	rs_after = zfs_btree_next(&rt->rt_root, &where, &where_after);
357
358	merge_before = (rs_before != NULL && rs_get_end(rs_before, rt) >=
359	    start - gap);
360	merge_after = (rs_after != NULL && rs_get_start(rs_after, rt) <= end +
361	    gap);
362
363	if (merge_before && gap != 0)
364		bridge_size += start - rs_get_end(rs_before, rt);
365	if (merge_after && gap != 0)
366		bridge_size += rs_get_start(rs_after, rt) - end;
367
368	if (merge_before && merge_after) {
369		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) {
370			rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
371			rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
372		}
373
374		range_tree_stat_decr(rt, rs_before);
375		range_tree_stat_decr(rt, rs_after);
376
377		rs_copy(rs_after, &tmp, rt);
378		uint64_t before_start = rs_get_start_raw(rs_before, rt);
379		uint64_t before_fill = rs_get_fill(rs_before, rt);
380		uint64_t after_fill = rs_get_fill(rs_after, rt);
381		zfs_btree_remove_idx(&rt->rt_root, &where_before);
382
383		/*
384		 * We have to re-find the node because our old reference is
385		 * invalid as soon as we do any mutating btree operations.
386		 */
387		rs_after = zfs_btree_find(&rt->rt_root, &tmp, &where_after);
388		ASSERT3P(rs_after, !=, NULL);
389		rs_set_start_raw(rs_after, rt, before_start);
390		rs_set_fill(rs_after, rt, after_fill + before_fill + fill);
391		rs = rs_after;
392	} else if (merge_before) {
393		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
394			rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
395
396		range_tree_stat_decr(rt, rs_before);
397
398		uint64_t before_fill = rs_get_fill(rs_before, rt);
399		rs_set_end(rs_before, rt, end);
400		rs_set_fill(rs_before, rt, before_fill + fill);
401		rs = rs_before;
402	} else if (merge_after) {
403		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
404			rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
405
406		range_tree_stat_decr(rt, rs_after);
407
408		uint64_t after_fill = rs_get_fill(rs_after, rt);
409		rs_set_start(rs_after, rt, start);
410		rs_set_fill(rs_after, rt, after_fill + fill);
411		rs = rs_after;
412	} else {
413		rs = &tmp;
414
415		rs_set_start(rs, rt, start);
416		rs_set_end(rs, rt, end);
417		rs_set_fill(rs, rt, fill);
418		zfs_btree_add_idx(&rt->rt_root, rs, &where);
419	}
420
421	if (gap != 0) {
422		ASSERT3U(rs_get_fill(rs, rt), <=, rs_get_end(rs, rt) -
423		    rs_get_start(rs, rt));
424	} else {
425		ASSERT3U(rs_get_fill(rs, rt), ==, rs_get_end(rs, rt) -
426		    rs_get_start(rs, rt));
427	}
428
429	if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
430		rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
431
432	range_tree_stat_incr(rt, rs);
433	rt->rt_space += size + bridge_size;
434}
435
436void
437range_tree_add(void *arg, uint64_t start, uint64_t size)
438{
439	range_tree_add_impl(arg, start, size, size);
440}
441
442static void
443range_tree_remove_impl(range_tree_t *rt, uint64_t start, uint64_t size,
444    boolean_t do_fill)
445{
446	zfs_btree_index_t where;
447	range_seg_t *rs;
448	range_seg_max_t rsearch, rs_tmp;
449	uint64_t end = start + size;
450	boolean_t left_over, right_over;
451
452	VERIFY3U(size, !=, 0);
453	VERIFY3U(size, <=, rt->rt_space);
454	if (rt->rt_type == RANGE_SEG64)
455		ASSERT3U(start + size, >, start);
456
457	rs_set_start(&rsearch, rt, start);
458	rs_set_end(&rsearch, rt, end);
459	rs = zfs_btree_find(&rt->rt_root, &rsearch, &where);
460
461	/* Make sure we completely overlap with someone */
462	if (rs == NULL) {
463		zfs_panic_recover("zfs: removing nonexistent segment from "
464		    "range tree (offset=%llx size=%llx)",
465		    (longlong_t)start, (longlong_t)size);
466		return;
467	}
468
469	/*
470	 * Range trees with gap support must only remove complete segments
471	 * from the tree. This allows us to maintain accurate fill accounting
472	 * and to ensure that bridged sections are not leaked. If we need to
473	 * remove less than the full segment, we can only adjust the fill count.
474	 */
475	if (rt->rt_gap != 0) {
476		if (do_fill) {
477			if (rs_get_fill(rs, rt) == size) {
478				start = rs_get_start(rs, rt);
479				end = rs_get_end(rs, rt);
480				size = end - start;
481			} else {
482				range_tree_adjust_fill(rt, rs, -size);
483				return;
484			}
485		} else if (rs_get_start(rs, rt) != start ||
486		    rs_get_end(rs, rt) != end) {
487			zfs_panic_recover("zfs: freeing partial segment of "
488			    "gap tree (offset=%llx size=%llx) of "
489			    "(offset=%llx size=%llx)",
490			    (longlong_t)start, (longlong_t)size,
491			    (longlong_t)rs_get_start(rs, rt),
492			    (longlong_t)rs_get_end(rs, rt) - rs_get_start(rs,
493			    rt));
494			return;
495		}
496	}
497
498	VERIFY3U(rs_get_start(rs, rt), <=, start);
499	VERIFY3U(rs_get_end(rs, rt), >=, end);
500
501	left_over = (rs_get_start(rs, rt) != start);
502	right_over = (rs_get_end(rs, rt) != end);
503
504	range_tree_stat_decr(rt, rs);
505
506	if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
507		rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
508
509	if (left_over && right_over) {
510		range_seg_max_t newseg;
511		rs_set_start(&newseg, rt, end);
512		rs_set_end_raw(&newseg, rt, rs_get_end_raw(rs, rt));
513		rs_set_fill(&newseg, rt, rs_get_end(rs, rt) - end);
514		range_tree_stat_incr(rt, &newseg);
515
516		// This modifies the buffer already inside the range tree
517		rs_set_end(rs, rt, start);
518
519		rs_copy(rs, &rs_tmp, rt);
520		if (zfs_btree_next(&rt->rt_root, &where, &where) != NULL)
521			zfs_btree_add_idx(&rt->rt_root, &newseg, &where);
522		else
523			zfs_btree_add(&rt->rt_root, &newseg);
524
525		if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
526			rt->rt_ops->rtop_add(rt, &newseg, rt->rt_arg);
527	} else if (left_over) {
528		// This modifies the buffer already inside the range tree
529		rs_set_end(rs, rt, start);
530		rs_copy(rs, &rs_tmp, rt);
531	} else if (right_over) {
532		// This modifies the buffer already inside the range tree
533		rs_set_start(rs, rt, end);
534		rs_copy(rs, &rs_tmp, rt);
535	} else {
536		zfs_btree_remove_idx(&rt->rt_root, &where);
537		rs = NULL;
538	}
539
540	if (rs != NULL) {
541		/*
542		 * The fill of the leftover segment will always be equal to
543		 * the size, since we do not support removing partial segments
544		 * of range trees with gaps.
545		 */
546		rs_set_fill_raw(rs, rt, rs_get_end_raw(rs, rt) -
547		    rs_get_start_raw(rs, rt));
548		range_tree_stat_incr(rt, &rs_tmp);
549
550		if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
551			rt->rt_ops->rtop_add(rt, &rs_tmp, rt->rt_arg);
552	}
553
554	rt->rt_space -= size;
555}
556
557void
558range_tree_remove(void *arg, uint64_t start, uint64_t size)
559{
560	range_tree_remove_impl(arg, start, size, B_FALSE);
561}
562
563void
564range_tree_remove_fill(range_tree_t *rt, uint64_t start, uint64_t size)
565{
566	range_tree_remove_impl(rt, start, size, B_TRUE);
567}
568
569void
570range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs,
571    uint64_t newstart, uint64_t newsize)
572{
573	int64_t delta = newsize - (rs_get_end(rs, rt) - rs_get_start(rs, rt));
574
575	range_tree_stat_decr(rt, rs);
576	if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
577		rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
578
579	rs_set_start(rs, rt, newstart);
580	rs_set_end(rs, rt, newstart + newsize);
581
582	range_tree_stat_incr(rt, rs);
583	if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
584		rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
585
586	rt->rt_space += delta;
587}
588
589static range_seg_t *
590range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size)
591{
592	range_seg_max_t rsearch;
593	uint64_t end = start + size;
594
595	VERIFY(size != 0);
596
597	rs_set_start(&rsearch, rt, start);
598	rs_set_end(&rsearch, rt, end);
599	return (zfs_btree_find(&rt->rt_root, &rsearch, NULL));
600}
601
602range_seg_t *
603range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size)
604{
605	if (rt->rt_type == RANGE_SEG64)
606		ASSERT3U(start + size, >, start);
607
608	range_seg_t *rs = range_tree_find_impl(rt, start, size);
609	if (rs != NULL && rs_get_start(rs, rt) <= start &&
610	    rs_get_end(rs, rt) >= start + size) {
611		return (rs);
612	}
613	return (NULL);
614}
615
616void
617range_tree_verify_not_present(range_tree_t *rt, uint64_t off, uint64_t size)
618{
619	range_seg_t *rs = range_tree_find(rt, off, size);
620	if (rs != NULL)
621		panic("segment already in tree; rs=%p", (void *)rs);
622}
623
624boolean_t
625range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size)
626{
627	return (range_tree_find(rt, start, size) != NULL);
628}
629
630/*
631 * Returns the first subset of the given range which overlaps with the range
632 * tree. Returns true if there is a segment in the range, and false if there
633 * isn't.
634 */
635boolean_t
636range_tree_find_in(range_tree_t *rt, uint64_t start, uint64_t size,
637    uint64_t *ostart, uint64_t *osize)
638{
639	if (rt->rt_type == RANGE_SEG64)
640		ASSERT3U(start + size, >, start);
641
642	range_seg_max_t rsearch;
643	rs_set_start(&rsearch, rt, start);
644	rs_set_end_raw(&rsearch, rt, rs_get_start_raw(&rsearch, rt) + 1);
645
646	zfs_btree_index_t where;
647	range_seg_t *rs = zfs_btree_find(&rt->rt_root, &rsearch, &where);
648	if (rs != NULL) {
649		*ostart = start;
650		*osize = MIN(size, rs_get_end(rs, rt) - start);
651		return (B_TRUE);
652	}
653
654	rs = zfs_btree_next(&rt->rt_root, &where, &where);
655	if (rs == NULL || rs_get_start(rs, rt) > start + size)
656		return (B_FALSE);
657
658	*ostart = rs_get_start(rs, rt);
659	*osize = MIN(start + size, rs_get_end(rs, rt)) -
660	    rs_get_start(rs, rt);
661	return (B_TRUE);
662}
663
664/*
665 * Ensure that this range is not in the tree, regardless of whether
666 * it is currently in the tree.
667 */
668void
669range_tree_clear(range_tree_t *rt, uint64_t start, uint64_t size)
670{
671	range_seg_t *rs;
672
673	if (size == 0)
674		return;
675
676	if (rt->rt_type == RANGE_SEG64)
677		ASSERT3U(start + size, >, start);
678
679	while ((rs = range_tree_find_impl(rt, start, size)) != NULL) {
680		uint64_t free_start = MAX(rs_get_start(rs, rt), start);
681		uint64_t free_end = MIN(rs_get_end(rs, rt), start + size);
682		range_tree_remove(rt, free_start, free_end - free_start);
683	}
684}
685
686void
687range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst)
688{
689	range_tree_t *rt;
690
691	ASSERT0(range_tree_space(*rtdst));
692	ASSERT0(zfs_btree_numnodes(&(*rtdst)->rt_root));
693
694	rt = *rtsrc;
695	*rtsrc = *rtdst;
696	*rtdst = rt;
697}
698
699void
700range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg)
701{
702	if (rt->rt_ops != NULL && rt->rt_ops->rtop_vacate != NULL)
703		rt->rt_ops->rtop_vacate(rt, rt->rt_arg);
704
705	if (func != NULL) {
706		range_seg_t *rs;
707		zfs_btree_index_t *cookie = NULL;
708
709		while ((rs = zfs_btree_destroy_nodes(&rt->rt_root, &cookie)) !=
710		    NULL) {
711			func(arg, rs_get_start(rs, rt), rs_get_end(rs, rt) -
712			    rs_get_start(rs, rt));
713		}
714	} else {
715		zfs_btree_clear(&rt->rt_root);
716	}
717
718	memset(rt->rt_histogram, 0, sizeof (rt->rt_histogram));
719	rt->rt_space = 0;
720}
721
722void
723range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg)
724{
725	zfs_btree_index_t where;
726	for (range_seg_t *rs = zfs_btree_first(&rt->rt_root, &where);
727	    rs != NULL; rs = zfs_btree_next(&rt->rt_root, &where, &where)) {
728		func(arg, rs_get_start(rs, rt), rs_get_end(rs, rt) -
729		    rs_get_start(rs, rt));
730	}
731}
732
733range_seg_t *
734range_tree_first(range_tree_t *rt)
735{
736	return (zfs_btree_first(&rt->rt_root, NULL));
737}
738
739uint64_t
740range_tree_space(range_tree_t *rt)
741{
742	return (rt->rt_space);
743}
744
745uint64_t
746range_tree_numsegs(range_tree_t *rt)
747{
748	return ((rt == NULL) ? 0 : zfs_btree_numnodes(&rt->rt_root));
749}
750
751boolean_t
752range_tree_is_empty(range_tree_t *rt)
753{
754	ASSERT(rt != NULL);
755	return (range_tree_space(rt) == 0);
756}
757
758/*
759 * Remove any overlapping ranges between the given segment [start, end)
760 * from removefrom. Add non-overlapping leftovers to addto.
761 */
762void
763range_tree_remove_xor_add_segment(uint64_t start, uint64_t end,
764    range_tree_t *removefrom, range_tree_t *addto)
765{
766	zfs_btree_index_t where;
767	range_seg_max_t starting_rs;
768	rs_set_start(&starting_rs, removefrom, start);
769	rs_set_end_raw(&starting_rs, removefrom, rs_get_start_raw(&starting_rs,
770	    removefrom) + 1);
771
772	range_seg_t *curr = zfs_btree_find(&removefrom->rt_root,
773	    &starting_rs, &where);
774
775	if (curr == NULL)
776		curr = zfs_btree_next(&removefrom->rt_root, &where, &where);
777
778	range_seg_t *next;
779	for (; curr != NULL; curr = next) {
780		if (start == end)
781			return;
782		VERIFY3U(start, <, end);
783
784		/* there is no overlap */
785		if (end <= rs_get_start(curr, removefrom)) {
786			range_tree_add(addto, start, end - start);
787			return;
788		}
789
790		uint64_t overlap_start = MAX(rs_get_start(curr, removefrom),
791		    start);
792		uint64_t overlap_end = MIN(rs_get_end(curr, removefrom),
793		    end);
794		uint64_t overlap_size = overlap_end - overlap_start;
795		ASSERT3S(overlap_size, >, 0);
796		range_seg_max_t rs;
797		rs_copy(curr, &rs, removefrom);
798
799		range_tree_remove(removefrom, overlap_start, overlap_size);
800
801		if (start < overlap_start)
802			range_tree_add(addto, start, overlap_start - start);
803
804		start = overlap_end;
805		next = zfs_btree_find(&removefrom->rt_root, &rs, &where);
806		/*
807		 * If we find something here, we only removed part of the
808		 * curr segment. Either there's some left at the end
809		 * because we've reached the end of the range we're removing,
810		 * or there's some left at the start because we started
811		 * partway through the range.  Either way, we continue with
812		 * the loop. If it's the former, we'll return at the start of
813		 * the loop, and if it's the latter we'll see if there is more
814		 * area to process.
815		 */
816		if (next != NULL) {
817			ASSERT(start == end || start == rs_get_end(&rs,
818			    removefrom));
819		}
820
821		next = zfs_btree_next(&removefrom->rt_root, &where, &where);
822	}
823	VERIFY3P(curr, ==, NULL);
824
825	if (start != end) {
826		VERIFY3U(start, <, end);
827		range_tree_add(addto, start, end - start);
828	} else {
829		VERIFY3U(start, ==, end);
830	}
831}
832
833/*
834 * For each entry in rt, if it exists in removefrom, remove it
835 * from removefrom. Otherwise, add it to addto.
836 */
837void
838range_tree_remove_xor_add(range_tree_t *rt, range_tree_t *removefrom,
839    range_tree_t *addto)
840{
841	zfs_btree_index_t where;
842	for (range_seg_t *rs = zfs_btree_first(&rt->rt_root, &where); rs;
843	    rs = zfs_btree_next(&rt->rt_root, &where, &where)) {
844		range_tree_remove_xor_add_segment(rs_get_start(rs, rt),
845		    rs_get_end(rs, rt), removefrom, addto);
846	}
847}
848
849uint64_t
850range_tree_min(range_tree_t *rt)
851{
852	range_seg_t *rs = zfs_btree_first(&rt->rt_root, NULL);
853	return (rs != NULL ? rs_get_start(rs, rt) : 0);
854}
855
856uint64_t
857range_tree_max(range_tree_t *rt)
858{
859	range_seg_t *rs = zfs_btree_last(&rt->rt_root, NULL);
860	return (rs != NULL ? rs_get_end(rs, rt) : 0);
861}
862
863uint64_t
864range_tree_span(range_tree_t *rt)
865{
866	return (range_tree_max(rt) - range_tree_min(rt));
867}
868