1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Copyright (C) 2011 Red Hat, Inc.
4 *
5 * This file is released under the GPL.
6 */
7
8#include "dm-btree.h"
9#include "dm-btree-internal.h"
10#include "dm-transaction-manager.h"
11
12#include <linux/export.h>
13#include <linux/device-mapper.h>
14
15#define DM_MSG_PREFIX "btree"
16
17/*
18 * Removing an entry from a btree
19 * ==============================
20 *
21 * A very important constraint for our btree is that no node, except the
22 * root, may have fewer than a certain number of entries.
23 * (MIN_ENTRIES <= nr_entries <= MAX_ENTRIES).
24 *
25 * Ensuring this is complicated by the way we want to only ever hold the
26 * locks on 2 nodes concurrently, and only change nodes in a top to bottom
27 * fashion.
28 *
29 * Each node may have a left or right sibling.  When decending the spine,
30 * if a node contains only MIN_ENTRIES then we try and increase this to at
31 * least MIN_ENTRIES + 1.  We do this in the following ways:
32 *
33 * [A] No siblings => this can only happen if the node is the root, in which
34 *     case we copy the childs contents over the root.
35 *
36 * [B] No left sibling
37 *     ==> rebalance(node, right sibling)
38 *
39 * [C] No right sibling
40 *     ==> rebalance(left sibling, node)
41 *
42 * [D] Both siblings, total_entries(left, node, right) <= DEL_THRESHOLD
43 *     ==> delete node adding it's contents to left and right
44 *
45 * [E] Both siblings, total_entries(left, node, right) > DEL_THRESHOLD
46 *     ==> rebalance(left, node, right)
47 *
48 * After these operations it's possible that the our original node no
49 * longer contains the desired sub tree.  For this reason this rebalancing
50 * is performed on the children of the current node.  This also avoids
51 * having a special case for the root.
52 *
53 * Once this rebalancing has occurred we can then step into the child node
54 * for internal nodes.  Or delete the entry for leaf nodes.
55 */
56
57/*
58 * Some little utilities for moving node data around.
59 */
60static void node_shift(struct btree_node *n, int shift)
61{
62	uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);
63	uint32_t value_size = le32_to_cpu(n->header.value_size);
64
65	if (shift < 0) {
66		shift = -shift;
67		BUG_ON(shift > nr_entries);
68		BUG_ON((void *) key_ptr(n, shift) >= value_ptr(n, shift));
69		memmove(key_ptr(n, 0),
70			key_ptr(n, shift),
71			(nr_entries - shift) * sizeof(__le64));
72		memmove(value_ptr(n, 0),
73			value_ptr(n, shift),
74			(nr_entries - shift) * value_size);
75	} else {
76		BUG_ON(nr_entries + shift > le32_to_cpu(n->header.max_entries));
77		memmove(key_ptr(n, shift),
78			key_ptr(n, 0),
79			nr_entries * sizeof(__le64));
80		memmove(value_ptr(n, shift),
81			value_ptr(n, 0),
82			nr_entries * value_size);
83	}
84}
85
86static int node_copy(struct btree_node *left, struct btree_node *right, int shift)
87{
88	uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
89	uint32_t value_size = le32_to_cpu(left->header.value_size);
90
91	if (value_size != le32_to_cpu(right->header.value_size)) {
92		DMERR("mismatched value size");
93		return -EILSEQ;
94	}
95
96	if (shift < 0) {
97		shift = -shift;
98
99		if (nr_left + shift > le32_to_cpu(left->header.max_entries)) {
100			DMERR("bad shift");
101			return -EINVAL;
102		}
103
104		memcpy(key_ptr(left, nr_left),
105		       key_ptr(right, 0),
106		       shift * sizeof(__le64));
107		memcpy(value_ptr(left, nr_left),
108		       value_ptr(right, 0),
109		       shift * value_size);
110	} else {
111		if (shift > le32_to_cpu(right->header.max_entries)) {
112			DMERR("bad shift");
113			return -EINVAL;
114		}
115
116		memcpy(key_ptr(right, 0),
117		       key_ptr(left, nr_left - shift),
118		       shift * sizeof(__le64));
119		memcpy(value_ptr(right, 0),
120		       value_ptr(left, nr_left - shift),
121		       shift * value_size);
122	}
123	return 0;
124}
125
126/*
127 * Delete a specific entry from a leaf node.
128 */
129static void delete_at(struct btree_node *n, unsigned int index)
130{
131	unsigned int nr_entries = le32_to_cpu(n->header.nr_entries);
132	unsigned int nr_to_copy = nr_entries - (index + 1);
133	uint32_t value_size = le32_to_cpu(n->header.value_size);
134
135	BUG_ON(index >= nr_entries);
136
137	if (nr_to_copy) {
138		memmove(key_ptr(n, index),
139			key_ptr(n, index + 1),
140			nr_to_copy * sizeof(__le64));
141
142		memmove(value_ptr(n, index),
143			value_ptr(n, index + 1),
144			nr_to_copy * value_size);
145	}
146
147	n->header.nr_entries = cpu_to_le32(nr_entries - 1);
148}
149
150static unsigned int merge_threshold(struct btree_node *n)
151{
152	return le32_to_cpu(n->header.max_entries) / 3;
153}
154
155struct child {
156	unsigned int index;
157	struct dm_block *block;
158	struct btree_node *n;
159};
160
161static int init_child(struct dm_btree_info *info, struct dm_btree_value_type *vt,
162		      struct btree_node *parent,
163		      unsigned int index, struct child *result)
164{
165	int r, inc;
166	dm_block_t root;
167
168	result->index = index;
169	root = value64(parent, index);
170
171	r = dm_tm_shadow_block(info->tm, root, &btree_node_validator,
172			       &result->block, &inc);
173	if (r)
174		return r;
175
176	result->n = dm_block_data(result->block);
177
178	if (inc)
179		inc_children(info->tm, result->n, vt);
180
181	*((__le64 *) value_ptr(parent, index)) =
182		cpu_to_le64(dm_block_location(result->block));
183
184	return 0;
185}
186
187static void exit_child(struct dm_btree_info *info, struct child *c)
188{
189	dm_tm_unlock(info->tm, c->block);
190}
191
192static int shift(struct btree_node *left, struct btree_node *right, int count)
193{
194	int r;
195	uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
196	uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
197	uint32_t max_entries = le32_to_cpu(left->header.max_entries);
198	uint32_t r_max_entries = le32_to_cpu(right->header.max_entries);
199
200	if (max_entries != r_max_entries) {
201		DMERR("node max_entries mismatch");
202		return -EILSEQ;
203	}
204
205	if (nr_left - count > max_entries) {
206		DMERR("node shift out of bounds");
207		return -EINVAL;
208	}
209
210	if (nr_right + count > max_entries) {
211		DMERR("node shift out of bounds");
212		return -EINVAL;
213	}
214
215	if (!count)
216		return 0;
217
218	if (count > 0) {
219		node_shift(right, count);
220		r = node_copy(left, right, count);
221		if (r)
222			return r;
223	} else {
224		r = node_copy(left, right, count);
225		if (r)
226			return r;
227		node_shift(right, count);
228	}
229
230	left->header.nr_entries = cpu_to_le32(nr_left - count);
231	right->header.nr_entries = cpu_to_le32(nr_right + count);
232
233	return 0;
234}
235
236static int __rebalance2(struct dm_btree_info *info, struct btree_node *parent,
237			struct child *l, struct child *r)
238{
239	int ret;
240	struct btree_node *left = l->n;
241	struct btree_node *right = r->n;
242	uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
243	uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
244	/*
245	 * Ensure the number of entries in each child will be greater
246	 * than or equal to (max_entries / 3 + 1), so no matter which
247	 * child is used for removal, the number will still be not
248	 * less than (max_entries / 3).
249	 */
250	unsigned int threshold = 2 * (merge_threshold(left) + 1);
251
252	if (nr_left + nr_right < threshold) {
253		/*
254		 * Merge
255		 */
256		node_copy(left, right, -nr_right);
257		left->header.nr_entries = cpu_to_le32(nr_left + nr_right);
258		delete_at(parent, r->index);
259
260		/*
261		 * We need to decrement the right block, but not it's
262		 * children, since they're still referenced by left.
263		 */
264		dm_tm_dec(info->tm, dm_block_location(r->block));
265	} else {
266		/*
267		 * Rebalance.
268		 */
269		unsigned int target_left = (nr_left + nr_right) / 2;
270
271		ret = shift(left, right, nr_left - target_left);
272		if (ret)
273			return ret;
274		*key_ptr(parent, r->index) = right->keys[0];
275	}
276	return 0;
277}
278
279static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info,
280		      struct dm_btree_value_type *vt, unsigned int left_index)
281{
282	int r;
283	struct btree_node *parent;
284	struct child left, right;
285
286	parent = dm_block_data(shadow_current(s));
287
288	r = init_child(info, vt, parent, left_index, &left);
289	if (r)
290		return r;
291
292	r = init_child(info, vt, parent, left_index + 1, &right);
293	if (r) {
294		exit_child(info, &left);
295		return r;
296	}
297
298	r = __rebalance2(info, parent, &left, &right);
299
300	exit_child(info, &left);
301	exit_child(info, &right);
302
303	return r;
304}
305
306/*
307 * We dump as many entries from center as possible into left, then the rest
308 * in right, then rebalance2.  This wastes some cpu, but I want something
309 * simple atm.
310 */
311static int delete_center_node(struct dm_btree_info *info, struct btree_node *parent,
312			      struct child *l, struct child *c, struct child *r,
313			      struct btree_node *left, struct btree_node *center, struct btree_node *right,
314			      uint32_t nr_left, uint32_t nr_center, uint32_t nr_right)
315{
316	uint32_t max_entries = le32_to_cpu(left->header.max_entries);
317	unsigned int shift = min(max_entries - nr_left, nr_center);
318
319	if (nr_left + shift > max_entries) {
320		DMERR("node shift out of bounds");
321		return -EINVAL;
322	}
323
324	node_copy(left, center, -shift);
325	left->header.nr_entries = cpu_to_le32(nr_left + shift);
326
327	if (shift != nr_center) {
328		shift = nr_center - shift;
329
330		if ((nr_right + shift) > max_entries) {
331			DMERR("node shift out of bounds");
332			return -EINVAL;
333		}
334
335		node_shift(right, shift);
336		node_copy(center, right, shift);
337		right->header.nr_entries = cpu_to_le32(nr_right + shift);
338	}
339	*key_ptr(parent, r->index) = right->keys[0];
340
341	delete_at(parent, c->index);
342	r->index--;
343
344	dm_tm_dec(info->tm, dm_block_location(c->block));
345	return __rebalance2(info, parent, l, r);
346}
347
348/*
349 * Redistributes entries among 3 sibling nodes.
350 */
351static int redistribute3(struct dm_btree_info *info, struct btree_node *parent,
352			 struct child *l, struct child *c, struct child *r,
353			 struct btree_node *left, struct btree_node *center, struct btree_node *right,
354			 uint32_t nr_left, uint32_t nr_center, uint32_t nr_right)
355{
356	int s, ret;
357	uint32_t max_entries = le32_to_cpu(left->header.max_entries);
358	unsigned int total = nr_left + nr_center + nr_right;
359	unsigned int target_right = total / 3;
360	unsigned int remainder = (target_right * 3) != total;
361	unsigned int target_left = target_right + remainder;
362
363	BUG_ON(target_left > max_entries);
364	BUG_ON(target_right > max_entries);
365
366	if (nr_left < nr_right) {
367		s = nr_left - target_left;
368
369		if (s < 0 && nr_center < -s) {
370			/* not enough in central node */
371			ret = shift(left, center, -nr_center);
372			if (ret)
373				return ret;
374
375			s += nr_center;
376			ret = shift(left, right, s);
377			if (ret)
378				return ret;
379
380			nr_right += s;
381		} else {
382			ret = shift(left, center, s);
383			if (ret)
384				return ret;
385		}
386
387		ret = shift(center, right, target_right - nr_right);
388		if (ret)
389			return ret;
390	} else {
391		s = target_right - nr_right;
392		if (s > 0 && nr_center < s) {
393			/* not enough in central node */
394			ret = shift(center, right, nr_center);
395			if (ret)
396				return ret;
397			s -= nr_center;
398			ret = shift(left, right, s);
399			if (ret)
400				return ret;
401			nr_left -= s;
402		} else {
403			ret = shift(center, right, s);
404			if (ret)
405				return ret;
406		}
407
408		ret = shift(left, center, nr_left - target_left);
409		if (ret)
410			return ret;
411	}
412
413	*key_ptr(parent, c->index) = center->keys[0];
414	*key_ptr(parent, r->index) = right->keys[0];
415	return 0;
416}
417
418static int __rebalance3(struct dm_btree_info *info, struct btree_node *parent,
419			struct child *l, struct child *c, struct child *r)
420{
421	struct btree_node *left = l->n;
422	struct btree_node *center = c->n;
423	struct btree_node *right = r->n;
424
425	uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
426	uint32_t nr_center = le32_to_cpu(center->header.nr_entries);
427	uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
428
429	unsigned int threshold = merge_threshold(left) * 4 + 1;
430
431	if ((left->header.max_entries != center->header.max_entries) ||
432	    (center->header.max_entries != right->header.max_entries)) {
433		DMERR("bad btree metadata, max_entries differ");
434		return -EILSEQ;
435	}
436
437	if ((nr_left + nr_center + nr_right) < threshold) {
438		return delete_center_node(info, parent, l, c, r, left, center, right,
439					  nr_left, nr_center, nr_right);
440	}
441
442	return redistribute3(info, parent, l, c, r, left, center, right,
443			     nr_left, nr_center, nr_right);
444}
445
446static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info,
447		      struct dm_btree_value_type *vt, unsigned int left_index)
448{
449	int r;
450	struct btree_node *parent = dm_block_data(shadow_current(s));
451	struct child left, center, right;
452
453	/*
454	 * FIXME: fill out an array?
455	 */
456	r = init_child(info, vt, parent, left_index, &left);
457	if (r)
458		return r;
459
460	r = init_child(info, vt, parent, left_index + 1, &center);
461	if (r) {
462		exit_child(info, &left);
463		return r;
464	}
465
466	r = init_child(info, vt, parent, left_index + 2, &right);
467	if (r) {
468		exit_child(info, &left);
469		exit_child(info, &center);
470		return r;
471	}
472
473	r = __rebalance3(info, parent, &left, &center, &right);
474
475	exit_child(info, &left);
476	exit_child(info, &center);
477	exit_child(info, &right);
478
479	return r;
480}
481
482static int rebalance_children(struct shadow_spine *s,
483			      struct dm_btree_info *info,
484			      struct dm_btree_value_type *vt, uint64_t key)
485{
486	int i, r, has_left_sibling, has_right_sibling;
487	struct btree_node *n;
488
489	n = dm_block_data(shadow_current(s));
490
491	if (le32_to_cpu(n->header.nr_entries) == 1) {
492		struct dm_block *child;
493		dm_block_t b = value64(n, 0);
494
495		r = dm_tm_read_lock(info->tm, b, &btree_node_validator, &child);
496		if (r)
497			return r;
498
499		memcpy(n, dm_block_data(child),
500		       dm_bm_block_size(dm_tm_get_bm(info->tm)));
501
502		dm_tm_dec(info->tm, dm_block_location(child));
503		dm_tm_unlock(info->tm, child);
504		return 0;
505	}
506
507	i = lower_bound(n, key);
508	if (i < 0)
509		return -ENODATA;
510
511	has_left_sibling = i > 0;
512	has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1);
513
514	if (!has_left_sibling)
515		r = rebalance2(s, info, vt, i);
516
517	else if (!has_right_sibling)
518		r = rebalance2(s, info, vt, i - 1);
519
520	else
521		r = rebalance3(s, info, vt, i - 1);
522
523	return r;
524}
525
526static int do_leaf(struct btree_node *n, uint64_t key, unsigned int *index)
527{
528	int i = lower_bound(n, key);
529
530	if ((i < 0) ||
531	    (i >= le32_to_cpu(n->header.nr_entries)) ||
532	    (le64_to_cpu(n->keys[i]) != key))
533		return -ENODATA;
534
535	*index = i;
536
537	return 0;
538}
539
540/*
541 * Prepares for removal from one level of the hierarchy.  The caller must
542 * call delete_at() to remove the entry at index.
543 */
544static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info,
545		      struct dm_btree_value_type *vt, dm_block_t root,
546		      uint64_t key, unsigned int *index)
547{
548	int i = *index, r;
549	struct btree_node *n;
550
551	for (;;) {
552		r = shadow_step(s, root, vt);
553		if (r < 0)
554			break;
555
556		/*
557		 * We have to patch up the parent node, ugly, but I don't
558		 * see a way to do this automatically as part of the spine
559		 * op.
560		 */
561		if (shadow_has_parent(s)) {
562			__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
563
564			memcpy(value_ptr(dm_block_data(shadow_parent(s)), i),
565			       &location, sizeof(__le64));
566		}
567
568		n = dm_block_data(shadow_current(s));
569
570		if (le32_to_cpu(n->header.flags) & LEAF_NODE)
571			return do_leaf(n, key, index);
572
573		r = rebalance_children(s, info, vt, key);
574		if (r)
575			break;
576
577		n = dm_block_data(shadow_current(s));
578		if (le32_to_cpu(n->header.flags) & LEAF_NODE)
579			return do_leaf(n, key, index);
580
581		i = lower_bound(n, key);
582
583		/*
584		 * We know the key is present, or else
585		 * rebalance_children would have returned
586		 * -ENODATA
587		 */
588		root = value64(n, i);
589	}
590
591	return r;
592}
593
594int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
595		    uint64_t *keys, dm_block_t *new_root)
596{
597	unsigned int level, last_level = info->levels - 1;
598	int index = 0, r = 0;
599	struct shadow_spine spine;
600	struct btree_node *n;
601	struct dm_btree_value_type le64_vt;
602
603	init_le64_type(info->tm, &le64_vt);
604	init_shadow_spine(&spine, info);
605	for (level = 0; level < info->levels; level++) {
606		r = remove_raw(&spine, info,
607			       (level == last_level ?
608				&info->value_type : &le64_vt),
609			       root, keys[level], (unsigned int *)&index);
610		if (r < 0)
611			break;
612
613		n = dm_block_data(shadow_current(&spine));
614		if (level != last_level) {
615			root = value64(n, index);
616			continue;
617		}
618
619		BUG_ON(index < 0 || index >= le32_to_cpu(n->header.nr_entries));
620
621		if (info->value_type.dec)
622			info->value_type.dec(info->value_type.context,
623					     value_ptr(n, index), 1);
624
625		delete_at(n, index);
626	}
627
628	if (!r)
629		*new_root = shadow_root(&spine);
630	exit_shadow_spine(&spine);
631
632	return r;
633}
634EXPORT_SYMBOL_GPL(dm_btree_remove);
635
636/*----------------------------------------------------------------*/
637
638static int remove_nearest(struct shadow_spine *s, struct dm_btree_info *info,
639			  struct dm_btree_value_type *vt, dm_block_t root,
640			  uint64_t key, int *index)
641{
642	int i = *index, r;
643	struct btree_node *n;
644
645	for (;;) {
646		r = shadow_step(s, root, vt);
647		if (r < 0)
648			break;
649
650		/*
651		 * We have to patch up the parent node, ugly, but I don't
652		 * see a way to do this automatically as part of the spine
653		 * op.
654		 */
655		if (shadow_has_parent(s)) {
656			__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
657
658			memcpy(value_ptr(dm_block_data(shadow_parent(s)), i),
659			       &location, sizeof(__le64));
660		}
661
662		n = dm_block_data(shadow_current(s));
663
664		if (le32_to_cpu(n->header.flags) & LEAF_NODE) {
665			*index = lower_bound(n, key);
666			return 0;
667		}
668
669		r = rebalance_children(s, info, vt, key);
670		if (r)
671			break;
672
673		n = dm_block_data(shadow_current(s));
674		if (le32_to_cpu(n->header.flags) & LEAF_NODE) {
675			*index = lower_bound(n, key);
676			return 0;
677		}
678
679		i = lower_bound(n, key);
680
681		/*
682		 * We know the key is present, or else
683		 * rebalance_children would have returned
684		 * -ENODATA
685		 */
686		root = value64(n, i);
687	}
688
689	return r;
690}
691
692static int remove_one(struct dm_btree_info *info, dm_block_t root,
693		      uint64_t *keys, uint64_t end_key,
694		      dm_block_t *new_root, unsigned int *nr_removed)
695{
696	unsigned int level, last_level = info->levels - 1;
697	int index = 0, r = 0;
698	struct shadow_spine spine;
699	struct btree_node *n;
700	struct dm_btree_value_type le64_vt;
701	uint64_t k;
702
703	init_le64_type(info->tm, &le64_vt);
704	init_shadow_spine(&spine, info);
705	for (level = 0; level < last_level; level++) {
706		r = remove_raw(&spine, info, &le64_vt,
707			       root, keys[level], (unsigned int *) &index);
708		if (r < 0)
709			goto out;
710
711		n = dm_block_data(shadow_current(&spine));
712		root = value64(n, index);
713	}
714
715	r = remove_nearest(&spine, info, &info->value_type,
716			   root, keys[last_level], &index);
717	if (r < 0)
718		goto out;
719
720	n = dm_block_data(shadow_current(&spine));
721
722	if (index < 0)
723		index = 0;
724
725	if (index >= le32_to_cpu(n->header.nr_entries)) {
726		r = -ENODATA;
727		goto out;
728	}
729
730	k = le64_to_cpu(n->keys[index]);
731	if (k >= keys[last_level] && k < end_key) {
732		if (info->value_type.dec)
733			info->value_type.dec(info->value_type.context,
734					     value_ptr(n, index), 1);
735
736		delete_at(n, index);
737		keys[last_level] = k + 1ull;
738
739	} else
740		r = -ENODATA;
741
742out:
743	*new_root = shadow_root(&spine);
744	exit_shadow_spine(&spine);
745
746	return r;
747}
748
749int dm_btree_remove_leaves(struct dm_btree_info *info, dm_block_t root,
750			   uint64_t *first_key, uint64_t end_key,
751			   dm_block_t *new_root, unsigned int *nr_removed)
752{
753	int r;
754
755	*nr_removed = 0;
756	do {
757		r = remove_one(info, root, first_key, end_key, &root, nr_removed);
758		if (!r)
759			(*nr_removed)++;
760	} while (!r);
761
762	*new_root = root;
763	return r == -ENODATA ? 0 : r;
764}
765EXPORT_SYMBOL_GPL(dm_btree_remove_leaves);
766