1276506Snwhitehorn// SPDX-License-Identifier: GPL-2.0
2276506Snwhitehorn
3276506Snwhitehorn#include "bcachefs.h"
4276506Snwhitehorn#include "btree_cache.h"
5276506Snwhitehorn#include "btree_io.h"
6276506Snwhitehorn#include "btree_journal_iter.h"
7276506Snwhitehorn#include "btree_node_scan.h"
8276506Snwhitehorn#include "btree_update_interior.h"
9276506Snwhitehorn#include "buckets.h"
10276506Snwhitehorn#include "error.h"
11276506Snwhitehorn#include "journal_io.h"
12276506Snwhitehorn#include "recovery_passes.h"
13276506Snwhitehorn
14276506Snwhitehorn#include <linux/kthread.h>
15276506Snwhitehorn#include <linux/sort.h>
16276506Snwhitehorn
/*
 * Argument block handed to one read_btree_nodes_worker() thread.
 *
 * Allocated by read_btree_nodes() for each device it scans and freed by the
 * worker itself when it finishes.
 */
struct find_btree_nodes_worker {
	struct closure		*cl;	/* closure the worker puts when it is done */
	struct find_btree_nodes	*f;	/* shared scan state that results are added to */
	struct bch_dev		*ca;	/* device this worker scans */
};
22276506Snwhitehorn
23276506Snwhitehornstatic void found_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct found_btree_node *n)
24276506Snwhitehorn{
25276506Snwhitehorn	prt_printf(out, "%s l=%u seq=%u cookie=%llx ", bch2_btree_id_str(n->btree_id), n->level, n->seq, n->cookie);
26276506Snwhitehorn	bch2_bpos_to_text(out, n->min_key);
27276506Snwhitehorn	prt_str(out, "-");
28276506Snwhitehorn	bch2_bpos_to_text(out, n->max_key);
29276506Snwhitehorn
30276506Snwhitehorn	if (n->range_updated)
31276506Snwhitehorn		prt_str(out, " range updated");
32276506Snwhitehorn	if (n->overwritten)
33276506Snwhitehorn		prt_str(out, " overwritten");
34276506Snwhitehorn
35276506Snwhitehorn	for (unsigned i = 0; i < n->nr_ptrs; i++) {
36276506Snwhitehorn		prt_char(out, ' ');
37276506Snwhitehorn		bch2_extent_ptr_to_text(out, c, n->ptrs + i);
38276506Snwhitehorn	}
39276506Snwhitehorn}
40276506Snwhitehorn
41276506Snwhitehornstatic void found_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c, found_btree_nodes nodes)
42276506Snwhitehorn{
43276506Snwhitehorn	printbuf_indent_add(out, 2);
44276506Snwhitehorn	darray_for_each(nodes, i) {
45276506Snwhitehorn		found_btree_node_to_text(out, c, i);
46276506Snwhitehorn		prt_newline(out);
47276506Snwhitehorn	}
48276506Snwhitehorn	printbuf_indent_sub(out, 2);
49276506Snwhitehorn}
50276506Snwhitehorn
51276506Snwhitehornstatic void found_btree_node_to_key(struct bkey_i *k, const struct found_btree_node *f)
52276506Snwhitehorn{
53276506Snwhitehorn	struct bkey_i_btree_ptr_v2 *bp = bkey_btree_ptr_v2_init(k);
54276507Snwhitehorn
55276506Snwhitehorn	set_bkey_val_u64s(&bp->k, sizeof(struct bch_btree_ptr_v2) / sizeof(u64) + f->nr_ptrs);
56276506Snwhitehorn	bp->k.p			= f->max_key;
57276506Snwhitehorn	bp->v.seq		= cpu_to_le64(f->cookie);
58276506Snwhitehorn	bp->v.sectors_written	= 0;
59276506Snwhitehorn	bp->v.flags		= 0;
60276506Snwhitehorn	bp->v.min_key		= f->min_key;
61276506Snwhitehorn	SET_BTREE_PTR_RANGE_UPDATED(&bp->v, f->range_updated);
62276506Snwhitehorn	memcpy(bp->v.start, f->ptrs, sizeof(struct bch_extent_ptr) * f->nr_ptrs);
63276506Snwhitehorn}
64276506Snwhitehorn
65276506Snwhitehornstatic bool found_btree_node_is_readable(struct btree_trans *trans,
66276506Snwhitehorn					 const struct found_btree_node *f)
67276506Snwhitehorn{
68276506Snwhitehorn	struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } k;
69276506Snwhitehorn
70276506Snwhitehorn	found_btree_node_to_key(&k.k, f);
71276506Snwhitehorn
72276506Snwhitehorn	struct btree *b = bch2_btree_node_get_noiter(trans, &k.k, f->btree_id, f->level, false);
73276506Snwhitehorn	bool ret = !IS_ERR_OR_NULL(b);
74276506Snwhitehorn	if (ret)
75276506Snwhitehorn		six_unlock_read(&b->c.lock);
76276506Snwhitehorn
77276506Snwhitehorn	/*
78276506Snwhitehorn	 * We might update this node's range; if that happens, we need the node
79276507Snwhitehorn	 * to be re-read so the read path can trim keys that are no longer in
80276507Snwhitehorn	 * this node
81276507Snwhitehorn	 */
82276507Snwhitehorn	if (b != btree_node_root(trans->c, b))
83276507Snwhitehorn		bch2_btree_node_evict(trans, &k.k);
84276507Snwhitehorn	return ret;
85276506Snwhitehorn}
86276506Snwhitehorn
87276506Snwhitehornstatic int found_btree_node_cmp_cookie(const void *_l, const void *_r)
88276506Snwhitehorn{
89276506Snwhitehorn	const struct found_btree_node *l = _l;
90276506Snwhitehorn	const struct found_btree_node *r = _r;
91276506Snwhitehorn
92276506Snwhitehorn	return  cmp_int(l->btree_id,	r->btree_id) ?:
93276506Snwhitehorn		cmp_int(l->level,	r->level) ?:
94276506Snwhitehorn		cmp_int(l->cookie,	r->cookie);
95276506Snwhitehorn}
96276507Snwhitehorn
97276506Snwhitehorn/*
98276506Snwhitehorn * Given two found btree nodes, if their sequence numbers are equal, take the
99276513Snwhitehorn * one that's readable:
100276513Snwhitehorn */
101276513Snwhitehornstatic int found_btree_node_cmp_time(const struct found_btree_node *l,
102276513Snwhitehorn				     const struct found_btree_node *r)
103276513Snwhitehorn{
104276513Snwhitehorn	return cmp_int(l->seq, r->seq);
105276513Snwhitehorn}
106276513Snwhitehorn
107276513Snwhitehornstatic int found_btree_node_cmp_pos(const void *_l, const void *_r)
108276513Snwhitehorn{
109276513Snwhitehorn	const struct found_btree_node *l = _l;
110276513Snwhitehorn	const struct found_btree_node *r = _r;
111276513Snwhitehorn
112276513Snwhitehorn	return  cmp_int(l->btree_id,	r->btree_id) ?:
113277991Snwhitehorn	       -cmp_int(l->level,	r->level) ?:
114277991Snwhitehorn		bpos_cmp(l->min_key,	r->min_key) ?:
115277991Snwhitehorn	       -found_btree_node_cmp_time(l, r);
116277991Snwhitehorn}
117277991Snwhitehorn
118277991Snwhitehornstatic void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
119277991Snwhitehorn				struct bio *bio, struct btree_node *bn, u64 offset)
120277991Snwhitehorn{
121276513Snwhitehorn	struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes);
122329175Skevans
123277991Snwhitehorn	bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ);
124277991Snwhitehorn	bio->bi_iter.bi_sector	= offset;
125277991Snwhitehorn	bch2_bio_map(bio, bn, PAGE_SIZE);
126277991Snwhitehorn
127277991Snwhitehorn	submit_bio_wait(bio);
128277991Snwhitehorn	if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read,
129277991Snwhitehorn			       "IO error in try_read_btree_node() at %llu: %s",
130276513Snwhitehorn			       offset, bch2_blk_status_to_str(bio->bi_status)))
131277991Snwhitehorn		return;
132277991Snwhitehorn
133329175Skevans	if (le64_to_cpu(bn->magic) != bset_magic(c))
134277991Snwhitehorn		return;
135277991Snwhitehorn
136277991Snwhitehorn	if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(&bn->keys))) {
137277991Snwhitehorn		struct nonce nonce = btree_nonce(&bn->keys, 0);
138277991Snwhitehorn		unsigned bytes = (void *) &bn->keys - (void *) &bn->flags;
139277991Snwhitehorn
140276513Snwhitehorn		bch2_encrypt(c, BSET_CSUM_TYPE(&bn->keys), nonce, &bn->flags, bytes);
141276513Snwhitehorn	}
142276513Snwhitehorn
143276513Snwhitehorn	if (btree_id_is_alloc(BTREE_NODE_ID(bn)))
144276513Snwhitehorn		return;
145276513Snwhitehorn
146276513Snwhitehorn	if (BTREE_NODE_LEVEL(bn) >= BTREE_MAX_DEPTH)
147276513Snwhitehorn		return;
148276513Snwhitehorn
149276513Snwhitehorn	rcu_read_lock();
150276513Snwhitehorn	struct found_btree_node n = {
151276513Snwhitehorn		.btree_id	= BTREE_NODE_ID(bn),
152276513Snwhitehorn		.level		= BTREE_NODE_LEVEL(bn),
153276513Snwhitehorn		.seq		= BTREE_NODE_SEQ(bn),
154276513Snwhitehorn		.cookie		= le64_to_cpu(bn->keys.seq),
155276513Snwhitehorn		.min_key	= bn->min_key,
156276513Snwhitehorn		.max_key	= bn->max_key,
157276513Snwhitehorn		.nr_ptrs	= 1,
158276513Snwhitehorn		.ptrs[0].type	= 1 << BCH_EXTENT_ENTRY_ptr,
159276506Snwhitehorn		.ptrs[0].offset	= offset,
160276506Snwhitehorn		.ptrs[0].dev	= ca->dev_idx,
161276506Snwhitehorn		.ptrs[0].gen	= *bucket_gen(ca, sector_to_bucket(ca, offset)),
162276506Snwhitehorn	};
163276506Snwhitehorn	rcu_read_unlock();
164276506Snwhitehorn
165276506Snwhitehorn	if (bch2_trans_run(c, found_btree_node_is_readable(trans, &n))) {
166276506Snwhitehorn		mutex_lock(&f->lock);
167276513Snwhitehorn		if (BSET_BIG_ENDIAN(&bn->keys) != CPU_BIG_ENDIAN) {
168276513Snwhitehorn			bch_err(c, "try_read_btree_node() can't handle endian conversion");
169276513Snwhitehorn			f->ret = -EINVAL;
170276513Snwhitehorn			goto unlock;
171276506Snwhitehorn		}
172276506Snwhitehorn
173276506Snwhitehorn		if (darray_push(&f->nodes, n))
174276513Snwhitehorn			f->ret = -ENOMEM;
175276506Snwhitehornunlock:
176276506Snwhitehorn		mutex_unlock(&f->lock);
177276506Snwhitehorn	}
178276506Snwhitehorn}
179276506Snwhitehorn
180346302Skevansstatic int read_btree_nodes_worker(void *p)
181346302Skevans{
182346302Skevans	struct find_btree_nodes_worker *w = p;
183346302Skevans	struct bch_fs *c = container_of(w->f, struct bch_fs, found_btree_nodes);
184346302Skevans	struct bch_dev *ca = w->ca;
185346302Skevans	void *buf = (void *) __get_free_page(GFP_KERNEL);
186276506Snwhitehorn	struct bio *bio = bio_alloc(NULL, 1, 0, GFP_KERNEL);
187276506Snwhitehorn	unsigned long last_print = jiffies;
188276506Snwhitehorn
189276506Snwhitehorn	if (!buf || !bio) {
190276506Snwhitehorn		bch_err(c, "read_btree_nodes_worker: error allocating bio/buf");
191		w->f->ret = -ENOMEM;
192		goto err;
193	}
194
195	for (u64 bucket = ca->mi.first_bucket; bucket < ca->mi.nbuckets; bucket++)
196		for (unsigned bucket_offset = 0;
197		     bucket_offset + btree_sectors(c) <= ca->mi.bucket_size;
198		     bucket_offset += btree_sectors(c)) {
199			if (time_after(jiffies, last_print + HZ * 30)) {
200				u64 cur_sector = bucket * ca->mi.bucket_size + bucket_offset;
201				u64 end_sector = ca->mi.nbuckets * ca->mi.bucket_size;
202
203				bch_info(ca, "%s: %2u%% done", __func__,
204					 (unsigned) div64_u64(cur_sector * 100, end_sector));
205				last_print = jiffies;
206			}
207
208			u64 sector = bucket * ca->mi.bucket_size + bucket_offset;
209
210			if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_mi_btree_bitmap &&
211			    !bch2_dev_btree_bitmap_marked_sectors(ca, sector, btree_sectors(c)))
212				continue;
213
214			try_read_btree_node(w->f, ca, bio, buf, sector);
215		}
216err:
217	bio_put(bio);
218	free_page((unsigned long) buf);
219	percpu_ref_get(&ca->io_ref);
220	closure_put(w->cl);
221	kfree(w);
222	return 0;
223}
224
225static int read_btree_nodes(struct find_btree_nodes *f)
226{
227	struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes);
228	struct closure cl;
229	int ret = 0;
230
231	closure_init_stack(&cl);
232
233	for_each_online_member(c, ca) {
234		if (!(ca->mi.data_allowed & BIT(BCH_DATA_btree)))
235			continue;
236
237		struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL);
238		struct task_struct *t;
239
240		if (!w) {
241			percpu_ref_put(&ca->io_ref);
242			ret = -ENOMEM;
243			goto err;
244		}
245
246		percpu_ref_get(&ca->io_ref);
247		closure_get(&cl);
248		w->cl		= &cl;
249		w->f		= f;
250		w->ca		= ca;
251
252		t = kthread_run(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name);
253		ret = IS_ERR_OR_NULL(t);
254		if (ret) {
255			percpu_ref_put(&ca->io_ref);
256			closure_put(&cl);
257			f->ret = ret;
258			bch_err(c, "error starting kthread: %i", ret);
259			break;
260		}
261	}
262err:
263	closure_sync(&cl);
264	return f->ret ?: ret;
265}
266
267static void bubble_up(struct found_btree_node *n, struct found_btree_node *end)
268{
269	while (n + 1 < end &&
270	       found_btree_node_cmp_pos(n, n + 1) > 0) {
271		swap(n[0], n[1]);
272		n++;
273	}
274}
275
276static int handle_overwrites(struct bch_fs *c,
277			     struct found_btree_node *start,
278			     struct found_btree_node *end)
279{
280	struct found_btree_node *n;
281again:
282	for (n = start + 1;
283	     n < end &&
284	     n->btree_id	== start->btree_id &&
285	     n->level		== start->level &&
286	     bpos_lt(n->min_key, start->max_key);
287	     n++)  {
288		int cmp = found_btree_node_cmp_time(start, n);
289
290		if (cmp > 0) {
291			if (bpos_cmp(start->max_key, n->max_key) >= 0)
292				n->overwritten = true;
293			else {
294				n->range_updated = true;
295				n->min_key = bpos_successor(start->max_key);
296				n->range_updated = true;
297				bubble_up(n, end);
298				goto again;
299			}
300		} else if (cmp < 0) {
301			BUG_ON(bpos_cmp(n->min_key, start->min_key) <= 0);
302
303			start->max_key = bpos_predecessor(n->min_key);
304			start->range_updated = true;
305		} else if (n->level) {
306			n->overwritten = true;
307		} else {
308			struct printbuf buf = PRINTBUF;
309
310			prt_str(&buf, "overlapping btree nodes with same seq! halting\n  ");
311			found_btree_node_to_text(&buf, c, start);
312			prt_str(&buf, "\n  ");
313			found_btree_node_to_text(&buf, c, n);
314			bch_err(c, "%s", buf.buf);
315			printbuf_exit(&buf);
316			return -BCH_ERR_fsck_repair_unimplemented;
317		}
318	}
319
320	return 0;
321}
322
323int bch2_scan_for_btree_nodes(struct bch_fs *c)
324{
325	struct find_btree_nodes *f = &c->found_btree_nodes;
326	struct printbuf buf = PRINTBUF;
327	size_t dst;
328	int ret = 0;
329
330	if (f->nodes.nr)
331		return 0;
332
333	mutex_init(&f->lock);
334
335	ret = read_btree_nodes(f);
336	if (ret)
337		return ret;
338
339	if (!f->nodes.nr) {
340		bch_err(c, "%s: no btree nodes found", __func__);
341		ret = -EINVAL;
342		goto err;
343	}
344
345	if (0 && c->opts.verbose) {
346		printbuf_reset(&buf);
347		prt_printf(&buf, "%s: nodes found:\n", __func__);
348		found_btree_nodes_to_text(&buf, c, f->nodes);
349		bch2_print_string_as_lines(KERN_INFO, buf.buf);
350	}
351
352	sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_cookie, NULL);
353
354	dst = 0;
355	darray_for_each(f->nodes, i) {
356		struct found_btree_node *prev = dst ? f->nodes.data + dst - 1 : NULL;
357
358		if (prev &&
359		    prev->cookie == i->cookie) {
360			if (prev->nr_ptrs == ARRAY_SIZE(prev->ptrs)) {
361				bch_err(c, "%s: found too many replicas for btree node", __func__);
362				ret = -EINVAL;
363				goto err;
364			}
365			prev->ptrs[prev->nr_ptrs++] = i->ptrs[0];
366		} else {
367			f->nodes.data[dst++] = *i;
368		}
369	}
370	f->nodes.nr = dst;
371
372	sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL);
373
374	if (0 && c->opts.verbose) {
375		printbuf_reset(&buf);
376		prt_printf(&buf, "%s: nodes after merging replicas:\n", __func__);
377		found_btree_nodes_to_text(&buf, c, f->nodes);
378		bch2_print_string_as_lines(KERN_INFO, buf.buf);
379	}
380
381	dst = 0;
382	darray_for_each(f->nodes, i) {
383		if (i->overwritten)
384			continue;
385
386		ret = handle_overwrites(c, i, &darray_top(f->nodes));
387		if (ret)
388			goto err;
389
390		BUG_ON(i->overwritten);
391		f->nodes.data[dst++] = *i;
392	}
393	f->nodes.nr = dst;
394
395	if (c->opts.verbose) {
396		printbuf_reset(&buf);
397		prt_printf(&buf, "%s: nodes found after overwrites:\n", __func__);
398		found_btree_nodes_to_text(&buf, c, f->nodes);
399		bch2_print_string_as_lines(KERN_INFO, buf.buf);
400	}
401
402	eytzinger0_sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL);
403err:
404	printbuf_exit(&buf);
405	return ret;
406}
407
408static int found_btree_node_range_start_cmp(const void *_l, const void *_r)
409{
410	const struct found_btree_node *l = _l;
411	const struct found_btree_node *r = _r;
412
413	return  cmp_int(l->btree_id,	r->btree_id) ?:
414	       -cmp_int(l->level,	r->level) ?:
415		bpos_cmp(l->max_key,	r->min_key);
416}
417
/*
 * Iterate over the entries of (_f)->nodes that match _search.
 *
 * _f:      pointer to the struct holding the eytzinger-ordered nodes array
 * _search: struct found_btree_node lvalue giving btree_id, level and key range
 * _idx:    name of the size_t index variable declared by the loop
 *
 * Iteration starts at the index returned by eytzinger0_find_gt() using
 * found_btree_node_range_start_cmp(), advances with eytzinger0_next(), and
 * stops at the first entry with a different btree_id or level, or whose
 * min_key is not below _search.max_key.
 *
 * The search key is taken from the _search argument itself, so the caller's
 * variable may have any name.
 */
#define for_each_found_btree_node_in_range(_f, _search, _idx)				\
	for (size_t _idx = eytzinger0_find_gt((_f)->nodes.data, (_f)->nodes.nr,		\
					sizeof((_f)->nodes.data[0]),			\
					found_btree_node_range_start_cmp, &(_search));	\
	     _idx < (_f)->nodes.nr &&							\
	     (_f)->nodes.data[_idx].btree_id == (_search).btree_id &&			\
	     (_f)->nodes.data[_idx].level == (_search).level &&				\
	     bpos_lt((_f)->nodes.data[_idx].min_key, (_search).max_key);		\
	     _idx = eytzinger0_next(_idx, (_f)->nodes.nr))
427
428bool bch2_btree_node_is_stale(struct bch_fs *c, struct btree *b)
429{
430	struct find_btree_nodes *f = &c->found_btree_nodes;
431
432	struct found_btree_node search = {
433		.btree_id	= b->c.btree_id,
434		.level		= b->c.level,
435		.min_key	= b->data->min_key,
436		.max_key	= b->key.k.p,
437	};
438
439	for_each_found_btree_node_in_range(f, search, idx)
440		if (f->nodes.data[idx].seq > BTREE_NODE_SEQ(b->data))
441			return true;
442	return false;
443}
444
445bool bch2_btree_has_scanned_nodes(struct bch_fs *c, enum btree_id btree)
446{
447	struct found_btree_node search = {
448		.btree_id	= btree,
449		.level		= 0,
450		.min_key	= POS_MIN,
451		.max_key	= SPOS_MAX,
452	};
453
454	for_each_found_btree_node_in_range(&c->found_btree_nodes, search, idx)
455		return true;
456	return false;
457}
458
459int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree,
460			   unsigned level, struct bpos node_min, struct bpos node_max)
461{
462	if (btree_id_is_alloc(btree))
463		return 0;
464
465	struct find_btree_nodes *f = &c->found_btree_nodes;
466
467	int ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
468	if (ret)
469		return ret;
470
471	if (c->opts.verbose) {
472		struct printbuf buf = PRINTBUF;
473
474		prt_printf(&buf, "recovering %s l=%u ", bch2_btree_id_str(btree), level);
475		bch2_bpos_to_text(&buf, node_min);
476		prt_str(&buf, " - ");
477		bch2_bpos_to_text(&buf, node_max);
478
479		bch_info(c, "%s(): %s", __func__, buf.buf);
480		printbuf_exit(&buf);
481	}
482
483	struct found_btree_node search = {
484		.btree_id	= btree,
485		.level		= level,
486		.min_key	= node_min,
487		.max_key	= node_max,
488	};
489
490	for_each_found_btree_node_in_range(f, search, idx) {
491		struct found_btree_node n = f->nodes.data[idx];
492
493		n.range_updated |= bpos_lt(n.min_key, node_min);
494		n.min_key = bpos_max(n.min_key, node_min);
495
496		n.range_updated |= bpos_gt(n.max_key, node_max);
497		n.max_key = bpos_min(n.max_key, node_max);
498
499		struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } tmp;
500
501		found_btree_node_to_key(&tmp.k, &n);
502
503		struct printbuf buf = PRINTBUF;
504		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&tmp.k));
505		bch_verbose(c, "%s(): recovering %s", __func__, buf.buf);
506		printbuf_exit(&buf);
507
508		BUG_ON(bch2_bkey_invalid(c, bkey_i_to_s_c(&tmp.k), BKEY_TYPE_btree, 0, NULL));
509
510		ret = bch2_journal_key_insert(c, btree, level + 1, &tmp.k);
511		if (ret)
512			return ret;
513	}
514
515	return 0;
516}
517
/*
 * Release the table of found btree nodes held in @f by calling darray_exit()
 * on f->nodes.
 */
void bch2_find_btree_nodes_exit(struct find_btree_nodes *f)
{
	darray_exit(&f->nodes);
}
522