// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "bkey_methods.h"
#include "bkey_sort.h"
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_iter.h"
#include "btree_locking.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "buckets.h"
#include "checksum.h"
#include "debug.h"
#include "error.h"
#include "extents.h"
#include "io_write.h"
#include "journal_reclaim.h"
#include "journal_seq_blacklist.h"
#include "recovery.h"
#include "super-io.h"
#include "trace.h"

#include <linux/sched/mm.h>

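/*
 * Node I/O "locking" is implemented directly on b->flags:
 * BTREE_NODE_read_in_flight and BTREE_NODE_write_in_flight are bits that are
 * taken/waited on with wait_on_bit_lock_io()/wait_on_bit_io() and woken with
 * wake_up_bit(), rather than via a separate lock object.  The helpers below
 * wrap those operations.
 */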
void bch2_btree_node_io_unlock(struct btree *b)
{
	EBUG_ON(!btree_node_write_in_flight(b));

	clear_btree_node_write_in_flight_inner(b);
	clear_btree_node_write_in_flight(b);
	wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
}

void bch2_btree_node_io_lock(struct btree *b)
{
	bch2_assert_btree_nodes_not_locked();

	wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight,
			    TASK_UNINTERRUPTIBLE);
}

void __bch2_btree_node_wait_on_read(struct btree *b)
{
	wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
		       TASK_UNINTERRUPTIBLE);
}

void __bch2_btree_node_wait_on_write(struct btree *b)
{
	wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
		       TASK_UNINTERRUPTIBLE);
}

void bch2_btree_node_wait_on_read(struct btree *b)
{
	bch2_assert_btree_nodes_not_locked();

	wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
		       TASK_UNINTERRUPTIBLE);
}

void bch2_btree_node_wait_on_write(struct btree *b)
{
	bch2_assert_btree_nodes_not_locked();

	wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
		       TASK_UNINTERRUPTIBLE);
}

static void verify_no_dups(struct btree *b,
			   struct bkey_packed *start,
			   struct bkey_packed *end)
{
#ifdef CONFIG_BCACHEFS_DEBUG
	struct bkey_packed *k, *p;

	if (start == end)
		return;

	for (p = start, k = bkey_p_next(start);
	     k != end;
	     p = k, k = bkey_p_next(k)) {
		struct bkey l = bkey_unpack_key(b, p);
		struct bkey r = bkey_unpack_key(b, k);

		BUG_ON(bpos_ge(l.p, bkey_start_pos(&r)));
	}
#endif
}

static void set_needs_whiteout(struct bset *i, int v)
{
	struct bkey_packed *k;

	for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k))
		k->needs_whiteout = v;
}

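/*
 * Bounce buffers used for sorting/compacting: allocation first tries a cheap
 * kvmalloc(GFP_NOWAIT) and only falls back to the (guaranteed, but limited)
 * btree_bounce_pool mempool, recording which path was used so the free side
 * returns the buffer to the right place.  memalloc_nofs_save() keeps the
 * allocation from recursing into filesystem reclaim.
 */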
static void btree_bounce_free(struct bch_fs *c, size_t size,
			      bool used_mempool, void *p)
{
	if (used_mempool)
		mempool_free(p, &c->btree_bounce_pool);
	else
		kvfree(p);
}

static void *btree_bounce_alloc(struct bch_fs *c, size_t size,
				bool *used_mempool)
{
	unsigned flags = memalloc_nofs_save();
	void *p;

	BUG_ON(size > c->opts.btree_node_size);

	*used_mempool = false;
	p = kvmalloc(size, __GFP_NOWARN|GFP_NOWAIT);
	if (!p) {
		*used_mempool = true;
		p = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS);
	}
	memalloc_nofs_restore(flags);
	return p;
}

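/*
 * Sort an array of pointers to packed bkeys in place: a bottom-up heapsort
 * (same scheme as lib/sort.c) that compares the keys the pointers point at
 * with bch2_bkey_cmp_packed().  Only the pointer array is permuted; the keys
 * themselves don't move.
 */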
static void sort_bkey_ptrs(const struct btree *bt,
			   struct bkey_packed **ptrs, unsigned nr)
{
	unsigned n = nr, a = nr / 2, b, c, d;

	if (!a)
		return;

	/* Heap sort: see lib/sort.c: */
	while (1) {
		if (a)
			a--;
		else if (--n)
			swap(ptrs[0], ptrs[n]);
		else
			break;

		for (b = a; c = 2 * b + 1, (d = c + 1) < n;)
			b = bch2_bkey_cmp_packed(bt,
					    ptrs[c],
					    ptrs[d]) >= 0 ? c : d;
		if (d == n)
			b = c;

		while (b != a &&
		       bch2_bkey_cmp_packed(bt,
				       ptrs[a],
				       ptrs[b]) >= 0)
			b = (b - 1) / 2;
		c = b;
		while (b != a) {
			b = (b - 1) / 2;
			swap(ptrs[b], ptrs[c]);
		}
	}
}

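/*
 * Sort the unwritten whiteouts region of @b into key order: build an array of
 * pointers to the whiteouts in a bounce buffer, sort the pointers, copy the
 * whiteouts out in sorted order, then memcpy the result back over the
 * original region.
 */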
static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b)
{
	struct bkey_packed *new_whiteouts, **ptrs, **ptrs_end, *k;
	bool used_mempool = false;
	size_t bytes = b->whiteout_u64s * sizeof(u64);

	if (!b->whiteout_u64s)
		return;

	new_whiteouts = btree_bounce_alloc(c, bytes, &used_mempool);

	ptrs = ptrs_end = ((void *) new_whiteouts + bytes);

	for (k = unwritten_whiteouts_start(b);
	     k != unwritten_whiteouts_end(b);
	     k = bkey_p_next(k))
		*--ptrs = k;

	sort_bkey_ptrs(b, ptrs, ptrs_end - ptrs);

	k = new_whiteouts;

	while (ptrs != ptrs_end) {
		bkey_p_copy(k, *ptrs);
		k = bkey_p_next(k);
		ptrs++;
	}

	verify_no_dups(b, new_whiteouts,
		       (void *) ((u64 *) new_whiteouts + b->whiteout_u64s));

	memcpy_u64s(unwritten_whiteouts_start(b),
		    new_whiteouts, b->whiteout_u64s);

	btree_bounce_free(c, bytes, used_mempool, new_whiteouts);
}

static bool should_compact_bset(struct btree *b, struct bset_tree *t,
				bool compacting, enum compact_mode mode)
{
	if (!bset_dead_u64s(b, t))
		return false;

	switch (mode) {
	case COMPACT_LAZY:
		return should_compact_bset_lazy(b, t) ||
			(compacting && !bset_written(b, bset(b, t)));
	case COMPACT_ALL:
		return true;
	default:
		BUG();
	}
}

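/*
 * Compact the bsets of @b according to @mode: unwritten bsets may be shifted
 * down to close gaps left by previously written data, and bsets selected by
 * should_compact_bset() have their deleted keys dropped entirely.  Returns
 * true if any bset was moved or compacted.
 */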
static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode)
{
	struct bset_tree *t;
	bool ret = false;

	for_each_bset(b, t) {
		struct bset *i = bset(b, t);
		struct bkey_packed *k, *n, *out, *start, *end;
		struct btree_node_entry *src = NULL, *dst = NULL;

		if (t != b->set && !bset_written(b, i)) {
			src = container_of(i, struct btree_node_entry, keys);
			dst = max(write_block(b),
				  (void *) btree_bkey_last(b, t - 1));
		}

		if (src != dst)
			ret = true;

		if (!should_compact_bset(b, t, ret, mode)) {
			if (src != dst) {
				memmove(dst, src, sizeof(*src) +
					le16_to_cpu(src->keys.u64s) *
					sizeof(u64));
				i = &dst->keys;
				set_btree_bset(b, t, i);
			}
			continue;
		}

		start	= btree_bkey_first(b, t);
		end	= btree_bkey_last(b, t);

		if (src != dst) {
			memmove(dst, src, sizeof(*src));
			i = &dst->keys;
			set_btree_bset(b, t, i);
		}

		out = i->start;

		for (k = start; k != end; k = n) {
			n = bkey_p_next(k);

			if (!bkey_deleted(k)) {
				bkey_p_copy(out, k);
				out = bkey_p_next(out);
			} else {
				BUG_ON(k->needs_whiteout);
			}
		}

		i->u64s = cpu_to_le16((u64 *) out - i->_data);
		set_btree_bset_end(b, t);
		bch2_bset_set_no_aux_tree(b, t);
		ret = true;
	}

	bch2_verify_btree_nr_keys(b);

	bch2_btree_build_aux_trees(b);

	return ret;
}

bool bch2_compact_whiteouts(struct bch_fs *c, struct btree *b,
			    enum compact_mode mode)
{
	return bch2_drop_whiteouts(b, mode);
}

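/*
 * Merge-sort bsets [start_idx, end_idx) of @b into a single bset, sorting
 * through a bounce buffer.  When the whole node is being sorted, the bounce
 * buffer is node-sized and is simply swapped with b->data instead of copying
 * the result back; otherwise the sorted keys are copied into the first bset
 * of the range and the remaining bset_tree entries are shifted down.
 */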
static void btree_node_sort(struct bch_fs *c, struct btree *b,
			    unsigned start_idx,
			    unsigned end_idx,
			    bool filter_whiteouts)
{
	struct btree_node *out;
	struct sort_iter_stack sort_iter;
	struct bset_tree *t;
	struct bset *start_bset = bset(b, &b->set[start_idx]);
	bool used_mempool = false;
	u64 start_time, seq = 0;
	unsigned i, u64s = 0, bytes, shift = end_idx - start_idx - 1;
	bool sorting_entire_node = start_idx == 0 &&
		end_idx == b->nsets;

	sort_iter_stack_init(&sort_iter, b);

	for (t = b->set + start_idx;
	     t < b->set + end_idx;
	     t++) {
		u64s += le16_to_cpu(bset(b, t)->u64s);
		sort_iter_add(&sort_iter.iter,
			      btree_bkey_first(b, t),
			      btree_bkey_last(b, t));
	}

	bytes = sorting_entire_node
		? btree_buf_bytes(b)
		: __vstruct_bytes(struct btree_node, u64s);

	out = btree_bounce_alloc(c, bytes, &used_mempool);

	start_time = local_clock();

	u64s = bch2_sort_keys(out->keys.start, &sort_iter.iter, filter_whiteouts);

	out->keys.u64s = cpu_to_le16(u64s);

	BUG_ON(vstruct_end(&out->keys) > (void *) out + bytes);

	if (sorting_entire_node)
		bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort],
				       start_time);

	/* Make sure we preserve bset journal_seq: */
	for (t = b->set + start_idx; t < b->set + end_idx; t++)
		seq = max(seq, le64_to_cpu(bset(b, t)->journal_seq));
	start_bset->journal_seq = cpu_to_le64(seq);

	if (sorting_entire_node) {
		u64s = le16_to_cpu(out->keys.u64s);

		BUG_ON(bytes != btree_buf_bytes(b));

		/*
		 * Our temporary buffer is the same size as the btree node's
		 * buffer, so we can just swap buffers instead of doing a big
		 * memcpy()
		 */
		*out = *b->data;
		out->keys.u64s = cpu_to_le16(u64s);
		swap(out, b->data);
		set_btree_bset(b, b->set, &b->data->keys);
	} else {
		start_bset->u64s = out->keys.u64s;
		memcpy_u64s(start_bset->start,
			    out->keys.start,
			    le16_to_cpu(out->keys.u64s));
	}

	for (i = start_idx + 1; i < end_idx; i++)
		b->nr.bset_u64s[start_idx] +=
			b->nr.bset_u64s[i];

	b->nsets -= shift;

	for (i = start_idx + 1; i < b->nsets; i++) {
		b->nr.bset_u64s[i]	= b->nr.bset_u64s[i + shift];
		b->set[i]		= b->set[i + shift];
	}

	for (i = b->nsets; i < MAX_BSETS; i++)
		b->nr.bset_u64s[i] = 0;

	set_btree_bset_end(b, &b->set[start_idx]);
	bch2_bset_set_no_aux_tree(b, &b->set[start_idx]);

	btree_bounce_free(c, bytes, used_mempool, out);

	bch2_verify_btree_nr_keys(b);
}

void bch2_btree_sort_into(struct bch_fs *c,
			 struct btree *dst,
			 struct btree *src)
{
	struct btree_nr_keys nr;
	struct btree_node_iter src_iter;
	u64 start_time = local_clock();

	BUG_ON(dst->nsets != 1);

	bch2_bset_set_no_aux_tree(dst, dst->set);

	bch2_btree_node_iter_init_from_start(&src_iter, src);

	nr = bch2_sort_repack(btree_bset_first(dst),
			src, &src_iter,
			&dst->format,
			true);

	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort],
			       start_time);

	set_btree_bset_end(dst, dst->set);

	dst->nr.live_u64s	+= nr.live_u64s;
	dst->nr.bset_u64s[0]	+= nr.bset_u64s[0];
	dst->nr.packed_keys	+= nr.packed_keys;
	dst->nr.unpacked_keys	+= nr.unpacked_keys;

	bch2_verify_btree_nr_keys(dst);
}

/*
 * We're about to add another bset to the btree node, so if there are currently
 * too many bsets, sort some of them together:
 */
static bool btree_node_compact(struct bch_fs *c, struct btree *b)
{
	unsigned unwritten_idx;
	bool ret = false;

	for (unwritten_idx = 0;
	     unwritten_idx < b->nsets;
	     unwritten_idx++)
		if (!bset_written(b, bset(b, &b->set[unwritten_idx])))
			break;

	if (b->nsets - unwritten_idx > 1) {
		btree_node_sort(c, b, unwritten_idx,
				b->nsets, false);
		ret = true;
	}

	if (unwritten_idx > 1) {
		btree_node_sort(c, b, 0, unwritten_idx, false);
		ret = true;
	}

	return ret;
}

void bch2_btree_build_aux_trees(struct btree *b)
{
	struct bset_tree *t;

	for_each_bset(b, t)
		bch2_bset_build_aux_tree(b, t,
				!bset_written(b, bset(b, t)) &&
				t == bset_tree_last(b));
}

/*
 * If we have MAX_BSETS (3) bsets, should we sort them all down to just one?
 *
 * The first bset is going to be of similar order to the size of the node, the
 * last bset is bounded by btree_write_set_buffer(), which is set to keep the
 * memmove on insert from being too expensive: the middle bset should, ideally,
 * be the geometric mean of the first and the last.
 *
 * Returns true if the middle bset is greater than that geometric mean:
 */
static inline bool should_compact_all(struct bch_fs *c, struct btree *b)
{
	unsigned mid_u64s_bits =
		(ilog2(btree_max_u64s(c)) + BTREE_WRITE_SET_U64s_BITS) / 2;

	return bset_u64s(&b->set[1]) > 1U << mid_u64s_bits;
}

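/*
 * Rough illustration with made-up numbers (the real values depend on the
 * node size and key format): if btree_max_u64s(c) were ~2^15 and
 * BTREE_WRITE_SET_U64s_BITS were 9, mid_u64s_bits would be (15 + 9) / 2 = 12,
 * so we'd compact once the middle bset grew past ~4096 u64s.
 */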

/*
 * bch2_btree_init_next - initialize a new (unwritten) bset that can then be
 * inserted into
 *
 * Safe to call if there already is an unwritten bset - will only add a new bset
 * if @b doesn't already have one.
 *
 * If we sorted (i.e. invalidated iterators), the node's iterators are
 * reinitialized via bch2_trans_node_reinit_iter().
 */
void bch2_btree_init_next(struct btree_trans *trans, struct btree *b)
{
	struct bch_fs *c = trans->c;
	struct btree_node_entry *bne;
	bool reinit_iter = false;

	EBUG_ON(!six_lock_counts(&b->c.lock).n[SIX_LOCK_write]);
	BUG_ON(bset_written(b, bset(b, &b->set[1])));
	BUG_ON(btree_node_just_written(b));

	if (b->nsets == MAX_BSETS &&
	    !btree_node_write_in_flight(b) &&
	    should_compact_all(c, b)) {
		bch2_btree_node_write(c, b, SIX_LOCK_write,
				      BTREE_WRITE_init_next_bset);
		reinit_iter = true;
	}

	if (b->nsets == MAX_BSETS &&
	    btree_node_compact(c, b))
		reinit_iter = true;

	BUG_ON(b->nsets >= MAX_BSETS);

	bne = want_new_bset(c, b);
	if (bne)
		bch2_bset_init_next(b, bne);

	bch2_btree_build_aux_trees(b);

	if (reinit_iter)
		bch2_trans_node_reinit_iter(trans, b);
}

static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
			  struct bch_dev *ca,
			  struct btree *b, struct bset *i,
			  unsigned offset, int write)
{
	prt_printf(out, bch2_log_msg(c, "%s"),
		   write == READ
		   ? "error validating btree node "
		   : "corrupt btree node before write ");
	if (ca)
		prt_printf(out, "on %s ", ca->name);
	prt_printf(out, "at btree ");
	bch2_btree_pos_to_text(out, c, b);

	prt_printf(out, "\n  node offset %u/%u",
		   b->written, btree_ptr_sectors_written(&b->key));
	if (i)
		prt_printf(out, " bset u64s %u", le16_to_cpu(i->u64s));
	prt_str(out, ": ");
}

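/*
 * Central error reporting for btree node reads/writes: format the message via
 * btree_err_msg(), then map the requested error class to an outcome.  On the
 * write side we only report (honouring c->opts.errors); on the read side
 * fixable errors go through bch2_fsck_err(), retryable errors are degraded to
 * fixable/bad_node when no retry is possible, and the rest are printed and
 * escalated (topology error, or fsck_errors_not_fixed for incompatible nodes).
 */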
__printf(9, 10)
static int __btree_err(int ret,
		       struct bch_fs *c,
		       struct bch_dev *ca,
		       struct btree *b,
		       struct bset *i,
		       int write,
		       bool have_retry,
		       enum bch_sb_error_id err_type,
		       const char *fmt, ...)
{
	struct printbuf out = PRINTBUF;
	va_list args;

	btree_err_msg(&out, c, ca, b, i, b->written, write);

	va_start(args, fmt);
	prt_vprintf(&out, fmt, args);
	va_end(args);

	if (write == WRITE) {
		bch2_print_string_as_lines(KERN_ERR, out.buf);
		ret = c->opts.errors == BCH_ON_ERROR_continue
			? 0
			: -BCH_ERR_fsck_errors_not_fixed;
		goto out;
	}

	if (!have_retry && ret == -BCH_ERR_btree_node_read_err_want_retry)
		ret = -BCH_ERR_btree_node_read_err_fixable;
	if (!have_retry && ret == -BCH_ERR_btree_node_read_err_must_retry)
		ret = -BCH_ERR_btree_node_read_err_bad_node;

	if (ret != -BCH_ERR_btree_node_read_err_fixable)
		bch2_sb_error_count(c, err_type);

	switch (ret) {
	case -BCH_ERR_btree_node_read_err_fixable:
		ret = bch2_fsck_err(c, FSCK_CAN_FIX, err_type, "%s", out.buf);
		if (ret != -BCH_ERR_fsck_fix &&
		    ret != -BCH_ERR_fsck_ignore)
			goto fsck_err;
		ret = -BCH_ERR_fsck_fix;
		break;
	case -BCH_ERR_btree_node_read_err_want_retry:
	case -BCH_ERR_btree_node_read_err_must_retry:
		bch2_print_string_as_lines(KERN_ERR, out.buf);
		break;
	case -BCH_ERR_btree_node_read_err_bad_node:
		bch2_print_string_as_lines(KERN_ERR, out.buf);
		ret = bch2_topology_error(c);
		break;
	case -BCH_ERR_btree_node_read_err_incompatible:
		bch2_print_string_as_lines(KERN_ERR, out.buf);
		ret = -BCH_ERR_fsck_errors_not_fixed;
		break;
	default:
		BUG();
	}
out:
fsck_err:
	printbuf_exit(&out);
	return ret;
}

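/*
 * Convenience wrappers around __btree_err(): these pick up @write,
 * @have_retry, @ret and @saw_error from the enclosing scope, and jump to its
 * fsck_err label unless the error was fixed - which is why callers declare
 * those variables even when they can look unused.
 */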
#define btree_err(type, c, ca, b, i, _err_type, msg, ...)		\
({									\
	int _ret = __btree_err(type, c, ca, b, i, write, have_retry,	\
			       BCH_FSCK_ERR_##_err_type,		\
			       msg, ##__VA_ARGS__);			\
									\
	if (_ret != -BCH_ERR_fsck_fix) {				\
		ret = _ret;						\
		goto fsck_err;						\
	}								\
									\
	*saw_error = true;						\
})

#define btree_err_on(cond, ...)	((cond) ? btree_err(__VA_ARGS__) : false)

/*
 * When btree topology repair changes the start or end of a node, that might
 * mean we have to drop keys that are no longer inside the node:
 */
__cold
void bch2_btree_node_drop_keys_outside_node(struct btree *b)
{
	struct bset_tree *t;

	for_each_bset(b, t) {
		struct bset *i = bset(b, t);
		struct bkey_packed *k;

		for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k))
			if (bkey_cmp_left_packed(b, k, &b->data->min_key) >= 0)
				break;

		if (k != i->start) {
			unsigned shift = (u64 *) k - (u64 *) i->start;

			memmove_u64s_down(i->start, k,
					  (u64 *) vstruct_end(i) - (u64 *) k);
			i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - shift);
			set_btree_bset_end(b, t);
		}

		for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k))
			if (bkey_cmp_left_packed(b, k, &b->data->max_key) > 0)
				break;

		if (k != vstruct_last(i)) {
			i->u64s = cpu_to_le16((u64 *) k - (u64 *) i->start);
			set_btree_bset_end(b, t);
		}
	}

	/*
	 * Always rebuild search trees: eytzinger search tree nodes directly
	 * depend on the values of min/max key:
	 */
	bch2_bset_set_no_aux_tree(b, b->set);
	bch2_btree_build_aux_trees(b);
	b->nr = bch2_btree_node_count_keys(b);

	struct bkey_s_c k;
	struct bkey unpacked;
	struct btree_node_iter iter;
	for_each_btree_node_key_unpack(b, k, &iter, &unpacked) {
		BUG_ON(bpos_lt(k.k->p, b->data->min_key));
		BUG_ON(bpos_gt(k.k->p, b->data->max_key));
	}
}

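/*
 * Validate a single bset header: version checks (possibly updating the
 * superblock's version/version_min), size/offset sanity checks, and - for the
 * first bset in the node - checks of the node header itself (sequence number,
 * btree id, level, min/max keys, bkey format), applying struct-level
 * compatibility transforms for older on-disk versions.
 */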
static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
			 struct btree *b, struct bset *i,
			 unsigned offset, unsigned sectors,
			 int write, bool have_retry, bool *saw_error)
{
	unsigned version = le16_to_cpu(i->version);
	struct printbuf buf1 = PRINTBUF;
	struct printbuf buf2 = PRINTBUF;
	int ret = 0;

	btree_err_on(!bch2_version_compatible(version),
		     -BCH_ERR_btree_node_read_err_incompatible,
		     c, ca, b, i,
		     btree_node_unsupported_version,
		     "unsupported bset version %u.%u",
		     BCH_VERSION_MAJOR(version),
		     BCH_VERSION_MINOR(version));

	if (btree_err_on(version < c->sb.version_min,
			 -BCH_ERR_btree_node_read_err_fixable,
			 c, NULL, b, i,
			 btree_node_bset_older_than_sb_min,
			 "bset version %u older than superblock version_min %u",
			 version, c->sb.version_min)) {
		mutex_lock(&c->sb_lock);
		c->disk_sb.sb->version_min = cpu_to_le16(version);
		bch2_write_super(c);
		mutex_unlock(&c->sb_lock);
	}

	if (btree_err_on(BCH_VERSION_MAJOR(version) >
			 BCH_VERSION_MAJOR(c->sb.version),
			 -BCH_ERR_btree_node_read_err_fixable,
			 c, NULL, b, i,
			 btree_node_bset_newer_than_sb,
			 "bset version %u newer than superblock version %u",
			 version, c->sb.version)) {
		mutex_lock(&c->sb_lock);
		c->disk_sb.sb->version = cpu_to_le16(version);
		bch2_write_super(c);
		mutex_unlock(&c->sb_lock);
	}

	btree_err_on(BSET_SEPARATE_WHITEOUTS(i),
		     -BCH_ERR_btree_node_read_err_incompatible,
		     c, ca, b, i,
		     btree_node_unsupported_version,
		     "BSET_SEPARATE_WHITEOUTS no longer supported");

	if (btree_err_on(offset + sectors > btree_sectors(c),
			 -BCH_ERR_btree_node_read_err_fixable,
			 c, ca, b, i,
			 bset_past_end_of_btree_node,
			 "bset past end of btree node")) {
		i->u64s = 0;
		ret = 0;
		goto out;
	}

	btree_err_on(offset && !i->u64s,
		     -BCH_ERR_btree_node_read_err_fixable,
		     c, ca, b, i,
		     bset_empty,
		     "empty bset");

	btree_err_on(BSET_OFFSET(i) && BSET_OFFSET(i) != offset,
		     -BCH_ERR_btree_node_read_err_want_retry,
		     c, ca, b, i,
		     bset_wrong_sector_offset,
		     "bset at wrong sector offset");

	if (!offset) {
		struct btree_node *bn =
			container_of(i, struct btree_node, keys);
		/* These indicate that we read the wrong btree node: */

		if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
			struct bch_btree_ptr_v2 *bp =
				&bkey_i_to_btree_ptr_v2(&b->key)->v;

			/* XXX endianness */
			btree_err_on(bp->seq != bn->keys.seq,
				     -BCH_ERR_btree_node_read_err_must_retry,
				     c, ca, b, NULL,
				     bset_bad_seq,
				     "incorrect sequence number (wrong btree node)");
		}

		btree_err_on(BTREE_NODE_ID(bn) != b->c.btree_id,
			     -BCH_ERR_btree_node_read_err_must_retry,
			     c, ca, b, i,
			     btree_node_bad_btree,
			     "incorrect btree id");

		btree_err_on(BTREE_NODE_LEVEL(bn) != b->c.level,
			     -BCH_ERR_btree_node_read_err_must_retry,
			     c, ca, b, i,
			     btree_node_bad_level,
			     "incorrect level");

		if (!write)
			compat_btree_node(b->c.level, b->c.btree_id, version,
					  BSET_BIG_ENDIAN(i), write, bn);

		if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
			struct bch_btree_ptr_v2 *bp =
				&bkey_i_to_btree_ptr_v2(&b->key)->v;

			if (BTREE_PTR_RANGE_UPDATED(bp)) {
				b->data->min_key = bp->min_key;
				b->data->max_key = b->key.k.p;
			}

			btree_err_on(!bpos_eq(b->data->min_key, bp->min_key),
				     -BCH_ERR_btree_node_read_err_must_retry,
				     c, ca, b, NULL,
				     btree_node_bad_min_key,
				     "incorrect min_key: got %s should be %s",
				     (printbuf_reset(&buf1),
				      bch2_bpos_to_text(&buf1, bn->min_key), buf1.buf),
				     (printbuf_reset(&buf2),
				      bch2_bpos_to_text(&buf2, bp->min_key), buf2.buf));
		}

		btree_err_on(!bpos_eq(bn->max_key, b->key.k.p),
			     -BCH_ERR_btree_node_read_err_must_retry,
			     c, ca, b, i,
			     btree_node_bad_max_key,
			     "incorrect max key %s",
			     (printbuf_reset(&buf1),
			      bch2_bpos_to_text(&buf1, bn->max_key), buf1.buf));

		if (write)
			compat_btree_node(b->c.level, b->c.btree_id, version,
					  BSET_BIG_ENDIAN(i), write, bn);

		btree_err_on(bch2_bkey_format_invalid(c, &bn->format, write, &buf1),
			     -BCH_ERR_btree_node_read_err_bad_node,
			     c, ca, b, i,
			     btree_node_bad_format,
			     "invalid bkey format: %s\n  %s", buf1.buf,
			     (printbuf_reset(&buf2),
			      bch2_bkey_format_to_text(&buf2, &bn->format), buf2.buf));
		printbuf_reset(&buf1);

		compat_bformat(b->c.level, b->c.btree_id, version,
			       BSET_BIG_ENDIAN(i), write,
			       &bn->format);
	}
out:
fsck_err:
	printbuf_exit(&buf2);
	printbuf_exit(&buf1);
	return ret;
}

static int bset_key_invalid(struct bch_fs *c, struct btree *b,
			    struct bkey_s_c k,
			    bool updated_range, int rw,
			    struct printbuf *err)
{
	return __bch2_bkey_invalid(c, k, btree_node_type(b), READ, err) ?:
		(!updated_range ? bch2_bkey_in_btree_node(c, b, k, err) : 0) ?:
		(rw == WRITE ? bch2_bkey_val_invalid(c, k, READ, err) : 0);
}

static bool bkey_packed_valid(struct bch_fs *c, struct btree *b,
			 struct bset *i, struct bkey_packed *k)
{
	if (bkey_p_next(k) > vstruct_last(i))
		return false;

	if (k->format > KEY_FORMAT_CURRENT)
		return false;

	if (!bkeyp_u64s_valid(&b->format, k))
		return false;

	struct printbuf buf = PRINTBUF;
	struct bkey tmp;
	struct bkey_s u = __bkey_disassemble(b, k, &tmp);
	bool ret = __bch2_bkey_invalid(c, u.s_c, btree_node_type(b), READ, &buf);
	printbuf_exit(&buf);
	return ret;
}

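/*
 * Validate the keys in a bset: each key's size and format are checked, the
 * key is run through bch2_bkey_compat() and the usual bkey validation, and
 * key ordering is verified against the previous key.  Bad keys are dropped by
 * scanning forward for the next thing that looks like a valid packed key (or
 * truncating the bset if none is found) and memmoving the remainder down.
 */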
static int validate_bset_keys(struct bch_fs *c, struct btree *b,
			 struct bset *i, int write,
			 bool have_retry, bool *saw_error)
{
	unsigned version = le16_to_cpu(i->version);
	struct bkey_packed *k, *prev = NULL;
	struct printbuf buf = PRINTBUF;
	bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
		BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v);
	int ret = 0;

	for (k = i->start;
	     k != vstruct_last(i);) {
		struct bkey_s u;
		struct bkey tmp;
		unsigned next_good_key;

		if (btree_err_on(bkey_p_next(k) > vstruct_last(i),
				 -BCH_ERR_btree_node_read_err_fixable,
				 c, NULL, b, i,
				 btree_node_bkey_past_bset_end,
				 "key extends past end of bset")) {
			i->u64s = cpu_to_le16((u64 *) k - i->_data);
			break;
		}

		if (btree_err_on(k->format > KEY_FORMAT_CURRENT,
				 -BCH_ERR_btree_node_read_err_fixable,
				 c, NULL, b, i,
				 btree_node_bkey_bad_format,
				 "invalid bkey format %u", k->format))
			goto drop_this_key;

		if (btree_err_on(!bkeyp_u64s_valid(&b->format, k),
				 -BCH_ERR_btree_node_read_err_fixable,
				 c, NULL, b, i,
				 btree_node_bkey_bad_u64s,
				 "bad k->u64s %u (min %u max %zu)", k->u64s,
				 bkeyp_key_u64s(&b->format, k),
				 U8_MAX - BKEY_U64s + bkeyp_key_u64s(&b->format, k)))
			goto drop_this_key;

		if (!write)
			bch2_bkey_compat(b->c.level, b->c.btree_id, version,
				    BSET_BIG_ENDIAN(i), write,
				    &b->format, k);

		u = __bkey_disassemble(b, k, &tmp);

		printbuf_reset(&buf);
		if (bset_key_invalid(c, b, u.s_c, updated_range, write, &buf)) {
			printbuf_reset(&buf);
			bset_key_invalid(c, b, u.s_c, updated_range, write, &buf);
			prt_printf(&buf, "\n  ");
			bch2_bkey_val_to_text(&buf, c, u.s_c);

			btree_err(-BCH_ERR_btree_node_read_err_fixable,
				  c, NULL, b, i,
				  btree_node_bad_bkey,
				  "invalid bkey: %s", buf.buf);
			goto drop_this_key;
		}

		if (write)
			bch2_bkey_compat(b->c.level, b->c.btree_id, version,
				    BSET_BIG_ENDIAN(i), write,
				    &b->format, k);

		if (prev && bkey_iter_cmp(b, prev, k) > 0) {
			struct bkey up = bkey_unpack_key(b, prev);

			printbuf_reset(&buf);
			prt_printf(&buf, "keys out of order: ");
			bch2_bkey_to_text(&buf, &up);
			prt_printf(&buf, " > ");
			bch2_bkey_to_text(&buf, u.k);

			if (btree_err(-BCH_ERR_btree_node_read_err_fixable,
				      c, NULL, b, i,
				      btree_node_bkey_out_of_order,
				      "%s", buf.buf))
				goto drop_this_key;
		}

		prev = k;
		k = bkey_p_next(k);
		continue;
drop_this_key:
		next_good_key = k->u64s;

		if (!next_good_key ||
		    (BSET_BIG_ENDIAN(i) == CPU_BIG_ENDIAN &&
		     version >= bcachefs_metadata_version_snapshot)) {
			/*
			 * only do scanning if bch2_bkey_compat() has nothing to
			 * do
			 */

			if (!bkey_packed_valid(c, b, i, (void *) ((u64 *) k + next_good_key))) {
				for (next_good_key = 1;
				     next_good_key < (u64 *) vstruct_last(i) - (u64 *) k;
				     next_good_key++)
					if (bkey_packed_valid(c, b, i, (void *) ((u64 *) k + next_good_key)))
						goto got_good_key;
			}

			/*
			 * didn't find a good key, have to truncate the rest of
			 * the bset
			 */
			next_good_key = (u64 *) vstruct_last(i) - (u64 *) k;
		}
got_good_key:
		le16_add_cpu(&i->u64s, -next_good_key);
		memmove_u64s_down(k, bkey_p_next(k), (u64 *) vstruct_end(i) - (u64 *) k);
	}
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

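/*
 * Parse and validate a btree node that has just been read: walk the bsets
 * (checksumming, decrypting and validating each one, skipping bsets whose
 * journal seq was blacklisted), sort them all together with
 * bch2_key_sort_fix_overlapping() into a single bset, then drop any keys that
 * fail validation.  Returns nonzero if the caller should retry the read from
 * another replica.
 */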
int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
			      struct btree *b, bool have_retry, bool *saw_error)
{
	struct btree_node_entry *bne;
	struct sort_iter *iter;
	struct btree_node *sorted;
	struct bkey_packed *k;
	struct bset *i;
	bool used_mempool, blacklisted;
	bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
		BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v);
	unsigned u64s;
	unsigned ptr_written = btree_ptr_sectors_written(&b->key);
	struct printbuf buf = PRINTBUF;
	int ret = 0, retry_read = 0, write = READ;
	u64 start_time = local_clock();

	b->version_ondisk = U16_MAX;
	/* We might get called multiple times on read retry: */
	b->written = 0;

	iter = mempool_alloc(&c->fill_iter, GFP_NOFS);
	sort_iter_init(iter, b, (btree_blocks(c) + 1) * 2);

	if (bch2_meta_read_fault("btree"))
		btree_err(-BCH_ERR_btree_node_read_err_must_retry,
			  c, ca, b, NULL,
			  btree_node_fault_injected,
			  "dynamic fault");

	btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c),
		     -BCH_ERR_btree_node_read_err_must_retry,
		     c, ca, b, NULL,
		     btree_node_bad_magic,
		     "bad magic: want %llx, got %llx",
		     bset_magic(c), le64_to_cpu(b->data->magic));

	if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
		struct bch_btree_ptr_v2 *bp =
			&bkey_i_to_btree_ptr_v2(&b->key)->v;

		bch2_bpos_to_text(&buf, b->data->min_key);
		prt_str(&buf, "-");
		bch2_bpos_to_text(&buf, b->data->max_key);

		btree_err_on(b->data->keys.seq != bp->seq,
			     -BCH_ERR_btree_node_read_err_must_retry,
			     c, ca, b, NULL,
			     btree_node_bad_seq,
			     "got wrong btree node (want %llx got %llx)\n"
			     "got btree %s level %llu pos %s",
			     bp->seq, b->data->keys.seq,
			     bch2_btree_id_str(BTREE_NODE_ID(b->data)),
			     BTREE_NODE_LEVEL(b->data),
			     buf.buf);
	} else {
		btree_err_on(!b->data->keys.seq,
			     -BCH_ERR_btree_node_read_err_must_retry,
			     c, ca, b, NULL,
			     btree_node_bad_seq,
			     "bad btree header: seq 0");
	}

	while (b->written < (ptr_written ?: btree_sectors(c))) {
		unsigned sectors;
		struct nonce nonce;
		bool first = !b->written;
		bool csum_bad;

		if (!b->written) {
			i = &b->data->keys;

			btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)),
				     -BCH_ERR_btree_node_read_err_want_retry,
				     c, ca, b, i,
				     bset_unknown_csum,
				     "unknown checksum type %llu", BSET_CSUM_TYPE(i));

			nonce = btree_nonce(i, b->written << 9);

			struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data);
			csum_bad = bch2_crc_cmp(b->data->csum, csum);
			if (csum_bad)
				bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);

			btree_err_on(csum_bad,
				     -BCH_ERR_btree_node_read_err_want_retry,
				     c, ca, b, i,
				     bset_bad_csum,
				     "%s",
				     (printbuf_reset(&buf),
				      bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), b->data->csum, csum),
				      buf.buf));

			ret = bset_encrypt(c, i, b->written << 9);
			if (bch2_fs_fatal_err_on(ret, c,
					"decrypting btree node: %s", bch2_err_str(ret)))
				goto fsck_err;

			btree_err_on(btree_node_type_is_extents(btree_node_type(b)) &&
				     !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data),
				     -BCH_ERR_btree_node_read_err_incompatible,
				     c, NULL, b, NULL,
				     btree_node_unsupported_version,
				     "btree node does not have NEW_EXTENT_OVERWRITE set");

			sectors = vstruct_sectors(b->data, c->block_bits);
		} else {
			bne = write_block(b);
			i = &bne->keys;

			if (i->seq != b->data->keys.seq)
				break;

			btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)),
				     -BCH_ERR_btree_node_read_err_want_retry,
				     c, ca, b, i,
				     bset_unknown_csum,
				     "unknown checksum type %llu", BSET_CSUM_TYPE(i));

			nonce = btree_nonce(i, b->written << 9);
			struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
			csum_bad = bch2_crc_cmp(bne->csum, csum);
			if (csum_bad)
				bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);

			btree_err_on(csum_bad,
				     -BCH_ERR_btree_node_read_err_want_retry,
				     c, ca, b, i,
				     bset_bad_csum,
				     "%s",
				     (printbuf_reset(&buf),
				      bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), bne->csum, csum),
				      buf.buf));

			ret = bset_encrypt(c, i, b->written << 9);
			if (bch2_fs_fatal_err_on(ret, c,
					"decrypting btree node: %s", bch2_err_str(ret)))
				goto fsck_err;

			sectors = vstruct_sectors(bne, c->block_bits);
		}

		b->version_ondisk = min(b->version_ondisk,
					le16_to_cpu(i->version));

		ret = validate_bset(c, ca, b, i, b->written, sectors,
				    READ, have_retry, saw_error);
		if (ret)
			goto fsck_err;

		if (!b->written)
			btree_node_set_format(b, b->data->format);

		ret = validate_bset_keys(c, b, i, READ, have_retry, saw_error);
		if (ret)
			goto fsck_err;

		SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);

		blacklisted = bch2_journal_seq_is_blacklisted(c,
					le64_to_cpu(i->journal_seq),
					true);

		btree_err_on(blacklisted && first,
			     -BCH_ERR_btree_node_read_err_fixable,
			     c, ca, b, i,
			     bset_blacklisted_journal_seq,
			     "first btree node bset has blacklisted journal seq (%llu)",
			     le64_to_cpu(i->journal_seq));

		btree_err_on(blacklisted && ptr_written,
			     -BCH_ERR_btree_node_read_err_fixable,
			     c, ca, b, i,
			     first_bset_blacklisted_journal_seq,
			     "found blacklisted bset (journal seq %llu) in btree node at offset %u-%u/%u",
			     le64_to_cpu(i->journal_seq),
			     b->written, b->written + sectors, ptr_written);

		b->written += sectors;

		if (blacklisted && !first)
			continue;

		sort_iter_add(iter,
			      vstruct_idx(i, 0),
			      vstruct_last(i));
	}

	if (ptr_written) {
		btree_err_on(b->written < ptr_written,
			     -BCH_ERR_btree_node_read_err_want_retry,
			     c, ca, b, NULL,
			     btree_node_data_missing,
			     "btree node data missing: expected %u sectors, found %u",
			     ptr_written, b->written);
	} else {
		for (bne = write_block(b);
		     bset_byte_offset(b, bne) < btree_buf_bytes(b);
		     bne = (void *) bne + block_bytes(c))
			btree_err_on(bne->keys.seq == b->data->keys.seq &&
				     !bch2_journal_seq_is_blacklisted(c,
								      le64_to_cpu(bne->keys.journal_seq),
								      true),
				     -BCH_ERR_btree_node_read_err_want_retry,
				     c, ca, b, NULL,
				     btree_node_bset_after_end,
				     "found bset signature after last bset");
	}

	sorted = btree_bounce_alloc(c, btree_buf_bytes(b), &used_mempool);
	sorted->keys.u64s = 0;

	set_btree_bset(b, b->set, &b->data->keys);

	b->nr = bch2_key_sort_fix_overlapping(c, &sorted->keys, iter);

	u64s = le16_to_cpu(sorted->keys.u64s);
	*sorted = *b->data;
	sorted->keys.u64s = cpu_to_le16(u64s);
	swap(sorted, b->data);
	set_btree_bset(b, b->set, &b->data->keys);
	b->nsets = 1;

	BUG_ON(b->nr.live_u64s != u64s);

	btree_bounce_free(c, btree_buf_bytes(b), used_mempool, sorted);

	if (updated_range)
		bch2_btree_node_drop_keys_outside_node(b);

	i = &b->data->keys;
	for (k = i->start; k != vstruct_last(i);) {
		struct bkey tmp;
		struct bkey_s u = __bkey_disassemble(b, k, &tmp);

		printbuf_reset(&buf);

		if (bch2_bkey_val_invalid(c, u.s_c, READ, &buf) ||
		    (bch2_inject_invalid_keys &&
		     !bversion_cmp(u.k->version, MAX_VERSION))) {
			printbuf_reset(&buf);

			prt_printf(&buf, "invalid bkey: ");
			bch2_bkey_val_invalid(c, u.s_c, READ, &buf);
			prt_printf(&buf, "\n  ");
			bch2_bkey_val_to_text(&buf, c, u.s_c);

			btree_err(-BCH_ERR_btree_node_read_err_fixable,
				  c, NULL, b, i,
				  btree_node_bad_bkey,
				  "%s", buf.buf);

			btree_keys_account_key_drop(&b->nr, 0, k);

			i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
			memmove_u64s_down(k, bkey_p_next(k),
					  (u64 *) vstruct_end(i) - (u64 *) k);
			set_btree_bset_end(b, b->set);
			continue;
		}

		if (u.k->type == KEY_TYPE_btree_ptr_v2) {
			struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(u);

			bp.v->mem_ptr = 0;
		}

		k = bkey_p_next(k);
	}

	bch2_bset_build_aux_tree(b, b->set, false);

	set_needs_whiteout(btree_bset_first(b), true);

	btree_node_reset_sib_u64s(b);

	bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) {
		struct bch_dev *ca2 = bch_dev_bkey_exists(c, ptr->dev);

		if (ca2->mi.state != BCH_MEMBER_STATE_rw)
			set_btree_node_need_rewrite(b);
	}

	if (!ptr_written)
		set_btree_node_need_rewrite(b);
out:
	mempool_free(iter, &c->fill_iter);
	printbuf_exit(&buf);
	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read_done], start_time);
	return retry_read;
fsck_err:
	if (ret == -BCH_ERR_btree_node_read_err_want_retry ||
	    ret == -BCH_ERR_btree_node_read_err_must_retry) {
		retry_read = 1;
	} else {
		set_btree_node_read_error(b);
		bch2_btree_lost_data(c, b->c.btree_id);
	}
	goto out;
}

static void btree_node_read_work(struct work_struct *work)
{
	struct btree_read_bio *rb =
		container_of(work, struct btree_read_bio, work);
	struct bch_fs *c	= rb->c;
	struct btree *b		= rb->b;
	struct bch_dev *ca	= bch_dev_bkey_exists(c, rb->pick.ptr.dev);
	struct bio *bio		= &rb->bio;
	struct bch_io_failures failed = { .nr = 0 };
	struct printbuf buf = PRINTBUF;
	bool saw_error = false;
	bool retry = false;
	bool can_retry;

	goto start;
	while (1) {
		retry = true;
		bch_info(c, "retrying read");
		ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
		rb->have_ioref		= bch2_dev_get_ioref(ca, READ);
		bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META);
		bio->bi_iter.bi_sector	= rb->pick.ptr.offset;
		bio->bi_iter.bi_size	= btree_buf_bytes(b);

		if (rb->have_ioref) {
			bio_set_dev(bio, ca->disk_sb.bdev);
			submit_bio_wait(bio);
		} else {
			bio->bi_status = BLK_STS_REMOVED;
		}
start:
		printbuf_reset(&buf);
		bch2_btree_pos_to_text(&buf, c, b);
		bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read,
				   "btree read error %s for %s",
				   bch2_blk_status_to_str(bio->bi_status), buf.buf);
		if (rb->have_ioref)
			percpu_ref_put(&ca->io_ref);
		rb->have_ioref = false;

		bch2_mark_io_failure(&failed, &rb->pick);

		can_retry = bch2_bkey_pick_read_device(c,
				bkey_i_to_s_c(&b->key),
				&failed, &rb->pick) > 0;

		if (!bio->bi_status &&
		    !bch2_btree_node_read_done(c, ca, b, can_retry, &saw_error)) {
			if (retry)
				bch_info(c, "retry success");
			break;
		}

		saw_error = true;

		if (!can_retry) {
			set_btree_node_read_error(b);
			bch2_btree_lost_data(c, b->c.btree_id);
			break;
		}
	}

	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read],
			       rb->start_time);
	bio_put(&rb->bio);

	if (saw_error &&
	    !btree_node_read_error(b) &&
	    c->curr_recovery_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes) {
		printbuf_reset(&buf);
		bch2_bpos_to_text(&buf, b->key.k.p);
		bch_err_ratelimited(c, "%s: rewriting btree node at btree=%s level=%u %s due to error",
			 __func__, bch2_btree_id_str(b->c.btree_id), b->c.level, buf.buf);

		bch2_btree_node_rewrite_async(c, b);
	}

	printbuf_exit(&buf);
	clear_btree_node_read_in_flight(b);
	wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
}

static void btree_node_read_endio(struct bio *bio)
{
	struct btree_read_bio *rb =
		container_of(bio, struct btree_read_bio, bio);
	struct bch_fs *c	= rb->c;

	if (rb->have_ioref) {
		struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);

		bch2_latency_acct(ca, rb->start_time, READ);
	}

	queue_work(c->io_complete_wq, &rb->work);
}

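/*
 * Support for bch2_verify_all_btree_replicas: read every replica of a node
 * into bounce buffers, compare the written portions against each other, then
 * run the best copy through the normal bch2_btree_node_read_done() path.
 */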
struct btree_node_read_all {
	struct closure		cl;
	struct bch_fs		*c;
	struct btree		*b;
	unsigned		nr;
	void			*buf[BCH_REPLICAS_MAX];
	struct bio		*bio[BCH_REPLICAS_MAX];
	blk_status_t		err[BCH_REPLICAS_MAX];
};

static unsigned btree_node_sectors_written(struct bch_fs *c, void *data)
{
	struct btree_node *bn = data;
	struct btree_node_entry *bne;
	unsigned offset = 0;

	if (le64_to_cpu(bn->magic) != bset_magic(c))
		return 0;

	while (offset < btree_sectors(c)) {
		if (!offset) {
			offset += vstruct_sectors(bn, c->block_bits);
		} else {
			bne = data + (offset << 9);
			if (bne->keys.seq != bn->keys.seq)
				break;
			offset += vstruct_sectors(bne, c->block_bits);
		}
	}

	return offset;
}

static bool btree_node_has_extra_bsets(struct bch_fs *c, unsigned offset, void *data)
{
	struct btree_node *bn = data;
	struct btree_node_entry *bne;

	if (!offset)
		return false;

	while (offset < btree_sectors(c)) {
		bne = data + (offset << 9);
		if (bne->keys.seq == bn->keys.seq)
			return true;
		offset++;
	}

	return false;
}

static CLOSURE_CALLBACK(btree_node_read_all_replicas_done)
{
	closure_type(ra, struct btree_node_read_all, cl);
	struct bch_fs *c = ra->c;
	struct btree *b = ra->b;
	struct printbuf buf = PRINTBUF;
	bool dump_bset_maps = false;
	bool have_retry = false;
	int ret = 0, best = -1, write = READ;
	unsigned i, written = 0, written2 = 0;
	__le64 seq = b->key.k.type == KEY_TYPE_btree_ptr_v2
		? bkey_i_to_btree_ptr_v2(&b->key)->v.seq : 0;
	bool _saw_error = false, *saw_error = &_saw_error;

	for (i = 0; i < ra->nr; i++) {
		struct btree_node *bn = ra->buf[i];

		if (ra->err[i])
			continue;

		if (le64_to_cpu(bn->magic) != bset_magic(c) ||
		    (seq && seq != bn->keys.seq))
			continue;

		if (best < 0) {
			best = i;
			written = btree_node_sectors_written(c, bn);
			continue;
		}

		written2 = btree_node_sectors_written(c, ra->buf[i]);
		if (btree_err_on(written2 != written, -BCH_ERR_btree_node_read_err_fixable,
				 c, NULL, b, NULL,
				 btree_node_replicas_sectors_written_mismatch,
				 "btree node sectors written mismatch: %u != %u",
				 written, written2) ||
		    btree_err_on(btree_node_has_extra_bsets(c, written2, ra->buf[i]),
				 -BCH_ERR_btree_node_read_err_fixable,
				 c, NULL, b, NULL,
				 btree_node_bset_after_end,
				 "found bset signature after last bset") ||
		    btree_err_on(memcmp(ra->buf[best], ra->buf[i], written << 9),
				 -BCH_ERR_btree_node_read_err_fixable,
				 c, NULL, b, NULL,
				 btree_node_replicas_data_mismatch,
				 "btree node replicas content mismatch"))
			dump_bset_maps = true;

		if (written2 > written) {
			written = written2;
			best = i;
		}
	}
fsck_err:
	if (dump_bset_maps) {
		for (i = 0; i < ra->nr; i++) {
			struct btree_node *bn = ra->buf[i];
			struct btree_node_entry *bne = NULL;
			unsigned offset = 0, sectors;
			bool gap = false;

			if (ra->err[i])
				continue;

			printbuf_reset(&buf);

			while (offset < btree_sectors(c)) {
				if (!offset) {
					sectors = vstruct_sectors(bn, c->block_bits);
				} else {
					bne = ra->buf[i] + (offset << 9);
					if (bne->keys.seq != bn->keys.seq)
						break;
					sectors = vstruct_sectors(bne, c->block_bits);
				}

				prt_printf(&buf, " %u-%u", offset, offset + sectors);
				if (bne && bch2_journal_seq_is_blacklisted(c,
							le64_to_cpu(bne->keys.journal_seq), false))
					prt_printf(&buf, "*");
				offset += sectors;
			}

			while (offset < btree_sectors(c)) {
				bne = ra->buf[i] + (offset << 9);
				if (bne->keys.seq == bn->keys.seq) {
					if (!gap)
						prt_printf(&buf, " GAP");
					gap = true;

					sectors = vstruct_sectors(bne, c->block_bits);
					prt_printf(&buf, " %u-%u", offset, offset + sectors);
					if (bch2_journal_seq_is_blacklisted(c,
							le64_to_cpu(bne->keys.journal_seq), false))
						prt_printf(&buf, "*");
				}
				offset++;
			}

			bch_err(c, "replica %u:%s", i, buf.buf);
		}
	}

	if (best >= 0) {
		memcpy(b->data, ra->buf[best], btree_buf_bytes(b));
		ret = bch2_btree_node_read_done(c, NULL, b, false, saw_error);
	} else {
		ret = -1;
	}

	if (ret) {
		set_btree_node_read_error(b);
		bch2_btree_lost_data(c, b->c.btree_id);
	} else if (*saw_error)
		bch2_btree_node_rewrite_async(c, b);

	for (i = 0; i < ra->nr; i++) {
		mempool_free(ra->buf[i], &c->btree_bounce_pool);
		bio_put(ra->bio[i]);
	}

	closure_debug_destroy(&ra->cl);
	kfree(ra);
	printbuf_exit(&buf);

	clear_btree_node_read_in_flight(b);
	wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
}

static void btree_node_read_all_replicas_endio(struct bio *bio)
{
	struct btree_read_bio *rb =
		container_of(bio, struct btree_read_bio, bio);
	struct bch_fs *c	= rb->c;
	struct btree_node_read_all *ra = rb->ra;

	if (rb->have_ioref) {
		struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);

		bch2_latency_acct(ca, rb->start_time, READ);
	}

	ra->err[rb->idx] = bio->bi_status;
	closure_put(&ra->cl);
}

/*
 * XXX This allocates multiple times from the same mempools, and can deadlock
 * under sufficient memory pressure (but is only a debug path)
 */
static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool sync)
{
	struct bkey_s_c k = bkey_i_to_s_c(&b->key);
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded pick;
	struct btree_node_read_all *ra;
	unsigned i;

	ra = kzalloc(sizeof(*ra), GFP_NOFS);
	if (!ra)
		return -BCH_ERR_ENOMEM_btree_node_read_all_replicas;

	closure_init(&ra->cl, NULL);
	ra->c	= c;
	ra->b	= b;
	ra->nr	= bch2_bkey_nr_ptrs(k);

	for (i = 0; i < ra->nr; i++) {
		ra->buf[i] = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS);
		ra->bio[i] = bio_alloc_bioset(NULL,
					      buf_pages(ra->buf[i], btree_buf_bytes(b)),
					      REQ_OP_READ|REQ_SYNC|REQ_META,
					      GFP_NOFS,
					      &c->btree_bio);
	}

	i = 0;
	bkey_for_each_ptr_decode(k.k, ptrs, pick, entry) {
		struct bch_dev *ca = bch_dev_bkey_exists(c, pick.ptr.dev);
		struct btree_read_bio *rb =
			container_of(ra->bio[i], struct btree_read_bio, bio);
		rb->c			= c;
		rb->b			= b;
		rb->ra			= ra;
		rb->start_time		= local_clock();
		rb->have_ioref		= bch2_dev_get_ioref(ca, READ);
		rb->idx			= i;
		rb->pick		= pick;
		rb->bio.bi_iter.bi_sector = pick.ptr.offset;
		rb->bio.bi_end_io	= btree_node_read_all_replicas_endio;
		bch2_bio_map(&rb->bio, ra->buf[i], btree_buf_bytes(b));

		if (rb->have_ioref) {
			this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree],
				     bio_sectors(&rb->bio));
			bio_set_dev(&rb->bio, ca->disk_sb.bdev);

			closure_get(&ra->cl);
			submit_bio(&rb->bio);
		} else {
			ra->err[i] = BLK_STS_REMOVED;
		}

		i++;
	}

	if (sync) {
		closure_sync(&ra->cl);
		btree_node_read_all_replicas_done(&ra->cl.work);
	} else {
		continue_at(&ra->cl, btree_node_read_all_replicas_done,
			    c->io_complete_wq);
	}

	return 0;
}

void bch2_btree_node_read(struct btree_trans *trans, struct btree *b,
			  bool sync)
{
	struct bch_fs *c = trans->c;
	struct extent_ptr_decoded pick;
	struct btree_read_bio *rb;
	struct bch_dev *ca;
	struct bio *bio;
	int ret;

	trace_and_count(c, btree_node_read, trans, b);

	if (bch2_verify_all_btree_replicas &&
	    !btree_node_read_all_replicas(c, b, sync))
		return;

	ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key),
					 NULL, &pick);

	if (ret <= 0) {
		struct printbuf buf = PRINTBUF;

		prt_str(&buf, "btree node read error: no device to read from\n at ");
		bch2_btree_pos_to_text(&buf, c, b);
		bch_err_ratelimited(c, "%s", buf.buf);

		if (c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology) &&
		    c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology)
			bch2_fatal_error(c);

		set_btree_node_read_error(b);
		bch2_btree_lost_data(c, b->c.btree_id);
		clear_btree_node_read_in_flight(b);
		wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
		printbuf_exit(&buf);
		return;
	}

	ca = bch_dev_bkey_exists(c, pick.ptr.dev);

	bio = bio_alloc_bioset(NULL,
			       buf_pages(b->data, btree_buf_bytes(b)),
			       REQ_OP_READ|REQ_SYNC|REQ_META,
			       GFP_NOFS,
			       &c->btree_bio);
	rb = container_of(bio, struct btree_read_bio, bio);
	rb->c			= c;
	rb->b			= b;
	rb->ra			= NULL;
	rb->start_time		= local_clock();
	rb->have_ioref		= bch2_dev_get_ioref(ca, READ);
	rb->pick		= pick;
	INIT_WORK(&rb->work, btree_node_read_work);
	bio->bi_iter.bi_sector	= pick.ptr.offset;
	bio->bi_end_io		= btree_node_read_endio;
	bch2_bio_map(bio, b->data, btree_buf_bytes(b));

	if (rb->have_ioref) {
		this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree],
			     bio_sectors(bio));
		bio_set_dev(bio, ca->disk_sb.bdev);

		if (sync) {
			submit_bio_wait(bio);
			bch2_latency_acct(ca, rb->start_time, READ);
			btree_node_read_work(&rb->work);
		} else {
			submit_bio(bio);
		}
	} else {
		bio->bi_status = BLK_STS_REMOVED;

		if (sync)
			btree_node_read_work(&rb->work);
		else
			queue_work(c->io_complete_wq, &rb->work);
	}
}

static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id,
				  const struct bkey_i *k, unsigned level)
{
	struct bch_fs *c = trans->c;
	struct closure cl;
	struct btree *b;
	int ret;

	closure_init_stack(&cl);

	do {
		ret = bch2_btree_cache_cannibalize_lock(trans, &cl);
		closure_sync(&cl);
	} while (ret);

	b = bch2_btree_node_mem_alloc(trans, level != 0);
	bch2_btree_cache_cannibalize_unlock(trans);

	BUG_ON(IS_ERR(b));

	bkey_copy(&b->key, k);
	BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, id));

	set_btree_node_read_in_flight(b);

	bch2_btree_node_read(trans, b, true);

	if (btree_node_read_error(b)) {
		bch2_btree_node_hash_remove(&c->btree_cache, b);

		mutex_lock(&c->btree_cache.lock);
		list_move(&b->list, &c->btree_cache.freeable);
		mutex_unlock(&c->btree_cache.lock);

		ret = -BCH_ERR_btree_node_read_error;
		goto err;
	}

	bch2_btree_set_root_for_read(c, b);
err:
	six_unlock_write(&b->c.lock);
	six_unlock_intent(&b->c.lock);

	return ret;
}

int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
			const struct bkey_i *k, unsigned level)
{
	return bch2_trans_run(c, __bch2_btree_root_read(trans, id, k, level));
}

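/*
 * b->will_make_reachable packs a pointer to the pending btree_update in the
 * high bits with a flag in bit 0; completing the write clears the flag and,
 * if it was set, drops the closure ref the btree_update was holding, then
 * drops this write's journal pin.
 */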
1775static void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
1776				      struct btree_write *w)
1777{
1778	unsigned long old, new, v = READ_ONCE(b->will_make_reachable);
1779
1780	do {
1781		old = new = v;
1782		if (!(old & 1))
1783			break;
1784
1785		new &= ~1UL;
1786	} while ((v = cmpxchg(&b->will_make_reachable, old, new)) != old);
1787
1788	if (old & 1)
1789		closure_put(&((struct btree_update *) new)->cl);
1790
1791	bch2_journal_pin_drop(&c->journal, &w->journal);
1792}
1793
1794static void __btree_node_write_done(struct bch_fs *c, struct btree *b)
1795{
1796	struct btree_write *w = btree_prev_write(b);
1797	unsigned long old, new, v;
1798	unsigned type = 0;
1799
1800	bch2_btree_complete_write(c, b, w);
1801
1802	v = READ_ONCE(b->flags);
1803	do {
1804		old = new = v;
1805
1806		if ((old & (1U << BTREE_NODE_dirty)) &&
1807		    (old & (1U << BTREE_NODE_need_write)) &&
1808		    !(old & (1U << BTREE_NODE_never_write)) &&
1809		    !(old & (1U << BTREE_NODE_write_blocked)) &&
1810		    !(old & (1U << BTREE_NODE_will_make_reachable))) {
1811			new &= ~(1U << BTREE_NODE_dirty);
1812			new &= ~(1U << BTREE_NODE_need_write);
1813			new |=  (1U << BTREE_NODE_write_in_flight);
1814			new |=  (1U << BTREE_NODE_write_in_flight_inner);
1815			new |=  (1U << BTREE_NODE_just_written);
1816			new ^=  (1U << BTREE_NODE_write_idx);
1817
1818			type = new & BTREE_WRITE_TYPE_MASK;
1819			new &= ~BTREE_WRITE_TYPE_MASK;
1820		} else {
1821			new &= ~(1U << BTREE_NODE_write_in_flight);
1822			new &= ~(1U << BTREE_NODE_write_in_flight_inner);
1823		}
1824	} while ((v = cmpxchg(&b->flags, old, new)) != old);
1825
1826	if (new & (1U << BTREE_NODE_write_in_flight))
1827		__bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED|type);
1828	else
1829		wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
1830}
1831
1832static void btree_node_write_done(struct bch_fs *c, struct btree *b)
1833{
1834	struct btree_trans *trans = bch2_trans_get(c);
1835
1836	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
1837	__btree_node_write_done(c, b);
1838	six_unlock_read(&b->c.lock);
1839
1840	bch2_trans_put(trans);
1841}
1842
1843static void btree_node_write_work(struct work_struct *work)
1844{
1845	struct btree_write_bio *wbio =
1846		container_of(work, struct btree_write_bio, work);
1847	struct bch_fs *c	= wbio->wbio.c;
1848	struct btree *b		= wbio->wbio.bio.bi_private;
1849	struct bch_extent_ptr *ptr;
1850	int ret = 0;
1851
1852	btree_bounce_free(c,
1853		wbio->data_bytes,
1854		wbio->wbio.used_mempool,
1855		wbio->data);
1856
1857	bch2_bkey_drop_ptrs(bkey_i_to_s(&wbio->key), ptr,
1858		bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev));
1859
1860	if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key))) {
1861		ret = -BCH_ERR_btree_node_write_all_failed;
1862		goto err;
1863	}
1864
1865	if (wbio->wbio.first_btree_write) {
1866		if (wbio->wbio.failed.nr) {
1867
1868		}
1869	} else {
1870		ret = bch2_trans_do(c, NULL, NULL, 0,
1871			bch2_btree_node_update_key_get_iter(trans, b, &wbio->key,
1872					BCH_WATERMARK_interior_updates|
1873					BCH_TRANS_COMMIT_journal_reclaim|
1874					BCH_TRANS_COMMIT_no_enospc|
1875					BCH_TRANS_COMMIT_no_check_rw,
1876					!wbio->wbio.failed.nr));
1877		if (ret)
1878			goto err;
1879	}
1880out:
1881	bio_put(&wbio->wbio.bio);
1882	btree_node_write_done(c, b);
1883	return;
1884err:
1885	set_btree_node_noevict(b);
1886	bch2_fs_fatal_err_on(!bch2_err_matches(ret, EROFS), c,
1887			     "writing btree node: %s", bch2_err_str(ret));
1888	goto out;
1889}
1890
static void btree_node_write_endio(struct bio *bio)
{
	struct bch_write_bio *wbio	= to_wbio(bio);
	struct bch_write_bio *parent	= wbio->split ? wbio->parent : NULL;
	struct bch_write_bio *orig	= parent ?: wbio;
	struct btree_write_bio *wb	= container_of(orig, struct btree_write_bio, wbio);
	struct bch_fs *c		= wbio->c;
	struct btree *b			= wbio->bio.bi_private;
	struct bch_dev *ca		= bch_dev_bkey_exists(c, wbio->dev);
	unsigned long flags;

	if (wbio->have_ioref)
		bch2_latency_acct(ca, wbio->submit_time, WRITE);

	if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
			       "btree write error: %s",
			       bch2_blk_status_to_str(bio->bi_status)) ||
	    bch2_meta_write_fault("btree")) {
		spin_lock_irqsave(&c->btree_write_error_lock, flags);
		bch2_dev_list_add_dev(&orig->failed, wbio->dev);
		spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
	}

	if (wbio->have_ioref)
		percpu_ref_put(&ca->io_ref);

	if (parent) {
		bio_put(bio);
		bio_endio(&parent->bio);
		return;
	}

	clear_btree_node_write_in_flight_inner(b);
	wake_up_bit(&b->flags, BTREE_NODE_write_in_flight_inner);
	INIT_WORK(&wb->work, btree_node_write_work);
	queue_work(c->btree_io_complete_wq, &wb->work);
}

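/*
 * Final sanity check before a bset goes out to disk: an invalid key or bset
 * at this point indicates a bug, so flag the filesystem inconsistent rather
 * than writing out bad metadata.
 */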
static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
				   struct bset *i, unsigned sectors)
{
	struct printbuf buf = PRINTBUF;
	bool saw_error;
	int ret;

	ret = bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key),
				BKEY_TYPE_btree, WRITE, &buf);

	if (ret)
		bch2_fs_inconsistent(c, "invalid btree node key before write: %s", buf.buf);
	printbuf_exit(&buf);
	if (ret)
		return ret;

	ret = validate_bset_keys(c, b, i, WRITE, false, &saw_error) ?:
		validate_bset(c, NULL, b, i, b->written, sectors, WRITE, false, &saw_error);
	if (ret) {
		bch2_inconsistent_error(c);
		dump_stack();
	}

	return ret;
}

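/*
 * Runs from a workqueue: adjust each pointer by this write's offset within
 * the node, so appended bsets land after the sectors already written, then
 * submit the bio to every replica.
 */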
static void btree_write_submit(struct work_struct *work)
{
	struct btree_write_bio *wbio = container_of(work, struct btree_write_bio, work);
	BKEY_PADDED_ONSTACK(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;

	bkey_copy(&tmp.k, &wbio->key);

	bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&tmp.k)), ptr)
		ptr->offset += wbio->sector_offset;

	bch2_submit_wbio_replicas(&wbio->wbio, wbio->wbio.c, BCH_DATA_btree,
				  &tmp.k, false);
}

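/*
 * Write out the dirty portion of a btree node: atomically claim the write via
 * the node's flags, sort the unwritten bsets (plus whiteouts) into a bounce
 * buffer, checksum/encrypt and validate the result, then submit it
 * asynchronously to all replicas.
 */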
void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)
{
	struct btree_write_bio *wbio;
	struct bset_tree *t;
	struct bset *i;
	struct btree_node *bn = NULL;
	struct btree_node_entry *bne = NULL;
	struct sort_iter_stack sort_iter;
	struct nonce nonce;
	unsigned bytes_to_write, sectors_to_write, bytes, u64s;
	u64 seq = 0;
	bool used_mempool;
	unsigned long old, new;
	bool validate_before_checksum = false;
	enum btree_write_type type = flags & BTREE_WRITE_TYPE_MASK;
	void *data;
	int ret;

	if (flags & BTREE_WRITE_ALREADY_STARTED)
		goto do_write;

	/*
	 * We may only have a read lock on the btree node - the dirty bit is our
	 * "lock" against racing with other threads that may be trying to start
	 * a write; we do a write iff we clear the dirty bit. Since setting the
	 * dirty bit requires a write lock, we can't race with other threads
	 * redirtying it:
	 */
	do {
		old = new = READ_ONCE(b->flags);

		if (!(old & (1 << BTREE_NODE_dirty)))
			return;

		if ((flags & BTREE_WRITE_ONLY_IF_NEED) &&
		    !(old & (1 << BTREE_NODE_need_write)))
			return;

		if (old &
		    ((1 << BTREE_NODE_never_write)|
		     (1 << BTREE_NODE_write_blocked)))
			return;

		if (b->written &&
		    (old & (1 << BTREE_NODE_will_make_reachable)))
			return;

		if (old & (1 << BTREE_NODE_write_in_flight))
			return;

		if (flags & BTREE_WRITE_ONLY_IF_NEED)
			type = new & BTREE_WRITE_TYPE_MASK;
		new &= ~BTREE_WRITE_TYPE_MASK;

		new &= ~(1 << BTREE_NODE_dirty);
		new &= ~(1 << BTREE_NODE_need_write);
		new |=  (1 << BTREE_NODE_write_in_flight);
		new |=  (1 << BTREE_NODE_write_in_flight_inner);
		new |=  (1 << BTREE_NODE_just_written);
		new ^=  (1 << BTREE_NODE_write_idx);
	} while (cmpxchg_acquire(&b->flags, old, new) != old);

	if (new & (1U << BTREE_NODE_need_write))
		return;
do_write:
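	/*
	 * From here on we own the write: every path out of this function must
	 * end up in __btree_node_write_done() (directly on the error paths
	 * below, or via the IO completion path) so write_in_flight gets
	 * cleared again.
	 */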
	BUG_ON((type == BTREE_WRITE_initial) != (b->written == 0));

	atomic_dec(&c->btree_cache.dirty);

	BUG_ON(btree_node_fake(b));
	BUG_ON((b->will_make_reachable != 0) != !b->written);

	BUG_ON(b->written >= btree_sectors(c));
	BUG_ON(b->written & (block_sectors(c) - 1));
	BUG_ON(bset_written(b, btree_bset_last(b)));
	BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c));
	BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format)));

	bch2_sort_whiteouts(c, b);

	sort_iter_stack_init(&sort_iter, b);

	bytes = !b->written
		? sizeof(struct btree_node)
		: sizeof(struct btree_node_entry);

	bytes += b->whiteout_u64s * sizeof(u64);

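	/*
	 * Size the bounce buffer: account for every bset that hasn't been
	 * written yet, and remember the newest journal sequence number
	 * covering the keys we're about to write:
	 */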
	for_each_bset(b, t) {
		i = bset(b, t);

		if (bset_written(b, i))
			continue;

		bytes += le16_to_cpu(i->u64s) * sizeof(u64);
		sort_iter_add(&sort_iter.iter,
			      btree_bkey_first(b, t),
			      btree_bkey_last(b, t));
		seq = max(seq, le64_to_cpu(i->journal_seq));
	}

	BUG_ON(b->written && !seq);

	/* bch2_varint_decode may read up to 7 bytes past the end of the buffer: */
	bytes += 8;

	/* buffer must be a multiple of the block size */
	bytes = round_up(bytes, block_bytes(c));

	data = btree_bounce_alloc(c, bytes, &used_mempool);

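	/*
	 * A node's first write starts with the full btree_node header;
	 * subsequent writes append a btree_node_entry after the sectors
	 * already written:
	 */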
	if (!b->written) {
		bn = data;
		*bn = *b->data;
		i = &bn->keys;
	} else {
		bne = data;
		bne->keys = b->data->keys;
		i = &bne->keys;
	}

	i->journal_seq	= cpu_to_le64(seq);
	i->u64s		= 0;

	sort_iter_add(&sort_iter.iter,
		      unwritten_whiteouts_start(b),
		      unwritten_whiteouts_end(b));
	SET_BSET_SEPARATE_WHITEOUTS(i, false);

	b->whiteout_u64s = 0;

	u64s = bch2_sort_keys(i->start, &sort_iter.iter, false);
	le16_add_cpu(&i->u64s, u64s);

	BUG_ON(!b->written && i->u64s != b->data->keys.u64s);

	set_needs_whiteout(i, false);

	/* do we have data to write? */
	if (b->written && !i->u64s)
		goto nowrite;

	bytes_to_write = vstruct_end(i) - data;
	sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9;

	if (!b->written &&
	    b->key.k.type == KEY_TYPE_btree_ptr_v2)
		BUG_ON(btree_ptr_sectors_written(&b->key) != sectors_to_write);

	memset(data + bytes_to_write, 0,
	       (sectors_to_write << 9) - bytes_to_write);

	BUG_ON(b->written + sectors_to_write > btree_sectors(c));
	BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN);
	BUG_ON(i->seq != b->data->keys.seq);

	i->version = cpu_to_le16(c->sb.version);
	SET_BSET_OFFSET(i, b->written);
	SET_BSET_CSUM_TYPE(i, bch2_meta_checksum_type(c));

	if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)))
		validate_before_checksum = true;

	/* validate_bset will be modifying: */
	if (le16_to_cpu(i->version) < bcachefs_metadata_version_current)
		validate_before_checksum = true;

	/* if we're going to be encrypting, check metadata validity first: */
	if (validate_before_checksum &&
	    validate_bset_for_write(c, b, i, sectors_to_write))
		goto err;

	ret = bset_encrypt(c, i, b->written << 9);
	if (bch2_fs_fatal_err_on(ret, c,
			"encrypting btree node: %s", bch2_err_str(ret)))
		goto err;

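	/*
	 * Checksum the (now possibly encrypted) contents; the nonce is derived
	 * from the bset and its sector offset within the node:
	 */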
	nonce = btree_nonce(i, b->written << 9);

	if (bn)
		bn->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bn);
	else
		bne->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);

	/* if we're not encrypting, check metadata after checksumming: */
	if (!validate_before_checksum &&
	    validate_bset_for_write(c, b, i, sectors_to_write))
		goto err;

	/*
	 * We handle btree write errors by immediately halting the journal -
	 * after we've done that, we can't issue any subsequent btree writes
	 * because they might have pointers to new nodes that failed to write.
	 *
	 * Furthermore, there's no point in doing any more btree writes because
	 * with the journal stopped, we're never going to update the journal to
	 * reflect that those writes were done and the data flushed from the
	 * journal:
	 *
	 * Also on journal error, the pending write may have updates that were
	 * never journalled (interior nodes, see btree_update_nodes_written()) -
	 * it's critical that we don't do the write in that case otherwise we
	 * will have updates visible that weren't in the journal:
	 *
	 * Make sure to update b->written so bch2_btree_init_next() doesn't
	 * break:
	 */
	if (bch2_journal_error(&c->journal) ||
	    c->opts.nochanges)
		goto err;

	trace_and_count(c, btree_node_write, b, bytes_to_write, sectors_to_write);

	wbio = container_of(bio_alloc_bioset(NULL,
				buf_pages(data, sectors_to_write << 9),
				REQ_OP_WRITE|REQ_META,
				GFP_NOFS,
				&c->btree_bio),
			    struct btree_write_bio, wbio.bio);
	wbio_init(&wbio->wbio.bio);
	wbio->data			= data;
	wbio->data_bytes		= bytes;
	wbio->sector_offset		= b->written;
	wbio->wbio.c			= c;
	wbio->wbio.used_mempool		= used_mempool;
	wbio->wbio.first_btree_write	= !b->written;
	wbio->wbio.bio.bi_end_io	= btree_node_write_endio;
	wbio->wbio.bio.bi_private	= b;

	bch2_bio_map(&wbio->wbio.bio, data, sectors_to_write << 9);

	bkey_copy(&wbio->key, &b->key);

	b->written += sectors_to_write;

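	/*
	 * btree_ptr_v2 keys track how many sectors of the node have been
	 * written, which the read and validate paths can check against:
	 */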
	if (wbio->key.k.type == KEY_TYPE_btree_ptr_v2)
		bkey_i_to_btree_ptr_v2(&wbio->key)->v.sectors_written =
			cpu_to_le16(b->written);

	atomic64_inc(&c->btree_write_stats[type].nr);
	atomic64_add(bytes_to_write, &c->btree_write_stats[type].bytes);

	INIT_WORK(&wbio->work, btree_write_submit);
	queue_work(c->io_complete_wq, &wbio->work);
	return;
err:
	set_btree_node_noevict(b);
	b->written += sectors_to_write;
nowrite:
	btree_bounce_free(c, bytes, used_mempool, data);
	__btree_node_write_done(c, b);
}

/*
 * Work that must be done with write lock held:
 */
bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b)
{
	bool invalidated_iter = false;
	struct btree_node_entry *bne;
	struct bset_tree *t;

	if (!btree_node_just_written(b))
		return false;

	BUG_ON(b->whiteout_u64s);

	clear_btree_node_just_written(b);

	/*
	 * Note: immediately after write, bset_written() doesn't work - the
	 * amount of data we had to write after compaction might have been
	 * smaller than the offset of the last bset.
	 *
	 * However, we know that all bsets have been written here, as long as
	 * we're still holding the write lock:
	 */

	/*
	 * XXX: decide if we really want to unconditionally sort down to a
	 * single bset:
	 */
	if (b->nsets > 1) {
		btree_node_sort(c, b, 0, b->nsets, true);
		invalidated_iter = true;
	} else {
		invalidated_iter = bch2_drop_whiteouts(b, COMPACT_ALL);
	}

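	/*
	 * Everything in the node is now on disk, so from here on deleting any
	 * of these keys requires writing a whiteout:
	 */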
	for_each_bset(b, t)
		set_needs_whiteout(bset(b, t), true);

	bch2_btree_verify(c, b);

	/*
	 * If later we don't unconditionally sort down to a single bset, we have
	 * to ensure this is still true:
	 */
	BUG_ON((void *) btree_bkey_last(b, bset_tree_last(b)) > write_block(b));

	bne = want_new_bset(c, b);
	if (bne)
		bch2_bset_init_next(b, bne);

	bch2_btree_build_aux_trees(b);

	return invalidated_iter;
}

/*
 * Use this one if the caller already holds a lock on the node (read, intent
 * or write):
 */
void bch2_btree_node_write(struct bch_fs *c, struct btree *b,
			   enum six_lock_type lock_type_held,
			   unsigned flags)
{
	if (lock_type_held == SIX_LOCK_intent ||
	    (lock_type_held == SIX_LOCK_read &&
	     six_lock_tryupgrade(&b->c.lock))) {
		__bch2_btree_node_write(c, b, flags);

		/* don't cycle lock unnecessarily: */
		if (btree_node_just_written(b) &&
		    six_trylock_write(&b->c.lock)) {
			bch2_btree_post_write_cleanup(c, b);
			six_unlock_write(&b->c.lock);
		}

		if (lock_type_held == SIX_LOCK_read)
			six_lock_downgrade(&b->c.lock);
	} else {
		__bch2_btree_node_write(c, b, flags);
		if (lock_type_held == SIX_LOCK_write &&
		    btree_node_just_written(b))
			bch2_btree_post_write_cleanup(c, b);
	}
}

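/*
 * Walk the btree node hash table under RCU and wait on any node that has the
 * given flag bit set; since waiting means dropping the RCU read lock, restart
 * the scan from the beginning after every wait.
 */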
static bool __bch2_btree_flush_all(struct bch_fs *c, unsigned flag)
{
	struct bucket_table *tbl;
	struct rhash_head *pos;
	struct btree *b;
	unsigned i;
	bool ret = false;
restart:
	rcu_read_lock();
	for_each_cached_btree(b, c, tbl, i, pos)
		if (test_bit(flag, &b->flags)) {
			rcu_read_unlock();
			wait_on_bit_io(&b->flags, flag, TASK_UNINTERRUPTIBLE);
			ret = true;
			goto restart;
		}
	rcu_read_unlock();

	return ret;
}

bool bch2_btree_flush_all_reads(struct bch_fs *c)
{
	return __bch2_btree_flush_all(c, BTREE_NODE_read_in_flight);
}

bool bch2_btree_flush_all_writes(struct bch_fs *c)
{
	return __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight);
}

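/*
 * Human-readable names for each btree write type, generated from the
 * BCH_BTREE_WRITE_TYPES() x-macro; used by the stats output below.
 */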
static const char * const bch2_btree_write_types[] = {
#define x(t, n) [n] = #t,
	BCH_BTREE_WRITE_TYPES()
	NULL
};

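/* The "size" column is the average size per write (total bytes / nr): */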
void bch2_btree_write_stats_to_text(struct printbuf *out, struct bch_fs *c)
{
	printbuf_tabstop_push(out, 20);
	printbuf_tabstop_push(out, 10);

	prt_tab(out);
	prt_str(out, "nr");
	prt_tab(out);
	prt_str(out, "size");
	prt_newline(out);

	for (unsigned i = 0; i < BTREE_WRITE_TYPE_NR; i++) {
		u64 nr		= atomic64_read(&c->btree_write_stats[i].nr);
		u64 bytes	= atomic64_read(&c->btree_write_stats[i].bytes);

		prt_printf(out, "%s:", bch2_btree_write_types[i]);
		prt_tab(out);
		prt_u64(out, nr);
		prt_tab(out);
		prt_human_readable_u64(out, nr ? div64_u64(bytes, nr) : 0);
		prt_newline(out);
	}
}
