1// SPDX-License-Identifier: GPL-2.0
2
3#include "bcachefs.h"
4#include "bkey_buf.h"
5#include "btree_cache.h"
6#include "btree_update.h"
7#include "buckets.h"
8#include "darray.h"
9#include "dirent.h"
10#include "error.h"
11#include "fs-common.h"
12#include "fsck.h"
13#include "inode.h"
14#include "keylist.h"
15#include "recovery_passes.h"
16#include "snapshot.h"
17#include "super.h"
18#include "xattr.h"
19
20#include <linux/bsearch.h>
21#include <linux/dcache.h> /* struct qstr */
22
23/*
24 * XXX: this is handling transaction restarts without returning
25 * -BCH_ERR_transaction_restart_nested, this is not how we do things anymore:
26 */
27static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum,
28				    u32 snapshot)
29{
30	u64 sectors = 0;
31
32	int ret = for_each_btree_key_upto(trans, iter, BTREE_ID_extents,
33				SPOS(inum, 0, snapshot),
34				POS(inum, U64_MAX),
35				0, k, ({
36		if (bkey_extent_is_allocation(k.k))
37			sectors += k.k->size;
38		0;
39	}));
40
41	return ret ?: sectors;
42}
43
44static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum,
45				    u32 snapshot)
46{
47	u64 subdirs = 0;
48
49	int ret = for_each_btree_key_upto(trans, iter, BTREE_ID_dirents,
50				    SPOS(inum, 0, snapshot),
51				    POS(inum, U64_MAX),
52				    0, k, ({
53		if (k.k->type == KEY_TYPE_dirent &&
54		    bkey_s_c_to_dirent(k).v->d_type == DT_DIR)
55			subdirs++;
56		0;
57	}));
58
59	return ret ?: subdirs;
60}
61
62static int subvol_lookup(struct btree_trans *trans, u32 subvol,
63			 u32 *snapshot, u64 *inum)
64{
65	struct bch_subvolume s;
66	int ret = bch2_subvolume_get(trans, subvol, false, 0, &s);
67
68	*snapshot = le32_to_cpu(s.snapshot);
69	*inum = le64_to_cpu(s.inode);
70	return ret;
71}
72
73static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr,
74			      struct bch_inode_unpacked *inode)
75{
76	struct btree_iter iter;
77	struct bkey_s_c k;
78	int ret;
79
80	bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
81			     POS(0, inode_nr),
82			     BTREE_ITER_ALL_SNAPSHOTS);
83	k = bch2_btree_iter_peek(&iter);
84	ret = bkey_err(k);
85	if (ret)
86		goto err;
87
88	if (!k.k || !bkey_eq(k.k->p, POS(0, inode_nr))) {
89		ret = -BCH_ERR_ENOENT_inode;
90		goto err;
91	}
92
93	ret = bch2_inode_unpack(k, inode);
94err:
95	bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr);
96	bch2_trans_iter_exit(trans, &iter);
97	return ret;
98}
99
100static int lookup_inode(struct btree_trans *trans, u64 inode_nr,
101			struct bch_inode_unpacked *inode,
102			u32 *snapshot)
103{
104	struct btree_iter iter;
105	struct bkey_s_c k;
106	int ret;
107
108	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
109			       SPOS(0, inode_nr, *snapshot), 0);
110	ret = bkey_err(k);
111	if (ret)
112		goto err;
113
114	ret = bkey_is_inode(k.k)
115		? bch2_inode_unpack(k, inode)
116		: -BCH_ERR_ENOENT_inode;
117	if (!ret)
118		*snapshot = iter.pos.snapshot;
119err:
120	bch2_trans_iter_exit(trans, &iter);
121	return ret;
122}
123
124static int lookup_dirent_in_snapshot(struct btree_trans *trans,
125			   struct bch_hash_info hash_info,
126			   subvol_inum dir, struct qstr *name,
127			   u64 *target, unsigned *type, u32 snapshot)
128{
129	struct btree_iter iter;
130	struct bkey_s_c_dirent d;
131	int ret = bch2_hash_lookup_in_snapshot(trans, &iter, bch2_dirent_hash_desc,
132			       &hash_info, dir, name, 0, snapshot);
133	if (ret)
134		return ret;
135
136	d = bkey_s_c_to_dirent(bch2_btree_iter_peek_slot(&iter));
137	*target = le64_to_cpu(d.v->d_inum);
138	*type = d.v->d_type;
139	bch2_trans_iter_exit(trans, &iter);
140	return 0;
141}
142
143static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
144{
145	struct bch_fs *c = trans->c;
146	struct btree_iter iter;
147	struct bch_inode_unpacked dir_inode;
148	struct bch_hash_info dir_hash_info;
149	int ret;
150
151	ret = lookup_first_inode(trans, pos.inode, &dir_inode);
152	if (ret)
153		goto err;
154
155	dir_hash_info = bch2_hash_info_init(c, &dir_inode);
156
157	bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_INTENT);
158
159	ret =   bch2_btree_iter_traverse(&iter) ?:
160		bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
161				    &dir_hash_info, &iter,
162				    BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
163	bch2_trans_iter_exit(trans, &iter);
164err:
165	bch_err_fn(c, ret);
166	return ret;
167}
168
169/* Get lost+found, create if it doesn't exist: */
170static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
171			    struct bch_inode_unpacked *lostfound,
172			    u64 reattaching_inum)
173{
174	struct bch_fs *c = trans->c;
175	struct qstr lostfound_str = QSTR("lost+found");
176	u64 inum = 0;
177	unsigned d_type = 0;
178	int ret;
179
180	struct bch_snapshot_tree st;
181	ret = bch2_snapshot_tree_lookup(trans,
182			bch2_snapshot_tree(c, snapshot), &st);
183	if (ret)
184		return ret;
185
186	subvol_inum root_inum = { .subvol = le32_to_cpu(st.master_subvol) };
187
188	struct bch_subvolume subvol;
189	ret = bch2_subvolume_get(trans, le32_to_cpu(st.master_subvol),
190				 false, 0, &subvol);
191	bch_err_msg(c, ret, "looking up root subvol %u for snapshot %u",
192		    le32_to_cpu(st.master_subvol), snapshot);
193	if (ret)
194		return ret;
195
196	if (!subvol.inode) {
197		struct btree_iter iter;
198		struct bkey_i_subvolume *subvol = bch2_bkey_get_mut_typed(trans, &iter,
199				BTREE_ID_subvolumes, POS(0, le32_to_cpu(st.master_subvol)),
200				0, subvolume);
201		ret = PTR_ERR_OR_ZERO(subvol);
202		if (ret)
203			return ret;
204
205		subvol->v.inode = cpu_to_le64(reattaching_inum);
206		bch2_trans_iter_exit(trans, &iter);
207	}
208
209	root_inum.inum = le64_to_cpu(subvol.inode);
210
211	struct bch_inode_unpacked root_inode;
212	struct bch_hash_info root_hash_info;
213	u32 root_inode_snapshot = snapshot;
214	ret = lookup_inode(trans, root_inum.inum, &root_inode, &root_inode_snapshot);
215	bch_err_msg(c, ret, "looking up root inode %llu for subvol %u",
216		    root_inum.inum, le32_to_cpu(st.master_subvol));
217	if (ret)
218		return ret;
219
220	root_hash_info = bch2_hash_info_init(c, &root_inode);
221
222	ret = lookup_dirent_in_snapshot(trans, root_hash_info, root_inum,
223			      &lostfound_str, &inum, &d_type, snapshot);
224	if (bch2_err_matches(ret, ENOENT))
225		goto create_lostfound;
226
227	bch_err_fn(c, ret);
228	if (ret)
229		return ret;
230
231	if (d_type != DT_DIR) {
232		bch_err(c, "error looking up lost+found: not a directory");
233		return -BCH_ERR_ENOENT_not_directory;
234	}
235
236	/*
237	 * The bch2_check_dirents pass has already run, dangling dirents
238	 * shouldn't exist here:
239	 */
240	ret = lookup_inode(trans, inum, lostfound, &snapshot);
241	bch_err_msg(c, ret, "looking up lost+found %llu:%u in (root inode %llu, snapshot root %u)",
242		    inum, snapshot, root_inum.inum, bch2_snapshot_root(c, snapshot));
243	return ret;
244
245create_lostfound:
246	/*
247	 * XXX: we could have a nicer log message here  if we had a nice way to
248	 * walk backpointers to print a path
249	 */
250	bch_notice(c, "creating lost+found in snapshot %u", le32_to_cpu(st.root_snapshot));
251
252	u64 now = bch2_current_time(c);
253	struct btree_iter lostfound_iter = { NULL };
254	u64 cpu = raw_smp_processor_id();
255
256	bch2_inode_init_early(c, lostfound);
257	bch2_inode_init_late(lostfound, now, 0, 0, S_IFDIR|0700, 0, &root_inode);
258	lostfound->bi_dir = root_inode.bi_inum;
259
260	root_inode.bi_nlink++;
261
262	ret = bch2_inode_create(trans, &lostfound_iter, lostfound, snapshot, cpu);
263	if (ret)
264		goto err;
265
266	bch2_btree_iter_set_snapshot(&lostfound_iter, snapshot);
267	ret = bch2_btree_iter_traverse(&lostfound_iter);
268	if (ret)
269		goto err;
270
271	ret =   bch2_dirent_create_snapshot(trans,
272				0, root_inode.bi_inum, snapshot, &root_hash_info,
273				mode_to_type(lostfound->bi_mode),
274				&lostfound_str,
275				lostfound->bi_inum,
276				&lostfound->bi_dir_offset,
277				BCH_HASH_SET_MUST_CREATE) ?:
278		bch2_inode_write_flags(trans, &lostfound_iter, lostfound,
279				       BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
280err:
281	bch_err_msg(c, ret, "creating lost+found");
282	bch2_trans_iter_exit(trans, &lostfound_iter);
283	return ret;
284}
285
286static int reattach_inode(struct btree_trans *trans,
287			  struct bch_inode_unpacked *inode,
288			  u32 inode_snapshot)
289{
290	struct bch_hash_info dir_hash;
291	struct bch_inode_unpacked lostfound;
292	char name_buf[20];
293	struct qstr name;
294	u64 dir_offset = 0;
295	u32 dirent_snapshot = inode_snapshot;
296	int ret;
297
298	if (inode->bi_subvol) {
299		inode->bi_parent_subvol = BCACHEFS_ROOT_SUBVOL;
300
301		u64 root_inum;
302		ret = subvol_lookup(trans, inode->bi_parent_subvol,
303				    &dirent_snapshot, &root_inum);
304		if (ret)
305			return ret;
306
307		snprintf(name_buf, sizeof(name_buf), "subvol-%u", inode->bi_subvol);
308	} else {
309		snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum);
310	}
311
312	ret = lookup_lostfound(trans, dirent_snapshot, &lostfound, inode->bi_inum);
313	if (ret)
314		return ret;
315
316	if (S_ISDIR(inode->bi_mode)) {
317		lostfound.bi_nlink++;
318
319		ret = __bch2_fsck_write_inode(trans, &lostfound, U32_MAX);
320		if (ret)
321			return ret;
322	}
323
324	dir_hash = bch2_hash_info_init(trans->c, &lostfound);
325
326	name = (struct qstr) QSTR(name_buf);
327
328	ret = bch2_dirent_create_snapshot(trans,
329				inode->bi_parent_subvol, lostfound.bi_inum,
330				dirent_snapshot,
331				&dir_hash,
332				inode_d_type(inode),
333				&name,
334				inode->bi_subvol ?: inode->bi_inum,
335				&dir_offset,
336				BCH_HASH_SET_MUST_CREATE);
337	if (ret)
338		return ret;
339
340	inode->bi_dir		= lostfound.bi_inum;
341	inode->bi_dir_offset	= dir_offset;
342
343	return __bch2_fsck_write_inode(trans, inode, inode_snapshot);
344}
345
346static int remove_backpointer(struct btree_trans *trans,
347			      struct bch_inode_unpacked *inode)
348{
349	struct btree_iter iter;
350	struct bkey_s_c_dirent d;
351	int ret;
352
353	d = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_dirents,
354				     POS(inode->bi_dir, inode->bi_dir_offset), 0,
355				     dirent);
356	ret =   bkey_err(d) ?:
357		__remove_dirent(trans, d.k->p);
358	bch2_trans_iter_exit(trans, &iter);
359	return ret;
360}
361
362static int reattach_subvol(struct btree_trans *trans, struct bkey_s_c_subvolume s)
363{
364	struct bch_fs *c = trans->c;
365
366	struct bch_inode_unpacked inode;
367	int ret = bch2_inode_find_by_inum_trans(trans,
368				(subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
369				&inode);
370	if (ret)
371		return ret;
372
373	ret = remove_backpointer(trans, &inode);
374	bch_err_msg(c, ret, "removing dirent");
375	if (ret)
376		return ret;
377
378	ret = reattach_inode(trans, &inode, le32_to_cpu(s.v->snapshot));
379	bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
380	return ret;
381}
382
383static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 subvolid, u64 inum)
384{
385	struct bch_fs *c = trans->c;
386
387	if (!bch2_snapshot_is_leaf(c, snapshotid)) {
388		bch_err(c, "need to reconstruct subvol, but have interior node snapshot");
389		return -BCH_ERR_fsck_repair_unimplemented;
390	}
391
392	/*
393	 * If inum isn't set, that means we're being called from check_dirents,
394	 * not check_inodes - the root of this subvolume doesn't exist or we
395	 * would have found it there:
396	 */
397	if (!inum) {
398		struct btree_iter inode_iter = {};
399		struct bch_inode_unpacked new_inode;
400		u64 cpu = raw_smp_processor_id();
401
402		bch2_inode_init_early(c, &new_inode);
403		bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, S_IFDIR|0755, 0, NULL);
404
405		new_inode.bi_subvol = subvolid;
406
407		int ret = bch2_inode_create(trans, &inode_iter, &new_inode, snapshotid, cpu) ?:
408			  bch2_btree_iter_traverse(&inode_iter) ?:
409			  bch2_inode_write(trans, &inode_iter, &new_inode);
410		bch2_trans_iter_exit(trans, &inode_iter);
411		if (ret)
412			return ret;
413
414		inum = new_inode.bi_inum;
415	}
416
417	bch_info(c, "reconstructing subvol %u with root inode %llu", subvolid, inum);
418
419	struct bkey_i_subvolume *new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol));
420	int ret = PTR_ERR_OR_ZERO(new_subvol);
421	if (ret)
422		return ret;
423
424	bkey_subvolume_init(&new_subvol->k_i);
425	new_subvol->k.p.offset	= subvolid;
426	new_subvol->v.snapshot	= cpu_to_le32(snapshotid);
427	new_subvol->v.inode	= cpu_to_le64(inum);
428	ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &new_subvol->k_i, 0);
429	if (ret)
430		return ret;
431
432	struct btree_iter iter;
433	struct bkey_i_snapshot *s = bch2_bkey_get_mut_typed(trans, &iter,
434			BTREE_ID_snapshots, POS(0, snapshotid),
435			0, snapshot);
436	ret = PTR_ERR_OR_ZERO(s);
437	bch_err_msg(c, ret, "getting snapshot %u", snapshotid);
438	if (ret)
439		return ret;
440
441	u32 snapshot_tree = le32_to_cpu(s->v.tree);
442
443	s->v.subvol = cpu_to_le32(subvolid);
444	SET_BCH_SNAPSHOT_SUBVOL(&s->v, true);
445	bch2_trans_iter_exit(trans, &iter);
446
447	struct bkey_i_snapshot_tree *st = bch2_bkey_get_mut_typed(trans, &iter,
448			BTREE_ID_snapshot_trees, POS(0, snapshot_tree),
449			0, snapshot_tree);
450	ret = PTR_ERR_OR_ZERO(st);
451	bch_err_msg(c, ret, "getting snapshot tree %u", snapshot_tree);
452	if (ret)
453		return ret;
454
455	if (!st->v.master_subvol)
456		st->v.master_subvol = cpu_to_le32(subvolid);
457
458	bch2_trans_iter_exit(trans, &iter);
459	return 0;
460}
461
462static int reconstruct_inode(struct btree_trans *trans, u32 snapshot, u64 inum, u64 size, unsigned mode)
463{
464	struct bch_fs *c = trans->c;
465	struct bch_inode_unpacked new_inode;
466
467	bch2_inode_init_early(c, &new_inode);
468	bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, mode|0755, 0, NULL);
469	new_inode.bi_size = size;
470	new_inode.bi_inum = inum;
471
472	return __bch2_fsck_write_inode(trans, &new_inode, snapshot);
473}
474
475static int reconstruct_reg_inode(struct btree_trans *trans, u32 snapshot, u64 inum)
476{
477	struct btree_iter iter = {};
478
479	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0);
480	struct bkey_s_c k = bch2_btree_iter_peek_prev(&iter);
481	bch2_trans_iter_exit(trans, &iter);
482	int ret = bkey_err(k);
483	if (ret)
484		return ret;
485
486	return reconstruct_inode(trans, snapshot, inum, k.k->p.offset << 9, S_IFREG);
487}
488
489struct snapshots_seen_entry {
490	u32				id;
491	u32				equiv;
492};
493
494struct snapshots_seen {
495	struct bpos			pos;
496	DARRAY(struct snapshots_seen_entry) ids;
497};
498
499static inline void snapshots_seen_exit(struct snapshots_seen *s)
500{
501	darray_exit(&s->ids);
502}
503
504static inline void snapshots_seen_init(struct snapshots_seen *s)
505{
506	memset(s, 0, sizeof(*s));
507}
508
509static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s, u32 id)
510{
511	struct snapshots_seen_entry *i, n = {
512		.id	= id,
513		.equiv	= bch2_snapshot_equiv(c, id),
514	};
515	int ret = 0;
516
517	__darray_for_each(s->ids, i) {
518		if (i->id == id)
519			return 0;
520		if (i->id > id)
521			break;
522	}
523
524	ret = darray_insert_item(&s->ids, i - s->ids.data, n);
525	if (ret)
526		bch_err(c, "error reallocating snapshots_seen table (size %zu)",
527			s->ids.size);
528	return ret;
529}
530
531static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s,
532				 enum btree_id btree_id, struct bpos pos)
533{
534	struct snapshots_seen_entry n = {
535		.id	= pos.snapshot,
536		.equiv	= bch2_snapshot_equiv(c, pos.snapshot),
537	};
538	int ret = 0;
539
540	if (!bkey_eq(s->pos, pos))
541		s->ids.nr = 0;
542
543	s->pos = pos;
544	s->pos.snapshot = n.equiv;
545
546	darray_for_each(s->ids, i) {
547		if (i->id == n.id)
548			return 0;
549
550		/*
551		 * We currently don't rigorously track for snapshot cleanup
552		 * needing to be run, so it shouldn't be a fsck error yet:
553		 */
554		if (i->equiv == n.equiv) {
555			bch_err(c, "snapshot deletion did not finish:\n"
556				"  duplicate keys in btree %s at %llu:%llu snapshots %u, %u (equiv %u)\n",
557				bch2_btree_id_str(btree_id),
558				pos.inode, pos.offset,
559				i->id, n.id, n.equiv);
560			set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags);
561			return bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_delete_dead_snapshots);
562		}
563	}
564
565	ret = darray_push(&s->ids, n);
566	if (ret)
567		bch_err(c, "error reallocating snapshots_seen table (size %zu)",
568			s->ids.size);
569	return ret;
570}
571
572/**
573 * key_visible_in_snapshot - returns true if @id is a descendent of @ancestor,
574 * and @ancestor hasn't been overwritten in @seen
575 *
576 * @c:		filesystem handle
577 * @seen:	list of snapshot ids already seen at current position
578 * @id:		descendent snapshot id
579 * @ancestor:	ancestor snapshot id
580 *
581 * Returns:	whether key in @ancestor snapshot is visible in @id snapshot
582 */
583static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen,
584				    u32 id, u32 ancestor)
585{
586	ssize_t i;
587
588	EBUG_ON(id > ancestor);
589	EBUG_ON(!bch2_snapshot_is_equiv(c, id));
590	EBUG_ON(!bch2_snapshot_is_equiv(c, ancestor));
591
592	/* @ancestor should be the snapshot most recently added to @seen */
593	EBUG_ON(ancestor != seen->pos.snapshot);
594	EBUG_ON(ancestor != seen->ids.data[seen->ids.nr - 1].equiv);
595
596	if (id == ancestor)
597		return true;
598
599	if (!bch2_snapshot_is_ancestor(c, id, ancestor))
600		return false;
601
602	/*
603	 * We know that @id is a descendant of @ancestor, we're checking if
604	 * we've seen a key that overwrote @ancestor - i.e. also a descendent of
605	 * @ascestor and with @id as a descendent.
606	 *
607	 * But we already know that we're scanning IDs between @id and @ancestor
608	 * numerically, since snapshot ID lists are kept sorted, so if we find
609	 * an id that's an ancestor of @id we're done:
610	 */
611
612	for (i = seen->ids.nr - 2;
613	     i >= 0 && seen->ids.data[i].equiv >= id;
614	     --i)
615		if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i].equiv))
616			return false;
617
618	return true;
619}
620
621/**
622 * ref_visible - given a key with snapshot id @src that points to a key with
623 * snapshot id @dst, test whether there is some snapshot in which @dst is
624 * visible.
625 *
626 * @c:		filesystem handle
627 * @s:		list of snapshot IDs already seen at @src
628 * @src:	snapshot ID of src key
629 * @dst:	snapshot ID of dst key
630 * Returns:	true if there is some snapshot in which @dst is visible
631 *
632 * Assumes we're visiting @src keys in natural key order
633 */
634static bool ref_visible(struct bch_fs *c, struct snapshots_seen *s,
635			u32 src, u32 dst)
636{
637	return dst <= src
638		? key_visible_in_snapshot(c, s, dst, src)
639		: bch2_snapshot_is_ancestor(c, src, dst);
640}
641
642static int ref_visible2(struct bch_fs *c,
643			u32 src, struct snapshots_seen *src_seen,
644			u32 dst, struct snapshots_seen *dst_seen)
645{
646	src = bch2_snapshot_equiv(c, src);
647	dst = bch2_snapshot_equiv(c, dst);
648
649	if (dst > src) {
650		swap(dst, src);
651		swap(dst_seen, src_seen);
652	}
653	return key_visible_in_snapshot(c, src_seen, dst, src);
654}
655
656#define for_each_visible_inode(_c, _s, _w, _snapshot, _i)				\
657	for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr &&	\
658	     (_i)->snapshot <= (_snapshot); _i++)					\
659		if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot))
660
661struct inode_walker_entry {
662	struct bch_inode_unpacked inode;
663	u32			snapshot;
664	bool			seen_this_pos;
665	u64			count;
666};
667
668struct inode_walker {
669	bool				first_this_inode;
670	bool				recalculate_sums;
671	struct bpos			last_pos;
672
673	DARRAY(struct inode_walker_entry) inodes;
674};
675
676static void inode_walker_exit(struct inode_walker *w)
677{
678	darray_exit(&w->inodes);
679}
680
681static struct inode_walker inode_walker_init(void)
682{
683	return (struct inode_walker) { 0, };
684}
685
686static int add_inode(struct bch_fs *c, struct inode_walker *w,
687		     struct bkey_s_c inode)
688{
689	struct bch_inode_unpacked u;
690
691	BUG_ON(bch2_inode_unpack(inode, &u));
692
693	return darray_push(&w->inodes, ((struct inode_walker_entry) {
694		.inode		= u,
695		.snapshot	= bch2_snapshot_equiv(c, inode.k->p.snapshot),
696	}));
697}
698
699static int get_inodes_all_snapshots(struct btree_trans *trans,
700				    struct inode_walker *w, u64 inum)
701{
702	struct bch_fs *c = trans->c;
703	struct btree_iter iter;
704	struct bkey_s_c k;
705	int ret;
706
707	w->recalculate_sums = false;
708	w->inodes.nr = 0;
709
710	for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum),
711				     BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
712		if (k.k->p.offset != inum)
713			break;
714
715		if (bkey_is_inode(k.k))
716			add_inode(c, w, k);
717	}
718	bch2_trans_iter_exit(trans, &iter);
719
720	if (ret)
721		return ret;
722
723	w->first_this_inode = true;
724	return 0;
725}
726
727static struct inode_walker_entry *
728lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, struct bkey_s_c k)
729{
730	bool is_whiteout = k.k->type == KEY_TYPE_whiteout;
731	u32 snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot);
732
733	struct inode_walker_entry *i;
734	__darray_for_each(w->inodes, i)
735		if (bch2_snapshot_is_ancestor(c, snapshot, i->snapshot))
736			goto found;
737
738	return NULL;
739found:
740	BUG_ON(snapshot > i->snapshot);
741
742	if (snapshot != i->snapshot && !is_whiteout) {
743		struct inode_walker_entry new = *i;
744
745		new.snapshot = snapshot;
746		new.count = 0;
747
748		struct printbuf buf = PRINTBUF;
749		bch2_bkey_val_to_text(&buf, c, k);
750
751		bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u\n"
752			 "unexpected because we should always update the inode when we update a key in that inode\n"
753			 "%s",
754			 w->last_pos.inode, snapshot, i->snapshot, buf.buf);
755		printbuf_exit(&buf);
756
757		while (i > w->inodes.data && i[-1].snapshot > snapshot)
758			--i;
759
760		size_t pos = i - w->inodes.data;
761		int ret = darray_insert_item(&w->inodes, pos, new);
762		if (ret)
763			return ERR_PTR(ret);
764
765		i = w->inodes.data + pos;
766	}
767
768	return i;
769}
770
771static struct inode_walker_entry *walk_inode(struct btree_trans *trans,
772					     struct inode_walker *w,
773					     struct bkey_s_c k)
774{
775	if (w->last_pos.inode != k.k->p.inode) {
776		int ret = get_inodes_all_snapshots(trans, w, k.k->p.inode);
777		if (ret)
778			return ERR_PTR(ret);
779	} else if (bkey_cmp(w->last_pos, k.k->p)) {
780		darray_for_each(w->inodes, i)
781			i->seen_this_pos = false;
782	}
783
784	w->last_pos = k.k->p;
785
786	return lookup_inode_for_snapshot(trans->c, w, k);
787}
788
789static int __get_visible_inodes(struct btree_trans *trans,
790				struct inode_walker *w,
791				struct snapshots_seen *s,
792				u64 inum)
793{
794	struct bch_fs *c = trans->c;
795	struct btree_iter iter;
796	struct bkey_s_c k;
797	int ret;
798
799	w->inodes.nr = 0;
800
801	for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum),
802			   BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
803		u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot);
804
805		if (k.k->p.offset != inum)
806			break;
807
808		if (!ref_visible(c, s, s->pos.snapshot, equiv))
809			continue;
810
811		if (bkey_is_inode(k.k))
812			add_inode(c, w, k);
813
814		if (equiv >= s->pos.snapshot)
815			break;
816	}
817	bch2_trans_iter_exit(trans, &iter);
818
819	return ret;
820}
821
822static int check_key_has_snapshot(struct btree_trans *trans,
823				  struct btree_iter *iter,
824				  struct bkey_s_c k)
825{
826	struct bch_fs *c = trans->c;
827	struct printbuf buf = PRINTBUF;
828	int ret = 0;
829
830	if (mustfix_fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot), c,
831				bkey_in_missing_snapshot,
832				"key in missing snapshot: %s",
833				(bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
834		ret = bch2_btree_delete_at(trans, iter,
835					    BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: 1;
836fsck_err:
837	printbuf_exit(&buf);
838	return ret;
839}
840
841static int hash_redo_key(struct btree_trans *trans,
842			 const struct bch_hash_desc desc,
843			 struct bch_hash_info *hash_info,
844			 struct btree_iter *k_iter, struct bkey_s_c k)
845{
846	struct bkey_i *delete;
847	struct bkey_i *tmp;
848
849	delete = bch2_trans_kmalloc(trans, sizeof(*delete));
850	if (IS_ERR(delete))
851		return PTR_ERR(delete);
852
853	tmp = bch2_bkey_make_mut_noupdate(trans, k);
854	if (IS_ERR(tmp))
855		return PTR_ERR(tmp);
856
857	bkey_init(&delete->k);
858	delete->k.p = k_iter->pos;
859	return  bch2_btree_iter_traverse(k_iter) ?:
860		bch2_trans_update(trans, k_iter, delete, 0) ?:
861		bch2_hash_set_in_snapshot(trans, desc, hash_info,
862				       (subvol_inum) { 0, k.k->p.inode },
863				       k.k->p.snapshot, tmp,
864				       BCH_HASH_SET_MUST_CREATE,
865				       BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
866		bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
867}
868
869static int hash_check_key(struct btree_trans *trans,
870			  const struct bch_hash_desc desc,
871			  struct bch_hash_info *hash_info,
872			  struct btree_iter *k_iter, struct bkey_s_c hash_k)
873{
874	struct bch_fs *c = trans->c;
875	struct btree_iter iter = { NULL };
876	struct printbuf buf = PRINTBUF;
877	struct bkey_s_c k;
878	u64 hash;
879	int ret = 0;
880
881	if (hash_k.k->type != desc.key_type)
882		return 0;
883
884	hash = desc.hash_bkey(hash_info, hash_k);
885
886	if (likely(hash == hash_k.k->p.offset))
887		return 0;
888
889	if (hash_k.k->p.offset < hash)
890		goto bad_hash;
891
892	for_each_btree_key_norestart(trans, iter, desc.btree_id,
893				     SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot),
894				     BTREE_ITER_SLOTS, k, ret) {
895		if (bkey_eq(k.k->p, hash_k.k->p))
896			break;
897
898		if (fsck_err_on(k.k->type == desc.key_type &&
899				!desc.cmp_bkey(k, hash_k), c,
900				hash_table_key_duplicate,
901				"duplicate hash table keys:\n%s",
902				(printbuf_reset(&buf),
903				 bch2_bkey_val_to_text(&buf, c, hash_k),
904				 buf.buf))) {
905			ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0) ?: 1;
906			break;
907		}
908
909		if (bkey_deleted(k.k)) {
910			bch2_trans_iter_exit(trans, &iter);
911			goto bad_hash;
912		}
913	}
914out:
915	bch2_trans_iter_exit(trans, &iter);
916	printbuf_exit(&buf);
917	return ret;
918bad_hash:
919	if (fsck_err(c, hash_table_key_wrong_offset,
920		     "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s",
921		     bch2_btree_id_str(desc.btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash,
922		     (printbuf_reset(&buf),
923		      bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) {
924		ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k);
925		bch_err_fn(c, ret);
926		if (ret)
927			return ret;
928		ret = -BCH_ERR_transaction_restart_nested;
929	}
930fsck_err:
931	goto out;
932}
933
934static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans,
935						struct btree_iter *iter,
936						struct bpos pos)
937{
938	return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent);
939}
940
941static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans,
942					       struct btree_iter *iter,
943					       struct bch_inode_unpacked *inode,
944					       u32 *snapshot)
945{
946	if (inode->bi_subvol) {
947		u64 inum;
948		int ret = subvol_lookup(trans, inode->bi_parent_subvol, snapshot, &inum);
949		if (ret)
950			return ((struct bkey_s_c_dirent) { .k = ERR_PTR(ret) });
951	}
952
953	return dirent_get_by_pos(trans, iter, SPOS(inode->bi_dir, inode->bi_dir_offset, *snapshot));
954}
955
956static bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
957				   struct bkey_s_c_dirent d)
958{
959	return  inode->bi_dir		== d.k->p.inode &&
960		inode->bi_dir_offset	== d.k->p.offset;
961}
962
963static bool dirent_points_to_inode(struct bkey_s_c_dirent d,
964				   struct bch_inode_unpacked *inode)
965{
966	return d.v->d_type == DT_SUBVOL
967		? le32_to_cpu(d.v->d_child_subvol)	== inode->bi_subvol
968		: le64_to_cpu(d.v->d_inum)		== inode->bi_inum;
969}
970
971static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p)
972{
973	struct btree_iter iter;
974	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0);
975	int ret = bkey_err(k) ?: k.k->type == KEY_TYPE_set;
976	bch2_trans_iter_exit(trans, &iter);
977	return ret;
978}
979
980static int check_inode_dirent_inode(struct btree_trans *trans, struct bkey_s_c inode_k,
981				    struct bch_inode_unpacked *inode,
982				    u32 inode_snapshot, bool *write_inode)
983{
984	struct bch_fs *c = trans->c;
985	struct printbuf buf = PRINTBUF;
986
987	struct btree_iter dirent_iter = {};
988	struct bkey_s_c_dirent d = inode_get_dirent(trans, &dirent_iter, inode, &inode_snapshot);
989	int ret = bkey_err(d);
990	if (ret && !bch2_err_matches(ret, ENOENT))
991		return ret;
992
993	if (fsck_err_on(ret,
994			c, inode_points_to_missing_dirent,
995			"inode points to missing dirent\n%s",
996			(bch2_bkey_val_to_text(&buf, c, inode_k), buf.buf)) ||
997	    fsck_err_on(!ret && !dirent_points_to_inode(d, inode),
998			c, inode_points_to_wrong_dirent,
999			"inode points to dirent that does not point back:\n%s",
1000			(bch2_bkey_val_to_text(&buf, c, inode_k),
1001			 prt_newline(&buf),
1002			 bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
1003		/*
1004		 * We just clear the backpointer fields for now. If we find a
1005		 * dirent that points to this inode in check_dirents(), we'll
1006		 * update it then; then when we get to check_path() if the
1007		 * backpointer is still 0 we'll reattach it.
1008		 */
1009		inode->bi_dir = 0;
1010		inode->bi_dir_offset = 0;
1011		inode->bi_flags &= ~BCH_INODE_backptr_untrusted;
1012		*write_inode = true;
1013	}
1014
1015	ret = 0;
1016fsck_err:
1017	bch2_trans_iter_exit(trans, &dirent_iter);
1018	printbuf_exit(&buf);
1019	bch_err_fn(c, ret);
1020	return ret;
1021}
1022
1023static int check_inode(struct btree_trans *trans,
1024		       struct btree_iter *iter,
1025		       struct bkey_s_c k,
1026		       struct bch_inode_unpacked *prev,
1027		       struct snapshots_seen *s,
1028		       bool full)
1029{
1030	struct bch_fs *c = trans->c;
1031	struct bch_inode_unpacked u;
1032	bool do_update = false;
1033	int ret;
1034
1035	ret = check_key_has_snapshot(trans, iter, k);
1036	if (ret < 0)
1037		goto err;
1038	if (ret)
1039		return 0;
1040
1041	ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
1042	if (ret)
1043		goto err;
1044
1045	if (!bkey_is_inode(k.k))
1046		return 0;
1047
1048	BUG_ON(bch2_inode_unpack(k, &u));
1049
1050	if (!full &&
1051	    !(u.bi_flags & (BCH_INODE_i_size_dirty|
1052			    BCH_INODE_i_sectors_dirty|
1053			    BCH_INODE_unlinked)))
1054		return 0;
1055
1056	if (prev->bi_inum != u.bi_inum)
1057		*prev = u;
1058
1059	if (fsck_err_on(prev->bi_hash_seed	!= u.bi_hash_seed ||
1060			inode_d_type(prev)	!= inode_d_type(&u),
1061			c, inode_snapshot_mismatch,
1062			"inodes in different snapshots don't match")) {
1063		bch_err(c, "repair not implemented yet");
1064		return -BCH_ERR_fsck_repair_unimplemented;
1065	}
1066
1067	if ((u.bi_flags & (BCH_INODE_i_size_dirty|BCH_INODE_unlinked)) &&
1068	    bch2_key_has_snapshot_overwrites(trans, BTREE_ID_inodes, k.k->p)) {
1069		struct bpos new_min_pos;
1070
1071		ret = bch2_propagate_key_to_snapshot_leaves(trans, iter->btree_id, k, &new_min_pos);
1072		if (ret)
1073			goto err;
1074
1075		u.bi_flags &= ~BCH_INODE_i_size_dirty|BCH_INODE_unlinked;
1076
1077		ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot);
1078
1079		bch_err_msg(c, ret, "in fsck updating inode");
1080		if (ret)
1081			return ret;
1082
1083		if (!bpos_eq(new_min_pos, POS_MIN))
1084			bch2_btree_iter_set_pos(iter, bpos_predecessor(new_min_pos));
1085		return 0;
1086	}
1087
1088	if (u.bi_flags & BCH_INODE_unlinked) {
1089		ret = check_inode_deleted_list(trans, k.k->p);
1090		if (ret < 0)
1091			return ret;
1092
1093		fsck_err_on(!ret, c, unlinked_inode_not_on_deleted_list,
1094			    "inode %llu:%u unlinked, but not on deleted list",
1095			    u.bi_inum, k.k->p.snapshot);
1096		ret = 0;
1097	}
1098
1099	if (u.bi_flags & BCH_INODE_unlinked &&
1100	    (!c->sb.clean ||
1101	     fsck_err(c, inode_unlinked_but_clean,
1102		      "filesystem marked clean, but inode %llu unlinked",
1103		      u.bi_inum))) {
1104		ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot);
1105		bch_err_msg(c, ret, "in fsck deleting inode");
1106		return ret;
1107	}
1108
1109	if (u.bi_flags & BCH_INODE_i_size_dirty &&
1110	    (!c->sb.clean ||
1111	     fsck_err(c, inode_i_size_dirty_but_clean,
1112		      "filesystem marked clean, but inode %llu has i_size dirty",
1113		      u.bi_inum))) {
1114		bch_verbose(c, "truncating inode %llu", u.bi_inum);
1115
1116		/*
1117		 * XXX: need to truncate partial blocks too here - or ideally
1118		 * just switch units to bytes and that issue goes away
1119		 */
1120		ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
1121				SPOS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9,
1122				     iter->pos.snapshot),
1123				POS(u.bi_inum, U64_MAX),
1124				0, NULL);
1125		bch_err_msg(c, ret, "in fsck truncating inode");
1126		if (ret)
1127			return ret;
1128
1129		/*
1130		 * We truncated without our normal sector accounting hook, just
1131		 * make sure we recalculate it:
1132		 */
1133		u.bi_flags |= BCH_INODE_i_sectors_dirty;
1134
1135		u.bi_flags &= ~BCH_INODE_i_size_dirty;
1136		do_update = true;
1137	}
1138
1139	if (u.bi_flags & BCH_INODE_i_sectors_dirty &&
1140	    (!c->sb.clean ||
1141	     fsck_err(c, inode_i_sectors_dirty_but_clean,
1142		      "filesystem marked clean, but inode %llu has i_sectors dirty",
1143		      u.bi_inum))) {
1144		s64 sectors;
1145
1146		bch_verbose(c, "recounting sectors for inode %llu",
1147			    u.bi_inum);
1148
1149		sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot);
1150		if (sectors < 0) {
1151			bch_err_msg(c, sectors, "in fsck recounting inode sectors");
1152			return sectors;
1153		}
1154
1155		u.bi_sectors = sectors;
1156		u.bi_flags &= ~BCH_INODE_i_sectors_dirty;
1157		do_update = true;
1158	}
1159
1160	if (u.bi_flags & BCH_INODE_backptr_untrusted) {
1161		u.bi_dir = 0;
1162		u.bi_dir_offset = 0;
1163		u.bi_flags &= ~BCH_INODE_backptr_untrusted;
1164		do_update = true;
1165	}
1166
1167	if (u.bi_dir || u.bi_dir_offset) {
1168		ret = check_inode_dirent_inode(trans, k, &u, k.k->p.snapshot, &do_update);
1169		if (ret)
1170			goto err;
1171	}
1172
1173	if (fsck_err_on(u.bi_parent_subvol &&
1174			(u.bi_subvol == 0 ||
1175			 u.bi_subvol == BCACHEFS_ROOT_SUBVOL),
1176			c, inode_bi_parent_nonzero,
1177			"inode %llu:%u has subvol %u but nonzero parent subvol %u",
1178			u.bi_inum, k.k->p.snapshot, u.bi_subvol, u.bi_parent_subvol)) {
1179		u.bi_parent_subvol = 0;
1180		do_update = true;
1181	}
1182
1183	if (u.bi_subvol) {
1184		struct bch_subvolume s;
1185
1186		ret = bch2_subvolume_get(trans, u.bi_subvol, false, 0, &s);
1187		if (ret && !bch2_err_matches(ret, ENOENT))
1188			goto err;
1189
1190		if (ret && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) {
1191			ret = reconstruct_subvol(trans, k.k->p.snapshot, u.bi_subvol, u.bi_inum);
1192			goto do_update;
1193		}
1194
1195		if (fsck_err_on(ret,
1196				c, inode_bi_subvol_missing,
1197				"inode %llu:%u bi_subvol points to missing subvolume %u",
1198				u.bi_inum, k.k->p.snapshot, u.bi_subvol) ||
1199		    fsck_err_on(le64_to_cpu(s.inode) != u.bi_inum ||
1200				!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.snapshot),
1201							   k.k->p.snapshot),
1202				c, inode_bi_subvol_wrong,
1203				"inode %llu:%u points to subvol %u, but subvol points to %llu:%u",
1204				u.bi_inum, k.k->p.snapshot, u.bi_subvol,
1205				le64_to_cpu(s.inode),
1206				le32_to_cpu(s.snapshot))) {
1207			u.bi_subvol = 0;
1208			u.bi_parent_subvol = 0;
1209			do_update = true;
1210		}
1211	}
1212do_update:
1213	if (do_update) {
1214		ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot);
1215		bch_err_msg(c, ret, "in fsck updating inode");
1216		if (ret)
1217			return ret;
1218	}
1219err:
1220fsck_err:
1221	bch_err_fn(c, ret);
1222	return ret;
1223}
1224
1225int bch2_check_inodes(struct bch_fs *c)
1226{
1227	bool full = c->opts.fsck;
1228	struct bch_inode_unpacked prev = { 0 };
1229	struct snapshots_seen s;
1230
1231	snapshots_seen_init(&s);
1232
1233	int ret = bch2_trans_run(c,
1234		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
1235				POS_MIN,
1236				BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
1237				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
1238			check_inode(trans, &iter, k, &prev, &s, full)));
1239
1240	snapshots_seen_exit(&s);
1241	bch_err_fn(c, ret);
1242	return ret;
1243}
1244
1245static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_walker *w)
1246{
1247	struct bch_fs *c = trans->c;
1248	int ret = 0;
1249	s64 count2;
1250
1251	darray_for_each(w->inodes, i) {
1252		if (i->inode.bi_sectors == i->count)
1253			continue;
1254
1255		count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->snapshot);
1256
1257		if (w->recalculate_sums)
1258			i->count = count2;
1259
1260		if (i->count != count2) {
1261			bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu",
1262					    w->last_pos.inode, i->snapshot, i->count, count2);
1263			return -BCH_ERR_internal_fsck_err;
1264		}
1265
1266		if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty),
1267				c, inode_i_sectors_wrong,
1268				"inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
1269				w->last_pos.inode, i->snapshot,
1270				i->inode.bi_sectors, i->count)) {
1271			i->inode.bi_sectors = i->count;
1272			ret = bch2_fsck_write_inode(trans, &i->inode, i->snapshot);
1273			if (ret)
1274				break;
1275		}
1276	}
1277fsck_err:
1278	bch_err_fn(c, ret);
1279	return ret;
1280}
1281
1282static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
1283{
1284	u32 restart_count = trans->restart_count;
1285	return check_i_sectors_notnested(trans, w) ?:
1286		trans_was_restarted(trans, restart_count);
1287}
1288
1289struct extent_end {
1290	u32			snapshot;
1291	u64			offset;
1292	struct snapshots_seen	seen;
1293};
1294
1295struct extent_ends {
1296	struct bpos			last_pos;
1297	DARRAY(struct extent_end)	e;
1298};
1299
1300static void extent_ends_reset(struct extent_ends *extent_ends)
1301{
1302	darray_for_each(extent_ends->e, i)
1303		snapshots_seen_exit(&i->seen);
1304	extent_ends->e.nr = 0;
1305}
1306
1307static void extent_ends_exit(struct extent_ends *extent_ends)
1308{
1309	extent_ends_reset(extent_ends);
1310	darray_exit(&extent_ends->e);
1311}
1312
1313static void extent_ends_init(struct extent_ends *extent_ends)
1314{
1315	memset(extent_ends, 0, sizeof(*extent_ends));
1316}
1317
1318static int extent_ends_at(struct bch_fs *c,
1319			  struct extent_ends *extent_ends,
1320			  struct snapshots_seen *seen,
1321			  struct bkey_s_c k)
1322{
1323	struct extent_end *i, n = (struct extent_end) {
1324		.offset		= k.k->p.offset,
1325		.snapshot	= k.k->p.snapshot,
1326		.seen		= *seen,
1327	};
1328
1329	n.seen.ids.data = kmemdup(seen->ids.data,
1330			      sizeof(seen->ids.data[0]) * seen->ids.size,
1331			      GFP_KERNEL);
1332	if (!n.seen.ids.data)
1333		return -BCH_ERR_ENOMEM_fsck_extent_ends_at;
1334
1335	__darray_for_each(extent_ends->e, i) {
1336		if (i->snapshot == k.k->p.snapshot) {
1337			snapshots_seen_exit(&i->seen);
1338			*i = n;
1339			return 0;
1340		}
1341
1342		if (i->snapshot >= k.k->p.snapshot)
1343			break;
1344	}
1345
1346	return darray_insert_item(&extent_ends->e, i - extent_ends->e.data, n);
1347}
1348
1349static int overlapping_extents_found(struct btree_trans *trans,
1350				     enum btree_id btree,
1351				     struct bpos pos1, struct snapshots_seen *pos1_seen,
1352				     struct bkey pos2,
1353				     bool *fixed,
1354				     struct extent_end *extent_end)
1355{
1356	struct bch_fs *c = trans->c;
1357	struct printbuf buf = PRINTBUF;
1358	struct btree_iter iter1, iter2 = { NULL };
1359	struct bkey_s_c k1, k2;
1360	int ret;
1361
1362	BUG_ON(bkey_le(pos1, bkey_start_pos(&pos2)));
1363
1364	bch2_trans_iter_init(trans, &iter1, btree, pos1,
1365			     BTREE_ITER_ALL_SNAPSHOTS|
1366			     BTREE_ITER_NOT_EXTENTS);
1367	k1 = bch2_btree_iter_peek_upto(&iter1, POS(pos1.inode, U64_MAX));
1368	ret = bkey_err(k1);
1369	if (ret)
1370		goto err;
1371
1372	prt_str(&buf, "\n  ");
1373	bch2_bkey_val_to_text(&buf, c, k1);
1374
1375	if (!bpos_eq(pos1, k1.k->p)) {
1376		prt_str(&buf, "\n  wanted\n  ");
1377		bch2_bpos_to_text(&buf, pos1);
1378		prt_str(&buf, "\n  ");
1379		bch2_bkey_to_text(&buf, &pos2);
1380
1381		bch_err(c, "%s: error finding first overlapping extent when repairing, got%s",
1382			__func__, buf.buf);
1383		ret = -BCH_ERR_internal_fsck_err;
1384		goto err;
1385	}
1386
1387	bch2_trans_copy_iter(&iter2, &iter1);
1388
1389	while (1) {
1390		bch2_btree_iter_advance(&iter2);
1391
1392		k2 = bch2_btree_iter_peek_upto(&iter2, POS(pos1.inode, U64_MAX));
1393		ret = bkey_err(k2);
1394		if (ret)
1395			goto err;
1396
1397		if (bpos_ge(k2.k->p, pos2.p))
1398			break;
1399	}
1400
1401	prt_str(&buf, "\n  ");
1402	bch2_bkey_val_to_text(&buf, c, k2);
1403
1404	if (bpos_gt(k2.k->p, pos2.p) ||
1405	    pos2.size != k2.k->size) {
1406		bch_err(c, "%s: error finding seconding overlapping extent when repairing%s",
1407			__func__, buf.buf);
1408		ret = -BCH_ERR_internal_fsck_err;
1409		goto err;
1410	}
1411
1412	prt_printf(&buf, "\n  overwriting %s extent",
1413		   pos1.snapshot >= pos2.p.snapshot ? "first" : "second");
1414
1415	if (fsck_err(c, extent_overlapping,
1416		     "overlapping extents%s", buf.buf)) {
1417		struct btree_iter *old_iter = &iter1;
1418		struct disk_reservation res = { 0 };
1419
1420		if (pos1.snapshot < pos2.p.snapshot) {
1421			old_iter = &iter2;
1422			swap(k1, k2);
1423		}
1424
1425		trans->extra_disk_res += bch2_bkey_sectors_compressed(k2);
1426
1427		ret =   bch2_trans_update_extent_overwrite(trans, old_iter,
1428				BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE,
1429				k1, k2) ?:
1430			bch2_trans_commit(trans, &res, NULL, BCH_TRANS_COMMIT_no_enospc);
1431		bch2_disk_reservation_put(c, &res);
1432
1433		if (ret)
1434			goto err;
1435
1436		*fixed = true;
1437
1438		if (pos1.snapshot == pos2.p.snapshot) {
1439			/*
1440			 * We overwrote the first extent, and did the overwrite
1441			 * in the same snapshot:
1442			 */
1443			extent_end->offset = bkey_start_offset(&pos2);
1444		} else if (pos1.snapshot > pos2.p.snapshot) {
1445			/*
1446			 * We overwrote the first extent in pos2's snapshot:
1447			 */
1448			ret = snapshots_seen_add_inorder(c, pos1_seen, pos2.p.snapshot);
1449		} else {
1450			/*
1451			 * We overwrote the second extent - restart
1452			 * check_extent() from the top:
1453			 */
1454			ret = -BCH_ERR_transaction_restart_nested;
1455		}
1456	}
1457fsck_err:
1458err:
1459	bch2_trans_iter_exit(trans, &iter2);
1460	bch2_trans_iter_exit(trans, &iter1);
1461	printbuf_exit(&buf);
1462	return ret;
1463}
1464
1465static int check_overlapping_extents(struct btree_trans *trans,
1466			      struct snapshots_seen *seen,
1467			      struct extent_ends *extent_ends,
1468			      struct bkey_s_c k,
1469			      u32 equiv,
1470			      struct btree_iter *iter,
1471			      bool *fixed)
1472{
1473	struct bch_fs *c = trans->c;
1474	int ret = 0;
1475
1476	/* transaction restart, running again */
1477	if (bpos_eq(extent_ends->last_pos, k.k->p))
1478		return 0;
1479
1480	if (extent_ends->last_pos.inode != k.k->p.inode)
1481		extent_ends_reset(extent_ends);
1482
1483	darray_for_each(extent_ends->e, i) {
1484		if (i->offset <= bkey_start_offset(k.k))
1485			continue;
1486
1487		if (!ref_visible2(c,
1488				  k.k->p.snapshot, seen,
1489				  i->snapshot, &i->seen))
1490			continue;
1491
1492		ret = overlapping_extents_found(trans, iter->btree_id,
1493						SPOS(iter->pos.inode,
1494						     i->offset,
1495						     i->snapshot),
1496						&i->seen,
1497						*k.k, fixed, i);
1498		if (ret)
1499			goto err;
1500	}
1501
1502	extent_ends->last_pos = k.k->p;
1503err:
1504	return ret;
1505}
1506
1507static int check_extent_overbig(struct btree_trans *trans, struct btree_iter *iter,
1508				struct bkey_s_c k)
1509{
1510	struct bch_fs *c = trans->c;
1511	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
1512	struct bch_extent_crc_unpacked crc;
1513	const union bch_extent_entry *i;
1514	unsigned encoded_extent_max_sectors = c->opts.encoded_extent_max >> 9;
1515
1516	bkey_for_each_crc(k.k, ptrs, crc, i)
1517		if (crc_is_encoded(crc) &&
1518		    crc.uncompressed_size > encoded_extent_max_sectors) {
1519			struct printbuf buf = PRINTBUF;
1520
1521			bch2_bkey_val_to_text(&buf, c, k);
1522			bch_err(c, "overbig encoded extent, please report this:\n  %s", buf.buf);
1523			printbuf_exit(&buf);
1524		}
1525
1526	return 0;
1527}
1528
1529static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
1530			struct bkey_s_c k,
1531			struct inode_walker *inode,
1532			struct snapshots_seen *s,
1533			struct extent_ends *extent_ends)
1534{
1535	struct bch_fs *c = trans->c;
1536	struct inode_walker_entry *i;
1537	struct printbuf buf = PRINTBUF;
1538	struct bpos equiv = k.k->p;
1539	int ret = 0;
1540
1541	equiv.snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot);
1542
1543	ret = check_key_has_snapshot(trans, iter, k);
1544	if (ret) {
1545		ret = ret < 0 ? ret : 0;
1546		goto out;
1547	}
1548
1549	if (inode->last_pos.inode != k.k->p.inode) {
1550		ret = check_i_sectors(trans, inode);
1551		if (ret)
1552			goto err;
1553	}
1554
1555	i = walk_inode(trans, inode, k);
1556	ret = PTR_ERR_OR_ZERO(i);
1557	if (ret)
1558		goto err;
1559
1560	ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
1561	if (ret)
1562		goto err;
1563
1564	if (k.k->type != KEY_TYPE_whiteout) {
1565		if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) {
1566			ret =   reconstruct_reg_inode(trans, k.k->p.snapshot, k.k->p.inode) ?:
1567				bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
1568			if (ret)
1569				goto err;
1570
1571			inode->last_pos.inode--;
1572			ret = -BCH_ERR_transaction_restart_nested;
1573			goto err;
1574		}
1575
1576		if (fsck_err_on(!i, c, extent_in_missing_inode,
1577				"extent in missing inode:\n  %s",
1578				(printbuf_reset(&buf),
1579				 bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
1580			goto delete;
1581
1582		if (fsck_err_on(i &&
1583				!S_ISREG(i->inode.bi_mode) &&
1584				!S_ISLNK(i->inode.bi_mode),
1585				c, extent_in_non_reg_inode,
1586				"extent in non regular inode mode %o:\n  %s",
1587				i->inode.bi_mode,
1588				(printbuf_reset(&buf),
1589				 bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
1590			goto delete;
1591
1592		ret = check_overlapping_extents(trans, s, extent_ends, k,
1593						equiv.snapshot, iter,
1594						&inode->recalculate_sums);
1595		if (ret)
1596			goto err;
1597	}
1598
1599	/*
1600	 * Check inodes in reverse order, from oldest snapshots to newest,
1601	 * starting from the inode that matches this extent's snapshot. If we
1602	 * didn't have one, iterate over all inodes:
1603	 */
1604	if (!i)
1605		i = inode->inodes.data + inode->inodes.nr - 1;
1606
1607	for (;
1608	     inode->inodes.data && i >= inode->inodes.data;
1609	     --i) {
1610		if (i->snapshot > equiv.snapshot ||
1611		    !key_visible_in_snapshot(c, s, i->snapshot, equiv.snapshot))
1612			continue;
1613
1614		if (k.k->type != KEY_TYPE_whiteout) {
1615			if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_size_dirty) &&
1616					k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 &&
1617					!bkey_extent_is_reservation(k),
1618					c, extent_past_end_of_inode,
1619					"extent type past end of inode %llu:%u, i_size %llu\n  %s",
1620					i->inode.bi_inum, i->snapshot, i->inode.bi_size,
1621					(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
1622				struct btree_iter iter2;
1623
1624				bch2_trans_copy_iter(&iter2, iter);
1625				bch2_btree_iter_set_snapshot(&iter2, i->snapshot);
1626				ret =   bch2_btree_iter_traverse(&iter2) ?:
1627					bch2_btree_delete_at(trans, &iter2,
1628						BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
1629				bch2_trans_iter_exit(trans, &iter2);
1630				if (ret)
1631					goto err;
1632
1633				iter->k.type = KEY_TYPE_whiteout;
1634			}
1635
1636			if (bkey_extent_is_allocation(k.k))
1637				i->count += k.k->size;
1638		}
1639
1640		i->seen_this_pos = true;
1641	}
1642
1643	if (k.k->type != KEY_TYPE_whiteout) {
1644		ret = extent_ends_at(c, extent_ends, s, k);
1645		if (ret)
1646			goto err;
1647	}
1648out:
1649err:
1650fsck_err:
1651	printbuf_exit(&buf);
1652	bch_err_fn(c, ret);
1653	return ret;
1654delete:
1655	ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
1656	goto out;
1657}
1658
1659/*
1660 * Walk extents: verify that extents have a corresponding S_ISREG inode, and
1661 * that i_size an i_sectors are consistent
1662 */
1663int bch2_check_extents(struct bch_fs *c)
1664{
1665	struct inode_walker w = inode_walker_init();
1666	struct snapshots_seen s;
1667	struct extent_ends extent_ends;
1668	struct disk_reservation res = { 0 };
1669
1670	snapshots_seen_init(&s);
1671	extent_ends_init(&extent_ends);
1672
1673	int ret = bch2_trans_run(c,
1674		for_each_btree_key_commit(trans, iter, BTREE_ID_extents,
1675				POS(BCACHEFS_ROOT_INO, 0),
1676				BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
1677				&res, NULL,
1678				BCH_TRANS_COMMIT_no_enospc, ({
1679			bch2_disk_reservation_put(c, &res);
1680			check_extent(trans, &iter, k, &w, &s, &extent_ends) ?:
1681			check_extent_overbig(trans, &iter, k);
1682		})) ?:
1683		check_i_sectors_notnested(trans, &w));
1684
1685	bch2_disk_reservation_put(c, &res);
1686	extent_ends_exit(&extent_ends);
1687	inode_walker_exit(&w);
1688	snapshots_seen_exit(&s);
1689
1690	bch_err_fn(c, ret);
1691	return ret;
1692}
1693
1694int bch2_check_indirect_extents(struct bch_fs *c)
1695{
1696	struct disk_reservation res = { 0 };
1697
1698	int ret = bch2_trans_run(c,
1699		for_each_btree_key_commit(trans, iter, BTREE_ID_reflink,
1700				POS_MIN,
1701				BTREE_ITER_PREFETCH, k,
1702				&res, NULL,
1703				BCH_TRANS_COMMIT_no_enospc, ({
1704			bch2_disk_reservation_put(c, &res);
1705			check_extent_overbig(trans, &iter, k);
1706		})));
1707
1708	bch2_disk_reservation_put(c, &res);
1709	bch_err_fn(c, ret);
1710	return ret;
1711}
1712
1713static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_walker *w)
1714{
1715	struct bch_fs *c = trans->c;
1716	int ret = 0;
1717	s64 count2;
1718
1719	darray_for_each(w->inodes, i) {
1720		if (i->inode.bi_nlink == i->count)
1721			continue;
1722
1723		count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->snapshot);
1724		if (count2 < 0)
1725			return count2;
1726
1727		if (i->count != count2) {
1728			bch_err_ratelimited(c, "fsck counted subdirectories wrong for inum %llu:%u: got %llu should be %llu",
1729					    w->last_pos.inode, i->snapshot, i->count, count2);
1730			i->count = count2;
1731			if (i->inode.bi_nlink == i->count)
1732				continue;
1733		}
1734
1735		if (fsck_err_on(i->inode.bi_nlink != i->count,
1736				c, inode_dir_wrong_nlink,
1737				"directory %llu:%u with wrong i_nlink: got %u, should be %llu",
1738				w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) {
1739			i->inode.bi_nlink = i->count;
1740			ret = bch2_fsck_write_inode(trans, &i->inode, i->snapshot);
1741			if (ret)
1742				break;
1743		}
1744	}
1745fsck_err:
1746	bch_err_fn(c, ret);
1747	return ret;
1748}
1749
1750static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
1751{
1752	u32 restart_count = trans->restart_count;
1753	return check_subdir_count_notnested(trans, w) ?:
1754		trans_was_restarted(trans, restart_count);
1755}
1756
1757static int check_dirent_inode_dirent(struct btree_trans *trans,
1758				   struct btree_iter *iter,
1759				   struct bkey_s_c_dirent d,
1760				   struct bch_inode_unpacked *target,
1761				   u32 target_snapshot)
1762{
1763	struct bch_fs *c = trans->c;
1764	struct printbuf buf = PRINTBUF;
1765	int ret = 0;
1766
1767	if (inode_points_to_dirent(target, d))
1768		return 0;
1769
1770	if (!target->bi_dir &&
1771	    !target->bi_dir_offset) {
1772		target->bi_dir		= d.k->p.inode;
1773		target->bi_dir_offset	= d.k->p.offset;
1774		return __bch2_fsck_write_inode(trans, target, target_snapshot);
1775	}
1776
1777	struct btree_iter bp_iter = { NULL };
1778	struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter,
1779			      SPOS(target->bi_dir, target->bi_dir_offset, target_snapshot));
1780	ret = bkey_err(bp_dirent);
1781	if (ret && !bch2_err_matches(ret, ENOENT))
1782		goto err;
1783
1784	bool backpointer_exists = !ret;
1785	ret = 0;
1786
1787	if (fsck_err_on(!backpointer_exists,
1788			c, inode_wrong_backpointer,
1789			"inode %llu:%u has wrong backpointer:\n"
1790			"got       %llu:%llu\n"
1791			"should be %llu:%llu",
1792			target->bi_inum, target_snapshot,
1793			target->bi_dir,
1794			target->bi_dir_offset,
1795			d.k->p.inode,
1796			d.k->p.offset)) {
1797		target->bi_dir		= d.k->p.inode;
1798		target->bi_dir_offset	= d.k->p.offset;
1799		ret = __bch2_fsck_write_inode(trans, target, target_snapshot);
1800		goto out;
1801	}
1802
1803	bch2_bkey_val_to_text(&buf, c, d.s_c);
1804	prt_newline(&buf);
1805	if (backpointer_exists)
1806		bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c);
1807
1808	if (fsck_err_on(backpointer_exists &&
1809			(S_ISDIR(target->bi_mode) ||
1810			 target->bi_subvol),
1811			c, inode_dir_multiple_links,
1812			"%s %llu:%u with multiple links\n%s",
1813			S_ISDIR(target->bi_mode) ? "directory" : "subvolume",
1814			target->bi_inum, target_snapshot, buf.buf)) {
1815		ret = __remove_dirent(trans, d.k->p);
1816		goto out;
1817	}
1818
1819	/*
1820	 * hardlinked file with nlink 0:
1821	 * We're just adjusting nlink here so check_nlinks() will pick
1822	 * it up, it ignores inodes with nlink 0
1823	 */
1824	if (fsck_err_on(backpointer_exists && !target->bi_nlink,
1825			c, inode_multiple_links_but_nlink_0,
1826			"inode %llu:%u type %s has multiple links but i_nlink 0\n%s",
1827			target->bi_inum, target_snapshot, bch2_d_types[d.v->d_type], buf.buf)) {
1828		target->bi_nlink++;
1829		target->bi_flags &= ~BCH_INODE_unlinked;
1830		ret = __bch2_fsck_write_inode(trans, target, target_snapshot);
1831		if (ret)
1832			goto err;
1833	}
1834out:
1835err:
1836fsck_err:
1837	bch2_trans_iter_exit(trans, &bp_iter);
1838	printbuf_exit(&buf);
1839	bch_err_fn(c, ret);
1840	return ret;
1841}
1842
1843static int check_dirent_target(struct btree_trans *trans,
1844			       struct btree_iter *iter,
1845			       struct bkey_s_c_dirent d,
1846			       struct bch_inode_unpacked *target,
1847			       u32 target_snapshot)
1848{
1849	struct bch_fs *c = trans->c;
1850	struct bkey_i_dirent *n;
1851	struct printbuf buf = PRINTBUF;
1852	int ret = 0;
1853
1854	ret = check_dirent_inode_dirent(trans, iter, d, target, target_snapshot);
1855	if (ret)
1856		goto err;
1857
1858	if (fsck_err_on(d.v->d_type != inode_d_type(target),
1859			c, dirent_d_type_wrong,
1860			"incorrect d_type: got %s, should be %s:\n%s",
1861			bch2_d_type_str(d.v->d_type),
1862			bch2_d_type_str(inode_d_type(target)),
1863			(printbuf_reset(&buf),
1864			 bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
1865		n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
1866		ret = PTR_ERR_OR_ZERO(n);
1867		if (ret)
1868			goto err;
1869
1870		bkey_reassemble(&n->k_i, d.s_c);
1871		n->v.d_type = inode_d_type(target);
1872		if (n->v.d_type == DT_SUBVOL) {
1873			n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol);
1874			n->v.d_child_subvol = cpu_to_le32(target->bi_subvol);
1875		} else {
1876			n->v.d_inum = cpu_to_le64(target->bi_inum);
1877		}
1878
1879		ret = bch2_trans_update(trans, iter, &n->k_i, 0);
1880		if (ret)
1881			goto err;
1882
1883		d = dirent_i_to_s_c(n);
1884	}
1885err:
1886fsck_err:
1887	printbuf_exit(&buf);
1888	bch_err_fn(c, ret);
1889	return ret;
1890}
1891
1892/* find a subvolume that's a descendent of @snapshot: */
1893static int find_snapshot_subvol(struct btree_trans *trans, u32 snapshot, u32 *subvolid)
1894{
1895	struct btree_iter iter;
1896	struct bkey_s_c k;
1897	int ret;
1898
1899	for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, 0, k, ret) {
1900		if (k.k->type != KEY_TYPE_subvolume)
1901			continue;
1902
1903		struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
1904		if (bch2_snapshot_is_ancestor(trans->c, le32_to_cpu(s.v->snapshot), snapshot)) {
1905			bch2_trans_iter_exit(trans, &iter);
1906			*subvolid = k.k->p.offset;
1907			goto found;
1908		}
1909	}
1910	if (!ret)
1911		ret = -ENOENT;
1912found:
1913	bch2_trans_iter_exit(trans, &iter);
1914	return ret;
1915}
1916
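/*
 * Check a dirent that points to a subvolume: the parent subvolume must exist
 * and be visible in the dirent's snapshot (otherwise repoint the dirent at a
 * subvolume we can find, or reconstruct one if the subvolumes btree was lost),
 * the child subvolume must exist, and both its fs_path_parent and its root
 * inode's bi_parent_subvol must point back at the parent subvolume.
 */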
1917static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *iter,
1918				  struct bkey_s_c_dirent d)
1919{
1920	struct bch_fs *c = trans->c;
1921	struct btree_iter subvol_iter = {};
1922	struct bch_inode_unpacked subvol_root;
1923	u32 parent_subvol = le32_to_cpu(d.v->d_parent_subvol);
1924	u32 target_subvol = le32_to_cpu(d.v->d_child_subvol);
1925	u32 parent_snapshot;
1926	u32 new_parent_subvol = 0;
1927	u64 parent_inum;
1928	struct printbuf buf = PRINTBUF;
1929	int ret = 0;
1930
1931	ret = subvol_lookup(trans, parent_subvol, &parent_snapshot, &parent_inum);
1932	if (ret && !bch2_err_matches(ret, ENOENT))
1933		return ret;
1934
	if (ret ||
	    !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot)) {
		int ret2 = find_snapshot_subvol(trans, d.k->p.snapshot, &new_parent_subvol);
		if (ret2 && !bch2_err_matches(ret2, ENOENT))
			return ret2;
	}
1941
1942	if (ret &&
1943	    !new_parent_subvol &&
1944	    (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) {
1945		/*
1946		 * Couldn't find a subvol for dirent's snapshot - but we lost
1947		 * subvols, so we need to reconstruct:
1948		 */
1949		ret = reconstruct_subvol(trans, d.k->p.snapshot, parent_subvol, 0);
1950		if (ret)
1951			return ret;
1952
1953		parent_snapshot = d.k->p.snapshot;
1954	}
1955
1956	if (fsck_err_on(ret, c, dirent_to_missing_parent_subvol,
1957			"dirent parent_subvol points to missing subvolume\n%s",
1958			(bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)) ||
1959	    fsck_err_on(!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot),
1960			c, dirent_not_visible_in_parent_subvol,
1961			"dirent not visible in parent_subvol (not an ancestor of subvol snap %u)\n%s",
1962			parent_snapshot,
1963			(bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
		if (!new_parent_subvol) {
			bch_err(c, "could not find a subvol for snapshot %u", d.k->p.snapshot);
			ret = -BCH_ERR_fsck_repair_unimplemented;
			goto err;
		}
1968
1969		struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent);
1970		ret = PTR_ERR_OR_ZERO(new_dirent);
1971		if (ret)
1972			goto err;
1973
1974		new_dirent->v.d_parent_subvol = cpu_to_le32(new_parent_subvol);
1975	}
1976
1977	struct bkey_s_c_subvolume s =
1978		bch2_bkey_get_iter_typed(trans, &subvol_iter,
1979					 BTREE_ID_subvolumes, POS(0, target_subvol),
1980					 0, subvolume);
1981	ret = bkey_err(s.s_c);
	if (ret && !bch2_err_matches(ret, ENOENT))
		goto err;

	if (ret) {
		if (fsck_err(c, dirent_to_missing_subvol,
			     "dirent points to missing subvolume\n%s",
			     (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
			ret = __remove_dirent(trans, d.k->p);
			goto err;
		}
		ret = 0;
		goto out;
	}
1993
1994	if (fsck_err_on(le32_to_cpu(s.v->fs_path_parent) != parent_subvol,
1995			c, subvol_fs_path_parent_wrong,
			"subvol with wrong fs_path_parent, should be %u\n%s",
1997			parent_subvol,
1998			(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
1999		struct bkey_i_subvolume *n =
2000			bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, subvolume);
2001		ret = PTR_ERR_OR_ZERO(n);
2002		if (ret)
2003			goto err;
2004
2005		n->v.fs_path_parent = cpu_to_le32(parent_subvol);
2006	}
2007
2008	u64 target_inum = le64_to_cpu(s.v->inode);
2009	u32 target_snapshot = le32_to_cpu(s.v->snapshot);
2010
2011	ret = lookup_inode(trans, target_inum, &subvol_root, &target_snapshot);
2012	if (ret && !bch2_err_matches(ret, ENOENT))
2013		goto err;
2014
	if (ret) {
		bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum);
		ret = -BCH_ERR_fsck_repair_unimplemented;
		goto err;
	}
2021
	if (fsck_err_on(parent_subvol != subvol_root.bi_parent_subvol,
2023			c, inode_bi_parent_wrong,
2024			"subvol root %llu has wrong bi_parent_subvol: got %u, should be %u",
2025			target_inum,
2026			subvol_root.bi_parent_subvol, parent_subvol)) {
2027		subvol_root.bi_parent_subvol = parent_subvol;
2028		ret = __bch2_fsck_write_inode(trans, &subvol_root, target_snapshot);
2029		if (ret)
2030			goto err;
2031	}
2032
2033	ret = check_dirent_target(trans, iter, d, &subvol_root,
2034				  target_snapshot);
2035	if (ret)
2036		goto err;
2037out:
2038err:
2039fsck_err:
2040	bch2_trans_iter_exit(trans, &subvol_iter);
2041	printbuf_exit(&buf);
2042	return ret;
2043}
2044
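/*
 * Check a single dirent: the containing directory must exist and actually be
 * a directory, the dirent must be at the correct hash position, and the
 * target (inode or subvolume) must exist and be consistent with the dirent.
 * Also accumulates per-directory subdirectory counts for check_subdir_count().
 */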
2045static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
2046			struct bkey_s_c k,
2047			struct bch_hash_info *hash_info,
2048			struct inode_walker *dir,
2049			struct inode_walker *target,
2050			struct snapshots_seen *s)
2051{
2052	struct bch_fs *c = trans->c;
2053	struct inode_walker_entry *i;
2054	struct printbuf buf = PRINTBUF;
2055	struct bpos equiv;
2056	int ret = 0;
2057
2058	ret = check_key_has_snapshot(trans, iter, k);
2059	if (ret) {
2060		ret = ret < 0 ? ret : 0;
2061		goto out;
2062	}
2063
2064	equiv = k.k->p;
2065	equiv.snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot);
2066
2067	ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
2068	if (ret)
2069		goto err;
2070
2071	if (k.k->type == KEY_TYPE_whiteout)
2072		goto out;
2073
2074	if (dir->last_pos.inode != k.k->p.inode) {
2075		ret = check_subdir_count(trans, dir);
2076		if (ret)
2077			goto err;
2078	}
2079
2080	BUG_ON(!btree_iter_path(trans, iter)->should_be_locked);
2081
2082	i = walk_inode(trans, dir, k);
2083	ret = PTR_ERR_OR_ZERO(i);
2084	if (ret < 0)
2085		goto err;
2086
2087	if (dir->first_this_inode && dir->inodes.nr)
2088		*hash_info = bch2_hash_info_init(c, &dir->inodes.data[0].inode);
2089	dir->first_this_inode = false;
2090
2091	if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) {
2092		ret =   reconstruct_inode(trans, k.k->p.snapshot, k.k->p.inode, 0, S_IFDIR) ?:
2093			bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
2094		if (ret)
2095			goto err;
2096
2097		dir->last_pos.inode--;
2098		ret = -BCH_ERR_transaction_restart_nested;
2099		goto err;
2100	}
2101
2102	if (fsck_err_on(!i, c, dirent_in_missing_dir_inode,
			"dirent in nonexistent directory:\n%s",
2104			(printbuf_reset(&buf),
2105			 bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
2106		ret = bch2_btree_delete_at(trans, iter,
2107				BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
2108		goto out;
2109	}
2110
2111	if (!i)
2112		goto out;
2113
2114	if (fsck_err_on(!S_ISDIR(i->inode.bi_mode),
2115			c, dirent_in_non_dir_inode,
			"dirent in non-directory inode type %s:\n%s",
2117			bch2_d_type_str(inode_d_type(&i->inode)),
2118			(printbuf_reset(&buf),
2119			 bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
2120		ret = bch2_btree_delete_at(trans, iter, 0);
2121		goto out;
2122	}
2123
2124	ret = hash_check_key(trans, bch2_dirent_hash_desc, hash_info, iter, k);
2125	if (ret < 0)
2126		goto err;
2127	if (ret) {
2128		/* dirent has been deleted */
2129		ret = 0;
2130		goto out;
2131	}
2132
2133	if (k.k->type != KEY_TYPE_dirent)
2134		goto out;
2135
2136	struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
2137
2138	if (d.v->d_type == DT_SUBVOL) {
2139		ret = check_dirent_to_subvol(trans, iter, d);
2140		if (ret)
2141			goto err;
2142	} else {
2143		ret = __get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum));
2144		if (ret)
2145			goto err;
2146
2147		if (fsck_err_on(!target->inodes.nr,
2148				c, dirent_to_missing_inode,
				"dirent points to missing inode (equiv %u):\n%s",
2150				equiv.snapshot,
2151				(printbuf_reset(&buf),
2152				 bch2_bkey_val_to_text(&buf, c, k),
2153				 buf.buf))) {
2154			ret = __remove_dirent(trans, d.k->p);
2155			if (ret)
2156				goto err;
2157		}
2158
2159		darray_for_each(target->inodes, i) {
2160			ret = check_dirent_target(trans, iter, d,
2161						  &i->inode, i->snapshot);
2162			if (ret)
2163				goto err;
2164		}
2165
2166		if (d.v->d_type == DT_DIR)
2167			for_each_visible_inode(c, s, dir, equiv.snapshot, i)
2168				i->count++;
2169	}
2170out:
2171err:
2172fsck_err:
2173	printbuf_exit(&buf);
2174	bch_err_fn(c, ret);
2175	return ret;
2176}
2177
2178/*
2179 * Walk dirents: verify that they all have a corresponding S_ISDIR inode,
2180 * validate d_type
2181 */
2182int bch2_check_dirents(struct bch_fs *c)
2183{
2184	struct inode_walker dir = inode_walker_init();
2185	struct inode_walker target = inode_walker_init();
2186	struct snapshots_seen s;
2187	struct bch_hash_info hash_info;
2188
2189	snapshots_seen_init(&s);
2190
2191	int ret = bch2_trans_run(c,
2192		for_each_btree_key_commit(trans, iter, BTREE_ID_dirents,
2193				POS(BCACHEFS_ROOT_INO, 0),
2194				BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
2195				k,
2196				NULL, NULL,
2197				BCH_TRANS_COMMIT_no_enospc,
2198			check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)) ?:
2199		check_subdir_count_notnested(trans, &dir));
2200
2201	snapshots_seen_exit(&s);
2202	inode_walker_exit(&dir);
2203	inode_walker_exit(&target);
2204	bch_err_fn(c, ret);
2205	return ret;
2206}
2207
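/*
 * Check a single xattr: it must have a valid snapshot and belong to an
 * existing inode (xattrs for missing inodes are deleted); hash_check_key()
 * then verifies it's at the correct hash position.
 */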
2208static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
2209		       struct bkey_s_c k,
2210		       struct bch_hash_info *hash_info,
2211		       struct inode_walker *inode)
2212{
2213	struct bch_fs *c = trans->c;
2214	struct inode_walker_entry *i;
2215	int ret;
2216
2217	ret = check_key_has_snapshot(trans, iter, k);
2218	if (ret < 0)
2219		return ret;
2220	if (ret)
2221		return 0;
2222
2223	i = walk_inode(trans, inode, k);
2224	ret = PTR_ERR_OR_ZERO(i);
2225	if (ret)
2226		return ret;
2227
2228	if (inode->first_this_inode && inode->inodes.nr)
2229		*hash_info = bch2_hash_info_init(c, &inode->inodes.data[0].inode);
2230	inode->first_this_inode = false;
2231
2232	if (fsck_err_on(!i, c, xattr_in_missing_inode,
2233			"xattr for missing inode %llu",
2234			k.k->p.inode))
2235		return bch2_btree_delete_at(trans, iter, 0);
2236
2237	if (!i)
2238		return 0;
2239
2240	ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k);
2241fsck_err:
2242	bch_err_fn(c, ret);
2243	return ret;
2244}
2245
2246/*
2247 * Walk xattrs: verify that they all have a corresponding inode
2248 */
2249int bch2_check_xattrs(struct bch_fs *c)
2250{
2251	struct inode_walker inode = inode_walker_init();
2252	struct bch_hash_info hash_info;
2253	int ret = 0;
2254
2255	ret = bch2_trans_run(c,
2256		for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
2257			POS(BCACHEFS_ROOT_INO, 0),
2258			BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
2259			k,
2260			NULL, NULL,
2261			BCH_TRANS_COMMIT_no_enospc,
2262		check_xattr(trans, &iter, k, &hash_info, &inode)));
2263	bch_err_fn(c, ret);
2264	return ret;
2265}
2266
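/*
 * Check that the root subvolume and the root directory inode exist, creating
 * them with default values if they're missing:
 */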
2267static int check_root_trans(struct btree_trans *trans)
2268{
2269	struct bch_fs *c = trans->c;
2270	struct bch_inode_unpacked root_inode;
2271	u32 snapshot;
2272	u64 inum;
2273	int ret;
2274
2275	ret = subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum);
2276	if (ret && !bch2_err_matches(ret, ENOENT))
2277		return ret;
2278
2279	if (mustfix_fsck_err_on(ret, c, root_subvol_missing,
2280				"root subvol missing")) {
2281		struct bkey_i_subvolume *root_subvol =
2282			bch2_trans_kmalloc(trans, sizeof(*root_subvol));
2283		ret = PTR_ERR_OR_ZERO(root_subvol);
2284		if (ret)
2285			goto err;
2286
2287		snapshot	= U32_MAX;
2288		inum		= BCACHEFS_ROOT_INO;
2289
2290		bkey_subvolume_init(&root_subvol->k_i);
2291		root_subvol->k.p.offset = BCACHEFS_ROOT_SUBVOL;
2292		root_subvol->v.flags	= 0;
2293		root_subvol->v.snapshot	= cpu_to_le32(snapshot);
2294		root_subvol->v.inode	= cpu_to_le64(inum);
2295		ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol->k_i, 0);
2296		bch_err_msg(c, ret, "writing root subvol");
2297		if (ret)
2298			goto err;
2299	}
2300
2301	ret = lookup_inode(trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot);
2302	if (ret && !bch2_err_matches(ret, ENOENT))
2303		return ret;
2304
2305	if (mustfix_fsck_err_on(ret, c, root_dir_missing,
2306				"root directory missing") ||
2307	    mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode),
2308				c, root_inode_not_dir,
2309				"root inode not a directory")) {
2310		bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755,
2311				0, NULL);
2312		root_inode.bi_inum = inum;
2313
2314		ret = __bch2_fsck_write_inode(trans, &root_inode, snapshot);
2315		bch_err_msg(c, ret, "writing root inode");
2316	}
2317err:
2318fsck_err:
2319	return ret;
2320}
2321
2322/* Get root directory, create if it doesn't exist: */
2323int bch2_check_root(struct bch_fs *c)
2324{
2325	int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
2326		check_root_trans(trans));
2327	bch_err_fn(c, ret);
2328	return ret;
2329}
2330
2331typedef DARRAY(u32) darray_u32;
2332
2333static bool darray_u32_has(darray_u32 *d, u32 v)
2334{
2335	darray_for_each(*d, i)
2336		if (*i == v)
2337			return true;
2338	return false;
2339}
2340
2341/*
2342 * We've checked that inode backpointers point to valid dirents; here, it's
2343 * sufficient to check that the subvolume root has a dirent:
2344 */
2345static int subvol_has_dirent(struct btree_trans *trans, struct bkey_s_c_subvolume s)
2346{
2347	struct bch_inode_unpacked inode;
2348	int ret = bch2_inode_find_by_inum_trans(trans,
2349				(subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
2350				&inode);
2351	if (ret)
2352		return ret;
2353
2354	return inode.bi_dir != 0;
2355}
2356
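/*
 * Walk a subvolume's fs_path_parent pointers up towards the root subvolume,
 * reattaching any subvolume that turns out to be unreachable or part of a
 * loop:
 */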
2357static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k)
2358{
2359	struct bch_fs *c = trans->c;
2360	struct btree_iter parent_iter = {};
2361	darray_u32 subvol_path = {};
2362	struct printbuf buf = PRINTBUF;
2363	int ret = 0;
2364
2365	if (k.k->type != KEY_TYPE_subvolume)
2366		return 0;
2367
2368	while (k.k->p.offset != BCACHEFS_ROOT_SUBVOL) {
2369		ret = darray_push(&subvol_path, k.k->p.offset);
2370		if (ret)
2371			goto err;
2372
2373		struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
2374
2375		ret = subvol_has_dirent(trans, s);
2376		if (ret < 0)
2377			break;
2378
2379		if (fsck_err_on(!ret,
2380				c, subvol_unreachable,
2381				"unreachable subvolume %s",
2382				(bch2_bkey_val_to_text(&buf, c, s.s_c),
2383				 buf.buf))) {
2384			ret = reattach_subvol(trans, s);
2385			break;
2386		}
2387
2388		u32 parent = le32_to_cpu(s.v->fs_path_parent);
2389
2390		if (darray_u32_has(&subvol_path, parent)) {
2391			if (fsck_err(c, subvol_loop, "subvolume loop"))
2392				ret = reattach_subvol(trans, s);
2393			break;
2394		}
2395
2396		bch2_trans_iter_exit(trans, &parent_iter);
2397		bch2_trans_iter_init(trans, &parent_iter,
2398				     BTREE_ID_subvolumes, POS(0, parent), 0);
2399		k = bch2_btree_iter_peek_slot(&parent_iter);
2400		ret = bkey_err(k);
2401		if (ret)
2402			goto err;
2403
2404		if (fsck_err_on(k.k->type != KEY_TYPE_subvolume,
2405				c, subvol_unreachable,
2406				"unreachable subvolume %s",
2407				(bch2_bkey_val_to_text(&buf, c, s.s_c),
2408				 buf.buf))) {
2409			ret = reattach_subvol(trans, s);
2410			break;
2411		}
2412	}
2413fsck_err:
2414err:
2415	printbuf_exit(&buf);
2416	darray_exit(&subvol_path);
2417	bch2_trans_iter_exit(trans, &parent_iter);
2418	return ret;
2419}
2420
2421int bch2_check_subvolume_structure(struct bch_fs *c)
2422{
2423	int ret = bch2_trans_run(c,
2424		for_each_btree_key_commit(trans, iter,
2425				BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k,
2426				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
2427			check_subvol_path(trans, &iter, k)));
2428	bch_err_fn(c, ret);
2429	return ret;
2430}
2431
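/* (inum, snapshot) pairs seen while walking towards the root, for loop detection: */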
2432struct pathbuf_entry {
2433	u64	inum;
2434	u32	snapshot;
2435};
2436
2437typedef DARRAY(struct pathbuf_entry) pathbuf;
2438
2439static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot)
2440{
2441	darray_for_each(*p, i)
2442		if (i->inum	== inum &&
2443		    i->snapshot	== snapshot)
2444			return true;
2445	return false;
2446}
2447
2448/*
2449 * Check that a given inode is reachable from its subvolume root - we already
2450 * verified subvolume connectivity:
2451 *
2452 * XXX: we should also be verifying that inodes are in the right subvolumes
2453 */
2454static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c inode_k)
2455{
2456	struct bch_fs *c = trans->c;
2457	struct btree_iter inode_iter = {};
2458	struct bch_inode_unpacked inode;
2459	struct printbuf buf = PRINTBUF;
2460	u32 snapshot = bch2_snapshot_equiv(c, inode_k.k->p.snapshot);
2461	int ret = 0;
2462
2463	p->nr = 0;
2464
2465	BUG_ON(bch2_inode_unpack(inode_k, &inode));
2466
2467	while (!inode.bi_subvol) {
2468		struct btree_iter dirent_iter;
2469		struct bkey_s_c_dirent d;
2470		u32 parent_snapshot = snapshot;
2471
2472		d = inode_get_dirent(trans, &dirent_iter, &inode, &parent_snapshot);
2473		ret = bkey_err(d.s_c);
2474		if (ret && !bch2_err_matches(ret, ENOENT))
2475			break;
2476
2477		if (!ret && !dirent_points_to_inode(d, &inode)) {
2478			bch2_trans_iter_exit(trans, &dirent_iter);
2479			ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
2480		}
2481
2482		if (bch2_err_matches(ret, ENOENT)) {
2483			ret = 0;
2484			if (fsck_err(c, inode_unreachable,
2485				     "unreachable inode\n%s",
2486				     (printbuf_reset(&buf),
2487				      bch2_bkey_val_to_text(&buf, c, inode_k),
2488				      buf.buf)))
2489				ret = reattach_inode(trans, &inode, snapshot);
2490			goto out;
2491		}
2492
2493		bch2_trans_iter_exit(trans, &dirent_iter);
2494
2495		if (!S_ISDIR(inode.bi_mode))
2496			break;
2497
2498		ret = darray_push(p, ((struct pathbuf_entry) {
2499			.inum		= inode.bi_inum,
2500			.snapshot	= snapshot,
2501		}));
2502		if (ret)
2503			return ret;
2504
2505		snapshot = parent_snapshot;
2506
2507		bch2_trans_iter_exit(trans, &inode_iter);
2508		inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes,
2509					     SPOS(0, inode.bi_dir, snapshot), 0);
2510		ret = bkey_err(inode_k) ?:
2511			!bkey_is_inode(inode_k.k) ? -BCH_ERR_ENOENT_inode
2512			: bch2_inode_unpack(inode_k, &inode);
2513		if (ret) {
2514			/* Should have been caught in dirents pass */
2515			if (!bch2_err_matches(ret, BCH_ERR_transaction_restart))
2516				bch_err(c, "error looking up parent directory: %i", ret);
2517			break;
2518		}
2519
2520		snapshot = inode_k.k->p.snapshot;
2521
2522		if (path_is_dup(p, inode.bi_inum, snapshot)) {
2523			/* XXX print path */
2524			bch_err(c, "directory structure loop");
2525
2526			darray_for_each(*p, i)
2527				pr_err("%llu:%u", i->inum, i->snapshot);
2528			pr_err("%llu:%u", inode.bi_inum, snapshot);
2529
2530			if (fsck_err(c, dir_loop, "directory structure loop")) {
2531				ret = remove_backpointer(trans, &inode);
2532				bch_err_msg(c, ret, "removing dirent");
2533				if (ret)
2534					break;
2535
2536				ret = reattach_inode(trans, &inode, snapshot);
2537				bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
2538			}
2539			break;
2540		}
2541	}
2542out:
2543fsck_err:
2544	bch2_trans_iter_exit(trans, &inode_iter);
2545	printbuf_exit(&buf);
2546	bch_err_fn(c, ret);
2547	return ret;
2548}
2549
/*
 * Check for unreachable inodes, as well as loops in the directory structure:
 * after bch2_check_dirents(), an inode without a backpointer is unreachable.
 */
2555int bch2_check_directory_structure(struct bch_fs *c)
2556{
2557	pathbuf path = { 0, };
2558	int ret;
2559
2560	ret = bch2_trans_run(c,
2561		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS_MIN,
2562					  BTREE_ITER_INTENT|
2563					  BTREE_ITER_PREFETCH|
2564					  BTREE_ITER_ALL_SNAPSHOTS, k,
2565					  NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
2566			if (!bkey_is_inode(k.k))
2567				continue;
2568
2569			if (bch2_inode_flags(k) & BCH_INODE_unlinked)
2570				continue;
2571
2572			check_path(trans, &path, k);
2573		})));
2574	darray_exit(&path);
2575
2576	bch_err_fn(c, ret);
2577	return ret;
2578}
2579
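/*
 * nlink_table: a flat array of (inum, snapshot, count) entries for hardlinked
 * inodes, built in inode number order so inc_link() can binary search it.
 * bch2_check_nlinks() fills and processes it one inode-number range at a time
 * so memory usage stays bounded.
 */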
2580struct nlink_table {
2581	size_t		nr;
2582	size_t		size;
2583
2584	struct nlink {
2585		u64	inum;
2586		u32	snapshot;
2587		u32	count;
2588	}		*d;
2589};
2590
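/* Append an entry, growing the table geometrically (min 128 entries): */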
2591static int add_nlink(struct bch_fs *c, struct nlink_table *t,
2592		     u64 inum, u32 snapshot)
2593{
2594	if (t->nr == t->size) {
2595		size_t new_size = max_t(size_t, 128UL, t->size * 2);
2596		void *d = kvmalloc_array(new_size, sizeof(t->d[0]), GFP_KERNEL);
2597
2598		if (!d) {
2599			bch_err(c, "fsck: error allocating memory for nlink_table, size %zu",
2600				new_size);
2601			return -BCH_ERR_ENOMEM_fsck_add_nlink;
2602		}
2603
2604		if (t->d)
2605			memcpy(d, t->d, t->size * sizeof(t->d[0]));
2606		kvfree(t->d);
2607
2608		t->d = d;
2609		t->size = new_size;
	}

	t->d[t->nr++] = (struct nlink) {
2614		.inum		= inum,
2615		.snapshot	= snapshot,
2616	};
2617
2618	return 0;
2619}
2620
2621static int nlink_cmp(const void *_l, const void *_r)
2622{
2623	const struct nlink *l = _l;
2624	const struct nlink *r = _r;
2625
2626	return cmp_int(l->inum, r->inum);
2627}
2628
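/*
 * Called for each dirent: find the table entries for the target inode number
 * (binary search, then walk back to the first entry for that inum) and bump
 * the link count for each snapshot in which this dirent is visible:
 */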
2629static void inc_link(struct bch_fs *c, struct snapshots_seen *s,
2630		     struct nlink_table *links,
2631		     u64 range_start, u64 range_end, u64 inum, u32 snapshot)
2632{
2633	struct nlink *link, key = {
2634		.inum = inum, .snapshot = U32_MAX,
2635	};
2636
2637	if (inum < range_start || inum >= range_end)
2638		return;
2639
2640	link = __inline_bsearch(&key, links->d, links->nr,
2641				sizeof(links->d[0]), nlink_cmp);
2642	if (!link)
2643		return;
2644
2645	while (link > links->d && link[0].inum == link[-1].inum)
2646		--link;
2647
2648	for (; link < links->d + links->nr && link->inum == inum; link++)
2649		if (ref_visible(c, s, snapshot, link->snapshot)) {
2650			link->count++;
2651			if (link->snapshot >= snapshot)
2652				break;
2653		}
2654}
2655
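/*
 * First pass: scan inodes starting at @start, recording every non-directory
 * inode with nonzero bi_nlink; if the table can't be grown, note where we
 * stopped in *end so the caller can continue with the next range.
 */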
2656noinline_for_stack
2657static int check_nlinks_find_hardlinks(struct bch_fs *c,
2658				       struct nlink_table *t,
2659				       u64 start, u64 *end)
2660{
2661	int ret = bch2_trans_run(c,
2662		for_each_btree_key(trans, iter, BTREE_ID_inodes,
2663				   POS(0, start),
2664				   BTREE_ITER_INTENT|
2665				   BTREE_ITER_PREFETCH|
2666				   BTREE_ITER_ALL_SNAPSHOTS, k, ({
2667			if (!bkey_is_inode(k.k))
2668				continue;
2669
2670			/* Should never fail, checked by bch2_inode_invalid: */
2671			struct bch_inode_unpacked u;
2672			BUG_ON(bch2_inode_unpack(k, &u));
2673
2674			/*
2675			 * Backpointer and directory structure checks are sufficient for
2676			 * directories, since they can't have hardlinks:
2677			 */
2678			if (S_ISDIR(u.bi_mode))
2679				continue;
2680
2681			if (!u.bi_nlink)
2682				continue;
2683
2684			ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot);
2685			if (ret) {
2686				*end = k.k->p.offset;
2687				ret = 0;
2688				break;
2689			}
2690			0;
2691		})));
2692
2693	bch_err_fn(c, ret);
2694	return ret;
2695}
2696
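/*
 * Second pass: walk dirents and count, for each inode in the table, the
 * dirents that point to it (directory and subvolume dirents excluded):
 */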
2697noinline_for_stack
2698static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links,
2699				     u64 range_start, u64 range_end)
2700{
2701	struct snapshots_seen s;
2702
2703	snapshots_seen_init(&s);
2704
2705	int ret = bch2_trans_run(c,
2706		for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN,
2707				   BTREE_ITER_INTENT|
2708				   BTREE_ITER_PREFETCH|
2709				   BTREE_ITER_ALL_SNAPSHOTS, k, ({
2710			ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p);
2711			if (ret)
2712				break;
2713
2714			if (k.k->type == KEY_TYPE_dirent) {
2715				struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
2716
2717				if (d.v->d_type != DT_DIR &&
2718				    d.v->d_type != DT_SUBVOL)
2719					inc_link(c, &s, links, range_start, range_end,
2720						 le64_to_cpu(d.v->d_inum),
2721						 bch2_snapshot_equiv(c, d.k->p.snapshot));
2722			}
2723			0;
2724		})));
2725
2726	snapshots_seen_exit(&s);
2727
2728	bch_err_fn(c, ret);
2729	return ret;
2730}
2731
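/*
 * Compare a hardlinked inode's on-disk nlink against the count tallied in
 * @links, rewriting the inode if they differ; returns 1 once the iterator
 * reaches @range_end so the caller stops this range.
 */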
2732static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_iter *iter,
2733				     struct bkey_s_c k,
2734				     struct nlink_table *links,
2735				     size_t *idx, u64 range_end)
2736{
2737	struct bch_fs *c = trans->c;
2738	struct bch_inode_unpacked u;
2739	struct nlink *link = &links->d[*idx];
2740	int ret = 0;
2741
2742	if (k.k->p.offset >= range_end)
2743		return 1;
2744
2745	if (!bkey_is_inode(k.k))
2746		return 0;
2747
2748	BUG_ON(bch2_inode_unpack(k, &u));
2749
2750	if (S_ISDIR(u.bi_mode))
2751		return 0;
2752
2753	if (!u.bi_nlink)
2754		return 0;
2755
2756	while ((cmp_int(link->inum, k.k->p.offset) ?:
2757		cmp_int(link->snapshot, k.k->p.snapshot)) < 0) {
2758		BUG_ON(*idx == links->nr);
2759		link = &links->d[++*idx];
2760	}
2761
2762	if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count,
2763			c, inode_wrong_nlink,
2764			"inode %llu type %s has wrong i_nlink (%u, should be %u)",
2765			u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)],
2766			bch2_inode_nlink_get(&u), link->count)) {
2767		bch2_inode_nlink_set(&u, link->count);
2768		ret = __bch2_fsck_write_inode(trans, &u, k.k->p.snapshot);
2769	}
2770fsck_err:
2771	return ret;
2772}
2773
2774noinline_for_stack
2775static int check_nlinks_update_hardlinks(struct bch_fs *c,
2776			       struct nlink_table *links,
2777			       u64 range_start, u64 range_end)
2778{
2779	size_t idx = 0;
2780
2781	int ret = bch2_trans_run(c,
2782		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
2783				POS(0, range_start),
2784				BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
2785				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
2786			check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end)));
2787	if (ret < 0) {
2788		bch_err(c, "error in fsck walking inodes: %s", bch2_err_str(ret));
2789		return ret;
2790	}
2791
2792	return 0;
2793}
2794
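/*
 * Check i_nlink on hardlinked files, in three passes per inode-number range:
 * collect hardlinked inodes into the nlink table, walk dirents counting links
 * to them, then rewrite inodes whose nlink is wrong. A range ends early when
 * the table can't grow, so the next iteration picks up where this one left
 * off.
 */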
2795int bch2_check_nlinks(struct bch_fs *c)
2796{
2797	struct nlink_table links = { 0 };
2798	u64 this_iter_range_start, next_iter_range_start = 0;
2799	int ret = 0;
2800
2801	do {
2802		this_iter_range_start = next_iter_range_start;
2803		next_iter_range_start = U64_MAX;
2804
		ret = check_nlinks_find_hardlinks(c, &links,
						  this_iter_range_start,
						  &next_iter_range_start);
		if (ret)
			break;

		ret = check_nlinks_walk_dirents(c, &links,
2810					  this_iter_range_start,
2811					  next_iter_range_start);
2812		if (ret)
2813			break;
2814
2815		ret = check_nlinks_update_hardlinks(c, &links,
2816					 this_iter_range_start,
2817					 next_iter_range_start);
2818		if (ret)
2819			break;
2820
2821		links.nr = 0;
2822	} while (next_iter_range_start != U64_MAX);
2823
2824	kvfree(links.d);
2825	bch_err_fn(c, ret);
2826	return ret;
2827}
2828
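/*
 * On filesystems older than bcachefs_metadata_version_reflink_p_fix,
 * reflink_p keys may have nonzero front_pad/back_pad; bch2_fix_reflink_p()
 * clears them:
 */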
2829static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter,
2830			     struct bkey_s_c k)
2831{
2832	struct bkey_s_c_reflink_p p;
2833	struct bkey_i_reflink_p *u;
2834
2835	if (k.k->type != KEY_TYPE_reflink_p)
2836		return 0;
2837
2838	p = bkey_s_c_to_reflink_p(k);
2839
2840	if (!p.v->front_pad && !p.v->back_pad)
2841		return 0;
2842
2843	u = bch2_trans_kmalloc(trans, sizeof(*u));
2844	int ret = PTR_ERR_OR_ZERO(u);
2845	if (ret)
2846		return ret;
2847
2848	bkey_reassemble(&u->k_i, k);
2849	u->v.front_pad	= 0;
2850	u->v.back_pad	= 0;
2851
2852	return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_NORUN);
2853}
2854
2855int bch2_fix_reflink_p(struct bch_fs *c)
2856{
2857	if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix)
2858		return 0;
2859
2860	int ret = bch2_trans_run(c,
2861		for_each_btree_key_commit(trans, iter,
2862				BTREE_ID_extents, POS_MIN,
2863				BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|
2864				BTREE_ITER_ALL_SNAPSHOTS, k,
2865				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
2866			fix_reflink_p_key(trans, &iter, k)));
2867	bch_err_fn(c, ret);
2868	return ret;
2869}
2870