1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * Copyright (c) 2021-2024 Oracle.  All Rights Reserved.
4 * Author: Darrick J. Wong <djwong@kernel.org>
5 */
6#include "xfs.h"
7#include "xfs_fs.h"
8#include "xfs_shared.h"
9#include "xfs_format.h"
10#include "xfs_trans_resv.h"
11#include "xfs_mount.h"
12#include "xfs_log_format.h"
13#include "xfs_trans.h"
14#include "xfs_inode.h"
15#include "xfs_icache.h"
16#include "xfs_iwalk.h"
17#include "xfs_ialloc.h"
18#include "xfs_dir2.h"
19#include "xfs_dir2_priv.h"
20#include "xfs_ag.h"
21#include "scrub/scrub.h"
22#include "scrub/common.h"
23#include "scrub/repair.h"
24#include "scrub/xfile.h"
25#include "scrub/xfarray.h"
26#include "scrub/iscan.h"
27#include "scrub/nlinks.h"
28#include "scrub/trace.h"
29#include "scrub/readdir.h"
30
31/*
32 * Live Inode Link Count Checking
33 * ==============================
34 *
35 * Inode link counts are "summary" metadata, in the sense that they are
36 * computed as the number of directory entries referencing each file on the
37 * filesystem.  Therefore, we compute the correct link counts by creating a
38 * shadow link count structure and walking every inode.
39 */
40
41/* Set us up to scrub inode link counts. */
42int
43xchk_setup_nlinks(
44	struct xfs_scrub	*sc)
45{
46	xchk_fsgates_enable(sc, XCHK_FSGATES_DIRENTS);
47
48	sc->buf = kzalloc(sizeof(struct xchk_nlink_ctrs), XCHK_GFP_FLAGS);
49	if (!sc->buf)
50		return -ENOMEM;
51
52	return xchk_setup_fs(sc);
53}
54
55/*
56 * Part 1: Collecting file link counts.  For each file, we create a shadow link
57 * counting structure, then walk the entire directory tree, incrementing parent
58 * and child link counts for each directory entry seen.
59 *
60 * To avoid false corruption reports in part 2, any failure in this part must
61 * set the INCOMPLETE flag even when a negative errno is returned.  This care
62 * must be taken with certain errno values (i.e. EFSBADCRC, EFSCORRUPTED,
63 * ECANCELED) that are absorbed into a scrub state flag update by
64 * xchk_*_process_error.  Scrub and repair share the same incore data
65 * structures, so the INCOMPLETE flag is critical to prevent a repair based on
66 * insufficient information.
67 *
68 * Because we are scanning a live filesystem, it's possible that another thread
69 * will try to update the link counts for an inode that we've already scanned.
70 * This will cause our counts to be incorrect.  Therefore, we hook all
71 * directory entry updates because that is when link count updates occur.  By
72 * shadowing transaction updates in this manner, live nlink check can ensure by
73 * locking the inode and the shadow structure that its own copies are not out
74 * of date.  Because the hook code runs in a different process context from the
75 * scrub code and the scrub state flags are not accessed atomically, failures
76 * in the hook code must abort the iscan and the scrubber must notice the
77 * aborted scan and set the incomplete flag.
78 *
79 * Note that we use jump labels and srcu notifier hooks to minimize the
80 * overhead when live nlinks is /not/ running.  Locking order for nlink
81 * observations is inode ILOCK -> iscan_lock/xchk_nlink_ctrs lock.
82 */
83
84/*
85 * Add a delta to an nlink counter, clamping the value to U32_MAX.  Because
86 * XFS_MAXLINK < U32_MAX, the checking code will produce the correct results
87 * even if we lose some precision.
88 */
89static inline void
90careful_add(
91	xfs_nlink_t	*nlinkp,
92	int		delta)
93{
94	uint64_t	new_value = (uint64_t)(*nlinkp) + delta;
95
96	BUILD_BUG_ON(XFS_MAXLINK > U32_MAX);
97	*nlinkp = min_t(uint64_t, new_value, U32_MAX);
98}
99
100/* Update incore link count information.  Caller must hold the nlinks lock. */
101STATIC int
102xchk_nlinks_update_incore(
103	struct xchk_nlink_ctrs	*xnc,
104	xfs_ino_t		ino,
105	int			parents_delta,
106	int			backrefs_delta,
107	int			children_delta)
108{
109	struct xchk_nlink	nl;
110	int			error;
111
112	if (!xnc->nlinks)
113		return 0;
114
115	error = xfarray_load_sparse(xnc->nlinks, ino, &nl);
116	if (error)
117		return error;
118
119	trace_xchk_nlinks_update_incore(xnc->sc->mp, ino, &nl, parents_delta,
120			backrefs_delta, children_delta);
121
122	careful_add(&nl.parents, parents_delta);
123	careful_add(&nl.backrefs, backrefs_delta);
124	careful_add(&nl.children, children_delta);
125
126	nl.flags |= XCHK_NLINK_WRITTEN;
127	error = xfarray_store(xnc->nlinks, ino, &nl);
128	if (error == -EFBIG) {
129		/*
130		 * EFBIG means we tried to store data at too high a byte offset
131		 * in the sparse array.  IOWs, we cannot complete the check and
132		 * must notify userspace that the check was incomplete.
133		 */
134		error = -ECANCELED;
135	}
136	return error;
137}
138
139/*
140 * Apply a link count change from the regular filesystem into our shadow link
141 * count structure based on a directory update in progress.
142 */
143STATIC int
144xchk_nlinks_live_update(
145	struct notifier_block		*nb,
146	unsigned long			action,
147	void				*data)
148{
149	struct xfs_dir_update_params	*p = data;
150	struct xchk_nlink_ctrs		*xnc;
151	int				error;
152
153	xnc = container_of(nb, struct xchk_nlink_ctrs, dhook.dirent_hook.nb);
154
155	trace_xchk_nlinks_live_update(xnc->sc->mp, p->dp, action, p->ip->i_ino,
156			p->delta, p->name->name, p->name->len);
157
158	/*
159	 * If we've already scanned @dp, update the number of parents that link
160	 * to @ip.  If @ip is a subdirectory, update the number of child links
161	 * going out of @dp.
162	 */
163	if (xchk_iscan_want_live_update(&xnc->collect_iscan, p->dp->i_ino)) {
164		mutex_lock(&xnc->lock);
165		error = xchk_nlinks_update_incore(xnc, p->ip->i_ino, p->delta,
166				0, 0);
167		if (!error && S_ISDIR(VFS_IC(p->ip)->i_mode))
168			error = xchk_nlinks_update_incore(xnc, p->dp->i_ino, 0,
169					0, p->delta);
170		mutex_unlock(&xnc->lock);
171		if (error)
172			goto out_abort;
173	}
174
175	/*
176	 * If @ip is a subdirectory and we've already scanned it, update the
177	 * number of backrefs pointing to @dp.
178	 */
179	if (S_ISDIR(VFS_IC(p->ip)->i_mode) &&
180	    xchk_iscan_want_live_update(&xnc->collect_iscan, p->ip->i_ino)) {
181		mutex_lock(&xnc->lock);
182		error = xchk_nlinks_update_incore(xnc, p->dp->i_ino, 0,
183				p->delta, 0);
184		mutex_unlock(&xnc->lock);
185		if (error)
186			goto out_abort;
187	}
188
189	return NOTIFY_DONE;
190
191out_abort:
192	xchk_iscan_abort(&xnc->collect_iscan);
193	return NOTIFY_DONE;
194}
195
196/* Bump the observed link count for the inode referenced by this entry. */
197STATIC int
198xchk_nlinks_collect_dirent(
199	struct xfs_scrub	*sc,
200	struct xfs_inode	*dp,
201	xfs_dir2_dataptr_t	dapos,
202	const struct xfs_name	*name,
203	xfs_ino_t		ino,
204	void			*priv)
205{
206	struct xchk_nlink_ctrs	*xnc = priv;
207	bool			dot = false, dotdot = false;
208	int			error;
209
210	/* Does this name make sense? */
211	if (name->len == 0 || !xfs_dir2_namecheck(name->name, name->len)) {
212		error = -ECANCELED;
213		goto out_abort;
214	}
215
216	if (name->len == 1 && name->name[0] == '.')
217		dot = true;
218	else if (name->len == 2 && name->name[0] == '.' &&
219				   name->name[1] == '.')
220		dotdot = true;
221
222	/* Don't accept a '.' entry that points somewhere else. */
223	if (dot && ino != dp->i_ino) {
224		error = -ECANCELED;
225		goto out_abort;
226	}
227
228	/* Don't accept an invalid inode number. */
229	if (!xfs_verify_dir_ino(sc->mp, ino)) {
230		error = -ECANCELED;
231		goto out_abort;
232	}
233
234	/* Update the shadow link counts if we haven't already failed. */
235
236	if (xchk_iscan_aborted(&xnc->collect_iscan)) {
237		error = -ECANCELED;
238		goto out_incomplete;
239	}
240
241	trace_xchk_nlinks_collect_dirent(sc->mp, dp, ino, name);
242
243	mutex_lock(&xnc->lock);
244
245	/*
246	 * If this is a dotdot entry, it is a back link from dp to ino.  How
247	 * we handle this depends on whether or not dp is the root directory.
248	 *
249	 * The root directory is its own parent, so we pretend the dotdot entry
250	 * establishes the "parent" of the root directory.  Increment the
251	 * number of parents of the root directory.
252	 *
253	 * Otherwise, increment the number of backrefs pointing back to ino.
254	 */
255	if (dotdot) {
256		if (dp == sc->mp->m_rootip)
257			error = xchk_nlinks_update_incore(xnc, ino, 1, 0, 0);
258		else
259			error = xchk_nlinks_update_incore(xnc, ino, 0, 1, 0);
260		if (error)
261			goto out_unlock;
262	}
263
264	/*
265	 * If this dirent is a forward link from dp to ino, increment the
266	 * number of parents linking into ino.
267	 */
268	if (!dot && !dotdot) {
269		error = xchk_nlinks_update_incore(xnc, ino, 1, 0, 0);
270		if (error)
271			goto out_unlock;
272	}
273
274	/*
275	 * If this dirent is a forward link to a subdirectory, increment the
276	 * number of child links of dp.
277	 */
278	if (!dot && !dotdot && name->type == XFS_DIR3_FT_DIR) {
279		error = xchk_nlinks_update_incore(xnc, dp->i_ino, 0, 0, 1);
280		if (error)
281			goto out_unlock;
282	}
283
284	mutex_unlock(&xnc->lock);
285	return 0;
286
287out_unlock:
288	mutex_unlock(&xnc->lock);
289out_abort:
290	xchk_iscan_abort(&xnc->collect_iscan);
291out_incomplete:
292	xchk_set_incomplete(sc);
293	return error;
294}
295
296/* Walk a directory to bump the observed link counts of the children. */
297STATIC int
298xchk_nlinks_collect_dir(
299	struct xchk_nlink_ctrs	*xnc,
300	struct xfs_inode	*dp)
301{
302	struct xfs_scrub	*sc = xnc->sc;
303	unsigned int		lock_mode;
304	int			error = 0;
305
306	/* Prevent anyone from changing this directory while we walk it. */
307	xfs_ilock(dp, XFS_IOLOCK_SHARED);
308	lock_mode = xfs_ilock_data_map_shared(dp);
309
310	/*
311	 * The dotdot entry of an unlinked directory still points to the last
312	 * parent, but the parent no longer links to this directory.  Skip the
313	 * directory to avoid overcounting.
314	 */
315	if (VFS_I(dp)->i_nlink == 0)
316		goto out_unlock;
317
318	/*
319	 * We cannot count file links if the directory looks as though it has
320	 * been zapped by the inode record repair code.
321	 */
322	if (xchk_dir_looks_zapped(dp)) {
323		error = -EBUSY;
324		goto out_abort;
325	}
326
327	error = xchk_dir_walk(sc, dp, xchk_nlinks_collect_dirent, xnc);
328	if (error == -ECANCELED) {
329		error = 0;
330		goto out_unlock;
331	}
332	if (error)
333		goto out_abort;
334
335	xchk_iscan_mark_visited(&xnc->collect_iscan, dp);
336	goto out_unlock;
337
338out_abort:
339	xchk_set_incomplete(sc);
340	xchk_iscan_abort(&xnc->collect_iscan);
341out_unlock:
342	xfs_iunlock(dp, lock_mode);
343	xfs_iunlock(dp, XFS_IOLOCK_SHARED);
344	return error;
345}
346
347/* If this looks like a valid pointer, count it. */
348static inline int
349xchk_nlinks_collect_metafile(
350	struct xchk_nlink_ctrs	*xnc,
351	xfs_ino_t		ino)
352{
353	if (!xfs_verify_ino(xnc->sc->mp, ino))
354		return 0;
355
356	trace_xchk_nlinks_collect_metafile(xnc->sc->mp, ino);
357	return xchk_nlinks_update_incore(xnc, ino, 1, 0, 0);
358}
359
360/* Bump the link counts of metadata files rooted in the superblock. */
361STATIC int
362xchk_nlinks_collect_metafiles(
363	struct xchk_nlink_ctrs	*xnc)
364{
365	struct xfs_mount	*mp = xnc->sc->mp;
366	int			error = -ECANCELED;
367
368
369	if (xchk_iscan_aborted(&xnc->collect_iscan))
370		goto out_incomplete;
371
372	mutex_lock(&xnc->lock);
373	error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_rbmino);
374	if (error)
375		goto out_abort;
376
377	error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_rsumino);
378	if (error)
379		goto out_abort;
380
381	error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_uquotino);
382	if (error)
383		goto out_abort;
384
385	error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_gquotino);
386	if (error)
387		goto out_abort;
388
389	error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_pquotino);
390	if (error)
391		goto out_abort;
392	mutex_unlock(&xnc->lock);
393
394	return 0;
395
396out_abort:
397	mutex_unlock(&xnc->lock);
398	xchk_iscan_abort(&xnc->collect_iscan);
399out_incomplete:
400	xchk_set_incomplete(xnc->sc);
401	return error;
402}
403
404/* Advance the collection scan cursor for this non-directory file. */
405static inline int
406xchk_nlinks_collect_file(
407	struct xchk_nlink_ctrs	*xnc,
408	struct xfs_inode	*ip)
409{
410	xfs_ilock(ip, XFS_IOLOCK_SHARED);
411	xchk_iscan_mark_visited(&xnc->collect_iscan, ip);
412	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
413	return 0;
414}
415
416/* Walk all directories and count inode links. */
417STATIC int
418xchk_nlinks_collect(
419	struct xchk_nlink_ctrs	*xnc)
420{
421	struct xfs_scrub	*sc = xnc->sc;
422	struct xfs_inode	*ip;
423	int			error;
424
425	/* Count the rt and quota files that are rooted in the superblock. */
426	error = xchk_nlinks_collect_metafiles(xnc);
427	if (error)
428		return error;
429
430	/*
431	 * Set up for a potentially lengthy filesystem scan by reducing our
432	 * transaction resource usage for the duration.  Specifically:
433	 *
434	 * Cancel the transaction to release the log grant space while we scan
435	 * the filesystem.
436	 *
437	 * Create a new empty transaction to eliminate the possibility of the
438	 * inode scan deadlocking on cyclical metadata.
439	 *
440	 * We pass the empty transaction to the file scanning function to avoid
441	 * repeatedly cycling empty transactions.  This can be done even though
442	 * we take the IOLOCK to quiesce the file because empty transactions
443	 * do not take sb_internal.
444	 */
445	xchk_trans_cancel(sc);
446	error = xchk_trans_alloc_empty(sc);
447	if (error)
448		return error;
449
450	while ((error = xchk_iscan_iter(&xnc->collect_iscan, &ip)) == 1) {
451		if (S_ISDIR(VFS_I(ip)->i_mode))
452			error = xchk_nlinks_collect_dir(xnc, ip);
453		else
454			error = xchk_nlinks_collect_file(xnc, ip);
455		xchk_irele(sc, ip);
456		if (error)
457			break;
458
459		if (xchk_should_terminate(sc, &error))
460			break;
461	}
462	xchk_iscan_iter_finish(&xnc->collect_iscan);
463	if (error) {
464		xchk_set_incomplete(sc);
465		/*
466		 * If we couldn't grab an inode that was busy with a state
467		 * change, change the error code so that we exit to userspace
468		 * as quickly as possible.
469		 */
470		if (error == -EBUSY)
471			return -ECANCELED;
472		return error;
473	}
474
475	/*
476	 * Switch out for a real transaction in preparation for building a new
477	 * tree.
478	 */
479	xchk_trans_cancel(sc);
480	return xchk_setup_fs(sc);
481}
482
483/*
484 * Part 2: Comparing file link counters.  Walk each inode and compare the link
485 * counts against our shadow information; and then walk each shadow link count
486 * structure (that wasn't covered in the first part), comparing it against the
487 * file.
488 */
489
490/* Read the observed link count for comparison with the actual inode. */
491STATIC int
492xchk_nlinks_comparison_read(
493	struct xchk_nlink_ctrs	*xnc,
494	xfs_ino_t		ino,
495	struct xchk_nlink	*obs)
496{
497	struct xchk_nlink	nl;
498	int			error;
499
500	error = xfarray_load_sparse(xnc->nlinks, ino, &nl);
501	if (error)
502		return error;
503
504	nl.flags |= (XCHK_NLINK_COMPARE_SCANNED | XCHK_NLINK_WRITTEN);
505
506	error = xfarray_store(xnc->nlinks, ino, &nl);
507	if (error == -EFBIG) {
508		/*
509		 * EFBIG means we tried to store data at too high a byte offset
510		 * in the sparse array.  IOWs, we cannot complete the check and
511		 * must notify userspace that the check was incomplete.  This
512		 * shouldn't really happen outside of the collection phase.
513		 */
514		xchk_set_incomplete(xnc->sc);
515		return -ECANCELED;
516	}
517	if (error)
518		return error;
519
520	/* Copy the counters, but do not expose the internal state. */
521	obs->parents = nl.parents;
522	obs->backrefs = nl.backrefs;
523	obs->children = nl.children;
524	obs->flags = 0;
525	return 0;
526}
527
528/* Check our link count against an inode. */
529STATIC int
530xchk_nlinks_compare_inode(
531	struct xchk_nlink_ctrs	*xnc,
532	struct xfs_inode	*ip)
533{
534	struct xchk_nlink	obs;
535	struct xfs_scrub	*sc = xnc->sc;
536	uint64_t		total_links;
537	unsigned int		actual_nlink;
538	int			error;
539
540	xfs_ilock(ip, XFS_ILOCK_SHARED);
541	mutex_lock(&xnc->lock);
542
543	if (xchk_iscan_aborted(&xnc->collect_iscan)) {
544		xchk_set_incomplete(xnc->sc);
545		error = -ECANCELED;
546		goto out_scanlock;
547	}
548
549	error = xchk_nlinks_comparison_read(xnc, ip->i_ino, &obs);
550	if (error)
551		goto out_scanlock;
552
553	/*
554	 * If we don't have ftype to get an accurate count of the subdirectory
555	 * entries in this directory, take advantage of the fact that on a
556	 * consistent ftype=0 filesystem, the number of subdirectory
557	 * backreferences (dotdot entries) pointing towards this directory
558	 * should be equal to the number of subdirectory entries in the
559	 * directory.
560	 */
561	if (!xfs_has_ftype(sc->mp) && S_ISDIR(VFS_I(ip)->i_mode))
562		obs.children = obs.backrefs;
563
564	total_links = xchk_nlink_total(ip, &obs);
565	actual_nlink = VFS_I(ip)->i_nlink;
566
567	trace_xchk_nlinks_compare_inode(sc->mp, ip, &obs);
568
569	/*
570	 * If we found so many parents that we'd overflow i_nlink, we must flag
571	 * this as a corruption.  The VFS won't let users increase the link
572	 * count, but it will let them decrease it.
573	 */
574	if (total_links > XFS_MAXLINK) {
575		xchk_ino_set_corrupt(sc, ip->i_ino);
576		goto out_corrupt;
577	}
578
579	/* Link counts should match. */
580	if (total_links != actual_nlink) {
581		xchk_ino_set_corrupt(sc, ip->i_ino);
582		goto out_corrupt;
583	}
584
585	if (S_ISDIR(VFS_I(ip)->i_mode) && actual_nlink > 0) {
586		/*
587		 * The collection phase ignores directories with zero link
588		 * count, so we ignore them here too.
589		 *
590		 * The number of subdirectory backreferences (dotdot entries)
591		 * pointing towards this directory should be equal to the
592		 * number of subdirectory entries in the directory.
593		 */
594		if (obs.children != obs.backrefs)
595			xchk_ino_xref_set_corrupt(sc, ip->i_ino);
596	} else {
597		/*
598		 * Non-directories and unlinked directories should not have
599		 * back references.
600		 */
601		if (obs.backrefs != 0) {
602			xchk_ino_set_corrupt(sc, ip->i_ino);
603			goto out_corrupt;
604		}
605
606		/*
607		 * Non-directories and unlinked directories should not have
608		 * children.
609		 */
610		if (obs.children != 0) {
611			xchk_ino_set_corrupt(sc, ip->i_ino);
612			goto out_corrupt;
613		}
614	}
615
616	if (ip == sc->mp->m_rootip) {
617		/*
618		 * For the root of a directory tree, both the '.' and '..'
619		 * entries should point to the root directory.  The dotdot
620		 * entry is counted as a parent of the root /and/ a backref of
621		 * the root directory.
622		 */
623		if (obs.parents != 1) {
624			xchk_ino_set_corrupt(sc, ip->i_ino);
625			goto out_corrupt;
626		}
627	} else if (actual_nlink > 0) {
628		/*
629		 * Linked files that are not the root directory should have at
630		 * least one parent.
631		 */
632		if (obs.parents == 0) {
633			xchk_ino_set_corrupt(sc, ip->i_ino);
634			goto out_corrupt;
635		}
636	}
637
638out_corrupt:
639	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
640		error = -ECANCELED;
641out_scanlock:
642	mutex_unlock(&xnc->lock);
643	xfs_iunlock(ip, XFS_ILOCK_SHARED);
644	return error;
645}
646
647/*
648 * Check our link count against an inode that wasn't checked previously.  This
649 * is intended to catch directories with dangling links, though we could be
650 * racing with inode allocation in other threads.
651 */
652STATIC int
653xchk_nlinks_compare_inum(
654	struct xchk_nlink_ctrs	*xnc,
655	xfs_ino_t		ino)
656{
657	struct xchk_nlink	obs;
658	struct xfs_mount	*mp = xnc->sc->mp;
659	struct xfs_trans	*tp = xnc->sc->tp;
660	struct xfs_buf		*agi_bp;
661	struct xfs_inode	*ip;
662	int			error;
663
664	/*
665	 * The first iget failed, so try again with the variant that returns
666	 * either an incore inode or the AGI buffer.  If the function returns
667	 * EINVAL/ENOENT, it should have passed us the AGI buffer so that we
668	 * can guarantee that the inode won't be allocated while we check for
669	 * a zero link count in the observed link count data.
670	 */
671	error = xchk_iget_agi(xnc->sc, ino, &agi_bp, &ip);
672	if (!error) {
673		/* Actually got an inode, so use the inode compare. */
674		error = xchk_nlinks_compare_inode(xnc, ip);
675		xchk_irele(xnc->sc, ip);
676		return error;
677	}
678	if (error == -ENOENT || error == -EINVAL) {
679		/* No inode was found.  Check for zero link count below. */
680		error = 0;
681	}
682	if (error)
683		goto out_agi;
684
685	/* Ensure that we have protected against inode allocation/freeing. */
686	if (agi_bp == NULL) {
687		ASSERT(agi_bp != NULL);
688		xchk_set_incomplete(xnc->sc);
689		return -ECANCELED;
690	}
691
692	if (xchk_iscan_aborted(&xnc->collect_iscan)) {
693		xchk_set_incomplete(xnc->sc);
694		error = -ECANCELED;
695		goto out_agi;
696	}
697
698	mutex_lock(&xnc->lock);
699	error = xchk_nlinks_comparison_read(xnc, ino, &obs);
700	if (error)
701		goto out_scanlock;
702
703	trace_xchk_nlinks_check_zero(mp, ino, &obs);
704
705	/*
706	 * If we can't grab the inode, the link count had better be zero.  We
707	 * still hold the AGI to prevent inode allocation/freeing.
708	 */
709	if (xchk_nlink_total(NULL, &obs) != 0) {
710		xchk_ino_set_corrupt(xnc->sc, ino);
711		error = -ECANCELED;
712	}
713
714out_scanlock:
715	mutex_unlock(&xnc->lock);
716out_agi:
717	if (agi_bp)
718		xfs_trans_brelse(tp, agi_bp);
719	return error;
720}
721
722/*
723 * Try to visit every inode in the filesystem to compare the link count.  Move
724 * on if we can't grab an inode, since we'll revisit unchecked nlink records in
725 * the second part.
726 */
727static int
728xchk_nlinks_compare_iter(
729	struct xchk_nlink_ctrs	*xnc,
730	struct xfs_inode	**ipp)
731{
732	int			error;
733
734	do {
735		error = xchk_iscan_iter(&xnc->compare_iscan, ipp);
736	} while (error == -EBUSY);
737
738	return error;
739}
740
741/* Compare the link counts we observed against the live information. */
742STATIC int
743xchk_nlinks_compare(
744	struct xchk_nlink_ctrs	*xnc)
745{
746	struct xchk_nlink	nl;
747	struct xfs_scrub	*sc = xnc->sc;
748	struct xfs_inode	*ip;
749	xfarray_idx_t		cur = XFARRAY_CURSOR_INIT;
750	int			error;
751
752	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
753		return 0;
754
755	/*
756	 * Create a new empty transaction so that we can advance the iscan
757	 * cursor without deadlocking if the inobt has a cycle and push on the
758	 * inactivation workqueue.
759	 */
760	xchk_trans_cancel(sc);
761	error = xchk_trans_alloc_empty(sc);
762	if (error)
763		return error;
764
765	/*
766	 * Use the inobt to walk all allocated inodes to compare the link
767	 * counts.  Inodes skipped by _compare_iter will be tried again in the
768	 * next phase of the scan.
769	 */
770	xchk_iscan_start(sc, 0, 0, &xnc->compare_iscan);
771	while ((error = xchk_nlinks_compare_iter(xnc, &ip)) == 1) {
772		error = xchk_nlinks_compare_inode(xnc, ip);
773		xchk_iscan_mark_visited(&xnc->compare_iscan, ip);
774		xchk_irele(sc, ip);
775		if (error)
776			break;
777
778		if (xchk_should_terminate(sc, &error))
779			break;
780	}
781	xchk_iscan_iter_finish(&xnc->compare_iscan);
782	xchk_iscan_teardown(&xnc->compare_iscan);
783	if (error)
784		return error;
785
786	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
787		return 0;
788
789	/*
790	 * Walk all the non-null nlink observations that weren't checked in the
791	 * previous step.
792	 */
793	mutex_lock(&xnc->lock);
794	while ((error = xfarray_iter(xnc->nlinks, &cur, &nl)) == 1) {
795		xfs_ino_t	ino = cur - 1;
796
797		if (nl.flags & XCHK_NLINK_COMPARE_SCANNED)
798			continue;
799
800		mutex_unlock(&xnc->lock);
801
802		error = xchk_nlinks_compare_inum(xnc, ino);
803		if (error)
804			return error;
805
806		if (xchk_should_terminate(xnc->sc, &error))
807			return error;
808
809		mutex_lock(&xnc->lock);
810	}
811	mutex_unlock(&xnc->lock);
812
813	return error;
814}
815
816/* Tear down everything associated with a nlinks check. */
817static void
818xchk_nlinks_teardown_scan(
819	void			*priv)
820{
821	struct xchk_nlink_ctrs	*xnc = priv;
822
823	/* Discourage any hook functions that might be running. */
824	xchk_iscan_abort(&xnc->collect_iscan);
825
826	xfs_dir_hook_del(xnc->sc->mp, &xnc->dhook);
827
828	xfarray_destroy(xnc->nlinks);
829	xnc->nlinks = NULL;
830
831	xchk_iscan_teardown(&xnc->collect_iscan);
832	mutex_destroy(&xnc->lock);
833	xnc->sc = NULL;
834}
835
836/*
837 * Scan all inodes in the entire filesystem to generate link count data.  If
838 * the scan is successful, the counts will be left alive for a repair.  If any
839 * error occurs, we'll tear everything down.
840 */
841STATIC int
842xchk_nlinks_setup_scan(
843	struct xfs_scrub	*sc,
844	struct xchk_nlink_ctrs	*xnc)
845{
846	struct xfs_mount	*mp = sc->mp;
847	char			*descr;
848	unsigned long long	max_inos;
849	xfs_agnumber_t		last_agno = mp->m_sb.sb_agcount - 1;
850	xfs_agino_t		first_agino, last_agino;
851	int			error;
852
853	ASSERT(xnc->sc == NULL);
854	xnc->sc = sc;
855
856	mutex_init(&xnc->lock);
857
858	/* Retry iget every tenth of a second for up to 30 seconds. */
859	xchk_iscan_start(sc, 30000, 100, &xnc->collect_iscan);
860
861	/*
862	 * Set up enough space to store an nlink record for the highest
863	 * possible inode number in this system.
864	 */
865	xfs_agino_range(mp, last_agno, &first_agino, &last_agino);
866	max_inos = XFS_AGINO_TO_INO(mp, last_agno, last_agino) + 1;
867	descr = xchk_xfile_descr(sc, "file link counts");
868	error = xfarray_create(descr, min(XFS_MAXINUMBER + 1, max_inos),
869			sizeof(struct xchk_nlink), &xnc->nlinks);
870	kfree(descr);
871	if (error)
872		goto out_teardown;
873
874	/*
875	 * Hook into the directory entry code so that we can capture updates to
876	 * file link counts.  The hook only triggers for inodes that were
877	 * already scanned, and the scanner thread takes each inode's ILOCK,
878	 * which means that any in-progress inode updates will finish before we
879	 * can scan the inode.
880	 */
881	ASSERT(sc->flags & XCHK_FSGATES_DIRENTS);
882	xfs_dir_hook_setup(&xnc->dhook, xchk_nlinks_live_update);
883	error = xfs_dir_hook_add(mp, &xnc->dhook);
884	if (error)
885		goto out_teardown;
886
887	/* Use deferred cleanup to pass the inode link count data to repair. */
888	sc->buf_cleanup = xchk_nlinks_teardown_scan;
889	return 0;
890
891out_teardown:
892	xchk_nlinks_teardown_scan(xnc);
893	return error;
894}
895
896/* Scrub the link count of all inodes on the filesystem. */
897int
898xchk_nlinks(
899	struct xfs_scrub	*sc)
900{
901	struct xchk_nlink_ctrs	*xnc = sc->buf;
902	int			error = 0;
903
904	/* Set ourselves up to check link counts on the live filesystem. */
905	error = xchk_nlinks_setup_scan(sc, xnc);
906	if (error)
907		return error;
908
909	/* Walk all inodes, picking up link count information. */
910	error = xchk_nlinks_collect(xnc);
911	if (!xchk_xref_process_error(sc, 0, 0, &error))
912		return error;
913
914	/* Fail fast if we're not playing with a full dataset. */
915	if (xchk_iscan_aborted(&xnc->collect_iscan))
916		xchk_set_incomplete(sc);
917	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE)
918		return 0;
919
920	/* Compare link counts. */
921	error = xchk_nlinks_compare(xnc);
922	if (!xchk_xref_process_error(sc, 0, 0, &error))
923		return error;
924
925	/* Check one last time for an incomplete dataset. */
926	if (xchk_iscan_aborted(&xnc->collect_iscan))
927		xchk_set_incomplete(sc);
928
929	return 0;
930}
931