1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * Copyright (C) 2022-2023 Oracle.  All Rights Reserved.
4 * Author: Darrick J. Wong <djwong@kernel.org>
5 */
6#include "xfs.h"
7#include "xfs_fs.h"
8#include "xfs_shared.h"
9#include "xfs_format.h"
10#include "xfs_trans_resv.h"
11#include "xfs_mount.h"
12#include "xfs_btree.h"
13#include "xfs_log_format.h"
14#include "xfs_trans.h"
15#include "xfs_sb.h"
16#include "xfs_inode.h"
17#include "xfs_alloc.h"
18#include "xfs_alloc_btree.h"
19#include "xfs_ialloc.h"
20#include "xfs_ialloc_btree.h"
21#include "xfs_rmap.h"
22#include "xfs_rmap_btree.h"
23#include "xfs_refcount.h"
24#include "xfs_refcount_btree.h"
25#include "xfs_extent_busy.h"
26#include "xfs_ag.h"
27#include "xfs_ag_resv.h"
28#include "xfs_quota.h"
29#include "xfs_qm.h"
30#include "xfs_bmap.h"
31#include "xfs_da_format.h"
32#include "xfs_da_btree.h"
33#include "xfs_attr.h"
34#include "xfs_attr_remote.h"
35#include "xfs_defer.h"
36#include "scrub/scrub.h"
37#include "scrub/common.h"
38#include "scrub/trace.h"
39#include "scrub/repair.h"
40#include "scrub/bitmap.h"
41#include "scrub/agb_bitmap.h"
42#include "scrub/fsb_bitmap.h"
43#include "scrub/reap.h"
44
45/*
46 * Disposal of Blocks from Old Metadata
47 *
48 * Now that we've constructed a new btree to replace the damaged one, we want
49 * to dispose of the blocks that (we think) the old btree was using.
50 * Previously, we used the rmapbt to collect the extents (bitmap) with the
51 * rmap owner corresponding to the tree we rebuilt, collected extents for any
52 * blocks with the same rmap owner that are owned by another data structure
53 * (sublist), and subtracted sublist from bitmap.  In theory the extents
54 * remaining in bitmap are the old btree's blocks.
55 *
56 * Unfortunately, it's possible that the btree was crosslinked with other
57 * blocks on disk.  The rmap data can tell us if there are multiple owners, so
58 * if the rmapbt says there is an owner of this block other than @oinfo, then
59 * the block is crosslinked.  Remove the reverse mapping and continue.
60 *
61 * If there is one rmap record, we can free the block, which removes the
62 * reverse mapping but doesn't add the block to the free space.  Our repair
63 * strategy is to hope the other metadata objects crosslinked on this block
64 * will be rebuilt (atop different blocks), thereby removing all the cross
65 * links.
66 *
67 * If there are no rmap records at all, we also free the block.  If the btree
68 * being rebuilt lives in the free space (bnobt/cntbt/rmapbt) then there isn't
69 * supposed to be a rmap record and everything is ok.  For other btrees there
70 * had to have been an rmap entry for the block to have ended up on @bitmap,
71 * so if it's gone now there's something wrong and the fs will shut down.
72 *
73 * Note: If there are multiple rmap records with only the same rmap owner as
74 * the btree we're trying to rebuild and the block is indeed owned by another
75 * data structure with the same rmap owner, then the block will be in sublist
76 * and therefore doesn't need disposal.  If there are multiple rmap records
77 * with only the same rmap owner but the block is not owned by something with
78 * the same rmap owner, the block will be freed.
79 *
80 * The caller is responsible for locking the AG headers/inode for the entire
81 * rebuild operation so that nothing else can sneak in and change the incore
82 * state while we're not looking.  We must also invalidate any buffers
83 * associated with @bitmap.
84 */
85
/* Information about reaping extents after a repair. */
struct xreap_state {
	/* Scrub context for this reap operation. */
	struct xfs_scrub		*sc;

	/* Reverse mapping owner and metadata reservation type. */
	const struct xfs_owner_info	*oinfo;
	enum xfs_ag_resv_type		resv;

	/* If true, roll the transaction before reaping the next extent. */
	bool				force_roll;

	/* Number of deferred reaps attached to the current transaction. */
	unsigned int			deferred;

	/* Number of invalidated buffers logged to the current transaction. */
	unsigned int			invalidated;

	/*
	 * Number of deferred reaps queued during the whole reap sequence.
	 * Unlike @deferred, this survives a transaction roll (see
	 * xreap_reset) and is only cleared when the deferred chain is
	 * finished (see xreap_defer_finish_reset).
	 */
	unsigned long long		total_deferred;
};
106
107/* Put a block back on the AGFL. */
108STATIC int
109xreap_put_freelist(
110	struct xfs_scrub	*sc,
111	xfs_agblock_t		agbno)
112{
113	struct xfs_buf		*agfl_bp;
114	int			error;
115
116	/* Make sure there's space on the freelist. */
117	error = xrep_fix_freelist(sc, 0);
118	if (error)
119		return error;
120
121	/*
122	 * Since we're "freeing" a lost block onto the AGFL, we have to
123	 * create an rmap for the block prior to merging it or else other
124	 * parts will break.
125	 */
126	error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno, 1,
127			&XFS_RMAP_OINFO_AG);
128	if (error)
129		return error;
130
131	/* Put the block on the AGFL. */
132	error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp);
133	if (error)
134		return error;
135
136	error = xfs_alloc_put_freelist(sc->sa.pag, sc->tp, sc->sa.agf_bp,
137			agfl_bp, agbno, 0);
138	if (error)
139		return error;
140	xfs_extent_busy_insert(sc->tp, sc->sa.pag, agbno, 1,
141			XFS_EXTENT_BUSY_SKIP_DISCARD);
142
143	return 0;
144}
145
146/* Are there any uncommitted reap operations? */
147static inline bool xreap_dirty(const struct xreap_state *rs)
148{
149	if (rs->force_roll)
150		return true;
151	if (rs->deferred)
152		return true;
153	if (rs->invalidated)
154		return true;
155	if (rs->total_deferred)
156		return true;
157	return false;
158}
159
160#define XREAP_MAX_BINVAL	(2048)
161
162/*
163 * Decide if we want to roll the transaction after reaping an extent.  We don't
164 * want to overrun the transaction reservation, so we prohibit more than
165 * 128 EFIs per transaction.  For the same reason, we limit the number
166 * of buffer invalidations to 2048.
167 */
168static inline bool xreap_want_roll(const struct xreap_state *rs)
169{
170	if (rs->force_roll)
171		return true;
172	if (rs->deferred > XREP_MAX_ITRUNCATE_EFIS)
173		return true;
174	if (rs->invalidated > XREAP_MAX_BINVAL)
175		return true;
176	return false;
177}
178
179static inline void xreap_reset(struct xreap_state *rs)
180{
181	rs->total_deferred += rs->deferred;
182	rs->deferred = 0;
183	rs->invalidated = 0;
184	rs->force_roll = false;
185}
186
187#define XREAP_MAX_DEFER_CHAIN		(2048)
188
189/*
190 * Decide if we want to finish the deferred ops that are attached to the scrub
191 * transaction.  We don't want to queue huge chains of deferred ops because
192 * that can consume a lot of log space and kernel memory.  Hence we trigger a
193 * xfs_defer_finish if there are more than 2048 deferred reap operations or the
194 * caller did some real work.
195 */
196static inline bool
197xreap_want_defer_finish(const struct xreap_state *rs)
198{
199	if (rs->force_roll)
200		return true;
201	if (rs->total_deferred > XREAP_MAX_DEFER_CHAIN)
202		return true;
203	return false;
204}
205
206static inline void xreap_defer_finish_reset(struct xreap_state *rs)
207{
208	rs->total_deferred = 0;
209	rs->deferred = 0;
210	rs->invalidated = 0;
211	rs->force_roll = false;
212}
213
/*
 * Try to invalidate the incore buffers for an extent that we're freeing.
 * If the whole extent cannot be invalidated in one transaction, *aglenp is
 * trimmed so that the caller only disposes of the blocks covered so far.
 */
STATIC void
xreap_agextent_binval(
	struct xreap_state	*rs,
	xfs_agblock_t		agbno,
	xfs_extlen_t		*aglenp)
{
	struct xfs_scrub	*sc = rs->sc;
	struct xfs_perag	*pag = sc->sa.pag;
	struct xfs_mount	*mp = sc->mp;
	xfs_agnumber_t		agno = sc->sa.pag->pag_agno;
	xfs_agblock_t		agbno_next = agbno + *aglenp;	/* one past the end */
	xfs_agblock_t		bno = agbno;

	/*
	 * Avoid invalidating AG headers and post-EOFS blocks because we never
	 * own those.
	 */
	if (!xfs_verify_agbno(pag, agbno) ||
	    !xfs_verify_agbno(pag, agbno_next - 1))
		return;

	/*
	 * If there are incore buffers for these blocks, invalidate them.  We
	 * assume that the lack of any other known owners means that the buffer
	 * can be locked without risk of deadlocking.  The buffer cache cannot
	 * detect aliasing, so employ nested loops to scan for incore buffers
	 * of any plausible size.
	 */
	while (bno < agbno_next) {
		xfs_agblock_t	fsbcount;
		xfs_agblock_t	max_fsbs;

		/*
		 * Max buffer size is the max remote xattr buffer size, which
		 * is one fs block larger than 64k.
		 */
		max_fsbs = min_t(xfs_agblock_t, agbno_next - bno,
				xfs_attr3_rmt_blocks(mp, XFS_XATTR_SIZE_MAX));

		/* Probe every plausible buffer length starting at @bno. */
		for (fsbcount = 1; fsbcount <= max_fsbs; fsbcount++) {
			struct xfs_buf	*bp = NULL;
			xfs_daddr_t	daddr;
			int		error;

			daddr = XFS_AGB_TO_DADDR(mp, agno, bno);
			error = xfs_buf_incore(mp->m_ddev_targp, daddr,
					XFS_FSB_TO_BB(mp, fsbcount),
					XBF_LIVESCAN, &bp);
			if (error)
				continue;	/* no incore buffer of this size */

			/* Attach the buffer to the transaction and stale it. */
			xfs_trans_bjoin(sc->tp, bp);
			xfs_trans_binval(sc->tp, bp);
			rs->invalidated++;

			/*
			 * Stop invalidating if we've hit the limit; we should
			 * still have enough reservation left to free however
			 * far we've gotten.
			 */
			if (rs->invalidated > XREAP_MAX_BINVAL) {
				*aglenp -= agbno_next - bno;
				goto out;
			}
		}

		bno++;
	}

out:
	trace_xreap_agextent_binval(sc->sa.pag, agbno, *aglenp);
}
287
/*
 * Figure out the longest run of blocks that we can dispose of with a single
 * call.  Cross-linked blocks should have their reverse mappings removed, but
 * single-owner extents can be freed.  AGFL blocks can only be put back one at
 * a time.
 *
 * Returns the run length in @aglenp and whether the run is crosslinked in
 * @crosslinked; the run never extends past @agbno_next.
 */
STATIC int
xreap_agextent_select(
	struct xreap_state	*rs,
	xfs_agblock_t		agbno,
	xfs_agblock_t		agbno_next,
	bool			*crosslinked,
	xfs_extlen_t		*aglenp)
{
	struct xfs_scrub	*sc = rs->sc;
	struct xfs_btree_cur	*cur;
	xfs_agblock_t		bno = agbno + 1;
	xfs_extlen_t		len = 1;
	int			error;

	/*
	 * Determine if there are any other rmap records covering the first
	 * block of this extent.  If so, the block is crosslinked.
	 */
	cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
			sc->sa.pag);
	error = xfs_rmap_has_other_keys(cur, agbno, 1, rs->oinfo,
			crosslinked);
	if (error)
		goto out_cur;

	/* AGFL blocks can only be dealt with one at a time. */
	if (rs->resv == XFS_AG_RESV_AGFL)
		goto out_found;

	/*
	 * Figure out how many of the subsequent blocks have the same crosslink
	 * status.
	 */
	while (bno < agbno_next) {
		bool		also_crosslinked;

		error = xfs_rmap_has_other_keys(cur, bno, 1, rs->oinfo,
				&also_crosslinked);
		if (error)
			goto out_cur;

		/* End the run at the first change in crosslink status. */
		if (*crosslinked != also_crosslinked)
			break;

		len++;
		bno++;
	}

out_found:
	*aglenp = len;
	trace_xreap_agextent_select(sc->sa.pag, agbno, len, *crosslinked);
out_cur:
	xfs_btree_del_cursor(cur, error);
	return error;
}
349
/*
 * Dispose of as much of the beginning of this AG extent as possible.  The
 * number of blocks disposed of will be returned in @aglenp.
 */
STATIC int
xreap_agextent_iter(
	struct xreap_state	*rs,
	xfs_agblock_t		agbno,
	xfs_extlen_t		*aglenp,
	bool			crosslinked)
{
	struct xfs_scrub	*sc = rs->sc;
	xfs_fsblock_t		fsbno;
	int			error = 0;

	fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, agbno);

	/*
	 * If there are other rmappings, this block is cross linked and must
	 * not be freed.  Remove the reverse mapping and move on.  Otherwise,
	 * we were the only owner of the block, so free the extent, which will
	 * also remove the rmap.
	 *
	 * XXX: XFS doesn't support detecting the case where a single block
	 * metadata structure is crosslinked with a multi-block structure
	 * because the buffer cache doesn't detect aliasing problems, so we
	 * can't fix 100% of crosslinking problems (yet).  The verifiers will
	 * blow on writeout, the filesystem will shut down, and the admin gets
	 * to run xfs_repair.
	 */
	if (crosslinked) {
		trace_xreap_dispose_unmap_extent(sc->sa.pag, agbno, *aglenp);

		/* Ask the caller to roll before reaping the next extent. */
		rs->force_roll = true;

		if (rs->oinfo == &XFS_RMAP_OINFO_COW) {
			/*
			 * If we're unmapping CoW staging extents, remove the
			 * records from the refcountbt, which will remove the
			 * rmap record as well.
			 */
			xfs_refcount_free_cow_extent(sc->tp, fsbno, *aglenp);
			return 0;
		}

		/* Drop only our rmap; the other owners keep theirs. */
		return xfs_rmap_free(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno,
				*aglenp, rs->oinfo);
	}

	trace_xreap_dispose_free_extent(sc->sa.pag, agbno, *aglenp);

	/*
	 * Invalidate as many buffers as we can, starting at agbno.  If this
	 * function sets *aglenp to zero, the transaction is full of logged
	 * buffer invalidations, so we need to return early so that we can
	 * roll and retry.
	 */
	xreap_agextent_binval(rs, agbno, aglenp);
	if (*aglenp == 0) {
		ASSERT(xreap_want_roll(rs));
		return 0;
	}

	/*
	 * If we're getting rid of CoW staging extents, use deferred work items
	 * to remove the refcountbt records (which removes the rmap records)
	 * and free the extent.  We're not worried about the system going down
	 * here because log recovery walks the refcount btree to clean out the
	 * CoW staging extents.
	 */
	if (rs->oinfo == &XFS_RMAP_OINFO_COW) {
		ASSERT(rs->resv == XFS_AG_RESV_NONE);

		xfs_refcount_free_cow_extent(sc->tp, fsbno, *aglenp);
		error = xfs_free_extent_later(sc->tp, fsbno, *aglenp, NULL,
				rs->resv, true);
		if (error)
			return error;

		rs->force_roll = true;
		return 0;
	}

	/* Put blocks back on the AGFL one at a time. */
	if (rs->resv == XFS_AG_RESV_AGFL) {
		/* xreap_agextent_select limits AGFL runs to a single block. */
		ASSERT(*aglenp == 1);
		error = xreap_put_freelist(sc, agbno);
		if (error)
			return error;

		rs->force_roll = true;
		return 0;
	}

	/*
	 * Use deferred frees to get rid of the old btree blocks to try to
	 * minimize the window in which we could crash and lose the old blocks.
	 * Add a defer ops barrier every other extent to avoid stressing the
	 * system with large EFIs.
	 */
	error = xfs_free_extent_later(sc->tp, fsbno, *aglenp, rs->oinfo,
			rs->resv, true);
	if (error)
		return error;

	rs->deferred++;
	if (rs->deferred % 2 == 0)
		xfs_defer_add_barrier(sc->tp);
	return 0;
}
460
/*
 * Break an AG metadata extent into sub-extents by fate (crosslinked, not
 * crosslinked), and dispose of each sub-extent separately.
 */
STATIC int
xreap_agmeta_extent(
	uint32_t		agbno,
	uint32_t		len,
	void			*priv)
{
	struct xreap_state	*rs = priv;
	struct xfs_scrub	*sc = rs->sc;
	xfs_agblock_t		agbno_next = agbno + len;
	int			error = 0;

	ASSERT(len <= XFS_MAX_BMBT_EXTLEN);
	ASSERT(sc->ip == NULL);	/* AG metadata reaps run without an inode */

	while (agbno < agbno_next) {
		xfs_extlen_t	aglen;
		bool		crosslinked;

		/* Find the next run of blocks with uniform crosslink status. */
		error = xreap_agextent_select(rs, agbno, agbno_next,
				&crosslinked, &aglen);
		if (error)
			return error;

		/* Dispose of the run; @aglen may be trimmed by this call. */
		error = xreap_agextent_iter(rs, agbno, &aglen, crosslinked);
		if (error)
			return error;

		/* Free up log/transaction space before the next iteration. */
		if (xreap_want_defer_finish(rs)) {
			error = xrep_defer_finish(sc);
			if (error)
				return error;
			xreap_defer_finish_reset(rs);
		} else if (xreap_want_roll(rs)) {
			error = xrep_roll_ag_trans(sc);
			if (error)
				return error;
			xreap_reset(rs);
		}

		agbno += aglen;
	}

	return 0;
}
509
510/* Dispose of every block of every AG metadata extent in the bitmap. */
511int
512xrep_reap_agblocks(
513	struct xfs_scrub		*sc,
514	struct xagb_bitmap		*bitmap,
515	const struct xfs_owner_info	*oinfo,
516	enum xfs_ag_resv_type		type)
517{
518	struct xreap_state		rs = {
519		.sc			= sc,
520		.oinfo			= oinfo,
521		.resv			= type,
522	};
523	int				error;
524
525	ASSERT(xfs_has_rmapbt(sc->mp));
526	ASSERT(sc->ip == NULL);
527
528	error = xagb_bitmap_walk(bitmap, xreap_agmeta_extent, &rs);
529	if (error)
530		return error;
531
532	if (xreap_dirty(&rs))
533		return xrep_defer_finish(sc);
534
535	return 0;
536}
537
/*
 * Break a file metadata extent into sub-extents by fate (crosslinked, not
 * crosslinked), and dispose of each sub-extent separately.  The extent must
 * not cross an AG boundary.
 */
STATIC int
xreap_fsmeta_extent(
	uint64_t		fsbno,
	uint64_t		len,
	void			*priv)
{
	struct xreap_state	*rs = priv;
	struct xfs_scrub	*sc = rs->sc;
	xfs_agnumber_t		agno = XFS_FSB_TO_AGNO(sc->mp, fsbno);
	xfs_agblock_t		agbno = XFS_FSB_TO_AGBNO(sc->mp, fsbno);
	xfs_agblock_t		agbno_next = agbno + len;
	int			error = 0;

	ASSERT(len <= XFS_MAX_BMBT_EXTLEN);
	ASSERT(sc->ip != NULL);
	ASSERT(!sc->sa.pag);	/* we attach the perag ourselves below */

	/*
	 * We're reaping blocks after repairing file metadata, which means that
	 * we have to init the xchk_ag structure ourselves.
	 */
	sc->sa.pag = xfs_perag_get(sc->mp, agno);
	if (!sc->sa.pag)
		return -EFSCORRUPTED;

	error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &sc->sa.agf_bp);
	if (error)
		goto out_pag;

	while (agbno < agbno_next) {
		xfs_extlen_t	aglen;
		bool		crosslinked;

		/* Find the next run of blocks with uniform crosslink status. */
		error = xreap_agextent_select(rs, agbno, agbno_next,
				&crosslinked, &aglen);
		if (error)
			goto out_agf;

		/* Dispose of the run; @aglen may be trimmed by this call. */
		error = xreap_agextent_iter(rs, agbno, &aglen, crosslinked);
		if (error)
			goto out_agf;

		if (xreap_want_defer_finish(rs)) {
			/*
			 * Holds the AGF buffer across the deferred chain
			 * processing.
			 */
			error = xrep_defer_finish(sc);
			if (error)
				goto out_agf;
			xreap_defer_finish_reset(rs);
		} else if (xreap_want_roll(rs)) {
			/*
			 * Hold the AGF buffer across the transaction roll so
			 * that we don't have to reattach it to the scrub
			 * context.
			 */
			xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
			error = xfs_trans_roll_inode(&sc->tp, sc->ip);
			xfs_trans_bjoin(sc->tp, sc->sa.agf_bp);
			if (error)
				goto out_agf;
			xreap_reset(rs);
		}

		agbno += aglen;
	}

out_agf:
	/* Release the AGF buffer and perag on both success and error paths. */
	xfs_trans_brelse(sc->tp, sc->sa.agf_bp);
	sc->sa.agf_bp = NULL;
out_pag:
	xfs_perag_put(sc->sa.pag);
	sc->sa.pag = NULL;
	return error;
}
619
620/*
621 * Dispose of every block of every fs metadata extent in the bitmap.
622 * Do not use this to dispose of the mappings in an ondisk inode fork.
623 */
624int
625xrep_reap_fsblocks(
626	struct xfs_scrub		*sc,
627	struct xfsb_bitmap		*bitmap,
628	const struct xfs_owner_info	*oinfo)
629{
630	struct xreap_state		rs = {
631		.sc			= sc,
632		.oinfo			= oinfo,
633		.resv			= XFS_AG_RESV_NONE,
634	};
635	int				error;
636
637	ASSERT(xfs_has_rmapbt(sc->mp));
638	ASSERT(sc->ip != NULL);
639
640	error = xfsb_bitmap_walk(bitmap, xreap_fsmeta_extent, &rs);
641	if (error)
642		return error;
643
644	if (xreap_dirty(&rs))
645		return xrep_defer_finish(sc);
646
647	return 0;
648}
649