1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * Copyright (C) 2022-2023 Oracle.  All Rights Reserved.
4 * Author: Darrick J. Wong <djwong@kernel.org>
5 */
6#include "xfs.h"
7#include "xfs_fs.h"
8#include "xfs_shared.h"
9#include "xfs_format.h"
10#include "xfs_trans_resv.h"
11#include "xfs_mount.h"
12#include "xfs_defer.h"
13#include "xfs_btree.h"
14#include "xfs_log_format.h"
15#include "xfs_trans.h"
16#include "xfs_inode.h"
17#include "xfs_inode_fork.h"
18#include "xfs_alloc.h"
19#include "xfs_bmap.h"
20#include "xfs_rmap.h"
21#include "xfs_refcount.h"
22#include "xfs_quota.h"
23#include "xfs_ialloc.h"
24#include "xfs_ag.h"
25#include "xfs_error.h"
26#include "xfs_errortag.h"
27#include "xfs_icache.h"
28#include "xfs_refcount_btree.h"
29#include "scrub/xfs_scrub.h"
30#include "scrub/scrub.h"
31#include "scrub/common.h"
32#include "scrub/trace.h"
33#include "scrub/repair.h"
34#include "scrub/bitmap.h"
35#include "scrub/off_bitmap.h"
36#include "scrub/fsb_bitmap.h"
37#include "scrub/reap.h"
38
39/*
40 * CoW Fork Mapping Repair
41 * =======================
42 *
43 * Although CoW staging extents are owned by incore CoW inode forks, on disk
44 * they are owned by the refcount btree.  The ondisk metadata does not record
45 * any ownership information, which limits what we can do to repair the
46 * mappings in the CoW fork.  At most, we can replace ifork mappings that lack
47 * an entry in the refcount btree or are described by a reverse mapping record
48 * whose owner is not OWN_COW.
49 *
50 * Replacing extents is also tricky -- we can't touch written CoW fork extents
51 * since they are undergoing writeback, and delalloc extents do not require
52 * repair since they only exist incore.  Hence the most we can do is find the
53 * bad parts of unwritten mappings, allocate a replacement set of blocks, and
54 * replace the incore mapping.  We use the regular reaping process to unmap
55 * or free the discarded blocks, as appropriate.
56 */
57struct xrep_cow {
58	struct xfs_scrub	*sc;
59
60	/* Bitmap of file offset ranges that need replacing. */
61	struct xoff_bitmap	bad_fileoffs;
62
63	/* Bitmap of fsblocks that were removed from the CoW fork. */
64	struct xfsb_bitmap	old_cowfork_fsblocks;
65
66	/* CoW fork mappings used to scan for bad CoW staging extents. */
67	struct xfs_bmbt_irec	irec;
68
69	/* refcount btree block number of irec.br_startblock */
70	unsigned int		irec_startbno;
71
72	/* refcount btree block number of the next refcount record we expect */
73	unsigned int		next_bno;
74};
75
76/* CoW staging extent. */
77struct xrep_cow_extent {
78	xfs_fsblock_t		fsbno;
79	xfs_extlen_t		len;
80};
81
82/*
83 * Mark the part of the file range that corresponds to the given physical
84 * space.  Caller must ensure that the physical range is within xc->irec.
85 */
86STATIC int
87xrep_cow_mark_file_range(
88	struct xrep_cow		*xc,
89	xfs_fsblock_t		startblock,
90	xfs_filblks_t		blockcount)
91{
92	xfs_fileoff_t		startoff;
93
94	startoff = xc->irec.br_startoff +
95				(startblock - xc->irec.br_startblock);
96
97	trace_xrep_cow_mark_file_range(xc->sc->ip, startblock, startoff,
98			blockcount);
99
100	return xoff_bitmap_set(&xc->bad_fileoffs, startoff, blockcount);
101}
102
103/*
104 * Trim @src to fit within the CoW fork mapping being examined, and put the
105 * result in @dst.
106 */
107static inline void
108xrep_cow_trim_refcount(
109	struct xrep_cow			*xc,
110	struct xfs_refcount_irec	*dst,
111	const struct xfs_refcount_irec	*src)
112{
113	unsigned int			adj;
114
115	memcpy(dst, src, sizeof(*dst));
116
117	if (dst->rc_startblock < xc->irec_startbno) {
118		adj = xc->irec_startbno - dst->rc_startblock;
119		dst->rc_blockcount -= adj;
120		dst->rc_startblock += adj;
121	}
122
123	if (dst->rc_startblock + dst->rc_blockcount >
124	    xc->irec_startbno + xc->irec.br_blockcount) {
125		adj = (dst->rc_startblock + dst->rc_blockcount) -
126		      (xc->irec_startbno + xc->irec.br_blockcount);
127		dst->rc_blockcount -= adj;
128	}
129}
130
131/* Mark any shared CoW staging extents. */
132STATIC int
133xrep_cow_mark_shared_staging(
134	struct xfs_btree_cur		*cur,
135	const struct xfs_refcount_irec	*rec,
136	void				*priv)
137{
138	struct xrep_cow			*xc = priv;
139	struct xfs_refcount_irec	rrec;
140	xfs_fsblock_t			fsbno;
141
142	if (!xfs_refcount_check_domain(rec) ||
143	    rec->rc_domain != XFS_REFC_DOMAIN_SHARED)
144		return -EFSCORRUPTED;
145
146	xrep_cow_trim_refcount(xc, &rrec, rec);
147
148	fsbno = XFS_AGB_TO_FSB(xc->sc->mp, cur->bc_ag.pag->pag_agno,
149			rrec.rc_startblock);
150	return xrep_cow_mark_file_range(xc, fsbno, rrec.rc_blockcount);
151}
152
153/*
154 * Mark any portion of the CoW fork file offset range where there is not a CoW
155 * staging extent record in the refcountbt, and keep a record of where we did
156 * find correct refcountbt records.  Staging records are always cleaned out at
157 * mount time, so any two inodes trying to map the same staging area would have
158 * already taken the fs down due to refcount btree verifier errors.  Hence this
159 * inode should be the sole creator of the staging extent records ondisk.
160 */
161STATIC int
162xrep_cow_mark_missing_staging(
163	struct xfs_btree_cur		*cur,
164	const struct xfs_refcount_irec	*rec,
165	void				*priv)
166{
167	struct xrep_cow			*xc = priv;
168	struct xfs_refcount_irec	rrec;
169	int				error;
170
171	if (!xfs_refcount_check_domain(rec) ||
172	    rec->rc_domain != XFS_REFC_DOMAIN_COW)
173		return -EFSCORRUPTED;
174
175	xrep_cow_trim_refcount(xc, &rrec, rec);
176
177	if (xc->next_bno >= rrec.rc_startblock)
178		goto next;
179
180	error = xrep_cow_mark_file_range(xc,
181			XFS_AGB_TO_FSB(xc->sc->mp, cur->bc_ag.pag->pag_agno,
182				       xc->next_bno),
183			rrec.rc_startblock - xc->next_bno);
184	if (error)
185		return error;
186
187next:
188	xc->next_bno = rrec.rc_startblock + rrec.rc_blockcount;
189	return 0;
190}
191
192/*
193 * Mark any area that does not correspond to a CoW staging rmap.  These are
194 * cross-linked areas that must be avoided.
195 */
196STATIC int
197xrep_cow_mark_missing_staging_rmap(
198	struct xfs_btree_cur		*cur,
199	const struct xfs_rmap_irec	*rec,
200	void				*priv)
201{
202	struct xrep_cow			*xc = priv;
203	xfs_fsblock_t			fsbno;
204	xfs_agblock_t			rec_bno;
205	xfs_extlen_t			rec_len;
206	unsigned int			adj;
207
208	if (rec->rm_owner == XFS_RMAP_OWN_COW)
209		return 0;
210
211	rec_bno = rec->rm_startblock;
212	rec_len = rec->rm_blockcount;
213	if (rec_bno < xc->irec_startbno) {
214		adj = xc->irec_startbno - rec_bno;
215		rec_len -= adj;
216		rec_bno += adj;
217	}
218
219	if (rec_bno + rec_len > xc->irec_startbno + xc->irec.br_blockcount) {
220		adj = (rec_bno + rec_len) -
221		      (xc->irec_startbno + xc->irec.br_blockcount);
222		rec_len -= adj;
223	}
224
225	fsbno = XFS_AGB_TO_FSB(xc->sc->mp, cur->bc_ag.pag->pag_agno, rec_bno);
226	return xrep_cow_mark_file_range(xc, fsbno, rec_len);
227}
228
229/*
230 * Find any part of the CoW fork mapping that isn't a single-owner CoW staging
231 * extent and mark the corresponding part of the file range in the bitmap.
232 */
233STATIC int
234xrep_cow_find_bad(
235	struct xrep_cow			*xc)
236{
237	struct xfs_refcount_irec	rc_low = { 0 };
238	struct xfs_refcount_irec	rc_high = { 0 };
239	struct xfs_rmap_irec		rm_low = { 0 };
240	struct xfs_rmap_irec		rm_high = { 0 };
241	struct xfs_perag		*pag;
242	struct xfs_scrub		*sc = xc->sc;
243	xfs_agnumber_t			agno;
244	int				error;
245
246	agno = XFS_FSB_TO_AGNO(sc->mp, xc->irec.br_startblock);
247	xc->irec_startbno = XFS_FSB_TO_AGBNO(sc->mp, xc->irec.br_startblock);
248
249	pag = xfs_perag_get(sc->mp, agno);
250	if (!pag)
251		return -EFSCORRUPTED;
252
253	error = xrep_ag_init(sc, pag, &sc->sa);
254	if (error)
255		goto out_pag;
256
257	/* Mark any CoW fork extents that are shared. */
258	rc_low.rc_startblock = xc->irec_startbno;
259	rc_high.rc_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1;
260	rc_low.rc_domain = rc_high.rc_domain = XFS_REFC_DOMAIN_SHARED;
261	error = xfs_refcount_query_range(sc->sa.refc_cur, &rc_low, &rc_high,
262			xrep_cow_mark_shared_staging, xc);
263	if (error)
264		goto out_sa;
265
266	/* Make sure there are CoW staging extents for the whole mapping. */
267	rc_low.rc_startblock = xc->irec_startbno;
268	rc_high.rc_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1;
269	rc_low.rc_domain = rc_high.rc_domain = XFS_REFC_DOMAIN_COW;
270	xc->next_bno = xc->irec_startbno;
271	error = xfs_refcount_query_range(sc->sa.refc_cur, &rc_low, &rc_high,
272			xrep_cow_mark_missing_staging, xc);
273	if (error)
274		goto out_sa;
275
276	if (xc->next_bno < xc->irec_startbno + xc->irec.br_blockcount) {
277		error = xrep_cow_mark_file_range(xc,
278				XFS_AGB_TO_FSB(sc->mp, pag->pag_agno,
279					       xc->next_bno),
280				xc->irec_startbno + xc->irec.br_blockcount -
281				xc->next_bno);
282		if (error)
283			goto out_sa;
284	}
285
286	/* Mark any area has an rmap that isn't a COW staging extent. */
287	rm_low.rm_startblock = xc->irec_startbno;
288	memset(&rm_high, 0xFF, sizeof(rm_high));
289	rm_high.rm_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1;
290	error = xfs_rmap_query_range(sc->sa.rmap_cur, &rm_low, &rm_high,
291			xrep_cow_mark_missing_staging_rmap, xc);
292	if (error)
293		goto out_sa;
294
295	/*
296	 * If userspace is forcing us to rebuild the CoW fork or someone turned
297	 * on the debugging knob, replace everything in the CoW fork.
298	 */
299	if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) ||
300	    XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) {
301		error = xrep_cow_mark_file_range(xc, xc->irec.br_startblock,
302				xc->irec.br_blockcount);
303		if (error)
304			return error;
305	}
306
307out_sa:
308	xchk_ag_free(sc, &sc->sa);
309out_pag:
310	xfs_perag_put(pag);
311	return 0;
312}
313
314/*
315 * Allocate a replacement CoW staging extent of up to the given number of
316 * blocks, and fill out the mapping.
317 */
318STATIC int
319xrep_cow_alloc(
320	struct xfs_scrub	*sc,
321	xfs_extlen_t		maxlen,
322	struct xrep_cow_extent	*repl)
323{
324	struct xfs_alloc_arg	args = {
325		.tp		= sc->tp,
326		.mp		= sc->mp,
327		.oinfo		= XFS_RMAP_OINFO_SKIP_UPDATE,
328		.minlen		= 1,
329		.maxlen		= maxlen,
330		.prod		= 1,
331		.resv		= XFS_AG_RESV_NONE,
332		.datatype	= XFS_ALLOC_USERDATA,
333	};
334	int			error;
335
336	error = xfs_trans_reserve_more(sc->tp, maxlen, 0);
337	if (error)
338		return error;
339
340	error = xfs_alloc_vextent_start_ag(&args,
341			XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino));
342	if (error)
343		return error;
344	if (args.fsbno == NULLFSBLOCK)
345		return -ENOSPC;
346
347	xfs_refcount_alloc_cow_extent(sc->tp, args.fsbno, args.len);
348
349	repl->fsbno = args.fsbno;
350	repl->len = args.len;
351	return 0;
352}
353
354/*
355 * Look up the current CoW fork mapping so that we only allocate enough to
356 * replace a single mapping.  If we don't find a mapping that covers the start
357 * of the file range, or we find a delalloc or written extent, something is
358 * seriously wrong, since we didn't drop the ILOCK.
359 */
360static inline int
361xrep_cow_find_mapping(
362	struct xrep_cow		*xc,
363	struct xfs_iext_cursor	*icur,
364	xfs_fileoff_t		startoff,
365	struct xfs_bmbt_irec	*got)
366{
367	struct xfs_inode	*ip = xc->sc->ip;
368	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, XFS_COW_FORK);
369
370	if (!xfs_iext_lookup_extent(ip, ifp, startoff, icur, got))
371		goto bad;
372
373	if (got->br_startoff > startoff)
374		goto bad;
375
376	if (got->br_blockcount == 0)
377		goto bad;
378
379	if (isnullstartblock(got->br_startblock))
380		goto bad;
381
382	if (xfs_bmap_is_written_extent(got))
383		goto bad;
384
385	return 0;
386bad:
387	ASSERT(0);
388	return -EFSCORRUPTED;
389}
390
391#define REPLACE_LEFT_SIDE	(1U << 0)
392#define REPLACE_RIGHT_SIDE	(1U << 1)
393
394/*
395 * Given a CoW fork mapping @got and a replacement mapping @repl, remap the
396 * beginning of @got with the space described by @rep.
397 */
398static inline void
399xrep_cow_replace_mapping(
400	struct xfs_inode		*ip,
401	struct xfs_iext_cursor		*icur,
402	const struct xfs_bmbt_irec	*got,
403	const struct xrep_cow_extent	*repl)
404{
405	struct xfs_bmbt_irec		new = *got; /* struct copy */
406
407	ASSERT(repl->len > 0);
408	ASSERT(!isnullstartblock(got->br_startblock));
409
410	trace_xrep_cow_replace_mapping(ip, got, repl->fsbno, repl->len);
411
412	if (got->br_blockcount == repl->len) {
413		/*
414		 * The new extent is a complete replacement for the existing
415		 * extent.  Update the COW fork record.
416		 */
417		new.br_startblock = repl->fsbno;
418		xfs_iext_update_extent(ip, BMAP_COWFORK, icur, &new);
419		return;
420	}
421
422	/*
423	 * The new extent can replace the beginning of the COW fork record.
424	 * Move the left side of @got upwards, then insert the new record.
425	 */
426	new.br_startoff += repl->len;
427	new.br_startblock += repl->len;
428	new.br_blockcount -= repl->len;
429	xfs_iext_update_extent(ip, BMAP_COWFORK, icur, &new);
430
431	new.br_startoff = got->br_startoff;
432	new.br_startblock = repl->fsbno;
433	new.br_blockcount = repl->len;
434	xfs_iext_insert(ip, icur, &new, BMAP_COWFORK);
435}
436
437/*
438 * Replace the unwritten CoW staging extent backing the given file range with a
439 * new space extent that isn't as problematic.
440 */
441STATIC int
442xrep_cow_replace_range(
443	struct xrep_cow		*xc,
444	xfs_fileoff_t		startoff,
445	xfs_extlen_t		*blockcount)
446{
447	struct xfs_iext_cursor	icur;
448	struct xrep_cow_extent	repl;
449	struct xfs_bmbt_irec	got;
450	struct xfs_scrub	*sc = xc->sc;
451	xfs_fileoff_t		nextoff;
452	xfs_extlen_t		alloc_len;
453	int			error;
454
455	/*
456	 * Put the existing CoW fork mapping in @got.  If @got ends before
457	 * @rep, truncate @rep so we only replace one extent mapping at a time.
458	 */
459	error = xrep_cow_find_mapping(xc, &icur, startoff, &got);
460	if (error)
461		return error;
462	nextoff = min(startoff + *blockcount,
463		      got.br_startoff + got.br_blockcount);
464
465	/*
466	 * Allocate a replacement extent.  If we don't fill all the blocks,
467	 * shorten the quantity that will be deleted in this step.
468	 */
469	alloc_len = min_t(xfs_fileoff_t, XFS_MAX_BMBT_EXTLEN,
470			  nextoff - startoff);
471	error = xrep_cow_alloc(sc, alloc_len, &repl);
472	if (error)
473		return error;
474
475	/*
476	 * Replace the old mapping with the new one, and commit the metadata
477	 * changes made so far.
478	 */
479	xrep_cow_replace_mapping(sc->ip, &icur, &got, &repl);
480
481	xfs_inode_set_cowblocks_tag(sc->ip);
482	error = xfs_defer_finish(&sc->tp);
483	if (error)
484		return error;
485
486	/* Note the old CoW staging extents; we'll reap them all later. */
487	error = xfsb_bitmap_set(&xc->old_cowfork_fsblocks, got.br_startblock,
488			repl.len);
489	if (error)
490		return error;
491
492	*blockcount = repl.len;
493	return 0;
494}
495
496/*
497 * Replace a bad part of an unwritten CoW staging extent with a fresh delalloc
498 * reservation.
499 */
500STATIC int
501xrep_cow_replace(
502	uint64_t		startoff,
503	uint64_t		blockcount,
504	void			*priv)
505{
506	struct xrep_cow		*xc = priv;
507	int			error = 0;
508
509	while (blockcount > 0) {
510		xfs_extlen_t	len = min_t(xfs_filblks_t, blockcount,
511					    XFS_MAX_BMBT_EXTLEN);
512
513		error = xrep_cow_replace_range(xc, startoff, &len);
514		if (error)
515			break;
516
517		blockcount -= len;
518		startoff += len;
519	}
520
521	return error;
522}
523
524/*
525 * Repair an inode's CoW fork.  The CoW fork is an in-core structure, so
526 * there's no btree to rebuid.  Instead, we replace any mappings that are
527 * cross-linked or lack ondisk CoW fork records in the refcount btree.
528 */
529int
530xrep_bmap_cow(
531	struct xfs_scrub	*sc)
532{
533	struct xrep_cow		*xc;
534	struct xfs_iext_cursor	icur;
535	struct xfs_ifork	*ifp = xfs_ifork_ptr(sc->ip, XFS_COW_FORK);
536	int			error;
537
538	if (!xfs_has_rmapbt(sc->mp) || !xfs_has_reflink(sc->mp))
539		return -EOPNOTSUPP;
540
541	if (!ifp)
542		return 0;
543
544	/* realtime files aren't supported yet */
545	if (XFS_IS_REALTIME_INODE(sc->ip))
546		return -EOPNOTSUPP;
547
548	/*
549	 * If we're somehow not in extents format, then reinitialize it to
550	 * an empty extent mapping fork and exit.
551	 */
552	if (ifp->if_format != XFS_DINODE_FMT_EXTENTS) {
553		ifp->if_format = XFS_DINODE_FMT_EXTENTS;
554		ifp->if_nextents = 0;
555		return 0;
556	}
557
558	xc = kzalloc(sizeof(struct xrep_cow), XCHK_GFP_FLAGS);
559	if (!xc)
560		return -ENOMEM;
561
562	xfs_trans_ijoin(sc->tp, sc->ip, 0);
563
564	xc->sc = sc;
565	xoff_bitmap_init(&xc->bad_fileoffs);
566	xfsb_bitmap_init(&xc->old_cowfork_fsblocks);
567
568	for_each_xfs_iext(ifp, &icur, &xc->irec) {
569		if (xchk_should_terminate(sc, &error))
570			goto out_bitmap;
571
572		/*
573		 * delalloc reservations only exist incore, so there is no
574		 * ondisk metadata that we can examine.  Hence we leave them
575		 * alone.
576		 */
577		if (isnullstartblock(xc->irec.br_startblock))
578			continue;
579
580		/*
581		 * COW fork extents are only in the written state if writeback
582		 * is actively writing to disk.  We cannot restart the write
583		 * at a different disk address since we've already issued the
584		 * IO, so we leave these alone and hope for the best.
585		 */
586		if (xfs_bmap_is_written_extent(&xc->irec))
587			continue;
588
589		error = xrep_cow_find_bad(xc);
590		if (error)
591			goto out_bitmap;
592	}
593
594	/* Replace any bad unwritten mappings with fresh reservations. */
595	error = xoff_bitmap_walk(&xc->bad_fileoffs, xrep_cow_replace, xc);
596	if (error)
597		goto out_bitmap;
598
599	/*
600	 * Reap as many of the old CoW blocks as we can.  They are owned ondisk
601	 * by the refcount btree, not the inode, so it is correct to treat them
602	 * like inode metadata.
603	 */
604	error = xrep_reap_fsblocks(sc, &xc->old_cowfork_fsblocks,
605			&XFS_RMAP_OINFO_COW);
606	if (error)
607		goto out_bitmap;
608
609out_bitmap:
610	xfsb_bitmap_destroy(&xc->old_cowfork_fsblocks);
611	xoff_bitmap_destroy(&xc->bad_fileoffs);
612	kfree(xc);
613	return error;
614}
615