// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2018-2023 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_btree.h"
#include "xfs_btree_staging.h"
#include "xfs_bit.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_alloc.h"
#include "xfs_alloc_btree.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
#include "xfs_inode.h"
#include "xfs_refcount.h"
#include "xfs_extent_busy.h"
#include "xfs_health.h"
#include "xfs_bmap.h"
#include "xfs_ialloc.h"
#include "xfs_ag.h"
#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/btree.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/bitmap.h"
#include "scrub/agb_bitmap.h"
#include "scrub/xfile.h"
#include "scrub/xfarray.h"
#include "scrub/newbt.h"
#include "scrub/reap.h"

/*
 * Free Space Btree Repair
 * =======================
 *
 * The reverse mappings are supposed to record all space usage for the entire
 * AG.  Therefore, we can recreate the free extent records in an AG by looking
 * for gaps in the physical extents recorded in the rmapbt.  These records are
 * staged in @free_records.  Identifying the gaps is more difficult on a
 * reflink filesystem because rmap records are allowed to overlap.
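 *
 * For example, suppose the scan finds rmap records for blocks 0-7 and 4-9,
 * which overlap because the underlying extents are shared, followed by a
 * record for blocks 14-16.  The furthest endpoint seen so far is 10, so
 * the record beginning at block 14 exposes blocks 10-13 as free space.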
 *
 * Because the final step of building a new index is to free the space used by
 * the old index, repair needs to find that space.  Unfortunately, all
 * structures that live in the free space (bnobt, cntbt, rmapbt, agfl) share
 * the same rmapbt owner code (OWN_AG), so this is not straightforward.
 *
 * The scan of the reverse mapping information records the space used by OWN_AG
 * in @old_allocbt_blocks, which (at this stage) is somewhat misnamed.  While
 * walking the rmapbt records, we create a second bitmap @not_allocbt_blocks to
 * record all visited rmap btree blocks and all blocks owned by the AGFL.
 *
 * That is where the definition of @old_allocbt_blocks shifts.  This
 * expression identifies possible former bnobt/cntbt blocks:
 *
 *	(OWN_AG blocks) & ~(rmapbt blocks | agfl blocks);
 *
 * Substituting from the definitions above, that becomes:
 *
 *	old_allocbt_blocks & ~not_allocbt_blocks
 *
 * The OWN_AG bitmap itself isn't needed after this point, so what we really do
 * instead is:
 *
 *	old_allocbt_blocks &= ~not_allocbt_blocks;
 *
 * After this point, @old_allocbt_blocks is a bitmap of alleged former
 * bnobt/cntbt blocks.  The xagb_bitmap_disunion operation modifies its first
 * parameter in place to avoid copying records around.
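 *
 * For example, if OWN_AG space occupies blocks 10-39, of which blocks
 * 10-15 belong to the rmapbt and blocks 16-20 sit in the AGFL, the
 * disunion leaves blocks 21-39 as the alleged former bnobt/cntbt blocks.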
 *
 * Next, some of the space described by @free_records is diverted to the newbt
 * reservation and used to format new btree blocks.  The remaining records are
 * written to the new btree indices.  We reconstruct both bnobt and cntbt at
 * the same time since we've already done all the work.
 *
 * We use the prefix 'xrep_abt' here because we regenerate both free space
 * allocation btrees at the same time.
 */

struct xrep_abt {
	/* Blocks owned by the rmapbt or the agfl. */
	struct xagb_bitmap	not_allocbt_blocks;

	/* All OWN_AG blocks. */
	struct xagb_bitmap	old_allocbt_blocks;

	/*
	 * New bnobt information.  All btree block reservations are added to
	 * the reservation list in new_bnobt.
	 */
	struct xrep_newbt	new_bnobt;

	/* new cntbt information */
	struct xrep_newbt	new_cntbt;

	/* Free space extents. */
	struct xfarray		*free_records;

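	/* Scrub context for this repair. */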
	struct xfs_scrub	*sc;

	/* Number of non-null records in @free_records. */
	uint64_t		nr_real_records;

	/* get_records()'s position in the free space record array. */
	xfarray_idx_t		array_cur;

	/*
	 * Next block we anticipate seeing in the rmap records.  If the next
	 * rmap record is greater than next_agbno, we have found unused space.
	 */
	xfs_agblock_t		next_agbno;

	/* Number of free blocks in this AG. */
	xfs_agblock_t		nr_blocks;

	/* Longest free extent we found in the AG. */
	xfs_agblock_t		longest;
};

/* Set up to repair AG free space btrees. */
int
xrep_setup_ag_allocbt(
	struct xfs_scrub	*sc)
{
	unsigned int		busy_gen;

	/*
	 * Make sure the busy extent list is clear because we can't put extents
	 * on there twice.
	 */
	busy_gen = READ_ONCE(sc->sa.pag->pagb_gen);
	if (xfs_extent_busy_list_empty(sc->sa.pag))
		return 0;

	return xfs_extent_busy_flush(sc->tp, sc->sa.pag, busy_gen, 0);
}

/* Check for any obvious conflicts in the free extent. */
STATIC int
xrep_abt_check_free_ext(
	struct xfs_scrub	*sc,
	const struct xfs_alloc_rec_incore *rec)
{
	enum xbtree_recpacking	outcome;
	int			error;

	if (xfs_alloc_check_irec(sc->sa.pag, rec) != NULL)
		return -EFSCORRUPTED;

	/* Must not be an inode chunk. */
	error = xfs_ialloc_has_inodes_at_extent(sc->sa.ino_cur,
			rec->ar_startblock, rec->ar_blockcount, &outcome);
	if (error)
		return error;
	if (outcome != XBTREE_RECPACKING_EMPTY)
		return -EFSCORRUPTED;

	/* Must not be shared or CoW staging. */
	if (sc->sa.refc_cur) {
		error = xfs_refcount_has_records(sc->sa.refc_cur,
				XFS_REFC_DOMAIN_SHARED, rec->ar_startblock,
				rec->ar_blockcount, &outcome);
		if (error)
			return error;
		if (outcome != XBTREE_RECPACKING_EMPTY)
			return -EFSCORRUPTED;

		error = xfs_refcount_has_records(sc->sa.refc_cur,
				XFS_REFC_DOMAIN_COW, rec->ar_startblock,
				rec->ar_blockcount, &outcome);
		if (error)
			return error;
		if (outcome != XBTREE_RECPACKING_EMPTY)
			return -EFSCORRUPTED;
	}

	return 0;
}

/*
 * Stash a free space record for all the space since the last bno we found
 * all the way up to @end.
 */
static int
xrep_abt_stash(
	struct xrep_abt		*ra,
	xfs_agblock_t		end)
{
	struct xfs_alloc_rec_incore arec = {
		.ar_startblock	= ra->next_agbno,
		.ar_blockcount	= end - ra->next_agbno,
	};
	struct xfs_scrub	*sc = ra->sc;
	int			error = 0;

	if (xchk_should_terminate(sc, &error))
		return error;

	error = xrep_abt_check_free_ext(ra->sc, &arec);
	if (error)
		return error;

	trace_xrep_abt_found(sc->mp, sc->sa.pag->pag_agno, &arec);

	error = xfarray_append(ra->free_records, &arec);
	if (error)
		return error;

	ra->nr_blocks += arec.ar_blockcount;
	return 0;
}

/* Record extents that aren't in use from gaps in the rmap records. */
STATIC int
xrep_abt_walk_rmap(
	struct xfs_btree_cur		*cur,
	const struct xfs_rmap_irec	*rec,
	void				*priv)
{
	struct xrep_abt			*ra = priv;
	int				error;

	/* Record all the OWN_AG blocks... */
	if (rec->rm_owner == XFS_RMAP_OWN_AG) {
		error = xagb_bitmap_set(&ra->old_allocbt_blocks,
				rec->rm_startblock, rec->rm_blockcount);
		if (error)
			return error;
	}

	/* ...and all the rmapbt blocks... */
	error = xagb_bitmap_set_btcur_path(&ra->not_allocbt_blocks, cur);
	if (error)
		return error;

	/* ...and all the free space. */
	if (rec->rm_startblock > ra->next_agbno) {
		error = xrep_abt_stash(ra, rec->rm_startblock);
		if (error)
			return error;
	}

	/*
	 * rmap records can overlap on reflink filesystems, so project
	 * next_agbno as far out into the AG space as we currently know about.
	 */
	ra->next_agbno = max_t(xfs_agblock_t, ra->next_agbno,
			rec->rm_startblock + rec->rm_blockcount);
	return 0;
}

/* Collect an AGFL block for the not-to-release list. */
static int
xrep_abt_walk_agfl(
	struct xfs_mount	*mp,
	xfs_agblock_t		agbno,
	void			*priv)
{
	struct xrep_abt		*ra = priv;

	return xagb_bitmap_set(&ra->not_allocbt_blocks, agbno, 1);
}

/*
 * Compare two free space extents by block number.  We want to sort in order of
 * increasing block number.
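 * For example, (4, 8) sorts before (12, 2), irrespective of extent length.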
 */
static int
xrep_bnobt_extent_cmp(
	const void		*a,
	const void		*b)
{
	const struct xfs_alloc_rec_incore *ap = a;
	const struct xfs_alloc_rec_incore *bp = b;

	if (ap->ar_startblock > bp->ar_startblock)
		return 1;
	else if (ap->ar_startblock < bp->ar_startblock)
		return -1;
	return 0;
}

/*
 * Re-sort the free extents by block number so that we can put the records into
 * the bnobt in the correct order.  Make sure the records do not overlap in
 * physical space.
 */
STATIC int
xrep_bnobt_sort_records(
	struct xrep_abt			*ra)
{
	struct xfs_alloc_rec_incore	arec;
	xfarray_idx_t			cur = XFARRAY_CURSOR_INIT;
	xfs_agblock_t			next_agbno = 0;
	int				error;

	error = xfarray_sort(ra->free_records, xrep_bnobt_extent_cmp, 0);
	if (error)
		return error;

	while ((error = xfarray_iter(ra->free_records, &cur, &arec)) == 1) {
		if (arec.ar_startblock < next_agbno)
			return -EFSCORRUPTED;

		next_agbno = arec.ar_startblock + arec.ar_blockcount;
	}

	return error;
}

/*
 * Compare two free space extents by length and then block number.  We want
 * to sort first in order of increasing length and then in order of increasing
 * block number.
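 * For example, the (startblock, blockcount) extents (10, 4), (2, 4), and
 * (6, 2) sort as (6, 2), (2, 4), (10, 4): shortest first, with the start
 * block breaking the tie between the two four-block extents.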
 */
static int
xrep_cntbt_extent_cmp(
	const void			*a,
	const void			*b)
{
	const struct xfs_alloc_rec_incore *ap = a;
	const struct xfs_alloc_rec_incore *bp = b;

	if (ap->ar_blockcount > bp->ar_blockcount)
		return 1;
	else if (ap->ar_blockcount < bp->ar_blockcount)
		return -1;
	return xrep_bnobt_extent_cmp(a, b);
}

/*
 * Sort the free extents by length so that we can put the records into the
 * cntbt in the correct order.  Don't let userspace kill us if we're resorting
 * after allocating btree blocks.
 */
STATIC int
xrep_cntbt_sort_records(
	struct xrep_abt			*ra,
	bool				is_resort)
{
	return xfarray_sort(ra->free_records, xrep_cntbt_extent_cmp,
			is_resort ? 0 : XFARRAY_SORT_KILLABLE);
}

/*
 * Iterate all reverse mappings to find (1) the gaps between rmap records (all
 * unowned space), (2) the OWN_AG extents (which encompass the free space
 * btrees, the rmapbt, and the agfl), (3) the rmapbt blocks, and (4) the AGFL
 * blocks.  The free space is (1) + (2) - (3) - (4).
 */
STATIC int
xrep_abt_find_freespace(
	struct xrep_abt		*ra)
{
	struct xfs_scrub	*sc = ra->sc;
	struct xfs_mount	*mp = sc->mp;
	struct xfs_agf		*agf = sc->sa.agf_bp->b_addr;
	struct xfs_buf		*agfl_bp;
	xfs_agblock_t		agend;
	int			error;

	xagb_bitmap_init(&ra->not_allocbt_blocks);

	xrep_ag_btcur_init(sc, &sc->sa);

	/*
	 * Iterate all the reverse mappings to find gaps in the physical
	 * mappings, all the OWN_AG blocks, and all the rmapbt extents.
	 */
	error = xfs_rmap_query_all(sc->sa.rmap_cur, xrep_abt_walk_rmap, ra);
	if (error)
		goto err;

	/* Insert a record for space between the last rmap and EOAG. */
	agend = be32_to_cpu(agf->agf_length);
	if (ra->next_agbno < agend) {
		error = xrep_abt_stash(ra, agend);
		if (error)
			goto err;
	}

	/* Collect all the AGFL blocks. */
	error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp);
	if (error)
		goto err;

	error = xfs_agfl_walk(mp, agf, agfl_bp, xrep_abt_walk_agfl, ra);
	if (error)
		goto err_agfl;

	/* Compute the old bnobt/cntbt blocks. */
	error = xagb_bitmap_disunion(&ra->old_allocbt_blocks,
			&ra->not_allocbt_blocks);
	if (error)
		goto err_agfl;

	ra->nr_real_records = xfarray_length(ra->free_records);
err_agfl:
	xfs_trans_brelse(sc->tp, agfl_bp);
err:
	xchk_ag_btcur_free(&sc->sa);
	xagb_bitmap_destroy(&ra->not_allocbt_blocks);
	return error;
}

/*
 * We're going to use the observed free space records to reserve blocks for the
 * new free space btrees, so we play an iterative game where we try to converge
 * on the number of blocks we need:
 *
 * 1. Estimate how many blocks we'll need to store the records.
 * 2. If the record being examined (the longest free extent, taken from the
 *    tail of the length-sorted array) has more blocks than we need, shrink it
 *    by that amount and we're done.  We will have to re-sort the records
 *    prior to building the cntbt.
 * 3. If that record has exactly the number of blocks we need, null out the
 *    record.  We're done.
 * 4. Otherwise, we still need more blocks.  Null out the record, subtract its
 *    length from the number of blocks we need, and go back to step 1.
 *
 * Fortunately, we don't have to do any transaction work to play this game, so
 * we don't have to tear down the staging cursors.
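 *
 * For example, suppose the geometry computation says we need five btree
 * blocks, and the longest free extent covers 100 blocks.  We take the five
 * blocks we need from the start of that extent, shrink the record to 95
 * blocks, and flag the record array for re-sorting, since the shortened
 * record may no longer be in the correct length-sort position.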
 */
STATIC int
xrep_abt_reserve_space(
	struct xrep_abt		*ra,
	struct xfs_btree_cur	*bno_cur,
	struct xfs_btree_cur	*cnt_cur,
	bool			*needs_resort)
{
	struct xfs_scrub	*sc = ra->sc;
	xfarray_idx_t		record_nr;
	unsigned int		allocated = 0;
	int			error = 0;

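	/* Records are sorted by increasing length; start from the tail. */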
	record_nr = xfarray_length(ra->free_records) - 1;
	do {
		struct xfs_alloc_rec_incore arec;
		uint64_t		required;
		unsigned int		desired;
		unsigned int		len;

		/* Compute how many blocks we'll need. */
		error = xfs_btree_bload_compute_geometry(cnt_cur,
				&ra->new_cntbt.bload, ra->nr_real_records);
		if (error)
			break;

		error = xfs_btree_bload_compute_geometry(bno_cur,
				&ra->new_bnobt.bload, ra->nr_real_records);
		if (error)
			break;

		/* How many btree blocks do we need to store all records? */
		required = ra->new_bnobt.bload.nr_blocks +
			   ra->new_cntbt.bload.nr_blocks;
		ASSERT(required < INT_MAX);

		/* If we've reserved enough blocks, we're done. */
		if (allocated >= required)
			break;

		desired = required - allocated;

		/* We need space but there's none left; bye! */
		if (ra->nr_real_records == 0) {
			error = -ENOSPC;
			break;
		}

		/* Grab the longest record, at the tail of the list. */
		error = xfarray_load(ra->free_records, record_nr, &arec);
		if (error)
			break;

		ASSERT(arec.ar_blockcount <= UINT_MAX);
		len = min_t(unsigned int, arec.ar_blockcount, desired);

		trace_xrep_newbt_alloc_ag_blocks(sc->mp, sc->sa.pag->pag_agno,
				arec.ar_startblock, len, XFS_RMAP_OWN_AG);

		error = xrep_newbt_add_extent(&ra->new_bnobt, sc->sa.pag,
				arec.ar_startblock, len);
		if (error)
			break;
		allocated += len;
		ra->nr_blocks -= len;

		if (arec.ar_blockcount > desired) {
			/*
			 * Record has more space than we need.  The number of
			 * free records doesn't change, so shrink the free
			 * record, inform the caller that the records are no
			 * longer sorted by length, and exit.
			 */
			arec.ar_startblock += desired;
			arec.ar_blockcount -= desired;
			error = xfarray_store(ra->free_records, record_nr,
					&arec);
			if (error)
				break;

			*needs_resort = true;
			return 0;
		}

		/*
		 * We're going to use up the entire record, so unset it and
		 * move on to the next one.  This changes the number of free
		 * records (but doesn't break the sorting order), so we must
		 * go around the loop once more to re-run
		 * _bload_compute_geometry.
		 */
		error = xfarray_unset(ra->free_records, record_nr);
		if (error)
			break;
		ra->nr_real_records--;
		record_nr--;
	} while (1);

	return error;
}

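/* Dispose of a single reservation: rmap what we used, free what we didn't. */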
STATIC int
xrep_abt_dispose_one(
	struct xrep_abt		*ra,
	struct xrep_newbt_resv	*resv)
{
	struct xfs_scrub	*sc = ra->sc;
	struct xfs_perag	*pag = sc->sa.pag;
	xfs_agblock_t		free_agbno = resv->agbno + resv->used;
	xfs_extlen_t		free_aglen = resv->len - resv->used;
	int			error;

	ASSERT(pag == resv->pag);

	/* Add a deferred rmap for each extent we used. */
	if (resv->used > 0)
		xfs_rmap_alloc_extent(sc->tp, pag->pag_agno, resv->agbno,
				resv->used, XFS_RMAP_OWN_AG);

	/*
	 * For each reserved btree block we didn't use, add it to the free
	 * space btree.  We didn't touch fdblocks when we reserved them, so
	 * we don't touch it now.
	 */
	if (free_aglen == 0)
		return 0;

	trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno, free_agbno,
			free_aglen, ra->new_bnobt.oinfo.oi_owner);

	error = __xfs_free_extent(sc->tp, resv->pag, free_agbno, free_aglen,
			&ra->new_bnobt.oinfo, XFS_AG_RESV_IGNORE, true);
	if (error)
		return error;

	return xrep_defer_finish(sc);
}

/*
 * Deal with all the space we reserved.  Blocks that were allocated for the
 * free space btrees need to have a (deferred) rmap added for the OWN_AG
 * allocation, and blocks that didn't get used can be freed via the usual
 * (deferred) means.
 */
STATIC void
xrep_abt_dispose_reservations(
	struct xrep_abt		*ra,
	int			error)
{
	struct xrep_newbt_resv	*resv, *n;

	if (error)
		goto junkit;

	list_for_each_entry_safe(resv, n, &ra->new_bnobt.resv_list, list) {
		error = xrep_abt_dispose_one(ra, resv);
		if (error)
			goto junkit;
	}

junkit:
	list_for_each_entry_safe(resv, n, &ra->new_bnobt.resv_list, list) {
		xfs_perag_put(resv->pag);
		list_del(&resv->list);
		kfree(resv);
	}

	xrep_newbt_cancel(&ra->new_bnobt);
	xrep_newbt_cancel(&ra->new_cntbt);
}

/* Retrieve free space data for bulk load. */
STATIC int
xrep_abt_get_records(
	struct xfs_btree_cur		*cur,
	unsigned int			idx,
	struct xfs_btree_block		*block,
	unsigned int			nr_wanted,
	void				*priv)
{
	struct xfs_alloc_rec_incore	*arec = &cur->bc_rec.a;
	struct xrep_abt			*ra = priv;
	union xfs_btree_rec		*block_rec;
	unsigned int			loaded;
	int				error;

	for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
		error = xfarray_load_next(ra->free_records, &ra->array_cur,
				arec);
		if (error)
			return error;

		ra->longest = max(ra->longest, arec->ar_blockcount);

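		/*
		 * We loaded the record straight into cur->bc_rec via @arec,
		 * so it is already in place for init_rec_from_cur to format
		 * it into the new btree block.
		 */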
		block_rec = xfs_btree_rec_addr(cur, idx, block);
		cur->bc_ops->init_rec_from_cur(cur, block_rec);
	}

	return loaded;
}

/* Feed one of the new btree blocks to the bulk loader. */
STATIC int
xrep_abt_claim_block(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*ptr,
	void			*priv)
{
	struct xrep_abt		*ra = priv;

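	/*
	 * Blocks for both new btrees come from the reservations staged in
	 * new_bnobt; see the comment in struct xrep_abt.
	 */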
	return xrep_newbt_claim_block(cur, &ra->new_bnobt, ptr);
}

/*
 * Reset the AGF counters to reflect the free space btrees that we just
 * rebuilt, then reinitialize the per-AG data.
 */
STATIC int
xrep_abt_reset_counters(
	struct xrep_abt		*ra)
{
	struct xfs_scrub	*sc = ra->sc;
	struct xfs_perag	*pag = sc->sa.pag;
	struct xfs_agf		*agf = sc->sa.agf_bp->b_addr;
	unsigned int		freesp_btreeblks = 0;

	/*
	 * Compute the contribution to agf_btreeblks for the new free space
	 * btrees.  This is the computed btree size minus anything we didn't
	 * use, and minus one block per btree because btreeblks does not
	 * count the btree roots.
	 */
	freesp_btreeblks += ra->new_bnobt.bload.nr_blocks - 1;
	freesp_btreeblks += ra->new_cntbt.bload.nr_blocks - 1;

	freesp_btreeblks -= xrep_newbt_unused_blocks(&ra->new_bnobt);
	freesp_btreeblks -= xrep_newbt_unused_blocks(&ra->new_cntbt);

	/*
	 * The AGF header contains extra information related to the free space
	 * btrees, so we must update those fields here.
	 */
	agf->agf_btreeblks = cpu_to_be32(freesp_btreeblks +
				(be32_to_cpu(agf->agf_rmap_blocks) - 1));
	agf->agf_freeblks = cpu_to_be32(ra->nr_blocks);
	agf->agf_longest = cpu_to_be32(ra->longest);
	xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_BTREEBLKS |
						 XFS_AGF_LONGEST |
						 XFS_AGF_FREEBLKS);

	/*
	 * After we commit the new btree to disk, it is possible that the
	 * process to reap the old btree blocks will race with the AIL trying
	 * to checkpoint the old btree blocks into the filesystem.  If the new
	 * tree is shorter than the old one, the allocbt write verifier will
	 * fail and the AIL will shut down the filesystem.
	 *
	 * To avoid this, save the old incore btree height values as the alt
	 * height values before re-initializing the perag info from the updated
	 * AGF to capture all the new values.
	 */
	pag->pagf_repair_bno_level = pag->pagf_bno_level;
	pag->pagf_repair_cnt_level = pag->pagf_cnt_level;

	/* Reinitialize with the values we just logged. */
	return xrep_reinit_pagf(sc);
}

/*
 * Use the collected free space information to stage new free space btrees.
 * If this is successful we'll return with the new btree root information
 * logged to the repair transaction but not yet committed.
 */
STATIC int
xrep_abt_build_new_trees(
	struct xrep_abt		*ra)
{
	struct xfs_scrub	*sc = ra->sc;
	struct xfs_btree_cur	*bno_cur;
	struct xfs_btree_cur	*cnt_cur;
	struct xfs_perag	*pag = sc->sa.pag;
	bool			needs_resort = false;
	int			error;

	/*
	 * Sort the free extents by length so that we can set up the free space
	 * btrees in as few extents as possible.  This reduces the amount of
	 * deferred rmap / free work we have to do at the end.
	 */
	error = xrep_cntbt_sort_records(ra, false);
	if (error)
		return error;

	/*
	 * Prepare to construct the new btree by reserving disk space for the
	 * new btree and setting up all the accounting information we'll need
	 * to root the new btree while it's under construction and before we
	 * attach it to the AG header.
	 */
	xrep_newbt_init_bare(&ra->new_bnobt, sc);
	xrep_newbt_init_bare(&ra->new_cntbt, sc);

	ra->new_bnobt.bload.get_records = xrep_abt_get_records;
	ra->new_cntbt.bload.get_records = xrep_abt_get_records;

	ra->new_bnobt.bload.claim_block = xrep_abt_claim_block;
	ra->new_cntbt.bload.claim_block = xrep_abt_claim_block;

	/* Allocate cursors for the staged btrees. */
	bno_cur = xfs_bnobt_init_cursor(sc->mp, NULL, NULL, pag);
	xfs_btree_stage_afakeroot(bno_cur, &ra->new_bnobt.afake);

	cnt_cur = xfs_cntbt_init_cursor(sc->mp, NULL, NULL, pag);
	xfs_btree_stage_afakeroot(cnt_cur, &ra->new_cntbt.afake);

	/* Last chance to abort before we start committing fixes. */
	if (xchk_should_terminate(sc, &error))
		goto err_cur;

	/* Reserve the space we'll need for the new btrees. */
	error = xrep_abt_reserve_space(ra, bno_cur, cnt_cur, &needs_resort);
	if (error)
		goto err_cur;

	/*
	 * If we need to re-sort the free extents by length, do so now so that
	 * we can put the records into the cntbt in the correct order.
	 */
	if (needs_resort) {
		error = xrep_cntbt_sort_records(ra, needs_resort);
		if (error)
			goto err_cur;
	}

	/*
	 * Due to btree slack factors, it's possible for a new btree to be one
	 * level taller than the old btree.  Update the alternate incore btree
	 * height so that we don't trip the verifiers when writing the new
	 * btree blocks to disk.
	 */
	pag->pagf_repair_bno_level = ra->new_bnobt.bload.btree_height;
	pag->pagf_repair_cnt_level = ra->new_cntbt.bload.btree_height;

	/* Load the free space by length tree. */
	ra->array_cur = XFARRAY_CURSOR_INIT;
	ra->longest = 0;
	error = xfs_btree_bload(cnt_cur, &ra->new_cntbt.bload, ra);
	if (error)
		goto err_levels;

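	/* Re-sort the records by block number for the bnobt bulk load. */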
	error = xrep_bnobt_sort_records(ra);
	if (error)
		goto err_levels;

	/* Load the free space by block number tree. */
	ra->array_cur = XFARRAY_CURSOR_INIT;
	error = xfs_btree_bload(bno_cur, &ra->new_bnobt.bload, ra);
	if (error)
		goto err_levels;

	/*
	 * Install the new btrees in the AG header.  After this point the old
	 * btrees are no longer accessible and the new trees are live.
	 */
	xfs_allocbt_commit_staged_btree(bno_cur, sc->tp, sc->sa.agf_bp);
	xfs_btree_del_cursor(bno_cur, 0);
	xfs_allocbt_commit_staged_btree(cnt_cur, sc->tp, sc->sa.agf_bp);
	xfs_btree_del_cursor(cnt_cur, 0);

	/* Reset the AGF counters now that we've changed the btree shape. */
	error = xrep_abt_reset_counters(ra);
	if (error)
		goto err_newbt;

	/* Dispose of any unused blocks and the accounting information. */
	xrep_abt_dispose_reservations(ra, error);

	return xrep_roll_ag_trans(sc);

err_levels:
	pag->pagf_repair_bno_level = 0;
	pag->pagf_repair_cnt_level = 0;
err_cur:
	xfs_btree_del_cursor(cnt_cur, error);
	xfs_btree_del_cursor(bno_cur, error);
err_newbt:
	xrep_abt_dispose_reservations(ra, error);
	return error;
}

/*
 * Now that we've logged the roots of the new btrees, invalidate all of the
 * old blocks and free them.
 */
STATIC int
xrep_abt_remove_old_trees(
	struct xrep_abt		*ra)
{
	struct xfs_perag	*pag = ra->sc->sa.pag;
	int			error;

	/* Free the old btree blocks if they're not in use. */
	error = xrep_reap_agblocks(ra->sc, &ra->old_allocbt_blocks,
			&XFS_RMAP_OINFO_AG, XFS_AG_RESV_IGNORE);
	if (error)
		return error;

	/*
	 * Now that we've zapped all the old allocbt blocks we can turn off
	 * the alternate height mechanism.
	 */
	pag->pagf_repair_bno_level = 0;
	pag->pagf_repair_cnt_level = 0;
	return 0;
}

/* Repair the freespace btrees for some AG. */
int
xrep_allocbt(
	struct xfs_scrub	*sc)
{
	struct xrep_abt		*ra;
	struct xfs_mount	*mp = sc->mp;
	char			*descr;
	int			error;

	/* We require the rmapbt to rebuild anything. */
	if (!xfs_has_rmapbt(mp))
		return -EOPNOTSUPP;

	ra = kzalloc(sizeof(struct xrep_abt), XCHK_GFP_FLAGS);
	if (!ra)
		return -ENOMEM;
	ra->sc = sc;

	/* We rebuild both data structures. */
	sc->sick_mask = XFS_SICK_AG_BNOBT | XFS_SICK_AG_CNTBT;

	/*
	 * Make sure the busy extent list is clear because we can't put extents
	 * on there twice.  In theory we cleared this before we started, but
	 * let's not risk the filesystem.
	 */
	if (!xfs_extent_busy_list_empty(sc->sa.pag)) {
		error = -EDEADLOCK;
		goto out_ra;
	}

	/*
	 * Set up enough storage to handle maximally fragmented free space;
	 * at worst, every other block in the AG is free.
	 */
	descr = xchk_xfile_ag_descr(sc, "free space records");
	error = xfarray_create(descr, mp->m_sb.sb_agblocks / 2,
			sizeof(struct xfs_alloc_rec_incore),
			&ra->free_records);
	kfree(descr);
	if (error)
		goto out_ra;

	/* Collect the free space data and find the old btree blocks. */
	xagb_bitmap_init(&ra->old_allocbt_blocks);
	error = xrep_abt_find_freespace(ra);
	if (error)
		goto out_bitmap;

	/* Rebuild the free space information. */
	error = xrep_abt_build_new_trees(ra);
	if (error)
		goto out_bitmap;

	/* Kill the old trees. */
	error = xrep_abt_remove_old_trees(ra);
	if (error)
		goto out_bitmap;

out_bitmap:
	xagb_bitmap_destroy(&ra->old_allocbt_blocks);
	xfarray_destroy(ra->free_records);
out_ra:
	kfree(ra);
	return error;
}

/* Make sure both btrees are ok after we've rebuilt them. */
int
xrep_revalidate_allocbt(
	struct xfs_scrub	*sc)
{
	__u32			old_type = sc->sm->sm_type;
	int			error;

	/*
	 * We must update sm_type temporarily so that the tree-to-tree cross
	 * reference checks will work in the correct direction, and also so
	 * that tracing will report correctly if there are more errors.
	 */
	sc->sm->sm_type = XFS_SCRUB_TYPE_BNOBT;
	error = xchk_allocbt(sc);
	if (error)
		goto out;

	sc->sm->sm_type = XFS_SCRUB_TYPE_CNTBT;
	error = xchk_allocbt(sc);
out:
	sc->sm->sm_type = old_type;
	return error;
}