// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2017-2023 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/btree.h"
#include "scrub/trace.h"

/* btree scrubbing */

/*
 * Check for btree operation errors.  See the section about handling
 * operational errors in common.c.
 */
static bool
__xchk_btree_process_error(
	struct xfs_scrub	*sc,
	struct xfs_btree_cur	*cur,
	int			level,
	int			*error,
	__u32			errflag,
	void			*ret_ip)
{
	if (*error == 0)
		return true;

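	/*
	 * Classify the failure: transient errors (-EDEADLOCK, -ECHRNG) are
	 * traced and handed back so the caller can retry; corruption
	 * (-EFSBADCRC, -EFSCORRUPTED) sets the caller's flag and is
	 * absorbed; everything else is traced as an operational error.
	 */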
	switch (*error) {
	case -EDEADLOCK:
	case -ECHRNG:
		/* Used to restart an op with deadlock avoidance. */
		trace_xchk_deadlock_retry(sc->ip, sc->sm, *error);
		break;
	case -EFSBADCRC:
	case -EFSCORRUPTED:
		/* Note the badness but don't abort. */
		sc->sm->sm_flags |= errflag;
		*error = 0;
		fallthrough;
	default:
		if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE)
			trace_xchk_ifork_btree_op_error(sc, cur, level,
					*error, ret_ip);
		else
			trace_xchk_btree_op_error(sc, cur, level,
					*error, ret_ip);
		break;
	}
	return false;
}

bool
xchk_btree_process_error(
	struct xfs_scrub	*sc,
	struct xfs_btree_cur	*cur,
	int			level,
	int			*error)
{
	return __xchk_btree_process_error(sc, cur, level, error,
			XFS_SCRUB_OFLAG_CORRUPT, __return_address);
}

bool
xchk_btree_xref_process_error(
	struct xfs_scrub	*sc,
	struct xfs_btree_cur	*cur,
	int			level,
	int			*error)
{
	return __xchk_btree_process_error(sc, cur, level, error,
			XFS_SCRUB_OFLAG_XFAIL, __return_address);
}
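
/*
 * Typical calling convention for the helpers above, sketched from the call
 * sites later in this file (illustrative only, not a new interface):
 *
 *	error = xfs_btree_increment(ncur, level + 1, &success);
 *	if (!xchk_btree_process_error(bs->sc, cur, level + 1, &error))
 *		goto out;
 *
 * A false return either propagates a fatal error (*error != 0) or signals
 * that corruption was recorded and absorbed (*error == 0).
 */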

/* Record btree block corruption. */
static void
__xchk_btree_set_corrupt(
	struct xfs_scrub	*sc,
	struct xfs_btree_cur	*cur,
	int			level,
	__u32			errflag,
	void			*ret_ip)
{
	sc->sm->sm_flags |= errflag;

	if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE)
		trace_xchk_ifork_btree_error(sc, cur, level,
				ret_ip);
	else
		trace_xchk_btree_error(sc, cur, level,
				ret_ip);
}
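
/*
 * The wrappers below differ only in the outcome flag they record (see
 * xfs_fs.h): OFLAG_CORRUPT means the btree itself is bad, OFLAG_XCORRUPT
 * means a cross-reference against other metadata failed, and OFLAG_PREEN
 * means the btree is consistent but could be optimized.
 */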

void
xchk_btree_set_corrupt(
	struct xfs_scrub	*sc,
	struct xfs_btree_cur	*cur,
	int			level)
{
	__xchk_btree_set_corrupt(sc, cur, level, XFS_SCRUB_OFLAG_CORRUPT,
			__return_address);
}

void
xchk_btree_xref_set_corrupt(
	struct xfs_scrub	*sc,
	struct xfs_btree_cur	*cur,
	int			level)
{
	__xchk_btree_set_corrupt(sc, cur, level, XFS_SCRUB_OFLAG_XCORRUPT,
			__return_address);
}

void
xchk_btree_set_preen(
	struct xfs_scrub	*sc,
	struct xfs_btree_cur	*cur,
	int			level)
{
	__xchk_btree_set_corrupt(sc, cur, level, XFS_SCRUB_OFLAG_PREEN,
			__return_address);
}

/*
 * Make sure this record is in order and doesn't stray outside of the parent
 * keys.
 */
STATIC void
xchk_btree_rec(
	struct xchk_btree	*bs)
{
	struct xfs_btree_cur	*cur = bs->cur;
	union xfs_btree_rec	*rec;
	union xfs_btree_key	key;
	union xfs_btree_key	hkey;
	union xfs_btree_key	*keyp;
	struct xfs_btree_block	*block;
	struct xfs_btree_block	*keyblock;
	struct xfs_buf		*bp;

	block = xfs_btree_get_block(cur, 0, &bp);
	rec = xfs_btree_rec_addr(cur, cur->bc_levels[0].ptr, block);

	trace_xchk_btree_rec(bs->sc, cur, 0);

	/* Are all records across all record blocks in order? */
	if (bs->lastrec_valid &&
	    !cur->bc_ops->recs_inorder(cur, &bs->lastrec, rec))
		xchk_btree_set_corrupt(bs->sc, cur, 0);
	memcpy(&bs->lastrec, rec, cur->bc_ops->rec_len);
	bs->lastrec_valid = true;

	if (cur->bc_nlevels == 1)
		return;

	/* Is low_key(rec) at least as large as the parent low key? */
	cur->bc_ops->init_key_from_rec(&key, rec);
	keyblock = xfs_btree_get_block(cur, 1, &bp);
	keyp = xfs_btree_key_addr(cur, cur->bc_levels[1].ptr, keyblock);
	if (xfs_btree_keycmp_lt(cur, &key, keyp))
		xchk_btree_set_corrupt(bs->sc, cur, 1);

	if (!(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING))
		return;

	/* Is high_key(rec) no larger than the parent high key? */
	cur->bc_ops->init_high_key_from_rec(&hkey, rec);
	keyp = xfs_btree_high_key_addr(cur, cur->bc_levels[1].ptr, keyblock);
	if (xfs_btree_keycmp_lt(cur, keyp, &hkey))
		xchk_btree_set_corrupt(bs->sc, cur, 1);
}
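
/*
 * In other words, each record must sit inside the keyspace that its parent
 * keyptr claims for it.  For an overlapping btree such as the rmapbt:
 *
 *	parent:   [low_key ................. high_key]
 *	record:        [low_key(rec) .. high_key(rec)]
 *
 * low_key(rec) may not be less than the parent's low key, and high_key(rec)
 * may not be greater than the parent's high key.
 */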

/*
 * Make sure this key is in order and doesn't stray outside of the parent
 * keys.
 */
STATIC void
xchk_btree_key(
	struct xchk_btree	*bs,
	int			level)
{
	struct xfs_btree_cur	*cur = bs->cur;
	union xfs_btree_key	*key;
	union xfs_btree_key	*keyp;
	struct xfs_btree_block	*block;
	struct xfs_btree_block	*keyblock;
	struct xfs_buf		*bp;

	block = xfs_btree_get_block(cur, level, &bp);
	key = xfs_btree_key_addr(cur, cur->bc_levels[level].ptr, block);

	trace_xchk_btree_key(bs->sc, cur, level);

	/* Are all low keys across all node blocks in order? */
	if (bs->lastkey[level - 1].valid &&
	    !cur->bc_ops->keys_inorder(cur, &bs->lastkey[level - 1].key, key))
		xchk_btree_set_corrupt(bs->sc, cur, level);
	memcpy(&bs->lastkey[level - 1].key, key, cur->bc_ops->key_len);
	bs->lastkey[level - 1].valid = true;

	if (level + 1 >= cur->bc_nlevels)
		return;

	/* Is this block's low key at least as large as the parent low key? */
	keyblock = xfs_btree_get_block(cur, level + 1, &bp);
	keyp = xfs_btree_key_addr(cur, cur->bc_levels[level + 1].ptr, keyblock);
	if (xfs_btree_keycmp_lt(cur, key, keyp))
		xchk_btree_set_corrupt(bs->sc, cur, level);

	if (!(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING))
		return;

	/* Is this block's high key no larger than the parent high key? */
	key = xfs_btree_high_key_addr(cur, cur->bc_levels[level].ptr, block);
	keyp = xfs_btree_high_key_addr(cur, cur->bc_levels[level + 1].ptr,
			keyblock);
	if (xfs_btree_keycmp_lt(cur, keyp, key))
		xchk_btree_set_corrupt(bs->sc, cur, level);
}

/*
 * Check a btree pointer.  Returns true if it's ok to use this pointer.
 * Callers do not need to set the corrupt flag.
 */
static bool
xchk_btree_ptr_ok(
	struct xchk_btree	*bs,
	int			level,
	union xfs_btree_ptr	*ptr)
{
	/* A btree rooted in an inode has no block pointer to the root. */
	if (bs->cur->bc_ops->type == XFS_BTREE_TYPE_INODE &&
	    level == bs->cur->bc_nlevels)
		return true;

	/* Otherwise, check the pointers. */
	if (__xfs_btree_check_ptr(bs->cur, ptr, 0, level)) {
		xchk_btree_set_corrupt(bs->sc, bs->cur, level);
		return false;
	}

	return true;
}

/* Check that a btree block's sibling matches what we expect. */
STATIC int
xchk_btree_block_check_sibling(
	struct xchk_btree	*bs,
	int			level,
	int			direction,
	union xfs_btree_ptr	*sibling)
{
	struct xfs_btree_cur	*cur = bs->cur;
	struct xfs_btree_block	*pblock;
	struct xfs_buf		*pbp;
	struct xfs_btree_cur	*ncur = NULL;
	union xfs_btree_ptr	*pp;
	int			success;
	int			error;

	error = xfs_btree_dup_cursor(cur, &ncur);
	if (!xchk_btree_process_error(bs->sc, cur, level + 1, &error) ||
	    !ncur)
		return error;

	/*
	 * If the pointer is null, we shouldn't be able to move the upper
	 * level pointer anywhere.
	 */
	if (xfs_btree_ptr_is_null(cur, sibling)) {
		if (direction > 0)
			error = xfs_btree_increment(ncur, level + 1, &success);
		else
			error = xfs_btree_decrement(ncur, level + 1, &success);
		if (error == 0 && success)
			xchk_btree_set_corrupt(bs->sc, cur, level);
		error = 0;
		goto out;
	}

	/* Increment upper level pointer. */
	if (direction > 0)
		error = xfs_btree_increment(ncur, level + 1, &success);
	else
		error = xfs_btree_decrement(ncur, level + 1, &success);
	if (!xchk_btree_process_error(bs->sc, cur, level + 1, &error))
		goto out;
	if (!success) {
		xchk_btree_set_corrupt(bs->sc, cur, level + 1);
		goto out;
	}

	/* Compare upper level pointer to sibling pointer. */
	pblock = xfs_btree_get_block(ncur, level + 1, &pbp);
	pp = xfs_btree_ptr_addr(ncur, ncur->bc_levels[level + 1].ptr, pblock);
	if (!xchk_btree_ptr_ok(bs, level + 1, pp))
		goto out;
	if (pbp)
		xchk_buffer_recheck(bs->sc, pbp);

	if (xfs_btree_diff_two_ptrs(cur, pp, sibling))
		xchk_btree_set_corrupt(bs->sc, cur, level);
out:
	xfs_btree_del_cursor(ncur, XFS_BTREE_ERROR);
	return error;
}

/* Check the siblings of a btree block. */
STATIC int
xchk_btree_block_check_siblings(
	struct xchk_btree	*bs,
	struct xfs_btree_block	*block)
{
	struct xfs_btree_cur	*cur = bs->cur;
	union xfs_btree_ptr	leftsib;
	union xfs_btree_ptr	rightsib;
	int			level;
	int			error = 0;

	xfs_btree_get_sibling(cur, block, &leftsib, XFS_BB_LEFTSIB);
	xfs_btree_get_sibling(cur, block, &rightsib, XFS_BB_RIGHTSIB);
	level = xfs_btree_get_level(block);

	/* Root block should never have siblings. */
	if (level == cur->bc_nlevels - 1) {
		if (!xfs_btree_ptr_is_null(cur, &leftsib) ||
		    !xfs_btree_ptr_is_null(cur, &rightsib))
			xchk_btree_set_corrupt(bs->sc, cur, level);
		goto out;
	}

	/*
	 * Do the left and right sibling pointers match the adjacent
	 * parent level pointers?
	 * (This function absorbs error codes for us.)
	 */
	error = xchk_btree_block_check_sibling(bs, level, -1, &leftsib);
	if (error)
		return error;
	error = xchk_btree_block_check_sibling(bs, level, 1, &rightsib);
	if (error)
		return error;
out:
	return error;
}

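/*
 * Deferred block-owner check.  Scans of the bnobt and rmapbt queue one of
 * these per btree block (see xchk_btree_check_owner) and process the list
 * once the main traversal finishes, at the bottom of xchk_btree.
 */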
struct check_owner {
	struct list_head	list;
	xfs_daddr_t		daddr;
	int			level;
};

/*
 * Make sure this btree block isn't in the free list and that there's
 * an rmap record for it.
 */
STATIC int
xchk_btree_check_block_owner(
	struct xchk_btree	*bs,
	int			level,
	xfs_daddr_t		daddr)
{
	xfs_agnumber_t		agno;
	xfs_agblock_t		agbno;
	bool			init_sa;
	int			error = 0;

	if (!bs->cur)
		return 0;

	agno = xfs_daddr_to_agno(bs->cur->bc_mp, daddr);
	agbno = xfs_daddr_to_agbno(bs->cur->bc_mp, daddr);

	/*
	 * If the btree being examined is not itself a per-AG btree, initialize
	 * sc->sa so that we can check for the presence of an ownership record
	 * in the rmap btree for the AG containing the block.
	 */
	init_sa = bs->cur->bc_ops->type != XFS_BTREE_TYPE_AG;
	if (init_sa) {
		error = xchk_ag_init_existing(bs->sc, agno, &bs->sc->sa);
		if (!xchk_btree_xref_process_error(bs->sc, bs->cur,
				level, &error))
			goto out_free;
	}

	xchk_xref_is_used_space(bs->sc, agbno, 1);
	/*
	 * The bnobt scrubber aliases bs->cur to bs->sc->sa.bno_cur, so we
	 * have to nullify it (to shut down further block owner checks) if
	 * self-xref encounters problems.
	 */
	if (!bs->sc->sa.bno_cur && xfs_btree_is_bno(bs->cur->bc_ops))
		bs->cur = NULL;

	xchk_xref_is_only_owned_by(bs->sc, agbno, 1, bs->oinfo);
	if (!bs->sc->sa.rmap_cur && xfs_btree_is_rmap(bs->cur->bc_ops))
		bs->cur = NULL;

out_free:
	if (init_sa)
		xchk_ag_free(bs->sc, &bs->sc->sa);

	return error;
}

/* Check the owner of a btree block. */
STATIC int
xchk_btree_check_owner(
	struct xchk_btree	*bs,
	int			level,
	struct xfs_buf		*bp)
{
	struct xfs_btree_cur	*cur = bs->cur;

	/*
	 * In theory, xfs_btree_get_block should only give us a null buffer
	 * pointer for the root of a root-in-inode btree type, but we need
	 * to check defensively here in case the cursor state is also screwed
	 * up.
	 */
	if (bp == NULL) {
		if (cur->bc_ops->type != XFS_BTREE_TYPE_INODE)
			xchk_btree_set_corrupt(bs->sc, bs->cur, level);
		return 0;
	}

	/*
	 * We want to cross-reference each btree block with the bnobt
	 * and the rmapbt.  We cannot cross-reference the bnobt or
	 * rmapbt while scanning the bnobt or rmapbt, respectively,
	 * because we cannot alter the cursor and we'd prefer not to
	 * duplicate cursors.  Therefore, save the buffer daddr for
	 * later scanning.
	 */
	if (xfs_btree_is_bno(cur->bc_ops) || xfs_btree_is_rmap(cur->bc_ops)) {
		struct check_owner	*co;

		co = kmalloc(sizeof(struct check_owner), XCHK_GFP_FLAGS);
		if (!co)
			return -ENOMEM;

		INIT_LIST_HEAD(&co->list);
		co->level = level;
		co->daddr = xfs_buf_daddr(bp);
		list_add_tail(&co->list, &bs->to_check);
		return 0;
	}

	return xchk_btree_check_block_owner(bs, level, xfs_buf_daddr(bp));
}

/* Decide if we want to check minrecs of a btree block in the inode root. */
static inline bool
xchk_btree_check_iroot_minrecs(
	struct xchk_btree	*bs)
{
	/*
	 * xfs_bmap_add_attrfork_btree had an implementation bug wherein it
	 * would miscalculate the space required for the data fork bmbt root
	 * when adding an attr fork, and promote the iroot contents to an
	 * external block unnecessarily.  This went unnoticed for many years
	 * until scrub found filesystems in this state.  Inode rooted btrees are
	 * not supposed to have immediate child blocks that are small enough
	 * that the contents could fit in the inode root, but we can't fail
	 * existing filesystems, so instead we disable the check for data fork
	 * bmap btrees when there's an attr fork.
	 */
	if (xfs_btree_is_bmap(bs->cur->bc_ops) &&
	    bs->cur->bc_ino.whichfork == XFS_DATA_FORK &&
	    xfs_inode_has_attr_fork(bs->sc->ip))
		return false;

	return true;
}

/*
 * Check that this btree block has at least minrecs records or is one of the
 * special blocks that don't require that.
 */
STATIC void
xchk_btree_check_minrecs(
	struct xchk_btree	*bs,
	int			level,
	struct xfs_btree_block	*block)
{
	struct xfs_btree_cur	*cur = bs->cur;
	unsigned int		root_level = cur->bc_nlevels - 1;
	unsigned int		numrecs = be16_to_cpu(block->bb_numrecs);

	/* At least minrecs records means the block is ok. */
	if (numrecs >= cur->bc_ops->get_minrecs(cur, level))
		return;

	/*
	 * For btrees rooted in the inode, it's possible that the root block
	 * contents spilled into a regular ondisk block because there wasn't
	 * enough space in the inode root.  The number of records in that
	 * child block might be less than the standard minrecs, but that's ok
	 * provided that there's only one direct child of the root.
	 */
	if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE &&
	    level == cur->bc_nlevels - 2) {
		struct xfs_btree_block	*root_block;
		struct xfs_buf		*root_bp;
		int			root_maxrecs;

		root_block = xfs_btree_get_block(cur, root_level, &root_bp);
		root_maxrecs = cur->bc_ops->get_dmaxrecs(cur, root_level);
		if (xchk_btree_check_iroot_minrecs(bs) &&
		    (be16_to_cpu(root_block->bb_numrecs) != 1 ||
		     numrecs <= root_maxrecs))
			xchk_btree_set_corrupt(bs->sc, cur, level);
		return;
	}

	/*
	 * Otherwise, only the root level is allowed to have fewer than minrecs
	 * records or keyptrs.
	 */
	if (level < root_level)
		xchk_btree_set_corrupt(bs->sc, cur, level);
}
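
/*
 * Concrete example of the spilled-root exception above: a two-level
 * inode-rooted bmbt whose root holds a single keyptr pointing at one
 * ondisk child block.  That child may legitimately hold fewer than
 * minrecs records, but only if it holds more records than would have
 * fit in the inode root itself; otherwise the contents should never
 * have been promoted out of the inode fork.
 */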

/*
 * If this btree block has a parent, make sure that the parent's keys capture
 * the keyspace contained in this block.
 */
STATIC void
xchk_btree_block_check_keys(
	struct xchk_btree	*bs,
	int			level,
	struct xfs_btree_block	*block)
{
	union xfs_btree_key	block_key;
	union xfs_btree_key	*block_high_key;
	union xfs_btree_key	*parent_low_key, *parent_high_key;
	struct xfs_btree_cur	*cur = bs->cur;
	struct xfs_btree_block	*parent_block;
	struct xfs_buf		*bp;

	if (level == cur->bc_nlevels - 1)
		return;

	xfs_btree_get_keys(cur, block, &block_key);

	/* Make sure the low key of this block matches the parent. */
	parent_block = xfs_btree_get_block(cur, level + 1, &bp);
	parent_low_key = xfs_btree_key_addr(cur, cur->bc_levels[level + 1].ptr,
			parent_block);
	if (xfs_btree_keycmp_ne(cur, &block_key, parent_low_key)) {
		xchk_btree_set_corrupt(bs->sc, bs->cur, level);
		return;
	}

	if (!(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING))
		return;

	/* Make sure the high key of this block matches the parent. */
	parent_high_key = xfs_btree_high_key_addr(cur,
			cur->bc_levels[level + 1].ptr, parent_block);
	block_high_key = xfs_btree_high_key_from_key(cur, &block_key);
	if (xfs_btree_keycmp_ne(cur, block_high_key, parent_high_key))
		xchk_btree_set_corrupt(bs->sc, bs->cur, level);
}

/*
 * Grab and scrub a btree block given a btree pointer.  Returns block
 * and buffer pointers (if applicable) if they're ok to use.
 */
STATIC int
xchk_btree_get_block(
	struct xchk_btree	*bs,
	int			level,
	union xfs_btree_ptr	*pp,
	struct xfs_btree_block	**pblock,
	struct xfs_buf		**pbp)
{
	int			error;

	*pblock = NULL;
	*pbp = NULL;

	error = xfs_btree_lookup_get_block(bs->cur, level, pp, pblock);
	if (!xchk_btree_process_error(bs->sc, bs->cur, level, &error) ||
	    !*pblock)
		return error;

	xfs_btree_get_block(bs->cur, level, pbp);
	if (__xfs_btree_check_block(bs->cur, *pblock, level, *pbp)) {
		xchk_btree_set_corrupt(bs->sc, bs->cur, level);
		return 0;
	}
	if (*pbp)
		xchk_buffer_recheck(bs->sc, *pbp);

	xchk_btree_check_minrecs(bs, level, *pblock);

	/*
	 * Check the block's owner; this function absorbs error codes
	 * for us.
	 */
	error = xchk_btree_check_owner(bs, level, *pbp);
	if (error)
		return error;

	/*
	 * Check the block's siblings; this function absorbs error codes
	 * for us.
	 */
	error = xchk_btree_block_check_siblings(bs, *pblock);
	if (error)
		return error;

	xchk_btree_block_check_keys(bs, level, *pblock);
	return 0;
}

/*
 * Check that the low and high keys of this block match the keys stored
 * in the parent block.
 */
STATIC void
xchk_btree_block_keys(
	struct xchk_btree	*bs,
	int			level,
	struct xfs_btree_block	*block)
{
	union xfs_btree_key	block_keys;
	struct xfs_btree_cur	*cur = bs->cur;
	union xfs_btree_key	*high_bk;
	union xfs_btree_key	*parent_keys;
	union xfs_btree_key	*high_pk;
	struct xfs_btree_block	*parent_block;
	struct xfs_buf		*bp;

	if (level >= cur->bc_nlevels - 1)
		return;

	/* Calculate the keys for this block. */
	xfs_btree_get_keys(cur, block, &block_keys);

	/* Obtain the parent's copy of the keys for this block. */
	parent_block = xfs_btree_get_block(cur, level + 1, &bp);
	parent_keys = xfs_btree_key_addr(cur, cur->bc_levels[level + 1].ptr,
			parent_block);

	if (xfs_btree_keycmp_ne(cur, &block_keys, parent_keys))
		xchk_btree_set_corrupt(bs->sc, cur, 1);

	if (!(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING))
		return;

	/* Get high keys */
	high_bk = xfs_btree_high_key_from_key(cur, &block_keys);
	high_pk = xfs_btree_high_key_addr(cur, cur->bc_levels[level + 1].ptr,
			parent_block);

	if (xfs_btree_keycmp_ne(cur, high_bk, high_pk))
		xchk_btree_set_corrupt(bs->sc, cur, 1);
}

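/*
 * Example invocation from a hypothetical per-AG scrubber; this is a sketch
 * of the calling convention only, and the record callback name is made up:
 *
 *	STATIC int
 *	xchk_foobt_rec(
 *		struct xchk_btree		*bs,
 *		const union xfs_btree_rec	*rec)
 *	{
 *		return 0;
 *	}
 *
 *	error = xchk_btree(sc, cur, xchk_foobt_rec, &XFS_RMAP_OINFO_AG, NULL);
 *
 * The callback runs once per leaf record; returning nonzero or setting a
 * corruption flag in sm_flags stops the walk.
 */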
/*
 * Visit all nodes and leaves of a btree.  Check that all pointers and
 * records are in order, that the keys reflect the records, and use a callback
 * so that the caller can verify individual records.
 */
int
xchk_btree(
	struct xfs_scrub		*sc,
	struct xfs_btree_cur		*cur,
	xchk_btree_rec_fn		scrub_fn,
	const struct xfs_owner_info	*oinfo,
	void				*private)
{
	union xfs_btree_ptr		ptr;
	struct xchk_btree		*bs;
	union xfs_btree_ptr		*pp;
	union xfs_btree_rec		*recp;
	struct xfs_btree_block		*block;
	struct xfs_buf			*bp;
	struct check_owner		*co;
	struct check_owner		*n;
	size_t				cur_sz;
	int				level;
	int				error = 0;

	/*
	 * Allocate the btree scrub context from the heap, because this
	 * structure can get rather large.  Don't let a caller feed us a
	 * totally absurd size.
	 */
	cur_sz = xchk_btree_sizeof(cur->bc_nlevels);
	if (cur_sz > PAGE_SIZE) {
		xchk_btree_set_corrupt(sc, cur, 0);
		return 0;
	}
	bs = kzalloc(cur_sz, XCHK_GFP_FLAGS);
	if (!bs)
		return -ENOMEM;
	bs->cur = cur;
	bs->scrub_rec = scrub_fn;
	bs->oinfo = oinfo;
	bs->private = private;
	bs->sc = sc;

	/* Initialize scrub state */
	INIT_LIST_HEAD(&bs->to_check);

	/*
	 * Load the root of the btree.  The helper function absorbs
	 * error codes for us.
	 */
	level = cur->bc_nlevels - 1;
	xfs_btree_init_ptr_from_cur(cur, &ptr);
	if (!xchk_btree_ptr_ok(bs, cur->bc_nlevels, &ptr))
		goto out;
	error = xchk_btree_get_block(bs, level, &ptr, &block, &bp);
	if (error || !block)
		goto out;

	cur->bc_levels[level].ptr = 1;

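	/*
	 * Iterative depth-first traversal: bc_levels[level].ptr is the
	 * 1-based position within the block at each level.  At level 0 we
	 * check and scrub each record; at higher levels we check each key
	 * and descend through its keyptr.  When a block is exhausted we
	 * verify its keys against the parent and pop back up a level.
	 */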
	while (level < cur->bc_nlevels) {
		block = xfs_btree_get_block(cur, level, &bp);

		if (level == 0) {
			/* End of leaf, pop back towards the root. */
			if (cur->bc_levels[level].ptr >
			    be16_to_cpu(block->bb_numrecs)) {
				xchk_btree_block_keys(bs, level, block);
				if (level < cur->bc_nlevels - 1)
					cur->bc_levels[level + 1].ptr++;
				level++;
				continue;
			}

			/* Records in order for scrub? */
			xchk_btree_rec(bs);

			/* Call out to the record checker. */
			recp = xfs_btree_rec_addr(cur, cur->bc_levels[0].ptr,
					block);
			error = bs->scrub_rec(bs, recp);
			if (error)
				break;
			if (xchk_should_terminate(sc, &error) ||
			    (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
				break;

			cur->bc_levels[level].ptr++;
			continue;
		}

		/* End of node, pop back towards the root. */
		if (cur->bc_levels[level].ptr >
					be16_to_cpu(block->bb_numrecs)) {
			xchk_btree_block_keys(bs, level, block);
			if (level < cur->bc_nlevels - 1)
				cur->bc_levels[level + 1].ptr++;
			level++;
			continue;
		}

		/* Keys in order for scrub? */
		xchk_btree_key(bs, level);

		/* Drill another level deeper. */
		pp = xfs_btree_ptr_addr(cur, cur->bc_levels[level].ptr, block);
		if (!xchk_btree_ptr_ok(bs, level, pp)) {
			cur->bc_levels[level].ptr++;
			continue;
		}
		level--;
		error = xchk_btree_get_block(bs, level, pp, &block, &bp);
		if (error || !block)
			goto out;

		cur->bc_levels[level].ptr = 1;
	}

out:
	/* Process deferred owner checks on btree blocks. */
	list_for_each_entry_safe(co, n, &bs->to_check, list) {
		if (!error && bs->cur)
			error = xchk_btree_check_block_owner(bs, co->level,
					co->daddr);
		list_del(&co->list);
		kfree(co);
	}
	kfree(bs);

	return error;
}