1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * Copyright (c) 2020-2024 Oracle.  All Rights Reserved.
4 * Author: Darrick J. Wong <djwong@kernel.org>
5 */
6#include "xfs.h"
7#include "xfs_fs.h"
8#include "xfs_shared.h"
9#include "xfs_format.h"
10#include "xfs_trans_resv.h"
11#include "xfs_mount.h"
12#include "xfs_defer.h"
13#include "xfs_bit.h"
14#include "xfs_log_format.h"
15#include "xfs_trans.h"
16#include "xfs_sb.h"
17#include "xfs_inode.h"
18#include "xfs_icache.h"
19#include "xfs_da_format.h"
20#include "xfs_da_btree.h"
21#include "xfs_dir2.h"
22#include "xfs_dir2_priv.h"
23#include "xfs_bmap.h"
24#include "xfs_quota.h"
25#include "xfs_bmap_btree.h"
26#include "xfs_trans_space.h"
27#include "xfs_bmap_util.h"
28#include "xfs_exchmaps.h"
29#include "xfs_exchrange.h"
30#include "xfs_ag.h"
31#include "xfs_parent.h"
32#include "scrub/xfs_scrub.h"
33#include "scrub/scrub.h"
34#include "scrub/common.h"
35#include "scrub/trace.h"
36#include "scrub/repair.h"
37#include "scrub/tempfile.h"
38#include "scrub/tempexch.h"
39#include "scrub/xfile.h"
40#include "scrub/xfarray.h"
41#include "scrub/xfblob.h"
42#include "scrub/iscan.h"
43#include "scrub/readdir.h"
44#include "scrub/reap.h"
45#include "scrub/findparent.h"
46#include "scrub/orphanage.h"
47#include "scrub/listxattr.h"
48
49/*
50 * Directory Repair
51 * ================
52 *
53 * We repair directories by reading the directory data blocks looking for
54 * directory entries that look salvageable (name passes verifiers, entry points
55 * to a valid allocated inode, etc).  Each entry worth salvaging is stashed in
56 * memory, and the stashed entries are periodically replayed into a temporary
57 * directory to constrain memory use.  Batching the construction of the
58 * temporary directory in this fashion reduces lock cycling of the directory
59 * being repaired and the temporary directory, and will later become important
60 * for parent pointer scanning.
61 *
62 * If parent pointers are enabled on this filesystem, we instead reconstruct
63 * the directory by visiting each parent pointer of each file in the filesystem
64 * and translating the relevant parent pointer records into dirents.  In this
65 * case, it is advantageous to stash all directory entries created from parent
66 * pointers for a single child file before replaying them into the temporary
67 * directory.  To save memory, the live filesystem scan reuses the findparent
68 * fields.  Directory repair chooses either parent pointer scanning or
69 * directory entry salvaging, but not both.
70 *
71 * Directory entries added to the temporary directory do not elevate the link
72 * counts of the inodes found.  When salvaging completes, the remaining stashed
73 * entries are replayed to the temporary directory.  An atomic mapping exchange
74 * is used to commit the new directory blocks to the directory being repaired.
75 * This will disrupt readdir cursors.
76 *
77 * Locking Issues
78 * --------------
79 *
80 * If /a, /a/b, and /c are all directories, the VFS does not take i_rwsem on
81 * /a/b for a "mv /a/b /c/" operation.  This means that only b's ILOCK protects
82 * b's dotdot update.  This is in contrast to every other dotdot update (link,
83 * remove, mkdir).  If the repair code drops the ILOCK, it must either
84 * revalidate the dotdot entry or use dirent hooks to capture updates from
85 * other threads.
86 */
87
/*
 * Values for xrep_dirent.action, which record what a stashed dirent update
 * should do when it is replayed.
 */

/* Create a dirent in the tempdir. */
#define XREP_DIRENT_ADD		(1)

/* Remove a dirent from the tempdir. */
#define XREP_DIRENT_REMOVE	(2)
93
/*
 * Directory entry to be restored in the new directory.  Records of this type
 * are appended to xrep_dir.dir_entries; the name itself is stored separately
 * in xrep_dir.dir_names and is retrieved with @name_cookie.
 */
struct xrep_dirent {
	/* Cookie for retrieval of the dirent name. */
	xfblob_cookie		name_cookie;

	/* Target inode number. */
	xfs_ino_t		ino;

	/* Length of the dirent name. */
	uint8_t			namelen;

	/* File type of the dirent. */
	uint8_t			ftype;

	/* XREP_DIRENT_{ADD,REMOVE} */
	uint8_t			action;
};
111
/*
 * Stash up to 8 pages of recovered dirent data in dir_entries and dir_names
 * before we write them to the temp dir.  xrep_dir_want_flush_stashed compares
 * the combined size of both structures against this threshold.
 */
#define XREP_DIR_MAX_STASH_BYTES	(PAGE_SIZE * 8)
117
/* In-core state for one directory repair; hung off of sc->buf. */
struct xrep_dir {
	/* Scrub context that owns this repair. */
	struct xfs_scrub	*sc;

	/* Fixed-size array of xrep_dirent structures. */
	struct xfarray		*dir_entries;

	/* Blobs containing directory entry names. */
	struct xfblob		*dir_names;

	/* Information for exchanging data forks at the end. */
	struct xrep_tempexch	tx;

	/* Preallocated args struct for performing dir operations */
	struct xfs_da_args	args;

	/*
	 * Information used to scan the filesystem to find the inumber of the
	 * dotdot entry for this directory.  For directory salvaging when
	 * parent pointers are not enabled, we use the findparent_* functions
	 * on this object and access only the parent_ino field directly.
	 *
	 * When parent pointers are enabled, however, the pptr scanner uses the
	 * iscan, hooks, lock, and parent_ino fields of this object directly.
	 * @pscan.lock coordinates access to dir_entries, dir_names,
	 * parent_ino, subdirs, dirents, and args.  This reduces the memory
	 * requirements of this structure.
	 */
	struct xrep_parent_scan_info pscan;

	/*
	 * Context information for attaching this directory to the lost+found
	 * if this directory does not have a parent.
	 */
	struct xrep_adoption	adoption;

	/* How many subdirectories did we find? */
	uint64_t		subdirs;

	/* How many dirents did we find? */
	unsigned int		dirents;

	/* Should we move this directory to the orphanage? */
	bool			needs_adoption;

	/*
	 * Directory entry name, plus the trailing null.  @xname.name is
	 * pointed at @namebuf when the structure is set up.
	 */
	struct xfs_name		xname;
	unsigned char		namebuf[MAXNAMELEN];
};
166
167/* Tear down all the incore stuff we created. */
168static void
169xrep_dir_teardown(
170	struct xfs_scrub	*sc)
171{
172	struct xrep_dir		*rd = sc->buf;
173
174	xrep_findparent_scan_teardown(&rd->pscan);
175	xfblob_destroy(rd->dir_names);
176	xfarray_destroy(rd->dir_entries);
177}
178
179/* Set up for a directory repair. */
180int
181xrep_setup_directory(
182	struct xfs_scrub	*sc)
183{
184	struct xrep_dir		*rd;
185	int			error;
186
187	xchk_fsgates_enable(sc, XCHK_FSGATES_DIRENTS);
188
189	error = xrep_orphanage_try_create(sc);
190	if (error)
191		return error;
192
193	error = xrep_tempfile_create(sc, S_IFDIR);
194	if (error)
195		return error;
196
197	rd = kvzalloc(sizeof(struct xrep_dir), XCHK_GFP_FLAGS);
198	if (!rd)
199		return -ENOMEM;
200	rd->sc = sc;
201	rd->xname.name = rd->namebuf;
202	sc->buf = rd;
203
204	return 0;
205}
206
207/*
208 * Look up the dotdot entry and confirm that it's really the parent.
209 * Returns NULLFSINO if we don't know what to do.
210 */
211static inline xfs_ino_t
212xrep_dir_lookup_parent(
213	struct xrep_dir		*rd)
214{
215	struct xfs_scrub	*sc = rd->sc;
216	xfs_ino_t		ino;
217	int			error;
218
219	error = xfs_dir_lookup(sc->tp, sc->ip, &xfs_name_dotdot, &ino, NULL);
220	if (error)
221		return NULLFSINO;
222	if (!xfs_verify_dir_ino(sc->mp, ino))
223		return NULLFSINO;
224
225	error = xrep_findparent_confirm(sc, &ino);
226	if (error)
227		return NULLFSINO;
228
229	return ino;
230}
231
232/*
233 * Look up '..' in the dentry cache and confirm that it's really the parent.
234 * Returns NULLFSINO if the dcache misses or if the hit is implausible.
235 */
236static inline xfs_ino_t
237xrep_dir_dcache_parent(
238	struct xrep_dir		*rd)
239{
240	struct xfs_scrub	*sc = rd->sc;
241	xfs_ino_t		parent_ino;
242	int			error;
243
244	parent_ino = xrep_findparent_from_dcache(sc);
245	if (parent_ino == NULLFSINO)
246		return parent_ino;
247
248	error = xrep_findparent_confirm(sc, &parent_ino);
249	if (error)
250		return NULLFSINO;
251
252	return parent_ino;
253}
254
255/* Try to find the parent of the directory being repaired. */
256STATIC int
257xrep_dir_find_parent(
258	struct xrep_dir		*rd)
259{
260	xfs_ino_t		ino;
261
262	ino = xrep_findparent_self_reference(rd->sc);
263	if (ino != NULLFSINO) {
264		xrep_findparent_scan_finish_early(&rd->pscan, ino);
265		return 0;
266	}
267
268	ino = xrep_dir_dcache_parent(rd);
269	if (ino != NULLFSINO) {
270		xrep_findparent_scan_finish_early(&rd->pscan, ino);
271		return 0;
272	}
273
274	ino = xrep_dir_lookup_parent(rd);
275	if (ino != NULLFSINO) {
276		xrep_findparent_scan_finish_early(&rd->pscan, ino);
277		return 0;
278	}
279
280	/*
281	 * A full filesystem scan is the last resort.  On a busy filesystem,
282	 * the scan can fail with -EBUSY if we cannot grab IOLOCKs.  That means
283	 * that we don't know what who the parent is, so we should return to
284	 * userspace.
285	 */
286	return xrep_findparent_scan(&rd->pscan);
287}
288
289/*
290 * Decide if we want to salvage this entry.  We don't bother with oversized
291 * names or the dot entry.
292 */
293STATIC int
294xrep_dir_want_salvage(
295	struct xrep_dir		*rd,
296	const char		*name,
297	int			namelen,
298	xfs_ino_t		ino)
299{
300	struct xfs_mount	*mp = rd->sc->mp;
301
302	/* No pointers to ourselves or to garbage. */
303	if (ino == rd->sc->ip->i_ino)
304		return false;
305	if (!xfs_verify_dir_ino(mp, ino))
306		return false;
307
308	/* No weird looking names or dot entries. */
309	if (namelen >= MAXNAMELEN || namelen <= 0)
310		return false;
311	if (namelen == 1 && name[0] == '.')
312		return false;
313	if (!xfs_dir2_namecheck(name, namelen))
314		return false;
315
316	return true;
317}
318
319/*
320 * Remember that we want to create a dirent in the tempdir.  These stashed
321 * actions will be replayed later.
322 */
323STATIC int
324xrep_dir_stash_createname(
325	struct xrep_dir		*rd,
326	const struct xfs_name	*name,
327	xfs_ino_t		ino)
328{
329	struct xrep_dirent	dirent = {
330		.action		= XREP_DIRENT_ADD,
331		.ino		= ino,
332		.namelen	= name->len,
333		.ftype		= name->type,
334	};
335	int			error;
336
337	trace_xrep_dir_stash_createname(rd->sc->tempip, name, ino);
338
339	error = xfblob_storename(rd->dir_names, &dirent.name_cookie, name);
340	if (error)
341		return error;
342
343	return xfarray_append(rd->dir_entries, &dirent);
344}
345
346/*
347 * Remember that we want to remove a dirent from the tempdir.  These stashed
348 * actions will be replayed later.
349 */
350STATIC int
351xrep_dir_stash_removename(
352	struct xrep_dir		*rd,
353	const struct xfs_name	*name,
354	xfs_ino_t		ino)
355{
356	struct xrep_dirent	dirent = {
357		.action		= XREP_DIRENT_REMOVE,
358		.ino		= ino,
359		.namelen	= name->len,
360		.ftype		= name->type,
361	};
362	int			error;
363
364	trace_xrep_dir_stash_removename(rd->sc->tempip, name, ino);
365
366	error = xfblob_storename(rd->dir_names, &dirent.name_cookie, name);
367	if (error)
368		return error;
369
370	return xfarray_append(rd->dir_entries, &dirent);
371}
372
373/* Allocate an in-core record to hold entries while we rebuild the dir data. */
374STATIC int
375xrep_dir_salvage_entry(
376	struct xrep_dir		*rd,
377	unsigned char		*name,
378	unsigned int		namelen,
379	xfs_ino_t		ino)
380{
381	struct xfs_name		xname = {
382		.name		= name,
383	};
384	struct xfs_scrub	*sc = rd->sc;
385	struct xfs_inode	*ip;
386	unsigned int		i = 0;
387	int			error = 0;
388
389	if (xchk_should_terminate(sc, &error))
390		return error;
391
392	/*
393	 * Truncate the name to the first character that would trip namecheck.
394	 * If we no longer have a name after that, ignore this entry.
395	 */
396	while (i < namelen && name[i] != 0 && name[i] != '/')
397		i++;
398	if (i == 0)
399		return 0;
400	xname.len = i;
401
402	/* Ignore '..' entries; we already picked the new parent. */
403	if (xname.len == 2 && name[0] == '.' && name[1] == '.') {
404		trace_xrep_dir_salvaged_parent(sc->ip, ino);
405		return 0;
406	}
407
408	trace_xrep_dir_salvage_entry(sc->ip, &xname, ino);
409
410	/*
411	 * Compute the ftype or dump the entry if we can't.  We don't lock the
412	 * inode because inodes can't change type while we have a reference.
413	 */
414	error = xchk_iget(sc, ino, &ip);
415	if (error)
416		return 0;
417
418	xname.type = xfs_mode_to_ftype(VFS_I(ip)->i_mode);
419	xchk_irele(sc, ip);
420
421	return xrep_dir_stash_createname(rd, &xname, ino);
422}
423
424/* Record a shortform directory entry for later reinsertion. */
425STATIC int
426xrep_dir_salvage_sf_entry(
427	struct xrep_dir			*rd,
428	struct xfs_dir2_sf_hdr		*sfp,
429	struct xfs_dir2_sf_entry	*sfep)
430{
431	xfs_ino_t			ino;
432
433	ino = xfs_dir2_sf_get_ino(rd->sc->mp, sfp, sfep);
434	if (!xrep_dir_want_salvage(rd, sfep->name, sfep->namelen, ino))
435		return 0;
436
437	return xrep_dir_salvage_entry(rd, sfep->name, sfep->namelen, ino);
438}
439
440/* Record a regular directory entry for later reinsertion. */
441STATIC int
442xrep_dir_salvage_data_entry(
443	struct xrep_dir			*rd,
444	struct xfs_dir2_data_entry	*dep)
445{
446	xfs_ino_t			ino;
447
448	ino = be64_to_cpu(dep->inumber);
449	if (!xrep_dir_want_salvage(rd, dep->name, dep->namelen, ino))
450		return 0;
451
452	return xrep_dir_salvage_entry(rd, dep->name, dep->namelen, ino);
453}
454
455/* Try to recover block/data format directory entries. */
456STATIC int
457xrep_dir_recover_data(
458	struct xrep_dir		*rd,
459	struct xfs_buf		*bp)
460{
461	struct xfs_da_geometry	*geo = rd->sc->mp->m_dir_geo;
462	unsigned int		offset;
463	unsigned int		end;
464	int			error = 0;
465
466	/*
467	 * Loop over the data portion of the block.
468	 * Each object is a real entry (dep) or an unused one (dup).
469	 */
470	offset = geo->data_entry_offset;
471	end = min_t(unsigned int, BBTOB(bp->b_length),
472			xfs_dir3_data_end_offset(geo, bp->b_addr));
473
474	while (offset < end) {
475		struct xfs_dir2_data_unused	*dup = bp->b_addr + offset;
476		struct xfs_dir2_data_entry	*dep = bp->b_addr + offset;
477
478		if (xchk_should_terminate(rd->sc, &error))
479			return error;
480
481		/* Skip unused entries. */
482		if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
483			offset += be16_to_cpu(dup->length);
484			continue;
485		}
486
487		/* Don't walk off the end of the block. */
488		offset += xfs_dir2_data_entsize(rd->sc->mp, dep->namelen);
489		if (offset > end)
490			break;
491
492		/* Ok, let's save this entry. */
493		error = xrep_dir_salvage_data_entry(rd, dep);
494		if (error)
495			return error;
496
497	}
498
499	return 0;
500}
501
502/* Try to recover shortform directory entries. */
503STATIC int
504xrep_dir_recover_sf(
505	struct xrep_dir			*rd)
506{
507	struct xfs_dir2_sf_hdr		*hdr;
508	struct xfs_dir2_sf_entry	*sfep;
509	struct xfs_dir2_sf_entry	*next;
510	struct xfs_ifork		*ifp;
511	xfs_ino_t			ino;
512	unsigned char			*end;
513	int				error = 0;
514
515	ifp = xfs_ifork_ptr(rd->sc->ip, XFS_DATA_FORK);
516	hdr = ifp->if_data;
517	end = (unsigned char *)ifp->if_data + ifp->if_bytes;
518
519	ino = xfs_dir2_sf_get_parent_ino(hdr);
520	trace_xrep_dir_salvaged_parent(rd->sc->ip, ino);
521
522	sfep = xfs_dir2_sf_firstentry(hdr);
523	while ((unsigned char *)sfep < end) {
524		if (xchk_should_terminate(rd->sc, &error))
525			return error;
526
527		next = xfs_dir2_sf_nextentry(rd->sc->mp, hdr, sfep);
528		if ((unsigned char *)next > end)
529			break;
530
531		/* Ok, let's save this entry. */
532		error = xrep_dir_salvage_sf_entry(rd, hdr, sfep);
533		if (error)
534			return error;
535
536		sfep = next;
537	}
538
539	return 0;
540}
541
542/*
543 * Try to figure out the format of this directory from the data fork mappings
544 * and the directory size.  If we can be reasonably sure of format, we can be
545 * more aggressive in salvaging directory entries.  On return, @magic_guess
546 * will be set to DIR3_BLOCK_MAGIC if we think this is a "block format"
547 * directory; DIR3_DATA_MAGIC if we think this is a "data format" directory,
548 * and 0 if we can't tell.
549 */
550STATIC void
551xrep_dir_guess_format(
552	struct xrep_dir		*rd,
553	__be32			*magic_guess)
554{
555	struct xfs_inode	*dp = rd->sc->ip;
556	struct xfs_mount	*mp = rd->sc->mp;
557	struct xfs_da_geometry	*geo = mp->m_dir_geo;
558	xfs_fileoff_t		last;
559	int			error;
560
561	ASSERT(xfs_has_crc(mp));
562
563	*magic_guess = 0;
564
565	/*
566	 * If there's a single directory block and the directory size is
567	 * exactly one block, this has to be a single block format directory.
568	 */
569	error = xfs_bmap_last_offset(dp, &last, XFS_DATA_FORK);
570	if (!error && XFS_FSB_TO_B(mp, last) == geo->blksize &&
571	    dp->i_disk_size == geo->blksize) {
572		*magic_guess = cpu_to_be32(XFS_DIR3_BLOCK_MAGIC);
573		return;
574	}
575
576	/*
577	 * If the last extent before the leaf offset matches the directory
578	 * size and the directory size is larger than 1 block, this is a
579	 * data format directory.
580	 */
581	last = geo->leafblk;
582	error = xfs_bmap_last_before(rd->sc->tp, dp, &last, XFS_DATA_FORK);
583	if (!error &&
584	    XFS_FSB_TO_B(mp, last) > geo->blksize &&
585	    XFS_FSB_TO_B(mp, last) == dp->i_disk_size) {
586		*magic_guess = cpu_to_be32(XFS_DIR3_DATA_MAGIC);
587		return;
588	}
589}
590
591/* Recover directory entries from a specific directory block. */
592STATIC int
593xrep_dir_recover_dirblock(
594	struct xrep_dir		*rd,
595	__be32			magic_guess,
596	xfs_dablk_t		dabno)
597{
598	struct xfs_dir2_data_hdr *hdr;
599	struct xfs_buf		*bp;
600	__be32			oldmagic;
601	int			error;
602
603	/*
604	 * Try to read buffer.  We invalidate them in the next step so we don't
605	 * bother to set a buffer type or ops.
606	 */
607	error = xfs_da_read_buf(rd->sc->tp, rd->sc->ip, dabno,
608			XFS_DABUF_MAP_HOLE_OK, &bp, XFS_DATA_FORK, NULL);
609	if (error || !bp)
610		return error;
611
612	hdr = bp->b_addr;
613	oldmagic = hdr->magic;
614
615	trace_xrep_dir_recover_dirblock(rd->sc->ip, dabno,
616			be32_to_cpu(hdr->magic), be32_to_cpu(magic_guess));
617
618	/*
619	 * If we're sure of the block's format, proceed with the salvage
620	 * operation using the specified magic number.
621	 */
622	if (magic_guess) {
623		hdr->magic = magic_guess;
624		goto recover;
625	}
626
627	/*
628	 * If we couldn't guess what type of directory this is, then we will
629	 * only salvage entries from directory blocks that match the magic
630	 * number and pass verifiers.
631	 */
632	switch (hdr->magic) {
633	case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC):
634	case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC):
635		if (!xrep_buf_verify_struct(bp, &xfs_dir3_block_buf_ops))
636			goto out;
637		if (xfs_dir3_block_header_check(bp, rd->sc->ip->i_ino) != NULL)
638			goto out;
639		break;
640	case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
641	case cpu_to_be32(XFS_DIR3_DATA_MAGIC):
642		if (!xrep_buf_verify_struct(bp, &xfs_dir3_data_buf_ops))
643			goto out;
644		if (xfs_dir3_data_header_check(bp, rd->sc->ip->i_ino) != NULL)
645			goto out;
646		break;
647	default:
648		goto out;
649	}
650
651recover:
652	error = xrep_dir_recover_data(rd, bp);
653
654out:
655	hdr->magic = oldmagic;
656	xfs_trans_brelse(rd->sc->tp, bp);
657	return error;
658}
659
660static inline void
661xrep_dir_init_args(
662	struct xrep_dir		*rd,
663	struct xfs_inode	*dp,
664	const struct xfs_name	*name)
665{
666	memset(&rd->args, 0, sizeof(struct xfs_da_args));
667	rd->args.geo = rd->sc->mp->m_dir_geo;
668	rd->args.whichfork = XFS_DATA_FORK;
669	rd->args.owner = rd->sc->ip->i_ino;
670	rd->args.trans = rd->sc->tp;
671	rd->args.dp = dp;
672	if (!name)
673		return;
674	rd->args.name = name->name;
675	rd->args.namelen = name->len;
676	rd->args.filetype = name->type;
677	rd->args.hashval = xfs_dir2_hashname(rd->sc->mp, name);
678}
679
680/* Replay a stashed createname into the temporary directory. */
681STATIC int
682xrep_dir_replay_createname(
683	struct xrep_dir		*rd,
684	const struct xfs_name	*name,
685	xfs_ino_t		inum,
686	xfs_extlen_t		total)
687{
688	struct xfs_scrub	*sc = rd->sc;
689	struct xfs_inode	*dp = rd->sc->tempip;
690	int			error;
691
692	ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
693
694	error = xfs_dir_ino_validate(sc->mp, inum);
695	if (error)
696		return error;
697
698	trace_xrep_dir_replay_createname(dp, name, inum);
699
700	xrep_dir_init_args(rd, dp, name);
701	rd->args.inumber = inum;
702	rd->args.total = total;
703	rd->args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
704	return xfs_dir_createname_args(&rd->args);
705}
706
707/* Replay a stashed removename onto the temporary directory. */
708STATIC int
709xrep_dir_replay_removename(
710	struct xrep_dir		*rd,
711	const struct xfs_name	*name,
712	xfs_extlen_t		total)
713{
714	struct xfs_inode	*dp = rd->args.dp;
715
716	ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
717
718	xrep_dir_init_args(rd, dp, name);
719	rd->args.op_flags = 0;
720	rd->args.total = total;
721
722	trace_xrep_dir_replay_removename(dp, name, 0);
723	return xfs_dir_removename_args(&rd->args);
724}
725
726/*
727 * Add this stashed incore directory entry to the temporary directory.
728 * The caller must hold the tempdir's IOLOCK, must not hold any ILOCKs, and
729 * must not be in transaction context.
730 */
731STATIC int
732xrep_dir_replay_update(
733	struct xrep_dir			*rd,
734	const struct xfs_name		*xname,
735	const struct xrep_dirent	*dirent)
736{
737	struct xfs_mount		*mp = rd->sc->mp;
738#ifdef DEBUG
739	xfs_ino_t			ino;
740#endif
741	uint				resblks;
742	int				error;
743
744	resblks = xfs_link_space_res(mp, xname->len);
745	error = xchk_trans_alloc(rd->sc, resblks);
746	if (error)
747		return error;
748
749	/* Lock the temporary directory and join it to the transaction */
750	xrep_tempfile_ilock(rd->sc);
751	xfs_trans_ijoin(rd->sc->tp, rd->sc->tempip, 0);
752
753	switch (dirent->action) {
754	case XREP_DIRENT_ADD:
755		/*
756		 * Create a replacement dirent in the temporary directory.
757		 * Note that _createname doesn't check for existing entries.
758		 * There shouldn't be any in the temporary dir, but we'll
759		 * verify this in debug mode.
760		 */
761#ifdef DEBUG
762		error = xchk_dir_lookup(rd->sc, rd->sc->tempip, xname, &ino);
763		if (error != -ENOENT) {
764			ASSERT(error != -ENOENT);
765			goto out_cancel;
766		}
767#endif
768
769		error = xrep_dir_replay_createname(rd, xname, dirent->ino,
770				resblks);
771		if (error)
772			goto out_cancel;
773
774		if (xname->type == XFS_DIR3_FT_DIR)
775			rd->subdirs++;
776		rd->dirents++;
777		break;
778	case XREP_DIRENT_REMOVE:
779		/*
780		 * Remove a dirent from the temporary directory.  Note that
781		 * _removename doesn't check the inode target of the exist
782		 * entry.  There should be a perfect match in the temporary
783		 * dir, but we'll verify this in debug mode.
784		 */
785#ifdef DEBUG
786		error = xchk_dir_lookup(rd->sc, rd->sc->tempip, xname, &ino);
787		if (error) {
788			ASSERT(error != 0);
789			goto out_cancel;
790		}
791		if (ino != dirent->ino) {
792			ASSERT(ino == dirent->ino);
793			error = -EIO;
794			goto out_cancel;
795		}
796#endif
797
798		error = xrep_dir_replay_removename(rd, xname, resblks);
799		if (error)
800			goto out_cancel;
801
802		if (xname->type == XFS_DIR3_FT_DIR)
803			rd->subdirs--;
804		rd->dirents--;
805		break;
806	default:
807		ASSERT(0);
808		error = -EIO;
809		goto out_cancel;
810	}
811
812	/* Commit and unlock. */
813	error = xrep_trans_commit(rd->sc);
814	if (error)
815		return error;
816
817	xrep_tempfile_iunlock(rd->sc);
818	return 0;
819out_cancel:
820	xchk_trans_cancel(rd->sc);
821	xrep_tempfile_iunlock(rd->sc);
822	return error;
823}
824
825/*
826 * Flush stashed incore dirent updates that have been recorded by the scanner.
827 * This is done to reduce the memory requirements of the directory rebuild,
828 * since directories can contain up to 32GB of directory data.
829 *
830 * Caller must not hold transactions or ILOCKs.  Caller must hold the tempdir
831 * IOLOCK.
832 */
833STATIC int
834xrep_dir_replay_updates(
835	struct xrep_dir		*rd)
836{
837	xfarray_idx_t		array_cur;
838	int			error;
839
840	/* Add all the salvaged dirents to the temporary directory. */
841	mutex_lock(&rd->pscan.lock);
842	foreach_xfarray_idx(rd->dir_entries, array_cur) {
843		struct xrep_dirent	dirent;
844
845		error = xfarray_load(rd->dir_entries, array_cur, &dirent);
846		if (error)
847			goto out_unlock;
848
849		error = xfblob_loadname(rd->dir_names, dirent.name_cookie,
850				&rd->xname, dirent.namelen);
851		if (error)
852			goto out_unlock;
853		rd->xname.type = dirent.ftype;
854		mutex_unlock(&rd->pscan.lock);
855
856		error = xrep_dir_replay_update(rd, &rd->xname, &dirent);
857		if (error)
858			return error;
859		mutex_lock(&rd->pscan.lock);
860	}
861
862	/* Empty out both arrays now that we've added the entries. */
863	xfarray_truncate(rd->dir_entries);
864	xfblob_truncate(rd->dir_names);
865	mutex_unlock(&rd->pscan.lock);
866	return 0;
867out_unlock:
868	mutex_unlock(&rd->pscan.lock);
869	return error;
870}
871
872/*
873 * Periodically flush stashed directory entries to the temporary dir.  This
874 * is done to reduce the memory requirements of the directory rebuild, since
875 * directories can contain up to 32GB of directory data.
876 */
877STATIC int
878xrep_dir_flush_stashed(
879	struct xrep_dir		*rd)
880{
881	int			error;
882
883	/*
884	 * Entering this function, the scrub context has a reference to the
885	 * inode being repaired, the temporary file, and a scrub transaction
886	 * that we use during dirent salvaging to avoid livelocking if there
887	 * are cycles in the directory structures.  We hold ILOCK_EXCL on both
888	 * the inode being repaired and the temporary file, though they are
889	 * not ijoined to the scrub transaction.
890	 *
891	 * To constrain kernel memory use, we occasionally write salvaged
892	 * dirents from the xfarray and xfblob structures into the temporary
893	 * directory in preparation for exchanging the directory structures at
894	 * the end.  Updating the temporary file requires a transaction, so we
895	 * commit the scrub transaction and drop the two ILOCKs so that
896	 * we can allocate whatever transaction we want.
897	 *
898	 * We still hold IOLOCK_EXCL on the inode being repaired, which
899	 * prevents anyone from accessing the damaged directory data while we
900	 * repair it.
901	 */
902	error = xrep_trans_commit(rd->sc);
903	if (error)
904		return error;
905	xchk_iunlock(rd->sc, XFS_ILOCK_EXCL);
906
907	/*
908	 * Take the IOLOCK of the temporary file while we modify dirents.  This
909	 * isn't strictly required because the temporary file is never revealed
910	 * to userspace, but we follow the same locking rules.  We still hold
911	 * sc->ip's IOLOCK.
912	 */
913	error = xrep_tempfile_iolock_polled(rd->sc);
914	if (error)
915		return error;
916
917	/* Write to the tempdir all the updates that we've stashed. */
918	error = xrep_dir_replay_updates(rd);
919	xrep_tempfile_iounlock(rd->sc);
920	if (error)
921		return error;
922
923	/*
924	 * Recreate the salvage transaction and relock the dir we're salvaging.
925	 */
926	error = xchk_trans_alloc(rd->sc, 0);
927	if (error)
928		return error;
929	xchk_ilock(rd->sc, XFS_ILOCK_EXCL);
930	return 0;
931}
932
933/* Decide if we've stashed too much dirent data in memory. */
934static inline bool
935xrep_dir_want_flush_stashed(
936	struct xrep_dir		*rd)
937{
938	unsigned long long	bytes;
939
940	bytes = xfarray_bytes(rd->dir_entries) + xfblob_bytes(rd->dir_names);
941	return bytes > XREP_DIR_MAX_STASH_BYTES;
942}
943
944/* Extract as many directory entries as we can. */
945STATIC int
946xrep_dir_recover(
947	struct xrep_dir		*rd)
948{
949	struct xfs_bmbt_irec	got;
950	struct xfs_scrub	*sc = rd->sc;
951	struct xfs_da_geometry	*geo = sc->mp->m_dir_geo;
952	xfs_fileoff_t		offset;
953	xfs_dablk_t		dabno;
954	__be32			magic_guess;
955	int			nmap;
956	int			error;
957
958	xrep_dir_guess_format(rd, &magic_guess);
959
960	/* Iterate each directory data block in the data fork. */
961	for (offset = 0;
962	     offset < geo->leafblk;
963	     offset = got.br_startoff + got.br_blockcount) {
964		nmap = 1;
965		error = xfs_bmapi_read(sc->ip, offset, geo->leafblk - offset,
966				&got, &nmap, 0);
967		if (error)
968			return error;
969		if (nmap != 1)
970			return -EFSCORRUPTED;
971		if (!xfs_bmap_is_written_extent(&got))
972			continue;
973
974		for (dabno = round_up(got.br_startoff, geo->fsbcount);
975		     dabno < got.br_startoff + got.br_blockcount;
976		     dabno += geo->fsbcount) {
977			if (xchk_should_terminate(rd->sc, &error))
978				return error;
979
980			error = xrep_dir_recover_dirblock(rd,
981					magic_guess, dabno);
982			if (error)
983				return error;
984
985			/* Flush dirents to constrain memory usage. */
986			if (xrep_dir_want_flush_stashed(rd)) {
987				error = xrep_dir_flush_stashed(rd);
988				if (error)
989					return error;
990			}
991		}
992	}
993
994	return 0;
995}
996
997/*
998 * Find all the directory entries for this inode by scraping them out of the
999 * directory leaf blocks by hand, and flushing them into the temp dir.
1000 */
1001STATIC int
1002xrep_dir_find_entries(
1003	struct xrep_dir		*rd)
1004{
1005	struct xfs_inode	*dp = rd->sc->ip;
1006	int			error;
1007
1008	/*
1009	 * Salvage directory entries from the old directory, and write them to
1010	 * the temporary directory.
1011	 */
1012	if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
1013		error = xrep_dir_recover_sf(rd);
1014	} else {
1015		error = xfs_iread_extents(rd->sc->tp, dp, XFS_DATA_FORK);
1016		if (error)
1017			return error;
1018
1019		error = xrep_dir_recover(rd);
1020	}
1021	if (error)
1022		return error;
1023
1024	return xrep_dir_flush_stashed(rd);
1025}
1026
1027/* Scan all files in the filesystem for dirents. */
1028STATIC int
1029xrep_dir_salvage_entries(
1030	struct xrep_dir		*rd)
1031{
1032	struct xfs_scrub	*sc = rd->sc;
1033	int			error;
1034
1035	/*
1036	 * Drop the ILOCK on this directory so that we can scan for this
1037	 * directory's parent.  Figure out who is going to be the parent of
1038	 * this directory, then retake the ILOCK so that we can salvage
1039	 * directory entries.
1040	 */
1041	xchk_iunlock(sc, XFS_ILOCK_EXCL);
1042	error = xrep_dir_find_parent(rd);
1043	xchk_ilock(sc, XFS_ILOCK_EXCL);
1044	if (error)
1045		return error;
1046
1047	/*
1048	 * Collect directory entries by parsing raw leaf blocks to salvage
1049	 * whatever we can.  When we're done, free the staging memory before
1050	 * exchanging the directories to reduce memory usage.
1051	 */
1052	error = xrep_dir_find_entries(rd);
1053	if (error)
1054		return error;
1055
1056	/*
1057	 * Cancel the repair transaction and drop the ILOCK so that we can
1058	 * (later) use the atomic mapping exchange functions to compute the
1059	 * correct block reservations and re-lock the inodes.
1060	 *
1061	 * We still hold IOLOCK_EXCL (aka i_rwsem) which will prevent directory
1062	 * modifications, but there's nothing to prevent userspace from reading
1063	 * the directory until we're ready for the exchange operation.  Reads
1064	 * will return -EIO without shutting down the fs, so we're ok with
1065	 * that.
1066	 *
1067	 * The VFS can change dotdot on us, but the findparent scan will keep
1068	 * our incore parent inode up to date.  See the note on locking issues
1069	 * for more details.
1070	 */
1071	error = xrep_trans_commit(sc);
1072	if (error)
1073		return error;
1074
1075	xchk_iunlock(sc, XFS_ILOCK_EXCL);
1076	return 0;
1077}
1078
1079
1080/*
1081 * Examine a parent pointer of a file.  If it leads us back to the directory
1082 * that we're rebuilding, create an incore dirent from the parent pointer and
1083 * stash it.
1084 */
1085STATIC int
1086xrep_dir_scan_pptr(
1087	struct xfs_scrub		*sc,
1088	struct xfs_inode		*ip,
1089	unsigned int			attr_flags,
1090	const unsigned char		*name,
1091	unsigned int			namelen,
1092	const void			*value,
1093	unsigned int			valuelen,
1094	void				*priv)
1095{
1096	struct xfs_name			xname = {
1097		.name			= name,
1098		.len			= namelen,
1099		.type			= xfs_mode_to_ftype(VFS_I(ip)->i_mode),
1100	};
1101	xfs_ino_t			parent_ino;
1102	uint32_t			parent_gen;
1103	struct xrep_dir			*rd = priv;
1104	int				error;
1105
1106	if (!(attr_flags & XFS_ATTR_PARENT))
1107		return 0;
1108
1109	/*
1110	 * Ignore parent pointers that point back to a different dir, list the
1111	 * wrong generation number, or are invalid.
1112	 */
1113	error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value,
1114			valuelen, &parent_ino, &parent_gen);
1115	if (error)
1116		return error;
1117
1118	if (parent_ino != sc->ip->i_ino ||
1119	    parent_gen != VFS_I(sc->ip)->i_generation)
1120		return 0;
1121
1122	mutex_lock(&rd->pscan.lock);
1123	error = xrep_dir_stash_createname(rd, &xname, ip->i_ino);
1124	mutex_unlock(&rd->pscan.lock);
1125	return error;
1126}
1127
1128/*
1129 * If this child dirent points to the directory being repaired, remember that
1130 * fact so that we can reset the dotdot entry if necessary.
1131 */
1132STATIC int
1133xrep_dir_scan_dirent(
1134	struct xfs_scrub	*sc,
1135	struct xfs_inode	*dp,
1136	xfs_dir2_dataptr_t	dapos,
1137	const struct xfs_name	*name,
1138	xfs_ino_t		ino,
1139	void			*priv)
1140{
1141	struct xrep_dir		*rd = priv;
1142
1143	/* Dirent doesn't point to this directory. */
1144	if (ino != rd->sc->ip->i_ino)
1145		return 0;
1146
1147	/* Ignore garbage inum. */
1148	if (!xfs_verify_dir_ino(rd->sc->mp, ino))
1149		return 0;
1150
1151	/* No weird looking names. */
1152	if (name->len >= MAXNAMELEN || name->len <= 0)
1153		return 0;
1154
1155	/* Don't pick up dot or dotdot entries; we only want child dirents. */
1156	if (xfs_dir2_samename(name, &xfs_name_dotdot) ||
1157	    xfs_dir2_samename(name, &xfs_name_dot))
1158		return 0;
1159
1160	trace_xrep_dir_stash_createname(sc->tempip, &xfs_name_dotdot,
1161			dp->i_ino);
1162
1163	xrep_findparent_scan_found(&rd->pscan, dp->i_ino);
1164	return 0;
1165}
1166
1167/*
1168 * Decide if we want to look for child dirents or parent pointers in this file.
1169 * Skip the dir being repaired and any files being used to stage repairs.
1170 */
1171static inline bool
1172xrep_dir_want_scan(
1173	struct xrep_dir		*rd,
1174	const struct xfs_inode	*ip)
1175{
1176	return ip != rd->sc->ip && !xrep_is_tempfile(ip);
1177}
1178
1179/*
1180 * Take ILOCK on a file that we want to scan.
1181 *
1182 * Select ILOCK_EXCL if the file is a directory with an unloaded data bmbt or
1183 * has an unloaded attr bmbt.  Otherwise, take ILOCK_SHARED.
1184 */
1185static inline unsigned int
1186xrep_dir_scan_ilock(
1187	struct xrep_dir		*rd,
1188	struct xfs_inode	*ip)
1189{
1190	uint			lock_mode = XFS_ILOCK_SHARED;
1191
1192	/* Need to take the shared ILOCK to advance the iscan cursor. */
1193	if (!xrep_dir_want_scan(rd, ip))
1194		goto lock;
1195
1196	if (S_ISDIR(VFS_I(ip)->i_mode) && xfs_need_iread_extents(&ip->i_df)) {
1197		lock_mode = XFS_ILOCK_EXCL;
1198		goto lock;
1199	}
1200
1201	if (xfs_inode_has_attr_fork(ip) && xfs_need_iread_extents(&ip->i_af))
1202		lock_mode = XFS_ILOCK_EXCL;
1203
1204lock:
1205	xfs_ilock(ip, lock_mode);
1206	return lock_mode;
1207}
1208
1209/*
1210 * Scan this file for relevant child dirents or parent pointers that point to
1211 * the directory we're rebuilding.
1212 */
1213STATIC int
1214xrep_dir_scan_file(
1215	struct xrep_dir		*rd,
1216	struct xfs_inode	*ip)
1217{
1218	unsigned int		lock_mode;
1219	int			error = 0;
1220
1221	lock_mode = xrep_dir_scan_ilock(rd, ip);
1222
1223	if (!xrep_dir_want_scan(rd, ip))
1224		goto scan_done;
1225
1226	/*
1227	 * If the extended attributes look as though they has been zapped by
1228	 * the inode record repair code, we cannot scan for parent pointers.
1229	 */
1230	if (xchk_pptr_looks_zapped(ip)) {
1231		error = -EBUSY;
1232		goto scan_done;
1233	}
1234
1235	error = xchk_xattr_walk(rd->sc, ip, xrep_dir_scan_pptr, NULL, rd);
1236	if (error)
1237		goto scan_done;
1238
1239	if (S_ISDIR(VFS_I(ip)->i_mode)) {
1240		/*
1241		 * If the directory looks as though it has been zapped by the
1242		 * inode record repair code, we cannot scan for child dirents.
1243		 */
1244		if (xchk_dir_looks_zapped(ip)) {
1245			error = -EBUSY;
1246			goto scan_done;
1247		}
1248
1249		error = xchk_dir_walk(rd->sc, ip, xrep_dir_scan_dirent, rd);
1250		if (error)
1251			goto scan_done;
1252	}
1253
1254scan_done:
1255	xchk_iscan_mark_visited(&rd->pscan.iscan, ip);
1256	xfs_iunlock(ip, lock_mode);
1257	return error;
1258}
1259
1260/*
1261 * Scan all files in the filesystem for parent pointers that we can turn into
1262 * replacement dirents, and a dirent that we can use to set the dotdot pointer.
1263 */
1264STATIC int
1265xrep_dir_scan_dirtree(
1266	struct xrep_dir		*rd)
1267{
1268	struct xfs_scrub	*sc = rd->sc;
1269	struct xfs_inode	*ip;
1270	int			error;
1271
1272	/* Roots of directory trees are their own parents. */
1273	if (sc->ip == sc->mp->m_rootip)
1274		xrep_findparent_scan_found(&rd->pscan, sc->ip->i_ino);
1275
1276	/*
1277	 * Filesystem scans are time consuming.  Drop the directory ILOCK and
1278	 * all other resources for the duration of the scan and hope for the
1279	 * best.  The live update hooks will keep our scan information up to
1280	 * date even though we've dropped the locks.
1281	 */
1282	xchk_trans_cancel(sc);
1283	if (sc->ilock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL))
1284		xchk_iunlock(sc, sc->ilock_flags & (XFS_ILOCK_SHARED |
1285						    XFS_ILOCK_EXCL));
1286	error = xchk_trans_alloc_empty(sc);
1287	if (error)
1288		return error;
1289
1290	while ((error = xchk_iscan_iter(&rd->pscan.iscan, &ip)) == 1) {
1291		bool		flush;
1292
1293		error = xrep_dir_scan_file(rd, ip);
1294		xchk_irele(sc, ip);
1295		if (error)
1296			break;
1297
1298		/* Flush stashed dirent updates to constrain memory usage. */
1299		mutex_lock(&rd->pscan.lock);
1300		flush = xrep_dir_want_flush_stashed(rd);
1301		mutex_unlock(&rd->pscan.lock);
1302		if (flush) {
1303			xchk_trans_cancel(sc);
1304
1305			error = xrep_tempfile_iolock_polled(sc);
1306			if (error)
1307				break;
1308
1309			error = xrep_dir_replay_updates(rd);
1310			xrep_tempfile_iounlock(sc);
1311			if (error)
1312				break;
1313
1314			error = xchk_trans_alloc_empty(sc);
1315			if (error)
1316				break;
1317		}
1318
1319		if (xchk_should_terminate(sc, &error))
1320			break;
1321	}
1322	xchk_iscan_iter_finish(&rd->pscan.iscan);
1323	if (error) {
1324		/*
1325		 * If we couldn't grab an inode that was busy with a state
1326		 * change, change the error code so that we exit to userspace
1327		 * as quickly as possible.
1328		 */
1329		if (error == -EBUSY)
1330			return -ECANCELED;
1331		return error;
1332	}
1333
1334	/*
1335	 * Cancel the empty transaction so that we can (later) use the atomic
1336	 * file mapping exchange functions to lock files and commit the new
1337	 * directory.
1338	 */
1339	xchk_trans_cancel(rd->sc);
1340	return 0;
1341}
1342
1343/*
1344 * Capture dirent updates being made by other threads which are relevant to the
1345 * directory being repaired.
1346 */
1347STATIC int
1348xrep_dir_live_update(
1349	struct notifier_block		*nb,
1350	unsigned long			action,
1351	void				*data)
1352{
1353	struct xfs_dir_update_params	*p = data;
1354	struct xrep_dir			*rd;
1355	struct xfs_scrub		*sc;
1356	int				error = 0;
1357
1358	rd = container_of(nb, struct xrep_dir, pscan.dhook.dirent_hook.nb);
1359	sc = rd->sc;
1360
1361	/*
1362	 * This thread updated a child dirent in the directory that we're
1363	 * rebuilding.  Stash the update for replay against the temporary
1364	 * directory.
1365	 */
1366	if (p->dp->i_ino == sc->ip->i_ino &&
1367	    xchk_iscan_want_live_update(&rd->pscan.iscan, p->ip->i_ino)) {
1368		mutex_lock(&rd->pscan.lock);
1369		if (p->delta > 0)
1370			error = xrep_dir_stash_createname(rd, p->name,
1371					p->ip->i_ino);
1372		else
1373			error = xrep_dir_stash_removename(rd, p->name,
1374					p->ip->i_ino);
1375		mutex_unlock(&rd->pscan.lock);
1376		if (error)
1377			goto out_abort;
1378	}
1379
1380	/*
1381	 * This thread updated another directory's child dirent that points to
1382	 * the directory that we're rebuilding, so remember the new dotdot
1383	 * target.
1384	 */
1385	if (p->ip->i_ino == sc->ip->i_ino &&
1386	    xchk_iscan_want_live_update(&rd->pscan.iscan, p->dp->i_ino)) {
1387		if (p->delta > 0) {
1388			trace_xrep_dir_stash_createname(sc->tempip,
1389					&xfs_name_dotdot,
1390					p->dp->i_ino);
1391
1392			xrep_findparent_scan_found(&rd->pscan, p->dp->i_ino);
1393		} else {
1394			trace_xrep_dir_stash_removename(sc->tempip,
1395					&xfs_name_dotdot,
1396					rd->pscan.parent_ino);
1397
1398			xrep_findparent_scan_found(&rd->pscan, NULLFSINO);
1399		}
1400	}
1401
1402	return NOTIFY_DONE;
1403out_abort:
1404	xchk_iscan_abort(&rd->pscan.iscan);
1405	return NOTIFY_DONE;
1406}
1407
1408/*
1409 * Free all the directory blocks and reset the data fork.  The caller must
1410 * join the inode to the transaction.  This function returns with the inode
1411 * joined to a clean scrub transaction.
1412 */
1413STATIC int
1414xrep_dir_reset_fork(
1415	struct xrep_dir		*rd,
1416	xfs_ino_t		parent_ino)
1417{
1418	struct xfs_scrub	*sc = rd->sc;
1419	struct xfs_ifork	*ifp = xfs_ifork_ptr(sc->tempip, XFS_DATA_FORK);
1420	int			error;
1421
1422	/* Unmap all the directory buffers. */
1423	if (xfs_ifork_has_extents(ifp)) {
1424		error = xrep_reap_ifork(sc, sc->tempip, XFS_DATA_FORK);
1425		if (error)
1426			return error;
1427	}
1428
1429	trace_xrep_dir_reset_fork(sc->tempip, parent_ino);
1430
1431	/* Reset the data fork to an empty data fork. */
1432	xfs_idestroy_fork(ifp);
1433	ifp->if_bytes = 0;
1434	sc->tempip->i_disk_size = 0;
1435
1436	/* Reinitialize the short form directory. */
1437	xrep_dir_init_args(rd, sc->tempip, NULL);
1438	return xfs_dir2_sf_create(&rd->args, parent_ino);
1439}
1440
1441/*
1442 * Prepare both inodes' directory forks for exchanging mappings.  Promote the
1443 * tempfile from short format to leaf format, and if the file being repaired
1444 * has a short format data fork, turn it into an empty extent list.
1445 */
1446STATIC int
1447xrep_dir_swap_prep(
1448	struct xfs_scrub	*sc,
1449	bool			temp_local,
1450	bool			ip_local)
1451{
1452	int			error;
1453
1454	/*
1455	 * If the tempfile's directory is in shortform format, convert that to
1456	 * a single leaf extent so that we can use the atomic mapping exchange.
1457	 */
1458	if (temp_local) {
1459		struct xfs_da_args	args = {
1460			.dp		= sc->tempip,
1461			.geo		= sc->mp->m_dir_geo,
1462			.whichfork	= XFS_DATA_FORK,
1463			.trans		= sc->tp,
1464			.total		= 1,
1465			.owner		= sc->ip->i_ino,
1466		};
1467
1468		error = xfs_dir2_sf_to_block(&args);
1469		if (error)
1470			return error;
1471
1472		/*
1473		 * Roll the deferred log items to get us back to a clean
1474		 * transaction.
1475		 */
1476		error = xfs_defer_finish(&sc->tp);
1477		if (error)
1478			return error;
1479	}
1480
1481	/*
1482	 * If the file being repaired had a shortform data fork, convert that
1483	 * to an empty extent list in preparation for the atomic mapping
1484	 * exchange.
1485	 */
1486	if (ip_local) {
1487		struct xfs_ifork	*ifp;
1488
1489		ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK);
1490		xfs_idestroy_fork(ifp);
1491		ifp->if_format = XFS_DINODE_FMT_EXTENTS;
1492		ifp->if_nextents = 0;
1493		ifp->if_bytes = 0;
1494		ifp->if_data = NULL;
1495		ifp->if_height = 0;
1496
1497		xfs_trans_log_inode(sc->tp, sc->ip,
1498				XFS_ILOG_CORE | XFS_ILOG_DDATA);
1499	}
1500
1501	return 0;
1502}
1503
1504/*
1505 * Replace the inode number of a directory entry.
1506 */
1507static int
1508xrep_dir_replace(
1509	struct xrep_dir		*rd,
1510	struct xfs_inode	*dp,
1511	const struct xfs_name	*name,
1512	xfs_ino_t		inum,
1513	xfs_extlen_t		total)
1514{
1515	struct xfs_scrub	*sc = rd->sc;
1516	int			error;
1517
1518	ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
1519
1520	error = xfs_dir_ino_validate(sc->mp, inum);
1521	if (error)
1522		return error;
1523
1524	xrep_dir_init_args(rd, dp, name);
1525	rd->args.inumber = inum;
1526	rd->args.total = total;
1527	return xfs_dir_replace_args(&rd->args);
1528}
1529
1530/*
1531 * Reset the link count of this directory and adjust the unlinked list pointers
1532 * as needed.
1533 */
1534STATIC int
1535xrep_dir_set_nlink(
1536	struct xrep_dir		*rd)
1537{
1538	struct xfs_scrub	*sc = rd->sc;
1539	struct xfs_inode	*dp = sc->ip;
1540	struct xfs_perag	*pag;
1541	unsigned int		new_nlink = min_t(unsigned long long,
1542						  rd->subdirs + 2,
1543						  XFS_NLINK_PINNED);
1544	int			error;
1545
1546	/*
1547	 * The directory is not on the incore unlinked list, which means that
1548	 * it needs to be reachable via the directory tree.  Update the nlink
1549	 * with our observed link count.  If the directory has no parent, it
1550	 * will be moved to the orphanage.
1551	 */
1552	if (!xfs_inode_on_unlinked_list(dp))
1553		goto reset_nlink;
1554
1555	/*
1556	 * The directory is on the unlinked list and we did not find any
1557	 * dirents.  Set the link count to zero and let the directory
1558	 * inactivate when the last reference drops.
1559	 */
1560	if (rd->dirents == 0) {
1561		rd->needs_adoption = false;
1562		new_nlink = 0;
1563		goto reset_nlink;
1564	}
1565
1566	/*
1567	 * The directory is on the unlinked list and we found dirents.  This
1568	 * directory needs to be reachable via the directory tree.  Remove the
1569	 * dir from the unlinked list and update nlink with the observed link
1570	 * count.  If the directory has no parent, it will be moved to the
1571	 * orphanage.
1572	 */
1573	pag = xfs_perag_get(sc->mp, XFS_INO_TO_AGNO(sc->mp, dp->i_ino));
1574	if (!pag) {
1575		ASSERT(0);
1576		return -EFSCORRUPTED;
1577	}
1578
1579	error = xfs_iunlink_remove(sc->tp, pag, dp);
1580	xfs_perag_put(pag);
1581	if (error)
1582		return error;
1583
1584reset_nlink:
1585	if (VFS_I(dp)->i_nlink != new_nlink)
1586		set_nlink(VFS_I(dp), new_nlink);
1587	return 0;
1588}
1589
1590/*
1591 * Finish replaying stashed dirent updates, allocate a transaction for
1592 * exchanging data fork mappings, and take the ILOCKs of both directories
1593 * before we commit the new directory structure.
1594 */
1595STATIC int
1596xrep_dir_finalize_tempdir(
1597	struct xrep_dir		*rd)
1598{
1599	struct xfs_scrub	*sc = rd->sc;
1600	int			error;
1601
1602	if (!xfs_has_parent(sc->mp))
1603		return xrep_tempexch_trans_alloc(sc, XFS_DATA_FORK, &rd->tx);
1604
1605	/*
1606	 * Repair relies on the ILOCK to quiesce all possible dirent updates.
1607	 * Replay all queued dirent updates into the tempdir before exchanging
1608	 * the contents, even if that means dropping the ILOCKs and the
1609	 * transaction.
1610	 */
1611	do {
1612		error = xrep_dir_replay_updates(rd);
1613		if (error)
1614			return error;
1615
1616		error = xrep_tempexch_trans_alloc(sc, XFS_DATA_FORK, &rd->tx);
1617		if (error)
1618			return error;
1619
1620		if (xfarray_length(rd->dir_entries) == 0)
1621			break;
1622
1623		xchk_trans_cancel(sc);
1624		xrep_tempfile_iunlock_both(sc);
1625	} while (!xchk_should_terminate(sc, &error));
1626	return error;
1627}
1628
1629/* Exchange the temporary directory's data fork with the one being repaired. */
1630STATIC int
1631xrep_dir_swap(
1632	struct xrep_dir		*rd)
1633{
1634	struct xfs_scrub	*sc = rd->sc;
1635	bool			ip_local, temp_local;
1636	int			error = 0;
1637
1638	/*
1639	 * If we never found the parent for this directory, temporarily assign
1640	 * the root dir as the parent; we'll move this to the orphanage after
1641	 * exchanging the dir contents.  We hold the ILOCK of the dir being
1642	 * repaired, so we're not worried about racy updates of dotdot.
1643	 */
1644	ASSERT(sc->ilock_flags & XFS_ILOCK_EXCL);
1645	if (rd->pscan.parent_ino == NULLFSINO) {
1646		rd->needs_adoption = true;
1647		rd->pscan.parent_ino = rd->sc->mp->m_sb.sb_rootino;
1648	}
1649
1650	/*
1651	 * Reset the temporary directory's '..' entry to point to the parent
1652	 * that we found.  The temporary directory was created with the root
1653	 * directory as the parent, so we can skip this if repairing a
1654	 * subdirectory of the root.
1655	 *
1656	 * It's also possible that this replacement could also expand a sf
1657	 * tempdir into block format.
1658	 */
1659	if (rd->pscan.parent_ino != sc->mp->m_rootip->i_ino) {
1660		error = xrep_dir_replace(rd, rd->sc->tempip, &xfs_name_dotdot,
1661				rd->pscan.parent_ino, rd->tx.req.resblks);
1662		if (error)
1663			return error;
1664	}
1665
1666	/*
1667	 * Changing the dot and dotdot entries could have changed the shape of
1668	 * the directory, so we recompute these.
1669	 */
1670	ip_local = sc->ip->i_df.if_format == XFS_DINODE_FMT_LOCAL;
1671	temp_local = sc->tempip->i_df.if_format == XFS_DINODE_FMT_LOCAL;
1672
1673	/*
1674	 * If the both files have a local format data fork and the rebuilt
1675	 * directory data would fit in the repaired file's data fork, copy
1676	 * the contents from the tempfile and update the directory link count.
1677	 * We're done now.
1678	 */
1679	if (ip_local && temp_local &&
1680	    sc->tempip->i_disk_size <= xfs_inode_data_fork_size(sc->ip)) {
1681		xrep_tempfile_copyout_local(sc, XFS_DATA_FORK);
1682		return xrep_dir_set_nlink(rd);
1683	}
1684
1685	/*
1686	 * Clean the transaction before we start working on exchanging
1687	 * directory contents.
1688	 */
1689	error = xrep_tempfile_roll_trans(rd->sc);
1690	if (error)
1691		return error;
1692
1693	/* Otherwise, make sure both data forks are in block-mapping mode. */
1694	error = xrep_dir_swap_prep(sc, temp_local, ip_local);
1695	if (error)
1696		return error;
1697
1698	/*
1699	 * Set nlink of the directory in the same transaction sequence that
1700	 * (atomically) commits the new directory data.
1701	 */
1702	error = xrep_dir_set_nlink(rd);
1703	if (error)
1704		return error;
1705
1706	return xrep_tempexch_contents(sc, &rd->tx);
1707}
1708
1709/*
1710 * Exchange the new directory contents (which we created in the tempfile) with
1711 * the directory being repaired.
1712 */
1713STATIC int
1714xrep_dir_rebuild_tree(
1715	struct xrep_dir		*rd)
1716{
1717	struct xfs_scrub	*sc = rd->sc;
1718	int			error;
1719
1720	trace_xrep_dir_rebuild_tree(sc->ip, rd->pscan.parent_ino);
1721
1722	/*
1723	 * Take the IOLOCK on the temporary file so that we can run dir
1724	 * operations with the same locks held as we would for a normal file.
1725	 * We still hold sc->ip's IOLOCK.
1726	 */
1727	error = xrep_tempfile_iolock_polled(rd->sc);
1728	if (error)
1729		return error;
1730
1731	/*
1732	 * Allocate transaction, lock inodes, and make sure that we've replayed
1733	 * all the stashed dirent updates to the tempdir.  After this point,
1734	 * we're ready to exchange data fork mappings.
1735	 */
1736	error = xrep_dir_finalize_tempdir(rd);
1737	if (error)
1738		return error;
1739
1740	if (xchk_iscan_aborted(&rd->pscan.iscan))
1741		return -ECANCELED;
1742
1743	/*
1744	 * Exchange the tempdir's data fork with the file being repaired.  This
1745	 * recreates the transaction and re-takes the ILOCK in the scrub
1746	 * context.
1747	 */
1748	error = xrep_dir_swap(rd);
1749	if (error)
1750		return error;
1751
1752	/*
1753	 * Release the old directory blocks and reset the data fork of the temp
1754	 * directory to an empty shortform directory because inactivation does
1755	 * nothing for directories.
1756	 */
1757	error = xrep_dir_reset_fork(rd, sc->mp->m_rootip->i_ino);
1758	if (error)
1759		return error;
1760
1761	/*
1762	 * Roll to get a transaction without any inodes joined to it.  Then we
1763	 * can drop the tempfile's ILOCK and IOLOCK before doing more work on
1764	 * the scrub target directory.
1765	 */
1766	error = xfs_trans_roll(&sc->tp);
1767	if (error)
1768		return error;
1769
1770	xrep_tempfile_iunlock(sc);
1771	xrep_tempfile_iounlock(sc);
1772	return 0;
1773}
1774
1775/* Set up the filesystem scan so we can regenerate directory entries. */
1776STATIC int
1777xrep_dir_setup_scan(
1778	struct xrep_dir		*rd)
1779{
1780	struct xfs_scrub	*sc = rd->sc;
1781	char			*descr;
1782	int			error;
1783
1784	/* Set up some staging memory for salvaging dirents. */
1785	descr = xchk_xfile_ino_descr(sc, "directory entries");
1786	error = xfarray_create(descr, 0, sizeof(struct xrep_dirent),
1787			&rd->dir_entries);
1788	kfree(descr);
1789	if (error)
1790		return error;
1791
1792	descr = xchk_xfile_ino_descr(sc, "directory entry names");
1793	error = xfblob_create(descr, &rd->dir_names);
1794	kfree(descr);
1795	if (error)
1796		goto out_xfarray;
1797
1798	if (xfs_has_parent(sc->mp))
1799		error = __xrep_findparent_scan_start(sc, &rd->pscan,
1800				xrep_dir_live_update);
1801	else
1802		error = xrep_findparent_scan_start(sc, &rd->pscan);
1803	if (error)
1804		goto out_xfblob;
1805
1806	return 0;
1807
1808out_xfblob:
1809	xfblob_destroy(rd->dir_names);
1810	rd->dir_names = NULL;
1811out_xfarray:
1812	xfarray_destroy(rd->dir_entries);
1813	rd->dir_entries = NULL;
1814	return error;
1815}
1816
1817/*
1818 * Move the current file to the orphanage.
1819 *
1820 * Caller must hold IOLOCK_EXCL on @sc->ip, and no other inode locks.  Upon
1821 * successful return, the scrub transaction will have enough extra reservation
1822 * to make the move; it will hold IOLOCK_EXCL and ILOCK_EXCL of @sc->ip and the
1823 * orphanage; and both inodes will be ijoined.
1824 */
1825STATIC int
1826xrep_dir_move_to_orphanage(
1827	struct xrep_dir		*rd)
1828{
1829	struct xfs_scrub	*sc = rd->sc;
1830	xfs_ino_t		orig_parent, new_parent;
1831	int			error;
1832
1833	/*
1834	 * We are about to drop the ILOCK on sc->ip to lock the orphanage and
1835	 * prepare for the adoption.  Therefore, look up the old dotdot entry
1836	 * for sc->ip so that we can compare it after we re-lock sc->ip.
1837	 */
1838	error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, &orig_parent);
1839	if (error)
1840		return error;
1841
1842	/*
1843	 * Drop the ILOCK on the scrub target and commit the transaction.
1844	 * Adoption computes its own resource requirements and gathers the
1845	 * necessary components.
1846	 */
1847	error = xrep_trans_commit(sc);
1848	if (error)
1849		return error;
1850	xchk_iunlock(sc, XFS_ILOCK_EXCL);
1851
1852	/* If we can take the orphanage's iolock then we're ready to move. */
1853	if (!xrep_orphanage_ilock_nowait(sc, XFS_IOLOCK_EXCL)) {
1854		xchk_iunlock(sc, sc->ilock_flags);
1855		error = xrep_orphanage_iolock_two(sc);
1856		if (error)
1857			return error;
1858	}
1859
1860	/* Grab transaction and ILOCK the two files. */
1861	error = xrep_adoption_trans_alloc(sc, &rd->adoption);
1862	if (error)
1863		return error;
1864
1865	error = xrep_adoption_compute_name(&rd->adoption, &rd->xname);
1866	if (error)
1867		return error;
1868
1869	/*
1870	 * Now that we've reacquired the ILOCK on sc->ip, look up the dotdot
1871	 * entry again.  If the parent changed or the child was unlinked while
1872	 * the child directory was unlocked, we don't need to move the child to
1873	 * the orphanage after all.
1874	 */
1875	error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, &new_parent);
1876	if (error)
1877		return error;
1878
1879	/*
1880	 * Attach to the orphanage if we still have a linked directory and it
1881	 * hasn't been moved.
1882	 */
1883	if (orig_parent == new_parent && VFS_I(sc->ip)->i_nlink > 0) {
1884		error = xrep_adoption_move(&rd->adoption);
1885		if (error)
1886			return error;
1887	}
1888
1889	/*
1890	 * Launder the scrub transaction so we can drop the orphanage ILOCK
1891	 * and IOLOCK.  Return holding the scrub target's ILOCK and IOLOCK.
1892	 */
1893	error = xrep_adoption_trans_roll(&rd->adoption);
1894	if (error)
1895		return error;
1896
1897	xrep_orphanage_iunlock(sc, XFS_ILOCK_EXCL);
1898	xrep_orphanage_iunlock(sc, XFS_IOLOCK_EXCL);
1899	return 0;
1900}
1901
1902/*
1903 * Repair the directory metadata.
1904 *
1905 * XXX: Directory entry buffers can be multiple fsblocks in size.  The buffer
1906 * cache in XFS can't handle aliased multiblock buffers, so this might
1907 * misbehave if the directory blocks are crosslinked with other filesystem
1908 * metadata.
1909 *
1910 * XXX: Is it necessary to check the dcache for this directory to make sure
1911 * that we always recreate every cached entry?
1912 */
1913int
1914xrep_directory(
1915	struct xfs_scrub	*sc)
1916{
1917	struct xrep_dir		*rd = sc->buf;
1918	int			error;
1919
1920	/* The rmapbt is required to reap the old data fork. */
1921	if (!xfs_has_rmapbt(sc->mp))
1922		return -EOPNOTSUPP;
1923	/* We require atomic file exchange range to rebuild anything. */
1924	if (!xfs_has_exchange_range(sc->mp))
1925		return -EOPNOTSUPP;
1926
1927	error = xrep_dir_setup_scan(rd);
1928	if (error)
1929		return error;
1930
1931	if (xfs_has_parent(sc->mp))
1932		error = xrep_dir_scan_dirtree(rd);
1933	else
1934		error = xrep_dir_salvage_entries(rd);
1935	if (error)
1936		goto out_teardown;
1937
1938	/* Last chance to abort before we start committing fixes. */
1939	if (xchk_should_terminate(sc, &error))
1940		goto out_teardown;
1941
1942	error = xrep_dir_rebuild_tree(rd);
1943	if (error)
1944		goto out_teardown;
1945
1946	if (rd->needs_adoption) {
1947		if (!xrep_orphanage_can_adopt(rd->sc))
1948			error = -EFSCORRUPTED;
1949		else
1950			error = xrep_dir_move_to_orphanage(rd);
1951		if (error)
1952			goto out_teardown;
1953	}
1954
1955out_teardown:
1956	xrep_dir_teardown(sc);
1957	return error;
1958}
1959