inode.c revision 61e0d0cc
1// SPDX-License-Identifier: GPL-2.0+
2/*
3 * Copyright (C) 2017 Oracle.  All Rights Reserved.
4 * Author: Darrick J. Wong <darrick.wong@oracle.com>
5 */
6#include "xfs.h"
7#include "xfs_fs.h"
8#include "xfs_shared.h"
9#include "xfs_format.h"
10#include "xfs_trans_resv.h"
11#include "xfs_mount.h"
12#include "xfs_btree.h"
13#include "xfs_log_format.h"
14#include "xfs_inode.h"
15#include "xfs_ialloc.h"
16#include "xfs_da_format.h"
17#include "xfs_reflink.h"
18#include "xfs_rmap.h"
19#include "xfs_bmap_util.h"
20#include "scrub/scrub.h"
21#include "scrub/common.h"
22#include "scrub/btree.h"
23
24/*
25 * Grab total control of the inode metadata.  It doesn't matter here if
26 * the file data is still changing; exclusive access to the metadata is
27 * the goal.
28 */
29int
30xchk_setup_inode(
31	struct xfs_scrub	*sc)
32{
33	int			error;
34
35	/*
36	 * Try to get the inode.  If the verifiers fail, we try again
37	 * in raw mode.
38	 */
39	error = xchk_get_inode(sc);
40	switch (error) {
41	case 0:
42		break;
43	case -EFSCORRUPTED:
44	case -EFSBADCRC:
45		return xchk_trans_alloc(sc, 0);
46	default:
47		return error;
48	}
49
50	/* Got the inode, lock it and we're ready to go. */
51	sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
52	xfs_ilock(sc->ip, sc->ilock_flags);
53	error = xchk_trans_alloc(sc, 0);
54	if (error)
55		goto out;
56	sc->ilock_flags |= XFS_ILOCK_EXCL;
57	xfs_ilock(sc->ip, XFS_ILOCK_EXCL);
58
59out:
60	/* scrub teardown will unlock and release the inode for us */
61	return error;
62}
63
64/* Inode core */
65
66/* Validate di_extsize hint. */
67STATIC void
68xchk_inode_extsize(
69	struct xfs_scrub	*sc,
70	struct xfs_dinode	*dip,
71	xfs_ino_t		ino,
72	uint16_t		mode,
73	uint16_t		flags)
74{
75	xfs_failaddr_t		fa;
76	uint32_t		value = be32_to_cpu(dip->di_extsize);
77
78	fa = xfs_inode_validate_extsize(sc->mp, value, mode, flags);
79	if (fa)
80		xchk_ino_set_corrupt(sc, ino);
81
82	/*
83	 * XFS allows a sysadmin to change the rt extent size when adding a rt
84	 * section to a filesystem after formatting.  If there are any
85	 * directories with extszinherit and rtinherit set, the hint could
86	 * become misaligned with the new rextsize.  The verifier doesn't check
87	 * this, because we allow rtinherit directories even without an rt
88	 * device.  Flag this as an administrative warning since we will clean
89	 * this up eventually.
90	 */
91	if ((flags & XFS_DIFLAG_RTINHERIT) &&
92	    (flags & XFS_DIFLAG_EXTSZINHERIT) &&
93	    value % sc->mp->m_sb.sb_rextsize > 0)
94		xchk_ino_set_warning(sc, ino);
95}
96
97/*
98 * Validate di_cowextsize hint.
99 *
100 * The rules are documented at xfs_ioctl_setattr_check_cowextsize().
101 * These functions must be kept in sync with each other.
102 */
103STATIC void
104xchk_inode_cowextsize(
105	struct xfs_scrub	*sc,
106	struct xfs_dinode	*dip,
107	xfs_ino_t		ino,
108	uint16_t		mode,
109	uint16_t		flags,
110	uint64_t		flags2)
111{
112	xfs_failaddr_t		fa;
113
114	fa = xfs_inode_validate_cowextsize(sc->mp,
115			be32_to_cpu(dip->di_cowextsize), mode, flags,
116			flags2);
117	if (fa)
118		xchk_ino_set_corrupt(sc, ino);
119}
120
121/* Make sure the di_flags make sense for the inode. */
122STATIC void
123xchk_inode_flags(
124	struct xfs_scrub	*sc,
125	struct xfs_dinode	*dip,
126	xfs_ino_t		ino,
127	uint16_t		mode,
128	uint16_t		flags)
129{
130	struct xfs_mount	*mp = sc->mp;
131
132	/* di_flags are all taken, last bit cannot be used */
133	if (flags & ~XFS_DIFLAG_ANY)
134		goto bad;
135
136	/* rt flags require rt device */
137	if ((flags & XFS_DIFLAG_REALTIME) && !mp->m_rtdev_targp)
138		goto bad;
139
140	/* new rt bitmap flag only valid for rbmino */
141	if ((flags & XFS_DIFLAG_NEWRTBM) && ino != mp->m_sb.sb_rbmino)
142		goto bad;
143
144	/* directory-only flags */
145	if ((flags & (XFS_DIFLAG_RTINHERIT |
146		     XFS_DIFLAG_EXTSZINHERIT |
147		     XFS_DIFLAG_PROJINHERIT |
148		     XFS_DIFLAG_NOSYMLINKS)) &&
149	    !S_ISDIR(mode))
150		goto bad;
151
152	/* file-only flags */
153	if ((flags & (XFS_DIFLAG_REALTIME | FS_XFLAG_EXTSIZE)) &&
154	    !S_ISREG(mode))
155		goto bad;
156
157	/* filestreams and rt make no sense */
158	if ((flags & XFS_DIFLAG_FILESTREAM) && (flags & XFS_DIFLAG_REALTIME))
159		goto bad;
160
161	return;
162bad:
163	xchk_ino_set_corrupt(sc, ino);
164}
165
166/* Make sure the di_flags2 make sense for the inode. */
167STATIC void
168xchk_inode_flags2(
169	struct xfs_scrub	*sc,
170	struct xfs_dinode	*dip,
171	xfs_ino_t		ino,
172	uint16_t		mode,
173	uint16_t		flags,
174	uint64_t		flags2)
175{
176	struct xfs_mount	*mp = sc->mp;
177
178	/* Unknown di_flags2 could be from a future kernel */
179	if (flags2 & ~XFS_DIFLAG2_ANY)
180		xchk_ino_set_warning(sc, ino);
181
182	/* reflink flag requires reflink feature */
183	if ((flags2 & XFS_DIFLAG2_REFLINK) &&
184	    !xfs_has_reflink(mp))
185		goto bad;
186
187	/* cowextsize flag is checked w.r.t. mode separately */
188
189	/* file/dir-only flags */
190	if ((flags2 & XFS_DIFLAG2_DAX) && !(S_ISREG(mode) || S_ISDIR(mode)))
191		goto bad;
192
193	/* file-only flags */
194	if ((flags2 & XFS_DIFLAG2_REFLINK) && !S_ISREG(mode))
195		goto bad;
196
197	/* realtime and reflink make no sense, currently */
198	if ((flags & XFS_DIFLAG_REALTIME) && (flags2 & XFS_DIFLAG2_REFLINK))
199		goto bad;
200
201	/* no bigtime iflag without the bigtime feature */
202	if (xfs_dinode_has_bigtime(dip) && !xfs_has_bigtime(mp))
203		goto bad;
204
205	return;
206bad:
207	xchk_ino_set_corrupt(sc, ino);
208}
209
210static inline void
211xchk_dinode_nsec(
212	struct xfs_scrub	*sc,
213	xfs_ino_t		ino,
214	struct xfs_dinode	*dip,
215	const xfs_timestamp_t	ts)
216{
217	struct timespec64	tv;
218
219	tv = xfs_inode_from_disk_ts(dip, ts);
220	if (tv.tv_nsec < 0 || tv.tv_nsec >= NSEC_PER_SEC)
221		xchk_ino_set_corrupt(sc, ino);
222}
223
224/* Scrub all the ondisk inode fields. */
225STATIC void
226xchk_dinode(
227	struct xfs_scrub	*sc,
228	struct xfs_dinode	*dip,
229	xfs_ino_t		ino)
230{
231	struct xfs_mount	*mp = sc->mp;
232	size_t			fork_recs;
233	unsigned long long	isize;
234	uint64_t		flags2;
235	uint32_t		nextents;
236	uint16_t		flags;
237	uint16_t		mode;
238
239	flags = be16_to_cpu(dip->di_flags);
240	if (dip->di_version >= 3)
241		flags2 = be64_to_cpu(dip->di_flags2);
242	else
243		flags2 = 0;
244
245	/* di_mode */
246	mode = be16_to_cpu(dip->di_mode);
247	switch (mode & S_IFMT) {
248	case S_IFLNK:
249	case S_IFREG:
250	case S_IFDIR:
251	case S_IFCHR:
252	case S_IFBLK:
253	case S_IFIFO:
254	case S_IFSOCK:
255		/* mode is recognized */
256		break;
257	default:
258		xchk_ino_set_corrupt(sc, ino);
259		break;
260	}
261
262	/* v1/v2 fields */
263	switch (dip->di_version) {
264	case 1:
265		/*
266		 * We autoconvert v1 inodes into v2 inodes on writeout,
267		 * so just mark this inode for preening.
268		 */
269		xchk_ino_set_preen(sc, ino);
270		break;
271	case 2:
272	case 3:
273		if (dip->di_onlink != 0)
274			xchk_ino_set_corrupt(sc, ino);
275
276		if (dip->di_mode == 0 && sc->ip)
277			xchk_ino_set_corrupt(sc, ino);
278
279		if (dip->di_projid_hi != 0 &&
280		    !xfs_has_projid32(mp))
281			xchk_ino_set_corrupt(sc, ino);
282		break;
283	default:
284		xchk_ino_set_corrupt(sc, ino);
285		return;
286	}
287
288	/*
289	 * di_uid/di_gid -- -1 isn't invalid, but there's no way that
290	 * userspace could have created that.
291	 */
292	if (dip->di_uid == cpu_to_be32(-1U) ||
293	    dip->di_gid == cpu_to_be32(-1U))
294		xchk_ino_set_warning(sc, ino);
295
296	/* di_format */
297	switch (dip->di_format) {
298	case XFS_DINODE_FMT_DEV:
299		if (!S_ISCHR(mode) && !S_ISBLK(mode) &&
300		    !S_ISFIFO(mode) && !S_ISSOCK(mode))
301			xchk_ino_set_corrupt(sc, ino);
302		break;
303	case XFS_DINODE_FMT_LOCAL:
304		if (!S_ISDIR(mode) && !S_ISLNK(mode))
305			xchk_ino_set_corrupt(sc, ino);
306		break;
307	case XFS_DINODE_FMT_EXTENTS:
308		if (!S_ISREG(mode) && !S_ISDIR(mode) && !S_ISLNK(mode))
309			xchk_ino_set_corrupt(sc, ino);
310		break;
311	case XFS_DINODE_FMT_BTREE:
312		if (!S_ISREG(mode) && !S_ISDIR(mode))
313			xchk_ino_set_corrupt(sc, ino);
314		break;
315	case XFS_DINODE_FMT_UUID:
316	default:
317		xchk_ino_set_corrupt(sc, ino);
318		break;
319	}
320
321	/* di_[amc]time.nsec */
322	xchk_dinode_nsec(sc, ino, dip, dip->di_atime);
323	xchk_dinode_nsec(sc, ino, dip, dip->di_mtime);
324	xchk_dinode_nsec(sc, ino, dip, dip->di_ctime);
325
326	/*
327	 * di_size.  xfs_dinode_verify checks for things that screw up
328	 * the VFS such as the upper bit being set and zero-length
329	 * symlinks/directories, but we can do more here.
330	 */
331	isize = be64_to_cpu(dip->di_size);
332	if (isize & (1ULL << 63))
333		xchk_ino_set_corrupt(sc, ino);
334
335	/* Devices, fifos, and sockets must have zero size */
336	if (!S_ISDIR(mode) && !S_ISREG(mode) && !S_ISLNK(mode) && isize != 0)
337		xchk_ino_set_corrupt(sc, ino);
338
339	/* Directories can't be larger than the data section size (32G) */
340	if (S_ISDIR(mode) && (isize == 0 || isize >= XFS_DIR2_SPACE_SIZE))
341		xchk_ino_set_corrupt(sc, ino);
342
343	/* Symlinks can't be larger than SYMLINK_MAXLEN */
344	if (S_ISLNK(mode) && (isize == 0 || isize >= XFS_SYMLINK_MAXLEN))
345		xchk_ino_set_corrupt(sc, ino);
346
347	/*
348	 * Warn if the running kernel can't handle the kinds of offsets
349	 * needed to deal with the file size.  In other words, if the
350	 * pagecache can't cache all the blocks in this file due to
351	 * overly large offsets, flag the inode for admin review.
352	 */
353	if (isize >= mp->m_super->s_maxbytes)
354		xchk_ino_set_warning(sc, ino);
355
356	/* di_nblocks */
357	if (flags2 & XFS_DIFLAG2_REFLINK) {
358		; /* nblocks can exceed dblocks */
359	} else if (flags & XFS_DIFLAG_REALTIME) {
360		/*
361		 * nblocks is the sum of data extents (in the rtdev),
362		 * attr extents (in the datadev), and both forks' bmbt
363		 * blocks (in the datadev).  This clumsy check is the
364		 * best we can do without cross-referencing with the
365		 * inode forks.
366		 */
367		if (be64_to_cpu(dip->di_nblocks) >=
368		    mp->m_sb.sb_dblocks + mp->m_sb.sb_rblocks)
369			xchk_ino_set_corrupt(sc, ino);
370	} else {
371		if (be64_to_cpu(dip->di_nblocks) >= mp->m_sb.sb_dblocks)
372			xchk_ino_set_corrupt(sc, ino);
373	}
374
375	xchk_inode_flags(sc, dip, ino, mode, flags);
376
377	xchk_inode_extsize(sc, dip, ino, mode, flags);
378
379	/* di_nextents */
380	nextents = be32_to_cpu(dip->di_nextents);
381	fork_recs =  XFS_DFORK_DSIZE(dip, mp) / sizeof(struct xfs_bmbt_rec);
382	switch (dip->di_format) {
383	case XFS_DINODE_FMT_EXTENTS:
384		if (nextents > fork_recs)
385			xchk_ino_set_corrupt(sc, ino);
386		break;
387	case XFS_DINODE_FMT_BTREE:
388		if (nextents <= fork_recs)
389			xchk_ino_set_corrupt(sc, ino);
390		break;
391	default:
392		if (nextents != 0)
393			xchk_ino_set_corrupt(sc, ino);
394		break;
395	}
396
397	/* di_forkoff */
398	if (XFS_DFORK_APTR(dip) >= (char *)dip + mp->m_sb.sb_inodesize)
399		xchk_ino_set_corrupt(sc, ino);
400	if (dip->di_anextents != 0 && dip->di_forkoff == 0)
401		xchk_ino_set_corrupt(sc, ino);
402	if (dip->di_forkoff == 0 && dip->di_aformat != XFS_DINODE_FMT_EXTENTS)
403		xchk_ino_set_corrupt(sc, ino);
404
405	/* di_aformat */
406	if (dip->di_aformat != XFS_DINODE_FMT_LOCAL &&
407	    dip->di_aformat != XFS_DINODE_FMT_EXTENTS &&
408	    dip->di_aformat != XFS_DINODE_FMT_BTREE)
409		xchk_ino_set_corrupt(sc, ino);
410
411	/* di_anextents */
412	nextents = be16_to_cpu(dip->di_anextents);
413	fork_recs =  XFS_DFORK_ASIZE(dip, mp) / sizeof(struct xfs_bmbt_rec);
414	switch (dip->di_aformat) {
415	case XFS_DINODE_FMT_EXTENTS:
416		if (nextents > fork_recs)
417			xchk_ino_set_corrupt(sc, ino);
418		break;
419	case XFS_DINODE_FMT_BTREE:
420		if (nextents <= fork_recs)
421			xchk_ino_set_corrupt(sc, ino);
422		break;
423	default:
424		if (nextents != 0)
425			xchk_ino_set_corrupt(sc, ino);
426	}
427
428	if (dip->di_version >= 3) {
429		xchk_dinode_nsec(sc, ino, dip, dip->di_crtime);
430		xchk_inode_flags2(sc, dip, ino, mode, flags, flags2);
431		xchk_inode_cowextsize(sc, dip, ino, mode, flags,
432				flags2);
433	}
434}
435
436/*
437 * Make sure the finobt doesn't think this inode is free.
438 * We don't have to check the inobt ourselves because we got the inode via
439 * IGET_UNTRUSTED, which checks the inobt for us.
440 */
441static void
442xchk_inode_xref_finobt(
443	struct xfs_scrub		*sc,
444	xfs_ino_t			ino)
445{
446	struct xfs_inobt_rec_incore	rec;
447	xfs_agino_t			agino;
448	int				has_record;
449	int				error;
450
451	if (!sc->sa.fino_cur || xchk_skip_xref(sc->sm))
452		return;
453
454	agino = XFS_INO_TO_AGINO(sc->mp, ino);
455
456	/*
457	 * Try to get the finobt record.  If we can't get it, then we're
458	 * in good shape.
459	 */
460	error = xfs_inobt_lookup(sc->sa.fino_cur, agino, XFS_LOOKUP_LE,
461			&has_record);
462	if (!xchk_should_check_xref(sc, &error, &sc->sa.fino_cur) ||
463	    !has_record)
464		return;
465
466	error = xfs_inobt_get_rec(sc->sa.fino_cur, &rec, &has_record);
467	if (!xchk_should_check_xref(sc, &error, &sc->sa.fino_cur) ||
468	    !has_record)
469		return;
470
471	/*
472	 * Otherwise, make sure this record either doesn't cover this inode,
473	 * or that it does but it's marked present.
474	 */
475	if (rec.ir_startino > agino ||
476	    rec.ir_startino + XFS_INODES_PER_CHUNK <= agino)
477		return;
478
479	if (rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino))
480		xchk_btree_xref_set_corrupt(sc, sc->sa.fino_cur, 0);
481}
482
483/* Cross reference the inode fields with the forks. */
484STATIC void
485xchk_inode_xref_bmap(
486	struct xfs_scrub	*sc,
487	struct xfs_dinode	*dip)
488{
489	xfs_extnum_t		nextents;
490	xfs_filblks_t		count;
491	xfs_filblks_t		acount;
492	int			error;
493
494	if (xchk_skip_xref(sc->sm))
495		return;
496
497	/* Walk all the extents to check nextents/naextents/nblocks. */
498	error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_DATA_FORK,
499			&nextents, &count);
500	if (!xchk_should_check_xref(sc, &error, NULL))
501		return;
502	if (nextents < be32_to_cpu(dip->di_nextents))
503		xchk_ino_xref_set_corrupt(sc, sc->ip->i_ino);
504
505	error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_ATTR_FORK,
506			&nextents, &acount);
507	if (!xchk_should_check_xref(sc, &error, NULL))
508		return;
509	if (nextents != be16_to_cpu(dip->di_anextents))
510		xchk_ino_xref_set_corrupt(sc, sc->ip->i_ino);
511
512	/* Check nblocks against the inode. */
513	if (count + acount != be64_to_cpu(dip->di_nblocks))
514		xchk_ino_xref_set_corrupt(sc, sc->ip->i_ino);
515}
516
517/* Cross-reference with the other btrees. */
518STATIC void
519xchk_inode_xref(
520	struct xfs_scrub	*sc,
521	xfs_ino_t		ino,
522	struct xfs_dinode	*dip)
523{
524	xfs_agnumber_t		agno;
525	xfs_agblock_t		agbno;
526	int			error;
527
528	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
529		return;
530
531	agno = XFS_INO_TO_AGNO(sc->mp, ino);
532	agbno = XFS_INO_TO_AGBNO(sc->mp, ino);
533
534	error = xchk_ag_init_existing(sc, agno, &sc->sa);
535	if (!xchk_xref_process_error(sc, agno, agbno, &error))
536		goto out_free;
537
538	xchk_xref_is_used_space(sc, agbno, 1);
539	xchk_inode_xref_finobt(sc, ino);
540	xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_INODES);
541	xchk_xref_is_not_shared(sc, agbno, 1);
542	xchk_inode_xref_bmap(sc, dip);
543
544out_free:
545	xchk_ag_free(sc, &sc->sa);
546}
547
548/*
549 * If the reflink iflag disagrees with a scan for shared data fork extents,
550 * either flag an error (shared extents w/ no flag) or a preen (flag set w/o
551 * any shared extents).  We already checked for reflink iflag set on a non
552 * reflink filesystem.
553 */
554static void
555xchk_inode_check_reflink_iflag(
556	struct xfs_scrub	*sc,
557	xfs_ino_t		ino)
558{
559	struct xfs_mount	*mp = sc->mp;
560	bool			has_shared;
561	int			error;
562
563	if (!xfs_has_reflink(mp))
564		return;
565
566	error = xfs_reflink_inode_has_shared_extents(sc->tp, sc->ip,
567			&has_shared);
568	if (!xchk_xref_process_error(sc, XFS_INO_TO_AGNO(mp, ino),
569			XFS_INO_TO_AGBNO(mp, ino), &error))
570		return;
571	if (xfs_is_reflink_inode(sc->ip) && !has_shared)
572		xchk_ino_set_preen(sc, ino);
573	else if (!xfs_is_reflink_inode(sc->ip) && has_shared)
574		xchk_ino_set_corrupt(sc, ino);
575}
576
577/* Scrub an inode. */
578int
579xchk_inode(
580	struct xfs_scrub	*sc)
581{
582	struct xfs_dinode	di;
583	int			error = 0;
584
585	/*
586	 * If sc->ip is NULL, that means that the setup function called
587	 * xfs_iget to look up the inode.  xfs_iget returned a EFSCORRUPTED
588	 * and a NULL inode, so flag the corruption error and return.
589	 */
590	if (!sc->ip) {
591		xchk_ino_set_corrupt(sc, sc->sm->sm_ino);
592		return 0;
593	}
594
595	/* Scrub the inode core. */
596	xfs_inode_to_disk(sc->ip, &di, 0);
597	xchk_dinode(sc, &di, sc->ip->i_ino);
598	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
599		goto out;
600
601	/*
602	 * Look for discrepancies between file's data blocks and the reflink
603	 * iflag.  We already checked the iflag against the file mode when
604	 * we scrubbed the dinode.
605	 */
606	if (S_ISREG(VFS_I(sc->ip)->i_mode))
607		xchk_inode_check_reflink_iflag(sc, sc->ip->i_ino);
608
609	xchk_inode_xref(sc, sc->ip->i_ino, &di);
610out:
611	return error;
612}
613