1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Copyright (c) 2022 Fujitsu.  All Rights Reserved.
4 */
5
6#include "xfs.h"
7#include "xfs_shared.h"
8#include "xfs_format.h"
9#include "xfs_log_format.h"
10#include "xfs_trans_resv.h"
11#include "xfs_mount.h"
12#include "xfs_alloc.h"
13#include "xfs_bit.h"
14#include "xfs_btree.h"
15#include "xfs_inode.h"
16#include "xfs_icache.h"
17#include "xfs_rmap.h"
18#include "xfs_rmap_btree.h"
19#include "xfs_rtalloc.h"
20#include "xfs_trans.h"
21#include "xfs_ag.h"
22
23#include <linux/mm.h>
24#include <linux/dax.h>
25#include <linux/fs.h>
26
/*
 * State shared between xfs_dax_notify_ddev_failure() and the per-record rmap
 * callback xfs_dax_failure_fn() while one allocation group is being queried.
 */
struct xfs_failure_info {
	/* First AG block of the notified range in the AG being queried. */
	xfs_agblock_t		startblock;
	/* Number of blocks of the notified range in that AG. */
	xfs_extlen_t		blockcount;
	/* MF_* flags passed in with the failure notification. */
	int			mf_flags;
	/* Set by the rmap callback when the caller should shut down the fs. */
	bool			want_shutdown;
};
33
34static pgoff_t
35xfs_failure_pgoff(
36	struct xfs_mount		*mp,
37	const struct xfs_rmap_irec	*rec,
38	const struct xfs_failure_info	*notify)
39{
40	loff_t				pos = XFS_FSB_TO_B(mp, rec->rm_offset);
41
42	if (notify->startblock > rec->rm_startblock)
43		pos += XFS_FSB_TO_B(mp,
44				notify->startblock - rec->rm_startblock);
45	return pos >> PAGE_SHIFT;
46}
47
48static unsigned long
49xfs_failure_pgcnt(
50	struct xfs_mount		*mp,
51	const struct xfs_rmap_irec	*rec,
52	const struct xfs_failure_info	*notify)
53{
54	xfs_agblock_t			end_rec;
55	xfs_agblock_t			end_notify;
56	xfs_agblock_t			start_cross;
57	xfs_agblock_t			end_cross;
58
59	start_cross = max(rec->rm_startblock, notify->startblock);
60
61	end_rec = rec->rm_startblock + rec->rm_blockcount;
62	end_notify = notify->startblock + notify->blockcount;
63	end_cross = min(end_rec, end_notify);
64
65	return XFS_FSB_TO_B(mp, end_cross - start_cross) >> PAGE_SHIFT;
66}
67
68static int
69xfs_dax_failure_fn(
70	struct xfs_btree_cur		*cur,
71	const struct xfs_rmap_irec	*rec,
72	void				*data)
73{
74	struct xfs_mount		*mp = cur->bc_mp;
75	struct xfs_inode		*ip;
76	struct xfs_failure_info		*notify = data;
77	struct address_space		*mapping;
78	pgoff_t				pgoff;
79	unsigned long			pgcnt;
80	int				error = 0;
81
82	if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) ||
83	    (rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))) {
84		/* Continue the query because this isn't a failure. */
85		if (notify->mf_flags & MF_MEM_PRE_REMOVE)
86			return 0;
87		notify->want_shutdown = true;
88		return 0;
89	}
90
91	/* Get files that incore, filter out others that are not in use. */
92	error = xfs_iget(mp, cur->bc_tp, rec->rm_owner, XFS_IGET_INCORE,
93			 0, &ip);
94	/* Continue the rmap query if the inode isn't incore */
95	if (error == -ENODATA)
96		return 0;
97	if (error) {
98		notify->want_shutdown = true;
99		return 0;
100	}
101
102	mapping = VFS_I(ip)->i_mapping;
103	pgoff = xfs_failure_pgoff(mp, rec, notify);
104	pgcnt = xfs_failure_pgcnt(mp, rec, notify);
105
106	/* Continue the rmap query if the inode isn't a dax file. */
107	if (dax_mapping(mapping))
108		error = mf_dax_kill_procs(mapping, pgoff, pgcnt,
109					  notify->mf_flags);
110
111	/* Invalidate the cache in dax pages. */
112	if (notify->mf_flags & MF_MEM_PRE_REMOVE)
113		invalidate_inode_pages2_range(mapping, pgoff,
114					      pgoff + pgcnt - 1);
115
116	xfs_irele(ip);
117	return error;
118}
119
120static int
121xfs_dax_notify_failure_freeze(
122	struct xfs_mount	*mp)
123{
124	struct super_block	*sb = mp->m_super;
125	int			error;
126
127	error = freeze_super(sb, FREEZE_HOLDER_KERNEL);
128	if (error)
129		xfs_emerg(mp, "already frozen by kernel, err=%d", error);
130
131	return error;
132}
133
134static void
135xfs_dax_notify_failure_thaw(
136	struct xfs_mount	*mp,
137	bool			kernel_frozen)
138{
139	struct super_block	*sb = mp->m_super;
140	int			error;
141
142	if (kernel_frozen) {
143		error = thaw_super(sb, FREEZE_HOLDER_KERNEL);
144		if (error)
145			xfs_emerg(mp, "still frozen after notify failure, err=%d",
146				error);
147	}
148
149	/*
150	 * Also thaw userspace call anyway because the device is about to be
151	 * removed immediately.
152	 */
153	thaw_super(sb, FREEZE_HOLDER_USERSPACE);
154}
155
156static int
157xfs_dax_notify_ddev_failure(
158	struct xfs_mount	*mp,
159	xfs_daddr_t		daddr,
160	xfs_daddr_t		bblen,
161	int			mf_flags)
162{
163	struct xfs_failure_info	notify = { .mf_flags = mf_flags };
164	struct xfs_trans	*tp = NULL;
165	struct xfs_btree_cur	*cur = NULL;
166	struct xfs_buf		*agf_bp = NULL;
167	int			error = 0;
168	bool			kernel_frozen = false;
169	xfs_fsblock_t		fsbno = XFS_DADDR_TO_FSB(mp, daddr);
170	xfs_agnumber_t		agno = XFS_FSB_TO_AGNO(mp, fsbno);
171	xfs_fsblock_t		end_fsbno = XFS_DADDR_TO_FSB(mp,
172							     daddr + bblen - 1);
173	xfs_agnumber_t		end_agno = XFS_FSB_TO_AGNO(mp, end_fsbno);
174
175	if (mf_flags & MF_MEM_PRE_REMOVE) {
176		xfs_info(mp, "Device is about to be removed!");
177		/*
178		 * Freeze fs to prevent new mappings from being created.
179		 * - Keep going on if others already hold the kernel forzen.
180		 * - Keep going on if other errors too because this device is
181		 *   starting to fail.
182		 * - If kernel frozen state is hold successfully here, thaw it
183		 *   here as well at the end.
184		 */
185		kernel_frozen = xfs_dax_notify_failure_freeze(mp) == 0;
186	}
187
188	error = xfs_trans_alloc_empty(mp, &tp);
189	if (error)
190		goto out;
191
192	for (; agno <= end_agno; agno++) {
193		struct xfs_rmap_irec	ri_low = { };
194		struct xfs_rmap_irec	ri_high;
195		struct xfs_agf		*agf;
196		struct xfs_perag	*pag;
197		xfs_agblock_t		range_agend;
198
199		pag = xfs_perag_get(mp, agno);
200		error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp);
201		if (error) {
202			xfs_perag_put(pag);
203			break;
204		}
205
206		cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, pag);
207
208		/*
209		 * Set the rmap range from ri_low to ri_high, which represents
210		 * a [start, end] where we looking for the files or metadata.
211		 */
212		memset(&ri_high, 0xFF, sizeof(ri_high));
213		ri_low.rm_startblock = XFS_FSB_TO_AGBNO(mp, fsbno);
214		if (agno == end_agno)
215			ri_high.rm_startblock = XFS_FSB_TO_AGBNO(mp, end_fsbno);
216
217		agf = agf_bp->b_addr;
218		range_agend = min(be32_to_cpu(agf->agf_length) - 1,
219				ri_high.rm_startblock);
220		notify.startblock = ri_low.rm_startblock;
221		notify.blockcount = range_agend + 1 - ri_low.rm_startblock;
222
223		error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
224				xfs_dax_failure_fn, &notify);
225		xfs_btree_del_cursor(cur, error);
226		xfs_trans_brelse(tp, agf_bp);
227		xfs_perag_put(pag);
228		if (error)
229			break;
230
231		fsbno = XFS_AGB_TO_FSB(mp, agno + 1, 0);
232	}
233
234	xfs_trans_cancel(tp);
235
236	/*
237	 * Shutdown fs from a force umount in pre-remove case which won't fail,
238	 * so errors can be ignored.  Otherwise, shutdown the filesystem with
239	 * CORRUPT flag if error occured or notify.want_shutdown was set during
240	 * RMAP querying.
241	 */
242	if (mf_flags & MF_MEM_PRE_REMOVE)
243		xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
244	else if (error || notify.want_shutdown) {
245		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
246		if (!error)
247			error = -EFSCORRUPTED;
248	}
249
250out:
251	/* Thaw the fs if it has been frozen before. */
252	if (mf_flags & MF_MEM_PRE_REMOVE)
253		xfs_dax_notify_failure_thaw(mp, kernel_frozen);
254
255	return error;
256}
257
258static int
259xfs_dax_notify_failure(
260	struct dax_device	*dax_dev,
261	u64			offset,
262	u64			len,
263	int			mf_flags)
264{
265	struct xfs_mount	*mp = dax_holder(dax_dev);
266	u64			ddev_start;
267	u64			ddev_end;
268
269	if (!(mp->m_super->s_flags & SB_BORN)) {
270		xfs_warn(mp, "filesystem is not ready for notify_failure()!");
271		return -EIO;
272	}
273
274	if (mp->m_rtdev_targp && mp->m_rtdev_targp->bt_daxdev == dax_dev) {
275		xfs_debug(mp,
276			 "notify_failure() not supported on realtime device!");
277		return -EOPNOTSUPP;
278	}
279
280	if (mp->m_logdev_targp && mp->m_logdev_targp->bt_daxdev == dax_dev &&
281	    mp->m_logdev_targp != mp->m_ddev_targp) {
282		/*
283		 * In the pre-remove case the failure notification is attempting
284		 * to trigger a force unmount.  The expectation is that the
285		 * device is still present, but its removal is in progress and
286		 * can not be cancelled, proceed with accessing the log device.
287		 */
288		if (mf_flags & MF_MEM_PRE_REMOVE)
289			return 0;
290		xfs_err(mp, "ondisk log corrupt, shutting down fs!");
291		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
292		return -EFSCORRUPTED;
293	}
294
295	if (!xfs_has_rmapbt(mp)) {
296		xfs_debug(mp, "notify_failure() needs rmapbt enabled!");
297		return -EOPNOTSUPP;
298	}
299
300	ddev_start = mp->m_ddev_targp->bt_dax_part_off;
301	ddev_end = ddev_start + bdev_nr_bytes(mp->m_ddev_targp->bt_bdev) - 1;
302
303	/* Notify failure on the whole device. */
304	if (offset == 0 && len == U64_MAX) {
305		offset = ddev_start;
306		len = bdev_nr_bytes(mp->m_ddev_targp->bt_bdev);
307	}
308
309	/* Ignore the range out of filesystem area */
310	if (offset + len - 1 < ddev_start)
311		return -ENXIO;
312	if (offset > ddev_end)
313		return -ENXIO;
314
315	/* Calculate the real range when it touches the boundary */
316	if (offset > ddev_start)
317		offset -= ddev_start;
318	else {
319		len -= ddev_start - offset;
320		offset = 0;
321	}
322	if (offset + len - 1 > ddev_end)
323		len = ddev_end - offset + 1;
324
325	return xfs_dax_notify_ddev_failure(mp, BTOBB(offset), BTOBB(len),
326			mf_flags);
327}
328
/* Holder operations table; routes ->notify_failure to xfs_dax_notify_failure(). */
const struct dax_holder_operations xfs_dax_holder_operations = {
	.notify_failure		= xfs_dax_notify_failure,
};
332