1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * Copyright (c) 2020-2024 Oracle.  All Rights Reserved.
4 * Author: Darrick J. Wong <djwong@kernel.org>
5 */
6#include "xfs.h"
7#include "xfs_fs.h"
8#include "xfs_shared.h"
9#include "xfs_format.h"
10#include "xfs_log_format.h"
11#include "xfs_trans_resv.h"
12#include "xfs_mount.h"
13#include "xfs_defer.h"
14#include "xfs_inode.h"
15#include "xfs_trans.h"
16#include "xfs_bmap.h"
17#include "xfs_icache.h"
18#include "xfs_quota.h"
19#include "xfs_exchmaps.h"
20#include "xfs_trace.h"
21#include "xfs_bmap_btree.h"
22#include "xfs_trans_space.h"
23#include "xfs_error.h"
24#include "xfs_errortag.h"
25#include "xfs_health.h"
26#include "xfs_exchmaps_item.h"
27#include "xfs_da_format.h"
28#include "xfs_da_btree.h"
29#include "xfs_attr_leaf.h"
30#include "xfs_attr.h"
31#include "xfs_dir2_priv.h"
32#include "xfs_dir2.h"
33#include "xfs_symlink_remote.h"
34
/* Slab cache from which incore exchmaps intent items are allocated. */
struct kmem_cache	*xfs_exchmaps_intent_cache;
36
/*
 * bmbt mappings adjacent to a pair of records.  The "1" records are the left
 * and right neighbours of the current mapping in file1; the "2" records are
 * the neighbours of the current mapping in file2.
 */
struct xfs_exchmaps_adjacent {
	struct xfs_bmbt_irec		left1;
	struct xfs_bmbt_irec		right1;
	struct xfs_bmbt_irec		left2;
	struct xfs_bmbt_irec		right2;
};
44
/* Initializer that starts with all four neighbouring mappings marked as holes. */
#define ADJACENT_INIT { \
	.left1  = { .br_startblock = HOLESTARTBLOCK }, \
	.right1 = { .br_startblock = HOLESTARTBLOCK }, \
	.left2  = { .br_startblock = HOLESTARTBLOCK }, \
	.right2 = { .br_startblock = HOLESTARTBLOCK }, \
}
51
52/* Information to reset reflink flag / CoW fork state after an exchange. */
53
54/*
55 * If the reflink flag is set on either inode, make sure it has an incore CoW
56 * fork, since all reflink inodes must have them.  If there's a CoW fork and it
57 * has mappings in it, make sure the inodes are tagged appropriately so that
58 * speculative preallocations can be GC'd if we run low of space.
59 */
60static inline void
61xfs_exchmaps_ensure_cowfork(
62	struct xfs_inode	*ip)
63{
64	struct xfs_ifork	*cfork;
65
66	if (xfs_is_reflink_inode(ip))
67		xfs_ifork_init_cow(ip);
68
69	cfork = xfs_ifork_ptr(ip, XFS_COW_FORK);
70	if (!cfork)
71		return;
72	if (cfork->if_bytes > 0)
73		xfs_inode_set_cowblocks_tag(ip);
74	else
75		xfs_inode_clear_cowblocks_tag(ip);
76}
77
78/*
79 * Adjust the on-disk inode size upwards if needed so that we never add
80 * mappings into the file past EOF.  This is crucial so that log recovery won't
81 * get confused by the sudden appearance of post-eof mappings.
82 */
83STATIC void
84xfs_exchmaps_update_size(
85	struct xfs_trans	*tp,
86	struct xfs_inode	*ip,
87	struct xfs_bmbt_irec	*imap,
88	xfs_fsize_t		new_isize)
89{
90	struct xfs_mount	*mp = tp->t_mountp;
91	xfs_fsize_t		len;
92
93	if (new_isize < 0)
94		return;
95
96	len = min(XFS_FSB_TO_B(mp, imap->br_startoff + imap->br_blockcount),
97		  new_isize);
98
99	if (len <= ip->i_disk_size)
100		return;
101
102	trace_xfs_exchmaps_update_inode_size(ip, len);
103
104	ip->i_disk_size = len;
105	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
106}
107
108/* Advance the incore state tracking after exchanging a mapping. */
109static inline void
110xmi_advance(
111	struct xfs_exchmaps_intent	*xmi,
112	const struct xfs_bmbt_irec	*irec)
113{
114	xmi->xmi_startoff1 += irec->br_blockcount;
115	xmi->xmi_startoff2 += irec->br_blockcount;
116	xmi->xmi_blockcount -= irec->br_blockcount;
117}
118
119/* Do we still have more mappings to exchange? */
120static inline bool
121xmi_has_more_exchange_work(const struct xfs_exchmaps_intent *xmi)
122{
123	return xmi->xmi_blockcount > 0;
124}
125
126/* Do we have post-operation cleanups to perform? */
127static inline bool
128xmi_has_postop_work(const struct xfs_exchmaps_intent *xmi)
129{
130	return xmi->xmi_flags & (XFS_EXCHMAPS_CLEAR_INO1_REFLINK |
131				 XFS_EXCHMAPS_CLEAR_INO2_REFLINK |
132				 __XFS_EXCHMAPS_INO2_SHORTFORM);
133}
134
135/* Check all mappings to make sure we can actually exchange them. */
136int
137xfs_exchmaps_check_forks(
138	struct xfs_mount		*mp,
139	const struct xfs_exchmaps_req	*req)
140{
141	struct xfs_ifork		*ifp1, *ifp2;
142	int				whichfork = xfs_exchmaps_reqfork(req);
143
144	/* No fork? */
145	ifp1 = xfs_ifork_ptr(req->ip1, whichfork);
146	ifp2 = xfs_ifork_ptr(req->ip2, whichfork);
147	if (!ifp1 || !ifp2)
148		return -EINVAL;
149
150	/* We don't know how to exchange local format forks. */
151	if (ifp1->if_format == XFS_DINODE_FMT_LOCAL ||
152	    ifp2->if_format == XFS_DINODE_FMT_LOCAL)
153		return -EINVAL;
154
155	return 0;
156}
157
#ifdef CONFIG_XFS_QUOTA
/*
 * Log the quota block count changes that result from exchanging @irec1
 * (currently in file1) with @irec2 (currently in file2).  Only real extents
 * count: blocks leaving a file are subtracted from it and added to the other
 * file.  The realtime block counter is used if file1 is a realtime inode,
 * otherwise the regular block counter.
 */
static inline void
xfs_exchmaps_update_quota(
	struct xfs_trans		*tp,
	struct xfs_exchmaps_intent	*xmi,
	struct xfs_bmbt_irec		*irec1,
	struct xfs_bmbt_irec		*irec2)
{
	int64_t				ip1_delta = 0, ip2_delta = 0;
	unsigned int			qflag;

	if (XFS_IS_REALTIME_INODE(xmi->xmi_ip1))
		qflag = XFS_TRANS_DQ_RTBCOUNT;
	else
		qflag = XFS_TRANS_DQ_BCOUNT;

	/* file1's mapping moves over to file2. */
	if (xfs_bmap_is_real_extent(irec1)) {
		ip2_delta += irec1->br_blockcount;
		ip1_delta -= irec1->br_blockcount;
	}

	/* file2's mapping moves over to file1. */
	if (xfs_bmap_is_real_extent(irec2)) {
		ip2_delta -= irec2->br_blockcount;
		ip1_delta += irec2->br_blockcount;
	}

	xfs_trans_mod_dquot_byino(tp, xmi->xmi_ip1, qflag, ip1_delta);
	xfs_trans_mod_dquot_byino(tp, xmi->xmi_ip2, qflag, ip2_delta);
}
#else
# define xfs_exchmaps_update_quota(tp, xmi, irec1, irec2)	((void)0)
#endif
189
/*
 * Decide if we want to skip this mapping from file1.
 *
 * Returns true if the caller should skip @irec and false if the mapping must
 * be exchanged.  Skipping only happens when XFS_EXCHMAPS_INO1_WRITTEN is set
 * in the intent flags and the mapping is not a written extent.
 *
 * Side effect: for files with a multi-fsb realtime allocation unit,
 * @irec->br_blockcount may be shortened so that the part that is skipped (or
 * exchanged) stops at an rtx boundary.
 */
static inline bool
xfs_exchmaps_can_skip_mapping(
	struct xfs_exchmaps_intent	*xmi,
	struct xfs_bmbt_irec		*irec)
{
	struct xfs_mount		*mp = xmi->xmi_ip1->i_mount;

	/* Do not skip this mapping if the caller did not tell us to. */
	if (!(xmi->xmi_flags & XFS_EXCHMAPS_INO1_WRITTEN))
		return false;

	/* Do not skip mapped, written mappings. */
	if (xfs_bmap_is_written_extent(irec))
		return false;

	/*
	 * The mapping is unwritten or a hole.  It cannot be a delalloc
	 * reservation because we already excluded those.  It cannot be an
	 * unwritten extent with dirty page cache because we flushed the page
	 * cache.  For files where the allocation unit is 1FSB (files on the
	 * data dev, rt files if the extent size is 1FSB), we can safely
	 * skip this mapping.
	 */
	if (!xfs_inode_has_bigrtalloc(xmi->xmi_ip1))
		return true;

	/*
	 * For a realtime file with a multi-fsb allocation unit, the decision
	 * is trickier because we can only swap full allocation units.
	 * Unwritten mappings can appear in the middle of an rtx if the rtx is
	 * partially written, but they can also appear for preallocations.
	 *
	 * If the mapping is a hole, skip it entirely.  Holes should align with
	 * rtx boundaries.
	 */
	if (!xfs_bmap_is_real_extent(irec))
		return true;

	/*
	 * All mappings below this point are unwritten.
	 *
	 * - If the beginning is not aligned to an rtx, trim the end of the
	 *   mapping so that it does not cross an rtx boundary, and swap it.
	 *
	 * - If both ends are aligned to an rtx, skip the entire mapping.
	 */
	if (!isaligned_64(irec->br_startoff, mp->m_sb.sb_rextsize)) {
		xfs_fileoff_t	new_end;

		new_end = roundup_64(irec->br_startoff, mp->m_sb.sb_rextsize);
		irec->br_blockcount = min(irec->br_blockcount,
					  new_end - irec->br_startoff);
		return false;
	}
	if (isaligned_64(irec->br_blockcount, mp->m_sb.sb_rextsize))
		return true;

	/*
	 * All mappings below this point are unwritten, start on an rtx
	 * boundary, and do not end on an rtx boundary.
	 *
	 * - If the mapping is longer than one rtx, trim the end of the mapping
	 *   down to an rtx boundary and skip it.
	 *
	 * - The mapping is shorter than one rtx.  Swap it.
	 */
	if (irec->br_blockcount > mp->m_sb.sb_rextsize) {
		xfs_fileoff_t	new_end;

		new_end = rounddown_64(irec->br_startoff + irec->br_blockcount,
				mp->m_sb.sb_rextsize);
		irec->br_blockcount = new_end - irec->br_startoff;
		return true;
	}

	return false;
}
268
/*
 * Walk forward through the file ranges in @xmi until we find two different
 * mappings to exchange.  If there is work to do, return the mappings;
 * otherwise we've reached the end of the range and xmi_blockcount will be
 * zero.
 *
 * If the walk skips over a pair of mappings to the same storage, save them as
 * the left records in @adj (if provided) so that the simulation phase can
 * avoid an extra lookup.
 *
 * On a successful return with work remaining, @irec1 and @irec2 describe the
 * mappings at the current offsets of file1 and file2, and @irec1's length has
 * been trimmed to the shorter of the two.  The intent cursor is advanced past
 * every mapping that was skipped.
 *
 * Returns 0 on success, -EINVAL if a mapping lookup returns something
 * unexpected, -EFSCORRUPTED if two mappings of the same physical block
 * disagree about extent state, or the error from xfs_bmapi_read.
 */
static int
xfs_exchmaps_find_mappings(
	struct xfs_exchmaps_intent	*xmi,
	struct xfs_bmbt_irec		*irec1,
	struct xfs_bmbt_irec		*irec2,
	struct xfs_exchmaps_adjacent	*adj)
{
	int				nimaps;
	int				bmap_flags;
	int				error;

	bmap_flags = xfs_bmapi_aflag(xfs_exchmaps_whichfork(xmi));

	/*
	 * Each pass that does not return advances the cursor by the length of
	 * irec1, whether it got there by "continue" or by falling off the end
	 * of the loop body.
	 */
	for (; xmi_has_more_exchange_work(xmi); xmi_advance(xmi, irec1)) {
		/* Read mapping from the first file */
		nimaps = 1;
		error = xfs_bmapi_read(xmi->xmi_ip1, xmi->xmi_startoff1,
				xmi->xmi_blockcount, irec1, &nimaps,
				bmap_flags);
		if (error)
			return error;
		if (nimaps != 1 ||
		    irec1->br_startblock == DELAYSTARTBLOCK ||
		    irec1->br_startoff != xmi->xmi_startoff1) {
			/*
			 * We should never get no mapping or a delalloc mapping
			 * or something that doesn't match what we asked for,
			 * since the caller flushed both inodes and we hold the
			 * ILOCKs for both inodes.
			 */
			ASSERT(0);
			return -EINVAL;
		}

		if (xfs_exchmaps_can_skip_mapping(xmi, irec1)) {
			trace_xfs_exchmaps_mapping1_skip(xmi->xmi_ip1, irec1);
			continue;
		}

		/* Read mapping from the second file */
		nimaps = 1;
		error = xfs_bmapi_read(xmi->xmi_ip2, xmi->xmi_startoff2,
				irec1->br_blockcount, irec2, &nimaps,
				bmap_flags);
		if (error)
			return error;
		if (nimaps != 1 ||
		    irec2->br_startblock == DELAYSTARTBLOCK ||
		    irec2->br_startoff != xmi->xmi_startoff2) {
			/*
			 * We should never get no mapping or a delalloc mapping
			 * or something that doesn't match what we asked for,
			 * since the caller flushed both inodes and we hold the
			 * ILOCKs for both inodes.
			 */
			ASSERT(0);
			return -EINVAL;
		}

		/*
		 * We can only exchange as many blocks as the smaller of the
		 * two mapping maps.
		 */
		irec1->br_blockcount = min(irec1->br_blockcount,
					   irec2->br_blockcount);

		trace_xfs_exchmaps_mapping1(xmi->xmi_ip1, irec1);
		trace_xfs_exchmaps_mapping2(xmi->xmi_ip2, irec2);

		/* We found something to exchange, so return it. */
		if (irec1->br_startblock != irec2->br_startblock)
			return 0;

		/*
		 * Two mappings pointing to the same physical block must not
		 * have different states; that's filesystem corruption.  Move
		 * on to the next mapping if they're both holes or both point
		 * to the same physical space extent.
		 */
		if (irec1->br_state != irec2->br_state) {
			xfs_bmap_mark_sick(xmi->xmi_ip1,
					xfs_exchmaps_whichfork(xmi));
			xfs_bmap_mark_sick(xmi->xmi_ip2,
					xfs_exchmaps_whichfork(xmi));
			return -EFSCORRUPTED;
		}

		/*
		 * Save the mappings if we're estimating work and skipping
		 * these identical mappings.
		 */
		if (adj) {
			memcpy(&adj->left1, irec1, sizeof(*irec1));
			memcpy(&adj->left2, irec2, sizeof(*irec2));
		}
	}

	return 0;
}
378
/*
 * Exchange these two mappings.
 *
 * @irec1 is the mapping currently in file1 and @irec2 the mapping currently
 * in file2.  Quota accounting is updated, both mappings are unmapped, and each
 * is then mapped into the other file.  Note that the br_startoff fields of
 * @irec1 and @irec2 are swapped as a side effect.  For data fork exchanges,
 * the on-disk inode sizes are bumped as needed.  Finally the intent cursor is
 * advanced past the exchanged range.
 */
static void
xfs_exchmaps_one_step(
	struct xfs_trans		*tp,
	struct xfs_exchmaps_intent	*xmi,
	struct xfs_bmbt_irec		*irec1,
	struct xfs_bmbt_irec		*irec2)
{
	int				whichfork = xfs_exchmaps_whichfork(xmi);

	xfs_exchmaps_update_quota(tp, xmi, irec1, irec2);

	/* Remove both mappings. */
	xfs_bmap_unmap_extent(tp, xmi->xmi_ip1, whichfork, irec1);
	xfs_bmap_unmap_extent(tp, xmi->xmi_ip2, whichfork, irec2);

	/*
	 * Re-add both mappings.  We exchange the file offsets between the two
	 * maps and add the opposite map, which has the effect of filling the
	 * logical offsets we just unmapped, but with the physical mapping
	 * information exchanged.
	 */
	swap(irec1->br_startoff, irec2->br_startoff);
	xfs_bmap_map_extent(tp, xmi->xmi_ip1, whichfork, irec2);
	xfs_bmap_map_extent(tp, xmi->xmi_ip2, whichfork, irec1);

	/* Make sure we're not adding mappings past EOF. */
	if (whichfork == XFS_DATA_FORK) {
		xfs_exchmaps_update_size(tp, xmi->xmi_ip1, irec2,
				xmi->xmi_isize1);
		xfs_exchmaps_update_size(tp, xmi->xmi_ip2, irec1,
				xmi->xmi_isize2);
	}

	/*
	 * Advance our cursor and exit.   The caller (either defer ops or log
	 * recovery) will log the XMD item, and if *blockcount is nonzero, it
	 * will log a new XMI item for the remainder and call us back.
	 */
	xmi_advance(xmi, irec1);
}
420
421/* Convert inode2's leaf attr fork back to shortform, if possible.. */
422STATIC int
423xfs_exchmaps_attr_to_sf(
424	struct xfs_trans		*tp,
425	struct xfs_exchmaps_intent	*xmi)
426{
427	struct xfs_da_args	args = {
428		.dp		= xmi->xmi_ip2,
429		.geo		= tp->t_mountp->m_attr_geo,
430		.whichfork	= XFS_ATTR_FORK,
431		.trans		= tp,
432		.owner		= xmi->xmi_ip2->i_ino,
433	};
434	struct xfs_buf		*bp;
435	int			forkoff;
436	int			error;
437
438	if (!xfs_attr_is_leaf(xmi->xmi_ip2))
439		return 0;
440
441	error = xfs_attr3_leaf_read(tp, xmi->xmi_ip2, xmi->xmi_ip2->i_ino, 0,
442			&bp);
443	if (error)
444		return error;
445
446	forkoff = xfs_attr_shortform_allfit(bp, xmi->xmi_ip2);
447	if (forkoff == 0)
448		return 0;
449
450	return xfs_attr3_leaf_to_shortform(bp, &args, forkoff);
451}
452
453/* Convert inode2's block dir fork back to shortform, if possible.. */
454STATIC int
455xfs_exchmaps_dir_to_sf(
456	struct xfs_trans		*tp,
457	struct xfs_exchmaps_intent	*xmi)
458{
459	struct xfs_da_args	args = {
460		.dp		= xmi->xmi_ip2,
461		.geo		= tp->t_mountp->m_dir_geo,
462		.whichfork	= XFS_DATA_FORK,
463		.trans		= tp,
464		.owner		= xmi->xmi_ip2->i_ino,
465	};
466	struct xfs_dir2_sf_hdr	sfh;
467	struct xfs_buf		*bp;
468	int			size;
469	int			error = 0;
470
471	if (xfs_dir2_format(&args, &error) != XFS_DIR2_FMT_BLOCK)
472		return error;
473
474	error = xfs_dir3_block_read(tp, xmi->xmi_ip2, xmi->xmi_ip2->i_ino, &bp);
475	if (error)
476		return error;
477
478	size = xfs_dir2_block_sfsize(xmi->xmi_ip2, bp->b_addr, &sfh);
479	if (size > xfs_inode_data_fork_size(xmi->xmi_ip2))
480		return 0;
481
482	return xfs_dir2_block_to_sf(&args, bp, size, &sfh);
483}
484
/*
 * Convert inode2's remote symlink target back to shortform, if possible.
 *
 * Returns 0 without doing anything if the data fork is already in local
 * format or if the target (i_disk_size bytes) is too large for the inode's
 * data fork.  Otherwise the target is read into a temporary buffer, the
 * remote blocks are removed, and the data fork is rebuilt in local format
 * from the buffer.  Returns 0 on success or a negative errno.
 */
STATIC int
xfs_exchmaps_link_to_sf(
	struct xfs_trans		*tp,
	struct xfs_exchmaps_intent	*xmi)
{
	struct xfs_inode		*ip = xmi->xmi_ip2;
	struct xfs_ifork		*ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
	char				*buf;
	int				error;

	if (ifp->if_format == XFS_DINODE_FMT_LOCAL ||
	    ip->i_disk_size > xfs_inode_data_fork_size(ip))
		return 0;

	/* Read the current symlink target into a buffer. */
	buf = kmalloc(ip->i_disk_size + 1,
			GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
	/*
	 * NOTE(review): with __GFP_NOFAIL this allocation presumably cannot
	 * return NULL, which would make this check purely defensive.
	 */
	if (!buf) {
		ASSERT(0);
		return -ENOMEM;
	}

	error = xfs_symlink_remote_read(ip, buf);
	if (error)
		goto free;

	/* Remove the blocks. */
	error = xfs_symlink_remote_truncate(tp, ip);
	if (error)
		goto free;

	/*
	 * Convert fork to local format and log our changes.  buf is freed on
	 * every path below, so xfs_init_local_fork presumably copies the
	 * target out of it.
	 */
	xfs_idestroy_fork(ifp);
	ifp->if_bytes = 0;
	ifp->if_format = XFS_DINODE_FMT_LOCAL;
	xfs_init_local_fork(ip, XFS_DATA_FORK, buf, ip->i_disk_size);
	xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE);
free:
	kfree(buf);
	return error;
}
527
528/* Clear the reflink flag after an exchange. */
529static inline void
530xfs_exchmaps_clear_reflink(
531	struct xfs_trans	*tp,
532	struct xfs_inode	*ip)
533{
534	trace_xfs_reflink_unset_inode_flag(ip);
535
536	ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
537	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
538}
539
540/* Finish whatever work might come after an exchange operation. */
541static int
542xfs_exchmaps_do_postop_work(
543	struct xfs_trans		*tp,
544	struct xfs_exchmaps_intent	*xmi)
545{
546	if (xmi->xmi_flags & __XFS_EXCHMAPS_INO2_SHORTFORM) {
547		int			error = 0;
548
549		if (xmi->xmi_flags & XFS_EXCHMAPS_ATTR_FORK)
550			error = xfs_exchmaps_attr_to_sf(tp, xmi);
551		else if (S_ISDIR(VFS_I(xmi->xmi_ip2)->i_mode))
552			error = xfs_exchmaps_dir_to_sf(tp, xmi);
553		else if (S_ISLNK(VFS_I(xmi->xmi_ip2)->i_mode))
554			error = xfs_exchmaps_link_to_sf(tp, xmi);
555		xmi->xmi_flags &= ~__XFS_EXCHMAPS_INO2_SHORTFORM;
556		if (error)
557			return error;
558	}
559
560	if (xmi->xmi_flags & XFS_EXCHMAPS_CLEAR_INO1_REFLINK) {
561		xfs_exchmaps_clear_reflink(tp, xmi->xmi_ip1);
562		xmi->xmi_flags &= ~XFS_EXCHMAPS_CLEAR_INO1_REFLINK;
563	}
564
565	if (xmi->xmi_flags & XFS_EXCHMAPS_CLEAR_INO2_REFLINK) {
566		xfs_exchmaps_clear_reflink(tp, xmi->xmi_ip2);
567		xmi->xmi_flags &= ~XFS_EXCHMAPS_CLEAR_INO2_REFLINK;
568	}
569
570	return 0;
571}
572
/*
 * Finish one step in a mapping exchange operation, possibly relogging.
 *
 * Each call performs at most one unit of work: either exchanging one pair of
 * mappings (and, once the range is exhausted, setting the on-disk file sizes
 * if XFS_EXCHMAPS_SET_SIZES was requested), or the post-operation cleanup.
 *
 * Returns 0 when all exchange and post-op work is complete, -EAGAIN when more
 * work remains and the caller must call again, -EIO if the
 * XFS_ERRTAG_EXCHMAPS_FINISH_ONE error injection test triggers, or another
 * negative errno propagated from the mapping lookup or post-op work.
 */
int
xfs_exchmaps_finish_one(
	struct xfs_trans		*tp,
	struct xfs_exchmaps_intent	*xmi)
{
	struct xfs_bmbt_irec		irec1, irec2;
	int				error;

	if (xmi_has_more_exchange_work(xmi)) {
		/*
		 * If the operation state says that some range of the files
		 * have not yet been exchanged, look for mappings in that range
		 * to exchange.  If we find some mappings, exchange them.
		 */
		error = xfs_exchmaps_find_mappings(xmi, &irec1, &irec2, NULL);
		if (error)
			return error;

		/*
		 * The search may have walked to the end of the range without
		 * finding anything, so re-check before exchanging.
		 */
		if (xmi_has_more_exchange_work(xmi))
			xfs_exchmaps_one_step(tp, xmi, &irec1, &irec2);

		/*
		 * If the caller asked us to exchange the file sizes after the
		 * exchange and either we just exchanged the last mappings in
		 * the range or we didn't find anything to exchange, update the
		 * ondisk file sizes.
		 */
		if ((xmi->xmi_flags & XFS_EXCHMAPS_SET_SIZES) &&
		    !xmi_has_more_exchange_work(xmi)) {
			xmi->xmi_ip1->i_disk_size = xmi->xmi_isize1;
			xmi->xmi_ip2->i_disk_size = xmi->xmi_isize2;

			xfs_trans_log_inode(tp, xmi->xmi_ip1, XFS_ILOG_CORE);
			xfs_trans_log_inode(tp, xmi->xmi_ip2, XFS_ILOG_CORE);
		}
	} else if (xmi_has_postop_work(xmi)) {
		/*
		 * Now that we're finished with the exchange operation,
		 * complete the post-op cleanup work.
		 */
		error = xfs_exchmaps_do_postop_work(tp, xmi);
		if (error)
			return error;
	}

	/* Error injection point, reached after this step's work is done. */
	if (XFS_TEST_ERROR(false, tp->t_mountp, XFS_ERRTAG_EXCHMAPS_FINISH_ONE))
		return -EIO;

	/* If we still have work to do, ask for a new transaction. */
	if (xmi_has_more_exchange_work(xmi) || xmi_has_postop_work(xmi)) {
		trace_xfs_exchmaps_defer(tp->t_mountp, xmi);
		return -EAGAIN;
	}

	/*
	 * If we reach here, we've finished all the exchange work and the post
	 * operation work.  The last thing we need to do before returning to
	 * the caller is to make sure that COW forks are set up correctly.
	 */
	if (!(xmi->xmi_flags & XFS_EXCHMAPS_ATTR_FORK)) {
		xfs_exchmaps_ensure_cowfork(xmi->xmi_ip1);
		xfs_exchmaps_ensure_cowfork(xmi->xmi_ip2);
	}

	return 0;
}
640
641/*
642 * Compute the amount of bmbt blocks we should reserve for each file.  In the
643 * worst case, each exchange will fill a hole with a new mapping, which could
644 * result in a btree split every time we add a new leaf block.
645 */
646static inline uint64_t
647xfs_exchmaps_bmbt_blocks(
648	struct xfs_mount		*mp,
649	const struct xfs_exchmaps_req	*req)
650{
651	return howmany_64(req->nr_exchanges,
652					XFS_MAX_CONTIG_BMAPS_PER_BLOCK(mp)) *
653			XFS_EXTENTADD_SPACE_RES(mp, xfs_exchmaps_reqfork(req));
654}
655
656/* Compute the space we should reserve for the rmap btree expansions. */
657static inline uint64_t
658xfs_exchmaps_rmapbt_blocks(
659	struct xfs_mount		*mp,
660	const struct xfs_exchmaps_req	*req)
661{
662	if (!xfs_has_rmapbt(mp))
663		return 0;
664	if (XFS_IS_REALTIME_INODE(req->ip1))
665		return 0;
666
667	return howmany_64(req->nr_exchanges,
668					XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp)) *
669			XFS_RMAPADD_SPACE_RES(mp);
670}
671
672/* Estimate the bmbt and rmapbt overhead required to exchange mappings. */
673int
674xfs_exchmaps_estimate_overhead(
675	struct xfs_exchmaps_req		*req)
676{
677	struct xfs_mount		*mp = req->ip1->i_mount;
678	xfs_filblks_t			bmbt_blocks;
679	xfs_filblks_t			rmapbt_blocks;
680	xfs_filblks_t			resblks = req->resblks;
681
682	/*
683	 * Compute the number of bmbt and rmapbt blocks we might need to handle
684	 * the estimated number of exchanges.
685	 */
686	bmbt_blocks = xfs_exchmaps_bmbt_blocks(mp, req);
687	rmapbt_blocks = xfs_exchmaps_rmapbt_blocks(mp, req);
688
689	trace_xfs_exchmaps_overhead(mp, bmbt_blocks, rmapbt_blocks);
690
691	/* Make sure the change in file block count doesn't overflow. */
692	if (check_add_overflow(req->ip1_bcount, bmbt_blocks, &req->ip1_bcount))
693		return -EFBIG;
694	if (check_add_overflow(req->ip2_bcount, bmbt_blocks, &req->ip2_bcount))
695		return -EFBIG;
696
697	/*
698	 * Add together the number of blocks we need to handle btree growth,
699	 * then add it to the number of blocks we need to reserve to this
700	 * transaction.
701	 */
702	if (check_add_overflow(resblks, bmbt_blocks, &resblks))
703		return -ENOSPC;
704	if (check_add_overflow(resblks, bmbt_blocks, &resblks))
705		return -ENOSPC;
706	if (check_add_overflow(resblks, rmapbt_blocks, &resblks))
707		return -ENOSPC;
708	if (check_add_overflow(resblks, rmapbt_blocks, &resblks))
709		return -ENOSPC;
710
711	/* Can't actually reserve more than UINT_MAX blocks. */
712	if (req->resblks > UINT_MAX)
713		return -ENOSPC;
714
715	req->resblks = resblks;
716	trace_xfs_exchmaps_final_estimate(req);
717	return 0;
718}
719
720/* Decide if we can merge two real mappings. */
721static inline bool
722xmi_can_merge(
723	const struct xfs_bmbt_irec	*b1,
724	const struct xfs_bmbt_irec	*b2)
725{
726	/* Don't merge holes. */
727	if (b1->br_startblock == HOLESTARTBLOCK ||
728	    b2->br_startblock == HOLESTARTBLOCK)
729		return false;
730
731	/* We don't merge holes. */
732	if (!xfs_bmap_is_real_extent(b1) || !xfs_bmap_is_real_extent(b2))
733		return false;
734
735	if (b1->br_startoff   + b1->br_blockcount == b2->br_startoff &&
736	    b1->br_startblock + b1->br_blockcount == b2->br_startblock &&
737	    b1->br_state			  == b2->br_state &&
738	    b1->br_blockcount + b2->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
739		return true;
740
741	return false;
742}
743
744/*
745 * Decide if we can merge three mappings.  Caller must ensure all three
746 * mappings must not be holes or delalloc reservations.
747 */
748static inline bool
749xmi_can_merge_all(
750	const struct xfs_bmbt_irec	*l,
751	const struct xfs_bmbt_irec	*m,
752	const struct xfs_bmbt_irec	*r)
753{
754	xfs_filblks_t			new_len;
755
756	new_len = l->br_blockcount + m->br_blockcount + r->br_blockcount;
757	return new_len <= XFS_MAX_BMBT_EXTLEN;
758}
759
/*
 * State bits used by xmi_delta_nextents_step.  The C* bits describe the
 * current mapping: whether it can merge with its left and/or right neighbour,
 * or whether it is a hole.
 */
#define CLEFT_CONTIG	0x01
#define CRIGHT_CONTIG	0x02
#define CHOLE		0x04
#define CBOTH_CONTIG	(CLEFT_CONTIG | CRIGHT_CONTIG)

/* The N* bits describe the same properties for the new mapping. */
#define NLEFT_CONTIG	0x10
#define NRIGHT_CONTIG	0x20
#define NHOLE		0x40
#define NBOTH_CONTIG	(NLEFT_CONTIG | NRIGHT_CONTIG)
769
/*
 * Estimate the effect of a single exchange on mapping count.
 *
 * @left and @right are the mappings on either side of @curr, the mapping that
 * is being removed from the file; @new is the mapping that replaces it.  The
 * return value is the estimated net change in the number of mappings in the
 * file: the effect of removing @curr plus the effect of adding @new.
 */
static inline int
xmi_delta_nextents_step(
	struct xfs_mount		*mp,
	const struct xfs_bmbt_irec	*left,
	const struct xfs_bmbt_irec	*curr,
	const struct xfs_bmbt_irec	*new,
	const struct xfs_bmbt_irec	*right)
{
	bool				lhole, rhole, chole, nhole;
	unsigned int			state = 0;
	int				ret = 0;

	lhole = left->br_startblock == HOLESTARTBLOCK;
	rhole = right->br_startblock == HOLESTARTBLOCK;
	chole = curr->br_startblock == HOLESTARTBLOCK;
	nhole = new->br_startblock == HOLESTARTBLOCK;

	/* Work out how curr relates to its neighbours. */
	if (chole)
		state |= CHOLE;
	if (!lhole && !chole && xmi_can_merge(left, curr))
		state |= CLEFT_CONTIG;
	if (!rhole && !chole && xmi_can_merge(curr, right))
		state |= CRIGHT_CONTIG;
	/* If all three together would be too long, only count the left merge. */
	if ((state & CBOTH_CONTIG) == CBOTH_CONTIG &&
	    !xmi_can_merge_all(left, curr, right))
		state &= ~CRIGHT_CONTIG;

	/* Work out how new would relate to the same neighbours. */
	if (nhole)
		state |= NHOLE;
	if (!lhole && !nhole && xmi_can_merge(left, new))
		state |= NLEFT_CONTIG;
	if (!rhole && !nhole && xmi_can_merge(new, right))
		state |= NRIGHT_CONTIG;
	/* If all three together would be too long, only count the left merge. */
	if ((state & NBOTH_CONTIG) == NBOTH_CONTIG &&
	    !xmi_can_merge_all(left, new, right))
		state &= ~NRIGHT_CONTIG;

	/* Effect of removing curr. */
	switch (state & (CLEFT_CONTIG | CRIGHT_CONTIG | CHOLE)) {
	case CLEFT_CONTIG | CRIGHT_CONTIG:
		/*
		 * left/curr/right are the same mapping, so deleting curr
		 * causes 2 new mappings to be created.
		 */
		ret += 2;
		break;
	case 0:
		/*
		 * curr is not contiguous with any mapping, so we remove curr
		 * completely
		 */
		ret--;
		break;
	case CHOLE:
		/* hole, do nothing */
		break;
	case CLEFT_CONTIG:
	case CRIGHT_CONTIG:
		/* trim either left or right, no change */
		break;
	}

	/* Effect of adding new. */
	switch (state & (NLEFT_CONTIG | NRIGHT_CONTIG | NHOLE)) {
	case NLEFT_CONTIG | NRIGHT_CONTIG:
		/*
		 * left/new/right will become the same mapping, so adding
		 * new causes the deletion of right.
		 */
		ret--;
		break;
	case 0:
		/* new is not contiguous with any mapping */
		ret++;
		break;
	case NHOLE:
		/* hole, do nothing. */
		break;
	case NLEFT_CONTIG:
	case NRIGHT_CONTIG:
		/* new is absorbed into left or right, no change */
		break;
	}

	trace_xfs_exchmaps_delta_nextents_step(mp, left, curr, new, right, ret,
			state);
	return ret;
}
857
858/* Make sure we don't overflow the extent (mapping) counters. */
859static inline int
860xmi_ensure_delta_nextents(
861	struct xfs_exchmaps_req	*req,
862	struct xfs_inode	*ip,
863	int64_t			delta)
864{
865	struct xfs_mount	*mp = ip->i_mount;
866	int			whichfork = xfs_exchmaps_reqfork(req);
867	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
868	uint64_t		new_nextents;
869	xfs_extnum_t		max_nextents;
870
871	if (delta < 0)
872		return 0;
873
874	/*
875	 * It's always an error if the delta causes integer overflow.  delta
876	 * needs an explicit cast here to avoid warnings about implicit casts
877	 * coded into the overflow check.
878	 */
879	if (check_add_overflow(ifp->if_nextents, (uint64_t)delta,
880				&new_nextents))
881		return -EFBIG;
882
883	if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REDUCE_MAX_IEXTENTS) &&
884	    new_nextents > 10)
885		return -EFBIG;
886
887	/*
888	 * We always promote both inodes to have large extent counts if the
889	 * superblock feature is enabled, so we only need to check against the
890	 * theoretical maximum.
891	 */
892	max_nextents = xfs_iext_max_nextents(xfs_has_large_extent_counts(mp),
893					     whichfork);
894	if (new_nextents > max_nextents)
895		return -EFBIG;
896
897	return 0;
898}
899
900/* Find the next mapping after irec. */
901static inline int
902xmi_next(
903	struct xfs_inode		*ip,
904	int				bmap_flags,
905	const struct xfs_bmbt_irec	*irec,
906	struct xfs_bmbt_irec		*nrec)
907{
908	xfs_fileoff_t			off;
909	xfs_filblks_t			blockcount;
910	int				nimaps = 1;
911	int				error;
912
913	off = irec->br_startoff + irec->br_blockcount;
914	blockcount = XFS_MAX_FILEOFF - off;
915	error = xfs_bmapi_read(ip, off, blockcount, nrec, &nimaps, bmap_flags);
916	if (error)
917		return error;
918	if (nrec->br_startblock == DELAYSTARTBLOCK ||
919	    nrec->br_startoff != off) {
920		/*
921		 * If we don't get the mapping we want, return a zero-length
922		 * mapping, which our estimator function will pretend is a hole.
923		 * We shouldn't get delalloc reservations.
924		 */
925		nrec->br_startblock = HOLESTARTBLOCK;
926	}
927
928	return 0;
929}
930
931int __init
932xfs_exchmaps_intent_init_cache(void)
933{
934	xfs_exchmaps_intent_cache = kmem_cache_create("xfs_exchmaps_intent",
935			sizeof(struct xfs_exchmaps_intent),
936			0, 0, NULL);
937
938	return xfs_exchmaps_intent_cache != NULL ? 0 : -ENOMEM;
939}
940
941void
942xfs_exchmaps_intent_destroy_cache(void)
943{
944	kmem_cache_destroy(xfs_exchmaps_intent_cache);
945	xfs_exchmaps_intent_cache = NULL;
946}
947
948/*
949 * Decide if we will exchange the reflink flags between the two files after the
950 * exchange.  The only time we want to do this is if we're exchanging all
951 * mappings under EOF and the inode reflink flags have different states.
952 */
953static inline bool
954xmi_can_exchange_reflink_flags(
955	const struct xfs_exchmaps_req	*req,
956	unsigned int			reflink_state)
957{
958	struct xfs_mount		*mp = req->ip1->i_mount;
959
960	if (hweight32(reflink_state) != 1)
961		return false;
962	if (req->startoff1 != 0 || req->startoff2 != 0)
963		return false;
964	if (req->blockcount != XFS_B_TO_FSB(mp, req->ip1->i_disk_size))
965		return false;
966	if (req->blockcount != XFS_B_TO_FSB(mp, req->ip2->i_disk_size))
967		return false;
968	return true;
969}
970
971
972/* Allocate and initialize a new incore intent item from a request. */
973struct xfs_exchmaps_intent *
974xfs_exchmaps_init_intent(
975	const struct xfs_exchmaps_req	*req)
976{
977	struct xfs_exchmaps_intent	*xmi;
978	unsigned int			rs = 0;
979
980	xmi = kmem_cache_zalloc(xfs_exchmaps_intent_cache,
981			GFP_NOFS | __GFP_NOFAIL);
982	INIT_LIST_HEAD(&xmi->xmi_list);
983	xmi->xmi_ip1 = req->ip1;
984	xmi->xmi_ip2 = req->ip2;
985	xmi->xmi_startoff1 = req->startoff1;
986	xmi->xmi_startoff2 = req->startoff2;
987	xmi->xmi_blockcount = req->blockcount;
988	xmi->xmi_isize1 = xmi->xmi_isize2 = -1;
989	xmi->xmi_flags = req->flags & XFS_EXCHMAPS_PARAMS;
990
991	if (xfs_exchmaps_whichfork(xmi) == XFS_ATTR_FORK) {
992		xmi->xmi_flags |= __XFS_EXCHMAPS_INO2_SHORTFORM;
993		return xmi;
994	}
995
996	if (req->flags & XFS_EXCHMAPS_SET_SIZES) {
997		xmi->xmi_flags |= XFS_EXCHMAPS_SET_SIZES;
998		xmi->xmi_isize1 = req->ip2->i_disk_size;
999		xmi->xmi_isize2 = req->ip1->i_disk_size;
1000	}
1001
1002	/* Record the state of each inode's reflink flag before the op. */
1003	if (xfs_is_reflink_inode(req->ip1))
1004		rs |= 1;
1005	if (xfs_is_reflink_inode(req->ip2))
1006		rs |= 2;
1007
1008	/*
1009	 * Figure out if we're clearing the reflink flags (which effectively
1010	 * exchanges them) after the operation.
1011	 */
1012	if (xmi_can_exchange_reflink_flags(req, rs)) {
1013		if (rs & 1)
1014			xmi->xmi_flags |= XFS_EXCHMAPS_CLEAR_INO1_REFLINK;
1015		if (rs & 2)
1016			xmi->xmi_flags |= XFS_EXCHMAPS_CLEAR_INO2_REFLINK;
1017	}
1018
1019	if (S_ISDIR(VFS_I(xmi->xmi_ip2)->i_mode) ||
1020	    S_ISLNK(VFS_I(xmi->xmi_ip2)->i_mode))
1021		xmi->xmi_flags |= __XFS_EXCHMAPS_INO2_SHORTFORM;
1022
1023	return xmi;
1024}
1025
1026/*
1027 * Estimate the number of exchange operations and the number of file blocks
1028 * in each file that will be affected by the exchange operation.
1029 */
1030int
1031xfs_exchmaps_estimate(
1032	struct xfs_exchmaps_req		*req)
1033{
1034	struct xfs_exchmaps_intent	*xmi;
1035	struct xfs_bmbt_irec		irec1, irec2;
1036	struct xfs_exchmaps_adjacent	adj = ADJACENT_INIT;
1037	xfs_filblks_t			ip1_blocks = 0, ip2_blocks = 0;
1038	int64_t				d_nexts1, d_nexts2;
1039	int				bmap_flags;
1040	int				error;
1041
1042	ASSERT(!(req->flags & ~XFS_EXCHMAPS_PARAMS));
1043
1044	bmap_flags = xfs_bmapi_aflag(xfs_exchmaps_reqfork(req));
1045	xmi = xfs_exchmaps_init_intent(req);
1046
1047	/*
1048	 * To guard against the possibility of overflowing the extent counters,
1049	 * we have to estimate an upper bound on the potential increase in that
1050	 * counter.  We can split the mapping at each end of the range, and for
1051	 * each step of the exchange we can split the mapping that we're
1052	 * working on if the mappings do not align.
1053	 */
1054	d_nexts1 = d_nexts2 = 3;
1055
1056	while (xmi_has_more_exchange_work(xmi)) {
1057		/*
1058		 * Walk through the file ranges until we find something to
1059		 * exchange.  Because we're simulating the exchange, pass in
1060		 * adj to capture skipped mappings for correct estimation of
1061		 * bmbt record merges.
1062		 */
1063		error = xfs_exchmaps_find_mappings(xmi, &irec1, &irec2, &adj);
1064		if (error)
1065			goto out_free;
1066		if (!xmi_has_more_exchange_work(xmi))
1067			break;
1068
1069		/* Update accounting. */
1070		if (xfs_bmap_is_real_extent(&irec1))
1071			ip1_blocks += irec1.br_blockcount;
1072		if (xfs_bmap_is_real_extent(&irec2))
1073			ip2_blocks += irec2.br_blockcount;
1074		req->nr_exchanges++;
1075
1076		/* Read the next mappings from both files. */
1077		error = xmi_next(req->ip1, bmap_flags, &irec1, &adj.right1);
1078		if (error)
1079			goto out_free;
1080
1081		error = xmi_next(req->ip2, bmap_flags, &irec2, &adj.right2);
1082		if (error)
1083			goto out_free;
1084
1085		/* Update extent count deltas. */
1086		d_nexts1 += xmi_delta_nextents_step(req->ip1->i_mount,
1087				&adj.left1, &irec1, &irec2, &adj.right1);
1088
1089		d_nexts2 += xmi_delta_nextents_step(req->ip1->i_mount,
1090				&adj.left2, &irec2, &irec1, &adj.right2);
1091
1092		/* Now pretend we exchanged the mappings. */
1093		if (xmi_can_merge(&adj.left2, &irec1))
1094			adj.left2.br_blockcount += irec1.br_blockcount;
1095		else
1096			memcpy(&adj.left2, &irec1, sizeof(irec1));
1097
1098		if (xmi_can_merge(&adj.left1, &irec2))
1099			adj.left1.br_blockcount += irec2.br_blockcount;
1100		else
1101			memcpy(&adj.left1, &irec2, sizeof(irec2));
1102
1103		xmi_advance(xmi, &irec1);
1104	}
1105
1106	/* Account for the blocks that are being exchanged. */
1107	if (XFS_IS_REALTIME_INODE(req->ip1) &&
1108	    xfs_exchmaps_reqfork(req) == XFS_DATA_FORK) {
1109		req->ip1_rtbcount = ip1_blocks;
1110		req->ip2_rtbcount = ip2_blocks;
1111	} else {
1112		req->ip1_bcount = ip1_blocks;
1113		req->ip2_bcount = ip2_blocks;
1114	}
1115
1116	/*
1117	 * Make sure that both forks have enough slack left in their extent
1118	 * counters that the exchange operation will not overflow.
1119	 */
1120	trace_xfs_exchmaps_delta_nextents(req, d_nexts1, d_nexts2);
1121	if (req->ip1 == req->ip2) {
1122		error = xmi_ensure_delta_nextents(req, req->ip1,
1123				d_nexts1 + d_nexts2);
1124	} else {
1125		error = xmi_ensure_delta_nextents(req, req->ip1, d_nexts1);
1126		if (error)
1127			goto out_free;
1128		error = xmi_ensure_delta_nextents(req, req->ip2, d_nexts2);
1129	}
1130	if (error)
1131		goto out_free;
1132
1133	trace_xfs_exchmaps_initial_estimate(req);
1134	error = xfs_exchmaps_estimate_overhead(req);
1135out_free:
1136	kmem_cache_free(xfs_exchmaps_intent_cache, xmi);
1137	return error;
1138}
1139
1140/* Set the reflink flag before an operation. */
1141static inline void
1142xfs_exchmaps_set_reflink(
1143	struct xfs_trans	*tp,
1144	struct xfs_inode	*ip)
1145{
1146	trace_xfs_reflink_set_inode_flag(ip);
1147
1148	ip->i_diflags2 |= XFS_DIFLAG2_REFLINK;
1149	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1150}
1151
1152/*
1153 * If either file has shared blocks and we're exchanging data forks, we must
1154 * flag the other file as having shared blocks so that we get the shared-block
1155 * rmap functions if we need to fix up the rmaps.
1156 */
1157void
1158xfs_exchmaps_ensure_reflink(
1159	struct xfs_trans			*tp,
1160	const struct xfs_exchmaps_intent	*xmi)
1161{
1162	unsigned int				rs = 0;
1163
1164	if (xfs_is_reflink_inode(xmi->xmi_ip1))
1165		rs |= 1;
1166	if (xfs_is_reflink_inode(xmi->xmi_ip2))
1167		rs |= 2;
1168
1169	if ((rs & 1) && !xfs_is_reflink_inode(xmi->xmi_ip2))
1170		xfs_exchmaps_set_reflink(tp, xmi->xmi_ip2);
1171
1172	if ((rs & 2) && !xfs_is_reflink_inode(xmi->xmi_ip1))
1173		xfs_exchmaps_set_reflink(tp, xmi->xmi_ip1);
1174}
1175
1176/* Set the large extent count flag before an operation if needed. */
1177static inline void
1178xfs_exchmaps_ensure_large_extent_counts(
1179	struct xfs_trans	*tp,
1180	struct xfs_inode	*ip)
1181{
1182	if (xfs_inode_has_large_extent_counts(ip))
1183		return;
1184
1185	ip->i_diflags2 |= XFS_DIFLAG2_NREXT64;
1186	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1187}
1188
1189/* Widen the extent counter fields of both inodes if necessary. */
1190void
1191xfs_exchmaps_upgrade_extent_counts(
1192	struct xfs_trans			*tp,
1193	const struct xfs_exchmaps_intent	*xmi)
1194{
1195	if (!xfs_has_large_extent_counts(tp->t_mountp))
1196		return;
1197
1198	xfs_exchmaps_ensure_large_extent_counts(tp, xmi->xmi_ip1);
1199	xfs_exchmaps_ensure_large_extent_counts(tp, xmi->xmi_ip2);
1200}
1201
1202/*
1203 * Schedule an exchange a range of mappings from one inode to another.
1204 *
1205 * The use of file mapping exchange log intent items ensures the operation can
1206 * be resumed even if the system goes down.  The caller must commit the
1207 * transaction to start the work.
1208 *
1209 * The caller must ensure the inodes must be joined to the transaction and
1210 * ILOCKd; they will still be joined to the transaction at exit.
1211 */
1212void
1213xfs_exchange_mappings(
1214	struct xfs_trans		*tp,
1215	const struct xfs_exchmaps_req	*req)
1216{
1217	struct xfs_exchmaps_intent	*xmi;
1218
1219	BUILD_BUG_ON(XFS_EXCHMAPS_INTERNAL_FLAGS & XFS_EXCHMAPS_LOGGED_FLAGS);
1220
1221	xfs_assert_ilocked(req->ip1, XFS_ILOCK_EXCL);
1222	xfs_assert_ilocked(req->ip2, XFS_ILOCK_EXCL);
1223	ASSERT(!(req->flags & ~XFS_EXCHMAPS_LOGGED_FLAGS));
1224	if (req->flags & XFS_EXCHMAPS_SET_SIZES)
1225		ASSERT(!(req->flags & XFS_EXCHMAPS_ATTR_FORK));
1226	ASSERT(xfs_has_exchange_range(tp->t_mountp));
1227
1228	if (req->blockcount == 0)
1229		return;
1230
1231	xmi = xfs_exchmaps_init_intent(req);
1232	xfs_exchmaps_defer_add(tp, xmi);
1233	xfs_exchmaps_ensure_reflink(tp, xmi);
1234	xfs_exchmaps_upgrade_extent_counts(tp, xmi);
1235}
1236