tree-log.c revision 6e8e777d
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2008 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/list_sort.h>
#include <linux/iversion.h>
#include "misc.h"
#include "ctree.h"
#include "tree-log.h"
#include "disk-io.h"
#include "locking.h"
#include "print-tree.h"
#include "backref.h"
#include "compression.h"
#include "qgroup.h"
#include "block-group.h"
#include "space-info.h"
#include "zoned.h"

/* magic values for the inode_only field in btrfs_log_inode:
 *
 * LOG_INODE_ALL means to log everything
 * LOG_INODE_EXISTS means to log just enough to recreate the inode
 * during log replay
 */
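/*
 * The remaining two values are not covered above: LOG_OTHER_INODE and
 * LOG_OTHER_INODE_ALL mark that we are logging an inode other than the one
 * fsync was called on (for instance a conflicting inode found while
 * logging); the _ALL variant logs all of that other inode's items.
 */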
enum {
	LOG_INODE_ALL,
	LOG_INODE_EXISTS,
	LOG_OTHER_INODE,
	LOG_OTHER_INODE_ALL,
};

/*
 * directory trouble cases
 *
 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
 * log, we must force a full commit before doing an fsync of the directory
 * where the unlink was done.
 * ---> record transid of last unlink/rename per directory
 *
 * mkdir foo/some_dir
 * normal commit
 * rename foo/some_dir foo2/some_dir
 * mkdir foo/some_dir
 * fsync foo/some_dir/some_file
 *
 * The fsync above will unlink the original some_dir without recording
 * it in its new location (foo2).  After a crash, some_dir will be gone
 * unless the fsync of some_file forces a full commit
 *
 * 2) we must log any new names for any file or dir that is in the fsync
 * log. ---> check inode while renaming/linking.
 *
 * 2a) we must log any new names for any file or dir during rename
 * when the directory they are being removed from was logged.
 * ---> check inode and old parent dir during rename
 *
 *  2a is actually the more important variant.  Without the extra logging
 *  a crash might unlink the old name without recreating the new one
 *
 * 3) after a crash, we must go through any directories with a link count
 * of zero and redo the rm -rf
 *
 * mkdir f1/foo
 * normal commit
 * rm -rf f1/foo
 * fsync(f1)
 *
 * The directory f1 was fully removed from the FS, but fsync was never
 * called on f1, only its parent dir.  After a crash the rm -rf must
 * be replayed.  This must be able to recurse down the entire
 * directory tree.  The inode link count fixup code takes care of the
 * ugly details.
 */

/*
 * stages for the tree walking.  The first
 * stage (0) is to only pin down the blocks we find,
 * the second stage (1) is to make sure that all the inodes
 * we find in the log are created in the subvolume.
 *
 * The last stage is to deal with directories and links and extents
 * and all the other fun semantics
 */
enum {
	LOG_WALK_PIN_ONLY,
	LOG_WALK_REPLAY_INODES,
	LOG_WALK_REPLAY_DIR_INDEX,
	LOG_WALK_REPLAY_ALL,
};

static int btrfs_log_inode(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root, struct btrfs_inode *inode,
			   int inode_only,
			   struct btrfs_log_ctx *ctx);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_path *path, u64 objectid);
static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct btrfs_root *log,
				       struct btrfs_path *path,
				       u64 dirid, int del_all);
static void wait_log_commit(struct btrfs_root *root, int transid);

/*
 * tree logging is a special write ahead log used to make sure that
 * fsyncs and O_SYNCs can happen without doing full tree commits.
 *
 * Full tree commits are expensive because they require commonly
 * modified blocks to be recowed, creating many dirty pages in the
 * extent tree and a 4x-6x higher write load than ext3.
 *
 * Instead of doing a tree commit on every fsync, we use the
 * key ranges and transaction ids to find items for a given file or directory
 * that have changed in this transaction.  Those items are copied into
 * a special tree (one per subvolume root), that tree is written to disk
 * and then the fsync is considered complete.
 *
 * After a crash, items are copied out of the log-tree back into the
 * subvolume tree.  Any file data extents found are recorded in the extent
 * allocation tree, and the log-tree freed.
 *
 * The log tree is read three times: once to pin down all the extents it is
 * using in ram, once to create all the inodes logged in the tree
 * and once to do all the other items.
 */

/*
 * start a sub transaction and setup the log tree
 * this increments the log tree writer count to make the people
 * syncing the tree wait for us to finish
 */
static int start_log_trans(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct btrfs_log_ctx *ctx)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *tree_root = fs_info->tree_root;
	const bool zoned = btrfs_is_zoned(fs_info);
	int ret = 0;
	bool created = false;

	/*
	 * First check if the log root tree was already created. If not, create
	 * it before locking the root's log_mutex, just to keep lockdep happy.
	 */
	if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state)) {
		mutex_lock(&tree_root->log_mutex);
		if (!fs_info->log_root_tree) {
			ret = btrfs_init_log_root_tree(trans, fs_info);
			if (!ret) {
				set_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state);
				created = true;
			}
		}
		mutex_unlock(&tree_root->log_mutex);
		if (ret)
			return ret;
	}

	mutex_lock(&root->log_mutex);

again:
	if (root->log_root) {
		int index = (root->log_transid + 1) % 2;

		if (btrfs_need_log_full_commit(trans)) {
			ret = -EAGAIN;
			goto out;
		}

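		/*
		 * Zoned mode: a commit of the previous log transaction may
		 * still be running; wait for it to finish so that the log
		 * writes remain sequential.
		 */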
		if (zoned && atomic_read(&root->log_commit[index])) {
			wait_log_commit(root, root->log_transid - 1);
			goto again;
		}

		if (!root->log_start_pid) {
			clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
			root->log_start_pid = current->pid;
		} else if (root->log_start_pid != current->pid) {
			set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
		}
	} else {
		/*
		 * This means fs_info->log_root_tree was already created
		 * for some other FS trees. Do a full commit so we do not mix
		 * nodes from multiple log transactions, since zoned mode
		 * requires sequential writing.
		 */
		if (zoned && !created) {
			ret = -EAGAIN;
			goto out;
		}

		ret = btrfs_add_log_tree(trans, root);
		if (ret)
			goto out;

		set_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
		clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
		root->log_start_pid = current->pid;
	}

	atomic_inc(&root->log_writers);
	if (ctx && !ctx->logging_new_name) {
		int index = root->log_transid % 2;
		list_add_tail(&ctx->list, &root->log_ctxs[index]);
		ctx->log_transid = root->log_transid;
	}

out:
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * returns 0 if there was a log transaction running and we were able
 * to join, or returns -ENOENT if there was no transaction
 * in progress
 */
static int join_running_log_trans(struct btrfs_root *root)
{
	const bool zoned = btrfs_is_zoned(root->fs_info);
	int ret = -ENOENT;

	if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state))
		return ret;

	mutex_lock(&root->log_mutex);
again:
	if (root->log_root) {
		int index = (root->log_transid + 1) % 2;

		ret = 0;
		if (zoned && atomic_read(&root->log_commit[index])) {
			wait_log_commit(root, root->log_transid - 1);
			goto again;
		}
		atomic_inc(&root->log_writers);
	}
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * This either makes the current running log transaction wait
 * until you call btrfs_end_log_trans() or it makes any future
 * log transactions wait until you call btrfs_end_log_trans()
 */
void btrfs_pin_log_trans(struct btrfs_root *root)
{
	atomic_inc(&root->log_writers);
}

/*
 * indicate we're done making changes to the log tree
 * and wake up anyone waiting to do a sync
 */
void btrfs_end_log_trans(struct btrfs_root *root)
{
	if (atomic_dec_and_test(&root->log_writers)) {
		/* atomic_dec_and_test implies a barrier */
		cond_wake_up_nomb(&root->log_writer_wait);
	}
}

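/*
 * Start writeback for the range covered by a single tree block; the matching
 * wait helper below is used together with this when flushing the log tree.
 */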
static int btrfs_write_tree_block(struct extent_buffer *buf)
{
	return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start,
					buf->start + buf->len - 1);
}

static void btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
{
	filemap_fdatawait_range(buf->pages[0]->mapping,
			        buf->start, buf->start + buf->len - 1);
}

/*
 * the walk control struct is used to pass state down the chain when
 * processing the log tree.  The stage field tells us which part
 * of the log tree processing we are currently doing.  The others
 * are state fields used for that specific part
 */
struct walk_control {
	/* should we free the extent on disk when done?  This is used
	 * at transaction commit time while freeing a log tree
	 */
	int free;

	/* should we write out the extent buffer?  This is used
	 * while flushing the log tree to disk during a sync
	 */
	int write;

	/* should we wait for the extent buffer io to finish?  Also used
	 * while flushing the log tree to disk for a sync
	 */
	int wait;

	/* pin only walk, we record which extents on disk belong to the
	 * log trees
	 */
	int pin;

	/* what stage of the replay code we're currently in */
	int stage;

	/*
	 * Ignore any items from the inode currently being processed. Needs
	 * to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in
	 * the LOG_WALK_REPLAY_INODES stage.
	 */
	bool ignore_cur_inode;

	/* the root we are currently replaying */
	struct btrfs_root *replay_dest;

	/* the trans handle for the current replay */
	struct btrfs_trans_handle *trans;

	/* the function that gets used to process blocks we find in the
	 * tree.  Note the extent_buffer might not be up to date when it is
	 * passed in, and it must be checked or read if you need the data
	 * inside it
	 */
	int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
			    struct walk_control *wc, u64 gen, int level);
};

/*
 * process_func used to pin down extents, write them or wait on them
 */
static int process_one_buffer(struct btrfs_root *log,
			      struct extent_buffer *eb,
			      struct walk_control *wc, u64 gen, int level)
{
	struct btrfs_fs_info *fs_info = log->fs_info;
	int ret = 0;

	/*
	 * If this fs is mixed then we need to be able to process the leaves to
	 * pin down any logged extents, so we have to read the block.
	 */
	if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
		ret = btrfs_read_buffer(eb, gen, level, NULL);
		if (ret)
			return ret;
	}

	if (wc->pin)
		ret = btrfs_pin_extent_for_log_replay(wc->trans, eb->start,
						      eb->len);

	if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
		if (wc->pin && btrfs_header_level(eb) == 0)
			ret = btrfs_exclude_logged_extents(eb);
		if (wc->write)
			btrfs_write_tree_block(eb);
		if (wc->wait)
			btrfs_wait_tree_block_writeback(eb);
	}
	return ret;
}

/*
 * Item overwrite used by replay and tree logging.  eb, slot and key all refer
 * to the src data we are copying out.
 *
 * root is the tree we are copying into, and path is a scratch
 * path for use in this function (it should be released on entry and
 * will be released on exit).
 *
 * If the key is already in the destination tree the existing item is
 * overwritten.  If the existing item isn't big enough, it is extended.
 * If it is too large, it is truncated.
 *
 * If the key isn't in the destination yet, a new item is inserted.
 */
static noinline int overwrite_item(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path,
				   struct extent_buffer *eb, int slot,
				   struct btrfs_key *key)
{
	int ret;
	u32 item_size;
	u64 saved_i_size = 0;
	int save_old_i_size = 0;
	unsigned long src_ptr;
	unsigned long dst_ptr;
	int overwrite_root = 0;
	bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;

	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
		overwrite_root = 1;

	item_size = btrfs_item_size_nr(eb, slot);
	src_ptr = btrfs_item_ptr_offset(eb, slot);

	/* look for the key in the destination tree */
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret < 0)
		return ret;

	if (ret == 0) {
		char *src_copy;
		char *dst_copy;
		u32 dst_size = btrfs_item_size_nr(path->nodes[0],
						  path->slots[0]);
		if (dst_size != item_size)
			goto insert;

		if (item_size == 0) {
			btrfs_release_path(path);
			return 0;
		}
		dst_copy = kmalloc(item_size, GFP_NOFS);
		src_copy = kmalloc(item_size, GFP_NOFS);
		if (!dst_copy || !src_copy) {
			btrfs_release_path(path);
			kfree(dst_copy);
			kfree(src_copy);
			return -ENOMEM;
		}

		read_extent_buffer(eb, src_copy, src_ptr, item_size);

		dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
				   item_size);
		ret = memcmp(dst_copy, src_copy, item_size);

		kfree(dst_copy);
		kfree(src_copy);
		/*
		 * they have the same contents, just return, this saves
		 * us from cowing blocks in the destination tree and doing
		 * extra writes that may not have been done by a previous
		 * sync
		 */
		if (ret == 0) {
			btrfs_release_path(path);
			return 0;
		}

		/*
		 * We need to load the old nbytes into the inode so when we
		 * replay the extents we've logged we get the right nbytes.
		 */
		if (inode_item) {
			struct btrfs_inode_item *item;
			u64 nbytes;
			u32 mode;

			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
					      struct btrfs_inode_item);
			nbytes = btrfs_inode_nbytes(path->nodes[0], item);
			item = btrfs_item_ptr(eb, slot,
					      struct btrfs_inode_item);
			btrfs_set_inode_nbytes(eb, item, nbytes);

			/*
			 * If this is a directory we need to reset the i_size to
			 * 0 so that we can set it up properly when replaying
			 * the rest of the items in this log.
			 */
			mode = btrfs_inode_mode(eb, item);
			if (S_ISDIR(mode))
				btrfs_set_inode_size(eb, item, 0);
		}
	} else if (inode_item) {
		struct btrfs_inode_item *item;
		u32 mode;

		/*
		 * New inode, set nbytes to 0 so that the nbytes comes out
		 * properly when we replay the extents.
		 */
		item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
		btrfs_set_inode_nbytes(eb, item, 0);

		/*
		 * If this is a directory we need to reset the i_size to 0 so
		 * that we can set it up properly when replaying the rest of
		 * the items in this log.
		 */
		mode = btrfs_inode_mode(eb, item);
		if (S_ISDIR(mode))
			btrfs_set_inode_size(eb, item, 0);
	}
insert:
	btrfs_release_path(path);
	/* try to insert the key into the destination tree */
	path->skip_release_on_error = 1;
	ret = btrfs_insert_empty_item(trans, root, path,
				      key, item_size);
	path->skip_release_on_error = 0;

	/* make sure any existing item is the correct size */
	if (ret == -EEXIST || ret == -EOVERFLOW) {
		u32 found_size;
		found_size = btrfs_item_size_nr(path->nodes[0],
						path->slots[0]);
		if (found_size > item_size)
			btrfs_truncate_item(path, item_size, 1);
		else if (found_size < item_size)
			btrfs_extend_item(path, item_size - found_size);
	} else if (ret) {
		return ret;
	}
	dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
					path->slots[0]);

	/* don't overwrite an existing inode if the generation number
	 * was logged as zero.  This is done when the tree logging code
	 * is just logging an inode to make sure it exists after recovery.
	 *
	 * Also, don't overwrite i_size on directories during replay.
	 * log replay inserts and removes directory items based on the
	 * state of the tree found in the subvolume, and i_size is modified
	 * as it goes
	 */
	if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
		struct btrfs_inode_item *src_item;
		struct btrfs_inode_item *dst_item;

		src_item = (struct btrfs_inode_item *)src_ptr;
		dst_item = (struct btrfs_inode_item *)dst_ptr;

		if (btrfs_inode_generation(eb, src_item) == 0) {
			struct extent_buffer *dst_eb = path->nodes[0];
			const u64 ino_size = btrfs_inode_size(eb, src_item);

			/*
			 * For regular files an ino_size == 0 is used only when
			 * logging that an inode exists, as part of a directory
			 * fsync, and the inode wasn't fsynced before. In this
			 * case don't set the size of the inode in the fs/subvol
			 * tree, otherwise we would be throwing valid data away.
			 */
			if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
			    S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
			    ino_size != 0)
				btrfs_set_inode_size(dst_eb, dst_item, ino_size);
			goto no_copy;
		}

		if (overwrite_root &&
		    S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
		    S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
			save_old_i_size = 1;
			saved_i_size = btrfs_inode_size(path->nodes[0],
							dst_item);
		}
	}

	copy_extent_buffer(path->nodes[0], eb, dst_ptr,
			   src_ptr, item_size);

	if (save_old_i_size) {
		struct btrfs_inode_item *dst_item;
		dst_item = (struct btrfs_inode_item *)dst_ptr;
		btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
	}

	/* make sure the generation is filled in */
	if (key->type == BTRFS_INODE_ITEM_KEY) {
		struct btrfs_inode_item *dst_item;
		dst_item = (struct btrfs_inode_item *)dst_ptr;
		if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
			btrfs_set_inode_generation(path->nodes[0], dst_item,
						   trans->transid);
		}
	}
no_copy:
	btrfs_mark_buffer_dirty(path->nodes[0]);
	btrfs_release_path(path);
	return 0;
}

/*
 * simple helper to read an inode off the disk from a given root
 * This can only be called for subvolume roots and not for the log
 */
static noinline struct inode *read_one_inode(struct btrfs_root *root,
					     u64 objectid)
{
	struct inode *inode;

	inode = btrfs_iget(root->fs_info->sb, objectid, root);
	if (IS_ERR(inode))
		inode = NULL;
	return inode;
}

/* replays a single extent in 'eb' at 'slot' with 'key' into the
 * subvolume 'root'.  path is released on entry and should be released
 * on exit.
 *
 * extents in the log tree have not been allocated out of the extent
 * tree yet.  So, this completes the allocation, taking a reference
 * as required if the extent already exists or creating a new extent
 * if it isn't in the extent allocation tree yet.
 *
 * The extent is inserted into the file, dropping any existing extents
 * from the file that overlap the new one.
 */
static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      struct extent_buffer *eb, int slot,
				      struct btrfs_key *key)
{
	struct btrfs_drop_extents_args drop_args = { 0 };
	struct btrfs_fs_info *fs_info = root->fs_info;
	int found_type;
	u64 extent_end;
	u64 start = key->offset;
	u64 nbytes = 0;
	struct btrfs_file_extent_item *item;
	struct inode *inode = NULL;
	unsigned long size;
	int ret = 0;

	item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
	found_type = btrfs_file_extent_type(eb, item);

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		nbytes = btrfs_file_extent_num_bytes(eb, item);
		extent_end = start + nbytes;

		/*
		 * We don't add to the inode's nbytes if we are prealloc or a
		 * hole.
		 */
		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
			nbytes = 0;
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		size = btrfs_file_extent_ram_bytes(eb, item);
		nbytes = btrfs_file_extent_ram_bytes(eb, item);
		extent_end = ALIGN(start + size,
				   fs_info->sectorsize);
	} else {
		ret = 0;
		goto out;
	}

	inode = read_one_inode(root, key->objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	/*
	 * first check to see if we already have this extent in the
	 * file.  This must be done before the btrfs_drop_extents run
	 * so we don't try to drop this extent.
	 */
	ret = btrfs_lookup_file_extent(trans, root, path,
			btrfs_ino(BTRFS_I(inode)), start, 0);

	if (ret == 0 &&
	    (found_type == BTRFS_FILE_EXTENT_REG ||
	     found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
		struct btrfs_file_extent_item cmp1;
		struct btrfs_file_extent_item cmp2;
		struct btrfs_file_extent_item *existing;
		struct extent_buffer *leaf;

		leaf = path->nodes[0];
		existing = btrfs_item_ptr(leaf, path->slots[0],
					  struct btrfs_file_extent_item);

		read_extent_buffer(eb, &cmp1, (unsigned long)item,
				   sizeof(cmp1));
		read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
				   sizeof(cmp2));

		/*
		 * we already have a pointer to this exact extent,
		 * we don't have to do anything
		 */
		if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
			btrfs_release_path(path);
			goto out;
		}
	}
	btrfs_release_path(path);

	/* drop any overlapping extents */
	drop_args.start = start;
	drop_args.end = extent_end;
	drop_args.drop_cache = true;
	ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args);
	if (ret)
		goto out;

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		u64 offset;
		unsigned long dest_offset;
		struct btrfs_key ins;

		if (btrfs_file_extent_disk_bytenr(eb, item) == 0 &&
		    btrfs_fs_incompat(fs_info, NO_HOLES))
			goto update_inode;

		ret = btrfs_insert_empty_item(trans, root, path, key,
					      sizeof(*item));
		if (ret)
			goto out;
		dest_offset = btrfs_item_ptr_offset(path->nodes[0],
						    path->slots[0]);
		copy_extent_buffer(path->nodes[0], eb, dest_offset,
				(unsigned long)item,  sizeof(*item));

		ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
		ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
		ins.type = BTRFS_EXTENT_ITEM_KEY;
		offset = key->offset - btrfs_file_extent_offset(eb, item);

		/*
		 * Manually record the dirty extent for qgroups: we did a
		 * shallow copy of the file extent item and skipped the normal
		 * backref update, modifying the extent tree ourselves, and
		 * the owner of the file extent changed from the log tree
		 * (which doesn't affect qgroups) to the fs/file tree (which
		 * does).
		 */
		ret = btrfs_qgroup_trace_extent(trans,
				btrfs_file_extent_disk_bytenr(eb, item),
				btrfs_file_extent_disk_num_bytes(eb, item),
				GFP_NOFS);
		if (ret < 0)
			goto out;

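		/*
		 * A disk bytenr of zero means a hole, so there is no extent
		 * to insert or reference in the extent allocation tree.
		 */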
		if (ins.objectid > 0) {
			struct btrfs_ref ref = { 0 };
			u64 csum_start;
			u64 csum_end;
			LIST_HEAD(ordered_sums);

			/*
			 * is this extent already allocated in the extent
			 * allocation tree?  If so, just add a reference
			 */
			ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
						ins.offset);
			if (ret == 0) {
				btrfs_init_generic_ref(&ref,
						BTRFS_ADD_DELAYED_REF,
						ins.objectid, ins.offset, 0);
				btrfs_init_data_ref(&ref,
						root->root_key.objectid,
						key->objectid, offset);
				ret = btrfs_inc_extent_ref(trans, &ref);
				if (ret)
					goto out;
			} else {
				/*
				 * insert the extent pointer in the extent
				 * allocation tree
				 */
				ret = btrfs_alloc_logged_file_extent(trans,
						root->root_key.objectid,
						key->objectid, offset, &ins);
				if (ret)
					goto out;
			}
			btrfs_release_path(path);

			if (btrfs_file_extent_compression(eb, item)) {
				csum_start = ins.objectid;
				csum_end = csum_start + ins.offset;
			} else {
				csum_start = ins.objectid +
					btrfs_file_extent_offset(eb, item);
				csum_end = csum_start +
					btrfs_file_extent_num_bytes(eb, item);
			}

			ret = btrfs_lookup_csums_range(root->log_root,
						csum_start, csum_end - 1,
						&ordered_sums, 0);
			if (ret)
				goto out;
			/*
			 * Now delete all existing csums in the csum root that
			 * cover our range. We do this because we can have an
			 * extent that is completely referenced by one file
			 * extent item and partially referenced by another
			 * file extent item (like after using the clone or
			 * extent_same ioctls). In this case if we end up doing
			 * the replay of the one that partially references the
			 * extent first, and we do not do the csum deletion
			 * below, we can get 2 csum items in the csum tree that
			 * overlap each other. For example, imagine our log has
			 * the two following file extent items:
			 *
			 * key (257 EXTENT_DATA 409600)
			 *     extent data disk byte 12845056 nr 102400
			 *     extent data offset 20480 nr 20480 ram 102400
			 *
			 * key (257 EXTENT_DATA 819200)
			 *     extent data disk byte 12845056 nr 102400
			 *     extent data offset 0 nr 102400 ram 102400
			 *
			 * Where the second one fully references the 100K extent
			 * that starts at disk byte 12845056, and the log tree
			 * has a single csum item that covers the entire range
			 * of the extent:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
			 *
			 * After the first file extent item is replayed, the
			 * csum tree gets the following csum item:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
			 *
			 * Which covers the 20K sub-range starting at offset 20K
			 * of our extent. Now when we replay the second file
			 * extent item, if we do not delete existing csum items
			 * that cover any of its blocks, we end up getting two
			 * csum items in our csum tree that overlap each other:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
			 *
			 * Which is a problem, because after this anyone trying
			 * to look up the checksum of any block of our extent
			 * starting at an offset of 40K or higher, will end up
			 * looking at the second csum item only, which does not
			 * contain the checksum for any block starting at
			 * offset 40K or higher of our extent.
			 */
			while (!list_empty(&ordered_sums)) {
				struct btrfs_ordered_sum *sums;
				sums = list_entry(ordered_sums.next,
						struct btrfs_ordered_sum,
						list);
				if (!ret)
					ret = btrfs_del_csums(trans,
							      fs_info->csum_root,
							      sums->bytenr,
							      sums->len);
				if (!ret)
					ret = btrfs_csum_file_blocks(trans,
						fs_info->csum_root, sums);
				list_del(&sums->list);
				kfree(sums);
			}
			if (ret)
				goto out;
		} else {
			btrfs_release_path(path);
		}
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		/* inline extents are easy, we just overwrite them */
		ret = overwrite_item(trans, root, path, eb, slot, key);
		if (ret)
			goto out;
	}

	ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start,
						extent_end - start);
	if (ret)
		goto out;

update_inode:
	btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found);
	ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
out:
	if (inode)
		iput(inode);
	return ret;
}

/*
 * when cleaning up conflicts between the directory names in the
 * subvolume, directory names in the log and directory names in the
 * inode back references, we may have to unlink inodes from directories.
 *
 * This is a helper function to do the unlink of a specific directory
 * item
 */
static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      struct btrfs_inode *dir,
				      struct btrfs_dir_item *di)
{
	struct inode *inode;
	char *name;
	int name_len;
	struct extent_buffer *leaf;
	struct btrfs_key location;
	int ret;

	leaf = path->nodes[0];

	btrfs_dir_item_key_to_cpu(leaf, di, &location);
	name_len = btrfs_dir_name_len(leaf, di);
	name = kmalloc(name_len, GFP_NOFS);
	if (!name)
		return -ENOMEM;

	read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
	btrfs_release_path(path);

	inode = read_one_inode(root, location.objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	ret = link_to_fixup_dir(trans, root, path, location.objectid);
	if (ret)
		goto out;

	ret = btrfs_unlink_inode(trans, root, dir, BTRFS_I(inode), name,
			name_len);
	if (ret)
		goto out;
	else
		ret = btrfs_run_delayed_items(trans);
out:
	kfree(name);
	iput(inode);
	return ret;
}

/*
 * helper function to see if a given name and sequence number found
 * in an inode back reference are already in a directory and correctly
 * point to this inode
 */
static noinline int inode_in_dir(struct btrfs_root *root,
				 struct btrfs_path *path,
				 u64 dirid, u64 objectid, u64 index,
				 const char *name, int name_len)
{
	struct btrfs_dir_item *di;
	struct btrfs_key location;
	int match = 0;

	di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
					 index, name, name_len, 0);
	if (di && !IS_ERR(di)) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		if (location.objectid != objectid)
			goto out;
	} else
		goto out;
	btrfs_release_path(path);

	di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
	if (di && !IS_ERR(di)) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		if (location.objectid != objectid)
			goto out;
	} else
		goto out;
	match = 1;
out:
	btrfs_release_path(path);
	return match;
}

/*
 * helper function to check a log tree for a named back reference in
 * an inode.  This is used to decide if a back reference that is
 * found in the subvolume conflicts with what we find in the log.
 *
 * inode backreferences may have multiple refs in a single item,
 * during replay we process one reference at a time, and we don't
 * want to delete valid links to a file from the subvolume if that
 * link is also in the log.
 */
static noinline int backref_in_log(struct btrfs_root *log,
				   struct btrfs_key *key,
				   u64 ref_objectid,
				   const char *name, int namelen)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
	if (ret < 0) {
		goto out;
	} else if (ret == 1) {
		ret = 0;
		goto out;
	}

	if (key->type == BTRFS_INODE_EXTREF_KEY)
		ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
						       path->slots[0],
						       ref_objectid,
						       name, namelen);
	else
		ret = !!btrfs_find_name_in_backref(path->nodes[0],
						   path->slots[0],
						   name, namelen);
out:
	btrfs_free_path(path);
	return ret;
}

static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_path *path,
				  struct btrfs_root *log_root,
				  struct btrfs_inode *dir,
				  struct btrfs_inode *inode,
				  u64 inode_objectid, u64 parent_objectid,
				  u64 ref_index, char *name, int namelen,
				  int *search_done)
{
	int ret;
	char *victim_name;
	int victim_name_len;
	struct extent_buffer *leaf;
	struct btrfs_dir_item *di;
	struct btrfs_key search_key;
	struct btrfs_inode_extref *extref;

again:
	/* Search old style refs */
	search_key.objectid = inode_objectid;
	search_key.type = BTRFS_INODE_REF_KEY;
	search_key.offset = parent_objectid;
	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
	if (ret == 0) {
		struct btrfs_inode_ref *victim_ref;
		unsigned long ptr;
		unsigned long ptr_end;

		leaf = path->nodes[0];

		/* are we trying to overwrite a back ref for the root directory
		 * if so, just jump out, we're done
		 */
		if (search_key.objectid == search_key.offset)
			return 1;

		/* check all the names in this back reference to see
		 * if they are in the log.  if so, we allow them to stay
		 * otherwise they must be unlinked as a conflict
		 */
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
		while (ptr < ptr_end) {
			victim_ref = (struct btrfs_inode_ref *)ptr;
			victim_name_len = btrfs_inode_ref_name_len(leaf,
								   victim_ref);
			victim_name = kmalloc(victim_name_len, GFP_NOFS);
			if (!victim_name)
				return -ENOMEM;

			read_extent_buffer(leaf, victim_name,
					   (unsigned long)(victim_ref + 1),
					   victim_name_len);

			ret = backref_in_log(log_root, &search_key,
					     parent_objectid, victim_name,
					     victim_name_len);
			if (ret < 0) {
				kfree(victim_name);
				return ret;
			} else if (!ret) {
				inc_nlink(&inode->vfs_inode);
				btrfs_release_path(path);

				ret = btrfs_unlink_inode(trans, root, dir, inode,
						victim_name, victim_name_len);
				kfree(victim_name);
				if (ret)
					return ret;
				ret = btrfs_run_delayed_items(trans);
				if (ret)
					return ret;
				*search_done = 1;
				goto again;
			}
			kfree(victim_name);

			ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
		}

		/*
		 * NOTE: we have searched the root tree and checked the
		 * corresponding ref, so we do not need to check it again.
		 */
		*search_done = 1;
	}
	btrfs_release_path(path);

	/* Same search but for extended refs */
	extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
					   inode_objectid, parent_objectid, 0,
					   0);
	if (!IS_ERR_OR_NULL(extref)) {
		u32 item_size;
		u32 cur_offset = 0;
		unsigned long base;
		struct inode *victim_parent;

		leaf = path->nodes[0];

		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		base = btrfs_item_ptr_offset(leaf, path->slots[0]);

		while (cur_offset < item_size) {
			extref = (struct btrfs_inode_extref *)(base + cur_offset);

			victim_name_len = btrfs_inode_extref_name_len(leaf, extref);

			if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
				goto next;

			victim_name = kmalloc(victim_name_len, GFP_NOFS);
			if (!victim_name)
				return -ENOMEM;
			read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
					   victim_name_len);

			search_key.objectid = inode_objectid;
			search_key.type = BTRFS_INODE_EXTREF_KEY;
			search_key.offset = btrfs_extref_hash(parent_objectid,
							      victim_name,
							      victim_name_len);
			ret = backref_in_log(log_root, &search_key,
					     parent_objectid, victim_name,
					     victim_name_len);
			if (ret < 0) {
				return ret;
			} else if (!ret) {
				ret = -ENOENT;
				victim_parent = read_one_inode(root,
						parent_objectid);
				if (victim_parent) {
					inc_nlink(&inode->vfs_inode);
					btrfs_release_path(path);

					ret = btrfs_unlink_inode(trans, root,
							BTRFS_I(victim_parent),
							inode,
							victim_name,
							victim_name_len);
					if (!ret)
						ret = btrfs_run_delayed_items(
								  trans);
				}
				iput(victim_parent);
				kfree(victim_name);
				if (ret)
					return ret;
				*search_done = 1;
				goto again;
			}
			kfree(victim_name);
next:
			cur_offset += victim_name_len + sizeof(*extref);
		}
		*search_done = 1;
	}
	btrfs_release_path(path);

	/* look for a conflicting sequence number */
	di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
					 ref_index, name, namelen, 0);
	if (di && !IS_ERR(di)) {
		ret = drop_one_dir_item(trans, root, path, dir, di);
		if (ret)
			return ret;
	}
	btrfs_release_path(path);

	/* look for a conflicting name */
	di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
				   name, namelen, 0);
	if (di && !IS_ERR(di)) {
		ret = drop_one_dir_item(trans, root, path, dir, di);
		if (ret)
			return ret;
	}
	btrfs_release_path(path);

	return 0;
}

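/*
 * Decode one entry of an extended ref item: copy out the name (the caller
 * must kfree() it) and optionally return the dir index and parent objectid.
 */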
static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
			     u32 *namelen, char **name, u64 *index,
			     u64 *parent_objectid)
{
	struct btrfs_inode_extref *extref;

	extref = (struct btrfs_inode_extref *)ref_ptr;

	*namelen = btrfs_inode_extref_name_len(eb, extref);
	*name = kmalloc(*namelen, GFP_NOFS);
	if (*name == NULL)
		return -ENOMEM;

	read_extent_buffer(eb, *name, (unsigned long)&extref->name,
			   *namelen);

	if (index)
		*index = btrfs_inode_extref_index(eb, extref);
	if (parent_objectid)
		*parent_objectid = btrfs_inode_extref_parent(eb, extref);

	return 0;
}

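/* Same as extref_get_fields() but for an old style INODE_REF entry. */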
static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
			  u32 *namelen, char **name, u64 *index)
{
	struct btrfs_inode_ref *ref;

	ref = (struct btrfs_inode_ref *)ref_ptr;

	*namelen = btrfs_inode_ref_name_len(eb, ref);
	*name = kmalloc(*namelen, GFP_NOFS);
	if (*name == NULL)
		return -ENOMEM;

	read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);

	if (index)
		*index = btrfs_inode_ref_index(eb, ref);

	return 0;
}

/*
 * Take an inode reference item from the log tree and iterate all names from the
 * inode reference item in the subvolume tree with the same key (if it exists).
 * For any name that is not in the inode reference item from the log tree, do a
 * proper unlink of that name (that is, remove its entry from the inode
 * reference item and both dir index keys).
 */
static int unlink_old_inode_refs(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct btrfs_inode *inode,
				 struct extent_buffer *log_eb,
				 int log_slot,
				 struct btrfs_key *key)
{
	int ret;
	unsigned long ref_ptr;
	unsigned long ref_end;
	struct extent_buffer *eb;

again:
	btrfs_release_path(path);
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret > 0) {
		ret = 0;
		goto out;
	}
	if (ret < 0)
		goto out;

	eb = path->nodes[0];
	ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
	ref_end = ref_ptr + btrfs_item_size_nr(eb, path->slots[0]);
	while (ref_ptr < ref_end) {
		char *name = NULL;
		int namelen;
		u64 parent_id;

		if (key->type == BTRFS_INODE_EXTREF_KEY) {
			ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
						NULL, &parent_id);
		} else {
			parent_id = key->offset;
			ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
					     NULL);
		}
		if (ret)
			goto out;

		if (key->type == BTRFS_INODE_EXTREF_KEY)
			ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot,
							       parent_id, name,
							       namelen);
		else
			ret = !!btrfs_find_name_in_backref(log_eb, log_slot,
							   name, namelen);

		if (!ret) {
			struct inode *dir;

			btrfs_release_path(path);
			dir = read_one_inode(root, parent_id);
			if (!dir) {
				ret = -ENOENT;
				kfree(name);
				goto out;
			}
			ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
						 inode, name, namelen);
			kfree(name);
			iput(dir);
			if (ret)
				goto out;
			goto again;
		}

		kfree(name);
		ref_ptr += namelen;
		if (key->type == BTRFS_INODE_EXTREF_KEY)
			ref_ptr += sizeof(struct btrfs_inode_extref);
		else
			ref_ptr += sizeof(struct btrfs_inode_ref);
	}
	ret = 0;
 out:
	btrfs_release_path(path);
	return ret;
}

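/*
 * Check whether the subvolume tree already has an inode ref (or extref) with
 * the given name for the given directory: returns 1 if found, 0 if not and
 * a negative errno on failure.
 */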
static int btrfs_inode_ref_exists(struct inode *inode, struct inode *dir,
				  const u8 ref_type, const char *name,
				  const int namelen)
{
	struct btrfs_key key;
	struct btrfs_path *path;
	const u64 parent_id = btrfs_ino(BTRFS_I(dir));
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = btrfs_ino(BTRFS_I(inode));
	key.type = ref_type;
	if (key.type == BTRFS_INODE_REF_KEY)
		key.offset = parent_id;
	else
		key.offset = btrfs_extref_hash(parent_id, name, namelen);

	ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = 0;
		goto out;
	}
	if (key.type == BTRFS_INODE_EXTREF_KEY)
		ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
				path->slots[0], parent_id, name, namelen);
	else
		ret = !!btrfs_find_name_in_backref(path->nodes[0], path->slots[0],
						   name, namelen);

out:
	btrfs_free_path(path);
	return ret;
}

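/*
 * Add a link for an inode during replay, after first unlinking any dentry
 * with the same name that points to a different inode (one from the log
 * that was not processed yet because it has a higher inode number).
 */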
static int add_link(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		    struct inode *dir, struct inode *inode, const char *name,
		    int namelen, u64 ref_index)
{
	struct btrfs_dir_item *dir_item;
	struct btrfs_key key;
	struct btrfs_path *path;
	struct inode *other_inode = NULL;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	dir_item = btrfs_lookup_dir_item(NULL, root, path,
					 btrfs_ino(BTRFS_I(dir)),
					 name, namelen, 0);
	if (!dir_item) {
		btrfs_release_path(path);
		goto add_link;
	} else if (IS_ERR(dir_item)) {
		ret = PTR_ERR(dir_item);
		goto out;
	}

	/*
	 * Our inode's dentry collides with the dentry of another inode which is
	 * in the log but not yet processed since it has a higher inode number.
	 * So delete that other dentry.
	 */
	btrfs_dir_item_key_to_cpu(path->nodes[0], dir_item, &key);
	btrfs_release_path(path);
	other_inode = read_one_inode(root, key.objectid);
	if (!other_inode) {
		ret = -ENOENT;
		goto out;
	}
	ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), BTRFS_I(other_inode),
				 name, namelen);
	if (ret)
		goto out;
	/*
	 * If we dropped the link count to 0, bump it so that later the iput()
	 * on the inode will not free it. We will fixup the link count later.
	 */
	if (other_inode->i_nlink == 0)
		inc_nlink(other_inode);

	ret = btrfs_run_delayed_items(trans);
	if (ret)
		goto out;
add_link:
	ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
			     name, namelen, 0, ref_index);
out:
	iput(other_inode);
	btrfs_free_path(path);

	return ret;
}

/*
 * replay one inode back reference item found in the log tree.
 * eb, slot and key refer to the buffer and key found in the log tree.
 * root is the destination we are replaying into, and path is for temp
 * use by this function.  (it should be released on return).
 */
static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_root *log,
				  struct btrfs_path *path,
				  struct extent_buffer *eb, int slot,
				  struct btrfs_key *key)
{
	struct inode *dir = NULL;
	struct inode *inode = NULL;
	unsigned long ref_ptr;
	unsigned long ref_end;
	char *name = NULL;
	int namelen;
	int ret;
	int search_done = 0;
	int log_ref_ver = 0;
	u64 parent_objectid;
	u64 inode_objectid;
	u64 ref_index = 0;
	int ref_struct_size;

	ref_ptr = btrfs_item_ptr_offset(eb, slot);
	ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);

	if (key->type == BTRFS_INODE_EXTREF_KEY) {
		struct btrfs_inode_extref *r;

		ref_struct_size = sizeof(struct btrfs_inode_extref);
		log_ref_ver = 1;
		r = (struct btrfs_inode_extref *)ref_ptr;
		parent_objectid = btrfs_inode_extref_parent(eb, r);
	} else {
		ref_struct_size = sizeof(struct btrfs_inode_ref);
		parent_objectid = key->offset;
	}
	inode_objectid = key->objectid;

	/*
	 * it is possible that we didn't log all the parent directories
	 * for a given inode.  If we don't find the dir, just don't
	 * copy the back ref in.  The link count fixup code will take
	 * care of the rest
	 */
	dir = read_one_inode(root, parent_objectid);
	if (!dir) {
		ret = -ENOENT;
		goto out;
	}

	inode = read_one_inode(root, inode_objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	while (ref_ptr < ref_end) {
		if (log_ref_ver) {
			ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
						&ref_index, &parent_objectid);
			/*
			 * parent object can change from one array
			 * item to another.
			 */
			if (!dir)
				dir = read_one_inode(root, parent_objectid);
			if (!dir) {
				ret = -ENOENT;
				goto out;
			}
		} else {
			ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
					     &ref_index);
		}
		if (ret)
			goto out;

		/* if we already have a perfect match, we're done */
		if (!inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
					btrfs_ino(BTRFS_I(inode)), ref_index,
					name, namelen)) {
			/*
			 * look for a conflicting back reference in the
			 * metadata. if we find one we have to unlink that name
			 * of the file before we add our new link.  Later on, we
			 * overwrite any existing back reference, and we don't
			 * want to create dangling pointers in the directory.
			 */

			if (!search_done) {
				ret = __add_inode_ref(trans, root, path, log,
						      BTRFS_I(dir),
						      BTRFS_I(inode),
						      inode_objectid,
						      parent_objectid,
						      ref_index, name, namelen,
						      &search_done);
				if (ret) {
					if (ret == 1)
						ret = 0;
					goto out;
				}
			}

			/*
			 * If a reference item already exists for this inode
			 * with the same parent and name, but different index,
			 * drop it and the corresponding directory index entries
			 * from the parent before adding the new reference item
			 * and dir index entries, otherwise we would fail with
			 * -EEXIST returned from btrfs_add_link() below.
			 */
			ret = btrfs_inode_ref_exists(inode, dir, key->type,
						     name, namelen);
			if (ret > 0) {
				ret = btrfs_unlink_inode(trans, root,
							 BTRFS_I(dir),
							 BTRFS_I(inode),
							 name, namelen);
				/*
				 * If we dropped the link count to 0, bump it so
				 * that later the iput() on the inode will not
				 * free it. We will fixup the link count later.
				 */
				if (!ret && inode->i_nlink == 0)
					inc_nlink(inode);
			}
			if (ret < 0)
				goto out;

			/* insert our name */
			ret = add_link(trans, root, dir, inode, name, namelen,
				       ref_index);
			if (ret)
				goto out;

			ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
			if (ret)
				goto out;
		}

		ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
		kfree(name);
		name = NULL;
		if (log_ref_ver) {
			iput(dir);
			dir = NULL;
		}
	}

	/*
	 * Before we overwrite the inode reference item in the subvolume tree
	 * with the item from the log tree, we must unlink all names from the
	 * parent directory that are in the subvolume's tree inode reference
	 * item, otherwise we end up with an inconsistent subvolume tree where
	 * dir index entries exist for a name but there is no inode reference
	 * item with the same name.
	 */
	ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb, slot,
				    key);
	if (ret)
		goto out;

	/* finally write the back reference in the inode */
	ret = overwrite_item(trans, root, path, eb, slot, key);
out:
	btrfs_release_path(path);
	kfree(name);
	iput(dir);
	iput(inode);
	return ret;
}

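/*
 * Count the names stored in extended ref items that point at an inode; used
 * by the link count fixup code below. Returns the count or a negative errno.
 */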
static int count_inode_extrefs(struct btrfs_root *root,
		struct btrfs_inode *inode, struct btrfs_path *path)
{
	int ret = 0;
	int name_len;
	unsigned int nlink = 0;
	u32 item_size;
	u32 cur_offset = 0;
	u64 inode_objectid = btrfs_ino(inode);
	u64 offset = 0;
	unsigned long ptr;
	struct btrfs_inode_extref *extref;
	struct extent_buffer *leaf;

	while (1) {
		ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
					    &extref, &offset);
		if (ret)
			break;

		leaf = path->nodes[0];
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		cur_offset = 0;

		while (cur_offset < item_size) {
			extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
			name_len = btrfs_inode_extref_name_len(leaf, extref);

			nlink++;

			cur_offset += name_len + sizeof(*extref);
		}

		offset++;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);

	if (ret < 0 && ret != -ENOENT)
		return ret;
	return nlink;
}

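/* Same as count_inode_extrefs() but counting old style INODE_REF names. */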
static int count_inode_refs(struct btrfs_root *root,
			struct btrfs_inode *inode, struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;
	unsigned int nlink = 0;
	unsigned long ptr;
	unsigned long ptr_end;
	int name_len;
	u64 ino = btrfs_ino(inode);

	key.objectid = ino;
	key.type = BTRFS_INODE_REF_KEY;
	key.offset = (u64)-1;

	while (1) {
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			break;
		if (ret > 0) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}
process_slot:
		btrfs_item_key_to_cpu(path->nodes[0], &key,
				      path->slots[0]);
		if (key.objectid != ino ||
		    key.type != BTRFS_INODE_REF_KEY)
			break;
		ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
						   path->slots[0]);
		while (ptr < ptr_end) {
			struct btrfs_inode_ref *ref;

			ref = (struct btrfs_inode_ref *)ptr;
			name_len = btrfs_inode_ref_name_len(path->nodes[0],
							    ref);
			ptr = (unsigned long)(ref + 1) + name_len;
			nlink++;
		}

		if (key.offset == 0)
			break;
		if (path->slots[0] > 0) {
			path->slots[0]--;
			goto process_slot;
		}
		key.offset--;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);

	return nlink;
}

/*
 * There are a few corners where the link count of the file can't
 * be properly maintained during replay.  So, instead of adding
 * lots of complexity to the log code, we just scan the backrefs
 * for any file that has been through replay.
 *
 * The scan will update the link count on the inode to reflect the
 * number of back refs found.  If it goes down to zero, the iput
 * will free the inode.
 */
static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct inode *inode)
{
	struct btrfs_path *path;
	int ret;
	u64 nlink = 0;
	u64 ino = btrfs_ino(BTRFS_I(inode));

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = count_inode_refs(root, BTRFS_I(inode), path);
	if (ret < 0)
		goto out;

	nlink = ret;

	ret = count_inode_extrefs(root, BTRFS_I(inode), path);
	if (ret < 0)
		goto out;

	nlink += ret;

	ret = 0;

	if (nlink != inode->i_nlink) {
		set_nlink(inode, nlink);
		ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
		if (ret)
			goto out;
	}
	BTRFS_I(inode)->index_cnt = (u64)-1;

	if (inode->i_nlink == 0) {
		if (S_ISDIR(inode->i_mode)) {
			ret = replay_dir_deletes(trans, root, NULL, path,
						 ino, 1);
			if (ret)
				goto out;
		}
		ret = btrfs_insert_orphan_item(trans, root, ino);
		if (ret == -EEXIST)
			ret = 0;
	}

out:
	btrfs_free_path(path);
	return ret;
}

static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
					    struct btrfs_root *root,
					    struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;
	struct inode *inode;

	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = (u64)-1;
	while (1) {
		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
		if (ret < 0)
			break;

		if (ret == 1) {
			ret = 0;
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}

		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
		    key.type != BTRFS_ORPHAN_ITEM_KEY)
			break;

		ret = btrfs_del_item(trans, root, path);
		if (ret)
			break;

		btrfs_release_path(path);
		inode = read_one_inode(root, key.offset);
		if (!inode) {
			ret = -EIO;
			break;
		}

		ret = fixup_inode_link_count(trans, root, inode);
		iput(inode);
		if (ret)
			break;

		/*
		 * fixup on a directory may create new entries,
		 * make sure we always look for the highest possible
		 * offset
		 */
		key.offset = (u64)-1;
	}
	btrfs_release_path(path);
	return ret;
}


/*
 * record a given inode in the fixup dir so we can check its link
 * count when replay is done.  The link count is incremented here
 * so the inode won't go away until we check it
 */
static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      u64 objectid)
{
	struct btrfs_key key;
	int ret = 0;
	struct inode *inode;

	inode = read_one_inode(root, objectid);
	if (!inode)
		return -EIO;

	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = objectid;

	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);

	btrfs_release_path(path);
	if (ret == 0) {
		if (!inode->i_nlink)
			set_nlink(inode, 1);
		else
			inc_nlink(inode);
		ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
	} else if (ret == -EEXIST) {
		ret = 0;
	}
	iput(inode);

	return ret;
}

1872/*
1873 * when replaying the log for a directory, we only insert names
1874 * for inodes that actually exist.  This means an fsync on a directory
1875 * does not implicitly fsync all the new files in it
1876 */
1877static noinline int insert_one_name(struct btrfs_trans_handle *trans,
1878				    struct btrfs_root *root,
1879				    u64 dirid, u64 index,
1880				    char *name, int name_len,
1881				    struct btrfs_key *location)
1882{
1883	struct inode *inode;
1884	struct inode *dir;
1885	int ret;
1886
1887	inode = read_one_inode(root, location->objectid);
1888	if (!inode)
1889		return -ENOENT;
1890
1891	dir = read_one_inode(root, dirid);
1892	if (!dir) {
1893		iput(inode);
1894		return -EIO;
1895	}
1896
1897	ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
1898			name_len, 1, index);
1899
1900	/* FIXME, put inode into FIXUP list */
1901
1902	iput(inode);
1903	iput(dir);
1904	return ret;
1905}
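
/*
 * Illustration only, not part of the original file: a minimal userspace
 * sketch of the rule described above, assuming it runs on a btrfs
 * mount.  An fsync on a directory makes the directory entries durable,
 * but the data of the files inside still needs its own fsync.  The
 * names used ("dir", "dir/new_file") are hypothetical.
 */
#if 0
#include <errno.h>
#include <fcntl.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
	int dirfd, fd;

	if (mkdir("dir", 0755) < 0 && errno != EEXIST)
		return 1;
	fd = open("dir/new_file", O_CREAT | O_WRONLY, 0644);
	if (fd < 0)
		return 1;
	if (write(fd, "data", 4) != 4)
		return 1;

	dirfd = open("dir", O_RDONLY | O_DIRECTORY);
	if (dirfd < 0)
		return 1;
	fsync(dirfd);	/* logs the new name, not the file contents */
	fsync(fd);	/* the file data needs its own fsync */

	close(dirfd);
	close(fd);
	return 0;
}
#endif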
1906
1907/*
1908 * take a single entry in a log directory item and replay it into
1909 * the subvolume.
1910 *
1911 * if a conflicting item exists in the subdirectory already,
1912 * the inode it points to is unlinked and put into the link count
1913 * fix up tree.
1914 *
1915 * If a name from the log points to a file or directory that does
1916 * not exist in the FS, it is skipped.  fsyncs on directories
1917 * do not force down inodes inside that directory, just changes to the
1918 * names or unlinks in a directory.
1919 *
1920 * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
1921 * non-existing inode) and 1 if the name was replayed.
1922 */
1923static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1924				    struct btrfs_root *root,
1925				    struct btrfs_path *path,
1926				    struct extent_buffer *eb,
1927				    struct btrfs_dir_item *di,
1928				    struct btrfs_key *key)
1929{
1930	char *name;
1931	int name_len;
1932	struct btrfs_dir_item *dst_di;
1933	struct btrfs_key found_key;
1934	struct btrfs_key log_key;
1935	struct inode *dir;
1936	u8 log_type;
1937	int exists;
1938	int ret = 0;
1939	bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
1940	bool name_added = false;
1941
1942	dir = read_one_inode(root, key->objectid);
1943	if (!dir)
1944		return -EIO;
1945
1946	name_len = btrfs_dir_name_len(eb, di);
1947	name = kmalloc(name_len, GFP_NOFS);
1948	if (!name) {
1949		ret = -ENOMEM;
1950		goto out;
1951	}
1952
1953	log_type = btrfs_dir_type(eb, di);
1954	read_extent_buffer(eb, name, (unsigned long)(di + 1),
1955		   name_len);
1956
1957	btrfs_dir_item_key_to_cpu(eb, di, &log_key);
1958	exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
1959	if (exists == 0)
1960		exists = 1;
1961	else
1962		exists = 0;
1963	btrfs_release_path(path);
1964
1965	if (key->type == BTRFS_DIR_ITEM_KEY) {
1966		dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
1967				       name, name_len, 1);
1968	} else if (key->type == BTRFS_DIR_INDEX_KEY) {
1969		dst_di = btrfs_lookup_dir_index_item(trans, root, path,
1970						     key->objectid,
1971						     key->offset, name,
1972						     name_len, 1);
1973	} else {
1974		/* Corruption */
1975		ret = -EINVAL;
1976		goto out;
1977	}
1978	if (IS_ERR_OR_NULL(dst_di)) {
1979		/* we need a sequence number to insert, so we only
1980		 * do inserts for the BTRFS_DIR_INDEX_KEY types
1981		 */
1982		if (key->type != BTRFS_DIR_INDEX_KEY)
1983			goto out;
1984		goto insert;
1985	}
1986
1987	btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
1988	/* the existing item matches the logged item */
1989	if (found_key.objectid == log_key.objectid &&
1990	    found_key.type == log_key.type &&
1991	    found_key.offset == log_key.offset &&
1992	    btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
1993		update_size = false;
1994		goto out;
1995	}
1996
1997	/*
1998	 * don't drop the conflicting directory entry if the inode
1999	 * for the new entry doesn't exist
2000	 */
2001	if (!exists)
2002		goto out;
2003
2004	ret = drop_one_dir_item(trans, root, path, BTRFS_I(dir), dst_di);
2005	if (ret)
2006		goto out;
2007
2008	if (key->type == BTRFS_DIR_INDEX_KEY)
2009		goto insert;
2010out:
2011	btrfs_release_path(path);
2012	if (!ret && update_size) {
2013		btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2);
2014		ret = btrfs_update_inode(trans, root, BTRFS_I(dir));
2015	}
2016	kfree(name);
2017	iput(dir);
2018	if (!ret && name_added)
2019		ret = 1;
2020	return ret;
2021
2022insert:
2023	/*
2024	 * Check if the inode reference exists in the log for the given name,
2025	 * inode and parent inode
2026	 */
2027	found_key.objectid = log_key.objectid;
2028	found_key.type = BTRFS_INODE_REF_KEY;
2029	found_key.offset = key->objectid;
2030	ret = backref_in_log(root->log_root, &found_key, 0, name, name_len);
2031	if (ret < 0) {
2032		goto out;
2033	} else if (ret) {
2034		/* The dentry will be added later. */
2035		ret = 0;
2036		update_size = false;
2037		goto out;
2038	}
2039
2040	found_key.objectid = log_key.objectid;
2041	found_key.type = BTRFS_INODE_EXTREF_KEY;
2042	found_key.offset = key->objectid;
2043	ret = backref_in_log(root->log_root, &found_key, key->objectid, name,
2044			     name_len);
2045	if (ret < 0) {
2046		goto out;
2047	} else if (ret) {
2048		/* The dentry will be added later. */
2049		ret = 0;
2050		update_size = false;
2051		goto out;
2052	}
2053	btrfs_release_path(path);
2054	ret = insert_one_name(trans, root, key->objectid, key->offset,
2055			      name, name_len, &log_key);
2056	if (ret && ret != -ENOENT && ret != -EEXIST)
2057		goto out;
2058	if (!ret)
2059		name_added = true;
2060	update_size = false;
2061	ret = 0;
2062	goto out;
2063}
2064
2065/*
2066 * find all the names in a directory item and reconcile them into
2067 * the subvolume.  Only BTRFS_DIR_ITEM_KEY types will have more than
2068 * one name in a directory item, but the same code gets used for
2069 * both directory index types
2070 */
2071static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
2072					struct btrfs_root *root,
2073					struct btrfs_path *path,
2074					struct extent_buffer *eb, int slot,
2075					struct btrfs_key *key)
2076{
2077	int ret = 0;
2078	u32 item_size = btrfs_item_size_nr(eb, slot);
2079	struct btrfs_dir_item *di;
2080	int name_len;
2081	unsigned long ptr;
2082	unsigned long ptr_end;
2083	struct btrfs_path *fixup_path = NULL;
2084
2085	ptr = btrfs_item_ptr_offset(eb, slot);
2086	ptr_end = ptr + item_size;
2087	while (ptr < ptr_end) {
2088		di = (struct btrfs_dir_item *)ptr;
2089		name_len = btrfs_dir_name_len(eb, di);
2090		ret = replay_one_name(trans, root, path, eb, di, key);
2091		if (ret < 0)
2092			break;
2093		ptr = (unsigned long)(di + 1);
2094		ptr += name_len;
2095
2096		/*
2097		 * If this entry refers to a non-directory (directories cannot
2098		 * have a link count > 1) and it was added in a transaction
2099		 * that was not committed, make sure we fix up the link count
2100		 * of the inode the entry points to. Otherwise something like
2101		 * the following would result in a directory pointing to an
2102		 * inode with a wrong link count that does not account for this
2103		 * dir entry:
2104		 *
2105		 * mkdir testdir
2106		 * touch testdir/foo
2107		 * touch testdir/bar
2108		 * sync
2109		 *
2110		 * ln testdir/bar testdir/bar_link
2111		 * ln testdir/foo testdir/foo_link
2112		 * xfs_io -c "fsync" testdir/bar
2113		 *
2114		 * <power failure>
2115		 *
2116		 * mount fs, log replay happens
2117		 *
2118		 * File foo would remain with a link count of 1 when it has two
2119		 * entries pointing to it in the directory testdir. This would
2120		 * make it impossible to ever delete the parent directory, as
2121		 * it would result in stale dentries that can never be deleted.
2122		 */
2123		if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) {
2124			struct btrfs_key di_key;
2125
2126			if (!fixup_path) {
2127				fixup_path = btrfs_alloc_path();
2128				if (!fixup_path) {
2129					ret = -ENOMEM;
2130					break;
2131				}
2132			}
2133
2134			btrfs_dir_item_key_to_cpu(eb, di, &di_key);
2135			ret = link_to_fixup_dir(trans, root, fixup_path,
2136						di_key.objectid);
2137			if (ret)
2138				break;
2139		}
2140		ret = 0;
2141	}
2142	btrfs_free_path(fixup_path);
2143	return ret;
2144}
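
/*
 * Illustration only: the scenario from the comment in
 * replay_one_dir_item() above, as a minimal userspace sketch (the
 * xfs_io call replaced by a plain fsync).  Paths are hypothetical and a
 * btrfs mount in the current directory is assumed.
 */
#if 0
#include <fcntl.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
	int fd;

	mkdir("testdir", 0755);
	close(open("testdir/foo", O_CREAT | O_WRONLY, 0644));
	close(open("testdir/bar", O_CREAT | O_WRONLY, 0644));
	sync();

	link("testdir/bar", "testdir/bar_link");
	link("testdir/foo", "testdir/foo_link");

	fd = open("testdir/bar", O_RDONLY);
	fsync(fd);	/* same effect as: xfs_io -c "fsync" testdir/bar */
	close(fd);

	/* A power failure here must be followed by a log replay that
	 * leaves testdir/foo with a link count of 2, which is what the
	 * link_to_fixup_dir() call above guarantees. */
	return 0;
}
#endif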
2145
2146/*
2147 * directory replay has two parts.  There are the standard directory
2148 * items in the log copied from the subvolume, and range items
2149 * created in the log while the subvolume was logged.
2150 *
2151 * The range items tell us which parts of the key space the log
2152 * is authoritative for.  During replay, if a key in the subvolume
2153 * directory is in a logged range item, but not actually in the log,
2154 * that means it was deleted from the directory before the fsync
2155 * and should be removed.
2156 */
2157static noinline int find_dir_range(struct btrfs_root *root,
2158				   struct btrfs_path *path,
2159				   u64 dirid, int key_type,
2160				   u64 *start_ret, u64 *end_ret)
2161{
2162	struct btrfs_key key;
2163	u64 found_end;
2164	struct btrfs_dir_log_item *item;
2165	int ret;
2166	int nritems;
2167
2168	if (*start_ret == (u64)-1)
2169		return 1;
2170
2171	key.objectid = dirid;
2172	key.type = key_type;
2173	key.offset = *start_ret;
2174
2175	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2176	if (ret < 0)
2177		goto out;
2178	if (ret > 0) {
2179		if (path->slots[0] == 0)
2180			goto out;
2181		path->slots[0]--;
2182	}
2183	if (ret != 0)
2184		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2185
2186	if (key.type != key_type || key.objectid != dirid) {
2187		ret = 1;
2188		goto next;
2189	}
2190	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2191			      struct btrfs_dir_log_item);
2192	found_end = btrfs_dir_log_end(path->nodes[0], item);
2193
2194	if (*start_ret >= key.offset && *start_ret <= found_end) {
2195		ret = 0;
2196		*start_ret = key.offset;
2197		*end_ret = found_end;
2198		goto out;
2199	}
2200	ret = 1;
2201next:
2202	/* check the next slot in the tree to see if it is a valid item */
2203	nritems = btrfs_header_nritems(path->nodes[0]);
2204	path->slots[0]++;
2205	if (path->slots[0] >= nritems) {
2206		ret = btrfs_next_leaf(root, path);
2207		if (ret)
2208			goto out;
2209	}
2210
2211	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2212
2213	if (key.type != key_type || key.objectid != dirid) {
2214		ret = 1;
2215		goto out;
2216	}
2217	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2218			      struct btrfs_dir_log_item);
2219	found_end = btrfs_dir_log_end(path->nodes[0], item);
2220	*start_ret = key.offset;
2221	*end_ret = found_end;
2222	ret = 0;
2223out:
2224	btrfs_release_path(path);
2225	return ret;
2226}
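
/*
 * Illustration only: a toy userspace model of the find_dir_range()
 * contract, using a plain array instead of a btree.  Given sorted,
 * non-overlapping [start, end] ranges, return the first range whose end
 * is at or past the requested start - the way replay_dir_deletes()
 * below iterates the authoritative ranges.  All names are hypothetical.
 */
#if 0
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct toy_range {
	uint64_t start;
	uint64_t end;
};

static bool toy_find_dir_range(const struct toy_range *r, int nr,
			       uint64_t *start, uint64_t *end)
{
	for (int i = 0; i < nr; i++) {
		if (r[i].end >= *start) {
			*start = r[i].start;
			*end = r[i].end;
			return true;
		}
	}
	return false;	/* mirrors find_dir_range() returning 1 */
}

int main(void)
{
	const struct toy_range log[] = { { 2, 5 }, { 9, UINT64_MAX } };
	uint64_t start = 0, end;

	while (toy_find_dir_range(log, 2, &start, &end)) {
		printf("authoritative for [%llu, %llu]\n",
		       (unsigned long long)start, (unsigned long long)end);
		if (end == UINT64_MAX)
			break;
		start = end + 1;
	}
	return 0;
}
#endif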
2227
2228/*
2229 * this looks for a given directory item in the log.  If the directory
2230 * item is not in the log, the item is removed and the inode it points
2231 * to is unlinked
2232 */
2233static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
2234				      struct btrfs_root *root,
2235				      struct btrfs_root *log,
2236				      struct btrfs_path *path,
2237				      struct btrfs_path *log_path,
2238				      struct inode *dir,
2239				      struct btrfs_key *dir_key)
2240{
2241	int ret;
2242	struct extent_buffer *eb;
2243	int slot;
2244	u32 item_size;
2245	struct btrfs_dir_item *di;
2246	struct btrfs_dir_item *log_di;
2247	int name_len;
2248	unsigned long ptr;
2249	unsigned long ptr_end;
2250	char *name;
2251	struct inode *inode;
2252	struct btrfs_key location;
2253
2254again:
2255	eb = path->nodes[0];
2256	slot = path->slots[0];
2257	item_size = btrfs_item_size_nr(eb, slot);
2258	ptr = btrfs_item_ptr_offset(eb, slot);
2259	ptr_end = ptr + item_size;
2260	while (ptr < ptr_end) {
2261		di = (struct btrfs_dir_item *)ptr;
2262		name_len = btrfs_dir_name_len(eb, di);
2263		name = kmalloc(name_len, GFP_NOFS);
2264		if (!name) {
2265			ret = -ENOMEM;
2266			goto out;
2267		}
2268		read_extent_buffer(eb, name, (unsigned long)(di + 1),
2269				  name_len);
2270		log_di = NULL;
2271		if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
2272			log_di = btrfs_lookup_dir_item(trans, log, log_path,
2273						       dir_key->objectid,
2274						       name, name_len, 0);
2275		} else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
2276			log_di = btrfs_lookup_dir_index_item(trans, log,
2277						     log_path,
2278						     dir_key->objectid,
2279						     dir_key->offset,
2280						     name, name_len, 0);
2281		}
2282		if (!log_di || log_di == ERR_PTR(-ENOENT)) {
2283			btrfs_dir_item_key_to_cpu(eb, di, &location);
2284			btrfs_release_path(path);
2285			btrfs_release_path(log_path);
2286			inode = read_one_inode(root, location.objectid);
2287			if (!inode) {
2288				kfree(name);
2289				return -EIO;
2290			}
2291
2292			ret = link_to_fixup_dir(trans, root,
2293						path, location.objectid);
2294			if (ret) {
2295				kfree(name);
2296				iput(inode);
2297				goto out;
2298			}
2299
2300			inc_nlink(inode);
2301			ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
2302					BTRFS_I(inode), name, name_len);
2303			if (!ret)
2304				ret = btrfs_run_delayed_items(trans);
2305			kfree(name);
2306			iput(inode);
2307			if (ret)
2308				goto out;
2309
2310			/* there might still be more names under this key,
2311			 * check and repeat if required
2312			 */
2313			ret = btrfs_search_slot(NULL, root, dir_key, path,
2314						0, 0);
2315			if (ret == 0)
2316				goto again;
2317			ret = 0;
2318			goto out;
2319		} else if (IS_ERR(log_di)) {
2320			kfree(name);
2321			return PTR_ERR(log_di);
2322		}
2323		btrfs_release_path(log_path);
2324		kfree(name);
2325
2326		ptr = (unsigned long)(di + 1);
2327		ptr += name_len;
2328	}
2329	ret = 0;
2330out:
2331	btrfs_release_path(path);
2332	btrfs_release_path(log_path);
2333	return ret;
2334}
2335
2336static int replay_xattr_deletes(struct btrfs_trans_handle *trans,
2337			      struct btrfs_root *root,
2338			      struct btrfs_root *log,
2339			      struct btrfs_path *path,
2340			      const u64 ino)
2341{
2342	struct btrfs_key search_key;
2343	struct btrfs_path *log_path;
2344	int i;
2345	int nritems;
2346	int ret;
2347
2348	log_path = btrfs_alloc_path();
2349	if (!log_path)
2350		return -ENOMEM;
2351
2352	search_key.objectid = ino;
2353	search_key.type = BTRFS_XATTR_ITEM_KEY;
2354	search_key.offset = 0;
2355again:
2356	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
2357	if (ret < 0)
2358		goto out;
2359process_leaf:
2360	nritems = btrfs_header_nritems(path->nodes[0]);
2361	for (i = path->slots[0]; i < nritems; i++) {
2362		struct btrfs_key key;
2363		struct btrfs_dir_item *di;
2364		struct btrfs_dir_item *log_di;
2365		u32 total_size;
2366		u32 cur;
2367
2368		btrfs_item_key_to_cpu(path->nodes[0], &key, i);
2369		if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) {
2370			ret = 0;
2371			goto out;
2372		}
2373
2374		di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item);
2375		total_size = btrfs_item_size_nr(path->nodes[0], i);
2376		cur = 0;
2377		while (cur < total_size) {
2378			u16 name_len = btrfs_dir_name_len(path->nodes[0], di);
2379			u16 data_len = btrfs_dir_data_len(path->nodes[0], di);
2380			u32 this_len = sizeof(*di) + name_len + data_len;
2381			char *name;
2382
2383			name = kmalloc(name_len, GFP_NOFS);
2384			if (!name) {
2385				ret = -ENOMEM;
2386				goto out;
2387			}
2388			read_extent_buffer(path->nodes[0], name,
2389					   (unsigned long)(di + 1), name_len);
2390
2391			log_di = btrfs_lookup_xattr(NULL, log, log_path, ino,
2392						    name, name_len, 0);
2393			btrfs_release_path(log_path);
2394			if (!log_di) {
2395				/* Doesn't exist in log tree, so delete it. */
2396				btrfs_release_path(path);
2397				di = btrfs_lookup_xattr(trans, root, path, ino,
2398							name, name_len, -1);
2399				kfree(name);
2400				if (IS_ERR(di)) {
2401					ret = PTR_ERR(di);
2402					goto out;
2403				}
2404				ASSERT(di);
2405				ret = btrfs_delete_one_dir_name(trans, root,
2406								path, di);
2407				if (ret)
2408					goto out;
2409				btrfs_release_path(path);
2410				search_key = key;
2411				goto again;
2412			}
2413			kfree(name);
2414			if (IS_ERR(log_di)) {
2415				ret = PTR_ERR(log_di);
2416				goto out;
2417			}
2418			cur += this_len;
2419			di = (struct btrfs_dir_item *)((char *)di + this_len);
2420		}
2421	}
2422	ret = btrfs_next_leaf(root, path);
2423	if (ret > 0)
2424		ret = 0;
2425	else if (ret == 0)
2426		goto process_leaf;
2427out:
2428	btrfs_free_path(log_path);
2429	btrfs_release_path(path);
2430	return ret;
2431}
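
/*
 * Illustration only: the core of replay_xattr_deletes() reduced to a
 * toy userspace model.  Any xattr name present in the subvolume but
 * absent from the log was deleted before the fsync and must be
 * dropped; string arrays stand in for the two trees.
 */
#if 0
#include <stdio.h>
#include <string.h>

static int toy_in_log(const char *name, const char **log, int nr)
{
	for (int i = 0; i < nr; i++)
		if (strcmp(name, log[i]) == 0)
			return 1;
	return 0;
}

int main(void)
{
	const char *subvol[] = { "user.a", "user.b", "user.c" };
	const char *log[] = { "user.a", "user.c" };

	for (int i = 0; i < 3; i++)
		if (!toy_in_log(subvol[i], log, 2))
			printf("delete xattr %s\n", subvol[i]);	/* user.b */
	return 0;
}
#endif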
2432
2433
2434/*
2435 * deletion replay happens before we copy any new directory items
2436 * out of the log or out of backreferences from inodes.  It
2437 * scans the log to find ranges of keys that the log is authoritative for,
2438 * and then scans the directory to find items in those ranges that are
2439 * not present in the log.
2440 *
2441 * Anything we don't find in the log is unlinked and removed from the
2442 * directory.
2443 */
2444static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
2445				       struct btrfs_root *root,
2446				       struct btrfs_root *log,
2447				       struct btrfs_path *path,
2448				       u64 dirid, int del_all)
2449{
2450	u64 range_start;
2451	u64 range_end;
2452	int key_type = BTRFS_DIR_LOG_ITEM_KEY;
2453	int ret = 0;
2454	struct btrfs_key dir_key;
2455	struct btrfs_key found_key;
2456	struct btrfs_path *log_path;
2457	struct inode *dir;
2458
2459	dir_key.objectid = dirid;
2460	dir_key.type = BTRFS_DIR_ITEM_KEY;
2461	log_path = btrfs_alloc_path();
2462	if (!log_path)
2463		return -ENOMEM;
2464
2465	dir = read_one_inode(root, dirid);
2466	/* it isn't an error if the inode isn't there, that can happen
2467	 * because we replay the deletes before we copy in the inode item
2468	 * from the log
2469	 */
2470	if (!dir) {
2471		btrfs_free_path(log_path);
2472		return 0;
2473	}
2474again:
2475	range_start = 0;
2476	range_end = 0;
2477	while (1) {
2478		if (del_all)
2479			range_end = (u64)-1;
2480		else {
2481			ret = find_dir_range(log, path, dirid, key_type,
2482					     &range_start, &range_end);
2483			if (ret != 0)
2484				break;
2485		}
2486
2487		dir_key.offset = range_start;
2488		while (1) {
2489			int nritems;
2490			ret = btrfs_search_slot(NULL, root, &dir_key, path,
2491						0, 0);
2492			if (ret < 0)
2493				goto out;
2494
2495			nritems = btrfs_header_nritems(path->nodes[0]);
2496			if (path->slots[0] >= nritems) {
2497				ret = btrfs_next_leaf(root, path);
2498				if (ret == 1)
2499					break;
2500				else if (ret < 0)
2501					goto out;
2502			}
2503			btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2504					      path->slots[0]);
2505			if (found_key.objectid != dirid ||
2506			    found_key.type != dir_key.type)
2507				goto next_type;
2508
2509			if (found_key.offset > range_end)
2510				break;
2511
2512			ret = check_item_in_log(trans, root, log, path,
2513						log_path, dir,
2514						&found_key);
2515			if (ret)
2516				goto out;
2517			if (found_key.offset == (u64)-1)
2518				break;
2519			dir_key.offset = found_key.offset + 1;
2520		}
2521		btrfs_release_path(path);
2522		if (range_end == (u64)-1)
2523			break;
2524		range_start = range_end + 1;
2525	}
2526
2527next_type:
2528	ret = 0;
2529	if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
2530		key_type = BTRFS_DIR_LOG_INDEX_KEY;
2531		dir_key.type = BTRFS_DIR_INDEX_KEY;
2532		btrfs_release_path(path);
2533		goto again;
2534	}
2535out:
2536	btrfs_release_path(path);
2537	btrfs_free_path(log_path);
2538	iput(dir);
2539	return ret;
2540}
2541
2542/*
2543 * the process_func used to replay items from the log tree.  This
2544 * gets called in two different stages.  The first stage just looks
2545 * for inodes and makes sure they are all copied into the subvolume.
2546 *
2547 * The second stage copies all the other item types from the log into
2548 * the subvolume.  The two stage approach is slower, but gets rid of
2549 * lots of complexity around inodes referencing other inodes that exist
2550 * only in the log (references come from either directory items or inode
2551 * back refs).
2552 */
2553static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
2554			     struct walk_control *wc, u64 gen, int level)
2555{
2556	int nritems;
2557	struct btrfs_path *path;
2558	struct btrfs_root *root = wc->replay_dest;
2559	struct btrfs_key key;
2560	int i;
2561	int ret;
2562
2563	ret = btrfs_read_buffer(eb, gen, level, NULL);
2564	if (ret)
2565		return ret;
2566
2567	level = btrfs_header_level(eb);
2568
2569	if (level != 0)
2570		return 0;
2571
2572	path = btrfs_alloc_path();
2573	if (!path)
2574		return -ENOMEM;
2575
2576	nritems = btrfs_header_nritems(eb);
2577	for (i = 0; i < nritems; i++) {
2578		btrfs_item_key_to_cpu(eb, &key, i);
2579
2580		/* inode keys are done during the first stage */
2581		if (key.type == BTRFS_INODE_ITEM_KEY &&
2582		    wc->stage == LOG_WALK_REPLAY_INODES) {
2583			struct btrfs_inode_item *inode_item;
2584			u32 mode;
2585
2586			inode_item = btrfs_item_ptr(eb, i,
2587					    struct btrfs_inode_item);
2588			/*
2589			 * If we have a tmpfile (O_TMPFILE) that got fsync'ed
2590			 * and never got linked before the fsync, skip it, as
2591			 * replaying it is pointless since it would be deleted
2592			 * later. We skip logging tmpfiles, but it's always
2593			 * possible we are replaying a log created with a kernel
2594			 * that used to log tmpfiles.
2595			 */
2596			if (btrfs_inode_nlink(eb, inode_item) == 0) {
2597				wc->ignore_cur_inode = true;
2598				continue;
2599			} else {
2600				wc->ignore_cur_inode = false;
2601			}
2602			ret = replay_xattr_deletes(wc->trans, root, log,
2603						   path, key.objectid);
2604			if (ret)
2605				break;
2606			mode = btrfs_inode_mode(eb, inode_item);
2607			if (S_ISDIR(mode)) {
2608				ret = replay_dir_deletes(wc->trans,
2609					 root, log, path, key.objectid, 0);
2610				if (ret)
2611					break;
2612			}
2613			ret = overwrite_item(wc->trans, root, path,
2614					     eb, i, &key);
2615			if (ret)
2616				break;
2617
2618			/*
2619			 * Before replaying extents, truncate the inode to its
2620			 * size. We need to do it now and not after log replay
2621			 * because before an fsync we can have prealloc extents
2622			 * added beyond the inode's i_size. If we did it after,
2623			 * through orphan cleanup for example, we would drop
2624			 * those prealloc extents just after replaying them.
2625			 */
2626			if (S_ISREG(mode)) {
2627				struct btrfs_drop_extents_args drop_args = { 0 };
2628				struct inode *inode;
2629				u64 from;
2630
2631				inode = read_one_inode(root, key.objectid);
2632				if (!inode) {
2633					ret = -EIO;
2634					break;
2635				}
2636				from = ALIGN(i_size_read(inode),
2637					     root->fs_info->sectorsize);
2638				drop_args.start = from;
2639				drop_args.end = (u64)-1;
2640				drop_args.drop_cache = true;
2641				ret = btrfs_drop_extents(wc->trans, root,
2642							 BTRFS_I(inode),
2643							 &drop_args);
2644				if (!ret) {
2645					inode_sub_bytes(inode,
2646							drop_args.bytes_found);
2647					/* Update the inode's nbytes. */
2648					ret = btrfs_update_inode(wc->trans,
2649							root, BTRFS_I(inode));
2650				}
2651				iput(inode);
2652				if (ret)
2653					break;
2654			}
2655
2656			ret = link_to_fixup_dir(wc->trans, root,
2657						path, key.objectid);
2658			if (ret)
2659				break;
2660		}
2661
2662		if (wc->ignore_cur_inode)
2663			continue;
2664
2665		if (key.type == BTRFS_DIR_INDEX_KEY &&
2666		    wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
2667			ret = replay_one_dir_item(wc->trans, root, path,
2668						  eb, i, &key);
2669			if (ret)
2670				break;
2671		}
2672
2673		if (wc->stage < LOG_WALK_REPLAY_ALL)
2674			continue;
2675
2676		/* these keys are simply copied */
2677		if (key.type == BTRFS_XATTR_ITEM_KEY) {
2678			ret = overwrite_item(wc->trans, root, path,
2679					     eb, i, &key);
2680			if (ret)
2681				break;
2682		} else if (key.type == BTRFS_INODE_REF_KEY ||
2683			   key.type == BTRFS_INODE_EXTREF_KEY) {
2684			ret = add_inode_ref(wc->trans, root, log, path,
2685					    eb, i, &key);
2686			if (ret && ret != -ENOENT)
2687				break;
2688			ret = 0;
2689		} else if (key.type == BTRFS_EXTENT_DATA_KEY) {
2690			ret = replay_one_extent(wc->trans, root, path,
2691						eb, i, &key);
2692			if (ret)
2693				break;
2694		} else if (key.type == BTRFS_DIR_ITEM_KEY) {
2695			ret = replay_one_dir_item(wc->trans, root, path,
2696						  eb, i, &key);
2697			if (ret)
2698				break;
2699		}
2700	}
2701	btrfs_free_path(path);
2702	return ret;
2703}
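
/*
 * Illustration only: a userspace sketch of the nlink == 0 case skipped
 * above.  An O_TMPFILE inode that is fsynced but never linked is
 * pointless to replay; the linkat() below is what would make it
 * visible (and worth replaying).  Requires _GNU_SOURCE for O_TMPFILE;
 * the file name is hypothetical.
 */
#if 0
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char path[64];
	int fd;

	fd = open(".", O_TMPFILE | O_RDWR, 0600);
	if (fd < 0)
		return 1;
	if (write(fd, "data", 4) != 4)
		return 1;
	fsync(fd);	/* an fsync of an inode with nlink == 0 */

	/* Without this linkat() the inode above is exactly what the
	 * nlink == 0 check in replay_one_buffer() ignores on replay. */
	snprintf(path, sizeof(path), "/proc/self/fd/%d", fd);
	linkat(AT_FDCWD, path, AT_FDCWD, "now_visible", AT_SYMLINK_FOLLOW);

	close(fd);
	return 0;
}
#endif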
2704
2705/*
2706 * Correctly adjust the reserved bytes occupied by a log tree extent buffer
2707 */
2708static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start)
2709{
2710	struct btrfs_block_group *cache;
2711
2712	cache = btrfs_lookup_block_group(fs_info, start);
2713	if (!cache) {
2714		btrfs_err(fs_info, "unable to find block group for %llu", start);
2715		return;
2716	}
2717
2718	spin_lock(&cache->space_info->lock);
2719	spin_lock(&cache->lock);
2720	cache->reserved -= fs_info->nodesize;
2721	cache->space_info->bytes_reserved -= fs_info->nodesize;
2722	spin_unlock(&cache->lock);
2723	spin_unlock(&cache->space_info->lock);
2724
2725	btrfs_put_block_group(cache);
2726}
2727
2728static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
2729				   struct btrfs_root *root,
2730				   struct btrfs_path *path, int *level,
2731				   struct walk_control *wc)
2732{
2733	struct btrfs_fs_info *fs_info = root->fs_info;
2734	u64 bytenr;
2735	u64 ptr_gen;
2736	struct extent_buffer *next;
2737	struct extent_buffer *cur;
2738	u32 blocksize;
2739	int ret = 0;
2740
2741	while (*level > 0) {
2742		struct btrfs_key first_key;
2743
2744		cur = path->nodes[*level];
2745
2746		WARN_ON(btrfs_header_level(cur) != *level);
2747
2748		if (path->slots[*level] >=
2749		    btrfs_header_nritems(cur))
2750			break;
2751
2752		bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
2753		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
2754		btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]);
2755		blocksize = fs_info->nodesize;
2756
2757		next = btrfs_find_create_tree_block(fs_info, bytenr,
2758						    btrfs_header_owner(cur),
2759						    *level - 1);
2760		if (IS_ERR(next))
2761			return PTR_ERR(next);
2762
2763		if (*level == 1) {
2764			ret = wc->process_func(root, next, wc, ptr_gen,
2765					       *level - 1);
2766			if (ret) {
2767				free_extent_buffer(next);
2768				return ret;
2769			}
2770
2771			path->slots[*level]++;
2772			if (wc->free) {
2773				ret = btrfs_read_buffer(next, ptr_gen,
2774							*level - 1, &first_key);
2775				if (ret) {
2776					free_extent_buffer(next);
2777					return ret;
2778				}
2779
2780				if (trans) {
2781					btrfs_tree_lock(next);
2782					btrfs_clean_tree_block(next);
2783					btrfs_wait_tree_block_writeback(next);
2784					btrfs_tree_unlock(next);
2785					ret = btrfs_pin_reserved_extent(trans,
2786							bytenr, blocksize);
2787					if (ret) {
2788						free_extent_buffer(next);
2789						return ret;
2790					}
2791					btrfs_redirty_list_add(
2792						trans->transaction, next);
2793				} else {
2794					if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
2795						clear_extent_buffer_dirty(next);
2796					unaccount_log_buffer(fs_info, bytenr);
2797				}
2798			}
2799			free_extent_buffer(next);
2800			continue;
2801		}
2802		ret = btrfs_read_buffer(next, ptr_gen, *level - 1, &first_key);
2803		if (ret) {
2804			free_extent_buffer(next);
2805			return ret;
2806		}
2807
2808		if (path->nodes[*level-1])
2809			free_extent_buffer(path->nodes[*level-1]);
2810		path->nodes[*level-1] = next;
2811		*level = btrfs_header_level(next);
2812		path->slots[*level] = 0;
2813		cond_resched();
2814	}
2815	path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
2816
2817	cond_resched();
2818	return 0;
2819}
2820
2821static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
2822				 struct btrfs_root *root,
2823				 struct btrfs_path *path, int *level,
2824				 struct walk_control *wc)
2825{
2826	struct btrfs_fs_info *fs_info = root->fs_info;
2827	int i;
2828	int slot;
2829	int ret;
2830
2831	for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
2832		slot = path->slots[i];
2833		if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
2834			path->slots[i]++;
2835			*level = i;
2836			WARN_ON(*level == 0);
2837			return 0;
2838		} else {
2839			ret = wc->process_func(root, path->nodes[*level], wc,
2840				 btrfs_header_generation(path->nodes[*level]),
2841				 *level);
2842			if (ret)
2843				return ret;
2844
2845			if (wc->free) {
2846				struct extent_buffer *next;
2847
2848				next = path->nodes[*level];
2849
2850				if (trans) {
2851					btrfs_tree_lock(next);
2852					btrfs_clean_tree_block(next);
2853					btrfs_wait_tree_block_writeback(next);
2854					btrfs_tree_unlock(next);
2855					ret = btrfs_pin_reserved_extent(trans,
2856						     path->nodes[*level]->start,
2857						     path->nodes[*level]->len);
2858					if (ret)
2859						return ret;
2860				} else {
2861					if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
2862						clear_extent_buffer_dirty(next);
2863
2864					unaccount_log_buffer(fs_info,
2865						path->nodes[*level]->start);
2866				}
2867			}
2868			free_extent_buffer(path->nodes[*level]);
2869			path->nodes[*level] = NULL;
2870			*level = i + 1;
2871		}
2872	}
2873	return 1;
2874}
2875
2876/*
2877 * drop the reference count on the tree rooted at 'log'.  This traverses
2878 * the tree freeing any blocks that have a ref count of zero after being
2879 * decremented.
2880 */
2881static int walk_log_tree(struct btrfs_trans_handle *trans,
2882			 struct btrfs_root *log, struct walk_control *wc)
2883{
2884	struct btrfs_fs_info *fs_info = log->fs_info;
2885	int ret = 0;
2886	int wret;
2887	int level;
2888	struct btrfs_path *path;
2889	int orig_level;
2890
2891	path = btrfs_alloc_path();
2892	if (!path)
2893		return -ENOMEM;
2894
2895	level = btrfs_header_level(log->node);
2896	orig_level = level;
2897	path->nodes[level] = log->node;
2898	atomic_inc(&log->node->refs);
2899	path->slots[level] = 0;
2900
2901	while (1) {
2902		wret = walk_down_log_tree(trans, log, path, &level, wc);
2903		if (wret > 0)
2904			break;
2905		if (wret < 0) {
2906			ret = wret;
2907			goto out;
2908		}
2909
2910		wret = walk_up_log_tree(trans, log, path, &level, wc);
2911		if (wret > 0)
2912			break;
2913		if (wret < 0) {
2914			ret = wret;
2915			goto out;
2916		}
2917	}
2918
2919	/* was the root node processed? if not, catch it here */
2920	if (path->nodes[orig_level]) {
2921		ret = wc->process_func(log, path->nodes[orig_level], wc,
2922			 btrfs_header_generation(path->nodes[orig_level]),
2923			 orig_level);
2924		if (ret)
2925			goto out;
2926		if (wc->free) {
2927			struct extent_buffer *next;
2928
2929			next = path->nodes[orig_level];
2930
2931			if (trans) {
2932				btrfs_tree_lock(next);
2933				btrfs_clean_tree_block(next);
2934				btrfs_wait_tree_block_writeback(next);
2935				btrfs_tree_unlock(next);
2936				ret = btrfs_pin_reserved_extent(trans,
2937						next->start, next->len);
2938				if (ret)
2939					goto out;
2940			} else {
2941				if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
2942					clear_extent_buffer_dirty(next);
2943				unaccount_log_buffer(fs_info, next->start);
2944			}
2945		}
2946	}
2947
2948out:
2949	btrfs_free_path(path);
2950	return ret;
2951}
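
/*
 * Illustration only: the walk_down/walk_up pattern above, reduced to a
 * toy post-order walk with one node and one slot cursor per level, the
 * same role path->nodes[] and path->slots[] play.  Assumes, like the
 * btree code, that every child of a level-n node sits at level n - 1.
 * All names here are hypothetical.
 */
#if 0
#include <stdio.h>

#define TOY_MAX_LEVEL 8

struct toy_node {
	int level;			/* 0 means leaf */
	int nritems;
	struct toy_node *child[4];
};

static void toy_walk_log_tree(struct toy_node *root)
{
	struct toy_node *nodes[TOY_MAX_LEVEL] = { NULL };
	int slots[TOY_MAX_LEVEL] = { 0 };
	int level = root->level;

	nodes[level] = root;
	while (1) {
		/* walk down until a leaf is reached */
		while (level > 0 && slots[level] < nodes[level]->nritems) {
			struct toy_node *next =
				nodes[level]->child[slots[level]];

			slots[level]++;	/* advance the parent's cursor */
			level--;	/* children live one level below */
			nodes[level] = next;
			slots[level] = 0;
		}
		/* everything below this node is done: process it */
		printf("process node at level %d\n", level);
		if (nodes[level] == root)
			break;
		level++;	/* walk up to the parent */
	}
}

int main(void)
{
	struct toy_node leaves[2] = { { .level = 0 }, { .level = 0 } };
	struct toy_node root = {
		.level = 1, .nritems = 2,
		.child = { &leaves[0], &leaves[1] }
	};

	toy_walk_log_tree(&root);	/* prints: level 0, 0, then 1 */
	return 0;
}
#endif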
2952
2953/*
2954 * helper function to update the item for a given subvolume's log root
2955 * in the tree of log roots
2956 */
2957static int update_log_root(struct btrfs_trans_handle *trans,
2958			   struct btrfs_root *log,
2959			   struct btrfs_root_item *root_item)
2960{
2961	struct btrfs_fs_info *fs_info = log->fs_info;
2962	int ret;
2963
2964	if (log->log_transid == 1) {
2965		/* insert root item on the first sync */
2966		ret = btrfs_insert_root(trans, fs_info->log_root_tree,
2967				&log->root_key, root_item);
2968	} else {
2969		ret = btrfs_update_root(trans, fs_info->log_root_tree,
2970				&log->root_key, root_item);
2971	}
2972	return ret;
2973}
2974
2975static void wait_log_commit(struct btrfs_root *root, int transid)
2976{
2977	DEFINE_WAIT(wait);
2978	int index = transid % 2;
2979
2980	/*
2981	 * we only allow two pending log transactions at a time,
2982	 * so we know that if ours is more than 2 older than the
2983	 * current transaction, we're done
2984	 */
2985	for (;;) {
2986		prepare_to_wait(&root->log_commit_wait[index],
2987				&wait, TASK_UNINTERRUPTIBLE);
2988
2989		if (!(root->log_transid_committed < transid &&
2990		      atomic_read(&root->log_commit[index])))
2991			break;
2992
2993		mutex_unlock(&root->log_mutex);
2994		schedule();
2995		mutex_lock(&root->log_mutex);
2996	}
2997	finish_wait(&root->log_commit_wait[index], &wait);
2998}
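
/*
 * Illustration only: the open-coded waitqueue pattern above (drop the
 * mutex, sleep, retake it, recheck), written with the userspace
 * pthread analogue.  pthread_cond_wait() drops and reacquires the
 * mutex for us, so the loop collapses to the classic condvar idiom.
 * All names here are hypothetical userspace stand-ins.
 */
#if 0
#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t log_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t log_commit_done = PTHREAD_COND_INITIALIZER;
static int log_transid_committed = -1;
static bool log_commit_running;

static void toy_wait_log_commit(int transid)
{
	pthread_mutex_lock(&log_mutex);
	while (log_transid_committed < transid && log_commit_running)
		pthread_cond_wait(&log_commit_done, &log_mutex);
	pthread_mutex_unlock(&log_mutex);
}

/* The committer side: update state, then wake all waiters. */
static void toy_finish_log_commit(int transid)
{
	pthread_mutex_lock(&log_mutex);
	log_transid_committed = transid;
	log_commit_running = false;
	pthread_cond_broadcast(&log_commit_done);
	pthread_mutex_unlock(&log_mutex);
}
#endif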
2999
3000static void wait_for_writer(struct btrfs_root *root)
3001{
3002	DEFINE_WAIT(wait);
3003
3004	for (;;) {
3005		prepare_to_wait(&root->log_writer_wait, &wait,
3006				TASK_UNINTERRUPTIBLE);
3007		if (!atomic_read(&root->log_writers))
3008			break;
3009
3010		mutex_unlock(&root->log_mutex);
3011		schedule();
3012		mutex_lock(&root->log_mutex);
3013	}
3014	finish_wait(&root->log_writer_wait, &wait);
3015}
3016
3017static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
3018					struct btrfs_log_ctx *ctx)
3019{
3020	if (!ctx)
3021		return;
3022
3023	mutex_lock(&root->log_mutex);
3024	list_del_init(&ctx->list);
3025	mutex_unlock(&root->log_mutex);
3026}
3027
3028/*
3029 * Must be invoked with the log mutex held, or the caller must otherwise
3030 * ensure that no other task can access the list.
3031 */
3032static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
3033					     int index, int error)
3034{
3035	struct btrfs_log_ctx *ctx;
3036	struct btrfs_log_ctx *safe;
3037
3038	list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) {
3039		list_del_init(&ctx->list);
3040		ctx->log_ret = error;
3041	}
3042}
3043
3044/*
3045 * btrfs_sync_log sends a given tree log down to the disk and
3046 * updates the super blocks to record it.  When this call is done,
3047 * you know that any inodes previously logged are safely on disk only
3048 * if it returns 0.
3049 *
3050 * Any other return value means you need to call btrfs_commit_transaction.
3051 * Some of the edge cases for fsyncing directories that have had unlinks
3052 * or renames done in the past mean that sometimes the only safe
3053 * fsync is to commit the whole FS.  When btrfs_sync_log returns -EAGAIN,
3054 * that has happened.
3055 */
3056int btrfs_sync_log(struct btrfs_trans_handle *trans,
3057		   struct btrfs_root *root, struct btrfs_log_ctx *ctx)
3058{
3059	int index1;
3060	int index2;
3061	int mark;
3062	int ret;
3063	struct btrfs_fs_info *fs_info = root->fs_info;
3064	struct btrfs_root *log = root->log_root;
3065	struct btrfs_root *log_root_tree = fs_info->log_root_tree;
3066	struct btrfs_root_item new_root_item;
3067	int log_transid = 0;
3068	struct btrfs_log_ctx root_log_ctx;
3069	struct blk_plug plug;
3070	u64 log_root_start;
3071	u64 log_root_level;
3072
3073	mutex_lock(&root->log_mutex);
3074	log_transid = ctx->log_transid;
3075	if (root->log_transid_committed >= log_transid) {
3076		mutex_unlock(&root->log_mutex);
3077		return ctx->log_ret;
3078	}
3079
3080	index1 = log_transid % 2;
3081	if (atomic_read(&root->log_commit[index1])) {
3082		wait_log_commit(root, log_transid);
3083		mutex_unlock(&root->log_mutex);
3084		return ctx->log_ret;
3085	}
3086	ASSERT(log_transid == root->log_transid);
3087	atomic_set(&root->log_commit[index1], 1);
3088
3089	/* wait for previous tree log sync to complete */
3090	if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
3091		wait_log_commit(root, log_transid - 1);
3092
3093	while (1) {
3094		int batch = atomic_read(&root->log_batch);
3095		/* when we're on an ssd, just kick the log commit out */
3096		if (!btrfs_test_opt(fs_info, SSD) &&
3097		    test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) {
3098			mutex_unlock(&root->log_mutex);
3099			schedule_timeout_uninterruptible(1);
3100			mutex_lock(&root->log_mutex);
3101		}
3102		wait_for_writer(root);
3103		if (batch == atomic_read(&root->log_batch))
3104			break;
3105	}
3106
3107	/* bail out if we need to do a full commit */
3108	if (btrfs_need_log_full_commit(trans)) {
3109		ret = -EAGAIN;
3110		mutex_unlock(&root->log_mutex);
3111		goto out;
3112	}
3113
3114	if (log_transid % 2 == 0)
3115		mark = EXTENT_DIRTY;
3116	else
3117		mark = EXTENT_NEW;
3118
3119	/* we start IO on all the marked extents here, but we don't actually
3120	 * wait for them until later.
3121	 */
3122	blk_start_plug(&plug);
3123	ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark);
3124	/*
3125	 * -EAGAIN happens when someone, e.g., a concurrent transaction
3126	 *  commit, writes a dirty extent in this tree-log commit. This
3127	 *  concurrent write will create a hole in the range of written
3128	 *  extents, and we cannot proceed on a zoned filesystem, which
3129	 *  requires sequential writing. While we could bail out to a full
3130	 *  commit here, we continue instead, hoping that the concurrent
3131	 *  write fills the hole.
3132	 */
3133	if (ret == -EAGAIN && btrfs_is_zoned(fs_info))
3134		ret = 0;
3135	if (ret) {
3136		blk_finish_plug(&plug);
3137		btrfs_abort_transaction(trans, ret);
3138		btrfs_set_log_full_commit(trans);
3139		mutex_unlock(&root->log_mutex);
3140		goto out;
3141	}
3142
3143	/*
3144	 * We _must_ update under the root->log_mutex in order to make sure we
3145	 * have a consistent view of the log root we are trying to commit at
3146	 * this moment.
3147	 *
3148	 * We _must_ copy this into a local copy, because we are not holding the
3149	 * log_root_tree->log_mutex yet.  This is important because when we
3150	 * commit the log_root_tree we must have a consistent view of the
3151	 * log_root_tree when we update the super block to point at the
3152	 * log_root_tree bytenr.  If we update the log_root_tree here we'll race
3153	 * with the commit and possibly point at the new block which we may not
3154	 * have written out.
3155	 */
3156	btrfs_set_root_node(&log->root_item, log->node);
3157	memcpy(&new_root_item, &log->root_item, sizeof(new_root_item));
3158
3159	root->log_transid++;
3160	log->log_transid = root->log_transid;
3161	root->log_start_pid = 0;
3162	/*
3163	 * IO has been started; blocks of the log tree have the WRITTEN flag
3164	 * set in their headers. New modifications of the log will be written
3165	 * to new positions, so it is safe to allow log writers to go in.
3166	 */
3167	mutex_unlock(&root->log_mutex);
3168
3169	if (btrfs_is_zoned(fs_info)) {
3170		mutex_lock(&fs_info->tree_root->log_mutex);
3171		if (!log_root_tree->node) {
3172			ret = btrfs_alloc_log_tree_node(trans, log_root_tree);
3173			if (ret) {
3174				mutex_unlock(&fs_info->tree_root->log_mutex);
3175				goto out;
3176			}
3177		}
3178		mutex_unlock(&fs_info->tree_root->log_mutex);
3179	}
3180
3181	btrfs_init_log_ctx(&root_log_ctx, NULL);
3182
3183	mutex_lock(&log_root_tree->log_mutex);
3184
3185	index2 = log_root_tree->log_transid % 2;
3186	list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
3187	root_log_ctx.log_transid = log_root_tree->log_transid;
3188
3189	/*
3190	 * Now we are safe to update the log_root_tree because we're under the
3191	 * log_mutex, and we're a current writer so we're holding the commit
3192	 * open until we drop the log_mutex.
3193	 */
3194	ret = update_log_root(trans, log, &new_root_item);
3195	if (ret) {
3196		if (!list_empty(&root_log_ctx.list))
3197			list_del_init(&root_log_ctx.list);
3198
3199		blk_finish_plug(&plug);
3200		btrfs_set_log_full_commit(trans);
3201
3202		if (ret != -ENOSPC) {
3203			btrfs_abort_transaction(trans, ret);
3204			mutex_unlock(&log_root_tree->log_mutex);
3205			goto out;
3206		}
3207		btrfs_wait_tree_log_extents(log, mark);
3208		mutex_unlock(&log_root_tree->log_mutex);
3209		ret = -EAGAIN;
3210		goto out;
3211	}
3212
3213	if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
3214		blk_finish_plug(&plug);
3215		list_del_init(&root_log_ctx.list);
3216		mutex_unlock(&log_root_tree->log_mutex);
3217		ret = root_log_ctx.log_ret;
3218		goto out;
3219	}
3220
3221	index2 = root_log_ctx.log_transid % 2;
3222	if (atomic_read(&log_root_tree->log_commit[index2])) {
3223		blk_finish_plug(&plug);
3224		ret = btrfs_wait_tree_log_extents(log, mark);
3225		wait_log_commit(log_root_tree,
3226				root_log_ctx.log_transid);
3227		mutex_unlock(&log_root_tree->log_mutex);
3228		if (!ret)
3229			ret = root_log_ctx.log_ret;
3230		goto out;
3231	}
3232	ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
3233	atomic_set(&log_root_tree->log_commit[index2], 1);
3234
3235	if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
3236		wait_log_commit(log_root_tree,
3237				root_log_ctx.log_transid - 1);
3238	}
3239
3240	/*
3241	 * now that we've moved on to the tree of log tree roots,
3242	 * check the full commit flag again
3243	 */
3244	if (btrfs_need_log_full_commit(trans)) {
3245		blk_finish_plug(&plug);
3246		btrfs_wait_tree_log_extents(log, mark);
3247		mutex_unlock(&log_root_tree->log_mutex);
3248		ret = -EAGAIN;
3249		goto out_wake_log_root;
3250	}
3251
3252	ret = btrfs_write_marked_extents(fs_info,
3253					 &log_root_tree->dirty_log_pages,
3254					 EXTENT_DIRTY | EXTENT_NEW);
3255	blk_finish_plug(&plug);
3256	/*
3257	 * As described above, -EAGAIN indicates a hole in the extents. We
3258	 * cannot wait for these writeouts since waiting would cause a
3259	 * deadlock. Bail out to a full commit instead.
3260	 */
3261	if (ret == -EAGAIN && btrfs_is_zoned(fs_info)) {
3262		btrfs_set_log_full_commit(trans);
3263		btrfs_wait_tree_log_extents(log, mark);
3264		mutex_unlock(&log_root_tree->log_mutex);
3265		goto out_wake_log_root;
3266	} else if (ret) {
3267		btrfs_set_log_full_commit(trans);
3268		btrfs_abort_transaction(trans, ret);
3269		mutex_unlock(&log_root_tree->log_mutex);
3270		goto out_wake_log_root;
3271	}
3272	ret = btrfs_wait_tree_log_extents(log, mark);
3273	if (!ret)
3274		ret = btrfs_wait_tree_log_extents(log_root_tree,
3275						  EXTENT_NEW | EXTENT_DIRTY);
3276	if (ret) {
3277		btrfs_set_log_full_commit(trans);
3278		mutex_unlock(&log_root_tree->log_mutex);
3279		goto out_wake_log_root;
3280	}
3281
3282	log_root_start = log_root_tree->node->start;
3283	log_root_level = btrfs_header_level(log_root_tree->node);
3284	log_root_tree->log_transid++;
3285	mutex_unlock(&log_root_tree->log_mutex);
3286
3287	/*
3288	 * Here we are guaranteed that nobody is going to write the superblock
3289	 * for the current transaction before us and that neither we do write
3290	 * our superblock before the previous transaction finishes its commit
3291	 * and writes its superblock, because:
3292	 *
3293	 * 1) We are holding a handle on the current transaction, so nobody
3294	 *    can commit it until we release the handle;
3295	 *
3296	 * 2) Before writing our superblock we acquire the tree_log_mutex, so
3297	 *    if the previous transaction is still committing, and hasn't yet
3298	 *    written its superblock, we wait for it to do it, because a
3299	 *    transaction commit acquires the tree_log_mutex when the commit
3300	 *    begins and releases it only after writing its superblock.
3301	 */
3302	mutex_lock(&fs_info->tree_log_mutex);
3303
3304	/*
3305	 * The previous transaction writeout phase could have failed, and thus
3306	 * marked the fs in an error state.  We must not commit here, as we
3307	 * could have updated our generation in the super_for_commit and
3308	 * writing the super here would result in transid mismatches.  If there
3309	 * is an error here just bail.
3310	 */
3311	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
3312		ret = -EIO;
3313		btrfs_set_log_full_commit(trans);
3314		btrfs_abort_transaction(trans, ret);
3315		mutex_unlock(&fs_info->tree_log_mutex);
3316		goto out_wake_log_root;
3317	}
3318
3319	btrfs_set_super_log_root(fs_info->super_for_commit, log_root_start);
3320	btrfs_set_super_log_root_level(fs_info->super_for_commit, log_root_level);
3321	ret = write_all_supers(fs_info, 1);
3322	mutex_unlock(&fs_info->tree_log_mutex);
3323	if (ret) {
3324		btrfs_set_log_full_commit(trans);
3325		btrfs_abort_transaction(trans, ret);
3326		goto out_wake_log_root;
3327	}
3328
3329	/*
3330	 * We know there can only be one task here, since we have not yet set
3331	 * root->log_commit[index1] to 0 and any task attempting to sync the
3332	 * log must wait for the previous log transaction to commit if it's
3333	 * still in progress or wait for the current log transaction commit if
3334	 * someone else already started it. We use <= and not < because the
3335	 * first log transaction has an ID of 0.
3336	 */
3337	ASSERT(root->last_log_commit <= log_transid);
3338	root->last_log_commit = log_transid;
3339
3340out_wake_log_root:
3341	mutex_lock(&log_root_tree->log_mutex);
3342	btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);
3343
3344	log_root_tree->log_transid_committed++;
3345	atomic_set(&log_root_tree->log_commit[index2], 0);
3346	mutex_unlock(&log_root_tree->log_mutex);
3347
3348	/*
3349	 * The barrier before waitqueue_active (in cond_wake_up) is needed so
3350	 * all the updates above are seen by the woken threads. It might not be
3351	 * necessary, but proving that seems to be hard.
3352	 */
3353	cond_wake_up(&log_root_tree->log_commit_wait[index2]);
3354out:
3355	mutex_lock(&root->log_mutex);
3356	btrfs_remove_all_log_ctxs(root, index1, ret);
3357	root->log_transid_committed++;
3358	atomic_set(&root->log_commit[index1], 0);
3359	mutex_unlock(&root->log_mutex);
3360
3361	/*
3362	 * The barrier before waitqueue_active (in cond_wake_up) is needed so
3363	 * all the updates above are seen by the woken threads. It might not be
3364	 * necessary, but proving that seems to be hard.
3365	 */
3366	cond_wake_up(&root->log_commit_wait[index1]);
3367	return ret;
3368}
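
/*
 * Illustration only: how an fsync path consumes the contract described
 * above, schematically simplified from the tail of btrfs_sync_file().
 * A return of 0 means the log (and thus the inode) is safely on disk;
 * anything else, -EAGAIN included, means only a full transaction
 * commit is safe.  toy_fsync_tail() is a hypothetical name.
 */
#if 0
static int toy_fsync_tail(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct btrfs_log_ctx *ctx)
{
	int ret = btrfs_sync_log(trans, root, ctx);

	if (ret == 0)	/* log commit succeeded, no full commit needed */
		return btrfs_end_transaction(trans);

	/* -EAGAIN or an error: fall back to a full transaction commit */
	return btrfs_commit_transaction(trans);
}
#endif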
3369
3370static void free_log_tree(struct btrfs_trans_handle *trans,
3371			  struct btrfs_root *log)
3372{
3373	int ret;
3374	struct walk_control wc = {
3375		.free = 1,
3376		.process_func = process_one_buffer
3377	};
3378
3379	if (log->node) {
3380		ret = walk_log_tree(trans, log, &wc);
3381		if (ret) {
3382			if (trans)
3383				btrfs_abort_transaction(trans, ret);
3384			else
3385				btrfs_handle_fs_error(log->fs_info, ret, NULL);
3386		}
3387	}
3388
3389	clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1,
3390			  EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT);
3391	extent_io_tree_release(&log->log_csum_range);
3392
3393	if (trans && log->node)
3394		btrfs_redirty_list_add(trans->transaction, log->node);
3395	btrfs_put_root(log);
3396}
3397
3398/*
3399 * free all the extents used by the tree log.  This should be called
3400 * at commit time of the full transaction
3401 */
3402int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
3403{
3404	if (root->log_root) {
3405		free_log_tree(trans, root->log_root);
3406		root->log_root = NULL;
3407		clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
3408	}
3409	return 0;
3410}
3411
3412int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
3413			     struct btrfs_fs_info *fs_info)
3414{
3415	if (fs_info->log_root_tree) {
3416		free_log_tree(trans, fs_info->log_root_tree);
3417		fs_info->log_root_tree = NULL;
3418		clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &fs_info->tree_root->state);
3419	}
3420	return 0;
3421}
3422
3423/*
3424 * Check if an inode was logged in the current transaction. This may often
3425 * return some false positives, because logged_trans is an in memory only field,
3426 * not persisted anywhere. This is meant to be used in contexts where a false
3427 * positive has no functional consequences.
3428 */
3429static bool inode_logged(struct btrfs_trans_handle *trans,
3430			 struct btrfs_inode *inode)
3431{
3432	if (inode->logged_trans == trans->transid)
3433		return true;
3434
3435	/*
3436	 * The inode's logged_trans is always 0 when we load it (because it is
3437	 * not persisted in the inode item or elsewhere). So if it is 0, and the
3438	 * inode was last modified in the current transaction and has the
3439	 * full_sync flag set, then the inode may have been logged earlier in
3440	 * the current transaction and then evicted and loaded again - or it
3441	 * may never have been logged in the current transaction. Since we
3442	 * cannot be sure, we have to assume it was, otherwise our callers
3443	 * could leave an inconsistent log.
3444	 */
3445	if (inode->logged_trans == 0 &&
3446	    inode->last_trans == trans->transid &&
3447	    test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) &&
3448	    !test_bit(BTRFS_FS_LOG_RECOVERING, &trans->fs_info->flags))
3449		return true;
3450
3451	return false;
3452}
3453
3454/*
3455 * If both a file and directory are logged, and unlinks or renames are
3456 * mixed in, we have a few interesting corners:
3457 *
3458 * create file X in dir Y
3459 * link file X to X.link in dir Y
3460 * fsync file X
3461 * unlink file X but leave X.link
3462 * fsync dir Y
3463 *
3464 * After a crash we would expect only X.link to exist.  But file X
3465 * didn't get fsync'd again so the log has back refs for X and X.link.
3466 *
3467 * We solve this by removing directory entries and inode backrefs from the
3468 * log when a file that was logged in the current transaction is
3469 * unlinked.  Any later fsync will include the updated log entries, and
3470 * we'll be able to reconstruct the proper directory items from backrefs.
3471 *
3472 * This optimization allows us to avoid relogging the entire inode
3473 * or the entire directory.
3474 */
3475int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
3476				 struct btrfs_root *root,
3477				 const char *name, int name_len,
3478				 struct btrfs_inode *dir, u64 index)
3479{
3480	struct btrfs_root *log;
3481	struct btrfs_dir_item *di;
3482	struct btrfs_path *path;
3483	int ret;
3484	int err = 0;
3485	u64 dir_ino = btrfs_ino(dir);
3486
3487	if (!inode_logged(trans, dir))
3488		return 0;
3489
3490	ret = join_running_log_trans(root);
3491	if (ret)
3492		return 0;
3493
3494	mutex_lock(&dir->log_mutex);
3495
3496	log = root->log_root;
3497	path = btrfs_alloc_path();
3498	if (!path) {
3499		err = -ENOMEM;
3500		goto out_unlock;
3501	}
3502
3503	di = btrfs_lookup_dir_item(trans, log, path, dir_ino,
3504				   name, name_len, -1);
3505	if (IS_ERR(di)) {
3506		err = PTR_ERR(di);
3507		goto fail;
3508	}
3509	if (di) {
3510		ret = btrfs_delete_one_dir_name(trans, log, path, di);
3511		if (ret) {
3512			err = ret;
3513			goto fail;
3514		}
3515	}
3516	btrfs_release_path(path);
3517	di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
3518					 index, name, name_len, -1);
3519	if (IS_ERR(di)) {
3520		err = PTR_ERR(di);
3521		goto fail;
3522	}
3523	if (di) {
3524		ret = btrfs_delete_one_dir_name(trans, log, path, di);
3525		if (ret) {
3526			err = ret;
3527			goto fail;
3528		}
3529	}
3530
3531	/*
3532	 * We do not need to update the size field of the directory's inode item
3533	 * because on log replay we update the field to reflect all existing
3534	 * entries in the directory (see overwrite_item()).
3535	 */
3536fail:
3537	btrfs_free_path(path);
3538out_unlock:
3539	mutex_unlock(&dir->log_mutex);
3540	if (err == -ENOSPC) {
3541		btrfs_set_log_full_commit(trans);
3542		err = 0;
3543	} else if (err < 0 && err != -ENOENT) {
3544		/* ENOENT can be returned if the entry hasn't been fsynced yet */
3545		btrfs_abort_transaction(trans, err);
3546	}
3547
3548	btrfs_end_log_trans(root);
3549
3550	return err;
3551}
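
/*
 * Illustration only: the corner case from the comment above
 * btrfs_del_dir_entries_in_log(), as a userspace sketch.  After a
 * crash right after the final fsync, only X.link may exist; deleting
 * X's entries from the log at unlink time is what makes that true.
 * The names "Y", "X" and "X.link" are taken from that comment.
 */
#if 0
#include <fcntl.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
	int fd, dirfd;

	mkdir("Y", 0755);
	fd = open("Y/X", O_CREAT | O_WRONLY, 0644);
	link("Y/X", "Y/X.link");
	fsync(fd);			/* file X (and its refs) in the log */
	unlink("Y/X");			/* X's entries removed from log too */
	close(fd);

	dirfd = open("Y", O_RDONLY | O_DIRECTORY);
	fsync(dirfd);			/* crash after this: only X.link */
	close(dirfd);
	return 0;
}
#endif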
3552
3553/* see comments for btrfs_del_dir_entries_in_log */
3554int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
3555			       struct btrfs_root *root,
3556			       const char *name, int name_len,
3557			       struct btrfs_inode *inode, u64 dirid)
3558{
3559	struct btrfs_root *log;
3560	u64 index;
3561	int ret;
3562
3563	if (!inode_logged(trans, inode))
3564		return 0;
3565
3566	ret = join_running_log_trans(root);
3567	if (ret)
3568		return 0;
3569	log = root->log_root;
3570	mutex_lock(&inode->log_mutex);
3571
3572	ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode),
3573				  dirid, &index);
3574	mutex_unlock(&inode->log_mutex);
3575	if (ret == -ENOSPC) {
3576		btrfs_set_log_full_commit(trans);
3577		ret = 0;
3578	} else if (ret < 0 && ret != -ENOENT)
3579		btrfs_abort_transaction(trans, ret);
3580	btrfs_end_log_trans(root);
3581
3582	return ret;
3583}
3584
3585/*
3586 * creates a range item in the log for 'dirid'.  first_offset and
3587 * last_offset tell us which parts of the key space the log should
3588 * be considered authoritative for.
3589 */
3590static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
3591				       struct btrfs_root *log,
3592				       struct btrfs_path *path,
3593				       int key_type, u64 dirid,
3594				       u64 first_offset, u64 last_offset)
3595{
3596	int ret;
3597	struct btrfs_key key;
3598	struct btrfs_dir_log_item *item;
3599
3600	key.objectid = dirid;
3601	key.offset = first_offset;
3602	if (key_type == BTRFS_DIR_ITEM_KEY)
3603		key.type = BTRFS_DIR_LOG_ITEM_KEY;
3604	else
3605		key.type = BTRFS_DIR_LOG_INDEX_KEY;
3606	ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
3607	if (ret)
3608		return ret;
3609
3610	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3611			      struct btrfs_dir_log_item);
3612	btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
3613	btrfs_mark_buffer_dirty(path->nodes[0]);
3614	btrfs_release_path(path);
3615	return 0;
3616}
3617
3618/*
3619 * log all the items included in the current transaction for a given
3620 * directory.  This also creates the range items in the log tree required
3621 * to replay anything deleted before the fsync
3622 */
3623static noinline int log_dir_items(struct btrfs_trans_handle *trans,
3624			  struct btrfs_root *root, struct btrfs_inode *inode,
3625			  struct btrfs_path *path,
3626			  struct btrfs_path *dst_path, int key_type,
3627			  struct btrfs_log_ctx *ctx,
3628			  u64 min_offset, u64 *last_offset_ret)
3629{
3630	struct btrfs_key min_key;
3631	struct btrfs_root *log = root->log_root;
3632	struct extent_buffer *src;
3633	int err = 0;
3634	int ret;
3635	int i;
3636	int nritems;
3637	u64 first_offset = min_offset;
3638	u64 last_offset = (u64)-1;
3639	u64 ino = btrfs_ino(inode);
3640
3641	log = root->log_root;
3642
3643	min_key.objectid = ino;
3644	min_key.type = key_type;
3645	min_key.offset = min_offset;
3646
3647	ret = btrfs_search_forward(root, &min_key, path, trans->transid);
3648
3649	/*
3650	 * we didn't find anything from this transaction, see if there
3651	 * is anything at all
3652	 */
3653	if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) {
3654		min_key.objectid = ino;
3655		min_key.type = key_type;
3656		min_key.offset = (u64)-1;
3657		btrfs_release_path(path);
3658		ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
3659		if (ret < 0) {
3660			btrfs_release_path(path);
3661			return ret;
3662		}
3663		ret = btrfs_previous_item(root, path, ino, key_type);
3664
3665		/* if ret == 0 there are items for this type,
3666		 * create a range to tell us the last key of this type.
3667		 * otherwise, there are no items in this directory after
3668		 * *min_offset, and we create a range to indicate that.
3669		 */
3670		if (ret == 0) {
3671			struct btrfs_key tmp;
3672			btrfs_item_key_to_cpu(path->nodes[0], &tmp,
3673					      path->slots[0]);
3674			if (key_type == tmp.type)
3675				first_offset = max(min_offset, tmp.offset) + 1;
3676		}
3677		goto done;
3678	}
3679
3680	/* go backward to find any previous key */
3681	ret = btrfs_previous_item(root, path, ino, key_type);
3682	if (ret == 0) {
3683		struct btrfs_key tmp;
3684		btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
3685		if (key_type == tmp.type) {
3686			first_offset = tmp.offset;
3687			ret = overwrite_item(trans, log, dst_path,
3688					     path->nodes[0], path->slots[0],
3689					     &tmp);
3690			if (ret) {
3691				err = ret;
3692				goto done;
3693			}
3694		}
3695	}
3696	btrfs_release_path(path);
3697
3698	/*
3699	 * Find the first key from this transaction again.  See the note for
3700	 * log_new_dir_dentries: if we're logging a directory recursively, we
3701	 * won't be holding its i_mutex, which means the directory can be
3702	 * modified while we're logging it.  If we remove an entry between our first
3703	 * search and this search we'll not find the key again and can just
3704	 * bail.
3705	 */
3706search:
3707	ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
3708	if (ret != 0)
3709		goto done;
3710
3711	/*
3712	 * we have a block from this transaction, log every item in it
3713	 * from our directory
3714	 */
3715	while (1) {
3716		struct btrfs_key tmp;
3717		src = path->nodes[0];
3718		nritems = btrfs_header_nritems(src);
3719		for (i = path->slots[0]; i < nritems; i++) {
3720			struct btrfs_dir_item *di;
3721
3722			btrfs_item_key_to_cpu(src, &min_key, i);
3723
3724			if (min_key.objectid != ino || min_key.type != key_type)
3725				goto done;
3726
3727			if (need_resched()) {
3728				btrfs_release_path(path);
3729				cond_resched();
3730				goto search;
3731			}
3732
3733			ret = overwrite_item(trans, log, dst_path, src, i,
3734					     &min_key);
3735			if (ret) {
3736				err = ret;
3737				goto done;
3738			}
3739
3740			/*
3741			 * We must make sure that when we log a directory entry,
3742			 * the corresponding inode, after log replay, has a
3743			 * matching link count. For example:
3744			 *
3745			 * touch foo
3746			 * mkdir mydir
3747			 * sync
3748			 * ln foo mydir/bar
3749			 * xfs_io -c "fsync" mydir
3750			 * <crash>
3751			 * <mount fs and log replay>
3752			 *
3753			 * Would result in an fsync log that, when replayed, leaves
3754			 * our file inode with a link count of 1, yet with two
3755			 * directory entries pointing to the same inode.
3756			 * After removing one of the names, it would not be
3757			 * possible to remove the other name, which always
3758			 * resulted in stale file handle errors, and it would not
3759			 * be possible to rmdir the parent directory, since
3760			 * its i_size could never decrease to the value
3761			 * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors.
3762			 */
3763			di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
3764			btrfs_dir_item_key_to_cpu(src, di, &tmp);
3765			if (ctx &&
3766			    (btrfs_dir_transid(src, di) == trans->transid ||
3767			     btrfs_dir_type(src, di) == BTRFS_FT_DIR) &&
3768			    tmp.type != BTRFS_ROOT_ITEM_KEY)
3769				ctx->log_new_dentries = true;
3770		}
3771		path->slots[0] = nritems;
3772
3773		/*
3774		 * look ahead to the next item and see if it is also
3775		 * from this directory and from this transaction
3776		 */
3777		ret = btrfs_next_leaf(root, path);
3778		if (ret) {
3779			if (ret == 1)
3780				last_offset = (u64)-1;
3781			else
3782				err = ret;
3783			goto done;
3784		}
3785		btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
3786		if (tmp.objectid != ino || tmp.type != key_type) {
3787			last_offset = (u64)-1;
3788			goto done;
3789		}
3790		if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
3791			ret = overwrite_item(trans, log, dst_path,
3792					     path->nodes[0], path->slots[0],
3793					     &tmp);
3794			if (ret)
3795				err = ret;
3796			else
3797				last_offset = tmp.offset;
3798			goto done;
3799		}
3800	}
3801done:
3802	btrfs_release_path(path);
3803	btrfs_release_path(dst_path);
3804
3805	if (err == 0) {
3806		*last_offset_ret = last_offset;
3807		/*
3808		 * insert the log range keys to indicate where the log
3809		 * is valid
3810		 */
3811		ret = insert_dir_log_key(trans, log, path, key_type,
3812					 ino, first_offset, last_offset);
3813		if (ret)
3814			err = ret;
3815	}
3816	return err;
3817}
3818
3819/*
3820 * Logging directories is very similar to logging inodes. We find all the items
3821 * from the current transaction and write them to the log.
3822 *
3823 * The recovery code scans the directory in the subvolume, and if it finds a
3824 * key in the range logged that is not present in the log tree, then it means
3825 * that dir entry was unlinked during the transaction.
3826 *
3827 * In order for that scan to work, we must include one key smaller than
3828 * the smallest logged by this transaction and one key larger than the largest
3829 * key logged by this transaction.
3830 */
3831static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
3832			  struct btrfs_root *root, struct btrfs_inode *inode,
3833			  struct btrfs_path *path,
3834			  struct btrfs_path *dst_path,
3835			  struct btrfs_log_ctx *ctx)
3836{
3837	u64 min_key;
3838	u64 max_key;
3839	int ret;
3840	int key_type = BTRFS_DIR_ITEM_KEY;
3841
3842again:
3843	min_key = 0;
3844	max_key = 0;
3845	while (1) {
3846		ret = log_dir_items(trans, root, inode, path, dst_path, key_type,
3847				ctx, min_key, &max_key);
3848		if (ret)
3849			return ret;
3850		if (max_key == (u64)-1)
3851			break;
3852		min_key = max_key + 1;
3853	}
3854
3855	if (key_type == BTRFS_DIR_ITEM_KEY) {
3856		key_type = BTRFS_DIR_INDEX_KEY;
3857		goto again;
3858	}
3859	return 0;
3860}
3861
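/*
 * Editor's illustrative sketch (hypothetical helper, not from the original
 * file): the resume pattern log_directory_changes() uses above. Each pass
 * covers [min, max] and reports max back; the caller restarts at max + 1
 * until a pass reaches the end of the key space at (u64)-1.
 */
static int walk_key_space_example(int (*step)(u64 min, u64 *max))
{
	u64 min = 0;
	u64 max = 0;
	int ret;

	while (1) {
		ret = step(min, &max);	/* covers [min, max] */
		if (ret)
			return ret;
		if (max == (u64)-1)
			break;		/* whole key space covered */
		min = max + 1;		/* resume right after the last range */
	}
	return 0;
}
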
3862/*
3863 * a helper function to drop items from the log before we relog an
3864 * inode.  max_key_type indicates the highest item type to remove.
3865 * This cannot be run for file data extents because it does not
3866 * free the extents they point to.
3867 */
3868static int drop_objectid_items(struct btrfs_trans_handle *trans,
3869				  struct btrfs_root *log,
3870				  struct btrfs_path *path,
3871				  u64 objectid, int max_key_type)
3872{
3873	int ret;
3874	struct btrfs_key key;
3875	struct btrfs_key found_key;
3876	int start_slot;
3877
3878	key.objectid = objectid;
3879	key.type = max_key_type;
3880	key.offset = (u64)-1;
3881
3882	while (1) {
3883		ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
3884		BUG_ON(ret == 0); /* Logic error */
3885		if (ret < 0)
3886			break;
3887
3888		if (path->slots[0] == 0)
3889			break;
3890
3891		path->slots[0]--;
3892		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
3893				      path->slots[0]);
3894
3895		if (found_key.objectid != objectid)
3896			break;
3897
3898		found_key.offset = 0;
3899		found_key.type = 0;
3900		ret = btrfs_bin_search(path->nodes[0], &found_key, &start_slot);
3901		if (ret < 0)
3902			break;
3903
3904		ret = btrfs_del_items(trans, log, path, start_slot,
3905				      path->slots[0] - start_slot + 1);
3906		/*
3907		 * If start slot isn't 0 then we don't need to re-search, we've
3908		 * found the last guy with the objectid in this tree.
3909		 */
3910		if (ret || start_slot != 0)
3911			break;
3912		btrfs_release_path(path);
3913	}
3914	btrfs_release_path(path);
3915	if (ret > 0)
3916		ret = 0;
3917	return ret;
3918}
3919
3920static void fill_inode_item(struct btrfs_trans_handle *trans,
3921			    struct extent_buffer *leaf,
3922			    struct btrfs_inode_item *item,
3923			    struct inode *inode, int log_inode_only,
3924			    u64 logged_isize)
3925{
3926	struct btrfs_map_token token;
3927
3928	btrfs_init_map_token(&token, leaf);
3929
3930	if (log_inode_only) {
3931		/* set the generation to zero so the recovery code
3932		 * can tell the difference between a log entry made
3933		 * just to say 'this inode exists' and a log entry made
3934		 * to say 'update this inode with these values'
3935		 */
3936		btrfs_set_token_inode_generation(&token, item, 0);
3937		btrfs_set_token_inode_size(&token, item, logged_isize);
3938	} else {
3939		btrfs_set_token_inode_generation(&token, item,
3940						 BTRFS_I(inode)->generation);
3941		btrfs_set_token_inode_size(&token, item, inode->i_size);
3942	}
3943
3944	btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
3945	btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
3946	btrfs_set_token_inode_mode(&token, item, inode->i_mode);
3947	btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
3948
3949	btrfs_set_token_timespec_sec(&token, &item->atime,
3950				     inode->i_atime.tv_sec);
3951	btrfs_set_token_timespec_nsec(&token, &item->atime,
3952				      inode->i_atime.tv_nsec);
3953
3954	btrfs_set_token_timespec_sec(&token, &item->mtime,
3955				     inode->i_mtime.tv_sec);
3956	btrfs_set_token_timespec_nsec(&token, &item->mtime,
3957				      inode->i_mtime.tv_nsec);
3958
3959	btrfs_set_token_timespec_sec(&token, &item->ctime,
3960				     inode->i_ctime.tv_sec);
3961	btrfs_set_token_timespec_nsec(&token, &item->ctime,
3962				      inode->i_ctime.tv_nsec);
3963
3964	/*
3965	 * We do not need to set the nbytes field; in fact, during a fast fsync
3966	 * its value may not even be correct, since a fast fsync does not wait
3967	 * for ordered extent completion (which is where we update nbytes), it
3968	 * only waits for writeback to complete. During log replay, as we find
3969	 * file extent items and replay them, we adjust the nbytes field of the
3970	 * inode item in the subvolume tree as needed (see overwrite_item()).
3971	 */
3972
3973	btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
3974	btrfs_set_token_inode_transid(&token, item, trans->transid);
3975	btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
3976	btrfs_set_token_inode_flags(&token, item, BTRFS_I(inode)->flags);
3977	btrfs_set_token_inode_block_group(&token, item, 0);
3978}
3979
3980static int log_inode_item(struct btrfs_trans_handle *trans,
3981			  struct btrfs_root *log, struct btrfs_path *path,
3982			  struct btrfs_inode *inode, bool inode_item_dropped)
3983{
3984	struct btrfs_inode_item *inode_item;
3985	int ret;
3986
3987	/*
3988	 * If we are doing a fast fsync and the inode was logged before in the
3989	 * current transaction, then we know the inode was previously logged and
3990	 * it exists in the log tree. For performance reasons, in this case use
3991	 * btrfs_search_slot() directly with ins_len set to 0 so that we never
3992	 * attempt a write lock on the leaf's parent, which adds unnecessary lock
3993	 * contention in case there are concurrent fsyncs for other inodes of the
3994	 * same subvolume. Using btrfs_insert_empty_item() when the inode item
3995	 * already exists can also result in unnecessarily splitting a leaf.
3996	 */
3997	if (!inode_item_dropped && inode->logged_trans == trans->transid) {
3998		ret = btrfs_search_slot(trans, log, &inode->location, path, 0, 1);
3999		ASSERT(ret <= 0);
4000		if (ret > 0)
4001			ret = -ENOENT;
4002	} else {
4003		/*
4004		 * This means it is the first fsync in the current transaction,
4005		 * so the inode item is not in the log and we need to insert it.
4006		 * We can never get -EEXIST because we are only called for a fast
4007		 * fsync, and if an inode eviction happens after the inode was
4008		 * logged in the current transaction, then when we load the inode
4009		 * again we set BTRFS_INODE_NEEDS_FULL_SYNC in its runtime
4010		 * flags and set ->logged_trans to 0.
4011		 */
4012		ret = btrfs_insert_empty_item(trans, log, path, &inode->location,
4013					      sizeof(*inode_item));
4014		ASSERT(ret != -EEXIST);
4015	}
4016	if (ret)
4017		return ret;
4018	inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
4019				    struct btrfs_inode_item);
4020	fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode,
4021			0, 0);
4022	btrfs_release_path(path);
4023	return 0;
4024}
4025
4026static int log_csums(struct btrfs_trans_handle *trans,
4027		     struct btrfs_inode *inode,
4028		     struct btrfs_root *log_root,
4029		     struct btrfs_ordered_sum *sums)
4030{
4031	const u64 lock_end = sums->bytenr + sums->len - 1;
4032	struct extent_state *cached_state = NULL;
4033	int ret;
4034
4035	/*
4036	 * If this inode was not used for reflink operations on new extents in
4037	 * the current transaction, then take the fast path; there is no need to
4038	 * worry about logging checksum items with overlapping ranges.
4039	 */
4040	if (inode->last_reflink_trans < trans->transid)
4041		return btrfs_csum_file_blocks(trans, log_root, sums);
4042
4043	/*
4044	 * Serialize logging for checksums. This is to avoid racing with the
4045	 * same checksum being logged by another task that is logging another
4046	 * file which happens to refer to the same extent as well. Such races
4047	 * can leave checksum items in the log with overlapping ranges.
4048	 */
4049	ret = lock_extent_bits(&log_root->log_csum_range, sums->bytenr,
4050			       lock_end, &cached_state);
4051	if (ret)
4052		return ret;
4053	/*
4054	 * Due to extent cloning, we might have logged a csum item that covers a
4055	 * subrange of a cloned extent, and later we can end up logging a csum
4056	 * item for a larger subrange of the same extent or the entire range.
4057	 * This would leave csum items in the log tree that cover the same range
4058	 * and break the searches for checksums in the log tree, resulting in
4059	 * some checksums missing in the fs/subvolume tree. So just delete (or
4060	 * trim and adjust) any existing csum items in the log for this range.
4061	 */
4062	ret = btrfs_del_csums(trans, log_root, sums->bytenr, sums->len);
4063	if (!ret)
4064		ret = btrfs_csum_file_blocks(trans, log_root, sums);
4065
4066	unlock_extent_cached(&log_root->log_csum_range, sums->bytenr, lock_end,
4067			     &cached_state);
4068
4069	return ret;
4070}
4071
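/*
 * Editor's illustrative sketch (not from the original file): the overlap
 * that log_csums() guards against. With reflink, two logged file extent
 * items can reference byte ranges of the same on-disk extent, so a csum
 * item covering part of that extent may already be in the log when a larger
 * range of it gets logged; btrfs_del_csums() removes or trims the old item
 * first so that ranges never overlap.
 */
static bool csum_ranges_overlap_example(u64 a_start, u64 a_len,
					u64 b_start, u64 b_len)
{
	return a_start < b_start + b_len && b_start < a_start + a_len;
}
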
4072static noinline int copy_items(struct btrfs_trans_handle *trans,
4073			       struct btrfs_inode *inode,
4074			       struct btrfs_path *dst_path,
4075			       struct btrfs_path *src_path,
4076			       int start_slot, int nr, int inode_only,
4077			       u64 logged_isize)
4078{
4079	struct btrfs_fs_info *fs_info = trans->fs_info;
4080	unsigned long src_offset;
4081	unsigned long dst_offset;
4082	struct btrfs_root *log = inode->root->log_root;
4083	struct btrfs_file_extent_item *extent;
4084	struct btrfs_inode_item *inode_item;
4085	struct extent_buffer *src = src_path->nodes[0];
4086	int ret;
4087	struct btrfs_key *ins_keys;
4088	u32 *ins_sizes;
4089	char *ins_data;
4090	int i;
4091	struct list_head ordered_sums;
4092	int skip_csum = inode->flags & BTRFS_INODE_NODATASUM;
4093
4094	INIT_LIST_HEAD(&ordered_sums);
4095
4096	ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
4097			   nr * sizeof(u32), GFP_NOFS);
4098	if (!ins_data)
4099		return -ENOMEM;
4100
4101	ins_sizes = (u32 *)ins_data;
4102	ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
4103
4104	for (i = 0; i < nr; i++) {
4105		ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
4106		btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
4107	}
4108	ret = btrfs_insert_empty_items(trans, log, dst_path,
4109				       ins_keys, ins_sizes, nr);
4110	if (ret) {
4111		kfree(ins_data);
4112		return ret;
4113	}
4114
4115	for (i = 0; i < nr; i++, dst_path->slots[0]++) {
4116		dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
4117						   dst_path->slots[0]);
4118
4119		src_offset = btrfs_item_ptr_offset(src, start_slot + i);
4120
4121		if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
4122			inode_item = btrfs_item_ptr(dst_path->nodes[0],
4123						    dst_path->slots[0],
4124						    struct btrfs_inode_item);
4125			fill_inode_item(trans, dst_path->nodes[0], inode_item,
4126					&inode->vfs_inode,
4127					inode_only == LOG_INODE_EXISTS,
4128					logged_isize);
4129		} else {
4130			copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
4131					   src_offset, ins_sizes[i]);
4132		}
4133
4134		/* for file data extents, look up the checksums covering
4135		 * the extent's on-disk range so they can be copied into
4136		 * the log tree after this loop
4137		 */
4138		if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY &&
4139		    !skip_csum) {
4140			int found_type;
4141			extent = btrfs_item_ptr(src, start_slot + i,
4142						struct btrfs_file_extent_item);
4143
4144			if (btrfs_file_extent_generation(src, extent) < trans->transid)
4145				continue;
4146
4147			found_type = btrfs_file_extent_type(src, extent);
4148			if (found_type == BTRFS_FILE_EXTENT_REG) {
4149				u64 ds, dl, cs, cl;
4150				ds = btrfs_file_extent_disk_bytenr(src,
4151								extent);
4152				/* ds == 0 is a hole */
4153				if (ds == 0)
4154					continue;
4155
4156				dl = btrfs_file_extent_disk_num_bytes(src,
4157								extent);
4158				cs = btrfs_file_extent_offset(src, extent);
4159				cl = btrfs_file_extent_num_bytes(src,
4160								extent);
4161				if (btrfs_file_extent_compression(src,
4162								  extent)) {
4163					cs = 0;
4164					cl = dl;
4165				}
4166
4167				ret = btrfs_lookup_csums_range(
4168						fs_info->csum_root,
4169						ds + cs, ds + cs + cl - 1,
4170						&ordered_sums, 0);
4171				if (ret)
4172					break;
4173			}
4174		}
4175	}
4176
4177	btrfs_mark_buffer_dirty(dst_path->nodes[0]);
4178	btrfs_release_path(dst_path);
4179	kfree(ins_data);
4180
4181	/*
4182	 * we have to do this after the loop above to avoid changing the
4183	 * log tree while trying to change the log tree.
4184	 */
4185	while (!list_empty(&ordered_sums)) {
4186		struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
4187						   struct btrfs_ordered_sum,
4188						   list);
4189		if (!ret)
4190			ret = log_csums(trans, inode, log, sums);
4191		list_del(&sums->list);
4192		kfree(sums);
4193	}
4194
4195	return ret;
4196}
4197
4198static int extent_cmp(void *priv, const struct list_head *a,
4199		      const struct list_head *b)
4200{
4201	const struct extent_map *em1, *em2;
4202
4203	em1 = list_entry(a, struct extent_map, list);
4204	em2 = list_entry(b, struct extent_map, list);
4205
4206	if (em1->start < em2->start)
4207		return -1;
4208	else if (em1->start > em2->start)
4209		return 1;
4210	return 0;
4211}
4212
4213static int log_extent_csums(struct btrfs_trans_handle *trans,
4214			    struct btrfs_inode *inode,
4215			    struct btrfs_root *log_root,
4216			    const struct extent_map *em,
4217			    struct btrfs_log_ctx *ctx)
4218{
4219	struct btrfs_ordered_extent *ordered;
4220	u64 csum_offset;
4221	u64 csum_len;
4222	u64 mod_start = em->mod_start;
4223	u64 mod_len = em->mod_len;
4224	LIST_HEAD(ordered_sums);
4225	int ret = 0;
4226
4227	if (inode->flags & BTRFS_INODE_NODATASUM ||
4228	    test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
4229	    em->block_start == EXTENT_MAP_HOLE)
4230		return 0;
4231
4232	list_for_each_entry(ordered, &ctx->ordered_extents, log_list) {
4233		const u64 ordered_end = ordered->file_offset + ordered->num_bytes;
4234		const u64 mod_end = mod_start + mod_len;
4235		struct btrfs_ordered_sum *sums;
4236
4237		if (mod_len == 0)
4238			break;
4239
4240		if (ordered_end <= mod_start)
4241			continue;
4242		if (mod_end <= ordered->file_offset)
4243			break;
4244
4245		/*
4246		 * We are going to copy all the csums on this ordered extent, so
4247		 * go ahead and adjust mod_start and mod_len in case this ordered
4248		 * extent has already been logged.
4249		 */
4250		if (ordered->file_offset > mod_start) {
4251			if (ordered_end >= mod_end)
4252				mod_len = ordered->file_offset - mod_start;
4253			/*
4254			 * If we have this case
4255			 *
4256			 * |--------- logged extent ---------|
4257			 *       |----- ordered extent ----|
4258			 *
4259			 * Just don't mess with mod_start and mod_len, we'll
4260			 * just end up logging more csums than we need and it
4261			 * will be ok.
4262			 */
4263		} else {
4264			if (ordered_end < mod_end) {
4265				mod_len = mod_end - ordered_end;
4266				mod_start = ordered_end;
4267			} else {
4268				mod_len = 0;
4269			}
4270		}
4271
4272		/*
4273		 * This keeps us from looping in the above case of an ordered
4274		 * extent that falls inside the logged extent.
4275		 */
4276		if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, &ordered->flags))
4277			continue;
4278
4279		list_for_each_entry(sums, &ordered->list, list) {
4280			ret = log_csums(trans, inode, log_root, sums);
4281			if (ret)
4282				return ret;
4283		}
4284	}
4285
4286	/* We're done, found all csums in the ordered extents. */
4287	if (mod_len == 0)
4288		return 0;
4289
4290	/* If we're compressed we have to save the entire range of csums. */
4291	if (em->compress_type) {
4292		csum_offset = 0;
4293		csum_len = max(em->block_len, em->orig_block_len);
4294	} else {
4295		csum_offset = mod_start - em->start;
4296		csum_len = mod_len;
4297	}
4298
4299	/* block start is already adjusted for the file extent offset. */
4300	ret = btrfs_lookup_csums_range(trans->fs_info->csum_root,
4301				       em->block_start + csum_offset,
4302				       em->block_start + csum_offset +
4303				       csum_len - 1, &ordered_sums, 0);
4304	if (ret)
4305		return ret;
4306
4307	while (!list_empty(&ordered_sums)) {
4308		struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
4309						   struct btrfs_ordered_sum,
4310						   list);
4311		if (!ret)
4312			ret = log_csums(trans, inode, log_root, sums);
4313		list_del(&sums->list);
4314		kfree(sums);
4315	}
4316
4317	return ret;
4318}
4319
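/*
 * Editor's illustrative sketch (not from the original file) of the mod
 * range adjustment done in log_extent_csums() above: given the not yet
 * covered part [*mod_start, *mod_start + *mod_len) of a logged extent and
 * an ordered extent [o_start, o_end) whose csums were just copied, shrink
 * the range to what the ordered extent did not cover.
 */
static void adjust_mod_range_example(u64 *mod_start, u64 *mod_len,
				     u64 o_start, u64 o_end)
{
	const u64 mod_end = *mod_start + *mod_len;

	if (o_start > *mod_start) {
		if (o_end >= mod_end)
			*mod_len = o_start - *mod_start;
		/* else: the ordered extent sits in the middle; logging a
		 * few extra csums is harmless, so leave the range alone */
	} else if (o_end < mod_end) {
		*mod_start = o_end;
		*mod_len = mod_end - o_end;
	} else {
		*mod_len = 0;
	}
}
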
4320static int log_one_extent(struct btrfs_trans_handle *trans,
4321			  struct btrfs_inode *inode, struct btrfs_root *root,
4322			  const struct extent_map *em,
4323			  struct btrfs_path *path,
4324			  struct btrfs_log_ctx *ctx)
4325{
4326	struct btrfs_drop_extents_args drop_args = { 0 };
4327	struct btrfs_root *log = root->log_root;
4328	struct btrfs_file_extent_item *fi;
4329	struct extent_buffer *leaf;
4330	struct btrfs_map_token token;
4331	struct btrfs_key key;
4332	u64 extent_offset = em->start - em->orig_start;
4333	u64 block_len;
4334	int ret;
4335
4336	ret = log_extent_csums(trans, inode, log, em, ctx);
4337	if (ret)
4338		return ret;
4339
4340	drop_args.path = path;
4341	drop_args.start = em->start;
4342	drop_args.end = em->start + em->len;
4343	drop_args.replace_extent = true;
4344	drop_args.extent_item_size = sizeof(*fi);
4345	ret = btrfs_drop_extents(trans, log, inode, &drop_args);
4346	if (ret)
4347		return ret;
4348
4349	if (!drop_args.extent_inserted) {
4350		key.objectid = btrfs_ino(inode);
4351		key.type = BTRFS_EXTENT_DATA_KEY;
4352		key.offset = em->start;
4353
4354		ret = btrfs_insert_empty_item(trans, log, path, &key,
4355					      sizeof(*fi));
4356		if (ret)
4357			return ret;
4358	}
4359	leaf = path->nodes[0];
4360	btrfs_init_map_token(&token, leaf);
4361	fi = btrfs_item_ptr(leaf, path->slots[0],
4362			    struct btrfs_file_extent_item);
4363
4364	btrfs_set_token_file_extent_generation(&token, fi, trans->transid);
4365	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
4366		btrfs_set_token_file_extent_type(&token, fi,
4367						 BTRFS_FILE_EXTENT_PREALLOC);
4368	else
4369		btrfs_set_token_file_extent_type(&token, fi,
4370						 BTRFS_FILE_EXTENT_REG);
4371
4372	block_len = max(em->block_len, em->orig_block_len);
4373	if (em->compress_type != BTRFS_COMPRESS_NONE) {
4374		btrfs_set_token_file_extent_disk_bytenr(&token, fi,
4375							em->block_start);
4376		btrfs_set_token_file_extent_disk_num_bytes(&token, fi, block_len);
4377	} else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
4378		btrfs_set_token_file_extent_disk_bytenr(&token, fi,
4379							em->block_start -
4380							extent_offset);
4381		btrfs_set_token_file_extent_disk_num_bytes(&token, fi, block_len);
4382	} else {
4383		btrfs_set_token_file_extent_disk_bytenr(&token, fi, 0);
4384		btrfs_set_token_file_extent_disk_num_bytes(&token, fi, 0);
4385	}
4386
4387	btrfs_set_token_file_extent_offset(&token, fi, extent_offset);
4388	btrfs_set_token_file_extent_num_bytes(&token, fi, em->len);
4389	btrfs_set_token_file_extent_ram_bytes(&token, fi, em->ram_bytes);
4390	btrfs_set_token_file_extent_compression(&token, fi, em->compress_type);
4391	btrfs_set_token_file_extent_encryption(&token, fi, 0);
4392	btrfs_set_token_file_extent_other_encoding(&token, fi, 0);
4393	btrfs_mark_buffer_dirty(leaf);
4394
4395	btrfs_release_path(path);
4396
4397	return ret;
4398}
4399
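/*
 * Editor's illustrative sketch (not from the original file): how
 * log_one_extent() derives the disk_bytenr it writes for a regular,
 * uncompressed extent. The extent map's block_start already includes the
 * shift of em->start relative to em->orig_start, so that shift is
 * subtracted again to recover the start of the on-disk extent.
 */
static u64 extent_disk_bytenr_example(u64 block_start, u64 start, u64 orig_start)
{
	const u64 extent_offset = start - orig_start;

	return block_start - extent_offset;
}
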
4400/*
4401 * Log all prealloc extents beyond the inode's i_size to make sure we do not
4402 * lose them after doing a fast fsync and replaying the log. We scan the
4403 * subvolume's root instead of iterating the inode's extent map tree because
4404 * otherwise we can log incorrect extent items based on extent map conversion.
4405 * That can happen due to the fact that extent maps are merged when they
4406 * are not in the extent map tree's list of modified extents.
4407 */
4408static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
4409				      struct btrfs_inode *inode,
4410				      struct btrfs_path *path)
4411{
4412	struct btrfs_root *root = inode->root;
4413	struct btrfs_key key;
4414	const u64 i_size = i_size_read(&inode->vfs_inode);
4415	const u64 ino = btrfs_ino(inode);
4416	struct btrfs_path *dst_path = NULL;
4417	bool dropped_extents = false;
4418	u64 truncate_offset = i_size;
4419	struct extent_buffer *leaf;
4420	int slot;
4421	int ins_nr = 0;
4422	int start_slot;
4423	int ret;
4424
4425	if (!(inode->flags & BTRFS_INODE_PREALLOC))
4426		return 0;
4427
4428	key.objectid = ino;
4429	key.type = BTRFS_EXTENT_DATA_KEY;
4430	key.offset = i_size;
4431	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4432	if (ret < 0)
4433		goto out;
4434
4435	/*
4436	 * We must check if there is a prealloc extent that starts before the
4437	 * i_size and crosses the i_size boundary. This is to ensure later we
4438	 * truncate down to the end of that extent and not to the i_size, as
4439	 * otherwise we end up losing part of the prealloc extent after a log
4440	 * replay and with an implicit hole if there is another prealloc extent
4441	 * that starts at an offset beyond i_size.
4442	 */
4443	ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY);
4444	if (ret < 0)
4445		goto out;
4446
4447	if (ret == 0) {
4448		struct btrfs_file_extent_item *ei;
4449
4450		leaf = path->nodes[0];
4451		slot = path->slots[0];
4452		ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
4453
4454		if (btrfs_file_extent_type(leaf, ei) ==
4455		    BTRFS_FILE_EXTENT_PREALLOC) {
4456			u64 extent_end;
4457
4458			btrfs_item_key_to_cpu(leaf, &key, slot);
4459			extent_end = key.offset +
4460				btrfs_file_extent_num_bytes(leaf, ei);
4461
4462			if (extent_end > i_size)
4463				truncate_offset = extent_end;
4464		}
4465	} else {
4466		ret = 0;
4467	}
4468
4469	while (true) {
4470		leaf = path->nodes[0];
4471		slot = path->slots[0];
4472
4473		if (slot >= btrfs_header_nritems(leaf)) {
4474			if (ins_nr > 0) {
4475				ret = copy_items(trans, inode, dst_path, path,
4476						 start_slot, ins_nr, 1, 0);
4477				if (ret < 0)
4478					goto out;
4479				ins_nr = 0;
4480			}
4481			ret = btrfs_next_leaf(root, path);
4482			if (ret < 0)
4483				goto out;
4484			if (ret > 0) {
4485				ret = 0;
4486				break;
4487			}
4488			continue;
4489		}
4490
4491		btrfs_item_key_to_cpu(leaf, &key, slot);
4492		if (key.objectid > ino)
4493			break;
4494		if (WARN_ON_ONCE(key.objectid < ino) ||
4495		    key.type < BTRFS_EXTENT_DATA_KEY ||
4496		    key.offset < i_size) {
4497			path->slots[0]++;
4498			continue;
4499		}
4500		if (!dropped_extents) {
4501			/*
4502			 * Avoid logging extent items that were logged in past
4503			 * fsync calls, which would lead to duplicate keys in
4504			 * the log tree.
4504			 */
4505			do {
4506				ret = btrfs_truncate_inode_items(trans,
4507							 root->log_root,
4508							 inode, truncate_offset,
4509							 BTRFS_EXTENT_DATA_KEY,
4510							 NULL);
4511			} while (ret == -EAGAIN);
4512			if (ret)
4513				goto out;
4514			dropped_extents = true;
4515		}
4516		if (ins_nr == 0)
4517			start_slot = slot;
4518		ins_nr++;
4519		path->slots[0]++;
4520		if (!dst_path) {
4521			dst_path = btrfs_alloc_path();
4522			if (!dst_path) {
4523				ret = -ENOMEM;
4524				goto out;
4525			}
4526		}
4527	}
4528	if (ins_nr > 0)
4529		ret = copy_items(trans, inode, dst_path, path,
4530				 start_slot, ins_nr, 1, 0);
4531out:
4532	btrfs_release_path(path);
4533	btrfs_free_path(dst_path);
4534	return ret;
4535}
4536
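/*
 * Editor's illustrative sketch (not from the original file): how
 * btrfs_log_prealloc_extents() picks truncate_offset. If a prealloc extent
 * begins before i_size and ends beyond it, truncating the log at i_size
 * would chop off the extent's tail, so the truncation point is moved to
 * the extent's end instead.
 */
static u64 prealloc_truncate_offset_example(u64 i_size, u64 ext_start,
					    u64 ext_end)
{
	if (ext_start < i_size && ext_end > i_size)
		return ext_end;	/* extent crosses i_size, keep it whole */
	return i_size;
}
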
4537static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
4538				     struct btrfs_root *root,
4539				     struct btrfs_inode *inode,
4540				     struct btrfs_path *path,
4541				     struct btrfs_log_ctx *ctx)
4542{
4543	struct btrfs_ordered_extent *ordered;
4544	struct btrfs_ordered_extent *tmp;
4545	struct extent_map *em, *n;
4546	struct list_head extents;
4547	struct extent_map_tree *tree = &inode->extent_tree;
4548	int ret = 0;
4549	int num = 0;
4550
4551	INIT_LIST_HEAD(&extents);
4552
4553	write_lock(&tree->lock);
4554
4555	list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
4556		list_del_init(&em->list);
4557		/*
4558		 * 32768 is just an arbitrary cutoff. Logging can get really
4559		 * CPU intensive once we start getting a lot of extents, and
4560		 * past a certain point we just want to commit the transaction
4561		 * since that will be faster.
4562		 */
4563		if (num > 32768) {
4564			list_del_init(&tree->modified_extents);
4565			ret = -EFBIG;
4566			goto process;
4567		}
4568
4569		if (em->generation < trans->transid)
4570			continue;
4571
4572		/* We log prealloc extents beyond eof later. */
4573		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) &&
4574		    em->start >= i_size_read(&inode->vfs_inode))
4575			continue;
4576
4577		/* Need a ref to keep it from getting evicted from cache */
4578		refcount_inc(&em->refs);
4579		set_bit(EXTENT_FLAG_LOGGING, &em->flags);
4580		list_add_tail(&em->list, &extents);
4581		num++;
4582	}
4583
4584	list_sort(NULL, &extents, extent_cmp);
4585process:
4586	while (!list_empty(&extents)) {
4587		em = list_entry(extents.next, struct extent_map, list);
4588
4589		list_del_init(&em->list);
4590
4591		/*
4592		 * If we had an error we just need to delete everybody from our
4593		 * private list.
4594		 */
4595		if (ret) {
4596			clear_em_logging(tree, em);
4597			free_extent_map(em);
4598			continue;
4599		}
4600
4601		write_unlock(&tree->lock);
4602
4603		ret = log_one_extent(trans, inode, root, em, path, ctx);
4604		write_lock(&tree->lock);
4605		clear_em_logging(tree, em);
4606		free_extent_map(em);
4607	}
4608	WARN_ON(!list_empty(&extents));
4609	write_unlock(&tree->lock);
4610
4611	btrfs_release_path(path);
4612	if (!ret)
4613		ret = btrfs_log_prealloc_extents(trans, inode, path);
4614	if (ret)
4615		return ret;
4616
4617	/*
4618	 * We have logged all extents successfully, now make sure the commit of
4619	 * the current transaction waits for the ordered extents to complete
4620	 * before it commits and wipes out the log trees, otherwise we would
4621	 * lose data if an ordered extent completes after the transaction
4622	 * commits and a power failure happens after the commit.
4623	 */
4624	list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
4625		list_del_init(&ordered->log_list);
4626		set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags);
4627
4628		if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
4629			spin_lock_irq(&inode->ordered_tree.lock);
4630			if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
4631				set_bit(BTRFS_ORDERED_PENDING, &ordered->flags);
4632				atomic_inc(&trans->transaction->pending_ordered);
4633			}
4634			spin_unlock_irq(&inode->ordered_tree.lock);
4635		}
4636		btrfs_put_ordered_extent(ordered);
4637	}
4638
4639	return 0;
4640}
4641
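/*
 * Editor's note (not from the original file): in the final loop of
 * btrfs_log_changed_extents() above, BTRFS_ORDERED_COMPLETE is tested twice
 * on purpose: once without a lock as a cheap filter, then again under
 * ordered_tree.lock, so that an ordered extent completing in between is not
 * wrongly counted in the transaction's pending_ordered.
 */
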
4642static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
4643			     struct btrfs_path *path, u64 *size_ret)
4644{
4645	struct btrfs_key key;
4646	int ret;
4647
4648	key.objectid = btrfs_ino(inode);
4649	key.type = BTRFS_INODE_ITEM_KEY;
4650	key.offset = 0;
4651
4652	ret = btrfs_search_slot(NULL, log, &key, path, 0, 0);
4653	if (ret < 0) {
4654		return ret;
4655	} else if (ret > 0) {
4656		*size_ret = 0;
4657	} else {
4658		struct btrfs_inode_item *item;
4659
4660		item = btrfs_item_ptr(path->nodes[0], path->slots[0],
4661				      struct btrfs_inode_item);
4662		*size_ret = btrfs_inode_size(path->nodes[0], item);
4663		/*
4664		 * If the in-memory inode's i_size is smaller than the inode
4665		 * size stored in the btree, return the inode's i_size, so
4666		 * that we get a correct inode size after replaying the log
4667		 * when, before a power failure, we had a shrinking truncate
4668		 * followed by the addition of a new name (rename / new hard link).
4669		 * Otherwise return the inode size from the btree, to avoid
4670		 * data loss when replaying a log due to previously doing a
4671		 * write that expands the inode's size and logging a new name
4672		 * immediately after.
4673		 */
4674		if (*size_ret > inode->vfs_inode.i_size)
4675			*size_ret = inode->vfs_inode.i_size;
4676	}
4677
4678	btrfs_release_path(path);
4679	return 0;
4680}
4681
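/*
 * Editor's note (not from the original file): logged_inode_size() above is
 * effectively min(inode size recorded in the log tree, in-memory i_size),
 * with 0 returned when the log has no inode item yet.
 */
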
4682/*
4683 * At the moment we always log all xattrs. This is to figure out at log replay
4684 * time which xattrs must have their deletion replayed. If an xattr is missing
4685 * from the log tree but exists in the fs/subvol tree, we delete it. This is
4686 * so that if an xattr is deleted, the inode is fsynced, and a power failure
4687 * then happens, causing the log to be replayed the next time the fs is
4688 * mounted, the xattr no longer exists (the same behaviour as other
4689 * filesystems with a journal: ext3/4, xfs, f2fs, etc).
4690 */
4691static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
4692				struct btrfs_root *root,
4693				struct btrfs_inode *inode,
4694				struct btrfs_path *path,
4695				struct btrfs_path *dst_path)
4696{
4697	int ret;
4698	struct btrfs_key key;
4699	const u64 ino = btrfs_ino(inode);
4700	int ins_nr = 0;
4701	int start_slot = 0;
4702	bool found_xattrs = false;
4703
4704	if (test_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags))
4705		return 0;
4706
4707	key.objectid = ino;
4708	key.type = BTRFS_XATTR_ITEM_KEY;
4709	key.offset = 0;
4710
4711	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4712	if (ret < 0)
4713		return ret;
4714
4715	while (true) {
4716		int slot = path->slots[0];
4717		struct extent_buffer *leaf = path->nodes[0];
4718		int nritems = btrfs_header_nritems(leaf);
4719
4720		if (slot >= nritems) {
4721			if (ins_nr > 0) {
4722				ret = copy_items(trans, inode, dst_path, path,
4723						 start_slot, ins_nr, 1, 0);
4724				if (ret < 0)
4725					return ret;
4726				ins_nr = 0;
4727			}
4728			ret = btrfs_next_leaf(root, path);
4729			if (ret < 0)
4730				return ret;
4731			else if (ret > 0)
4732				break;
4733			continue;
4734		}
4735
4736		btrfs_item_key_to_cpu(leaf, &key, slot);
4737		if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY)
4738			break;
4739
4740		if (ins_nr == 0)
4741			start_slot = slot;
4742		ins_nr++;
4743		path->slots[0]++;
4744		found_xattrs = true;
4745		cond_resched();
4746	}
4747	if (ins_nr > 0) {
4748		ret = copy_items(trans, inode, dst_path, path,
4749				 start_slot, ins_nr, 1, 0);
4750		if (ret < 0)
4751			return ret;
4752	}
4753
4754	if (!found_xattrs)
4755		set_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags);
4756
4757	return 0;
4758}
4759
4760/*
4761 * When using the NO_HOLES feature, if we punched a hole that causes the
4762 * deletion of entire leafs or all the extent items of the first leaf (the one
4763 * that contains the inode item and references) we may end up not processing
4764 * any extents, because there are no leafs with a generation matching the
4765 * current transaction that have extent items for our inode. So we need to find
4766 * out whether any holes exist and then log them. We also need to log holes
4767 * after any truncate operation that changes the inode's size.
4768 */
4769static int btrfs_log_holes(struct btrfs_trans_handle *trans,
4770			   struct btrfs_root *root,
4771			   struct btrfs_inode *inode,
4772			   struct btrfs_path *path)
4773{
4774	struct btrfs_fs_info *fs_info = root->fs_info;
4775	struct btrfs_key key;
4776	const u64 ino = btrfs_ino(inode);
4777	const u64 i_size = i_size_read(&inode->vfs_inode);
4778	u64 prev_extent_end = 0;
4779	int ret;
4780
4781	if (!btrfs_fs_incompat(fs_info, NO_HOLES) || i_size == 0)
4782		return 0;
4783
4784	key.objectid = ino;
4785	key.type = BTRFS_EXTENT_DATA_KEY;
4786	key.offset = 0;
4787
4788	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4789	if (ret < 0)
4790		return ret;
4791
4792	while (true) {
4793		struct extent_buffer *leaf = path->nodes[0];
4794
4795		if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
4796			ret = btrfs_next_leaf(root, path);
4797			if (ret < 0)
4798				return ret;
4799			if (ret > 0) {
4800				ret = 0;
4801				break;
4802			}
4803			leaf = path->nodes[0];
4804		}
4805
4806		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4807		if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
4808			break;
4809
4810		/* We have a hole, log it. */
4811		if (prev_extent_end < key.offset) {
4812			const u64 hole_len = key.offset - prev_extent_end;
4813
4814			/*
4815			 * Release the path to avoid deadlocks with other code
4816			 * paths that search the root while holding locks on
4817			 * leafs from the log root.
4818			 */
4819			btrfs_release_path(path);
4820			ret = btrfs_insert_file_extent(trans, root->log_root,
4821						       ino, prev_extent_end, 0,
4822						       0, hole_len, 0, hole_len,
4823						       0, 0, 0);
4824			if (ret < 0)
4825				return ret;
4826
4827			/*
4828			 * Search for the same key again in the root. Since it's
4829			 * an extent item and we are holding the inode lock, the
4830			 * key must still exist. If it doesn't, just emit a warning
4831			 * and return an error to fall back to a transaction
4832			 * commit.
4833			 */
4834			ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4835			if (ret < 0)
4836				return ret;
4837			if (WARN_ON(ret > 0))
4838				return -ENOENT;
4839			leaf = path->nodes[0];
4840		}
4841
4842		prev_extent_end = btrfs_file_extent_end(path);
4843		path->slots[0]++;
4844		cond_resched();
4845	}
4846
4847	if (prev_extent_end < i_size) {
4848		u64 hole_len;
4849
4850		btrfs_release_path(path);
4851		hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize);
4852		ret = btrfs_insert_file_extent(trans, root->log_root,
4853					       ino, prev_extent_end, 0, 0,
4854					       hole_len, 0, hole_len,
4855					       0, 0, 0);
4856		if (ret < 0)
4857			return ret;
4858	}
4859
4860	return 0;
4861}
4862
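/*
 * Editor's illustrative sketch (not from the original file): the hole
 * detection btrfs_log_holes() performs while walking a file's extent items
 * in offset order. Any gap between the end of the previous extent and the
 * start of the next one is a hole (and so is a gap between the last
 * extent's end and i_size).
 */
static u64 hole_len_example(u64 prev_extent_end, u64 next_extent_start)
{
	if (next_extent_start > prev_extent_end)
		return next_extent_start - prev_extent_end;
	return 0;
}
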
4863/*
4864 * When we are logging a new inode X, check whether it has a reference that
4865 * matches a reference from some other inode Y created in a past transaction
4866 * and renamed in the current transaction. If we don't do this, then at
4867 * log replay time we can lose inode Y (and all its files if it's a directory):
4868 *
4869 * mkdir /mnt/x
4870 * echo "hello world" > /mnt/x/foobar
4871 * sync
4872 * mv /mnt/x /mnt/y
4873 * mkdir /mnt/x                 # or touch /mnt/x
4874 * xfs_io -c fsync /mnt/x
4875 * <power fail>
4876 * mount fs, trigger log replay
4877 *
4878 * After the log replay procedure, we would lose the first directory and all its
4879 * files (file foobar).
4880 * For the case where inode Y is not a directory we simply end up losing it:
4881 *
4882 * echo "123" > /mnt/foo
4883 * sync
4884 * mv /mnt/foo /mnt/bar
4885 * echo "abc" > /mnt/foo
4886 * xfs_io -c fsync /mnt/foo
4887 * <power fail>
4888 *
4889 * We also need this for cases where a snapshot entry is replaced by some other
4890 * entry (file or directory) otherwise we end up with an unreplayable log due to
4891 * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as
4892 * if it were a regular entry:
4893 *
4894 * mkdir /mnt/x
4895 * btrfs subvolume snapshot /mnt /mnt/x/snap
4896 * btrfs subvolume delete /mnt/x/snap
4897 * rmdir /mnt/x
4898 * mkdir /mnt/x
4899 * fsync /mnt/x or fsync some new file inside it
4900 * <power fail>
4901 *
4902 * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in
4903 * the same transaction.
4904 */
4905static int btrfs_check_ref_name_override(struct extent_buffer *eb,
4906					 const int slot,
4907					 const struct btrfs_key *key,
4908					 struct btrfs_inode *inode,
4909					 u64 *other_ino, u64 *other_parent)
4910{
4911	int ret;
4912	struct btrfs_path *search_path;
4913	char *name = NULL;
4914	u32 name_len = 0;
4915	u32 item_size = btrfs_item_size_nr(eb, slot);
4916	u32 cur_offset = 0;
4917	unsigned long ptr = btrfs_item_ptr_offset(eb, slot);
4918
4919	search_path = btrfs_alloc_path();
4920	if (!search_path)
4921		return -ENOMEM;
4922	search_path->search_commit_root = 1;
4923	search_path->skip_locking = 1;
4924
4925	while (cur_offset < item_size) {
4926		u64 parent;
4927		u32 this_name_len;
4928		u32 this_len;
4929		unsigned long name_ptr;
4930		struct btrfs_dir_item *di;
4931
4932		if (key->type == BTRFS_INODE_REF_KEY) {
4933			struct btrfs_inode_ref *iref;
4934
4935			iref = (struct btrfs_inode_ref *)(ptr + cur_offset);
4936			parent = key->offset;
4937			this_name_len = btrfs_inode_ref_name_len(eb, iref);
4938			name_ptr = (unsigned long)(iref + 1);
4939			this_len = sizeof(*iref) + this_name_len;
4940		} else {
4941			struct btrfs_inode_extref *extref;
4942
4943			extref = (struct btrfs_inode_extref *)(ptr +
4944							       cur_offset);
4945			parent = btrfs_inode_extref_parent(eb, extref);
4946			this_name_len = btrfs_inode_extref_name_len(eb, extref);
4947			name_ptr = (unsigned long)&extref->name;
4948			this_len = sizeof(*extref) + this_name_len;
4949		}
4950
4951		if (this_name_len > name_len) {
4952			char *new_name;
4953
4954			new_name = krealloc(name, this_name_len, GFP_NOFS);
4955			if (!new_name) {
4956				ret = -ENOMEM;
4957				goto out;
4958			}
4959			name_len = this_name_len;
4960			name = new_name;
4961		}
4962
4963		read_extent_buffer(eb, name, name_ptr, this_name_len);
4964		di = btrfs_lookup_dir_item(NULL, inode->root, search_path,
4965				parent, name, this_name_len, 0);
4966		if (di && !IS_ERR(di)) {
4967			struct btrfs_key di_key;
4968
4969			btrfs_dir_item_key_to_cpu(search_path->nodes[0],
4970						  di, &di_key);
4971			if (di_key.type == BTRFS_INODE_ITEM_KEY) {
4972				if (di_key.objectid != key->objectid) {
4973					ret = 1;
4974					*other_ino = di_key.objectid;
4975					*other_parent = parent;
4976				} else {
4977					ret = 0;
4978				}
4979			} else {
4980				ret = -EAGAIN;
4981			}
4982			goto out;
4983		} else if (IS_ERR(di)) {
4984			ret = PTR_ERR(di);
4985			goto out;
4986		}
4987		btrfs_release_path(search_path);
4988
4989		cur_offset += this_len;
4990	}
4991	ret = 0;
4992out:
4993	btrfs_free_path(search_path);
4994	kfree(name);
4995	return ret;
4996}
4997
4998struct btrfs_ino_list {
4999	u64 ino;
5000	u64 parent;
5001	struct list_head list;
5002};
5003
5004static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
5005				  struct btrfs_root *root,
5006				  struct btrfs_path *path,
5007				  struct btrfs_log_ctx *ctx,
5008				  u64 ino, u64 parent)
5009{
5010	struct btrfs_ino_list *ino_elem;
5011	LIST_HEAD(inode_list);
5012	int ret = 0;
5013
5014	ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
5015	if (!ino_elem)
5016		return -ENOMEM;
5017	ino_elem->ino = ino;
5018	ino_elem->parent = parent;
5019	list_add_tail(&ino_elem->list, &inode_list);
5020
5021	while (!list_empty(&inode_list)) {
5022		struct btrfs_fs_info *fs_info = root->fs_info;
5023		struct btrfs_key key;
5024		struct inode *inode;
5025
5026		ino_elem = list_first_entry(&inode_list, struct btrfs_ino_list,
5027					    list);
5028		ino = ino_elem->ino;
5029		parent = ino_elem->parent;
5030		list_del(&ino_elem->list);
5031		kfree(ino_elem);
5032		if (ret)
5033			continue;
5034
5035		btrfs_release_path(path);
5036
5037		inode = btrfs_iget(fs_info->sb, ino, root);
5038		/*
5039		 * If the other inode that had a conflicting dir entry was
5040		 * deleted in the current transaction, we need to log its parent
5041		 * directory.
5042		 */
5043		if (IS_ERR(inode)) {
5044			ret = PTR_ERR(inode);
5045			if (ret == -ENOENT) {
5046				inode = btrfs_iget(fs_info->sb, parent, root);
5047				if (IS_ERR(inode)) {
5048					ret = PTR_ERR(inode);
5049				} else {
5050					ret = btrfs_log_inode(trans, root,
5051						      BTRFS_I(inode),
5052						      LOG_OTHER_INODE_ALL,
5053						      ctx);
5054					btrfs_add_delayed_iput(inode);
5055				}
5056			}
5057			continue;
5058		}
5059		/*
5060		 * If the inode was already logged skip it - otherwise we can
5061		 * hit an infinite loop. Example:
5062		 *
5063		 * From the commit root (previous transaction) we have the
5064		 * following inodes:
5065		 *
5066		 * inode 257 a directory
5067		 * inode 258 with references "zz" and "zz_link" on inode 257
5068		 * inode 259 with reference "a" on inode 257
5069		 *
5070		 * And in the current (uncommitted) transaction we have:
5071		 *
5072		 * inode 257 a directory, unchanged
5073		 * inode 258 with references "a" and "a2" on inode 257
5074		 * inode 259 with reference "zz_link" on inode 257
5075		 * inode 261 with reference "zz" on inode 257
5076		 *
5077		 * When logging inode 261 the following infinite loop could
5078		 * happen if we don't skip already logged inodes:
5079		 *
5080		 * - we detect inode 258 as a conflicting inode, with inode 261
5081		 *   on reference "zz", and log it;
5082		 *
5083		 * - we detect inode 259 as a conflicting inode, with inode 258
5084		 *   on reference "a", and log it;
5085		 *
5086		 * - we detect inode 258 as a conflicting inode, with inode 259
5087		 *   on reference "zz_link", and log it - again! After this we
5088		 *   repeat the above steps forever.
5089		 */
5090		spin_lock(&BTRFS_I(inode)->lock);
5091		/*
5092		 * Check the inode's logged_trans only instead of
5093		 * btrfs_inode_in_log(). This is because the last_log_commit of
5094		 * the inode is not updated when we only log that it exists and
5095		 * it has the full sync bit set (see btrfs_log_inode()).
5096		 */
5097		if (BTRFS_I(inode)->logged_trans == trans->transid) {
5098			spin_unlock(&BTRFS_I(inode)->lock);
5099			btrfs_add_delayed_iput(inode);
5100			continue;
5101		}
5102		spin_unlock(&BTRFS_I(inode)->lock);
5103		/*
5104		 * We are safe logging the other inode without acquiring its
5105		 * lock as long as we log with the LOG_INODE_EXISTS mode. We
5106		 * are safe against concurrent renames of the other inode as
5107		 * well because during a rename we pin the log and update the
5108		 * log with the new name before we unpin it.
5109		 */
5110		ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
5111				      LOG_OTHER_INODE, ctx);
5112		if (ret) {
5113			btrfs_add_delayed_iput(inode);
5114			continue;
5115		}
5116
5117		key.objectid = ino;
5118		key.type = BTRFS_INODE_REF_KEY;
5119		key.offset = 0;
5120		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5121		if (ret < 0) {
5122			btrfs_add_delayed_iput(inode);
5123			continue;
5124		}
5125
5126		while (true) {
5127			struct extent_buffer *leaf = path->nodes[0];
5128			int slot = path->slots[0];
5129			u64 other_ino = 0;
5130			u64 other_parent = 0;
5131
5132			if (slot >= btrfs_header_nritems(leaf)) {
5133				ret = btrfs_next_leaf(root, path);
5134				if (ret < 0) {
5135					break;
5136				} else if (ret > 0) {
5137					ret = 0;
5138					break;
5139				}
5140				continue;
5141			}
5142
5143			btrfs_item_key_to_cpu(leaf, &key, slot);
5144			if (key.objectid != ino ||
5145			    (key.type != BTRFS_INODE_REF_KEY &&
5146			     key.type != BTRFS_INODE_EXTREF_KEY)) {
5147				ret = 0;
5148				break;
5149			}
5150
5151			ret = btrfs_check_ref_name_override(leaf, slot, &key,
5152					BTRFS_I(inode), &other_ino,
5153					&other_parent);
5154			if (ret < 0)
5155				break;
5156			if (ret > 0) {
5157				ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
5158				if (!ino_elem) {
5159					ret = -ENOMEM;
5160					break;
5161				}
5162				ino_elem->ino = other_ino;
5163				ino_elem->parent = other_parent;
5164				list_add_tail(&ino_elem->list, &inode_list);
5165				ret = 0;
5166			}
5167			path->slots[0]++;
5168		}
5169		btrfs_add_delayed_iput(inode);
5170	}
5171
5172	return ret;
5173}
5174
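/*
 * Editor's note (not from the original file): log_conflicting_inodes()
 * above is a worklist traversal. It starts with one (ino, parent) pair and
 * every conflicting reference discovered while logging appends another pair
 * to inode_list; inodes already logged in this transaction are skipped,
 * which is what guarantees termination (see the inode 258/259 example in
 * the comment inside the function).
 */
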
5175static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
5176				   struct btrfs_inode *inode,
5177				   struct btrfs_key *min_key,
5178				   const struct btrfs_key *max_key,
5179				   struct btrfs_path *path,
5180				   struct btrfs_path *dst_path,
5181				   const u64 logged_isize,
5182				   const bool recursive_logging,
5183				   const int inode_only,
5184				   struct btrfs_log_ctx *ctx,
5185				   bool *need_log_inode_item)
5186{
5187	struct btrfs_root *root = inode->root;
5188	int ins_start_slot = 0;
5189	int ins_nr = 0;
5190	int ret;
5191
5192	while (1) {
5193		ret = btrfs_search_forward(root, min_key, path, trans->transid);
5194		if (ret < 0)
5195			return ret;
5196		if (ret > 0) {
5197			ret = 0;
5198			break;
5199		}
5200again:
5201		/* Note, ins_nr might be > 0 here, cleanup outside the loop */
5202		if (min_key->objectid != max_key->objectid)
5203			break;
5204		if (min_key->type > max_key->type)
5205			break;
5206
5207		if (min_key->type == BTRFS_INODE_ITEM_KEY)
5208			*need_log_inode_item = false;
5209
5210		if ((min_key->type == BTRFS_INODE_REF_KEY ||
5211		     min_key->type == BTRFS_INODE_EXTREF_KEY) &&
5212		    inode->generation == trans->transid &&
5213		    !recursive_logging) {
5214			u64 other_ino = 0;
5215			u64 other_parent = 0;
5216
5217			ret = btrfs_check_ref_name_override(path->nodes[0],
5218					path->slots[0], min_key, inode,
5219					&other_ino, &other_parent);
5220			if (ret < 0) {
5221				return ret;
5222			} else if (ret > 0 && ctx &&
5223				   other_ino != btrfs_ino(BTRFS_I(ctx->inode))) {
5224				if (ins_nr > 0) {
5225					ins_nr++;
5226				} else {
5227					ins_nr = 1;
5228					ins_start_slot = path->slots[0];
5229				}
5230				ret = copy_items(trans, inode, dst_path, path,
5231						 ins_start_slot, ins_nr,
5232						 inode_only, logged_isize);
5233				if (ret < 0)
5234					return ret;
5235				ins_nr = 0;
5236
5237				ret = log_conflicting_inodes(trans, root, path,
5238						ctx, other_ino, other_parent);
5239				if (ret)
5240					return ret;
5241				btrfs_release_path(path);
5242				goto next_key;
5243			}
5244		}
5245
5246		/* Skip xattrs, we log them later with btrfs_log_all_xattrs() */
5247		if (min_key->type == BTRFS_XATTR_ITEM_KEY) {
5248			if (ins_nr == 0)
5249				goto next_slot;
5250			ret = copy_items(trans, inode, dst_path, path,
5251					 ins_start_slot,
5252					 ins_nr, inode_only, logged_isize);
5253			if (ret < 0)
5254				return ret;
5255			ins_nr = 0;
5256			goto next_slot;
5257		}
5258
5259		if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
5260			ins_nr++;
5261			goto next_slot;
5262		} else if (!ins_nr) {
5263			ins_start_slot = path->slots[0];
5264			ins_nr = 1;
5265			goto next_slot;
5266		}
5267
5268		ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
5269				 ins_nr, inode_only, logged_isize);
5270		if (ret < 0)
5271			return ret;
5272		ins_nr = 1;
5273		ins_start_slot = path->slots[0];
5274next_slot:
5275		path->slots[0]++;
5276		if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
5277			btrfs_item_key_to_cpu(path->nodes[0], min_key,
5278					      path->slots[0]);
5279			goto again;
5280		}
5281		if (ins_nr) {
5282			ret = copy_items(trans, inode, dst_path, path,
5283					 ins_start_slot, ins_nr, inode_only,
5284					 logged_isize);
5285			if (ret < 0)
5286				return ret;
5287			ins_nr = 0;
5288		}
5289		btrfs_release_path(path);
5290next_key:
5291		if (min_key->offset < (u64)-1) {
5292			min_key->offset++;
5293		} else if (min_key->type < max_key->type) {
5294			min_key->type++;
5295			min_key->offset = 0;
5296		} else {
5297			break;
5298		}
5299	}
5300	if (ins_nr)
5301		ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
5302				 ins_nr, inode_only, logged_isize);
5303
5304	return ret;
5305}
5306
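/*
 * Editor's illustrative sketch (hypothetical helper, not from the original
 * file): the ins_start_slot/ins_nr batching used by
 * copy_inode_items_to_log() above. Consecutive leaf slots are accumulated
 * and flushed with a single copy_items() call; any discontinuity starts a
 * new batch. Returns the size of the batch to flush, or 0 if the current
 * slot simply extended the batch.
 */
static int batch_slot_example(int *ins_start_slot, int *ins_nr, int cur_slot)
{
	int to_flush = 0;

	if (*ins_nr && *ins_start_slot + *ins_nr == cur_slot) {
		(*ins_nr)++;			/* contiguous, extend batch */
	} else {
		to_flush = *ins_nr;		/* copy_items() would run here */
		*ins_start_slot = cur_slot;	/* start a new batch */
		*ins_nr = 1;
	}
	return to_flush;
}
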
5307/* log a single inode in the tree log.
5308 * At least one parent directory for this inode must exist in the tree
5309 * or be logged already.
5310 *
5311 * Any items from this inode changed by the current transaction are copied
5312 * to the log tree.  An extra reference is taken on any extents in this
5313 * file, allowing us to avoid a whole pile of corner cases around logging
5314 * blocks that have been removed from the tree.
5315 *
5316 * See LOG_INODE_ALL and related defines for a description of what inode_only
5317 * does.
5318 *
5319 * This handles both files and directories.
5320 */
5321static int btrfs_log_inode(struct btrfs_trans_handle *trans,
5322			   struct btrfs_root *root, struct btrfs_inode *inode,
5323			   int inode_only,
5324			   struct btrfs_log_ctx *ctx)
5325{
5326	struct btrfs_path *path;
5327	struct btrfs_path *dst_path;
5328	struct btrfs_key min_key;
5329	struct btrfs_key max_key;
5330	struct btrfs_root *log = root->log_root;
5331	int err = 0;
5332	int ret = 0;
5333	bool fast_search = false;
5334	u64 ino = btrfs_ino(inode);
5335	struct extent_map_tree *em_tree = &inode->extent_tree;
5336	u64 logged_isize = 0;
5337	bool need_log_inode_item = true;
5338	bool xattrs_logged = false;
5339	bool recursive_logging = false;
5340	bool inode_item_dropped = true;
5341
5342	path = btrfs_alloc_path();
5343	if (!path)
5344		return -ENOMEM;
5345	dst_path = btrfs_alloc_path();
5346	if (!dst_path) {
5347		btrfs_free_path(path);
5348		return -ENOMEM;
5349	}
5350
5351	min_key.objectid = ino;
5352	min_key.type = BTRFS_INODE_ITEM_KEY;
5353	min_key.offset = 0;
5354
5355	max_key.objectid = ino;
5356
5358	/* today the code can only do partial logging of directories */
5359	if (S_ISDIR(inode->vfs_inode.i_mode) ||
5360	    (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
5361		       &inode->runtime_flags) &&
5362	     inode_only >= LOG_INODE_EXISTS))
5363		max_key.type = BTRFS_XATTR_ITEM_KEY;
5364	else
5365		max_key.type = (u8)-1;
5366	max_key.offset = (u64)-1;
5367
5368	/*
5369	 * Only run delayed items if we are a directory. We want to make sure
5370	 * all directory indexes hit the fs/subvolume tree so we can find them
5371	 * and figure out which index ranges have to be logged.
5372	 *
5373	 * Otherwise commit the delayed inode only if the full sync flag is set,
5374	 * as we want to make sure an up to date version is in the subvolume
5375	 * tree so copy_inode_items_to_log() / copy_items() can find it and copy
5376	 * it to the log tree. For a non full sync, we always log the inode item
5377	 * based on the in-memory struct btrfs_inode which is always up to date.
5378	 */
5379	if (S_ISDIR(inode->vfs_inode.i_mode))
5380		ret = btrfs_commit_inode_delayed_items(trans, inode);
5381	else if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags))
5382		ret = btrfs_commit_inode_delayed_inode(inode);
5383
5384	if (ret) {
5385		btrfs_free_path(path);
5386		btrfs_free_path(dst_path);
5387		return ret;
5388	}
5389
5390	if (inode_only == LOG_OTHER_INODE || inode_only == LOG_OTHER_INODE_ALL) {
5391		recursive_logging = true;
5392		if (inode_only == LOG_OTHER_INODE)
5393			inode_only = LOG_INODE_EXISTS;
5394		else
5395			inode_only = LOG_INODE_ALL;
5396		mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING);
5397	} else {
5398		mutex_lock(&inode->log_mutex);
5399	}
5400
5401	/*
5402	 * This is for cases where logging a directory could result in losing
5403	 * a file after replaying the log. For example, if we move a file from
5404	 * directory A to directory B, then fsync directory A, we have no way
5405	 * to know the file was moved from A to B, so logging just A would
5406	 * result in losing the file after a log replay.
5407	 */
5408	if (S_ISDIR(inode->vfs_inode.i_mode) &&
5409	    inode_only == LOG_INODE_ALL &&
5410	    inode->last_unlink_trans >= trans->transid) {
5411		btrfs_set_log_full_commit(trans);
5412		err = 1;
5413		goto out_unlock;
5414	}
5415
5416	/*
5417	 * A brute force approach to making sure we get the most up-to-date
5418	 * copies of everything.
5419	 */
5420	if (S_ISDIR(inode->vfs_inode.i_mode)) {
5421		int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
5422
5423		clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags);
5424		if (inode_only == LOG_INODE_EXISTS)
5425			max_key_type = BTRFS_XATTR_ITEM_KEY;
5426		ret = drop_objectid_items(trans, log, path, ino, max_key_type);
5427	} else {
5428		if (inode_only == LOG_INODE_EXISTS) {
5429			/*
5430			 * Make sure the new inode item we write to the log has
5431			 * the same isize as the current one (if it exists).
5432			 * This is necessary to prevent data loss after log
5433			 * replay, and also to prevent doing a wrong expanding
5434			 * truncate. For example: create a file, write 4K into offset
5435			 * 0, fsync, write 4K into offset 4096, add a hard link,
5436			 * fsync some other file (to sync the log), power fail. If
5437			 * we use the inode's current i_size, after log replay
5438			 * we get an 8Kb file, with the last 4Kb extent as a hole
5439			 * (zeroes), as if an expanding truncate happened,
5440			 * instead of getting a file of 4Kb only.
5441			 */
5442			err = logged_inode_size(log, inode, path, &logged_isize);
5443			if (err)
5444				goto out_unlock;
5445		}
5446		if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
5447			     &inode->runtime_flags)) {
5448			if (inode_only == LOG_INODE_EXISTS) {
5449				max_key.type = BTRFS_XATTR_ITEM_KEY;
5450				ret = drop_objectid_items(trans, log, path, ino,
5451							  max_key.type);
5452			} else {
5453				clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
5454					  &inode->runtime_flags);
5455				clear_bit(BTRFS_INODE_COPY_EVERYTHING,
5456					  &inode->runtime_flags);
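				/*
				 * Drop all of this inode's items from the
				 * log tree. btrfs_truncate_inode_items()
				 * returns -EAGAIN when it has to be
				 * restarted, so simply retry until it
				 * either completes or fails for real.
				 */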
5457				while (1) {
5458					ret = btrfs_truncate_inode_items(trans,
5459						log, inode, 0, 0, NULL);
5460					if (ret != -EAGAIN)
5461						break;
5462				}
5463			}
5464		} else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
5465					      &inode->runtime_flags) ||
5466			   inode_only == LOG_INODE_EXISTS) {
5467			if (inode_only == LOG_INODE_ALL)
5468				fast_search = true;
5469			max_key.type = BTRFS_XATTR_ITEM_KEY;
5470			ret = drop_objectid_items(trans, log, path, ino,
5471						  max_key.type);
5472		} else {
5473			if (inode_only == LOG_INODE_ALL)
5474				fast_search = true;
5475			inode_item_dropped = false;
5476			goto log_extents;
5477		}
5478
5479	}
5480	if (ret) {
5481		err = ret;
5482		goto out_unlock;
5483	}
5484
5485	err = copy_inode_items_to_log(trans, inode, &min_key, &max_key,
5486				      path, dst_path, logged_isize,
5487				      recursive_logging, inode_only, ctx,
5488				      &need_log_inode_item);
5489	if (err)
5490		goto out_unlock;
5491
5492	btrfs_release_path(path);
5493	btrfs_release_path(dst_path);
5494	err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path);
5495	if (err)
5496		goto out_unlock;
5497	xattrs_logged = true;
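	/*
	 * When not using the fast path and the key range covers file extent
	 * items, log file holes explicitly as well: with the NO_HOLES feature
	 * enabled, holes have no extent items in the subvolume tree, so the
	 * copy above alone would lose them at log replay time.
	 */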
5498	if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
5499		btrfs_release_path(path);
5500		btrfs_release_path(dst_path);
5501		err = btrfs_log_holes(trans, root, inode, path);
5502		if (err)
5503			goto out_unlock;
5504	}
5505log_extents:
5506	btrfs_release_path(path);
5507	btrfs_release_path(dst_path);
5508	if (need_log_inode_item) {
5509		err = log_inode_item(trans, log, dst_path, inode, inode_item_dropped);
5510		if (err)
5511			goto out_unlock;
5512		/*
5513		 * If we are doing a fast fsync and the inode was logged before
5514		 * in this transaction, we don't need to log the xattrs because
5515		 * they were logged before. If xattrs were added, changed or
5516		 * deleted since the last time we logged the inode, then we have
5517		 * already logged them because the inode had the runtime flag
5518		 * BTRFS_INODE_COPY_EVERYTHING set.
5519		 */
5520		if (!xattrs_logged && inode->logged_trans < trans->transid) {
5521			err = btrfs_log_all_xattrs(trans, root, inode, path,
5522						   dst_path);
5523			if (err)
5524				goto out_unlock;
5525			btrfs_release_path(path);
5526		}
5527	}
5528	if (fast_search) {
5529		ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
5530						ctx);
5531		if (ret) {
5532			err = ret;
5533			goto out_unlock;
5534		}
5535	} else if (inode_only == LOG_INODE_ALL) {
5536		struct extent_map *em, *n;
5537
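		/*
		 * The full tree copy above already captured every extent
		 * item, so the in-memory list of modified extents is
		 * redundant for this log attempt; just empty it.
		 */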
5538		write_lock(&em_tree->lock);
5539		list_for_each_entry_safe(em, n, &em_tree->modified_extents, list)
5540			list_del_init(&em->list);
5541		write_unlock(&em_tree->lock);
5542	}
5543
5544	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) {
5545		ret = log_directory_changes(trans, root, inode, path, dst_path,
5546					ctx);
5547		if (ret) {
5548			err = ret;
5549			goto out_unlock;
5550		}
5551	}
5552
5553	/*
5554	 * If we are logging that an ancestor inode exists as part of logging a
5555	 * new name from a link or rename operation, don't mark the inode as
5556	 * logged - otherwise if an explicit fsync is made against an ancestor,
5557	 * the fsync would consider the inode already in the log and not sync it,
5558	 * resulting in the ancestor missing after a power failure unless the
5559	 * log was synced as part of an fsync against any other unrelated inode.
5560	 * So keep it simple for this case and just don't flag the ancestors as
5561	 * logged.
5562	 */
5563	if (!ctx ||
5564	    !(S_ISDIR(inode->vfs_inode.i_mode) && ctx->logging_new_name &&
5565	      &inode->vfs_inode != ctx->inode)) {
5566		spin_lock(&inode->lock);
5567		inode->logged_trans = trans->transid;
5568		/*
5569		 * Don't update last_log_commit if we logged that an inode exists.
5570		 * We do this for two reasons:
5571		 *
5572		 * 1) We might have had buffered writes to this inode that were
5573		 *    flushed and had their ordered extents completed in this
5574		 *    transaction, but we did not previously log the inode with
5575		 *    LOG_INODE_ALL. Later the inode was evicted and after that
5576		 *    it was loaded again and this LOG_INODE_EXISTS log operation
5577		 *    happened. We must make sure that if an explicit fsync against
5578		 *    the inode is performed later, it logs the new extents, an
5579		 *    updated inode item, etc, and syncs the log. The same logic
5580		 *    applies to direct IO writes instead of buffered writes.
5581		 *
5582		 * 2) When we log the inode with LOG_INODE_EXISTS, its inode item
5583		 *    is logged with an i_size of 0 or whatever value was logged
5584		 *    before. If later the i_size of the inode is increased by a
5585		 *    truncate operation, the log is synced through an fsync of
5586		 *    some other inode and then finally an explicit fsync against
5587		 *    this inode is made, we must make sure this fsync logs the
5588		 *    inode with the new i_size, the hole between old i_size and
5589		 *    the new i_size, and syncs the log.
5590		 */
5591		if (inode_only != LOG_INODE_EXISTS)
5592			inode->last_log_commit = inode->last_sub_trans;
5593		spin_unlock(&inode->lock);
5594	}
5595out_unlock:
5596	mutex_unlock(&inode->log_mutex);
5597
5598	btrfs_free_path(path);
5599	btrfs_free_path(dst_path);
5600	return err;
5601}
5602
5603/*
5604 * Check if we need to log an inode. This is used in contexts where while
5605 * logging an inode we need to log another inode (either that it exists or in
5606 * full mode). This is used instead of btrfs_inode_in_log() because the latter
5607 * requires the inode to be in the log and have the log transaction committed,
5608 * while here we do not care if the log transaction was already committed - our
5609 * caller will commit the log later - and we want to avoid logging an inode
5610 * multiple times when multiple tasks have joined the same log transaction.
5611 */
5612static bool need_log_inode(struct btrfs_trans_handle *trans,
5613			   struct btrfs_inode *inode)
5614{
5615	/*
5616	 * If this inode does not have new/updated/deleted xattrs since the last
5617	 * time it was logged and is flagged as logged in the current transaction,
5618	 * we can skip logging it. As for new/deleted names, those are updated in
5619	 * the log by link/unlink/rename operations.
5620	 * In case the inode was logged and then evicted and reloaded, its
5621	 * logged_trans will be 0, in which case we have to fully log it since
5622	 * logged_trans is a transient field, not persisted.
5623	 */
5624	if (inode->logged_trans == trans->transid &&
5625	    !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags))
5626		return false;
5627
5628	return true;
5629}
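
/*
 * Sketch of the typical caller pattern (inode and log_mode here are
 * placeholders; see log_new_dir_dentries() and btrfs_log_all_parents() below
 * for real callers):
 *
 *	if (need_log_inode(trans, inode))
 *		ret = btrfs_log_inode(trans, root, inode, log_mode, ctx);
 */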
5630
5631struct btrfs_dir_list {
5632	u64 ino;
5633	struct list_head list;
5634};
5635
5636/*
5637 * Log the inodes of the new dentries of a directory. See log_dir_items() for
5638 * details about why it is needed.
5639 * This is a recursive operation - if an existing dentry corresponds to a
5640 * directory, that directory's new entries are logged too (same behaviour as
5641 * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
5642 * the dentries point to we do not lock their i_mutex, otherwise lockdep
5643 * complains about the following circular lock dependency / possible deadlock:
5644 *
5645 *        CPU0                                        CPU1
5646 *        ----                                        ----
5647 * lock(&type->i_mutex_dir_key#3/2);
5648 *                                            lock(sb_internal#2);
5649 *                                            lock(&type->i_mutex_dir_key#3/2);
5650 * lock(&sb->s_type->i_mutex_key#14);
5651 *
5652 * Where sb_internal is the lock (a counter that works as a lock) acquired by
5653 * sb_start_intwrite() in btrfs_start_transaction().
5654 * Not locking i_mutex of the inodes is still safe because:
5655 *
5656 * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
5657 *    that while logging the inode new references (names) are added or removed
5658 *    from the inode, leaving the logged inode item with a link count that does
5659 *    not match the number of logged inode reference items. This is fine because
5660 *    at log replay time we compute the real number of links and correct the
5661 *    link count in the inode item (see replay_one_buffer() and
5662 *    link_to_fixup_dir());
5663 *
5664 * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
5665 *    while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and
5666 *    BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item
5667 *    has a size that doesn't match the sum of the lengths of all the logged
5668 *    names. This does not result in a problem because if a dir_item key is
5669 *    logged but its matching dir_index key is not logged, at log replay time we
5670 *    don't use it to replay the respective name (see replay_one_name()). On the
5671 *    other hand if only the dir_index key ends up being logged, the respective
5672 *    name is added to the fs/subvol tree with both the dir_item and dir_index
5673 *    keys created (see replay_one_name()).
5674 *    The directory's inode item with a wrong i_size is not a problem as well,
5675 *    The directory's inode item with a wrong i_size is not a problem either,
5676 *    item of the fs/subvol tree (see overwrite_item()).
5677 */
5678static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
5679				struct btrfs_root *root,
5680				struct btrfs_inode *start_inode,
5681				struct btrfs_log_ctx *ctx)
5682{
5683	struct btrfs_fs_info *fs_info = root->fs_info;
5684	struct btrfs_root *log = root->log_root;
5685	struct btrfs_path *path;
5686	LIST_HEAD(dir_list);
5687	struct btrfs_dir_list *dir_elem;
5688	int ret = 0;
5689
5690	path = btrfs_alloc_path();
5691	if (!path)
5692		return -ENOMEM;
5693
5694	dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS);
5695	if (!dir_elem) {
5696		btrfs_free_path(path);
5697		return -ENOMEM;
5698	}
5699	dir_elem->ino = btrfs_ino(start_inode);
5700	list_add_tail(&dir_elem->list, &dir_list);
5701
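	/*
	 * Breadth-first walk over the new dentries: pop one directory at a
	 * time from dir_list, scan its BTRFS_DIR_ITEM_KEY items in the log
	 * tree, and queue any child directory whose logging reported new
	 * dentries of its own (ctx->log_new_dentries).
	 */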
5702	while (!list_empty(&dir_list)) {
5703		struct extent_buffer *leaf;
5704		struct btrfs_key min_key;
5705		int nritems;
5706		int i;
5707
5708		dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list,
5709					    list);
5710		if (ret)
5711			goto next_dir_inode;
5712
5713		min_key.objectid = dir_elem->ino;
5714		min_key.type = BTRFS_DIR_ITEM_KEY;
5715		min_key.offset = 0;
5716again:
5717		btrfs_release_path(path);
5718		ret = btrfs_search_forward(log, &min_key, path, trans->transid);
5719		if (ret < 0) {
5720			goto next_dir_inode;
5721		} else if (ret > 0) {
5722			ret = 0;
5723			goto next_dir_inode;
5724		}
5725
5726process_leaf:
5727		leaf = path->nodes[0];
5728		nritems = btrfs_header_nritems(leaf);
5729		for (i = path->slots[0]; i < nritems; i++) {
5730			struct btrfs_dir_item *di;
5731			struct btrfs_key di_key;
5732			struct inode *di_inode;
5733			struct btrfs_dir_list *new_dir_elem;
5734			int log_mode = LOG_INODE_EXISTS;
5735			int type;
5736
5737			btrfs_item_key_to_cpu(leaf, &min_key, i);
5738			if (min_key.objectid != dir_elem->ino ||
5739			    min_key.type != BTRFS_DIR_ITEM_KEY)
5740				goto next_dir_inode;
5741
5742			di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item);
5743			type = btrfs_dir_type(leaf, di);
5744			if (btrfs_dir_transid(leaf, di) < trans->transid &&
5745			    type != BTRFS_FT_DIR)
5746				continue;
5747			btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
5748			if (di_key.type == BTRFS_ROOT_ITEM_KEY)
5749				continue;
5750
5751			btrfs_release_path(path);
5752			di_inode = btrfs_iget(fs_info->sb, di_key.objectid, root);
5753			if (IS_ERR(di_inode)) {
5754				ret = PTR_ERR(di_inode);
5755				goto next_dir_inode;
5756			}
5757
5758			if (!need_log_inode(trans, BTRFS_I(di_inode))) {
5759				btrfs_add_delayed_iput(di_inode);
5760				break;
5761			}
5762
5763			ctx->log_new_dentries = false;
5764			if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK)
5765				log_mode = LOG_INODE_ALL;
5766			ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode),
5767					      log_mode, ctx);
5768			btrfs_add_delayed_iput(di_inode);
5769			if (ret)
5770				goto next_dir_inode;
5771			if (ctx->log_new_dentries) {
5772				new_dir_elem = kmalloc(sizeof(*new_dir_elem),
5773						       GFP_NOFS);
5774				if (!new_dir_elem) {
5775					ret = -ENOMEM;
5776					goto next_dir_inode;
5777				}
5778				new_dir_elem->ino = di_key.objectid;
5779				list_add_tail(&new_dir_elem->list, &dir_list);
5780			}
5781			break;
5782		}
5783		if (i == nritems) {
5784			ret = btrfs_next_leaf(log, path);
5785			if (ret < 0) {
5786				goto next_dir_inode;
5787			} else if (ret > 0) {
5788				ret = 0;
5789				goto next_dir_inode;
5790			}
5791			goto process_leaf;
5792		}
5793		if (min_key.offset < (u64)-1) {
5794			min_key.offset++;
5795			goto again;
5796		}
5797next_dir_inode:
5798		list_del(&dir_elem->list);
5799		kfree(dir_elem);
5800	}
5801
5802	btrfs_free_path(path);
5803	return ret;
5804}
5805
5806static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
5807				 struct btrfs_inode *inode,
5808				 struct btrfs_log_ctx *ctx)
5809{
5810	struct btrfs_fs_info *fs_info = trans->fs_info;
5811	int ret;
5812	struct btrfs_path *path;
5813	struct btrfs_key key;
5814	struct btrfs_root *root = inode->root;
5815	const u64 ino = btrfs_ino(inode);
5816
5817	path = btrfs_alloc_path();
5818	if (!path)
5819		return -ENOMEM;
5820	path->skip_locking = 1;
5821	path->search_commit_root = 1;
5822
5823	key.objectid = ino;
5824	key.type = BTRFS_INODE_REF_KEY;
5825	key.offset = 0;
5826	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5827	if (ret < 0)
5828		goto out;
5829
5830	while (true) {
5831		struct extent_buffer *leaf = path->nodes[0];
5832		int slot = path->slots[0];
5833		u32 cur_offset = 0;
5834		u32 item_size;
5835		unsigned long ptr;
5836
5837		if (slot >= btrfs_header_nritems(leaf)) {
5838			ret = btrfs_next_leaf(root, path);
5839			if (ret < 0)
5840				goto out;
5841			else if (ret > 0)
5842				break;
5843			continue;
5844		}
5845
5846		btrfs_item_key_to_cpu(leaf, &key, slot);
5847		/* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */
5848		if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY)
5849			break;
5850
5851		item_size = btrfs_item_size_nr(leaf, slot);
5852		ptr = btrfs_item_ptr_offset(leaf, slot);
5853		while (cur_offset < item_size) {
5854			struct btrfs_key inode_key;
5855			struct inode *dir_inode;
5856
5857			inode_key.type = BTRFS_INODE_ITEM_KEY;
5858			inode_key.offset = 0;
5859
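			/*
			 * An INODE_EXTREF item packs btrfs_inode_extref
			 * entries back to back, each followed by its name,
			 * so step over both to reach the next parent. All
			 * names in a plain INODE_REF item share one parent
			 * (the key offset), so a single iteration consumes
			 * the whole item.
			 */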
5860			if (key.type == BTRFS_INODE_EXTREF_KEY) {
5861				struct btrfs_inode_extref *extref;
5862
5863				extref = (struct btrfs_inode_extref *)
5864					(ptr + cur_offset);
5865				inode_key.objectid = btrfs_inode_extref_parent(
5866					leaf, extref);
5867				cur_offset += sizeof(*extref);
5868				cur_offset += btrfs_inode_extref_name_len(leaf,
5869					extref);
5870			} else {
5871				inode_key.objectid = key.offset;
5872				cur_offset = item_size;
5873			}
5874
5875			dir_inode = btrfs_iget(fs_info->sb, inode_key.objectid,
5876					       root);
5877			/*
5878			 * If the parent inode was deleted, return an error to
5879			 * fall back to a transaction commit. Otherwise an inode
5880			 * that was moved from a parent A to a parent B, had its
5881			 * former parent A deleted and was then fsync'ed could
5882			 * end up existing at both parents after a log replay,
5883			 * with the old parent A still existing.
5884			 * Example:
5885			 *
5886			 * mkdir /mnt/A
5887			 * mkdir /mnt/B
5888			 * touch /mnt/B/bar
5889			 * sync
5890			 * mv /mnt/B/bar /mnt/A/bar
5891			 * mv -T /mnt/A /mnt/B
5892			 * fsync /mnt/B/bar
5893			 * <power fail>
5894			 *
5895			 * If we ignored the old parent B, which got deleted,
5896			 * after a log replay we would have file bar linked
5897			 * at both parents and the old parent B would still
5898			 * exist.
5899			 */
5900			if (IS_ERR(dir_inode)) {
5901				ret = PTR_ERR(dir_inode);
5902				goto out;
5903			}
5904
5905			if (!need_log_inode(trans, BTRFS_I(dir_inode))) {
5906				btrfs_add_delayed_iput(dir_inode);
5907				continue;
5908			}
5909
5910			if (ctx)
5911				ctx->log_new_dentries = false;
5912			ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode),
5913					      LOG_INODE_ALL, ctx);
5914			if (!ret && ctx && ctx->log_new_dentries)
5915				ret = log_new_dir_dentries(trans, root,
5916						   BTRFS_I(dir_inode), ctx);
5917			btrfs_add_delayed_iput(dir_inode);
5918			if (ret)
5919				goto out;
5920		}
5921		path->slots[0]++;
5922	}
5923	ret = 0;
5924out:
5925	btrfs_free_path(path);
5926	return ret;
5927}
5928
5929static int log_new_ancestors(struct btrfs_trans_handle *trans,
5930			     struct btrfs_root *root,
5931			     struct btrfs_path *path,
5932			     struct btrfs_log_ctx *ctx)
5933{
5934	struct btrfs_key found_key;
5935
5936	btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
5937
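	/*
	 * Walk up the ancestor chain: found_key is an INODE_REF key whose
	 * offset is the parent directory's inode number. Log the parent if it
	 * is new in this transaction, then look up the parent's own INODE_REF
	 * key to continue, stopping once the subvolume root
	 * (BTRFS_FIRST_FREE_OBJECTID) is reached.
	 */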
5938	while (true) {
5939		struct btrfs_fs_info *fs_info = root->fs_info;
5940		struct extent_buffer *leaf = path->nodes[0];
5941		int slot = path->slots[0];
5942		struct btrfs_key search_key;
5943		struct inode *inode;
5944		u64 ino;
5945		int ret = 0;
5946
5947		btrfs_release_path(path);
5948
5949		ino = found_key.offset;
5950
5951		search_key.objectid = found_key.offset;
5952		search_key.type = BTRFS_INODE_ITEM_KEY;
5953		search_key.offset = 0;
5954		inode = btrfs_iget(fs_info->sb, ino, root);
5955		if (IS_ERR(inode))
5956			return PTR_ERR(inode);
5957
5958		if (BTRFS_I(inode)->generation >= trans->transid &&
5959		    need_log_inode(trans, BTRFS_I(inode)))
5960			ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
5961					      LOG_INODE_EXISTS, ctx);
5962		btrfs_add_delayed_iput(inode);
5963		if (ret)
5964			return ret;
5965
5966		if (search_key.objectid == BTRFS_FIRST_FREE_OBJECTID)
5967			break;
5968
5969		search_key.type = BTRFS_INODE_REF_KEY;
5970		ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
5971		if (ret < 0)
5972			return ret;
5973
5974		leaf = path->nodes[0];
5975		slot = path->slots[0];
5976		if (slot >= btrfs_header_nritems(leaf)) {
5977			ret = btrfs_next_leaf(root, path);
5978			if (ret < 0)
5979				return ret;
5980			else if (ret > 0)
5981				return -ENOENT;
5982			leaf = path->nodes[0];
5983			slot = path->slots[0];
5984		}
5985
5986		btrfs_item_key_to_cpu(leaf, &found_key, slot);
5987		if (found_key.objectid != search_key.objectid ||
5988		    found_key.type != BTRFS_INODE_REF_KEY)
5989			return -ENOENT;
5990	}
5991	return 0;
5992}
5993
5994static int log_new_ancestors_fast(struct btrfs_trans_handle *trans,
5995				  struct btrfs_inode *inode,
5996				  struct dentry *parent,
5997				  struct btrfs_log_ctx *ctx)
5998{
5999	struct btrfs_root *root = inode->root;
6000	struct dentry *old_parent = NULL;
6001	struct super_block *sb = inode->vfs_inode.i_sb;
6002	int ret = 0;
6003
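	/*
	 * Walk up the dcache: dget_parent() pins the next ancestor before
	 * dput() releases the previous one, so no dentry in the chain can be
	 * freed while we are still looking at it.
	 */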
6004	while (true) {
6005		if (!parent || d_really_is_negative(parent) ||
6006		    sb != parent->d_sb)
6007			break;
6008
6009		inode = BTRFS_I(d_inode(parent));
6010		if (root != inode->root)
6011			break;
6012
6013		if (inode->generation >= trans->transid &&
6014		    need_log_inode(trans, inode)) {
6015			ret = btrfs_log_inode(trans, root, inode,
6016					      LOG_INODE_EXISTS, ctx);
6017			if (ret)
6018				break;
6019		}
6020		if (IS_ROOT(parent))
6021			break;
6022
6023		parent = dget_parent(parent);
6024		dput(old_parent);
6025		old_parent = parent;
6026	}
6027	dput(old_parent);
6028
6029	return ret;
6030}
6031
6032static int log_all_new_ancestors(struct btrfs_trans_handle *trans,
6033				 struct btrfs_inode *inode,
6034				 struct dentry *parent,
6035				 struct btrfs_log_ctx *ctx)
6036{
6037	struct btrfs_root *root = inode->root;
6038	const u64 ino = btrfs_ino(inode);
6039	struct btrfs_path *path;
6040	struct btrfs_key search_key;
6041	int ret;
6042
6043	/*
6044	 * In the single hard link case, go through a fast path that does not
6045	 * need to iterate the fs/subvolume tree.
6046	 */
6047	if (inode->vfs_inode.i_nlink < 2)
6048		return log_new_ancestors_fast(trans, inode, parent, ctx);
6049
6050	path = btrfs_alloc_path();
6051	if (!path)
6052		return -ENOMEM;
6053
6054	search_key.objectid = ino;
6055	search_key.type = BTRFS_INODE_REF_KEY;
6056	search_key.offset = 0;
6057again:
6058	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
6059	if (ret < 0)
6060		goto out;
6061	if (ret == 0)
6062		path->slots[0]++;
6063
6064	while (true) {
6065		struct extent_buffer *leaf = path->nodes[0];
6066		int slot = path->slots[0];
6067		struct btrfs_key found_key;
6068
6069		if (slot >= btrfs_header_nritems(leaf)) {
6070			ret = btrfs_next_leaf(root, path);
6071			if (ret < 0)
6072				goto out;
6073			else if (ret > 0)
6074				break;
6075			continue;
6076		}
6077
6078		btrfs_item_key_to_cpu(leaf, &found_key, slot);
6079		if (found_key.objectid != ino ||
6080		    found_key.type > BTRFS_INODE_EXTREF_KEY)
6081			break;
6082
6083		/*
6084		 * Don't deal with extended references because they are rare
6085		 * and too complex to handle (we would need to keep
6086		 * track of which subitem we are processing for each item in
6087		 * this loop, etc). So just return some error to fallback to
6088		 * a transaction commit.
6089		 */
6090		if (found_key.type == BTRFS_INODE_EXTREF_KEY) {
6091			ret = -EMLINK;
6092			goto out;
6093		}
6094
6095		/*
6096		 * Logging ancestors needs to do more searches on the fs/subvol
6097		 * tree, so it releases the path as needed to avoid deadlocks.
6098		 * Keep track of the last inode ref key and resume from that key
6099		 * after logging all new ancestors for the current hard link.
6100		 */
6101		memcpy(&search_key, &found_key, sizeof(search_key));
6102
6103		ret = log_new_ancestors(trans, root, path, ctx);
6104		if (ret)
6105			goto out;
6106		btrfs_release_path(path);
6107		goto again;
6108	}
6109	ret = 0;
6110out:
6111	btrfs_free_path(path);
6112	return ret;
6113}
6114
6115/*
6116 * Helper function around btrfs_log_inode() to make sure newly created
6117 * parent directories also end up in the log. Minimal, inode-and-backref-only
6118 * logging is done for any parent directories that are newer than the last
6119 * committed transaction.
6120 */
6121static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
6122				  struct btrfs_inode *inode,
6123				  struct dentry *parent,
6124				  int inode_only,
6125				  struct btrfs_log_ctx *ctx)
6126{
6127	struct btrfs_root *root = inode->root;
6128	struct btrfs_fs_info *fs_info = root->fs_info;
6129	int ret = 0;
6130	bool log_dentries = false;
6131
6132	if (btrfs_test_opt(fs_info, NOTREELOG)) {
6133		ret = 1;
6134		goto end_no_trans;
6135	}
6136
6137	if (btrfs_root_refs(&root->root_item) == 0) {
6138		ret = 1;
6139		goto end_no_trans;
6140	}
6141
6142	/*
6143	 * Skip already logged inodes or inodes corresponding to tmpfiles
6144	 * (since logging them is pointless, a link count of 0 means they
6145	 * will never be accessible).
6146	 */
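	/*
	 * BTRFS_NO_LOG_SYNC is not an error: it tells the fsync path that
	 * nothing was logged, so it can skip syncing the log altogether.
	 */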
6147	if ((btrfs_inode_in_log(inode, trans->transid) &&
6148	     list_empty(&ctx->ordered_extents)) ||
6149	    inode->vfs_inode.i_nlink == 0) {
6150		ret = BTRFS_NO_LOG_SYNC;
6151		goto end_no_trans;
6152	}
6153
6154	ret = start_log_trans(trans, root, ctx);
6155	if (ret)
6156		goto end_no_trans;
6157
6158	ret = btrfs_log_inode(trans, root, inode, inode_only, ctx);
6159	if (ret)
6160		goto end_trans;
6161
6162	/*
6163	 * for regular files, if its inode is already on disk, we don't
6164	 * have to worry about the parents at all.  This is because
6165	 * we can use the last_unlink_trans field to record renames
6166	 * and other fun in this file.
6167	 */
6168	if (S_ISREG(inode->vfs_inode.i_mode) &&
6169	    inode->generation < trans->transid &&
6170	    inode->last_unlink_trans < trans->transid) {
6171		ret = 0;
6172		goto end_trans;
6173	}
6174
6175	if (S_ISDIR(inode->vfs_inode.i_mode) && ctx && ctx->log_new_dentries)
6176		log_dentries = true;
6177
6178	/*
6179	 * On unlink we must make sure all our current and old parent directory
6180	 * inodes are fully logged. This is to prevent lea