// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <crypto/hash.h>
#include <linux/kernel.h>
#include <linux/bio.h>
#include <linux/blk-cgroup.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/writeback.h>
#include <linux/compat.h>
#include <linux/xattr.h>
#include <linux/posix_acl.h>
#include <linux/falloc.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/btrfs.h>
#include <linux/blkdev.h>
#include <linux/posix_acl_xattr.h>
#include <linux/uio.h>
#include <linux/magic.h>
#include <linux/iversion.h>
#include <linux/swap.h>
#include <linux/migrate.h>
#include <linux/sched/mm.h>
#include <linux/iomap.h>
#include <asm/unaligned.h>
#include <linux/fsverity.h>
#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "print-tree.h"
#include "ordered-data.h"
#include "xattr.h"
#include "tree-log.h"
#include "volumes.h"
#include "compression.h"
#include "locking.h"
#include "free-space-cache.h"
#include "props.h"
#include "qgroup.h"
#include "delalloc-space.h"
#include "block-group.h"
#include "space-info.h"
#include "zoned.h"
#include "subpage.h"
#include "inode-item.h"

struct btrfs_iget_args {
	u64 ino;
	struct btrfs_root *root;
};

struct btrfs_dio_data {
	ssize_t submitted;
	struct extent_changeset *data_reserved;
};

static const struct inode_operations btrfs_dir_inode_operations;
static const struct inode_operations btrfs_symlink_inode_operations;
static const struct inode_operations btrfs_special_inode_operations;
static const struct inode_operations btrfs_file_inode_operations;
static const struct address_space_operations btrfs_aops;
static const struct file_operations btrfs_dir_file_operations;

static struct kmem_cache *btrfs_inode_cachep;
struct kmem_cache *btrfs_trans_handle_cachep;
struct kmem_cache *btrfs_path_cachep;
struct kmem_cache *btrfs_free_space_cachep;
struct kmem_cache *btrfs_free_space_bitmap_cachep;

static int btrfs_setsize(struct inode *inode, struct iattr *attr);
static int btrfs_truncate(struct inode *inode, bool skip_writeback);
static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
static noinline int cow_file_range(struct btrfs_inode *inode,
				   struct page *locked_page,
				   u64 start, u64 end, int *page_started,
				   unsigned long *nr_written, int unlock);
static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
				       u64 len, u64 orig_start, u64 block_start,
				       u64 block_len, u64 orig_block_len,
				       u64 ram_bytes, int compress_type,
				       int type);

static void __endio_write_update_ordered(struct btrfs_inode *inode,
					 const u64 offset, const u64 bytes,
					 const bool uptodate);
/*
 * btrfs_inode_lock - lock inode i_rwsem based on arguments passed
 *
 * ilock_flags can have the following bits set:
 *
 * BTRFS_ILOCK_SHARED - acquire a shared lock on the inode
 * BTRFS_ILOCK_TRY - try to acquire the lock, if it fails on the first attempt
 *		     return -EAGAIN
 * BTRFS_ILOCK_MMAP - acquire a write lock on the i_mmap_lock
 */
int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags)
{
	if (ilock_flags & BTRFS_ILOCK_SHARED) {
		if (ilock_flags & BTRFS_ILOCK_TRY) {
			if (!inode_trylock_shared(inode))
				return -EAGAIN;
			else
				return 0;
		}
		inode_lock_shared(inode);
	} else {
		if (ilock_flags & BTRFS_ILOCK_TRY) {
			if (!inode_trylock(inode))
				return -EAGAIN;
			else
				return 0;
		}
		inode_lock(inode);
	}
	if (ilock_flags & BTRFS_ILOCK_MMAP)
		down_write(&BTRFS_I(inode)->i_mmap_lock);
	return 0;
}

/*
 * btrfs_inode_unlock - unlock inode i_rwsem
 *
 * ilock_flags should contain the same bits set as passed to btrfs_inode_lock()
 * to decide whether the lock acquired is shared or exclusive.
 */
void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags)
{
	if (ilock_flags & BTRFS_ILOCK_MMAP)
		up_write(&BTRFS_I(inode)->i_mmap_lock);
	if (ilock_flags & BTRFS_ILOCK_SHARED)
		inode_unlock_shared(inode);
	else
		inode_unlock(inode);
}
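
/*
 * Example pairing (illustrative only): the same ilock_flags value must be
 * passed to both calls so the unlock matches the lock mode that was taken,
 * e.g.
 *
 *	if (btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED | BTRFS_ILOCK_TRY))
 *		return -EAGAIN;
 *	... read-mostly work under the shared i_rwsem ...
 *	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
 */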

/*
 * Clean up all submitted ordered extents in the specified range to handle
 * errors from the btrfs_run_delalloc_range() callback.
 *
 * NOTE: caller must ensure that when an error happens, it can not call
 * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
 * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata
 * to be released, which we want to happen only when finishing the ordered
 * extent (btrfs_finish_ordered_io()).
 */
static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
						 struct page *locked_page,
						 u64 offset, u64 bytes)
{
	unsigned long index = offset >> PAGE_SHIFT;
	unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
	u64 page_start = page_offset(locked_page);
	u64 page_end = page_start + PAGE_SIZE - 1;

	struct page *page;

	while (index <= end_index) {
		/*
		 * For locked page, we will call end_extent_writepage() on it
		 * in run_delalloc_range() for the error handling.  That
		 * end_extent_writepage() function will call
		 * btrfs_mark_ordered_io_finished() to clear page Ordered and
		 * run the ordered extent accounting.
		 *
		 * Here we can't just clear the Ordered bit, or
		 * btrfs_mark_ordered_io_finished() would skip the accounting
		 * for the page range, and the ordered extent will never finish.
		 */
		if (index == (page_offset(locked_page) >> PAGE_SHIFT)) {
			index++;
			continue;
		}
		page = find_get_page(inode->vfs_inode.i_mapping, index);
		index++;
		if (!page)
			continue;

		/*
		 * Here we just clear all Ordered bits for every page in the
		 * range, then __endio_write_update_ordered() will handle
		 * the ordered extent accounting for the range.
		 */
		btrfs_page_clamp_clear_ordered(inode->root->fs_info, page,
					       offset, bytes);
		put_page(page);
	}

	/* The locked page covers the full range, nothing needs to be done */
	if (bytes + offset <= page_offset(locked_page) + PAGE_SIZE)
		return;
	/*
	 * In case this page belongs to the delalloc range being instantiated
	 * then skip it, since the first page of a range is going to be
	 * properly cleaned up by the caller of run_delalloc_range
	 */
	if (page_start >= offset && page_end <= (offset + bytes - 1)) {
		bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE;
		offset = page_offset(locked_page) + PAGE_SIZE;
	}

	return __endio_write_update_ordered(inode, offset, bytes, false);
}

static int btrfs_dirty_inode(struct inode *inode);

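/*
 * Initialize security attributes of a newly created inode: set up the ACLs
 * inherited from the parent directory @dir, then the security xattrs, all
 * within the given transaction.
 */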
static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
				     struct inode *inode, struct inode *dir,
				     const struct qstr *qstr)
{
	int err;

	err = btrfs_init_acl(trans, inode, dir);
	if (!err)
		err = btrfs_xattr_security_init(trans, inode, dir, qstr);
	return err;
}

/*
 * This does all the hard work for inserting an inline extent into
 * the btree.  The caller should have done a btrfs_drop_extents so that
 * no overlapping inline items exist in the btree.
 */
static int insert_inline_extent(struct btrfs_trans_handle *trans,
				struct btrfs_path *path, bool extent_inserted,
				struct btrfs_root *root, struct inode *inode,
				u64 start, size_t size, size_t compressed_size,
				int compress_type,
				struct page **compressed_pages)
{
	struct extent_buffer *leaf;
	struct page *page = NULL;
	char *kaddr;
	unsigned long ptr;
	struct btrfs_file_extent_item *ei;
	int ret;
	size_t cur_size = size;
	unsigned long offset;

	ASSERT((compressed_size > 0 && compressed_pages) ||
	       (compressed_size == 0 && !compressed_pages));

	if (compressed_size && compressed_pages)
		cur_size = compressed_size;

	if (!extent_inserted) {
		struct btrfs_key key;
		size_t datasize;

		key.objectid = btrfs_ino(BTRFS_I(inode));
		key.offset = start;
		key.type = BTRFS_EXTENT_DATA_KEY;

		datasize = btrfs_file_extent_calc_inline_size(cur_size);
		ret = btrfs_insert_empty_item(trans, root, path, &key,
					      datasize);
		if (ret)
			goto fail;
	}
	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
	btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
	btrfs_set_file_extent_encryption(leaf, ei, 0);
	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
	btrfs_set_file_extent_ram_bytes(leaf, ei, size);
	ptr = btrfs_file_extent_inline_start(ei);

	if (compress_type != BTRFS_COMPRESS_NONE) {
		struct page *cpage;
		int i = 0;
		while (compressed_size > 0) {
			cpage = compressed_pages[i];
			cur_size = min_t(unsigned long, compressed_size,
				       PAGE_SIZE);

			kaddr = kmap_atomic(cpage);
			write_extent_buffer(leaf, kaddr, ptr, cur_size);
			kunmap_atomic(kaddr);

			i++;
			ptr += cur_size;
			compressed_size -= cur_size;
		}
		btrfs_set_file_extent_compression(leaf, ei,
						  compress_type);
	} else {
		page = find_get_page(inode->i_mapping,
				     start >> PAGE_SHIFT);
		btrfs_set_file_extent_compression(leaf, ei, 0);
		kaddr = kmap_atomic(page);
		offset = offset_in_page(start);
		write_extent_buffer(leaf, kaddr + offset, ptr, size);
		kunmap_atomic(kaddr);
		put_page(page);
	}
	btrfs_mark_buffer_dirty(leaf);
	btrfs_release_path(path);

	/*
	 * We align size to sectorsize for inline extents just for simplicity's
	 * sake.
	 */
	size = ALIGN(size, root->fs_info->sectorsize);
	ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start, size);
	if (ret)
		goto fail;

	/*
	 * We're an inline extent, so nobody can extend the file past i_size
	 * without locking a page we already have locked.
	 *
	 * We must do any i_size and inode updates before we unlock the pages.
	 * Otherwise we could end up racing with unlink.
	 */
	BTRFS_I(inode)->disk_i_size = inode->i_size;
fail:
	return ret;
}


/*
 * conditionally insert an inline extent into the file.  This
 * does the checks required to make sure the data is small enough
 * to fit as an inline extent.
 */
static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start,
					  u64 end, size_t compressed_size,
					  int compress_type,
					  struct page **compressed_pages)
{
	struct btrfs_drop_extents_args drop_args = { 0 };
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_trans_handle *trans;
	u64 isize = i_size_read(&inode->vfs_inode);
	u64 actual_end = min(end + 1, isize);
	u64 inline_len = actual_end - start;
	u64 aligned_end = ALIGN(end, fs_info->sectorsize);
	u64 data_len = inline_len;
	int ret;
	struct btrfs_path *path;

	if (compressed_size)
		data_len = compressed_size;

	if (start > 0 ||
	    actual_end > fs_info->sectorsize ||
	    data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) ||
	    (!compressed_size &&
	    (actual_end & (fs_info->sectorsize - 1)) == 0) ||
	    end + 1 < isize ||
	    data_len > fs_info->max_inline) {
		return 1;
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	trans = btrfs_join_transaction(root);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}
	trans->block_rsv = &inode->block_rsv;

	drop_args.path = path;
	drop_args.start = start;
	drop_args.end = aligned_end;
	drop_args.drop_cache = true;
	drop_args.replace_extent = true;

	if (compressed_size && compressed_pages)
		drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(
		   compressed_size);
	else
		drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(
		    inline_len);

	ret = btrfs_drop_extents(trans, root, inode, &drop_args);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	if (isize > actual_end)
		inline_len = min_t(u64, isize, actual_end);
	ret = insert_inline_extent(trans, path, drop_args.extent_inserted,
				   root, &inode->vfs_inode, start,
				   inline_len, compressed_size,
				   compress_type, compressed_pages);
	if (ret && ret != -ENOSPC) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	} else if (ret == -ENOSPC) {
		ret = 1;
		goto out;
	}

	btrfs_update_inode_bytes(inode, inline_len, drop_args.bytes_found);
	ret = btrfs_update_inode(trans, root, inode);
	if (ret && ret != -ENOSPC) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	} else if (ret == -ENOSPC) {
		ret = 1;
		goto out;
	}

	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
out:
	/*
	 * Don't forget to free the reserved space, as an inlined extent
	 * doesn't count as a data extent, so free it directly here.
	 * At reserve time the space is always aligned to the page size, so
	 * just free one page here.
	 */
	btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE);
	btrfs_free_path(path);
	btrfs_end_transaction(trans);
	return ret;
}

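/*
 * Bookkeeping for asynchronous (compressed) writeback: an async_cow covers a
 * whole delalloc region and is split into async_chunk work items, each of
 * which collects the async_extent ranges produced by compression.
 */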
struct async_extent {
	u64 start;
	u64 ram_size;
	u64 compressed_size;
	struct page **pages;
	unsigned long nr_pages;
	int compress_type;
	struct list_head list;
};

struct async_chunk {
	struct inode *inode;
	struct page *locked_page;
	u64 start;
	u64 end;
	unsigned int write_flags;
	struct list_head extents;
	struct cgroup_subsys_state *blkcg_css;
	struct btrfs_work work;
	struct async_cow *async_cow;
};

struct async_cow {
	atomic_t num_chunks;
	struct async_chunk chunks[];
};

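/*
 * Queue a (possibly compressed) extent range on the async_chunk's extent
 * list; it will be picked up later by submit_compressed_extents().
 */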
static noinline int add_async_extent(struct async_chunk *cow,
				     u64 start, u64 ram_size,
				     u64 compressed_size,
				     struct page **pages,
				     unsigned long nr_pages,
				     int compress_type)
{
	struct async_extent *async_extent;

	async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
	BUG_ON(!async_extent); /* -ENOMEM */
	async_extent->start = start;
	async_extent->ram_size = ram_size;
	async_extent->compressed_size = compressed_size;
	async_extent->pages = pages;
	async_extent->nr_pages = nr_pages;
	async_extent->compress_type = compress_type;
	list_add_tail(&async_extent->list, &cow->extents);
	return 0;
}

/*
 * Check if the inode has flags compatible with compression
 */
static inline bool inode_can_compress(struct btrfs_inode *inode)
{
	if (inode->flags & BTRFS_INODE_NODATACOW ||
	    inode->flags & BTRFS_INODE_NODATASUM)
		return false;
	return true;
}

/*
 * Check if the inode needs to be submitted to compression, based on mount
 * options, defragmentation, properties or heuristics.
 */
static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
				      u64 end)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;

	if (!inode_can_compress(inode)) {
		WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
			KERN_ERR "BTRFS: unexpected compression for ino %llu\n",
			btrfs_ino(inode));
		return 0;
	}
	/*
	 * Special check for subpage.
	 *
	 * We lock the full page then run each delalloc range in the page, thus
	 * for the following case, we will hit some subpage specific corner case:
	 *
	 * 0		32K		64K
	 * |	|///////|	|///////|
	 *		\- A		\- B
	 *
	 * In the above case, both range A and range B will try to unlock the
	 * full page [0, 64K), so whichever finishes later will find the page
	 * already unlocked, triggering various page lock requirement BUG_ON()s.
	 *
	 * So here we add an artificial limit that subpage compression can only
	 * be used if the range is fully page aligned.
	 *
	 * In theory we only need to ensure the first page is fully covered, but
	 * the trailing partial page will be locked until the full compression
	 * finishes, delaying the write of other ranges.
	 *
	 * TODO: Make btrfs_run_delalloc_range() lock all delalloc ranges first
	 * to prevent any submitted async extent from unlocking the full page.
	 * By this, we can ensure for the subpage case that only the last
	 * async_cow will unlock the full page.
	 */
	if (fs_info->sectorsize < PAGE_SIZE) {
		if (!IS_ALIGNED(start, PAGE_SIZE) ||
		    !IS_ALIGNED(end + 1, PAGE_SIZE))
			return 0;
	}

	/* force compress */
	if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
		return 1;
	/* defrag ioctl */
	if (inode->defrag_compress)
		return 1;
	/* bad compression ratios */
	if (inode->flags & BTRFS_INODE_NOCOMPRESS)
		return 0;
	if (btrfs_test_opt(fs_info, COMPRESS) ||
	    inode->flags & BTRFS_INODE_COMPRESS ||
	    inode->prop_compress)
		return btrfs_compress_heuristic(&inode->vfs_inode, start, end);
	return 0;
}

static inline void inode_should_defrag(struct btrfs_inode *inode,
		u64 start, u64 end, u64 num_bytes, u64 small_write)
{
	/* If this is a small write inside eof, kick off a defrag */
	if (num_bytes < small_write &&
	    (start > 0 || end + 1 < inode->disk_i_size))
		btrfs_add_inode_defrag(NULL, inode);
}

/*
 * we create compressed extents in two phases.  The first
 * phase compresses a range of pages that have already been
 * locked (both pages and state bits are locked).
 *
 * This is done inside an ordered work queue, and the compression
 * is spread across many cpus.  The actual IO submission is step
 * two, and the ordered work queue takes care of making sure that
 * happens in the same order things were put onto the queue by
 * writepages and friends.
 *
 * If this code finds it can't get good compression, it puts an
 * entry onto the work queue to write the uncompressed bytes.  This
 * makes sure that both compressed inodes and uncompressed inodes
 * are written in the same order that the flusher thread sent them
 * down.
 */
static noinline int compress_file_range(struct async_chunk *async_chunk)
{
	struct inode *inode = async_chunk->inode;
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	u64 blocksize = fs_info->sectorsize;
	u64 start = async_chunk->start;
	u64 end = async_chunk->end;
	u64 actual_end;
	u64 i_size;
	int ret = 0;
	struct page **pages = NULL;
	unsigned long nr_pages;
	unsigned long total_compressed = 0;
	unsigned long total_in = 0;
	int i;
	int will_compress;
	int compress_type = fs_info->compress_type;
	int compressed_extents = 0;
	int redirty = 0;

	inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1,
			SZ_16K);

	/*
	 * We need to save i_size before now because it could change in between
	 * us evaluating the size and assigning it.  This is because we lock and
	 * unlock the page in truncate and fallocate, and then modify the i_size
	 * later on.
	 *
	 * The barriers are to emulate READ_ONCE, remove that once i_size_read
	 * does that for us.
	 */
	barrier();
	i_size = i_size_read(inode);
	barrier();
	actual_end = min_t(u64, i_size, end + 1);
again:
	will_compress = 0;
	nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
	BUILD_BUG_ON((BTRFS_MAX_COMPRESSED % PAGE_SIZE) != 0);
	nr_pages = min_t(unsigned long, nr_pages,
			BTRFS_MAX_COMPRESSED / PAGE_SIZE);

	/*
	 * we don't want to send crud past the end of i_size through
	 * compression, that's just a waste of CPU time.  So, if the
	 * end of the file is before the start of our current
	 * requested range of bytes, we bail out to the uncompressed
	 * cleanup code that can deal with all of this.
	 *
	 * It isn't really the fastest way to fix things, but this is a
	 * very uncommon corner.
	 */
	if (actual_end <= start)
		goto cleanup_and_bail_uncompressed;

	total_compressed = actual_end - start;

	/*
	 * Skip compression for a small file range (<= blocksize) that
	 * isn't an inline extent, since it doesn't save disk space at all.
	 */
	if (total_compressed <= blocksize &&
	   (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
		goto cleanup_and_bail_uncompressed;

	/*
	 * For subpage case, we require full page alignment for the sector
	 * aligned range.
	 * Thus we must also check against @actual_end, not just @end.
	 */
	if (blocksize < PAGE_SIZE) {
		if (!IS_ALIGNED(start, PAGE_SIZE) ||
		    !IS_ALIGNED(round_up(actual_end, blocksize), PAGE_SIZE))
			goto cleanup_and_bail_uncompressed;
	}

	total_compressed = min_t(unsigned long, total_compressed,
			BTRFS_MAX_UNCOMPRESSED);
	total_in = 0;
	ret = 0;

	/*
	 * we do compression for mount -o compress and when the
	 * inode has not been flagged as nocompress.  This flag can
	 * change at any time if we discover bad compression ratios.
	 */
	if (inode_need_compress(BTRFS_I(inode), start, end)) {
		WARN_ON(pages);
		pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
		if (!pages) {
			/* just bail out to the uncompressed code */
			nr_pages = 0;
			goto cont;
		}

		if (BTRFS_I(inode)->defrag_compress)
			compress_type = BTRFS_I(inode)->defrag_compress;
		else if (BTRFS_I(inode)->prop_compress)
			compress_type = BTRFS_I(inode)->prop_compress;

		/*
		 * we need to call clear_page_dirty_for_io on each
		 * page in the range.  Otherwise applications with the file
		 * mmap'd can wander in and change the page contents while
		 * we are compressing them.
		 *
		 * If the compression fails for any reason, we set the pages
		 * dirty again later on.
		 *
		 * Note that the remaining part is redirtied, the start pointer
		 * has moved, the end is the original one.
		 */
		if (!redirty) {
			extent_range_clear_dirty_for_io(inode, start, end);
			redirty = 1;
		}

		/* Compression level is applied here and only here */
		ret = btrfs_compress_pages(
			compress_type | (fs_info->compress_level << 4),
					   inode->i_mapping, start,
					   pages,
					   &nr_pages,
					   &total_in,
					   &total_compressed);

		if (!ret) {
			unsigned long offset = offset_in_page(total_compressed);
			struct page *page = pages[nr_pages - 1];

			/* zero the tail end of the last page, we might be
			 * sending it down to disk
			 */
			if (offset)
				memzero_page(page, offset, PAGE_SIZE - offset);
			will_compress = 1;
		}
	}
cont:
	/*
	 * Check cow_file_range() for why we don't even try to create inline
	 * extent for subpage case.
	 */
	if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {
		/* lets try to make an inline extent */
		if (ret || total_in < actual_end) {
			/* we didn't compress the entire range, try
			 * to make an uncompressed inline extent.
			 */
			ret = cow_file_range_inline(BTRFS_I(inode), start, end,
						    0, BTRFS_COMPRESS_NONE,
						    NULL);
		} else {
			/* try making a compressed inline extent */
			ret = cow_file_range_inline(BTRFS_I(inode), start, end,
						    total_compressed,
						    compress_type, pages);
		}
		if (ret <= 0) {
			unsigned long clear_flags = EXTENT_DELALLOC |
				EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
				EXTENT_DO_ACCOUNTING;
			unsigned long page_error_op;

			page_error_op = ret < 0 ? PAGE_SET_ERROR : 0;

			/*
			 * inline extent creation worked or returned error,
			 * we don't need to create any more async work items.
			 * Unlock and free up our temp pages.
			 *
			 * We use DO_ACCOUNTING here because we need the
			 * delalloc_release_metadata to be done _after_ we drop
			 * our outstanding extent for clearing delalloc for this
			 * range.
			 */
			extent_clear_unlock_delalloc(BTRFS_I(inode), start, end,
						     NULL,
						     clear_flags,
						     PAGE_UNLOCK |
						     PAGE_START_WRITEBACK |
						     page_error_op |
						     PAGE_END_WRITEBACK);

			/*
			 * Ensure we only free the compressed pages if we have
			 * them allocated, as we can still reach here with
			 * inode_need_compress() == false.
			 */
			if (pages) {
				for (i = 0; i < nr_pages; i++) {
					WARN_ON(pages[i]->mapping);
					put_page(pages[i]);
				}
				kfree(pages);
			}
			return 0;
		}
	}

	if (will_compress) {
		/*
		 * we aren't doing an inline extent.  Round the compressed size
		 * up to a block size boundary so the allocator does sane
		 * things.
		 */
		total_compressed = ALIGN(total_compressed, blocksize);

		/*
		 * one last check to make sure the compression is really a
		 * win, compare the page count read with the blocks on disk,
		 * compression must free at least one sector size
		 */
		total_in = round_up(total_in, fs_info->sectorsize);
		if (total_compressed + blocksize <= total_in) {
			compressed_extents++;

			/*
			 * The async work queues will take care of doing actual
			 * allocation on disk for these compressed pages, and
			 * will submit them to the elevator.
			 */
			add_async_extent(async_chunk, start, total_in,
					total_compressed, pages, nr_pages,
					compress_type);

			if (start + total_in < end) {
				start += total_in;
				pages = NULL;
				cond_resched();
				goto again;
			}
			return compressed_extents;
		}
	}
	if (pages) {
		/*
		 * the compression code ran but failed to make things smaller,
		 * free any pages it allocated and our page pointer array
		 */
		for (i = 0; i < nr_pages; i++) {
			WARN_ON(pages[i]->mapping);
			put_page(pages[i]);
		}
		kfree(pages);
		pages = NULL;
		total_compressed = 0;
		nr_pages = 0;

		/* flag the file so we don't compress in the future */
		if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) &&
		    !(BTRFS_I(inode)->prop_compress)) {
			BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
		}
	}
cleanup_and_bail_uncompressed:
	/*
	 * No compression, but we still need to write the pages in the file
	 * we've been given so far.  redirty the locked page if it corresponds
	 * to our extent and set things up for the async work queue to run
	 * cow_file_range to do the normal delalloc dance.
	 */
	if (async_chunk->locked_page &&
	    (page_offset(async_chunk->locked_page) >= start &&
	     page_offset(async_chunk->locked_page) <= end)) {
		__set_page_dirty_nobuffers(async_chunk->locked_page);
		/* unlocked later on in the async handlers */
	}

	if (redirty)
		extent_range_redirty_for_io(inode, start, end);
	add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0,
			 BTRFS_COMPRESS_NONE);
	compressed_extents++;

	return compressed_extents;
}

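/* Drop the page references held by an async_extent and reset its page list */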
static void free_async_extent_pages(struct async_extent *async_extent)
{
	int i;

	if (!async_extent->pages)
		return;

	for (i = 0; i < async_extent->nr_pages; i++) {
		WARN_ON(async_extent->pages[i]->mapping);
		put_page(async_extent->pages[i]);
	}
	kfree(async_extent->pages);
	async_extent->nr_pages = 0;
	async_extent->pages = NULL;
}

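/*
 * Write back an async_extent range for which compression was skipped or
 * failed: run the regular COW path on it and submit the pages directly.
 */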
static int submit_uncompressed_range(struct btrfs_inode *inode,
				     struct async_extent *async_extent,
				     struct page *locked_page)
{
	u64 start = async_extent->start;
	u64 end = async_extent->start + async_extent->ram_size - 1;
	unsigned long nr_written = 0;
	int page_started = 0;
	int ret;

	/*
	 * Call cow_file_range() to run the delalloc range directly, since we
	 * won't go to NOCOW or async path again.
	 *
	 * Also we call cow_file_range() with @unlock_page == 0, so that we
	 * can directly submit them without interruption.
	 */
	ret = cow_file_range(inode, locked_page, start, end, &page_started,
			     &nr_written, 0);
	/* Inline extent inserted, page gets unlocked and everything is done */
	if (page_started) {
		ret = 0;
		goto out;
	}
	if (ret < 0) {
		if (locked_page)
			unlock_page(locked_page);
		goto out;
	}

	ret = extent_write_locked_range(&inode->vfs_inode, start, end);
	/* All pages will be unlocked, including @locked_page */
out:
	kfree(async_extent);
	return ret;
}

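/*
 * Allocate disk space for one compressed async_extent, create its ordered
 * extent and submit the compressed pages for writeback.  Falls back to the
 * uncompressed path when the extent has no compressed pages.
 */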
static int submit_one_async_extent(struct btrfs_inode *inode,
				   struct async_chunk *async_chunk,
				   struct async_extent *async_extent,
				   u64 *alloc_hint)
{
	struct extent_io_tree *io_tree = &inode->io_tree;
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_key ins;
	struct page *locked_page = NULL;
	struct extent_map *em;
	int ret = 0;
	u64 start = async_extent->start;
	u64 end = async_extent->start + async_extent->ram_size - 1;

	/*
	 * If async_chunk->locked_page is in the async_extent range, we need to
	 * handle it.
	 */
	if (async_chunk->locked_page) {
		u64 locked_page_start = page_offset(async_chunk->locked_page);
		u64 locked_page_end = locked_page_start + PAGE_SIZE - 1;

		if (!(start >= locked_page_end || end <= locked_page_start))
			locked_page = async_chunk->locked_page;
	}
	lock_extent(io_tree, start, end);

	/* We have fallen back to uncompressed write */
	if (!async_extent->pages)
		return submit_uncompressed_range(inode, async_extent, locked_page);

	ret = btrfs_reserve_extent(root, async_extent->ram_size,
				   async_extent->compressed_size,
				   async_extent->compressed_size,
				   0, *alloc_hint, &ins, 1, 1);
	if (ret) {
		free_async_extent_pages(async_extent);
		/*
		 * Here we used to try again by going back to the
		 * non-compressed path for ENOSPC.  But if we can't reserve
		 * space even for the compressed size, it can't work for the
		 * uncompressed size, which requires a larger allocation.  So
		 * here we directly go to the error path.
		 */
		goto out_free;
	}

	/* Here we're doing allocation and writeback of the compressed pages */
	em = create_io_em(inode, start,
			  async_extent->ram_size,	/* len */
			  start,			/* orig_start */
			  ins.objectid,			/* block_start */
			  ins.offset,			/* block_len */
			  ins.offset,			/* orig_block_len */
			  async_extent->ram_size,	/* ram_bytes */
			  async_extent->compress_type,
			  BTRFS_ORDERED_COMPRESSED);
	if (IS_ERR(em)) {
		ret = PTR_ERR(em);
		goto out_free_reserve;
	}
	free_extent_map(em);

	ret = btrfs_add_ordered_extent_compress(inode, start,	/* file_offset */
					ins.objectid,		/* disk_bytenr */
					async_extent->ram_size, /* num_bytes */
					ins.offset,		/* disk_num_bytes */
					async_extent->compress_type);
	if (ret) {
		btrfs_drop_extent_cache(inode, start, end, 0);
		goto out_free_reserve;
	}
	btrfs_dec_block_group_reservations(fs_info, ins.objectid);

	/* Clear dirty, set writeback and unlock the pages. */
	extent_clear_unlock_delalloc(inode, start, end,
			NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
			PAGE_UNLOCK | PAGE_START_WRITEBACK);
	if (btrfs_submit_compressed_write(inode, start,	/* file_offset */
			    async_extent->ram_size,	/* num_bytes */
			    ins.objectid,		/* disk_bytenr */
			    ins.offset,			/* compressed_len */
			    async_extent->pages,	/* compressed_pages */
			    async_extent->nr_pages,
			    async_chunk->write_flags,
			    async_chunk->blkcg_css)) {
		const u64 start = async_extent->start;
		const u64 end = start + async_extent->ram_size - 1;

		btrfs_writepage_endio_finish_ordered(inode, NULL, start, end, 0);

		extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
					     PAGE_END_WRITEBACK | PAGE_SET_ERROR);
		free_async_extent_pages(async_extent);
	}
	*alloc_hint = ins.objectid + ins.offset;
	kfree(async_extent);
	return ret;

out_free_reserve:
	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
out_free:
	extent_clear_unlock_delalloc(inode, start, end,
				     NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
				     EXTENT_DELALLOC_NEW |
				     EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
				     PAGE_UNLOCK | PAGE_START_WRITEBACK |
				     PAGE_END_WRITEBACK | PAGE_SET_ERROR);
	free_async_extent_pages(async_extent);
	kfree(async_extent);
	return ret;
}

/*
 * Phase two of compressed writeback.  This is the ordered portion of the code,
 * which only gets called in the order the work was queued.  We walk all the
 * async extents created by compress_file_range and send them down to the disk.
 */
static noinline void submit_compressed_extents(struct async_chunk *async_chunk)
{
	struct btrfs_inode *inode = BTRFS_I(async_chunk->inode);
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct async_extent *async_extent;
	u64 alloc_hint = 0;
	int ret = 0;

	while (!list_empty(&async_chunk->extents)) {
		u64 extent_start;
		u64 ram_size;

		async_extent = list_entry(async_chunk->extents.next,
					  struct async_extent, list);
		list_del(&async_extent->list);
		extent_start = async_extent->start;
		ram_size = async_extent->ram_size;

		ret = submit_one_async_extent(inode, async_chunk, async_extent,
					      &alloc_hint);
		btrfs_debug(fs_info,
"async extent submission failed root=%lld inode=%llu start=%llu len=%llu ret=%d",
			    inode->root->root_key.objectid,
			    btrfs_ino(inode), extent_start, ram_size, ret);
	}
}

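/*
 * Suggest a disk byte offset for the allocator, based on the block start of
 * a nearby extent map (or the first mapped extent of the inode).
 */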
static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
				      u64 num_bytes)
{
	struct extent_map_tree *em_tree = &inode->extent_tree;
	struct extent_map *em;
	u64 alloc_hint = 0;

	read_lock(&em_tree->lock);
	em = search_extent_mapping(em_tree, start, num_bytes);
	if (em) {
		/*
		 * if block start isn't an actual block number then find the
		 * first block in this inode and use that as a hint.  If that
		 * block is also bogus then just don't worry about it.
		 */
		if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
			free_extent_map(em);
			em = search_extent_mapping(em_tree, 0, 0);
			if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
				alloc_hint = em->block_start;
			if (em)
				free_extent_map(em);
		} else {
			alloc_hint = em->block_start;
			free_extent_map(em);
		}
	}
	read_unlock(&em_tree->lock);

	return alloc_hint;
}

/*
 * when extent_io.c finds a delayed allocation range in the file,
 * the callbacks end up in this code.  The basic idea is to
 * allocate extents on disk for the range, and create ordered data structs
 * in ram to track those extents.
 *
 * locked_page is the page that writepage had locked already.  We use
 * it to make sure we don't do extra locks or unlocks.
 *
 * *page_started is set to one if we unlock locked_page and do everything
 * required to start IO on it.  It may be clean and already done with
 * IO when we return.
 */
static noinline int cow_file_range(struct btrfs_inode *inode,
				   struct page *locked_page,
				   u64 start, u64 end, int *page_started,
				   unsigned long *nr_written, int unlock)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 alloc_hint = 0;
	u64 num_bytes;
	unsigned long ram_size;
	u64 cur_alloc_size = 0;
	u64 min_alloc_size;
	u64 blocksize = fs_info->sectorsize;
	struct btrfs_key ins;
	struct extent_map *em;
	unsigned clear_bits;
	unsigned long page_ops;
	bool extent_reserved = false;
	int ret = 0;

	if (btrfs_is_free_space_inode(inode)) {
		WARN_ON_ONCE(1);
		ret = -EINVAL;
		goto out_unlock;
	}

	num_bytes = ALIGN(end - start + 1, blocksize);
	num_bytes = max(blocksize, num_bytes);
	ASSERT(num_bytes <= btrfs_super_total_bytes(fs_info->super_copy));

	inode_should_defrag(inode, start, end, num_bytes, SZ_64K);

	/*
	 * Due to the page size limit, for subpage we can only trigger the
	 * writeback for the dirty sectors of the page, which means the data
	 * writeback is doing more writeback than what we want.
	 *
	 * This is especially unexpected for some call sites like fallocate,
	 * where we only increase i_size after everything is done.
	 * This means we can trigger inline extent even if we didn't want to.
	 * So here we skip inline extent creation completely.
	 */
	if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {
		/* lets try to make an inline extent */
		ret = cow_file_range_inline(inode, start, end, 0,
					    BTRFS_COMPRESS_NONE, NULL);
		if (ret == 0) {
			/*
			 * We use DO_ACCOUNTING here because we need the
			 * delalloc_release_metadata to be run _after_ we drop
			 * our outstanding extent for clearing delalloc for this
			 * range.
			 */
			extent_clear_unlock_delalloc(inode, start, end,
				     locked_page,
				     EXTENT_LOCKED | EXTENT_DELALLOC |
				     EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
				     EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
				     PAGE_START_WRITEBACK | PAGE_END_WRITEBACK);
			*nr_written = *nr_written +
			     (end - start + PAGE_SIZE) / PAGE_SIZE;
			*page_started = 1;
			/*
			 * locked_page is locked by the caller of
			 * writepage_delalloc(), not locked by
			 * __process_pages_contig().
			 *
			 * We can't let __process_pages_contig() to unlock it,
			 * as it doesn't have any subpage::writers recorded.
			 *
			 * Here we manually unlock the page, since the caller
			 * can't use page_started to determine if it's an
			 * inline extent or a compressed extent.
			 */
			unlock_page(locked_page);
			goto out;
		} else if (ret < 0) {
			goto out_unlock;
		}
	}

	alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
	btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);

	/*
	 * Relocation relies on the relocated extents to have exactly the same
	 * size as the original extents. Normally writeback for relocation data
	 * extents follows a NOCOW path because relocation preallocates the
	 * extents. However, due to an operation such as scrub turning a block
	 * group to RO mode, it may fallback to COW mode, so we must make sure
	 * an extent allocated during COW has exactly the requested size and can
	 * not be split into smaller extents, otherwise relocation breaks and
	 * fails during the stage where it updates the bytenr of file extent
	 * items.
	 */
	if (btrfs_is_data_reloc_root(root))
		min_alloc_size = num_bytes;
	else
		min_alloc_size = fs_info->sectorsize;

	while (num_bytes > 0) {
		cur_alloc_size = num_bytes;
		ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
					   min_alloc_size, 0, alloc_hint,
					   &ins, 1, 1);
		if (ret < 0)
			goto out_unlock;
		cur_alloc_size = ins.offset;
		extent_reserved = true;

		ram_size = ins.offset;
		em = create_io_em(inode, start, ins.offset, /* len */
				  start, /* orig_start */
				  ins.objectid, /* block_start */
				  ins.offset, /* block_len */
				  ins.offset, /* orig_block_len */
				  ram_size, /* ram_bytes */
				  BTRFS_COMPRESS_NONE, /* compress_type */
				  BTRFS_ORDERED_REGULAR /* type */);
		if (IS_ERR(em)) {
			ret = PTR_ERR(em);
			goto out_reserve;
		}
		free_extent_map(em);

		ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
					       ram_size, cur_alloc_size,
					       BTRFS_ORDERED_REGULAR);
		if (ret)
			goto out_drop_extent_cache;

		if (btrfs_is_data_reloc_root(root)) {
			ret = btrfs_reloc_clone_csums(inode, start,
						      cur_alloc_size);
			/*
			 * Only drop cache here, and process as normal.
			 *
			 * We must not allow extent_clear_unlock_delalloc()
			 * at out_unlock label to free meta of this ordered
			 * extent, as its meta should be freed by
			 * btrfs_finish_ordered_io().
			 *
			 * So we must continue until @start is increased to
			 * skip current ordered extent.
			 */
			if (ret)
				btrfs_drop_extent_cache(inode, start,
						start + ram_size - 1, 0);
		}

		btrfs_dec_block_group_reservations(fs_info, ins.objectid);

		/*
		 * We're not doing compressed IO, don't unlock the first page
		 * (which the caller expects to stay locked), don't clear any
		 * dirty bits and don't set any writeback bits
		 *
		 * Do set the Ordered (Private2) bit so we know this page was
		 * properly setup for writepage.
		 */
		page_ops = unlock ? PAGE_UNLOCK : 0;
		page_ops |= PAGE_SET_ORDERED;

		extent_clear_unlock_delalloc(inode, start, start + ram_size - 1,
					     locked_page,
					     EXTENT_LOCKED | EXTENT_DELALLOC,
					     page_ops);
		if (num_bytes < cur_alloc_size)
			num_bytes = 0;
		else
			num_bytes -= cur_alloc_size;
		alloc_hint = ins.objectid + ins.offset;
		start += cur_alloc_size;
		extent_reserved = false;

		/*
		 * On a btrfs_reloc_clone_csums() error: since start has been
		 * increased, extent_clear_unlock_delalloc() at the out_unlock
		 * label won't free the metadata of the current ordered extent,
		 * so we're OK to exit.
		 */
		if (ret)
			goto out_unlock;
	}
out:
	return ret;

out_drop_extent_cache:
	btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0);
out_reserve:
	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
out_unlock:
	clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
		EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
	page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
	/*
	 * If we reserved an extent for our delalloc range (or a subrange) and
	 * failed to create the respective ordered extent, then it means that
	 * when we reserved the extent we decremented the extent's size from
	 * the data space_info's bytes_may_use counter and incremented the
	 * space_info's bytes_reserved counter by the same amount. We must make
	 * sure extent_clear_unlock_delalloc() does not try to decrement again
	 * the data space_info's bytes_may_use counter, therefore we do not pass
	 * it the flag EXTENT_CLEAR_DATA_RESV.
	 */
	if (extent_reserved) {
		extent_clear_unlock_delalloc(inode, start,
					     start + cur_alloc_size - 1,
					     locked_page,
					     clear_bits,
					     page_ops);
		start += cur_alloc_size;
		if (start >= end)
			goto out;
	}
	extent_clear_unlock_delalloc(inode, start, end, locked_page,
				     clear_bits | EXTENT_CLEAR_DATA_RESV,
				     page_ops);
	goto out;
}

/*
 * Work queue callback to start compression on a file and pages.
 */
static noinline void async_cow_start(struct btrfs_work *work)
{
	struct async_chunk *async_chunk;
	int compressed_extents;

	async_chunk = container_of(work, struct async_chunk, work);

	compressed_extents = compress_file_range(async_chunk);
	if (compressed_extents == 0) {
		btrfs_add_delayed_iput(async_chunk->inode);
		async_chunk->inode = NULL;
	}
}

/*
 * Work queue callback to submit previously compressed pages.
 */
static noinline void async_cow_submit(struct btrfs_work *work)
{
	struct async_chunk *async_chunk = container_of(work, struct async_chunk,
						     work);
	struct btrfs_fs_info *fs_info = btrfs_work_owner(work);
	unsigned long nr_pages;

	nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >>
		PAGE_SHIFT;

	/*
	 * ->inode could be NULL if async_cow_start has failed to compress,
	 * in which case we don't have anything to submit, yet we need to
	 * always adjust ->async_delalloc_pages as it's paired with the init
	 * happening in cow_file_range_async
	 */
	if (async_chunk->inode)
		submit_compressed_extents(async_chunk);

	/* atomic_sub_return implies a barrier */
	if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
	    5 * SZ_1M)
		cond_wake_up_nomb(&fs_info->async_submit_wait);
}

static noinline void async_cow_free(struct btrfs_work *work)
{
	struct async_chunk *async_chunk;
	struct async_cow *async_cow;

	async_chunk = container_of(work, struct async_chunk, work);
	if (async_chunk->inode)
		btrfs_add_delayed_iput(async_chunk->inode);
	if (async_chunk->blkcg_css)
		css_put(async_chunk->blkcg_css);

	async_cow = async_chunk->async_cow;
	if (atomic_dec_and_test(&async_cow->num_chunks))
		kvfree(async_cow);
}

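/*
 * Split the delalloc range into 512K chunks and queue each one on the
 * delalloc workqueue, where compression and submission run asynchronously.
 */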
static int cow_file_range_async(struct btrfs_inode *inode,
				struct writeback_control *wbc,
				struct page *locked_page,
				u64 start, u64 end, int *page_started,
				unsigned long *nr_written)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct cgroup_subsys_state *blkcg_css = wbc_blkcg_css(wbc);
	struct async_cow *ctx;
	struct async_chunk *async_chunk;
	unsigned long nr_pages;
	u64 cur_end;
	u64 num_chunks = DIV_ROUND_UP(end - start, SZ_512K);
	int i;
	bool should_compress;
	unsigned nofs_flag;
	const unsigned int write_flags = wbc_to_write_flags(wbc);

	unlock_extent(&inode->io_tree, start, end);

	if (inode->flags & BTRFS_INODE_NOCOMPRESS &&
	    !btrfs_test_opt(fs_info, FORCE_COMPRESS)) {
		num_chunks = 1;
		should_compress = false;
	} else {
		should_compress = true;
	}

	nofs_flag = memalloc_nofs_save();
	ctx = kvmalloc(struct_size(ctx, chunks, num_chunks), GFP_KERNEL);
	memalloc_nofs_restore(nofs_flag);

	if (!ctx) {
		unsigned clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC |
			EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
			EXTENT_DO_ACCOUNTING;
		unsigned long page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK |
					 PAGE_END_WRITEBACK | PAGE_SET_ERROR;

		extent_clear_unlock_delalloc(inode, start, end, locked_page,
					     clear_bits, page_ops);
		return -ENOMEM;
	}

	async_chunk = ctx->chunks;
	atomic_set(&ctx->num_chunks, num_chunks);

	for (i = 0; i < num_chunks; i++) {
		if (should_compress)
			cur_end = min(end, start + SZ_512K - 1);
		else
			cur_end = end;

		/*
		 * igrab is called higher up in the call chain, take only the
		 * lightweight reference for the callback lifetime
		 */
		ihold(&inode->vfs_inode);
		async_chunk[i].async_cow = ctx;
		async_chunk[i].inode = &inode->vfs_inode;
		async_chunk[i].start = start;
		async_chunk[i].end = cur_end;
		async_chunk[i].write_flags = write_flags;
		INIT_LIST_HEAD(&async_chunk[i].extents);

		/*
		 * The locked_page comes all the way from writepage and it's
		 * the original page we were actually given.  As we spread
		 * this large delalloc region across multiple async_chunk
		 * structs, only the first struct needs a pointer to locked_page
		 *
		 * This way we don't need racy decisions about who is supposed
		 * to unlock it.
		 */
		if (locked_page) {
			/*
			 * Depending on the compressibility, the pages might or
			 * might not go through async.  We want all of them to
			 * be accounted against wbc once.  Let's do it here
			 * before the paths diverge.  wbc accounting is used
			 * only for foreign writeback detection and doesn't
			 * need full accuracy.  Just account the whole thing
			 * against the first page.
			 */
			wbc_account_cgroup_owner(wbc, locked_page,
						 cur_end - start);
			async_chunk[i].locked_page = locked_page;
			locked_page = NULL;
		} else {
			async_chunk[i].locked_page = NULL;
		}

		if (blkcg_css != blkcg_root_css) {
			css_get(blkcg_css);
			async_chunk[i].blkcg_css = blkcg_css;
		} else {
			async_chunk[i].blkcg_css = NULL;
		}

		btrfs_init_work(&async_chunk[i].work, async_cow_start,
				async_cow_submit, async_cow_free);

		nr_pages = DIV_ROUND_UP(cur_end - start, PAGE_SIZE);
		atomic_add(nr_pages, &fs_info->async_delalloc_pages);

		btrfs_queue_work(fs_info->delalloc_workers, &async_chunk[i].work);

		*nr_written += nr_pages;
		start = cur_end + 1;
	}
	*page_started = 1;
	return 0;
}

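/*
 * Delalloc path for zoned filesystems: run the regular COW allocation, then
 * redirty the locked page and immediately submit the whole locked range for
 * writeback.
 */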
static noinline int run_delalloc_zoned(struct btrfs_inode *inode,
				       struct page *locked_page, u64 start,
				       u64 end, int *page_started,
				       unsigned long *nr_written)
{
	int ret;

	ret = cow_file_range(inode, locked_page, start, end, page_started,
			     nr_written, 0);
	if (ret)
		return ret;

	if (*page_started)
		return 0;

	__set_page_dirty_nobuffers(locked_page);
	account_page_redirty(locked_page);
	extent_write_locked_range(&inode->vfs_inode, start, end);
	*page_started = 1;

	return 0;
}

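/*
 * Return 1 if any checksum item exists for the given disk byte range, 0 if
 * none do, or a negative errno on lookup failure.
 */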
static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
					u64 bytenr, u64 num_bytes)
{
	struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bytenr);
	struct btrfs_ordered_sum *sums;
	int ret;
	LIST_HEAD(list);

	ret = btrfs_lookup_csums_range(csum_root, bytenr,
				       bytenr + num_bytes - 1, &list, 0);
	if (ret == 0 && list_empty(&list))
		return 0;

	while (!list_empty(&list)) {
		sums = list_entry(list.next, struct btrfs_ordered_sum, list);
		list_del(&sums->list);
		kfree(sums);
	}
	if (ret < 0)
		return ret;
	return 1;
}

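/*
 * Fall back from the NOCOW path to regular COW for the given range, fixing up
 * the data space accounting first (see the detailed comment in the function).
 */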
static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
			   const u64 start, const u64 end,
			   int *page_started, unsigned long *nr_written)
{
	const bool is_space_ino = btrfs_is_free_space_inode(inode);
	const bool is_reloc_ino = btrfs_is_data_reloc_root(inode->root);
	const u64 range_bytes = end + 1 - start;
	struct extent_io_tree *io_tree = &inode->io_tree;
	u64 range_start = start;
	u64 count;

	/*
	 * If EXTENT_NORESERVE is set it means that when the buffered write was
	 * made we did not have enough available data space and therefore we
	 * did not reserve data space for it, since we thought we could do
	 * NOCOW for the respective file range (either there is a prealloc
	 * extent or the inode has the NOCOW bit set).
	 *
	 * However when we need to fallback to COW mode (because for example the
	 * block group for the corresponding extent was turned to RO mode by a
	 * scrub or relocation) we need to do the following:
	 *
	 * 1) We increment the bytes_may_use counter of the data space info.
	 *    If COW succeeds, it allocates a new data extent and after doing
	 *    that it decrements the space info's bytes_may_use counter and
	 *    increments its bytes_reserved counter by the same amount (we do
	 *    this at btrfs_add_reserved_bytes()). So we need to increment the
	 *    bytes_may_use counter to compensate (when space is reserved at
	 *    buffered write time, the bytes_may_use counter is incremented);
	 *
	 * 2) We clear the EXTENT_NORESERVE bit from the range. We do this so
	 *    that if the COW path fails for any reason, it decrements (through
	 *    extent_clear_unlock_delalloc()) the bytes_may_use counter of the
	 *    data space info, which we incremented in the step above.
	 *
	 * If we need to fallback to cow and the inode corresponds to a free
	 * space cache inode or an inode of the data relocation tree, we must
	 * also increment bytes_may_use of the data space_info for the same
	 * reason. Space caches and relocated data extents always get a prealloc
	 * extent for them, however scrub or balance may have set the block
	 * group that contains that extent to RO mode and therefore force COW
	 * when starting writeback.
	 */
	count = count_range_bits(io_tree, &range_start, end, range_bytes,
				 EXTENT_NORESERVE, 0);
	if (count > 0 || is_space_ino || is_reloc_ino) {
		u64 bytes = count;
		struct btrfs_fs_info *fs_info = inode->root->fs_info;
		struct btrfs_space_info *sinfo = fs_info->data_sinfo;

		if (is_space_ino || is_reloc_ino)
			bytes = range_bytes;

		spin_lock(&sinfo->lock);
		btrfs_space_info_update_bytes_may_use(fs_info, sinfo, bytes);
		spin_unlock(&sinfo->lock);

		if (count > 0)
			clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE,
					 0, 0, NULL);
	}

	return cow_file_range(inode, locked_page, start, end, page_started,
			      nr_written, 1);
}
1619
1620/*
1621 * when nowcow writeback call back.  This checks for snapshots or COW copies
1622 * of the extents that exist in the file, and COWs the file as required.
1623 *
1624 * If no cow copies or snapshots exist, we write directly to the existing
1625 * blocks on disk
1626 */
1627static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
1628				       struct page *locked_page,
1629				       const u64 start, const u64 end,
1630				       int *page_started,
1631				       unsigned long *nr_written)
1632{
1633	struct btrfs_fs_info *fs_info = inode->root->fs_info;
1634	struct btrfs_root *root = inode->root;
1635	struct btrfs_path *path;
1636	u64 cow_start = (u64)-1;
1637	u64 cur_offset = start;
1638	int ret;
1639	bool check_prev = true;
1640	const bool freespace_inode = btrfs_is_free_space_inode(inode);
1641	u64 ino = btrfs_ino(inode);
1642	bool nocow = false;
1643	u64 disk_bytenr = 0;
1644	const bool force = inode->flags & BTRFS_INODE_NODATACOW;
1645
1646	path = btrfs_alloc_path();
1647	if (!path) {
1648		extent_clear_unlock_delalloc(inode, start, end, locked_page,
1649					     EXTENT_LOCKED | EXTENT_DELALLOC |
1650					     EXTENT_DO_ACCOUNTING |
1651					     EXTENT_DEFRAG, PAGE_UNLOCK |
1652					     PAGE_START_WRITEBACK |
1653					     PAGE_END_WRITEBACK);
1654		return -ENOMEM;
1655	}
1656
1657	while (1) {
1658		struct btrfs_key found_key;
1659		struct btrfs_file_extent_item *fi;
1660		struct extent_buffer *leaf;
1661		u64 extent_end;
1662		u64 extent_offset;
1663		u64 num_bytes = 0;
1664		u64 disk_num_bytes;
1665		u64 ram_bytes;
1666		int extent_type;
1667
1668		nocow = false;
1669
1670		ret = btrfs_lookup_file_extent(NULL, root, path, ino,
1671					       cur_offset, 0);
1672		if (ret < 0)
1673			goto error;
1674
1675		/*
1676		 * If there is no extent for our range when doing the initial
1677		 * search, then go back to the previous slot as it will be the
1678		 * one containing the search offset
1679		 */
1680		if (ret > 0 && path->slots[0] > 0 && check_prev) {
1681			leaf = path->nodes[0];
1682			btrfs_item_key_to_cpu(leaf, &found_key,
1683					      path->slots[0] - 1);
1684			if (found_key.objectid == ino &&
1685			    found_key.type == BTRFS_EXTENT_DATA_KEY)
1686				path->slots[0]--;
1687		}
1688		check_prev = false;
1689next_slot:
1690		/* Go to next leaf if we have exhausted the current one */
1691		leaf = path->nodes[0];
1692		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1693			ret = btrfs_next_leaf(root, path);
1694			if (ret < 0) {
1695				if (cow_start != (u64)-1)
1696					cur_offset = cow_start;
1697				goto error;
1698			}
1699			if (ret > 0)
1700				break;
1701			leaf = path->nodes[0];
1702		}
1703
1704		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1705
1706		/* Didn't find anything for our INO */
1707		if (found_key.objectid > ino)
1708			break;
1709		/*
1710		 * Keep searching until we find an EXTENT_DATA item or there are
1711		 * no more extents for this inode
1712		 */
1713		if (WARN_ON_ONCE(found_key.objectid < ino) ||
1714		    found_key.type < BTRFS_EXTENT_DATA_KEY) {
1715			path->slots[0]++;
1716			goto next_slot;
1717		}
1718
1719		/* Found key is not EXTENT_DATA_KEY or starts after req range */
1720		if (found_key.type > BTRFS_EXTENT_DATA_KEY ||
1721		    found_key.offset > end)
1722			break;
1723
1724		/*
1725		 * If the found extent starts after the requested offset, set
1726		 * extent_end to its start, so the gap is treated as a range to COW
1727		 */
1728		if (found_key.offset > cur_offset) {
1729			extent_end = found_key.offset;
1730			extent_type = 0;
1731			goto out_check;
1732		}
1733
1734		/*
1735		 * Found an extent which begins before our range and potentially
1736		 * intersects it
1737		 */
1738		fi = btrfs_item_ptr(leaf, path->slots[0],
1739				    struct btrfs_file_extent_item);
1740		extent_type = btrfs_file_extent_type(leaf, fi);
1741
1742		ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
1743		if (extent_type == BTRFS_FILE_EXTENT_REG ||
1744		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1745			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1746			extent_offset = btrfs_file_extent_offset(leaf, fi);
1747			extent_end = found_key.offset +
1748				btrfs_file_extent_num_bytes(leaf, fi);
1749			disk_num_bytes =
1750				btrfs_file_extent_disk_num_bytes(leaf, fi);
1751			/*
1752			 * If the extent we got ends before our current offset,
1753			 * skip to the next extent.
1754			 */
1755			if (extent_end <= cur_offset) {
1756				path->slots[0]++;
1757				goto next_slot;
1758			}
1759			/* Skip holes */
1760			if (disk_bytenr == 0)
1761				goto out_check;
1762			/* Skip compressed/encrypted/encoded extents */
1763			if (btrfs_file_extent_compression(leaf, fi) ||
1764			    btrfs_file_extent_encryption(leaf, fi) ||
1765			    btrfs_file_extent_other_encoding(leaf, fi))
1766				goto out_check;
1767			/*
1768			 * If the extent was created before the last snapshot of
1769			 * this subvolume, it may be shared, hence we can't do
1770			 * nocow. This is the same check as in
1771			 * btrfs_cross_ref_exist but without calling
1772			 * btrfs_search_slot.
1773			 */
1774			if (!freespace_inode &&
1775			    btrfs_file_extent_generation(leaf, fi) <=
1776			    btrfs_root_last_snapshot(&root->root_item))
1777				goto out_check;
1778			if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
1779				goto out_check;
1780
1781			/*
1782			 * The following checks can be expensive, as they need to
1783			 * take other locks and do btree or rbtree searches, so
1784			 * release the path to avoid blocking other tasks for too
1785			 * long.
1786			 */
1787			btrfs_release_path(path);
1788
1789			ret = btrfs_cross_ref_exist(root, ino,
1790						    found_key.offset -
1791						    extent_offset, disk_bytenr, false);
1792			if (ret) {
1793				/*
1794				 * ret could be -EIO if the above fails to read
1795				 * metadata.
1796				 */
1797				if (ret < 0) {
1798					if (cow_start != (u64)-1)
1799						cur_offset = cow_start;
1800					goto error;
1801				}
1802
1803				WARN_ON_ONCE(freespace_inode);
1804				goto out_check;
1805			}
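			/*
			 * Point disk_bytenr at the on-disk byte backing
			 * cur_offset and clamp num_bytes to the end of the
			 * extent or the end of the requested range.
			 */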
1806			disk_bytenr += extent_offset;
1807			disk_bytenr += cur_offset - found_key.offset;
1808			num_bytes = min(end + 1, extent_end) - cur_offset;
1809			/*
1810			 * If there are pending snapshots for this root, we fall
1811			 * back to the common COW path
1812			 */
1813			if (!freespace_inode && atomic_read(&root->snapshot_force_cow))
1814				goto out_check;
1815			/*
1816			 * Force COW if csums exist in the range. This ensures
1817			 * that all csums for a given extent are either valid or
1818			 * do not exist.
1819			 */
1820			ret = csum_exist_in_range(fs_info, disk_bytenr,
1821						  num_bytes);
1822			if (ret) {
1823				/*
1824				 * ret could be -EIO if the above fails to read
1825				 * metadata.
1826				 */
1827				if (ret < 0) {
1828					if (cow_start != (u64)-1)
1829						cur_offset = cow_start;
1830					goto error;
1831				}
1832				WARN_ON_ONCE(freespace_inode);
1833				goto out_check;
1834			}
1835			/* If the extent's block group is RO, we must COW */
1836			if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr))
1837				goto out_check;
1838			nocow = true;
1839		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
1840			extent_end = found_key.offset + ram_bytes;
1841			extent_end = ALIGN(extent_end, fs_info->sectorsize);
1842			/* Skip extents outside of our requested range */
1843			if (extent_end <= start) {
1844				path->slots[0]++;
1845				goto next_slot;
1846			}
1847		} else {
1848			/* If this triggers then we have a memory corruption */
1849			BUG();
1850		}
1851out_check:
1852		/*
1853		 * If nocow is false then record the beginning of the range
1854		 * that needs to be COWed
1855		 */
1856		if (!nocow) {
1857			if (cow_start == (u64)-1)
1858				cow_start = cur_offset;
1859			cur_offset = extent_end;
1860			if (cur_offset > end)
1861				break;
1862			if (!path->nodes[0])
1863				continue;
1864			path->slots[0]++;
1865			goto next_slot;
1866		}
1867
1868		/*
1869		 * COW the range from cow_start to found_key.offset - 1, as the
1870		 * key marks the start of the first extent that can be written
1871		 * NOCOW, following a range that needs to be COWed.
1872		 */
1873		if (cow_start != (u64)-1) {
1874			ret = fallback_to_cow(inode, locked_page,
1875					      cow_start, found_key.offset - 1,
1876					      page_started, nr_written);
1877			if (ret)
1878				goto error;
1879			cow_start = (u64)-1;
1880		}
1881
1882		if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1883			u64 orig_start = found_key.offset - extent_offset;
1884			struct extent_map *em;
1885
1886			em = create_io_em(inode, cur_offset, num_bytes,
1887					  orig_start,
1888					  disk_bytenr, /* block_start */
1889					  num_bytes, /* block_len */
1890					  disk_num_bytes, /* orig_block_len */
1891					  ram_bytes, BTRFS_COMPRESS_NONE,
1892					  BTRFS_ORDERED_PREALLOC);
1893			if (IS_ERR(em)) {
1894				ret = PTR_ERR(em);
1895				goto error;
1896			}
1897			free_extent_map(em);
1898			ret = btrfs_add_ordered_extent(inode, cur_offset,
1899						       disk_bytenr, num_bytes,
1900						       num_bytes,
1901						       BTRFS_ORDERED_PREALLOC);
1902			if (ret) {
1903				btrfs_drop_extent_cache(inode, cur_offset,
1904							cur_offset + num_bytes - 1,
1905							0);
1906				goto error;
1907			}
1908		} else {
1909			ret = btrfs_add_ordered_extent(inode, cur_offset,
1910						       disk_bytenr, num_bytes,
1911						       num_bytes,
1912						       BTRFS_ORDERED_NOCOW);
1913			if (ret)
1914				goto error;
1915		}
1916
1917		if (nocow)
1918			btrfs_dec_nocow_writers(fs_info, disk_bytenr);
1919		nocow = false;
1920
1921		if (btrfs_is_data_reloc_root(root))
1922			/*
1923			 * Error handled later, as we must prevent
1924			 * extent_clear_unlock_delalloc() in error handler
1925			 * from freeing metadata of created ordered extent.
1926			 */
1927			ret = btrfs_reloc_clone_csums(inode, cur_offset,
1928						      num_bytes);
1929
1930		extent_clear_unlock_delalloc(inode, cur_offset,
1931					     cur_offset + num_bytes - 1,
1932					     locked_page, EXTENT_LOCKED |
1933					     EXTENT_DELALLOC |
1934					     EXTENT_CLEAR_DATA_RESV,
1935					     PAGE_UNLOCK | PAGE_SET_ORDERED);
1936
1937		cur_offset = extent_end;
1938
1939		/*
1940		 * If btrfs_reloc_clone_csums() failed, we're now OK to call the
1941		 * error handler, as metadata for the created ordered extent will
1942		 * only be freed by btrfs_finish_ordered_io().
1943		 */
1944		if (ret)
1945			goto error;
1946		if (cur_offset > end)
1947			break;
1948	}
1949	btrfs_release_path(path);
1950
1951	if (cur_offset <= end && cow_start == (u64)-1)
1952		cow_start = cur_offset;
1953
1954	if (cow_start != (u64)-1) {
1955		cur_offset = end;
1956		ret = fallback_to_cow(inode, locked_page, cow_start, end,
1957				      page_started, nr_written);
1958		if (ret)
1959			goto error;
1960	}
1961
1962error:
1963	if (nocow)
1964		btrfs_dec_nocow_writers(fs_info, disk_bytenr);
1965
1966	if (ret && cur_offset < end)
1967		extent_clear_unlock_delalloc(inode, cur_offset, end,
1968					     locked_page, EXTENT_LOCKED |
1969					     EXTENT_DELALLOC | EXTENT_DEFRAG |
1970					     EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
1971					     PAGE_START_WRITEBACK |
1972					     PAGE_END_WRITEBACK);
1973	btrfs_free_path(path);
1974	return ret;
1975}
1976
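/*
 * Return true if this delalloc range should attempt the NOCOW path: the inode
 * has the NODATACOW or PREALLOC flag set and the range is not marked for
 * defrag (defragged ranges must be COWed into new extents).
 */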
1977static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end)
1978{
1979	if (inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) {
1980		if (inode->defrag_bytes &&
1981		    test_range_bit(&inode->io_tree, start, end, EXTENT_DEFRAG,
1982				   0, NULL))
1983			return false;
1984		return true;
1985	}
1986	return false;
1987}
1988
1989/*
1990 * Function to process delayed allocation (create CoW) for ranges which are
1991 * being touched for the first time.
1992 */
1993int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page,
1994		u64 start, u64 end, int *page_started, unsigned long *nr_written,
1995		struct writeback_control *wbc)
1996{
1997	int ret;
1998	const bool zoned = btrfs_is_zoned(inode->root->fs_info);
1999
2000	/*
2001	 * The range must cover part of the @locked_page, or the returned
2002	 * @page_started can confuse the caller.
2003	 */
2004	ASSERT(!(end <= page_offset(locked_page) ||
2005		 start >= page_offset(locked_page) + PAGE_SIZE));
2006
2007	if (should_nocow(inode, start, end)) {
2008		/*
2009		 * Normally on a zoned device we're only doing COW writes, but
2010		 * in the case of relocation on a zoned filesystem we have taken
2011		 * precautions to only write sequentially. It's safe to use
2012		 * run_delalloc_nocow() here, just like for regular
2013		 * preallocated inodes.
2014		 */
2015		ASSERT(!zoned ||
2016		       (zoned && btrfs_is_data_reloc_root(inode->root)));
2017		ret = run_delalloc_nocow(inode, locked_page, start, end,
2018					 page_started, nr_written);
2019	} else if (!inode_can_compress(inode) ||
2020		   !inode_need_compress(inode, start, end)) {
2021		if (zoned)
2022			ret = run_delalloc_zoned(inode, locked_page, start, end,
2023						 page_started, nr_written);
2024		else
2025			ret = cow_file_range(inode, locked_page, start, end,
2026					     page_started, nr_written, 1);
2027	} else {
2028		set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags);
2029		ret = cow_file_range_async(inode, wbc, locked_page, start, end,
2030					   page_started, nr_written);
2031	}
2032	ASSERT(ret <= 0);
2033	if (ret)
2034		btrfs_cleanup_ordered_extents(inode, locked_page, start,
2035					      end - start + 1);
2036	return ret;
2037}
2038
2039void btrfs_split_delalloc_extent(struct inode *inode,
2040				 struct extent_state *orig, u64 split)
2041{
2042	u64 size;
2043
2044	/* not delalloc, ignore it */
2045	if (!(orig->state & EXTENT_DELALLOC))
2046		return;
2047
2048	size = orig->end - orig->start + 1;
2049	if (size > BTRFS_MAX_EXTENT_SIZE) {
2050		u32 num_extents;
2051		u64 new_size;
2052
2053		/*
2054		 * See the explanation in btrfs_merge_delalloc_extent, the same
2055		 * applies here, just in reverse.
2056		 */
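		/*
		 * Splitting at @split yields the ranges [orig->start, split - 1]
		 * and [split, orig->end].  If together they need more
		 * outstanding extents than the original range did, account for
		 * one extra outstanding extent below.
		 */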
2057		new_size = orig->end - split + 1;
2058		num_extents = count_max_extents(new_size);
2059		new_size = split - orig->start;
2060		num_extents += count_max_extents(new_size);
2061		if (count_max_extents(size) >= num_extents)
2062			return;
2063	}
2064
2065	spin_lock(&BTRFS_I(inode)->lock);
2066	btrfs_mod_outstanding_extents(BTRFS_I(inode), 1);
2067	spin_unlock(&BTRFS_I(inode)->lock);
2068}
2069
2070/*
2071 * Handle merged delayed allocation extents so we can keep track of new extents
2072 * that are just merged onto old extents, such as when we are doing sequential
2073 * writes, so we can properly account for the metadata space we'll need.
2074 */
2075void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new,
2076				 struct extent_state *other)
2077{
2078	u64 new_size, old_size;
2079	u32 num_extents;
2080
2081	/* not delalloc, ignore it */
2082	if (!(other->state & EXTENT_DELALLOC))
2083		return;
2084
2085	if (new->start > other->start)
2086		new_size = new->end - other->start + 1;
2087	else
2088		new_size = other->end - new->start + 1;
2089
2090	/* we're not bigger than the max, unreserve the space and go */
2091	if (new_size <= BTRFS_MAX_EXTENT_SIZE) {
2092		spin_lock(&BTRFS_I(inode)->lock);
2093		btrfs_mod_outstanding_extents(BTRFS_I(inode), -1);
2094		spin_unlock(&BTRFS_I(inode)->lock);
2095		return;
2096	}
2097
2098	/*
2099	 * We have to add up either side to figure out how many extents were
2100	 * accounted for before we merged into one big extent.  If the number of
2101	 * extents we accounted for is <= the amount we need for the new range
2102	 * then we can return, otherwise drop.  Think of it like this
2103	 *
2104	 * [ 4k][MAX_SIZE]
2105	 *
2106	 * So we've grown the extent by a MAX_SIZE extent, this would mean we
2107	 * need 2 outstanding extents, on one side we have 1 and the other side
2108	 * we have 1 so they are == and we can return.  But in this case
2109	 *
2110	 * [MAX_SIZE+4k][MAX_SIZE+4k]
2111	 *
2112	 * Each range on their own accounts for 2 extents, but merged together
2113	 * they are only 3 extents worth of accounting, so we need to drop in
2114	 * this case.
2115	 */
2116	old_size = other->end - other->start + 1;
2117	num_extents = count_max_extents(old_size);
2118	old_size = new->end - new->start + 1;
2119	num_extents += count_max_extents(old_size);
2120	if (count_max_extents(new_size) >= num_extents)
2121		return;
2122
2123	spin_lock(&BTRFS_I(inode)->lock);
2124	btrfs_mod_outstanding_extents(BTRFS_I(inode), -1);
2125	spin_unlock(&BTRFS_I(inode)->lock);
2126}
2127
2128static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
2129				      struct inode *inode)
2130{
2131	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2132
2133	spin_lock(&root->delalloc_lock);
2134	if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
2135		list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
2136			      &root->delalloc_inodes);
2137		set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
2138			&BTRFS_I(inode)->runtime_flags);
2139		root->nr_delalloc_inodes++;
2140		if (root->nr_delalloc_inodes == 1) {
2141			spin_lock(&fs_info->delalloc_root_lock);
2142			BUG_ON(!list_empty(&root->delalloc_root));
2143			list_add_tail(&root->delalloc_root,
2144				      &fs_info->delalloc_roots);
2145			spin_unlock(&fs_info->delalloc_root_lock);
2146		}
2147	}
2148	spin_unlock(&root->delalloc_lock);
2149}
2150
2152void __btrfs_del_delalloc_inode(struct btrfs_root *root,
2153				struct btrfs_inode *inode)
2154{
2155	struct btrfs_fs_info *fs_info = root->fs_info;
2156
2157	if (!list_empty(&inode->delalloc_inodes)) {
2158		list_del_init(&inode->delalloc_inodes);
2159		clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
2160			  &inode->runtime_flags);
2161		root->nr_delalloc_inodes--;
2162		if (!root->nr_delalloc_inodes) {
2163			ASSERT(list_empty(&root->delalloc_inodes));
2164			spin_lock(&fs_info->delalloc_root_lock);
2165			BUG_ON(list_empty(&root->delalloc_root));
2166			list_del_init(&root->delalloc_root);
2167			spin_unlock(&fs_info->delalloc_root_lock);
2168		}
2169	}
2170}
2171
2172static void btrfs_del_delalloc_inode(struct btrfs_root *root,
2173				     struct btrfs_inode *inode)
2174{
2175	spin_lock(&root->delalloc_lock);
2176	__btrfs_del_delalloc_inode(root, inode);
2177	spin_unlock(&root->delalloc_lock);
2178}
2179
2180/*
2181 * Properly track delayed allocation bytes in the inode and maintain the
2182 * list of inodes that have pending delalloc work to be done.
2183 */
2184void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
2185			       unsigned *bits)
2186{
2187	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2188
2189	if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
2190		WARN_ON(1);
2191	/*
2192	 * set_bit and clear bit hooks normally require _irqsave/restore
2193	 * but in this case, we are only testing for the DELALLOC
2194	 * bit, which is only set or cleared with irqs on
2195	 */
2196	if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
2197		struct btrfs_root *root = BTRFS_I(inode)->root;
2198		u64 len = state->end + 1 - state->start;
2199		u32 num_extents = count_max_extents(len);
2200		bool do_list = !btrfs_is_free_space_inode(BTRFS_I(inode));
2201
2202		spin_lock(&BTRFS_I(inode)->lock);
2203		btrfs_mod_outstanding_extents(BTRFS_I(inode), num_extents);
2204		spin_unlock(&BTRFS_I(inode)->lock);
2205
2206		/* For sanity tests */
2207		if (btrfs_is_testing(fs_info))
2208			return;
2209
2210		percpu_counter_add_batch(&fs_info->delalloc_bytes, len,
2211					 fs_info->delalloc_batch);
2212		spin_lock(&BTRFS_I(inode)->lock);
2213		BTRFS_I(inode)->delalloc_bytes += len;
2214		if (*bits & EXTENT_DEFRAG)
2215			BTRFS_I(inode)->defrag_bytes += len;
2216		if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
2217					 &BTRFS_I(inode)->runtime_flags))
2218			btrfs_add_delalloc_inodes(root, inode);
2219		spin_unlock(&BTRFS_I(inode)->lock);
2220	}
2221
2222	if (!(state->state & EXTENT_DELALLOC_NEW) &&
2223	    (*bits & EXTENT_DELALLOC_NEW)) {
2224		spin_lock(&BTRFS_I(inode)->lock);
2225		BTRFS_I(inode)->new_delalloc_bytes += state->end + 1 -
2226			state->start;
2227		spin_unlock(&BTRFS_I(inode)->lock);
2228	}
2229}
2230
2231/*
2232 * Once a range is no longer delalloc this function ensures that proper
2233 * accounting happens.
2234 */
2235void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
2236				 struct extent_state *state, unsigned *bits)
2237{
2238	struct btrfs_inode *inode = BTRFS_I(vfs_inode);
2239	struct btrfs_fs_info *fs_info = btrfs_sb(vfs_inode->i_sb);
2240	u64 len = state->end + 1 - state->start;
2241	u32 num_extents = count_max_extents(len);
2242
2243	if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG)) {
2244		spin_lock(&inode->lock);
2245		inode->defrag_bytes -= len;
2246		spin_unlock(&inode->lock);
2247	}
2248
2249	/*
2250	 * set_bit and clear bit hooks normally require _irqsave/restore
2251	 * but in this case, we are only testing for the DELALLOC
2252	 * bit, which is only set or cleared with irqs on
2253	 */
2254	if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
2255		struct btrfs_root *root = inode->root;
2256		bool do_list = !btrfs_is_free_space_inode(inode);
2257
2258		spin_lock(&inode->lock);
2259		btrfs_mod_outstanding_extents(inode, -num_extents);
2260		spin_unlock(&inode->lock);
2261
2262		/*
2263		 * We don't reserve metadata space for space cache inodes so we
2264		 * don't need to call delalloc_release_metadata if there is an
2265		 * error.
2266		 */
2267		if (*bits & EXTENT_CLEAR_META_RESV &&
2268		    root != fs_info->tree_root)
2269			btrfs_delalloc_release_metadata(inode, len, false);
2270
2271		/* For sanity tests. */
2272		if (btrfs_is_testing(fs_info))
2273			return;
2274
2275		if (!btrfs_is_data_reloc_root(root) &&
2276		    do_list && !(state->state & EXTENT_NORESERVE) &&
2277		    (*bits & EXTENT_CLEAR_DATA_RESV))
2278			btrfs_free_reserved_data_space_noquota(fs_info, len);
2279
2280		percpu_counter_add_batch(&fs_info->delalloc_bytes, -len,
2281					 fs_info->delalloc_batch);
2282		spin_lock(&inode->lock);
2283		inode->delalloc_bytes -= len;
2284		if (do_list && inode->delalloc_bytes == 0 &&
2285		    test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
2286					&inode->runtime_flags))
2287			btrfs_del_delalloc_inode(root, inode);
2288		spin_unlock(&inode->lock);
2289	}
2290
2291	if ((state->state & EXTENT_DELALLOC_NEW) &&
2292	    (*bits & EXTENT_DELALLOC_NEW)) {
2293		spin_lock(&inode->lock);
2294		ASSERT(inode->new_delalloc_bytes >= len);
2295		inode->new_delalloc_bytes -= len;
2296		if (*bits & EXTENT_ADD_INODE_BYTES)
2297			inode_add_bytes(&inode->vfs_inode, len);
2298		spin_unlock(&inode->lock);
2299	}
2300}
2301
2302/*
2303 * In order to insert checksums into the metadata in large chunks,
2304 * we wait until bio submission time.  All the pages in the bio are
2305 * checksummed and sums are attached onto the ordered extent record.
2306 *
2307 * At IO completion time the csums attached to the ordered extent record
2308 * are inserted into the btree.
2309 */
2310static blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio,
2311					   u64 dio_file_offset)
2312{
2313	return btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0);
2314}
2315
2316/*
2317 * Split an extent_map covering the range [start, start + len)
2318 *
2319 * This function is intended to be used only for extract_ordered_extent().
2320 */
2321static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,
2322			  u64 pre, u64 post)
2323{
2324	struct extent_map_tree *em_tree = &inode->extent_tree;
2325	struct extent_map *em;
2326	struct extent_map *split_pre = NULL;
2327	struct extent_map *split_mid = NULL;
2328	struct extent_map *split_post = NULL;
2329	int ret = 0;
2330	unsigned long flags;
2331
2332	/* Sanity check */
2333	if (pre == 0 && post == 0)
2334		return 0;
2335
2336	split_pre = alloc_extent_map();
2337	if (pre)
2338		split_mid = alloc_extent_map();
2339	if (post)
2340		split_post = alloc_extent_map();
2341	if (!split_pre || (pre && !split_mid) || (post && !split_post)) {
2342		ret = -ENOMEM;
2343		goto out;
2344	}
2345
2346	ASSERT(pre + post < len);
2347
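	/*
	 * The extent map covering [start, start + len) is replaced by up to
	 * three pieces:
	 *   [start, start + pre)              -> split_pre (or the whole head
	 *                                        up to the post part if pre == 0)
	 *   [start + pre, start + len - post) -> split_mid (only if pre != 0)
	 *   [start + len - post, start + len) -> split_post (only if post != 0)
	 */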
2348	lock_extent(&inode->io_tree, start, start + len - 1);
2349	write_lock(&em_tree->lock);
2350	em = lookup_extent_mapping(em_tree, start, len);
2351	if (!em) {
2352		ret = -EIO;
2353		goto out_unlock;
2354	}
2355
2356	ASSERT(em->len == len);
2357	ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags));
2358	ASSERT(em->block_start < EXTENT_MAP_LAST_BYTE);
2359	ASSERT(test_bit(EXTENT_FLAG_PINNED, &em->flags));
2360	ASSERT(!test_bit(EXTENT_FLAG_LOGGING, &em->flags));
2361	ASSERT(!list_empty(&em->list));
2362
2363	flags = em->flags;
2364	clear_bit(EXTENT_FLAG_PINNED, &em->flags);
2365
2366	/* First, replace the em with a new extent_map starting from em->start */
2367	split_pre->start = em->start;
2368	split_pre->len = (pre ? pre : em->len - post);
2369	split_pre->orig_start = split_pre->start;
2370	split_pre->block_start = em->block_start;
2371	split_pre->block_len = split_pre->len;
2372	split_pre->orig_block_len = split_pre->block_len;
2373	split_pre->ram_bytes = split_pre->len;
2374	split_pre->flags = flags;
2375	split_pre->compress_type = em->compress_type;
2376	split_pre->generation = em->generation;
2377
2378	replace_extent_mapping(em_tree, em, split_pre, 1);
2379
2380	/*
2381	 * Now we only have an extent_map at:
2382	 *     [em->start, em->start + pre] if pre != 0
2383	 *     [em->start, em->start + em->len - post] if pre == 0
2384	 */
2385
2386	if (pre) {
2387		/* Insert the middle extent_map */
2388		split_mid->start = em->start + pre;
2389		split_mid->len = em->len - pre - post;
2390		split_mid->orig_start = split_mid->start;
2391		split_mid->block_start = em->block_start + pre;
2392		split_mid->block_len = split_mid->len;
2393		split_mid->orig_block_len = split_mid->block_len;
2394		split_mid->ram_bytes = split_mid->len;
2395		split_mid->flags = flags;
2396		split_mid->compress_type = em->compress_type;
2397		split_mid->generation = em->generation;
2398		add_extent_mapping(em_tree, split_mid, 1);
2399	}
2400
2401	if (post) {
2402		split_post->start = em->start + em->len - post;
2403		split_post->len = post;
2404		split_post->orig_start = split_post->start;
2405		split_post->block_start = em->block_start + em->len - post;
2406		split_post->block_len = split_post->len;
2407		split_post->orig_block_len = split_post->block_len;
2408		split_post->ram_bytes = split_post->len;
2409		split_post->flags = flags;
2410		split_post->compress_type = em->compress_type;
2411		split_post->generation = em->generation;
2412		add_extent_mapping(em_tree, split_post, 1);
2413	}
2414
2415	/* Once for us */
2416	free_extent_map(em);
2417	/* Once for the tree */
2418	free_extent_map(em);
2419
2420out_unlock:
2421	write_unlock(&em_tree->lock);
2422	unlock_extent(&inode->io_tree, start, start + len - 1);
2423out:
2424	free_extent_map(split_pre);
2425	free_extent_map(split_mid);
2426	free_extent_map(split_post);
2427
2428	return ret;
2429}
2430
2431static blk_status_t extract_ordered_extent(struct btrfs_inode *inode,
2432					   struct bio *bio, loff_t file_offset)
2433{
2434	struct btrfs_ordered_extent *ordered;
2435	u64 start = (u64)bio->bi_iter.bi_sector << SECTOR_SHIFT;
2436	u64 file_len;
2437	u64 len = bio->bi_iter.bi_size;
2438	u64 end = start + len;
2439	u64 ordered_end;
2440	u64 pre, post;
2441	int ret = 0;
2442
2443	ordered = btrfs_lookup_ordered_extent(inode, file_offset);
2444	if (WARN_ON_ONCE(!ordered))
2445		return BLK_STS_IOERR;
2446
2447	/* No need to split */
2448	if (ordered->disk_num_bytes == len)
2449		goto out;
2450
2451	/* We cannot split once end_bio'd ordered extent */
2452	if (WARN_ON_ONCE(ordered->bytes_left != ordered->disk_num_bytes)) {
2453		ret = -EINVAL;
2454		goto out;
2455	}
2456
2457	/* We cannot split a compressed ordered extent */
2458	if (WARN_ON_ONCE(ordered->disk_num_bytes != ordered->num_bytes)) {
2459		ret = -EINVAL;
2460		goto out;
2461	}
2462
2463	ordered_end = ordered->disk_bytenr + ordered->disk_num_bytes;
2464	/* bio must be in one ordered extent */
2465	if (WARN_ON_ONCE(start < ordered->disk_bytenr || end > ordered_end)) {
2466		ret = -EINVAL;
2467		goto out;
2468	}
2469
2470	/* Checksum list should be empty */
2471	if (WARN_ON_ONCE(!list_empty(&ordered->list))) {
2472		ret = -EINVAL;
2473		goto out;
2474	}
2475
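	/*
	 * Split off the parts of the ordered extent before and after the
	 * range covered by this bio, so the bio maps to exactly one ordered
	 * extent.  pre and post are lengths in disk bytenr terms.
	 */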
2476	file_len = ordered->num_bytes;
2477	pre = start - ordered->disk_bytenr;
2478	post = ordered_end - end;
2479
2480	ret = btrfs_split_ordered_extent(ordered, pre, post);
2481	if (ret)
2482		goto out;
2483	ret = split_zoned_em(inode, file_offset, file_len, pre, post);
2484
2485out:
2486	btrfs_put_ordered_extent(ordered);
2487
2488	return errno_to_blk_status(ret);
2489}
2490
2491/*
2492 * extent_io.c submission hook. This does the right thing for csum calculation
2493 * on write, or reading the csums from the tree before a read.
2494 *
2495 * Rules about async/sync submit,
2496 * a) read:				sync submit
2497 *
2498 * b) write without checksum:		sync submit
2499 *
2500 * c) write with checksum:
2501 *    c-1) if bio is issued by fsync:	sync submit
2502 *         (sync_writers != 0)
2503 *
2504 *    c-2) if root is reloc root:	sync submit
2505 *         (only in case of buffered IO)
2506 *
2507 *    c-3) otherwise:			async submit
2508 */
2509blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
2510				   int mirror_num, unsigned long bio_flags)
2511
2512{
2513	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2514	struct btrfs_root *root = BTRFS_I(inode)->root;
2515	enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA;
2516	blk_status_t ret = 0;
2517	int skip_sum;
2518	int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
2519
2520	skip_sum = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) ||
2521		test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state);
2522
2523	if (btrfs_is_free_space_inode(BTRFS_I(inode)))
2524		metadata = BTRFS_WQ_ENDIO_FREE_SPACE;
2525
2526	if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
2527		struct page *page = bio_first_bvec_all(bio)->bv_page;
2528		loff_t file_offset = page_offset(page);
2529
2530		ret = extract_ordered_extent(BTRFS_I(inode), bio, file_offset);
2531		if (ret)
2532			goto out;
2533	}
2534
2535	if (btrfs_op(bio) != BTRFS_MAP_WRITE) {
2536		ret = btrfs_bio_wq_end_io(fs_info, bio, metadata);
2537		if (ret)
2538			goto out;
2539
2540		if (bio_flags & EXTENT_BIO_COMPRESSED) {
2541			ret = btrfs_submit_compressed_read(inode, bio,
2542							   mirror_num,
2543							   bio_flags);
2544			goto out;
2545		} else {
2546			/*
2547			 * Lookup bio sums does extra checks around whether we
2548			 * need to csum or not, which is why we ignore skip_sum
2549			 * here.
2550			 */
2551			ret = btrfs_lookup_bio_sums(inode, bio, NULL);
2552			if (ret)
2553				goto out;
2554		}
2555		goto mapit;
2556	} else if (async && !skip_sum) {
2557		/* csum items have already been cloned */
2558		if (btrfs_is_data_reloc_root(root))
2559			goto mapit;
2560		/* we're doing a write, do the async checksumming */
2561		ret = btrfs_wq_submit_bio(inode, bio, mirror_num, bio_flags,
2562					  0, btrfs_submit_bio_start);
2563		goto out;
2564	} else if (!skip_sum) {
2565		ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0);
2566		if (ret)
2567			goto out;
2568	}
2569
2570mapit:
2571	ret = btrfs_map_bio(fs_info, bio, mirror_num);
2572
2573out:
2574	if (ret) {
2575		bio->bi_status = ret;
2576		bio_endio(bio);
2577	}
2578	return ret;
2579}
2580
2581/*
2582 * Given a list of ordered sums, record them in the inode.  This happens
2583 * at IO completion time based on sums calculated at bio submission time.
2584 */
2585static int add_pending_csums(struct btrfs_trans_handle *trans,
2586			     struct list_head *list)
2587{
2588	struct btrfs_ordered_sum *sum;
2589	struct btrfs_root *csum_root = NULL;
2590	int ret;
2591
2592	list_for_each_entry(sum, list, list) {
2593		trans->adding_csums = true;
2594		if (!csum_root)
2595			csum_root = btrfs_csum_root(trans->fs_info,
2596						    sum->bytenr);
2597		ret = btrfs_csum_file_blocks(trans, csum_root, sum);
2598		trans->adding_csums = false;
2599		if (ret)
2600			return ret;
2601	}
2602	return 0;
2603}
2604
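/*
 * Mark the holes (ranges not backed by an extent) within [start, start + len)
 * with the EXTENT_DELALLOC_NEW bit, so that when the range completes we know
 * which parts are new bytes for the inode (see the EXTENT_ADD_INODE_BYTES
 * handling in btrfs_clear_delalloc_extent()).
 */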
2605static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
2606					 const u64 start,
2607					 const u64 len,
2608					 struct extent_state **cached_state)
2609{
2610	u64 search_start = start;
2611	const u64 end = start + len - 1;
2612
2613	while (search_start < end) {
2614		const u64 search_len = end - search_start + 1;
2615		struct extent_map *em;
2616		u64 em_len;
2617		int ret = 0;
2618
2619		em = btrfs_get_extent(inode, NULL, 0, search_start, search_len);
2620		if (IS_ERR(em))
2621			return PTR_ERR(em);
2622
2623		if (em->block_start != EXTENT_MAP_HOLE)
2624			goto next;
2625
2626		em_len = em->len;
2627		if (em->start < search_start)
2628			em_len -= search_start - em->start;
2629		if (em_len > search_len)
2630			em_len = search_len;
2631
2632		ret = set_extent_bit(&inode->io_tree, search_start,
2633				     search_start + em_len - 1,
2634				     EXTENT_DELALLOC_NEW, 0, NULL, cached_state,
2635				     GFP_NOFS, NULL);
2636next:
2637		search_start = extent_map_end(em);
2638		free_extent_map(em);
2639		if (ret)
2640			return ret;
2641	}
2642	return 0;
2643}
2644
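/*
 * Mark [start, end] as delalloc in the inode's io tree, first tagging any part
 * of the range that lies over a hole with EXTENT_DELALLOC_NEW (or the whole
 * range, if it starts at or beyond EOF on a non-prealloc inode), via
 * btrfs_find_new_delalloc_bytes().
 */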
2645int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
2646			      unsigned int extra_bits,
2647			      struct extent_state **cached_state)
2648{
2649	WARN_ON(PAGE_ALIGNED(end));
2650
2651	if (start >= i_size_read(&inode->vfs_inode) &&
2652	    !(inode->flags & BTRFS_INODE_PREALLOC)) {
2653		/*
2654		 * There can't be any extents following eof in this case so just
2655		 * set the delalloc new bit for the range directly.
2656		 */
2657		extra_bits |= EXTENT_DELALLOC_NEW;
2658	} else {
2659		int ret;
2660
2661		ret = btrfs_find_new_delalloc_bytes(inode, start,
2662						    end + 1 - start,
2663						    cached_state);
2664		if (ret)
2665			return ret;
2666	}
2667
2668	return set_extent_delalloc(&inode->io_tree, start, end, extra_bits,
2669				   cached_state);
2670}
2671
2672/* see btrfs_writepage_start_hook for details on why this is required */
2673struct btrfs_writepage_fixup {
2674	struct page *page;
2675	struct inode *inode;
2676	struct btrfs_work work;
2677};
2678
2679static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
2680{
2681	struct btrfs_writepage_fixup *fixup;
2682	struct btrfs_ordered_extent *ordered;
2683	struct extent_state *cached_state = NULL;
2684	struct extent_changeset *data_reserved = NULL;
2685	struct page *page;
2686	struct btrfs_inode *inode;
2687	u64 page_start;
2688	u64 page_end;
2689	int ret = 0;
2690	bool free_delalloc_space = true;
2691
2692	fixup = container_of(work, struct btrfs_writepage_fixup, work);
2693	page = fixup->page;
2694	inode = BTRFS_I(fixup->inode);
2695	page_start = page_offset(page);
2696	page_end = page_offset(page) + PAGE_SIZE - 1;
2697
2698	/*
2699	 * This is similar to page_mkwrite, we need to reserve the space before
2700	 * we take the page lock.
2701	 */
2702	ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
2703					   PAGE_SIZE);
2704again:
2705	lock_page(page);
2706
2707	/*
2708	 * Before we queued this fixup, we took a reference on the page.
2709	 * page->mapping may go NULL, but it shouldn't be moved to a different
2710	 * address space.
2711	 */
2712	if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
2713		/*
2714		 * Unfortunately this is a little tricky, either
2715		 *
2716		 * 1) We got here and our page had already been dealt with and
2717		 *    we reserved our space, thus ret == 0, so we need to just
2718		 *    drop our space reservation and bail.  This can happen the
2719		 *    first time we come into the fixup worker, or could happen
2720		 *    while waiting for the ordered extent.
2721		 * 2) Our page was already dealt with, but we happened to get an
2722		 *    ENOSPC above from the btrfs_delalloc_reserve_space.  In
2723		 *    this case we obviously don't have anything to release, but
2724		 *    because the page was already dealt with we don't want to
2725		 *    mark the page with an error, so make sure we're resetting
2726		 *    ret to 0.  This is why we have this check _before_ the ret
2727		 *    check, because we do not want to have a surprise ENOSPC
2728		 *    when the page was already properly dealt with.
2729		 */
2730		if (!ret) {
2731			btrfs_delalloc_release_extents(inode, PAGE_SIZE);
2732			btrfs_delalloc_release_space(inode, data_reserved,
2733						     page_start, PAGE_SIZE,
2734						     true);
2735		}
2736		ret = 0;
2737		goto out_page;
2738	}
2739
2740	/*
2741	 * We can't mess with the page state unless it is locked, so now that
2742	 * it is locked bail if we failed to make our space reservation.
2743	 */
2744	if (ret)
2745		goto out_page;
2746
2747	lock_extent_bits(&inode->io_tree, page_start, page_end, &cached_state);
2748
2749	/* already ordered? We're done */
2750	if (PageOrdered(page))
2751		goto out_reserved;
2752
2753	ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
2754	if (ordered) {
2755		unlock_extent_cached(&inode->io_tree, page_start, page_end,
2756				     &cached_state);
2757		unlock_page(page);
2758		btrfs_start_ordered_extent(ordered, 1);
2759		btrfs_put_ordered_extent(ordered);
2760		goto again;
2761	}
2762
2763	ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0,
2764					&cached_state);
2765	if (ret)
2766		goto out_reserved;
2767
2768	/*
2769	 * Everything went as planned, we're now the owner of a dirty page with
2770	 * delayed allocation bits set and space reserved for our COW
2771	 * destination.
2772	 *
2773	 * The page was dirty when we started, nothing should have cleaned it.
2774	 */
2775	BUG_ON(!PageDirty(page));
2776	free_delalloc_space = false;
2777out_reserved:
2778	btrfs_delalloc_release_extents(inode, PAGE_SIZE);
2779	if (free_delalloc_space)
2780		btrfs_delalloc_release_space(inode, data_reserved, page_start,
2781					     PAGE_SIZE, true);
2782	unlock_extent_cached(&inode->io_tree, page_start, page_end,
2783			     &cached_state);
2784out_page:
2785	if (ret) {
2786		/*
2787		 * We hit ENOSPC or other errors.  Update the mapping and page
2788		 * to reflect the errors and clean the page.
2789		 */
2790		mapping_set_error(page->mapping, ret);
2791		end_extent_writepage(page, ret, page_start, page_end);
2792		clear_page_dirty_for_io(page);
2793		SetPageError(page);
2794	}
2795	btrfs_page_clear_checked(inode->root->fs_info, page, page_start, PAGE_SIZE);
2796	unlock_page(page);
2797	put_page(page);
2798	kfree(fixup);
2799	extent_changeset_free(data_reserved);
2800	/*
2801	 * As a precaution, do a delayed iput in case it would be the last iput
2802	 * that could need flushing space. Recursing back to fixup worker would
2803	 * deadlock.
2804	 */
2805	btrfs_add_delayed_iput(&inode->vfs_inode);
2806}
2807
2808/*
2809 * There are a few paths in the higher layers of the kernel that directly
2810 * set the page dirty bit without asking the filesystem if it is a
2811 * good idea.  This causes problems because we want to make sure COW
2812 * properly happens and the data=ordered rules are followed.
2813 *
2814 * In our case any range that doesn't have the ORDERED bit set
2815 * hasn't been properly setup for IO.  We kick off an async process
2816 * to fix it up.  The async helper will wait for ordered extents, set
2817 * the delalloc bit and make it safe to write the page.
2818 */
2819int btrfs_writepage_cow_fixup(struct page *page)
2820{
2821	struct inode *inode = page->mapping->host;
2822	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2823	struct btrfs_writepage_fixup *fixup;
2824
2825	/* This page has ordered extent covering it already */
2826	if (PageOrdered(page))
2827		return 0;
2828
2829	/*
2830	 * PageChecked is set below when we create a fixup worker for this page,
2831	 * don't try to create another one if we're already PageChecked()
2832	 *
2833	 * The extent_io writepage code will redirty the page if we send back
2834	 * EAGAIN.
2835	 */
2836	if (PageChecked(page))
2837		return -EAGAIN;
2838
2839	fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
2840	if (!fixup)
2841		return -EAGAIN;
2842
2843	/*
2844	 * We are already holding a reference to this inode from
2845	 * write_cache_pages.  We need to hold it because the space reservation
2846	 * takes place outside of the page lock, and we can't trust
2847	 * page->mapping outside of the page lock.
2848	 */
2849	ihold(inode);
2850	btrfs_page_set_checked(fs_info, page, page_offset(page), PAGE_SIZE);
2851	get_page(page);
2852	btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL);
2853	fixup->page = page;
2854	fixup->inode = inode;
2855	btrfs_queue_work(fs_info->fixup_workers, &fixup->work);
2856
2857	return -EAGAIN;
2858}
2859
2860static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
2861				       struct btrfs_inode *inode, u64 file_pos,
2862				       struct btrfs_file_extent_item *stack_fi,
2863				       const bool update_inode_bytes,
2864				       u64 qgroup_reserved)
2865{
2866	struct btrfs_root *root = inode->root;
2867	const u64 sectorsize = root->fs_info->sectorsize;
2868	struct btrfs_path *path;
2869	struct extent_buffer *leaf;
2870	struct btrfs_key ins;
2871	u64 disk_num_bytes = btrfs_stack_file_extent_disk_num_bytes(stack_fi);
2872	u64 disk_bytenr = btrfs_stack_file_extent_disk_bytenr(stack_fi);
2873	u64 num_bytes = btrfs_stack_file_extent_num_bytes(stack_fi);
2874	u64 ram_bytes = btrfs_stack_file_extent_ram_bytes(stack_fi);
2875	struct btrfs_drop_extents_args drop_args = { 0 };
2876	int ret;
2877
2878	path = btrfs_alloc_path();
2879	if (!path)
2880		return -ENOMEM;
2881
2882	/*
2883	 * we may be replacing one extent in the tree with another.
2884	 * The new extent is pinned in the extent map, and we don't want
2885	 * to drop it from the cache until it is completely in the btree.
2886	 *
2887	 * So, tell btrfs_drop_extents to leave this extent in the cache.
2888	 * the caller is expected to unpin it and allow it to be merged
2889	 * with the others.
2890	 */
2891	drop_args.path = path;
2892	drop_args.start = file_pos;
2893	drop_args.end = file_pos + num_bytes;
2894	drop_args.replace_extent = true;
2895	drop_args.extent_item_size = sizeof(*stack_fi);
2896	ret = btrfs_drop_extents(trans, root, inode, &drop_args);
2897	if (ret)
2898		goto out;
2899
2900	if (!drop_args.extent_inserted) {
2901		ins.objectid = btrfs_ino(inode);
2902		ins.offset = file_pos;
2903		ins.type = BTRFS_EXTENT_DATA_KEY;
2904
2905		ret = btrfs_insert_empty_item(trans, root, path, &ins,
2906					      sizeof(*stack_fi));
2907		if (ret)
2908			goto out;
2909	}
2910	leaf = path->nodes[0];
2911	btrfs_set_stack_file_extent_generation(stack_fi, trans->transid);
2912	write_extent_buffer(leaf, stack_fi,
2913			btrfs_item_ptr_offset(leaf, path->slots[0]),
2914			sizeof(struct btrfs_file_extent_item));
2915
2916	btrfs_mark_buffer_dirty(leaf);
2917	btrfs_release_path(path);
2918
2919	/*
2920	 * If we dropped an inline extent here, we know the range where it is
2921	 * was not marked with the EXTENT_DELALLOC_NEW bit, so we update the
2922	 * number of bytes only for that range containing the inline extent.
2923	 * The remainder of the range will be processed when clearing the
2924	 * EXTENT_DELALLOC bit through the ordered extent completion.
2925	 */
2926	if (file_pos == 0 && !IS_ALIGNED(drop_args.bytes_found, sectorsize)) {
2927		u64 inline_size = round_down(drop_args.bytes_found, sectorsize);
2928
2929		inline_size = drop_args.bytes_found - inline_size;
2930		btrfs_update_inode_bytes(inode, sectorsize, inline_size);
2931		drop_args.bytes_found -= inline_size;
2932		num_bytes -= sectorsize;
2933	}
2934
2935	if (update_inode_bytes)
2936		btrfs_update_inode_bytes(inode, num_bytes, drop_args.bytes_found);
2937
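	/*
	 * The key describes the new data extent on disk: objectid is the disk
	 * byte start and offset the number of bytes on disk.  It is consumed
	 * by btrfs_alloc_reserved_file_extent() below.
	 */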
2938	ins.objectid = disk_bytenr;
2939	ins.offset = disk_num_bytes;
2940	ins.type = BTRFS_EXTENT_ITEM_KEY;
2941
2942	ret = btrfs_inode_set_file_extent_range(inode, file_pos, ram_bytes);
2943	if (ret)
2944		goto out;
2945
2946	ret = btrfs_alloc_reserved_file_extent(trans, root, btrfs_ino(inode),
2947					       file_pos, qgroup_reserved, &ins);
2948out:
2949	btrfs_free_path(path);
2950
2951	return ret;
2952}
2953
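/*
 * Subtract @len bytes from the delalloc_bytes counter of the block group that
 * contains @start, once the ordered extent backed by it has inserted its file
 * extent item.
 */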
2954static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info,
2955					 u64 start, u64 len)
2956{
2957	struct btrfs_block_group *cache;
2958
2959	cache = btrfs_lookup_block_group(fs_info, start);
2960	ASSERT(cache);
2961
2962	spin_lock(&cache->lock);
2963	cache->delalloc_bytes -= len;
2964	spin_unlock(&cache->lock);
2965
2966	btrfs_put_block_group(cache);
2967}
2968
2969static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
2970					     struct btrfs_ordered_extent *oe)
2971{
2972	struct btrfs_file_extent_item stack_fi;
2973	u64 logical_len;
2974	bool update_inode_bytes;
2975
2976	memset(&stack_fi, 0, sizeof(stack_fi));
2977	btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_REG);
2978	btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, oe->disk_bytenr);
2979	btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi,
2980						   oe->disk_num_bytes);
2981	if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags))
2982		logical_len = oe->truncated_len;
2983	else
2984		logical_len = oe->num_bytes;
2985	btrfs_set_stack_file_extent_num_bytes(&stack_fi, logical_len);
2986	btrfs_set_stack_file_extent_ram_bytes(&stack_fi, logical_len);
2987	btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type);
2988	/* Encryption and other encoding is reserved and all 0 */
2989
2990	/*
2991	 * For delalloc, when completing an ordered extent we update the inode's
2992	 * bytes when clearing the range in the inode's io tree, so pass false
2993	 * as the argument 'update_inode_bytes' to insert_reserved_file_extent(),
2994	 * except if the ordered extent was truncated.
2995	 */
2996	update_inode_bytes = test_bit(BTRFS_ORDERED_DIRECT, &oe->flags) ||
2997			     test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags);
2998
2999	return insert_reserved_file_extent(trans, BTRFS_I(oe->inode),
3000					   oe->file_offset, &stack_fi,
3001					   update_inode_bytes, oe->qgroup_rsv);
3002}
3003
3004/*
3005 * As ordered data IO finishes, this gets called so we can finish
3006 * an ordered extent if the range of bytes in the file it covers are
3007 * fully written.
3008 */
3009static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
3010{
3011	struct btrfs_inode *inode = BTRFS_I(ordered_extent->inode);
3012	struct btrfs_root *root = inode->root;
3013	struct btrfs_fs_info *fs_info = root->fs_info;
3014	struct btrfs_trans_handle *trans = NULL;
3015	struct extent_io_tree *io_tree = &inode->io_tree;
3016	struct extent_state *cached_state = NULL;
3017	u64 start, end;
3018	int compress_type = 0;
3019	int ret = 0;
3020	u64 logical_len = ordered_extent->num_bytes;
3021	bool freespace_inode;
3022	bool truncated = false;
3023	bool clear_reserved_extent = true;
3024	unsigned int clear_bits = EXTENT_DEFRAG;
3025
3026	start = ordered_extent->file_offset;
3027	end = start + ordered_extent->num_bytes - 1;
3028
3029	if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
3030	    !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) &&
3031	    !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags))
3032		clear_bits |= EXTENT_DELALLOC_NEW;
3033
3034	freespace_inode = btrfs_is_free_space_inode(inode);
3035
3036	if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
3037		ret = -EIO;
3038		goto out;
3039	}
3040
3041	/* A valid bdev implies a write on a sequential zone */
3042	if (ordered_extent->bdev) {
3043		btrfs_rewrite_logical_zoned(ordered_extent);
3044		btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr,
3045					ordered_extent->disk_num_bytes);
3046	}
3047
3048	btrfs_free_io_failure_record(inode, start, end);
3049
3050	if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
3051		truncated = true;
3052		logical_len = ordered_extent->truncated_len;
3053		/* Truncated the entire extent, don't bother adding */
3054		if (!logical_len)
3055			goto out;
3056	}
3057
3058	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
3059		BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
3060
3061		btrfs_inode_safe_disk_i_size_write(inode, 0);
3062		if (freespace_inode)
3063			trans = btrfs_join_transaction_spacecache(root);
3064		else
3065			trans = btrfs_join_transaction(root);
3066		if (IS_ERR(trans)) {
3067			ret = PTR_ERR(trans);
3068			trans = NULL;
3069			goto out;
3070		}
3071		trans->block_rsv = &inode->block_rsv;
3072		ret = btrfs_update_inode_fallback(trans, root, inode);
3073		if (ret) /* -ENOMEM or corruption */
3074			btrfs_abort_transaction(trans, ret);
3075		goto out;
3076	}
3077
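	/*
	 * For COW writes (regular or prealloc) we need to lock the range,
	 * join a transaction and insert or update the file extent item.
	 */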
3078	clear_bits |= EXTENT_LOCKED;
3079	lock_extent_bits(io_tree, start, end, &cached_state);
3080
3081	if (freespace_inode)
3082		trans = btrfs_join_transaction_spacecache(root);
3083	else
3084		trans = btrfs_join_transaction(root);
3085	if (IS_ERR(trans)) {
3086		ret = PTR_ERR(trans);
3087		trans = NULL;
3088		goto out;
3089	}
3090
3091	trans->block_rsv = &inode->block_rsv;
3092
3093	if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
3094		compress_type = ordered_extent->compress_type;
3095	if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
3096		BUG_ON(compress_type);
3097		ret = btrfs_mark_extent_written(trans, inode,
3098						ordered_extent->file_offset,
3099						ordered_extent->file_offset +
3100						logical_len);
3101	} else {
3102		BUG_ON(root == fs_info->tree_root);
3103		ret = insert_ordered_extent_file_extent(trans, ordered_extent);
3104		if (!ret) {
3105			clear_reserved_extent = false;
3106			btrfs_release_delalloc_bytes(fs_info,
3107						ordered_extent->disk_bytenr,
3108						ordered_extent->disk_num_bytes);
3109		}
3110	}
3111	unpin_extent_cache(&inode->extent_tree, ordered_extent->file_offset,
3112			   ordered_extent->num_bytes, trans->transid);
3113	if (ret < 0) {
3114		btrfs_abort_transaction(trans, ret);
3115		goto out;
3116	}
3117
3118	ret = add_pending_csums(trans, &ordered_extent->list);
3119	if (ret) {
3120		btrfs_abort_transaction(trans, ret);
3121		goto out;
3122	}
3123
3124	/*
3125	 * If this is a new delalloc range, clear its new delalloc flag to
3126	 * update the inode's number of bytes. This needs to be done before
3127	 * updating the inode item.
3128	 */
3129	if ((clear_bits & EXTENT_DELALLOC_NEW) &&
3130	    !test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags))
3131		clear_extent_bit(&inode->io_tree, start, end,
3132				 EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES,
3133				 0, 0, &cached_state);
3134
3135	btrfs_inode_safe_disk_i_size_write(inode, 0);
3136	ret = btrfs_update_inode_fallback(trans, root, inode);
3137	if (ret) { /* -ENOMEM or corruption */
3138		btrfs_abort_transaction(trans, ret);
3139		goto out;
3140	}
3141	ret = 0;
3142out:
3143	clear_extent_bit(&inode->io_tree, start, end, clear_bits,
3144			 (clear_bits & EXTENT_LOCKED) ? 1 : 0, 0,
3145			 &cached_state);
3146
3147	if (trans)
3148		btrfs_end_transaction(trans);
3149
3150	if (ret || truncated) {
3151		u64 unwritten_start = start;
3152
3153		/*
3154		 * If we failed to finish this ordered extent for any reason we
3155		 * need to make sure BTRFS_ORDERED_IOERR is set on the ordered
3156		 * extent, and mark the inode with the error if it wasn't
3157		 * already set.  Any error during writeback would have already
3158		 * set the mapping error, so we need to set it if we're the ones
3159		 * marking this ordered extent as failed.
3160		 */
3161		if (ret && !test_and_set_bit(BTRFS_ORDERED_IOERR,
3162					     &ordered_extent->flags))
3163			mapping_set_error(ordered_extent->inode->i_mapping, -EIO);
3164
3165		if (truncated)
3166			unwritten_start += logical_len;
3167		clear_extent_uptodate(io_tree, unwritten_start, end, NULL);
3168
3169		/* Drop the cache for the part of the extent we didn't write. */
3170		btrfs_drop_extent_cache(inode, unwritten_start, end, 0);
3171
3172		/*
3173		 * If the ordered extent had an IOERR or something else went
3174		 * wrong we need to return the space for this ordered extent
3175		 * back to the allocator.  We only free the extent in the
3176		 * truncated case if we didn't write out the extent at all.
3177		 *
3178		 * If we made it past insert_reserved_file_extent before we
3179		 * errored out then we don't need to do this as the accounting
3180		 * has already been done.
3181		 */
3182		if ((ret || !logical_len) &&
3183		    clear_reserved_extent &&
3184		    !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
3185		    !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
3186			/*
3187			 * Discard the range before returning it back to the
3188			 * free space pool
3189			 */
3190			if (ret && btrfs_test_opt(fs_info, DISCARD_SYNC))
3191				btrfs_discard_extent(fs_info,
3192						ordered_extent->disk_bytenr,
3193						ordered_extent->disk_num_bytes,
3194						NULL);
3195			btrfs_free_reserved_extent(fs_info,
3196					ordered_extent->disk_bytenr,
3197					ordered_extent->disk_num_bytes, 1);
3198		}
3199	}
3200
3201	/*
3202	 * This needs to be done to make sure anybody waiting knows we are done
3203	 * updating everything for this ordered extent.
3204	 */
3205	btrfs_remove_ordered_extent(inode, ordered_extent);
3206
3207	/* once for us */
3208	btrfs_put_ordered_extent(ordered_extent);
3209	/* once for the tree */
3210	btrfs_put_ordered_extent(ordered_extent);
3211
3212	return ret;
3213}
3214
3215static void finish_ordered_fn(struct btrfs_work *work)
3216{
3217	struct btrfs_ordered_extent *ordered_extent;
3218	ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
3219	btrfs_finish_ordered_io(ordered_extent);
3220}
3221
3222void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
3223					  struct page *page, u64 start,
3224					  u64 end, bool uptodate)
3225{
3226	trace_btrfs_writepage_end_io_hook(inode, start, end, uptodate);
3227
3228	btrfs_mark_ordered_io_finished(inode, page, start, end + 1 - start,
3229				       finish_ordered_fn, uptodate);
3230}
3231
3232/*
3233 * check_data_csum - verify checksum of one sector of uncompressed data
3234 * @inode:	inode
3235 * @bbio:	btrfs_bio which contains the csum
3236 * @bio_offset:	offset of the sector from the beginning of the bio (in bytes)
3237 * @page:	page containing the data to be verified
3238 * @pgoff:	offset inside the page
3239 * @start:	logical offset in the file
3240 *
3241 * The length of the verified range is always one sector.
3242 */
3243static int check_data_csum(struct inode *inode, struct btrfs_bio *bbio,
3244			   u32 bio_offset, struct page *page, u32 pgoff,
3245			   u64 start)
3246{
3247	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3248	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
3249	char *kaddr;
3250	u32 len = fs_info->sectorsize;
3251	const u32 csum_size = fs_info->csum_size;
3252	unsigned int offset_sectors;
3253	u8 *csum_expected;
3254	u8 csum[BTRFS_CSUM_SIZE];
3255
3256	ASSERT(pgoff + len <= PAGE_SIZE);
3257
3258	offset_sectors = bio_offset >> fs_info->sectorsize_bits;
3259	csum_expected = ((u8 *)bbio->csum) + offset_sectors * csum_size;
3260
3261	kaddr = kmap_atomic(page);
3262	shash->tfm = fs_info->csum_shash;
3263
3264	crypto_shash_digest(shash, kaddr + pgoff, len, csum);
3265
3266	if (memcmp(csum, csum_expected, csum_size))
3267		goto zeroit;
3268
3269	kunmap_atomic(kaddr);
3270	return 0;
3271zeroit:
3272	btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected,
3273				    bbio->mirror_num);
3274	if (bbio->device)
3275		btrfs_dev_stat_inc_and_print(bbio->device,
3276					     BTRFS_DEV_STAT_CORRUPTION_ERRS);
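	/* Poison the sector so the bad data cannot be mistaken for valid content. */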
3277	memset(kaddr + pgoff, 1, len);
3278	flush_dcache_page(page);
3279	kunmap_atomic(kaddr);
3280	return -EIO;
3281}
3282
3283/*
3284 * When reads are done, we need to check csums to verify the data is correct.
3285 * If there's a match, we allow the bio to finish.  If not, the code in
3286 * extent_io.c will try to find good copies for us.
3287 *
3288 * @bio_offset:	offset of this range from the beginning of the bio (in bytes)
3289 * @start:	file offset of the range start
3290 * @end:	file offset of the range end (inclusive)
3291 *
3292 * Return a bitmap where bit set means a csum mismatch, and bit not set means
3293 * csum match.
3294 */
3295unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio,
3296				    u32 bio_offset, struct page *page,
3297				    u64 start, u64 end)
3298{
3299	struct inode *inode = page->mapping->host;
3300	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3301	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3302	struct btrfs_root *root = BTRFS_I(inode)->root;
3303	const u32 sectorsize = root->fs_info->sectorsize;
3304	u32 pg_off;
3305	unsigned int result = 0;
3306
3307	if (btrfs_page_test_checked(fs_info, page, start, end + 1 - start)) {
3308		btrfs_page_clear_checked(fs_info, page, start, end + 1 - start);
3309		return 0;
3310	}
3311
3312	/*
3313	 * This only happens for NODATASUM or compressed read.
3314	 * Normally this should be covered by the check above for compressed read
3315	 * or the next check for NODATASUM.  Just do a quicker exit here.
3316	 */
3317	if (bbio->csum == NULL)
3318		return 0;
3319
3320	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
3321		return 0;
3322
3323	if (unlikely(test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)))
3324		return 0;
3325
3326	ASSERT(page_offset(page) <= start &&
3327	       end <= page_offset(page) + PAGE_SIZE - 1);
3328	for (pg_off = offset_in_page(start);
3329	     pg_off < offset_in_page(end);
3330	     pg_off += sectorsize, bio_offset += sectorsize) {
3331		u64 file_offset = pg_off + page_offset(page);
3332		int ret;
3333
3334		if (btrfs_is_data_reloc_root(root) &&
3335		    test_range_bit(io_tree, file_offset,
3336				   file_offset + sectorsize - 1,
3337				   EXTENT_NODATASUM, 1, NULL)) {
3338			/* Skip the range without csum for data reloc inode */
3339			clear_extent_bits(io_tree, file_offset,
3340					  file_offset + sectorsize - 1,
3341					  EXTENT_NODATASUM);
3342			continue;
3343		}
3344		ret = check_data_csum(inode, bbio, bio_offset, page, pg_off,
3345				      page_offset(page) + pg_off);
3346		if (ret < 0) {
3347			const int nr_bit = (pg_off - offset_in_page(start)) >>
3348				     root->fs_info->sectorsize_bits;
3349
3350			result |= (1U << nr_bit);
3351		}
3352	}
3353	return result;
3354}
3355
3356/*
3357 * btrfs_add_delayed_iput - perform a delayed iput on @inode
3358 *
3359 * @inode: The inode we want to perform iput on
3360 *
3361 * This function uses the generic vfs_inode::i_count to track whether we should
3362 * just decrement it (in case it's > 1) or if this is the last iput then link
 * the inode to the delayed iput machinery. Delayed iputs are processed at
 * transaction commit time, at superblock commit, and by the cleaner kthread.
3365 */
3366void btrfs_add_delayed_iput(struct inode *inode)
3367{
3368	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3369	struct btrfs_inode *binode = BTRFS_I(inode);
3370
3371	if (atomic_add_unless(&inode->i_count, -1, 1))
3372		return;
3373
3374	atomic_inc(&fs_info->nr_delayed_iputs);
3375	spin_lock(&fs_info->delayed_iput_lock);
3376	ASSERT(list_empty(&binode->delayed_iput));
3377	list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs);
3378	spin_unlock(&fs_info->delayed_iput_lock);
3379	if (!test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags))
3380		wake_up_process(fs_info->cleaner_kthread);
3381}
3382
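/*
 * Run a single delayed iput.  Called with delayed_iput_lock held; the lock is
 * dropped around the final iput() and re-acquired before returning.
 */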
3383static void run_delayed_iput_locked(struct btrfs_fs_info *fs_info,
3384				    struct btrfs_inode *inode)
3385{
3386	list_del_init(&inode->delayed_iput);
3387	spin_unlock(&fs_info->delayed_iput_lock);
3388	iput(&inode->vfs_inode);
3389	if (atomic_dec_and_test(&fs_info->nr_delayed_iputs))
3390		wake_up(&fs_info->delayed_iputs_wait);
3391	spin_lock(&fs_info->delayed_iput_lock);
3392}
3393
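/*
 * Run the delayed iput of one inode if it is queued.  The lockless
 * list_empty() check avoids taking delayed_iput_lock in the common case where
 * no delayed iput is pending for the inode.
 */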
3394static void btrfs_run_delayed_iput(struct btrfs_fs_info *fs_info,
3395				   struct btrfs_inode *inode)
3396{
3397	if (!list_empty(&inode->delayed_iput)) {
3398		spin_lock(&fs_info->delayed_iput_lock);
3399		if (!list_empty(&inode->delayed_iput))
3400			run_delayed_iput_locked(fs_info, inode);
3401		spin_unlock(&fs_info->delayed_iput_lock);
3402	}
3403}
3404
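/* Process all currently queued delayed iputs, rescheduling as needed. */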
3405void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
3406{
3408	spin_lock(&fs_info->delayed_iput_lock);
3409	while (!list_empty(&fs_info->delayed_iputs)) {
3410		struct btrfs_inode *inode;
3411
3412		inode = list_first_entry(&fs_info->delayed_iputs,
3413				struct btrfs_inode, delayed_iput);
3414		run_delayed_iput_locked(fs_info, inode);
3415		cond_resched_lock(&fs_info->delayed_iput_lock);
3416	}
3417	spin_unlock(&fs_info->delayed_iput_lock);
3418}
3419
/**
 * btrfs_wait_on_delayed_iputs - wait for all delayed iputs to finish
 *
 * @fs_info:  the filesystem
 *
 * This waits, in killable mode, on any delayed iputs that are currently
 * pending.  Once they have all run we return, unless we are killed, in which
 * case we return -EINTR.  This helps user operations like fallocate that
 * might otherwise get blocked on the iputs.
 *
 * Return -EINTR if we were killed, 0 once no delayed iputs are pending.
 */
3432int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info)
3433{
3434	int ret = wait_event_killable(fs_info->delayed_iputs_wait,
3435			atomic_read(&fs_info->nr_delayed_iputs) == 0);
3436	if (ret)
3437		return -EINTR;
3438	return 0;
3439}
3440
3441/*
3442 * This creates an orphan entry for the given inode in case something goes wrong
3443 * in the middle of an unlink.
3444 */
3445int btrfs_orphan_add(struct btrfs_trans_handle *trans,
3446		     struct btrfs_inode *inode)
3447{
3448	int ret;
3449
3450	ret = btrfs_insert_orphan_item(trans, inode->root, btrfs_ino(inode));
3451	if (ret && ret != -EEXIST) {
3452		btrfs_abort_transaction(trans, ret);
3453		return ret;
3454	}
3455
3456	return 0;
3457}
3458
3459/*
3460 * We have done the delete so we can go ahead and remove the orphan item for
3461 * this particular inode.
3462 */
3463static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
3464			    struct btrfs_inode *inode)
3465{
3466	return btrfs_del_orphan_item(trans, inode->root, btrfs_ino(inode));
3467}
3468
3469/*
 * This cleans up any orphan items that may be left over from the last use of
 * this root.
3472 */
3473int btrfs_orphan_cleanup(struct btrfs_root *root)
3474{
3475	struct btrfs_fs_info *fs_info = root->fs_info;
3476	struct btrfs_path *path;
3477	struct extent_buffer *leaf;
3478	struct btrfs_key key, found_key;
3479	struct btrfs_trans_handle *trans;
3480	struct inode *inode;
3481	u64 last_objectid = 0;
3482	int ret = 0, nr_unlink = 0;
3483
3484	if (test_and_set_bit(BTRFS_ROOT_ORPHAN_CLEANUP, &root->state))
3485		return 0;
3486
3487	path = btrfs_alloc_path();
3488	if (!path) {
3489		ret = -ENOMEM;
3490		goto out;
3491	}
3492	path->reada = READA_BACK;
3493
3494	key.objectid = BTRFS_ORPHAN_OBJECTID;
3495	key.type = BTRFS_ORPHAN_ITEM_KEY;
3496	key.offset = (u64)-1;
3497
3498	while (1) {
3499		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3500		if (ret < 0)
3501			goto out;
3502
		/*
		 * ret == 0 means we found exactly what we were searching for,
		 * which is weird but possible.  Only adjust the path if we
		 * didn't find the key, then check whether what we landed on
		 * matches.
		 */
3508		if (ret > 0) {
3509			ret = 0;
3510			if (path->slots[0] == 0)
3511				break;
3512			path->slots[0]--;
3513		}
3514
3515		/* pull out the item */
3516		leaf = path->nodes[0];
3517		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3518
3519		/* make sure the item matches what we want */
3520		if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
3521			break;
3522		if (found_key.type != BTRFS_ORPHAN_ITEM_KEY)
3523			break;
3524
3525		/* release the path since we're done with it */
3526		btrfs_release_path(path);
3527
		/*
		 * This is basically btrfs_lookup() without the root-crossing
		 * part.  The inode number is stored in the offset of the
		 * orphan item.
		 */
3533
3534		if (found_key.offset == last_objectid) {
3535			btrfs_err(fs_info,
3536				  "Error removing orphan entry, stopping orphan cleanup");
3537			ret = -EINVAL;
3538			goto out;
3539		}
3540
3541		last_objectid = found_key.offset;
3542
3543		found_key.objectid = found_key.offset;
3544		found_key.type = BTRFS_INODE_ITEM_KEY;
3545		found_key.offset = 0;
3546		inode = btrfs_iget(fs_info->sb, last_objectid, root);
3547		ret = PTR_ERR_OR_ZERO(inode);
3548		if (ret && ret != -ENOENT)
3549			goto out;
3550
3551		if (ret == -ENOENT && root == fs_info->tree_root) {
3552			struct btrfs_root *dead_root;
3553			int is_dead_root = 0;
3554
3555			/*
3556			 * This is an orphan in the tree root. Currently these
3557			 * could come from 2 sources:
3558			 *  a) a root (snapshot/subvolume) deletion in progress
3559			 *  b) a free space cache inode
3560			 * We need to distinguish those two, as the orphan item
3561			 * for a root must not get deleted before the deletion
3562			 * of the snapshot/subvolume's tree completes.
3563			 *
3564			 * btrfs_find_orphan_roots() ran before us, which has
3565			 * found all deleted roots and loaded them into
3566			 * fs_info->fs_roots_radix. So here we can find if an
3567			 * orphan item corresponds to a deleted root by looking
3568			 * up the root from that radix tree.
3569			 */
3570
3571			spin_lock(&fs_info->fs_roots_radix_lock);
3572			dead_root = radix_tree_lookup(&fs_info->fs_roots_radix,
3573							 (unsigned long)found_key.objectid);
3574			if (dead_root && btrfs_root_refs(&dead_root->root_item) == 0)
3575				is_dead_root = 1;
3576			spin_unlock(&fs_info->fs_roots_radix_lock);
3577
3578			if (is_dead_root) {
3579				/* prevent this orphan from being found again */
3580				key.offset = found_key.objectid - 1;
3581				continue;
3582			}
3583
3584		}
3585
3586		/*
3587		 * If we have an inode with links, there are a couple of
3588		 * possibilities:
3589		 *
3590		 * 1. We were halfway through creating fsverity metadata for the
3591		 * file. In that case, the orphan item represents incomplete
3592		 * fsverity metadata which must be cleaned up with
3593		 * btrfs_drop_verity_items and deleting the orphan item.
		 *
3595		 * 2. Old kernels (before v3.12) used to create an
3596		 * orphan item for truncate indicating that there were possibly
3597		 * extent items past i_size that needed to be deleted. In v3.12,
3598		 * truncate was changed to update i_size in sync with the extent
3599		 * items, but the (useless) orphan item was still created. Since
3600		 * v4.18, we don't create the orphan item for truncate at all.
3601		 *
3602		 * So, this item could mean that we need to do a truncate, but
3603		 * only if this filesystem was last used on a pre-v3.12 kernel
3604		 * and was not cleanly unmounted. The odds of that are quite
3605		 * slim, and it's a pain to do the truncate now, so just delete
3606		 * the orphan item.
3607		 *
3608		 * It's also possible that this orphan item was supposed to be
3609		 * deleted but wasn't. The inode number may have been reused,
3610		 * but either way, we can delete the orphan item.
3611		 */
3612		if (ret == -ENOENT || inode->i_nlink) {
3613			if (!ret) {
3614				ret = btrfs_drop_verity_items(BTRFS_I(inode));
3615				iput(inode);
3616				if (ret)
3617					goto out;
3618			}
3619			trans = btrfs_start_transaction(root, 1);
3620			if (IS_ERR(trans)) {
3621				ret = PTR_ERR(trans);
3622				goto out;
3623			}
3624			btrfs_debug(fs_info, "auto deleting %Lu",
3625				    found_key.objectid);
3626			ret = btrfs_del_orphan_item(trans, root,
3627						    found_key.objectid);
3628			btrfs_end_transaction(trans);
3629			if (ret)
3630				goto out;
3631			continue;
3632		}
3633
3634		nr_unlink++;
3635
3636		/* this will do delete_inode and everything for us */
3637		iput(inode);
3638	}
3639	/* release the path since we're done with it */
3640	btrfs_release_path(path);
3641
3642	if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {
3643		trans = btrfs_join_transaction(root);
3644		if (!IS_ERR(trans))
3645			btrfs_end_transaction(trans);
3646	}
3647
3648	if (nr_unlink)
3649		btrfs_debug(fs_info, "unlinked %d orphans", nr_unlink);
3650
3651out:
3652	if (ret)
3653		btrfs_err(fs_info, "could not do orphan cleanup %d", ret);
3654	btrfs_free_path(path);
3655	return ret;
3656}
3657
3658/*
3659 * very simple check to peek ahead in the leaf looking for xattrs.  If we
3660 * don't find any xattrs, we know there can't be any acls.
3661 *
3662 * slot is the slot the inode is in, objectid is the objectid of the inode
3663 */
3664static noinline int acls_after_inode_item(struct extent_buffer *leaf,
3665					  int slot, u64 objectid,
3666					  int *first_xattr_slot)
3667{
3668	u32 nritems = btrfs_header_nritems(leaf);
3669	struct btrfs_key found_key;
3670	static u64 xattr_access = 0;
3671	static u64 xattr_default = 0;
3672	int scanned = 0;
3673
3674	if (!xattr_access) {
3675		xattr_access = btrfs_name_hash(XATTR_NAME_POSIX_ACL_ACCESS,
3676					strlen(XATTR_NAME_POSIX_ACL_ACCESS));
3677		xattr_default = btrfs_name_hash(XATTR_NAME_POSIX_ACL_DEFAULT,
3678					strlen(XATTR_NAME_POSIX_ACL_DEFAULT));
3679	}
3680
3681	slot++;
3682	*first_xattr_slot = -1;
3683	while (slot < nritems) {
3684		btrfs_item_key_to_cpu(leaf, &found_key, slot);
3685
3686		/* we found a different objectid, there must not be acls */
3687		if (found_key.objectid != objectid)
3688			return 0;
3689
3690		/* we found an xattr, assume we've got an acl */
3691		if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
3692			if (*first_xattr_slot == -1)
3693				*first_xattr_slot = slot;
3694			if (found_key.offset == xattr_access ||
3695			    found_key.offset == xattr_default)
3696				return 1;
3697		}
3698
3699		/*
3700		 * we found a key greater than an xattr key, there can't
3701		 * be any acls later on
3702		 */
3703		if (found_key.type > BTRFS_XATTR_ITEM_KEY)
3704			return 0;
3705
3706		slot++;
3707		scanned++;
3708
3709		/*
3710		 * it goes inode, inode backrefs, xattrs, extents,
3711		 * so if there are a ton of hard links to an inode there can
3712		 * be a lot of backrefs.  Don't waste time searching too hard,
3713		 * this is just an optimization
3714		 */
3715		if (scanned >= 8)
3716			break;
3717	}
3718	/* we hit the end of the leaf before we found an xattr or
3719	 * something larger than an xattr.  We have to assume the inode
3720	 * has acls
3721	 */
3722	if (*first_xattr_slot == -1)
3723		*first_xattr_slot = slot;
3724	return 1;
3725}
3726
3727/*
3728 * read an inode from the btree into the in-memory inode
3729 */
3730static int btrfs_read_locked_inode(struct inode *inode,
3731				   struct btrfs_path *in_path)
3732{
3733	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3734	struct btrfs_path *path = in_path;
3735	struct extent_buffer *leaf;
3736	struct btrfs_inode_item *inode_item;
3737	struct btrfs_root *root = BTRFS_I(inode)->root;
3738	struct btrfs_key location;
3739	unsigned long ptr;
3740	int maybe_acls;
3741	u32 rdev;
3742	int ret;
3743	bool filled = false;
3744	int first_xattr_slot;
3745
3746	ret = btrfs_fill_inode(inode, &rdev);
3747	if (!ret)
3748		filled = true;
3749
3750	if (!path) {
3751		path = btrfs_alloc_path();
3752		if (!path)
3753			return -ENOMEM;
3754	}
3755
3756	memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
3757
3758	ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
3759	if (ret) {
3760		if (path != in_path)
3761			btrfs_free_path(path);
3762		return ret;
3763	}
3764
3765	leaf = path->nodes[0];
3766
3767	if (filled)
3768		goto cache_index;
3769
3770	inode_item = btrfs_item_ptr(leaf, path->slots[0],
3771				    struct btrfs_inode_item);
3772	inode->i_mode = btrfs_inode_mode(leaf, inode_item);
3773	set_nlink(inode, btrfs_inode_nlink(leaf, inode_item));
3774	i_uid_write(inode, btrfs_inode_uid(leaf, inode_item));
3775	i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));
3776	btrfs_i_size_write(BTRFS_I(inode), btrfs_inode_size(leaf, inode_item));
3777	btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0,
3778			round_up(i_size_read(inode), fs_info->sectorsize));
3779
3780	inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime);
3781	inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime);
3782
3783	inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->mtime);
3784	inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->mtime);
3785
3786	inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->ctime);
3787	inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->ctime);
3788
3789	BTRFS_I(inode)->i_otime.tv_sec =
3790		btrfs_timespec_sec(leaf, &inode_item->otime);
3791	BTRFS_I(inode)->i_otime.tv_nsec =
3792		btrfs_timespec_nsec(leaf, &inode_item->otime);
3793
3794	inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
3795	BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
3796	BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);
3797
3798	inode_set_iversion_queried(inode,
3799				   btrfs_inode_sequence(leaf, inode_item));
3800	inode->i_generation = BTRFS_I(inode)->generation;
3801	inode->i_rdev = 0;
3802	rdev = btrfs_inode_rdev(leaf, inode_item);
3803
3804	BTRFS_I(inode)->index_cnt = (u64)-1;
3805	btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item),
3806				&BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags);
3807
3808cache_index:
3809	/*
3810	 * If we were modified in the current generation and evicted from memory
3811	 * and then re-read we need to do a full sync since we don't have any
3812	 * idea about which extents were modified before we were evicted from
3813	 * cache.
3814	 *
3815	 * This is required for both inode re-read from disk and delayed inode
3816	 * in delayed_nodes_tree.
3817	 */
3818	if (BTRFS_I(inode)->last_trans == fs_info->generation)
3819		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3820			&BTRFS_I(inode)->runtime_flags);
3821
3822	/*
3823	 * We don't persist the id of the transaction where an unlink operation
3824	 * against the inode was last made. So here we assume the inode might
3825	 * have been evicted, and therefore the exact value of last_unlink_trans
3826	 * lost, and set it to last_trans to avoid metadata inconsistencies
3827	 * between the inode and its parent if the inode is fsync'ed and the log
3828	 * replayed. For example, in the scenario:
3829	 *
3830	 * touch mydir/foo
3831	 * ln mydir/foo mydir/bar
3832	 * sync
3833	 * unlink mydir/bar
3834	 * echo 2 > /proc/sys/vm/drop_caches   # evicts inode
3835	 * xfs_io -c fsync mydir/foo
3836	 * <power failure>
3837	 * mount fs, triggers fsync log replay
3838	 *
3839	 * We must make sure that when we fsync our inode foo we also log its
3840	 * parent inode, otherwise after log replay the parent still has the
3841	 * dentry with the "bar" name but our inode foo has a link count of 1
3842	 * and doesn't have an inode ref with the name "bar" anymore.
3843	 *
3844	 * Setting last_unlink_trans to last_trans is a pessimistic approach,
3845	 * but it guarantees correctness at the expense of occasional full
3846	 * transaction commits on fsync if our inode is a directory, or if our
3847	 * inode is not a directory, logging its parent unnecessarily.
3848	 */
3849	BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans;
3850
3851	/*
3852	 * Same logic as for last_unlink_trans. We don't persist the generation
3853	 * of the last transaction where this inode was used for a reflink
3854	 * operation, so after eviction and reloading the inode we must be
3855	 * pessimistic and assume the last transaction that modified the inode.
3856	 */
3857	BTRFS_I(inode)->last_reflink_trans = BTRFS_I(inode)->last_trans;
3858
3859	path->slots[0]++;
3860	if (inode->i_nlink != 1 ||
3861	    path->slots[0] >= btrfs_header_nritems(leaf))
3862		goto cache_acl;
3863
3864	btrfs_item_key_to_cpu(leaf, &location, path->slots[0]);
3865	if (location.objectid != btrfs_ino(BTRFS_I(inode)))
3866		goto cache_acl;
3867
3868	ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
3869	if (location.type == BTRFS_INODE_REF_KEY) {
3870		struct btrfs_inode_ref *ref;
3871
3872		ref = (struct btrfs_inode_ref *)ptr;
3873		BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref);
3874	} else if (location.type == BTRFS_INODE_EXTREF_KEY) {
3875		struct btrfs_inode_extref *extref;
3876
3877		extref = (struct btrfs_inode_extref *)ptr;
3878		BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf,
3879								     extref);
3880	}
3881cache_acl:
3882	/*
3883	 * try to precache a NULL acl entry for files that don't have
3884	 * any xattrs or acls
3885	 */
3886	maybe_acls = acls_after_inode_item(leaf, path->slots[0],
3887			btrfs_ino(BTRFS_I(inode)), &first_xattr_slot);
3888	if (first_xattr_slot != -1) {
3889		path->slots[0] = first_xattr_slot;
3890		ret = btrfs_load_inode_props(inode, path);
3891		if (ret)
3892			btrfs_err(fs_info,
3893				  "error loading props for ino %llu (root %llu): %d",
3894				  btrfs_ino(BTRFS_I(inode)),
3895				  root->root_key.objectid, ret);
3896	}
3897	if (path != in_path)
3898		btrfs_free_path(path);
3899
3900	if (!maybe_acls)
3901		cache_no_acl(inode);
3902
3903	switch (inode->i_mode & S_IFMT) {
3904	case S_IFREG:
3905		inode->i_mapping->a_ops = &btrfs_aops;
3906		inode->i_fop = &btrfs_file_operations;
3907		inode->i_op = &btrfs_file_inode_operations;
3908		break;
3909	case S_IFDIR:
3910		inode->i_fop = &btrfs_dir_file_operations;
3911		inode->i_op = &btrfs_dir_inode_operations;
3912		break;
3913	case S_IFLNK:
3914		inode->i_op = &btrfs_symlink_inode_operations;
3915		inode_nohighmem(inode);
3916		inode->i_mapping->a_ops = &btrfs_aops;
3917		break;
3918	default:
3919		inode->i_op = &btrfs_special_inode_operations;
3920		init_special_inode(inode, inode->i_mode, rdev);
3921		break;
3922	}
3923
3924	btrfs_sync_inode_flags_to_i_flags(inode);
3925	return 0;
3926}
3927
3928/*
3929 * given a leaf and an inode, copy the inode fields into the leaf
3930 */
3931static void fill_inode_item(struct btrfs_trans_handle *trans,
3932			    struct extent_buffer *leaf,
3933			    struct btrfs_inode_item *item,
3934			    struct inode *inode)
3935{
3936	struct btrfs_map_token token;
3937	u64 flags;
3938
3939	btrfs_init_map_token(&token, leaf);
3940
3941	btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
3942	btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
3943	btrfs_set_token_inode_size(&token, item, BTRFS_I(inode)->disk_i_size);
3944	btrfs_set_token_inode_mode(&token, item, inode->i_mode);
3945	btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
3946
3947	btrfs_set_token_timespec_sec(&token, &item->atime,
3948				     inode->i_atime.tv_sec);
3949	btrfs_set_token_timespec_nsec(&token, &item->atime,
3950				      inode->i_atime.tv_nsec);
3951
3952	btrfs_set_token_timespec_sec(&token, &item->mtime,
3953				     inode->i_mtime.tv_sec);
3954	btrfs_set_token_timespec_nsec(&token, &item->mtime,
3955				      inode->i_mtime.tv_nsec);
3956
3957	btrfs_set_token_timespec_sec(&token, &item->ctime,
3958				     inode->i_ctime.tv_sec);
3959	btrfs_set_token_timespec_nsec(&token, &item->ctime,
3960				      inode->i_ctime.tv_nsec);
3961
3962	btrfs_set_token_timespec_sec(&token, &item->otime,
3963				     BTRFS_I(inode)->i_otime.tv_sec);
3964	btrfs_set_token_timespec_nsec(&token, &item->otime,
3965				      BTRFS_I(inode)->i_otime.tv_nsec);
3966
3967	btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode));
3968	btrfs_set_token_inode_generation(&token, item,
3969					 BTRFS_I(inode)->generation);
3970	btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
3971	btrfs_set_token_inode_transid(&token, item, trans->transid);
3972	btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
3973	flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
3974					  BTRFS_I(inode)->ro_flags);
3975	btrfs_set_token_inode_flags(&token, item, flags);
3976	btrfs_set_token_inode_block_group(&token, item, 0);
3977}
3978
3979/*
3980 * copy everything in the in-memory inode into the btree.
3981 */
3982static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
3983				struct btrfs_root *root,
3984				struct btrfs_inode *inode)
3985{
3986	struct btrfs_inode_item *inode_item;
3987	struct btrfs_path *path;
3988	struct extent_buffer *leaf;
3989	int ret;
3990
3991	path = btrfs_alloc_path();
3992	if (!path)
3993		return -ENOMEM;
3994
3995	ret = btrfs_lookup_inode(trans, root, path, &inode->location, 1);
3996	if (ret) {
3997		if (ret > 0)
3998			ret = -ENOENT;
3999		goto failed;
4000	}
4001
4002	leaf = path->nodes[0];
4003	inode_item = btrfs_item_ptr(leaf, path->slots[0],
4004				    struct btrfs_inode_item);
4005
4006	fill_inode_item(trans, leaf, inode_item, &inode->vfs_inode);
4007	btrfs_mark_buffer_dirty(leaf);
4008	btrfs_set_inode_last_trans(trans, inode);
4009	ret = 0;
4010failed:
4011	btrfs_free_path(path);
4012	return ret;
4013}
4014
/*
 * Copy everything in the in-memory inode into the btree, using the delayed
 * inode machinery when it is safe to do so.
 */
4018noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
4019				struct btrfs_root *root,
4020				struct btrfs_inode *inode)
4021{
4022	struct btrfs_fs_info *fs_info = root->fs_info;
4023	int ret;
4024
4025	/*
4026	 * If the inode is a free space inode, we can deadlock during commit
4027	 * if we put it into the delayed code.
4028	 *
4029	 * The data relocation inode should also be directly updated
4030	 * without delay
4031	 */
4032	if (!btrfs_is_free_space_inode(inode)
4033	    && !btrfs_is_data_reloc_root(root)
4034	    && !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
4035		btrfs_update_root_times(trans, root);
4036
4037		ret = btrfs_delayed_update_inode(trans, root, inode);
4038		if (!ret)
4039			btrfs_set_inode_last_trans(trans, inode);
4040		return ret;
4041	}
4042
4043	return btrfs_update_inode_item(trans, root, inode);
4044}
4045
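/*
 * Update the inode, but fall back to updating the inode item in the tree
 * directly if the delayed inode update fails with -ENOSPC.
 */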
4046int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
4047				struct btrfs_root *root, struct btrfs_inode *inode)
4048{
4049	int ret;
4050
4051	ret = btrfs_update_inode(trans, root, inode);
4052	if (ret == -ENOSPC)
4053		return btrfs_update_inode_item(trans, root, inode);
4054	return ret;
4055}
4056
4057/*
4058 * unlink helper that gets used here in inode.c and in the tree logging
 * recovery code.  It removes a link in a directory with a given name, and
 * also drops the backref from the inode to the directory.
4061 */
4062static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
4063				struct btrfs_inode *dir,
4064				struct btrfs_inode *inode,
4065				const char *name, int name_len)
4066{
4067	struct btrfs_root *root = dir->root;
4068	struct btrfs_fs_info *fs_info = root->fs_info;
4069	struct btrfs_path *path;
4070	int ret = 0;
4071	struct btrfs_dir_item *di;
4072	u64 index;
4073	u64 ino = btrfs_ino(inode);
4074	u64 dir_ino = btrfs_ino(dir);
4075
4076	path = btrfs_alloc_path();
4077	if (!path) {
4078		ret = -ENOMEM;
4079		goto out;
4080	}
4081
4082	di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
4083				    name, name_len, -1);
4084	if (IS_ERR_OR_NULL(di)) {
4085		ret = di ? PTR_ERR(di) : -ENOENT;
4086		goto err;
4087	}
4088	ret = btrfs_delete_one_dir_name(trans, root, path, di);
4089	if (ret)
4090		goto err;
4091	btrfs_release_path(path);
4092
	/*
	 * If we don't have a dir index, we have to get it by looking up the
	 * inode ref.  Since we then have the inode ref at hand, remove it
	 * directly; there is no point in a delayed deletion.
	 *
	 * But if we do have the dir index, there is no need to search for the
	 * inode ref to get it.  Since the inode ref is close to the inode
	 * item, it is better to delay its deletion and do it when we update
	 * the inode item.
	 */
4103	if (inode->dir_index) {
4104		ret = btrfs_delayed_delete_inode_ref(inode);
4105		if (!ret) {
4106			index = inode->dir_index;
4107			goto skip_backref;
4108		}
4109	}
4110
4111	ret = btrfs_del_inode_ref(trans, root, name, name_len, ino,
4112				  dir_ino, &index);
4113	if (ret) {
4114		btrfs_info(fs_info,
4115			"failed to delete reference to %.*s, inode %llu parent %llu",
4116			name_len, name, ino, dir_ino);
4117		btrfs_abort_transaction(trans, ret);
4118		goto err;
4119	}
4120skip_backref:
4121	ret = btrfs_delete_delayed_dir_index(trans, dir, index);
4122	if (ret) {
4123		btrfs_abort_transaction(trans, ret);
4124		goto err;
4125	}
4126
4127	btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode,
4128				   dir_ino);
4129	btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir, index);
4130
4131	/*
4132	 * If we have a pending delayed iput we could end up with the final iput
4133	 * being run in btrfs-cleaner context.  If we have enough of these built
4134	 * up we can end up burning a lot of time in btrfs-cleaner without any
4135	 * way to throttle the unlinks.  Since we're currently holding a ref on
4136	 * the inode we can run the delayed iput here without any issues as the
4137	 * final iput won't be done until after we drop the ref we're currently
4138	 * holding.
4139	 */
4140	btrfs_run_delayed_iput(fs_info, inode);
4141err:
4142	btrfs_free_path(path);
4143	if (ret)
4144		goto out;
4145
4146	btrfs_i_size_write(dir, dir->vfs_inode.i_size - name_len * 2);
4147	inode_inc_iversion(&inode->vfs_inode);
4148	inode_inc_iversion(&dir->vfs_inode);
4149	inode->vfs_inode.i_ctime = dir->vfs_inode.i_mtime =
4150		dir->vfs_inode.i_ctime = current_time(&inode->vfs_inode);
4151	ret = btrfs_update_inode(trans, root, dir);
4152out:
4153	return ret;
4154}
4155
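/*
 * Unlink a name from a directory, drop the inode's link count and update the
 * inode item afterwards.
 */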
4156int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
4157		       struct btrfs_inode *dir, struct btrfs_inode *inode,
4158		       const char *name, int name_len)
4159{
4160	int ret;
4161	ret = __btrfs_unlink_inode(trans, dir, inode, name, name_len);
4162	if (!ret) {
4163		drop_nlink(&inode->vfs_inode);
4164		ret = btrfs_update_inode(trans, inode->root, inode);
4165	}
4166	return ret;
4167}
4168
/*
 * Helper to start a transaction for unlink and rmdir.
 *
 * Unlink and rmdir are special in btrfs: they do not always free space, so
 * if we cannot make our reservations the normal way, see if there is enough
 * slack room in the global reserve to migrate from; otherwise we cannot
 * allow the unlink to occur.
 */
4176 */
4177static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
4178{
4179	struct btrfs_root *root = BTRFS_I(dir)->root;
4180
4181	/*
4182	 * 1 for the possible orphan item
4183	 * 1 for the dir item
4184	 * 1 for the dir index
4185	 * 1 for the inode ref
4186	 * 1 for the inode
4187	 */
4188	return btrfs_start_transaction_fallback_global_rsv(root, 5);
4189}
4190
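/*
 * Unlink a regular directory entry.  If this drops the last link, an orphan
 * item is added so the inode can be cleaned up if we crash before eviction
 * finishes removing it.
 */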
4191static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
4192{
4193	struct btrfs_trans_handle *trans;
4194	struct inode *inode = d_inode(dentry);
4195	int ret;
4196
4197	trans = __unlink_start_trans(dir);
4198	if (IS_ERR(trans))
4199		return PTR_ERR(trans);
4200
4201	btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
4202			0);
4203
4204	ret = btrfs_unlink_inode(trans, BTRFS_I(dir),
4205			BTRFS_I(d_inode(dentry)), dentry->d_name.name,
4206			dentry->d_name.len);
4207	if (ret)
4208		goto out;
4209
4210	if (inode->i_nlink == 0) {
4211		ret = btrfs_orphan_add(trans, BTRFS_I(inode));
4212		if (ret)
4213			goto out;
4214	}
4215
4216out:
4217	btrfs_end_transaction(trans);
4218	btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info);
4219	return ret;
4220}
4221
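/*
 * Remove the directory entry pointing at a subvolume (or at the empty
 * subvolume placeholder), delete the matching root ref or dir index item, and
 * update the parent directory.
 */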
4222static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
4223			       struct inode *dir, struct dentry *dentry)
4224{
4225	struct btrfs_root *root = BTRFS_I(dir)->root;
4226	struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
4227	struct btrfs_path *path;
4228	struct extent_buffer *leaf;
4229	struct btrfs_dir_item *di;
4230	struct btrfs_key key;
4231	const char *name = dentry->d_name.name;
4232	int name_len = dentry->d_name.len;
4233	u64 index;
4234	int ret;
4235	u64 objectid;
4236	u64 dir_ino = btrfs_ino(BTRFS_I(dir));
4237
4238	if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) {
4239		objectid = inode->root->root_key.objectid;
4240	} else if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
4241		objectid = inode->location.objectid;
4242	} else {
4243		WARN_ON(1);
4244		return -EINVAL;
4245	}
4246
4247	path = btrfs_alloc_path();
4248	if (!path)
4249		return -ENOMEM;
4250
4251	di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
4252				   name, name_len, -1);
4253	if (IS_ERR_OR_NULL(di)) {
4254		ret = di ? PTR_ERR(di) : -ENOENT;
4255		goto out;
4256	}
4257
4258	leaf = path->nodes[0];
4259	btrfs_dir_item_key_to_cpu(leaf, di, &key);
4260	WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
4261	ret = btrfs_delete_one_dir_name(trans, root, path, di);
4262	if (ret) {
4263		btrfs_abort_transaction(trans, ret);
4264		goto out;
4265	}
4266	btrfs_release_path(path);
4267
4268	/*
4269	 * This is a placeholder inode for a subvolume we didn't have a
4270	 * reference to at the time of the snapshot creation.  In the meantime
4271	 * we could have renamed the real subvol link into our snapshot, so
4272	 * depending on btrfs_del_root_ref to return -ENOENT here is incorrect.
4273	 * Instead simply lookup the dir_index_item for this entry so we can
4274	 * remove it.  Otherwise we know we have a ref to the root and we can
4275	 * call btrfs_del_root_ref, and it _shouldn't_ fail.
4276	 */
4277	if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
4278		di = btrfs_search_dir_index_item(root, path, dir_ino,
4279						 name, name_len);
4280		if (IS_ERR_OR_NULL(di)) {
4281			if (!di)
4282				ret = -ENOENT;
4283			else
4284				ret = PTR_ERR(di);
4285			btrfs_abort_transaction(trans, ret);
4286			goto out;
4287		}
4288
4289		leaf = path->nodes[0];
4290		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4291		index = key.offset;
4292		btrfs_release_path(path);
4293	} else {
4294		ret = btrfs_del_root_ref(trans, objectid,
4295					 root->root_key.objectid, dir_ino,
4296					 &index, name, name_len);
4297		if (ret) {
4298			btrfs_abort_transaction(trans, ret);
4299			goto out;
4300		}
4301	}
4302
4303	ret = btrfs_delete_delayed_dir_index(trans, BTRFS_I(dir), index);
4304	if (ret) {
4305		btrfs_abort_transaction(trans, ret);
4306		goto out;
4307	}
4308
4309	btrfs_i_size_write(BTRFS_I(dir), dir->i_size - name_len * 2);
4310	inode_inc_iversion(dir);
4311	dir->i_mtime = dir->i_ctime = current_time(dir);
4312	ret = btrfs_update_inode_fallback(trans, root, BTRFS_I(dir));
4313	if (ret)
4314		btrfs_abort_transaction(trans, ret);
4315out:
4316	btrfs_free_path(path);
4317	return ret;
4318}
4319
4320/*
4321 * Helper to check if the subvolume references other subvolumes or if it's
 * the default subvolume.
4323 */
4324static noinline int may_destroy_subvol(struct btrfs_root *root)
4325{
4326	struct btrfs_fs_info *fs_info = root->fs_info;
4327	struct btrfs_path *path;
4328	struct btrfs_dir_item *di;
4329	struct btrfs_key key;
4330	u64 dir_id;
4331	int ret;
4332
4333	path = btrfs_alloc_path();
4334	if (!path)
4335		return -ENOMEM;
4336
4337	/* Make sure this root isn't set as the default subvol */
4338	dir_id = btrfs_super_root_dir(fs_info->super_copy);
4339	di = btrfs_lookup_dir_item(NULL, fs_info->tree_root, path,
4340				   dir_id, "default", 7, 0);
4341	if (di && !IS_ERR(di)) {
4342		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
4343		if (key.objectid == root->root_key.objectid) {
4344			ret = -EPERM;
4345			btrfs_err(fs_info,
4346				  "deleting default subvolume %llu is not allowed",
4347				  key.objectid);
4348			goto out;
4349		}
4350		btrfs_release_path(path);
4351	}
4352
4353	key.objectid = root->root_key.objectid;
4354	key.type = BTRFS_ROOT_REF_KEY;
4355	key.offset = (u64)-1;
4356
4357	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
4358	if (ret < 0)
4359		goto out;
4360	BUG_ON(ret == 0);
4361
4362	ret = 0;
4363	if (path->slots[0] > 0) {
4364		path->slots[0]--;
4365		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
4366		if (key.objectid == root->root_key.objectid &&
4367		    key.type == BTRFS_ROOT_REF_KEY)
4368			ret = -ENOTEMPTY;
4369	}
4370out:
4371	btrfs_free_path(path);
4372	return ret;
4373}
4374
4375/* Delete all dentries for inodes belonging to the root */
4376static void btrfs_prune_dentries(struct btrfs_root *root)
4377{
4378	struct btrfs_fs_info *fs_info = root->fs_info;
4379	struct rb_node *node;
4380	struct rb_node *prev;
4381	struct btrfs_inode *entry;
4382	struct inode *inode;
4383	u64 objectid = 0;
4384
4385	if (!BTRFS_FS_ERROR(fs_info))
4386		WARN_ON(btrfs_root_refs(&root->root_item) != 0);
4387
4388	spin_lock(&root->inode_lock);
4389again:
4390	node = root->inode_tree.rb_node;
4391	prev = NULL;
4392	while (node) {
4393		prev = node;
4394		entry = rb_entry(node, struct btrfs_inode, rb_node);
4395
4396		if (objectid < btrfs_ino(entry))
4397			node = node->rb_left;
4398		else if (objectid > btrfs_ino(entry))
4399			node = node->rb_right;
4400		else
4401			break;
4402	}
4403	if (!node) {
4404		while (prev) {
4405			entry = rb_entry(prev, struct btrfs_inode, rb_node);
4406			if (objectid <= btrfs_ino(entry)) {
4407				node = prev;
4408				break;
4409			}
4410			prev = rb_next(prev);
4411		}
4412	}
4413	while (node) {
4414		entry = rb_entry(node, struct btrfs_inode, rb_node);
4415		objectid = btrfs_ino(entry) + 1;
4416		inode = igrab(&entry->vfs_inode);
4417		if (inode) {
4418			spin_unlock(&root->inode_lock);
4419			if (atomic_read(&inode->i_count) > 1)
4420				d_prune_aliases(inode);
4421			/*
4422			 * btrfs_drop_inode will have it removed from the inode
4423			 * cache when its usage count hits zero.
4424			 */
4425			iput(inode);
4426			cond_resched();
4427			spin_lock(&root->inode_lock);
4428			goto again;
4429		}
4430
4431		if (cond_resched_lock(&root->inode_lock))
4432			goto again;
4433
4434		node = rb_next(node);
4435	}
4436	spin_unlock(&root->inode_lock);
4437}
4438
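/*
 * Delete a subvolume: mark the root dead, remove its directory entry and root
 * refs, set the root's refs to zero and insert an orphan item so the cleaner
 * thread can drop the subvolume tree, then prune any cached dentries.
 */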
4439int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
4440{
4441	struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb);
4442	struct btrfs_root *root = BTRFS_I(dir)->root;
4443	struct inode *inode = d_inode(dentry);
4444	struct btrfs_root *dest = BTRFS_I(inode)->root;
4445	struct btrfs_trans_handle *trans;
4446	struct btrfs_block_rsv block_rsv;
4447	u64 root_flags;
4448	int ret;
4449
	/*
	 * Don't allow deleting a subvolume while a send is in progress.  This
	 * is done inside the inode lock, so the error handling that has to
	 * drop the bit again does not run concurrently.
	 */
4455	spin_lock(&dest->root_item_lock);
4456	if (dest->send_in_progress) {
4457		spin_unlock(&dest->root_item_lock);
4458		btrfs_warn(fs_info,
4459			   "attempt to delete subvolume %llu during send",
4460			   dest->root_key.objectid);
4461		return -EPERM;
4462	}
4463	root_flags = btrfs_root_flags(&dest->root_item);
4464	btrfs_set_root_flags(&dest->root_item,
4465			     root_flags | BTRFS_ROOT_SUBVOL_DEAD);
4466	spin_unlock(&dest->root_item_lock);
4467
4468	down_write(&fs_info->subvol_sem);
4469
4470	ret = may_destroy_subvol(dest);
4471	if (ret)
4472		goto out_up_write;
4473
4474	btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
4475	/*
4476	 * One for dir inode,
4477	 * two for dir entries,
4478	 * two for root ref/backref.
4479	 */
4480	ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true);
4481	if (ret)
4482		goto out_up_write;
4483
4484	trans = btrfs_start_transaction(root, 0);
4485	if (IS_ERR(trans)) {
4486		ret = PTR_ERR(trans);
4487		goto out_release;
4488	}
4489	trans->block_rsv = &block_rsv;
4490	trans->bytes_reserved = block_rsv.size;
4491
4492	btrfs_record_snapshot_destroy(trans, BTRFS_I(dir));
4493
4494	ret = btrfs_unlink_subvol(trans, dir, dentry);
4495	if (ret) {
4496		btrfs_abort_transaction(trans, ret);
4497		goto out_end_trans;
4498	}
4499
4500	ret = btrfs_record_root_in_trans(trans, dest);
4501	if (ret) {
4502		btrfs_abort_transaction(trans, ret);
4503		goto out_end_trans;
4504	}
4505
4506	memset(&dest->root_item.drop_progress, 0,
4507		sizeof(dest->root_item.drop_progress));
4508	btrfs_set_root_drop_level(&dest->root_item, 0);
4509	btrfs_set_root_refs(&dest->root_item, 0);
4510
4511	if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) {
4512		ret = btrfs_insert_orphan_item(trans,
4513					fs_info->tree_root,
4514					dest->root_key.objectid);
4515		if (ret) {
4516			btrfs_abort_transaction(trans, ret);
4517			goto out_end_trans;
4518		}
4519	}
4520
4521	ret = btrfs_uuid_tree_remove(trans, dest->root_item.uuid,
4522				  BTRFS_UUID_KEY_SUBVOL,
4523				  dest->root_key.objectid);
4524	if (ret && ret != -ENOENT) {
4525		btrfs_abort_transaction(trans, ret);
4526		goto out_end_trans;
4527	}
4528	if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) {
4529		ret = btrfs_uuid_tree_remove(trans,
4530					  dest->root_item.received_uuid,
4531					  BTRFS_UUID_KEY_RECEIVED_SUBVOL,
4532					  dest->root_key.objectid);
4533		if (ret && ret != -ENOENT) {
4534			btrfs_abort_transaction(trans, ret);
4535			goto out_end_trans;
4536		}
4537	}
4538
4539	free_anon_bdev(dest->anon_dev);
4540	dest->anon_dev = 0;
4541out_end_trans:
4542	trans->block_rsv = NULL;
4543	trans->bytes_reserved = 0;
4544	ret = btrfs_end_transaction(trans);
4545	inode->i_flags |= S_DEAD;
4546out_release:
4547	btrfs_subvolume_release_metadata(root, &block_rsv);
4548out_up_write:
4549	up_write(&fs_info->subvol_sem);
4550	if (ret) {
4551		spin_lock(&dest->root_item_lock);
4552		root_flags = btrfs_root_flags(&dest->root_item);
4553		btrfs_set_root_flags(&dest->root_item,
4554				root_flags & ~BTRFS_ROOT_SUBVOL_DEAD);
4555		spin_unlock(&dest->root_item_lock);
4556	} else {
4557		d_invalidate(dentry);
4558		btrfs_prune_dentries(dest);
4559		ASSERT(dest->send_in_progress == 0);
4560	}
4561
4562	return ret;
4563}
4564
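/*
 * rmdir: subvolume roots are handled by btrfs_delete_subvolume(), empty
 * subvolume placeholder directories by btrfs_unlink_subvol(), and regular
 * empty directories are unlinked after adding an orphan item.  The deleted
 * directory's last_unlink_trans is propagated to the parent when needed to
 * keep fsync log replay safe.
 */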
4565static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
4566{
4567	struct inode *inode = d_inode(dentry);
4568	int err = 0;
4569	struct btrfs_trans_handle *trans;
4570	u64 last_unlink_trans;
4571
4572	if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
4573		return -ENOTEMPTY;
4574	if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID)
4575		return btrfs_delete_subvolume(dir, dentry);
4576
4577	trans = __unlink_start_trans(dir);
4578	if (IS_ERR(trans))
4579		return PTR_ERR(trans);
4580
4581	if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
4582		err = btrfs_unlink_subvol(trans, dir, dentry);
4583		goto out;
4584	}
4585
4586	err = btrfs_orphan_add(trans, BTRFS_I(inode));
4587	if (err)
4588		goto out;
4589
4590	last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;
4591
4592	/* now the directory is empty */
4593	err = btrfs_unlink_inode(trans, BTRFS_I(dir),
4594			BTRFS_I(d_inode(dentry)), dentry->d_name.name,
4595			dentry->d_name.len);
4596	if (!err) {
4597		btrfs_i_size_write(BTRFS_I(inode), 0);
4598		/*
4599		 * Propagate the last_unlink_trans value of the deleted dir to
4600		 * its parent directory. This is to prevent an unrecoverable
4601		 * log tree in the case we do something like this:
4602		 * 1) create dir foo
4603		 * 2) create snapshot under dir foo
4604		 * 3) delete the snapshot
4605		 * 4) rmdir foo
4606		 * 5) mkdir foo
4607		 * 6) fsync foo or some file inside foo
4608		 */
4609		if (last_unlink_trans >= trans->transid)
4610			BTRFS_I(dir)->last_unlink_trans = last_unlink_trans;
4611	}
4612out:
4613	btrfs_end_transaction(trans);
4614	btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info);
4615
4616	return err;
4617}
4618
4619/*
4620 * btrfs_truncate_block - read, zero a chunk and write a block
4621 * @inode - inode that we're zeroing
4622 * @from - the offset to start zeroing
 * @len - the length to zero, 0 to zero the rest of the block starting at the
 *	offset
4625 * @front - zero up to the offset instead of from the offset on
4626 *
 * This will find the block for the "from" offset, COW it and zero the part we
 * want zeroed.  This is used with truncate and hole punching.
4629 */
4630int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
4631			 int front)
4632{
4633	struct btrfs_fs_info *fs_info = inode->root->fs_info;
4634	struct address_space *mapping = inode->vfs_inode.i_mapping;
4635	struct extent_io_tree *io_tree = &inode->io_tree;
4636	struct btrfs_ordered_extent *ordered;
4637	struct extent_state *cached_state = NULL;
4638	struct extent_changeset *data_reserved = NULL;
4639	bool only_release_metadata = false;
4640	u32 blocksize = fs_info->sectorsize;
4641	pgoff_t index = from >> PAGE_SHIFT;
4642	unsigned offset = from & (blocksize - 1);
4643	struct page *page;
4644	gfp_t mask = btrfs_alloc_write_mask(mapping);
4645	size_t write_bytes = blocksize;
4646	int ret = 0;
4647	u64 block_start;
4648	u64 block_end;
4649
4650	if (IS_ALIGNED(offset, blocksize) &&
4651	    (!len || IS_ALIGNED(len, blocksize)))
4652		goto out;
4653
4654	block_start = round_down(from, blocksize);
4655	block_end = block_start + blocksize - 1;
4656
4657	ret = btrfs_check_data_free_space(inode, &data_reserved, block_start,
4658					  blocksize);
4659	if (ret < 0) {
4660		if (btrfs_check_nocow_lock(inode, block_start, &write_bytes) > 0) {
4661			/* For nocow case, no need to reserve data space */
4662			only_release_metadata = true;
4663		} else {
4664			goto out;
4665		}
4666	}
4667	ret = btrfs_delalloc_reserve_metadata(inode, blocksize);
4668	if (ret < 0) {
4669		if (!only_release_metadata)
4670			btrfs_free_reserved_data_space(inode, data_reserved,
4671						       block_start, blocksize);
4672		goto out;
4673	}
4674again:
4675	page = find_or_create_page(mapping, index, mask);
4676	if (!page) {
4677		btrfs_delalloc_release_space(inode, data_reserved, block_start,
4678					     blocksize, true);
4679		btrfs_delalloc_release_extents(inode, blocksize);
4680		ret = -ENOMEM;
4681		goto out;
4682	}
4683	ret = set_page_extent_mapped(page);
4684	if (ret < 0)
4685		goto out_unlock;
4686
4687	if (!PageUptodate(page)) {
4688		ret = btrfs_readpage(NULL, page);
4689		lock_page(page);
4690		if (page->mapping != mapping) {
4691			unlock_page(page);
4692			put_page(page);
4693			goto again;
4694		}
4695		if (!PageUptodate(page)) {
4696			ret = -EIO;
4697			goto out_unlock;
4698		}
4699	}
4700	wait_on_page_writeback(page);
4701
4702	lock_extent_bits(io_tree, block_start, block_end, &cached_state);
4703
4704	ordered = btrfs_lookup_ordered_extent(inode, block_start);
4705	if (ordered) {
4706		unlock_extent_cached(io_tree, block_start, block_end,
4707				     &cached_state);
4708		unlock_page(page);
4709		put_page(page);
4710		btrfs_start_ordered_extent(ordered, 1);
4711		btrfs_put_ordered_extent(ordered);
4712		goto again;
4713	}
4714
4715	clear_extent_bit(&inode->io_tree, block_start, block_end,
4716			 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
4717			 0, 0, &cached_state);
4718
4719	ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0,
4720					&cached_state);
4721	if (ret) {
4722		unlock_extent_cached(io_tree, block_start, block_end,
4723				     &cached_state);
4724		goto out_unlock;
4725	}
4726
4727	if (offset != blocksize) {
4728		if (!len)
4729			len = blocksize - offset;
4730		if (front)
4731			memzero_page(page, (block_start - page_offset(page)),
4732				     offset);
4733		else
4734			memzero_page(page, (block_start - page_offset(page)) + offset,
4735				     len);
4736		flush_dcache_page(page);
4737	}
4738	btrfs_page_clear_checked(fs_info, page, block_start,
4739				 block_end + 1 - block_start);
4740	btrfs_page_set_dirty(fs_info, page, block_start, block_end + 1 - block_start);
4741	unlock_extent_cached(io_tree, block_start, block_end, &cached_state);
4742
4743	if (only_release_metadata)
4744		set_extent_bit(&inode->io_tree, block_start, block_end,
4745			       EXTENT_NORESERVE, 0, NULL, NULL, GFP_NOFS, NULL);
4746
4747out_unlock:
4748	if (ret) {
4749		if (only_release_metadata)
4750			btrfs_delalloc_release_metadata(inode, blocksize, true);
4751		else
4752			btrfs_delalloc_release_space(inode, data_reserved,
4753					block_start, blocksize, true);
4754	}
4755	btrfs_delalloc_release_extents(inode, blocksize);
4756	unlock_page(page);
4757	put_page(page);
4758out:
4759	if (only_release_metadata)
4760		btrfs_check_nocow_unlock(inode);
4761	extent_changeset_free(data_reserved);
4762	return ret;
4763}
4764
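/*
 * Insert an explicit hole file extent item covering [offset, offset + len)
 * and update the inode, unless the NO_HOLES feature makes this unnecessary.
 */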
4765static int maybe_insert_hole(struct btrfs_root *root, struct btrfs_inode *inode,
4766			     u64 offset, u64 len)
4767{
4768	struct btrfs_fs_info *fs_info = root->fs_info;
4769	struct btrfs_trans_handle *trans;
4770	struct btrfs_drop_extents_args drop_args = { 0 };
4771	int ret;
4772
4773	/*
4774	 * If NO_HOLES is enabled, we don't need to do anything.
4775	 * Later, up in the call chain, either btrfs_set_inode_last_sub_trans()
	 * or btrfs_update_inode() will be called, which guarantees that the next
4777	 * fsync will know this inode was changed and needs to be logged.
4778	 */
4779	if (btrfs_fs_incompat(fs_info, NO_HOLES))
4780		return 0;
4781
4782	/*
4783	 * 1 - for the one we're dropping
4784	 * 1 - for the one we're adding
4785	 * 1 - for updating the inode.
4786	 */
4787	trans = btrfs_start_transaction(root, 3);
4788	if (IS_ERR(trans))
4789		return PTR_ERR(trans);
4790
4791	drop_args.start = offset;
4792	drop_args.end = offset + len;
4793	drop_args.drop_cache = true;
4794
4795	ret = btrfs_drop_extents(trans, root, inode, &drop_args);
4796	if (ret) {
4797		btrfs_abort_transaction(trans, ret);
4798		btrfs_end_transaction(trans);
4799		return ret;
4800	}
4801
4802	ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode),
4803			offset, 0, 0, len, 0, len, 0, 0, 0);
4804	if (ret) {
4805		btrfs_abort_transaction(trans, ret);
4806	} else {
4807		btrfs_update_inode_bytes(inode, 0, drop_args.bytes_found);
4808		btrfs_update_inode(trans, root, inode);
4809	}
4810	btrfs_end_transaction(trans);
4811	return ret;
4812}
4813
4814/*
4815 * This function puts in dummy file extents for the area we're creating a hole
4816 * for.  So if we are truncating this file to a larger size we need to insert
 * these file extents so that btrfs_get_extent will return an EXTENT_MAP_HOLE
 * for the range between oldsize and size.
4819 */
4820int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
4821{
4822	struct btrfs_root *root = inode->root;
4823	struct btrfs_fs_info *fs_info = root->fs_info;
4824	struct extent_io_tree *io_tree = &inode->io_tree;
4825	struct extent_map *em = NULL;
4826	struct extent_state *cached_state = NULL;
4827	struct extent_map_tree *em_tree = &inode->extent_tree;
4828	u64 hole_start = ALIGN(oldsize, fs_info->sectorsize);
4829	u64 block_end = ALIGN(size, fs_info->sectorsize);
4830	u64 last_byte;
4831	u64 cur_offset;
4832	u64 hole_size;
4833	int err = 0;
4834
4835	/*
4836	 * If our size started in the middle of a block we need to zero out the
4837	 * rest of the block before we expand the i_size, otherwise we could
4838	 * expose stale data.
4839	 */
4840	err = btrfs_truncate_block(inode, oldsize, 0, 0);
4841	if (err)
4842		return err;
4843
4844	if (size <= hole_start)
4845		return 0;
4846
4847	btrfs_lock_and_flush_ordered_range(inode, hole_start, block_end - 1,
4848					   &cached_state);
4849	cur_offset = hole_start;
4850	while (1) {
4851		em = btrfs_get_extent(inode, NULL, 0, cur_offset,
4852				      block_end - cur_offset);
4853		if (IS_ERR(em)) {
4854			err = PTR_ERR(em);
4855			em = NULL;
4856			break;
4857		}
4858		last_byte = min(extent_map_end(em), block_end);
4859		last_byte = ALIGN(last_byte, fs_info->sectorsize);
4860		hole_size = last_byte - cur_offset;
4861
4862		if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
4863			struct extent_map *hole_em;
4864
4865			err = maybe_insert_hole(root, inode, cur_offset,
4866						hole_size);
4867			if (err)
4868				break;
4869
4870			err = btrfs_inode_set_file_extent_range(inode,
4871							cur_offset, hole_size);
4872			if (err)
4873				break;
4874
4875			btrfs_drop_extent_cache(inode, cur_offset,
4876						cur_offset + hole_size - 1, 0);
4877			hole_em = alloc_extent_map();
4878			if (!hole_em) {
4879				set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
4880					&inode->runtime_flags);
4881				goto next;
4882			}
4883			hole_em->start = cur_offset;
4884			hole_em->len = hole_size;
4885			hole_em->orig_start = cur_offset;
4886
4887			hole_em->block_start = EXTENT_MAP_HOLE;
4888			hole_em->block_len = 0;
4889			hole_em->orig_block_len = 0;
4890			hole_em->ram_bytes = hole_size;
4891			hole_em->compress_type = BTRFS_COMPRESS_NONE;
4892			hole_em->generation = fs_info->generation;
4893
4894			while (1) {
4895				write_lock(&em_tree->lock);
4896				err = add_extent_mapping(em_tree, hole_em, 1);
4897				write_unlock(&em_tree->lock);
4898				if (err != -EEXIST)
4899					break;
4900				btrfs_drop_extent_cache(inode, cur_offset,
4901							cur_offset +
4902							hole_size - 1, 0);
4903			}
4904			free_extent_map(hole_em);
4905		} else {
4906			err = btrfs_inode_set_file_extent_range(inode,
4907							cur_offset, hole_size);
4908			if (err)
4909				break;
4910		}
4911next:
4912		free_extent_map(em);
4913		em = NULL;
4914		cur_offset = last_byte;
4915		if (cur_offset >= block_end)
4916			break;
4917	}
4918	free_extent_map(em);
4919	unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state);
4920	return err;
4921}
4922
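/*
 * Handle a size change from setattr: growing the file fills the new range
 * with hole extents via btrfs_cont_expand(), shrinking it goes through
 * btrfs_truncate().
 */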
4923static int btrfs_setsize(struct inode *inode, struct iattr *attr)
4924{
4925	struct btrfs_root *root = BTRFS_I(inode)->root;
4926	struct btrfs_trans_handle *trans;
4927	loff_t oldsize = i_size_read(inode);
4928	loff_t newsize = attr->ia_size;
4929	int mask = attr->ia_valid;
4930	int ret;
4931
4932	/*
4933	 * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
4934	 * special case where we need to update the times despite not having
4935	 * these flags set.  For all other operations the VFS set these flags
4936	 * explicitly if it wants a timestamp update.
4937	 */
4938	if (newsize != oldsize) {
4939		inode_inc_iversion(inode);
4940		if (!(mask & (ATTR_CTIME | ATTR_MTIME)))
4941			inode->i_ctime = inode->i_mtime =
4942				current_time(inode);
4943	}
4944
4945	if (newsize > oldsize) {
4946		/*
4947		 * Don't do an expanding truncate while snapshotting is ongoing.
4948		 * This is to ensure the snapshot captures a fully consistent
4949		 * state of this file - if the snapshot captures this expanding
4950		 * truncation, it must capture all writes that happened before
4951		 * this truncation.
4952		 */
4953		btrfs_drew_write_lock(&root->snapshot_lock);
4954		ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, newsize);
4955		if (ret) {
4956			btrfs_drew_write_unlock(&root->snapshot_lock);
4957			return ret;
4958		}
4959
4960		trans = btrfs_start_transaction(root, 1);
4961		if (IS_ERR(trans)) {
4962			btrfs_drew_write_unlock(&root->snapshot_lock);
4963			return PTR_ERR(trans);
4964		}
4965
4966		i_size_write(inode, newsize);
4967		btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
4968		pagecache_isize_extended(inode, oldsize, newsize);
4969		ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
4970		btrfs_drew_write_unlock(&root->snapshot_lock);
4971		btrfs_end_transaction(trans);
4972	} else {
4973		struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4974
4975		if (btrfs_is_zoned(fs_info)) {
4976			ret = btrfs_wait_ordered_range(inode,
4977					ALIGN(newsize, fs_info->sectorsize),
4978					(u64)-1);
4979			if (ret)
4980				return ret;
4981		}
4982
4983		/*
4984		 * We're truncating a file that used to have good data down to
4985		 * zero. Make sure any new writes to the file get on disk
4986		 * on close.
4987		 */
4988		if (newsize == 0)
4989			set_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
4990				&BTRFS_I(inode)->runtime_flags);
4991
4992		truncate_setsize(inode, newsize);
4993
4994		inode_dio_wait(inode);
4995
4996		ret = btrfs_truncate(inode, newsize == oldsize);
4997		if (ret && inode->i_nlink) {
4998			int err;
4999
5000			/*
5001			 * Truncate failed, so fix up the in-memory size. We
5002			 * adjusted disk_i_size down as we removed extents, so
5003			 * wait for disk_i_size to be stable and then update the
5004			 * in-memory size to match.
5005			 */
5006			err = btrfs_wait_ordered_range(inode, 0, (u64)-1);
5007			if (err)
5008				return err;
5009			i_size_write(inode, BTRFS_I(inode)->disk_i_size);
5010		}
5011	}
5012
5013	return ret;
5014}
5015
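/*
 * Our ->setattr callback: handle size changes, copy the remaining attributes
 * into the inode and update ACLs on mode changes.
 */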
5016static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
5017			 struct iattr *attr)
5018{
5019	struct inode *inode = d_inode(dentry);
5020	struct btrfs_root *root = BTRFS_I(inode)->root;
5021	int err;
5022
5023	if (btrfs_root_readonly(root))
5024		return -EROFS;
5025
5026	err = setattr_prepare(mnt_userns, dentry, attr);
5027	if (err)
5028		return err;
5029
5030	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
5031		err = btrfs_setsize(inode, attr);
5032		if (err)
5033			return err;
5034	}
5035
5036	if (attr->ia_valid) {
5037		setattr_copy(mnt_userns, inode, attr);
5038		inode_inc_iversion(inode);
5039		err = btrfs_dirty_inode(inode);
5040
5041		if (!err && attr->ia_valid & ATTR_MODE)
5042			err = posix_acl_chmod(mnt_userns, inode, inode->i_mode);
5043	}
5044
5045	return err;
5046}
5047
5048/*
5049 * While truncating the inode pages during eviction, we get the VFS calling
5050 * btrfs_invalidatepage() against each page of the inode. This is slow because
 * the calls to btrfs_invalidatepage() result in a huge number of calls to
5052 * lock_extent_bits() and clear_extent_bit(), which keep merging and splitting
5053 * extent_state structures over and over, wasting lots of time.
5054 *
5055 * Therefore if the inode is being evicted, let btrfs_invalidatepage() skip all
5056 * those expensive operations on a per page basis and do only the ordered io
5057 * finishing, while we release here the extent_map and extent_state structures,
5058 * without the excessive merging and splitting.
5059 */
5060static void evict_inode_truncate_pages(struct inode *inode)
5061{
5062	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
5063	struct extent_map_tree *map_tree = &BTRFS_I(inode)->extent_tree;
5064	struct rb_node *node;
5065
5066	ASSERT(inode->i_state & I_FREEING);
5067	truncate_inode_pages_final(&inode->i_data);
5068
5069	write_lock(&map_tree->lock);
5070	while (!RB_EMPTY_ROOT(&map_tree->map.rb_root)) {
5071		struct extent_map *em;
5072
5073		node = rb_first_cached(&map_tree->map);
5074		em = rb_entry(node, struct extent_map, rb_node);
5075		clear_bit(EXTENT_FLAG_PINNED, &em->flags);
5076		clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
5077		remove_extent_mapping(map_tree, em);
5078		free_extent_map(em);
5079		if (need_resched()) {
5080			write_unlock(&map_tree->lock);
5081			cond_resched();
5082			write_lock(&map_tree->lock);
5083		}
5084	}
5085	write_unlock(&map_tree->lock);
5086
	/*
	 * Keep looping until we have no more ranges in the io tree.
	 * We can have ongoing bios started by readahead whose endio callback
	 * (extent_io.c:end_bio_extent_readpage) is still in progress (it has
	 * unlocked the pages in the bio but has not yet unlocked the ranges in
	 * the io tree). This means some ranges can still be locked while
	 * eviction has started, because before submitting those bios, which
	 * are executed by a separate task (a work queue kthread), no inode
	 * references (inode->i_count) were taken (they would be dropped in the
	 * end io callback of each bio). Therefore here we effectively end up
	 * waiting for those bios and for anyone else holding locked ranges
	 * without having bumped the inode's reference count - if we don't do
	 * it, when they access the inode's io_tree to unlock a range it may be
	 * too late, leading to a use-after-free issue.
	 */
5103	spin_lock(&io_tree->lock);
5104	while (!RB_EMPTY_ROOT(&io_tree->state)) {
5105		struct extent_state *state;
5106		struct extent_state *cached_state = NULL;
5107		u64 start;
5108		u64 end;
5109		unsigned state_flags;
5110
5111		node = rb_first(&io_tree->state);
5112		state = rb_entry(node, struct extent_state, rb_node);
5113		start = state->start;
5114		end = state->end;
5115		state_flags = state->state;
5116		spin_unlock(&io_tree->lock);
5117
5118		lock_extent_bits(io_tree, start, end, &cached_state);
5119
5120		/*
5121		 * If the range still has the DELALLOC flag, the extent never
5122		 * reached disk and its reserved space won't be freed by a
5123		 * delayed ref, so we need to free that reserved space here.
5124		 * (Refer to the comment in btrfs_invalidatepage, case 2)
5125		 *
5126		 * Note that end is the offset of the last byte, so we need + 1 here.
5127		 */
5128		if (state_flags & EXTENT_DELALLOC)
5129			btrfs_qgroup_free_data(BTRFS_I(inode), NULL, start,
5130					       end - start + 1);
5131
5132		clear_extent_bit(io_tree, start, end,
5133				 EXTENT_LOCKED | EXTENT_DELALLOC |
5134				 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1,
5135				 &cached_state);
5136
5137		cond_resched();
5138		spin_lock(&io_tree->lock);
5139	}
5140	spin_unlock(&io_tree->lock);
5141}
5142
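/*
 * evict_refill_and_join - reserve space for inode eviction and join a
 * transaction
 *
 * Try to refill the block reserve with some extra room for the delayed refs
 * generated by the truncate; if that fails, fall back to the plain reserve so
 * we can still make forward progress. Returns the joined transaction handle
 * or an ERR_PTR on failure.
 */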
5143static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
5144							struct btrfs_block_rsv *rsv)
5145{
5146	struct btrfs_fs_info *fs_info = root->fs_info;
5147	struct btrfs_trans_handle *trans;
5148	u64 delayed_refs_extra = btrfs_calc_insert_metadata_size(fs_info, 1);
5149	int ret;
5150
5151	/*
5152	 * Eviction should be taking place somewhere safe because of our
5153	 * delayed iputs.  However the normal flushing code will run delayed
5154	 * iputs, so we cannot use FLUSH_ALL otherwise we'll deadlock.
5155	 *
5156	 * We reserve the delayed_refs_extra here again because we can't use
5157	 * btrfs_start_transaction(root, 0) for the same deadlocky reason as
5158	 * above.  We reserve our extra bit here because we generate a ton of
5159	 * delayed refs activity by truncating.
5160	 *
5161	 * BTRFS_RESERVE_FLUSH_EVICT will steal from the global_rsv if it can,
5162	 * so if we fail to make this reservation we can retry without the
5163	 * delayed_refs_extra and still make some forward progress.
5164	 */
5165	ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size + delayed_refs_extra,
5166				     BTRFS_RESERVE_FLUSH_EVICT);
5167	if (ret) {
5168		ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size,
5169					     BTRFS_RESERVE_FLUSH_EVICT);
5170		if (ret) {
5171			btrfs_warn(fs_info,
5172				   "could not allocate space for delete; will truncate on mount");
5173			return ERR_PTR(-ENOSPC);
5174		}
5175		delayed_refs_extra = 0;
5176	}
5177
5178	trans = btrfs_join_transaction(root);
5179	if (IS_ERR(trans))
5180		return trans;
5181
5182	if (delayed_refs_extra) {
5183		trans->block_rsv = &fs_info->trans_block_rsv;
5184		trans->bytes_reserved = delayed_refs_extra;
5185		btrfs_block_rsv_migrate(rsv, trans->block_rsv,
5186					delayed_refs_extra, 1);
5187	}
5188	return trans;
5189}
5190
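/*
 * btrfs_evict_inode - the VFS ->evict_inode callback
 *
 * Release the inode's pages, extent maps and io tree ranges. If the inode has
 * been unlinked, delete its items from the tree a chunk at a time and then
 * remove its orphan item; otherwise just tear down the in-memory state.
 */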
5191void btrfs_evict_inode(struct inode *inode)
5192{
5193	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
5194	struct btrfs_trans_handle *trans;
5195	struct btrfs_root *root = BTRFS_I(inode)->root;
5196	struct btrfs_block_rsv *rsv;
5197	int ret;
5198
5199	trace_btrfs_inode_evict(inode);
5200
5201	if (!root) {
5202		fsverity_cleanup_inode(inode);
5203		clear_inode(inode);
5204		return;
5205	}
5206
5207	evict_inode_truncate_pages(inode);
5208
5209	if (inode->i_nlink &&
5210	    ((btrfs_root_refs(&root->root_item) != 0 &&
5211	      root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) ||
5212	     btrfs_is_free_space_inode(BTRFS_I(inode))))
5213		goto no_delete;
5214
5215	if (is_bad_inode(inode))
5216		goto no_delete;
5217
5218	btrfs_free_io_failure_record(BTRFS_I(inode), 0, (u64)-1);
5219
5220	if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
5221		goto no_delete;
5222
5223	if (inode->i_nlink > 0) {
5224		BUG_ON(btrfs_root_refs(&root->root_item) != 0 &&
5225		       root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID);
5226		goto no_delete;
5227	}
5228
5229	/*
5230	 * This makes sure the inode item in the tree is up to date and the
5231	 * space for the inode update is released.
5232	 */
5233	ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode));
5234	if (ret)
5235		goto no_delete;
5236
5237	/*
5238	 * This drops any pending insert or delete operations we have for this
5239	 * inode.  We could have a delayed dir index deletion queued up, but
5240	 * we're removing the inode completely so that'll be taken care of in
5241	 * the truncate.
5242	 */
5243	btrfs_kill_delayed_inode_items(BTRFS_I(inode));
5244
5245	rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
5246	if (!rsv)
5247		goto no_delete;
5248	rsv->size = btrfs_calc_metadata_size(fs_info, 1);
5249	rsv->failfast = 1;
5250
5251	btrfs_i_size_write(BTRFS_I(inode), 0);
5252
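	/*
	 * Delete the inode items in chunks, grabbing a fresh reservation and
	 * transaction for each pass. -ENOSPC and -EAGAIN just mean we need a
	 * new reservation/transaction and should retry; any other error is
	 * fatal and we bail out.
	 */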
5253	while (1) {
5254		struct btrfs_truncate_control control = {
5255			.inode = BTRFS_I(inode),
5256			.ino = btrfs_ino(BTRFS_I(inode)),
5257			.new_size = 0,
5258			.min_type = 0,
5259		};
5260
5261		trans = evict_refill_and_join(root, rsv);
5262		if (IS_ERR(trans))
5263			goto free_rsv;
5264
5265		trans->block_rsv = rsv;
5266
5267		ret = btrfs_truncate_inode_items(trans, root, &control);
5268		trans->block_rsv = &fs_info->trans_block_rsv;
5269		btrfs_end_transaction(trans);
5270		btrfs_btree_balance_dirty(fs_info);
5271		if (ret && ret != -ENOSPC && ret != -EAGAIN)
5272			goto free_rsv;
5273		else if (!ret)
5274			break;
5275	}
5276
5277	/*
5278	 * Errors here aren't a big deal; they just mean we leave orphan items in
5279	 * the tree. They will be cleaned up on the next mount. If the inode
5280	 * number gets reused, cleanup deletes the orphan item without doing
5281	 * anything, and unlink reuses the existing orphan item.
5282	 *
5283	 * If it turns out that we are dropping too many of these, we might want
5284	 * to add a mechanism for retrying these after a commit.
5285	 */
5286	trans = evict_refill_and_join(root, rsv);
5287	if (!IS_ERR(trans)) {
5288		trans->block_rsv = rsv;
5289		btrfs_orphan_del(trans, BTRFS_I(inode));
5290		trans->block_rsv = &fs_info->trans_block_rsv;
5291		btrfs_end_transaction(trans);
5292	}
5293
5294free_rsv:
5295	btrfs_free_block_rsv(fs_info, rsv);
5296no_delete:
5297	/*
5298	 * If we didn't successfully delete, the orphan item will still be in
5299	 * the tree and we'll retry on the next mount. Again, we might also want
5300	 * to retry these periodically in the future.
5301	 */
5302	btrfs_remove_delayed_node(BTRFS_I(inode));
5303	fsverity_cleanup_inode(inode);
5304	clear_inode(inode);
5305}
5306
5307/*
5308 * Return the key found in the dir entry in the location pointer, fill @type
5309 * with BTRFS_FT_*, and return 0.
5310 *
5311 * If no dir entries were found, returns -ENOENT.
5312 * If the location found in the dir entry is corrupted, returns -EUCLEAN.
5313 */
5314static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
5315			       struct btrfs_key *location, u8 *type)
5316{
5317	const char *name = dentry->d_name.name;
5318	int namelen = dentry->d_name.len;
5319	struct btrfs_dir_item *di;
5320	struct btrfs_path *path;
5321	struct btrfs_root *root = BTRFS_I(dir)->root;
5322	int ret = 0;
5323
5324	path = btrfs_alloc_path();
5325	if (!path)
5326		return -ENOMEM;
5327
5328	di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(BTRFS_I(dir)),
5329			name, namelen, 0);
5330	if (IS_ERR_OR_NULL(di)) {
5331		ret = di ? PTR_ERR(di) : -ENOENT;
5332		goto out;
5333	}
5334
5335	btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
5336	if (location->type != BTRFS_INODE_ITEM_KEY &&
5337	    location->type != BTRFS_ROOT_ITEM_KEY) {
5338		ret = -EUCLEAN;
5339		btrfs_warn(root->fs_info,
5340"%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))",
5341			   __func__, name, btrfs_ino(BTRFS_I(dir)),
5342			   location->objectid, location->type, location->offset);
5343	}
5344	if (!ret)
5345		*type = btrfs_dir_type(path->nodes[0], di);
5346out:
5347	btrfs_free_path(path);
5348	return ret;
5349}
5350
5351/*
5352 * When we hit a tree root in a directory, the btrfs part of the inode
5353 * needs to be changed to reflect the root directory of the tree root.  This
5354 * is kind of like crossing a mount point.
5355 */
5356static int fixup_tree_root_location(struct btrfs_fs_info *fs_info,
5357				    struct inode *dir,
5358				    struct dentry *dentry,
5359				    struct btrfs_key *location,
5360				    struct btrfs_root **sub_root)
5361{
5362	struct btrfs_path *path;
5363	struct btrfs_root *new_root;
5364	struct btrfs_root_ref *ref;
5365	struct extent_buffer *leaf;
5366	struct btrfs_key key;
5367	int ret;
5368	int err = 0;
5369
5370	path = btrfs_alloc_path();
5371	if (!path) {
5372		err = -ENOMEM;
5373		goto out;
5374	}
5375
5376	err = -ENOENT;
5377	key.objectid = BTRFS_I(dir)->root->root_key.objectid;
5378	key.type = BTRFS_ROOT_REF_KEY;
5379	key.offset = location->objectid;
5380
5381	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
5382	if (ret) {
5383		if (ret < 0)
5384			err = ret;
5385		goto out;
5386	}
5387
5388	leaf = path->nodes[0];
5389	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
5390	if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(BTRFS_I(dir)) ||
5391	    btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len)
5392		goto out;
5393
5394	ret = memcmp_extent_buffer(leaf, dentry->d_name.name,
5395				   (unsigned long)(ref + 1),
5396				   dentry->d_name.len);
5397	if (ret)
5398		goto out;
5399
5400	btrfs_release_path(path);
5401
5402	new_root = btrfs_get_fs_root(fs_info, location->objectid, true);
5403	if (IS_ERR(new_root)) {
5404		err = PTR_ERR(new_root);
5405		goto out;
5406	}
5407
5408	*sub_root = new_root;
5409	location->objectid = btrfs_root_dirid(&new_root->root_item);
5410	location->type = BTRFS_INODE_ITEM_KEY;
5411	location->offset = 0;
5412	err = 0;
5413out:
5414	btrfs_free_path(path);
5415	return err;
5416}
5417
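/*
 * inode_tree_add - track an in-memory inode in its root's rb-tree of inodes
 *
 * The tree is keyed by inode number. If an entry with the same inode number
 * is already present it must be in the process of being freed, so replace it
 * with the new inode.
 */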
5418static void inode_tree_add(struct inode *inode)
5419{
5420	struct btrfs_root *root = BTRFS_I(inode)->root;
5421	struct btrfs_inode *entry;
5422	struct rb_node **p;
5423	struct rb_node *parent;
5424	struct rb_node *new = &BTRFS_I(inode)->rb_node;
5425	u64 ino = btrfs_ino(BTRFS_I(inode));
5426
5427	if (inode_unhashed(inode))
5428		return;
5429	parent = NULL;
5430	spin_lock(&root->inode_lock);
5431	p = &root->inode_tree.rb_node;
5432	while (*p) {
5433		parent = *p;
5434		entry = rb_entry(parent, struct btrfs_inode, rb_node);
5435
5436		if (ino < btrfs_ino(entry))
5437			p = &parent->rb_left;
5438		else if (ino > btrfs_ino(entry))
5439			p = &parent->rb_right;
5440		else {
5441			WARN_ON(!(entry->vfs_inode.i_state &
5442				  (I_WILL_FREE | I_FREEING)));
5443			rb_replace_node(parent, new, &root->inode_tree);
5444			RB_CLEAR_NODE(parent);
5445			spin_unlock(&root->inode_lock);
5446			return;
5447		}
5448	}
5449	rb_link_node(new, parent, p);
5450	rb_insert_color(new, &root->inode_tree);
5451	spin_unlock(&root->inode_lock);
5452}
5453
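/*
 * inode_tree_del - remove an inode from its root's rb-tree of in-memory inodes
 *
 * If this was the last inode of a root that is no longer referenced, queue the
 * root for cleanup by adding it to the dead roots list.
 */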
5454static void inode_tree_del(struct btrfs_inode *inode)
5455{
5456	struct btrfs_root *root = inode->root;
5457	int empty = 0;
5458
5459	spin_lock(&root->inode_lock);
5460	if (!RB_EMPTY_NODE(&inode->rb_node)) {
5461		rb_erase(&inode->rb_node, &root->inode_tree);
5462		RB_CLEAR_NODE(&inode->rb_node);
5463		empty = RB_EMPTY_ROOT(&root->inode_tree);
5464	}
5465	spin_unlock(&root->inode_lock);
5466
5467	if (empty && btrfs_root_refs(&root->root_item) == 0) {
5468		spin_lock(&root->inode_lock);
5469		empty = RB_EMPTY_ROOT(&root->inode_tree);
5470		spin_unlock(&root->inode_lock);
5471		if (empty)
5472			btrfs_add_dead_root(root);
5473	}
5474}
5475
5476
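/*
 * Callbacks for iget5_locked(): btrfs_init_locked_inode() initializes a newly
 * allocated inode with its inode number, location key and root, while
 * btrfs_find_actor() matches an existing in-memory inode by (ino, root).
 */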
5477static int btrfs_init_locked_inode(struct inode *inode, void *p)
5478{
5479	struct btrfs_iget_args *args = p;
5480
5481	inode->i_ino = args->ino;
5482	BTRFS_I(inode)->location.objectid = args->ino;
5483	BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
5484	BTRFS_I(inode)->location.offset = 0;
5485	BTRFS_I(inode)->root = btrfs_grab_root(args->root);
5486	BUG_ON(args->root && !BTRFS_I(inode)->root);
5487	return 0;
5488}
5489
5490static int btrfs_find_actor(struct inode *inode, void *opaque)
5491{
5492	struct btrfs_iget_args *args = opaque;
5493
5494	return args->ino == BTRFS_I(inode)->location.objectid &&
5495		args->root == BTRFS_I(inode)->root;
5496}
5497
5498static struct inode *btrfs_iget_locked(struct super_block *s, u64 ino,
5499				       struct btrfs_root *root)
5500{
5501	struct inode *inode;
5502	struct btrfs_iget_args args;
5503	unsigned long hashval = btrfs_inode_hash(ino, root);
5504
5505	args.ino = ino;
5506	args.root = root;
5507
5508	inode = iget5_locked(s, hashval, btrfs_find_actor,
5509			     btrfs_init_locked_inode,
5510			     (void *)&args);
5511	return inode;
5512}
5513
5514/*
5515 * Get an inode object given its inode number and corresponding root. A
5516 * path can be preallocated to prevent recursing back into iget() through
5517 * the allocator; NULL is also valid but may require an additional
5518 * allocation later.
5519 */
5520struct inode *btrfs_iget_path(struct super_block *s, u64 ino,
5521			      struct btrfs_root *root, struct btrfs_path *path)
5522{
5523	struct inode *inode;
5524
5525	inode = btrfs_iget_locked(s, ino, root);
5526	if (!inode)
5527		return ERR_PTR(-ENOMEM);
5528
5529	if (inode->i_state & I_NEW) {
5530		int ret;
5531
5532		ret = btrfs_read_locked_inode(inode, path);
5533		if (!ret) {
5534			inode_tree_add(inode);
5535			unlock_new_inode(inode);
5536		} else {
5537			iget_failed(inode);
5538			/*
5539			 * ret > 0 can come from btrfs_search_slot called by
5540			 * btrfs_read_locked_inode; it means the inode item
5541			 * was not found.
5542			 */
5543			if (ret > 0)
5544				ret = -ENOENT;
5545			inode = ERR_PTR(ret);
5546		}
5547	}
5548
5549	return inode;
5550}
5551
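/* Same as btrfs_iget_path() but without a preallocated path. */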
5552struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root)
5553{
5554	return btrfs_iget_path(s, ino, root, NULL);
5555}
5556
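/*
 * new_simple_dir - build a dummy, read-only directory inode
 *
 * Used by btrfs_lookup_dentry() when fixup_tree_root_location() finds no root
 * ref for a subvolume (-ENOENT), so the lookup still returns an empty
 * directory instead of failing.
 */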
5557static struct inode *new_simple_dir(struct super_block *s,
5558				    struct btrfs_key *key,
5559				    struct btrfs_root *root)
5560{
5561	struct inode *inode = new_inode(s);
5562
5563	if (!inode)
5564		return ERR_PTR(-ENOMEM);
5565
5566	BTRFS_I(inode)->root = btrfs_grab_root(root);
5567	memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
5568	set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
5569
5570	inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
5571	/*
5572	 * We only need lookup; the rest is read-only and there's no inode
5573	 * associated with the dentry.
5574	 */
5575	inode->i_op = &simple_dir_inode_operations;
5576	inode->i_opflags &= ~IOP_XATTR;
5577	inode->i_fop = &simple_dir_operations;
5578	inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
5579	inode->i_mtime = current_time(inode);
5580	inode->i_atime = inode->i_mtime;
5581	inode->i_ctime = inode->i_mtime;
5582	BTRFS_I(inode)->i_otime = inode->i_mtime;
5583
5584	return inode;
5585}
5586
5587static inline u8 btrfs_inode_type(struct inode *inode)
5588{
5589	/*
5590	 * Compile-time asserts that generic FT_* types still match
5591	 * BTRFS_FT_* types
5592	 */
5593	BUILD_BUG_ON(BTRFS_FT_UNKNOWN != FT_UNKNOWN);
5594	BUILD_BUG_ON(BTRFS_FT_REG_FILE != FT_REG_FILE);
5595	BUILD_BUG_ON(BTRFS_FT_DIR != FT_DIR);
5596	BUILD_BUG_ON(BTRFS_FT_CHRDEV != FT_CHRDEV);
5597	BUILD_BUG_ON(BTRFS_FT_BLKDEV != FT_BLKDEV);
5598	BUILD_BUG_ON(BTRFS_FT_FIFO != FT_FIFO);
5599	BUILD_BUG_ON(BTRFS_FT_SOCK != FT_SOCK);
5600	BUILD_BUG_ON(BTRFS_FT_SYMLINK != FT_SYMLINK);
5601
5602	return fs_umode_to_ftype(inode->i_mode);
5603}
5604
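/*
 * btrfs_lookup_dentry - resolve a dentry in @dir to an inode
 *
 * Regular entries are looked up with btrfs_iget() in the current root. When
 * the dir item points at a subvolume root, cross into that root (similar to
 * crossing a mount point) and, on read-write mounts, run orphan cleanup for
 * the subvolume we just entered.
 */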
5605struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
5606{
5607	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
5608	struct inode *inode;
5609	struct btrfs_root *root = BTRFS_I(dir)->root;
5610	struct btrfs_root *sub_root = root;
5611	struct btrfs_key location;
5612	u8 di_type = 0;
5613	int ret = 0;
5614
5615	if (dentry->d_name.len > BTRFS_NAME_LEN)
5616		return ERR_PTR(-ENAMETOOLONG);
5617
5618	ret = btrfs_inode_by_name(dir, dentry, &location, &di_type);
5619	if (ret < 0)
5620		return ERR_PTR(ret);
5621
5622	if (location.type == BTRFS_INODE_ITEM_KEY) {
5623		inode = btrfs_iget(dir->i_sb, location.objectid, root);
5624		if (IS_ERR(inode))
5625			return inode;
5626
5627		/* Cross-check the inode mode against the dir item's type */
5628		if (btrfs_inode_type(inode) != di_type) {
5629			btrfs_crit(fs_info,
5630"inode mode mismatch with dir: inode mode=0%o btrfs type=%u dir type=%u",
5631				  inode->i_mode, btrfs_inode_type(inode),
5632				  di_type);
5633			iput(inode);
5634			return ERR_PTR(-EUCLEAN);
5635		}
5636		return inode;
5637	}
5638
5639	ret = fixup_tree_root_location(fs_info, dir, dentry,
5640				       &location, &sub_root);
5641	if (ret < 0) {
5642		if (ret != -ENOENT)
5643			inode = ERR_PTR(ret);
5644		else
5645			inode = new_simple_dir(dir->i_sb, &location, sub_root);
5646	} else {
5647		inode = btrfs_iget(dir->i_sb, location.objectid, sub_root);
5648	}
5649	if (root != sub_root)
5650		btrfs_put_root(sub_root);
5651
5652	if (!IS_ERR(inode) && root != sub_root) {
5653		down_read(&fs_info->cleanup_work_sem);
5654		if (!sb_rdonly(inode->i_sb))
5655			ret = btrfs_orphan_cleanup(sub_root);
5656		up_read(&fs_info->cleanup_work_sem);
5657		if (ret) {
5658			iput(inode);
5659			inode = ERR_PTR(ret);
5660		}
5661	}
5662
5663	return inode;
5664}
5665
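/*
 * btrfs_dentry_delete - tell the dcache not to cache certain dentries
 *
 * Don't keep dentries that belong to a subvolume with no remaining root refs
 * or to the dummy empty-subvolume directory.
 */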
5666static int btrfs_dentry_delete(const struct dentry *dentry)
5667{
5668	struct btrfs_root *root;
5669	struct inode *inode = d_inode(dentry);
5670
5671	if (!inode && !IS_ROOT(dentry))
5672		inode = d_inode(dentry->d_parent);
5673
5674	if (inode) {
5675		root = BTRFS_I(inode)->root;
5676		if (btrfs_root_refs(&root->root_item) == 0)
5677			return 1;
5678
5679		if (btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
5680			return 1;
5681	}
5682	return 0;
5683}
5684
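/*
 * btrfs_lookup - the VFS ->lookup callback for btrfs directories
 *
 * A -ENOENT result from btrfs_lookup_dentry() is turned into a NULL inode so
 * that d_splice_alias() creates a negative dentry instead of failing.
 */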
5685static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
5686				   unsigned int flags)
5687{
5688	struct inode *inode = btrfs_lookup_dentry(dir, dentry);
5689
5690	if (inode == ERR_PTR(-ENOENT))
5691		inode = NULL;
5692	return d_splice_alias(inode, dentry);
5693}
5694
5695/*
5696 * All this infrastructure exists because dir_emit can fault, and we are holding
5697 * the tree lock when doing readdir.  For now just allocate a buffer and copy
5698 * our information into that, and then dir_emit from the buffer.  This is
5699 * similar to what NFS does, only we don't keep the buffer around in pagecache
5700 * because I'm afraid I'll mess that up.  Long term we need to make filldir do
5701 * copy_to_user_inatomic so we don't have to worry about page faulting under the
5702 * tree lock.
5703 */
5704static int btrfs_opendir(struct inode *inode, struct file *file)
5705{
5706	struct btrfs_file_private *private;
5707
5708	private = kzalloc(sizeof(struct btrfs_file_private), GFP_KERNEL);