inode.c revision e1646070
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Copyright (C) 2007 Oracle.  All rights reserved.
4 */
5
6#include <crypto/hash.h>
7#include <linux/kernel.h>
8#include <linux/bio.h>
9#include <linux/file.h>
10#include <linux/fs.h>
11#include <linux/pagemap.h>
12#include <linux/highmem.h>
13#include <linux/time.h>
14#include <linux/init.h>
15#include <linux/string.h>
16#include <linux/backing-dev.h>
17#include <linux/writeback.h>
18#include <linux/compat.h>
19#include <linux/xattr.h>
20#include <linux/posix_acl.h>
21#include <linux/falloc.h>
22#include <linux/slab.h>
23#include <linux/ratelimit.h>
24#include <linux/btrfs.h>
25#include <linux/blkdev.h>
26#include <linux/posix_acl_xattr.h>
27#include <linux/uio.h>
28#include <linux/magic.h>
29#include <linux/iversion.h>
30#include <linux/swap.h>
31#include <linux/migrate.h>
32#include <linux/sched/mm.h>
33#include <linux/iomap.h>
34#include <asm/unaligned.h>
35#include "misc.h"
36#include "ctree.h"
37#include "disk-io.h"
38#include "transaction.h"
39#include "btrfs_inode.h"
40#include "print-tree.h"
41#include "ordered-data.h"
42#include "xattr.h"
43#include "tree-log.h"
44#include "volumes.h"
45#include "compression.h"
46#include "locking.h"
47#include "free-space-cache.h"
48#include "props.h"
49#include "qgroup.h"
50#include "delalloc-space.h"
51#include "block-group.h"
52#include "space-info.h"
53#include "zoned.h"
54#include "subpage.h"
55
56struct btrfs_iget_args {
57	u64 ino;
58	struct btrfs_root *root;
59};
60
61struct btrfs_dio_data {
62	u64 reserve;
63	loff_t length;
64	ssize_t submitted;
65	struct extent_changeset *data_reserved;
66};
67
68static const struct inode_operations btrfs_dir_inode_operations;
69static const struct inode_operations btrfs_symlink_inode_operations;
70static const struct inode_operations btrfs_special_inode_operations;
71static const struct inode_operations btrfs_file_inode_operations;
72static const struct address_space_operations btrfs_aops;
73static const struct file_operations btrfs_dir_file_operations;
74
75static struct kmem_cache *btrfs_inode_cachep;
76struct kmem_cache *btrfs_trans_handle_cachep;
77struct kmem_cache *btrfs_path_cachep;
78struct kmem_cache *btrfs_free_space_cachep;
79struct kmem_cache *btrfs_free_space_bitmap_cachep;
80
81static int btrfs_setsize(struct inode *inode, struct iattr *attr);
82static int btrfs_truncate(struct inode *inode, bool skip_writeback);
83static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
84static noinline int cow_file_range(struct btrfs_inode *inode,
85				   struct page *locked_page,
86				   u64 start, u64 end, int *page_started,
87				   unsigned long *nr_written, int unlock);
88static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
89				       u64 len, u64 orig_start, u64 block_start,
90				       u64 block_len, u64 orig_block_len,
91				       u64 ram_bytes, int compress_type,
92				       int type);
93
94static void __endio_write_update_ordered(struct btrfs_inode *inode,
95					 const u64 offset, const u64 bytes,
96					 const bool uptodate);
97
98/*
99 * btrfs_inode_lock - lock inode i_rwsem based on arguments passed
100 *
101 * ilock_flags can have the following bit set:
102 *
103 * BTRFS_ILOCK_SHARED - acquire a shared lock on the inode
104 * BTRFS_ILOCK_TRY - try to acquire the lock, if fails on first attempt
105 *		     return -EAGAIN
106 * BTRFS_ILOCK_MMAP - acquire a write lock on the i_mmap_lock
107 */
108int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags)
109{
110	if (ilock_flags & BTRFS_ILOCK_SHARED) {
111		if (ilock_flags & BTRFS_ILOCK_TRY) {
112			if (!inode_trylock_shared(inode))
113				return -EAGAIN;
114			else
115				return 0;
116		}
117		inode_lock_shared(inode);
118	} else {
119		if (ilock_flags & BTRFS_ILOCK_TRY) {
120			if (!inode_trylock(inode))
121				return -EAGAIN;
122			else
123				return 0;
124		}
125		inode_lock(inode);
126	}
127	if (ilock_flags & BTRFS_ILOCK_MMAP)
128		down_write(&BTRFS_I(inode)->i_mmap_lock);
129	return 0;
130}
131
132/*
133 * btrfs_inode_unlock - unock inode i_rwsem
134 *
135 * ilock_flags should contain the same bits set as passed to btrfs_inode_lock()
136 * to decide whether the lock acquired is shared or exclusive.
137 */
138void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags)
139{
140	if (ilock_flags & BTRFS_ILOCK_MMAP)
141		up_write(&BTRFS_I(inode)->i_mmap_lock);
142	if (ilock_flags & BTRFS_ILOCK_SHARED)
143		inode_unlock_shared(inode);
144	else
145		inode_unlock(inode);
146}
147
148/*
149 * Cleanup all submitted ordered extents in specified range to handle errors
150 * from the btrfs_run_delalloc_range() callback.
151 *
152 * NOTE: caller must ensure that when an error happens, it can not call
153 * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
154 * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata
155 * to be released, which we want to happen only when finishing the ordered
156 * extent (btrfs_finish_ordered_io()).
157 */
158static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
159						 struct page *locked_page,
160						 u64 offset, u64 bytes)
161{
162	unsigned long index = offset >> PAGE_SHIFT;
163	unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
164	u64 page_start = page_offset(locked_page);
165	u64 page_end = page_start + PAGE_SIZE - 1;
166
167	struct page *page;
168
169	while (index <= end_index) {
170		/*
171		 * For locked page, we will call end_extent_writepage() on it
172		 * in run_delalloc_range() for the error handling.  That
173		 * end_extent_writepage() function will call
174		 * btrfs_mark_ordered_io_finished() to clear page Ordered and
175		 * run the ordered extent accounting.
176		 *
177		 * Here we can't just clear the Ordered bit, or
178		 * btrfs_mark_ordered_io_finished() would skip the accounting
179		 * for the page range, and the ordered extent will never finish.
180		 */
181		if (index == (page_offset(locked_page) >> PAGE_SHIFT)) {
182			index++;
183			continue;
184		}
185		page = find_get_page(inode->vfs_inode.i_mapping, index);
186		index++;
187		if (!page)
188			continue;
189
190		/*
191		 * Here we just clear all Ordered bits for every page in the
192		 * range, then __endio_write_update_ordered() will handle
193		 * the ordered extent accounting for the range.
194		 */
195		btrfs_page_clamp_clear_ordered(inode->root->fs_info, page,
196					       offset, bytes);
197		put_page(page);
198	}
199
200	/* The locked page covers the full range, nothing needs to be done */
201	if (bytes + offset <= page_offset(locked_page) + PAGE_SIZE)
202		return;
203	/*
204	 * In case this page belongs to the delalloc range being instantiated
205	 * then skip it, since the first page of a range is going to be
206	 * properly cleaned up by the caller of run_delalloc_range
207	 */
208	if (page_start >= offset && page_end <= (offset + bytes - 1)) {
209		bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE;
210		offset = page_offset(locked_page) + PAGE_SIZE;
211	}
212
213	return __endio_write_update_ordered(inode, offset, bytes, false);
214}
215
216static int btrfs_dirty_inode(struct inode *inode);
217
218static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
219				     struct inode *inode,  struct inode *dir,
220				     const struct qstr *qstr)
221{
222	int err;
223
224	err = btrfs_init_acl(trans, inode, dir);
225	if (!err)
226		err = btrfs_xattr_security_init(trans, inode, dir, qstr);
227	return err;
228}
229
230/*
231 * this does all the hard work for inserting an inline extent into
232 * the btree.  The caller should have done a btrfs_drop_extents so that
233 * no overlapping inline items exist in the btree
234 */
235static int insert_inline_extent(struct btrfs_trans_handle *trans,
236				struct btrfs_path *path, bool extent_inserted,
237				struct btrfs_root *root, struct inode *inode,
238				u64 start, size_t size, size_t compressed_size,
239				int compress_type,
240				struct page **compressed_pages)
241{
242	struct extent_buffer *leaf;
243	struct page *page = NULL;
244	char *kaddr;
245	unsigned long ptr;
246	struct btrfs_file_extent_item *ei;
247	int ret;
248	size_t cur_size = size;
249	unsigned long offset;
250
251	ASSERT((compressed_size > 0 && compressed_pages) ||
252	       (compressed_size == 0 && !compressed_pages));
253
254	if (compressed_size && compressed_pages)
255		cur_size = compressed_size;
256
257	if (!extent_inserted) {
258		struct btrfs_key key;
259		size_t datasize;
260
261		key.objectid = btrfs_ino(BTRFS_I(inode));
262		key.offset = start;
263		key.type = BTRFS_EXTENT_DATA_KEY;
264
265		datasize = btrfs_file_extent_calc_inline_size(cur_size);
266		ret = btrfs_insert_empty_item(trans, root, path, &key,
267					      datasize);
268		if (ret)
269			goto fail;
270	}
271	leaf = path->nodes[0];
272	ei = btrfs_item_ptr(leaf, path->slots[0],
273			    struct btrfs_file_extent_item);
274	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
275	btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
276	btrfs_set_file_extent_encryption(leaf, ei, 0);
277	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
278	btrfs_set_file_extent_ram_bytes(leaf, ei, size);
279	ptr = btrfs_file_extent_inline_start(ei);
280
281	if (compress_type != BTRFS_COMPRESS_NONE) {
282		struct page *cpage;
283		int i = 0;
284		while (compressed_size > 0) {
285			cpage = compressed_pages[i];
286			cur_size = min_t(unsigned long, compressed_size,
287				       PAGE_SIZE);
288
289			kaddr = page_address(cpage);
290			write_extent_buffer(leaf, kaddr, ptr, cur_size);
291
292			i++;
293			ptr += cur_size;
294			compressed_size -= cur_size;
295		}
296		btrfs_set_file_extent_compression(leaf, ei,
297						  compress_type);
298	} else {
299		page = find_get_page(inode->i_mapping,
300				     start >> PAGE_SHIFT);
301		btrfs_set_file_extent_compression(leaf, ei, 0);
302		kaddr = kmap_atomic(page);
303		offset = offset_in_page(start);
304		write_extent_buffer(leaf, kaddr + offset, ptr, size);
305		kunmap_atomic(kaddr);
306		put_page(page);
307	}
308	btrfs_mark_buffer_dirty(leaf);
309	btrfs_release_path(path);
310
311	/*
312	 * We align size to sectorsize for inline extents just for simplicity
313	 * sake.
314	 */
315	size = ALIGN(size, root->fs_info->sectorsize);
316	ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start, size);
317	if (ret)
318		goto fail;
319
320	/*
321	 * we're an inline extent, so nobody can
322	 * extend the file past i_size without locking
323	 * a page we already have locked.
324	 *
325	 * We must do any isize and inode updates
326	 * before we unlock the pages.  Otherwise we
327	 * could end up racing with unlink.
328	 */
329	BTRFS_I(inode)->disk_i_size = inode->i_size;
330fail:
331	return ret;
332}
333
334
335/*
336 * conditionally insert an inline extent into the file.  This
337 * does the checks required to make sure the data is small enough
338 * to fit as an inline extent.
339 */
340static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start,
341					  u64 end, size_t compressed_size,
342					  int compress_type,
343					  struct page **compressed_pages)
344{
345	struct btrfs_drop_extents_args drop_args = { 0 };
346	struct btrfs_root *root = inode->root;
347	struct btrfs_fs_info *fs_info = root->fs_info;
348	struct btrfs_trans_handle *trans;
349	u64 isize = i_size_read(&inode->vfs_inode);
350	u64 actual_end = min(end + 1, isize);
351	u64 inline_len = actual_end - start;
352	u64 aligned_end = ALIGN(end, fs_info->sectorsize);
353	u64 data_len = inline_len;
354	int ret;
355	struct btrfs_path *path;
356
357	if (compressed_size)
358		data_len = compressed_size;
359
360	if (start > 0 ||
361	    actual_end > fs_info->sectorsize ||
362	    data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) ||
363	    (!compressed_size &&
364	    (actual_end & (fs_info->sectorsize - 1)) == 0) ||
365	    end + 1 < isize ||
366	    data_len > fs_info->max_inline) {
367		return 1;
368	}
369
370	path = btrfs_alloc_path();
371	if (!path)
372		return -ENOMEM;
373
374	trans = btrfs_join_transaction(root);
375	if (IS_ERR(trans)) {
376		btrfs_free_path(path);
377		return PTR_ERR(trans);
378	}
379	trans->block_rsv = &inode->block_rsv;
380
381	drop_args.path = path;
382	drop_args.start = start;
383	drop_args.end = aligned_end;
384	drop_args.drop_cache = true;
385	drop_args.replace_extent = true;
386
387	if (compressed_size && compressed_pages)
388		drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(
389		   compressed_size);
390	else
391		drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(
392		    inline_len);
393
394	ret = btrfs_drop_extents(trans, root, inode, &drop_args);
395	if (ret) {
396		btrfs_abort_transaction(trans, ret);
397		goto out;
398	}
399
400	if (isize > actual_end)
401		inline_len = min_t(u64, isize, actual_end);
402	ret = insert_inline_extent(trans, path, drop_args.extent_inserted,
403				   root, &inode->vfs_inode, start,
404				   inline_len, compressed_size,
405				   compress_type, compressed_pages);
406	if (ret && ret != -ENOSPC) {
407		btrfs_abort_transaction(trans, ret);
408		goto out;
409	} else if (ret == -ENOSPC) {
410		ret = 1;
411		goto out;
412	}
413
414	btrfs_update_inode_bytes(inode, inline_len, drop_args.bytes_found);
415	ret = btrfs_update_inode(trans, root, inode);
416	if (ret && ret != -ENOSPC) {
417		btrfs_abort_transaction(trans, ret);
418		goto out;
419	} else if (ret == -ENOSPC) {
420		ret = 1;
421		goto out;
422	}
423
424	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
425out:
426	/*
427	 * Don't forget to free the reserved space, as for inlined extent
428	 * it won't count as data extent, free them directly here.
429	 * And at reserve time, it's always aligned to page size, so
430	 * just free one page here.
431	 */
432	btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE);
433	btrfs_free_path(path);
434	btrfs_end_transaction(trans);
435	return ret;
436}
437
438struct async_extent {
439	u64 start;
440	u64 ram_size;
441	u64 compressed_size;
442	struct page **pages;
443	unsigned long nr_pages;
444	int compress_type;
445	struct list_head list;
446};
447
448struct async_chunk {
449	struct inode *inode;
450	struct page *locked_page;
451	u64 start;
452	u64 end;
453	unsigned int write_flags;
454	struct list_head extents;
455	struct cgroup_subsys_state *blkcg_css;
456	struct btrfs_work work;
457	atomic_t *pending;
458};
459
460struct async_cow {
461	/* Number of chunks in flight; must be first in the structure */
462	atomic_t num_chunks;
463	struct async_chunk chunks[];
464};
465
466static noinline int add_async_extent(struct async_chunk *cow,
467				     u64 start, u64 ram_size,
468				     u64 compressed_size,
469				     struct page **pages,
470				     unsigned long nr_pages,
471				     int compress_type)
472{
473	struct async_extent *async_extent;
474
475	async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
476	BUG_ON(!async_extent); /* -ENOMEM */
477	async_extent->start = start;
478	async_extent->ram_size = ram_size;
479	async_extent->compressed_size = compressed_size;
480	async_extent->pages = pages;
481	async_extent->nr_pages = nr_pages;
482	async_extent->compress_type = compress_type;
483	list_add_tail(&async_extent->list, &cow->extents);
484	return 0;
485}
486
487/*
488 * Check if the inode has flags compatible with compression
489 */
490static inline bool inode_can_compress(struct btrfs_inode *inode)
491{
492	/* Subpage doesn't support compression yet */
493	if (inode->root->fs_info->sectorsize < PAGE_SIZE)
494		return false;
495	if (inode->flags & BTRFS_INODE_NODATACOW ||
496	    inode->flags & BTRFS_INODE_NODATASUM)
497		return false;
498	return true;
499}
500
501/*
502 * Check if the inode needs to be submitted to compression, based on mount
503 * options, defragmentation, properties or heuristics.
504 */
505static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
506				      u64 end)
507{
508	struct btrfs_fs_info *fs_info = inode->root->fs_info;
509
510	if (!inode_can_compress(inode)) {
511		WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
512			KERN_ERR "BTRFS: unexpected compression for ino %llu\n",
513			btrfs_ino(inode));
514		return 0;
515	}
516	/* force compress */
517	if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
518		return 1;
519	/* defrag ioctl */
520	if (inode->defrag_compress)
521		return 1;
522	/* bad compression ratios */
523	if (inode->flags & BTRFS_INODE_NOCOMPRESS)
524		return 0;
525	if (btrfs_test_opt(fs_info, COMPRESS) ||
526	    inode->flags & BTRFS_INODE_COMPRESS ||
527	    inode->prop_compress)
528		return btrfs_compress_heuristic(&inode->vfs_inode, start, end);
529	return 0;
530}
531
532static inline void inode_should_defrag(struct btrfs_inode *inode,
533		u64 start, u64 end, u64 num_bytes, u64 small_write)
534{
535	/* If this is a small write inside eof, kick off a defrag */
536	if (num_bytes < small_write &&
537	    (start > 0 || end + 1 < inode->disk_i_size))
538		btrfs_add_inode_defrag(NULL, inode);
539}
540
541/*
542 * we create compressed extents in two phases.  The first
543 * phase compresses a range of pages that have already been
544 * locked (both pages and state bits are locked).
545 *
546 * This is done inside an ordered work queue, and the compression
547 * is spread across many cpus.  The actual IO submission is step
548 * two, and the ordered work queue takes care of making sure that
549 * happens in the same order things were put onto the queue by
550 * writepages and friends.
551 *
552 * If this code finds it can't get good compression, it puts an
553 * entry onto the work queue to write the uncompressed bytes.  This
554 * makes sure that both compressed inodes and uncompressed inodes
555 * are written in the same order that the flusher thread sent them
556 * down.
557 */
558static noinline int compress_file_range(struct async_chunk *async_chunk)
559{
560	struct inode *inode = async_chunk->inode;
561	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
562	u64 blocksize = fs_info->sectorsize;
563	u64 start = async_chunk->start;
564	u64 end = async_chunk->end;
565	u64 actual_end;
566	u64 i_size;
567	int ret = 0;
568	struct page **pages = NULL;
569	unsigned long nr_pages;
570	unsigned long total_compressed = 0;
571	unsigned long total_in = 0;
572	int i;
573	int will_compress;
574	int compress_type = fs_info->compress_type;
575	int compressed_extents = 0;
576	int redirty = 0;
577
578	inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1,
579			SZ_16K);
580
581	/*
582	 * We need to save i_size before now because it could change in between
583	 * us evaluating the size and assigning it.  This is because we lock and
584	 * unlock the page in truncate and fallocate, and then modify the i_size
585	 * later on.
586	 *
587	 * The barriers are to emulate READ_ONCE, remove that once i_size_read
588	 * does that for us.
589	 */
590	barrier();
591	i_size = i_size_read(inode);
592	barrier();
593	actual_end = min_t(u64, i_size, end + 1);
594again:
595	will_compress = 0;
596	nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
597	BUILD_BUG_ON((BTRFS_MAX_COMPRESSED % PAGE_SIZE) != 0);
598	nr_pages = min_t(unsigned long, nr_pages,
599			BTRFS_MAX_COMPRESSED / PAGE_SIZE);
600
601	/*
602	 * we don't want to send crud past the end of i_size through
603	 * compression, that's just a waste of CPU time.  So, if the
604	 * end of the file is before the start of our current
605	 * requested range of bytes, we bail out to the uncompressed
606	 * cleanup code that can deal with all of this.
607	 *
608	 * It isn't really the fastest way to fix things, but this is a
609	 * very uncommon corner.
610	 */
611	if (actual_end <= start)
612		goto cleanup_and_bail_uncompressed;
613
614	total_compressed = actual_end - start;
615
616	/*
617	 * skip compression for a small file range(<=blocksize) that
618	 * isn't an inline extent, since it doesn't save disk space at all.
619	 */
620	if (total_compressed <= blocksize &&
621	   (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
622		goto cleanup_and_bail_uncompressed;
623
624	total_compressed = min_t(unsigned long, total_compressed,
625			BTRFS_MAX_UNCOMPRESSED);
626	total_in = 0;
627	ret = 0;
628
629	/*
630	 * we do compression for mount -o compress and when the
631	 * inode has not been flagged as nocompress.  This flag can
632	 * change at any time if we discover bad compression ratios.
633	 */
634	if (nr_pages > 1 && inode_need_compress(BTRFS_I(inode), start, end)) {
635		WARN_ON(pages);
636		pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
637		if (!pages) {
638			/* just bail out to the uncompressed code */
639			nr_pages = 0;
640			goto cont;
641		}
642
643		if (BTRFS_I(inode)->defrag_compress)
644			compress_type = BTRFS_I(inode)->defrag_compress;
645		else if (BTRFS_I(inode)->prop_compress)
646			compress_type = BTRFS_I(inode)->prop_compress;
647
648		/*
649		 * we need to call clear_page_dirty_for_io on each
650		 * page in the range.  Otherwise applications with the file
651		 * mmap'd can wander in and change the page contents while
652		 * we are compressing them.
653		 *
654		 * If the compression fails for any reason, we set the pages
655		 * dirty again later on.
656		 *
657		 * Note that the remaining part is redirtied, the start pointer
658		 * has moved, the end is the original one.
659		 */
660		if (!redirty) {
661			extent_range_clear_dirty_for_io(inode, start, end);
662			redirty = 1;
663		}
664
665		/* Compression level is applied here and only here */
666		ret = btrfs_compress_pages(
667			compress_type | (fs_info->compress_level << 4),
668					   inode->i_mapping, start,
669					   pages,
670					   &nr_pages,
671					   &total_in,
672					   &total_compressed);
673
674		if (!ret) {
675			unsigned long offset = offset_in_page(total_compressed);
676			struct page *page = pages[nr_pages - 1];
677
678			/* zero the tail end of the last page, we might be
679			 * sending it down to disk
680			 */
681			if (offset)
682				memzero_page(page, offset, PAGE_SIZE - offset);
683			will_compress = 1;
684		}
685	}
686cont:
687	/*
688	 * Check cow_file_range() for why we don't even try to create inline
689	 * extent for subpage case.
690	 */
691	if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {
692		/* lets try to make an inline extent */
693		if (ret || total_in < actual_end) {
694			/* we didn't compress the entire range, try
695			 * to make an uncompressed inline extent.
696			 */
697			ret = cow_file_range_inline(BTRFS_I(inode), start, end,
698						    0, BTRFS_COMPRESS_NONE,
699						    NULL);
700		} else {
701			/* try making a compressed inline extent */
702			ret = cow_file_range_inline(BTRFS_I(inode), start, end,
703						    total_compressed,
704						    compress_type, pages);
705		}
706		if (ret <= 0) {
707			unsigned long clear_flags = EXTENT_DELALLOC |
708				EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
709				EXTENT_DO_ACCOUNTING;
710			unsigned long page_error_op;
711
712			page_error_op = ret < 0 ? PAGE_SET_ERROR : 0;
713
714			/*
715			 * inline extent creation worked or returned error,
716			 * we don't need to create any more async work items.
717			 * Unlock and free up our temp pages.
718			 *
719			 * We use DO_ACCOUNTING here because we need the
720			 * delalloc_release_metadata to be done _after_ we drop
721			 * our outstanding extent for clearing delalloc for this
722			 * range.
723			 */
724			extent_clear_unlock_delalloc(BTRFS_I(inode), start, end,
725						     NULL,
726						     clear_flags,
727						     PAGE_UNLOCK |
728						     PAGE_START_WRITEBACK |
729						     page_error_op |
730						     PAGE_END_WRITEBACK);
731
732			/*
733			 * Ensure we only free the compressed pages if we have
734			 * them allocated, as we can still reach here with
735			 * inode_need_compress() == false.
736			 */
737			if (pages) {
738				for (i = 0; i < nr_pages; i++) {
739					WARN_ON(pages[i]->mapping);
740					put_page(pages[i]);
741				}
742				kfree(pages);
743			}
744			return 0;
745		}
746	}
747
748	if (will_compress) {
749		/*
750		 * we aren't doing an inline extent round the compressed size
751		 * up to a block size boundary so the allocator does sane
752		 * things
753		 */
754		total_compressed = ALIGN(total_compressed, blocksize);
755
756		/*
757		 * one last check to make sure the compression is really a
758		 * win, compare the page count read with the blocks on disk,
759		 * compression must free at least one sector size
760		 */
761		total_in = ALIGN(total_in, PAGE_SIZE);
762		if (total_compressed + blocksize <= total_in) {
763			compressed_extents++;
764
765			/*
766			 * The async work queues will take care of doing actual
767			 * allocation on disk for these compressed pages, and
768			 * will submit them to the elevator.
769			 */
770			add_async_extent(async_chunk, start, total_in,
771					total_compressed, pages, nr_pages,
772					compress_type);
773
774			if (start + total_in < end) {
775				start += total_in;
776				pages = NULL;
777				cond_resched();
778				goto again;
779			}
780			return compressed_extents;
781		}
782	}
783	if (pages) {
784		/*
785		 * the compression code ran but failed to make things smaller,
786		 * free any pages it allocated and our page pointer array
787		 */
788		for (i = 0; i < nr_pages; i++) {
789			WARN_ON(pages[i]->mapping);
790			put_page(pages[i]);
791		}
792		kfree(pages);
793		pages = NULL;
794		total_compressed = 0;
795		nr_pages = 0;
796
797		/* flag the file so we don't compress in the future */
798		if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) &&
799		    !(BTRFS_I(inode)->prop_compress)) {
800			BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
801		}
802	}
803cleanup_and_bail_uncompressed:
804	/*
805	 * No compression, but we still need to write the pages in the file
806	 * we've been given so far.  redirty the locked page if it corresponds
807	 * to our extent and set things up for the async work queue to run
808	 * cow_file_range to do the normal delalloc dance.
809	 */
810	if (async_chunk->locked_page &&
811	    (page_offset(async_chunk->locked_page) >= start &&
812	     page_offset(async_chunk->locked_page)) <= end) {
813		__set_page_dirty_nobuffers(async_chunk->locked_page);
814		/* unlocked later on in the async handlers */
815	}
816
817	if (redirty)
818		extent_range_redirty_for_io(inode, start, end);
819	add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0,
820			 BTRFS_COMPRESS_NONE);
821	compressed_extents++;
822
823	return compressed_extents;
824}
825
826static void free_async_extent_pages(struct async_extent *async_extent)
827{
828	int i;
829
830	if (!async_extent->pages)
831		return;
832
833	for (i = 0; i < async_extent->nr_pages; i++) {
834		WARN_ON(async_extent->pages[i]->mapping);
835		put_page(async_extent->pages[i]);
836	}
837	kfree(async_extent->pages);
838	async_extent->nr_pages = 0;
839	async_extent->pages = NULL;
840}
841
842/*
843 * phase two of compressed writeback.  This is the ordered portion
844 * of the code, which only gets called in the order the work was
845 * queued.  We walk all the async extents created by compress_file_range
846 * and send them down to the disk.
847 */
848static noinline void submit_compressed_extents(struct async_chunk *async_chunk)
849{
850	struct btrfs_inode *inode = BTRFS_I(async_chunk->inode);
851	struct btrfs_fs_info *fs_info = inode->root->fs_info;
852	struct async_extent *async_extent;
853	u64 alloc_hint = 0;
854	struct btrfs_key ins;
855	struct extent_map *em;
856	struct btrfs_root *root = inode->root;
857	struct extent_io_tree *io_tree = &inode->io_tree;
858	int ret = 0;
859
860again:
861	while (!list_empty(&async_chunk->extents)) {
862		async_extent = list_entry(async_chunk->extents.next,
863					  struct async_extent, list);
864		list_del(&async_extent->list);
865
866retry:
867		lock_extent(io_tree, async_extent->start,
868			    async_extent->start + async_extent->ram_size - 1);
869		/* did the compression code fall back to uncompressed IO? */
870		if (!async_extent->pages) {
871			int page_started = 0;
872			unsigned long nr_written = 0;
873
874			/* allocate blocks */
875			ret = cow_file_range(inode, async_chunk->locked_page,
876					     async_extent->start,
877					     async_extent->start +
878					     async_extent->ram_size - 1,
879					     &page_started, &nr_written, 0);
880
881			/* JDM XXX */
882
883			/*
884			 * if page_started, cow_file_range inserted an
885			 * inline extent and took care of all the unlocking
886			 * and IO for us.  Otherwise, we need to submit
887			 * all those pages down to the drive.
888			 */
889			if (!page_started && !ret)
890				extent_write_locked_range(&inode->vfs_inode,
891						  async_extent->start,
892						  async_extent->start +
893						  async_extent->ram_size - 1,
894						  WB_SYNC_ALL);
895			else if (ret && async_chunk->locked_page)
896				unlock_page(async_chunk->locked_page);
897			kfree(async_extent);
898			cond_resched();
899			continue;
900		}
901
902		ret = btrfs_reserve_extent(root, async_extent->ram_size,
903					   async_extent->compressed_size,
904					   async_extent->compressed_size,
905					   0, alloc_hint, &ins, 1, 1);
906		if (ret) {
907			free_async_extent_pages(async_extent);
908
909			if (ret == -ENOSPC) {
910				unlock_extent(io_tree, async_extent->start,
911					      async_extent->start +
912					      async_extent->ram_size - 1);
913
914				/*
915				 * we need to redirty the pages if we decide to
916				 * fallback to uncompressed IO, otherwise we
917				 * will not submit these pages down to lower
918				 * layers.
919				 */
920				extent_range_redirty_for_io(&inode->vfs_inode,
921						async_extent->start,
922						async_extent->start +
923						async_extent->ram_size - 1);
924
925				goto retry;
926			}
927			goto out_free;
928		}
929		/*
930		 * here we're doing allocation and writeback of the
931		 * compressed pages
932		 */
933		em = create_io_em(inode, async_extent->start,
934				  async_extent->ram_size, /* len */
935				  async_extent->start, /* orig_start */
936				  ins.objectid, /* block_start */
937				  ins.offset, /* block_len */
938				  ins.offset, /* orig_block_len */
939				  async_extent->ram_size, /* ram_bytes */
940				  async_extent->compress_type,
941				  BTRFS_ORDERED_COMPRESSED);
942		if (IS_ERR(em))
943			/* ret value is not necessary due to void function */
944			goto out_free_reserve;
945		free_extent_map(em);
946
947		ret = btrfs_add_ordered_extent_compress(inode,
948						async_extent->start,
949						ins.objectid,
950						async_extent->ram_size,
951						ins.offset,
952						async_extent->compress_type);
953		if (ret) {
954			btrfs_drop_extent_cache(inode, async_extent->start,
955						async_extent->start +
956						async_extent->ram_size - 1, 0);
957			goto out_free_reserve;
958		}
959		btrfs_dec_block_group_reservations(fs_info, ins.objectid);
960
961		/*
962		 * clear dirty, set writeback and unlock the pages.
963		 */
964		extent_clear_unlock_delalloc(inode, async_extent->start,
965				async_extent->start +
966				async_extent->ram_size - 1,
967				NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
968				PAGE_UNLOCK | PAGE_START_WRITEBACK);
969		if (btrfs_submit_compressed_write(inode, async_extent->start,
970				    async_extent->ram_size,
971				    ins.objectid,
972				    ins.offset, async_extent->pages,
973				    async_extent->nr_pages,
974				    async_chunk->write_flags,
975				    async_chunk->blkcg_css)) {
976			struct page *p = async_extent->pages[0];
977			const u64 start = async_extent->start;
978			const u64 end = start + async_extent->ram_size - 1;
979
980			p->mapping = inode->vfs_inode.i_mapping;
981			btrfs_writepage_endio_finish_ordered(inode, p, start,
982							     end, false);
983
984			p->mapping = NULL;
985			extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
986						     PAGE_END_WRITEBACK |
987						     PAGE_SET_ERROR);
988			free_async_extent_pages(async_extent);
989		}
990		alloc_hint = ins.objectid + ins.offset;
991		kfree(async_extent);
992		cond_resched();
993	}
994	return;
995out_free_reserve:
996	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
997	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
998out_free:
999	extent_clear_unlock_delalloc(inode, async_extent->start,
1000				     async_extent->start +
1001				     async_extent->ram_size - 1,
1002				     NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
1003				     EXTENT_DELALLOC_NEW |
1004				     EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
1005				     PAGE_UNLOCK | PAGE_START_WRITEBACK |
1006				     PAGE_END_WRITEBACK | PAGE_SET_ERROR);
1007	free_async_extent_pages(async_extent);
1008	kfree(async_extent);
1009	goto again;
1010}
1011
1012static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
1013				      u64 num_bytes)
1014{
1015	struct extent_map_tree *em_tree = &inode->extent_tree;
1016	struct extent_map *em;
1017	u64 alloc_hint = 0;
1018
1019	read_lock(&em_tree->lock);
1020	em = search_extent_mapping(em_tree, start, num_bytes);
1021	if (em) {
1022		/*
1023		 * if block start isn't an actual block number then find the
1024		 * first block in this inode and use that as a hint.  If that
1025		 * block is also bogus then just don't worry about it.
1026		 */
1027		if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
1028			free_extent_map(em);
1029			em = search_extent_mapping(em_tree, 0, 0);
1030			if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
1031				alloc_hint = em->block_start;
1032			if (em)
1033				free_extent_map(em);
1034		} else {
1035			alloc_hint = em->block_start;
1036			free_extent_map(em);
1037		}
1038	}
1039	read_unlock(&em_tree->lock);
1040
1041	return alloc_hint;
1042}
1043
1044/*
1045 * when extent_io.c finds a delayed allocation range in the file,
1046 * the call backs end up in this code.  The basic idea is to
1047 * allocate extents on disk for the range, and create ordered data structs
1048 * in ram to track those extents.
1049 *
1050 * locked_page is the page that writepage had locked already.  We use
1051 * it to make sure we don't do extra locks or unlocks.
1052 *
1053 * *page_started is set to one if we unlock locked_page and do everything
1054 * required to start IO on it.  It may be clean and already done with
1055 * IO when we return.
1056 */
1057static noinline int cow_file_range(struct btrfs_inode *inode,
1058				   struct page *locked_page,
1059				   u64 start, u64 end, int *page_started,
1060				   unsigned long *nr_written, int unlock)
1061{
1062	struct btrfs_root *root = inode->root;
1063	struct btrfs_fs_info *fs_info = root->fs_info;
1064	u64 alloc_hint = 0;
1065	u64 num_bytes;
1066	unsigned long ram_size;
1067	u64 cur_alloc_size = 0;
1068	u64 min_alloc_size;
1069	u64 blocksize = fs_info->sectorsize;
1070	struct btrfs_key ins;
1071	struct extent_map *em;
1072	unsigned clear_bits;
1073	unsigned long page_ops;
1074	bool extent_reserved = false;
1075	int ret = 0;
1076
1077	if (btrfs_is_free_space_inode(inode)) {
1078		WARN_ON_ONCE(1);
1079		ret = -EINVAL;
1080		goto out_unlock;
1081	}
1082
1083	num_bytes = ALIGN(end - start + 1, blocksize);
1084	num_bytes = max(blocksize,  num_bytes);
1085	ASSERT(num_bytes <= btrfs_super_total_bytes(fs_info->super_copy));
1086
1087	inode_should_defrag(inode, start, end, num_bytes, SZ_64K);
1088
1089	/*
1090	 * Due to the page size limit, for subpage we can only trigger the
1091	 * writeback for the dirty sectors of page, that means data writeback
1092	 * is doing more writeback than what we want.
1093	 *
1094	 * This is especially unexpected for some call sites like fallocate,
1095	 * where we only increase i_size after everything is done.
1096	 * This means we can trigger inline extent even if we didn't want to.
1097	 * So here we skip inline extent creation completely.
1098	 */
1099	if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {
1100		/* lets try to make an inline extent */
1101		ret = cow_file_range_inline(inode, start, end, 0,
1102					    BTRFS_COMPRESS_NONE, NULL);
1103		if (ret == 0) {
1104			/*
1105			 * We use DO_ACCOUNTING here because we need the
1106			 * delalloc_release_metadata to be run _after_ we drop
1107			 * our outstanding extent for clearing delalloc for this
1108			 * range.
1109			 */
1110			extent_clear_unlock_delalloc(inode, start, end,
1111				     locked_page,
1112				     EXTENT_LOCKED | EXTENT_DELALLOC |
1113				     EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
1114				     EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
1115				     PAGE_START_WRITEBACK | PAGE_END_WRITEBACK);
1116			*nr_written = *nr_written +
1117			     (end - start + PAGE_SIZE) / PAGE_SIZE;
1118			*page_started = 1;
1119			/*
1120			 * locked_page is locked by the caller of
1121			 * writepage_delalloc(), not locked by
1122			 * __process_pages_contig().
1123			 *
1124			 * We can't let __process_pages_contig() to unlock it,
1125			 * as it doesn't have any subpage::writers recorded.
1126			 *
1127			 * Here we manually unlock the page, since the caller
1128			 * can't use page_started to determine if it's an
1129			 * inline extent or a compressed extent.
1130			 */
1131			unlock_page(locked_page);
1132			goto out;
1133		} else if (ret < 0) {
1134			goto out_unlock;
1135		}
1136	}
1137
1138	alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
1139	btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
1140
1141	/*
1142	 * Relocation relies on the relocated extents to have exactly the same
1143	 * size as the original extents. Normally writeback for relocation data
1144	 * extents follows a NOCOW path because relocation preallocates the
1145	 * extents. However, due to an operation such as scrub turning a block
1146	 * group to RO mode, it may fallback to COW mode, so we must make sure
1147	 * an extent allocated during COW has exactly the requested size and can
1148	 * not be split into smaller extents, otherwise relocation breaks and
1149	 * fails during the stage where it updates the bytenr of file extent
1150	 * items.
1151	 */
1152	if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
1153		min_alloc_size = num_bytes;
1154	else
1155		min_alloc_size = fs_info->sectorsize;
1156
1157	while (num_bytes > 0) {
1158		cur_alloc_size = num_bytes;
1159		ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
1160					   min_alloc_size, 0, alloc_hint,
1161					   &ins, 1, 1);
1162		if (ret < 0)
1163			goto out_unlock;
1164		cur_alloc_size = ins.offset;
1165		extent_reserved = true;
1166
1167		ram_size = ins.offset;
1168		em = create_io_em(inode, start, ins.offset, /* len */
1169				  start, /* orig_start */
1170				  ins.objectid, /* block_start */
1171				  ins.offset, /* block_len */
1172				  ins.offset, /* orig_block_len */
1173				  ram_size, /* ram_bytes */
1174				  BTRFS_COMPRESS_NONE, /* compress_type */
1175				  BTRFS_ORDERED_REGULAR /* type */);
1176		if (IS_ERR(em)) {
1177			ret = PTR_ERR(em);
1178			goto out_reserve;
1179		}
1180		free_extent_map(em);
1181
1182		ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
1183					       ram_size, cur_alloc_size,
1184					       BTRFS_ORDERED_REGULAR);
1185		if (ret)
1186			goto out_drop_extent_cache;
1187
1188		if (root->root_key.objectid ==
1189		    BTRFS_DATA_RELOC_TREE_OBJECTID) {
1190			ret = btrfs_reloc_clone_csums(inode, start,
1191						      cur_alloc_size);
1192			/*
1193			 * Only drop cache here, and process as normal.
1194			 *
1195			 * We must not allow extent_clear_unlock_delalloc()
1196			 * at out_unlock label to free meta of this ordered
1197			 * extent, as its meta should be freed by
1198			 * btrfs_finish_ordered_io().
1199			 *
1200			 * So we must continue until @start is increased to
1201			 * skip current ordered extent.
1202			 */
1203			if (ret)
1204				btrfs_drop_extent_cache(inode, start,
1205						start + ram_size - 1, 0);
1206		}
1207
1208		btrfs_dec_block_group_reservations(fs_info, ins.objectid);
1209
1210		/*
1211		 * We're not doing compressed IO, don't unlock the first page
1212		 * (which the caller expects to stay locked), don't clear any
1213		 * dirty bits and don't set any writeback bits
1214		 *
1215		 * Do set the Ordered (Private2) bit so we know this page was
1216		 * properly setup for writepage.
1217		 */
1218		page_ops = unlock ? PAGE_UNLOCK : 0;
1219		page_ops |= PAGE_SET_ORDERED;
1220
1221		extent_clear_unlock_delalloc(inode, start, start + ram_size - 1,
1222					     locked_page,
1223					     EXTENT_LOCKED | EXTENT_DELALLOC,
1224					     page_ops);
1225		if (num_bytes < cur_alloc_size)
1226			num_bytes = 0;
1227		else
1228			num_bytes -= cur_alloc_size;
1229		alloc_hint = ins.objectid + ins.offset;
1230		start += cur_alloc_size;
1231		extent_reserved = false;
1232
1233		/*
1234		 * btrfs_reloc_clone_csums() error, since start is increased
1235		 * extent_clear_unlock_delalloc() at out_unlock label won't
1236		 * free metadata of current ordered extent, we're OK to exit.
1237		 */
1238		if (ret)
1239			goto out_unlock;
1240	}
1241out:
1242	return ret;
1243
1244out_drop_extent_cache:
1245	btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0);
1246out_reserve:
1247	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
1248	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
1249out_unlock:
1250	clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
1251		EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
1252	page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
1253	/*
1254	 * If we reserved an extent for our delalloc range (or a subrange) and
1255	 * failed to create the respective ordered extent, then it means that
1256	 * when we reserved the extent we decremented the extent's size from
1257	 * the data space_info's bytes_may_use counter and incremented the
1258	 * space_info's bytes_reserved counter by the same amount. We must make
1259	 * sure extent_clear_unlock_delalloc() does not try to decrement again
1260	 * the data space_info's bytes_may_use counter, therefore we do not pass
1261	 * it the flag EXTENT_CLEAR_DATA_RESV.
1262	 */
1263	if (extent_reserved) {
1264		extent_clear_unlock_delalloc(inode, start,
1265					     start + cur_alloc_size - 1,
1266					     locked_page,
1267					     clear_bits,
1268					     page_ops);
1269		start += cur_alloc_size;
1270		if (start >= end)
1271			goto out;
1272	}
1273	extent_clear_unlock_delalloc(inode, start, end, locked_page,
1274				     clear_bits | EXTENT_CLEAR_DATA_RESV,
1275				     page_ops);
1276	goto out;
1277}
1278
1279/*
1280 * work queue call back to started compression on a file and pages
1281 */
1282static noinline void async_cow_start(struct btrfs_work *work)
1283{
1284	struct async_chunk *async_chunk;
1285	int compressed_extents;
1286
1287	async_chunk = container_of(work, struct async_chunk, work);
1288
1289	compressed_extents = compress_file_range(async_chunk);
1290	if (compressed_extents == 0) {
1291		btrfs_add_delayed_iput(async_chunk->inode);
1292		async_chunk->inode = NULL;
1293	}
1294}
1295
1296/*
1297 * work queue call back to submit previously compressed pages
1298 */
1299static noinline void async_cow_submit(struct btrfs_work *work)
1300{
1301	struct async_chunk *async_chunk = container_of(work, struct async_chunk,
1302						     work);
1303	struct btrfs_fs_info *fs_info = btrfs_work_owner(work);
1304	unsigned long nr_pages;
1305
1306	nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >>
1307		PAGE_SHIFT;
1308
1309	/*
1310	 * ->inode could be NULL if async_chunk_start has failed to compress,
1311	 * in which case we don't have anything to submit, yet we need to
1312	 * always adjust ->async_delalloc_pages as its paired with the init
1313	 * happening in cow_file_range_async
1314	 */
1315	if (async_chunk->inode)
1316		submit_compressed_extents(async_chunk);
1317
1318	/* atomic_sub_return implies a barrier */
1319	if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
1320	    5 * SZ_1M)
1321		cond_wake_up_nomb(&fs_info->async_submit_wait);
1322}
1323
1324static noinline void async_cow_free(struct btrfs_work *work)
1325{
1326	struct async_chunk *async_chunk;
1327
1328	async_chunk = container_of(work, struct async_chunk, work);
1329	if (async_chunk->inode)
1330		btrfs_add_delayed_iput(async_chunk->inode);
1331	if (async_chunk->blkcg_css)
1332		css_put(async_chunk->blkcg_css);
1333	/*
1334	 * Since the pointer to 'pending' is at the beginning of the array of
1335	 * async_chunk's, freeing it ensures the whole array has been freed.
1336	 */
1337	if (atomic_dec_and_test(async_chunk->pending))
1338		kvfree(async_chunk->pending);
1339}
1340
1341static int cow_file_range_async(struct btrfs_inode *inode,
1342				struct writeback_control *wbc,
1343				struct page *locked_page,
1344				u64 start, u64 end, int *page_started,
1345				unsigned long *nr_written)
1346{
1347	struct btrfs_fs_info *fs_info = inode->root->fs_info;
1348	struct cgroup_subsys_state *blkcg_css = wbc_blkcg_css(wbc);
1349	struct async_cow *ctx;
1350	struct async_chunk *async_chunk;
1351	unsigned long nr_pages;
1352	u64 cur_end;
1353	u64 num_chunks = DIV_ROUND_UP(end - start, SZ_512K);
1354	int i;
1355	bool should_compress;
1356	unsigned nofs_flag;
1357	const unsigned int write_flags = wbc_to_write_flags(wbc);
1358
1359	unlock_extent(&inode->io_tree, start, end);
1360
1361	if (inode->flags & BTRFS_INODE_NOCOMPRESS &&
1362	    !btrfs_test_opt(fs_info, FORCE_COMPRESS)) {
1363		num_chunks = 1;
1364		should_compress = false;
1365	} else {
1366		should_compress = true;
1367	}
1368
1369	nofs_flag = memalloc_nofs_save();
1370	ctx = kvmalloc(struct_size(ctx, chunks, num_chunks), GFP_KERNEL);
1371	memalloc_nofs_restore(nofs_flag);
1372
1373	if (!ctx) {
1374		unsigned clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC |
1375			EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
1376			EXTENT_DO_ACCOUNTING;
1377		unsigned long page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK |
1378					 PAGE_END_WRITEBACK | PAGE_SET_ERROR;
1379
1380		extent_clear_unlock_delalloc(inode, start, end, locked_page,
1381					     clear_bits, page_ops);
1382		return -ENOMEM;
1383	}
1384
1385	async_chunk = ctx->chunks;
1386	atomic_set(&ctx->num_chunks, num_chunks);
1387
1388	for (i = 0; i < num_chunks; i++) {
1389		if (should_compress)
1390			cur_end = min(end, start + SZ_512K - 1);
1391		else
1392			cur_end = end;
1393
1394		/*
1395		 * igrab is called higher up in the call chain, take only the
1396		 * lightweight reference for the callback lifetime
1397		 */
1398		ihold(&inode->vfs_inode);
1399		async_chunk[i].pending = &ctx->num_chunks;
1400		async_chunk[i].inode = &inode->vfs_inode;
1401		async_chunk[i].start = start;
1402		async_chunk[i].end = cur_end;
1403		async_chunk[i].write_flags = write_flags;
1404		INIT_LIST_HEAD(&async_chunk[i].extents);
1405
1406		/*
1407		 * The locked_page comes all the way from writepage and its
1408		 * the original page we were actually given.  As we spread
1409		 * this large delalloc region across multiple async_chunk
1410		 * structs, only the first struct needs a pointer to locked_page
1411		 *
1412		 * This way we don't need racey decisions about who is supposed
1413		 * to unlock it.
1414		 */
1415		if (locked_page) {
1416			/*
1417			 * Depending on the compressibility, the pages might or
1418			 * might not go through async.  We want all of them to
1419			 * be accounted against wbc once.  Let's do it here
1420			 * before the paths diverge.  wbc accounting is used
1421			 * only for foreign writeback detection and doesn't
1422			 * need full accuracy.  Just account the whole thing
1423			 * against the first page.
1424			 */
1425			wbc_account_cgroup_owner(wbc, locked_page,
1426						 cur_end - start);
1427			async_chunk[i].locked_page = locked_page;
1428			locked_page = NULL;
1429		} else {
1430			async_chunk[i].locked_page = NULL;
1431		}
1432
1433		if (blkcg_css != blkcg_root_css) {
1434			css_get(blkcg_css);
1435			async_chunk[i].blkcg_css = blkcg_css;
1436		} else {
1437			async_chunk[i].blkcg_css = NULL;
1438		}
1439
1440		btrfs_init_work(&async_chunk[i].work, async_cow_start,
1441				async_cow_submit, async_cow_free);
1442
1443		nr_pages = DIV_ROUND_UP(cur_end - start, PAGE_SIZE);
1444		atomic_add(nr_pages, &fs_info->async_delalloc_pages);
1445
1446		btrfs_queue_work(fs_info->delalloc_workers, &async_chunk[i].work);
1447
1448		*nr_written += nr_pages;
1449		start = cur_end + 1;
1450	}
1451	*page_started = 1;
1452	return 0;
1453}
1454
1455static noinline int run_delalloc_zoned(struct btrfs_inode *inode,
1456				       struct page *locked_page, u64 start,
1457				       u64 end, int *page_started,
1458				       unsigned long *nr_written)
1459{
1460	int ret;
1461
1462	ret = cow_file_range(inode, locked_page, start, end, page_started,
1463			     nr_written, 0);
1464	if (ret)
1465		return ret;
1466
1467	if (*page_started)
1468		return 0;
1469
1470	__set_page_dirty_nobuffers(locked_page);
1471	account_page_redirty(locked_page);
1472	extent_write_locked_range(&inode->vfs_inode, start, end, WB_SYNC_ALL);
1473	*page_started = 1;
1474
1475	return 0;
1476}
1477
1478static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
1479					u64 bytenr, u64 num_bytes)
1480{
1481	int ret;
1482	struct btrfs_ordered_sum *sums;
1483	LIST_HEAD(list);
1484
1485	ret = btrfs_lookup_csums_range(fs_info->csum_root, bytenr,
1486				       bytenr + num_bytes - 1, &list, 0);
1487	if (ret == 0 && list_empty(&list))
1488		return 0;
1489
1490	while (!list_empty(&list)) {
1491		sums = list_entry(list.next, struct btrfs_ordered_sum, list);
1492		list_del(&sums->list);
1493		kfree(sums);
1494	}
1495	if (ret < 0)
1496		return ret;
1497	return 1;
1498}
1499
1500static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
1501			   const u64 start, const u64 end,
1502			   int *page_started, unsigned long *nr_written)
1503{
1504	const bool is_space_ino = btrfs_is_free_space_inode(inode);
1505	const bool is_reloc_ino = (inode->root->root_key.objectid ==
1506				   BTRFS_DATA_RELOC_TREE_OBJECTID);
1507	const u64 range_bytes = end + 1 - start;
1508	struct extent_io_tree *io_tree = &inode->io_tree;
1509	u64 range_start = start;
1510	u64 count;
1511
1512	/*
1513	 * If EXTENT_NORESERVE is set it means that when the buffered write was
1514	 * made we had not enough available data space and therefore we did not
1515	 * reserve data space for it, since we though we could do NOCOW for the
1516	 * respective file range (either there is prealloc extent or the inode
1517	 * has the NOCOW bit set).
1518	 *
1519	 * However when we need to fallback to COW mode (because for example the
1520	 * block group for the corresponding extent was turned to RO mode by a
1521	 * scrub or relocation) we need to do the following:
1522	 *
1523	 * 1) We increment the bytes_may_use counter of the data space info.
1524	 *    If COW succeeds, it allocates a new data extent and after doing
1525	 *    that it decrements the space info's bytes_may_use counter and
1526	 *    increments its bytes_reserved counter by the same amount (we do
1527	 *    this at btrfs_add_reserved_bytes()). So we need to increment the
1528	 *    bytes_may_use counter to compensate (when space is reserved at
1529	 *    buffered write time, the bytes_may_use counter is incremented);
1530	 *
1531	 * 2) We clear the EXTENT_NORESERVE bit from the range. We do this so
1532	 *    that if the COW path fails for any reason, it decrements (through
1533	 *    extent_clear_unlock_delalloc()) the bytes_may_use counter of the
1534	 *    data space info, which we incremented in the step above.
1535	 *
1536	 * If we need to fallback to cow and the inode corresponds to a free
1537	 * space cache inode or an inode of the data relocation tree, we must
1538	 * also increment bytes_may_use of the data space_info for the same
1539	 * reason. Space caches and relocated data extents always get a prealloc
1540	 * extent for them, however scrub or balance may have set the block
1541	 * group that contains that extent to RO mode and therefore force COW
1542	 * when starting writeback.
1543	 */
1544	count = count_range_bits(io_tree, &range_start, end, range_bytes,
1545				 EXTENT_NORESERVE, 0);
1546	if (count > 0 || is_space_ino || is_reloc_ino) {
1547		u64 bytes = count;
1548		struct btrfs_fs_info *fs_info = inode->root->fs_info;
1549		struct btrfs_space_info *sinfo = fs_info->data_sinfo;
1550
1551		if (is_space_ino || is_reloc_ino)
1552			bytes = range_bytes;
1553
1554		spin_lock(&sinfo->lock);
1555		btrfs_space_info_update_bytes_may_use(fs_info, sinfo, bytes);
1556		spin_unlock(&sinfo->lock);
1557
1558		if (count > 0)
1559			clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE,
1560					 0, 0, NULL);
1561	}
1562
1563	return cow_file_range(inode, locked_page, start, end, page_started,
1564			      nr_written, 1);
1565}
1566
1567/*
1568 * when nowcow writeback call back.  This checks for snapshots or COW copies
1569 * of the extents that exist in the file, and COWs the file as required.
1570 *
1571 * If no cow copies or snapshots exist, we write directly to the existing
1572 * blocks on disk
1573 */
1574static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
1575				       struct page *locked_page,
1576				       const u64 start, const u64 end,
1577				       int *page_started,
1578				       unsigned long *nr_written)
1579{
1580	struct btrfs_fs_info *fs_info = inode->root->fs_info;
1581	struct btrfs_root *root = inode->root;
1582	struct btrfs_path *path;
1583	u64 cow_start = (u64)-1;
1584	u64 cur_offset = start;
1585	int ret;
1586	bool check_prev = true;
1587	const bool freespace_inode = btrfs_is_free_space_inode(inode);
1588	u64 ino = btrfs_ino(inode);
1589	bool nocow = false;
1590	u64 disk_bytenr = 0;
1591	const bool force = inode->flags & BTRFS_INODE_NODATACOW;
1592
1593	path = btrfs_alloc_path();
1594	if (!path) {
1595		extent_clear_unlock_delalloc(inode, start, end, locked_page,
1596					     EXTENT_LOCKED | EXTENT_DELALLOC |
1597					     EXTENT_DO_ACCOUNTING |
1598					     EXTENT_DEFRAG, PAGE_UNLOCK |
1599					     PAGE_START_WRITEBACK |
1600					     PAGE_END_WRITEBACK);
1601		return -ENOMEM;
1602	}
1603
1604	while (1) {
1605		struct btrfs_key found_key;
1606		struct btrfs_file_extent_item *fi;
1607		struct extent_buffer *leaf;
1608		u64 extent_end;
1609		u64 extent_offset;
1610		u64 num_bytes = 0;
1611		u64 disk_num_bytes;
1612		u64 ram_bytes;
1613		int extent_type;
1614
1615		nocow = false;
1616
1617		ret = btrfs_lookup_file_extent(NULL, root, path, ino,
1618					       cur_offset, 0);
1619		if (ret < 0)
1620			goto error;
1621
1622		/*
1623		 * If there is no extent for our range when doing the initial
1624		 * search, then go back to the previous slot as it will be the
1625		 * one containing the search offset
1626		 */
1627		if (ret > 0 && path->slots[0] > 0 && check_prev) {
1628			leaf = path->nodes[0];
1629			btrfs_item_key_to_cpu(leaf, &found_key,
1630					      path->slots[0] - 1);
1631			if (found_key.objectid == ino &&
1632			    found_key.type == BTRFS_EXTENT_DATA_KEY)
1633				path->slots[0]--;
1634		}
1635		check_prev = false;
1636next_slot:
1637		/* Go to next leaf if we have exhausted the current one */
1638		leaf = path->nodes[0];
1639		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1640			ret = btrfs_next_leaf(root, path);
1641			if (ret < 0) {
1642				if (cow_start != (u64)-1)
1643					cur_offset = cow_start;
1644				goto error;
1645			}
1646			if (ret > 0)
1647				break;
1648			leaf = path->nodes[0];
1649		}
1650
1651		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1652
1653		/* Didn't find anything for our INO */
1654		if (found_key.objectid > ino)
1655			break;
1656		/*
1657		 * Keep searching until we find an EXTENT_ITEM or there are no
1658		 * more extents for this inode
1659		 */
1660		if (WARN_ON_ONCE(found_key.objectid < ino) ||
1661		    found_key.type < BTRFS_EXTENT_DATA_KEY) {
1662			path->slots[0]++;
1663			goto next_slot;
1664		}
1665
1666		/* Found key is not EXTENT_DATA_KEY or starts after req range */
1667		if (found_key.type > BTRFS_EXTENT_DATA_KEY ||
1668		    found_key.offset > end)
1669			break;
1670
1671		/*
1672		 * If the found extent starts after requested offset, then
1673		 * adjust extent_end to be right before this extent begins
1674		 */
1675		if (found_key.offset > cur_offset) {
1676			extent_end = found_key.offset;
1677			extent_type = 0;
1678			goto out_check;
1679		}
1680
1681		/*
1682		 * Found extent which begins before our range and potentially
1683		 * intersect it
1684		 */
1685		fi = btrfs_item_ptr(leaf, path->slots[0],
1686				    struct btrfs_file_extent_item);
1687		extent_type = btrfs_file_extent_type(leaf, fi);
1688
1689		ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
1690		if (extent_type == BTRFS_FILE_EXTENT_REG ||
1691		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1692			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1693			extent_offset = btrfs_file_extent_offset(leaf, fi);
1694			extent_end = found_key.offset +
1695				btrfs_file_extent_num_bytes(leaf, fi);
1696			disk_num_bytes =
1697				btrfs_file_extent_disk_num_bytes(leaf, fi);
1698			/*
1699			 * If the extent we got ends before our current offset,
1700			 * skip to the next extent.
1701			 */
1702			if (extent_end <= cur_offset) {
1703				path->slots[0]++;
1704				goto next_slot;
1705			}
1706			/* Skip holes */
1707			if (disk_bytenr == 0)
1708				goto out_check;
1709			/* Skip compressed/encrypted/encoded extents */
1710			if (btrfs_file_extent_compression(leaf, fi) ||
1711			    btrfs_file_extent_encryption(leaf, fi) ||
1712			    btrfs_file_extent_other_encoding(leaf, fi))
1713				goto out_check;
1714			/*
1715			 * If extent is created before the last volume's snapshot
1716			 * this implies the extent is shared, hence we can't do
1717			 * nocow. This is the same check as in
1718			 * btrfs_cross_ref_exist but without calling
1719			 * btrfs_search_slot.
1720			 */
1721			if (!freespace_inode &&
1722			    btrfs_file_extent_generation(leaf, fi) <=
1723			    btrfs_root_last_snapshot(&root->root_item))
1724				goto out_check;
1725			if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
1726				goto out_check;
1727
1728			/*
1729			 * The following checks can be expensive, as they need to
1730			 * take other locks and do btree or rbtree searches, so
1731			 * release the path to avoid blocking other tasks for too
1732			 * long.
1733			 */
1734			btrfs_release_path(path);
1735
1736			ret = btrfs_cross_ref_exist(root, ino,
1737						    found_key.offset -
1738						    extent_offset, disk_bytenr, false);
1739			if (ret) {
1740				/*
1741				 * ret could be -EIO if the above fails to read
1742				 * metadata.
1743				 */
1744				if (ret < 0) {
1745					if (cow_start != (u64)-1)
1746						cur_offset = cow_start;
1747					goto error;
1748				}
1749
1750				WARN_ON_ONCE(freespace_inode);
1751				goto out_check;
1752			}
1753			disk_bytenr += extent_offset;
1754			disk_bytenr += cur_offset - found_key.offset;
1755			num_bytes = min(end + 1, extent_end) - cur_offset;
1756			/*
1757			 * If there are pending snapshots for this root, we
1758			 * fall into common COW way
1759			 */
1760			if (!freespace_inode && atomic_read(&root->snapshot_force_cow))
1761				goto out_check;
1762			/*
1763			 * force cow if csum exists in the range.
1764			 * this ensure that csum for a given extent are
1765			 * either valid or do not exist.
1766			 */
1767			ret = csum_exist_in_range(fs_info, disk_bytenr,
1768						  num_bytes);
1769			if (ret) {
1770				/*
1771				 * ret could be -EIO if the above fails to read
1772				 * metadata.
1773				 */
1774				if (ret < 0) {
1775					if (cow_start != (u64)-1)
1776						cur_offset = cow_start;
1777					goto error;
1778				}
1779				WARN_ON_ONCE(freespace_inode);
1780				goto out_check;
1781			}
1782			/* If the extent's block group is RO, we must COW */
1783			if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr))
1784				goto out_check;
1785			nocow = true;
1786		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
1787			extent_end = found_key.offset + ram_bytes;
1788			extent_end = ALIGN(extent_end, fs_info->sectorsize);
1789			/* Skip extents outside of our requested range */
1790			if (extent_end <= start) {
1791				path->slots[0]++;
1792				goto next_slot;
1793			}
1794		} else {
1795			/* If this triggers then we have a memory corruption */
1796			BUG();
1797		}
1798out_check:
1799		/*
1800		 * If nocow is false then record the beginning of the range
1801		 * that needs to be COWed
1802		 */
1803		if (!nocow) {
1804			if (cow_start == (u64)-1)
1805				cow_start = cur_offset;
1806			cur_offset = extent_end;
1807			if (cur_offset > end)
1808				break;
1809			if (!path->nodes[0])
1810				continue;
1811			path->slots[0]++;
1812			goto next_slot;
1813		}
1814
1815		/*
1816		 * COW range from cow_start to found_key.offset - 1. As the key
1817		 * will contain the beginning of the first extent that can be
1818		 * NOCOW, following one which needs to be COW'ed
1819		 */
1820		if (cow_start != (u64)-1) {
1821			ret = fallback_to_cow(inode, locked_page,
1822					      cow_start, found_key.offset - 1,
1823					      page_started, nr_written);
1824			if (ret)
1825				goto error;
1826			cow_start = (u64)-1;
1827		}
1828
1829		if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1830			u64 orig_start = found_key.offset - extent_offset;
1831			struct extent_map *em;
1832
1833			em = create_io_em(inode, cur_offset, num_bytes,
1834					  orig_start,
1835					  disk_bytenr, /* block_start */
1836					  num_bytes, /* block_len */
1837					  disk_num_bytes, /* orig_block_len */
1838					  ram_bytes, BTRFS_COMPRESS_NONE,
1839					  BTRFS_ORDERED_PREALLOC);
1840			if (IS_ERR(em)) {
1841				ret = PTR_ERR(em);
1842				goto error;
1843			}
1844			free_extent_map(em);
1845			ret = btrfs_add_ordered_extent(inode, cur_offset,
1846						       disk_bytenr, num_bytes,
1847						       num_bytes,
1848						       BTRFS_ORDERED_PREALLOC);
1849			if (ret) {
1850				btrfs_drop_extent_cache(inode, cur_offset,
1851							cur_offset + num_bytes - 1,
1852							0);
1853				goto error;
1854			}
1855		} else {
1856			ret = btrfs_add_ordered_extent(inode, cur_offset,
1857						       disk_bytenr, num_bytes,
1858						       num_bytes,
1859						       BTRFS_ORDERED_NOCOW);
1860			if (ret)
1861				goto error;
1862		}
1863
1864		if (nocow)
1865			btrfs_dec_nocow_writers(fs_info, disk_bytenr);
1866		nocow = false;
1867
1868		if (root->root_key.objectid ==
1869		    BTRFS_DATA_RELOC_TREE_OBJECTID)
1870			/*
1871			 * Error handled later, as we must prevent
1872			 * extent_clear_unlock_delalloc() in error handler
1873			 * from freeing metadata of created ordered extent.
1874			 */
1875			ret = btrfs_reloc_clone_csums(inode, cur_offset,
1876						      num_bytes);
1877
1878		extent_clear_unlock_delalloc(inode, cur_offset,
1879					     cur_offset + num_bytes - 1,
1880					     locked_page, EXTENT_LOCKED |
1881					     EXTENT_DELALLOC |
1882					     EXTENT_CLEAR_DATA_RESV,
1883					     PAGE_UNLOCK | PAGE_SET_ORDERED);
1884
1885		cur_offset = extent_end;
1886
1887		/*
1888		 * btrfs_reloc_clone_csums() error, now we're OK to call error
1889		 * handler, as metadata for created ordered extent will only
1890		 * be freed by btrfs_finish_ordered_io().
1891		 */
1892		if (ret)
1893			goto error;
1894		if (cur_offset > end)
1895			break;
1896	}
1897	btrfs_release_path(path);
1898
1899	if (cur_offset <= end && cow_start == (u64)-1)
1900		cow_start = cur_offset;
1901
1902	if (cow_start != (u64)-1) {
1903		cur_offset = end;
1904		ret = fallback_to_cow(inode, locked_page, cow_start, end,
1905				      page_started, nr_written);
1906		if (ret)
1907			goto error;
1908	}
1909
1910error:
1911	if (nocow)
1912		btrfs_dec_nocow_writers(fs_info, disk_bytenr);
1913
1914	if (ret && cur_offset < end)
1915		extent_clear_unlock_delalloc(inode, cur_offset, end,
1916					     locked_page, EXTENT_LOCKED |
1917					     EXTENT_DELALLOC | EXTENT_DEFRAG |
1918					     EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
1919					     PAGE_START_WRITEBACK |
1920					     PAGE_END_WRITEBACK);
1921	btrfs_free_path(path);
1922	return ret;
1923}
1924
1925static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end)
1926{
1927	if (inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) {
1928		if (inode->defrag_bytes &&
1929		    test_range_bit(&inode->io_tree, start, end, EXTENT_DEFRAG,
1930				   0, NULL))
1931			return false;
1932		return true;
1933	}
1934	return false;
1935}
1936
1937/*
1938 * Function to process delayed allocation (create CoW) for ranges which are
1939 * being touched for the first time.
1940 */
1941int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page,
1942		u64 start, u64 end, int *page_started, unsigned long *nr_written,
1943		struct writeback_control *wbc)
1944{
1945	int ret;
1946	const bool zoned = btrfs_is_zoned(inode->root->fs_info);
1947
1948	if (should_nocow(inode, start, end)) {
1949		ASSERT(!zoned);
1950		ret = run_delalloc_nocow(inode, locked_page, start, end,
1951					 page_started, nr_written);
1952	} else if (!inode_can_compress(inode) ||
1953		   !inode_need_compress(inode, start, end)) {
1954		if (zoned)
1955			ret = run_delalloc_zoned(inode, locked_page, start, end,
1956						 page_started, nr_written);
1957		else
1958			ret = cow_file_range(inode, locked_page, start, end,
1959					     page_started, nr_written, 1);
1960	} else {
1961		set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags);
1962		ret = cow_file_range_async(inode, wbc, locked_page, start, end,
1963					   page_started, nr_written);
1964	}
1965	if (ret)
1966		btrfs_cleanup_ordered_extents(inode, locked_page, start,
1967					      end - start + 1);
1968	return ret;
1969}
1970
1971void btrfs_split_delalloc_extent(struct inode *inode,
1972				 struct extent_state *orig, u64 split)
1973{
1974	u64 size;
1975
1976	/* not delalloc, ignore it */
1977	if (!(orig->state & EXTENT_DELALLOC))
1978		return;
1979
1980	size = orig->end - orig->start + 1;
1981	if (size > BTRFS_MAX_EXTENT_SIZE) {
1982		u32 num_extents;
1983		u64 new_size;
1984
1985		/*
1986		 * See the explanation in btrfs_merge_delalloc_extent, the same
1987		 * applies here, just in reverse.
1988		 */
1989		new_size = orig->end - split + 1;
1990		num_extents = count_max_extents(new_size);
1991		new_size = split - orig->start;
1992		num_extents += count_max_extents(new_size);
1993		if (count_max_extents(size) >= num_extents)
1994			return;
1995	}
1996
1997	spin_lock(&BTRFS_I(inode)->lock);
1998	btrfs_mod_outstanding_extents(BTRFS_I(inode), 1);
1999	spin_unlock(&BTRFS_I(inode)->lock);
2000}
2001
2002/*
2003 * Handle merged delayed allocation extents so we can keep track of new extents
2004 * that are just merged onto old extents, such as when we are doing sequential
2005 * writes, so we can properly account for the metadata space we'll need.
2006 */
2007void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new,
2008				 struct extent_state *other)
2009{
2010	u64 new_size, old_size;
2011	u32 num_extents;
2012
2013	/* not delalloc, ignore it */
2014	if (!(other->state & EXTENT_DELALLOC))
2015		return;
2016
2017	if (new->start > other->start)
2018		new_size = new->end - other->start + 1;
2019	else
2020		new_size = other->end - new->start + 1;
2021
2022	/* we're not bigger than the max, unreserve the space and go */
2023	if (new_size <= BTRFS_MAX_EXTENT_SIZE) {
2024		spin_lock(&BTRFS_I(inode)->lock);
2025		btrfs_mod_outstanding_extents(BTRFS_I(inode), -1);
2026		spin_unlock(&BTRFS_I(inode)->lock);
2027		return;
2028	}
2029
2030	/*
2031	 * We have to add up either side to figure out how many extents were
2032	 * accounted for before we merged into one big extent.  If the number of
2033	 * extents we accounted for is <= the amount we need for the new range
2034	 * then we can return, otherwise drop.  Think of it like this
2035	 *
2036	 * [ 4k][MAX_SIZE]
2037	 *
2038	 * So we've grown the extent by a MAX_SIZE extent, this would mean we
2039	 * need 2 outstanding extents, on one side we have 1 and the other side
2040	 * we have 1 so they are == and we can return.  But in this case
2041	 *
2042	 * [MAX_SIZE+4k][MAX_SIZE+4k]
2043	 *
2044	 * Each range on their own accounts for 2 extents, but merged together
2045	 * they are only 3 extents worth of accounting, so we need to drop in
2046	 * this case.
2047	 */
2048	old_size = other->end - other->start + 1;
2049	num_extents = count_max_extents(old_size);
2050	old_size = new->end - new->start + 1;
2051	num_extents += count_max_extents(old_size);
2052	if (count_max_extents(new_size) >= num_extents)
2053		return;
2054
2055	spin_lock(&BTRFS_I(inode)->lock);
2056	btrfs_mod_outstanding_extents(BTRFS_I(inode), -1);
2057	spin_unlock(&BTRFS_I(inode)->lock);
2058}
2059
2060static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
2061				      struct inode *inode)
2062{
2063	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2064
2065	spin_lock(&root->delalloc_lock);
2066	if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
2067		list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
2068			      &root->delalloc_inodes);
2069		set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
2070			&BTRFS_I(inode)->runtime_flags);
2071		root->nr_delalloc_inodes++;
2072		if (root->nr_delalloc_inodes == 1) {
2073			spin_lock(&fs_info->delalloc_root_lock);
2074			BUG_ON(!list_empty(&root->delalloc_root));
2075			list_add_tail(&root->delalloc_root,
2076				      &fs_info->delalloc_roots);
2077			spin_unlock(&fs_info->delalloc_root_lock);
2078		}
2079	}
2080	spin_unlock(&root->delalloc_lock);
2081}
2082
2083
2084void __btrfs_del_delalloc_inode(struct btrfs_root *root,
2085				struct btrfs_inode *inode)
2086{
2087	struct btrfs_fs_info *fs_info = root->fs_info;
2088
2089	if (!list_empty(&inode->delalloc_inodes)) {
2090		list_del_init(&inode->delalloc_inodes);
2091		clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
2092			  &inode->runtime_flags);
2093		root->nr_delalloc_inodes--;
2094		if (!root->nr_delalloc_inodes) {
2095			ASSERT(list_empty(&root->delalloc_inodes));
2096			spin_lock(&fs_info->delalloc_root_lock);
2097			BUG_ON(list_empty(&root->delalloc_root));
2098			list_del_init(&root->delalloc_root);
2099			spin_unlock(&fs_info->delalloc_root_lock);
2100		}
2101	}
2102}
2103
2104static void btrfs_del_delalloc_inode(struct btrfs_root *root,
2105				     struct btrfs_inode *inode)
2106{
2107	spin_lock(&root->delalloc_lock);
2108	__btrfs_del_delalloc_inode(root, inode);
2109	spin_unlock(&root->delalloc_lock);
2110}
2111
2112/*
2113 * Properly track delayed allocation bytes in the inode and to maintain the
2114 * list of inodes that have pending delalloc work to be done.
2115 */
2116void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
2117			       unsigned *bits)
2118{
2119	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2120
2121	if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
2122		WARN_ON(1);
2123	/*
2124	 * set_bit and clear bit hooks normally require _irqsave/restore
2125	 * but in this case, we are only testing for the DELALLOC
2126	 * bit, which is only set or cleared with irqs on
2127	 */
2128	if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
2129		struct btrfs_root *root = BTRFS_I(inode)->root;
2130		u64 len = state->end + 1 - state->start;
2131		u32 num_extents = count_max_extents(len);
2132		bool do_list = !btrfs_is_free_space_inode(BTRFS_I(inode));
2133
2134		spin_lock(&BTRFS_I(inode)->lock);
2135		btrfs_mod_outstanding_extents(BTRFS_I(inode), num_extents);
2136		spin_unlock(&BTRFS_I(inode)->lock);
2137
2138		/* For sanity tests */
2139		if (btrfs_is_testing(fs_info))
2140			return;
2141
2142		percpu_counter_add_batch(&fs_info->delalloc_bytes, len,
2143					 fs_info->delalloc_batch);
2144		spin_lock(&BTRFS_I(inode)->lock);
2145		BTRFS_I(inode)->delalloc_bytes += len;
2146		if (*bits & EXTENT_DEFRAG)
2147			BTRFS_I(inode)->defrag_bytes += len;
2148		if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
2149					 &BTRFS_I(inode)->runtime_flags))
2150			btrfs_add_delalloc_inodes(root, inode);
2151		spin_unlock(&BTRFS_I(inode)->lock);
2152	}
2153
2154	if (!(state->state & EXTENT_DELALLOC_NEW) &&
2155	    (*bits & EXTENT_DELALLOC_NEW)) {
2156		spin_lock(&BTRFS_I(inode)->lock);
2157		BTRFS_I(inode)->new_delalloc_bytes += state->end + 1 -
2158			state->start;
2159		spin_unlock(&BTRFS_I(inode)->lock);
2160	}
2161}
2162
2163/*
2164 * Once a range is no longer delalloc this function ensures that proper
2165 * accounting happens.
2166 */
2167void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
2168				 struct extent_state *state, unsigned *bits)
2169{
2170	struct btrfs_inode *inode = BTRFS_I(vfs_inode);
2171	struct btrfs_fs_info *fs_info = btrfs_sb(vfs_inode->i_sb);
2172	u64 len = state->end + 1 - state->start;
2173	u32 num_extents = count_max_extents(len);
2174
2175	if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG)) {
2176		spin_lock(&inode->lock);
2177		inode->defrag_bytes -= len;
2178		spin_unlock(&inode->lock);
2179	}
2180
2181	/*
2182	 * set_bit and clear bit hooks normally require _irqsave/restore
2183	 * but in this case, we are only testing for the DELALLOC
2184	 * bit, which is only set or cleared with irqs on
2185	 */
2186	if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
2187		struct btrfs_root *root = inode->root;
2188		bool do_list = !btrfs_is_free_space_inode(inode);
2189
2190		spin_lock(&inode->lock);
2191		btrfs_mod_outstanding_extents(inode, -num_extents);
2192		spin_unlock(&inode->lock);
2193
2194		/*
2195		 * We don't reserve metadata space for space cache inodes so we
2196		 * don't need to call delalloc_release_metadata if there is an
2197		 * error.
2198		 */
2199		if (*bits & EXTENT_CLEAR_META_RESV &&
2200		    root != fs_info->tree_root)
2201			btrfs_delalloc_release_metadata(inode, len, false);
2202
2203		/* For sanity tests. */
2204		if (btrfs_is_testing(fs_info))
2205			return;
2206
2207		if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID &&
2208		    do_list && !(state->state & EXTENT_NORESERVE) &&
2209		    (*bits & EXTENT_CLEAR_DATA_RESV))
2210			btrfs_free_reserved_data_space_noquota(fs_info, len);
2211
2212		percpu_counter_add_batch(&fs_info->delalloc_bytes, -len,
2213					 fs_info->delalloc_batch);
2214		spin_lock(&inode->lock);
2215		inode->delalloc_bytes -= len;
2216		if (do_list && inode->delalloc_bytes == 0 &&
2217		    test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
2218					&inode->runtime_flags))
2219			btrfs_del_delalloc_inode(root, inode);
2220		spin_unlock(&inode->lock);
2221	}
2222
2223	if ((state->state & EXTENT_DELALLOC_NEW) &&
2224	    (*bits & EXTENT_DELALLOC_NEW)) {
2225		spin_lock(&inode->lock);
2226		ASSERT(inode->new_delalloc_bytes >= len);
2227		inode->new_delalloc_bytes -= len;
2228		if (*bits & EXTENT_ADD_INODE_BYTES)
2229			inode_add_bytes(&inode->vfs_inode, len);
2230		spin_unlock(&inode->lock);
2231	}
2232}
2233
2234/*
2235 * btrfs_bio_fits_in_stripe - Checks whether the size of the given bio will fit
2236 * in a chunk's stripe. This function ensures that bios do not span a
2237 * stripe/chunk
2238 *
2239 * @page - The page we are about to add to the bio
2240 * @size - size we want to add to the bio
2241 * @bio - bio we want to ensure is smaller than a stripe
2242 * @bio_flags - flags of the bio
2243 *
2244 * return 1 if page cannot be added to the bio
2245 * return 0 if page can be added to the bio
2246 * return error otherwise
2247 */
2248int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio,
2249			     unsigned long bio_flags)
2250{
2251	struct inode *inode = page->mapping->host;
2252	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2253	u64 logical = bio->bi_iter.bi_sector << 9;
2254	u32 bio_len = bio->bi_iter.bi_size;
2255	struct extent_map *em;
2256	int ret = 0;
2257	struct btrfs_io_geometry geom;
2258
2259	if (bio_flags & EXTENT_BIO_COMPRESSED)
2260		return 0;
2261
2262	em = btrfs_get_chunk_map(fs_info, logical, fs_info->sectorsize);
2263	if (IS_ERR(em))
2264		return PTR_ERR(em);
2265	ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), logical, &geom);
2266	if (ret < 0)
2267		goto out;
2268
2269	if (geom.len < bio_len + size)
2270		ret = 1;
2271out:
2272	free_extent_map(em);
2273	return ret;
2274}
2275
2276/*
2277 * in order to insert checksums into the metadata in large chunks,
2278 * we wait until bio submission time.   All the pages in the bio are
2279 * checksummed and sums are attached onto the ordered extent record.
2280 *
2281 * At IO completion time the cums attached on the ordered extent record
2282 * are inserted into the btree
2283 */
2284static blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio,
2285					   u64 dio_file_offset)
2286{
2287	return btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0);
2288}
2289
2290/*
2291 * Split an extent_map at [start, start + len]
2292 *
2293 * This function is intended to be used only for extract_ordered_extent().
2294 */
2295static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,
2296			  u64 pre, u64 post)
2297{
2298	struct extent_map_tree *em_tree = &inode->extent_tree;
2299	struct extent_map *em;
2300	struct extent_map *split_pre = NULL;
2301	struct extent_map *split_mid = NULL;
2302	struct extent_map *split_post = NULL;
2303	int ret = 0;
2304	int modified;
2305	unsigned long flags;
2306
2307	/* Sanity check */
2308	if (pre == 0 && post == 0)
2309		return 0;
2310
2311	split_pre = alloc_extent_map();
2312	if (pre)
2313		split_mid = alloc_extent_map();
2314	if (post)
2315		split_post = alloc_extent_map();
2316	if (!split_pre || (pre && !split_mid) || (post && !split_post)) {
2317		ret = -ENOMEM;
2318		goto out;
2319	}
2320
2321	ASSERT(pre + post < len);
2322
2323	lock_extent(&inode->io_tree, start, start + len - 1);
2324	write_lock(&em_tree->lock);
2325	em = lookup_extent_mapping(em_tree, start, len);
2326	if (!em) {
2327		ret = -EIO;
2328		goto out_unlock;
2329	}
2330
2331	ASSERT(em->len == len);
2332	ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags));
2333	ASSERT(em->block_start < EXTENT_MAP_LAST_BYTE);
2334
2335	flags = em->flags;
2336	clear_bit(EXTENT_FLAG_PINNED, &em->flags);
2337	clear_bit(EXTENT_FLAG_LOGGING, &flags);
2338	modified = !list_empty(&em->list);
2339
2340	/* First, replace the em with a new extent_map starting from * em->start */
2341	split_pre->start = em->start;
2342	split_pre->len = (pre ? pre : em->len - post);
2343	split_pre->orig_start = split_pre->start;
2344	split_pre->block_start = em->block_start;
2345	split_pre->block_len = split_pre->len;
2346	split_pre->orig_block_len = split_pre->block_len;
2347	split_pre->ram_bytes = split_pre->len;
2348	split_pre->flags = flags;
2349	split_pre->compress_type = em->compress_type;
2350	split_pre->generation = em->generation;
2351
2352	replace_extent_mapping(em_tree, em, split_pre, modified);
2353
2354	/*
2355	 * Now we only have an extent_map at:
2356	 *     [em->start, em->start + pre] if pre != 0
2357	 *     [em->start, em->start + em->len - post] if pre == 0
2358	 */
2359
2360	if (pre) {
2361		/* Insert the middle extent_map */
2362		split_mid->start = em->start + pre;
2363		split_mid->len = em->len - pre - post;
2364		split_mid->orig_start = split_mid->start;
2365		split_mid->block_start = em->block_start + pre;
2366		split_mid->block_len = split_mid->len;
2367		split_mid->orig_block_len = split_mid->block_len;
2368		split_mid->ram_bytes = split_mid->len;
2369		split_mid->flags = flags;
2370		split_mid->compress_type = em->compress_type;
2371		split_mid->generation = em->generation;
2372		add_extent_mapping(em_tree, split_mid, modified);
2373	}
2374
2375	if (post) {
2376		split_post->start = em->start + em->len - post;
2377		split_post->len = post;
2378		split_post->orig_start = split_post->start;
2379		split_post->block_start = em->block_start + em->len - post;
2380		split_post->block_len = split_post->len;
2381		split_post->orig_block_len = split_post->block_len;
2382		split_post->ram_bytes = split_post->len;
2383		split_post->flags = flags;
2384		split_post->compress_type = em->compress_type;
2385		split_post->generation = em->generation;
2386		add_extent_mapping(em_tree, split_post, modified);
2387	}
2388
2389	/* Once for us */
2390	free_extent_map(em);
2391	/* Once for the tree */
2392	free_extent_map(em);
2393
2394out_unlock:
2395	write_unlock(&em_tree->lock);
2396	unlock_extent(&inode->io_tree, start, start + len - 1);
2397out:
2398	free_extent_map(split_pre);
2399	free_extent_map(split_mid);
2400	free_extent_map(split_post);
2401
2402	return ret;
2403}
2404
2405static blk_status_t extract_ordered_extent(struct btrfs_inode *inode,
2406					   struct bio *bio, loff_t file_offset)
2407{
2408	struct btrfs_ordered_extent *ordered;
2409	u64 start = (u64)bio->bi_iter.bi_sector << SECTOR_SHIFT;
2410	u64 file_len;
2411	u64 len = bio->bi_iter.bi_size;
2412	u64 end = start + len;
2413	u64 ordered_end;
2414	u64 pre, post;
2415	int ret = 0;
2416
2417	ordered = btrfs_lookup_ordered_extent(inode, file_offset);
2418	if (WARN_ON_ONCE(!ordered))
2419		return BLK_STS_IOERR;
2420
2421	/* No need to split */
2422	if (ordered->disk_num_bytes == len)
2423		goto out;
2424
2425	/* We cannot split once end_bio'd ordered extent */
2426	if (WARN_ON_ONCE(ordered->bytes_left != ordered->disk_num_bytes)) {
2427		ret = -EINVAL;
2428		goto out;
2429	}
2430
2431	/* We cannot split a compressed ordered extent */
2432	if (WARN_ON_ONCE(ordered->disk_num_bytes != ordered->num_bytes)) {
2433		ret = -EINVAL;
2434		goto out;
2435	}
2436
2437	ordered_end = ordered->disk_bytenr + ordered->disk_num_bytes;
2438	/* bio must be in one ordered extent */
2439	if (WARN_ON_ONCE(start < ordered->disk_bytenr || end > ordered_end)) {
2440		ret = -EINVAL;
2441		goto out;
2442	}
2443
2444	/* Checksum list should be empty */
2445	if (WARN_ON_ONCE(!list_empty(&ordered->list))) {
2446		ret = -EINVAL;
2447		goto out;
2448	}
2449
2450	file_len = ordered->num_bytes;
2451	pre = start - ordered->disk_bytenr;
2452	post = ordered_end - end;
2453
2454	ret = btrfs_split_ordered_extent(ordered, pre, post);
2455	if (ret)
2456		goto out;
2457	ret = split_zoned_em(inode, file_offset, file_len, pre, post);
2458
2459out:
2460	btrfs_put_ordered_extent(ordered);
2461
2462	return errno_to_blk_status(ret);
2463}
2464
2465/*
2466 * extent_io.c submission hook. This does the right thing for csum calculation
2467 * on write, or reading the csums from the tree before a read.
2468 *
2469 * Rules about async/sync submit,
2470 * a) read:				sync submit
2471 *
2472 * b) write without checksum:		sync submit
2473 *
2474 * c) write with checksum:
2475 *    c-1) if bio is issued by fsync:	sync submit
2476 *         (sync_writers != 0)
2477 *
2478 *    c-2) if root is reloc root:	sync submit
2479 *         (only in case of buffered IO)
2480 *
2481 *    c-3) otherwise:			async submit
2482 */
2483blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
2484				   int mirror_num, unsigned long bio_flags)
2485
2486{
2487	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2488	struct btrfs_root *root = BTRFS_I(inode)->root;
2489	enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA;
2490	blk_status_t ret = 0;
2491	int skip_sum;
2492	int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
2493
2494	skip_sum = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) ||
2495		   !fs_info->csum_root;
2496
2497	if (btrfs_is_free_space_inode(BTRFS_I(inode)))
2498		metadata = BTRFS_WQ_ENDIO_FREE_SPACE;
2499
2500	if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
2501		struct page *page = bio_first_bvec_all(bio)->bv_page;
2502		loff_t file_offset = page_offset(page);
2503
2504		ret = extract_ordered_extent(BTRFS_I(inode), bio, file_offset);
2505		if (ret)
2506			goto out;
2507	}
2508
2509	if (btrfs_op(bio) != BTRFS_MAP_WRITE) {
2510		ret = btrfs_bio_wq_end_io(fs_info, bio, metadata);
2511		if (ret)
2512			goto out;
2513
2514		if (bio_flags & EXTENT_BIO_COMPRESSED) {
2515			ret = btrfs_submit_compressed_read(inode, bio,
2516							   mirror_num,
2517							   bio_flags);
2518			goto out;
2519		} else {
2520			/*
2521			 * Lookup bio sums does extra checks around whether we
2522			 * need to csum or not, which is why we ignore skip_sum
2523			 * here.
2524			 */
2525			ret = btrfs_lookup_bio_sums(inode, bio, NULL);
2526			if (ret)
2527				goto out;
2528		}
2529		goto mapit;
2530	} else if (async && !skip_sum) {
2531		/* csum items have already been cloned */
2532		if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
2533			goto mapit;
2534		/* we're doing a write, do the async checksumming */
2535		ret = btrfs_wq_submit_bio(inode, bio, mirror_num, bio_flags,
2536					  0, btrfs_submit_bio_start);
2537		goto out;
2538	} else if (!skip_sum) {
2539		ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0);
2540		if (ret)
2541			goto out;
2542	}
2543
2544mapit:
2545	ret = btrfs_map_bio(fs_info, bio, mirror_num);
2546
2547out:
2548	if (ret) {
2549		bio->bi_status = ret;
2550		bio_endio(bio);
2551	}
2552	return ret;
2553}
2554
2555/*
2556 * given a list of ordered sums record them in the inode.  This happens
2557 * at IO completion time based on sums calculated at bio submission time.
2558 */
2559static int add_pending_csums(struct btrfs_trans_handle *trans,
2560			     struct list_head *list)
2561{
2562	struct btrfs_ordered_sum *sum;
2563	int ret;
2564
2565	list_for_each_entry(sum, list, list) {
2566		trans->adding_csums = true;
2567		ret = btrfs_csum_file_blocks(trans, trans->fs_info->csum_root, sum);
2568		trans->adding_csums = false;
2569		if (ret)
2570			return ret;
2571	}
2572	return 0;
2573}
2574
2575static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
2576					 const u64 start,
2577					 const u64 len,
2578					 struct extent_state **cached_state)
2579{
2580	u64 search_start = start;
2581	const u64 end = start + len - 1;
2582
2583	while (search_start < end) {
2584		const u64 search_len = end - search_start + 1;
2585		struct extent_map *em;
2586		u64 em_len;
2587		int ret = 0;
2588
2589		em = btrfs_get_extent(inode, NULL, 0, search_start, search_len);
2590		if (IS_ERR(em))
2591			return PTR_ERR(em);
2592
2593		if (em->block_start != EXTENT_MAP_HOLE)
2594			goto next;
2595
2596		em_len = em->len;
2597		if (em->start < search_start)
2598			em_len -= search_start - em->start;
2599		if (em_len > search_len)
2600			em_len = search_len;
2601
2602		ret = set_extent_bit(&inode->io_tree, search_start,
2603				     search_start + em_len - 1,
2604				     EXTENT_DELALLOC_NEW, 0, NULL, cached_state,
2605				     GFP_NOFS, NULL);
2606next:
2607		search_start = extent_map_end(em);
2608		free_extent_map(em);
2609		if (ret)
2610			return ret;
2611	}
2612	return 0;
2613}
2614
2615int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
2616			      unsigned int extra_bits,
2617			      struct extent_state **cached_state)
2618{
2619	WARN_ON(PAGE_ALIGNED(end));
2620
2621	if (start >= i_size_read(&inode->vfs_inode) &&
2622	    !(inode->flags & BTRFS_INODE_PREALLOC)) {
2623		/*
2624		 * There can't be any extents following eof in this case so just
2625		 * set the delalloc new bit for the range directly.
2626		 */
2627		extra_bits |= EXTENT_DELALLOC_NEW;
2628	} else {
2629		int ret;
2630
2631		ret = btrfs_find_new_delalloc_bytes(inode, start,
2632						    end + 1 - start,
2633						    cached_state);
2634		if (ret)
2635			return ret;
2636	}
2637
2638	return set_extent_delalloc(&inode->io_tree, start, end, extra_bits,
2639				   cached_state);
2640}
2641
2642/* see btrfs_writepage_start_hook for details on why this is required */
2643struct btrfs_writepage_fixup {
2644	struct page *page;
2645	struct inode *inode;
2646	struct btrfs_work work;
2647};
2648
2649static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
2650{
2651	struct btrfs_writepage_fixup *fixup;
2652	struct btrfs_ordered_extent *ordered;
2653	struct extent_state *cached_state = NULL;
2654	struct extent_changeset *data_reserved = NULL;
2655	struct page *page;
2656	struct btrfs_inode *inode;
2657	u64 page_start;
2658	u64 page_end;
2659	int ret = 0;
2660	bool free_delalloc_space = true;
2661
2662	fixup = container_of(work, struct btrfs_writepage_fixup, work);
2663	page = fixup->page;
2664	inode = BTRFS_I(fixup->inode);
2665	page_start = page_offset(page);
2666	page_end = page_offset(page) + PAGE_SIZE - 1;
2667
2668	/*
2669	 * This is similar to page_mkwrite, we need to reserve the space before
2670	 * we take the page lock.
2671	 */
2672	ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
2673					   PAGE_SIZE);
2674again:
2675	lock_page(page);
2676
2677	/*
2678	 * Before we queued this fixup, we took a reference on the page.
2679	 * page->mapping may go NULL, but it shouldn't be moved to a different
2680	 * address space.
2681	 */
2682	if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
2683		/*
2684		 * Unfortunately this is a little tricky, either
2685		 *
2686		 * 1) We got here and our page had already been dealt with and
2687		 *    we reserved our space, thus ret == 0, so we need to just
2688		 *    drop our space reservation and bail.  This can happen the
2689		 *    first time we come into the fixup worker, or could happen
2690		 *    while waiting for the ordered extent.
2691		 * 2) Our page was already dealt with, but we happened to get an
2692		 *    ENOSPC above from the btrfs_delalloc_reserve_space.  In
2693		 *    this case we obviously don't have anything to release, but
2694		 *    because the page was already dealt with we don't want to
2695		 *    mark the page with an error, so make sure we're resetting
2696		 *    ret to 0.  This is why we have this check _before_ the ret
2697		 *    check, because we do not want to have a surprise ENOSPC
2698		 *    when the page was already properly dealt with.
2699		 */
2700		if (!ret) {
2701			btrfs_delalloc_release_extents(inode, PAGE_SIZE);
2702			btrfs_delalloc_release_space(inode, data_reserved,
2703						     page_start, PAGE_SIZE,
2704						     true);
2705		}
2706		ret = 0;
2707		goto out_page;
2708	}
2709
2710	/*
2711	 * We can't mess with the page state unless it is locked, so now that
2712	 * it is locked bail if we failed to make our space reservation.
2713	 */
2714	if (ret)
2715		goto out_page;
2716
2717	lock_extent_bits(&inode->io_tree, page_start, page_end, &cached_state);
2718
2719	/* already ordered? We're done */
2720	if (PageOrdered(page))
2721		goto out_reserved;
2722
2723	ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
2724	if (ordered) {
2725		unlock_extent_cached(&inode->io_tree, page_start, page_end,
2726				     &cached_state);
2727		unlock_page(page);
2728		btrfs_start_ordered_extent(ordered, 1);
2729		btrfs_put_ordered_extent(ordered);
2730		goto again;
2731	}
2732
2733	ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0,
2734					&cached_state);
2735	if (ret)
2736		goto out_reserved;
2737
2738	/*
2739	 * Everything went as planned, we're now the owner of a dirty page with
2740	 * delayed allocation bits set and space reserved for our COW
2741	 * destination.
2742	 *
2743	 * The page was dirty when we started, nothing should have cleaned it.
2744	 */
2745	BUG_ON(!PageDirty(page));
2746	free_delalloc_space = false;
2747out_reserved:
2748	btrfs_delalloc_release_extents(inode, PAGE_SIZE);
2749	if (free_delalloc_space)
2750		btrfs_delalloc_release_space(inode, data_reserved, page_start,
2751					     PAGE_SIZE, true);
2752	unlock_extent_cached(&inode->io_tree, page_start, page_end,
2753			     &cached_state);
2754out_page:
2755	if (ret) {
2756		/*
2757		 * We hit ENOSPC or other errors.  Update the mapping and page
2758		 * to reflect the errors and clean the page.
2759		 */
2760		mapping_set_error(page->mapping, ret);
2761		end_extent_writepage(page, ret, page_start, page_end);
2762		clear_page_dirty_for_io(page);
2763		SetPageError(page);
2764	}
2765	ClearPageChecked(page);
2766	unlock_page(page);
2767	put_page(page);
2768	kfree(fixup);
2769	extent_changeset_free(data_reserved);
2770	/*
2771	 * As a precaution, do a delayed iput in case it would be the last iput
2772	 * that could need flushing space. Recursing back to fixup worker would
2773	 * deadlock.
2774	 */
2775	btrfs_add_delayed_iput(&inode->vfs_inode);
2776}
2777
2778/*
2779 * There are a few paths in the higher layers of the kernel that directly
2780 * set the page dirty bit without asking the filesystem if it is a
2781 * good idea.  This causes problems because we want to make sure COW
2782 * properly happens and the data=ordered rules are followed.
2783 *
2784 * In our case any range that doesn't have the ORDERED bit set
2785 * hasn't been properly setup for IO.  We kick off an async process
2786 * to fix it up.  The async helper will wait for ordered extents, set
2787 * the delalloc bit and make it safe to write the page.
2788 */
2789int btrfs_writepage_cow_fixup(struct page *page)
2790{
2791	struct inode *inode = page->mapping->host;
2792	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2793	struct btrfs_writepage_fixup *fixup;
2794
2795	/* This page has ordered extent covering it already */
2796	if (PageOrdered(page))
2797		return 0;
2798
2799	/*
2800	 * PageChecked is set below when we create a fixup worker for this page,
2801	 * don't try to create another one if we're already PageChecked()
2802	 *
2803	 * The extent_io writepage code will redirty the page if we send back
2804	 * EAGAIN.
2805	 */
2806	if (PageChecked(page))
2807		return -EAGAIN;
2808
2809	fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
2810	if (!fixup)
2811		return -EAGAIN;
2812
2813	/*
2814	 * We are already holding a reference to this inode from
2815	 * write_cache_pages.  We need to hold it because the space reservation
2816	 * takes place outside of the page lock, and we can't trust
2817	 * page->mapping outside of the page lock.
2818	 */
2819	ihold(inode);
2820	SetPageChecked(page);
2821	get_page(page);
2822	btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL);
2823	fixup->page = page;
2824	fixup->inode = inode;
2825	btrfs_queue_work(fs_info->fixup_workers, &fixup->work);
2826
2827	return -EAGAIN;
2828}
2829
2830static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
2831				       struct btrfs_inode *inode, u64 file_pos,
2832				       struct btrfs_file_extent_item *stack_fi,
2833				       const bool update_inode_bytes,
2834				       u64 qgroup_reserved)
2835{
2836	struct btrfs_root *root = inode->root;
2837	const u64 sectorsize = root->fs_info->sectorsize;
2838	struct btrfs_path *path;
2839	struct extent_buffer *leaf;
2840	struct btrfs_key ins;
2841	u64 disk_num_bytes = btrfs_stack_file_extent_disk_num_bytes(stack_fi);
2842	u64 disk_bytenr = btrfs_stack_file_extent_disk_bytenr(stack_fi);
2843	u64 num_bytes = btrfs_stack_file_extent_num_bytes(stack_fi);
2844	u64 ram_bytes = btrfs_stack_file_extent_ram_bytes(stack_fi);
2845	struct btrfs_drop_extents_args drop_args = { 0 };
2846	int ret;
2847
2848	path = btrfs_alloc_path();
2849	if (!path)
2850		return -ENOMEM;
2851
2852	/*
2853	 * we may be replacing one extent in the tree with another.
2854	 * The new extent is pinned in the extent map, and we don't want
2855	 * to drop it from the cache until it is completely in the btree.
2856	 *
2857	 * So, tell btrfs_drop_extents to leave this extent in the cache.
2858	 * the caller is expected to unpin it and allow it to be merged
2859	 * with the others.
2860	 */
2861	drop_args.path = path;
2862	drop_args.start = file_pos;
2863	drop_args.end = file_pos + num_bytes;
2864	drop_args.replace_extent = true;
2865	drop_args.extent_item_size = sizeof(*stack_fi);
2866	ret = btrfs_drop_extents(trans, root, inode, &drop_args);
2867	if (ret)
2868		goto out;
2869
2870	if (!drop_args.extent_inserted) {
2871		ins.objectid = btrfs_ino(inode);
2872		ins.offset = file_pos;
2873		ins.type = BTRFS_EXTENT_DATA_KEY;
2874
2875		ret = btrfs_insert_empty_item(trans, root, path, &ins,
2876					      sizeof(*stack_fi));
2877		if (ret)
2878			goto out;
2879	}
2880	leaf = path->nodes[0];
2881	btrfs_set_stack_file_extent_generation(stack_fi, trans->transid);
2882	write_extent_buffer(leaf, stack_fi,
2883			btrfs_item_ptr_offset(leaf, path->slots[0]),
2884			sizeof(struct btrfs_file_extent_item));
2885
2886	btrfs_mark_buffer_dirty(leaf);
2887	btrfs_release_path(path);
2888
2889	/*
2890	 * If we dropped an inline extent here, we know the range where it is
2891	 * was not marked with the EXTENT_DELALLOC_NEW bit, so we update the
2892	 * number of bytes only for that range containing the inline extent.
2893	 * The remaining of the range will be processed when clearning the
2894	 * EXTENT_DELALLOC_BIT bit through the ordered extent completion.
2895	 */
2896	if (file_pos == 0 && !IS_ALIGNED(drop_args.bytes_found, sectorsize)) {
2897		u64 inline_size = round_down(drop_args.bytes_found, sectorsize);
2898
2899		inline_size = drop_args.bytes_found - inline_size;
2900		btrfs_update_inode_bytes(inode, sectorsize, inline_size);
2901		drop_args.bytes_found -= inline_size;
2902		num_bytes -= sectorsize;
2903	}
2904
2905	if (update_inode_bytes)
2906		btrfs_update_inode_bytes(inode, num_bytes, drop_args.bytes_found);
2907
2908	ins.objectid = disk_bytenr;
2909	ins.offset = disk_num_bytes;
2910	ins.type = BTRFS_EXTENT_ITEM_KEY;
2911
2912	ret = btrfs_inode_set_file_extent_range(inode, file_pos, ram_bytes);
2913	if (ret)
2914		goto out;
2915
2916	ret = btrfs_alloc_reserved_file_extent(trans, root, btrfs_ino(inode),
2917					       file_pos, qgroup_reserved, &ins);
2918out:
2919	btrfs_free_path(path);
2920
2921	return ret;
2922}
2923
2924static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info,
2925					 u64 start, u64 len)
2926{
2927	struct btrfs_block_group *cache;
2928
2929	cache = btrfs_lookup_block_group(fs_info, start);
2930	ASSERT(cache);
2931
2932	spin_lock(&cache->lock);
2933	cache->delalloc_bytes -= len;
2934	spin_unlock(&cache->lock);
2935
2936	btrfs_put_block_group(cache);
2937}
2938
2939static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
2940					     struct btrfs_ordered_extent *oe)
2941{
2942	struct btrfs_file_extent_item stack_fi;
2943	u64 logical_len;
2944	bool update_inode_bytes;
2945
2946	memset(&stack_fi, 0, sizeof(stack_fi));
2947	btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_REG);
2948	btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, oe->disk_bytenr);
2949	btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi,
2950						   oe->disk_num_bytes);
2951	if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags))
2952		logical_len = oe->truncated_len;
2953	else
2954		logical_len = oe->num_bytes;
2955	btrfs_set_stack_file_extent_num_bytes(&stack_fi, logical_len);
2956	btrfs_set_stack_file_extent_ram_bytes(&stack_fi, logical_len);
2957	btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type);
2958	/* Encryption and other encoding is reserved and all 0 */
2959
2960	/*
2961	 * For delalloc, when completing an ordered extent we update the inode's
2962	 * bytes when clearing the range in the inode's io tree, so pass false
2963	 * as the argument 'update_inode_bytes' to insert_reserved_file_extent(),
2964	 * except if the ordered extent was truncated.
2965	 */
2966	update_inode_bytes = test_bit(BTRFS_ORDERED_DIRECT, &oe->flags) ||
2967			     test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags);
2968
2969	return insert_reserved_file_extent(trans, BTRFS_I(oe->inode),
2970					   oe->file_offset, &stack_fi,
2971					   update_inode_bytes, oe->qgroup_rsv);
2972}
2973
2974/*
2975 * As ordered data IO finishes, this gets called so we can finish
2976 * an ordered extent if the range of bytes in the file it covers are
2977 * fully written.
2978 */
2979static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
2980{
2981	struct btrfs_inode *inode = BTRFS_I(ordered_extent->inode);
2982	struct btrfs_root *root = inode->root;
2983	struct btrfs_fs_info *fs_info = root->fs_info;
2984	struct btrfs_trans_handle *trans = NULL;
2985	struct extent_io_tree *io_tree = &inode->io_tree;
2986	struct extent_state *cached_state = NULL;
2987	u64 start, end;
2988	int compress_type = 0;
2989	int ret = 0;
2990	u64 logical_len = ordered_extent->num_bytes;
2991	bool freespace_inode;
2992	bool truncated = false;
2993	bool clear_reserved_extent = true;
2994	unsigned int clear_bits = EXTENT_DEFRAG;
2995
2996	start = ordered_extent->file_offset;
2997	end = start + ordered_extent->num_bytes - 1;
2998
2999	if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
3000	    !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) &&
3001	    !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags))
3002		clear_bits |= EXTENT_DELALLOC_NEW;
3003
3004	freespace_inode = btrfs_is_free_space_inode(inode);
3005
3006	if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
3007		ret = -EIO;
3008		goto out;
3009	}
3010
3011	if (ordered_extent->bdev)
3012		btrfs_rewrite_logical_zoned(ordered_extent);
3013
3014	btrfs_free_io_failure_record(inode, start, end);
3015
3016	if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
3017		truncated = true;
3018		logical_len = ordered_extent->truncated_len;
3019		/* Truncated the entire extent, don't bother adding */
3020		if (!logical_len)
3021			goto out;
3022	}
3023
3024	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
3025		BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
3026
3027		btrfs_inode_safe_disk_i_size_write(inode, 0);
3028		if (freespace_inode)
3029			trans = btrfs_join_transaction_spacecache(root);
3030		else
3031			trans = btrfs_join_transaction(root);
3032		if (IS_ERR(trans)) {
3033			ret = PTR_ERR(trans);
3034			trans = NULL;
3035			goto out;
3036		}
3037		trans->block_rsv = &inode->block_rsv;
3038		ret = btrfs_update_inode_fallback(trans, root, inode);
3039		if (ret) /* -ENOMEM or corruption */
3040			btrfs_abort_transaction(trans, ret);
3041		goto out;
3042	}
3043
3044	clear_bits |= EXTENT_LOCKED;
3045	lock_extent_bits(io_tree, start, end, &cached_state);
3046
3047	if (freespace_inode)
3048		trans = btrfs_join_transaction_spacecache(root);
3049	else
3050		trans = btrfs_join_transaction(root);
3051	if (IS_ERR(trans)) {
3052		ret = PTR_ERR(trans);
3053		trans = NULL;
3054		goto out;
3055	}
3056
3057	trans->block_rsv = &inode->block_rsv;
3058
3059	if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
3060		compress_type = ordered_extent->compress_type;
3061	if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
3062		BUG_ON(compress_type);
3063		ret = btrfs_mark_extent_written(trans, inode,
3064						ordered_extent->file_offset,
3065						ordered_extent->file_offset +
3066						logical_len);
3067	} else {
3068		BUG_ON(root == fs_info->tree_root);
3069		ret = insert_ordered_extent_file_extent(trans, ordered_extent);
3070		if (!ret) {
3071			clear_reserved_extent = false;
3072			btrfs_release_delalloc_bytes(fs_info,
3073						ordered_extent->disk_bytenr,
3074						ordered_extent->disk_num_bytes);
3075		}
3076	}
3077	unpin_extent_cache(&inode->extent_tree, ordered_extent->file_offset,
3078			   ordered_extent->num_bytes, trans->transid);
3079	if (ret < 0) {
3080		btrfs_abort_transaction(trans, ret);
3081		goto out;
3082	}
3083
3084	ret = add_pending_csums(trans, &ordered_extent->list);
3085	if (ret) {
3086		btrfs_abort_transaction(trans, ret);
3087		goto out;
3088	}
3089
3090	/*
3091	 * If this is a new delalloc range, clear its new delalloc flag to
3092	 * update the inode's number of bytes. This needs to be done first
3093	 * before updating the inode item.
3094	 */
3095	if ((clear_bits & EXTENT_DELALLOC_NEW) &&
3096	    !test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags))
3097		clear_extent_bit(&inode->io_tree, start, end,
3098				 EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES,
3099				 0, 0, &cached_state);
3100
3101	btrfs_inode_safe_disk_i_size_write(inode, 0);
3102	ret = btrfs_update_inode_fallback(trans, root, inode);
3103	if (ret) { /* -ENOMEM or corruption */
3104		btrfs_abort_transaction(trans, ret);
3105		goto out;
3106	}
3107	ret = 0;
3108out:
3109	clear_extent_bit(&inode->io_tree, start, end, clear_bits,
3110			 (clear_bits & EXTENT_LOCKED) ? 1 : 0, 0,
3111			 &cached_state);
3112
3113	if (trans)
3114		btrfs_end_transaction(trans);
3115
3116	if (ret || truncated) {
3117		u64 unwritten_start = start;
3118
3119		/*
3120		 * If we failed to finish this ordered extent for any reason we
3121		 * need to make sure BTRFS_ORDERED_IOERR is set on the ordered
3122		 * extent, and mark the inode with the error if it wasn't
3123		 * already set.  Any error during writeback would have already
3124		 * set the mapping error, so we need to set it if we're the ones
3125		 * marking this ordered extent as failed.
3126		 */
3127		if (ret && !test_and_set_bit(BTRFS_ORDERED_IOERR,
3128					     &ordered_extent->flags))
3129			mapping_set_error(ordered_extent->inode->i_mapping, -EIO);
3130
3131		if (truncated)
3132			unwritten_start += logical_len;
3133		clear_extent_uptodate(io_tree, unwritten_start, end, NULL);
3134
3135		/* Drop the cache for the part of the extent we didn't write. */
3136		btrfs_drop_extent_cache(inode, unwritten_start, end, 0);
3137
3138		/*
3139		 * If the ordered extent had an IOERR or something else went
3140		 * wrong we need to return the space for this ordered extent
3141		 * back to the allocator.  We only free the extent in the
3142		 * truncated case if we didn't write out the extent at all.
3143		 *
3144		 * If we made it past insert_reserved_file_extent before we
3145		 * errored out then we don't need to do this as the accounting
3146		 * has already been done.
3147		 */
3148		if ((ret || !logical_len) &&
3149		    clear_reserved_extent &&
3150		    !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
3151		    !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
3152			/*
3153			 * Discard the range before returning it back to the
3154			 * free space pool
3155			 */
3156			if (ret && btrfs_test_opt(fs_info, DISCARD_SYNC))
3157				btrfs_discard_extent(fs_info,
3158						ordered_extent->disk_bytenr,
3159						ordered_extent->disk_num_bytes,
3160						NULL);
3161			btrfs_free_reserved_extent(fs_info,
3162					ordered_extent->disk_bytenr,
3163					ordered_extent->disk_num_bytes, 1);
3164		}
3165	}
3166
3167	/*
3168	 * This needs to be done to make sure anybody waiting knows we are done
3169	 * updating everything for this ordered extent.
3170	 */
3171	btrfs_remove_ordered_extent(inode, ordered_extent);
3172
3173	/* once for us */
3174	btrfs_put_ordered_extent(ordered_extent);
3175	/* once for the tree */
3176	btrfs_put_ordered_extent(ordered_extent);
3177
3178	return ret;
3179}
3180
3181static void finish_ordered_fn(struct btrfs_work *work)
3182{
3183	struct btrfs_ordered_extent *ordered_extent;
3184	ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
3185	btrfs_finish_ordered_io(ordered_extent);
3186}
3187
3188void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
3189					  struct page *page, u64 start,
3190					  u64 end, bool uptodate)
3191{
3192	trace_btrfs_writepage_end_io_hook(inode, start, end, uptodate);
3193
3194	btrfs_mark_ordered_io_finished(inode, page, start, end + 1 - start,
3195				       finish_ordered_fn, uptodate);
3196}
3197
3198/*
3199 * check_data_csum - verify checksum of one sector of uncompressed data
3200 * @inode:	inode
3201 * @io_bio:	btrfs_io_bio which contains the csum
3202 * @bio_offset:	offset to the beginning of the bio (in bytes)
3203 * @page:	page where is the data to be verified
3204 * @pgoff:	offset inside the page
3205 * @start:	logical offset in the file
3206 *
3207 * The length of such check is always one sector size.
3208 */
3209static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio,
3210			   u32 bio_offset, struct page *page, u32 pgoff,
3211			   u64 start)
3212{
3213	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3214	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
3215	char *kaddr;
3216	u32 len = fs_info->sectorsize;
3217	const u32 csum_size = fs_info->csum_size;
3218	unsigned int offset_sectors;
3219	u8 *csum_expected;
3220	u8 csum[BTRFS_CSUM_SIZE];
3221
3222	ASSERT(pgoff + len <= PAGE_SIZE);
3223
3224	offset_sectors = bio_offset >> fs_info->sectorsize_bits;
3225	csum_expected = ((u8 *)io_bio->csum) + offset_sectors * csum_size;
3226
3227	kaddr = kmap_atomic(page);
3228	shash->tfm = fs_info->csum_shash;
3229
3230	crypto_shash_digest(shash, kaddr + pgoff, len, csum);
3231
3232	if (memcmp(csum, csum_expected, csum_size))
3233		goto zeroit;
3234
3235	kunmap_atomic(kaddr);
3236	return 0;
3237zeroit:
3238	btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected,
3239				    io_bio->mirror_num);
3240	if (io_bio->device)
3241		btrfs_dev_stat_inc_and_print(io_bio->device,
3242					     BTRFS_DEV_STAT_CORRUPTION_ERRS);
3243	memset(kaddr + pgoff, 1, len);
3244	flush_dcache_page(page);
3245	kunmap_atomic(kaddr);
3246	return -EIO;
3247}
3248
3249/*
3250 * When reads are done, we need to check csums to verify the data is correct.
3251 * if there's a match, we allow the bio to finish.  If not, the code in
3252 * extent_io.c will try to find good copies for us.
3253 *
3254 * @bio_offset:	offset to the beginning of the bio (in bytes)
3255 * @start:	file offset of the range start
3256 * @end:	file offset of the range end (inclusive)
3257 *
3258 * Return a bitmap where bit set means a csum mismatch, and bit not set means
3259 * csum match.
3260 */
3261unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset,
3262				    struct page *page, u64 start, u64 end)
3263{
3264	struct inode *inode = page->mapping->host;
3265	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3266	struct btrfs_root *root = BTRFS_I(inode)->root;
3267	const u32 sectorsize = root->fs_info->sectorsize;
3268	u32 pg_off;
3269	unsigned int result = 0;
3270
3271	if (PageChecked(page)) {
3272		ClearPageChecked(page);
3273		return 0;
3274	}
3275
3276	/*
3277	 * For subpage case, above PageChecked is not safe as it's not subpage
3278	 * compatible.
3279	 * But for now only cow fixup and compressed read utilize PageChecked
3280	 * flag, while in this context we can easily use io_bio->csum to
3281	 * determine if we really need to do csum verification.
3282	 *
3283	 * So for now, just exit if io_bio->csum is NULL, as it means it's
3284	 * compressed read, and its compressed data csum has already been
3285	 * verified.
3286	 */
3287	if (io_bio->csum == NULL)
3288		return 0;
3289
3290	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
3291		return 0;
3292
3293	if (!root->fs_info->csum_root)
3294		return 0;
3295
3296	ASSERT(page_offset(page) <= start &&
3297	       end <= page_offset(page) + PAGE_SIZE - 1);
3298	for (pg_off = offset_in_page(start);
3299	     pg_off < offset_in_page(end);
3300	     pg_off += sectorsize, bio_offset += sectorsize) {
3301		u64 file_offset = pg_off + page_offset(page);
3302		int ret;
3303
3304		if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
3305		    test_range_bit(io_tree, file_offset,
3306				   file_offset + sectorsize - 1,
3307				   EXTENT_NODATASUM, 1, NULL)) {
3308			/* Skip the range without csum for data reloc inode */
3309			clear_extent_bits(io_tree, file_offset,
3310					  file_offset + sectorsize - 1,
3311					  EXTENT_NODATASUM);
3312			continue;
3313		}
3314		ret = check_data_csum(inode, io_bio, bio_offset, page, pg_off,
3315				      page_offset(page) + pg_off);
3316		if (ret < 0) {
3317			const int nr_bit = (pg_off - offset_in_page(start)) >>
3318				     root->fs_info->sectorsize_bits;
3319
3320			result |= (1U << nr_bit);
3321		}
3322	}
3323	return result;
3324}
3325
3326/*
3327 * btrfs_add_delayed_iput - perform a delayed iput on @inode
3328 *
3329 * @inode: The inode we want to perform iput on
3330 *
3331 * This function uses the generic vfs_inode::i_count to track whether we should
3332 * just decrement it (in case it's > 1) or if this is the last iput then link
3333 * the inode to the delayed iput machinery. Delayed iputs are processed at
3334 * transaction commit time/superblock commit/cleaner kthread.
3335 */
3336void btrfs_add_delayed_iput(struct inode *inode)
3337{
3338	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3339	struct btrfs_inode *binode = BTRFS_I(inode);
3340
3341	if (atomic_add_unless(&inode->i_count, -1, 1))
3342		return;
3343
3344	atomic_inc(&fs_info->nr_delayed_iputs);
3345	spin_lock(&fs_info->delayed_iput_lock);
3346	ASSERT(list_empty(&binode->delayed_iput));
3347	list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs);
3348	spin_unlock(&fs_info->delayed_iput_lock);
3349	if (!test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags))
3350		wake_up_process(fs_info->cleaner_kthread);
3351}
3352
3353static void run_delayed_iput_locked(struct btrfs_fs_info *fs_info,
3354				    struct btrfs_inode *inode)
3355{
3356	list_del_init(&inode->delayed_iput);
3357	spin_unlock(&fs_info->delayed_iput_lock);
3358	iput(&inode->vfs_inode);
3359	if (atomic_dec_and_test(&fs_info->nr_delayed_iputs))
3360		wake_up(&fs_info->delayed_iputs_wait);
3361	spin_lock(&fs_info->delayed_iput_lock);
3362}
3363
3364static void btrfs_run_delayed_iput(struct btrfs_fs_info *fs_info,
3365				   struct btrfs_inode *inode)
3366{
3367	if (!list_empty(&inode->delayed_iput)) {
3368		spin_lock(&fs_info->delayed_iput_lock);
3369		if (!list_empty(&inode->delayed_iput))
3370			run_delayed_iput_locked(fs_info, inode);
3371		spin_unlock(&fs_info->delayed_iput_lock);
3372	}
3373}
3374
3375void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
3376{
3377
3378	spin_lock(&fs_info->delayed_iput_lock);
3379	while (!list_empty(&fs_info->delayed_iputs)) {
3380		struct btrfs_inode *inode;
3381
3382		inode = list_first_entry(&fs_info->delayed_iputs,
3383				struct btrfs_inode, delayed_iput);
3384		run_delayed_iput_locked(fs_info, inode);
3385		cond_resched_lock(&fs_info->delayed_iput_lock);
3386	}
3387	spin_unlock(&fs_info->delayed_iput_lock);
3388}
3389
3390/**
3391 * Wait for flushing all delayed iputs
3392 *
3393 * @fs_info:  the filesystem
3394 *
3395 * This will wait on any delayed iputs that are currently running with KILLABLE
3396 * set.  Once they are all done running we will return, unless we are killed in
3397 * which case we return EINTR. This helps in user operations like fallocate etc
3398 * that might get blocked on the iputs.
3399 *
3400 * Return EINTR if we were killed, 0 if nothing's pending
3401 */
3402int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info)
3403{
3404	int ret = wait_event_killable(fs_info->delayed_iputs_wait,
3405			atomic_read(&fs_info->nr_delayed_iputs) == 0);
3406	if (ret)
3407		return -EINTR;
3408	return 0;
3409}
3410
3411/*
3412 * This creates an orphan entry for the given inode in case something goes wrong
3413 * in the middle of an unlink.
3414 */
3415int btrfs_orphan_add(struct btrfs_trans_handle *trans,
3416		     struct btrfs_inode *inode)
3417{
3418	int ret;
3419
3420	ret = btrfs_insert_orphan_item(trans, inode->root, btrfs_ino(inode));
3421	if (ret && ret != -EEXIST) {
3422		btrfs_abort_transaction(trans, ret);
3423		return ret;
3424	}
3425
3426	return 0;
3427}
3428
3429/*
3430 * We have done the delete so we can go ahead and remove the orphan item for
3431 * this particular inode.
3432 */
3433static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
3434			    struct btrfs_inode *inode)
3435{
3436	return btrfs_del_orphan_item(trans, inode->root, btrfs_ino(inode));
3437}
3438
3439/*
3440 * this cleans up any orphans that may be left on the list from the last use
3441 * of this root.
3442 */
3443int btrfs_orphan_cleanup(struct btrfs_root *root)
3444{
3445	struct btrfs_fs_info *fs_info = root->fs_info;
3446	struct btrfs_path *path;
3447	struct extent_buffer *leaf;
3448	struct btrfs_key key, found_key;
3449	struct btrfs_trans_handle *trans;
3450	struct inode *inode;
3451	u64 last_objectid = 0;
3452	int ret = 0, nr_unlink = 0;
3453
3454	if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
3455		return 0;
3456
3457	path = btrfs_alloc_path();
3458	if (!path) {
3459		ret = -ENOMEM;
3460		goto out;
3461	}
3462	path->reada = READA_BACK;
3463
3464	key.objectid = BTRFS_ORPHAN_OBJECTID;
3465	key.type = BTRFS_ORPHAN_ITEM_KEY;
3466	key.offset = (u64)-1;
3467
3468	while (1) {
3469		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3470		if (ret < 0)
3471			goto out;
3472
3473		/*
3474		 * if ret == 0 means we found what we were searching for, which
3475		 * is weird, but possible, so only screw with path if we didn't
3476		 * find the key and see if we have stuff that matches
3477		 */
3478		if (ret > 0) {
3479			ret = 0;
3480			if (path->slots[0] == 0)
3481				break;
3482			path->slots[0]--;
3483		}
3484
3485		/* pull out the item */
3486		leaf = path->nodes[0];
3487		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3488
3489		/* make sure the item matches what we want */
3490		if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
3491			break;
3492		if (found_key.type != BTRFS_ORPHAN_ITEM_KEY)
3493			break;
3494
3495		/* release the path since we're done with it */
3496		btrfs_release_path(path);
3497
3498		/*
3499		 * this is where we are basically btrfs_lookup, without the
3500		 * crossing root thing.  we store the inode number in the
3501		 * offset of the orphan item.
3502		 */
3503
3504		if (found_key.offset == last_objectid) {
3505			btrfs_err(fs_info,
3506				  "Error removing orphan entry, stopping orphan cleanup");
3507			ret = -EINVAL;
3508			goto out;
3509		}
3510
3511		last_objectid = found_key.offset;
3512
3513		found_key.objectid = found_key.offset;
3514		found_key.type = BTRFS_INODE_ITEM_KEY;
3515		found_key.offset = 0;
3516		inode = btrfs_iget(fs_info->sb, last_objectid, root);
3517		ret = PTR_ERR_OR_ZERO(inode);
3518		if (ret && ret != -ENOENT)
3519			goto out;
3520
3521		if (ret == -ENOENT && root == fs_info->tree_root) {
3522			struct btrfs_root *dead_root;
3523			int is_dead_root = 0;
3524
3525			/*
3526			 * This is an orphan in the tree root. Currently these
3527			 * could come from 2 sources:
3528			 *  a) a root (snapshot/subvolume) deletion in progress
3529			 *  b) a free space cache inode
3530			 * We need to distinguish those two, as the orphan item
3531			 * for a root must not get deleted before the deletion
3532			 * of the snapshot/subvolume's tree completes.
3533			 *
3534			 * btrfs_find_orphan_roots() ran before us, which has
3535			 * found all deleted roots and loaded them into
3536			 * fs_info->fs_roots_radix. So here we can find if an
3537			 * orphan item corresponds to a deleted root by looking
3538			 * up the root from that radix tree.
3539			 */
3540
3541			spin_lock(&fs_info->fs_roots_radix_lock);
3542			dead_root = radix_tree_lookup(&fs_info->fs_roots_radix,
3543							 (unsigned long)found_key.objectid);
3544			if (dead_root && btrfs_root_refs(&dead_root->root_item) == 0)
3545				is_dead_root = 1;
3546			spin_unlock(&fs_info->fs_roots_radix_lock);
3547
3548			if (is_dead_root) {
3549				/* prevent this orphan from being found again */
3550				key.offset = found_key.objectid - 1;
3551				continue;
3552			}
3553
3554		}
3555
3556		/*
3557		 * If we have an inode with links, there are a couple of
3558		 * possibilities. Old kernels (before v3.12) used to create an
3559		 * orphan item for truncate indicating that there were possibly
3560		 * extent items past i_size that needed to be deleted. In v3.12,
3561		 * truncate was changed to update i_size in sync with the extent
3562		 * items, but the (useless) orphan item was still created. Since
3563		 * v4.18, we don't create the orphan item for truncate at all.
3564		 *
3565		 * So, this item could mean that we need to do a truncate, but
3566		 * only if this filesystem was last used on a pre-v3.12 kernel
3567		 * and was not cleanly unmounted. The odds of that are quite
3568		 * slim, and it's a pain to do the truncate now, so just delete
3569		 * the orphan item.
3570		 *
3571		 * It's also possible that this orphan item was supposed to be
3572		 * deleted but wasn't. The inode number may have been reused,
3573		 * but either way, we can delete the orphan item.
3574		 */
3575		if (ret == -ENOENT || inode->i_nlink) {
3576			if (!ret)
3577				iput(inode);
3578			trans = btrfs_start_transaction(root, 1);
3579			if (IS_ERR(trans)) {
3580				ret = PTR_ERR(trans);
3581				goto out;
3582			}
3583			btrfs_debug(fs_info, "auto deleting %Lu",
3584				    found_key.objectid);
3585			ret = btrfs_del_orphan_item(trans, root,
3586						    found_key.objectid);
3587			btrfs_end_transaction(trans);
3588			if (ret)
3589				goto out;
3590			continue;
3591		}
3592
3593		nr_unlink++;
3594
3595		/* this will do delete_inode and everything for us */
3596		iput(inode);
3597	}
3598	/* release the path since we're done with it */
3599	btrfs_release_path(path);
3600
3601	root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
3602
3603	if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {
3604		trans = btrfs_join_transaction(root);
3605		if (!IS_ERR(trans))
3606			btrfs_end_transaction(trans);
3607	}
3608
3609	if (nr_unlink)
3610		btrfs_debug(fs_info, "unlinked %d orphans", nr_unlink);
3611
3612out:
3613	if (ret)
3614		btrfs_err(fs_info, "could not do orphan cleanup %d", ret);
3615	btrfs_free_path(path);
3616	return ret;
3617}
3618
3619/*
3620 * very simple check to peek ahead in the leaf looking for xattrs.  If we
3621 * don't find any xattrs, we know there can't be any acls.
3622 *
3623 * slot is the slot the inode is in, objectid is the objectid of the inode
3624 */
3625static noinline int acls_after_inode_item(struct extent_buffer *leaf,
3626					  int slot, u64 objectid,
3627					  int *first_xattr_slot)
3628{
3629	u32 nritems = btrfs_header_nritems(leaf);
3630	struct btrfs_key found_key;
3631	static u64 xattr_access = 0;
3632	static u64 xattr_default = 0;
3633	int scanned = 0;
3634
3635	if (!xattr_access) {
3636		xattr_access = btrfs_name_hash(XATTR_NAME_POSIX_ACL_ACCESS,
3637					strlen(XATTR_NAME_POSIX_ACL_ACCESS));
3638		xattr_default = btrfs_name_hash(XATTR_NAME_POSIX_ACL_DEFAULT,
3639					strlen(XATTR_NAME_POSIX_ACL_DEFAULT));
3640	}
3641
3642	slot++;
3643	*first_xattr_slot = -1;
3644	while (slot < nritems) {
3645		btrfs_item_key_to_cpu(leaf, &found_key, slot);
3646
3647		/* we found a different objectid, there must not be acls */
3648		if (found_key.objectid != objectid)
3649			return 0;
3650
3651		/* we found an xattr, assume we've got an acl */
3652		if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
3653			if (*first_xattr_slot == -1)
3654				*first_xattr_slot = slot;
3655			if (found_key.offset == xattr_access ||
3656			    found_key.offset == xattr_default)
3657				return 1;
3658		}
3659
3660		/*
3661		 * we found a key greater than an xattr key, there can't
3662		 * be any acls later on
3663		 */
3664		if (found_key.type > BTRFS_XATTR_ITEM_KEY)
3665			return 0;
3666
3667		slot++;
3668		scanned++;
3669
3670		/*
3671		 * it goes inode, inode backrefs, xattrs, extents,
3672		 * so if there are a ton of hard links to an inode there can
3673		 * be a lot of backrefs.  Don't waste time searching too hard,
3674		 * this is just an optimization
3675		 */
3676		if (scanned >= 8)
3677			break;
3678	}
3679	/* we hit the end of the leaf before we found an xattr or
3680	 * something larger than an xattr.  We have to assume the inode
3681	 * has acls
3682	 */
3683	if (*first_xattr_slot == -1)
3684		*first_xattr_slot = slot;
3685	return 1;
3686}
3687
3688/*
3689 * read an inode from the btree into the in-memory inode
3690 */
3691static int btrfs_read_locked_inode(struct inode *inode,
3692				   struct btrfs_path *in_path)
3693{
3694	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3695	struct btrfs_path *path = in_path;
3696	struct extent_buffer *leaf;
3697	struct btrfs_inode_item *inode_item;
3698	struct btrfs_root *root = BTRFS_I(inode)->root;
3699	struct btrfs_key location;
3700	unsigned long ptr;
3701	int maybe_acls;
3702	u32 rdev;
3703	int ret;
3704	bool filled = false;
3705	int first_xattr_slot;
3706
3707	ret = btrfs_fill_inode(inode, &rdev);
3708	if (!ret)
3709		filled = true;
3710
3711	if (!path) {
3712		path = btrfs_alloc_path();
3713		if (!path)
3714			return -ENOMEM;
3715	}
3716
3717	memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
3718
3719	ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
3720	if (ret) {
3721		if (path != in_path)
3722			btrfs_free_path(path);
3723		return ret;
3724	}
3725
3726	leaf = path->nodes[0];
3727
3728	if (filled)
3729		goto cache_index;
3730
3731	inode_item = btrfs_item_ptr(leaf, path->slots[0],
3732				    struct btrfs_inode_item);
3733	inode->i_mode = btrfs_inode_mode(leaf, inode_item);
3734	set_nlink(inode, btrfs_inode_nlink(leaf, inode_item));
3735	i_uid_write(inode, btrfs_inode_uid(leaf, inode_item));
3736	i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));
3737	btrfs_i_size_write(BTRFS_I(inode), btrfs_inode_size(leaf, inode_item));
3738	btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0,
3739			round_up(i_size_read(inode), fs_info->sectorsize));
3740
3741	inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime);
3742	inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime);
3743
3744	inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->mtime);
3745	inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->mtime);
3746
3747	inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->ctime);
3748	inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->ctime);
3749
3750	BTRFS_I(inode)->i_otime.tv_sec =
3751		btrfs_timespec_sec(leaf, &inode_item->otime);
3752	BTRFS_I(inode)->i_otime.tv_nsec =
3753		btrfs_timespec_nsec(leaf, &inode_item->otime);
3754
3755	inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
3756	BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
3757	BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);
3758
3759	inode_set_iversion_queried(inode,
3760				   btrfs_inode_sequence(leaf, inode_item));
3761	inode->i_generation = BTRFS_I(inode)->generation;
3762	inode->i_rdev = 0;
3763	rdev = btrfs_inode_rdev(leaf, inode_item);
3764
3765	BTRFS_I(inode)->index_cnt = (u64)-1;
3766	BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
3767
3768cache_index:
3769	/*
3770	 * If we were modified in the current generation and evicted from memory
3771	 * and then re-read we need to do a full sync since we don't have any
3772	 * idea about which extents were modified before we were evicted from
3773	 * cache.
3774	 *
3775	 * This is required for both inode re-read from disk and delayed inode
3776	 * in delayed_nodes_tree.
3777	 */
3778	if (BTRFS_I(inode)->last_trans == fs_info->generation)
3779		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3780			&BTRFS_I(inode)->runtime_flags);
3781
3782	/*
3783	 * We don't persist the id of the transaction where an unlink operation
3784	 * against the inode was last made. So here we assume the inode might
3785	 * have been evicted, and therefore the exact value of last_unlink_trans
3786	 * lost, and set it to last_trans to avoid metadata inconsistencies
3787	 * between the inode and its parent if the inode is fsync'ed and the log
3788	 * replayed. For example, in the scenario:
3789	 *
3790	 * touch mydir/foo
3791	 * ln mydir/foo mydir/bar
3792	 * sync
3793	 * unlink mydir/bar
3794	 * echo 2 > /proc/sys/vm/drop_caches   # evicts inode
3795	 * xfs_io -c fsync mydir/foo
3796	 * <power failure>
3797	 * mount fs, triggers fsync log replay
3798	 *
3799	 * We must make sure that when we fsync our inode foo we also log its
3800	 * parent inode, otherwise after log replay the parent still has the
3801	 * dentry with the "bar" name but our inode foo has a link count of 1
3802	 * and doesn't have an inode ref with the name "bar" anymore.
3803	 *
3804	 * Setting last_unlink_trans to last_trans is a pessimistic approach,
3805	 * but it guarantees correctness at the expense of occasional full
3806	 * transaction commits on fsync if our inode is a directory, or if our
3807	 * inode is not a directory, logging its parent unnecessarily.
3808	 */
3809	BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans;
3810
3811	/*
3812	 * Same logic as for last_unlink_trans. We don't persist the generation
3813	 * of the last transaction where this inode was used for a reflink
3814	 * operation, so after eviction and reloading the inode we must be
3815	 * pessimistic and assume the last transaction that modified the inode.
3816	 */
3817	BTRFS_I(inode)->last_reflink_trans = BTRFS_I(inode)->last_trans;
3818
3819	path->slots[0]++;
3820	if (inode->i_nlink != 1 ||
3821	    path->slots[0] >= btrfs_header_nritems(leaf))
3822		goto cache_acl;
3823
3824	btrfs_item_key_to_cpu(leaf, &location, path->slots[0]);
3825	if (location.objectid != btrfs_ino(BTRFS_I(inode)))
3826		goto cache_acl;
3827
3828	ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
3829	if (location.type == BTRFS_INODE_REF_KEY) {
3830		struct btrfs_inode_ref *ref;
3831
3832		ref = (struct btrfs_inode_ref *)ptr;
3833		BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref);
3834	} else if (location.type == BTRFS_INODE_EXTREF_KEY) {
3835		struct btrfs_inode_extref *extref;
3836
3837		extref = (struct btrfs_inode_extref *)ptr;
3838		BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf,
3839								     extref);
3840	}
3841cache_acl:
3842	/*
3843	 * try to precache a NULL acl entry for files that don't have
3844	 * any xattrs or acls
3845	 */
3846	maybe_acls = acls_after_inode_item(leaf, path->slots[0],
3847			btrfs_ino(BTRFS_I(inode)), &first_xattr_slot);
3848	if (first_xattr_slot != -1) {
3849		path->slots[0] = first_xattr_slot;
3850		ret = btrfs_load_inode_props(inode, path);
3851		if (ret)
3852			btrfs_err(fs_info,
3853				  "error loading props for ino %llu (root %llu): %d",
3854				  btrfs_ino(BTRFS_I(inode)),
3855				  root->root_key.objectid, ret);
3856	}
3857	if (path != in_path)
3858		btrfs_free_path(path);
3859
3860	if (!maybe_acls)
3861		cache_no_acl(inode);
3862
3863	switch (inode->i_mode & S_IFMT) {
3864	case S_IFREG:
3865		inode->i_mapping->a_ops = &btrfs_aops;
3866		inode->i_fop = &btrfs_file_operations;
3867		inode->i_op = &btrfs_file_inode_operations;
3868		break;
3869	case S_IFDIR:
3870		inode->i_fop = &btrfs_dir_file_operations;
3871		inode->i_op = &btrfs_dir_inode_operations;
3872		break;
3873	case S_IFLNK:
3874		inode->i_op = &btrfs_symlink_inode_operations;
3875		inode_nohighmem(inode);
3876		inode->i_mapping->a_ops = &btrfs_aops;
3877		break;
3878	default:
3879		inode->i_op = &btrfs_special_inode_operations;
3880		init_special_inode(inode, inode->i_mode, rdev);
3881		break;
3882	}
3883
3884	btrfs_sync_inode_flags_to_i_flags(inode);
3885	return 0;
3886}
3887
3888/*
3889 * given a leaf and an inode, copy the inode fields into the leaf
3890 */
3891static void fill_inode_item(struct btrfs_trans_handle *trans,
3892			    struct extent_buffer *leaf,
3893			    struct btrfs_inode_item *item,
3894			    struct inode *inode)
3895{
3896	struct btrfs_map_token token;
3897
3898	btrfs_init_map_token(&token, leaf);
3899
3900	btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
3901	btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
3902	btrfs_set_token_inode_size(&token, item, BTRFS_I(inode)->disk_i_size);
3903	btrfs_set_token_inode_mode(&token, item, inode->i_mode);
3904	btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
3905
3906	btrfs_set_token_timespec_sec(&token, &item->atime,
3907				     inode->i_atime.tv_sec);
3908	btrfs_set_token_timespec_nsec(&token, &item->atime,
3909				      inode->i_atime.tv_nsec);
3910
3911	btrfs_set_token_timespec_sec(&token, &item->mtime,
3912				     inode->i_mtime.tv_sec);
3913	btrfs_set_token_timespec_nsec(&token, &item->mtime,
3914				      inode->i_mtime.tv_nsec);
3915
3916	btrfs_set_token_timespec_sec(&token, &item->ctime,
3917				     inode->i_ctime.tv_sec);
3918	btrfs_set_token_timespec_nsec(&token, &item->ctime,
3919				      inode->i_ctime.tv_nsec);
3920
3921	btrfs_set_token_timespec_sec(&token, &item->otime,
3922				     BTRFS_I(inode)->i_otime.tv_sec);
3923	btrfs_set_token_timespec_nsec(&token, &item->otime,
3924				      BTRFS_I(inode)->i_otime.tv_nsec);
3925
3926	btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode));
3927	btrfs_set_token_inode_generation(&token, item,
3928					 BTRFS_I(inode)->generation);
3929	btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
3930	btrfs_set_token_inode_transid(&token, item, trans->transid);
3931	btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
3932	btrfs_set_token_inode_flags(&token, item, BTRFS_I(inode)->flags);
3933	btrfs_set_token_inode_block_group(&token, item, 0);
3934}
3935
3936/*
3937 * copy everything in the in-memory inode into the btree.
3938 */
3939static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
3940				struct btrfs_root *root,
3941				struct btrfs_inode *inode)
3942{
3943	struct btrfs_inode_item *inode_item;
3944	struct btrfs_path *path;
3945	struct extent_buffer *leaf;
3946	int ret;
3947
3948	path = btrfs_alloc_path();
3949	if (!path)
3950		return -ENOMEM;
3951
3952	ret = btrfs_lookup_inode(trans, root, path, &inode->location, 1);
3953	if (ret) {
3954		if (ret > 0)
3955			ret = -ENOENT;
3956		goto failed;
3957	}
3958
3959	leaf = path->nodes[0];
3960	inode_item = btrfs_item_ptr(leaf, path->slots[0],
3961				    struct btrfs_inode_item);
3962
3963	fill_inode_item(trans, leaf, inode_item, &inode->vfs_inode);
3964	btrfs_mark_buffer_dirty(leaf);
3965	btrfs_set_inode_last_trans(trans, inode);
3966	ret = 0;
3967failed:
3968	btrfs_free_path(path);
3969	return ret;
3970}
3971
3972/*
3973 * copy everything in the in-memory inode into the btree.
3974 */
3975noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
3976				struct btrfs_root *root,
3977				struct btrfs_inode *inode)
3978{
3979	struct btrfs_fs_info *fs_info = root->fs_info;
3980	int ret;
3981
3982	/*
3983	 * If the inode is a free space inode, we can deadlock during commit
3984	 * if we put it into the delayed code.
3985	 *
3986	 * The data relocation inode should also be directly updated
3987	 * without delay
3988	 */
3989	if (!btrfs_is_free_space_inode(inode)
3990	    && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
3991	    && !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
3992		btrfs_update_root_times(trans, root);
3993
3994		ret = btrfs_delayed_update_inode(trans, root, inode);
3995		if (!ret)
3996			btrfs_set_inode_last_trans(trans, inode);
3997		return ret;
3998	}
3999
4000	return btrfs_update_inode_item(trans, root, inode);
4001}
4002
4003int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
4004				struct btrfs_root *root, struct btrfs_inode *inode)
4005{
4006	int ret;
4007
4008	ret = btrfs_update_inode(trans, root, inode);
4009	if (ret == -ENOSPC)
4010		return btrfs_update_inode_item(trans, root, inode);
4011	return ret;
4012}
4013
4014/*
4015 * unlink helper that gets used here in inode.c and in the tree logging
4016 * recovery code.  It remove a link in a directory with a given name, and
4017 * also drops the back refs in the inode to the directory
4018 */
4019static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
4020				struct btrfs_root *root,
4021				struct btrfs_inode *dir,
4022				struct btrfs_inode *inode,
4023				const char *name, int name_len)
4024{
4025	struct btrfs_fs_info *fs_info = root->fs_info;
4026	struct btrfs_path *path;
4027	int ret = 0;
4028	struct btrfs_dir_item *di;
4029	u64 index;
4030	u64 ino = btrfs_ino(inode);
4031	u64 dir_ino = btrfs_ino(dir);
4032
4033	path = btrfs_alloc_path();
4034	if (!path) {
4035		ret = -ENOMEM;
4036		goto out;
4037	}
4038
4039	di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
4040				    name, name_len, -1);
4041	if (IS_ERR_OR_NULL(di)) {
4042		ret = di ? PTR_ERR(di) : -ENOENT;
4043		goto err;
4044	}
4045	ret = btrfs_delete_one_dir_name(trans, root, path, di);
4046	if (ret)
4047		goto err;
4048	btrfs_release_path(path);
4049
4050	/*
4051	 * If we don't have dir index, we have to get it by looking up
4052	 * the inode ref, since we get the inode ref, remove it directly,
4053	 * it is unnecessary to do delayed deletion.
4054	 *
4055	 * But if we have dir index, needn't search inode ref to get it.
4056	 * Since the inode ref is close to the inode item, it is better
4057	 * that we delay to delete it, and just do this deletion when
4058	 * we update the inode item.
4059	 */
4060	if (inode->dir_index) {
4061		ret = btrfs_delayed_delete_inode_ref(inode);
4062		if (!ret) {
4063			index = inode->dir_index;
4064			goto skip_backref;
4065		}
4066	}
4067
4068	ret = btrfs_del_inode_ref(trans, root, name, name_len, ino,
4069				  dir_ino, &index);
4070	if (ret) {
4071		btrfs_info(fs_info,
4072			"failed to delete reference to %.*s, inode %llu parent %llu",
4073			name_len, name, ino, dir_ino);
4074		btrfs_abort_transaction(trans, ret);
4075		goto err;
4076	}
4077skip_backref:
4078	ret = btrfs_delete_delayed_dir_index(trans, dir, index);
4079	if (ret) {
4080		btrfs_abort_transaction(trans, ret);
4081		goto err;
4082	}
4083
4084	ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode,
4085			dir_ino);
4086	if (ret != 0 && ret != -ENOENT) {
4087		btrfs_abort_transaction(trans, ret);
4088		goto err;
4089	}
4090
4091	ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir,
4092			index);
4093	if (ret == -ENOENT)
4094		ret = 0;
4095	else if (ret)
4096		btrfs_abort_transaction(trans, ret);
4097
4098	/*
4099	 * If we have a pending delayed iput we could end up with the final iput
4100	 * being run in btrfs-cleaner context.  If we have enough of these built
4101	 * up we can end up burning a lot of time in btrfs-cleaner without any
4102	 * way to throttle the unlinks.  Since we're currently holding a ref on
4103	 * the inode we can run the delayed iput here without any issues as the
4104	 * final iput won't be done until after we drop the ref we're currently
4105	 * holding.
4106	 */
4107	btrfs_run_delayed_iput(fs_info, inode);
4108err:
4109	btrfs_free_path(path);
4110	if (ret)
4111		goto out;
4112
4113	btrfs_i_size_write(dir, dir->vfs_inode.i_size - name_len * 2);
4114	inode_inc_iversion(&inode->vfs_inode);
4115	inode_inc_iversion(&dir->vfs_inode);
4116	inode->vfs_inode.i_ctime = dir->vfs_inode.i_mtime =
4117		dir->vfs_inode.i_ctime = current_time(&inode->vfs_inode);
4118	ret = btrfs_update_inode(trans, root, dir);
4119out:
4120	return ret;
4121}
4122
4123int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
4124		       struct btrfs_root *root,
4125		       struct btrfs_inode *dir, struct btrfs_inode *inode,
4126		       const char *name, int name_len)
4127{
4128	int ret;
4129	ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
4130	if (!ret) {
4131		drop_nlink(&inode->vfs_inode);
4132		ret = btrfs_update_inode(trans, root, inode);
4133	}
4134	return ret;
4135}
4136
4137/*
4138 * helper to start transaction for unlink and rmdir.
4139 *
4140 * unlink and rmdir are special in btrfs, they do not always free space, so
4141 * if we cannot make our reservations the normal way try and see if there is
4142 * plenty of slack room in the global reserve to migrate, otherwise we cannot
4143 * allow the unlink to occur.
4144 */
4145static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
4146{
4147	struct btrfs_root *root = BTRFS_I(dir)->root;
4148
4149	/*
4150	 * 1 for the possible orphan item
4151	 * 1 for the dir item
4152	 * 1 for the dir index
4153	 * 1 for the inode ref
4154	 * 1 for the inode
4155	 */
4156	return btrfs_start_transaction_fallback_global_rsv(root, 5);
4157}
4158
4159static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
4160{
4161	struct btrfs_root *root = BTRFS_I(dir)->root;
4162	struct btrfs_trans_handle *trans;
4163	struct inode *inode = d_inode(dentry);
4164	int ret;
4165
4166	trans = __unlink_start_trans(dir);
4167	if (IS_ERR(trans))
4168		return PTR_ERR(trans);
4169
4170	btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
4171			0);
4172
4173	ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
4174			BTRFS_I(d_inode(dentry)), dentry->d_name.name,
4175			dentry->d_name.len);
4176	if (ret)
4177		goto out;
4178
4179	if (inode->i_nlink == 0) {
4180		ret = btrfs_orphan_add(trans, BTRFS_I(inode));
4181		if (ret)
4182			goto out;
4183	}
4184
4185out:
4186	btrfs_end_transaction(trans);
4187	btrfs_btree_balance_dirty(root->fs_info);
4188	return ret;
4189}
4190
4191static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
4192			       struct inode *dir, struct dentry *dentry)
4193{
4194	struct btrfs_root *root = BTRFS_I(dir)->root;
4195	struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
4196	struct btrfs_path *path;
4197	struct extent_buffer *leaf;
4198	struct btrfs_dir_item *di;
4199	struct btrfs_key key;
4200	const char *name = dentry->d_name.name;
4201	int name_len = dentry->d_name.len;
4202	u64 index;
4203	int ret;
4204	u64 objectid;
4205	u64 dir_ino = btrfs_ino(BTRFS_I(dir));
4206
4207	if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) {
4208		objectid = inode->root->root_key.objectid;
4209	} else if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
4210		objectid = inode->location.objectid;
4211	} else {
4212		WARN_ON(1);
4213		return -EINVAL;
4214	}
4215
4216	path = btrfs_alloc_path();
4217	if (!path)
4218		return -ENOMEM;
4219
4220	di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
4221				   name, name_len, -1);
4222	if (IS_ERR_OR_NULL(di)) {
4223		ret = di ? PTR_ERR(di) : -ENOENT;
4224		goto out;
4225	}
4226
4227	leaf = path->nodes[0];
4228	btrfs_dir_item_key_to_cpu(leaf, di, &key);
4229	WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
4230	ret = btrfs_delete_one_dir_name(trans, root, path, di);
4231	if (ret) {
4232		btrfs_abort_transaction(trans, ret);
4233		goto out;
4234	}
4235	btrfs_release_path(path);
4236
4237	/*
4238	 * This is a placeholder inode for a subvolume we didn't have a
4239	 * reference to at the time of the snapshot creation.  In the meantime
4240	 * we could have renamed the real subvol link into our snapshot, so
4241	 * depending on btrfs_del_root_ref to return -ENOENT here is incorrect.
4242	 * Instead simply lookup the dir_index_item for this entry so we can
4243	 * remove it.  Otherwise we know we have a ref to the root and we can
4244	 * call btrfs_del_root_ref, and it _shouldn't_ fail.
4245	 */
4246	if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
4247		di = btrfs_search_dir_index_item(root, path, dir_ino,
4248						 name, name_len);
4249		if (IS_ERR_OR_NULL(di)) {
4250			if (!di)
4251				ret = -ENOENT;
4252			else
4253				ret = PTR_ERR(di);
4254			btrfs_abort_transaction(trans, ret);
4255			goto out;
4256		}
4257
4258		leaf = path->nodes[0];
4259		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4260		index = key.offset;
4261		btrfs_release_path(path);
4262	} else {
4263		ret = btrfs_del_root_ref(trans, objectid,
4264					 root->root_key.objectid, dir_ino,
4265					 &index, name, name_len);
4266		if (ret) {
4267			btrfs_abort_transaction(trans, ret);
4268			goto out;
4269		}
4270	}
4271
4272	ret = btrfs_delete_delayed_dir_index(trans, BTRFS_I(dir), index);
4273	if (ret) {
4274		btrfs_abort_transaction(trans, ret);
4275		goto out;
4276	}
4277
4278	btrfs_i_size_write(BTRFS_I(dir), dir->i_size - name_len * 2);
4279	inode_inc_iversion(dir);
4280	dir->i_mtime = dir->i_ctime = current_time(dir);
4281	ret = btrfs_update_inode_fallback(trans, root, BTRFS_I(dir));
4282	if (ret)
4283		btrfs_abort_transaction(trans, ret);
4284out:
4285	btrfs_free_path(path);
4286	return ret;
4287}
4288
4289/*
4290 * Helper to check if the subvolume references other subvolumes or if it's
4291 * default.
4292 */
4293static noinline int may_destroy_subvol(struct btrfs_root *root)
4294{
4295	struct btrfs_fs_info *fs_info = root->fs_info;
4296	struct btrfs_path *path;
4297	struct btrfs_dir_item *di;
4298	struct btrfs_key key;
4299	u64 dir_id;
4300	int ret;
4301
4302	path = btrfs_alloc_path();
4303	if (!path)
4304		return -ENOMEM;
4305
4306	/* Make sure this root isn't set as the default subvol */
4307	dir_id = btrfs_super_root_dir(fs_info->super_copy);
4308	di = btrfs_lookup_dir_item(NULL, fs_info->tree_root, path,
4309				   dir_id, "default", 7, 0);
4310	if (di && !IS_ERR(di)) {
4311		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
4312		if (key.objectid == root->root_key.objectid) {
4313			ret = -EPERM;
4314			btrfs_err(fs_info,
4315				  "deleting default subvolume %llu is not allowed",
4316				  key.objectid);
4317			goto out;
4318		}
4319		btrfs_release_path(path);
4320	}
4321
4322	key.objectid = root->root_key.objectid;
4323	key.type = BTRFS_ROOT_REF_KEY;
4324	key.offset = (u64)-1;
4325
4326	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
4327	if (ret < 0)
4328		goto out;
4329	BUG_ON(ret == 0);
4330
4331	ret = 0;
4332	if (path->slots[0] > 0) {
4333		path->slots[0]--;
4334		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
4335		if (key.objectid == root->root_key.objectid &&
4336		    key.type == BTRFS_ROOT_REF_KEY)
4337			ret = -ENOTEMPTY;
4338	}
4339out:
4340	btrfs_free_path(path);
4341	return ret;
4342}
4343
4344/* Delete all dentries for inodes belonging to the root */
4345static void btrfs_prune_dentries(struct btrfs_root *root)
4346{
4347	struct btrfs_fs_info *fs_info = root->fs_info;
4348	struct rb_node *node;
4349	struct rb_node *prev;
4350	struct btrfs_inode *entry;
4351	struct inode *inode;
4352	u64 objectid = 0;
4353
4354	if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
4355		WARN_ON(btrfs_root_refs(&root->root_item) != 0);
4356
4357	spin_lock(&root->inode_lock);
4358again:
4359	node = root->inode_tree.rb_node;
4360	prev = NULL;
4361	while (node) {
4362		prev = node;
4363		entry = rb_entry(node, struct btrfs_inode, rb_node);
4364
4365		if (objectid < btrfs_ino(entry))
4366			node = node->rb_left;
4367		else if (objectid > btrfs_ino(entry))
4368			node = node->rb_right;
4369		else
4370			break;
4371	}
4372	if (!node) {
4373		while (prev) {
4374			entry = rb_entry(prev, struct btrfs_inode, rb_node);
4375			if (objectid <= btrfs_ino(entry)) {
4376				node = prev;
4377				break;
4378			}
4379			prev = rb_next(prev);
4380		}
4381	}
4382	while (node) {
4383		entry = rb_entry(node, struct btrfs_inode, rb_node);
4384		objectid = btrfs_ino(entry) + 1;
4385		inode = igrab(&entry->vfs_inode);
4386		if (inode) {
4387			spin_unlock(&root->inode_lock);
4388			if (atomic_read(&inode->i_count) > 1)
4389				d_prune_aliases(inode);
4390			/*
4391			 * btrfs_drop_inode will have it removed from the inode
4392			 * cache when its usage count hits zero.
4393			 */
4394			iput(inode);
4395			cond_resched();
4396			spin_lock(&root->inode_lock);
4397			goto again;
4398		}
4399
4400		if (cond_resched_lock(&root->inode_lock))
4401			goto again;
4402
4403		node = rb_next(node);
4404	}
4405	spin_unlock(&root->inode_lock);
4406}
4407
4408int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
4409{
4410	struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb);
4411	struct btrfs_root *root = BTRFS_I(dir)->root;
4412	struct inode *inode = d_inode(dentry);
4413	struct btrfs_root *dest = BTRFS_I(inode)->root;
4414	struct btrfs_trans_handle *trans;
4415	struct btrfs_block_rsv block_rsv;
4416	u64 root_flags;
4417	int ret;
4418
4419	/*
4420	 * Don't allow to delete a subvolume with send in progress. This is
4421	 * inside the inode lock so the error handling that has to drop the bit
4422	 * again is not run concurrently.
4423	 */
4424	spin_lock(&dest->root_item_lock);
4425	if (dest->send_in_progress) {
4426		spin_unlock(&dest->root_item_lock);
4427		btrfs_warn(fs_info,
4428			   "attempt to delete subvolume %llu during send",
4429			   dest->root_key.objectid);
4430		return -EPERM;
4431	}
4432	root_flags = btrfs_root_flags(&dest->root_item);
4433	btrfs_set_root_flags(&dest->root_item,
4434			     root_flags | BTRFS_ROOT_SUBVOL_DEAD);
4435	spin_unlock(&dest->root_item_lock);
4436
4437	down_write(&fs_info->subvol_sem);
4438
4439	ret = may_destroy_subvol(dest);
4440	if (ret)
4441		goto out_up_write;
4442
4443	btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
4444	/*
4445	 * One for dir inode,
4446	 * two for dir entries,
4447	 * two for root ref/backref.
4448	 */
4449	ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true);
4450	if (ret)
4451		goto out_up_write;
4452
4453	trans = btrfs_start_transaction(root, 0);
4454	if (IS_ERR(trans)) {
4455		ret = PTR_ERR(trans);
4456		goto out_release;
4457	}
4458	trans->block_rsv = &block_rsv;
4459	trans->bytes_reserved = block_rsv.size;
4460
4461	btrfs_record_snapshot_destroy(trans, BTRFS_I(dir));
4462
4463	ret = btrfs_unlink_subvol(trans, dir, dentry);
4464	if (ret) {
4465		btrfs_abort_transaction(trans, ret);
4466		goto out_end_trans;
4467	}
4468
4469	ret = btrfs_record_root_in_trans(trans, dest);
4470	if (ret) {
4471		btrfs_abort_transaction(trans, ret);
4472		goto out_end_trans;
4473	}
4474
4475	memset(&dest->root_item.drop_progress, 0,
4476		sizeof(dest->root_item.drop_progress));
4477	btrfs_set_root_drop_level(&dest->root_item, 0);
4478	btrfs_set_root_refs(&dest->root_item, 0);
4479
4480	if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) {
4481		ret = btrfs_insert_orphan_item(trans,
4482					fs_info->tree_root,
4483					dest->root_key.objectid);
4484		if (ret) {
4485			btrfs_abort_transaction(trans, ret);
4486			goto out_end_trans;
4487		}
4488	}
4489
4490	ret = btrfs_uuid_tree_remove(trans, dest->root_item.uuid,
4491				  BTRFS_UUID_KEY_SUBVOL,
4492				  dest->root_key.objectid);
4493	if (ret && ret != -ENOENT) {
4494		btrfs_abort_transaction(trans, ret);
4495		goto out_end_trans;
4496	}
4497	if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) {
4498		ret = btrfs_uuid_tree_remove(trans,
4499					  dest->root_item.received_uuid,
4500					  BTRFS_UUID_KEY_RECEIVED_SUBVOL,
4501					  dest->root_key.objectid);
4502		if (ret && ret != -ENOENT) {
4503			btrfs_abort_transaction(trans, ret);
4504			goto out_end_trans;
4505		}
4506	}
4507
4508	free_anon_bdev(dest->anon_dev);
4509	dest->anon_dev = 0;
4510out_end_trans:
4511	trans->block_rsv = NULL;
4512	trans->bytes_reserved = 0;
4513	ret = btrfs_end_transaction(trans);
4514	inode->i_flags |= S_DEAD;
4515out_release:
4516	btrfs_subvolume_release_metadata(root, &block_rsv);
4517out_up_write:
4518	up_write(&fs_info->subvol_sem);
4519	if (ret) {
4520		spin_lock(&dest->root_item_lock);
4521		root_flags = btrfs_root_flags(&dest->root_item);
4522		btrfs_set_root_flags(&dest->root_item,
4523				root_flags & ~BTRFS_ROOT_SUBVOL_DEAD);
4524		spin_unlock(&dest->root_item_lock);
4525	} else {
4526		d_invalidate(dentry);
4527		btrfs_prune_dentries(dest);
4528		ASSERT(dest->send_in_progress == 0);
4529	}
4530
4531	return ret;
4532}
4533
4534static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
4535{
4536	struct inode *inode = d_inode(dentry);
4537	int err = 0;
4538	struct btrfs_root *root = BTRFS_I(dir)->root;
4539	struct btrfs_trans_handle *trans;
4540	u64 last_unlink_trans;
4541
4542	if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
4543		return -ENOTEMPTY;
4544	if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID)
4545		return btrfs_delete_subvolume(dir, dentry);
4546
4547	trans = __unlink_start_trans(dir);
4548	if (IS_ERR(trans))
4549		return PTR_ERR(trans);
4550
4551	if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
4552		err = btrfs_unlink_subvol(trans, dir, dentry);
4553		goto out;
4554	}
4555
4556	err = btrfs_orphan_add(trans, BTRFS_I(inode));
4557	if (err)
4558		goto out;
4559
4560	last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;
4561
4562	/* now the directory is empty */
4563	err = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
4564			BTRFS_I(d_inode(dentry)), dentry->d_name.name,
4565			dentry->d_name.len);
4566	if (!err) {
4567		btrfs_i_size_write(BTRFS_I(inode), 0);
4568		/*
4569		 * Propagate the last_unlink_trans value of the deleted dir to
4570		 * its parent directory. This is to prevent an unrecoverable
4571		 * log tree in the case we do something like this:
4572		 * 1) create dir foo
4573		 * 2) create snapshot under dir foo
4574		 * 3) delete the snapshot
4575		 * 4) rmdir foo
4576		 * 5) mkdir foo
4577		 * 6) fsync foo or some file inside foo
4578		 */
4579		if (last_unlink_trans >= trans->transid)
4580			BTRFS_I(dir)->last_unlink_trans = last_unlink_trans;
4581	}
4582out:
4583	btrfs_end_transaction(trans);
4584	btrfs_btree_balance_dirty(root->fs_info);
4585
4586	return err;
4587}
4588
4589/*
4590 * Return this if we need to call truncate_block for the last bit of the
4591 * truncate.
4592 */
4593#define NEED_TRUNCATE_BLOCK 1
4594
4595/*
4596 * Remove inode items from a given root.
4597 *
4598 * @trans:		A transaction handle.
4599 * @root:		The root from which to remove items.
4600 * @inode:		The inode whose items we want to remove.
4601 * @new_size:		The new i_size for the inode. This is only applicable when
4602 *			@min_type is BTRFS_EXTENT_DATA_KEY, must be 0 otherwise.
4603 * @min_type:		The minimum key type to remove. All keys with a type
4604 *			greater than this value are removed and all keys with
4605 *			this type are removed only if their offset is >= @new_size.
4606 * @extents_found:	Output parameter that will contain the number of file
4607 *			extent items that were removed or adjusted to the new
4608 *			inode i_size. The caller is responsible for initializing
4609 *			the counter. Also, it can be NULL if the caller does not
4610 *			need this counter.
4611 *
4612 * Remove all keys associated with the inode from the given root that have a key
4613 * with a type greater than or equals to @min_type. When @min_type has a value of
4614 * BTRFS_EXTENT_DATA_KEY, only remove file extent items that have an offset value
4615 * greater than or equals to @new_size. If a file extent item that starts before
4616 * @new_size and ends after it is found, its length is adjusted.
4617 *
4618 * Returns: 0 on success, < 0 on error and NEED_TRUNCATE_BLOCK when @min_type is
4619 * BTRFS_EXTENT_DATA_KEY and the caller must truncate the last block.
4620 */
4621int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
4622			       struct btrfs_root *root,
4623			       struct btrfs_inode *inode,
4624			       u64 new_size, u32 min_type,
4625			       u64 *extents_found)
4626{
4627	struct btrfs_fs_info *fs_info = root->fs_info;
4628	struct btrfs_path *path;
4629	struct extent_buffer *leaf;
4630	struct btrfs_file_extent_item *fi;
4631	struct btrfs_key key;
4632	struct btrfs_key found_key;
4633	u64 extent_start = 0;
4634	u64 extent_num_bytes = 0;
4635	u64 extent_offset = 0;
4636	u64 item_end = 0;
4637	u64 last_size = new_size;
4638	u32 found_type = (u8)-1;
4639	int found_extent;
4640	int del_item;
4641	int pending_del_nr = 0;
4642	int pending_del_slot = 0;
4643	int extent_type = -1;
4644	int ret;
4645	u64 ino = btrfs_ino(inode);
4646	u64 bytes_deleted = 0;
4647	bool be_nice = false;
4648	bool should_throttle = false;
4649	const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize);
4650	struct extent_state *cached_state = NULL;
4651
4652	BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
4653
4654	/*
4655	 * For non-free space inodes and non-shareable roots, we want to back
4656	 * off from time to time.  This means all inodes in subvolume roots,
4657	 * reloc roots, and data reloc roots.
4658	 */
4659	if (!btrfs_is_free_space_inode(inode) &&
4660	    test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
4661		be_nice = true;
4662
4663	path = btrfs_alloc_path();
4664	if (!path)
4665		return -ENOMEM;
4666	path->reada = READA_BACK;
4667
4668	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4669		lock_extent_bits(&inode->io_tree, lock_start, (u64)-1,
4670				 &cached_state);
4671
4672		/*
4673		 * We want to drop from the next block forward in case this
4674		 * new size is not block aligned since we will be keeping the
4675		 * last block of the extent just the way it is.
4676		 */
4677		btrfs_drop_extent_cache(inode, ALIGN(new_size,
4678					fs_info->sectorsize),
4679					(u64)-1, 0);
4680	}
4681
4682	/*
4683	 * This function is also used to drop the items in the log tree before
4684	 * we relog the inode, so if root != BTRFS_I(inode)->root, it means
4685	 * it is used to drop the logged items. So we shouldn't kill the delayed
4686	 * items.
4687	 */
4688	if (min_type == 0 && root == inode->root)
4689		btrfs_kill_delayed_inode_items(inode);
4690
4691	key.objectid = ino;
4692	key.offset = (u64)-1;
4693	key.type = (u8)-1;
4694
4695search_again:
4696	/*
4697	 * with a 16K leaf size and 128MB extents, you can actually queue
4698	 * up a huge file in a single leaf.  Most of the time that
4699	 * bytes_deleted is > 0, it will be huge by the time we get here
4700	 */
4701	if (be_nice && bytes_deleted > SZ_32M &&
4702	    btrfs_should_end_transaction(trans)) {
4703		ret = -EAGAIN;
4704		goto out;
4705	}
4706
4707	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
4708	if (ret < 0)
4709		goto out;
4710
4711	if (ret > 0) {
4712		ret = 0;
4713		/* there are no items in the tree for us to truncate, we're
4714		 * done
4715		 */
4716		if (path->slots[0] == 0)
4717			goto out;
4718		path->slots[0]--;
4719	}
4720
4721	while (1) {
4722		u64 clear_start = 0, clear_len = 0;
4723
4724		fi = NULL;
4725		leaf = path->nodes[0];
4726		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4727		found_type = found_key.type;
4728
4729		if (found_key.objectid != ino)
4730			break;
4731
4732		if (found_type < min_type)
4733			break;
4734
4735		item_end = found_key.offset;
4736		if (found_type == BTRFS_EXTENT_DATA_KEY) {
4737			fi = btrfs_item_ptr(leaf, path->slots[0],
4738					    struct btrfs_file_extent_item);
4739			extent_type = btrfs_file_extent_type(leaf, fi);
4740			if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
4741				item_end +=
4742				    btrfs_file_extent_num_bytes(leaf, fi);
4743
4744				trace_btrfs_truncate_show_fi_regular(
4745					inode, leaf, fi, found_key.offset);
4746			} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
4747				item_end += btrfs_file_extent_ram_bytes(leaf,
4748									fi);
4749
4750				trace_btrfs_truncate_show_fi_inline(
4751					inode, leaf, fi, path->slots[0],
4752					found_key.offset);
4753			}
4754			item_end--;
4755		}
4756		if (found_type > min_type) {
4757			del_item = 1;
4758		} else {
4759			if (item_end < new_size)
4760				break;
4761			if (found_key.offset >= new_size)
4762				del_item = 1;
4763			else
4764				del_item = 0;
4765		}
4766		found_extent = 0;
4767		/* FIXME, shrink the extent if the ref count is only 1 */
4768		if (found_type != BTRFS_EXTENT_DATA_KEY)
4769			goto delete;
4770
4771		if (extents_found != NULL)
4772			(*extents_found)++;
4773
4774		if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
4775			u64 num_dec;
4776
4777			clear_start = found_key.offset;
4778			extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
4779			if (!del_item) {
4780				u64 orig_num_bytes =
4781					btrfs_file_extent_num_bytes(leaf, fi);
4782				extent_num_bytes = ALIGN(new_size -
4783						found_key.offset,
4784						fs_info->sectorsize);
4785				clear_start = ALIGN(new_size, fs_info->sectorsize);
4786				btrfs_set_file_extent_num_bytes(leaf, fi,
4787							 extent_num_bytes);
4788				num_dec = (orig_num_bytes -
4789					   extent_num_bytes);
4790				if (test_bit(BTRFS_ROOT_SHAREABLE,
4791					     &root->state) &&
4792				    extent_start != 0)
4793					inode_sub_bytes(&inode->vfs_inode,
4794							num_dec);
4795				btrfs_mark_buffer_dirty(leaf);
4796			} else {
4797				extent_num_bytes =
4798					btrfs_file_extent_disk_num_bytes(leaf,
4799									 fi);
4800				extent_offset = found_key.offset -
4801					btrfs_file_extent_offset(leaf, fi);
4802
4803				/* FIXME blocksize != 4096 */
4804				num_dec = btrfs_file_extent_num_bytes(leaf, fi);
4805				if (extent_start != 0) {
4806					found_extent = 1;
4807					if (test_bit(BTRFS_ROOT_SHAREABLE,
4808						     &root->state))
4809						inode_sub_bytes(&inode->vfs_inode,
4810								num_dec);
4811				}
4812			}
4813			clear_len = num_dec;
4814		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
4815			/*
4816			 * we can't truncate inline items that have had
4817			 * special encodings
4818			 */
4819			if (!del_item &&
4820			    btrfs_file_extent_encryption(leaf, fi) == 0 &&
4821			    btrfs_file_extent_other_encoding(leaf, fi) == 0 &&
4822			    btrfs_file_extent_compression(leaf, fi) == 0) {
4823				u32 size = (u32)(new_size - found_key.offset);
4824
4825				btrfs_set_file_extent_ram_bytes(leaf, fi, size);
4826				size = btrfs_file_extent_calc_inline_size(size);
4827				btrfs_truncate_item(path, size, 1);
4828			} else if (!del_item) {
4829				/*
4830				 * We have to bail so the last_size is set to
4831				 * just before this extent.
4832				 */
4833				ret = NEED_TRUNCATE_BLOCK;
4834				break;
4835			} else {
4836				/*
4837				 * Inline extents are special, we just treat
4838				 * them as a full sector worth in the file
4839				 * extent tree just for simplicity sake.
4840				 */
4841				clear_len = fs_info->sectorsize;
4842			}
4843
4844			if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
4845				inode_sub_bytes(&inode->vfs_inode,
4846						item_end + 1 - new_size);
4847		}
4848delete:
4849		/*
4850		 * We use btrfs_truncate_inode_items() to clean up log trees for
4851		 * multiple fsyncs, and in this case we don't want to clear the
4852		 * file extent range because it's just the log.
4853		 */
4854		if (root == inode->root) {
4855			ret = btrfs_inode_clear_file_extent_range(inode,
4856						  clear_start, clear_len);
4857			if (ret) {
4858				btrfs_abort_transaction(trans, ret);
4859				break;
4860			}
4861		}
4862
4863		if (del_item)
4864			last_size = found_key.offset;
4865		else
4866			last_size = new_size;
4867		if (del_item) {
4868			if (!pending_del_nr) {
4869				/* no pending yet, add ourselves */
4870				pending_del_slot = path->slots[0];
4871				pending_del_nr = 1;
4872			} else if (pending_del_nr &&
4873				   path->slots[0] + 1 == pending_del_slot) {
4874				/* hop on the pending chunk */
4875				pending_del_nr++;
4876				pending_del_slot = path->slots[0];
4877			} else {
4878				BUG();
4879			}
4880		} else {
4881			break;
4882		}
4883		should_throttle = false;
4884
4885		if (found_extent &&
4886		    root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4887			struct btrfs_ref ref = { 0 };
4888
4889			bytes_deleted += extent_num_bytes;
4890
4891			btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF,
4892					extent_start, extent_num_bytes, 0);
4893			ref.real_root = root->root_key.objectid;
4894			btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
4895					ino, extent_offset);
4896			ret = btrfs_free_extent(trans, &ref);
4897			if (ret) {
4898				btrfs_abort_transaction(trans, ret);
4899				break;
4900			}
4901			if (be_nice) {
4902				if (btrfs_should_throttle_delayed_refs(trans))
4903					should_throttle = true;
4904			}
4905		}
4906
4907		if (found_type == BTRFS_INODE_ITEM_KEY)
4908			break;
4909
4910		if (path->slots[0] == 0 ||
4911		    path->slots[0] != pending_del_slot ||
4912		    should_throttle) {
4913			if (pending_del_nr) {
4914				ret = btrfs_del_items(trans, root, path,
4915						pending_del_slot,
4916						pending_del_nr);
4917				if (ret) {
4918					btrfs_abort_transaction(trans, ret);
4919					break;
4920				}
4921				pending_del_nr = 0;
4922			}
4923			btrfs_release_path(path);
4924
4925			/*
4926			 * We can generate a lot of delayed refs, so we need to
4927			 * throttle every once and a while and make sure we're
4928			 * adding enough space to keep up with the work we are
4929			 * generating.  Since we hold a transaction here we
4930			 * can't flush, and we don't want to FLUSH_LIMIT because
4931			 * we could have generated too many delayed refs to
4932			 * actually allocate, so just bail if we're short and
4933			 * let the normal reservation dance happen higher up.
4934			 */
4935			if (should_throttle) {
4936				ret = btrfs_delayed_refs_rsv_refill(fs_info,
4937							BTRFS_RESERVE_NO_FLUSH);
4938				if (ret) {
4939					ret = -EAGAIN;
4940					break;
4941				}
4942			}
4943			goto search_again;
4944		} else {
4945			path->slots[0]--;
4946		}
4947	}
4948out:
4949	if (ret >= 0 && pending_del_nr) {
4950		int err;
4951
4952		err = btrfs_del_items(trans, root, path, pending_del_slot,
4953				      pending_del_nr);
4954		if (err) {
4955			btrfs_abort_transaction(trans, err);
4956			ret = err;
4957		}
4958	}
4959	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4960		ASSERT(last_size >= new_size);
4961		if (!ret && last_size > new_size)
4962			last_size = new_size;
4963		btrfs_inode_safe_disk_i_size_write(inode, last_size);
4964		unlock_extent_cached(&inode->io_tree, lock_start, (u64)-1,
4965				     &cached_state);
4966	}
4967
4968	btrfs_free_path(path);
4969	return ret;
4970}
4971
4972/*
4973 * btrfs_truncate_block - read, zero a chunk and write a block
4974 * @inode - inode that we're zeroing
4975 * @from - the offset to start zeroing
4976 * @len - the length to zero, 0 to zero the entire range respective to the
4977 *	offset
4978 * @front - zero up to the offset instead of from the offset on
4979 *
4980 * This will find the block for the "from" offset and cow the block and zero the
4981 * part we want to zero.  This is used with truncate and hole punching.
4982 */
4983int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
4984			 int front)
4985{
4986	struct btrfs_fs_info *fs_info = inode->root->fs_info;
4987	struct address_space *mapping = inode->vfs_inode.i_mapping;
4988	struct extent_io_tree *io_tree = &inode->io_tree;
4989	struct btrfs_ordered_extent *ordered;
4990	struct extent_state *cached_state = NULL;
4991	struct extent_changeset *data_reserved = NULL;
4992	bool only_release_metadata = false;
4993	u32 blocksize = fs_info->sectorsize;
4994	pgoff_t index = from >> PAGE_SHIFT;
4995	unsigned offset = from & (blocksize - 1);
4996	struct page *page;
4997	gfp_t mask = btrfs_alloc_write_mask(mapping);
4998	size_t write_bytes = blocksize;
4999	int ret = 0;
5000	u64 block_start;
5001	u64 block_end;
5002
5003	if (IS_ALIGNED(offset, blocksize) &&
5004	    (!len || IS_ALIGNED(len, blocksize)))
5005		goto out;
5006
5007	block_start = round_down(from, blocksize);
5008	block_end = block_start + blocksize - 1;
5009
5010	ret = btrfs_check_data_free_space(inode, &data_reserved, block_start,
5011					  blocksize);
5012	if (ret < 0) {
5013		if (btrfs_check_nocow_lock(inode, block_start, &write_bytes) > 0) {
5014			/* For nocow case, no need to reserve data space */
5015			only_release_metadata = true;
5016		} else {
5017			goto out;
5018		}
5019	}
5020	ret = btrfs_delalloc_reserve_metadata(inode, blocksize);
5021	if (ret < 0) {
5022		if (!only_release_metadata)
5023			btrfs_free_reserved_data_space(inode, data_reserved,
5024						       block_start, blocksize);
5025		goto out;
5026	}
5027again:
5028	page = find_or_create_page(mapping, index, mask);
5029	if (!page) {
5030		btrfs_delalloc_release_space(inode, data_reserved, block_start,
5031					     blocksize, true);
5032		btrfs_delalloc_release_extents(inode, blocksize);
5033		ret = -ENOMEM;
5034		goto out;
5035	}
5036	ret = set_page_extent_mapped(page);
5037	if (ret < 0)
5038		goto out_unlock;
5039
5040	if (!PageUptodate(page)) {
5041		ret = btrfs_readpage(NULL, page);
5042		lock_page(page);
5043		if (page->mapping != mapping) {
5044			unlock_page(page);
5045			put_page(page);
5046			goto again;
5047		}
5048		if (!PageUptodate(page)) {
5049			ret = -EIO;
5050			goto out_unlock;
5051		}
5052	}
5053	wait_on_page_writeback(page);
5054
5055	lock_extent_bits(io_tree, block_start, block_end, &cached_state);
5056
5057	ordered = btrfs_lookup_ordered_extent(inode, block_start);
5058	if (ordered) {
5059		unlock_extent_cached(io_tree, block_start, block_end,
5060				     &cached_state);
5061		unlock_page(page);
5062		put_page(page);
5063		btrfs_start_ordered_extent(ordered, 1);
5064		btrfs_put_ordered_extent(ordered);
5065		goto again;
5066	}
5067
5068	clear_extent_bit(&inode->io_tree, block_start, block_end,
5069			 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
5070			 0, 0, &cached_state);
5071
5072	ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0,
5073					&cached_state);
5074	if (ret) {
5075		unlock_extent_cached(io_tree, block_start, block_end,
5076				     &cached_state);
5077		goto out_unlock;
5078	}
5079
5080	if (offset != blocksize) {
5081		if (!len)
5082			len = blocksize - offset;
5083		if (front)
5084			memzero_page(page, (block_start - page_offset(page)),
5085				     offset);
5086		else
5087			memzero_page(page, (block_start - page_offset(page)) + offset,
5088				     len);
5089		flush_dcache_page(page);
5090	}
5091	ClearPageChecked(page);
5092	btrfs_page_set_dirty(fs_info, page, block_start, block_end + 1 - block_start);
5093	unlock_extent_cached(io_tree, block_start, block_end, &cached_state);
5094
5095	if (only_release_metadata)
5096		set_extent_bit(&inode->io_tree, block_start, block_end,
5097			       EXTENT_NORESERVE, 0, NULL, NULL, GFP_NOFS, NULL);
5098
5099out_unlock:
5100	if (ret) {
5101		if (only_release_metadata)
5102			btrfs_delalloc_release_metadata(inode, blocksize, true);
5103		else
5104			btrfs_delalloc_release_space(inode, data_reserved,
5105					block_start, blocksize, true);
5106	}
5107	btrfs_delalloc_release_extents(inode, blocksize);
5108	unlock_page(page);
5109	put_page(page);
5110out:
5111	if (only_release_metadata)
5112		btrfs_check_nocow_unlock(inode);
5113	extent_changeset_free(data_reserved);
5114	return ret;
5115}
5116
5117static int maybe_insert_hole(struct btrfs_root *root, struct btrfs_inode *inode,
5118			     u64 offset, u64 len)
5119{
5120	struct btrfs_fs_info *fs_info = root->fs_info;
5121	struct btrfs_trans_handle *trans;
5122	struct btrfs_drop_extents_args drop_args = { 0 };
5123	int ret;
5124
5125	/*
5126	 * If NO_HOLES is enabled, we don't need to do anything.
5127	 * Later, up in the call chain, either btrfs_set_inode_last_sub_trans()
5128	 * or btrfs_update_inode() will be called, which guarantee that the next
5129	 * fsync will know this inode was changed and needs to be logged.
5130	 */
5131	if (btrfs_fs_incompat(fs_info, NO_HOLES))
5132		return 0;
5133
5134	/*
5135	 * 1 - for the one we're dropping
5136	 * 1 - for the one we're adding
5137	 * 1 - for updating the inode.
5138	 */
5139	trans = btrfs_start_transaction(root, 3);
5140	if (IS_ERR(trans))
5141		return PTR_ERR(trans);
5142
5143	drop_args.start = offset;
5144	drop_args.end = offset + len;
5145	drop_args.drop_cache = true;
5146
5147	ret = btrfs_drop_extents(trans, root, inode, &drop_args);
5148	if (ret) {
5149		btrfs_abort_transaction(trans, ret);
5150		btrfs_end_transaction(trans);
5151		return ret;
5152	}
5153
5154	ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode),
5155			offset, 0, 0, len, 0, len, 0, 0, 0);
5156	if (ret) {
5157		btrfs_abort_transaction(trans, ret);
5158	} else {
5159		btrfs_update_inode_bytes(inode, 0, drop_args.bytes_found);
5160		btrfs_update_inode(trans, root, inode);
5161	}
5162	btrfs_end_transaction(trans);
5163	return ret;
5164}
5165
5166/*
5167 * This function puts in dummy file extents for the area we're creating a hole
5168 * for.  So if we are truncating this file to a larger size we need to insert
5169 * these file extents so that btrfs_get_extent will return a EXTENT_MAP_HOLE for
5170 * the range between oldsize and size
5171 */
5172int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
5173{
5174	struct btrfs_root *root = inode->root;
5175	struct btrfs_fs_info *fs_info = root->fs_info;
5176	struct extent_io_tree *io_tree = &inode->io_tree;
5177	struct extent_map *em = NULL;
5178	struct extent_state *cached_state = NULL;
5179	struct extent_map_tree *em_tree = &inode->extent_tree;
5180	u64 hole_start = ALIGN(oldsize, fs_info->sectorsize);
5181	u64 block_end = ALIGN(size, fs_info->sectorsize);
5182	u64 last_byte;
5183	u64 cur_offset;
5184	u64 hole_size;
5185	int err = 0;
5186
5187	/*
5188	 * If our size started in the middle of a block we need to zero out the
5189	 * rest of the block before we expand the i_size, otherwise we could
5190	 * expose stale data.
5191	 */
5192	err = btrfs_truncate_block(inode, oldsize, 0, 0);
5193	if (err)
5194		return err;
5195
5196	if (size <= hole_start)
5197		return 0;
5198
5199	btrfs_lock_and_flush_ordered_range(inode, hole_start, block_end - 1,
5200					   &cached_state);
5201	cur_offset = hole_start;
5202	while (1) {
5203		em = btrfs_get_extent(inode, NULL, 0, cur_offset,
5204				      block_end - cur_offset);
5205		if (IS_ERR(em)) {
5206			err = PTR_ERR(em);
5207			em = NULL;
5208			break;
5209		}
5210		last_byte = min(extent_map_end(em), block_end);
5211		last_byte = ALIGN(last_byte, fs_info->sectorsize);
5212		hole_size = last_byte - cur_offset;
5213
5214		if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
5215			struct extent_map *hole_em;
5216
5217			err = maybe_insert_hole(root, inode, cur_offset,
5218						hole_size);
5219			if (err)
5220				break;
5221
5222			err = btrfs_inode_set_file_extent_range(inode,
5223							cur_offset, hole_size);
5224			if (err)
5225				break;
5226
5227			btrfs_drop_extent_cache(inode, cur_offset,
5228						cur_offset + hole_size - 1, 0);
5229			hole_em = alloc_extent_map();
5230			if (!hole_em) {
5231				set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
5232					&inode->runtime_flags);
5233				goto next;
5234			}
5235			hole_em->start = cur_offset;
5236			hole_em->len = hole_size;
5237			hole_em->orig_start = cur_offset;
5238
5239			hole_em->block_start = EXTENT_MAP_HOLE;
5240			hole_em->block_len = 0;
5241			hole_em->orig_block_len = 0;
5242			hole_em->ram_bytes = hole_size;
5243			hole_em->compress_type = BTRFS_COMPRESS_NONE;
5244			hole_em->generation = fs_info->generation;
5245
5246			while (1) {
5247				write_lock(&em_tree->lock);
5248				err = add_extent_mapping(em_tree, hole_em, 1);
5249				write_unlock(&em_tree->lock);
5250				if (err != -EEXIST)
5251					break;
5252				btrfs_drop_extent_cache(inode, cur_offset,
5253							cur_offset +
5254							hole_size - 1, 0);
5255			}
5256			free_extent_map(hole_em);
5257		} else {
5258			err = btrfs_inode_set_file_extent_range(inode,
5259							cur_offset, hole_size);
5260			if (err)
5261				break;
5262		}
5263next:
5264		free_extent_map(em);
5265		em = NULL;
5266		cur_offset = last_byte;
5267		if (cur_offset >= block_end)
5268			break;
5269	}
5270	free_extent_map(em);
5271	unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state);
5272	return err;
5273}
5274
5275static int btrfs_setsize(struct inode *inode, struct iattr *attr)
5276{
5277	struct btrfs_root *root = BTRFS_I(inode)->root;
5278	struct btrfs_trans_handle *trans;
5279	loff_t oldsize = i_size_read(inode);
5280	loff_t newsize = attr->ia_size;
5281	int mask = attr->ia_valid;
5282	int ret;
5283
5284	/*
5285	 * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
5286	 * special case where we need to update the times despite not having
5287	 * these flags set.  For all other operations the VFS set these flags
5288	 * explicitly if it wants a timestamp update.
5289	 */
5290	if (newsize != oldsize) {
5291		inode_inc_iversion(inode);
5292		if (!(mask & (ATTR_CTIME | ATTR_MTIME)))
5293			inode->i_ctime = inode->i_mtime =
5294				current_time(inode);
5295	}
5296
5297	if (newsize > oldsize) {
5298		/*
5299		 * Don't do an expanding truncate while snapshotting is ongoing.
5300		 * This is to ensure the snapshot captures a fully consistent
5301		 * state of this file - if the snapshot captures this expanding
5302		 * truncation, it must capture all writes that happened before
5303		 * this truncation.
5304		 */
5305		btrfs_drew_write_lock(&root->snapshot_lock);
5306		ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, newsize);
5307		if (ret) {
5308			btrfs_drew_write_unlock(&root->snapshot_lock);
5309			return ret;
5310		}
5311
5312		trans = btrfs_start_transaction(root, 1);
5313		if (IS_ERR(trans)) {
5314			btrfs_drew_write_unlock(&root->snapshot_lock);
5315			return PTR_ERR(trans);
5316		}
5317
5318		i_size_write(inode, newsize);
5319		btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
5320		pagecache_isize_extended(inode, oldsize, newsize);
5321		ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
5322		btrfs_drew_write_unlock(&root->snapshot_lock);
5323		btrfs_end_transaction(trans);
5324	} else {
5325		struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
5326
5327		if (btrfs_is_zoned(fs_info)) {
5328			ret = btrfs_wait_ordered_range(inode,
5329					ALIGN(newsize, fs_info->sectorsize),
5330					(u64)-1);
5331			if (ret)
5332				return ret;
5333		}
5334
5335		/*
5336		 * We're truncating a file that used to have good data down to
5337		 * zero. Make sure any new writes to the file get on disk
5338		 * on close.
5339		 */
5340		if (newsize == 0)
5341			set_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
5342				&BTRFS_I(inode)->runtime_flags);
5343
5344		truncate_setsize(inode, newsize);
5345
5346		inode_dio_wait(inode);
5347
5348		ret = btrfs_truncate(inode, newsize == oldsize);
5349		if (ret && inode->i_nlink) {
5350			int err;
5351
5352			/*
5353			 * Truncate failed, so fix up the in-memory size. We
5354			 * adjusted disk_i_size down as we removed extents, so
5355			 * wait for disk_i_size to be stable and then update the
5356			 * in-memory size to match.
5357			 */
5358			err = btrfs_wait_ordered_range(inode, 0, (u64)-1);
5359			if (err)
5360				return err;
5361			i_size_write(inode, BTRFS_I(inode)->disk_i_size);
5362		}
5363	}
5364
5365	return ret;
5366}
5367
5368static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
5369			 struct iattr *attr)
5370{
5371	struct inode *inode = d_inode(dentry);
5372	struct btrfs_root *root = BTRFS_I(inode)->root;
5373	int err;
5374
5375	if (btrfs_root_readonly(root))
5376		return -EROFS;
5377
5378	err = setattr_prepare(&init_user_ns, dentry, attr);
5379	if (err)
5380		return err;
5381
5382	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
5383		err = btrfs_setsize(inode, attr);
5384		if (err)
5385			return err;
5386	}
5387
5388	if (attr->ia_valid) {
5389		setattr_copy(&init_user_ns, inode, attr);
5390		inode_inc_iversion(inode);
5391		err = btrfs_dirty_inode(inode);
5392
5393		if (!err && attr->ia_valid & ATTR_MODE)
5394			err = posix_acl_chmod(&init_user_ns, inode,
5395					      inode->i_mode);
5396	}
5397
5398	return err;
5399}
5400
5401/*
5402 * While truncating the inode pages during eviction, we get the VFS calling
5403 * btrfs_invalidatepage() against each page of the inode. This is slow because
5404 * the calls to btrfs_invalidatepage() result in a huge amount of calls to
5405 * lock_extent_bits() and clear_extent_bit(), which keep merging and splitting
5406 * extent_state structures over and over, wasting lots of time.
5407 *
5408 * Therefore if the inode is being evicted, let btrfs_invalidatepage() skip all
5409 * those expensive operations on a per page basis and do only the ordered io
5410 * finishing, while we release here the extent_map and extent_state structures,
5411 * without the excessive merging and splitting.
5412 */
5413static void evict_inode_truncate_pages(struct inode *inode)
5414{
5415	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
5416	struct extent_map_tree *map_tree = &BTRFS_I(inode)->extent_tree;
5417	struct rb_node *node;
5418
5419	ASSERT(inode->i_state & I_FREEING);
5420	truncate_inode_pages_final(&inode->i_data);
5421
5422	write_lock(&map_tree->lock);
5423	while (!RB_EMPTY_ROOT(&map_tree->map.rb_root)) {
5424		struct extent_map *em;
5425
5426		node = rb_first_cached(&map_tree->map);
5427		em = rb_entry(node, struct extent_map, rb_node);
5428		clear_bit(EXTENT_FLAG_PINNED, &em->flags);
5429		clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
5430		remove_extent_mapping(map_tree, em);
5431		free_extent_map(em);
5432		if (need_resched()) {
5433			write_unlock(&map_tree->lock);
5434			cond_resched();
5435			write_lock(&map_tree->lock);
5436		}
5437	}
5438	write_unlock(&map_tree->lock);
5439
5440	/*
5441	 * Keep looping until we have no more ranges in the io tree.
5442	 * We can have ongoing bios started by readahead that have
5443	 * their endio callback (extent_io.c:end_bio_extent_readpage)
5444	 * still in progress (unlocked the pages in the bio but did not yet
5445	 * unlocked the ranges in the io tree). Therefore this means some
5446	 * ranges can still be locked and eviction started because before
5447	 * submitting those bios, which are executed by a separate task (work
5448	 * queue kthread), inode references (inode->i_count) were not taken
5449	 * (which would be dropped in the end io callback of each bio).
5450	 * Therefore here we effectively end up waiting for those bios and
5451	 * anyone else holding locked ranges without having bumped the inode's
5452	 * reference count - if we don't do it, when they access the inode's
5453	 * io_tree to unlock a range it may be too late, leading to an
5454	 * use-after-free issue.
5455	 */
5456	spin_lock(&io_tree->lock);
5457	while (!RB_EMPTY_ROOT(&io_tree->state)) {
5458		struct extent_state *state;
5459		struct extent_state *cached_state = NULL;
5460		u64 start;
5461		u64 end;
5462		unsigned state_flags;
5463
5464		node = rb_first(&io_tree->state);
5465		state = rb_entry(node, struct extent_state, rb_node);
5466		start = state->start;
5467		end = state->end;
5468		state_flags = state->state;
5469		spin_unlock(&io_tree->lock);
5470
5471		lock_extent_bits(io_tree, start, end, &cached_state);
5472
5473		/*
5474		 * If still has DELALLOC flag, the extent didn't reach disk,
5475		 * and its reserved space won't be freed by delayed_ref.
5476		 * So we need to free its reserved space here.
5477		 * (Refer to comment in btrfs_invalidatepage, case 2)
5478		 *
5479		 * Note, end is the bytenr of last byte, so we need + 1 here.
5480		 */
5481		if (state_flags & EXTENT_DELALLOC)
5482			btrfs_qgroup_free_data(BTRFS_I(inode), NULL, start,
5483					       end - start + 1);
5484
5485		clear_extent_bit(io_tree, start, end,
5486				 EXTENT_LOCKED | EXTENT_DELALLOC |
5487				 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1,
5488				 &cached_state);
5489
5490		cond_resched();
5491		spin_lock(&io_tree->lock);
5492	}
5493	spin_unlock(&io_tree->lock);
5494}
5495
5496static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
5497							struct btrfs_block_rsv *rsv)
5498{
5499	struct btrfs_fs_info *fs_info = root->fs_info;
5500	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5501	struct btrfs_trans_handle *trans;
5502	u64 delayed_refs_extra = btrfs_calc_insert_metadata_size(fs_info, 1);
5503	int ret;
5504
5505	/*
5506	 * Eviction should be taking place at some place safe because of our
5507	 * delayed iputs.  However the normal flushing code will run delayed
5508	 * iputs, so we cannot use FLUSH_ALL otherwise we'll deadlock.
5509	 *
5510	 * We reserve the delayed_refs_extra here again because we can't use
5511	 * btrfs_start_transaction(root, 0) for the same deadlocky reason as
5512	 * above.  We reserve our extra bit here because we generate a ton of
5513	 * delayed refs activity by truncating.
5514	 *
5515	 * If we cannot make our reservation we'll attempt to steal from the
5516	 * global reserve, because we really want to be able to free up space.
5517	 */
5518	ret = btrfs_block_rsv_refill(root, rsv, rsv->size + delayed_refs_extra,
5519				     BTRFS_RESERVE_FLUSH_EVICT);
5520	if (ret) {
5521		/*
5522		 * Try to steal from the global reserve if there is space for
5523		 * it.
5524		 */
5525		if (btrfs_check_space_for_delayed_refs(fs_info) ||
5526		    btrfs_block_rsv_migrate(global_rsv, rsv, rsv->size, 0)) {
5527			btrfs_warn(fs_info,
5528				   "could not allocate space for delete; will truncate on mount");
5529			return ERR_PTR(-ENOSPC);
5530		}
5531		delayed_refs_extra = 0;
5532	}
5533
5534	trans = btrfs_join_transaction(root);
5535	if (IS_ERR(trans))
5536		return trans;
5537
5538	if (delayed_refs_extra) {
5539		trans->block_rsv = &fs_info->trans_block_rsv;
5540		trans->bytes_reserved = delayed_refs_extra;
5541		btrfs_block_rsv_migrate(rsv, trans->block_rsv,
5542					delayed_refs_extra, 1);
5543	}
5544	return trans;
5545}
5546
5547void btrfs_evict_inode(struct inode *inode)
5548{
5549	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
5550	struct btrfs_trans_handle *trans;
5551	struct btrfs_root *root = BTRFS_I(inode)->root;
5552	struct btrfs_block_rsv *rsv;
5553	int ret;
5554
5555	trace_btrfs_inode_evict(inode);
5556
5557	if (!root) {
5558		clear_inode(inode);
5559		return;
5560	}
5561
5562	evict_inode_truncate_pages(inode);
5563
5564	if (inode->i_nlink &&
5565	    ((btrfs_root_refs(&root->root_item) != 0 &&
5566	      root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) ||
5567	     btrfs_is_free_space_inode(BTRFS_I(inode))))
5568		goto no_delete;
5569
5570	if (is_bad_inode(inode))
5571		goto no_delete;
5572
5573	btrfs_free_io_failure_record(BTRFS_I(inode), 0, (u64)-1);
5574
5575	if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
5576		goto no_delete;
5577
5578	if (inode->i_nlink > 0) {
5579		BUG_ON(btrfs_root_refs(&root->root_item) != 0 &&
5580		       root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID);
5581		goto no_delete;
5582	}
5583
5584	ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode));
5585	if (ret)
5586		goto no_delete;
5587
5588	rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
5589	if (!rsv)
5590		goto no_delete;
5591	rsv->size = btrfs_calc_metadata_size(fs_info, 1);
5592	rsv->failfast = 1;
5593
5594	btrfs_i_size_write(BTRFS_I(inode), 0);
5595
5596	while (1) {
5597		trans = evict_refill_and_join(root, rsv);
5598		if (IS_ERR(trans))
5599			goto free_rsv;
5600
5601		trans->block_rsv = rsv;
5602
5603		ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode),
5604						 0, 0, NULL);
5605		trans->block_rsv = &fs_info->trans_block_rsv;
5606		btrfs_end_transaction(trans);
5607		btrfs_btree_balance_dirty(fs_info);
5608		if (ret && ret != -ENOSPC && ret != -EAGAIN)
5609			goto free_rsv;
5610		else if (!ret)
5611			break;
5612	}
5613
5614	/*
5615	 * Errors here aren't a big deal, it just means we leave orphan items in
5616	 * the tree. They will be cleaned up on the next mount. If the inode
5617	 * number gets reused, cleanup deletes the orphan item without doing
5618	 * anything, and unlink reuses the existing orphan item.
5619	 *
5620	 * If it turns out that we are dropping too many of these, we might want
5621	 * to add a mechanism for retrying these after a commit.
5622	 */
5623	trans = evict_refill_and_join(root, rsv);
5624	if (!IS_ERR(trans)) {
5625		trans->block_rsv = rsv;
5626		btrfs_orphan_del(trans, BTRFS_I(inode));
5627		trans->block_rsv = &fs_info->trans_block_rsv;
5628		btrfs_end_transaction(trans);
5629	}
5630
5631free_rsv:
5632	btrfs_free_block_rsv(fs_info, rsv);
5633no_delete:
5634	/*
5635	 * If we didn't successfully delete, the orphan item will still be in
5636	 * the tree and we'll retry on the next mount. Again, we might also want
5637	 * to retry these periodically in the future.
5638	 */
5639	btrfs_remove_delayed_node(BTRFS_I(inode));
5640	clear_inode(inode);
5641}
5642
5643/*
5644 * Return the key found in the dir entry in the location pointer, fill @type
5645 * with BTRFS_FT_*, and return 0.
5646 *
5647 * If no dir entries were found, returns -ENOENT.
5648 * If found a corrupted location in dir entry, returns -EUCLEAN.
5649 */
5650static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
5651			       struct btrfs_key *location, u8 *type)
5652{
5653	const char *name = dentry->d_name.name;
5654	int namelen = dentry->d_name.len;
5655	struct btrfs_dir_item *di;
5656	struct btrfs_path *path;
5657	struct btrfs_root *root = BTRFS_I(dir)->root;
5658	int ret = 0;
5659
5660	path = btrfs_alloc_path();
5661	if (!path)
5662		return -ENOMEM;
5663
5664	di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(BTRFS_I(dir)),
5665			name, namelen, 0);
5666	if (IS_ERR_OR_NULL(di)) {
5667		ret = di ? PTR_ERR(di) : -ENOENT;
5668		goto out;
5669	}
5670
5671	btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
5672	if (location->type != BTRFS_INODE_ITEM_KEY &&
5673	    location->type != BTRFS_ROOT_ITEM_KEY) {
5674		ret = -EUCLEAN;
5675		btrfs_warn(root->fs_info,
5676"%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))",
5677			   __func__, name, btrfs_ino(BTRFS_I(dir)),
5678			   location->objectid, location->type, location->offset);
5679	}
5680	if (!ret)
5681		*type = btrfs_dir_type(path->nodes[0], di);
5682out:
5683	btrfs_free_path(path);
5684	return ret;
5685}
5686
5687/*
5688 * when we hit a tree root in a directory, the btrfs part of the inode
5689 * needs to be changed to reflect the root directory of the tree root.  This
5690 * is kind of like crossing a mount point.
5691 */
5692static int fixup_tree_root_location(struct btrfs_fs_info *fs_info,
5693				    struct inode *dir,
5694				    struct dentry *dentry,
5695				    struct btrfs_key *location,
5696				    struct btrfs_root **sub_root)
5697{
5698	struct btrfs_path *path;
5699	struct btrfs_root *new_root;
5700	struct btrfs_root_ref *ref;
5701	struct extent_buffer *leaf;
5702	struct btrfs_key key;
5703	int ret;
5704	int err = 0;
5705
5706	path = btrfs_alloc_path();
5707	if (!path) {
5708		err = -ENOMEM;
5709		goto out;
5710	}
5711
5712	err = -ENOENT;
5713	key.objectid = BTRFS_I(dir)->root->root_key.objectid;
5714	key.type = BTRFS_ROOT_REF_KEY;
5715	key.offset = location->objectid;
5716
5717	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
5718	if (ret) {
5719		if (ret < 0)
5720			err = ret;
5721		goto out;
5722	}
5723
5724	leaf = path->nodes[0];
5725	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
5726	if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(BTRFS_I(dir)) ||
5727	    btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len)
5728		goto out;
5729
5730	ret = memcmp_extent_buffer(leaf, dentry->d_name.name,
5731				   (unsigned long)(ref + 1),
5732				   dentry->d_name.len);
5733	if (ret)
5734		goto out;
5735
5736	btrfs_release_path(path);
5737
5738	new_root = btrfs_get_fs_root(fs_info, location->objectid, true);
5739	if (IS_ERR(new_root)) {
5740		err = PTR_ERR(new_root);
5741		goto out;
5742	}
5743
5744	*sub_root = new_root;
5745	location->objectid = btrfs_root_dirid(&new_root->root_item);
5746	location->type = BTRFS_INODE_ITEM_KEY;
5747	location->offset = 0;
5748	err = 0;
5749out:
5750	btrfs_free_path(path);
5751	return err;
5752}
5753
5754static void inode_tree_add(struct inode *inode)
5755{
5756	struct btrfs_root *root = BTRFS_I(inode)->root;
5757	struct btrfs_inode *entry;
5758	struct rb_node **p;
5759	struct rb_node *parent;
5760	struct rb_node *new = &BTRFS_I(inode)->rb_node;
5761	u64 ino = btrfs_ino(BTRFS_I(inode));
5762
5763	if (inode_unhashed(inode))
5764		return;
5765	parent = NULL;
5766	spin_lock(&root->inode_lock);
5767	p = &root->inode_tree.rb_node;
5768	while (*p) {
5769		parent = *p;
5770		entry = rb_entry(parent, struct btrfs_inode, rb_node);
5771
5772		if (ino < btrfs_ino(entry))
5773			p = &parent->rb_left;
5774		else if (ino > btrfs_ino(entry))
5775			p = &parent->rb_right;
5776		else {
5777			WARN_ON(!(entry->vfs_inode.i_state &
5778				  (I_WILL_FREE | I_FREEING)));
5779			rb_replace_node(parent, new, &root->inode_tree);
5780			RB_CLEAR_NODE(parent);
5781			spin_unlock(&root->inode_lock);
5782			return;
5783		}
5784	}
5785	rb_link_node(new, parent, p);
5786	rb_insert_color(new, &root->inode_tree);
5787	spin_unlock(&root->inode_lock);
5788}
5789
5790static void inode_tree_del(struct btrfs_inode *inode)
5791{
5792	struct btrfs_root *root = inode->root;
5793	int empty = 0;
5794
5795	spin_lock(&root->inode_lock);
5796	if (!RB_EMPTY_NODE(&inode->rb_node)) {
5797		rb_erase(&inode->rb_node, &root->inode_tree);
5798		RB_CLEAR_NODE(&inode->rb_node);
5799		empty = RB_EMPTY_ROOT(&root->inode_tree);
5800	}
5801	spin_unlock(&root->inode_lock);
5802
5803	if (empty && btrfs_root_refs(&root->root_item) == 0) {
5804		spin_lock(&root->inode_lock);
5805		empty = RB_EMPTY_ROOT(&root->inode_tree);
5806		spin_unlock(&root->inode_lock);
5807		if (empty)
5808			btrfs_add_dead_root(root);
5809	}
5810}
5811
5812
5813static int btrfs_init_locked_inode(struct inode *inode, void *p)
5814{
5815	struct btrfs_iget_args *args = p;
5816
5817	inode->i_ino = args->ino;
5818	BTRFS_I(inode)->location.objectid = args->ino;
5819	BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
5820	BTRFS_I(inode)->location.offset = 0;
5821	BTRFS_I(inode)->root = btrfs_grab_root(args->root);
5822	BUG_ON(args->root && !BTRFS_I(inode)->root);
5823	return 0;
5824}
5825
5826static int btrfs_find_actor(struct inode *inode, void *opaque)
5827{
5828	struct btrfs_iget_args *args = opaque;
5829
5830	return args->ino == BTRFS_I(inode)->location.objectid &&
5831		args->root == BTRFS_I(inode)->root;
5832}
5833
5834static struct inode *btrfs_iget_locked(struct super_block *s, u64 ino,
5835				       struct btrfs_root *root)
5836{
5837	struct inode *inode;
5838	struct btrfs_iget_args args;
5839	unsigned long hashval = btrfs_inode_hash(ino, root);
5840
5841	args.ino = ino;
5842	args.root = root;
5843
5844	inode = iget5_locked(s, hashval, btrfs_find_actor,
5845			     btrfs_init_locked_inode,
5846			     (void *)&args);
5847	return inode;
5848}
5849
5850/*
5851 * Get an inode object given its inode number and corresponding root.
5852 * Path can be preallocated to prevent recursing back to iget through
5853 * allocator. NULL is also valid but may require an additional allocation
5854 * later.
5855 */
5856struct inode *btrfs_iget_path(struct super_block *s, u64 ino,
5857			      struct btrfs_root *root, struct btrfs_path *path)
5858{
5859	struct inode *inode;
5860
5861	inode = btrfs_iget_locked(s, ino, root);
5862	if (!inode)
5863		return ERR_PTR(-ENOMEM);
5864
5865	if (inode->i_state & I_NEW) {
5866		int ret;
5867
5868		ret = btrfs_read_locked_inode(inode, path);
5869		if (!ret) {
5870			inode_tree_add(inode);
5871			unlock_new_inode(inode);
5872		} else {
5873			iget_failed(inode);
5874			/*
5875			 * ret > 0 can come from btrfs_search_slot called by
5876			 * btrfs_read_locked_inode, this means the inode item
5877			 * was not found.
5878			 */
5879			if (ret > 0)
5880				ret = -ENOENT;
5881			inode = ERR_PTR(ret);
5882		}
5883	}
5884
5885	return inode;
5886}
5887
5888struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root)
5889{
5890	return btrfs_iget_path(s, ino, root, NULL);
5891}
5892
5893static struct inode *new_simple_dir(struct super_block *s,
5894				    struct btrfs_key *key,
5895				    struct btrfs_root *root)
5896{
5897	struct inode *inode = new_inode(s);
5898
5899	if (!inode)
5900		return ERR_PTR(-ENOMEM);
5901
5902	BTRFS_I(inode)->root = btrfs_grab_root(root);
5903