file.c revision e0467866
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Copyright (C) 2007 Oracle.  All rights reserved.
4 */
5
6#include <linux/fs.h>
7#include <linux/pagemap.h>
8#include <linux/time.h>
9#include <linux/init.h>
10#include <linux/string.h>
11#include <linux/backing-dev.h>
12#include <linux/falloc.h>
13#include <linux/writeback.h>
14#include <linux/compat.h>
15#include <linux/slab.h>
16#include <linux/btrfs.h>
17#include <linux/uio.h>
18#include <linux/iversion.h>
19#include "ctree.h"
20#include "disk-io.h"
21#include "transaction.h"
22#include "btrfs_inode.h"
23#include "print-tree.h"
24#include "tree-log.h"
25#include "locking.h"
26#include "volumes.h"
27#include "qgroup.h"
28#include "compression.h"
29#include "delalloc-space.h"
30#include "reflink.h"
31#include "subpage.h"
32
33static struct kmem_cache *btrfs_inode_defrag_cachep;
34/*
35 * When auto defrag is enabled we queue up these defrag structs to
36 * remember which inodes need defragging passes.
38 */
39struct inode_defrag {
40	struct rb_node rb_node;
41	/* objectid */
42	u64 ino;
43	/*
44	 * transid when the defrag was added; we search for
45	 * extents newer than this
46	 */
47	u64 transid;
48
49	/* root objectid */
50	u64 root;
51
52	/* last offset we were able to defrag */
53	u64 last_offset;
54
55	/* if we've wrapped around back to zero once already */
56	int cycled;
57};
58
59static int __compare_inode_defrag(struct inode_defrag *defrag1,
60				  struct inode_defrag *defrag2)
61{
62	if (defrag1->root > defrag2->root)
63		return 1;
64	else if (defrag1->root < defrag2->root)
65		return -1;
66	else if (defrag1->ino > defrag2->ino)
67		return 1;
68	else if (defrag1->ino < defrag2->ino)
69		return -1;
70	else
71		return 0;
72}
73
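/*
 * Illustrative example of the ordering above: defrag records are keyed by
 * (root, ino), with the root objectid compared first.
 *
 *	struct inode_defrag a = { .root = 5, .ino = 100 };
 *	struct inode_defrag b = { .root = 6, .ino = 1 };
 *
 *	__compare_inode_defrag(&a, &b) == -1
 *
 * a sorts before b even though a.ino > b.ino, so all records belonging to
 * one root are contiguous in the rbtree and can be walked in inode order.
 */
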
74/* insert a record for an inode into the defrag tree.  The lock
75 * must be held already
76 *
77 * If you're inserting a record for an older transid than an
78 * existing record, the transid already in the tree is lowered
79 *
80 * If an existing record is found the defrag item you
81 * pass in is freed
82 */
83static int __btrfs_add_inode_defrag(struct btrfs_inode *inode,
84				    struct inode_defrag *defrag)
85{
86	struct btrfs_fs_info *fs_info = inode->root->fs_info;
87	struct inode_defrag *entry;
88	struct rb_node **p;
89	struct rb_node *parent = NULL;
90	int ret;
91
92	p = &fs_info->defrag_inodes.rb_node;
93	while (*p) {
94		parent = *p;
95		entry = rb_entry(parent, struct inode_defrag, rb_node);
96
97		ret = __compare_inode_defrag(defrag, entry);
98		if (ret < 0)
99			p = &parent->rb_left;
100		else if (ret > 0)
101			p = &parent->rb_right;
102		else {
103			/* if we're reinserting an entry for
104			 * an old defrag run, make sure to
105			 * lower the transid of our existing record
106			 */
107			if (defrag->transid < entry->transid)
108				entry->transid = defrag->transid;
109			if (defrag->last_offset > entry->last_offset)
110				entry->last_offset = defrag->last_offset;
111			return -EEXIST;
112		}
113	}
114	set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags);
115	rb_link_node(&defrag->rb_node, parent, p);
116	rb_insert_color(&defrag->rb_node, &fs_info->defrag_inodes);
117	return 0;
118}
119
120static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info)
121{
122	if (!btrfs_test_opt(fs_info, AUTO_DEFRAG))
123		return 0;
124
125	if (btrfs_fs_closing(fs_info))
126		return 0;
127
128	return 1;
129}
130
131/*
132 * insert a defrag record for this inode if auto defrag is
133 * enabled
134 */
135int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
136			   struct btrfs_inode *inode)
137{
138	struct btrfs_root *root = inode->root;
139	struct btrfs_fs_info *fs_info = root->fs_info;
140	struct inode_defrag *defrag;
141	u64 transid;
142	int ret;
143
144	if (!__need_auto_defrag(fs_info))
145		return 0;
146
147	if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags))
148		return 0;
149
150	if (trans)
151		transid = trans->transid;
152	else
153		transid = inode->root->last_trans;
154
155	defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
156	if (!defrag)
157		return -ENOMEM;
158
159	defrag->ino = btrfs_ino(inode);
160	defrag->transid = transid;
161	defrag->root = root->root_key.objectid;
162
163	spin_lock(&fs_info->defrag_inodes_lock);
164	if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) {
165		/*
166		 * If we set the IN_DEFRAG flag and the inode gets evicted from
167		 * memory and then read back in, the new in-memory inode doesn't
168		 * have IN_DEFRAG set. In that case we may find an existing record.
169		 */
170		ret = __btrfs_add_inode_defrag(inode, defrag);
171		if (ret)
172			kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
173	} else {
174		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
175	}
176	spin_unlock(&fs_info->defrag_inodes_lock);
177	return 0;
178}
179
180/*
181 * Requeue the defrag object. If there is a defrag object that points to
182 * the same inode in the tree, we will merge them together (by
183 * __btrfs_add_inode_defrag()) and free the one that we want to requeue.
184 */
185static void btrfs_requeue_inode_defrag(struct btrfs_inode *inode,
186				       struct inode_defrag *defrag)
187{
188	struct btrfs_fs_info *fs_info = inode->root->fs_info;
189	int ret;
190
191	if (!__need_auto_defrag(fs_info))
192		goto out;
193
194	/*
195	 * Here we don't check the IN_DEFRAG flag, because we need to merge
196	 * them together.
197	 */
198	spin_lock(&fs_info->defrag_inodes_lock);
199	ret = __btrfs_add_inode_defrag(inode, defrag);
200	spin_unlock(&fs_info->defrag_inodes_lock);
201	if (ret)
202		goto out;
203	return;
204out:
205	kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
206}
207
208/*
209 * Pick the defrag record for the inode we want; if it doesn't exist, return
210 * the next one in the tree.
211 */
212static struct inode_defrag *
213btrfs_pick_defrag_inode(struct btrfs_fs_info *fs_info, u64 root, u64 ino)
214{
215	struct inode_defrag *entry = NULL;
216	struct inode_defrag tmp;
217	struct rb_node *p;
218	struct rb_node *parent = NULL;
219	int ret;
220
221	tmp.ino = ino;
222	tmp.root = root;
223
224	spin_lock(&fs_info->defrag_inodes_lock);
225	p = fs_info->defrag_inodes.rb_node;
226	while (p) {
227		parent = p;
228		entry = rb_entry(parent, struct inode_defrag, rb_node);
229
230		ret = __compare_inode_defrag(&tmp, entry);
231		if (ret < 0)
232			p = parent->rb_left;
233		else if (ret > 0)
234			p = parent->rb_right;
235		else
236			goto out;
237	}
238
239	if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
240		parent = rb_next(parent);
241		if (parent)
242			entry = rb_entry(parent, struct inode_defrag, rb_node);
243		else
244			entry = NULL;
245	}
246out:
247	if (entry)
248		rb_erase(parent, &fs_info->defrag_inodes);
249	spin_unlock(&fs_info->defrag_inodes_lock);
250	return entry;
251}
252
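/*
 * Worked example of the lookup above (illustrative values): with the tree
 * holding (root=5, ino=10), (root=5, ino=20) and (root=6, ino=3),
 *
 *	btrfs_pick_defrag_inode(fs_info, 5, 15)
 *
 * finds no exact match, steps forward via rb_next() and removes and returns
 * (root=5, ino=20), while
 *
 *	btrfs_pick_defrag_inode(fs_info, 6, 4)
 *
 * returns NULL because nothing sorts after (root=6, ino=3).
 */
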
253void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
254{
255	struct inode_defrag *defrag;
256	struct rb_node *node;
257
258	spin_lock(&fs_info->defrag_inodes_lock);
259	node = rb_first(&fs_info->defrag_inodes);
260	while (node) {
261		rb_erase(node, &fs_info->defrag_inodes);
262		defrag = rb_entry(node, struct inode_defrag, rb_node);
263		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
264
265		cond_resched_lock(&fs_info->defrag_inodes_lock);
266
267		node = rb_first(&fs_info->defrag_inodes);
268	}
269	spin_unlock(&fs_info->defrag_inodes_lock);
270}
271
272#define BTRFS_DEFRAG_BATCH	1024
273
274static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
275				    struct inode_defrag *defrag)
276{
277	struct btrfs_root *inode_root;
278	struct inode *inode;
279	struct btrfs_ioctl_defrag_range_args range;
280	int num_defrag;
281	int ret;
282
283	/* get the inode */
284	inode_root = btrfs_get_fs_root(fs_info, defrag->root, true);
285	if (IS_ERR(inode_root)) {
286		ret = PTR_ERR(inode_root);
287		goto cleanup;
288	}
289
290	inode = btrfs_iget(fs_info->sb, defrag->ino, inode_root);
291	btrfs_put_root(inode_root);
292	if (IS_ERR(inode)) {
293		ret = PTR_ERR(inode);
294		goto cleanup;
295	}
296
297	/* do a chunk of defrag */
298	clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
299	memset(&range, 0, sizeof(range));
300	range.len = (u64)-1;
301	range.start = defrag->last_offset;
302
303	sb_start_write(fs_info->sb);
304	num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
305				       BTRFS_DEFRAG_BATCH);
306	sb_end_write(fs_info->sb);
307	/*
308	 * if we filled the whole defrag batch, there
309	 * must be more work to do.  Queue this defrag
310	 * again
311	 */
312	if (num_defrag == BTRFS_DEFRAG_BATCH) {
313		defrag->last_offset = range.start;
314		btrfs_requeue_inode_defrag(BTRFS_I(inode), defrag);
315	} else if (defrag->last_offset && !defrag->cycled) {
316		/*
317		 * we didn't fill our defrag batch, but
318		 * we didn't start at zero.  Make sure we loop
319		 * around to the start of the file.
320		 */
321		defrag->last_offset = 0;
322		defrag->cycled = 1;
323		btrfs_requeue_inode_defrag(BTRFS_I(inode), defrag);
324	} else {
325		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
326	}
327
328	iput(inode);
329	return 0;
330cleanup:
331	kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
332	return ret;
333}
334
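/*
 * Illustrative trace of how one record moves through the function above,
 * assuming an inode with more than one batch worth of defrag candidates:
 *
 *	pass 1: btrfs_defrag_file() returns BTRFS_DEFRAG_BATCH (1024), so the
 *		record is requeued with last_offset = range.start to continue
 *		where this pass stopped.
 *	pass 2: fewer than 1024 extents are defragged, but last_offset != 0
 *		and the record has not cycled yet, so it is requeued once more
 *		with last_offset = 0 and cycled = 1 to cover the start of the
 *		file.
 *	pass 3: again fewer than 1024 extents, so the record is freed and the
 *		inode is done until it gets queued again.
 */
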
335/*
336 * run through the list of inodes in the FS that need
337 * defragging
338 */
339int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
340{
341	struct inode_defrag *defrag;
342	u64 first_ino = 0;
343	u64 root_objectid = 0;
344
345	atomic_inc(&fs_info->defrag_running);
346	while (1) {
347		/* Pause the auto defragger. */
348		if (test_bit(BTRFS_FS_STATE_REMOUNTING,
349			     &fs_info->fs_state))
350			break;
351
352		if (!__need_auto_defrag(fs_info))
353			break;
354
355		/* find an inode to defrag */
356		defrag = btrfs_pick_defrag_inode(fs_info, root_objectid,
357						 first_ino);
358		if (!defrag) {
359			if (root_objectid || first_ino) {
360				root_objectid = 0;
361				first_ino = 0;
362				continue;
363			} else {
364				break;
365			}
366		}
367
368		first_ino = defrag->ino + 1;
369		root_objectid = defrag->root;
370
371		__btrfs_run_defrag_inode(fs_info, defrag);
372	}
373	atomic_dec(&fs_info->defrag_running);
374
375	/*
376	 * during unmount, we use the transaction_wait queue to
377	 * wait for the defragger to stop
378	 */
379	wake_up(&fs_info->transaction_wait);
380	return 0;
381}
382
383/* simple helper to fault in pages and copy.  This should go away
384 * and be replaced with calls into generic code.
385 */
386static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
387					 struct page **prepared_pages,
388					 struct iov_iter *i)
389{
390	size_t copied = 0;
391	size_t total_copied = 0;
392	int pg = 0;
393	int offset = offset_in_page(pos);
394
395	while (write_bytes > 0) {
396		size_t count = min_t(size_t,
397				     PAGE_SIZE - offset, write_bytes);
398		struct page *page = prepared_pages[pg];
399		/*
400		 * Copy data from userspace to the current page
401		 */
402		copied = copy_page_from_iter_atomic(page, offset, count, i);
403
404		/* Flush processor's dcache for this page */
405		flush_dcache_page(page);
406
407		/*
408		 * if we get a partial write, we can end up with
409		 * partially up to date pages.  These add
410		 * a lot of complexity, so make sure they don't
411		 * happen by forcing this copy to be retried.
412		 *
413		 * The rest of the btrfs_file_write code will fall
414		 * back to page at a time copies after we return 0.
415		 */
416		if (unlikely(copied < count)) {
417			if (!PageUptodate(page)) {
418				iov_iter_revert(i, copied);
419				copied = 0;
420			}
421			if (!copied)
422				break;
423		}
424
425		write_bytes -= copied;
426		total_copied += copied;
427		offset += copied;
428		if (offset == PAGE_SIZE) {
429			pg++;
430			offset = 0;
431		}
432	}
433	return total_copied;
434}
435
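/*
 * Worked example for the copy loop above, assuming 4 KiB pages: a write of
 * 100 bytes at pos = 4090 starts with offset = offset_in_page(4090) = 4090,
 * so the first iteration copies min(4096 - 4090, 100) = 6 bytes into
 * prepared_pages[0]; offset then hits PAGE_SIZE, pg advances to 1 and the
 * remaining 94 bytes land at offset 0 of prepared_pages[1]. If the copy
 * comes up short on a page that is not uptodate, the partial copy is
 * reverted and the caller falls back to copying one page at a time.
 */
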
436/*
437 * unlocks pages after btrfs_file_write is done with them
438 */
439static void btrfs_drop_pages(struct page **pages, size_t num_pages)
440{
441	size_t i;
442	for (i = 0; i < num_pages; i++) {
443		/* PageChecked is some magic around finding pages that
444		 * have been modified without going through btrfs_set_page_dirty;
445		 * clear it here. There should be no need to mark the pages
446		 * accessed, as prepare_pages() should have marked them accessed
447		 * via find_or_create_page().
448		 */
449		ClearPageChecked(pages[i]);
450		unlock_page(pages[i]);
451		put_page(pages[i]);
452	}
453}
454
455/*
456 * After btrfs_copy_from_user(), update the following things for delalloc:
457 * - Mark newly dirtied pages as DELALLOC in the io tree.
458 *   Used to advise which range is to be written back.
459 * - Mark modified pages as Uptodate/Dirty and not needing COW fixup
460 * - Update inode size for past EOF write
461 */
462int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
463		      size_t num_pages, loff_t pos, size_t write_bytes,
464		      struct extent_state **cached, bool noreserve)
465{
466	struct btrfs_fs_info *fs_info = inode->root->fs_info;
467	int err = 0;
468	int i;
469	u64 num_bytes;
470	u64 start_pos;
471	u64 end_of_last_block;
472	u64 end_pos = pos + write_bytes;
473	loff_t isize = i_size_read(&inode->vfs_inode);
474	unsigned int extra_bits = 0;
475
476	if (write_bytes == 0)
477		return 0;
478
479	if (noreserve)
480		extra_bits |= EXTENT_NORESERVE;
481
482	start_pos = round_down(pos, fs_info->sectorsize);
483	num_bytes = round_up(write_bytes + pos - start_pos,
484			     fs_info->sectorsize);
485	ASSERT(num_bytes <= U32_MAX);
486
487	end_of_last_block = start_pos + num_bytes - 1;
488
489	/*
490	 * The pages may have already been dirty; clear out old accounting so
491	 * we can set things up properly
492	 */
493	clear_extent_bit(&inode->io_tree, start_pos, end_of_last_block,
494			 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
495			 0, 0, cached);
496
497	err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
498					extra_bits, cached);
499	if (err)
500		return err;
501
502	for (i = 0; i < num_pages; i++) {
503		struct page *p = pages[i];
504
505		btrfs_page_clamp_set_uptodate(fs_info, p, start_pos, num_bytes);
506		ClearPageChecked(p);
507		btrfs_page_clamp_set_dirty(fs_info, p, start_pos, num_bytes);
508	}
509
510	/*
511	 * we've only changed i_size in ram, and we haven't updated
512	 * the disk i_size.  There is no need to log the inode
513	 * at this time.
514	 */
515	if (end_pos > isize)
516		i_size_write(&inode->vfs_inode, end_pos);
517	return 0;
518}
519
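/*
 * Worked example of the rounding above, assuming a 4 KiB sectorsize: for
 * pos = 6000 and write_bytes = 3000, start_pos = round_down(6000, 4096) =
 * 4096 and num_bytes = round_up(3000 + 6000 - 4096, 4096) = 8192, so
 * delalloc is set on the two full sectors [4096, 12287] even though the
 * copy only touched bytes [6000, 8999]. end_pos = 9000, so i_size is bumped
 * to 9000 if the file was shorter than that.
 */
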
520/*
521 * this drops all the extents in the cache that intersect the range
522 * [start, end].  Existing extents are split as required.
523 */
524void btrfs_drop_extent_cache(struct btrfs_inode *inode, u64 start, u64 end,
525			     int skip_pinned)
526{
527	struct extent_map *em;
528	struct extent_map *split = NULL;
529	struct extent_map *split2 = NULL;
530	struct extent_map_tree *em_tree = &inode->extent_tree;
531	u64 len = end - start + 1;
532	u64 gen;
533	int ret;
534	int testend = 1;
535	unsigned long flags;
536	int compressed = 0;
537	bool modified;
538
539	WARN_ON(end < start);
540	if (end == (u64)-1) {
541		len = (u64)-1;
542		testend = 0;
543	}
544	while (1) {
545		int no_splits = 0;
546
547		modified = false;
548		if (!split)
549			split = alloc_extent_map();
550		if (!split2)
551			split2 = alloc_extent_map();
552		if (!split || !split2)
553			no_splits = 1;
554
555		write_lock(&em_tree->lock);
556		em = lookup_extent_mapping(em_tree, start, len);
557		if (!em) {
558			write_unlock(&em_tree->lock);
559			break;
560		}
561		flags = em->flags;
562		gen = em->generation;
563		if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
564			if (testend && em->start + em->len >= start + len) {
565				free_extent_map(em);
566				write_unlock(&em_tree->lock);
567				break;
568			}
569			start = em->start + em->len;
570			if (testend)
571				len = start + len - (em->start + em->len);
572			free_extent_map(em);
573			write_unlock(&em_tree->lock);
574			continue;
575		}
576		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
577		clear_bit(EXTENT_FLAG_PINNED, &em->flags);
578		clear_bit(EXTENT_FLAG_LOGGING, &flags);
579		modified = !list_empty(&em->list);
580		if (no_splits)
581			goto next;
582
583		if (em->start < start) {
584			split->start = em->start;
585			split->len = start - em->start;
586
587			if (em->block_start < EXTENT_MAP_LAST_BYTE) {
588				split->orig_start = em->orig_start;
589				split->block_start = em->block_start;
590
591				if (compressed)
592					split->block_len = em->block_len;
593				else
594					split->block_len = split->len;
595				split->orig_block_len = max(split->block_len,
596						em->orig_block_len);
597				split->ram_bytes = em->ram_bytes;
598			} else {
599				split->orig_start = split->start;
600				split->block_len = 0;
601				split->block_start = em->block_start;
602				split->orig_block_len = 0;
603				split->ram_bytes = split->len;
604			}
605
606			split->generation = gen;
607			split->flags = flags;
608			split->compress_type = em->compress_type;
609			replace_extent_mapping(em_tree, em, split, modified);
610			free_extent_map(split);
611			split = split2;
612			split2 = NULL;
613		}
614		if (testend && em->start + em->len > start + len) {
615			u64 diff = start + len - em->start;
616
617			split->start = start + len;
618			split->len = em->start + em->len - (start + len);
619			split->flags = flags;
620			split->compress_type = em->compress_type;
621			split->generation = gen;
622
623			if (em->block_start < EXTENT_MAP_LAST_BYTE) {
624				split->orig_block_len = max(em->block_len,
625						    em->orig_block_len);
626
627				split->ram_bytes = em->ram_bytes;
628				if (compressed) {
629					split->block_len = em->block_len;
630					split->block_start = em->block_start;
631					split->orig_start = em->orig_start;
632				} else {
633					split->block_len = split->len;
634					split->block_start = em->block_start
635						+ diff;
636					split->orig_start = em->orig_start;
637				}
638			} else {
639				split->ram_bytes = split->len;
640				split->orig_start = split->start;
641				split->block_len = 0;
642				split->block_start = em->block_start;
643				split->orig_block_len = 0;
644			}
645
646			if (extent_map_in_tree(em)) {
647				replace_extent_mapping(em_tree, em, split,
648						       modified);
649			} else {
650				ret = add_extent_mapping(em_tree, split,
651							 modified);
652				ASSERT(ret == 0); /* Logic error */
653			}
654			free_extent_map(split);
655			split = NULL;
656		}
657next:
658		if (extent_map_in_tree(em))
659			remove_extent_mapping(em_tree, em);
660		write_unlock(&em_tree->lock);
661
662		/* once for us */
663		free_extent_map(em);
664		/* once for the tree */
665		free_extent_map(em);
666	}
667	if (split)
668		free_extent_map(split);
669	if (split2)
670		free_extent_map(split2);
671}
672
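/*
 * Worked example for btrfs_drop_extent_cache() above: dropping [32K, 64K - 1]
 * from a regular (uncompressed) extent map that covers file range [0, 128K)
 * with block_start X removes that map and leaves two pieces behind:
 * [0, 32K) keeping block_start X, and [64K, 128K) with block_start X + 64K.
 * For compressed extents the pieces keep the original block_start and
 * block_len, since the on-disk data cannot be split at byte granularity.
 */
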
673/*
674 * This is very complex, but the basic idea is to drop all extents in the
675 * range described by args->start and args->end (args->end is exclusive).
677 *
678 * If an extent intersects the range but is not entirely inside the range
679 * it is either truncated or split.  Anything entirely inside the range
680 * is deleted from the tree.
681 *
682 * Note: the VFS' inode number of bytes is not updated, it's up to the caller
683 * to deal with that. We set the field 'bytes_found' of the arguments structure
684 * with the number of allocated bytes found in the target range, so that the
685 * caller can update the inode's number of bytes in an atomic way when
686 * replacing extents in a range to avoid races with stat(2).
687 */
688int btrfs_drop_extents(struct btrfs_trans_handle *trans,
689		       struct btrfs_root *root, struct btrfs_inode *inode,
690		       struct btrfs_drop_extents_args *args)
691{
692	struct btrfs_fs_info *fs_info = root->fs_info;
693	struct extent_buffer *leaf;
694	struct btrfs_file_extent_item *fi;
695	struct btrfs_ref ref = { 0 };
696	struct btrfs_key key;
697	struct btrfs_key new_key;
698	u64 ino = btrfs_ino(inode);
699	u64 search_start = args->start;
700	u64 disk_bytenr = 0;
701	u64 num_bytes = 0;
702	u64 extent_offset = 0;
703	u64 extent_end = 0;
704	u64 last_end = args->start;
705	int del_nr = 0;
706	int del_slot = 0;
707	int extent_type;
708	int recow;
709	int ret;
710	int modify_tree = -1;
711	int update_refs;
712	int found = 0;
713	int leafs_visited = 0;
714	struct btrfs_path *path = args->path;
715
716	args->bytes_found = 0;
717	args->extent_inserted = false;
718
719	/* Must always have a path if ->replace_extent is true */
720	ASSERT(!(args->replace_extent && !args->path));
721
722	if (!path) {
723		path = btrfs_alloc_path();
724		if (!path) {
725			ret = -ENOMEM;
726			goto out;
727		}
728	}
729
730	if (args->drop_cache)
731		btrfs_drop_extent_cache(inode, args->start, args->end - 1, 0);
732
733	if (args->start >= inode->disk_i_size && !args->replace_extent)
734		modify_tree = 0;
735
736	update_refs = (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) ||
737		       root == fs_info->tree_root);
738	while (1) {
739		recow = 0;
740		ret = btrfs_lookup_file_extent(trans, root, path, ino,
741					       search_start, modify_tree);
742		if (ret < 0)
743			break;
744		if (ret > 0 && path->slots[0] > 0 && search_start == args->start) {
745			leaf = path->nodes[0];
746			btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
747			if (key.objectid == ino &&
748			    key.type == BTRFS_EXTENT_DATA_KEY)
749				path->slots[0]--;
750		}
751		ret = 0;
752		leafs_visited++;
753next_slot:
754		leaf = path->nodes[0];
755		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
756			BUG_ON(del_nr > 0);
757			ret = btrfs_next_leaf(root, path);
758			if (ret < 0)
759				break;
760			if (ret > 0) {
761				ret = 0;
762				break;
763			}
764			leafs_visited++;
765			leaf = path->nodes[0];
766			recow = 1;
767		}
768
769		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
770
771		if (key.objectid > ino)
772			break;
773		if (WARN_ON_ONCE(key.objectid < ino) ||
774		    key.type < BTRFS_EXTENT_DATA_KEY) {
775			ASSERT(del_nr == 0);
776			path->slots[0]++;
777			goto next_slot;
778		}
779		if (key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= args->end)
780			break;
781
782		fi = btrfs_item_ptr(leaf, path->slots[0],
783				    struct btrfs_file_extent_item);
784		extent_type = btrfs_file_extent_type(leaf, fi);
785
786		if (extent_type == BTRFS_FILE_EXTENT_REG ||
787		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
788			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
789			num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
790			extent_offset = btrfs_file_extent_offset(leaf, fi);
791			extent_end = key.offset +
792				btrfs_file_extent_num_bytes(leaf, fi);
793		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
794			extent_end = key.offset +
795				btrfs_file_extent_ram_bytes(leaf, fi);
796		} else {
797			/* can't happen */
798			BUG();
799		}
800
801		/*
802		 * Don't skip extent items representing 0 byte lengths. They
803		 * used to be created (bug) if we hit an -ENOSPC condition while
804		 * punching holes. So if we find one here, just ensure we
805		 * delete it, otherwise we would insert a new file extent item
806		 * with the same key (offset) as that 0 bytes length file
807		 * extent item in the call to setup_items_for_insert() later
808		 * in this function.
809		 */
810		if (extent_end == key.offset && extent_end >= search_start) {
811			last_end = extent_end;
812			goto delete_extent_item;
813		}
814
815		if (extent_end <= search_start) {
816			path->slots[0]++;
817			goto next_slot;
818		}
819
820		found = 1;
821		search_start = max(key.offset, args->start);
822		if (recow || !modify_tree) {
823			modify_tree = -1;
824			btrfs_release_path(path);
825			continue;
826		}
827
828		/*
829		 *     | - range to drop - |
830		 *  | -------- extent -------- |
831		 */
832		if (args->start > key.offset && args->end < extent_end) {
833			BUG_ON(del_nr > 0);
834			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
835				ret = -EOPNOTSUPP;
836				break;
837			}
838
839			memcpy(&new_key, &key, sizeof(new_key));
840			new_key.offset = args->start;
841			ret = btrfs_duplicate_item(trans, root, path,
842						   &new_key);
843			if (ret == -EAGAIN) {
844				btrfs_release_path(path);
845				continue;
846			}
847			if (ret < 0)
848				break;
849
850			leaf = path->nodes[0];
851			fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
852					    struct btrfs_file_extent_item);
853			btrfs_set_file_extent_num_bytes(leaf, fi,
854							args->start - key.offset);
855
856			fi = btrfs_item_ptr(leaf, path->slots[0],
857					    struct btrfs_file_extent_item);
858
859			extent_offset += args->start - key.offset;
860			btrfs_set_file_extent_offset(leaf, fi, extent_offset);
861			btrfs_set_file_extent_num_bytes(leaf, fi,
862							extent_end - args->start);
863			btrfs_mark_buffer_dirty(leaf);
864
865			if (update_refs && disk_bytenr > 0) {
866				btrfs_init_generic_ref(&ref,
867						BTRFS_ADD_DELAYED_REF,
868						disk_bytenr, num_bytes, 0);
869				btrfs_init_data_ref(&ref,
870						root->root_key.objectid,
871						new_key.objectid,
872						args->start - extent_offset);
873				ret = btrfs_inc_extent_ref(trans, &ref);
874				BUG_ON(ret); /* -ENOMEM */
875			}
876			key.offset = args->start;
877		}
878		/*
879		 * From here on out we will have actually dropped something, so
880		 * last_end can be updated.
881		 */
882		last_end = extent_end;
883
884		/*
885		 *  | ---- range to drop ----- |
886		 *      | -------- extent -------- |
887		 */
888		if (args->start <= key.offset && args->end < extent_end) {
889			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
890				ret = -EOPNOTSUPP;
891				break;
892			}
893
894			memcpy(&new_key, &key, sizeof(new_key));
895			new_key.offset = args->end;
896			btrfs_set_item_key_safe(fs_info, path, &new_key);
897
898			extent_offset += args->end - key.offset;
899			btrfs_set_file_extent_offset(leaf, fi, extent_offset);
900			btrfs_set_file_extent_num_bytes(leaf, fi,
901							extent_end - args->end);
902			btrfs_mark_buffer_dirty(leaf);
903			if (update_refs && disk_bytenr > 0)
904				args->bytes_found += args->end - key.offset;
905			break;
906		}
907
908		search_start = extent_end;
909		/*
910		 *       | ---- range to drop ----- |
911		 *  | -------- extent -------- |
912		 */
913		if (args->start > key.offset && args->end >= extent_end) {
914			BUG_ON(del_nr > 0);
915			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
916				ret = -EOPNOTSUPP;
917				break;
918			}
919
920			btrfs_set_file_extent_num_bytes(leaf, fi,
921							args->start - key.offset);
922			btrfs_mark_buffer_dirty(leaf);
923			if (update_refs && disk_bytenr > 0)
924				args->bytes_found += extent_end - args->start;
925			if (args->end == extent_end)
926				break;
927
928			path->slots[0]++;
929			goto next_slot;
930		}
931
932		/*
933		 *  | ---- range to drop ----- |
934		 *    | ------ extent ------ |
935		 */
936		if (args->start <= key.offset && args->end >= extent_end) {
937delete_extent_item:
938			if (del_nr == 0) {
939				del_slot = path->slots[0];
940				del_nr = 1;
941			} else {
942				BUG_ON(del_slot + del_nr != path->slots[0]);
943				del_nr++;
944			}
945
946			if (update_refs &&
947			    extent_type == BTRFS_FILE_EXTENT_INLINE) {
948				args->bytes_found += extent_end - key.offset;
949				extent_end = ALIGN(extent_end,
950						   fs_info->sectorsize);
951			} else if (update_refs && disk_bytenr > 0) {
952				btrfs_init_generic_ref(&ref,
953						BTRFS_DROP_DELAYED_REF,
954						disk_bytenr, num_bytes, 0);
955				btrfs_init_data_ref(&ref,
956						root->root_key.objectid,
957						key.objectid,
958						key.offset - extent_offset);
959				ret = btrfs_free_extent(trans, &ref);
960				BUG_ON(ret); /* -ENOMEM */
961				args->bytes_found += extent_end - key.offset;
962			}
963
964			if (args->end == extent_end)
965				break;
966
967			if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) {
968				path->slots[0]++;
969				goto next_slot;
970			}
971
972			ret = btrfs_del_items(trans, root, path, del_slot,
973					      del_nr);
974			if (ret) {
975				btrfs_abort_transaction(trans, ret);
976				break;
977			}
978
979			del_nr = 0;
980			del_slot = 0;
981
982			btrfs_release_path(path);
983			continue;
984		}
985
986		BUG();
987	}
988
989	if (!ret && del_nr > 0) {
990		/*
991		 * Set path->slots[0] to first slot, so that after the delete
992		 * if items are moved off from our leaf to its immediate left or
993		 * right neighbor leaves, we end up with a correct and adjusted
994		 * path->slots[0] for our insertion (if args->replace_extent).
995		 */
996		path->slots[0] = del_slot;
997		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
998		if (ret)
999			btrfs_abort_transaction(trans, ret);
1000	}
1001
1002	leaf = path->nodes[0];
1003	/*
1004	 * If btrfs_del_items() was called, it might have deleted a leaf, in
1005	 * which case it unlocked our path, so check path->locks[0] matches a
1006	 * write lock.
1007	 */
1008	if (!ret && args->replace_extent && leafs_visited == 1 &&
1009	    path->locks[0] == BTRFS_WRITE_LOCK &&
1010	    btrfs_leaf_free_space(leaf) >=
1011	    sizeof(struct btrfs_item) + args->extent_item_size) {
1012
1013		key.objectid = ino;
1014		key.type = BTRFS_EXTENT_DATA_KEY;
1015		key.offset = args->start;
1016		if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) {
1017			struct btrfs_key slot_key;
1018
1019			btrfs_item_key_to_cpu(leaf, &slot_key, path->slots[0]);
1020			if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
1021				path->slots[0]++;
1022		}
1023		setup_items_for_insert(root, path, &key,
1024				       &args->extent_item_size, 1);
1025		args->extent_inserted = true;
1026	}
1027
1028	if (!args->path)
1029		btrfs_free_path(path);
1030	else if (!args->extent_inserted)
1031		btrfs_release_path(path);
1032out:
1033	args->drop_end = found ? min(args->end, last_end) : args->end;
1034
1035	return ret;
1036}
1037
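/*
 * Condensed sketch of a typical call to btrfs_drop_extents(), showing only
 * the fields used above (the caller is assumed to already hold a
 * transaction handle):
 *
 *	struct btrfs_drop_extents_args args = { 0 };
 *
 *	args.start = offset;
 *	args.end = offset + len;	(end is exclusive)
 *	args.drop_cache = true;
 *	ret = btrfs_drop_extents(trans, root, inode, &args);
 *	if (ret == 0) {
 *		args.bytes_found is the number of allocated bytes dropped,
 *		args.drop_end is the end of the range actually processed.
 *	}
 */
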
1038static int extent_mergeable(struct extent_buffer *leaf, int slot,
1039			    u64 objectid, u64 bytenr, u64 orig_offset,
1040			    u64 *start, u64 *end)
1041{
1042	struct btrfs_file_extent_item *fi;
1043	struct btrfs_key key;
1044	u64 extent_end;
1045
1046	if (slot < 0 || slot >= btrfs_header_nritems(leaf))
1047		return 0;
1048
1049	btrfs_item_key_to_cpu(leaf, &key, slot);
1050	if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
1051		return 0;
1052
1053	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
1054	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
1055	    btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
1056	    btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset ||
1057	    btrfs_file_extent_compression(leaf, fi) ||
1058	    btrfs_file_extent_encryption(leaf, fi) ||
1059	    btrfs_file_extent_other_encoding(leaf, fi))
1060		return 0;
1061
1062	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
1063	if ((*start && *start != key.offset) || (*end && *end != extent_end))
1064		return 0;
1065
1066	*start = key.offset;
1067	*end = extent_end;
1068	return 1;
1069}
1070
1071/*
1072 * Mark extent in the range start - end as written.
1073 *
1074 * This changes extent type from 'pre-allocated' to 'regular'. If only
1075 * part of the extent is marked as written, the extent will be split into
1076 * two or three.
1077 */
1078int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
1079			      struct btrfs_inode *inode, u64 start, u64 end)
1080{
1081	struct btrfs_fs_info *fs_info = trans->fs_info;
1082	struct btrfs_root *root = inode->root;
1083	struct extent_buffer *leaf;
1084	struct btrfs_path *path;
1085	struct btrfs_file_extent_item *fi;
1086	struct btrfs_ref ref = { 0 };
1087	struct btrfs_key key;
1088	struct btrfs_key new_key;
1089	u64 bytenr;
1090	u64 num_bytes;
1091	u64 extent_end;
1092	u64 orig_offset;
1093	u64 other_start;
1094	u64 other_end;
1095	u64 split;
1096	int del_nr = 0;
1097	int del_slot = 0;
1098	int recow;
1099	int ret = 0;
1100	u64 ino = btrfs_ino(inode);
1101
1102	path = btrfs_alloc_path();
1103	if (!path)
1104		return -ENOMEM;
1105again:
1106	recow = 0;
1107	split = start;
1108	key.objectid = ino;
1109	key.type = BTRFS_EXTENT_DATA_KEY;
1110	key.offset = split;
1111
1112	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1113	if (ret < 0)
1114		goto out;
1115	if (ret > 0 && path->slots[0] > 0)
1116		path->slots[0]--;
1117
1118	leaf = path->nodes[0];
1119	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1120	if (key.objectid != ino ||
1121	    key.type != BTRFS_EXTENT_DATA_KEY) {
1122		ret = -EINVAL;
1123		btrfs_abort_transaction(trans, ret);
1124		goto out;
1125	}
1126	fi = btrfs_item_ptr(leaf, path->slots[0],
1127			    struct btrfs_file_extent_item);
1128	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_PREALLOC) {
1129		ret = -EINVAL;
1130		btrfs_abort_transaction(trans, ret);
1131		goto out;
1132	}
1133	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
1134	if (key.offset > start || extent_end < end) {
1135		ret = -EINVAL;
1136		btrfs_abort_transaction(trans, ret);
1137		goto out;
1138	}
1139
1140	bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1141	num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
1142	orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
1143	memcpy(&new_key, &key, sizeof(new_key));
1144
1145	if (start == key.offset && end < extent_end) {
1146		other_start = 0;
1147		other_end = start;
1148		if (extent_mergeable(leaf, path->slots[0] - 1,
1149				     ino, bytenr, orig_offset,
1150				     &other_start, &other_end)) {
1151			new_key.offset = end;
1152			btrfs_set_item_key_safe(fs_info, path, &new_key);
1153			fi = btrfs_item_ptr(leaf, path->slots[0],
1154					    struct btrfs_file_extent_item);
1155			btrfs_set_file_extent_generation(leaf, fi,
1156							 trans->transid);
1157			btrfs_set_file_extent_num_bytes(leaf, fi,
1158							extent_end - end);
1159			btrfs_set_file_extent_offset(leaf, fi,
1160						     end - orig_offset);
1161			fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
1162					    struct btrfs_file_extent_item);
1163			btrfs_set_file_extent_generation(leaf, fi,
1164							 trans->transid);
1165			btrfs_set_file_extent_num_bytes(leaf, fi,
1166							end - other_start);
1167			btrfs_mark_buffer_dirty(leaf);
1168			goto out;
1169		}
1170	}
1171
1172	if (start > key.offset && end == extent_end) {
1173		other_start = end;
1174		other_end = 0;
1175		if (extent_mergeable(leaf, path->slots[0] + 1,
1176				     ino, bytenr, orig_offset,
1177				     &other_start, &other_end)) {
1178			fi = btrfs_item_ptr(leaf, path->slots[0],
1179					    struct btrfs_file_extent_item);
1180			btrfs_set_file_extent_num_bytes(leaf, fi,
1181							start - key.offset);
1182			btrfs_set_file_extent_generation(leaf, fi,
1183							 trans->transid);
1184			path->slots[0]++;
1185			new_key.offset = start;
1186			btrfs_set_item_key_safe(fs_info, path, &new_key);
1187
1188			fi = btrfs_item_ptr(leaf, path->slots[0],
1189					    struct btrfs_file_extent_item);
1190			btrfs_set_file_extent_generation(leaf, fi,
1191							 trans->transid);
1192			btrfs_set_file_extent_num_bytes(leaf, fi,
1193							other_end - start);
1194			btrfs_set_file_extent_offset(leaf, fi,
1195						     start - orig_offset);
1196			btrfs_mark_buffer_dirty(leaf);
1197			goto out;
1198		}
1199	}
1200
1201	while (start > key.offset || end < extent_end) {
1202		if (key.offset == start)
1203			split = end;
1204
1205		new_key.offset = split;
1206		ret = btrfs_duplicate_item(trans, root, path, &new_key);
1207		if (ret == -EAGAIN) {
1208			btrfs_release_path(path);
1209			goto again;
1210		}
1211		if (ret < 0) {
1212			btrfs_abort_transaction(trans, ret);
1213			goto out;
1214		}
1215
1216		leaf = path->nodes[0];
1217		fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
1218				    struct btrfs_file_extent_item);
1219		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1220		btrfs_set_file_extent_num_bytes(leaf, fi,
1221						split - key.offset);
1222
1223		fi = btrfs_item_ptr(leaf, path->slots[0],
1224				    struct btrfs_file_extent_item);
1225
1226		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1227		btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
1228		btrfs_set_file_extent_num_bytes(leaf, fi,
1229						extent_end - split);
1230		btrfs_mark_buffer_dirty(leaf);
1231
1232		btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, bytenr,
1233				       num_bytes, 0);
1234		btrfs_init_data_ref(&ref, root->root_key.objectid, ino,
1235				    orig_offset);
1236		ret = btrfs_inc_extent_ref(trans, &ref);
1237		if (ret) {
1238			btrfs_abort_transaction(trans, ret);
1239			goto out;
1240		}
1241
1242		if (split == start) {
1243			key.offset = start;
1244		} else {
1245			if (start != key.offset) {
1246				ret = -EINVAL;
1247				btrfs_abort_transaction(trans, ret);
1248				goto out;
1249			}
1250			path->slots[0]--;
1251			extent_end = end;
1252		}
1253		recow = 1;
1254	}
1255
1256	other_start = end;
1257	other_end = 0;
1258	btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
1259			       num_bytes, 0);
1260	btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset);
1261	if (extent_mergeable(leaf, path->slots[0] + 1,
1262			     ino, bytenr, orig_offset,
1263			     &other_start, &other_end)) {
1264		if (recow) {
1265			btrfs_release_path(path);
1266			goto again;
1267		}
1268		extent_end = other_end;
1269		del_slot = path->slots[0] + 1;
1270		del_nr++;
1271		ret = btrfs_free_extent(trans, &ref);
1272		if (ret) {
1273			btrfs_abort_transaction(trans, ret);
1274			goto out;
1275		}
1276	}
1277	other_start = 0;
1278	other_end = start;
1279	if (extent_mergeable(leaf, path->slots[0] - 1,
1280			     ino, bytenr, orig_offset,
1281			     &other_start, &other_end)) {
1282		if (recow) {
1283			btrfs_release_path(path);
1284			goto again;
1285		}
1286		key.offset = other_start;
1287		del_slot = path->slots[0];
1288		del_nr++;
1289		ret = btrfs_free_extent(trans, &ref);
1290		if (ret) {
1291			btrfs_abort_transaction(trans, ret);
1292			goto out;
1293		}
1294	}
1295	if (del_nr == 0) {
1296		fi = btrfs_item_ptr(leaf, path->slots[0],
1297			   struct btrfs_file_extent_item);
1298		btrfs_set_file_extent_type(leaf, fi,
1299					   BTRFS_FILE_EXTENT_REG);
1300		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1301		btrfs_mark_buffer_dirty(leaf);
1302	} else {
1303		fi = btrfs_item_ptr(leaf, del_slot - 1,
1304			   struct btrfs_file_extent_item);
1305		btrfs_set_file_extent_type(leaf, fi,
1306					   BTRFS_FILE_EXTENT_REG);
1307		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1308		btrfs_set_file_extent_num_bytes(leaf, fi,
1309						extent_end - key.offset);
1310		btrfs_mark_buffer_dirty(leaf);
1311
1312		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
1313		if (ret < 0) {
1314			btrfs_abort_transaction(trans, ret);
1315			goto out;
1316		}
1317	}
1318out:
1319	btrfs_free_path(path);
1320	return ret;
1321}
1322
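/*
 * Worked example for btrfs_mark_extent_written() above: with a single
 * preallocated extent covering file range [0, 1M), marking [256K, 512K) as
 * written goes through the split loop twice (btrfs_duplicate_item() at
 * offsets 256K and 512K), leaving three file extent items: [0, 256K)
 * prealloc, [256K, 512K) regular and [512K, 1M) prealloc. If the written
 * part borders an already written piece of the same physical extent,
 * extent_mergeable() lets it be merged with that neighbour instead and the
 * now redundant item(s) are deleted.
 */
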
1323/*
1324 * on error we return an unlocked page and the error value
1325 * on success we return a locked page and 0
1326 */
1327static int prepare_uptodate_page(struct inode *inode,
1328				 struct page *page, u64 pos,
1329				 bool force_uptodate)
1330{
1331	int ret = 0;
1332
1333	if (((pos & (PAGE_SIZE - 1)) || force_uptodate) &&
1334	    !PageUptodate(page)) {
1335		ret = btrfs_readpage(NULL, page);
1336		if (ret)
1337			return ret;
1338		lock_page(page);
1339		if (!PageUptodate(page)) {
1340			unlock_page(page);
1341			return -EIO;
1342		}
1343
1344		/*
1345		 * Since btrfs_readpage() will unlock the page before it
1346		 * returns, there is a window where btrfs_releasepage() can
1347		 * be called to release the page.
1348		 * Here we check both inode mapping and PagePrivate() to
1349		 * make sure the page was not released.
1350		 *
1351		 * The private flag check is essential for subpage as we need
1352		 * to store extra bitmap using page->private.
1353		 */
1354		if (page->mapping != inode->i_mapping || !PagePrivate(page)) {
1355			unlock_page(page);
1356			return -EAGAIN;
1357		}
1358	}
1359	return 0;
1360}
1361
1362/*
1363 * this just gets pages into the page cache and locks them down.
1364 */
1365static noinline int prepare_pages(struct inode *inode, struct page **pages,
1366				  size_t num_pages, loff_t pos,
1367				  size_t write_bytes, bool force_uptodate)
1368{
1369	int i;
1370	unsigned long index = pos >> PAGE_SHIFT;
1371	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
1372	int err = 0;
1373	int faili;
1374
1375	for (i = 0; i < num_pages; i++) {
1376again:
1377		pages[i] = find_or_create_page(inode->i_mapping, index + i,
1378					       mask | __GFP_WRITE);
1379		if (!pages[i]) {
1380			faili = i - 1;
1381			err = -ENOMEM;
1382			goto fail;
1383		}
1384
1385		err = set_page_extent_mapped(pages[i]);
1386		if (err < 0) {
1387			faili = i;
1388			goto fail;
1389		}
1390
1391		if (i == 0)
1392			err = prepare_uptodate_page(inode, pages[i], pos,
1393						    force_uptodate);
1394		if (!err && i == num_pages - 1)
1395			err = prepare_uptodate_page(inode, pages[i],
1396						    pos + write_bytes, false);
1397		if (err) {
1398			put_page(pages[i]);
1399			if (err == -EAGAIN) {
1400				err = 0;
1401				goto again;
1402			}
1403			faili = i - 1;
1404			goto fail;
1405		}
1406		wait_on_page_writeback(pages[i]);
1407	}
1408
1409	return 0;
1410fail:
1411	while (faili >= 0) {
1412		unlock_page(pages[faili]);
1413		put_page(pages[faili]);
1414		faili--;
1415	}
1416	return err;
1417
1418}
1419
1420/*
1421 * This function locks the extent and properly waits for data=ordered extents
1422 * to finish before allowing the pages to be modified if needed.
1423 *
1424 * The return value:
1425 * 1 - the extent is locked
1426 * 0 - the extent is not locked, and everything is OK
1427 * -EAGAIN - the pages need to be re-prepared
1428 * any other < 0 value - something went wrong
1429 */
1430static noinline int
1431lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
1432				size_t num_pages, loff_t pos,
1433				size_t write_bytes,
1434				u64 *lockstart, u64 *lockend,
1435				struct extent_state **cached_state)
1436{
1437	struct btrfs_fs_info *fs_info = inode->root->fs_info;
1438	u64 start_pos;
1439	u64 last_pos;
1440	int i;
1441	int ret = 0;
1442
1443	start_pos = round_down(pos, fs_info->sectorsize);
1444	last_pos = round_up(pos + write_bytes, fs_info->sectorsize) - 1;
1445
1446	if (start_pos < inode->vfs_inode.i_size) {
1447		struct btrfs_ordered_extent *ordered;
1448
1449		lock_extent_bits(&inode->io_tree, start_pos, last_pos,
1450				cached_state);
1451		ordered = btrfs_lookup_ordered_range(inode, start_pos,
1452						     last_pos - start_pos + 1);
1453		if (ordered &&
1454		    ordered->file_offset + ordered->num_bytes > start_pos &&
1455		    ordered->file_offset <= last_pos) {
1456			unlock_extent_cached(&inode->io_tree, start_pos,
1457					last_pos, cached_state);
1458			for (i = 0; i < num_pages; i++) {
1459				unlock_page(pages[i]);
1460				put_page(pages[i]);
1461			}
1462			btrfs_start_ordered_extent(ordered, 1);
1463			btrfs_put_ordered_extent(ordered);
1464			return -EAGAIN;
1465		}
1466		if (ordered)
1467			btrfs_put_ordered_extent(ordered);
1468
1469		*lockstart = start_pos;
1470		*lockend = last_pos;
1471		ret = 1;
1472	}
1473
1474	/*
1475	 * We should be called after prepare_pages() which should have locked
1476	 * all pages in the range.
1477	 */
1478	for (i = 0; i < num_pages; i++)
1479		WARN_ON(!PageLocked(pages[i]));
1480
1481	return ret;
1482}
1483
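/*
 * Condensed sketch of how btrfs_buffered_write() below consumes the return
 * value of lock_and_cleanup_extent_if_need():
 *
 *	again:
 *	prepare_pages(inode, pages, num_pages, pos, write_bytes, ...);
 *	extents_locked = lock_and_cleanup_extent_if_need(inode, pages,
 *				num_pages, pos, write_bytes, &lockstart,
 *				&lockend, &cached_state);
 *	if (extents_locked == -EAGAIN)
 *		goto again;	(the pages were unlocked and released)
 *	...copy and dirty the pages...
 *	if (extents_locked)
 *		unlock_extent_cached(&inode->io_tree, lockstart, lockend,
 *				     &cached_state);
 */
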
1484static int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
1485			   size_t *write_bytes, bool nowait)
1486{
1487	struct btrfs_fs_info *fs_info = inode->root->fs_info;
1488	struct btrfs_root *root = inode->root;
1489	u64 lockstart, lockend;
1490	u64 num_bytes;
1491	int ret;
1492
1493	if (!(inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
1494		return 0;
1495
1496	if (!nowait && !btrfs_drew_try_write_lock(&root->snapshot_lock))
1497		return -EAGAIN;
1498
1499	lockstart = round_down(pos, fs_info->sectorsize);
1500	lockend = round_up(pos + *write_bytes,
1501			   fs_info->sectorsize) - 1;
1502	num_bytes = lockend - lockstart + 1;
1503
1504	if (nowait) {
1505		struct btrfs_ordered_extent *ordered;
1506
1507		if (!try_lock_extent(&inode->io_tree, lockstart, lockend))
1508			return -EAGAIN;
1509
1510		ordered = btrfs_lookup_ordered_range(inode, lockstart,
1511						     num_bytes);
1512		if (ordered) {
1513			btrfs_put_ordered_extent(ordered);
1514			ret = -EAGAIN;
1515			goto out_unlock;
1516		}
1517	} else {
1518		btrfs_lock_and_flush_ordered_range(inode, lockstart,
1519						   lockend, NULL);
1520	}
1521
1522	ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes,
1523			NULL, NULL, NULL, false);
1524	if (ret <= 0) {
1525		ret = 0;
1526		if (!nowait)
1527			btrfs_drew_write_unlock(&root->snapshot_lock);
1528	} else {
1529		*write_bytes = min_t(size_t, *write_bytes,
1530				     num_bytes - pos + lockstart);
1531	}
1532out_unlock:
1533	unlock_extent(&inode->io_tree, lockstart, lockend);
1534
1535	return ret;
1536}
1537
1538static int check_nocow_nolock(struct btrfs_inode *inode, loff_t pos,
1539			      size_t *write_bytes)
1540{
1541	return check_can_nocow(inode, pos, write_bytes, true);
1542}
1543
1544/*
1545 * Check if we can do nocow write into the range [@pos, @pos + @write_bytes)
1546 *
1547 * @pos:	 File offset
1548 * @write_bytes: The length to write, will be updated to the nocow writeable
1549 *		 range
1550 *
1551 * This function will flush ordered extents in the range to ensure proper
1552 * nocow checks.
1553 *
1554 * Return:
1555 * >0		and update @write_bytes if we can do nocow write
1556 *  0		if we can't do nocow write
1557 * -EAGAIN	if we can't get the needed lock or there are ordered extents
1558 * 		in the (nowait == true) case
1559 * <0		if other error happened
1560 *
1561 * NOTE: Callers need to release the lock by btrfs_check_nocow_unlock().
1562 */
1563int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
1564			   size_t *write_bytes)
1565{
1566	return check_can_nocow(inode, pos, write_bytes, false);
1567}
1568
1569void btrfs_check_nocow_unlock(struct btrfs_inode *inode)
1570{
1571	btrfs_drew_write_unlock(&inode->root->snapshot_lock);
1572}
1573
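/*
 * Condensed sketch of the locking pattern used by btrfs_buffered_write()
 * below when a data space reservation fails:
 *
 *	if (btrfs_check_nocow_lock(inode, pos, &write_bytes) > 0) {
 *		write_bytes may have been shrunk to the NOCOW-able length;
 *		reserve metadata only and skip the data space reservation,
 *		then call btrfs_check_nocow_unlock(inode) when done.
 *	}
 *	A return of 0 or an error means the range must be COWed and needs a
 *	data space reservation as usual.
 */
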
1574static void update_time_for_write(struct inode *inode)
1575{
1576	struct timespec64 now;
1577
1578	if (IS_NOCMTIME(inode))
1579		return;
1580
1581	now = current_time(inode);
1582	if (!timespec64_equal(&inode->i_mtime, &now))
1583		inode->i_mtime = now;
1584
1585	if (!timespec64_equal(&inode->i_ctime, &now))
1586		inode->i_ctime = now;
1587
1588	if (IS_I_VERSION(inode))
1589		inode_inc_iversion(inode);
1590}
1591
1592static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from,
1593			     size_t count)
1594{
1595	struct file *file = iocb->ki_filp;
1596	struct inode *inode = file_inode(file);
1597	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1598	loff_t pos = iocb->ki_pos;
1599	int ret;
1600	loff_t oldsize;
1601	loff_t start_pos;
1602
1603	if (iocb->ki_flags & IOCB_NOWAIT) {
1604		size_t nocow_bytes = count;
1605
1606		/* We will allocate space in case nodatacow is not set, so bail */
1607		if (check_nocow_nolock(BTRFS_I(inode), pos, &nocow_bytes) <= 0)
1608			return -EAGAIN;
1609		/*
1610		 * There are holes in the range or parts of the range that must
1611		 * be COWed (shared extents, RO block groups, etc), so just bail
1612		 * out.
1613		 */
1614		if (nocow_bytes < count)
1615			return -EAGAIN;
1616	}
1617
1618	current->backing_dev_info = inode_to_bdi(inode);
1619	ret = file_remove_privs(file);
1620	if (ret)
1621		return ret;
1622
1623	/*
1624	 * We reserve space for updating the inode when we reserve space for the
1625	 * extent we are going to write, so we will enospc out there.  We don't
1626	 * need to start yet another transaction to update the inode as we will
1627	 * update the inode when we finish writing whatever data we write.
1628	 */
1629	update_time_for_write(inode);
1630
1631	start_pos = round_down(pos, fs_info->sectorsize);
1632	oldsize = i_size_read(inode);
1633	if (start_pos > oldsize) {
1634		/* Expand hole size to cover write data, preventing empty gap */
1635		loff_t end_pos = round_up(pos + count, fs_info->sectorsize);
1636
1637		ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, end_pos);
1638		if (ret) {
1639			current->backing_dev_info = NULL;
1640			return ret;
1641		}
1642	}
1643
1644	return 0;
1645}
1646
1647static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
1648					       struct iov_iter *i)
1649{
1650	struct file *file = iocb->ki_filp;
1651	loff_t pos;
1652	struct inode *inode = file_inode(file);
1653	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1654	struct page **pages = NULL;
1655	struct extent_changeset *data_reserved = NULL;
1656	u64 release_bytes = 0;
1657	u64 lockstart;
1658	u64 lockend;
1659	size_t num_written = 0;
1660	int nrptrs;
1661	ssize_t ret;
1662	bool only_release_metadata = false;
1663	bool force_page_uptodate = false;
1664	loff_t old_isize = i_size_read(inode);
1665	unsigned int ilock_flags = 0;
1666
1667	if (iocb->ki_flags & IOCB_NOWAIT)
1668		ilock_flags |= BTRFS_ILOCK_TRY;
1669
1670	ret = btrfs_inode_lock(inode, ilock_flags);
1671	if (ret < 0)
1672		return ret;
1673
1674	ret = generic_write_checks(iocb, i);
1675	if (ret <= 0)
1676		goto out;
1677
1678	ret = btrfs_write_check(iocb, i, ret);
1679	if (ret < 0)
1680		goto out;
1681
1682	pos = iocb->ki_pos;
1683	nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE),
1684			PAGE_SIZE / (sizeof(struct page *)));
1685	nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
1686	nrptrs = max(nrptrs, 8);
1687	pages = kmalloc_array(nrptrs, sizeof(struct page *), GFP_KERNEL);
1688	if (!pages) {
1689		ret = -ENOMEM;
1690		goto out;
1691	}
1692
1693	while (iov_iter_count(i) > 0) {
1694		struct extent_state *cached_state = NULL;
1695		size_t offset = offset_in_page(pos);
1696		size_t sector_offset;
1697		size_t write_bytes = min(iov_iter_count(i),
1698					 nrptrs * (size_t)PAGE_SIZE -
1699					 offset);
1700		size_t num_pages;
1701		size_t reserve_bytes;
1702		size_t dirty_pages;
1703		size_t copied;
1704		size_t dirty_sectors;
1705		size_t num_sectors;
1706		int extents_locked;
1707
1708		/*
1709		 * Fault pages before locking them in prepare_pages
1710		 * to avoid recursive lock
1711		 */
1712		if (unlikely(iov_iter_fault_in_readable(i, write_bytes))) {
1713			ret = -EFAULT;
1714			break;
1715		}
1716
1717		only_release_metadata = false;
1718		sector_offset = pos & (fs_info->sectorsize - 1);
1719
1720		extent_changeset_release(data_reserved);
1721		ret = btrfs_check_data_free_space(BTRFS_I(inode),
1722						  &data_reserved, pos,
1723						  write_bytes);
1724		if (ret < 0) {
1725			/*
1726			 * If we don't have to COW at the offset, reserve
1727			 * metadata only. write_bytes may get smaller than
1728			 * requested here.
1729			 */
1730			if (btrfs_check_nocow_lock(BTRFS_I(inode), pos,
1731						   &write_bytes) > 0)
1732				only_release_metadata = true;
1733			else
1734				break;
1735		}
1736
1737		num_pages = DIV_ROUND_UP(write_bytes + offset, PAGE_SIZE);
1738		WARN_ON(num_pages > nrptrs);
1739		reserve_bytes = round_up(write_bytes + sector_offset,
1740					 fs_info->sectorsize);
1741		WARN_ON(reserve_bytes == 0);
1742		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
1743				reserve_bytes);
1744		if (ret) {
1745			if (!only_release_metadata)
1746				btrfs_free_reserved_data_space(BTRFS_I(inode),
1747						data_reserved, pos,
1748						write_bytes);
1749			else
1750				btrfs_check_nocow_unlock(BTRFS_I(inode));
1751			break;
1752		}
1753
1754		release_bytes = reserve_bytes;
1755again:
1756		/*
1757		 * This is going to set up the pages array with the number of
1758		 * pages we want, so we don't really need to worry about the
1759		 * contents of pages from loop to loop
1760		 */
1761		ret = prepare_pages(inode, pages, num_pages,
1762				    pos, write_bytes,
1763				    force_page_uptodate);
1764		if (ret) {
1765			btrfs_delalloc_release_extents(BTRFS_I(inode),
1766						       reserve_bytes);
1767			break;
1768		}
1769
1770		extents_locked = lock_and_cleanup_extent_if_need(
1771				BTRFS_I(inode), pages,
1772				num_pages, pos, write_bytes, &lockstart,
1773				&lockend, &cached_state);
1774		if (extents_locked < 0) {
1775			if (extents_locked == -EAGAIN)
1776				goto again;
1777			btrfs_delalloc_release_extents(BTRFS_I(inode),
1778						       reserve_bytes);
1779			ret = extents_locked;
1780			break;
1781		}
1782
1783		copied = btrfs_copy_from_user(pos, write_bytes, pages, i);
1784
1785		num_sectors = BTRFS_BYTES_TO_BLKS(fs_info, reserve_bytes);
1786		dirty_sectors = round_up(copied + sector_offset,
1787					fs_info->sectorsize);
1788		dirty_sectors = BTRFS_BYTES_TO_BLKS(fs_info, dirty_sectors);
1789
1790		/*
1791		 * if we have trouble faulting in the pages, fall
1792		 * back to one page at a time
1793		 */
1794		if (copied < write_bytes)
1795			nrptrs = 1;
1796
1797		if (copied == 0) {
1798			force_page_uptodate = true;
1799			dirty_sectors = 0;
1800			dirty_pages = 0;
1801		} else {
1802			force_page_uptodate = false;
1803			dirty_pages = DIV_ROUND_UP(copied + offset,
1804						   PAGE_SIZE);
1805		}
1806
1807		if (num_sectors > dirty_sectors) {
1808			/* release everything except the sectors we dirtied */
1809			release_bytes -= dirty_sectors << fs_info->sectorsize_bits;
1810			if (only_release_metadata) {
1811				btrfs_delalloc_release_metadata(BTRFS_I(inode),
1812							release_bytes, true);
1813			} else {
1814				u64 __pos;
1815
1816				__pos = round_down(pos,
1817						   fs_info->sectorsize) +
1818					(dirty_pages << PAGE_SHIFT);
1819				btrfs_delalloc_release_space(BTRFS_I(inode),
1820						data_reserved, __pos,
1821						release_bytes, true);
1822			}
1823		}
1824
1825		release_bytes = round_up(copied + sector_offset,
1826					fs_info->sectorsize);
1827
1828		ret = btrfs_dirty_pages(BTRFS_I(inode), pages,
1829					dirty_pages, pos, copied,
1830					&cached_state, only_release_metadata);
1831
1832		/*
1833		 * If we have not locked the extent range, because the range's
1834		 * start offset is >= i_size, we might still have a non-NULL
1835		 * cached extent state, acquired while marking the extent range
1836		 * as delalloc through btrfs_dirty_pages(). Therefore free any
1837		 * possible cached extent state to avoid a memory leak.
1838		 */
1839		if (extents_locked)
1840			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
1841					     lockstart, lockend, &cached_state);
1842		else
1843			free_extent_state(cached_state);
1844
1845		btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
1846		if (ret) {
1847			btrfs_drop_pages(pages, num_pages);
1848			break;
1849		}
1850
1851		release_bytes = 0;
1852		if (only_release_metadata)
1853			btrfs_check_nocow_unlock(BTRFS_I(inode));
1854
1855		btrfs_drop_pages(pages, num_pages);
1856
1857		cond_resched();
1858
1859		balance_dirty_pages_ratelimited(inode->i_mapping);
1860
1861		pos += copied;
1862		num_written += copied;
1863	}
1864
1865	kfree(pages);
1866
1867	if (release_bytes) {
1868		if (only_release_metadata) {
1869			btrfs_check_nocow_unlock(BTRFS_I(inode));
1870			btrfs_delalloc_release_metadata(BTRFS_I(inode),
1871					release_bytes, true);
1872		} else {
1873			btrfs_delalloc_release_space(BTRFS_I(inode),
1874					data_reserved,
1875					round_down(pos, fs_info->sectorsize),
1876					release_bytes, true);
1877		}
1878	}
1879
1880	extent_changeset_free(data_reserved);
1881	if (num_written > 0) {
1882		pagecache_isize_extended(inode, old_isize, iocb->ki_pos);
1883		iocb->ki_pos += num_written;
1884	}
1885out:
1886	btrfs_inode_unlock(inode, ilock_flags);
1887	return num_written ? num_written : ret;
1888}
1889
1890static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
1891			       const struct iov_iter *iter, loff_t offset)
1892{
1893	const u32 blocksize_mask = fs_info->sectorsize - 1;
1894
1895	if (offset & blocksize_mask)
1896		return -EINVAL;
1897
1898	if (iov_iter_alignment(iter) & blocksize_mask)
1899		return -EINVAL;
1900
1901	return 0;
1902}
1903
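/*
 * Example of the check above, assuming a 4 KiB sectorsize (blocksize_mask =
 * 4095): a direct write at offset 8192 with 4 KiB aligned iovecs passes,
 * while offset 6144 fails because 6144 & 4095 == 2048, and the caller then
 * falls back to the buffered path in btrfs_direct_write().
 */
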
1904static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
1905{
1906	struct file *file = iocb->ki_filp;
1907	struct inode *inode = file_inode(file);
1908	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1909	loff_t pos;
1910	ssize_t written = 0;
1911	ssize_t written_buffered;
1912	loff_t endbyte;
1913	ssize_t err;
1914	unsigned int ilock_flags = 0;
1915	struct iomap_dio *dio = NULL;
1916
1917	if (iocb->ki_flags & IOCB_NOWAIT)
1918		ilock_flags |= BTRFS_ILOCK_TRY;
1919
1920	/* If the write DIO is within EOF, use a shared lock */
1921	if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode))
1922		ilock_flags |= BTRFS_ILOCK_SHARED;
1923
1924relock:
1925	err = btrfs_inode_lock(inode, ilock_flags);
1926	if (err < 0)
1927		return err;
1928
1929	err = generic_write_checks(iocb, from);
1930	if (err <= 0) {
1931		btrfs_inode_unlock(inode, ilock_flags);
1932		return err;
1933	}
1934
1935	err = btrfs_write_check(iocb, from, err);
1936	if (err < 0) {
1937		btrfs_inode_unlock(inode, ilock_flags);
1938		goto out;
1939	}
1940
1941	pos = iocb->ki_pos;
1942	/*
1943	 * Re-check since the file size may have changed just before taking the
1944	 * lock, or pos may have changed because of O_APPEND in generic_write_checks().
1945	 */
1946	if ((ilock_flags & BTRFS_ILOCK_SHARED) &&
1947	    pos + iov_iter_count(from) > i_size_read(inode)) {
1948		btrfs_inode_unlock(inode, ilock_flags);
1949		ilock_flags &= ~BTRFS_ILOCK_SHARED;
1950		goto relock;
1951	}
1952
1953	if (check_direct_IO(fs_info, from, pos)) {
1954		btrfs_inode_unlock(inode, ilock_flags);
1955		goto buffered;
1956	}
1957
1958	dio = __iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
1959			     0);
1960
1961	btrfs_inode_unlock(inode, ilock_flags);
1962
1963	if (IS_ERR_OR_NULL(dio)) {
1964		err = PTR_ERR_OR_ZERO(dio);
1965		if (err < 0 && err != -ENOTBLK)
1966			goto out;
1967	} else {
1968		written = iomap_dio_complete(dio);
1969	}
1970
1971	if (written < 0 || !iov_iter_count(from)) {
1972		err = written;
1973		goto out;
1974	}
1975
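	/*
	 * Either the direct write could not be attempted (unaligned) or it
	 * covered only part of the range: write the remainder through the
	 * page cache, then flush and invalidate it so a subsequent direct
	 * read sees the new data.
	 */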
1976buffered:
1977	pos = iocb->ki_pos;
1978	written_buffered = btrfs_buffered_write(iocb, from);
1979	if (written_buffered < 0) {
1980		err = written_buffered;
1981		goto out;
1982	}
1983	/*
1984	 * Ensure all data is persisted. We want the next direct IO read to be
1985	 * able to read what was just written.
1986	 */
1987	endbyte = pos + written_buffered - 1;
1988	err = btrfs_fdatawrite_range(inode, pos, endbyte);
1989	if (err)
1990		goto out;
1991	err = filemap_fdatawait_range(inode->i_mapping, pos, endbyte);
1992	if (err)
1993		goto out;
1994	written += written_buffered;
1995	iocb->ki_pos = pos + written_buffered;
1996	invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
1997				 endbyte >> PAGE_SHIFT);
1998out:
1999	return written ? written : err;
2000}
2001
2002static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
2003				    struct iov_iter *from)
2004{
2005	struct file *file = iocb->ki_filp;
2006	struct btrfs_inode *inode = BTRFS_I(file_inode(file));
2007	ssize_t num_written = 0;
2008	const bool sync = iocb->ki_flags & IOCB_DSYNC;
2009
2010	/*
2011	 * If the fs flips readonly due to some impossible error, although we
2012	 * have opened a file as writable, we have to stop this write operation
2013	 * to ensure consistency.
2014	 */
2015	if (test_bit(BTRFS_FS_STATE_ERROR, &inode->root->fs_info->fs_state))
2016		return -EROFS;
2017
2018	if (!(iocb->ki_flags & IOCB_DIRECT) &&
2019	    (iocb->ki_flags & IOCB_NOWAIT))
2020		return -EOPNOTSUPP;
2021
2022	if (sync)
2023		atomic_inc(&inode->sync_writers);
2024
2025	if (iocb->ki_flags & IOCB_DIRECT)
2026		num_written = btrfs_direct_write(iocb, from);
2027	else
2028		num_written = btrfs_buffered_write(iocb, from);
2029
2030	btrfs_set_inode_last_sub_trans(inode);
2031
2032	if (num_written > 0)
2033		num_written = generic_write_sync(iocb, num_written);
2034
2035	if (sync)
2036		atomic_dec(&inode->sync_writers);
2037
2038	current->backing_dev_info = NULL;
2039	return num_written;
2040}
2041
2042int btrfs_release_file(struct inode *inode, struct file *filp)
2043{
2044	struct btrfs_file_private *private = filp->private_data;
2045
2046	if (private)
2047		kfree(private->filldir_buf);
2048	kfree(private);
2049	filp->private_data = NULL;
2050
2051	/*
2052	 * Set by setattr when we are about to truncate a file from a non-zero
2053	 * size to a zero size.  This tries to flush down new bytes that may
2054	 * have been written if the application were using truncate to replace
2055	 * a file in place.
2056	 */
2057	if (test_and_clear_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
2058			       &BTRFS_I(inode)->runtime_flags))
2059		filemap_flush(inode->i_mapping);
2060	return 0;
2061}
2062
2063static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
2064{
2065	int ret;
2066	struct blk_plug plug;
2067
2068	/*
2069	 * This is only called in fsync, which would do synchronous writes, so
2070	 * a plug can merge adjacent IOs as much as possible.  Especially with
2071	 * multiple disks using a raid profile, a large IO can be split into
2072	 * several segments of stripe length (currently 64K).
2073	 */
2074	blk_start_plug(&plug);
2075	atomic_inc(&BTRFS_I(inode)->sync_writers);
2076	ret = btrfs_fdatawrite_range(inode, start, end);
2077	atomic_dec(&BTRFS_I(inode)->sync_writers);
2078	blk_finish_plug(&plug);
2079
2080	return ret;
2081}
2082
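/*
 * Return true if fsync can skip logging the inode: either the inode is
 * already in the current log tree with no ordered extents left to attach,
 * or its last modification happened in an already committed transaction
 * (see below for the fast fsync caveat).
 */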
2083static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
2084{
2085	struct btrfs_inode *inode = BTRFS_I(ctx->inode);
2086	struct btrfs_fs_info *fs_info = inode->root->fs_info;
2087
2088	if (btrfs_inode_in_log(inode, fs_info->generation) &&
2089	    list_empty(&ctx->ordered_extents))
2090		return true;
2091
2092	/*
2093	 * If we are doing a fast fsync we cannot bail out just because the
2094	 * inode's last_trans is <= the last committed transaction, because we
2095	 * only update the last_trans of the inode during ordered extent
2096	 * completion, and for a fast fsync we don't wait for that, we only
2097	 * wait for the writeback to complete.
2098	 */
2099	if (inode->last_trans <= fs_info->last_trans_committed &&
2100	    (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) ||
2101	     list_empty(&ctx->ordered_extents)))
2102		return true;
2103
2104	return false;
2105}
2106
2107/*
2108 * fsync call for both files and directories.  This logs the inode into
2109 * the tree log instead of forcing full commits whenever possible.
2110 *
2111 * It needs to call filemap_fdatawait so that all ordered extent updates
2112 * in the metadata btree are up to date for copying to the log.
2113 *
2114 * It drops the inode mutex before doing the tree log commit.  This is an
2115 * important optimization for directories because holding the mutex prevents
2116 * new operations on the dir while we write to disk.
2117 */
2118int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
2119{
2120	struct dentry *dentry = file_dentry(file);
2121	struct inode *inode = d_inode(dentry);
2122	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2123	struct btrfs_root *root = BTRFS_I(inode)->root;
2124	struct btrfs_trans_handle *trans;
2125	struct btrfs_log_ctx ctx;
2126	int ret = 0, err;
2127	u64 len;
2128	bool full_sync;
2129
2130	trace_btrfs_sync_file(file, datasync);
2131
2132	btrfs_init_log_ctx(&ctx, inode);
2133
2134	/*
2135	 * Always set the range to a full range, otherwise we can get into
2136	 * several problems, from missing file extent items to represent holes
2137	 * when not using the NO_HOLES feature, to log tree corruption due to
2138	 * races between hole detection during logging and completion of ordered
2139	 * extents outside the range, to missing checksums due to ordered extents
2140	 * for which we flushed only a subset of their pages.
2141	 */
2142	start = 0;
2143	end = LLONG_MAX;
2144	len = (u64)LLONG_MAX + 1;
2145
2146	/*
2147	 * We write the dirty pages in the range and wait until they complete
2148	 * outside of the ->i_mutex, so the flushing can be done by multiple
2149	 * tasks and improve performance.  See btrfs_wait_ordered_range for an
2150	 * explanation of the ASYNC check.
2151	 */
2152	ret = start_ordered_ops(inode, start, end);
2153	if (ret)
2154		goto out;
2155
2156	btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
2157
2158	atomic_inc(&root->log_batch);
2159
2160	/*
2161	 * Always check for the full sync flag while holding the inode's lock,
2162	 * to avoid races with other tasks. The flag must be either set all the
2163	 * time during logging or off all the time while logging.
2164	 */
2165	full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
2166			     &BTRFS_I(inode)->runtime_flags);
2167
2168	/*
2169	 * Before we acquired the inode's lock and the mmap lock, someone may
2170	 * have dirtied more pages in the target range. We need to make sure
2171	 * that writeback for any such pages does not start while we are logging
2172	 * the inode, because if it does, any of the following might happen when
2173	 * we are not doing a full inode sync:
2174	 *
2175	 * 1) We log an extent after its writeback finishes but before its
2176	 *    checksums are added to the csum tree, leading to -EIO errors
2177	 *    when attempting to read the extent after a log replay.
2178	 *
2179	 * 2) We can end up logging an extent before its writeback finishes.
2180	 *    Therefore after the log replay we will have a file extent item
2181	 *    pointing to an unwritten extent (and no data checksums as well).
2182	 *
2183	 * So trigger writeback for any eventual new dirty pages and then we
2184	 * wait for all ordered extents to complete below.
2185	 */
2186	ret = start_ordered_ops(inode, start, end);
2187	if (ret) {
2188		btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
2189		goto out;
2190	}
2191
2192	/*
2193	 * We have to do this here to avoid the priority inversion of waiting on
2194	 * IO of a lower priority task while holding a transaction open.
2195	 *
2196	 * For a full fsync we wait for the ordered extents to complete while
2197	 * for a fast fsync we wait just for writeback to complete, and then
2198	 * attach the ordered extents to the transaction so that a transaction
2199	 * commit waits for their completion, to avoid data loss if, after we
2200	 * fsync, the current transaction commits before the ordered extents
2201	 * complete and a power failure happens right after that.
2202	 *
2203	 * For zoned filesystems, if a write IO uses a ZONE_APPEND command, the
2204	 * logical address recorded in the ordered extent may change. We need
2205	 * to wait for the IO to stabilize the logical address.
2206	 */
2207	if (full_sync || btrfs_is_zoned(fs_info)) {
2208		ret = btrfs_wait_ordered_range(inode, start, len);
2209	} else {
2210		/*
2211		 * Get our ordered extents as soon as possible to avoid doing
2212		 * checksum lookups in the csum tree, and instead use the
2213		 * checksums attached to the ordered extents.
2214		 */
2215		btrfs_get_ordered_extents_for_logging(BTRFS_I(inode),
2216						      &ctx.ordered_extents);
2217		ret = filemap_fdatawait_range(inode->i_mapping, start, end);
2218	}
2219
2220	if (ret)
2221		goto out_release_extents;
2222
2223	atomic_inc(&root->log_batch);
2224
2225	smp_mb();
2226	if (skip_inode_logging(&ctx)) {
2227		/*
2228		 * We've had everything committed since the last time we were
2229		 * modified, so clear this flag in case it was set for whatever
2230		 * reason; it's no longer relevant.
2231		 */
2232		clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
2233			  &BTRFS_I(inode)->runtime_flags);
2234		/*
2235		 * An ordered extent might have started before and completed
2236		 * already with io errors, in which case the inode was not
2237		 * updated and we end up here. So check the inode's mapping
2238		 * for any errors that might have happened since we last
2239		 * called fsync.
2240		 */
2241		ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err);
2242		goto out_release_extents;
2243	}
2244
2245	/*
2246	 * We use start here because we will need to wait on the IO to complete
2247	 * in btrfs_sync_log, which could require joining a transaction (for
2248	 * example checking cross references in the nocow path).  If we use join
2249	 * here we could get into a situation where we're waiting on IO to
2250	 * happen that is blocked on a transaction trying to commit.  With start
2251	 * we inc the extwriter counter, so we wait for all extwriters to exit
2252	 * before we start blocking joiners.  This comment is to keep somebody
2253	 * from thinking they are super smart and changing this to
2254	 * btrfs_join_transaction *cough*Josef*cough*.
2255	 */
2256	trans = btrfs_start_transaction(root, 0);
2257	if (IS_ERR(trans)) {
2258		ret = PTR_ERR(trans);
2259		goto out_release_extents;
2260	}
2261	trans->in_fsync = true;
2262
2263	ret = btrfs_log_dentry_safe(trans, dentry, &ctx);
2264	btrfs_release_log_ctx_extents(&ctx);
2265	if (ret < 0) {
2266		/* Fallthrough and commit/free transaction. */
2267		ret = 1;
2268	}
2269
2270	/* we've logged all the items and now have a consistent
2271	 * version of the file in the log.  It is possible that
2272	 * someone will come in and modify the file, but that's
2273	 * fine because the log is consistent on disk, and we
2274	 * have references to all of the file's extents
2275	 *
2276	 * It is possible that someone will come in and log the
2277	 * file again, but that will end up using the synchronization
2278	 * inside btrfs_sync_log to keep things safe.
2279	 */
2280	btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
2281
2282	if (ret != BTRFS_NO_LOG_SYNC) {
2283		if (!ret) {
2284			ret = btrfs_sync_log(trans, root, &ctx);
2285			if (!ret) {
2286				ret = btrfs_end_transaction(trans);
2287				goto out;
2288			}
2289		}
2290		if (!full_sync) {
2291			ret = btrfs_wait_ordered_range(inode, start, len);
2292			if (ret) {
2293				btrfs_end_transaction(trans);
2294				goto out;
2295			}
2296		}
2297		ret = btrfs_commit_transaction(trans);
2298	} else {
2299		ret = btrfs_end_transaction(trans);
2300	}
2301out:
2302	ASSERT(list_empty(&ctx.list));
2303	err = file_check_and_advance_wb_err(file);
2304	if (!ret)
2305		ret = err;
2306	return ret > 0 ? -EIO : ret;
2307
2308out_release_extents:
2309	btrfs_release_log_ctx_extents(&ctx);
2310	btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
2311	goto out;
2312}
2313
2314static const struct vm_operations_struct btrfs_file_vm_ops = {
2315	.fault		= filemap_fault,
2316	.map_pages	= filemap_map_pages,
2317	.page_mkwrite	= btrfs_page_mkwrite,
2318};
2319
2320static int btrfs_file_mmap(struct file	*filp, struct vm_area_struct *vma)
2321{
2322	struct address_space *mapping = filp->f_mapping;
2323
2324	if (!mapping->a_ops->readpage)
2325		return -ENOEXEC;
2326
2327	file_accessed(filp);
2328	vma->vm_ops = &btrfs_file_vm_ops;
2329
2330	return 0;
2331}
2332
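/*
 * Return 1 if the file extent item at @slot is an existing hole (a regular
 * extent with a zero disk bytenr) contiguous with the range [start, end),
 * so that a new hole covering that range can be merged into it.
 */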
2333static int hole_mergeable(struct btrfs_inode *inode, struct extent_buffer *leaf,
2334			  int slot, u64 start, u64 end)
2335{
2336	struct btrfs_file_extent_item *fi;
2337	struct btrfs_key key;
2338
2339	if (slot < 0 || slot >= btrfs_header_nritems(leaf))
2340		return 0;
2341
2342	btrfs_item_key_to_cpu(leaf, &key, slot);
2343	if (key.objectid != btrfs_ino(inode) ||
2344	    key.type != BTRFS_EXTENT_DATA_KEY)
2345		return 0;
2346
2347	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
2348
2349	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
2350		return 0;
2351
2352	if (btrfs_file_extent_disk_bytenr(leaf, fi))
2353		return 0;
2354
2355	if (key.offset == end)
2356		return 1;
2357	if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start)
2358		return 1;
2359	return 0;
2360}
2361
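/*
 * Fill the hole [offset, end) left after dropping file extents: extend an
 * adjacent hole extent item or insert a new one (not needed with the
 * NO_HOLES feature), and insert an extent map for the hole so the fast
 * fsync path knows about it (forcing a full sync if that fails).
 */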
2362static int fill_holes(struct btrfs_trans_handle *trans,
2363		struct btrfs_inode *inode,
2364		struct btrfs_path *path, u64 offset, u64 end)
2365{
2366	struct btrfs_fs_info *fs_info = trans->fs_info;
2367	struct btrfs_root *root = inode->root;
2368	struct extent_buffer *leaf;
2369	struct btrfs_file_extent_item *fi;
2370	struct extent_map *hole_em;
2371	struct extent_map_tree *em_tree = &inode->extent_tree;
2372	struct btrfs_key key;
2373	int ret;
2374
2375	if (btrfs_fs_incompat(fs_info, NO_HOLES))
2376		goto out;
2377
2378	key.objectid = btrfs_ino(inode);
2379	key.type = BTRFS_EXTENT_DATA_KEY;
2380	key.offset = offset;
2381
2382	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2383	if (ret <= 0) {
2384		/*
2385		 * We should have dropped this offset, so if we find it then
2386		 * something has gone horribly wrong.
2387		 */
2388		if (ret == 0)
2389			ret = -EINVAL;
2390		return ret;
2391	}
2392
2393	leaf = path->nodes[0];
2394	if (hole_mergeable(inode, leaf, path->slots[0] - 1, offset, end)) {
2395		u64 num_bytes;
2396
2397		path->slots[0]--;
2398		fi = btrfs_item_ptr(leaf, path->slots[0],
2399				    struct btrfs_file_extent_item);
2400		num_bytes = btrfs_file_extent_num_bytes(leaf, fi) +
2401			end - offset;
2402		btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2403		btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
2404		btrfs_set_file_extent_offset(leaf, fi, 0);
2405		btrfs_mark_buffer_dirty(leaf);
2406		goto out;
2407	}
2408
2409	if (hole_mergeable(inode, leaf, path->slots[0], offset, end)) {
2410		u64 num_bytes;
2411
2412		key.offset = offset;
2413		btrfs_set_item_key_safe(fs_info, path, &key);
2414		fi = btrfs_item_ptr(leaf, path->slots[0],
2415				    struct btrfs_file_extent_item);
2416		num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end -
2417			offset;
2418		btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2419		btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
2420		btrfs_set_file_extent_offset(leaf, fi, 0);
2421		btrfs_mark_buffer_dirty(leaf);
2422		goto out;
2423	}
2424	btrfs_release_path(path);
2425
2426	ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode),
2427			offset, 0, 0, end - offset, 0, end - offset, 0, 0, 0);
2428	if (ret)
2429		return ret;
2430
2431out:
2432	btrfs_release_path(path);
2433
2434	hole_em = alloc_extent_map();
2435	if (!hole_em) {
2436		btrfs_drop_extent_cache(inode, offset, end - 1, 0);
2437		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
2438	} else {
2439		hole_em->start = offset;
2440		hole_em->len = end - offset;
2441		hole_em->ram_bytes = hole_em->len;
2442		hole_em->orig_start = offset;
2443
2444		hole_em->block_start = EXTENT_MAP_HOLE;
2445		hole_em->block_len = 0;
2446		hole_em->orig_block_len = 0;
2447		hole_em->compress_type = BTRFS_COMPRESS_NONE;
2448		hole_em->generation = trans->transid;
2449
2450		do {
2451			btrfs_drop_extent_cache(inode, offset, end - 1, 0);
2452			write_lock(&em_tree->lock);
2453			ret = add_extent_mapping(em_tree, hole_em, 1);
2454			write_unlock(&em_tree->lock);
2455		} while (ret == -EEXIST);
2456		free_extent_map(hole_em);
2457		if (ret)
2458			set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
2459					&inode->runtime_flags);
2460	}
2461
2462	return 0;
2463}
2464
2465/*
2466 * Find a hole extent on the given inode and change start/len to the end of
2467 * the hole extent (a hole/vacuum extent is one whose em->start <= start &&
2468 * em->start + em->len > start).
2469 * When a hole extent is found, return 1 and modify start/len.
2470 */
2471static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len)
2472{
2473	struct btrfs_fs_info *fs_info = inode->root->fs_info;
2474	struct extent_map *em;
2475	int ret = 0;
2476
2477	em = btrfs_get_extent(inode, NULL, 0,
2478			      round_down(*start, fs_info->sectorsize),
2479			      round_up(*len, fs_info->sectorsize));
2480	if (IS_ERR(em))
2481		return PTR_ERR(em);
2482
2483	/* Hole or vacuum extent (only exists in no-hole mode) */
2484	if (em->block_start == EXTENT_MAP_HOLE) {
2485		ret = 1;
2486		*len = em->start + em->len > *start + *len ?
2487		       0 : *start + *len - em->start - em->len;
2488		*start = em->start + em->len;
2489	}
2490	free_extent_map(em);
2491	return ret;
2492}
2493
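/*
 * Lock the extent range [lockstart, lockend] for a hole punch or zero range
 * operation: truncate the page cache and retry until no ordered extents
 * overlap the range and no pages have been faulted back in.
 */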
2494static int btrfs_punch_hole_lock_range(struct inode *inode,
2495				       const u64 lockstart,
2496				       const u64 lockend,
2497				       struct extent_state **cached_state)
2498{
2499	/*
2500	 * For the subpage case, if the range is not at a page boundary, we could
2501	 * have pages at the leading/trailing part of the range.
2502	 * This could lead to an endless loop since filemap_range_has_page()
2503	 * will always return true.
2504	 * So here we need to do extra page alignment for
2505	 * filemap_range_has_page().
2506	 */
2507	const u64 page_lockstart = round_up(lockstart, PAGE_SIZE);
2508	const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE) - 1;
2509
2510	while (1) {
2511		struct btrfs_ordered_extent *ordered;
2512		int ret;
2513
2514		truncate_pagecache_range(inode, lockstart, lockend);
2515
2516		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2517				 cached_state);
2518		ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode),
2519							    lockend);
2520
2521		/*
2522		 * We need to make sure we have no ordered extents in this range
2523		 * and nobody raced in and read a page in this range; if they did,
2524		 * we need to try again.
2525		 */
2526		if ((!ordered ||
2527		    (ordered->file_offset + ordered->num_bytes <= lockstart ||
2528		     ordered->file_offset > lockend)) &&
2529		     !filemap_range_has_page(inode->i_mapping,
2530					     page_lockstart, page_lockend)) {
2531			if (ordered)
2532				btrfs_put_ordered_extent(ordered);
2533			break;
2534		}
2535		if (ordered)
2536			btrfs_put_ordered_extent(ordered);
2537		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
2538				     lockend, cached_state);
2539		ret = btrfs_wait_ordered_range(inode, lockstart,
2540					       lockend - lockstart + 1);
2541		if (ret)
2542			return ret;
2543	}
2544	return 0;
2545}
2546
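/*
 * Insert the file extent item described by @extent_info at its current
 * file_offset, covering @replace_len bytes, and record a reference to the
 * underlying data extent. For holes no extent reference is taken, only the
 * inode's byte accounting is updated (and, when NO_HOLES is not enabled, a
 * hole extent item is inserted).
 */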
2547static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
2548				     struct btrfs_inode *inode,
2549				     struct btrfs_path *path,
2550				     struct btrfs_replace_extent_info *extent_info,
2551				     const u64 replace_len,
2552				     const u64 bytes_to_drop)
2553{
2554	struct btrfs_fs_info *fs_info = trans->fs_info;
2555	struct btrfs_root *root = inode->root;
2556	struct btrfs_file_extent_item *extent;
2557	struct extent_buffer *leaf;
2558	struct btrfs_key key;
2559	int slot;
2560	struct btrfs_ref ref = { 0 };
2561	int ret;
2562
2563	if (replace_len == 0)
2564		return 0;
2565
2566	if (extent_info->disk_offset == 0 &&
2567	    btrfs_fs_incompat(fs_info, NO_HOLES)) {
2568		btrfs_update_inode_bytes(inode, 0, bytes_to_drop);
2569		return 0;
2570	}
2571
2572	key.objectid = btrfs_ino(inode);
2573	key.type = BTRFS_EXTENT_DATA_KEY;
2574	key.offset = extent_info->file_offset;
2575	ret = btrfs_insert_empty_item(trans, root, path, &key,
2576				      sizeof(struct btrfs_file_extent_item));
2577	if (ret)
2578		return ret;
2579	leaf = path->nodes[0];
2580	slot = path->slots[0];
2581	write_extent_buffer(leaf, extent_info->extent_buf,
2582			    btrfs_item_ptr_offset(leaf, slot),
2583			    sizeof(struct btrfs_file_extent_item));
2584	extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
2585	ASSERT(btrfs_file_extent_type(leaf, extent) != BTRFS_FILE_EXTENT_INLINE);
2586	btrfs_set_file_extent_offset(leaf, extent, extent_info->data_offset);
2587	btrfs_set_file_extent_num_bytes(leaf, extent, replace_len);
2588	if (extent_info->is_new_extent)
2589		btrfs_set_file_extent_generation(leaf, extent, trans->transid);
2590	btrfs_mark_buffer_dirty(leaf);
2591	btrfs_release_path(path);
2592
2593	ret = btrfs_inode_set_file_extent_range(inode, extent_info->file_offset,
2594						replace_len);
2595	if (ret)
2596		return ret;
2597
2598	/* If it's a hole, nothing more needs to be done. */
2599	if (extent_info->disk_offset == 0) {
2600		btrfs_update_inode_bytes(inode, 0, bytes_to_drop);
2601		return 0;
2602	}
2603
2604	btrfs_update_inode_bytes(inode, replace_len, bytes_to_drop);
2605
2606	if (extent_info->is_new_extent && extent_info->insertions == 0) {
2607		key.objectid = extent_info->disk_offset;
2608		key.type = BTRFS_EXTENT_ITEM_KEY;
2609		key.offset = extent_info->disk_len;
2610		ret = btrfs_alloc_reserved_file_extent(trans, root,
2611						       btrfs_ino(inode),
2612						       extent_info->file_offset,
2613						       extent_info->qgroup_reserved,
2614						       &key);
2615	} else {
2616		u64 ref_offset;
2617
2618		btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
2619				       extent_info->disk_offset,
2620				       extent_info->disk_len, 0);
2621		ref_offset = extent_info->file_offset - extent_info->data_offset;
2622		btrfs_init_data_ref(&ref, root->root_key.objectid,
2623				    btrfs_ino(inode), ref_offset);
2624		ret = btrfs_inc_extent_ref(trans, &ref);
2625	}
2626
2627	extent_info->insertions++;
2628
2629	return ret;
2630}
2631
2632/*
2633 * The respective range must have been previously locked, as well as the inode.
2634 * The end offset is inclusive (last byte of the range).
2635 * @extent_info is NULL for fallocate's hole punching and non-NULL when replacing
2636 * the file range with an extent.
2637 * When not punching a hole, we don't want to end up in a state where we dropped
2638 * extents without inserting a new one, so we must abort the transaction to
2639 * avoid corruption.
2640 */
2641int btrfs_replace_file_extents(struct btrfs_inode *inode,
2642			       struct btrfs_path *path, const u64 start,
2643			       const u64 end,
2644			       struct btrfs_replace_extent_info *extent_info,
2645			       struct btrfs_trans_handle **trans_out)
2646{
2647	struct btrfs_drop_extents_args drop_args = { 0 };
2648	struct btrfs_root *root = inode->root;
2649	struct btrfs_fs_info *fs_info = root->fs_info;
2650	u64 min_size = btrfs_calc_insert_metadata_size(fs_info, 1);
2651	u64 ino_size = round_up(inode->vfs_inode.i_size, fs_info->sectorsize);
2652	struct btrfs_trans_handle *trans = NULL;
2653	struct btrfs_block_rsv *rsv;
2654	unsigned int rsv_count;
2655	u64 cur_offset;
2656	u64 len = end - start;
2657	int ret = 0;
2658
2659	if (end <= start)
2660		return -EINVAL;
2661
2662	rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
2663	if (!rsv) {
2664		ret = -ENOMEM;
2665		goto out;
2666	}
2667	rsv->size = btrfs_calc_insert_metadata_size(fs_info, 1);
2668	rsv->failfast = 1;
2669
2670	/*
2671	 * 1 - update the inode
2672	 * 1 - remove the extents in the range
2673	 * 1 - add the hole extent if no_holes isn't set or if we are
2674	 *     replacing the range with a new extent
2675	 */
2676	if (!btrfs_fs_incompat(fs_info, NO_HOLES) || extent_info)
2677		rsv_count = 3;
2678	else
2679		rsv_count = 2;
2680
2681	trans = btrfs_start_transaction(root, rsv_count);
2682	if (IS_ERR(trans)) {
2683		ret = PTR_ERR(trans);
2684		trans = NULL;
2685		goto out_free;
2686	}
2687
2688	ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
2689				      min_size, false);
2690	BUG_ON(ret);
2691	trans->block_rsv = rsv;
2692
2693	cur_offset = start;
2694	drop_args.path = path;
2695	drop_args.end = end + 1;
2696	drop_args.drop_cache = true;
2697	while (cur_offset < end) {
2698		drop_args.start = cur_offset;
2699		ret = btrfs_drop_extents(trans, root, inode, &drop_args);
2700		/* If we are punching a hole decrement the inode's byte count */
2701		if (!extent_info)
2702			btrfs_update_inode_bytes(inode, 0,
2703						 drop_args.bytes_found);
2704		if (ret != -ENOSPC) {
2705			/*
2706			 * When cloning we want to avoid transaction aborts when
2707			 * nothing was done and we are attempting to clone parts
2708			 * of inline extents; in such cases -EOPNOTSUPP is
2709			 * returned by __btrfs_drop_extents() without having
2710			 * changed anything in the file.
2711			 */
2712			if (extent_info && !extent_info->is_new_extent &&
2713			    ret && ret != -EOPNOTSUPP)
2714				btrfs_abort_transaction(trans, ret);
2715			break;
2716		}
2717
2718		trans->block_rsv = &fs_info->trans_block_rsv;
2719
2720		if (!extent_info && cur_offset < drop_args.drop_end &&
2721		    cur_offset < ino_size) {
2722			ret = fill_holes(trans, inode, path, cur_offset,
2723					 drop_args.drop_end);
2724			if (ret) {
2725				/*
2726				 * If we failed then we didn't insert our hole
2727				 * entries for the area we dropped, which leaves the
2728				 * fs corrupted, so we must abort the
2729				 * transaction.
2730				 */
2731				btrfs_abort_transaction(trans, ret);
2732				break;
2733			}
2734		} else if (!extent_info && cur_offset < drop_args.drop_end) {
2735			/*
2736			 * We are past the i_size here, but since we didn't
2737			 * insert holes we need to clear the mapped area so we
2738			 * know to not set disk_i_size in this area until a new
2739			 * file extent is inserted here.
2740			 */
2741			ret = btrfs_inode_clear_file_extent_range(inode,
2742					cur_offset,
2743					drop_args.drop_end - cur_offset);
2744			if (ret) {
2745				/*
2746				 * We couldn't clear our area, so we could
2747				 * presumably adjust up and corrupt the fs, so
2748				 * we need to abort.
2749				 */
2750				btrfs_abort_transaction(trans, ret);
2751				break;
2752			}
2753		}
2754
2755		if (extent_info &&
2756		    drop_args.drop_end > extent_info->file_offset) {
2757			u64 replace_len = drop_args.drop_end -
2758					  extent_info->file_offset;
2759
2760			ret = btrfs_insert_replace_extent(trans, inode,	path,
2761					extent_info, replace_len,
2762					drop_args.bytes_found);
2763			if (ret) {
2764				btrfs_abort_transaction(trans, ret);
2765				break;
2766			}
2767			extent_info->data_len -= replace_len;
2768			extent_info->data_offset += replace_len;
2769			extent_info->file_offset += replace_len;
2770		}
2771
2772		ret = btrfs_update_inode(trans, root, inode);
2773		if (ret)
2774			break;
2775
2776		btrfs_end_transaction(trans);
2777		btrfs_btree_balance_dirty(fs_info);
2778
2779		trans = btrfs_start_transaction(root, rsv_count);
2780		if (IS_ERR(trans)) {
2781			ret = PTR_ERR(trans);
2782			trans = NULL;
2783			break;
2784		}
2785
2786		ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
2787					      rsv, min_size, false);
2788		BUG_ON(ret);	/* shouldn't happen */
2789		trans->block_rsv = rsv;
2790
2791		cur_offset = drop_args.drop_end;
2792		len = end - cur_offset;
2793		if (!extent_info && len) {
2794			ret = find_first_non_hole(inode, &cur_offset, &len);
2795			if (unlikely(ret < 0))
2796				break;
2797			if (ret && !len) {
2798				ret = 0;
2799				break;
2800			}
2801		}
2802	}
2803
2804	/*
2805	 * If we were cloning, force the next fsync to be a full one since we
2806	 * replaced (or just dropped in the case of cloning holes when
2807	 * NO_HOLES is enabled) file extent items and did not set up new extent
2808	 * maps for the replacement extents (or holes).
2809	 */
2810	if (extent_info && !extent_info->is_new_extent)
2811		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
2812
2813	if (ret)
2814		goto out_trans;
2815
2816	trans->block_rsv = &fs_info->trans_block_rsv;
2817	/*
2818	 * If we are using the NO_HOLES feature we might already have had a
2819	 * hole that overlaps a part of the region [lockstart, lockend] and
2820	 * ends at (or beyond) lockend. Since we have no file extent items to
2821	 * represent holes, drop_end can be less than lockend and so we must
2822	 * make sure we have an extent map representing the existing hole (the
2823	 * call to __btrfs_drop_extents() might have dropped the existing extent
2824	 * map representing the existing hole), otherwise the fast fsync path
2825	 * will not record the existence of the hole region
2826	 * [existing_hole_start, lockend].
2827	 */
2828	if (drop_args.drop_end <= end)
2829		drop_args.drop_end = end + 1;
2830	/*
2831	 * Don't insert a file hole extent item if it's for a range beyond EOF
2832	 * (because it's useless) or if it represents a zero-byte range (when
2833	 * cur_offset == drop_end).
2834	 */
2835	if (!extent_info && cur_offset < ino_size &&
2836	    cur_offset < drop_args.drop_end) {
2837		ret = fill_holes(trans, inode, path, cur_offset,
2838				 drop_args.drop_end);
2839		if (ret) {
2840			/* Same comment as above. */
2841			btrfs_abort_transaction(trans, ret);
2842			goto out_trans;
2843		}
2844	} else if (!extent_info && cur_offset < drop_args.drop_end) {
2845		/* See the comment in the loop above for the reasoning here. */
2846		ret = btrfs_inode_clear_file_extent_range(inode, cur_offset,
2847					drop_args.drop_end - cur_offset);
2848		if (ret) {
2849			btrfs_abort_transaction(trans, ret);
2850			goto out_trans;
2851		}
2852
2853	}
2854	if (extent_info) {
2855		ret = btrfs_insert_replace_extent(trans, inode, path,
2856				extent_info, extent_info->data_len,
2857				drop_args.bytes_found);
2858		if (ret) {
2859			btrfs_abort_transaction(trans, ret);
2860			goto out_trans;
2861		}
2862	}
2863
2864out_trans:
2865	if (!trans)
2866		goto out_free;
2867
2868	trans->block_rsv = &fs_info->trans_block_rsv;
2869	if (ret)
2870		btrfs_end_transaction(trans);
2871	else
2872		*trans_out = trans;
2873out_free:
2874	btrfs_free_block_rsv(fs_info, rsv);
2875out:
2876	return ret;
2877}
2878
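/*
 * Punch a hole in the range [offset, offset + len): zero any partial blocks
 * at the edges, then drop the covered file extents and insert hole items
 * (and extent maps) through btrfs_replace_file_extents().
 */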
2879static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2880{
2881	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2882	struct btrfs_root *root = BTRFS_I(inode)->root;
2883	struct extent_state *cached_state = NULL;
2884	struct btrfs_path *path;
2885	struct btrfs_trans_handle *trans = NULL;
2886	u64 lockstart;
2887	u64 lockend;
2888	u64 tail_start;
2889	u64 tail_len;
2890	u64 orig_start = offset;
2891	int ret = 0;
2892	bool same_block;
2893	u64 ino_size;
2894	bool truncated_block = false;
2895	bool updated_inode = false;
2896
2897	ret = btrfs_wait_ordered_range(inode, offset, len);
2898	if (ret)
2899		return ret;
2900
2901	btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
2902	ino_size = round_up(inode->i_size, fs_info->sectorsize);
2903	ret = find_first_non_hole(BTRFS_I(inode), &offset, &len);
2904	if (ret < 0)
2905		goto out_only_mutex;
2906	if (ret && !len) {
2907		/* Already in a large hole */
2908		ret = 0;
2909		goto out_only_mutex;
2910	}
2911
2912	lockstart = round_up(offset, btrfs_inode_sectorsize(BTRFS_I(inode)));
2913	lockend = round_down(offset + len,
2914			     btrfs_inode_sectorsize(BTRFS_I(inode))) - 1;
2915	same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset))
2916		== (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1));
2917	/*
2918	 * We needn't truncate any block which is beyond the end of the file
2919	 * because we are sure there is no data there. Only do this if we are
2920	 * in the same block and we aren't doing the entire block.
2921	 */
2925	if (same_block && len < fs_info->sectorsize) {
2926		if (offset < ino_size) {
2927			truncated_block = true;
2928			ret = btrfs_truncate_block(BTRFS_I(inode), offset, len,
2929						   0);
2930		} else {
2931			ret = 0;
2932		}
2933		goto out_only_mutex;
2934	}
2935
2936	/* zero back part of the first block */
2937	if (offset < ino_size) {
2938		truncated_block = true;
2939		ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0);
2940		if (ret) {
2941			btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
2942			return ret;
2943		}
2944	}
2945
2946	/* Check the aligned pages after the first unaligned page; if
2947	 * offset != orig_start, the first unaligned page and several
2948	 * following pages are already in holes, so the extra check can
2949	 * be skipped. */
2950	if (offset == orig_start) {
2951		/* after truncate page, check hole again */
2952		len = offset + len - lockstart;
2953		offset = lockstart;
2954		ret = find_first_non_hole(BTRFS_I(inode), &offset, &len);
2955		if (ret < 0)
2956			goto out_only_mutex;
2957		if (ret && !len) {
2958			ret = 0;
2959			goto out_only_mutex;
2960		}
2961		lockstart = offset;
2962	}
2963
2964	/* Check the tail unaligned part is in a hole */
2965	tail_start = lockend + 1;
2966	tail_len = offset + len - tail_start;
2967	if (tail_len) {
2968		ret = find_first_non_hole(BTRFS_I(inode), &tail_start, &tail_len);
2969		if (unlikely(ret < 0))
2970			goto out_only_mutex;
2971		if (!ret) {
2972			/* zero the front end of the last page */
2973			if (tail_start + tail_len < ino_size) {
2974				truncated_block = true;
2975				ret = btrfs_truncate_block(BTRFS_I(inode),
2976							tail_start + tail_len,
2977							0, 1);
2978				if (ret)
2979					goto out_only_mutex;
2980			}
2981		}
2982	}
2983
2984	if (lockend < lockstart) {
2985		ret = 0;
2986		goto out_only_mutex;
2987	}
2988
2989	ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
2990					  &cached_state);
2991	if (ret)
2992		goto out_only_mutex;
2993
2994	path = btrfs_alloc_path();
2995	if (!path) {
2996		ret = -ENOMEM;
2997		goto out;
2998	}
2999
3000	ret = btrfs_replace_file_extents(BTRFS_I(inode), path, lockstart,
3001					 lockend, NULL, &trans);
3002	btrfs_free_path(path);
3003	if (ret)
3004		goto out;
3005
3006	ASSERT(trans != NULL);
3007	inode_inc_iversion(inode);
3008	inode->i_mtime = inode->i_ctime = current_time(inode);
3009	ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
3010	updated_inode = true;
3011	btrfs_end_transaction(trans);
3012	btrfs_btree_balance_dirty(fs_info);
3013out:
3014	unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
3015			     &cached_state);
3016out_only_mutex:
3017	if (!updated_inode && truncated_block && !ret) {
3018		/*
3019		 * If we only end up zeroing part of a page, we still need to
3020		 * update the inode item, so that all the time fields are
3021		 * updated as well as the necessary btrfs inode in memory fields
3022		 * for detecting, at fsync time, if the inode isn't yet in the
3023		 * log tree or it's there but not up to date.
3024		 */
3025		struct timespec64 now = current_time(inode);
3026
3027		inode_inc_iversion(inode);
3028		inode->i_mtime = now;
3029		inode->i_ctime = now;
3030		trans = btrfs_start_transaction(root, 1);
3031		if (IS_ERR(trans)) {
3032			ret = PTR_ERR(trans);
3033		} else {
3034			int ret2;
3035
3036			ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
3037			ret2 = btrfs_end_transaction(trans);
3038			if (!ret)
3039				ret = ret2;
3040		}
3041	}
3042	btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
3043	return ret;
3044}
3045
3046/* Helper structure to record which range is already reserved */
3047struct falloc_range {
3048	struct list_head list;
3049	u64 start;
3050	u64 len;
3051};
3052
3053/*
3054 * Helper function to add a falloc range
3055 *
3056 * The caller should have locked a larger extent range containing
3057 * [start, start + len)
3058 */
3059static int add_falloc_range(struct list_head *head, u64 start, u64 len)
3060{
3061	struct falloc_range *range = NULL;
3062
3063	if (!list_empty(head)) {
3064		/*
3065		 * As fallocate iterates by bytenr order, we only need to check
3066		 * the last range.
3067		 */
3068		range = list_last_entry(head, struct falloc_range, list);
3069		if (range->start + range->len == start) {
3070			range->len += len;
3071			return 0;
3072		}
3073	}
3074
3075	range = kmalloc(sizeof(*range), GFP_KERNEL);
3076	if (!range)
3077		return -ENOMEM;
3078	range->start = start;
3079	range->len = len;
3080	list_add_tail(&range->list, head);
3081	return 0;
3082}
3083
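/*
 * Extend i_size (and the on-disk inode item) to @end after an fallocate,
 * unless FALLOC_FL_KEEP_SIZE was given or the file is already large enough.
 */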
3084static int btrfs_fallocate_update_isize(struct inode *inode,
3085					const u64 end,
3086					const int mode)
3087{
3088	struct btrfs_trans_handle *trans;
3089	struct btrfs_root *root = BTRFS_I(inode)->root;
3090	int ret;
3091	int ret2;
3092
3093	if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode))
3094		return 0;
3095
3096	trans = btrfs_start_transaction(root, 1);
3097	if (IS_ERR(trans))
3098		return PTR_ERR(trans);
3099
3100	inode->i_ctime = current_time(inode);
3101	i_size_write(inode, end);
3102	btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
3103	ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
3104	ret2 = btrfs_end_transaction(trans);
3105
3106	return ret ? ret : ret2;
3107}
3108
3109enum {
3110	RANGE_BOUNDARY_WRITTEN_EXTENT,
3111	RANGE_BOUNDARY_PREALLOC_EXTENT,
3112	RANGE_BOUNDARY_HOLE,
3113};
3114
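/*
 * Classify the block containing @offset as a hole, a prealloc extent or a
 * written extent, so the caller knows whether a zero range boundary needs
 * allocation or just in-place block zeroing.
 */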
3115static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode,
3116						 u64 offset)
3117{
3118	const u64 sectorsize = btrfs_inode_sectorsize(inode);
3119	struct extent_map *em;
3120	int ret;
3121
3122	offset = round_down(offset, sectorsize);
3123	em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize);
3124	if (IS_ERR(em))
3125		return PTR_ERR(em);
3126
3127	if (em->block_start == EXTENT_MAP_HOLE)
3128		ret = RANGE_BOUNDARY_HOLE;
3129	else if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
3130		ret = RANGE_BOUNDARY_PREALLOC_EXTENT;
3131	else
3132		ret = RANGE_BOUNDARY_WRITTEN_EXTENT;
3133
3134	free_extent_map(em);
3135	return ret;
3136}
3137
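/*
 * Implement FALLOC_FL_ZERO_RANGE: reuse an existing prealloc extent where
 * possible, zero partial blocks at the range boundaries and preallocate an
 * unwritten extent for whatever remains.
 */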
3138static int btrfs_zero_range(struct inode *inode,
3139			    loff_t offset,
3140			    loff_t len,
3141			    const int mode)
3142{
3143	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
3144	struct extent_map *em;
3145	struct extent_changeset *data_reserved = NULL;
3146	int ret;
3147	u64 alloc_hint = 0;
3148	const u64 sectorsize = btrfs_inode_sectorsize(BTRFS_I(inode));
3149	u64 alloc_start = round_down(offset, sectorsize);
3150	u64 alloc_end = round_up(offset + len, sectorsize);
3151	u64 bytes_to_reserve = 0;
3152	bool space_reserved = false;
3153
3154	inode_dio_wait(inode);
3155
3156	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start,
3157			      alloc_end - alloc_start);
3158	if (IS_ERR(em)) {
3159		ret = PTR_ERR(em);
3160		goto out;
3161	}
3162
3163	/*
3164	 * Avoid hole punching and extent allocation for some cases. More cases
3165	 * could be considered, but these are unlikely common and we keep things
3166	 * as simple as possible for now. Also, intentionally, if the target
3167	 * range contains one or more prealloc extents together with regular
3168	 * extents and holes, we drop all the existing extents and allocate a
3169	 * new prealloc extent, so that we get a larger contiguous disk extent.
3170	 */
3171	if (em->start <= alloc_start &&
3172	    test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
3173		const u64 em_end = em->start + em->len;
3174
3175		if (em_end >= offset + len) {
3176			/*
3177			 * The whole range is already a prealloc extent,
3178			 * do nothing except updating the inode's i_size if
3179			 * needed.
3180			 */
3181			free_extent_map(em);
3182			ret = btrfs_fallocate_update_isize(inode, offset + len,
3183							   mode);
3184			goto out;
3185		}
3186		/*
3187		 * Part of the range is already a prealloc extent, so operate
3188		 * only on the remaining part of the range.
3189		 */
3190		alloc_start = em_end;
3191		ASSERT(IS_ALIGNED(alloc_start, sectorsize));
3192		len = offset + len - alloc_start;
3193		offset = alloc_start;
3194		alloc_hint = em->block_start + em->len;
3195	}
3196	free_extent_map(em);
3197
3198	if (BTRFS_BYTES_TO_BLKS(fs_info, offset) ==
3199	    BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) {
3200		em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start,
3201				      sectorsize);
3202		if (IS_ERR(em)) {
3203			ret = PTR_ERR(em);
3204			goto out;
3205		}
3206
3207		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
3208			free_extent_map(em);
3209			ret = btrfs_fallocate_update_isize(inode, offset + len,
3210							   mode);
3211			goto out;
3212		}
3213		if (len < sectorsize && em->block_start != EXTENT_MAP_HOLE) {
3214			free_extent_map(em);
3215			ret = btrfs_truncate_block(BTRFS_I(inode), offset, len,
3216						   0);
3217			if (!ret)
3218				ret = btrfs_fallocate_update_isize(inode,
3219								   offset + len,
3220								   mode);
3221			return ret;
3222		}
3223		free_extent_map(em);
3224		alloc_start = round_down(offset, sectorsize);
3225		alloc_end = alloc_start + sectorsize;
3226		goto reserve_space;
3227	}
3228
3229	alloc_start = round_up(offset, sectorsize);
3230	alloc_end = round_down(offset + len, sectorsize);
3231
3232	/*
3233	 * For unaligned ranges, check the pages at the boundaries, they might
3234	 * map to an extent, in which case we need to partially zero them, or
3235	 * they might map to a hole, in which case we need our allocation range
3236	 * to cover them.
3237	 */
3238	if (!IS_ALIGNED(offset, sectorsize)) {
3239		ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
3240							    offset);
3241		if (ret < 0)
3242			goto out;
3243		if (ret == RANGE_BOUNDARY_HOLE) {
3244			alloc_start = round_down(offset, sectorsize);
3245			ret = 0;
3246		} else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
3247			ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0);
3248			if (ret)
3249				goto out;
3250		} else {
3251			ret = 0;
3252		}
3253	}
3254
3255	if (!IS_ALIGNED(offset + len, sectorsize)) {
3256		ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
3257							    offset + len);
3258		if (ret < 0)
3259			goto out;
3260		if (ret == RANGE_BOUNDARY_HOLE) {
3261			alloc_end = round_up(offset + len, sectorsize);
3262			ret = 0;
3263		} else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
3264			ret = btrfs_truncate_block(BTRFS_I(inode), offset + len,
3265						   0, 1);
3266			if (ret)
3267				goto out;
3268		} else {
3269			ret = 0;
3270		}
3271	}
3272
3273reserve_space:
3274	if (alloc_start < alloc_end) {
3275		struct extent_state *cached_state = NULL;
3276		const u64 lockstart = alloc_start;
3277		const u64 lockend = alloc_end - 1;
3278
3279		bytes_to_reserve = alloc_end - alloc_start;
3280		ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
3281						      bytes_to_reserve);
3282		if (ret < 0)
3283			goto out;
3284		space_reserved = true;
3285		ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
3286						  &cached_state);
3287		if (ret)
3288			goto out;
3289		ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved,
3290						alloc_start, bytes_to_reserve);
3291		if (ret) {
3292			unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
3293					     lockend, &cached_state);
3294			goto out;
3295		}
3296		ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
3297						alloc_end - alloc_start,
3298						i_blocksize(inode),
3299						offset + len, &alloc_hint);
3300		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
3301				     lockend, &cached_state);
3302		/* btrfs_prealloc_file_range releases reserved space on error */
3303		if (ret) {
3304			space_reserved = false;
3305			goto out;
3306		}
3307	}
3308	ret = btrfs_fallocate_update_isize(inode, offset + len, mode);
3309 out:
3310	if (ret && space_reserved)
3311		btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved,
3312					       alloc_start, bytes_to_reserve);
3313	extent_changeset_free(data_reserved);
3314
3315	return ret;
3316}
3317
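/*
 * fallocate entry point. Hole punching and zero range have dedicated
 * helpers; the default mode reserves data space, preallocates unwritten
 * extents for the holes in the requested range and finally updates i_size.
 */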
3318static long btrfs_fallocate(struct file *file, int mode,
3319			    loff_t offset, loff_t len)
3320{
3321	struct inode *inode = file_inode(file);
3322	struct extent_state *cached_state = NULL;
3323	struct extent_changeset *data_reserved = NULL;
3324	struct falloc_range *range;
3325	struct falloc_range *tmp;
3326	struct list_head reserve_list;
3327	u64 cur_offset;
3328	u64 last_byte;
3329	u64 alloc_start;
3330	u64 alloc_end;
3331	u64 alloc_hint = 0;
3332	u64 locked_end;
3333	u64 actual_end = 0;
3334	struct extent_map *em;
3335	int blocksize = btrfs_inode_sectorsize(BTRFS_I(inode));
3336	int ret;
3337
3338	/* Do not allow fallocate in ZONED mode */
3339	if (btrfs_is_zoned(btrfs_sb(inode->i_sb)))
3340		return -EOPNOTSUPP;
3341
3342	alloc_start = round_down(offset, blocksize);
3343	alloc_end = round_up(offset + len, blocksize);
3344	cur_offset = alloc_start;
3345
3346	/* Make sure we aren't being given some crap mode */
3347	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
3348		     FALLOC_FL_ZERO_RANGE))
3349		return -EOPNOTSUPP;
3350
3351	if (mode & FALLOC_FL_PUNCH_HOLE)
3352		return btrfs_punch_hole(inode, offset, len);
3353
3354	/*
3355	 * Only trigger disk allocation, don't trigger qgroup reservation
3356	 *
3357	 * Qgroup space will be checked and reserved later.
3358	 */
3359	if (!(mode & FALLOC_FL_ZERO_RANGE)) {
3360		ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
3361						      alloc_end - alloc_start);
3362		if (ret < 0)
3363			return ret;
3364	}
3365
3366	btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
3367
3368	if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) {
3369		ret = inode_newsize_ok(inode, offset + len);
3370		if (ret)
3371			goto out;
3372	}
3373
3374	/*
3375	 * TODO: Move these two operations after we have checked
3376	 * accurate reserved space, otherwise fallocate can still fail
3377	 * but leave the page cache truncated or the size expanded.
3378	 *
3379	 * That's a minor problem, though, and won't do much harm.
3380	 */
3381	if (alloc_start > inode->i_size) {
3382		ret = btrfs_cont_expand(BTRFS_I(inode), i_size_read(inode),
3383					alloc_start);
3384		if (ret)
3385			goto out;
3386	} else if (offset + len > inode->i_size) {
3387		/*
3388		 * If we are fallocating from the end of the file onward we
3389		 * need to zero out the end of the block if i_size lands in the
3390		 * middle of a block.
3391		 */
3392		ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size, 0, 0);
3393		if (ret)
3394			goto out;
3395	}
3396
3397	/*
3398	 * wait for ordered IO before we have any locks.  We'll loop again
3399	 * below with the locks held.
3400	 */
3401	ret = btrfs_wait_ordered_range(inode, alloc_start,
3402				       alloc_end - alloc_start);
3403	if (ret)
3404		goto out;
3405
3406	if (mode & FALLOC_FL_ZERO_RANGE) {
3407		ret = btrfs_zero_range(inode, offset, len, mode);
3408		btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
3409		return ret;
3410	}
3411
3412	locked_end = alloc_end - 1;
3413	while (1) {
3414		struct btrfs_ordered_extent *ordered;
3415
3416		/* the extent lock is ordered inside the running
3417		 * transaction
3418		 */
3419		lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
3420				 locked_end, &cached_state);
3421		ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode),
3422							    locked_end);
3423
3424		if (ordered &&
3425		    ordered->file_offset + ordered->num_bytes > alloc_start &&
3426		    ordered->file_offset < alloc_end) {
3427			btrfs_put_ordered_extent(ordered);
3428			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
3429					     alloc_start, locked_end,
3430					     &cached_state);
3431			/*
3432			 * we can't wait on the range with the transaction
3433			 * running or with the extent lock held
3434			 */
3435			ret = btrfs_wait_ordered_range(inode, alloc_start,
3436						       alloc_end - alloc_start);
3437			if (ret)
3438				goto out;
3439		} else {
3440			if (ordered)
3441				btrfs_put_ordered_extent(ordered);
3442			break;
3443		}
3444	}
3445
3446	/* First, check if we exceed the qgroup limit */
3447	INIT_LIST_HEAD(&reserve_list);
3448	while (cur_offset < alloc_end) {
3449		em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset,
3450				      alloc_end - cur_offset);
3451		if (IS_ERR(em)) {
3452			ret = PTR_ERR(em);
3453			break;
3454		}
3455		last_byte = min(extent_map_end(em), alloc_end);
3456		actual_end = min_t(u64, extent_map_end(em), offset + len);
3457		last_byte = ALIGN(last_byte, blocksize);
3458		if (em->block_start == EXTENT_MAP_HOLE ||
3459		    (cur_offset >= inode->i_size &&
3460		     !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
3461			ret = add_falloc_range(&reserve_list, cur_offset,
3462					       last_byte - cur_offset);
3463			if (ret < 0) {
3464				free_extent_map(em);
3465				break;
3466			}
3467			ret = btrfs_qgroup_reserve_data(BTRFS_I(inode),
3468					&data_reserved, cur_offset,
3469					last_byte - cur_offset);
3470			if (ret < 0) {
3471				cur_offset = last_byte;
3472				free_extent_map(em);
3473				break;
3474			}
3475		} else {
3476			/*
3477			 * We do not need to reserve an unwritten extent for this
3478			 * range, so free the reserved data space first, otherwise
3479			 * it'll result in a false ENOSPC error.
3480			 */
3481			btrfs_free_reserved_data_space(BTRFS_I(inode),
3482				data_reserved, cur_offset,
3483				last_byte - cur_offset);
3484		}
3485		free_extent_map(em);
3486		cur_offset = last_byte;
3487	}
3488
3489	/*
3490	 * If ret is still 0, we're OK to fallocate.
3491	 * Otherwise just clean up the list and exit.
3492	 */
3493	list_for_each_entry_safe(range, tmp, &reserve_list, list) {
3494		if (!ret)
3495			ret = btrfs_prealloc_file_range(inode, mode,
3496					range->start,
3497					range->len, i_blocksize(inode),
3498					offset + len, &alloc_hint);
3499		else
3500			btrfs_free_reserved_data_space(BTRFS_I(inode),
3501					data_reserved, range->start,
3502					range->len);
3503		list_del(&range->list);
3504		kfree(range);
3505	}
3506	if (ret < 0)
3507		goto out_unlock;
3508
3509	/*
3510	 * We didn't need to allocate any more space, but we still extended the
3511	 * size of the file so we need to update i_size and the inode item.
3512	 */
3513	ret = btrfs_fallocate_update_isize(inode, actual_end, mode);
3514out_unlock:
3515	unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
3516			     &cached_state);
3517out:
3518	btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
3519	/* Let go of our reservation. */
3520	if (ret != 0 && !(mode & FALLOC_FL_ZERO_RANGE))
3521		btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved,
3522				cur_offset, alloc_end - cur_offset);
3523	extent_changeset_free(data_reserved);
3524	return ret;
3525}
3526
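/*
 * Implement SEEK_DATA/SEEK_HOLE by walking the extent maps from @offset up
 * to i_size, treating prealloc extents as holes.
 */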
3527static loff_t find_desired_extent(struct btrfs_inode *inode, loff_t offset,
3528				  int whence)
3529{
3530	struct btrfs_fs_info *fs_info = inode->root->fs_info;
3531	struct extent_map *em = NULL;
3532	struct extent_state *cached_state = NULL;
3533	loff_t i_size = inode->vfs_inode.i_size;
3534	u64 lockstart;
3535	u64 lockend;
3536	u64 start;
3537	u64 len;
3538	int ret = 0;
3539
3540	if (i_size == 0 || offset >= i_size)
3541		return -ENXIO;
3542
3543	/*
3544	 * offset can be negative; in this case we start finding DATA/HOLE from
3545	 * the very start of the file.
3546	 */
3547	start = max_t(loff_t, 0, offset);
3548
3549	lockstart = round_down(start, fs_info->sectorsize);
3550	lockend = round_up(i_size, fs_info->sectorsize);
3551	if (lockend <= lockstart)
3552		lockend = lockstart + fs_info->sectorsize;
3553	lockend--;
3554	len = lockend - lockstart + 1;
3555
3556	lock_extent_bits(&inode->io_tree, lockstart, lockend, &cached_state);
3557
3558	while (start < i_size) {
3559		em = btrfs_get_extent_fiemap(inode, start, len);
3560		if (IS_ERR(em)) {
3561			ret = PTR_ERR(em);
3562			em = NULL;
3563			break;
3564		}
3565
3566		if (whence == SEEK_HOLE &&
3567		    (em->block_start == EXTENT_MAP_HOLE ||
3568		     test_bit(EXTENT_FLAG_PREALLOC, &em->flags)))
3569			break;
3570		else if (whence == SEEK_DATA &&
3571			   (em->block_start != EXTENT_MAP_HOLE &&
3572			    !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)))
3573			break;
3574
3575		start = em->start + em->len;
3576		free_extent_map(em);
3577		em = NULL;
3578		cond_resched();
3579	}
3580	free_extent_map(em);
3581	unlock_extent_cached(&inode->io_tree, lockstart, lockend,
3582			     &cached_state);
3583	if (ret) {
3584		offset = ret;
3585	} else {
3586		if (whence == SEEK_DATA && start >= i_size)
3587			offset = -ENXIO;
3588		else
3589			offset = min_t(loff_t, start, i_size);
3590	}
3591
3592	return offset;
3593}
3594
3595static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
3596{
3597	struct inode *inode = file->f_mapping->host;
3598
3599	switch (whence) {
3600	default:
3601		return generic_file_llseek(file, offset, whence);
3602	case SEEK_DATA:
3603	case SEEK_HOLE:
3604		btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
3605		offset = find_desired_extent(BTRFS_I(inode), offset, whence);
3606		btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
3607		break;
3608	}
3609
3610	if (offset < 0)
3611		return offset;
3612
3613	return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
3614}
3615
3616static int btrfs_file_open(struct inode *inode, struct file *filp)
3617{
3618	filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
3619	return generic_file_open(inode, filp);
3620}
3621
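/*
 * Besides the sector alignment checks, direct reads also reject iov arrays
 * in which two segments share the same base address.
 */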
3622static int check_direct_read(struct btrfs_fs_info *fs_info,
3623			     const struct iov_iter *iter, loff_t offset)
3624{
3625	int ret;
3626	int i, seg;
3627
3628	ret = check_direct_IO(fs_info, iter, offset);
3629	if (ret < 0)
3630		return ret;
3631
3632	if (!iter_is_iovec(iter))
3633		return 0;
3634
3635	for (seg = 0; seg < iter->nr_segs; seg++)
3636		for (i = seg + 1; i < iter->nr_segs; i++)
3637			if (iter->iov[seg].iov_base == iter->iov[i].iov_base)
3638				return -EINVAL;
3639	return 0;
3640}
3641
3642static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
3643{
3644	struct inode *inode = file_inode(iocb->ki_filp);
3645	ssize_t ret;
3646
3647	if (check_direct_read(btrfs_sb(inode->i_sb), to, iocb->ki_pos))
3648		return 0;
3649
3650	btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
3651	ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops, 0);
3652	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
3653	return ret;
3654}
3655
3656static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
3657{
3658	ssize_t ret = 0;
3659
3660	if (iocb->ki_flags & IOCB_DIRECT) {
3661		ret = btrfs_direct_read(iocb, to);
3662		if (ret < 0 || !iov_iter_count(to) ||
3663		    iocb->ki_pos >= i_size_read(file_inode(iocb->ki_filp)))
3664			return ret;
3665	}
3666
3667	return filemap_read(iocb, to, ret);
3668}
3669
3670const struct file_operations btrfs_file_operations = {
3671	.llseek		= btrfs_file_llseek,
3672	.read_iter      = btrfs_file_read_iter,
3673	.splice_read	= generic_file_splice_read,
3674	.write_iter	= btrfs_file_write_iter,
3675	.splice_write	= iter_file_splice_write,
3676	.mmap		= btrfs_file_mmap,
3677	.open		= btrfs_file_open,
3678	.release	= btrfs_release_file,
3679	.fsync		= btrfs_sync_file,
3680	.fallocate	= btrfs_fallocate,
3681	.unlocked_ioctl	= btrfs_ioctl,
3682#ifdef CONFIG_COMPAT
3683	.compat_ioctl	= btrfs_compat_ioctl,
3684#endif
3685	.remap_file_range = btrfs_remap_file_range,
3686};
3687
3688void __cold btrfs_auto_defrag_exit(void)
3689{
3690	kmem_cache_destroy(btrfs_inode_defrag_cachep);
3691}
3692
3693int __init btrfs_auto_defrag_init(void)
3694{
3695	btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
3696					sizeof(struct inode_defrag), 0,
3697					SLAB_MEM_SPREAD,
3698					NULL);
3699	if (!btrfs_inode_defrag_cachep)
3700		return -ENOMEM;
3701
3702	return 0;
3703}
3704
3705int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end)
3706{
3707	int ret;
3708
3709	/*
3710	 * So with compression we will find and lock a dirty page and clear the
3711	 * first one as dirty, set up an async extent, and immediately return
3712	 * with the entire range locked but with nobody actually marked with
3713	 * writeback.  So we can't just filemap_write_and_wait_range() and
3714	 * expect it to work since it will just kick off a thread to do the
3715	 * actual work.  So we need to call filemap_fdatawrite_range _again_
3716	 * since it will wait on the page lock, which won't be unlocked until
3717	 * after the pages have been marked as writeback and so we're good to go
3718	 * from there.  We have to do this otherwise we'll miss the ordered
3719	 * extents and that results in badness.  Please Josef, do not think you
3720	 * know better and pull this out at some point in the future, it is
3721	 * right and you are wrong.
3722	 */
3723	ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
3724	if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
3725			     &BTRFS_I(inode)->runtime_flags))
3726		ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
3727
3728	return ret;
3729}
3730