#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/bio.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/page-flags.h>
#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/blkdev.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include "extent_io.h"
#include "extent_map.h"
#include "compat.h"
#include "ctree.h"
#include "btrfs_inode.h"

static struct kmem_cache *extent_state_cache;
static struct kmem_cache *extent_buffer_cache;

static LIST_HEAD(buffers);
static LIST_HEAD(states);

#define LEAK_DEBUG 0
#if LEAK_DEBUG
static DEFINE_SPINLOCK(leak_lock);
#endif

#define BUFFER_LRU_MAX 64

struct tree_entry {
	u64 start;
	u64 end;
	struct rb_node rb_node;
};

struct extent_page_data {
	struct bio *bio;
	struct extent_io_tree *tree;
	get_extent_t *get_extent;

	/* tells writepage not to lock the state bits for this range;
	 * it still does the unlocking
	 */
	unsigned int extent_locked:1;

	/* tells the submit_bio code to use a WRITE_SYNC */
	unsigned int sync_io:1;
};

int __init extent_io_init(void)
{
	extent_state_cache = kmem_cache_create("extent_state",
			sizeof(struct extent_state), 0,
			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
	if (!extent_state_cache)
		return -ENOMEM;

	extent_buffer_cache = kmem_cache_create("extent_buffers",
			sizeof(struct extent_buffer), 0,
			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
	if (!extent_buffer_cache)
		goto free_state_cache;
	return 0;

free_state_cache:
	kmem_cache_destroy(extent_state_cache);
	return -ENOMEM;
}

void extent_io_exit(void)
{
	struct extent_state *state;
	struct extent_buffer *eb;

	while (!list_empty(&states)) {
		state = list_entry(states.next, struct extent_state, leak_list);
		printk(KERN_ERR "btrfs state leak: start %llu end %llu "
		       "state %lu in tree %p refs %d\n",
		       (unsigned long long)state->start,
		       (unsigned long long)state->end,
		       state->state, state->tree, atomic_read(&state->refs));
		list_del(&state->leak_list);
		kmem_cache_free(extent_state_cache, state);

	}

	while (!list_empty(&buffers)) {
		eb = list_entry(buffers.next, struct extent_buffer, leak_list);
		printk(KERN_ERR "btrfs buffer leak start %llu len %lu "
		       "refs %d\n", (unsigned long long)eb->start,
		       eb->len, atomic_read(&eb->refs));
		list_del(&eb->leak_list);
		kmem_cache_free(extent_buffer_cache, eb);
	}
	if (extent_state_cache)
		kmem_cache_destroy(extent_state_cache);
	if (extent_buffer_cache)
		kmem_cache_destroy(extent_buffer_cache);
}

void extent_io_tree_init(struct extent_io_tree *tree,
			  struct address_space *mapping, gfp_t mask)
{
	tree->state = RB_ROOT;
	tree->buffer = RB_ROOT;
	tree->ops = NULL;
	tree->dirty_bytes = 0;
	spin_lock_init(&tree->lock);
	spin_lock_init(&tree->buffer_lock);
	tree->mapping = mapping;
}

static struct extent_state *alloc_extent_state(gfp_t mask)
{
	struct extent_state *state;
#if LEAK_DEBUG
	unsigned long flags;
#endif

	state = kmem_cache_alloc(extent_state_cache, mask);
	if (!state)
		return state;
	state->state = 0;
	state->private = 0;
	state->tree = NULL;
#if LEAK_DEBUG
	spin_lock_irqsave(&leak_lock, flags);
	list_add(&state->leak_list, &states);
	spin_unlock_irqrestore(&leak_lock, flags);
#endif
	atomic_set(&state->refs, 1);
	init_waitqueue_head(&state->wq);
	return state;
}

void free_extent_state(struct extent_state *state)
{
	if (!state)
		return;
	if (atomic_dec_and_test(&state->refs)) {
#if LEAK_DEBUG
		unsigned long flags;
#endif
		WARN_ON(state->tree);
#if LEAK_DEBUG
		spin_lock_irqsave(&leak_lock, flags);
		list_del(&state->leak_list);
		spin_unlock_irqrestore(&leak_lock, flags);
#endif
		kmem_cache_free(extent_state_cache, state);
	}
}

static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
				   struct rb_node *node)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct tree_entry *entry;

	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct tree_entry, rb_node);

		if (offset < entry->start)
			p = &(*p)->rb_left;
		else if (offset > entry->end)
			p = &(*p)->rb_right;
		else
			return parent;
	}

	entry = rb_entry(node, struct tree_entry, rb_node);
	rb_link_node(node, parent, p);
	rb_insert_color(node, root);
	return NULL;
}

static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
				     struct rb_node **prev_ret,
				     struct rb_node **next_ret)
{
	struct rb_root *root = &tree->state;
	struct rb_node *n = root->rb_node;
	struct rb_node *prev = NULL;
	struct rb_node *orig_prev = NULL;
	struct tree_entry *entry;
	struct tree_entry *prev_entry = NULL;

	while (n) {
		entry = rb_entry(n, struct tree_entry, rb_node);
		prev = n;
		prev_entry = entry;

		if (offset < entry->start)
			n = n->rb_left;
		else if (offset > entry->end)
			n = n->rb_right;
		else
			return n;
	}

	if (prev_ret) {
		orig_prev = prev;
		while (prev && offset > prev_entry->end) {
			prev = rb_next(prev);
			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		}
		*prev_ret = prev;
		prev = orig_prev;
	}

	if (next_ret) {
		prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		while (prev && offset < prev_entry->start) {
			prev = rb_prev(prev);
			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		}
		*next_ret = prev;
	}
	return NULL;
}

static inline struct rb_node *tree_search(struct extent_io_tree *tree,
					  u64 offset)
{
	struct rb_node *prev = NULL;
	struct rb_node *ret;

	ret = __etree_search(tree, offset, &prev, NULL);
	if (!ret)
		return prev;
	return ret;
}

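/*
 * insert an extent_buffer into the per-tree buffer rbtree, indexed by the
 * buffer's start offset.  If a buffer already exists at 'offset' it is
 * returned and nothing is inserted; otherwise NULL is returned.
 */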
static struct extent_buffer *buffer_tree_insert(struct extent_io_tree *tree,
					  u64 offset, struct rb_node *node)
{
	struct rb_root *root = &tree->buffer;
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct extent_buffer *eb;

	while (*p) {
		parent = *p;
		eb = rb_entry(parent, struct extent_buffer, rb_node);

		if (offset < eb->start)
			p = &(*p)->rb_left;
		else if (offset > eb->start)
			p = &(*p)->rb_right;
		else
			return eb;
	}

	rb_link_node(node, parent, p);
	rb_insert_color(node, root);
	return NULL;
}

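/*
 * walk the buffer rbtree looking for the extent_buffer that starts at
 * 'offset'.  Returns NULL if no buffer with that start offset exists.
 */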
static struct extent_buffer *buffer_search(struct extent_io_tree *tree,
					   u64 offset)
{
	struct rb_root *root = &tree->buffer;
	struct rb_node *n = root->rb_node;
	struct extent_buffer *eb;

	while (n) {
		eb = rb_entry(n, struct extent_buffer, rb_node);
		if (offset < eb->start)
			n = n->rb_left;
		else if (offset > eb->start)
			n = n->rb_right;
		else
			return eb;
	}
	return NULL;
}

static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
		     struct extent_state *other)
{
	if (tree->ops && tree->ops->merge_extent_hook)
		tree->ops->merge_extent_hook(tree->mapping->host, new,
					     other);
}

/*
 * utility function to look for merge candidates inside a given range.
 * Any extents with matching state are merged together into a single
 * extent in the tree.  Extents with EXTENT_IOBITS in their state field
 * are not merged because the end_io handlers need to be able to do
 * operations on them without sleeping (or doing allocations/splits).
 *
 * This should be called with the tree lock held.
 */
static int merge_state(struct extent_io_tree *tree,
		       struct extent_state *state)
{
	struct extent_state *other;
	struct rb_node *other_node;

	if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
		return 0;

	other_node = rb_prev(&state->rb_node);
	if (other_node) {
		other = rb_entry(other_node, struct extent_state, rb_node);
		if (other->end == state->start - 1 &&
		    other->state == state->state) {
			merge_cb(tree, state, other);
			state->start = other->start;
			other->tree = NULL;
			rb_erase(&other->rb_node, &tree->state);
			free_extent_state(other);
		}
	}
	other_node = rb_next(&state->rb_node);
	if (other_node) {
		other = rb_entry(other_node, struct extent_state, rb_node);
		if (other->start == state->end + 1 &&
		    other->state == state->state) {
			merge_cb(tree, state, other);
			other->start = state->start;
			state->tree = NULL;
			rb_erase(&state->rb_node, &tree->state);
			free_extent_state(state);
			state = NULL;
		}
	}

	return 0;
}

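/*
 * thin wrappers around the set_bit_hook/clear_bit_hook callbacks, used to
 * let the filesystem account for bits changing on an extent_state.
 */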
static int set_state_cb(struct extent_io_tree *tree,
			 struct extent_state *state, int *bits)
{
	if (tree->ops && tree->ops->set_bit_hook) {
		return tree->ops->set_bit_hook(tree->mapping->host,
					       state, bits);
	}

	return 0;
}

static void clear_state_cb(struct extent_io_tree *tree,
			   struct extent_state *state, int *bits)
{
	if (tree->ops && tree->ops->clear_bit_hook)
		tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
}

/*
 * insert an extent_state struct into the tree.  'bits' are set on the
 * struct before it is inserted.
 *
 * This may return -EEXIST if the extent is already there, in which case the
 * state struct is freed.
 *
 * The tree lock is not taken internally.  This is a utility function and
 * probably isn't what you want to call (see set/clear_extent_bit).
 */
static int insert_state(struct extent_io_tree *tree,
			struct extent_state *state, u64 start, u64 end,
			int *bits)
{
	struct rb_node *node;
	int bits_to_set = *bits & ~EXTENT_CTLBITS;
	int ret;

	if (end < start) {
		printk(KERN_ERR "btrfs end < start %llu %llu\n",
		       (unsigned long long)end,
		       (unsigned long long)start);
		WARN_ON(1);
	}
	state->start = start;
	state->end = end;
	ret = set_state_cb(tree, state, bits);
	if (ret)
		return ret;

	if (bits_to_set & EXTENT_DIRTY)
		tree->dirty_bytes += end - start + 1;
	state->state |= bits_to_set;
	node = tree_insert(&tree->state, end, &state->rb_node);
	if (node) {
		struct extent_state *found;
		found = rb_entry(node, struct extent_state, rb_node);
		printk(KERN_ERR "btrfs found node %llu %llu on insert of "
		       "%llu %llu\n", (unsigned long long)found->start,
		       (unsigned long long)found->end,
		       (unsigned long long)start, (unsigned long long)end);
		free_extent_state(state);
		return -EEXIST;
	}
	state->tree = tree;
	merge_state(tree, state);
	return 0;
}

static int split_cb(struct extent_io_tree *tree, struct extent_state *orig,
		     u64 split)
{
	if (tree->ops && tree->ops->split_extent_hook)
		return tree->ops->split_extent_hook(tree->mapping->host,
						    orig, split);
	return 0;
}

/*
 * split a given extent state struct in two, inserting the preallocated
 * struct 'prealloc' as the newly created second half.  'split' indicates an
 * offset inside 'orig' where it should be split.
 *
 * Before calling, the tree has 'orig' at [orig->start, orig->end].  After
 * calling, there are two extent state structs in the tree:
 * prealloc: [orig->start, split - 1]
 * orig: [ split, orig->end ]
 *
 * The tree locks are not taken by this function. They need to be held
 * by the caller.
 */
static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
		       struct extent_state *prealloc, u64 split)
{
	struct rb_node *node;

	split_cb(tree, orig, split);

	prealloc->start = orig->start;
	prealloc->end = split - 1;
	prealloc->state = orig->state;
	orig->start = split;

	node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
	if (node) {
		free_extent_state(prealloc);
		return -EEXIST;
	}
	prealloc->tree = tree;
	return 0;
}

/*
 * utility function to clear some bits in an extent state struct.
 * it will optionally wake up anyone waiting on this state (wake == 1).
 *
 * If no bits are set on the state struct after clearing things, the
 * struct is freed and removed from the tree.
 */
static int clear_state_bit(struct extent_io_tree *tree,
			    struct extent_state *state,
			    int *bits, int wake)
{
	int bits_to_clear = *bits & ~EXTENT_CTLBITS;
	int ret = state->state & bits_to_clear;

	if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
		u64 range = state->end - state->start + 1;
		WARN_ON(range > tree->dirty_bytes);
		tree->dirty_bytes -= range;
	}
	clear_state_cb(tree, state, bits);
	state->state &= ~bits_to_clear;
	if (wake)
		wake_up(&state->wq);
	if (state->state == 0) {
		if (state->tree) {
			rb_erase(&state->rb_node, &tree->state);
			state->tree = NULL;
			free_extent_state(state);
		} else {
			WARN_ON(1);
		}
	} else {
		merge_state(tree, state);
	}
	return ret;
}

/*
 * clear some bits on a range in the tree.  This may require splitting
 * or inserting elements in the tree, so the gfp mask is used to
 * indicate which allocations or sleeping are allowed.
 *
 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
 * the given range from the tree regardless of state (ie for truncate).
 *
 * the range [start, end] is inclusive.
 *
 * This takes the tree lock, and returns < 0 on error, > 0 if any of the
 * bits were already set, or zero if none of the bits were already set.
 */
int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		     int bits, int wake, int delete,
		     struct extent_state **cached_state,
		     gfp_t mask)
{
	struct extent_state *state;
	struct extent_state *cached;
	struct extent_state *prealloc = NULL;
	struct rb_node *next_node;
	struct rb_node *node;
	u64 last_end;
	int err;
	int set = 0;
	int clear = 0;

	if (delete)
		bits |= ~EXTENT_CTLBITS;
	bits |= EXTENT_FIRST_DELALLOC;

	if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
		clear = 1;
again:
	if (!prealloc && (mask & __GFP_WAIT)) {
		prealloc = alloc_extent_state(mask);
		if (!prealloc)
			return -ENOMEM;
	}

	spin_lock(&tree->lock);
	if (cached_state) {
		cached = *cached_state;

		if (clear) {
			*cached_state = NULL;
			cached_state = NULL;
		}

		if (cached && cached->tree && cached->start == start) {
			if (clear)
				atomic_dec(&cached->refs);
			state = cached;
			goto hit_next;
		}
		if (clear)
			free_extent_state(cached);
	}
	/*
	 * this search will find the extents that end after
	 * our range starts
	 */
	node = tree_search(tree, start);
	if (!node)
		goto out;
	state = rb_entry(node, struct extent_state, rb_node);
hit_next:
	if (state->start > end)
		goto out;
	WARN_ON(state->end < start);
	last_end = state->end;

	/*
	 *     | ---- desired range ---- |
	 *  | state | or
	 *  | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip
	 * bits on second half.
	 *
	 * If the extent we found extends past our range, we
	 * just split and search again.  It'll get split again
	 * the next time though.
	 *
	 * If the extent we found is inside our range, we clear
	 * the desired bit on it.
	 */

	if (state->start < start) {
		if (!prealloc)
			prealloc = alloc_extent_state(GFP_ATOMIC);
		err = split_state(tree, state, prealloc, start);
		BUG_ON(err == -EEXIST);
		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			set |= clear_state_bit(tree, state, &bits, wake);
			if (last_end == (u64)-1)
				goto out;
			start = last_end + 1;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 * We need to split the extent, and clear the bit
	 * on the first half
	 */
	if (state->start <= end && state->end > end) {
		if (!prealloc)
			prealloc = alloc_extent_state(GFP_ATOMIC);
		err = split_state(tree, state, prealloc, end + 1);
		BUG_ON(err == -EEXIST);
		if (wake)
			wake_up(&state->wq);

		set |= clear_state_bit(tree, prealloc, &bits, wake);

		prealloc = NULL;
		goto out;
	}

	if (state->end < end && prealloc && !need_resched())
		next_node = rb_next(&state->rb_node);
	else
		next_node = NULL;

	set |= clear_state_bit(tree, state, &bits, wake);
	if (last_end == (u64)-1)
		goto out;
	start = last_end + 1;
	if (start <= end && next_node) {
		state = rb_entry(next_node, struct extent_state,
				 rb_node);
		if (state->start == start)
			goto hit_next;
	}
	goto search_again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return set;

search_again:
	if (start > end)
		goto out;
	spin_unlock(&tree->lock);
	if (mask & __GFP_WAIT)
		cond_resched();
	goto again;
}

static int wait_on_state(struct extent_io_tree *tree,
			 struct extent_state *state)
		__releases(tree->lock)
		__acquires(tree->lock)
{
	DEFINE_WAIT(wait);
	prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
	spin_unlock(&tree->lock);
	schedule();
	spin_lock(&tree->lock);
	finish_wait(&state->wq, &wait);
	return 0;
}

/*
 * waits for one or more bits to clear on a range in the state tree.
 * The range [start, end] is inclusive.
 * The tree lock is taken by this function
 */
int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
{
	struct extent_state *state;
	struct rb_node *node;

	spin_lock(&tree->lock);
again:
	while (1) {
		/*
		 * this search will find all the extents that end after
		 * our range starts
		 */
		node = tree_search(tree, start);
		if (!node)
			break;

		state = rb_entry(node, struct extent_state, rb_node);

		if (state->start > end)
			goto out;

		if (state->state & bits) {
			start = state->start;
			atomic_inc(&state->refs);
			wait_on_state(tree, state);
			free_extent_state(state);
			goto again;
		}
		start = state->end + 1;

		if (start > end)
			break;

		if (need_resched()) {
			spin_unlock(&tree->lock);
			cond_resched();
			spin_lock(&tree->lock);
		}
	}
out:
	spin_unlock(&tree->lock);
	return 0;
}

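/*
 * set bits on an extent state struct, calling the set_bit_hook first and
 * updating the cached dirty_bytes counter when EXTENT_DIRTY is newly set.
 * Control bits in EXTENT_CTLBITS are filtered out before being stored.
 */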
static int set_state_bits(struct extent_io_tree *tree,
			   struct extent_state *state,
			   int *bits)
{
	int ret;
	int bits_to_set = *bits & ~EXTENT_CTLBITS;

	ret = set_state_cb(tree, state, bits);
	if (ret)
		return ret;
	if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
		u64 range = state->end - state->start + 1;
		tree->dirty_bytes += range;
	}
	state->state |= bits_to_set;

	return 0;
}

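/*
 * stash a reference to 'state' in *cached_ptr so later operations on the
 * same range can skip the rbtree search.  Only states with IO or boundary
 * bits set are cached, and only when nothing is cached yet.
 */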
static void cache_state(struct extent_state *state,
			struct extent_state **cached_ptr)
{
	if (cached_ptr && !(*cached_ptr)) {
		if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) {
			*cached_ptr = state;
			atomic_inc(&state->refs);
		}
	}
}

/*
 * set some bits on a range in the tree.  This may require allocations or
 * sleeping, so the gfp mask is used to indicate what is allowed.
 *
 * If any of the exclusive bits are set, this will fail with -EEXIST if some
 * part of the range already has the desired bits set.  The start of the
 * existing range is returned in failed_start in this case.
 *
 * [start, end] is inclusive.  This takes the tree lock.
 */

int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		   int bits, int exclusive_bits, u64 *failed_start,
		   struct extent_state **cached_state, gfp_t mask)
{
	struct extent_state *state;
	struct extent_state *prealloc = NULL;
	struct rb_node *node;
	int err = 0;
	u64 last_start;
	u64 last_end;

	bits |= EXTENT_FIRST_DELALLOC;
again:
	if (!prealloc && (mask & __GFP_WAIT)) {
		prealloc = alloc_extent_state(mask);
		if (!prealloc)
			return -ENOMEM;
	}

	spin_lock(&tree->lock);
	if (cached_state && *cached_state) {
		state = *cached_state;
		if (state->start == start && state->tree) {
			node = &state->rb_node;
			goto hit_next;
		}
	}
	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, start);
	if (!node) {
		err = insert_state(tree, prealloc, start, end, &bits);
		prealloc = NULL;
		BUG_ON(err == -EEXIST);
		goto out;
	}
	state = rb_entry(node, struct extent_state, rb_node);
hit_next:
	last_start = state->start;
	last_end = state->end;

	/*
	 * | ---- desired range ---- |
	 * | state |
	 *
	 * Just lock what we found and keep going
	 */
	if (state->start == start && state->end <= end) {
		struct rb_node *next_node;
		if (state->state & exclusive_bits) {
			*failed_start = state->start;
			err = -EEXIST;
			goto out;
		}

		err = set_state_bits(tree, state, &bits);
		if (err)
			goto out;

		cache_state(state, cached_state);
		merge_state(tree, state);
		if (last_end == (u64)-1)
			goto out;

		start = last_end + 1;
		if (start < end && prealloc && !need_resched()) {
			next_node = rb_next(node);
			if (next_node) {
				state = rb_entry(next_node, struct extent_state,
						 rb_node);
				if (state->start == start)
					goto hit_next;
			}
		}
		goto search_again;
	}

	/*
	 *     | ---- desired range ---- |
	 * | state |
	 *   or
	 * | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip bits on
	 * second half.
	 *
	 * If the extent we found extends past our
	 * range, we just split and search again.  It'll get split
	 * again the next time though.
	 *
	 * If the extent we found is inside our range, we set the
	 * desired bit on it.
	 */
	if (state->start < start) {
		if (state->state & exclusive_bits) {
			*failed_start = start;
			err = -EEXIST;
			goto out;
		}
		err = split_state(tree, state, prealloc, start);
		BUG_ON(err == -EEXIST);
		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			err = set_state_bits(tree, state, &bits);
			if (err)
				goto out;
			cache_state(state, cached_state);
			merge_state(tree, state);
			if (last_end == (u64)-1)
				goto out;
			start = last_end + 1;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *     | state | or               | state |
	 *
	 * There's a hole, we need to insert something in it and
	 * ignore the extent we found.
	 */
	if (state->start > start) {
		u64 this_end;
		if (end < last_start)
			this_end = end;
		else
			this_end = last_start - 1;
		err = insert_state(tree, prealloc, start, this_end,
				   &bits);
		BUG_ON(err == -EEXIST);
		if (err) {
			prealloc = NULL;
			goto out;
		}
		cache_state(prealloc, cached_state);
		prealloc = NULL;
		start = this_end + 1;
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 * We need to split the extent, and set the bit
	 * on the first half
	 */
	if (state->start <= end && state->end > end) {
		if (state->state & exclusive_bits) {
			*failed_start = start;
			err = -EEXIST;
			goto out;
		}
		err = split_state(tree, state, prealloc, end + 1);
		BUG_ON(err == -EEXIST);

		err = set_state_bits(tree, prealloc, &bits);
		if (err) {
			prealloc = NULL;
			goto out;
		}
		cache_state(prealloc, cached_state);
		merge_state(tree, prealloc);
		prealloc = NULL;
		goto out;
	}

	goto search_again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return err;

search_again:
	if (start > end)
		goto out;
	spin_unlock(&tree->lock);
	if (mask & __GFP_WAIT)
		cond_resched();
	goto again;
}

/* wrappers around set/clear extent bit */
int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
		     gfp_t mask)
{
	return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
			      NULL, mask);
}

int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
		    int bits, gfp_t mask)
{
	return set_extent_bit(tree, start, end, bits, 0, NULL,
			      NULL, mask);
}

int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
		      int bits, gfp_t mask)
{
	return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask);
}

int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
			struct extent_state **cached_state, gfp_t mask)
{
	return set_extent_bit(tree, start, end,
			      EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE,
			      0, NULL, cached_state, mask);
}

int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
		       gfp_t mask)
{
	return clear_extent_bit(tree, start, end,
				EXTENT_DIRTY | EXTENT_DELALLOC |
				EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask);
}

int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
		     gfp_t mask)
{
	return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
			      NULL, mask);
}

static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
		       gfp_t mask)
{
	return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0,
				NULL, mask);
}

int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
			gfp_t mask)
{
	return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
			      NULL, mask);
}

static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
				 u64 end, struct extent_state **cached_state,
				 gfp_t mask)
{
	return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
				cached_state, mask);
}

int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
{
	return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK);
}

/*
 * either insert or lock the state struct between start and end.  Use mask
 * to tell us if waiting is desired.
 */
int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
		     int bits, struct extent_state **cached_state, gfp_t mask)
{
	int err;
	u64 failed_start;
	while (1) {
		err = set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
				     EXTENT_LOCKED, &failed_start,
				     cached_state, mask);
		if (err == -EEXIST && (mask & __GFP_WAIT)) {
			wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
			start = failed_start;
		} else {
			break;
		}
		WARN_ON(start > end);
	}
	return err;
}

int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
{
	return lock_extent_bits(tree, start, end, 0, NULL, mask);
}

int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
		    gfp_t mask)
{
	int err;
	u64 failed_start;

	err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
			     &failed_start, NULL, mask);
	if (err == -EEXIST) {
		if (failed_start > start)
			clear_extent_bit(tree, start, failed_start - 1,
					 EXTENT_LOCKED, 1, 0, NULL, mask);
		return 0;
	}
	return 1;
}

int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
			 struct extent_state **cached, gfp_t mask)
{
	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
				mask);
}

int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
		  gfp_t mask)
{
	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
				mask);
}
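
/*
 * Illustrative sketch (not part of the original source): a typical caller
 * locks a range, marks it delalloc, and unlocks it again using the
 * wrappers above, e.g.
 *
 *	lock_extent(tree, start, end, GFP_NOFS);
 *	set_extent_delalloc(tree, start, end, NULL, GFP_NOFS);
 *	unlock_extent(tree, start, end, GFP_NOFS);
 *
 * Error handling is omitted in this sketch.
 */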

/*
 * helper function to set pages and extents in the tree dirty
 */
int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end)
{
	unsigned long index = start >> PAGE_CACHE_SHIFT;
	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
	struct page *page;

	while (index <= end_index) {
		page = find_get_page(tree->mapping, index);
		BUG_ON(!page);
		__set_page_dirty_nobuffers(page);
		page_cache_release(page);
		index++;
	}
	return 0;
}

/*
 * helper function to set both pages and extents in the tree writeback
 */
static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
{
	unsigned long index = start >> PAGE_CACHE_SHIFT;
	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
	struct page *page;

	while (index <= end_index) {
		page = find_get_page(tree->mapping, index);
		BUG_ON(!page);
		set_page_writeback(page);
		page_cache_release(page);
		index++;
	}
	return 0;
}

/*
 * find the first offset in the io tree with 'bits' set. zero is
 * returned if we find something, and *start_ret and *end_ret are
 * set to reflect the state struct that was found.
 *
 * If nothing was found, 1 is returned, < 0 on error
 */
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
			  u64 *start_ret, u64 *end_ret, int bits)
{
	struct rb_node *node;
	struct extent_state *state;
	int ret = 1;

	spin_lock(&tree->lock);
	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, start);
	if (!node)
		goto out;

	while (1) {
		state = rb_entry(node, struct extent_state, rb_node);
		if (state->end >= start && (state->state & bits)) {
			*start_ret = state->start;
			*end_ret = state->end;
			ret = 0;
			break;
		}
		node = rb_next(node);
		if (!node)
			break;
	}
out:
	spin_unlock(&tree->lock);
	return ret;
}

/* find the first state struct with 'bits' set after 'start', and
 * return it.  tree->lock must be held.  NULL will be returned if
 * nothing was found after 'start'
 */
struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
						 u64 start, int bits)
{
	struct rb_node *node;
	struct extent_state *state;

	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, start);
	if (!node)
		goto out;

	while (1) {
		state = rb_entry(node, struct extent_state, rb_node);
		if (state->end >= start && (state->state & bits))
			return state;

		node = rb_next(node);
		if (!node)
			break;
	}
out:
	return NULL;
}

/*
 * find a contiguous range of bytes in the file marked as delalloc, not
 * more than 'max_bytes'.  start and end are used to return the range.
 *
 * 1 is returned if we find something, 0 if nothing was in the tree
 */
static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
					u64 *start, u64 *end, u64 max_bytes,
					struct extent_state **cached_state)
{
	struct rb_node *node;
	struct extent_state *state;
	u64 cur_start = *start;
	u64 found = 0;
	u64 total_bytes = 0;

	spin_lock(&tree->lock);

	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, cur_start);
	if (!node) {
		if (!found)
			*end = (u64)-1;
		goto out;
	}

	while (1) {
		state = rb_entry(node, struct extent_state, rb_node);
		if (found && (state->start != cur_start ||
			      (state->state & EXTENT_BOUNDARY))) {
			goto out;
		}
		if (!(state->state & EXTENT_DELALLOC)) {
			if (!found)
				*end = state->end;
			goto out;
		}
		if (!found) {
			*start = state->start;
			*cached_state = state;
			atomic_inc(&state->refs);
		}
		found++;
		*end = state->end;
		cur_start = state->end + 1;
		node = rb_next(node);
		if (!node)
			break;
		total_bytes += state->end - state->start + 1;
		if (total_bytes >= max_bytes)
			break;
	}
out:
	spin_unlock(&tree->lock);
	return found;
}

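/*
 * unlock every page in the range [start, end] except locked_page, which the
 * caller is still responsible for.  Pages are looked up in batches with
 * find_get_pages_contig().
 */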
static noinline int __unlock_for_delalloc(struct inode *inode,
					  struct page *locked_page,
					  u64 start, u64 end)
{
	int ret;
	struct page *pages[16];
	unsigned long index = start >> PAGE_CACHE_SHIFT;
	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
	unsigned long nr_pages = end_index - index + 1;
	int i;

	if (index == locked_page->index && end_index == index)
		return 0;

	while (nr_pages > 0) {
		ret = find_get_pages_contig(inode->i_mapping, index,
				     min_t(unsigned long, nr_pages,
				     ARRAY_SIZE(pages)), pages);
		for (i = 0; i < ret; i++) {
			if (pages[i] != locked_page)
				unlock_page(pages[i]);
			page_cache_release(pages[i]);
		}
		nr_pages -= ret;
		index += ret;
		cond_resched();
	}
	return 0;
}

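/*
 * lock every page in the delalloc range except locked_page, which the
 * caller already holds.  Returns 0 on success, or -EAGAIN (after unlocking
 * whatever was locked so far) if a page has gone away or is no longer
 * dirty.
 */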
static noinline int lock_delalloc_pages(struct inode *inode,
					struct page *locked_page,
					u64 delalloc_start,
					u64 delalloc_end)
{
	unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT;
	unsigned long start_index = index;
	unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT;
	unsigned long pages_locked = 0;
	struct page *pages[16];
	unsigned long nrpages;
	int ret;
	int i;

	/* the caller is responsible for locking the start index */
	if (index == locked_page->index && index == end_index)
		return 0;

	/* skip the page at the start index */
	nrpages = end_index - index + 1;
	while (nrpages > 0) {
		ret = find_get_pages_contig(inode->i_mapping, index,
				     min_t(unsigned long,
				     nrpages, ARRAY_SIZE(pages)), pages);
		if (ret == 0) {
			ret = -EAGAIN;
			goto done;
		}
		/* now we have an array of pages, lock them all */
		for (i = 0; i < ret; i++) {
			/*
			 * the caller is taking responsibility for
			 * locked_page
			 */
			if (pages[i] != locked_page) {
				lock_page(pages[i]);
				if (!PageDirty(pages[i]) ||
				    pages[i]->mapping != inode->i_mapping) {
					ret = -EAGAIN;
					unlock_page(pages[i]);
					page_cache_release(pages[i]);
					goto done;
				}
			}
			page_cache_release(pages[i]);
			pages_locked++;
		}
		nrpages -= ret;
		index += ret;
		cond_resched();
	}
	ret = 0;
done:
	if (ret && pages_locked) {
		__unlock_for_delalloc(inode, locked_page,
			      delalloc_start,
			      ((u64)(start_index + pages_locked - 1)) <<
			      PAGE_CACHE_SHIFT);
	}
	return ret;
}

/*
 * find a contiguous range of bytes in the file marked as delalloc, not
 * more than 'max_bytes'.  start and end are used to return the range.
 *
 * 1 is returned if we find something, 0 if nothing was in the tree
 */
static noinline u64 find_lock_delalloc_range(struct inode *inode,
					     struct extent_io_tree *tree,
					     struct page *locked_page,
					     u64 *start, u64 *end,
					     u64 max_bytes)
{
	u64 delalloc_start;
	u64 delalloc_end;
	u64 found;
	struct extent_state *cached_state = NULL;
	int ret;
	int loops = 0;

again:
	/* step one, find a bunch of delalloc bytes starting at start */
	delalloc_start = *start;
	delalloc_end = 0;
	found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
				    max_bytes, &cached_state);
	if (!found || delalloc_end <= *start) {
		*start = delalloc_start;
		*end = delalloc_end;
		free_extent_state(cached_state);
		return found;
	}

	/*
	 * start comes from the offset of locked_page.  We have to lock
	 * pages in order, so we can't process delalloc bytes before
	 * locked_page
	 */
	if (delalloc_start < *start)
		delalloc_start = *start;

	/*
	 * make sure to limit the number of pages we try to lock down
	 * if we're looping.
	 */
	if (delalloc_end + 1 - delalloc_start > max_bytes && loops)
		delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1;

	/* step two, lock all the pages after the page that has start */
	ret = lock_delalloc_pages(inode, locked_page,
				  delalloc_start, delalloc_end);
	if (ret == -EAGAIN) {
		/* some of the pages are gone, lets avoid looping by
		 * shortening the size of the delalloc range we're searching
		 */
		free_extent_state(cached_state);
		if (!loops) {
			unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
			max_bytes = PAGE_CACHE_SIZE - offset;
			loops = 1;
			goto again;
		} else {
			found = 0;
			goto out_failed;
		}
	}
	BUG_ON(ret);

	/* step three, lock the state bits for the whole range */
	lock_extent_bits(tree, delalloc_start, delalloc_end,
			 0, &cached_state, GFP_NOFS);

	/* then test to make sure it is all still delalloc */
	ret = test_range_bit(tree, delalloc_start, delalloc_end,
			     EXTENT_DELALLOC, 1, cached_state);
	if (!ret) {
		unlock_extent_cached(tree, delalloc_start, delalloc_end,
				     &cached_state, GFP_NOFS);
		__unlock_for_delalloc(inode, locked_page,
			      delalloc_start, delalloc_end);
		cond_resched();
		goto again;
	}
	free_extent_state(cached_state);
	*start = delalloc_start;
	*end = delalloc_end;
out_failed:
	return found;
}

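/*
 * clear bits in the io tree for [start, end] based on 'op' and then walk
 * the pages in the range applying the requested page operations (clear
 * dirty, set/end writeback, set private2, unlock).  locked_page is only
 * released here, never unlocked.
 */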
int extent_clear_unlock_delalloc(struct inode *inode,
				struct extent_io_tree *tree,
				u64 start, u64 end, struct page *locked_page,
				unsigned long op)
{
	int ret;
	struct page *pages[16];
	unsigned long index = start >> PAGE_CACHE_SHIFT;
	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
	unsigned long nr_pages = end_index - index + 1;
	int i;
	int clear_bits = 0;

	if (op & EXTENT_CLEAR_UNLOCK)
		clear_bits |= EXTENT_LOCKED;
	if (op & EXTENT_CLEAR_DIRTY)
		clear_bits |= EXTENT_DIRTY;

	if (op & EXTENT_CLEAR_DELALLOC)
		clear_bits |= EXTENT_DELALLOC;

	clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
	if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
		    EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK |
		    EXTENT_SET_PRIVATE2)))
		return 0;

	while (nr_pages > 0) {
		ret = find_get_pages_contig(inode->i_mapping, index,
				     min_t(unsigned long,
				     nr_pages, ARRAY_SIZE(pages)), pages);
		for (i = 0; i < ret; i++) {

			if (op & EXTENT_SET_PRIVATE2)
				SetPagePrivate2(pages[i]);

			if (pages[i] == locked_page) {
				page_cache_release(pages[i]);
				continue;
			}
			if (op & EXTENT_CLEAR_DIRTY)
				clear_page_dirty_for_io(pages[i]);
			if (op & EXTENT_SET_WRITEBACK)
				set_page_writeback(pages[i]);
			if (op & EXTENT_END_WRITEBACK)
				end_page_writeback(pages[i]);
			if (op & EXTENT_CLEAR_UNLOCK_PAGE)
				unlock_page(pages[i]);
			page_cache_release(pages[i]);
		}
		nr_pages -= ret;
		index += ret;
		cond_resched();
	}
	return 0;
}

/*
 * count the number of bytes in the tree that have a given bit(s)
 * set.  This can be fairly slow, except for EXTENT_DIRTY which is
 * cached.  The total number found is returned.
 */
u64 count_range_bits(struct extent_io_tree *tree,
		     u64 *start, u64 search_end, u64 max_bytes,
		     unsigned long bits)
{
	struct rb_node *node;
	struct extent_state *state;
	u64 cur_start = *start;
	u64 total_bytes = 0;
	int found = 0;

	if (search_end <= cur_start) {
		WARN_ON(1);
		return 0;
	}

	spin_lock(&tree->lock);
	if (cur_start == 0 && bits == EXTENT_DIRTY) {
		total_bytes = tree->dirty_bytes;
		goto out;
	}
	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, cur_start);
	if (!node)
		goto out;

	while (1) {
		state = rb_entry(node, struct extent_state, rb_node);
		if (state->start > search_end)
			break;
		if (state->end >= cur_start && (state->state & bits)) {
			total_bytes += min(search_end, state->end) + 1 -
				       max(cur_start, state->start);
			if (total_bytes >= max_bytes)
				break;
			if (!found) {
				*start = state->start;
				found = 1;
			}
		}
		node = rb_next(node);
		if (!node)
			break;
	}
out:
	spin_unlock(&tree->lock);
	return total_bytes;
}

/*
 * set the private field for a given byte offset in the tree.  If there isn't
 * an extent_state there already, this does nothing.
 */
int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
{
	struct rb_node *node;
	struct extent_state *state;
	int ret = 0;

	spin_lock(&tree->lock);
	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, start);
	if (!node) {
		ret = -ENOENT;
		goto out;
	}
	state = rb_entry(node, struct extent_state, rb_node);
	if (state->start != start) {
		ret = -ENOENT;
		goto out;
	}
	state->private = private;
out:
	spin_unlock(&tree->lock);
	return ret;
}

int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
{
	struct rb_node *node;
	struct extent_state *state;
	int ret = 0;

	spin_lock(&tree->lock);
	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, start);
	if (!node) {
		ret = -ENOENT;
		goto out;
	}
	state = rb_entry(node, struct extent_state, rb_node);
	if (state->start != start) {
		ret = -ENOENT;
		goto out;
	}
	*private = state->private;
out:
	spin_unlock(&tree->lock);
	return ret;
}

/*
 * searches a range in the state tree for a given mask.
 * If 'filled' == 1, this returns 1 only if every extent in the tree
 * has the bits set.  Otherwise, 1 is returned if any bit in the
 * range is found set.
 */
int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
		   int bits, int filled, struct extent_state *cached)
{
	struct extent_state *state = NULL;
	struct rb_node *node;
	int bitset = 0;

	spin_lock(&tree->lock);
	if (cached && cached->tree && cached->start == start)
		node = &cached->rb_node;
	else
		node = tree_search(tree, start);
	while (node && start <= end) {
		state = rb_entry(node, struct extent_state, rb_node);

		if (filled && state->start > start) {
			bitset = 0;
			break;
		}

		if (state->start > end)
			break;

		if (state->state & bits) {
			bitset = 1;
			if (!filled)
				break;
		} else if (filled) {
			bitset = 0;
			break;
		}

		if (state->end == (u64)-1)
			break;

		start = state->end + 1;
		if (start > end)
			break;
		node = rb_next(node);
		if (!node) {
			if (filled)
				bitset = 0;
			break;
		}
	}
	spin_unlock(&tree->lock);
	return bitset;
}

/*
 * helper function to set a given page up to date if all the
 * extents in the tree for that page are up to date
 */
static int check_page_uptodate(struct extent_io_tree *tree,
			       struct page *page)
{
	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
	u64 end = start + PAGE_CACHE_SIZE - 1;
	if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
		SetPageUptodate(page);
	return 0;
}

/*
 * helper function to unlock a page if all the extents in the tree
 * for that page are unlocked
 */
static int check_page_locked(struct extent_io_tree *tree,
			     struct page *page)
{
	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
	u64 end = start + PAGE_CACHE_SIZE - 1;
	if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL))
		unlock_page(page);
	return 0;
}

/*
 * helper function to end page writeback.  Note that this version does not
 * consult the extent tree and simply ends writeback on the page.
 */
static int check_page_writeback(struct extent_io_tree *tree,
			     struct page *page)
{
	end_page_writeback(page);
	return 0;
}

/* lots and lots of room for performance fixes in the end_bio funcs */

/*
 * after a writepage IO is done, we need to:
 * clear the uptodate bits on error
 * clear the writeback bits in the extent tree for this IO
 * end_page_writeback if the page has no more pending IO
 *
 * Scheduling is not allowed, so the extent state tree is expected
 * to have one and only one object corresponding to this IO.
 */
static void end_bio_extent_writepage(struct bio *bio, int err)
{
	int uptodate = err == 0;
	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
	struct extent_io_tree *tree;
	u64 start;
	u64 end;
	int whole_page;
	int ret;

	do {
		struct page *page = bvec->bv_page;
		tree = &BTRFS_I(page->mapping->host)->io_tree;

		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
			 bvec->bv_offset;
		end = start + bvec->bv_len - 1;

		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
			whole_page = 1;
		else
			whole_page = 0;

		if (--bvec >= bio->bi_io_vec)
			prefetchw(&bvec->bv_page->flags);
		if (tree->ops && tree->ops->writepage_end_io_hook) {
			ret = tree->ops->writepage_end_io_hook(page, start,
						       end, NULL, uptodate);
			if (ret)
				uptodate = 0;
		}

		if (!uptodate && tree->ops &&
		    tree->ops->writepage_io_failed_hook) {
			ret = tree->ops->writepage_io_failed_hook(bio, page,
							 start, end, NULL);
			if (ret == 0) {
				uptodate = (err == 0);
				continue;
			}
		}

		if (!uptodate) {
			clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS);
			ClearPageUptodate(page);
			SetPageError(page);
		}

		if (whole_page)
			end_page_writeback(page);
		else
			check_page_writeback(tree, page);
	} while (bvec >= bio->bi_io_vec);

	bio_put(bio);
}

/*
 * after a readpage IO is done, we need to:
 * clear the uptodate bits on error
 * set the uptodate bits if things worked
 * set the page up to date if all extents in the tree are uptodate
 * clear the lock bit in the extent tree
 * unlock the page if there are no other extents locked for it
 *
 * Scheduling is not allowed, so the extent state tree is expected
 * to have one and only one object corresponding to this IO.
 */
static void end_bio_extent_readpage(struct bio *bio, int err)
{
	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
	struct bio_vec *bvec = bio->bi_io_vec;
	struct extent_io_tree *tree;
	u64 start;
	u64 end;
	int whole_page;
	int ret;

	if (err)
		uptodate = 0;

	do {
		struct page *page = bvec->bv_page;
		tree = &BTRFS_I(page->mapping->host)->io_tree;

		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
			bvec->bv_offset;
		end = start + bvec->bv_len - 1;

		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
			whole_page = 1;
		else
			whole_page = 0;

		if (++bvec <= bvec_end)
			prefetchw(&bvec->bv_page->flags);

		if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
			ret = tree->ops->readpage_end_io_hook(page, start, end,
							      NULL);
			if (ret)
				uptodate = 0;
		}
		if (!uptodate && tree->ops &&
		    tree->ops->readpage_io_failed_hook) {
			ret = tree->ops->readpage_io_failed_hook(bio, page,
							 start, end, NULL);
			if (ret == 0) {
				uptodate =
					test_bit(BIO_UPTODATE, &bio->bi_flags);
				if (err)
					uptodate = 0;
				continue;
			}
		}

		if (uptodate) {
			set_extent_uptodate(tree, start, end,
					    GFP_ATOMIC);
		}
		unlock_extent(tree, start, end, GFP_ATOMIC);

		if (whole_page) {
			if (uptodate) {
				SetPageUptodate(page);
			} else {
				ClearPageUptodate(page);
				SetPageError(page);
			}
			unlock_page(page);
		} else {
			if (uptodate) {
				check_page_uptodate(tree, page);
			} else {
				ClearPageUptodate(page);
				SetPageError(page);
			}
			check_page_locked(tree, page);
		}
	} while (bvec <= bvec_end);

	bio_put(bio);
}

/*
 * IO done from prepare_write is pretty simple, we just unlock
 * the structs in the extent tree when done, and set the uptodate bits
 * as appropriate.
 */
static void end_bio_extent_preparewrite(struct bio *bio, int err)
{
	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
	struct extent_io_tree *tree;
	u64 start;
	u64 end;

	do {
		struct page *page = bvec->bv_page;
		tree = &BTRFS_I(page->mapping->host)->io_tree;

		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
			bvec->bv_offset;
		end = start + bvec->bv_len - 1;

		if (--bvec >= bio->bi_io_vec)
			prefetchw(&bvec->bv_page->flags);

		if (uptodate) {
			set_extent_uptodate(tree, start, end, GFP_ATOMIC);
		} else {
			ClearPageUptodate(page);
			SetPageError(page);
		}

		unlock_extent(tree, start, end, GFP_ATOMIC);

	} while (bvec >= bio->bi_io_vec);

	bio_put(bio);
}

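/*
 * allocate a bio for 'nr_vecs' pages aimed at 'first_sector' on 'bdev'.
 * If the allocation fails and we are in a memalloc context, retry with
 * progressively fewer vecs before giving up.
 */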
static struct bio *
extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
		 gfp_t gfp_flags)
{
	struct bio *bio;

	bio = bio_alloc(gfp_flags, nr_vecs);

	if (bio == NULL && (current->flags & PF_MEMALLOC)) {
		while (!bio && (nr_vecs /= 2))
			bio = bio_alloc(gfp_flags, nr_vecs);
	}

	if (bio) {
		bio->bi_size = 0;
		bio->bi_bdev = bdev;
		bio->bi_sector = first_sector;
	}
	return bio;
}

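/*
 * hand a filled bio to the filesystem's submit_bio_hook if there is one,
 * otherwise submit it directly.  Returns -EOPNOTSUPP if the device
 * rejected the operation.
 */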
static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
			  unsigned long bio_flags)
{
	int ret = 0;
	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
	struct page *page = bvec->bv_page;
	struct extent_io_tree *tree = bio->bi_private;
	u64 start;
	u64 end;

	start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
	end = start + bvec->bv_len - 1;

	bio->bi_private = NULL;

	bio_get(bio);

	if (tree->ops && tree->ops->submit_bio_hook)
		tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
					   mirror_num, bio_flags, start);
	else
		submit_bio(rw, bio);
	if (bio_flagged(bio, BIO_EOPNOTSUPP))
		ret = -EOPNOTSUPP;
	bio_put(bio);
	return ret;
}

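/*
 * add a page to the bio in progress if it is contiguous and compatible
 * with it; otherwise submit the existing bio and start a new one.  The
 * bio in progress is returned through *bio_ret so it can keep growing
 * across calls, or submitted immediately when bio_ret is NULL.
 */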
static int submit_extent_page(int rw, struct extent_io_tree *tree,
			      struct page *page, sector_t sector,
			      size_t size, unsigned long offset,
			      struct block_device *bdev,
			      struct bio **bio_ret,
			      unsigned long max_pages,
			      bio_end_io_t end_io_func,
			      int mirror_num,
			      unsigned long prev_bio_flags,
			      unsigned long bio_flags)
{
	int ret = 0;
	struct bio *bio;
	int nr;
	int contig = 0;
	int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED;
	int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
	size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE);

	if (bio_ret && *bio_ret) {
		bio = *bio_ret;
		if (old_compressed)
			contig = bio->bi_sector == sector;
		else
			contig = bio->bi_sector + (bio->bi_size >> 9) ==
				sector;

		if (prev_bio_flags != bio_flags || !contig ||
		    (tree->ops && tree->ops->merge_bio_hook &&
		     tree->ops->merge_bio_hook(page, offset, page_size, bio,
					       bio_flags)) ||
		    bio_add_page(bio, page, page_size, offset) < page_size) {
			ret = submit_one_bio(rw, bio, mirror_num,
					     prev_bio_flags);
			bio = NULL;
		} else {
			return 0;
		}
	}
	if (this_compressed)
		nr = BIO_MAX_PAGES;
	else
		nr = bio_get_nr_vecs(bdev);

	bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);

	bio_add_page(bio, page, page_size, offset);
	bio->bi_end_io = end_io_func;
	bio->bi_private = tree;

	if (bio_ret)
		*bio_ret = bio;
	else
		ret = submit_one_bio(rw, bio, mirror_num, bio_flags);

	return ret;
}

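/*
 * tag a page as belonging to the extent io code by setting PagePrivate
 * and stashing EXTENT_PAGE_PRIVATE in page->private, taking an extra
 * page reference the first time.
 */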
void set_page_extent_mapped(struct page *page)
{
	if (!PagePrivate(page)) {
		SetPagePrivate(page);
		page_cache_get(page);
		set_page_private(page, EXTENT_PAGE_PRIVATE);
	}
}

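/*
 * mark a page as the first page of an extent buffer and record the
 * buffer length in page->private.
 */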
static void set_page_extent_head(struct page *page, unsigned long len)
{
	set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
}

/*
 * basic readpage implementation.  Locked extent state structs are inserted
 * into the tree that are removed when the IO is done (by the end_io
 * handlers)
 */
static int __extent_read_full_page(struct extent_io_tree *tree,
				   struct page *page,
				   get_extent_t *get_extent,
				   struct bio **bio, int mirror_num,
				   unsigned long *bio_flags)
{
	struct inode *inode = page->mapping->host;
	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
	u64 page_end = start + PAGE_CACHE_SIZE - 1;
	u64 end;
	u64 cur = start;
	u64 extent_offset;
	u64 last_byte = i_size_read(inode);
	u64 block_start;
	u64 cur_end;
	sector_t sector;
	struct extent_map *em;
	struct block_device *bdev;
	struct btrfs_ordered_extent *ordered;
	int ret;
	int nr = 0;
	size_t page_offset = 0;
	size_t iosize;
	size_t disk_io_size;
	size_t blocksize = inode->i_sb->s_blocksize;
	unsigned long this_bio_flag = 0;

	set_page_extent_mapped(page);

	end = page_end;
	while (1) {
		lock_extent(tree, start, end, GFP_NOFS);
		ordered = btrfs_lookup_ordered_extent(inode, start);
		if (!ordered)
			break;
		unlock_extent(tree, start, end, GFP_NOFS);
		btrfs_start_ordered_extent(inode, ordered, 1);
		btrfs_put_ordered_extent(ordered);
	}

	if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
		char *userpage;
		size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1);

		if (zero_offset) {
			iosize = PAGE_CACHE_SIZE - zero_offset;
			userpage = kmap_atomic(page, KM_USER0);
			memset(userpage + zero_offset, 0, iosize);
			flush_dcache_page(page);
			kunmap_atomic(userpage, KM_USER0);
		}
	}
	while (cur <= end) {
		if (cur >= last_byte) {
			char *userpage;
			iosize = PAGE_CACHE_SIZE - page_offset;
			userpage = kmap_atomic(page, KM_USER0);
			memset(userpage + page_offset, 0, iosize);
			flush_dcache_page(page);
			kunmap_atomic(userpage, KM_USER0);
			set_extent_uptodate(tree, cur, cur + iosize - 1,
					    GFP_NOFS);
			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
			break;
		}
		em = get_extent(inode, page, page_offset, cur,
				end - cur + 1, 0);
		if (IS_ERR(em) || !em) {
			SetPageError(page);
			unlock_extent(tree, cur, end, GFP_NOFS);
			break;
		}
		extent_offset = cur - em->start;
		BUG_ON(extent_map_end(em) <= cur);
		BUG_ON(end < cur);

		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
			this_bio_flag = EXTENT_BIO_COMPRESSED;

		iosize = min(extent_map_end(em) - cur, end - cur + 1);
		cur_end = min(extent_map_end(em) - 1, end);
		iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
		if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
			disk_io_size = em->block_len;
			sector = em->block_start >> 9;
		} else {
			sector = (em->block_start + extent_offset) >> 9;
			disk_io_size = iosize;
		}
		bdev = em->bdev;
		block_start = em->block_start;
		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
			block_start = EXTENT_MAP_HOLE;
		free_extent_map(em);
		em = NULL;

		/* we've found a hole, just zero and go on */
		if (block_start == EXTENT_MAP_HOLE) {
			char *userpage;
			userpage = kmap_atomic(page, KM_USER0);
			memset(userpage + page_offset, 0, iosize);
			flush_dcache_page(page);
			kunmap_atomic(userpage, KM_USER0);

			set_extent_uptodate(tree, cur, cur + iosize - 1,
					    GFP_NOFS);
			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
			cur = cur + iosize;
			page_offset += iosize;
			continue;
		}
		/* the get_extent function already copied into the page */
		if (test_range_bit(tree, cur, cur_end,
				   EXTENT_UPTODATE, 1, NULL)) {
			check_page_uptodate(tree, page);
			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
			cur = cur + iosize;
			page_offset += iosize;
			continue;
		}
		/* we have an inline extent but it didn't get marked up
		 * to date.  Error out
		 */
		if (block_start == EXTENT_MAP_INLINE) {
			SetPageError(page);
			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
			cur = cur + iosize;
			page_offset += iosize;
			continue;
		}

		ret = 0;
		if (tree->ops && tree->ops->readpage_io_hook) {
			ret = tree->ops->readpage_io_hook(page, cur,
							  cur + iosize - 1);
		}
		if (!ret) {
			unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
			pnr -= page->index;
			ret = submit_extent_page(READ, tree, page,
					 sector, disk_io_size, page_offset,
					 bdev, bio, pnr,
					 end_bio_extent_readpage, mirror_num,
					 *bio_flags,
					 this_bio_flag);
			nr++;
			*bio_flags = this_bio_flag;
		}
		if (ret)
			SetPageError(page);
		cur = cur + iosize;
		page_offset += iosize;
	}
	if (!nr) {
		if (!PageError(page))
			SetPageUptodate(page);
2157		unlock_page(page);
2158	}
2159	return 0;
2160}
2161
2162int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
2163			    get_extent_t *get_extent)
2164{
2165	struct bio *bio = NULL;
2166	unsigned long bio_flags = 0;
2167	int ret;
2168
2169	ret = __extent_read_full_page(tree, page, get_extent, &bio, 0,
2170				      &bio_flags);
2171	if (bio)
2172		submit_one_bio(READ, bio, 0, bio_flags);
2173	return ret;
2174}
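
/*
 * A filesystem typically wires extent_read_full_page() into its ->readpage
 * address_space op.  Sketch of such a caller (illustrative only; it assumes
 * a get_extent_t callback such as btrfs_get_extent() from inode.c):
 *
 *	static int example_readpage(struct file *file, struct page *page)
 *	{
 *		struct extent_io_tree *tree;
 *
 *		tree = &BTRFS_I(page->mapping->host)->io_tree;
 *		return extent_read_full_page(tree, page, btrfs_get_extent);
 *	}
 */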
2175
2176static noinline void update_nr_written(struct page *page,
2177				      struct writeback_control *wbc,
2178				      unsigned long nr_written)
2179{
2180	wbc->nr_to_write -= nr_written;
2181	if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
2182	    wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
2183		page->mapping->writeback_index = page->index + nr_written;
2184}
2185
2186/*
2187 * the writepage semantics are similar to regular writepage.  extent
2188 * records are inserted to lock ranges in the tree, and as dirty areas
2189 * are found, they are marked writeback.  Then the lock bits are removed
2190 * and the end_io handler clears the writeback ranges
2191 */
2192static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2193			      void *data)
2194{
2195	struct inode *inode = page->mapping->host;
2196	struct extent_page_data *epd = data;
2197	struct extent_io_tree *tree = epd->tree;
2198	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2199	u64 delalloc_start;
2200	u64 page_end = start + PAGE_CACHE_SIZE - 1;
2201	u64 end;
2202	u64 cur = start;
2203	u64 extent_offset;
2204	u64 last_byte = i_size_read(inode);
2205	u64 block_start;
2206	u64 iosize;
2207	u64 unlock_start;
2208	sector_t sector;
2209	struct extent_state *cached_state = NULL;
2210	struct extent_map *em;
2211	struct block_device *bdev;
2212	int ret;
2213	int nr = 0;
2214	size_t pg_offset = 0;
2215	size_t blocksize;
2216	loff_t i_size = i_size_read(inode);
2217	unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
2218	u64 nr_delalloc;
2219	u64 delalloc_end;
2220	int page_started;
2221	int compressed;
2222	int write_flags;
2223	unsigned long nr_written = 0;
2224
2225	if (wbc->sync_mode == WB_SYNC_ALL)
2226		write_flags = WRITE_SYNC_PLUG;
2227	else
2228		write_flags = WRITE;
2229
2230	WARN_ON(!PageLocked(page));
2231	pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
2232	if (page->index > end_index ||
2233	   (page->index == end_index && !pg_offset)) {
2234		page->mapping->a_ops->invalidatepage(page, 0);
2235		unlock_page(page);
2236		return 0;
2237	}
2238
2239	if (page->index == end_index) {
2240		char *userpage;
2241
2242		userpage = kmap_atomic(page, KM_USER0);
2243		memset(userpage + pg_offset, 0,
2244		       PAGE_CACHE_SIZE - pg_offset);
2245		kunmap_atomic(userpage, KM_USER0);
2246		flush_dcache_page(page);
2247	}
2248	pg_offset = 0;
2249
2250	set_page_extent_mapped(page);
2251
2252	delalloc_start = start;
2253	delalloc_end = 0;
2254	page_started = 0;
2255	if (!epd->extent_locked) {
2256		u64 delalloc_to_write = 0;
2257		/*
2258		 * make sure the wbc mapping index is at least updated
2259		 * to this page.
2260		 */
2261		update_nr_written(page, wbc, 0);
2262
2263		while (delalloc_end < page_end) {
2264			nr_delalloc = find_lock_delalloc_range(inode, tree,
2265						       page,
2266						       &delalloc_start,
2267						       &delalloc_end,
2268						       128 * 1024 * 1024);
2269			if (nr_delalloc == 0) {
2270				delalloc_start = delalloc_end + 1;
2271				continue;
2272			}
2273			tree->ops->fill_delalloc(inode, page, delalloc_start,
2274						 delalloc_end, &page_started,
2275						 &nr_written);
2276			/*
2277			 * delalloc_end is already one less than the total
2278			 * length, so we don't subtract one from
2279			 * PAGE_CACHE_SIZE
2280			 */
2281			delalloc_to_write += (delalloc_end - delalloc_start +
2282					      PAGE_CACHE_SIZE) >>
2283					      PAGE_CACHE_SHIFT;
2284			delalloc_start = delalloc_end + 1;
2285		}
2286		if (wbc->nr_to_write < delalloc_to_write) {
2287			int thresh = 8192;
2288
2289			if (delalloc_to_write < thresh * 2)
2290				thresh = delalloc_to_write;
2291			wbc->nr_to_write = min_t(u64, delalloc_to_write,
2292						 thresh);
2293		}
2294
2295		/* did the fill delalloc function already unlock and start
2296		 * the IO?
2297		 */
2298		if (page_started) {
2299			ret = 0;
2300			/*
2301			 * we've unlocked the page, so we can't update
2302			 * the mapping's writeback index, just update
2303			 * nr_to_write.
2304			 */
2305			wbc->nr_to_write -= nr_written;
2306			goto done_unlocked;
2307		}
2308	}
2309	if (tree->ops && tree->ops->writepage_start_hook) {
2310		ret = tree->ops->writepage_start_hook(page, start,
2311						      page_end);
2312		if (ret == -EAGAIN) {
2313			redirty_page_for_writepage(wbc, page);
2314			update_nr_written(page, wbc, nr_written);
2315			unlock_page(page);
2316			ret = 0;
2317			goto done_unlocked;
2318		}
2319	}
2320
2321	/*
2322	 * we don't want to touch the inode after unlocking the page,
2323	 * so we update the mapping writeback index now
2324	 */
2325	update_nr_written(page, wbc, nr_written + 1);
2326
2327	end = page_end;
2328	if (last_byte <= start) {
2329		if (tree->ops && tree->ops->writepage_end_io_hook)
2330			tree->ops->writepage_end_io_hook(page, start,
2331							 page_end, NULL, 1);
2332		unlock_start = page_end + 1;
2333		goto done;
2334	}
2335
2336	blocksize = inode->i_sb->s_blocksize;
2337
2338	while (cur <= end) {
2339		if (cur >= last_byte) {
2340			if (tree->ops && tree->ops->writepage_end_io_hook)
2341				tree->ops->writepage_end_io_hook(page, cur,
2342							 page_end, NULL, 1);
2343			unlock_start = page_end + 1;
2344			break;
2345		}
2346		em = epd->get_extent(inode, page, pg_offset, cur,
2347				     end - cur + 1, 1);
2348		if (IS_ERR(em) || !em) {
2349			SetPageError(page);
2350			break;
2351		}
2352
2353		extent_offset = cur - em->start;
2354		BUG_ON(extent_map_end(em) <= cur);
2355		BUG_ON(end < cur);
2356		iosize = min(extent_map_end(em) - cur, end - cur + 1);
2357		iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
2358		sector = (em->block_start + extent_offset) >> 9;
2359		bdev = em->bdev;
2360		block_start = em->block_start;
2361		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
2362		free_extent_map(em);
2363		em = NULL;
2364
2365		/*
2366		 * compressed and inline extents are written through other
2367		 * paths in the FS
2368		 */
2369		if (compressed || block_start == EXTENT_MAP_HOLE ||
2370		    block_start == EXTENT_MAP_INLINE) {
2371			/*
2372			 * end_io notification does not happen here for
2373			 * compressed extents
2374			 */
2375			if (!compressed && tree->ops &&
2376			    tree->ops->writepage_end_io_hook)
2377				tree->ops->writepage_end_io_hook(page, cur,
2378							 cur + iosize - 1,
2379							 NULL, 1);
2380			else if (compressed) {
2381				/* we don't want to end_page_writeback on
2382				 * a compressed extent.  this happens
2383				 * elsewhere
2384				 */
2385				nr++;
2386			}
2387
2388			cur += iosize;
2389			pg_offset += iosize;
2390			unlock_start = cur;
2391			continue;
2392		}
2393		/* leave this out until we have a page_mkwrite call */
2394		if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
2395				   EXTENT_DIRTY, 0, NULL)) {
2396			cur = cur + iosize;
2397			pg_offset += iosize;
2398			continue;
2399		}
2400
2401		if (tree->ops && tree->ops->writepage_io_hook) {
2402			ret = tree->ops->writepage_io_hook(page, cur,
2403						cur + iosize - 1);
2404		} else {
2405			ret = 0;
2406		}
2407		if (ret) {
2408			SetPageError(page);
2409		} else {
2410			unsigned long max_nr = end_index + 1;
2411
2412			set_range_writeback(tree, cur, cur + iosize - 1);
2413			if (!PageWriteback(page)) {
2414				printk(KERN_ERR "btrfs warning page %lu not "
2415				       "writeback, cur %llu end %llu\n",
2416				       page->index, (unsigned long long)cur,
2417				       (unsigned long long)end);
2418			}
2419
2420			ret = submit_extent_page(write_flags, tree, page,
2421						 sector, iosize, pg_offset,
2422						 bdev, &epd->bio, max_nr,
2423						 end_bio_extent_writepage,
2424						 0, 0, 0);
2425			if (ret)
2426				SetPageError(page);
2427		}
2428		cur = cur + iosize;
2429		pg_offset += iosize;
2430		nr++;
2431	}
2432done:
2433	if (nr == 0) {
2434		/* make sure the mapping tag for page dirty gets cleared */
2435		set_page_writeback(page);
2436		end_page_writeback(page);
2437	}
2438	unlock_page(page);
2439
2440done_unlocked:
2441
2442	/* drop our reference on any cached states */
2443	free_extent_state(cached_state);
2444	return 0;
2445}
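
/*
 * Worked example for the delalloc accounting in __extent_writepage(): with
 * 4K pages, a delalloc range of [0, 12287] covers three pages, and because
 * delalloc_end is already inclusive the page count comes out right without
 * the usual "- 1":  (12287 - 0 + PAGE_CACHE_SIZE) >> PAGE_CACHE_SHIFT == 3.
 */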
2446
/**
 * extent_write_cache_pages - walk the list of dirty pages of the given
 * address space and write all of them.
 * @tree:      the extent_io_tree tracking state for this address space
 * @mapping:   address space structure to write
 * @wbc:       subtract the number of written pages from *@wbc->nr_to_write
 * @writepage: function called for each page
 * @data:      data passed to writepage function
 * @flush_fn:  called to submit any bios built up so far before waiting on
 *             writeback
 *
 * If a page is already under I/O, extent_write_cache_pages() skips it, even
 * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
 * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
 * and msync() need to guarantee that all the data which was dirty at the time
 * the call was made gets new I/O started against it.  If wbc->sync_mode is
 * WB_SYNC_ALL then we were called for data integrity and we must wait for
 * existing IO to complete.
 */
2462static int extent_write_cache_pages(struct extent_io_tree *tree,
2463			     struct address_space *mapping,
2464			     struct writeback_control *wbc,
2465			     writepage_t writepage, void *data,
2466			     void (*flush_fn)(void *))
2467{
2468	int ret = 0;
2469	int done = 0;
2470	int nr_to_write_done = 0;
2471	struct pagevec pvec;
2472	int nr_pages;
2473	pgoff_t index;
2474	pgoff_t end;		/* Inclusive */
2475	int scanned = 0;
2476	int range_whole = 0;
2477
2478	pagevec_init(&pvec, 0);
2479	if (wbc->range_cyclic) {
2480		index = mapping->writeback_index; /* Start from prev offset */
2481		end = -1;
2482	} else {
2483		index = wbc->range_start >> PAGE_CACHE_SHIFT;
2484		end = wbc->range_end >> PAGE_CACHE_SHIFT;
2485		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2486			range_whole = 1;
2487		scanned = 1;
2488	}
2489retry:
2490	while (!done && !nr_to_write_done && (index <= end) &&
2491	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
2492			      PAGECACHE_TAG_DIRTY, min(end - index,
2493				  (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
2494		unsigned i;
2495
2496		scanned = 1;
2497		for (i = 0; i < nr_pages; i++) {
2498			struct page *page = pvec.pages[i];
2499
2500			/*
2501			 * At this point we hold neither mapping->tree_lock nor
2502			 * lock on the page itself: the page may be truncated or
2503			 * invalidated (changing page->mapping to NULL), or even
2504			 * swizzled back from swapper_space to tmpfs file
2505			 * mapping
2506			 */
2507			if (tree->ops && tree->ops->write_cache_pages_lock_hook)
2508				tree->ops->write_cache_pages_lock_hook(page);
2509			else
2510				lock_page(page);
2511
2512			if (unlikely(page->mapping != mapping)) {
2513				unlock_page(page);
2514				continue;
2515			}
2516
2517			if (!wbc->range_cyclic && page->index > end) {
2518				done = 1;
2519				unlock_page(page);
2520				continue;
2521			}
2522
2523			if (wbc->sync_mode != WB_SYNC_NONE) {
2524				if (PageWriteback(page))
2525					flush_fn(data);
2526				wait_on_page_writeback(page);
2527			}
2528
2529			if (PageWriteback(page) ||
2530			    !clear_page_dirty_for_io(page)) {
2531				unlock_page(page);
2532				continue;
2533			}
2534
2535			ret = (*writepage)(page, wbc, data);
2536
2537			if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
2538				unlock_page(page);
2539				ret = 0;
2540			}
2541			if (ret)
2542				done = 1;
2543
2544			/*
2545			 * the filesystem may choose to bump up nr_to_write.
2546			 * We have to make sure to honor the new nr_to_write
2547			 * at any time
2548			 */
2549			nr_to_write_done = wbc->nr_to_write <= 0;
2550		}
2551		pagevec_release(&pvec);
2552		cond_resched();
2553	}
2554	if (!scanned && !done) {
2555		/*
2556		 * We hit the last page and there is more work to be done: wrap
2557		 * back to the start of the file
2558		 */
2559		scanned = 1;
2560		index = 0;
2561		goto retry;
2562	}
2563	return ret;
2564}
2565
2566static void flush_epd_write_bio(struct extent_page_data *epd)
2567{
2568	if (epd->bio) {
2569		if (epd->sync_io)
2570			submit_one_bio(WRITE_SYNC, epd->bio, 0, 0);
2571		else
2572			submit_one_bio(WRITE, epd->bio, 0, 0);
2573		epd->bio = NULL;
2574	}
2575}
2576
2577static noinline void flush_write_bio(void *data)
2578{
2579	struct extent_page_data *epd = data;
2580	flush_epd_write_bio(epd);
2581}
2582
2583int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
2584			  get_extent_t *get_extent,
2585			  struct writeback_control *wbc)
2586{
2587	int ret;
2588	struct address_space *mapping = page->mapping;
2589	struct extent_page_data epd = {
2590		.bio = NULL,
2591		.tree = tree,
2592		.get_extent = get_extent,
2593		.extent_locked = 0,
2594		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
2595	};
2596	struct writeback_control wbc_writepages = {
2597		.sync_mode	= wbc->sync_mode,
2598		.older_than_this = NULL,
2599		.nr_to_write	= 64,
2600		.range_start	= page_offset(page) + PAGE_CACHE_SIZE,
2601		.range_end	= (loff_t)-1,
2602	};
2603
2604	ret = __extent_writepage(page, wbc, &epd);
2605
2606	extent_write_cache_pages(tree, mapping, &wbc_writepages,
2607				 __extent_writepage, &epd, flush_write_bio);
2608	flush_epd_write_bio(&epd);
2609	return ret;
2610}
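
/*
 * Note that extent_write_full_page() does more than write the one page it
 * was handed: wbc_writepages.range_start is set one page past the current
 * page, so up to nr_to_write (64) dirty pages that follow it in the file
 * are pushed out as well, keeping the resulting I/O reasonably sequential.
 */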
2611
2612int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
2613			      u64 start, u64 end, get_extent_t *get_extent,
2614			      int mode)
2615{
2616	int ret = 0;
2617	struct address_space *mapping = inode->i_mapping;
2618	struct page *page;
2619	unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >>
2620		PAGE_CACHE_SHIFT;
2621
2622	struct extent_page_data epd = {
2623		.bio = NULL,
2624		.tree = tree,
2625		.get_extent = get_extent,
2626		.extent_locked = 1,
2627		.sync_io = mode == WB_SYNC_ALL,
2628	};
2629	struct writeback_control wbc_writepages = {
2630		.sync_mode	= mode,
2631		.older_than_this = NULL,
2632		.nr_to_write	= nr_pages * 2,
2633		.range_start	= start,
2634		.range_end	= end + 1,
2635	};
2636
2637	while (start <= end) {
2638		page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
2639		if (clear_page_dirty_for_io(page))
2640			ret = __extent_writepage(page, &wbc_writepages, &epd);
2641		else {
2642			if (tree->ops && tree->ops->writepage_end_io_hook)
2643				tree->ops->writepage_end_io_hook(page, start,
2644						 start + PAGE_CACHE_SIZE - 1,
2645						 NULL, 1);
2646			unlock_page(page);
2647		}
2648		page_cache_release(page);
2649		start += PAGE_CACHE_SIZE;
2650	}
2651
2652	flush_epd_write_bio(&epd);
2653	return ret;
2654}
2655
2656int extent_writepages(struct extent_io_tree *tree,
2657		      struct address_space *mapping,
2658		      get_extent_t *get_extent,
2659		      struct writeback_control *wbc)
2660{
2661	int ret = 0;
2662	struct extent_page_data epd = {
2663		.bio = NULL,
2664		.tree = tree,
2665		.get_extent = get_extent,
2666		.extent_locked = 0,
2667		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
2668	};
2669
2670	ret = extent_write_cache_pages(tree, mapping, wbc,
2671				       __extent_writepage, &epd,
2672				       flush_write_bio);
2673	flush_epd_write_bio(&epd);
2674	return ret;
2675}
2676
2677int extent_readpages(struct extent_io_tree *tree,
2678		     struct address_space *mapping,
2679		     struct list_head *pages, unsigned nr_pages,
2680		     get_extent_t get_extent)
2681{
2682	struct bio *bio = NULL;
2683	unsigned page_idx;
2684	unsigned long bio_flags = 0;
2685
2686	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
2687		struct page *page = list_entry(pages->prev, struct page, lru);
2688
2689		prefetchw(&page->flags);
2690		list_del(&page->lru);
2691		if (!add_to_page_cache_lru(page, mapping,
2692					page->index, GFP_KERNEL)) {
2693			__extent_read_full_page(tree, page, get_extent,
2694						&bio, 0, &bio_flags);
2695		}
2696		page_cache_release(page);
2697	}
2698	BUG_ON(!list_empty(pages));
2699	if (bio)
2700		submit_one_bio(READ, bio, 0, bio_flags);
2701	return 0;
2702}
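
/*
 * Sketch of a ->readpages address_space op built on extent_readpages()
 * (illustrative only; assumes a get_extent_t such as btrfs_get_extent()):
 *
 *	static int example_readpages(struct file *file,
 *				     struct address_space *mapping,
 *				     struct list_head *pages, unsigned nr)
 *	{
 *		return extent_readpages(&BTRFS_I(mapping->host)->io_tree,
 *					mapping, pages, nr, btrfs_get_extent);
 *	}
 */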
2703
2704/*
2705 * basic invalidatepage code, this waits on any locked or writeback
2706 * ranges corresponding to the page, and then deletes any extent state
2707 * records from the tree
2708 */
2709int extent_invalidatepage(struct extent_io_tree *tree,
2710			  struct page *page, unsigned long offset)
2711{
2712	struct extent_state *cached_state = NULL;
2713	u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
2714	u64 end = start + PAGE_CACHE_SIZE - 1;
2715	size_t blocksize = page->mapping->host->i_sb->s_blocksize;
2716
2717	start += (offset + blocksize - 1) & ~(blocksize - 1);
2718	if (start > end)
2719		return 0;
2720
2721	lock_extent_bits(tree, start, end, 0, &cached_state, GFP_NOFS);
2722	wait_on_page_writeback(page);
2723	clear_extent_bit(tree, start, end,
2724			 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
2725			 EXTENT_DO_ACCOUNTING,
2726			 1, 1, &cached_state, GFP_NOFS);
2727	return 0;
2728}
2729
/*
 * simple commit_write call; the page is marked dirty here and the inode
 * size is updated if this write extends the file
 */
2734int extent_commit_write(struct extent_io_tree *tree,
2735			struct inode *inode, struct page *page,
2736			unsigned from, unsigned to)
2737{
2738	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2739
2740	set_page_extent_mapped(page);
2741	set_page_dirty(page);
2742
2743	if (pos > inode->i_size) {
2744		i_size_write(inode, pos);
2745		mark_inode_dirty(inode);
2746	}
2747	return 0;
2748}
2749
2750int extent_prepare_write(struct extent_io_tree *tree,
2751			 struct inode *inode, struct page *page,
2752			 unsigned from, unsigned to, get_extent_t *get_extent)
2753{
2754	u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
2755	u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
2756	u64 block_start;
2757	u64 orig_block_start;
2758	u64 block_end;
2759	u64 cur_end;
2760	struct extent_map *em;
2761	unsigned blocksize = 1 << inode->i_blkbits;
2762	size_t page_offset = 0;
2763	size_t block_off_start;
2764	size_t block_off_end;
2765	int err = 0;
2766	int iocount = 0;
2767	int ret = 0;
2768	int isnew;
2769
2770	set_page_extent_mapped(page);
2771
2772	block_start = (page_start + from) & ~((u64)blocksize - 1);
2773	block_end = (page_start + to - 1) | (blocksize - 1);
2774	orig_block_start = block_start;
2775
2776	lock_extent(tree, page_start, page_end, GFP_NOFS);
2777	while (block_start <= block_end) {
2778		em = get_extent(inode, page, page_offset, block_start,
2779				block_end - block_start + 1, 1);
		if (IS_ERR(em) || !em) {
			/* don't report success if the mapping lookup failed */
			err = IS_ERR(em) ? PTR_ERR(em) : -EIO;
			goto err;
		}
2782
2783		cur_end = min(block_end, extent_map_end(em) - 1);
2784		block_off_start = block_start & (PAGE_CACHE_SIZE - 1);
2785		block_off_end = block_off_start + blocksize;
2786		isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS);
2787
2788		if (!PageUptodate(page) && isnew &&
2789		    (block_off_end > to || block_off_start < from)) {
2790			void *kaddr;
2791
2792			kaddr = kmap_atomic(page, KM_USER0);
2793			if (block_off_end > to)
2794				memset(kaddr + to, 0, block_off_end - to);
2795			if (block_off_start < from)
2796				memset(kaddr + block_off_start, 0,
2797				       from - block_off_start);
2798			flush_dcache_page(page);
2799			kunmap_atomic(kaddr, KM_USER0);
2800		}
2801		if ((em->block_start != EXTENT_MAP_HOLE &&
2802		     em->block_start != EXTENT_MAP_INLINE) &&
2803		    !isnew && !PageUptodate(page) &&
2804		    (block_off_end > to || block_off_start < from) &&
2805		    !test_range_bit(tree, block_start, cur_end,
2806				    EXTENT_UPTODATE, 1, NULL)) {
2807			u64 sector;
2808			u64 extent_offset = block_start - em->start;
2809			size_t iosize;
2810			sector = (em->block_start + extent_offset) >> 9;
2811			iosize = (cur_end - block_start + blocksize) &
2812				~((u64)blocksize - 1);
2813			/*
2814			 * we've already got the extent locked, but we
2815			 * need to split the state such that our end_bio
2816			 * handler can clear the lock.
2817			 */
2818			set_extent_bit(tree, block_start,
2819				       block_start + iosize - 1,
2820				       EXTENT_LOCKED, 0, NULL, NULL, GFP_NOFS);
2821			ret = submit_extent_page(READ, tree, page,
2822					 sector, iosize, page_offset, em->bdev,
2823					 NULL, 1,
2824					 end_bio_extent_preparewrite, 0,
2825					 0, 0);
2826			iocount++;
2827			block_start = block_start + iosize;
2828		} else {
2829			set_extent_uptodate(tree, block_start, cur_end,
2830					    GFP_NOFS);
2831			unlock_extent(tree, block_start, cur_end, GFP_NOFS);
2832			block_start = cur_end + 1;
2833		}
2834		page_offset = block_start & (PAGE_CACHE_SIZE - 1);
2835		free_extent_map(em);
2836	}
2837	if (iocount) {
2838		wait_extent_bit(tree, orig_block_start,
2839				block_end, EXTENT_LOCKED);
2840	}
2841	check_page_uptodate(tree, page);
2842err:
2843	return err;
2844}
2845
2846/*
2847 * a helper for releasepage, this tests for areas of the page that
2848 * are locked or under IO and drops the related state bits if it is safe
2849 * to drop the page.
2850 */
2851int try_release_extent_state(struct extent_map_tree *map,
2852			     struct extent_io_tree *tree, struct page *page,
2853			     gfp_t mask)
2854{
2855	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2856	u64 end = start + PAGE_CACHE_SIZE - 1;
2857	int ret = 1;
2858
2859	if (test_range_bit(tree, start, end,
2860			   EXTENT_IOBITS, 0, NULL))
2861		ret = 0;
2862	else {
2863		if ((mask & GFP_NOFS) == GFP_NOFS)
2864			mask = GFP_NOFS;
2865		/*
2866		 * at this point we can safely clear everything except the
2867		 * locked bit and the nodatasum bit
2868		 */
2869		clear_extent_bit(tree, start, end,
2870				 ~(EXTENT_LOCKED | EXTENT_NODATASUM),
2871				 0, 0, NULL, mask);
2872	}
2873	return ret;
2874}
2875
2876/*
2877 * a helper for releasepage.  As long as there are no locked extents
2878 * in the range corresponding to the page, both state records and extent
2879 * map records are removed
2880 */
2881int try_release_extent_mapping(struct extent_map_tree *map,
2882			       struct extent_io_tree *tree, struct page *page,
2883			       gfp_t mask)
2884{
2885	struct extent_map *em;
2886	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2887	u64 end = start + PAGE_CACHE_SIZE - 1;
2888
2889	if ((mask & __GFP_WAIT) &&
2890	    page->mapping->host->i_size > 16 * 1024 * 1024) {
2891		u64 len;
2892		while (start <= end) {
2893			len = end - start + 1;
2894			write_lock(&map->lock);
2895			em = lookup_extent_mapping(map, start, len);
2896			if (!em || IS_ERR(em)) {
2897				write_unlock(&map->lock);
2898				break;
2899			}
2900			if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
2901			    em->start != start) {
2902				write_unlock(&map->lock);
2903				free_extent_map(em);
2904				break;
2905			}
2906			if (!test_range_bit(tree, em->start,
2907					    extent_map_end(em) - 1,
2908					    EXTENT_LOCKED | EXTENT_WRITEBACK,
2909					    0, NULL)) {
2910				remove_extent_mapping(map, em);
2911				/* once for the rb tree */
2912				free_extent_map(em);
2913			}
2914			start = extent_map_end(em);
2915			write_unlock(&map->lock);
2916
2917			/* once for us */
2918			free_extent_map(em);
2919		}
2920	}
2921	return try_release_extent_state(map, tree, page, mask);
2922}
2923
2924sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
2925		get_extent_t *get_extent)
2926{
2927	struct inode *inode = mapping->host;
2928	struct extent_state *cached_state = NULL;
	/* cast before shifting so a 32-bit sector_t cannot overflow */
	u64 start = (u64)iblock << inode->i_blkbits;
2930	sector_t sector = 0;
2931	size_t blksize = (1 << inode->i_blkbits);
2932	struct extent_map *em;
2933
2934	lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
2935			 0, &cached_state, GFP_NOFS);
2936	em = get_extent(inode, NULL, 0, start, blksize, 0);
2937	unlock_extent_cached(&BTRFS_I(inode)->io_tree, start,
2938			     start + blksize - 1, &cached_state, GFP_NOFS);
2939	if (!em || IS_ERR(em))
2940		return 0;
2941
2942	if (em->block_start > EXTENT_MAP_LAST_BYTE)
2943		goto out;
2944
2945	sector = (em->block_start + start - em->start) >> inode->i_blkbits;
2946out:
2947	free_extent_map(em);
2948	return sector;
2949}
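
/*
 * extent_bmap() backs the FIBMAP-style ->bmap address_space op.  Sketch of
 * a caller (illustrative only):
 *
 *	static sector_t example_bmap(struct address_space *mapping,
 *				     sector_t block)
 *	{
 *		return extent_bmap(mapping, block, btrfs_get_extent);
 *	}
 */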
2950
2951int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2952		__u64 start, __u64 len, get_extent_t *get_extent)
2953{
2954	int ret;
2955	u64 off = start;
2956	u64 max = start + len;
2957	u32 flags = 0;
2958	u64 disko = 0;
2959	struct extent_map *em = NULL;
2960	struct extent_state *cached_state = NULL;
2961	int end = 0;
2962	u64 em_start = 0, em_len = 0;
2963	unsigned long emflags;
2964	ret = 0;
2965
2966	if (len == 0)
2967		return -EINVAL;
2968
2969	lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
2970			 &cached_state, GFP_NOFS);
2971	em = get_extent(inode, NULL, 0, off, max - off, 0);
2972	if (!em)
2973		goto out;
2974	if (IS_ERR(em)) {
2975		ret = PTR_ERR(em);
2976		goto out;
2977	}
2978	while (!end) {
2979		off = em->start + em->len;
2980		if (off >= max)
2981			end = 1;
2982
2983		em_start = em->start;
2984		em_len = em->len;
2985
2986		disko = 0;
2987		flags = 0;
2988
2989		if (em->block_start == EXTENT_MAP_LAST_BYTE) {
2990			end = 1;
2991			flags |= FIEMAP_EXTENT_LAST;
2992		} else if (em->block_start == EXTENT_MAP_HOLE) {
2993			flags |= FIEMAP_EXTENT_UNWRITTEN;
2994		} else if (em->block_start == EXTENT_MAP_INLINE) {
2995			flags |= (FIEMAP_EXTENT_DATA_INLINE |
2996				  FIEMAP_EXTENT_NOT_ALIGNED);
2997		} else if (em->block_start == EXTENT_MAP_DELALLOC) {
2998			flags |= (FIEMAP_EXTENT_DELALLOC |
2999				  FIEMAP_EXTENT_UNKNOWN);
3000		} else {
3001			disko = em->block_start;
3002		}
3003		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
3004			flags |= FIEMAP_EXTENT_ENCODED;
3005
3006		emflags = em->flags;
3007		free_extent_map(em);
3008		em = NULL;
3009
3010		if (!end) {
3011			em = get_extent(inode, NULL, 0, off, max - off, 0);
3012			if (!em)
3013				goto out;
3014			if (IS_ERR(em)) {
3015				ret = PTR_ERR(em);
3016				goto out;
3017			}
3018			emflags = em->flags;
3019		}
3020		if (test_bit(EXTENT_FLAG_VACANCY, &emflags)) {
3021			flags |= FIEMAP_EXTENT_LAST;
3022			end = 1;
3023		}
3024
3025		ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
3026					em_len, flags);
3027		if (ret)
3028			goto out_free;
3029	}
3030out_free:
3031	free_extent_map(em);
3032out:
3033	unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len,
3034			     &cached_state, GFP_NOFS);
3035	return ret;
3036}
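
/*
 * extent_fiemap() does the real work of the FIEMAP ioctl for extent-mapped
 * inodes.  A minimal ->fiemap inode operation can simply forward to it
 * (sketch; btrfs_get_extent() as the mapping callback is an assumption):
 *
 *	static int example_fiemap(struct inode *inode,
 *				  struct fiemap_extent_info *fieinfo,
 *				  u64 start, u64 len)
 *	{
 *		return extent_fiemap(inode, fieinfo, start, len,
 *				     btrfs_get_extent);
 *	}
 */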
3037
3038static inline struct page *extent_buffer_page(struct extent_buffer *eb,
3039					      unsigned long i)
3040{
3041	struct page *p;
3042	struct address_space *mapping;
3043
3044	if (i == 0)
3045		return eb->first_page;
3046	i += eb->start >> PAGE_CACHE_SHIFT;
3047	mapping = eb->first_page->mapping;
3048	if (!mapping)
3049		return NULL;
3050
3051	/*
3052	 * extent_buffer_page is only called after pinning the page
3053	 * by increasing the reference count.  So we know the page must
3054	 * be in the radix tree.
3055	 */
3056	rcu_read_lock();
3057	p = radix_tree_lookup(&mapping->page_tree, i);
3058	rcu_read_unlock();
3059
3060	return p;
3061}
3062
3063static inline unsigned long num_extent_pages(u64 start, u64 len)
3064{
3065	return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
3066		(start >> PAGE_CACHE_SHIFT);
3067}
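
/*
 * Example: with 4K pages, a 16K extent buffer starting at the page-aligned
 * offset 16384 spans ((16384 + 16384 + 4095) >> 12) - (16384 >> 12) = 4
 * pages, while an unaligned 4K buffer starting at 6144 spans 2 pages.
 */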
3068
3069static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
3070						   u64 start,
3071						   unsigned long len,
3072						   gfp_t mask)
3073{
3074	struct extent_buffer *eb = NULL;
3075#if LEAK_DEBUG
3076	unsigned long flags;
3077#endif
3078
	eb = kmem_cache_zalloc(extent_buffer_cache, mask);
	if (eb == NULL)
		return NULL;
	eb->start = start;
3081	eb->len = len;
3082	spin_lock_init(&eb->lock);
3083	init_waitqueue_head(&eb->lock_wq);
3084
3085#if LEAK_DEBUG
3086	spin_lock_irqsave(&leak_lock, flags);
3087	list_add(&eb->leak_list, &buffers);
3088	spin_unlock_irqrestore(&leak_lock, flags);
3089#endif
3090	atomic_set(&eb->refs, 1);
3091
3092	return eb;
3093}
3094
3095static void __free_extent_buffer(struct extent_buffer *eb)
3096{
3097#if LEAK_DEBUG
3098	unsigned long flags;
3099	spin_lock_irqsave(&leak_lock, flags);
3100	list_del(&eb->leak_list);
3101	spin_unlock_irqrestore(&leak_lock, flags);
3102#endif
3103	kmem_cache_free(extent_buffer_cache, eb);
3104}
3105
3106struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
3107					  u64 start, unsigned long len,
3108					  struct page *page0,
3109					  gfp_t mask)
3110{
3111	unsigned long num_pages = num_extent_pages(start, len);
3112	unsigned long i;
3113	unsigned long index = start >> PAGE_CACHE_SHIFT;
3114	struct extent_buffer *eb;
3115	struct extent_buffer *exists = NULL;
3116	struct page *p;
3117	struct address_space *mapping = tree->mapping;
3118	int uptodate = 1;
3119
3120	spin_lock(&tree->buffer_lock);
3121	eb = buffer_search(tree, start);
3122	if (eb) {
3123		atomic_inc(&eb->refs);
3124		spin_unlock(&tree->buffer_lock);
3125		mark_page_accessed(eb->first_page);
3126		return eb;
3127	}
3128	spin_unlock(&tree->buffer_lock);
3129
3130	eb = __alloc_extent_buffer(tree, start, len, mask);
3131	if (!eb)
3132		return NULL;
3133
3134	if (page0) {
3135		eb->first_page = page0;
3136		i = 1;
3137		index++;
3138		page_cache_get(page0);
3139		mark_page_accessed(page0);
3140		set_page_extent_mapped(page0);
3141		set_page_extent_head(page0, len);
3142		uptodate = PageUptodate(page0);
3143	} else {
3144		i = 0;
3145	}
3146	for (; i < num_pages; i++, index++) {
3147		p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM);
3148		if (!p) {
3149			WARN_ON(1);
3150			goto free_eb;
3151		}
3152		set_page_extent_mapped(p);
3153		mark_page_accessed(p);
3154		if (i == 0) {
3155			eb->first_page = p;
3156			set_page_extent_head(p, len);
3157		} else {
3158			set_page_private(p, EXTENT_PAGE_PRIVATE);
3159		}
3160		if (!PageUptodate(p))
3161			uptodate = 0;
3162		unlock_page(p);
3163	}
3164	if (uptodate)
3165		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3166
3167	spin_lock(&tree->buffer_lock);
3168	exists = buffer_tree_insert(tree, start, &eb->rb_node);
3169	if (exists) {
3170		/* add one reference for the caller */
3171		atomic_inc(&exists->refs);
3172		spin_unlock(&tree->buffer_lock);
3173		goto free_eb;
3174	}
3175	/* add one reference for the tree */
3176	atomic_inc(&eb->refs);
3177	spin_unlock(&tree->buffer_lock);
3178	return eb;
3179
3180free_eb:
3181	if (!atomic_dec_and_test(&eb->refs))
3182		return exists;
3183	for (index = 1; index < i; index++)
3184		page_cache_release(extent_buffer_page(eb, index));
3185	page_cache_release(extent_buffer_page(eb, 0));
3186	__free_extent_buffer(eb);
3187	return exists;
3188}
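
/*
 * alloc_extent_buffer() resolves races at insertion time: two tasks can both
 * miss in the buffer tree and allocate, but buffer_tree_insert() hands the
 * loser the buffer that is already in the tree and the loser's copy is torn
 * down under free_eb above.  Typical usage (sketch):
 *
 *	eb = alloc_extent_buffer(tree, start, len, NULL, GFP_NOFS);
 *	if (!eb)
 *		return -ENOMEM;
 *	... use the buffer ...
 *	free_extent_buffer(eb);
 */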
3189
3190struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
3191					 u64 start, unsigned long len,
3192					  gfp_t mask)
3193{
3194	struct extent_buffer *eb;
3195
3196	spin_lock(&tree->buffer_lock);
3197	eb = buffer_search(tree, start);
3198	if (eb)
3199		atomic_inc(&eb->refs);
3200	spin_unlock(&tree->buffer_lock);
3201
3202	if (eb)
3203		mark_page_accessed(eb->first_page);
3204
3205	return eb;
3206}
3207
3208void free_extent_buffer(struct extent_buffer *eb)
3209{
3210	if (!eb)
3211		return;
3212
3213	if (!atomic_dec_and_test(&eb->refs))
3214		return;
3215
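	/*
	 * Buffers are normally torn down through try_release_extent_buffer(),
	 * which drops the pages and frees the buffer while it still holds the
	 * tree's reference.  Reaching zero here means the reference counting
	 * got out of balance, so complain instead of freeing.
	 */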
3216	WARN_ON(1);
3217}
3218
3219int clear_extent_buffer_dirty(struct extent_io_tree *tree,
3220			      struct extent_buffer *eb)
3221{
3222	unsigned long i;
3223	unsigned long num_pages;
3224	struct page *page;
3225
3226	num_pages = num_extent_pages(eb->start, eb->len);
3227
3228	for (i = 0; i < num_pages; i++) {
3229		page = extent_buffer_page(eb, i);
3230		if (!PageDirty(page))
3231			continue;
3232
3233		lock_page(page);
3234		if (i == 0)
3235			set_page_extent_head(page, eb->len);
3236		else
3237			set_page_private(page, EXTENT_PAGE_PRIVATE);
3238
3239		clear_page_dirty_for_io(page);
3240		spin_lock_irq(&page->mapping->tree_lock);
3241		if (!PageDirty(page)) {
3242			radix_tree_tag_clear(&page->mapping->page_tree,
3243						page_index(page),
3244						PAGECACHE_TAG_DIRTY);
3245		}
3246		spin_unlock_irq(&page->mapping->tree_lock);
3247		unlock_page(page);
3248	}
3249	return 0;
3250}
3251
3252int wait_on_extent_buffer_writeback(struct extent_io_tree *tree,
3253				    struct extent_buffer *eb)
3254{
3255	return wait_on_extent_writeback(tree, eb->start,
3256					eb->start + eb->len - 1);
3257}
3258
3259int set_extent_buffer_dirty(struct extent_io_tree *tree,
3260			     struct extent_buffer *eb)
3261{
3262	unsigned long i;
3263	unsigned long num_pages;
3264	int was_dirty = 0;
3265
3266	was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
3267	num_pages = num_extent_pages(eb->start, eb->len);
3268	for (i = 0; i < num_pages; i++)
3269		__set_page_dirty_nobuffers(extent_buffer_page(eb, i));
3270	return was_dirty;
3271}
3272
3273int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
3274				struct extent_buffer *eb,
3275				struct extent_state **cached_state)
3276{
3277	unsigned long i;
3278	struct page *page;
3279	unsigned long num_pages;
3280
3281	num_pages = num_extent_pages(eb->start, eb->len);
3282	clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3283
3284	clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3285			      cached_state, GFP_NOFS);
3286	for (i = 0; i < num_pages; i++) {
3287		page = extent_buffer_page(eb, i);
3288		if (page)
3289			ClearPageUptodate(page);
3290	}
3291	return 0;
3292}
3293
3294int set_extent_buffer_uptodate(struct extent_io_tree *tree,
3295				struct extent_buffer *eb)
3296{
3297	unsigned long i;
3298	struct page *page;
3299	unsigned long num_pages;
3300
3301	num_pages = num_extent_pages(eb->start, eb->len);
3302
3303	set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3304			    GFP_NOFS);
3305	for (i = 0; i < num_pages; i++) {
3306		page = extent_buffer_page(eb, i);
3307		if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
3308		    ((i == num_pages - 1) &&
3309		     ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
3310			check_page_uptodate(tree, page);
3311			continue;
3312		}
3313		SetPageUptodate(page);
3314	}
3315	return 0;
3316}
3317
3318int extent_range_uptodate(struct extent_io_tree *tree,
3319			  u64 start, u64 end)
3320{
3321	struct page *page;
3322	int ret;
3323	int pg_uptodate = 1;
3324	int uptodate;
3325	unsigned long index;
3326
3327	ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL);
3328	if (ret)
3329		return 1;
3330	while (start <= end) {
3331		index = start >> PAGE_CACHE_SHIFT;
3332		page = find_get_page(tree->mapping, index);
3333		uptodate = PageUptodate(page);
3334		page_cache_release(page);
3335		if (!uptodate) {
3336			pg_uptodate = 0;
3337			break;
3338		}
3339		start += PAGE_CACHE_SIZE;
3340	}
3341	return pg_uptodate;
3342}
3343
3344int extent_buffer_uptodate(struct extent_io_tree *tree,
3345			   struct extent_buffer *eb,
3346			   struct extent_state *cached_state)
3347{
3348	int ret = 0;
3349	unsigned long num_pages;
3350	unsigned long i;
3351	struct page *page;
3352	int pg_uptodate = 1;
3353
3354	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
3355		return 1;
3356
3357	ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
3358			   EXTENT_UPTODATE, 1, cached_state);
3359	if (ret)
3360		return ret;
3361
3362	num_pages = num_extent_pages(eb->start, eb->len);
3363	for (i = 0; i < num_pages; i++) {
3364		page = extent_buffer_page(eb, i);
3365		if (!PageUptodate(page)) {
3366			pg_uptodate = 0;
3367			break;
3368		}
3369	}
3370	return pg_uptodate;
3371}
3372
3373int read_extent_buffer_pages(struct extent_io_tree *tree,
3374			     struct extent_buffer *eb,
3375			     u64 start, int wait,
3376			     get_extent_t *get_extent, int mirror_num)
3377{
3378	unsigned long i;
3379	unsigned long start_i;
3380	struct page *page;
3381	int err;
3382	int ret = 0;
3383	int locked_pages = 0;
3384	int all_uptodate = 1;
3385	int inc_all_pages = 0;
3386	unsigned long num_pages;
3387	struct bio *bio = NULL;
3388	unsigned long bio_flags = 0;
3389
3390	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
3391		return 0;
3392
3393	if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
3394			   EXTENT_UPTODATE, 1, NULL)) {
3395		return 0;
3396	}
3397
3398	if (start) {
3399		WARN_ON(start < eb->start);
3400		start_i = (start >> PAGE_CACHE_SHIFT) -
3401			(eb->start >> PAGE_CACHE_SHIFT);
3402	} else {
3403		start_i = 0;
3404	}
3405
3406	num_pages = num_extent_pages(eb->start, eb->len);
3407	for (i = start_i; i < num_pages; i++) {
3408		page = extent_buffer_page(eb, i);
3409		if (!wait) {
3410			if (!trylock_page(page))
3411				goto unlock_exit;
3412		} else {
3413			lock_page(page);
3414		}
3415		locked_pages++;
3416		if (!PageUptodate(page))
3417			all_uptodate = 0;
3418	}
3419	if (all_uptodate) {
3420		if (start_i == 0)
3421			set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3422		goto unlock_exit;
3423	}
3424
3425	for (i = start_i; i < num_pages; i++) {
3426		page = extent_buffer_page(eb, i);
3427		if (inc_all_pages)
3428			page_cache_get(page);
3429		if (!PageUptodate(page)) {
3430			if (start_i == 0)
3431				inc_all_pages = 1;
3432			ClearPageError(page);
3433			err = __extent_read_full_page(tree, page,
3434						      get_extent, &bio,
3435						      mirror_num, &bio_flags);
3436			if (err)
3437				ret = err;
3438		} else {
3439			unlock_page(page);
3440		}
3441	}
3442
3443	if (bio)
3444		submit_one_bio(READ, bio, mirror_num, bio_flags);
3445
3446	if (ret || !wait)
3447		return ret;
3448
3449	for (i = start_i; i < num_pages; i++) {
3450		page = extent_buffer_page(eb, i);
3451		wait_on_page_locked(page);
3452		if (!PageUptodate(page))
3453			ret = -EIO;
3454	}
3455
3456	if (!ret)
3457		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3458	return ret;
3459
3460unlock_exit:
3461	i = start_i;
3462	while (locked_pages > 0) {
3463		page = extent_buffer_page(eb, i);
3464		i++;
3465		unlock_page(page);
3466		locked_pages--;
3467	}
3468	return ret;
3469}
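
/*
 * Typical metadata read path (a sketch; using btree_get_extent() from
 * disk-io.c as the get_extent_t callback is an assumption here):
 *
 *	eb = alloc_extent_buffer(tree, start, len, NULL, GFP_NOFS);
 *	if (!eb)
 *		return NULL;
 *	ret = read_extent_buffer_pages(tree, eb, 0, 1, btree_get_extent, 0);
 *	if (ret) {
 *		free_extent_buffer(eb);
 *		return NULL;
 *	}
 */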
3470
3471void read_extent_buffer(struct extent_buffer *eb, void *dstv,
3472			unsigned long start,
3473			unsigned long len)
3474{
3475	size_t cur;
3476	size_t offset;
3477	struct page *page;
3478	char *kaddr;
3479	char *dst = (char *)dstv;
3480	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3481	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3482
3483	WARN_ON(start > eb->len);
3484	WARN_ON(start + len > eb->start + eb->len);
3485
3486	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3487
3488	while (len > 0) {
3489		page = extent_buffer_page(eb, i);
3490
3491		cur = min(len, (PAGE_CACHE_SIZE - offset));
3492		kaddr = kmap_atomic(page, KM_USER1);
3493		memcpy(dst, kaddr + offset, cur);
3494		kunmap_atomic(kaddr, KM_USER1);
3495
3496		dst += cur;
3497		len -= cur;
3498		offset = 0;
3499		i++;
3500	}
3501}
3502
3503int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
3504			       unsigned long min_len, char **token, char **map,
3505			       unsigned long *map_start,
3506			       unsigned long *map_len, int km)
3507{
3508	size_t offset = start & (PAGE_CACHE_SIZE - 1);
3509	char *kaddr;
3510	struct page *p;
3511	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3512	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3513	unsigned long end_i = (start_offset + start + min_len - 1) >>
3514		PAGE_CACHE_SHIFT;
3515
3516	if (i != end_i)
3517		return -EINVAL;
3518
3519	if (i == 0) {
3520		offset = start_offset;
3521		*map_start = 0;
3522	} else {
3523		offset = 0;
3524		*map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
3525	}
3526
	if (start + min_len > eb->len) {
		printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
		       "wanted %lu %lu\n", (unsigned long long)eb->start,
		       eb->len, start, min_len);
		WARN_ON(1);
		return -EINVAL;
	}
3533
3534	p = extent_buffer_page(eb, i);
3535	kaddr = kmap_atomic(p, km);
3536	*token = kaddr;
3537	*map = kaddr + offset;
3538	*map_len = PAGE_CACHE_SIZE - offset;
3539	return 0;
3540}
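
/*
 * Example for the single-page constraint above: with 4K pages and a
 * page-aligned buffer, start = 4000 with min_len = 200 would straddle the
 * first page boundary, so map_private_extent_buffer() returns -EINVAL and
 * callers are expected to fall back to read_extent_buffer() /
 * write_extent_buffer() copies instead.
 */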
3541
3542int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
3543		      unsigned long min_len,
3544		      char **token, char **map,
3545		      unsigned long *map_start,
3546		      unsigned long *map_len, int km)
3547{
3548	int err;
3549	int save = 0;
3550	if (eb->map_token) {
3551		unmap_extent_buffer(eb, eb->map_token, km);
3552		eb->map_token = NULL;
3553		save = 1;
3554	}
3555	err = map_private_extent_buffer(eb, start, min_len, token, map,
3556				       map_start, map_len, km);
3557	if (!err && save) {
3558		eb->map_token = *token;
3559		eb->kaddr = *map;
3560		eb->map_start = *map_start;
3561		eb->map_len = *map_len;
3562	}
3563	return err;
3564}
3565
3566void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km)
3567{
3568	kunmap_atomic(token, km);
3569}
3570
3571int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
3572			  unsigned long start,
3573			  unsigned long len)
3574{
3575	size_t cur;
3576	size_t offset;
3577	struct page *page;
3578	char *kaddr;
3579	char *ptr = (char *)ptrv;
3580	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3581	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3582	int ret = 0;
3583
3584	WARN_ON(start > eb->len);
3585	WARN_ON(start + len > eb->start + eb->len);
3586
3587	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3588
3589	while (len > 0) {
3590		page = extent_buffer_page(eb, i);
3591
3592		cur = min(len, (PAGE_CACHE_SIZE - offset));
3593
3594		kaddr = kmap_atomic(page, KM_USER0);
3595		ret = memcmp(ptr, kaddr + offset, cur);
3596		kunmap_atomic(kaddr, KM_USER0);
3597		if (ret)
3598			break;
3599
3600		ptr += cur;
3601		len -= cur;
3602		offset = 0;
3603		i++;
3604	}
3605	return ret;
3606}
3607
3608void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
3609			 unsigned long start, unsigned long len)
3610{
3611	size_t cur;
3612	size_t offset;
3613	struct page *page;
3614	char *kaddr;
3615	char *src = (char *)srcv;
3616	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3617	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3618
3619	WARN_ON(start > eb->len);
3620	WARN_ON(start + len > eb->start + eb->len);
3621
3622	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3623
3624	while (len > 0) {
3625		page = extent_buffer_page(eb, i);
3626		WARN_ON(!PageUptodate(page));
3627
3628		cur = min(len, PAGE_CACHE_SIZE - offset);
3629		kaddr = kmap_atomic(page, KM_USER1);
3630		memcpy(kaddr + offset, src, cur);
3631		kunmap_atomic(kaddr, KM_USER1);
3632
3633		src += cur;
3634		len -= cur;
3635		offset = 0;
3636		i++;
3637	}
3638}
3639
3640void memset_extent_buffer(struct extent_buffer *eb, char c,
3641			  unsigned long start, unsigned long len)
3642{
3643	size_t cur;
3644	size_t offset;
3645	struct page *page;
3646	char *kaddr;
3647	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3648	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3649
3650	WARN_ON(start > eb->len);
3651	WARN_ON(start + len > eb->start + eb->len);
3652
3653	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3654
3655	while (len > 0) {
3656		page = extent_buffer_page(eb, i);
3657		WARN_ON(!PageUptodate(page));
3658
3659		cur = min(len, PAGE_CACHE_SIZE - offset);
3660		kaddr = kmap_atomic(page, KM_USER0);
3661		memset(kaddr + offset, c, cur);
3662		kunmap_atomic(kaddr, KM_USER0);
3663
3664		len -= cur;
3665		offset = 0;
3666		i++;
3667	}
3668}
3669
3670void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
3671			unsigned long dst_offset, unsigned long src_offset,
3672			unsigned long len)
3673{
3674	u64 dst_len = dst->len;
3675	size_t cur;
3676	size_t offset;
3677	struct page *page;
3678	char *kaddr;
3679	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3680	unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
3681
3682	WARN_ON(src->len != dst_len);
3683
3684	offset = (start_offset + dst_offset) &
3685		((unsigned long)PAGE_CACHE_SIZE - 1);
3686
3687	while (len > 0) {
3688		page = extent_buffer_page(dst, i);
3689		WARN_ON(!PageUptodate(page));
3690
3691		cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
3692
3693		kaddr = kmap_atomic(page, KM_USER0);
3694		read_extent_buffer(src, kaddr + offset, src_offset, cur);
3695		kunmap_atomic(kaddr, KM_USER0);
3696
3697		src_offset += cur;
3698		len -= cur;
3699		offset = 0;
3700		i++;
3701	}
3702}
3703
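/*
 * move_pages() copies back to front so overlapping ranges inside a single
 * extent buffer are handled like memmove(); memmove_extent_buffer() only
 * uses it when the destination sits above the source, where a plain
 * front-to-back copy could clobber bytes it still needs to read.
 */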
3704static void move_pages(struct page *dst_page, struct page *src_page,
3705		       unsigned long dst_off, unsigned long src_off,
3706		       unsigned long len)
3707{
3708	char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
3709	if (dst_page == src_page) {
3710		memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
3711	} else {
3712		char *src_kaddr = kmap_atomic(src_page, KM_USER1);
3713		char *p = dst_kaddr + dst_off + len;
3714		char *s = src_kaddr + src_off + len;
3715
3716		while (len--)
3717			*--p = *--s;
3718
3719		kunmap_atomic(src_kaddr, KM_USER1);
3720	}
3721	kunmap_atomic(dst_kaddr, KM_USER0);
3722}
3723
3724static void copy_pages(struct page *dst_page, struct page *src_page,
3725		       unsigned long dst_off, unsigned long src_off,
3726		       unsigned long len)
3727{
3728	char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
3729	char *src_kaddr;
3730
3731	if (dst_page != src_page)
3732		src_kaddr = kmap_atomic(src_page, KM_USER1);
3733	else
3734		src_kaddr = dst_kaddr;
3735
3736	memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
3737	kunmap_atomic(dst_kaddr, KM_USER0);
3738	if (dst_page != src_page)
3739		kunmap_atomic(src_kaddr, KM_USER1);
3740}
3741
3742void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3743			   unsigned long src_offset, unsigned long len)
3744{
3745	size_t cur;
3746	size_t dst_off_in_page;
3747	size_t src_off_in_page;
3748	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3749	unsigned long dst_i;
3750	unsigned long src_i;
3751
3752	if (src_offset + len > dst->len) {
3753		printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
3754		       "len %lu dst len %lu\n", src_offset, len, dst->len);
3755		BUG_ON(1);
3756	}
3757	if (dst_offset + len > dst->len) {
3758		printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
3759		       "len %lu dst len %lu\n", dst_offset, len, dst->len);
3760		BUG_ON(1);
3761	}
3762
3763	while (len > 0) {
3764		dst_off_in_page = (start_offset + dst_offset) &
3765			((unsigned long)PAGE_CACHE_SIZE - 1);
3766		src_off_in_page = (start_offset + src_offset) &
3767			((unsigned long)PAGE_CACHE_SIZE - 1);
3768
3769		dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
3770		src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;
3771
3772		cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
3773					       src_off_in_page));
3774		cur = min_t(unsigned long, cur,
3775			(unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));
3776
3777		copy_pages(extent_buffer_page(dst, dst_i),
3778			   extent_buffer_page(dst, src_i),
3779			   dst_off_in_page, src_off_in_page, cur);
3780
3781		src_offset += cur;
3782		dst_offset += cur;
3783		len -= cur;
3784	}
3785}
3786
3787void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3788			   unsigned long src_offset, unsigned long len)
3789{
3790	size_t cur;
3791	size_t dst_off_in_page;
3792	size_t src_off_in_page;
3793	unsigned long dst_end = dst_offset + len - 1;
3794	unsigned long src_end = src_offset + len - 1;
3795	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3796	unsigned long dst_i;
3797	unsigned long src_i;
3798
3799	if (src_offset + len > dst->len) {
3800		printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
3801		       "len %lu len %lu\n", src_offset, len, dst->len);
3802		BUG_ON(1);
3803	}
3804	if (dst_offset + len > dst->len) {
3805		printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
3806		       "len %lu len %lu\n", dst_offset, len, dst->len);
3807		BUG_ON(1);
3808	}
3809	if (dst_offset < src_offset) {
3810		memcpy_extent_buffer(dst, dst_offset, src_offset, len);
3811		return;
3812	}
3813	while (len > 0) {
3814		dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
3815		src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;
3816
3817		dst_off_in_page = (start_offset + dst_end) &
3818			((unsigned long)PAGE_CACHE_SIZE - 1);
3819		src_off_in_page = (start_offset + src_end) &
3820			((unsigned long)PAGE_CACHE_SIZE - 1);
3821
3822		cur = min_t(unsigned long, len, src_off_in_page + 1);
3823		cur = min(cur, dst_off_in_page + 1);
3824		move_pages(extent_buffer_page(dst, dst_i),
3825			   extent_buffer_page(dst, src_i),
3826			   dst_off_in_page - cur + 1,
3827			   src_off_in_page - cur + 1, cur);
3828
3829		dst_end -= cur;
3830		src_end -= cur;
3831		len -= cur;
3832	}
3833}
3834
3835int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
3836{
3837	u64 start = page_offset(page);
3838	struct extent_buffer *eb;
3839	int ret = 1;
3840	unsigned long i;
3841	unsigned long num_pages;
3842
3843	spin_lock(&tree->buffer_lock);
3844	eb = buffer_search(tree, start);
3845	if (!eb)
3846		goto out;
3847
3848	if (atomic_read(&eb->refs) > 1) {
3849		ret = 0;
3850		goto out;
3851	}
3852	if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
3853		ret = 0;
3854		goto out;
3855	}
3856	/* at this point we can safely release the extent buffer */
3857	num_pages = num_extent_pages(eb->start, eb->len);
3858	for (i = 0; i < num_pages; i++)
3859		page_cache_release(extent_buffer_page(eb, i));
3860	rb_erase(&eb->rb_node, &tree->buffer);
3861	__free_extent_buffer(eb);
3862out:
3863	spin_unlock(&tree->buffer_lock);
3864	return ret;
3865}
3866