1/*
2 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
3 * Written by Alex Tomas <alex@clusterfs.com>
4 *
5 * Architecture independence:
6 *   Copyright (c) 2005, Bull S.A.
7 *   Written by Pierre Peiffer <pierre.peiffer@bull.net>
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License version 2 as
11 * published by the Free Software Foundation.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16 * GNU General Public License for more details.
17 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
21 */
22
23/*
24 * Extents support for EXT4
25 *
26 * TODO:
27 *   - ext4*_error() should be used in some situations
28 *   - analyze all BUG()/BUG_ON(), use -EIO where appropriate
29 *   - smart tree reduction
30 */
31
32#include <linux/module.h>
33#include <linux/fs.h>
34#include <linux/time.h>
35#include <linux/jbd2.h>
36#include <linux/highuid.h>
37#include <linux/pagemap.h>
38#include <linux/quotaops.h>
39#include <linux/string.h>
40#include <linux/slab.h>
41#include <linux/falloc.h>
42#include <asm/uaccess.h>
43#include <linux/fiemap.h>
44#include "ext4_jbd2.h"
45#include "ext4_extents.h"
46
47
48/*
49 * ext_pblock:
50 * combine low and high parts of physical block number into ext4_fsblk_t
51 */
52ext4_fsblk_t ext_pblock(struct ext4_extent *ex)
53{
54	ext4_fsblk_t block;
55
56	block = le32_to_cpu(ex->ee_start_lo);
57	block |= ((ext4_fsblk_t) le16_to_cpu(ex->ee_start_hi) << 31) << 1;
58	return block;
59}
60
61/*
62 * idx_pblock:
63 * combine low and high parts of a leaf physical block number into ext4_fsblk_t
64 */
65ext4_fsblk_t idx_pblock(struct ext4_extent_idx *ix)
66{
67	ext4_fsblk_t block;
68
69	block = le32_to_cpu(ix->ei_leaf_lo);
70	block |= ((ext4_fsblk_t) le16_to_cpu(ix->ei_leaf_hi) << 31) << 1;
71	return block;
72}
73
74/*
75 * ext4_ext_store_pblock:
76 * stores a large physical block number into an extent struct,
77 * breaking it into parts
78 */
79void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb)
80{
81	ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
82	ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
83}
84
85/*
86 * ext4_idx_store_pblock:
87 * stores a large physical block number into an index struct,
88 * breaking it into parts
89 */
90static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb)
91{
92	ix->ei_leaf_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
93	ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
94}
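
/*
 * Example: ext4_ext_store_pblock() splits the 48-bit physical block
 * number 0x123456789abc into ee_start_lo = 0x56789abc and
 * ee_start_hi = 0x1234; ext_pblock() reassembles the same value as
 * 0x56789abc | ((ext4_fsblk_t)0x1234 << 32) = 0x123456789abc.
 */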
95
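/*
 * Make sure 'handle' has at least 'needed' credits.  If not, extend
 * the transaction if possible; otherwise restart it and return
 * -EAGAIN so that the caller re-reads any state that may have changed
 * while the transaction was released.
 */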
96static int ext4_ext_truncate_extend_restart(handle_t *handle,
97					    struct inode *inode,
98					    int needed)
99{
100	int err;
101
102	if (!ext4_handle_valid(handle))
103		return 0;
104	if (handle->h_buffer_credits > needed)
105		return 0;
106	err = ext4_journal_extend(handle, needed);
107	if (err <= 0)
108		return err;
109	err = ext4_truncate_restart_trans(handle, inode, needed);
110	if (err == 0)
111		err = -EAGAIN;
112
113	return err;
114}
115
116/*
117 * could return:
118 *  - EROFS
119 *  - ENOMEM
120 */
121static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
122				struct ext4_ext_path *path)
123{
124	if (path->p_bh) {
125		/* path points to block */
126		return ext4_journal_get_write_access(handle, path->p_bh);
127	}
128	/* path points to leaf/index in inode body */
129	/* we use in-core data, no need to protect them */
130	return 0;
131}
132
133/*
134 * could return:
135 *  - EROFS
136 *  - ENOMEM
137 *  - EIO
138 */
139static int ext4_ext_dirty(handle_t *handle, struct inode *inode,
140				struct ext4_ext_path *path)
141{
142	int err;
143	if (path->p_bh) {
144		/* path points to block */
145		err = ext4_handle_dirty_metadata(handle, inode, path->p_bh);
146	} else {
147		/* path points to leaf/index in inode body */
148		err = ext4_mark_inode_dirty(handle, inode);
149	}
150	return err;
151}
152
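/*
 * ext4_ext_find_goal:
 * return a preferred (goal) physical block for an allocation at
 * logical block @block: close to the extent found in @path if there
 * is one, else the block holding the index/leaf itself, else a spot
 * in the inode's block group chosen by the pid-colouring heuristic
 * below (skipped entirely for delayed allocation).
 */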
153static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
154			      struct ext4_ext_path *path,
155			      ext4_lblk_t block)
156{
157	struct ext4_inode_info *ei = EXT4_I(inode);
158	ext4_fsblk_t bg_start;
159	ext4_fsblk_t last_block;
160	ext4_grpblk_t colour;
161	ext4_group_t block_group;
162	int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
163	int depth;
164
165	if (path) {
166		struct ext4_extent *ex;
167		depth = path->p_depth;
168
169		/* try to predict block placement */
170		ex = path[depth].p_ext;
171		if (ex)
172			return ext_pblock(ex)+(block-le32_to_cpu(ex->ee_block));
173
174		/* it looks like index is empty;
175		 * try to find starting block from index itself */
176		if (path[depth].p_bh)
177			return path[depth].p_bh->b_blocknr;
178	}
179
180	/* OK. use inode's group */
181	block_group = ei->i_block_group;
182	if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
183		/*
184		 * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME
185		 * block groups per flexgroup, reserve the first block
186		 * group for directories and special files.  Regular
187		 * files will start at the second block group.  This
188		 * tends to speed up directory access and improves
189		 * fsck times.
190		 */
191		block_group &= ~(flex_size-1);
192		if (S_ISREG(inode->i_mode))
193			block_group++;
194	}
195	bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
196	last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
197
198	/*
	 * If we are doing delayed allocation, we don't need to take
	 * colour into account.
201	 */
202	if (test_opt(inode->i_sb, DELALLOC))
203		return bg_start;
204
205	if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
206		colour = (current->pid % 16) *
207			(EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
208	else
209		colour = (current->pid % 16) * ((last_block - bg_start) / 16);
210	return bg_start + colour + block;
211}
212
213/*
 * Allocation for a metadata block
215 */
216static ext4_fsblk_t
217ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
218			struct ext4_ext_path *path,
219			struct ext4_extent *ex, int *err)
220{
221	ext4_fsblk_t goal, newblock;
222
223	goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
224	newblock = ext4_new_meta_blocks(handle, inode, goal, NULL, err);
225	return newblock;
226}
227
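/*
 * ext4_ext_space_block() and ext4_ext_space_block_idx() return how
 * many extent/index entries fit into a full tree block, while the
 * _root variants do the same for the space available in the inode's
 * i_data.  With AGGRESSIVE_TEST defined (and @check == 0) the values
 * are clamped to a handful of entries so that the tree grows and
 * splits even for small files.
 */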
228static inline int ext4_ext_space_block(struct inode *inode, int check)
229{
230	int size;
231
232	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
233			/ sizeof(struct ext4_extent);
234	if (!check) {
235#ifdef AGGRESSIVE_TEST
236		if (size > 6)
237			size = 6;
238#endif
239	}
240	return size;
241}
242
243static inline int ext4_ext_space_block_idx(struct inode *inode, int check)
244{
245	int size;
246
247	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
248			/ sizeof(struct ext4_extent_idx);
249	if (!check) {
250#ifdef AGGRESSIVE_TEST
251		if (size > 5)
252			size = 5;
253#endif
254	}
255	return size;
256}
257
258static inline int ext4_ext_space_root(struct inode *inode, int check)
259{
260	int size;
261
262	size = sizeof(EXT4_I(inode)->i_data);
263	size -= sizeof(struct ext4_extent_header);
264	size /= sizeof(struct ext4_extent);
265	if (!check) {
266#ifdef AGGRESSIVE_TEST
267		if (size > 3)
268			size = 3;
269#endif
270	}
271	return size;
272}
273
274static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
275{
276	int size;
277
278	size = sizeof(EXT4_I(inode)->i_data);
279	size -= sizeof(struct ext4_extent_header);
280	size /= sizeof(struct ext4_extent_idx);
281	if (!check) {
282#ifdef AGGRESSIVE_TEST
283		if (size > 4)
284			size = 4;
285#endif
286	}
287	return size;
288}
289
/*
 * Calculate the number of metadata blocks needed
 * to allocate a block at @lblock.
 * Worst case is one new index block at every level of the tree.
 */
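/*
 * For example, with a 4 KiB block size idxs = (4096 - 12) / 12 = 340
 * (12-byte header, 12-byte index entries), so a run of contiguous
 * delayed-allocation blocks is charged one extra metadata block
 * roughly every 340 blocks, another at every 340^2 blocks, and so on;
 * a non-contiguous block is charged the full ext_depth(inode) + 1.
 */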
295int ext4_ext_calc_metadata_amount(struct inode *inode, sector_t lblock)
296{
297	struct ext4_inode_info *ei = EXT4_I(inode);
298	int idxs, num = 0;
299
300	idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
301		/ sizeof(struct ext4_extent_idx));
302
303	/*
304	 * If the new delayed allocation block is contiguous with the
305	 * previous da block, it can share index blocks with the
306	 * previous block, so we only need to allocate a new index
	 * block every idxs leaf blocks.  At idxs**2 blocks, we need
	 * an additional index block, and at idxs**3 blocks, yet
	 * another index block.
310	 */
311	if (ei->i_da_metadata_calc_len &&
312	    ei->i_da_metadata_calc_last_lblock+1 == lblock) {
313		if ((ei->i_da_metadata_calc_len % idxs) == 0)
314			num++;
315		if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0)
316			num++;
317		if ((ei->i_da_metadata_calc_len % (idxs*idxs*idxs)) == 0) {
318			num++;
319			ei->i_da_metadata_calc_len = 0;
320		} else
321			ei->i_da_metadata_calc_len++;
322		ei->i_da_metadata_calc_last_lblock++;
323		return num;
324	}
325
326	/*
327	 * In the worst case we need a new set of index blocks at
328	 * every level of the inode's extent tree.
329	 */
330	ei->i_da_metadata_calc_len = 1;
331	ei->i_da_metadata_calc_last_lblock = lblock;
332	return ext_depth(inode) + 1;
333}
334
335static int
336ext4_ext_max_entries(struct inode *inode, int depth)
337{
338	int max;
339
340	if (depth == ext_depth(inode)) {
341		if (depth == 0)
342			max = ext4_ext_space_root(inode, 1);
343		else
344			max = ext4_ext_space_root_idx(inode, 1);
345	} else {
346		if (depth == 0)
347			max = ext4_ext_space_block(inode, 1);
348		else
349			max = ext4_ext_space_block_idx(inode, 1);
350	}
351
352	return max;
353}
354
355static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
356{
357	ext4_fsblk_t block = ext_pblock(ext);
358	int len = ext4_ext_get_actual_len(ext);
359
360	return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
361}
362
363static int ext4_valid_extent_idx(struct inode *inode,
364				struct ext4_extent_idx *ext_idx)
365{
366	ext4_fsblk_t block = idx_pblock(ext_idx);
367
368	return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1);
369}
370
371static int ext4_valid_extent_entries(struct inode *inode,
372				struct ext4_extent_header *eh,
373				int depth)
374{
375	struct ext4_extent *ext;
376	struct ext4_extent_idx *ext_idx;
377	unsigned short entries;
378	if (eh->eh_entries == 0)
379		return 1;
380
381	entries = le16_to_cpu(eh->eh_entries);
382
383	if (depth == 0) {
384		/* leaf entries */
385		ext = EXT_FIRST_EXTENT(eh);
386		while (entries) {
387			if (!ext4_valid_extent(inode, ext))
388				return 0;
389			ext++;
390			entries--;
391		}
392	} else {
393		ext_idx = EXT_FIRST_INDEX(eh);
394		while (entries) {
395			if (!ext4_valid_extent_idx(inode, ext_idx))
396				return 0;
397			ext_idx++;
398			entries--;
399		}
400	}
401	return 1;
402}
403
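/*
 * __ext4_ext_check:
 * sanity-check an extent header (magic, depth, entry counts and the
 * entries themselves) at the given @depth.  Returns 0 if the header
 * is sane, otherwise reports the corruption via ext4_error_inode()
 * and returns -EIO.
 */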
404static int __ext4_ext_check(const char *function, unsigned int line,
405			    struct inode *inode, struct ext4_extent_header *eh,
406			    int depth)
407{
408	const char *error_msg;
409	int max = 0;
410
411	if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) {
412		error_msg = "invalid magic";
413		goto corrupted;
414	}
415	if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) {
416		error_msg = "unexpected eh_depth";
417		goto corrupted;
418	}
419	if (unlikely(eh->eh_max == 0)) {
420		error_msg = "invalid eh_max";
421		goto corrupted;
422	}
423	max = ext4_ext_max_entries(inode, depth);
424	if (unlikely(le16_to_cpu(eh->eh_max) > max)) {
425		error_msg = "too large eh_max";
426		goto corrupted;
427	}
428	if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) {
429		error_msg = "invalid eh_entries";
430		goto corrupted;
431	}
432	if (!ext4_valid_extent_entries(inode, eh, depth)) {
433		error_msg = "invalid extent entries";
434		goto corrupted;
435	}
436	return 0;
437
438corrupted:
439	ext4_error_inode(inode, function, line, 0,
440			"bad header/extent: %s - magic %x, "
441			"entries %u, max %u(%u), depth %u(%u)",
442			error_msg, le16_to_cpu(eh->eh_magic),
443			le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
444			max, le16_to_cpu(eh->eh_depth), depth);
445
446	return -EIO;
447}
448
449#define ext4_ext_check(inode, eh, depth)	\
450	__ext4_ext_check(__func__, __LINE__, inode, eh, depth)
451
452int ext4_ext_check_inode(struct inode *inode)
453{
454	return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode));
455}
456
457#ifdef EXT_DEBUG
458static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
459{
460	int k, l = path->p_depth;
461
462	ext_debug("path:");
463	for (k = 0; k <= l; k++, path++) {
464		if (path->p_idx) {
			ext_debug("  %d->%llu",
				  le32_to_cpu(path->p_idx->ei_block),
				  idx_pblock(path->p_idx));
467		} else if (path->p_ext) {
468			ext_debug("  %d:[%d]%d:%llu ",
469				  le32_to_cpu(path->p_ext->ee_block),
470				  ext4_ext_is_uninitialized(path->p_ext),
471				  ext4_ext_get_actual_len(path->p_ext),
472				  ext_pblock(path->p_ext));
473		} else
474			ext_debug("  []");
475	}
476	ext_debug("\n");
477}
478
479static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
480{
481	int depth = ext_depth(inode);
482	struct ext4_extent_header *eh;
483	struct ext4_extent *ex;
484	int i;
485
486	if (!path)
487		return;
488
489	eh = path[depth].p_hdr;
490	ex = EXT_FIRST_EXTENT(eh);
491
492	ext_debug("Displaying leaf extents for inode %lu\n", inode->i_ino);
493
494	for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
495		ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
496			  ext4_ext_is_uninitialized(ex),
497			  ext4_ext_get_actual_len(ex), ext_pblock(ex));
498	}
499	ext_debug("\n");
500}
501#else
502#define ext4_ext_show_path(inode, path)
503#define ext4_ext_show_leaf(inode, path)
504#endif
505
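/*
 * ext4_ext_drop_refs:
 * release the buffer heads referenced by every level of @path;
 * the path array itself is left for the caller to free.
 */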
506void ext4_ext_drop_refs(struct ext4_ext_path *path)
507{
508	int depth = path->p_depth;
509	int i;
510
511	for (i = 0; i <= depth; i++, path++)
512		if (path->p_bh) {
513			brelse(path->p_bh);
514			path->p_bh = NULL;
515		}
516}
517
518/*
519 * ext4_ext_binsearch_idx:
520 * binary search for the closest index of the given block
521 * the header must be checked before calling this
522 */
523static void
524ext4_ext_binsearch_idx(struct inode *inode,
525			struct ext4_ext_path *path, ext4_lblk_t block)
526{
527	struct ext4_extent_header *eh = path->p_hdr;
528	struct ext4_extent_idx *r, *l, *m;
529
530
531	ext_debug("binsearch for %u(idx):  ", block);
532
533	l = EXT_FIRST_INDEX(eh) + 1;
534	r = EXT_LAST_INDEX(eh);
535	while (l <= r) {
536		m = l + (r - l) / 2;
537		if (block < le32_to_cpu(m->ei_block))
538			r = m - 1;
539		else
540			l = m + 1;
541		ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ei_block),
542				m, le32_to_cpu(m->ei_block),
543				r, le32_to_cpu(r->ei_block));
544	}
545
546	path->p_idx = l - 1;
547	ext_debug("  -> %d->%lld ", le32_to_cpu(path->p_idx->ei_block),
548		  idx_pblock(path->p_idx));
549
550#ifdef CHECK_BINSEARCH
551	{
552		struct ext4_extent_idx *chix, *ix;
553		int k;
554
555		chix = ix = EXT_FIRST_INDEX(eh);
556		for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) {
557		  if (k != 0 &&
558		      le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) {
559				printk(KERN_DEBUG "k=%d, ix=0x%p, "
560				       "first=0x%p\n", k,
561				       ix, EXT_FIRST_INDEX(eh));
562				printk(KERN_DEBUG "%u <= %u\n",
563				       le32_to_cpu(ix->ei_block),
564				       le32_to_cpu(ix[-1].ei_block));
565			}
566			BUG_ON(k && le32_to_cpu(ix->ei_block)
567					   <= le32_to_cpu(ix[-1].ei_block));
568			if (block < le32_to_cpu(ix->ei_block))
569				break;
570			chix = ix;
571		}
572		BUG_ON(chix != path->p_idx);
573	}
574#endif
575
576}
577
578/*
579 * ext4_ext_binsearch:
580 * binary search for closest extent of the given block
581 * the header must be checked before calling this
582 */
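/*
 * on return, path->p_ext points to the last extent whose ee_block is
 * <= @block, or to the first extent if @block precedes them all
 */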
583static void
584ext4_ext_binsearch(struct inode *inode,
585		struct ext4_ext_path *path, ext4_lblk_t block)
586{
587	struct ext4_extent_header *eh = path->p_hdr;
588	struct ext4_extent *r, *l, *m;
589
590	if (eh->eh_entries == 0) {
591		/*
592		 * this leaf is empty:
593		 * we get such a leaf in split/add case
594		 */
595		return;
596	}
597
598	ext_debug("binsearch for %u:  ", block);
599
600	l = EXT_FIRST_EXTENT(eh) + 1;
601	r = EXT_LAST_EXTENT(eh);
602
603	while (l <= r) {
604		m = l + (r - l) / 2;
605		if (block < le32_to_cpu(m->ee_block))
606			r = m - 1;
607		else
608			l = m + 1;
609		ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ee_block),
610				m, le32_to_cpu(m->ee_block),
611				r, le32_to_cpu(r->ee_block));
612	}
613
614	path->p_ext = l - 1;
615	ext_debug("  -> %d:%llu:[%d]%d ",
616			le32_to_cpu(path->p_ext->ee_block),
617			ext_pblock(path->p_ext),
618			ext4_ext_is_uninitialized(path->p_ext),
619			ext4_ext_get_actual_len(path->p_ext));
620
621#ifdef CHECK_BINSEARCH
622	{
623		struct ext4_extent *chex, *ex;
624		int k;
625
626		chex = ex = EXT_FIRST_EXTENT(eh);
627		for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) {
628			BUG_ON(k && le32_to_cpu(ex->ee_block)
629					  <= le32_to_cpu(ex[-1].ee_block));
630			if (block < le32_to_cpu(ex->ee_block))
631				break;
632			chex = ex;
633		}
634		BUG_ON(chex != path->p_ext);
635	}
636#endif
637
638}
639
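/*
 * ext4_ext_tree_init:
 * initialize an empty extent tree in the inode body: a root header
 * at depth 0 with no entries, then mark the inode dirty.
 */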
640int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
641{
642	struct ext4_extent_header *eh;
643
644	eh = ext_inode_hdr(inode);
645	eh->eh_depth = 0;
646	eh->eh_entries = 0;
647	eh->eh_magic = EXT4_EXT_MAGIC;
648	eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));
649	ext4_mark_inode_dirty(handle, inode);
650	ext4_ext_invalidate_cache(inode);
651	return 0;
652}
653
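/*
 * ext4_ext_find_extent:
 * walk the tree from the root down to the leaf covering @block and
 * return an array of ext4_ext_path entries, one per level (slot 0 is
 * the root in the inode body).  If @path is NULL a new array is
 * allocated; on failure an ERR_PTR() is returned.
 *
 * Typical use (as in ext4_ext_walk_space() below):
 *
 *	path = ext4_ext_find_extent(inode, block, NULL);
 *	if (IS_ERR(path))
 *		return PTR_ERR(path);
 *	...
 *	ext4_ext_drop_refs(path);
 *	kfree(path);
 */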
654struct ext4_ext_path *
655ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
656					struct ext4_ext_path *path)
657{
658	struct ext4_extent_header *eh;
659	struct buffer_head *bh;
660	short int depth, i, ppos = 0, alloc = 0;
661
662	eh = ext_inode_hdr(inode);
663	depth = ext_depth(inode);
664
665	/* account possible depth increase */
666	if (!path) {
667		path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 2),
668				GFP_NOFS);
669		if (!path)
670			return ERR_PTR(-ENOMEM);
671		alloc = 1;
672	}
673	path[0].p_hdr = eh;
674	path[0].p_bh = NULL;
675
676	i = depth;
677	/* walk through the tree */
678	while (i) {
679		int need_to_validate = 0;
680
681		ext_debug("depth %d: num %d, max %d\n",
682			  ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
683
684		ext4_ext_binsearch_idx(inode, path + ppos, block);
685		path[ppos].p_block = idx_pblock(path[ppos].p_idx);
686		path[ppos].p_depth = i;
687		path[ppos].p_ext = NULL;
688
689		bh = sb_getblk(inode->i_sb, path[ppos].p_block);
690		if (unlikely(!bh))
691			goto err;
692		if (!bh_uptodate_or_lock(bh)) {
693			if (bh_submit_read(bh) < 0) {
694				put_bh(bh);
695				goto err;
696			}
697			/* validate the extent entries */
698			need_to_validate = 1;
699		}
700		eh = ext_block_hdr(bh);
701		ppos++;
702		if (unlikely(ppos > depth)) {
703			put_bh(bh);
704			EXT4_ERROR_INODE(inode,
705					 "ppos %d > depth %d", ppos, depth);
706			goto err;
707		}
708		path[ppos].p_bh = bh;
709		path[ppos].p_hdr = eh;
710		i--;
711
712		if (need_to_validate && ext4_ext_check(inode, eh, i))
713			goto err;
714	}
715
716	path[ppos].p_depth = i;
717	path[ppos].p_ext = NULL;
718	path[ppos].p_idx = NULL;
719
720	/* find extent */
721	ext4_ext_binsearch(inode, path + ppos, block);
722	/* if not an empty leaf */
723	if (path[ppos].p_ext)
724		path[ppos].p_block = ext_pblock(path[ppos].p_ext);
725
726	ext4_ext_show_path(inode, path);
727
728	return path;
729
730err:
731	ext4_ext_drop_refs(path);
732	if (alloc)
733		kfree(path);
734	return ERR_PTR(-EIO);
735}
736
737/*
738 * ext4_ext_insert_index:
739 * insert new index [@logical;@ptr] into the block at @curp;
740 * check where to insert: before @curp or after @curp
741 */
742int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
743				struct ext4_ext_path *curp,
744				int logical, ext4_fsblk_t ptr)
745{
746	struct ext4_extent_idx *ix;
747	int len, err;
748
749	err = ext4_ext_get_access(handle, inode, curp);
750	if (err)
751		return err;
752
753	if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) {
754		EXT4_ERROR_INODE(inode,
755				 "logical %d == ei_block %d!",
756				 logical, le32_to_cpu(curp->p_idx->ei_block));
757		return -EIO;
758	}
759	len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx;
760	if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
761		/* insert after */
762		if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) {
763			len = (len - 1) * sizeof(struct ext4_extent_idx);
764			len = len < 0 ? 0 : len;
765			ext_debug("insert new index %d after: %llu. "
766					"move %d from 0x%p to 0x%p\n",
767					logical, ptr, len,
768					(curp->p_idx + 1), (curp->p_idx + 2));
769			memmove(curp->p_idx + 2, curp->p_idx + 1, len);
770		}
771		ix = curp->p_idx + 1;
772	} else {
773		/* insert before */
774		len = len * sizeof(struct ext4_extent_idx);
775		len = len < 0 ? 0 : len;
776		ext_debug("insert new index %d before: %llu. "
777				"move %d from 0x%p to 0x%p\n",
778				logical, ptr, len,
779				curp->p_idx, (curp->p_idx + 1));
780		memmove(curp->p_idx + 1, curp->p_idx, len);
781		ix = curp->p_idx;
782	}
783
784	ix->ei_block = cpu_to_le32(logical);
785	ext4_idx_store_pblock(ix, ptr);
786	le16_add_cpu(&curp->p_hdr->eh_entries, 1);
787
	if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries)
			     > le16_to_cpu(curp->p_hdr->eh_max))) {
		EXT4_ERROR_INODE(inode,
				 "eh_entries %d > eh_max %d!",
				 le16_to_cpu(curp->p_hdr->eh_entries),
				 le16_to_cpu(curp->p_hdr->eh_max));
		return -EIO;
	}
795	if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) {
796		EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!");
797		return -EIO;
798	}
799
800	err = ext4_ext_dirty(handle, inode, curp);
801	ext4_std_error(inode->i_sb, err);
802
803	return err;
804}
805
806/*
807 * ext4_ext_split:
808 * inserts new subtree into the path, using free index entry
809 * at depth @at:
810 * - allocates all needed blocks (new leaf and all intermediate index blocks)
811 * - makes decision where to split
812 * - moves remaining extents and index entries (right to the split point)
813 *   into the newly allocated blocks
814 * - initializes subtree
815 */
816static int ext4_ext_split(handle_t *handle, struct inode *inode,
817				struct ext4_ext_path *path,
818				struct ext4_extent *newext, int at)
819{
820	struct buffer_head *bh = NULL;
821	int depth = ext_depth(inode);
822	struct ext4_extent_header *neh;
823	struct ext4_extent_idx *fidx;
824	struct ext4_extent *ex;
825	int i = at, k, m, a;
826	ext4_fsblk_t newblock, oldblock;
827	__le32 border;
828	ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */
829	int err = 0;
830
831	/* make decision: where to split? */
832
833	/* if current leaf will be split, then we should use
834	 * border from split point */
835	if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) {
836		EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!");
837		return -EIO;
838	}
839	if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
840		border = path[depth].p_ext[1].ee_block;
841		ext_debug("leaf will be split."
842				" next leaf starts at %d\n",
843				  le32_to_cpu(border));
844	} else {
845		border = newext->ee_block;
846		ext_debug("leaf will be added."
847				" next leaf starts at %d\n",
848				le32_to_cpu(border));
849	}
850
	/*
	 * If an error occurs, we stop processing and mark the
	 * filesystem read-only.  The index won't be inserted and the
	 * tree will stay in a consistent state.  The next mount will
	 * repair the buffers too.
	 */
857
	/*
	 * Get an array to track all allocated blocks.
	 * We need it to handle errors and free the blocks
	 * on failure.
	 */
863	ablocks = kzalloc(sizeof(ext4_fsblk_t) * depth, GFP_NOFS);
864	if (!ablocks)
865		return -ENOMEM;
866
867	/* allocate all needed blocks */
868	ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
869	for (a = 0; a < depth - at; a++) {
870		newblock = ext4_ext_new_meta_block(handle, inode, path,
871						   newext, &err);
872		if (newblock == 0)
873			goto cleanup;
874		ablocks[a] = newblock;
875	}
876
877	/* initialize new leaf */
878	newblock = ablocks[--a];
879	if (unlikely(newblock == 0)) {
880		EXT4_ERROR_INODE(inode, "newblock == 0!");
881		err = -EIO;
882		goto cleanup;
883	}
884	bh = sb_getblk(inode->i_sb, newblock);
885	if (!bh) {
886		err = -EIO;
887		goto cleanup;
888	}
889	lock_buffer(bh);
890
891	err = ext4_journal_get_create_access(handle, bh);
892	if (err)
893		goto cleanup;
894
895	neh = ext_block_hdr(bh);
896	neh->eh_entries = 0;
897	neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
898	neh->eh_magic = EXT4_EXT_MAGIC;
899	neh->eh_depth = 0;
900	ex = EXT_FIRST_EXTENT(neh);
901
902	/* move remainder of path[depth] to the new leaf */
903	if (unlikely(path[depth].p_hdr->eh_entries !=
904		     path[depth].p_hdr->eh_max)) {
905		EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!",
906				 path[depth].p_hdr->eh_entries,
907				 path[depth].p_hdr->eh_max);
908		err = -EIO;
909		goto cleanup;
910	}
911	/* start copy from next extent */
912	/* TODO: we could do it by single memmove */
913	m = 0;
914	path[depth].p_ext++;
915	while (path[depth].p_ext <=
916			EXT_MAX_EXTENT(path[depth].p_hdr)) {
917		ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
918				le32_to_cpu(path[depth].p_ext->ee_block),
919				ext_pblock(path[depth].p_ext),
920				ext4_ext_is_uninitialized(path[depth].p_ext),
921				ext4_ext_get_actual_len(path[depth].p_ext),
922				newblock);
923		/*memmove(ex++, path[depth].p_ext++,
924				sizeof(struct ext4_extent));
925		neh->eh_entries++;*/
926		path[depth].p_ext++;
927		m++;
928	}
929	if (m) {
930		memmove(ex, path[depth].p_ext-m, sizeof(struct ext4_extent)*m);
931		le16_add_cpu(&neh->eh_entries, m);
932	}
933
934	set_buffer_uptodate(bh);
935	unlock_buffer(bh);
936
937	err = ext4_handle_dirty_metadata(handle, inode, bh);
938	if (err)
939		goto cleanup;
940	brelse(bh);
941	bh = NULL;
942
943	/* correct old leaf */
944	if (m) {
945		err = ext4_ext_get_access(handle, inode, path + depth);
946		if (err)
947			goto cleanup;
948		le16_add_cpu(&path[depth].p_hdr->eh_entries, -m);
949		err = ext4_ext_dirty(handle, inode, path + depth);
950		if (err)
951			goto cleanup;
952
953	}
954
955	/* create intermediate indexes */
956	k = depth - at - 1;
957	if (unlikely(k < 0)) {
958		EXT4_ERROR_INODE(inode, "k %d < 0!", k);
959		err = -EIO;
960		goto cleanup;
961	}
962	if (k)
963		ext_debug("create %d intermediate indices\n", k);
964	/* insert new index into current index block */
965	/* current depth stored in i var */
966	i = depth - 1;
967	while (k--) {
968		oldblock = newblock;
969		newblock = ablocks[--a];
970		bh = sb_getblk(inode->i_sb, newblock);
971		if (!bh) {
972			err = -EIO;
973			goto cleanup;
974		}
975		lock_buffer(bh);
976
977		err = ext4_journal_get_create_access(handle, bh);
978		if (err)
979			goto cleanup;
980
981		neh = ext_block_hdr(bh);
982		neh->eh_entries = cpu_to_le16(1);
983		neh->eh_magic = EXT4_EXT_MAGIC;
984		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
985		neh->eh_depth = cpu_to_le16(depth - i);
986		fidx = EXT_FIRST_INDEX(neh);
987		fidx->ei_block = border;
988		ext4_idx_store_pblock(fidx, oldblock);
989
990		ext_debug("int.index at %d (block %llu): %u -> %llu\n",
991				i, newblock, le32_to_cpu(border), oldblock);
992		/* copy indexes */
993		m = 0;
994		path[i].p_idx++;
995
996		ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
997				EXT_MAX_INDEX(path[i].p_hdr));
998		if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) !=
999					EXT_LAST_INDEX(path[i].p_hdr))) {
1000			EXT4_ERROR_INODE(inode,
1001					 "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!",
1002					 le32_to_cpu(path[i].p_ext->ee_block));
1003			err = -EIO;
1004			goto cleanup;
1005		}
1006		while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) {
1007			ext_debug("%d: move %d:%llu in new index %llu\n", i,
1008					le32_to_cpu(path[i].p_idx->ei_block),
1009					idx_pblock(path[i].p_idx),
1010					newblock);
1011			/*memmove(++fidx, path[i].p_idx++,
1012					sizeof(struct ext4_extent_idx));
1013			neh->eh_entries++;
1014			BUG_ON(neh->eh_entries > neh->eh_max);*/
1015			path[i].p_idx++;
1016			m++;
1017		}
1018		if (m) {
1019			memmove(++fidx, path[i].p_idx - m,
1020				sizeof(struct ext4_extent_idx) * m);
1021			le16_add_cpu(&neh->eh_entries, m);
1022		}
1023		set_buffer_uptodate(bh);
1024		unlock_buffer(bh);
1025
1026		err = ext4_handle_dirty_metadata(handle, inode, bh);
1027		if (err)
1028			goto cleanup;
1029		brelse(bh);
1030		bh = NULL;
1031
1032		/* correct old index */
1033		if (m) {
1034			err = ext4_ext_get_access(handle, inode, path + i);
1035			if (err)
1036				goto cleanup;
1037			le16_add_cpu(&path[i].p_hdr->eh_entries, -m);
1038			err = ext4_ext_dirty(handle, inode, path + i);
1039			if (err)
1040				goto cleanup;
1041		}
1042
1043		i--;
1044	}
1045
1046	/* insert new index */
1047	err = ext4_ext_insert_index(handle, inode, path + at,
1048				    le32_to_cpu(border), newblock);
1049
1050cleanup:
1051	if (bh) {
1052		if (buffer_locked(bh))
1053			unlock_buffer(bh);
1054		brelse(bh);
1055	}
1056
1057	if (err) {
1058		/* free all allocated blocks in error case */
1059		for (i = 0; i < depth; i++) {
1060			if (!ablocks[i])
1061				continue;
1062			ext4_free_blocks(handle, inode, 0, ablocks[i], 1,
1063					 EXT4_FREE_BLOCKS_METADATA);
1064		}
1065	}
1066	kfree(ablocks);
1067
1068	return err;
1069}
1070
1071/*
1072 * ext4_ext_grow_indepth:
1073 * implements tree growing procedure:
1074 * - allocates new block
1075 * - moves top-level data (index block or leaf) into the new block
1076 * - initializes new top-level, creating index that points to the
1077 *   just created block
1078 */
1079static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1080					struct ext4_ext_path *path,
1081					struct ext4_extent *newext)
1082{
1083	struct ext4_ext_path *curp = path;
1084	struct ext4_extent_header *neh;
1085	struct buffer_head *bh;
1086	ext4_fsblk_t newblock;
1087	int err = 0;
1088
1089	newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err);
1090	if (newblock == 0)
1091		return err;
1092
1093	bh = sb_getblk(inode->i_sb, newblock);
1094	if (!bh) {
1095		err = -EIO;
1096		ext4_std_error(inode->i_sb, err);
1097		return err;
1098	}
1099	lock_buffer(bh);
1100
1101	err = ext4_journal_get_create_access(handle, bh);
1102	if (err) {
1103		unlock_buffer(bh);
1104		goto out;
1105	}
1106
1107	/* move top-level index/leaf into new block */
1108	memmove(bh->b_data, curp->p_hdr, sizeof(EXT4_I(inode)->i_data));
1109
1110	/* set size of new block */
1111	neh = ext_block_hdr(bh);
	/* old root could have indexes or leaves
	 * so calculate eh_max the right way */
1114	if (ext_depth(inode))
1115		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
1116	else
1117		neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
1118	neh->eh_magic = EXT4_EXT_MAGIC;
1119	set_buffer_uptodate(bh);
1120	unlock_buffer(bh);
1121
1122	err = ext4_handle_dirty_metadata(handle, inode, bh);
1123	if (err)
1124		goto out;
1125
1126	/* create index in new top-level index: num,max,pointer */
1127	err = ext4_ext_get_access(handle, inode, curp);
1128	if (err)
1129		goto out;
1130
1131	curp->p_hdr->eh_magic = EXT4_EXT_MAGIC;
1132	curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
1133	curp->p_hdr->eh_entries = cpu_to_le16(1);
1134	curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr);
1135
1136	if (path[0].p_hdr->eh_depth)
1137		curp->p_idx->ei_block =
1138			EXT_FIRST_INDEX(path[0].p_hdr)->ei_block;
1139	else
1140		curp->p_idx->ei_block =
1141			EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block;
1142	ext4_idx_store_pblock(curp->p_idx, newblock);
1143
1144	neh = ext_inode_hdr(inode);
1145	ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
1146		  le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
1147		  le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
1148		  idx_pblock(EXT_FIRST_INDEX(neh)));
1149
1150	neh->eh_depth = cpu_to_le16(path->p_depth + 1);
1151	err = ext4_ext_dirty(handle, inode, curp);
1152out:
1153	brelse(bh);
1154
1155	return err;
1156}
1157
/*
 * ext4_ext_create_new_leaf:
 * finds an empty index and adds a new leaf.
 * if no free index is found, then it requests growing the tree in depth.
 */
1163static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
1164					struct ext4_ext_path *path,
1165					struct ext4_extent *newext)
1166{
1167	struct ext4_ext_path *curp;
1168	int depth, i, err = 0;
1169
1170repeat:
1171	i = depth = ext_depth(inode);
1172
	/* walk up the tree and look for a free index entry */
1174	curp = path + depth;
1175	while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
1176		i--;
1177		curp--;
1178	}
1179
	/* we use an already allocated block for the index block,
	 * so subsequent data blocks should be contiguous */
	if (EXT_HAS_FREE_INDEX(curp)) {
		/* if we found an index with a free entry, then use that
		 * entry: create all the needed subtree and add a new leaf */
1185		err = ext4_ext_split(handle, inode, path, newext, i);
1186		if (err)
1187			goto out;
1188
1189		/* refill path */
1190		ext4_ext_drop_refs(path);
1191		path = ext4_ext_find_extent(inode,
1192				    (ext4_lblk_t)le32_to_cpu(newext->ee_block),
1193				    path);
1194		if (IS_ERR(path))
1195			err = PTR_ERR(path);
1196	} else {
1197		/* tree is full, time to grow in depth */
1198		err = ext4_ext_grow_indepth(handle, inode, path, newext);
1199		if (err)
1200			goto out;
1201
1202		/* refill path */
1203		ext4_ext_drop_refs(path);
1204		path = ext4_ext_find_extent(inode,
1205				   (ext4_lblk_t)le32_to_cpu(newext->ee_block),
1206				    path);
1207		if (IS_ERR(path)) {
1208			err = PTR_ERR(path);
1209			goto out;
1210		}
1211
1212		/*
1213		 * only first (depth 0 -> 1) produces free space;
1214		 * in all other cases we have to split the grown tree
1215		 */
1216		depth = ext_depth(inode);
1217		if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
1218			/* now we need to split */
1219			goto repeat;
1220		}
1221	}
1222
1223out:
1224	return err;
1225}
1226
/*
 * search for the closest allocated block to the left of *logical
 * and return it at @logical + its physical address at @phys.
 * if *logical is the smallest allocated block, the function
 * returns 0 at @phys.
 * the return value contains 0 (success) or an error code
 */
1234int
1235ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
1236			ext4_lblk_t *logical, ext4_fsblk_t *phys)
1237{
1238	struct ext4_extent_idx *ix;
1239	struct ext4_extent *ex;
1240	int depth, ee_len;
1241
1242	if (unlikely(path == NULL)) {
1243		EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
1244		return -EIO;
1245	}
1246	depth = path->p_depth;
1247	*phys = 0;
1248
1249	if (depth == 0 && path->p_ext == NULL)
1250		return 0;
1251
	/* usually the extent in the path covers blocks smaller
	 * than *logical, but it can be that the extent is the
	 * first one in the file */
1255
1256	ex = path[depth].p_ext;
1257	ee_len = ext4_ext_get_actual_len(ex);
1258	if (*logical < le32_to_cpu(ex->ee_block)) {
1259		if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
1260			EXT4_ERROR_INODE(inode,
1261					 "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!",
1262					 *logical, le32_to_cpu(ex->ee_block));
1263			return -EIO;
1264		}
1265		while (--depth >= 0) {
1266			ix = path[depth].p_idx;
1267			if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
1268				EXT4_ERROR_INODE(inode,
1269				  "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!",
1270				  ix != NULL ? ix->ei_block : 0,
1271				  EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ?
1272				    EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block : 0,
1273				  depth);
1274				return -EIO;
1275			}
1276		}
1277		return 0;
1278	}
1279
1280	if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
1281		EXT4_ERROR_INODE(inode,
1282				 "logical %d < ee_block %d + ee_len %d!",
1283				 *logical, le32_to_cpu(ex->ee_block), ee_len);
1284		return -EIO;
1285	}
1286
1287	*logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
1288	*phys = ext_pblock(ex) + ee_len - 1;
1289	return 0;
1290}
1291
/*
 * search for the closest allocated block to the right of *logical
 * and return it at @logical + its physical address at @phys.
 * if *logical is the largest allocated block, the function
 * returns 0 at @phys.
 * the return value contains 0 (success) or an error code
 */
1299int
1300ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
1301			ext4_lblk_t *logical, ext4_fsblk_t *phys)
1302{
1303	struct buffer_head *bh = NULL;
1304	struct ext4_extent_header *eh;
1305	struct ext4_extent_idx *ix;
1306	struct ext4_extent *ex;
1307	ext4_fsblk_t block;
1308	int depth;	/* Note, NOT eh_depth; depth from top of tree */
1309	int ee_len;
1310
1311	if (unlikely(path == NULL)) {
1312		EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
1313		return -EIO;
1314	}
1315	depth = path->p_depth;
1316	*phys = 0;
1317
1318	if (depth == 0 && path->p_ext == NULL)
1319		return 0;
1320
	/* usually the extent in the path covers blocks smaller
	 * than *logical, but it can be that the extent is the
	 * first one in the file */
1324
1325	ex = path[depth].p_ext;
1326	ee_len = ext4_ext_get_actual_len(ex);
1327	if (*logical < le32_to_cpu(ex->ee_block)) {
1328		if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
1329			EXT4_ERROR_INODE(inode,
1330					 "first_extent(path[%d].p_hdr) != ex",
1331					 depth);
1332			return -EIO;
1333		}
1334		while (--depth >= 0) {
1335			ix = path[depth].p_idx;
1336			if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
1337				EXT4_ERROR_INODE(inode,
1338						 "ix != EXT_FIRST_INDEX *logical %d!",
1339						 *logical);
1340				return -EIO;
1341			}
1342		}
1343		*logical = le32_to_cpu(ex->ee_block);
1344		*phys = ext_pblock(ex);
1345		return 0;
1346	}
1347
1348	if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
1349		EXT4_ERROR_INODE(inode,
1350				 "logical %d < ee_block %d + ee_len %d!",
1351				 *logical, le32_to_cpu(ex->ee_block), ee_len);
1352		return -EIO;
1353	}
1354
1355	if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
1356		/* next allocated block in this leaf */
1357		ex++;
1358		*logical = le32_to_cpu(ex->ee_block);
1359		*phys = ext_pblock(ex);
1360		return 0;
1361	}
1362
1363	/* go up and search for index to the right */
1364	while (--depth >= 0) {
1365		ix = path[depth].p_idx;
1366		if (ix != EXT_LAST_INDEX(path[depth].p_hdr))
1367			goto got_index;
1368	}
1369
1370	/* we've gone up to the root and found no index to the right */
1371	return 0;
1372
1373got_index:
1374	/* we've found index to the right, let's
1375	 * follow it and find the closest allocated
1376	 * block to the right */
1377	ix++;
1378	block = idx_pblock(ix);
1379	while (++depth < path->p_depth) {
1380		bh = sb_bread(inode->i_sb, block);
1381		if (bh == NULL)
1382			return -EIO;
1383		eh = ext_block_hdr(bh);
1384		/* subtract from p_depth to get proper eh_depth */
1385		if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
1386			put_bh(bh);
1387			return -EIO;
1388		}
1389		ix = EXT_FIRST_INDEX(eh);
1390		block = idx_pblock(ix);
1391		put_bh(bh);
1392	}
1393
1394	bh = sb_bread(inode->i_sb, block);
1395	if (bh == NULL)
1396		return -EIO;
1397	eh = ext_block_hdr(bh);
1398	if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
1399		put_bh(bh);
1400		return -EIO;
1401	}
1402	ex = EXT_FIRST_EXTENT(eh);
1403	*logical = le32_to_cpu(ex->ee_block);
1404	*phys = ext_pblock(ex);
1405	put_bh(bh);
1406	return 0;
1407}
1408
/*
 * ext4_ext_next_allocated_block:
 * returns the allocated block in the subsequent extent or EXT_MAX_BLOCK.
 * NOTE: it considers the block number from an index entry as an
 * allocated block. Thus, index entries have to be consistent
 * with the leaves.
 */
1416static ext4_lblk_t
1417ext4_ext_next_allocated_block(struct ext4_ext_path *path)
1418{
1419	int depth;
1420
1421	BUG_ON(path == NULL);
1422	depth = path->p_depth;
1423
1424	if (depth == 0 && path->p_ext == NULL)
1425		return EXT_MAX_BLOCK;
1426
1427	while (depth >= 0) {
1428		if (depth == path->p_depth) {
1429			/* leaf */
1430			if (path[depth].p_ext !=
1431					EXT_LAST_EXTENT(path[depth].p_hdr))
1432			  return le32_to_cpu(path[depth].p_ext[1].ee_block);
1433		} else {
1434			/* index */
1435			if (path[depth].p_idx !=
1436					EXT_LAST_INDEX(path[depth].p_hdr))
1437			  return le32_to_cpu(path[depth].p_idx[1].ei_block);
1438		}
1439		depth--;
1440	}
1441
1442	return EXT_MAX_BLOCK;
1443}
1444
1445/*
1446 * ext4_ext_next_leaf_block:
1447 * returns first allocated block from next leaf or EXT_MAX_BLOCK
1448 */
1449static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode,
1450					struct ext4_ext_path *path)
1451{
1452	int depth;
1453
1454	BUG_ON(path == NULL);
1455	depth = path->p_depth;
1456
	/* a zero-depth tree has no leaf blocks at all */
1458	if (depth == 0)
1459		return EXT_MAX_BLOCK;
1460
1461	/* go to index block */
1462	depth--;
1463
1464	while (depth >= 0) {
1465		if (path[depth].p_idx !=
1466				EXT_LAST_INDEX(path[depth].p_hdr))
1467			return (ext4_lblk_t)
1468				le32_to_cpu(path[depth].p_idx[1].ei_block);
1469		depth--;
1470	}
1471
1472	return EXT_MAX_BLOCK;
1473}
1474
1475/*
1476 * ext4_ext_correct_indexes:
1477 * if leaf gets modified and modified extent is first in the leaf,
1478 * then we have to correct all indexes above.
1479 * TODO: do we need to correct tree in all cases?
1480 */
1481static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
1482				struct ext4_ext_path *path)
1483{
1484	struct ext4_extent_header *eh;
1485	int depth = ext_depth(inode);
1486	struct ext4_extent *ex;
1487	__le32 border;
1488	int k, err = 0;
1489
1490	eh = path[depth].p_hdr;
1491	ex = path[depth].p_ext;
1492
1493	if (unlikely(ex == NULL || eh == NULL)) {
1494		EXT4_ERROR_INODE(inode,
1495				 "ex %p == NULL or eh %p == NULL", ex, eh);
1496		return -EIO;
1497	}
1498
1499	if (depth == 0) {
1500		/* there is no tree at all */
1501		return 0;
1502	}
1503
1504	if (ex != EXT_FIRST_EXTENT(eh)) {
1505		/* we correct tree if first leaf got modified only */
1506		return 0;
1507	}
1508
1509	/*
1510	 * TODO: we need correction if border is smaller than current one
1511	 */
1512	k = depth - 1;
1513	border = path[depth].p_ext->ee_block;
1514	err = ext4_ext_get_access(handle, inode, path + k);
1515	if (err)
1516		return err;
1517	path[k].p_idx->ei_block = border;
1518	err = ext4_ext_dirty(handle, inode, path + k);
1519	if (err)
1520		return err;
1521
1522	while (k--) {
1523		/* change all left-side indexes */
1524		if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr))
1525			break;
1526		err = ext4_ext_get_access(handle, inode, path + k);
1527		if (err)
1528			break;
1529		path[k].p_idx->ei_block = border;
1530		err = ext4_ext_dirty(handle, inode, path + k);
1531		if (err)
1532			break;
1533	}
1534
1535	return err;
1536}
1537
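/*
 * ext4_can_extents_be_merged:
 * return 1 if @ex1 and @ex2 can be merged into a single extent: both
 * initialized or both uninitialized, logically and physically
 * contiguous, and the combined length still within the per-extent
 * maximum; return 0 otherwise.
 */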
1538int
1539ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
1540				struct ext4_extent *ex2)
1541{
1542	unsigned short ext1_ee_len, ext2_ee_len, max_len;
1543
1544	/*
1545	 * Make sure that either both extents are uninitialized, or
1546	 * both are _not_.
1547	 */
1548	if (ext4_ext_is_uninitialized(ex1) ^ ext4_ext_is_uninitialized(ex2))
1549		return 0;
1550
1551	if (ext4_ext_is_uninitialized(ex1))
1552		max_len = EXT_UNINIT_MAX_LEN;
1553	else
1554		max_len = EXT_INIT_MAX_LEN;
1555
1556	ext1_ee_len = ext4_ext_get_actual_len(ex1);
1557	ext2_ee_len = ext4_ext_get_actual_len(ex2);
1558
1559	if (le32_to_cpu(ex1->ee_block) + ext1_ee_len !=
1560			le32_to_cpu(ex2->ee_block))
1561		return 0;
1562
1563	/*
1564	 * To allow future support for preallocated extents to be added
	 * as an RO_COMPAT feature, refuse to merge two extents if
1566	 * this can result in the top bit of ee_len being set.
1567	 */
1568	if (ext1_ee_len + ext2_ee_len > max_len)
1569		return 0;
1570#ifdef AGGRESSIVE_TEST
1571	if (ext1_ee_len >= 4)
1572		return 0;
1573#endif
1574
1575	if (ext_pblock(ex1) + ext1_ee_len == ext_pblock(ex2))
1576		return 1;
1577	return 0;
1578}
1579
1580/*
1581 * This function tries to merge the "ex" extent to the next extent in the tree.
1582 * It always tries to merge towards right. If you want to merge towards
1583 * left, pass "ex - 1" as argument instead of "ex".
1584 * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
1585 * 1 if they got merged.
1586 */
1587int ext4_ext_try_to_merge(struct inode *inode,
1588			  struct ext4_ext_path *path,
1589			  struct ext4_extent *ex)
1590{
1591	struct ext4_extent_header *eh;
1592	unsigned int depth, len;
1593	int merge_done = 0;
1594	int uninitialized = 0;
1595
1596	depth = ext_depth(inode);
1597	BUG_ON(path[depth].p_hdr == NULL);
1598	eh = path[depth].p_hdr;
1599
1600	while (ex < EXT_LAST_EXTENT(eh)) {
1601		if (!ext4_can_extents_be_merged(inode, ex, ex + 1))
1602			break;
1603		/* merge with next extent! */
1604		if (ext4_ext_is_uninitialized(ex))
1605			uninitialized = 1;
1606		ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1607				+ ext4_ext_get_actual_len(ex + 1));
1608		if (uninitialized)
1609			ext4_ext_mark_uninitialized(ex);
1610
1611		if (ex + 1 < EXT_LAST_EXTENT(eh)) {
1612			len = (EXT_LAST_EXTENT(eh) - ex - 1)
1613				* sizeof(struct ext4_extent);
1614			memmove(ex + 1, ex + 2, len);
1615		}
1616		le16_add_cpu(&eh->eh_entries, -1);
1617		merge_done = 1;
1618		WARN_ON(eh->eh_entries == 0);
1619		if (!eh->eh_entries)
1620			EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!");
1621	}
1622
1623	return merge_done;
1624}
1625
1626/*
1627 * check if a portion of the "newext" extent overlaps with an
1628 * existing extent.
1629 *
1630 * If there is an overlap discovered, it updates the length of the newext
1631 * such that there will be no overlap, and then returns 1.
1632 * If there is no overlap found, it returns 0.
1633 */
1634unsigned int ext4_ext_check_overlap(struct inode *inode,
1635				    struct ext4_extent *newext,
1636				    struct ext4_ext_path *path)
1637{
1638	ext4_lblk_t b1, b2;
1639	unsigned int depth, len1;
1640	unsigned int ret = 0;
1641
1642	b1 = le32_to_cpu(newext->ee_block);
1643	len1 = ext4_ext_get_actual_len(newext);
1644	depth = ext_depth(inode);
1645	if (!path[depth].p_ext)
1646		goto out;
1647	b2 = le32_to_cpu(path[depth].p_ext->ee_block);
1648
1649	/*
1650	 * get the next allocated block if the extent in the path
1651	 * is before the requested block(s)
1652	 */
1653	if (b2 < b1) {
1654		b2 = ext4_ext_next_allocated_block(path);
1655		if (b2 == EXT_MAX_BLOCK)
1656			goto out;
1657	}
1658
1659	/* check for wrap through zero on extent logical start block*/
1660	if (b1 + len1 < b1) {
1661		len1 = EXT_MAX_BLOCK - b1;
1662		newext->ee_len = cpu_to_le16(len1);
1663		ret = 1;
1664	}
1665
1666	/* check for overlap */
1667	if (b1 + len1 > b2) {
1668		newext->ee_len = cpu_to_le16(b2 - b1);
1669		ret = 1;
1670	}
1671out:
1672	return ret;
1673}
1674
1675/*
1676 * ext4_ext_insert_extent:
 * tries to merge the requested extent into an existing extent or
 * inserts the requested extent as a new one into the tree,
 * creating a new leaf in the no-space case.
1680 */
1681int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1682				struct ext4_ext_path *path,
1683				struct ext4_extent *newext, int flag)
1684{
1685	struct ext4_extent_header *eh;
1686	struct ext4_extent *ex, *fex;
1687	struct ext4_extent *nearex; /* nearest extent */
1688	struct ext4_ext_path *npath = NULL;
1689	int depth, len, err;
1690	ext4_lblk_t next;
1691	unsigned uninitialized = 0;
1692
1693	if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
1694		EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
1695		return -EIO;
1696	}
1697	depth = ext_depth(inode);
1698	ex = path[depth].p_ext;
1699	if (unlikely(path[depth].p_hdr == NULL)) {
1700		EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
1701		return -EIO;
1702	}
1703
1704	/* try to insert block into found extent and return */
1705	if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)
1706		&& ext4_can_extents_be_merged(inode, ex, newext)) {
1707		ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
1708				ext4_ext_is_uninitialized(newext),
1709				ext4_ext_get_actual_len(newext),
1710				le32_to_cpu(ex->ee_block),
1711				ext4_ext_is_uninitialized(ex),
1712				ext4_ext_get_actual_len(ex), ext_pblock(ex));
1713		err = ext4_ext_get_access(handle, inode, path + depth);
1714		if (err)
1715			return err;
1716
1717		/*
1718		 * ext4_can_extents_be_merged should have checked that either
1719		 * both extents are uninitialized, or both aren't. Thus we
1720		 * need to check only one of them here.
1721		 */
1722		if (ext4_ext_is_uninitialized(ex))
1723			uninitialized = 1;
1724		ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1725					+ ext4_ext_get_actual_len(newext));
1726		if (uninitialized)
1727			ext4_ext_mark_uninitialized(ex);
1728		eh = path[depth].p_hdr;
1729		nearex = ex;
1730		goto merge;
1731	}
1732
1733repeat:
1734	depth = ext_depth(inode);
1735	eh = path[depth].p_hdr;
1736	if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max))
1737		goto has_space;
1738
1739	/* probably next leaf has space for us? */
1740	fex = EXT_LAST_EXTENT(eh);
1741	next = ext4_ext_next_leaf_block(inode, path);
1742	if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)
1743	    && next != EXT_MAX_BLOCK) {
1744		ext_debug("next leaf block - %d\n", next);
1745		BUG_ON(npath != NULL);
1746		npath = ext4_ext_find_extent(inode, next, NULL);
1747		if (IS_ERR(npath))
1748			return PTR_ERR(npath);
1749		BUG_ON(npath->p_depth != path->p_depth);
1750		eh = npath[depth].p_hdr;
1751		if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
			ext_debug("next leaf isn't full(%d)\n",
1753				  le16_to_cpu(eh->eh_entries));
1754			path = npath;
1755			goto repeat;
1756		}
1757		ext_debug("next leaf has no free space(%d,%d)\n",
1758			  le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
1759	}
1760
1761	/*
1762	 * There is no free space in the found leaf.
1763	 * We're gonna add a new leaf in the tree.
1764	 */
1765	err = ext4_ext_create_new_leaf(handle, inode, path, newext);
1766	if (err)
1767		goto cleanup;
1768	depth = ext_depth(inode);
1769	eh = path[depth].p_hdr;
1770
1771has_space:
1772	nearex = path[depth].p_ext;
1773
1774	err = ext4_ext_get_access(handle, inode, path + depth);
1775	if (err)
1776		goto cleanup;
1777
1778	if (!nearex) {
1779		/* there is no extent in this leaf, create first one */
1780		ext_debug("first extent in the leaf: %d:%llu:[%d]%d\n",
1781				le32_to_cpu(newext->ee_block),
1782				ext_pblock(newext),
1783				ext4_ext_is_uninitialized(newext),
1784				ext4_ext_get_actual_len(newext));
1785		path[depth].p_ext = EXT_FIRST_EXTENT(eh);
1786	} else if (le32_to_cpu(newext->ee_block)
1787			   > le32_to_cpu(nearex->ee_block)) {
1788/*		BUG_ON(newext->ee_block == nearex->ee_block); */
1789		if (nearex != EXT_LAST_EXTENT(eh)) {
1790			len = EXT_MAX_EXTENT(eh) - nearex;
1791			len = (len - 1) * sizeof(struct ext4_extent);
1792			len = len < 0 ? 0 : len;
1793			ext_debug("insert %d:%llu:[%d]%d after: nearest 0x%p, "
1794					"move %d from 0x%p to 0x%p\n",
1795					le32_to_cpu(newext->ee_block),
1796					ext_pblock(newext),
1797					ext4_ext_is_uninitialized(newext),
1798					ext4_ext_get_actual_len(newext),
1799					nearex, len, nearex + 1, nearex + 2);
1800			memmove(nearex + 2, nearex + 1, len);
1801		}
1802		path[depth].p_ext = nearex + 1;
1803	} else {
1804		BUG_ON(newext->ee_block == nearex->ee_block);
1805		len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent);
1806		len = len < 0 ? 0 : len;
1807		ext_debug("insert %d:%llu:[%d]%d before: nearest 0x%p, "
1808				"move %d from 0x%p to 0x%p\n",
1809				le32_to_cpu(newext->ee_block),
1810				ext_pblock(newext),
1811				ext4_ext_is_uninitialized(newext),
1812				ext4_ext_get_actual_len(newext),
1813				nearex, len, nearex + 1, nearex + 2);
1814		memmove(nearex + 1, nearex, len);
1815		path[depth].p_ext = nearex;
1816	}
1817
1818	le16_add_cpu(&eh->eh_entries, 1);
1819	nearex = path[depth].p_ext;
1820	nearex->ee_block = newext->ee_block;
1821	ext4_ext_store_pblock(nearex, ext_pblock(newext));
1822	nearex->ee_len = newext->ee_len;
1823
1824merge:
1825	/* try to merge extents to the right */
1826	if (!(flag & EXT4_GET_BLOCKS_PRE_IO))
1827		ext4_ext_try_to_merge(inode, path, nearex);
1828
1829	/* try to merge extents to the left */
1830
1831	/* time to correct all indexes above */
1832	err = ext4_ext_correct_indexes(handle, inode, path);
1833	if (err)
1834		goto cleanup;
1835
1836	err = ext4_ext_dirty(handle, inode, path + depth);
1837
1838cleanup:
1839	if (npath) {
1840		ext4_ext_drop_refs(npath);
1841		kfree(npath);
1842	}
1843	ext4_ext_invalidate_cache(inode);
1844	return err;
1845}
1846
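/*
 * ext4_ext_walk_space:
 * walk the extent tree from @block over @num logical blocks, calling
 * @func once for every allocated extent and once for every gap
 * (passed as an ext4_ext_cache of type EXTENT or GAP).  The callback
 * may return EXT_BREAK to stop the walk or EXT_REPEAT to have the
 * same range processed again; a negative return aborts the walk and
 * is propagated to the caller.
 */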
1847int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
1848			ext4_lblk_t num, ext_prepare_callback func,
1849			void *cbdata)
1850{
1851	struct ext4_ext_path *path = NULL;
1852	struct ext4_ext_cache cbex;
1853	struct ext4_extent *ex;
1854	ext4_lblk_t next, start = 0, end = 0;
1855	ext4_lblk_t last = block + num;
1856	int depth, exists, err = 0;
1857
1858	BUG_ON(func == NULL);
1859	BUG_ON(inode == NULL);
1860
1861	while (block < last && block != EXT_MAX_BLOCK) {
1862		num = last - block;
1863		/* find extent for this block */
1864		down_read(&EXT4_I(inode)->i_data_sem);
1865		path = ext4_ext_find_extent(inode, block, path);
1866		up_read(&EXT4_I(inode)->i_data_sem);
1867		if (IS_ERR(path)) {
1868			err = PTR_ERR(path);
1869			path = NULL;
1870			break;
1871		}
1872
1873		depth = ext_depth(inode);
1874		if (unlikely(path[depth].p_hdr == NULL)) {
1875			EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
1876			err = -EIO;
1877			break;
1878		}
1879		ex = path[depth].p_ext;
1880		next = ext4_ext_next_allocated_block(path);
1881
1882		exists = 0;
1883		if (!ex) {
1884			/* there is no extent yet, so try to allocate
1885			 * all requested space */
1886			start = block;
1887			end = block + num;
1888		} else if (le32_to_cpu(ex->ee_block) > block) {
1889			/* need to allocate space before found extent */
1890			start = block;
1891			end = le32_to_cpu(ex->ee_block);
1892			if (block + num < end)
1893				end = block + num;
1894		} else if (block >= le32_to_cpu(ex->ee_block)
1895					+ ext4_ext_get_actual_len(ex)) {
1896			/* need to allocate space after found extent */
1897			start = block;
1898			end = block + num;
1899			if (end >= next)
1900				end = next;
1901		} else if (block >= le32_to_cpu(ex->ee_block)) {
1902			/*
1903			 * some part of requested space is covered
1904			 * by found extent
1905			 */
1906			start = block;
1907			end = le32_to_cpu(ex->ee_block)
1908				+ ext4_ext_get_actual_len(ex);
1909			if (block + num < end)
1910				end = block + num;
1911			exists = 1;
1912		} else {
1913			BUG();
1914		}
1915		BUG_ON(end <= start);
1916
1917		if (!exists) {
1918			cbex.ec_block = start;
1919			cbex.ec_len = end - start;
1920			cbex.ec_start = 0;
1921			cbex.ec_type = EXT4_EXT_CACHE_GAP;
1922		} else {
1923			cbex.ec_block = le32_to_cpu(ex->ee_block);
1924			cbex.ec_len = ext4_ext_get_actual_len(ex);
1925			cbex.ec_start = ext_pblock(ex);
1926			cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
1927		}
1928
1929		if (unlikely(cbex.ec_len == 0)) {
1930			EXT4_ERROR_INODE(inode, "cbex.ec_len == 0");
1931			err = -EIO;
1932			break;
1933		}
1934		err = func(inode, path, &cbex, ex, cbdata);
1935		ext4_ext_drop_refs(path);
1936
1937		if (err < 0)
1938			break;
1939
1940		if (err == EXT_REPEAT)
1941			continue;
1942		else if (err == EXT_BREAK) {
1943			err = 0;
1944			break;
1945		}
1946
1947		if (ext_depth(inode) != depth) {
1948			/* depth was changed. we have to realloc path */
1949			kfree(path);
1950			path = NULL;
1951		}
1952
1953		block = cbex.ec_block + cbex.ec_len;
1954	}
1955
1956	if (path) {
1957		ext4_ext_drop_refs(path);
1958		kfree(path);
1959	}
1960
1961	return err;
1962}
1963
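/*
 * ext4_ext_put_in_cache:
 * remember a single extent (or gap) in the inode's one-entry extent
 * cache; i_cached_extent is protected by i_block_reservation_lock.
 */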
1964static void
1965ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block,
1966			__u32 len, ext4_fsblk_t start, int type)
1967{
1968	struct ext4_ext_cache *cex;
1969	BUG_ON(len == 0);
1970	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1971	cex = &EXT4_I(inode)->i_cached_extent;
1972	cex->ec_type = type;
1973	cex->ec_block = block;
1974	cex->ec_len = len;
1975	cex->ec_start = start;
1976	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1977}
1978
1979/*
1980 * ext4_ext_put_gap_in_cache:
1981 * calculate boundaries of the gap that the requested block fits into
1982 * and cache this gap
1983 */
1984static void
1985ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
1986				ext4_lblk_t block)
1987{
1988	int depth = ext_depth(inode);
1989	unsigned long len;
1990	ext4_lblk_t lblock;
1991	struct ext4_extent *ex;
1992
1993	ex = path[depth].p_ext;
1994	if (ex == NULL) {
1995		/* there is no extent yet, so gap is [0;-] */
1996		lblock = 0;
1997		len = EXT_MAX_BLOCK;
1998		ext_debug("cache gap(whole file):");
1999	} else if (block < le32_to_cpu(ex->ee_block)) {
2000		lblock = block;
2001		len = le32_to_cpu(ex->ee_block) - block;
2002		ext_debug("cache gap(before): %u [%u:%u]",
2003				block,
2004				le32_to_cpu(ex->ee_block),
2005				 ext4_ext_get_actual_len(ex));
2006	} else if (block >= le32_to_cpu(ex->ee_block)
2007			+ ext4_ext_get_actual_len(ex)) {
2008		ext4_lblk_t next;
2009		lblock = le32_to_cpu(ex->ee_block)
2010			+ ext4_ext_get_actual_len(ex);
2011
2012		next = ext4_ext_next_allocated_block(path);
2013		ext_debug("cache gap(after): [%u:%u] %u",
2014				le32_to_cpu(ex->ee_block),
2015				ext4_ext_get_actual_len(ex),
2016				block);
2017		BUG_ON(next == lblock);
2018		len = next - lblock;
2019	} else {
2020		lblock = len = 0;
2021		BUG();
2022	}
2023
2024	ext_debug(" -> %u:%lu\n", lblock, len);
2025	ext4_ext_put_in_cache(inode, lblock, len, 0, EXT4_EXT_CACHE_GAP);
2026}
2027
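/*
 * ext4_ext_in_cache:
 * checks whether the requested block lies within the cached extent or gap.
 * On a hit the cached entry is copied into 'ex' and its type
 * (EXT4_EXT_CACHE_GAP or EXT4_EXT_CACHE_EXTENT) is returned;
 * otherwise EXT4_EXT_CACHE_NO is returned.
 */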
2028static int
2029ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
2030			struct ext4_extent *ex)
2031{
2032	struct ext4_ext_cache *cex;
2033	int ret = EXT4_EXT_CACHE_NO;
2034
2035	/*
2036	 * We borrow i_block_reservation_lock to protect i_cached_extent
2037	 */
2038	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
2039	cex = &EXT4_I(inode)->i_cached_extent;
2040
	/* does the cache hold valid data? */
2042	if (cex->ec_type == EXT4_EXT_CACHE_NO)
2043		goto errout;
2044
2045	BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP &&
2046			cex->ec_type != EXT4_EXT_CACHE_EXTENT);
2047	if (in_range(block, cex->ec_block, cex->ec_len)) {
2048		ex->ee_block = cpu_to_le32(cex->ec_block);
2049		ext4_ext_store_pblock(ex, cex->ec_start);
2050		ex->ee_len = cpu_to_le16(cex->ec_len);
2051		ext_debug("%u cached by %u:%u:%llu\n",
2052				block,
2053				cex->ec_block, cex->ec_len, cex->ec_start);
2054		ret = cex->ec_type;
2055	}
2056errout:
2057	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
2058	return ret;
2059}
2060
/*
 * ext4_ext_rm_idx:
 * removes index from the index block.
 * It's used in the truncate case only, thus all requests are for
 * the last index in the block.
 */
2067static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
2068			struct ext4_ext_path *path)
2069{
2070	int err;
2071	ext4_fsblk_t leaf;
2072
2073	/* free index block */
2074	path--;
2075	leaf = idx_pblock(path->p_idx);
2076	if (unlikely(path->p_hdr->eh_entries == 0)) {
2077		EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
2078		return -EIO;
2079	}
2080	err = ext4_ext_get_access(handle, inode, path);
2081	if (err)
2082		return err;
2083	le16_add_cpu(&path->p_hdr->eh_entries, -1);
2084	err = ext4_ext_dirty(handle, inode, path);
2085	if (err)
2086		return err;
2087	ext_debug("index is empty, remove it, free block %llu\n", leaf);
2088	ext4_free_blocks(handle, inode, 0, leaf, 1,
2089			 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
2090	return err;
2091}
2092
/*
 * ext4_ext_calc_credits_for_single_extent:
 * This routine returns the maximum number of credits needed to insert
 * an extent into the extent tree.
 * When the actual path is passed, the caller should calculate the credits
 * under i_data_sem.
 */
2100int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
2101						struct ext4_ext_path *path)
2102{
2103	if (path) {
2104		int depth = ext_depth(inode);
2105		int ret = 0;
2106
		/* is there likely to be space in the leaf? */
2108		if (le16_to_cpu(path[depth].p_hdr->eh_entries)
2109				< le16_to_cpu(path[depth].p_hdr->eh_max)) {
2110
			/*
			 *  There is some space in the leaf, so there is no
			 *  need to account for the leaf block credit.
			 *
			 *  Bitmaps, block group descriptor blocks
			 *  and other metadata blocks still need to be
			 *  accounted for.
			 */
2119			/* 1 bitmap, 1 block group descriptor */
2120			ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb);
2121			return ret;
2122		}
2123	}
2124
2125	return ext4_chunk_trans_blocks(inode, nrblocks);
2126}
2127
/*
 * How many index/leaf blocks need to be changed/allocated to modify nrblocks?
 *
 * If nrblocks fit in a single extent (the chunk flag is 1), then in the
 * worst case one index/leaf per tree level needs to be changed; if the
 * tree splits due to the insertion of a new extent, the old index/leaf
 * blocks need to be updated as well.
 *
 * If the nrblocks are discontiguous, they could cause
 * the whole tree to split more than once, but this is really rare.
 */
2139int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
2140{
2141	int index;
2142	int depth = ext_depth(inode);
2143
2144	if (chunk)
2145		index = depth * 2;
2146	else
2147		index = depth * 3;
2148
2149	return index;
2150}
2151
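/*
 * ext4_remove_blocks:
 * frees the on-disk blocks backing logical blocks 'from'..'to' of extent
 * 'ex'. Only tail removal is implemented; other ranges merely log a
 * "strange request" message.
 */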
2152static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2153				struct ext4_extent *ex,
2154				ext4_lblk_t from, ext4_lblk_t to)
2155{
2156	unsigned short ee_len =  ext4_ext_get_actual_len(ex);
2157	int flags = EXT4_FREE_BLOCKS_FORGET;
2158
2159	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
2160		flags |= EXT4_FREE_BLOCKS_METADATA;
2161#ifdef EXTENTS_STATS
2162	{
2163		struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2164		spin_lock(&sbi->s_ext_stats_lock);
2165		sbi->s_ext_blocks += ee_len;
2166		sbi->s_ext_extents++;
2167		if (ee_len < sbi->s_ext_min)
2168			sbi->s_ext_min = ee_len;
2169		if (ee_len > sbi->s_ext_max)
2170			sbi->s_ext_max = ee_len;
2171		if (ext_depth(inode) > sbi->s_depth_max)
2172			sbi->s_depth_max = ext_depth(inode);
2173		spin_unlock(&sbi->s_ext_stats_lock);
2174	}
2175#endif
2176	if (from >= le32_to_cpu(ex->ee_block)
2177	    && to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
2178		/* tail removal */
2179		ext4_lblk_t num;
2180		ext4_fsblk_t start;
2181
2182		num = le32_to_cpu(ex->ee_block) + ee_len - from;
2183		start = ext_pblock(ex) + ee_len - num;
2184		ext_debug("free last %u blocks starting %llu\n", num, start);
2185		ext4_free_blocks(handle, inode, 0, start, num, flags);
2186	} else if (from == le32_to_cpu(ex->ee_block)
2187		   && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
2188		printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n",
2189			from, to, le32_to_cpu(ex->ee_block), ee_len);
2190	} else {
2191		printk(KERN_INFO "strange request: removal(2) "
2192				"%u-%u from %u:%u\n",
2193				from, to, le32_to_cpu(ex->ee_block), ee_len);
2194	}
2195	return 0;
2196}
2197
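/*
 * ext4_ext_rm_leaf:
 * removes the extents (or extent tails) at or beyond logical block 'start'
 * from the leaf pointed to by 'path', freeing their data blocks and, if the
 * leaf becomes empty, the index entry that refers to it.
 */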
2198static int
2199ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2200		struct ext4_ext_path *path, ext4_lblk_t start)
2201{
2202	int err = 0, correct_index = 0;
2203	int depth = ext_depth(inode), credits;
2204	struct ext4_extent_header *eh;
2205	ext4_lblk_t a, b, block;
2206	unsigned num;
2207	ext4_lblk_t ex_ee_block;
2208	unsigned short ex_ee_len;
2209	unsigned uninitialized = 0;
2210	struct ext4_extent *ex;
2211
	/* the header must already have been checked in ext4_ext_remove_space() */
2213	ext_debug("truncate since %u in leaf\n", start);
2214	if (!path[depth].p_hdr)
2215		path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
2216	eh = path[depth].p_hdr;
2217	if (unlikely(path[depth].p_hdr == NULL)) {
2218		EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
2219		return -EIO;
2220	}
2221	/* find where to start removing */
2222	ex = EXT_LAST_EXTENT(eh);
2223
2224	ex_ee_block = le32_to_cpu(ex->ee_block);
2225	ex_ee_len = ext4_ext_get_actual_len(ex);
2226
2227	while (ex >= EXT_FIRST_EXTENT(eh) &&
2228			ex_ee_block + ex_ee_len > start) {
2229
2230		if (ext4_ext_is_uninitialized(ex))
2231			uninitialized = 1;
2232		else
2233			uninitialized = 0;
2234
2235		ext_debug("remove ext %u:[%d]%d\n", ex_ee_block,
2236			 uninitialized, ex_ee_len);
2237		path[depth].p_ext = ex;
2238
2239		a = ex_ee_block > start ? ex_ee_block : start;
2240		b = ex_ee_block + ex_ee_len - 1 < EXT_MAX_BLOCK ?
2241			ex_ee_block + ex_ee_len - 1 : EXT_MAX_BLOCK;
2242
2243		ext_debug("  border %u:%u\n", a, b);
2244
2245		if (a != ex_ee_block && b != ex_ee_block + ex_ee_len - 1) {
2246			block = 0;
2247			num = 0;
2248			BUG();
2249		} else if (a != ex_ee_block) {
2250			/* remove tail of the extent */
2251			block = ex_ee_block;
2252			num = a - block;
2253		} else if (b != ex_ee_block + ex_ee_len - 1) {
2254			/* remove head of the extent */
2255			block = a;
2256			num = b - a;
2257			/* there is no "make a hole" API yet */
2258			BUG();
2259		} else {
2260			/* remove whole extent: excellent! */
2261			block = ex_ee_block;
2262			num = 0;
2263			BUG_ON(a != ex_ee_block);
2264			BUG_ON(b != ex_ee_block + ex_ee_len - 1);
2265		}
2266
		/*
		 * 3 for leaf, sb, and inode plus 2 (bitmap and block
		 * group descriptor) for each block group; assume two
		 * block groups plus ex_ee_len/blocks_per_block_group
		 * for the worst case
		 */
2273		credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb));
2274		if (ex == EXT_FIRST_EXTENT(eh)) {
2275			correct_index = 1;
2276			credits += (ext_depth(inode)) + 1;
2277		}
2278		credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
2279
2280		err = ext4_ext_truncate_extend_restart(handle, inode, credits);
2281		if (err)
2282			goto out;
2283
2284		err = ext4_ext_get_access(handle, inode, path + depth);
2285		if (err)
2286			goto out;
2287
2288		err = ext4_remove_blocks(handle, inode, ex, a, b);
2289		if (err)
2290			goto out;
2291
2292		if (num == 0) {
2293			/* this extent is removed; mark slot entirely unused */
2294			ext4_ext_store_pblock(ex, 0);
2295			le16_add_cpu(&eh->eh_entries, -1);
2296		}
2297
2298		ex->ee_block = cpu_to_le32(block);
2299		ex->ee_len = cpu_to_le16(num);
2300		/*
2301		 * Do not mark uninitialized if all the blocks in the
2302		 * extent have been removed.
2303		 */
2304		if (uninitialized && num)
2305			ext4_ext_mark_uninitialized(ex);
2306
2307		err = ext4_ext_dirty(handle, inode, path + depth);
2308		if (err)
2309			goto out;
2310
2311		ext_debug("new extent: %u:%u:%llu\n", block, num,
2312				ext_pblock(ex));
2313		ex--;
2314		ex_ee_block = le32_to_cpu(ex->ee_block);
2315		ex_ee_len = ext4_ext_get_actual_len(ex);
2316	}
2317
2318	if (correct_index && eh->eh_entries)
2319		err = ext4_ext_correct_indexes(handle, inode, path);
2320
2321	/* if this leaf is free, then we should
2322	 * remove it from index block above */
2323	if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
2324		err = ext4_ext_rm_idx(handle, inode, path + depth);
2325
2326out:
2327	return err;
2328}
2329
2330/*
2331 * ext4_ext_more_to_rm:
2332 * returns 1 if current index has to be freed (even partial)
2333 */
2334static int
2335ext4_ext_more_to_rm(struct ext4_ext_path *path)
2336{
2337	BUG_ON(path->p_idx == NULL);
2338
2339	if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr))
2340		return 0;
2341
2342	/*
2343	 * if truncate on deeper level happened, it wasn't partial,
2344	 * so we have to consider current index for truncation
2345	 */
2346	if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block)
2347		return 0;
2348	return 1;
2349}
2350
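/*
 * ext4_ext_remove_space:
 * walks the extent tree from the right, freeing all blocks at or beyond
 * logical block 'start' and shrinking the tree as leaves and index blocks
 * become empty. Used by truncate.
 */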
2351static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2352{
2353	struct super_block *sb = inode->i_sb;
2354	int depth = ext_depth(inode);
2355	struct ext4_ext_path *path;
2356	handle_t *handle;
2357	int i, err;
2358
2359	ext_debug("truncate since %u\n", start);
2360
	/* the first extent we free will probably be the last one in its block */
2362	handle = ext4_journal_start(inode, depth + 1);
2363	if (IS_ERR(handle))
2364		return PTR_ERR(handle);
2365
2366again:
2367	ext4_ext_invalidate_cache(inode);
2368
2369	/*
2370	 * We start scanning from right side, freeing all the blocks
2371	 * after i_size and walking into the tree depth-wise.
2372	 */
2373	depth = ext_depth(inode);
2374	path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS);
2375	if (path == NULL) {
2376		ext4_journal_stop(handle);
2377		return -ENOMEM;
2378	}
2379	path[0].p_depth = depth;
2380	path[0].p_hdr = ext_inode_hdr(inode);
2381	if (ext4_ext_check(inode, path[0].p_hdr, depth)) {
2382		err = -EIO;
2383		goto out;
2384	}
2385	i = err = 0;
2386
2387	while (i >= 0 && err == 0) {
2388		if (i == depth) {
2389			/* this is leaf block */
2390			err = ext4_ext_rm_leaf(handle, inode, path, start);
2391			/* root level has p_bh == NULL, brelse() eats this */
2392			brelse(path[i].p_bh);
2393			path[i].p_bh = NULL;
2394			i--;
2395			continue;
2396		}
2397
2398		/* this is index block */
2399		if (!path[i].p_hdr) {
2400			ext_debug("initialize header\n");
2401			path[i].p_hdr = ext_block_hdr(path[i].p_bh);
2402		}
2403
2404		if (!path[i].p_idx) {
2405			/* this level hasn't been touched yet */
2406			path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr);
2407			path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1;
2408			ext_debug("init index ptr: hdr 0x%p, num %d\n",
2409				  path[i].p_hdr,
2410				  le16_to_cpu(path[i].p_hdr->eh_entries));
2411		} else {
			/* we were already here, move on to the next index */
2413			path[i].p_idx--;
2414		}
2415
2416		ext_debug("level %d - index, first 0x%p, cur 0x%p\n",
2417				i, EXT_FIRST_INDEX(path[i].p_hdr),
2418				path[i].p_idx);
2419		if (ext4_ext_more_to_rm(path + i)) {
2420			struct buffer_head *bh;
2421			/* go to the next level */
2422			ext_debug("move to level %d (block %llu)\n",
2423				  i + 1, idx_pblock(path[i].p_idx));
2424			memset(path + i + 1, 0, sizeof(*path));
2425			bh = sb_bread(sb, idx_pblock(path[i].p_idx));
2426			if (!bh) {
2427				/* should we reset i_size? */
2428				err = -EIO;
2429				break;
2430			}
2431			if (WARN_ON(i + 1 > depth)) {
2432				err = -EIO;
2433				break;
2434			}
2435			if (ext4_ext_check(inode, ext_block_hdr(bh),
2436							depth - i - 1)) {
2437				err = -EIO;
2438				break;
2439			}
2440			path[i + 1].p_bh = bh;
2441
2442			/* save actual number of indexes since this
2443			 * number is changed at the next iteration */
2444			path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries);
2445			i++;
2446		} else {
2447			/* we finished processing this index, go up */
2448			if (path[i].p_hdr->eh_entries == 0 && i > 0) {
				/* index is empty, remove it;
				 * the handle must already have been prepared
				 * by the leaf removal (ext4_ext_rm_leaf()) */
2452				err = ext4_ext_rm_idx(handle, inode, path + i);
2453			}
2454			/* root level has p_bh == NULL, brelse() eats this */
2455			brelse(path[i].p_bh);
2456			path[i].p_bh = NULL;
2457			i--;
2458			ext_debug("return to level %d\n", i);
2459		}
2460	}
2461
2462	/* TODO: flexible tree reduction should be here */
2463	if (path->p_hdr->eh_entries == 0) {
		/*
		 * truncating to zero freed the whole tree,
		 * so we need to correct eh_depth
		 */
2468		err = ext4_ext_get_access(handle, inode, path);
2469		if (err == 0) {
2470			ext_inode_hdr(inode)->eh_depth = 0;
2471			ext_inode_hdr(inode)->eh_max =
2472				cpu_to_le16(ext4_ext_space_root(inode, 0));
2473			err = ext4_ext_dirty(handle, inode, path);
2474		}
2475	}
2476out:
2477	ext4_ext_drop_refs(path);
2478	kfree(path);
2479	if (err == -EAGAIN)
2480		goto again;
2481	ext4_journal_stop(handle);
2482
2483	return err;
2484}
2485
2486/*
2487 * called at mount time
2488 */
2489void ext4_ext_init(struct super_block *sb)
2490{
2491	/*
2492	 * possible initialization would be here
2493	 */
2494
2495	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
2496#if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS)
2497		printk(KERN_INFO "EXT4-fs: file extents enabled");
2498#ifdef AGGRESSIVE_TEST
2499		printk(", aggressive tests");
2500#endif
2501#ifdef CHECK_BINSEARCH
2502		printk(", check binsearch");
2503#endif
2504#ifdef EXTENTS_STATS
2505		printk(", stats");
2506#endif
2507		printk("\n");
2508#endif
2509#ifdef EXTENTS_STATS
2510		spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
2511		EXT4_SB(sb)->s_ext_min = 1 << 30;
2512		EXT4_SB(sb)->s_ext_max = 0;
2513#endif
2514	}
2515}
2516
2517/*
2518 * called at umount time
2519 */
2520void ext4_ext_release(struct super_block *sb)
2521{
2522	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
2523		return;
2524
2525#ifdef EXTENTS_STATS
2526	if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) {
2527		struct ext4_sb_info *sbi = EXT4_SB(sb);
2528		printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n",
2529			sbi->s_ext_blocks, sbi->s_ext_extents,
2530			sbi->s_ext_blocks / sbi->s_ext_extents);
2531		printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n",
2532			sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max);
2533	}
2534#endif
2535}
2536
2537static void bi_complete(struct bio *bio, int error)
2538{
2539	complete((struct completion *)bio->bi_private);
2540}
2541
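/*
 * ext4_ext_zeroout:
 * zeroes the on-disk blocks covered by extent 'ex' by building bios from
 * ZERO_PAGE(0) and submitting them synchronously.
 */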
2542static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2543{
2544	int ret;
2545	struct bio *bio;
2546	int blkbits, blocksize;
2547	sector_t ee_pblock;
2548	struct completion event;
2549	unsigned int ee_len, len, done, offset;
2550
2551
2552	blkbits   = inode->i_blkbits;
2553	blocksize = inode->i_sb->s_blocksize;
2554	ee_len    = ext4_ext_get_actual_len(ex);
2555	ee_pblock = ext_pblock(ex);
2556
2557	/* convert ee_pblock to 512 byte sectors */
2558	ee_pblock = ee_pblock << (blkbits - 9);
2559
2560	while (ee_len > 0) {
2561
2562		if (ee_len > BIO_MAX_PAGES)
2563			len = BIO_MAX_PAGES;
2564		else
2565			len = ee_len;
2566
2567		bio = bio_alloc(GFP_NOIO, len);
2568		if (!bio)
2569			return -ENOMEM;
2570
2571		bio->bi_sector = ee_pblock;
2572		bio->bi_bdev   = inode->i_sb->s_bdev;
2573
2574		done = 0;
2575		offset = 0;
2576		while (done < len) {
2577			ret = bio_add_page(bio, ZERO_PAGE(0),
2578							blocksize, offset);
2579			if (ret != blocksize) {
2580				/*
2581				 * We can't add any more pages because of
2582				 * hardware limitations.  Start a new bio.
2583				 */
2584				break;
2585			}
2586			done++;
2587			offset += blocksize;
2588			if (offset >= PAGE_CACHE_SIZE)
2589				offset = 0;
2590		}
2591
2592		init_completion(&event);
2593		bio->bi_private = &event;
2594		bio->bi_end_io = bi_complete;
2595		submit_bio(WRITE, bio);
2596		wait_for_completion(&event);
2597
2598		if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
2599			bio_put(bio);
2600			return -EIO;
2601		}
2602		bio_put(bio);
2603		ee_len    -= done;
2604		ee_pblock += done  << (blkbits - 9);
2605	}
2606	return 0;
2607}
2608
2609#define EXT4_EXT_ZERO_LEN 7
/*
 * This function is called by ext4_ext_map_blocks() if someone tries to write
 * to an uninitialized extent. It may result in splitting the uninitialized
 * extent into multiple extents (up to three - one initialized and two
 * uninitialized).
 * There are three possibilities:
 *   a> There is no split required: Entire extent should be initialized
 *   b> Splits in two extents: Write is happening at either end of the extent
 *   c> Splits in three extents: Someone is writing in the middle of the extent
 */
2620static int ext4_ext_convert_to_initialized(handle_t *handle,
2621					   struct inode *inode,
2622					   struct ext4_map_blocks *map,
2623					   struct ext4_ext_path *path)
2624{
2625	struct ext4_extent *ex, newex, orig_ex;
2626	struct ext4_extent *ex1 = NULL;
2627	struct ext4_extent *ex2 = NULL;
2628	struct ext4_extent *ex3 = NULL;
2629	struct ext4_extent_header *eh;
2630	ext4_lblk_t ee_block, eof_block;
2631	unsigned int allocated, ee_len, depth;
2632	ext4_fsblk_t newblock;
2633	int err = 0;
2634	int ret = 0;
2635	int may_zeroout;
2636
	ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical "
		"block %llu, max_blocks %u\n", inode->i_ino,
		(unsigned long long)map->m_lblk, map->m_len);
2640
2641	eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
2642		inode->i_sb->s_blocksize_bits;
2643	if (eof_block < map->m_lblk + map->m_len)
2644		eof_block = map->m_lblk + map->m_len;
2645
2646	depth = ext_depth(inode);
2647	eh = path[depth].p_hdr;
2648	ex = path[depth].p_ext;
2649	ee_block = le32_to_cpu(ex->ee_block);
2650	ee_len = ext4_ext_get_actual_len(ex);
2651	allocated = ee_len - (map->m_lblk - ee_block);
2652	newblock = map->m_lblk - ee_block + ext_pblock(ex);
2653
2654	ex2 = ex;
2655	orig_ex.ee_block = ex->ee_block;
2656	orig_ex.ee_len   = cpu_to_le16(ee_len);
2657	ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
2658
	/*
	 * It is safe to convert the extent to initialized via explicit
	 * zeroout only if the extent lies fully inside i_size or new_size.
	 */
2663	may_zeroout = ee_block + ee_len <= eof_block;
2664
2665	err = ext4_ext_get_access(handle, inode, path + depth);
2666	if (err)
2667		goto out;
	/* if the extent has fewer than 2*EXT4_EXT_ZERO_LEN blocks,
	 * zero it out directly */
2669	if (ee_len <= 2*EXT4_EXT_ZERO_LEN && may_zeroout) {
2670		err =  ext4_ext_zeroout(inode, &orig_ex);
2671		if (err)
2672			goto fix_extent_len;
2673		/* update the extent length and mark as initialized */
2674		ex->ee_block = orig_ex.ee_block;
2675		ex->ee_len   = orig_ex.ee_len;
2676		ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2677		ext4_ext_dirty(handle, inode, path + depth);
2678		/* zeroed the full extent */
2679		return allocated;
2680	}
2681
2682	/* ex1: ee_block to map->m_lblk - 1 : uninitialized */
2683	if (map->m_lblk > ee_block) {
2684		ex1 = ex;
2685		ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
2686		ext4_ext_mark_uninitialized(ex1);
2687		ex2 = &newex;
2688	}
2689	/*
2690	 * for sanity, update the length of the ex2 extent before
2691	 * we insert ex3, if ex1 is NULL. This is to avoid temporary
2692	 * overlap of blocks.
2693	 */
2694	if (!ex1 && allocated > map->m_len)
2695		ex2->ee_len = cpu_to_le16(map->m_len);
2696	/* ex3: to ee_block + ee_len : uninitialised */
2697	if (allocated > map->m_len) {
2698		unsigned int newdepth;
		/* if the extent has fewer than EXT4_EXT_ZERO_LEN blocks,
		 * zero it out directly */
2700		if (allocated <= EXT4_EXT_ZERO_LEN && may_zeroout) {
			/*
			 * map->m_lblk == ee_block is handled by the zeroout
			 * at the beginning.
			 * Mark the first half uninitialized.
			 * Mark the second half initialized and zero out the
			 * initialized extent.
			 */
2708			ex->ee_block = orig_ex.ee_block;
2709			ex->ee_len   = cpu_to_le16(ee_len - allocated);
2710			ext4_ext_mark_uninitialized(ex);
2711			ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2712			ext4_ext_dirty(handle, inode, path + depth);
2713
2714			ex3 = &newex;
2715			ex3->ee_block = cpu_to_le32(map->m_lblk);
2716			ext4_ext_store_pblock(ex3, newblock);
2717			ex3->ee_len = cpu_to_le16(allocated);
2718			err = ext4_ext_insert_extent(handle, inode, path,
2719							ex3, 0);
2720			if (err == -ENOSPC) {
2721				err =  ext4_ext_zeroout(inode, &orig_ex);
2722				if (err)
2723					goto fix_extent_len;
2724				ex->ee_block = orig_ex.ee_block;
2725				ex->ee_len   = orig_ex.ee_len;
2726				ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2727				ext4_ext_dirty(handle, inode, path + depth);
2728				/* blocks available from map->m_lblk */
2729				return allocated;
2730
2731			} else if (err)
2732				goto fix_extent_len;
2733
			/*
			 * We need to zero out the second half because
			 * a fallocate request can update the file size and
			 * converting the second half to an initialized extent
			 * implies that we could leak junk data to user
			 * space.
			 */
2741			err =  ext4_ext_zeroout(inode, ex3);
2742			if (err) {
				/*
				 * We should actually mark the second half
				 * as uninitialized and return the error;
				 * the insert may have changed the extent.
				 */
2748				depth = ext_depth(inode);
2749				ext4_ext_drop_refs(path);
2750				path = ext4_ext_find_extent(inode, map->m_lblk,
2751							    path);
2752				if (IS_ERR(path)) {
2753					err = PTR_ERR(path);
2754					return err;
2755				}
2756				/* get the second half extent details */
2757				ex = path[depth].p_ext;
2758				err = ext4_ext_get_access(handle, inode,
2759								path + depth);
2760				if (err)
2761					return err;
2762				ext4_ext_mark_uninitialized(ex);
2763				ext4_ext_dirty(handle, inode, path + depth);
2764				return err;
2765			}
2766
2767			/* zeroed the second half */
2768			return allocated;
2769		}
2770		ex3 = &newex;
2771		ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len);
2772		ext4_ext_store_pblock(ex3, newblock + map->m_len);
2773		ex3->ee_len = cpu_to_le16(allocated - map->m_len);
2774		ext4_ext_mark_uninitialized(ex3);
2775		err = ext4_ext_insert_extent(handle, inode, path, ex3, 0);
2776		if (err == -ENOSPC && may_zeroout) {
2777			err =  ext4_ext_zeroout(inode, &orig_ex);
2778			if (err)
2779				goto fix_extent_len;
2780			/* update the extent length and mark as initialized */
2781			ex->ee_block = orig_ex.ee_block;
2782			ex->ee_len   = orig_ex.ee_len;
2783			ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2784			ext4_ext_dirty(handle, inode, path + depth);
2785			/* zeroed the full extent */
2786			/* blocks available from map->m_lblk */
2787			return allocated;
2788
2789		} else if (err)
2790			goto fix_extent_len;
2791		/*
2792		 * The depth, and hence eh & ex might change
2793		 * as part of the insert above.
2794		 */
2795		newdepth = ext_depth(inode);
2796		/*
2797		 * update the extent length after successful insert of the
2798		 * split extent
2799		 */
2800		ee_len -= ext4_ext_get_actual_len(ex3);
2801		orig_ex.ee_len = cpu_to_le16(ee_len);
2802		may_zeroout = ee_block + ee_len <= eof_block;
2803
2804		depth = newdepth;
2805		ext4_ext_drop_refs(path);
2806		path = ext4_ext_find_extent(inode, map->m_lblk, path);
2807		if (IS_ERR(path)) {
2808			err = PTR_ERR(path);
2809			goto out;
2810		}
2811		eh = path[depth].p_hdr;
2812		ex = path[depth].p_ext;
2813		if (ex2 != &newex)
2814			ex2 = ex;
2815
2816		err = ext4_ext_get_access(handle, inode, path + depth);
2817		if (err)
2818			goto out;
2819
2820		allocated = map->m_len;
2821
		/* If the extent has fewer than EXT4_EXT_ZERO_LEN blocks and
		 * we are trying to insert an extent in the middle, zero it
		 * out directly; otherwise give the extent a chance to merge
		 * to the left.
		 */
2826		if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN &&
2827			map->m_lblk != ee_block && may_zeroout) {
2828			err =  ext4_ext_zeroout(inode, &orig_ex);
2829			if (err)
2830				goto fix_extent_len;
2831			/* update the extent length and mark as initialized */
2832			ex->ee_block = orig_ex.ee_block;
2833			ex->ee_len   = orig_ex.ee_len;
2834			ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2835			ext4_ext_dirty(handle, inode, path + depth);
2836			/* zero out the first half */
2837			/* blocks available from map->m_lblk */
2838			return allocated;
2839		}
2840	}
2841	/*
2842	 * If there was a change of depth as part of the
2843	 * insertion of ex3 above, we need to update the length
2844	 * of the ex1 extent again here
2845	 */
2846	if (ex1 && ex1 != ex) {
2847		ex1 = ex;
2848		ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
2849		ext4_ext_mark_uninitialized(ex1);
2850		ex2 = &newex;
2851	}
2852	/* ex2: map->m_lblk to map->m_lblk + maxblocks-1 : initialised */
2853	ex2->ee_block = cpu_to_le32(map->m_lblk);
2854	ext4_ext_store_pblock(ex2, newblock);
2855	ex2->ee_len = cpu_to_le16(allocated);
2856	if (ex2 != ex)
2857		goto insert;
2858	/*
2859	 * New (initialized) extent starts from the first block
2860	 * in the current extent. i.e., ex2 == ex
2861	 * We have to see if it can be merged with the extent
2862	 * on the left.
2863	 */
2864	if (ex2 > EXT_FIRST_EXTENT(eh)) {
2865		/*
2866		 * To merge left, pass "ex2 - 1" to try_to_merge(),
2867		 * since it merges towards right _only_.
2868		 */
2869		ret = ext4_ext_try_to_merge(inode, path, ex2 - 1);
2870		if (ret) {
2871			err = ext4_ext_correct_indexes(handle, inode, path);
2872			if (err)
2873				goto out;
2874			depth = ext_depth(inode);
2875			ex2--;
2876		}
2877	}
	/*
	 * Try to merge towards the right. This might be required
	 * only when the whole extent is being written to,
	 * i.e. ex2 == ex and ex3 == NULL.
	 */
2883	if (!ex3) {
2884		ret = ext4_ext_try_to_merge(inode, path, ex2);
2885		if (ret) {
2886			err = ext4_ext_correct_indexes(handle, inode, path);
2887			if (err)
2888				goto out;
2889		}
2890	}
2891	/* Mark modified extent as dirty */
2892	err = ext4_ext_dirty(handle, inode, path + depth);
2893	goto out;
2894insert:
2895	err = ext4_ext_insert_extent(handle, inode, path, &newex, 0);
2896	if (err == -ENOSPC && may_zeroout) {
2897		err =  ext4_ext_zeroout(inode, &orig_ex);
2898		if (err)
2899			goto fix_extent_len;
2900		/* update the extent length and mark as initialized */
2901		ex->ee_block = orig_ex.ee_block;
2902		ex->ee_len   = orig_ex.ee_len;
2903		ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2904		ext4_ext_dirty(handle, inode, path + depth);
2905		/* zero out the first half */
2906		return allocated;
2907	} else if (err)
2908		goto fix_extent_len;
2909out:
2910	ext4_ext_show_leaf(inode, path);
2911	return err ? err : allocated;
2912
2913fix_extent_len:
2914	ex->ee_block = orig_ex.ee_block;
2915	ex->ee_len   = orig_ex.ee_len;
2916	ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2917	ext4_ext_mark_uninitialized(ex);
2918	ext4_ext_dirty(handle, inode, path + depth);
2919	return err;
2920}
2921
/*
 * This function is called by ext4_ext_map_blocks() from
 * ext4_get_blocks_dio_write() when DIO is about to write
 * to an uninitialized extent.
 *
 * Writing to an uninitialized extent may result in splitting the
 * uninitialized extent into multiple initialized/uninitialized extents
 * (up to three).
 * There are three possibilities:
 *   a> There is no split required: Entire extent should be uninitialized
 *   b> Splits in two extents: Write is happening at either end of the extent
 *   c> Splits in three extents: Someone is writing in the middle of the extent
 *
 * One or more index blocks may be needed if the extent tree grows after
 * the uninitialized extent split. To prevent ENOSPC from occurring at IO
 * completion, we need to split the uninitialized extent before the IO is
 * submitted. The uninitialized extent will be split into at most three
 * uninitialized extents. After the IO completes, the part being written
 * will be converted to initialized by the end_io callback function
 * via ext4_convert_unwritten_extents().
 *
 * Returns the size of the uninitialized extent to be written on success.
 */
2944static int ext4_split_unwritten_extents(handle_t *handle,
2945					struct inode *inode,
2946					struct ext4_map_blocks *map,
2947					struct ext4_ext_path *path,
2948					int flags)
2949{
2950	struct ext4_extent *ex, newex, orig_ex;
2951	struct ext4_extent *ex1 = NULL;
2952	struct ext4_extent *ex2 = NULL;
2953	struct ext4_extent *ex3 = NULL;
2954	ext4_lblk_t ee_block, eof_block;
2955	unsigned int allocated, ee_len, depth;
2956	ext4_fsblk_t newblock;
2957	int err = 0;
2958	int may_zeroout;
2959
	ext_debug("ext4_split_unwritten_extents: inode %lu, logical "
		"block %llu, max_blocks %u\n", inode->i_ino,
		(unsigned long long)map->m_lblk, map->m_len);
2963
2964	eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
2965		inode->i_sb->s_blocksize_bits;
2966	if (eof_block < map->m_lblk + map->m_len)
2967		eof_block = map->m_lblk + map->m_len;
2968
2969	depth = ext_depth(inode);
2970	ex = path[depth].p_ext;
2971	ee_block = le32_to_cpu(ex->ee_block);
2972	ee_len = ext4_ext_get_actual_len(ex);
2973	allocated = ee_len - (map->m_lblk - ee_block);
2974	newblock = map->m_lblk - ee_block + ext_pblock(ex);
2975
2976	ex2 = ex;
2977	orig_ex.ee_block = ex->ee_block;
2978	orig_ex.ee_len   = cpu_to_le16(ee_len);
2979	ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
2980
	/*
	 * It is safe to convert the extent to initialized via explicit
	 * zeroout only if the extent lies fully inside i_size or new_size.
	 */
2985	may_zeroout = ee_block + ee_len <= eof_block;
2986
	/*
	 * If the uninitialized extent begins at the same logical
	 * block where the write begins, and the write completely
	 * covers the extent, then we don't need to split it.
	 */
2992	if ((map->m_lblk == ee_block) && (allocated <= map->m_len))
2993		return allocated;
2994
2995	err = ext4_ext_get_access(handle, inode, path + depth);
2996	if (err)
2997		goto out;
2998	/* ex1: ee_block to map->m_lblk - 1 : uninitialized */
2999	if (map->m_lblk > ee_block) {
3000		ex1 = ex;
3001		ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
3002		ext4_ext_mark_uninitialized(ex1);
3003		ex2 = &newex;
3004	}
3005	/*
3006	 * for sanity, update the length of the ex2 extent before
3007	 * we insert ex3, if ex1 is NULL. This is to avoid temporary
3008	 * overlap of blocks.
3009	 */
3010	if (!ex1 && allocated > map->m_len)
3011		ex2->ee_len = cpu_to_le16(map->m_len);
3012	/* ex3: to ee_block + ee_len : uninitialised */
3013	if (allocated > map->m_len) {
3014		unsigned int newdepth;
3015		ex3 = &newex;
3016		ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len);
3017		ext4_ext_store_pblock(ex3, newblock + map->m_len);
3018		ex3->ee_len = cpu_to_le16(allocated - map->m_len);
3019		ext4_ext_mark_uninitialized(ex3);
3020		err = ext4_ext_insert_extent(handle, inode, path, ex3, flags);
3021		if (err == -ENOSPC && may_zeroout) {
3022			err =  ext4_ext_zeroout(inode, &orig_ex);
3023			if (err)
3024				goto fix_extent_len;
3025			/* update the extent length and mark as initialized */
3026			ex->ee_block = orig_ex.ee_block;
3027			ex->ee_len   = orig_ex.ee_len;
3028			ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
3029			ext4_ext_dirty(handle, inode, path + depth);
3030			/* zeroed the full extent */
3031			/* blocks available from map->m_lblk */
3032			return allocated;
3033
3034		} else if (err)
3035			goto fix_extent_len;
3036		/*
3037		 * The depth, and hence eh & ex might change
3038		 * as part of the insert above.
3039		 */
3040		newdepth = ext_depth(inode);
3041		/*
3042		 * update the extent length after successful insert of the
3043		 * split extent
3044		 */
3045		ee_len -= ext4_ext_get_actual_len(ex3);
3046		orig_ex.ee_len = cpu_to_le16(ee_len);
3047		may_zeroout = ee_block + ee_len <= eof_block;
3048
3049		depth = newdepth;
3050		ext4_ext_drop_refs(path);
3051		path = ext4_ext_find_extent(inode, map->m_lblk, path);
3052		if (IS_ERR(path)) {
3053			err = PTR_ERR(path);
3054			goto out;
3055		}
3056		ex = path[depth].p_ext;
3057		if (ex2 != &newex)
3058			ex2 = ex;
3059
3060		err = ext4_ext_get_access(handle, inode, path + depth);
3061		if (err)
3062			goto out;
3063
3064		allocated = map->m_len;
3065	}
3066	/*
3067	 * If there was a change of depth as part of the
3068	 * insertion of ex3 above, we need to update the length
3069	 * of the ex1 extent again here
3070	 */
3071	if (ex1 && ex1 != ex) {
3072		ex1 = ex;
3073		ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
3074		ext4_ext_mark_uninitialized(ex1);
3075		ex2 = &newex;
3076	}
3077	/*
3078	 * ex2: map->m_lblk to map->m_lblk + map->m_len-1 : to be written
3079	 * using direct I/O, uninitialised still.
3080	 */
3081	ex2->ee_block = cpu_to_le32(map->m_lblk);
3082	ext4_ext_store_pblock(ex2, newblock);
3083	ex2->ee_len = cpu_to_le16(allocated);
3084	ext4_ext_mark_uninitialized(ex2);
3085	if (ex2 != ex)
3086		goto insert;
3087	/* Mark modified extent as dirty */
3088	err = ext4_ext_dirty(handle, inode, path + depth);
3089	ext_debug("out here\n");
3090	goto out;
3091insert:
3092	err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
3093	if (err == -ENOSPC && may_zeroout) {
3094		err =  ext4_ext_zeroout(inode, &orig_ex);
3095		if (err)
3096			goto fix_extent_len;
3097		/* update the extent length and mark as initialized */
3098		ex->ee_block = orig_ex.ee_block;
3099		ex->ee_len   = orig_ex.ee_len;
3100		ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
3101		ext4_ext_dirty(handle, inode, path + depth);
3102		/* zero out the first half */
3103		return allocated;
3104	} else if (err)
3105		goto fix_extent_len;
3106out:
3107	ext4_ext_show_leaf(inode, path);
3108	return err ? err : allocated;
3109
3110fix_extent_len:
3111	ex->ee_block = orig_ex.ee_block;
3112	ex->ee_len   = orig_ex.ee_len;
3113	ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
3114	ext4_ext_mark_uninitialized(ex);
3115	ext4_ext_dirty(handle, inode, path + depth);
3116	return err;
3117}
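
/*
 * ext4_convert_unwritten_extents_endio:
 * called once the IO covering an already-split unwritten extent has
 * completed; marks the extent initialized and tries to merge it with its
 * neighbours on both sides.
 */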
3118static int ext4_convert_unwritten_extents_endio(handle_t *handle,
3119					      struct inode *inode,
3120					      struct ext4_ext_path *path)
3121{
3122	struct ext4_extent *ex;
3123	struct ext4_extent_header *eh;
3124	int depth;
3125	int err = 0;
3126	int ret = 0;
3127
3128	depth = ext_depth(inode);
3129	eh = path[depth].p_hdr;
3130	ex = path[depth].p_ext;
3131
3132	err = ext4_ext_get_access(handle, inode, path + depth);
3133	if (err)
3134		goto out;
3135	/* first mark the extent as initialized */
3136	ext4_ext_mark_initialized(ex);
3137
3138	/*
3139	 * We have to see if it can be merged with the extent
3140	 * on the left.
3141	 */
3142	if (ex > EXT_FIRST_EXTENT(eh)) {
3143		/*
3144		 * To merge left, pass "ex - 1" to try_to_merge(),
3145		 * since it merges towards right _only_.
3146		 */
3147		ret = ext4_ext_try_to_merge(inode, path, ex - 1);
3148		if (ret) {
3149			err = ext4_ext_correct_indexes(handle, inode, path);
3150			if (err)
3151				goto out;
3152			depth = ext_depth(inode);
3153			ex--;
3154		}
3155	}
3156	/*
3157	 * Try to Merge towards right.
3158	 */
3159	ret = ext4_ext_try_to_merge(inode, path, ex);
3160	if (ret) {
3161		err = ext4_ext_correct_indexes(handle, inode, path);
3162		if (err)
3163			goto out;
3164		depth = ext_depth(inode);
3165	}
3166	/* Mark modified extent as dirty */
3167	err = ext4_ext_dirty(handle, inode, path + depth);
3168out:
3169	ext4_ext_show_leaf(inode, path);
3170	return err;
3171}
3172
3173static void unmap_underlying_metadata_blocks(struct block_device *bdev,
3174			sector_t block, int count)
3175{
3176	int i;
3177	for (i = 0; i < count; i++)
		unmap_underlying_metadata(bdev, block + i);
3179}
3180
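/*
 * ext4_ext_handle_uninitialized_extents:
 * dispatches a map request that hit an uninitialized extent: split the
 * extent before the IO is submitted (EXT4_GET_BLOCKS_PRE_IO), convert it
 * to written once the IO completes (EXT4_GET_BLOCKS_CONVERT), hand the
 * range back as unwritten for plain lookups and repeated fallocate
 * requests, or convert it to initialized for a buffered write.
 */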
3181static int
3182ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3183			struct ext4_map_blocks *map,
3184			struct ext4_ext_path *path, int flags,
3185			unsigned int allocated, ext4_fsblk_t newblock)
3186{
3187	int ret = 0;
3188	int err = 0;
3189	ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
3190
	ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical "
		  "block %llu, max_blocks %u, flags %d, allocated %u\n",
		  inode->i_ino, (unsigned long long)map->m_lblk, map->m_len,
		  flags, allocated);
3195	ext4_ext_show_leaf(inode, path);
3196
	/* get_block() is called before submitting the IO: split the extent */
3198	if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
3199		ret = ext4_split_unwritten_extents(handle, inode, map,
3200						   path, flags);
		/*
		 * Flag the inode (non-AIO case) or the end_io struct
		 * (AIO case) that this IO needs conversion to written
		 * when the IO is completed.
		 */
3206		if (io)
3207			io->flag = EXT4_IO_UNWRITTEN;
3208		else
3209			ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3210		if (ext4_should_dioread_nolock(inode))
3211			map->m_flags |= EXT4_MAP_UNINIT;
3212		goto out;
3213	}
3214	/* IO end_io complete, convert the filled extent to written */
3215	if ((flags & EXT4_GET_BLOCKS_CONVERT)) {
3216		ret = ext4_convert_unwritten_extents_endio(handle, inode,
3217							path);
3218		if (ret >= 0)
3219			ext4_update_inode_fsync_trans(handle, inode, 1);
3220		goto out2;
3221	}
3222	/* buffered IO case */
	/*
	 * repeated fallocate creation request:
	 * we already have an unwritten extent
	 */
3227	if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)
3228		goto map_out;
3229
3230	/* buffered READ or buffered write_begin() lookup */
3231	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
3232		/*
3233		 * We have blocks reserved already.  We
3234		 * return allocated blocks so that delalloc
3235		 * won't do block reservation for us.  But
3236		 * the buffer head will be unmapped so that
3237		 * a read from the block returns 0s.
3238		 */
3239		map->m_flags |= EXT4_MAP_UNWRITTEN;
3240		goto out1;
3241	}
3242
	/* buffered write, writepage time, convert */
3244	ret = ext4_ext_convert_to_initialized(handle, inode, map, path);
3245	if (ret >= 0)
3246		ext4_update_inode_fsync_trans(handle, inode, 1);
3247out:
3248	if (ret <= 0) {
3249		err = ret;
3250		goto out2;
3251	} else
3252		allocated = ret;
3253	map->m_flags |= EXT4_MAP_NEW;
	/*
	 * if we allocated more blocks than requested,
	 * we need to make sure we unmap the extra blocks
	 * allocated. The blocks actually needed will get
	 * unmapped later when we find the buffer_head marked
	 * new.
	 */
3261	if (allocated > map->m_len) {
3262		unmap_underlying_metadata_blocks(inode->i_sb->s_bdev,
3263					newblock + map->m_len,
3264					allocated - map->m_len);
3265		allocated = map->m_len;
3266	}
3267
	/*
	 * If we have done fallocate on an offset that is already
	 * delayed allocated, we would have block and quota
	 * reservations done in the delayed write path.
	 * But fallocate would have already updated the quota and
	 * block count for this offset. So cancel these reservations.
	 */
3275	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
3276		ext4_da_update_reserve_space(inode, allocated, 0);
3277
3278map_out:
3279	map->m_flags |= EXT4_MAP_MAPPED;
3280out1:
3281	if (allocated > map->m_len)
3282		allocated = map->m_len;
3283	ext4_ext_show_leaf(inode, path);
3284	map->m_pblk = newblock;
3285	map->m_len = allocated;
3286out2:
3287	if (path) {
3288		ext4_ext_drop_refs(path);
3289		kfree(path);
3290	}
3291	return err ? err : allocated;
3292}

/*
 * Block allocation/map/preallocation routine for extent-based files
 *
 * Needs to be called with
 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system blocks
 * (i.e. create is zero), otherwise with down_write(&EXT4_I(inode)->i_data_sem)
 *
 * return > 0, number of blocks already mapped/allocated
 *          if create == 0 and these are pre-allocated blocks
 *          	buffer head is unmapped
 *          otherwise blocks are mapped
 *
 * return = 0, if plain look up failed (blocks have not been allocated)
 *          buffer head is unmapped
 *
 * return < 0, error case.
 */
3311int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3312			struct ext4_map_blocks *map, int flags)
3313{
3314	struct ext4_ext_path *path = NULL;
3315	struct ext4_extent_header *eh;
3316	struct ext4_extent newex, *ex, *last_ex;
3317	ext4_fsblk_t newblock;
3318	int i, err = 0, depth, ret, cache_type;
3319	unsigned int allocated = 0;
3320	struct ext4_allocation_request ar;
3321	ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
3322
3323	ext_debug("blocks %u/%u requested for inode %lu\n",
3324		  map->m_lblk, map->m_len, inode->i_ino);
3325
3326	/* check in cache */
3327	cache_type = ext4_ext_in_cache(inode, map->m_lblk, &newex);
3328	if (cache_type) {
3329		if (cache_type == EXT4_EXT_CACHE_GAP) {
3330			if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
3331				/*
3332				 * block isn't allocated yet and
3333				 * user doesn't want to allocate it
3334				 */
3335				goto out2;
3336			}
3337			/* we should allocate requested block */
3338		} else if (cache_type == EXT4_EXT_CACHE_EXTENT) {
3339			/* block is already allocated */
3340			newblock = map->m_lblk
3341				   - le32_to_cpu(newex.ee_block)
3342				   + ext_pblock(&newex);
3343			/* number of remaining blocks in the extent */
3344			allocated = ext4_ext_get_actual_len(&newex) -
3345				(map->m_lblk - le32_to_cpu(newex.ee_block));
3346			goto out;
3347		} else {
3348			BUG();
3349		}
3350	}
3351
3352	/* find extent for this block */
3353	path = ext4_ext_find_extent(inode, map->m_lblk, NULL);
3354	if (IS_ERR(path)) {
3355		err = PTR_ERR(path);
3356		path = NULL;
3357		goto out2;
3358	}
3359
3360	depth = ext_depth(inode);
3361
	/*
	 * A consistent leaf must not be empty;
	 * this situation is possible, though, _during_ tree modification;
	 * this is why the assertion can't be put in ext4_ext_find_extent()
	 */
3367	if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
3368		EXT4_ERROR_INODE(inode, "bad extent address "
3369				 "lblock: %lu, depth: %d pblock %lld",
3370				 (unsigned long) map->m_lblk, depth,
3371				 path[depth].p_block);
3372		err = -EIO;
3373		goto out2;
3374	}
3375	eh = path[depth].p_hdr;
3376
3377	ex = path[depth].p_ext;
3378	if (ex) {
3379		ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
3380		ext4_fsblk_t ee_start = ext_pblock(ex);
3381		unsigned short ee_len;
3382
3383		/*
3384		 * Uninitialized extents are treated as holes, except that
3385		 * we split out initialized portions during a write.
3386		 */
3387		ee_len = ext4_ext_get_actual_len(ex);
3388		/* if found extent covers block, simply return it */
3389		if (in_range(map->m_lblk, ee_block, ee_len)) {
3390			newblock = map->m_lblk - ee_block + ee_start;
3391			/* number of remaining blocks in the extent */
3392			allocated = ee_len - (map->m_lblk - ee_block);
3393			ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
3394				  ee_block, ee_len, newblock);
3395
3396			/* Do not put uninitialized extent in the cache */
3397			if (!ext4_ext_is_uninitialized(ex)) {
3398				ext4_ext_put_in_cache(inode, ee_block,
3399							ee_len, ee_start,
3400							EXT4_EXT_CACHE_EXTENT);
3401				goto out;
3402			}
3403			ret = ext4_ext_handle_uninitialized_extents(handle,
3404					inode, map, path, flags, allocated,
3405					newblock);
3406			return ret;
3407		}
3408	}
3409
	/*
	 * the requested block isn't allocated yet;
	 * we can't create blocks if the create flag is zero
	 */
3414	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
		/*
		 * put the just-found gap into the cache to speed up
		 * subsequent requests
		 */
3419		ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
3420		goto out2;
3421	}
3422	/*
3423	 * Okay, we need to do block allocation.
3424	 */
3425
3426	/* find neighbour allocated blocks */
3427	ar.lleft = map->m_lblk;
3428	err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
3429	if (err)
3430		goto out2;
3431	ar.lright = map->m_lblk;
3432	err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright);
3433	if (err)
3434		goto out2;
3435
3436	/*
3437	 * See if request is beyond maximum number of blocks we can have in
3438	 * a single extent. For an initialized extent this limit is
3439	 * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is
3440	 * EXT_UNINIT_MAX_LEN.
3441	 */
3442	if (map->m_len > EXT_INIT_MAX_LEN &&
3443	    !(flags & EXT4_GET_BLOCKS_UNINIT_EXT))
3444		map->m_len = EXT_INIT_MAX_LEN;
3445	else if (map->m_len > EXT_UNINIT_MAX_LEN &&
3446		 (flags & EXT4_GET_BLOCKS_UNINIT_EXT))
3447		map->m_len = EXT_UNINIT_MAX_LEN;
3448
3449	/* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */
3450	newex.ee_block = cpu_to_le32(map->m_lblk);
3451	newex.ee_len = cpu_to_le16(map->m_len);
3452	err = ext4_ext_check_overlap(inode, &newex, path);
3453	if (err)
3454		allocated = ext4_ext_get_actual_len(&newex);
3455	else
3456		allocated = map->m_len;
3457
3458	/* allocate new block */
3459	ar.inode = inode;
3460	ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk);
3461	ar.logical = map->m_lblk;
3462	ar.len = allocated;
3463	if (S_ISREG(inode->i_mode))
3464		ar.flags = EXT4_MB_HINT_DATA;
3465	else
3466		/* disable in-core preallocation for non-regular files */
3467		ar.flags = 0;
3468	newblock = ext4_mb_new_blocks(handle, &ar, &err);
3469	if (!newblock)
3470		goto out2;
3471	ext_debug("allocate new block: goal %llu, found %llu/%u\n",
3472		  ar.goal, newblock, allocated);
3473
3474	/* try to insert new extent into found leaf and return */
3475	ext4_ext_store_pblock(&newex, newblock);
3476	newex.ee_len = cpu_to_le16(ar.len);
3477	/* Mark uninitialized */
	if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) {
3479		ext4_ext_mark_uninitialized(&newex);
		/*
		 * An io_end structure is created for every IO write to an
		 * uninitialized extent. To avoid unnecessary conversion,
		 * here we flag the IO that really needs the conversion.
		 * For the non-async direct IO case, flag the inode state
		 * that we need to perform the conversion when the IO is done.
		 */
3487		if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
3488			if (io)
3489				io->flag = EXT4_IO_UNWRITTEN;
3490			else
3491				ext4_set_inode_state(inode,
3492						     EXT4_STATE_DIO_UNWRITTEN);
3493		}
3494		if (ext4_should_dioread_nolock(inode))
3495			map->m_flags |= EXT4_MAP_UNINIT;
3496	}
3497
3498	if (unlikely(ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))) {
3499		if (unlikely(!eh->eh_entries)) {
3500			EXT4_ERROR_INODE(inode,
3501					 "eh->eh_entries == 0 and "
3502					 "EOFBLOCKS_FL set");
3503			err = -EIO;
3504			goto out2;
3505		}
3506		last_ex = EXT_LAST_EXTENT(eh);
3507		/*
3508		 * If the current leaf block was reached by looking at
3509		 * the last index block all the way down the tree, and
3510		 * we are extending the inode beyond the last extent
3511		 * in the current leaf block, then clear the
3512		 * EOFBLOCKS_FL flag.
3513		 */
3514		for (i = depth-1; i >= 0; i--) {
3515			if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
3516				break;
3517		}
3518		if ((i < 0) &&
3519		    (map->m_lblk + ar.len > le32_to_cpu(last_ex->ee_block) +
3520		     ext4_ext_get_actual_len(last_ex)))
3521			ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
3522	}
3523	err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
3524	if (err) {
3525		/* free data blocks we just allocated */
		/* not a good idea to call discard here directly,
		 * but otherwise we'd need to call it on every free() */
3528		ext4_discard_preallocations(inode);
3529		ext4_free_blocks(handle, inode, 0, ext_pblock(&newex),
3530				 ext4_ext_get_actual_len(&newex), 0);
3531		goto out2;
3532	}
3533
3534	/* previous routine could use block we allocated */
3535	newblock = ext_pblock(&newex);
3536	allocated = ext4_ext_get_actual_len(&newex);
3537	if (allocated > map->m_len)
3538		allocated = map->m_len;
3539	map->m_flags |= EXT4_MAP_NEW;
3540
3541	/*
3542	 * Update reserved blocks/metadata blocks after successful
3543	 * block allocation which had been deferred till now.
3544	 */
3545	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
3546		ext4_da_update_reserve_space(inode, allocated, 1);
3547
3548	/*
3549	 * Cache the extent and update transaction to commit on fdatasync only
3550	 * when it is _not_ an uninitialized extent.
3551	 */
3552	if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) {
3553		ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock,
3554						EXT4_EXT_CACHE_EXTENT);
3555		ext4_update_inode_fsync_trans(handle, inode, 1);
3556	} else
3557		ext4_update_inode_fsync_trans(handle, inode, 0);
3558out:
3559	if (allocated > map->m_len)
3560		allocated = map->m_len;
3561	ext4_ext_show_leaf(inode, path);
3562	map->m_flags |= EXT4_MAP_MAPPED;
3563	map->m_pblk = newblock;
3564	map->m_len = allocated;
3565out2:
3566	if (path) {
3567		ext4_ext_drop_refs(path);
3568		kfree(path);
3569	}
3570	return err ? err : allocated;
3571}
3572
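/*
 * ext4_ext_truncate:
 * truncates the extent tree of an inode down to the new i_size, zeroing the
 * partial tail block and keeping the inode on the orphan list across the
 * multi-transaction truncate.
 */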
3573void ext4_ext_truncate(struct inode *inode)
3574{
3575	struct address_space *mapping = inode->i_mapping;
3576	struct super_block *sb = inode->i_sb;
3577	ext4_lblk_t last_block;
3578	handle_t *handle;
3579	int err = 0;
3580
	/*
	 * the first extent we free will probably be the last one in its block
	 */
3584	err = ext4_writepage_trans_blocks(inode);
3585	handle = ext4_journal_start(inode, err);
3586	if (IS_ERR(handle))
3587		return;
3588
3589	if (inode->i_size & (sb->s_blocksize - 1))
3590		ext4_block_truncate_page(handle, mapping, inode->i_size);
3591
3592	if (ext4_orphan_add(handle, inode))
3593		goto out_stop;
3594
3595	down_write(&EXT4_I(inode)->i_data_sem);
3596	ext4_ext_invalidate_cache(inode);
3597
3598	ext4_discard_preallocations(inode);
3599
3600	/*
3601	 * TODO: optimization is possible here.
3602	 * Probably we need not scan at all,
3603	 * because page truncation is enough.
3604	 */
3605
3606	/* we have to know where to truncate from in crash case */
3607	EXT4_I(inode)->i_disksize = inode->i_size;
3608	ext4_mark_inode_dirty(handle, inode);
3609
3610	last_block = (inode->i_size + sb->s_blocksize - 1)
3611			>> EXT4_BLOCK_SIZE_BITS(sb);
3612	err = ext4_ext_remove_space(inode, last_block);
3613
3614	/* In a multi-transaction truncate, we only make the final
3615	 * transaction synchronous.
3616	 */
3617	if (IS_SYNC(inode))
3618		ext4_handle_sync(handle);
3619
3620out_stop:
3621	up_write(&EXT4_I(inode)->i_data_sem);
3622	/*
3623	 * If this was a simple ftruncate() and the file will remain alive,
3624	 * then we need to clear up the orphan record which we created above.
3625	 * However, if this was a real unlink then we were called by
3626	 * ext4_delete_inode(), and we allow that function to clean up the
3627	 * orphan info for us.
3628	 */
3629	if (inode->i_nlink)
3630		ext4_orphan_del(handle, inode);
3631
3632	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
3633	ext4_mark_inode_dirty(handle, inode);
3634	ext4_journal_stop(handle);
3635}
3636
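/*
 * ext4_falloc_update_inode:
 * updates i_size/i_disksize (and ctime) after a successful fallocate; with
 * FALLOC_FL_KEEP_SIZE only the EOFBLOCKS flag is set when blocks were
 * allocated beyond EOF.
 */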
3637static void ext4_falloc_update_inode(struct inode *inode,
3638				int mode, loff_t new_size, int update_ctime)
3639{
3640	struct timespec now;
3641
3642	if (update_ctime) {
3643		now = current_fs_time(inode->i_sb);
3644		if (!timespec_equal(&inode->i_ctime, &now))
3645			inode->i_ctime = now;
3646	}
3647	/*
3648	 * Update only when preallocation was requested beyond
3649	 * the file size.
3650	 */
3651	if (!(mode & FALLOC_FL_KEEP_SIZE)) {
3652		if (new_size > i_size_read(inode))
3653			i_size_write(inode, new_size);
3654		if (new_size > EXT4_I(inode)->i_disksize)
3655			ext4_update_i_disksize(inode, new_size);
3656	} else {
3657		/*
3658		 * Mark that we allocate beyond EOF so the subsequent truncate
3659		 * can proceed even if the new size is the same as i_size.
3660		 */
3661		if (new_size > i_size_read(inode))
3662			ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
3663	}
3664
3665}
3666
/*
 * preallocate space for a file. This implements ext4's fallocate inode
 * operation, which gets called from the sys_fallocate system call.
 * For block-mapped files, posix_fallocate should fall back to the method
 * of writing zeroes to the required new blocks (the same behavior that is
 * expected for file systems which do not support the fallocate() system
 * call).
 */
3674long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
3675{
3676	handle_t *handle;
3677	loff_t new_size;
3678	unsigned int max_blocks;
3679	int ret = 0;
3680	int ret2 = 0;
3681	int retries = 0;
3682	struct ext4_map_blocks map;
3683	unsigned int credits, blkbits = inode->i_blkbits;
3684
3685	/*
3686	 * currently supporting (pre)allocate mode for extent-based
3687	 * files _only_
3688	 */
3689	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
3690		return -EOPNOTSUPP;
3691
3692	/* preallocation to directories is currently not supported */
3693	if (S_ISDIR(inode->i_mode))
3694		return -ENODEV;
3695
3696	map.m_lblk = offset >> blkbits;
	/*
	 * We can't just shift len to get max_blocks, because the range may
	 * straddle a block boundary: e.g. blocksize = 4096, offset = 3072
	 * and len = 2048 still cover two blocks.
	 */
3701	max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
3702		- map.m_lblk;
3703	/*
3704	 * credits to insert 1 extent into extent tree
3705	 */
3706	credits = ext4_chunk_trans_blocks(inode, max_blocks);
3707	mutex_lock(&inode->i_mutex);
3708	ret = inode_newsize_ok(inode, (len + offset));
3709	if (ret) {
3710		mutex_unlock(&inode->i_mutex);
3711		return ret;
3712	}
3713retry:
3714	while (ret >= 0 && ret < max_blocks) {
3715		map.m_lblk = map.m_lblk + ret;
3716		map.m_len = max_blocks = max_blocks - ret;
3717		handle = ext4_journal_start(inode, credits);
3718		if (IS_ERR(handle)) {
3719			ret = PTR_ERR(handle);
3720			break;
3721		}
3722		ret = ext4_map_blocks(handle, inode, &map,
3723				      EXT4_GET_BLOCKS_CREATE_UNINIT_EXT);
3724		if (ret <= 0) {
3725#ifdef EXT4FS_DEBUG
3726			WARN_ON(ret <= 0);
3727			printk(KERN_ERR "%s: ext4_ext_map_blocks "
3728				    "returned error inode#%lu, block=%u, "
3729				    "max_blocks=%u", __func__,
				    inode->i_ino, map.m_lblk, max_blocks);
3731#endif
3732			ext4_mark_inode_dirty(handle, inode);
3733			ret2 = ext4_journal_stop(handle);
3734			break;
3735		}
3736		if ((map.m_lblk + ret) >= (EXT4_BLOCK_ALIGN(offset + len,
3737						blkbits) >> blkbits))
3738			new_size = offset + len;
3739		else
			new_size = ((loff_t)map.m_lblk + ret) << blkbits;
3741
3742		ext4_falloc_update_inode(inode, mode, new_size,
3743					 (map.m_flags & EXT4_MAP_NEW));
3744		ext4_mark_inode_dirty(handle, inode);
3745		ret2 = ext4_journal_stop(handle);
3746		if (ret2)
3747			break;
3748	}
3749	if (ret == -ENOSPC &&
3750			ext4_should_retry_alloc(inode->i_sb, &retries)) {
3751		ret = 0;
3752		goto retry;
3753	}
3754	mutex_unlock(&inode->i_mutex);
3755	return ret > 0 ? ret2 : ret;
3756}
3757
/*
 * This function converts a range of blocks to written extents.  The caller
 * passes in the start offset and the size; all unwritten extents within
 * this range will be converted to written extents.
 *
 * This function is called from the direct I/O end_io callback to convert
 * the fallocated extents after the I/O has been completed.
 * Returns 0 on success.
 */
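/*
 * Typical call site (sketch, outside this file): the DIO completion path
 * passes in the byte range of the just-finished write, roughly
 *
 *	ext4_convert_unwritten_extents(inode, io_end->offset, io_end->size);
 *
 * where io_end is ext4's per-I/O completion tracking structure.
 */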
3768int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
3769				    ssize_t len)
3770{
3771	handle_t *handle;
3772	unsigned int max_blocks;
3773	int ret = 0;
3774	int ret2 = 0;
3775	struct ext4_map_blocks map;
3776	unsigned int credits, blkbits = inode->i_blkbits;
3777
3778	map.m_lblk = offset >> blkbits;
	/*
	 * We can't just convert len to max_blocks because the range may
	 * straddle block boundaries (see the worked example in
	 * ext4_fallocate() above).
	 */
3783	max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) -
3784		      map.m_lblk);
3785	/*
3786	 * credits to insert 1 extent into extent tree
3787	 */
3788	credits = ext4_chunk_trans_blocks(inode, max_blocks);
3789	while (ret >= 0 && ret < max_blocks) {
3790		map.m_lblk += ret;
3791		map.m_len = (max_blocks -= ret);
3792		handle = ext4_journal_start(inode, credits);
3793		if (IS_ERR(handle)) {
3794			ret = PTR_ERR(handle);
3795			break;
3796		}
3797		ret = ext4_map_blocks(handle, inode, &map,
3798				      EXT4_GET_BLOCKS_IO_CONVERT_EXT);
3799		if (ret <= 0) {
3800			WARN_ON(ret <= 0);
			printk(KERN_ERR "%s: ext4_ext_map_blocks "
				    "returned error inode#%lu, block=%u, "
				    "max_blocks=%u\n", __func__,
				    inode->i_ino, map.m_lblk, map.m_len);
3805		}
3806		ext4_mark_inode_dirty(handle, inode);
3807		ret2 = ext4_journal_stop(handle);
		if (ret <= 0 || ret2)
3809			break;
3810	}
3811	return ret > 0 ? ret2 : ret;
3812}

/*
 * Callback function called for each extent to gather FIEMAP information.
 * Returns EXT_CONTINUE to keep walking, EXT_BREAK once the caller's fiemap
 * buffer is full, or a negative error code.
 */
3816static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
3817		       struct ext4_ext_cache *newex, struct ext4_extent *ex,
3818		       void *data)
3819{
3820	struct fiemap_extent_info *fieinfo = data;
3821	unsigned char blksize_bits = inode->i_sb->s_blocksize_bits;
3822	__u64	logical;
3823	__u64	physical;
3824	__u64	length;
3825	__u32	flags = 0;
3826	int	error;
3827
3828	logical =  (__u64)newex->ec_block << blksize_bits;
3829
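	/*
	 * A cached GAP means the extent tree has no mapping for this range.
	 * Peek at the page cache: if the first buffer of the page covering
	 * the start of the gap is marked delayed, report the range as a
	 * DELALLOC extent; otherwise there is nothing to report here.
	 */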
3830	if (newex->ec_type == EXT4_EXT_CACHE_GAP) {
3831		pgoff_t offset;
3832		struct page *page;
3833		struct buffer_head *bh = NULL;
3834
		offset = logical >> PAGE_SHIFT;
		page = find_get_page(inode->i_mapping, offset);
		if (!page || !page_has_buffers(page)) {
			/* drop the reference taken by find_get_page() */
			if (page)
				page_cache_release(page);
			return EXT_CONTINUE;
		}

		bh = page_buffers(page);
		if (!bh) {
			page_cache_release(page);
			return EXT_CONTINUE;
		}

		if (buffer_delay(bh)) {
			flags |= FIEMAP_EXTENT_DELALLOC;
			page_cache_release(page);
		} else {
			page_cache_release(page);
			return EXT_CONTINUE;
		}
3852	}
3853
3854	physical = (__u64)newex->ec_start << blksize_bits;
3855	length =   (__u64)newex->ec_len << blksize_bits;
3856
3857	if (ex && ext4_ext_is_uninitialized(ex))
3858		flags |= FIEMAP_EXTENT_UNWRITTEN;
3859
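	/*
	 * If this is the last extent in the file (no further allocated
	 * blocks, or the extent reaches EXT_MAX_BLOCK), flag it as such and,
	 * for delalloc extents that run past i_size, trim the reported
	 * length back to the block-aligned end of file.
	 */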
3860	if (ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK ||
3861	    newex->ec_block + newex->ec_len - 1 == EXT_MAX_BLOCK) {
3862		loff_t size = i_size_read(inode);
3863		loff_t bs = EXT4_BLOCK_SIZE(inode->i_sb);
3864
3865		flags |= FIEMAP_EXTENT_LAST;
3866		if ((flags & FIEMAP_EXTENT_DELALLOC) &&
3867		    logical+length > size)
3868			length = (size - logical + bs - 1) & ~(bs-1);
3869	}
3870
3871	error = fiemap_fill_next_extent(fieinfo, logical, physical,
3872					length, flags);
3873	if (error < 0)
3874		return error;
3875	if (error == 1)
3876		return EXT_BREAK;
3877
3878	return EXT_CONTINUE;
3879}
3880
/* fiemap flags we can handle are specified here */
3882#define EXT4_FIEMAP_FLAGS	(FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
3883
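/*
 * ext4_xattr_fiemap:
 * Report the location of the inode's extended attribute data for
 * FIEMAP_FLAG_XATTR requests: either the unused tail of the on-disk inode
 * (EXT4_STATE_XATTR) or the single external block pointed to by i_file_acl.
 */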
3884static int ext4_xattr_fiemap(struct inode *inode,
3885				struct fiemap_extent_info *fieinfo)
3886{
3887	__u64 physical = 0;
3888	__u64 length;
3889	__u32 flags = FIEMAP_EXTENT_LAST;
3890	int blockbits = inode->i_sb->s_blocksize_bits;
3891	int error = 0;
3892
3893	/* in-inode? */
3894	if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
3895		struct ext4_iloc iloc;
3896		int offset;	/* offset of xattr in inode */
3897
3898		error = ext4_get_inode_loc(inode, &iloc);
3899		if (error)
3900			return error;
3901		physical = iloc.bh->b_blocknr << blockbits;
3902		offset = EXT4_GOOD_OLD_INODE_SIZE +
3903				EXT4_I(inode)->i_extra_isize;
3904		physical += offset;
3905		length = EXT4_SB(inode->i_sb)->s_inode_size - offset;
3906		flags |= FIEMAP_EXTENT_DATA_INLINE;
3907		brelse(iloc.bh);
3908	} else { /* external block */
3909		physical = EXT4_I(inode)->i_file_acl << blockbits;
3910		length = inode->i_sb->s_blocksize;
3911	}
3912
3913	if (physical)
3914		error = fiemap_fill_next_extent(fieinfo, 0, physical,
3915						length, flags);
3916	return (error < 0 ? error : 0);
3917}
3918
3919int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3920		__u64 start, __u64 len)
3921{
3922	ext4_lblk_t start_blk;
3923	int error = 0;
3924
3925	/* fallback to generic here if not in extents fmt */
3926	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
3927		return generic_block_fiemap(inode, fieinfo, start, len,
3928			ext4_get_block);
3929
3930	if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS))
3931		return -EBADR;
3932
3933	if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
3934		error = ext4_xattr_fiemap(inode, fieinfo);
3935	} else {
3936		ext4_lblk_t len_blks;
3937		__u64 last_blk;
3938
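		/*
		 * Convert the byte range to logical blocks and clamp it to
		 * the largest logical block an extent can address.
		 */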
3939		start_blk = start >> inode->i_sb->s_blocksize_bits;
3940		last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits;
3941		if (last_blk >= EXT_MAX_BLOCK)
3942			last_blk = EXT_MAX_BLOCK-1;
3943		len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1;
3944
3945		/*
3946		 * Walk the extent tree gathering extent information.
3947		 * ext4_ext_fiemap_cb will push extents back to user.
3948		 */
3949		error = ext4_ext_walk_space(inode, start_blk, len_blks,
3950					  ext4_ext_fiemap_cb, fieinfo);
3951	}
3952
3953	return error;
3954}
3955