1/*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2010 Zheng Liu <lz@freebsd.org>
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28
29#include <sys/param.h>
30#include <sys/systm.h>
31#include <sys/types.h>
32#include <sys/kernel.h>
33#include <sys/malloc.h>
34#include <sys/vnode.h>
35#include <sys/bio.h>
36#include <sys/buf.h>
37#include <sys/endian.h>
38#include <sys/conf.h>
39#include <sys/sdt.h>
40#include <sys/stat.h>
41
42#include <fs/ext2fs/ext2_mount.h>
43#include <fs/ext2fs/fs.h>
44#include <fs/ext2fs/inode.h>
45#include <fs/ext2fs/ext2fs.h>
46#include <fs/ext2fs/ext2_extents.h>
47#include <fs/ext2fs/ext2_extern.h>
48
49SDT_PROVIDER_DECLARE(ext2fs);
50/*
51 * ext2fs trace probe:
52 * arg0: verbosity. Higher numbers give more verbose messages
53 * arg1: Textual message
54 */
55SDT_PROBE_DEFINE2(ext2fs, , trace, extents, "int", "char*");
56
57static MALLOC_DEFINE(M_EXT2EXTENTS, "ext2_extents", "EXT2 extents");
58
59#ifdef EXT2FS_PRINT_EXTENTS
60static const bool print_extents_walk = true;
61
62static int ext4_ext_check_header(struct inode *, struct ext4_extent_header *,
63    int);
64static int ext4_ext_walk_header(struct inode *, struct ext4_extent_header *,
65    int);
66static inline e4fs_daddr_t ext4_ext_index_pblock(struct ext4_extent_index *);
67static inline e4fs_daddr_t ext4_ext_extent_pblock(struct ext4_extent *);
68
69static int
70ext4_ext_blk_check(struct inode *ip, e4fs_daddr_t blk)
71{
72	struct m_ext2fs *fs;
73
74	fs = ip->i_e2fs;
75
76	if (blk < fs->e2fs->e2fs_first_dblock || blk >= fs->e2fs_bcount)
77		return (EIO);
78
79	return (0);
80}
81
82static int
83ext4_ext_walk_index(struct inode *ip, struct ext4_extent_index *ex, int depth,
84    bool do_walk)
85{
86	struct m_ext2fs *fs;
87	struct buf *bp;
88	e4fs_daddr_t blk;
89	int error;
90
91	fs = ip->i_e2fs;
92
93	if (print_extents_walk)
94		printf("    index %p => (blk %u pblk %ju)\n", ex,
95		    le32toh(ex->ei_blk),
96		    (uint64_t)le16toh(ex->ei_leaf_hi) << 32 |
97		    le32toh(ex->ei_leaf_lo));
98
99	if(!do_walk)
100		return (0);
101
102	blk = ext4_ext_index_pblock(ex);
103	error = ext4_ext_blk_check(ip, blk);
104	if (error)
105		return (error);
106
107	if ((error = bread(ip->i_devvp,
108	    fsbtodb(fs, blk), (int)fs->e2fs_bsize, NOCRED, &bp)) != 0) {
109		brelse(bp);
110		return (error);
111	}
112
113	error = ext4_ext_walk_header(ip,
114	    (struct ext4_extent_header *)bp->b_data, depth);
115
116	brelse(bp);
117
118	return (error);
119}
120
121static int
122ext4_ext_walk_extent(struct inode *ip, struct ext4_extent *ep)
123{
124	e4fs_daddr_t blk;
125	int error;
126
127	blk = ext4_ext_extent_pblock(ep);
128	error = ext4_ext_blk_check(ip, blk);
129	if (error)
130		return (error);
131
132	if (print_extents_walk)
133		printf("    ext %p => (blk %u len %u start %ju)\n",
134		    ep, le32toh(ep->e_blk), le16toh(ep->e_len),
135		    (uint64_t)blk);
136
137	return (0);
138}
139
140static int
141ext4_ext_walk_header(struct inode *ip, struct ext4_extent_header *eh, int depth)
142{
143	int i, error = 0;
144
145	error = ext4_ext_check_header(ip, eh, depth);
146	if (error)
147		return (error);
148
149	if (print_extents_walk)
150		printf("header %p => (entries %d max %d depth %d gen %d)\n",
151		    eh, le16toh(eh->eh_ecount),
152		    le16toh(eh->eh_max), le16toh(eh->eh_depth),
153		    le32toh(eh->eh_gen));
154
155	for (i = 0; i < le16toh(eh->eh_ecount) && error == 0; i++)
156		if (eh->eh_depth != 0)
157			error = ext4_ext_walk_index(ip,
158			    (struct ext4_extent_index *)(eh + 1 + i), depth - 1,
159			    true);
160		else
161			error = ext4_ext_walk_extent(ip,
162			    (struct ext4_extent *)(eh + 1 + i));
163
164	return (error);
165}
166
167int
168ext4_ext_walk(struct inode *ip)
169{
170	struct ext4_extent_header *ehp;
171
172	ehp = (struct ext4_extent_header *)ip->i_db;
173
174	if (print_extents_walk)
175		printf("Extent status:ip=%ju\n", ip->i_number);
176
177	if (!(ip->i_flag & IN_E4EXTENTS))
178		return (0);
179
180	return (ext4_ext_walk_header(ip, ehp, 0));
181}
182
183static int
184ext4_ext_print_path(struct inode *ip, struct ext4_extent_path *path)
185{
186	int k, depth, error = 0;
187
188	depth = path->ep_depth;
189
190	if (print_extents_walk)
191		printf("ip=%ju, Path:\n", ip->i_number);
192
193	for (k = 0; k <= depth && error == 0; k++, path++) {
194		if (path->ep_index) {
195			error = ext4_ext_walk_index(ip, path->ep_index,
196			    depth - 1, false);
197		} else if (path->ep_ext) {
198			error = ext4_ext_walk_extent(ip, path->ep_ext);
199		}
200	}
201
202	return (error);
203}
204#endif
205
206static inline struct ext4_extent_header *
207ext4_ext_inode_header(struct inode *ip)
208{
209
210	return ((struct ext4_extent_header *)ip->i_db);
211}
212
213static inline struct ext4_extent_header *
214ext4_ext_block_header(char *bdata)
215{
216
217	return ((struct ext4_extent_header *)bdata);
218}
219
220static inline unsigned short
221ext4_ext_inode_depth(struct inode *ip)
222{
223	struct ext4_extent_header *ehp;
224
225	ehp = (struct ext4_extent_header *)ip->i_data;
226	return (le16toh(ehp->eh_depth));
227}
228
229static inline e4fs_daddr_t
230ext4_ext_index_pblock(struct ext4_extent_index *index)
231{
232	e4fs_daddr_t blk;
233
234	blk = le32toh(index->ei_leaf_lo);
235	blk |= (e4fs_daddr_t)le16toh(index->ei_leaf_hi) << 32;
236
237	return (blk);
238}
239
240static inline void
241ext4_index_store_pblock(struct ext4_extent_index *index, e4fs_daddr_t pb)
242{
243
244	index->ei_leaf_lo = htole32(pb & 0xffffffff);
245	index->ei_leaf_hi = htole16((pb >> 32) & 0xffff);
246}
247
248static inline e4fs_daddr_t
249ext4_ext_extent_pblock(struct ext4_extent *extent)
250{
251	e4fs_daddr_t blk;
252
253	blk = le32toh(extent->e_start_lo);
254	blk |= (e4fs_daddr_t)le16toh(extent->e_start_hi) << 32;
255
256	return (blk);
257}
258
259static inline void
260ext4_ext_store_pblock(struct ext4_extent *ex, e4fs_daddr_t pb)
261{
262
263	ex->e_start_lo = htole32(pb & 0xffffffff);
264	ex->e_start_hi = htole16((pb >> 32) & 0xffff);
265}
266
267int
268ext4_ext_in_cache(struct inode *ip, daddr_t lbn, struct ext4_extent *ep)
269{
270	struct ext4_extent_cache *ecp;
271	int ret = EXT4_EXT_CACHE_NO;
272
273	ecp = &ip->i_ext_cache;
274	if (ecp->ec_type == EXT4_EXT_CACHE_NO)
275		return (ret);
276
277	if (lbn >= ecp->ec_blk && lbn < ecp->ec_blk + ecp->ec_len) {
278		ep->e_blk = htole32(ecp->ec_blk);
279		ep->e_start_lo = htole32(ecp->ec_start & 0xffffffff);
280		ep->e_start_hi = htole16(ecp->ec_start >> 32 & 0xffff);
281		ep->e_len = htole16(ecp->ec_len);
282		ret = ecp->ec_type;
283	}
284	return (ret);
285}
286
287static inline int
288ext4_ext_space_root(struct inode *ip)
289{
290	int size;
291
292	size = sizeof(ip->i_data);
293	size -= sizeof(struct ext4_extent_header);
294	size /= sizeof(struct ext4_extent);
295
296	return (size);
297}
298
299static inline int
300ext4_ext_space_block(struct inode *ip)
301{
302	struct m_ext2fs *fs;
303	int size;
304
305	fs = ip->i_e2fs;
306
307	size = (fs->e2fs_bsize - sizeof(struct ext4_extent_header)) /
308	    sizeof(struct ext4_extent);
309
310	return (size);
311}
312
313static inline int
314ext4_ext_space_root_idx(struct inode *ip)
315{
316	int size;
317
318	size = sizeof(ip->i_data);
319	size -= sizeof(struct ext4_extent_header);
320	size /= sizeof(struct ext4_extent_index);
321
322	return (size);
323}
324
325static inline int
326ext4_ext_space_block_idx(struct inode *ip)
327{
328	struct m_ext2fs *fs;
329	int size;
330
331	fs = ip->i_e2fs;
332
333	size = (fs->e2fs_bsize - sizeof(struct ext4_extent_header)) /
334	    sizeof(struct ext4_extent_index);
335
336	return (size);
337}
338
339static int
340ext4_ext_max_entries(struct inode *ip, int depth)
341{
342
343	if (depth == ext4_ext_inode_depth(ip)) {
344		if (depth == 0)
345			return (ext4_ext_space_root(ip));
346		else
347			return (ext4_ext_space_root_idx(ip));
348	} else {
349		if (depth == 0)
350			return (ext4_ext_space_block(ip));
351		else
352			return (ext4_ext_space_block_idx(ip));
353	}
354}
355
356static inline uint16_t
357ext4_ext_get_actual_len(struct ext4_extent *ext)
358{
359
360	return (le16toh(ext->e_len) <= EXT_INIT_MAX_LEN ?
361	    le16toh(ext->e_len) : (le16toh(ext->e_len) - EXT_INIT_MAX_LEN));
362}
363
364
365static int
366ext4_inode_block_validate(struct inode *ip, e4fs_daddr_t start_blk,
367    unsigned int count)
368{
369	struct m_ext2fs *fs;
370
371	fs = ip->i_e2fs;
372
373	if ((start_blk <= le32toh(fs->e2fs->e2fs_first_dblock)) ||
374	    (start_blk + count < start_blk) ||
375	    (start_blk + count > fs->e2fs_bcount))
376		return (EIO);
377
378	return (0);
379}
380
381static int
382ext4_validate_extent(struct inode *ip, struct ext4_extent *ext)
383{
384	e4fs_daddr_t blk = ext4_ext_extent_pblock(ext);
385	uint32_t lblk = le32toh(ext->e_blk);
386	int len = ext4_ext_get_actual_len(ext);
387
388	if (lblk + len <= lblk)
389		return (EIO);
390
391	return (ext4_inode_block_validate(ip, blk, len));
392}
393
394static int
395ext4_validate_extent_idx(struct inode *ip, struct ext4_extent_index *ext_idx)
396{
397	e4fs_daddr_t blk = ext4_ext_index_pblock(ext_idx);
398
399	return (ext4_inode_block_validate(ip, blk, 1));
400}
401
402static int
403ext4_validate_extent_entries(struct inode *ip, struct ext4_extent_header *eh,
404    int depth)
405{
406	unsigned int count;
407
408	count = le16toh(eh->eh_ecount);
409	if (count == 0)
410		return (0);
411
412	if (depth == 0) {
413		struct ext4_extent *ext = EXT_FIRST_EXTENT(eh);
414		uint32_t lblk = 0;
415		uint32_t prev = 0;
416		int len = 0;
417		while (count) {
418			/* leaf entries */
419			if (ext4_validate_extent(ip, ext))
420				return (EIO);
421
422			/* Check for overlapping extents */
423			lblk = le32toh(ext->e_blk);
424			len = ext4_ext_get_actual_len(ext);
425			if ((lblk <= prev) && prev)
426				return (EIO);
427
428			ext++;
429			count--;
430			prev = lblk + len - 1;
431		}
432	} else {
433		struct ext4_extent_index *ext_idx = EXT_FIRST_INDEX(eh);
434		while (count) {
435			if (ext4_validate_extent_idx(ip, ext_idx))
436				return (EIO);
437
438			ext_idx++;
439			count--;
440		}
441	}
442
443	return (0);
444}
445
446static int
447ext4_ext_check_header(struct inode *ip, struct ext4_extent_header *eh,
448    int depth)
449{
450#ifdef KDTRACE_HOOKS
451	char *error_msg;
452#else
453	char *error_msg __unused;
454#endif
455
456	if (le16toh(eh->eh_magic) != EXT4_EXT_MAGIC) {
457		error_msg = "header: invalid magic";
458		goto corrupted;
459	}
460	if (le16toh(eh->eh_depth) != depth ||
461	    le16toh(eh->eh_depth) > EXT4_EXT_DEPTH_MAX)
462	{
463		error_msg = "header: invalid eh_depth";
464		goto corrupted;
465	}
466	if (eh->eh_max == 0) {
467		error_msg = "header: invalid eh_max";
468		goto corrupted;
469	}
470	if (le16toh(eh->eh_max) > ext4_ext_max_entries(ip, depth)) {
471		error_msg = "header: too large eh_max";
472		goto corrupted;
473	}
474	if (le16toh(eh->eh_ecount) > le16toh(eh->eh_max)) {
475		error_msg = "header: invalid eh_entries";
476		goto corrupted;
477	}
478	if (le16toh(eh->eh_depth) > EXT4_EXT_DEPTH_MAX) {
479		error_msg = "header: invalid eh_depth";
480		goto corrupted;
481	}
482	if (ext4_validate_extent_entries(ip, eh, depth)) {
483		error_msg = "header: invalid extent entries";
484		goto corrupted;
485	}
486
487	return (0);
488
489corrupted:
490	SDT_PROBE2(ext2fs, , trace, extents, 1, error_msg);
491	return (EIO);
492}
493
494static void
495ext4_ext_binsearch_index(struct ext4_extent_path *path, int blk)
496{
497	struct ext4_extent_header *eh;
498	struct ext4_extent_index *r, *l, *m;
499
500	eh = path->ep_header;
501
502	KASSERT(le16toh(eh->eh_ecount) <= le16toh(eh->eh_max) &&
503	    le16toh(eh->eh_ecount) > 0,
504	    ("ext4_ext_binsearch_index: bad args"));
505
506	l = EXT_FIRST_INDEX(eh) + 1;
507	r = EXT_FIRST_INDEX(eh) + le16toh(eh->eh_ecount) - 1;
508	while (l <= r) {
509		m = l + (r - l) / 2;
510		if (blk < le32toh(m->ei_blk))
511			r = m - 1;
512		else
513			l = m + 1;
514	}
515
516	path->ep_index = l - 1;
517}
518
519static void
520ext4_ext_binsearch_ext(struct ext4_extent_path *path, int blk)
521{
522	struct ext4_extent_header *eh;
523	struct ext4_extent *r, *l, *m;
524
525	eh = path->ep_header;
526
527	KASSERT(le16toh(eh->eh_ecount) <= le16toh(eh->eh_max),
528	    ("ext4_ext_binsearch_ext: bad args"));
529
530	if (eh->eh_ecount == 0)
531		return;
532
533	l = EXT_FIRST_EXTENT(eh) + 1;
534	r = EXT_FIRST_EXTENT(eh) + le16toh(eh->eh_ecount) - 1;
535
536	while (l <= r) {
537		m = l + (r - l) / 2;
538		if (blk < le32toh(m->e_blk))
539			r = m - 1;
540		else
541			l = m + 1;
542	}
543
544	path->ep_ext = l - 1;
545}
546
547static int
548ext4_ext_fill_path_bdata(struct ext4_extent_path *path,
549    struct buf *bp, uint64_t blk)
550{
551
552	KASSERT(path->ep_data == NULL,
553	    ("ext4_ext_fill_path_bdata: bad ep_data"));
554
555	path->ep_data = malloc(bp->b_bufsize, M_EXT2EXTENTS, M_WAITOK);
556	memcpy(path->ep_data, bp->b_data, bp->b_bufsize);
557	path->ep_blk = blk;
558
559	return (0);
560}
561
562static void
563ext4_ext_fill_path_buf(struct ext4_extent_path *path, struct buf *bp)
564{
565
566	KASSERT(path->ep_data != NULL,
567	    ("ext4_ext_fill_path_buf: bad ep_data"));
568
569	memcpy(bp->b_data, path->ep_data, bp->b_bufsize);
570}
571
572static void
573ext4_ext_drop_refs(struct ext4_extent_path *path)
574{
575	int depth, i;
576
577	if (!path)
578		return;
579
580	depth = path->ep_depth;
581	for (i = 0; i <= depth; i++, path++)
582		if (path->ep_data) {
583			free(path->ep_data, M_EXT2EXTENTS);
584			path->ep_data = NULL;
585		}
586}
587
588void
589ext4_ext_path_free(struct ext4_extent_path *path)
590{
591
592	if (!path)
593		return;
594
595	ext4_ext_drop_refs(path);
596	free(path, M_EXT2EXTENTS);
597}
598
599int
600ext4_ext_find_extent(struct inode *ip, daddr_t block,
601    struct ext4_extent_path **ppath)
602{
603	struct ext4_extent_header *eh;
604	struct ext4_extent_path *path;
605	struct buf *bp;
606	uint64_t blk;
607	int error, depth, i, ppos, alloc;
608
609	eh = ext4_ext_inode_header(ip);
610	depth = ext4_ext_inode_depth(ip);
611	ppos = 0;
612	alloc = 0;
613
614	error = ext4_ext_check_header(ip, eh, depth);
615	if (error)
616		return (error);
617
618	if (ppath == NULL)
619		return (EINVAL);
620
621	path = *ppath;
622	if (path == NULL) {
623		path = malloc(EXT4_EXT_DEPTH_MAX *
624		    sizeof(struct ext4_extent_path),
625		    M_EXT2EXTENTS, M_WAITOK | M_ZERO);
626		*ppath = path;
627		alloc = 1;
628	}
629
630	path[0].ep_header = eh;
631	path[0].ep_data = NULL;
632
633	/* Walk through the tree. */
634	i = depth;
635	while (i) {
636		ext4_ext_binsearch_index(&path[ppos], block);
637		blk = ext4_ext_index_pblock(path[ppos].ep_index);
638		path[ppos].ep_depth = i;
639		path[ppos].ep_ext = NULL;
640
641		error = bread(ip->i_devvp, fsbtodb(ip->i_e2fs, blk),
642		    ip->i_e2fs->e2fs_bsize, NOCRED, &bp);
643		if (error) {
644			goto error;
645		}
646
647		ppos++;
648		if (ppos > depth) {
649			SDT_PROBE2(ext2fs, , trace, extents, 1,
650			    "ppos > depth => extent corrupted");
651			error = EIO;
652			brelse(bp);
653			goto error;
654		}
655
656		ext4_ext_fill_path_bdata(&path[ppos], bp, blk);
657		bqrelse(bp);
658
659		eh = ext4_ext_block_header(path[ppos].ep_data);
660		if (ext4_ext_check_header(ip, eh, i - 1) ||
661		    ext2_extent_blk_csum_verify(ip, path[ppos].ep_data)) {
662			error = EIO;
663			goto error;
664		}
665
666		path[ppos].ep_header = eh;
667
668		i--;
669	}
670
671	error = ext4_ext_check_header(ip, eh, 0);
672	if (error)
673		goto error;
674
675	/* Find extent. */
676	path[ppos].ep_depth = i;
677	path[ppos].ep_header = eh;
678	path[ppos].ep_ext = NULL;
679	path[ppos].ep_index = NULL;
680	ext4_ext_binsearch_ext(&path[ppos], block);
681	return (0);
682
683error:
684	ext4_ext_drop_refs(path);
685	if (alloc)
686		free(path, M_EXT2EXTENTS);
687
688	*ppath = NULL;
689
690	return (error);
691}
692
693static inline int
694ext4_ext_space_block_index(struct inode *ip)
695{
696	struct m_ext2fs *fs;
697	int size;
698
699	fs = ip->i_e2fs;
700
701	size = (fs->e2fs_bsize - sizeof(struct ext4_extent_header)) /
702	    sizeof(struct ext4_extent_index);
703
704	return (size);
705}
706
707void
708ext4_ext_tree_init(struct inode *ip)
709{
710	struct ext4_extent_header *ehp;
711
712	ip->i_flag |= IN_E4EXTENTS;
713
714	memset(ip->i_data, 0, EXT2_NDADDR + EXT2_NIADDR);
715	ehp = (struct ext4_extent_header *)ip->i_data;
716	ehp->eh_magic = htole16(EXT4_EXT_MAGIC);
717	ehp->eh_max = htole16(ext4_ext_space_root(ip));
718	ip->i_ext_cache.ec_type = EXT4_EXT_CACHE_NO;
719	ip->i_flag |= IN_CHANGE | IN_UPDATE;
720	ext2_update(ip->i_vnode, 1);
721}
722
723static inline void
724ext4_ext_put_in_cache(struct inode *ip, uint32_t blk,
725			uint32_t len, uint32_t start, int type)
726{
727
728	KASSERT(len != 0, ("ext4_ext_put_in_cache: bad input"));
729
730	ip->i_ext_cache.ec_type = type;
731	ip->i_ext_cache.ec_blk = blk;
732	ip->i_ext_cache.ec_len = len;
733	ip->i_ext_cache.ec_start = start;
734}
735
736static e4fs_daddr_t
737ext4_ext_blkpref(struct inode *ip, struct ext4_extent_path *path,
738    e4fs_daddr_t block)
739{
740	struct m_ext2fs *fs;
741	struct ext4_extent *ex;
742	e4fs_daddr_t bg_start;
743	int depth;
744
745	fs = ip->i_e2fs;
746
747	if (path) {
748		depth = path->ep_depth;
749		ex = path[depth].ep_ext;
750		if (ex) {
751			e4fs_daddr_t pblk = ext4_ext_extent_pblock(ex);
752			e2fs_daddr_t blk = le32toh(ex->e_blk);
753
754			if (block > blk)
755				return (pblk + (block - blk));
756			else
757				return (pblk - (blk - block));
758		}
759
760		/* Try to get block from index itself. */
761		if (path[depth].ep_data)
762			return (path[depth].ep_blk);
763	}
764
765	/* Use inode's group. */
766	bg_start = (ip->i_block_group * EXT2_BLOCKS_PER_GROUP(ip->i_e2fs)) +
767	    le32toh(fs->e2fs->e2fs_first_dblock);
768
769	return (bg_start + block);
770}
771
772static int inline
773ext4_can_extents_be_merged(struct ext4_extent *ex1,
774    struct ext4_extent *ex2)
775{
776
777	if (le32toh(ex1->e_blk) + le16toh(ex1->e_len) != le32toh(ex2->e_blk))
778		return (0);
779
780	if (le16toh(ex1->e_len) + le16toh(ex2->e_len) > EXT4_MAX_LEN)
781		return (0);
782
783	if (ext4_ext_extent_pblock(ex1) + le16toh(ex1->e_len) ==
784	    ext4_ext_extent_pblock(ex2))
785		return (1);
786
787	return (0);
788}
789
790static unsigned
791ext4_ext_next_leaf_block(struct inode *ip, struct ext4_extent_path *path)
792{
793	int depth = path->ep_depth;
794
795	/* Empty tree */
796	if (depth == 0)
797		return (EXT4_MAX_BLOCKS);
798
799	/* Go to indexes. */
800	depth--;
801
802	while (depth >= 0) {
803		if (path[depth].ep_index !=
804		    EXT_LAST_INDEX(path[depth].ep_header))
805			return (le32toh(path[depth].ep_index[1].ei_blk));
806
807		depth--;
808	}
809
810	return (EXT4_MAX_BLOCKS);
811}
812
813static int
814ext4_ext_dirty(struct inode *ip, struct ext4_extent_path *path)
815{
816	struct m_ext2fs *fs;
817	struct buf *bp;
818	uint64_t blk;
819	int error;
820
821	fs = ip->i_e2fs;
822
823	if (!path)
824		return (EINVAL);
825
826	if (path->ep_data) {
827		blk = path->ep_blk;
828		bp = getblk(ip->i_devvp, fsbtodb(fs, blk),
829		    fs->e2fs_bsize, 0, 0, 0);
830		if (!bp)
831			return (EIO);
832		ext4_ext_fill_path_buf(path, bp);
833		ext2_extent_blk_csum_set(ip, bp->b_data);
834		error = bwrite(bp);
835	} else {
836		ip->i_flag |= IN_CHANGE | IN_UPDATE;
837		error = ext2_update(ip->i_vnode, 1);
838	}
839
840	return (error);
841}
842
843static int
844ext4_ext_insert_index(struct inode *ip, struct ext4_extent_path *path,
845    uint32_t lblk, e4fs_daddr_t blk)
846{
847	struct ext4_extent_index *idx;
848	int len;
849
850	if (lblk == le32toh(path->ep_index->ei_blk)) {
851		SDT_PROBE2(ext2fs, , trace, extents, 1,
852		    "lblk == index blk => extent corrupted");
853		return (EIO);
854	}
855
856	if (le16toh(path->ep_header->eh_ecount) >=
857	    le16toh(path->ep_header->eh_max)) {
858		SDT_PROBE2(ext2fs, , trace, extents, 1,
859		    "ecout > maxcount => extent corrupted");
860		return (EIO);
861	}
862
863	if (lblk > le32toh(path->ep_index->ei_blk)) {
864		/* Insert after. */
865		idx = path->ep_index + 1;
866	} else {
867		/* Insert before. */
868		idx = path->ep_index;
869	}
870
871	len = EXT_LAST_INDEX(path->ep_header) - idx + 1;
872	if (len > 0)
873		memmove(idx + 1, idx, len * sizeof(struct ext4_extent_index));
874
875	if (idx > EXT_MAX_INDEX(path->ep_header)) {
876		SDT_PROBE2(ext2fs, , trace, extents, 1,
877		    "index is out of range => extent corrupted");
878		return (EIO);
879	}
880
881	idx->ei_blk = htole32(lblk);
882	ext4_index_store_pblock(idx, blk);
883	path->ep_header->eh_ecount =
884	    htole16(le16toh(path->ep_header->eh_ecount) + 1);
885
886	return (ext4_ext_dirty(ip, path));
887}
888
889static e4fs_daddr_t
890ext4_ext_alloc_meta(struct inode *ip)
891{
892	e4fs_daddr_t blk = ext2_alloc_meta(ip);
893	if (blk) {
894		ip->i_blocks += btodb(ip->i_e2fs->e2fs_bsize);
895		ip->i_flag |= IN_CHANGE | IN_UPDATE;
896		ext2_update(ip->i_vnode, 1);
897	}
898
899	return (blk);
900}
901
902static void
903ext4_ext_blkfree(struct inode *ip, uint64_t blk, int count, int flags)
904{
905	struct m_ext2fs *fs;
906	int i, blocksreleased;
907
908	fs = ip->i_e2fs;
909	blocksreleased = count;
910
911	for(i = 0; i < count; i++)
912		ext2_blkfree(ip, blk + i, fs->e2fs_bsize);
913
914	if (ip->i_blocks >= blocksreleased)
915		ip->i_blocks -= (btodb(fs->e2fs_bsize)*blocksreleased);
916	else
917		ip->i_blocks = 0;
918
919	ip->i_flag |= IN_CHANGE | IN_UPDATE;
920	ext2_update(ip->i_vnode, 1);
921}
922
923static int
924ext4_ext_split(struct inode *ip, struct ext4_extent_path *path,
925    struct ext4_extent *newext, int at)
926{
927	struct m_ext2fs *fs;
928	struct  buf *bp;
929	int depth = ext4_ext_inode_depth(ip);
930	struct ext4_extent_header *neh;
931	struct ext4_extent_index *fidx;
932	struct ext4_extent *ex;
933	int i = at, k, m, a;
934	e4fs_daddr_t newblk, oldblk;
935	uint32_t border;
936	e4fs_daddr_t *ablks = NULL;
937	int error = 0;
938
939	fs = ip->i_e2fs;
940	bp = NULL;
941
942	/*
943	 * We will split at current extent for now.
944	 */
945	if (path[depth].ep_ext > EXT_MAX_EXTENT(path[depth].ep_header)) {
946		SDT_PROBE2(ext2fs, , trace, extents, 1,
947		    "extent is out of range => extent corrupted");
948		return (EIO);
949	}
950
951	if (path[depth].ep_ext != EXT_MAX_EXTENT(path[depth].ep_header))
952		border = le32toh(path[depth].ep_ext[1].e_blk);
953	else
954		border = le32toh(newext->e_blk);
955
956	/* Allocate new blocks. */
957	ablks = malloc(sizeof(e4fs_daddr_t) * depth,
958	    M_EXT2EXTENTS, M_WAITOK | M_ZERO);
959	for (a = 0; a < depth - at; a++) {
960		newblk = ext4_ext_alloc_meta(ip);
961		if (newblk == 0)
962			goto cleanup;
963		ablks[a] = newblk;
964	}
965
966	newblk = ablks[--a];
967	bp = getblk(ip->i_devvp, fsbtodb(fs, newblk), fs->e2fs_bsize, 0, 0, 0);
968	if (!bp) {
969		error = EIO;
970		goto cleanup;
971	}
972
973	neh = ext4_ext_block_header(bp->b_data);
974	neh->eh_ecount = 0;
975	neh->eh_max = le16toh(ext4_ext_space_block(ip));
976	neh->eh_magic = le16toh(EXT4_EXT_MAGIC);
977	neh->eh_depth = 0;
978	ex = EXT_FIRST_EXTENT(neh);
979
980	if (le16toh(path[depth].ep_header->eh_ecount) !=
981	    le16toh(path[depth].ep_header->eh_max)) {
982		SDT_PROBE2(ext2fs, , trace, extents, 1,
983		    "extents count out of range => extent corrupted");
984		error = EIO;
985		goto cleanup;
986	}
987
988	/* Start copy from next extent. */
989	m = 0;
990	path[depth].ep_ext++;
991	while (path[depth].ep_ext <= EXT_MAX_EXTENT(path[depth].ep_header)) {
992		path[depth].ep_ext++;
993		m++;
994	}
995	if (m) {
996		memmove(ex, path[depth].ep_ext - m,
997		    sizeof(struct ext4_extent) * m);
998		neh->eh_ecount = htole16(le16toh(neh->eh_ecount) + m);
999	}
1000
1001	ext2_extent_blk_csum_set(ip, bp->b_data);
1002	bwrite(bp);
1003	bp = NULL;
1004
1005	/* Fix old leaf. */
1006	if (m) {
1007		path[depth].ep_header->eh_ecount =
1008		    htole16(le16toh(path[depth].ep_header->eh_ecount) - m);
1009		ext4_ext_dirty(ip, path + depth);
1010	}
1011
1012	/* Create intermediate indexes. */
1013	k = depth - at - 1;
1014	KASSERT(k >= 0, ("ext4_ext_split: negative k"));
1015
1016	/* Insert new index into current index block. */
1017	i = depth - 1;
1018	while (k--) {
1019		oldblk = newblk;
1020		newblk = ablks[--a];
1021		error = bread(ip->i_devvp, fsbtodb(fs, newblk),
1022		    (int)fs->e2fs_bsize, NOCRED, &bp);
1023		if (error) {
1024			goto cleanup;
1025		}
1026
1027		neh = (struct ext4_extent_header *)bp->b_data;
1028		neh->eh_ecount = htole16(1);
1029		neh->eh_magic = htole16(EXT4_EXT_MAGIC);
1030		neh->eh_max = htole16(ext4_ext_space_block_index(ip));
1031		neh->eh_depth = htole16(depth - i);
1032		fidx = EXT_FIRST_INDEX(neh);
1033		fidx->ei_blk = htole32(border);
1034		ext4_index_store_pblock(fidx, oldblk);
1035
1036		m = 0;
1037		path[i].ep_index++;
1038		while (path[i].ep_index <= EXT_MAX_INDEX(path[i].ep_header)) {
1039			path[i].ep_index++;
1040			m++;
1041		}
1042		if (m) {
1043			memmove(++fidx, path[i].ep_index - m,
1044			    sizeof(struct ext4_extent_index) * m);
1045			neh->eh_ecount = htole16(le16toh(neh->eh_ecount) + m);
1046		}
1047
1048		ext2_extent_blk_csum_set(ip, bp->b_data);
1049		bwrite(bp);
1050		bp = NULL;
1051
1052		/* Fix old index. */
1053		if (m) {
1054			path[i].ep_header->eh_ecount =
1055			    htole16(le16toh(path[i].ep_header->eh_ecount) - m);
1056			ext4_ext_dirty(ip, path + i);
1057		}
1058
1059		i--;
1060	}
1061
1062	error = ext4_ext_insert_index(ip, path + at, border, newblk);
1063
1064cleanup:
1065	if (bp)
1066		brelse(bp);
1067
1068	if (error) {
1069		for (i = 0; i < depth; i++) {
1070			if (!ablks[i])
1071				continue;
1072			ext4_ext_blkfree(ip, ablks[i], 1, 0);
1073		}
1074	}
1075
1076	free(ablks, M_EXT2EXTENTS);
1077
1078	return (error);
1079}
1080
1081static int
1082ext4_ext_grow_indepth(struct inode *ip, struct ext4_extent_path *path,
1083    struct ext4_extent *newext)
1084{
1085	struct m_ext2fs *fs;
1086	struct ext4_extent_path *curpath;
1087	struct ext4_extent_header *neh;
1088	struct buf *bp;
1089	e4fs_daddr_t newblk;
1090	int error = 0;
1091
1092	fs = ip->i_e2fs;
1093	curpath = path;
1094
1095	newblk = ext4_ext_alloc_meta(ip);
1096	if (newblk == 0)
1097		return (error);
1098
1099	bp = getblk(ip->i_devvp, fsbtodb(fs, newblk), fs->e2fs_bsize, 0, 0, 0);
1100	if (!bp) {
1101		ext4_ext_blkfree(ip, newblk, 1, 0);
1102		return (EIO);
1103	}
1104
1105	/* Move top-level index/leaf into new block. */
1106	memmove(bp->b_data, curpath->ep_header, sizeof(ip->i_data));
1107
1108	/* Set size of new block */
1109	neh = ext4_ext_block_header(bp->b_data);
1110	neh->eh_magic = htole16(EXT4_EXT_MAGIC);
1111
1112	if (ext4_ext_inode_depth(ip))
1113		neh->eh_max = htole16(ext4_ext_space_block_index(ip));
1114	else
1115		neh->eh_max = htole16(ext4_ext_space_block(ip));
1116
1117	ext2_extent_blk_csum_set(ip, bp->b_data);
1118	error = bwrite(bp);
1119	if (error) {
1120		ext4_ext_blkfree(ip, newblk, 1, 0);
1121		goto out;
1122	}
1123
1124	bp = NULL;
1125
1126	curpath->ep_header->eh_magic = htole16(EXT4_EXT_MAGIC);
1127	curpath->ep_header->eh_max = htole16(ext4_ext_space_root(ip));
1128	curpath->ep_header->eh_ecount = htole16(1);
1129	curpath->ep_index = EXT_FIRST_INDEX(curpath->ep_header);
1130	curpath->ep_index->ei_blk = EXT_FIRST_EXTENT(path[0].ep_header)->e_blk;
1131	ext4_index_store_pblock(curpath->ep_index, newblk);
1132
1133	neh = ext4_ext_inode_header(ip);
1134	neh->eh_depth = htole16(path->ep_depth + 1);
1135	ext4_ext_dirty(ip, curpath);
1136out:
1137	brelse(bp);
1138
1139	return (error);
1140}
1141
1142static int
1143ext4_ext_create_new_leaf(struct inode *ip, struct ext4_extent_path *path,
1144    struct ext4_extent *newext)
1145{
1146	struct ext4_extent_path *curpath;
1147	int depth, i, error;
1148
1149repeat:
1150	i = depth = ext4_ext_inode_depth(ip);
1151
1152	/* Look for free index entry int the tree */
1153	curpath = path + depth;
1154	while (i > 0 && !EXT_HAS_FREE_INDEX(curpath)) {
1155		i--;
1156		curpath--;
1157	}
1158
1159	/*
1160	 * We use already allocated block for index block,
1161	 * so subsequent data blocks should be contiguous.
1162	 */
1163	if (EXT_HAS_FREE_INDEX(curpath)) {
1164		error = ext4_ext_split(ip, path, newext, i);
1165		if (error)
1166			goto out;
1167
1168		/* Refill path. */
1169		ext4_ext_drop_refs(path);
1170		error = ext4_ext_find_extent(ip, le32toh(newext->e_blk), &path);
1171		if (error)
1172			goto out;
1173	} else {
1174		/* Tree is full, do grow in depth. */
1175		error = ext4_ext_grow_indepth(ip, path, newext);
1176		if (error)
1177			goto out;
1178
1179		/* Refill path. */
1180		ext4_ext_drop_refs(path);
1181		error = ext4_ext_find_extent(ip, le32toh(newext->e_blk), &path);
1182		if (error)
1183			goto out;
1184
1185		/* Check and split tree if required. */
1186		depth = ext4_ext_inode_depth(ip);
1187		if (le16toh(path[depth].ep_header->eh_ecount) ==
1188		    le16toh(path[depth].ep_header->eh_max))
1189			goto repeat;
1190	}
1191
1192out:
1193	return (error);
1194}
1195
1196static int
1197ext4_ext_correct_indexes(struct inode *ip, struct ext4_extent_path *path)
1198{
1199	struct ext4_extent_header *eh;
1200	struct ext4_extent *ex;
1201	int32_t border;
1202	int depth, k;
1203
1204	depth = ext4_ext_inode_depth(ip);
1205	eh = path[depth].ep_header;
1206	ex = path[depth].ep_ext;
1207
1208	if (ex == NULL || eh == NULL)
1209		return (EIO);
1210
1211	if (!depth)
1212		return (0);
1213
1214	/* We will correct tree if first leaf got modified only. */
1215	if (ex != EXT_FIRST_EXTENT(eh))
1216		return (0);
1217
1218	k = depth - 1;
1219	border = le32toh(path[depth].ep_ext->e_blk);
1220	path[k].ep_index->ei_blk = htole32(border);
1221	ext4_ext_dirty(ip, path + k);
1222	while (k--) {
1223		/* Change all left-side indexes. */
1224		if (path[k+1].ep_index != EXT_FIRST_INDEX(path[k+1].ep_header))
1225			break;
1226
1227		path[k].ep_index->ei_blk = htole32(border);
1228		ext4_ext_dirty(ip, path + k);
1229	}
1230
1231	return (0);
1232}
1233
1234static int
1235ext4_ext_insert_extent(struct inode *ip, struct ext4_extent_path *path,
1236    struct ext4_extent *newext)
1237{
1238	struct ext4_extent_header * eh;
1239	struct ext4_extent *ex, *nex, *nearex;
1240	struct ext4_extent_path *npath;
1241	int depth, len, error, next;
1242
1243	depth = ext4_ext_inode_depth(ip);
1244	ex = path[depth].ep_ext;
1245	npath = NULL;
1246
1247	if (htole16(newext->e_len) == 0 || path[depth].ep_header == NULL)
1248		return (EINVAL);
1249
1250	/* Insert block into found extent. */
1251	if (ex && ext4_can_extents_be_merged(ex, newext)) {
1252		ex->e_len = htole16(le16toh(ex->e_len) + le16toh(newext->e_len));
1253		eh = path[depth].ep_header;
1254		nearex = ex;
1255		goto merge;
1256	}
1257
1258repeat:
1259	depth = ext4_ext_inode_depth(ip);
1260	eh = path[depth].ep_header;
1261	if (le16toh(eh->eh_ecount) < le16toh(eh->eh_max))
1262		goto has_space;
1263
1264	/* Try next leaf */
1265	nex = EXT_LAST_EXTENT(eh);
1266	next = ext4_ext_next_leaf_block(ip, path);
1267	if (le32toh(newext->e_blk) > le32toh(nex->e_blk) && next !=
1268	    EXT4_MAX_BLOCKS) {
1269		KASSERT(npath == NULL,
1270		    ("ext4_ext_insert_extent: bad path"));
1271
1272		error = ext4_ext_find_extent(ip, next, &npath);
1273		if (error)
1274			goto cleanup;
1275
1276		if (npath->ep_depth != path->ep_depth) {
1277			error = EIO;
1278			goto cleanup;
1279		}
1280
1281		eh = npath[depth].ep_header;
1282		if (le16toh(eh->eh_ecount) < le16toh(eh->eh_max)) {
1283			path = npath;
1284			goto repeat;
1285		}
1286	}
1287
1288	/*
1289	 * There is no free space in the found leaf,
1290	 * try to add a new leaf to the tree.
1291	 */
1292	error = ext4_ext_create_new_leaf(ip, path, newext);
1293	if (error)
1294		goto cleanup;
1295
1296	depth = ext4_ext_inode_depth(ip);
1297	eh = path[depth].ep_header;
1298
1299has_space:
1300	nearex = path[depth].ep_ext;
1301	if (!nearex) {
1302		/* Create new extent in the leaf. */
1303		path[depth].ep_ext = EXT_FIRST_EXTENT(eh);
1304	} else if (le32toh(newext->e_blk) > le32toh(nearex->e_blk)) {
1305		if (nearex != EXT_LAST_EXTENT(eh)) {
1306			len = EXT_MAX_EXTENT(eh) - nearex;
1307			len = (len - 1) * sizeof(struct ext4_extent);
1308			len = len < 0 ? 0 : len;
1309			memmove(nearex + 2, nearex + 1, len);
1310		}
1311		path[depth].ep_ext = nearex + 1;
1312	} else {
1313		len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent);
1314		len = len < 0 ? 0 : len;
1315		memmove(nearex + 1, nearex, len);
1316		path[depth].ep_ext = nearex;
1317	}
1318
1319	eh->eh_ecount = htole16(le16toh(eh->eh_ecount) + 1);
1320	nearex = path[depth].ep_ext;
1321	nearex->e_blk = newext->e_blk;
1322	nearex->e_start_lo = newext->e_start_lo;
1323	nearex->e_start_hi = newext->e_start_hi;
1324	nearex->e_len = newext->e_len;
1325
1326merge:
1327	/* Try to merge extents to the right. */
1328	while (nearex < EXT_LAST_EXTENT(eh)) {
1329		if (!ext4_can_extents_be_merged(nearex, nearex + 1))
1330			break;
1331
1332		/* Merge with next extent. */
1333		nearex->e_len = htole16(le16toh(nearex->e_len) +
1334		    le16toh(nearex[1].e_len));
1335		if (nearex + 1 < EXT_LAST_EXTENT(eh)) {
1336			len = (EXT_LAST_EXTENT(eh) - nearex - 1) *
1337			    sizeof(struct ext4_extent);
1338			memmove(nearex + 1, nearex + 2, len);
1339		}
1340
1341		eh->eh_ecount = htole16(le16toh(eh->eh_ecount) - 1);
1342		KASSERT(le16toh(eh->eh_ecount) != 0,
1343		    ("ext4_ext_insert_extent: bad ecount"));
1344	}
1345
1346	/*
1347	 * Try to merge extents to the left,
1348	 * start from inexes correction.
1349	 */
1350	error = ext4_ext_correct_indexes(ip, path);
1351	if (error)
1352		goto cleanup;
1353
1354	ext4_ext_dirty(ip, path + depth);
1355
1356cleanup:
1357	if (npath) {
1358		ext4_ext_drop_refs(npath);
1359		free(npath, M_EXT2EXTENTS);
1360	}
1361
1362	ip->i_ext_cache.ec_type = EXT4_EXT_CACHE_NO;
1363	return (error);
1364}
1365
1366static e4fs_daddr_t
1367ext4_new_blocks(struct inode *ip, daddr_t lbn, e4fs_daddr_t pref,
1368    struct ucred *cred, unsigned long *count, int *perror)
1369{
1370	struct m_ext2fs *fs;
1371	e4fs_daddr_t newblk;
1372
1373	/*
1374	 * We will allocate only single block for now.
1375	 */
1376	if (*count > 1)
1377		return (0);
1378
1379	fs = ip->i_e2fs;
1380	EXT2_LOCK(ip->i_ump);
1381	*perror = ext2_alloc(ip, lbn, pref, (int)fs->e2fs_bsize, cred, &newblk);
1382	if (*perror)
1383		return (0);
1384
1385	if (newblk) {
1386		ip->i_flag |= IN_CHANGE | IN_UPDATE;
1387		ext2_update(ip->i_vnode, 1);
1388	}
1389
1390	return (newblk);
1391}
1392
1393int
1394ext4_ext_get_blocks(struct inode *ip, e4fs_daddr_t iblk,
1395    unsigned long max_blocks, struct ucred *cred, struct buf **bpp,
1396    int *pallocated, daddr_t *nb)
1397{
1398	struct m_ext2fs *fs;
1399	struct buf *bp = NULL;
1400	struct ext4_extent_path *path;
1401	struct ext4_extent newex, *ex;
1402	e4fs_daddr_t bpref, newblk = 0;
1403	unsigned long allocated = 0;
1404	int error = 0, depth;
1405
1406	if(bpp)
1407		*bpp = NULL;
1408	*pallocated = 0;
1409
1410	/* Check cache. */
1411	path = NULL;
1412	if ((bpref = ext4_ext_in_cache(ip, iblk, &newex))) {
1413		if (bpref == EXT4_EXT_CACHE_IN) {
1414			/* Block is already allocated. */
1415			newblk = iblk - le32toh(newex.e_blk) +
1416			    ext4_ext_extent_pblock(&newex);
1417			allocated = le16toh(newex.e_len) - (iblk - le32toh(newex.e_blk));
1418			goto out;
1419		} else {
1420			error = EIO;
1421			goto out2;
1422		}
1423	}
1424
1425	error = ext4_ext_find_extent(ip, iblk, &path);
1426	if (error) {
1427		goto out2;
1428	}
1429
1430	depth = ext4_ext_inode_depth(ip);
1431	if (path[depth].ep_ext == NULL && depth != 0) {
1432		error = EIO;
1433		goto out2;
1434	}
1435
1436	if ((ex = path[depth].ep_ext)) {
1437		uint64_t lblk = le32toh(ex->e_blk);
1438		uint16_t e_len  = le16toh(ex->e_len);
1439		e4fs_daddr_t e_start = ext4_ext_extent_pblock(ex);
1440
1441		if (e_len > EXT4_MAX_LEN)
1442			goto out2;
1443
1444		/* If we found extent covers block, simply return it. */
1445		if (iblk >= lblk && iblk < lblk + e_len) {
1446			newblk = iblk - lblk + e_start;
1447			allocated = e_len - (iblk - lblk);
1448			ext4_ext_put_in_cache(ip, lblk, e_len,
1449			    e_start, EXT4_EXT_CACHE_IN);
1450			goto out;
1451		}
1452	}
1453
1454	/* Allocate the new block. */
1455	if (S_ISREG(ip->i_mode) && (!ip->i_next_alloc_block)) {
1456		ip->i_next_alloc_goal = 0;
1457	}
1458
1459	bpref = ext4_ext_blkpref(ip, path, iblk);
1460	allocated = max_blocks;
1461	newblk = ext4_new_blocks(ip, iblk, bpref, cred, &allocated, &error);
1462	if (!newblk)
1463		goto out2;
1464
1465	/* Try to insert new extent into found leaf and return. */
1466	newex.e_blk = htole32(iblk);
1467	ext4_ext_store_pblock(&newex, newblk);
1468	newex.e_len = htole16(allocated);
1469	error = ext4_ext_insert_extent(ip, path, &newex);
1470	if (error)
1471		goto out2;
1472
1473	newblk = ext4_ext_extent_pblock(&newex);
1474	ext4_ext_put_in_cache(ip, iblk, allocated, newblk, EXT4_EXT_CACHE_IN);
1475	*pallocated = 1;
1476
1477out:
1478	if (allocated > max_blocks)
1479		allocated = max_blocks;
1480
1481	if (bpp)
1482	{
1483		fs = ip->i_e2fs;
1484		error = bread(ip->i_devvp, fsbtodb(fs, newblk),
1485		    fs->e2fs_bsize, cred, &bp);
1486		if (error) {
1487			brelse(bp);
1488		} else {
1489			*bpp = bp;
1490		}
1491	}
1492
1493out2:
1494	if (path) {
1495		ext4_ext_drop_refs(path);
1496		free(path, M_EXT2EXTENTS);
1497	}
1498
1499	if (nb)
1500		*nb = newblk;
1501
1502	return (error);
1503}
1504
1505static inline struct ext4_extent_header *
1506ext4_ext_header(struct inode *ip)
1507{
1508
1509	return ((struct ext4_extent_header *)ip->i_db);
1510}
1511
1512static int
1513ext4_remove_blocks(struct inode *ip, struct ext4_extent *ex,
1514    unsigned long from, unsigned long to)
1515{
1516	unsigned long num, start;
1517
1518	if (from >= le32toh(ex->e_blk) &&
1519	    to == le32toh(ex->e_blk) + ext4_ext_get_actual_len(ex) - 1) {
1520		/* Tail cleanup. */
1521		num = le32toh(ex->e_blk) + ext4_ext_get_actual_len(ex) - from;
1522		start = ext4_ext_extent_pblock(ex) +
1523		    ext4_ext_get_actual_len(ex) - num;
1524		ext4_ext_blkfree(ip, start, num, 0);
1525	}
1526
1527	return (0);
1528}
1529
1530static int
1531ext4_ext_rm_index(struct inode *ip, struct ext4_extent_path *path)
1532{
1533	e4fs_daddr_t leaf;
1534
1535	/* Free index block. */
1536	path--;
1537	leaf = ext4_ext_index_pblock(path->ep_index);
1538	KASSERT(path->ep_header->eh_ecount != 0,
1539	    ("ext4_ext_rm_index: bad ecount"));
1540	path->ep_header->eh_ecount =
1541	    htole16(le16toh(path->ep_header->eh_ecount) - 1);
1542	ext4_ext_dirty(ip, path);
1543	ext4_ext_blkfree(ip, leaf, 1, 0);
1544	return (0);
1545}
1546
1547static int
1548ext4_ext_rm_leaf(struct inode *ip, struct ext4_extent_path *path,
1549    uint64_t start)
1550{
1551	struct ext4_extent_header *eh;
1552	struct ext4_extent *ex;
1553	unsigned int a, b, block, num;
1554	unsigned long ex_blk;
1555	unsigned short ex_len;
1556	int depth;
1557	int error, correct_index;
1558
1559	depth = ext4_ext_inode_depth(ip);
1560	if (!path[depth].ep_header) {
1561		if (path[depth].ep_data == NULL)
1562			return (EINVAL);
1563		path[depth].ep_header =
1564		    (struct ext4_extent_header* )path[depth].ep_data;
1565	}
1566
1567	eh = path[depth].ep_header;
1568	if (!eh) {
1569		SDT_PROBE2(ext2fs, , trace, extents, 1,
1570		    "bad header => extent corrupted");
1571		return (EIO);
1572	}
1573
1574	ex = EXT_LAST_EXTENT(eh);
1575	ex_blk = le32toh(ex->e_blk);
1576	ex_len = ext4_ext_get_actual_len(ex);
1577
1578	error = 0;
1579	correct_index = 0;
1580	while (ex >= EXT_FIRST_EXTENT(eh) && ex_blk + ex_len > start) {
1581		path[depth].ep_ext = ex;
1582		a = ex_blk > start ? ex_blk : start;
1583		b = (uint64_t)ex_blk + ex_len - 1 <
1584		    EXT4_MAX_BLOCKS ? ex_blk + ex_len - 1 : EXT4_MAX_BLOCKS;
1585
1586		if (a != ex_blk && b != ex_blk + ex_len - 1)
1587			return (EINVAL);
1588		else if (a != ex_blk) {
1589			/* Remove tail of the extent. */
1590			block = ex_blk;
1591			num = a - block;
1592		} else if (b != ex_blk + ex_len - 1) {
1593			/* Remove head of the extent, not implemented. */
1594			return (EINVAL);
1595		} else {
1596			/* Remove whole extent. */
1597			block = ex_blk;
1598			num = 0;
1599		}
1600
1601		if (ex == EXT_FIRST_EXTENT(eh))
1602			correct_index = 1;
1603
1604		error = ext4_remove_blocks(ip, ex, a, b);
1605		if (error)
1606			goto out;
1607
1608		if (num == 0) {
1609			ext4_ext_store_pblock(ex, 0);
1610			eh->eh_ecount = htole16(le16toh(eh->eh_ecount) - 1);
1611		}
1612
1613		ex->e_blk = htole32(block);
1614		ex->e_len = htole16(num);
1615
1616		ext4_ext_dirty(ip, path + depth);
1617
1618		ex--;
1619		ex_blk = htole32(ex->e_blk);
1620		ex_len = ext4_ext_get_actual_len(ex);
1621	};
1622
1623	if (correct_index && le16toh(eh->eh_ecount))
1624		error = ext4_ext_correct_indexes(ip, path);
1625
1626	/*
1627	 * If this leaf is free, we should
1628	 * remove it from index block above.
1629	 */
1630	if (error == 0 && eh->eh_ecount == 0 &&
1631	    path[depth].ep_data != NULL)
1632		error = ext4_ext_rm_index(ip, path + depth);
1633
1634out:
1635	return (error);
1636}
1637
1638static struct buf *
1639ext4_read_extent_tree_block(struct inode *ip, e4fs_daddr_t pblk,
1640    int depth, int flags)
1641{
1642	struct m_ext2fs *fs;
1643	struct ext4_extent_header *eh;
1644	struct buf *bp;
1645	int error;
1646
1647	fs = ip->i_e2fs;
1648	error = bread(ip->i_devvp, fsbtodb(fs, pblk),
1649	    fs->e2fs_bsize, NOCRED, &bp);
1650	if (error) {
1651		return (NULL);
1652	}
1653
1654	eh = ext4_ext_block_header(bp->b_data);
1655	if (le16toh(eh->eh_depth) != depth) {
1656		SDT_PROBE2(ext2fs, , trace, extents, 1,
1657		    "unexpected eh_depth");
1658		goto err;
1659	}
1660
1661	error = ext4_ext_check_header(ip, eh, depth);
1662	if (error)
1663		goto err;
1664
1665	return (bp);
1666
1667err:
1668	brelse(bp);
1669	return (NULL);
1670
1671}
1672
1673static int inline
1674ext4_ext_more_to_rm(struct ext4_extent_path *path)
1675{
1676
1677	KASSERT(path->ep_index != NULL,
1678	    ("ext4_ext_more_to_rm: bad index from path"));
1679
1680	if (path->ep_index < EXT_FIRST_INDEX(path->ep_header))
1681		return (0);
1682
1683	if (le16toh(path->ep_header->eh_ecount) == path->index_count)
1684		return (0);
1685
1686	return (1);
1687}
1688
1689int
1690ext4_ext_remove_space(struct inode *ip, off_t length, int flags,
1691    struct ucred *cred, struct thread *td)
1692{
1693	struct buf *bp;
1694	struct ext4_extent_header *ehp;
1695	struct ext4_extent_path *path;
1696	int depth;
1697	int i, error;
1698
1699	ehp = (struct ext4_extent_header *)ip->i_db;
1700	depth = ext4_ext_inode_depth(ip);
1701
1702	error = ext4_ext_check_header(ip, ehp, depth);
1703	if(error)
1704		return (error);
1705
1706	path = malloc(sizeof(struct ext4_extent_path) * (depth + 1),
1707	    M_EXT2EXTENTS, M_WAITOK | M_ZERO);
1708	path[0].ep_header = ehp;
1709	path[0].ep_depth = depth;
1710	i = 0;
1711	while (error == 0 && i >= 0) {
1712		if (i == depth) {
1713			/* This is leaf. */
1714			error = ext4_ext_rm_leaf(ip, path, length);
1715			if (error)
1716				break;
1717			free(path[i].ep_data, M_EXT2EXTENTS);
1718			path[i].ep_data = NULL;
1719			i--;
1720			continue;
1721		}
1722
1723		/* This is index. */
1724		if (!path[i].ep_header)
1725			path[i].ep_header =
1726			    (struct ext4_extent_header *)path[i].ep_data;
1727
1728		if (!path[i].ep_index) {
1729			/* This level hasn't touched yet. */
1730			path[i].ep_index = EXT_LAST_INDEX(path[i].ep_header);
1731			path[i].index_count =
1732			    le16toh(path[i].ep_header->eh_ecount) + 1;
1733		} else {
1734			/* We've already was here, see at next index. */
1735			path[i].ep_index--;
1736		}
1737
1738		if (ext4_ext_more_to_rm(path + i)) {
1739			memset(path + i + 1, 0, sizeof(*path));
1740			bp = ext4_read_extent_tree_block(ip,
1741			    ext4_ext_index_pblock(path[i].ep_index),
1742			    path[0].ep_depth - (i + 1), 0);
1743			if (!bp) {
1744				error = EIO;
1745				break;
1746			}
1747
1748			ext4_ext_fill_path_bdata(&path[i+1], bp,
1749			    ext4_ext_index_pblock(path[i].ep_index));
1750			brelse(bp);
1751			path[i].index_count =
1752			    le16toh(path[i].ep_header->eh_ecount);
1753			i++;
1754		} else {
1755			if (path[i].ep_header->eh_ecount == 0 && i > 0) {
1756				/* Index is empty, remove it. */
1757				error = ext4_ext_rm_index(ip, path + i);
1758			}
1759			free(path[i].ep_data, M_EXT2EXTENTS);
1760			path[i].ep_data = NULL;
1761			i--;
1762		}
1763	}
1764
1765	if (path->ep_header->eh_ecount == 0) {
1766		/*
1767		 * Truncate the tree to zero.
1768		 */
1769		 ext4_ext_header(ip)->eh_depth = 0;
1770		 ext4_ext_header(ip)->eh_max = htole16(ext4_ext_space_root(ip));
1771		 ext4_ext_dirty(ip, path);
1772	}
1773
1774	ext4_ext_drop_refs(path);
1775	free(path, M_EXT2EXTENTS);
1776
1777	ip->i_ext_cache.ec_type = EXT4_EXT_CACHE_NO;
1778	return (error);
1779}
1780