/* -*- mode: c; c-basic-offset: 8; -*-
 * vim: noexpandtab sw=8 ts=8 sts=0:
 *
 * alloc.c
 *
 * Extent allocs and frees
 *
 * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/fs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/swap.h>

#define MLOG_MASK_PREFIX ML_DISK_ALLOC
#include <cluster/masklog.h>

#include "ocfs2.h"

#include "alloc.h"
#include "aops.h"
#include "dlmglue.h"
#include "extent_map.h"
#include "inode.h"
#include "journal.h"
#include "localalloc.h"
#include "suballoc.h"
#include "sysfile.h"
#include "file.h"
#include "super.h"
#include "uptodate.h"

#include "buffer_head_io.h"

static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);

/*
 * Structures which describe a path through a btree, and functions to
 * manipulate them.
 *
 * The idea here is to be as generic as possible with the tree
 * manipulation code.
 */
struct ocfs2_path_item {
	struct buffer_head		*bh;
	struct ocfs2_extent_list	*el;
};

#define OCFS2_MAX_PATH_DEPTH	5

struct ocfs2_path {
	int			p_tree_depth;
	struct ocfs2_path_item	p_node[OCFS2_MAX_PATH_DEPTH];
};

#define path_root_bh(_path) ((_path)->p_node[0].bh)
#define path_root_el(_path) ((_path)->p_node[0].el)
#define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh)
#define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
#define path_num_items(_path) ((_path)->p_tree_depth + 1)

/*
 * Reset the actual path elements so that we can re-use the structure
 * to build another path. Generally, this involves freeing the buffer
 * heads.
 */
static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
{
	int i, start = 0, depth = 0;
	struct ocfs2_path_item *node;

	if (keep_root)
		start = 1;

	for(i = start; i < path_num_items(path); i++) {
		node = &path->p_node[i];

		brelse(node->bh);
		node->bh = NULL;
		node->el = NULL;
	}

	/*
	 * Tree depth may change during truncate, or insert. If we're
	 * keeping the root extent list, then make sure that our path
	 * structure reflects the proper depth.
	 */
	if (keep_root)
		depth = le16_to_cpu(path_root_el(path)->l_tree_depth);

	path->p_tree_depth = depth;
}

static void ocfs2_free_path(struct ocfs2_path *path)
{
	if (path) {
		ocfs2_reinit_path(path, 0);
		kfree(path);
	}
}

/*
 * Make the *dest path the same as src and re-initialize src path to
 * have a root only.
 */
static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src)
{
	int i;

	BUG_ON(path_root_bh(dest) != path_root_bh(src));

	for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
		brelse(dest->p_node[i].bh);

		dest->p_node[i].bh = src->p_node[i].bh;
		dest->p_node[i].el = src->p_node[i].el;

		src->p_node[i].bh = NULL;
		src->p_node[i].el = NULL;
	}
}

/*
 * Insert an extent block at given index.
 *
 * This will not take an additional reference on eb_bh.
 */
static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index,
					struct buffer_head *eb_bh)
{
	struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)eb_bh->b_data;

	/*
	 * Right now, no root bh is an extent block, so this helps
	 * catch code errors with dinode trees. The assertion can be
	 * safely removed if we ever need to insert extent block
	 * structures at the root.
	 */
	BUG_ON(index == 0);

	path->p_node[index].bh = eb_bh;
	path->p_node[index].el = &eb->h_list;
}

static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
					 struct ocfs2_extent_list *root_el)
{
	struct ocfs2_path *path;

	BUG_ON(le16_to_cpu(root_el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH);

	path = kzalloc(sizeof(*path), GFP_NOFS);
	if (path) {
		path->p_tree_depth = le16_to_cpu(root_el->l_tree_depth);
		get_bh(root_bh);
		path_root_bh(path) = root_bh;
		path_root_el(path) = root_el;
	}

	return path;
}

/*
 * Allocate and initialize a new path based on a disk inode tree.
 */
static struct ocfs2_path *ocfs2_new_inode_path(struct buffer_head *di_bh)
{
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
	struct ocfs2_extent_list *el = &di->id2.i_list;

	return ocfs2_new_path(di_bh, el);
}

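/*
 * Illustrative sketch (comment only, not compiled code): the typical
 * lifecycle of a path. Error handling is trimmed, 'inode', 'di_bh'
 * and 'cpos' are assumed to come from the caller, and
 * ocfs2_find_path() is defined further down in this file.
 *
 *	struct ocfs2_path *path = ocfs2_new_inode_path(di_bh);
 *	if (path && ocfs2_find_path(inode, path, cpos) == 0) {
 *		struct ocfs2_extent_list *el = path_leaf_el(path);
 *		... inspect or modify the leaf records in el ...
 *	}
 *	ocfs2_free_path(path);
 *
 * Note that ocfs2_free_path() tolerates a NULL path, so error paths
 * can share a single exit.
 */
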
/*
 * Convenience function to journal all components in a path.
 */
static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle,
				     struct ocfs2_path *path)
{
	int i, ret = 0;

	if (!path)
		goto out;

	for(i = 0; i < path_num_items(path); i++) {
		ret = ocfs2_journal_access(handle, inode, path->p_node[i].bh,
					   OCFS2_JOURNAL_ACCESS_WRITE);
		if (ret < 0) {
			mlog_errno(ret);
			goto out;
		}
	}

out:
	return ret;
}

enum ocfs2_contig_type {
	CONTIG_NONE = 0,
	CONTIG_LEFT,
	CONTIG_RIGHT
};

/*
 * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and
 * ocfs2_extent_contig() only work properly against leaf nodes!
 */
static int ocfs2_block_extent_contig(struct super_block *sb,
				     struct ocfs2_extent_rec *ext,
				     u64 blkno)
{
	u64 blk_end = le64_to_cpu(ext->e_blkno);

	blk_end += ocfs2_clusters_to_blocks(sb,
				    le16_to_cpu(ext->e_leaf_clusters));

	return blkno == blk_end;
}

static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left,
				  struct ocfs2_extent_rec *right)
{
	u32 left_range;

	left_range = le32_to_cpu(left->e_cpos) +
		le16_to_cpu(left->e_leaf_clusters);

	return (left_range == le32_to_cpu(right->e_cpos));
}

static enum ocfs2_contig_type
	ocfs2_extent_contig(struct inode *inode,
			    struct ocfs2_extent_rec *ext,
			    struct ocfs2_extent_rec *insert_rec)
{
	u64 blkno = le64_to_cpu(insert_rec->e_blkno);

	if (ocfs2_extents_adjacent(ext, insert_rec) &&
	    ocfs2_block_extent_contig(inode->i_sb, ext, blkno))
		return CONTIG_RIGHT;

	blkno = le64_to_cpu(ext->e_blkno);
	if (ocfs2_extents_adjacent(insert_rec, ext) &&
	    ocfs2_block_extent_contig(inode->i_sb, insert_rec, blkno))
		return CONTIG_LEFT;

	return CONTIG_NONE;
}

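/*
 * Worked example (assuming, for simplicity, one block per cluster):
 * given an existing leaf record ext = {e_cpos 10, e_leaf_clusters 4,
 * e_blkno 200}, an insert record {e_cpos 14, e_blkno 204} is both
 * logically adjacent (10 + 4 == 14) and physically contiguous
 * (200 + 4 == 204), so ocfs2_extent_contig() returns CONTIG_RIGHT.
 * If only one of the two checks held - say the new extent started at
 * block 300 instead - the result would be CONTIG_NONE.
 */
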
/*
 * NOTE: We can have pretty much any combination of contiguousness and
 * appending.
 *
 * The usefulness of APPEND_TAIL is more in that it lets us know that
 * we'll have to update the path to that leaf.
 */
enum ocfs2_append_type {
	APPEND_NONE = 0,
	APPEND_TAIL,
};

struct ocfs2_insert_type {
	enum ocfs2_append_type	ins_appending;
	enum ocfs2_contig_type	ins_contig;
	int			ins_contig_index;
	int			ins_free_records;
	int			ins_tree_depth;
};

/*
 * How many free extents have we got before we need more metadata?
 */
int ocfs2_num_free_extents(struct ocfs2_super *osb,
			   struct inode *inode,
			   struct ocfs2_dinode *fe)
{
	int retval;
	struct ocfs2_extent_list *el;
	struct ocfs2_extent_block *eb;
	struct buffer_head *eb_bh = NULL;

	mlog_entry_void();

	if (!OCFS2_IS_VALID_DINODE(fe)) {
		OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
		retval = -EIO;
		goto bail;
	}

	if (fe->i_last_eb_blk) {
		retval = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
					  &eb_bh, OCFS2_BH_CACHED, inode);
		if (retval < 0) {
			mlog_errno(retval);
			goto bail;
		}
		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
		el = &eb->h_list;
	} else
		el = &fe->id2.i_list;

	BUG_ON(el->l_tree_depth != 0);

	retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
bail:
	if (eb_bh)
		brelse(eb_bh);

	mlog_exit(retval);
	return retval;
}

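/*
 * Illustrative sketch (comment only) of the usual caller pattern,
 * simplified and with error handling trimmed: check how many record
 * slots remain and, if none are left, reserve extent blocks (a
 * meta_ac, typically via ocfs2_reserve_new_metadata()) before
 * starting the handle, since the insert may then have to add a
 * branch or shift tree depth.
 *
 *	int free = ocfs2_num_free_extents(osb, inode, fe);
 *	if (free < 0)
 *		return free;
 *	if (free == 0) {
 *		... reserve metadata before journaling the insert ...
 *	}
 */
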
/* expects array to already be allocated
 *
 * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and
 * l_count for you
 */
static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
				     handle_t *handle,
				     struct inode *inode,
				     int wanted,
				     struct ocfs2_alloc_context *meta_ac,
				     struct buffer_head *bhs[])
{
	int count, status, i;
	u16 suballoc_bit_start;
	u32 num_got;
	u64 first_blkno;
	struct ocfs2_extent_block *eb;

	mlog_entry_void();

	count = 0;
	while (count < wanted) {
		status = ocfs2_claim_metadata(osb,
					      handle,
					      meta_ac,
					      wanted - count,
					      &suballoc_bit_start,
					      &num_got,
					      &first_blkno);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}

		for(i = count; i < (num_got + count); i++) {
			bhs[i] = sb_getblk(osb->sb, first_blkno);
			if (bhs[i] == NULL) {
				status = -EIO;
				mlog_errno(status);
				goto bail;
			}
			ocfs2_set_new_buffer_uptodate(inode, bhs[i]);

			status = ocfs2_journal_access(handle, inode, bhs[i],
						      OCFS2_JOURNAL_ACCESS_CREATE);
			if (status < 0) {
				mlog_errno(status);
				goto bail;
			}

			memset(bhs[i]->b_data, 0, osb->sb->s_blocksize);
			eb = (struct ocfs2_extent_block *) bhs[i]->b_data;
			/* Ok, setup the minimal stuff here. */
			strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
			eb->h_blkno = cpu_to_le64(first_blkno);
			eb->h_fs_generation = cpu_to_le32(osb->fs_generation);

#ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS
			/* we always use slot zero's suballocator */
			eb->h_suballoc_slot = 0;
#else
			eb->h_suballoc_slot = cpu_to_le16(osb->slot_num);
#endif
			eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
			eb->h_list.l_count =
				cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));

			suballoc_bit_start++;
			first_blkno++;

			/* We'll also be dirtied by the caller, so
			 * this isn't absolutely necessary. */
			status = ocfs2_journal_dirty(handle, bhs[i]);
			if (status < 0) {
				mlog_errno(status);
				goto bail;
			}
		}

		count += num_got;
	}

	status = 0;
bail:
	if (status < 0) {
		for(i = 0; i < wanted; i++) {
			if (bhs[i])
				brelse(bhs[i]);
			bhs[i] = NULL;
		}
	}
	mlog_exit(status);
	return status;
}

/*
 * Helper function for ocfs2_add_branch() and ocfs2_shift_tree_depth().
 *
 * Returns the sum of the rightmost extent rec logical offset and
 * cluster count.
 *
 * ocfs2_add_branch() uses this to determine what logical cluster
 * value should be populated into the leftmost new branch records.
 *
 * ocfs2_shift_tree_depth() uses this to determine the # clusters
 * value for the new topmost tree record.
 */
static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list *el)
{
	int i;

	i = le16_to_cpu(el->l_next_free_rec) - 1;

	return le32_to_cpu(el->l_recs[i].e_cpos) +
		ocfs2_rec_clusters(el, &el->l_recs[i]);
}

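/*
 * For example, if the rightmost record is {e_cpos 100, 8 clusters},
 * this returns 108 - the first logical cluster past the current end
 * of the tree, which is exactly the cpos a new (empty) branch should
 * start at.
 */
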
/*
 * Add an entire tree branch to our inode. eb_bh is the extent block
 * to start at, if we don't want to start the branch at the dinode
 * structure.
 *
 * last_eb_bh is required as we have to update its next_leaf pointer
 * for the new last extent block.
 *
 * The new branch will be 'empty' in the sense that every block will
 * contain a single record with cluster count == 0.
 */
static int ocfs2_add_branch(struct ocfs2_super *osb,
			    handle_t *handle,
			    struct inode *inode,
			    struct buffer_head *fe_bh,
			    struct buffer_head *eb_bh,
			    struct buffer_head *last_eb_bh,
			    struct ocfs2_alloc_context *meta_ac)
{
	int status, new_blocks, i;
	u64 next_blkno, new_last_eb_blk;
	struct buffer_head *bh;
	struct buffer_head **new_eb_bhs = NULL;
	struct ocfs2_dinode *fe;
	struct ocfs2_extent_block *eb;
	struct ocfs2_extent_list  *eb_el;
	struct ocfs2_extent_list  *el;
	u32 new_cpos;

	mlog_entry_void();

	BUG_ON(!last_eb_bh);

	fe = (struct ocfs2_dinode *) fe_bh->b_data;

	if (eb_bh) {
		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
		el = &eb->h_list;
	} else
		el = &fe->id2.i_list;

	/* we never add a branch to a leaf. */
	BUG_ON(!el->l_tree_depth);

	new_blocks = le16_to_cpu(el->l_tree_depth);

	/* allocate the number of new eb blocks we need */
	new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *),
			     GFP_KERNEL);
	if (!new_eb_bhs) {
		status = -ENOMEM;
		mlog_errno(status);
		goto bail;
	}

	status = ocfs2_create_new_meta_bhs(osb, handle, inode, new_blocks,
					   meta_ac, new_eb_bhs);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	eb = (struct ocfs2_extent_block *)last_eb_bh->b_data;
	new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);

	/* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
	 * linked with the rest of the tree.
	 * Conversely, new_eb_bhs[0] is the new bottommost leaf.
	 *
	 * When we leave the loop, new_last_eb_blk will point to the
	 * newest leaf, and next_blkno will point to the topmost extent
	 * block. */
	next_blkno = new_last_eb_blk = 0;
	for(i = 0; i < new_blocks; i++) {
		bh = new_eb_bhs[i];
		eb = (struct ocfs2_extent_block *) bh->b_data;
		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
			status = -EIO;
			goto bail;
		}
		eb_el = &eb->h_list;

		status = ocfs2_journal_access(handle, inode, bh,
					      OCFS2_JOURNAL_ACCESS_CREATE);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}

		eb->h_next_leaf_blk = 0;
		eb_el->l_tree_depth = cpu_to_le16(i);
		eb_el->l_next_free_rec = cpu_to_le16(1);
		/*
		 * This actually counts as an empty extent, since the
		 * cluster count is zero.
		 */
		eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos);
		eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
		/*
		 * eb_el isn't always an interior node, but even leaf
		 * nodes want a zero'd flags and reserved field so
		 * this gets the whole 32 bits regardless of use.
		 */
		eb_el->l_recs[0].e_int_clusters = cpu_to_le32(0);
		if (!eb_el->l_tree_depth)
			new_last_eb_blk = le64_to_cpu(eb->h_blkno);

		status = ocfs2_journal_dirty(handle, bh);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}

		next_blkno = le64_to_cpu(eb->h_blkno);
	}

	/* This is a bit hairy. We want to update up to three blocks
	 * here without leaving any of them in an inconsistent state
	 * in case of error. We don't have to worry about
	 * journal_dirty erroring as it won't unless we've aborted the
	 * handle (in which case we would never be here) so reserving
	 * the write with journal_access is all we need to do. */
	status = ocfs2_journal_access(handle, inode, last_eb_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}
	status = ocfs2_journal_access(handle, inode, fe_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}
	if (eb_bh) {
		status = ocfs2_journal_access(handle, inode, eb_bh,
					      OCFS2_JOURNAL_ACCESS_WRITE);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
	}

	/* Link the new branch into the rest of the tree (el will
	 * either be on the fe, or the extent block passed in). */
	i = le16_to_cpu(el->l_next_free_rec);
	el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
	el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
	el->l_recs[i].e_int_clusters = 0;
	le16_add_cpu(&el->l_next_free_rec, 1);

	/* fe needs a new last extent block pointer, as does the
	 * next_leaf on the previously last-extent-block. */
	fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk);

	eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
	eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);

	status = ocfs2_journal_dirty(handle, last_eb_bh);
	if (status < 0)
		mlog_errno(status);
	status = ocfs2_journal_dirty(handle, fe_bh);
	if (status < 0)
		mlog_errno(status);
	if (eb_bh) {
		status = ocfs2_journal_dirty(handle, eb_bh);
		if (status < 0)
			mlog_errno(status);
	}

	status = 0;
bail:
	if (new_eb_bhs) {
		for (i = 0; i < new_blocks; i++)
			if (new_eb_bhs[i])
				brelse(new_eb_bhs[i]);
		kfree(new_eb_bhs);
	}

	mlog_exit(status);
	return status;
}

/*
 * Adds another level to the allocation tree.
 * Returns the new extent block so you can add a branch to it
 * after this call.
 */
static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
				  handle_t *handle,
				  struct inode *inode,
				  struct buffer_head *fe_bh,
				  struct ocfs2_alloc_context *meta_ac,
				  struct buffer_head **ret_new_eb_bh)
{
	int status, i;
	u32 new_clusters;
	struct buffer_head *new_eb_bh = NULL;
	struct ocfs2_dinode *fe;
	struct ocfs2_extent_block *eb;
	struct ocfs2_extent_list  *fe_el;
	struct ocfs2_extent_list  *eb_el;

	mlog_entry_void();

	status = ocfs2_create_new_meta_bhs(osb, handle, inode, 1, meta_ac,
					   &new_eb_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	eb = (struct ocfs2_extent_block *) new_eb_bh->b_data;
	if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
		OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
		status = -EIO;
		goto bail;
	}

	eb_el = &eb->h_list;
	fe = (struct ocfs2_dinode *) fe_bh->b_data;
	fe_el = &fe->id2.i_list;

	status = ocfs2_journal_access(handle, inode, new_eb_bh,
				      OCFS2_JOURNAL_ACCESS_CREATE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	/* copy the fe data into the new extent block */
	eb_el->l_tree_depth = fe_el->l_tree_depth;
	eb_el->l_next_free_rec = fe_el->l_next_free_rec;
	for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++)
		eb_el->l_recs[i] = fe_el->l_recs[i];

	status = ocfs2_journal_dirty(handle, new_eb_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	status = ocfs2_journal_access(handle, inode, fe_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	new_clusters = ocfs2_sum_rightmost_rec(eb_el);

	/* update fe now */
	le16_add_cpu(&fe_el->l_tree_depth, 1);
	fe_el->l_recs[0].e_cpos = 0;
	fe_el->l_recs[0].e_blkno = eb->h_blkno;
	fe_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters);
	for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++)
		memset(&fe_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
	fe_el->l_next_free_rec = cpu_to_le16(1);

	/* If this is our 1st tree depth shift, then last_eb_blk
	 * becomes the allocated extent block */
	if (fe_el->l_tree_depth == cpu_to_le16(1))
		fe->i_last_eb_blk = eb->h_blkno;

	status = ocfs2_journal_dirty(handle, fe_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	*ret_new_eb_bh = new_eb_bh;
	new_eb_bh = NULL;
	status = 0;
bail:
	if (new_eb_bh)
		brelse(new_eb_bh);

	mlog_exit(status);
	return status;
}

/*
 * Should only be called when there is no space left in any of the
 * leaf nodes. What we want to do is find the lowest tree depth
 * non-leaf extent block with room for new records. There are three
 * valid results of this search:
 *
 * 1) a lowest extent block is found, then we pass it back in
 *    *lowest_eb_bh and return '0'
 *
 * 2) the search fails to find anything, but the dinode has room. We
 *    pass NULL back in *lowest_eb_bh, but still return '0'
 *
 * 3) the search fails to find anything AND the dinode is full, in
 *    which case we return > 0
 *
 * return status < 0 indicates an error.
 */
static int ocfs2_find_branch_target(struct ocfs2_super *osb,
				    struct inode *inode,
				    struct buffer_head *fe_bh,
				    struct buffer_head **target_bh)
{
	int status = 0, i;
	u64 blkno;
	struct ocfs2_dinode *fe;
	struct ocfs2_extent_block *eb;
	struct ocfs2_extent_list  *el;
	struct buffer_head *bh = NULL;
	struct buffer_head *lowest_bh = NULL;

	mlog_entry_void();

	*target_bh = NULL;

	fe = (struct ocfs2_dinode *) fe_bh->b_data;
	el = &fe->id2.i_list;

	while(le16_to_cpu(el->l_tree_depth) > 1) {
		if (le16_to_cpu(el->l_next_free_rec) == 0) {
			ocfs2_error(inode->i_sb, "Dinode %llu has empty "
				    "extent list (next_free_rec == 0)",
				    (unsigned long long)OCFS2_I(inode)->ip_blkno);
			status = -EIO;
			goto bail;
		}
		i = le16_to_cpu(el->l_next_free_rec) - 1;
		blkno = le64_to_cpu(el->l_recs[i].e_blkno);
		if (!blkno) {
			ocfs2_error(inode->i_sb, "Dinode %llu has extent "
				    "list where extent # %d has no physical "
				    "block start",
				    (unsigned long long)OCFS2_I(inode)->ip_blkno, i);
			status = -EIO;
			goto bail;
		}

		if (bh) {
			brelse(bh);
			bh = NULL;
		}

		status = ocfs2_read_block(osb, blkno, &bh, OCFS2_BH_CACHED,
					  inode);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}

		eb = (struct ocfs2_extent_block *) bh->b_data;
		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
			status = -EIO;
			goto bail;
		}
		el = &eb->h_list;

		if (le16_to_cpu(el->l_next_free_rec) <
		    le16_to_cpu(el->l_count)) {
			if (lowest_bh)
				brelse(lowest_bh);
			lowest_bh = bh;
			get_bh(lowest_bh);
		}
	}

	/* If we didn't find one and the fe doesn't have any room,
	 * then return '1' */
	if (!lowest_bh
	    && (fe->id2.i_list.l_next_free_rec == fe->id2.i_list.l_count))
		status = 1;

	*target_bh = lowest_bh;
bail:
	if (bh)
		brelse(bh);

	mlog_exit(status);
	return status;
}

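/*
 * Illustrative sketch (comment only) of how the three outcomes are
 * consumed when the tree has run out of leaf space, simplified from
 * the insert code later in this file; error handling trimmed:
 *
 *	ret = ocfs2_find_branch_target(osb, inode, fe_bh, &bh);
 *	if (ret > 0) {
 *		... dinode is full too - call ocfs2_shift_tree_depth()
 *		    to add a level before adding the branch ...
 *	}
 *	ret = ocfs2_add_branch(osb, handle, inode, fe_bh, bh,
 *			       last_eb_bh, meta_ac);
 *
 * A NULL bh here means "start the new branch at the dinode".
 */
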
/*
 * This is only valid for leaf nodes, which are the only ones that can
 * have empty extents anyway.
 */
static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec)
{
	return !rec->e_leaf_clusters;
}

/*
 * Shift everything in the list right one slot, making room for a new
 * record at index 0. Note that this writes into the slot at index
 * next_free, so the caller must ensure that slot's contents are no
 * longer needed (e.g. the now-defunct tail record during a rotation).
 */
static void ocfs2_shift_records_right(struct ocfs2_extent_list *el)
{
	int next_free = le16_to_cpu(el->l_next_free_rec);
	int count = le16_to_cpu(el->l_count);
	unsigned int num_bytes;

	BUG_ON(!next_free);
	/* This will cause us to go off the end of our extent list. */
	BUG_ON(next_free >= count);

	num_bytes = sizeof(struct ocfs2_extent_rec) * next_free;

	memmove(&el->l_recs[1], &el->l_recs[0], num_bytes);
}

static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
			      struct ocfs2_extent_rec *insert_rec)
{
	int i, insert_index, next_free, has_empty, num_bytes;
	u32 insert_cpos = le32_to_cpu(insert_rec->e_cpos);
	struct ocfs2_extent_rec *rec;

	next_free = le16_to_cpu(el->l_next_free_rec);
	has_empty = ocfs2_is_empty_extent(&el->l_recs[0]);

	BUG_ON(!next_free);

	/* The tree code before us didn't allow enough room in the leaf. */
	BUG_ON(el->l_next_free_rec == el->l_count && !has_empty);

	/*
	 * The easiest way to approach this is to just remove the
	 * empty extent and temporarily decrement next_free.
	 */
	if (has_empty) {
		/*
		 * If next_free was 1 (only an empty extent), this
		 * loop won't execute, which is fine. We still want
		 * the decrement below to happen.
		 */
		for(i = 0; i < (next_free - 1); i++)
			el->l_recs[i] = el->l_recs[i+1];

		next_free--;
	}

	/*
	 * Figure out what the new record index should be.
	 */
	for(i = 0; i < next_free; i++) {
		rec = &el->l_recs[i];

		if (insert_cpos < le32_to_cpu(rec->e_cpos))
			break;
	}
	insert_index = i;

	mlog(0, "ins %u: index %d, has_empty %d, next_free %d, count %d\n",
	     insert_cpos, insert_index, has_empty, next_free, le16_to_cpu(el->l_count));

	BUG_ON(insert_index < 0);
	BUG_ON(insert_index >= le16_to_cpu(el->l_count));
	BUG_ON(insert_index > next_free);

	/*
	 * No need to memmove if we're just adding to the tail.
	 */
	if (insert_index != next_free) {
		BUG_ON(next_free >= le16_to_cpu(el->l_count));

		num_bytes = next_free - insert_index;
		num_bytes *= sizeof(struct ocfs2_extent_rec);
		memmove(&el->l_recs[insert_index + 1],
			&el->l_recs[insert_index],
			num_bytes);
	}

	/*
	 * Either we had an empty extent and need to re-increment, or
	 * there was no empty extent on a non-full rightmost leaf node,
	 * in which case we still need to increment.
	 */
	next_free++;
	el->l_next_free_rec = cpu_to_le16(next_free);
	/*
	 * Make sure none of the math above just messed up our tree.
	 */
	BUG_ON(le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count));

	el->l_recs[insert_index] = *insert_rec;
}

/*
 * Create an empty extent record.
 *
 * l_next_free_rec may be updated.
 *
 * If an empty extent already exists do nothing.
 */
static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el)
{
	int next_free = le16_to_cpu(el->l_next_free_rec);

	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);

	if (next_free == 0)
		goto set_and_inc;

	if (ocfs2_is_empty_extent(&el->l_recs[0]))
		return;

	mlog_bug_on_msg(el->l_count == el->l_next_free_rec,
			"Asked to create an empty extent in a full list:\n"
			"count = %u, tree depth = %u",
			le16_to_cpu(el->l_count),
			le16_to_cpu(el->l_tree_depth));

	ocfs2_shift_records_right(el);

set_and_inc:
	le16_add_cpu(&el->l_next_free_rec, 1);
	memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
}

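/*
 * Example: a leaf with l_recs [(cpos 0, 4 clusters), (cpos 8, 4)] and
 * l_next_free_rec == 2 becomes [(empty), (cpos 0, 4), (cpos 8, 4)]
 * with l_next_free_rec == 3 - the existing records shift right and a
 * zeroed record takes index 0.
 */
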
/*
 * For a rotation which involves two leaf nodes, the "root node" is
 * the lowest level tree node which contains a path to both leaves. This
 * resulting set of information can be used to form a complete "subtree".
 *
 * This function is passed two full paths from the dinode down to a
 * pair of adjacent leaves. Its task is to figure out which path
 * index contains the subtree root - this can be the root index itself
 * in a worst-case rotation.
 *
 * The array index of the subtree root is passed back.
 */
static int ocfs2_find_subtree_root(struct inode *inode,
				   struct ocfs2_path *left,
				   struct ocfs2_path *right)
{
	int i = 0;

	/*
	 * Check that the caller passed in two paths from the same tree.
	 */
	BUG_ON(path_root_bh(left) != path_root_bh(right));

	do {
		i++;

		/*
		 * The caller didn't pass two adjacent paths.
		 */
		mlog_bug_on_msg(i > left->p_tree_depth,
				"Inode %lu, left depth %u, right depth %u\n"
				"left leaf blk %llu, right leaf blk %llu\n",
				inode->i_ino, left->p_tree_depth,
				right->p_tree_depth,
				(unsigned long long)path_leaf_bh(left)->b_blocknr,
				(unsigned long long)path_leaf_bh(right)->b_blocknr);
	} while (left->p_node[i].bh->b_blocknr ==
		 right->p_node[i].bh->b_blocknr);

	return i - 1;
}

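/*
 * Example: both paths always share p_node[0] (the dinode). If the
 * paths already differ at p_node[1], the loop above exits on its
 * first test and we return 0 - the root itself is the subtree root,
 * the worst case mentioned above.
 */
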
typedef void (path_insert_t)(void *, struct buffer_head *);

/*
 * Traverse a btree path in search of cpos, starting at root_el.
 *
 * This code can be called with a cpos larger than the tree, in which
 * case it will return the rightmost path.
 */
static int __ocfs2_find_path(struct inode *inode,
			     struct ocfs2_extent_list *root_el, u32 cpos,
			     path_insert_t *func, void *data)
{
	int i, ret = 0;
	u32 range;
	u64 blkno;
	struct buffer_head *bh = NULL;
	struct ocfs2_extent_block *eb;
	struct ocfs2_extent_list *el;
	struct ocfs2_extent_rec *rec;
	struct ocfs2_inode_info *oi = OCFS2_I(inode);

	el = root_el;
	while (el->l_tree_depth) {
		if (le16_to_cpu(el->l_next_free_rec) == 0) {
			ocfs2_error(inode->i_sb,
				    "Inode %llu has empty extent list at "
				    "depth %u\n",
				    (unsigned long long)oi->ip_blkno,
				    le16_to_cpu(el->l_tree_depth));
			ret = -EROFS;
			goto out;
		}

		for(i = 0; i < le16_to_cpu(el->l_next_free_rec) - 1; i++) {
			rec = &el->l_recs[i];

			/*
			 * In the case that cpos is off the allocation
			 * tree, this should just wind up returning the
			 * rightmost record.
			 */
			range = le32_to_cpu(rec->e_cpos) +
				ocfs2_rec_clusters(el, rec);
			if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
				break;
		}

		blkno = le64_to_cpu(el->l_recs[i].e_blkno);
		if (blkno == 0) {
			ocfs2_error(inode->i_sb,
				    "Inode %llu has bad blkno in extent list "
				    "at depth %u (index %d)\n",
				    (unsigned long long)oi->ip_blkno,
				    le16_to_cpu(el->l_tree_depth), i);
			ret = -EROFS;
			goto out;
		}

		brelse(bh);
		bh = NULL;
		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno,
				       &bh, OCFS2_BH_CACHED, inode);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		eb = (struct ocfs2_extent_block *) bh->b_data;
		el = &eb->h_list;
		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
			ret = -EIO;
			goto out;
		}

		if (le16_to_cpu(el->l_next_free_rec) >
		    le16_to_cpu(el->l_count)) {
			ocfs2_error(inode->i_sb,
				    "Inode %llu has bad count in extent list "
				    "at block %llu (next free=%u, count=%u)\n",
				    (unsigned long long)oi->ip_blkno,
				    (unsigned long long)bh->b_blocknr,
				    le16_to_cpu(el->l_next_free_rec),
				    le16_to_cpu(el->l_count));
			ret = -EROFS;
			goto out;
		}

		if (func)
			func(data, bh);
	}

out:
	/*
	 * Catch any trailing bh that the loop didn't handle.
	 */
	brelse(bh);

	return ret;
}

/*
 * Given an initialized path (that is, it has a valid root extent
 * list), this function will traverse the btree in search of the path
 * which would contain cpos.
 *
 * The path traveled is recorded in the path structure.
 *
 * Note that this will not do any comparisons on leaf node extent
 * records, so it will work fine in the case that we just added a tree
 * branch.
 */
struct find_path_data {
	int index;
	struct ocfs2_path *path;
};
static void find_path_ins(void *data, struct buffer_head *bh)
{
	struct find_path_data *fp = data;

	get_bh(bh);
	ocfs2_path_insert_eb(fp->path, fp->index, bh);
	fp->index++;
}
static int ocfs2_find_path(struct inode *inode, struct ocfs2_path *path,
			   u32 cpos)
{
	struct find_path_data data;

	data.index = 1;
	data.path = path;
	return __ocfs2_find_path(inode, path_root_el(path), cpos,
				 find_path_ins, &data);
}

static void find_leaf_ins(void *data, struct buffer_head *bh)
{
	struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)bh->b_data;
	struct ocfs2_extent_list *el = &eb->h_list;
	struct buffer_head **ret = data;

	/* We want to retain only the leaf block. */
	if (le16_to_cpu(el->l_tree_depth) == 0) {
		get_bh(bh);
		*ret = bh;
	}
}
/*
 * Find the leaf block in the tree which would contain cpos. No
 * checking of the actual leaf is done.
 *
 * Some paths want to call this instead of allocating a path structure
 * and calling ocfs2_find_path().
 *
 * This function doesn't handle non-btree extent lists.
 */
int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
		    u32 cpos, struct buffer_head **leaf_bh)
{
	int ret;
	struct buffer_head *bh = NULL;

	ret = __ocfs2_find_path(inode, root_el, cpos, find_leaf_ins, &bh);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	*leaf_bh = bh;
out:
	return ret;
}

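/*
 * Illustrative sketch (comment only) of a typical use, for callers
 * which only need the leaf block itself; error handling trimmed and
 * 'di_bh' assumed to hold the inode's dinode buffer:
 *
 *	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
 *	struct buffer_head *leaf_bh = NULL;
 *	struct ocfs2_extent_list *el;
 *
 *	ret = ocfs2_find_leaf(inode, &di->id2.i_list, cpos, &leaf_bh);
 *	el = &((struct ocfs2_extent_block *)leaf_bh->b_data)->h_list;
 *	... scan el->l_recs[] for the record covering cpos ...
 *	brelse(leaf_bh);
 */
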
/*
 * Adjust the adjacent records (left_rec, right_rec) involved in a rotation.
 *
 * Basically, we've moved stuff around at the bottom of the tree and
 * we need to fix up the extent records above the changes to reflect
 * the new changes.
 *
 * left_rec: the record on the left.
 * left_child_el: the child list pointed to by left_rec
 * right_rec: the record to the right of left_rec
 * right_child_el: the child list pointed to by right_rec
 *
 * By definition, this only works on interior nodes.
 */
static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
				  struct ocfs2_extent_list *left_child_el,
				  struct ocfs2_extent_rec *right_rec,
				  struct ocfs2_extent_list *right_child_el)
{
	u32 left_clusters, right_end;

	/*
	 * Interior nodes never have holes. Their cpos is the cpos of
	 * the leftmost record in their child list. Their cluster
	 * count covers the full theoretical range of their child list
	 * - the range between their cpos and the cpos of the record
	 * immediately to their right.
	 */
	left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
	left_clusters -= le32_to_cpu(left_rec->e_cpos);
	left_rec->e_int_clusters = cpu_to_le32(left_clusters);

	/*
	 * Calculate the rightmost cluster count boundary before
	 * moving cpos - we will need to adjust clusters after
	 * updating e_cpos to keep the same highest cluster count.
	 */
	right_end = le32_to_cpu(right_rec->e_cpos);
	right_end += le32_to_cpu(right_rec->e_int_clusters);

	right_rec->e_cpos = left_rec->e_cpos;
	le32_add_cpu(&right_rec->e_cpos, left_clusters);

	right_end -= le32_to_cpu(right_rec->e_cpos);
	right_rec->e_int_clusters = cpu_to_le32(right_end);
}

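/*
 * Worked example: suppose left_rec is {e_cpos 0} and the leftmost
 * record of right_child_el starts at cpos 40. Then left_rec's
 * e_int_clusters becomes 40 - 0 = 40. If right_rec was {e_cpos 50,
 * e_int_clusters 30} (so its range ended at 80), it becomes
 * {e_cpos 40, e_int_clusters 80 - 40 = 40}: the boundary between the
 * two records moved left, but the rightmost cluster count boundary
 * (80) is preserved.
 */
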
/*
 * Adjust the adjacent root node records involved in a
 * rotation. left_el_blkno is passed in as a key so that we can easily
 * find its index in the root list.
 */
static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
				      struct ocfs2_extent_list *left_el,
				      struct ocfs2_extent_list *right_el,
				      u64 left_el_blkno)
{
	int i;

	BUG_ON(le16_to_cpu(root_el->l_tree_depth) <=
	       le16_to_cpu(left_el->l_tree_depth));

	for(i = 0; i < le16_to_cpu(root_el->l_next_free_rec) - 1; i++) {
		if (le64_to_cpu(root_el->l_recs[i].e_blkno) == left_el_blkno)
			break;
	}

	/*
	 * The path walking code should have never returned a root and
	 * two paths which are not adjacent.
	 */
	BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1));

	ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el,
				      &root_el->l_recs[i + 1], right_el);
}

/*
 * We've changed a leaf block (in right_path) and need to reflect that
 * change back up the subtree.
 *
 * This happens in multiple places:
 *   - When we've moved an extent record from the left path leaf to the right
 *     path leaf to make room for an empty extent in the left path leaf.
 *   - When our insert into the right path leaf is at the leftmost edge
 *     and requires an update of the path immediately to its left. This
 *     can occur at the end of some types of rotation and appending inserts.
 */
static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle,
				       struct ocfs2_path *left_path,
				       struct ocfs2_path *right_path,
				       int subtree_index)
{
	int ret, i, idx;
	struct ocfs2_extent_list *el, *left_el, *right_el;
	struct ocfs2_extent_rec *left_rec, *right_rec;
	struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;

	/*
	 * Update the counts and position values within all the
	 * interior nodes to reflect the leaf rotation we just did.
	 *
	 * The root node is handled below the loop.
	 *
	 * We begin the loop with right_el and left_el pointing to the
	 * leaf lists and work our way up.
	 *
	 * NOTE: within this loop, left_el and right_el always refer
	 * to the *child* lists.
	 */
	left_el = path_leaf_el(left_path);
	right_el = path_leaf_el(right_path);
	for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) {
		mlog(0, "Adjust records at index %u\n", i);

		/*
		 * One nice property of knowing that all of these
		 * nodes are below the root is that we only deal with
		 * the leftmost right node record and the rightmost
		 * left node record.
		 */
		el = left_path->p_node[i].el;
		idx = le16_to_cpu(left_el->l_next_free_rec) - 1;
		left_rec = &el->l_recs[idx];

		el = right_path->p_node[i].el;
		right_rec = &el->l_recs[0];

		ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec,
					      right_el);

		ret = ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
		if (ret)
			mlog_errno(ret);

		ret = ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
		if (ret)
			mlog_errno(ret);

		/*
		 * Setup our list pointers now so that the current
		 * parents become children in the next iteration.
		 */
		left_el = left_path->p_node[i].el;
		right_el = right_path->p_node[i].el;
	}

	/*
	 * At the root node, adjust the two adjacent records which
	 * begin our path to the leaves.
	 */

	el = left_path->p_node[subtree_index].el;
	left_el = left_path->p_node[subtree_index + 1].el;
	right_el = right_path->p_node[subtree_index + 1].el;

	ocfs2_adjust_root_records(el, left_el, right_el,
				  left_path->p_node[subtree_index + 1].bh->b_blocknr);

	root_bh = left_path->p_node[subtree_index].bh;

	ret = ocfs2_journal_dirty(handle, root_bh);
	if (ret)
		mlog_errno(ret);
}

static int ocfs2_rotate_subtree_right(struct inode *inode,
				      handle_t *handle,
				      struct ocfs2_path *left_path,
				      struct ocfs2_path *right_path,
				      int subtree_index)
{
	int ret, i;
	struct buffer_head *right_leaf_bh;
	struct buffer_head *left_leaf_bh = NULL;
	struct buffer_head *root_bh;
	struct ocfs2_extent_list *right_el, *left_el;
	struct ocfs2_extent_rec move_rec;

	left_leaf_bh = path_leaf_bh(left_path);
	left_el = path_leaf_el(left_path);

	if (left_el->l_next_free_rec != left_el->l_count) {
		ocfs2_error(inode->i_sb,
			    "Inode %llu has non-full interior leaf node %llu "
			    "(next free = %u)",
			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
			    (unsigned long long)left_leaf_bh->b_blocknr,
			    le16_to_cpu(left_el->l_next_free_rec));
		return -EROFS;
	}

	/*
	 * This extent block may already have an empty record, so we
	 * return early if so.
	 */
	if (ocfs2_is_empty_extent(&left_el->l_recs[0]))
		return 0;

	root_bh = left_path->p_node[subtree_index].bh;
	BUG_ON(root_bh != right_path->p_node[subtree_index].bh);

	ret = ocfs2_journal_access(handle, inode, root_bh,
				   OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
		ret = ocfs2_journal_access(handle, inode,
					   right_path->p_node[i].bh,
					   OCFS2_JOURNAL_ACCESS_WRITE);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		ret = ocfs2_journal_access(handle, inode,
					   left_path->p_node[i].bh,
					   OCFS2_JOURNAL_ACCESS_WRITE);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

	right_leaf_bh = path_leaf_bh(right_path);
	right_el = path_leaf_el(right_path);

	/* This is a code error, not a disk corruption. */
	mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails "
			"because rightmost leaf block %llu is empty\n",
			(unsigned long long)OCFS2_I(inode)->ip_blkno,
			(unsigned long long)right_leaf_bh->b_blocknr);

	ocfs2_create_empty_extent(right_el);

	ret = ocfs2_journal_dirty(handle, right_leaf_bh);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/* Do the copy now. */
	i = le16_to_cpu(left_el->l_next_free_rec) - 1;
	move_rec = left_el->l_recs[i];
	right_el->l_recs[0] = move_rec;

	/*
	 * Clear out the record we just copied and shift everything
	 * over, leaving an empty extent in the left leaf.
	 *
	 * We temporarily subtract from next_free_rec so that the
	 * shift will lose the tail record (which is now defunct).
	 */
	le16_add_cpu(&left_el->l_next_free_rec, -1);
	ocfs2_shift_records_right(left_el);
	memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
	le16_add_cpu(&left_el->l_next_free_rec, 1);

	ret = ocfs2_journal_dirty(handle, left_leaf_bh);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
				   subtree_index);

out:
	return ret;
}

/*
 * Given a full path, determine what cpos value would return us a path
 * containing the leaf immediately to the left of the current one.
 *
 * Will return zero if the path passed in is already the leftmost path.
 */
static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
					 struct ocfs2_path *path, u32 *cpos)
{
	int i, j, ret = 0;
	u64 blkno;
	struct ocfs2_extent_list *el;

	BUG_ON(path->p_tree_depth == 0);

	*cpos = 0;

	blkno = path_leaf_bh(path)->b_blocknr;

	/* Start at the tree node just above the leaf and work our way up. */
	i = path->p_tree_depth - 1;
	while (i >= 0) {
		el = path->p_node[i].el;

		/*
		 * Find the extent record just before the one in our
		 * path.
		 */
		for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
			if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
				if (j == 0) {
					if (i == 0) {
						/*
						 * We've determined that the
						 * path specified is already
						 * the leftmost one - return a
						 * cpos of zero.
						 */
						goto out;
					}
					/*
					 * The leftmost record points to our
					 * leaf - we need to travel up the
					 * tree one level.
					 */
					goto next_node;
				}

				*cpos = le32_to_cpu(el->l_recs[j - 1].e_cpos);
				*cpos = *cpos + ocfs2_rec_clusters(el,
							   &el->l_recs[j - 1]);
				*cpos = *cpos - 1;
				goto out;
			}
		}

		/*
		 * If we got here, we never found a valid node where
		 * the tree indicated one should be.
		 */
		ocfs2_error(sb,
			    "Invalid extent tree at extent block %llu\n",
			    (unsigned long long)blkno);
		ret = -EROFS;
		goto out;

next_node:
		blkno = path->p_node[i].bh->b_blocknr;
		i--;
	}

out:
	return ret;
}

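/*
 * Example: if the record just before ours (at some level) is
 * {e_cpos 100, 20 clusters}, then *cpos is set to 100 + 20 - 1 = 119,
 * the last cluster covered by the leaf to our left. Passing 119 to
 * ocfs2_find_path() therefore lands exactly on that neighboring leaf.
 */
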
static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
					   struct ocfs2_path *path)
{
	int credits = (path->p_tree_depth - subtree_depth) * 2 + 1;

	if (handle->h_buffer_credits < credits)
		return ocfs2_extend_trans(handle, credits);

	return 0;
}

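/*
 * For example, rotating with a tree depth of 4 and a subtree root at
 * depth 1 wants (4 - 1) * 2 + 1 = 7 credits: one node per level below
 * the subtree root on each of the left and right paths, plus the
 * subtree root itself.
 */
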
/*
 * Trap the case where we're inserting into the theoretical range past
 * the _actual_ left leaf range. Otherwise, we'll rotate a record
 * whose cpos is less than ours into the right leaf.
 *
 * It's only necessary to look at the rightmost record of the left
 * leaf because the logic that calls us should ensure that the
 * theoretical ranges in the path components above the leaves are
 * correct.
 */
static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
						 u32 insert_cpos)
{
	struct ocfs2_extent_list *left_el;
	struct ocfs2_extent_rec *rec;
	int next_free;

	left_el = path_leaf_el(left_path);
	next_free = le16_to_cpu(left_el->l_next_free_rec);
	rec = &left_el->l_recs[next_free - 1];

	if (insert_cpos > le32_to_cpu(rec->e_cpos))
		return 1;
	return 0;
}

/*
 * Rotate all the records in a btree right one record, starting at insert_cpos.
 *
 * The path to the rightmost leaf should be passed in.
 *
 * The array is assumed to be large enough to hold an entire path (tree depth).
 *
 * Upon successful return from this function:
 *
 * - The 'right_path' array will contain a path to the leaf block
 *   whose range contains e_cpos.
 * - That leaf block will have a single empty extent in list index 0.
 * - In the case that the rotation requires a post-insert update,
 *   *ret_left_path will contain a valid path which can be passed to
 *   ocfs2_insert_path().
 */
static int ocfs2_rotate_tree_right(struct inode *inode,
				   handle_t *handle,
				   u32 insert_cpos,
				   struct ocfs2_path *right_path,
				   struct ocfs2_path **ret_left_path)
{
	int ret, start;
	u32 cpos;
	struct ocfs2_path *left_path = NULL;

	*ret_left_path = NULL;

	left_path = ocfs2_new_path(path_root_bh(right_path),
				   path_root_el(right_path));
	if (!left_path) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, &cpos);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	mlog(0, "Insert: %u, first left path cpos: %u\n", insert_cpos, cpos);

	/*
	 * What we want to do here is:
	 *
	 * 1) Start with the rightmost path.
	 *
	 * 2) Determine a path to the leaf block directly to the left
	 *    of that leaf.
	 *
	 * 3) Determine the 'subtree root' - the lowest level tree node
	 *    which contains a path to both leaves.
	 *
	 * 4) Rotate the subtree.
	 *
	 * 5) Find the next subtree by considering the left path to be
	 *    the new right path.
	 *
	 * The check at the top of this while loop also accepts
	 * insert_cpos == cpos because cpos is only a _theoretical_
	 * value to get us the left path - insert_cpos might very well
	 * be filling that hole.
	 *
	 * Stop at a cpos of '0' because we either started at the
	 * leftmost branch (i.e., a tree with one branch and a
	 * rotation inside of it), or we've gone as far as we can in
	 * rotating subtrees.
	 */
	while (cpos && insert_cpos <= cpos) {
		mlog(0, "Rotating a tree: ins. cpos: %u, left path cpos: %u\n",
		     insert_cpos, cpos);

		ret = ocfs2_find_path(inode, left_path, cpos);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		mlog_bug_on_msg(path_leaf_bh(left_path) ==
				path_leaf_bh(right_path),
				"Inode %lu: error during insert of %u "
				"(left path cpos %u) results in two identical "
				"paths ending at %llu\n",
				inode->i_ino, insert_cpos, cpos,
				(unsigned long long)
				path_leaf_bh(left_path)->b_blocknr);

		if (ocfs2_rotate_requires_path_adjustment(left_path,
							  insert_cpos)) {
			mlog(0, "Path adjustment required\n");

			/*
			 * We've rotated the tree as much as we
			 * should. The rest is up to
			 * ocfs2_insert_path() to complete, after the
			 * record insertion. We indicate this
			 * situation by returning the left path.
			 *
			 * The reason we don't adjust the records here
			 * before the record insert is that an error
			 * later might break the rule where a parent
			 * record e_cpos will reflect the actual
			 * e_cpos of the 1st nonempty record of the
			 * child list.
			 */
			*ret_left_path = left_path;
			goto out_ret_path;
		}

		start = ocfs2_find_subtree_root(inode, left_path, right_path);

		mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
		     start,
		     (unsigned long long) right_path->p_node[start].bh->b_blocknr,
		     right_path->p_tree_depth);

		ret = ocfs2_extend_rotate_transaction(handle, start,
						      right_path);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		ret = ocfs2_rotate_subtree_right(inode, handle, left_path,
						 right_path, start);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/*
		 * There is no need to re-read the next right path
		 * as we know that it'll be our current left
		 * path. Optimize by copying values instead.
		 */
		ocfs2_mv_path(right_path, left_path);

		ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
						    &cpos);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

out:
	ocfs2_free_path(left_path);

out_ret_path:
	return ret;
}

/*
 * Do the final bits of extent record insertion at the target leaf
 * list. If this leaf is part of an allocation tree, it is assumed
 * that the tree above has been prepared.
 */
static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
				 struct ocfs2_extent_list *el,
				 struct ocfs2_insert_type *insert,
				 struct inode *inode)
{
	int i = insert->ins_contig_index;
	unsigned int range;
	struct ocfs2_extent_rec *rec;

	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);

	/*
	 * Contiguous insert - either left or right.
	 */
	if (insert->ins_contig != CONTIG_NONE) {
		rec = &el->l_recs[i];
		if (insert->ins_contig == CONTIG_LEFT) {
			rec->e_blkno = insert_rec->e_blkno;
			rec->e_cpos = insert_rec->e_cpos;
		}
		le16_add_cpu(&rec->e_leaf_clusters,
			     le16_to_cpu(insert_rec->e_leaf_clusters));
		return;
	}

	/*
	 * Handle insert into an empty leaf.
	 */
	if (le16_to_cpu(el->l_next_free_rec) == 0 ||
	    ((le16_to_cpu(el->l_next_free_rec) == 1) &&
	     ocfs2_is_empty_extent(&el->l_recs[0]))) {
		el->l_recs[0] = *insert_rec;
		el->l_next_free_rec = cpu_to_le16(1);
		return;
	}

	/*
	 * Appending insert.
	 */
	if (insert->ins_appending == APPEND_TAIL) {
		i = le16_to_cpu(el->l_next_free_rec) - 1;
		rec = &el->l_recs[i];
		range = le32_to_cpu(rec->e_cpos)
			+ le16_to_cpu(rec->e_leaf_clusters);
		BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range);

		mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >=
				le16_to_cpu(el->l_count),
				"inode %lu, depth %u, count %u, next free %u, "
				"rec.cpos %u, rec.clusters %u, "
				"insert.cpos %u, insert.clusters %u\n",
				inode->i_ino,
				le16_to_cpu(el->l_tree_depth),
				le16_to_cpu(el->l_count),
				le16_to_cpu(el->l_next_free_rec),
				le32_to_cpu(el->l_recs[i].e_cpos),
				le16_to_cpu(el->l_recs[i].e_leaf_clusters),
				le32_to_cpu(insert_rec->e_cpos),
				le16_to_cpu(insert_rec->e_leaf_clusters));
		i++;
		el->l_recs[i] = *insert_rec;
		le16_add_cpu(&el->l_next_free_rec, 1);
		return;
	}

	/*
	 * Ok, we have to rotate.
	 *
	 * At this point, it is safe to assume that inserting into an
	 * empty leaf and appending to a leaf have both been handled
	 * above.
	 *
	 * This leaf needs to have space, either by the empty 1st
	 * extent record, or by virtue of an l_next_free_rec < l_count.
	 */
	ocfs2_rotate_leaf(el, insert_rec);
}

static inline void ocfs2_update_dinode_clusters(struct inode *inode,
						struct ocfs2_dinode *di,
						u32 clusters)
{
	le32_add_cpu(&di->i_clusters, clusters);
	spin_lock(&OCFS2_I(inode)->ip_lock);
	OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters);
	spin_unlock(&OCFS2_I(inode)->ip_lock);
}

static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
				    struct ocfs2_extent_rec *insert_rec,
				    struct ocfs2_path *right_path,
				    struct ocfs2_path **ret_left_path)
{
	int ret, i, next_free;
	struct buffer_head *bh;
	struct ocfs2_extent_list *el;
	struct ocfs2_path *left_path = NULL;

	*ret_left_path = NULL;

	/*
	 * This shouldn't happen for non-trees. The extent rec cluster
	 * count manipulation below only works for interior nodes.
	 */
	BUG_ON(right_path->p_tree_depth == 0);

	/*
	 * If our appending insert is at the leftmost edge of a leaf,
	 * then we might need to update the rightmost records of the
	 * neighboring path.
	 */
	el = path_leaf_el(right_path);
	next_free = le16_to_cpu(el->l_next_free_rec);
	if (next_free == 0 ||
	    (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) {
		u32 left_cpos;

		ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
						    &left_cpos);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		mlog(0, "Append may need a left path update. cpos: %u, "
		     "left_cpos: %u\n", le32_to_cpu(insert_rec->e_cpos),
		     left_cpos);

		/*
		 * No need to worry if the append is already in the
		 * leftmost leaf.
		 */
		if (left_cpos) {
			left_path = ocfs2_new_path(path_root_bh(right_path),
						   path_root_el(right_path));
			if (!left_path) {
				ret = -ENOMEM;
				mlog_errno(ret);
				goto out;
			}

			ret = ocfs2_find_path(inode, left_path, left_cpos);
			if (ret) {
				mlog_errno(ret);
				goto out;
			}

			/*
			 * ocfs2_insert_path() will pass the left_path to the
			 * journal for us.
			 */
		}
	}

	ret = ocfs2_journal_access_path(inode, handle, right_path);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	el = path_root_el(right_path);
	bh = path_root_bh(right_path);
	i = 0;
	while (1) {
		struct ocfs2_extent_rec *rec;

		next_free = le16_to_cpu(el->l_next_free_rec);
		if (next_free == 0) {
			ocfs2_error(inode->i_sb,
				    "Dinode %llu has a bad extent list",
				    (unsigned long long)OCFS2_I(inode)->ip_blkno);
			ret = -EIO;
			goto out;
		}

		rec = &el->l_recs[next_free - 1];

		rec->e_int_clusters = insert_rec->e_cpos;
		le32_add_cpu(&rec->e_int_clusters,
			     le16_to_cpu(insert_rec->e_leaf_clusters));
		le32_add_cpu(&rec->e_int_clusters,
			     -le32_to_cpu(rec->e_cpos));

		ret = ocfs2_journal_dirty(handle, bh);
		if (ret)
			mlog_errno(ret);

		/* Don't touch the leaf node */
		if (++i >= right_path->p_tree_depth)
			break;

		bh = right_path->p_node[i].bh;
		el = right_path->p_node[i].el;
	}

	*ret_left_path = left_path;
	ret = 0;
out:
	if (ret != 0)
		ocfs2_free_path(left_path);

	return ret;
}

1934/*
1935 * This function only does inserts on an allocation b-tree. For dinode
1936 * lists, ocfs2_insert_at_leaf() is called directly.
1937 *
1938 * right_path is the path we want to do the actual insert
1939 * in. left_path should only be passed in if we need to update that
1940 * portion of the tree after an edge insert.
1941 */
1942static int ocfs2_insert_path(struct inode *inode,
1943			     handle_t *handle,
1944			     struct ocfs2_path *left_path,
1945			     struct ocfs2_path *right_path,
1946			     struct ocfs2_extent_rec *insert_rec,
1947			     struct ocfs2_insert_type *insert)
1948{
1949	int ret, subtree_index;
1950	struct buffer_head *leaf_bh = path_leaf_bh(right_path);
1951	struct ocfs2_extent_list *el;
1952
1953	/*
1954	 * Pass both paths to the journal. The majority of inserts
1955	 * will be touching all components anyway.
1956	 */
1957	ret = ocfs2_journal_access_path(inode, handle, right_path);
1958	if (ret < 0) {
1959		mlog_errno(ret);
1960		goto out;
1961	}
1962
1963	if (left_path) {
1964		int credits = handle->h_buffer_credits;
1965
1966		/*
1967		 * There's a chance that left_path got passed back to
1968		 * us without being accounted for in the
1969		 * journal. Extend our transaction here to be sure we
1970		 * can change those blocks.
1971		 */
1972		credits += left_path->p_tree_depth;
1973
1974		ret = ocfs2_extend_trans(handle, credits);
1975		if (ret < 0) {
1976			mlog_errno(ret);
1977			goto out;
1978		}
1979
1980		ret = ocfs2_journal_access_path(inode, handle, left_path);
1981		if (ret < 0) {
1982			mlog_errno(ret);
1983			goto out;
1984		}
1985	}
1986
1987	el = path_leaf_el(right_path);
1988
1989	ocfs2_insert_at_leaf(insert_rec, el, insert, inode);
1990	ret = ocfs2_journal_dirty(handle, leaf_bh);
1991	if (ret)
1992		mlog_errno(ret);
1993
1994	if (left_path) {
1995		subtree_index = ocfs2_find_subtree_root(inode, left_path,
1996							right_path);
1997		ocfs2_complete_edge_insert(inode, handle, left_path,
1998					   right_path, subtree_index);
1999	}
2000
2001	ret = 0;
2002out:
2003	return ret;
2004}
2005
2006static int ocfs2_do_insert_extent(struct inode *inode,
2007				  handle_t *handle,
2008				  struct buffer_head *di_bh,
2009				  struct ocfs2_extent_rec *insert_rec,
2010				  struct ocfs2_insert_type *type)
2011{
2012	int ret, rotate = 0;
2013	u32 cpos;
2014	struct ocfs2_path *right_path = NULL;
2015	struct ocfs2_path *left_path = NULL;
2016	struct ocfs2_dinode *di;
2017	struct ocfs2_extent_list *el;
2018
2019	di = (struct ocfs2_dinode *) di_bh->b_data;
2020	el = &di->id2.i_list;
2021
2022	ret = ocfs2_journal_access(handle, inode, di_bh,
2023				   OCFS2_JOURNAL_ACCESS_WRITE);
2024	if (ret) {
2025		mlog_errno(ret);
2026		goto out;
2027	}
2028
2029	if (le16_to_cpu(el->l_tree_depth) == 0) {
2030		ocfs2_insert_at_leaf(insert_rec, el, type, inode);
2031		goto out_update_clusters;
2032	}
2033
2034	right_path = ocfs2_new_inode_path(di_bh);
2035	if (!right_path) {
2036		ret = -ENOMEM;
2037		mlog_errno(ret);
2038		goto out;
2039	}
2040
2041	/*
2042	 * Determine the path to start with. Rotations need the
2043	 * rightmost path, everything else can go directly to the
2044	 * target leaf.
2045	 */
2046	cpos = le32_to_cpu(insert_rec->e_cpos);
2047	if (type->ins_appending == APPEND_NONE &&
2048	    type->ins_contig == CONTIG_NONE) {
2049		rotate = 1;
2050		cpos = UINT_MAX;
2051	}
2052
2053	ret = ocfs2_find_path(inode, right_path, cpos);
2054	if (ret) {
2055		mlog_errno(ret);
2056		goto out;
2057	}
2058
2059	if (rotate) {
2060		ret = ocfs2_rotate_tree_right(inode, handle,
2061					      le32_to_cpu(insert_rec->e_cpos),
2062					      right_path, &left_path);
2063		if (ret) {
2064			mlog_errno(ret);
2065			goto out;
2066		}
2067	} else if (type->ins_appending == APPEND_TAIL
2068		   && type->ins_contig != CONTIG_LEFT) {
2069		ret = ocfs2_append_rec_to_path(inode, handle, insert_rec,
2070					       right_path, &left_path);
2071		if (ret) {
2072			mlog_errno(ret);
2073			goto out;
2074		}
2075	}
2076
2077	ret = ocfs2_insert_path(inode, handle, left_path, right_path,
2078				insert_rec, type);
2079	if (ret) {
2080		mlog_errno(ret);
2081		goto out;
2082	}
2083
2084out_update_clusters:
2085	ocfs2_update_dinode_clusters(inode, di,
2086				     le16_to_cpu(insert_rec->e_leaf_clusters));
2087
2088	ret = ocfs2_journal_dirty(handle, di_bh);
2089	if (ret)
2090		mlog_errno(ret);
2091
2092out:
2093	ocfs2_free_path(left_path);
2094	ocfs2_free_path(right_path);
2095
2096	return ret;
2097}
2098
2099static void ocfs2_figure_contig_type(struct inode *inode,
2100				     struct ocfs2_insert_type *insert,
2101				     struct ocfs2_extent_list *el,
2102				     struct ocfs2_extent_rec *insert_rec)
2103{
2104	int i;
2105	enum ocfs2_contig_type contig_type = CONTIG_NONE;
2106
2107	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
2108
2109	for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
2110		contig_type = ocfs2_extent_contig(inode, &el->l_recs[i],
2111						  insert_rec);
2112		if (contig_type != CONTIG_NONE) {
2113			insert->ins_contig_index = i;
2114			break;
2115		}
2116	}
2117	insert->ins_contig = contig_type;
2118}
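/*
 * Illustrative example for the helper above (hypothetical numbers): on
 * a volume with 4KB clusters and 512-byte blocks (8 blocks per
 * cluster), an existing record (e_cpos = 0, 4 clusters, e_blkno = 64)
 * and an insert record (e_cpos = 4, e_blkno = 96) line up both
 * logically (0 + 4 == 4) and physically (64 + 4 * 8 == 96), so
 * ocfs2_extent_contig() reports them mergeable and ins_contig_index is
 * set to that record's index.
 */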
2119
2120/*
2121 * This should only be called against the rightmost leaf extent list.
2122 *
2123 * ocfs2_figure_appending_type() will figure out whether we'll have to
2124 * insert at the tail of the rightmost leaf.
2125 *
2126 * This should also work against the dinode list for trees with 0
2127 * depth. If we consider the dinode list to be the rightmost leaf node
2128 * then the logic here makes sense.
2129 */
2130static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
2131					struct ocfs2_extent_list *el,
2132					struct ocfs2_extent_rec *insert_rec)
2133{
2134	int i;
2135	u32 cpos = le32_to_cpu(insert_rec->e_cpos);
2136	struct ocfs2_extent_rec *rec;
2137
2138	insert->ins_appending = APPEND_NONE;
2139
2140	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
2141
2142	if (!el->l_next_free_rec)
2143		goto set_tail_append;
2144
2145	if (ocfs2_is_empty_extent(&el->l_recs[0])) {
2146		/* Were all records empty? */
2147		if (le16_to_cpu(el->l_next_free_rec) == 1)
2148			goto set_tail_append;
2149	}
2150
2151	i = le16_to_cpu(el->l_next_free_rec) - 1;
2152	rec = &el->l_recs[i];
2153
2154	if (cpos >=
2155	    (le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)))
2156		goto set_tail_append;
2157
2158	return;
2159
2160set_tail_append:
2161	insert->ins_appending = APPEND_TAIL;
2162}
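/*
 * Worked example for the helper above (values hypothetical): if the
 * rightmost leaf's last record is (e_cpos = 100, e_leaf_clusters = 8),
 * it covers clusters [100, 108). An insert_rec at cpos 108 or beyond
 * satisfies the comparison and becomes APPEND_TAIL; anything below 108
 * leaves ins_appending as APPEND_NONE.
 */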
2163
2164/*
2165 * Helper function called at the beginning of an insert.
2166 *
2167 * This computes a few things that are commonly used in the process of
2168 * inserting into the btree:
2169 *   - Whether the new extent is contiguous with an existing one.
2170 *   - The current tree depth.
2171 *   - Whether the insert is an appending one.
2172 *   - The total # of free records in the tree.
2173 *
2174 * All of the information is stored on the ocfs2_insert_type
2175 * structure.
2176 */
2177static int ocfs2_figure_insert_type(struct inode *inode,
2178				    struct buffer_head *di_bh,
2179				    struct buffer_head **last_eb_bh,
2180				    struct ocfs2_extent_rec *insert_rec,
2181				    struct ocfs2_insert_type *insert)
2182{
2183	int ret;
2184	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
2185	struct ocfs2_extent_block *eb;
2186	struct ocfs2_extent_list *el;
2187	struct ocfs2_path *path = NULL;
2188	struct buffer_head *bh = NULL;
2189
2190	el = &di->id2.i_list;
2191	insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
2192
2193	if (el->l_tree_depth) {
2194		/*
2195		 * If we have tree depth, we read in the
2196		 * rightmost extent block ahead of time as
2197		 * ocfs2_figure_insert_type() and ocfs2_add_branch()
2198		 * may want it later.
2199		 */
2200		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
2201				       le64_to_cpu(di->i_last_eb_blk), &bh,
2202				       OCFS2_BH_CACHED, inode);
2203		if (ret) {
2204			mlog_exit(ret);
2205			goto out;
2206		}
2207		eb = (struct ocfs2_extent_block *) bh->b_data;
2208		el = &eb->h_list;
2209	}
2210
2211	insert->ins_free_records = le16_to_cpu(el->l_count) -
2212		le16_to_cpu(el->l_next_free_rec);
2213
2214	if (!insert->ins_tree_depth) {
2215		ocfs2_figure_contig_type(inode, insert, el, insert_rec);
2216		ocfs2_figure_appending_type(insert, el, insert_rec);
2217		return 0;
2218	}
2219
2220	path = ocfs2_new_inode_path(di_bh);
2221	if (!path) {
2222		ret = -ENOMEM;
2223		mlog_errno(ret);
2224		goto out;
2225	}
2226
2227	/*
2228	 * In the case that we're inserting past what the tree
2229	 * currently accounts for, ocfs2_find_path() will return for
2230	 * us the rightmost tree path. This is accounted for below in
2231	 * the appending code.
2232	 */
2233	ret = ocfs2_find_path(inode, path, le32_to_cpu(insert_rec->e_cpos));
2234	if (ret) {
2235		mlog_errno(ret);
2236		goto out;
2237	}
2238
2239	el = path_leaf_el(path);
2240
2241	/*
2242	 * Now that we have the path, there are two things we want to determine:
2243	 * 1) Contiguousness (also set contig_index if this is so)
2244	 *
2245	 * 2) Are we doing an append? We can trivially break this up
2246	 *    into two types of appends: simple record append, or a
2247	 *    rotate inside the tail leaf.
2248	 */
2249	ocfs2_figure_contig_type(inode, insert, el, insert_rec);
2250
2251	/*
2252	 * The insert code isn't quite ready to deal with all cases of
2253	 * left contiguousness. Specifically, if it's an insert into
2254	 * the 1st record in a leaf, it will require the adjustment of
2255	 * cluster count on the last record of the path directly to its
2256	 * left. For now, just catch that case and fool the layers
2257	 * above us. This works just fine for tree_depth == 0, which
2258	 * is why we allow that above.
2259	 */
2260	if (insert->ins_contig == CONTIG_LEFT &&
2261	    insert->ins_contig_index == 0)
2262		insert->ins_contig = CONTIG_NONE;
2263
2264	/*
2265	 * Ok, so we can simply compare against last_eb to figure out
2266	 * whether ocfs2_find_path() handed us the rightmost leaf. That
2267	 * will only happen in the case that we're doing a tail append,
2268	 * so maybe we can take advantage of that information somehow.
2269	 */
2270	if (le64_to_cpu(di->i_last_eb_blk) == path_leaf_bh(path)->b_blocknr) {
2271		/*
2272		 * Ok, ocfs2_find_path() returned us the rightmost
2273		 * tree path. This might be an appending insert. There are
2274		 * two cases:
2275		 *    1) We're doing a true append at the tail:
2276		 *	- this might even be off the end of the leaf
2277		 *    2) We're "appending" by rotating in the tail
2278		 */
2279		ocfs2_figure_appending_type(insert, el, insert_rec);
2280	}
2281
2282out:
2283	ocfs2_free_path(path);
2284
2285	if (ret == 0)
2286		*last_eb_bh = bh;
2287	else
2288		brelse(bh);
2289	return ret;
2290}
2291
2292/*
2293 * Insert an extent into an inode btree.
2294 *
2295 * i_clusters on the dinode is updated for the caller via
2296 * ocfs2_update_dinode_clusters().
2296 */
2297int ocfs2_insert_extent(struct ocfs2_super *osb,
2298			handle_t *handle,
2299			struct inode *inode,
2300			struct buffer_head *fe_bh,
2301			u32 cpos,
2302			u64 start_blk,
2303			u32 new_clusters,
2304			struct ocfs2_alloc_context *meta_ac)
2305{
2306	int status, shift;
2307	struct buffer_head *last_eb_bh = NULL;
2308	struct buffer_head *bh = NULL;
2309	struct ocfs2_insert_type insert = {0, };
2310	struct ocfs2_extent_rec rec;
2311
2312	mlog(0, "add %u clusters at position %u to inode %llu\n",
2313	     new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno);
2314
2315	mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
2316			(OCFS2_I(inode)->ip_clusters != cpos),
2317			"Device %s, asking for sparse allocation: inode %llu, "
2318			"cpos %u, clusters %u\n",
2319			osb->dev_str,
2320			(unsigned long long)OCFS2_I(inode)->ip_blkno, cpos,
2321			OCFS2_I(inode)->ip_clusters);
2322
2323	memset(&rec, 0, sizeof(rec));
2324	rec.e_cpos = cpu_to_le32(cpos);
2325	rec.e_blkno = cpu_to_le64(start_blk);
2326	rec.e_leaf_clusters = cpu_to_le16(new_clusters);
2327
2328	status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec,
2329					  &insert);
2330	if (status < 0) {
2331		mlog_errno(status);
2332		goto bail;
2333	}
2334
2335	mlog(0, "Insert.appending: %u, Insert.Contig: %u, "
2336	     "Insert.contig_index: %d, Insert.free_records: %d, "
2337	     "Insert.tree_depth: %d\n",
2338	     insert.ins_appending, insert.ins_contig, insert.ins_contig_index,
2339	     insert.ins_free_records, insert.ins_tree_depth);
2340
2341	/*
2342	 * Avoid growing the tree unless we're out of records and the
2343	 * insert type requires one.
2344	 */
2345	if (insert.ins_contig != CONTIG_NONE || insert.ins_free_records)
2346		goto out_add;
2347
2348	shift = ocfs2_find_branch_target(osb, inode, fe_bh, &bh);
2349	if (shift < 0) {
2350		status = shift;
2351		mlog_errno(status);
2352		goto bail;
2353	}
2354
2355	/* We traveled all the way to the bottom of the allocation tree
2356	 * and didn't find room for any more extents - we need to add
2357	 * another tree level */
2358	if (shift) {
2359		BUG_ON(bh);
2360		mlog(0, "need to shift tree depth "
2361		     "(current = %d)\n", insert.ins_tree_depth);
2362
2363		/* ocfs2_shift_tree_depth will return us a buffer with
2364		 * the new extent block (so we can pass that to
2365		 * ocfs2_add_branch). */
2366		status = ocfs2_shift_tree_depth(osb, handle, inode, fe_bh,
2367						meta_ac, &bh);
2368		if (status < 0) {
2369			mlog_errno(status);
2370			goto bail;
2371		}
2372		insert.ins_tree_depth++;
2373		/* Special case: we have room now if we shifted from
2374		 * tree_depth 0 */
2375		if (insert.ins_tree_depth == 1)
2376			goto out_add;
2377	}
2378
2379	/* call ocfs2_add_branch to add the final part of the tree with
2380	 * the new data. */
2381	mlog(0, "add branch. bh = %p\n", bh);
2382	status = ocfs2_add_branch(osb, handle, inode, fe_bh, bh, last_eb_bh,
2383				  meta_ac);
2384	if (status < 0) {
2385		mlog_errno(status);
2386		goto bail;
2387	}
2388
2389out_add:
2390	/* Finally, we can add clusters. This might rotate the tree for us. */
2391	status = ocfs2_do_insert_extent(inode, handle, fe_bh, &rec, &insert);
2392	if (status < 0)
2393		mlog_errno(status);
2394	else
2395		ocfs2_extent_map_insert_rec(inode, &rec);
2396
2397bail:
2398	if (bh)
2399		brelse(bh);
2400
2401	if (last_eb_bh)
2402		brelse(last_eb_bh);
2403
2404	mlog_exit(status);
2405	return status;
2406}
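/*
 * A sketch of a typical call sequence for the function above, for
 * illustration only (credit calculation, meta_ac reservation and most
 * error handling omitted):
 *
 *	handle = ocfs2_start_trans(osb, credits);
 *	if (!IS_ERR(handle)) {
 *		status = ocfs2_insert_extent(osb, handle, inode, fe_bh,
 *					     cpos, start_blk,
 *					     new_clusters, meta_ac);
 *		ocfs2_commit_trans(osb, handle);
 *	}
 */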
2407
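/*
 * Returns nonzero when every record slot in the truncate log is in
 * use, i.e. the log must be flushed before another append can
 * succeed.
 */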
2408static inline int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
2409{
2410	struct buffer_head *tl_bh = osb->osb_tl_bh;
2411	struct ocfs2_dinode *di;
2412	struct ocfs2_truncate_log *tl;
2413
2414	di = (struct ocfs2_dinode *) tl_bh->b_data;
2415	tl = &di->id2.i_dealloc;
2416
2417	mlog_bug_on_msg(le16_to_cpu(tl->tl_used) > le16_to_cpu(tl->tl_count),
2418			"slot %d, invalid truncate log parameters: used = "
2419			"%u, count = %u\n", osb->slot_num,
2420			le16_to_cpu(tl->tl_used), le16_to_cpu(tl->tl_count));
2421	return le16_to_cpu(tl->tl_used) == le16_to_cpu(tl->tl_count);
2422}
2423
2424static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl,
2425					   unsigned int new_start)
2426{
2427	unsigned int tail_index;
2428	unsigned int current_tail;
2429
2430	/* No records, nothing to coalesce */
2431	if (!le16_to_cpu(tl->tl_used))
2432		return 0;
2433
2434	tail_index = le16_to_cpu(tl->tl_used) - 1;
2435	current_tail = le32_to_cpu(tl->tl_recs[tail_index].t_start);
2436	current_tail += le32_to_cpu(tl->tl_recs[tail_index].t_clusters);
2437
2438	return current_tail == new_start;
2439}
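/*
 * Coalescing example (hypothetical values): with a tail record of
 * (t_start = 200, t_clusters = 16), current_tail works out to 216. A
 * new range starting at cluster 216 is physically adjacent, so it can
 * simply be folded into that record; any other start gets a fresh
 * slot.
 */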
2440
2441static int ocfs2_truncate_log_append(struct ocfs2_super *osb,
2442				     handle_t *handle,
2443				     u64 start_blk,
2444				     unsigned int num_clusters)
2445{
2446	int status, index;
2447	unsigned int start_cluster, tl_count;
2448	struct inode *tl_inode = osb->osb_tl_inode;
2449	struct buffer_head *tl_bh = osb->osb_tl_bh;
2450	struct ocfs2_dinode *di;
2451	struct ocfs2_truncate_log *tl;
2452
2453	mlog_entry("start_blk = %llu, num_clusters = %u\n",
2454		   (unsigned long long)start_blk, num_clusters);
2455
2456	BUG_ON(mutex_trylock(&tl_inode->i_mutex));
2457
2458	start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
2459
2460	di = (struct ocfs2_dinode *) tl_bh->b_data;
2461	tl = &di->id2.i_dealloc;
2462	if (!OCFS2_IS_VALID_DINODE(di)) {
2463		OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
2464		status = -EIO;
2465		goto bail;
2466	}
2467
2468	tl_count = le16_to_cpu(tl->tl_count);
2469	mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) ||
2470			tl_count == 0,
2471			"Truncate record count on #%llu invalid: "
2472			"wanted %u, actual %u\n",
2473			(unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
2474			ocfs2_truncate_recs_per_inode(osb->sb),
2475			le16_to_cpu(tl->tl_count));
2476
2477	/* Caller should have known to flush before calling us. */
2478	index = le16_to_cpu(tl->tl_used);
2479	if (index >= tl_count) {
2480		status = -ENOSPC;
2481		mlog_errno(status);
2482		goto bail;
2483	}
2484
2485	status = ocfs2_journal_access(handle, tl_inode, tl_bh,
2486				      OCFS2_JOURNAL_ACCESS_WRITE);
2487	if (status < 0) {
2488		mlog_errno(status);
2489		goto bail;
2490	}
2491
2492	mlog(0, "Log truncate of %u clusters starting at cluster %u to "
2493	     "%llu (index = %d)\n", num_clusters, start_cluster,
2494	     (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index);
2495
2496	if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) {
2497		/*
2498		 * Move index back to the record we are coalescing with.
2499		 * ocfs2_truncate_log_can_coalesce() guarantees tl_used is nonzero.
2500		 */
2501		index--;
2502
2503		num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters);
2504		mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n",
2505		     index, le32_to_cpu(tl->tl_recs[index].t_start),
2506		     num_clusters);
2507	} else {
2508		tl->tl_recs[index].t_start = cpu_to_le32(start_cluster);
2509		tl->tl_used = cpu_to_le16(index + 1);
2510	}
2511	tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters);
2512
2513	status = ocfs2_journal_dirty(handle, tl_bh);
2514	if (status < 0) {
2515		mlog_errno(status);
2516		goto bail;
2517	}
2518
2519bail:
2520	mlog_exit(status);
2521	return status;
2522}
2523
2524static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
2525					 handle_t *handle,
2526					 struct inode *data_alloc_inode,
2527					 struct buffer_head *data_alloc_bh)
2528{
2529	int status = 0;
2530	int i;
2531	unsigned int num_clusters;
2532	u64 start_blk;
2533	struct ocfs2_truncate_rec rec;
2534	struct ocfs2_dinode *di;
2535	struct ocfs2_truncate_log *tl;
2536	struct inode *tl_inode = osb->osb_tl_inode;
2537	struct buffer_head *tl_bh = osb->osb_tl_bh;
2538
2539	mlog_entry_void();
2540
2541	di = (struct ocfs2_dinode *) tl_bh->b_data;
2542	tl = &di->id2.i_dealloc;
2543	i = le16_to_cpu(tl->tl_used) - 1;
2544	while (i >= 0) {
2545		/* Caller has given us at least enough credits to
2546		 * update the truncate log dinode */
2547		status = ocfs2_journal_access(handle, tl_inode, tl_bh,
2548					      OCFS2_JOURNAL_ACCESS_WRITE);
2549		if (status < 0) {
2550			mlog_errno(status);
2551			goto bail;
2552		}
2553
2554		tl->tl_used = cpu_to_le16(i);
2555
2556		status = ocfs2_journal_dirty(handle, tl_bh);
2557		if (status < 0) {
2558			mlog_errno(status);
2559			goto bail;
2560		}
2561
2562		/* TODO: Perhaps we can calculate the bulk of the
2563		 * credits up front rather than extending like
2564		 * this. */
2565		status = ocfs2_extend_trans(handle,
2566					    OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
2567		if (status < 0) {
2568			mlog_errno(status);
2569			goto bail;
2570		}
2571
2572		rec = tl->tl_recs[i];
2573		start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
2574						    le32_to_cpu(rec.t_start));
2575		num_clusters = le32_to_cpu(rec.t_clusters);
2576
2577		/* if start_blk is not set, we ignore the record as
2578		 * invalid. */
2579		if (start_blk) {
2580			mlog(0, "free record %d, start = %u, clusters = %u\n",
2581			     i, le32_to_cpu(rec.t_start), num_clusters);
2582
2583			status = ocfs2_free_clusters(handle, data_alloc_inode,
2584						     data_alloc_bh, start_blk,
2585						     num_clusters);
2586			if (status < 0) {
2587				mlog_errno(status);
2588				goto bail;
2589			}
2590		}
2591		i--;
2592	}
2593
2594bail:
2595	mlog_exit(status);
2596	return status;
2597}
2598
2599/* Expects you to already be holding tl_inode->i_mutex */
2600static int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
2601{
2602	int status;
2603	unsigned int num_to_flush;
2604	handle_t *handle;
2605	struct inode *tl_inode = osb->osb_tl_inode;
2606	struct inode *data_alloc_inode = NULL;
2607	struct buffer_head *tl_bh = osb->osb_tl_bh;
2608	struct buffer_head *data_alloc_bh = NULL;
2609	struct ocfs2_dinode *di;
2610	struct ocfs2_truncate_log *tl;
2611
2612	mlog_entry_void();
2613
2614	BUG_ON(mutex_trylock(&tl_inode->i_mutex));
2615
2616	di = (struct ocfs2_dinode *) tl_bh->b_data;
2617	tl = &di->id2.i_dealloc;
2618	if (!OCFS2_IS_VALID_DINODE(di)) {
2619		OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
2620		status = -EIO;
2621		goto out;
2622	}
2623
2624	num_to_flush = le16_to_cpu(tl->tl_used);
2625	mlog(0, "Flush %u records from truncate log #%llu\n",
2626	     num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno);
2627	if (!num_to_flush) {
2628		status = 0;
2629		goto out;
2630	}
2631
2632	data_alloc_inode = ocfs2_get_system_file_inode(osb,
2633						       GLOBAL_BITMAP_SYSTEM_INODE,
2634						       OCFS2_INVALID_SLOT);
2635	if (!data_alloc_inode) {
2636		status = -EINVAL;
2637		mlog(ML_ERROR, "Could not get bitmap inode!\n");
2638		goto out;
2639	}
2640
2641	mutex_lock(&data_alloc_inode->i_mutex);
2642
2643	status = ocfs2_meta_lock(data_alloc_inode, &data_alloc_bh, 1);
2644	if (status < 0) {
2645		mlog_errno(status);
2646		goto out_mutex;
2647	}
2648
2649	handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
2650	if (IS_ERR(handle)) {
2651		status = PTR_ERR(handle);
2652		mlog_errno(status);
2653		goto out_unlock;
2654	}
2655
2656	status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode,
2657					       data_alloc_bh);
2658	if (status < 0)
2659		mlog_errno(status);
2660
2661	ocfs2_commit_trans(osb, handle);
2662
2663out_unlock:
2664	brelse(data_alloc_bh);
2665	ocfs2_meta_unlock(data_alloc_inode, 1);
2666
2667out_mutex:
2668	mutex_unlock(&data_alloc_inode->i_mutex);
2669	iput(data_alloc_inode);
2670
2671out:
2672	mlog_exit(status);
2673	return status;
2674}
2675
2676int ocfs2_flush_truncate_log(struct ocfs2_super *osb)
2677{
2678	int status;
2679	struct inode *tl_inode = osb->osb_tl_inode;
2680
2681	mutex_lock(&tl_inode->i_mutex);
2682	status = __ocfs2_flush_truncate_log(osb);
2683	mutex_unlock(&tl_inode->i_mutex);
2684
2685	return status;
2686}
2687
2688static void ocfs2_truncate_log_worker(struct work_struct *work)
2689{
2690	int status;
2691	struct ocfs2_super *osb =
2692		container_of(work, struct ocfs2_super,
2693			     osb_truncate_log_wq.work);
2694
2695	mlog_entry_void();
2696
2697	status = ocfs2_flush_truncate_log(osb);
2698	if (status < 0)
2699		mlog_errno(status);
2700
2701	mlog_exit(status);
2702}
2703
2704#define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ)
2705void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
2706				       int cancel)
2707{
2708	if (osb->osb_tl_inode) {
2709		/* We want to push off log flushes while truncates are
2710		 * still running. */
2711		if (cancel)
2712			cancel_delayed_work(&osb->osb_truncate_log_wq);
2713
2714		queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq,
2715				   OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL);
2716	}
2717}
2718
2719static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
2720				       int slot_num,
2721				       struct inode **tl_inode,
2722				       struct buffer_head **tl_bh)
2723{
2724	int status;
2725	struct inode *inode = NULL;
2726	struct buffer_head *bh = NULL;
2727
2728	inode = ocfs2_get_system_file_inode(osb,
2729					   TRUNCATE_LOG_SYSTEM_INODE,
2730					   slot_num);
2731	if (!inode) {
2732		status = -EINVAL;
2733		mlog(ML_ERROR, "Could not load truncate log inode!\n");
2734		goto bail;
2735	}
2736
2737	status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
2738				  OCFS2_BH_CACHED, inode);
2739	if (status < 0) {
2740		iput(inode);
2741		mlog_errno(status);
2742		goto bail;
2743	}
2744
2745	*tl_inode = inode;
2746	*tl_bh    = bh;
2747bail:
2748	mlog_exit(status);
2749	return status;
2750}
2751
2752/* Called during the 1st stage of node recovery. We stamp a clean
2753 * truncate log and pass back a copy for processing later. If the
2754 * truncate log does not require processing, *tl_copy is set to
2755 * NULL. */
2756int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
2757				      int slot_num,
2758				      struct ocfs2_dinode **tl_copy)
2759{
2760	int status;
2761	struct inode *tl_inode = NULL;
2762	struct buffer_head *tl_bh = NULL;
2763	struct ocfs2_dinode *di;
2764	struct ocfs2_truncate_log *tl;
2765
2766	*tl_copy = NULL;
2767
2768	mlog(0, "recover truncate log from slot %d\n", slot_num);
2769
2770	status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh);
2771	if (status < 0) {
2772		mlog_errno(status);
2773		goto bail;
2774	}
2775
2776	di = (struct ocfs2_dinode *) tl_bh->b_data;
2777	tl = &di->id2.i_dealloc;
2778	if (!OCFS2_IS_VALID_DINODE(di)) {
2779		OCFS2_RO_ON_INVALID_DINODE(tl_inode->i_sb, di);
2780		status = -EIO;
2781		goto bail;
2782	}
2783
2784	if (le16_to_cpu(tl->tl_used)) {
2785		mlog(0, "We'll have %u logs to recover\n",
2786		     le16_to_cpu(tl->tl_used));
2787
2788		*tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL);
2789		if (!(*tl_copy)) {
2790			status = -ENOMEM;
2791			mlog_errno(status);
2792			goto bail;
2793		}
2794
2795		/* Assuming the write-out below goes well, this copy
2796		 * will be passed back to recovery for processing. */
2797		memcpy(*tl_copy, tl_bh->b_data, tl_bh->b_size);
2798
2799		/* All we need to do to clear the truncate log is set
2800		 * tl_used. */
2801		tl->tl_used = 0;
2802
2803		status = ocfs2_write_block(osb, tl_bh, tl_inode);
2804		if (status < 0) {
2805			mlog_errno(status);
2806			goto bail;
2807		}
2808	}
2809
2810bail:
2811	if (tl_inode)
2812		iput(tl_inode);
2813	if (tl_bh)
2814		brelse(tl_bh);
2815
2816	if (status < 0 && (*tl_copy)) {
2817		kfree(*tl_copy);
2818		*tl_copy = NULL;
2819	}
2820
2821	mlog_exit(status);
2822	return status;
2823}
2824
2825int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
2826					 struct ocfs2_dinode *tl_copy)
2827{
2828	int status = 0;
2829	int i;
2830	unsigned int clusters, num_recs, start_cluster;
2831	u64 start_blk;
2832	handle_t *handle;
2833	struct inode *tl_inode = osb->osb_tl_inode;
2834	struct ocfs2_truncate_log *tl;
2835
2836	mlog_entry_void();
2837
2838	if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) {
2839		mlog(ML_ERROR, "Asked to recover my own truncate log!\n");
2840		return -EINVAL;
2841	}
2842
2843	tl = &tl_copy->id2.i_dealloc;
2844	num_recs = le16_to_cpu(tl->tl_used);
2845	mlog(0, "cleanup %u records from %llu\n", num_recs,
2846	     (unsigned long long)le64_to_cpu(tl_copy->i_blkno));
2847
2848	mutex_lock(&tl_inode->i_mutex);
2849	for(i = 0; i < num_recs; i++) {
2850		if (ocfs2_truncate_log_needs_flush(osb)) {
2851			status = __ocfs2_flush_truncate_log(osb);
2852			if (status < 0) {
2853				mlog_errno(status);
2854				goto bail_up;
2855			}
2856		}
2857
2858		handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
2859		if (IS_ERR(handle)) {
2860			status = PTR_ERR(handle);
2861			mlog_errno(status);
2862			goto bail_up;
2863		}
2864
2865		clusters = le32_to_cpu(tl->tl_recs[i].t_clusters);
2866		start_cluster = le32_to_cpu(tl->tl_recs[i].t_start);
2867		start_blk = ocfs2_clusters_to_blocks(osb->sb, start_cluster);
2868
2869		status = ocfs2_truncate_log_append(osb, handle,
2870						   start_blk, clusters);
2871		ocfs2_commit_trans(osb, handle);
2872		if (status < 0) {
2873			mlog_errno(status);
2874			goto bail_up;
2875		}
2876	}
2877
2878bail_up:
2879	mutex_unlock(&tl_inode->i_mutex);
2880
2881	mlog_exit(status);
2882	return status;
2883}
2884
2885void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
2886{
2887	int status;
2888	struct inode *tl_inode = osb->osb_tl_inode;
2889
2890	mlog_entry_void();
2891
2892	if (tl_inode) {
2893		cancel_delayed_work(&osb->osb_truncate_log_wq);
2894		flush_workqueue(ocfs2_wq);
2895
2896		status = ocfs2_flush_truncate_log(osb);
2897		if (status < 0)
2898			mlog_errno(status);
2899
2900		brelse(osb->osb_tl_bh);
2901		iput(osb->osb_tl_inode);
2902	}
2903
2904	mlog_exit_void();
2905}
2906
2907int ocfs2_truncate_log_init(struct ocfs2_super *osb)
2908{
2909	int status;
2910	struct inode *tl_inode = NULL;
2911	struct buffer_head *tl_bh = NULL;
2912
2913	mlog_entry_void();
2914
2915	status = ocfs2_get_truncate_log_info(osb,
2916					     osb->slot_num,
2917					     &tl_inode,
2918					     &tl_bh);
2919	if (status < 0)
2920		mlog_errno(status);
2921
2922	/* ocfs2_truncate_log_shutdown keys on the existence of
2923	 * osb->osb_tl_inode so we don't set any of the osb variables
2924	 * until we're sure all is well. */
2925	INIT_DELAYED_WORK(&osb->osb_truncate_log_wq,
2926			  ocfs2_truncate_log_worker);
2927	osb->osb_tl_bh    = tl_bh;
2928	osb->osb_tl_inode = tl_inode;
2929
2930	mlog_exit(status);
2931	return status;
2932}
2933
2934/* This function will figure out whether the currently last extent
2935 * block will be deleted, and if it will, what the new last extent
2936 * block will be so we can update its h_next_leaf_blk field, as well
2937 * as the dinode's i_last_eb_blk */
2938static int ocfs2_find_new_last_ext_blk(struct inode *inode,
2939				       unsigned int clusters_to_del,
2940				       struct ocfs2_path *path,
2941				       struct buffer_head **new_last_eb)
2942{
2943	int next_free, ret = 0;
2944	u32 cpos;
2945	struct ocfs2_extent_rec *rec;
2946	struct ocfs2_extent_block *eb;
2947	struct ocfs2_extent_list *el;
2948	struct buffer_head *bh = NULL;
2949
2950	*new_last_eb = NULL;
2951
2952	/* we have no tree, so of course, no last_eb. */
2953	if (!path->p_tree_depth)
2954		goto out;
2955
2956	/* trunc to zero special case - this makes tree_depth = 0
2957	 * regardless of what it is.  */
2958	if (OCFS2_I(inode)->ip_clusters == clusters_to_del)
2959		goto out;
2960
2961	el = path_leaf_el(path);
2962	BUG_ON(!el->l_next_free_rec);
2963
2964	/*
2965	 * Make sure that this extent list will actually be empty
2966	 * after we clear away the data. We can shortcut out if
2967	 * there's more than one non-empty extent in the
2968	 * list. Otherwise, a check of the remaining extent is
2969	 * necessary.
2970	 */
2971	next_free = le16_to_cpu(el->l_next_free_rec);
2972	rec = NULL;
2973	if (ocfs2_is_empty_extent(&el->l_recs[0])) {
2974		if (next_free > 2)
2975			goto out;
2976
2977		/* We may have a valid extent in index 1, check it. */
2978		if (next_free == 2)
2979			rec = &el->l_recs[1];
2980
2981		/*
2982		 * Fall through - no more nonempty extents, so we want
2983		 * to delete this leaf.
2984		 */
2985	} else {
2986		if (next_free > 1)
2987			goto out;
2988
2989		rec = &el->l_recs[0];
2990	}
2991
2992	if (rec) {
2993		/*
2994		 * Check if we'll only be trimming off the end of this
2995		 * cluster.
2996		 */
2997		if (le16_to_cpu(rec->e_leaf_clusters) > clusters_to_del)
2998			goto out;
2999	}
3000
3001	ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
3002	if (ret) {
3003		mlog_errno(ret);
3004		goto out;
3005	}
3006
3007	ret = ocfs2_find_leaf(inode, path_root_el(path), cpos, &bh);
3008	if (ret) {
3009		mlog_errno(ret);
3010		goto out;
3011	}
3012
3013	eb = (struct ocfs2_extent_block *) bh->b_data;
3014	el = &eb->h_list;
3015	if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
3016		OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
3017		ret = -EROFS;
3018		goto out;
3019	}
3020
3021	*new_last_eb = bh;
3022	get_bh(*new_last_eb);
3023	mlog(0, "returning block %llu, (cpos: %u)\n",
3024	     (unsigned long long)le64_to_cpu(eb->h_blkno), cpos);
3025out:
3026	brelse(bh);
3027
3028	return ret;
3029}
3030
3031/*
3032 * Trim some clusters off the rightmost edge of a tree. Only called
3033 * during truncate.
3034 *
3035 * The caller needs to:
3036 *   - start journaling of each path component.
3037 *   - compute and fully set up any new last ext block
3038 */
3039static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path,
3040			   handle_t *handle, struct ocfs2_truncate_context *tc,
3041			   u32 clusters_to_del, u64 *delete_start)
3042{
3043	int ret, i, index = path->p_tree_depth;
3044	u32 new_edge = 0;
3045	u64 deleted_eb = 0;
3046	struct buffer_head *bh;
3047	struct ocfs2_extent_list *el;
3048	struct ocfs2_extent_rec *rec;
3049
3050	*delete_start = 0;
3051
3052	while (index >= 0) {
3053		bh = path->p_node[index].bh;
3054		el = path->p_node[index].el;
3055
3056		mlog(0, "traveling tree (index = %d, block = %llu)\n",
3057		     index,  (unsigned long long)bh->b_blocknr);
3058
3059		BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
3060
3061		if (index !=
3062		    (path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) {
3063			ocfs2_error(inode->i_sb,
3064				    "Inode %lu has invalid ext. block %llu",
3065				    inode->i_ino,
3066				    (unsigned long long)bh->b_blocknr);
3067			ret = -EROFS;
3068			goto out;
3069		}
3070
3071find_tail_record:
3072		i = le16_to_cpu(el->l_next_free_rec) - 1;
3073		rec = &el->l_recs[i];
3074
3075		mlog(0, "Extent list before: record %d: (%u, %u, %llu), "
3076		     "next = %u\n", i, le32_to_cpu(rec->e_cpos),
3077		     ocfs2_rec_clusters(el, rec),
3078		     (unsigned long long)le64_to_cpu(rec->e_blkno),
3079		     le16_to_cpu(el->l_next_free_rec));
3080
3081		BUG_ON(ocfs2_rec_clusters(el, rec) < clusters_to_del);
3082
3083		if (le16_to_cpu(el->l_tree_depth) == 0) {
3084			/*
3085			 * If the leaf block contains nothing but a
3086			 * single empty extent, we can just remove
3087			 * the block.
3088			 */
3089			if (i == 0 && ocfs2_is_empty_extent(rec)) {
3090				memset(rec, 0,
3091				       sizeof(struct ocfs2_extent_rec));
3092				el->l_next_free_rec = cpu_to_le16(0);
3093
3094				goto delete;
3095			}
3096
3097			/*
3098			 * Remove any empty extents by shifting things
3099			 * left. That should make life much easier on
3100			 * the code below. This condition is rare
3101			 * enough that we shouldn't see a performance
3102			 * hit.
3103			 */
3104			if (ocfs2_is_empty_extent(&el->l_recs[0])) {
3105				le16_add_cpu(&el->l_next_free_rec, -1);
3106
3107				for(i = 0;
3108				    i < le16_to_cpu(el->l_next_free_rec); i++)
3109					el->l_recs[i] = el->l_recs[i + 1];
3110
3111				memset(&el->l_recs[i], 0,
3112				       sizeof(struct ocfs2_extent_rec));
3113
3114				/*
3115				 * We've modified our extent list. The
3116				 * simplest way to handle this change
3117			 * is to begin the search from the
3118				 * start again.
3119				 */
3120				goto find_tail_record;
3121			}
3122
3123			le16_add_cpu(&rec->e_leaf_clusters, -clusters_to_del);
3124
3125			/*
3126			 * We'll use "new_edge" on our way back up the
3127			 * tree to know what our rightmost cpos is.
3128			 */
3129			new_edge = le16_to_cpu(rec->e_leaf_clusters);
3130			new_edge += le32_to_cpu(rec->e_cpos);
3131
3132			/*
3133			 * The caller will use this to delete data blocks.
3134			 */
3135			*delete_start = le64_to_cpu(rec->e_blkno)
3136				+ ocfs2_clusters_to_blocks(inode->i_sb,
3137					le16_to_cpu(rec->e_leaf_clusters));
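			/*
			 * Worked example with made-up numbers: if the
			 * tail record was (e_cpos = 64, 16 clusters)
			 * and clusters_to_del = 4, it now holds 12
			 * clusters, new_edge = 64 + 12 = 76, and
			 * *delete_start points at the first block past
			 * those remaining 12 clusters.
			 */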
3138
3139			/*
3140			 * If it's now empty, remove this record.
3141			 */
3142			if (le16_to_cpu(rec->e_leaf_clusters) == 0) {
3143				memset(rec, 0,
3144				       sizeof(struct ocfs2_extent_rec));
3145				le16_add_cpu(&el->l_next_free_rec, -1);
3146			}
3147		} else {
3148			if (le64_to_cpu(rec->e_blkno) == deleted_eb) {
3149				memset(rec, 0,
3150				       sizeof(struct ocfs2_extent_rec));
3151				le16_add_cpu(&el->l_next_free_rec, -1);
3152
3153				goto delete;
3154			}
3155
3156			/* Can this actually happen? */
3157			if (le16_to_cpu(el->l_next_free_rec) == 0)
3158				goto delete;
3159
3160			/*
3161			 * We never actually deleted any clusters
3162			 * because our leaf was empty. There's no
3163			 * reason to adjust the rightmost edge then.
3164			 */
3165			if (new_edge == 0)
3166				goto delete;
3167
3168			rec->e_int_clusters = cpu_to_le32(new_edge);
3169			le32_add_cpu(&rec->e_int_clusters,
3170				     -le32_to_cpu(rec->e_cpos));
3171
3172			 /*
3173			  * A deleted child record should have been
3174			  * caught above.
3175			  */
3176			 BUG_ON(le32_to_cpu(rec->e_int_clusters) == 0);
3177		}
3178
3179delete:
3180		ret = ocfs2_journal_dirty(handle, bh);
3181		if (ret) {
3182			mlog_errno(ret);
3183			goto out;
3184		}
3185
3186		mlog(0, "extent list container %llu, after: record %d: "
3187		     "(%u, %u, %llu), next = %u.\n",
3188		     (unsigned long long)bh->b_blocknr, i,
3189		     le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec),
3190		     (unsigned long long)le64_to_cpu(rec->e_blkno),
3191		     le16_to_cpu(el->l_next_free_rec));
3192
3193		/*
3194		 * We must be careful to only attempt delete of an
3195		 * extent block (and not the root inode block).
3196		 */
3197		if (index > 0 && le16_to_cpu(el->l_next_free_rec) == 0) {
3198			struct ocfs2_extent_block *eb =
3199				(struct ocfs2_extent_block *)bh->b_data;
3200
3201			/*
3202			 * Save this for use when processing the
3203			 * parent block.
3204			 */
3205			deleted_eb = le64_to_cpu(eb->h_blkno);
3206
3207			mlog(0, "deleting this extent block.\n");
3208
3209			ocfs2_remove_from_cache(inode, bh);
3210
3211			BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0]));
3212			BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos));
3213			BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno));
3214
3215			if (le16_to_cpu(eb->h_suballoc_slot) == 0) {
3216				/*
3217				 * This code only understands how to
3218				 * lock the suballocator in slot 0,
3219				 * which is fine because allocation is
3220				 * only ever done out of that
3221				 * suballocator too. A future version
3222				 * might change that however, so avoid
3223				 * a free if we don't know how to
3224				 * handle it. This way an fs incompat
3225				 * bit will not be necessary.
3226				 */
3227				ret = ocfs2_free_extent_block(handle,
3228							      tc->tc_ext_alloc_inode,
3229							      tc->tc_ext_alloc_bh,
3230							      eb);
3231
3232				/* An error here is not fatal. */
3233				if (ret < 0)
3234					mlog_errno(ret);
3235			}
3236		} else {
3237			deleted_eb = 0;
3238		}
3239
3240		index--;
3241	}
3242
3243	ret = 0;
3244out:
3245	return ret;
3246}
3247
3248static int ocfs2_do_truncate(struct ocfs2_super *osb,
3249			     unsigned int clusters_to_del,
3250			     struct inode *inode,
3251			     struct buffer_head *fe_bh,
3252			     handle_t *handle,
3253			     struct ocfs2_truncate_context *tc,
3254			     struct ocfs2_path *path)
3255{
3256	int status;
3257	struct ocfs2_dinode *fe;
3258	struct ocfs2_extent_block *last_eb = NULL;
3259	struct ocfs2_extent_list *el;
3260	struct buffer_head *last_eb_bh = NULL;
3261	u64 delete_blk = 0;
3262
3263	fe = (struct ocfs2_dinode *) fe_bh->b_data;
3264
3265	status = ocfs2_find_new_last_ext_blk(inode, clusters_to_del,
3266					     path, &last_eb_bh);
3267	if (status < 0) {
3268		mlog_errno(status);
3269		goto bail;
3270	}
3271
3272	/*
3273	 * Each component will be touched, so we might as well journal
3274	 * here to avoid having to handle errors later.
3275	 */
3276	status = ocfs2_journal_access_path(inode, handle, path);
3277	if (status < 0) {
3278		mlog_errno(status);
3279		goto bail;
3280	}
3281
3282	if (last_eb_bh) {
3283		status = ocfs2_journal_access(handle, inode, last_eb_bh,
3284					      OCFS2_JOURNAL_ACCESS_WRITE);
3285		if (status < 0) {
3286			mlog_errno(status);
3287			goto bail;
3288		}
3289
3290		last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
3291	}
3292
3293	el = &(fe->id2.i_list);
3294
3295	/*
3296	 * Lower levels depend on this never happening, but it's best
3297	 * to check it up here before changing the tree.
3298	 */
3299	if (el->l_tree_depth && el->l_recs[0].e_int_clusters == 0) {
3300		ocfs2_error(inode->i_sb,
3301			    "Inode %lu has an empty extent record, depth %u\n",
3302			    inode->i_ino, le16_to_cpu(el->l_tree_depth));
3303		status = -EROFS;
3304		goto bail;
3305	}
3306
3307	spin_lock(&OCFS2_I(inode)->ip_lock);
3308	OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
3309				      clusters_to_del;
3310	spin_unlock(&OCFS2_I(inode)->ip_lock);
3311	le32_add_cpu(&fe->i_clusters, -clusters_to_del);
3312
3313	status = ocfs2_trim_tree(inode, path, handle, tc,
3314				 clusters_to_del, &delete_blk);
3315	if (status) {
3316		mlog_errno(status);
3317		goto bail;
3318	}
3319
3320	if (le32_to_cpu(fe->i_clusters) == 0) {
3321		/* trunc to zero is a special case. */
3322		el->l_tree_depth = 0;
3323		fe->i_last_eb_blk = 0;
3324	} else if (last_eb)
3325		fe->i_last_eb_blk = last_eb->h_blkno;
3326
3327	status = ocfs2_journal_dirty(handle, fe_bh);
3328	if (status < 0) {
3329		mlog_errno(status);
3330		goto bail;
3331	}
3332
3333	if (last_eb) {
3334		/* If there will be a new last extent block, then by
3335		 * definition, there cannot be any leaves to the right of
3336		 * it. */
3337		last_eb->h_next_leaf_blk = 0;
3338		status = ocfs2_journal_dirty(handle, last_eb_bh);
3339		if (status < 0) {
3340			mlog_errno(status);
3341			goto bail;
3342		}
3343	}
3344
3345	if (delete_blk) {
3346		status = ocfs2_truncate_log_append(osb, handle, delete_blk,
3347						   clusters_to_del);
3348		if (status < 0) {
3349			mlog_errno(status);
3350			goto bail;
3351		}
3352	}
3353	status = 0;
3354bail:
3355
3356	mlog_exit(status);
3357	return status;
3358}
3359
3360static int ocfs2_writeback_zero_func(handle_t *handle, struct buffer_head *bh)
3361{
3362	set_buffer_uptodate(bh);
3363	mark_buffer_dirty(bh);
3364	return 0;
3365}
3366
3367static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh)
3368{
3369	set_buffer_uptodate(bh);
3370	mark_buffer_dirty(bh);
3371	return ocfs2_journal_dirty_data(handle, bh);
3372}
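/*
 * The two zeroing callbacks above differ only in journaling: in
 * ordered mode the zeroed buffers are also handed to the journal via
 * ocfs2_journal_dirty_data(), so the zeroes reach disk before the
 * transaction commits, while writeback mode just dirties them.
 */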
3373
3374static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t isize,
3375				     struct page **pages, int numpages,
3376				     u64 phys, handle_t *handle)
3377{
3378	int i, ret, partial = 0;
3379	void *kaddr;
3380	struct page *page;
3381	unsigned int from, to = PAGE_CACHE_SIZE;
3382	struct super_block *sb = inode->i_sb;
3383
3384	BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
3385
3386	if (numpages == 0)
3387		goto out;
3388
3389	from = isize & (PAGE_CACHE_SIZE - 1); /* 1st page offset */
3390	if (PAGE_CACHE_SHIFT > OCFS2_SB(sb)->s_clustersize_bits) {
3391		/*
3392		 * Since 'from' has been capped to a value below page
3393		 * size, this calculation won't be able to overflow
3394		 * 'to'
3395		 */
3396		to = ocfs2_align_bytes_to_clusters(sb, from);
3397
3398		/*
3399		 * The truncate tail in this case should never span
3400		 * more than one page. The loop below also
3401		 * assumes this.
3402		 */
3403		BUG_ON(numpages != 1);
3404	}
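	/*
	 * Illustration of the branch above (hypothetical geometry): with
	 * 64KB pages and 4KB clusters, from = isize & (PAGE_CACHE_SIZE - 1)
	 * might be 5000, and 'to' is rounded up to the next cluster
	 * boundary, 8192 -- so only bytes 5000..8191 of the single tail
	 * page get zeroed.
	 */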
3405
3406	for(i = 0; i < numpages; i++) {
3407		page = pages[i];
3408
3409		BUG_ON(from > PAGE_CACHE_SIZE);
3410		BUG_ON(to > PAGE_CACHE_SIZE);
3411
3412		ret = ocfs2_map_page_blocks(page, &phys, inode, from, to, 0);
3413		if (ret)
3414			mlog_errno(ret);
3415
3416		kaddr = kmap_atomic(page, KM_USER0);
3417		memset(kaddr + from, 0, to - from);
3418		kunmap_atomic(kaddr, KM_USER0);
3419
3420		/*
3421		 * Need to set the buffers we zero'd into uptodate
3422		 * here if they aren't - ocfs2_map_page_blocks()
3423		 * might've skipped some
3424		 */
3425		if (ocfs2_should_order_data(inode)) {
3426			ret = walk_page_buffers(handle,
3427						page_buffers(page),
3428						from, to, &partial,
3429						ocfs2_ordered_zero_func);
3430			if (ret < 0)
3431				mlog_errno(ret);
3432		} else {
3433			ret = walk_page_buffers(handle, page_buffers(page),
3434						from, to, &partial,
3435						ocfs2_writeback_zero_func);
3436			if (ret < 0)
3437				mlog_errno(ret);
3438		}
3439
3440		if (!partial)
3441			SetPageUptodate(page);
3442
3443		flush_dcache_page(page);
3444
3445		/*
3446		 * Every page after the 1st one should be completely zero'd.
3447		 */
3448		from = 0;
3449	}
3450out:
3451	if (pages) {
3452		for (i = 0; i < numpages; i++) {
3453			page = pages[i];
3454			unlock_page(page);
3455			mark_page_accessed(page);
3456			page_cache_release(page);
3457		}
3458	}
3459}
3460
3461static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page **pages,
3462				int *num, u64 *phys)
3463{
3464	int i, numpages = 0, ret = 0;
3465	unsigned int csize = OCFS2_SB(inode->i_sb)->s_clustersize;
3466	unsigned int ext_flags;
3467	struct super_block *sb = inode->i_sb;
3468	struct address_space *mapping = inode->i_mapping;
3469	unsigned long index;
3470	u64 next_cluster_bytes;
3471
3472	BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
3473
3474	/* Cluster boundary, so we don't need to grab any pages. */
3475	if ((isize & (csize - 1)) == 0)
3476		goto out;
3477
3478	ret = ocfs2_extent_map_get_blocks(inode, isize >> sb->s_blocksize_bits,
3479					  phys, NULL, &ext_flags);
3480	if (ret) {
3481		mlog_errno(ret);
3482		goto out;
3483	}
3484
3485	/* Tail is a hole. */
3486	if (*phys == 0)
3487		goto out;
3488
3489	/* Tail is marked as unwritten; we can count on the write to
3490	 * zero it in that case. */
3491	if (ext_flags & OCFS2_EXT_UNWRITTEN)
3492		goto out;
3493
3494	next_cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, isize);
3495	index = isize >> PAGE_CACHE_SHIFT;
3496	do {
3497		pages[numpages] = grab_cache_page(mapping, index);
3498		if (!pages[numpages]) {
3499			ret = -ENOMEM;
3500			mlog_errno(ret);
3501			goto out;
3502		}
3503
3504		numpages++;
3505		index++;
3506	} while (index < (next_cluster_bytes >> PAGE_CACHE_SHIFT));
3507
3508out:
3509	if (ret != 0) {
3510		if (pages) {
3511			for (i = 0; i < numpages; i++) {
3512				if (pages[i]) {
3513					unlock_page(pages[i]);
3514					page_cache_release(pages[i]);
3515				}
3516			}
3517		}
3518		numpages = 0;
3519	}
3520
3521	*num = numpages;
3522
3523	return ret;
3524}
3525
3526/*
3527 * Zero the area past i_size but still within an allocated
3528 * cluster. This avoids exposing nonzero data on subsequent file
3529 * extends.
3530 *
3531 * We need to call this before i_size is updated on the inode because
3532 * otherwise block_write_full_page() will skip writeout of pages past
3533 * i_size. The new_i_size parameter is passed for this reason.
3534 */
3535int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
3536				 u64 new_i_size)
3537{
3538	int ret, numpages;
3539	loff_t endbyte;
3540	struct page **pages = NULL;
3541	u64 phys;
3542
3543	/*
3544	 * File systems which don't support sparse files zero on every
3545	 * extend.
3546	 */
3547	if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
3548		return 0;
3549
3550	pages = kcalloc(ocfs2_pages_per_cluster(inode->i_sb),
3551			sizeof(struct page *), GFP_NOFS);
3552	if (pages == NULL) {
3553		ret = -ENOMEM;
3554		mlog_errno(ret);
3555		goto out;
3556	}
3557
3558	ret = ocfs2_grab_eof_pages(inode, new_i_size, pages, &numpages, &phys);
3559	if (ret) {
3560		mlog_errno(ret);
3561		goto out;
3562	}
3563
3564	if (numpages == 0)
3565		goto out;
3566
3567	ocfs2_zero_cluster_pages(inode, new_i_size, pages, numpages, phys,
3568				 handle);
3569
3570	/*
3571	 * Initiate writeout of the pages we zero'd here. We don't
3572	 * wait on them - the truncate_inode_pages() call later will
3573	 * do that for us.
3574	 */
3575	endbyte = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size);
3576	ret = do_sync_mapping_range(inode->i_mapping, new_i_size,
3577				    endbyte - 1, SYNC_FILE_RANGE_WRITE);
3578	if (ret)
3579		mlog_errno(ret);
3580
3581out:
3582	if (pages)
3583		kfree(pages);
3584
3585	return ret;
3586}
3587
3588/*
3589 * It is expected that, by the time you call this function,
3590 * inode->i_size and fe->i_size have been adjusted.
3591 *
3592 * WARNING: This will kfree the truncate context
3593 */
3594int ocfs2_commit_truncate(struct ocfs2_super *osb,
3595			  struct inode *inode,
3596			  struct buffer_head *fe_bh,
3597			  struct ocfs2_truncate_context *tc)
3598{
3599	int status, i, credits, tl_sem = 0;
3600	u32 clusters_to_del, new_highest_cpos, range;
3601	struct ocfs2_extent_list *el;
3602	handle_t *handle = NULL;
3603	struct inode *tl_inode = osb->osb_tl_inode;
3604	struct ocfs2_path *path = NULL;
3605
3606	mlog_entry_void();
3607
3608	down_write(&OCFS2_I(inode)->ip_alloc_sem);
3609
3610	new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
3611						     i_size_read(inode));
3612
3613	path = ocfs2_new_inode_path(fe_bh);
3614	if (!path) {
3615		status = -ENOMEM;
3616		mlog_errno(status);
3617		goto bail;
3618	}
3619
3620	ocfs2_extent_map_trunc(inode, new_highest_cpos);
3621
3622start:
3623	/*
3624	 * Check that we still have allocation to delete.
3625	 */
3626	if (OCFS2_I(inode)->ip_clusters == 0) {
3627		status = 0;
3628		goto bail;
3629	}
3630
3631	/*
3632	 * Truncate always works against the rightmost tree branch.
3633	 */
3634	status = ocfs2_find_path(inode, path, UINT_MAX);
3635	if (status) {
3636		mlog_errno(status);
3637		goto bail;
3638	}
3639
3640	mlog(0, "inode->ip_clusters = %u, tree_depth = %u\n",
3641	     OCFS2_I(inode)->ip_clusters, path->p_tree_depth);
3642
3643	/*
3644	 * By now, el will point to the extent list on the bottom most
3645	 * portion of this tree. Only the tail record is considered in
3646	 * each pass.
3647	 *
3648	 * We handle the following cases, in order:
3649	 * - empty extent: delete the remaining branch
3650	 * - remove the entire record
3651	 * - remove a partial record
3652	 * - no record needs to be removed (truncate has completed)
3653	 */
3654	el = path_leaf_el(path);
3655	if (le16_to_cpu(el->l_next_free_rec) == 0) {
3656		ocfs2_error(inode->i_sb,
3657			    "Inode %llu has empty extent block at %llu\n",
3658			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
3659			    (unsigned long long)path_leaf_bh(path)->b_blocknr);
3660		status = -EROFS;
3661		goto bail;
3662	}
3663
3664	i = le16_to_cpu(el->l_next_free_rec) - 1;
3665	range = le32_to_cpu(el->l_recs[i].e_cpos) +
3666		ocfs2_rec_clusters(el, &el->l_recs[i]);
3667	if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) {
3668		clusters_to_del = 0;
3669	} else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) {
3670		clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]);
3671	} else if (range > new_highest_cpos) {
3672		clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) +
3673				   le32_to_cpu(el->l_recs[i].e_cpos)) -
3674				  new_highest_cpos;
3675	} else {
3676		status = 0;
3677		goto bail;
3678	}
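	/*
	 * Example of the partial case (hypothetical values): a tail
	 * record of (e_cpos = 100, 20 clusters) gives range = 120; with
	 * new_highest_cpos = 110 we trim (20 + 100) - 110 = 10 clusters
	 * in this pass.
	 */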
3679
3680	mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n",
3681	     clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr);
3682
3683	BUG_ON(clusters_to_del == 0);
3684
3685	mutex_lock(&tl_inode->i_mutex);
3686	tl_sem = 1;
3687	/* The append below needs at least one free truncate log
3688	 * record. If the log is full, flush it now so we start from
3689	 * an empty truncate log.  */
3690	if (ocfs2_truncate_log_needs_flush(osb)) {
3691		status = __ocfs2_flush_truncate_log(osb);
3692		if (status < 0) {
3693			mlog_errno(status);
3694			goto bail;
3695		}
3696	}
3697
3698	credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
3699						(struct ocfs2_dinode *)fe_bh->b_data,
3700						el);
3701	handle = ocfs2_start_trans(osb, credits);
3702	if (IS_ERR(handle)) {
3703		status = PTR_ERR(handle);
3704		handle = NULL;
3705		mlog_errno(status);
3706		goto bail;
3707	}
3708
3709	status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle,
3710				   tc, path);
3711	if (status < 0) {
3712		mlog_errno(status);
3713		goto bail;
3714	}
3715
3716	mutex_unlock(&tl_inode->i_mutex);
3717	tl_sem = 0;
3718
3719	ocfs2_commit_trans(osb, handle);
3720	handle = NULL;
3721
3722	ocfs2_reinit_path(path, 1);
3723
3724	/*
3725	 * The check above will catch the case where we've truncated
3726	 * away all allocation.
3727	 */
3728	goto start;
3729
3730bail:
3731	up_write(&OCFS2_I(inode)->ip_alloc_sem);
3732
3733	ocfs2_schedule_truncate_log_flush(osb, 1);
3734
3735	if (tl_sem)
3736		mutex_unlock(&tl_inode->i_mutex);
3737
3738	if (handle)
3739		ocfs2_commit_trans(osb, handle);
3740
3741	ocfs2_free_path(path);
3742
3743	/* This will drop the ext_alloc cluster lock for us */
3744	ocfs2_free_truncate_context(tc);
3745
3746	mlog_exit(status);
3747	return status;
3748}
3749
3750/*
3751 * Expects the inode to already be locked. This will figure out which
3752 * inodes need to be locked and will put them on the returned truncate
3753 * context.
3754 */
3755int ocfs2_prepare_truncate(struct ocfs2_super *osb,
3756			   struct inode *inode,
3757			   struct buffer_head *fe_bh,
3758			   struct ocfs2_truncate_context **tc)
3759{
3760	int status, metadata_delete, i;
3761	unsigned int new_i_clusters;
3762	struct ocfs2_dinode *fe;
3763	struct ocfs2_extent_block *eb;
3764	struct ocfs2_extent_list *el;
3765	struct buffer_head *last_eb_bh = NULL;
3766	struct inode *ext_alloc_inode = NULL;
3767	struct buffer_head *ext_alloc_bh = NULL;
3768
3769	mlog_entry_void();
3770
3771	*tc = NULL;
3772
3773	new_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
3774						  i_size_read(inode));
3775	fe = (struct ocfs2_dinode *) fe_bh->b_data;
3776
3777	mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size ="
3778	     "%llu\n", le32_to_cpu(fe->i_clusters), new_i_clusters,
3779	     (unsigned long long)le64_to_cpu(fe->i_size));
3780
3781	*tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
3782	if (!(*tc)) {
3783		status = -ENOMEM;
3784		mlog_errno(status);
3785		goto bail;
3786	}
3787
3788	metadata_delete = 0;
3789	if (fe->id2.i_list.l_tree_depth) {
3790		/* If we have a tree, then the truncate may result in
3791		 * metadata deletes. Figure this out from the
3792		 * rightmost leaf block.*/
3793		status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
3794					  &last_eb_bh, OCFS2_BH_CACHED, inode);
3795		if (status < 0) {
3796			mlog_errno(status);
3797			goto bail;
3798		}
3799		eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
3800		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
3801			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
3802
3803			brelse(last_eb_bh);
3804			status = -EIO;
3805			goto bail;
3806		}
3807		el = &(eb->h_list);
3808
3809		i = 0;
3810		if (ocfs2_is_empty_extent(&el->l_recs[0]))
3811			i = 1;
3812		if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_i_clusters)
3813			metadata_delete = 1;
3814	}
3815
3816	(*tc)->tc_last_eb_bh = last_eb_bh;
3817
3818	if (metadata_delete) {
3819		mlog(0, "Will have to delete metadata for this trunc. "
3820		     "locking allocator.\n");
3821		ext_alloc_inode = ocfs2_get_system_file_inode(osb, EXTENT_ALLOC_SYSTEM_INODE, 0);
3822		if (!ext_alloc_inode) {
3823			status = -ENOMEM;
3824			mlog_errno(status);
3825			goto bail;
3826		}
3827
3828		mutex_lock(&ext_alloc_inode->i_mutex);
3829		(*tc)->tc_ext_alloc_inode = ext_alloc_inode;
3830
3831		status = ocfs2_meta_lock(ext_alloc_inode, &ext_alloc_bh, 1);
3832		if (status < 0) {
3833			mlog_errno(status);
3834			goto bail;
3835		}
3836		(*tc)->tc_ext_alloc_bh = ext_alloc_bh;
3837		(*tc)->tc_ext_alloc_locked = 1;
3838	}
3839
3840	status = 0;
3841bail:
3842	if (status < 0) {
3843		if (*tc)
3844			ocfs2_free_truncate_context(*tc);
3845		*tc = NULL;
3846	}
3847	mlog_exit_void();
3848	return status;
3849}
3850
3851static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
3852{
3853	if (tc->tc_ext_alloc_inode) {
3854		if (tc->tc_ext_alloc_locked)
3855			ocfs2_meta_unlock(tc->tc_ext_alloc_inode, 1);
3856
3857		mutex_unlock(&tc->tc_ext_alloc_inode->i_mutex);
3858		iput(tc->tc_ext_alloc_inode);
3859	}
3860
3861	if (tc->tc_ext_alloc_bh)
3862		brelse(tc->tc_ext_alloc_bh);
3863
3864	if (tc->tc_last_eb_bh)
3865		brelse(tc->tc_last_eb_bh);
3866
3867	kfree(tc);
3868}
3869