/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_types.h"
#include "xfs_bit.h"
#include "xfs_log.h"
#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_dir2.h"
#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_da_btree.h"
#include "xfs_bmap_btree.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc_btree.h"
#include "xfs_dir2_sf.h"
#include "xfs_attr_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_inode_item.h"
#include "xfs_alloc.h"
#include "xfs_btree.h"
#include "xfs_bmap.h"
#include "xfs_attr.h"
#include "xfs_attr_leaf.h"
#include "xfs_dir2_data.h"
#include "xfs_dir2_leaf.h"
#include "xfs_dir2_block.h"
#include "xfs_dir2_node.h"
#include "xfs_error.h"

/*
 * xfs_da_btree.c
 *
 * Routines to implement directories as Btrees of hashed names.
 */
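
/*
 * For orientation, a rough sketch of the layout this file manages (see
 * xfs_da_btree.h for the real structures): intermediate nodes hold
 * sorted arrays of { hashval, before } entries, where "before" is the
 * dablk number of a child and "hashval" is the highest hashval in that
 * child's subtree.  Blocks at each level are chained through
 * hdr.info.forw/back:
 *
 *                  [ root: node, level 2 ]
 *                   /                   \
 *    [ node, level 1 ] <--forw/back--> [ node, level 1 ]
 *       /        \                        /        \
 *   [ leaf ] <-> [ leaf ]      <->    [ leaf ] <-> [ leaf ]
 */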

/*========================================================================
 * Function prototypes for the kernel.
 *========================================================================*/

/*
 * Routines used for growing the Btree.
 */
STATIC int xfs_da_root_split(xfs_da_state_t *state,
					    xfs_da_state_blk_t *existing_root,
					    xfs_da_state_blk_t *new_child);
STATIC int xfs_da_node_split(xfs_da_state_t *state,
					    xfs_da_state_blk_t *existing_blk,
					    xfs_da_state_blk_t *split_blk,
					    xfs_da_state_blk_t *blk_to_add,
					    int treelevel,
					    int *result);
STATIC void xfs_da_node_rebalance(xfs_da_state_t *state,
					 xfs_da_state_blk_t *node_blk_1,
					 xfs_da_state_blk_t *node_blk_2);
STATIC void xfs_da_node_add(xfs_da_state_t *state,
				   xfs_da_state_blk_t *old_node_blk,
				   xfs_da_state_blk_t *new_node_blk);

/*
 * Routines used for shrinking the Btree.
 */
STATIC int xfs_da_root_join(xfs_da_state_t *state,
					   xfs_da_state_blk_t *root_blk);
STATIC int xfs_da_node_toosmall(xfs_da_state_t *state, int *retval);
STATIC void xfs_da_node_remove(xfs_da_state_t *state,
					      xfs_da_state_blk_t *drop_blk);
STATIC void xfs_da_node_unbalance(xfs_da_state_t *state,
					 xfs_da_state_blk_t *src_node_blk,
					 xfs_da_state_blk_t *dst_node_blk);

/*
 * Utility routines.
 */
STATIC uint	xfs_da_node_lasthash(xfs_dabuf_t *bp, int *count);
STATIC int	xfs_da_node_order(xfs_dabuf_t *node1_bp, xfs_dabuf_t *node2_bp);
STATIC xfs_dabuf_t *xfs_da_buf_make(int nbuf, xfs_buf_t **bps, inst_t *ra);
STATIC int	xfs_da_blk_unlink(xfs_da_state_t *state,
				  xfs_da_state_blk_t *drop_blk,
				  xfs_da_state_blk_t *save_blk);
STATIC void	xfs_da_state_kill_altpath(xfs_da_state_t *state);

/*========================================================================
 * Routines used for growing the Btree.
 *========================================================================*/

/*
 * Create the initial contents of an intermediate node.
 */
int
xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level,
				 xfs_dabuf_t **bpp, int whichfork)
{
	xfs_da_intnode_t *node;
	xfs_dabuf_t *bp;
	int error;
	xfs_trans_t *tp;

	tp = args->trans;
	error = xfs_da_get_buf(tp, args->dp, blkno, -1, &bp, whichfork);
	if (error)
		return(error);
	ASSERT(bp != NULL);
	node = bp->data;
	node->hdr.info.forw = 0;
	node->hdr.info.back = 0;
	node->hdr.info.magic = cpu_to_be16(XFS_DA_NODE_MAGIC);
	node->hdr.info.pad = 0;
	node->hdr.count = 0;
	node->hdr.level = cpu_to_be16(level);

	xfs_da_log_buf(tp, bp,
		XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr)));

	*bpp = bp;
	return(0);
}

/*
 * Split a leaf node, rebalance, then possibly split
 * intermediate nodes, rebalance, etc.
 */
int							/* error */
xfs_da_split(xfs_da_state_t *state)
{
	xfs_da_state_blk_t *oldblk, *newblk, *addblk;
	xfs_da_intnode_t *node;
	xfs_dabuf_t *bp;
	int max, action, error, i;

	/*
	 * Walk back up the tree splitting/inserting/adjusting as necessary.
	 * If we need to insert and there isn't room, split the node, then
	 * decide which fragment to insert the new block from below into.
	 * Note that we may split the root this way, but we need more fixup.
	 */
	max = state->path.active - 1;
	ASSERT((max >= 0) && (max < XFS_DA_NODE_MAXDEPTH));
	ASSERT(state->path.blk[max].magic == XFS_ATTR_LEAF_MAGIC ||
	       state->path.blk[max].magic == XFS_DIR2_LEAFN_MAGIC);

	addblk = &state->path.blk[max];		/* initial dummy value */
	for (i = max; (i >= 0) && addblk; state->path.active--, i--) {
		oldblk = &state->path.blk[i];
		newblk = &state->altpath.blk[i];

		/*
		 * If a leaf node then
		 *     Allocate a new leaf node, then rebalance across them.
		 * else if an intermediate node then
		 *     We split on the last layer, must we split the node?
		 */
		switch (oldblk->magic) {
		case XFS_ATTR_LEAF_MAGIC:
			error = xfs_attr_leaf_split(state, oldblk, newblk);
			if ((error != 0) && (error != ENOSPC)) {
				return(error);	/* GROT: attr is inconsistent */
			}
			if (!error) {
				addblk = newblk;
				break;
			}
			/*
			 * Entry wouldn't fit, split the leaf again.
			 */
			state->extravalid = 1;
			if (state->inleaf) {
				state->extraafter = 0;	/* before newblk */
				error = xfs_attr_leaf_split(state, oldblk,
							    &state->extrablk);
			} else {
				state->extraafter = 1;	/* after newblk */
				error = xfs_attr_leaf_split(state, newblk,
							    &state->extrablk);
			}
			if (error)
				return(error);	/* GROT: attr inconsistent */
			addblk = newblk;
			break;
		case XFS_DIR2_LEAFN_MAGIC:
			error = xfs_dir2_leafn_split(state, oldblk, newblk);
			if (error)
				return error;
			addblk = newblk;
			break;
		case XFS_DA_NODE_MAGIC:
			error = xfs_da_node_split(state, oldblk, newblk, addblk,
							 max - i, &action);
			xfs_da_buf_done(addblk->bp);
			addblk->bp = NULL;
			if (error)
				return(error);	/* GROT: dir is inconsistent */
			/*
			 * Record the newly split block for the next time thru?
			 */
			if (action)
				addblk = newblk;
			else
				addblk = NULL;
			break;
		}

		/*
		 * Update the btree to show the new hashval for this child.
		 */
		xfs_da_fixhashpath(state, &state->path);
		/*
		 * If we won't need this block again, it's getting dropped
		 * from the active path by the loop control, so we need
		 * to mark it done now.
		 */
		if (i > 0 || !addblk)
			xfs_da_buf_done(oldblk->bp);
	}
	if (!addblk)
		return(0);

	/*
	 * Split the root node.
	 */
	ASSERT(state->path.active == 0);
	oldblk = &state->path.blk[0];
	error = xfs_da_root_split(state, oldblk, addblk);
	if (error) {
		xfs_da_buf_done(oldblk->bp);
		xfs_da_buf_done(addblk->bp);
		addblk->bp = NULL;
		return(error);	/* GROT: dir is inconsistent */
	}

	/*
	 * Update pointers to the node which used to be block 0 and
	 * just got bumped because of the addition of a new root node.
	 * There might be three blocks involved if a double split occurred,
	 * and the original block 0 could be at any position in the list.
	 */

	node = oldblk->bp->data;
	if (node->hdr.info.forw) {
		if (be32_to_cpu(node->hdr.info.forw) == addblk->blkno) {
			bp = addblk->bp;
		} else {
			ASSERT(state->extravalid);
			bp = state->extrablk.bp;
		}
		node = bp->data;
		node->hdr.info.back = cpu_to_be32(oldblk->blkno);
		xfs_da_log_buf(state->args->trans, bp,
		    XFS_DA_LOGRANGE(node, &node->hdr.info,
		    sizeof(node->hdr.info)));
	}
	node = oldblk->bp->data;
	if (node->hdr.info.back) {
		if (be32_to_cpu(node->hdr.info.back) == addblk->blkno) {
			bp = addblk->bp;
		} else {
			ASSERT(state->extravalid);
			bp = state->extrablk.bp;
		}
		node = bp->data;
		node->hdr.info.forw = cpu_to_be32(oldblk->blkno);
		xfs_da_log_buf(state->args->trans, bp,
		    XFS_DA_LOGRANGE(node, &node->hdr.info,
		    sizeof(node->hdr.info)));
	}
	xfs_da_buf_done(oldblk->bp);
	xfs_da_buf_done(addblk->bp);
	addblk->bp = NULL;
	return(0);
}

/*
 * Split the root.  We have to create a new root and point to the two
 * parts (the split old root) that we just created.  Copy block zero to
 * the EOF, extending the inode in the process.
 */
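/*
 * Schematically: the old root's contents are copied out to a freshly
 * allocated block, and block zero (mp->m_dirleafblk for the data fork)
 * becomes a new one-level-higher node with exactly two entries pointing
 * at the copy and its new sibling:
 *
 *   before:  [ blk 0: old root ]
 *
 *   after:   [ blk 0: new root, count == 2 ]
 *              /                        \
 *     [ blkno: old root copy ] <-> [ addblk: its split sibling ]
 */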
STATIC int						/* error */
xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
				 xfs_da_state_blk_t *blk2)
{
	xfs_da_intnode_t *node, *oldroot;
	xfs_da_args_t *args;
	xfs_dablk_t blkno;
	xfs_dabuf_t *bp;
	int error, size;
	xfs_inode_t *dp;
	xfs_trans_t *tp;
	xfs_mount_t *mp;
	xfs_dir2_leaf_t *leaf;

	/*
	 * Copy the existing (incorrect) block from the root node position
	 * to a free space somewhere.
	 */
	args = state->args;
	ASSERT(args != NULL);
	error = xfs_da_grow_inode(args, &blkno);
	if (error)
		return(error);
	dp = args->dp;
	tp = args->trans;
	mp = state->mp;
	error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, args->whichfork);
	if (error)
		return(error);
	ASSERT(bp != NULL);
	node = bp->data;
	oldroot = blk1->bp->data;
	if (be16_to_cpu(oldroot->hdr.info.magic) == XFS_DA_NODE_MAGIC) {
		size = (int)((char *)&oldroot->btree[be16_to_cpu(oldroot->hdr.count)] -
			     (char *)oldroot);
	} else {
		ASSERT(be16_to_cpu(oldroot->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC);
		leaf = (xfs_dir2_leaf_t *)oldroot;
		size = (int)((char *)&leaf->ents[be16_to_cpu(leaf->hdr.count)] -
			     (char *)leaf);
	}
	memcpy(node, oldroot, size);
	xfs_da_log_buf(tp, bp, 0, size - 1);
	xfs_da_buf_done(blk1->bp);
	blk1->bp = bp;
	blk1->blkno = blkno;

	/*
	 * Set up the new root node.
	 */
	error = xfs_da_node_create(args,
		(args->whichfork == XFS_DATA_FORK) ? mp->m_dirleafblk : 0,
		be16_to_cpu(node->hdr.level) + 1, &bp, args->whichfork);
	if (error)
		return(error);
	node = bp->data;
	node->btree[0].hashval = cpu_to_be32(blk1->hashval);
	node->btree[0].before = cpu_to_be32(blk1->blkno);
	node->btree[1].hashval = cpu_to_be32(blk2->hashval);
	node->btree[1].before = cpu_to_be32(blk2->blkno);
	node->hdr.count = cpu_to_be16(2);

#ifdef DEBUG
	if (be16_to_cpu(oldroot->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC) {
		ASSERT(blk1->blkno >= mp->m_dirleafblk &&
		       blk1->blkno < mp->m_dirfreeblk);
		ASSERT(blk2->blkno >= mp->m_dirleafblk &&
		       blk2->blkno < mp->m_dirfreeblk);
	}
#endif

	/* Header is already logged by xfs_da_node_create */
	xfs_da_log_buf(tp, bp,
		XFS_DA_LOGRANGE(node, node->btree,
			sizeof(xfs_da_node_entry_t) * 2));
	xfs_da_buf_done(bp);

	return(0);
}

/*
 * Split the node, rebalance, then add the new entry.
 */
STATIC int						/* error */
xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
				 xfs_da_state_blk_t *newblk,
				 xfs_da_state_blk_t *addblk,
				 int treelevel, int *result)
{
	xfs_da_intnode_t *node;
	xfs_dablk_t blkno;
	int newcount, error;
	int useextra;

	node = oldblk->bp->data;
	ASSERT(be16_to_cpu(node->hdr.info.magic) == XFS_DA_NODE_MAGIC);

	/*
	 * With V2 dirs the extra block is data or freespace.
	 */
	useextra = state->extravalid && state->args->whichfork == XFS_ATTR_FORK;
	newcount = 1 + useextra;
	/*
	 * Do we have to split the node?
	 */
	if ((be16_to_cpu(node->hdr.count) + newcount) > state->node_ents) {
		/*
		 * Allocate a new node, add to the doubly linked chain of
		 * nodes, then move some of our excess entries into it.
		 */
		error = xfs_da_grow_inode(state->args, &blkno);
		if (error)
			return(error);	/* GROT: dir is inconsistent */

		error = xfs_da_node_create(state->args, blkno, treelevel,
					   &newblk->bp, state->args->whichfork);
		if (error)
			return(error);	/* GROT: dir is inconsistent */
		newblk->blkno = blkno;
		newblk->magic = XFS_DA_NODE_MAGIC;
		xfs_da_node_rebalance(state, oldblk, newblk);
		error = xfs_da_blk_link(state, oldblk, newblk);
		if (error)
			return(error);
		*result = 1;
	} else {
		*result = 0;
	}

	/*
	 * Insert the new entry (or entries) into the correct block
	 * (updating last hashval in the process).
	 *
	 * xfs_da_node_add() inserts BEFORE the given index,
	 * and as a result of using node_lookup_int() we always
	 * point to a valid entry (not after one), but a split
	 * operation always results in a new block whose hashvals
	 * FOLLOW the current block.
	 *
	 * If we had a double-split op below us, then add the extra block too.
	 */
	node = oldblk->bp->data;
	if (oldblk->index <= be16_to_cpu(node->hdr.count)) {
		oldblk->index++;
		xfs_da_node_add(state, oldblk, addblk);
		if (useextra) {
			if (state->extraafter)
				oldblk->index++;
			xfs_da_node_add(state, oldblk, &state->extrablk);
			state->extravalid = 0;
		}
	} else {
		newblk->index++;
		xfs_da_node_add(state, newblk, addblk);
		if (useextra) {
			if (state->extraafter)
				newblk->index++;
			xfs_da_node_add(state, newblk, &state->extrablk);
			state->extravalid = 0;
		}
	}

	return(0);
}

/*
 * Balance the btree elements between two intermediate nodes,
 * usually one full and one empty.
 *
 * NOTE: if blk2 is empty, then it will get the upper half of blk1.
 */
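/*
 * A sketch of the count arithmetic below: if node1 holds 64 entries
 * and node2 is freshly created and empty, then
 * count = (64 - 0) / 2 = 32, so the upper 32 entries of node1 move
 * to node2.  A negative count means entries flow the other way, from
 * low in node2 to high in node1.
 */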
STATIC void
xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
				     xfs_da_state_blk_t *blk2)
{
	xfs_da_intnode_t *node1, *node2, *tmpnode;
	xfs_da_node_entry_t *btree_s, *btree_d;
	int count, tmp;
	xfs_trans_t *tp;

	node1 = blk1->bp->data;
	node2 = blk2->bp->data;
	/*
	 * Figure out how many entries need to move, and in which direction.
	 * Swap the nodes around if that makes it simpler.
	 */
	if ((be16_to_cpu(node1->hdr.count) > 0) && (be16_to_cpu(node2->hdr.count) > 0) &&
	    ((be32_to_cpu(node2->btree[0].hashval) < be32_to_cpu(node1->btree[0].hashval)) ||
	     (be32_to_cpu(node2->btree[be16_to_cpu(node2->hdr.count)-1].hashval) <
	      be32_to_cpu(node1->btree[be16_to_cpu(node1->hdr.count)-1].hashval)))) {
		tmpnode = node1;
		node1 = node2;
		node2 = tmpnode;
	}
	ASSERT(be16_to_cpu(node1->hdr.info.magic) == XFS_DA_NODE_MAGIC);
	ASSERT(be16_to_cpu(node2->hdr.info.magic) == XFS_DA_NODE_MAGIC);
	count = (be16_to_cpu(node1->hdr.count) - be16_to_cpu(node2->hdr.count)) / 2;
	if (count == 0)
		return;
	tp = state->args->trans;
	/*
	 * Two cases: high-to-low and low-to-high.
	 */
	if (count > 0) {
		/*
		 * Move elements in node2 up to make a hole.
		 */
		if ((tmp = be16_to_cpu(node2->hdr.count)) > 0) {
			tmp *= (uint)sizeof(xfs_da_node_entry_t);
			btree_s = &node2->btree[0];
			btree_d = &node2->btree[count];
			memmove(btree_d, btree_s, tmp);
		}

		/*
		 * Move the req'd B-tree elements from high in node1 to
		 * low in node2.
		 */
		be16_add(&node2->hdr.count, count);
		tmp = count * (uint)sizeof(xfs_da_node_entry_t);
		btree_s = &node1->btree[be16_to_cpu(node1->hdr.count) - count];
		btree_d = &node2->btree[0];
		memcpy(btree_d, btree_s, tmp);
		be16_add(&node1->hdr.count, -count);
	} else {
		/*
		 * Move the req'd B-tree elements from low in node2 to
		 * high in node1.
		 */
		count = -count;
		tmp = count * (uint)sizeof(xfs_da_node_entry_t);
		btree_s = &node2->btree[0];
		btree_d = &node1->btree[be16_to_cpu(node1->hdr.count)];
		memcpy(btree_d, btree_s, tmp);
		be16_add(&node1->hdr.count, count);
		xfs_da_log_buf(tp, blk1->bp,
			XFS_DA_LOGRANGE(node1, btree_d, tmp));

		/*
		 * Move elements in node2 down to fill the hole.
		 */
		tmp  = be16_to_cpu(node2->hdr.count) - count;
		tmp *= (uint)sizeof(xfs_da_node_entry_t);
		btree_s = &node2->btree[count];
		btree_d = &node2->btree[0];
		memmove(btree_d, btree_s, tmp);
		be16_add(&node2->hdr.count, -count);
	}

	/*
	 * Log header of node 1 and all current bits of node 2.
	 */
	xfs_da_log_buf(tp, blk1->bp,
		XFS_DA_LOGRANGE(node1, &node1->hdr, sizeof(node1->hdr)));
	xfs_da_log_buf(tp, blk2->bp,
		XFS_DA_LOGRANGE(node2, &node2->hdr,
			sizeof(node2->hdr) +
			sizeof(node2->btree[0]) * be16_to_cpu(node2->hdr.count)));

	/*
	 * Record the last hashval from each block for upward propagation.
	 * (note: don't use the swapped node pointers)
	 */
	node1 = blk1->bp->data;
	node2 = blk2->bp->data;
	blk1->hashval = be32_to_cpu(node1->btree[be16_to_cpu(node1->hdr.count)-1].hashval);
	blk2->hashval = be32_to_cpu(node2->btree[be16_to_cpu(node2->hdr.count)-1].hashval);

	/*
	 * Adjust the expected index for insertion.
	 */
	if (blk1->index >= be16_to_cpu(node1->hdr.count)) {
		blk2->index = blk1->index - be16_to_cpu(node1->hdr.count);
		blk1->index = be16_to_cpu(node1->hdr.count) + 1;	/* make it invalid */
	}
}

/*
 * Add a new entry to an intermediate node.
 */
STATIC void
xfs_da_node_add(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
			       xfs_da_state_blk_t *newblk)
{
	xfs_da_intnode_t *node;
	xfs_da_node_entry_t *btree;
	int tmp;
	xfs_mount_t *mp;

	node = oldblk->bp->data;
	mp = state->mp;
	ASSERT(be16_to_cpu(node->hdr.info.magic) == XFS_DA_NODE_MAGIC);
	ASSERT((oldblk->index >= 0) && (oldblk->index <= be16_to_cpu(node->hdr.count)));
	ASSERT(newblk->blkno != 0);
	if (state->args->whichfork == XFS_DATA_FORK)
		ASSERT(newblk->blkno >= mp->m_dirleafblk &&
		       newblk->blkno < mp->m_dirfreeblk);

	/*
	 * We may need to make some room before we insert the new node.
	 */
	tmp = 0;
	btree = &node->btree[ oldblk->index ];
	if (oldblk->index < be16_to_cpu(node->hdr.count)) {
		tmp = (be16_to_cpu(node->hdr.count) - oldblk->index) * (uint)sizeof(*btree);
		memmove(btree + 1, btree, tmp);
	}
	btree->hashval = cpu_to_be32(newblk->hashval);
	btree->before = cpu_to_be32(newblk->blkno);
	xfs_da_log_buf(state->args->trans, oldblk->bp,
		XFS_DA_LOGRANGE(node, btree, tmp + sizeof(*btree)));
	be16_add(&node->hdr.count, 1);
	xfs_da_log_buf(state->args->trans, oldblk->bp,
		XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr)));

	/*
	 * Copy the last hash value from the oldblk to propagate upwards.
	 */
	oldblk->hashval = be32_to_cpu(node->btree[be16_to_cpu(node->hdr.count)-1 ].hashval);
}

/*========================================================================
 * Routines used for shrinking the Btree.
 *========================================================================*/

/*
 * Deallocate an empty leaf node, remove it from its parent,
 * possibly deallocating that block, etc...
 */
int
xfs_da_join(xfs_da_state_t *state)
{
	xfs_da_state_blk_t *drop_blk, *save_blk;
	int action, error;

	action = 0;
	drop_blk = &state->path.blk[ state->path.active-1 ];
	save_blk = &state->altpath.blk[ state->path.active-1 ];
	ASSERT(state->path.blk[0].magic == XFS_DA_NODE_MAGIC);
	ASSERT(drop_blk->magic == XFS_ATTR_LEAF_MAGIC ||
	       drop_blk->magic == XFS_DIR2_LEAFN_MAGIC);

	/*
	 * Walk back up the tree joining/deallocating as necessary.
	 * When we stop dropping blocks, break out.
	 */
	for (  ; state->path.active >= 2; drop_blk--, save_blk--,
		 state->path.active--) {
		/*
		 * See if we can combine the block with a neighbor.
		 *   (action == 0) => no options, just leave
		 *   (action == 1) => coalesce, then unlink
		 *   (action == 2) => block empty, unlink it
		 */
		switch (drop_blk->magic) {
		case XFS_ATTR_LEAF_MAGIC:
			error = xfs_attr_leaf_toosmall(state, &action);
			if (error)
				return(error);
			if (action == 0)
				return(0);
			xfs_attr_leaf_unbalance(state, drop_blk, save_blk);
			break;
		case XFS_DIR2_LEAFN_MAGIC:
			error = xfs_dir2_leafn_toosmall(state, &action);
			if (error)
				return error;
			if (action == 0)
				return 0;
			xfs_dir2_leafn_unbalance(state, drop_blk, save_blk);
			break;
		case XFS_DA_NODE_MAGIC:
			/*
			 * Remove the offending node, fixup hashvals,
			 * check for a toosmall neighbor.
			 */
			xfs_da_node_remove(state, drop_blk);
			xfs_da_fixhashpath(state, &state->path);
			error = xfs_da_node_toosmall(state, &action);
			if (error)
				return(error);
			if (action == 0)
				return 0;
			xfs_da_node_unbalance(state, drop_blk, save_blk);
			break;
		}
		xfs_da_fixhashpath(state, &state->altpath);
		error = xfs_da_blk_unlink(state, drop_blk, save_blk);
		xfs_da_state_kill_altpath(state);
		if (error)
			return(error);
		error = xfs_da_shrink_inode(state->args, drop_blk->blkno,
							 drop_blk->bp);
		drop_blk->bp = NULL;
		if (error)
			return(error);
	}
	/*
	 * We joined all the way to the top.  If it turns out that
	 * we only have one entry in the root, make the child block
	 * the new root.
	 */
	xfs_da_node_remove(state, drop_blk);
	xfs_da_fixhashpath(state, &state->path);
	error = xfs_da_root_join(state, &state->path.blk[0]);
	return(error);
}

/*
 * We have only one entry in the root.  Copy the only remaining child of
 * the old root to block 0 as the new root node.
 */
STATIC int
xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk)
{
	xfs_da_intnode_t *oldroot;
	/* REFERENCED */
	xfs_da_blkinfo_t *blkinfo;
	xfs_da_args_t *args;
	xfs_dablk_t child;
	xfs_dabuf_t *bp;
	int error;

	args = state->args;
	ASSERT(args != NULL);
	ASSERT(root_blk->magic == XFS_DA_NODE_MAGIC);
	oldroot = root_blk->bp->data;
	ASSERT(be16_to_cpu(oldroot->hdr.info.magic) == XFS_DA_NODE_MAGIC);
	ASSERT(!oldroot->hdr.info.forw);
	ASSERT(!oldroot->hdr.info.back);

	/*
	 * If the root has more than one child, then don't do anything.
	 */
	if (be16_to_cpu(oldroot->hdr.count) > 1)
		return(0);

	/*
	 * Read in the (only) child block, then copy those bytes into
	 * the root block's buffer and free the original child block.
	 */
	child = be32_to_cpu(oldroot->btree[0].before);
	ASSERT(child != 0);
	error = xfs_da_read_buf(args->trans, args->dp, child, -1, &bp,
					     args->whichfork);
	if (error)
		return(error);
	ASSERT(bp != NULL);
	blkinfo = bp->data;
	if (be16_to_cpu(oldroot->hdr.level) == 1) {
		ASSERT(be16_to_cpu(blkinfo->magic) == XFS_DIR2_LEAFN_MAGIC ||
		       be16_to_cpu(blkinfo->magic) == XFS_ATTR_LEAF_MAGIC);
	} else {
		ASSERT(be16_to_cpu(blkinfo->magic) == XFS_DA_NODE_MAGIC);
	}
	ASSERT(!blkinfo->forw);
	ASSERT(!blkinfo->back);
	memcpy(root_blk->bp->data, bp->data, state->blocksize);
	xfs_da_log_buf(args->trans, root_blk->bp, 0, state->blocksize - 1);
	error = xfs_da_shrink_inode(args, child, bp);
	return(error);
}

/*
 * Check a node block and its neighbors to see if the block should be
 * collapsed into one or the other neighbor.  Always keep the block
 * with the smaller block number.
 * If the current block is over 50% full, don't try to join it, return 0.
 * If the block is empty, fill in the state structure and return 2.
 * If it can be collapsed, fill in the state structure and return 1.
 * If nothing can be done, return 0.
 */
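/*
 * The "25% to spare" test below works out as: start from
 * count = node_ents - node_ents/4 (i.e. 75% of capacity) and subtract
 * both blocks' entry counts; a non-negative result means the two
 * blocks together fit in 75% of one block.  E.g. with node_ents = 64:
 * 64 - 16 - 20 - 25 = 3 >= 0, so blocks holding 20 and 25 entries may
 * be merged, while 20 and 30 (64 - 16 - 20 - 30 = -2) may not.
 */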
STATIC int
xfs_da_node_toosmall(xfs_da_state_t *state, int *action)
{
	xfs_da_intnode_t *node;
	xfs_da_state_blk_t *blk;
	xfs_da_blkinfo_t *info;
	int count, forward, error, retval, i;
	xfs_dablk_t blkno;
	xfs_dabuf_t *bp;

	/*
	 * Check for the degenerate case of the block being over 50% full.
	 * If so, it's not worth even looking to see if we might be able
	 * to coalesce with a sibling.
	 */
	blk = &state->path.blk[ state->path.active-1 ];
	info = blk->bp->data;
	ASSERT(be16_to_cpu(info->magic) == XFS_DA_NODE_MAGIC);
	node = (xfs_da_intnode_t *)info;
	count = be16_to_cpu(node->hdr.count);
	if (count > (state->node_ents >> 1)) {
		*action = 0;	/* blk over 50%, don't try to join */
		return(0);	/* blk over 50%, don't try to join */
	}

	/*
	 * Check for the degenerate case of the block being empty.
	 * If the block is empty, we'll simply delete it, no need to
	 * coalesce it with a sibling block.  We choose (arbitrarily)
	 * to merge with the forward block unless it is NULL.
	 */
	if (count == 0) {
		/*
		 * Make altpath point to the block we want to keep and
		 * path point to the block we want to drop (this one).
		 */
		forward = (info->forw != 0);
		memcpy(&state->altpath, &state->path, sizeof(state->path));
		error = xfs_da_path_shift(state, &state->altpath, forward,
						 0, &retval);
		if (error)
			return(error);
		if (retval) {
			*action = 0;
		} else {
			*action = 2;
		}
		return(0);
	}

	/*
	 * Examine each sibling block to see if we can coalesce with
	 * at least 25% free space to spare.  We need to figure out
	 * whether to merge with the forward or the backward block.
	 * We prefer coalescing with the lower numbered sibling so as
	 * to shrink a directory over time.
	 */
	/* start with smaller blk num */
	forward = (be32_to_cpu(info->forw) < be32_to_cpu(info->back));
	for (i = 0; i < 2; forward = !forward, i++) {
		if (forward)
			blkno = be32_to_cpu(info->forw);
		else
			blkno = be32_to_cpu(info->back);
		if (blkno == 0)
			continue;
		error = xfs_da_read_buf(state->args->trans, state->args->dp,
					blkno, -1, &bp, state->args->whichfork);
		if (error)
			return(error);
		ASSERT(bp != NULL);

		node = (xfs_da_intnode_t *)info;
		count  = state->node_ents;
		count -= state->node_ents >> 2;
		count -= be16_to_cpu(node->hdr.count);
		node = bp->data;
		ASSERT(be16_to_cpu(node->hdr.info.magic) == XFS_DA_NODE_MAGIC);
		count -= be16_to_cpu(node->hdr.count);
		xfs_da_brelse(state->args->trans, bp);
		if (count >= 0)
			break;	/* fits with at least 25% to spare */
	}
	if (i >= 2) {
		*action = 0;
		return(0);
	}

	/*
	 * Make altpath point to the block we want to keep (the lower
	 * numbered block) and path point to the block we want to drop.
	 */
	memcpy(&state->altpath, &state->path, sizeof(state->path));
	if (blkno < blk->blkno) {
		error = xfs_da_path_shift(state, &state->altpath, forward,
						 0, &retval);
		if (error) {
			return(error);
		}
		if (retval) {
			*action = 0;
			return(0);
		}
	} else {
		error = xfs_da_path_shift(state, &state->path, forward,
						 0, &retval);
		if (error) {
			return(error);
		}
		if (retval) {
			*action = 0;
			return(0);
		}
	}
	*action = 1;
	return(0);
}

/*
 * Walk back up the tree adjusting hash values as necessary;
 * when we stop making changes, return.
 */
void
xfs_da_fixhashpath(xfs_da_state_t *state, xfs_da_state_path_t *path)
{
	xfs_da_state_blk_t *blk;
	xfs_da_intnode_t *node;
	xfs_da_node_entry_t *btree;
	xfs_dahash_t lasthash=0;
	int level, count;

	level = path->active-1;
	blk = &path->blk[ level ];
	switch (blk->magic) {
	case XFS_ATTR_LEAF_MAGIC:
		lasthash = xfs_attr_leaf_lasthash(blk->bp, &count);
		if (count == 0)
			return;
		break;
	case XFS_DIR2_LEAFN_MAGIC:
		lasthash = xfs_dir2_leafn_lasthash(blk->bp, &count);
		if (count == 0)
			return;
		break;
	case XFS_DA_NODE_MAGIC:
		lasthash = xfs_da_node_lasthash(blk->bp, &count);
		if (count == 0)
			return;
		break;
	}
	for (blk--, level--; level >= 0; blk--, level--) {
		node = blk->bp->data;
		ASSERT(be16_to_cpu(node->hdr.info.magic) == XFS_DA_NODE_MAGIC);
		btree = &node->btree[ blk->index ];
		if (be32_to_cpu(btree->hashval) == lasthash)
			break;
		blk->hashval = lasthash;
		btree->hashval = cpu_to_be32(lasthash);
		xfs_da_log_buf(state->args->trans, blk->bp,
				  XFS_DA_LOGRANGE(node, btree, sizeof(*btree)));

		lasthash = be32_to_cpu(node->btree[be16_to_cpu(node->hdr.count)-1].hashval);
	}
}

/*
 * Remove an entry from an intermediate node.
 */
STATIC void
xfs_da_node_remove(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk)
{
	xfs_da_intnode_t *node;
	xfs_da_node_entry_t *btree;
	int tmp;

	node = drop_blk->bp->data;
	ASSERT(drop_blk->index < be16_to_cpu(node->hdr.count));
	ASSERT(drop_blk->index >= 0);

	/*
	 * Copy over the offending entry, or just zero it out.
	 */
	btree = &node->btree[drop_blk->index];
	if (drop_blk->index < (be16_to_cpu(node->hdr.count)-1)) {
		tmp  = be16_to_cpu(node->hdr.count) - drop_blk->index - 1;
		tmp *= (uint)sizeof(xfs_da_node_entry_t);
		memmove(btree, btree + 1, tmp);
		xfs_da_log_buf(state->args->trans, drop_blk->bp,
		    XFS_DA_LOGRANGE(node, btree, tmp));
		btree = &node->btree[be16_to_cpu(node->hdr.count)-1];
	}
	memset((char *)btree, 0, sizeof(xfs_da_node_entry_t));
	xfs_da_log_buf(state->args->trans, drop_blk->bp,
	    XFS_DA_LOGRANGE(node, btree, sizeof(*btree)));
	be16_add(&node->hdr.count, -1);
	xfs_da_log_buf(state->args->trans, drop_blk->bp,
	    XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr)));

	/*
	 * Copy the last hash value from the block to propagate upwards.
	 */
	btree--;
	drop_blk->hashval = be32_to_cpu(btree->hashval);
}

/*
 * Unbalance the btree elements between two intermediate nodes by
 * moving all Btree elements from one node into another.
 */
STATIC void
xfs_da_node_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
				     xfs_da_state_blk_t *save_blk)
{
	xfs_da_intnode_t *drop_node, *save_node;
	xfs_da_node_entry_t *btree;
	int tmp;
	xfs_trans_t *tp;

	drop_node = drop_blk->bp->data;
	save_node = save_blk->bp->data;
	ASSERT(be16_to_cpu(drop_node->hdr.info.magic) == XFS_DA_NODE_MAGIC);
	ASSERT(be16_to_cpu(save_node->hdr.info.magic) == XFS_DA_NODE_MAGIC);
	tp = state->args->trans;

	/*
	 * If the dying block has lower hashvals, then move all the
	 * elements in the remaining block up to make a hole.
	 */
	if ((be32_to_cpu(drop_node->btree[0].hashval) < be32_to_cpu(save_node->btree[ 0 ].hashval)) ||
	    (be32_to_cpu(drop_node->btree[be16_to_cpu(drop_node->hdr.count)-1].hashval) <
	     be32_to_cpu(save_node->btree[be16_to_cpu(save_node->hdr.count)-1].hashval)))
	{
		btree = &save_node->btree[be16_to_cpu(drop_node->hdr.count)];
		tmp = be16_to_cpu(save_node->hdr.count) * (uint)sizeof(xfs_da_node_entry_t);
		memmove(btree, &save_node->btree[0], tmp);
		btree = &save_node->btree[0];
		xfs_da_log_buf(tp, save_blk->bp,
			XFS_DA_LOGRANGE(save_node, btree,
				(be16_to_cpu(save_node->hdr.count) + be16_to_cpu(drop_node->hdr.count)) *
				sizeof(xfs_da_node_entry_t)));
	} else {
		btree = &save_node->btree[be16_to_cpu(save_node->hdr.count)];
		xfs_da_log_buf(tp, save_blk->bp,
			XFS_DA_LOGRANGE(save_node, btree,
				be16_to_cpu(drop_node->hdr.count) *
				sizeof(xfs_da_node_entry_t)));
	}

	/*
	 * Move all the B-tree elements from drop_blk to save_blk.
	 */
	tmp = be16_to_cpu(drop_node->hdr.count) * (uint)sizeof(xfs_da_node_entry_t);
	memcpy(btree, &drop_node->btree[0], tmp);
	be16_add(&save_node->hdr.count, be16_to_cpu(drop_node->hdr.count));

	xfs_da_log_buf(tp, save_blk->bp,
		XFS_DA_LOGRANGE(save_node, &save_node->hdr,
			sizeof(save_node->hdr)));

	/*
	 * Save the last hashval in the remaining block for upward propagation.
	 */
	save_blk->hashval = be32_to_cpu(save_node->btree[be16_to_cpu(save_node->hdr.count)-1].hashval);
}

/*========================================================================
 * Routines used for finding things in the Btree.
 *========================================================================*/

/*
 * Walk down the Btree looking for a particular filename, filling
 * in the state structure as we go.
 *
 * We will set the state structure to point to each of the elements
 * in each of the nodes where either the hashval is or should be.
 *
 * We support duplicate hashvals, so for each entry in the current
 * node that could contain the desired hashval, descend.  This is a
 * pruned depth-first tree search.
 */
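/*
 * For example, if a node holds entries with hashvals
 * { 10, 30, 30, 30, 50 } and we are looking up hashval 30, the binary
 * search may land on any of the three 30s; the backward/forward scans
 * below then settle on the first entry with hashval >= 30, so every
 * child that could hold a match is visited in order.
 */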
int							/* error */
xfs_da_node_lookup_int(xfs_da_state_t *state, int *result)
{
	xfs_da_state_blk_t *blk;
	xfs_da_blkinfo_t *curr;
	xfs_da_intnode_t *node;
	xfs_da_node_entry_t *btree;
	xfs_dablk_t blkno;
	int probe, span, max, error, retval;
	xfs_dahash_t hashval, btreehashval;
	xfs_da_args_t *args;

	args = state->args;

	/*
	 * Descend thru the B-tree searching each level for the right
	 * node to use, until the right hashval is found.
	 */
	blkno = (args->whichfork == XFS_DATA_FORK)? state->mp->m_dirleafblk : 0;
	for (blk = &state->path.blk[0], state->path.active = 1;
			 state->path.active <= XFS_DA_NODE_MAXDEPTH;
			 blk++, state->path.active++) {
		/*
		 * Read the next node down in the tree.
		 */
		blk->blkno = blkno;
		error = xfs_da_read_buf(args->trans, args->dp, blkno,
					-1, &blk->bp, args->whichfork);
		if (error) {
			blk->blkno = 0;
			state->path.active--;
			return(error);
		}
		curr = blk->bp->data;
		blk->magic = be16_to_cpu(curr->magic);
		ASSERT(blk->magic == XFS_DA_NODE_MAGIC ||
		       blk->magic == XFS_DIR2_LEAFN_MAGIC ||
		       blk->magic == XFS_ATTR_LEAF_MAGIC);

		/*
		 * Search an intermediate node for a match.
		 */
		if (blk->magic == XFS_DA_NODE_MAGIC) {
			node = blk->bp->data;
			max = be16_to_cpu(node->hdr.count);
			blk->hashval = be32_to_cpu(node->btree[max-1].hashval);

			/*
			 * Binary search.  (note: small blocks will skip loop)
			 */
			probe = span = max / 2;
			hashval = args->hashval;
			for (btree = &node->btree[probe]; span > 4;
				   btree = &node->btree[probe]) {
				span /= 2;
				btreehashval = be32_to_cpu(btree->hashval);
				if (btreehashval < hashval)
					probe += span;
				else if (btreehashval > hashval)
					probe -= span;
				else
					break;
			}
			ASSERT((probe >= 0) && (probe < max));
			ASSERT((span <= 4) || (be32_to_cpu(btree->hashval) == hashval));

			/*
			 * Since we may have duplicate hashvals, find the first
			 * matching hashval in the node.
			 */
			while ((probe > 0) && (be32_to_cpu(btree->hashval) >= hashval)) {
				btree--;
				probe--;
			}
			while ((probe < max) && (be32_to_cpu(btree->hashval) < hashval)) {
				btree++;
				probe++;
			}

			/*
			 * Pick the right block to descend on.
			 */
			if (probe == max) {
				blk->index = max-1;
				blkno = be32_to_cpu(node->btree[max-1].before);
			} else {
				blk->index = probe;
				blkno = be32_to_cpu(btree->before);
			}
		} else if (blk->magic == XFS_ATTR_LEAF_MAGIC) {
			blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL);
			break;
		} else if (blk->magic == XFS_DIR2_LEAFN_MAGIC) {
			blk->hashval = xfs_dir2_leafn_lasthash(blk->bp, NULL);
			break;
		}
	}

	/*
	 * A leaf block that ends in the hashval that we are interested in
	 * (final hashval == search hashval) means that the next block may
	 * contain more entries with the same hashval; shift upward to the
	 * next leaf and keep searching.
	 */
	for (;;) {
		if (blk->magic == XFS_DIR2_LEAFN_MAGIC) {
			retval = xfs_dir2_leafn_lookup_int(blk->bp, args,
							&blk->index, state);
		} else if (blk->magic == XFS_ATTR_LEAF_MAGIC) {
			retval = xfs_attr_leaf_lookup_int(blk->bp, args);
			blk->index = args->index;
			args->blkno = blk->blkno;
		} else {
			ASSERT(0);
			return XFS_ERROR(EFSCORRUPTED);
		}
		if (((retval == ENOENT) || (retval == ENOATTR)) &&
		    (blk->hashval == args->hashval)) {
			error = xfs_da_path_shift(state, &state->path, 1, 1,
							 &retval);
			if (error)
				return(error);
			if (retval == 0) {
				continue;
			} else if (blk->magic == XFS_ATTR_LEAF_MAGIC) {
				/* path_shift() gives ENOENT */
				retval = XFS_ERROR(ENOATTR);
			}
		}
		break;
	}
	*result = retval;
	return(0);
}

/*========================================================================
 * Utility routines.
 *========================================================================*/

/*
 * Link a new block into a doubly linked list of blocks (of whatever type).
 */
int							/* error */
xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
			       xfs_da_state_blk_t *new_blk)
{
	xfs_da_blkinfo_t *old_info, *new_info, *tmp_info;
	xfs_da_args_t *args;
	int before=0, error;
	xfs_dabuf_t *bp;

	/*
	 * Set up environment.
	 */
	args = state->args;
	ASSERT(args != NULL);
	old_info = old_blk->bp->data;
	new_info = new_blk->bp->data;
	ASSERT(old_blk->magic == XFS_DA_NODE_MAGIC ||
	       old_blk->magic == XFS_DIR2_LEAFN_MAGIC ||
	       old_blk->magic == XFS_ATTR_LEAF_MAGIC);
	ASSERT(old_blk->magic == be16_to_cpu(old_info->magic));
	ASSERT(new_blk->magic == be16_to_cpu(new_info->magic));
	ASSERT(old_blk->magic == new_blk->magic);

	switch (old_blk->magic) {
	case XFS_ATTR_LEAF_MAGIC:
		before = xfs_attr_leaf_order(old_blk->bp, new_blk->bp);
		break;
	case XFS_DIR2_LEAFN_MAGIC:
		before = xfs_dir2_leafn_order(old_blk->bp, new_blk->bp);
		break;
	case XFS_DA_NODE_MAGIC:
		before = xfs_da_node_order(old_blk->bp, new_blk->bp);
		break;
	}

	/*
	 * Link blocks in appropriate order.
	 */
	if (before) {
		/*
		 * Link new block in before existing block.
		 */
		new_info->forw = cpu_to_be32(old_blk->blkno);
		new_info->back = old_info->back;
		if (old_info->back) {
			error = xfs_da_read_buf(args->trans, args->dp,
						be32_to_cpu(old_info->back),
						-1, &bp, args->whichfork);
			if (error)
				return(error);
			ASSERT(bp != NULL);
			tmp_info = bp->data;
			ASSERT(be16_to_cpu(tmp_info->magic) == be16_to_cpu(old_info->magic));
			ASSERT(be32_to_cpu(tmp_info->forw) == old_blk->blkno);
			tmp_info->forw = cpu_to_be32(new_blk->blkno);
			xfs_da_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1);
			xfs_da_buf_done(bp);
		}
		old_info->back = cpu_to_be32(new_blk->blkno);
	} else {
		/*
		 * Link new block in after existing block.
		 */
		new_info->forw = old_info->forw;
		new_info->back = cpu_to_be32(old_blk->blkno);
		if (old_info->forw) {
			error = xfs_da_read_buf(args->trans, args->dp,
						be32_to_cpu(old_info->forw),
						-1, &bp, args->whichfork);
			if (error)
				return(error);
			ASSERT(bp != NULL);
			tmp_info = bp->data;
			ASSERT(tmp_info->magic == old_info->magic);
			ASSERT(be32_to_cpu(tmp_info->back) == old_blk->blkno);
			tmp_info->back = cpu_to_be32(new_blk->blkno);
			xfs_da_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1);
			xfs_da_buf_done(bp);
		}
		old_info->forw = cpu_to_be32(new_blk->blkno);
	}

	xfs_da_log_buf(args->trans, old_blk->bp, 0, sizeof(*tmp_info) - 1);
	xfs_da_log_buf(args->trans, new_blk->bp, 0, sizeof(*tmp_info) - 1);
	return(0);
}

/*
 * Compare two intermediate nodes for "order".
 */
STATIC int
xfs_da_node_order(xfs_dabuf_t *node1_bp, xfs_dabuf_t *node2_bp)
{
	xfs_da_intnode_t *node1, *node2;

	node1 = node1_bp->data;
	node2 = node2_bp->data;
	ASSERT((be16_to_cpu(node1->hdr.info.magic) == XFS_DA_NODE_MAGIC) &&
	       (be16_to_cpu(node2->hdr.info.magic) == XFS_DA_NODE_MAGIC));
	if ((be16_to_cpu(node1->hdr.count) > 0) && (be16_to_cpu(node2->hdr.count) > 0) &&
	    ((be32_to_cpu(node2->btree[0].hashval) <
	      be32_to_cpu(node1->btree[0].hashval)) ||
	     (be32_to_cpu(node2->btree[be16_to_cpu(node2->hdr.count)-1].hashval) <
	      be32_to_cpu(node1->btree[be16_to_cpu(node1->hdr.count)-1].hashval)))) {
		return(1);
	}
	return(0);
}

/*
 * Pick up the last hashvalue from an intermediate node.
 */
STATIC uint
xfs_da_node_lasthash(xfs_dabuf_t *bp, int *count)
{
	xfs_da_intnode_t *node;

	node = bp->data;
	ASSERT(be16_to_cpu(node->hdr.info.magic) == XFS_DA_NODE_MAGIC);
	if (count)
		*count = be16_to_cpu(node->hdr.count);
	if (!node->hdr.count)
		return(0);
	return be32_to_cpu(node->btree[be16_to_cpu(node->hdr.count)-1].hashval);
}

/*
 * Unlink a block from a doubly linked list of blocks.
 */
STATIC int						/* error */
xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
				 xfs_da_state_blk_t *save_blk)
{
	xfs_da_blkinfo_t *drop_info, *save_info, *tmp_info;
	xfs_da_args_t *args;
	xfs_dabuf_t *bp;
	int error;

	/*
	 * Set up environment.
	 */
	args = state->args;
	ASSERT(args != NULL);
	save_info = save_blk->bp->data;
	drop_info = drop_blk->bp->data;
	ASSERT(save_blk->magic == XFS_DA_NODE_MAGIC ||
	       save_blk->magic == XFS_DIR2_LEAFN_MAGIC ||
	       save_blk->magic == XFS_ATTR_LEAF_MAGIC);
	ASSERT(save_blk->magic == be16_to_cpu(save_info->magic));
	ASSERT(drop_blk->magic == be16_to_cpu(drop_info->magic));
	ASSERT(save_blk->magic == drop_blk->magic);
	ASSERT((be32_to_cpu(save_info->forw) == drop_blk->blkno) ||
	       (be32_to_cpu(save_info->back) == drop_blk->blkno));
	ASSERT((be32_to_cpu(drop_info->forw) == save_blk->blkno) ||
	       (be32_to_cpu(drop_info->back) == save_blk->blkno));

	/*
	 * Unlink the leaf block from the doubly linked chain of leaves.
	 */
	if (be32_to_cpu(save_info->back) == drop_blk->blkno) {
		save_info->back = drop_info->back;
		if (drop_info->back) {
			error = xfs_da_read_buf(args->trans, args->dp,
						be32_to_cpu(drop_info->back),
						-1, &bp, args->whichfork);
			if (error)
				return(error);
			ASSERT(bp != NULL);
			tmp_info = bp->data;
			ASSERT(tmp_info->magic == save_info->magic);
			ASSERT(be32_to_cpu(tmp_info->forw) == drop_blk->blkno);
			tmp_info->forw = cpu_to_be32(save_blk->blkno);
			xfs_da_log_buf(args->trans, bp, 0,
						    sizeof(*tmp_info) - 1);
			xfs_da_buf_done(bp);
		}
	} else {
		save_info->forw = drop_info->forw;
		if (drop_info->forw) {
			error = xfs_da_read_buf(args->trans, args->dp,
						be32_to_cpu(drop_info->forw),
						-1, &bp, args->whichfork);
			if (error)
				return(error);
			ASSERT(bp != NULL);
			tmp_info = bp->data;
			ASSERT(tmp_info->magic == save_info->magic);
			ASSERT(be32_to_cpu(tmp_info->back) == drop_blk->blkno);
			tmp_info->back = cpu_to_be32(save_blk->blkno);
			xfs_da_log_buf(args->trans, bp, 0,
						    sizeof(*tmp_info) - 1);
			xfs_da_buf_done(bp);
		}
	}

	xfs_da_log_buf(args->trans, save_blk->bp, 0, sizeof(*save_info) - 1);
	return(0);
}

/*
 * Move a path "forward" or "!forward" one block at the current level.
 *
 * This routine will adjust a "path" to point to the next block
 * "forward" (higher hashvalues) or "!forward" (lower hashvals) in the
 * Btree, including updating pointers to the intermediate nodes between
 * the new bottom and the root.
 */
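/*
 * For example, shifting "forward" from the last entry of a leaf walks
 * up until it finds a node whose index is not already at the last
 * entry, increments that index, and then walks back down taking the
 * first ("forward") child at each level, ending on the leaf that
 * immediately follows the original one in hashval order.
 */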
int							/* error */
xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
				 int forward, int release, int *result)
{
	xfs_da_state_blk_t *blk;
	xfs_da_blkinfo_t *info;
	xfs_da_intnode_t *node;
	xfs_da_args_t *args;
	xfs_dablk_t blkno=0;
	int level, error;

	/*
	 * Roll up the Btree looking for the first block where our
	 * current index is not at the edge of the block.  Note that
	 * we skip the bottom layer because we want the sibling block.
	 */
	args = state->args;
	ASSERT(args != NULL);
	ASSERT(path != NULL);
	ASSERT((path->active > 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
	level = (path->active-1) - 1;	/* skip bottom layer in path */
	for (blk = &path->blk[level]; level >= 0; blk--, level--) {
		ASSERT(blk->bp != NULL);
		node = blk->bp->data;
		ASSERT(be16_to_cpu(node->hdr.info.magic) == XFS_DA_NODE_MAGIC);
		if (forward && (blk->index < be16_to_cpu(node->hdr.count)-1)) {
			blk->index++;
			blkno = be32_to_cpu(node->btree[blk->index].before);
			break;
		} else if (!forward && (blk->index > 0)) {
			blk->index--;
			blkno = be32_to_cpu(node->btree[blk->index].before);
			break;
		}
	}
	if (level < 0) {
		*result = XFS_ERROR(ENOENT);	/* we're out of our tree */
		ASSERT(args->oknoent);
		return(0);
	}

	/*
	 * Roll down the edge of the subtree until we reach the
	 * same depth we were at originally.
	 */
	for (blk++, level++; level < path->active; blk++, level++) {
		/*
		 * Release the old block.
		 * (if it's dirty, trans won't actually let go)
		 */
		if (release)
			xfs_da_brelse(args->trans, blk->bp);

		/*
		 * Read the next child block.
		 */
		blk->blkno = blkno;
		error = xfs_da_read_buf(args->trans, args->dp, blkno, -1,
						     &blk->bp, args->whichfork);
		if (error)
			return(error);
		ASSERT(blk->bp != NULL);
		info = blk->bp->data;
		ASSERT(be16_to_cpu(info->magic) == XFS_DA_NODE_MAGIC ||
		       be16_to_cpu(info->magic) == XFS_DIR2_LEAFN_MAGIC ||
		       be16_to_cpu(info->magic) == XFS_ATTR_LEAF_MAGIC);
		blk->magic = be16_to_cpu(info->magic);
		if (blk->magic == XFS_DA_NODE_MAGIC) {
			node = (xfs_da_intnode_t *)info;
			blk->hashval = be32_to_cpu(node->btree[be16_to_cpu(node->hdr.count)-1].hashval);
			if (forward)
				blk->index = 0;
			else
				blk->index = be16_to_cpu(node->hdr.count)-1;
			blkno = be32_to_cpu(node->btree[blk->index].before);
		} else {
			ASSERT(level == path->active-1);
			blk->index = 0;
			switch(blk->magic) {
			case XFS_ATTR_LEAF_MAGIC:
				blk->hashval = xfs_attr_leaf_lasthash(blk->bp,
								      NULL);
				break;
			case XFS_DIR2_LEAFN_MAGIC:
				blk->hashval = xfs_dir2_leafn_lasthash(blk->bp,
								       NULL);
				break;
			default:
				ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC ||
				       blk->magic == XFS_DIR2_LEAFN_MAGIC);
				break;
			}
		}
	}
	*result = 0;
	return(0);
}


/*========================================================================
 * Utility routines.
 *========================================================================*/

/*
 * Implement a simple hash on a character string.
 * Rotate the hash value by 7 bits, then XOR each character in.
 * This is implemented with some source-level loop unrolling.
 */
xfs_dahash_t
xfs_da_hashname(const uchar_t *name, int namelen)
{
	xfs_dahash_t hash;

	/*
	 * Do four characters at a time as long as we can.
	 */
	for (hash = 0; namelen >= 4; namelen -= 4, name += 4)
		hash = (name[0] << 21) ^ (name[1] << 14) ^ (name[2] << 7) ^
		       (name[3] << 0) ^ rol32(hash, 7 * 4);

	/*
	 * Now do the rest of the characters.
	 */
	switch (namelen) {
	case 3:
		return (name[0] << 14) ^ (name[1] << 7) ^ (name[2] << 0) ^
		       rol32(hash, 7 * 3);
	case 2:
		return (name[0] << 7) ^ (name[1] << 0) ^ rol32(hash, 7 * 2);
	case 1:
		return (name[0] << 0) ^ rol32(hash, 7 * 1);
	default: /* case 0: */
		return hash;
	}
}
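
/*
 * Worked example: hashing the 3-byte name "abc" ('a' = 0x61) never
 * enters the four-at-a-time loop (hash stays 0), so the case 3 branch
 * returns
 *
 *	(0x61 << 14) ^ (0x62 << 7) ^ 0x63 ^ rol32(0, 21)
 *	  = 0x184000 ^ 0x3100 ^ 0x63 = 0x187163
 *
 * Longer names fold each 4-byte group into the rotated running hash.
 */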

/*
 * Add a block to the btree ahead of the file.
 * Return the new block number to the caller.
 */
int
xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno)
{
	xfs_fileoff_t bno, b;
	xfs_bmbt_irec_t map;
	xfs_bmbt_irec_t	*mapp;
	xfs_inode_t *dp;
	int nmap, error, w, count, c, got, i, mapi;
	xfs_trans_t *tp;
	xfs_mount_t *mp;

	dp = args->dp;
	mp = dp->i_mount;
	w = args->whichfork;
	tp = args->trans;
	/*
	 * For new directories adjust the file offset and block count.
	 */
	if (w == XFS_DATA_FORK) {
		bno = mp->m_dirleafblk;
		count = mp->m_dirblkfsbs;
	} else {
		bno = 0;
		count = 1;
	}
	/*
	 * Find a spot in the file space to put the new block.
	 */
	if ((error = xfs_bmap_first_unused(tp, dp, count, &bno, w)))
		return error;
	if (w == XFS_DATA_FORK)
		ASSERT(bno >= mp->m_dirleafblk && bno < mp->m_dirfreeblk);
	/*
	 * Try mapping it in one filesystem block.
	 */
	nmap = 1;
	ASSERT(args->firstblock != NULL);
	if ((error = xfs_bmapi(tp, dp, bno, count,
			XFS_BMAPI_AFLAG(w)|XFS_BMAPI_WRITE|XFS_BMAPI_METADATA|
			XFS_BMAPI_CONTIG,
			args->firstblock, args->total, &map, &nmap,
			args->flist, NULL))) {
		return error;
	}
	ASSERT(nmap <= 1);
	if (nmap == 1) {
		mapp = &map;
		mapi = 1;
	}
	/*
	 * If we didn't get it and the block might work if fragmented,
	 * try without the CONTIG flag.  Loop until we get it all.
	 */
	else if (nmap == 0 && count > 1) {
		mapp = kmem_alloc(sizeof(*mapp) * count, KM_SLEEP);
		for (b = bno, mapi = 0; b < bno + count; ) {
			nmap = MIN(XFS_BMAP_MAX_NMAP, count);
			c = (int)(bno + count - b);
			if ((error = xfs_bmapi(tp, dp, b, c,
					XFS_BMAPI_AFLAG(w)|XFS_BMAPI_WRITE|
					XFS_BMAPI_METADATA,
					args->firstblock, args->total,
					&mapp[mapi], &nmap, args->flist,
					NULL))) {
				kmem_free(mapp, sizeof(*mapp) * count);
				return error;
			}
			if (nmap < 1)
				break;
			mapi += nmap;
			b = mapp[mapi - 1].br_startoff +
			    mapp[mapi - 1].br_blockcount;
		}
	} else {
		mapi = 0;
		mapp = NULL;
	}
	/*
	 * Count the blocks we got, make sure it matches the total.
	 */
	for (i = 0, got = 0; i < mapi; i++)
		got += mapp[i].br_blockcount;
	if (got != count || mapp[0].br_startoff != bno ||
	    mapp[mapi - 1].br_startoff + mapp[mapi - 1].br_blockcount !=
	    bno + count) {
		if (mapp != &map)
			kmem_free(mapp, sizeof(*mapp) * count);
		return XFS_ERROR(ENOSPC);
	}
	if (mapp != &map)
		kmem_free(mapp, sizeof(*mapp) * count);
	*new_blkno = (xfs_dablk_t)bno;
	return 0;
}

/*
 * Ick.  We need to always be able to remove a btree block, even
 * if there's no space reservation because the filesystem is full.
 * This is called if xfs_bunmapi on a btree block fails due to ENOSPC.
 * It swaps the target block with the last block in the file.  The
 * last block in the file can always be removed since removing it
 * can't cause a bmap btree split.
 */
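/*
 * In outline: copy the last block's contents over the dead block, fix
 * the forw/back pointers of the last block's old siblings to point at
 * the dead block's location, then walk down from the root to find the
 * parent entry naming the last block and repoint it as well.  The
 * caller is then free to trim the (now unreferenced) last block off
 * the end of the file.
 */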
1640STATIC int
1641xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop,
1642		      xfs_dabuf_t **dead_bufp)
1643{
1644	xfs_dablk_t dead_blkno, last_blkno, sib_blkno, par_blkno;
1645	xfs_dabuf_t *dead_buf, *last_buf, *sib_buf, *par_buf;
1646	xfs_fileoff_t lastoff;
1647	xfs_inode_t *ip;
1648	xfs_trans_t *tp;
1649	xfs_mount_t *mp;
1650	int error, w, entno, level, dead_level;
1651	xfs_da_blkinfo_t *dead_info, *sib_info;
1652	xfs_da_intnode_t *par_node, *dead_node;
1653	xfs_dir2_leaf_t *dead_leaf2;
1654	xfs_dahash_t dead_hash;
1655
1656	dead_buf = *dead_bufp;
1657	dead_blkno = *dead_blknop;
1658	tp = args->trans;
1659	ip = args->dp;
1660	w = args->whichfork;
1661	ASSERT(w == XFS_DATA_FORK);
1662	mp = ip->i_mount;
1663	lastoff = mp->m_dirfreeblk;
1664	error = xfs_bmap_last_before(tp, ip, &lastoff, w);
1665	if (error)
1666		return error;
1667	if (unlikely(lastoff == 0)) {
1668		XFS_ERROR_REPORT("xfs_da_swap_lastblock(1)", XFS_ERRLEVEL_LOW,
1669				 mp);
1670		return XFS_ERROR(EFSCORRUPTED);
1671	}
1672	/*
1673	 * Read the last block in the btree space.
1674	 */
1675	last_blkno = (xfs_dablk_t)lastoff - mp->m_dirblkfsbs;
1676	if ((error = xfs_da_read_buf(tp, ip, last_blkno, -1, &last_buf, w)))
1677		return error;
1678	/*
1679	 * Copy the last block into the dead buffer and log it.
1680	 */
1681	memcpy(dead_buf->data, last_buf->data, mp->m_dirblksize);
1682	xfs_da_log_buf(tp, dead_buf, 0, mp->m_dirblksize - 1);
1683	dead_info = dead_buf->data;
1684	/*
1685	 * Get values from the moved block.
1686	 */
1687	if (be16_to_cpu(dead_info->magic) == XFS_DIR2_LEAFN_MAGIC) {
1688		dead_leaf2 = (xfs_dir2_leaf_t *)dead_info;
1689		dead_level = 0;
1690		dead_hash = be32_to_cpu(dead_leaf2->ents[be16_to_cpu(dead_leaf2->hdr.count) - 1].hashval);
1691	} else {
1692		ASSERT(be16_to_cpu(dead_info->magic) == XFS_DA_NODE_MAGIC);
1693		dead_node = (xfs_da_intnode_t *)dead_info;
1694		dead_level = be16_to_cpu(dead_node->hdr.level);
1695		dead_hash = be32_to_cpu(dead_node->btree[be16_to_cpu(dead_node->hdr.count) - 1].hashval);
1696	}
	sib_buf = par_buf = NULL;
	/*
	 * If the moved block has a left sibling, fix up the pointers.
	 */
	if ((sib_blkno = be32_to_cpu(dead_info->back))) {
		if ((error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w)))
			goto done;
		sib_info = sib_buf->data;
		if (unlikely(
		    be32_to_cpu(sib_info->forw) != last_blkno ||
		    sib_info->magic != dead_info->magic)) {
			XFS_ERROR_REPORT("xfs_da_swap_lastblock(2)",
					 XFS_ERRLEVEL_LOW, mp);
			error = XFS_ERROR(EFSCORRUPTED);
			goto done;
		}
		sib_info->forw = cpu_to_be32(dead_blkno);
		xfs_da_log_buf(tp, sib_buf,
			XFS_DA_LOGRANGE(sib_info, &sib_info->forw,
					sizeof(sib_info->forw)));
		xfs_da_buf_done(sib_buf);
		sib_buf = NULL;
	}
	/*
	 * If the moved block has a right sibling, fix up the pointers.
	 */
	if ((sib_blkno = be32_to_cpu(dead_info->forw))) {
		if ((error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w)))
			goto done;
		sib_info = sib_buf->data;
		if (unlikely(
		       be32_to_cpu(sib_info->back) != last_blkno ||
		       sib_info->magic != dead_info->magic)) {
			XFS_ERROR_REPORT("xfs_da_swap_lastblock(3)",
					 XFS_ERRLEVEL_LOW, mp);
			error = XFS_ERROR(EFSCORRUPTED);
			goto done;
		}
		sib_info->back = cpu_to_be32(dead_blkno);
		xfs_da_log_buf(tp, sib_buf,
			XFS_DA_LOGRANGE(sib_info, &sib_info->back,
					sizeof(sib_info->back)));
		xfs_da_buf_done(sib_buf);
		sib_buf = NULL;
	}
	par_blkno = mp->m_dirleafblk;
	level = -1;
	/*
	 * Walk down the tree looking for the parent of the moved block.
	 */
	for (;;) {
		if ((error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w)))
			goto done;
		par_node = par_buf->data;
		if (unlikely(
		    be16_to_cpu(par_node->hdr.info.magic) != XFS_DA_NODE_MAGIC ||
		    (level >= 0 && level != be16_to_cpu(par_node->hdr.level) + 1))) {
			XFS_ERROR_REPORT("xfs_da_swap_lastblock(4)",
					 XFS_ERRLEVEL_LOW, mp);
			error = XFS_ERROR(EFSCORRUPTED);
			goto done;
		}
		level = be16_to_cpu(par_node->hdr.level);
		for (entno = 0;
		     entno < be16_to_cpu(par_node->hdr.count) &&
		     be32_to_cpu(par_node->btree[entno].hashval) < dead_hash;
		     entno++)
			continue;
		if (unlikely(entno == be16_to_cpu(par_node->hdr.count))) {
			XFS_ERROR_REPORT("xfs_da_swap_lastblock(5)",
					 XFS_ERRLEVEL_LOW, mp);
			error = XFS_ERROR(EFSCORRUPTED);
			goto done;
		}
		par_blkno = be32_to_cpu(par_node->btree[entno].before);
		if (level == dead_level + 1)
			break;
		xfs_da_brelse(tp, par_buf);
		par_buf = NULL;
	}
	/*
	 * We're in the right parent block.
	 * Look for the right entry.
	 */
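	/*
	 * Duplicate hashvals can span sibling nodes, so the entry whose
	 * child pointer is last_blkno may sit to the right of where the
	 * hash search landed; if the scan runs off the end of the
	 * current node, follow the forw links along this level.
	 */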
	for (;;) {
		for (;
		     entno < be16_to_cpu(par_node->hdr.count) &&
		     be32_to_cpu(par_node->btree[entno].before) != last_blkno;
		     entno++)
			continue;
		if (entno < be16_to_cpu(par_node->hdr.count))
			break;
		par_blkno = be32_to_cpu(par_node->hdr.info.forw);
		xfs_da_brelse(tp, par_buf);
		par_buf = NULL;
		if (unlikely(par_blkno == 0)) {
			XFS_ERROR_REPORT("xfs_da_swap_lastblock(6)",
					 XFS_ERRLEVEL_LOW, mp);
			error = XFS_ERROR(EFSCORRUPTED);
			goto done;
		}
		if ((error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w)))
			goto done;
		par_node = par_buf->data;
		if (unlikely(
		    be16_to_cpu(par_node->hdr.level) != level ||
		    be16_to_cpu(par_node->hdr.info.magic) != XFS_DA_NODE_MAGIC)) {
			XFS_ERROR_REPORT("xfs_da_swap_lastblock(7)",
					 XFS_ERRLEVEL_LOW, mp);
			error = XFS_ERROR(EFSCORRUPTED);
			goto done;
		}
		entno = 0;
	}
	/*
	 * Update the parent entry pointing to the moved block.
	 */
	par_node->btree[entno].before = cpu_to_be32(dead_blkno);
	xfs_da_log_buf(tp, par_buf,
		XFS_DA_LOGRANGE(par_node, &par_node->btree[entno].before,
				sizeof(par_node->btree[entno].before)));
	xfs_da_buf_done(par_buf);
	xfs_da_buf_done(dead_buf);
	*dead_blknop = last_blkno;
	*dead_bufp = last_buf;
	return 0;
done:
	if (par_buf)
		xfs_da_brelse(tp, par_buf);
	if (sib_buf)
		xfs_da_brelse(tp, sib_buf);
	xfs_da_brelse(tp, last_buf);
	return error;
}

/*
 * Remove a btree block from a directory or attribute.
 */
int
xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
		    xfs_dabuf_t *dead_buf)
{
	xfs_inode_t *dp;
	int done, error, w, count;
	xfs_trans_t *tp;
	xfs_mount_t *mp;

	dp = args->dp;
	w = args->whichfork;
	tp = args->trans;
	mp = dp->i_mount;
	if (w == XFS_DATA_FORK)
		count = mp->m_dirblkfsbs;
	else
		count = 1;
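	/*
	 * Unmapping a block in the middle of the file may require a
	 * bmap btree split, which fails with ENOSPC when the transaction
	 * has no block reservation left.  For directories we can still
	 * make progress: swap the file's last block into the dead
	 * block's position so the unmap comes off the end of the file
	 * and needs no split.
	 */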
	for (;;) {
		/*
		 * Remove extents.  If we get ENOSPC for a dir we have to move
		 * the last block to the place we want to kill.
		 */
		if ((error = xfs_bunmapi(tp, dp, dead_blkno, count,
				XFS_BMAPI_AFLAG(w)|XFS_BMAPI_METADATA,
				0, args->firstblock, args->flist, NULL,
				&done)) == ENOSPC) {
			if (w != XFS_DATA_FORK)
				break;
			if ((error = xfs_da_swap_lastblock(args, &dead_blkno,
					&dead_buf)))
				break;
		} else {
			break;
		}
	}
	xfs_da_binval(tp, dead_buf);
	return error;
}

/*
 * See if the mapping(s) for this btree block are valid, i.e.
 * don't contain holes, are logically contiguous, and cover the whole range.
 */
STATIC int
xfs_da_map_covers_blocks(
	int		nmap,
	xfs_bmbt_irec_t	*mapp,
	xfs_dablk_t	bno,
	int		count)
{
	int		i;
	xfs_fileoff_t	off;

	for (i = 0, off = bno; i < nmap; i++) {
		if (mapp[i].br_startblock == HOLESTARTBLOCK ||
		    mapp[i].br_startblock == DELAYSTARTBLOCK) {
			return 0;
		}
		if (off != mapp[i].br_startoff) {
			return 0;
		}
		off += mapp[i].br_blockcount;
	}
	return off == bno + count;
}

/*
 * Make a dabuf.
 * Used for get_buf, read_buf, read_bufr, and reada_buf.
 */
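/*
 * The caller argument selects the operation: 0 = get_buf (no read),
 * 1 = read_buf (read and verify the block's magic number), 2 = read
 * without verification, and 3 = readahead only (no buffer returned).
 */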
STATIC int
xfs_da_do_buf(
	xfs_trans_t	*trans,
	xfs_inode_t	*dp,
	xfs_dablk_t	bno,
	xfs_daddr_t	*mappedbnop,
	xfs_dabuf_t	**bpp,
	int		whichfork,
	int		caller,
	inst_t		*ra)
{
	xfs_buf_t	*bp = NULL;
	xfs_buf_t	**bplist;
	int		error = 0;
	int		i;
	xfs_bmbt_irec_t	map;
	xfs_bmbt_irec_t	*mapp;
	xfs_daddr_t	mappedbno;
	xfs_mount_t	*mp;
	int		nbplist = 0;
	int		nfsb;
	int		nmap;
	xfs_dabuf_t	*rbp;

	mp = dp->i_mount;
	nfsb = (whichfork == XFS_DATA_FORK) ? mp->m_dirblkfsbs : 1;
	mappedbno = *mappedbnop;
	/*
	 * Caller doesn't have a mapping.  -2 means don't complain
	 * if we land in a hole.
	 */
	if (mappedbno == -1 || mappedbno == -2) {
		/*
		 * Optimize the one-block case.
		 */
		if (nfsb == 1) {
			xfs_fsblock_t	fsb;

			if ((error =
			    xfs_bmapi_single(trans, dp, whichfork, &fsb,
				    (xfs_fileoff_t)bno))) {
				return error;
			}
			mapp = &map;
			if (fsb == NULLFSBLOCK) {
				nmap = 0;
			} else {
				map.br_startblock = fsb;
				map.br_startoff = (xfs_fileoff_t)bno;
				map.br_blockcount = 1;
				nmap = 1;
			}
		} else {
			mapp = kmem_alloc(sizeof(*mapp) * nfsb, KM_SLEEP);
			nmap = nfsb;
			if ((error = xfs_bmapi(trans, dp, (xfs_fileoff_t)bno,
					nfsb,
					XFS_BMAPI_METADATA |
						XFS_BMAPI_AFLAG(whichfork),
					NULL, 0, mapp, &nmap, NULL, NULL)))
				goto exit0;
		}
	} else {
		map.br_startblock = XFS_DADDR_TO_FSB(mp, mappedbno);
		map.br_startoff = (xfs_fileoff_t)bno;
		map.br_blockcount = nfsb;
		mapp = &map;
		nmap = 1;
	}
	if (!xfs_da_map_covers_blocks(nmap, mapp, bno, nfsb)) {
		error = mappedbno == -2 ? 0 : XFS_ERROR(EFSCORRUPTED);
		if (unlikely(error == EFSCORRUPTED)) {
			if (xfs_error_level >= XFS_ERRLEVEL_LOW) {
				int	i;
				cmn_err(CE_ALERT, "xfs_da_do_buf: bno %lld\n",
					(long long)bno);
				cmn_err(CE_ALERT, "dir: inode %lld\n",
					(long long)dp->i_ino);
				for (i = 0; i < nmap; i++) {
					cmn_err(CE_ALERT,
						"[%02d] br_startoff %lld br_startblock %lld br_blockcount %lld br_state %d\n",
						i,
						(long long)mapp[i].br_startoff,
						(long long)mapp[i].br_startblock,
						(long long)mapp[i].br_blockcount,
						mapp[i].br_state);
				}
			}
			XFS_ERROR_REPORT("xfs_da_do_buf(1)",
					 XFS_ERRLEVEL_LOW, mp);
		}
		goto exit0;
	}
	if (caller != 3 && nmap > 1) {
		bplist = kmem_alloc(sizeof(*bplist) * nmap, KM_SLEEP);
		nbplist = 0;
	} else
		bplist = NULL;
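	/*
	 * When the block maps to more than one extent, collect the
	 * component buffers in bplist so they can be wrapped into a
	 * single dabuf below.  Readahead (caller 3) returns no buffers
	 * and never needs the list.
	 */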
	/*
	 * Turn the mapping(s) into buffer(s).
	 */
	for (i = 0; i < nmap; i++) {
		int	nmapped;

		mappedbno = XFS_FSB_TO_DADDR(mp, mapp[i].br_startblock);
		if (i == 0)
			*mappedbnop = mappedbno;
		nmapped = (int)XFS_FSB_TO_BB(mp, mapp[i].br_blockcount);
		switch (caller) {
		case 0:
			bp = xfs_trans_get_buf(trans, mp->m_ddev_targp,
				mappedbno, nmapped, 0);
			error = bp ? XFS_BUF_GETERROR(bp) : XFS_ERROR(EIO);
			break;
		case 1:
		case 2:
			bp = NULL;
			error = xfs_trans_read_buf(mp, trans, mp->m_ddev_targp,
				mappedbno, nmapped, 0, &bp);
			break;
		case 3:
			xfs_baread(mp->m_ddev_targp, mappedbno, nmapped);
			error = 0;
			bp = NULL;
			break;
		}
		if (error) {
			if (bp)
				xfs_trans_brelse(trans, bp);
			goto exit1;
		}
		if (!bp)
			continue;
		if (caller == 1) {
			if (whichfork == XFS_ATTR_FORK) {
				XFS_BUF_SET_VTYPE_REF(bp, B_FS_ATTR_BTREE,
						XFS_ATTR_BTREE_REF);
			} else {
				XFS_BUF_SET_VTYPE_REF(bp, B_FS_DIR_BTREE,
						XFS_DIR_BTREE_REF);
			}
		}
		if (bplist) {
			bplist[nbplist++] = bp;
		}
	}
	/*
	 * Build a dabuf structure.
	 */
	if (bplist) {
		rbp = xfs_da_buf_make(nbplist, bplist, ra);
	} else if (bp)
		rbp = xfs_da_buf_make(1, &bp, ra);
	else
		rbp = NULL;
	/*
	 * For read_buf, check the magic number.
	 */
	if (caller == 1) {
		xfs_dir2_data_t		*data;
		xfs_dir2_free_t		*free;
		xfs_da_blkinfo_t	*info;
		uint			magic, magic1;

		info = rbp->data;
		data = rbp->data;
		free = rbp->data;
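		/*
		 * info, data and free all alias the same block image;
		 * which view is valid is decided by the magic-number
		 * tests below.
		 */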
		magic = be16_to_cpu(info->magic);
		magic1 = be32_to_cpu(data->hdr.magic);
		if (unlikely(
		    XFS_TEST_ERROR((magic != XFS_DA_NODE_MAGIC) &&
				   (magic != XFS_ATTR_LEAF_MAGIC) &&
				   (magic != XFS_DIR2_LEAF1_MAGIC) &&
				   (magic != XFS_DIR2_LEAFN_MAGIC) &&
				   (magic1 != XFS_DIR2_BLOCK_MAGIC) &&
				   (magic1 != XFS_DIR2_DATA_MAGIC) &&
				   (be32_to_cpu(free->hdr.magic) != XFS_DIR2_FREE_MAGIC),
				mp, XFS_ERRTAG_DA_READ_BUF,
				XFS_RANDOM_DA_READ_BUF))) {
			xfs_buftrace("DA READ ERROR", rbp->bps[0]);
			XFS_CORRUPTION_ERROR("xfs_da_do_buf(2)",
					     XFS_ERRLEVEL_LOW, mp, info);
			error = XFS_ERROR(EFSCORRUPTED);
			xfs_da_brelse(trans, rbp);
			nbplist = 0;
			goto exit1;
		}
	}
	if (bplist) {
		kmem_free(bplist, sizeof(*bplist) * nmap);
	}
	if (mapp != &map) {
		kmem_free(mapp, sizeof(*mapp) * nfsb);
	}
	if (bpp)
		*bpp = rbp;
	return 0;
exit1:
	if (bplist) {
		for (i = 0; i < nbplist; i++)
			xfs_trans_brelse(trans, bplist[i]);
		kmem_free(bplist, sizeof(*bplist) * nmap);
	}
exit0:
	if (mapp != &map)
		kmem_free(mapp, sizeof(*mapp) * nfsb);
	if (bpp)
		*bpp = NULL;
	return error;
}

/*
 * Get a buffer for the dir/attr block.
 */
int
xfs_da_get_buf(
	xfs_trans_t	*trans,
	xfs_inode_t	*dp,
	xfs_dablk_t	bno,
	xfs_daddr_t	mappedbno,
	xfs_dabuf_t	**bpp,
	int		whichfork)
{
	return xfs_da_do_buf(trans, dp, bno, &mappedbno, bpp, whichfork, 0,
						 (inst_t *)__return_address);
}

/*
 * Get a buffer for the dir/attr block, fill in the contents.
 */
int
xfs_da_read_buf(
	xfs_trans_t	*trans,
	xfs_inode_t	*dp,
	xfs_dablk_t	bno,
	xfs_daddr_t	mappedbno,
	xfs_dabuf_t	**bpp,
	int		whichfork)
{
	return xfs_da_do_buf(trans, dp, bno, &mappedbno, bpp, whichfork, 1,
		(inst_t *)__return_address);
}
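
/*
 * A minimal usage sketch (hypothetical caller, not code in this file):
 * read one directory block with no cached mapping (mappedbno == -1),
 * inspect it, then hand it back to the transaction:
 *
 *	xfs_dabuf_t	*bp;
 *	int		error;
 *
 *	error = xfs_da_read_buf(tp, dp, blkno, -1, &bp, XFS_DATA_FORK);
 *	if (error)
 *		return error;
 *	... examine bp->data ...
 *	xfs_da_brelse(tp, bp);
 */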

/*
 * Readahead the dir/attr block.
 */
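/*
 * Returns the disk address of the block so the caller can hand it back
 * as mappedbno on a later xfs_da_read_buf of the same block and skip
 * the bmap lookup; returns -1 if the readahead could not be issued.
 */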
xfs_daddr_t
xfs_da_reada_buf(
	xfs_trans_t	*trans,
	xfs_inode_t	*dp,
	xfs_dablk_t	bno,
	int		whichfork)
{
	xfs_daddr_t	rval;

	rval = -1;
	if (xfs_da_do_buf(trans, dp, bno, &rval, NULL, whichfork, 3,
			(inst_t *)__return_address))
		return -1;
	else
		return rval;
}

kmem_zone_t *xfs_da_state_zone;	/* anchor for state struct zone */
kmem_zone_t *xfs_dabuf_zone;		/* dabuf zone */

/*
 * Allocate a dir-state structure.
 * We don't put them on the stack since they're large.
 */
xfs_da_state_t *
xfs_da_state_alloc(void)
{
	return kmem_zone_zalloc(xfs_da_state_zone, KM_SLEEP);
}

/*
 * Kill the altpath contents of a da-state structure.
 */
STATIC void
xfs_da_state_kill_altpath(xfs_da_state_t *state)
{
	int	i;

	for (i = 0; i < state->altpath.active; i++) {
		if (state->altpath.blk[i].bp) {
			if (state->altpath.blk[i].bp != state->path.blk[i].bp)
				xfs_da_buf_done(state->altpath.blk[i].bp);
			state->altpath.blk[i].bp = NULL;
		}
	}
	state->altpath.active = 0;
}

/*
 * Free a da-state structure.
 */
void
xfs_da_state_free(xfs_da_state_t *state)
{
	int	i;

	xfs_da_state_kill_altpath(state);
	for (i = 0; i < state->path.active; i++) {
		if (state->path.blk[i].bp)
			xfs_da_buf_done(state->path.blk[i].bp);
	}
	if (state->extravalid && state->extrablk.bp)
		xfs_da_buf_done(state->extrablk.bp);
#ifdef DEBUG
	memset((char *)state, 0, sizeof(*state));
#endif /* DEBUG */
	kmem_zone_free(xfs_da_state_zone, state);
}

#ifdef XFS_DABUF_DEBUG
xfs_dabuf_t	*xfs_dabuf_global_list;
lock_t		xfs_dabuf_global_lock;
#endif

/*
 * Create a dabuf.
 */
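/*
 * A dabuf hides the fact that a directory block can span several
 * filesystem blocks, and hence several discontiguous buffers.  For
 * nbuf > 1 the component buffers' contents are copied into one
 * contiguous allocation so callers see a flat block image; dirty data
 * is copied back out to the real buffers by xfs_da_buf_clean once the
 * range has been logged.
 */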
/* ARGSUSED */
STATIC xfs_dabuf_t *
xfs_da_buf_make(int nbuf, xfs_buf_t **bps, inst_t *ra)
{
	xfs_buf_t	*bp;
	xfs_dabuf_t	*dabuf;
	int		i;
	int		off;

	if (nbuf == 1)
		dabuf = kmem_zone_alloc(xfs_dabuf_zone, KM_SLEEP);
	else
		dabuf = kmem_alloc(XFS_DA_BUF_SIZE(nbuf), KM_SLEEP);
	dabuf->dirty = 0;
#ifdef XFS_DABUF_DEBUG
	dabuf->ra = ra;
	dabuf->target = XFS_BUF_TARGET(bps[0]);
	dabuf->blkno = XFS_BUF_ADDR(bps[0]);
#endif
	if (nbuf == 1) {
		dabuf->nbuf = 1;
		bp = bps[0];
		dabuf->bbcount = (short)BTOBB(XFS_BUF_COUNT(bp));
		dabuf->data = XFS_BUF_PTR(bp);
		dabuf->bps[0] = bp;
	} else {
		dabuf->nbuf = nbuf;
		for (i = 0, dabuf->bbcount = 0; i < nbuf; i++) {
			dabuf->bps[i] = bp = bps[i];
			dabuf->bbcount += BTOBB(XFS_BUF_COUNT(bp));
		}
		dabuf->data = kmem_alloc(BBTOB(dabuf->bbcount), KM_SLEEP);
		for (i = off = 0; i < nbuf; i++, off += XFS_BUF_COUNT(bp)) {
			bp = bps[i];
			memcpy((char *)dabuf->data + off, XFS_BUF_PTR(bp),
				XFS_BUF_COUNT(bp));
		}
	}
#ifdef XFS_DABUF_DEBUG
	{
		SPLDECL(s);
		xfs_dabuf_t	*p;

		s = mutex_spinlock(&xfs_dabuf_global_lock);
		for (p = xfs_dabuf_global_list; p; p = p->next) {
			ASSERT(p->blkno != dabuf->blkno ||
			       p->target != dabuf->target);
		}
		dabuf->prev = NULL;
		if (xfs_dabuf_global_list)
			xfs_dabuf_global_list->prev = dabuf;
		dabuf->next = xfs_dabuf_global_list;
		xfs_dabuf_global_list = dabuf;
		mutex_spinunlock(&xfs_dabuf_global_lock, s);
	}
#endif
	return dabuf;
}

/*
 * Un-dirty a dabuf.
 */
STATIC void
xfs_da_buf_clean(xfs_dabuf_t *dabuf)
{
	xfs_buf_t	*bp;
	int		i;
	int		off;

	if (dabuf->dirty) {
		ASSERT(dabuf->nbuf > 1);
		dabuf->dirty = 0;
		for (i = off = 0; i < dabuf->nbuf;
				i++, off += XFS_BUF_COUNT(bp)) {
			bp = dabuf->bps[i];
			memcpy(XFS_BUF_PTR(bp), (char *)dabuf->data + off,
				XFS_BUF_COUNT(bp));
		}
	}
}

/*
 * Release a dabuf.
 */
void
xfs_da_buf_done(xfs_dabuf_t *dabuf)
{
	int	nbuf;

	ASSERT(dabuf);
	ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]);
	nbuf = dabuf->nbuf;
	if (dabuf->dirty)
		xfs_da_buf_clean(dabuf);
	if (nbuf > 1)
		kmem_free(dabuf->data, BBTOB(dabuf->bbcount));
#ifdef XFS_DABUF_DEBUG
	{
		SPLDECL(s);

		s = mutex_spinlock(&xfs_dabuf_global_lock);
		if (dabuf->prev)
			dabuf->prev->next = dabuf->next;
		else
			xfs_dabuf_global_list = dabuf->next;
		if (dabuf->next)
			dabuf->next->prev = dabuf->prev;
		mutex_spinunlock(&xfs_dabuf_global_lock, s);
	}
	/*
	 * The memset wipes dabuf->nbuf, so use the copy saved above
	 * when picking the allocator and size for the free below.
	 */
	memset(dabuf, 0, XFS_DA_BUF_SIZE(nbuf));
#endif
	if (nbuf == 1)
		kmem_zone_free(xfs_dabuf_zone, dabuf);
	else
		kmem_free(dabuf, XFS_DA_BUF_SIZE(nbuf));
}

/*
 * Log a dabuf's changes to the transaction.
 */
void
xfs_da_log_buf(xfs_trans_t *tp, xfs_dabuf_t *dabuf, uint first, uint last)
{
	xfs_buf_t	*bp;
	uint		f;
	int		i;
	uint		l;
	int		off;

	ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]);
	if (dabuf->nbuf == 1) {
		ASSERT(dabuf->data == (void *)XFS_BUF_PTR(dabuf->bps[0]));
		xfs_trans_log_buf(tp, dabuf->bps[0], first, last);
		return;
	}
	dabuf->dirty = 1;
	ASSERT(first <= last);
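	/*
	 * Multi-buffer case: walk the component buffers, clip the
	 * logical byte range [first, last] to each buffer's slice, and
	 * log the clipped range relative to that buffer's own start.
	 */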
	for (i = off = 0; i < dabuf->nbuf; i++, off += XFS_BUF_COUNT(bp)) {
		bp = dabuf->bps[i];
		f = off;
		l = f + XFS_BUF_COUNT(bp) - 1;
		if (f < first)
			f = first;
		if (l > last)
			l = last;
		if (f <= l)
			xfs_trans_log_buf(tp, bp, f - off, l - off);
		/*
		 * B_DONE is set by xfs_trans_log_buf.
		 * If we don't set it on a new buffer (get, not read),
		 * and nothing is ever logged into that buffer, it stays
		 * unset; at commit the buffer is released into the
		 * cache, and a subsequent read of it will fail.
		 */
		else if (!(XFS_BUF_ISDONE(bp)))
			XFS_BUF_DONE(bp);
	}
	ASSERT(last < off);
}

/*
 * Release dabuf from a transaction.
 * Have to free up the dabuf before the buffers are released,
 * since the synchronization on the dabuf is really the lock on the buffer.
 */
void
xfs_da_brelse(xfs_trans_t *tp, xfs_dabuf_t *dabuf)
{
	xfs_buf_t	*bp;
	xfs_buf_t	**bplist;
	int		i;
	int		nbuf;

	ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]);
	if ((nbuf = dabuf->nbuf) == 1) {
		bplist = &bp;
		bp = dabuf->bps[0];
	} else {
		bplist = kmem_alloc(nbuf * sizeof(*bplist), KM_SLEEP);
		memcpy(bplist, dabuf->bps, nbuf * sizeof(*bplist));
	}
	xfs_da_buf_done(dabuf);
	for (i = 0; i < nbuf; i++)
		xfs_trans_brelse(tp, bplist[i]);
	if (bplist != &bp)
		kmem_free(bplist, nbuf * sizeof(*bplist));
}

/*
 * Invalidate dabuf from a transaction.
 */
void
xfs_da_binval(xfs_trans_t *tp, xfs_dabuf_t *dabuf)
{
	xfs_buf_t	*bp;
	xfs_buf_t	**bplist;
	int		i;
	int		nbuf;

	ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]);
	if ((nbuf = dabuf->nbuf) == 1) {
		bplist = &bp;
		bp = dabuf->bps[0];
	} else {
		bplist = kmem_alloc(nbuf * sizeof(*bplist), KM_SLEEP);
		memcpy(bplist, dabuf->bps, nbuf * sizeof(*bplist));
	}
	xfs_da_buf_done(dabuf);
	for (i = 0; i < nbuf; i++)
		xfs_trans_binval(tp, bplist[i]);
	if (bplist != &bp)
		kmem_free(bplist, nbuf * sizeof(*bplist));
}

/*
 * Get the first daddr from a dabuf.
 */
xfs_daddr_t
xfs_da_blkno(xfs_dabuf_t *dabuf)
{
	ASSERT(dabuf->nbuf);
	ASSERT(dabuf->data);
	return XFS_BUF_ADDR(dabuf->bps[0]);
}