1/*
2 * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
3 */
4
5/**
6 ** old_item_num
7 ** old_entry_num
8 ** set_entry_sizes
9 ** create_virtual_node
10 ** check_left
11 ** check_right
12 ** directory_part_size
13 ** get_num_ver
14 ** set_parameters
15 ** is_leaf_removable
16 ** are_leaves_removable
17 ** get_empty_nodes
18 ** get_lfree
19 ** get_rfree
20 ** is_left_neighbor_in_cache
21 ** decrement_key
22 ** get_far_parent
23 ** get_parents
24 ** can_node_be_removed
25 ** ip_check_balance
26 ** dc_check_balance_internal
27 ** dc_check_balance_leaf
28 ** dc_check_balance
29 ** check_balance
30 ** get_direct_parent
31 ** get_neighbors
32 ** fix_nodes
33 **
34 **
35 **/
36
37#include <linux/time.h>
38#include <linux/string.h>
39#include <linux/reiserfs_fs.h>
40#include <linux/buffer_head.h>
41
42/* To make any changes in the tree we find a node, that contains item
43   to be changed/deleted or position in the node we insert a new item
44   to. We call this node S. To do balancing we need to decide what we
45   will shift to left/right neighbor, or to a new node, where new item
46   will be etc. To make this analysis simpler we build virtual
47   node. Virtual node is an array of items, that will replace items of
48   node S. (For instance if we are going to delete an item, virtual
49   node does not contain it). Virtual node keeps information about
50   item sizes and types, mergeability of first and last items, sizes
51   of all entries in directory item. We use this array of items when
52   calculating what we can shift to neighbors and how many nodes we
   have to have if we do not do any shiftings, if we shift to left/right
54   neighbor or to both. */
55
56/* taking item number in virtual node, returns number of item, that it has in source buffer */
57static inline int old_item_num(int new_num, int affected_item_num, int mode)
58{
59	if (mode == M_PASTE || mode == M_CUT || new_num < affected_item_num)
60		return new_num;
61
62	if (mode == M_INSERT) {
63
64		RFALSE(new_num == 0,
65		       "vs-8005: for INSERT mode and item number of inserted item");
66
67		return new_num - 1;
68	}
69
70	RFALSE(mode != M_DELETE,
71	       "vs-8010: old_item_num: mode must be M_DELETE (mode = \'%c\'",
72	       mode);
73	/* delete mode */
74	return new_num + 1;
75}
76
/*
 * Build the "virtual node": an array of virtual items describing how
 * S[h] will look after the pending operation (insert/delete/cut/paste)
 * is applied.  The balancing analysis works on this array instead of
 * the real node.  For internal levels only the resulting item count is
 * computed; for the leaf level each virtual item records its length
 * and whether it can merge with a neighbor's boundary item.
 */
static void create_virtual_node(struct tree_balance *tb, int h)
{
	struct item_head *ih;
	struct virtual_node *vn = tb->tb_vn;
	int new_num;
	struct buffer_head *Sh;	/* this comes from tb->S[h] */

	Sh = PATH_H_PBUFFER(tb->tb_path, h);

	/* size of changed node: current used space plus the pending
	   size change (insert_size is negative for delete/cut) */
	vn->vn_size =
	    MAX_CHILD_SIZE(Sh) - B_FREE_SPACE(Sh) + tb->insert_size[h];

	/* for internal nodes the array of virtual items is not created */
	if (h) {
		vn->vn_nr_item = (vn->vn_size - DC_SIZE) / (DC_SIZE + KEY_SIZE);
		return;
	}

	/* number of items in virtual node: insert adds one, delete removes one */
	vn->vn_nr_item =
	    B_NR_ITEMS(Sh) + ((vn->vn_mode == M_INSERT) ? 1 : 0) -
	    ((vn->vn_mode == M_DELETE) ? 1 : 0);

	/* first virtual item lives directly after the virtual_node header
	   in the preallocated scratch buffer */
	vn->vn_vi = (struct virtual_item *)(tb->tb_vn + 1);
	memset(vn->vn_vi, 0, vn->vn_nr_item * sizeof(struct virtual_item));
	vn->vn_free_ptr += vn->vn_nr_item * sizeof(struct virtual_item);

	/* first item in the node */
	ih = B_N_PITEM_HEAD(Sh, 0);

	/* define the mergeability for 0-th item (if it is not being deleted) */
	if (op_is_left_mergeable(&(ih->ih_key), Sh->b_size)
	    && (vn->vn_mode != M_DELETE || vn->vn_affected_item_num))
		vn->vn_vi[0].vi_type |= VI_TYPE_LEFT_MERGEABLE;

	/* go through all items that remain in the virtual node (except for the new (inserted) one) */
	for (new_num = 0; new_num < vn->vn_nr_item; new_num++) {
		int j;
		struct virtual_item *vi = vn->vn_vi + new_num;
		int is_affected =
		    ((new_num != vn->vn_affected_item_num) ? 0 : 1);

		/* the item being inserted has no source item; filled in below */
		if (is_affected && vn->vn_mode == M_INSERT)
			continue;

		/* get item number in source node */
		j = old_item_num(new_num, vn->vn_affected_item_num,
				 vn->vn_mode);

		vi->vi_item_len += ih_item_len(ih + j) + IH_SIZE;
		vi->vi_ih = ih + j;
		vi->vi_item = B_I_PITEM(Sh, ih + j);
		vi->vi_uarea = vn->vn_free_ptr;

		/* op_create_vi() consumes scratch memory for the per-item
		   user area; panic if we overrun the reserved buffer */
		vn->vn_free_ptr +=
		    op_create_vi(vn, vi, is_affected, tb->insert_size[0]);
		if (tb->vn_buf + tb->vn_buf_size < vn->vn_free_ptr)
			reiserfs_panic(tb->tb_sb,
				       "vs-8030: create_virtual_node: "
				       "virtual node space consumed");

		if (!is_affected)
			/* this is not being changed */
			continue;

		/* paste/cut change the affected item's length in place */
		if (vn->vn_mode == M_PASTE || vn->vn_mode == M_CUT) {
			vn->vn_vi[new_num].vi_item_len += tb->insert_size[0];
			vi->vi_new_data = vn->vn_data;	// pointer to data which is going to be pasted
		}
	}

	/* virtual inserted item is not defined yet */
	if (vn->vn_mode == M_INSERT) {
		struct virtual_item *vi = vn->vn_vi + vn->vn_affected_item_num;

		RFALSE(vn->vn_ins_ih == 0,
		       "vs-8040: item header of inserted item is not specified");
		vi->vi_item_len = tb->insert_size[0];
		vi->vi_ih = vn->vn_ins_ih;
		vi->vi_item = vn->vn_data;
		vi->vi_uarea = vn->vn_free_ptr;

		op_create_vi(vn, vi, 0 /*not pasted or cut */ ,
			     tb->insert_size[0]);
	}

	/* set right merge flag: take the right delimiting key and check
	   whether the last item is mergeable with it */
	if (tb->CFR[0]) {
		struct reiserfs_key *key;

		key = B_N_PDELIM_KEY(tb->CFR[0], tb->rkey[0]);
		if (op_is_left_mergeable(key, Sh->b_size)
		    && (vn->vn_mode != M_DELETE
			|| vn->vn_affected_item_num != B_NR_ITEMS(Sh) - 1))
			vn->vn_vi[vn->vn_nr_item - 1].vi_type |=
			    VI_TYPE_RIGHT_MERGEABLE;

#ifdef CONFIG_REISERFS_CHECK
		if (op_is_left_mergeable(key, Sh->b_size) &&
		    !(vn->vn_mode != M_DELETE
		      || vn->vn_affected_item_num != B_NR_ITEMS(Sh) - 1)) {
			/* we delete last item and it could be merged with right neighbor's first item */
			if (!
			    (B_NR_ITEMS(Sh) == 1
			     && is_direntry_le_ih(B_N_PITEM_HEAD(Sh, 0))
			     && I_ENTRY_COUNT(B_N_PITEM_HEAD(Sh, 0)) == 1)) {
				/* node contains more than 1 item, or item is not directory item, or this item contains more than 1 entry */
				print_block(Sh, 0, -1, -1);
				reiserfs_panic(tb->tb_sb,
					       "vs-8045: create_virtual_node: rdkey %k, affected item==%d (mode==%c) Must be %c",
					       key, vn->vn_affected_item_num,
					       vn->vn_mode, M_DELETE);
			}
		}
#endif

	}
}
198
/* Using the virtual node, work out how many items can be shifted into
   the left neighbor L[h] that has cur_free bytes of free space.
   Results go to tb->lnum[h] (whole items) and, on the leaf level,
   tb->lbytes (units of a partially shifted boundary item, or -1 when
   no item is split). */
static void check_left(struct tree_balance *tb, int h, int cur_free)
{
	int i;
	struct virtual_node *vn = tb->tb_vn;
	struct virtual_item *vi;
	int d_size, ih_size;

	RFALSE(cur_free < 0, "vs-8050: cur_free (%d) < 0", cur_free);

	/* internal level: every item costs one key plus one disk child */
	if (h > 0) {
		tb->lnum[h] = cur_free / (DC_SIZE + KEY_SIZE);
		return;
	}

	/* leaf level (h == 0 from here on) */

	if (!cur_free || !vn->vn_nr_item) {
		/* no free space or nothing to move */
		tb->lnum[h] = 0;
		tb->lbytes = -1;
		return;
	}

	RFALSE(!PATH_H_PPARENT(tb->tb_path, 0),
	       "vs-8055: parent does not exist or invalid");

	/* if the first item merges with L[0]'s last item, its item header
	   is not copied, saving IH_SIZE bytes */
	vi = vn->vn_vi;
	if ((unsigned int)cur_free >=
	    (vn->vn_size -
	     ((vi->vi_type & VI_TYPE_LEFT_MERGEABLE) ? IH_SIZE : 0))) {
		/* all contents of S[0] fits into L[0] */

		RFALSE(vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE,
		       "vs-8055: invalid mode or balance condition failed");

		tb->lnum[0] = vn->vn_nr_item;
		tb->lbytes = -1;
		return;
	}

	d_size = 0, ih_size = IH_SIZE;

	/* first item may be merged with last item in left neighbor */
	if (vi->vi_type & VI_TYPE_LEFT_MERGEABLE)
		d_size = -((int)IH_SIZE), ih_size = 0;

	/* walk items left to right, shifting while they fit entirely */
	tb->lnum[0] = 0;
	for (i = 0; i < vn->vn_nr_item;
	     i++, ih_size = IH_SIZE, d_size = 0, vi++) {
		d_size += vi->vi_item_len;
		if (cur_free >= d_size) {
			/* the item can be shifted entirely */
			cur_free -= d_size;
			tb->lnum[0]++;
			continue;
		}

		/* the item cannot be shifted entirely, try to split it */
		/* check whether L[0] can hold ih and at least one byte of the item body */
		if (cur_free <= ih_size) {
			/* cannot shift even a part of the current item */
			tb->lbytes = -1;
			return;
		}
		cur_free -= ih_size;

		tb->lbytes = op_check_left(vi, cur_free, 0, 0);
		if (tb->lbytes != -1)
			/* count partially shifted item */
			tb->lnum[0]++;

		break;
	}

	return;
}
278
/* Using the virtual node, work out how many items can be shifted into
   the right neighbor R[h] that has cur_free bytes of free space.
   Results go to tb->rnum[h] (whole items) and, on the leaf level,
   tb->rbytes (units of a partially shifted boundary item, or -1 when
   no item is split). */
static void check_right(struct tree_balance *tb, int h, int cur_free)
{
	int i;
	struct virtual_node *vn = tb->tb_vn;
	struct virtual_item *vi;
	int d_size, ih_size;

	RFALSE(cur_free < 0, "vs-8070: cur_free < 0");

	/* internal level: every item costs one key plus one disk child */
	if (h > 0) {
		tb->rnum[h] = cur_free / (DC_SIZE + KEY_SIZE);
		return;
	}

	/* leaf level (h == 0 from here on) */

	if (!cur_free || !vn->vn_nr_item) {
		/* no free space or nothing to move */
		tb->rnum[h] = 0;
		tb->rbytes = -1;
		return;
	}

	RFALSE(!PATH_H_PPARENT(tb->tb_path, 0),
	       "vs-8075: parent does not exist or invalid");

	/* if the last item merges with R[0]'s first item, its item header
	   is not copied, saving IH_SIZE bytes */
	vi = vn->vn_vi + vn->vn_nr_item - 1;
	if ((unsigned int)cur_free >=
	    (vn->vn_size -
	     ((vi->vi_type & VI_TYPE_RIGHT_MERGEABLE) ? IH_SIZE : 0))) {
		/* all contents of S[0] fits into R[0] */

		RFALSE(vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE,
		       "vs-8080: invalid mode or balance condition failed");

		tb->rnum[h] = vn->vn_nr_item;
		tb->rbytes = -1;
		return;
	}

	d_size = 0, ih_size = IH_SIZE;

	/* last item may be merged with first item in right neighbor */
	if (vi->vi_type & VI_TYPE_RIGHT_MERGEABLE)
		d_size = -(int)IH_SIZE, ih_size = 0;

	/* walk items from the rightmost towards the left, shifting while
	   they fit entirely */
	tb->rnum[0] = 0;
	for (i = vn->vn_nr_item - 1; i >= 0;
	     i--, d_size = 0, ih_size = IH_SIZE, vi--) {
		d_size += vi->vi_item_len;
		if (cur_free >= d_size) {
			/* the item can be shifted entirely */
			cur_free -= d_size;
			tb->rnum[0]++;
			continue;
		}

		/* check whether R[0] can hold ih and at least one byte of the item body */
		if (cur_free <= ih_size) {	/* cannot shift even a part of the current item */
			tb->rbytes = -1;
			return;
		}

		/* R[0] can hold the header of the item and at least one byte of its body */
		cur_free -= ih_size;	/* cur_free is still > 0 */

		tb->rbytes = op_check_right(vi, cur_free);
		if (tb->rbytes != -1)
			/* count partially shifted item */
			tb->rnum[0]++;

		break;
	}

	return;
}
358
/*
 * Calculate how many nodes are needed to hold the virtual items that
 * remain after shifting to the neighbors, and how those items are
 * distributed among S[0] and up to two new nodes.
 *
 * from - number of items, which are shifted to left neighbor entirely
 * to - number of item, which are shifted to right neighbor entirely
 * from_bytes - number of bytes of boundary item (or directory entries) which are shifted to left neighbor
 * to_bytes - number of bytes of boundary item (or directory entries) which are shifted to right neighbor
 *
 * On return snum012[0..2] hold the item counts for S[0], S1new and
 * S2new, snum012[3..4] hold s1bytes/s2bytes (units of the boundary
 * items split between the nodes, or -1 if no split).  Returns the
 * number of nodes needed.
 */
static int get_num_ver(int mode, struct tree_balance *tb, int h,
		       int from, int from_bytes,
		       int to, int to_bytes, short *snum012, int flow)
{
	int i;
	int cur_free;
	//    int bytes;
	int units;
	struct virtual_node *vn = tb->tb_vn;
	//    struct virtual_item * vi;

	int total_node_size, max_node_size, current_item_size;
	int needed_nodes;
	int start_item,		/* position of item we start filling node from */
	 end_item,		/* position of item we finish filling node by */
	 start_bytes,		/* number of first bytes (entries for directory) of start_item-th item
				   we do not include into node that is being filled */
	 end_bytes;		/* number of last bytes (entries for directory) of end_item-th item
				   we do not include into node that is being filled */
	int split_item_positions[2];	/* these are positions in virtual item of
					   items, that are split between S[0] and
					   S1new and S1new and S2new */

	split_item_positions[0] = -1;
	split_item_positions[1] = -1;

	/* We only create additional nodes if we are in insert or paste mode
	   or we are in replace mode at the internal level. If h is 0 and
	   the mode is M_REPLACE then in fix_nodes we change the mode to
	   paste or insert before we get here in the code.  */
	RFALSE(tb->insert_size[h] < 0 || (mode != M_INSERT && mode != M_PASTE),
	       "vs-8100: insert_size < 0 in overflow");

	max_node_size = MAX_CHILD_SIZE(PATH_H_PBUFFER(tb->tb_path, h));

	/* snum012 [0-2] - number of items, that lay
	   to S[0], first new node and second new node */
	snum012[3] = -1;	/* s1bytes */
	snum012[4] = -1;	/* s2bytes */

	/* internal level: fixed-size items, simple division */
	if (h > 0) {
		i = ((to - from) * (KEY_SIZE + DC_SIZE) + DC_SIZE);
		if (i == max_node_size)
			return 1;
		return (i / max_node_size + 1);
	}

	/* leaf level */
	needed_nodes = 1;
	total_node_size = 0;
	cur_free = max_node_size;

	// start from 'from'-th item
	start_item = from;
	// skip its first 'start_bytes' units (already shifted left)
	start_bytes = ((from_bytes != -1) ? from_bytes : 0);

	// last included item is the 'end_item'-th one
	end_item = vn->vn_nr_item - to - 1;
	// do not count last 'end_bytes' units of 'end_item'-th item
	// (already shifted right)
	end_bytes = (to_bytes != -1) ? to_bytes : 0;

	/* go through all items beginning from the start_item-th item and ending by
	   the end_item-th item. Do not count first 'start_bytes' units of
	   'start_item'-th item and last 'end_bytes' of 'end_item'-th item */

	for (i = start_item; i <= end_item; i++) {
		struct virtual_item *vi = vn->vn_vi + i;
		int skip_from_end = ((i == end_item) ? end_bytes : 0);

		RFALSE(needed_nodes > 3, "vs-8105: too many nodes are needed");

		/* get size of current item */
		current_item_size = vi->vi_item_len;

		/* do not take in calculation head part (from_bytes) of from-th item */
		current_item_size -=
		    op_part_size(vi, 0 /*from start */ , start_bytes);

		/* do not take in calculation tail part of last item */
		current_item_size -=
		    op_part_size(vi, 1 /*from end */ , skip_from_end);

		/* if item fits into current node entirely */
		if (total_node_size + current_item_size <= max_node_size) {
			snum012[needed_nodes - 1]++;
			total_node_size += current_item_size;
			start_bytes = 0;
			continue;
		}

		if (current_item_size > max_node_size) {
			/* virtual item length is longer, than max size of item in
			   a node. It is impossible for direct item */
			RFALSE(is_direct_le_ih(vi->vi_ih),
			       "vs-8110: "
			       "direct item length is %d. It can not be longer than %d",
			       current_item_size, max_node_size);
			/* we will try to split it */
			flow = 1;
		}

		if (!flow) {
			/* as we do not split items, take new node and continue */
			needed_nodes++;
			i--;
			total_node_size = 0;
			continue;
		}
		// calculate number of item units which fit into node being
		// filled
		{
			int free_space;

			free_space = max_node_size - total_node_size - IH_SIZE;
			units =
			    op_check_left(vi, free_space, start_bytes,
					  skip_from_end);
			if (units == -1) {
				/* nothing fits into current node, take new node and continue */
				needed_nodes++, i--, total_node_size = 0;
				continue;
			}
		}

		/* something fits into the current node */
		//if (snum012[3] != -1 || needed_nodes != 1)
		//  reiserfs_panic (tb->tb_sb, "vs-8115: get_num_ver: too many nodes required");
		//snum012[needed_nodes - 1 + 3] = op_unit_num (vi) - start_bytes - units;
		start_bytes += units;
		snum012[needed_nodes - 1 + 3] = units;

		if (needed_nodes > 2)
			reiserfs_warning(tb->tb_sb, "vs-8111: get_num_ver: "
					 "split_item_position is out of boundary");
		snum012[needed_nodes - 1]++;
		split_item_positions[needed_nodes - 1] = i;
		needed_nodes++;
		/* continue from the same item with start_bytes != -1 */
		start_item = i;
		i--;
		total_node_size = 0;
	}

	// snum012[4] (if it is not -1) currently holds the number of units
	// already placed before S2new; snum012[3] - before S1new.  They are
	// supposed to be S1bytes and S2bytes (units going INTO the new
	// nodes), so recalculate from the split item's total unit count.
	if (snum012[4] > 0) {
		int split_item_num;
		int bytes_to_r, bytes_to_l;
		int bytes_to_S1new;

		split_item_num = split_item_positions[1];
		/* units of the split item already sent to the left/right
		   neighbor or to S1new */
		bytes_to_l =
		    ((from == split_item_num
		      && from_bytes != -1) ? from_bytes : 0);
		bytes_to_r =
		    ((end_item == split_item_num
		      && end_bytes != -1) ? end_bytes : 0);
		bytes_to_S1new =
		    ((split_item_positions[0] ==
		      split_item_positions[1]) ? snum012[3] : 0);

		// s2bytes
		snum012[4] =
		    op_unit_num(&vn->vn_vi[split_item_num]) - snum012[4] -
		    bytes_to_r - bytes_to_l - bytes_to_S1new;

		if (vn->vn_vi[split_item_num].vi_index != TYPE_DIRENTRY &&
		    vn->vn_vi[split_item_num].vi_index != TYPE_INDIRECT)
			reiserfs_warning(tb->tb_sb, "vs-8115: get_num_ver: not "
					 "directory or indirect item");
	}

	/* now we know S2bytes, calculate S1bytes */
	if (snum012[3] > 0) {
		int split_item_num;
		int bytes_to_r, bytes_to_l;
		int bytes_to_S2new;

		split_item_num = split_item_positions[0];
		/* units of the split item already sent to the left/right
		   neighbor or to S2new */
		bytes_to_l =
		    ((from == split_item_num
		      && from_bytes != -1) ? from_bytes : 0);
		bytes_to_r =
		    ((end_item == split_item_num
		      && end_bytes != -1) ? end_bytes : 0);
		bytes_to_S2new =
		    ((split_item_positions[0] == split_item_positions[1]
		      && snum012[4] != -1) ? snum012[4] : 0);

		// s1bytes
		snum012[3] =
		    op_unit_num(&vn->vn_vi[split_item_num]) - snum012[3] -
		    bytes_to_r - bytes_to_l - bytes_to_S2new;
	}

	return needed_nodes;
}
564
565#ifdef CONFIG_REISERFS_CHECK
566extern struct tree_balance *cur_tb;
567#endif
568
569/* Set parameters for balancing.
570 * Performs write of results of analysis of balancing into structure tb,
571 * where it will later be used by the functions that actually do the balancing.
572 * Parameters:
573 *	tb	tree_balance structure;
574 *	h	current level of the node;
575 *	lnum	number of items from S[h] that must be shifted to L[h];
576 *	rnum	number of items from S[h] that must be shifted to R[h];
577 *	blk_num	number of blocks that S[h] will be splitted into;
578 *	s012	number of items that fall into splitted nodes.
579 *	lbytes	number of bytes which flow to the left neighbor from the item that is not
580 *		not shifted entirely
581 *	rbytes	number of bytes which flow to the right neighbor from the item that is not
582 *		not shifted entirely
583 *	s1bytes	number of bytes which flow to the first  new node when S[0] splits (this number is contained in s012 array)
584 */
585
586static void set_parameters(struct tree_balance *tb, int h, int lnum,
587			   int rnum, int blk_num, short *s012, int lb, int rb)
588{
589
590	tb->lnum[h] = lnum;
591	tb->rnum[h] = rnum;
592	tb->blknum[h] = blk_num;
593
594	if (h == 0) {		/* only for leaf level */
595		if (s012 != NULL) {
596			tb->s0num = *s012++,
597			    tb->s1num = *s012++, tb->s2num = *s012++;
598			tb->s1bytes = *s012++;
599			tb->s2bytes = *s012;
600		}
601		tb->lbytes = lb;
602		tb->rbytes = rb;
603	}
604	PROC_INFO_ADD(tb->tb_sb, lnum[h], lnum);
605	PROC_INFO_ADD(tb->tb_sb, rnum[h], rnum);
606
607	PROC_INFO_ADD(tb->tb_sb, lbytes[h], lb);
608	PROC_INFO_ADD(tb->tb_sb, rbytes[h], rb);
609}
610
611/* check, does node disappear if we shift tb->lnum[0] items to left
612   neighbor and tb->rnum[0] to the right one. */
613static int is_leaf_removable(struct tree_balance *tb)
614{
615	struct virtual_node *vn = tb->tb_vn;
616	int to_left, to_right;
617	int size;
618	int remain_items;
619
620	/* number of items, that will be shifted to left (right) neighbor
621	   entirely */
622	to_left = tb->lnum[0] - ((tb->lbytes != -1) ? 1 : 0);
623	to_right = tb->rnum[0] - ((tb->rbytes != -1) ? 1 : 0);
624	remain_items = vn->vn_nr_item;
625
626	/* how many items remain in S[0] after shiftings to neighbors */
627	remain_items -= (to_left + to_right);
628
629	if (remain_items < 1) {
630		/* all content of node can be shifted to neighbors */
631		set_parameters(tb, 0, to_left, vn->vn_nr_item - to_left, 0,
632			       NULL, -1, -1);
633		return 1;
634	}
635
636	if (remain_items > 1 || tb->lbytes == -1 || tb->rbytes == -1)
637		/* S[0] is not removable */
638		return 0;
639
640	/* check, whether we can divide 1 remaining item between neighbors */
641
642	/* get size of remaining item (in item units) */
643	size = op_unit_num(&(vn->vn_vi[to_left]));
644
645	if (tb->lbytes + tb->rbytes >= size) {
646		set_parameters(tb, 0, to_left + 1, to_right + 1, 0, NULL,
647			       tb->lbytes, -1);
648		return 1;
649	}
650
651	return 0;
652}
653
/* Check whether L[0], S[0] and R[0] can be joined in one node.
   Returns 1 (and records the "leaves removable" parameters) if the
   combined contents fit into the neighbors, 0 otherwise. */
static int are_leaves_removable(struct tree_balance *tb, int lfree, int rfree)
{
	struct virtual_node *vn = tb->tb_vn;
	int ih_size;
	struct buffer_head *S0;

	S0 = PATH_H_PBUFFER(tb->tb_path, 0);

	/* ih_size counts item headers that disappear when boundary items
	   merge with the neighbors' adjacent items */
	ih_size = 0;
	if (vn->vn_nr_item) {
		if (vn->vn_vi[0].vi_type & VI_TYPE_LEFT_MERGEABLE)
			ih_size += IH_SIZE;

		if (vn->vn_vi[vn->vn_nr_item - 1].
		    vi_type & VI_TYPE_RIGHT_MERGEABLE)
			ih_size += IH_SIZE;
	} else {
		/* there was only one item and it will be deleted */
		struct item_head *ih;

		RFALSE(B_NR_ITEMS(S0) != 1,
		       "vs-8125: item number must be 1: it is %d",
		       B_NR_ITEMS(S0));

		ih = B_N_PITEM_HEAD(S0, 0);
		/* does the right neighbor start with an item of the same
		   object (short keys equal)? */
		if (tb->CFR[0]
		    && !comp_short_le_keys(&(ih->ih_key),
					   B_N_PDELIM_KEY(tb->CFR[0],
							  tb->rkey[0])))
			if (is_direntry_le_ih(ih)) {
				/* Directory must be in correct state here: that is
				   somewhere at the left side should exist first directory
				   item. But the item being deleted can not be that first
				   one because its right neighbor is item of the same
				   directory. (But first item always gets deleted in last
				   turn). So, neighbors of deleted item can be merged, so
				   we can save ih_size */
				ih_size = IH_SIZE;

				/* we might check that left neighbor exists and is of the
				   same directory */
				RFALSE(le_ih_k_offset(ih) == DOT_OFFSET,
				       "vs-8130: first directory item can not be removed until directory is not empty");
			}

	}

	/* NOTE(review): the condition compares S[0]'s capacity plus its
	   post-operation used size against the neighbors' combined free
	   space (plus saved headers) — confirm against balancing math */
	if (MAX_CHILD_SIZE(S0) + vn->vn_size <= rfree + lfree + ih_size) {
		set_parameters(tb, 0, -1, -1, -1, NULL, -1, -1);
		PROC_INFO_INC(tb->tb_sb, leaves_removable);
		return 1;
	}
	return 0;

}
710
711/* when we do not split item, lnum and rnum are numbers of entire items */
/* Record the decision to shift items into the left neighbor L[h].
 * On internal levels (h != 0) the shift count to_l is derived from
 * lpar and the current item count (presumably to divide items evenly
 * between L[h] and S[h] — TODO confirm the formula's intent); on the
 * leaf level, lset selects between the "flow" variant (the boundary
 * item is split, tb->lbytes passed on) and the whole-items variant.
 * Expects h, Sh, tb, vn, lpar, lset, lnver and snum012 to be in scope
 * at the expansion site. */
#define SET_PAR_SHIFT_LEFT \
if (h)\
{\
   int to_l;\
   \
   to_l = (MAX_NR_KEY(Sh)+1 - lpar + vn->vn_nr_item + 1) / 2 -\
	      (MAX_NR_KEY(Sh) + 1 - lpar);\
	      \
	      set_parameters (tb, h, to_l, 0, lnver, NULL, -1, -1);\
}\
else \
{\
   if (lset==LEFT_SHIFT_FLOW)\
     set_parameters (tb, h, lpar, 0, lnver, snum012+lset,\
		     tb->lbytes, -1);\
   else\
     set_parameters (tb, h, lpar - (tb->lbytes!=-1), 0, lnver, snum012+lset,\
		     -1, -1);\
}
731
/* Record the decision to shift items into the right neighbor R[h];
 * mirror image of SET_PAR_SHIFT_LEFT.  Expects h, Sh, tb, vn, rpar,
 * rset, rnver and snum012 to be in scope at the expansion site. */
#define SET_PAR_SHIFT_RIGHT \
if (h)\
{\
   int to_r;\
   \
   to_r = (MAX_NR_KEY(Sh)+1 - rpar + vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 - rpar);\
   \
   set_parameters (tb, h, 0, to_r, rnver, NULL, -1, -1);\
}\
else \
{\
   if (rset==RIGHT_SHIFT_FLOW)\
     set_parameters (tb, h, 0, rpar, rnver, snum012+rset,\
		  -1, tb->rbytes);\
   else\
     set_parameters (tb, h, 0, rpar - (tb->rbytes!=-1), rnver, snum012+rset,\
		  -1, -1);\
}
750
751static void free_buffers_in_tb(struct tree_balance *p_s_tb)
752{
753	int n_counter;
754
755	decrement_counters_in_path(p_s_tb->tb_path);
756
757	for (n_counter = 0; n_counter < MAX_HEIGHT; n_counter++) {
758		decrement_bcount(p_s_tb->L[n_counter]);
759		p_s_tb->L[n_counter] = NULL;
760		decrement_bcount(p_s_tb->R[n_counter]);
761		p_s_tb->R[n_counter] = NULL;
762		decrement_bcount(p_s_tb->FL[n_counter]);
763		p_s_tb->FL[n_counter] = NULL;
764		decrement_bcount(p_s_tb->FR[n_counter]);
765		p_s_tb->FR[n_counter] = NULL;
766		decrement_bcount(p_s_tb->CFL[n_counter]);
767		p_s_tb->CFL[n_counter] = NULL;
768		decrement_bcount(p_s_tb->CFR[n_counter]);
769		p_s_tb->CFR[n_counter] = NULL;
770	}
771}
772
/* Get new buffers for storing new nodes that are created while balancing.
 * Returns:	SCHEDULE_OCCURRED - schedule occurred while the function worked;
 *	        CARRY_ON - schedule didn't occur while the function worked;
 *	        NO_DISK_SPACE - no disk space.
 */
/* The function is NOT SCHEDULE-SAFE! */
static int get_empty_nodes(struct tree_balance *p_s_tb, int n_h)
{
	struct buffer_head *p_s_new_bh,
	    *p_s_Sh = PATH_H_PBUFFER(p_s_tb->tb_path, n_h);
	b_blocknr_t *p_n_blocknr, a_n_blocknrs[MAX_AMOUNT_NEEDED] = { 0, };
	int n_counter, n_number_of_freeblk, n_amount_needed,	/* number of needed empty blocks */
	 n_retval = CARRY_ON;
	struct super_block *p_s_sb = p_s_tb->tb_sb;

	/* number_of_freeblk is the number of empty blocks which have been
	   acquired for use by the balancing algorithm minus the number of
	   empty blocks used in the previous levels of the analysis,
	   number_of_freeblk = tb->cur_blknum can be non-zero if a schedule occurs
	   after empty blocks are acquired, and the balancing analysis is
	   then restarted, amount_needed is the number needed by this level
	   (n_h) of the balancing analysis.

	   Note that for systems with many processes writing, it would be
	   more layout optimal to calculate the total number needed by all
	   levels and then to run reiserfs_new_blocks to get all of them at once.  */

	/* Initiate number_of_freeblk to the amount acquired prior to the restart of
	   the analysis or 0 if not restarted, then subtract the amount needed
	   by all of the levels of the tree below n_h. */
	/* blknum includes S[n_h], so we subtract 1 in this calculation */
	for (n_counter = 0, n_number_of_freeblk = p_s_tb->cur_blknum;
	     n_counter < n_h; n_counter++)
		n_number_of_freeblk -=
		    (p_s_tb->blknum[n_counter]) ? (p_s_tb->blknum[n_counter] -
						   1) : 0;

	/* Allocate missing empty blocks. */
	/* if p_s_Sh == 0  then we are getting a new root */
	n_amount_needed = (p_s_Sh) ? (p_s_tb->blknum[n_h] - 1) : 1;
	/*  Amount_needed = the amount that we need more than the amount that we have. */
	if (n_amount_needed > n_number_of_freeblk)
		n_amount_needed -= n_number_of_freeblk;
	else			/* If we have enough already then there is nothing to do. */
		return CARRY_ON;

	/* No need to check quota - is not allocated for blocks used for formatted nodes */
	if (reiserfs_new_form_blocknrs(p_s_tb, a_n_blocknrs,
				       n_amount_needed) == NO_DISK_SPACE)
		return NO_DISK_SPACE;

	/* for each blocknumber we just got, get a buffer and stick it on FEB */
	for (p_n_blocknr = a_n_blocknrs, n_counter = 0;
	     n_counter < n_amount_needed; p_n_blocknr++, n_counter++) {

		RFALSE(!*p_n_blocknr,
		       "PAP-8135: reiserfs_new_blocknrs failed when got new blocks");

		p_s_new_bh = sb_getblk(p_s_sb, *p_n_blocknr);
		RFALSE(buffer_dirty(p_s_new_bh) ||
		       buffer_journaled(p_s_new_bh) ||
		       buffer_journal_dirty(p_s_new_bh),
		       "PAP-8140: journlaled or dirty buffer %b for the new block",
		       p_s_new_bh);

		/* Put empty buffers into the array. */
		RFALSE(p_s_tb->FEB[p_s_tb->cur_blknum],
		       "PAP-8141: busy slot for new buffer");

		set_buffer_journal_new(p_s_new_bh);
		p_s_tb->FEB[p_s_tb->cur_blknum++] = p_s_new_bh;
	}

	/* if the tree changed under us while allocating, the whole search
	   and analysis must be redone */
	if (n_retval == CARRY_ON && FILESYSTEM_CHANGED_TB(p_s_tb))
		n_retval = REPEAT_SEARCH;

	return n_retval;
}
851
852/* Get free space of the left neighbor, which is stored in the parent
853 * node of the left neighbor.  */
854static int get_lfree(struct tree_balance *tb, int h)
855{
856	struct buffer_head *l, *f;
857	int order;
858
859	if ((f = PATH_H_PPARENT(tb->tb_path, h)) == 0 || (l = tb->FL[h]) == 0)
860		return 0;
861
862	if (f == l)
863		order = PATH_H_B_ITEM_ORDER(tb->tb_path, h) - 1;
864	else {
865		order = B_NR_ITEMS(l);
866		f = l;
867	}
868
869	return (MAX_CHILD_SIZE(f) - dc_size(B_N_CHILD(f, order)));
870}
871
872/* Get free space of the right neighbor,
873 * which is stored in the parent node of the right neighbor.
874 */
875static int get_rfree(struct tree_balance *tb, int h)
876{
877	struct buffer_head *r, *f;
878	int order;
879
880	if ((f = PATH_H_PPARENT(tb->tb_path, h)) == 0 || (r = tb->FR[h]) == 0)
881		return 0;
882
883	if (f == r)
884		order = PATH_H_B_ITEM_ORDER(tb->tb_path, h) + 1;
885	else {
886		order = 0;
887		f = r;
888	}
889
890	return (MAX_CHILD_SIZE(f) - dc_size(B_N_CHILD(f, order)));
891
892}
893
/* Check whether the left neighbor of S[n_h] is present in the buffer
   cache.  Returns 1 if it is, 0 otherwise; never reads from disk. */
static int is_left_neighbor_in_cache(struct tree_balance *p_s_tb, int n_h)
{
	struct buffer_head *p_s_father, *left;
	struct super_block *p_s_sb = p_s_tb->tb_sb;
	b_blocknr_t n_left_neighbor_blocknr;
	int n_left_neighbor_position;

	if (!p_s_tb->FL[n_h])	/* Father of the left neighbor does not exist. */
		return 0;

	/* Calculate father of the node to be balanced. */
	p_s_father = PATH_H_PBUFFER(p_s_tb->tb_path, n_h + 1);

	RFALSE(!p_s_father ||
	       !B_IS_IN_TREE(p_s_father) ||
	       !B_IS_IN_TREE(p_s_tb->FL[n_h]) ||
	       !buffer_uptodate(p_s_father) ||
	       !buffer_uptodate(p_s_tb->FL[n_h]),
	       "vs-8165: F[h] (%b) or FL[h] (%b) is invalid",
	       p_s_father, p_s_tb->FL[n_h]);

	/* Get position of the pointer to the left neighbor into the left father. */
	n_left_neighbor_position = (p_s_father == p_s_tb->FL[n_h]) ?
	    p_s_tb->lkey[n_h] : B_NR_ITEMS(p_s_tb->FL[n_h]);
	/* Get left neighbor block number. */
	n_left_neighbor_blocknr =
	    B_N_CHILD_NUM(p_s_tb->FL[n_h], n_left_neighbor_position);
	/* Look for the left neighbor in the cache. */
	if ((left = sb_find_get_block(p_s_sb, n_left_neighbor_blocknr))) {

		RFALSE(buffer_uptodate(left) && !B_IS_IN_TREE(left),
		       "vs-8170: left neighbor (%b %z) is not in the tree",
		       left, left);
		/* sb_find_get_block() took a reference; release it */
		put_bh(left);
		return 1;
	}

	return 0;
}
934
935#define LEFT_PARENTS  'l'
936#define RIGHT_PARENTS 'r'
937
/* Decrement p_s_key to the previous possible key; the exact decrement
   rule is item-type specific and dispatched through item_ops. */
static void decrement_key(struct cpu_key *p_s_key)
{
	// call item specific function for this key
	item_ops[cpu_key_k_type(p_s_key)]->decrement_key(p_s_key);
}
943
944/* Calculate far left/right parent of the left/right neighbor of the current node, that
945 * is calculate the left/right (FL[h]/FR[h]) neighbor of the parent F[h].
946 * Calculate left/right common parent of the current node and L[h]/R[h].
947 * Calculate left/right delimiting key position.
 * Returns:	REPEAT_SEARCH    - path in the tree is not correct;
 *		SCHEDULE_OCCURRED - schedule occurred while the function worked;
 *	        CARRY_ON         - schedule didn't occur while the function worked;
 */
static int get_far_parent(struct tree_balance *p_s_tb,
			  int n_h,
			  struct buffer_head **pp_s_father,
			  struct buffer_head **pp_s_com_father, char c_lr_par)
{
	struct buffer_head *p_s_parent;
	/* scratch path used only to look up the neighbor's parent by key */
	INITIALIZE_PATH(s_path_to_neighbor_father);
	struct treepath *p_s_path = p_s_tb->tb_path;
	struct cpu_key s_lr_father_key;
	int n_counter,
	    n_position = INT_MAX,
	    n_first_last_position = 0,
	    n_path_offset = PATH_H_PATH_OFFSET(p_s_path, n_h);

	/* Starting from F[n_h] go upwards in the tree, and look for the common
	   ancestor of F[n_h], and its neighbor l/r, that should be obtained. */

	n_counter = n_path_offset;

	RFALSE(n_counter < FIRST_PATH_ELEMENT_OFFSET,
	       "PAP-8180: invalid path length");

	for (; n_counter > FIRST_PATH_ELEMENT_OFFSET; n_counter--) {
		/* Check whether parent of the current buffer in the path is really parent in the tree. */
		if (!B_IS_IN_TREE
		    (p_s_parent = PATH_OFFSET_PBUFFER(p_s_path, n_counter - 1)))
			return REPEAT_SEARCH;
		/* Check whether position in the parent is correct. */
		if ((n_position =
		     PATH_OFFSET_POSITION(p_s_path,
					  n_counter - 1)) >
		    B_NR_ITEMS(p_s_parent))
			return REPEAT_SEARCH;
		/* Check whether parent at the path really points to the child. */
		if (B_N_CHILD_NUM(p_s_parent, n_position) !=
		    PATH_OFFSET_PBUFFER(p_s_path, n_counter)->b_blocknr)
			return REPEAT_SEARCH;
		/* Return delimiting key if position in the parent is not equal to first/last one. */
		/* For LEFT_PARENTS the boundary position is 0 (the
		   initializer above); for RIGHT_PARENTS it is the last
		   child pointer of this parent. */
		if (c_lr_par == RIGHT_PARENTS)
			n_first_last_position = B_NR_ITEMS(p_s_parent);
		if (n_position != n_first_last_position) {
			*pp_s_com_father = p_s_parent;
			get_bh(*pp_s_com_father);
			/*(*pp_s_com_father = p_s_parent)->b_count++; */
			break;
		}
	}

	/* if we are in the root of the tree, then there is no common father */
	if (n_counter == FIRST_PATH_ELEMENT_OFFSET) {
		/* Check whether first buffer in the path is the root of the tree. */
		if (PATH_OFFSET_PBUFFER
		    (p_s_tb->tb_path,
		     FIRST_PATH_ELEMENT_OFFSET)->b_blocknr ==
		    SB_ROOT_BLOCK(p_s_tb->tb_sb)) {
			*pp_s_father = *pp_s_com_father = NULL;
			return CARRY_ON;
		}
		return REPEAT_SEARCH;
	}

	RFALSE(B_LEVEL(*pp_s_com_father) <= DISK_LEAF_NODE_LEVEL,
	       "PAP-8185: (%b %z) level too small",
	       *pp_s_com_father, *pp_s_com_father);

	/* Check whether the common parent is locked. */

	if (buffer_locked(*pp_s_com_father)) {
		__wait_on_buffer(*pp_s_com_father);
		if (FILESYSTEM_CHANGED_TB(p_s_tb)) {
			/* tree changed while we slept on the lock: drop the
			   reference taken above and redo the search */
			decrement_bcount(*pp_s_com_father);
			return REPEAT_SEARCH;
		}
	}

	/* So, we got common parent of the current node and its left/right neighbor.
	   Now we are geting the parent of the left/right neighbor. */

	/* Form key to get parent of the left/right neighbor. */
	/* Note the side effect: lkey[n_h - 1] / rkey[n_h - 1] is recorded here. */
	le_key2cpu_key(&s_lr_father_key,
		       B_N_PDELIM_KEY(*pp_s_com_father,
				      (c_lr_par ==
				       LEFT_PARENTS) ? (p_s_tb->lkey[n_h - 1] =
							n_position -
							1) : (p_s_tb->rkey[n_h -
									   1] =
							      n_position)));

	if (c_lr_par == LEFT_PARENTS)
		decrement_key(&s_lr_father_key);

	if (search_by_key
	    (p_s_tb->tb_sb, &s_lr_father_key, &s_path_to_neighbor_father,
	     n_h + 1) == IO_ERROR)
		// path is released
		return IO_ERROR;

	if (FILESYSTEM_CHANGED_TB(p_s_tb)) {
		decrement_counters_in_path(&s_path_to_neighbor_father);
		decrement_bcount(*pp_s_com_father);
		return REPEAT_SEARCH;
	}

	*pp_s_father = PATH_PLAST_BUFFER(&s_path_to_neighbor_father);

	RFALSE(B_LEVEL(*pp_s_father) != n_h + 1,
	       "PAP-8190: (%b %z) level too small", *pp_s_father, *pp_s_father);
	RFALSE(s_path_to_neighbor_father.path_length <
	       FIRST_PATH_ELEMENT_OFFSET, "PAP-8192: path length is too small");

	/* Keep the reference on the last buffer (now *pp_s_father) and
	   release the rest of the scratch path. */
	s_path_to_neighbor_father.path_length--;
	decrement_counters_in_path(&s_path_to_neighbor_father);
	return CARRY_ON;
}
1066
1067/* Get parents of neighbors of node in the path(S[n_path_offset]) and common parents of
1068 * S[n_path_offset] and L[n_path_offset]/R[n_path_offset]: F[n_path_offset], FL[n_path_offset],
1069 * FR[n_path_offset], CFL[n_path_offset], CFR[n_path_offset].
1070 * Calculate numbers of left and right delimiting keys position: lkey[n_path_offset], rkey[n_path_offset].
 * Returns:	REPEAT_SEARCH - schedule occurred or the path became invalid
 *			while the function worked;
 *	        CARRY_ON - schedule didn't occur while the function worked;
1073 */
static int get_parents(struct tree_balance *p_s_tb, int n_h)
{
	struct treepath *p_s_path = p_s_tb->tb_path;
	int n_position,
	    n_ret_value,
	    n_path_offset = PATH_H_PATH_OFFSET(p_s_tb->tb_path, n_h);
	struct buffer_head *p_s_curf, *p_s_curcf;

	/* Current node is the root of the tree or will be root of the tree */
	if (n_path_offset <= FIRST_PATH_ELEMENT_OFFSET) {
		/* The root can not have parents.
		   Release nodes which previously were obtained as parents of the current node neighbors. */
		decrement_bcount(p_s_tb->FL[n_h]);
		decrement_bcount(p_s_tb->CFL[n_h]);
		decrement_bcount(p_s_tb->FR[n_h]);
		decrement_bcount(p_s_tb->CFR[n_h]);
		p_s_tb->FL[n_h] = p_s_tb->CFL[n_h] = p_s_tb->FR[n_h] =
		    p_s_tb->CFR[n_h] = NULL;
		return CARRY_ON;
	}

	/* Get parent FL[n_path_offset] of L[n_path_offset]. */
	if ((n_position = PATH_OFFSET_POSITION(p_s_path, n_path_offset - 1))) {
		/* Current node is not the first child of its parent. */
		/*(p_s_curf = p_s_curcf = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1))->b_count += 2; */
		p_s_curf = p_s_curcf =
		    PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1);
		/* two get_bh calls: one reference for FL[n_h], one for CFL[n_h] */
		get_bh(p_s_curf);
		get_bh(p_s_curf);
		p_s_tb->lkey[n_h] = n_position - 1;
	} else {
		/* Calculate current parent of L[n_path_offset], which is the left neighbor of the current node.
		   Calculate current common parent of L[n_path_offset] and the current node. Note that
		   CFL[n_path_offset] not equal FL[n_path_offset] and CFL[n_path_offset] not equal F[n_path_offset].
		   Calculate lkey[n_path_offset]. */
		if ((n_ret_value = get_far_parent(p_s_tb, n_h + 1, &p_s_curf,
						  &p_s_curcf,
						  LEFT_PARENTS)) != CARRY_ON)
			return n_ret_value;
	}

	/* replace the old left-parent references with the new ones */
	decrement_bcount(p_s_tb->FL[n_h]);
	p_s_tb->FL[n_h] = p_s_curf;	/* New initialization of FL[n_h]. */
	decrement_bcount(p_s_tb->CFL[n_h]);
	p_s_tb->CFL[n_h] = p_s_curcf;	/* New initialization of CFL[n_h]. */

	RFALSE((p_s_curf && !B_IS_IN_TREE(p_s_curf)) ||
	       (p_s_curcf && !B_IS_IN_TREE(p_s_curcf)),
	       "PAP-8195: FL (%b) or CFL (%b) is invalid", p_s_curf, p_s_curcf);

/* Get parent FR[n_h] of R[n_h]. */

/* Current node is the last child of F[n_h]. FR[n_h] != F[n_h]. */
	if (n_position == B_NR_ITEMS(PATH_H_PBUFFER(p_s_path, n_h + 1))) {
/* Calculate current parent of R[n_h], which is the right neighbor of F[n_h].
   Calculate current common parent of R[n_h] and current node. Note that CFR[n_h]
   not equal FR[n_path_offset] and CFR[n_h] not equal F[n_h]. */
		if ((n_ret_value =
		     get_far_parent(p_s_tb, n_h + 1, &p_s_curf, &p_s_curcf,
				    RIGHT_PARENTS)) != CARRY_ON)
			return n_ret_value;
	} else {
/* Current node is not the last child of its parent F[n_h]. */
		/*(p_s_curf = p_s_curcf = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1))->b_count += 2; */
		p_s_curf = p_s_curcf =
		    PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1);
		/* again two references: one for FR[n_h], one for CFR[n_h] */
		get_bh(p_s_curf);
		get_bh(p_s_curf);
		p_s_tb->rkey[n_h] = n_position;
	}

	decrement_bcount(p_s_tb->FR[n_h]);
	p_s_tb->FR[n_h] = p_s_curf;	/* New initialization of FR[n_path_offset]. */

	decrement_bcount(p_s_tb->CFR[n_h]);
	p_s_tb->CFR[n_h] = p_s_curcf;	/* New initialization of CFR[n_path_offset]. */

	RFALSE((p_s_curf && !B_IS_IN_TREE(p_s_curf)) ||
	       (p_s_curcf && !B_IS_IN_TREE(p_s_curcf)),
	       "PAP-8205: FR (%b) or CFR (%b) is invalid", p_s_curf, p_s_curcf);

	return CARRY_ON;
}
1157
1158/* it is possible to remove node as result of shiftings to
1159   neighbors even when we insert or paste item. */
1160static inline int can_node_be_removed(int mode, int lfree, int sfree, int rfree,
1161				      struct tree_balance *tb, int h)
1162{
1163	struct buffer_head *Sh = PATH_H_PBUFFER(tb->tb_path, h);
1164	int levbytes = tb->insert_size[h];
1165	struct item_head *ih;
1166	struct reiserfs_key *r_key = NULL;
1167
1168	ih = B_N_PITEM_HEAD(Sh, 0);
1169	if (tb->CFR[h])
1170		r_key = B_N_PDELIM_KEY(tb->CFR[h], tb->rkey[h]);
1171
1172	if (lfree + rfree + sfree < MAX_CHILD_SIZE(Sh) + levbytes
1173	    /* shifting may merge items which might save space */
1174	    -
1175	    ((!h
1176	      && op_is_left_mergeable(&(ih->ih_key), Sh->b_size)) ? IH_SIZE : 0)
1177	    -
1178	    ((!h && r_key
1179	      && op_is_left_mergeable(r_key, Sh->b_size)) ? IH_SIZE : 0)
1180	    + ((h) ? KEY_SIZE : 0)) {
1181		/* node can not be removed */
1182		if (sfree >= levbytes) {	/* new item fits into node S[h] without any shifting */
1183			if (!h)
1184				tb->s0num =
1185				    B_NR_ITEMS(Sh) +
1186				    ((mode == M_INSERT) ? 1 : 0);
1187			set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
1188			return NO_BALANCING_NEEDED;
1189		}
1190	}
1191	PROC_INFO_INC(tb->tb_sb, can_node_be_removed[h]);
1192	return !NO_BALANCING_NEEDED;
1193}
1194
1195/* Check whether current node S[h] is balanced when increasing its size by
1196 * Inserting or Pasting.
1197 * Calculate parameters for balancing for current level h.
1198 * Parameters:
1199 *	tb	tree_balance structure;
1200 *	h	current level of the node;
1201 *	inum	item number in S[h];
1202 *	mode	i - insert, p - paste;
1203 * Returns:	1 - schedule occurred;
1204 *	        0 - balancing for higher levels needed;
1205 *	       -1 - no balancing for higher levels needed;
1206 *	       -2 - no disk space.
1207 */
1208/* ip means Inserting or Pasting */
static int ip_check_balance(struct tree_balance *tb, int h)
{
	struct virtual_node *vn = tb->tb_vn;
	int levbytes,		/* Number of bytes that must be inserted into (value
				   is negative if bytes are deleted) buffer which
				   contains node being balanced.  The mnemonic is
				   that the attempted change in node space used level
				   is levbytes bytes. */
	 n_ret_value;

	int lfree, sfree, rfree /* free space in L, S and R */ ;

	/* nver is short for number of vertixes, and lnver is the number if
	   we shift to the left, rnver is the number if we shift to the
	   right, and lrnver is the number if we shift in both directions.
	   The goal is to minimize first the number of vertixes, and second,
	   the number of vertixes whose contents are changed by shifting,
	   and third the number of uncached vertixes whose contents are
	   changed by shifting and must be read from disk.  */
	int nver, lnver, rnver, lrnver;

	/* used at leaf level only, S0 = S[0] is the node being balanced,
	   sInum [ I = 0,1,2 ] is the number of items that will
	   remain in node SI after balancing.  S1 and S2 are new
	   nodes that might be created. */

	/* we perform 8 calls to get_num_ver().  For each call we calculate five parameters.
	   where 4th parameter is s1bytes and 5th - s2bytes
	 */
	short snum012[40] = { 0, };	/* s0num, s1num, s2num for 8 cases
					   0,1 - do not shift and do not shift but bottle
					   2 - shift only whole item to left
					   3 - shift to left and bottle as much as possible
					   4,5 - shift to right (whole items and as much as possible
					   6,7 - shift to both directions (whole items and as much as possible)
					 */

	/* Sh is the node whose balance is currently being checked */
	struct buffer_head *Sh;

	Sh = PATH_H_PBUFFER(tb->tb_path, h);
	levbytes = tb->insert_size[h];

	/* Calculate balance parameters for creating new root. */
	/* Sh == NULL means we are above the current root: the tree grows. */
	if (!Sh) {
		if (!h)
			reiserfs_panic(tb->tb_sb,
				       "vs-8210: ip_check_balance: S[0] can not be 0");
		switch (n_ret_value = get_empty_nodes(tb, h)) {
		case CARRY_ON:
			set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
			return NO_BALANCING_NEEDED;	/* no balancing for higher levels needed */

		case NO_DISK_SPACE:
		case REPEAT_SEARCH:
			return n_ret_value;
		default:
			reiserfs_panic(tb->tb_sb,
				       "vs-8215: ip_check_balance: incorrect return value of get_empty_nodes");
		}
	}

	if ((n_ret_value = get_parents(tb, h)) != CARRY_ON)	/* get parents of S[h] neighbors. */
		return n_ret_value;

	sfree = B_FREE_SPACE(Sh);

	/* get free space of neighbors */
	rfree = get_rfree(tb, h);
	lfree = get_lfree(tb, h);

	if (can_node_be_removed(vn->vn_mode, lfree, sfree, rfree, tb, h) ==
	    NO_BALANCING_NEEDED)
		/* and new item fits into node S[h] without any shifting */
		return NO_BALANCING_NEEDED;

	create_virtual_node(tb, h);

	/*
	   determine maximal number of items we can shift to the left neighbor (in tb structure)
	   and the maximal number of bytes that can flow to the left neighbor
	   from the left most liquid item that cannot be shifted from S[0] entirely (returned value)
	 */
	check_left(tb, h, lfree);

	/*
	   determine maximal number of items we can shift to the right neighbor (in tb structure)
	   and the maximal number of bytes that can flow to the right neighbor
	   from the right most liquid item that cannot be shifted from S[0] entirely (returned value)
	 */
	check_right(tb, h, rfree);

	/* all contents of internal node S[h] can be moved into its
	   neighbors, S[h] will be removed after balancing */
	if (h && (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1)) {
		int to_r;

		/* Since we are working on internal nodes, and our internal
		   nodes have fixed size entries, then we can balance by the
		   number of items rather than the space they consume.  In this
		   routine we set the left node equal to the right node,
		   allowing a difference of less than or equal to 1 child
		   pointer. */
		to_r =
		    ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] - tb->rnum[h] +
		     vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 -
						tb->rnum[h]);
		set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL,
			       -1, -1);
		return CARRY_ON;
	}

	/* this checks balance condition, that any two neighboring nodes can not fit in one node */
	RFALSE(h &&
	       (tb->lnum[h] >= vn->vn_nr_item + 1 ||
		tb->rnum[h] >= vn->vn_nr_item + 1),
	       "vs-8220: tree is not balanced on internal level");
	RFALSE(!h && ((tb->lnum[h] >= vn->vn_nr_item && (tb->lbytes == -1)) ||
		      (tb->rnum[h] >= vn->vn_nr_item && (tb->rbytes == -1))),
	       "vs-8225: tree is not balanced on leaf level");

	/* all contents of S[0] can be moved into its neighbors
	   S[0] will be removed after balancing. */
	if (!h && is_leaf_removable(tb))
		return CARRY_ON;

	/* why do we perform this check here rather than earlier??
	   Answer: we can win 1 node in some cases above. Moreover we
	   checked it above, when we checked, that S[0] is not removable
	   in principle */
	if (sfree >= levbytes) {	/* new item fits into node S[h] without any shifting */
		if (!h)
			tb->s0num = vn->vn_nr_item;
		set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
		return NO_BALANCING_NEEDED;
	}

	{
		int lpar, rpar, nset, lset, rset, lrset;
		/*
		 * regular overflowing of the node
		 */

		/* get_num_ver works in 2 modes (FLOW & NO_FLOW)
		   lpar, rpar - number of items we can shift to left/right neighbor (including splitting item)
		   nset, lset, rset, lrset - shows, whether flowing items give better packing
		 */
#define FLOW 1
#define NO_FLOW 0		/* do not any splitting */

		/* we choose one the following */
		/* each constant is an offset into snum012[] (5 slots per case) */
#define NOTHING_SHIFT_NO_FLOW	0
#define NOTHING_SHIFT_FLOW	5
#define LEFT_SHIFT_NO_FLOW	10
#define LEFT_SHIFT_FLOW		15
#define RIGHT_SHIFT_NO_FLOW	20
#define RIGHT_SHIFT_FLOW	25
#define LR_SHIFT_NO_FLOW	30
#define LR_SHIFT_FLOW		35

		/* maximal shift amounts computed by check_left/check_right */
		lpar = tb->lnum[h];
		rpar = tb->rnum[h];

		/* calculate number of blocks S[h] must be split into when
		   nothing is shifted to the neighbors,
		   as well as number of items in each part of the split node (s012 numbers),
		   and number of bytes (s1bytes) of the shared drop which flow to S1 if any */
		nset = NOTHING_SHIFT_NO_FLOW;
		nver = get_num_ver(vn->vn_mode, tb, h,
				   0, -1, h ? vn->vn_nr_item : 0, -1,
				   snum012, NO_FLOW);

		if (!h) {
			int nver1;

			/* note, that in this case we try to bottle between S[0] and S1 (S1 - the first new node) */
			nver1 = get_num_ver(vn->vn_mode, tb, h,
					    0, -1, 0, -1,
					    snum012 + NOTHING_SHIFT_FLOW, FLOW);
			if (nver > nver1)
				nset = NOTHING_SHIFT_FLOW, nver = nver1;
		}

		/* calculate number of blocks S[h] must be split into when
		   l_shift_num first items and l_shift_bytes of the right most
		   liquid item to be shifted are shifted to the left neighbor,
		   as well as number of items in each part of the splitted node (s012 numbers),
		   and number of bytes (s1bytes) of the shared drop which flow to S1 if any
		 */
		lset = LEFT_SHIFT_NO_FLOW;
		lnver = get_num_ver(vn->vn_mode, tb, h,
				    lpar - ((h || tb->lbytes == -1) ? 0 : 1),
				    -1, h ? vn->vn_nr_item : 0, -1,
				    snum012 + LEFT_SHIFT_NO_FLOW, NO_FLOW);
		if (!h) {
			int lnver1;

			lnver1 = get_num_ver(vn->vn_mode, tb, h,
					     lpar -
					     ((tb->lbytes != -1) ? 1 : 0),
					     tb->lbytes, 0, -1,
					     snum012 + LEFT_SHIFT_FLOW, FLOW);
			if (lnver > lnver1)
				lset = LEFT_SHIFT_FLOW, lnver = lnver1;
		}

		/* calculate number of blocks S[h] must be split into when
		   r_shift_num first items and r_shift_bytes of the left most
		   liquid item to be shifted are shifted to the right neighbor,
		   as well as number of items in each part of the splitted node (s012 numbers),
		   and number of bytes (s1bytes) of the shared drop which flow to S1 if any
		 */
		rset = RIGHT_SHIFT_NO_FLOW;
		rnver = get_num_ver(vn->vn_mode, tb, h,
				    0, -1,
				    h ? (vn->vn_nr_item - rpar) : (rpar -
								   ((tb->
								     rbytes !=
								     -1) ? 1 :
								    0)), -1,
				    snum012 + RIGHT_SHIFT_NO_FLOW, NO_FLOW);
		if (!h) {
			int rnver1;

			rnver1 = get_num_ver(vn->vn_mode, tb, h,
					     0, -1,
					     (rpar -
					      ((tb->rbytes != -1) ? 1 : 0)),
					     tb->rbytes,
					     snum012 + RIGHT_SHIFT_FLOW, FLOW);

			if (rnver > rnver1)
				rset = RIGHT_SHIFT_FLOW, rnver = rnver1;
		}

		/* calculate number of blocks S[h] must be split into when
		   items are shifted in both directions,
		   as well as number of items in each part of the splitted node (s012 numbers),
		   and number of bytes (s1bytes) of the shared drop which flow to S1 if any
		 */
		lrset = LR_SHIFT_NO_FLOW;
		lrnver = get_num_ver(vn->vn_mode, tb, h,
				     lpar - ((h || tb->lbytes == -1) ? 0 : 1),
				     -1,
				     h ? (vn->vn_nr_item - rpar) : (rpar -
								    ((tb->
								      rbytes !=
								      -1) ? 1 :
								     0)), -1,
				     snum012 + LR_SHIFT_NO_FLOW, NO_FLOW);
		if (!h) {
			int lrnver1;

			lrnver1 = get_num_ver(vn->vn_mode, tb, h,
					      lpar -
					      ((tb->lbytes != -1) ? 1 : 0),
					      tb->lbytes,
					      (rpar -
					       ((tb->rbytes != -1) ? 1 : 0)),
					      tb->rbytes,
					      snum012 + LR_SHIFT_FLOW, FLOW);
			if (lrnver > lrnver1)
				lrset = LR_SHIFT_FLOW, lrnver = lrnver1;
		}

		/* Our general shifting strategy is:
		   1) to minimized number of new nodes;
		   2) to minimized number of neighbors involved in shifting;
		   3) to minimized number of disk reads; */

		/* we can win TWO or ONE nodes by shifting in both directions */
		if (lrnver < lnver && lrnver < rnver) {
			RFALSE(h &&
			       (tb->lnum[h] != 1 ||
				tb->rnum[h] != 1 ||
				lrnver != 1 || rnver != 2 || lnver != 2
				|| h != 1), "vs-8230: bad h");
			if (lrset == LR_SHIFT_FLOW)
				set_parameters(tb, h, tb->lnum[h], tb->rnum[h],
					       lrnver, snum012 + lrset,
					       tb->lbytes, tb->rbytes);
			else
				set_parameters(tb, h,
					       tb->lnum[h] -
					       ((tb->lbytes == -1) ? 0 : 1),
					       tb->rnum[h] -
					       ((tb->rbytes == -1) ? 0 : 1),
					       lrnver, snum012 + lrset, -1, -1);

			return CARRY_ON;
		}

		/* if shifting doesn't lead to better packing then don't shift */
		if (nver == lrnver) {
			set_parameters(tb, h, 0, 0, nver, snum012 + nset, -1,
				       -1);
			return CARRY_ON;
		}

		/* now we know that for better packing shifting in only one
		   direction either to the left or to the right is required */

		/*  if shifting to the left is better than shifting to the right */
		if (lnver < rnver) {
			SET_PAR_SHIFT_LEFT;
			return CARRY_ON;
		}

		/* if shifting to the right is better than shifting to the left */
		if (lnver > rnver) {
			SET_PAR_SHIFT_RIGHT;
			return CARRY_ON;
		}

		/* now shifting in either direction gives the same number
		   of nodes and we can make use of the cached neighbors */
		if (is_left_neighbor_in_cache(tb, h)) {
			SET_PAR_SHIFT_LEFT;
			return CARRY_ON;
		}

		/* shift to the right independently on whether the right neighbor in cache or not */
		SET_PAR_SHIFT_RIGHT;
		return CARRY_ON;
	}
}
1535
1536/* Check whether current node S[h] is balanced when Decreasing its size by
1537 * Deleting or Cutting for INTERNAL node of S+tree.
1538 * Calculate parameters for balancing for current level h.
1539 * Parameters:
1540 *	tb	tree_balance structure;
1541 *	h	current level of the node;
1542 *	inum	item number in S[h];
 *	mode	d - delete, c - cut;
1544 * Returns:	1 - schedule occurred;
1545 *	        0 - balancing for higher levels needed;
1546 *	       -1 - no balancing for higher levels needed;
1547 *	       -2 - no disk space.
1548 *
1549 * Note: Items of internal nodes have fixed size, so the balance condition for
1550 * the internal part of S+tree is as for the B-trees.
1551 */
static int dc_check_balance_internal(struct tree_balance *tb, int h)
{
	struct virtual_node *vn = tb->tb_vn;

	/* Sh is the node whose balance is currently being checked,
	   and Fh is its father.  */
	struct buffer_head *Sh, *Fh;
	int maxsize, n_ret_value;
	int lfree, rfree /* free space in L and R */ ;

	Sh = PATH_H_PBUFFER(tb->tb_path, h);
	Fh = PATH_H_PPARENT(tb->tb_path, h);

	maxsize = MAX_CHILD_SIZE(Sh);

/*   using tb->insert_size[h], which is negative in this case, create_virtual_node calculates: */
/*   new_nr_item = number of items node would have if operation is */
/* 	performed without balancing (new_nr_item); */
	create_virtual_node(tb, h);

	if (!Fh) {		/* S[h] is the root. */
		if (vn->vn_nr_item > 0) {
			set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
			return NO_BALANCING_NEEDED;	/* no balancing for higher levels needed */
		}
		/* new_nr_item == 0.
		 * Current root will be deleted resulting in
		 * decrementing the tree height. */
		set_parameters(tb, h, 0, 0, 0, NULL, -1, -1);
		return CARRY_ON;
	}

	if ((n_ret_value = get_parents(tb, h)) != CARRY_ON)
		return n_ret_value;

	/* get free space of neighbors */
	rfree = get_rfree(tb, h);
	lfree = get_lfree(tb, h);

	/* determine maximal number of items we can fit into neighbors */
	check_left(tb, h, lfree);
	check_right(tb, h, rfree);

	if (vn->vn_nr_item >= MIN_NR_KEY(Sh)) {	/* Balance condition for the internal node is valid.
						 * In this case we balance only if it leads to better packing. */
		if (vn->vn_nr_item == MIN_NR_KEY(Sh)) {	/* Here we join S[h] with one of its neighbors,
							 * which is impossible with greater values of new_nr_item. */
			if (tb->lnum[h] >= vn->vn_nr_item + 1) {
				/* All contents of S[h] can be moved to L[h]. */
				int n;
				int order_L;

				/* order_L: position of L[h]'s child pointer
				   inside FL[h] (wraps to the last slot when
				   S[h] is its parent's first child) */
				order_L =
				    ((n =
				      PATH_H_B_ITEM_ORDER(tb->tb_path,
							  h)) ==
				     0) ? B_NR_ITEMS(tb->FL[h]) : n - 1;
				/* n: item count derived from L[h]'s dc_size
				   entry in its parent, in (DC_SIZE + KEY_SIZE)
				   units */
				n = dc_size(B_N_CHILD(tb->FL[h], order_L)) /
				    (DC_SIZE + KEY_SIZE);
				set_parameters(tb, h, -n - 1, 0, 0, NULL, -1,
					       -1);
				return CARRY_ON;
			}

			if (tb->rnum[h] >= vn->vn_nr_item + 1) {
				/* All contents of S[h] can be moved to R[h]. */
				int n;
				int order_R;

				/* order_R: position of R[h]'s child pointer
				   inside FR[h] (wraps to 0 when S[h] is its
				   parent's last child) */
				order_R =
				    ((n =
				      PATH_H_B_ITEM_ORDER(tb->tb_path,
							  h)) ==
				     B_NR_ITEMS(Fh)) ? 0 : n + 1;
				n = dc_size(B_N_CHILD(tb->FR[h], order_R)) /
				    (DC_SIZE + KEY_SIZE);
				set_parameters(tb, h, 0, -n - 1, 0, NULL, -1,
					       -1);
				return CARRY_ON;
			}
		}

		if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1) {
			/* All contents of S[h] can be moved to the neighbors (L[h] & R[h]). */
			int to_r;

			to_r =
			    ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] -
			     tb->rnum[h] + vn->vn_nr_item + 1) / 2 -
			    (MAX_NR_KEY(Sh) + 1 - tb->rnum[h]);
			set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r,
				       0, NULL, -1, -1);
			return CARRY_ON;
		}

		/* Balancing does not lead to better packing. */
		set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
		return NO_BALANCING_NEEDED;
	}

	/* Current node contain insufficient number of items. Balancing is required. */
	/* Check whether we can merge S[h] with left neighbor. */
	if (tb->lnum[h] >= vn->vn_nr_item + 1)
		if (is_left_neighbor_in_cache(tb, h)
		    || tb->rnum[h] < vn->vn_nr_item + 1 || !tb->FR[h]) {
			int n;
			int order_L;

			order_L =
			    ((n =
			      PATH_H_B_ITEM_ORDER(tb->tb_path,
						  h)) ==
			     0) ? B_NR_ITEMS(tb->FL[h]) : n - 1;
			n = dc_size(B_N_CHILD(tb->FL[h], order_L)) / (DC_SIZE +
								      KEY_SIZE);
			set_parameters(tb, h, -n - 1, 0, 0, NULL, -1, -1);
			return CARRY_ON;
		}

	/* Check whether we can merge S[h] with right neighbor. */
	if (tb->rnum[h] >= vn->vn_nr_item + 1) {
		int n;
		int order_R;

		order_R =
		    ((n =
		      PATH_H_B_ITEM_ORDER(tb->tb_path,
					  h)) == B_NR_ITEMS(Fh)) ? 0 : (n + 1);
		n = dc_size(B_N_CHILD(tb->FR[h], order_R)) / (DC_SIZE +
							      KEY_SIZE);
		set_parameters(tb, h, 0, -n - 1, 0, NULL, -1, -1);
		return CARRY_ON;
	}

	/* All contents of S[h] can be moved to the neighbors (L[h] & R[h]). */
	if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1) {
		int to_r;

		to_r =
		    ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] - tb->rnum[h] +
		     vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 -
						tb->rnum[h]);
		set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL,
			       -1, -1);
		return CARRY_ON;
	}

	/* For internal nodes try to borrow item from a neighbor */
	RFALSE(!tb->FL[h] && !tb->FR[h], "vs-8235: trying to borrow for root");

	/* Borrow one or two items from caching neighbor */
	if (is_left_neighbor_in_cache(tb, h) || !tb->FR[h]) {
		int from_l;

		from_l =
		    (MAX_NR_KEY(Sh) + 1 - tb->lnum[h] + vn->vn_nr_item +
		     1) / 2 - (vn->vn_nr_item + 1);
		set_parameters(tb, h, -from_l, 0, 1, NULL, -1, -1);
		return CARRY_ON;
	}

	/* otherwise borrow from the right neighbor */
	set_parameters(tb, h, 0,
		       -((MAX_NR_KEY(Sh) + 1 - tb->rnum[h] + vn->vn_nr_item +
			  1) / 2 - (vn->vn_nr_item + 1)), 1, NULL, -1, -1);
	return CARRY_ON;
}
1718
1719/* Check whether current node S[h] is balanced when Decreasing its size by
1720 * Deleting or Truncating for LEAF node of S+tree.
1721 * Calculate parameters for balancing for current level h.
1722 * Parameters:
1723 *	tb	tree_balance structure;
1724 *	h	current level of the node;
1725 *	inum	item number in S[h];
 *	mode	d - delete, c - cut (truncate);
1727 * Returns:	1 - schedule occurred;
1728 *	        0 - balancing for higher levels needed;
1729 *	       -1 - no balancing for higher levels needed;
1730 *	       -2 - no disk space.
1731 */
static int dc_check_balance_leaf(struct tree_balance *tb, int h)
{
	struct virtual_node *vn = tb->tb_vn;

	/* Number of bytes that must be deleted from
	   (value is negative if bytes are deleted) buffer which
	   contains node being balanced.  The mnemonic is that the
	   attempted change in node space used level is levbytes bytes. */
	int levbytes;
	/* the maximal item size */
	int maxsize, n_ret_value;
	/* S0 is the node whose balance is currently being checked,
	   and F0 is its father.  */
	struct buffer_head *S0, *F0;
	int lfree, rfree /* free space in L and R */ ;

	/* NOTE: h is always 0 here (leaf level); the path is addressed
	   with the literal 0 below. */
	S0 = PATH_H_PBUFFER(tb->tb_path, 0);
	F0 = PATH_H_PPARENT(tb->tb_path, 0);

	levbytes = tb->insert_size[h];

	maxsize = MAX_CHILD_SIZE(S0);	/* maximal possible size of an item */

	if (!F0) {		/* S[0] is the root now. */

		RFALSE(-levbytes >= maxsize - B_FREE_SPACE(S0),
		       "vs-8240: attempt to create empty buffer tree");

		set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
		return NO_BALANCING_NEEDED;
	}

	if ((n_ret_value = get_parents(tb, h)) != CARRY_ON)
		return n_ret_value;

	/* get free space of neighbors */
	rfree = get_rfree(tb, h);
	lfree = get_lfree(tb, h);

	create_virtual_node(tb, h);

	/* if 3 leaves can be merge to one, set parameters and return */
	if (are_leaves_removable(tb, lfree, rfree))
		return CARRY_ON;

	/* determine maximal number of items we can shift to the left/right  neighbor
	   and the maximal number of bytes that can flow to the left/right neighbor
	   from the left/right most liquid item that cannot be shifted from S[0] entirely
	 */
	check_left(tb, h, lfree);
	check_right(tb, h, rfree);

	/* check whether we can merge S with left neighbor. */
	if (tb->lnum[0] >= vn->vn_nr_item && tb->lbytes == -1)
		if (is_left_neighbor_in_cache(tb, h) || ((tb->rnum[0] - ((tb->rbytes == -1) ? 0 : 1)) < vn->vn_nr_item) ||	/* S can not be merged with R */
		    !tb->FR[h]) {

			RFALSE(!tb->FL[h],
			       "vs-8245: dc_check_balance_leaf: FL[h] must exist");

			/* set parameter to merge S[0] with its left neighbor */
			set_parameters(tb, h, -1, 0, 0, NULL, -1, -1);
			return CARRY_ON;
		}

	/* check whether we can merge S[0] with right neighbor. */
	if (tb->rnum[0] >= vn->vn_nr_item && tb->rbytes == -1) {
		set_parameters(tb, h, 0, -1, 0, NULL, -1, -1);
		return CARRY_ON;
	}

	/* All contents of S[0] can be moved to the neighbors (L[0] & R[0]). Set parameters and return */
	if (is_leaf_removable(tb))
		return CARRY_ON;

	/* Balancing is not required. */
	tb->s0num = vn->vn_nr_item;
	set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
	return NO_BALANCING_NEEDED;
}
1812
1813/* Check whether current node S[h] is balanced when Decreasing its size by
1814 * Deleting or Cutting.
1815 * Calculate parameters for balancing for current level h.
1816 * Parameters:
1817 *	tb	tree_balance structure;
1818 *	h	current level of the node;
1819 *	inum	item number in S[h];
1820 *	mode	d - delete, c - cut.
1821 * Returns:	1 - schedule occurred;
1822 *	        0 - balancing for higher levels needed;
1823 *	       -1 - no balancing for higher levels needed;
1824 *	       -2 - no disk space.
1825 */
1826static int dc_check_balance(struct tree_balance *tb, int h)
1827{
1828	RFALSE(!(PATH_H_PBUFFER(tb->tb_path, h)),
1829	       "vs-8250: S is not initialized");
1830
1831	if (h)
1832		return dc_check_balance_internal(tb, h);
1833	else
1834		return dc_check_balance_leaf(tb, h);
1835}
1836
1837/* Check whether current node S[h] is balanced.
1838 * Calculate parameters for balancing for current level h.
1839 * Parameters:
1840 *
1841 *	tb	tree_balance structure:
1842 *
1843 *              tb is a large structure that must be read about in the header file
1844 *              at the same time as this procedure if the reader is to successfully
1845 *              understand this procedure
1846 *
1847 *	h	current level of the node;
1848 *	inum	item number in S[h];
1849 *	mode	i - insert, p - paste, d - delete, c - cut.
1850 * Returns:	1 - schedule occurred;
1851 *	        0 - balancing for higher levels needed;
1852 *	       -1 - no balancing for higher levels needed;
1853 *	       -2 - no disk space.
1854 */
1855static int check_balance(int mode,
1856			 struct tree_balance *tb,
1857			 int h,
1858			 int inum,
1859			 int pos_in_item,
1860			 struct item_head *ins_ih, const void *data)
1861{
1862	struct virtual_node *vn;
1863
1864	vn = tb->tb_vn = (struct virtual_node *)(tb->vn_buf);
1865	vn->vn_free_ptr = (char *)(tb->tb_vn + 1);
1866	vn->vn_mode = mode;
1867	vn->vn_affected_item_num = inum;
1868	vn->vn_pos_in_item = pos_in_item;
1869	vn->vn_ins_ih = ins_ih;
1870	vn->vn_data = data;
1871
1872	RFALSE(mode == M_INSERT && !vn->vn_ins_ih,
1873	       "vs-8255: ins_ih can not be 0 in insert mode");
1874
1875	if (tb->insert_size[h] > 0)
1876		/* Calculate balance parameters when size of node is increasing. */
1877		return ip_check_balance(tb, h);
1878
1879	/* Calculate balance parameters when  size of node is decreasing. */
1880	return dc_check_balance(tb, h);
1881}
1882
/* Check whether parent at the path is the really parent of the current node.
 * Returns: CARRY_ON      - parent in the path is valid, unlocked and really
 *                          the parent of S[n_h];
 *          REPEAT_SEARCH - the tree changed under us and the path must be
 *                          recalculated. */
static int get_direct_parent(struct tree_balance *p_s_tb, int n_h)
{
	struct buffer_head *p_s_bh;
	struct treepath *p_s_path = p_s_tb->tb_path;
	int n_position,
	    n_path_offset = PATH_H_PATH_OFFSET(p_s_tb->tb_path, n_h);

	/* We are in the root or in the new root. */
	if (n_path_offset <= FIRST_PATH_ELEMENT_OFFSET) {

		RFALSE(n_path_offset < FIRST_PATH_ELEMENT_OFFSET - 1,
		       "PAP-8260: invalid offset in the path");

		if (PATH_OFFSET_PBUFFER(p_s_path, FIRST_PATH_ELEMENT_OFFSET)->
		    b_blocknr == SB_ROOT_BLOCK(p_s_tb->tb_sb)) {
			/* Root is not changed. The root has no parent;
			 * mark the slot above it as empty. */
			PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1) = NULL;
			PATH_OFFSET_POSITION(p_s_path, n_path_offset - 1) = 0;
			return CARRY_ON;
		}
		return REPEAT_SEARCH;	/* Root is changed and we must recalculate the path. */
	}

	if (!B_IS_IN_TREE
	    (p_s_bh = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1)))
		return REPEAT_SEARCH;	/* Parent in the path is not in the tree. */

	/* Stale position in the parent also forces a re-search. */
	if ((n_position =
	     PATH_OFFSET_POSITION(p_s_path,
				  n_path_offset - 1)) > B_NR_ITEMS(p_s_bh))
		return REPEAT_SEARCH;

	if (B_N_CHILD_NUM(p_s_bh, n_position) !=
	    PATH_OFFSET_PBUFFER(p_s_path, n_path_offset)->b_blocknr)
		/* Parent in the path is not parent of the current node in the tree. */
		return REPEAT_SEARCH;

	if (buffer_locked(p_s_bh)) {
		/* Wait for the parent buffer; sleeping may let the fs
		 * change, in which case the path may be stale. */
		__wait_on_buffer(p_s_bh);
		if (FILESYSTEM_CHANGED_TB(p_s_tb))
			return REPEAT_SEARCH;
	}

	return CARRY_ON;	/* Parent in the path is unlocked and really parent of the current node.  */
}
1929
/* Using lnum[n_h] and rnum[n_h] we should determine what neighbors
 * of S[n_h] we
 * need in order to balance S[n_h], and get them if necessary.
 * Returns:	SCHEDULE_OCCURRED - schedule occurred while the function worked;
 *	        CARRY_ON - schedule didn't occur while the function worked;
 *	        IO_ERROR - reading a neighbor block failed.
 */
static int get_neighbors(struct tree_balance *p_s_tb, int n_h)
{
	int n_child_position,
	    n_path_offset = PATH_H_PATH_OFFSET(p_s_tb->tb_path, n_h + 1);
	unsigned long n_son_number;
	struct super_block *p_s_sb = p_s_tb->tb_sb;
	struct buffer_head *p_s_bh;

	PROC_INFO_INC(p_s_sb, get_neighbors[n_h]);

	if (p_s_tb->lnum[n_h]) {
		/* We need left neighbor to balance S[n_h]. */
		PROC_INFO_INC(p_s_sb, need_l_neighbor[n_h]);
		p_s_bh = PATH_OFFSET_PBUFFER(p_s_tb->tb_path, n_path_offset);

		RFALSE(p_s_bh == p_s_tb->FL[n_h] &&
		       !PATH_OFFSET_POSITION(p_s_tb->tb_path, n_path_offset),
		       "PAP-8270: invalid position in the parent");

		/* If S shares its father FL with the left neighbor, the
		 * neighbor is the child at lkey; otherwise it is the
		 * rightmost child of FL. */
		n_child_position =
		    (p_s_bh ==
		     p_s_tb->FL[n_h]) ? p_s_tb->lkey[n_h] : B_NR_ITEMS(p_s_tb->
								       FL[n_h]);
		n_son_number = B_N_CHILD_NUM(p_s_tb->FL[n_h], n_child_position);
		p_s_bh = sb_bread(p_s_sb, n_son_number);
		if (!p_s_bh)
			return IO_ERROR;
		/* sb_bread may have slept; if the tree changed, drop the
		 * buffer and redo the whole search. */
		if (FILESYSTEM_CHANGED_TB(p_s_tb)) {
			decrement_bcount(p_s_bh);
			PROC_INFO_INC(p_s_sb, get_neighbors_restart[n_h]);
			return REPEAT_SEARCH;
		}

		RFALSE(!B_IS_IN_TREE(p_s_tb->FL[n_h]) ||
		       n_child_position > B_NR_ITEMS(p_s_tb->FL[n_h]) ||
		       B_N_CHILD_NUM(p_s_tb->FL[n_h], n_child_position) !=
		       p_s_bh->b_blocknr, "PAP-8275: invalid parent");
		RFALSE(!B_IS_IN_TREE(p_s_bh), "PAP-8280: invalid child");
		RFALSE(!n_h &&
		       B_FREE_SPACE(p_s_bh) !=
		       MAX_CHILD_SIZE(p_s_bh) -
		       dc_size(B_N_CHILD(p_s_tb->FL[0], n_child_position)),
		       "PAP-8290: invalid child size of left neighbor");

		/* Release any previously cached left neighbor and store
		 * the freshly read one. */
		decrement_bcount(p_s_tb->L[n_h]);
		p_s_tb->L[n_h] = p_s_bh;
	}

	if (p_s_tb->rnum[n_h]) {	/* We need right neighbor to balance S[n_path_offset]. */
		PROC_INFO_INC(p_s_sb, need_r_neighbor[n_h]);
		p_s_bh = PATH_OFFSET_PBUFFER(p_s_tb->tb_path, n_path_offset);

		RFALSE(p_s_bh == p_s_tb->FR[n_h] &&
		       PATH_OFFSET_POSITION(p_s_tb->tb_path,
					    n_path_offset) >=
		       B_NR_ITEMS(p_s_bh),
		       "PAP-8295: invalid position in the parent");

		/* Symmetric to the left case: child at rkey+1 when FR is
		 * the shared father, else FR's leftmost child. */
		n_child_position =
		    (p_s_bh == p_s_tb->FR[n_h]) ? p_s_tb->rkey[n_h] + 1 : 0;
		n_son_number = B_N_CHILD_NUM(p_s_tb->FR[n_h], n_child_position);
		p_s_bh = sb_bread(p_s_sb, n_son_number);
		if (!p_s_bh)
			return IO_ERROR;
		if (FILESYSTEM_CHANGED_TB(p_s_tb)) {
			decrement_bcount(p_s_bh);
			PROC_INFO_INC(p_s_sb, get_neighbors_restart[n_h]);
			return REPEAT_SEARCH;
		}
		decrement_bcount(p_s_tb->R[n_h]);
		p_s_tb->R[n_h] = p_s_bh;

		RFALSE(!n_h
		       && B_FREE_SPACE(p_s_bh) !=
		       MAX_CHILD_SIZE(p_s_bh) -
		       dc_size(B_N_CHILD(p_s_tb->FR[0], n_child_position)),
		       "PAP-8300: invalid child size of right neighbor (%d != %d - %d)",
		       B_FREE_SPACE(p_s_bh), MAX_CHILD_SIZE(p_s_bh),
		       dc_size(B_N_CHILD(p_s_tb->FR[0], n_child_position)));

	}
	return CARRY_ON;
}
2019
2020static int get_virtual_node_size(struct super_block *sb, struct buffer_head *bh)
2021{
2022	int max_num_of_items;
2023	int max_num_of_entries;
2024	unsigned long blocksize = sb->s_blocksize;
2025
2026#define MIN_NAME_LEN 1
2027
2028	max_num_of_items = (blocksize - BLKH_SIZE) / (IH_SIZE + MIN_ITEM_LEN);
2029	max_num_of_entries = (blocksize - BLKH_SIZE - IH_SIZE) /
2030	    (DEH_SIZE + MIN_NAME_LEN);
2031
2032	return sizeof(struct virtual_node) +
2033	    max(max_num_of_items * sizeof(struct virtual_item),
2034		sizeof(struct virtual_item) + sizeof(struct direntry_uarea) +
2035		(max_num_of_entries - 1) * sizeof(__u16));
2036}
2037
/* maybe we should fail balancing we are going to perform when kmalloc
   fails several times. But now it will loop until kmalloc gets
   required memory */
/* Ensure tb->vn_buf can hold the virtual node for the current leaf.
 * Returns CARRY_ON when the existing (or newly allocated) buffer may be
 * used, REPEAT_SEARCH when resources were released or the filesystem
 * changed and the caller must redo the search. */
static int get_mem_for_virtual_node(struct tree_balance *tb)
{
	int check_fs = 0;
	int size;
	char *buf;

	size = get_virtual_node_size(tb->tb_sb, PATH_PLAST_BUFFER(tb->tb_path));

	if (size > tb->vn_buf_size) {
		/* we have to allocate more memory for virtual node */
		if (tb->vn_buf) {
			/* free memory allocated before */
			kfree(tb->vn_buf);
			/* this is not needed if kfree is atomic */
			check_fs = 1;
		}

		/* virtual node requires now more memory */
		tb->vn_buf_size = size;

		/* get memory for virtual item */
		/* Try a non-blocking allocation first so we can keep all
		 * the resources collected so far if it succeeds. */
		buf = kmalloc(size, GFP_ATOMIC | __GFP_NOWARN);
		if (!buf) {
			/* getting memory with GFP_KERNEL priority may involve
			   balancing now (due to indirect_to_direct conversion on
			   dcache shrinking). So, release path and collected
			   resources here */
			free_buffers_in_tb(tb);
			buf = kmalloc(size, GFP_NOFS);
			if (!buf) {
				/* Allocation failed; leave vn_buf_size at 0
				 * so the next attempt reallocates. */
				tb->vn_buf_size = 0;
			}
			tb->vn_buf = buf;
			schedule();
			/* Resources were released above, so the analysis
			 * must be redone even if the allocation worked. */
			return REPEAT_SEARCH;
		}

		tb->vn_buf = buf;
	}

	/* kfree above may have slept; if the fs changed, redo the search. */
	if (check_fs && FILESYSTEM_CHANGED_TB(tb))
		return REPEAT_SEARCH;

	return CARRY_ON;
}
2086
#ifdef CONFIG_REISERFS_CHECK
/* Panic if a buffer collected for balancing is in an impossible state:
 * non-positive refcount, not uptodate, detached from the tree, wrong
 * device, wrong block size, or block number beyond the filesystem.
 * descr/level identify the buffer (e.g. "L", "FR") in the panic text.
 * Compiled to a no-op when CONFIG_REISERFS_CHECK is off. */
static void tb_buffer_sanity_check(struct super_block *p_s_sb,
				   struct buffer_head *p_s_bh,
				   const char *descr, int level)
{
	if (p_s_bh) {
		if (atomic_read(&(p_s_bh->b_count)) <= 0) {

			reiserfs_panic(p_s_sb,
				       "jmacd-1: tb_buffer_sanity_check(): negative or zero reference counter for buffer %s[%d] (%b)\n",
				       descr, level, p_s_bh);
		}

		if (!buffer_uptodate(p_s_bh)) {
			reiserfs_panic(p_s_sb,
				       "jmacd-2: tb_buffer_sanity_check(): buffer is not up to date %s[%d] (%b)\n",
				       descr, level, p_s_bh);
		}

		if (!B_IS_IN_TREE(p_s_bh)) {
			reiserfs_panic(p_s_sb,
				       "jmacd-3: tb_buffer_sanity_check(): buffer is not in tree %s[%d] (%b)\n",
				       descr, level, p_s_bh);
		}

		if (p_s_bh->b_bdev != p_s_sb->s_bdev) {
			reiserfs_panic(p_s_sb,
				       "jmacd-4: tb_buffer_sanity_check(): buffer has wrong device %s[%d] (%b)\n",
				       descr, level, p_s_bh);
		}

		if (p_s_bh->b_size != p_s_sb->s_blocksize) {
			reiserfs_panic(p_s_sb,
				       "jmacd-5: tb_buffer_sanity_check(): buffer has wrong blocksize %s[%d] (%b)\n",
				       descr, level, p_s_bh);
		}

		if (p_s_bh->b_blocknr > SB_BLOCK_COUNT(p_s_sb)) {
			reiserfs_panic(p_s_sb,
				       "jmacd-6: tb_buffer_sanity_check(): buffer block number too high %s[%d] (%b)\n",
				       descr, level, p_s_bh);
		}
	}
}
#else
static void tb_buffer_sanity_check(struct super_block *p_s_sb,
				   struct buffer_head *p_s_bh,
				   const char *descr, int level)
{;
}
#endif
2138
/* Prepare bh for the journal, clearing its dirty bits in the process.
 * Returns 0 when the buffer could not be prepared without blocking (it
 * is still locked) -- callers then record it as "locked" and wait. */
static int clear_all_dirty_bits(struct super_block *s, struct buffer_head *bh)
{
	return reiserfs_prepare_for_journal(s, bh, 0);
}
2143
/* Walk every buffer collected for balancing (path buffers, neighbors
 * L/R, fathers FL/FR, common fathers CFL/CFR, and the free-node list
 * FEB) and prepare each for the journal; if any is locked, wait on it
 * and rescan from scratch.  Returns CARRY_ON when all buffers are
 * unlocked and prepared, REPEAT_SEARCH if the filesystem changed while
 * waiting (or, under CONFIG_REISERFS_CHECK, after too many retries). */
static int wait_tb_buffers_until_unlocked(struct tree_balance *p_s_tb)
{
	struct buffer_head *locked;
#ifdef CONFIG_REISERFS_CHECK
	int repeat_counter = 0;
#endif
	int i;

	do {

		locked = NULL;

		/* First pass: the buffers on the search path itself. */
		for (i = p_s_tb->tb_path->path_length;
		     !locked && i > ILLEGAL_PATH_ELEMENT_OFFSET; i--) {
			if (PATH_OFFSET_PBUFFER(p_s_tb->tb_path, i)) {
				/* if I understand correctly, we can only be sure the last buffer
				 ** in the path is in the tree --clm
				 */
#ifdef CONFIG_REISERFS_CHECK
				if (PATH_PLAST_BUFFER(p_s_tb->tb_path) ==
				    PATH_OFFSET_PBUFFER(p_s_tb->tb_path, i)) {
					tb_buffer_sanity_check(p_s_tb->tb_sb,
							       PATH_OFFSET_PBUFFER
							       (p_s_tb->tb_path,
								i), "S",
							       p_s_tb->tb_path->
							       path_length - i);
				}
#endif
				if (!clear_all_dirty_bits(p_s_tb->tb_sb,
							  PATH_OFFSET_PBUFFER
							  (p_s_tb->tb_path,
							   i))) {
					locked =
					    PATH_OFFSET_PBUFFER(p_s_tb->tb_path,
								i);
				}
			}
		}

		/* Second pass: neighbors and their (common) fathers on
		 * every level the balance touches. */
		for (i = 0; !locked && i < MAX_HEIGHT && p_s_tb->insert_size[i];
		     i++) {

			if (p_s_tb->lnum[i]) {

				if (p_s_tb->L[i]) {
					tb_buffer_sanity_check(p_s_tb->tb_sb,
							       p_s_tb->L[i],
							       "L", i);
					if (!clear_all_dirty_bits
					    (p_s_tb->tb_sb, p_s_tb->L[i]))
						locked = p_s_tb->L[i];
				}

				if (!locked && p_s_tb->FL[i]) {
					tb_buffer_sanity_check(p_s_tb->tb_sb,
							       p_s_tb->FL[i],
							       "FL", i);
					if (!clear_all_dirty_bits
					    (p_s_tb->tb_sb, p_s_tb->FL[i]))
						locked = p_s_tb->FL[i];
				}

				if (!locked && p_s_tb->CFL[i]) {
					tb_buffer_sanity_check(p_s_tb->tb_sb,
							       p_s_tb->CFL[i],
							       "CFL", i);
					if (!clear_all_dirty_bits
					    (p_s_tb->tb_sb, p_s_tb->CFL[i]))
						locked = p_s_tb->CFL[i];
				}

			}

			if (!locked && (p_s_tb->rnum[i])) {

				if (p_s_tb->R[i]) {
					tb_buffer_sanity_check(p_s_tb->tb_sb,
							       p_s_tb->R[i],
							       "R", i);
					if (!clear_all_dirty_bits
					    (p_s_tb->tb_sb, p_s_tb->R[i]))
						locked = p_s_tb->R[i];
				}

				if (!locked && p_s_tb->FR[i]) {
					tb_buffer_sanity_check(p_s_tb->tb_sb,
							       p_s_tb->FR[i],
							       "FR", i);
					if (!clear_all_dirty_bits
					    (p_s_tb->tb_sb, p_s_tb->FR[i]))
						locked = p_s_tb->FR[i];
				}

				if (!locked && p_s_tb->CFR[i]) {
					tb_buffer_sanity_check(p_s_tb->tb_sb,
							       p_s_tb->CFR[i],
							       "CFR", i);
					if (!clear_all_dirty_bits
					    (p_s_tb->tb_sb, p_s_tb->CFR[i]))
						locked = p_s_tb->CFR[i];
				}
			}
		}
		/* as far as I can tell, this is not required.  The FEB list seems
		 ** to be full of newly allocated nodes, which will never be locked,
		 ** dirty, or anything else.
		 ** To be safe, I'm putting in the checks and waits in.  For the moment,
		 ** they are needed to keep the code in journal.c from complaining
		 ** about the buffer.  That code is inside CONFIG_REISERFS_CHECK as well.
		 ** --clm
		 */
		for (i = 0; !locked && i < MAX_FEB_SIZE; i++) {
			if (p_s_tb->FEB[i]) {
				if (!clear_all_dirty_bits
				    (p_s_tb->tb_sb, p_s_tb->FEB[i]))
					locked = p_s_tb->FEB[i];
			}
		}

		if (locked) {
#ifdef CONFIG_REISERFS_CHECK
			repeat_counter++;
			if ((repeat_counter % 10000) == 0) {
				reiserfs_warning(p_s_tb->tb_sb,
						 "wait_tb_buffers_until_released(): too many "
						 "iterations waiting for buffer to unlock "
						 "(%b)", locked);

				/* Don't loop forever.  Try to recover from possible error. */

				return (FILESYSTEM_CHANGED_TB(p_s_tb)) ?
				    REPEAT_SEARCH : CARRY_ON;
			}
#endif
			/* Sleep until the offending buffer unlocks; if the
			 * fs changed meanwhile the analysis is stale. */
			__wait_on_buffer(locked);
			if (FILESYSTEM_CHANGED_TB(p_s_tb)) {
				return REPEAT_SEARCH;
			}
		}

	} while (locked);

	return CARRY_ON;
}
2289
/* Prepare for balancing, that is
 *	get all necessary parents, and neighbors;
 *	analyze what and where should be moved;
 *	get sufficient number of new nodes;
 * Balancing will start only after all resources will be collected at a time.
 *
 * When ported to SMP kernels, only at the last moment after all needed nodes
 * are collected in cache, will the resources be locked using the usual
 * textbook ordered lock acquisition algorithms.  Note that ensuring that
 * this code neither write locks what it does not need to write lock nor locks out of order
 * will be a pain in the butt that could have been avoided.  Grumble grumble. -Hans
 *
 * fix is meant in the sense of render unchanging
 *
 * Latency might be improved by first gathering a list of what buffers are needed
 * and then getting as many of them in parallel as possible? -Hans
 *
 * Parameters:
 *	op_mode	i - insert, d - delete, c - cut (truncate), p - paste (append)
 *	tb	tree_balance structure;
 *	inum	item number in S[h];
 *      pos_in_item - position inside the affected item (NOTE(review):
 *                    taken from the path; exact semantics depend on op_mode)
 *      ins_ih & ins_sd are used when inserting
 * Returns:	1 - schedule occurred while the function worked;
 *	        0 - schedule didn't occur while the function worked;
 *             -1 - if no_disk_space
 */

int fix_nodes(int n_op_mode, struct tree_balance *p_s_tb, struct item_head *p_s_ins_ih,	// item head of item being inserted
	      const void *data	// inserted item or data to be pasted
    )
{
	int n_ret_value, n_h, n_item_num = PATH_LAST_POSITION(p_s_tb->tb_path);
	int n_pos_in_item;

	/* we set wait_tb_buffers_run when we have to restore any dirty bits cleared
	 ** during wait_tb_buffers_run
	 */
	int wait_tb_buffers_run = 0;
	struct buffer_head *p_s_tbS0 = PATH_PLAST_BUFFER(p_s_tb->tb_path);

	++REISERFS_SB(p_s_tb->tb_sb)->s_fix_nodes;

	n_pos_in_item = p_s_tb->tb_path->pos_in_item;

	/* Remember the generation so FILESYSTEM_CHANGED_TB can detect
	 * concurrent tree changes while we sleep. */
	p_s_tb->fs_gen = get_generation(p_s_tb->tb_sb);

	/* we prepare and log the super here so it will already be in the
	 ** transaction when do_balance needs to change it.
	 ** This way do_balance won't have to schedule when trying to prepare
	 ** the super for logging
	 */
	reiserfs_prepare_for_journal(p_s_tb->tb_sb,
				     SB_BUFFER_WITH_SB(p_s_tb->tb_sb), 1);
	journal_mark_dirty(p_s_tb->transaction_handle, p_s_tb->tb_sb,
			   SB_BUFFER_WITH_SB(p_s_tb->tb_sb));
	if (FILESYSTEM_CHANGED_TB(p_s_tb))
		return REPEAT_SEARCH;

	/* if it possible in indirect_to_direct conversion */
	if (buffer_locked(p_s_tbS0)) {
		__wait_on_buffer(p_s_tbS0);
		if (FILESYSTEM_CHANGED_TB(p_s_tb))
			return REPEAT_SEARCH;
	}
#ifdef CONFIG_REISERFS_CHECK
	if (cur_tb) {
		print_cur_tb("fix_nodes");
		reiserfs_panic(p_s_tb->tb_sb,
			       "PAP-8305: fix_nodes:  there is pending do_balance");
	}

	if (!buffer_uptodate(p_s_tbS0) || !B_IS_IN_TREE(p_s_tbS0)) {
		reiserfs_panic(p_s_tb->tb_sb,
			       "PAP-8320: fix_nodes: S[0] (%b %z) is not uptodate "
			       "at the beginning of fix_nodes or not in tree (mode %c)",
			       p_s_tbS0, p_s_tbS0, n_op_mode);
	}

	/* Check parameters. */
	switch (n_op_mode) {
	case M_INSERT:
		/* Insertion may target the slot one past the last item. */
		if (n_item_num <= 0 || n_item_num > B_NR_ITEMS(p_s_tbS0))
			reiserfs_panic(p_s_tb->tb_sb,
				       "PAP-8330: fix_nodes: Incorrect item number %d (in S0 - %d) in case of insert",
				       n_item_num, B_NR_ITEMS(p_s_tbS0));
		break;
	case M_PASTE:
	case M_DELETE:
	case M_CUT:
		if (n_item_num < 0 || n_item_num >= B_NR_ITEMS(p_s_tbS0)) {
			print_block(p_s_tbS0, 0, -1, -1);
			reiserfs_panic(p_s_tb->tb_sb,
				       "PAP-8335: fix_nodes: Incorrect item number(%d); mode = %c insert_size = %d\n",
				       n_item_num, n_op_mode,
				       p_s_tb->insert_size[0]);
		}
		break;
	default:
		reiserfs_panic(p_s_tb->tb_sb,
			       "PAP-8340: fix_nodes: Incorrect mode of operation");
	}
#endif

	if (get_mem_for_virtual_node(p_s_tb) == REPEAT_SEARCH)
		return REPEAT_SEARCH;

	/* Starting from the leaf level; for all levels n_h of the tree. */
	for (n_h = 0; n_h < MAX_HEIGHT && p_s_tb->insert_size[n_h]; n_h++) {
		if ((n_ret_value = get_direct_parent(p_s_tb, n_h)) != CARRY_ON) {
			goto repeat;
		}

		if ((n_ret_value =
		     check_balance(n_op_mode, p_s_tb, n_h, n_item_num,
				   n_pos_in_item, p_s_ins_ih,
				   data)) != CARRY_ON) {
			if (n_ret_value == NO_BALANCING_NEEDED) {
				/* No balancing for higher levels needed. */
				if ((n_ret_value =
				     get_neighbors(p_s_tb, n_h)) != CARRY_ON) {
					goto repeat;
				}
				if (n_h != MAX_HEIGHT - 1)
					p_s_tb->insert_size[n_h + 1] = 0;
				/* ok, analysis and resource gathering are complete */
				break;
			}
			goto repeat;
		}

		if ((n_ret_value = get_neighbors(p_s_tb, n_h)) != CARRY_ON) {
			goto repeat;
		}

		if ((n_ret_value = get_empty_nodes(p_s_tb, n_h)) != CARRY_ON) {
			goto repeat;	/* No disk space, or schedule occurred and
					   analysis may be invalid and needs to be redone. */
		}

		if (!PATH_H_PBUFFER(p_s_tb->tb_path, n_h)) {
			/* We have a positive insert size but no nodes exist on this
			   level, this means that we are creating a new root. */

			RFALSE(p_s_tb->blknum[n_h] != 1,
			       "PAP-8350: creating new empty root");

			if (n_h < MAX_HEIGHT - 1)
				p_s_tb->insert_size[n_h + 1] = 0;
		} else if (!PATH_H_PBUFFER(p_s_tb->tb_path, n_h + 1)) {
			if (p_s_tb->blknum[n_h] > 1) {
				/* The tree needs to be grown, so this node S[n_h]
				   which is the root node is split into two nodes,
				   and a new node (S[n_h+1]) will be created to
				   become the root node.  */

				RFALSE(n_h == MAX_HEIGHT - 1,
				       "PAP-8355: attempt to create too high of a tree");

				p_s_tb->insert_size[n_h + 1] =
				    (DC_SIZE +
				     KEY_SIZE) * (p_s_tb->blknum[n_h] - 1) +
				    DC_SIZE;
			} else if (n_h < MAX_HEIGHT - 1)
				p_s_tb->insert_size[n_h + 1] = 0;
		} else
			/* Each extra block on this level inserts one new
			 * (key, pointer) pair into the parent level. */
			p_s_tb->insert_size[n_h + 1] =
			    (DC_SIZE + KEY_SIZE) * (p_s_tb->blknum[n_h] - 1);
	}

	/* All resources gathered; make sure none of them is locked. */
	if ((n_ret_value = wait_tb_buffers_until_unlocked(p_s_tb)) == CARRY_ON) {
		if (FILESYSTEM_CHANGED_TB(p_s_tb)) {
			wait_tb_buffers_run = 1;
			n_ret_value = REPEAT_SEARCH;
			goto repeat;
		} else {
			return CARRY_ON;
		}
	} else {
		wait_tb_buffers_run = 1;
		goto repeat;
	}

      repeat:
	// fix_nodes was unable to perform its calculation due to
	// filesystem got changed under us, lack of free disk space or i/o
	// failure. If the first is the case - the search will be
	// repeated. For now - free all resources acquired so far except
	// for the new allocated nodes
	{
		int i;

		/* Release path buffers. */
		if (wait_tb_buffers_run) {
			pathrelse_and_restore(p_s_tb->tb_sb, p_s_tb->tb_path);
		} else {
			pathrelse(p_s_tb->tb_path);
		}
		/* brelse all resources collected for balancing */
		for (i = 0; i < MAX_HEIGHT; i++) {
			if (wait_tb_buffers_run) {
				/* Undo the journal preparation done in
				 * wait_tb_buffers_until_unlocked. */
				reiserfs_restore_prepared_buffer(p_s_tb->tb_sb,
								 p_s_tb->L[i]);
				reiserfs_restore_prepared_buffer(p_s_tb->tb_sb,
								 p_s_tb->R[i]);
				reiserfs_restore_prepared_buffer(p_s_tb->tb_sb,
								 p_s_tb->FL[i]);
				reiserfs_restore_prepared_buffer(p_s_tb->tb_sb,
								 p_s_tb->FR[i]);
				reiserfs_restore_prepared_buffer(p_s_tb->tb_sb,
								 p_s_tb->
								 CFL[i]);
				reiserfs_restore_prepared_buffer(p_s_tb->tb_sb,
								 p_s_tb->
								 CFR[i]);
			}

			brelse(p_s_tb->L[i]);
			p_s_tb->L[i] = NULL;
			brelse(p_s_tb->R[i]);
			p_s_tb->R[i] = NULL;
			brelse(p_s_tb->FL[i]);
			p_s_tb->FL[i] = NULL;
			brelse(p_s_tb->FR[i]);
			p_s_tb->FR[i] = NULL;
			brelse(p_s_tb->CFL[i]);
			p_s_tb->CFL[i] = NULL;
			brelse(p_s_tb->CFR[i]);
			p_s_tb->CFR[i] = NULL;
		}

		if (wait_tb_buffers_run) {
			for (i = 0; i < MAX_FEB_SIZE; i++) {
				if (p_s_tb->FEB[i]) {
					reiserfs_restore_prepared_buffer
					    (p_s_tb->tb_sb, p_s_tb->FEB[i]);
				}
			}
		}
		return n_ret_value;
	}

}
2533
2534/* Anatoly will probably forgive me renaming p_s_tb to tb. I just
2535   wanted to make lines shorter */
2536void unfix_nodes(struct tree_balance *tb)
2537{
2538	int i;
2539
2540	/* Release path buffers. */
2541	pathrelse_and_restore(tb->tb_sb, tb->tb_path);
2542
2543	/* brelse all resources collected for balancing */
2544	for (i = 0; i < MAX_HEIGHT; i++) {
2545		reiserfs_restore_prepared_buffer(tb->tb_sb, tb->L[i]);
2546		reiserfs_restore_prepared_buffer(tb->tb_sb, tb->R[i]);
2547		reiserfs_restore_prepared_buffer(tb->tb_sb, tb->FL[i]);
2548		reiserfs_restore_prepared_buffer(tb->tb_sb, tb->FR[i]);
2549		reiserfs_restore_prepared_buffer(tb->tb_sb, tb->CFL[i]);
2550		reiserfs_restore_prepared_buffer(tb->tb_sb, tb->CFR[i]);
2551
2552		brelse(tb->L[i]);
2553		brelse(tb->R[i]);
2554		brelse(tb->FL[i]);
2555		brelse(tb->FR[i]);
2556		brelse(tb->CFL[i]);
2557		brelse(tb->CFR[i]);
2558	}
2559
2560	/* deal with list of allocated (used and unused) nodes */
2561	for (i = 0; i < MAX_FEB_SIZE; i++) {
2562		if (tb->FEB[i]) {
2563			b_blocknr_t blocknr = tb->FEB[i]->b_blocknr;
2564			/* de-allocated block which was not used by balancing and
2565			   bforget about buffer for it */
2566			brelse(tb->FEB[i]);
2567			reiserfs_free_block(tb->transaction_handle, NULL,
2568					    blocknr, 0);
2569		}
2570		if (tb->used[i]) {
2571			/* release used as new nodes including a new root */
2572			brelse(tb->used[i]);
2573		}
2574	}
2575
2576	kfree(tb->vn_buf);
2577
2578}
2579