1/*
2 *   Copyright (c) International Business Machines Corp., 2000-2002
3 *
4 *   This program is free software;  you can redistribute it and/or modify
5 *   it under the terms of the GNU General Public License as published by
6 *   the Free Software Foundation; either version 2 of the License, or
7 *   (at your option) any later version.
8 *
9 *   This program is distributed in the hope that it will be useful,
10 *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
11 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
12 *   the GNU General Public License for more details.
13 *
14 *   You should have received a copy of the GNU General Public License
15 *   along with this program;  if not, write to the Free Software
16 *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */
18/*
19 *      jfs_xtree.c: extent allocation descriptor B+-tree manager
20 */
21
22#include <linux/fs.h>
23#include "jfs_incore.h"
24#include "jfs_filsys.h"
25#include "jfs_metapage.h"
26#include "jfs_dmap.h"
27#include "jfs_dinode.h"
28#include "jfs_superblock.h"
29#include "jfs_debug.h"
30
31/*
32 * xtree local flag
33 */
34#define XT_INSERT       0x00000001
35
36/*
37 *       xtree key/entry comparison: extent offset
38 *
39 * return:
40 *      -1: k < start of extent
41 *       0: start_of_extent <= k <= end_of_extent
42 *       1: k > end_of_extent
43 */
44#define XT_CMP(CMP, K, X, OFFSET64)\
45{\
46        OFFSET64 = offsetXAD(X);\
47        (CMP) = ((K) >= OFFSET64 + lengthXAD(X)) ? 1 :\
48              ((K) < OFFSET64) ? -1 : 0;\
49}
50
/* write a xad entry: flag, offset, length, address.
 * do { } while (0) makes the macro statement-safe (usable as the
 * sole body of an unbraced if/else).
 */
#define XT_PUTENTRY(XAD, FLAG, OFF, LEN, ADDR)\
do {\
	(XAD)->flag = (FLAG);\
	XADoffset((XAD), (OFF));\
	XADlength((XAD), (LEN));\
	XADaddress((XAD), (ADDR));\
} while (0)
59
60#define XT_PAGE(IP, MP) BT_PAGE(IP, MP, xtpage_t, i_xtroot)
61
/* get page buffer for specified block address.
 * on success (RC == 0) the page header is sanity-checked; a corrupt
 * page is released, the superblock is marked dirty (so fsck runs),
 * MP is cleared and RC is set to EIO.
 * do { } while (0) makes the macro statement-safe.
 */
#define XT_GETPAGE(IP, BN, MP, SIZE, P, RC)\
do {\
	BT_GETPAGE(IP, BN, MP, xtpage_t, SIZE, P, RC, i_xtroot)\
	if (!(RC)) {\
		/* validate header invariants before trusting the page */\
		if ((le16_to_cpu((P)->header.nextindex) < XTENTRYSTART) ||\
		    (le16_to_cpu((P)->header.nextindex) > le16_to_cpu((P)->header.maxentry)) ||\
		    (le16_to_cpu((P)->header.maxentry) > (((BN) == 0) ? XTROOTMAXSLOT : PSIZE >> L2XTSLOTSIZE))) {\
			jERROR(1, ("XT_GETPAGE: xtree page corrupt\n"));\
			BT_PUTPAGE(MP);\
			updateSuper((IP)->i_sb, FM_DIRTY);\
			(MP) = NULL;\
			(RC) = EIO;\
		}\
	}\
} while (0)
80
81/* for consistency */
82#define XT_PUTPAGE(MP) BT_PUTPAGE(MP)
83
84#define XT_GETSEARCH(IP, LEAF, BN, MP,  P, INDEX) \
85	BT_GETSEARCH(IP, LEAF, BN, MP, xtpage_t, P, INDEX, i_xtroot)
/* xtree entry parameter descriptor:
 * describes the entry to insert and the page it goes into,
 * passed from xtInsert() down through the split routines.
 */
struct xtsplit {
	struct metapage *mp;	/* page to insert into (pinned by caller) */
	s16 index;		/* index at which the new entry is inserted */
	u8 flag;		/* xad flag for the new entry */
	s64 off;		/* extent offset of the new entry */
	s64 addr;		/* extent address of the new entry */
	int len;		/* extent length of the new entry */
	struct pxdlist *pxdlist;	/* preallocated blocks for split pages, or NULL */
};
96
97
98/*
99 *      statistics
100 */
101#ifdef CONFIG_JFS_STATISTICS
102static struct {
103	uint search;
104	uint fastSearch;
105	uint split;
106} xtStat;
107#endif
108
109
110/*
111 * forward references
112 */
113static int xtSearch(struct inode *ip,
114		    s64 xoff, int *cmpp, struct btstack * btstack, int flag);
115
116static int xtSplitUp(tid_t tid,
117		     struct inode *ip,
118		     struct xtsplit * split, struct btstack * btstack);
119
120static int xtSplitPage(tid_t tid, struct inode *ip, struct xtsplit * split,
121		       struct metapage ** rmpp, s64 * rbnp);
122
123static int xtSplitRoot(tid_t tid, struct inode *ip,
124		       struct xtsplit * split, struct metapage ** rmpp);
125
126#ifdef _STILL_TO_PORT
127static int xtDeleteUp(tid_t tid, struct inode *ip, struct metapage * fmp,
128		      xtpage_t * fp, struct btstack * btstack);
129
130static int xtSearchNode(struct inode *ip,
131			xad_t * xad,
132			int *cmpp, struct btstack * btstack, int flag);
133
134static int xtRelink(tid_t tid, struct inode *ip, xtpage_t * fp);
135#endif				/*  _STILL_TO_PORT */
136
137/* External references */
138
139/*
140 *      debug control
141 */
142/*      #define _JFS_DEBUG_XTREE        1 */
143
144
145/*
146 *      xtLookup()
147 *
148 * function: map a single page into a physical extent;
149 */
150int xtLookup(struct inode *ip, s64 lstart,
151	     s64 llen, int *pflag, s64 * paddr, s32 * plen, int no_check)
152{
153	int rc = 0;
154	struct btstack btstack;
155	int cmp;
156	s64 bn;
157	struct metapage *mp;
158	xtpage_t *p;
159	int index;
160	xad_t *xad;
161	s64 size, xoff, xend;
162	int xlen;
163	s64 xaddr;
164
165	*plen = 0;
166
167	if (!no_check) {
168		/* is lookup offset beyond eof ? */
169		size = ((u64) ip->i_size + (JFS_SBI(ip->i_sb)->bsize - 1)) >>
170		    JFS_SBI(ip->i_sb)->l2bsize;
171		if (lstart >= size) {
172			jERROR(1,
173			       ("xtLookup: lstart (0x%lx) >= size (0x%lx)\n",
174				(ulong) lstart, (ulong) size));
175			return 0;
176		}
177	}
178
179	/*
180	 * search for the xad entry covering the logical extent
181	 */
182//search:
183	if ((rc = xtSearch(ip, lstart, &cmp, &btstack, 0))) {
184		jERROR(1, ("xtLookup: xtSearch returned %d\n", rc));
185		return rc;
186	}
187
188	/*
189	 *      compute the physical extent covering logical extent
190	 *
191	 * N.B. search may have failed (e.g., hole in sparse file),
192	 * and returned the index of the next entry.
193	 */
194	/* retrieve search result */
195	XT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
196
197	/* is xad found covering start of logical extent ?
198	 * lstart is a page start address,
199	 * i.e., lstart cannot start in a hole;
200	 */
201	if (cmp) {
202		jFYI(1, ("xtLookup: cmp = %d\n", cmp));
203		goto out;
204	}
205
206	/*
207	 * lxd covered by xad
208	 */
209	xad = &p->xad[index];
210	xoff = offsetXAD(xad);
211	xlen = lengthXAD(xad);
212	xend = xoff + xlen;
213	xaddr = addressXAD(xad);
214
215	jEVENT(0,
216	       ("index = %d, xoff = 0x%lx, xlen = 0x%x, xaddr = 0x%lx\n",
217		index, (ulong) xoff, xlen, (ulong) xaddr));
218
219	/* initialize new pxd */
220	*pflag = xad->flag;
221	*paddr = xaddr + (lstart - xoff);
222	/* a page must be fully covered by an xad */
223	*plen = min(xend - lstart, llen);
224
225      out:
226	XT_PUTPAGE(mp);
227
228	return rc;
229}
230
231
232/*
233 *      xtLookupList()
234 *
235 * function: map a single logical extent into a list of physical extent;
236 *
237 * parameter:
238 *      struct inode    *ip,
239 *      struct lxdlist  *lxdlist,       lxd list (in)
240 *      struct xadlist  *xadlist,       xad list (in/out)
241 *      int		flag)
242 *
243 * coverage of lxd by xad under assumption of
244 * . lxd's are ordered and disjoint.
245 * . xad's are ordered and disjoint.
246 *
247 * return:
248 *      0:      success
249 *
250 * note: a page being written (even a single byte) is backed fully,
251 *      except the last page which is only backed with blocks
252 *      required to cover the last byte;
253 *      the extent backing a page is fully contained within an xad;
254 */
int xtLookupList(struct inode *ip, struct lxdlist * lxdlist,
		 struct xadlist * xadlist, int flag)
{
	int rc = 0;
	struct btstack btstack;
	int cmp;
	s64 bn;
	struct metapage *mp;
	xtpage_t *p;
	int index;
	lxd_t *lxd;
	xad_t *xad, *pxd;
	s64 size, lstart, lend, xstart, xend, pstart;
	s64 llen, xlen, plen;
	s64 xaddr, paddr;
	int nlxd, npxd, maxnpxd;

	/* output xad list: filled up to maxnpxd entries */
	npxd = xadlist->nxad = 0;
	maxnpxd = xadlist->maxnxad;
	pxd = xadlist->xad;

	/* input lxd list: the logical extents to map */
	nlxd = lxdlist->nlxd;
	lxd = lxdlist->lxd;

	/* current logical window [lstart, lend) */
	lstart = offsetLXD(lxd);
	llen = lengthLXD(lxd);
	lend = lstart + llen;

	/* file size in fs blocks, rounded up */
	size = (ip->i_size + (JFS_SBI(ip->i_sb)->bsize - 1)) >>
	    JFS_SBI(ip->i_sb)->l2bsize;

	/*
	 * search for the xad entry covering the logical extent
	 */
      search:
	if (lstart >= size)
		return 0;

	if ((rc = xtSearch(ip, lstart, &cmp, &btstack, 0)))
		return rc;

	/*
	 *      compute the physical extent covering logical extent
	 *
	 * N.B. search may have failed (e.g., hole in sparse file),
	 * and returned the index of the next entry.
	 */
//map:
	/* retrieve search result */
	XT_GETSEARCH(ip, btstack.top, bn, mp, p, index);

	/* is xad on the next sibling page ? */
	if (index == le16_to_cpu(p->header.nextindex)) {
		if (p->header.flag & BT_ROOT)
			goto mapend;

		if ((bn = le64_to_cpu(p->header.next)) == 0)
			goto mapend;

		XT_PUTPAGE(mp);

		/* get next sibling page */
		XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
		if (rc)
			return rc;

		index = XTENTRYSTART;
	}

	xad = &p->xad[index];

	/*
	 * is lxd covered by xad ?
	 */
      compare:
	/* current physical window [xstart, xend) backed at xaddr */
	xstart = offsetXAD(xad);
	xlen = lengthXAD(xad);
	xend = xstart + xlen;
	xaddr = addressXAD(xad);

      compare1:
	if (xstart < lstart)
		goto compare2;

	/* (lstart <= xstart) */

	/* lxd is NOT covered by xad */
	if (lend <= xstart) {
		/*
		 * get next lxd
		 */
		if (--nlxd == 0)
			goto mapend;
		lxd++;

		lstart = offsetLXD(lxd);
		llen = lengthLXD(lxd);
		lend = lstart + llen;
		if (lstart >= size)
			goto mapend;

		/* compare with the current xad  */
		goto compare1;
	}
	/* lxd is covered by xad */
	else {			/* (xstart < lend) */

		/* initialize new pxd */
		pstart = xstart;
		plen = min(lend - xstart, xlen);
		paddr = xaddr;

		goto cover;
	}

	/* (xstart < lstart) */
      compare2:
	/* lxd is covered by xad */
	if (lstart < xend) {
		/* initialize new pxd */
		pstart = lstart;
		plen = min(xend - lstart, llen);
		paddr = xaddr + (lstart - xstart);

		goto cover;
	}
	/* lxd is NOT covered by xad */
	else {			/* (xend <= lstart) */

		/*
		 * get next xad
		 *
		 * linear search next xad covering lxd on
		 * the current xad page, and then tree search
		 */
		if (index == le16_to_cpu(p->header.nextindex) - 1) {
			if (p->header.flag & BT_ROOT)
				goto mapend;

			/* last entry of leaf: restart with a tree search */
			XT_PUTPAGE(mp);
			goto search;
		} else {
			index++;
			xad++;

			/* compare with new xad */
			goto compare;
		}
	}

	/*
	 * lxd is covered by xad and a new pxd has been initialized
	 * (lstart <= xstart < lend) or (xstart < lstart < xend)
	 */
      cover:
	/* finalize pxd corresponding to current xad */
	XT_PUTENTRY(pxd, xad->flag, pstart, plen, paddr);

	/* stop when the output list is full */
	if (++npxd >= maxnpxd)
		goto mapend;
	pxd++;

	/*
	 * lxd is fully covered by xad
	 */
	if (lend <= xend) {
		/*
		 * get next lxd
		 */
		if (--nlxd == 0)
			goto mapend;
		lxd++;

		lstart = offsetLXD(lxd);
		llen = lengthLXD(lxd);
		lend = lstart + llen;
		if (lstart >= size)
			goto mapend;

		/*
		 * test for old xad covering new lxd
		 * (old xstart < new lstart)
		 */
		goto compare2;
	}
	/*
	 * lxd is partially covered by xad
	 */
	else {			/* (xend < lend)  */

		/*
		 * get next xad
		 *
		 * linear search next xad covering lxd on
		 * the current xad page, and then next xad page search
		 */
		if (index == le16_to_cpu(p->header.nextindex) - 1) {
			if (p->header.flag & BT_ROOT)
				goto mapend;

			if ((bn = le64_to_cpu(p->header.next)) == 0)
				goto mapend;

			XT_PUTPAGE(mp);

			/* get next sibling page */
			XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
			if (rc)
				return rc;

			index = XTENTRYSTART;
			xad = &p->xad[index];
		} else {
			index++;
			xad++;
		}

		/*
		 * test for new xad covering old lxd
		 * (old lstart < new xstart)
		 */
		goto compare;
	}

      mapend:
	xadlist->nxad = npxd;

//out:
	/* unpin the last page referenced */
	XT_PUTPAGE(mp);

	return rc;
}
487
488
489/*
490 *      xtSearch()
491 *
492 * function:    search for the xad entry covering specified offset.
493 *
494 * parameters:
495 *      ip      - file object;
496 *      xoff    - extent offset;
497 *      cmpp    - comparison result:
498 *      btstack - traverse stack;
499 *      flag    - search process flag (XT_INSERT);
500 *
501 * returns:
502 *      btstack contains (bn, index) of search path traversed to the entry.
503 *      *cmpp is set to result of comparison with the entry returned.
504 *      the page containing the entry is pinned at exit.
505 */
506static int xtSearch(struct inode *ip, s64 xoff,	/* offset of extent */
507		    int *cmpp, struct btstack * btstack, int flag)
508{
509	struct jfs_inode_info *jfs_ip = JFS_IP(ip);
510	int rc = 0;
511	int cmp = 1;		/* init for empty page */
512	s64 bn;			/* block number */
513	struct metapage *mp;	/* page buffer */
514	xtpage_t *p;		/* page */
515	xad_t *xad;
516	int base, index, lim, btindex;
517	struct btframe *btsp;
518	int nsplit = 0;		/* number of pages to split */
519	s64 t64;
520
521	INCREMENT(xtStat.search);
522
523	BT_CLR(btstack);
524
525	btstack->nsplit = 0;
526
527	/*
528	 *      search down tree from root:
529	 *
530	 * between two consecutive entries of <Ki, Pi> and <Kj, Pj> of
531	 * internal page, child page Pi contains entry with k, Ki <= K < Kj.
532	 *
533	 * if entry with search key K is not found
534	 * internal page search find the entry with largest key Ki
535	 * less than K which point to the child page to search;
536	 * leaf page search find the entry with smallest key Kj
537	 * greater than K so that the returned index is the position of
538	 * the entry to be shifted right for insertion of new entry.
539	 * for empty tree, search key is greater than any key of the tree.
540	 *
541	 * by convention, root bn = 0.
542	 */
543	for (bn = 0;;) {
544		/* get/pin the page to search */
545		XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
546		if (rc)
547			return rc;
548
549		/* try sequential access heuristics with the previous
550		 * access entry in target leaf page:
551		 * once search narrowed down into the target leaf,
552		 * key must either match an entry in the leaf or
553		 * key entry does not exist in the tree;
554		 */
555//fastSearch:
556		if ((jfs_ip->btorder & BT_SEQUENTIAL) &&
557		    (p->header.flag & BT_LEAF) &&
558		    (index = jfs_ip->btindex) <
559		    le16_to_cpu(p->header.nextindex)) {
560			xad = &p->xad[index];
561			t64 = offsetXAD(xad);
562			if (xoff < t64 + lengthXAD(xad)) {
563				if (xoff >= t64) {
564					*cmpp = 0;
565					goto out;
566				}
567
568				/* stop sequential access heuristics */
569				goto binarySearch;
570			} else {	/* (t64 + lengthXAD(xad)) <= xoff */
571
572				/* try next sequential entry */
573				index++;
574				if (index <
575				    le16_to_cpu(p->header.nextindex)) {
576					xad++;
577					t64 = offsetXAD(xad);
578					if (xoff < t64 + lengthXAD(xad)) {
579						if (xoff >= t64) {
580							*cmpp = 0;
581							goto out;
582						}
583
584						/* miss: key falls between
585						 * previous and this entry
586						 */
587						*cmpp = 1;
588						goto out;
589					}
590
591					/* (xoff >= t64 + lengthXAD(xad));
592					 * matching entry may be further out:
593					 * stop heuristic search
594					 */
595					/* stop sequential access heuristics */
596					goto binarySearch;
597				}
598
599				/* (index == p->header.nextindex);
600				 * miss: key entry does not exist in
601				 * the target leaf/tree
602				 */
603				*cmpp = 1;
604				goto out;
605			}
606
607			/*
608			 * if hit, return index of the entry found, and
609			 * if miss, where new entry with search key is
610			 * to be inserted;
611			 */
612		      out:
613			/* compute number of pages to split */
614			if (flag & XT_INSERT) {
615				if (p->header.nextindex ==	/* little-endian */
616				    p->header.maxentry)
617					nsplit++;
618				else
619					nsplit = 0;
620				btstack->nsplit = nsplit;
621			}
622
623			/* save search result */
624			btsp = btstack->top;
625			btsp->bn = bn;
626			btsp->index = index;
627			btsp->mp = mp;
628
629			/* update sequential access heuristics */
630			jfs_ip->btindex = index;
631
632			INCREMENT(xtStat.fastSearch);
633			return 0;
634		}
635
636		/* well, ... full search now */
637	      binarySearch:
638		lim = le16_to_cpu(p->header.nextindex) - XTENTRYSTART;
639
640		/*
641		 * binary search with search key K on the current page
642		 */
643		for (base = XTENTRYSTART; lim; lim >>= 1) {
644			index = base + (lim >> 1);
645
646			XT_CMP(cmp, xoff, &p->xad[index], t64);
647			if (cmp == 0) {
648				/*
649				 *      search hit
650				 */
651				/* search hit - leaf page:
652				 * return the entry found
653				 */
654				if (p->header.flag & BT_LEAF) {
655					*cmpp = cmp;
656
657					/* compute number of pages to split */
658					if (flag & XT_INSERT) {
659						if (p->header.nextindex ==
660						    p->header.maxentry)
661							nsplit++;
662						else
663							nsplit = 0;
664						btstack->nsplit = nsplit;
665					}
666
667					/* save search result */
668					btsp = btstack->top;
669					btsp->bn = bn;
670					btsp->index = index;
671					btsp->mp = mp;
672
673					/* init sequential access heuristics */
674					btindex = jfs_ip->btindex;
675					if (index == btindex ||
676					    index == btindex + 1)
677						jfs_ip->btorder = BT_SEQUENTIAL;
678					else
679						jfs_ip->btorder = BT_RANDOM;
680					jfs_ip->btindex = index;
681
682					return 0;
683				}
684
685				/* search hit - internal page:
686				 * descend/search its child page
687				 */
688				goto next;
689			}
690
691			if (cmp > 0) {
692				base = index + 1;
693				--lim;
694			}
695		}
696
697		/*
698		 *      search miss
699		 *
700		 * base is the smallest index with key (Kj) greater than
701		 * search key (K) and may be zero or maxentry index.
702		 */
703		/*
704		 * search miss - leaf page:
705		 *
706		 * return location of entry (base) where new entry with
707		 * search key K is to be inserted.
708		 */
709		if (p->header.flag & BT_LEAF) {
710			*cmpp = cmp;
711
712			/* compute number of pages to split */
713			if (flag & XT_INSERT) {
714				if (p->header.nextindex ==
715				    p->header.maxentry)
716					nsplit++;
717				else
718					nsplit = 0;
719				btstack->nsplit = nsplit;
720			}
721
722			/* save search result */
723			btsp = btstack->top;
724			btsp->bn = bn;
725			btsp->index = base;
726			btsp->mp = mp;
727
728			/* init sequential access heuristics */
729			btindex = jfs_ip->btindex;
730			if (base == btindex || base == btindex + 1)
731				jfs_ip->btorder = BT_SEQUENTIAL;
732			else
733				jfs_ip->btorder = BT_RANDOM;
734			jfs_ip->btindex = base;
735
736			return 0;
737		}
738
739		/*
740		 * search miss - non-leaf page:
741		 *
742		 * if base is non-zero, decrement base by one to get the parent
743		 * entry of the child page to search.
744		 */
745		index = base ? base - 1 : base;
746
747		/*
748		 * go down to child page
749		 */
750	      next:
751		/* update number of pages to split */
752		if (p->header.nextindex == p->header.maxentry)
753			nsplit++;
754		else
755			nsplit = 0;
756
757		/* push (bn, index) of the parent page/entry */
758		BT_PUSH(btstack, bn, index);
759
760		/* get the child page block number */
761		bn = addressXAD(&p->xad[index]);
762
763		/* unpin the parent page */
764		XT_PUTPAGE(mp);
765	}
766}
767
768/*
769 *      xtInsert()
770 *
771 * function:
772 *
773 * parameter:
774 *      tid     - transaction id;
775 *      ip      - file object;
776 *      xflag   - extent flag (XAD_NOTRECORDED):
777 *      xoff    - extent offset;
778 *      xlen    - extent length;
779 *      xaddrp  - extent address pointer (in/out):
780 *              if (*xaddrp)
781 *                      caller allocated data extent at *xaddrp;
782 *              else
783 *                      allocate data extent and return its xaddr;
784 *      flag    -
785 *
786 * return:
787 */
788int xtInsert(tid_t tid,		/* transaction id */
789	     struct inode *ip, int xflag, s64 xoff, s32 xlen, s64 * xaddrp,
790	     int flag)
791{
792	int rc = 0;
793	s64 xaddr, hint;
794	struct metapage *mp;	/* meta-page buffer */
795	xtpage_t *p;		/* base B+-tree index page */
796	s64 bn;
797	int index, nextindex;
798	struct btstack btstack;	/* traverse stack */
799	struct xtsplit split;	/* split information */
800	xad_t *xad;
801	int cmp;
802	struct tlock *tlck;
803	struct xtlock *xtlck;
804
805	jFYI(1,
806	     ("xtInsert: nxoff:0x%lx nxlen:0x%x\n", (ulong) xoff, xlen));
807
808	/*
809	 *      search for the entry location at which to insert:
810	 *
811	 * xtFastSearch() and xtSearch() both returns (leaf page
812	 * pinned, index at which to insert).
813	 * n.b. xtSearch() may return index of maxentry of
814	 * the full page.
815	 */
816	if ((rc = xtSearch(ip, xoff, &cmp, &btstack, XT_INSERT)))
817		return rc;
818
819	/* retrieve search result */
820	XT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
821
822	/* This test must follow XT_GETSEARCH since mp must be valid if
823	 * we branch to out: */
824	if (cmp == 0) {
825		rc = EEXIST;
826		goto out;
827	}
828
829	/*
830	 * allocate data extent requested
831	 *
832	 * allocation hint: last xad
833	 */
834	if ((xaddr = *xaddrp) == 0) {
835		if (index > XTENTRYSTART) {
836			xad = &p->xad[index - 1];
837			hint = addressXAD(xad) + lengthXAD(xad) - 1;
838		} else
839			hint = 0;
840		if ((rc = dbAlloc(ip, hint, (s64) xlen, &xaddr)))
841			goto out;
842	}
843
844	/*
845	 *      insert entry for new extent
846	 */
847	xflag |= XAD_NEW;
848
849	/*
850	 *      if the leaf page is full, split the page and
851	 *      propagate up the router entry for the new page from split
852	 *
853	 * The xtSplitUp() will insert the entry and unpin the leaf page.
854	 */
855	nextindex = le16_to_cpu(p->header.nextindex);
856	if (nextindex == le16_to_cpu(p->header.maxentry)) {
857		split.mp = mp;
858		split.index = index;
859		split.flag = xflag;
860		split.off = xoff;
861		split.len = xlen;
862		split.addr = xaddr;
863		split.pxdlist = NULL;
864		if ((rc = xtSplitUp(tid, ip, &split, &btstack))) {
865			/* undo data extent allocation */
866			if (*xaddrp == 0)
867				dbFree(ip, xaddr, (s64) xlen);
868			return rc;
869		}
870
871		*xaddrp = xaddr;
872		return 0;
873	}
874
875	/*
876	 *      insert the new entry into the leaf page
877	 */
878	/*
879	 * acquire a transaction lock on the leaf page;
880	 *
881	 * action: xad insertion/extension;
882	 */
883	BT_MARK_DIRTY(mp, ip);
884
885	/* if insert into middle, shift right remaining entries. */
886	if (index < nextindex)
887		memmove(&p->xad[index + 1], &p->xad[index],
888			(nextindex - index) * sizeof(xad_t));
889
890	/* insert the new entry: mark the entry NEW */
891	xad = &p->xad[index];
892	XT_PUTENTRY(xad, xflag, xoff, xlen, xaddr);
893
894	/* advance next available entry index */
895	p->header.nextindex =
896	    cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1);
897
898	/* Don't log it if there are no links to the file */
899	if (!test_cflag(COMMIT_Nolink, ip)) {
900		tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW);
901		xtlck = (struct xtlock *) & tlck->lock;
902		xtlck->lwm.offset =
903		    (xtlck->lwm.offset) ? min(index,
904					      (int)xtlck->lwm.offset) : index;
905		xtlck->lwm.length =
906		    le16_to_cpu(p->header.nextindex) - xtlck->lwm.offset;
907	}
908
909	*xaddrp = xaddr;
910
911      out:
912	/* unpin the leaf page */
913	XT_PUTPAGE(mp);
914
915	return rc;
916}
917
918
919/*
920 *      xtSplitUp()
921 *
922 * function:
923 *      split full pages as propagating insertion up the tree
924 *
925 * parameter:
926 *      tid     - transaction id;
927 *      ip      - file object;
928 *      split   - entry parameter descriptor;
929 *      btstack - traverse stack from xtSearch()
930 *
931 * return:
932 */
933static int
934xtSplitUp(tid_t tid,
935	  struct inode *ip, struct xtsplit * split, struct btstack * btstack)
936{
937	int rc = 0;
938	struct metapage *smp;
939	xtpage_t *sp;		/* split page */
940	struct metapage *rmp;
941	s64 rbn;		/* new right page block number */
942	struct metapage *rcmp;
943	xtpage_t *rcp;		/* right child page */
944	s64 rcbn;		/* right child page block number */
945	int skip;		/* index of entry of insertion */
946	int nextindex;		/* next available entry index of p */
947	struct btframe *parent;	/* parent page entry on traverse stack */
948	xad_t *xad;
949	s64 xaddr;
950	int xlen;
951	int nsplit;		/* number of pages split */
952	struct pxdlist pxdlist;
953	pxd_t *pxd;
954	struct tlock *tlck;
955	struct xtlock *xtlck;
956
957	smp = split->mp;
958	sp = XT_PAGE(ip, smp);
959
960	/* is inode xtree root extension/inline EA area free ? */
961	if ((sp->header.flag & BT_ROOT) && (!S_ISDIR(ip->i_mode)) &&
962	    (sp->header.maxentry < cpu_to_le16(XTROOTMAXSLOT)) &&
963	    (JFS_IP(ip)->mode2 & INLINEEA)) {
964		sp->header.maxentry = cpu_to_le16(XTROOTMAXSLOT);
965		JFS_IP(ip)->mode2 &= ~INLINEEA;
966
967		BT_MARK_DIRTY(smp, ip);
968		/*
969		 * acquire a transaction lock on the leaf page;
970		 *
971		 * action: xad insertion/extension;
972		 */
973
974		/* if insert into middle, shift right remaining entries. */
975		skip = split->index;
976		nextindex = le16_to_cpu(sp->header.nextindex);
977		if (skip < nextindex)
978			memmove(&sp->xad[skip + 1], &sp->xad[skip],
979				(nextindex - skip) * sizeof(xad_t));
980
981		/* insert the new entry: mark the entry NEW */
982		xad = &sp->xad[skip];
983		XT_PUTENTRY(xad, split->flag, split->off, split->len,
984			    split->addr);
985
986		/* advance next available entry index */
987		sp->header.nextindex =
988		    cpu_to_le16(le16_to_cpu(sp->header.nextindex) + 1);
989
990		/* Don't log it if there are no links to the file */
991		if (!test_cflag(COMMIT_Nolink, ip)) {
992			tlck = txLock(tid, ip, smp, tlckXTREE | tlckGROW);
993			xtlck = (struct xtlock *) & tlck->lock;
994			xtlck->lwm.offset = (xtlck->lwm.offset) ?
995			    min(skip, (int)xtlck->lwm.offset) : skip;
996			xtlck->lwm.length =
997			    le16_to_cpu(sp->header.nextindex) -
998			    xtlck->lwm.offset;
999		}
1000
1001		return 0;
1002	}
1003
1004	/*
1005	 * allocate new index blocks to cover index page split(s)
1006	 *
1007	 * allocation hint: ?
1008	 */
1009	if (split->pxdlist == NULL) {
1010		nsplit = btstack->nsplit;
1011		split->pxdlist = &pxdlist;
1012		pxdlist.maxnpxd = pxdlist.npxd = 0;
1013		pxd = &pxdlist.pxd[0];
1014		xlen = JFS_SBI(ip->i_sb)->nbperpage;
1015		for (; nsplit > 0; nsplit--, pxd++) {
1016			if ((rc = dbAlloc(ip, (s64) 0, (s64) xlen, &xaddr))
1017			    == 0) {
1018				PXDaddress(pxd, xaddr);
1019				PXDlength(pxd, xlen);
1020
1021				pxdlist.maxnpxd++;
1022
1023				continue;
1024			}
1025
1026			/* undo allocation */
1027
1028			XT_PUTPAGE(smp);
1029			return rc;
1030		}
1031	}
1032
1033	/*
1034	 * Split leaf page <sp> into <sp> and a new right page <rp>.
1035	 *
1036	 * The split routines insert the new entry into the leaf page,
1037	 * and acquire txLock as appropriate.
1038	 * return <rp> pinned and its block number <rpbn>.
1039	 */
1040	rc = (sp->header.flag & BT_ROOT) ?
1041	    xtSplitRoot(tid, ip, split, &rmp) :
1042	    xtSplitPage(tid, ip, split, &rmp, &rbn);
1043	if (rc)
1044		return EIO;
1045
1046	XT_PUTPAGE(smp);
1047
1048	/*
1049	 * propagate up the router entry for the leaf page just split
1050	 *
1051	 * insert a router entry for the new page into the parent page,
1052	 * propagate the insert/split up the tree by walking back the stack
1053	 * of (bn of parent page, index of child page entry in parent page)
1054	 * that were traversed during the search for the page that split.
1055	 *
1056	 * the propagation of insert/split up the tree stops if the root
1057	 * splits or the page inserted into doesn't have to split to hold
1058	 * the new entry.
1059	 *
1060	 * the parent entry for the split page remains the same, and
1061	 * a new entry is inserted at its right with the first key and
1062	 * block number of the new right page.
1063	 *
1064	 * There are a maximum of 3 pages pinned at any time:
1065	 * right child, left parent and right parent (when the parent splits)
1066	 * to keep the child page pinned while working on the parent.
1067	 * make sure that all pins are released at exit.
1068	 */
1069	while ((parent = BT_POP(btstack)) != NULL) {
1070		/* parent page specified by stack frame <parent> */
1071
1072		/* keep current child pages <rcp> pinned */
1073		rcmp = rmp;
1074		rcbn = rbn;
1075		rcp = XT_PAGE(ip, rcmp);
1076
1077		/*
1078		 * insert router entry in parent for new right child page <rp>
1079		 */
1080		/* get/pin the parent page <sp> */
1081		XT_GETPAGE(ip, parent->bn, smp, PSIZE, sp, rc);
1082		if (rc)
1083			goto errout2;
1084
1085		/*
1086		 * The new key entry goes ONE AFTER the index of parent entry,
1087		 * because the split was to the right.
1088		 */
1089		skip = parent->index + 1;
1090
1091		/*
1092		 * split or shift right remaining entries of the parent page
1093		 */
1094		nextindex = le16_to_cpu(sp->header.nextindex);
1095		/*
1096		 * parent page is full - split the parent page
1097		 */
1098		if (nextindex == le16_to_cpu(sp->header.maxentry)) {
1099			/* init for parent page split */
1100			split->mp = smp;
1101			split->index = skip;	/* index at insert */
1102			split->flag = XAD_NEW;
1103			split->off = offsetXAD(&rcp->xad[XTENTRYSTART]);
1104			split->len = JFS_SBI(ip->i_sb)->nbperpage;
1105			split->addr = rcbn;
1106
1107			/* unpin previous right child page */
1108			XT_PUTPAGE(rcmp);
1109
1110			/* The split routines insert the new entry,
1111			 * and acquire txLock as appropriate.
1112			 * return <rp> pinned and its block number <rpbn>.
1113			 */
1114			rc = (sp->header.flag & BT_ROOT) ?
1115			    xtSplitRoot(tid, ip, split, &rmp) :
1116			    xtSplitPage(tid, ip, split, &rmp, &rbn);
1117			if (rc)
1118				goto errout1;
1119
1120			XT_PUTPAGE(smp);
1121			/* keep new child page <rp> pinned */
1122		}
1123		/*
1124		 * parent page is not full - insert in parent page
1125		 */
1126		else {
1127			/*
1128			 * insert router entry in parent for the right child
1129			 * page from the first entry of the right child page:
1130			 */
1131			/*
1132			 * acquire a transaction lock on the parent page;
1133			 *
1134			 * action: router xad insertion;
1135			 */
1136			BT_MARK_DIRTY(smp, ip);
1137
1138			/*
1139			 * if insert into middle, shift right remaining entries
1140			 */
1141			if (skip < nextindex)
1142				memmove(&sp->xad[skip + 1], &sp->xad[skip],
1143					(nextindex -
1144					 skip) << L2XTSLOTSIZE);
1145
1146			/* insert the router entry */
1147			xad = &sp->xad[skip];
1148			XT_PUTENTRY(xad, XAD_NEW,
1149				    offsetXAD(&rcp->xad[XTENTRYSTART]),
1150				    JFS_SBI(ip->i_sb)->nbperpage, rcbn);
1151
1152			/* advance next available entry index. */
1153			sp->header.nextindex =
1154			    cpu_to_le16(le16_to_cpu(sp->header.nextindex) +
1155					1);
1156
1157			/* Don't log it if there are no links to the file */
1158			if (!test_cflag(COMMIT_Nolink, ip)) {
1159				tlck = txLock(tid, ip, smp,
1160					      tlckXTREE | tlckGROW);
1161				xtlck = (struct xtlock *) & tlck->lock;
1162				xtlck->lwm.offset = (xtlck->lwm.offset) ?
1163				    min(skip, (int)xtlck->lwm.offset) : skip;
1164				xtlck->lwm.length =
1165				    le16_to_cpu(sp->header.nextindex) -
1166				    xtlck->lwm.offset;
1167			}
1168
1169			/* unpin parent page */
1170			XT_PUTPAGE(smp);
1171
1172			/* exit propagate up */
1173			break;
1174		}
1175	}
1176
1177	/* unpin current right page */
1178	XT_PUTPAGE(rmp);
1179
1180	return 0;
1181
1182	/*
1183	 * If something fails in the above loop we were already walking back
1184	 * up the tree and the tree is now inconsistent.
1185	 * release all pages we're holding.
1186	 */
1187      errout1:
1188	XT_PUTPAGE(smp);
1189
1190      errout2:
1191	XT_PUTPAGE(rcmp);
1192
1193	return rc;
1194}
1195
1196
1197/*
1198 *      xtSplitPage()
1199 *
1200 * function:
1201 *      split a full non-root page into
1202 *      original/split/left page and new right page
1203 *      i.e., the original/split page remains as left page.
1204 *
1205 * parameter:
1206 *      int		tid,
1207 *      struct inode    *ip,
1208 *      struct xtsplit  *split,
1209 *      struct metapage	**rmpp,
1210 *      u64		*rbnp,
1211 *
1212 * return:
1213 *      Pointer to page in which to insert or NULL on error.
1214 */
1215static int
1216xtSplitPage(tid_t tid, struct inode *ip,
1217	    struct xtsplit * split, struct metapage ** rmpp, s64 * rbnp)
1218{
1219	int rc = 0;
1220	struct metapage *smp;
1221	xtpage_t *sp;
1222	struct metapage *rmp;
1223	xtpage_t *rp;		/* new right page allocated */
1224	s64 rbn;		/* new right page block number */
1225	struct metapage *mp;
1226	xtpage_t *p;
1227	s64 nextbn;
1228	int skip, maxentry, middle, righthalf, n;
1229	xad_t *xad;
1230	struct pxdlist *pxdlist;
1231	pxd_t *pxd;
1232	struct tlock *tlck;
1233	struct xtlock *sxtlck = 0, *rxtlck = 0;
1234
1235	smp = split->mp;
1236	sp = XT_PAGE(ip, smp);
1237
1238	INCREMENT(xtStat.split);
1239
1240	/*
1241	 * allocate the new right page for the split
1242	 */
1243	pxdlist = split->pxdlist;
1244	pxd = &pxdlist->pxd[pxdlist->npxd];
1245	pxdlist->npxd++;
1246	rbn = addressPXD(pxd);
1247	rmp = get_metapage(ip, rbn, PSIZE, 1);
1248	if (rmp == NULL)
1249		return EIO;
1250
1251	jEVENT(0,
1252	       ("xtSplitPage: ip:0x%p smp:0x%p rmp:0x%p\n", ip, smp, rmp));
1253
1254	BT_MARK_DIRTY(rmp, ip);
1255	/*
1256	 * action: new page;
1257	 */
1258
1259	rp = (xtpage_t *) rmp->data;
1260	rp->header.self = *pxd;
1261	rp->header.flag = sp->header.flag & BT_TYPE;
1262	rp->header.maxentry = sp->header.maxentry;	/* little-endian */
1263	rp->header.nextindex = cpu_to_le16(XTENTRYSTART);
1264
1265	BT_MARK_DIRTY(smp, ip);
1266	/* Don't log it if there are no links to the file */
1267	if (!test_cflag(COMMIT_Nolink, ip)) {
1268		/*
1269		 * acquire a transaction lock on the new right page;
1270		 */
1271		tlck = txLock(tid, ip, rmp, tlckXTREE | tlckNEW);
1272		rxtlck = (struct xtlock *) & tlck->lock;
1273		rxtlck->lwm.offset = XTENTRYSTART;
1274		/*
1275		 * acquire a transaction lock on the split page
1276		 */
1277		tlck = txLock(tid, ip, smp, tlckXTREE | tlckGROW);
1278		sxtlck = (struct xtlock *) & tlck->lock;
1279	}
1280
1281	/*
1282	 * initialize/update sibling pointers of <sp> and <rp>
1283	 */
1284	nextbn = le64_to_cpu(sp->header.next);
1285	rp->header.next = cpu_to_le64(nextbn);
1286	rp->header.prev = cpu_to_le64(addressPXD(&sp->header.self));
1287	sp->header.next = cpu_to_le64(rbn);
1288
1289	skip = split->index;
1290
1291	/*
1292	 *      sequential append at tail (after last entry of last page)
1293	 *
1294	 * if splitting the last page on a level because of appending
1295	 * a entry to it (skip is maxentry), it's likely that the access is
1296	 * sequential. adding an empty page on the side of the level is less
1297	 * work and can push the fill factor much higher than normal.
1298	 * if we're wrong it's no big deal -  we will do the split the right
1299	 * way next time.
1300	 * (it may look like it's equally easy to do a similar hack for
1301	 * reverse sorted data, that is, split the tree left, but it's not.
1302	 * Be my guest.)
1303	 */
1304	if (nextbn == 0 && skip == le16_to_cpu(sp->header.maxentry)) {
1305		/*
1306		 * acquire a transaction lock on the new/right page;
1307		 *
1308		 * action: xad insertion;
1309		 */
1310		/* insert entry at the first entry of the new right page */
1311		xad = &rp->xad[XTENTRYSTART];
1312		XT_PUTENTRY(xad, split->flag, split->off, split->len,
1313			    split->addr);
1314
1315		rp->header.nextindex = cpu_to_le16(XTENTRYSTART + 1);
1316
1317		if (!test_cflag(COMMIT_Nolink, ip)) {
1318			/* rxtlck->lwm.offset = XTENTRYSTART; */
1319			rxtlck->lwm.length = 1;
1320		}
1321
1322		*rmpp = rmp;
1323		*rbnp = rbn;
1324
1325		ip->i_blocks += LBLK2PBLK(ip->i_sb, lengthPXD(pxd));
1326
1327		jEVENT(0, ("xtSplitPage: sp:0x%p rp:0x%p\n", sp, rp));
1328		return 0;
1329	}
1330
1331	/*
1332	 *      non-sequential insert (at possibly middle page)
1333	 */
1334
1335	/*
1336	 * update previous pointer of old next/right page of <sp>
1337	 */
1338	if (nextbn != 0) {
1339		XT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc);
1340		if (rc) {
1341			XT_PUTPAGE(rmp);
1342			return rc;
1343		}
1344
1345		BT_MARK_DIRTY(mp, ip);
1346		/*
1347		 * acquire a transaction lock on the next page;
1348		 *
1349		 * action:sibling pointer update;
1350		 */
1351		if (!test_cflag(COMMIT_Nolink, ip))
1352			tlck = txLock(tid, ip, mp, tlckXTREE | tlckRELINK);
1353
1354		p->header.prev = cpu_to_le64(rbn);
1355
1356		/* sibling page may have been updated previously, or
1357		 * it may be updated later;
1358		 */
1359
1360		XT_PUTPAGE(mp);
1361	}
1362
1363	/*
1364	 * split the data between the split and new/right pages
1365	 */
1366	maxentry = le16_to_cpu(sp->header.maxentry);
1367	middle = maxentry >> 1;
1368	righthalf = maxentry - middle;
1369
1370	/*
1371	 * skip index in old split/left page - insert into left page:
1372	 */
1373	if (skip <= middle) {
1374		/* move right half of split page to the new right page */
1375		memmove(&rp->xad[XTENTRYSTART], &sp->xad[middle],
1376			righthalf << L2XTSLOTSIZE);
1377
1378		/* shift right tail of left half to make room for new entry */
1379		if (skip < middle)
1380			memmove(&sp->xad[skip + 1], &sp->xad[skip],
1381				(middle - skip) << L2XTSLOTSIZE);
1382
1383		/* insert new entry */
1384		xad = &sp->xad[skip];
1385		XT_PUTENTRY(xad, split->flag, split->off, split->len,
1386			    split->addr);
1387
1388		/* update page header */
1389		sp->header.nextindex = cpu_to_le16(middle + 1);
1390		if (!test_cflag(COMMIT_Nolink, ip)) {
1391			sxtlck->lwm.offset = (sxtlck->lwm.offset) ?
1392			    min(skip, (int)sxtlck->lwm.offset) : skip;
1393		}
1394
1395		rp->header.nextindex =
1396		    cpu_to_le16(XTENTRYSTART + righthalf);
1397	}
1398	/*
1399	 * skip index in new right page - insert into right page:
1400	 */
1401	else {
1402		/* move left head of right half to right page */
1403		n = skip - middle;
1404		memmove(&rp->xad[XTENTRYSTART], &sp->xad[middle],
1405			n << L2XTSLOTSIZE);
1406
1407		/* insert new entry */
1408		n += XTENTRYSTART;
1409		xad = &rp->xad[n];
1410		XT_PUTENTRY(xad, split->flag, split->off, split->len,
1411			    split->addr);
1412
1413		/* move right tail of right half to right page */
1414		if (skip < maxentry)
1415			memmove(&rp->xad[n + 1], &sp->xad[skip],
1416				(maxentry - skip) << L2XTSLOTSIZE);
1417
1418		/* update page header */
1419		sp->header.nextindex = cpu_to_le16(middle);
1420		if (!test_cflag(COMMIT_Nolink, ip)) {
1421			sxtlck->lwm.offset = (sxtlck->lwm.offset) ?
1422			    min(middle, (int)sxtlck->lwm.offset) : middle;
1423		}
1424
1425		rp->header.nextindex = cpu_to_le16(XTENTRYSTART +
1426						   righthalf + 1);
1427	}
1428
1429	if (!test_cflag(COMMIT_Nolink, ip)) {
1430		sxtlck->lwm.length = le16_to_cpu(sp->header.nextindex) -
1431		    sxtlck->lwm.offset;
1432
1433		/* rxtlck->lwm.offset = XTENTRYSTART; */
1434		rxtlck->lwm.length = le16_to_cpu(rp->header.nextindex) -
1435		    XTENTRYSTART;
1436	}
1437
1438	*rmpp = rmp;
1439	*rbnp = rbn;
1440
1441	ip->i_blocks += LBLK2PBLK(ip->i_sb, lengthPXD(pxd));
1442
1443	jEVENT(0, ("xtSplitPage: sp:0x%p rp:0x%p\n", sp, rp));
1444	return rc;
1445}
1446
1447
1448/*
1449 *      xtSplitRoot()
1450 *
1451 * function:
1452 *      split the full root page into
1453 *      original/root/split page and new right page
1454 *      i.e., root remains fixed in tree anchor (inode) and
1455 *      the root is copied to a single new right child page
1456 *      since root page << non-root page, and
1457 *      the split root page contains a single entry for the
1458 *      new right child page.
1459 *
1460 * parameter:
1461 *      int		tid,
1462 *      struct inode    *ip,
1463 *      struct xtsplit  *split,
1464 *      struct metapage	**rmpp)
1465 *
1466 * return:
1467 *      Pointer to page in which to insert or NULL on error.
1468 */
1469static int
1470xtSplitRoot(tid_t tid,
1471	    struct inode *ip, struct xtsplit * split, struct metapage ** rmpp)
1472{
1473	xtpage_t *sp;
1474	struct metapage *rmp;
1475	xtpage_t *rp;
1476	s64 rbn;
1477	int skip, nextindex;
1478	xad_t *xad;
1479	pxd_t *pxd;
1480	struct pxdlist *pxdlist;
1481	struct tlock *tlck;
1482	struct xtlock *xtlck;
1483
1484	sp = &JFS_IP(ip)->i_xtroot;
1485
1486	INCREMENT(xtStat.split);
1487
1488	/*
1489	 *      allocate a single (right) child page
1490	 */
1491	pxdlist = split->pxdlist;
1492	pxd = &pxdlist->pxd[pxdlist->npxd];
1493	pxdlist->npxd++;
1494	rbn = addressPXD(pxd);
1495	rmp = get_metapage(ip, rbn, PSIZE, 1);
1496	if (rmp == NULL)
1497		return EIO;
1498
1499	jEVENT(0, ("xtSplitRoot: ip:0x%p rmp:0x%p\n", ip, rmp));
1500
1501	/*
1502	 * acquire a transaction lock on the new right page;
1503	 *
1504	 * action: new page;
1505	 */
1506	BT_MARK_DIRTY(rmp, ip);
1507
1508	rp = (xtpage_t *) rmp->data;
1509	rp->header.flag =
1510	    (sp->header.flag & BT_LEAF) ? BT_LEAF : BT_INTERNAL;
1511	rp->header.self = *pxd;
1512	rp->header.nextindex = cpu_to_le16(XTENTRYSTART);
1513	rp->header.maxentry = cpu_to_le16(PSIZE >> L2XTSLOTSIZE);
1514
1515	/* initialize sibling pointers */
1516	rp->header.next = 0;
1517	rp->header.prev = 0;
1518
1519	/*
1520	 * copy the in-line root page into new right page extent
1521	 */
1522	nextindex = le16_to_cpu(sp->header.maxentry);
1523	memmove(&rp->xad[XTENTRYSTART], &sp->xad[XTENTRYSTART],
1524		(nextindex - XTENTRYSTART) << L2XTSLOTSIZE);
1525
1526	/*
1527	 * insert the new entry into the new right/child page
1528	 * (skip index in the new right page will not change)
1529	 */
1530	skip = split->index;
1531	/* if insert into middle, shift right remaining entries */
1532	if (skip != nextindex)
1533		memmove(&rp->xad[skip + 1], &rp->xad[skip],
1534			(nextindex - skip) * sizeof(xad_t));
1535
1536	xad = &rp->xad[skip];
1537	XT_PUTENTRY(xad, split->flag, split->off, split->len, split->addr);
1538
1539	/* update page header */
1540	rp->header.nextindex = cpu_to_le16(nextindex + 1);
1541
1542	if (!test_cflag(COMMIT_Nolink, ip)) {
1543		tlck = txLock(tid, ip, rmp, tlckXTREE | tlckNEW);
1544		xtlck = (struct xtlock *) & tlck->lock;
1545		xtlck->lwm.offset = XTENTRYSTART;
1546		xtlck->lwm.length = le16_to_cpu(rp->header.nextindex) -
1547		    XTENTRYSTART;
1548	}
1549
1550	/*
1551	 *      reset the root
1552	 *
1553	 * init root with the single entry for the new right page
1554	 * set the 1st entry offset to 0, which force the left-most key
1555	 * at any level of the tree to be less than any search key.
1556	 */
1557	/*
1558	 * acquire a transaction lock on the root page (in-memory inode);
1559	 *
1560	 * action: root split;
1561	 */
1562	BT_MARK_DIRTY(split->mp, ip);
1563
1564	xad = &sp->xad[XTENTRYSTART];
1565	XT_PUTENTRY(xad, XAD_NEW, 0, JFS_SBI(ip->i_sb)->nbperpage, rbn);
1566
1567	/* update page header of root */
1568	sp->header.flag &= ~BT_LEAF;
1569	sp->header.flag |= BT_INTERNAL;
1570
1571	sp->header.nextindex = cpu_to_le16(XTENTRYSTART + 1);
1572
1573	if (!test_cflag(COMMIT_Nolink, ip)) {
1574		tlck = txLock(tid, ip, split->mp, tlckXTREE | tlckGROW);
1575		xtlck = (struct xtlock *) & tlck->lock;
1576		xtlck->lwm.offset = XTENTRYSTART;
1577		xtlck->lwm.length = 1;
1578	}
1579
1580	*rmpp = rmp;
1581
1582	ip->i_blocks += LBLK2PBLK(ip->i_sb, lengthPXD(pxd));
1583
1584	jEVENT(0, ("xtSplitRoot: sp:0x%p rp:0x%p\n", sp, rp));
1585	return 0;
1586}
1587
1588
1589/*
1590 *      xtExtend()
1591 *
1592 * function: extend in-place;
1593 *
1594 * note: existing extent may or may not have been committed.
1595 * caller is responsible for pager buffer cache update, and
1596 * working block allocation map update;
1597 * update pmap: alloc whole extended extent;
1598 */
1599int xtExtend(tid_t tid,		/* transaction id */
1600	     struct inode *ip, s64 xoff,	/* delta extent offset */
1601	     s32 xlen,		/* delta extent length */
1602	     int flag)
1603{
1604	int rc = 0;
1605	int cmp;
1606	struct metapage *mp;	/* meta-page buffer */
1607	xtpage_t *p;		/* base B+-tree index page */
1608	s64 bn;
1609	int index, nextindex, len;
1610	struct btstack btstack;	/* traverse stack */
1611	struct xtsplit split;	/* split information */
1612	xad_t *xad;
1613	s64 xaddr;
1614	struct tlock *tlck;
1615	struct xtlock *xtlck = 0;
1616	int rootsplit = 0;
1617
1618	jFYI(1,
1619	     ("xtExtend: nxoff:0x%lx nxlen:0x%x\n", (ulong) xoff, xlen));
1620
1621	/* there must exist extent to be extended */
1622	if ((rc = xtSearch(ip, xoff - 1, &cmp, &btstack, XT_INSERT)))
1623		return rc;
1624	assert(cmp == 0);
1625
1626	/* retrieve search result */
1627	XT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
1628
1629	/* extension must be contiguous */
1630	xad = &p->xad[index];
1631	jFYI(0, ("xtExtend: xoff:0x%lx xlen:0x%x xaddr:0x%lx\n",
1632		 (ulong) offsetXAD(xad), lengthXAD(xad),
1633		 (ulong) addressXAD(xad)));
1634	assert((offsetXAD(xad) + lengthXAD(xad)) == xoff);
1635
1636	/*
1637	 * acquire a transaction lock on the leaf page;
1638	 *
1639	 * action: xad insertion/extension;
1640	 */
1641	BT_MARK_DIRTY(mp, ip);
1642	if (!test_cflag(COMMIT_Nolink, ip)) {
1643		tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW);
1644		xtlck = (struct xtlock *) & tlck->lock;
1645	}
1646
1647	/* extend will overflow extent ? */
1648	xlen = lengthXAD(xad) + xlen;
1649	if ((len = xlen - MAXXLEN) <= 0)
1650		goto extendOld;
1651
1652	/*
1653	 *      extent overflow: insert entry for new extent
1654	 */
1655//insertNew:
1656	xoff = offsetXAD(xad) + MAXXLEN;
1657	xaddr = addressXAD(xad) + MAXXLEN;
1658	nextindex = le16_to_cpu(p->header.nextindex);
1659
1660	/*
1661	 *      if the leaf page is full, insert the new entry and
1662	 *      propagate up the router entry for the new page from split
1663	 *
1664	 * The xtSplitUp() will insert the entry and unpin the leaf page.
1665	 */
1666	if (nextindex == le16_to_cpu(p->header.maxentry)) {
1667		rootsplit = p->header.flag & BT_ROOT;
1668
1669		/* xtSpliUp() unpins leaf pages */
1670		split.mp = mp;
1671		split.index = index + 1;
1672		split.flag = XAD_NEW;
1673		split.off = xoff;	/* split offset */
1674		split.len = len;
1675		split.addr = xaddr;
1676		split.pxdlist = NULL;
1677		if ((rc = xtSplitUp(tid, ip, &split, &btstack)))
1678			return rc;
1679
1680		/*
1681		 * if leaf root has been split, original root has been
1682		 * copied to new child page, i.e., original entry now
1683		 * resides on the new child page;
1684		 */
1685		if (rootsplit) {
1686			if (p->header.nextindex ==
1687			    cpu_to_le16(XTENTRYSTART + 1)) {
1688				xad = &p->xad[XTENTRYSTART];
1689				bn = addressXAD(xad);
1690
1691				/* get new child page */
1692				XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
1693
1694				BT_MARK_DIRTY(mp, ip);
1695				if (!test_cflag(COMMIT_Nolink, ip)) {
1696					tlck = txLock(tid, ip, mp,
1697						      tlckXTREE |
1698						      tlckGROW);
1699					xtlck = (struct xtlock *) & tlck->lock;
1700				}
1701			}
1702		} else
1703			/* get back old page */
1704			XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
1705	}
1706	/*
1707	 *      insert the new entry into the leaf page
1708	 */
1709	else {
1710		/* insert the new entry: mark the entry NEW */
1711		xad = &p->xad[index + 1];
1712		XT_PUTENTRY(xad, XAD_NEW, xoff, len, xaddr);
1713
1714		/* advance next available entry index */
1715		p->header.nextindex =
1716		    cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1);
1717	}
1718
1719	/* get back old entry */
1720	xad = &p->xad[index];
1721	xlen = MAXXLEN;
1722
1723	/*
1724	 * extend old extent
1725	 */
1726      extendOld:
1727	XADlength(xad, xlen);
1728	if (!(xad->flag & XAD_NEW))
1729		xad->flag |= XAD_EXTENDED;
1730
1731	if (!test_cflag(COMMIT_Nolink, ip)) {
1732		xtlck->lwm.offset =
1733		    (xtlck->lwm.offset) ? min(index,
1734					      (int)xtlck->lwm.offset) : index;
1735		xtlck->lwm.length =
1736		    le16_to_cpu(p->header.nextindex) - xtlck->lwm.offset;
1737	}
1738
1739	/* unpin the leaf page */
1740	XT_PUTPAGE(mp);
1741
1742	return rc;
1743}
1744
1745
1746/*
1747 *      xtTailgate()
1748 *
1749 * function: split existing 'tail' extent
1750 *      (split offset >= start offset of tail extent), and
1751 *      relocate and extend the split tail half;
1752 *
1753 * note: existing extent may or may not have been committed.
1754 * caller is responsible for pager buffer cache update, and
1755 * working block allocation map update;
1756 * update pmap: free old split tail extent, alloc new extent;
1757 */
1758int xtTailgate(tid_t tid,		/* transaction id */
1759	       struct inode *ip, s64 xoff,	/* split/new extent offset */
1760	       s32 xlen,	/* new extent length */
1761	       s64 xaddr,	/* new extent address */
1762	       int flag)
1763{
1764	int rc = 0;
1765	int cmp;
1766	struct metapage *mp;	/* meta-page buffer */
1767	xtpage_t *p;		/* base B+-tree index page */
1768	s64 bn;
1769	int index, nextindex, llen, rlen;
1770	struct btstack btstack;	/* traverse stack */
1771	struct xtsplit split;	/* split information */
1772	xad_t *xad;
1773	struct tlock *tlck;
1774	struct xtlock *xtlck = 0;
1775	struct tlock *mtlck;
1776	struct maplock *pxdlock;
1777	int rootsplit = 0;
1778
1779/*
1780printf("xtTailgate: nxoff:0x%lx nxlen:0x%x nxaddr:0x%lx\n",
1781        (ulong)xoff, xlen, (ulong)xaddr);
1782*/
1783
1784	/* there must exist extent to be tailgated */
1785	if ((rc = xtSearch(ip, xoff, &cmp, &btstack, XT_INSERT)))
1786		return rc;
1787	assert(cmp == 0);
1788
1789	/* retrieve search result */
1790	XT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
1791
1792	/* entry found must be last entry */
1793	nextindex = le16_to_cpu(p->header.nextindex);
1794	assert(index == nextindex - 1);
1795
1796	BT_MARK_DIRTY(mp, ip);
1797	/*
1798	 * acquire tlock of the leaf page containing original entry
1799	 */
1800	if (!test_cflag(COMMIT_Nolink, ip)) {
1801		tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW);
1802		xtlck = (struct xtlock *) & tlck->lock;
1803	}
1804
1805	/* completely replace extent ? */
1806	xad = &p->xad[index];
1807/*
1808printf("xtTailgate: xoff:0x%lx xlen:0x%x xaddr:0x%lx\n",
1809        (ulong)offsetXAD(xad), lengthXAD(xad), (ulong)addressXAD(xad));
1810*/
1811	if ((llen = xoff - offsetXAD(xad)) == 0)
1812		goto updateOld;
1813
1814	/*
1815	 *      partially replace extent: insert entry for new extent
1816	 */
1817//insertNew:
1818	/*
1819	 *      if the leaf page is full, insert the new entry and
1820	 *      propagate up the router entry for the new page from split
1821	 *
1822	 * The xtSplitUp() will insert the entry and unpin the leaf page.
1823	 */
1824	if (nextindex == le16_to_cpu(p->header.maxentry)) {
1825		rootsplit = p->header.flag & BT_ROOT;
1826
1827		/* xtSpliUp() unpins leaf pages */
1828		split.mp = mp;
1829		split.index = index + 1;
1830		split.flag = XAD_NEW;
1831		split.off = xoff;	/* split offset */
1832		split.len = xlen;
1833		split.addr = xaddr;
1834		split.pxdlist = NULL;
1835		if ((rc = xtSplitUp(tid, ip, &split, &btstack)))
1836			return rc;
1837
1838		/*
1839		 * if leaf root has been split, original root has been
1840		 * copied to new child page, i.e., original entry now
1841		 * resides on the new child page;
1842		 */
1843		if (rootsplit) {
1844			if (p->header.nextindex ==
1845			    cpu_to_le16(XTENTRYSTART + 1)) {
1846				xad = &p->xad[XTENTRYSTART];
1847				bn = addressXAD(xad);
1848
1849				/* get new child page */
1850				XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
1851
1852				BT_MARK_DIRTY(mp, ip);
1853				if (!test_cflag(COMMIT_Nolink, ip)) {
1854					tlck = txLock(tid, ip, mp,
1855						      tlckXTREE |
1856						      tlckGROW);
1857					xtlck = (struct xtlock *) & tlck->lock;
1858				}
1859			}
1860		} else
1861			/* get back old page */
1862			XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
1863	}
1864	/*
1865	 *      insert the new entry into the leaf page
1866	 */
1867	else {
1868		/* insert the new entry: mark the entry NEW */
1869		xad = &p->xad[index + 1];
1870		XT_PUTENTRY(xad, XAD_NEW, xoff, xlen, xaddr);
1871
1872		/* advance next available entry index */
1873		p->header.nextindex =
1874		    cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1);
1875	}
1876
1877	/* get back old XAD */
1878	xad = &p->xad[index];
1879
1880	/*
1881	 * truncate/relocate old extent at split offset
1882	 */
1883      updateOld:
1884	/* update dmap for old/committed/truncated extent */
1885	rlen = lengthXAD(xad) - llen;
1886	if (!(xad->flag & XAD_NEW)) {
1887		/* free from PWMAP at commit */
1888		if (!test_cflag(COMMIT_Nolink, ip)) {
1889			mtlck = txMaplock(tid, ip, tlckMAP);
1890			pxdlock = (struct maplock *) & mtlck->lock;
1891			pxdlock->flag = mlckFREEPXD;
1892			PXDaddress(&pxdlock->pxd, addressXAD(xad) + llen);
1893			PXDlength(&pxdlock->pxd, rlen);
1894			pxdlock->index = 1;
1895		}
1896		jEVENT(0,
1897		       ("xtTailgate: free extent xaddr:0x%lx xlen:0x%x\n",
1898			(ulong) addressPXD(&pxdlock->pxd),
1899			lengthPXD(&pxdlock->pxd)));
1900	} else
1901		/* free from WMAP */
1902		dbFree(ip, addressXAD(xad) + llen, (s64) rlen);
1903
1904	if (llen)
1905		/* truncate */
1906		XADlength(xad, llen);
1907	else
1908		/* replace */
1909		XT_PUTENTRY(xad, XAD_NEW, xoff, xlen, xaddr);
1910
1911	if (!test_cflag(COMMIT_Nolink, ip)) {
1912		xtlck->lwm.offset = (xtlck->lwm.offset) ?
1913		    min(index, (int)xtlck->lwm.offset) : index;
1914		xtlck->lwm.length = le16_to_cpu(p->header.nextindex) -
1915		    xtlck->lwm.offset;
1916	}
1917
1918	/* unpin the leaf page */
1919	XT_PUTPAGE(mp);
1920
1921	return rc;
1922}
1923
1924
1925/*
1926 *      xtUpdate()
1927 *
1928 * function: update XAD;
1929 *
1930 *      update extent for allocated_but_not_recorded or
1931 *      compressed extent;
1932 *
1933 * parameter:
1934 *      nxad    - new XAD;
1935 *                logical extent of the specified XAD must be completely
1936 *                contained by an existing XAD;
1937 */
1938int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad)
1939{				/* new XAD */
1940	int rc = 0;
1941	int cmp;
1942	struct metapage *mp;	/* meta-page buffer */
1943	xtpage_t *p;		/* base B+-tree index page */
1944	s64 bn;
1945	int index0, index, newindex, nextindex;
1946	struct btstack btstack;	/* traverse stack */
1947	struct xtsplit split;	/* split information */
1948	xad_t *xad, *lxad, *rxad;
1949	int xflag;
1950	s64 nxoff, xoff;
1951	int nxlen, xlen, lxlen, rxlen;
1952	s64 nxaddr, xaddr;
1953	struct tlock *tlck;
1954	struct xtlock *xtlck = 0;
1955	int rootsplit = 0, newpage = 0;
1956
1957	/* there must exist extent to be tailgated */
1958	nxoff = offsetXAD(nxad);
1959	nxlen = lengthXAD(nxad);
1960	nxaddr = addressXAD(nxad);
1961/*
1962printf("xtUpdate: nxflag:0x%x nxoff:0x%lx nxlen:0x%x nxaddr:0x%lx\n",
1963        nxad->flag, (ulong)nxoff, nxlen, (ulong)nxaddr);
1964*/
1965	if ((rc = xtSearch(ip, nxoff, &cmp, &btstack, XT_INSERT)))
1966		return rc;
1967	assert(cmp == 0);
1968
1969	/* retrieve search result */
1970	XT_GETSEARCH(ip, btstack.top, bn, mp, p, index0);
1971
1972	BT_MARK_DIRTY(mp, ip);
1973	/*
1974	 * acquire tlock of the leaf page containing original entry
1975	 */
1976	if (!test_cflag(COMMIT_Nolink, ip)) {
1977		tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW);
1978		xtlck = (struct xtlock *) & tlck->lock;
1979	}
1980
1981	xad = &p->xad[index0];
1982	xflag = xad->flag;
1983	xoff = offsetXAD(xad);
1984	xlen = lengthXAD(xad);
1985	xaddr = addressXAD(xad);
1986/*
1987printf("xtUpdate: xflag:0x%x xoff:0x%lx xlen:0x%x xaddr:0x%lx\n",
1988        xflag, (ulong)xoff, xlen, (ulong)xaddr);
1989*/
1990
1991	/* nXAD must be completely contained within XAD */
1992	assert(xoff <= nxoff);
1993	assert(nxoff + nxlen <= xoff + xlen);
1994
1995	index = index0;
1996	newindex = index + 1;
1997	nextindex = le16_to_cpu(p->header.nextindex);
1998
1999#ifdef  _JFS_WIP_NOCOALESCE
2000	if (xoff < nxoff)
2001		goto updateRight;
2002
2003	/*
2004	 * replace XAD with nXAD
2005	 */
2006      replace:			/* (nxoff == xoff) */
2007	if (nxlen == xlen) {
2008		/* replace XAD with nXAD:recorded */
2009		*xad = *nxad;
2010		xad->flag = xflag & ~XAD_NOTRECORDED;
2011
2012		goto out;
2013	} else			/* (nxlen < xlen) */
2014		goto updateLeft;
2015#endif				/* _JFS_WIP_NOCOALESCE */
2016
2017/* #ifdef _JFS_WIP_COALESCE */
2018	if (xoff < nxoff)
2019		goto coalesceRight;
2020
2021	/*
2022	 * coalesce with left XAD
2023	 */
2024//coalesceLeft: /* (xoff == nxoff) */
2025	/* is XAD first entry of page ? */
2026	if (index == XTENTRYSTART)
2027		goto replace;
2028
2029	/* is nXAD logically and physically contiguous with lXAD ? */
2030	lxad = &p->xad[index - 1];
2031	lxlen = lengthXAD(lxad);
2032	if (!(lxad->flag & XAD_NOTRECORDED) &&
2033	    (nxoff == offsetXAD(lxad) + lxlen) &&
2034	    (nxaddr == addressXAD(lxad) + lxlen) &&
2035	    (lxlen + nxlen < MAXXLEN)) {
2036		/* extend right lXAD */
2037		index0 = index - 1;
2038		XADlength(lxad, lxlen + nxlen);
2039
2040		/* If we just merged two extents together, need to make sure the
2041		 * right extent gets logged.  If the left one is marked XAD_NEW,
2042		 * then we know it will be logged.  Otherwise, mark as
2043		 * XAD_EXTENDED
2044		 */
2045		if (!(lxad->flag & XAD_NEW))
2046			lxad->flag |= XAD_EXTENDED;
2047
2048		if (xlen > nxlen) {
2049			/* truncate XAD */
2050			XADoffset(xad, xoff + nxlen);
2051			XADlength(xad, xlen - nxlen);
2052			XADaddress(xad, xaddr + nxlen);
2053			goto out;
2054		} else {	/* (xlen == nxlen) */
2055
2056			/* remove XAD */
2057			if (index < nextindex - 1)
2058				memmove(&p->xad[index], &p->xad[index + 1],
2059					(nextindex - index -
2060					 1) << L2XTSLOTSIZE);
2061
2062			p->header.nextindex =
2063			    cpu_to_le16(le16_to_cpu(p->header.nextindex) -
2064					1);
2065
2066			index = index0;
2067			newindex = index + 1;
2068			nextindex = le16_to_cpu(p->header.nextindex);
2069			xoff = nxoff = offsetXAD(lxad);
2070			xlen = nxlen = lxlen + nxlen;
2071			xaddr = nxaddr = addressXAD(lxad);
2072			goto coalesceRight;
2073		}
2074	}
2075
2076	/*
2077	 * replace XAD with nXAD
2078	 */
2079      replace:			/* (nxoff == xoff) */
2080	if (nxlen == xlen) {
2081		/* replace XAD with nXAD:recorded */
2082		*xad = *nxad;
2083		xad->flag = xflag & ~XAD_NOTRECORDED;
2084
2085		goto coalesceRight;
2086	} else			/* (nxlen < xlen) */
2087		goto updateLeft;
2088
2089	/*
2090	 * coalesce with right XAD
2091	 */
2092      coalesceRight:		/* (xoff <= nxoff) */
2093	/* is XAD last entry of page ? */
2094	if (newindex == nextindex) {
2095		if (xoff == nxoff)
2096			goto out;
2097		goto updateRight;
2098	}
2099
2100	/* is nXAD logically and physically contiguous with rXAD ? */
2101	rxad = &p->xad[index + 1];
2102	rxlen = lengthXAD(rxad);
2103	if (!(rxad->flag & XAD_NOTRECORDED) &&
2104	    (nxoff + nxlen == offsetXAD(rxad)) &&
2105	    (nxaddr + nxlen == addressXAD(rxad)) &&
2106	    (rxlen + nxlen < MAXXLEN)) {
2107		/* extend left rXAD */
2108		XADoffset(rxad, nxoff);
2109		XADlength(rxad, rxlen + nxlen);
2110		XADaddress(rxad, nxaddr);
2111
2112		/* If we just merged two extents together, need to make sure
2113		 * the left extent gets logged.  If the right one is marked
2114		 * XAD_NEW, then we know it will be logged.  Otherwise, mark as
2115		 * XAD_EXTENDED
2116		 */
2117		if (!(rxad->flag & XAD_NEW))
2118			rxad->flag |= XAD_EXTENDED;
2119
2120		if (xlen > nxlen)
2121			/* truncate XAD */
2122			XADlength(xad, xlen - nxlen);
2123		else {		/* (xlen == nxlen) */
2124
2125			/* remove XAD */
2126			memmove(&p->xad[index], &p->xad[index + 1],
2127				(nextindex - index - 1) << L2XTSLOTSIZE);
2128
2129			p->header.nextindex =
2130			    cpu_to_le16(le16_to_cpu(p->header.nextindex) -
2131					1);
2132		}
2133
2134		goto out;
2135	} else if (xoff == nxoff)
2136		goto out;
2137
2138	assert(xoff < nxoff);
2139/* #endif _JFS_WIP_COALESCE */
2140
2141	/*
2142	 * split XAD into (lXAD, nXAD):
2143	 *
2144	 *          |---nXAD--->
2145	 * --|----------XAD----------|--
2146	 *   |-lXAD-|
2147	 */
2148      updateRight:		/* (xoff < nxoff) */
2149	/* truncate old XAD as lXAD:not_recorded */
2150	xad = &p->xad[index];
2151	XADlength(xad, nxoff - xoff);
2152
2153	/* insert nXAD:recorded */
2154	if (nextindex == le16_to_cpu(p->header.maxentry)) {
2155/*
2156printf("xtUpdate.updateRight.split p:0x%p\n", p);
2157*/
2158		rootsplit = p->header.flag & BT_ROOT;
2159
2160		/* xtSpliUp() unpins leaf pages */
2161		split.mp = mp;
2162		split.index = newindex;
2163		split.flag = xflag & ~XAD_NOTRECORDED;
2164		split.off = nxoff;
2165		split.len = nxlen;
2166		split.addr = nxaddr;
2167		split.pxdlist = NULL;
2168		if ((rc = xtSplitUp(tid, ip, &split, &btstack)))
2169			return rc;
2170
2171		/*
2172		 * if leaf root has been split, original root has been
2173		 * copied to new child page, i.e., original entry now
2174		 * resides on the new child page;
2175		 */
2176		if (rootsplit) {
2177			if (p->header.nextindex ==
2178			    cpu_to_le16(XTENTRYSTART + 1)) {
2179				xad = &p->xad[XTENTRYSTART];
2180				bn = addressXAD(xad);
2181
2182				/* get new child page */
2183				XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
2184
2185				BT_MARK_DIRTY(mp, ip);
2186				if (!test_cflag(COMMIT_Nolink, ip)) {
2187					tlck = txLock(tid, ip, mp,
2188						      tlckXTREE |
2189						      tlckGROW);
2190					xtlck = (struct xtlock *) & tlck->lock;
2191				}
2192			}
2193		} else {
2194			/* get back old page */
2195			XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
2196
2197			/* is nXAD on new page ? */
2198			if (newindex >
2199			    (le16_to_cpu(p->header.maxentry) >> 1)) {
2200				newindex =
2201				    newindex -
2202				    le16_to_cpu(p->header.nextindex) +
2203				    XTENTRYSTART;
2204				newpage = 1;
2205			}
2206		}
2207	} else {
2208		/* if insert into middle, shift right remaining entries */
2209		if (newindex < nextindex)
2210			memmove(&p->xad[newindex + 1], &p->xad[newindex],
2211				(nextindex - newindex) << L2XTSLOTSIZE);
2212
2213		/* insert the entry */
2214		xad = &p->xad[newindex];
2215		*xad = *nxad;
2216		xad->flag = xflag & ~XAD_NOTRECORDED;
2217
2218		/* advance next available entry index. */
2219		p->header.nextindex =
2220		    cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1);
2221	}
2222
2223	/*
2224	 * does nXAD force 3-way split ?
2225	 *
2226	 *          |---nXAD--->|
2227	 * --|----------XAD-------------|--
2228	 *   |-lXAD-|           |-rXAD -|
2229	 */
2230	if (nxoff + nxlen == xoff + xlen)
2231		goto out;
2232
2233	/* reorient nXAD as XAD for further split XAD into (nXAD, rXAD) */
2234	if (newpage) {
2235		/* close out old page */
2236		if (!test_cflag(COMMIT_Nolink, ip)) {
2237			xtlck->lwm.offset = (xtlck->lwm.offset) ?
2238			    min(index0, (int)xtlck->lwm.offset) : index0;
2239			xtlck->lwm.length =
2240			    le16_to_cpu(p->header.nextindex) -
2241			    xtlck->lwm.offset;
2242		}
2243
2244		bn = le64_to_cpu(p->header.next);
2245		XT_PUTPAGE(mp);
2246
2247		/* get new right page */
2248		XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
2249
2250		BT_MARK_DIRTY(mp, ip);
2251		if (!test_cflag(COMMIT_Nolink, ip)) {
2252			tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW);
2253			xtlck = (struct xtlock *) & tlck->lock;
2254		}
2255
2256		index0 = index = newindex;
2257	} else
2258		index++;
2259
2260	newindex = index + 1;
2261	nextindex = le16_to_cpu(p->header.nextindex);
2262	xlen = xlen - (nxoff - xoff);
2263	xoff = nxoff;
2264	xaddr = nxaddr;
2265
2266	/* recompute split pages */
2267	if (nextindex == le16_to_cpu(p->header.maxentry)) {
2268/*
2269printf("xtUpdate: updateRight+Left recompute split pages: p:0x%p\n", p);
2270*/
2271		XT_PUTPAGE(mp);
2272
2273		if ((rc = xtSearch(ip, nxoff, &cmp, &btstack, XT_INSERT)))
2274			return rc;
2275		assert(cmp == 0);
2276
2277		/* retrieve search result */
2278		XT_GETSEARCH(ip, btstack.top, bn, mp, p, index0);
2279		assert(index0 == index);
2280	}
2281
2282	/*
2283	 * split XAD into (nXAD, rXAD)
2284	 *
2285	 *          ---nXAD---|
2286	 * --|----------XAD----------|--
2287	 *                    |-rXAD-|
2288	 */
2289      updateLeft:		/* (nxoff == xoff) && (nxlen < xlen) */
2290	/* update old XAD with nXAD:recorded */
2291	xad = &p->xad[index];
2292	*xad = *nxad;
2293	xad->flag = xflag & ~XAD_NOTRECORDED;
2294
2295	/* insert rXAD:not_recorded */
2296	xoff = xoff + nxlen;
2297	xlen = xlen - nxlen;
2298	xaddr = xaddr + nxlen;
2299	if (nextindex == le16_to_cpu(p->header.maxentry)) {
2300		rootsplit = p->header.flag & BT_ROOT;
2301
2302/*
2303printf("xtUpdate.updateLeft.split p:0x%p\n", p);
2304*/
2305		/* xtSpliUp() unpins leaf pages */
2306		split.mp = mp;
2307		split.index = newindex;
2308		split.flag = xflag;
2309		split.off = xoff;
2310		split.len = xlen;
2311		split.addr = xaddr;
2312		split.pxdlist = NULL;
2313		if ((rc = xtSplitUp(tid, ip, &split, &btstack)))
2314			return rc;
2315
2316		/*
2317		 * if leaf root has been split, original root has been
2318		 * copied to new child page, i.e., original entry now
2319		 * resides on the new child page;
2320		 */
2321		if (rootsplit) {
2322			if (p->header.nextindex ==
2323			    cpu_to_le16(XTENTRYSTART + 1)) {
2324				xad = &p->xad[XTENTRYSTART];
2325				bn = addressXAD(xad);
2326
2327				/* get new child page */
2328				XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
2329
2330				BT_MARK_DIRTY(mp, ip);
2331				if (!test_cflag(COMMIT_Nolink, ip)) {
2332					tlck = txLock(tid, ip, mp,
2333						      tlckXTREE |
2334						      tlckGROW);
2335					xtlck = (struct xtlock *) & tlck->lock;
2336				}
2337			}
2338		} else
2339			/* get back old page */
2340			XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
2341	} else {
2342		/* if insert into middle, shift right remaining entries */
2343		if (newindex < nextindex)
2344			memmove(&p->xad[newindex + 1], &p->xad[newindex],
2345				(nextindex - newindex) << L2XTSLOTSIZE);
2346
2347		/* insert the entry */
2348		xad = &p->xad[newindex];
2349		XT_PUTENTRY(xad, xflag, xoff, xlen, xaddr);
2350
2351		/* advance next available entry index. */
2352		p->header.nextindex =
2353		    cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1);
2354	}
2355
2356      out:
2357	if (!test_cflag(COMMIT_Nolink, ip)) {
2358		xtlck->lwm.offset = (xtlck->lwm.offset) ?
2359		    min(index0, (int)xtlck->lwm.offset) : index0;
2360		xtlck->lwm.length = le16_to_cpu(p->header.nextindex) -
2361		    xtlck->lwm.offset;
2362	}
2363
2364	/* unpin the leaf page */
2365	XT_PUTPAGE(mp);
2366
2367	return rc;
2368}
2369
2370
2371/*
2372 *      xtAppend()
2373 *
2374 * function: grow in append mode from contiguous region specified ;
2375 *
2376 * parameter:
2377 *      tid             - transaction id;
2378 *      ip              - file object;
2379 *      xflag           - extent flag:
2380 *      xoff            - extent offset;
2381 *      maxblocks       - max extent length;
2382 *      xlen            - extent length (in/out);
2383 *      xaddrp          - extent address pointer (in/out):
2384 *      flag            -
2385 *
2386 * return:
2387 */
int xtAppend(tid_t tid,		/* transaction id */
	     struct inode *ip, int xflag, s64 xoff, s32 maxblocks,
	     s32 * xlenp,	/* (in/out) */
	     s64 * xaddrp,	/* (in/out) */
	     int flag)
{
	int rc = 0;
	struct metapage *mp;	/* meta-page buffer */
	xtpage_t *p;		/* base B+-tree index page */
	s64 bn, xaddr;
	int index, nextindex;
	struct btstack btstack;	/* traverse stack */
	struct xtsplit split;	/* split information */
	xad_t *xad;
	int cmp;
	struct tlock *tlck;
	struct xtlock *xtlck;
	int nsplit, nblocks, xlen;
	struct pxdlist pxdlist;
	pxd_t *pxd;

	xaddr = *xaddrp;
	xlen = *xlenp;
	jEVENT(0,
	       ("xtAppend: xoff:0x%lx maxblocks:%d xlen:%d xaddr:0x%lx\n",
		(ulong) xoff, maxblocks, xlen, (ulong) xaddr));

	/*
	 *      search for the entry location at which to insert:
	 *
	 * xtFastSearch() and xtSearch() both return (leaf page
	 * pinned, index at which to insert).
	 * n.b. xtSearch() may return index of maxentry of
	 * the full page.
	 */
	if ((rc = xtSearch(ip, xoff, &cmp, &btstack, XT_INSERT)))
		return rc;

	/* retrieve search result */
	XT_GETSEARCH(ip, btstack.top, bn, mp, p, index);

	/* an exact offset match means the extent already exists */
	if (cmp == 0) {
		rc = EEXIST;
		goto out;
	}
//insert:
	/*
	 *      insert entry for new extent
	 */
	xflag |= XAD_NEW;

	/*
	 *      if the leaf page is full, split the page and
	 *      propagate up the router entry for the new page from split
	 *
	 * The xtSplitUp() will insert the entry and unpin the leaf page.
	 */
	nextindex = le16_to_cpu(p->header.nextindex);
	if (nextindex < le16_to_cpu(p->header.maxentry))
		goto insertLeaf;

	/*
	 * allocate new index blocks to cover index page split(s)
	 *
	 * one page-sized pxd is carved off the front of the caller's
	 * contiguous region for each anticipated split level
	 * (btstack.nsplit), advancing xaddr/shrinking maxblocks so the
	 * data extent allocated below starts past the reserved blocks.
	 */
	nsplit = btstack.nsplit;
	split.pxdlist = &pxdlist;
	pxdlist.maxnpxd = pxdlist.npxd = 0;
	pxd = &pxdlist.pxd[0];
	nblocks = JFS_SBI(ip->i_sb)->nbperpage;
	for (; nsplit > 0; nsplit--, pxd++, xaddr += nblocks, maxblocks -= nblocks) {
		if ((rc = dbAllocBottomUp(ip, xaddr, (s64) nblocks)) == 0) {
			PXDaddress(pxd, xaddr);
			PXDlength(pxd, nblocks);

			pxdlist.maxnpxd++;

			continue;
		}

		/* undo allocation */
		/* NOTE(review): pxd blocks already allocated in earlier
		 * iterations are not freed here before bailing out --
		 * confirm whether the caller recovers them.
		 */

		goto out;
	}

	/* clip the data extent to what remains after reserving the
	 * index split blocks */
	xlen = min(xlen, maxblocks);

	/*
	 * allocate data extent requested
	 */
	if ((rc = dbAllocBottomUp(ip, xaddr, (s64) xlen)))
		goto out;

	split.mp = mp;
	split.index = index;
	split.flag = xflag;
	split.off = xoff;
	split.len = xlen;
	split.addr = xaddr;
	if ((rc = xtSplitUp(tid, ip, &split, &btstack))) {
		/* undo data extent allocation */
		/* NOTE(review): frees the original (pre-advance) address/
		 * length rather than the xaddr/xlen actually allocated
		 * above -- verify this matches the allocation when split
		 * pages were reserved.
		 */
		dbFree(ip, *xaddrp, (s64) * xlenp);

		return rc;
	}

	/* report the extent actually appended back to the caller */
	*xaddrp = xaddr;
	*xlenp = xlen;
	return 0;

	/*
	 *      insert the new entry into the leaf page
	 */
      insertLeaf:
	/*
	 * allocate data extent requested
	 */
	if ((rc = dbAllocBottomUp(ip, xaddr, (s64) xlen)))
		goto out;

	BT_MARK_DIRTY(mp, ip);
	/*
	 * acquire a transaction lock on the leaf page;
	 *
	 * action: xad insertion/extension;
	 */
	tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW);
	xtlck = (struct xtlock *) & tlck->lock;

	/* insert the new entry: mark the entry NEW */
	xad = &p->xad[index];
	XT_PUTENTRY(xad, xflag, xoff, xlen, xaddr);

	/* advance next available entry index */
	p->header.nextindex =
	    cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1);

	/* extend the log/write-map range to cover the new entry */
	xtlck->lwm.offset =
	    (xtlck->lwm.offset) ? min(index,(int) xtlck->lwm.offset) : index;
	xtlck->lwm.length = le16_to_cpu(p->header.nextindex) -
	    xtlck->lwm.offset;

	*xaddrp = xaddr;
	*xlenp = xlen;

      out:
	/* unpin the leaf page */
	XT_PUTPAGE(mp);

	return rc;
}
2538#ifdef _STILL_TO_PORT
2539
/* - TBD for defragmentation/reorganization -
2541 *
2542 *      xtDelete()
2543 *
2544 * function:
2545 *      delete the entry with the specified key.
2546 *
2547 *      N.B.: whole extent of the entry is assumed to be deleted.
2548 *
2549 * parameter:
2550 *
2551 * return:
2552 *       ENOENT: if the entry is not found.
2553 *
2554 * exception:
2555 */
2556int xtDelete(tid_t tid, struct inode *ip, s64 xoff, s32 xlen, int flag)
2557{
2558	int rc = 0;
2559	struct btstack btstack;
2560	int cmp;
2561	s64 bn;
2562	struct metapage *mp;
2563	xtpage_t *p;
2564	int index, nextindex;
2565	struct tlock *tlck;
2566	struct xtlock *xtlck;
2567
2568	/*
2569	 * find the matching entry; xtSearch() pins the page
2570	 */
2571	if ((rc = xtSearch(ip, xoff, &cmp, &btstack, 0)))
2572		return rc;
2573
2574	XT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
2575	if (cmp) {
2576		/* unpin the leaf page */
2577		XT_PUTPAGE(mp);
2578		return ENOENT;
2579	}
2580
2581	/*
2582	 * delete the entry from the leaf page
2583	 */
2584	nextindex = le16_to_cpu(p->header.nextindex);
2585	p->header.nextindex =
2586	    cpu_to_le16(le16_to_cpu(p->header.nextindex) - 1);
2587
2588	/*
2589	 * if the leaf page bocome empty, free the page
2590	 */
2591	if (p->header.nextindex == cpu_to_le16(XTENTRYSTART))
2592		return (xtDeleteUp(tid, ip, mp, p, &btstack));
2593
2594	BT_MARK_DIRTY(mp, ip);
2595	/*
2596	 * acquire a transaction lock on the leaf page;
2597	 *
2598	 * action:xad deletion;
2599	 */
2600	tlck = txLock(tid, ip, mp, tlckXTREE);
2601	xtlck = (struct xtlock *) & tlck->lock;
2602	xtlck->lwm.offset =
2603	    (xtlck->lwm.offset) ? min(index, xtlck->lwm.offset) : index;
2604
2605	/* if delete from middle, shift left/compact the remaining entries */
2606	if (index < nextindex - 1)
2607		memmove(&p->xad[index], &p->xad[index + 1],
2608			(nextindex - index - 1) * sizeof(xad_t));
2609
2610	XT_PUTPAGE(mp);
2611
2612	return 0;
2613}
2614
2615
/* - TBD for defragmentation/reorganization -
2617 *
2618 *      xtDeleteUp()
2619 *
2620 * function:
2621 *      free empty pages as propagating deletion up the tree
2622 *
2623 * parameter:
2624 *
2625 * return:
2626 */
2627static int
2628xtDeleteUp(tid_t tid, struct inode *ip,
2629	   struct metapage * fmp, xtpage_t * fp, struct btstack * btstack)
2630{
2631	int rc = 0;
2632	struct metapage *mp;
2633	xtpage_t *p;
2634	int index, nextindex;
2635	s64 xaddr;
2636	int xlen;
2637	struct btframe *parent;
2638	struct tlock *tlck;
2639	struct xtlock *xtlck;
2640
2641	/*
2642	 * keep root leaf page which has become empty
2643	 */
2644	if (fp->header.flag & BT_ROOT) {
2645		/* keep the root page */
2646		fp->header.flag &= ~BT_INTERNAL;
2647		fp->header.flag |= BT_LEAF;
2648		fp->header.nextindex = cpu_to_le16(XTENTRYSTART);
2649
2650		/* XT_PUTPAGE(fmp); */
2651
2652		return 0;
2653	}
2654
2655	/*
2656	 * free non-root leaf page
2657	 */
2658	if ((rc = xtRelink(tid, ip, fp)))
2659		return rc;
2660
2661	xaddr = addressPXD(&fp->header.self);
2662	xlen = lengthPXD(&fp->header.self);
2663	/* free the page extent */
2664	dbFree(ip, xaddr, (s64) xlen);
2665
2666	/* free the buffer page */
2667	discard_metapage(fmp);
2668
2669	/*
2670	 * propagate page deletion up the index tree
2671	 *
2672	 * If the delete from the parent page makes it empty,
2673	 * continue all the way up the tree.
2674	 * stop if the root page is reached (which is never deleted) or
2675	 * if the entry deletion does not empty the page.
2676	 */
2677	while ((parent = BT_POP(btstack)) != NULL) {
2678		/* get/pin the parent page <sp> */
2679		XT_GETPAGE(ip, parent->bn, mp, PSIZE, p, rc);
2680		if (rc)
2681			return rc;
2682
2683		index = parent->index;
2684
2685		/* delete the entry for the freed child page from parent.
2686		 */
2687		nextindex = le16_to_cpu(p->header.nextindex);
2688
2689		/*
2690		 * the parent has the single entry being deleted:
2691		 * free the parent page which has become empty.
2692		 */
2693		if (nextindex == 1) {
2694			if (p->header.flag & BT_ROOT) {
2695				/* keep the root page */
2696				p->header.flag &= ~BT_INTERNAL;
2697				p->header.flag |= BT_LEAF;
2698				p->header.nextindex =
2699				    cpu_to_le16(XTENTRYSTART);
2700
2701				/* XT_PUTPAGE(fmp); */
2702
2703				break;
2704			} else {
2705				/* free the parent page */
2706				if ((rc = xtRelink(tid, ip, p)))
2707					return rc;
2708
2709				xaddr = addressPXD(&p->header.self);
2710				/* free the page extent */
2711				dbFree(ip, xaddr,
2712				       (s64) JFS_SBI(ip->i_sb)->nbperpage);
2713
2714				/* unpin/free the buffer page */
2715				discard_metapage(fmp);
2716
2717				/* propagate up */
2718				continue;
2719			}
2720		}
2721		/*
2722		 * the parent has other entries remaining:
2723		 * delete the router entry from the parent page.
2724		 */
2725		else {
2726			BT_MARK_DIRTY(mp, ip);
2727			/*
2728			 * acquire a transaction lock on the leaf page;
2729			 *
2730			 * action:xad deletion;
2731			 */
2732			tlck = txLock(tid, ip, mp, tlckXTREE);
2733			xtlck = (struct xtlock *) & tlck->lock;
2734			xtlck->lwm.offset =
2735			    (xtlck->lwm.offset) ? min(index,
2736						      xtlck->lwm.
2737						      offset) : index;
2738
2739			/* if delete from middle,
2740			 * shift left/compact the remaining entries in the page
2741			 */
2742			if (index < nextindex - 1)
2743				memmove(&p->xad[index], &p->xad[index + 1],
2744					(nextindex - index -
2745					 1) << L2XTSLOTSIZE);
2746
2747			p->header.nextindex =
2748			    cpu_to_le16(le16_to_cpu(p->header.nextindex) -
2749					1);
2750			jEVENT(0,
2751			       ("xtDeleteUp(entry): 0x%lx[%d]\n",
2752				(ulong) parent->bn, index));
2753		}
2754
2755		/* unpin the parent page */
2756		XT_PUTPAGE(mp);
2757
2758		/* exit propagation up */
2759		break;
2760	}
2761
2762	return 0;
2763}
2764
2765
2766/*
2767 * NAME:        xtRelocate()
2768 *
2769 * FUNCTION:    relocate xtpage or data extent of regular file;
2770 *              This function is mainly used by defragfs utility.
2771 *
2772 * NOTE:        This routine does not have the logic to handle
2773 *              uncommitted allocated extent. The caller should call
2774 *              txCommit() to commit all the allocation before call
2775 *              this routine.
2776 */
2777xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad,	/* old XAD */
2778	   s64 nxaddr,		/* new xaddr */
2779	   int xtype)
2780{				/* extent type: XTPAGE or DATAEXT */
2781	int rc = 0;
2782	struct tblock *tblk;
2783	struct tlock *tlck;
2784	struct xtlock *xtlck;
2785	struct metapage *mp, *pmp, *lmp, *rmp;	/* meta-page buffer */
2786	xtpage_t *p, *pp, *rp, *lp;	/* base B+-tree index page */
2787	xad_t *xad;
2788	pxd_t *pxd;
2789	s64 xoff, xsize;
2790	int xlen;
2791	s64 oxaddr, sxaddr, dxaddr, nextbn, prevbn;
2792	cbuf_t *cp;
2793	s64 offset, nbytes, nbrd, pno;
2794	int nb, npages, nblks;
2795	s64 bn;
2796	int cmp;
2797	int index;
2798	struct pxd_lock *pxdlock;
2799	struct btstack btstack;	/* traverse stack */
2800
2801	xtype = xtype & EXTENT_TYPE;
2802
2803	xoff = offsetXAD(oxad);
2804	oxaddr = addressXAD(oxad);
2805	xlen = lengthXAD(oxad);
2806
2807	/* validate extent offset */
2808	offset = xoff << JFS_SBI(ip->i_sb)->l2bsize;
2809	if (offset >= ip->i_size)
2810		return ESTALE;	/* stale extent */
2811
2812	jEVENT(0,
2813	       ("xtRelocate: xtype:%d xoff:0x%lx xlen:0x%x xaddr:0x%lx:0x%lx\n",
2814		xtype, (ulong) xoff, xlen, (ulong) oxaddr,
2815		(ulong) nxaddr));
2816
2817	/*
2818	 *      1. get and validate the parent xtpage/xad entry
2819	 *      covering the source extent to be relocated;
2820	 */
2821	if (xtype == DATAEXT) {
2822		/* search in leaf entry */
2823		rc = xtSearch(ip, xoff, &cmp, &btstack, 0);
2824		if (rc)
2825			return rc;
2826		if (cmp) {
2827			XT_PUTPAGE(pmp);
2828			return ESTALE;
2829		}
2830
2831		/* retrieve search result */
2832		XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index);
2833
2834		/* validate for exact match with a single entry */
2835		xad = &pp->xad[index];
2836		if (addressXAD(xad) != oxaddr || lengthXAD(xad) != xlen) {
2837			XT_PUTPAGE(pmp);
2838			return ESTALE;
2839		}
2840	} else {		/* (xtype == XTPAGE) */
2841
2842		/* search in internal entry */
2843		rc = xtSearchNode(ip, oxad, &cmp, &btstack, 0);
2844		if (rc)
2845			return rc;
2846		if (cmp) {
2847			XT_PUTPAGE(pmp);
2848			return ESTALE;
2849		}
2850
2851		/* retrieve search result */
2852		XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index);
2853
2854		/* xtSearchNode() validated for exact match with a single entry
2855		 */
2856		xad = &pp->xad[index];
2857	}
2858	jEVENT(0, ("xtRelocate: parent xad entry validated.\n"));
2859
2860	/*
2861	 *      2. relocate the extent
2862	 */
2863	if (xtype == DATAEXT) {
2864		/* if the extent is allocated-but-not-recorded
2865		 * there is no real data to be moved in this extent,
2866		 */
2867		if (xad->flag & XAD_NOTRECORDED)
2868			goto out;
2869		else
2870			/* release xtpage for cmRead()/xtLookup() */
2871			XT_PUTPAGE(pmp);
2872
2873		/*
2874		 *      cmRelocate()
2875		 *
2876		 * copy target data pages to be relocated;
2877		 *
2878		 * data extent must start at page boundary and
2879		 * multiple of page size (except the last data extent);
2880		 * read in each page of the source data extent into cbuf,
2881		 * update the cbuf extent descriptor of the page to be
2882		 * homeward bound to new dst data extent
2883		 * copy the data from the old extent to new extent.
2884		 * copy is essential for compressed files to avoid problems
2885		 * that can arise if there was a change in compression
2886		 * algorithms.
2887		 * it is a good strategy because it may disrupt cache
2888		 * policy to keep the pages in memory afterwards.
2889		 */
2890		offset = xoff << JFS_SBI(ip->i_sb)->l2bsize;
2891		assert((offset & CM_OFFSET) == 0);
2892		nbytes = xlen << JFS_SBI(ip->i_sb)->l2bsize;
2893		pno = offset >> CM_L2BSIZE;
2894		npages = (nbytes + (CM_BSIZE - 1)) >> CM_L2BSIZE;
2895/*
2896                npages = ((offset + nbytes - 1) >> CM_L2BSIZE) -
2897                         (offset >> CM_L2BSIZE) + 1;
2898*/
2899		sxaddr = oxaddr;
2900		dxaddr = nxaddr;
2901
2902		/* process the request one cache buffer at a time */
2903		for (nbrd = 0; nbrd < nbytes; nbrd += nb,
2904		     offset += nb, pno++, npages--) {
2905			/* compute page size */
2906			nb = min(nbytes - nbrd, CM_BSIZE);
2907
2908			/* get the cache buffer of the page */
2909			if (rc = cmRead(ip, offset, npages, &cp))
2910				break;
2911
2912			assert(addressPXD(&cp->cm_pxd) == sxaddr);
2913			assert(!cp->cm_modified);
2914
2915			/* bind buffer with the new extent address */
2916			nblks = nb >> JFS_IP(ip->i_sb)->l2bsize;
2917			cmSetXD(ip, cp, pno, dxaddr, nblks);
2918
2919			/* release the cbuf, mark it as modified */
2920			cmPut(cp, TRUE);
2921
2922			dxaddr += nblks;
2923			sxaddr += nblks;
2924		}
2925
2926		/* get back parent page */
2927		rc = xtSearch(ip, xoff, &cmp, &btstack, 0);
2928		XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index);
2929		jEVENT(0, ("xtRelocate: target data extent relocated.\n"));
2930	} else {		/* (xtype  == XTPAGE) */
2931
2932		/*
2933		 * read in the target xtpage from the source extent;
2934		 */
2935		XT_GETPAGE(ip, oxaddr, mp, PSIZE, p, rc);
2936		if (rc) {
2937			XT_PUTPAGE(pmp);
2938			return rc;
2939		}
2940
2941		/*
2942		 * read in sibling pages if any to update sibling pointers;
2943		 */
2944		rmp = NULL;
2945		if (p->header.next) {
2946			nextbn = le64_to_cpu(p->header.next);
2947			XT_GETPAGE(ip, nextbn, rmp, PSIZE, rp, rc);
2948			if (rc) {
2949				XT_PUTPAGE(pmp);
2950				XT_PUTPAGE(mp);
2951				return (rc);
2952			}
2953		}
2954
2955		lmp = NULL;
2956		if (p->header.prev) {
2957			prevbn = le64_to_cpu(p->header.prev);
2958			XT_GETPAGE(ip, prevbn, lmp, PSIZE, lp, rc);
2959			if (rc) {
2960				XT_PUTPAGE(pmp);
2961				XT_PUTPAGE(mp);
2962				if (rmp)
2963					XT_PUTPAGE(rmp);
2964				return (rc);
2965			}
2966		}
2967
2968		/* at this point, all xtpages to be updated are in memory */
2969
2970		/*
2971		 * update sibling pointers of sibling xtpages if any;
2972		 */
2973		if (lmp) {
2974			BT_MARK_DIRTY(lmp, ip);
2975			tlck =
2976			    txLock(tid, ip, lmp, tlckXTREE | tlckRELINK);
2977			lp->header.next = cpu_to_le64(nxaddr);
2978			XT_PUTPAGE(lmp);
2979		}
2980
2981		if (rmp) {
2982			BT_MARK_DIRTY(rmp, ip);
2983			tlck =
2984			    txLock(tid, ip, rmp, tlckXTREE | tlckRELINK);
2985			rp->header.prev = cpu_to_le64(nxaddr);
2986			XT_PUTPAGE(rmp);
2987		}
2988
2989		/*
2990		 * update the target xtpage to be relocated
2991		 *
2992		 * update the self address of the target page
2993		 * and write to destination extent;
2994		 * redo image covers the whole xtpage since it is new page
2995		 * to the destination extent;
2996		 * update of bmap for the free of source extent
2997		 * of the target xtpage itself:
2998		 * update of bmap for the allocation of destination extent
2999		 * of the target xtpage itself:
3000		 * update of bmap for the extents covered by xad entries in
3001		 * the target xtpage is not necessary since they are not
3002		 * updated;
3003		 * if not committed before this relocation,
3004		 * target page may contain XAD_NEW entries which must
3005		 * be scanned for bmap update (logredo() always
3006		 * scan xtpage REDOPAGE image for bmap update);
3007		 * if committed before this relocation (tlckRELOCATE),
3008		 * scan may be skipped by commit() and logredo();
3009		 */
3010		BT_MARK_DIRTY(mp, ip);
3011		/* tlckNEW init  xtlck->lwm.offset = XTENTRYSTART; */
3012		tlck = txLock(tid, ip, mp, tlckXTREE | tlckNEW);
3013		xtlck = (struct xtlock *) & tlck->lock;
3014
3015		/* update the self address in the xtpage header */
3016		pxd = &p->header.self;
3017		PXDaddress(pxd, nxaddr);
3018
3019		/* linelock for the after image of the whole page */
3020		xtlck->lwm.length =
3021		    le16_to_cpu(p->header.nextindex) - xtlck->lwm.offset;
3022
3023		/* update the buffer extent descriptor of target xtpage */
3024		xsize = xlen << JFS_SBI(ip->i_sb)->l2bsize;
3025		bmSetXD(mp, nxaddr, xsize);
3026
3027		/* unpin the target page to new homeward bound */
3028		XT_PUTPAGE(mp);
3029		jEVENT(0, ("xtRelocate: target xtpage relocated.\n"));
3030	}
3031
3032	/*
3033	 *      3. acquire maplock for the source extent to be freed;
3034	 *
3035	 * acquire a maplock saving the src relocated extent address;
3036	 * to free of the extent at commit time;
3037	 */
3038      out:
3039	/* if DATAEXT relocation, write a LOG_UPDATEMAP record for
3040	 * free PXD of the source data extent (logredo() will update
3041	 * bmap for free of source data extent), and update bmap for
3042	 * free of the source data extent;
3043	 */
3044	if (xtype == DATAEXT)
3045		tlck = txMaplock(tid, ip, tlckMAP);
3046	/* if XTPAGE relocation, write a LOG_NOREDOPAGE record
3047	 * for the source xtpage (logredo() will init NoRedoPage
3048	 * filter and will also update bmap for free of the source
3049	 * xtpage), and update bmap for free of the source xtpage;
3050	 * N.B. We use tlckMAP instead of tlkcXTREE because there
3051	 *      is no buffer associated with this lock since the buffer
3052	 *      has been redirected to the target location.
3053	 */
3054	else			/* (xtype  == XTPAGE) */
3055		tlck = txMaplock(tid, ip, tlckMAP | tlckRELOCATE);
3056
3057	pxdlock = (struct pxd_lock *) & tlck->lock;
3058	pxdlock->flag = mlckFREEPXD;
3059	PXDaddress(&pxdlock->pxd, oxaddr);
3060	PXDlength(&pxdlock->pxd, xlen);
3061	pxdlock->index = 1;
3062
3063	/*
3064	 *      4. update the parent xad entry for relocation;
3065	 *
3066	 * acquire tlck for the parent entry with XAD_NEW as entry
3067	 * update which will write LOG_REDOPAGE and update bmap for
3068	 * allocation of XAD_NEW destination extent;
3069	 */
3070	jEVENT(0, ("xtRelocate: update parent xad entry.\n"));
3071	BT_MARK_DIRTY(pmp, ip);
3072	tlck = txLock(tid, ip, pmp, tlckXTREE | tlckGROW);
3073	xtlck = (struct xtlock *) & tlck->lock;
3074
3075	/* update the XAD with the new destination extent; */
3076	xad = &pp->xad[index];
3077	xad->flag |= XAD_NEW;
3078	XADaddress(xad, nxaddr);
3079
3080	xtlck->lwm.offset = min(index, xtlck->lwm.offset);
3081	xtlck->lwm.length = le16_to_cpu(pp->header.nextindex) -
3082	    xtlck->lwm.offset;
3083
3084	/* unpin the parent xtpage */
3085	XT_PUTPAGE(pmp);
3086
3087	return rc;
3088}
3089
3090
3091/*
3092 *      xtSearchNode()
3093 *
3094 * function:    search for the internal xad entry covering specified extent.
3095 *              This function is mainly used by defragfs utility.
3096 *
3097 * parameters:
3098 *      ip      - file object;
3099 *      xad     - extent to find;
3100 *      cmpp    - comparison result:
3101 *      btstack - traverse stack;
3102 *      flag    - search process flag;
3103 *
3104 * returns:
3105 *      btstack contains (bn, index) of search path traversed to the entry.
3106 *      *cmpp is set to result of comparison with the entry returned.
3107 *      the page containing the entry is pinned at exit.
3108 */
static int xtSearchNode(struct inode *ip, xad_t * xad,	/* required XAD entry */
			int *cmpp, struct btstack * btstack, int flag)
{
	int rc = 0;
	s64 xoff, xaddr;
	int xlen;
	int cmp = 1;		/* init for empty page */
	s64 bn;			/* block number */
	struct metapage *mp;	/* meta-page buffer */
	xtpage_t *p;		/* page */
	int base, index, lim;
	struct btframe *btsp;
	s64 t64;

	BT_CLR(btstack);

	xoff = offsetXAD(xad);
	xlen = lengthXAD(xad);
	xaddr = addressXAD(xad);

	/*
	 *      search down tree from root:
	 *
	 * between two consecutive entries of <Ki, Pi> and <Kj, Pj> of
	 * internal page, child page Pi contains entry with k, Ki <= K < Kj.
	 *
	 * if entry with search key K is not found
	 * internal page search find the entry with largest key Ki
	 * less than K which point to the child page to search;
	 * leaf page search find the entry with smallest key Kj
	 * greater than K so that the returned index is the position of
	 * the entry to be shifted right for insertion of new entry.
	 * for empty tree, search key is greater than any key of the tree.
	 *
	 * by convention, root bn = 0.
	 */
	for (bn = 0;;) {
		/* get/pin the page to search */
		XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
		if (rc)
			return rc;
		/* this routine only searches for INTERNAL entries; if
		 * the descent reaches a leaf the required internal xad
		 * does not exist.  NOTE(review): the page appears to
		 * stay pinned on this path -- confirm against callers.
		 */
		if (p->header.flag & BT_LEAF)
			return ESTALE;

		lim = le16_to_cpu(p->header.nextindex) - XTENTRYSTART;

		/*
		 * binary search with search key K on the current page
		 *
		 * invariant: the candidate range is [base, base+lim);
		 * lim halves every iteration via the loop's lim >>= 1.
		 */
		for (base = XTENTRYSTART; lim; lim >>= 1) {
			/* probe the middle of the remaining range */
			index = base + (lim >> 1);

			XT_CMP(cmp, xoff, &p->xad[index], t64);
			if (cmp == 0) {
				/*
				 *      search hit
				 *
				 * verify for exact match;
				 */
				if (xaddr == addressXAD(&p->xad[index]) &&
				    xoff == offsetXAD(&p->xad[index])) {
					*cmpp = cmp;

					/* save search result */
					btsp = btstack->top;
					btsp->bn = bn;
					btsp->index = index;
					btsp->mp = mp;

					return 0;
				}

				/* descend/search its child page */
				goto next;
			}

			/* K lies beyond this entry: move base past the
			 * probe and drop it from the remaining count;
			 * (cmp < 0 keeps base and just halves lim).
			 */
			if (cmp > 0) {
				base = index + 1;
				--lim;
			}
		}

		/*
		 *      search miss - non-leaf page:
		 *
		 * base is the smallest index with key (Kj) greater than
		 * search key (K) and may be zero or maxentry index.
		 * if base is non-zero, decrement base by one to get the parent
		 * entry of the child page to search.
		 */
		index = base ? base - 1 : base;

		/*
		 * go down to child page
		 */
	      next:
		/* get the child page block number */
		bn = addressXAD(&p->xad[index]);

		/* unpin the parent page */
		XT_PUTPAGE(mp);
	}
}
3212
3213
3214/*
3215 *      xtRelink()
3216 *
3217 * function:
3218 *      link around a freed page.
3219 *
3220 * Parameter:
3221 *      int           tid,
3222 *      struct inode    *ip,
3223 *      xtpage_t        *p)
3224 *
3225 * returns:
3226 */
3227static int xtRelink(tid_t tid, struct inode *ip, xtpage_t * p)
3228{
3229	int rc = 0;
3230	struct metapage *mp;
3231	s64 nextbn, prevbn;
3232	struct tlock *tlck;
3233
3234	nextbn = le64_to_cpu(p->header.next);
3235	prevbn = le64_to_cpu(p->header.prev);
3236
3237	/* update prev pointer of the next page */
3238	if (nextbn != 0) {
3239		XT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc);
3240		if (rc)
3241			return rc;
3242
3243		/*
3244		 * acquire a transaction lock on the page;
3245		 *
3246		 * action: update prev pointer;
3247		 */
3248		BT_MARK_DIRTY(mp, ip);
3249		tlck = txLock(tid, ip, mp, tlckXTREE | tlckRELINK);
3250
3251		/* the page may already have been tlock'd */
3252
3253		p->header.prev = cpu_to_le64(prevbn);
3254
3255		XT_PUTPAGE(mp);
3256	}
3257
3258	/* update next pointer of the previous page */
3259	if (prevbn != 0) {
3260		XT_GETPAGE(ip, prevbn, mp, PSIZE, p, rc);
3261		if (rc)
3262			return rc;
3263
3264		/*
3265		 * acquire a transaction lock on the page;
3266		 *
3267		 * action: update next pointer;
3268		 */
3269		BT_MARK_DIRTY(mp, ip);
3270		tlck = txLock(tid, ip, mp, tlckXTREE | tlckRELINK);
3271
3272		/* the page may already have been tlock'd */
3273
3274		p->header.next = le64_to_cpu(nextbn);
3275
3276		XT_PUTPAGE(mp);
3277	}
3278
3279	return 0;
3280}
3281#endif				/*  _STILL_TO_PORT */
3282
3283
3284/*
3285 *      xtInitRoot()
3286 *
3287 * initialize file root (inline in inode)
3288 */
3289void xtInitRoot(tid_t tid, struct inode *ip)
3290{
3291	xtpage_t *p;
3292	struct tlock *tlck;
3293
3294	/*
3295	 * acquire a transaction lock on the root
3296	 *
3297	 * action:
3298	 */
3299	tlck = txLock(tid, ip, (struct metapage *) &JFS_IP(ip)->bxflag,
3300		      tlckXTREE | tlckNEW);
3301	p = &JFS_IP(ip)->i_xtroot;
3302
3303	p->header.flag = DXD_INDEX | BT_ROOT | BT_LEAF;
3304	p->header.nextindex = cpu_to_le16(XTENTRYSTART);
3305
3306	if (S_ISDIR(ip->i_mode))
3307		p->header.maxentry = cpu_to_le16(XTROOTINITSLOT_DIR);
3308	else {
3309		p->header.maxentry = cpu_to_le16(XTROOTINITSLOT);
3310		ip->i_size = 0;
3311	}
3312
3313
3314	return;
3315}
3316
3317
3318/*
3319 * We can run into a deadlock truncating a file with a large number of
3320 * xtree pages (large fragmented file).  A robust fix would entail a
3321 * reservation system where we would reserve a number of metadata pages
3322 * and tlocks which we would be guaranteed without a deadlock.  Without
3323 * this, a partial fix is to limit number of metadata pages we will lock
3324 * in a single transaction.  Currently we will truncate the file so that
3325 * no more than 50 leaf pages will be locked.  The caller of xtTruncate
3326 * will be responsible for ensuring that the current transaction gets
3327 * committed, and that subsequent transactions are created to truncate
3328 * the file further if needed.
3329 */
3330#define MAX_TRUNCATE_LEAVES 50
3331
3332/*
3333 *      xtTruncate()
3334 *
3335 * function:
3336 *      traverse for truncation logging backward bottom up;
3337 *      terminate at the last extent entry at the current subtree
3338 *      root page covering new down size.
3339 *      truncation may occur within the last extent entry.
3340 *
3341 * parameter:
3342 *      int           tid,
3343 *      struct inode    *ip,
3344 *      s64           newsize,
3345 *      int           type)   {PWMAP, PMAP, WMAP; DELETE, TRUNCATE}
3346 *
3347 * return:
3348 *
3349 * note:
3350 *      PWMAP:
3351 *       1. truncate (non-COMMIT_NOLINK file)
3352 *          by jfs_truncate() or jfs_open(O_TRUNC):
3353 *          xtree is updated;
3354 *	 2. truncate index table of directory when last entry removed
3355 *       map update via tlock at commit time;
3356 *      PMAP:
3357 *	 Call xtTruncate_pmap instead
3358 *      WMAP:
3359 *       1. remove (free zero link count) on last reference release
3360 *          (pmap has been freed at commit zero link count);
3361 *       2. truncate (COMMIT_NOLINK file, i.e., tmp file):
3362 *          xtree is updated;
3363 *       map update directly at truncation time;
3364 *
3365 *      if (DELETE)
3366 *              no LOG_NOREDOPAGE is required (NOREDOFILE is sufficient);
3367 *      else if (TRUNCATE)
3368 *              must write LOG_NOREDOPAGE for deleted index page;
3369 *
3370 * pages may already have been tlocked by anonymous transactions
3371 * during file growth (i.e., write) before truncation;
3372 *
3373 * except last truncated entry, deleted entries remains as is
3374 * in the page (nextindex is updated) for other use
3375 * (e.g., log/update allocation map): this avoid copying the page
3376 * info but delay free of pages;
3377 *
3378 */
s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag)
{
	int rc = 0;
	s64 teof;		/* truncation point, in file-system blocks */
	struct metapage *mp;
	xtpage_t *p;
	s64 bn;
	int index, nextindex;
	xad_t *xad;
	s64 xoff, xaddr;
	int xlen, len, freexlen;
	struct btstack btstack;
	struct btframe *parent;
	struct tblock *tblk = 0;
	struct tlock *tlck = 0;
	struct xtlock *xtlck = 0;
	struct xdlistlock xadlock;	/* maplock for COMMIT_WMAP */
	struct pxd_lock *pxdlock;		/* maplock for COMMIT_WMAP */
	s64 nfreed;		/* running count of freed fs blocks */
	int freed, log;		/* freed: current child page emptied; log: PWMAP */
	int locked_leaves = 0;	/* leaves tlocked so far; bounded per transaction */

	/* save object truncation type */
	if (tid) {
		tblk = tid_to_tblock(tid);
		tblk->xflag |= flag;
	}

	nfreed = 0;

	flag &= COMMIT_MAP;
	/* pmap-only truncation is handled by xtTruncate_pmap() instead */
	assert(flag != COMMIT_PMAP);

	if (flag == COMMIT_PWMAP)
		log = 1;
	else {
		log = 0;
		xadlock.flag = mlckFREEXADLIST;
		xadlock.index = 1;
	}

	/*
	 * if the newsize is not an integral number of pages,
	 * the file between newsize and next page boundary will
	 * be cleared.
	 * if truncating into a file hole, it will cause
	 * a full block to be allocated for the logical block.
	 */

	/*
	 * release page blocks of truncated region <teof, eof>
	 *
	 * free the data blocks from the leaf index blocks.
	 * delete the parent index entries corresponding to
	 * the freed child data/index blocks.
	 * free the index blocks themselves which aren't needed
	 * in new sized file.
	 *
	 * index blocks are updated only if the blocks are to be
	 * retained in the new sized file.
	 * if type is PMAP, the data and index pages are NOT
	 * freed, and the data and index blocks are NOT freed
	 * from  working map.
	 * (this will allow continued access of data/index of
	 * temporary file (zerolink count file truncated to zero-length)).
	 */
	/* round newsize up to an fs-block boundary and convert to blocks */
	teof = (newsize + (JFS_SBI(ip->i_sb)->bsize - 1)) >>
	    JFS_SBI(ip->i_sb)->l2bsize;

	/* clear stack */
	BT_CLR(&btstack);

	/*
	 * start with root
	 *
	 * root resides in the inode
	 */
	bn = 0;

	/*
	 * first access of each page:
	 */
      getPage:
	XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
	if (rc)
		/* NOTE(review): assumes XT_GETPAGE reports a positive
		 * error code in rc -- confirm against BT_GETPAGE */
		return -rc;

	/* process entries backward from last index */
	index = le16_to_cpu(p->header.nextindex) - 1;

	if (p->header.flag & BT_INTERNAL)
		goto getChild;

	/*
	 *      leaf page
	 */

	/* Since this is the rightmost leaf, and we may have already freed
	 * a page that was formerly to the right, let's make sure that the
	 * next pointer is zero.
	 */
	p->header.next = 0;

	freed = 0;

	/* does region covered by leaf page precede Teof ? */
	xad = &p->xad[index];
	xoff = offsetXAD(xad);
	xlen = lengthXAD(xad);
	if (teof >= xoff + xlen) {
		/* entire leaf is below the truncation point: keep it */
		XT_PUTPAGE(mp);
		goto getParent;
	}

	/* (re)acquire tlock of the leaf page */
	if (log) {
		if (++locked_leaves > MAX_TRUNCATE_LEAVES) {
			/*
			 * We need to limit the size of the transaction
			 * to avoid exhausting pagecache & tlocks
			 */
			XT_PUTPAGE(mp);
			/* report the partial new size so the caller can
			 * commit and call again to continue truncating */
			newsize = (xoff + xlen) << JFS_SBI(ip->i_sb)->l2bsize;
			goto getParent;
		}
		tlck = txLock(tid, ip, mp, tlckXTREE);
		tlck->type = tlckXTREE | tlckTRUNCATE;
		xtlck = (struct xtlock *) & tlck->lock;
		xtlck->hwm.offset = le16_to_cpu(p->header.nextindex) - 1;
	}
	BT_MARK_DIRTY(mp, ip);

	/*
	 * scan backward leaf page entries
	 */
	for (; index >= XTENTRYSTART; index--) {
		xad = &p->xad[index];
		xoff = offsetXAD(xad);
		xlen = lengthXAD(xad);
		xaddr = addressXAD(xad);

		/*
		 * entry beyond eof: continue scan of current page
		 *          xad
		 * ---|---=======------->
		 *   eof
		 */
		if (teof < xoff) {
			nfreed += xlen;
			continue;
		}

		/*
		 * (xoff <= teof): last entry to be deleted from page;
		 * If other entries remain in page: keep and update the page.
		 */

		/*
		 * eof == entry_start: delete the entry
		 *           xad
		 * -------|=======------->
		 *       eof
		 *
		 */
		if (teof == xoff) {
			nfreed += xlen;

			/* no entries would remain: free the whole leaf */
			if (index == XTENTRYSTART)
				break;

			/* this entry becomes the first one truncated away */
			nextindex = index;
		}
		/*
		 * eof within the entry: truncate the entry.
		 *          xad
		 * -------===|===------->
		 *          eof
		 */
		else if (teof < xoff + xlen) {
			/* update truncated entry */
			len = teof - xoff;
			freexlen = xlen - len;
			XADlength(xad, len);

			/* save pxd of truncated extent in tlck */
			xaddr += len;
			if (log) {	/* COMMIT_PWMAP */
				xtlck->lwm.offset = (xtlck->lwm.offset) ?
				    min(index, (int)xtlck->lwm.offset) : index;
				xtlck->lwm.length = index + 1 -
				    xtlck->lwm.offset;
				xtlck->twm.offset = index;
				pxdlock = (struct pxd_lock *) & xtlck->pxdlock;
				pxdlock->flag = mlckFREEPXD;
				PXDaddress(&pxdlock->pxd, xaddr);
				PXDlength(&pxdlock->pxd, freexlen);
			}
			/* free truncated extent */
			else {	/* COMMIT_WMAP */

				pxdlock = (struct pxd_lock *) & xadlock;
				pxdlock->flag = mlckFREEPXD;
				PXDaddress(&pxdlock->pxd, xaddr);
				PXDlength(&pxdlock->pxd, freexlen);
				txFreeMap(ip, pxdlock, 0, COMMIT_WMAP);

				/* reset map lock */
				xadlock.flag = mlckFREEXADLIST;
			}

			/* current entry is new last entry; */
			nextindex = index + 1;

			nfreed += freexlen;
		}
		/*
		 * eof beyond the entry:
		 *          xad
		 * -------=======---|--->
		 *                 eof
		 */
		else {		/* (xoff + xlen < teof) */

			nextindex = index + 1;
		}

		/* trailing entries [nextindex..) are being deleted */
		if (nextindex < le16_to_cpu(p->header.nextindex)) {
			if (!log) {	/* COMMIT_WMAP */
				xadlock.xdlist = &p->xad[nextindex];
				xadlock.count =
				    le16_to_cpu(p->header.nextindex) -
				    nextindex;
				txFreeMap(ip, (struct maplock *) & xadlock, 0,
					  COMMIT_WMAP);
			}
			p->header.nextindex = cpu_to_le16(nextindex);
		}

		XT_PUTPAGE(mp);

		/* assert(freed == 0); */
		goto getParent;
	}			/* end scan of leaf page entries */

	freed = 1;

	/*
	 * leaf page become empty: free the page if type != PMAP
	 */
	if (log) {		/* COMMIT_PWMAP */
		/* txCommit() with tlckFREE:
		 * free data extents covered by leaf [XTENTRYSTART:hwm);
		 * invalidate leaf if COMMIT_PWMAP;
		 * if (TRUNCATE), will write LOG_NOREDOPAGE;
		 */
		tlck->type = tlckXTREE | tlckFREE;
	} else {		/* COMMIT_WMAP */

		/* free data extents covered by leaf */
		xadlock.xdlist = &p->xad[XTENTRYSTART];
		xadlock.count =
		    le16_to_cpu(p->header.nextindex) - XTENTRYSTART;
		txFreeMap(ip, (struct maplock *) & xadlock, 0, COMMIT_WMAP);
	}

	if (p->header.flag & BT_ROOT) {
		/* empty root: tree collapses back to an empty in-inode leaf */
		p->header.flag &= ~BT_INTERNAL;
		p->header.flag |= BT_LEAF;
		p->header.nextindex = cpu_to_le16(XTENTRYSTART);

		XT_PUTPAGE(mp);	/* debug */
		goto out;
	} else {
		if (log) {	/* COMMIT_PWMAP */
			/* page will be invalidated at tx completion
			 */
			XT_PUTPAGE(mp);
		} else {	/* COMMIT_WMAP */

			if (mp->lid)
				lid_to_tlock(mp->lid)->flag |= tlckFREELOCK;

			/* invalidate empty leaf page */
			discard_metapage(mp);
		}
	}

	/*
	 * the leaf page become empty: delete the parent entry
	 * for the leaf page if the parent page is to be kept
	 * in the new sized file.
	 */

	/*
	 * go back up to the parent page
	 */
      getParent:
	/* pop/restore parent entry for the current child page */
	if ((parent = BT_POP(&btstack)) == NULL)
		/* current page must have been root */
		goto out;

	/* get back the parent page */
	bn = parent->bn;
	XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
	if (rc)
		return -rc;

	index = parent->index;

	/*
	 * child page was not empty:
	 */
	if (freed == 0) {
		/* has any entry deleted from parent ? */
		if (index < le16_to_cpu(p->header.nextindex) - 1) {
			/* (re)acquire tlock on the parent page */
			if (log) {	/* COMMIT_PWMAP */
				/* txCommit() with tlckTRUNCATE:
				 * free child extents covered by parent [);
				 */
				tlck = txLock(tid, ip, mp, tlckXTREE);
				xtlck = (struct xtlock *) & tlck->lock;
				if (!(tlck->type & tlckTRUNCATE)) {
					xtlck->hwm.offset =
					    le16_to_cpu(p->header.
							nextindex) - 1;
					tlck->type =
					    tlckXTREE | tlckTRUNCATE;
				}
			} else {	/* COMMIT_WMAP */

				/* free child extents covered by parent */
				xadlock.xdlist = &p->xad[index + 1];
				xadlock.count =
				    le16_to_cpu(p->header.nextindex) -
				    index - 1;
				txFreeMap(ip, (struct maplock *) & xadlock, 0,
					  COMMIT_WMAP);
			}
			BT_MARK_DIRTY(mp, ip);

			/* current child entry becomes the last kept one */
			p->header.nextindex = cpu_to_le16(index + 1);
		}
		XT_PUTPAGE(mp);
		goto getParent;
	}

	/*
	 * child page was empty:
	 */
	nfreed += lengthXAD(&p->xad[index]);

	/*
	 * During working map update, child page's tlock must be handled
	 * before parent's.  This is because the parent's tlock will cause
	 * the child's disk space to be marked available in the wmap, so
	 * it's important that the child page be released by that time.
	 *
	 * ToDo:  tlocks should be on doubly-linked list, so we can
	 * quickly remove it and add it to the end.
	 */

	/*
	 * Move parent page's tlock to the end of the tid's tlock list
	 */
	if (log && mp->lid && (tblk->last != mp->lid) &&
	    lid_to_tlock(mp->lid)->tid) {
		lid_t lid = mp->lid;
		struct tlock *prev;

		tlck = lid_to_tlock(lid);

		/* unlink the tlock from the singly-linked tblk list ... */
		if (tblk->next == lid)
			tblk->next = tlck->next;
		else {
			for (prev = lid_to_tlock(tblk->next);
			     prev->next != lid;
			     prev = lid_to_tlock(prev->next)) {
				assert(prev->next);
			}
			prev->next = tlck->next;
		}
		/* ... and re-append it at the tail */
		lid_to_tlock(tblk->last)->next = lid;
		tlck->next = 0;
		tblk->last = lid;
	}

	/*
	 * parent page become empty: free the page
	 */
	if (index == XTENTRYSTART) {
		if (log) {	/* COMMIT_PWMAP */
			/* txCommit() with tlckFREE:
			 * free child extents covered by parent;
			 * invalidate parent if COMMIT_PWMAP;
			 */
			tlck = txLock(tid, ip, mp, tlckXTREE);
			xtlck = (struct xtlock *) & tlck->lock;
			xtlck->hwm.offset =
			    le16_to_cpu(p->header.nextindex) - 1;
			tlck->type = tlckXTREE | tlckFREE;
		} else {	/* COMMIT_WMAP */

			/* free child extents covered by parent */
			xadlock.xdlist = &p->xad[XTENTRYSTART];
			xadlock.count =
			    le16_to_cpu(p->header.nextindex) -
			    XTENTRYSTART;
			txFreeMap(ip, (struct maplock *) & xadlock, 0,
				  COMMIT_WMAP);
		}
		BT_MARK_DIRTY(mp, ip);

		if (p->header.flag & BT_ROOT) {
			p->header.flag &= ~BT_INTERNAL;
			p->header.flag |= BT_LEAF;
			p->header.nextindex = cpu_to_le16(XTENTRYSTART);
			if (le16_to_cpu(p->header.maxentry) == XTROOTMAXSLOT) {
				/*
				 * Shrink root down to allow inline
				 * EA (otherwise fsck complains)
				 */
				p->header.maxentry =
				    cpu_to_le16(XTROOTINITSLOT);
				JFS_IP(ip)->mode2 |= INLINEEA;
			}

			XT_PUTPAGE(mp);	/* debug */
			goto out;
		} else {
			if (log) {	/* COMMIT_PWMAP */
				/* page will be invalidated at tx completion
				 */
				XT_PUTPAGE(mp);
			} else {	/* COMMIT_WMAP */

				if (mp->lid)
					lid_to_tlock(mp->lid)->flag |=
						tlckFREELOCK;

				/* invalidate parent page */
				discard_metapage(mp);
			}

			/* parent has become empty and freed:
			 * go back up to its parent page
			 */
			/* freed = 1; */
			goto getParent;
		}
	}
	/*
	 * parent page still has entries for front region;
	 */
	else {
		/* try truncate region covered by preceding entry
		 * (process backward)
		 */
		index--;

		/* go back down to the child page corresponding
		 * to the entry
		 */
		goto getChild;
	}

	/*
	 *      internal page: go down to child page of current entry
	 */
      getChild:
	/* save current parent entry for the child page */
	BT_PUSH(&btstack, bn, index);

	/* get child page */
	xad = &p->xad[index];
	bn = addressXAD(xad);

	/*
	 * first access of each internal entry:
	 */
	/* release parent page */
	XT_PUTPAGE(mp);

	/* process the child page */
	goto getPage;

      out:
	/*
	 * update file resource stat
	 */
	/* set size
	 */
	if (S_ISDIR(ip->i_mode) && !newsize)
		ip->i_size = 1;	/* fsck hates zero-length directories */
	else
		ip->i_size = newsize;

	/* update nblocks to reflect freed blocks */
	ip->i_blocks -= LBLK2PBLK(ip->i_sb, nfreed);

	/*
	 * free tlock of invalidated pages
	 */
	if (flag == COMMIT_WMAP)
		txFreelock(ip);

	return newsize;
}
3888
3889
3890/*
3891 *      xtTruncate_pmap()
3892 *
3893 * function:
 *	Perform truncate to zero length for deleted file, leaving the
 *	xtree and working map untouched.  This allows the file to
3896 *	be accessed via open file handles, while the delete of the file
3897 *	is committed to disk.
3898 *
3899 * parameter:
3900 *      tid_t		tid,
3901 *      struct inode	*ip,
3902 *      s64		committed_size)
3903 *
3904 * return: new committed size
3905 *
3906 * note:
3907 *
3908 *	To avoid deadlock by holding too many transaction locks, the
3909 *	truncation may be broken up into multiple transactions.
 *	The committed_size keeps track of how much of the file has been
 *	freed from the pmaps.
3912 */
s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size)
{
	s64 bn;
	struct btstack btstack;
	int cmp;
	int index;
	int locked_leaves = 0;	/* leaves tlocked so far; bounded per tx */
	struct metapage *mp;
	xtpage_t *p;
	struct btframe *parent;
	int rc;
	struct tblock *tblk;
	struct tlock *tlck = 0;
	xad_t *xad;
	int xlen;
	s64 xoff;
	struct xtlock *xtlck = 0;

	/* save object truncation type */
	tblk = tid_to_tblock(tid);
	tblk->xflag |= COMMIT_PMAP;

	/* clear stack */
	BT_CLR(&btstack);

	if (committed_size) {
		/* resume a truncation that was split into multiple
		 * transactions: search for the last block already done */
		xoff = (committed_size >> JFS_SBI(ip->i_sb)->l2bsize) - 1;
		rc = xtSearch(ip, xoff, &cmp, &btstack, 0);
		if (rc)
			/* NOTE(review): assumes xtSearch reports a positive
			 * error code -- confirm */
			return -rc;
		/* xoff must lie inside an existing extent */
		assert(cmp == 0);
		XT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
	} else {
		/*
		 * start with root
		 *
		 * root resides in the inode
		 */
		bn = 0;

		/*
		 * first access of each page:
		 */
      getPage:
		XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
		if (rc)
			return -rc;

		/* process entries backward from last index */
		index = le16_to_cpu(p->header.nextindex) - 1;

		if (p->header.flag & BT_INTERNAL)
			goto getChild;
	}

	/*
	 *      leaf page
	 */

	if (++locked_leaves > MAX_TRUNCATE_LEAVES) {
		/*
		 * We need to limit the size of the transaction
		 * to avoid exhausting pagecache & tlocks
		 */
		/* return the partial committed size so the caller can
		 * commit and call again to continue from here */
		xad = &p->xad[index];
		xoff = offsetXAD(xad);
		xlen = lengthXAD(xad);
		XT_PUTPAGE(mp);
		return  (xoff + xlen) << JFS_SBI(ip->i_sb)->l2bsize;
	}
	/* tlckFREE: at commit, free the leaf's extents from the pmap only */
	tlck = txLock(tid, ip, mp, tlckXTREE);
	tlck->type = tlckXTREE | tlckFREE;
	xtlck = (struct xtlock *) & tlck->lock;
	xtlck->hwm.offset = index;


	XT_PUTPAGE(mp);

	/*
	 * go back up to the parent page
	 */
      getParent:
	/* pop/restore parent entry for the current child page */
	if ((parent = BT_POP(&btstack)) == NULL)
		/* current page must have been root */
		goto out;

	/* get back the parent page */
	bn = parent->bn;
	XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
	if (rc)
		return -rc;

	index = parent->index;

	/*
	 * parent page become empty: free the page
	 */
	if (index == XTENTRYSTART) {
		/* txCommit() with tlckFREE:
		 * free child extents covered by parent;
		 * invalidate parent if COMMIT_PWMAP;
		 */
		tlck = txLock(tid, ip, mp, tlckXTREE);
		xtlck = (struct xtlock *) & tlck->lock;
		xtlck->hwm.offset =
		    le16_to_cpu(p->header.nextindex) - 1;
		tlck->type = tlckXTREE | tlckFREE;

		XT_PUTPAGE(mp);

		if (p->header.flag & BT_ROOT) {

			goto out;
		} else {
			goto getParent;
		}
	}
	/*
	 * parent page still has entries for front region;
	 */
	else
		/* descend into the preceding child (process backward) */
		index--;
	/*
	 *      internal page: go down to child page of current entry
	 */
      getChild:
	/* save current parent entry for the child page */
	BT_PUSH(&btstack, bn, index);

	/* get child page */
	xad = &p->xad[index];
	bn = addressXAD(xad);

	/*
	 * first access of each internal entry:
	 */
	/* release parent page */
	XT_PUTPAGE(mp);

	/* process the child page */
	goto getPage;

      out:

	return 0;
}
4060
4061
4062#ifdef _JFS_DEBUG_XTREE
4063/*
4064 *      xtDisplayTree()
4065 *
4066 * function: traverse forward
4067 */
4068int xtDisplayTree(struct inode *ip)
4069{
4070	int rc = 0;
4071	struct metapage *mp;
4072	xtpage_t *p;
4073	s64 bn, pbn;
4074	int index, lastindex, v, h;
4075	xad_t *xad;
4076	struct btstack btstack;
4077	struct btframe *btsp;
4078	struct btframe *parent;
4079
4080	printk("display B+-tree.\n");
4081
4082	/* clear stack */
4083	btsp = btstack.stack;
4084
4085	/*
4086	 * start with root
4087	 *
4088	 * root resides in the inode
4089	 */
4090	bn = 0;
4091	v = h = 0;
4092
4093	/*
4094	 * first access of each page:
4095	 */
4096      getPage:
4097	XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
4098	if (rc)
4099		return rc;
4100
4101	/* process entries forward from first index */
4102	index = XTENTRYSTART;
4103	lastindex = le16_to_cpu(p->header.nextindex) - 1;
4104
4105	if (p->header.flag & BT_INTERNAL) {
4106		/*
4107		 * first access of each internal page
4108		 */
4109		goto getChild;
4110	} else {		/* (p->header.flag & BT_LEAF) */
4111
4112		/*
4113		 * first access of each leaf page
4114		 */
4115		printf("leaf page ");
4116		xtDisplayPage(ip, bn, p);
4117
4118		/* unpin the leaf page */
4119		XT_PUTPAGE(mp);
4120	}
4121
4122	/*
4123	 * go back up to the parent page
4124	 */
4125      getParent:
4126	/* pop/restore parent entry for the current child page */
4127	if ((parent = (btsp == btstack.stack ? NULL : --btsp)) == NULL)
4128		/* current page must have been root */
4129		return;
4130
4131	/*
4132	 * parent page scan completed
4133	 */
4134	if ((index = parent->index) == (lastindex = parent->lastindex)) {
4135		/* go back up to the parent page */
4136		goto getParent;
4137	}
4138
4139	/*
4140	 * parent page has entries remaining
4141	 */
4142	/* get back the parent page */
4143	bn = parent->bn;
4144	/* v = parent->level; */
4145	XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
4146	if (rc)
4147		return rc;
4148
4149	/* get next parent entry */
4150	index++;
4151
4152	/*
4153	 * internal page: go down to child page of current entry
4154	 */
4155      getChild:
4156	/* push/save current parent entry for the child page */
4157	btsp->bn = pbn = bn;
4158	btsp->index = index;
4159	btsp->lastindex = lastindex;
4160	/* btsp->level = v; */
4161	/* btsp->node = h; */
4162	++btsp;
4163
4164	/* get child page */
4165	xad = &p->xad[index];
4166	bn = addressXAD(xad);
4167
4168	/*
4169	 * first access of each internal entry:
4170	 */
4171	/* release parent page */
4172	XT_PUTPAGE(mp);
4173
4174	printk("traverse down 0x%lx[%d]->0x%lx\n", (ulong) pbn, index,
4175	       (ulong) bn);
4176	v++;
4177	h = index;
4178
4179	/* process the child page */
4180	goto getPage;
4181}
4182
4183
4184/*
4185 *      xtDisplayPage()
4186 *
4187 * function: display page
4188 */
4189int xtDisplayPage(struct inode *ip, s64 bn, xtpage_t * p)
4190{
4191	int rc = 0;
4192	struct metapage *mp;
4193	xad_t *xad;
4194	s64 xaddr, xoff;
4195	int xlen, i, j;
4196
4197	if (p == NULL) {
4198		XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
4199		if (rc)
4200			return rc;
4201	}
4202
4203	/* display page control */
4204	printf("bn:0x%lx flag:0x%x nextindex:%d\n",
4205	       (ulong) bn, p->header.flag,
4206	       le16_to_cpu(p->header.nextindex));
4207
4208	/* display entries */
4209	xad = &p->xad[XTENTRYSTART];
4210		for (i = XTENTRYSTART, j = 1; i < le16_to_cpu(p->header.nextindex);
4211		     i++, xad++, j++) {
4212			xoff = offsetXAD(xad);
4213			xaddr = addressXAD(xad);
4214			xlen = lengthXAD(xad);
4215			printf("\t[%d] 0x%lx:0x%lx(0x%x)", i, (ulong) xoff,
4216			       (ulong) xaddr, xlen);
4217
4218			if (j == 4) {
4219				printf("\n");
4220				j = 0;
4221		}
4222	}
4223
4224	printf("\n");
4225}
4226#endif				/* _JFS_DEBUG_XTREE */
4227
4228
4229#ifdef _JFS_WIP
4230/*
4231 *      xtGather()
4232 *
4233 * function:
4234 *      traverse for allocation acquiring tlock at commit time
4235 *      (vs at the time of update) logging backward top down
4236 *
4237 * note:
4238 *      problem - establishing that all new allocation have been
4239 *      processed both for append and random write in sparse file
4240 *      at the current entry at the current subtree root page
4241 *
4242 */
int xtGather(t)
btree_t *t;
{
	/* NOTE(review): work-in-progress skeleton (_JFS_WIP) -- it does not
	 * compile as-is: ip and mp are never declared, the allocation-
	 * tracking logic exists only as pseudo-code inside comments, and
	 * the out: label is unreachable.  Kept for design reference only.
	 */
	int rc = 0;
	xtpage_t *p;
	u64 bn;
	int index;
	btentry_t *e;
	struct btstack btstack;
	struct btsf *parent;

	/* clear stack */
	BT_CLR(&btstack);

	/*
	 * start with root
	 *
	 * root resides in the inode
	 */
	bn = 0;
	XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
	if (rc)
		return rc;

	/* new root is NOT pointed by a new entry
	   if (p->header.flag & NEW)
	   allocate new page lock;
	   write a NEWPAGE log;
	 */

      dopage:
	/*
	 * first access of each page:
	 */
	/* process entries backward from last index */
	index = le16_to_cpu(p->header.nextindex) - 1;

	if (p->header.flag & BT_LEAF) {
		/*
		 * first access of each leaf page
		 */
		/* process leaf page entries backward */
		for (; index >= XTENTRYSTART; index--) {
			e = &p->xad[index];
			/*
			 * if newpage, log NEWPAGE.
			 *
			 if (e->flag & XAD_NEW) {
			 nfound =+ entry->length;
			 update current page lock for the entry;
			 newpage(entry);
			 *
			 * if moved, log move.
			 *
			 } else if (e->flag & XAD_MOVED) {
			 reset flag;
			 update current page lock for the entry;
			 }
			 */
		}

		/* unpin the leaf page */
		XT_PUTPAGE(mp);

		/*
		 * go back up to the parent page
		 */
	      getParent:
		/* restore parent entry for the current child page */
		if ((parent = BT_POP(&btstack)) == NULL)
			/* current page must have been root */
			return 0;

		if ((index = parent->index) == XTENTRYSTART) {
			/*
			 * parent page scan completed
			 */
			/* go back up to the parent page */
			goto getParent;
		} else {
			/*
			 * parent page has entries remaining
			 */
			/* get back the parent page */
			bn = parent->bn;
			XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
			if (rc)
				/* NOTE(review): positive EIO here, while the
				 * non-WIP code returns -rc -- reconcile */
				return EIO;

			/* first subroot page which
			 * covers all new allocated blocks
			 * itself not new/modified.
			 * (if modified from split of descendent,
			 * go down path of split page)

			 if (nfound == nnew &&
			 !(p->header.flag & (NEW | MOD)))
			 exit scan;
			 */

			/* process parent page entries backward */
			index--;
		}
	} else {
		/*
		 * first access of each internal page
		 */
	}

	/*
	 * internal page: go down to child page of current entry
	 */

	/* save current parent entry for the child page */
	BT_PUSH(&btstack, bn, index);

	/* get current entry for the child page */
	e = &p->xad[index];

	/*
	 * first access of each internal entry:
	 */
	/*
	 * if new entry, log btree_tnewentry.
	 *
	 if (e->flag & XAD_NEW)
	 update parent page lock for the entry;
	 */

	/* release parent page */
	XT_PUTPAGE(mp);

	/* get child page */
	bn = e->bn;
	XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
	if (rc)
		return rc;

	/*
	 * first access of each non-root page:
	 */
	/*
	 * if new, log btree_newpage.
	 *
	 if (p->header.flag & NEW)
	 allocate new page lock;
	 write a NEWPAGE log (next, prev);
	 */

	/* process the child page */
	goto dopage;

      out:
	return 0;
}
4398#endif				/* _JFS_WIP */
4399
4400
4401#ifdef CONFIG_JFS_STATISTICS
4402int jfs_xtstat_read(char *buffer, char **start, off_t offset, int length,
4403		    int *eof, void *data)
4404{
4405	int len = 0;
4406	off_t begin;
4407
4408	len += sprintf(buffer,
4409		       "JFS Xtree statistics\n"
4410		       "====================\n"
4411		       "searches = %d\n"
4412		       "fast searches = %d\n"
4413		       "splits = %d\n",
4414		       xtStat.search,
4415		       xtStat.fastSearch,
4416		       xtStat.split);
4417
4418	begin = offset;
4419	*start = buffer + begin;
4420	len -= begin;
4421
4422	if (len > length)
4423		len = length;
4424	else
4425		*eof = 1;
4426
4427	if (len < 0)
4428		len = 0;
4429
4430	return len;
4431}
4432#endif
4433