1/*
2 *   Copyright (c) International Business Machines Corp., 2000-2002
3 *
4 *   This program is free software;  you can redistribute it and/or modify
5 *   it under the terms of the GNU General Public License as published by
6 *   the Free Software Foundation; either version 2 of the License, or
7 *   (at your option) any later version.
8 *
9 *   This program is distributed in the hope that it will be useful,
10 *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
11 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
12 *   the GNU General Public License for more details.
13 *
14 *   You should have received a copy of the GNU General Public License
15 *   along with this program;  if not, write to the Free Software
16 *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */
18
19/*
20 *	jfs_dtree.c: directory B+-tree manager
21 *
22 * B+-tree with variable length key directory:
23 *
24 * each directory page is structured as an array of 32-byte
25 * directory entry slots initialized as a freelist
26 * to avoid search/compaction of free space at insertion.
27 * when an entry is inserted, a number of slots are allocated
28 * from the freelist as required to store variable length data
29 * of the entry; when the entry is deleted, slots of the entry
30 * are returned to freelist.
31 *
32 * leaf entry stores full name as key and file serial number
33 * (aka inode number) as data.
34 * internal/router entry stores sufffix compressed name
35 * as key and simple extent descriptor as data.
36 *
37 * each directory page maintains a sorted entry index table
38 * which stores the start slot index of sorted entries
39 * to allow binary search on the table.
40 *
41 * directory starts as a root/leaf page in on-disk inode
42 * inline data area.
43 * when it becomes full, it starts a leaf of a external extent
44 * of length of 1 block. each time the first leaf becomes full,
45 * it is extended rather than split (its size is doubled),
46 * until its length becoms 4 KBytes, from then the extent is split
47 * with new 4 Kbyte extent when it becomes full
48 * to reduce external fragmentation of small directories.
49 *
50 * blah, blah, blah, for linear scan of directory in pieces by
51 * readdir().
52 *
53 *
54 *	case-insensitive directory file system
55 *
56 * names are stored in case-sensitive way in leaf entry.
57 * but stored, searched and compared in case-insensitive (uppercase) order
58 * (i.e., both search key and entry key are folded for search/compare):
59 * (note that case-sensitive order is BROKEN in storage, e.g.,
60 *  sensitive: Ad, aB, aC, aD -> insensitive: aB, aC, aD, Ad
61 *
62 *  entries which folds to the same key makes up a equivalent class
63 *  whose members are stored as contiguous cluster (may cross page boundary)
64 *  but whose order is arbitrary and acts as duplicate, e.g.,
65 *  abc, Abc, aBc, abC)
66 *
67 * once match is found at leaf, requires scan forward/backward
68 * either for, in case-insensitive search, duplicate
69 * or for, in case-sensitive search, for exact match
70 *
71 * router entry must be created/stored in case-insensitive way
72 * in internal entry:
73 * (right most key of left page and left most key of right page
74 * are folded, and its suffix compression is propagated as router
75 * key in parent)
76 * (e.g., if split occurs <abc> and <aBd>, <ABD> trather than <aB>
77 * should be made the router key for the split)
78 *
79 * case-insensitive search:
80 *
81 * 	fold search key;
82 *
83 *	case-insensitive search of B-tree:
84 *	for internal entry, router key is already folded;
85 *	for leaf entry, fold the entry key before comparison.
86 *
87 *	if (leaf entry case-insensitive match found)
88 *		if (next entry satisfies case-insensitive match)
89 *			return EDUPLICATE;
90 *		if (prev entry satisfies case-insensitive match)
91 *			return EDUPLICATE;
92 *		return match;
93 *	else
94 *		return no match;
95 *
96 * 	serialization:
97 * target directory inode lock is being held on entry/exit
98 * of all main directory service routines.
99 *
100 *	log based recovery:
101 */
102
103#include <linux/fs.h>
104#include "jfs_incore.h"
105#include "jfs_superblock.h"
106#include "jfs_filsys.h"
107#include "jfs_metapage.h"
108#include "jfs_dmap.h"
109#include "jfs_unicode.h"
110#include "jfs_debug.h"
111
112/* dtree split parameter */
113struct dtsplit {
114	struct metapage *mp;
115	s16 index;
116	s16 nslot;
117	struct component_name *key;
118	ddata_t *data;
119	struct pxdlist *pxdlist;
120};
121
122#define DT_PAGE(IP, MP) BT_PAGE(IP, MP, dtpage_t, i_dtroot)
123
124/* get page buffer for specified block address */
125#define DT_GETPAGE(IP, BN, MP, SIZE, P, RC)\
126{\
127	BT_GETPAGE(IP, BN, MP, dtpage_t, SIZE, P, RC, i_dtroot)\
128	if (!(RC))\
129	{\
130		if (((P)->header.nextindex > (((BN)==0)?DTROOTMAXSLOT:(P)->header.maxslot)) ||\
131		    ((BN) && ((P)->header.maxslot > DTPAGEMAXSLOT)))\
132		{\
133			jERROR(1,("DT_GETPAGE: dtree page corrupt\n"));\
134			BT_PUTPAGE(MP);\
135			updateSuper((IP)->i_sb, FM_DIRTY);\
136			MP = NULL;\
137			RC = EIO;\
138		}\
139	}\
140}
141
142/* for consistency */
143#define DT_PUTPAGE(MP) BT_PUTPAGE(MP)
144
145#define DT_GETSEARCH(IP, LEAF, BN, MP, P, INDEX) \
146	BT_GETSEARCH(IP, LEAF, BN, MP, dtpage_t, P, INDEX, i_dtroot)
147
148/*
149 * forward references
150 */
151static int dtSplitUp(tid_t tid, struct inode *ip,
152		     struct dtsplit * split, struct btstack * btstack);
153
154static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split,
155		       struct metapage ** rmpp, dtpage_t ** rpp, pxd_t * rxdp);
156
157static int dtExtendPage(tid_t tid, struct inode *ip,
158			struct dtsplit * split, struct btstack * btstack);
159
160static int dtSplitRoot(tid_t tid, struct inode *ip,
161		       struct dtsplit * split, struct metapage ** rmpp);
162
163static int dtDeleteUp(tid_t tid, struct inode *ip, struct metapage * fmp,
164		      dtpage_t * fp, struct btstack * btstack);
165
166static int dtSearchNode(struct inode *ip,
167			s64 lmxaddr, pxd_t * kpxd, struct btstack * btstack);
168
169static int dtRelink(tid_t tid, struct inode *ip, dtpage_t * p);
170
171static int dtReadFirst(struct inode *ip, struct btstack * btstack);
172
173static int dtReadNext(struct inode *ip,
174		      loff_t * offset, struct btstack * btstack);
175
176static int dtCompare(struct component_name * key, dtpage_t * p, int si);
177
178static int ciCompare(struct component_name * key, dtpage_t * p, int si,
179		     int flag);
180
181static void dtGetKey(dtpage_t * p, int i, struct component_name * key,
182		     int flag);
183
184static void ciGetLeafPrefixKey(dtpage_t * lp, int li, dtpage_t * rp,
185			       int ri, struct component_name * key, int flag);
186
187static void dtInsertEntry(dtpage_t * p, int index, struct component_name * key,
188			  ddata_t * data, struct dt_lock **);
189
190static void dtMoveEntry(dtpage_t * sp, int si, dtpage_t * dp,
191			struct dt_lock ** sdtlock, struct dt_lock ** ddtlock,
192			int do_index);
193
194static void dtDeleteEntry(dtpage_t * p, int fi, struct dt_lock ** dtlock);
195
196static void dtTruncateEntry(dtpage_t * p, int ti, struct dt_lock ** dtlock);
197
198static void dtLinelockFreelist(dtpage_t * p, int m, struct dt_lock ** dtlock);
199
200#define ciToUpper(c)	UniStrupr((c)->name)
201
202/*
203 *	find_index()
204 *
205 *	Returns dtree page containing directory table entry for specified
206 *	index and pointer to its entry.
207 *
208 *	mp must be released by caller.
209 */
210static struct dir_table_slot *find_index(struct inode *ip, u32 index,
211					 struct metapage ** mp)
212{
213	struct jfs_inode_info *jfs_ip = JFS_IP(ip);
214	s64 blkno;
215	s64 offset;
216	int page_offset;
217	struct dir_table_slot *slot;
218	static int maxWarnings = 10;
219
220	if (index < 2) {
221		if (maxWarnings) {
222			jERROR(1, ("find_entry called with index = %d\n",
223				   index));
224			maxWarnings--;
225		}
226		return 0;
227	}
228
229	if (index >= jfs_ip->next_index) {
230		jFYI(1, ("find_entry called with index >= next_index\n"));
231		return 0;
232	}
233
234	if (jfs_ip->next_index <= (MAX_INLINE_DIRTABLE_ENTRY + 1)) {
235		/*
236		 * Inline directory table
237		 */
238		*mp = 0;
239		slot = &jfs_ip->i_dirtable[index - 2];
240	} else {
241		offset = (index - 2) * sizeof(struct dir_table_slot);
242		page_offset = offset & (PSIZE - 1);
243		blkno = ((offset + 1) >> L2PSIZE) <<
244		    JFS_SBI(ip->i_sb)->l2nbperpage;
245
246		if (*mp && ((*mp)->index != blkno)) {
247			release_metapage(*mp);
248			*mp = 0;
249		}
250		if (*mp == 0)
251			*mp = read_metapage(ip, blkno, PSIZE, 0);
252		if (*mp == 0) {
253			jERROR(1,
254			       ("free_index: error reading directory table\n"));
255			return 0;
256		}
257
258		slot =
259		    (struct dir_table_slot *) ((char *) (*mp)->data +
260					       page_offset);
261	}
262	return slot;
263}
264
265static inline void lock_index(tid_t tid, struct inode *ip, struct metapage * mp,
266			      u32 index)
267{
268	struct tlock *tlck;
269	struct linelock *llck;
270	struct lv *lv;
271
272	tlck = txLock(tid, ip, mp, tlckDATA);
273	llck = (struct linelock *) tlck->lock;
274
275	if (llck->index >= llck->maxcnt)
276		llck = txLinelock(llck);
277	lv = &llck->lv[llck->index];
278
279	/*
280	 *      Linelock slot size is twice the size of directory table
281	 *      slot size.  512 entries per page.
282	 */
283	lv->offset = ((index - 2) & 511) >> 1;
284	lv->length = 1;
285	llck->index++;
286}
287
288/*
289 *	add_index()
290 *
291 *	Adds an entry to the directory index table.  This is used to provide
292 *	each directory entry with a persistent index in which to resume
293 *	directory traversals
294 */
295static u32 add_index(tid_t tid, struct inode *ip, s64 bn, int slot)
296{
297	struct super_block *sb = ip->i_sb;
298	struct jfs_sb_info *sbi = JFS_SBI(sb);
299	struct jfs_inode_info *jfs_ip = JFS_IP(ip);
300	u64 blkno;
301	struct dir_table_slot *dirtab_slot;
302	u32 index;
303	struct linelock *llck;
304	struct lv *lv;
305	struct metapage *mp;
306	s64 offset;
307	uint page_offset;
308	int rc;
309	struct tlock *tlck;
310	s64 xaddr;
311
312	ASSERT(DO_INDEX(ip));
313
314	if (jfs_ip->next_index < 2) {
315		jERROR(1, ("next_index = %d.  Please fix this!\n",
316			   jfs_ip->next_index));
317		jfs_ip->next_index = 2;
318	}
319
320	index = jfs_ip->next_index++;
321
322	if (index <= MAX_INLINE_DIRTABLE_ENTRY) {
323		/*
324		 * i_size reflects size of index table, or 8 bytes per entry.
325		 */
326		ip->i_size = (loff_t) (index - 1) << 3;
327
328		/*
329		 * dir table fits inline within inode
330		 */
331		dirtab_slot = &jfs_ip->i_dirtable[index-2];
332		dirtab_slot->flag = DIR_INDEX_VALID;
333		dirtab_slot->slot = slot;
334		DTSaddress(dirtab_slot, bn);
335
336		set_cflag(COMMIT_Dirtable, ip);
337
338		return index;
339	}
340	if (index == (MAX_INLINE_DIRTABLE_ENTRY + 1)) {
341		/*
342		 * It's time to move the inline table to an external
343		 * page and begin to build the xtree
344		 */
345
346		/*
347		 * Save the table, we're going to overwrite it with the
348		 * xtree root
349		 */
350		struct dir_table_slot temp_table[12];
351		memcpy(temp_table, &jfs_ip->i_dirtable, sizeof(temp_table));
352
353		/*
354		 * Initialize empty x-tree
355		 */
356		xtInitRoot(tid, ip);
357
358		/*
359		 * Allocate the first block & add it to the xtree
360		 */
361		xaddr = 0;
362		if ((rc =
363		     xtInsert(tid, ip, 0, 0, sbi->nbperpage,
364			      &xaddr, 0))) {
365			jFYI(1, ("add_index: xtInsert failed!\n"));
366			return -1;
367		}
368		ip->i_size = PSIZE;
369		ip->i_blocks += LBLK2PBLK(sb, sbi->nbperpage);
370
371		if ((mp = get_metapage(ip, 0, ip->i_blksize, 0)) == 0) {
372			jERROR(1, ("add_index: get_metapage failed!\n"));
373			xtTruncate(tid, ip, 0, COMMIT_PWMAP);
374			return -1;
375		}
376		tlck = txLock(tid, ip, mp, tlckDATA);
377		llck = (struct linelock *) & tlck->lock;
378		ASSERT(llck->index == 0);
379		lv = &llck->lv[0];
380
381		lv->offset = 0;
382		lv->length = 6;	/* tlckDATA slot size is 16 bytes */
383		llck->index++;
384
385		memcpy(mp->data, temp_table, sizeof(temp_table));
386
387		mark_metapage_dirty(mp);
388		release_metapage(mp);
389
390		/*
391		 * Logging is now directed by xtree tlocks
392		 */
393		clear_cflag(COMMIT_Dirtable, ip);
394	}
395
396	offset = (index - 2) * sizeof(struct dir_table_slot);
397	page_offset = offset & (PSIZE - 1);
398	blkno = ((offset + 1) >> L2PSIZE) << sbi->l2nbperpage;
399	if (page_offset == 0) {
400		/*
401		 * This will be the beginning of a new page
402		 */
403		xaddr = 0;
404		if ((rc =
405		     xtInsert(tid, ip, 0, blkno, sbi->nbperpage,
406			      &xaddr, 0))) {
407			jFYI(1, ("add_index: xtInsert failed!\n"));
408			jfs_ip->next_index--;
409			return -1;
410		}
411		ip->i_size += PSIZE;
412		ip->i_blocks += LBLK2PBLK(sb, sbi->nbperpage);
413
414		if ((mp = get_metapage(ip, blkno, PSIZE, 0)))
415			memset(mp->data, 0, PSIZE);	/* Just looks better */
416		else
417			xtTruncate(tid, ip, offset, COMMIT_PWMAP);
418	} else
419		mp = read_metapage(ip, blkno, PSIZE, 0);
420
421	if (mp == 0) {
422		jERROR(1, ("add_index: get/read_metapage failed!\n"));
423		return -1;
424	}
425
426	lock_index(tid, ip, mp, index);
427
428	dirtab_slot =
429	    (struct dir_table_slot *) ((char *) mp->data + page_offset);
430	dirtab_slot->flag = DIR_INDEX_VALID;
431	dirtab_slot->slot = slot;
432	DTSaddress(dirtab_slot, bn);
433
434	mark_metapage_dirty(mp);
435	release_metapage(mp);
436
437	return index;
438}
439
440/*
441 *	free_index()
442 *
443 *	Marks an entry to the directory index table as free.
444 */
445static void free_index(tid_t tid, struct inode *ip, u32 index, u32 next)
446{
447	struct dir_table_slot *dirtab_slot;
448	struct metapage *mp = 0;
449
450	dirtab_slot = find_index(ip, index, &mp);
451
452	if (dirtab_slot == 0)
453		return;
454
455	dirtab_slot->flag = DIR_INDEX_FREE;
456	dirtab_slot->slot = dirtab_slot->addr1 = 0;
457	dirtab_slot->addr2 = cpu_to_le32(next);
458
459	if (mp) {
460		lock_index(tid, ip, mp, index);
461		mark_metapage_dirty(mp);
462		release_metapage(mp);
463	} else
464		set_cflag(COMMIT_Dirtable, ip);
465}
466
467/*
468 *	modify_index()
469 *
470 *	Changes an entry in the directory index table
471 */
472static void modify_index(tid_t tid, struct inode *ip, u32 index, s64 bn,
473			 int slot, struct metapage ** mp)
474{
475	struct dir_table_slot *dirtab_slot;
476
477	dirtab_slot = find_index(ip, index, mp);
478
479	if (dirtab_slot == 0)
480		return;
481
482	DTSaddress(dirtab_slot, bn);
483	dirtab_slot->slot = slot;
484
485	if (*mp) {
486		lock_index(tid, ip, *mp, index);
487		mark_metapage_dirty(*mp);
488	} else
489		set_cflag(COMMIT_Dirtable, ip);
490}
491
492/*
493 *	read_index()
494 *
495 *	reads a directory table slot
496 */
497static int read_index(struct inode *ip, u32 index,
498		     struct dir_table_slot * dirtab_slot)
499{
500	struct metapage *mp = 0;
501	struct dir_table_slot *slot;
502
503	slot = find_index(ip, index, &mp);
504	if (slot == 0) {
505		return -EIO;
506	}
507
508	memcpy(dirtab_slot, slot, sizeof(struct dir_table_slot));
509
510	if (mp)
511		release_metapage(mp);
512
513	return 0;
514}
515
516/*
517 *	dtSearch()
518 *
519 * function:
520 *	Search for the entry with specified key
521 *
522 * parameter:
523 *
524 * return: 0 - search result on stack, leaf page pinned;
525 *	   errno - I/O error
526 */
527int dtSearch(struct inode *ip, struct component_name * key, ino_t * data,
528	     struct btstack * btstack, int flag)
529{
530	int rc = 0;
531	int cmp = 1;		/* init for empty page */
532	s64 bn;
533	struct metapage *mp;
534	dtpage_t *p;
535	s8 *stbl;
536	int base, index, lim;
537	struct btframe *btsp;
538	pxd_t *pxd;
539	int psize = 288;	/* initial in-line directory */
540	ino_t inumber;
541	struct component_name ciKey;
542	struct super_block *sb = ip->i_sb;
543
544	ciKey.name =
545	    (wchar_t *) kmalloc((JFS_NAME_MAX + 1) * sizeof(wchar_t),
546				GFP_NOFS);
547	if (ciKey.name == 0) {
548		rc = ENOMEM;
549		goto dtSearch_Exit2;
550	}
551
552
553	/* uppercase search key for c-i directory */
554	UniStrcpy(ciKey.name, key->name);
555	ciKey.namlen = key->namlen;
556
557	/* only uppercase if case-insensitive support is on */
558	if ((JFS_SBI(sb)->mntflag & JFS_OS2) == JFS_OS2) {
559		ciToUpper(&ciKey);
560	}
561	BT_CLR(btstack);	/* reset stack */
562
563	/* init level count for max pages to split */
564	btstack->nsplit = 1;
565
566	/*
567	 *      search down tree from root:
568	 *
569	 * between two consecutive entries of <Ki, Pi> and <Kj, Pj> of
570	 * internal page, child page Pi contains entry with k, Ki <= K < Kj.
571	 *
572	 * if entry with search key K is not found
573	 * internal page search find the entry with largest key Ki
574	 * less than K which point to the child page to search;
575	 * leaf page search find the entry with smallest key Kj
576	 * greater than K so that the returned index is the position of
577	 * the entry to be shifted right for insertion of new entry.
578	 * for empty tree, search key is greater than any key of the tree.
579	 *
580	 * by convention, root bn = 0.
581	 */
582	for (bn = 0;;) {
583		/* get/pin the page to search */
584		DT_GETPAGE(ip, bn, mp, psize, p, rc);
585		if (rc)
586			goto dtSearch_Exit1;
587
588		/* get sorted entry table of the page */
589		stbl = DT_GETSTBL(p);
590
591		/*
592		 * binary search with search key K on the current page.
593		 */
594		for (base = 0, lim = p->header.nextindex; lim; lim >>= 1) {
595			index = base + (lim >> 1);
596
597			if (p->header.flag & BT_LEAF) {
598				/* uppercase leaf name to compare */
599				cmp =
600				    ciCompare(&ciKey, p, stbl[index],
601					      JFS_SBI(sb)->mntflag);
602			} else {
603				/* router key is in uppercase */
604
605				cmp = dtCompare(&ciKey, p, stbl[index]);
606
607
608			}
609			if (cmp == 0) {
610				/*
611				 *      search hit
612				 */
613				/* search hit - leaf page:
614				 * return the entry found
615				 */
616				if (p->header.flag & BT_LEAF) {
617					inumber = le32_to_cpu(
618			((struct ldtentry *) & p->slot[stbl[index]])->inumber);
619
620					/*
621					 * search for JFS_LOOKUP
622					 */
623					if (flag == JFS_LOOKUP) {
624						*data = inumber;
625						rc = 0;
626						goto out;
627					}
628
629					/*
630					 * search for JFS_CREATE
631					 */
632					if (flag == JFS_CREATE) {
633						*data = inumber;
634						rc = EEXIST;
635						goto out;
636					}
637
638					/*
639					 * search for JFS_REMOVE or JFS_RENAME
640					 */
641					if ((flag == JFS_REMOVE ||
642					     flag == JFS_RENAME) &&
643					    *data != inumber) {
644						rc = ESTALE;
645						goto out;
646					}
647
648					/*
649					 * JFS_REMOVE|JFS_FINDDIR|JFS_RENAME
650					 */
651					/* save search result */
652					*data = inumber;
653					btsp = btstack->top;
654					btsp->bn = bn;
655					btsp->index = index;
656					btsp->mp = mp;
657
658					rc = 0;
659					goto dtSearch_Exit1;
660				}
661
662				/* search hit - internal page:
663				 * descend/search its child page
664				 */
665				goto getChild;
666			}
667
668			if (cmp > 0) {
669				base = index + 1;
670				--lim;
671			}
672		}
673
674		/*
675		 *      search miss
676		 *
677		 * base is the smallest index with key (Kj) greater than
678		 * search key (K) and may be zero or (maxindex + 1) index.
679		 */
680		/*
681		 * search miss - leaf page
682		 *
683		 * return location of entry (base) where new entry with
684		 * search key K is to be inserted.
685		 */
686		if (p->header.flag & BT_LEAF) {
687			/*
688			 * search for JFS_LOOKUP, JFS_REMOVE, or JFS_RENAME
689			 */
690			if (flag == JFS_LOOKUP || flag == JFS_REMOVE ||
691			    flag == JFS_RENAME) {
692				rc = ENOENT;
693				goto out;
694			}
695
696			/*
697			 * search for JFS_CREATE|JFS_FINDDIR:
698			 *
699			 * save search result
700			 */
701			*data = 0;
702			btsp = btstack->top;
703			btsp->bn = bn;
704			btsp->index = base;
705			btsp->mp = mp;
706
707			rc = 0;
708			goto dtSearch_Exit1;
709		}
710
711		/*
712		 * search miss - internal page
713		 *
714		 * if base is non-zero, decrement base by one to get the parent
715		 * entry of the child page to search.
716		 */
717		index = base ? base - 1 : base;
718
719		/*
720		 * go down to child page
721		 */
722	      getChild:
723		/* update max. number of pages to split */
724		if (btstack->nsplit >= 8) {
725			/* Something's corrupted, mark filesytem dirty so
726			 * chkdsk will fix it.
727			 */
728			jERROR(1, ("stack overrun in dtSearch!\n"));
729			updateSuper(sb, FM_DIRTY);
730			rc = EIO;
731			goto out;
732		}
733		btstack->nsplit++;
734
735		/* push (bn, index) of the parent page/entry */
736		BT_PUSH(btstack, bn, index);
737
738		/* get the child page block number */
739		pxd = (pxd_t *) & p->slot[stbl[index]];
740		bn = addressPXD(pxd);
741		psize = lengthPXD(pxd) << JFS_SBI(ip->i_sb)->l2bsize;
742
743		/* unpin the parent page */
744		DT_PUTPAGE(mp);
745	}
746
747      out:
748	DT_PUTPAGE(mp);
749
750      dtSearch_Exit1:
751
752	kfree(ciKey.name);
753
754      dtSearch_Exit2:
755
756	return rc;
757}
758
759
760/*
761 *	dtInsert()
762 *
763 * function: insert an entry to directory tree
764 *
765 * parameter:
766 *
767 * return: 0 - success;
768 *	   errno - failure;
769 */
770int dtInsert(tid_t tid, struct inode *ip,
771	 struct component_name * name, ino_t * fsn, struct btstack * btstack)
772{
773	int rc = 0;
774	struct metapage *mp;	/* meta-page buffer */
775	dtpage_t *p;		/* base B+-tree index page */
776	s64 bn;
777	int index;
778	struct dtsplit split;	/* split information */
779	ddata_t data;
780	struct dt_lock *dtlck;
781	int n;
782	struct tlock *tlck;
783	struct lv *lv;
784
785	/*
786	 *      retrieve search result
787	 *
788	 * dtSearch() returns (leaf page pinned, index at which to insert).
789	 * n.b. dtSearch() may return index of (maxindex + 1) of
790	 * the full page.
791	 */
792	DT_GETSEARCH(ip, btstack->top, bn, mp, p, index);
793
794	/*
795	 *      insert entry for new key
796	 */
797	if (DO_INDEX(ip)) {
798		if (JFS_IP(ip)->next_index == DIREND) {
799			DT_PUTPAGE(mp);
800			return EMLINK;
801		}
802		n = NDTLEAF(name->namlen);
803		data.leaf.tid = tid;
804		data.leaf.ip = ip;
805	} else {
806		n = NDTLEAF_LEGACY(name->namlen);
807		data.leaf.ip = 0;	/* signifies legacy directory format */
808	}
809	data.leaf.ino = cpu_to_le32(*fsn);
810
811	/*
812	 *      leaf page does not have enough room for new entry:
813	 *
814	 *      extend/split the leaf page;
815	 *
816	 * dtSplitUp() will insert the entry and unpin the leaf page.
817	 */
818	if (n > p->header.freecnt) {
819		split.mp = mp;
820		split.index = index;
821		split.nslot = n;
822		split.key = name;
823		split.data = &data;
824		rc = dtSplitUp(tid, ip, &split, btstack);
825		return rc;
826	}
827
828	/*
829	 *      leaf page does have enough room for new entry:
830	 *
831	 *      insert the new data entry into the leaf page;
832	 */
833	BT_MARK_DIRTY(mp, ip);
834	/*
835	 * acquire a transaction lock on the leaf page
836	 */
837	tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY);
838	dtlck = (struct dt_lock *) & tlck->lock;
839	ASSERT(dtlck->index == 0);
840	lv = & dtlck->lv[0];
841
842	/* linelock header */
843	lv->offset = 0;
844	lv->length = 1;
845	dtlck->index++;
846
847	dtInsertEntry(p, index, name, &data, &dtlck);
848
849	/* linelock stbl of non-root leaf page */
850	if (!(p->header.flag & BT_ROOT)) {
851		if (dtlck->index >= dtlck->maxcnt)
852			dtlck = (struct dt_lock *) txLinelock(dtlck);
853		lv = & dtlck->lv[dtlck->index];
854		n = index >> L2DTSLOTSIZE;
855		lv->offset = p->header.stblindex + n;
856		lv->length =
857		    ((p->header.nextindex - 1) >> L2DTSLOTSIZE) - n + 1;
858		dtlck->index++;
859	}
860
861	/* unpin the leaf page */
862	DT_PUTPAGE(mp);
863
864	return 0;
865}
866
867
868/*
869 *	dtSplitUp()
870 *
871 * function: propagate insertion bottom up;
872 *
873 * parameter:
874 *
875 * return: 0 - success;
876 *	   errno - failure;
877 * 	leaf page unpinned;
878 */
879static int dtSplitUp(tid_t tid,
880	  struct inode *ip, struct dtsplit * split, struct btstack * btstack)
881{
882	struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
883	int rc = 0;
884	struct metapage *smp;
885	dtpage_t *sp;		/* split page */
886	struct metapage *rmp;
887	dtpage_t *rp;		/* new right page split from sp */
888	pxd_t rpxd;		/* new right page extent descriptor */
889	struct metapage *lmp;
890	dtpage_t *lp;		/* left child page */
891	int skip;		/* index of entry of insertion */
892	struct btframe *parent;	/* parent page entry on traverse stack */
893	s64 xaddr, nxaddr;
894	int xlen, xsize;
895	struct pxdlist pxdlist;
896	pxd_t *pxd;
897	struct component_name key = { 0, 0 };
898	ddata_t *data = split->data;
899	int n;
900	struct dt_lock *dtlck;
901	struct tlock *tlck;
902	struct lv *lv;
903
904	/* get split page */
905	smp = split->mp;
906	sp = DT_PAGE(ip, smp);
907
908	key.name =
909	    (wchar_t *) kmalloc((JFS_NAME_MAX + 2) * sizeof(wchar_t),
910				GFP_NOFS);
911	if (key.name == 0) {
912		DT_PUTPAGE(smp);
913		rc = ENOMEM;
914		goto dtSplitUp_Exit;
915	}
916
917	/*
918	 *      split leaf page
919	 *
920	 * The split routines insert the new entry, and
921	 * acquire txLock as appropriate.
922	 */
923	/*
924	 *      split root leaf page:
925	 */
926	if (sp->header.flag & BT_ROOT) {
927		/*
928		 * allocate a single extent child page
929		 */
930		xlen = 1;
931		n = sbi->bsize >> L2DTSLOTSIZE;
932		n -= (n + 31) >> L2DTSLOTSIZE;	/* stbl size */
933		n -= DTROOTMAXSLOT - sp->header.freecnt; /* header + entries */
934		if (n <= split->nslot)
935			xlen++;
936		if ((rc = dbAlloc(ip, 0, (s64) xlen, &xaddr)))
937			goto freeKeyName;
938
939		pxdlist.maxnpxd = 1;
940		pxdlist.npxd = 0;
941		pxd = &pxdlist.pxd[0];
942		PXDaddress(pxd, xaddr);
943		PXDlength(pxd, xlen);
944		split->pxdlist = &pxdlist;
945		rc = dtSplitRoot(tid, ip, split, &rmp);
946
947		DT_PUTPAGE(rmp);
948		DT_PUTPAGE(smp);
949
950		goto freeKeyName;
951	}
952
953	/*
954	 *      extend first leaf page
955	 *
956	 * extend the 1st extent if less than buffer page size
957	 * (dtExtendPage() reurns leaf page unpinned)
958	 */
959	pxd = &sp->header.self;
960	xlen = lengthPXD(pxd);
961	xsize = xlen << sbi->l2bsize;
962	if (xsize < PSIZE) {
963		xaddr = addressPXD(pxd);
964		n = xsize >> L2DTSLOTSIZE;
965		n -= (n + 31) >> L2DTSLOTSIZE;	/* stbl size */
966		if ((n + sp->header.freecnt) <= split->nslot)
967			n = xlen + (xlen << 1);
968		else
969			n = xlen;
970		if ((rc = dbReAlloc(sbi->ipbmap, xaddr, (s64) xlen,
971				    (s64) n, &nxaddr)))
972			goto extendOut;
973
974		pxdlist.maxnpxd = 1;
975		pxdlist.npxd = 0;
976		pxd = &pxdlist.pxd[0];
977		PXDaddress(pxd, nxaddr)
978		    PXDlength(pxd, xlen + n);
979		split->pxdlist = &pxdlist;
980		if ((rc = dtExtendPage(tid, ip, split, btstack))) {
981			nxaddr = addressPXD(pxd);
982			if (xaddr != nxaddr) {
983				/* free relocated extent */
984				xlen = lengthPXD(pxd);
985				dbFree(ip, nxaddr, (s64) xlen);
986			} else {
987				/* free extended delta */
988				xlen = lengthPXD(pxd) - n;
989				xaddr = addressPXD(pxd) + xlen;
990				dbFree(ip, xaddr, (s64) n);
991			}
992		}
993
994	      extendOut:
995		DT_PUTPAGE(smp);
996		goto freeKeyName;
997	}
998
999	/*
1000	 *      split leaf page <sp> into <sp> and a new right page <rp>.
1001	 *
1002	 * return <rp> pinned and its extent descriptor <rpxd>
1003	 */
1004	/*
1005	 * allocate new directory page extent and
1006	 * new index page(s) to cover page split(s)
1007	 *
1008	 * allocation hint: ?
1009	 */
1010	n = btstack->nsplit;
1011	pxdlist.maxnpxd = pxdlist.npxd = 0;
1012	xlen = sbi->nbperpage;
1013	for (pxd = pxdlist.pxd; n > 0; n--, pxd++) {
1014		if ((rc = dbAlloc(ip, 0, (s64) xlen, &xaddr)) == 0) {
1015			PXDaddress(pxd, xaddr);
1016			PXDlength(pxd, xlen);
1017			pxdlist.maxnpxd++;
1018			continue;
1019		}
1020
1021		DT_PUTPAGE(smp);
1022
1023		/* undo allocation */
1024		goto splitOut;
1025	}
1026
1027	split->pxdlist = &pxdlist;
1028	if ((rc = dtSplitPage(tid, ip, split, &rmp, &rp, &rpxd))) {
1029		DT_PUTPAGE(smp);
1030
1031		/* undo allocation */
1032		goto splitOut;
1033	}
1034
1035	/*
1036	 * propagate up the router entry for the leaf page just split
1037	 *
1038	 * insert a router entry for the new page into the parent page,
1039	 * propagate the insert/split up the tree by walking back the stack
1040	 * of (bn of parent page, index of child page entry in parent page)
1041	 * that were traversed during the search for the page that split.
1042	 *
1043	 * the propagation of insert/split up the tree stops if the root
1044	 * splits or the page inserted into doesn't have to split to hold
1045	 * the new entry.
1046	 *
1047	 * the parent entry for the split page remains the same, and
1048	 * a new entry is inserted at its right with the first key and
1049	 * block number of the new right page.
1050	 *
1051	 * There are a maximum of 4 pages pinned at any time:
1052	 * two children, left parent and right parent (when the parent splits).
1053	 * keep the child pages pinned while working on the parent.
1054	 * make sure that all pins are released at exit.
1055	 */
1056	while ((parent = BT_POP(btstack)) != NULL) {
1057		/* parent page specified by stack frame <parent> */
1058
1059		/* keep current child pages (<lp>, <rp>) pinned */
1060		lmp = smp;
1061		lp = sp;
1062
1063		/*
1064		 * insert router entry in parent for new right child page <rp>
1065		 */
1066		/* get the parent page <sp> */
1067		DT_GETPAGE(ip, parent->bn, smp, PSIZE, sp, rc);
1068		if (rc) {
1069			DT_PUTPAGE(lmp);
1070			DT_PUTPAGE(rmp);
1071			goto splitOut;
1072		}
1073
1074		/*
1075		 * The new key entry goes ONE AFTER the index of parent entry,
1076		 * because the split was to the right.
1077		 */
1078		skip = parent->index + 1;
1079
1080		/*
1081		 * compute the key for the router entry
1082		 *
1083		 * key suffix compression:
1084		 * for internal pages that have leaf pages as children,
1085		 * retain only what's needed to distinguish between
1086		 * the new entry and the entry on the page to its left.
1087		 * If the keys compare equal, retain the entire key.
1088		 *
1089		 * note that compression is performed only at computing
1090		 * router key at the lowest internal level.
1091		 * further compression of the key between pairs of higher
1092		 * level internal pages loses too much information and
1093		 * the search may fail.
1094		 * (e.g., two adjacent leaf pages of {a, ..., x} {xx, ...,}
1095		 * results in two adjacent parent entries (a)(xx).
1096		 * if split occurs between these two entries, and
1097		 * if compression is applied, the router key of parent entry
1098		 * of right page (x) will divert search for x into right
1099		 * subtree and miss x in the left subtree.)
1100		 *
1101		 * the entire key must be retained for the next-to-leftmost
1102		 * internal key at any level of the tree, or search may fail
1103		 * (e.g., ?)
1104		 */
1105		switch (rp->header.flag & BT_TYPE) {
1106		case BT_LEAF:
1107			/*
1108			 * compute the length of prefix for suffix compression
1109			 * between last entry of left page and first entry
1110			 * of right page
1111			 */
1112			if ((sp->header.flag & BT_ROOT && skip > 1) ||
1113			    sp->header.prev != 0 || skip > 1) {
1114				/* compute uppercase router prefix key */
1115				ciGetLeafPrefixKey(lp,
1116						   lp->header.nextindex - 1,
1117						   rp, 0, &key, sbi->mntflag);
1118			} else {
1119				/* next to leftmost entry of
1120				   lowest internal level */
1121
1122				/* compute uppercase router key */
1123				dtGetKey(rp, 0, &key, sbi->mntflag);
1124				key.name[key.namlen] = 0;
1125
1126				if ((sbi->mntflag & JFS_OS2) == JFS_OS2)
1127					ciToUpper(&key);
1128			}
1129
1130			n = NDTINTERNAL(key.namlen);
1131			break;
1132
1133		case BT_INTERNAL:
1134			dtGetKey(rp, 0, &key, sbi->mntflag);
1135			n = NDTINTERNAL(key.namlen);
1136			break;
1137
1138		default:
1139			jERROR(2, ("dtSplitUp(): UFO!\n"));
1140			break;
1141		}
1142
1143		/* unpin left child page */
1144		DT_PUTPAGE(lmp);
1145
1146		/*
1147		 * compute the data for the router entry
1148		 */
1149		data->xd = rpxd;	/* child page xd */
1150
1151		/*
1152		 * parent page is full - split the parent page
1153		 */
1154		if (n > sp->header.freecnt) {
1155			/* init for parent page split */
1156			split->mp = smp;
1157			split->index = skip;	/* index at insert */
1158			split->nslot = n;
1159			split->key = &key;
1160			/* split->data = data; */
1161
1162			/* unpin right child page */
1163			DT_PUTPAGE(rmp);
1164
1165			/* The split routines insert the new entry,
1166			 * acquire txLock as appropriate.
1167			 * return <rp> pinned and its block number <rbn>.
1168			 */
1169			rc = (sp->header.flag & BT_ROOT) ?
1170			    dtSplitRoot(tid, ip, split, &rmp) :
1171			    dtSplitPage(tid, ip, split, &rmp, &rp, &rpxd);
1172			if (rc) {
1173				DT_PUTPAGE(smp);
1174				goto splitOut;
1175			}
1176
1177			/* smp and rmp are pinned */
1178		}
1179		/*
1180		 * parent page is not full - insert router entry in parent page
1181		 */
1182		else {
1183			BT_MARK_DIRTY(smp, ip);
1184			/*
1185			 * acquire a transaction lock on the parent page
1186			 */
1187			tlck = txLock(tid, ip, smp, tlckDTREE | tlckENTRY);
1188			dtlck = (struct dt_lock *) & tlck->lock;
1189			ASSERT(dtlck->index == 0);
1190			lv = & dtlck->lv[0];
1191
1192			/* linelock header */
1193			lv->offset = 0;
1194			lv->length = 1;
1195			dtlck->index++;
1196
1197			/* linelock stbl of non-root parent page */
1198			if (!(sp->header.flag & BT_ROOT)) {
1199				lv++;
1200				n = skip >> L2DTSLOTSIZE;
1201				lv->offset = sp->header.stblindex + n;
1202				lv->length =
1203				    ((sp->header.nextindex -
1204				      1) >> L2DTSLOTSIZE) - n + 1;
1205				dtlck->index++;
1206			}
1207
1208			dtInsertEntry(sp, skip, &key, data, &dtlck);
1209
1210			/* exit propagate up */
1211			break;
1212		}
1213	}
1214
1215	/* unpin current split and its right page */
1216	DT_PUTPAGE(smp);
1217	DT_PUTPAGE(rmp);
1218
1219	/*
1220	 * free remaining extents allocated for split
1221	 */
1222      splitOut:
1223	n = pxdlist.npxd;
1224	pxd = &pxdlist.pxd[n];
1225	for (; n < pxdlist.maxnpxd; n++, pxd++)
1226		dbFree(ip, addressPXD(pxd), (s64) lengthPXD(pxd));
1227
1228      freeKeyName:
1229	kfree(key.name);
1230
1231      dtSplitUp_Exit:
1232
1233	return rc;
1234}
1235
1236
1237/*
1238 *	dtSplitPage()
1239 *
1240 * function: Split a non-root page of a btree.
1241 *
1242 * parameter:
1243 *
1244 * return: 0 - success;
1245 *	   errno - failure;
1246 *	return split and new page pinned;
1247 */
1248static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split,
1249	    struct metapage ** rmpp, dtpage_t ** rpp, pxd_t * rpxdp)
1250{
1251	struct super_block *sb = ip->i_sb;
1252	int rc = 0;
1253	struct metapage *smp;
1254	dtpage_t *sp;
1255	struct metapage *rmp;
1256	dtpage_t *rp;		/* new right page allocated */
1257	s64 rbn;		/* new right page block number */
1258	struct metapage *mp;
1259	dtpage_t *p;
1260	s64 nextbn;
1261	struct pxdlist *pxdlist;
1262	pxd_t *pxd;
1263	int skip, nextindex, half, left, nxt, off, si;
1264	struct ldtentry *ldtentry;
1265	struct idtentry *idtentry;
1266	u8 *stbl;
1267	struct dtslot *f;
1268	int fsi, stblsize;
1269	int n;
1270	struct dt_lock *sdtlck, *rdtlck;
1271	struct tlock *tlck;
1272	struct dt_lock *dtlck;
1273	struct lv *slv, *rlv, *lv;
1274
1275	/* get split page */
1276	smp = split->mp;
1277	sp = DT_PAGE(ip, smp);
1278
1279	/*
1280	 * allocate the new right page for the split
1281	 */
1282	pxdlist = split->pxdlist;
1283	pxd = &pxdlist->pxd[pxdlist->npxd];
1284	pxdlist->npxd++;
1285	rbn = addressPXD(pxd);
1286	rmp = get_metapage(ip, rbn, PSIZE, 1);
1287	if (rmp == NULL)
1288		return EIO;
1289
1290	jEVENT(0,
1291	       ("dtSplitPage: ip:0x%p smp:0x%p rmp:0x%p\n", ip, smp, rmp));
1292
1293	BT_MARK_DIRTY(rmp, ip);
1294	/*
1295	 * acquire a transaction lock on the new right page
1296	 */
1297	tlck = txLock(tid, ip, rmp, tlckDTREE | tlckNEW);
1298	rdtlck = (struct dt_lock *) & tlck->lock;
1299
1300	rp = (dtpage_t *) rmp->data;
1301	*rpp = rp;
1302	rp->header.self = *pxd;
1303
1304	BT_MARK_DIRTY(smp, ip);
1305	/*
1306	 * acquire a transaction lock on the split page
1307	 *
1308	 * action:
1309	 */
1310	tlck = txLock(tid, ip, smp, tlckDTREE | tlckENTRY);
1311	sdtlck = (struct dt_lock *) & tlck->lock;
1312
1313	/* linelock header of split page */
1314	ASSERT(sdtlck->index == 0);
1315	slv = & sdtlck->lv[0];
1316	slv->offset = 0;
1317	slv->length = 1;
1318	sdtlck->index++;
1319
1320	/*
1321	 * initialize/update sibling pointers between sp and rp
1322	 */
1323	nextbn = le64_to_cpu(sp->header.next);
1324	rp->header.next = cpu_to_le64(nextbn);
1325	rp->header.prev = cpu_to_le64(addressPXD(&sp->header.self));
1326	sp->header.next = cpu_to_le64(rbn);
1327
1328	/*
1329	 * initialize new right page
1330	 */
1331	rp->header.flag = sp->header.flag;
1332
1333	/* compute sorted entry table at start of extent data area */
1334	rp->header.nextindex = 0;
1335	rp->header.stblindex = 1;
1336
1337	n = PSIZE >> L2DTSLOTSIZE;
1338	rp->header.maxslot = n;
1339	stblsize = (n + 31) >> L2DTSLOTSIZE;	/* in unit of slot */
1340
1341	/* init freelist */
1342	fsi = rp->header.stblindex + stblsize;
1343	rp->header.freelist = fsi;
1344	rp->header.freecnt = rp->header.maxslot - fsi;
1345
1346	/*
1347	 *      sequential append at tail: append without split
1348	 *
1349	 * If splitting the last page on a level because of appending
1350	 * a entry to it (skip is maxentry), it's likely that the access is
1351	 * sequential. Adding an empty page on the side of the level is less
1352	 * work and can push the fill factor much higher than normal.
1353	 * If we're wrong it's no big deal, we'll just do the split the right
1354	 * way next time.
1355	 * (It may look like it's equally easy to do a similar hack for
1356	 * reverse sorted data, that is, split the tree left,
1357	 * but it's not. Be my guest.)
1358	 */
1359	if (nextbn == 0 && split->index == sp->header.nextindex) {
1360		/* linelock header + stbl (first slot) of new page */
1361		rlv = & rdtlck->lv[rdtlck->index];
1362		rlv->offset = 0;
1363		rlv->length = 2;
1364		rdtlck->index++;
1365
1366		/*
1367		 * initialize freelist of new right page
1368		 */
1369		f = &rp->slot[fsi];
1370		for (fsi++; fsi < rp->header.maxslot; f++, fsi++)
1371			f->next = fsi;
1372		f->next = -1;
1373
1374		/* insert entry at the first entry of the new right page */
1375		dtInsertEntry(rp, 0, split->key, split->data, &rdtlck);
1376
1377		goto out;
1378	}
1379
1380	/*
1381	 *      non-sequential insert (at possibly middle page)
1382	 */
1383
1384	/*
1385	 * update prev pointer of previous right sibling page;
1386	 */
1387	if (nextbn != 0) {
1388		DT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc);
1389		if (rc)
1390			return rc;
1391
1392		BT_MARK_DIRTY(mp, ip);
1393		/*
1394		 * acquire a transaction lock on the next page
1395		 */
1396		tlck = txLock(tid, ip, mp, tlckDTREE | tlckRELINK);
1397		jEVENT(0,
1398		       ("dtSplitPage: tlck = 0x%p, ip = 0x%p, mp=0x%p\n",
1399			tlck, ip, mp));
1400		dtlck = (struct dt_lock *) & tlck->lock;
1401
1402		/* linelock header of previous right sibling page */
1403		lv = & dtlck->lv[dtlck->index];
1404		lv->offset = 0;
1405		lv->length = 1;
1406		dtlck->index++;
1407
1408		p->header.prev = cpu_to_le64(rbn);
1409
1410		DT_PUTPAGE(mp);
1411	}
1412
1413	/*
1414	 * split the data between the split and right pages.
1415	 */
1416	skip = split->index;
1417	half = (PSIZE >> L2DTSLOTSIZE) >> 1;	/* swag */
1418	left = 0;
1419
1420	/*
1421	 *      compute fill factor for split pages
1422	 *
1423	 * <nxt> traces the next entry to move to rp
1424	 * <off> traces the next entry to stay in sp
1425	 */
1426	stbl = (u8 *) & sp->slot[sp->header.stblindex];
1427	nextindex = sp->header.nextindex;
1428	for (nxt = off = 0; nxt < nextindex; ++off) {
1429		if (off == skip)
1430			/* check for fill factor with new entry size */
1431			n = split->nslot;
1432		else {
1433			si = stbl[nxt];
1434			switch (sp->header.flag & BT_TYPE) {
1435			case BT_LEAF:
1436				ldtentry = (struct ldtentry *) & sp->slot[si];
1437				if (DO_INDEX(ip))
1438					n = NDTLEAF(ldtentry->namlen);
1439				else
1440					n = NDTLEAF_LEGACY(ldtentry->
1441							   namlen);
1442				break;
1443
1444			case BT_INTERNAL:
1445				idtentry = (struct idtentry *) & sp->slot[si];
1446				n = NDTINTERNAL(idtentry->namlen);
1447				break;
1448
1449			default:
1450				break;
1451			}
1452
1453			++nxt;	/* advance to next entry to move in sp */
1454		}
1455
1456		left += n;
1457		if (left >= half)
1458			break;
1459	}
1460
1461	/* <nxt> poins to the 1st entry to move */
1462
1463	/*
1464	 *      move entries to right page
1465	 *
1466	 * dtMoveEntry() initializes rp and reserves entry for insertion
1467	 *
1468	 * split page moved out entries are linelocked;
1469	 * new/right page moved in entries are linelocked;
1470	 */
1471	/* linelock header + stbl of new right page */
1472	rlv = & rdtlck->lv[rdtlck->index];
1473	rlv->offset = 0;
1474	rlv->length = 5;
1475	rdtlck->index++;
1476
1477	dtMoveEntry(sp, nxt, rp, &sdtlck, &rdtlck, DO_INDEX(ip));
1478
1479	sp->header.nextindex = nxt;
1480
1481	/*
1482	 * finalize freelist of new right page
1483	 */
1484	fsi = rp->header.freelist;
1485	f = &rp->slot[fsi];
1486	for (fsi++; fsi < rp->header.maxslot; f++, fsi++)
1487		f->next = fsi;
1488	f->next = -1;
1489
1490	/*
1491	 * Update directory index table for entries now in right page
1492	 */
1493	if ((rp->header.flag & BT_LEAF) && DO_INDEX(ip)) {
1494		mp = 0;
1495		stbl = DT_GETSTBL(rp);
1496		for (n = 0; n < rp->header.nextindex; n++) {
1497			ldtentry = (struct ldtentry *) & rp->slot[stbl[n]];
1498			modify_index(tid, ip, le32_to_cpu(ldtentry->index),
1499				     rbn, n, &mp);
1500		}
1501		if (mp)
1502			release_metapage(mp);
1503	}
1504
1505	/*
1506	 * the skipped index was on the left page,
1507	 */
1508	if (skip <= off) {
1509		/* insert the new entry in the split page */
1510		dtInsertEntry(sp, skip, split->key, split->data, &sdtlck);
1511
1512		/* linelock stbl of split page */
1513		if (sdtlck->index >= sdtlck->maxcnt)
1514			sdtlck = (struct dt_lock *) txLinelock(sdtlck);
1515		slv = & sdtlck->lv[sdtlck->index];
1516		n = skip >> L2DTSLOTSIZE;
1517		slv->offset = sp->header.stblindex + n;
1518		slv->length =
1519		    ((sp->header.nextindex - 1) >> L2DTSLOTSIZE) - n + 1;
1520		sdtlck->index++;
1521	}
1522	/*
1523	 * the skipped index was on the right page,
1524	 */
1525	else {
1526		/* adjust the skip index to reflect the new position */
1527		skip -= nxt;
1528
1529		/* insert the new entry in the right page */
1530		dtInsertEntry(rp, skip, split->key, split->data, &rdtlck);
1531	}
1532
1533      out:
1534	*rmpp = rmp;
1535	*rpxdp = *pxd;
1536
1537	ip->i_blocks += LBLK2PBLK(sb, lengthPXD(pxd));
1538
1539	jEVENT(0, ("dtSplitPage: ip:0x%p sp:0x%p rp:0x%p\n", ip, sp, rp));
1540	return 0;
1541}
1542
1543
1544/*
1545 *	dtExtendPage()
1546 *
1547 * function: extend 1st/only directory leaf page
1548 *
1549 * parameter:
1550 *
1551 * return: 0 - success;
1552 *	   errno - failure;
1553 *	return extended page pinned;
1554 */
1555static int dtExtendPage(tid_t tid,
1556	     struct inode *ip, struct dtsplit * split, struct btstack * btstack)
1557{
1558	struct super_block *sb = ip->i_sb;
1559	int rc;
1560	struct metapage *smp, *pmp, *mp;
1561	dtpage_t *sp, *pp;
1562	struct pxdlist *pxdlist;
1563	pxd_t *pxd, *tpxd;
1564	int xlen, xsize;
1565	int newstblindex, newstblsize;
1566	int oldstblindex, oldstblsize;
1567	int fsi, last;
1568	struct dtslot *f;
1569	struct btframe *parent;
1570	int n;
1571	struct dt_lock *dtlck;
1572	s64 xaddr, txaddr;
1573	struct tlock *tlck;
1574	struct pxd_lock *pxdlock;
1575	struct lv *lv;
1576	uint type;
1577	struct ldtentry *ldtentry;
1578	u8 *stbl;
1579
1580	/* get page to extend */
1581	smp = split->mp;
1582	sp = DT_PAGE(ip, smp);
1583
1584	/* get parent/root page */
1585	parent = BT_POP(btstack);
1586	DT_GETPAGE(ip, parent->bn, pmp, PSIZE, pp, rc);
1587	if (rc)
1588		return (rc);
1589
1590	/*
1591	 *      extend the extent
1592	 */
1593	pxdlist = split->pxdlist;
1594	pxd = &pxdlist->pxd[pxdlist->npxd];
1595	pxdlist->npxd++;
1596
1597	xaddr = addressPXD(pxd);
1598	tpxd = &sp->header.self;
1599	txaddr = addressPXD(tpxd);
1600	/* in-place extension */
1601	if (xaddr == txaddr) {
1602		type = tlckEXTEND;
1603	}
1604	/* relocation */
1605	else {
1606		type = tlckNEW;
1607
1608		/* save moved extent descriptor for later free */
1609		tlck = txMaplock(tid, ip, tlckDTREE | tlckRELOCATE);
1610		pxdlock = (struct pxd_lock *) & tlck->lock;
1611		pxdlock->flag = mlckFREEPXD;
1612		pxdlock->pxd = sp->header.self;
1613		pxdlock->index = 1;
1614
1615		/*
1616		 * Update directory index table to reflect new page address
1617		 */
1618		if (DO_INDEX(ip)) {
1619			mp = 0;
1620			stbl = DT_GETSTBL(sp);
1621			for (n = 0; n < sp->header.nextindex; n++) {
1622				ldtentry =
1623				    (struct ldtentry *) & sp->slot[stbl[n]];
1624				modify_index(tid, ip,
1625					     le32_to_cpu(ldtentry->index),
1626					     xaddr, n, &mp);
1627			}
1628			if (mp)
1629				release_metapage(mp);
1630		}
1631	}
1632
1633	/*
1634	 *      extend the page
1635	 */
1636	sp->header.self = *pxd;
1637
1638	jEVENT(0,
1639	       ("dtExtendPage: ip:0x%p smp:0x%p sp:0x%p\n", ip, smp, sp));
1640
1641	BT_MARK_DIRTY(smp, ip);
1642	/*
1643	 * acquire a transaction lock on the extended/leaf page
1644	 */
1645	tlck = txLock(tid, ip, smp, tlckDTREE | type);
1646	dtlck = (struct dt_lock *) & tlck->lock;
1647	lv = & dtlck->lv[0];
1648
1649	/* update buffer extent descriptor of extended page */
1650	xlen = lengthPXD(pxd);
1651	xsize = xlen << JFS_SBI(sb)->l2bsize;
1652#ifdef _STILL_TO_PORT
1653	bmSetXD(smp, xaddr, xsize);
1654#endif				/*  _STILL_TO_PORT */
1655
1656	/*
1657	 * copy old stbl to new stbl at start of extended area
1658	 */
1659	oldstblindex = sp->header.stblindex;
1660	oldstblsize = (sp->header.maxslot + 31) >> L2DTSLOTSIZE;
1661	newstblindex = sp->header.maxslot;
1662	n = xsize >> L2DTSLOTSIZE;
1663	newstblsize = (n + 31) >> L2DTSLOTSIZE;
1664	memcpy(&sp->slot[newstblindex], &sp->slot[oldstblindex],
1665	       sp->header.nextindex);
1666
1667	/*
1668	 * in-line extension: linelock old area of extended page
1669	 */
1670	if (type == tlckEXTEND) {
1671		/* linelock header */
1672		lv->offset = 0;
1673		lv->length = 1;
1674		dtlck->index++;
1675		lv++;
1676
1677		/* linelock new stbl of extended page */
1678		lv->offset = newstblindex;
1679		lv->length = newstblsize;
1680	}
1681	/*
1682	 * relocation: linelock whole relocated area
1683	 */
1684	else {
1685		lv->offset = 0;
1686		lv->length = sp->header.maxslot + newstblsize;
1687	}
1688
1689	dtlck->index++;
1690
1691	sp->header.maxslot = n;
1692	sp->header.stblindex = newstblindex;
1693	/* sp->header.nextindex remains the same */
1694
1695	/*
1696	 * add old stbl region at head of freelist
1697	 */
1698	fsi = oldstblindex;
1699	f = &sp->slot[fsi];
1700	last = sp->header.freelist;
1701	for (n = 0; n < oldstblsize; n++, fsi++, f++) {
1702		f->next = last;
1703		last = fsi;
1704	}
1705	sp->header.freelist = last;
1706	sp->header.freecnt += oldstblsize;
1707
1708	/*
1709	 * append free region of newly extended area at tail of freelist
1710	 */
1711	/* init free region of newly extended area */
1712	fsi = n = newstblindex + newstblsize;
1713	f = &sp->slot[fsi];
1714	for (fsi++; fsi < sp->header.maxslot; f++, fsi++)
1715		f->next = fsi;
1716	f->next = -1;
1717
1718	/* append new free region at tail of old freelist */
1719	fsi = sp->header.freelist;
1720	if (fsi == -1)
1721		sp->header.freelist = n;
1722	else {
1723		do {
1724			f = &sp->slot[fsi];
1725			fsi = f->next;
1726		} while (fsi != -1);
1727
1728		f->next = n;
1729	}
1730
1731	sp->header.freecnt += sp->header.maxslot - n;
1732
1733	/*
1734	 * insert the new entry
1735	 */
1736	dtInsertEntry(sp, split->index, split->key, split->data, &dtlck);
1737
1738	BT_MARK_DIRTY(pmp, ip);
1739	/*
1740	 * linelock any freeslots residing in old extent
1741	 */
1742	if (type == tlckEXTEND) {
1743		n = sp->header.maxslot >> 2;
1744		if (sp->header.freelist < n)
1745			dtLinelockFreelist(sp, n, &dtlck);
1746	}
1747
1748	/*
1749	 *      update parent entry on the parent/root page
1750	 */
1751	/*
1752	 * acquire a transaction lock on the parent/root page
1753	 */
1754	tlck = txLock(tid, ip, pmp, tlckDTREE | tlckENTRY);
1755	dtlck = (struct dt_lock *) & tlck->lock;
1756	lv = & dtlck->lv[dtlck->index];
1757
1758	/* linelock parent entry - 1st slot */
1759	lv->offset = 1;
1760	lv->length = 1;
1761	dtlck->index++;
1762
1763	/* update the parent pxd for page extension */
1764	tpxd = (pxd_t *) & pp->slot[1];
1765	*tpxd = *pxd;
1766
1767	/* Since the directory might have an EA and/or ACL associated with it
1768	 * we need to make sure we take that into account when setting the
1769	 * i_nblocks
1770	 */
1771	ip->i_blocks = LBLK2PBLK(ip->i_sb, xlen +
1772				 ((JFS_IP(ip)->ea.flag & DXD_EXTENT) ?
1773				  lengthDXD(&JFS_IP(ip)->ea) : 0) +
1774				 ((JFS_IP(ip)->acl.flag & DXD_EXTENT) ?
1775				  lengthDXD(&JFS_IP(ip)->acl) : 0));
1776
1777	jEVENT(0,
1778	       ("dtExtendPage: ip:0x%p smp:0x%p sp:0x%p\n", ip, smp, sp));
1779
1780
1781	DT_PUTPAGE(pmp);
1782	return 0;
1783}
1784
1785
1786/*
1787 *	dtSplitRoot()
1788 *
1789 * function:
1790 *	split the full root page into
1791 *	original/root/split page and new right page
1792 *	i.e., root remains fixed in tree anchor (inode) and
1793 *	the root is copied to a single new right child page
1794 *	since root page << non-root page, and
1795 *	the split root page contains a single entry for the
1796 *	new right child page.
1797 *
1798 * parameter:
1799 *
1800 * return: 0 - success;
1801 *	   errno - failure;
1802 *	return new page pinned;
1803 */
1804static int dtSplitRoot(tid_t tid,
1805	    struct inode *ip, struct dtsplit * split, struct metapage ** rmpp)
1806{
1807	struct super_block *sb = ip->i_sb;
1808	struct metapage *smp;
1809	dtroot_t *sp;
1810	struct metapage *rmp;
1811	dtpage_t *rp;
1812	s64 rbn;
1813	int xlen;
1814	int xsize;
1815	struct dtslot *f;
1816	s8 *stbl;
1817	int fsi, stblsize, n;
1818	struct idtentry *s;
1819	pxd_t *ppxd;
1820	struct pxdlist *pxdlist;
1821	pxd_t *pxd;
1822	struct dt_lock *dtlck;
1823	struct tlock *tlck;
1824	struct lv *lv;
1825
1826	/* get split root page */
1827	smp = split->mp;
1828	sp = &JFS_IP(ip)->i_dtroot;
1829
1830	/*
1831	 *      allocate/initialize a single (right) child page
1832	 *
1833	 * N.B. at first split, a one (or two) block to fit new entry
1834	 * is allocated; at subsequent split, a full page is allocated;
1835	 */
1836	pxdlist = split->pxdlist;
1837	pxd = &pxdlist->pxd[pxdlist->npxd];
1838	pxdlist->npxd++;
1839	rbn = addressPXD(pxd);
1840	xlen = lengthPXD(pxd);
1841	xsize = xlen << JFS_SBI(sb)->l2bsize;
1842	rmp = get_metapage(ip, rbn, xsize, 1);
1843	rp = rmp->data;
1844
1845	BT_MARK_DIRTY(rmp, ip);
1846	/*
1847	 * acquire a transaction lock on the new right page
1848	 */
1849	tlck = txLock(tid, ip, rmp, tlckDTREE | tlckNEW);
1850	dtlck = (struct dt_lock *) & tlck->lock;
1851
1852	rp->header.flag =
1853	    (sp->header.flag & BT_LEAF) ? BT_LEAF : BT_INTERNAL;
1854	rp->header.self = *pxd;
1855
1856	/* initialize sibling pointers */
1857	rp->header.next = 0;
1858	rp->header.prev = 0;
1859
1860	/*
1861	 *      move in-line root page into new right page extent
1862	 */
1863	/* linelock header + copied entries + new stbl (1st slot) in new page */
1864	ASSERT(dtlck->index == 0);
1865	lv = & dtlck->lv[0];
1866	lv->offset = 0;
1867	lv->length = 10;	/* 1 + 8 + 1 */
1868	dtlck->index++;
1869
1870	n = xsize >> L2DTSLOTSIZE;
1871	rp->header.maxslot = n;
1872	stblsize = (n + 31) >> L2DTSLOTSIZE;
1873
1874	/* copy old stbl to new stbl at start of extended area */
1875	rp->header.stblindex = DTROOTMAXSLOT;
1876	stbl = (s8 *) & rp->slot[DTROOTMAXSLOT];
1877	memcpy(stbl, sp->header.stbl, sp->header.nextindex);
1878	rp->header.nextindex = sp->header.nextindex;
1879
1880	/* copy old data area to start of new data area */
1881	memcpy(&rp->slot[1], &sp->slot[1], IDATASIZE);
1882
1883	/*
1884	 * append free region of newly extended area at tail of freelist
1885	 */
1886	/* init free region of newly extended area */
1887	fsi = n = DTROOTMAXSLOT + stblsize;
1888	f = &rp->slot[fsi];
1889	for (fsi++; fsi < rp->header.maxslot; f++, fsi++)
1890		f->next = fsi;
1891	f->next = -1;
1892
1893	/* append new free region at tail of old freelist */
1894	fsi = sp->header.freelist;
1895	if (fsi == -1)
1896		rp->header.freelist = n;
1897	else {
1898		rp->header.freelist = fsi;
1899
1900		do {
1901			f = &rp->slot[fsi];
1902			fsi = f->next;
1903		} while (fsi != -1);
1904
1905		f->next = n;
1906	}
1907
1908	rp->header.freecnt = sp->header.freecnt + rp->header.maxslot - n;
1909
1910	/*
1911	 * Update directory index table for entries now in right page
1912	 */
1913	if ((rp->header.flag & BT_LEAF) && DO_INDEX(ip)) {
1914		struct metapage *mp = 0;
1915		struct ldtentry *ldtentry;
1916
1917		stbl = DT_GETSTBL(rp);
1918		for (n = 0; n < rp->header.nextindex; n++) {
1919			ldtentry = (struct ldtentry *) & rp->slot[stbl[n]];
1920			modify_index(tid, ip, le32_to_cpu(ldtentry->index),
1921				     rbn, n, &mp);
1922		}
1923		if (mp)
1924			release_metapage(mp);
1925	}
1926	/*
1927	 * insert the new entry into the new right/child page
1928	 * (skip index in the new right page will not change)
1929	 */
1930	dtInsertEntry(rp, split->index, split->key, split->data, &dtlck);
1931
1932	/*
1933	 *      reset parent/root page
1934	 *
1935	 * set the 1st entry offset to 0, which force the left-most key
1936	 * at any level of the tree to be less than any search key.
1937	 *
1938	 * The btree comparison code guarantees that the left-most key on any
1939	 * level of the tree is never used, so it doesn't need to be filled in.
1940	 */
1941	BT_MARK_DIRTY(smp, ip);
1942	/*
1943	 * acquire a transaction lock on the root page (in-memory inode)
1944	 */
1945	tlck = txLock(tid, ip, smp, tlckDTREE | tlckNEW | tlckBTROOT);
1946	dtlck = (struct dt_lock *) & tlck->lock;
1947
1948	/* linelock root */
1949	ASSERT(dtlck->index == 0);
1950	lv = & dtlck->lv[0];
1951	lv->offset = 0;
1952	lv->length = DTROOTMAXSLOT;
1953	dtlck->index++;
1954
1955	/* update page header of root */
1956	if (sp->header.flag & BT_LEAF) {
1957		sp->header.flag &= ~BT_LEAF;
1958		sp->header.flag |= BT_INTERNAL;
1959	}
1960
1961	/* init the first entry */
1962	s = (struct idtentry *) & sp->slot[DTENTRYSTART];
1963	ppxd = (pxd_t *) s;
1964	*ppxd = *pxd;
1965	s->next = -1;
1966	s->namlen = 0;
1967
1968	stbl = sp->header.stbl;
1969	stbl[0] = DTENTRYSTART;
1970	sp->header.nextindex = 1;
1971
1972	/* init freelist */
1973	fsi = DTENTRYSTART + 1;
1974	f = &sp->slot[fsi];
1975
1976	/* init free region of remaining area */
1977	for (fsi++; fsi < DTROOTMAXSLOT; f++, fsi++)
1978		f->next = fsi;
1979	f->next = -1;
1980
1981	sp->header.freelist = DTENTRYSTART + 1;
1982	sp->header.freecnt = DTROOTMAXSLOT - (DTENTRYSTART + 1);
1983
1984	*rmpp = rmp;
1985
1986	ip->i_blocks += LBLK2PBLK(ip->i_sb, lengthPXD(pxd));
1987	return 0;
1988}
1989
1990
1991/*
1992 *	dtDelete()
1993 *
1994 * function: delete the entry(s) referenced by a key.
1995 *
1996 * parameter:
1997 *
1998 * return:
1999 */
2000int dtDelete(tid_t tid,
2001	 struct inode *ip, struct component_name * key, ino_t * ino, int flag)
2002{
2003	int rc = 0;
2004	s64 bn;
2005	struct metapage *mp, *imp;
2006	dtpage_t *p;
2007	int index;
2008	struct btstack btstack;
2009	struct dt_lock *dtlck;
2010	struct tlock *tlck;
2011	struct lv *lv;
2012	int i;
2013	struct ldtentry *ldtentry;
2014	u8 *stbl;
2015	u32 table_index, next_index;
2016	struct metapage *nmp;
2017	dtpage_t *np;
2018
2019	/*
2020	 *      search for the entry to delete:
2021	 *
2022	 * dtSearch() returns (leaf page pinned, index at which to delete).
2023	 */
2024	if ((rc = dtSearch(ip, key, ino, &btstack, flag)))
2025		return rc;
2026
2027	/* retrieve search result */
2028	DT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
2029
2030	/*
2031	 * We need to find put the index of the next entry into the
2032	 * directory index table in order to resume a readdir from this
2033	 * entry.
2034	 */
2035	if (DO_INDEX(ip)) {
2036		stbl = DT_GETSTBL(p);
2037		ldtentry = (struct ldtentry *) & p->slot[stbl[index]];
2038		table_index = le32_to_cpu(ldtentry->index);
2039		if (index == (p->header.nextindex - 1)) {
2040			/*
2041			 * Last entry in this leaf page
2042			 */
2043			if ((p->header.flag & BT_ROOT)
2044			    || (p->header.next == 0))
2045				next_index = -1;
2046			else {
2047				/* Read next leaf page */
2048				DT_GETPAGE(ip, le64_to_cpu(p->header.next),
2049					   nmp, PSIZE, np, rc);
2050				if (rc)
2051					next_index = -1;
2052				else {
2053					stbl = DT_GETSTBL(np);
2054					ldtentry =
2055					    (struct ldtentry *) & np->
2056					    slot[stbl[0]];
2057					next_index =
2058					    le32_to_cpu(ldtentry->index);
2059					DT_PUTPAGE(nmp);
2060				}
2061			}
2062		} else {
2063			ldtentry =
2064			    (struct ldtentry *) & p->slot[stbl[index + 1]];
2065			next_index = le32_to_cpu(ldtentry->index);
2066		}
2067		free_index(tid, ip, table_index, next_index);
2068	}
2069	/*
2070	 * the leaf page becomes empty, delete the page
2071	 */
2072	if (p->header.nextindex == 1) {
2073		/* delete empty page */
2074		rc = dtDeleteUp(tid, ip, mp, p, &btstack);
2075	}
2076	/*
2077	 * the leaf page has other entries remaining:
2078	 *
2079	 * delete the entry from the leaf page.
2080	 */
2081	else {
2082		BT_MARK_DIRTY(mp, ip);
2083		/*
2084		 * acquire a transaction lock on the leaf page
2085		 */
2086		tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY);
2087		dtlck = (struct dt_lock *) & tlck->lock;
2088
2089		/*
2090		 * Do not assume that dtlck->index will be zero.  During a
2091		 * rename within a directory, this transaction may have
2092		 * modified this page already when adding the new entry.
2093		 */
2094
2095		/* linelock header */
2096		if (dtlck->index >= dtlck->maxcnt)
2097			dtlck = (struct dt_lock *) txLinelock(dtlck);
2098		lv = & dtlck->lv[dtlck->index];
2099		lv->offset = 0;
2100		lv->length = 1;
2101		dtlck->index++;
2102
2103		/* linelock stbl of non-root leaf page */
2104		if (!(p->header.flag & BT_ROOT)) {
2105			if (dtlck->index >= dtlck->maxcnt)
2106				dtlck = (struct dt_lock *) txLinelock(dtlck);
2107			lv = & dtlck->lv[dtlck->index];
2108			i = index >> L2DTSLOTSIZE;
2109			lv->offset = p->header.stblindex + i;
2110			lv->length =
2111			    ((p->header.nextindex - 1) >> L2DTSLOTSIZE) -
2112			    i + 1;
2113			dtlck->index++;
2114		}
2115
2116		/* free the leaf entry */
2117		dtDeleteEntry(p, index, &dtlck);
2118
2119		/*
2120		 * Update directory index table for entries moved in stbl
2121		 */
2122		if (DO_INDEX(ip) && index < p->header.nextindex) {
2123			imp = 0;
2124			stbl = DT_GETSTBL(p);
2125			for (i = index; i < p->header.nextindex; i++) {
2126				ldtentry =
2127				    (struct ldtentry *) & p->slot[stbl[i]];
2128				modify_index(tid, ip,
2129					     le32_to_cpu(ldtentry->index),
2130					     bn, i, &imp);
2131			}
2132			if (imp)
2133				release_metapage(imp);
2134		}
2135
2136		DT_PUTPAGE(mp);
2137	}
2138
2139	return rc;
2140}
2141
2142
2143/*
2144 *	dtDeleteUp()
2145 *
2146 * function:
2147 *	free empty pages as propagating deletion up the tree
2148 *
2149 * parameter:
2150 *
2151 * return:
2152 */
2153static int dtDeleteUp(tid_t tid, struct inode *ip,
2154	   struct metapage * fmp, dtpage_t * fp, struct btstack * btstack)
2155{
2156	int rc = 0;
2157	struct metapage *mp;
2158	dtpage_t *p;
2159	int index, nextindex;
2160	int xlen;
2161	struct btframe *parent;
2162	struct dt_lock *dtlck;
2163	struct tlock *tlck;
2164	struct lv *lv;
2165	struct pxd_lock *pxdlock;
2166	int i;
2167
2168	/*
2169	 *      keep the root leaf page which has become empty
2170	 */
2171	if (BT_IS_ROOT(fmp)) {
2172		/*
2173		 * reset the root
2174		 *
2175		 * dtInitRoot() acquires txlock on the root
2176		 */
2177		dtInitRoot(tid, ip, PARENT(ip));
2178
2179		DT_PUTPAGE(fmp);
2180
2181		return 0;
2182	}
2183
2184	/*
2185	 *      free the non-root leaf page
2186	 */
2187	/*
2188	 * acquire a transaction lock on the page
2189	 *
2190	 * write FREEXTENT|NOREDOPAGE log record
2191	 * N.B. linelock is overlaid as freed extent descriptor, and
2192	 * the buffer page is freed;
2193	 */
2194	tlck = txMaplock(tid, ip, tlckDTREE | tlckFREE);
2195	pxdlock = (struct pxd_lock *) & tlck->lock;
2196	pxdlock->flag = mlckFREEPXD;
2197	pxdlock->pxd = fp->header.self;
2198	pxdlock->index = 1;
2199
2200	/* update sibling pointers */
2201	if ((rc = dtRelink(tid, ip, fp)))
2202		return rc;
2203
2204	xlen = lengthPXD(&fp->header.self);
2205	ip->i_blocks -= LBLK2PBLK(ip->i_sb, xlen);
2206
2207	/* free/invalidate its buffer page */
2208	discard_metapage(fmp);
2209
2210	/*
2211	 *      propagate page deletion up the directory tree
2212	 *
2213	 * If the delete from the parent page makes it empty,
2214	 * continue all the way up the tree.
2215	 * stop if the root page is reached (which is never deleted) or
2216	 * if the entry deletion does not empty the page.
2217	 */
2218	while ((parent = BT_POP(btstack)) != NULL) {
2219		/* pin the parent page <sp> */
2220		DT_GETPAGE(ip, parent->bn, mp, PSIZE, p, rc);
2221		if (rc)
2222			return rc;
2223
2224		/*
2225		 * free the extent of the child page deleted
2226		 */
2227		index = parent->index;
2228
2229		/*
2230		 * delete the entry for the child page from parent
2231		 */
2232		nextindex = p->header.nextindex;
2233
2234		/*
2235		 * the parent has the single entry being deleted:
2236		 *
2237		 * free the parent page which has become empty.
2238		 */
2239		if (nextindex == 1) {
2240			/*
2241			 * keep the root internal page which has become empty
2242			 */
2243			if (p->header.flag & BT_ROOT) {
2244				/*
2245				 * reset the root
2246				 *
2247				 * dtInitRoot() acquires txlock on the root
2248				 */
2249				dtInitRoot(tid, ip, PARENT(ip));
2250
2251				DT_PUTPAGE(mp);
2252
2253				return 0;
2254			}
2255			/*
2256			 * free the parent page
2257			 */
2258			else {
2259				/*
2260				 * acquire a transaction lock on the page
2261				 *
2262				 * write FREEXTENT|NOREDOPAGE log record
2263				 */
2264				tlck =
2265				    txMaplock(tid, ip,
2266					      tlckDTREE | tlckFREE);
2267				pxdlock = (struct pxd_lock *) & tlck->lock;
2268				pxdlock->flag = mlckFREEPXD;
2269				pxdlock->pxd = p->header.self;
2270				pxdlock->index = 1;
2271
2272				/* update sibling pointers */
2273				if ((rc = dtRelink(tid, ip, p)))
2274					return rc;
2275
2276				xlen = lengthPXD(&p->header.self);
2277				ip->i_blocks -= LBLK2PBLK(ip->i_sb, xlen);
2278
2279				/* free/invalidate its buffer page */
2280				discard_metapage(mp);
2281
2282				/* propagate up */
2283				continue;
2284			}
2285		}
2286
2287		/*
2288		 * the parent has other entries remaining:
2289		 *
2290		 * delete the router entry from the parent page.
2291		 */
2292		BT_MARK_DIRTY(mp, ip);
2293		/*
2294		 * acquire a transaction lock on the page
2295		 *
2296		 * action: router entry deletion
2297		 */
2298		tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY);
2299		dtlck = (struct dt_lock *) & tlck->lock;
2300
2301		/* linelock header */
2302		if (dtlck->index >= dtlck->maxcnt)
2303			dtlck = (struct dt_lock *) txLinelock(dtlck);
2304		lv = & dtlck->lv[dtlck->index];
2305		lv->offset = 0;
2306		lv->length = 1;
2307		dtlck->index++;
2308
2309		/* linelock stbl of non-root leaf page */
2310		if (!(p->header.flag & BT_ROOT)) {
2311			if (dtlck->index < dtlck->maxcnt)
2312				lv++;
2313			else {
2314				dtlck = (struct dt_lock *) txLinelock(dtlck);
2315				lv = & dtlck->lv[0];
2316			}
2317			i = index >> L2DTSLOTSIZE;
2318			lv->offset = p->header.stblindex + i;
2319			lv->length =
2320			    ((p->header.nextindex - 1) >> L2DTSLOTSIZE) -
2321			    i + 1;
2322			dtlck->index++;
2323		}
2324
2325		/* free the router entry */
2326		dtDeleteEntry(p, index, &dtlck);
2327
2328		/* reset key of new leftmost entry of level (for consistency) */
2329		if (index == 0 &&
2330		    ((p->header.flag & BT_ROOT) || p->header.prev == 0))
2331			dtTruncateEntry(p, 0, &dtlck);
2332
2333		/* unpin the parent page */
2334		DT_PUTPAGE(mp);
2335
2336		/* exit propagation up */
2337		break;
2338	}
2339
2340	return 0;
2341}
2342
2343
2344/*
2345 * NAME:        dtRelocate()
2346 *
2347 * FUNCTION:    relocate dtpage (internal or leaf) of directory;
2348 *              This function is mainly used by defragfs utility.
2349 */
2350int dtRelocate(tid_t tid, struct inode *ip, s64 lmxaddr, pxd_t * opxd,
2351	       s64 nxaddr)
2352{
2353	int rc = 0;
2354	struct metapage *mp, *pmp, *lmp, *rmp;
2355	dtpage_t *p, *pp, *rp = 0, *lp= 0;
2356	s64 bn;
2357	int index;
2358	struct btstack btstack;
2359	pxd_t *pxd;
2360	s64 oxaddr, nextbn, prevbn;
2361	int xlen, xsize;
2362	struct tlock *tlck;
2363	struct dt_lock *dtlck;
2364	struct pxd_lock *pxdlock;
2365	s8 *stbl;
2366	struct lv *lv;
2367
2368	oxaddr = addressPXD(opxd);
2369	xlen = lengthPXD(opxd);
2370
2371	jEVENT(0, ("dtRelocate: lmxaddr:%Ld xaddr:%Ld:%Ld xlen:%d\n",
2372		   (long long)lmxaddr, (long long)oxaddr, (long long)nxaddr,
2373		   xlen));
2374
2375	/*
2376	 *      1. get the internal parent dtpage covering
2377	 *      router entry for the tartget page to be relocated;
2378	 */
2379	rc = dtSearchNode(ip, lmxaddr, opxd, &btstack);
2380	if (rc)
2381		return rc;
2382
2383	/* retrieve search result */
2384	DT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index);
2385	jEVENT(0, ("dtRelocate: parent router entry validated.\n"));
2386
2387	/*
2388	 *      2. relocate the target dtpage
2389	 */
2390	/* read in the target page from src extent */
2391	DT_GETPAGE(ip, oxaddr, mp, PSIZE, p, rc);
2392	if (rc) {
2393		/* release the pinned parent page */
2394		DT_PUTPAGE(pmp);
2395		return rc;
2396	}
2397
2398	/*
2399	 * read in sibling pages if any to update sibling pointers;
2400	 */
2401	rmp = NULL;
2402	if (p->header.next) {
2403		nextbn = le64_to_cpu(p->header.next);
2404		DT_GETPAGE(ip, nextbn, rmp, PSIZE, rp, rc);
2405		if (rc) {
2406			DT_PUTPAGE(mp);
2407			DT_PUTPAGE(pmp);
2408			return (rc);
2409		}
2410	}
2411
2412	lmp = NULL;
2413	if (p->header.prev) {
2414		prevbn = le64_to_cpu(p->header.prev);
2415		DT_GETPAGE(ip, prevbn, lmp, PSIZE, lp, rc);
2416		if (rc) {
2417			DT_PUTPAGE(mp);
2418			DT_PUTPAGE(pmp);
2419			if (rmp)
2420				DT_PUTPAGE(rmp);
2421			return (rc);
2422		}
2423	}
2424
2425	/* at this point, all xtpages to be updated are in memory */
2426
2427	/*
2428	 * update sibling pointers of sibling dtpages if any;
2429	 */
2430	if (lmp) {
2431		tlck = txLock(tid, ip, lmp, tlckDTREE | tlckRELINK);
2432		dtlck = (struct dt_lock *) & tlck->lock;
2433		/* linelock header */
2434		ASSERT(dtlck->index == 0);
2435		lv = & dtlck->lv[0];
2436		lv->offset = 0;
2437		lv->length = 1;
2438		dtlck->index++;
2439
2440		lp->header.next = cpu_to_le64(nxaddr);
2441		DT_PUTPAGE(lmp);
2442	}
2443
2444	if (rmp) {
2445		tlck = txLock(tid, ip, rmp, tlckDTREE | tlckRELINK);
2446		dtlck = (struct dt_lock *) & tlck->lock;
2447		/* linelock header */
2448		ASSERT(dtlck->index == 0);
2449		lv = & dtlck->lv[0];
2450		lv->offset = 0;
2451		lv->length = 1;
2452		dtlck->index++;
2453
2454		rp->header.prev = cpu_to_le64(nxaddr);
2455		DT_PUTPAGE(rmp);
2456	}
2457
2458	/*
2459	 * update the target dtpage to be relocated
2460	 *
2461	 * write LOG_REDOPAGE of LOG_NEW type for dst page
2462	 * for the whole target page (logredo() will apply
2463	 * after image and update bmap for allocation of the
2464	 * dst extent), and update bmap for allocation of
2465	 * the dst extent;
2466	 */
2467	tlck = txLock(tid, ip, mp, tlckDTREE | tlckNEW);
2468	dtlck = (struct dt_lock *) & tlck->lock;
2469	/* linelock header */
2470	ASSERT(dtlck->index == 0);
2471	lv = & dtlck->lv[0];
2472
2473	/* update the self address in the dtpage header */
2474	pxd = &p->header.self;
2475	PXDaddress(pxd, nxaddr);
2476
2477	/* the dst page is the same as the src page, i.e.,
2478	 * linelock for afterimage of the whole page;
2479	 */
2480	lv->offset = 0;
2481	lv->length = p->header.maxslot;
2482	dtlck->index++;
2483
2484	/* update the buffer extent descriptor of the dtpage */
2485	xsize = xlen << JFS_SBI(ip->i_sb)->l2bsize;
2486#ifdef _STILL_TO_PORT
2487	bmSetXD(mp, nxaddr, xsize);
2488#endif /* _STILL_TO_PORT */
2489	/* unpin the relocated page */
2490	DT_PUTPAGE(mp);
2491	jEVENT(0, ("dtRelocate: target dtpage relocated.\n"));
2492
2493	/* the moved extent is dtpage, then a LOG_NOREDOPAGE log rec
2494	 * needs to be written (in logredo(), the LOG_NOREDOPAGE log rec
2495	 * will also force a bmap update ).
2496	 */
2497
2498	/*
2499	 *      3. acquire maplock for the source extent to be freed;
2500	 */
2501	/* for dtpage relocation, write a LOG_NOREDOPAGE record
2502	 * for the source dtpage (logredo() will init NoRedoPage
2503	 * filter and will also update bmap for free of the source
2504	 * dtpage), and upadte bmap for free of the source dtpage;
2505	 */
2506	tlck = txMaplock(tid, ip, tlckDTREE | tlckFREE);
2507	pxdlock = (struct pxd_lock *) & tlck->lock;
2508	pxdlock->flag = mlckFREEPXD;
2509	PXDaddress(&pxdlock->pxd, oxaddr);
2510	PXDlength(&pxdlock->pxd, xlen);
2511	pxdlock->index = 1;
2512
2513	/*
2514	 *      4. update the parent router entry for relocation;
2515	 *
2516	 * acquire tlck for the parent entry covering the target dtpage;
2517	 * write LOG_REDOPAGE to apply after image only;
2518	 */
2519	jEVENT(0, ("dtRelocate: update parent router entry.\n"));
2520	tlck = txLock(tid, ip, pmp, tlckDTREE | tlckENTRY);
2521	dtlck = (struct dt_lock *) & tlck->lock;
2522	lv = & dtlck->lv[dtlck->index];
2523
2524	/* update the PXD with the new address */
2525	stbl = DT_GETSTBL(pp);
2526	pxd = (pxd_t *) & pp->slot[stbl[index]];
2527	PXDaddress(pxd, nxaddr);
2528	lv->offset = stbl[index];
2529	lv->length = 1;
2530	dtlck->index++;
2531
2532	/* unpin the parent dtpage */
2533	DT_PUTPAGE(pmp);
2534
2535	return rc;
2536}
2537
2538
2539/*
2540 * NAME:	dtSearchNode()
2541 *
2542 * FUNCTION:	Search for an dtpage containing a specified address
2543 *              This function is mainly used by defragfs utility.
2544 *
2545 * NOTE:	Search result on stack, the found page is pinned at exit.
2546 *		The result page must be an internal dtpage.
2547 *		lmxaddr give the address of the left most page of the
2548 *		dtree level, in which the required dtpage resides.
2549 */
2550static int dtSearchNode(struct inode *ip, s64 lmxaddr, pxd_t * kpxd,
2551			struct btstack * btstack)
2552{
2553	int rc = 0;
2554	s64 bn;
2555	struct metapage *mp;
2556	dtpage_t *p;
2557	int psize = 288;	/* initial in-line directory */
2558	s8 *stbl;
2559	int i;
2560	pxd_t *pxd;
2561	struct btframe *btsp;
2562
2563	BT_CLR(btstack);	/* reset stack */
2564
2565	/*
2566	 *      descend tree to the level with specified leftmost page
2567	 *
2568	 *  by convention, root bn = 0.
2569	 */
2570	for (bn = 0;;) {
2571		/* get/pin the page to search */
2572		DT_GETPAGE(ip, bn, mp, psize, p, rc);
2573		if (rc)
2574			return rc;
2575
2576		/* does the xaddr of leftmost page of the levevl
2577		 * matches levevl search key ?
2578		 */
2579		if (p->header.flag & BT_ROOT) {
2580			if (lmxaddr == 0)
2581				break;
2582		} else if (addressPXD(&p->header.self) == lmxaddr)
2583			break;
2584
2585		/*
2586		 * descend down to leftmost child page
2587		 */
2588		if (p->header.flag & BT_LEAF)
2589			return ESTALE;
2590
2591		/* get the leftmost entry */
2592		stbl = DT_GETSTBL(p);
2593		pxd = (pxd_t *) & p->slot[stbl[0]];
2594
2595		/* get the child page block address */
2596		bn = addressPXD(pxd);
2597		psize = lengthPXD(pxd) << JFS_SBI(ip->i_sb)->l2bsize;
2598		/* unpin the parent page */
2599		DT_PUTPAGE(mp);
2600	}
2601
2602	/*
2603	 *      search each page at the current levevl
2604	 */
2605      loop:
2606	stbl = DT_GETSTBL(p);
2607	for (i = 0; i < p->header.nextindex; i++) {
2608		pxd = (pxd_t *) & p->slot[stbl[i]];
2609
2610		/* found the specified router entry */
2611		if (addressPXD(pxd) == addressPXD(kpxd) &&
2612		    lengthPXD(pxd) == lengthPXD(kpxd)) {
2613			btsp = btstack->top;
2614			btsp->bn = bn;
2615			btsp->index = i;
2616			btsp->mp = mp;
2617
2618			return 0;
2619		}
2620	}
2621
2622	/* get the right sibling page if any */
2623	if (p->header.next)
2624		bn = le64_to_cpu(p->header.next);
2625	else {
2626		DT_PUTPAGE(mp);
2627		return ESTALE;
2628	}
2629
2630	/* unpin current page */
2631	DT_PUTPAGE(mp);
2632
2633	/* get the right sibling page */
2634	DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
2635	if (rc)
2636		return rc;
2637
2638	goto loop;
2639}
2640
2641
2642/*
2643 *	dtRelink()
2644 *
2645 * function:
2646 *	link around a freed page.
2647 *
2648 * parameter:
2649 *	fp:	page to be freed
2650 *
2651 * return:
2652 */
2653static int dtRelink(tid_t tid, struct inode *ip, dtpage_t * p)
2654{
2655	int rc;
2656	struct metapage *mp;
2657	s64 nextbn, prevbn;
2658	struct tlock *tlck;
2659	struct dt_lock *dtlck;
2660	struct lv *lv;
2661
2662	nextbn = le64_to_cpu(p->header.next);
2663	prevbn = le64_to_cpu(p->header.prev);
2664
2665	/* update prev pointer of the next page */
2666	if (nextbn != 0) {
2667		DT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc);
2668		if (rc)
2669			return rc;
2670
2671		BT_MARK_DIRTY(mp, ip);
2672		/*
2673		 * acquire a transaction lock on the next page
2674		 *
2675		 * action: update prev pointer;
2676		 */
2677		tlck = txLock(tid, ip, mp, tlckDTREE | tlckRELINK);
2678		jEVENT(0,
2679		       ("dtRelink nextbn: tlck = 0x%p, ip = 0x%p, mp=0x%p\n",
2680			tlck, ip, mp));
2681		dtlck = (struct dt_lock *) & tlck->lock;
2682
2683		/* linelock header */
2684		if (dtlck->index >= dtlck->maxcnt)
2685			dtlck = (struct dt_lock *) txLinelock(dtlck);
2686		lv = & dtlck->lv[dtlck->index];
2687		lv->offset = 0;
2688		lv->length = 1;
2689		dtlck->index++;
2690
2691		p->header.prev = cpu_to_le64(prevbn);
2692		DT_PUTPAGE(mp);
2693	}
2694
2695	/* update next pointer of the previous page */
2696	if (prevbn != 0) {
2697		DT_GETPAGE(ip, prevbn, mp, PSIZE, p, rc);
2698		if (rc)
2699			return rc;
2700
2701		BT_MARK_DIRTY(mp, ip);
2702		/*
2703		 * acquire a transaction lock on the prev page
2704		 *
2705		 * action: update next pointer;
2706		 */
2707		tlck = txLock(tid, ip, mp, tlckDTREE | tlckRELINK);
2708		jEVENT(0,
2709		       ("dtRelink prevbn: tlck = 0x%p, ip = 0x%p, mp=0x%p\n",
2710			tlck, ip, mp));
2711		dtlck = (struct dt_lock *) & tlck->lock;
2712
2713		/* linelock header */
2714		if (dtlck->index >= dtlck->maxcnt)
2715			dtlck = (struct dt_lock *) txLinelock(dtlck);
2716		lv = & dtlck->lv[dtlck->index];
2717		lv->offset = 0;
2718		lv->length = 1;
2719		dtlck->index++;
2720
2721		p->header.next = cpu_to_le64(nextbn);
2722		DT_PUTPAGE(mp);
2723	}
2724
2725	return 0;
2726}
2727
2728
2729/*
2730 *	dtInitRoot()
2731 *
2732 * initialize directory root (inline in inode)
2733 */
2734void dtInitRoot(tid_t tid, struct inode *ip, u32 idotdot)
2735{
2736	struct jfs_inode_info *jfs_ip = JFS_IP(ip);
2737	dtroot_t *p;
2738	int fsi;
2739	struct dtslot *f;
2740	struct tlock *tlck;
2741	struct dt_lock *dtlck;
2742	struct lv *lv;
2743	u16 xflag_save;
2744
2745	/*
2746	 * If this was previously an non-empty directory, we need to remove
2747	 * the old directory table.
2748	 */
2749	if (DO_INDEX(ip)) {
2750		if (jfs_ip->next_index > (MAX_INLINE_DIRTABLE_ENTRY + 1)) {
2751			struct tblock *tblk = tid_to_tblock(tid);
2752			/*
2753			 * We're playing games with the tid's xflag.  If
2754			 * we're removing a regular file, the file's xtree
2755			 * is committed with COMMIT_PMAP, but we always
2756			 * commit the directories xtree with COMMIT_PWMAP.
2757			 */
2758			xflag_save = tblk->xflag;
2759			tblk->xflag = 0;
2760			/*
2761			 * xtTruncate isn't guaranteed to fully truncate
2762			 * the xtree.  The caller needs to check i_size
2763			 * after committing the transaction to see if
2764			 * additional truncation is needed.  The
2765			 * COMMIT_Stale flag tells caller that we
2766			 * initiated the truncation.
2767			 */
2768			xtTruncate(tid, ip, 0, COMMIT_PWMAP);
2769			set_cflag(COMMIT_Stale, ip);
2770
2771			tblk->xflag = xflag_save;
2772			/*
2773			 * Tells jfs_metapage code that the metadata pages
2774			 * for the index table are no longer useful, and
2775			 * remove them from page cache.
2776			 */
2777			invalidate_inode_metapages(ip);
2778		} else
2779			ip->i_size = 1;
2780
2781		jfs_ip->next_index = 2;
2782	} else
2783		ip->i_size = IDATASIZE;
2784
2785	/*
2786	 * acquire a transaction lock on the root
2787	 *
2788	 * action: directory initialization;
2789	 */
2790	tlck = txLock(tid, ip, (struct metapage *) & jfs_ip->bxflag,
2791		      tlckDTREE | tlckENTRY | tlckBTROOT);
2792	dtlck = (struct dt_lock *) & tlck->lock;
2793
2794	/* linelock root */
2795	ASSERT(dtlck->index == 0);
2796	lv = & dtlck->lv[0];
2797	lv->offset = 0;
2798	lv->length = DTROOTMAXSLOT;
2799	dtlck->index++;
2800
2801	p = &jfs_ip->i_dtroot;
2802
2803	p->header.flag = DXD_INDEX | BT_ROOT | BT_LEAF;
2804
2805	p->header.nextindex = 0;
2806
2807	/* init freelist */
2808	fsi = 1;
2809	f = &p->slot[fsi];
2810
2811	/* init data area of root */
2812	for (fsi++; fsi < DTROOTMAXSLOT; f++, fsi++)
2813		f->next = fsi;
2814	f->next = -1;
2815
2816	p->header.freelist = 1;
2817	p->header.freecnt = 8;
2818
2819	/* init '..' entry */
2820	p->header.idotdot = cpu_to_le32(idotdot);
2821
2822
2823	return;
2824}
2825
2826/*
2827 *	add_missing_indices()
2828 *
2829 * function: Fix dtree page in which one or more entries has an invalid index.
2830 *	     fsck.jfs should really fix this, but it currently does not.
2831 *	     Called from jfs_readdir when bad index is detected.
2832 */
2833static void add_missing_indices(struct inode *inode, s64 bn)
2834{
2835	struct ldtentry *d;
2836	struct dt_lock *dtlck;
2837	int i;
2838	uint index;
2839	struct lv *lv;
2840	struct metapage *mp;
2841	dtpage_t *p;
2842	int rc;
2843	s8 *stbl;
2844	tid_t tid;
2845	struct tlock *tlck;
2846
2847	tid = txBegin(inode->i_sb, 0);
2848
2849	DT_GETPAGE(inode, bn, mp, PSIZE, p, rc);
2850
2851	if (rc) {
2852		printk(KERN_ERR "DT_GETPAGE failed!\n");
2853		goto end;
2854	}
2855	BT_MARK_DIRTY(mp, inode);
2856
2857	ASSERT(p->header.flag & BT_LEAF);
2858
2859	tlck = txLock(tid, inode, mp, tlckDTREE | tlckENTRY);
2860	dtlck = (struct dt_lock *) &tlck->lock;
2861
2862	stbl = DT_GETSTBL(p);
2863	for (i = 0; i < p->header.nextindex; i++) {
2864		d = (struct ldtentry *) &p->slot[stbl[i]];
2865		index = le32_to_cpu(d->index);
2866		if ((index < 2) || (index >= JFS_IP(inode)->next_index)) {
2867			d->index = cpu_to_le32(add_index(tid, inode, bn, i));
2868			if (dtlck->index >= dtlck->maxcnt)
2869				dtlck = (struct dt_lock *) txLinelock(dtlck);
2870			lv = dtlck->lv;
2871			lv->offset = stbl[i];
2872			lv->length = 1;
2873			dtlck->index++;
2874		}
2875	}
2876
2877	DT_PUTPAGE(mp);
2878	(void) txCommit(tid, 1, &inode, 0);
2879end:
2880	txEnd(tid);
2881}
2882
2883/*
2884 * Buffer to hold directory entry info while traversing a dtree page
2885 * before being fed to the filldir function
2886 */
2887struct jfs_dirent {
2888	loff_t position;
2889	int ino;
2890	u16 name_len;
2891	char name[0];
2892};
2893
2894/*
2895 * function to determine next variable-sized jfs_dirent in buffer
2896 */
2897inline struct jfs_dirent *next_jfs_dirent(struct jfs_dirent *dirent)
2898{
2899	return (struct jfs_dirent *)
2900		((char *)dirent +
2901		 ((sizeof (struct jfs_dirent) + dirent->name_len + 1 +
2902		   sizeof (loff_t) - 1) &
2903		  ~(sizeof (loff_t) - 1)));
2904}
2905
2906/*
2907 *	jfs_readdir()
2908 *
2909 * function: read directory entries sequentially
2910 *	from the specified entry offset
2911 *
2912 * parameter:
2913 *
2914 * return: offset = (pn, index) of start entry
2915 *	of next jfs_readdir()/dtRead()
2916 */
2917int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
2918{
2919	struct inode *ip = filp->f_dentry->d_inode;
2920	struct nls_table *codepage = JFS_SBI(ip->i_sb)->nls_tab;
2921	int rc = 0;
2922	loff_t dtpos;	/* legacy OS/2 style position */
2923	struct dtoffset {
2924		s16 pn;
2925		s16 index;
2926		s32 unused;
2927	} *dtoffset = (struct dtoffset *) &dtpos;
2928	s64 bn;
2929	struct metapage *mp;
2930	dtpage_t *p;
2931	int index;
2932	s8 *stbl;
2933	struct btstack btstack;
2934	int i, next;
2935	struct ldtentry *d;
2936	struct dtslot *t;
2937	int d_namleft, len, outlen;
2938	unsigned long dirent_buf;
2939	char *name_ptr;
2940	int dtlhdrdatalen;
2941	u32 dir_index;
2942	int do_index = 0;
2943	uint loop_count = 0;
2944	struct jfs_dirent *jfs_dirent;
2945	int jfs_dirents;
2946	int overflow, fix_page, page_fixed = 0;
2947	static int unique_pos = 2;	/* If we can't fix broken index */
2948
2949	if (filp->f_pos == DIREND)
2950		return 0;
2951
2952	if (DO_INDEX(ip)) {
2953		/*
2954		 * persistent index is stored in directory entries.
2955		 * Special cases:        0 = .
2956		 *                       1 = ..
2957		 *                      -1 = End of directory
2958		 */
2959		do_index = 1;
2960		dtlhdrdatalen = DTLHDRDATALEN;
2961
2962		dir_index = (u32) filp->f_pos;
2963
2964		if (dir_index > 1) {
2965			struct dir_table_slot dirtab_slot;
2966
2967			if (dtEmpty(ip) ||
2968			    (dir_index >= JFS_IP(ip)->next_index)) {
2969				/* Stale position.  Directory has shrunk */
2970				filp->f_pos = DIREND;
2971				return 0;
2972			}
2973		      repeat:
2974			rc = read_index(ip, dir_index, &dirtab_slot);
2975			if (rc) {
2976				filp->f_pos = DIREND;
2977				return rc;
2978			}
2979			if (dirtab_slot.flag == DIR_INDEX_FREE) {
2980				if (loop_count++ > JFS_IP(ip)->next_index) {
2981					jERROR(1, ("jfs_readdir detected "
2982						   "infinite loop!\n"));
2983					filp->f_pos = DIREND;
2984					return 0;
2985				}
2986				dir_index = le32_to_cpu(dirtab_slot.addr2);
2987				if (dir_index == -1) {
2988					filp->f_pos = DIREND;
2989					return 0;
2990				}
2991				goto repeat;
2992			}
2993			bn = addressDTS(&dirtab_slot);
2994			index = dirtab_slot.slot;
2995			DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
2996			if (rc) {
2997				filp->f_pos = DIREND;
2998				return 0;
2999			}
3000			if (p->header.flag & BT_INTERNAL) {
3001				jERROR(1,("jfs_readdir: bad index table\n"));
3002				DT_PUTPAGE(mp);
3003				filp->f_pos = -1;
3004				return 0;
3005			}
3006		} else {
3007			if (dir_index == 0) {
3008				/*
3009				 * self "."
3010				 */
3011				filp->f_pos = 0;
3012				if (filldir(dirent, ".", 1, 0, ip->i_ino,
3013					    DT_DIR))
3014					return 0;
3015			}
3016			/*
3017			 * parent ".."
3018			 */
3019			filp->f_pos = 1;
3020			if (filldir(dirent, "..", 2, 1, PARENT(ip), DT_DIR))
3021				return 0;
3022
3023			/*
3024			 * Find first entry of left-most leaf
3025			 */
3026			if (dtEmpty(ip)) {
3027				filp->f_pos = DIREND;
3028				return 0;
3029			}
3030
3031			if ((rc = dtReadFirst(ip, &btstack)))
3032				return -rc;
3033
3034			DT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
3035		}
3036	} else {
3037		/*
3038		 * Legacy filesystem - OS/2 & Linux JFS < 0.3.6
3039		 *
3040		 * pn = index = 0:      First entry "."
3041		 * pn = 0; index = 1:   Second entry ".."
3042		 * pn > 0:              Real entries, pn=1 -> leftmost page
3043		 * pn = index = -1:     No more entries
3044		 */
3045		dtlhdrdatalen = DTLHDRDATALEN_LEGACY;
3046
3047		dtpos = filp->f_pos;
3048		if (dtpos == 0) {
3049			/* build "." entry */
3050
3051			if (filldir(dirent, ".", 1, filp->f_pos, ip->i_ino,
3052				    DT_DIR))
3053				return 0;
3054			dtoffset->index = 1;
3055			filp->f_pos = dtpos;
3056		}
3057
3058		if (dtoffset->pn == 0) {
3059			if (dtoffset->index == 1) {
3060				/* build ".." entry */
3061
3062				if (filldir(dirent, "..", 2, filp->f_pos,
3063					    PARENT(ip), DT_DIR))
3064					return 0;
3065			} else {
3066				jERROR(1,
3067				       ("jfs_readdir called with invalid offset!\n"));
3068			}
3069			dtoffset->pn = 1;
3070			dtoffset->index = 0;
3071			filp->f_pos = dtpos;
3072		}
3073
3074		if (dtEmpty(ip)) {
3075			filp->f_pos = DIREND;
3076			return 0;
3077		}
3078
3079		if ((rc = dtReadNext(ip, &filp->f_pos, &btstack))) {
3080			jERROR(1,
3081			       ("jfs_readdir: unexpected rc = %d from dtReadNext\n",
3082				rc));
3083			filp->f_pos = DIREND;
3084			return 0;
3085		}
3086		/* get start leaf page and index */
3087		DT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
3088
3089		/* offset beyond directory eof ? */
3090		if (bn < 0) {
3091			filp->f_pos = DIREND;
3092			return 0;
3093		}
3094	}
3095
3096	dirent_buf = __get_free_page(GFP_KERNEL);
3097	if (dirent_buf == 0) {
3098		DT_PUTPAGE(mp);
3099		jERROR(1, ("jfs_readdir: __get_free_page failed!\n"));
3100		filp->f_pos = DIREND;
3101		return -ENOMEM;
3102	}
3103
3104	while (1) {
3105		jfs_dirent = (struct jfs_dirent *) dirent_buf;
3106		jfs_dirents = 0;
3107		overflow = fix_page = 0;
3108
3109		stbl = DT_GETSTBL(p);
3110
3111		for (i = index; i < p->header.nextindex; i++) {
3112			d = (struct ldtentry *) & p->slot[stbl[i]];
3113
3114			if (((long) jfs_dirent + d->namlen + 1) >
3115			    (dirent_buf + PSIZE)) {
3116				/* DBCS codepages could overrun dirent_buf */
3117				index = i;
3118				overflow = 1;
3119				break;
3120			}
3121
3122			d_namleft = d->namlen;
3123			name_ptr = jfs_dirent->name;
3124			jfs_dirent->ino = le32_to_cpu(d->inumber);
3125
3126			if (do_index) {
3127				len = min(d_namleft, DTLHDRDATALEN);
3128				jfs_dirent->position = le32_to_cpu(d->index);
3129				/*
3130				 * d->index should always be valid, but it
3131				 * isn't.  fsck.jfs doesn't create the
3132				 * directory index for the lost+found
3133				 * directory.  Rather than let it go,
3134				 * we can try to fix it.
3135				 */
3136				if ((jfs_dirent->position < 2) ||
3137				    (jfs_dirent->position >=
3138				     JFS_IP(ip)->next_index)) {
3139					if (!page_fixed && !isReadOnly(ip)) {
3140						fix_page = 1;
3141						/*
3142						 * setting overflow and setting
3143						 * index to i will cause the
3144						 * same page to be processed
3145						 * again starting here
3146						 */
3147						overflow = 1;
3148						index = i;
3149						break;
3150					}
3151					jfs_dirent->position = unique_pos++;
3152				}
3153			} else {
3154				jfs_dirent->position = dtpos;
3155				len = min(d_namleft, DTLHDRDATALEN_LEGACY);
3156			}
3157
3158			/* copy the name of head/only segment */
3159			outlen = jfs_strfromUCS_le(name_ptr, d->name, len,
3160						   codepage);
3161			jfs_dirent->name_len = outlen;
3162
3163			/* copy name in the additional segment(s) */
3164			next = d->next;
3165			while (next >= 0) {
3166				t = (struct dtslot *) & p->slot[next];
3167				name_ptr += outlen;
3168				d_namleft -= len;
3169				/* Sanity Check */
3170				if (d_namleft == 0) {
3171					jERROR(1,("JFS:Dtree error: "
3172					  "ino = %ld, bn=%Ld, index = %d\n",
3173						  (long)ip->i_ino, (long long)bn, i));
3174					updateSuper(ip->i_sb, FM_DIRTY);
3175					goto skip_one;
3176				}
3177				len = min(d_namleft, DTSLOTDATALEN);
3178				outlen = jfs_strfromUCS_le(name_ptr, t->name,
3179							   len, codepage);
3180				jfs_dirent->name_len += outlen;
3181
3182				next = t->next;
3183			}
3184
3185			jfs_dirents++;
3186			jfs_dirent = next_jfs_dirent(jfs_dirent);
3187skip_one:
3188			if (!do_index)
3189				dtoffset->index++;
3190		}
3191
3192		if (!overflow) {
3193			/* Point to next leaf page */
3194			if (p->header.flag & BT_ROOT)
3195				bn = 0;
3196			else {
3197				bn = le64_to_cpu(p->header.next);
3198				index = 0;
3199				/* update offset (pn:index) for new page */
3200				if (!do_index) {
3201					dtoffset->pn++;
3202					dtoffset->index = 0;
3203				}
3204			}
3205			page_fixed = 0;
3206		}
3207
3208		/* unpin previous leaf page */
3209		DT_PUTPAGE(mp);
3210
3211		jfs_dirent = (struct jfs_dirent *) dirent_buf;
3212		while (jfs_dirents--) {
3213			filp->f_pos = jfs_dirent->position;
3214			if (filldir(dirent, jfs_dirent->name,
3215				    jfs_dirent->name_len, filp->f_pos,
3216				    jfs_dirent->ino, DT_UNKNOWN))
3217				goto out;
3218			jfs_dirent = next_jfs_dirent(jfs_dirent);
3219		}
3220
3221		if (fix_page) {
3222			add_missing_indices(ip, bn);
3223			page_fixed = 1;
3224		}
3225
3226		if (!overflow && (bn == 0)) {
3227			filp->f_pos = DIREND;
3228			break;
3229		}
3230
3231		DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
3232		if (rc) {
3233			free_page(dirent_buf);
3234			return -rc;
3235		}
3236	}
3237
3238      out:
3239	free_page(dirent_buf);
3240
3241	return rc;
3242}
3243
3244
3245/*
3246 *	dtReadFirst()
3247 *
3248 * function: get the leftmost page of the directory
3249 */
3250static int dtReadFirst(struct inode *ip, struct btstack * btstack)
3251{
3252	int rc = 0;
3253	s64 bn;
3254	int psize = 288;	/* initial in-line directory */
3255	struct metapage *mp;
3256	dtpage_t *p;
3257	s8 *stbl;
3258	struct btframe *btsp;
3259	pxd_t *xd;
3260
3261	BT_CLR(btstack);	/* reset stack */
3262
3263	/*
3264	 *      descend leftmost path of the tree
3265	 *
3266	 * by convention, root bn = 0.
3267	 */
3268	for (bn = 0;;) {
3269		DT_GETPAGE(ip, bn, mp, psize, p, rc);
3270		if (rc)
3271			return rc;
3272
3273		/*
3274		 * leftmost leaf page
3275		 */
3276		if (p->header.flag & BT_LEAF) {
3277			/* return leftmost entry */
3278			btsp = btstack->top;
3279			btsp->bn = bn;
3280			btsp->index = 0;
3281			btsp->mp = mp;
3282
3283			return 0;
3284		}
3285
3286		/*
3287		 * descend down to leftmost child page
3288		 */
3289		/* push (bn, index) of the parent page/entry */
3290		BT_PUSH(btstack, bn, 0);
3291
3292		/* get the leftmost entry */
3293		stbl = DT_GETSTBL(p);
3294		xd = (pxd_t *) & p->slot[stbl[0]];
3295
3296		/* get the child page block address */
3297		bn = addressPXD(xd);
3298		psize = lengthPXD(xd) << JFS_SBI(ip->i_sb)->l2bsize;
3299
3300		/* unpin the parent page */
3301		DT_PUTPAGE(mp);
3302	}
3303}
3304
3305
3306/*
3307 *	dtReadNext()
3308 *
3309 * function: get the page of the specified offset (pn:index)
3310 *
3311 * return: if (offset > eof), bn = -1;
3312 *
3313 * note: if index > nextindex of the target leaf page,
3314 * start with 1st entry of next leaf page;
3315 */
3316static int dtReadNext(struct inode *ip, loff_t * offset,
3317		      struct btstack * btstack)
3318{
3319	int rc = 0;
3320	struct dtoffset {
3321		s16 pn;
3322		s16 index;
3323		s32 unused;
3324	} *dtoffset = (struct dtoffset *) offset;
3325	s64 bn;
3326	struct metapage *mp;
3327	dtpage_t *p;
3328	int index;
3329	int pn;
3330	s8 *stbl;
3331	struct btframe *btsp, *parent;
3332	pxd_t *xd;
3333
3334	/*
3335	 * get leftmost leaf page pinned
3336	 */
3337	if ((rc = dtReadFirst(ip, btstack)))
3338		return rc;
3339
3340	/* get leaf page */
3341	DT_GETSEARCH(ip, btstack->top, bn, mp, p, index);
3342
3343	/* get the start offset (pn:index) */
3344	pn = dtoffset->pn - 1;	/* Now pn = 0 represents leftmost leaf */
3345	index = dtoffset->index;
3346
3347	/* start at leftmost page ? */
3348	if (pn == 0) {
3349		/* offset beyond eof ? */
3350		if (index < p->header.nextindex)
3351			goto out;
3352
3353		if (p->header.flag & BT_ROOT) {
3354			bn = -1;
3355			goto out;
3356		}
3357
3358		/* start with 1st entry of next leaf page */
3359		dtoffset->pn++;
3360		dtoffset->index = index = 0;
3361		goto a;
3362	}
3363
3364	/* start at non-leftmost page: scan parent pages for large pn */
3365	if (p->header.flag & BT_ROOT) {
3366		bn = -1;
3367		goto out;
3368	}
3369
3370	/* start after next leaf page ? */
3371	if (pn > 1)
3372		goto b;
3373
3374	/* get leaf page pn = 1 */
3375      a:
3376	bn = le64_to_cpu(p->header.next);
3377
3378	/* unpin leaf page */
3379	DT_PUTPAGE(mp);
3380
3381	/* offset beyond eof ? */
3382	if (bn == 0) {
3383		bn = -1;
3384		goto out;
3385	}
3386
3387	goto c;
3388
3389	/*
3390	 * scan last internal page level to get target leaf page
3391	 */
3392      b:
3393	/* unpin leftmost leaf page */
3394	DT_PUTPAGE(mp);
3395
3396	/* get left most parent page */
3397	btsp = btstack->top;
3398	parent = btsp - 1;
3399	bn = parent->bn;
3400	DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
3401	if (rc)
3402		return rc;
3403
3404	/* scan parent pages at last internal page level */
3405	while (pn >= p->header.nextindex) {
3406		pn -= p->header.nextindex;
3407
3408		/* get next parent page address */
3409		bn = le64_to_cpu(p->header.next);
3410
3411		/* unpin current parent page */
3412		DT_PUTPAGE(mp);
3413
3414		/* offset beyond eof ? */
3415		if (bn == 0) {
3416			bn = -1;
3417			goto out;
3418		}
3419
3420		/* get next parent page */
3421		DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
3422		if (rc)
3423			return rc;
3424
3425		/* update parent page stack frame */
3426		parent->bn = bn;
3427	}
3428
3429	/* get leaf page address */
3430	stbl = DT_GETSTBL(p);
3431	xd = (pxd_t *) & p->slot[stbl[pn]];
3432	bn = addressPXD(xd);
3433
3434	/* unpin parent page */
3435	DT_PUTPAGE(mp);
3436
3437	/*
3438	 * get target leaf page
3439	 */
3440      c:
3441	DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
3442	if (rc)
3443		return rc;
3444
3445	/*
3446	 * leaf page has been completed:
3447	 * start with 1st entry of next leaf page
3448	 */
3449	if (index >= p->header.nextindex) {
3450		bn = le64_to_cpu(p->header.next);
3451
3452		/* unpin leaf page */
3453		DT_PUTPAGE(mp);
3454
3455		/* offset beyond eof ? */
3456		if (bn == 0) {
3457			bn = -1;
3458			goto out;
3459		}
3460
3461		/* get next leaf page */
3462		DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
3463		if (rc)
3464			return rc;
3465
3466		/* start with 1st entry of next leaf page */
3467		dtoffset->pn++;
3468		dtoffset->index = 0;
3469	}
3470
3471      out:
3472	/* return target leaf page pinned */
3473	btsp = btstack->top;
3474	btsp->bn = bn;
3475	btsp->index = dtoffset->index;
3476	btsp->mp = mp;
3477
3478	return 0;
3479}
3480
3481
3482/*
3483 *	dtCompare()
3484 *
3485 * function: compare search key with an internal entry
3486 *
3487 * return:
3488 *	< 0 if k is < record
3489 *	= 0 if k is = record
3490 *	> 0 if k is > record
3491 */
3492static int dtCompare(struct component_name * key,	/* search key */
3493		     dtpage_t * p,	/* directory page */
3494		     int si)
3495{				/* entry slot index */
3496	wchar_t *kname, *name;
3497	int klen, namlen, len, rc;
3498	struct idtentry *ih;
3499	struct dtslot *t;
3500
3501	/*
3502	 * force the left-most key on internal pages, at any level of
3503	 * the tree, to be less than any search key.
3504	 * this obviates having to update the leftmost key on an internal
3505	 * page when the user inserts a new key in the tree smaller than
3506	 * anything that has been stored.
3507	 *
3508	 * (? if/when dtSearch() narrows down to 1st entry (index = 0),
3509	 * at any internal page at any level of the tree,
3510	 * it descends to child of the entry anyway -
3511	 * ? make the entry as min size dummy entry)
3512	 *
3513	 * if (e->index == 0 && h->prevpg == P_INVALID && !(h->flags & BT_LEAF))
3514	 * return (1);
3515	 */
3516
3517	kname = key->name;
3518	klen = key->namlen;
3519
3520	ih = (struct idtentry *) & p->slot[si];
3521	si = ih->next;
3522	name = ih->name;
3523	namlen = ih->namlen;
3524	len = min(namlen, DTIHDRDATALEN);
3525
3526	/* compare with head/only segment */
3527	len = min(klen, len);
3528	if ((rc = UniStrncmp_le(kname, name, len)))
3529		return rc;
3530
3531	klen -= len;
3532	namlen -= len;
3533
3534	/* compare with additional segment(s) */
3535	kname += len;
3536	while (klen > 0 && namlen > 0) {
3537		/* compare with next name segment */
3538		t = (struct dtslot *) & p->slot[si];
3539		len = min(namlen, DTSLOTDATALEN);
3540		len = min(klen, len);
3541		name = t->name;
3542		if ((rc = UniStrncmp_le(kname, name, len)))
3543			return rc;
3544
3545		klen -= len;
3546		namlen -= len;
3547		kname += len;
3548		si = t->next;
3549	}
3550
3551	return (klen - namlen);
3552}
3553
3554
3555
3556
3557/*
3558 *	ciCompare()
3559 *
3560 * function: compare search key with an (leaf/internal) entry
3561 *
3562 * return:
3563 *	< 0 if k is < record
3564 *	= 0 if k is = record
3565 *	> 0 if k is > record
3566 */
3567static int ciCompare(struct component_name * key,	/* search key */
3568		     dtpage_t * p,	/* directory page */
3569		     int si,	/* entry slot index */
3570		     int flag)
3571{
3572	wchar_t *kname, *name, x;
3573	int klen, namlen, len, rc;
3574	struct ldtentry *lh;
3575	struct idtentry *ih;
3576	struct dtslot *t;
3577	int i;
3578
3579	/*
3580	 * force the left-most key on internal pages, at any level of
3581	 * the tree, to be less than any search key.
3582	 * this obviates having to update the leftmost key on an internal
3583	 * page when the user inserts a new key in the tree smaller than
3584	 * anything that has been stored.
3585	 *
3586	 * (? if/when dtSearch() narrows down to 1st entry (index = 0),
3587	 * at any internal page at any level of the tree,
3588	 * it descends to child of the entry anyway -
3589	 * ? make the entry as min size dummy entry)
3590	 *
3591	 * if (e->index == 0 && h->prevpg == P_INVALID && !(h->flags & BT_LEAF))
3592	 * return (1);
3593	 */
3594
3595	kname = key->name;
3596	klen = key->namlen;
3597
3598	/*
3599	 * leaf page entry
3600	 */
3601	if (p->header.flag & BT_LEAF) {
3602		lh = (struct ldtentry *) & p->slot[si];
3603		si = lh->next;
3604		name = lh->name;
3605		namlen = lh->namlen;
3606		if (flag & JFS_DIR_INDEX)
3607			len = min(namlen, DTLHDRDATALEN);
3608		else
3609			len = min(namlen, DTLHDRDATALEN_LEGACY);
3610	}
3611	/*
3612	 * internal page entry
3613	 */
3614	else {
3615		ih = (struct idtentry *) & p->slot[si];
3616		si = ih->next;
3617		name = ih->name;
3618		namlen = ih->namlen;
3619		len = min(namlen, DTIHDRDATALEN);
3620	}
3621
3622	/* compare with head/only segment */
3623	len = min(klen, len);
3624	for (i = 0; i < len; i++, kname++, name++) {
3625		/* only uppercase if case-insensitive support is on */
3626		if ((flag & JFS_OS2) == JFS_OS2)
3627			x = UniToupper(le16_to_cpu(*name));
3628		else
3629			x = le16_to_cpu(*name);
3630		if ((rc = *kname - x))
3631			return rc;
3632	}
3633
3634	klen -= len;
3635	namlen -= len;
3636
3637	/* compare with additional segment(s) */
3638	while (klen > 0 && namlen > 0) {
3639		/* compare with next name segment */
3640		t = (struct dtslot *) & p->slot[si];
3641		len = min(namlen, DTSLOTDATALEN);
3642		len = min(klen, len);
3643		name = t->name;
3644		for (i = 0; i < len; i++, kname++, name++) {
3645			/* only uppercase if case-insensitive support is on */
3646			if ((flag & JFS_OS2) == JFS_OS2)
3647				x = UniToupper(le16_to_cpu(*name));
3648			else
3649				x = le16_to_cpu(*name);
3650
3651			if ((rc = *kname - x))
3652				return rc;
3653		}
3654
3655		klen -= len;
3656		namlen -= len;
3657		si = t->next;
3658	}
3659
3660	return (klen - namlen);
3661}
3662
3663
3664/*
3665 *	ciGetLeafPrefixKey()
3666 *
3667 * function: compute prefix of suffix compression
3668 *	     from two adjacent leaf entries
3669 *	     across page boundary
3670 *
3671 * return:
3672 *	Number of prefix bytes needed to distinguish b from a.
3673 */
3674static void ciGetLeafPrefixKey(dtpage_t * lp, int li, dtpage_t * rp,
3675			       int ri, struct component_name * key, int flag)
3676{
3677	int klen, namlen;
3678	wchar_t *pl, *pr, *kname;
3679	wchar_t lname[JFS_NAME_MAX + 1];
3680	struct component_name lkey = { 0, lname };
3681	wchar_t rname[JFS_NAME_MAX + 1];
3682	struct component_name rkey = { 0, rname };
3683
3684	/* get left and right key */
3685	dtGetKey(lp, li, &lkey, flag);
3686	lkey.name[lkey.namlen] = 0;
3687
3688	if ((flag & JFS_OS2) == JFS_OS2)
3689		ciToUpper(&lkey);
3690
3691	dtGetKey(rp, ri, &rkey, flag);
3692	rkey.name[rkey.namlen] = 0;
3693
3694
3695	if ((flag & JFS_OS2) == JFS_OS2)
3696		ciToUpper(&rkey);
3697
3698	/* compute prefix */
3699	klen = 0;
3700	kname = key->name;
3701	namlen = min(lkey.namlen, rkey.namlen);
3702	for (pl = lkey.name, pr = rkey.name;
3703	     namlen; pl++, pr++, namlen--, klen++, kname++) {
3704		*kname = *pr;
3705		if (*pl != *pr) {
3706			key->namlen = klen + 1;
3707			return;
3708		}
3709	}
3710
3711	/* l->namlen <= r->namlen since l <= r */
3712	if (lkey.namlen < rkey.namlen) {
3713		*kname = *pr;
3714		key->namlen = klen + 1;
3715	} else			/* l->namelen == r->namelen */
3716		key->namlen = klen;
3717
3718	return;
3719}
3720
3721
3722
3723/*
3724 *	dtGetKey()
3725 *
3726 * function: get key of the entry
3727 */
3728static void dtGetKey(dtpage_t * p, int i,	/* entry index */
3729		     struct component_name * key, int flag)
3730{
3731	int si;
3732	s8 *stbl;
3733	struct ldtentry *lh;
3734	struct idtentry *ih;
3735	struct dtslot *t;
3736	int namlen, len;
3737	wchar_t *name, *kname;
3738
3739	/* get entry */
3740	stbl = DT_GETSTBL(p);
3741	si = stbl[i];
3742	if (p->header.flag & BT_LEAF) {
3743		lh = (struct ldtentry *) & p->slot[si];
3744		si = lh->next;
3745		namlen = lh->namlen;
3746		name = lh->name;
3747		if (flag & JFS_DIR_INDEX)
3748			len = min(namlen, DTLHDRDATALEN);
3749		else
3750			len = min(namlen, DTLHDRDATALEN_LEGACY);
3751	} else {
3752		ih = (struct idtentry *) & p->slot[si];
3753		si = ih->next;
3754		namlen = ih->namlen;
3755		name = ih->name;
3756		len = min(namlen, DTIHDRDATALEN);
3757	}
3758
3759	key->namlen = namlen;
3760	kname = key->name;
3761
3762	/*
3763	 * move head/only segment
3764	 */
3765	UniStrncpy_le(kname, name, len);
3766
3767	/*
3768	 * move additional segment(s)
3769	 */
3770	while (si >= 0) {
3771		/* get next segment */
3772		t = &p->slot[si];
3773		kname += len;
3774		namlen -= len;
3775		len = min(namlen, DTSLOTDATALEN);
3776		UniStrncpy_le(kname, t->name, len);
3777
3778		si = t->next;
3779	}
3780}
3781
3782
3783/*
3784 *	dtInsertEntry()
3785 *
3786 * function: allocate free slot(s) and
3787 *	     write a leaf/internal entry
3788 *
3789 * return: entry slot index
3790 */
3791static void dtInsertEntry(dtpage_t * p, int index, struct component_name * key,
3792			  ddata_t * data, struct dt_lock ** dtlock)
3793{
3794	struct dtslot *h, *t;
3795	struct ldtentry *lh = 0;
3796	struct idtentry *ih = 0;
3797	int hsi, fsi, klen, len, nextindex;
3798	wchar_t *kname, *name;
3799	s8 *stbl;
3800	pxd_t *xd;
3801	struct dt_lock *dtlck = *dtlock;
3802	struct lv *lv;
3803	int xsi, n;
3804	s64 bn = 0;
3805	struct metapage *mp = 0;
3806
3807	klen = key->namlen;
3808	kname = key->name;
3809
3810	/* allocate a free slot */
3811	hsi = fsi = p->header.freelist;
3812	h = &p->slot[fsi];
3813	p->header.freelist = h->next;
3814	--p->header.freecnt;
3815
3816	/* open new linelock */
3817	if (dtlck->index >= dtlck->maxcnt)
3818		dtlck = (struct dt_lock *) txLinelock(dtlck);
3819
3820	lv = & dtlck->lv[dtlck->index];
3821	lv->offset = hsi;
3822
3823	/* write head/only segment */
3824	if (p->header.flag & BT_LEAF) {
3825		lh = (struct ldtentry *) h;
3826		lh->next = h->next;
3827		lh->inumber = data->leaf.ino;	/* little-endian */
3828		lh->namlen = klen;
3829		name = lh->name;
3830		if (data->leaf.ip) {
3831			len = min(klen, DTLHDRDATALEN);
3832			if (!(p->header.flag & BT_ROOT))
3833				bn = addressPXD(&p->header.self);
3834			lh->index = cpu_to_le32(add_index(data->leaf.tid,
3835							  data->leaf.ip,
3836							  bn, index));
3837		} else
3838			len = min(klen, DTLHDRDATALEN_LEGACY);
3839	} else {
3840		ih = (struct idtentry *) h;
3841		ih->next = h->next;
3842		xd = (pxd_t *) ih;
3843		*xd = data->xd;
3844		ih->namlen = klen;
3845		name = ih->name;
3846		len = min(klen, DTIHDRDATALEN);
3847	}
3848
3849	UniStrncpy_le(name, kname, len);
3850
3851	n = 1;
3852	xsi = hsi;
3853
3854	/* write additional segment(s) */
3855	t = h;
3856	klen -= len;
3857	while (klen) {
3858		/* get free slot */
3859		fsi = p->header.freelist;
3860		t = &p->slot[fsi];
3861		p->header.freelist = t->next;
3862		--p->header.freecnt;
3863
3864		/* is next slot contiguous ? */
3865		if (fsi != xsi + 1) {
3866			/* close current linelock */
3867			lv->length = n;
3868			dtlck->index++;
3869
3870			/* open new linelock */
3871			if (dtlck->index < dtlck->maxcnt)
3872				lv++;
3873			else {
3874				dtlck = (struct dt_lock *) txLinelock(dtlck);
3875				lv = & dtlck->lv[0];
3876			}
3877
3878			lv->offset = fsi;
3879			n = 0;
3880		}
3881
3882		kname += len;
3883		len = min(klen, DTSLOTDATALEN);
3884		UniStrncpy_le(t->name, kname, len);
3885
3886		n++;
3887		xsi = fsi;
3888		klen -= len;
3889	}
3890
3891	/* close current linelock */
3892	lv->length = n;
3893	dtlck->index++;
3894
3895	*dtlock = dtlck;
3896
3897	/* terminate last/only segment */
3898	if (h == t) {
3899		/* single segment entry */
3900		if (p->header.flag & BT_LEAF)
3901			lh->next = -1;
3902		else
3903			ih->next = -1;
3904	} else
3905		/* multi-segment entry */
3906		t->next = -1;
3907
3908	/* if insert into middle, shift right succeeding entries in stbl */
3909	stbl = DT_GETSTBL(p);
3910	nextindex = p->header.nextindex;
3911	if (index < nextindex) {
3912		memmove(stbl + index + 1, stbl + index, nextindex - index);
3913
3914		if ((p->header.flag & BT_LEAF) && data->leaf.ip) {
3915			/*
3916			 * Need to update slot number for entries that moved
3917			 * in the stbl
3918			 */
3919			mp = 0;
3920			for (n = index + 1; n <= nextindex; n++) {
3921				lh = (struct ldtentry *) & (p->slot[stbl[n]]);
3922				modify_index(data->leaf.tid, data->leaf.ip,
3923					     le32_to_cpu(lh->index), bn, n,
3924					     &mp);
3925			}
3926			if (mp)
3927				release_metapage(mp);
3928		}
3929	}
3930
3931	stbl[index] = hsi;
3932
3933	/* advance next available entry index of stbl */
3934	++p->header.nextindex;
3935}
3936
3937
3938/*
3939 *	dtMoveEntry()
3940 *
3941 * function: move entries from split/left page to new/right page
3942 *
3943 *	nextindex of dst page and freelist/freecnt of both pages
3944 *	are updated.
3945 */
3946static void dtMoveEntry(dtpage_t * sp, int si, dtpage_t * dp,
3947			struct dt_lock ** sdtlock, struct dt_lock ** ddtlock,
3948			int do_index)
3949{
3950	int ssi, next;		/* src slot index */
3951	int di;			/* dst entry index */
3952	int dsi;		/* dst slot index */
3953	s8 *sstbl, *dstbl;	/* sorted entry table */
3954	int snamlen, len;
3955	struct ldtentry *slh, *dlh = 0;
3956	struct idtentry *sih, *dih = 0;
3957	struct dtslot *h, *s, *d;
3958	struct dt_lock *sdtlck = *sdtlock, *ddtlck = *ddtlock;
3959	struct lv *slv, *dlv;
3960	int xssi, ns, nd;
3961	int sfsi;
3962
3963	sstbl = (s8 *) & sp->slot[sp->header.stblindex];
3964	dstbl = (s8 *) & dp->slot[dp->header.stblindex];
3965
3966	dsi = dp->header.freelist;	/* first (whole page) free slot */
3967	sfsi = sp->header.freelist;
3968
3969	/* linelock destination entry slot */
3970	dlv = & ddtlck->lv[ddtlck->index];
3971	dlv->offset = dsi;
3972
3973	/* linelock source entry slot */
3974	slv = & sdtlck->lv[sdtlck->index];
3975	slv->offset = sstbl[si];
3976	xssi = slv->offset - 1;
3977
3978	/*
3979	 * move entries
3980	 */
3981	ns = nd = 0;
3982	for (di = 0; si < sp->header.nextindex; si++, di++) {
3983		ssi = sstbl[si];
3984		dstbl[di] = dsi;
3985
3986		/* is next slot contiguous ? */
3987		if (ssi != xssi + 1) {
3988			/* close current linelock */
3989			slv->length = ns;
3990			sdtlck->index++;
3991
3992			/* open new linelock */
3993			if (sdtlck->index < sdtlck->maxcnt)
3994				slv++;
3995			else {
3996				sdtlck = (struct dt_lock *) txLinelock(sdtlck);
3997				slv = & sdtlck->lv[0];
3998			}
3999
4000			slv->offset = ssi;
4001			ns = 0;
4002		}
4003
4004		/*
4005		 * move head/only segment of an entry
4006		 */
4007		/* get dst slot */
4008		h = d = &dp->slot[dsi];
4009
4010		/* get src slot and move */
4011		s = &sp->slot[ssi];
4012		if (sp->header.flag & BT_LEAF) {
4013			/* get source entry */
4014			slh = (struct ldtentry *) s;
4015			dlh = (struct ldtentry *) h;
4016			snamlen = slh->namlen;
4017
4018			if (do_index) {
4019				len = min(snamlen, DTLHDRDATALEN);
4020				dlh->index = slh->index; /* little-endian */
4021			} else
4022				len = min(snamlen, DTLHDRDATALEN_LEGACY);
4023
4024			memcpy(dlh, slh, 6 + len * 2);
4025
4026			next = slh->next;
4027
4028			/* update dst head/only segment next field */
4029			dsi++;
4030			dlh->next = dsi;
4031		} else {
4032			sih = (struct idtentry *) s;
4033			snamlen = sih->namlen;
4034
4035			len = min(snamlen, DTIHDRDATALEN);
4036			dih = (struct idtentry *) h;
4037			memcpy(dih, sih, 10 + len * 2);
4038			next = sih->next;
4039
4040			dsi++;
4041			dih->next = dsi;
4042		}
4043
4044		/* free src head/only segment */
4045		s->next = sfsi;
4046		s->cnt = 1;
4047		sfsi = ssi;
4048
4049		ns++;
4050		nd++;
4051		xssi = ssi;
4052
4053		/*
4054		 * move additional segment(s) of the entry
4055		 */
4056		snamlen -= len;
4057		while ((ssi = next) >= 0) {
4058			/* is next slot contiguous ? */
4059			if (ssi != xssi + 1) {
4060				/* close current linelock */
4061				slv->length = ns;
4062				sdtlck->index++;
4063
4064				/* open new linelock */
4065				if (sdtlck->index < sdtlck->maxcnt)
4066					slv++;
4067				else {
4068					sdtlck =
4069					    (struct dt_lock *)
4070					    txLinelock(sdtlck);
4071					slv = & sdtlck->lv[0];
4072				}
4073
4074				slv->offset = ssi;
4075				ns = 0;
4076			}
4077
4078			/* get next source segment */
4079			s = &sp->slot[ssi];
4080
4081			/* get next destination free slot */
4082			d++;
4083
4084			len = min(snamlen, DTSLOTDATALEN);
4085			UniStrncpy(d->name, s->name, len);
4086
4087			ns++;
4088			nd++;
4089			xssi = ssi;
4090
4091			dsi++;
4092			d->next = dsi;
4093
4094			/* free source segment */
4095			next = s->next;
4096			s->next = sfsi;
4097			s->cnt = 1;
4098			sfsi = ssi;
4099
4100			snamlen -= len;
4101		}		/* end while */
4102
4103		/* terminate dst last/only segment */
4104		if (h == d) {
4105			/* single segment entry */
4106			if (dp->header.flag & BT_LEAF)
4107				dlh->next = -1;
4108			else
4109				dih->next = -1;
4110		} else
4111			/* multi-segment entry */
4112			d->next = -1;
4113	}			/* end for */
4114
4115	/* close current linelock */
4116	slv->length = ns;
4117	sdtlck->index++;
4118	*sdtlock = sdtlck;
4119
4120	dlv->length = nd;
4121	ddtlck->index++;
4122	*ddtlock = ddtlck;
4123
4124	/* update source header */
4125	sp->header.freelist = sfsi;
4126	sp->header.freecnt += nd;
4127
4128	/* update destination header */
4129	dp->header.nextindex = di;
4130
4131	dp->header.freelist = dsi;
4132	dp->header.freecnt -= nd;
4133}
4134
4135
4136/*
4137 *	dtDeleteEntry()
4138 *
4139 * function: free a (leaf/internal) entry
4140 *
4141 * log freelist header, stbl, and each segment slot of entry
4142 * (even though last/only segment next field is modified,
4143 * physical image logging requires all segment slots of
4144 * the entry logged to avoid applying previous updates
4145 * to the same slots)
4146 */
4147static void dtDeleteEntry(dtpage_t * p, int fi, struct dt_lock ** dtlock)
4148{
4149	int fsi;		/* free entry slot index */
4150	s8 *stbl;
4151	struct dtslot *t;
4152	int si, freecnt;
4153	struct dt_lock *dtlck = *dtlock;
4154	struct lv *lv;
4155	int xsi, n;
4156
4157	/* get free entry slot index */
4158	stbl = DT_GETSTBL(p);
4159	fsi = stbl[fi];
4160
4161	/* open new linelock */
4162	if (dtlck->index >= dtlck->maxcnt)
4163		dtlck = (struct dt_lock *) txLinelock(dtlck);
4164	lv = & dtlck->lv[dtlck->index];
4165
4166	lv->offset = fsi;
4167
4168	/* get the head/only segment */
4169	t = &p->slot[fsi];
4170	if (p->header.flag & BT_LEAF)
4171		si = ((struct ldtentry *) t)->next;
4172	else
4173		si = ((struct idtentry *) t)->next;
4174	t->next = si;
4175	t->cnt = 1;
4176
4177	n = freecnt = 1;
4178	xsi = fsi;
4179
4180	/* find the last/only segment */
4181	while (si >= 0) {
4182		/* is next slot contiguous ? */
4183		if (si != xsi + 1) {
4184			/* close current linelock */
4185			lv->length = n;
4186			dtlck->index++;
4187
4188			/* open new linelock */
4189			if (dtlck->index < dtlck->maxcnt)
4190				lv++;
4191			else {
4192				dtlck = (struct dt_lock *) txLinelock(dtlck);
4193				lv = & dtlck->lv[0];
4194			}
4195
4196			lv->offset = si;
4197			n = 0;
4198		}
4199
4200		n++;
4201		xsi = si;
4202		freecnt++;
4203
4204		t = &p->slot[si];
4205		t->cnt = 1;
4206		si = t->next;
4207	}
4208
4209	/* close current linelock */
4210	lv->length = n;
4211	dtlck->index++;
4212
4213	*dtlock = dtlck;
4214
4215	/* update freelist */
4216	t->next = p->header.freelist;
4217	p->header.freelist = fsi;
4218	p->header.freecnt += freecnt;
4219
4220	/* if delete from middle,
4221	 * shift left the succedding entries in the stbl
4222	 */
4223	si = p->header.nextindex;
4224	if (fi < si - 1)
4225		memmove(&stbl[fi], &stbl[fi + 1], si - fi - 1);
4226
4227	p->header.nextindex--;
4228}
4229
4230
4231/*
4232 *	dtTruncateEntry()
4233 *
4234 * function: truncate a (leaf/internal) entry
4235 *
4236 * log freelist header, stbl, and each segment slot of entry
4237 * (even though last/only segment next field is modified,
4238 * physical image logging requires all segment slots of
4239 * the entry logged to avoid applying previous updates
4240 * to the same slots)
4241 */
4242static void dtTruncateEntry(dtpage_t * p, int ti, struct dt_lock ** dtlock)
4243{
4244	int tsi;		/* truncate entry slot index */
4245	s8 *stbl;
4246	struct dtslot *t;
4247	int si, freecnt;
4248	struct dt_lock *dtlck = *dtlock;
4249	struct lv *lv;
4250	int fsi, xsi, n;
4251
4252	/* get free entry slot index */
4253	stbl = DT_GETSTBL(p);
4254	tsi = stbl[ti];
4255
4256	/* open new linelock */
4257	if (dtlck->index >= dtlck->maxcnt)
4258		dtlck = (struct dt_lock *) txLinelock(dtlck);
4259	lv = & dtlck->lv[dtlck->index];
4260
4261	lv->offset = tsi;
4262
4263	/* get the head/only segment */
4264	t = &p->slot[tsi];
4265	ASSERT(p->header.flag & BT_INTERNAL);
4266	((struct idtentry *) t)->namlen = 0;
4267	si = ((struct idtentry *) t)->next;
4268	((struct idtentry *) t)->next = -1;
4269
4270	n = 1;
4271	freecnt = 0;
4272	fsi = si;
4273	xsi = tsi;
4274
4275	/* find the last/only segment */
4276	while (si >= 0) {
4277		/* is next slot contiguous ? */
4278		if (si != xsi + 1) {
4279			/* close current linelock */
4280			lv->length = n;
4281			dtlck->index++;
4282
4283			/* open new linelock */
4284			if (dtlck->index < dtlck->maxcnt)
4285				lv++;
4286			else {
4287				dtlck = (struct dt_lock *) txLinelock(dtlck);
4288				lv = & dtlck->lv[0];
4289			}
4290
4291			lv->offset = si;
4292			n = 0;
4293		}
4294
4295		n++;
4296		xsi = si;
4297		freecnt++;
4298
4299		t = &p->slot[si];
4300		t->cnt = 1;
4301		si = t->next;
4302	}
4303
4304	/* close current linelock */
4305	lv->length = n;
4306	dtlck->index++;
4307
4308	*dtlock = dtlck;
4309
4310	/* update freelist */
4311	if (freecnt == 0)
4312		return;
4313	t->next = p->header.freelist;
4314	p->header.freelist = fsi;
4315	p->header.freecnt += freecnt;
4316}
4317
4318
4319/*
4320 *	dtLinelockFreelist()
4321 */
4322static void dtLinelockFreelist(dtpage_t * p,	/* directory page */
4323			       int m,	/* max slot index */
4324			       struct dt_lock ** dtlock)
4325{
4326	int fsi;		/* free entry slot index */
4327	struct dtslot *t;
4328	int si;
4329	struct dt_lock *dtlck = *dtlock;
4330	struct lv *lv;
4331	int xsi, n;
4332
4333	/* get free entry slot index */
4334	fsi = p->header.freelist;
4335
4336	/* open new linelock */
4337	if (dtlck->index >= dtlck->maxcnt)
4338		dtlck = (struct dt_lock *) txLinelock(dtlck);
4339	lv = & dtlck->lv[dtlck->index];
4340
4341	lv->offset = fsi;
4342
4343	n = 1;
4344	xsi = fsi;
4345
4346	t = &p->slot[fsi];
4347	si = t->next;
4348
4349	/* find the last/only segment */
4350	while (si < m && si >= 0) {
4351		/* is next slot contiguous ? */
4352		if (si != xsi + 1) {
4353			/* close current linelock */
4354			lv->length = n;
4355			dtlck->index++;
4356
4357			/* open new linelock */
4358			if (dtlck->index < dtlck->maxcnt)
4359				lv++;
4360			else {
4361				dtlck = (struct dt_lock *) txLinelock(dtlck);
4362				lv = & dtlck->lv[0];
4363			}
4364
4365			lv->offset = si;
4366			n = 0;
4367		}
4368
4369		n++;
4370		xsi = si;
4371
4372		t = &p->slot[si];
4373		si = t->next;
4374	}
4375
4376	/* close current linelock */
4377	lv->length = n;
4378	dtlck->index++;
4379
4380	*dtlock = dtlck;
4381}
4382
4383
4384/*
4385 * NAME: dtModify
4386 *
4387 * FUNCTION: Modify the inode number part of a directory entry
4388 *
4389 * PARAMETERS:
4390 *	tid	- Transaction id
4391 *	ip	- Inode of parent directory
4392 *	key	- Name of entry to be modified
4393 *	orig_ino	- Original inode number expected in entry
4394 *	new_ino	- New inode number to put into entry
4395 *	flag	- JFS_RENAME
4396 *
4397 * RETURNS:
4398 *	ESTALE	- If entry found does not match orig_ino passed in
4399 *	ENOENT	- If no entry can be found to match key
4400 *	0	- If successfully modified entry
4401 */
4402int dtModify(tid_t tid, struct inode *ip,
4403	 struct component_name * key, ino_t * orig_ino, ino_t new_ino, int flag)
4404{
4405	int rc;
4406	s64 bn;
4407	struct metapage *mp;
4408	dtpage_t *p;
4409	int index;
4410	struct btstack btstack;
4411	struct tlock *tlck;
4412	struct dt_lock *dtlck;
4413	struct lv *lv;
4414	s8 *stbl;
4415	int entry_si;		/* entry slot index */
4416	struct ldtentry *entry;
4417
4418	/*
4419	 *      search for the entry to modify:
4420	 *
4421	 * dtSearch() returns (leaf page pinned, index at which to modify).
4422	 */
4423	if ((rc = dtSearch(ip, key, orig_ino, &btstack, flag)))
4424		return rc;
4425
4426	/* retrieve search result */
4427	DT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
4428
4429	BT_MARK_DIRTY(mp, ip);
4430	/*
4431	 * acquire a transaction lock on the leaf page of named entry
4432	 */
4433	tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY);
4434	dtlck = (struct dt_lock *) & tlck->lock;
4435
4436	/* get slot index of the entry */
4437	stbl = DT_GETSTBL(p);
4438	entry_si = stbl[index];
4439
4440	/* linelock entry */
4441	ASSERT(dtlck->index == 0);
4442	lv = & dtlck->lv[0];
4443	lv->offset = entry_si;
4444	lv->length = 1;
4445	dtlck->index++;
4446
4447	/* get the head/only segment */
4448	entry = (struct ldtentry *) & p->slot[entry_si];
4449
4450	/* substitute the inode number of the entry */
4451	entry->inumber = cpu_to_le32(new_ino);
4452
4453	/* unpin the leaf page */
4454	DT_PUTPAGE(mp);
4455
4456	return 0;
4457}
4458
4459#ifdef _JFS_DEBUG_DTREE
4460/*
4461 *	dtDisplayTree()
4462 *
4463 * function: traverse forward
4464 */
4465int dtDisplayTree(struct inode *ip)
4466{
4467	int rc;
4468	struct metapage *mp;
4469	dtpage_t *p;
4470	s64 bn, pbn;
4471	int index, lastindex, v, h;
4472	pxd_t *xd;
4473	struct btstack btstack;
4474	struct btframe *btsp;
4475	struct btframe *parent;
4476	u8 *stbl;
4477	int psize = 256;
4478
4479	printk("display B+-tree.\n");
4480
4481	/* clear stack */
4482	btsp = btstack.stack;
4483
4484	/*
4485	 * start with root
4486	 *
4487	 * root resides in the inode
4488	 */
4489	bn = 0;
4490	v = h = 0;
4491
4492	/*
4493	 * first access of each page:
4494	 */
4495      newPage:
4496	DT_GETPAGE(ip, bn, mp, psize, p, rc);
4497	if (rc)
4498		return rc;
4499
4500	/* process entries forward from first index */
4501	index = 0;
4502	lastindex = p->header.nextindex - 1;
4503
4504	if (p->header.flag & BT_INTERNAL) {
4505		/*
4506		 * first access of each internal page
4507		 */
4508		printf("internal page ");
4509		dtDisplayPage(ip, bn, p);
4510
4511		goto getChild;
4512	} else {		/* (p->header.flag & BT_LEAF) */
4513
4514		/*
4515		 * first access of each leaf page
4516		 */
4517		printf("leaf page ");
4518		dtDisplayPage(ip, bn, p);
4519
4520		/*
4521		 * process leaf page entries
4522		 *
4523		 for ( ; index <= lastindex; index++)
4524		 {
4525		 }
4526		 */
4527
4528		/* unpin the leaf page */
4529		DT_PUTPAGE(mp);
4530	}
4531
4532	/*
4533	 * go back up to the parent page
4534	 */
4535      getParent:
4536	/* pop/restore parent entry for the current child page */
4537	if ((parent = (btsp == btstack.stack ? NULL : --btsp)) == NULL)
4538		/* current page must have been root */
4539		return;
4540
4541	/*
4542	 * parent page scan completed
4543	 */
4544	if ((index = parent->index) == (lastindex = parent->lastindex)) {
4545		/* go back up to the parent page */
4546		goto getParent;
4547	}
4548
4549	/*
4550	 * parent page has entries remaining
4551	 */
4552	/* get back the parent page */
4553	bn = parent->bn;
4554	/* v = parent->level; */
4555	DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
4556	if (rc)
4557		return rc;
4558
4559	/* get next parent entry */
4560	index++;
4561
4562	/*
4563	 * internal page: go down to child page of current entry
4564	 */
4565      getChild:
4566	/* push/save current parent entry for the child page */
4567	btsp->bn = pbn = bn;
4568	btsp->index = index;
4569	btsp->lastindex = lastindex;
4570	/* btsp->level = v; */
4571	/* btsp->node = h; */
4572	++btsp;
4573
4574	/* get current entry for the child page */
4575	stbl = DT_GETSTBL(p);
4576	xd = (pxd_t *) & p->slot[stbl[index]];
4577
4578	/*
4579	 * first access of each internal entry:
4580	 */
4581
4582	/* get child page */
4583	bn = addressPXD(xd);
4584	psize = lengthPXD(xd) << ip->i_ipmnt->i_l2bsize;
4585
4586	printk("traverse down 0x%Lx[%d]->0x%Lx\n", pbn, index, bn);
4587	v++;
4588	h = index;
4589
4590	/* release parent page */
4591	DT_PUTPAGE(mp);
4592
4593	/* process the child page */
4594	goto newPage;
4595}
4596
4597
4598/*
4599 *	dtDisplayPage()
4600 *
4601 * function: display page
4602 */
4603int dtDisplayPage(struct inode *ip, s64 bn, dtpage_t * p)
4604{
4605	int rc;
4606	struct metapage *mp;
4607	struct ldtentry *lh;
4608	struct idtentry *ih;
4609	pxd_t *xd;
4610	int i, j;
4611	u8 *stbl;
4612	wchar_t name[JFS_NAME_MAX + 1];
4613	struct component_name key = { 0, name };
4614	int freepage = 0;
4615
4616	if (p == NULL) {
4617		freepage = 1;
4618		DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
4619		if (rc)
4620			return rc;
4621	}
4622
4623	/* display page control */
4624	printk("bn:0x%Lx flag:0x%08x nextindex:%d\n",
4625	       bn, p->header.flag, p->header.nextindex);
4626
4627	/* display entries */
4628	stbl = DT_GETSTBL(p);
4629	for (i = 0, j = 1; i < p->header.nextindex; i++, j++) {
4630		dtGetKey(p, i, &key, JFS_SBI(ip->i_sb)->mntflag);
4631		key.name[key.namlen] = '\0';
4632		if (p->header.flag & BT_LEAF) {
4633			lh = (struct ldtentry *) & p->slot[stbl[i]];
4634			printf("\t[%d] %s:%d", i, key.name,
4635			       le32_to_cpu(lh->inumber));
4636		} else {
4637			ih = (struct idtentry *) & p->slot[stbl[i]];
4638			xd = (pxd_t *) ih;
4639			bn = addressPXD(xd);
4640			printf("\t[%d] %s:0x%Lx", i, key.name, bn);
4641		}
4642
4643		if (j == 4) {
4644			printf("\n");
4645			j = 0;
4646		}
4647	}
4648
4649	printf("\n");
4650
4651	if (freepage)
4652		DT_PUTPAGE(mp);
4653
4654	return 0;
4655}
4656#endif				/* _JFS_DEBUG_DTREE */
4657