1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 *   Copyright (C) International Business Machines Corp., 2000-2004
4 */
5
6/*
7 *	jfs_dtree.c: directory B+-tree manager
8 *
9 * B+-tree with variable length key directory:
10 *
11 * each directory page is structured as an array of 32-byte
12 * directory entry slots initialized as a freelist
13 * to avoid search/compaction of free space at insertion.
14 * when an entry is inserted, a number of slots are allocated
15 * from the freelist as required to store variable length data
16 * of the entry; when the entry is deleted, slots of the entry
17 * are returned to freelist.
18 *
19 * leaf entry stores full name as key and file serial number
20 * (aka inode number) as data.
21 * internal/router entry stores sufffix compressed name
22 * as key and simple extent descriptor as data.
23 *
24 * each directory page maintains a sorted entry index table
25 * which stores the start slot index of sorted entries
26 * to allow binary search on the table.
27 *
28 * directory starts as a root/leaf page in on-disk inode
29 * inline data area.
30 * when it becomes full, it starts a leaf of a external extent
31 * of length of 1 block. each time the first leaf becomes full,
32 * it is extended rather than split (its size is doubled),
33 * until its length becoms 4 KBytes, from then the extent is split
34 * with new 4 Kbyte extent when it becomes full
35 * to reduce external fragmentation of small directories.
36 *
37 * blah, blah, blah, for linear scan of directory in pieces by
38 * readdir().
39 *
40 *
41 *	case-insensitive directory file system
42 *
43 * names are stored in case-sensitive way in leaf entry.
44 * but stored, searched and compared in case-insensitive (uppercase) order
45 * (i.e., both search key and entry key are folded for search/compare):
46 * (note that case-sensitive order is BROKEN in storage, e.g.,
47 *  sensitive: Ad, aB, aC, aD -> insensitive: aB, aC, aD, Ad
48 *
49 *  entries which folds to the same key makes up a equivalent class
50 *  whose members are stored as contiguous cluster (may cross page boundary)
51 *  but whose order is arbitrary and acts as duplicate, e.g.,
52 *  abc, Abc, aBc, abC)
53 *
54 * once match is found at leaf, requires scan forward/backward
55 * either for, in case-insensitive search, duplicate
56 * or for, in case-sensitive search, for exact match
57 *
58 * router entry must be created/stored in case-insensitive way
59 * in internal entry:
60 * (right most key of left page and left most key of right page
61 * are folded, and its suffix compression is propagated as router
62 * key in parent)
63 * (e.g., if split occurs <abc> and <aBd>, <ABD> trather than <aB>
64 * should be made the router key for the split)
65 *
66 * case-insensitive search:
67 *
68 *	fold search key;
69 *
70 *	case-insensitive search of B-tree:
71 *	for internal entry, router key is already folded;
72 *	for leaf entry, fold the entry key before comparison.
73 *
74 *	if (leaf entry case-insensitive match found)
75 *		if (next entry satisfies case-insensitive match)
76 *			return EDUPLICATE;
77 *		if (prev entry satisfies case-insensitive match)
78 *			return EDUPLICATE;
79 *		return match;
80 *	else
81 *		return no match;
82 *
83 *	serialization:
84 * target directory inode lock is being held on entry/exit
85 * of all main directory service routines.
86 *
87 *	log based recovery:
88 */
89
90#include <linux/fs.h>
91#include <linux/quotaops.h>
92#include <linux/slab.h>
93#include "jfs_incore.h"
94#include "jfs_superblock.h"
95#include "jfs_filsys.h"
96#include "jfs_metapage.h"
97#include "jfs_dmap.h"
98#include "jfs_unicode.h"
99#include "jfs_debug.h"
100
101/* dtree split parameter */
102struct dtsplit {
103	struct metapage *mp;
104	s16 index;
105	s16 nslot;
106	struct component_name *key;
107	ddata_t *data;
108	struct pxdlist *pxdlist;
109};
110
111#define DT_PAGE(IP, MP) BT_PAGE(IP, MP, dtpage_t, i_dtroot)
112
113/* get page buffer for specified block address */
114#define DT_GETPAGE(IP, BN, MP, SIZE, P, RC)				\
115do {									\
116	BT_GETPAGE(IP, BN, MP, dtpage_t, SIZE, P, RC, i_dtroot);	\
117	if (!(RC)) {							\
118		if (((P)->header.nextindex >				\
119		     (((BN) == 0) ? DTROOTMAXSLOT : (P)->header.maxslot)) || \
120		    ((BN) && ((P)->header.maxslot > DTPAGEMAXSLOT))) {	\
121			BT_PUTPAGE(MP);					\
122			jfs_error((IP)->i_sb,				\
123				  "DT_GETPAGE: dtree page corrupt\n");	\
124			MP = NULL;					\
125			RC = -EIO;					\
126		}							\
127	}								\
128} while (0)
129
130/* for consistency */
131#define DT_PUTPAGE(MP) BT_PUTPAGE(MP)
132
133#define DT_GETSEARCH(IP, LEAF, BN, MP, P, INDEX) \
134	BT_GETSEARCH(IP, LEAF, BN, MP, dtpage_t, P, INDEX, i_dtroot)
135
136/*
137 * forward references
138 */
139static int dtSplitUp(tid_t tid, struct inode *ip,
140		     struct dtsplit * split, struct btstack * btstack);
141
142static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split,
143		       struct metapage ** rmpp, dtpage_t ** rpp, pxd_t * rxdp);
144
145static int dtExtendPage(tid_t tid, struct inode *ip,
146			struct dtsplit * split, struct btstack * btstack);
147
148static int dtSplitRoot(tid_t tid, struct inode *ip,
149		       struct dtsplit * split, struct metapage ** rmpp);
150
151static int dtDeleteUp(tid_t tid, struct inode *ip, struct metapage * fmp,
152		      dtpage_t * fp, struct btstack * btstack);
153
154static int dtRelink(tid_t tid, struct inode *ip, dtpage_t * p);
155
156static int dtReadFirst(struct inode *ip, struct btstack * btstack);
157
158static int dtReadNext(struct inode *ip,
159		      loff_t * offset, struct btstack * btstack);
160
161static int dtCompare(struct component_name * key, dtpage_t * p, int si);
162
163static int ciCompare(struct component_name * key, dtpage_t * p, int si,
164		     int flag);
165
166static void dtGetKey(dtpage_t * p, int i, struct component_name * key,
167		     int flag);
168
169static int ciGetLeafPrefixKey(dtpage_t * lp, int li, dtpage_t * rp,
170			      int ri, struct component_name * key, int flag);
171
172static void dtInsertEntry(dtpage_t * p, int index, struct component_name * key,
173			  ddata_t * data, struct dt_lock **);
174
175static void dtMoveEntry(dtpage_t * sp, int si, dtpage_t * dp,
176			struct dt_lock ** sdtlock, struct dt_lock ** ddtlock,
177			int do_index);
178
179static void dtDeleteEntry(dtpage_t * p, int fi, struct dt_lock ** dtlock);
180
181static void dtTruncateEntry(dtpage_t * p, int ti, struct dt_lock ** dtlock);
182
183static void dtLinelockFreelist(dtpage_t * p, int m, struct dt_lock ** dtlock);
184
185#define ciToUpper(c)	UniStrupr((c)->name)
186
187/*
188 *	read_index_page()
189 *
190 *	Reads a page of a directory's index table.
191 *	Having metadata mapped into the directory inode's address space
192 *	presents a multitude of problems.  We avoid this by mapping to
193 *	the absolute address space outside of the *_metapage routines
194 */
195static struct metapage *read_index_page(struct inode *inode, s64 blkno)
196{
197	int rc;
198	s64 xaddr;
199	int xflag;
200	s32 xlen;
201
202	rc = xtLookup(inode, blkno, 1, &xflag, &xaddr, &xlen, 1);
203	if (rc || (xaddr == 0))
204		return NULL;
205
206	return read_metapage(inode, xaddr, PSIZE, 1);
207}
208
209/*
210 *	get_index_page()
211 *
212 *	Same as get_index_page(), but get's a new page without reading
213 */
214static struct metapage *get_index_page(struct inode *inode, s64 blkno)
215{
216	int rc;
217	s64 xaddr;
218	int xflag;
219	s32 xlen;
220
221	rc = xtLookup(inode, blkno, 1, &xflag, &xaddr, &xlen, 1);
222	if (rc || (xaddr == 0))
223		return NULL;
224
225	return get_metapage(inode, xaddr, PSIZE, 1);
226}
227
228/*
229 *	find_index()
230 *
231 *	Returns dtree page containing directory table entry for specified
232 *	index and pointer to its entry.
233 *
234 *	mp must be released by caller.
235 */
236static struct dir_table_slot *find_index(struct inode *ip, u32 index,
237					 struct metapage ** mp, s64 *lblock)
238{
239	struct jfs_inode_info *jfs_ip = JFS_IP(ip);
240	s64 blkno;
241	s64 offset;
242	int page_offset;
243	struct dir_table_slot *slot;
244	static int maxWarnings = 10;
245
246	if (index < 2) {
247		if (maxWarnings) {
248			jfs_warn("find_entry called with index = %d", index);
249			maxWarnings--;
250		}
251		return NULL;
252	}
253
254	if (index >= jfs_ip->next_index) {
255		jfs_warn("find_entry called with index >= next_index");
256		return NULL;
257	}
258
259	if (jfs_dirtable_inline(ip)) {
260		/*
261		 * Inline directory table
262		 */
263		*mp = NULL;
264		slot = &jfs_ip->i_dirtable[index - 2];
265	} else {
266		offset = (index - 2) * sizeof(struct dir_table_slot);
267		page_offset = offset & (PSIZE - 1);
268		blkno = ((offset + 1) >> L2PSIZE) <<
269		    JFS_SBI(ip->i_sb)->l2nbperpage;
270
271		if (*mp && (*lblock != blkno)) {
272			release_metapage(*mp);
273			*mp = NULL;
274		}
275		if (!(*mp)) {
276			*lblock = blkno;
277			*mp = read_index_page(ip, blkno);
278		}
279		if (!(*mp)) {
280			jfs_err("free_index: error reading directory table");
281			return NULL;
282		}
283
284		slot =
285		    (struct dir_table_slot *) ((char *) (*mp)->data +
286					       page_offset);
287	}
288	return slot;
289}
290
291static inline void lock_index(tid_t tid, struct inode *ip, struct metapage * mp,
292			      u32 index)
293{
294	struct tlock *tlck;
295	struct linelock *llck;
296	struct lv *lv;
297
298	tlck = txLock(tid, ip, mp, tlckDATA);
299	llck = (struct linelock *) tlck->lock;
300
301	if (llck->index >= llck->maxcnt)
302		llck = txLinelock(llck);
303	lv = &llck->lv[llck->index];
304
305	/*
306	 *	Linelock slot size is twice the size of directory table
307	 *	slot size.  512 entries per page.
308	 */
309	lv->offset = ((index - 2) & 511) >> 1;
310	lv->length = 1;
311	llck->index++;
312}
313
314/*
315 *	add_index()
316 *
317 *	Adds an entry to the directory index table.  This is used to provide
318 *	each directory entry with a persistent index in which to resume
319 *	directory traversals
320 */
321static u32 add_index(tid_t tid, struct inode *ip, s64 bn, int slot)
322{
323	struct super_block *sb = ip->i_sb;
324	struct jfs_sb_info *sbi = JFS_SBI(sb);
325	struct jfs_inode_info *jfs_ip = JFS_IP(ip);
326	u64 blkno;
327	struct dir_table_slot *dirtab_slot;
328	u32 index;
329	struct linelock *llck;
330	struct lv *lv;
331	struct metapage *mp;
332	s64 offset;
333	uint page_offset;
334	struct tlock *tlck;
335	s64 xaddr;
336
337	ASSERT(DO_INDEX(ip));
338
339	if (jfs_ip->next_index < 2) {
340		jfs_warn("add_index: next_index = %d.  Resetting!",
341			   jfs_ip->next_index);
342		jfs_ip->next_index = 2;
343	}
344
345	index = jfs_ip->next_index++;
346
347	if (index <= MAX_INLINE_DIRTABLE_ENTRY) {
348		/*
349		 * i_size reflects size of index table, or 8 bytes per entry.
350		 */
351		ip->i_size = (loff_t) (index - 1) << 3;
352
353		/*
354		 * dir table fits inline within inode
355		 */
356		dirtab_slot = &jfs_ip->i_dirtable[index-2];
357		dirtab_slot->flag = DIR_INDEX_VALID;
358		dirtab_slot->slot = slot;
359		DTSaddress(dirtab_slot, bn);
360
361		set_cflag(COMMIT_Dirtable, ip);
362
363		return index;
364	}
365	if (index == (MAX_INLINE_DIRTABLE_ENTRY + 1)) {
366		struct dir_table_slot temp_table[12];
367
368		/*
369		 * It's time to move the inline table to an external
370		 * page and begin to build the xtree
371		 */
372		if (dquot_alloc_block(ip, sbi->nbperpage))
373			goto clean_up;
374		if (dbAlloc(ip, 0, sbi->nbperpage, &xaddr)) {
375			dquot_free_block(ip, sbi->nbperpage);
376			goto clean_up;
377		}
378
379		/*
380		 * Save the table, we're going to overwrite it with the
381		 * xtree root
382		 */
383		memcpy(temp_table, &jfs_ip->i_dirtable, sizeof(temp_table));
384
385		/*
386		 * Initialize empty x-tree
387		 */
388		xtInitRoot(tid, ip);
389
390		/*
391		 * Add the first block to the xtree
392		 */
393		if (xtInsert(tid, ip, 0, 0, sbi->nbperpage, &xaddr, 0)) {
394			/* This really shouldn't fail */
395			jfs_warn("add_index: xtInsert failed!");
396			memcpy(&jfs_ip->i_dirtable, temp_table,
397			       sizeof (temp_table));
398			dbFree(ip, xaddr, sbi->nbperpage);
399			dquot_free_block(ip, sbi->nbperpage);
400			goto clean_up;
401		}
402		ip->i_size = PSIZE;
403
404		mp = get_index_page(ip, 0);
405		if (!mp) {
406			jfs_err("add_index: get_metapage failed!");
407			xtTruncate(tid, ip, 0, COMMIT_PWMAP);
408			memcpy(&jfs_ip->i_dirtable, temp_table,
409			       sizeof (temp_table));
410			goto clean_up;
411		}
412		tlck = txLock(tid, ip, mp, tlckDATA);
413		llck = (struct linelock *) & tlck->lock;
414		ASSERT(llck->index == 0);
415		lv = &llck->lv[0];
416
417		lv->offset = 0;
418		lv->length = 6;	/* tlckDATA slot size is 16 bytes */
419		llck->index++;
420
421		memcpy(mp->data, temp_table, sizeof(temp_table));
422
423		mark_metapage_dirty(mp);
424		release_metapage(mp);
425
426		/*
427		 * Logging is now directed by xtree tlocks
428		 */
429		clear_cflag(COMMIT_Dirtable, ip);
430	}
431
432	offset = (index - 2) * sizeof(struct dir_table_slot);
433	page_offset = offset & (PSIZE - 1);
434	blkno = ((offset + 1) >> L2PSIZE) << sbi->l2nbperpage;
435	if (page_offset == 0) {
436		/*
437		 * This will be the beginning of a new page
438		 */
439		xaddr = 0;
440		if (xtInsert(tid, ip, 0, blkno, sbi->nbperpage, &xaddr, 0)) {
441			jfs_warn("add_index: xtInsert failed!");
442			goto clean_up;
443		}
444		ip->i_size += PSIZE;
445
446		if ((mp = get_index_page(ip, blkno)))
447			memset(mp->data, 0, PSIZE);	/* Just looks better */
448		else
449			xtTruncate(tid, ip, offset, COMMIT_PWMAP);
450	} else
451		mp = read_index_page(ip, blkno);
452
453	if (!mp) {
454		jfs_err("add_index: get/read_metapage failed!");
455		goto clean_up;
456	}
457
458	lock_index(tid, ip, mp, index);
459
460	dirtab_slot =
461	    (struct dir_table_slot *) ((char *) mp->data + page_offset);
462	dirtab_slot->flag = DIR_INDEX_VALID;
463	dirtab_slot->slot = slot;
464	DTSaddress(dirtab_slot, bn);
465
466	mark_metapage_dirty(mp);
467	release_metapage(mp);
468
469	return index;
470
471      clean_up:
472
473	jfs_ip->next_index--;
474
475	return 0;
476}
477
478/*
479 *	free_index()
480 *
481 *	Marks an entry to the directory index table as free.
482 */
483static void free_index(tid_t tid, struct inode *ip, u32 index, u32 next)
484{
485	struct dir_table_slot *dirtab_slot;
486	s64 lblock;
487	struct metapage *mp = NULL;
488
489	dirtab_slot = find_index(ip, index, &mp, &lblock);
490
491	if (!dirtab_slot)
492		return;
493
494	dirtab_slot->flag = DIR_INDEX_FREE;
495	dirtab_slot->slot = dirtab_slot->addr1 = 0;
496	dirtab_slot->addr2 = cpu_to_le32(next);
497
498	if (mp) {
499		lock_index(tid, ip, mp, index);
500		mark_metapage_dirty(mp);
501		release_metapage(mp);
502	} else
503		set_cflag(COMMIT_Dirtable, ip);
504}
505
506/*
507 *	modify_index()
508 *
509 *	Changes an entry in the directory index table
510 */
511static void modify_index(tid_t tid, struct inode *ip, u32 index, s64 bn,
512			 int slot, struct metapage ** mp, s64 *lblock)
513{
514	struct dir_table_slot *dirtab_slot;
515
516	dirtab_slot = find_index(ip, index, mp, lblock);
517
518	if (!dirtab_slot)
519		return;
520
521	DTSaddress(dirtab_slot, bn);
522	dirtab_slot->slot = slot;
523
524	if (*mp) {
525		lock_index(tid, ip, *mp, index);
526		mark_metapage_dirty(*mp);
527	} else
528		set_cflag(COMMIT_Dirtable, ip);
529}
530
531/*
532 *	read_index()
533 *
534 *	reads a directory table slot
535 */
536static int read_index(struct inode *ip, u32 index,
537		     struct dir_table_slot * dirtab_slot)
538{
539	s64 lblock;
540	struct metapage *mp = NULL;
541	struct dir_table_slot *slot;
542
543	slot = find_index(ip, index, &mp, &lblock);
544	if (!slot) {
545		return -EIO;
546	}
547
548	memcpy(dirtab_slot, slot, sizeof(struct dir_table_slot));
549
550	if (mp)
551		release_metapage(mp);
552
553	return 0;
554}
555
556/*
557 *	dtSearch()
558 *
559 * function:
560 *	Search for the entry with specified key
561 *
562 * parameter:
563 *
564 * return: 0 - search result on stack, leaf page pinned;
565 *	   errno - I/O error
566 */
567int dtSearch(struct inode *ip, struct component_name * key, ino_t * data,
568	     struct btstack * btstack, int flag)
569{
570	int rc = 0;
571	int cmp = 1;		/* init for empty page */
572	s64 bn;
573	struct metapage *mp;
574	dtpage_t *p;
575	s8 *stbl;
576	int base, index, lim;
577	struct btframe *btsp;
578	pxd_t *pxd;
579	int psize = 288;	/* initial in-line directory */
580	ino_t inumber;
581	struct component_name ciKey;
582	struct super_block *sb = ip->i_sb;
583
584	ciKey.name = kmalloc_array(JFS_NAME_MAX + 1, sizeof(wchar_t),
585				   GFP_NOFS);
586	if (!ciKey.name) {
587		rc = -ENOMEM;
588		goto dtSearch_Exit2;
589	}
590
591
592	/* uppercase search key for c-i directory */
593	UniStrcpy(ciKey.name, key->name);
594	ciKey.namlen = key->namlen;
595
596	/* only uppercase if case-insensitive support is on */
597	if ((JFS_SBI(sb)->mntflag & JFS_OS2) == JFS_OS2) {
598		ciToUpper(&ciKey);
599	}
600	BT_CLR(btstack);	/* reset stack */
601
602	/* init level count for max pages to split */
603	btstack->nsplit = 1;
604
605	/*
606	 *	search down tree from root:
607	 *
608	 * between two consecutive entries of <Ki, Pi> and <Kj, Pj> of
609	 * internal page, child page Pi contains entry with k, Ki <= K < Kj.
610	 *
611	 * if entry with search key K is not found
612	 * internal page search find the entry with largest key Ki
613	 * less than K which point to the child page to search;
614	 * leaf page search find the entry with smallest key Kj
615	 * greater than K so that the returned index is the position of
616	 * the entry to be shifted right for insertion of new entry.
617	 * for empty tree, search key is greater than any key of the tree.
618	 *
619	 * by convention, root bn = 0.
620	 */
621	for (bn = 0;;) {
622		/* get/pin the page to search */
623		DT_GETPAGE(ip, bn, mp, psize, p, rc);
624		if (rc)
625			goto dtSearch_Exit1;
626
627		/* get sorted entry table of the page */
628		stbl = DT_GETSTBL(p);
629
630		/*
631		 * binary search with search key K on the current page.
632		 */
633		for (base = 0, lim = p->header.nextindex; lim; lim >>= 1) {
634			index = base + (lim >> 1);
635
636			if (stbl[index] < 0) {
637				rc = -EIO;
638				goto out;
639			}
640
641			if (p->header.flag & BT_LEAF) {
642				/* uppercase leaf name to compare */
643				cmp =
644				    ciCompare(&ciKey, p, stbl[index],
645					      JFS_SBI(sb)->mntflag);
646			} else {
647				/* router key is in uppercase */
648
649				cmp = dtCompare(&ciKey, p, stbl[index]);
650
651
652			}
653			if (cmp == 0) {
654				/*
655				 *	search hit
656				 */
657				/* search hit - leaf page:
658				 * return the entry found
659				 */
660				if (p->header.flag & BT_LEAF) {
661					inumber = le32_to_cpu(
662			((struct ldtentry *) & p->slot[stbl[index]])->inumber);
663
664					/*
665					 * search for JFS_LOOKUP
666					 */
667					if (flag == JFS_LOOKUP) {
668						*data = inumber;
669						rc = 0;
670						goto out;
671					}
672
673					/*
674					 * search for JFS_CREATE
675					 */
676					if (flag == JFS_CREATE) {
677						*data = inumber;
678						rc = -EEXIST;
679						goto out;
680					}
681
682					/*
683					 * search for JFS_REMOVE or JFS_RENAME
684					 */
685					if ((flag == JFS_REMOVE ||
686					     flag == JFS_RENAME) &&
687					    *data != inumber) {
688						rc = -ESTALE;
689						goto out;
690					}
691
692					/*
693					 * JFS_REMOVE|JFS_FINDDIR|JFS_RENAME
694					 */
695					/* save search result */
696					*data = inumber;
697					btsp = btstack->top;
698					btsp->bn = bn;
699					btsp->index = index;
700					btsp->mp = mp;
701
702					rc = 0;
703					goto dtSearch_Exit1;
704				}
705
706				/* search hit - internal page:
707				 * descend/search its child page
708				 */
709				goto getChild;
710			}
711
712			if (cmp > 0) {
713				base = index + 1;
714				--lim;
715			}
716		}
717
718		/*
719		 *	search miss
720		 *
721		 * base is the smallest index with key (Kj) greater than
722		 * search key (K) and may be zero or (maxindex + 1) index.
723		 */
724		/*
725		 * search miss - leaf page
726		 *
727		 * return location of entry (base) where new entry with
728		 * search key K is to be inserted.
729		 */
730		if (p->header.flag & BT_LEAF) {
731			/*
732			 * search for JFS_LOOKUP, JFS_REMOVE, or JFS_RENAME
733			 */
734			if (flag == JFS_LOOKUP || flag == JFS_REMOVE ||
735			    flag == JFS_RENAME) {
736				rc = -ENOENT;
737				goto out;
738			}
739
740			/*
741			 * search for JFS_CREATE|JFS_FINDDIR:
742			 *
743			 * save search result
744			 */
745			*data = 0;
746			btsp = btstack->top;
747			btsp->bn = bn;
748			btsp->index = base;
749			btsp->mp = mp;
750
751			rc = 0;
752			goto dtSearch_Exit1;
753		}
754
755		/*
756		 * search miss - internal page
757		 *
758		 * if base is non-zero, decrement base by one to get the parent
759		 * entry of the child page to search.
760		 */
761		index = base ? base - 1 : base;
762
763		/*
764		 * go down to child page
765		 */
766	      getChild:
767		/* update max. number of pages to split */
768		if (BT_STACK_FULL(btstack)) {
769			/* Something's corrupted, mark filesystem dirty so
770			 * chkdsk will fix it.
771			 */
772			jfs_error(sb, "stack overrun!\n");
773			BT_STACK_DUMP(btstack);
774			rc = -EIO;
775			goto out;
776		}
777		btstack->nsplit++;
778
779		/* push (bn, index) of the parent page/entry */
780		BT_PUSH(btstack, bn, index);
781
782		/* get the child page block number */
783		pxd = (pxd_t *) & p->slot[stbl[index]];
784		bn = addressPXD(pxd);
785		psize = lengthPXD(pxd) << JFS_SBI(ip->i_sb)->l2bsize;
786
787		/* unpin the parent page */
788		DT_PUTPAGE(mp);
789	}
790
791      out:
792	DT_PUTPAGE(mp);
793
794      dtSearch_Exit1:
795
796	kfree(ciKey.name);
797
798      dtSearch_Exit2:
799
800	return rc;
801}
802
803
804/*
805 *	dtInsert()
806 *
807 * function: insert an entry to directory tree
808 *
809 * parameter:
810 *
811 * return: 0 - success;
812 *	   errno - failure;
813 */
814int dtInsert(tid_t tid, struct inode *ip,
815	 struct component_name * name, ino_t * fsn, struct btstack * btstack)
816{
817	int rc = 0;
818	struct metapage *mp;	/* meta-page buffer */
819	dtpage_t *p;		/* base B+-tree index page */
820	s64 bn;
821	int index;
822	struct dtsplit split;	/* split information */
823	ddata_t data;
824	struct dt_lock *dtlck;
825	int n;
826	struct tlock *tlck;
827	struct lv *lv;
828
829	/*
830	 *	retrieve search result
831	 *
832	 * dtSearch() returns (leaf page pinned, index at which to insert).
833	 * n.b. dtSearch() may return index of (maxindex + 1) of
834	 * the full page.
835	 */
836	DT_GETSEARCH(ip, btstack->top, bn, mp, p, index);
837
838	/*
839	 *	insert entry for new key
840	 */
841	if (DO_INDEX(ip)) {
842		if (JFS_IP(ip)->next_index == DIREND) {
843			DT_PUTPAGE(mp);
844			return -EMLINK;
845		}
846		n = NDTLEAF(name->namlen);
847		data.leaf.tid = tid;
848		data.leaf.ip = ip;
849	} else {
850		n = NDTLEAF_LEGACY(name->namlen);
851		data.leaf.ip = NULL;	/* signifies legacy directory format */
852	}
853	data.leaf.ino = *fsn;
854
855	/*
856	 *	leaf page does not have enough room for new entry:
857	 *
858	 *	extend/split the leaf page;
859	 *
860	 * dtSplitUp() will insert the entry and unpin the leaf page.
861	 */
862	if (n > p->header.freecnt) {
863		split.mp = mp;
864		split.index = index;
865		split.nslot = n;
866		split.key = name;
867		split.data = &data;
868		rc = dtSplitUp(tid, ip, &split, btstack);
869		return rc;
870	}
871
872	/*
873	 *	leaf page does have enough room for new entry:
874	 *
875	 *	insert the new data entry into the leaf page;
876	 */
877	BT_MARK_DIRTY(mp, ip);
878	/*
879	 * acquire a transaction lock on the leaf page
880	 */
881	tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY);
882	dtlck = (struct dt_lock *) & tlck->lock;
883	ASSERT(dtlck->index == 0);
884	lv = & dtlck->lv[0];
885
886	/* linelock header */
887	lv->offset = 0;
888	lv->length = 1;
889	dtlck->index++;
890
891	dtInsertEntry(p, index, name, &data, &dtlck);
892
893	/* linelock stbl of non-root leaf page */
894	if (!(p->header.flag & BT_ROOT)) {
895		if (dtlck->index >= dtlck->maxcnt)
896			dtlck = (struct dt_lock *) txLinelock(dtlck);
897		lv = & dtlck->lv[dtlck->index];
898		n = index >> L2DTSLOTSIZE;
899		lv->offset = p->header.stblindex + n;
900		lv->length =
901		    ((p->header.nextindex - 1) >> L2DTSLOTSIZE) - n + 1;
902		dtlck->index++;
903	}
904
905	/* unpin the leaf page */
906	DT_PUTPAGE(mp);
907
908	return 0;
909}
910
911
912/*
913 *	dtSplitUp()
914 *
915 * function: propagate insertion bottom up;
916 *
917 * parameter:
918 *
919 * return: 0 - success;
920 *	   errno - failure;
921 *	leaf page unpinned;
922 */
923static int dtSplitUp(tid_t tid,
924	  struct inode *ip, struct dtsplit * split, struct btstack * btstack)
925{
926	struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
927	int rc = 0;
928	struct metapage *smp;
929	dtpage_t *sp;		/* split page */
930	struct metapage *rmp;
931	dtpage_t *rp;		/* new right page split from sp */
932	pxd_t rpxd;		/* new right page extent descriptor */
933	struct metapage *lmp;
934	dtpage_t *lp;		/* left child page */
935	int skip;		/* index of entry of insertion */
936	struct btframe *parent;	/* parent page entry on traverse stack */
937	s64 xaddr, nxaddr;
938	int xlen, xsize;
939	struct pxdlist pxdlist;
940	pxd_t *pxd;
941	struct component_name key = { 0, NULL };
942	ddata_t *data = split->data;
943	int n;
944	struct dt_lock *dtlck;
945	struct tlock *tlck;
946	struct lv *lv;
947	int quota_allocation = 0;
948
949	/* get split page */
950	smp = split->mp;
951	sp = DT_PAGE(ip, smp);
952
953	key.name = kmalloc_array(JFS_NAME_MAX + 2, sizeof(wchar_t), GFP_NOFS);
954	if (!key.name) {
955		DT_PUTPAGE(smp);
956		rc = -ENOMEM;
957		goto dtSplitUp_Exit;
958	}
959
960	/*
961	 *	split leaf page
962	 *
963	 * The split routines insert the new entry, and
964	 * acquire txLock as appropriate.
965	 */
966	/*
967	 *	split root leaf page:
968	 */
969	if (sp->header.flag & BT_ROOT) {
970		/*
971		 * allocate a single extent child page
972		 */
973		xlen = 1;
974		n = sbi->bsize >> L2DTSLOTSIZE;
975		n -= (n + 31) >> L2DTSLOTSIZE;	/* stbl size */
976		n -= DTROOTMAXSLOT - sp->header.freecnt; /* header + entries */
977		if (n <= split->nslot)
978			xlen++;
979		if ((rc = dbAlloc(ip, 0, (s64) xlen, &xaddr))) {
980			DT_PUTPAGE(smp);
981			goto freeKeyName;
982		}
983
984		pxdlist.maxnpxd = 1;
985		pxdlist.npxd = 0;
986		pxd = &pxdlist.pxd[0];
987		PXDaddress(pxd, xaddr);
988		PXDlength(pxd, xlen);
989		split->pxdlist = &pxdlist;
990		rc = dtSplitRoot(tid, ip, split, &rmp);
991
992		if (rc)
993			dbFree(ip, xaddr, xlen);
994		else
995			DT_PUTPAGE(rmp);
996
997		DT_PUTPAGE(smp);
998
999		if (!DO_INDEX(ip))
1000			ip->i_size = xlen << sbi->l2bsize;
1001
1002		goto freeKeyName;
1003	}
1004
1005	/*
1006	 *	extend first leaf page
1007	 *
1008	 * extend the 1st extent if less than buffer page size
1009	 * (dtExtendPage() reurns leaf page unpinned)
1010	 */
1011	pxd = &sp->header.self;
1012	xlen = lengthPXD(pxd);
1013	xsize = xlen << sbi->l2bsize;
1014	if (xsize < PSIZE) {
1015		xaddr = addressPXD(pxd);
1016		n = xsize >> L2DTSLOTSIZE;
1017		n -= (n + 31) >> L2DTSLOTSIZE;	/* stbl size */
1018		if ((n + sp->header.freecnt) <= split->nslot)
1019			n = xlen + (xlen << 1);
1020		else
1021			n = xlen;
1022
1023		/* Allocate blocks to quota. */
1024		rc = dquot_alloc_block(ip, n);
1025		if (rc)
1026			goto extendOut;
1027		quota_allocation += n;
1028
1029		if ((rc = dbReAlloc(sbi->ipbmap, xaddr, (s64) xlen,
1030				    (s64) n, &nxaddr)))
1031			goto extendOut;
1032
1033		pxdlist.maxnpxd = 1;
1034		pxdlist.npxd = 0;
1035		pxd = &pxdlist.pxd[0];
1036		PXDaddress(pxd, nxaddr);
1037		PXDlength(pxd, xlen + n);
1038		split->pxdlist = &pxdlist;
1039		if ((rc = dtExtendPage(tid, ip, split, btstack))) {
1040			nxaddr = addressPXD(pxd);
1041			if (xaddr != nxaddr) {
1042				/* free relocated extent */
1043				xlen = lengthPXD(pxd);
1044				dbFree(ip, nxaddr, (s64) xlen);
1045			} else {
1046				/* free extended delta */
1047				xlen = lengthPXD(pxd) - n;
1048				xaddr = addressPXD(pxd) + xlen;
1049				dbFree(ip, xaddr, (s64) n);
1050			}
1051		} else if (!DO_INDEX(ip))
1052			ip->i_size = lengthPXD(pxd) << sbi->l2bsize;
1053
1054
1055	      extendOut:
1056		DT_PUTPAGE(smp);
1057		goto freeKeyName;
1058	}
1059
1060	/*
1061	 *	split leaf page <sp> into <sp> and a new right page <rp>.
1062	 *
1063	 * return <rp> pinned and its extent descriptor <rpxd>
1064	 */
1065	/*
1066	 * allocate new directory page extent and
1067	 * new index page(s) to cover page split(s)
1068	 *
1069	 * allocation hint: ?
1070	 */
1071	n = btstack->nsplit;
1072	pxdlist.maxnpxd = pxdlist.npxd = 0;
1073	xlen = sbi->nbperpage;
1074	for (pxd = pxdlist.pxd; n > 0; n--, pxd++) {
1075		if ((rc = dbAlloc(ip, 0, (s64) xlen, &xaddr)) == 0) {
1076			PXDaddress(pxd, xaddr);
1077			PXDlength(pxd, xlen);
1078			pxdlist.maxnpxd++;
1079			continue;
1080		}
1081
1082		DT_PUTPAGE(smp);
1083
1084		/* undo allocation */
1085		goto splitOut;
1086	}
1087
1088	split->pxdlist = &pxdlist;
1089	if ((rc = dtSplitPage(tid, ip, split, &rmp, &rp, &rpxd))) {
1090		DT_PUTPAGE(smp);
1091
1092		/* undo allocation */
1093		goto splitOut;
1094	}
1095
1096	if (!DO_INDEX(ip))
1097		ip->i_size += PSIZE;
1098
1099	/*
1100	 * propagate up the router entry for the leaf page just split
1101	 *
1102	 * insert a router entry for the new page into the parent page,
1103	 * propagate the insert/split up the tree by walking back the stack
1104	 * of (bn of parent page, index of child page entry in parent page)
1105	 * that were traversed during the search for the page that split.
1106	 *
1107	 * the propagation of insert/split up the tree stops if the root
1108	 * splits or the page inserted into doesn't have to split to hold
1109	 * the new entry.
1110	 *
1111	 * the parent entry for the split page remains the same, and
1112	 * a new entry is inserted at its right with the first key and
1113	 * block number of the new right page.
1114	 *
1115	 * There are a maximum of 4 pages pinned at any time:
1116	 * two children, left parent and right parent (when the parent splits).
1117	 * keep the child pages pinned while working on the parent.
1118	 * make sure that all pins are released at exit.
1119	 */
1120	while ((parent = BT_POP(btstack)) != NULL) {
1121		/* parent page specified by stack frame <parent> */
1122
1123		/* keep current child pages (<lp>, <rp>) pinned */
1124		lmp = smp;
1125		lp = sp;
1126
1127		/*
1128		 * insert router entry in parent for new right child page <rp>
1129		 */
1130		/* get the parent page <sp> */
1131		DT_GETPAGE(ip, parent->bn, smp, PSIZE, sp, rc);
1132		if (rc) {
1133			DT_PUTPAGE(lmp);
1134			DT_PUTPAGE(rmp);
1135			goto splitOut;
1136		}
1137
1138		/*
1139		 * The new key entry goes ONE AFTER the index of parent entry,
1140		 * because the split was to the right.
1141		 */
1142		skip = parent->index + 1;
1143
1144		/*
1145		 * compute the key for the router entry
1146		 *
1147		 * key suffix compression:
1148		 * for internal pages that have leaf pages as children,
1149		 * retain only what's needed to distinguish between
1150		 * the new entry and the entry on the page to its left.
1151		 * If the keys compare equal, retain the entire key.
1152		 *
1153		 * note that compression is performed only at computing
1154		 * router key at the lowest internal level.
1155		 * further compression of the key between pairs of higher
1156		 * level internal pages loses too much information and
1157		 * the search may fail.
1158		 * (e.g., two adjacent leaf pages of {a, ..., x} {xx, ...,}
1159		 * results in two adjacent parent entries (a)(xx).
1160		 * if split occurs between these two entries, and
1161		 * if compression is applied, the router key of parent entry
1162		 * of right page (x) will divert search for x into right
1163		 * subtree and miss x in the left subtree.)
1164		 *
1165		 * the entire key must be retained for the next-to-leftmost
1166		 * internal key at any level of the tree, or search may fail
1167		 * (e.g., ?)
1168		 */
1169		switch (rp->header.flag & BT_TYPE) {
1170		case BT_LEAF:
1171			/*
1172			 * compute the length of prefix for suffix compression
1173			 * between last entry of left page and first entry
1174			 * of right page
1175			 */
1176			if ((sp->header.flag & BT_ROOT && skip > 1) ||
1177			    sp->header.prev != 0 || skip > 1) {
1178				/* compute uppercase router prefix key */
1179				rc = ciGetLeafPrefixKey(lp,
1180							lp->header.nextindex-1,
1181							rp, 0, &key,
1182							sbi->mntflag);
1183				if (rc) {
1184					DT_PUTPAGE(lmp);
1185					DT_PUTPAGE(rmp);
1186					DT_PUTPAGE(smp);
1187					goto splitOut;
1188				}
1189			} else {
1190				/* next to leftmost entry of
1191				   lowest internal level */
1192
1193				/* compute uppercase router key */
1194				dtGetKey(rp, 0, &key, sbi->mntflag);
1195				key.name[key.namlen] = 0;
1196
1197				if ((sbi->mntflag & JFS_OS2) == JFS_OS2)
1198					ciToUpper(&key);
1199			}
1200
1201			n = NDTINTERNAL(key.namlen);
1202			break;
1203
1204		case BT_INTERNAL:
1205			dtGetKey(rp, 0, &key, sbi->mntflag);
1206			n = NDTINTERNAL(key.namlen);
1207			break;
1208
1209		default:
1210			jfs_err("dtSplitUp(): UFO!");
1211			break;
1212		}
1213
1214		/* unpin left child page */
1215		DT_PUTPAGE(lmp);
1216
1217		/*
1218		 * compute the data for the router entry
1219		 */
1220		data->xd = rpxd;	/* child page xd */
1221
1222		/*
1223		 * parent page is full - split the parent page
1224		 */
1225		if (n > sp->header.freecnt) {
1226			/* init for parent page split */
1227			split->mp = smp;
1228			split->index = skip;	/* index at insert */
1229			split->nslot = n;
1230			split->key = &key;
1231			/* split->data = data; */
1232
1233			/* unpin right child page */
1234			DT_PUTPAGE(rmp);
1235
1236			/* The split routines insert the new entry,
1237			 * acquire txLock as appropriate.
1238			 * return <rp> pinned and its block number <rbn>.
1239			 */
1240			rc = (sp->header.flag & BT_ROOT) ?
1241			    dtSplitRoot(tid, ip, split, &rmp) :
1242			    dtSplitPage(tid, ip, split, &rmp, &rp, &rpxd);
1243			if (rc) {
1244				DT_PUTPAGE(smp);
1245				goto splitOut;
1246			}
1247
1248			/* smp and rmp are pinned */
1249		}
1250		/*
1251		 * parent page is not full - insert router entry in parent page
1252		 */
1253		else {
1254			BT_MARK_DIRTY(smp, ip);
1255			/*
1256			 * acquire a transaction lock on the parent page
1257			 */
1258			tlck = txLock(tid, ip, smp, tlckDTREE | tlckENTRY);
1259			dtlck = (struct dt_lock *) & tlck->lock;
1260			ASSERT(dtlck->index == 0);
1261			lv = & dtlck->lv[0];
1262
1263			/* linelock header */
1264			lv->offset = 0;
1265			lv->length = 1;
1266			dtlck->index++;
1267
1268			/* linelock stbl of non-root parent page */
1269			if (!(sp->header.flag & BT_ROOT)) {
1270				lv++;
1271				n = skip >> L2DTSLOTSIZE;
1272				lv->offset = sp->header.stblindex + n;
1273				lv->length =
1274				    ((sp->header.nextindex -
1275				      1) >> L2DTSLOTSIZE) - n + 1;
1276				dtlck->index++;
1277			}
1278
1279			dtInsertEntry(sp, skip, &key, data, &dtlck);
1280
1281			/* exit propagate up */
1282			break;
1283		}
1284	}
1285
1286	/* unpin current split and its right page */
1287	DT_PUTPAGE(smp);
1288	DT_PUTPAGE(rmp);
1289
1290	/*
1291	 * free remaining extents allocated for split
1292	 */
1293      splitOut:
1294	n = pxdlist.npxd;
1295	pxd = &pxdlist.pxd[n];
1296	for (; n < pxdlist.maxnpxd; n++, pxd++)
1297		dbFree(ip, addressPXD(pxd), (s64) lengthPXD(pxd));
1298
1299      freeKeyName:
1300	kfree(key.name);
1301
1302	/* Rollback quota allocation */
1303	if (rc && quota_allocation)
1304		dquot_free_block(ip, quota_allocation);
1305
1306      dtSplitUp_Exit:
1307
1308	return rc;
1309}
1310
1311
1312/*
1313 *	dtSplitPage()
1314 *
1315 * function: Split a non-root page of a btree.
1316 *
1317 * parameter:
1318 *
1319 * return: 0 - success;
1320 *	   errno - failure;
1321 *	return split and new page pinned;
1322 */
1323static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split,
1324	    struct metapage ** rmpp, dtpage_t ** rpp, pxd_t * rpxdp)
1325{
1326	int rc = 0;
1327	struct metapage *smp;
1328	dtpage_t *sp;
1329	struct metapage *rmp;
1330	dtpage_t *rp;		/* new right page allocated */
1331	s64 rbn;		/* new right page block number */
1332	struct metapage *mp;
1333	dtpage_t *p;
1334	s64 nextbn;
1335	struct pxdlist *pxdlist;
1336	pxd_t *pxd;
1337	int skip, nextindex, half, left, nxt, off, si;
1338	struct ldtentry *ldtentry;
1339	struct idtentry *idtentry;
1340	u8 *stbl;
1341	struct dtslot *f;
1342	int fsi, stblsize;
1343	int n;
1344	struct dt_lock *sdtlck, *rdtlck;
1345	struct tlock *tlck;
1346	struct dt_lock *dtlck;
1347	struct lv *slv, *rlv, *lv;
1348
1349	/* get split page */
1350	smp = split->mp;
1351	sp = DT_PAGE(ip, smp);
1352
1353	/*
1354	 * allocate the new right page for the split
1355	 */
1356	pxdlist = split->pxdlist;
1357	pxd = &pxdlist->pxd[pxdlist->npxd];
1358	pxdlist->npxd++;
1359	rbn = addressPXD(pxd);
1360	rmp = get_metapage(ip, rbn, PSIZE, 1);
1361	if (rmp == NULL)
1362		return -EIO;
1363
1364	/* Allocate blocks to quota. */
1365	rc = dquot_alloc_block(ip, lengthPXD(pxd));
1366	if (rc) {
1367		release_metapage(rmp);
1368		return rc;
1369	}
1370
1371	jfs_info("dtSplitPage: ip:0x%p smp:0x%p rmp:0x%p", ip, smp, rmp);
1372
1373	BT_MARK_DIRTY(rmp, ip);
1374	/*
1375	 * acquire a transaction lock on the new right page
1376	 */
1377	tlck = txLock(tid, ip, rmp, tlckDTREE | tlckNEW);
1378	rdtlck = (struct dt_lock *) & tlck->lock;
1379
1380	rp = (dtpage_t *) rmp->data;
1381	*rpp = rp;
1382	rp->header.self = *pxd;
1383
1384	BT_MARK_DIRTY(smp, ip);
1385	/*
1386	 * acquire a transaction lock on the split page
1387	 *
1388	 * action:
1389	 */
1390	tlck = txLock(tid, ip, smp, tlckDTREE | tlckENTRY);
1391	sdtlck = (struct dt_lock *) & tlck->lock;
1392
1393	/* linelock header of split page */
1394	ASSERT(sdtlck->index == 0);
1395	slv = & sdtlck->lv[0];
1396	slv->offset = 0;
1397	slv->length = 1;
1398	sdtlck->index++;
1399
1400	/*
1401	 * initialize/update sibling pointers between sp and rp
1402	 */
1403	nextbn = le64_to_cpu(sp->header.next);
1404	rp->header.next = cpu_to_le64(nextbn);
1405	rp->header.prev = cpu_to_le64(addressPXD(&sp->header.self));
1406	sp->header.next = cpu_to_le64(rbn);
1407
1408	/*
1409	 * initialize new right page
1410	 */
1411	rp->header.flag = sp->header.flag;
1412
1413	/* compute sorted entry table at start of extent data area */
1414	rp->header.nextindex = 0;
1415	rp->header.stblindex = 1;
1416
1417	n = PSIZE >> L2DTSLOTSIZE;
1418	rp->header.maxslot = n;
1419	stblsize = (n + 31) >> L2DTSLOTSIZE;	/* in unit of slot */
1420
1421	/* init freelist */
1422	fsi = rp->header.stblindex + stblsize;
1423	rp->header.freelist = fsi;
1424	rp->header.freecnt = rp->header.maxslot - fsi;
1425
1426	/*
1427	 *	sequential append at tail: append without split
1428	 *
1429	 * If splitting the last page on a level because of appending
1430	 * a entry to it (skip is maxentry), it's likely that the access is
1431	 * sequential. Adding an empty page on the side of the level is less
1432	 * work and can push the fill factor much higher than normal.
1433	 * If we're wrong it's no big deal, we'll just do the split the right
1434	 * way next time.
1435	 * (It may look like it's equally easy to do a similar hack for
1436	 * reverse sorted data, that is, split the tree left,
1437	 * but it's not. Be my guest.)
1438	 */
1439	if (nextbn == 0 && split->index == sp->header.nextindex) {
1440		/* linelock header + stbl (first slot) of new page */
1441		rlv = & rdtlck->lv[rdtlck->index];
1442		rlv->offset = 0;
1443		rlv->length = 2;
1444		rdtlck->index++;
1445
1446		/*
1447		 * initialize freelist of new right page
1448		 */
1449		f = &rp->slot[fsi];
1450		for (fsi++; fsi < rp->header.maxslot; f++, fsi++)
1451			f->next = fsi;
1452		f->next = -1;
1453
1454		/* insert entry at the first entry of the new right page */
1455		dtInsertEntry(rp, 0, split->key, split->data, &rdtlck);
1456
1457		goto out;
1458	}
1459
1460	/*
1461	 *	non-sequential insert (at possibly middle page)
1462	 */
1463
1464	/*
1465	 * update prev pointer of previous right sibling page;
1466	 */
1467	if (nextbn != 0) {
1468		DT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc);
1469		if (rc) {
1470			discard_metapage(rmp);
1471			return rc;
1472		}
1473
1474		BT_MARK_DIRTY(mp, ip);
1475		/*
1476		 * acquire a transaction lock on the next page
1477		 */
1478		tlck = txLock(tid, ip, mp, tlckDTREE | tlckRELINK);
1479		jfs_info("dtSplitPage: tlck = 0x%p, ip = 0x%p, mp=0x%p",
1480			tlck, ip, mp);
1481		dtlck = (struct dt_lock *) & tlck->lock;
1482
1483		/* linelock header of previous right sibling page */
1484		lv = & dtlck->lv[dtlck->index];
1485		lv->offset = 0;
1486		lv->length = 1;
1487		dtlck->index++;
1488
1489		p->header.prev = cpu_to_le64(rbn);
1490
1491		DT_PUTPAGE(mp);
1492	}
1493
1494	/*
1495	 * split the data between the split and right pages.
1496	 */
1497	skip = split->index;
1498	half = (PSIZE >> L2DTSLOTSIZE) >> 1;	/* swag */
1499	left = 0;
1500
1501	/*
1502	 *	compute fill factor for split pages
1503	 *
1504	 * <nxt> traces the next entry to move to rp
1505	 * <off> traces the next entry to stay in sp
1506	 */
1507	stbl = (u8 *) & sp->slot[sp->header.stblindex];
1508	nextindex = sp->header.nextindex;
1509	for (nxt = off = 0; nxt < nextindex; ++off) {
1510		if (off == skip)
1511			/* check for fill factor with new entry size */
1512			n = split->nslot;
1513		else {
1514			si = stbl[nxt];
1515			switch (sp->header.flag & BT_TYPE) {
1516			case BT_LEAF:
1517				ldtentry = (struct ldtentry *) & sp->slot[si];
1518				if (DO_INDEX(ip))
1519					n = NDTLEAF(ldtentry->namlen);
1520				else
1521					n = NDTLEAF_LEGACY(ldtentry->
1522							   namlen);
1523				break;
1524
1525			case BT_INTERNAL:
1526				idtentry = (struct idtentry *) & sp->slot[si];
1527				n = NDTINTERNAL(idtentry->namlen);
1528				break;
1529
1530			default:
1531				break;
1532			}
1533
1534			++nxt;	/* advance to next entry to move in sp */
1535		}
1536
1537		left += n;
1538		if (left >= half)
1539			break;
1540	}
1541
1542	/* <nxt> poins to the 1st entry to move */
1543
1544	/*
1545	 *	move entries to right page
1546	 *
1547	 * dtMoveEntry() initializes rp and reserves entry for insertion
1548	 *
1549	 * split page moved out entries are linelocked;
1550	 * new/right page moved in entries are linelocked;
1551	 */
1552	/* linelock header + stbl of new right page */
1553	rlv = & rdtlck->lv[rdtlck->index];
1554	rlv->offset = 0;
1555	rlv->length = 5;
1556	rdtlck->index++;
1557
1558	dtMoveEntry(sp, nxt, rp, &sdtlck, &rdtlck, DO_INDEX(ip));
1559
1560	sp->header.nextindex = nxt;
1561
1562	/*
1563	 * finalize freelist of new right page
1564	 */
1565	fsi = rp->header.freelist;
1566	f = &rp->slot[fsi];
1567	for (fsi++; fsi < rp->header.maxslot; f++, fsi++)
1568		f->next = fsi;
1569	f->next = -1;
1570
1571	/*
1572	 * Update directory index table for entries now in right page
1573	 */
1574	if ((rp->header.flag & BT_LEAF) && DO_INDEX(ip)) {
1575		s64 lblock;
1576
1577		mp = NULL;
1578		stbl = DT_GETSTBL(rp);
1579		for (n = 0; n < rp->header.nextindex; n++) {
1580			ldtentry = (struct ldtentry *) & rp->slot[stbl[n]];
1581			modify_index(tid, ip, le32_to_cpu(ldtentry->index),
1582				     rbn, n, &mp, &lblock);
1583		}
1584		if (mp)
1585			release_metapage(mp);
1586	}
1587
1588	/*
1589	 * the skipped index was on the left page,
1590	 */
1591	if (skip <= off) {
1592		/* insert the new entry in the split page */
1593		dtInsertEntry(sp, skip, split->key, split->data, &sdtlck);
1594
1595		/* linelock stbl of split page */
1596		if (sdtlck->index >= sdtlck->maxcnt)
1597			sdtlck = (struct dt_lock *) txLinelock(sdtlck);
1598		slv = & sdtlck->lv[sdtlck->index];
1599		n = skip >> L2DTSLOTSIZE;
1600		slv->offset = sp->header.stblindex + n;
1601		slv->length =
1602		    ((sp->header.nextindex - 1) >> L2DTSLOTSIZE) - n + 1;
1603		sdtlck->index++;
1604	}
1605	/*
1606	 * the skipped index was on the right page,
1607	 */
1608	else {
1609		/* adjust the skip index to reflect the new position */
1610		skip -= nxt;
1611
1612		/* insert the new entry in the right page */
1613		dtInsertEntry(rp, skip, split->key, split->data, &rdtlck);
1614	}
1615
1616      out:
1617	*rmpp = rmp;
1618	*rpxdp = *pxd;
1619
1620	return rc;
1621}
1622
1623
1624/*
1625 *	dtExtendPage()
1626 *
1627 * function: extend 1st/only directory leaf page
1628 *
1629 * parameter:
1630 *
1631 * return: 0 - success;
1632 *	   errno - failure;
1633 *	return extended page pinned;
1634 */
1635static int dtExtendPage(tid_t tid,
1636	     struct inode *ip, struct dtsplit * split, struct btstack * btstack)
1637{
1638	struct super_block *sb = ip->i_sb;
1639	int rc;
1640	struct metapage *smp, *pmp, *mp;
1641	dtpage_t *sp, *pp;
1642	struct pxdlist *pxdlist;
1643	pxd_t *pxd, *tpxd;
1644	int xlen, xsize;
1645	int newstblindex, newstblsize;
1646	int oldstblindex, oldstblsize;
1647	int fsi, last;
1648	struct dtslot *f;
1649	struct btframe *parent;
1650	int n;
1651	struct dt_lock *dtlck;
1652	s64 xaddr, txaddr;
1653	struct tlock *tlck;
1654	struct pxd_lock *pxdlock;
1655	struct lv *lv;
1656	uint type;
1657	struct ldtentry *ldtentry;
1658	u8 *stbl;
1659
1660	/* get page to extend */
1661	smp = split->mp;
1662	sp = DT_PAGE(ip, smp);
1663
1664	/* get parent/root page */
1665	parent = BT_POP(btstack);
1666	DT_GETPAGE(ip, parent->bn, pmp, PSIZE, pp, rc);
1667	if (rc)
1668		return (rc);
1669
1670	/*
1671	 *	extend the extent
1672	 */
1673	pxdlist = split->pxdlist;
1674	pxd = &pxdlist->pxd[pxdlist->npxd];
1675	pxdlist->npxd++;
1676
1677	xaddr = addressPXD(pxd);
1678	tpxd = &sp->header.self;
1679	txaddr = addressPXD(tpxd);
1680	/* in-place extension */
1681	if (xaddr == txaddr) {
1682		type = tlckEXTEND;
1683	}
1684	/* relocation */
1685	else {
1686		type = tlckNEW;
1687
1688		/* save moved extent descriptor for later free */
1689		tlck = txMaplock(tid, ip, tlckDTREE | tlckRELOCATE);
1690		pxdlock = (struct pxd_lock *) & tlck->lock;
1691		pxdlock->flag = mlckFREEPXD;
1692		pxdlock->pxd = sp->header.self;
1693		pxdlock->index = 1;
1694
1695		/*
1696		 * Update directory index table to reflect new page address
1697		 */
1698		if (DO_INDEX(ip)) {
1699			s64 lblock;
1700
1701			mp = NULL;
1702			stbl = DT_GETSTBL(sp);
1703			for (n = 0; n < sp->header.nextindex; n++) {
1704				ldtentry =
1705				    (struct ldtentry *) & sp->slot[stbl[n]];
1706				modify_index(tid, ip,
1707					     le32_to_cpu(ldtentry->index),
1708					     xaddr, n, &mp, &lblock);
1709			}
1710			if (mp)
1711				release_metapage(mp);
1712		}
1713	}
1714
1715	/*
1716	 *	extend the page
1717	 */
1718	sp->header.self = *pxd;
1719
1720	jfs_info("dtExtendPage: ip:0x%p smp:0x%p sp:0x%p", ip, smp, sp);
1721
1722	BT_MARK_DIRTY(smp, ip);
1723	/*
1724	 * acquire a transaction lock on the extended/leaf page
1725	 */
1726	tlck = txLock(tid, ip, smp, tlckDTREE | type);
1727	dtlck = (struct dt_lock *) & tlck->lock;
1728	lv = & dtlck->lv[0];
1729
1730	/* update buffer extent descriptor of extended page */
1731	xlen = lengthPXD(pxd);
1732	xsize = xlen << JFS_SBI(sb)->l2bsize;
1733
1734	/*
1735	 * copy old stbl to new stbl at start of extended area
1736	 */
1737	oldstblindex = sp->header.stblindex;
1738	oldstblsize = (sp->header.maxslot + 31) >> L2DTSLOTSIZE;
1739	newstblindex = sp->header.maxslot;
1740	n = xsize >> L2DTSLOTSIZE;
1741	newstblsize = (n + 31) >> L2DTSLOTSIZE;
1742	memcpy(&sp->slot[newstblindex], &sp->slot[oldstblindex],
1743	       sp->header.nextindex);
1744
1745	/*
1746	 * in-line extension: linelock old area of extended page
1747	 */
1748	if (type == tlckEXTEND) {
1749		/* linelock header */
1750		lv->offset = 0;
1751		lv->length = 1;
1752		dtlck->index++;
1753		lv++;
1754
1755		/* linelock new stbl of extended page */
1756		lv->offset = newstblindex;
1757		lv->length = newstblsize;
1758	}
1759	/*
1760	 * relocation: linelock whole relocated area
1761	 */
1762	else {
1763		lv->offset = 0;
1764		lv->length = sp->header.maxslot + newstblsize;
1765	}
1766
1767	dtlck->index++;
1768
1769	sp->header.maxslot = n;
1770	sp->header.stblindex = newstblindex;
1771	/* sp->header.nextindex remains the same */
1772
1773	/*
1774	 * add old stbl region at head of freelist
1775	 */
1776	fsi = oldstblindex;
1777	f = &sp->slot[fsi];
1778	last = sp->header.freelist;
1779	for (n = 0; n < oldstblsize; n++, fsi++, f++) {
1780		f->next = last;
1781		last = fsi;
1782	}
1783	sp->header.freelist = last;
1784	sp->header.freecnt += oldstblsize;
1785
1786	/*
1787	 * append free region of newly extended area at tail of freelist
1788	 */
1789	/* init free region of newly extended area */
1790	fsi = n = newstblindex + newstblsize;
1791	f = &sp->slot[fsi];
1792	for (fsi++; fsi < sp->header.maxslot; f++, fsi++)
1793		f->next = fsi;
1794	f->next = -1;
1795
1796	/* append new free region at tail of old freelist */
1797	fsi = sp->header.freelist;
1798	if (fsi == -1)
1799		sp->header.freelist = n;
1800	else {
1801		do {
1802			f = &sp->slot[fsi];
1803			fsi = f->next;
1804		} while (fsi != -1);
1805
1806		f->next = n;
1807	}
1808
1809	sp->header.freecnt += sp->header.maxslot - n;
1810
1811	/*
1812	 * insert the new entry
1813	 */
1814	dtInsertEntry(sp, split->index, split->key, split->data, &dtlck);
1815
1816	BT_MARK_DIRTY(pmp, ip);
1817	/*
1818	 * linelock any freeslots residing in old extent
1819	 */
1820	if (type == tlckEXTEND) {
1821		n = sp->header.maxslot >> 2;
1822		if (sp->header.freelist < n)
1823			dtLinelockFreelist(sp, n, &dtlck);
1824	}
1825
1826	/*
1827	 *	update parent entry on the parent/root page
1828	 */
1829	/*
1830	 * acquire a transaction lock on the parent/root page
1831	 */
1832	tlck = txLock(tid, ip, pmp, tlckDTREE | tlckENTRY);
1833	dtlck = (struct dt_lock *) & tlck->lock;
1834	lv = & dtlck->lv[dtlck->index];
1835
1836	/* linelock parent entry - 1st slot */
1837	lv->offset = 1;
1838	lv->length = 1;
1839	dtlck->index++;
1840
1841	/* update the parent pxd for page extension */
1842	tpxd = (pxd_t *) & pp->slot[1];
1843	*tpxd = *pxd;
1844
1845	DT_PUTPAGE(pmp);
1846	return 0;
1847}
1848
1849
1850/*
1851 *	dtSplitRoot()
1852 *
1853 * function:
1854 *	split the full root page into
1855 *	original/root/split page and new right page
1856 *	i.e., root remains fixed in tree anchor (inode) and
1857 *	the root is copied to a single new right child page
1858 *	since root page << non-root page, and
1859 *	the split root page contains a single entry for the
1860 *	new right child page.
1861 *
1862 * parameter:
1863 *
1864 * return: 0 - success;
1865 *	   errno - failure;
1866 *	return new page pinned;
1867 */
1868static int dtSplitRoot(tid_t tid,
1869	    struct inode *ip, struct dtsplit * split, struct metapage ** rmpp)
1870{
1871	struct super_block *sb = ip->i_sb;
1872	struct metapage *smp;
1873	dtroot_t *sp;
1874	struct metapage *rmp;
1875	dtpage_t *rp;
1876	s64 rbn;
1877	int xlen;
1878	int xsize;
1879	struct dtslot *f;
1880	s8 *stbl;
1881	int fsi, stblsize, n;
1882	struct idtentry *s;
1883	pxd_t *ppxd;
1884	struct pxdlist *pxdlist;
1885	pxd_t *pxd;
1886	struct dt_lock *dtlck;
1887	struct tlock *tlck;
1888	struct lv *lv;
1889	int rc;
1890
1891	/* get split root page */
1892	smp = split->mp;
1893	sp = &JFS_IP(ip)->i_dtroot;
1894
1895	/*
1896	 *	allocate/initialize a single (right) child page
1897	 *
1898	 * N.B. at first split, a one (or two) block to fit new entry
1899	 * is allocated; at subsequent split, a full page is allocated;
1900	 */
1901	pxdlist = split->pxdlist;
1902	pxd = &pxdlist->pxd[pxdlist->npxd];
1903	pxdlist->npxd++;
1904	rbn = addressPXD(pxd);
1905	xlen = lengthPXD(pxd);
1906	xsize = xlen << JFS_SBI(sb)->l2bsize;
1907	rmp = get_metapage(ip, rbn, xsize, 1);
1908	if (!rmp)
1909		return -EIO;
1910
1911	rp = rmp->data;
1912
1913	/* Allocate blocks to quota. */
1914	rc = dquot_alloc_block(ip, lengthPXD(pxd));
1915	if (rc) {
1916		release_metapage(rmp);
1917		return rc;
1918	}
1919
1920	BT_MARK_DIRTY(rmp, ip);
1921	/*
1922	 * acquire a transaction lock on the new right page
1923	 */
1924	tlck = txLock(tid, ip, rmp, tlckDTREE | tlckNEW);
1925	dtlck = (struct dt_lock *) & tlck->lock;
1926
1927	rp->header.flag =
1928	    (sp->header.flag & BT_LEAF) ? BT_LEAF : BT_INTERNAL;
1929	rp->header.self = *pxd;
1930
1931	/* initialize sibling pointers */
1932	rp->header.next = 0;
1933	rp->header.prev = 0;
1934
1935	/*
1936	 *	move in-line root page into new right page extent
1937	 */
1938	/* linelock header + copied entries + new stbl (1st slot) in new page */
1939	ASSERT(dtlck->index == 0);
1940	lv = & dtlck->lv[0];
1941	lv->offset = 0;
1942	lv->length = 10;	/* 1 + 8 + 1 */
1943	dtlck->index++;
1944
1945	n = xsize >> L2DTSLOTSIZE;
1946	rp->header.maxslot = n;
1947	stblsize = (n + 31) >> L2DTSLOTSIZE;
1948
1949	/* copy old stbl to new stbl at start of extended area */
1950	rp->header.stblindex = DTROOTMAXSLOT;
1951	stbl = (s8 *) & rp->slot[DTROOTMAXSLOT];
1952	memcpy(stbl, sp->header.stbl, sp->header.nextindex);
1953	rp->header.nextindex = sp->header.nextindex;
1954
1955	/* copy old data area to start of new data area */
1956	memcpy(&rp->slot[1], &sp->slot[1], IDATASIZE);
1957
1958	/*
1959	 * append free region of newly extended area at tail of freelist
1960	 */
1961	/* init free region of newly extended area */
1962	fsi = n = DTROOTMAXSLOT + stblsize;
1963	f = &rp->slot[fsi];
1964	for (fsi++; fsi < rp->header.maxslot; f++, fsi++)
1965		f->next = fsi;
1966	f->next = -1;
1967
1968	/* append new free region at tail of old freelist */
1969	fsi = sp->header.freelist;
1970	if (fsi == -1)
1971		rp->header.freelist = n;
1972	else {
1973		rp->header.freelist = fsi;
1974
1975		do {
1976			f = &rp->slot[fsi];
1977			fsi = f->next;
1978		} while (fsi >= 0);
1979
1980		f->next = n;
1981	}
1982
1983	rp->header.freecnt = sp->header.freecnt + rp->header.maxslot - n;
1984
1985	/*
1986	 * Update directory index table for entries now in right page
1987	 */
1988	if ((rp->header.flag & BT_LEAF) && DO_INDEX(ip)) {
1989		s64 lblock;
1990		struct metapage *mp = NULL;
1991		struct ldtentry *ldtentry;
1992
1993		stbl = DT_GETSTBL(rp);
1994		for (n = 0; n < rp->header.nextindex; n++) {
1995			ldtentry = (struct ldtentry *) & rp->slot[stbl[n]];
1996			modify_index(tid, ip, le32_to_cpu(ldtentry->index),
1997				     rbn, n, &mp, &lblock);
1998		}
1999		if (mp)
2000			release_metapage(mp);
2001	}
2002	/*
2003	 * insert the new entry into the new right/child page
2004	 * (skip index in the new right page will not change)
2005	 */
2006	dtInsertEntry(rp, split->index, split->key, split->data, &dtlck);
2007
2008	/*
2009	 *	reset parent/root page
2010	 *
2011	 * set the 1st entry offset to 0, which force the left-most key
2012	 * at any level of the tree to be less than any search key.
2013	 *
2014	 * The btree comparison code guarantees that the left-most key on any
2015	 * level of the tree is never used, so it doesn't need to be filled in.
2016	 */
2017	BT_MARK_DIRTY(smp, ip);
2018	/*
2019	 * acquire a transaction lock on the root page (in-memory inode)
2020	 */
2021	tlck = txLock(tid, ip, smp, tlckDTREE | tlckNEW | tlckBTROOT);
2022	dtlck = (struct dt_lock *) & tlck->lock;
2023
2024	/* linelock root */
2025	ASSERT(dtlck->index == 0);
2026	lv = & dtlck->lv[0];
2027	lv->offset = 0;
2028	lv->length = DTROOTMAXSLOT;
2029	dtlck->index++;
2030
2031	/* update page header of root */
2032	if (sp->header.flag & BT_LEAF) {
2033		sp->header.flag &= ~BT_LEAF;
2034		sp->header.flag |= BT_INTERNAL;
2035	}
2036
2037	/* init the first entry */
2038	s = (struct idtentry *) & sp->slot[DTENTRYSTART];
2039	ppxd = (pxd_t *) s;
2040	*ppxd = *pxd;
2041	s->next = -1;
2042	s->namlen = 0;
2043
2044	stbl = sp->header.stbl;
2045	stbl[0] = DTENTRYSTART;
2046	sp->header.nextindex = 1;
2047
2048	/* init freelist */
2049	fsi = DTENTRYSTART + 1;
2050	f = &sp->slot[fsi];
2051
2052	/* init free region of remaining area */
2053	for (fsi++; fsi < DTROOTMAXSLOT; f++, fsi++)
2054		f->next = fsi;
2055	f->next = -1;
2056
2057	sp->header.freelist = DTENTRYSTART + 1;
2058	sp->header.freecnt = DTROOTMAXSLOT - (DTENTRYSTART + 1);
2059
2060	*rmpp = rmp;
2061
2062	return 0;
2063}
2064
2065
2066/*
2067 *	dtDelete()
2068 *
2069 * function: delete the entry(s) referenced by a key.
2070 *
2071 * parameter:
2072 *
2073 * return:
2074 */
2075int dtDelete(tid_t tid,
2076	 struct inode *ip, struct component_name * key, ino_t * ino, int flag)
2077{
2078	int rc = 0;
2079	s64 bn;
2080	struct metapage *mp, *imp;
2081	dtpage_t *p;
2082	int index;
2083	struct btstack btstack;
2084	struct dt_lock *dtlck;
2085	struct tlock *tlck;
2086	struct lv *lv;
2087	int i;
2088	struct ldtentry *ldtentry;
2089	u8 *stbl;
2090	u32 table_index, next_index;
2091	struct metapage *nmp;
2092	dtpage_t *np;
2093
2094	/*
2095	 *	search for the entry to delete:
2096	 *
2097	 * dtSearch() returns (leaf page pinned, index at which to delete).
2098	 */
2099	if ((rc = dtSearch(ip, key, ino, &btstack, flag)))
2100		return rc;
2101
2102	/* retrieve search result */
2103	DT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
2104
2105	/*
2106	 * We need to find put the index of the next entry into the
2107	 * directory index table in order to resume a readdir from this
2108	 * entry.
2109	 */
2110	if (DO_INDEX(ip)) {
2111		stbl = DT_GETSTBL(p);
2112		ldtentry = (struct ldtentry *) & p->slot[stbl[index]];
2113		table_index = le32_to_cpu(ldtentry->index);
2114		if (index == (p->header.nextindex - 1)) {
2115			/*
2116			 * Last entry in this leaf page
2117			 */
2118			if ((p->header.flag & BT_ROOT)
2119			    || (p->header.next == 0))
2120				next_index = -1;
2121			else {
2122				/* Read next leaf page */
2123				DT_GETPAGE(ip, le64_to_cpu(p->header.next),
2124					   nmp, PSIZE, np, rc);
2125				if (rc)
2126					next_index = -1;
2127				else {
2128					stbl = DT_GETSTBL(np);
2129					ldtentry =
2130					    (struct ldtentry *) & np->
2131					    slot[stbl[0]];
2132					next_index =
2133					    le32_to_cpu(ldtentry->index);
2134					DT_PUTPAGE(nmp);
2135				}
2136			}
2137		} else {
2138			ldtentry =
2139			    (struct ldtentry *) & p->slot[stbl[index + 1]];
2140			next_index = le32_to_cpu(ldtentry->index);
2141		}
2142		free_index(tid, ip, table_index, next_index);
2143	}
2144	/*
2145	 * the leaf page becomes empty, delete the page
2146	 */
2147	if (p->header.nextindex == 1) {
2148		/* delete empty page */
2149		rc = dtDeleteUp(tid, ip, mp, p, &btstack);
2150	}
2151	/*
2152	 * the leaf page has other entries remaining:
2153	 *
2154	 * delete the entry from the leaf page.
2155	 */
2156	else {
2157		BT_MARK_DIRTY(mp, ip);
2158		/*
2159		 * acquire a transaction lock on the leaf page
2160		 */
2161		tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY);
2162		dtlck = (struct dt_lock *) & tlck->lock;
2163
2164		/*
2165		 * Do not assume that dtlck->index will be zero.  During a
2166		 * rename within a directory, this transaction may have
2167		 * modified this page already when adding the new entry.
2168		 */
2169
2170		/* linelock header */
2171		if (dtlck->index >= dtlck->maxcnt)
2172			dtlck = (struct dt_lock *) txLinelock(dtlck);
2173		lv = & dtlck->lv[dtlck->index];
2174		lv->offset = 0;
2175		lv->length = 1;
2176		dtlck->index++;
2177
2178		/* linelock stbl of non-root leaf page */
2179		if (!(p->header.flag & BT_ROOT)) {
2180			if (dtlck->index >= dtlck->maxcnt)
2181				dtlck = (struct dt_lock *) txLinelock(dtlck);
2182			lv = & dtlck->lv[dtlck->index];
2183			i = index >> L2DTSLOTSIZE;
2184			lv->offset = p->header.stblindex + i;
2185			lv->length =
2186			    ((p->header.nextindex - 1) >> L2DTSLOTSIZE) -
2187			    i + 1;
2188			dtlck->index++;
2189		}
2190
2191		/* free the leaf entry */
2192		dtDeleteEntry(p, index, &dtlck);
2193
2194		/*
2195		 * Update directory index table for entries moved in stbl
2196		 */
2197		if (DO_INDEX(ip) && index < p->header.nextindex) {
2198			s64 lblock;
2199
2200			imp = NULL;
2201			stbl = DT_GETSTBL(p);
2202			for (i = index; i < p->header.nextindex; i++) {
2203				ldtentry =
2204				    (struct ldtentry *) & p->slot[stbl[i]];
2205				modify_index(tid, ip,
2206					     le32_to_cpu(ldtentry->index),
2207					     bn, i, &imp, &lblock);
2208			}
2209			if (imp)
2210				release_metapage(imp);
2211		}
2212
2213		DT_PUTPAGE(mp);
2214	}
2215
2216	return rc;
2217}
2218
2219
2220/*
2221 *	dtDeleteUp()
2222 *
2223 * function:
2224 *	free empty pages as propagating deletion up the tree
2225 *
2226 * parameter:
2227 *
2228 * return:
2229 */
2230static int dtDeleteUp(tid_t tid, struct inode *ip,
2231	   struct metapage * fmp, dtpage_t * fp, struct btstack * btstack)
2232{
2233	int rc = 0;
2234	struct metapage *mp;
2235	dtpage_t *p;
2236	int index, nextindex;
2237	int xlen;
2238	struct btframe *parent;
2239	struct dt_lock *dtlck;
2240	struct tlock *tlck;
2241	struct lv *lv;
2242	struct pxd_lock *pxdlock;
2243	int i;
2244
2245	/*
2246	 *	keep the root leaf page which has become empty
2247	 */
2248	if (BT_IS_ROOT(fmp)) {
2249		/*
2250		 * reset the root
2251		 *
2252		 * dtInitRoot() acquires txlock on the root
2253		 */
2254		dtInitRoot(tid, ip, PARENT(ip));
2255
2256		DT_PUTPAGE(fmp);
2257
2258		return 0;
2259	}
2260
2261	/*
2262	 *	free the non-root leaf page
2263	 */
2264	/*
2265	 * acquire a transaction lock on the page
2266	 *
2267	 * write FREEXTENT|NOREDOPAGE log record
2268	 * N.B. linelock is overlaid as freed extent descriptor, and
2269	 * the buffer page is freed;
2270	 */
2271	tlck = txMaplock(tid, ip, tlckDTREE | tlckFREE);
2272	pxdlock = (struct pxd_lock *) & tlck->lock;
2273	pxdlock->flag = mlckFREEPXD;
2274	pxdlock->pxd = fp->header.self;
2275	pxdlock->index = 1;
2276
2277	/* update sibling pointers */
2278	if ((rc = dtRelink(tid, ip, fp))) {
2279		BT_PUTPAGE(fmp);
2280		return rc;
2281	}
2282
2283	xlen = lengthPXD(&fp->header.self);
2284
2285	/* Free quota allocation. */
2286	dquot_free_block(ip, xlen);
2287
2288	/* free/invalidate its buffer page */
2289	discard_metapage(fmp);
2290
2291	/*
2292	 *	propagate page deletion up the directory tree
2293	 *
2294	 * If the delete from the parent page makes it empty,
2295	 * continue all the way up the tree.
2296	 * stop if the root page is reached (which is never deleted) or
2297	 * if the entry deletion does not empty the page.
2298	 */
2299	while ((parent = BT_POP(btstack)) != NULL) {
2300		/* pin the parent page <sp> */
2301		DT_GETPAGE(ip, parent->bn, mp, PSIZE, p, rc);
2302		if (rc)
2303			return rc;
2304
2305		/*
2306		 * free the extent of the child page deleted
2307		 */
2308		index = parent->index;
2309
2310		/*
2311		 * delete the entry for the child page from parent
2312		 */
2313		nextindex = p->header.nextindex;
2314
2315		/*
2316		 * the parent has the single entry being deleted:
2317		 *
2318		 * free the parent page which has become empty.
2319		 */
2320		if (nextindex == 1) {
2321			/*
2322			 * keep the root internal page which has become empty
2323			 */
2324			if (p->header.flag & BT_ROOT) {
2325				/*
2326				 * reset the root
2327				 *
2328				 * dtInitRoot() acquires txlock on the root
2329				 */
2330				dtInitRoot(tid, ip, PARENT(ip));
2331
2332				DT_PUTPAGE(mp);
2333
2334				return 0;
2335			}
2336			/*
2337			 * free the parent page
2338			 */
2339			else {
2340				/*
2341				 * acquire a transaction lock on the page
2342				 *
2343				 * write FREEXTENT|NOREDOPAGE log record
2344				 */
2345				tlck =
2346				    txMaplock(tid, ip,
2347					      tlckDTREE | tlckFREE);
2348				pxdlock = (struct pxd_lock *) & tlck->lock;
2349				pxdlock->flag = mlckFREEPXD;
2350				pxdlock->pxd = p->header.self;
2351				pxdlock->index = 1;
2352
2353				/* update sibling pointers */
2354				if ((rc = dtRelink(tid, ip, p))) {
2355					DT_PUTPAGE(mp);
2356					return rc;
2357				}
2358
2359				xlen = lengthPXD(&p->header.self);
2360
2361				/* Free quota allocation */
2362				dquot_free_block(ip, xlen);
2363
2364				/* free/invalidate its buffer page */
2365				discard_metapage(mp);
2366
2367				/* propagate up */
2368				continue;
2369			}
2370		}
2371
2372		/*
2373		 * the parent has other entries remaining:
2374		 *
2375		 * delete the router entry from the parent page.
2376		 */
2377		BT_MARK_DIRTY(mp, ip);
2378		/*
2379		 * acquire a transaction lock on the page
2380		 *
2381		 * action: router entry deletion
2382		 */
2383		tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY);
2384		dtlck = (struct dt_lock *) & tlck->lock;
2385
2386		/* linelock header */
2387		if (dtlck->index >= dtlck->maxcnt)
2388			dtlck = (struct dt_lock *) txLinelock(dtlck);
2389		lv = & dtlck->lv[dtlck->index];
2390		lv->offset = 0;
2391		lv->length = 1;
2392		dtlck->index++;
2393
2394		/* linelock stbl of non-root leaf page */
2395		if (!(p->header.flag & BT_ROOT)) {
2396			if (dtlck->index < dtlck->maxcnt)
2397				lv++;
2398			else {
2399				dtlck = (struct dt_lock *) txLinelock(dtlck);
2400				lv = & dtlck->lv[0];
2401			}
2402			i = index >> L2DTSLOTSIZE;
2403			lv->offset = p->header.stblindex + i;
2404			lv->length =
2405			    ((p->header.nextindex - 1) >> L2DTSLOTSIZE) -
2406			    i + 1;
2407			dtlck->index++;
2408		}
2409
2410		/* free the router entry */
2411		dtDeleteEntry(p, index, &dtlck);
2412
2413		/* reset key of new leftmost entry of level (for consistency) */
2414		if (index == 0 &&
2415		    ((p->header.flag & BT_ROOT) || p->header.prev == 0))
2416			dtTruncateEntry(p, 0, &dtlck);
2417
2418		/* unpin the parent page */
2419		DT_PUTPAGE(mp);
2420
2421		/* exit propagation up */
2422		break;
2423	}
2424
2425	if (!DO_INDEX(ip))
2426		ip->i_size -= PSIZE;
2427
2428	return 0;
2429}
2430
2431/*
2432 *	dtRelink()
2433 *
2434 * function:
2435 *	link around a freed page.
2436 *
2437 * parameter:
2438 *	fp:	page to be freed
2439 *
2440 * return:
2441 */
2442static int dtRelink(tid_t tid, struct inode *ip, dtpage_t * p)
2443{
2444	int rc;
2445	struct metapage *mp;
2446	s64 nextbn, prevbn;
2447	struct tlock *tlck;
2448	struct dt_lock *dtlck;
2449	struct lv *lv;
2450
2451	nextbn = le64_to_cpu(p->header.next);
2452	prevbn = le64_to_cpu(p->header.prev);
2453
2454	/* update prev pointer of the next page */
2455	if (nextbn != 0) {
2456		DT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc);
2457		if (rc)
2458			return rc;
2459
2460		BT_MARK_DIRTY(mp, ip);
2461		/*
2462		 * acquire a transaction lock on the next page
2463		 *
2464		 * action: update prev pointer;
2465		 */
2466		tlck = txLock(tid, ip, mp, tlckDTREE | tlckRELINK);
2467		jfs_info("dtRelink nextbn: tlck = 0x%p, ip = 0x%p, mp=0x%p",
2468			tlck, ip, mp);
2469		dtlck = (struct dt_lock *) & tlck->lock;
2470
2471		/* linelock header */
2472		if (dtlck->index >= dtlck->maxcnt)
2473			dtlck = (struct dt_lock *) txLinelock(dtlck);
2474		lv = & dtlck->lv[dtlck->index];
2475		lv->offset = 0;
2476		lv->length = 1;
2477		dtlck->index++;
2478
2479		p->header.prev = cpu_to_le64(prevbn);
2480		DT_PUTPAGE(mp);
2481	}
2482
2483	/* update next pointer of the previous page */
2484	if (prevbn != 0) {
2485		DT_GETPAGE(ip, prevbn, mp, PSIZE, p, rc);
2486		if (rc)
2487			return rc;
2488
2489		BT_MARK_DIRTY(mp, ip);
2490		/*
2491		 * acquire a transaction lock on the prev page
2492		 *
2493		 * action: update next pointer;
2494		 */
2495		tlck = txLock(tid, ip, mp, tlckDTREE | tlckRELINK);
2496		jfs_info("dtRelink prevbn: tlck = 0x%p, ip = 0x%p, mp=0x%p",
2497			tlck, ip, mp);
2498		dtlck = (struct dt_lock *) & tlck->lock;
2499
2500		/* linelock header */
2501		if (dtlck->index >= dtlck->maxcnt)
2502			dtlck = (struct dt_lock *) txLinelock(dtlck);
2503		lv = & dtlck->lv[dtlck->index];
2504		lv->offset = 0;
2505		lv->length = 1;
2506		dtlck->index++;
2507
2508		p->header.next = cpu_to_le64(nextbn);
2509		DT_PUTPAGE(mp);
2510	}
2511
2512	return 0;
2513}
2514
2515
2516/*
2517 *	dtInitRoot()
2518 *
2519 * initialize directory root (inline in inode)
2520 */
2521void dtInitRoot(tid_t tid, struct inode *ip, u32 idotdot)
2522{
2523	struct jfs_inode_info *jfs_ip = JFS_IP(ip);
2524	dtroot_t *p;
2525	int fsi;
2526	struct dtslot *f;
2527	struct tlock *tlck;
2528	struct dt_lock *dtlck;
2529	struct lv *lv;
2530	u16 xflag_save;
2531
2532	/*
2533	 * If this was previously an non-empty directory, we need to remove
2534	 * the old directory table.
2535	 */
2536	if (DO_INDEX(ip)) {
2537		if (!jfs_dirtable_inline(ip)) {
2538			struct tblock *tblk = tid_to_tblock(tid);
2539			/*
2540			 * We're playing games with the tid's xflag.  If
2541			 * we're removing a regular file, the file's xtree
2542			 * is committed with COMMIT_PMAP, but we always
2543			 * commit the directories xtree with COMMIT_PWMAP.
2544			 */
2545			xflag_save = tblk->xflag;
2546			tblk->xflag = 0;
2547			/*
2548			 * xtTruncate isn't guaranteed to fully truncate
2549			 * the xtree.  The caller needs to check i_size
2550			 * after committing the transaction to see if
2551			 * additional truncation is needed.  The
2552			 * COMMIT_Stale flag tells caller that we
2553			 * initiated the truncation.
2554			 */
2555			xtTruncate(tid, ip, 0, COMMIT_PWMAP);
2556			set_cflag(COMMIT_Stale, ip);
2557
2558			tblk->xflag = xflag_save;
2559		} else
2560			ip->i_size = 1;
2561
2562		jfs_ip->next_index = 2;
2563	} else
2564		ip->i_size = IDATASIZE;
2565
2566	/*
2567	 * acquire a transaction lock on the root
2568	 *
2569	 * action: directory initialization;
2570	 */
2571	tlck = txLock(tid, ip, (struct metapage *) & jfs_ip->bxflag,
2572		      tlckDTREE | tlckENTRY | tlckBTROOT);
2573	dtlck = (struct dt_lock *) & tlck->lock;
2574
2575	/* linelock root */
2576	ASSERT(dtlck->index == 0);
2577	lv = & dtlck->lv[0];
2578	lv->offset = 0;
2579	lv->length = DTROOTMAXSLOT;
2580	dtlck->index++;
2581
2582	p = &jfs_ip->i_dtroot;
2583
2584	p->header.flag = DXD_INDEX | BT_ROOT | BT_LEAF;
2585
2586	p->header.nextindex = 0;
2587
2588	/* init freelist */
2589	fsi = 1;
2590	f = &p->slot[fsi];
2591
2592	/* init data area of root */
2593	for (fsi++; fsi < DTROOTMAXSLOT; f++, fsi++)
2594		f->next = fsi;
2595	f->next = -1;
2596
2597	p->header.freelist = 1;
2598	p->header.freecnt = 8;
2599
2600	/* init '..' entry */
2601	p->header.idotdot = cpu_to_le32(idotdot);
2602
2603	return;
2604}
2605
2606/*
2607 *	add_missing_indices()
2608 *
2609 * function: Fix dtree page in which one or more entries has an invalid index.
2610 *	     fsck.jfs should really fix this, but it currently does not.
2611 *	     Called from jfs_readdir when bad index is detected.
2612 */
2613static void add_missing_indices(struct inode *inode, s64 bn)
2614{
2615	struct ldtentry *d;
2616	struct dt_lock *dtlck;
2617	int i;
2618	uint index;
2619	struct lv *lv;
2620	struct metapage *mp;
2621	dtpage_t *p;
2622	int rc;
2623	s8 *stbl;
2624	tid_t tid;
2625	struct tlock *tlck;
2626
2627	tid = txBegin(inode->i_sb, 0);
2628
2629	DT_GETPAGE(inode, bn, mp, PSIZE, p, rc);
2630
2631	if (rc) {
2632		printk(KERN_ERR "DT_GETPAGE failed!\n");
2633		goto end;
2634	}
2635	BT_MARK_DIRTY(mp, inode);
2636
2637	ASSERT(p->header.flag & BT_LEAF);
2638
2639	tlck = txLock(tid, inode, mp, tlckDTREE | tlckENTRY);
2640	if (BT_IS_ROOT(mp))
2641		tlck->type |= tlckBTROOT;
2642
2643	dtlck = (struct dt_lock *) &tlck->lock;
2644
2645	stbl = DT_GETSTBL(p);
2646	for (i = 0; i < p->header.nextindex; i++) {
2647		d = (struct ldtentry *) &p->slot[stbl[i]];
2648		index = le32_to_cpu(d->index);
2649		if ((index < 2) || (index >= JFS_IP(inode)->next_index)) {
2650			d->index = cpu_to_le32(add_index(tid, inode, bn, i));
2651			if (dtlck->index >= dtlck->maxcnt)
2652				dtlck = (struct dt_lock *) txLinelock(dtlck);
2653			lv = &dtlck->lv[dtlck->index];
2654			lv->offset = stbl[i];
2655			lv->length = 1;
2656			dtlck->index++;
2657		}
2658	}
2659
2660	DT_PUTPAGE(mp);
2661	(void) txCommit(tid, 1, &inode, 0);
2662end:
2663	txEnd(tid);
2664}
2665
2666/*
2667 * Buffer to hold directory entry info while traversing a dtree page
2668 * before being fed to the filldir function
2669 */
2670struct jfs_dirent {
2671	loff_t position;
2672	int ino;
2673	u16 name_len;
2674	char name[];
2675};
2676
2677/*
2678 * function to determine next variable-sized jfs_dirent in buffer
2679 */
2680static inline struct jfs_dirent *next_jfs_dirent(struct jfs_dirent *dirent)
2681{
2682	return (struct jfs_dirent *)
2683		((char *)dirent +
2684		 ((sizeof (struct jfs_dirent) + dirent->name_len + 1 +
2685		   sizeof (loff_t) - 1) &
2686		  ~(sizeof (loff_t) - 1)));
2687}
2688
2689/*
2690 *	jfs_readdir()
2691 *
2692 * function: read directory entries sequentially
2693 *	from the specified entry offset
2694 *
2695 * parameter:
2696 *
2697 * return: offset = (pn, index) of start entry
2698 *	of next jfs_readdir()/dtRead()
2699 */
2700int jfs_readdir(struct file *file, struct dir_context *ctx)
2701{
2702	struct inode *ip = file_inode(file);
2703	struct nls_table *codepage = JFS_SBI(ip->i_sb)->nls_tab;
2704	int rc = 0;
2705	loff_t dtpos;	/* legacy OS/2 style position */
2706	struct dtoffset {
2707		s16 pn;
2708		s16 index;
2709		s32 unused;
2710	} *dtoffset = (struct dtoffset *) &dtpos;
2711	s64 bn;
2712	struct metapage *mp;
2713	dtpage_t *p;
2714	int index;
2715	s8 *stbl;
2716	struct btstack btstack;
2717	int i, next;
2718	struct ldtentry *d;
2719	struct dtslot *t;
2720	int d_namleft, len, outlen;
2721	unsigned long dirent_buf;
2722	char *name_ptr;
2723	u32 dir_index;
2724	int do_index = 0;
2725	uint loop_count = 0;
2726	struct jfs_dirent *jfs_dirent;
2727	int jfs_dirents;
2728	int overflow, fix_page, page_fixed = 0;
2729	static int unique_pos = 2;	/* If we can't fix broken index */
2730
2731	if (ctx->pos == DIREND)
2732		return 0;
2733
2734	if (DO_INDEX(ip)) {
2735		/*
2736		 * persistent index is stored in directory entries.
2737		 * Special cases:	 0 = .
2738		 *			 1 = ..
2739		 *			-1 = End of directory
2740		 */
2741		do_index = 1;
2742
2743		dir_index = (u32) ctx->pos;
2744
2745		/*
2746		 * NFSv4 reserves cookies 1 and 2 for . and .. so the value
2747		 * we return to the vfs is one greater than the one we use
2748		 * internally.
2749		 */
2750		if (dir_index)
2751			dir_index--;
2752
2753		if (dir_index > 1) {
2754			struct dir_table_slot dirtab_slot;
2755
2756			if (dtEmpty(ip) ||
2757			    (dir_index >= JFS_IP(ip)->next_index)) {
2758				/* Stale position.  Directory has shrunk */
2759				ctx->pos = DIREND;
2760				return 0;
2761			}
2762		      repeat:
2763			rc = read_index(ip, dir_index, &dirtab_slot);
2764			if (rc) {
2765				ctx->pos = DIREND;
2766				return rc;
2767			}
2768			if (dirtab_slot.flag == DIR_INDEX_FREE) {
2769				if (loop_count++ > JFS_IP(ip)->next_index) {
2770					jfs_err("jfs_readdir detected infinite loop!");
2771					ctx->pos = DIREND;
2772					return 0;
2773				}
2774				dir_index = le32_to_cpu(dirtab_slot.addr2);
2775				if (dir_index == -1) {
2776					ctx->pos = DIREND;
2777					return 0;
2778				}
2779				goto repeat;
2780			}
2781			bn = addressDTS(&dirtab_slot);
2782			index = dirtab_slot.slot;
2783			DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
2784			if (rc) {
2785				ctx->pos = DIREND;
2786				return 0;
2787			}
2788			if (p->header.flag & BT_INTERNAL) {
2789				jfs_err("jfs_readdir: bad index table");
2790				DT_PUTPAGE(mp);
2791				ctx->pos = DIREND;
2792				return 0;
2793			}
2794		} else {
2795			if (dir_index == 0) {
2796				/*
2797				 * self "."
2798				 */
2799				ctx->pos = 1;
2800				if (!dir_emit(ctx, ".", 1, ip->i_ino, DT_DIR))
2801					return 0;
2802			}
2803			/*
2804			 * parent ".."
2805			 */
2806			ctx->pos = 2;
2807			if (!dir_emit(ctx, "..", 2, PARENT(ip), DT_DIR))
2808				return 0;
2809
2810			/*
2811			 * Find first entry of left-most leaf
2812			 */
2813			if (dtEmpty(ip)) {
2814				ctx->pos = DIREND;
2815				return 0;
2816			}
2817
2818			if ((rc = dtReadFirst(ip, &btstack)))
2819				return rc;
2820
2821			DT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
2822		}
2823	} else {
2824		/*
2825		 * Legacy filesystem - OS/2 & Linux JFS < 0.3.6
2826		 *
2827		 * pn = 0; index = 1:	First entry "."
2828		 * pn = 0; index = 2:	Second entry ".."
2829		 * pn > 0:		Real entries, pn=1 -> leftmost page
2830		 * pn = index = -1:	No more entries
2831		 */
2832		dtpos = ctx->pos;
2833		if (dtpos < 2) {
2834			/* build "." entry */
2835			ctx->pos = 1;
2836			if (!dir_emit(ctx, ".", 1, ip->i_ino, DT_DIR))
2837				return 0;
2838			dtoffset->index = 2;
2839			ctx->pos = dtpos;
2840		}
2841
2842		if (dtoffset->pn == 0) {
2843			if (dtoffset->index == 2) {
2844				/* build ".." entry */
2845				if (!dir_emit(ctx, "..", 2, PARENT(ip), DT_DIR))
2846					return 0;
2847			} else {
2848				jfs_err("jfs_readdir called with invalid offset!");
2849			}
2850			dtoffset->pn = 1;
2851			dtoffset->index = 0;
2852			ctx->pos = dtpos;
2853		}
2854
2855		if (dtEmpty(ip)) {
2856			ctx->pos = DIREND;
2857			return 0;
2858		}
2859
2860		if ((rc = dtReadNext(ip, &ctx->pos, &btstack))) {
2861			jfs_err("jfs_readdir: unexpected rc = %d from dtReadNext",
2862				rc);
2863			ctx->pos = DIREND;
2864			return 0;
2865		}
2866		/* get start leaf page and index */
2867		DT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
2868
2869		/* offset beyond directory eof ? */
2870		if (bn < 0) {
2871			ctx->pos = DIREND;
2872			return 0;
2873		}
2874	}
2875
2876	dirent_buf = __get_free_page(GFP_KERNEL);
2877	if (dirent_buf == 0) {
2878		DT_PUTPAGE(mp);
2879		jfs_warn("jfs_readdir: __get_free_page failed!");
2880		ctx->pos = DIREND;
2881		return -ENOMEM;
2882	}
2883
2884	while (1) {
2885		jfs_dirent = (struct jfs_dirent *) dirent_buf;
2886		jfs_dirents = 0;
2887		overflow = fix_page = 0;
2888
2889		stbl = DT_GETSTBL(p);
2890
2891		for (i = index; i < p->header.nextindex; i++) {
2892			d = (struct ldtentry *) & p->slot[stbl[i]];
2893
2894			if (((long) jfs_dirent + d->namlen + 1) >
2895			    (dirent_buf + PAGE_SIZE)) {
2896				/* DBCS codepages could overrun dirent_buf */
2897				index = i;
2898				overflow = 1;
2899				break;
2900			}
2901
2902			d_namleft = d->namlen;
2903			name_ptr = jfs_dirent->name;
2904			jfs_dirent->ino = le32_to_cpu(d->inumber);
2905
2906			if (do_index) {
2907				len = min(d_namleft, DTLHDRDATALEN);
2908				jfs_dirent->position = le32_to_cpu(d->index);
2909				/*
2910				 * d->index should always be valid, but it
2911				 * isn't.  fsck.jfs doesn't create the
2912				 * directory index for the lost+found
2913				 * directory.  Rather than let it go,
2914				 * we can try to fix it.
2915				 */
2916				if ((jfs_dirent->position < 2) ||
2917				    (jfs_dirent->position >=
2918				     JFS_IP(ip)->next_index)) {
2919					if (!page_fixed && !isReadOnly(ip)) {
2920						fix_page = 1;
2921						/*
2922						 * setting overflow and setting
2923						 * index to i will cause the
2924						 * same page to be processed
2925						 * again starting here
2926						 */
2927						overflow = 1;
2928						index = i;
2929						break;
2930					}
2931					jfs_dirent->position = unique_pos++;
2932				}
2933				/*
2934				 * We add 1 to the index because we may
2935				 * use a value of 2 internally, and NFSv4
2936				 * doesn't like that.
2937				 */
2938				jfs_dirent->position++;
2939			} else {
2940				jfs_dirent->position = dtpos;
2941				len = min(d_namleft, DTLHDRDATALEN_LEGACY);
2942			}
2943
2944			/* copy the name of head/only segment */
2945			outlen = jfs_strfromUCS_le(name_ptr, d->name, len,
2946						   codepage);
2947			jfs_dirent->name_len = outlen;
2948
2949			/* copy name in the additional segment(s) */
2950			next = d->next;
2951			while (next >= 0) {
2952				t = (struct dtslot *) & p->slot[next];
2953				name_ptr += outlen;
2954				d_namleft -= len;
2955				/* Sanity Check */
2956				if (d_namleft == 0) {
2957					jfs_error(ip->i_sb,
2958						  "JFS:Dtree error: ino = %ld, bn=%lld, index = %d\n",
2959						  (long)ip->i_ino,
2960						  (long long)bn,
2961						  i);
2962					goto skip_one;
2963				}
2964				len = min(d_namleft, DTSLOTDATALEN);
2965				outlen = jfs_strfromUCS_le(name_ptr, t->name,
2966							   len, codepage);
2967				jfs_dirent->name_len += outlen;
2968
2969				next = t->next;
2970			}
2971
2972			jfs_dirents++;
2973			jfs_dirent = next_jfs_dirent(jfs_dirent);
2974skip_one:
2975			if (!do_index)
2976				dtoffset->index++;
2977		}
2978
2979		if (!overflow) {
2980			/* Point to next leaf page */
2981			if (p->header.flag & BT_ROOT)
2982				bn = 0;
2983			else {
2984				bn = le64_to_cpu(p->header.next);
2985				index = 0;
2986				/* update offset (pn:index) for new page */
2987				if (!do_index) {
2988					dtoffset->pn++;
2989					dtoffset->index = 0;
2990				}
2991			}
2992			page_fixed = 0;
2993		}
2994
2995		/* unpin previous leaf page */
2996		DT_PUTPAGE(mp);
2997
2998		jfs_dirent = (struct jfs_dirent *) dirent_buf;
2999		while (jfs_dirents--) {
3000			ctx->pos = jfs_dirent->position;
3001			if (!dir_emit(ctx, jfs_dirent->name,
3002				    jfs_dirent->name_len,
3003				    jfs_dirent->ino, DT_UNKNOWN))
3004				goto out;
3005			jfs_dirent = next_jfs_dirent(jfs_dirent);
3006		}
3007
3008		if (fix_page) {
3009			add_missing_indices(ip, bn);
3010			page_fixed = 1;
3011		}
3012
3013		if (!overflow && (bn == 0)) {
3014			ctx->pos = DIREND;
3015			break;
3016		}
3017
3018		DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
3019		if (rc) {
3020			free_page(dirent_buf);
3021			return rc;
3022		}
3023	}
3024
3025      out:
3026	free_page(dirent_buf);
3027
3028	return rc;
3029}
3030
3031
3032/*
3033 *	dtReadFirst()
3034 *
3035 * function: get the leftmost page of the directory
3036 */
3037static int dtReadFirst(struct inode *ip, struct btstack * btstack)
3038{
3039	int rc = 0;
3040	s64 bn;
3041	int psize = 288;	/* initial in-line directory */
3042	struct metapage *mp;
3043	dtpage_t *p;
3044	s8 *stbl;
3045	struct btframe *btsp;
3046	pxd_t *xd;
3047
3048	BT_CLR(btstack);	/* reset stack */
3049
3050	/*
3051	 *	descend leftmost path of the tree
3052	 *
3053	 * by convention, root bn = 0.
3054	 */
3055	for (bn = 0;;) {
3056		DT_GETPAGE(ip, bn, mp, psize, p, rc);
3057		if (rc)
3058			return rc;
3059
3060		/*
3061		 * leftmost leaf page
3062		 */
3063		if (p->header.flag & BT_LEAF) {
3064			/* return leftmost entry */
3065			btsp = btstack->top;
3066			btsp->bn = bn;
3067			btsp->index = 0;
3068			btsp->mp = mp;
3069
3070			return 0;
3071		}
3072
3073		/*
3074		 * descend down to leftmost child page
3075		 */
3076		if (BT_STACK_FULL(btstack)) {
3077			DT_PUTPAGE(mp);
3078			jfs_error(ip->i_sb, "btstack overrun\n");
3079			BT_STACK_DUMP(btstack);
3080			return -EIO;
3081		}
3082		/* push (bn, index) of the parent page/entry */
3083		BT_PUSH(btstack, bn, 0);
3084
3085		/* get the leftmost entry */
3086		stbl = DT_GETSTBL(p);
3087		xd = (pxd_t *) & p->slot[stbl[0]];
3088
3089		/* get the child page block address */
3090		bn = addressPXD(xd);
3091		psize = lengthPXD(xd) << JFS_SBI(ip->i_sb)->l2bsize;
3092
3093		/* unpin the parent page */
3094		DT_PUTPAGE(mp);
3095	}
3096}
3097
3098
3099/*
3100 *	dtReadNext()
3101 *
3102 * function: get the page of the specified offset (pn:index)
3103 *
3104 * return: if (offset > eof), bn = -1;
3105 *
3106 * note: if index > nextindex of the target leaf page,
3107 * start with 1st entry of next leaf page;
3108 */
3109static int dtReadNext(struct inode *ip, loff_t * offset,
3110		      struct btstack * btstack)
3111{
3112	int rc = 0;
3113	struct dtoffset {
3114		s16 pn;
3115		s16 index;
3116		s32 unused;
3117	} *dtoffset = (struct dtoffset *) offset;
3118	s64 bn;
3119	struct metapage *mp;
3120	dtpage_t *p;
3121	int index;
3122	int pn;
3123	s8 *stbl;
3124	struct btframe *btsp, *parent;
3125	pxd_t *xd;
3126
3127	/*
3128	 * get leftmost leaf page pinned
3129	 */
3130	if ((rc = dtReadFirst(ip, btstack)))
3131		return rc;
3132
3133	/* get leaf page */
3134	DT_GETSEARCH(ip, btstack->top, bn, mp, p, index);
3135
3136	/* get the start offset (pn:index) */
3137	pn = dtoffset->pn - 1;	/* Now pn = 0 represents leftmost leaf */
3138	index = dtoffset->index;
3139
3140	/* start at leftmost page ? */
3141	if (pn == 0) {
3142		/* offset beyond eof ? */
3143		if (index < p->header.nextindex)
3144			goto out;
3145
3146		if (p->header.flag & BT_ROOT) {
3147			bn = -1;
3148			goto out;
3149		}
3150
3151		/* start with 1st entry of next leaf page */
3152		dtoffset->pn++;
3153		dtoffset->index = index = 0;
3154		goto a;
3155	}
3156
3157	/* start at non-leftmost page: scan parent pages for large pn */
3158	if (p->header.flag & BT_ROOT) {
3159		bn = -1;
3160		goto out;
3161	}
3162
3163	/* start after next leaf page ? */
3164	if (pn > 1)
3165		goto b;
3166
3167	/* get leaf page pn = 1 */
3168      a:
3169	bn = le64_to_cpu(p->header.next);
3170
3171	/* unpin leaf page */
3172	DT_PUTPAGE(mp);
3173
3174	/* offset beyond eof ? */
3175	if (bn == 0) {
3176		bn = -1;
3177		goto out;
3178	}
3179
3180	goto c;
3181
3182	/*
3183	 * scan last internal page level to get target leaf page
3184	 */
3185      b:
3186	/* unpin leftmost leaf page */
3187	DT_PUTPAGE(mp);
3188
3189	/* get left most parent page */
3190	btsp = btstack->top;
3191	parent = btsp - 1;
3192	bn = parent->bn;
3193	DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
3194	if (rc)
3195		return rc;
3196
3197	/* scan parent pages at last internal page level */
3198	while (pn >= p->header.nextindex) {
3199		pn -= p->header.nextindex;
3200
3201		/* get next parent page address */
3202		bn = le64_to_cpu(p->header.next);
3203
3204		/* unpin current parent page */
3205		DT_PUTPAGE(mp);
3206
3207		/* offset beyond eof ? */
3208		if (bn == 0) {
3209			bn = -1;
3210			goto out;
3211		}
3212
3213		/* get next parent page */
3214		DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
3215		if (rc)
3216			return rc;
3217
3218		/* update parent page stack frame */
3219		parent->bn = bn;
3220	}
3221
3222	/* get leaf page address */
3223	stbl = DT_GETSTBL(p);
3224	xd = (pxd_t *) & p->slot[stbl[pn]];
3225	bn = addressPXD(xd);
3226
3227	/* unpin parent page */
3228	DT_PUTPAGE(mp);
3229
3230	/*
3231	 * get target leaf page
3232	 */
3233      c:
3234	DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
3235	if (rc)
3236		return rc;
3237
3238	/*
3239	 * leaf page has been completed:
3240	 * start with 1st entry of next leaf page
3241	 */
3242	if (index >= p->header.nextindex) {
3243		bn = le64_to_cpu(p->header.next);
3244
3245		/* unpin leaf page */
3246		DT_PUTPAGE(mp);
3247
3248		/* offset beyond eof ? */
3249		if (bn == 0) {
3250			bn = -1;
3251			goto out;
3252		}
3253
3254		/* get next leaf page */
3255		DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
3256		if (rc)
3257			return rc;
3258
3259		/* start with 1st entry of next leaf page */
3260		dtoffset->pn++;
3261		dtoffset->index = 0;
3262	}
3263
3264      out:
3265	/* return target leaf page pinned */
3266	btsp = btstack->top;
3267	btsp->bn = bn;
3268	btsp->index = dtoffset->index;
3269	btsp->mp = mp;
3270
3271	return 0;
3272}
3273
3274
3275/*
3276 *	dtCompare()
3277 *
3278 * function: compare search key with an internal entry
3279 *
3280 * return:
3281 *	< 0 if k is < record
3282 *	= 0 if k is = record
3283 *	> 0 if k is > record
3284 */
3285static int dtCompare(struct component_name * key,	/* search key */
3286		     dtpage_t * p,	/* directory page */
3287		     int si)
3288{				/* entry slot index */
3289	wchar_t *kname;
3290	__le16 *name;
3291	int klen, namlen, len, rc;
3292	struct idtentry *ih;
3293	struct dtslot *t;
3294
3295	/*
3296	 * force the left-most key on internal pages, at any level of
3297	 * the tree, to be less than any search key.
3298	 * this obviates having to update the leftmost key on an internal
3299	 * page when the user inserts a new key in the tree smaller than
3300	 * anything that has been stored.
3301	 *
3302	 * (? if/when dtSearch() narrows down to 1st entry (index = 0),
3303	 * at any internal page at any level of the tree,
3304	 * it descends to child of the entry anyway -
3305	 * ? make the entry as min size dummy entry)
3306	 *
3307	 * if (e->index == 0 && h->prevpg == P_INVALID && !(h->flags & BT_LEAF))
3308	 * return (1);
3309	 */
3310
3311	kname = key->name;
3312	klen = key->namlen;
3313
3314	ih = (struct idtentry *) & p->slot[si];
3315	si = ih->next;
3316	name = ih->name;
3317	namlen = ih->namlen;
3318	len = min(namlen, DTIHDRDATALEN);
3319
3320	/* compare with head/only segment */
3321	len = min(klen, len);
3322	if ((rc = UniStrncmp_le(kname, name, len)))
3323		return rc;
3324
3325	klen -= len;
3326	namlen -= len;
3327
3328	/* compare with additional segment(s) */
3329	kname += len;
3330	while (klen > 0 && namlen > 0) {
3331		/* compare with next name segment */
3332		t = (struct dtslot *) & p->slot[si];
3333		len = min(namlen, DTSLOTDATALEN);
3334		len = min(klen, len);
3335		name = t->name;
3336		if ((rc = UniStrncmp_le(kname, name, len)))
3337			return rc;
3338
3339		klen -= len;
3340		namlen -= len;
3341		kname += len;
3342		si = t->next;
3343	}
3344
3345	return (klen - namlen);
3346}
3347
3348
3349
3350
3351/*
3352 *	ciCompare()
3353 *
3354 * function: compare search key with an (leaf/internal) entry
3355 *
3356 * return:
3357 *	< 0 if k is < record
3358 *	= 0 if k is = record
3359 *	> 0 if k is > record
3360 */
3361static int ciCompare(struct component_name * key,	/* search key */
3362		     dtpage_t * p,	/* directory page */
3363		     int si,	/* entry slot index */
3364		     int flag)
3365{
3366	wchar_t *kname, x;
3367	__le16 *name;
3368	int klen, namlen, len, rc;
3369	struct ldtentry *lh;
3370	struct idtentry *ih;
3371	struct dtslot *t;
3372	int i;
3373
3374	/*
3375	 * force the left-most key on internal pages, at any level of
3376	 * the tree, to be less than any search key.
3377	 * this obviates having to update the leftmost key on an internal
3378	 * page when the user inserts a new key in the tree smaller than
3379	 * anything that has been stored.
3380	 *
3381	 * (? if/when dtSearch() narrows down to 1st entry (index = 0),
3382	 * at any internal page at any level of the tree,
3383	 * it descends to child of the entry anyway -
3384	 * ? make the entry as min size dummy entry)
3385	 *
3386	 * if (e->index == 0 && h->prevpg == P_INVALID && !(h->flags & BT_LEAF))
3387	 * return (1);
3388	 */
3389
3390	kname = key->name;
3391	klen = key->namlen;
3392
3393	/*
3394	 * leaf page entry
3395	 */
3396	if (p->header.flag & BT_LEAF) {
3397		lh = (struct ldtentry *) & p->slot[si];
3398		si = lh->next;
3399		name = lh->name;
3400		namlen = lh->namlen;
3401		if (flag & JFS_DIR_INDEX)
3402			len = min(namlen, DTLHDRDATALEN);
3403		else
3404			len = min(namlen, DTLHDRDATALEN_LEGACY);
3405	}
3406	/*
3407	 * internal page entry
3408	 */
3409	else {
3410		ih = (struct idtentry *) & p->slot[si];
3411		si = ih->next;
3412		name = ih->name;
3413		namlen = ih->namlen;
3414		len = min(namlen, DTIHDRDATALEN);
3415	}
3416
3417	/* compare with head/only segment */
3418	len = min(klen, len);
3419	for (i = 0; i < len; i++, kname++, name++) {
3420		/* only uppercase if case-insensitive support is on */
3421		if ((flag & JFS_OS2) == JFS_OS2)
3422			x = UniToupper(le16_to_cpu(*name));
3423		else
3424			x = le16_to_cpu(*name);
3425		if ((rc = *kname - x))
3426			return rc;
3427	}
3428
3429	klen -= len;
3430	namlen -= len;
3431
3432	/* compare with additional segment(s) */
3433	while (klen > 0 && namlen > 0) {
3434		/* compare with next name segment */
3435		t = (struct dtslot *) & p->slot[si];
3436		len = min(namlen, DTSLOTDATALEN);
3437		len = min(klen, len);
3438		name = t->name;
3439		for (i = 0; i < len; i++, kname++, name++) {
3440			/* only uppercase if case-insensitive support is on */
3441			if ((flag & JFS_OS2) == JFS_OS2)
3442				x = UniToupper(le16_to_cpu(*name));
3443			else
3444				x = le16_to_cpu(*name);
3445
3446			if ((rc = *kname - x))
3447				return rc;
3448		}
3449
3450		klen -= len;
3451		namlen -= len;
3452		si = t->next;
3453	}
3454
3455	return (klen - namlen);
3456}
3457
3458
3459/*
3460 *	ciGetLeafPrefixKey()
3461 *
3462 * function: compute prefix of suffix compression
3463 *	     from two adjacent leaf entries
3464 *	     across page boundary
3465 *
3466 * return: non-zero on error
3467 *
3468 */
3469static int ciGetLeafPrefixKey(dtpage_t * lp, int li, dtpage_t * rp,
3470			       int ri, struct component_name * key, int flag)
3471{
3472	int klen, namlen;
3473	wchar_t *pl, *pr, *kname;
3474	struct component_name lkey;
3475	struct component_name rkey;
3476
3477	lkey.name = kmalloc_array(JFS_NAME_MAX + 1, sizeof(wchar_t),
3478					GFP_KERNEL);
3479	if (lkey.name == NULL)
3480		return -ENOMEM;
3481
3482	rkey.name = kmalloc_array(JFS_NAME_MAX + 1, sizeof(wchar_t),
3483					GFP_KERNEL);
3484	if (rkey.name == NULL) {
3485		kfree(lkey.name);
3486		return -ENOMEM;
3487	}
3488
3489	/* get left and right key */
3490	dtGetKey(lp, li, &lkey, flag);
3491	lkey.name[lkey.namlen] = 0;
3492
3493	if ((flag & JFS_OS2) == JFS_OS2)
3494		ciToUpper(&lkey);
3495
3496	dtGetKey(rp, ri, &rkey, flag);
3497	rkey.name[rkey.namlen] = 0;
3498
3499
3500	if ((flag & JFS_OS2) == JFS_OS2)
3501		ciToUpper(&rkey);
3502
3503	/* compute prefix */
3504	klen = 0;
3505	kname = key->name;
3506	namlen = min(lkey.namlen, rkey.namlen);
3507	for (pl = lkey.name, pr = rkey.name;
3508	     namlen; pl++, pr++, namlen--, klen++, kname++) {
3509		*kname = *pr;
3510		if (*pl != *pr) {
3511			key->namlen = klen + 1;
3512			goto free_names;
3513		}
3514	}
3515
3516	/* l->namlen <= r->namlen since l <= r */
3517	if (lkey.namlen < rkey.namlen) {
3518		*kname = *pr;
3519		key->namlen = klen + 1;
3520	} else			/* l->namelen == r->namelen */
3521		key->namlen = klen;
3522
3523free_names:
3524	kfree(lkey.name);
3525	kfree(rkey.name);
3526	return 0;
3527}
3528
3529
3530
3531/*
3532 *	dtGetKey()
3533 *
3534 * function: get key of the entry
3535 */
3536static void dtGetKey(dtpage_t * p, int i,	/* entry index */
3537		     struct component_name * key, int flag)
3538{
3539	int si;
3540	s8 *stbl;
3541	struct ldtentry *lh;
3542	struct idtentry *ih;
3543	struct dtslot *t;
3544	int namlen, len;
3545	wchar_t *kname;
3546	__le16 *name;
3547
3548	/* get entry */
3549	stbl = DT_GETSTBL(p);
3550	si = stbl[i];
3551	if (p->header.flag & BT_LEAF) {
3552		lh = (struct ldtentry *) & p->slot[si];
3553		si = lh->next;
3554		namlen = lh->namlen;
3555		name = lh->name;
3556		if (flag & JFS_DIR_INDEX)
3557			len = min(namlen, DTLHDRDATALEN);
3558		else
3559			len = min(namlen, DTLHDRDATALEN_LEGACY);
3560	} else {
3561		ih = (struct idtentry *) & p->slot[si];
3562		si = ih->next;
3563		namlen = ih->namlen;
3564		name = ih->name;
3565		len = min(namlen, DTIHDRDATALEN);
3566	}
3567
3568	key->namlen = namlen;
3569	kname = key->name;
3570
3571	/*
3572	 * move head/only segment
3573	 */
3574	UniStrncpy_from_le(kname, name, len);
3575
3576	/*
3577	 * move additional segment(s)
3578	 */
3579	while (si >= 0) {
3580		/* get next segment */
3581		t = &p->slot[si];
3582		kname += len;
3583		namlen -= len;
3584		len = min(namlen, DTSLOTDATALEN);
3585		UniStrncpy_from_le(kname, t->name, len);
3586
3587		si = t->next;
3588	}
3589}
3590
3591
3592/*
3593 *	dtInsertEntry()
3594 *
3595 * function: allocate free slot(s) and
3596 *	     write a leaf/internal entry
3597 *
3598 * return: entry slot index
3599 */
3600static void dtInsertEntry(dtpage_t * p, int index, struct component_name * key,
3601			  ddata_t * data, struct dt_lock ** dtlock)
3602{
3603	struct dtslot *h, *t;
3604	struct ldtentry *lh = NULL;
3605	struct idtentry *ih = NULL;
3606	int hsi, fsi, klen, len, nextindex;
3607	wchar_t *kname;
3608	__le16 *name;
3609	s8 *stbl;
3610	pxd_t *xd;
3611	struct dt_lock *dtlck = *dtlock;
3612	struct lv *lv;
3613	int xsi, n;
3614	s64 bn = 0;
3615	struct metapage *mp = NULL;
3616
3617	klen = key->namlen;
3618	kname = key->name;
3619
3620	/* allocate a free slot */
3621	hsi = fsi = p->header.freelist;
3622	h = &p->slot[fsi];
3623	p->header.freelist = h->next;
3624	--p->header.freecnt;
3625
3626	/* open new linelock */
3627	if (dtlck->index >= dtlck->maxcnt)
3628		dtlck = (struct dt_lock *) txLinelock(dtlck);
3629
3630	lv = & dtlck->lv[dtlck->index];
3631	lv->offset = hsi;
3632
3633	/* write head/only segment */
3634	if (p->header.flag & BT_LEAF) {
3635		lh = (struct ldtentry *) h;
3636		lh->next = h->next;
3637		lh->inumber = cpu_to_le32(data->leaf.ino);
3638		lh->namlen = klen;
3639		name = lh->name;
3640		if (data->leaf.ip) {
3641			len = min(klen, DTLHDRDATALEN);
3642			if (!(p->header.flag & BT_ROOT))
3643				bn = addressPXD(&p->header.self);
3644			lh->index = cpu_to_le32(add_index(data->leaf.tid,
3645							  data->leaf.ip,
3646							  bn, index));
3647		} else
3648			len = min(klen, DTLHDRDATALEN_LEGACY);
3649	} else {
3650		ih = (struct idtentry *) h;
3651		ih->next = h->next;
3652		xd = (pxd_t *) ih;
3653		*xd = data->xd;
3654		ih->namlen = klen;
3655		name = ih->name;
3656		len = min(klen, DTIHDRDATALEN);
3657	}
3658
3659	UniStrncpy_to_le(name, kname, len);
3660
3661	n = 1;
3662	xsi = hsi;
3663
3664	/* write additional segment(s) */
3665	t = h;
3666	klen -= len;
3667	while (klen) {
3668		/* get free slot */
3669		fsi = p->header.freelist;
3670		t = &p->slot[fsi];
3671		p->header.freelist = t->next;
3672		--p->header.freecnt;
3673
3674		/* is next slot contiguous ? */
3675		if (fsi != xsi + 1) {
3676			/* close current linelock */
3677			lv->length = n;
3678			dtlck->index++;
3679
3680			/* open new linelock */
3681			if (dtlck->index < dtlck->maxcnt)
3682				lv++;
3683			else {
3684				dtlck = (struct dt_lock *) txLinelock(dtlck);
3685				lv = & dtlck->lv[0];
3686			}
3687
3688			lv->offset = fsi;
3689			n = 0;
3690		}
3691
3692		kname += len;
3693		len = min(klen, DTSLOTDATALEN);
3694		UniStrncpy_to_le(t->name, kname, len);
3695
3696		n++;
3697		xsi = fsi;
3698		klen -= len;
3699	}
3700
3701	/* close current linelock */
3702	lv->length = n;
3703	dtlck->index++;
3704
3705	*dtlock = dtlck;
3706
3707	/* terminate last/only segment */
3708	if (h == t) {
3709		/* single segment entry */
3710		if (p->header.flag & BT_LEAF)
3711			lh->next = -1;
3712		else
3713			ih->next = -1;
3714	} else
3715		/* multi-segment entry */
3716		t->next = -1;
3717
3718	/* if insert into middle, shift right succeeding entries in stbl */
3719	stbl = DT_GETSTBL(p);
3720	nextindex = p->header.nextindex;
3721	if (index < nextindex) {
3722		memmove(stbl + index + 1, stbl + index, nextindex - index);
3723
3724		if ((p->header.flag & BT_LEAF) && data->leaf.ip) {
3725			s64 lblock;
3726
3727			/*
3728			 * Need to update slot number for entries that moved
3729			 * in the stbl
3730			 */
3731			mp = NULL;
3732			for (n = index + 1; n <= nextindex; n++) {
3733				lh = (struct ldtentry *) & (p->slot[stbl[n]]);
3734				modify_index(data->leaf.tid, data->leaf.ip,
3735					     le32_to_cpu(lh->index), bn, n,
3736					     &mp, &lblock);
3737			}
3738			if (mp)
3739				release_metapage(mp);
3740		}
3741	}
3742
3743	stbl[index] = hsi;
3744
3745	/* advance next available entry index of stbl */
3746	++p->header.nextindex;
3747}
3748
3749
3750/*
3751 *	dtMoveEntry()
3752 *
3753 * function: move entries from split/left page to new/right page
3754 *
3755 *	nextindex of dst page and freelist/freecnt of both pages
3756 *	are updated.
3757 */
3758static void dtMoveEntry(dtpage_t * sp, int si, dtpage_t * dp,
3759			struct dt_lock ** sdtlock, struct dt_lock ** ddtlock,
3760			int do_index)
3761{
3762	int ssi, next;		/* src slot index */
3763	int di;			/* dst entry index */
3764	int dsi;		/* dst slot index */
3765	s8 *sstbl, *dstbl;	/* sorted entry table */
3766	int snamlen, len;
3767	struct ldtentry *slh, *dlh = NULL;
3768	struct idtentry *sih, *dih = NULL;
3769	struct dtslot *h, *s, *d;
3770	struct dt_lock *sdtlck = *sdtlock, *ddtlck = *ddtlock;
3771	struct lv *slv, *dlv;
3772	int xssi, ns, nd;
3773	int sfsi;
3774
3775	sstbl = (s8 *) & sp->slot[sp->header.stblindex];
3776	dstbl = (s8 *) & dp->slot[dp->header.stblindex];
3777
3778	dsi = dp->header.freelist;	/* first (whole page) free slot */
3779	sfsi = sp->header.freelist;
3780
3781	/* linelock destination entry slot */
3782	dlv = & ddtlck->lv[ddtlck->index];
3783	dlv->offset = dsi;
3784
3785	/* linelock source entry slot */
3786	slv = & sdtlck->lv[sdtlck->index];
3787	slv->offset = sstbl[si];
3788	xssi = slv->offset - 1;
3789
3790	/*
3791	 * move entries
3792	 */
3793	ns = nd = 0;
3794	for (di = 0; si < sp->header.nextindex; si++, di++) {
3795		ssi = sstbl[si];
3796		dstbl[di] = dsi;
3797
3798		/* is next slot contiguous ? */
3799		if (ssi != xssi + 1) {
3800			/* close current linelock */
3801			slv->length = ns;
3802			sdtlck->index++;
3803
3804			/* open new linelock */
3805			if (sdtlck->index < sdtlck->maxcnt)
3806				slv++;
3807			else {
3808				sdtlck = (struct dt_lock *) txLinelock(sdtlck);
3809				slv = & sdtlck->lv[0];
3810			}
3811
3812			slv->offset = ssi;
3813			ns = 0;
3814		}
3815
3816		/*
3817		 * move head/only segment of an entry
3818		 */
3819		/* get dst slot */
3820		h = d = &dp->slot[dsi];
3821
3822		/* get src slot and move */
3823		s = &sp->slot[ssi];
3824		if (sp->header.flag & BT_LEAF) {
3825			/* get source entry */
3826			slh = (struct ldtentry *) s;
3827			dlh = (struct ldtentry *) h;
3828			snamlen = slh->namlen;
3829
3830			if (do_index) {
3831				len = min(snamlen, DTLHDRDATALEN);
3832				dlh->index = slh->index; /* little-endian */
3833			} else
3834				len = min(snamlen, DTLHDRDATALEN_LEGACY);
3835
3836			memcpy(dlh, slh, 6 + len * 2);
3837
3838			next = slh->next;
3839
3840			/* update dst head/only segment next field */
3841			dsi++;
3842			dlh->next = dsi;
3843		} else {
3844			sih = (struct idtentry *) s;
3845			snamlen = sih->namlen;
3846
3847			len = min(snamlen, DTIHDRDATALEN);
3848			dih = (struct idtentry *) h;
3849			memcpy(dih, sih, 10 + len * 2);
3850			next = sih->next;
3851
3852			dsi++;
3853			dih->next = dsi;
3854		}
3855
3856		/* free src head/only segment */
3857		s->next = sfsi;
3858		s->cnt = 1;
3859		sfsi = ssi;
3860
3861		ns++;
3862		nd++;
3863		xssi = ssi;
3864
3865		/*
3866		 * move additional segment(s) of the entry
3867		 */
3868		snamlen -= len;
3869		while ((ssi = next) >= 0) {
3870			/* is next slot contiguous ? */
3871			if (ssi != xssi + 1) {
3872				/* close current linelock */
3873				slv->length = ns;
3874				sdtlck->index++;
3875
3876				/* open new linelock */
3877				if (sdtlck->index < sdtlck->maxcnt)
3878					slv++;
3879				else {
3880					sdtlck =
3881					    (struct dt_lock *)
3882					    txLinelock(sdtlck);
3883					slv = & sdtlck->lv[0];
3884				}
3885
3886				slv->offset = ssi;
3887				ns = 0;
3888			}
3889
3890			/* get next source segment */
3891			s = &sp->slot[ssi];
3892
3893			/* get next destination free slot */
3894			d++;
3895
3896			len = min(snamlen, DTSLOTDATALEN);
3897			UniStrncpy_le(d->name, s->name, len);
3898
3899			ns++;
3900			nd++;
3901			xssi = ssi;
3902
3903			dsi++;
3904			d->next = dsi;
3905
3906			/* free source segment */
3907			next = s->next;
3908			s->next = sfsi;
3909			s->cnt = 1;
3910			sfsi = ssi;
3911
3912			snamlen -= len;
3913		}		/* end while */
3914
3915		/* terminate dst last/only segment */
3916		if (h == d) {
3917			/* single segment entry */
3918			if (dp->header.flag & BT_LEAF)
3919				dlh->next = -1;
3920			else
3921				dih->next = -1;
3922		} else
3923			/* multi-segment entry */
3924			d->next = -1;
3925	}			/* end for */
3926
3927	/* close current linelock */
3928	slv->length = ns;
3929	sdtlck->index++;
3930	*sdtlock = sdtlck;
3931
3932	dlv->length = nd;
3933	ddtlck->index++;
3934	*ddtlock = ddtlck;
3935
3936	/* update source header */
3937	sp->header.freelist = sfsi;
3938	sp->header.freecnt += nd;
3939
3940	/* update destination header */
3941	dp->header.nextindex = di;
3942
3943	dp->header.freelist = dsi;
3944	dp->header.freecnt -= nd;
3945}
3946
3947
3948/*
3949 *	dtDeleteEntry()
3950 *
3951 * function: free a (leaf/internal) entry
3952 *
3953 * log freelist header, stbl, and each segment slot of entry
3954 * (even though last/only segment next field is modified,
3955 * physical image logging requires all segment slots of
3956 * the entry logged to avoid applying previous updates
3957 * to the same slots)
3958 */
3959static void dtDeleteEntry(dtpage_t * p, int fi, struct dt_lock ** dtlock)
3960{
3961	int fsi;		/* free entry slot index */
3962	s8 *stbl;
3963	struct dtslot *t;
3964	int si, freecnt;
3965	struct dt_lock *dtlck = *dtlock;
3966	struct lv *lv;
3967	int xsi, n;
3968
3969	/* get free entry slot index */
3970	stbl = DT_GETSTBL(p);
3971	fsi = stbl[fi];
3972
3973	/* open new linelock */
3974	if (dtlck->index >= dtlck->maxcnt)
3975		dtlck = (struct dt_lock *) txLinelock(dtlck);
3976	lv = & dtlck->lv[dtlck->index];
3977
3978	lv->offset = fsi;
3979
3980	/* get the head/only segment */
3981	t = &p->slot[fsi];
3982	if (p->header.flag & BT_LEAF)
3983		si = ((struct ldtentry *) t)->next;
3984	else
3985		si = ((struct idtentry *) t)->next;
3986	t->next = si;
3987	t->cnt = 1;
3988
3989	n = freecnt = 1;
3990	xsi = fsi;
3991
3992	/* find the last/only segment */
3993	while (si >= 0) {
3994		/* is next slot contiguous ? */
3995		if (si != xsi + 1) {
3996			/* close current linelock */
3997			lv->length = n;
3998			dtlck->index++;
3999
4000			/* open new linelock */
4001			if (dtlck->index < dtlck->maxcnt)
4002				lv++;
4003			else {
4004				dtlck = (struct dt_lock *) txLinelock(dtlck);
4005				lv = & dtlck->lv[0];
4006			}
4007
4008			lv->offset = si;
4009			n = 0;
4010		}
4011
4012		n++;
4013		xsi = si;
4014		freecnt++;
4015
4016		t = &p->slot[si];
4017		t->cnt = 1;
4018		si = t->next;
4019	}
4020
4021	/* close current linelock */
4022	lv->length = n;
4023	dtlck->index++;
4024
4025	*dtlock = dtlck;
4026
4027	/* update freelist */
4028	t->next = p->header.freelist;
4029	p->header.freelist = fsi;
4030	p->header.freecnt += freecnt;
4031
4032	/* if delete from middle,
4033	 * shift left the succedding entries in the stbl
4034	 */
4035	si = p->header.nextindex;
4036	if (fi < si - 1)
4037		memmove(&stbl[fi], &stbl[fi + 1], si - fi - 1);
4038
4039	p->header.nextindex--;
4040}
4041
4042
4043/*
4044 *	dtTruncateEntry()
4045 *
4046 * function: truncate a (leaf/internal) entry
4047 *
4048 * log freelist header, stbl, and each segment slot of entry
4049 * (even though last/only segment next field is modified,
4050 * physical image logging requires all segment slots of
4051 * the entry logged to avoid applying previous updates
4052 * to the same slots)
4053 */
4054static void dtTruncateEntry(dtpage_t * p, int ti, struct dt_lock ** dtlock)
4055{
4056	int tsi;		/* truncate entry slot index */
4057	s8 *stbl;
4058	struct dtslot *t;
4059	int si, freecnt;
4060	struct dt_lock *dtlck = *dtlock;
4061	struct lv *lv;
4062	int fsi, xsi, n;
4063
4064	/* get free entry slot index */
4065	stbl = DT_GETSTBL(p);
4066	tsi = stbl[ti];
4067
4068	/* open new linelock */
4069	if (dtlck->index >= dtlck->maxcnt)
4070		dtlck = (struct dt_lock *) txLinelock(dtlck);
4071	lv = & dtlck->lv[dtlck->index];
4072
4073	lv->offset = tsi;
4074
4075	/* get the head/only segment */
4076	t = &p->slot[tsi];
4077	ASSERT(p->header.flag & BT_INTERNAL);
4078	((struct idtentry *) t)->namlen = 0;
4079	si = ((struct idtentry *) t)->next;
4080	((struct idtentry *) t)->next = -1;
4081
4082	n = 1;
4083	freecnt = 0;
4084	fsi = si;
4085	xsi = tsi;
4086
4087	/* find the last/only segment */
4088	while (si >= 0) {
4089		/* is next slot contiguous ? */
4090		if (si != xsi + 1) {
4091			/* close current linelock */
4092			lv->length = n;
4093			dtlck->index++;
4094
4095			/* open new linelock */
4096			if (dtlck->index < dtlck->maxcnt)
4097				lv++;
4098			else {
4099				dtlck = (struct dt_lock *) txLinelock(dtlck);
4100				lv = & dtlck->lv[0];
4101			}
4102
4103			lv->offset = si;
4104			n = 0;
4105		}
4106
4107		n++;
4108		xsi = si;
4109		freecnt++;
4110
4111		t = &p->slot[si];
4112		t->cnt = 1;
4113		si = t->next;
4114	}
4115
4116	/* close current linelock */
4117	lv->length = n;
4118	dtlck->index++;
4119
4120	*dtlock = dtlck;
4121
4122	/* update freelist */
4123	if (freecnt == 0)
4124		return;
4125	t->next = p->header.freelist;
4126	p->header.freelist = fsi;
4127	p->header.freecnt += freecnt;
4128}
4129
4130
4131/*
4132 *	dtLinelockFreelist()
4133 */
4134static void dtLinelockFreelist(dtpage_t * p,	/* directory page */
4135			       int m,	/* max slot index */
4136			       struct dt_lock ** dtlock)
4137{
4138	int fsi;		/* free entry slot index */
4139	struct dtslot *t;
4140	int si;
4141	struct dt_lock *dtlck = *dtlock;
4142	struct lv *lv;
4143	int xsi, n;
4144
4145	/* get free entry slot index */
4146	fsi = p->header.freelist;
4147
4148	/* open new linelock */
4149	if (dtlck->index >= dtlck->maxcnt)
4150		dtlck = (struct dt_lock *) txLinelock(dtlck);
4151	lv = & dtlck->lv[dtlck->index];
4152
4153	lv->offset = fsi;
4154
4155	n = 1;
4156	xsi = fsi;
4157
4158	t = &p->slot[fsi];
4159	si = t->next;
4160
4161	/* find the last/only segment */
4162	while (si < m && si >= 0) {
4163		/* is next slot contiguous ? */
4164		if (si != xsi + 1) {
4165			/* close current linelock */
4166			lv->length = n;
4167			dtlck->index++;
4168
4169			/* open new linelock */
4170			if (dtlck->index < dtlck->maxcnt)
4171				lv++;
4172			else {
4173				dtlck = (struct dt_lock *) txLinelock(dtlck);
4174				lv = & dtlck->lv[0];
4175			}
4176
4177			lv->offset = si;
4178			n = 0;
4179		}
4180
4181		n++;
4182		xsi = si;
4183
4184		t = &p->slot[si];
4185		si = t->next;
4186	}
4187
4188	/* close current linelock */
4189	lv->length = n;
4190	dtlck->index++;
4191
4192	*dtlock = dtlck;
4193}
4194
4195
4196/*
4197 * NAME: dtModify
4198 *
4199 * FUNCTION: Modify the inode number part of a directory entry
4200 *
4201 * PARAMETERS:
4202 *	tid	- Transaction id
4203 *	ip	- Inode of parent directory
4204 *	key	- Name of entry to be modified
4205 *	orig_ino	- Original inode number expected in entry
4206 *	new_ino	- New inode number to put into entry
4207 *	flag	- JFS_RENAME
4208 *
4209 * RETURNS:
4210 *	-ESTALE	- If entry found does not match orig_ino passed in
4211 *	-ENOENT	- If no entry can be found to match key
4212 *	0	- If successfully modified entry
4213 */
4214int dtModify(tid_t tid, struct inode *ip,
4215	 struct component_name * key, ino_t * orig_ino, ino_t new_ino, int flag)
4216{
4217	int rc;
4218	s64 bn;
4219	struct metapage *mp;
4220	dtpage_t *p;
4221	int index;
4222	struct btstack btstack;
4223	struct tlock *tlck;
4224	struct dt_lock *dtlck;
4225	struct lv *lv;
4226	s8 *stbl;
4227	int entry_si;		/* entry slot index */
4228	struct ldtentry *entry;
4229
4230	/*
4231	 *	search for the entry to modify:
4232	 *
4233	 * dtSearch() returns (leaf page pinned, index at which to modify).
4234	 */
4235	if ((rc = dtSearch(ip, key, orig_ino, &btstack, flag)))
4236		return rc;
4237
4238	/* retrieve search result */
4239	DT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
4240
4241	BT_MARK_DIRTY(mp, ip);
4242	/*
4243	 * acquire a transaction lock on the leaf page of named entry
4244	 */
4245	tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY);
4246	dtlck = (struct dt_lock *) & tlck->lock;
4247
4248	/* get slot index of the entry */
4249	stbl = DT_GETSTBL(p);
4250	entry_si = stbl[index];
4251
4252	/* linelock entry */
4253	ASSERT(dtlck->index == 0);
4254	lv = & dtlck->lv[0];
4255	lv->offset = entry_si;
4256	lv->length = 1;
4257	dtlck->index++;
4258
4259	/* get the head/only segment */
4260	entry = (struct ldtentry *) & p->slot[entry_si];
4261
4262	/* substitute the inode number of the entry */
4263	entry->inumber = cpu_to_le32(new_ino);
4264
4265	/* unpin the leaf page */
4266	DT_PUTPAGE(mp);
4267
4268	return 0;
4269}
4270