1/*
2 *   Copyright (c) International Business Machines Corp., 2000-2002
3 *
4 *   This program is free software;  you can redistribute it and/or modify
5 *   it under the terms of the GNU General Public License as published by
6 *   the Free Software Foundation; either version 2 of the License, or
7 *   (at your option) any later version.
8 *
9 *   This program is distributed in the hope that it will be useful,
10 *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
11 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
12 *   the GNU General Public License for more details.
13 *
14 *   You should have received a copy of the GNU General Public License
15 *   along with this program;  if not, write to the Free Software
16 *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */
18
19/*
20 *	jfs_imap.c: inode allocation map manager
21 *
22 * Serialization:
23 *   Each AG has a simple lock which is used to control the serialization of
24 *	the AG level lists.  This lock should be taken first whenever an AG
25 *	level list will be modified or accessed.
26 *
27 *   Each IAG is locked by obtaining the buffer for the IAG page.
28 *
 *   There is also an inode lock for the inode map inode.  A read lock needs to
30 *	be taken whenever an IAG is read from the map or the global level
31 *	information is read.  A write lock needs to be taken whenever the global
32 *	level information is modified or an atomic operation needs to be used.
33 *
34 *	If more than one IAG is read at one time, the read lock may not
35 *	be given up until all of the IAG's are read.  Otherwise, a deadlock
36 *	may occur when trying to obtain the read lock while another thread
37 *	holding the read lock is waiting on the IAG already being held.
38 *
39 *   The control page of the inode map is read into memory by diMount().
40 *	Thereafter it should only be modified in memory and then it will be
41 *	written out when the filesystem is unmounted by diUnmount().
42 */
43
44#include <linux/fs.h>
45#include <linux/locks.h>
46#include "jfs_incore.h"
47#include "jfs_filsys.h"
48#include "jfs_dinode.h"
49#include "jfs_dmap.h"
50#include "jfs_imap.h"
51#include "jfs_metapage.h"
52#include "jfs_superblock.h"
53#include "jfs_debug.h"
54
55/*
56 * imap locks
57 */
/*
 * iag free list lock: initialised with init_MUTEX(), taken with down()
 * and dropped with up() on imap->im_freelock.
 *
 * Every macro parameter is parenthesised so that an expression argument
 * (e.g. "base + 1" or "agno + 1") binds as a unit inside the expansion.
 */
#define IAGFREE_LOCK_INIT(imap)		init_MUTEX(&(imap)->im_freelock)
#define IAGFREE_LOCK(imap)		down(&(imap)->im_freelock)
#define IAGFREE_UNLOCK(imap)		up(&(imap)->im_freelock)

/* per ag iag list locks: one entry of imap->im_aglock[] per AG index */
#define AG_LOCK_INIT(imap,index)	init_MUTEX(&((imap)->im_aglock[(index)]))
#define AG_LOCK(imap,agno)		down(&(imap)->im_aglock[(agno)])
#define AG_UNLOCK(imap,agno)		up(&(imap)->im_aglock[(agno)])
67
68/*
69 * external references
70 */
71extern struct address_space_operations jfs_aops;
72
73/*
74 * forward references
75 */
76static int diAllocAG(struct inomap *, int, boolean_t, struct inode *);
77static int diAllocAny(struct inomap *, int, boolean_t, struct inode *);
78static int diAllocBit(struct inomap *, struct iag *, int);
79static int diAllocExt(struct inomap *, int, struct inode *);
80static int diAllocIno(struct inomap *, int, struct inode *);
81static int diFindFree(u32, int);
82static int diNewExt(struct inomap *, struct iag *, int);
83static int diNewIAG(struct inomap *, int *, int, struct metapage **);
84static void duplicateIXtree(struct super_block *, s64, int, s64 *);
85
86static int diIAGRead(struct inomap * imap, int, struct metapage **);
87static int copy_from_dinode(struct dinode *, struct inode *);
88static void copy_to_dinode(struct dinode *, struct inode *);
89
90/*
91 *	debug code for double-checking inode map
92 */
93/* #define	_JFS_DEBUG_IMAP	1 */
94
#ifndef	_JFS_DEBUG_IMAP
/* map debugging compiled out: every hook expands to nothing */
#define DBG_DIINIT(imap)
#define DBG_DIALLOC(imap, ino)
#define DBG_DIFREE(imap, ino)
#else				/* _JFS_DEBUG_IMAP */
/* map debugging compiled in: each hook forwards to its DBGdi*() routine */
static void *DBGdiInit(struct inomap * imap);
static void DBGdiAlloc(struct inomap * imap, ino_t ino);
static void DBGdiFree(struct inomap * imap, ino_t ino);

#define DBG_DIINIT(imap)	DBGdiInit(imap)
#define DBG_DIALLOC(imap, ino)	DBGdiAlloc(imap, ino)
#define DBG_DIFREE(imap, ino)	DBGdiFree(imap, ino)
#endif				/* _JFS_DEBUG_IMAP */
108
109/*
110 * NAME:        diMount()
111 *
112 * FUNCTION:    initialize the incore inode map control structures for
113 *		a fileset or aggregate init time.
114 *
115 *              the inode map's control structure (dinomap) is
116 *              brought in from disk and placed in virtual memory.
117 *
118 * PARAMETERS:
119 *      ipimap  - pointer to inode map inode for the aggregate or fileset.
120 *
121 * RETURN VALUES:
122 *      0       - success
123 *      ENOMEM  - insufficient free virtual memory.
124 *      EIO  	- i/o error.
125 */
126int diMount(struct inode *ipimap)
127{
128	struct inomap *imap;
129	struct metapage *mp;
130	int index;
131	struct dinomap *dinom_le;
132
133	/*
134	 * allocate/initialize the in-memory inode map control structure
135	 */
136	/* allocate the in-memory inode map control structure. */
137	imap = (struct inomap *) kmalloc(sizeof(struct inomap), GFP_KERNEL);
138	if (imap == NULL) {
139		jERROR(1, ("diMount: kmalloc returned NULL!\n"));
140		return (ENOMEM);
141	}
142
143	/* read the on-disk inode map control structure. */
144
145	mp = read_metapage(ipimap,
146			   IMAPBLKNO << JFS_SBI(ipimap->i_sb)->l2nbperpage,
147			   PSIZE, 0);
148	if (mp == NULL) {
149		kfree(imap);
150		return (EIO);
151	}
152
153	/* copy the on-disk version to the in-memory version. */
154	dinom_le = (struct dinomap *) mp->data;
155	imap->im_freeiag = le32_to_cpu(dinom_le->in_freeiag);
156	imap->im_nextiag = le32_to_cpu(dinom_le->in_nextiag);
157	atomic_set(&imap->im_numinos, le32_to_cpu(dinom_le->in_numinos));
158	atomic_set(&imap->im_numfree, le32_to_cpu(dinom_le->in_numfree));
159	imap->im_nbperiext = le32_to_cpu(dinom_le->in_nbperiext);
160	imap->im_l2nbperiext = le32_to_cpu(dinom_le->in_l2nbperiext);
161	for (index = 0; index < MAXAG; index++) {
162		imap->im_agctl[index].inofree =
163		    le32_to_cpu(dinom_le->in_agctl[index].inofree);
164		imap->im_agctl[index].extfree =
165		    le32_to_cpu(dinom_le->in_agctl[index].extfree);
166		imap->im_agctl[index].numinos =
167		    le32_to_cpu(dinom_le->in_agctl[index].numinos);
168		imap->im_agctl[index].numfree =
169		    le32_to_cpu(dinom_le->in_agctl[index].numfree);
170	}
171
172	/* release the buffer. */
173	release_metapage(mp);
174
175	/*
176	 * allocate/initialize inode allocation map locks
177	 */
178	/* allocate and init iag free list lock */
179	IAGFREE_LOCK_INIT(imap);
180
181	/* allocate and init ag list locks */
182	for (index = 0; index < MAXAG; index++) {
183		AG_LOCK_INIT(imap, index);
184	}
185
186	/* bind the inode map inode and inode map control structure
187	 * to each other.
188	 */
189	imap->im_ipimap = ipimap;
190	JFS_IP(ipimap)->i_imap = imap;
191
192//      DBG_DIINIT(imap);
193
194	return (0);
195}
196
197
198/*
199 * NAME:        diUnmount()
200 *
201 * FUNCTION:    write to disk the incore inode map control structures for
202 *		a fileset or aggregate at unmount time.
203 *
204 * PARAMETERS:
205 *      ipimap  - pointer to inode map inode for the aggregate or fileset.
206 *
 * RETURN VALUES:
 *      0       - success; this is the only value currently returned.
 *		  ENOMEM and EIO are never produced here: the result of
 *		  diSync() is not checked.
211 */
212int diUnmount(struct inode *ipimap, int mounterror)
213{
214	struct inomap *imap = JFS_IP(ipimap)->i_imap;
215
216	/*
217	 * update the on-disk inode map control structure
218	 */
219
220	if (!(mounterror || isReadOnly(ipimap)))
221		diSync(ipimap);
222
223	/*
224	 * Invalidate the page cache buffers
225	 */
226	truncate_inode_pages(ipimap->i_mapping, 0);
227
228	/*
229	 * free in-memory control structure
230	 */
231	kfree(imap);
232
233	return (0);
234}
235
236
237/*
238 *	diSync()
239 */
240int diSync(struct inode *ipimap)
241{
242	struct dinomap *dinom_le;
243	struct inomap *imp = JFS_IP(ipimap)->i_imap;
244	struct metapage *mp;
245	int index;
246
247	/*
248	 * write imap global conrol page
249	 */
250	/* read the on-disk inode map control structure */
251	mp = get_metapage(ipimap,
252			  IMAPBLKNO << JFS_SBI(ipimap->i_sb)->l2nbperpage,
253			  PSIZE, 0);
254	if (mp == NULL) {
255		jERROR(1,("diSync: get_metapage failed!\n"));
256		return EIO;
257	}
258
259	/* copy the in-memory version to the on-disk version */
260	dinom_le = (struct dinomap *) mp->data;
261	dinom_le->in_freeiag = cpu_to_le32(imp->im_freeiag);
262	dinom_le->in_nextiag = cpu_to_le32(imp->im_nextiag);
263	dinom_le->in_numinos = cpu_to_le32(atomic_read(&imp->im_numinos));
264	dinom_le->in_numfree = cpu_to_le32(atomic_read(&imp->im_numfree));
265	dinom_le->in_nbperiext = cpu_to_le32(imp->im_nbperiext);
266	dinom_le->in_l2nbperiext = cpu_to_le32(imp->im_l2nbperiext);
267	for (index = 0; index < MAXAG; index++) {
268		dinom_le->in_agctl[index].inofree =
269		    cpu_to_le32(imp->im_agctl[index].inofree);
270		dinom_le->in_agctl[index].extfree =
271		    cpu_to_le32(imp->im_agctl[index].extfree);
272		dinom_le->in_agctl[index].numinos =
273		    cpu_to_le32(imp->im_agctl[index].numinos);
274		dinom_le->in_agctl[index].numfree =
275		    cpu_to_le32(imp->im_agctl[index].numfree);
276	}
277
278	/* write out the control structure */
279	write_metapage(mp);
280
281	/*
282	 * write out dirty pages of imap
283	 */
284	fsync_inode_data_buffers(ipimap);
285
286	diWriteSpecial(ipimap, 0);
287
288	return (0);
289}
290
291
292/*
293 * NAME:        diRead()
294 *
295 * FUNCTION:    initialize an incore inode from disk.
296 *
 *		on entry, the specified incore inode should itself
 *		specify the disk inode number corresponding to the
 *		incore inode (i.e. i_ino should be initialized).
300 *
301 *		this routine handles incore inode initialization for
302 *		both "special" and "regular" inodes.  special inodes
303 *		are those required early in the mount process and
304 *	        require special handling since much of the file system
305 *		is not yet initialized.  these "special" inodes are
306 *		identified by a NULL inode map inode pointer and are
307 *		actually initialized by a call to diReadSpecial().
308 *
309 *		for regular inodes, the iag describing the disk inode
310 *		is read from disk to determine the inode extent address
311 *		for the disk inode.  with the inode extent address in
312 *		hand, the page of the extent that contains the disk
313 *		inode is read and the disk inode is copied to the
314 *		incore inode.
315 *
316 * PARAMETERS:
317 *      ip  -  pointer to incore inode to be initialized from disk.
318 *
319 * RETURN VALUES:
320 *      0       - success
321 *      EIO  	- i/o error.
322 *      ENOMEM	- insufficient memory
323 *
324 */
325int diRead(struct inode *ip)
326{
327	struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
328	int iagno, ino, extno, rc;
329	struct inode *ipimap;
330	struct dinode *dp;
331	struct iag *iagp;
332	struct metapage *mp;
333	s64 blkno, agstart;
334	struct inomap *imap;
335	int block_offset;
336	int inodes_left;
337	uint pageno;
338	int rel_inode;
339
340	jFYI(1, ("diRead: ino = %ld\n", ip->i_ino));
341
342	ipimap = sbi->ipimap;
343	JFS_IP(ip)->ipimap = ipimap;
344
345	/* determine the iag number for this inode (number) */
346	iagno = INOTOIAG(ip->i_ino);
347
348	/* read the iag */
349	imap = JFS_IP(ipimap)->i_imap;
350	IREAD_LOCK(ipimap);
351	rc = diIAGRead(imap, iagno, &mp);
352	IREAD_UNLOCK(ipimap);
353	if (rc) {
354		jERROR(1, ("diRead: diIAGRead returned %d\n", rc));
355		return (rc);
356	}
357
358	iagp = (struct iag *) mp->data;
359
360	/* determine inode extent that holds the disk inode */
361	ino = ip->i_ino & (INOSPERIAG - 1);
362	extno = ino >> L2INOSPEREXT;
363
364	if ((lengthPXD(&iagp->inoext[extno]) != imap->im_nbperiext) ||
365	    (addressPXD(&iagp->inoext[extno]) == 0)) {
366		release_metapage(mp);
367		return ESTALE;
368	}
369
370	/* get disk block number of the page within the inode extent
371	 * that holds the disk inode.
372	 */
373	blkno = INOPBLK(&iagp->inoext[extno], ino, sbi->l2nbperpage);
374
375	/* get the ag for the iag */
376	agstart = le64_to_cpu(iagp->agstart);
377
378	release_metapage(mp);
379
380	rel_inode = (ino & (INOSPERPAGE - 1));
381	pageno = blkno >> sbi->l2nbperpage;
382
383	if ((block_offset = ((u32) blkno & (sbi->nbperpage - 1)))) {
384		/*
385		 * OS/2 didn't always align inode extents on page boundaries
386		 */
387		inodes_left =
388		     (sbi->nbperpage - block_offset) << sbi->l2niperblk;
389
390		if (rel_inode < inodes_left)
391			rel_inode += block_offset << sbi->l2niperblk;
392		else {
393			pageno += 1;
394			rel_inode -= inodes_left;
395		}
396	}
397
398	/* read the page of disk inode */
399	mp = read_metapage(ipimap, pageno << sbi->l2nbperpage, PSIZE, 1);
400	if (mp == 0) {
401		jERROR(1, ("diRead: read_metapage failed\n"));
402		return EIO;
403	}
404
405	/* locate the the disk inode requested */
406	dp = (struct dinode *) mp->data;
407	dp += rel_inode;
408
409	if (ip->i_ino != le32_to_cpu(dp->di_number)) {
410		jERROR(1, ("diRead: i_ino != di_number\n"));
411		updateSuper(ip->i_sb, FM_DIRTY);
412		rc = EIO;
413	} else if (le32_to_cpu(dp->di_nlink) == 0)
414		rc = ESTALE;
415	else
416		/* copy the disk inode to the in-memory inode */
417		rc = copy_from_dinode(dp, ip);
418
419	release_metapage(mp);
420
421	/* set the ag for the inode */
422	JFS_IP(ip)->agno = BLKTOAG(agstart, sbi);
423	JFS_IP(ip)->active_ag = -1;
424
425	return (rc);
426}
427
428
429/*
430 * NAME:        diReadSpecial()
431 *
432 * FUNCTION:    initialize a 'special' inode from disk.
433 *
 *		this routine handles aggregate level inodes.  The
435 *		inode cache cannot differentiate between the
436 *		aggregate inodes and the filesystem inodes, so we
437 *		handle these here.  We don't actually use the aggregate
438 *	        inode map, since these inodes are at a fixed location
439 *		and in some cases the aggregate inode map isn't initialized
440 *		yet.
441 *
442 * PARAMETERS:
443 *      sb - filesystem superblock
444 *	inum - aggregate inode number
445 *	secondary - 1 if secondary aggregate inode table
446 *
447 * RETURN VALUES:
448 *      new inode	- success
449 *      NULL		- i/o error.
450 */
451struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary)
452{
453	struct jfs_sb_info *sbi = JFS_SBI(sb);
454	uint address;
455	struct dinode *dp;
456	struct inode *ip;
457	struct metapage *mp;
458	int rc;
459
460	ip = new_inode(sb);
461	if (ip == NULL) {
462		jERROR(1,
463		       ("diReadSpecial: new_inode returned NULL!\n"));
464		return ip;
465	}
466
467	rc = alloc_jfs_inode(ip);
468	if (rc) {
469		make_bad_inode(ip);
470		iput(ip);
471		return NULL;
472	}
473
474	if (secondary) {
475		address = addressPXD(&sbi->ait2) >> sbi->l2nbperpage;
476		JFS_IP(ip)->ipimap = sbi->ipaimap2;
477	} else {
478		address = AITBL_OFF >> L2PSIZE;
479		JFS_IP(ip)->ipimap = sbi->ipaimap;
480	}
481
482	ASSERT(inum < INOSPEREXT);
483
484	ip->i_ino = inum;
485
486	address += inum >> 3;	/* 8 inodes per 4K page */
487
488	/* read the page of fixed disk inode (AIT) in raw mode */
489	jEVENT(0,
490	       ("Reading aggregate inode %d from block %d\n", (uint) inum,
491		address));
492	mp = read_metapage(ip, address << sbi->l2nbperpage, PSIZE, 1);
493	if (mp == NULL) {
494		ip->i_sb = NULL;
495		ip->i_nlink = 1;	/* Don't want iput() deleting it */
496		iput(ip);
497		return (NULL);
498	}
499
500	/* get the pointer to the disk inode of interest */
501	dp = (struct dinode *) (mp->data);
502	dp += inum % 8;		/* 8 inodes per 4K page */
503
504	/* copy on-disk inode to in-memory inode */
505	if ((copy_from_dinode(dp, ip)) != 0) {
506		/* handle bad return by returning NULL for ip */
507		ip->i_sb = NULL;
508		ip->i_nlink = 1;	/* Don't want iput() deleting it */
509		iput(ip);
510		/* release the page */
511		release_metapage(mp);
512		return (NULL);
513
514	}
515
516	ip->i_mapping->a_ops = &jfs_aops;
517	ip->i_mapping->gfp_mask = GFP_NOFS;
518
519	if ((inum == FILESYSTEM_I) && (JFS_IP(ip)->ipimap == sbi->ipaimap)) {
520		sbi->gengen = le32_to_cpu(dp->di_gengen);
521		sbi->inostamp = le32_to_cpu(dp->di_inostamp);
522	}
523
524	/* release the page */
525	release_metapage(mp);
526
527	return (ip);
528}
529
530/*
531 * NAME:        diWriteSpecial()
532 *
533 * FUNCTION:    Write the special inode to disk
534 *
535 * PARAMETERS:
536 *      ip - special inode
537 *	secondary - 1 if secondary aggregate inode table
538 *
539 * RETURN VALUES: none
540 */
541
542void diWriteSpecial(struct inode *ip, int secondary)
543{
544	struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
545	uint address;
546	struct dinode *dp;
547	ino_t inum = ip->i_ino;
548	struct metapage *mp;
549
550	ip->i_state &= ~I_DIRTY;
551
552	if (secondary)
553		address = addressPXD(&sbi->ait2) >> sbi->l2nbperpage;
554	else
555		address = AITBL_OFF >> L2PSIZE;
556
557	ASSERT(inum < INOSPEREXT);
558
559	address += inum >> 3;	/* 8 inodes per 4K page */
560
561	/* read the page of fixed disk inode (AIT) in raw mode */
562	jEVENT(0,
563	       ("Reading aggregate inode %d from block %d\n", (uint) inum,
564		address));
565	mp = read_metapage(ip, address << sbi->l2nbperpage, PSIZE, 1);
566	if (mp == NULL) {
567		jERROR(1,
568		       ("diWriteSpecial: failed to read aggregate inode extent!\n"));
569		return;
570	}
571
572	/* get the pointer to the disk inode of interest */
573	dp = (struct dinode *) (mp->data);
574	dp += inum % 8;		/* 8 inodes per 4K page */
575
576	/* copy on-disk inode to in-memory inode */
577	copy_to_dinode(dp, ip);
578	memcpy(&dp->di_xtroot, &JFS_IP(ip)->i_xtroot, 288);
579
580	if (inum == FILESYSTEM_I)
581		dp->di_gengen = cpu_to_le32(sbi->gengen);
582
583	/* write the page */
584	write_metapage(mp);
585}
586
587/*
588 * NAME:        diFreeSpecial()
589 *
590 * FUNCTION:    Free allocated space for special inode
591 */
592void diFreeSpecial(struct inode *ip)
593{
594	if (ip == NULL) {
595		jERROR(1, ("diFreeSpecial called with NULL ip!\n"));
596		return;
597	}
598	fsync_inode_data_buffers(ip);
599	truncate_inode_pages(ip->i_mapping, 0);
600	iput(ip);
601}
602
603
604
605/*
606 * NAME:        diWrite()
607 *
608 * FUNCTION:    write the on-disk inode portion of the in-memory inode
609 *		to its corresponding on-disk inode.
610 *
611 *		on entry, the specifed incore inode should itself
612 *		specify the disk inode number corresponding to the
613 *		incore inode (i.e. i_number should be initialized).
614 *
615 *		the inode contains the inode extent address for the disk
616 *		inode.  with the inode extent address in hand, the
617 *		page of the extent that contains the disk inode is
618 *		read and the disk inode portion of the incore inode
619 *		is copied to the disk inode.
620 *
621 * PARAMETERS:
622 *	tid -  transacation id
623 *      ip  -  pointer to incore inode to be written to the inode extent.
624 *
625 * RETURN VALUES:
626 *      0       - success
627 *      EIO  	- i/o error.
628 */
629int diWrite(tid_t tid, struct inode *ip)
630{
631	struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
632	struct jfs_inode_info *jfs_ip = JFS_IP(ip);
633	int rc = 0;
634	s32 ino;
635	struct dinode *dp;
636	s64 blkno;
637	int block_offset;
638	int inodes_left;
639	struct metapage *mp;
640	uint pageno;
641	int rel_inode;
642	int dioffset;
643	struct inode *ipimap;
644	uint type;
645	lid_t lid;
646	struct tlock *ditlck, *tlck;
647	struct linelock *dilinelock, *ilinelock;
648	struct lv *lv;
649	int n;
650
651	ipimap = jfs_ip->ipimap;
652
653	ino = ip->i_ino & (INOSPERIAG - 1);
654
655	assert(lengthPXD(&(jfs_ip->ixpxd)) ==
656	       JFS_IP(ipimap)->i_imap->im_nbperiext);
657	assert(addressPXD(&(jfs_ip->ixpxd)));
658
659	/*
660	 * read the page of disk inode containing the specified inode:
661	 */
662	/* compute the block address of the page */
663	blkno = INOPBLK(&(jfs_ip->ixpxd), ino, sbi->l2nbperpage);
664
665	rel_inode = (ino & (INOSPERPAGE - 1));
666	pageno = blkno >> sbi->l2nbperpage;
667
668	if ((block_offset = ((u32) blkno & (sbi->nbperpage - 1)))) {
669		/*
670		 * OS/2 didn't always align inode extents on page boundaries
671		 */
672		inodes_left =
673		    (sbi->nbperpage - block_offset) << sbi->l2niperblk;
674
675		if (rel_inode < inodes_left)
676			rel_inode += block_offset << sbi->l2niperblk;
677		else {
678			pageno += 1;
679			rel_inode -= inodes_left;
680		}
681	}
682	/* read the page of disk inode */
683      retry:
684	mp = read_metapage(ipimap, pageno << sbi->l2nbperpage, PSIZE, 1);
685	if (mp == 0)
686		return (EIO);
687
688	/* get the pointer to the disk inode */
689	dp = (struct dinode *) mp->data;
690	dp += rel_inode;
691
692	dioffset = (ino & (INOSPERPAGE - 1)) << L2DISIZE;
693
694	/*
695	 * acquire transaction lock on the on-disk inode;
696	 * N.B. tlock is acquired on ipimap not ip;
697	 */
698	if ((ditlck =
699	     txLock(tid, ipimap, mp, tlckINODE | tlckENTRY)) == NULL)
700		goto retry;
701	dilinelock = (struct linelock *) & ditlck->lock;
702
703	/*
704	 * copy btree root from in-memory inode to on-disk inode
705	 *
706	 * (tlock is taken from inline B+-tree root in in-memory
707	 * inode when the B+-tree root is updated, which is pointed
708	 * by jfs_ip->blid as well as being on tx tlock list)
709	 *
710	 * further processing of btree root is based on the copy
711	 * in in-memory inode, where txLog() will log from, and,
712	 * for xtree root, txUpdateMap() will update map and reset
713	 * XAD_NEW bit;
714	 */
715
716	if (S_ISDIR(ip->i_mode) && (lid = jfs_ip->xtlid)) {
717		/*
718		 * This is the special xtree inside the directory for storing
719		 * the directory table
720		 */
721		xtpage_t *p, *xp;
722		xad_t *xad;
723
724		jfs_ip->xtlid = 0;
725		tlck = lid_to_tlock(lid);
726		assert(tlck->type & tlckXTREE);
727		tlck->type |= tlckBTROOT;
728		tlck->mp = mp;
729		ilinelock = (struct linelock *) & tlck->lock;
730
731		/*
732		 * copy xtree root from inode to dinode:
733		 */
734		p = &jfs_ip->i_xtroot;
735		xp = (xtpage_t *) &dp->di_dirtable;
736		lv = ilinelock->lv;
737		for (n = 0; n < ilinelock->index; n++, lv++) {
738			memcpy(&xp->xad[lv->offset], &p->xad[lv->offset],
739			       lv->length << L2XTSLOTSIZE);
740		}
741
742		/* reset on-disk (metadata page) xtree XAD_NEW bit */
743		xad = &xp->xad[XTENTRYSTART];
744		for (n = XTENTRYSTART;
745		     n < le16_to_cpu(xp->header.nextindex); n++, xad++)
746			if (xad->flag & (XAD_NEW | XAD_EXTENDED))
747				xad->flag &= ~(XAD_NEW | XAD_EXTENDED);
748	}
749
750	if ((lid = jfs_ip->blid) == 0)
751		goto inlineData;
752	jfs_ip->blid = 0;
753
754	tlck = lid_to_tlock(lid);
755	type = tlck->type;
756	tlck->type |= tlckBTROOT;
757	tlck->mp = mp;
758	ilinelock = (struct linelock *) & tlck->lock;
759
760	/*
761	 *      regular file: 16 byte (XAD slot) granularity
762	 */
763	if (type & tlckXTREE) {
764		xtpage_t *p, *xp;
765		xad_t *xad;
766
767		/*
768		 * copy xtree root from inode to dinode:
769		 */
770		p = &jfs_ip->i_xtroot;
771		xp = &dp->di_xtroot;
772		lv = ilinelock->lv;
773		for (n = 0; n < ilinelock->index; n++, lv++) {
774			memcpy(&xp->xad[lv->offset], &p->xad[lv->offset],
775			       lv->length << L2XTSLOTSIZE);
776		}
777
778		/* reset on-disk (metadata page) xtree XAD_NEW bit */
779		xad = &xp->xad[XTENTRYSTART];
780		for (n = XTENTRYSTART;
781		     n < le16_to_cpu(xp->header.nextindex); n++, xad++)
782			if (xad->flag & (XAD_NEW | XAD_EXTENDED))
783				xad->flag &= ~(XAD_NEW | XAD_EXTENDED);
784	}
785	/*
786	 *      directory: 32 byte (directory entry slot) granularity
787	 */
788	else if (type & tlckDTREE) {
789		dtpage_t *p, *xp;
790
791		/*
792		 * copy dtree root from inode to dinode:
793		 */
794		p = (dtpage_t *) &jfs_ip->i_dtroot;
795		xp = (dtpage_t *) & dp->di_dtroot;
796		lv = ilinelock->lv;
797		for (n = 0; n < ilinelock->index; n++, lv++) {
798			memcpy(&xp->slot[lv->offset], &p->slot[lv->offset],
799			       lv->length << L2DTSLOTSIZE);
800		}
801	} else {
802		jERROR(1, ("diWrite: UFO tlock\n"));
803	}
804
805      inlineData:
806	/*
807	 * copy inline symlink from in-memory inode to on-disk inode
808	 */
809	if (S_ISLNK(ip->i_mode) && ip->i_size < IDATASIZE) {
810		lv = & dilinelock->lv[dilinelock->index];
811		lv->offset = (dioffset + 2 * 128) >> L2INODESLOTSIZE;
812		lv->length = 2;
813		memcpy(&dp->di_fastsymlink, jfs_ip->i_inline, IDATASIZE);
814		dilinelock->index++;
815	}
816	/*
817	 * copy inline data from in-memory inode to on-disk inode:
818	 * 128 byte slot granularity
819	 */
820	if (test_cflag(COMMIT_Inlineea, ip)) {
821		lv = & dilinelock->lv[dilinelock->index];
822		lv->offset = (dioffset + 3 * 128) >> L2INODESLOTSIZE;
823		lv->length = 1;
824		memcpy(&dp->di_inlineea, jfs_ip->i_inline_ea, INODESLOTSIZE);
825		dilinelock->index++;
826
827		clear_cflag(COMMIT_Inlineea, ip);
828	}
829
830	/*
831	 *      lock/copy inode base: 128 byte slot granularity
832	 */
833// baseDinode:
834	lv = & dilinelock->lv[dilinelock->index];
835	lv->offset = dioffset >> L2INODESLOTSIZE;
836	copy_to_dinode(dp, ip);
837	if (test_and_clear_cflag(COMMIT_Dirtable, ip)) {
838		lv->length = 2;
839		memcpy(&dp->di_dirtable, &jfs_ip->i_dirtable, 96);
840	} else
841		lv->length = 1;
842	dilinelock->index++;
843
844#ifdef _JFS_FASTDASD
845	/*
846	 * We aren't logging changes to the DASD used in directory inodes,
847	 * but we need to write them to disk.  If we don't unmount cleanly,
848	 * mount will recalculate the DASD used.
849	 */
850	if (S_ISDIR(ip->i_mode)
851	    && (ip->i_ipmnt->i_mntflag & JFS_DASD_ENABLED))
852		bcopy(&ip->i_DASD, &dp->di_DASD, sizeof(struct dasd));
853#endif				/*  _JFS_FASTDASD */
854
855	/* release the buffer holding the updated on-disk inode.
856	 * the buffer will be later written by commit processing.
857	 */
858	write_metapage(mp);
859
860	return (rc);
861}
862
863
864/*
865 * NAME:        diFree(ip)
866 *
867 * FUNCTION:    free a specified inode from the inode working map
868 *		for a fileset or aggregate.
869 *
870 *		if the inode to be freed represents the first (only)
871 *		free inode within the iag, the iag will be placed on
872 *		the ag free inode list.
873 *
874 *		freeing the inode will cause the inode extent to be
875 *		freed if the inode is the only allocated inode within
876 *		the extent.  in this case all the disk resource backing
877 *		up the inode extent will be freed. in addition, the iag
878 *		will be placed on the ag extent free list if the extent
879 *		is the first free extent in the iag.  if freeing the
880 *		extent also means that no free inodes will exist for
881 *		the iag, the iag will also be removed from the ag free
882 *		inode list.
883 *
884 *		the iag describing the inode will be freed if the extent
885 *		is to be freed and it is the only backed extent within
886 *		the iag.  in this case, the iag will be removed from the
887 *		ag free extent list and ag free inode list and placed on
888 *		the inode map's free iag list.
889 *
890 *		a careful update approach is used to provide consistency
891 *		in the face of updates to multiple buffers.  under this
892 *		approach, all required buffers are obtained before making
893 *		any updates and are held until all updates are complete.
894 *
895 * PARAMETERS:
896 *      ip  	- inode to be freed.
897 *
898 * RETURN VALUES:
899 *      0       - success
900 *      EIO  	- i/o error.
901 */
902int diFree(struct inode *ip)
903{
904	int rc;
905	ino_t inum = ip->i_ino;
906	struct iag *iagp, *aiagp, *biagp, *ciagp, *diagp;
907	struct metapage *mp, *amp, *bmp, *cmp, *dmp;
908	int iagno, ino, extno, bitno, sword, agno;
909	int back, fwd;
910	u32 bitmap, mask;
911	struct inode *ipimap = JFS_SBI(ip->i_sb)->ipimap;
912	struct inomap *imap = JFS_IP(ipimap)->i_imap;
913	pxd_t freepxd;
914	tid_t tid;
915	struct inode *iplist[3];
916	struct tlock *tlck;
917	struct pxd_lock *pxdlock;
918
919	/*
920	 * This is just to suppress compiler warnings.  The same logic that
921	 * references these variables is used to initialize them.
922	 */
923	aiagp = biagp = ciagp = diagp = NULL;
924
925	/* get the iag number containing the inode.
926	 */
927	iagno = INOTOIAG(inum);
928
929	/* make sure that the iag is contained within
930	 * the map.
931	 */
932	//assert(iagno < imap->im_nextiag);
933	if (iagno >= imap->im_nextiag) {
934		jERROR(1, ("diFree: inum = %d, iagno = %d, nextiag = %d\n",
935			   (uint) inum, iagno, imap->im_nextiag));
936		dump_mem("imap", imap, 32);
937		updateSuper(ip->i_sb, FM_DIRTY);
938		return EIO;
939	}
940
941	/* get the allocation group for this ino.
942	 */
943	agno = JFS_IP(ip)->agno;
944
945	/* Lock the AG specific inode map information
946	 */
947	AG_LOCK(imap, agno);
948
949	/* Obtain read lock in imap inode.  Don't release it until we have
950	 * read all of the IAG's that we are going to.
951	 */
952	IREAD_LOCK(ipimap);
953
954	/* read the iag.
955	 */
956	if ((rc = diIAGRead(imap, iagno, &mp))) {
957		IREAD_UNLOCK(ipimap);
958		AG_UNLOCK(imap, agno);
959		return (rc);
960	}
961	iagp = (struct iag *) mp->data;
962
963	/* get the inode number and extent number of the inode within
964	 * the iag and the inode number within the extent.
965	 */
966	ino = inum & (INOSPERIAG - 1);
967	extno = ino >> L2INOSPEREXT;
968	bitno = ino & (INOSPEREXT - 1);
969	mask = HIGHORDER >> bitno;
970
971	assert(le32_to_cpu(iagp->wmap[extno]) & mask);
972#ifdef _STILL_TO_PORT
973	assert((le32_to_cpu(iagp->pmap[extno]) & mask) == 0);
974#endif				/*  _STILL_TO_PORT */
975	assert(addressPXD(&iagp->inoext[extno]));
976
977	/* compute the bitmap for the extent reflecting the freed inode.
978	 */
979	bitmap = le32_to_cpu(iagp->wmap[extno]) & ~mask;
980
981	if (imap->im_agctl[agno].numfree > imap->im_agctl[agno].numinos) {
982		jERROR(1,("diFree: numfree > numinos\n"));
983		release_metapage(mp);
984		IREAD_UNLOCK(ipimap);
985		AG_UNLOCK(imap, agno);
986		updateSuper(ip->i_sb, FM_DIRTY);
987		return EIO;
988	}
989	/*
990	 *      inode extent still has some inodes or below low water mark:
991	 *      keep the inode extent;
992	 */
993	if (bitmap ||
994	    imap->im_agctl[agno].numfree < 96 ||
995	    (imap->im_agctl[agno].numfree < 288 &&
996	     (((imap->im_agctl[agno].numfree * 100) /
997	       imap->im_agctl[agno].numinos) <= 25))) {
998		/* if the iag currently has no free inodes (i.e.,
999		 * the inode being freed is the first free inode of iag),
1000		 * insert the iag at head of the inode free list for the ag.
1001		 */
1002		if (iagp->nfreeinos == 0) {
1003			/* check if there are any iags on the ag inode
1004			 * free list.  if so, read the first one so that
1005			 * we can link the current iag onto the list at
1006			 * the head.
1007			 */
1008			if ((fwd = imap->im_agctl[agno].inofree) >= 0) {
1009				/* read the iag that currently is the head
1010				 * of the list.
1011				 */
1012				if ((rc = diIAGRead(imap, fwd, &amp))) {
1013					IREAD_UNLOCK(ipimap);
1014					AG_UNLOCK(imap, agno);
1015					release_metapage(mp);
1016					return (rc);
1017				}
1018				aiagp = (struct iag *) amp->data;
1019
1020				/* make current head point back to the iag.
1021				 */
1022				aiagp->inofreeback = cpu_to_le32(iagno);
1023
1024				write_metapage(amp);
1025			}
1026
1027			/* iag points forward to current head and iag
1028			 * becomes the new head of the list.
1029			 */
1030			iagp->inofreefwd =
1031			    cpu_to_le32(imap->im_agctl[agno].inofree);
1032			iagp->inofreeback = -1;
1033			imap->im_agctl[agno].inofree = iagno;
1034		}
1035		IREAD_UNLOCK(ipimap);
1036
1037		/* update the free inode summary map for the extent if
1038		 * freeing the inode means the extent will now have free
1039		 * inodes (i.e., the inode being freed is the first free
1040		 * inode of extent),
1041		 */
1042		if (iagp->wmap[extno] == ONES) {
1043			sword = extno >> L2EXTSPERSUM;
1044			bitno = extno & (EXTSPERSUM - 1);
1045			iagp->inosmap[sword] &=
1046			    cpu_to_le32(~(HIGHORDER >> bitno));
1047		}
1048
1049		/* update the bitmap.
1050		 */
1051		iagp->wmap[extno] = cpu_to_le32(bitmap);
1052		DBG_DIFREE(imap, inum);
1053
1054		/* update the free inode counts at the iag, ag and
1055		 * map level.
1056		 */
1057		iagp->nfreeinos =
1058		    cpu_to_le32(le32_to_cpu(iagp->nfreeinos) + 1);
1059		imap->im_agctl[agno].numfree += 1;
1060		atomic_inc(&imap->im_numfree);
1061
1062		/* release the AG inode map lock
1063		 */
1064		AG_UNLOCK(imap, agno);
1065
1066		/* write the iag */
1067		write_metapage(mp);
1068
1069		return (0);
1070	}
1071
1072
1073	/*
1074	 *      inode extent has become free and above low water mark:
1075	 *      free the inode extent;
1076	 */
1077
1078	/*
1079	 *      prepare to update iag list(s) (careful update step 1)
1080	 */
1081	amp = bmp = cmp = dmp = NULL;
1082	fwd = back = -1;
1083
1084	/* check if the iag currently has no free extents.  if so,
1085	 * it will be placed on the head of the ag extent free list.
1086	 */
1087	if (iagp->nfreeexts == 0) {
1088		/* check if the ag extent free list has any iags.
1089		 * if so, read the iag at the head of the list now.
1090		 * this (head) iag will be updated later to reflect
1091		 * the addition of the current iag at the head of
1092		 * the list.
1093		 */
1094		if ((fwd = imap->im_agctl[agno].extfree) >= 0) {
1095			if ((rc = diIAGRead(imap, fwd, &amp)))
1096				goto error_out;
1097			aiagp = (struct iag *) amp->data;
1098		}
1099	} else {
1100		/* iag has free extents. check if the addition of a free
1101		 * extent will cause all extents to be free within this
1102		 * iag.  if so, the iag will be removed from the ag extent
1103		 * free list and placed on the inode map's free iag list.
1104		 */
1105		if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG - 1)) {
1106			/* in preparation for removing the iag from the
1107			 * ag extent free list, read the iags preceeding
1108			 * and following the iag on the ag extent free
1109			 * list.
1110			 */
1111			if ((fwd = le32_to_cpu(iagp->extfreefwd)) >= 0) {
1112				if ((rc = diIAGRead(imap, fwd, &amp)))
1113					goto error_out;
1114				aiagp = (struct iag *) amp->data;
1115			}
1116
1117			if ((back = le32_to_cpu(iagp->extfreeback)) >= 0) {
1118				if ((rc = diIAGRead(imap, back, &bmp)))
1119					goto error_out;
1120				biagp = (struct iag *) bmp->data;
1121			}
1122		}
1123	}
1124
1125	/* remove the iag from the ag inode free list if freeing
1126	 * this extent cause the iag to have no free inodes.
1127	 */
1128	if (iagp->nfreeinos == cpu_to_le32(INOSPEREXT - 1)) {
1129		int inofreeback = le32_to_cpu(iagp->inofreeback);
1130		int inofreefwd = le32_to_cpu(iagp->inofreefwd);
1131
1132		/* in preparation for removing the iag from the
1133		 * ag inode free list, read the iags preceeding
1134		 * and following the iag on the ag inode free
1135		 * list.  before reading these iags, we must make
1136		 * sure that we already don't have them in hand
1137		 * from up above, since re-reading an iag (buffer)
1138		 * we are currently holding would cause a deadlock.
1139		 */
1140		if (inofreefwd >= 0) {
1141
1142			if (inofreefwd == fwd)
1143				ciagp = (struct iag *) amp->data;
1144			else if (inofreefwd == back)
1145				ciagp = (struct iag *) bmp->data;
1146			else {
1147				if ((rc =
1148				     diIAGRead(imap, inofreefwd, &cmp)))
1149					goto error_out;
1150				assert(cmp != NULL);
1151				ciagp = (struct iag *) cmp->data;
1152			}
1153			assert(ciagp != NULL);
1154		}
1155
1156		if (inofreeback >= 0) {
1157			if (inofreeback == fwd)
1158				diagp = (struct iag *) amp->data;
1159			else if (inofreeback == back)
1160				diagp = (struct iag *) bmp->data;
1161			else {
1162				if ((rc =
1163				     diIAGRead(imap, inofreeback, &dmp)))
1164					goto error_out;
1165				assert(dmp != NULL);
1166				diagp = (struct iag *) dmp->data;
1167			}
1168			assert(diagp != NULL);
1169		}
1170	}
1171
1172	IREAD_UNLOCK(ipimap);
1173
1174	/*
1175	 * invalidate any page of the inode extent freed from buffer cache;
1176	 */
1177	freepxd = iagp->inoext[extno];
1178	invalidate_pxd_metapages(ip->i_sb->s_bdev->bd_inode, freepxd);
1179
1180	/*
1181	 *      update iag list(s) (careful update step 2)
1182	 */
1183	/* add the iag to the ag extent free list if this is the
1184	 * first free extent for the iag.
1185	 */
1186	if (iagp->nfreeexts == 0) {
1187		if (fwd >= 0)
1188			aiagp->extfreeback = cpu_to_le32(iagno);
1189
1190		iagp->extfreefwd =
1191		    cpu_to_le32(imap->im_agctl[agno].extfree);
1192		iagp->extfreeback = -1;
1193		imap->im_agctl[agno].extfree = iagno;
1194	} else {
1195		/* remove the iag from the ag extent list if all extents
1196		 * are now free and place it on the inode map iag free list.
1197		 */
1198		if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG - 1)) {
1199			if (fwd >= 0)
1200				aiagp->extfreeback = iagp->extfreeback;
1201
1202			if (back >= 0)
1203				biagp->extfreefwd = iagp->extfreefwd;
1204			else
1205				imap->im_agctl[agno].extfree =
1206				    le32_to_cpu(iagp->extfreefwd);
1207
1208			iagp->extfreefwd = iagp->extfreeback = -1;
1209
1210			IAGFREE_LOCK(imap);
1211			iagp->iagfree = cpu_to_le32(imap->im_freeiag);
1212			imap->im_freeiag = iagno;
1213			IAGFREE_UNLOCK(imap);
1214		}
1215	}
1216
1217	/* remove the iag from the ag inode free list if freeing
1218	 * this extent causes the iag to have no free inodes.
1219	 */
1220	if (iagp->nfreeinos == cpu_to_le32(INOSPEREXT - 1)) {
1221		if ((int) le32_to_cpu(iagp->inofreefwd) >= 0)
1222			ciagp->inofreeback = iagp->inofreeback;
1223
1224		if ((int) le32_to_cpu(iagp->inofreeback) >= 0)
1225			diagp->inofreefwd = iagp->inofreefwd;
1226		else
1227			imap->im_agctl[agno].inofree =
1228			    le32_to_cpu(iagp->inofreefwd);
1229
1230		iagp->inofreefwd = iagp->inofreeback = -1;
1231	}
1232
1233	/* update the inode extent address and working map
1234	 * to reflect the free extent.
1235	 * the permanent map should have been updated already
1236	 * for the inode being freed.
1237	 */
1238	assert(iagp->pmap[extno] == 0);
1239	iagp->wmap[extno] = 0;
1240	DBG_DIFREE(imap, inum);
1241	PXDlength(&iagp->inoext[extno], 0);
1242	PXDaddress(&iagp->inoext[extno], 0);
1243
1244	/* update the free extent and free inode summary maps
1245	 * to reflect the freed extent.
1246	 * the inode summary map is marked to indicate no inodes
1247	 * available for the freed extent.
1248	 */
1249	sword = extno >> L2EXTSPERSUM;
1250	bitno = extno & (EXTSPERSUM - 1);
1251	mask = HIGHORDER >> bitno;
1252	iagp->inosmap[sword] |= cpu_to_le32(mask);
1253	iagp->extsmap[sword] &= cpu_to_le32(~mask);
1254
1255	/* update the number of free inodes and number of free extents
1256	 * for the iag.
1257	 */
1258	iagp->nfreeinos = cpu_to_le32(le32_to_cpu(iagp->nfreeinos) -
1259				      (INOSPEREXT - 1));
1260	iagp->nfreeexts = cpu_to_le32(le32_to_cpu(iagp->nfreeexts) + 1);
1261
1262	/* update the number of free inodes and backed inodes
1263	 * at the ag and inode map level.
1264	 */
1265	imap->im_agctl[agno].numfree -= (INOSPEREXT - 1);
1266	imap->im_agctl[agno].numinos -= INOSPEREXT;
1267	atomic_sub(INOSPEREXT - 1, &imap->im_numfree);
1268	atomic_sub(INOSPEREXT, &imap->im_numinos);
1269
1270	if (amp)
1271		write_metapage(amp);
1272	if (bmp)
1273		write_metapage(bmp);
1274	if (cmp)
1275		write_metapage(cmp);
1276	if (dmp)
1277		write_metapage(dmp);
1278
1279	/*
1280	 * start transaction to update block allocation map
1281	 * for the inode extent freed;
1282	 *
1283	 * N.B. AG_LOCK is released and iag will be released below, and
1284	 * other thread may allocate inode from/reusing the ixad freed
1285	 * BUT with new/different backing inode extent from the extent
1286	 * to be freed by the transaction;
1287	 */
1288	tid = txBegin(ipimap->i_sb, COMMIT_FORCE);
1289
1290	/* acquire tlock of the iag page of the freed ixad
1291	 * to force the page NOHOMEOK (even though no data is
1292	 * logged from the iag page) until NOREDOPAGE|FREEXTENT log
1293	 * for the free of the extent is committed;
1294	 * write FREEXTENT|NOREDOPAGE log record
1295	 * N.B. linelock is overlaid as freed extent descriptor;
1296	 */
1297	tlck = txLock(tid, ipimap, mp, tlckINODE | tlckFREE);
1298	pxdlock = (struct pxd_lock *) & tlck->lock;
1299	pxdlock->flag = mlckFREEPXD;
1300	pxdlock->pxd = freepxd;
1301	pxdlock->index = 1;
1302
1303	write_metapage(mp);
1304
1305	iplist[0] = ipimap;
1306
1307	/*
1308	 * logredo needs the IAG number and IAG extent index in order
1309	 * to ensure that the IMap is consistent.  The least disruptive
1310	 * way to pass these values through  to the transaction manager
1311	 * is in the iplist array.
1312	 *
1313	 * It's not pretty, but it works.
1314	 */
1315	iplist[1] = (struct inode *) (size_t)iagno;
1316	iplist[2] = (struct inode *) (size_t)extno;
1317
1318	rc = txCommit(tid, 1, &iplist[0], COMMIT_FORCE);	// D233382
1319
1320	txEnd(tid);
1321
1322	/* unlock the AG inode map information */
1323	AG_UNLOCK(imap, agno);
1324
1325	return (0);
1326
1327      error_out:
1328	IREAD_UNLOCK(ipimap);
1329
1330	if (amp)
1331		release_metapage(amp);
1332	if (bmp)
1333		release_metapage(bmp);
1334	if (cmp)
1335		release_metapage(cmp);
1336	if (dmp)
1337		release_metapage(dmp);
1338
1339	AG_UNLOCK(imap, agno);
1340
1341	release_metapage(mp);
1342
1343	return (rc);
1344}
1345
1346/*
1347 * There are several places in the diAlloc* routines where we initialize
1348 * the inode.
1349 */
1350static inline void
1351diInitInode(struct inode *ip, int iagno, int ino, int extno, struct iag * iagp)
1352{
1353	struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
1354	struct jfs_inode_info *jfs_ip = JFS_IP(ip);
1355
1356	ip->i_ino = (iagno << L2INOSPERIAG) + ino;
1357	DBG_DIALLOC(JFS_IP(ipimap)->i_imap, ip->i_ino);
1358	jfs_ip->ixpxd = iagp->inoext[extno];
1359	jfs_ip->agno = BLKTOAG(le64_to_cpu(iagp->agstart), sbi);
1360	jfs_ip->active_ag = -1;
1361}
1362
1363
1364/*
1365 * NAME:        diAlloc(pip,dir,ip)
1366 *
1367 * FUNCTION:    allocate a disk inode from the inode working map
1368 *		for a fileset or aggregate.
1369 *
1370 * PARAMETERS:
1371 *      pip  	- pointer to incore inode for the parent inode.
1372 *      dir  	- TRUE if the new disk inode is for a directory.
1373 *      ip  	- pointer to a new inode
1374 *
1375 * RETURN VALUES:
1376 *      0       - success.
1377 *      ENOSPC 	- insufficient disk resources.
1378 *      EIO  	- i/o error.
1379 */
1380int diAlloc(struct inode *pip, boolean_t dir, struct inode *ip)
1381{
1382	int rc, ino, iagno, addext, extno, bitno, sword;
1383	int nwords, rem, i, agno;
1384	u32 mask, inosmap, extsmap;
1385	struct inode *ipimap;
1386	struct metapage *mp;
1387	ino_t inum;
1388	struct iag *iagp;
1389	struct inomap *imap;
1390
1391	/* get the pointers to the inode map inode and the
1392	 * corresponding imap control structure.
1393	 */
1394	ipimap = JFS_SBI(pip->i_sb)->ipimap;
1395	imap = JFS_IP(ipimap)->i_imap;
1396	JFS_IP(ip)->ipimap = ipimap;
1397	JFS_IP(ip)->fileset = FILESYSTEM_I;
1398
1399	/* for a directory, the allocation policy is to start
1400	 * at the ag level using the preferred ag.
1401	 */
1402	if (dir == TRUE) {
1403		agno = dbNextAG(JFS_SBI(pip->i_sb)->ipbmap);
1404		AG_LOCK(imap, agno);
1405		goto tryag;
1406	}
1407
1408	/* for files, the policy starts off by trying to allocate from
1409	 * the same iag containing the parent disk inode:
1410	 * try to allocate the new disk inode close to the parent disk
1411	 * inode, using parent disk inode number + 1 as the allocation
1412	 * hint.  (we use a left-to-right policy to attempt to avoid
1413	 * moving backward on the disk.)  compute the hint within the
1414	 * file system and the iag.
1415	 */
1416
1417	/* get the ag number of this iag */
1418	agno = JFS_IP(pip)->agno;
1419
1420	if (atomic_read(&JFS_SBI(pip->i_sb)->bmap->db_active[agno])) {
1421		/*
1422		 * There is an open file actively growing.  We want to
1423		 * allocate new inodes from a different ag to avoid
1424		 * fragmentation problems.
1425		 */
1426		agno = dbNextAG(JFS_SBI(pip->i_sb)->ipbmap);
1427		AG_LOCK(imap, agno);
1428		goto tryag;
1429	}
1430
1431	inum = pip->i_ino + 1;
1432	ino = inum & (INOSPERIAG - 1);
1433
1434	/* back off the the hint if it is outside of the iag */
1435	if (ino == 0)
1436		inum = pip->i_ino;
1437
1438	/* lock the AG inode map information */
1439	AG_LOCK(imap, agno);
1440
1441	/* Get read lock on imap inode */
1442	IREAD_LOCK(ipimap);
1443
1444	/* get the iag number and read the iag */
1445	iagno = INOTOIAG(inum);
1446	if ((rc = diIAGRead(imap, iagno, &mp))) {
1447		IREAD_UNLOCK(ipimap);
1448		return (rc);
1449	}
1450	iagp = (struct iag *) mp->data;
1451
1452	/* determine if new inode extent is allowed to be added to the iag.
1453	 * new inode extent can be added to the iag if the ag
1454	 * has less than 32 free disk inodes and the iag has free extents.
1455	 */
1456	addext = (imap->im_agctl[agno].numfree < 32 && iagp->nfreeexts);
1457
1458	/*
1459	 *      try to allocate from the IAG
1460	 */
1461	/* check if the inode may be allocated from the iag
1462	 * (i.e. the inode has free inodes or new extent can be added).
1463	 */
1464	if (iagp->nfreeinos || addext) {
1465		/* determine the extent number of the hint.
1466		 */
1467		extno = ino >> L2INOSPEREXT;
1468
1469		/* check if the extent containing the hint has backed
1470		 * inodes.  if so, try to allocate within this extent.
1471		 */
1472		if (addressPXD(&iagp->inoext[extno])) {
1473			bitno = ino & (INOSPEREXT - 1);
1474			if ((bitno =
1475			     diFindFree(le32_to_cpu(iagp->wmap[extno]),
1476					bitno))
1477			    < INOSPEREXT) {
1478				ino = (extno << L2INOSPEREXT) + bitno;
1479
1480				/* a free inode (bit) was found within this
1481				 * extent, so allocate it.
1482				 */
1483				rc = diAllocBit(imap, iagp, ino);
1484				IREAD_UNLOCK(ipimap);
1485				if (rc) {
1486					assert(rc == EIO);
1487				} else {
1488					/* set the results of the allocation
1489					 * and write the iag.
1490					 */
1491					diInitInode(ip, iagno, ino, extno,
1492						    iagp);
1493					mark_metapage_dirty(mp);
1494				}
1495				release_metapage(mp);
1496
1497				/* free the AG lock and return.
1498				 */
1499				AG_UNLOCK(imap, agno);
1500				return (rc);
1501			}
1502
1503			if (!addext)
1504				extno =
1505				    (extno ==
1506				     EXTSPERIAG - 1) ? 0 : extno + 1;
1507		}
1508
1509		/*
1510		 * no free inodes within the extent containing the hint.
1511		 *
1512		 * try to allocate from the backed extents following
1513		 * hint or, if appropriate (i.e. addext is true), allocate
1514		 * an extent of free inodes at or following the extent
1515		 * containing the hint.
1516		 *
1517		 * the free inode and free extent summary maps are used
1518		 * here, so determine the starting summary map position
1519		 * and the number of words we'll have to examine.  again,
1520		 * the approach is to allocate following the hint, so we
1521		 * might have to initially ignore prior bits of the summary
1522		 * map that represent extents prior to the extent containing
1523		 * the hint and later revisit these bits.
1524		 */
1525		bitno = extno & (EXTSPERSUM - 1);
1526		nwords = (bitno == 0) ? SMAPSZ : SMAPSZ + 1;
1527		sword = extno >> L2EXTSPERSUM;
1528
1529		/* mask any prior bits for the starting words of the
1530		 * summary map.
1531		 */
1532		mask = ONES << (EXTSPERSUM - bitno);
1533		inosmap = le32_to_cpu(iagp->inosmap[sword]) | mask;
1534		extsmap = le32_to_cpu(iagp->extsmap[sword]) | mask;
1535
1536		/* scan the free inode and free extent summary maps for
1537		 * free resources.
1538		 */
1539		for (i = 0; i < nwords; i++) {
1540			/* check if this word of the free inode summary
1541			 * map describes an extent with free inodes.
1542			 */
1543			if (~inosmap) {
1544				/* an extent with free inodes has been
1545				 * found. determine the extent number
1546				 * and the inode number within the extent.
1547				 */
1548				rem = diFindFree(inosmap, 0);
1549				extno = (sword << L2EXTSPERSUM) + rem;
1550				rem =
1551				    diFindFree(le32_to_cpu
1552					       (iagp->wmap[extno]), 0);
1553				assert(rem < INOSPEREXT);
1554
1555				/* determine the inode number within the
1556				 * iag and allocate the inode from the
1557				 * map.
1558				 */
1559				ino = (extno << L2INOSPEREXT) + rem;
1560				rc = diAllocBit(imap, iagp, ino);
1561				IREAD_UNLOCK(ipimap);
1562				if (rc) {
1563					assert(rc == EIO);
1564				} else {
1565					/* set the results of the allocation
1566					 * and write the iag.
1567					 */
1568					diInitInode(ip, iagno, ino, extno,
1569						    iagp);
1570					mark_metapage_dirty(mp);
1571				}
1572				release_metapage(mp);
1573
1574				/* free the AG lock and return.
1575				 */
1576				AG_UNLOCK(imap, agno);
1577				return (rc);
1578
1579			}
1580
1581			/* check if we may allocate an extent of free
1582			 * inodes and whether this word of the free
1583			 * extents summary map describes a free extent.
1584			 */
1585			if (addext && ~extsmap) {
1586				/* a free extent has been found.  determine
1587				 * the extent number.
1588				 */
1589				rem = diFindFree(extsmap, 0);
1590				extno = (sword << L2EXTSPERSUM) + rem;
1591
1592				/* allocate an extent of free inodes.
1593				 */
1594				if ((rc = diNewExt(imap, iagp, extno))) {
1595					/* if there is no disk space for a
1596					 * new extent, try to allocate the
1597					 * disk inode from somewhere else.
1598					 */
1599					if (rc == ENOSPC)
1600						break;
1601
1602					assert(rc == EIO);
1603				} else {
1604					/* set the results of the allocation
1605					 * and write the iag.
1606					 */
1607					diInitInode(ip, iagno,
1608						    extno << L2INOSPEREXT,
1609						    extno, iagp);
1610					mark_metapage_dirty(mp);
1611				}
1612				release_metapage(mp);
1613				/* free the imap inode & the AG lock & return.
1614				 */
1615				IREAD_UNLOCK(ipimap);
1616				AG_UNLOCK(imap, agno);
1617				return (rc);
1618			}
1619
1620			/* move on to the next set of summary map words.
1621			 */
1622			sword = (sword == SMAPSZ - 1) ? 0 : sword + 1;
1623			inosmap = le32_to_cpu(iagp->inosmap[sword]);
1624			extsmap = le32_to_cpu(iagp->extsmap[sword]);
1625		}
1626	}
1627	/* unlock imap inode */
1628	IREAD_UNLOCK(ipimap);
1629
1630	/* nothing doing in this iag, so release it. */
1631	release_metapage(mp);
1632
1633      tryag:
1634	/*
1635	 * try to allocate anywhere within the same AG as the parent inode.
1636	 */
1637	rc = diAllocAG(imap, agno, dir, ip);
1638
1639	AG_UNLOCK(imap, agno);
1640
1641	if (rc != ENOSPC)
1642		return (rc);
1643
1644	/*
1645	 * try to allocate in any AG.
1646	 */
1647	return (diAllocAny(imap, agno, dir, ip));
1648}
1649
1650
1651/*
1652 * NAME:        diAllocAG(imap,agno,dir,ip)
1653 *
1654 * FUNCTION:    allocate a disk inode from the allocation group.
1655 *
1656 *		this routine first determines if a new extent of free
1657 *		inodes should be added for the allocation group, with
1658 *		the current request satisfied from this extent. if this
1659 *		is the case, an attempt will be made to do just that.  if
1660 *		this attempt fails or it has been determined that a new
1661 *		extent should not be added, an attempt is made to satisfy
1662 *		the request by allocating an existing (backed) free inode
1663 *		from the allocation group.
1664 *
1665 * PRE CONDITION: Already have the AG lock for this AG.
1666 *
1667 * PARAMETERS:
1668 *      imap  	- pointer to inode map control structure.
1669 *      agno  	- allocation group to allocate from.
1670 *      dir  	- TRUE if the new disk inode is for a directory.
1671 *      ip  	- pointer to the new inode to be filled in on successful return
1672 *		  with the disk inode number allocated, its extent address
1673 *		  and the start of the ag.
1674 *
1675 * RETURN VALUES:
1676 *      0       - success.
1677 *      ENOSPC 	- insufficient disk resources.
1678 *      EIO  	- i/o error.
1679 */
1680static int
1681diAllocAG(struct inomap * imap, int agno, boolean_t dir, struct inode *ip)
1682{
1683	int rc, addext, numfree, numinos;
1684
1685	/* get the number of free and the number of backed disk
1686	 * inodes currently within the ag.
1687	 */
1688	numfree = imap->im_agctl[agno].numfree;
1689	numinos = imap->im_agctl[agno].numinos;
1690
1691	if (numfree > numinos) {
1692		jERROR(1,("diAllocAG: numfree > numinos\n"));
1693		updateSuper(ip->i_sb, FM_DIRTY);
1694		return EIO;
1695	}
1696
1697	/* determine if we should allocate a new extent of free inodes
1698	 * within the ag: for directory inodes, add a new extent
1699	 * if there are a small number of free inodes or number of free
1700	 * inodes is a small percentage of the number of backed inodes.
1701	 */
1702	if (dir == TRUE)
1703		addext = (numfree < 64 ||
1704			  (numfree < 256
1705			   && ((numfree * 100) / numinos) <= 20));
1706	else
1707		addext = (numfree == 0);
1708
1709	/*
1710	 * try to allocate a new extent of free inodes.
1711	 */
1712	if (addext) {
1713		/* if free space is not avaliable for this new extent, try
1714		 * below to allocate a free and existing (already backed)
1715		 * inode from the ag.
1716		 */
1717		if ((rc = diAllocExt(imap, agno, ip)) != ENOSPC)
1718			return (rc);
1719	}
1720
1721	/*
1722	 * try to allocate an existing free inode from the ag.
1723	 */
1724	return (diAllocIno(imap, agno, ip));
1725}
1726
1727
1728/*
1729 * NAME:        diAllocAny(imap,agno,dir,iap)
1730 *
1731 * FUNCTION:    allocate a disk inode from any other allocation group.
1732 *
 *		this routine is called when an allocation attempt within
 *		the primary allocation group has failed. it attempts to
 *		allocate an inode from any allocation group other than the
 *		specified primary group.
1737 *
1738 * PARAMETERS:
1739 *      imap  	- pointer to inode map control structure.
1740 *      agno  	- primary allocation group (to avoid).
1741 *      dir  	- TRUE if the new disk inode is for a directory.
1742 *      ip  	- pointer to a new inode to be filled in on successful return
1743 *		  with the disk inode number allocated, its extent address
1744 *		  and the start of the ag.
1745 *
1746 * RETURN VALUES:
1747 *      0       - success.
1748 *      ENOSPC 	- insufficient disk resources.
1749 *      EIO  	- i/o error.
1750 */
1751static int
1752diAllocAny(struct inomap * imap, int agno, boolean_t dir, struct inode *ip)
1753{
1754	int ag, rc;
1755	int maxag = JFS_SBI(imap->im_ipimap->i_sb)->bmap->db_maxag;
1756
1757
1758	/* try to allocate from the ags following agno up to
1759	 * the maximum ag number.
1760	 */
1761	for (ag = agno + 1; ag <= maxag; ag++) {
1762		AG_LOCK(imap, ag);
1763
1764		rc = diAllocAG(imap, ag, dir, ip);
1765
1766		AG_UNLOCK(imap, ag);
1767
1768		if (rc != ENOSPC)
1769			return (rc);
1770	}
1771
1772	/* try to allocate from the ags in front of agno.
1773	 */
1774	for (ag = 0; ag < agno; ag++) {
1775		AG_LOCK(imap, ag);
1776
1777		rc = diAllocAG(imap, ag, dir, ip);
1778
1779		AG_UNLOCK(imap, ag);
1780
1781		if (rc != ENOSPC)
1782			return (rc);
1783	}
1784
1785	/* no free disk inodes.
1786	 */
1787	return (ENOSPC);
1788}
1789
1790
1791/*
1792 * NAME:        diAllocIno(imap,agno,ip)
1793 *
1794 * FUNCTION:    allocate a disk inode from the allocation group's free
1795 *		inode list, returning an error if this free list is
1796 *		empty (i.e. no iags on the list).
1797 *
1798 *		allocation occurs from the first iag on the list using
1799 *		the iag's free inode summary map to find the leftmost
1800 *		free inode in the iag.
1801 *
1802 * PRE CONDITION: Already have AG lock for this AG.
1803 *
1804 * PARAMETERS:
1805 *      imap  	- pointer to inode map control structure.
1806 *      agno  	- allocation group.
1807 *      ip  	- pointer to new inode to be filled in on successful return
1808 *		  with the disk inode number allocated, its extent address
1809 *		  and the start of the ag.
1810 *
1811 * RETURN VALUES:
1812 *      0       - success.
1813 *      ENOSPC 	- insufficient disk resources.
1814 *      EIO  	- i/o error.
1815 */
1816static int diAllocIno(struct inomap * imap, int agno, struct inode *ip)
1817{
1818	int iagno, ino, rc, rem, extno, sword;
1819	struct metapage *mp;
1820	struct iag *iagp;
1821
1822	/* check if there are iags on the ag's free inode list.
1823	 */
1824	if ((iagno = imap->im_agctl[agno].inofree) < 0)
1825		return (ENOSPC);
1826
1827	/* obtain read lock on imap inode */
1828	IREAD_LOCK(imap->im_ipimap);
1829
1830	/* read the iag at the head of the list.
1831	 */
1832	if ((rc = diIAGRead(imap, iagno, &mp))) {
1833		IREAD_UNLOCK(imap->im_ipimap);
1834		return (rc);
1835	}
1836	iagp = (struct iag *) mp->data;
1837
1838	/* better be free inodes in this iag if it is on the
1839	 * list.
1840	 */
1841	//assert(iagp->nfreeinos);
1842	if (!iagp->nfreeinos) {
1843		jERROR(1,
1844		       ("diAllocIno: nfreeinos = 0, but iag on freelist\n"));
1845		jERROR(1, ("  agno = %d, iagno = %d\n", agno, iagno));
1846		dump_mem("iag", iagp, 64);
1847		updateSuper(ip->i_sb, FM_DIRTY);
1848		return EIO;
1849	}
1850
1851	/* scan the free inode summary map to find an extent
1852	 * with free inodes.
1853	 */
1854	for (sword = 0;; sword++) {
1855		assert(sword < SMAPSZ);
1856
1857		if (~iagp->inosmap[sword])
1858			break;
1859	}
1860
1861	/* found a extent with free inodes. determine
1862	 * the extent number.
1863	 */
1864	rem = diFindFree(le32_to_cpu(iagp->inosmap[sword]), 0);
1865	assert(rem < EXTSPERSUM);
1866	extno = (sword << L2EXTSPERSUM) + rem;
1867
1868	/* find the first free inode in the extent.
1869	 */
1870	rem = diFindFree(le32_to_cpu(iagp->wmap[extno]), 0);
1871	assert(rem < INOSPEREXT);
1872
1873	/* compute the inode number within the iag.
1874	 */
1875	ino = (extno << L2INOSPEREXT) + rem;
1876
1877	/* allocate the inode.
1878	 */
1879	rc = diAllocBit(imap, iagp, ino);
1880	IREAD_UNLOCK(imap->im_ipimap);
1881	if (rc) {
1882		release_metapage(mp);
1883		return (rc);
1884	}
1885
1886	/* set the results of the allocation and write the iag.
1887	 */
1888	diInitInode(ip, iagno, ino, extno, iagp);
1889	write_metapage(mp);
1890
1891	return (0);
1892}
1893
1894
1895/*
1896 * NAME:        diAllocExt(imap,agno,ip)
1897 *
1898 * FUNCTION:   	add a new extent of free inodes to an iag, allocating
1899 *	       	an inode from this extent to satisfy the current allocation
1900 *	       	request.
1901 *
1902 *		this routine first tries to find an existing iag with free
1903 *		extents through the ag free extent list.  if list is not
1904 *		empty, the head of the list will be selected as the home
1905 *		of the new extent of free inodes.  otherwise (the list is
1906 *		empty), a new iag will be allocated for the ag to contain
1907 *		the extent.
1908 *
1909 *		once an iag has been selected, the free extent summary map
1910 *		is used to locate a free extent within the iag and diNewExt()
1911 *		is called to initialize the extent, with initialization
1912 *		including the allocation of the first inode of the extent
1913 *		for the purpose of satisfying this request.
1914 *
1915 * PARAMETERS:
1916 *      imap  	- pointer to inode map control structure.
1917 *      agno  	- allocation group number.
1918 *      ip  	- pointer to new inode to be filled in on successful return
1919 *		  with the disk inode number allocated, its extent address
1920 *		  and the start of the ag.
1921 *
1922 * RETURN VALUES:
1923 *      0       - success.
1924 *      ENOSPC 	- insufficient disk resources.
1925 *      EIO  	- i/o error.
1926 */
1927static int diAllocExt(struct inomap * imap, int agno, struct inode *ip)
1928{
1929	int rem, iagno, sword, extno, rc;
1930	struct metapage *mp;
1931	struct iag *iagp;
1932
1933	/* check if the ag has any iags with free extents.  if not,
1934	 * allocate a new iag for the ag.
1935	 */
1936	if ((iagno = imap->im_agctl[agno].extfree) < 0) {
1937		/* If successful, diNewIAG will obtain the read lock on the
1938		 * imap inode.
1939		 */
1940		if ((rc = diNewIAG(imap, &iagno, agno, &mp))) {
1941			return (rc);
1942		}
1943		iagp = (struct iag *) mp->data;
1944
1945		/* set the ag number if this a brand new iag
1946		 */
1947		iagp->agstart =
1948		    cpu_to_le64(AGTOBLK(agno, imap->im_ipimap));
1949	} else {
1950		/* read the iag.
1951		 */
1952		IREAD_LOCK(imap->im_ipimap);
1953		if ((rc = diIAGRead(imap, iagno, &mp))) {
1954			assert(0);
1955		}
1956		iagp = (struct iag *) mp->data;
1957	}
1958
1959	/* using the free extent summary map, find a free extent.
1960	 */
1961	for (sword = 0;; sword++) {
1962		assert(sword < SMAPSZ);
1963		if (~iagp->extsmap[sword])
1964			break;
1965	}
1966
1967	/* determine the extent number of the free extent.
1968	 */
1969	rem = diFindFree(le32_to_cpu(iagp->extsmap[sword]), 0);
1970	assert(rem < EXTSPERSUM);
1971	extno = (sword << L2EXTSPERSUM) + rem;
1972
1973	/* initialize the new extent.
1974	 */
1975	rc = diNewExt(imap, iagp, extno);
1976	IREAD_UNLOCK(imap->im_ipimap);
1977	if (rc) {
1978		/* something bad happened.  if a new iag was allocated,
1979		 * place it back on the inode map's iag free list, and
1980		 * clear the ag number information.
1981		 */
1982		if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) {
1983			IAGFREE_LOCK(imap);
1984			iagp->iagfree = cpu_to_le32(imap->im_freeiag);
1985			imap->im_freeiag = iagno;
1986			IAGFREE_UNLOCK(imap);
1987		}
1988		write_metapage(mp);
1989		return (rc);
1990	}
1991
1992	/* set the results of the allocation and write the iag.
1993	 */
1994	diInitInode(ip, iagno, extno << L2INOSPEREXT, extno, iagp);
1995
1996	write_metapage(mp);
1997
1998	return (0);
1999}
2000
2001
2002/*
2003 * NAME:        diAllocBit(imap,iagp,ino)
2004 *
2005 * FUNCTION:   	allocate a backed inode from an iag.
2006 *
2007 *		this routine performs the mechanics of allocating a
2008 *		specified inode from a backed extent.
2009 *
2010 *		if the inode to be allocated represents the last free
2011 *		inode within the iag, the iag will be removed from the
2012 *		ag free inode list.
2013 *
 *		a careful update approach is used to provide consistency
 *		in the face of updates to multiple buffers.  under this
 *		approach, all required buffers are obtained before making
 *		any updates and are held until all updates are complete.
2018 *
2019 * PRE CONDITION: Already have buffer lock on iagp.  Already have AG lock on
2020 *	this AG.  Must have read lock on imap inode.
2021 *
2022 * PARAMETERS:
2023 *      imap  	- pointer to inode map control structure.
2024 *      iagp  	- pointer to iag.
2025 *      ino   	- inode number to be allocated within the iag.
2026 *
2027 * RETURN VALUES:
2028 *      0       - success.
2029 *      ENOSPC 	- insufficient disk resources.
2030 *      EIO  	- i/o error.
2031 */
2032static int diAllocBit(struct inomap * imap, struct iag * iagp, int ino)
2033{
2034	int extno, bitno, agno, sword, rc;
2035	struct metapage *amp, *bmp;
2036	struct iag *aiagp = 0, *biagp = 0;
2037	u32 mask;
2038
2039	/* check if this is the last free inode within the iag.
2040	 * if so, it will have to be removed from the ag free
2041	 * inode list, so get the iags preceeding and following
2042	 * it on the list.
2043	 */
2044	if (iagp->nfreeinos == cpu_to_le32(1)) {
2045		amp = bmp = NULL;
2046
2047		if ((int) le32_to_cpu(iagp->inofreefwd) >= 0) {
2048			if ((rc =
2049			     diIAGRead(imap, le32_to_cpu(iagp->inofreefwd),
2050				       &amp)))
2051				return (rc);
2052			aiagp = (struct iag *) amp->data;
2053		}
2054
2055		if ((int) le32_to_cpu(iagp->inofreeback) >= 0) {
2056			if ((rc =
2057			     diIAGRead(imap,
2058				       le32_to_cpu(iagp->inofreeback),
2059				       &bmp))) {
2060				if (amp)
2061					release_metapage(amp);
2062				return (rc);
2063			}
2064			biagp = (struct iag *) bmp->data;
2065		}
2066	}
2067
2068	/* get the ag number, extent number, inode number within
2069	 * the extent.
2070	 */
2071	agno = BLKTOAG(le64_to_cpu(iagp->agstart), JFS_SBI(imap->im_ipimap->i_sb));
2072	extno = ino >> L2INOSPEREXT;
2073	bitno = ino & (INOSPEREXT - 1);
2074
2075	/* compute the mask for setting the map.
2076	 */
2077	mask = HIGHORDER >> bitno;
2078
2079	/* the inode should be free and backed.
2080	 */
2081	assert((le32_to_cpu(iagp->pmap[extno]) & mask) == 0);
2082	assert((le32_to_cpu(iagp->wmap[extno]) & mask) == 0);
2083	assert(addressPXD(&iagp->inoext[extno]) != 0);
2084
2085	/* mark the inode as allocated in the working map.
2086	 */
2087	iagp->wmap[extno] |= cpu_to_le32(mask);
2088
2089	/* check if all inodes within the extent are now
2090	 * allocated.  if so, update the free inode summary
2091	 * map to reflect this.
2092	 */
2093	if (iagp->wmap[extno] == ONES) {
2094		sword = extno >> L2EXTSPERSUM;
2095		bitno = extno & (EXTSPERSUM - 1);
2096		iagp->inosmap[sword] |= cpu_to_le32(HIGHORDER >> bitno);
2097	}
2098
2099	/* if this was the last free inode in the iag, remove the
2100	 * iag from the ag free inode list.
2101	 */
2102	if (iagp->nfreeinos == cpu_to_le32(1)) {
2103		if (amp) {
2104			aiagp->inofreeback = iagp->inofreeback;
2105			write_metapage(amp);
2106		}
2107
2108		if (bmp) {
2109			biagp->inofreefwd = iagp->inofreefwd;
2110			write_metapage(bmp);
2111		} else {
2112			imap->im_agctl[agno].inofree =
2113			    le32_to_cpu(iagp->inofreefwd);
2114		}
2115		iagp->inofreefwd = iagp->inofreeback = -1;
2116	}
2117
2118	/* update the free inode count at the iag, ag, inode
2119	 * map levels.
2120	 */
2121	iagp->nfreeinos = cpu_to_le32(le32_to_cpu(iagp->nfreeinos) - 1);
2122	imap->im_agctl[agno].numfree -= 1;
2123	atomic_dec(&imap->im_numfree);
2124
2125	return (0);
2126}
2127
2128
/*
 * NAME:        diNewExt(imap,iagp,extno)
 *
 * FUNCTION:    initialize a new extent of inodes for an iag, allocating
 *	        the first inode of the extent for use for the current
 *	        allocation request.
 *
 *		disk resources are allocated for the new extent of inodes
 *		and the inodes themselves are initialized to reflect their
 *		existence within the extent (i.e. their inode numbers and
 *		inode extent addresses are set) and their initial state
 *		(mode and link count are set to zero).
 *
 *		if the iag is new, it is not yet on an ag extent free list
 *		but will now be placed on this list.
 *
 *		if the allocation of the new extent causes the iag to
 *		have no free extent, the iag will be removed from the
 *		ag extent free list.
 *
 *		if the iag has no free backed inodes, it will be placed
 *		on the ag free inode list, since the addition of the new
 *		extent will now cause it to have free inodes.
 *
 *		a careful update approach is used to provide consistency
 *		(i.e. list consistency) in the face of updates to multiple
 *		buffers.  under this approach, all required buffers are
 *		obtained before making any updates and are held until all
 *		updates are complete.
 *
 * PRE CONDITION: Already have buffer lock on iagp.  Already have AG lock on
 *	this AG.  Must have read lock on imap inode.
 *
 * PARAMETERS:
 *      imap  	- pointer to inode map control structure.
 *      iagp  	- pointer to iag.
 *      extno  	- extent number (index into the iag's inoext[], wmap[]
 *		  and pmap[] arrays).
 *
 * RETURN VALUES:
 *      0       - success.
 *      ENOSPC 	- insufficient disk resources.
 *      EIO  	- i/o error.
 *
 *	non-zero values returned by diIAGRead() and dbAlloc() are passed
 *	back to the caller unchanged.
 */
static int diNewExt(struct inomap * imap, struct iag * iagp, int extno)
{
	int agno, iagno, fwd, back, freei = 0, sword, rc;
	struct iag *aiagp = 0, *biagp = 0, *ciagp = 0;
	struct metapage *amp, *bmp, *cmp, *dmp;
	struct inode *ipimap;
	s64 blkno, hint;
	int i, j;
	u32 mask;
	ino_t ino;
	struct dinode *dp;
	struct jfs_sb_info *sbi;

	/* better have free extents.
	 */
	assert(iagp->nfreeexts);

	/* get the inode map inode.
	 */
	ipimap = imap->im_ipimap;
	sbi = JFS_SBI(ipimap->i_sb);

	/* amp/bmp/cmp hold the neighbouring iag buffers; they start out
	 * NULL so that error_out can release exactly the ones obtained.
	 */
	amp = bmp = cmp = NULL;

	/* get the ag and iag numbers for this iag.
	 */
	agno = BLKTOAG(le64_to_cpu(iagp->agstart), sbi);
	iagno = le32_to_cpu(iagp->iagnum);

	/* check if this is the last free extent within the
	 * iag.  if so, the iag must be removed from the ag
	 * free extent list, so get the iags preceding and
	 * following the iag on this list.
	 */
	if (iagp->nfreeexts == cpu_to_le32(1)) {
		if ((fwd = le32_to_cpu(iagp->extfreefwd)) >= 0) {
			/* nothing is held yet, so a plain return is safe */
			if ((rc = diIAGRead(imap, fwd, &amp)))
				return (rc);
			aiagp = (struct iag *) amp->data;
		}

		if ((back = le32_to_cpu(iagp->extfreeback)) >= 0) {
			if ((rc = diIAGRead(imap, back, &bmp)))
				goto error_out;
			biagp = (struct iag *) bmp->data;
		}
	} else {
		/* the iag has free extents.  if all extents are free
		 * (as is the case for a newly allocated iag), the iag
		 * must be added to the ag free extent list, so get
		 * the iag at the head of the list in preparation for
		 * adding this iag to this list.
		 */
		fwd = back = -1;
		if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) {
			if ((fwd = imap->im_agctl[agno].extfree) >= 0) {
				if ((rc = diIAGRead(imap, fwd, &amp)))
					goto error_out;
				aiagp = (struct iag *) amp->data;
			}
		}
	}

	/* check if the iag has no free inodes.  if so, the iag
	 * will have to be added to the ag free inode list, so get
	 * the iag at the head of the list in preparation for
	 * adding this iag to this list.  in doing this, we must
	 * check if we already have the iag at the head of
	 * the list in hand.
	 */
	if (iagp->nfreeinos == 0) {
		freei = imap->im_agctl[agno].inofree;

		if (freei >= 0) {
			if (freei == fwd) {
				/* reuse the buffer already held in amp */
				ciagp = aiagp;
			} else if (freei == back) {
				/* reuse the buffer already held in bmp */
				ciagp = biagp;
			} else {
				if ((rc = diIAGRead(imap, freei, &cmp)))
					goto error_out;
				ciagp = (struct iag *) cmp->data;
			}
			assert(ciagp != NULL);
		}
	}

	/* allocate disk space for the inode extent.
	 *
	 * the allocation hint is the last block of the previous extent
	 * of this iag when that extent is backed; otherwise (first
	 * extent, or previous slot has a zero address) it is the block
	 * immediately before the start of the ag.
	 */
	if ((extno == 0) || (addressPXD(&iagp->inoext[extno - 1]) == 0))
		hint = ((s64) agno << sbi->bmap->db_agl2size) - 1;
	else
		hint = addressPXD(&iagp->inoext[extno - 1]) +
		    lengthPXD(&iagp->inoext[extno - 1]) - 1;

	if ((rc = dbAlloc(ipimap, hint, (s64) imap->im_nbperiext, &blkno)))
		goto error_out;

	/* compute the inode number of the first inode within the
	 * extent.
	 */
	ino = (iagno << L2INOSPERIAG) + (extno << L2INOSPEREXT);

	/* initialize the inodes within the newly allocated extent a
	 * page at a time.
	 */
	for (i = 0; i < imap->im_nbperiext; i += sbi->nbperpage) {
		/* get a buffer for this page of disk inodes.
		 *
		 * NOTE(review): on failure the blocks obtained from
		 * dbAlloc() above are not handed back (error_out only
		 * releases the iag buffers) -- confirm whether that is
		 * intentional.
		 */
		dmp = get_metapage(ipimap, blkno + i, PSIZE, 1);
		if (dmp == NULL) {
			rc = EIO;
			goto error_out;
		}
		dp = (struct dinode *) dmp->data;

		/* initialize the inode number, mode, link count and
		 * inode extent address.  every inode of the extent
		 * records the address and length of the whole extent;
		 * ino keeps counting across pages.
		 */
		for (j = 0; j < INOSPERPAGE; j++, dp++, ino++) {
			dp->di_inostamp = cpu_to_le32(sbi->inostamp);
			dp->di_number = cpu_to_le32(ino);
			dp->di_fileset = cpu_to_le32(FILESYSTEM_I);
			dp->di_mode = 0;
			dp->di_nlink = 0;
			PXDaddress(&(dp->di_ixpxd), blkno);
			PXDlength(&(dp->di_ixpxd), imap->im_nbperiext);
		}
		write_metapage(dmp);
	}

	/* from here on nothing can fail: all buffers are in hand, so
	 * the list and map updates below are applied together.
	 */

	/* if this is the last free extent within the iag, remove the
	 * iag from the ag free extent list.
	 */
	if (iagp->nfreeexts == cpu_to_le32(1)) {
		if (fwd >= 0)
			aiagp->extfreeback = iagp->extfreeback;

		if (back >= 0)
			biagp->extfreefwd = iagp->extfreefwd;
		else
			/* iag was the list head: advance the ag anchor */
			imap->im_agctl[agno].extfree =
			    le32_to_cpu(iagp->extfreefwd);

		iagp->extfreefwd = iagp->extfreeback = -1;
	} else {
		/* if the iag has all free extents (newly allocated iag),
		 * add the iag to the ag free extent list.
		 */
		if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) {
			if (fwd >= 0)
				aiagp->extfreeback = cpu_to_le32(iagno);

			/* insert at the head of the list */
			iagp->extfreefwd = cpu_to_le32(fwd);
			iagp->extfreeback = -1;
			imap->im_agctl[agno].extfree = iagno;
		}
	}

	/* if the iag has no free inodes, add the iag to the
	 * ag free inode list.
	 */
	if (iagp->nfreeinos == 0) {
		if (freei >= 0)
			ciagp->inofreeback = cpu_to_le32(iagno);

		/* insert at the head of the list */
		iagp->inofreefwd =
		    cpu_to_le32(imap->im_agctl[agno].inofree);
		iagp->inofreeback = -1;
		imap->im_agctl[agno].inofree = iagno;
	}

	/* initialize the extent descriptor of the extent. */
	PXDlength(&iagp->inoext[extno], imap->im_nbperiext);
	PXDaddress(&iagp->inoext[extno], blkno);

	/* initialize the working and persistent map of the extent.
	 * the working map will be initialized such that
	 * it indicates the first inode of the extent is allocated.
	 */
	iagp->wmap[extno] = cpu_to_le32(HIGHORDER);
	iagp->pmap[extno] = 0;

	/* update the free inode and free extent summary maps
	 * for the extent to indicate the extent has free inodes
	 * and no longer represents a free extent.
	 */
	sword = extno >> L2EXTSPERSUM;
	mask = HIGHORDER >> (extno & (EXTSPERSUM - 1));
	iagp->extsmap[sword] |= cpu_to_le32(mask);
	iagp->inosmap[sword] &= cpu_to_le32(~mask);

	/* update the free inode and free extent counts for the
	 * iag.  the first inode of the extent is handed out to the
	 * caller, hence INOSPEREXT - 1 newly free inodes.
	 */
	iagp->nfreeinos = cpu_to_le32(le32_to_cpu(iagp->nfreeinos) +
				      (INOSPEREXT - 1));
	iagp->nfreeexts = cpu_to_le32(le32_to_cpu(iagp->nfreeexts) - 1);

	/* update the free and backed inode counts for the ag.
	 */
	imap->im_agctl[agno].numfree += (INOSPEREXT - 1);
	imap->im_agctl[agno].numinos += INOSPEREXT;

	/* update the free and backed inode counts for the inode map.
	 */
	atomic_add(INOSPEREXT - 1, &imap->im_numfree);
	atomic_add(INOSPEREXT, &imap->im_numinos);

	/* write the iags.
	 */
	if (amp)
		write_metapage(amp);
	if (bmp)
		write_metapage(bmp);
	if (cmp)
		write_metapage(cmp);

	return (0);

      error_out:

	/* release the iags.  no list or map update has been made when
	 * this label is reached, so the buffers are dropped unwritten.
	 */
	if (amp)
		release_metapage(amp);
	if (bmp)
		release_metapage(bmp);
	if (cmp)
		release_metapage(cmp);

	return (rc);
}
2405
2406
2407/*
2408 * NAME:        diNewIAG(imap,iagnop,agno)
2409 *
2410 * FUNCTION:   	allocate a new iag for an allocation group.
2411 *
2412 *		first tries to allocate the iag from the inode map
2413 *		iagfree list:
2414 *		if the list has free iags, the head of the list is removed
2415 *		and returned to satisfy the request.
2416 *		if the inode map's iag free list is empty, the inode map
2417 *		is extended to hold a new iag. this new iag is initialized
2418 *		and returned to satisfy the request.
2419 *
2420 * PARAMETERS:
2421 *      imap  	- pointer to inode map control structure.
2422 *      iagnop 	- pointer to an iag number set with the number of the
2423 *		  newly allocated iag upon successful return.
2424 *      agno  	- allocation group number.
2425 *	bpp	- Buffer pointer to be filled in with new IAG's buffer
2426 *
2427 * RETURN VALUES:
2428 *      0       - success.
2429 *      ENOSPC 	- insufficient disk resources.
2430 *      EIO  	- i/o error.
2431 *
2432 * serialization:
2433 *	AG lock held on entry/exit;
2434 *	write lock on the map is held inside;
2435 *	read lock on the map is held on successful completion;
2436 *
2437 * note: new iag transaction:
2438 * . synchronously write iag;
2439 * . write log of xtree and inode  of imap;
2440 * . commit;
2441 * . synchronous write of xtree (right to left, bottom to top);
2442 * . at start of logredo(): init in-memory imap with one additional iag page;
2443 * . at end of logredo(): re-read imap inode to determine
2444 *   new imap size;
2445 */
2446static int
2447diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp)
2448{
2449	int rc;
2450	int iagno, i, xlen;
2451	struct inode *ipimap;
2452	struct super_block *sb;
2453	struct jfs_sb_info *sbi;
2454	struct metapage *mp;
2455	struct iag *iagp;
2456	s64 xaddr = 0;
2457	s64 blkno;
2458	tid_t tid;
2459#ifdef _STILL_TO_PORT
2460	xad_t xad;
2461#endif				/*  _STILL_TO_PORT */
2462	struct inode *iplist[1];
2463
2464	/* pick up pointers to the inode map and mount inodes */
2465	ipimap = imap->im_ipimap;
2466	sb = ipimap->i_sb;
2467	sbi = JFS_SBI(sb);
2468
2469	/* acquire the free iag lock */
2470	IAGFREE_LOCK(imap);
2471
2472	/* if there are any iags on the inode map free iag list,
2473	 * allocate the iag from the head of the list.
2474	 */
2475	if (imap->im_freeiag >= 0) {
2476		/* pick up the iag number at the head of the list */
2477		iagno = imap->im_freeiag;
2478
2479		/* determine the logical block number of the iag */
2480		blkno = IAGTOLBLK(iagno, sbi->l2nbperpage);
2481	} else {
2482		/* no free iags. the inode map will have to be extented
2483		 * to include a new iag.
2484		 */
2485
2486		/* acquire inode map lock */
2487		IWRITE_LOCK(ipimap);
2488
2489		assert(ipimap->i_size >> L2PSIZE == imap->im_nextiag + 1);
2490
2491		/* get the next avaliable iag number */
2492		iagno = imap->im_nextiag;
2493
2494		/* make sure that we have not exceeded the maximum inode
2495		 * number limit.
2496		 */
2497		if (iagno > (MAXIAGS - 1)) {
2498			/* release the inode map lock */
2499			IWRITE_UNLOCK(ipimap);
2500
2501			rc = ENOSPC;
2502			goto out;
2503		}
2504
2505		/*
2506		 * synchronously append new iag page.
2507		 */
2508		/* determine the logical address of iag page to append */
2509		blkno = IAGTOLBLK(iagno, sbi->l2nbperpage);
2510
2511		/* Allocate extent for new iag page */
2512		xlen = sbi->nbperpage;
2513		if ((rc = dbAlloc(ipimap, 0, (s64) xlen, &xaddr))) {
2514			/* release the inode map lock */
2515			IWRITE_UNLOCK(ipimap);
2516
2517			goto out;
2518		}
2519
2520		/* assign a buffer for the page */
2521		mp = get_metapage(ipimap, xaddr, PSIZE, 1);
2522		//bp = bmAssign(ipimap, blkno, xaddr, PSIZE, bmREAD_PAGE);
2523		if (!mp) {
2524			/* Free the blocks allocated for the iag since it was
2525			 * not successfully added to the inode map
2526			 */
2527			dbFree(ipimap, xaddr, (s64) xlen);
2528
2529			/* release the inode map lock */
2530			IWRITE_UNLOCK(ipimap);
2531
2532			rc = EIO;
2533			goto out;
2534		}
2535		iagp = (struct iag *) mp->data;
2536
2537		/* init the iag */
2538		memset(iagp, 0, sizeof(struct iag));
2539		iagp->iagnum = cpu_to_le32(iagno);
2540		iagp->inofreefwd = iagp->inofreeback = -1;
2541		iagp->extfreefwd = iagp->extfreeback = -1;
2542		iagp->iagfree = -1;
2543		iagp->nfreeinos = 0;
2544		iagp->nfreeexts = cpu_to_le32(EXTSPERIAG);
2545
2546		/* initialize the free inode summary map (free extent
2547		 * summary map initialization handled by bzero).
2548		 */
2549		for (i = 0; i < SMAPSZ; i++)
2550			iagp->inosmap[i] = ONES;
2551
2552		flush_metapage(mp);
2553#ifdef _STILL_TO_PORT
2554		/* synchronously write the iag page */
2555		if (bmWrite(bp)) {
2556			/* Free the blocks allocated for the iag since it was
2557			 * not successfully added to the inode map
2558			 */
2559			dbFree(ipimap, xaddr, (s64) xlen);
2560
2561			/* release the inode map lock */
2562			IWRITE_UNLOCK(ipimap);
2563
2564			rc = EIO;
2565			goto out;
2566		}
2567
2568		/* Now the iag is on disk */
2569
2570		/*
2571		 * start tyransaction of update of the inode map
2572		 * addressing structure pointing to the new iag page;
2573		 */
2574#endif				/*  _STILL_TO_PORT */
2575		tid = txBegin(sb, COMMIT_FORCE);
2576
2577		/* update the inode map addressing structure to point to it */
2578		if ((rc =
2579		     xtInsert(tid, ipimap, 0, blkno, xlen, &xaddr, 0))) {
2580			/* Free the blocks allocated for the iag since it was
2581			 * not successfully added to the inode map
2582			 */
2583			dbFree(ipimap, xaddr, (s64) xlen);
2584
2585			/* release the inode map lock */
2586			IWRITE_UNLOCK(ipimap);
2587
2588			goto out;
2589		}
2590
2591		/* update the inode map's inode to reflect the extension */
2592		ipimap->i_size += PSIZE;
2593		ipimap->i_blocks += LBLK2PBLK(sb, xlen);
2594
2595		/*
2596		 * txCommit(COMMIT_FORCE) will synchronously write address
2597		 * index pages and inode after commit in careful update order
2598		 * of address index pages (right to left, bottom up);
2599		 */
2600		iplist[0] = ipimap;
2601		rc = txCommit(tid, 1, &iplist[0], COMMIT_FORCE);
2602
2603		txEnd(tid);
2604
2605		duplicateIXtree(sb, blkno, xlen, &xaddr);
2606
2607		/* update the next avaliable iag number */
2608		imap->im_nextiag += 1;
2609
2610		/* Add the iag to the iag free list so we don't lose the iag
2611		 * if a failure happens now.
2612		 */
2613		imap->im_freeiag = iagno;
2614
2615		/* Until we have logredo working, we want the imap inode &
2616		 * control page to be up to date.
2617		 */
2618		diSync(ipimap);
2619
2620		/* release the inode map lock */
2621		IWRITE_UNLOCK(ipimap);
2622	}
2623
2624	/* obtain read lock on map */
2625	IREAD_LOCK(ipimap);
2626
2627	/* read the iag */
2628	if ((rc = diIAGRead(imap, iagno, &mp))) {
2629		IREAD_UNLOCK(ipimap);
2630		rc = EIO;
2631		goto out;
2632	}
2633	iagp = (struct iag *) mp->data;
2634
2635	/* remove the iag from the iag free list */
2636	imap->im_freeiag = le32_to_cpu(iagp->iagfree);
2637	iagp->iagfree = -1;
2638
2639	/* set the return iag number and buffer pointer */
2640	*iagnop = iagno;
2641	*mpp = mp;
2642
2643      out:
2644	/* release the iag free lock */
2645	IAGFREE_UNLOCK(imap);
2646
2647	return (rc);
2648}
2649
2650/*
2651 * NAME:        diIAGRead()
2652 *
2653 * FUNCTION:    get the buffer for the specified iag within a fileset
2654 *		or aggregate inode map.
2655 *
2656 * PARAMETERS:
2657 *      imap  	- pointer to inode map control structure.
2658 *      iagno  	- iag number.
2659 *      bpp  	- point to buffer pointer to be filled in on successful
2660 *		  exit.
2661 *
2662 * SERIALIZATION:
2663 *	must have read lock on imap inode
2664 *	(When called by diExtendFS, the filesystem is quiesced, therefore
2665 *	 the read lock is unnecessary.)
2666 *
2667 * RETURN VALUES:
2668 *      0       - success.
2669 *      EIO  	- i/o error.
2670 */
2671static int diIAGRead(struct inomap * imap, int iagno, struct metapage ** mpp)
2672{
2673	struct inode *ipimap = imap->im_ipimap;
2674	s64 blkno;
2675
2676	/* compute the logical block number of the iag. */
2677	blkno = IAGTOLBLK(iagno, JFS_SBI(ipimap->i_sb)->l2nbperpage);
2678
2679	/* read the iag. */
2680	*mpp = read_metapage(ipimap, blkno, PSIZE, 0);
2681	if (*mpp == NULL) {
2682		return (EIO);
2683	}
2684
2685	return (0);
2686}
2687
2688/*
2689 * NAME:        diFindFree()
2690 *
2691 * FUNCTION:    find the first free bit in a word starting at
2692 *		the specified bit position.
2693 *
2694 * PARAMETERS:
2695 *      word  	- word to be examined.
2696 *      start  	- starting bit position.
2697 *
2698 * RETURN VALUES:
2699 *      bit position of first free bit in the word or 32 if
2700 *	no free bits were found.
2701 */
2702static int diFindFree(u32 word, int start)
2703{
2704	int bitno;
2705	assert(start < 32);
2706	/* scan the word for the first free bit. */
2707	for (word <<= start, bitno = start; bitno < 32;
2708	     bitno++, word <<= 1) {
2709		if ((word & HIGHORDER) == 0)
2710			break;
2711	}
2712	return (bitno);
2713}
2714
/*
 * NAME:	diUpdatePMap()
 *
 * FUNCTION: Update the persistent map in an IAG for the allocation or
 *	freeing of the specified inode, and tie the IAG buffer to the
 *	committing transaction's position in the log sync list.
 *
 * PRE CONDITIONS: Working map has already been updated for allocate.
 *
 * PARAMETERS:
 *	ipimap	- Incore inode map inode
 *	inum	- Number of inode to mark in permanent map
 *	is_free	- If TRUE indicates inode should be marked freed, otherwise
 *		  indicates inode should be marked allocated.
 *	tblk	- transaction block of the transaction making the change;
 *		  supplies the lsn, clsn, superblock (for the log) and the
 *		  synclist position used for the IAG buffer.
 *
 * RETURNS: 0 for success, otherwise the non-zero value returned by
 *	diIAGRead().
 */
int
diUpdatePMap(struct inode *ipimap,
	     unsigned long inum, boolean_t is_free, struct tblock * tblk)
{
	int rc;
	struct iag *iagp;
	struct metapage *mp;
	int iagno, ino, extno, bitno;
	struct inomap *imap;
	u32 mask;
	struct jfs_log *log;
	int lsn, difft, diffp;

	imap = JFS_IP(ipimap)->i_imap;
	/* get the iag number containing the inode */
	iagno = INOTOIAG(inum);
	/* make sure that the iag is contained within the map */
	assert(iagno < imap->im_nextiag);
	/* read the iag; the map read lock is held only across the read */
	IREAD_LOCK(ipimap);
	rc = diIAGRead(imap, iagno, &mp);
	IREAD_UNLOCK(ipimap);
	if (rc)
		return (rc);
	iagp = (struct iag *) mp->data;
	/* get the inode number and extent number of the inode within
	 * the iag and the inode number within the extent.
	 */
	ino = inum & (INOSPERIAG - 1);
	extno = ino >> L2INOSPEREXT;
	bitno = ino & (INOSPEREXT - 1);
	/* single-bit mask for the inode within the extent's map word */
	mask = HIGHORDER >> bitno;
	/*
	 * mark the inode free in persistent map:
	 */
	if (is_free == TRUE) {
		/* The inode should have been allocated both in working
		 * map and in persistent map;
		 * the inode will be freed from working map at the release
		 * of last reference release;
		 *
		 * an inconsistency is reported and the superblock is
		 * marked dirty (FM_DIRTY) rather than asserting; the bit
		 * is cleared in the persistent map either way.
		 */
//              assert(le32_to_cpu(iagp->wmap[extno]) & mask);
		if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) {
			jERROR(1,
			       ("diUpdatePMap: inode %ld not marked as allocated in wmap!\n",
				inum));
			updateSuper(ipimap->i_sb, FM_DIRTY);
		}
//              assert(le32_to_cpu(iagp->pmap[extno]) & mask);
		if (!(le32_to_cpu(iagp->pmap[extno]) & mask)) {
			jERROR(1,
			       ("diUpdatePMap: inode %ld not marked as allocated in pmap!\n",
				inum));
			updateSuper(ipimap->i_sb, FM_DIRTY);
		}
		/* update the bitmap for the extent of the freed inode */
		iagp->pmap[extno] &= cpu_to_le32(~mask);
	}
	/*
	 * mark the inode allocated in persistent map:
	 */
	else {
		/* The inode should be already allocated in the working map
		 * and should be free in persistent map;
		 */
		assert(le32_to_cpu(iagp->wmap[extno]) & mask);
		assert((le32_to_cpu(iagp->pmap[extno]) & mask) == 0);
		/* update the bitmap for the extent of the allocated inode */
		iagp->pmap[extno] |= cpu_to_le32(mask);
	}
	/*
	 * update iag lsn
	 *
	 * (logdiff() presumably yields the distance of an lsn from the
	 * log's sync point, so that lsns can be ordered -- TODO confirm
	 * against its definition.)
	 */
	lsn = tblk->lsn;
	log = JFS_SBI(tblk->sb)->log;
	if (mp->lsn != 0) {
		/* the buffer is already on the logsync list */
		/* inherit older/smaller lsn */
		logdiff(difft, lsn, log);
		logdiff(diffp, mp->lsn, log);
		if (difft < diffp) {
			mp->lsn = lsn;
			/* move mp after tblock in logsync list */
			LOGSYNC_LOCK(log);
			list_del(&mp->synclist);
			list_add(&mp->synclist, &tblk->synclist);
			LOGSYNC_UNLOCK(log);
		}
		/* inherit younger/larger clsn */
		LOGSYNC_LOCK(log);
		assert(mp->clsn);
		logdiff(difft, tblk->clsn, log);
		logdiff(diffp, mp->clsn, log);
		if (difft > diffp)
			mp->clsn = tblk->clsn;
		LOGSYNC_UNLOCK(log);
	} else {
		/* first logged update of this buffer: adopt the
		 * transaction's lsn/clsn and account for it in the log.
		 */
		mp->log = log;
		mp->lsn = lsn;
		/* insert mp after tblock in logsync list */
		LOGSYNC_LOCK(log);
		log->count++;
		list_add(&mp->synclist, &tblk->synclist);
		mp->clsn = tblk->clsn;
		LOGSYNC_UNLOCK(log);
	}
//      bmLazyWrite(mp, log->flag & JFS_COMMIT);
	write_metapage(mp);
	return (0);
}
2840
2841/*
2842 *	diExtendFS()
2843 *
2844 * function: update imap for extendfs();
2845 *
2846 * note: AG size has been increased s.t. each k old contiguous AGs are
2847 * coalesced into a new AG;
2848 */
2849int diExtendFS(struct inode *ipimap, struct inode *ipbmap)
2850{
2851	int rc, rcx = 0;
2852	struct inomap *imap = JFS_IP(ipimap)->i_imap;
2853	struct iag *iagp = 0, *hiagp = 0;
2854	struct bmap *mp = JFS_SBI(ipbmap->i_sb)->bmap;
2855	struct metapage *bp, *hbp;
2856	int i, n, head;
2857	int numinos, xnuminos = 0, xnumfree = 0;
2858	s64 agstart;
2859
2860	jEVENT(0, ("diExtendFS: nextiag:%d numinos:%d numfree:%d\n",
2861		   imap->im_nextiag, atomic_read(&imap->im_numinos),
2862		   atomic_read(&imap->im_numfree)));
2863
2864	/*
2865	 *      reconstruct imap
2866	 *
2867	 * coalesce contiguous k (newAGSize/oldAGSize) AGs;
2868	 * i.e., (AGi, ..., AGj) where i = k*n and j = k*(n+1) - 1 to AGn;
2869	 * note: new AG size = old AG size * (2**x).
2870	 */
2871
2872	/* init per AG control information im_agctl[] */
2873	for (i = 0; i < MAXAG; i++) {
2874		imap->im_agctl[i].inofree = -1;	/* free inode list */
2875		imap->im_agctl[i].extfree = -1;	/* free extent list */
2876		imap->im_agctl[i].numinos = 0;	/* number of backed inodes */
2877		imap->im_agctl[i].numfree = 0;	/* number of free backed inodes */
2878	}
2879
2880	/*
2881	 *      process each iag page of the map.
2882	 *
2883	 * rebuild AG Free Inode List, AG Free Inode Extent List;
2884	 */
2885	for (i = 0; i < imap->im_nextiag; i++) {
2886		if ((rc = diIAGRead(imap, i, &bp))) {
2887			rcx = rc;
2888			continue;
2889		}
2890		iagp = (struct iag *) bp->data;
2891		assert(le32_to_cpu(iagp->iagnum) == i);
2892
2893		/* leave free iag in the free iag list */
2894		if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) {
2895		        release_metapage(bp);
2896			continue;
2897		}
2898
2899		/* agstart that computes to the same ag is treated as same; */
2900		agstart = le64_to_cpu(iagp->agstart);
2901		/* iagp->agstart = agstart & ~(mp->db_agsize - 1); */
2902		n = agstart >> mp->db_agl2size;
2903/*
2904printf("diExtendFS: iag:%d agstart:%Ld agno:%d\n", i, agstart, n);
2905*/
2906
2907		/* compute backed inodes */
2908		numinos = (EXTSPERIAG - le32_to_cpu(iagp->nfreeexts))
2909		    << L2INOSPEREXT;
2910		if (numinos > 0) {
2911			/* merge AG backed inodes */
2912			imap->im_agctl[n].numinos += numinos;
2913			xnuminos += numinos;
2914		}
2915
2916		/* if any backed free inodes, insert at AG free inode list */
2917		if ((int) le32_to_cpu(iagp->nfreeinos) > 0) {
2918			if ((head = imap->im_agctl[n].inofree) == -1)
2919				iagp->inofreefwd = iagp->inofreeback = -1;
2920			else {
2921				if ((rc = diIAGRead(imap, head, &hbp))) {
2922					rcx = rc;
2923					goto nextiag;
2924				}
2925				hiagp = (struct iag *) hbp->data;
2926				hiagp->inofreeback =
2927				    le32_to_cpu(iagp->iagnum);
2928				iagp->inofreefwd = cpu_to_le32(head);
2929				iagp->inofreeback = -1;
2930				write_metapage(hbp);
2931			}
2932
2933			imap->im_agctl[n].inofree =
2934			    le32_to_cpu(iagp->iagnum);
2935
2936			/* merge AG backed free inodes */
2937			imap->im_agctl[n].numfree +=
2938			    le32_to_cpu(iagp->nfreeinos);
2939			xnumfree += le32_to_cpu(iagp->nfreeinos);
2940		}
2941
2942		/* if any free extents, insert at AG free extent list */
2943		if (le32_to_cpu(iagp->nfreeexts) > 0) {
2944			if ((head = imap->im_agctl[n].extfree) == -1)
2945				iagp->extfreefwd = iagp->extfreeback = -1;
2946			else {
2947				if ((rc = diIAGRead(imap, head, &hbp))) {
2948					rcx = rc;
2949					goto nextiag;
2950				}
2951				hiagp = (struct iag *) hbp->data;
2952				hiagp->extfreeback = iagp->iagnum;
2953				iagp->extfreefwd = cpu_to_le32(head);
2954				iagp->extfreeback = -1;
2955				write_metapage(hbp);
2956			}
2957
2958			imap->im_agctl[n].extfree =
2959			    le32_to_cpu(iagp->iagnum);
2960		}
2961
2962	      nextiag:
2963		write_metapage(bp);
2964	}
2965
2966	ASSERT(xnuminos == atomic_read(&imap->im_numinos) &&
2967	       xnumfree == atomic_read(&imap->im_numfree));
2968
2969	return rcx;
2970}
2971
2972
2973/*
2974 *	duplicateIXtree()
2975 *
2976 * serialization: IWRITE_LOCK held on entry/exit
2977 *
2978 * note: shadow page with regular inode (rel.2);
2979 */
2980static void duplicateIXtree(struct super_block *sb, s64 blkno,
2981			    int xlen, s64 *xaddr)
2982{
2983	struct jfs_superblock *j_sb;
2984	struct buffer_head *bh;
2985	struct inode *ip;
2986	tid_t tid;
2987	int rc;
2988
2989	/* if AIT2 ipmap2 is bad, do not try to update it */
2990	if (JFS_SBI(sb)->mntflag & JFS_BAD_SAIT)	/* s_flag */
2991		return;
2992	ip = diReadSpecial(sb, FILESYSTEM_I, 1);
2993	if (ip == NULL) {
2994		JFS_SBI(sb)->mntflag |= JFS_BAD_SAIT;
2995		if ((rc = readSuper(sb, &bh)))
2996			return;
2997		j_sb = (struct jfs_superblock *)bh->b_data;
2998		j_sb->s_flag |= JFS_BAD_SAIT;
2999
3000		mark_buffer_dirty(bh);
3001		ll_rw_block(WRITE, 1, &bh);
3002		wait_on_buffer(bh);
3003		brelse(bh);
3004		return;
3005	}
3006
3007	/* start transaction */
3008	tid = txBegin(sb, COMMIT_FORCE);
3009	/* update the inode map addressing structure to point to it */
3010	if ((rc = xtInsert(tid, ip, 0, blkno, xlen, xaddr, 0))) {
3011		JFS_SBI(sb)->mntflag |= JFS_BAD_SAIT;
3012		txAbort(tid, 1);
3013		goto cleanup;
3014
3015	}
3016	/* update the inode map's inode to reflect the extension */
3017	ip->i_size += PSIZE;
3018	ip->i_blocks += LBLK2PBLK(sb, xlen);
3019	rc = txCommit(tid, 1, &ip, COMMIT_FORCE);
3020      cleanup:
3021	txEnd(tid);
3022	diFreeSpecial(ip);
3023}
3024
3025/*
3026 * NAME:        copy_from_dinode()
3027 *
3028 * FUNCTION:    Copies inode info from disk inode to in-memory inode
3029 *
3030 * RETURN VALUES:
3031 *      0       - success
3032 *      ENOMEM	- insufficient memory
3033 */
3034static int copy_from_dinode(struct dinode * dip, struct inode *ip)
3035{
3036	struct jfs_inode_info *jfs_ip = JFS_IP(ip);
3037
3038	jfs_ip->fileset = le32_to_cpu(dip->di_fileset);
3039	jfs_ip->mode2 = le32_to_cpu(dip->di_mode);
3040
3041	ip->i_mode = le32_to_cpu(dip->di_mode) & 0xffff;
3042	ip->i_nlink = le32_to_cpu(dip->di_nlink);
3043	ip->i_uid = le32_to_cpu(dip->di_uid);
3044	ip->i_gid = le32_to_cpu(dip->di_gid);
3045	ip->i_size = le64_to_cpu(dip->di_size);
3046	ip->i_atime = le32_to_cpu(dip->di_atime.tv_sec);
3047	ip->i_mtime = le32_to_cpu(dip->di_mtime.tv_sec);
3048	ip->i_ctime = le32_to_cpu(dip->di_ctime.tv_sec);
3049	ip->i_blksize = ip->i_sb->s_blocksize;
3050	ip->i_blocks = LBLK2PBLK(ip->i_sb, le64_to_cpu(dip->di_nblocks));
3051	ip->i_generation = le32_to_cpu(dip->di_gen);
3052
3053	jfs_ip->ixpxd = dip->di_ixpxd;	/* in-memory pxd's are little-endian */
3054	jfs_ip->acl = dip->di_acl;	/* as are dxd's */
3055	jfs_ip->ea = dip->di_ea;
3056	jfs_ip->next_index = le32_to_cpu(dip->di_next_index);
3057	jfs_ip->otime = le32_to_cpu(dip->di_otime.tv_sec);
3058	jfs_ip->acltype = le32_to_cpu(dip->di_acltype);
3059
3060	if (S_ISCHR(ip->i_mode) || S_ISBLK(ip->i_mode))
3061		ip->i_rdev = to_kdev_t(le32_to_cpu(dip->di_rdev));
3062
3063	if (S_ISDIR(ip->i_mode)) {
3064		memcpy(&jfs_ip->i_dirtable, &dip->di_dirtable, 384);
3065	} else if (S_ISREG(ip->i_mode) || S_ISLNK(ip->i_mode)) {
3066		memcpy(&jfs_ip->i_xtroot, &dip->di_xtroot, 288);
3067	} else
3068		memcpy(&jfs_ip->i_inline_ea, &dip->di_inlineea, 128);
3069
3070	/* Zero the in-memory-only stuff */
3071	jfs_ip->cflag = 0;
3072	jfs_ip->btindex = 0;
3073	jfs_ip->btorder = 0;
3074	jfs_ip->bxflag = 0;
3075	jfs_ip->blid = 0;
3076	jfs_ip->atlhead = 0;
3077	jfs_ip->atltail = 0;
3078	jfs_ip->xtlid = 0;
3079	return (0);
3080}
3081
3082/*
3083 * NAME:        copy_to_dinode()
3084 *
3085 * FUNCTION:    Copies inode info from in-memory inode to disk inode
3086 */
3087static void copy_to_dinode(struct dinode * dip, struct inode *ip)
3088{
3089	struct jfs_inode_info *jfs_ip = JFS_IP(ip);
3090
3091	dip->di_fileset = cpu_to_le32(jfs_ip->fileset);
3092	dip->di_inostamp = cpu_to_le32(JFS_SBI(ip->i_sb)->inostamp);
3093	dip->di_number = cpu_to_le32(ip->i_ino);
3094	dip->di_gen = cpu_to_le32(ip->i_generation);
3095	dip->di_size = cpu_to_le64(ip->i_size);
3096	dip->di_nblocks = cpu_to_le64(PBLK2LBLK(ip->i_sb, ip->i_blocks));
3097	dip->di_nlink = cpu_to_le32(ip->i_nlink);
3098	dip->di_uid = cpu_to_le32(ip->i_uid);
3099	dip->di_gid = cpu_to_le32(ip->i_gid);
3100	/*
3101	 * mode2 is only needed for storing the higher order bits.
3102	 * Trust i_mode for the lower order ones
3103	 */
3104	dip->di_mode = cpu_to_le32((jfs_ip->mode2 & 0xffff0000) | ip->i_mode);
3105	dip->di_atime.tv_sec = cpu_to_le32(ip->i_atime);
3106	dip->di_atime.tv_nsec = 0;
3107	dip->di_ctime.tv_sec = cpu_to_le32(ip->i_ctime);
3108	dip->di_ctime.tv_nsec = 0;
3109	dip->di_mtime.tv_sec = cpu_to_le32(ip->i_mtime);
3110	dip->di_mtime.tv_nsec = 0;
3111	dip->di_ixpxd = jfs_ip->ixpxd;	/* in-memory pxd's are little-endian */
3112	dip->di_acl = jfs_ip->acl;	/* as are dxd's */
3113	dip->di_ea = jfs_ip->ea;
3114	dip->di_next_index = cpu_to_le32(jfs_ip->next_index);
3115	dip->di_otime.tv_sec = cpu_to_le32(jfs_ip->otime);
3116	dip->di_otime.tv_nsec = 0;
3117	dip->di_acltype = cpu_to_le32(jfs_ip->acltype);
3118
3119	if (S_ISCHR(ip->i_mode) || S_ISBLK(ip->i_mode))
3120		dip->di_rdev = cpu_to_le32(kdev_t_to_nr(ip->i_rdev));
3121}
3122
3123#ifdef	_JFS_DEBUG_IMAP
3124/*
3125 *	DBGdiInit()
3126 */
3127static void *DBGdiInit(struct inomap * imap)
3128{
3129	u32 *dimap;
3130	int size;
3131	size = 64 * 1024;
3132	if ((dimap = (u32 *) xmalloc(size, L2PSIZE, kernel_heap)) == NULL)
3133		assert(0);
3134	bzero((void *) dimap, size);
3135	imap->im_DBGdimap = dimap;
3136}
3137
3138/*
3139 *	DBGdiAlloc()
3140 */
3141static void DBGdiAlloc(struct inomap * imap, ino_t ino)
3142{
3143	u32 *dimap = imap->im_DBGdimap;
3144	int w, b;
3145	u32 m;
3146	w = ino >> 5;
3147	b = ino & 31;
3148	m = 0x80000000 >> b;
3149	assert(w < 64 * 256);
3150	if (dimap[w] & m) {
3151		printk("DEBUG diAlloc: duplicate alloc ino:0x%x\n", ino);
3152	}
3153	dimap[w] |= m;
3154}
3155
3156/*
3157 *	DBGdiFree()
3158 */
3159static void DBGdiFree(struct inomap * imap, ino_t ino)
3160{
3161	u32 *dimap = imap->im_DBGdimap;
3162	int w, b;
3163	u32 m;
3164	w = ino >> 5;
3165	b = ino & 31;
3166	m = 0x80000000 >> b;
3167	assert(w < 64 * 256);
3168	if ((dimap[w] & m) == 0) {
3169		printk("DEBUG diFree: duplicate free ino:0x%x\n", ino);
3170	}
3171	dimap[w] &= ~m;
3172}
3173
3174static void dump_cp(struct inomap * ipimap, char *function, int line)
3175{
3176	printk("\n* ********* *\nControl Page %s %d\n", function, line);
3177	printk("FreeIAG %d\tNextIAG %d\n", ipimap->im_freeiag,
3178	       ipimap->im_nextiag);
3179	printk("NumInos %d\tNumFree %d\n",
3180	       atomic_read(&ipimap->im_numinos),
3181	       atomic_read(&ipimap->im_numfree));
3182	printk("AG InoFree %d\tAG ExtFree %d\n",
3183	       ipimap->im_agctl[0].inofree, ipimap->im_agctl[0].extfree);
3184	printk("AG NumInos %d\tAG NumFree %d\n",
3185	       ipimap->im_agctl[0].numinos, ipimap->im_agctl[0].numfree);
3186}
3187
3188static void dump_iag(struct iag * iag, char *function, int line)
3189{
3190	printk("\n* ********* *\nIAG %s %d\n", function, line);
3191	printk("IagNum %d\tIAG Free %d\n", le32_to_cpu(iag->iagnum),
3192	       le32_to_cpu(iag->iagfree));
3193	printk("InoFreeFwd %d\tInoFreeBack %d\n",
3194	       le32_to_cpu(iag->inofreefwd),
3195	       le32_to_cpu(iag->inofreeback));
3196	printk("ExtFreeFwd %d\tExtFreeBack %d\n",
3197	       le32_to_cpu(iag->extfreefwd),
3198	       le32_to_cpu(iag->extfreeback));
3199	printk("NFreeInos %d\tNFreeExts %d\n", le32_to_cpu(iag->nfreeinos),
3200	       le32_to_cpu(iag->nfreeexts));
3201}
3202#endif				/* _JFS_DEBUG_IMAP */
3203