// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *   Copyright (C) International Business Machines Corp., 2000-2005
 *   Portions Copyright (C) Christoph Hellwig, 2001-2002
 */
6
7/*
8 *	jfs_txnmgr.c: transaction manager
9 *
10 * notes:
11 * transaction starts with txBegin() and ends with txCommit()
12 * or txAbort().
13 *
14 * tlock is acquired at the time of update;
15 * (obviate scan at commit time for xtree and dtree)
 * tlock and mp point to each other;
17 * (no hashlist for mp -> tlock).
18 *
19 * special cases:
20 * tlock on in-memory inode:
21 * in-place tlock in the in-memory inode itself;
22 * converted to page lock by iWrite() at commit time.
23 *
24 * tlock during write()/mmap() under anonymous transaction (tid = 0):
25 * transferred (?) to transaction at commit time.
26 *
27 * use the page itself to update allocation maps
28 * (obviate intermediate replication of allocation/deallocation data)
29 * hold on to mp+lock thru update of maps
30 */
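
/*
 * Illustrative sketch only (see real callers such as jfs_create() for
 * the actual locking and error handling); a typical transaction looks
 * like:
 *
 *	tid = txBegin(ip->i_sb, 0);
 *	... modify metadata, taking tlocks via txLock()/txMaplock() ...
 *	rc = txCommit(tid, 1, &ip, 0);
 *	txEnd(tid);
 */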
31
32#include <linux/fs.h>
33#include <linux/vmalloc.h>
34#include <linux/completion.h>
35#include <linux/freezer.h>
36#include <linux/module.h>
37#include <linux/moduleparam.h>
38#include <linux/kthread.h>
39#include <linux/seq_file.h>
40#include "jfs_incore.h"
41#include "jfs_inode.h"
42#include "jfs_filsys.h"
43#include "jfs_metapage.h"
44#include "jfs_dinode.h"
45#include "jfs_imap.h"
46#include "jfs_dmap.h"
47#include "jfs_superblock.h"
48#include "jfs_debug.h"
49
50/*
51 *	transaction management structures
52 */
53static struct {
54	int freetid;		/* index of a free tid structure */
	int freelock;		/* index of first free lock word */
56	wait_queue_head_t freewait;	/* eventlist of free tblock */
57	wait_queue_head_t freelockwait;	/* eventlist of free tlock */
58	wait_queue_head_t lowlockwait;	/* eventlist of ample tlocks */
59	int tlocksInUse;	/* Number of tlocks in use */
60	spinlock_t LazyLock;	/* synchronize sync_queue & unlock_queue */
61/*	struct tblock *sync_queue; * Transactions waiting for data sync */
62	struct list_head unlock_queue;	/* Txns waiting to be released */
63	struct list_head anon_list;	/* inodes having anonymous txns */
64	struct list_head anon_list2;	/* inodes having anonymous txns
65					   that couldn't be sync'ed */
66} TxAnchor;
67
68int jfs_tlocks_low;		/* Indicates low number of available tlocks */
69
70#ifdef CONFIG_JFS_STATISTICS
71static struct {
72	uint txBegin;
73	uint txBegin_barrier;
74	uint txBegin_lockslow;
75	uint txBegin_freetid;
76	uint txBeginAnon;
77	uint txBeginAnon_barrier;
78	uint txBeginAnon_lockslow;
79	uint txLockAlloc;
80	uint txLockAlloc_freelock;
81} TxStat;
82#endif
83
84static int nTxBlock = -1;	/* number of transaction blocks */
85module_param(nTxBlock, int, 0);
86MODULE_PARM_DESC(nTxBlock,
87		 "Number of transaction blocks (max:65536)");
88
89static int nTxLock = -1;	/* number of transaction locks */
90module_param(nTxLock, int, 0);
91MODULE_PARM_DESC(nTxLock,
92		 "Number of transaction locks (max:65536)");
93
94struct tblock *TxBlock;	/* transaction block table */
95static int TxLockLWM;	/* Low water mark for number of txLocks used */
96static int TxLockHWM;	/* High water mark for number of txLocks used */
97static int TxLockVHWM;	/* Very High water mark */
98struct tlock *TxLock;	/* transaction lock table */
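
/*
 * How the tlock watermarks are used (summary of the code below):
 * passing TxLockHWM in txLockAlloc() wakes jfsSyncThread, which should
 * free some anonymous tlocks; passing TxLockVHWM makes ordinary
 * (non-forced) txBegin()/txBeginAnon() callers wait; and dropping back
 * under TxLockLWM in txLockFree() clears jfs_tlocks_low again.
 */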
99
100/*
101 *	transaction management lock
102 */
103static DEFINE_SPINLOCK(jfsTxnLock);
104
105#define TXN_LOCK()		spin_lock(&jfsTxnLock)
106#define TXN_UNLOCK()		spin_unlock(&jfsTxnLock)
107
108#define LAZY_LOCK_INIT()	spin_lock_init(&TxAnchor.LazyLock)
109#define LAZY_LOCK(flags)	spin_lock_irqsave(&TxAnchor.LazyLock, flags)
110#define LAZY_UNLOCK(flags) spin_unlock_irqrestore(&TxAnchor.LazyLock, flags)
111
112static DECLARE_WAIT_QUEUE_HEAD(jfs_commit_thread_wait);
113static int jfs_commit_thread_waking;
114
/*
 * Retry logic exists outside these macros to protect against spurious wakeups.
 */
118static inline void TXN_SLEEP_DROP_LOCK(wait_queue_head_t * event)
119{
120	DECLARE_WAITQUEUE(wait, current);
121
122	add_wait_queue(event, &wait);
123	set_current_state(TASK_UNINTERRUPTIBLE);
124	TXN_UNLOCK();
125	io_schedule();
126	remove_wait_queue(event, &wait);
127}
128
129#define TXN_SLEEP(event)\
130{\
131	TXN_SLEEP_DROP_LOCK(event);\
132	TXN_LOCK();\
133}
134
135#define TXN_WAKEUP(event) wake_up_all(event)
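
/*
 * Callers of TXN_SLEEP() re-test their wait condition in a loop to cope
 * with spurious wakeups, e.g. txLockAlloc() below does:
 *
 *	while (!(lid = TxAnchor.freelock))
 *		TXN_SLEEP(&TxAnchor.freelockwait);
 */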
136
137/*
138 *	statistics
139 */
140static struct {
141	tid_t maxtid;		/* 4: biggest tid ever used */
142	lid_t maxlid;		/* 4: biggest lid ever used */
143	int ntid;		/* 4: # of transactions performed */
144	int nlid;		/* 4: # of tlocks acquired */
145	int waitlock;		/* 4: # of tlock wait */
146} stattx;
147
148/*
149 * forward references
150 */
151static void diLog(struct jfs_log *log, struct tblock *tblk, struct lrd *lrd,
152		struct tlock *tlck, struct commit *cd);
153static void dataLog(struct jfs_log *log, struct tblock *tblk, struct lrd *lrd,
154		struct tlock *tlck);
155static void dtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
156		struct tlock * tlck);
157static void mapLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
158		struct tlock * tlck);
159static void txAllocPMap(struct inode *ip, struct maplock * maplock,
160		struct tblock * tblk);
161static void txForce(struct tblock * tblk);
162static void txLog(struct jfs_log *log, struct tblock *tblk,
163		struct commit *cd);
164static void txUpdateMap(struct tblock * tblk);
165static void txRelease(struct tblock * tblk);
166static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
167	   struct tlock * tlck);
168static void LogSyncRelease(struct metapage * mp);
169
170/*
171 *		transaction block/lock management
172 *		---------------------------------
173 */
174
175/*
176 * Get a transaction lock from the free list.  If the number in use is
177 * greater than the high water mark, wake up the sync daemon.  This should
178 * free some anonymous transaction locks.  (TXN_LOCK must be held.)
179 */
180static lid_t txLockAlloc(void)
181{
182	lid_t lid;
183
184	INCREMENT(TxStat.txLockAlloc);
185	if (!TxAnchor.freelock) {
186		INCREMENT(TxStat.txLockAlloc_freelock);
187	}
188
189	while (!(lid = TxAnchor.freelock))
190		TXN_SLEEP(&TxAnchor.freelockwait);
191	TxAnchor.freelock = TxLock[lid].next;
192	HIGHWATERMARK(stattx.maxlid, lid);
193	if ((++TxAnchor.tlocksInUse > TxLockHWM) && (jfs_tlocks_low == 0)) {
194		jfs_info("txLockAlloc tlocks low");
195		jfs_tlocks_low = 1;
196		wake_up_process(jfsSyncThread);
197	}
198
199	return lid;
200}
201
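/*
 * Return a tlock to the head of the freelist and wake up any waiters.
 * If enough tlocks have been freed to drop back under the low water
 * mark, clear jfs_tlocks_low and wake up callers throttled in
 * txBegin()/txBeginAnon().  (TXN_LOCK must be held, as for txLockAlloc().)
 */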
202static void txLockFree(lid_t lid)
203{
204	TxLock[lid].tid = 0;
205	TxLock[lid].next = TxAnchor.freelock;
206	TxAnchor.freelock = lid;
207	TxAnchor.tlocksInUse--;
208	if (jfs_tlocks_low && (TxAnchor.tlocksInUse < TxLockLWM)) {
209		jfs_info("txLockFree jfs_tlocks_low no more");
210		jfs_tlocks_low = 0;
211		TXN_WAKEUP(&TxAnchor.lowlockwait);
212	}
213	TXN_WAKEUP(&TxAnchor.freelockwait);
214}
215
216/*
217 * NAME:	txInit()
218 *
219 * FUNCTION:	initialize transaction management structures
220 *
221 * RETURN:
222 *
223 * serialization: single thread at jfs_init()
224 */
225int txInit(void)
226{
227	int k, size;
228	struct sysinfo si;
229
230	/* Set defaults for nTxLock and nTxBlock if unset */
231
232	if (nTxLock == -1) {
233		if (nTxBlock == -1) {
234			/* Base default on memory size */
235			si_meminfo(&si);
236			if (si.totalram > (256 * 1024)) /* 1 GB */
237				nTxLock = 64 * 1024;
238			else
239				nTxLock = si.totalram >> 2;
240		} else if (nTxBlock > (8 * 1024))
241			nTxLock = 64 * 1024;
242		else
243			nTxLock = nTxBlock << 3;
244	}
245	if (nTxBlock == -1)
246		nTxBlock = nTxLock >> 3;
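
	/*
	 * Worked example (assuming 4K pages): on a 512MB machine
	 * si.totalram is about 128K pages, below the 1GB threshold above,
	 * so nTxLock defaults to totalram >> 2 (about 32K) and nTxBlock
	 * to nTxLock >> 3 (about 4K).  The watermarks computed below then
	 * come out to roughly 40%, 70% and 80% of nTxLock.
	 */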
247
248	/* Verify tunable parameters */
249	if (nTxBlock < 16)
250		nTxBlock = 16;	/* No one should set it this low */
251	if (nTxBlock > 65536)
252		nTxBlock = 65536;
253	if (nTxLock < 256)
254		nTxLock = 256;	/* No one should set it this low */
255	if (nTxLock > 65536)
256		nTxLock = 65536;
257
258	printk(KERN_INFO "JFS: nTxBlock = %d, nTxLock = %d\n",
259	       nTxBlock, nTxLock);
260	/*
261	 * initialize transaction block (tblock) table
262	 *
263	 * transaction id (tid) = tblock index
264	 * tid = 0 is reserved.
265	 */
266	TxLockLWM = (nTxLock * 4) / 10;
267	TxLockHWM = (nTxLock * 7) / 10;
268	TxLockVHWM = (nTxLock * 8) / 10;
269
270	size = sizeof(struct tblock) * nTxBlock;
271	TxBlock = vmalloc(size);
272	if (TxBlock == NULL)
273		return -ENOMEM;
274
275	for (k = 1; k < nTxBlock - 1; k++) {
276		TxBlock[k].next = k + 1;
277		init_waitqueue_head(&TxBlock[k].gcwait);
278		init_waitqueue_head(&TxBlock[k].waitor);
279	}
280	TxBlock[k].next = 0;
281	init_waitqueue_head(&TxBlock[k].gcwait);
282	init_waitqueue_head(&TxBlock[k].waitor);
283
284	TxAnchor.freetid = 1;
285	init_waitqueue_head(&TxAnchor.freewait);
286
287	stattx.maxtid = 1;	/* statistics */
288
289	/*
290	 * initialize transaction lock (tlock) table
291	 *
292	 * transaction lock id = tlock index
293	 * tlock id = 0 is reserved.
294	 */
295	size = sizeof(struct tlock) * nTxLock;
296	TxLock = vmalloc(size);
297	if (TxLock == NULL) {
298		vfree(TxBlock);
299		return -ENOMEM;
300	}
301
302	/* initialize tlock table */
303	for (k = 1; k < nTxLock - 1; k++)
304		TxLock[k].next = k + 1;
305	TxLock[k].next = 0;
306	init_waitqueue_head(&TxAnchor.freelockwait);
307	init_waitqueue_head(&TxAnchor.lowlockwait);
308
309	TxAnchor.freelock = 1;
310	TxAnchor.tlocksInUse = 0;
311	INIT_LIST_HEAD(&TxAnchor.anon_list);
312	INIT_LIST_HEAD(&TxAnchor.anon_list2);
313
314	LAZY_LOCK_INIT();
315	INIT_LIST_HEAD(&TxAnchor.unlock_queue);
316
317	stattx.maxlid = 1;	/* statistics */
318
319	return 0;
320}
321
322/*
323 * NAME:	txExit()
324 *
325 * FUNCTION:	clean up when module is unloaded
326 */
327void txExit(void)
328{
329	vfree(TxLock);
330	TxLock = NULL;
331	vfree(TxBlock);
332	TxBlock = NULL;
333}
334
335/*
336 * NAME:	txBegin()
337 *
338 * FUNCTION:	start a transaction.
339 *
340 * PARAMETER:	sb	- superblock
341 *		flag	- force for nested tx;
342 *
343 * RETURN:	tid	- transaction id
344 *
 * note: the force flag allows a nested tx to be started,
 * to prevent deadlock on the logsync barrier;
347 */
348tid_t txBegin(struct super_block *sb, int flag)
349{
350	tid_t t;
351	struct tblock *tblk;
352	struct jfs_log *log;
353
354	jfs_info("txBegin: flag = 0x%x", flag);
355	log = JFS_SBI(sb)->log;
356
357	if (!log) {
358		jfs_error(sb, "read-only filesystem\n");
359		return 0;
360	}
361
362	TXN_LOCK();
363
364	INCREMENT(TxStat.txBegin);
365
366      retry:
367	if (!(flag & COMMIT_FORCE)) {
368		/*
369		 * synchronize with logsync barrier
370		 */
371		if (test_bit(log_SYNCBARRIER, &log->flag) ||
372		    test_bit(log_QUIESCE, &log->flag)) {
373			INCREMENT(TxStat.txBegin_barrier);
374			TXN_SLEEP(&log->syncwait);
375			goto retry;
376		}
377	}
378	if (flag == 0) {
379		/*
380		 * Don't begin transaction if we're getting starved for tlocks
381		 * unless COMMIT_FORCE or COMMIT_INODE (which may ultimately
382		 * free tlocks)
383		 */
384		if (TxAnchor.tlocksInUse > TxLockVHWM) {
385			INCREMENT(TxStat.txBegin_lockslow);
386			TXN_SLEEP(&TxAnchor.lowlockwait);
387			goto retry;
388		}
389	}
390
391	/*
392	 * allocate transaction id/block
393	 */
394	if ((t = TxAnchor.freetid) == 0) {
395		jfs_info("txBegin: waiting for free tid");
396		INCREMENT(TxStat.txBegin_freetid);
397		TXN_SLEEP(&TxAnchor.freewait);
398		goto retry;
399	}
400
401	tblk = tid_to_tblock(t);
402
403	if ((tblk->next == 0) && !(flag & COMMIT_FORCE)) {
404		/* Don't let a non-forced transaction take the last tblk */
405		jfs_info("txBegin: waiting for free tid");
406		INCREMENT(TxStat.txBegin_freetid);
407		TXN_SLEEP(&TxAnchor.freewait);
408		goto retry;
409	}
410
411	TxAnchor.freetid = tblk->next;
412
413	/*
414	 * initialize transaction
415	 */
416
417	/*
418	 * We can't zero the whole thing or we screw up another thread being
419	 * awakened after sleeping on tblk->waitor
420	 *
421	 * memset(tblk, 0, sizeof(struct tblock));
422	 */
423	tblk->next = tblk->last = tblk->xflag = tblk->flag = tblk->lsn = 0;
424
425	tblk->sb = sb;
426	++log->logtid;
427	tblk->logtid = log->logtid;
428
429	++log->active;
430
431	HIGHWATERMARK(stattx.maxtid, t);	/* statistics */
432	INCREMENT(stattx.ntid);	/* statistics */
433
434	TXN_UNLOCK();
435
436	jfs_info("txBegin: returning tid = %d", t);
437
438	return t;
439}
440
441/*
442 * NAME:	txBeginAnon()
443 *
444 * FUNCTION:	start an anonymous transaction.
 *		Blocks during a logsync barrier or when available tlocks are
 *		low, to prevent anonymous tlocks from depleting the supply.
447 *
448 * PARAMETER:	sb	- superblock
449 *
450 * RETURN:	none
451 */
452void txBeginAnon(struct super_block *sb)
453{
454	struct jfs_log *log;
455
456	log = JFS_SBI(sb)->log;
457
458	TXN_LOCK();
459	INCREMENT(TxStat.txBeginAnon);
460
461      retry:
462	/*
463	 * synchronize with logsync barrier
464	 */
465	if (test_bit(log_SYNCBARRIER, &log->flag) ||
466	    test_bit(log_QUIESCE, &log->flag)) {
467		INCREMENT(TxStat.txBeginAnon_barrier);
468		TXN_SLEEP(&log->syncwait);
469		goto retry;
470	}
471
472	/*
473	 * Don't begin transaction if we're getting starved for tlocks
474	 */
475	if (TxAnchor.tlocksInUse > TxLockVHWM) {
476		INCREMENT(TxStat.txBeginAnon_lockslow);
477		TXN_SLEEP(&TxAnchor.lowlockwait);
478		goto retry;
479	}
480	TXN_UNLOCK();
481}
482
483/*
484 *	txEnd()
485 *
486 * function: free specified transaction block.
487 *
488 *	logsync barrier processing:
489 *
490 * serialization:
491 */
492void txEnd(tid_t tid)
493{
494	struct tblock *tblk = tid_to_tblock(tid);
495	struct jfs_log *log;
496
497	jfs_info("txEnd: tid = %d", tid);
498	TXN_LOCK();
499
500	/*
501	 * wakeup transactions waiting on the page locked
502	 * by the current transaction
503	 */
504	TXN_WAKEUP(&tblk->waitor);
505
506	log = JFS_SBI(tblk->sb)->log;
507
508	/*
509	 * Lazy commit thread can't free this guy until we mark it UNLOCKED,
510	 * otherwise, we would be left with a transaction that may have been
511	 * reused.
512	 *
513	 * Lazy commit thread will turn off tblkGC_LAZY before calling this
514	 * routine.
515	 */
516	if (tblk->flag & tblkGC_LAZY) {
517		jfs_info("txEnd called w/lazy tid: %d, tblk = 0x%p", tid, tblk);
518		TXN_UNLOCK();
519
520		spin_lock_irq(&log->gclock);	// LOGGC_LOCK
521		tblk->flag |= tblkGC_UNLOCKED;
522		spin_unlock_irq(&log->gclock);	// LOGGC_UNLOCK
523		return;
524	}
525
526	jfs_info("txEnd: tid: %d, tblk = 0x%p", tid, tblk);
527
528	assert(tblk->next == 0);
529
530	/*
531	 * insert tblock back on freelist
532	 */
533	tblk->next = TxAnchor.freetid;
534	TxAnchor.freetid = tid;
535
536	/*
537	 * mark the tblock not active
538	 */
539	if (--log->active == 0) {
540		clear_bit(log_FLUSH, &log->flag);
541
542		/*
543		 * synchronize with logsync barrier
544		 */
545		if (test_bit(log_SYNCBARRIER, &log->flag)) {
546			TXN_UNLOCK();
547
548			/* write dirty metadata & forward log syncpt */
549			jfs_syncpt(log, 1);
550
551			jfs_info("log barrier off: 0x%x", log->lsn);
552
			/* enable new transactions to start */
			clear_bit(log_SYNCBARRIER, &log->flag);

			/* wakeup all waiters for the logsync barrier */
557			TXN_WAKEUP(&log->syncwait);
558
559			goto wakeup;
560		}
561	}
562
563	TXN_UNLOCK();
564wakeup:
	/*
	 * wakeup all waiters for a free tblock
	 */
568	TXN_WAKEUP(&TxAnchor.freewait);
569}
570
571/*
572 *	txLock()
573 *
574 * function: acquire a transaction lock on the specified <mp>
575 *
576 * parameter:
577 *
578 * return:	transaction lock id
579 *
580 * serialization:
581 */
582struct tlock *txLock(tid_t tid, struct inode *ip, struct metapage * mp,
583		     int type)
584{
585	struct jfs_inode_info *jfs_ip = JFS_IP(ip);
586	int dir_xtree = 0;
587	lid_t lid;
588	tid_t xtid;
589	struct tlock *tlck;
590	struct xtlock *xtlck;
591	struct linelock *linelock;
592	xtpage_t *p;
593	struct tblock *tblk;
594
595	TXN_LOCK();
596
597	if (S_ISDIR(ip->i_mode) && (type & tlckXTREE) &&
598	    !(mp->xflag & COMMIT_PAGE)) {
599		/*
600		 * Directory inode is special.  It can have both an xtree tlock
601		 * and a dtree tlock associated with it.
602		 */
603		dir_xtree = 1;
604		lid = jfs_ip->xtlid;
605	} else
606		lid = mp->lid;
607
608	/* is page not locked by a transaction ? */
609	if (lid == 0)
610		goto allocateLock;
611
612	jfs_info("txLock: tid:%d ip:0x%p mp:0x%p lid:%d", tid, ip, mp, lid);
613
614	/* is page locked by the requester transaction ? */
615	tlck = lid_to_tlock(lid);
616	if ((xtid = tlck->tid) == tid) {
617		TXN_UNLOCK();
618		goto grantLock;
619	}
620
621	/*
622	 * is page locked by anonymous transaction/lock ?
623	 *
624	 * (page update without transaction (i.e., file write) is
625	 * locked under anonymous transaction tid = 0:
626	 * anonymous tlocks maintained on anonymous tlock list of
627	 * the inode of the page and available to all anonymous
628	 * transactions until txCommit() time at which point
629	 * they are transferred to the transaction tlock list of
630	 * the committing transaction of the inode)
631	 */
632	if (xtid == 0) {
633		tlck->tid = tid;
634		TXN_UNLOCK();
635		tblk = tid_to_tblock(tid);
636		/*
637		 * The order of the tlocks in the transaction is important
638		 * (during truncate, child xtree pages must be freed before
639		 * parent's tlocks change the working map).
640		 * Take tlock off anonymous list and add to tail of
641		 * transaction list
642		 *
643		 * Note:  We really need to get rid of the tid & lid and
644		 * use list_head's.  This code is getting UGLY!
645		 */
646		if (jfs_ip->atlhead == lid) {
647			if (jfs_ip->atltail == lid) {
648				/* only anonymous txn.
649				 * Remove from anon_list
650				 */
651				TXN_LOCK();
652				list_del_init(&jfs_ip->anon_inode_list);
653				TXN_UNLOCK();
654			}
655			jfs_ip->atlhead = tlck->next;
656		} else {
657			lid_t last;
658			for (last = jfs_ip->atlhead;
659			     lid_to_tlock(last)->next != lid;
660			     last = lid_to_tlock(last)->next) {
661				assert(last);
662			}
663			lid_to_tlock(last)->next = tlck->next;
664			if (jfs_ip->atltail == lid)
665				jfs_ip->atltail = last;
666		}
667
668		/* insert the tlock at tail of transaction tlock list */
669
670		if (tblk->next)
671			lid_to_tlock(tblk->last)->next = lid;
672		else
673			tblk->next = lid;
674		tlck->next = 0;
675		tblk->last = lid;
676
677		goto grantLock;
678	}
679
680	goto waitLock;
681
682	/*
683	 * allocate a tlock
684	 */
685      allocateLock:
686	lid = txLockAlloc();
687	tlck = lid_to_tlock(lid);
688
689	/*
690	 * initialize tlock
691	 */
692	tlck->tid = tid;
693
694	TXN_UNLOCK();
695
696	/* mark tlock for meta-data page */
697	if (mp->xflag & COMMIT_PAGE) {
698
699		tlck->flag = tlckPAGELOCK;
700
701		/* mark the page dirty and nohomeok */
702		metapage_nohomeok(mp);
703
704		jfs_info("locking mp = 0x%p, nohomeok = %d tid = %d tlck = 0x%p",
705			 mp, mp->nohomeok, tid, tlck);
706
707		/* if anonymous transaction, and buffer is on the group
708		 * commit synclist, mark inode to show this.  This will
709		 * prevent the buffer from being marked nohomeok for too
710		 * long a time.
711		 */
712		if ((tid == 0) && mp->lsn)
713			set_cflag(COMMIT_Synclist, ip);
714	}
715	/* mark tlock for in-memory inode */
716	else
717		tlck->flag = tlckINODELOCK;
718
719	if (S_ISDIR(ip->i_mode))
720		tlck->flag |= tlckDIRECTORY;
721
722	tlck->type = 0;
723
724	/* bind the tlock and the page */
725	tlck->ip = ip;
726	tlck->mp = mp;
727	if (dir_xtree)
728		jfs_ip->xtlid = lid;
729	else
730		mp->lid = lid;
731
732	/*
733	 * enqueue transaction lock to transaction/inode
734	 */
735	/* insert the tlock at tail of transaction tlock list */
736	if (tid) {
737		tblk = tid_to_tblock(tid);
738		if (tblk->next)
739			lid_to_tlock(tblk->last)->next = lid;
740		else
741			tblk->next = lid;
742		tlck->next = 0;
743		tblk->last = lid;
744	}
745	/* anonymous transaction:
746	 * insert the tlock at head of inode anonymous tlock list
747	 */
748	else {
749		tlck->next = jfs_ip->atlhead;
750		jfs_ip->atlhead = lid;
751		if (tlck->next == 0) {
752			/* This inode's first anonymous transaction */
753			jfs_ip->atltail = lid;
754			TXN_LOCK();
755			list_add_tail(&jfs_ip->anon_inode_list,
756				      &TxAnchor.anon_list);
757			TXN_UNLOCK();
758		}
759	}
760
761	/* initialize type dependent area for linelock */
762	linelock = (struct linelock *) & tlck->lock;
763	linelock->next = 0;
764	linelock->flag = tlckLINELOCK;
765	linelock->maxcnt = TLOCKSHORT;
766	linelock->index = 0;
767
768	switch (type & tlckTYPE) {
769	case tlckDTREE:
770		linelock->l2linesize = L2DTSLOTSIZE;
771		break;
772
773	case tlckXTREE:
774		linelock->l2linesize = L2XTSLOTSIZE;
775
776		xtlck = (struct xtlock *) linelock;
777		xtlck->header.offset = 0;
778		xtlck->header.length = 2;
779
780		if (type & tlckNEW) {
781			xtlck->lwm.offset = XTENTRYSTART;
782		} else {
783			if (mp->xflag & COMMIT_PAGE)
784				p = (xtpage_t *) mp->data;
785			else
786				p = (xtpage_t *) &jfs_ip->i_xtroot;
787			xtlck->lwm.offset =
788			    le16_to_cpu(p->header.nextindex);
789		}
790		xtlck->lwm.length = 0;	/* ! */
791		xtlck->twm.offset = 0;
792		xtlck->hwm.offset = 0;
793
794		xtlck->index = 2;
795		break;
796
797	case tlckINODE:
798		linelock->l2linesize = L2INODESLOTSIZE;
799		break;
800
801	case tlckDATA:
802		linelock->l2linesize = L2DATASLOTSIZE;
803		break;
804
805	default:
806		jfs_err("UFO tlock:0x%p", tlck);
807	}
808
809	/*
810	 * update tlock vector
811	 */
812      grantLock:
813	tlck->type |= type;
814
815	return tlck;
816
817	/*
818	 * page is being locked by another transaction:
819	 */
820      waitLock:
821	/* Only locks on ipimap or ipaimap should reach here */
822	/* assert(jfs_ip->fileset == AGGREGATE_I); */
823	if (jfs_ip->fileset != AGGREGATE_I) {
824		printk(KERN_ERR "txLock: trying to lock locked page!");
825		print_hex_dump(KERN_ERR, "ip: ", DUMP_PREFIX_ADDRESS, 16, 4,
826			       ip, sizeof(*ip), 0);
827		print_hex_dump(KERN_ERR, "mp: ", DUMP_PREFIX_ADDRESS, 16, 4,
828			       mp, sizeof(*mp), 0);
829		print_hex_dump(KERN_ERR, "Locker's tblock: ",
830			       DUMP_PREFIX_ADDRESS, 16, 4, tid_to_tblock(tid),
831			       sizeof(struct tblock), 0);
832		print_hex_dump(KERN_ERR, "Tlock: ", DUMP_PREFIX_ADDRESS, 16, 4,
833			       tlck, sizeof(*tlck), 0);
834		BUG();
835	}
836	INCREMENT(stattx.waitlock);	/* statistics */
837	TXN_UNLOCK();
838	release_metapage(mp);
839	TXN_LOCK();
840	xtid = tlck->tid;	/* reacquire after dropping TXN_LOCK */
841
842	jfs_info("txLock: in waitLock, tid = %d, xtid = %d, lid = %d",
843		 tid, xtid, lid);
844
845	/* Recheck everything since dropping TXN_LOCK */
846	if (xtid && (tlck->mp == mp) && (mp->lid == lid))
847		TXN_SLEEP_DROP_LOCK(&tid_to_tblock(xtid)->waitor);
848	else
849		TXN_UNLOCK();
850	jfs_info("txLock: awakened     tid = %d, lid = %d", tid, lid);
851
852	return NULL;
853}
854
855/*
856 * NAME:	txRelease()
857 *
858 * FUNCTION:	Release buffers associated with transaction locks, but don't
 *		mark homeok yet.  This allows other transactions to modify
860 *		buffers, but won't let them go to disk until commit record
861 *		actually gets written.
862 *
863 * PARAMETER:
864 *		tblk	-
865 *
866 * RETURN:	Errors from subroutines.
867 */
868static void txRelease(struct tblock * tblk)
869{
870	struct metapage *mp;
871	lid_t lid;
872	struct tlock *tlck;
873
874	TXN_LOCK();
875
876	for (lid = tblk->next; lid; lid = tlck->next) {
877		tlck = lid_to_tlock(lid);
878		if ((mp = tlck->mp) != NULL &&
879		    (tlck->type & tlckBTROOT) == 0) {
880			assert(mp->xflag & COMMIT_PAGE);
881			mp->lid = 0;
882		}
883	}
884
885	/*
886	 * wakeup transactions waiting on a page locked
887	 * by the current transaction
888	 */
889	TXN_WAKEUP(&tblk->waitor);
890
891	TXN_UNLOCK();
892}
893
894/*
895 * NAME:	txUnlock()
896 *
897 * FUNCTION:	Initiates pageout of pages modified by tid in journalled
898 *		objects and frees their lockwords.
899 */
900static void txUnlock(struct tblock * tblk)
901{
902	struct tlock *tlck;
903	struct linelock *linelock;
904	lid_t lid, next, llid, k;
905	struct metapage *mp;
906	struct jfs_log *log;
907	int difft, diffp;
908	unsigned long flags;
909
910	jfs_info("txUnlock: tblk = 0x%p", tblk);
911	log = JFS_SBI(tblk->sb)->log;
912
913	/*
914	 * mark page under tlock homeok (its log has been written):
915	 */
916	for (lid = tblk->next; lid; lid = next) {
917		tlck = lid_to_tlock(lid);
918		next = tlck->next;
919
920		jfs_info("unlocking lid = %d, tlck = 0x%p", lid, tlck);
921
922		/* unbind page from tlock */
923		if ((mp = tlck->mp) != NULL &&
924		    (tlck->type & tlckBTROOT) == 0) {
925			assert(mp->xflag & COMMIT_PAGE);
926
927			/* hold buffer
928			 */
929			hold_metapage(mp);
930
931			assert(mp->nohomeok > 0);
932			_metapage_homeok(mp);
933
934			/* inherit younger/larger clsn */
935			LOGSYNC_LOCK(log, flags);
936			if (mp->clsn) {
937				logdiff(difft, tblk->clsn, log);
938				logdiff(diffp, mp->clsn, log);
939				if (difft > diffp)
940					mp->clsn = tblk->clsn;
941			} else
942				mp->clsn = tblk->clsn;
943			LOGSYNC_UNLOCK(log, flags);
944
945			assert(!(tlck->flag & tlckFREEPAGE));
946
947			put_metapage(mp);
948		}
949
950		/* insert tlock, and linelock(s) of the tlock if any,
951		 * at head of freelist
952		 */
953		TXN_LOCK();
954
955		llid = ((struct linelock *) & tlck->lock)->next;
956		while (llid) {
957			linelock = (struct linelock *) lid_to_tlock(llid);
958			k = linelock->next;
959			txLockFree(llid);
960			llid = k;
961		}
962		txLockFree(lid);
963
964		TXN_UNLOCK();
965	}
966	tblk->next = tblk->last = 0;
967
968	/*
969	 * remove tblock from logsynclist
970	 * (allocation map pages inherited lsn of tblk and
	 * have been inserted in logsync list at txUpdateMap())
972	 */
973	if (tblk->lsn) {
974		LOGSYNC_LOCK(log, flags);
975		log->count--;
976		list_del(&tblk->synclist);
977		LOGSYNC_UNLOCK(log, flags);
978	}
979}
980
981/*
982 *	txMaplock()
983 *
984 * function: allocate a transaction lock for freed page/entry;
985 *	for freed page, maplock is used as xtlock/dtlock type;
986 */
987struct tlock *txMaplock(tid_t tid, struct inode *ip, int type)
988{
989	struct jfs_inode_info *jfs_ip = JFS_IP(ip);
990	lid_t lid;
991	struct tblock *tblk;
992	struct tlock *tlck;
993	struct maplock *maplock;
994
995	TXN_LOCK();
996
997	/*
998	 * allocate a tlock
999	 */
1000	lid = txLockAlloc();
1001	tlck = lid_to_tlock(lid);
1002
1003	/*
1004	 * initialize tlock
1005	 */
1006	tlck->tid = tid;
1007
1008	/* bind the tlock and the object */
1009	tlck->flag = tlckINODELOCK;
1010	if (S_ISDIR(ip->i_mode))
1011		tlck->flag |= tlckDIRECTORY;
1012	tlck->ip = ip;
1013	tlck->mp = NULL;
1014
1015	tlck->type = type;
1016
1017	/*
1018	 * enqueue transaction lock to transaction/inode
1019	 */
1020	/* insert the tlock at tail of transaction tlock list */
1021	if (tid) {
1022		tblk = tid_to_tblock(tid);
1023		if (tblk->next)
1024			lid_to_tlock(tblk->last)->next = lid;
1025		else
1026			tblk->next = lid;
1027		tlck->next = 0;
1028		tblk->last = lid;
1029	}
1030	/* anonymous transaction:
1031	 * insert the tlock at head of inode anonymous tlock list
1032	 */
1033	else {
1034		tlck->next = jfs_ip->atlhead;
1035		jfs_ip->atlhead = lid;
1036		if (tlck->next == 0) {
1037			/* This inode's first anonymous transaction */
1038			jfs_ip->atltail = lid;
1039			list_add_tail(&jfs_ip->anon_inode_list,
1040				      &TxAnchor.anon_list);
1041		}
1042	}
1043
1044	TXN_UNLOCK();
1045
1046	/* initialize type dependent area for maplock */
1047	maplock = (struct maplock *) & tlck->lock;
1048	maplock->next = 0;
1049	maplock->maxcnt = 0;
1050	maplock->index = 0;
1051
1052	return tlck;
1053}
1054
1055/*
1056 *	txLinelock()
1057 *
1058 * function: allocate a transaction lock for log vector list
1059 */
1060struct linelock *txLinelock(struct linelock * tlock)
1061{
1062	lid_t lid;
1063	struct tlock *tlck;
1064	struct linelock *linelock;
1065
1066	TXN_LOCK();
1067
1068	/* allocate a TxLock structure */
1069	lid = txLockAlloc();
1070	tlck = lid_to_tlock(lid);
1071
1072	TXN_UNLOCK();
1073
1074	/* initialize linelock */
1075	linelock = (struct linelock *) tlck;
1076	linelock->next = 0;
1077	linelock->flag = tlckLINELOCK;
1078	linelock->maxcnt = TLOCKLONG;
1079	linelock->index = 0;
1080	if (tlck->flag & tlckDIRECTORY)
1081		linelock->flag |= tlckDIRECTORY;
1082
1083	/* append linelock after tlock */
1084	linelock->next = tlock->next;
1085	tlock->next = lid;
1086
1087	return linelock;
1088}
1089
1090/*
1091 *		transaction commit management
1092 *		-----------------------------
1093 */
1094
1095/*
1096 * NAME:	txCommit()
1097 *
1098 * FUNCTION:	commit the changes to the objects specified in
1099 *		clist.  For journalled segments only the
1100 *		changes of the caller are committed, ie by tid.
1101 *		for non-journalled segments the data are flushed to
1102 *		disk and then the change to the disk inode and indirect
1103 *		blocks committed (so blocks newly allocated to the
1104 *		segment will be made a part of the segment atomically).
1105 *
1106 *		all of the segments specified in clist must be in
1107 *		one file system. no more than 6 segments are needed
1108 *		to handle all unix svcs.
1109 *
1110 *		if the i_nlink field (i.e. disk inode link count)
1111 *		is zero, and the type of inode is a regular file or
 *		directory, or symbolic link, the inode is truncated
1113 *		to zero length. the truncation is committed but the
1114 *		VM resources are unaffected until it is closed (see
1115 *		iput and iclose).
1116 *
1117 * PARAMETER:
1118 *
1119 * RETURN:
1120 *
1121 * serialization:
1122 *		on entry the inode lock on each segment is assumed
1123 *		to be held.
1124 *
1125 * i/o error:
1126 */
1127int txCommit(tid_t tid,		/* transaction identifier */
1128	     int nip,		/* number of inodes to commit */
1129	     struct inode **iplist,	/* list of inode to commit */
1130	     int flag)
1131{
1132	int rc = 0;
1133	struct commit cd;
1134	struct jfs_log *log;
1135	struct tblock *tblk;
1136	struct lrd *lrd;
1137	struct inode *ip;
1138	struct jfs_inode_info *jfs_ip;
1139	int k, n;
1140	ino_t top;
1141	struct super_block *sb;
1142
1143	jfs_info("txCommit, tid = %d, flag = %d", tid, flag);
1144	/* is read-only file system ? */
1145	if (isReadOnly(iplist[0])) {
1146		rc = -EROFS;
1147		goto TheEnd;
1148	}
1149
1150	sb = cd.sb = iplist[0]->i_sb;
1151	cd.tid = tid;
1152
1153	if (tid == 0)
1154		tid = txBegin(sb, 0);
1155	tblk = tid_to_tblock(tid);
1156
1157	/*
1158	 * initialize commit structure
1159	 */
1160	log = JFS_SBI(sb)->log;
1161	cd.log = log;
1162
1163	/* initialize log record descriptor in commit */
1164	lrd = &cd.lrd;
1165	lrd->logtid = cpu_to_le32(tblk->logtid);
1166	lrd->backchain = 0;
1167
1168	tblk->xflag |= flag;
1169
1170	if ((flag & (COMMIT_FORCE | COMMIT_SYNC)) == 0)
1171		tblk->xflag |= COMMIT_LAZY;
1172	/*
1173	 *	prepare non-journaled objects for commit
1174	 *
1175	 * flush data pages of non-journaled file
	 * to prevent the file from getting uninitialized disk blocks
	 * in case of a crash.
1178	 * (new blocks - )
1179	 */
1180	cd.iplist = iplist;
1181	cd.nip = nip;
1182
1183	/*
1184	 *	acquire transaction lock on (on-disk) inodes
1185	 *
1186	 * update on-disk inode from in-memory inode
1187	 * acquiring transaction locks for AFTER records
1188	 * on the on-disk inode of file object
1189	 *
1190	 * sort the inodes array by inode number in descending order
1191	 * to prevent deadlock when acquiring transaction lock
1192	 * of on-disk inodes on multiple on-disk inode pages by
1193	 * multiple concurrent transactions
1194	 */
1195	for (k = 0; k < cd.nip; k++) {
1196		top = (cd.iplist[k])->i_ino;
1197		for (n = k + 1; n < cd.nip; n++) {
1198			ip = cd.iplist[n];
1199			if (ip->i_ino > top) {
1200				top = ip->i_ino;
1201				cd.iplist[n] = cd.iplist[k];
1202				cd.iplist[k] = ip;
1203			}
1204		}
1205
1206		ip = cd.iplist[k];
1207		jfs_ip = JFS_IP(ip);
1208
1209		/*
1210		 * BUGBUG - This code has temporarily been removed.  The
1211		 * intent is to ensure that any file data is written before
1212		 * the metadata is committed to the journal.  This prevents
1213		 * uninitialized data from appearing in a file after the
1214		 * journal has been replayed.  (The uninitialized data
1215		 * could be sensitive data removed by another user.)
1216		 *
1217		 * The problem now is that we are holding the IWRITELOCK
1218		 * on the inode, and calling filemap_fdatawrite on an
1219		 * unmapped page will cause a deadlock in jfs_get_block.
1220		 *
1221		 * The long term solution is to pare down the use of
1222		 * IWRITELOCK.  We are currently holding it too long.
1223		 * We could also be smarter about which data pages need
1224		 * to be written before the transaction is committed and
1225		 * when we don't need to worry about it at all.
1226		 *
1227		 * if ((!S_ISDIR(ip->i_mode))
1228		 *    && (tblk->flag & COMMIT_DELETE) == 0)
1229		 *	filemap_write_and_wait(ip->i_mapping);
1230		 */
1231
1232		/*
1233		 * Mark inode as not dirty.  It will still be on the dirty
1234		 * inode list, but we'll know not to commit it again unless
1235		 * it gets marked dirty again
1236		 */
1237		clear_cflag(COMMIT_Dirty, ip);
1238
1239		/* inherit anonymous tlock(s) of inode */
1240		if (jfs_ip->atlhead) {
1241			lid_to_tlock(jfs_ip->atltail)->next = tblk->next;
1242			tblk->next = jfs_ip->atlhead;
1243			if (!tblk->last)
1244				tblk->last = jfs_ip->atltail;
1245			jfs_ip->atlhead = jfs_ip->atltail = 0;
1246			TXN_LOCK();
1247			list_del_init(&jfs_ip->anon_inode_list);
1248			TXN_UNLOCK();
1249		}
1250
1251		/*
1252		 * acquire transaction lock on on-disk inode page
1253		 * (become first tlock of the tblk's tlock list)
1254		 */
1255		if (((rc = diWrite(tid, ip))))
1256			goto out;
1257	}
1258
1259	/*
1260	 *	write log records from transaction locks
1261	 *
1262	 * txUpdateMap() resets XAD_NEW in XAD.
1263	 */
1264	txLog(log, tblk, &cd);
1265
1266	/*
1267	 * Ensure that inode isn't reused before
1268	 * lazy commit thread finishes processing
1269	 */
1270	if (tblk->xflag & COMMIT_DELETE) {
1271		ihold(tblk->u.ip);
1272		/*
1273		 * Avoid a rare deadlock
1274		 *
1275		 * If the inode is locked, we may be blocked in
1276		 * jfs_commit_inode.  If so, we don't want the
1277		 * lazy_commit thread doing the last iput() on the inode
1278		 * since that may block on the locked inode.  Instead,
1279		 * commit the transaction synchronously, so the last iput
1280		 * will be done by the calling thread (or later)
1281		 */
1282		/*
1283		 * I believe this code is no longer needed.  Splitting I_LOCK
1284		 * into two bits, I_NEW and I_SYNC should prevent this
1285		 * deadlock as well.  But since I don't have a JFS testload
1286		 * to verify this, only a trivial s/I_LOCK/I_SYNC/ was done.
1287		 * Joern
1288		 */
1289		if (tblk->u.ip->i_state & I_SYNC)
1290			tblk->xflag &= ~COMMIT_LAZY;
1291	}
1292
1293	ASSERT((!(tblk->xflag & COMMIT_DELETE)) ||
1294	       ((tblk->u.ip->i_nlink == 0) &&
1295		!test_cflag(COMMIT_Nolink, tblk->u.ip)));
1296
1297	/*
1298	 *	write COMMIT log record
1299	 */
1300	lrd->type = cpu_to_le16(LOG_COMMIT);
1301	lrd->length = 0;
1302	lmLog(log, tblk, lrd, NULL);
1303
1304	lmGroupCommit(log, tblk);
1305
1306	/*
1307	 *	- transaction is now committed -
1308	 */
1309
1310	/*
1311	 * force pages in careful update
1312	 * (imap addressing structure update)
1313	 */
1314	if (flag & COMMIT_FORCE)
1315		txForce(tblk);
1316
1317	/*
1318	 *	update allocation map.
1319	 *
1320	 * update inode allocation map and inode:
1321	 * free pager lock on memory object of inode if any.
1322	 * update block allocation map.
1323	 *
1324	 * txUpdateMap() resets XAD_NEW in XAD.
1325	 */
1326	if (tblk->xflag & COMMIT_FORCE)
1327		txUpdateMap(tblk);
1328
1329	/*
1330	 *	free transaction locks and pageout/free pages
1331	 */
1332	txRelease(tblk);
1333
1334	if ((tblk->flag & tblkGC_LAZY) == 0)
1335		txUnlock(tblk);
1336
1337
1338	/*
1339	 *	reset in-memory object state
1340	 */
1341	for (k = 0; k < cd.nip; k++) {
1342		ip = cd.iplist[k];
1343		jfs_ip = JFS_IP(ip);
1344
1345		/*
1346		 * reset in-memory inode state
1347		 */
1348		jfs_ip->bxflag = 0;
1349		jfs_ip->blid = 0;
1350	}
1351
1352      out:
1353	if (rc != 0)
1354		txAbort(tid, 1);
1355
1356      TheEnd:
1357	jfs_info("txCommit: tid = %d, returning %d", tid, rc);
1358	return rc;
1359}
1360
1361/*
1362 * NAME:	txLog()
1363 *
1364 * FUNCTION:	Writes AFTER log records for all lines modified
1365 *		by tid for segments specified by inodes in comdata.
1366 *		Code assumes only WRITELOCKS are recorded in lockwords.
1367 *
1368 * PARAMETERS:
1369 *
1370 * RETURN :
1371 */
1372static void txLog(struct jfs_log *log, struct tblock *tblk, struct commit *cd)
1373{
1374	struct inode *ip;
1375	lid_t lid;
1376	struct tlock *tlck;
1377	struct lrd *lrd = &cd->lrd;
1378
1379	/*
1380	 * write log record(s) for each tlock of transaction,
1381	 */
1382	for (lid = tblk->next; lid; lid = tlck->next) {
1383		tlck = lid_to_tlock(lid);
1384
1385		tlck->flag |= tlckLOG;
1386
1387		/* initialize lrd common */
1388		ip = tlck->ip;
1389		lrd->aggregate = cpu_to_le32(JFS_SBI(ip->i_sb)->aggregate);
1390		lrd->log.redopage.fileset = cpu_to_le32(JFS_IP(ip)->fileset);
1391		lrd->log.redopage.inode = cpu_to_le32(ip->i_ino);
1392
1393		/* write log record of page from the tlock */
1394		switch (tlck->type & tlckTYPE) {
1395		case tlckXTREE:
1396			xtLog(log, tblk, lrd, tlck);
1397			break;
1398
1399		case tlckDTREE:
1400			dtLog(log, tblk, lrd, tlck);
1401			break;
1402
1403		case tlckINODE:
1404			diLog(log, tblk, lrd, tlck, cd);
1405			break;
1406
1407		case tlckMAP:
1408			mapLog(log, tblk, lrd, tlck);
1409			break;
1410
1411		case tlckDATA:
1412			dataLog(log, tblk, lrd, tlck);
1413			break;
1414
1415		default:
1416			jfs_err("UFO tlock:0x%p", tlck);
1417		}
1418	}
1419
1420	return;
1421}
1422
1423/*
1424 *	diLog()
1425 *
1426 * function:	log inode tlock and format maplock to update bmap;
1427 */
1428static void diLog(struct jfs_log *log, struct tblock *tblk, struct lrd *lrd,
1429		 struct tlock *tlck, struct commit *cd)
1430{
1431	struct metapage *mp;
1432	pxd_t *pxd;
1433	struct pxd_lock *pxdlock;
1434
1435	mp = tlck->mp;
1436
1437	/* initialize as REDOPAGE record format */
1438	lrd->log.redopage.type = cpu_to_le16(LOG_INODE);
1439	lrd->log.redopage.l2linesize = cpu_to_le16(L2INODESLOTSIZE);
1440
1441	pxd = &lrd->log.redopage.pxd;
1442
1443	/*
1444	 *	inode after image
1445	 */
1446	if (tlck->type & tlckENTRY) {
1447		/* log after-image for logredo(): */
1448		lrd->type = cpu_to_le16(LOG_REDOPAGE);
1449		PXDaddress(pxd, mp->index);
1450		PXDlength(pxd,
1451			  mp->logical_size >> tblk->sb->s_blocksize_bits);
1452		lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck));
1453
1454		/* mark page as homeward bound */
1455		tlck->flag |= tlckWRITEPAGE;
1456	} else if (tlck->type & tlckFREE) {
1457		/*
1458		 *	free inode extent
1459		 *
1460		 * (pages of the freed inode extent have been invalidated and
1461		 * a maplock for free of the extent has been formatted at
1462		 * txLock() time);
1463		 *
1464		 * the tlock had been acquired on the inode allocation map page
1465		 * (iag) that specifies the freed extent, even though the map
1466		 * page is not itself logged, to prevent pageout of the map
1467		 * page before the log;
1468		 */
1469
1470		/* log LOG_NOREDOINOEXT of the freed inode extent for
1471		 * logredo() to start NoRedoPage filters, and to update
1472		 * imap and bmap for free of the extent;
1473		 */
1474		lrd->type = cpu_to_le16(LOG_NOREDOINOEXT);
1475		/*
1476		 * For the LOG_NOREDOINOEXT record, we need
1477		 * to pass the IAG number and inode extent
1478		 * index (within that IAG) from which the
1479		 * extent is being released.  These have been
1480		 * passed to us in the iplist[1] and iplist[2].
1481		 */
1482		lrd->log.noredoinoext.iagnum =
1483		    cpu_to_le32((u32) (size_t) cd->iplist[1]);
1484		lrd->log.noredoinoext.inoext_idx =
1485		    cpu_to_le32((u32) (size_t) cd->iplist[2]);
1486
1487		pxdlock = (struct pxd_lock *) & tlck->lock;
1488		*pxd = pxdlock->pxd;
1489		lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL));
1490
1491		/* update bmap */
1492		tlck->flag |= tlckUPDATEMAP;
1493
1494		/* mark page as homeward bound */
1495		tlck->flag |= tlckWRITEPAGE;
1496	} else
1497		jfs_err("diLog: UFO type tlck:0x%p", tlck);
1498	return;
1499}
1500
1501/*
1502 *	dataLog()
1503 *
1504 * function:	log data tlock
1505 */
1506static void dataLog(struct jfs_log *log, struct tblock *tblk, struct lrd *lrd,
1507	    struct tlock *tlck)
1508{
1509	struct metapage *mp;
1510	pxd_t *pxd;
1511
1512	mp = tlck->mp;
1513
1514	/* initialize as REDOPAGE record format */
1515	lrd->log.redopage.type = cpu_to_le16(LOG_DATA);
1516	lrd->log.redopage.l2linesize = cpu_to_le16(L2DATASLOTSIZE);
1517
1518	pxd = &lrd->log.redopage.pxd;
1519
1520	/* log after-image for logredo(): */
1521	lrd->type = cpu_to_le16(LOG_REDOPAGE);
1522
1523	if (jfs_dirtable_inline(tlck->ip)) {
1524		/*
		 * The table has been truncated; we must have deleted
		 * the last entry, so don't bother logging this
1527		 */
1528		mp->lid = 0;
1529		grab_metapage(mp);
1530		metapage_homeok(mp);
1531		discard_metapage(mp);
1532		tlck->mp = NULL;
1533		return;
1534	}
1535
1536	PXDaddress(pxd, mp->index);
1537	PXDlength(pxd, mp->logical_size >> tblk->sb->s_blocksize_bits);
1538
1539	lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck));
1540
1541	/* mark page as homeward bound */
1542	tlck->flag |= tlckWRITEPAGE;
1543
1544	return;
1545}
1546
1547/*
1548 *	dtLog()
1549 *
1550 * function:	log dtree tlock and format maplock to update bmap;
1551 */
1552static void dtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
1553	   struct tlock * tlck)
1554{
1555	struct metapage *mp;
1556	struct pxd_lock *pxdlock;
1557	pxd_t *pxd;
1558
1559	mp = tlck->mp;
1560
1561	/* initialize as REDOPAGE/NOREDOPAGE record format */
1562	lrd->log.redopage.type = cpu_to_le16(LOG_DTREE);
1563	lrd->log.redopage.l2linesize = cpu_to_le16(L2DTSLOTSIZE);
1564
1565	pxd = &lrd->log.redopage.pxd;
1566
1567	if (tlck->type & tlckBTROOT)
1568		lrd->log.redopage.type |= cpu_to_le16(LOG_BTROOT);
1569
1570	/*
1571	 *	page extension via relocation: entry insertion;
1572	 *	page extension in-place: entry insertion;
1573	 *	new right page from page split, reinitialized in-line
1574	 *	root from root page split: entry insertion;
1575	 */
1576	if (tlck->type & (tlckNEW | tlckEXTEND)) {
1577		/* log after-image of the new page for logredo():
1578		 * mark log (LOG_NEW) for logredo() to initialize
1579		 * freelist and update bmap for alloc of the new page;
1580		 */
1581		lrd->type = cpu_to_le16(LOG_REDOPAGE);
1582		if (tlck->type & tlckEXTEND)
1583			lrd->log.redopage.type |= cpu_to_le16(LOG_EXTEND);
1584		else
1585			lrd->log.redopage.type |= cpu_to_le16(LOG_NEW);
1586		PXDaddress(pxd, mp->index);
1587		PXDlength(pxd,
1588			  mp->logical_size >> tblk->sb->s_blocksize_bits);
1589		lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck));
1590
1591		/* format a maplock for txUpdateMap() to update bPMAP for
1592		 * alloc of the new page;
1593		 */
1594		if (tlck->type & tlckBTROOT)
1595			return;
1596		tlck->flag |= tlckUPDATEMAP;
1597		pxdlock = (struct pxd_lock *) & tlck->lock;
1598		pxdlock->flag = mlckALLOCPXD;
1599		pxdlock->pxd = *pxd;
1600
1601		pxdlock->index = 1;
1602
1603		/* mark page as homeward bound */
1604		tlck->flag |= tlckWRITEPAGE;
1605		return;
1606	}
1607
1608	/*
1609	 *	entry insertion/deletion,
1610	 *	sibling page link update (old right page before split);
1611	 */
1612	if (tlck->type & (tlckENTRY | tlckRELINK)) {
1613		/* log after-image for logredo(): */
1614		lrd->type = cpu_to_le16(LOG_REDOPAGE);
1615		PXDaddress(pxd, mp->index);
1616		PXDlength(pxd,
1617			  mp->logical_size >> tblk->sb->s_blocksize_bits);
1618		lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck));
1619
1620		/* mark page as homeward bound */
1621		tlck->flag |= tlckWRITEPAGE;
1622		return;
1623	}
1624
1625	/*
1626	 *	page deletion: page has been invalidated
1627	 *	page relocation: source extent
1628	 *
1629	 *	a maplock for free of the page has been formatted
1630	 *	at txLock() time);
1631	 */
1632	if (tlck->type & (tlckFREE | tlckRELOCATE)) {
1633		/* log LOG_NOREDOPAGE of the deleted page for logredo()
1634		 * to start NoRedoPage filter and to update bmap for free
		 * of the deleted page
1636		 */
1637		lrd->type = cpu_to_le16(LOG_NOREDOPAGE);
1638		pxdlock = (struct pxd_lock *) & tlck->lock;
1639		*pxd = pxdlock->pxd;
1640		lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL));
1641
1642		/* a maplock for txUpdateMap() for free of the page
1643		 * has been formatted at txLock() time;
1644		 */
1645		tlck->flag |= tlckUPDATEMAP;
1646	}
1647	return;
1648}
1649
1650/*
1651 *	xtLog()
1652 *
1653 * function:	log xtree tlock and format maplock to update bmap;
1654 */
1655static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
1656	   struct tlock * tlck)
1657{
1658	struct inode *ip;
1659	struct metapage *mp;
1660	xtpage_t *p;
1661	struct xtlock *xtlck;
1662	struct maplock *maplock;
1663	struct xdlistlock *xadlock;
1664	struct pxd_lock *pxdlock;
1665	pxd_t *page_pxd;
1666	int next, lwm, hwm;
1667
1668	ip = tlck->ip;
1669	mp = tlck->mp;
1670
1671	/* initialize as REDOPAGE/NOREDOPAGE record format */
1672	lrd->log.redopage.type = cpu_to_le16(LOG_XTREE);
1673	lrd->log.redopage.l2linesize = cpu_to_le16(L2XTSLOTSIZE);
1674
1675	page_pxd = &lrd->log.redopage.pxd;
1676
1677	if (tlck->type & tlckBTROOT) {
1678		lrd->log.redopage.type |= cpu_to_le16(LOG_BTROOT);
1679		p = (xtpage_t *) &JFS_IP(ip)->i_xtroot;
1680		if (S_ISDIR(ip->i_mode))
1681			lrd->log.redopage.type |=
1682			    cpu_to_le16(LOG_DIR_XTREE);
1683	} else
1684		p = (xtpage_t *) mp->data;
1685	next = le16_to_cpu(p->header.nextindex);
1686
1687	xtlck = (struct xtlock *) & tlck->lock;
1688
1689	maplock = (struct maplock *) & tlck->lock;
1690	xadlock = (struct xdlistlock *) maplock;
1691
1692	/*
1693	 *	entry insertion/extension;
1694	 *	sibling page link update (old right page before split);
1695	 */
1696	if (tlck->type & (tlckNEW | tlckGROW | tlckRELINK)) {
1697		/* log after-image for logredo():
1698		 * logredo() will update bmap for alloc of new/extended
1699		 * extents (XAD_NEW|XAD_EXTEND) of XAD[lwm:next) from
1700		 * after-image of XADlist;
1701		 * logredo() resets (XAD_NEW|XAD_EXTEND) flag when
1702		 * applying the after-image to the meta-data page.
1703		 */
1704		lrd->type = cpu_to_le16(LOG_REDOPAGE);
1705		PXDaddress(page_pxd, mp->index);
1706		PXDlength(page_pxd,
1707			  mp->logical_size >> tblk->sb->s_blocksize_bits);
1708		lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck));
1709
1710		/* format a maplock for txUpdateMap() to update bPMAP
1711		 * for alloc of new/extended extents of XAD[lwm:next)
1712		 * from the page itself;
1713		 * txUpdateMap() resets (XAD_NEW|XAD_EXTEND) flag.
1714		 */
1715		lwm = xtlck->lwm.offset;
1716		if (lwm == 0)
1717			lwm = XTPAGEMAXSLOT;
1718
1719		if (lwm == next)
1720			goto out;
1721		if (lwm > next) {
1722			jfs_err("xtLog: lwm > next");
1723			goto out;
1724		}
1725		tlck->flag |= tlckUPDATEMAP;
1726		xadlock->flag = mlckALLOCXADLIST;
1727		xadlock->count = next - lwm;
1728		if ((xadlock->count <= 4) && (tblk->xflag & COMMIT_LAZY)) {
1729			int i;
1730			pxd_t *pxd;
1731			/*
1732			 * Lazy commit may allow xtree to be modified before
1733			 * txUpdateMap runs.  Copy xad into linelock to
1734			 * preserve correct data.
1735			 *
			 * We can fit twice as many pxd's as xads in the lock
1737			 */
1738			xadlock->flag = mlckALLOCPXDLIST;
1739			pxd = xadlock->xdlist = &xtlck->pxdlock;
1740			for (i = 0; i < xadlock->count; i++) {
1741				PXDaddress(pxd, addressXAD(&p->xad[lwm + i]));
1742				PXDlength(pxd, lengthXAD(&p->xad[lwm + i]));
1743				p->xad[lwm + i].flag &=
1744				    ~(XAD_NEW | XAD_EXTENDED);
1745				pxd++;
1746			}
1747		} else {
1748			/*
			 * xdlist will point into the inode's xtree, ensure
1750			 * that transaction is not committed lazily.
1751			 */
1752			xadlock->flag = mlckALLOCXADLIST;
1753			xadlock->xdlist = &p->xad[lwm];
1754			tblk->xflag &= ~COMMIT_LAZY;
1755		}
1756		jfs_info("xtLog: alloc ip:0x%p mp:0x%p tlck:0x%p lwm:%d count:%d",
1757			 tlck->ip, mp, tlck, lwm, xadlock->count);
1758
1759		maplock->index = 1;
1760
1761	      out:
1762		/* mark page as homeward bound */
1763		tlck->flag |= tlckWRITEPAGE;
1764
1765		return;
1766	}
1767
1768	/*
1769	 *	page deletion: file deletion/truncation (ref. xtTruncate())
1770	 *
1771	 * (page will be invalidated after log is written and bmap
1772	 * is updated from the page);
1773	 */
1774	if (tlck->type & tlckFREE) {
1775		/* LOG_NOREDOPAGE log for NoRedoPage filter:
1776		 * if page free from file delete, NoRedoFile filter from
1777		 * inode image of zero link count will subsume NoRedoPage
1778		 * filters for each page;
		 * if page free from file truncation, write NoRedoPage
		 * filter;
		 *
		 * update of block allocation map for the page itself:
1783		 * if page free from deletion and truncation, LOG_UPDATEMAP
1784		 * log for the page itself is generated from processing
1785		 * its parent page xad entries;
1786		 */
1787		/* if page free from file truncation, log LOG_NOREDOPAGE
1788		 * of the deleted page for logredo() to start NoRedoPage
1789		 * filter for the page;
1790		 */
1791		if (tblk->xflag & COMMIT_TRUNCATE) {
1792			/* write NOREDOPAGE for the page */
1793			lrd->type = cpu_to_le16(LOG_NOREDOPAGE);
1794			PXDaddress(page_pxd, mp->index);
1795			PXDlength(page_pxd,
1796				  mp->logical_size >> tblk->sb->
1797				  s_blocksize_bits);
1798			lrd->backchain =
1799			    cpu_to_le32(lmLog(log, tblk, lrd, NULL));
1800
1801			if (tlck->type & tlckBTROOT) {
1802				/* Empty xtree must be logged */
1803				lrd->type = cpu_to_le16(LOG_REDOPAGE);
1804				lrd->backchain =
1805				    cpu_to_le32(lmLog(log, tblk, lrd, tlck));
1806			}
1807		}
1808
1809		/* init LOG_UPDATEMAP of the freed extents
1810		 * XAD[XTENTRYSTART:hwm) from the deleted page itself
1811		 * for logredo() to update bmap;
1812		 */
1813		lrd->type = cpu_to_le16(LOG_UPDATEMAP);
1814		lrd->log.updatemap.type = cpu_to_le16(LOG_FREEXADLIST);
1815		xtlck = (struct xtlock *) & tlck->lock;
1816		hwm = xtlck->hwm.offset;
1817		lrd->log.updatemap.nxd =
1818		    cpu_to_le16(hwm - XTENTRYSTART + 1);
1819		/* reformat linelock for lmLog() */
1820		xtlck->header.offset = XTENTRYSTART;
1821		xtlck->header.length = hwm - XTENTRYSTART + 1;
1822		xtlck->index = 1;
1823		lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck));
1824
1825		/* format a maplock for txUpdateMap() to update bmap
1826		 * to free extents of XAD[XTENTRYSTART:hwm) from the
1827		 * deleted page itself;
1828		 */
1829		tlck->flag |= tlckUPDATEMAP;
1830		xadlock->count = hwm - XTENTRYSTART + 1;
1831		if ((xadlock->count <= 4) && (tblk->xflag & COMMIT_LAZY)) {
1832			int i;
1833			pxd_t *pxd;
1834			/*
1835			 * Lazy commit may allow xtree to be modified before
1836			 * txUpdateMap runs.  Copy xad into linelock to
1837			 * preserve correct data.
1838			 *
			 * We can fit twice as many pxd's as xads in the lock
1840			 */
1841			xadlock->flag = mlckFREEPXDLIST;
1842			pxd = xadlock->xdlist = &xtlck->pxdlock;
1843			for (i = 0; i < xadlock->count; i++) {
1844				PXDaddress(pxd,
1845					addressXAD(&p->xad[XTENTRYSTART + i]));
1846				PXDlength(pxd,
1847					lengthXAD(&p->xad[XTENTRYSTART + i]));
1848				pxd++;
1849			}
1850		} else {
1851			/*
			 * xdlist will point into the inode's xtree, ensure
1853			 * that transaction is not committed lazily.
1854			 */
1855			xadlock->flag = mlckFREEXADLIST;
1856			xadlock->xdlist = &p->xad[XTENTRYSTART];
1857			tblk->xflag &= ~COMMIT_LAZY;
1858		}
1859		jfs_info("xtLog: free ip:0x%p mp:0x%p count:%d lwm:2",
1860			 tlck->ip, mp, xadlock->count);
1861
1862		maplock->index = 1;
1863
1864		/* mark page as invalid */
1865		if (((tblk->xflag & COMMIT_PWMAP) || S_ISDIR(ip->i_mode))
1866		    && !(tlck->type & tlckBTROOT))
1867			tlck->flag |= tlckFREEPAGE;
1868		/*
1869		   else (tblk->xflag & COMMIT_PMAP)
1870		   ? release the page;
1871		 */
1872		return;
1873	}
1874
1875	/*
1876	 *	page/entry truncation: file truncation (ref. xtTruncate())
1877	 *
1878	 *	|----------+------+------+---------------|
1879	 *		   |      |      |
1880	 *		   |      |     hwm - hwm before truncation
1881	 *		   |     next - truncation point
1882	 *		  lwm - lwm before truncation
1883	 * header ?
1884	 */
1885	if (tlck->type & tlckTRUNCATE) {
1886		pxd_t pxd;	/* truncated extent of xad */
1887		int twm;
1888
1889		/*
1890		 * For truncation the entire linelock may be used, so it would
1891		 * be difficult to store xad list in linelock itself.
1892		 * Therefore, we'll just force transaction to be committed
1893		 * synchronously, so that xtree pages won't be changed before
1894		 * txUpdateMap runs.
1895		 */
1896		tblk->xflag &= ~COMMIT_LAZY;
1897		lwm = xtlck->lwm.offset;
1898		if (lwm == 0)
1899			lwm = XTPAGEMAXSLOT;
1900		hwm = xtlck->hwm.offset;
1901		twm = xtlck->twm.offset;
1902
1903		/*
1904		 *	write log records
1905		 */
1906		/* log after-image for logredo():
1907		 *
1908		 * logredo() will update bmap for alloc of new/extended
1909		 * extents (XAD_NEW|XAD_EXTEND) of XAD[lwm:next) from
1910		 * after-image of XADlist;
1911		 * logredo() resets (XAD_NEW|XAD_EXTEND) flag when
1912		 * applying the after-image to the meta-data page.
1913		 */
1914		lrd->type = cpu_to_le16(LOG_REDOPAGE);
1915		PXDaddress(page_pxd, mp->index);
1916		PXDlength(page_pxd,
1917			  mp->logical_size >> tblk->sb->s_blocksize_bits);
1918		lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck));
1919
1920		/*
1921		 * truncate entry XAD[twm == next - 1]:
1922		 */
1923		if (twm == next - 1) {
1924			/* init LOG_UPDATEMAP for logredo() to update bmap for
1925			 * free of truncated delta extent of the truncated
1926			 * entry XAD[next - 1]:
1927			 * (xtlck->pxdlock = truncated delta extent);
1928			 */
1929			pxdlock = (struct pxd_lock *) & xtlck->pxdlock;
1930			/* assert(pxdlock->type & tlckTRUNCATE); */
1931			lrd->type = cpu_to_le16(LOG_UPDATEMAP);
1932			lrd->log.updatemap.type = cpu_to_le16(LOG_FREEPXD);
1933			lrd->log.updatemap.nxd = cpu_to_le16(1);
1934			lrd->log.updatemap.pxd = pxdlock->pxd;
1935			pxd = pxdlock->pxd;	/* save to format maplock */
1936			lrd->backchain =
1937			    cpu_to_le32(lmLog(log, tblk, lrd, NULL));
1938		}
1939
1940		/*
1941		 * free entries XAD[next:hwm]:
1942		 */
1943		if (hwm >= next) {
1944			/* init LOG_UPDATEMAP of the freed extents
1945			 * XAD[next:hwm] from the deleted page itself
1946			 * for logredo() to update bmap;
1947			 */
1948			lrd->type = cpu_to_le16(LOG_UPDATEMAP);
1949			lrd->log.updatemap.type =
1950			    cpu_to_le16(LOG_FREEXADLIST);
1951			xtlck = (struct xtlock *) & tlck->lock;
1952			hwm = xtlck->hwm.offset;
1953			lrd->log.updatemap.nxd =
1954			    cpu_to_le16(hwm - next + 1);
1955			/* reformat linelock for lmLog() */
1956			xtlck->header.offset = next;
1957			xtlck->header.length = hwm - next + 1;
1958			xtlck->index = 1;
1959			lrd->backchain =
1960			    cpu_to_le32(lmLog(log, tblk, lrd, tlck));
1961		}
1962
1963		/*
1964		 *	format maplock(s) for txUpdateMap() to update bmap
1965		 */
1966		maplock->index = 0;
1967
1968		/*
1969		 * allocate entries XAD[lwm:next):
1970		 */
1971		if (lwm < next) {
1972			/* format a maplock for txUpdateMap() to update bPMAP
1973			 * for alloc of new/extended extents of XAD[lwm:next)
1974			 * from the page itself;
1975			 * txUpdateMap() resets (XAD_NEW|XAD_EXTEND) flag.
1976			 */
1977			tlck->flag |= tlckUPDATEMAP;
1978			xadlock->flag = mlckALLOCXADLIST;
1979			xadlock->count = next - lwm;
1980			xadlock->xdlist = &p->xad[lwm];
1981
1982			jfs_info("xtLog: alloc ip:0x%p mp:0x%p count:%d lwm:%d next:%d",
1983				 tlck->ip, mp, xadlock->count, lwm, next);
1984			maplock->index++;
1985			xadlock++;
1986		}
1987
1988		/*
1989		 * truncate entry XAD[twm == next - 1]:
1990		 */
1991		if (twm == next - 1) {
1992			/* format a maplock for txUpdateMap() to update bmap
1993			 * to free truncated delta extent of the truncated
1994			 * entry XAD[next - 1];
1995			 * (xtlck->pxdlock = truncated delta extent);
1996			 */
1997			tlck->flag |= tlckUPDATEMAP;
1998			pxdlock = (struct pxd_lock *) xadlock;
1999			pxdlock->flag = mlckFREEPXD;
2000			pxdlock->count = 1;
2001			pxdlock->pxd = pxd;
2002
2003			jfs_info("xtLog: truncate ip:0x%p mp:0x%p count:%d hwm:%d",
2004				 ip, mp, pxdlock->count, hwm);
2005			maplock->index++;
2006			xadlock++;
2007		}
2008
2009		/*
2010		 * free entries XAD[next:hwm]:
2011		 */
2012		if (hwm >= next) {
2013			/* format a maplock for txUpdateMap() to update bmap
			 * to free extents of XAD[next:hwm] from the deleted
2015			 * page itself;
2016			 */
2017			tlck->flag |= tlckUPDATEMAP;
2018			xadlock->flag = mlckFREEXADLIST;
2019			xadlock->count = hwm - next + 1;
2020			xadlock->xdlist = &p->xad[next];
2021
2022			jfs_info("xtLog: free ip:0x%p mp:0x%p count:%d next:%d hwm:%d",
2023				 tlck->ip, mp, xadlock->count, next, hwm);
2024			maplock->index++;
2025		}
2026
2027		/* mark page as homeward bound */
2028		tlck->flag |= tlckWRITEPAGE;
2029	}
2030	return;
2031}
2032
2033/*
2034 *	mapLog()
2035 *
2036 * function:	log from maplock of freed data extents;
2037 */
2038static void mapLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
2039		   struct tlock * tlck)
2040{
2041	struct pxd_lock *pxdlock;
2042	int i, nlock;
2043	pxd_t *pxd;
2044
2045	/*
2046	 *	page relocation: free the source page extent
2047	 *
2048	 * a maplock for txUpdateMap() for free of the page
2049	 * has been formatted at txLock() time saving the src
2050	 * relocated page address;
2051	 */
2052	if (tlck->type & tlckRELOCATE) {
2053		/* log LOG_NOREDOPAGE of the old relocated page
2054		 * for logredo() to start NoRedoPage filter;
2055		 */
2056		lrd->type = cpu_to_le16(LOG_NOREDOPAGE);
2057		pxdlock = (struct pxd_lock *) & tlck->lock;
2058		pxd = &lrd->log.redopage.pxd;
2059		*pxd = pxdlock->pxd;
2060		lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL));
2061
2062		/* (N.B. currently, logredo() does NOT update bmap
2063		 * for free of the page itself for (LOG_XTREE|LOG_NOREDOPAGE);
2064		 * if page free from relocation, LOG_UPDATEMAP log is
2065		 * specifically generated now for logredo()
2066		 * to update bmap for free of src relocated page;
2067		 * (new flag LOG_RELOCATE may be introduced which will
2068		 * inform logredo() to start NORedoPage filter and also
2069		 * update block allocation map at the same time, thus
2070		 * avoiding an extra log write);
2071		 */
2072		lrd->type = cpu_to_le16(LOG_UPDATEMAP);
2073		lrd->log.updatemap.type = cpu_to_le16(LOG_FREEPXD);
2074		lrd->log.updatemap.nxd = cpu_to_le16(1);
2075		lrd->log.updatemap.pxd = pxdlock->pxd;
2076		lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL));
2077
2078		/* a maplock for txUpdateMap() for free of the page
2079		 * has been formatted at txLock() time;
2080		 */
2081		tlck->flag |= tlckUPDATEMAP;
2082		return;
2083	}
	/*
	 * Otherwise it's not a relocate request
	 */
2089	else {
2090		/* log LOG_UPDATEMAP for logredo() to update bmap for
2091		 * free of truncated/relocated delta extent of the data;
2092		 * e.g.: external EA extent, relocated/truncated extent
2093		 * from xtTailgate();
2094		 */
2095		lrd->type = cpu_to_le16(LOG_UPDATEMAP);
2096		pxdlock = (struct pxd_lock *) & tlck->lock;
2097		nlock = pxdlock->index;
2098		for (i = 0; i < nlock; i++, pxdlock++) {
2099			if (pxdlock->flag & mlckALLOCPXD)
2100				lrd->log.updatemap.type =
2101				    cpu_to_le16(LOG_ALLOCPXD);
2102			else
2103				lrd->log.updatemap.type =
2104				    cpu_to_le16(LOG_FREEPXD);
2105			lrd->log.updatemap.nxd = cpu_to_le16(1);
2106			lrd->log.updatemap.pxd = pxdlock->pxd;
2107			lrd->backchain =
2108			    cpu_to_le32(lmLog(log, tblk, lrd, NULL));
2109			jfs_info("mapLog: xaddr:0x%lx xlen:0x%x",
2110				 (ulong) addressPXD(&pxdlock->pxd),
2111				 lengthPXD(&pxdlock->pxd));
2112		}
2113
2114		/* update bmap */
2115		tlck->flag |= tlckUPDATEMAP;
2116	}
2117}
2118
2119/*
2120 *	txEA()
2121 *
2122 * function:	acquire maplock for EA/ACL extents or
2123 *		set COMMIT_INLINE flag;
2124 */
2125void txEA(tid_t tid, struct inode *ip, dxd_t * oldea, dxd_t * newea)
2126{
2127	struct tlock *tlck = NULL;
2128	struct pxd_lock *maplock = NULL, *pxdlock = NULL;
2129
2130	/*
2131	 * format maplock for alloc of new EA extent
2132	 */
2133	if (newea) {
2134		/* Since the newea could be a completely zeroed entry we need to
2135		 * check for the two flags which indicate we should actually
2136		 * commit new EA data
2137		 */
2138		if (newea->flag & DXD_EXTENT) {
2139			tlck = txMaplock(tid, ip, tlckMAP);
2140			maplock = (struct pxd_lock *) & tlck->lock;
2141			pxdlock = (struct pxd_lock *) maplock;
2142			pxdlock->flag = mlckALLOCPXD;
2143			PXDaddress(&pxdlock->pxd, addressDXD(newea));
2144			PXDlength(&pxdlock->pxd, lengthDXD(newea));
2145			pxdlock++;
2146			maplock->index = 1;
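			/* pxdlock now points at the next slot in this tlock,
			 * used below if the old EA extent must also be freed.
			 */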
2147		} else if (newea->flag & DXD_INLINE) {
2148			tlck = NULL;
2149
2150			set_cflag(COMMIT_Inlineea, ip);
2151		}
2152	}
2153
2154	/*
2155	 * format maplock for free of old EA extent
2156	 */
2157	if (!test_cflag(COMMIT_Nolink, ip) && oldea->flag & DXD_EXTENT) {
2158		if (tlck == NULL) {
2159			tlck = txMaplock(tid, ip, tlckMAP);
2160			maplock = (struct pxd_lock *) & tlck->lock;
2161			pxdlock = (struct pxd_lock *) maplock;
2162			maplock->index = 0;
2163		}
2164		pxdlock->flag = mlckFREEPXD;
2165		PXDaddress(&pxdlock->pxd, addressDXD(oldea));
2166		PXDlength(&pxdlock->pxd, lengthDXD(oldea));
2167		maplock->index++;
2168	}
2169}
2170
2171/*
2172 *	txForce()
2173 *
2174 * function: synchronously write pages locked by transaction
2175 *	     after txLog() but before txUpdateMap();
2176 */
2177static void txForce(struct tblock * tblk)
2178{
2179	struct tlock *tlck;
2180	lid_t lid, next;
2181	struct metapage *mp;
2182
2183	/*
	 * reverse the order of transaction tlocks to enforce the
	 * careful update order of address index pages
	 * (right to left, bottom up)
2187	 */
2188	tlck = lid_to_tlock(tblk->next);
2189	lid = tlck->next;
2190	tlck->next = 0;
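	/* pop each remaining tlock off the original chain and push it
	 * back on at tblk->next, reversing the list in place.
	 */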
2191	while (lid) {
2192		tlck = lid_to_tlock(lid);
2193		next = tlck->next;
2194		tlck->next = tblk->next;
2195		tblk->next = lid;
2196		lid = next;
2197	}
2198
2199	/*
2200	 * synchronously write the page, and
2201	 * hold the page for txUpdateMap();
2202	 */
2203	for (lid = tblk->next; lid; lid = next) {
2204		tlck = lid_to_tlock(lid);
2205		next = tlck->next;
2206
2207		if ((mp = tlck->mp) != NULL &&
2208		    (tlck->type & tlckBTROOT) == 0) {
2209			assert(mp->xflag & COMMIT_PAGE);
2210
2211			if (tlck->flag & tlckWRITEPAGE) {
2212				tlck->flag &= ~tlckWRITEPAGE;
2213
2214				/* do not release page to freelist */
2215				force_metapage(mp);
2216#if 0
2217				/*
2218				 * The "right" thing to do here is to
2219				 * synchronously write the metadata.
2220				 * With the current implementation this
2221				 * is hard since write_metapage requires
2222				 * us to kunmap & remap the page.  If we
2223				 * have tlocks pointing into the metadata
2224				 * pages, we don't want to do this.  I think
2225				 * we can get by with synchronously writing
2226				 * the pages when they are released.
2227				 */
2228				assert(mp->nohomeok);
2229				set_bit(META_dirty, &mp->flag);
2230				set_bit(META_sync, &mp->flag);
2231#endif
2232			}
2233		}
2234	}
2235}
2236
2237/*
2238 *	txUpdateMap()
2239 *
2240 * function:	update persistent allocation map (and working map
2241 *		if appropriate);
2242 *
2243 * parameter:
2244 */
2245static void txUpdateMap(struct tblock * tblk)
2246{
2247	struct inode *ip;
2248	struct inode *ipimap;
2249	lid_t lid;
2250	struct tlock *tlck;
2251	struct maplock *maplock;
2252	struct pxd_lock pxdlock;
2253	int maptype;
2254	int k, nlock;
2255	struct metapage *mp = NULL;
2256
2257	ipimap = JFS_SBI(tblk->sb)->ipimap;
2258
2259	maptype = (tblk->xflag & COMMIT_PMAP) ? COMMIT_PMAP : COMMIT_PWMAP;
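	/* COMMIT_PMAP: free blocks in the persistent map only for now;
	 * the working map is updated when the last reference to the
	 * object is released (see the per-maplock comments below).
	 */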
2260
2261
2262	/*
2263	 *	update block allocation map
2264	 *
2265	 * update allocation state in pmap (and wmap) and
2266	 * update lsn of the pmap page;
2267	 */
2268	/*
2269	 * scan each tlock/page of transaction for block allocation/free:
2270	 *
2271	 * for each tlock/page of transaction, update map.
	 *  ? are there tlocks for pmap and pwmap at the same time ?
2273	 */
2274	for (lid = tblk->next; lid; lid = tlck->next) {
2275		tlck = lid_to_tlock(lid);
2276
2277		if ((tlck->flag & tlckUPDATEMAP) == 0)
2278			continue;
2279
2280		if (tlck->flag & tlckFREEPAGE) {
2281			/*
2282			 * Another thread may attempt to reuse freed space
2283			 * immediately, so we want to get rid of the metapage
2284			 * before anyone else has a chance to get it.
2285			 * Lock metapage, update maps, then invalidate
2286			 * the metapage.
2287			 */
2288			mp = tlck->mp;
2289			ASSERT(mp->xflag & COMMIT_PAGE);
2290			grab_metapage(mp);
2291		}
2292
2293		/*
2294		 * extent list:
2295		 * . in-line PXD list:
2296		 * . out-of-line XAD list:
2297		 */
2298		maplock = (struct maplock *) & tlck->lock;
2299		nlock = maplock->index;
2300
2301		for (k = 0; k < nlock; k++, maplock++) {
2302			/*
2303			 * allocate blocks in persistent map:
2304			 *
2305			 * blocks have been allocated from wmap at alloc time;
2306			 */
2307			if (maplock->flag & mlckALLOC) {
2308				txAllocPMap(ipimap, maplock, tblk);
2309			}
2310			/*
2311			 * free blocks in persistent and working map:
2312			 * blocks will be freed in pmap and then in wmap;
2313			 *
			 * ? the tblock specifies PMAP or PWMAP based upon the
			 * transaction type
2316			 *
2317			 * free blocks in persistent map:
2318			 * blocks will be freed from wmap at last reference
2319			 * release of the object for regular files;
2320			 *
			 * Always free blocks from both persistent & working
2322			 * maps for directories
2323			 */
2324			else {	/* (maplock->flag & mlckFREE) */
2325
2326				if (tlck->flag & tlckDIRECTORY)
2327					txFreeMap(ipimap, maplock,
2328						  tblk, COMMIT_PWMAP);
2329				else
2330					txFreeMap(ipimap, maplock,
2331						  tblk, maptype);
2332			}
2333		}
2334		if (tlck->flag & tlckFREEPAGE) {
2335			if (!(tblk->flag & tblkGC_LAZY)) {
2336				/* This is equivalent to txRelease */
2337				ASSERT(mp->lid == lid);
2338				tlck->mp->lid = 0;
2339			}
2340			assert(mp->nohomeok == 1);
2341			metapage_homeok(mp);
2342			discard_metapage(mp);
2343			tlck->mp = NULL;
2344		}
2345	}
2346	/*
2347	 *	update inode allocation map
2348	 *
2349	 * update allocation state in pmap and
2350	 * update lsn of the pmap page;
2351	 * update in-memory inode flag/state
2352	 *
2353	 * unlock mapper/write lock
2354	 */
2355	if (tblk->xflag & COMMIT_CREATE) {
2356		diUpdatePMap(ipimap, tblk->ino, false, tblk);
2357		/* update persistent block allocation map
2358		 * for the allocation of inode extent;
2359		 */
2360		pxdlock.flag = mlckALLOCPXD;
2361		pxdlock.pxd = tblk->u.ixpxd;
2362		pxdlock.index = 1;
2363		txAllocPMap(ipimap, (struct maplock *) & pxdlock, tblk);
2364	} else if (tblk->xflag & COMMIT_DELETE) {
2365		ip = tblk->u.ip;
2366		diUpdatePMap(ipimap, ip->i_ino, true, tblk);
2367		iput(ip);
2368	}
2369}
2370
2371/*
2372 *	txAllocPMap()
2373 *
2374 * function: allocate from persistent map;
2375 *
2376 * parameter:
2377 *	ipbmap	-
 *	maplock	-
2379 *		xad list:
2380 *		pxd:
2381 *
2382 *	maptype -
2383 *		allocate from persistent map;
2384 *		free from persistent map;
 *		(e.g., tmp file - free from working map at release
2386 *		 of last reference);
2387 *		free from persistent and working map;
2388 *
2389 *	lsn	- log sequence number;
2390 */
2391static void txAllocPMap(struct inode *ip, struct maplock * maplock,
2392			struct tblock * tblk)
2393{
2394	struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
2395	struct xdlistlock *xadlistlock;
2396	xad_t *xad;
2397	s64 xaddr;
2398	int xlen;
2399	struct pxd_lock *pxdlock;
2400	struct xdlistlock *pxdlistlock;
2401	pxd_t *pxd;
2402	int n;
2403
2404	/*
2405	 * allocate from persistent map;
2406	 */
2407	if (maplock->flag & mlckALLOCXADLIST) {
2408		xadlistlock = (struct xdlistlock *) maplock;
2409		xad = xadlistlock->xdlist;
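		/* only extents newly allocated or extended by this
		 * transaction need persistent-map bits set; the rest are
		 * already marked allocated on disk.
		 */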
2410		for (n = 0; n < xadlistlock->count; n++, xad++) {
2411			if (xad->flag & (XAD_NEW | XAD_EXTENDED)) {
2412				xaddr = addressXAD(xad);
2413				xlen = lengthXAD(xad);
2414				dbUpdatePMap(ipbmap, false, xaddr,
2415					     (s64) xlen, tblk);
2416				xad->flag &= ~(XAD_NEW | XAD_EXTENDED);
2417				jfs_info("allocPMap: xaddr:0x%lx xlen:%d",
2418					 (ulong) xaddr, xlen);
2419			}
2420		}
2421	} else if (maplock->flag & mlckALLOCPXD) {
2422		pxdlock = (struct pxd_lock *) maplock;
2423		xaddr = addressPXD(&pxdlock->pxd);
2424		xlen = lengthPXD(&pxdlock->pxd);
2425		dbUpdatePMap(ipbmap, false, xaddr, (s64) xlen, tblk);
2426		jfs_info("allocPMap: xaddr:0x%lx xlen:%d", (ulong) xaddr, xlen);
2427	} else {		/* (maplock->flag & mlckALLOCPXDLIST) */
2428
2429		pxdlistlock = (struct xdlistlock *) maplock;
2430		pxd = pxdlistlock->xdlist;
2431		for (n = 0; n < pxdlistlock->count; n++, pxd++) {
2432			xaddr = addressPXD(pxd);
2433			xlen = lengthPXD(pxd);
2434			dbUpdatePMap(ipbmap, false, xaddr, (s64) xlen,
2435				     tblk);
2436			jfs_info("allocPMap: xaddr:0x%lx xlen:%d",
2437				 (ulong) xaddr, xlen);
2438		}
2439	}
2440}
2441
2442/*
2443 *	txFreeMap()
2444 *
2445 * function:	free from persistent and/or working map;
2446 *
2447 * todo: optimization
2448 */
2449void txFreeMap(struct inode *ip,
2450	       struct maplock * maplock, struct tblock * tblk, int maptype)
2451{
2452	struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
2453	struct xdlistlock *xadlistlock;
2454	xad_t *xad;
2455	s64 xaddr;
2456	int xlen;
2457	struct pxd_lock *pxdlock;
2458	struct xdlistlock *pxdlistlock;
2459	pxd_t *pxd;
2460	int n;
2461
2462	jfs_info("txFreeMap: tblk:0x%p maplock:0x%p maptype:0x%x",
2463		 tblk, maplock, maptype);
2464
2465	/*
2466	 * free from persistent map;
2467	 */
2468	if (maptype == COMMIT_PMAP || maptype == COMMIT_PWMAP) {
2469		if (maplock->flag & mlckFREEXADLIST) {
2470			xadlistlock = (struct xdlistlock *) maplock;
2471			xad = xadlistlock->xdlist;
2472			for (n = 0; n < xadlistlock->count; n++, xad++) {
2473				if (!(xad->flag & XAD_NEW)) {
2474					xaddr = addressXAD(xad);
2475					xlen = lengthXAD(xad);
2476					dbUpdatePMap(ipbmap, true, xaddr,
2477						     (s64) xlen, tblk);
2478					jfs_info("freePMap: xaddr:0x%lx xlen:%d",
2479						 (ulong) xaddr, xlen);
2480				}
2481			}
2482		} else if (maplock->flag & mlckFREEPXD) {
2483			pxdlock = (struct pxd_lock *) maplock;
2484			xaddr = addressPXD(&pxdlock->pxd);
2485			xlen = lengthPXD(&pxdlock->pxd);
2486			dbUpdatePMap(ipbmap, true, xaddr, (s64) xlen,
2487				     tblk);
2488			jfs_info("freePMap: xaddr:0x%lx xlen:%d",
2489				 (ulong) xaddr, xlen);
		} else {	/* (maplock->flag & mlckFREEPXDLIST) */
2491
2492			pxdlistlock = (struct xdlistlock *) maplock;
2493			pxd = pxdlistlock->xdlist;
2494			for (n = 0; n < pxdlistlock->count; n++, pxd++) {
2495				xaddr = addressPXD(pxd);
2496				xlen = lengthPXD(pxd);
2497				dbUpdatePMap(ipbmap, true, xaddr,
2498					     (s64) xlen, tblk);
2499				jfs_info("freePMap: xaddr:0x%lx xlen:%d",
2500					 (ulong) xaddr, xlen);
2501			}
2502		}
2503	}
2504
2505	/*
2506	 * free from working map;
2507	 */
2508	if (maptype == COMMIT_PWMAP || maptype == COMMIT_WMAP) {
2509		if (maplock->flag & mlckFREEXADLIST) {
2510			xadlistlock = (struct xdlistlock *) maplock;
2511			xad = xadlistlock->xdlist;
2512			for (n = 0; n < xadlistlock->count; n++, xad++) {
2513				xaddr = addressXAD(xad);
2514				xlen = lengthXAD(xad);
2515				dbFree(ip, xaddr, (s64) xlen);
2516				xad->flag = 0;
2517				jfs_info("freeWMap: xaddr:0x%lx xlen:%d",
2518					 (ulong) xaddr, xlen);
2519			}
2520		} else if (maplock->flag & mlckFREEPXD) {
2521			pxdlock = (struct pxd_lock *) maplock;
2522			xaddr = addressPXD(&pxdlock->pxd);
2523			xlen = lengthPXD(&pxdlock->pxd);
2524			dbFree(ip, xaddr, (s64) xlen);
2525			jfs_info("freeWMap: xaddr:0x%lx xlen:%d",
2526				 (ulong) xaddr, xlen);
2527		} else {	/* (maplock->flag & mlckFREEPXDLIST) */
2528
2529			pxdlistlock = (struct xdlistlock *) maplock;
2530			pxd = pxdlistlock->xdlist;
2531			for (n = 0; n < pxdlistlock->count; n++, pxd++) {
2532				xaddr = addressPXD(pxd);
2533				xlen = lengthPXD(pxd);
2534				dbFree(ip, xaddr, (s64) xlen);
2535				jfs_info("freeWMap: xaddr:0x%lx xlen:%d",
2536					 (ulong) xaddr, xlen);
2537			}
2538		}
2539	}
2540}
2541
2542/*
2543 *	txFreelock()
2544 *
2545 * function:	remove tlock from inode anonymous locklist
2546 */
2547void txFreelock(struct inode *ip)
2548{
2549	struct jfs_inode_info *jfs_ip = JFS_IP(ip);
2550	struct tlock *xtlck, *tlck;
2551	lid_t xlid = 0, lid;
2552
2553	if (!jfs_ip->atlhead)
2554		return;
2555
2556	TXN_LOCK();
2557	xtlck = (struct tlock *) &jfs_ip->atlhead;
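	/* this cast relies on 'next' being the first member of struct
	 * tlock: atlhead acts as the 'next' field of a dummy tlock, so
	 * the loop below can unlink entries without special-casing the
	 * list head.
	 */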
2558
2559	while ((lid = xtlck->next) != 0) {
2560		tlck = lid_to_tlock(lid);
2561		if (tlck->flag & tlckFREELOCK) {
2562			xtlck->next = tlck->next;
2563			txLockFree(lid);
2564		} else {
2565			xtlck = tlck;
2566			xlid = lid;
2567		}
2568	}
2569
2570	if (jfs_ip->atlhead)
2571		jfs_ip->atltail = xlid;
2572	else {
2573		jfs_ip->atltail = 0;
2574		/*
2575		 * If inode was on anon_list, remove it
2576		 */
2577		list_del_init(&jfs_ip->anon_inode_list);
2578	}
2579	TXN_UNLOCK();
2580}
2581
2582/*
2583 *	txAbort()
2584 *
2585 * function: abort tx before commit;
2586 *
2587 * frees line-locks and segment locks for all
2588 * segments in comdata structure.
2589 * Optionally sets state of file-system to FM_DIRTY in super-block.
 * log ages of in-memory page frames updated by the caller
 * are reset to 0 (to avoid log wrap).
2592 */
2593void txAbort(tid_t tid, int dirty)
2594{
2595	lid_t lid, next;
2596	struct metapage *mp;
2597	struct tblock *tblk = tid_to_tblock(tid);
2598	struct tlock *tlck;
2599
2600	/*
2601	 * free tlocks of the transaction
2602	 */
2603	for (lid = tblk->next; lid; lid = next) {
2604		tlck = lid_to_tlock(lid);
2605		next = tlck->next;
2606		mp = tlck->mp;
2607		JFS_IP(tlck->ip)->xtlid = 0;
2608
2609		if (mp) {
2610			mp->lid = 0;
2611
2612			/*
			 * reset lsn of page to avoid log wrap:
2614			 *
2615			 * (page may have been previously committed by another
2616			 * transaction(s) but has not been paged, i.e.,
2617			 * it may be on logsync list even though it has not
2618			 * been logged for the current tx.)
2619			 */
2620			if (mp->xflag & COMMIT_PAGE && mp->lsn)
2621				LogSyncRelease(mp);
2622		}
2623		/* insert tlock at head of freelist */
2624		TXN_LOCK();
2625		txLockFree(lid);
2626		TXN_UNLOCK();
2627	}
2628
2629	/* caller will free the transaction block */
2630
2631	tblk->next = tblk->last = 0;
2632
2633	/*
2634	 * mark filesystem dirty
2635	 */
2636	if (dirty)
2637		jfs_error(tblk->sb, "\n");
2638
2639	return;
2640}
2641
2642/*
2643 *	txLazyCommit(void)
2644 *
2645 *	All transactions except those changing ipimap (COMMIT_FORCE) are
 *	processed by this routine.  This ensures that the inode and block
2647 *	allocation maps are updated in order.  For synchronous transactions,
2648 *	let the user thread finish processing after txUpdateMap() is called.
2649 */
2650static void txLazyCommit(struct tblock * tblk)
2651{
2652	struct jfs_log *log;
2653
2654	while (((tblk->flag & tblkGC_READY) == 0) &&
2655	       ((tblk->flag & tblkGC_UNLOCKED) == 0)) {
2656		/* We must have gotten ahead of the user thread
2657		 */
2658		jfs_info("jfs_lazycommit: tblk 0x%p not unlocked", tblk);
2659		yield();
2660	}
2661
2662	jfs_info("txLazyCommit: processing tblk 0x%p", tblk);
2663
2664	txUpdateMap(tblk);
2665
2666	log = (struct jfs_log *) JFS_SBI(tblk->sb)->log;
2667
2668	spin_lock_irq(&log->gclock);	// LOGGC_LOCK
2669
2670	tblk->flag |= tblkGC_COMMITTED;
2671
2672	if (tblk->flag & tblkGC_READY)
2673		log->gcrtc--;
2674
2675	wake_up_all(&tblk->gcwait);	// LOGGC_WAKEUP
2676
2677	/*
2678	 * Can't release log->gclock until we've tested tblk->flag
2679	 */
2680	if (tblk->flag & tblkGC_LAZY) {
2681		spin_unlock_irq(&log->gclock);	// LOGGC_UNLOCK
2682		txUnlock(tblk);
2683		tblk->flag &= ~tblkGC_LAZY;
2684		txEnd(tblk - TxBlock);	/* Convert back to tid */
2685	} else
2686		spin_unlock_irq(&log->gclock);	// LOGGC_UNLOCK
2687
2688	jfs_info("txLazyCommit: done: tblk = 0x%p", tblk);
2689}
2690
2691/*
2692 *	jfs_lazycommit(void)
2693 *
2694 *	To be run as a kernel daemon.  If lbmIODone is called in an interrupt
2695 *	context, or where blocking is not wanted, this routine will process
2696 *	committed transactions from the unlock queue.
2697 */
2698int jfs_lazycommit(void *arg)
2699{
2700	int WorkDone;
2701	struct tblock *tblk;
2702	unsigned long flags;
2703	struct jfs_sb_info *sbi;
2704
2705	set_freezable();
2706	do {
2707		LAZY_LOCK(flags);
2708		jfs_commit_thread_waking = 0;	/* OK to wake another thread */
2709		while (!list_empty(&TxAnchor.unlock_queue)) {
2710			WorkDone = 0;
2711			list_for_each_entry(tblk, &TxAnchor.unlock_queue,
2712					    cqueue) {
2713
2714				sbi = JFS_SBI(tblk->sb);
2715				/*
2716				 * For each volume, the transactions must be
2717				 * handled in order.  If another commit thread
2718				 * is handling a tblk for this superblock,
2719				 * skip it
2720				 */
2721				if (sbi->commit_state & IN_LAZYCOMMIT)
2722					continue;
2723
2724				sbi->commit_state |= IN_LAZYCOMMIT;
2725				WorkDone = 1;
2726
2727				/*
2728				 * Remove transaction from queue
2729				 */
2730				list_del(&tblk->cqueue);
2731
2732				LAZY_UNLOCK(flags);
2733				txLazyCommit(tblk);
2734				LAZY_LOCK(flags);
2735
2736				sbi->commit_state &= ~IN_LAZYCOMMIT;
2737				/*
2738				 * Don't continue in the for loop.  (We can't
2739				 * anyway, it's unsafe!)  We want to go back to
2740				 * the beginning of the list.
2741				 */
2742				break;
2743			}
2744
2745			/* If there was nothing to do, don't continue */
2746			if (!WorkDone)
2747				break;
2748		}
2749		/* In case a wakeup came while all threads were active */
2750		jfs_commit_thread_waking = 0;
2751
2752		if (freezing(current)) {
2753			LAZY_UNLOCK(flags);
2754			try_to_freeze();
2755		} else {
2756			DECLARE_WAITQUEUE(wq, current);
2757
2758			add_wait_queue(&jfs_commit_thread_wait, &wq);
2759			set_current_state(TASK_INTERRUPTIBLE);
2760			LAZY_UNLOCK(flags);
2761			schedule();
2762			remove_wait_queue(&jfs_commit_thread_wait, &wq);
2763		}
2764	} while (!kthread_should_stop());
2765
2766	if (!list_empty(&TxAnchor.unlock_queue))
2767		jfs_err("jfs_lazycommit being killed w/pending transactions!");
2768	else
2769		jfs_info("jfs_lazycommit being killed");
2770	return 0;
2771}
2772
2773void txLazyUnlock(struct tblock * tblk)
2774{
2775	unsigned long flags;
2776
2777	LAZY_LOCK(flags);
2778
2779	list_add_tail(&tblk->cqueue, &TxAnchor.unlock_queue);
2780	/*
2781	 * Don't wake up a commit thread if there is already one servicing
2782	 * this superblock, or if the last one we woke up hasn't started yet.
2783	 */
2784	if (!(JFS_SBI(tblk->sb)->commit_state & IN_LAZYCOMMIT) &&
2785	    !jfs_commit_thread_waking) {
2786		jfs_commit_thread_waking = 1;
2787		wake_up(&jfs_commit_thread_wait);
2788	}
2789	LAZY_UNLOCK(flags);
2790}
2791
2792static void LogSyncRelease(struct metapage * mp)
2793{
2794	struct jfs_log *log = mp->log;
2795
2796	assert(mp->nohomeok);
2797	assert(log);
2798	metapage_homeok(mp);
2799}
2800
2801/*
2802 *	txQuiesce
2803 *
2804 *	Block all new transactions and push anonymous transactions to
2805 *	completion
2806 *
2807 *	This does almost the same thing as jfs_sync below.  We don't
2808 *	worry about deadlocking when jfs_tlocks_low is set, since we would
2809 *	expect jfs_sync to get us out of that jam.
2810 */
2811void txQuiesce(struct super_block *sb)
2812{
2813	struct inode *ip;
2814	struct jfs_inode_info *jfs_ip;
2815	struct jfs_log *log = JFS_SBI(sb)->log;
2816	tid_t tid;
2817
2818	set_bit(log_QUIESCE, &log->flag);
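	/* new transactions (other than COMMIT_FORCE ones, which this
	 * routine itself uses below) now block in txBegin() on
	 * log->syncwait until txResume() clears log_QUIESCE.
	 */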
2819
2820	TXN_LOCK();
2821restart:
2822	while (!list_empty(&TxAnchor.anon_list)) {
2823		jfs_ip = list_entry(TxAnchor.anon_list.next,
2824				    struct jfs_inode_info,
2825				    anon_inode_list);
2826		ip = &jfs_ip->vfs_inode;
2827
2828		/*
2829		 * inode will be removed from anonymous list
2830		 * when it is committed
2831		 */
2832		TXN_UNLOCK();
2833		tid = txBegin(ip->i_sb, COMMIT_INODE | COMMIT_FORCE);
2834		mutex_lock(&jfs_ip->commit_mutex);
2835		txCommit(tid, 1, &ip, 0);
2836		txEnd(tid);
2837		mutex_unlock(&jfs_ip->commit_mutex);
2838		/*
2839		 * Just to be safe.  I don't know how
2840		 * long we can run without blocking
2841		 */
2842		cond_resched();
2843		TXN_LOCK();
2844	}
2845
2846	/*
2847	 * If jfs_sync is running in parallel, there could be some inodes
2848	 * on anon_list2.  Let's check.
2849	 */
2850	if (!list_empty(&TxAnchor.anon_list2)) {
2851		list_splice_init(&TxAnchor.anon_list2, &TxAnchor.anon_list);
2852		goto restart;
2853	}
2854	TXN_UNLOCK();
2855
2856	/*
2857	 * We may need to kick off the group commit
2858	 */
2859	jfs_flush_journal(log, 0);
2860}
2861
2862/*
2863 * txResume()
2864 *
2865 * Allows transactions to start again following txQuiesce
2866 */
2867void txResume(struct super_block *sb)
2868{
2869	struct jfs_log *log = JFS_SBI(sb)->log;
2870
2871	clear_bit(log_QUIESCE, &log->flag);
2872	TXN_WAKEUP(&log->syncwait);
2873}
2874
2875/*
2876 *	jfs_sync(void)
2877 *
2878 *	To be run as a kernel daemon.  This is awakened when tlocks run low.
2879 *	We write any inodes that have anonymous tlocks so they will become
2880 *	available.
2881 */
2882int jfs_sync(void *arg)
2883{
2884	struct inode *ip;
2885	struct jfs_inode_info *jfs_ip;
2886	tid_t tid;
2887
2888	set_freezable();
2889	do {
2890		/*
2891		 * write each inode on the anonymous inode list
2892		 */
2893		TXN_LOCK();
2894		while (jfs_tlocks_low && !list_empty(&TxAnchor.anon_list)) {
2895			jfs_ip = list_entry(TxAnchor.anon_list.next,
2896					    struct jfs_inode_info,
2897					    anon_inode_list);
2898			ip = &jfs_ip->vfs_inode;
2899
2900			if (! igrab(ip)) {
2901				/*
2902				 * Inode is being freed
2903				 */
2904				list_del_init(&jfs_ip->anon_inode_list);
2905			} else if (mutex_trylock(&jfs_ip->commit_mutex)) {
2906				/*
2907				 * inode will be removed from anonymous list
2908				 * when it is committed
2909				 */
2910				TXN_UNLOCK();
2911				tid = txBegin(ip->i_sb, COMMIT_INODE);
2912				txCommit(tid, 1, &ip, 0);
2913				txEnd(tid);
2914				mutex_unlock(&jfs_ip->commit_mutex);
2915
2916				iput(ip);
2917				/*
2918				 * Just to be safe.  I don't know how
2919				 * long we can run without blocking
2920				 */
2921				cond_resched();
2922				TXN_LOCK();
2923			} else {
2924				/* We can't get the commit mutex.  It may
2925				 * be held by a thread waiting for tlock's
2926				 * so let's not block here.  Save it to
2927				 * put back on the anon_list.
2928				 */
2929
2930				/* Move from anon_list to anon_list2 */
2931				list_move(&jfs_ip->anon_inode_list,
2932					  &TxAnchor.anon_list2);
2933
2934				TXN_UNLOCK();
2935				iput(ip);
2936				TXN_LOCK();
2937			}
2938		}
2939		/* Add anon_list2 back to anon_list */
2940		list_splice_init(&TxAnchor.anon_list2, &TxAnchor.anon_list);
2941
2942		if (freezing(current)) {
2943			TXN_UNLOCK();
2944			try_to_freeze();
2945		} else {
2946			set_current_state(TASK_INTERRUPTIBLE);
2947			TXN_UNLOCK();
2948			schedule();
2949		}
2950	} while (!kthread_should_stop());
2951
2952	jfs_info("jfs_sync being killed");
2953	return 0;
2954}
2955
2956#if defined(CONFIG_PROC_FS) && defined(CONFIG_JFS_DEBUG)
2957int jfs_txanchor_proc_show(struct seq_file *m, void *v)
2958{
2959	char *freewait;
2960	char *freelockwait;
2961	char *lowlockwait;
2962
2963	freewait =
2964	    waitqueue_active(&TxAnchor.freewait) ? "active" : "empty";
2965	freelockwait =
2966	    waitqueue_active(&TxAnchor.freelockwait) ? "active" : "empty";
2967	lowlockwait =
2968	    waitqueue_active(&TxAnchor.lowlockwait) ? "active" : "empty";
2969
2970	seq_printf(m,
2971		       "JFS TxAnchor\n"
2972		       "============\n"
2973		       "freetid = %d\n"
2974		       "freewait = %s\n"
2975		       "freelock = %d\n"
2976		       "freelockwait = %s\n"
2977		       "lowlockwait = %s\n"
2978		       "tlocksInUse = %d\n"
2979		       "jfs_tlocks_low = %d\n"
2980		       "unlock_queue is %sempty\n",
2981		       TxAnchor.freetid,
2982		       freewait,
2983		       TxAnchor.freelock,
2984		       freelockwait,
2985		       lowlockwait,
2986		       TxAnchor.tlocksInUse,
2987		       jfs_tlocks_low,
2988		       list_empty(&TxAnchor.unlock_queue) ? "" : "not ");
2989	return 0;
2990}
2991#endif
2992
2993#if defined(CONFIG_PROC_FS) && defined(CONFIG_JFS_STATISTICS)
2994int jfs_txstats_proc_show(struct seq_file *m, void *v)
2995{
2996	seq_printf(m,
2997		       "JFS TxStats\n"
2998		       "===========\n"
2999		       "calls to txBegin = %d\n"
3000		       "txBegin blocked by sync barrier = %d\n"
3001		       "txBegin blocked by tlocks low = %d\n"
3002		       "txBegin blocked by no free tid = %d\n"
3003		       "calls to txBeginAnon = %d\n"
3004		       "txBeginAnon blocked by sync barrier = %d\n"
3005		       "txBeginAnon blocked by tlocks low = %d\n"
3006		       "calls to txLockAlloc = %d\n"
		       "txLockAlloc blocked by no free lock = %d\n",
3008		       TxStat.txBegin,
3009		       TxStat.txBegin_barrier,
3010		       TxStat.txBegin_lockslow,
3011		       TxStat.txBegin_freetid,
3012		       TxStat.txBeginAnon,
3013		       TxStat.txBeginAnon_barrier,
3014		       TxStat.txBeginAnon_lockslow,
3015		       TxStat.txLockAlloc,
3016		       TxStat.txLockAlloc_freelock);
3017	return 0;
3018}
3019#endif
3020