/*
 *   Copyright (C) International Business Machines Corp., 2000-2004
 *   Portions Copyright (C) Christoph Hellwig, 2001-2002
 *
 *   This program is free software;  you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation; either version 2 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
 *   the GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program;  if not, write to the Free Software
 *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

/*
 *	jfs_logmgr.c: log manager
 *
 * for related information, see transaction manager (jfs_txnmgr.c), and
 * recovery manager (jfs_logredo.c).
 *
 * note: for detail, RTFS.
 *
 *	log buffer manager:
 * special purpose buffer manager supporting log i/o requirements.
 * per log serial pageout of logpage
 * queuing i/o requests and redrive i/o at iodone
 * maintain current logpage buffer
 * no caching since append only
 * appropriate jfs buffer cache buffers as needed
 *
 *	group commit:
 * transactions which wrote COMMIT records in the same in-memory
 * log page during the pageout of previous/current log page(s) are
 * committed together by the pageout of the page.
 *
 *	TBD lazy commit:
 * transactions are committed asynchronously when the log page
 * containing its COMMIT record is paged out when it becomes full;
 *
 *	serialization:
 * . a per log lock serializes log write.
 * . a per log lock serializes group commit.
 * . a per log lock serializes log open/close;
 *
 *	TBD log integrity:
 * careful-write (ping-pong) of last logpage to recover from crash
 * in overwrite.
 * detection of split (out-of-order) write of physical sectors
 * of last logpage via timestamp at end of each sector
 * with its mirror data array at trailer.
 *
 *	alternatives:
 * lsn - 64-bit monotonically increasing integer vs
 * 32-bit lspn and page eor.
 */

#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/interrupt.h>
#include <linux/completion.h>
#include <linux/kthread.h>
#include <linux/buffer_head.h>		/* for sync_blockdev() */
#include <linux/bio.h>
#include <linux/freezer.h>
#include <linux/delay.h>
#include <linux/mutex.h>
#include "jfs_incore.h"
#include "jfs_filsys.h"
#include "jfs_metapage.h"
#include "jfs_superblock.h"
#include "jfs_txnmgr.h"
#include "jfs_debug.h"


/*
 * lbuf's ready to be redriven.  Protected by log_redrive_lock (jfsIO thread)
 */
static struct lbuf *log_redrive_list;
static DEFINE_SPINLOCK(log_redrive_lock);


/*
 *	log read/write serialization (per log)
 */
#define LOG_LOCK_INIT(log)	mutex_init(&(log)->loglock)
#define LOG_LOCK(log)		mutex_lock(&((log)->loglock))
#define LOG_UNLOCK(log)		mutex_unlock(&((log)->loglock))


/*
 *	log group commit serialization (per log)
 */

#define LOGGC_LOCK_INIT(log)	spin_lock_init(&(log)->gclock)
#define LOGGC_LOCK(log)		spin_lock_irq(&(log)->gclock)
#define LOGGC_UNLOCK(log)	spin_unlock_irq(&(log)->gclock)
#define LOGGC_WAKEUP(tblk)	wake_up_all(&(tblk)->gcwait)

/*
 *	log sync serialization (per log)
 */
#define	LOGSYNC_DELTA(logsize)		min((logsize)/8, 128*LOGPSIZE)
#define	LOGSYNC_BARRIER(logsize)	((logsize)/4)
/*
#define	LOGSYNC_DELTA(logsize)		min((logsize)/4, 256*LOGPSIZE)
#define	LOGSYNC_BARRIER(logsize)	((logsize)/2)
*/
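/* A worked example, assuming LOGPSIZE is the 4 KiB log page size:
 * for a 32 MB log, LOGSYNC_DELTA = min(4 MB, 512 KiB) = 512 KiB
 * written between sync points, and LOGSYNC_BARRIER = 8 MB of
 * unsynced log before new transactions are stalled (see lmLogSync()).
 */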


/*
 *	log buffer cache synchronization
 */
static DEFINE_SPINLOCK(jfsLCacheLock);

#define	LCACHE_LOCK(flags)	spin_lock_irqsave(&jfsLCacheLock, flags)
#define	LCACHE_UNLOCK(flags)	spin_unlock_irqrestore(&jfsLCacheLock, flags)

/*
 * See __SLEEP_COND in jfs_locks.h
 */
#define LCACHE_SLEEP_COND(wq, cond, flags)	\
do {						\
	if (cond)				\
		break;				\
	__SLEEP_COND(wq, cond, LCACHE_LOCK(flags), LCACHE_UNLOCK(flags)); \
} while (0)

#define	LCACHE_WAKEUP(event)	wake_up(event)


/*
 *	lbuf buffer cache (lCache) control
 */
/* log buffer manager pageout control (cumulative, inclusive) */
#define	lbmREAD		0x0001
#define	lbmWRITE	0x0002	/* enqueue at tail of write queue;
				 * init pageout if at head of queue;
				 */
#define	lbmRELEASE	0x0004	/* remove from write queue
				 * at completion of pageout;
				 * do not free/recycle it yet:
				 * caller will free it;
				 */
#define	lbmSYNC		0x0008	/* do not return to freelist
				 * when removed from write queue;
				 */
#define lbmFREE		0x0010	/* return to freelist
				 * at completion of pageout;
				 * the buffer may be recycled;
				 */
#define	lbmDONE		0x0020
#define	lbmERROR	0x0040
#define lbmGC		0x0080	/* lbmIODone to perform post-GC processing
				 * of log page
				 */
#define lbmDIRECT	0x0100
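/* Typical combinations used below: a finalized full page is queued
 * with lbmWRITE | lbmRELEASE | lbmFREE (write, dequeue, recycle);
 * a group-commit page with lbmWRITE | lbmGC (lbmIODone then runs
 * lmPostGC()); and the log superblock with lbmWRITE | lbmRELEASE |
 * lbmSYNC via lbmDirectWrite(), bypassing the write queue.
 */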

/*
 * Global list of active external journals
 */
static LIST_HEAD(jfs_external_logs);
static struct jfs_log *dummy_log = NULL;
static DEFINE_MUTEX(jfs_log_mutex);

/*
 * forward references
 */
static int lmWriteRecord(struct jfs_log * log, struct tblock * tblk,
			 struct lrd * lrd, struct tlock * tlck);

static int lmNextPage(struct jfs_log * log);
static int lmLogFileSystem(struct jfs_log * log, struct jfs_sb_info *sbi,
			   int activate);

static int open_inline_log(struct super_block *sb);
static int open_dummy_log(struct super_block *sb);
static int lbmLogInit(struct jfs_log * log);
static void lbmLogShutdown(struct jfs_log * log);
static struct lbuf *lbmAllocate(struct jfs_log * log, int);
static void lbmFree(struct lbuf * bp);
static void lbmfree(struct lbuf * bp);
static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp);
static void lbmWrite(struct jfs_log * log, struct lbuf * bp, int flag, int cant_block);
static void lbmDirectWrite(struct jfs_log * log, struct lbuf * bp, int flag);
static int lbmIOWait(struct lbuf * bp, int flag);
static bio_end_io_t lbmIODone;
static void lbmStartIO(struct lbuf * bp);
static void lmGCwrite(struct jfs_log * log, int cant_block);
static int lmLogSync(struct jfs_log * log, int hard_sync);



/*
 *	statistics
 */
#ifdef CONFIG_JFS_STATISTICS
static struct lmStat {
	uint commit;		/* # of commit */
	uint pagedone;		/* # of page written */
	uint submitted;		/* # of pages submitted */
	uint full_page;		/* # of full pages submitted */
	uint partial_page;	/* # of partial pages submitted */
} lmStat;
#endif


/*
 * NAME:	lmLog()
 *
 * FUNCTION:	write a log record;
 *
 * PARAMETER:
 *
 * RETURN:	lsn - offset to the next log record to write (end-of-log);
 *		-1  - error;
 *
 * note: todo: log error handler
 */
int lmLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
	  struct tlock * tlck)
{
	int lsn;
	int diffp, difft;
	struct metapage *mp = NULL;
	unsigned long flags;

	jfs_info("lmLog: log:0x%p tblk:0x%p, lrd:0x%p tlck:0x%p",
		 log, tblk, lrd, tlck);

	LOG_LOCK(log);

	/* log by (out-of-transaction) JFS ? */
	if (tblk == NULL)
		goto writeRecord;

	/* log from page ? */
	if (tlck == NULL ||
	    tlck->type & tlckBTROOT || (mp = tlck->mp) == NULL)
		goto writeRecord;

	/*
	 *      initialize/update page/transaction recovery lsn
	 */
	lsn = log->lsn;

	LOGSYNC_LOCK(log, flags);

	/*
	 * initialize page lsn if first log write of the page
	 */
	if (mp->lsn == 0) {
		mp->log = log;
		mp->lsn = lsn;
		log->count++;

		/* insert page at tail of logsynclist */
		list_add_tail(&mp->synclist, &log->synclist);
	}

	/*
	 *      initialize/update lsn of tblock of the page
	 *
	 * transaction inherits oldest lsn of pages associated
	 * with allocation/deallocation of resources (their
	 * log records are used to reconstruct allocation map
	 * at recovery time: inode for inode allocation map,
	 * B+-tree index of extent descriptors for block
	 * allocation map);
	 * allocation map pages inherit transaction lsn at
	 * commit time to allow forwarding log syncpt past log
	 * records associated with allocation/deallocation of
	 * resources only after persistent map of these map pages
	 * have been updated and propagated to home.
	 */
	/*
	 * initialize transaction lsn:
	 */
	if (tblk->lsn == 0) {
		/* inherit lsn of its first page logged */
		tblk->lsn = mp->lsn;
		log->count++;

		/* insert tblock after the page on logsynclist */
		list_add(&tblk->synclist, &mp->synclist);
	}
	/*
	 * update transaction lsn:
	 */
	else {
		/* inherit oldest/smallest lsn of page */
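		/* logdiff() (see jfs_logmgr.h) yields the distance of
		 * an lsn past the current sync point, wrapping around
		 * the circular log, so the smaller diff is the older
		 * lsn even across log wrap:
		 */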
		logdiff(diffp, mp->lsn, log);
		logdiff(difft, tblk->lsn, log);
		if (diffp < difft) {
			/* update tblock lsn with page lsn */
			tblk->lsn = mp->lsn;

			/* move tblock after page on logsynclist */
			list_move(&tblk->synclist, &mp->synclist);
		}
	}

	LOGSYNC_UNLOCK(log, flags);

	/*
	 *      write the log record
	 */
      writeRecord:
	lsn = lmWriteRecord(log, tblk, lrd, tlck);

	/*
	 * forward log syncpt if log reached next syncpt trigger
	 */
	logdiff(diffp, lsn, log);
	if (diffp >= log->nextsync)
		lsn = lmLogSync(log, 0);

	/* update end-of-log lsn */
	log->lsn = lsn;

	LOG_UNLOCK(log);

	/* return end-of-log address */
	return lsn;
}

/*
 * NAME:	lmWriteRecord()
 *
 * FUNCTION:	move the log record to current log page
 *
 * PARAMETER:	cd	- commit descriptor
 *
 * RETURN:	end-of-log address
 *
 * serialization: LOG_LOCK() held on entry/exit
 */
static int
lmWriteRecord(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
	      struct tlock * tlck)
{
	int lsn = 0;		/* end-of-log address */
	struct lbuf *bp;	/* dst log page buffer */
	struct logpage *lp;	/* dst log page */
	caddr_t dst;		/* destination address in log page */
	int dstoffset;		/* end-of-log offset in log page */
	int freespace;		/* free space in log page */
	caddr_t p;		/* src meta-data page */
	caddr_t src;
	int srclen;
	int nbytes;		/* number of bytes to move */
	int i;
	int len;
	struct linelock *linelock;
	struct lv *lv;
	struct lvd *lvd;
	int l2linesize;

	len = 0;

	/* retrieve destination log page to write */
	bp = (struct lbuf *) log->bp;
	lp = (struct logpage *) bp->l_ldata;
	dstoffset = log->eor;

	/* any log data to write ? */
	if (tlck == NULL)
		goto moveLrd;

	/*
	 *      move log record data
	 */
	/* retrieve source meta-data page to log */
	if (tlck->flag & tlckPAGELOCK) {
		p = (caddr_t) (tlck->mp->data);
		linelock = (struct linelock *) & tlck->lock;
	}
	/* retrieve source in-memory inode to log */
	else if (tlck->flag & tlckINODELOCK) {
		if (tlck->type & tlckDTREE)
			p = (caddr_t) &JFS_IP(tlck->ip)->i_dtroot;
		else
			p = (caddr_t) &JFS_IP(tlck->ip)->i_xtroot;
		linelock = (struct linelock *) & tlck->lock;
	}
#ifdef	_JFS_WIP
	else if (tlck->flag & tlckINLINELOCK) {

		inlinelock = (struct inlinelock *) & tlck;
		p = (caddr_t) & inlinelock->pxd;
		linelock = (struct linelock *) & tlck;
	}
#endif				/* _JFS_WIP */
	else {
		jfs_err("lmWriteRecord: UFO tlck:0x%p", tlck);
		return 0;	/* Probably should trap */
	}
	l2linesize = linelock->l2linesize;

      moveData:
	ASSERT(linelock->index <= linelock->maxcnt);

	lv = linelock->lv;
	for (i = 0; i < linelock->index; i++, lv++) {
		if (lv->length == 0)
			continue;

		/* is page full ? */
		if (dstoffset >= LOGPSIZE - LOGPTLRSIZE) {
			/* page becomes full: move on to next page */
			lmNextPage(log);

			bp = log->bp;
			lp = (struct logpage *) bp->l_ldata;
			dstoffset = LOGPHDRSIZE;
		}

		/*
		 * move log vector data
		 */
		src = (u8 *) p + (lv->offset << l2linesize);
		srclen = lv->length << l2linesize;
		len += srclen;
		while (srclen > 0) {
			freespace = (LOGPSIZE - LOGPTLRSIZE) - dstoffset;
			nbytes = min(freespace, srclen);
			dst = (caddr_t) lp + dstoffset;
			memcpy(dst, src, nbytes);
			dstoffset += nbytes;

			/* is page not full ? */
			if (dstoffset < LOGPSIZE - LOGPTLRSIZE)
				break;

			/* page becomes full: move on to next page */
			lmNextPage(log);

			bp = (struct lbuf *) log->bp;
			lp = (struct logpage *) bp->l_ldata;
			dstoffset = LOGPHDRSIZE;

			srclen -= nbytes;
			src += nbytes;
		}

		/*
		 * move log vector descriptor
		 */
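		/* a log vector descriptor (struct lvd) is a pair of
		 * le16 fields (offset, length), hence the 4 bytes
		 * accounted for here:
		 */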
		len += 4;
		lvd = (struct lvd *) ((caddr_t) lp + dstoffset);
		lvd->offset = cpu_to_le16(lv->offset);
		lvd->length = cpu_to_le16(lv->length);
		dstoffset += 4;
		jfs_info("lmWriteRecord: lv offset:%d length:%d",
			 lv->offset, lv->length);
	}

	if ((i = linelock->next)) {
		linelock = (struct linelock *) lid_to_tlock(i);
		goto moveData;
	}

	/*
	 *      move log record descriptor
	 */
      moveLrd:
	lrd->length = cpu_to_le16(len);

	src = (caddr_t) lrd;
	srclen = LOGRDSIZE;

	while (srclen > 0) {
		freespace = (LOGPSIZE - LOGPTLRSIZE) - dstoffset;
		nbytes = min(freespace, srclen);
		dst = (caddr_t) lp + dstoffset;
		memcpy(dst, src, nbytes);

		dstoffset += nbytes;
		srclen -= nbytes;

		/* are there more to move than freespace of page ? */
		if (srclen)
			goto pageFull;

		/*
		 * end of log record descriptor
		 */

		/* update last log record eor */
		log->eor = dstoffset;
		bp->l_eor = dstoffset;
		lsn = (log->page << L2LOGPSIZE) + dstoffset;

		if (lrd->type & cpu_to_le16(LOG_COMMIT)) {
			tblk->clsn = lsn;
			jfs_info("wr: tclsn:0x%x, beor:0x%x", tblk->clsn,
				 bp->l_eor);

			INCREMENT(lmStat.commit);	/* # of commit */

			/*
			 * enqueue tblock for group commit:
			 *
			 * enqueue tblock of non-trivial/synchronous COMMIT
			 * at tail of group commit queue
			 * (trivial/asynchronous COMMITs are ignored by
			 * group commit.)
			 */
			LOGGC_LOCK(log);

			/* init tblock gc state */
			tblk->flag = tblkGC_QUEUE;
			tblk->bp = log->bp;
			tblk->pn = log->page;
			tblk->eor = log->eor;

			/* enqueue transaction to commit queue */
			list_add_tail(&tblk->cqueue, &log->cqueue);

			LOGGC_UNLOCK(log);
		}

		jfs_info("lmWriteRecord: lrd:0x%04x bp:0x%p pn:%d eor:0x%x",
			le16_to_cpu(lrd->type), log->bp, log->page, dstoffset);

		/* page not full ? */
		if (dstoffset < LOGPSIZE - LOGPTLRSIZE)
			return lsn;

	      pageFull:
		/* page becomes full: move on to next page */
		lmNextPage(log);

		bp = (struct lbuf *) log->bp;
		lp = (struct logpage *) bp->l_ldata;
		dstoffset = LOGPHDRSIZE;
		src += nbytes;
	}

	return lsn;
}


/*
 * NAME:	lmNextPage()
 *
 * FUNCTION:	write current page and allocate next page.
 *
 * PARAMETER:	log
 *
 * RETURN:	0
 *
 * serialization: LOG_LOCK() held on entry/exit
 */
static int lmNextPage(struct jfs_log * log)
{
	struct logpage *lp;
	int lspn;		/* log sequence page number */
	int pn;			/* current page number */
	struct lbuf *bp;
	struct lbuf *nextbp;
	struct tblock *tblk;

	/* get current log page number and log sequence page number */
	pn = log->page;
	bp = log->bp;
	lp = (struct logpage *) bp->l_ldata;
	lspn = le32_to_cpu(lp->h.page);

	LOGGC_LOCK(log);

	/*
	 *      write or queue the full page at the tail of write queue
	 */
	/* get the tail tblk on commit queue */
	if (list_empty(&log->cqueue))
		tblk = NULL;
	else
		tblk = list_entry(log->cqueue.prev, struct tblock, cqueue);

	/* every tblk that has a COMMIT record on the current page,
	 * and has not been committed, must be on the commit queue
	 * since a tblk is queued at the commit queue at the time
	 * of writing its COMMIT record on the page, before the
	 * page becomes full (even though the tblk thread that
	 * wrote the COMMIT record may currently be suspended);
	 */

	/* is page bound with outstanding tail tblk ? */
	if (tblk && tblk->pn == pn) {
		/* mark tblk for end-of-page */
		tblk->flag |= tblkGC_EOP;

		if (log->cflag & logGC_PAGEOUT) {
			/* if page is not already on write queue,
			 * just enqueue (no lbmWRITE to prevent redrive)
			 * buffer to wqueue to ensure correct serial order
			 * of the pages since log pages will be added
			 * continuously
			 */
			if (bp->l_wqnext == NULL)
				lbmWrite(log, bp, 0, 0);
		} else {
			/*
			 * No current GC leader, initiate group commit
			 */
			log->cflag |= logGC_PAGEOUT;
			lmGCwrite(log, 0);
		}
	}
	/* page is not bound with outstanding tblk:
	 * init write or mark it to be redriven (lbmWRITE)
	 */
	else {
		/* finalize the page */
		bp->l_ceor = bp->l_eor;
		lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_ceor);
		lbmWrite(log, bp, lbmWRITE | lbmRELEASE | lbmFREE, 0);
	}
	LOGGC_UNLOCK(log);

	/*
	 *      allocate/initialize next page
	 */
	/* if log wraps, the first data page of log is 2
	 * (0 never used, 1 is superblock).
	 */
	log->page = (pn == log->size - 1) ? 2 : pn + 1;
	log->eor = LOGPHDRSIZE;	/* ? valid page empty/full at logRedo() */

	/* allocate/initialize next log page buffer */
	nextbp = lbmAllocate(log, log->page);
	nextbp->l_eor = log->eor;
	log->bp = nextbp;

	/* initialize next log page */
	lp = (struct logpage *) nextbp->l_ldata;
	lp->h.page = lp->t.page = cpu_to_le32(lspn + 1);
	lp->h.eor = lp->t.eor = cpu_to_le16(LOGPHDRSIZE);

	return 0;
}


/*
 * NAME:	lmGroupCommit()
 *
 * FUNCTION:	group commit
 *	initiate pageout of the pages with COMMIT in the order of
 *	page number - redrive pageout of the page at the head of
 *	pageout queue until full page has been written.
 *
 * RETURN:
 *
 * NOTE:
 *	LOGGC_LOCK serializes log group commit queue, and
 *	transaction blocks on the commit queue.
 *	N.B. LOG_LOCK is NOT held during lmGroupCommit().
 */
int lmGroupCommit(struct jfs_log * log, struct tblock * tblk)
{
	int rc = 0;

	LOGGC_LOCK(log);

	/* group committed already ? */
	if (tblk->flag & tblkGC_COMMITTED) {
		if (tblk->flag & tblkGC_ERROR)
			rc = -EIO;

		LOGGC_UNLOCK(log);
		return rc;
	}
	jfs_info("lmGroup Commit: tblk = 0x%p, gcrtc = %d", tblk, log->gcrtc);

	if (tblk->xflag & COMMIT_LAZY)
		tblk->flag |= tblkGC_LAZY;

	if ((!(log->cflag & logGC_PAGEOUT)) && (!list_empty(&log->cqueue)) &&
	    (!(tblk->xflag & COMMIT_LAZY) || test_bit(log_FLUSH, &log->flag)
	     || jfs_tlocks_low)) {
		/*
		 * No pageout in progress
		 *
		 * start group commit as its group leader.
		 */
		log->cflag |= logGC_PAGEOUT;

		lmGCwrite(log, 0);
	}

	if (tblk->xflag & COMMIT_LAZY) {
		/*
		 * Lazy transactions can leave now
		 */
		LOGGC_UNLOCK(log);
		return 0;
	}

	/* lmGCwrite gives up LOGGC_LOCK, check again */

	if (tblk->flag & tblkGC_COMMITTED) {
		if (tblk->flag & tblkGC_ERROR)
			rc = -EIO;

		LOGGC_UNLOCK(log);
		return rc;
	}

	/* upcount transaction waiting for completion
	 */
	log->gcrtc++;
	tblk->flag |= tblkGC_READY;

	__SLEEP_COND(tblk->gcwait, (tblk->flag & tblkGC_COMMITTED),
		     LOGGC_LOCK(log), LOGGC_UNLOCK(log));

	/* removed from commit queue */
	if (tblk->flag & tblkGC_ERROR)
		rc = -EIO;

	LOGGC_UNLOCK(log);
	return rc;
}

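/* Summary of the tblk group-commit state machine driven by
 * lmGroupCommit()/lmGCwrite()/lmPostGC():
 * tblkGC_QUEUE (COMMIT record written, on log->cqueue) ->
 * tblkGC_READY (a thread is sleeping on gcwait) ->
 * tblkGC_COMMIT (shipped in the current pageout) ->
 * tblkGC_COMMITTED (on disk; waiters are woken).
 */
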
/*
 * NAME:	lmGCwrite()
 *
 * FUNCTION:	group commit write
 *	initiate write of log page, building a group of all transactions
 *	with commit records on that page.
 *
 * RETURN:	None
 *
 * NOTE:
 *	LOGGC_LOCK must be held by caller.
 *	N.B. LOG_LOCK is NOT held during lmGroupCommit().
 */
static void lmGCwrite(struct jfs_log * log, int cant_write)
{
	struct lbuf *bp;
	struct logpage *lp;
	int gcpn;		/* group commit page number */
	struct tblock *tblk;
	struct tblock *xtblk = NULL;

	/*
	 * build the commit group of a log page
	 *
	 * scan commit queue and make a commit group of all
	 * transactions with COMMIT records on the same log page.
	 */
	/* get the head tblk on the commit queue */
	gcpn = list_entry(log->cqueue.next, struct tblock, cqueue)->pn;

	list_for_each_entry(tblk, &log->cqueue, cqueue) {
		if (tblk->pn != gcpn)
			break;

		xtblk = tblk;

		/* state transition: (QUEUE, READY) -> COMMIT */
		tblk->flag |= tblkGC_COMMIT;
	}
	tblk = xtblk;		/* last tblk of the page */

	/*
	 * pageout to commit transactions on the log page.
	 */
	bp = (struct lbuf *) tblk->bp;
	lp = (struct logpage *) bp->l_ldata;
	/* is page already full ? */
	if (tblk->flag & tblkGC_EOP) {
		/* mark page to free at end of group commit of the page */
		tblk->flag &= ~tblkGC_EOP;
		tblk->flag |= tblkGC_FREE;
		bp->l_ceor = bp->l_eor;
		lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_ceor);
		lbmWrite(log, bp, lbmWRITE | lbmRELEASE | lbmGC,
			 cant_write);
		INCREMENT(lmStat.full_page);
	}
	/* page is not yet full */
	else {
		bp->l_ceor = tblk->eor;	/* ? bp->l_ceor = bp->l_eor; */
		lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_ceor);
		lbmWrite(log, bp, lbmWRITE | lbmGC, cant_write);
		INCREMENT(lmStat.partial_page);
	}
}

/*
 * NAME:	lmPostGC()
 *
 * FUNCTION:	group commit post-processing
 *	Processes transactions after their commit records have been written
 *	to disk, redriving log I/O if necessary.
 *
 * RETURN:	None
 *
 * NOTE:
 *	This routine is called at interrupt time by lbmIODone
 */
static void lmPostGC(struct lbuf * bp)
{
	unsigned long flags;
	struct jfs_log *log = bp->l_log;
	struct logpage *lp;
	struct tblock *tblk, *temp;

	//LOGGC_LOCK(log);
	spin_lock_irqsave(&log->gclock, flags);
	/*
	 * current pageout of group commit completed.
	 *
	 * remove/wakeup transactions from commit queue who were
	 * group committed with the current log page
	 */
	list_for_each_entry_safe(tblk, temp, &log->cqueue, cqueue) {
		if (!(tblk->flag & tblkGC_COMMIT))
			break;
		/* if transaction was marked GC_COMMIT then
		 * it has been shipped in the current pageout
		 * and made it to disk - it is committed.
		 */

		if (bp->l_flag & lbmERROR)
			tblk->flag |= tblkGC_ERROR;

		/* remove it from the commit queue */
		list_del(&tblk->cqueue);
		tblk->flag &= ~tblkGC_QUEUE;

		if (tblk == log->flush_tblk) {
			/* we can stop flushing the log now */
			clear_bit(log_FLUSH, &log->flag);
			log->flush_tblk = NULL;
		}

		jfs_info("lmPostGC: tblk = 0x%p, flag = 0x%x", tblk,
			 tblk->flag);

		if (!(tblk->xflag & COMMIT_FORCE))
			/*
			 * Hand tblk over to lazy commit thread
			 */
			txLazyUnlock(tblk);
		else {
			/* state transition: COMMIT -> COMMITTED */
			tblk->flag |= tblkGC_COMMITTED;

			if (tblk->flag & tblkGC_READY)
				log->gcrtc--;

			LOGGC_WAKEUP(tblk);
		}

		/* was page full before pageout ?
		 * (and this is the last tblk bound with the page)
		 */
		if (tblk->flag & tblkGC_FREE)
			lbmFree(bp);
		/* did page become full after pageout ?
		 * (and this is the last tblk bound with the page)
		 */
		else if (tblk->flag & tblkGC_EOP) {
			/* finalize the page */
			lp = (struct logpage *) bp->l_ldata;
			bp->l_ceor = bp->l_eor;
			lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_eor);
			jfs_info("lmPostGC: calling lbmWrite");
			lbmWrite(log, bp, lbmWRITE | lbmRELEASE | lbmFREE,
				 1);
		}

	}

	/* are there any transactions that have entered lmGroupCommit()
	 * (whose COMMITs are after that of the last log page written),
	 * waiting for a new group commit (above at (SLEEP 1)),
	 * or lazy transactions on a full (queued) log page ?
	 * select the latest ready transaction as new group leader and
	 * wake it up to lead its group.
	 */
	if ((!list_empty(&log->cqueue)) &&
	    ((log->gcrtc > 0) || (tblk->bp->l_wqnext != NULL) ||
	     test_bit(log_FLUSH, &log->flag) || jfs_tlocks_low))
		/*
		 * Call lmGCwrite with new group leader
		 */
		lmGCwrite(log, 1);

	/* no transactions are ready yet (transactions are only just
	 * queued (GC_QUEUE) and not entered for group commit yet).
	 * the first transaction entering group commit
	 * will elect itself as new group leader.
	 */
	else
		log->cflag &= ~logGC_PAGEOUT;

	//LOGGC_UNLOCK(log);
	spin_unlock_irqrestore(&log->gclock, flags);
	return;
}

/*
 * NAME:	lmLogSync()
 *
 * FUNCTION:	write log SYNCPT record for specified log
 *	if new sync address is available
 *	(normally the case if sync() is executed by background
 *	process).
 *	calculate new value of i_nextsync which determines when
 *	this code is called again.
 *
 * PARAMETERS:	log	- log structure
 *		hard_sync - 1 to force all metadata to be written
 *
 * RETURN:	lsn - new end-of-log address
 *
 * serialization: LOG_LOCK() held on entry/exit
 */
static int lmLogSync(struct jfs_log * log, int hard_sync)
{
	int logsize;
	int written;		/* written since last syncpt */
	int free;		/* free space left available */
	int delta;		/* additional delta to write normally */
	int more;		/* additional write granted */
	struct lrd lrd;
	int lsn;
	struct logsyncblk *lp;
	struct jfs_sb_info *sbi;
	unsigned long flags;

	/* push dirty metapages out to disk */
	if (hard_sync)
		list_for_each_entry(sbi, &log->sb_list, log_list) {
			filemap_fdatawrite(sbi->ipbmap->i_mapping);
			filemap_fdatawrite(sbi->ipimap->i_mapping);
			filemap_fdatawrite(sbi->direct_inode->i_mapping);
		}
	else
		list_for_each_entry(sbi, &log->sb_list, log_list) {
			filemap_flush(sbi->ipbmap->i_mapping);
			filemap_flush(sbi->ipimap->i_mapping);
			filemap_flush(sbi->direct_inode->i_mapping);
		}

	/*
	 *      forward syncpt
	 */
	/* if last sync is same as last syncpt,
	 * invoke sync point forward processing to update sync.
	 */

	if (log->sync == log->syncpt) {
		LOGSYNC_LOCK(log, flags);
		if (list_empty(&log->synclist))
			log->sync = log->lsn;
		else {
			lp = list_entry(log->synclist.next,
					struct logsyncblk, synclist);
			log->sync = lp->lsn;
		}
		LOGSYNC_UNLOCK(log, flags);

	}

	/* if sync is different from last syncpt,
	 * write a SYNCPT record with syncpt = sync.
	 * reset syncpt = sync
	 */
	if (log->sync != log->syncpt) {
		lrd.logtid = 0;
		lrd.backchain = 0;
		lrd.type = cpu_to_le16(LOG_SYNCPT);
		lrd.length = 0;
		lrd.log.syncpt.sync = cpu_to_le32(log->sync);
		lsn = lmWriteRecord(log, NULL, &lrd, NULL);

		log->syncpt = log->sync;
	} else
		lsn = log->lsn;

	/*
	 *      setup next syncpt trigger (SWAG)
	 */
	logsize = log->logsize;

	logdiff(written, lsn, log);
	free = logsize - written;
	delta = LOGSYNC_DELTA(logsize);
	more = min(free / 2, delta);
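	/* Illustration: for a 32 MB log with 8 MB written since the
	 * last syncpt, free = 24 MB and delta = 512 KiB, so
	 * more = min(12 MB, 512 KiB) = 512 KiB and the next syncpt
	 * triggers 512 KiB further on; only when fewer than two log
	 * pages can be granted is the log considered wrapped.
	 */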
	if (more < 2 * LOGPSIZE) {
		jfs_warn("\n ... Log Wrap ... Log Wrap ... Log Wrap ...\n");
		/*
		 *      log wrapping
		 *
		 * option 1 - panic ? No!
		 * option 2 - shutdown file systems
		 *            associated with log ?
		 * option 3 - extend log ?
		 */
		/*
		 * option 4 - second chance
		 *
		 * mark log wrapped, and continue.
		 * when all active transactions are completed,
		 * mark log valid for recovery.
		 * if crashed during invalid state, log state
		 * implies invalid log, forcing fsck().
		 */
		/* mark log state log wrap in log superblock */
		/* log->state = LOGWRAP; */

		/* reset sync point computation */
		log->syncpt = log->sync = lsn;
		log->nextsync = delta;
	} else
		/* next syncpt trigger = written + more */
		log->nextsync = written + more;

	/* if number of bytes written from last sync point is more
	 * than 1/4 of the log size, stop new transactions from
	 * starting until all current transactions are completed
	 * by setting syncbarrier flag.
	 */
	if (!test_bit(log_SYNCBARRIER, &log->flag) &&
	    (written > LOGSYNC_BARRIER(logsize)) && log->active) {
		set_bit(log_SYNCBARRIER, &log->flag);
		jfs_info("log barrier on: lsn=0x%x syncpt=0x%x", lsn,
			 log->syncpt);
		/*
		 * We may have to initiate group commit
		 */
		jfs_flush_journal(log, 0);
	}

	return lsn;
}

/*
 * NAME:	jfs_syncpt
 *
 * FUNCTION:	write log SYNCPT record for specified log
 *
 * PARAMETERS:	log	  - log structure
 *		hard_sync - set to 1 to force metadata to be written
 */
void jfs_syncpt(struct jfs_log *log, int hard_sync)
{
	LOG_LOCK(log);
	lmLogSync(log, hard_sync);
	LOG_UNLOCK(log);
}

/*
 * NAME:	lmLogOpen()
 *
 * FUNCTION:    open the log on first open;
 *	insert filesystem in the active list of the log.
 *
 * PARAMETER:	ipmnt	- file system mount inode
 *		iplog	- log inode (out)
 *
 * RETURN:
 *
 * serialization:
 */
int lmLogOpen(struct super_block *sb)
{
	int rc;
	struct block_device *bdev;
	struct jfs_log *log;
	struct jfs_sb_info *sbi = JFS_SBI(sb);

	if (sbi->flag & JFS_NOINTEGRITY)
		return open_dummy_log(sb);

	if (sbi->mntflag & JFS_INLINELOG)
		return open_inline_log(sb);

	mutex_lock(&jfs_log_mutex);
	list_for_each_entry(log, &jfs_external_logs, journal_list) {
		if (log->bdev->bd_dev == sbi->logdev) {
			if (memcmp(log->uuid, sbi->loguuid,
				   sizeof(log->uuid))) {
				jfs_warn("wrong uuid on JFS journal");
				mutex_unlock(&jfs_log_mutex);
				return -EINVAL;
			}
			/*
			 * add file system to log active file system list
			 */
			if ((rc = lmLogFileSystem(log, sbi, 1))) {
				mutex_unlock(&jfs_log_mutex);
				return rc;
			}
			goto journal_found;
		}
	}

	if (!(log = kzalloc(sizeof(struct jfs_log), GFP_KERNEL))) {
		mutex_unlock(&jfs_log_mutex);
		return -ENOMEM;
	}
	INIT_LIST_HEAD(&log->sb_list);
	init_waitqueue_head(&log->syncwait);

	/*
	 *      external log as separate logical volume
	 *
	 * file systems to log may have n-to-1 relationship;
	 */

	bdev = open_by_devnum(sbi->logdev, FMODE_READ|FMODE_WRITE);
	if (IS_ERR(bdev)) {
		rc = PTR_ERR(bdev);
		goto free;
	}

	if ((rc = bd_claim(bdev, log))) {
		goto close;
	}

	log->bdev = bdev;
	memcpy(log->uuid, sbi->loguuid, sizeof(log->uuid));

	/*
	 * initialize log:
	 */
	if ((rc = lmLogInit(log)))
		goto unclaim;

	list_add(&log->journal_list, &jfs_external_logs);

	/*
	 * add file system to log active file system list
	 */
	if ((rc = lmLogFileSystem(log, sbi, 1)))
		goto shutdown;

journal_found:
	LOG_LOCK(log);
	list_add(&sbi->log_list, &log->sb_list);
	sbi->log = log;
	LOG_UNLOCK(log);

	mutex_unlock(&jfs_log_mutex);
	return 0;

	/*
	 *      unwind on error
	 */
      shutdown:		/* unwind lbmLogInit() */
	list_del(&log->journal_list);
	lbmLogShutdown(log);

      unclaim:
	bd_release(bdev);

      close:		/* close external log device */
	blkdev_put(bdev);

      free:		/* free log descriptor */
	mutex_unlock(&jfs_log_mutex);
	kfree(log);

	jfs_warn("lmLogOpen: exit(%d)", rc);
	return rc;
}

static int open_inline_log(struct super_block *sb)
{
	struct jfs_log *log;
	int rc;

	if (!(log = kzalloc(sizeof(struct jfs_log), GFP_KERNEL)))
		return -ENOMEM;
	INIT_LIST_HEAD(&log->sb_list);
	init_waitqueue_head(&log->syncwait);

	set_bit(log_INLINELOG, &log->flag);
	log->bdev = sb->s_bdev;
	log->base = addressPXD(&JFS_SBI(sb)->logpxd);
	log->size = lengthPXD(&JFS_SBI(sb)->logpxd) >>
	    (L2LOGPSIZE - sb->s_blocksize_bits);
	log->l2bsize = sb->s_blocksize_bits;
	ASSERT(L2LOGPSIZE >= sb->s_blocksize_bits);

	/*
	 * initialize log.
	 */
	if ((rc = lmLogInit(log))) {
		kfree(log);
		jfs_warn("lmLogOpen: exit(%d)", rc);
		return rc;
	}

	list_add(&JFS_SBI(sb)->log_list, &log->sb_list);
	JFS_SBI(sb)->log = log;

	return rc;
}

static int open_dummy_log(struct super_block *sb)
{
	int rc;

	mutex_lock(&jfs_log_mutex);
	if (!dummy_log) {
		dummy_log = kzalloc(sizeof(struct jfs_log), GFP_KERNEL);
		if (!dummy_log) {
			mutex_unlock(&jfs_log_mutex);
			return -ENOMEM;
		}
		INIT_LIST_HEAD(&dummy_log->sb_list);
		init_waitqueue_head(&dummy_log->syncwait);
		dummy_log->no_integrity = 1;
		/* Make up some stuff */
		dummy_log->base = 0;
		dummy_log->size = 1024;
		rc = lmLogInit(dummy_log);
		if (rc) {
			kfree(dummy_log);
			dummy_log = NULL;
			mutex_unlock(&jfs_log_mutex);
			return rc;
		}
	}

	LOG_LOCK(dummy_log);
	list_add(&JFS_SBI(sb)->log_list, &dummy_log->sb_list);
	JFS_SBI(sb)->log = dummy_log;
	LOG_UNLOCK(dummy_log);
	mutex_unlock(&jfs_log_mutex);

	return 0;
}

/*
 * NAME:	lmLogInit()
 *
 * FUNCTION:	log initialization at first log open.
 *
 *	logredo() (or logformat()) should have been run previously.
 *	initialize the log from log superblock.
 *	set the log state in the superblock to LOGMOUNT and
 *	write SYNCPT log record.
 *
 * PARAMETER:	log	- log structure
 *
 * RETURN:	0	- if ok
 *		-EINVAL	- bad log magic number or superblock dirty
 *		error returned from logwait()
 *
 * serialization: single first open thread
 */
int lmLogInit(struct jfs_log * log)
{
	int rc = 0;
	struct lrd lrd;
	struct logsuper *logsuper;
	struct lbuf *bpsuper;
	struct lbuf *bp;
	struct logpage *lp;
	int lsn = 0;

	jfs_info("lmLogInit: log:0x%p", log);

	/* initialize the group commit serialization lock */
	LOGGC_LOCK_INIT(log);

	/* allocate/initialize the log write serialization lock */
	LOG_LOCK_INIT(log);

	LOGSYNC_LOCK_INIT(log);

	INIT_LIST_HEAD(&log->synclist);

	INIT_LIST_HEAD(&log->cqueue);
	log->flush_tblk = NULL;

	log->count = 0;

	/*
	 * initialize log i/o
	 */
	if ((rc = lbmLogInit(log)))
		return rc;

	if (!test_bit(log_INLINELOG, &log->flag))
		log->l2bsize = L2LOGPSIZE;

	/* check for disabled journaling to disk */
	if (log->no_integrity) {
		/*
		 * Journal pages will still be filled.  When the time comes
		 * to actually do the I/O, the write is not done, and the
		 * endio routine is called directly.
		 */
		bp = lbmAllocate(log, 0);
		log->bp = bp;
		bp->l_pn = bp->l_eor = 0;
	} else {
		/*
		 * validate log superblock
		 */
		if ((rc = lbmRead(log, 1, &bpsuper)))
			goto errout10;

		logsuper = (struct logsuper *) bpsuper->l_ldata;

		if (logsuper->magic != cpu_to_le32(LOGMAGIC)) {
			jfs_warn("*** Log Format Error ! ***");
			rc = -EINVAL;
			goto errout20;
		}

		/* logredo() should have been run successfully. */
		if (logsuper->state != cpu_to_le32(LOGREDONE)) {
			jfs_warn("*** Log Is Dirty ! ***");
			rc = -EINVAL;
			goto errout20;
		}

		/* initialize log from log superblock */
		if (test_bit(log_INLINELOG,&log->flag)) {
			if (log->size != le32_to_cpu(logsuper->size)) {
				rc = -EINVAL;
				goto errout20;
			}
			jfs_info("lmLogInit: inline log:0x%p base:0x%Lx "
				 "size:0x%x", log,
				 (unsigned long long) log->base, log->size);
		} else {
			if (memcmp(logsuper->uuid, log->uuid, 16)) {
				jfs_warn("wrong uuid on JFS log device");
				rc = -EINVAL;
				goto errout20;
			}
			log->size = le32_to_cpu(logsuper->size);
			log->l2bsize = le32_to_cpu(logsuper->l2bsize);
			jfs_info("lmLogInit: external log:0x%p base:0x%Lx "
				 "size:0x%x", log,
				 (unsigned long long) log->base, log->size);
		}

		log->page = le32_to_cpu(logsuper->end) / LOGPSIZE;
		log->eor = le32_to_cpu(logsuper->end) - (LOGPSIZE * log->page);

		/*
		 * initialize for log append write mode
		 */
		/* establish current/end-of-log page/buffer */
		if ((rc = lbmRead(log, log->page, &bp)))
			goto errout20;

		lp = (struct logpage *) bp->l_ldata;

		jfs_info("lmLogInit: lsn:0x%x page:%d eor:%d:%d",
			 le32_to_cpu(logsuper->end), log->page, log->eor,
			 le16_to_cpu(lp->h.eor));

		log->bp = bp;
		bp->l_pn = log->page;
		bp->l_eor = log->eor;

		/* if current page is full, move on to next page */
		if (log->eor >= LOGPSIZE - LOGPTLRSIZE)
			lmNextPage(log);

		/*
		 * initialize log syncpoint
		 */
		/*
		 * write the first SYNCPT record with syncpoint = 0
		 * (i.e., log redo up to HERE !);
		 * remove current page from lbm write queue at end of pageout
		 * (to write log superblock update), but do not release to
		 * freelist;
		 */
		lrd.logtid = 0;
		lrd.backchain = 0;
		lrd.type = cpu_to_le16(LOG_SYNCPT);
		lrd.length = 0;
		lrd.log.syncpt.sync = 0;
		lsn = lmWriteRecord(log, NULL, &lrd, NULL);
		bp = log->bp;
		bp->l_ceor = bp->l_eor;
		lp = (struct logpage *) bp->l_ldata;
		lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_eor);
		lbmWrite(log, bp, lbmWRITE | lbmSYNC, 0);
		if ((rc = lbmIOWait(bp, 0)))
			goto errout30;

		/*
		 * update/write superblock
		 */
		logsuper->state = cpu_to_le32(LOGMOUNT);
		log->serial = le32_to_cpu(logsuper->serial) + 1;
		logsuper->serial = cpu_to_le32(log->serial);
		lbmDirectWrite(log, bpsuper, lbmWRITE | lbmRELEASE | lbmSYNC);
		if ((rc = lbmIOWait(bpsuper, lbmFREE)))
			goto errout30;
	}

	/* initialize logsync parameters */
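	/* usable log space excludes page 0 (never used) and page 1
	 * (the log superblock); see the wrap handling in lmNextPage().
	 */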
	log->logsize = (log->size - 2) << L2LOGPSIZE;
	log->lsn = lsn;
	log->syncpt = lsn;
	log->sync = log->syncpt;
	log->nextsync = LOGSYNC_DELTA(log->logsize);

	jfs_info("lmLogInit: lsn:0x%x syncpt:0x%x sync:0x%x",
		 log->lsn, log->syncpt, log->sync);

	/*
	 * initialize for lazy/group commit
	 */
	log->clsn = lsn;

	return 0;

	/*
	 *      unwind on error
	 */
      errout30:		/* release log page */
	log->wqueue = NULL;
	bp->l_wqnext = NULL;
	lbmFree(bp);

      errout20:		/* release log superblock */
	lbmFree(bpsuper);

      errout10:		/* unwind lbmLogInit() */
	lbmLogShutdown(log);

	jfs_warn("lmLogInit: exit(%d)", rc);
	return rc;
}


/*
 * NAME:	lmLogClose()
 *
 * FUNCTION:	remove file system <ipmnt> from active list of log <iplog>
 *		and close it on last close.
 *
 * PARAMETER:	sb	- superblock
 *
 * RETURN:	errors from subroutines
 *
 * serialization:
 */
int lmLogClose(struct super_block *sb)
{
	struct jfs_sb_info *sbi = JFS_SBI(sb);
	struct jfs_log *log = sbi->log;
	struct block_device *bdev;
	int rc = 0;

	jfs_info("lmLogClose: log:0x%p", log);

	mutex_lock(&jfs_log_mutex);
	LOG_LOCK(log);
	list_del(&sbi->log_list);
	LOG_UNLOCK(log);
	sbi->log = NULL;

	/*
	 * We need to make sure all of the "written" metapages
	 * actually make it to disk
	 */
	sync_blockdev(sb->s_bdev);

	if (test_bit(log_INLINELOG, &log->flag)) {
		/*
		 *      in-line log in host file system
		 */
		rc = lmLogShutdown(log);
		kfree(log);
		goto out;
	}

	if (!log->no_integrity)
		lmLogFileSystem(log, sbi, 0);

	if (!list_empty(&log->sb_list))
		goto out;

	/*
	 * TODO: ensure that the dummy_log is in a state to allow
	 * lbmLogShutdown to deallocate all the buffers and call
	 * kfree against dummy_log.  For now, leave dummy_log & its
	 * buffers in memory, and reuse if another no-integrity mount
	 * is requested.
	 */
	if (log->no_integrity)
		goto out;

	/*
	 *      external log as separate logical volume
	 */
	list_del(&log->journal_list);
	bdev = log->bdev;
	rc = lmLogShutdown(log);

	bd_release(bdev);
	blkdev_put(bdev);

	kfree(log);

      out:
	mutex_unlock(&jfs_log_mutex);
	jfs_info("lmLogClose: exit(%d)", rc);
	return rc;
}


/*
 * NAME:	jfs_flush_journal()
 *
 * FUNCTION:	initiate write of any outstanding transactions to the journal
 *		and optionally wait until they are all written to disk
 *
 *		wait == 0  flush until latest txn is committed, don't wait
 *		wait == 1  flush until latest txn is committed, wait
 *		wait > 1   flush until all txn's are complete, wait
 */
void jfs_flush_journal(struct jfs_log *log, int wait)
{
	int i;
	struct tblock *target = NULL;
	struct jfs_sb_info *sbi;

	/* jfs_write_inode may call us during read-only mount */
	if (!log)
		return;

	jfs_info("jfs_flush_journal: log:0x%p wait=%d", log, wait);

	LOGGC_LOCK(log);

	if (!list_empty(&log->cqueue)) {
		/*
		 * This ensures that we will keep writing to the journal as long
		 * as there are unwritten commit records
		 */
		target = list_entry(log->cqueue.prev, struct tblock, cqueue);

		if (test_bit(log_FLUSH, &log->flag)) {
			/*
			 * We're already flushing.
			 * if flush_tblk is NULL, we are flushing everything,
			 * so leave it that way.  Otherwise, update it to the
			 * latest transaction
			 */
			if (log->flush_tblk)
				log->flush_tblk = target;
		} else {
			/* Only flush until latest transaction is committed */
			log->flush_tblk = target;
			set_bit(log_FLUSH, &log->flag);

			/*
			 * Initiate I/O on outstanding transactions
			 */
			if (!(log->cflag & logGC_PAGEOUT)) {
				log->cflag |= logGC_PAGEOUT;
				lmGCwrite(log, 0);
			}
		}
	}
	if ((wait > 1) || test_bit(log_SYNCBARRIER, &log->flag)) {
		/* Flush until all activity complete */
		set_bit(log_FLUSH, &log->flag);
		log->flush_tblk = NULL;
	}

	if (wait && target && !(target->flag & tblkGC_COMMITTED)) {
		DECLARE_WAITQUEUE(__wait, current);

		add_wait_queue(&target->gcwait, &__wait);
		set_current_state(TASK_UNINTERRUPTIBLE);
		LOGGC_UNLOCK(log);
		schedule();
		__set_current_state(TASK_RUNNING);
		LOGGC_LOCK(log);
		remove_wait_queue(&target->gcwait, &__wait);
	}
	LOGGC_UNLOCK(log);

	if (wait < 2)
		return;

	list_for_each_entry(sbi, &log->sb_list, log_list) {
		filemap_fdatawrite(sbi->ipbmap->i_mapping);
		filemap_fdatawrite(sbi->ipimap->i_mapping);
		filemap_fdatawrite(sbi->direct_inode->i_mapping);
	}

	/*
	 * If there was recent activity, we may need to wait
	 * for the lazycommit thread to catch up
	 */
	if ((!list_empty(&log->cqueue)) || !list_empty(&log->synclist)) {
		for (i = 0; i < 200; i++) {	/* Too much? */
			msleep(250);
			if (list_empty(&log->cqueue) &&
			    list_empty(&log->synclist))
				break;
		}
	}
	assert(list_empty(&log->cqueue));

#ifdef CONFIG_JFS_DEBUG
	if (!list_empty(&log->synclist)) {
		struct logsyncblk *lp;

		list_for_each_entry(lp, &log->synclist, synclist) {
			if (lp->xflag & COMMIT_PAGE) {
				struct metapage *mp = (struct metapage *)lp;
				dump_mem("orphan metapage", lp,
					 sizeof(struct metapage));
				dump_mem("page", mp->page, sizeof(struct page));
			}
			else
				dump_mem("orphan tblock", lp,
					 sizeof(struct tblock));
		}
	}
#endif
	//assert(list_empty(&log->synclist));
	clear_bit(log_FLUSH, &log->flag);
}

/*
 * NAME:	lmLogShutdown()
 *
 * FUNCTION:	log shutdown at last LogClose().
 *
 *		write log syncpt record.
 *		update super block to set redone flag to 0.
 *
 * PARAMETER:	log	- log inode
 *
 * RETURN:	0	- success
 *
 * serialization: single last close thread
 */
int lmLogShutdown(struct jfs_log * log)
{
	int rc;
	struct lrd lrd;
	int lsn;
	struct logsuper *logsuper;
	struct lbuf *bpsuper;
	struct lbuf *bp;
	struct logpage *lp;

	jfs_info("lmLogShutdown: log:0x%p", log);

	jfs_flush_journal(log, 2);

	/*
	 * write the last SYNCPT record with syncpoint = 0
	 * (i.e., log redo up to HERE !)
	 */
	lrd.logtid = 0;
	lrd.backchain = 0;
	lrd.type = cpu_to_le16(LOG_SYNCPT);
	lrd.length = 0;
	lrd.log.syncpt.sync = 0;

	lsn = lmWriteRecord(log, NULL, &lrd, NULL);
	bp = log->bp;
	lp = (struct logpage *) bp->l_ldata;
	lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_eor);
	lbmWrite(log, log->bp, lbmWRITE | lbmRELEASE | lbmSYNC, 0);
	lbmIOWait(log->bp, lbmFREE);
	log->bp = NULL;

	/*
	 * synchronous update log superblock
	 * mark log state as shutdown cleanly
	 * (i.e., Log does not need to be replayed).
	 */
	if ((rc = lbmRead(log, 1, &bpsuper)))
		goto out;

	logsuper = (struct logsuper *) bpsuper->l_ldata;
	logsuper->state = cpu_to_le32(LOGREDONE);
	logsuper->end = cpu_to_le32(lsn);
	lbmDirectWrite(log, bpsuper, lbmWRITE | lbmRELEASE | lbmSYNC);
	rc = lbmIOWait(bpsuper, lbmFREE);

	jfs_info("lmLogShutdown: lsn:0x%x page:%d eor:%d",
		 lsn, log->page, log->eor);

      out:
	/*
	 * shutdown per log i/o
	 */
	lbmLogShutdown(log);

	if (rc) {
		jfs_warn("lmLogShutdown: exit(%d)", rc);
	}
	return rc;
}


/*
 * NAME:	lmLogFileSystem()
 *
 * FUNCTION:	insert (<activate> = true)/remove (<activate> = false)
 *	file system into/from log active file system list.
 *
 * PARAMETER:	log	- pointer to the log
 *		sbi	- superblock info of the file system
 *		activate - insert/remove device from active list.
 *
 * RETURN:	0	- success
 *		errors returned by lbmIOWait().
 */
static int lmLogFileSystem(struct jfs_log * log, struct jfs_sb_info *sbi,
			   int activate)
{
	int rc = 0;
	int i;
	struct logsuper *logsuper;
	struct lbuf *bpsuper;
	char *uuid = sbi->uuid;

	/*
	 * insert/remove file system device to log active file system list.
	 */
	if ((rc = lbmRead(log, 1, &bpsuper)))
		return rc;

	logsuper = (struct logsuper *) bpsuper->l_ldata;
	if (activate) {
		for (i = 0; i < MAX_ACTIVE; i++)
			if (!memcmp(logsuper->active[i].uuid, NULL_UUID, 16)) {
				memcpy(logsuper->active[i].uuid, uuid, 16);
				sbi->aggregate = i;
				break;
			}
		if (i == MAX_ACTIVE) {
			jfs_warn("Too many file systems sharing journal!");
			lbmFree(bpsuper);
			return -EMFILE;	/* Is there a better rc? */
		}
	} else {
		for (i = 0; i < MAX_ACTIVE; i++)
			if (!memcmp(logsuper->active[i].uuid, uuid, 16)) {
				memcpy(logsuper->active[i].uuid, NULL_UUID, 16);
				break;
			}
		if (i == MAX_ACTIVE) {
			jfs_warn("Somebody stomped on the journal!");
			lbmFree(bpsuper);
			return -EIO;
		}

	}

	/*
	 * synchronous write log superblock:
	 *
	 * write sidestream bypassing write queue:
	 * at file system mount, log super block is updated for
	 * activation of the file system before any log record
	 * (MOUNT record) of the file system, and at file system
	 * unmount, all meta data for the file system has been
	 * flushed before log super block is updated for deactivation
	 * of the file system.
	 */
	lbmDirectWrite(log, bpsuper, lbmWRITE | lbmRELEASE | lbmSYNC);
	rc = lbmIOWait(bpsuper, lbmFREE);

	return rc;
}

/*
 *		log buffer manager (lbm)
 *		------------------------
 *
 * special purpose buffer manager supporting log i/o requirements.
 *
 * per log write queue:
 * log pageout occurs in serial order by fifo write queue,
 * restricting to a single i/o in progress at any one time.
 * a circular singly-linked list
 * (log->wqueue points to the tail, and buffers are linked via
 * the bp->l_wqnext field)
 * maintains log pages in pageout or waiting for pageout in serial order.
 */

/*
 *	lbmLogInit()
 *
 * initialize per log I/O setup at lmLogInit()
 */
static int lbmLogInit(struct jfs_log * log)
{				/* log inode */
	int i;
	struct lbuf *lbuf;

	jfs_info("lbmLogInit: log:0x%p", log);

	/* initialize current buffer cursor */
	log->bp = NULL;

	/* initialize log device write queue */
	log->wqueue = NULL;

	/*
	 * Each log has its own buffer pages allocated to it.  These are
	 * not managed by the page cache.  This ensures that a transaction
	 * writing to the log does not block trying to allocate a page from
	 * the page cache (for the log).  This would be bad, since page
	 * allocation waits on the kswapd thread that may be committing inodes
	 * which would cause log activity.  Was that clear?  I'm trying to
	 * avoid deadlock here.
	 */
	init_waitqueue_head(&log->free_wait);

	log->lbuf_free = NULL;

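	/* Carve LOGPAGES log buffers out of whole pages: each page
	 * holds PAGE_SIZE/LOGPSIZE lbufs, and every lbuf after the
	 * first takes an extra page reference so that __free_page()
	 * in lbmLogShutdown() releases the page only with its last
	 * lbuf.
	 */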
	for (i = 0; i < LOGPAGES;) {
		char *buffer;
		uint offset;
		struct page *page;

		buffer = (char *) get_zeroed_page(GFP_KERNEL);
		if (buffer == NULL)
			goto error;
		page = virt_to_page(buffer);
		for (offset = 0; offset < PAGE_SIZE; offset += LOGPSIZE) {
			lbuf = kmalloc(sizeof(struct lbuf), GFP_KERNEL);
			if (lbuf == NULL) {
				if (offset == 0)
					free_page((unsigned long) buffer);
				goto error;
			}
			if (offset) /* we already have one reference */
				get_page(page);
			lbuf->l_offset = offset;
			lbuf->l_ldata = buffer + offset;
			lbuf->l_page = page;
			lbuf->l_log = log;
			init_waitqueue_head(&lbuf->l_ioevent);

			lbuf->l_freelist = log->lbuf_free;
			log->lbuf_free = lbuf;
			i++;
		}
	}

	return (0);

      error:
	lbmLogShutdown(log);
	return -ENOMEM;
}


/*
 *	lbmLogShutdown()
 *
 * finalize per log I/O setup at lmLogShutdown()
 */
static void lbmLogShutdown(struct jfs_log * log)
{
	struct lbuf *lbuf;

	jfs_info("lbmLogShutdown: log:0x%p", log);

	lbuf = log->lbuf_free;
	while (lbuf) {
		struct lbuf *next = lbuf->l_freelist;
		__free_page(lbuf->l_page);
		kfree(lbuf);
		lbuf = next;
	}
}


/*
 *	lbmAllocate()
 *
 * allocate an empty log buffer
 */
static struct lbuf *lbmAllocate(struct jfs_log * log, int pn)
{
	struct lbuf *bp;
	unsigned long flags;

	/*
	 * recycle from log buffer freelist if any
	 */
	LCACHE_LOCK(flags);
	LCACHE_SLEEP_COND(log->free_wait, (bp = log->lbuf_free), flags);
	log->lbuf_free = bp->l_freelist;
	LCACHE_UNLOCK(flags);

	bp->l_flag = 0;

	bp->l_wqnext = NULL;
	bp->l_freelist = NULL;

	bp->l_pn = pn;
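	/* map the log page number to a device block address: each
	 * log page spans 1 << (L2LOGPSIZE - l2bsize) file system
	 * blocks past the log base.
	 */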
	bp->l_blkno = log->base + (pn << (L2LOGPSIZE - log->l2bsize));
	bp->l_ceor = 0;

	return bp;
}


/*
 *	lbmFree()
 *
 * release a log buffer to freelist
 */
static void lbmFree(struct lbuf * bp)
{
	unsigned long flags;

	LCACHE_LOCK(flags);

	lbmfree(bp);

	LCACHE_UNLOCK(flags);
}

static void lbmfree(struct lbuf * bp)
{
	struct jfs_log *log = bp->l_log;

	assert(bp->l_wqnext == NULL);

	/*
	 * return the buffer to head of freelist
	 */
	bp->l_freelist = log->lbuf_free;
	log->lbuf_free = bp;

	wake_up(&log->free_wait);
	return;
}


/*
 * NAME:	lbmRedrive
 *
 * FUNCTION:	add a log buffer to the log redrive list
 *
 * PARAMETER:
 *     bp	- log buffer
 *
 * NOTES:
 *	Takes log_redrive_lock.
 */
static inline void lbmRedrive(struct lbuf *bp)
{
	unsigned long flags;

	spin_lock_irqsave(&log_redrive_lock, flags);
	bp->l_redrive_next = log_redrive_list;
	log_redrive_list = bp;
	spin_unlock_irqrestore(&log_redrive_lock, flags);

	wake_up_process(jfsIOthread);
}


/*
 *	lbmRead()
 */
static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp)
{
	struct bio *bio;
	struct lbuf *bp;

	/*
	 * allocate a log buffer
	 */
	*bpp = bp = lbmAllocate(log, pn);
	jfs_info("lbmRead: bp:0x%p pn:0x%x", bp, pn);

	bp->l_flag |= lbmREAD;

	bio = bio_alloc(GFP_NOFS, 1);

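	/* bi_sector is in 512-byte units: shift by (l2bsize - 9) to
	 * convert the file system block address to a sector address
	 */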
	bio->bi_sector = bp->l_blkno << (log->l2bsize - 9);
	bio->bi_bdev = log->bdev;
	bio->bi_io_vec[0].bv_page = bp->l_page;
	bio->bi_io_vec[0].bv_len = LOGPSIZE;
	bio->bi_io_vec[0].bv_offset = bp->l_offset;

	bio->bi_vcnt = 1;
	bio->bi_idx = 0;
	bio->bi_size = LOGPSIZE;

	bio->bi_end_io = lbmIODone;
	bio->bi_private = bp;
	submit_bio(READ_SYNC, bio);

	wait_event(bp->l_ioevent, (bp->l_flag != lbmREAD));

	return 0;
}


/*
 *	lbmWrite()
 *
 * buffer at head of pageout queue stays after completion of
 * partial-page pageout and redriven by explicit initiation of
 * pageout by caller until full-page pageout is completed and
 * released.
 *
 * device driver i/o done redrives pageout of new buffer at
 * head of pageout queue when current buffer at head of pageout
 * queue is released at the completion of its full-page pageout.
 *
 * LOGGC_LOCK() serializes lbmWrite() by lmNextPage() and lmGroupCommit().
 * LCACHE_LOCK() serializes xflag between lbmWrite() and lbmIODone()
 */
static void lbmWrite(struct jfs_log * log, struct lbuf * bp, int flag,
		     int cant_block)
{
	struct lbuf *tail;
	unsigned long flags;

	jfs_info("lbmWrite: bp:0x%p flag:0x%x pn:0x%x", bp, flag, bp->l_pn);

	/* map the logical block address to physical block address */
	bp->l_blkno =
	    log->base + (bp->l_pn << (L2LOGPSIZE - log->l2bsize));

	LCACHE_LOCK(flags);		/* disable+lock */

	/*
	 * initialize buffer for device driver
	 */
	bp->l_flag = flag;

	/*
	 *      insert bp at tail of write queue associated with log
	 *
	 * (request is either for bp already/currently at head of queue
	 * or new bp to be inserted at tail)
	 */
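	/* the write queue is a circular singly-linked list:
	 * log->wqueue points at the tail and tail->l_wqnext at the
	 * head, so a single pointer reaches both ends.
	 */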
	tail = log->wqueue;

	/* is buffer not already on write queue ? */
	if (bp->l_wqnext == NULL) {
		/* insert at tail of wqueue */
		if (tail == NULL) {
			log->wqueue = bp;
			bp->l_wqnext = bp;
		} else {
			log->wqueue = bp;
			bp->l_wqnext = tail->l_wqnext;
			tail->l_wqnext = bp;
		}

		tail = bp;
	}

	/* is buffer at head of wqueue and for write ? */
	if ((bp != tail->l_wqnext) || !(flag & lbmWRITE)) {
		LCACHE_UNLOCK(flags);	/* unlock+enable */
		return;
	}

	LCACHE_UNLOCK(flags);	/* unlock+enable */
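
	/*
	 * start the pageout: from the jfsIO thread if the caller cannot
	 * block; directly for synchronous writes; otherwise with the
	 * group-commit lock (held by the caller, see above) dropped
	 * across the potentially blocking submission.
	 */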
	if (cant_block)
		lbmRedrive(bp);
	else if (flag & lbmSYNC)
		lbmStartIO(bp);
	else {
		LOGGC_UNLOCK(log);
		lbmStartIO(bp);
		LOGGC_LOCK(log);
	}
}


/*
 *	lbmDirectWrite()
 *
 * initiate pageout bypassing write queue for sidestream
 * (e.g., log superblock) write;
 */
static void lbmDirectWrite(struct jfs_log * log, struct lbuf * bp, int flag)
{
	jfs_info("lbmDirectWrite: bp:0x%p flag:0x%x pn:0x%x",
		 bp, flag, bp->l_pn);

	/*
	 * initialize buffer for device driver
	 */
	bp->l_flag = flag | lbmDIRECT;

	/* map the logical block address to physical block address */
	bp->l_blkno =
	    log->base + (bp->l_pn << (L2LOGPSIZE - log->l2bsize));

	/*
	 *      initiate pageout of the page
	 */
	lbmStartIO(bp);
}


/*
 * NAME:	lbmStartIO()
 *
 * FUNCTION:	Interface to DD strategy routine
 *
 * RETURN:	none
 *
 * serialization: LCACHE_LOCK() is NOT held during log i/o;
 */
static void lbmStartIO(struct lbuf * bp)
{
	struct bio *bio;
	struct jfs_log *log = bp->l_log;

	jfs_info("lbmStartIO");

	bio = bio_alloc(GFP_NOFS, 1);
	bio->bi_sector = bp->l_blkno << (log->l2bsize - 9);
	bio->bi_bdev = log->bdev;
	bio->bi_io_vec[0].bv_page = bp->l_page;
	bio->bi_io_vec[0].bv_len = LOGPSIZE;
	bio->bi_io_vec[0].bv_offset = bp->l_offset;

	bio->bi_vcnt = 1;
	bio->bi_idx = 0;
	bio->bi_size = LOGPSIZE;

	bio->bi_end_io = lbmIODone;
	bio->bi_private = bp;

	/* check if journaling to disk has been disabled */
	if (log->no_integrity) {
		bio->bi_size = 0;
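		/* zero residual count: lbmIODone() then treats the bio as
		 * fully completed without any disk i/o having been issued */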
		lbmIODone(bio, 0, 0);
	} else {
		submit_bio(WRITE_SYNC, bio);
		INCREMENT(lmStat.submitted);
	}
}


/*
 *	lbmIOWait()
 */
static int lbmIOWait(struct lbuf * bp, int flag)
{
	unsigned long flags;
	int rc = 0;

	jfs_info("lbmIOWait1: bp:0x%p flag:0x%x:0x%x", bp, bp->l_flag, flag);

	LCACHE_LOCK(flags);		/* disable+lock */

	LCACHE_SLEEP_COND(bp->l_ioevent, (bp->l_flag & lbmDONE), flags);

	rc = (bp->l_flag & lbmERROR) ? -EIO : 0;

	if (flag & lbmFREE)
		lbmfree(bp);

	LCACHE_UNLOCK(flags);	/* unlock+enable */

	jfs_info("lbmIOWait2: bp:0x%p flag:0x%x:0x%x", bp, bp->l_flag, flag);
	return rc;
}

/*
 *	lbmIODone()
 *
 * executed at INTIODONE level
 */
static int lbmIODone(struct bio *bio, unsigned int bytes_done, int error)
{
	struct lbuf *bp = bio->bi_private;
	struct lbuf *nextbp, *tail;
	struct jfs_log *log;
	unsigned long flags;

	if (bio->bi_size)
		return 1;
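	/* (a non-zero bi_size means bytes are still outstanding; the
	 *  buffer is processed only on the final completion call) */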

	/*
	 * get back jfs buffer bound to the i/o buffer
	 */
	jfs_info("lbmIODone: bp:0x%p flag:0x%x", bp, bp->l_flag);

	LCACHE_LOCK(flags);		/* disable+lock */

	bp->l_flag |= lbmDONE;

	if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
		bp->l_flag |= lbmERROR;

		jfs_err("lbmIODone: I/O error in JFS log");
	}

	bio_put(bio);

	/*
	 *      pagein completion
	 */
	if (bp->l_flag & lbmREAD) {
		bp->l_flag &= ~lbmREAD;

		LCACHE_UNLOCK(flags);	/* unlock+enable */

		/* wakeup I/O initiator */
		LCACHE_WAKEUP(&bp->l_ioevent);

		return 0;
	}

	/*
	 *      pageout completion
	 *
	 * the bp at the head of write queue has completed pageout.
	 *
	 * if single-commit/full-page pageout, remove the current buffer
	 * from head of pageout queue, and redrive pageout with
	 * the new buffer at head of pageout queue;
	 * otherwise, the partial-page pageout buffer stays at
	 * the head of pageout queue to be redriven for pageout
	 * by lmGroupCommit() until full-page pageout is completed.
	 */
	bp->l_flag &= ~lbmWRITE;
	INCREMENT(lmStat.pagedone);

	/* update committed lsn */
	log = bp->l_log;
	log->clsn = (bp->l_pn << L2LOGPSIZE) + bp->l_ceor;
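	/* (the committed lsn is a byte offset into the log: the page
	 *  number scaled by the log page size plus the eor, i.e., the
	 *  end of the last committed record within that page) */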

	if (bp->l_flag & lbmDIRECT) {
		LCACHE_WAKEUP(&bp->l_ioevent);
		LCACHE_UNLOCK(flags);
		return 0;
	}

	tail = log->wqueue;

	/* single element queue */
	if (bp == tail) {
		/* remove head buffer of full-page pageout
		 * from log device write queue
		 */
		if (bp->l_flag & lbmRELEASE) {
			log->wqueue = NULL;
			bp->l_wqnext = NULL;
		}
	}
	/* multi element queue */
	else {
		/* remove head buffer of full-page pageout
		 * from log device write queue
		 */
		if (bp->l_flag & lbmRELEASE) {
			nextbp = tail->l_wqnext = bp->l_wqnext;
			bp->l_wqnext = NULL;

			/*
			 * redrive pageout of the next page at the head of
			 * the write queue: either a page without any bound
			 * tblk (i.e., a page with no COMMIT records), or
			 * the first page of a new group commit queued
			 * after the current page by lmGroupCommit(), as
			 * indicated by the lbmWRITE flag (subsequent
			 * pageout is performed synchronously, except for
			 * pages without any COMMITs);
			 */
			if (nextbp->l_flag & lbmWRITE) {
				/*
				 * We can't do the I/O at interrupt time.
				 * The jfsIO thread can do it
				 */
				lbmRedrive(nextbp);
			}
		}
	}

	/*
	 *      synchronous pageout:
	 *
	 * buffer has not necessarily been removed from write queue
	 * (e.g., synchronous write of partial-page with COMMIT):
	 * leave buffer for i/o initiator to dispose
	 */
	if (bp->l_flag & lbmSYNC) {
		LCACHE_UNLOCK(flags);	/* unlock+enable */

		/* wakeup I/O initiator */
		LCACHE_WAKEUP(&bp->l_ioevent);
	}

	/*
	 *      Group Commit pageout:
	 */
	else if (bp->l_flag & lbmGC) {
		LCACHE_UNLOCK(flags);
		lmPostGC(bp);
	}

	/*
	 *      asynchronous pageout:
	 *
	 * buffer must have been removed from write queue:
	 * insert buffer at head of freelist where it can be recycled
	 */
	else {
		assert(bp->l_flag & lbmRELEASE);
		assert(bp->l_flag & lbmFREE);
		lbmfree(bp);

		LCACHE_UNLOCK(flags);	/* unlock+enable */
	}

	return 0;
}

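/*
 * NAME:	jfsIOWait()
 *
 * FUNCTION:	main loop of the jfsIO kernel thread: drain the redrive
 *		list, starting the deferred pageouts queued by
 *		lbmRedrive(), then sleep until woken or told to stop.
 */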
int jfsIOWait(void *arg)
{
	struct lbuf *bp;

	do {
		spin_lock_irq(&log_redrive_lock);
		while ((bp = log_redrive_list) != NULL) {
			log_redrive_list = bp->l_redrive_next;
			bp->l_redrive_next = NULL;
			spin_unlock_irq(&log_redrive_lock);
			lbmStartIO(bp);
			spin_lock_irq(&log_redrive_lock);
		}

		if (freezing(current)) {
			spin_unlock_irq(&log_redrive_lock);
			refrigerator();
		} else {
			set_current_state(TASK_INTERRUPTIBLE);
			spin_unlock_irq(&log_redrive_lock);
			schedule();
			__set_current_state(TASK_RUNNING);
		}
	} while (!kthread_should_stop());

	jfs_info("jfsIOWait being killed!");
	return 0;
}

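/*
 * NAME:	lmLogFormat()
 *
 * FUNCTION:	format file system log
 *
 * PARAMETERS:
 *	log	- volume log
 *	logAddress - start address of log space in FS block
 *	logSize	- length of log space in FS block;
 *
 * RETURN:	0	- success
 *		-EIO	- i/o error
 */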
int lmLogFormat(struct jfs_log *log, s64 logAddress, int logSize)
{
	int rc = -EIO;
	struct jfs_sb_info *sbi;
	struct logsuper *logsuper;
	struct logpage *lp;
	int lspn;		/* log sequence page number */
	struct lrd *lrd_ptr;
	int npages = 0;
	struct lbuf *bp;

	jfs_info("lmLogFormat: logAddress:%Ld logSize:%d",
		 (long long)logAddress, logSize);

	sbi = list_entry(log->sb_list.next, struct jfs_sb_info, log_list);

	/* allocate a log buffer */
	bp = lbmAllocate(log, 1);

	npages = logSize >> sbi->l2nbperpage;
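	/* (logSize is in fs blocks; l2nbperpage is log2 of the fs blocks
	 *  per 4K log page, e.g. 3 for 512-byte blocks, so the shift
	 *  converts the block count to a log page count) */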

	/*
	 *      log space:
	 *
	 * page 0 - reserved;
	 * page 1 - log superblock;
	 * page 2 - log data page: a SYNCPT log record is written
	 *          into this page at logform time;
	 * pages 3 to (npages-1) - log data pages: set to empty log
	 *          data pages;
	 */
	/*
	 *      init log superblock: log page 1
	 */
	logsuper = (struct logsuper *) bp->l_ldata;

	logsuper->magic = cpu_to_le32(LOGMAGIC);
	logsuper->version = cpu_to_le32(LOGVERSION);
	logsuper->state = cpu_to_le32(LOGREDONE);
	logsuper->flag = cpu_to_le32(sbi->mntflag);	/* ? */
	logsuper->size = cpu_to_le32(npages);
	logsuper->bsize = cpu_to_le32(sbi->bsize);
	logsuper->l2bsize = cpu_to_le32(sbi->l2bsize);
	logsuper->end = cpu_to_le32(2 * LOGPSIZE + LOGPHDRSIZE + LOGRDSIZE);

	bp->l_flag = lbmWRITE | lbmSYNC | lbmDIRECT;
	bp->l_blkno = logAddress + sbi->nbperpage;
	lbmStartIO(bp);
	if ((rc = lbmIOWait(bp, 0)))
		goto exit;

	/*
	 *      init pages 2 to npages-1 as log data pages:
	 *
	 * log sequence page number (lspn) initialization:
	 *
	 * pn:   0     1     2     3                 n-1
	 *       +-----+-----+=====+=====+===.....===+=====+
	 * lspn:             N-1   0     1           N-2
	 *                   <--- N page circular file ---->
	 *
	 * the N (= npages-2) data pages of the log are maintained as
	 * a circular file for the log records;
	 * lspn grows by 1 monotonically as each log page is written
	 * to the circular file of the log;
	 * setLogpage() will not reset the page number even if
	 * the eor is equal to LOGPHDRSIZE, so for the binary search
	 * in the find-log-end process to still work, we have to
	 * simulate the log wrap situation at log format time.
	 * The 1st log page written will have the highest lspn; the
	 * succeeding log pages will have lspn values in ascending
	 * order from 0 to (N-2).
	 */
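	/*
	 * (worked example: with npages = 16 there are N = 14 data pages;
	 * page 2 is formatted with lspn = N - 1 = 13 and pages 3..15
	 * with lspn = 0..12, so the find-log-end binary search sees
	 * exactly one wrap point, as it would in a log that had wrapped.)
	 */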
	lp = (struct logpage *) bp->l_ldata;
	/*
	 * initialize the 1st log page to be written: lspn = N - 1
	 * (= npages - 3, since N = npages - 2);
	 * a SYNCPT log record is written to this page
	 */
	lp->h.page = lp->t.page = cpu_to_le32(npages - 3);
	lp->h.eor = lp->t.eor = cpu_to_le16(LOGPHDRSIZE + LOGRDSIZE);

	lrd_ptr = (struct lrd *) &lp->data;
	lrd_ptr->logtid = 0;
	lrd_ptr->backchain = 0;
	lrd_ptr->type = cpu_to_le16(LOG_SYNCPT);
	lrd_ptr->length = 0;
	lrd_ptr->log.syncpt.sync = 0;

	bp->l_blkno += sbi->nbperpage;
	bp->l_flag = lbmWRITE | lbmSYNC | lbmDIRECT;
	lbmStartIO(bp);
	if ((rc = lbmIOWait(bp, 0)))
		goto exit;

	/*
	 *      initialize succeeding log pages: lspn = 0, 1, ..., (N-2)
	 */
	for (lspn = 0; lspn < npages - 3; lspn++) {
		lp->h.page = lp->t.page = cpu_to_le32(lspn);
		lp->h.eor = lp->t.eor = cpu_to_le16(LOGPHDRSIZE);

		bp->l_blkno += sbi->nbperpage;
		bp->l_flag = lbmWRITE | lbmSYNC | lbmDIRECT;
		lbmStartIO(bp);
		if ((rc = lbmIOWait(bp, 0)))
			goto exit;
	}

	rc = 0;
exit:
	/*
	 *      finalize log
	 */
	/* release the buffer */
	lbmFree(bp);

	return rc;
}

#ifdef CONFIG_JFS_STATISTICS
int jfs_lmstats_read(char *buffer, char **start, off_t offset, int length,
		      int *eof, void *data)
{
	int len = 0;
	off_t begin;

	len += sprintf(buffer,
		       "JFS Logmgr stats\n"
		       "================\n"
		       "commits = %d\n"
		       "writes submitted = %d\n"
		       "writes completed = %d\n"
		       "full pages submitted = %d\n"
		       "partial pages submitted = %d\n",
		       lmStat.commit,
		       lmStat.submitted,
		       lmStat.pagedone,
		       lmStat.full_page,
		       lmStat.partial_page);

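	/* hand back only the window [offset, offset+length) of the
	 * formatted text, per the old procfs read_proc contract */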
	begin = offset;
	*start = buffer + begin;
	len -= begin;

	if (len > length)
		len = length;
	else
		*eof = 1;

	if (len < 0)
		len = 0;

	return len;
}
#endif /* CONFIG_JFS_STATISTICS */
