1/*
2 *   Copyright (C) International Business Machines Corp., 2000-2004
3 *   Portions Copyright (C) Christoph Hellwig, 2001-2002
4 *
5 *   This program is free software;  you can redistribute it and/or modify
6 *   it under the terms of the GNU General Public License as published by
7 *   the Free Software Foundation; either version 2 of the License, or
8 *   (at your option) any later version.
9 *
10 *   This program is distributed in the hope that it will be useful,
11 *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
12 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
13 *   the GNU General Public License for more details.
14 *
15 *   You should have received a copy of the GNU General Public License
16 *   along with this program;  if not, write to the Free Software
17 *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19#ifndef	_H_JFS_LOGMGR
20#define _H_JFS_LOGMGR
21
22#include "jfs_filsys.h"
23#include "jfs_lock.h"
24
25/*
26 *	log manager configuration parameters
27 */
28
/*
 * Log page geometry: LOGPSIZE is the log page size in bytes and
 * L2LOGPSIZE is its base-2 logarithm.
 */
#define LOGPSIZE	4096
#define L2LOGPSIZE	12

/* number of log pages per mounted file system */
#define LOGPAGES	16
34
35/*
36 *	log logical volume
37 *
38 * a log is used to make the commit operation on journalled
39 * files within the same logical volume group atomic.
40 * a log is implemented with a logical volume.
41 * there is one log per logical volume group.
42 *
43 * block 0 of the log logical volume is not used (ipl etc).
44 * block 1 contains a log "superblock" and is used by logFormat(),
45 * lmLogInit(), lmLogShutdown(), and logRedo() to record status
46 * of the log but is not otherwise used during normal processing.
47 * blocks 2 - (N-1) are used to contain log records.
48 *
49 * when a volume group is varied-on-line, logRedo() must have
50 * been executed before the file systems (logical volumes) in
51 * the volume group can be mounted.
52 */
53/*
54 *	log superblock (block 1 of logical volume)
55 */
/* block numbers within the log logical volume */
#define LOGSUPER_B	1	/* block holding the log superblock */
#define LOGSTART_B	2	/* first block that contains log records */

/* log superblock identification */
#define LOGMAGIC	0x87654321
#define LOGVERSION	1

/* maximum number of active file systems sharing one log */
#define MAX_ACTIVE	128
63
64struct logsuper {
65	__le32 magic;		/* 4: log lv identifier */
66	__le32 version;		/* 4: version number */
67	__le32 serial;		/* 4: log open/mount counter */
68	__le32 size;		/* 4: size in number of LOGPSIZE blocks */
69	__le32 bsize;		/* 4: logical block size in byte */
70	__le32 l2bsize;		/* 4: log2 of bsize */
71
72	__le32 flag;		/* 4: option */
73	__le32 state;		/* 4: state - see below */
74
75	__le32 end;		/* 4: addr of last log record set by logredo */
76	char uuid[16];		/* 16: 128-bit journal uuid */
77	char label[16];		/* 16: journal label */
78	struct {
79		char uuid[16];
80	} active[MAX_ACTIVE];	/* 2048: active file systems list */
81};
82
/* all-zero uuid: sixteen explicit NUL bytes (the literal adds its own terminator) */
#define NULL_UUID	"\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
84
85/* log flag: commit option (see jfs_filsys.h) */
86
87/* log state */
/* log mounted by lmLogInit() */
#define LOGMOUNT	0
/* log shut down by lmLogShutdown(), or log redo completed by logredo() */
#define LOGREDONE	1
/* log wrapped */
#define LOGWRAP		2
/* log read error detected in logredo() */
#define LOGREADERR	3
94
95
96/*
97 *	log logical page
98 *
99 * (this comment should be rewritten !)
100 * the header and trailer structures (h,t) will normally have
101 * the same page and eor value.
102 * An exception to this occurs when a complete page write is not
103 * accomplished on a power failure. Since the hardware may "split write"
104 * sectors in the page, any out of order sequence may occur during powerfail
105 * and needs to be recognized during log replay.  The xor value is
106 * an "exclusive or" of all log words in the page up to eor.  This
107 * 32 bit eor is stored with the top 16 bits in the header and the
108 * bottom 16 bits in the trailer.  logredo can easily recognize pages
109 * that were not completed by reconstructing this eor and checking
110 * the log page.
111 *
112 * Previous versions of the operating system did not allow split
113 * writes and detected partially written records in logredo by
114 * ordering the updates to the header, trailer, and the move of data
115 * into the logdata area.  The order: (1) data is moved (2) header
116 * is updated (3) trailer is updated.  In logredo, when the header
117 * differed from the trailer, the header and trailer were reconciled
118 * as follows: if h.page != t.page they were set to the smaller of
119 * the two and h.eor and t.eor set to 8 (i.e. empty page). if (only)
120 * h.eor != t.eor they were set to the smaller of their two values.
121 */
122struct logpage {
123	struct {		/* header */
124		__le32 page;	/* 4: log sequence page number */
125		__le16 rsrvd;	/* 2: */
126		__le16 eor;	/* 2: end-of-log offset of lasrt record write */
127	} h;
128
129	__le32 data[LOGPSIZE / 4 - 4];	/* log record area */
130
131	struct {		/* trailer */
132		__le32 page;	/* 4: normally the same as h.page */
133		__le16 rsrvd;	/* 2: */
134		__le16 eor;	/* 2: normally the same as h.eor */
135	} t;
136};
137
/* sizes, in bytes, of the log page header and trailer */
#define LOGPHDRSIZE	8
#define LOGPTLRSIZE	8
140
141
142/*
143 *	log record
144 *
145 * (this comment should be rewritten !)
146 * jfs uses only "after" log records (only a single writer is allowed
 * in a page, pages are written to temporary paging space if
 * they must be written to disk before commit, and i/o is
149 * scheduled for modified pages to their home location after
150 * the log records containing the after values and the commit
151 * record is written to the log on disk, undo discards the copy
152 * in main-memory.)
153 *
154 * a log record consists of a data area of variable length followed by
155 * a descriptor of fixed size LOGRDSIZE bytes.
156 * the  data area is rounded up to an integral number of 4-bytes and
157 * must be no longer than LOGPSIZE.
158 * the descriptor is of size of multiple of 4-bytes and aligned on a
159 * 4-byte boundary.
160 * records are packed one after the other in the data area of log pages.
161 * (sometimes a DUMMY record is inserted so that at least one record ends
162 * on every page or the longest record is placed on at most two pages).
163 * the field eor in page header/trailer points to the byte following
164 * the last record on a page.
165 */
166
/*
 * log record types
 */
#define LOG_COMMIT		0x8000
#define LOG_SYNCPT		0x4000
#define LOG_MOUNT		0x2000
#define LOG_REDOPAGE		0x0800
#define LOG_NOREDOPAGE		0x0080
#define LOG_NOREDOINOEXT	0x0040
#define LOG_UPDATEMAP		0x0008
#define LOG_NOREDOFILE		0x0001

/*
 * data types for REDOPAGE/NOREDOPAGE log records
 */
#define LOG_INODE		0x0001
#define LOG_XTREE		0x0002
#define LOG_DTREE		0x0004
#define LOG_BTROOT		0x0010
#define LOG_EA			0x0020
#define LOG_ACL			0x0040
#define LOG_DATA		0x0080
#define LOG_NEW			0x0100
#define LOG_EXTEND		0x0200
#define LOG_RELOCATE		0x0400
/* the xtree lives in a directory inode */
#define LOG_DIR_XTREE		0x0800

/*
 * descriptor types for UPDATEMAP log records
 */
#define LOG_ALLOCXADLIST	0x0080
#define LOG_ALLOCPXDLIST	0x0040
#define LOG_ALLOCXAD		0x0020
#define LOG_ALLOCPXD		0x0010
#define LOG_FREEXADLIST		0x0008
#define LOG_FREEPXDLIST		0x0004
#define LOG_FREEXAD		0x0002
#define LOG_FREEPXD		0x0001
199
200
201struct lrd {
202	/*
203	 * type independent area
204	 */
205	__le32 logtid;		/* 4: log transaction identifier */
206	__le32 backchain;	/* 4: ptr to prev record of same transaction */
207	__le16 type;		/* 2: record type */
208	__le16 length;		/* 2: length of data in record (in byte) */
209	__le32 aggregate;	/* 4: file system lv/aggregate */
210	/* (16) */
211
212	/*
213	 * type dependent area (20)
214	 */
215	union {
216
217		/*
218		 *      COMMIT: commit
219		 *
220		 * transaction commit: no type-dependent information;
221		 */
222
223		/*
224		 *      REDOPAGE: after-image
225		 *
226		 * apply after-image;
227		 *
228		 * N.B. REDOPAGE, NOREDOPAGE, and UPDATEMAP must be same format;
229		 */
230		struct {
231			__le32 fileset;	/* 4: fileset number */
232			__le32 inode;	/* 4: inode number */
233			__le16 type;	/* 2: REDOPAGE record type */
234			__le16 l2linesize;	/* 2: log2 of line size */
235			pxd_t pxd;	/* 8: on-disk page pxd */
236		} redopage;	/* (20) */
237
238		/*
239		 *      NOREDOPAGE: the page is freed
240		 *
241		 * do not apply after-image records which precede this record
242		 * in the log with the same page block number to this page.
243		 *
244		 * N.B. REDOPAGE, NOREDOPAGE, and UPDATEMAP must be same format;
245		 */
246		struct {
247			__le32 fileset;	/* 4: fileset number */
248			__le32 inode;	/* 4: inode number */
249			__le16 type;	/* 2: NOREDOPAGE record type */
250			__le16 rsrvd;	/* 2: reserved */
251			pxd_t pxd;	/* 8: on-disk page pxd */
252		} noredopage;	/* (20) */
253
254		/*
255		 *      UPDATEMAP: update block allocation map
256		 *
257		 * either in-line PXD,
258		 * or     out-of-line  XADLIST;
259		 *
260		 * N.B. REDOPAGE, NOREDOPAGE, and UPDATEMAP must be same format;
261		 */
262		struct {
263			__le32 fileset;	/* 4: fileset number */
264			__le32 inode;	/* 4: inode number */
265			__le16 type;	/* 2: UPDATEMAP record type */
266			__le16 nxd;	/* 2: number of extents */
267			pxd_t pxd;	/* 8: pxd */
268		} updatemap;	/* (20) */
269
270		/*
271		 *      NOREDOINOEXT: the inode extent is freed
272		 *
273		 * do not apply after-image records which precede this
274		 * record in the log with the any of the 4 page block
275		 * numbers in this inode extent.
276		 *
277		 * NOTE: The fileset and pxd fields MUST remain in
278		 *       the same fields in the REDOPAGE record format.
279		 *
280		 */
281		struct {
282			__le32 fileset;	/* 4: fileset number */
283			__le32 iagnum;	/* 4: IAG number     */
284			__le32 inoext_idx;	/* 4: inode extent index */
285			pxd_t pxd;	/* 8: on-disk page pxd */
286		} noredoinoext;	/* (20) */
287
288		/*
289		 *      SYNCPT: log sync point
290		 *
291		 * replay log upto syncpt address specified;
292		 */
293		struct {
294			__le32 sync;	/* 4: syncpt address (0 = here) */
295		} syncpt;
296
297		/*
298		 *      MOUNT: file system mount
299		 *
300		 * file system mount: no type-dependent information;
301		 */
302
303		/*
304		 *      ? FREEXTENT: free specified extent(s)
305		 *
306		 * free specified extent(s) from block allocation map
307		 * N.B.: nextents should be length of data/sizeof(xad_t)
308		 */
309		struct {
310			__le32 type;	/* 4: FREEXTENT record type */
311			__le32 nextent;	/* 4: number of extents */
312
313			/* data: PXD or XAD list */
314		} freextent;
315
316		/*
317		 *      ? NOREDOFILE: this file is freed
318		 *
319		 * do not apply records which precede this record in the log
320		 * with the same inode number.
321		 *
322		 * NOREDOFILE must be the first to be written at commit
323		 * (last to be read in logredo()) - it prevents
324		 * replay of preceding updates of all preceding generations
325		 * of the inumber esp. the on-disk inode itself.
326		 */
327		struct {
328			__le32 fileset;	/* 4: fileset number */
329			__le32 inode;	/* 4: inode number */
330		} noredofile;
331
332		/*
333		 *      ? NEWPAGE:
334		 *
335		 * metadata type dependent
336		 */
337		struct {
338			__le32 fileset;	/* 4: fileset number */
339			__le32 inode;	/* 4: inode number */
340			__le32 type;	/* 4: NEWPAGE record type */
341			pxd_t pxd;	/* 8: on-disk page pxd */
342		} newpage;
343
344		/*
345		 *      ? DUMMY: filler
346		 *
347		 * no type-dependent information
348		 */
349	} log;
350};					/* (36) */
351
352#define	LOGRDSIZE	(sizeof(struct lrd))
353
354/*
355 *	line vector descriptor
356 */
357struct lvd {
358	__le16 offset;
359	__le16 length;
360};
361
362
363/*
364 *	log logical volume
365 */
366struct jfs_log {
367
368	struct list_head sb_list;/*  This is used to sync metadata
369				 *    before writing syncpt.
370				 */
371	struct list_head journal_list; /* Global list */
372	struct block_device *bdev; /* 4: log lv pointer */
373	int serial;		/* 4: log mount serial number */
374
375	s64 base;		/* @8: log extent address (inline log ) */
376	int size;		/* 4: log size in log page (in page) */
377	int l2bsize;		/* 4: log2 of bsize */
378
379	long flag;		/* 4: flag */
380
381	struct lbuf *lbuf_free;	/* 4: free lbufs */
382	wait_queue_head_t free_wait;	/* 4: */
383
384	/* log write */
385	int logtid;		/* 4: log tid */
386	int page;		/* 4: page number of eol page */
387	int eor;		/* 4: eor of last record in eol page */
388	struct lbuf *bp;	/* 4: current log page buffer */
389
390	struct mutex loglock;	/* 4: log write serialization lock */
391
392	/* syncpt */
393	int nextsync;		/* 4: bytes to write before next syncpt */
394	int active;		/* 4: */
395	wait_queue_head_t syncwait;	/* 4: */
396
397	/* commit */
398	uint cflag;		/* 4: */
399	struct list_head cqueue; /* FIFO commit queue */
400	struct tblock *flush_tblk; /* tblk we're waiting on for flush */
401	int gcrtc;		/* 4: GC_READY transaction count */
402	struct tblock *gclrt;	/* 4: latest GC_READY transaction */
403	spinlock_t gclock;	/* 4: group commit lock */
404	int logsize;		/* 4: log data area size in byte */
405	int lsn;		/* 4: end-of-log */
406	int clsn;		/* 4: clsn */
407	int syncpt;		/* 4: addr of last syncpt record */
408	int sync;		/* 4: addr from last logsync() */
409	struct list_head synclist;	/* 8: logsynclist anchor */
410	spinlock_t synclock;	/* 4: synclist lock */
411	struct lbuf *wqueue;	/* 4: log pageout queue */
412	int count;		/* 4: count */
413	char uuid[16];		/* 16: 128-bit uuid of log device */
414
415	int no_integrity;	/* 3: flag to disable journaling to disk */
416};
417
418/*
419 * Log flag
420 */
#define log_INLINELOG		1
#define log_SYNCBARRIER		2
#define log_QUIESCE		3
#define log_FLUSH		4
425
426/*
427 * group commit flag
428 */
429/* jfs_log */
#define logGC_PAGEOUT		0x00000001

/*
 * group commit flags for tblock/lbuf
 */
#define tblkGC_QUEUE		0x0001
#define tblkGC_READY		0x0002
#define tblkGC_COMMIT		0x0004
#define tblkGC_COMMITTED	0x0008
#define tblkGC_EOP		0x0010
#define tblkGC_FREE		0x0020
#define tblkGC_LEADER		0x0040
#define tblkGC_ERROR		0x0080
#define tblkGC_LAZY		0x0100	/* D230860 */
#define tblkGC_UNLOCKED		0x0200	/* D230860 */
443
444/*
445 *		log cache buffer header
446 */
447struct lbuf {
448	struct jfs_log *l_log;	/* 4: log associated with buffer */
449
450	/*
451	 * data buffer base area
452	 */
453	uint l_flag;		/* 4: pageout control flags */
454
455	struct lbuf *l_wqnext;	/* 4: write queue link */
456	struct lbuf *l_freelist;	/* 4: freelistlink */
457
458	int l_pn;		/* 4: log page number */
459	int l_eor;		/* 4: log record eor */
460	int l_ceor;		/* 4: committed log record eor */
461
462	s64 l_blkno;		/* 8: log page block number */
463	caddr_t l_ldata;	/* 4: data page */
464	struct page *l_page;	/* The page itself */
465	uint l_offset;		/* Offset of l_ldata within the page */
466
467	wait_queue_head_t l_ioevent;	/* 4: i/o done event */
468};
469
470/* Reuse l_freelist for redrive list */
471#define l_redrive_next l_freelist
472
473/*
474 *	logsynclist block
475 *
476 * common logsyncblk prefix for jbuf_t and tblock
477 */
478struct logsyncblk {
479	u16 xflag;		/* flags */
480	u16 flag;		/* only meaninful in tblock */
481	lid_t lid;		/* lock id */
482	s32 lsn;		/* log sequence number */
483	struct list_head synclist;	/* log sync list link */
484};
485
486/*
487 *	logsynclist serialization (per log)
488 */
489
/* initialise the synclock spinlock of log */
#define LOGSYNC_LOCK_INIT(log) \
	spin_lock_init(&(log)->synclock)

/* acquire synclock via spin_lock_irqsave(), using flags */
#define LOGSYNC_LOCK(log, flags) \
	spin_lock_irqsave(&(log)->synclock, flags)

/* release synclock via spin_unlock_irqrestore(), using flags */
#define LOGSYNC_UNLOCK(log, flags) \
	spin_unlock_irqrestore(&(log)->synclock, flags)
494
/*
 * logdiff - compute the difference in bytes of lsn from the sync point
 * @diff: signed integer lvalue that receives the result
 * @lsn:  log address to measure
 * @log:  pointer to an object with syncpt and logsize members
 *
 * The result is (lsn - log->syncpt); when that is negative, log->logsize
 * is added to it once.
 *
 * The body is wrapped in do { } while (0) so that "logdiff(...);" is a
 * single statement and stays valid in an unbraced if/else.  Invoke it
 * with a trailing semicolon.  diff and log are evaluated more than once,
 * so neither may have side effects.
 */
#define logdiff(diff, lsn, log)				\
do {							\
	(diff) = (lsn) - (log)->syncpt;			\
	if ((diff) < 0)					\
		(diff) += (log)->logsize;		\
} while (0)
502
503extern int lmLogOpen(struct super_block *sb);
504extern int lmLogClose(struct super_block *sb);
505extern int lmLogShutdown(struct jfs_log * log);
506extern int lmLogInit(struct jfs_log * log);
507extern int lmLogFormat(struct jfs_log *log, s64 logAddress, int logSize);
508extern int lmGroupCommit(struct jfs_log *, struct tblock *);
509extern int jfsIOWait(void *);
510extern void jfs_flush_journal(struct jfs_log * log, int wait);
511extern void jfs_syncpt(struct jfs_log *log, int hard_sync);
512
513#endif				/* _H_JFS_LOGMGR */
514