1/*
2 * linux/fs/recovery.c
3 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
5 *
6 * Copyright 1999-2000 Red Hat Software --- All Rights Reserved
7 *
8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference.
11 *
12 * Journal recovery routines for the generic filesystem journaling code;
13 * part of the ext2fs journaling system.
14 */
15
16#include <linux/time.h>
17#include <linux/fs.h>
18#include <linux/errno.h>
19#include <linux/slab.h>
20#include "hfsplus_jbd.h"
21
22/*
23 * Maintain information about the progress of the recovery job, so that
24 * the different passes can carry information between them.
25 */
26struct recovery_info
27{
28	hfsplus_jbd_tid_t		start_transaction;
29	hfsplus_jbd_tid_t		end_transaction;
30
31	int		nr_replays;
32	int		nr_revokes;
33	int		nr_revoke_hits;
34};
35
36enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY};
37static int do_one_pass(hfsplus_jbd_t *journal,
38				struct recovery_info *info, enum passtype pass);
39static int scan_revoke_records(hfsplus_jbd_t *, struct buffer_head *,
40				hfsplus_jbd_tid_t, struct recovery_info *);
41
42#ifdef __KERNEL__
43
/*
 * Drop the reference held on each of the first @n buffers in @b,
 * working from the last entry back to the first.
 */
void hfsplus_jbd_brelse_array(struct buffer_head *b[], int n)
{
	int idx;

	for (idx = n - 1; idx >= 0; idx--)
		brelse(b[idx]);
}
50
51
52/*
53 * When reading from the journal, we are going through the block device
54 * layer directly and so there is no readahead being done for us.  We
55 * need to implement any readahead ourselves if we want it to happen at
56 * all.  Recovery is basically one long sequential read, so make sure we
57 * do the IO in reasonably large chunks.
58 *
59 * This is not so critical that we need to be enormously clever about
60 * the readahead size, though.  128K is a purely arbitrary, good-enough
61 * fixed value.
62 */
63
64#define MAXBUF 8
65static int do_readahead(hfsplus_jbd_t *journal, unsigned int start)
66{
67	int err;
68	unsigned int max, nbufs, next;
69	unsigned long blocknr;
70	struct buffer_head *bh;
71
72	struct buffer_head * bufs[MAXBUF];
73
74	/* Do up to 128K of readahead */
75	max = start + (128 * 1024 / journal->j_blocksize);
76	if (max > journal->j_maxlen)
77		max = journal->j_maxlen;
78
79	/* Do the readahead itself.  We'll submit MAXBUF buffer_heads at
80	 * a time to the block device IO layer. */
81
82	nbufs = 0;
83
84	for (next = start; next < max; next++) {
85		err = hfsplus_jbd_bmap(journal, next, &blocknr);
86
87		if (err) {
88			printk (KERN_ERR "JBD: bad block at offset %u\n",
89				next);
90			goto failed;
91		}
92
93		bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
94		if (!bh) {
95			err = -ENOMEM;
96			goto failed;
97		}
98
99		if (!buffer_uptodate(bh) && !buffer_locked(bh)) {
100			bufs[nbufs++] = bh;
101			if (nbufs == MAXBUF) {
102				ll_rw_block(READ, nbufs, bufs);
103				hfsplus_jbd_brelse_array(bufs, nbufs);
104				nbufs = 0;
105			}
106		} else
107			brelse(bh);
108	}
109
110	if (nbufs)
111		ll_rw_block(READ, nbufs, bufs);
112	err = 0;
113
114failed:
115	if (nbufs)
116		hfsplus_jbd_brelse_array(bufs, nbufs);
117	return err;
118}
119
120#endif /* __KERNEL__ */
121
122
123/*
124 * Read a block from the journal
125 */
126
127static int jread(struct buffer_head **bhp, hfsplus_jbd_t *journal,
128		 unsigned int offset)
129{
130	int err;
131	unsigned long blocknr;
132	struct buffer_head *bh;
133
134	*bhp = NULL;
135
136	if (offset >= journal->j_maxlen) {
137		printk(KERN_ERR "JBD: corrupted journal superblock\n");
138		return -EIO;
139	}
140
141	err = hfsplus_jbd_bmap(journal, offset, &blocknr);
142
143	if (err) {
144		printk (KERN_ERR "JBD: bad block at offset %u\n",
145			offset);
146		return err;
147	}
148
149	bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
150	if (!bh)
151		return -ENOMEM;
152
153	if (!buffer_uptodate(bh)) {
154		/* If this is a brand new buffer, start readahead.
155                   Otherwise, we assume we are already reading it.  */
156		if (!buffer_req(bh))
157			do_readahead(journal, offset);
158		wait_on_buffer(bh);
159	}
160
161	if (!buffer_uptodate(bh)) {
162		printk (KERN_ERR "JBD: Failed to read block at offset %u\n",
163			offset);
164		brelse(bh);
165		return -EIO;
166	}
167
168	*bhp = bh;
169	return 0;
170}
171
172
173/*
174 * Count the number of in-use tags in a journal descriptor block.
175 */
176
177static int count_tags(struct buffer_head *bh, int size)
178{
179	char *			tagp;
180	hfsplus_jbd_block_tag_t *	tag;
181	int			nr = 0;
182
183	tagp = &bh->b_data[sizeof(hfsplus_jbd_header_t)];
184
185	while ((tagp - bh->b_data + sizeof(hfsplus_jbd_block_tag_t)) <= size) {
186		tag = (hfsplus_jbd_block_tag_t *) tagp;
187
188		nr++;
189		tagp += sizeof(hfsplus_jbd_block_tag_t);
190		if (!(tag->t_flags & cpu_to_be32(JFS_FLAG_SAME_UUID)))
191			tagp += 16;
192
193		if (tag->t_flags & cpu_to_be32(JFS_FLAG_LAST_TAG))
194			break;
195	}
196
197	return nr;
198}
199
200
/*
 * Log positions wrap around: once @var has reached or passed j_last it
 * is pulled back by the distance between j_first and j_last.
 */
#define wrap(journal, var)						\
do {									\
	if ((var) >= (journal)->j_last)					\
		(var) -= (journal)->j_last - (journal)->j_first;	\
} while (0)
207
208/**
209 * hfsplus_jbd_recover - recovers a on-disk journal
210 * @journal: the journal to recover
211 *
212 * The primary function for recovering the log contents when mounting a
213 * journaled device.
214 *
215 * Recovery is done in three passes.  In the first pass, we look for the
216 * end of the log.  In the second, we assemble the list of revoke
217 * blocks.  In the third and final pass, we replay any un-revoked blocks
218 * in the log.
219 */
220int hfsplus_jbd_recover(hfsplus_jbd_t *journal)
221{
222	int			err;
223	hfsplus_jbd_superblock_t *	sb;
224
225	struct recovery_info	info;
226
227	memset(&info, 0, sizeof(info));
228	sb = journal->j_superblock;
229
230	/*
231	 * The journal superblock's s_start field (the current log head)
232	 * is always zero if, and only if, the journal was cleanly
233	 * unmounted.
234	 */
235
236	if (!sb->s_start) {
237		hfsplus_jbd_debug(1, "No recovery required, last transaction %d\n",
238			  be32_to_cpu(sb->s_sequence));
239		journal->j_transaction_sequence = be32_to_cpu(sb->s_sequence) + 1;
240		return 0;
241	}
242
243	err = do_one_pass(journal, &info, PASS_SCAN);
244	if (!err)
245		err = do_one_pass(journal, &info, PASS_REVOKE);
246	if (!err)
247		err = do_one_pass(journal, &info, PASS_REPLAY);
248
249	hfsplus_jbd_debug(0, "JBD: recovery, exit status %d, "
250		  "recovered transactions %u to %u\n",
251		  err, info.start_transaction, info.end_transaction);
252	hfsplus_jbd_debug(0, "JBD: Replayed %d and revoked %d/%d blocks\n",
253		  info.nr_replays, info.nr_revoke_hits, info.nr_revokes);
254
255	/* Restart the log at the next transaction ID, thus invalidating
256	 * any existing commit records in the log. */
257	journal->j_transaction_sequence = ++info.end_transaction;
258
259	hfsplus_jbd_clear_revoke(journal);
260	sync_blockdev(journal->j_fs_dev);
261	return err;
262}
263
264/**
265 * hfsplus_jbd_skip_recovery - Start journal and wipe exiting records
266 * @journal: journal to startup
267 *
268 * Locate any valid recovery information from the journal and set up the
269 * journal structures in memory to ignore it (presumably because the
270 * caller has evidence that it is out of date).
271 * This function does'nt appear to be exorted..
272 *
273 * We perform one pass over the journal to allow us to tell the user how
274 * much recovery information is being erased, and to let us initialise
275 * the journal transaction sequence numbers to the next unused ID.
276 */
277int hfsplus_jbd_skip_recovery(hfsplus_jbd_t *journal)
278{
279	int			err;
280	hfsplus_jbd_superblock_t *	sb;
281
282	struct recovery_info	info;
283
284	memset (&info, 0, sizeof(info));
285	sb = journal->j_superblock;
286
287	err = do_one_pass(journal, &info, PASS_SCAN);
288
289	if (err) {
290		printk(KERN_ERR "JBD: error %d scanning journal\n", err);
291		++journal->j_transaction_sequence;
292	} else {
293		int dropped = info.end_transaction - be32_to_cpu(sb->s_sequence);
294		hfsplus_jbd_debug(0,
295			  "JBD: ignoring %d transaction%s from the journal.\n",
296			  dropped, (dropped == 1) ? "" : "s");
297		journal->j_transaction_sequence = ++info.end_transaction;
298	}
299
300	journal->j_tail = 0;
301	return err;
302}
303
304static int do_one_pass(hfsplus_jbd_t *journal,
305			struct recovery_info *info, enum passtype pass)
306{
307	unsigned int		first_commit_ID, next_commit_ID;
308	unsigned long		next_log_block;
309	int			err, success = 0;
310	hfsplus_jbd_superblock_t *	sb;
311	hfsplus_jbd_header_t * 	tmp;
312	struct buffer_head *	bh;
313	unsigned int		sequence;
314	int			blocktype;
315
316	/* Precompute the maximum metadata descriptors in a descriptor block */
317	int			MAX_BLOCKS_PER_DESC;
318	MAX_BLOCKS_PER_DESC = ((journal->j_blocksize-sizeof(hfsplus_jbd_header_t))
319			       / sizeof(hfsplus_jbd_block_tag_t));
320
321	/*
322	 * First thing is to establish what we expect to find in the log
323	 * (in terms of transaction IDs), and where (in terms of log
324	 * block offsets): query the superblock.
325	 */
326
327	sb = journal->j_superblock;
328	next_commit_ID = be32_to_cpu(sb->s_sequence);
329	next_log_block = be32_to_cpu(sb->s_start);
330
331	first_commit_ID = next_commit_ID;
332	if (pass == PASS_SCAN)
333		info->start_transaction = first_commit_ID;
334
335	hfsplus_jbd_debug(1, "Starting recovery pass %d\n", pass);
336
337	/*
338	 * Now we walk through the log, transaction by transaction,
339	 * making sure that each transaction has a commit block in the
340	 * expected place.  Each complete transaction gets replayed back
341	 * into the main filesystem.
342	 */
343
344	while (1) {
345		int			flags;
346		char *			tagp;
347		hfsplus_jbd_block_tag_t *	tag;
348		struct buffer_head *	obh;
349		struct buffer_head *	nbh;
350
351		cond_resched();		/* We're under lock_kernel() */
352
353		/* If we already know where to stop the log traversal,
354		 * check right now that we haven't gone past the end of
355		 * the log. */
356
357		if (pass != PASS_SCAN)
358			if (hfsplus_tid_geq(next_commit_ID, info->end_transaction))
359				break;
360
361		hfsplus_jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n",
362			  next_commit_ID, next_log_block, journal->j_last);
363
364		/* Skip over each chunk of the transaction looking
365		 * either the next descriptor block or the final commit
366		 * record. */
367
368		hfsplus_jbd_debug(3, "JBD: checking block %ld\n", next_log_block);
369		err = jread(&bh, journal, next_log_block);
370		if (err)
371			goto failed;
372
373		next_log_block++;
374		wrap(journal, next_log_block);
375
376		/* What kind of buffer is it?
377		 *
378		 * If it is a descriptor block, check that it has the
379		 * expected sequence number.  Otherwise, we're all done
380		 * here. */
381
382		tmp = (hfsplus_jbd_header_t *)bh->b_data;
383
384		if (tmp->h_magic != cpu_to_be32(JFS_MAGIC_NUMBER)) {
385printk("@@@@@@@ Oops! .. Need to check it function: %s, Line: %d\n", __FUNCTION__, __LINE__);
386			brelse(bh);
387			break;
388		}
389
390		blocktype = be32_to_cpu(tmp->h_blocktype);
391		sequence = be32_to_cpu(tmp->h_sequence);
392		hfsplus_jbd_debug(3, "Found magic %d, sequence %d\n",
393			  blocktype, sequence);
394
395		if (sequence != next_commit_ID) {
396			brelse(bh);
397			break;
398		}
399
400		/* OK, we have a valid descriptor block which matches
401		 * all of the sequence number checks.  What are we going
402		 * to do with it?  That depends on the pass... */
403
404		switch(blocktype) {
405		case JFS_DESCRIPTOR_BLOCK:
406			/* If it is a valid descriptor block, replay it
407			 * in pass REPLAY; otherwise, just skip over the
408			 * blocks it describes. */
409			if (pass != PASS_REPLAY) {
410				next_log_block +=
411					count_tags(bh, journal->j_blocksize);
412				wrap(journal, next_log_block);
413				brelse(bh);
414				continue;
415			}
416
417			/* A descriptor block: we can now write all of
418			 * the data blocks.  Yay, useful work is finally
419			 * getting done here! */
420
421			tagp = &bh->b_data[sizeof(hfsplus_jbd_header_t)];
422			while ((tagp - bh->b_data +sizeof(hfsplus_jbd_block_tag_t))
423			       <= journal->j_blocksize) {
424				unsigned long io_block;
425
426				tag = (hfsplus_jbd_block_tag_t *) tagp;
427				flags = be32_to_cpu(tag->t_flags);
428
429				io_block = next_log_block++;
430				wrap(journal, next_log_block);
431				err = jread(&obh, journal, io_block);
432				if (err) {
433					/* Recover what we can, but
434					 * report failure at the end. */
435					success = err;
436					printk (KERN_ERR
437						"JBD: IO error %d recovering "
438						"block %ld in log\n",
439						err, io_block);
440				} else {
441					unsigned long blocknr;
442
443					HFSPLUS_J_ASSERT(obh != NULL);
444					blocknr = be32_to_cpu(tag->t_blocknr);
445
446					/* If the block has been
447					 * revoked, then we're all done
448					 * here. */
449					if (hfsplus_jbd_test_revoke
450					    (journal, blocknr,
451					     next_commit_ID)) {
452						brelse(obh);
453						++info->nr_revoke_hits;
454						goto skip_write;
455					}
456
457					/* Find a buffer for the new
458					 * data being restored */
459					nbh = __getblk(journal->j_fs_dev,
460							blocknr,
461							journal->j_blocksize);
462					if (nbh == NULL) {
463						printk(KERN_ERR
464						       "JBD: Out of memory "
465						       "during recovery.\n");
466						err = -ENOMEM;
467						brelse(bh);
468						brelse(obh);
469						goto failed;
470					}
471
472					lock_buffer(nbh);
473					memcpy(nbh->b_data, obh->b_data,
474							journal->j_blocksize);
475					if (flags & JFS_FLAG_ESCAPE) {
476printk("@@@@@@@ Oops! .. Need to check it function: %s, Line: %d\n", __FUNCTION__, __LINE__);
477						*((__be32 *)bh->b_data) =
478						cpu_to_be32(JFS_MAGIC_NUMBER);
479					}
480
481					HFSPLUS_BUFFER_TRACE(nbh, "marking dirty");
482					set_buffer_uptodate(nbh);
483					mark_buffer_dirty(nbh);
484					HFSPLUS_BUFFER_TRACE(nbh, "marking uptodate");
485					++info->nr_replays;
486					/* ll_rw_block(WRITE, 1, &nbh); */
487					unlock_buffer(nbh);
488					brelse(obh);
489					brelse(nbh);
490				}
491
492			skip_write:
493				tagp += sizeof(hfsplus_jbd_block_tag_t);
494				if (!(flags & JFS_FLAG_SAME_UUID))
495					tagp += 16;
496
497				if (flags & JFS_FLAG_LAST_TAG)
498					break;
499			}
500
501			brelse(bh);
502			continue;
503
504		case JFS_COMMIT_BLOCK:
505			/* Found an expected commit block: not much to
506			 * do other than move on to the next sequence
507			 * number. */
508			brelse(bh);
509			next_commit_ID++;
510			continue;
511
512		case JFS_REVOKE_BLOCK:
513			/* If we aren't in the REVOKE pass, then we can
514			 * just skip over this block. */
515			if (pass != PASS_REVOKE) {
516				brelse(bh);
517				continue;
518			}
519
520			err = scan_revoke_records(journal, bh,
521						  next_commit_ID, info);
522			brelse(bh);
523			if (err)
524				goto failed;
525			continue;
526
527		default:
528			hfsplus_jbd_debug(3, "Unrecognised magic %d, end of scan.\n",
529				  blocktype);
530			goto done;
531		}
532	}
533
534 done:
535	/*
536	 * We broke out of the log scan loop: either we came to the
537	 * known end of the log or we found an unexpected block in the
538	 * log.  If the latter happened, then we know that the "current"
539	 * transaction marks the end of the valid log.
540	 */
541
542	if (pass == PASS_SCAN)
543		info->end_transaction = next_commit_ID;
544	else {
545		/* It's really bad news if different passes end up at
546		 * different places (but possible due to IO errors). */
547		if (info->end_transaction != next_commit_ID) {
548			printk (KERN_ERR "JBD: recovery pass %d ended at "
549				"transaction %u, expected %u\n",
550				pass, next_commit_ID, info->end_transaction);
551			if (!success)
552				success = -EIO;
553		}
554	}
555
556	return success;
557
558 failed:
559	return err;
560}
561
562
563/* Scan a revoke record, marking all blocks mentioned as revoked. */
564
565static int scan_revoke_records(hfsplus_jbd_t *journal, struct buffer_head *bh,
566			       hfsplus_jbd_tid_t sequence, struct recovery_info *info)
567{
568	hfsplus_jbd_revoke_header_t *header;
569	int offset, max;
570
571	header = (hfsplus_jbd_revoke_header_t *) bh->b_data;
572	offset = sizeof(hfsplus_jbd_revoke_header_t);
573	max = be32_to_cpu(header->r_count);
574
575	while (offset < max) {
576		unsigned long blocknr;
577		int err;
578
579		blocknr = be32_to_cpu(* ((__be32 *) (bh->b_data+offset)));
580		offset += 4;
581		err = hfsplus_jbd_set_revoke(journal, blocknr, sequence);
582		if (err)
583			return err;
584		++info->nr_revokes;
585	}
586	return 0;
587}
588