1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_types.h"
21#include "xfs_bit.h"
22#include "xfs_log.h"
23#include "xfs_inum.h"
24#include "xfs_trans.h"
25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_dir.h"
28#include "xfs_dir2.h"
29#include "xfs_dmapi.h"
30#include "xfs_mount.h"
31#include "xfs_error.h"
32#include "xfs_bmap_btree.h"
33#include "xfs_alloc_btree.h"
34#include "xfs_ialloc_btree.h"
35#include "xfs_dir_sf.h"
36#include "xfs_dir2_sf.h"
37#include "xfs_attr_sf.h"
38#include "xfs_dinode.h"
39#include "xfs_inode.h"
40#include "xfs_inode_item.h"
41#include "xfs_imap.h"
42#include "xfs_alloc.h"
43#include "xfs_ialloc.h"
44#include "xfs_log_priv.h"
45#include "xfs_buf_item.h"
46#include "xfs_log_recover.h"
47#include "xfs_extfree_item.h"
48#include "xfs_trans_priv.h"
49#include "xfs_quota.h"
50#include "xfs_rw.h"
51
52STATIC int	xlog_find_zeroed(xlog_t *, xfs_daddr_t *);
53STATIC int	xlog_clear_stale_blocks(xlog_t *, xfs_lsn_t);
54STATIC void	xlog_recover_insert_item_backq(xlog_recover_item_t **q,
55					       xlog_recover_item_t *item);
56#if defined(DEBUG)
57STATIC void	xlog_recover_check_summary(xlog_t *);
58STATIC void	xlog_recover_check_ail(xfs_mount_t *, xfs_log_item_t *, int);
59#else
60#define	xlog_recover_check_summary(log)
61#define	xlog_recover_check_ail(mp, lip, gen)
62#endif
63
64
65/*
66 * Sector aligned buffer routines for buffer create/read/write/access
67 */
68
69#define XLOG_SECTOR_ROUNDUP_BBCOUNT(log, bbs)	\
70	( ((log)->l_sectbb_mask && (bbs & (log)->l_sectbb_mask)) ? \
71	((bbs + (log)->l_sectbb_mask + 1) & ~(log)->l_sectbb_mask) : (bbs) )
72#define XLOG_SECTOR_ROUNDDOWN_BLKNO(log, bno)	((bno) & ~(log)->l_sectbb_mask)
73
74xfs_buf_t *
75xlog_get_bp(
76	xlog_t		*log,
77	int		num_bblks)
78{
79	ASSERT(num_bblks > 0);
80
81	if (log->l_sectbb_log) {
82		if (num_bblks > 1)
83			num_bblks += XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1);
84		num_bblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, num_bblks);
85	}
86	return xfs_buf_get_noaddr(BBTOB(num_bblks), log->l_mp->m_logdev_targp);
87}
88
89void
90xlog_put_bp(
91	xfs_buf_t	*bp)
92{
93	xfs_buf_free(bp);
94}
95
96
97/*
98 * nbblks should be uint, but oh well.  Just want to catch that 32-bit length.
99 */
100int
101xlog_bread(
102	xlog_t		*log,
103	xfs_daddr_t	blk_no,
104	int		nbblks,
105	xfs_buf_t	*bp)
106{
107	int		error;
108
109	if (log->l_sectbb_log) {
110		blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no);
111		nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks);
112	}
113
114	ASSERT(nbblks > 0);
115	ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp));
116	ASSERT(bp);
117
118	XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
119	XFS_BUF_READ(bp);
120	XFS_BUF_BUSY(bp);
121	XFS_BUF_SET_COUNT(bp, BBTOB(nbblks));
122	XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp);
123
124	xfsbdstrat(log->l_mp, bp);
125	if ((error = xfs_iowait(bp)))
126		xfs_ioerror_alert("xlog_bread", log->l_mp,
127				  bp, XFS_BUF_ADDR(bp));
128	return error;
129}
130
131/*
132 * Write out the buffer at the given block for the given number of blocks.
133 * The buffer is kept locked across the write and is returned locked.
134 * This can only be used for synchronous log writes.
135 */
136STATIC int
137xlog_bwrite(
138	xlog_t		*log,
139	xfs_daddr_t	blk_no,
140	int		nbblks,
141	xfs_buf_t	*bp)
142{
143	int		error;
144
145	if (log->l_sectbb_log) {
146		blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no);
147		nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks);
148	}
149
150	ASSERT(nbblks > 0);
151	ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp));
152
153	XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
154	XFS_BUF_ZEROFLAGS(bp);
155	XFS_BUF_BUSY(bp);
156	XFS_BUF_HOLD(bp);
157	XFS_BUF_PSEMA(bp, PRIBIO);
158	XFS_BUF_SET_COUNT(bp, BBTOB(nbblks));
159	XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp);
160
161	if ((error = xfs_bwrite(log->l_mp, bp)))
162		xfs_ioerror_alert("xlog_bwrite", log->l_mp,
163				  bp, XFS_BUF_ADDR(bp));
164	return error;
165}
166
167STATIC xfs_caddr_t
168xlog_align(
169	xlog_t		*log,
170	xfs_daddr_t	blk_no,
171	int		nbblks,
172	xfs_buf_t	*bp)
173{
174	xfs_caddr_t	ptr;
175
176	if (!log->l_sectbb_log)
177		return XFS_BUF_PTR(bp);
178
179	ptr = XFS_BUF_PTR(bp) + BBTOB((int)blk_no & log->l_sectbb_mask);
180	ASSERT(XFS_BUF_SIZE(bp) >=
181		BBTOB(nbblks + (blk_no & log->l_sectbb_mask)));
182	return ptr;
183}
184
185#ifdef DEBUG
186/*
187 * dump debug superblock and log record information
188 */
189STATIC void
190xlog_header_check_dump(
191	xfs_mount_t		*mp,
192	xlog_rec_header_t	*head)
193{
194	int			b;
195
196	printk("%s:  SB : uuid = ", __FUNCTION__);
197	for (b = 0; b < 16; b++)
198		printk("%02x",((unsigned char *)&mp->m_sb.sb_uuid)[b]);
199	printk(", fmt = %d\n", XLOG_FMT);
200	printk("    log : uuid = ");
201	for (b = 0; b < 16; b++)
202		printk("%02x",((unsigned char *)&head->h_fs_uuid)[b]);
203	printk(", fmt = %d\n", INT_GET(head->h_fmt, ARCH_CONVERT));
204}
205#else
206#define xlog_header_check_dump(mp, head)
207#endif
208
209/*
210 * check log record header for recovery
211 */
212STATIC int
213xlog_header_check_recover(
214	xfs_mount_t		*mp,
215	xlog_rec_header_t	*head)
216{
217	ASSERT(INT_GET(head->h_magicno, ARCH_CONVERT) == XLOG_HEADER_MAGIC_NUM);
218
219	/*
220	 * IRIX doesn't write the h_fmt field and leaves it zeroed
221	 * (XLOG_FMT_UNKNOWN). This stops us from trying to recover
222	 * a dirty log created in IRIX.
223	 */
224	if (unlikely(INT_GET(head->h_fmt, ARCH_CONVERT) != XLOG_FMT)) {
225		xlog_warn(
226	"XFS: dirty log written in incompatible format - can't recover");
227		xlog_header_check_dump(mp, head);
228		XFS_ERROR_REPORT("xlog_header_check_recover(1)",
229				 XFS_ERRLEVEL_HIGH, mp);
230		return XFS_ERROR(EFSCORRUPTED);
231	} else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
232		xlog_warn(
233	"XFS: dirty log entry has mismatched uuid - can't recover");
234		xlog_header_check_dump(mp, head);
235		XFS_ERROR_REPORT("xlog_header_check_recover(2)",
236				 XFS_ERRLEVEL_HIGH, mp);
237		return XFS_ERROR(EFSCORRUPTED);
238	}
239	return 0;
240}
241
242/*
243 * read the head block of the log and check the header
244 */
245STATIC int
246xlog_header_check_mount(
247	xfs_mount_t		*mp,
248	xlog_rec_header_t	*head)
249{
250	ASSERT(INT_GET(head->h_magicno, ARCH_CONVERT) == XLOG_HEADER_MAGIC_NUM);
251
252	if (uuid_is_nil(&head->h_fs_uuid)) {
253		/*
254		 * IRIX doesn't write the h_fs_uuid or h_fmt fields. If
255		 * h_fs_uuid is nil, we assume this log was last mounted
256		 * by IRIX and continue.
257		 */
258		xlog_warn("XFS: nil uuid in log - IRIX style log");
259	} else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
260		xlog_warn("XFS: log has mismatched uuid - can't recover");
261		xlog_header_check_dump(mp, head);
262		XFS_ERROR_REPORT("xlog_header_check_mount",
263				 XFS_ERRLEVEL_HIGH, mp);
264		return XFS_ERROR(EFSCORRUPTED);
265	}
266	return 0;
267}
268
269STATIC void
270xlog_recover_iodone(
271	struct xfs_buf	*bp)
272{
273	xfs_mount_t	*mp;
274
275	ASSERT(XFS_BUF_FSPRIVATE(bp, void *));
276
277	if (XFS_BUF_GETERROR(bp)) {
278		/*
279		 * We're not going to bother about retrying
280		 * this during recovery. One strike!
281		 */
282		mp = XFS_BUF_FSPRIVATE(bp, xfs_mount_t *);
283		xfs_ioerror_alert("xlog_recover_iodone",
284				  mp, bp, XFS_BUF_ADDR(bp));
285		xfs_force_shutdown(mp, XFS_METADATA_IO_ERROR);
286	}
287	XFS_BUF_SET_FSPRIVATE(bp, NULL);
288	XFS_BUF_CLR_IODONE_FUNC(bp);
289	xfs_biodone(bp);
290}
291
292/*
293 * This routine finds (to an approximation) the first block in the physical
294 * log which contains the given cycle.  It uses a binary search algorithm.
295 * Note that the algorithm can not be perfect because the disk will not
296 * necessarily be perfect.
297 */
298STATIC int
299xlog_find_cycle_start(
300	xlog_t		*log,
301	xfs_buf_t	*bp,
302	xfs_daddr_t	first_blk,
303	xfs_daddr_t	*last_blk,
304	uint		cycle)
305{
306	xfs_caddr_t	offset;
307	xfs_daddr_t	mid_blk;
308	uint		mid_cycle;
309	int		error;
310
311	mid_blk = BLK_AVG(first_blk, *last_blk);
312	while (mid_blk != first_blk && mid_blk != *last_blk) {
313		if ((error = xlog_bread(log, mid_blk, 1, bp)))
314			return error;
315		offset = xlog_align(log, mid_blk, 1, bp);
316		mid_cycle = GET_CYCLE(offset, ARCH_CONVERT);
317		if (mid_cycle == cycle) {
318			*last_blk = mid_blk;
319			/* last_half_cycle == mid_cycle */
320		} else {
321			first_blk = mid_blk;
322			/* first_half_cycle == mid_cycle */
323		}
324		mid_blk = BLK_AVG(first_blk, *last_blk);
325	}
326	ASSERT((mid_blk == first_blk && mid_blk+1 == *last_blk) ||
327	       (mid_blk == *last_blk && mid_blk-1 == first_blk));
328
329	return 0;
330}
331
332/*
333 * Check that the range of blocks does not contain the cycle number
334 * given.  The scan needs to occur from front to back and the ptr into the
335 * region must be updated since a later routine will need to perform another
336 * test.  If the region is completely good, we end up returning the same
337 * last block number.
338 *
339 * Set blkno to -1 if we encounter no errors.  This is an invalid block number
340 * since we don't ever expect logs to get this large.
341 */
342STATIC int
343xlog_find_verify_cycle(
344	xlog_t		*log,
345	xfs_daddr_t	start_blk,
346	int		nbblks,
347	uint		stop_on_cycle_no,
348	xfs_daddr_t	*new_blk)
349{
350	xfs_daddr_t	i, j;
351	uint		cycle;
352	xfs_buf_t	*bp;
353	xfs_daddr_t	bufblks;
354	xfs_caddr_t	buf = NULL;
355	int		error = 0;
356
357	bufblks = 1 << ffs(nbblks);
358
359	while (!(bp = xlog_get_bp(log, bufblks))) {
360		/* can't get enough memory to do everything in one big buffer */
361		bufblks >>= 1;
362		if (bufblks <= log->l_sectbb_log)
363			return ENOMEM;
364	}
365
366	for (i = start_blk; i < start_blk + nbblks; i += bufblks) {
367		int	bcount;
368
369		bcount = min(bufblks, (start_blk + nbblks - i));
370
371		if ((error = xlog_bread(log, i, bcount, bp)))
372			goto out;
373
374		buf = xlog_align(log, i, bcount, bp);
375		for (j = 0; j < bcount; j++) {
376			cycle = GET_CYCLE(buf, ARCH_CONVERT);
377			if (cycle == stop_on_cycle_no) {
378				*new_blk = i+j;
379				goto out;
380			}
381
382			buf += BBSIZE;
383		}
384	}
385
386	*new_blk = -1;
387
388out:
389	xlog_put_bp(bp);
390	return error;
391}
392
393/*
394 * Potentially backup over partial log record write.
395 *
396 * In the typical case, last_blk is the number of the block directly after
397 * a good log record.  Therefore, we subtract one to get the block number
398 * of the last block in the given buffer.  extra_bblks contains the number
399 * of blocks we would have read on a previous read.  This happens when the
400 * last log record is split over the end of the physical log.
401 *
402 * extra_bblks is the number of blocks potentially verified on a previous
403 * call to this routine.
404 */
405STATIC int
406xlog_find_verify_log_record(
407	xlog_t			*log,
408	xfs_daddr_t		start_blk,
409	xfs_daddr_t		*last_blk,
410	int			extra_bblks)
411{
412	xfs_daddr_t		i;
413	xfs_buf_t		*bp;
414	xfs_caddr_t		offset = NULL;
415	xlog_rec_header_t	*head = NULL;
416	int			error = 0;
417	int			smallmem = 0;
418	int			num_blks = *last_blk - start_blk;
419	int			xhdrs;
420
421	ASSERT(start_blk != 0 || *last_blk != start_blk);
422
423	if (!(bp = xlog_get_bp(log, num_blks))) {
424		if (!(bp = xlog_get_bp(log, 1)))
425			return ENOMEM;
426		smallmem = 1;
427	} else {
428		if ((error = xlog_bread(log, start_blk, num_blks, bp)))
429			goto out;
430		offset = xlog_align(log, start_blk, num_blks, bp);
431		offset += ((num_blks - 1) << BBSHIFT);
432	}
433
434	for (i = (*last_blk) - 1; i >= 0; i--) {
435		if (i < start_blk) {
436			/* valid log record not found */
437			xlog_warn(
438		"XFS: Log inconsistent (didn't find previous header)");
439			ASSERT(0);
440			error = XFS_ERROR(EIO);
441			goto out;
442		}
443
444		if (smallmem) {
445			if ((error = xlog_bread(log, i, 1, bp)))
446				goto out;
447			offset = xlog_align(log, i, 1, bp);
448		}
449
450		head = (xlog_rec_header_t *)offset;
451
452		if (XLOG_HEADER_MAGIC_NUM ==
453		    INT_GET(head->h_magicno, ARCH_CONVERT))
454			break;
455
456		if (!smallmem)
457			offset -= BBSIZE;
458	}
459
460	/*
461	 * We hit the beginning of the physical log & still no header.  Return
462	 * to caller.  If caller can handle a return of -1, then this routine
463	 * will be called again for the end of the physical log.
464	 */
465	if (i == -1) {
466		error = -1;
467		goto out;
468	}
469
470	/*
471	 * We have the final block of the good log (the first block
472	 * of the log record _before_ the head. So we check the uuid.
473	 */
474	if ((error = xlog_header_check_mount(log->l_mp, head)))
475		goto out;
476
477	/*
478	 * We may have found a log record header before we expected one.
479	 * last_blk will be the 1st block # with a given cycle #.  We may end
480	 * up reading an entire log record.  In this case, we don't want to
481	 * reset last_blk.  Only when last_blk points in the middle of a log
482	 * record do we update last_blk.
483	 */
484	if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) {
485		uint	h_size = INT_GET(head->h_size, ARCH_CONVERT);
486
487		xhdrs = h_size / XLOG_HEADER_CYCLE_SIZE;
488		if (h_size % XLOG_HEADER_CYCLE_SIZE)
489			xhdrs++;
490	} else {
491		xhdrs = 1;
492	}
493
494	if (*last_blk - i + extra_bblks
495			!= BTOBB(INT_GET(head->h_len, ARCH_CONVERT)) + xhdrs)
496		*last_blk = i;
497
498out:
499	xlog_put_bp(bp);
500	return error;
501}
502
503/*
504 * Head is defined to be the point of the log where the next log write
505 * write could go.  This means that incomplete LR writes at the end are
506 * eliminated when calculating the head.  We aren't guaranteed that previous
507 * LR have complete transactions.  We only know that a cycle number of
508 * current cycle number -1 won't be present in the log if we start writing
509 * from our current block number.
510 *
511 * last_blk contains the block number of the first block with a given
512 * cycle number.
513 *
514 * Return: zero if normal, non-zero if error.
515 */
516STATIC int
517xlog_find_head(
518	xlog_t 		*log,
519	xfs_daddr_t	*return_head_blk)
520{
521	xfs_buf_t	*bp;
522	xfs_caddr_t	offset;
523	xfs_daddr_t	new_blk, first_blk = 0, start_blk, last_blk, head_blk;
524	int		num_scan_bblks;
525	uint		first_half_cycle, last_half_cycle;
526	uint		stop_on_cycle;
527	int		error, log_bbnum = log->l_logBBsize;
528
529	/* Is the end of the log device zeroed? */
530	if ((error = xlog_find_zeroed(log, &first_blk)) == -1) {
531		*return_head_blk = first_blk;
532
533		/* Is the whole lot zeroed? */
534		if (!first_blk) {
535			/* Linux XFS shouldn't generate totally zeroed logs -
536			 * mkfs etc write a dummy unmount record to a fresh
537			 * log so we can store the uuid in there
538			 */
539			xlog_warn("XFS: totally zeroed log");
540		}
541
542		return 0;
543	} else if (error) {
544		xlog_warn("XFS: empty log check failed");
545		return error;
546	}
547
548	first_blk = 0;			/* get cycle # of 1st block */
549	bp = xlog_get_bp(log, 1);
550	if (!bp)
551		return ENOMEM;
552	if ((error = xlog_bread(log, 0, 1, bp)))
553		goto bp_err;
554	offset = xlog_align(log, 0, 1, bp);
555	first_half_cycle = GET_CYCLE(offset, ARCH_CONVERT);
556
557	last_blk = head_blk = log_bbnum - 1;	/* get cycle # of last block */
558	if ((error = xlog_bread(log, last_blk, 1, bp)))
559		goto bp_err;
560	offset = xlog_align(log, last_blk, 1, bp);
561	last_half_cycle = GET_CYCLE(offset, ARCH_CONVERT);
562	ASSERT(last_half_cycle != 0);
563
564	/*
565	 * If the 1st half cycle number is equal to the last half cycle number,
566	 * then the entire log is stamped with the same cycle number.  In this
567	 * case, head_blk can't be set to zero (which makes sense).  The below
568	 * math doesn't work out properly with head_blk equal to zero.  Instead,
569	 * we set it to log_bbnum which is an invalid block number, but this
570	 * value makes the math correct.  If head_blk doesn't changed through
571	 * all the tests below, *head_blk is set to zero at the very end rather
572	 * than log_bbnum.  In a sense, log_bbnum and zero are the same block
573	 * in a circular file.
574	 */
575	if (first_half_cycle == last_half_cycle) {
576		/*
577		 * In this case we believe that the entire log should have
578		 * cycle number last_half_cycle.  We need to scan backwards
579		 * from the end verifying that there are no holes still
580		 * containing last_half_cycle - 1.  If we find such a hole,
581		 * then the start of that hole will be the new head.  The
582		 * simple case looks like
583		 *        x | x ... | x - 1 | x
584		 * Another case that fits this picture would be
585		 *        x | x + 1 | x ... | x
586		 * In this case the head really is somewhere at the end of the
587		 * log, as one of the latest writes at the beginning was
588		 * incomplete.
589		 * One more case is
590		 *        x | x + 1 | x ... | x - 1 | x
591		 * This is really the combination of the above two cases, and
592		 * the head has to end up at the start of the x-1 hole at the
593		 * end of the log.
594		 *
595		 * In the 256k log case, we will read from the beginning to the
596		 * end of the log and search for cycle numbers equal to x-1.
597		 * We don't worry about the x+1 blocks that we encounter,
598		 * because we know that they cannot be the head since the log
599		 * started with x.
600		 */
601		head_blk = log_bbnum;
602		stop_on_cycle = last_half_cycle - 1;
603	} else {
604		/*
605		 * In this case we want to find the first block with cycle
606		 * number matching last_half_cycle.  We expect the log to be
607		 * some variation on
608		 *        x + 1 ... | x ...
609		 * The first block with cycle number x (last_half_cycle) will
610		 * be where the new head belongs.  First we do a binary search
611		 * for the first occurrence of last_half_cycle.  The binary
612		 * search may not be totally accurate, so then we scan back
613		 * from there looking for occurrences of last_half_cycle before
614		 * us.  If that backwards scan wraps around the beginning of
615		 * the log, then we look for occurrences of last_half_cycle - 1
616		 * at the end of the log.  The cases we're looking for look
617		 * like
618		 *        x + 1 ... | x | x + 1 | x ...
619		 *                               ^ binary search stopped here
620		 * or
621		 *        x + 1 ... | x ... | x - 1 | x
622		 *        <---------> less than scan distance
623		 */
624		stop_on_cycle = last_half_cycle;
625		if ((error = xlog_find_cycle_start(log, bp, first_blk,
626						&head_blk, last_half_cycle)))
627			goto bp_err;
628	}
629
630	/*
631	 * Now validate the answer.  Scan back some number of maximum possible
632	 * blocks and make sure each one has the expected cycle number.  The
633	 * maximum is determined by the total possible amount of buffering
634	 * in the in-core log.  The following number can be made tighter if
635	 * we actually look at the block size of the filesystem.
636	 */
637	num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
638	if (head_blk >= num_scan_bblks) {
639		/*
640		 * We are guaranteed that the entire check can be performed
641		 * in one buffer.
642		 */
643		start_blk = head_blk - num_scan_bblks;
644		if ((error = xlog_find_verify_cycle(log,
645						start_blk, num_scan_bblks,
646						stop_on_cycle, &new_blk)))
647			goto bp_err;
648		if (new_blk != -1)
649			head_blk = new_blk;
650	} else {		/* need to read 2 parts of log */
651		/*
652		 * We are going to scan backwards in the log in two parts.
653		 * First we scan the physical end of the log.  In this part
654		 * of the log, we are looking for blocks with cycle number
655		 * last_half_cycle - 1.
656		 * If we find one, then we know that the log starts there, as
657		 * we've found a hole that didn't get written in going around
658		 * the end of the physical log.  The simple case for this is
659		 *        x + 1 ... | x ... | x - 1 | x
660		 *        <---------> less than scan distance
661		 * If all of the blocks at the end of the log have cycle number
662		 * last_half_cycle, then we check the blocks at the start of
663		 * the log looking for occurrences of last_half_cycle.  If we
664		 * find one, then our current estimate for the location of the
665		 * first occurrence of last_half_cycle is wrong and we move
666		 * back to the hole we've found.  This case looks like
667		 *        x + 1 ... | x | x + 1 | x ...
668		 *                               ^ binary search stopped here
669		 * Another case we need to handle that only occurs in 256k
670		 * logs is
671		 *        x + 1 ... | x ... | x+1 | x ...
672		 *                   ^ binary search stops here
673		 * In a 256k log, the scan at the end of the log will see the
674		 * x + 1 blocks.  We need to skip past those since that is
675		 * certainly not the head of the log.  By searching for
676		 * last_half_cycle-1 we accomplish that.
677		 */
678		start_blk = log_bbnum - num_scan_bblks + head_blk;
679		ASSERT(head_blk <= INT_MAX &&
680			(xfs_daddr_t) num_scan_bblks - head_blk >= 0);
681		if ((error = xlog_find_verify_cycle(log, start_blk,
682					num_scan_bblks - (int)head_blk,
683					(stop_on_cycle - 1), &new_blk)))
684			goto bp_err;
685		if (new_blk != -1) {
686			head_blk = new_blk;
687			goto bad_blk;
688		}
689
690		/*
691		 * Scan beginning of log now.  The last part of the physical
692		 * log is good.  This scan needs to verify that it doesn't find
693		 * the last_half_cycle.
694		 */
695		start_blk = 0;
696		ASSERT(head_blk <= INT_MAX);
697		if ((error = xlog_find_verify_cycle(log,
698					start_blk, (int)head_blk,
699					stop_on_cycle, &new_blk)))
700			goto bp_err;
701		if (new_blk != -1)
702			head_blk = new_blk;
703	}
704
705 bad_blk:
706	/*
707	 * Now we need to make sure head_blk is not pointing to a block in
708	 * the middle of a log record.
709	 */
710	num_scan_bblks = XLOG_REC_SHIFT(log);
711	if (head_blk >= num_scan_bblks) {
712		start_blk = head_blk - num_scan_bblks; /* don't read head_blk */
713
714		/* start ptr at last block ptr before head_blk */
715		if ((error = xlog_find_verify_log_record(log, start_blk,
716							&head_blk, 0)) == -1) {
717			error = XFS_ERROR(EIO);
718			goto bp_err;
719		} else if (error)
720			goto bp_err;
721	} else {
722		start_blk = 0;
723		ASSERT(head_blk <= INT_MAX);
724		if ((error = xlog_find_verify_log_record(log, start_blk,
725							&head_blk, 0)) == -1) {
726			/* We hit the beginning of the log during our search */
727			start_blk = log_bbnum - num_scan_bblks + head_blk;
728			new_blk = log_bbnum;
729			ASSERT(start_blk <= INT_MAX &&
730				(xfs_daddr_t) log_bbnum-start_blk >= 0);
731			ASSERT(head_blk <= INT_MAX);
732			if ((error = xlog_find_verify_log_record(log,
733							start_blk, &new_blk,
734							(int)head_blk)) == -1) {
735				error = XFS_ERROR(EIO);
736				goto bp_err;
737			} else if (error)
738				goto bp_err;
739			if (new_blk != log_bbnum)
740				head_blk = new_blk;
741		} else if (error)
742			goto bp_err;
743	}
744
745	xlog_put_bp(bp);
746	if (head_blk == log_bbnum)
747		*return_head_blk = 0;
748	else
749		*return_head_blk = head_blk;
750	/*
751	 * When returning here, we have a good block number.  Bad block
752	 * means that during a previous crash, we didn't have a clean break
753	 * from cycle number N to cycle number N-1.  In this case, we need
754	 * to find the first block with cycle number N-1.
755	 */
756	return 0;
757
758 bp_err:
759	xlog_put_bp(bp);
760
761	if (error)
762	    xlog_warn("XFS: failed to find log head");
763	return error;
764}
765
766/*
767 * Find the sync block number or the tail of the log.
768 *
769 * This will be the block number of the last record to have its
770 * associated buffers synced to disk.  Every log record header has
771 * a sync lsn embedded in it.  LSNs hold block numbers, so it is easy
772 * to get a sync block number.  The only concern is to figure out which
773 * log record header to believe.
774 *
775 * The following algorithm uses the log record header with the largest
776 * lsn.  The entire log record does not need to be valid.  We only care
777 * that the header is valid.
778 *
779 * We could speed up search by using current head_blk buffer, but it is not
780 * available.
781 */
782int
783xlog_find_tail(
784	xlog_t			*log,
785	xfs_daddr_t		*head_blk,
786	xfs_daddr_t		*tail_blk)
787{
788	xlog_rec_header_t	*rhead;
789	xlog_op_header_t	*op_head;
790	xfs_caddr_t		offset = NULL;
791	xfs_buf_t		*bp;
792	int			error, i, found;
793	xfs_daddr_t		umount_data_blk;
794	xfs_daddr_t		after_umount_blk;
795	xfs_lsn_t		tail_lsn;
796	int			hblks;
797
798	found = 0;
799
800	/*
801	 * Find previous log record
802	 */
803	if ((error = xlog_find_head(log, head_blk)))
804		return error;
805
806	bp = xlog_get_bp(log, 1);
807	if (!bp)
808		return ENOMEM;
809	if (*head_blk == 0) {				/* special case */
810		if ((error = xlog_bread(log, 0, 1, bp)))
811			goto bread_err;
812		offset = xlog_align(log, 0, 1, bp);
813		if (GET_CYCLE(offset, ARCH_CONVERT) == 0) {
814			*tail_blk = 0;
815			/* leave all other log inited values alone */
816			goto exit;
817		}
818	}
819
820	/*
821	 * Search backwards looking for log record header block
822	 */
823	ASSERT(*head_blk < INT_MAX);
824	for (i = (int)(*head_blk) - 1; i >= 0; i--) {
825		if ((error = xlog_bread(log, i, 1, bp)))
826			goto bread_err;
827		offset = xlog_align(log, i, 1, bp);
828		if (XLOG_HEADER_MAGIC_NUM ==
829		    INT_GET(*(uint *)offset, ARCH_CONVERT)) {
830			found = 1;
831			break;
832		}
833	}
834	/*
835	 * If we haven't found the log record header block, start looking
836	 * again from the end of the physical log.  XXXmiken: There should be
837	 * a check here to make sure we didn't search more than N blocks in
838	 * the previous code.
839	 */
840	if (!found) {
841		for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) {
842			if ((error = xlog_bread(log, i, 1, bp)))
843				goto bread_err;
844			offset = xlog_align(log, i, 1, bp);
845			if (XLOG_HEADER_MAGIC_NUM ==
846			    INT_GET(*(uint*)offset, ARCH_CONVERT)) {
847				found = 2;
848				break;
849			}
850		}
851	}
852	if (!found) {
853		xlog_warn("XFS: xlog_find_tail: couldn't find sync record");
854		ASSERT(0);
855		return XFS_ERROR(EIO);
856	}
857
858	/* find blk_no of tail of log */
859	rhead = (xlog_rec_header_t *)offset;
860	*tail_blk = BLOCK_LSN(INT_GET(rhead->h_tail_lsn, ARCH_CONVERT));
861
862	/*
863	 * Reset log values according to the state of the log when we
864	 * crashed.  In the case where head_blk == 0, we bump curr_cycle
865	 * one because the next write starts a new cycle rather than
866	 * continuing the cycle of the last good log record.  At this
867	 * point we have guaranteed that all partial log records have been
868	 * accounted for.  Therefore, we know that the last good log record
869	 * written was complete and ended exactly on the end boundary
870	 * of the physical log.
871	 */
872	log->l_prev_block = i;
873	log->l_curr_block = (int)*head_blk;
874	log->l_curr_cycle = INT_GET(rhead->h_cycle, ARCH_CONVERT);
875	if (found == 2)
876		log->l_curr_cycle++;
877	log->l_tail_lsn = INT_GET(rhead->h_tail_lsn, ARCH_CONVERT);
878	log->l_last_sync_lsn = INT_GET(rhead->h_lsn, ARCH_CONVERT);
879	log->l_grant_reserve_cycle = log->l_curr_cycle;
880	log->l_grant_reserve_bytes = BBTOB(log->l_curr_block);
881	log->l_grant_write_cycle = log->l_curr_cycle;
882	log->l_grant_write_bytes = BBTOB(log->l_curr_block);
883
884	/*
885	 * Look for unmount record.  If we find it, then we know there
886	 * was a clean unmount.  Since 'i' could be the last block in
887	 * the physical log, we convert to a log block before comparing
888	 * to the head_blk.
889	 *
890	 * Save the current tail lsn to use to pass to
891	 * xlog_clear_stale_blocks() below.  We won't want to clear the
892	 * unmount record if there is one, so we pass the lsn of the
893	 * unmount record rather than the block after it.
894	 */
895	if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) {
896		int	h_size = INT_GET(rhead->h_size, ARCH_CONVERT);
897		int	h_version = INT_GET(rhead->h_version, ARCH_CONVERT);
898
899		if ((h_version & XLOG_VERSION_2) &&
900		    (h_size > XLOG_HEADER_CYCLE_SIZE)) {
901			hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
902			if (h_size % XLOG_HEADER_CYCLE_SIZE)
903				hblks++;
904		} else {
905			hblks = 1;
906		}
907	} else {
908		hblks = 1;
909	}
910	after_umount_blk = (i + hblks + (int)
911		BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT))) % log->l_logBBsize;
912	tail_lsn = log->l_tail_lsn;
913	if (*head_blk == after_umount_blk &&
914	    INT_GET(rhead->h_num_logops, ARCH_CONVERT) == 1) {
915		umount_data_blk = (i + hblks) % log->l_logBBsize;
916		if ((error = xlog_bread(log, umount_data_blk, 1, bp))) {
917			goto bread_err;
918		}
919		offset = xlog_align(log, umount_data_blk, 1, bp);
920		op_head = (xlog_op_header_t *)offset;
921		if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
922			/*
923			 * Set tail and last sync so that newly written
924			 * log records will point recovery to after the
925			 * current unmount record.
926			 */
927			ASSIGN_ANY_LSN_HOST(log->l_tail_lsn, log->l_curr_cycle,
928					after_umount_blk);
929			ASSIGN_ANY_LSN_HOST(log->l_last_sync_lsn, log->l_curr_cycle,
930					after_umount_blk);
931			*tail_blk = after_umount_blk;
932		}
933	}
934
935	/*
936	 * Make sure that there are no blocks in front of the head
937	 * with the same cycle number as the head.  This can happen
938	 * because we allow multiple outstanding log writes concurrently,
939	 * and the later writes might make it out before earlier ones.
940	 *
941	 * We use the lsn from before modifying it so that we'll never
942	 * overwrite the unmount record after a clean unmount.
943	 *
944	 * Do this only if we are going to recover the filesystem
945	 *
946	 * NOTE: This used to say "if (!readonly)"
947	 * However on Linux, we can & do recover a read-only filesystem.
948	 * We only skip recovery if NORECOVERY is specified on mount,
949	 * in which case we would not be here.
950	 *
951	 * But... if the -device- itself is readonly, just skip this.
952	 * We can't recover this device anyway, so it won't matter.
953	 */
954	if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) {
955		error = xlog_clear_stale_blocks(log, tail_lsn);
956	}
957
958bread_err:
959exit:
960	xlog_put_bp(bp);
961
962	if (error)
963		xlog_warn("XFS: failed to locate log tail");
964	return error;
965}
966
967/*
968 * Is the log zeroed at all?
969 *
970 * The last binary search should be changed to perform an X block read
971 * once X becomes small enough.  You can then search linearly through
972 * the X blocks.  This will cut down on the number of reads we need to do.
973 *
974 * If the log is partially zeroed, this routine will pass back the blkno
975 * of the first block with cycle number 0.  It won't have a complete LR
976 * preceding it.
977 *
978 * Return:
979 *	0  => the log is completely written to
980 *	-1 => use *blk_no as the first block of the log
981 *	>0 => error has occurred
982 */
983int
984xlog_find_zeroed(
985	xlog_t		*log,
986	xfs_daddr_t	*blk_no)
987{
988	xfs_buf_t	*bp;
989	xfs_caddr_t	offset;
990	uint	        first_cycle, last_cycle;
991	xfs_daddr_t	new_blk, last_blk, start_blk;
992	xfs_daddr_t     num_scan_bblks;
993	int	        error, log_bbnum = log->l_logBBsize;
994
995	/* check totally zeroed log */
996	bp = xlog_get_bp(log, 1);
997	if (!bp)
998		return ENOMEM;
999	if ((error = xlog_bread(log, 0, 1, bp)))
1000		goto bp_err;
1001	offset = xlog_align(log, 0, 1, bp);
1002	first_cycle = GET_CYCLE(offset, ARCH_CONVERT);
1003	if (first_cycle == 0) {		/* completely zeroed log */
1004		*blk_no = 0;
1005		xlog_put_bp(bp);
1006		return -1;
1007	}
1008
1009	/* check partially zeroed log */
1010	if ((error = xlog_bread(log, log_bbnum-1, 1, bp)))
1011		goto bp_err;
1012	offset = xlog_align(log, log_bbnum-1, 1, bp);
1013	last_cycle = GET_CYCLE(offset, ARCH_CONVERT);
1014	if (last_cycle != 0) {		/* log completely written to */
1015		xlog_put_bp(bp);
1016		return 0;
1017	} else if (first_cycle != 1) {
1018		/*
1019		 * If the cycle of the last block is zero, the cycle of
1020		 * the first block must be 1. If it's not, maybe we're
1021		 * not looking at a log... Bail out.
1022		 */
1023		xlog_warn("XFS: Log inconsistent or not a log (last==0, first!=1)");
1024		return XFS_ERROR(EINVAL);
1025	}
1026
1027	/* we have a partially zeroed log */
1028	last_blk = log_bbnum-1;
1029	if ((error = xlog_find_cycle_start(log, bp, 0, &last_blk, 0)))
1030		goto bp_err;
1031
1032	/*
1033	 * Validate the answer.  Because there is no way to guarantee that
1034	 * the entire log is made up of log records which are the same size,
1035	 * we scan over the defined maximum blocks.  At this point, the maximum
1036	 * is not chosen to mean anything special.   XXXmiken
1037	 */
1038	num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
1039	ASSERT(num_scan_bblks <= INT_MAX);
1040
1041	if (last_blk < num_scan_bblks)
1042		num_scan_bblks = last_blk;
1043	start_blk = last_blk - num_scan_bblks;
1044
1045	/*
1046	 * We search for any instances of cycle number 0 that occur before
1047	 * our current estimate of the head.  What we're trying to detect is
1048	 *        1 ... | 0 | 1 | 0...
1049	 *                       ^ binary search ends here
1050	 */
1051	if ((error = xlog_find_verify_cycle(log, start_blk,
1052					 (int)num_scan_bblks, 0, &new_blk)))
1053		goto bp_err;
1054	if (new_blk != -1)
1055		last_blk = new_blk;
1056
1057	/*
1058	 * Potentially backup over partial log record write.  We don't need
1059	 * to search the end of the log because we know it is zero.
1060	 */
1061	if ((error = xlog_find_verify_log_record(log, start_blk,
1062				&last_blk, 0)) == -1) {
1063	    error = XFS_ERROR(EIO);
1064	    goto bp_err;
1065	} else if (error)
1066	    goto bp_err;
1067
1068	*blk_no = last_blk;
1069bp_err:
1070	xlog_put_bp(bp);
1071	if (error)
1072		return error;
1073	return -1;
1074}
1075
1076/*
1077 * These are simple subroutines used by xlog_clear_stale_blocks() below
1078 * to initialize a buffer full of empty log record headers and write
1079 * them into the log.
1080 */
1081STATIC void
1082xlog_add_record(
1083	xlog_t			*log,
1084	xfs_caddr_t		buf,
1085	int			cycle,
1086	int			block,
1087	int			tail_cycle,
1088	int			tail_block)
1089{
1090	xlog_rec_header_t	*recp = (xlog_rec_header_t *)buf;
1091
1092	memset(buf, 0, BBSIZE);
1093	INT_SET(recp->h_magicno, ARCH_CONVERT, XLOG_HEADER_MAGIC_NUM);
1094	INT_SET(recp->h_cycle, ARCH_CONVERT, cycle);
1095	INT_SET(recp->h_version, ARCH_CONVERT,
1096			XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb) ? 2 : 1);
1097	ASSIGN_ANY_LSN_DISK(recp->h_lsn, cycle, block);
1098	ASSIGN_ANY_LSN_DISK(recp->h_tail_lsn, tail_cycle, tail_block);
1099	INT_SET(recp->h_fmt, ARCH_CONVERT, XLOG_FMT);
1100	memcpy(&recp->h_fs_uuid, &log->l_mp->m_sb.sb_uuid, sizeof(uuid_t));
1101}
1102
1103STATIC int
1104xlog_write_log_records(
1105	xlog_t		*log,
1106	int		cycle,
1107	int		start_block,
1108	int		blocks,
1109	int		tail_cycle,
1110	int		tail_block)
1111{
1112	xfs_caddr_t	offset;
1113	xfs_buf_t	*bp;
1114	int		balign, ealign;
1115	int		sectbb = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1);
1116	int		end_block = start_block + blocks;
1117	int		bufblks;
1118	int		error = 0;
1119	int		i, j = 0;
1120
1121	bufblks = 1 << ffs(blocks);
1122	while (!(bp = xlog_get_bp(log, bufblks))) {
1123		bufblks >>= 1;
1124		if (bufblks <= log->l_sectbb_log)
1125			return ENOMEM;
1126	}
1127
1128	/* We may need to do a read at the start to fill in part of
1129	 * the buffer in the starting sector not covered by the first
1130	 * write below.
1131	 */
1132	balign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, start_block);
1133	if (balign != start_block) {
1134		if ((error = xlog_bread(log, start_block, 1, bp))) {
1135			xlog_put_bp(bp);
1136			return error;
1137		}
1138		j = start_block - balign;
1139	}
1140
1141	for (i = start_block; i < end_block; i += bufblks) {
1142		int		bcount, endcount;
1143
1144		bcount = min(bufblks, end_block - start_block);
1145		endcount = bcount - j;
1146
1147		/* We may need to do a read at the end to fill in part of
1148		 * the buffer in the final sector not covered by the write.
1149		 * If this is the same sector as the above read, skip it.
1150		 */
1151		ealign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, end_block);
1152		if (j == 0 && (start_block + endcount > ealign)) {
1153			offset = XFS_BUF_PTR(bp);
1154			balign = BBTOB(ealign - start_block);
1155			XFS_BUF_SET_PTR(bp, offset + balign, BBTOB(sectbb));
1156			if ((error = xlog_bread(log, ealign, sectbb, bp)))
1157				break;
1158			XFS_BUF_SET_PTR(bp, offset, bufblks);
1159		}
1160
1161		offset = xlog_align(log, start_block, endcount, bp);
1162		for (; j < endcount; j++) {
1163			xlog_add_record(log, offset, cycle, i+j,
1164					tail_cycle, tail_block);
1165			offset += BBSIZE;
1166		}
1167		error = xlog_bwrite(log, start_block, endcount, bp);
1168		if (error)
1169			break;
1170		start_block += endcount;
1171		j = 0;
1172	}
1173	xlog_put_bp(bp);
1174	return error;
1175}
1176
1177/*
1178 * This routine is called to blow away any incomplete log writes out
1179 * in front of the log head.  We do this so that we won't become confused
1180 * if we come up, write only a little bit more, and then crash again.
1181 * If we leave the partial log records out there, this situation could
1182 * cause us to think those partial writes are valid blocks since they
1183 * have the current cycle number.  We get rid of them by overwriting them
1184 * with empty log records with the old cycle number rather than the
1185 * current one.
1186 *
1187 * The tail lsn is passed in rather than taken from
1188 * the log so that we will not write over the unmount record after a
1189 * clean unmount in a 512 block log.  Doing so would leave the log without
1190 * any valid log records in it until a new one was written.  If we crashed
1191 * during that time we would not be able to recover.
1192 */
1193STATIC int
1194xlog_clear_stale_blocks(
1195	xlog_t		*log,
1196	xfs_lsn_t	tail_lsn)
1197{
1198	int		tail_cycle, head_cycle;
1199	int		tail_block, head_block;
1200	int		tail_distance, max_distance;
1201	int		distance;
1202	int		error;
1203
1204	tail_cycle = CYCLE_LSN(tail_lsn);
1205	tail_block = BLOCK_LSN(tail_lsn);
1206	head_cycle = log->l_curr_cycle;
1207	head_block = log->l_curr_block;
1208
1209	/*
1210	 * Figure out the distance between the new head of the log
1211	 * and the tail.  We want to write over any blocks beyond the
1212	 * head that we may have written just before the crash, but
1213	 * we don't want to overwrite the tail of the log.
1214	 */
1215	if (head_cycle == tail_cycle) {
1216		/*
1217		 * The tail is behind the head in the physical log,
1218		 * so the distance from the head to the tail is the
1219		 * distance from the head to the end of the log plus
1220		 * the distance from the beginning of the log to the
1221		 * tail.
1222		 */
1223		if (unlikely(head_block < tail_block || head_block >= log->l_logBBsize)) {
1224			XFS_ERROR_REPORT("xlog_clear_stale_blocks(1)",
1225					 XFS_ERRLEVEL_LOW, log->l_mp);
1226			return XFS_ERROR(EFSCORRUPTED);
1227		}
1228		tail_distance = tail_block + (log->l_logBBsize - head_block);
1229	} else {
1230		/*
1231		 * The head is behind the tail in the physical log,
1232		 * so the distance from the head to the tail is just
1233		 * the tail block minus the head block.
1234		 */
1235		if (unlikely(head_block >= tail_block || head_cycle != (tail_cycle + 1))){
1236			XFS_ERROR_REPORT("xlog_clear_stale_blocks(2)",
1237					 XFS_ERRLEVEL_LOW, log->l_mp);
1238			return XFS_ERROR(EFSCORRUPTED);
1239		}
1240		tail_distance = tail_block - head_block;
1241	}
1242
1243	/*
1244	 * If the head is right up against the tail, we can't clear
1245	 * anything.
1246	 */
1247	if (tail_distance <= 0) {
1248		ASSERT(tail_distance == 0);
1249		return 0;
1250	}
1251
1252	max_distance = XLOG_TOTAL_REC_SHIFT(log);
1253	/*
1254	 * Take the smaller of the maximum amount of outstanding I/O
1255	 * we could have and the distance to the tail to clear out.
1256	 * We take the smaller so that we don't overwrite the tail and
1257	 * we don't waste all day writing from the head to the tail
1258	 * for no reason.
1259	 */
1260	max_distance = MIN(max_distance, tail_distance);
1261
1262	if ((head_block + max_distance) <= log->l_logBBsize) {
1263		/*
1264		 * We can stomp all the blocks we need to without
1265		 * wrapping around the end of the log.  Just do it
1266		 * in a single write.  Use the cycle number of the
1267		 * current cycle minus one so that the log will look like:
1268		 *     n ... | n - 1 ...
1269		 */
1270		error = xlog_write_log_records(log, (head_cycle - 1),
1271				head_block, max_distance, tail_cycle,
1272				tail_block);
1273		if (error)
1274			return error;
1275	} else {
1276		/*
1277		 * We need to wrap around the end of the physical log in
1278		 * order to clear all the blocks.  Do it in two separate
1279		 * I/Os.  The first write should be from the head to the
1280		 * end of the physical log, and it should use the current
1281		 * cycle number minus one just like above.
1282		 */
1283		distance = log->l_logBBsize - head_block;
1284		error = xlog_write_log_records(log, (head_cycle - 1),
1285				head_block, distance, tail_cycle,
1286				tail_block);
1287
1288		if (error)
1289			return error;
1290
1291		/*
1292		 * Now write the blocks at the start of the physical log.
1293		 * This writes the remainder of the blocks we want to clear.
1294		 * It uses the current cycle number since we're now on the
1295		 * same cycle as the head so that we get:
1296		 *    n ... n ... | n - 1 ...
1297		 *    ^^^^^ blocks we're writing
1298		 */
1299		distance = max_distance - (log->l_logBBsize - head_block);
1300		error = xlog_write_log_records(log, head_cycle, 0, distance,
1301				tail_cycle, tail_block);
1302		if (error)
1303			return error;
1304	}
1305
1306	return 0;
1307}
1308
1309/******************************************************************************
1310 *
1311 *		Log recover routines
1312 *
1313 ******************************************************************************
1314 */
1315
1316STATIC xlog_recover_t *
1317xlog_recover_find_tid(
1318	xlog_recover_t		*q,
1319	xlog_tid_t		tid)
1320{
1321	xlog_recover_t		*p = q;
1322
1323	while (p != NULL) {
1324		if (p->r_log_tid == tid)
1325		    break;
1326		p = p->r_next;
1327	}
1328	return p;
1329}
1330
1331STATIC void
1332xlog_recover_put_hashq(
1333	xlog_recover_t		**q,
1334	xlog_recover_t		*trans)
1335{
1336	trans->r_next = *q;
1337	*q = trans;
1338}
1339
1340STATIC void
1341xlog_recover_add_item(
1342	xlog_recover_item_t	**itemq)
1343{
1344	xlog_recover_item_t	*item;
1345
1346	item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP);
1347	xlog_recover_insert_item_backq(itemq, item);
1348}
1349
1350STATIC int
1351xlog_recover_add_to_cont_trans(
1352	xlog_recover_t		*trans,
1353	xfs_caddr_t		dp,
1354	int			len)
1355{
1356	xlog_recover_item_t	*item;
1357	xfs_caddr_t		ptr, old_ptr;
1358	int			old_len;
1359
1360	item = trans->r_itemq;
1361	if (item == 0) {
1362		/* finish copying rest of trans header */
1363		xlog_recover_add_item(&trans->r_itemq);
1364		ptr = (xfs_caddr_t) &trans->r_theader +
1365				sizeof(xfs_trans_header_t) - len;
1366		memcpy(ptr, dp, len); /* d, s, l */
1367		return 0;
1368	}
1369	item = item->ri_prev;
1370
1371	old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
1372	old_len = item->ri_buf[item->ri_cnt-1].i_len;
1373
1374	ptr = kmem_realloc(old_ptr, len+old_len, old_len, 0u);
1375	memcpy(&ptr[old_len], dp, len); /* d, s, l */
1376	item->ri_buf[item->ri_cnt-1].i_len += len;
1377	item->ri_buf[item->ri_cnt-1].i_addr = ptr;
1378	return 0;
1379}
1380
1381/*
1382 * The next region to add is the start of a new region.  It could be
1383 * a whole region or it could be the first part of a new region.  Because
1384 * of this, the assumption here is that the type and size fields of all
1385 * format structures fit into the first 32 bits of the structure.
1386 *
1387 * This works because all regions must be 32 bit aligned.  Therefore, we
1388 * either have both fields or we have neither field.  In the case we have
1389 * neither field, the data part of the region is zero length.  We only have
1390 * a log_op_header and can throw away the header since a new one will appear
1391 * later.  If we have at least 4 bytes, then we can determine how many regions
1392 * will appear in the current log item.
1393 */
1394STATIC int
1395xlog_recover_add_to_trans(
1396	xlog_recover_t		*trans,
1397	xfs_caddr_t		dp,
1398	int			len)
1399{
1400	xfs_inode_log_format_t	*in_f;			/* any will do */
1401	xlog_recover_item_t	*item;
1402	xfs_caddr_t		ptr;
1403
1404	if (!len)
1405		return 0;
1406	item = trans->r_itemq;
1407	if (item == 0) {
1408		ASSERT(*(uint *)dp == XFS_TRANS_HEADER_MAGIC);
1409		if (len == sizeof(xfs_trans_header_t))
1410			xlog_recover_add_item(&trans->r_itemq);
1411		memcpy(&trans->r_theader, dp, len); /* d, s, l */
1412		return 0;
1413	}
1414
1415	ptr = kmem_alloc(len, KM_SLEEP);
1416	memcpy(ptr, dp, len);
1417	in_f = (xfs_inode_log_format_t *)ptr;
1418
1419	if (item->ri_prev->ri_total != 0 &&
1420	     item->ri_prev->ri_total == item->ri_prev->ri_cnt) {
1421		xlog_recover_add_item(&trans->r_itemq);
1422	}
1423	item = trans->r_itemq;
1424	item = item->ri_prev;
1425
1426	if (item->ri_total == 0) {		/* first region to be added */
1427		item->ri_total	= in_f->ilf_size;
1428		ASSERT(item->ri_total <= XLOG_MAX_REGIONS_IN_ITEM);
1429		item->ri_buf = kmem_zalloc((item->ri_total *
1430					    sizeof(xfs_log_iovec_t)), KM_SLEEP);
1431	}
1432	ASSERT(item->ri_total > item->ri_cnt);
1433	/* Description region is ri_buf[0] */
1434	item->ri_buf[item->ri_cnt].i_addr = ptr;
1435	item->ri_buf[item->ri_cnt].i_len  = len;
1436	item->ri_cnt++;
1437	return 0;
1438}
1439
1440STATIC void
1441xlog_recover_new_tid(
1442	xlog_recover_t		**q,
1443	xlog_tid_t		tid,
1444	xfs_lsn_t		lsn)
1445{
1446	xlog_recover_t		*trans;
1447
1448	trans = kmem_zalloc(sizeof(xlog_recover_t), KM_SLEEP);
1449	trans->r_log_tid   = tid;
1450	trans->r_lsn	   = lsn;
1451	xlog_recover_put_hashq(q, trans);
1452}
1453
1454STATIC int
1455xlog_recover_unlink_tid(
1456	xlog_recover_t		**q,
1457	xlog_recover_t		*trans)
1458{
1459	xlog_recover_t		*tp;
1460	int			found = 0;
1461
1462	ASSERT(trans != 0);
1463	if (trans == *q) {
1464		*q = (*q)->r_next;
1465	} else {
1466		tp = *q;
1467		while (tp != 0) {
1468			if (tp->r_next == trans) {
1469				found = 1;
1470				break;
1471			}
1472			tp = tp->r_next;
1473		}
1474		if (!found) {
1475			xlog_warn(
1476			     "XFS: xlog_recover_unlink_tid: trans not found");
1477			ASSERT(0);
1478			return XFS_ERROR(EIO);
1479		}
1480		tp->r_next = tp->r_next->r_next;
1481	}
1482	return 0;
1483}
1484
1485STATIC void
1486xlog_recover_insert_item_backq(
1487	xlog_recover_item_t	**q,
1488	xlog_recover_item_t	*item)
1489{
1490	if (*q == 0) {
1491		item->ri_prev = item->ri_next = item;
1492		*q = item;
1493	} else {
1494		item->ri_next		= *q;
1495		item->ri_prev		= (*q)->ri_prev;
1496		(*q)->ri_prev		= item;
1497		item->ri_prev->ri_next	= item;
1498	}
1499}
1500
1501STATIC void
1502xlog_recover_insert_item_frontq(
1503	xlog_recover_item_t	**q,
1504	xlog_recover_item_t	*item)
1505{
1506	xlog_recover_insert_item_backq(q, item);
1507	*q = item;
1508}
1509
1510STATIC int
1511xlog_recover_reorder_trans(
1512	xlog_t			*log,
1513	xlog_recover_t		*trans)
1514{
1515	xlog_recover_item_t	*first_item, *itemq, *itemq_next;
1516	xfs_buf_log_format_t	*buf_f;
1517	xfs_buf_log_format_v1_t	*obuf_f;
1518	ushort			flags = 0;
1519
1520	first_item = itemq = trans->r_itemq;
1521	trans->r_itemq = NULL;
1522	do {
1523		itemq_next = itemq->ri_next;
1524		buf_f = (xfs_buf_log_format_t *)itemq->ri_buf[0].i_addr;
1525		switch (ITEM_TYPE(itemq)) {
1526		case XFS_LI_BUF:
1527			flags = buf_f->blf_flags;
1528			break;
1529		case XFS_LI_6_1_BUF:
1530		case XFS_LI_5_3_BUF:
1531			obuf_f = (xfs_buf_log_format_v1_t*)buf_f;
1532			flags = obuf_f->blf_flags;
1533			break;
1534		}
1535
1536		switch (ITEM_TYPE(itemq)) {
1537		case XFS_LI_BUF:
1538		case XFS_LI_6_1_BUF:
1539		case XFS_LI_5_3_BUF:
1540			if (!(flags & XFS_BLI_CANCEL)) {
1541				xlog_recover_insert_item_frontq(&trans->r_itemq,
1542								itemq);
1543				break;
1544			}
1545		case XFS_LI_INODE:
1546		case XFS_LI_6_1_INODE:
1547		case XFS_LI_5_3_INODE:
1548		case XFS_LI_DQUOT:
1549		case XFS_LI_QUOTAOFF:
1550		case XFS_LI_EFD:
1551		case XFS_LI_EFI:
1552			xlog_recover_insert_item_backq(&trans->r_itemq, itemq);
1553			break;
1554		default:
1555			xlog_warn(
1556	"XFS: xlog_recover_reorder_trans: unrecognized type of log operation");
1557			ASSERT(0);
1558			return XFS_ERROR(EIO);
1559		}
1560		itemq = itemq_next;
1561	} while (first_item != itemq);
1562	return 0;
1563}
1564
1565/*
1566 * Build up the table of buf cancel records so that we don't replay
1567 * cancelled data in the second pass.  For buffer records that are
1568 * not cancel records, there is nothing to do here so we just return.
1569 *
1570 * If we get a cancel record which is already in the table, this indicates
1571 * that the buffer was cancelled multiple times.  In order to ensure
1572 * that during pass 2 we keep the record in the table until we reach its
1573 * last occurrence in the log, we keep a reference count in the cancel
1574 * record in the table to tell us how many times we expect to see this
1575 * record during the second pass.
1576 */
1577STATIC void
1578xlog_recover_do_buffer_pass1(
1579	xlog_t			*log,
1580	xfs_buf_log_format_t	*buf_f)
1581{
1582	xfs_buf_cancel_t	*bcp;
1583	xfs_buf_cancel_t	*nextp;
1584	xfs_buf_cancel_t	*prevp;
1585	xfs_buf_cancel_t	**bucket;
1586	xfs_buf_log_format_v1_t	*obuf_f;
1587	xfs_daddr_t		blkno = 0;
1588	uint			len = 0;
1589	ushort			flags = 0;
1590
1591	switch (buf_f->blf_type) {
1592	case XFS_LI_BUF:
1593		blkno = buf_f->blf_blkno;
1594		len = buf_f->blf_len;
1595		flags = buf_f->blf_flags;
1596		break;
1597	case XFS_LI_6_1_BUF:
1598	case XFS_LI_5_3_BUF:
1599		obuf_f = (xfs_buf_log_format_v1_t*)buf_f;
1600		blkno = (xfs_daddr_t) obuf_f->blf_blkno;
1601		len = obuf_f->blf_len;
1602		flags = obuf_f->blf_flags;
1603		break;
1604	}
1605
1606	/*
1607	 * If this isn't a cancel buffer item, then just return.
1608	 */
1609	if (!(flags & XFS_BLI_CANCEL))
1610		return;
1611
1612	/*
1613	 * Insert an xfs_buf_cancel record into the hash table of
1614	 * them.  If there is already an identical record, bump
1615	 * its reference count.
1616	 */
1617	bucket = &log->l_buf_cancel_table[(__uint64_t)blkno %
1618					  XLOG_BC_TABLE_SIZE];
1619	/*
1620	 * If the hash bucket is empty then just insert a new record into
1621	 * the bucket.
1622	 */
1623	if (*bucket == NULL) {
1624		bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t),
1625						     KM_SLEEP);
1626		bcp->bc_blkno = blkno;
1627		bcp->bc_len = len;
1628		bcp->bc_refcount = 1;
1629		bcp->bc_next = NULL;
1630		*bucket = bcp;
1631		return;
1632	}
1633
1634	/*
1635	 * The hash bucket is not empty, so search for duplicates of our
1636	 * record.  If we find one them just bump its refcount.  If not
1637	 * then add us at the end of the list.
1638	 */
1639	prevp = NULL;
1640	nextp = *bucket;
1641	while (nextp != NULL) {
1642		if (nextp->bc_blkno == blkno && nextp->bc_len == len) {
1643			nextp->bc_refcount++;
1644			return;
1645		}
1646		prevp = nextp;
1647		nextp = nextp->bc_next;
1648	}
1649	ASSERT(prevp != NULL);
1650	bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t),
1651					     KM_SLEEP);
1652	bcp->bc_blkno = blkno;
1653	bcp->bc_len = len;
1654	bcp->bc_refcount = 1;
1655	bcp->bc_next = NULL;
1656	prevp->bc_next = bcp;
1657}
1658
1659/*
1660 * Check to see whether the buffer being recovered has a corresponding
1661 * entry in the buffer cancel record table.  If it does then return 1
1662 * so that it will be cancelled, otherwise return 0.  If the buffer is
1663 * actually a buffer cancel item (XFS_BLI_CANCEL is set), then decrement
1664 * the refcount on the entry in the table and remove it from the table
1665 * if this is the last reference.
1666 *
1667 * We remove the cancel record from the table when we encounter its
1668 * last occurrence in the log so that if the same buffer is re-used
1669 * again after its last cancellation we actually replay the changes
1670 * made at that point.
1671 */
1672STATIC int
1673xlog_check_buffer_cancelled(
1674	xlog_t			*log,
1675	xfs_daddr_t		blkno,
1676	uint			len,
1677	ushort			flags)
1678{
1679	xfs_buf_cancel_t	*bcp;
1680	xfs_buf_cancel_t	*prevp;
1681	xfs_buf_cancel_t	**bucket;
1682
1683	if (log->l_buf_cancel_table == NULL) {
1684		/*
1685		 * There is nothing in the table built in pass one,
1686		 * so this buffer must not be cancelled.
1687		 */
1688		ASSERT(!(flags & XFS_BLI_CANCEL));
1689		return 0;
1690	}
1691
1692	bucket = &log->l_buf_cancel_table[(__uint64_t)blkno %
1693					  XLOG_BC_TABLE_SIZE];
1694	bcp = *bucket;
1695	if (bcp == NULL) {
1696		/*
1697		 * There is no corresponding entry in the table built
1698		 * in pass one, so this buffer has not been cancelled.
1699		 */
1700		ASSERT(!(flags & XFS_BLI_CANCEL));
1701		return 0;
1702	}
1703
1704	/*
1705	 * Search for an entry in the buffer cancel table that
1706	 * matches our buffer.
1707	 */
1708	prevp = NULL;
1709	while (bcp != NULL) {
1710		if (bcp->bc_blkno == blkno && bcp->bc_len == len) {
1711			/*
1712			 * We've go a match, so return 1 so that the
1713			 * recovery of this buffer is cancelled.
1714			 * If this buffer is actually a buffer cancel
1715			 * log item, then decrement the refcount on the
1716			 * one in the table and remove it if this is the
1717			 * last reference.
1718			 */
1719			if (flags & XFS_BLI_CANCEL) {
1720				bcp->bc_refcount--;
1721				if (bcp->bc_refcount == 0) {
1722					if (prevp == NULL) {
1723						*bucket = bcp->bc_next;
1724					} else {
1725						prevp->bc_next = bcp->bc_next;
1726					}
1727					kmem_free(bcp,
1728						  sizeof(xfs_buf_cancel_t));
1729				}
1730			}
1731			return 1;
1732		}
1733		prevp = bcp;
1734		bcp = bcp->bc_next;
1735	}
1736	/*
1737	 * We didn't find a corresponding entry in the table, so
1738	 * return 0 so that the buffer is NOT cancelled.
1739	 */
1740	ASSERT(!(flags & XFS_BLI_CANCEL));
1741	return 0;
1742}
1743
1744STATIC int
1745xlog_recover_do_buffer_pass2(
1746	xlog_t			*log,
1747	xfs_buf_log_format_t	*buf_f)
1748{
1749	xfs_buf_log_format_v1_t	*obuf_f;
1750	xfs_daddr_t		blkno = 0;
1751	ushort			flags = 0;
1752	uint			len = 0;
1753
1754	switch (buf_f->blf_type) {
1755	case XFS_LI_BUF:
1756		blkno = buf_f->blf_blkno;
1757		flags = buf_f->blf_flags;
1758		len = buf_f->blf_len;
1759		break;
1760	case XFS_LI_6_1_BUF:
1761	case XFS_LI_5_3_BUF:
1762		obuf_f = (xfs_buf_log_format_v1_t*)buf_f;
1763		blkno = (xfs_daddr_t) obuf_f->blf_blkno;
1764		flags = obuf_f->blf_flags;
1765		len = (xfs_daddr_t) obuf_f->blf_len;
1766		break;
1767	}
1768
1769	return xlog_check_buffer_cancelled(log, blkno, len, flags);
1770}
1771
1772/*
1773 * Perform recovery for a buffer full of inodes.  In these buffers,
1774 * the only data which should be recovered is that which corresponds
1775 * to the di_next_unlinked pointers in the on disk inode structures.
1776 * The rest of the data for the inodes is always logged through the
1777 * inodes themselves rather than the inode buffer and is recovered
1778 * in xlog_recover_do_inode_trans().
1779 *
1780 * The only time when buffers full of inodes are fully recovered is
1781 * when the buffer is full of newly allocated inodes.  In this case
1782 * the buffer will not be marked as an inode buffer and so will be
1783 * sent to xlog_recover_do_reg_buffer() below during recovery.
1784 */
1785STATIC int
1786xlog_recover_do_inode_buffer(
1787	xfs_mount_t		*mp,
1788	xlog_recover_item_t	*item,
1789	xfs_buf_t		*bp,
1790	xfs_buf_log_format_t	*buf_f)
1791{
1792	int			i;
1793	int			item_index;
1794	int			bit;
1795	int			nbits;
1796	int			reg_buf_offset;
1797	int			reg_buf_bytes;
1798	int			next_unlinked_offset;
1799	int			inodes_per_buf;
1800	xfs_agino_t		*logged_nextp;
1801	xfs_agino_t		*buffer_nextp;
1802	xfs_buf_log_format_v1_t	*obuf_f;
1803	unsigned int		*data_map = NULL;
1804	unsigned int		map_size = 0;
1805
1806	switch (buf_f->blf_type) {
1807	case XFS_LI_BUF:
1808		data_map = buf_f->blf_data_map;
1809		map_size = buf_f->blf_map_size;
1810		break;
1811	case XFS_LI_6_1_BUF:
1812	case XFS_LI_5_3_BUF:
1813		obuf_f = (xfs_buf_log_format_v1_t*)buf_f;
1814		data_map = obuf_f->blf_data_map;
1815		map_size = obuf_f->blf_map_size;
1816		break;
1817	}
1818	/*
1819	 * Set the variables corresponding to the current region to
1820	 * 0 so that we'll initialize them on the first pass through
1821	 * the loop.
1822	 */
1823	reg_buf_offset = 0;
1824	reg_buf_bytes = 0;
1825	bit = 0;
1826	nbits = 0;
1827	item_index = 0;
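	/*
	 * sb_inodelog is log2 of the inode size, so this computes how
	 * many inodes fit in the buffer.
	 */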
1828	inodes_per_buf = XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog;
1829	for (i = 0; i < inodes_per_buf; i++) {
1830		next_unlinked_offset = (i * mp->m_sb.sb_inodesize) +
1831			offsetof(xfs_dinode_t, di_next_unlinked);
1832
1833		while (next_unlinked_offset >=
1834		       (reg_buf_offset + reg_buf_bytes)) {
1835			/*
1836			 * The next di_next_unlinked field is beyond
1837			 * the current logged region.  Find the next
1838			 * logged region that contains or is beyond
1839			 * the current di_next_unlinked field.
1840			 */
1841			bit += nbits;
1842			bit = xfs_next_bit(data_map, map_size, bit);
1843
1844			/*
1845			 * If there are no more logged regions in the
1846			 * buffer, then we're done.
1847			 */
1848			if (bit == -1) {
1849				return 0;
1850			}
1851
1852			nbits = xfs_contig_bits(data_map, map_size,
1853							 bit);
1854			ASSERT(nbits > 0);
1855			reg_buf_offset = bit << XFS_BLI_SHIFT;
1856			reg_buf_bytes = nbits << XFS_BLI_SHIFT;
1857			item_index++;
1858		}
1859
1860		/*
1861		 * If the current logged region starts after the current
1862		 * di_next_unlinked field, then move on to the next
1863		 * di_next_unlinked field.
1864		 */
1865		if (next_unlinked_offset < reg_buf_offset) {
1866			continue;
1867		}
1868
1869		ASSERT(item->ri_buf[item_index].i_addr != NULL);
1870		ASSERT((item->ri_buf[item_index].i_len % XFS_BLI_CHUNK) == 0);
1871		ASSERT((reg_buf_offset + reg_buf_bytes) <= XFS_BUF_COUNT(bp));
1872
1873		/*
1874		 * The current logged region contains a copy of the
1875		 * current di_next_unlinked field.  Extract its value
1876		 * and copy it to the buffer copy.
1877		 */
1878		logged_nextp = (xfs_agino_t *)
1879			       ((char *)(item->ri_buf[item_index].i_addr) +
1880				(next_unlinked_offset - reg_buf_offset));
1881		if (unlikely(*logged_nextp == 0)) {
1882			xfs_fs_cmn_err(CE_ALERT, mp,
1883				"bad inode buffer log record (ptr = 0x%p, bp = 0x%p).  XFS trying to replay bad (0) inode di_next_unlinked field",
1884				item, bp);
1885			XFS_ERROR_REPORT("xlog_recover_do_inode_buf",
1886					 XFS_ERRLEVEL_LOW, mp);
1887			return XFS_ERROR(EFSCORRUPTED);
1888		}
1889
1890		buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp,
1891					      next_unlinked_offset);
1892		INT_SET(*buffer_nextp, ARCH_CONVERT, *logged_nextp);
1893	}
1894
1895	return 0;
1896}
1897
1898/*
1899 * Perform a 'normal' buffer recovery.  Each logged region of the
1900 * buffer should be copied over the corresponding region in the
1901 * given buffer.  The bitmap in the buf log format structure indicates
1902 * where to place the logged data.
1903 */
1904/*ARGSUSED*/
1905STATIC void
1906xlog_recover_do_reg_buffer(
1907	xfs_mount_t		*mp,
1908	xlog_recover_item_t	*item,
1909	xfs_buf_t		*bp,
1910	xfs_buf_log_format_t	*buf_f)
1911{
1912	int			i;
1913	int			bit;
1914	int			nbits;
1915	xfs_buf_log_format_v1_t	*obuf_f;
1916	unsigned int		*data_map = NULL;
1917	unsigned int		map_size = 0;
1918	int                     error;
1919
1920	switch (buf_f->blf_type) {
1921	case XFS_LI_BUF:
1922		data_map = buf_f->blf_data_map;
1923		map_size = buf_f->blf_map_size;
1924		break;
1925	case XFS_LI_6_1_BUF:
1926	case XFS_LI_5_3_BUF:
1927		obuf_f = (xfs_buf_log_format_v1_t*)buf_f;
1928		data_map = obuf_f->blf_data_map;
1929		map_size = obuf_f->blf_map_size;
1930		break;
1931	}
1932	bit = 0;
1933	i = 1;  /* 0 is the buf format structure */
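	/*
	 * Walk the dirty bitmap; each contiguous run of set bits
	 * describes one logged region to copy back into the buffer.
	 */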
1934	while (1) {
1935		bit = xfs_next_bit(data_map, map_size, bit);
1936		if (bit == -1)
1937			break;
1938		nbits = xfs_contig_bits(data_map, map_size, bit);
1939		ASSERT(nbits > 0);
1940		ASSERT(item->ri_buf[i].i_addr != 0);
1941		ASSERT(item->ri_buf[i].i_len % XFS_BLI_CHUNK == 0);
1942		ASSERT(XFS_BUF_COUNT(bp) >=
1943		       ((uint)bit << XFS_BLI_SHIFT)+(nbits<<XFS_BLI_SHIFT));
1944
1945		/*
1946		 * Do a sanity check if this is a dquot buffer. Just checking
		 * the first dquot in the buffer should do. XXX: This is
1948		 * probably a good thing to do for other buf types also.
1949		 */
1950		error = 0;
1951		if (buf_f->blf_flags &
1952		   (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) {
1953			error = xfs_qm_dqcheck((xfs_disk_dquot_t *)
1954					       item->ri_buf[i].i_addr,
1955					       -1, 0, XFS_QMOPT_DOWARN,
1956					       "dquot_buf_recover");
1957		}
1958		if (!error)
1959			memcpy(xfs_buf_offset(bp,
1960				(uint)bit << XFS_BLI_SHIFT),	/* dest */
1961				item->ri_buf[i].i_addr,		/* source */
1962				nbits<<XFS_BLI_SHIFT);		/* length */
1963		i++;
1964		bit += nbits;
1965	}
1966
1967	/* Shouldn't be any more regions */
1968	ASSERT(i == item->ri_total);
1969}
1970
1971/*
1972 * Do some primitive error checking on ondisk dquot data structures.
1973 */
1974int
1975xfs_qm_dqcheck(
1976	xfs_disk_dquot_t *ddq,
1977	xfs_dqid_t	 id,
	uint		 type,	  /* used only when XFS_QMOPT_DQREPAIR is set */
1979	uint		 flags,
1980	char		 *str)
1981{
1982	xfs_dqblk_t	 *d = (xfs_dqblk_t *)ddq;
1983	int		errs = 0;
1984
1985	/*
1986	 * We can encounter an uninitialized dquot buffer for 2 reasons:
1987	 * 1. If we crash while deleting the quotainode(s), and those blks got
1988	 *    used for user data. This is because we take the path of regular
1989	 *    file deletion; however, the size field of quotainodes is never
1990	 *    updated, so all the tricks that we play in itruncate_finish
1991	 *    don't quite matter.
1992	 *
	 * 2. We don't replay the quota buffers when there's a quotaoff logitem.
1994	 *    But the allocation will be replayed so we'll end up with an
1995	 *    uninitialized quota block.
1996	 *
1997	 * This is all fine; things are still consistent, and we haven't lost
1998	 * any quota information. Just don't complain about bad dquot blks.
1999	 */
2000	if (be16_to_cpu(ddq->d_magic) != XFS_DQUOT_MAGIC) {
2001		if (flags & XFS_QMOPT_DOWARN)
2002			cmn_err(CE_ALERT,
2003			"%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x",
2004			str, id, be16_to_cpu(ddq->d_magic), XFS_DQUOT_MAGIC);
2005		errs++;
2006	}
2007	if (ddq->d_version != XFS_DQUOT_VERSION) {
2008		if (flags & XFS_QMOPT_DOWARN)
2009			cmn_err(CE_ALERT,
2010			"%s : XFS dquot ID 0x%x, version 0x%x != 0x%x",
2011			str, id, ddq->d_version, XFS_DQUOT_VERSION);
2012		errs++;
2013	}
2014
2015	if (ddq->d_flags != XFS_DQ_USER &&
2016	    ddq->d_flags != XFS_DQ_PROJ &&
2017	    ddq->d_flags != XFS_DQ_GROUP) {
2018		if (flags & XFS_QMOPT_DOWARN)
2019			cmn_err(CE_ALERT,
2020			"%s : XFS dquot ID 0x%x, unknown flags 0x%x",
2021			str, id, ddq->d_flags);
2022		errs++;
2023	}
2024
2025	if (id != -1 && id != be32_to_cpu(ddq->d_id)) {
2026		if (flags & XFS_QMOPT_DOWARN)
2027			cmn_err(CE_ALERT,
2028			"%s : ondisk-dquot 0x%p, ID mismatch: "
2029			"0x%x expected, found id 0x%x",
2030			str, ddq, id, be32_to_cpu(ddq->d_id));
2031		errs++;
2032	}
2033
2034	if (!errs && ddq->d_id) {
2035		if (ddq->d_blk_softlimit &&
2036		    be64_to_cpu(ddq->d_bcount) >=
2037				be64_to_cpu(ddq->d_blk_softlimit)) {
2038			if (!ddq->d_btimer) {
2039				if (flags & XFS_QMOPT_DOWARN)
2040					cmn_err(CE_ALERT,
2041					"%s : Dquot ID 0x%x (0x%p) "
2042					"BLK TIMER NOT STARTED",
2043					str, (int)be32_to_cpu(ddq->d_id), ddq);
2044				errs++;
2045			}
2046		}
2047		if (ddq->d_ino_softlimit &&
2048		    be64_to_cpu(ddq->d_icount) >=
2049				be64_to_cpu(ddq->d_ino_softlimit)) {
2050			if (!ddq->d_itimer) {
2051				if (flags & XFS_QMOPT_DOWARN)
2052					cmn_err(CE_ALERT,
2053					"%s : Dquot ID 0x%x (0x%p) "
2054					"INODE TIMER NOT STARTED",
2055					str, (int)be32_to_cpu(ddq->d_id), ddq);
2056				errs++;
2057			}
2058		}
2059		if (ddq->d_rtb_softlimit &&
2060		    be64_to_cpu(ddq->d_rtbcount) >=
2061				be64_to_cpu(ddq->d_rtb_softlimit)) {
2062			if (!ddq->d_rtbtimer) {
2063				if (flags & XFS_QMOPT_DOWARN)
2064					cmn_err(CE_ALERT,
2065					"%s : Dquot ID 0x%x (0x%p) "
2066					"RTBLK TIMER NOT STARTED",
2067					str, (int)be32_to_cpu(ddq->d_id), ddq);
2068				errs++;
2069			}
2070		}
2071	}
2072
2073	if (!errs || !(flags & XFS_QMOPT_DQREPAIR))
2074		return errs;
2075
2076	if (flags & XFS_QMOPT_DOWARN)
2077		cmn_err(CE_NOTE, "Re-initializing dquot ID 0x%x", id);
2078
2079	/*
2080	 * Typically, a repair is only requested by quotacheck.
2081	 */
2082	ASSERT(id != -1);
2083	ASSERT(flags & XFS_QMOPT_DQREPAIR);
2084	memset(d, 0, sizeof(xfs_dqblk_t));
2085
2086	d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
2087	d->dd_diskdq.d_version = XFS_DQUOT_VERSION;
2088	d->dd_diskdq.d_flags = type;
2089	d->dd_diskdq.d_id = cpu_to_be32(id);
2090
2091	return errs;
2092}
2093
2094/*
2095 * Perform a dquot buffer recovery.
2096 * Simple algorithm: if we have found a QUOTAOFF logitem of the same type
 * (i.e. USR or GRP), then just toss this buffer away; don't recover it.
2098 * Else, treat it as a regular buffer and do recovery.
2099 */
2100STATIC void
2101xlog_recover_do_dquot_buffer(
2102	xfs_mount_t		*mp,
2103	xlog_t			*log,
2104	xlog_recover_item_t	*item,
2105	xfs_buf_t		*bp,
2106	xfs_buf_log_format_t	*buf_f)
2107{
2108	uint			type;
2109
2110	/*
2111	 * Filesystems are required to send in quota flags at mount time.
2112	 */
2113	if (mp->m_qflags == 0) {
2114		return;
2115	}
2116
2117	type = 0;
2118	if (buf_f->blf_flags & XFS_BLI_UDQUOT_BUF)
2119		type |= XFS_DQ_USER;
2120	if (buf_f->blf_flags & XFS_BLI_PDQUOT_BUF)
2121		type |= XFS_DQ_PROJ;
2122	if (buf_f->blf_flags & XFS_BLI_GDQUOT_BUF)
2123		type |= XFS_DQ_GROUP;
2124	/*
	 * This type of quota was turned off, so ignore this buffer
2126	 */
2127	if (log->l_quotaoffs_flag & type)
2128		return;
2129
2130	xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
2131}
2132
2133/*
2134 * This routine replays a modification made to a buffer at runtime.
2135 * There are actually two types of buffer, regular and inode, which
 * are handled differently.  For inode buffers we only recover a
 * specific set of data, namely the inode di_next_unlinked fields,
 * because all other inode data is actually logged via inode records
 * and any data we replay here which overlaps that may be stale.
2141 *
2142 * When meta-data buffers are freed at run time we log a buffer item
2143 * with the XFS_BLI_CANCEL bit set to indicate that previous copies
2144 * of the buffer in the log should not be replayed at recovery time.
2145 * This is so that if the blocks covered by the buffer are reused for
2146 * file data before we crash we don't end up replaying old, freed
2147 * meta-data into a user's file.
2148 *
2149 * To handle the cancellation of buffer log items, we make two passes
2150 * over the log during recovery.  During the first we build a table of
2151 * those buffers which have been cancelled, and during the second we
2152 * only replay those buffers which do not have corresponding cancel
2153 * records in the table.  See xlog_recover_do_buffer_pass[1,2] above
2154 * for more details on the implementation of the table of cancel records.
2155 */
2156STATIC int
2157xlog_recover_do_buffer_trans(
2158	xlog_t			*log,
2159	xlog_recover_item_t	*item,
2160	int			pass)
2161{
2162	xfs_buf_log_format_t	*buf_f;
2163	xfs_buf_log_format_v1_t	*obuf_f;
2164	xfs_mount_t		*mp;
2165	xfs_buf_t		*bp;
2166	int			error;
2167	int			cancel;
2168	xfs_daddr_t		blkno;
2169	int			len;
2170	ushort			flags;
2171
2172	buf_f = (xfs_buf_log_format_t *)item->ri_buf[0].i_addr;
2173
2174	if (pass == XLOG_RECOVER_PASS1) {
2175		/*
2176		 * In this pass we're only looking for buf items
2177		 * with the XFS_BLI_CANCEL bit set.
2178		 */
2179		xlog_recover_do_buffer_pass1(log, buf_f);
2180		return 0;
2181	} else {
2182		/*
2183		 * In this pass we want to recover all the buffers
2184		 * which have not been cancelled and are not
2185		 * cancellation buffers themselves.  The routine
2186		 * we call here will tell us whether or not to
2187		 * continue with the replay of this buffer.
2188		 */
2189		cancel = xlog_recover_do_buffer_pass2(log, buf_f);
2190		if (cancel) {
2191			return 0;
2192		}
2193	}
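	/*
	 * Pull the buffer's location, length and flags out of the
	 * (possibly old-format) buf log item.
	 */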
2194	switch (buf_f->blf_type) {
2195	case XFS_LI_BUF:
2196		blkno = buf_f->blf_blkno;
2197		len = buf_f->blf_len;
2198		flags = buf_f->blf_flags;
2199		break;
2200	case XFS_LI_6_1_BUF:
2201	case XFS_LI_5_3_BUF:
2202		obuf_f = (xfs_buf_log_format_v1_t*)buf_f;
2203		blkno = obuf_f->blf_blkno;
2204		len = obuf_f->blf_len;
2205		flags = obuf_f->blf_flags;
2206		break;
2207	default:
2208		xfs_fs_cmn_err(CE_ALERT, log->l_mp,
2209			"xfs_log_recover: unknown buffer type 0x%x, logdev %s",
2210			buf_f->blf_type, log->l_mp->m_logname ?
2211			log->l_mp->m_logname : "internal");
2212		XFS_ERROR_REPORT("xlog_recover_do_buffer_trans",
2213				 XFS_ERRLEVEL_LOW, log->l_mp);
2214		return XFS_ERROR(EFSCORRUPTED);
2215	}
2216
2217	mp = log->l_mp;
2218	if (flags & XFS_BLI_INODE_BUF) {
2219		bp = xfs_buf_read_flags(mp->m_ddev_targp, blkno, len,
2220								XFS_BUF_LOCK);
2221	} else {
2222		bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, 0);
2223	}
2224	if (XFS_BUF_ISERROR(bp)) {
2225		xfs_ioerror_alert("xlog_recover_do..(read#1)", log->l_mp,
2226				  bp, blkno);
2227		error = XFS_BUF_GETERROR(bp);
2228		xfs_buf_relse(bp);
2229		return error;
2230	}
2231
2232	error = 0;
2233	if (flags & XFS_BLI_INODE_BUF) {
2234		error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
2235	} else if (flags &
2236		  (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) {
2237		xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
2238	} else {
2239		xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
2240	}
	if (error) {
		xfs_buf_relse(bp);
		return XFS_ERROR(error);
	}
2243
2244	/*
2245	 * Perform delayed write on the buffer.  Asynchronous writes will be
2246	 * slower when taking into account all the buffers to be flushed.
2247	 *
2248	 * Also make sure that only inode buffers with good sizes stay in
2249	 * the buffer cache.  The kernel moves inodes in buffers of 1 block
2250	 * or XFS_INODE_CLUSTER_SIZE bytes, whichever is bigger.  The inode
2251	 * buffers in the log can be a different size if the log was generated
2252	 * by an older kernel using unclustered inode buffers or a newer kernel
	 * running with a different inode cluster size.  Regardless, if
	 * the inode buffer size isn't MAX(blocksize, XFS_INODE_CLUSTER_SIZE)
2255	 * for *our* value of XFS_INODE_CLUSTER_SIZE, then we need to keep
2256	 * the buffer out of the buffer cache so that the buffer won't
2257	 * overlap with future reads of those inodes.
2258	 */
2259	if (XFS_DINODE_MAGIC ==
2260	    INT_GET(*((__uint16_t *)(xfs_buf_offset(bp, 0))), ARCH_CONVERT) &&
2261	    (XFS_BUF_COUNT(bp) != MAX(log->l_mp->m_sb.sb_blocksize,
2262			(__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) {
2263		XFS_BUF_STALE(bp);
2264		error = xfs_bwrite(mp, bp);
2265	} else {
2266		ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL ||
2267		       XFS_BUF_FSPRIVATE(bp, xfs_mount_t *) == mp);
2268		XFS_BUF_SET_FSPRIVATE(bp, mp);
2269		XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
2270		xfs_bdwrite(mp, bp);
2271	}
2272
2273	return (error);
2274}
2275
2276STATIC int
2277xlog_recover_do_inode_trans(
2278	xlog_t			*log,
2279	xlog_recover_item_t	*item,
2280	int			pass)
2281{
2282	xfs_inode_log_format_t	*in_f;
2283	xfs_mount_t		*mp;
2284	xfs_buf_t		*bp;
2285	xfs_imap_t		imap;
2286	xfs_dinode_t		*dip;
2287	xfs_ino_t		ino;
2288	int			len;
2289	xfs_caddr_t		src;
2290	xfs_caddr_t		dest;
2291	int			error;
2292	int			attr_index;
2293	uint			fields;
2294	xfs_dinode_core_t	*dicp;
2295
2296	if (pass == XLOG_RECOVER_PASS1) {
2297		return 0;
2298	}
2299
2300	in_f = (xfs_inode_log_format_t *)item->ri_buf[0].i_addr;
2301	ino = in_f->ilf_ino;
2302	mp = log->l_mp;
2303	if (ITEM_TYPE(item) == XFS_LI_INODE) {
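		/*
		 * New-format records carry the inode's buffer location
		 * directly in the log item.
		 */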
2304		imap.im_blkno = (xfs_daddr_t)in_f->ilf_blkno;
2305		imap.im_len = in_f->ilf_len;
2306		imap.im_boffset = in_f->ilf_boffset;
2307	} else {
2308		/*
2309		 * It's an old inode format record.  We don't know where
2310		 * its cluster is located on disk, and we can't allow
2311		 * xfs_imap() to figure it out because the inode btrees
2312		 * are not ready to be used.  Therefore do not pass the
2313		 * XFS_IMAP_LOOKUP flag to xfs_imap().  This will give
2314		 * us only the single block in which the inode lives
2315		 * rather than its cluster, so we must make sure to
2316		 * invalidate the buffer when we write it out below.
2317		 */
2318		imap.im_blkno = 0;
2319		xfs_imap(log->l_mp, NULL, ino, &imap, 0);
2320	}
2321
2322	/*
2323	 * Inode buffers can be freed, look out for it,
2324	 * and do not replay the inode.
2325	 */
2326	if (xlog_check_buffer_cancelled(log, imap.im_blkno, imap.im_len, 0))
2327		return 0;
2328
2329	bp = xfs_buf_read_flags(mp->m_ddev_targp, imap.im_blkno, imap.im_len,
2330								XFS_BUF_LOCK);
2331	if (XFS_BUF_ISERROR(bp)) {
2332		xfs_ioerror_alert("xlog_recover_do..(read#2)", mp,
2333				  bp, imap.im_blkno);
2334		error = XFS_BUF_GETERROR(bp);
2335		xfs_buf_relse(bp);
2336		return error;
2337	}
2338	error = 0;
2339	ASSERT(in_f->ilf_fields & XFS_ILOG_CORE);
2340	dip = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset);
2341
2342	/*
2343	 * Make sure the place we're flushing out to really looks
2344	 * like an inode!
2345	 */
2346	if (unlikely(INT_GET(dip->di_core.di_magic, ARCH_CONVERT) != XFS_DINODE_MAGIC)) {
2347		xfs_buf_relse(bp);
2348		xfs_fs_cmn_err(CE_ALERT, mp,
2349			"xfs_inode_recover: Bad inode magic number, dino ptr = 0x%p, dino bp = 0x%p, ino = %Ld",
2350			dip, bp, ino);
2351		XFS_ERROR_REPORT("xlog_recover_do_inode_trans(1)",
2352				 XFS_ERRLEVEL_LOW, mp);
2353		return XFS_ERROR(EFSCORRUPTED);
2354	}
2355	dicp = (xfs_dinode_core_t*)(item->ri_buf[1].i_addr);
2356	if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) {
2357		xfs_buf_relse(bp);
2358		xfs_fs_cmn_err(CE_ALERT, mp,
2359			"xfs_inode_recover: Bad inode log record, rec ptr 0x%p, ino %Ld",
2360			item, ino);
2361		XFS_ERROR_REPORT("xlog_recover_do_inode_trans(2)",
2362				 XFS_ERRLEVEL_LOW, mp);
2363		return XFS_ERROR(EFSCORRUPTED);
2364	}
2365
2366	/* Skip replay when the on disk inode is newer than the log one */
2367	if (dicp->di_flushiter <
2368	    INT_GET(dip->di_core.di_flushiter, ARCH_CONVERT)) {
2369		/*
2370		 * Deal with the wrap case, DI_MAX_FLUSH is less
2371		 * than smaller numbers
2372		 */
2373		if ((INT_GET(dip->di_core.di_flushiter, ARCH_CONVERT)
2374							== DI_MAX_FLUSH) &&
2375		    (dicp->di_flushiter < (DI_MAX_FLUSH>>1))) {
2376			/* do nothing */
2377		} else {
2378			xfs_buf_relse(bp);
2379			return 0;
2380		}
2381	}
2382	/* Take the opportunity to reset the flush iteration count */
2383	dicp->di_flushiter = 0;
2384
2385	if (unlikely((dicp->di_mode & S_IFMT) == S_IFREG)) {
2386		if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
2387		    (dicp->di_format != XFS_DINODE_FMT_BTREE)) {
2388			XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(3)",
2389					 XFS_ERRLEVEL_LOW, mp, dicp);
2390			xfs_buf_relse(bp);
2391			xfs_fs_cmn_err(CE_ALERT, mp,
2392				"xfs_inode_recover: Bad regular inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2393				item, dip, bp, ino);
2394			return XFS_ERROR(EFSCORRUPTED);
2395		}
2396	} else if (unlikely((dicp->di_mode & S_IFMT) == S_IFDIR)) {
2397		if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
2398		    (dicp->di_format != XFS_DINODE_FMT_BTREE) &&
2399		    (dicp->di_format != XFS_DINODE_FMT_LOCAL)) {
2400			XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(4)",
2401					     XFS_ERRLEVEL_LOW, mp, dicp);
2402			xfs_buf_relse(bp);
2403			xfs_fs_cmn_err(CE_ALERT, mp,
2404				"xfs_inode_recover: Bad dir inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2405				item, dip, bp, ino);
2406			return XFS_ERROR(EFSCORRUPTED);
2407		}
2408	}
2409	if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){
2410		XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(5)",
2411				     XFS_ERRLEVEL_LOW, mp, dicp);
2412		xfs_buf_relse(bp);
2413		xfs_fs_cmn_err(CE_ALERT, mp,
2414			"xfs_inode_recover: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld",
2415			item, dip, bp, ino,
2416			dicp->di_nextents + dicp->di_anextents,
2417			dicp->di_nblocks);
2418		return XFS_ERROR(EFSCORRUPTED);
2419	}
2420	if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) {
2421		XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(6)",
2422				     XFS_ERRLEVEL_LOW, mp, dicp);
2423		xfs_buf_relse(bp);
2424		xfs_fs_cmn_err(CE_ALERT, mp,
2425			"xfs_inode_recover: Bad inode log rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, forkoff 0x%x",
2426			item, dip, bp, ino, dicp->di_forkoff);
2427		return XFS_ERROR(EFSCORRUPTED);
2428	}
2429	if (unlikely(item->ri_buf[1].i_len > sizeof(xfs_dinode_core_t))) {
2430		XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(7)",
2431				     XFS_ERRLEVEL_LOW, mp, dicp);
2432		xfs_buf_relse(bp);
2433		xfs_fs_cmn_err(CE_ALERT, mp,
2434			"xfs_inode_recover: Bad inode log record length %d, rec ptr 0x%p",
2435			item->ri_buf[1].i_len, item);
2436		return XFS_ERROR(EFSCORRUPTED);
2437	}
2438
	/* The core is in in-core format; translate it to on-disk format */
2440	xfs_xlate_dinode_core((xfs_caddr_t)&dip->di_core,
2441			      (xfs_dinode_core_t*)item->ri_buf[1].i_addr, -1);
2442
2443	/* the rest is in on-disk format */
2444	if (item->ri_buf[1].i_len > sizeof(xfs_dinode_core_t)) {
2445		memcpy((xfs_caddr_t) dip + sizeof(xfs_dinode_core_t),
2446			item->ri_buf[1].i_addr + sizeof(xfs_dinode_core_t),
2447			item->ri_buf[1].i_len  - sizeof(xfs_dinode_core_t));
2448	}
2449
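	/*
	 * If a device number or UUID was logged with the inode, copy
	 * it into place in the on-disk inode.
	 */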
2450	fields = in_f->ilf_fields;
2451	switch (fields & (XFS_ILOG_DEV | XFS_ILOG_UUID)) {
2452	case XFS_ILOG_DEV:
2453		INT_SET(dip->di_u.di_dev, ARCH_CONVERT, in_f->ilf_u.ilfu_rdev);
2454
2455		break;
2456	case XFS_ILOG_UUID:
2457		dip->di_u.di_muuid = in_f->ilf_u.ilfu_uuid;
2458		break;
2459	}
2460
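	/*
	 * Region 0 is the log format and region 1 is the inode core.
	 * If those are the only regions logged, there is no fork data
	 * to replay.
	 */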
2461	if (in_f->ilf_size == 2)
2462		goto write_inode_buffer;
2463	len = item->ri_buf[2].i_len;
2464	src = item->ri_buf[2].i_addr;
2465	ASSERT(in_f->ilf_size <= 4);
2466	ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK));
2467	ASSERT(!(fields & XFS_ILOG_DFORK) ||
2468	       (len == in_f->ilf_dsize));
2469
2470	switch (fields & XFS_ILOG_DFORK) {
2471	case XFS_ILOG_DDATA:
2472	case XFS_ILOG_DEXT:
2473		memcpy(&dip->di_u, src, len);
2474		break;
2475
2476	case XFS_ILOG_DBROOT:
2477		xfs_bmbt_to_bmdr((xfs_bmbt_block_t *)src, len,
2478				 &(dip->di_u.di_bmbt),
2479				 XFS_DFORK_DSIZE(dip, mp));
2480		break;
2481
2482	default:
2483		/*
2484		 * There are no data fork flags set.
2485		 */
2486		ASSERT((fields & XFS_ILOG_DFORK) == 0);
2487		break;
2488	}
2489
2490	/*
2491	 * If we logged any attribute data, recover it.  There may or
2492	 * may not have been any other non-core data logged in this
2493	 * transaction.
2494	 */
2495	if (in_f->ilf_fields & XFS_ILOG_AFORK) {
2496		if (in_f->ilf_fields & XFS_ILOG_DFORK) {
2497			attr_index = 3;
2498		} else {
2499			attr_index = 2;
2500		}
2501		len = item->ri_buf[attr_index].i_len;
2502		src = item->ri_buf[attr_index].i_addr;
2503		ASSERT(len == in_f->ilf_asize);
2504
2505		switch (in_f->ilf_fields & XFS_ILOG_AFORK) {
2506		case XFS_ILOG_ADATA:
2507		case XFS_ILOG_AEXT:
2508			dest = XFS_DFORK_APTR(dip);
2509			ASSERT(len <= XFS_DFORK_ASIZE(dip, mp));
2510			memcpy(dest, src, len);
2511			break;
2512
2513		case XFS_ILOG_ABROOT:
2514			dest = XFS_DFORK_APTR(dip);
2515			xfs_bmbt_to_bmdr((xfs_bmbt_block_t *)src, len,
2516					 (xfs_bmdr_block_t*)dest,
2517					 XFS_DFORK_ASIZE(dip, mp));
2518			break;
2519
2520		default:
2521			xlog_warn("XFS: xlog_recover_do_inode_trans: Invalid flag");
2522			ASSERT(0);
2523			xfs_buf_relse(bp);
2524			return XFS_ERROR(EIO);
2525		}
2526	}
2527
2528write_inode_buffer:
2529	if (ITEM_TYPE(item) == XFS_LI_INODE) {
2530		ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL ||
2531		       XFS_BUF_FSPRIVATE(bp, xfs_mount_t *) == mp);
2532		XFS_BUF_SET_FSPRIVATE(bp, mp);
2533		XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
2534		xfs_bdwrite(mp, bp);
2535	} else {
2536		XFS_BUF_STALE(bp);
2537		error = xfs_bwrite(mp, bp);
2538	}
2539
2540	return (error);
2541}
2542
2543/*
2544 * Recover QUOTAOFF records. We simply make a note of it in the xlog_t
 * structure, so that we know not to do any dquot item or dquot buffer
 * recovery of that type.
2547 */
2548STATIC int
2549xlog_recover_do_quotaoff_trans(
2550	xlog_t			*log,
2551	xlog_recover_item_t	*item,
2552	int			pass)
2553{
2554	xfs_qoff_logformat_t	*qoff_f;
2555
2556	if (pass == XLOG_RECOVER_PASS2) {
2557		return (0);
2558	}
2559
2560	qoff_f = (xfs_qoff_logformat_t *)item->ri_buf[0].i_addr;
2561	ASSERT(qoff_f);
2562
2563	/*
2564	 * The logitem format's flag tells us if this was user quotaoff,
2565	 * group/project quotaoff or both.
2566	 */
2567	if (qoff_f->qf_flags & XFS_UQUOTA_ACCT)
2568		log->l_quotaoffs_flag |= XFS_DQ_USER;
2569	if (qoff_f->qf_flags & XFS_PQUOTA_ACCT)
2570		log->l_quotaoffs_flag |= XFS_DQ_PROJ;
2571	if (qoff_f->qf_flags & XFS_GQUOTA_ACCT)
2572		log->l_quotaoffs_flag |= XFS_DQ_GROUP;
2573
2574	return (0);
2575}
2576
2577/*
2578 * Recover a dquot record
2579 */
2580STATIC int
2581xlog_recover_do_dquot_trans(
2582	xlog_t			*log,
2583	xlog_recover_item_t	*item,
2584	int			pass)
2585{
2586	xfs_mount_t		*mp;
2587	xfs_buf_t		*bp;
2588	struct xfs_disk_dquot	*ddq, *recddq;
2589	int			error;
2590	xfs_dq_logformat_t	*dq_f;
2591	uint			type;
2592
2593	if (pass == XLOG_RECOVER_PASS1) {
2594		return 0;
2595	}
2596	mp = log->l_mp;
2597
2598	/*
2599	 * Filesystems are required to send in quota flags at mount time.
2600	 */
2601	if (mp->m_qflags == 0)
2602		return (0);
2603
2604	recddq = (xfs_disk_dquot_t *)item->ri_buf[1].i_addr;
2605	ASSERT(recddq);
2606	/*
	 * This type of quota was turned off, so ignore this record.
2608	 */
2609	type = INT_GET(recddq->d_flags, ARCH_CONVERT) &
2610			(XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP);
2611	ASSERT(type);
2612	if (log->l_quotaoffs_flag & type)
2613		return (0);
2614
2615	/*
2616	 * At this point we know that quota was _not_ turned off.
2617	 * Since the mount flags are not indicating to us otherwise, this
2618	 * must mean that quota is on, and the dquot needs to be replayed.
2619	 * Remember that we may not have fully recovered the superblock yet,
2620	 * so we can't do the usual trick of looking at the SB quota bits.
2621	 *
2622	 * The other possibility, of course, is that the quota subsystem was
2623	 * removed since the last mount - ENOSYS.
2624	 */
2625	dq_f = (xfs_dq_logformat_t *)item->ri_buf[0].i_addr;
2626	ASSERT(dq_f);
2627	if ((error = xfs_qm_dqcheck(recddq,
2628			   dq_f->qlf_id,
2629			   0, XFS_QMOPT_DOWARN,
2630			   "xlog_recover_do_dquot_trans (log copy)"))) {
2631		return XFS_ERROR(EIO);
2632	}
2633	ASSERT(dq_f->qlf_len == 1);
2634
2635	error = xfs_read_buf(mp, mp->m_ddev_targp,
2636			     dq_f->qlf_blkno,
2637			     XFS_FSB_TO_BB(mp, dq_f->qlf_len),
2638			     0, &bp);
2639	if (error) {
2640		xfs_ioerror_alert("xlog_recover_do..(read#3)", mp,
2641				  bp, dq_f->qlf_blkno);
2642		return error;
2643	}
2644	ASSERT(bp);
2645	ddq = (xfs_disk_dquot_t *)xfs_buf_offset(bp, dq_f->qlf_boffset);
2646
2647	/*
2648	 * At least the magic num portion should be on disk because this
2649	 * was among a chunk of dquots created earlier, and we did some
2650	 * minimal initialization then.
2651	 */
2652	if (xfs_qm_dqcheck(ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
2653			   "xlog_recover_do_dquot_trans")) {
2654		xfs_buf_relse(bp);
2655		return XFS_ERROR(EIO);
2656	}
2657
2658	memcpy(ddq, recddq, item->ri_buf[1].i_len);
2659
2660	ASSERT(dq_f->qlf_size == 2);
2661	ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL ||
2662	       XFS_BUF_FSPRIVATE(bp, xfs_mount_t *) == mp);
2663	XFS_BUF_SET_FSPRIVATE(bp, mp);
2664	XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
2665	xfs_bdwrite(mp, bp);
2666
2667	return (0);
2668}
2669
2670/*
2671 * This routine is called to create an in-core extent free intent
2672 * item from the efi format structure which was logged on disk.
2673 * It allocates an in-core efi, copies the extents from the format
2674 * structure into it, and adds the efi to the AIL with the given
2675 * LSN.
2676 */
2677STATIC void
2678xlog_recover_do_efi_trans(
2679	xlog_t			*log,
2680	xlog_recover_item_t	*item,
2681	xfs_lsn_t		lsn,
2682	int			pass)
2683{
2684	xfs_mount_t		*mp;
2685	xfs_efi_log_item_t	*efip;
2686	xfs_efi_log_format_t	*efi_formatp;
2687	SPLDECL(s);
2688
2689	if (pass == XLOG_RECOVER_PASS1) {
2690		return;
2691	}
2692
2693	efi_formatp = (xfs_efi_log_format_t *)item->ri_buf[0].i_addr;
2694	ASSERT(item->ri_buf[0].i_len ==
2695	       (sizeof(xfs_efi_log_format_t) +
2696		((efi_formatp->efi_nextents - 1) * sizeof(xfs_extent_t))));
2697
2698	mp = log->l_mp;
2699	efip = xfs_efi_init(mp, efi_formatp->efi_nextents);
2700	memcpy((char *)&(efip->efi_format), (char *)efi_formatp,
2701	      sizeof(xfs_efi_log_format_t) +
2702	      ((efi_formatp->efi_nextents - 1) * sizeof(xfs_extent_t)));
2703	efip->efi_next_extent = efi_formatp->efi_nextents;
2704	efip->efi_flags |= XFS_EFI_COMMITTED;
2705
2706	AIL_LOCK(mp,s);
2707	/*
2708	 * xfs_trans_update_ail() drops the AIL lock.
2709	 */
2710	xfs_trans_update_ail(mp, (xfs_log_item_t *)efip, lsn, s);
2711}
2712
2713
2714/*
2715 * This routine is called when an efd format structure is found in
 * a committed transaction in the log.  Its purpose is to cancel
2717 * the corresponding efi if it was still in the log.  To do this
2718 * it searches the AIL for the efi with an id equal to that in the
2719 * efd format structure.  If we find it, we remove the efi from the
2720 * AIL and free it.
2721 */
2722STATIC void
2723xlog_recover_do_efd_trans(
2724	xlog_t			*log,
2725	xlog_recover_item_t	*item,
2726	int			pass)
2727{
2728	xfs_mount_t		*mp;
2729	xfs_efd_log_format_t	*efd_formatp;
2730	xfs_efi_log_item_t	*efip = NULL;
2731	xfs_log_item_t		*lip;
2732	int			gen;
2733	__uint64_t		efi_id;
2734	SPLDECL(s);
2735
2736	if (pass == XLOG_RECOVER_PASS1) {
2737		return;
2738	}
2739
2740	efd_formatp = (xfs_efd_log_format_t *)item->ri_buf[0].i_addr;
2741	ASSERT(item->ri_buf[0].i_len ==
2742	       (sizeof(xfs_efd_log_format_t) +
2743		((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_t))));
2744	efi_id = efd_formatp->efd_efi_id;
2745
2746	/*
2747	 * Search for the efi with the id in the efd format structure
2748	 * in the AIL.
2749	 */
2750	mp = log->l_mp;
2751	AIL_LOCK(mp,s);
2752	lip = xfs_trans_first_ail(mp, &gen);
2753	while (lip != NULL) {
2754		if (lip->li_type == XFS_LI_EFI) {
2755			efip = (xfs_efi_log_item_t *)lip;
2756			if (efip->efi_format.efi_id == efi_id) {
2757				/*
2758				 * xfs_trans_delete_ail() drops the
2759				 * AIL lock.
2760				 */
2761				xfs_trans_delete_ail(mp, lip, s);
2762				break;
2763			}
2764		}
2765		lip = xfs_trans_next_ail(mp, lip, &gen, NULL);
2766	}
2767
2768	/*
2769	 * If we found it, then free it up.  If it wasn't there, it
2770	 * must have been overwritten in the log.  Oh well.
2771	 */
2772	if (lip != NULL) {
2773		xfs_efi_item_free(efip);
2774	} else {
2775		AIL_UNLOCK(mp, s);
2776	}
2777}
2778
2779/*
2780 * Perform the transaction
2781 *
2782 * If the transaction modifies a buffer or inode, do it now.  Otherwise,
2783 * EFIs and EFDs get queued up by adding entries into the AIL for them.
2784 */
2785STATIC int
2786xlog_recover_do_trans(
2787	xlog_t			*log,
2788	xlog_recover_t		*trans,
2789	int			pass)
2790{
2791	int			error = 0;
2792	xlog_recover_item_t	*item, *first_item;
2793
2794	if ((error = xlog_recover_reorder_trans(log, trans)))
2795		return error;
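	/*
	 * The item queue is circular; walk it exactly once starting
	 * from the head.
	 */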
2796	first_item = item = trans->r_itemq;
2797	do {
2798		/*
2799		 * we don't need to worry about the block number being
2800		 * truncated in > 1 TB buffers because in user-land,
		 * we're now n32 or 64-bit, so xfs_daddr_t is 64 bits and
2802		 * the blknos will get through the user-mode buffer
2803		 * cache properly.  The only bad case is o32 kernels
2804		 * where xfs_daddr_t is 32-bits but mount will warn us
2805		 * off a > 1 TB filesystem before we get here.
2806		 */
2807		if ((ITEM_TYPE(item) == XFS_LI_BUF) ||
2808		    (ITEM_TYPE(item) == XFS_LI_6_1_BUF) ||
2809		    (ITEM_TYPE(item) == XFS_LI_5_3_BUF)) {
2810			if  ((error = xlog_recover_do_buffer_trans(log, item,
2811								 pass)))
2812				break;
2813		} else if ((ITEM_TYPE(item) == XFS_LI_INODE) ||
2814			   (ITEM_TYPE(item) == XFS_LI_6_1_INODE) ||
2815			   (ITEM_TYPE(item) == XFS_LI_5_3_INODE)) {
2816			if ((error = xlog_recover_do_inode_trans(log, item,
2817								pass)))
2818				break;
2819		} else if (ITEM_TYPE(item) == XFS_LI_EFI) {
2820			xlog_recover_do_efi_trans(log, item, trans->r_lsn,
2821						  pass);
2822		} else if (ITEM_TYPE(item) == XFS_LI_EFD) {
2823			xlog_recover_do_efd_trans(log, item, pass);
2824		} else if (ITEM_TYPE(item) == XFS_LI_DQUOT) {
2825			if ((error = xlog_recover_do_dquot_trans(log, item,
2826								   pass)))
2827					break;
2828		} else if ((ITEM_TYPE(item) == XFS_LI_QUOTAOFF)) {
2829			if ((error = xlog_recover_do_quotaoff_trans(log, item,
2830								   pass)))
2831					break;
2832		} else {
2833			xlog_warn("XFS: xlog_recover_do_trans");
2834			ASSERT(0);
2835			error = XFS_ERROR(EIO);
2836			break;
2837		}
2838		item = item->ri_next;
2839	} while (first_item != item);
2840
2841	return error;
2842}
2843
2844/*
2845 * Free up any resources allocated by the transaction
2846 *
2847 * Remember that EFIs, EFDs, and IUNLINKs are handled later.
2848 */
2849STATIC void
2850xlog_recover_free_trans(
2851	xlog_recover_t		*trans)
2852{
2853	xlog_recover_item_t	*first_item, *item, *free_item;
2854	int			i;
2855
2856	item = first_item = trans->r_itemq;
2857	do {
2858		free_item = item;
2859		item = item->ri_next;
2860		 /* Free the regions in the item. */
2861		for (i = 0; i < free_item->ri_cnt; i++) {
2862			kmem_free(free_item->ri_buf[i].i_addr,
2863				  free_item->ri_buf[i].i_len);
2864		}
2865		/* Free the item itself */
2866		kmem_free(free_item->ri_buf,
2867			  (free_item->ri_total * sizeof(xfs_log_iovec_t)));
2868		kmem_free(free_item, sizeof(xlog_recover_item_t));
2869	} while (first_item != item);
2870	/* Free the transaction recover structure */
2871	kmem_free(trans, sizeof(xlog_recover_t));
2872}
2873
2874STATIC int
2875xlog_recover_commit_trans(
2876	xlog_t			*log,
2877	xlog_recover_t		**q,
2878	xlog_recover_t		*trans,
2879	int			pass)
2880{
2881	int			error;
2882
2883	if ((error = xlog_recover_unlink_tid(q, trans)))
2884		return error;
2885	if ((error = xlog_recover_do_trans(log, trans, pass)))
2886		return error;
2887	xlog_recover_free_trans(trans);			/* no error */
2888	return 0;
2889}
2890
2891STATIC int
2892xlog_recover_unmount_trans(
2893	xlog_recover_t		*trans)
2894{
2895	/* Do nothing now */
2896	xlog_warn("XFS: xlog_recover_unmount_trans: Unmount LR");
2897	return 0;
2898}
2899
2900/*
2901 * There are two valid states of the r_state field.  0 indicates that the
2902 * transaction structure is in a normal state.  We have either seen the
2903 * start of the transaction or the last operation we added was not a partial
2904 * operation.  If the last operation we added to the transaction was a
2905 * partial operation, we need to mark r_state with XLOG_WAS_CONT_TRANS.
2906 *
2907 * NOTE: skip LRs with 0 data length.
2908 */
2909STATIC int
2910xlog_recover_process_data(
2911	xlog_t			*log,
2912	xlog_recover_t		*rhash[],
2913	xlog_rec_header_t	*rhead,
2914	xfs_caddr_t		dp,
2915	int			pass)
2916{
2917	xfs_caddr_t		lp;
2918	int			num_logops;
2919	xlog_op_header_t	*ohead;
2920	xlog_recover_t		*trans;
2921	xlog_tid_t		tid;
2922	int			error;
2923	unsigned long		hash;
2924	uint			flags;
2925
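	/*
	 * lp marks the end of this record's data; process op headers
	 * until we reach it or run out of log operations.
	 */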
2926	lp = dp + INT_GET(rhead->h_len, ARCH_CONVERT);
2927	num_logops = INT_GET(rhead->h_num_logops, ARCH_CONVERT);
2928
2929	/* check the log format matches our own - else we can't recover */
2930	if (xlog_header_check_recover(log->l_mp, rhead))
2931		return (XFS_ERROR(EIO));
2932
2933	while ((dp < lp) && num_logops) {
2934		ASSERT(dp + sizeof(xlog_op_header_t) <= lp);
2935		ohead = (xlog_op_header_t *)dp;
2936		dp += sizeof(xlog_op_header_t);
2937		if (ohead->oh_clientid != XFS_TRANSACTION &&
2938		    ohead->oh_clientid != XFS_LOG) {
2939			xlog_warn(
2940		"XFS: xlog_recover_process_data: bad clientid");
2941			ASSERT(0);
2942			return (XFS_ERROR(EIO));
2943		}
2944		tid = INT_GET(ohead->oh_tid, ARCH_CONVERT);
2945		hash = XLOG_RHASH(tid);
2946		trans = xlog_recover_find_tid(rhash[hash], tid);
2947		if (trans == NULL) {		   /* not found; add new tid */
2948			if (ohead->oh_flags & XLOG_START_TRANS)
2949				xlog_recover_new_tid(&rhash[hash], tid,
2950					INT_GET(rhead->h_lsn, ARCH_CONVERT));
2951		} else {
2952			ASSERT(dp+INT_GET(ohead->oh_len, ARCH_CONVERT) <= lp);
2953			flags = ohead->oh_flags & ~XLOG_END_TRANS;
2954			if (flags & XLOG_WAS_CONT_TRANS)
2955				flags &= ~XLOG_CONTINUE_TRANS;
2956			switch (flags) {
2957			case XLOG_COMMIT_TRANS:
2958				error = xlog_recover_commit_trans(log,
2959						&rhash[hash], trans, pass);
2960				break;
2961			case XLOG_UNMOUNT_TRANS:
2962				error = xlog_recover_unmount_trans(trans);
2963				break;
2964			case XLOG_WAS_CONT_TRANS:
2965				error = xlog_recover_add_to_cont_trans(trans,
2966						dp, INT_GET(ohead->oh_len,
2967							ARCH_CONVERT));
2968				break;
2969			case XLOG_START_TRANS:
2970				xlog_warn(
2971			"XFS: xlog_recover_process_data: bad transaction");
2972				ASSERT(0);
2973				error = XFS_ERROR(EIO);
2974				break;
2975			case 0:
2976			case XLOG_CONTINUE_TRANS:
2977				error = xlog_recover_add_to_trans(trans,
2978						dp, INT_GET(ohead->oh_len,
2979							ARCH_CONVERT));
2980				break;
2981			default:
2982				xlog_warn(
2983			"XFS: xlog_recover_process_data: bad flag");
2984				ASSERT(0);
2985				error = XFS_ERROR(EIO);
2986				break;
2987			}
2988			if (error)
2989				return error;
2990		}
2991		dp += INT_GET(ohead->oh_len, ARCH_CONVERT);
2992		num_logops--;
2993	}
2994	return 0;
2995}
2996
2997/*
2998 * Process an extent free intent item that was recovered from
2999 * the log.  We need to free the extents that it describes.
3000 */
3001STATIC void
3002xlog_recover_process_efi(
3003	xfs_mount_t		*mp,
3004	xfs_efi_log_item_t	*efip)
3005{
3006	xfs_efd_log_item_t	*efdp;
3007	xfs_trans_t		*tp;
3008	int			i;
3009	xfs_extent_t		*extp;
3010	xfs_fsblock_t		startblock_fsb;
3011
3012	ASSERT(!(efip->efi_flags & XFS_EFI_RECOVERED));
3013
3014	/*
3015	 * First check the validity of the extents described by the
3016	 * EFI.  If any are bad, then assume that all are bad and
3017	 * just toss the EFI.
3018	 */
3019	for (i = 0; i < efip->efi_format.efi_nextents; i++) {
3020		extp = &(efip->efi_format.efi_extents[i]);
3021		startblock_fsb = XFS_BB_TO_FSB(mp,
3022				   XFS_FSB_TO_DADDR(mp, extp->ext_start));
3023		if ((startblock_fsb == 0) ||
3024		    (extp->ext_len == 0) ||
3025		    (startblock_fsb >= mp->m_sb.sb_dblocks) ||
3026		    (extp->ext_len >= mp->m_sb.sb_agblocks)) {
3027			/*
3028			 * This will pull the EFI from the AIL and
3029			 * free the memory associated with it.
3030			 */
3031			xfs_efi_release(efip, efip->efi_format.efi_nextents);
3032			return;
3033		}
3034	}
3035
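	/*
	 * Free each extent in a regular transaction, logging an EFD so
	 * that the EFI is marked complete and will not be processed again.
	 */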
3036	tp = xfs_trans_alloc(mp, 0);
3037	xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, 0, 0);
3038	efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents);
3039
3040	for (i = 0; i < efip->efi_format.efi_nextents; i++) {
3041		extp = &(efip->efi_format.efi_extents[i]);
3042		xfs_free_extent(tp, extp->ext_start, extp->ext_len);
3043		xfs_trans_log_efd_extent(tp, efdp, extp->ext_start,
3044					 extp->ext_len);
3045	}
3046
3047	efip->efi_flags |= XFS_EFI_RECOVERED;
3048	xfs_trans_commit(tp, 0, NULL);
3049}
3050
3051/*
3052 * Verify that once we've encountered something other than an EFI
 * in the AIL, there are no more EFIs in the AIL.
3054 */
3055#if defined(DEBUG)
3056STATIC void
3057xlog_recover_check_ail(
3058	xfs_mount_t		*mp,
3059	xfs_log_item_t		*lip,
3060	int			gen)
3061{
3062	int			orig_gen = gen;
3063
3064	do {
3065		ASSERT(lip->li_type != XFS_LI_EFI);
3066		lip = xfs_trans_next_ail(mp, lip, &gen, NULL);
3067		/*
3068		 * The check will be bogus if we restart from the
3069		 * beginning of the AIL, so ASSERT that we don't.
3070		 * We never should since we're holding the AIL lock
3071		 * the entire time.
3072		 */
3073		ASSERT(gen == orig_gen);
3074	} while (lip != NULL);
3075}
3076#endif	/* DEBUG */
3077
3078/*
3079 * When this is called, all of the EFIs which did not have
3080 * corresponding EFDs should be in the AIL.  What we do now
3081 * is free the extents associated with each one.
3082 *
3083 * Since we process the EFIs in normal transactions, they
3084 * will be removed at some point after the commit.  This prevents
3085 * us from just walking down the list processing each one.
3086 * We'll use a flag in the EFI to skip those that we've already
3087 * processed and use the AIL iteration mechanism's generation
3088 * count to try to speed this up at least a bit.
3089 *
3090 * When we start, we know that the EFIs are the only things in
3091 * the AIL.  As we process them, however, other items are added
3092 * to the AIL.  Since everything added to the AIL must come after
3093 * everything already in the AIL, we stop processing as soon as
3094 * we see something other than an EFI in the AIL.
3095 */
3096STATIC void
3097xlog_recover_process_efis(
3098	xlog_t			*log)
3099{
3100	xfs_log_item_t		*lip;
3101	xfs_efi_log_item_t	*efip;
3102	int			gen;
3103	xfs_mount_t		*mp;
3104	SPLDECL(s);
3105
3106	mp = log->l_mp;
3107	AIL_LOCK(mp,s);
3108
3109	lip = xfs_trans_first_ail(mp, &gen);
3110	while (lip != NULL) {
3111		/*
3112		 * We're done when we see something other than an EFI.
3113		 */
3114		if (lip->li_type != XFS_LI_EFI) {
3115			xlog_recover_check_ail(mp, lip, gen);
3116			break;
3117		}
3118
3119		/*
3120		 * Skip EFIs that we've already processed.
3121		 */
3122		efip = (xfs_efi_log_item_t *)lip;
3123		if (efip->efi_flags & XFS_EFI_RECOVERED) {
3124			lip = xfs_trans_next_ail(mp, lip, &gen, NULL);
3125			continue;
3126		}
3127
3128		AIL_UNLOCK(mp, s);
3129		xlog_recover_process_efi(mp, efip);
3130		AIL_LOCK(mp,s);
3131		lip = xfs_trans_next_ail(mp, lip, &gen, NULL);
3132	}
3133	AIL_UNLOCK(mp, s);
3134}
3135
3136/*
3137 * This routine performs a transaction to null out a bad inode pointer
3138 * in an agi unlinked inode hash bucket.
3139 */
3140STATIC void
3141xlog_recover_clear_agi_bucket(
3142	xfs_mount_t	*mp,
3143	xfs_agnumber_t	agno,
3144	int		bucket)
3145{
3146	xfs_trans_t	*tp;
3147	xfs_agi_t	*agi;
3148	xfs_buf_t	*agibp;
3149	int		offset;
3150	int		error;
3151
3152	tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET);
3153	xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp), 0, 0, 0);
3154
3155	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
3156				   XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
3157				   XFS_FSS_TO_BB(mp, 1), 0, &agibp);
3158	if (error) {
3159		xfs_trans_cancel(tp, XFS_TRANS_ABORT);
3160		return;
3161	}
3162
3163	agi = XFS_BUF_TO_AGI(agibp);
3164	if (be32_to_cpu(agi->agi_magicnum) != XFS_AGI_MAGIC) {
3165		xfs_trans_cancel(tp, XFS_TRANS_ABORT);
3166		return;
3167	}
3168
3169	agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
3170	offset = offsetof(xfs_agi_t, agi_unlinked) +
3171		 (sizeof(xfs_agino_t) * bucket);
3172	xfs_trans_log_buf(tp, agibp, offset,
3173			  (offset + sizeof(xfs_agino_t) - 1));
3174
3175	(void) xfs_trans_commit(tp, 0, NULL);
3176}
3177
3178/*
 * xlog_recover_process_iunlinks
3180 *
3181 * This is called during recovery to process any inodes which
3182 * we unlinked but not freed when the system crashed.  These
 * we unlinked but did not free when the system crashed.  These
3184 * here is scan all the AGIs and fully truncate and free any
3185 * inodes found on the lists.  Each inode is removed from the
3186 * lists when it has been fully truncated and is freed.  The
3187 * freeing of the inode and its removal from the list must be
3188 * atomic.
3189 */
3190void
3191xlog_recover_process_iunlinks(
3192	xlog_t		*log)
3193{
3194	xfs_mount_t	*mp;
3195	xfs_agnumber_t	agno;
3196	xfs_agi_t	*agi;
3197	xfs_buf_t	*agibp;
3198	xfs_buf_t	*ibp;
3199	xfs_dinode_t	*dip;
3200	xfs_inode_t	*ip;
3201	xfs_agino_t	agino;
3202	xfs_ino_t	ino;
3203	int		bucket;
3204	int		error;
3205	uint		mp_dmevmask;
3206
3207	mp = log->l_mp;
3208
3209	/*
3210	 * Prevent any DMAPI event from being sent while in this function.
3211	 */
3212	mp_dmevmask = mp->m_dmevmask;
3213	mp->m_dmevmask = 0;
3214
3215	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
3216		/*
3217		 * Find the agi for this ag.
3218		 */
3219		agibp = xfs_buf_read(mp->m_ddev_targp,
3220				XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
3221				XFS_FSS_TO_BB(mp, 1), 0);
3222		if (XFS_BUF_ISERROR(agibp)) {
3223			xfs_ioerror_alert("xlog_recover_process_iunlinks(#1)",
3224				log->l_mp, agibp,
3225				XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)));
3226		}
3227		agi = XFS_BUF_TO_AGI(agibp);
3228		ASSERT(XFS_AGI_MAGIC == be32_to_cpu(agi->agi_magicnum));
3229
3230		for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) {
3231
3232			agino = be32_to_cpu(agi->agi_unlinked[bucket]);
3233			while (agino != NULLAGINO) {
3234
3235				/*
3236				 * Release the agi buffer so that it can
3237				 * be acquired in the normal course of the
3238				 * transaction to truncate and free the inode.
3239				 */
3240				xfs_buf_relse(agibp);
3241
3242				ino = XFS_AGINO_TO_INO(mp, agno, agino);
3243				error = xfs_iget(mp, NULL, ino, 0, 0, &ip, 0);
3244				ASSERT(error || (ip != NULL));
3245
3246				if (!error) {
3247					/*
3248					 * Get the on disk inode to find the
3249					 * next inode in the bucket.
3250					 */
3251					error = xfs_itobp(mp, NULL, ip, &dip,
3252							&ibp, 0, 0);
3253					ASSERT(error || (dip != NULL));
3254				}
3255
3256				if (!error) {
3257					ASSERT(ip->i_d.di_nlink == 0);
3258
3259					/* setup for the next pass */
3260					agino = INT_GET(dip->di_next_unlinked,
3261							ARCH_CONVERT);
3262					xfs_buf_relse(ibp);
3263					/*
3264					 * Prevent any DMAPI event from
3265					 * being sent when the
3266					 * reference on the inode is
3267					 * dropped.
3268					 */
3269					ip->i_d.di_dmevmask = 0;
3270
3271					/*
3272					 * If this is a new inode, handle
3273					 * it specially.  Otherwise,
3274					 * just drop our reference to the
3275					 * inode.  If there are no
3276					 * other references, this will
3277					 * send the inode to
3278					 * xfs_inactive() which will
3279					 * truncate the file and free
3280					 * the inode.
3281					 */
3282					if (ip->i_d.di_mode == 0)
3283						xfs_iput_new(ip, 0);
3284					else
3285						VN_RELE(XFS_ITOV(ip));
3286				} else {
3287					/*
3288					 * We can't read in the inode
3289					 * this bucket points to, or
3290					 * this inode is messed up.  Just
3291					 * ditch this bucket of inodes.  We
3292					 * will lose some inodes and space,
3293					 * but at least we won't hang.  Call
3294					 * xlog_recover_clear_agi_bucket()
3295					 * to perform a transaction to clear
3296					 * the inode pointer in the bucket.
3297					 */
3298					xlog_recover_clear_agi_bucket(mp, agno,
3299							bucket);
3300
3301					agino = NULLAGINO;
3302				}
3303
3304				/*
				 * Reacquire the agi buffer and continue around
3306				 * the loop.
3307				 */
3308				agibp = xfs_buf_read(mp->m_ddev_targp,
3309						XFS_AG_DADDR(mp, agno,
3310							XFS_AGI_DADDR(mp)),
3311						XFS_FSS_TO_BB(mp, 1), 0);
3312				if (XFS_BUF_ISERROR(agibp)) {
3313					xfs_ioerror_alert(
3314				"xlog_recover_process_iunlinks(#2)",
3315						log->l_mp, agibp,
3316						XFS_AG_DADDR(mp, agno,
3317							XFS_AGI_DADDR(mp)));
3318				}
3319				agi = XFS_BUF_TO_AGI(agibp);
3320				ASSERT(XFS_AGI_MAGIC == be32_to_cpu(
3321					agi->agi_magicnum));
3322			}
3323		}
3324
3325		/*
3326		 * Release the buffer for the current agi so we can
3327		 * go on to the next one.
3328		 */
3329		xfs_buf_relse(agibp);
3330	}
3331
3332	mp->m_dmevmask = mp_dmevmask;
3333}
3334
3335
3336#ifdef DEBUG
3337STATIC void
3338xlog_pack_data_checksum(
3339	xlog_t		*log,
3340	xlog_in_core_t	*iclog,
3341	int		size)
3342{
3343	int		i;
3344	uint		*up;
3345	uint		chksum = 0;
3346
3347	up = (uint *)iclog->ic_datap;
3348	/* divide length by 4 to get # words */
3349	for (i = 0; i < (size >> 2); i++) {
3350		chksum ^= INT_GET(*up, ARCH_CONVERT);
3351		up++;
3352	}
3353	INT_SET(iclog->ic_header.h_chksum, ARCH_CONVERT, chksum);
3354}
3355#else
3356#define xlog_pack_data_checksum(log, iclog, size)
3357#endif
3358
3359/*
3360 * Stamp cycle number in every block
3361 */
3362void
3363xlog_pack_data(
3364	xlog_t			*log,
3365	xlog_in_core_t		*iclog,
3366	int			roundoff)
3367{
3368	int			i, j, k;
3369	int			size = iclog->ic_offset + roundoff;
3370	uint			cycle_lsn;
3371	xfs_caddr_t		dp;
3372	xlog_in_core_2_t	*xhdr;
3373
3374	xlog_pack_data_checksum(log, iclog, size);
3375
3376	cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn);
3377
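	/*
	 * Save the first word of each block in the record header and
	 * replace it with the cycle number; xlog_unpack_data() undoes
	 * this during recovery.
	 */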
3378	dp = iclog->ic_datap;
3379	for (i = 0; i < BTOBB(size) &&
3380		i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
3381		iclog->ic_header.h_cycle_data[i] = *(uint *)dp;
3382		*(uint *)dp = cycle_lsn;
3383		dp += BBSIZE;
3384	}
3385
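	/*
	 * Version 2 logs can have records larger than a single header
	 * can describe; the overflow cycle data goes into the extended
	 * headers.
	 */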
3386	if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) {
3387		xhdr = (xlog_in_core_2_t *)&iclog->ic_header;
3388		for ( ; i < BTOBB(size); i++) {
3389			j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3390			k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3391			xhdr[j].hic_xheader.xh_cycle_data[k] = *(uint *)dp;
3392			*(uint *)dp = cycle_lsn;
3393			dp += BBSIZE;
3394		}
3395
3396		for (i = 1; i < log->l_iclog_heads; i++) {
3397			xhdr[i].hic_xheader.xh_cycle = cycle_lsn;
3398		}
3399	}
3400}
3401
3402#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
3403STATIC void
3404xlog_unpack_data_checksum(
3405	xlog_rec_header_t	*rhead,
3406	xfs_caddr_t		dp,
3407	xlog_t			*log)
3408{
3409	uint			*up = (uint *)dp;
3410	uint			chksum = 0;
3411	int			i;
3412
3413	/* divide length by 4 to get # words */
3414	for (i=0; i < INT_GET(rhead->h_len, ARCH_CONVERT) >> 2; i++) {
3415		chksum ^= INT_GET(*up, ARCH_CONVERT);
3416		up++;
3417	}
3418	if (chksum != INT_GET(rhead->h_chksum, ARCH_CONVERT)) {
3419	    if (rhead->h_chksum ||
3420		((log->l_flags & XLOG_CHKSUM_MISMATCH) == 0)) {
3421		    cmn_err(CE_DEBUG,
3422			"XFS: LogR chksum mismatch: was (0x%x) is (0x%x)",
3423			    INT_GET(rhead->h_chksum, ARCH_CONVERT), chksum);
3424		    cmn_err(CE_DEBUG,
3425"XFS: Disregard message if filesystem was created with non-DEBUG kernel");
3426		    if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) {
3427			    cmn_err(CE_DEBUG,
3428				"XFS: LogR this is a LogV2 filesystem");
3429		    }
3430		    log->l_flags |= XLOG_CHKSUM_MISMATCH;
3431	    }
3432	}
3433}
3434#else
3435#define xlog_unpack_data_checksum(rhead, dp, log)
3436#endif
3437
3438STATIC void
3439xlog_unpack_data(
3440	xlog_rec_header_t	*rhead,
3441	xfs_caddr_t		dp,
3442	xlog_t			*log)
3443{
3444	int			i, j, k;
3445	xlog_in_core_2_t	*xhdr;
3446
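	/*
	 * Restore the data words that xlog_pack_data() replaced with
	 * the cycle number.
	 */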
3447	for (i = 0; i < BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT)) &&
3448		  i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
3449		*(uint *)dp = *(uint *)&rhead->h_cycle_data[i];
3450		dp += BBSIZE;
3451	}
3452
3453	if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) {
3454		xhdr = (xlog_in_core_2_t *)rhead;
3455		for ( ; i < BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT)); i++) {
3456			j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3457			k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3458			*(uint *)dp = xhdr[j].hic_xheader.xh_cycle_data[k];
3459			dp += BBSIZE;
3460		}
3461	}
3462
3463	xlog_unpack_data_checksum(rhead, dp, log);
3464}
3465
3466STATIC int
3467xlog_valid_rec_header(
3468	xlog_t			*log,
3469	xlog_rec_header_t	*rhead,
3470	xfs_daddr_t		blkno)
3471{
3472	int			hlen;
3473
3474	if (unlikely(
3475	    (INT_GET(rhead->h_magicno, ARCH_CONVERT) !=
3476			XLOG_HEADER_MAGIC_NUM))) {
3477		XFS_ERROR_REPORT("xlog_valid_rec_header(1)",
3478				XFS_ERRLEVEL_LOW, log->l_mp);
3479		return XFS_ERROR(EFSCORRUPTED);
3480	}
3481	if (unlikely(
3482	    (!rhead->h_version ||
3483	    (INT_GET(rhead->h_version, ARCH_CONVERT) &
3484			(~XLOG_VERSION_OKBITS)) != 0))) {
3485		xlog_warn("XFS: %s: unrecognised log version (%d).",
3486			__FUNCTION__, INT_GET(rhead->h_version, ARCH_CONVERT));
3487		return XFS_ERROR(EIO);
3488	}
3489
3490	/* LR body must have data or it wouldn't have been written */
3491	hlen = INT_GET(rhead->h_len, ARCH_CONVERT);
3492	if (unlikely( hlen <= 0 || hlen > INT_MAX )) {
3493		XFS_ERROR_REPORT("xlog_valid_rec_header(2)",
3494				XFS_ERRLEVEL_LOW, log->l_mp);
3495		return XFS_ERROR(EFSCORRUPTED);
3496	}
3497	if (unlikely( blkno > log->l_logBBsize || blkno > INT_MAX )) {
3498		XFS_ERROR_REPORT("xlog_valid_rec_header(3)",
3499				XFS_ERRLEVEL_LOW, log->l_mp);
3500		return XFS_ERROR(EFSCORRUPTED);
3501	}
3502	return 0;
3503}
3504
3505/*
3506 * Read the log from tail to head and process the log records found.
3507 * Handle the two cases where the tail and head are in the same cycle
3508 * and where the active portion of the log wraps around the end of
3509 * the physical log separately.  The pass parameter is passed through
3510 * to the routines called to process the data and is not looked at
3511 * here.
3512 */
3513STATIC int
3514xlog_do_recovery_pass(
3515	xlog_t			*log,
3516	xfs_daddr_t		head_blk,
3517	xfs_daddr_t		tail_blk,
3518	int			pass)
3519{
3520	xlog_rec_header_t	*rhead;
3521	xfs_daddr_t		blk_no;
3522	xfs_caddr_t		bufaddr, offset;
3523	xfs_buf_t		*hbp, *dbp;
3524	int			error = 0, h_size;
3525	int			bblks, split_bblks;
3526	int			hblks, split_hblks, wrapped_hblks;
3527	xlog_recover_t		*rhash[XLOG_RHASH_SIZE];
3528
3529	ASSERT(head_blk != tail_blk);
3530
3531	/*
3532	 * Read the header of the tail block and get the iclog buffer size from
3533	 * h_size.  Use this to tell how many sectors make up the log header.
3534	 */
3535	if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) {
3536		/*
3537		 * When using variable length iclogs, read first sector of
3538		 * iclog header and extract the header size from it.  Get a
3539		 * new hbp that is the correct size.
3540		 */
3541		hbp = xlog_get_bp(log, 1);
3542		if (!hbp)
3543			return ENOMEM;
3544		if ((error = xlog_bread(log, tail_blk, 1, hbp)))
3545			goto bread_err1;
3546		offset = xlog_align(log, tail_blk, 1, hbp);
3547		rhead = (xlog_rec_header_t *)offset;
3548		error = xlog_valid_rec_header(log, rhead, tail_blk);
3549		if (error)
3550			goto bread_err1;
3551		h_size = INT_GET(rhead->h_size, ARCH_CONVERT);
3552		if ((INT_GET(rhead->h_version, ARCH_CONVERT)
3553				& XLOG_VERSION_2) &&
3554		    (h_size > XLOG_HEADER_CYCLE_SIZE)) {
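			/*
			 * Each header block describes up to
			 * XLOG_HEADER_CYCLE_SIZE bytes of record data,
			 * so round up to get the header size in blocks.
			 */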
3555			hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
3556			if (h_size % XLOG_HEADER_CYCLE_SIZE)
3557				hblks++;
3558			xlog_put_bp(hbp);
3559			hbp = xlog_get_bp(log, hblks);
3560		} else {
3561			hblks = 1;
3562		}
3563	} else {
3564		ASSERT(log->l_sectbb_log == 0);
3565		hblks = 1;
3566		hbp = xlog_get_bp(log, 1);
3567		h_size = XLOG_BIG_RECORD_BSIZE;
3568	}
3569
3570	if (!hbp)
3571		return ENOMEM;
3572	dbp = xlog_get_bp(log, BTOBB(h_size));
3573	if (!dbp) {
3574		xlog_put_bp(hbp);
3575		return ENOMEM;
3576	}
3577
3578	memset(rhash, 0, sizeof(rhash));
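	/*
	 * Simple case first: the active region does not wrap the end
	 * of the log, so process records sequentially from tail to
	 * head.
	 */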
	if (tail_blk <= head_blk) {
		for (blk_no = tail_blk; blk_no < head_blk; ) {
			if ((error = xlog_bread(log, blk_no, hblks, hbp)))
				goto bread_err2;
			offset = xlog_align(log, blk_no, hblks, hbp);
			rhead = (xlog_rec_header_t *)offset;
			error = xlog_valid_rec_header(log, rhead, blk_no);
			if (error)
				goto bread_err2;

			/* blocks in data section */
			bblks = (int)BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT));
			error = xlog_bread(log, blk_no + hblks, bblks, dbp);
			if (error)
				goto bread_err2;
			offset = xlog_align(log, blk_no + hblks, bblks, dbp);
			xlog_unpack_data(rhead, offset, log);
			if ((error = xlog_recover_process_data(log,
						rhash, rhead, offset, pass)))
				goto bread_err2;
			blk_no += bblks + hblks;
		}
	} else {
		/*
		 * Perform recovery around the end of the physical log.
		 * When the head is not on the same cycle number as the tail,
		 * we can't do a sequential recovery as above.
		 */
		blk_no = tail_blk;
		while (blk_no < log->l_logBBsize) {
			/*
			 * Check for header wrapping around physical end-of-log
			 */
			offset = NULL;
			split_hblks = 0;
			wrapped_hblks = 0;
			if (blk_no + hblks <= log->l_logBBsize) {
				/* Read header in one read */
				error = xlog_bread(log, blk_no, hblks, hbp);
				if (error)
					goto bread_err2;
				offset = xlog_align(log, blk_no, hblks, hbp);
			} else {
				/* This LR is split across physical log end */
				if (blk_no != log->l_logBBsize) {
					/* some data before physical log end */
					ASSERT(blk_no <= INT_MAX);
					split_hblks = log->l_logBBsize - (int)blk_no;
					ASSERT(split_hblks > 0);
					if ((error = xlog_bread(log, blk_no,
							split_hblks, hbp)))
						goto bread_err2;
					offset = xlog_align(log, blk_no,
							split_hblks, hbp);
				}
				/*
				 * Note: this black magic still works with
				 * large sector sizes (non-512) only because:
				 * - we increased the buffer size originally
				 *   by 1 sector giving us enough extra space
				 *   for the second read;
				 * - the log start is guaranteed to be sector
				 *   aligned;
				 * - we read the log end (LR header start)
				 *   _first_, then the log start (LR header end)
				 *   - order is important.
				 */
				bufaddr = XFS_BUF_PTR(hbp);
				XFS_BUF_SET_PTR(hbp,
						bufaddr + BBTOB(split_hblks),
						BBTOB(hblks - split_hblks));
				wrapped_hblks = hblks - split_hblks;
				error = xlog_bread(log, 0, wrapped_hblks, hbp);
				if (error)
					goto bread_err2;
				XFS_BUF_SET_PTR(hbp, bufaddr, BBTOB(hblks));
				if (!offset)
					offset = xlog_align(log, 0,
							wrapped_hblks, hbp);
			}
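			/*
			 * The record header is now contiguous in hbp;
			 * validate it against the block where it started.
			 */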
			rhead = (xlog_rec_header_t *)offset;
			error = xlog_valid_rec_header(log, rhead,
						split_hblks ? blk_no : 0);
			if (error)
				goto bread_err2;

			bblks = (int)BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT));
			blk_no += hblks;

			/* Read in data for log record */
			if (blk_no + bblks <= log->l_logBBsize) {
				error = xlog_bread(log, blk_no, bblks, dbp);
				if (error)
					goto bread_err2;
				offset = xlog_align(log, blk_no, bblks, dbp);
			} else {
				/* This log record is split across the
				 * physical end of log */
				offset = NULL;
				split_bblks = 0;
				if (blk_no != log->l_logBBsize) {
					/* some data is before the physical
					 * end of log */
					ASSERT(!wrapped_hblks);
					ASSERT(blk_no <= INT_MAX);
					split_bblks =
						log->l_logBBsize - (int)blk_no;
					ASSERT(split_bblks > 0);
					if ((error = xlog_bread(log, blk_no,
							split_bblks, dbp)))
						goto bread_err2;
					offset = xlog_align(log, blk_no,
							split_bblks, dbp);
				}
				/*
				 * Note: this black magic still works with
				 * large sector sizes (non-512) only because:
				 * - we increased the buffer size originally
				 *   by 1 sector giving us enough extra space
				 *   for the second read;
				 * - the log start is guaranteed to be sector
				 *   aligned;
				 * - we read the log end (LR header start)
				 *   _first_, then the log start (LR header end)
				 *   - order is important.
				 */
				bufaddr = XFS_BUF_PTR(dbp);
				XFS_BUF_SET_PTR(dbp,
						bufaddr + BBTOB(split_bblks),
						BBTOB(bblks - split_bblks));
				if ((error = xlog_bread(log, wrapped_hblks,
						bblks - split_bblks, dbp)))
					goto bread_err2;
				XFS_BUF_SET_PTR(dbp, bufaddr, h_size);
				if (!offset)
					offset = xlog_align(log, wrapped_hblks,
						bblks - split_bblks, dbp);
			}
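			/*
			 * The record payload is now contiguous in dbp; unpack
			 * it and hand it to the replay machinery.
			 */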
			xlog_unpack_data(rhead, offset, log);
			if ((error = xlog_recover_process_data(log, rhash,
							rhead, offset, pass)))
				goto bread_err2;
			blk_no += bblks;
		}

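		/*
		 * Everything up to the physical end of the log has been
		 * processed; wrap blk_no back to the start of the physical
		 * log and continue up to head_blk.
		 */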
		ASSERT(blk_no >= log->l_logBBsize);
		blk_no -= log->l_logBBsize;

		/* read first part of physical log */
		while (blk_no < head_blk) {
			if ((error = xlog_bread(log, blk_no, hblks, hbp)))
				goto bread_err2;
			offset = xlog_align(log, blk_no, hblks, hbp);
			rhead = (xlog_rec_header_t *)offset;
			error = xlog_valid_rec_header(log, rhead, blk_no);
			if (error)
				goto bread_err2;
			bblks = (int)BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT));
			if ((error = xlog_bread(log, blk_no+hblks, bblks, dbp)))
				goto bread_err2;
			offset = xlog_align(log, blk_no+hblks, bblks, dbp);
			xlog_unpack_data(rhead, offset, log);
			if ((error = xlog_recover_process_data(log, rhash,
							rhead, offset, pass)))
				goto bread_err2;
			blk_no += bblks + hblks;
		}
	}

 bread_err2:
	xlog_put_bp(dbp);
 bread_err1:
	xlog_put_bp(hbp);
	return error;
}

/*
 * Do the recovery of the log.  We actually do this in two phases.
 * The two passes are necessary in order to implement the function
 * of cancelling a record written into the log.  The first pass
 * determines those things which have been cancelled, and the
 * second pass replays log items normally except for those which
 * have been cancelled.  The handling of the replay and cancellations
 * takes place in the log item type specific routines.
 *
 * The table of items which have cancel records in the log is allocated
 * and freed at this level, since only here do we know when all of
 * the log recovery has been completed.
 */
STATIC int
xlog_do_log_recovery(
	xlog_t		*log,
	xfs_daddr_t	head_blk,
	xfs_daddr_t	tail_blk)
{
	int		error;

	ASSERT(head_blk != tail_blk);

	/*
	 * First do a pass to find all of the cancelled buf log items.
	 * Store them in the buf_cancel_table for use in the second pass.
	 */
	log->l_buf_cancel_table =
		(xfs_buf_cancel_t **)kmem_zalloc(XLOG_BC_TABLE_SIZE *
						 sizeof(xfs_buf_cancel_t*),
						 KM_SLEEP);
	error = xlog_do_recovery_pass(log, head_blk, tail_blk,
				      XLOG_RECOVER_PASS1);
	if (error != 0) {
		kmem_free(log->l_buf_cancel_table,
			  XLOG_BC_TABLE_SIZE * sizeof(xfs_buf_cancel_t*));
		log->l_buf_cancel_table = NULL;
		return error;
	}
	/*
	 * Then do a second pass to actually recover the items in the log.
	 * When it is complete free the table of buf cancel items.
	 */
	error = xlog_do_recovery_pass(log, head_blk, tail_blk,
				      XLOG_RECOVER_PASS2);
#ifdef DEBUG
	{
		int	i;

		for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
			ASSERT(log->l_buf_cancel_table[i] == NULL);
	}
#endif	/* DEBUG */

	kmem_free(log->l_buf_cancel_table,
		  XLOG_BC_TABLE_SIZE * sizeof(xfs_buf_cancel_t*));
	log->l_buf_cancel_table = NULL;

	return error;
}

/*
 * Do the actual recovery
 */
STATIC int
xlog_do_recover(
	xlog_t		*log,
	xfs_daddr_t	head_blk,
	xfs_daddr_t	tail_blk)
{
	int		error;
	xfs_buf_t	*bp;
	xfs_sb_t	*sbp;

	/*
	 * XXX: Disable log recovery for now, until we fix panics.
	 */
	printf("XFS log recovery disabled.\n");
	return (EOPNOTSUPP);
	/*
	 * First replay the images in the log.
	 */
	error = xlog_do_log_recovery(log, head_blk, tail_blk);
	if (error) {
		return error;
	}

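	/*
	 * Flush the buffers dirtied by replay out to the data device before
	 * checking whether any I/O errors forced a shutdown.
	 */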
	XFS_bflush(log->l_mp->m_ddev_targp);

	/*
	 * If IO errors happened during recovery, bail out.
	 */
	if (XFS_FORCED_SHUTDOWN(log->l_mp)) {
		return (EIO);
	}

	/*
	 * We now update the tail_lsn since much of the recovery has completed
	 * and there may be space available to use.  If there were no extent
	 * frees or iunlinks, we can free up the entire log and set the
	 * tail_lsn to be the last_sync_lsn.  This was set in xlog_find_tail
	 * to be the lsn of the last known good LR on disk.  If there are
	 * extent frees or iunlinks, they will have left entries in the AIL,
	 * so we look at the AIL to determine how to set the tail_lsn.
	 */
	xlog_assign_tail_lsn(log->l_mp);

	/*
	 * Now that we've finished replaying all buffer and inode
	 * updates, re-read in the superblock.
	 */
	bp = xfs_getsb(log->l_mp, 0);
	XFS_BUF_UNDONE(bp);
	XFS_BUF_READ(bp);
	xfsbdstrat(log->l_mp, bp);
	if ((error = xfs_iowait(bp))) {
		xfs_ioerror_alert("xlog_do_recover",
				  log->l_mp, bp, XFS_BUF_ADDR(bp));
		ASSERT(0);
		xfs_buf_relse(bp);
		return error;
	}

	/* Convert superblock from on-disk format */
	sbp = &log->l_mp->m_sb;
	xfs_xlatesb(XFS_BUF_TO_SBP(bp), sbp, 1, XFS_SB_ALL_BITS);
	ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC);
	ASSERT(XFS_SB_GOOD_VERSION(sbp));
	xfs_buf_relse(bp);

	xlog_recover_check_summary(log);

	/* Normal transactions can now occur */
	log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
	return 0;
}

/*
 * Perform recovery and re-initialize some log variables in xlog_find_tail.
 *
 * Return error or zero.
 */
int
xlog_recover(
	xlog_t		*log)
{
	xfs_daddr_t	head_blk, tail_blk;
	int		error;

	/* find the tail of the log */
	if ((error = xlog_find_tail(log, &head_blk, &tail_blk)))
		return error;

	if (tail_blk != head_blk) {
		/* There used to be a comment here:
		 *
		 * disallow recovery on read-only mounts.  note -- mount
		 * checks for ENOSPC and turns it into an intelligent
		 * error message.
		 * ...but this is no longer true.  Now, unless you specify
		 * NORECOVERY (in which case this function would never be
		 * called), we just go ahead and recover.  We do this all
		 * under the vfs layer, so we can get away with it unless
		 * the device itself is read-only, in which case we fail.
		 */
		if ((error = xfs_dev_is_read_only(log->l_mp,
						"recovery required"))) {
			return error;
		}

		cmn_err(CE_NOTE,
			"Starting XFS recovery on filesystem: %s (logdev: %s)",
			log->l_mp->m_fsname, log->l_mp->m_logname ?
			log->l_mp->m_logname : "internal");

		error = xlog_do_recover(log, head_blk, tail_blk);
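		/*
		 * Note that recovery ran, so xlog_recover_finish() knows to
		 * process the EFIs and unlinked inode lists later on.
		 */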
		log->l_flags |= XLOG_RECOVERY_NEEDED;
	}
	return error;
}

/*
 * In the first part of recovery we replay inodes and buffers and build
 * up the list of extent free items which need to be processed.  Here
 * we process the extent free items and clean up the on disk unlinked
 * inode lists.  This is separated from the first part of recovery so
 * that the root and real-time bitmap inodes can be read in from disk in
 * between the two stages.  This is necessary so that we can free space
 * in the real-time portion of the file system.
 */
int
xlog_recover_finish(
	xlog_t		*log,
	int		mfsi_flags)
{
	/*
	 * Now we're ready to do the transactions needed for the
	 * rest of recovery.  Start with completing all the extent
	 * free intent records and then process the unlinked inode
	 * lists.  At this point, we essentially run in normal mode
	 * except that we're still performing recovery actions
	 * rather than accepting new requests.
	 */
	if (log->l_flags & XLOG_RECOVERY_NEEDED) {
		xlog_recover_process_efis(log);
		/*
		 * Sync the log to get all the EFIs out of the AIL.
		 * This isn't absolutely necessary, but it helps in
		 * case the unlink transactions would have problems
		 * pushing the EFIs out of the way.
		 */
		xfs_log_force(log->l_mp, (xfs_lsn_t)0,
			      (XFS_LOG_FORCE | XFS_LOG_SYNC));

		if ((mfsi_flags & XFS_MFSI_NOUNLINK) == 0) {
			xlog_recover_process_iunlinks(log);
		}

		xlog_recover_check_summary(log);

		cmn_err(CE_NOTE,
			"Ending XFS recovery on filesystem: %s (logdev: %s)",
			log->l_mp->m_fsname, log->l_mp->m_logname ?
			log->l_mp->m_logname : "internal");
		log->l_flags &= ~XLOG_RECOVERY_NEEDED;
	} else {
		cmn_err(CE_DEBUG,
			"!Ending clean XFS mount for filesystem: %s",
			log->l_mp->m_fsname);
	}
	return 0;
}


#if defined(DEBUG)
/*
 * Read all of the agf and agi counters and check that they
 * are consistent with the superblock counters.
 */
void
xlog_recover_check_summary(
	xlog_t		*log)
{
	xfs_mount_t	*mp;
	xfs_agf_t	*agfp;
	xfs_agi_t	*agip;
	xfs_buf_t	*agfbp;
	xfs_buf_t	*agibp;
	xfs_daddr_t	agfdaddr;
	xfs_daddr_t	agidaddr;
	xfs_buf_t	*sbbp;
#ifdef XFS_LOUD_RECOVERY
	xfs_sb_t	*sbp;
#endif
	xfs_agnumber_t	agno;
	__uint64_t	freeblks;
	__uint64_t	itotal;
	__uint64_t	ifree;

	mp = log->l_mp;

	freeblks = 0LL;
	itotal = 0LL;
	ifree = 0LL;
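	/*
	 * Walk every allocation group, accumulating the free block counts
	 * from each AGF and the inode counts from each AGI.
	 */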
	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
		agfdaddr = XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp));
		agfbp = xfs_buf_read(mp->m_ddev_targp, agfdaddr,
				XFS_FSS_TO_BB(mp, 1), 0);
		if (XFS_BUF_ISERROR(agfbp)) {
			xfs_ioerror_alert("xlog_recover_check_summary(agf)",
						mp, agfbp, agfdaddr);
		}
		agfp = XFS_BUF_TO_AGF(agfbp);
		ASSERT(XFS_AGF_MAGIC == be32_to_cpu(agfp->agf_magicnum));
		ASSERT(XFS_AGF_GOOD_VERSION(be32_to_cpu(agfp->agf_versionnum)));
		ASSERT(be32_to_cpu(agfp->agf_seqno) == agno);

		freeblks += be32_to_cpu(agfp->agf_freeblks) +
			    be32_to_cpu(agfp->agf_flcount);
		xfs_buf_relse(agfbp);

		agidaddr = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp));
		agibp = xfs_buf_read(mp->m_ddev_targp, agidaddr,
				XFS_FSS_TO_BB(mp, 1), 0);
		if (XFS_BUF_ISERROR(agibp)) {
			xfs_ioerror_alert("xlog_recover_check_summary(agi)",
					  mp, agibp, agidaddr);
		}
		agip = XFS_BUF_TO_AGI(agibp);
		ASSERT(XFS_AGI_MAGIC == be32_to_cpu(agip->agi_magicnum));
		ASSERT(XFS_AGI_GOOD_VERSION(be32_to_cpu(agip->agi_versionnum)));
		ASSERT(be32_to_cpu(agip->agi_seqno) == agno);

		itotal += be32_to_cpu(agip->agi_count);
		ifree += be32_to_cpu(agip->agi_freecount);
		xfs_buf_relse(agibp);
	}

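	/*
	 * Read the superblock and, under XFS_LOUD_RECOVERY, report the
	 * accumulated totals alongside the superblock counters.  The strict
	 * ASSERT comparison below is currently compiled out.
	 */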
	sbbp = xfs_getsb(mp, 0);
#ifdef XFS_LOUD_RECOVERY
	sbp = &mp->m_sb;
	xfs_xlatesb(XFS_BUF_TO_SBP(sbbp), sbp, 1, XFS_SB_ALL_BITS);
	cmn_err(CE_NOTE,
		"xlog_recover_check_summary: sb_icount %Lu itotal %Lu",
		sbp->sb_icount, itotal);
	cmn_err(CE_NOTE,
		"xlog_recover_check_summary: sb_ifree %Lu ifree %Lu",
		sbp->sb_ifree, ifree);
	cmn_err(CE_NOTE,
		"xlog_recover_check_summary: sb_fdblocks %Lu freeblks %Lu",
		sbp->sb_fdblocks, freeblks);
#if 0
	/*
	 * This is turned off until I account for the allocation
	 * btree blocks which live in free space.
	 */
	ASSERT(sbp->sb_icount == itotal);
	ASSERT(sbp->sb_ifree == ifree);
	ASSERT(sbp->sb_fdblocks == freeblks);
#endif
#endif
	xfs_buf_relse(sbbp);
}
#endif /* DEBUG */
