1/*
2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_types.h"
21#include "xfs_bit.h"
22#include "xfs_log.h"
23#include "xfs_inum.h"
24#include "xfs_trans.h"
25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_dir2.h"
28#include "xfs_dmapi.h"
29#include "xfs_mount.h"
30#include "xfs_error.h"
31#include "xfs_bmap_btree.h"
32#include "xfs_alloc_btree.h"
33#include "xfs_ialloc_btree.h"
34#include "xfs_dir2_sf.h"
35#include "xfs_attr_sf.h"
36#include "xfs_dinode.h"
37#include "xfs_inode.h"
38#include "xfs_inode_item.h"
39#include "xfs_imap.h"
40#include "xfs_alloc.h"
41#include "xfs_ialloc.h"
42#include "xfs_log_priv.h"
43#include "xfs_buf_item.h"
44#include "xfs_log_recover.h"
45#include "xfs_extfree_item.h"
46#include "xfs_trans_priv.h"
47#include "xfs_quota.h"
48#include "xfs_rw.h"
49
50STATIC int	xlog_find_zeroed(xlog_t *, xfs_daddr_t *);
51STATIC int	xlog_clear_stale_blocks(xlog_t *, xfs_lsn_t);
52STATIC void	xlog_recover_insert_item_backq(xlog_recover_item_t **q,
53					       xlog_recover_item_t *item);
54#if defined(DEBUG)
55STATIC void	xlog_recover_check_summary(xlog_t *);
56STATIC void	xlog_recover_check_ail(xfs_mount_t *, xfs_log_item_t *, int);
57#else
58#define	xlog_recover_check_summary(log)
59#define	xlog_recover_check_ail(mp, lip, gen)
60#endif
61
62
63/*
64 * Sector aligned buffer routines for buffer create/read/write/access
65 */
66
67#define XLOG_SECTOR_ROUNDUP_BBCOUNT(log, bbs)	\
68	( ((log)->l_sectbb_mask && (bbs & (log)->l_sectbb_mask)) ? \
69	((bbs + (log)->l_sectbb_mask + 1) & ~(log)->l_sectbb_mask) : (bbs) )
70#define XLOG_SECTOR_ROUNDDOWN_BLKNO(log, bno)	((bno) & ~(log)->l_sectbb_mask)
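/*
 * For illustration (hypothetical numbers): on a log device with 4KiB
 * sectors, l_sectbb_log is 3 and l_sectbb_mask is 7, so
 *	XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 10) == 16
 *	XLOG_SECTOR_ROUNDDOWN_BLKNO(log, 10) == 8
 * while on a 512 byte sector device l_sectbb_mask is 0 and both macros
 * return their argument unchanged.
 */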
71
72xfs_buf_t *
73xlog_get_bp(
74	xlog_t		*log,
75	int		num_bblks)
76{
77	ASSERT(num_bblks > 0);
78
79	if (log->l_sectbb_log) {
80		if (num_bblks > 1)
81			num_bblks += XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1);
82		num_bblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, num_bblks);
83	}
84	return xfs_buf_get_noaddr(BBTOB(num_bblks), log->l_mp->m_logdev_targp);
85}
86
87void
88xlog_put_bp(
89	xfs_buf_t	*bp)
90{
91	xfs_buf_free(bp);
92}
93
94
95/*
96 * nbblks should be uint, but oh well.  Just want to catch that 32-bit length.
97 */
98int
99xlog_bread(
100	xlog_t		*log,
101	xfs_daddr_t	blk_no,
102	int		nbblks,
103	xfs_buf_t	*bp)
104{
105	int		error;
106
107	if (log->l_sectbb_log) {
108		blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no);
109		nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks);
110	}
111
112	ASSERT(nbblks > 0);
113	ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp));
114	ASSERT(bp);
115
116	XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
117	XFS_BUF_READ(bp);
118	XFS_BUF_BUSY(bp);
119	XFS_BUF_SET_COUNT(bp, BBTOB(nbblks));
120	XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp);
121
122	xfsbdstrat(log->l_mp, bp);
123	if ((error = xfs_iowait(bp)))
124		xfs_ioerror_alert("xlog_bread", log->l_mp,
125				  bp, XFS_BUF_ADDR(bp));
126	return error;
127}
128
129/*
130 * Write out the buffer at the given block for the given number of blocks.
131 * The buffer is kept locked across the write and is returned locked.
132 * This can only be used for synchronous log writes.
133 */
134STATIC int
135xlog_bwrite(
136	xlog_t		*log,
137	xfs_daddr_t	blk_no,
138	int		nbblks,
139	xfs_buf_t	*bp)
140{
141	int		error;
142
143	if (log->l_sectbb_log) {
144		blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no);
145		nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks);
146	}
147
148	ASSERT(nbblks > 0);
149	ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp));
150
151	XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
152	XFS_BUF_ZEROFLAGS(bp);
153	XFS_BUF_BUSY(bp);
154	XFS_BUF_HOLD(bp);
155	XFS_BUF_PSEMA(bp, PRIBIO);
156	XFS_BUF_SET_COUNT(bp, BBTOB(nbblks));
157	XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp);
158
159	if ((error = xfs_bwrite(log->l_mp, bp)))
160		xfs_ioerror_alert("xlog_bwrite", log->l_mp,
161				  bp, XFS_BUF_ADDR(bp));
162	return error;
163}
164
165STATIC xfs_caddr_t
166xlog_align(
167	xlog_t		*log,
168	xfs_daddr_t	blk_no,
169	int		nbblks,
170	xfs_buf_t	*bp)
171{
172	xfs_caddr_t	ptr;
173
174	if (!log->l_sectbb_log)
175		return XFS_BUF_PTR(bp);
176
177	ptr = XFS_BUF_PTR(bp) + BBTOB((int)blk_no & log->l_sectbb_mask);
178	ASSERT(XFS_BUF_SIZE(bp) >=
179		BBTOB(nbblks + (blk_no & log->l_sectbb_mask)));
180	return ptr;
181}
182
183#ifdef DEBUG
184/*
185 * dump debug superblock and log record information
186 */
187STATIC void
188xlog_header_check_dump(
189	xfs_mount_t		*mp,
190	xlog_rec_header_t	*head)
191{
192	int			b;
193
194	cmn_err(CE_DEBUG, "%s:  SB : uuid = ", __FUNCTION__);
195	for (b = 0; b < 16; b++)
196		cmn_err(CE_DEBUG, "%02x", ((uchar_t *)&mp->m_sb.sb_uuid)[b]);
197	cmn_err(CE_DEBUG, ", fmt = %d\n", XLOG_FMT);
198	cmn_err(CE_DEBUG, "    log : uuid = ");
199	for (b = 0; b < 16; b++)
200		cmn_err(CE_DEBUG, "%02x",((uchar_t *)&head->h_fs_uuid)[b]);
201	cmn_err(CE_DEBUG, ", fmt = %d\n", INT_GET(head->h_fmt, ARCH_CONVERT));
202}
203#else
204#define xlog_header_check_dump(mp, head)
205#endif
206
207/*
208 * check log record header for recovery
209 */
210STATIC int
211xlog_header_check_recover(
212	xfs_mount_t		*mp,
213	xlog_rec_header_t	*head)
214{
215	ASSERT(INT_GET(head->h_magicno, ARCH_CONVERT) == XLOG_HEADER_MAGIC_NUM);
216
217	/*
218	 * IRIX doesn't write the h_fmt field and leaves it zeroed
219	 * (XLOG_FMT_UNKNOWN). This stops us from trying to recover
220	 * a dirty log created in IRIX.
221	 */
222	if (unlikely(INT_GET(head->h_fmt, ARCH_CONVERT) != XLOG_FMT)) {
223		xlog_warn(
224	"XFS: dirty log written in incompatible format - can't recover");
225		xlog_header_check_dump(mp, head);
226		XFS_ERROR_REPORT("xlog_header_check_recover(1)",
227				 XFS_ERRLEVEL_HIGH, mp);
228		return XFS_ERROR(EFSCORRUPTED);
229	} else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
230		xlog_warn(
231	"XFS: dirty log entry has mismatched uuid - can't recover");
232		xlog_header_check_dump(mp, head);
233		XFS_ERROR_REPORT("xlog_header_check_recover(2)",
234				 XFS_ERRLEVEL_HIGH, mp);
235		return XFS_ERROR(EFSCORRUPTED);
236	}
237	return 0;
238}
239
240/*
241 * read the head block of the log and check the header
242 */
243STATIC int
244xlog_header_check_mount(
245	xfs_mount_t		*mp,
246	xlog_rec_header_t	*head)
247{
248	ASSERT(INT_GET(head->h_magicno, ARCH_CONVERT) == XLOG_HEADER_MAGIC_NUM);
249
250	if (uuid_is_nil(&head->h_fs_uuid)) {
251		/*
252		 * IRIX doesn't write the h_fs_uuid or h_fmt fields. If
253		 * h_fs_uuid is nil, we assume this log was last mounted
254		 * by IRIX and continue.
255		 */
256		xlog_warn("XFS: nil uuid in log - IRIX style log");
257	} else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
258		xlog_warn("XFS: log has mismatched uuid - can't recover");
259		xlog_header_check_dump(mp, head);
260		XFS_ERROR_REPORT("xlog_header_check_mount",
261				 XFS_ERRLEVEL_HIGH, mp);
262		return XFS_ERROR(EFSCORRUPTED);
263	}
264	return 0;
265}
266
267STATIC void
268xlog_recover_iodone(
269	struct xfs_buf	*bp)
270{
271	xfs_mount_t	*mp;
272
273	ASSERT(XFS_BUF_FSPRIVATE(bp, void *));
274
275	if (XFS_BUF_GETERROR(bp)) {
276		/*
277		 * We're not going to bother about retrying
278		 * this during recovery. One strike!
279		 */
280		mp = XFS_BUF_FSPRIVATE(bp, xfs_mount_t *);
281		xfs_ioerror_alert("xlog_recover_iodone",
282				  mp, bp, XFS_BUF_ADDR(bp));
283		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
284	}
285	XFS_BUF_SET_FSPRIVATE(bp, NULL);
286	XFS_BUF_CLR_IODONE_FUNC(bp);
287	xfs_biodone(bp);
288}
289
290/*
291 * This routine finds (to an approximation) the first block in the physical
292 * log which contains the given cycle.  It uses a binary search algorithm.
293 * Note that the algorithm can not be perfect because the disk will not
294 * necessarily be perfect.
295 */
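/*
 * A worked example (hypothetical on-disk layout):
 *	blk:    0  1  2  3  4  5  6  7
 *	cycle:  2  2  2  2  1  1  1  1
 * With first_blk = 0, *last_blk = 7 and cycle = 1, the bisection below
 * terminates with first_blk = 3 and *last_blk = 4, i.e. *last_blk ends up
 * at the first block stamped with the requested cycle.  As noted above,
 * the result is only an approximation when the transition isn't clean.
 */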
296int
297xlog_find_cycle_start(
298	xlog_t		*log,
299	xfs_buf_t	*bp,
300	xfs_daddr_t	first_blk,
301	xfs_daddr_t	*last_blk,
302	uint		cycle)
303{
304	xfs_caddr_t	offset;
305	xfs_daddr_t	mid_blk;
306	uint		mid_cycle;
307	int		error;
308
309	mid_blk = BLK_AVG(first_blk, *last_blk);
310	while (mid_blk != first_blk && mid_blk != *last_blk) {
311		if ((error = xlog_bread(log, mid_blk, 1, bp)))
312			return error;
313		offset = xlog_align(log, mid_blk, 1, bp);
314		mid_cycle = GET_CYCLE(offset, ARCH_CONVERT);
315		if (mid_cycle == cycle) {
316			*last_blk = mid_blk;
317			/* last_half_cycle == mid_cycle */
318		} else {
319			first_blk = mid_blk;
320			/* first_half_cycle == mid_cycle */
321		}
322		mid_blk = BLK_AVG(first_blk, *last_blk);
323	}
324	ASSERT((mid_blk == first_blk && mid_blk+1 == *last_blk) ||
325	       (mid_blk == *last_blk && mid_blk-1 == first_blk));
326
327	return 0;
328}
329
/*
 * Check that the range of blocks does not contain the given cycle number.
 * The scan occurs from front to back; if a block stamped with
 * stop_on_cycle_no is found, its block number is returned in *new_blk so
 * that a later routine can perform another test from that point.
 *
 * *new_blk is set to -1 if no such block is found.  This is an invalid
 * block number since we don't ever expect logs to get this large.
 */
340STATIC int
341xlog_find_verify_cycle(
342	xlog_t		*log,
343	xfs_daddr_t	start_blk,
344	int		nbblks,
345	uint		stop_on_cycle_no,
346	xfs_daddr_t	*new_blk)
347{
348	xfs_daddr_t	i, j;
349	uint		cycle;
350	xfs_buf_t	*bp;
351	xfs_daddr_t	bufblks;
352	xfs_caddr_t	buf = NULL;
353	int		error = 0;
354
355	bufblks = 1 << ffs(nbblks);
356
357	while (!(bp = xlog_get_bp(log, bufblks))) {
358		/* can't get enough memory to do everything in one big buffer */
359		bufblks >>= 1;
360		if (bufblks <= log->l_sectbb_log)
361			return ENOMEM;
362	}
363
364	for (i = start_blk; i < start_blk + nbblks; i += bufblks) {
365		int	bcount;
366
367		bcount = min(bufblks, (start_blk + nbblks - i));
368
369		if ((error = xlog_bread(log, i, bcount, bp)))
370			goto out;
371
372		buf = xlog_align(log, i, bcount, bp);
373		for (j = 0; j < bcount; j++) {
374			cycle = GET_CYCLE(buf, ARCH_CONVERT);
375			if (cycle == stop_on_cycle_no) {
376				*new_blk = i+j;
377				goto out;
378			}
379
380			buf += BBSIZE;
381		}
382	}
383
384	*new_blk = -1;
385
386out:
387	xlog_put_bp(bp);
388	return error;
389}
390
/*
 * Potentially back up over a partial log record write.
 *
 * In the typical case, last_blk is the number of the block directly after
 * a good log record.  Therefore, we subtract one to get the block number
 * of the last block in the given buffer.  extra_bblks contains the number
 * of blocks we would have read on a previous read.  This happens when the
 * last log record is split over the end of the physical log.
 *
 * extra_bblks is the number of blocks potentially verified on a previous
 * call to this routine.
 */
403STATIC int
404xlog_find_verify_log_record(
405	xlog_t			*log,
406	xfs_daddr_t		start_blk,
407	xfs_daddr_t		*last_blk,
408	int			extra_bblks)
409{
410	xfs_daddr_t		i;
411	xfs_buf_t		*bp;
412	xfs_caddr_t		offset = NULL;
413	xlog_rec_header_t	*head = NULL;
414	int			error = 0;
415	int			smallmem = 0;
416	int			num_blks = *last_blk - start_blk;
417	int			xhdrs;
418
419	ASSERT(start_blk != 0 || *last_blk != start_blk);
420
421	if (!(bp = xlog_get_bp(log, num_blks))) {
422		if (!(bp = xlog_get_bp(log, 1)))
423			return ENOMEM;
424		smallmem = 1;
425	} else {
426		if ((error = xlog_bread(log, start_blk, num_blks, bp)))
427			goto out;
428		offset = xlog_align(log, start_blk, num_blks, bp);
429		offset += ((num_blks - 1) << BBSHIFT);
430	}
431
432	for (i = (*last_blk) - 1; i >= 0; i--) {
433		if (i < start_blk) {
434			/* valid log record not found */
435			xlog_warn(
436		"XFS: Log inconsistent (didn't find previous header)");
437			ASSERT(0);
438			error = XFS_ERROR(EIO);
439			goto out;
440		}
441
442		if (smallmem) {
443			if ((error = xlog_bread(log, i, 1, bp)))
444				goto out;
445			offset = xlog_align(log, i, 1, bp);
446		}
447
448		head = (xlog_rec_header_t *)offset;
449
450		if (XLOG_HEADER_MAGIC_NUM ==
451		    INT_GET(head->h_magicno, ARCH_CONVERT))
452			break;
453
454		if (!smallmem)
455			offset -= BBSIZE;
456	}
457
458	/*
459	 * We hit the beginning of the physical log & still no header.  Return
460	 * to caller.  If caller can handle a return of -1, then this routine
461	 * will be called again for the end of the physical log.
462	 */
463	if (i == -1) {
464		error = -1;
465		goto out;
466	}
467
	/*
	 * We have the final block of the good log (the first block
	 * of the log record _before_ the head).  So we check the uuid.
	 */
472	if ((error = xlog_header_check_mount(log->l_mp, head)))
473		goto out;
474
475	/*
476	 * We may have found a log record header before we expected one.
477	 * last_blk will be the 1st block # with a given cycle #.  We may end
478	 * up reading an entire log record.  In this case, we don't want to
479	 * reset last_blk.  Only when last_blk points in the middle of a log
480	 * record do we update last_blk.
481	 */
482	if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) {
483		uint	h_size = INT_GET(head->h_size, ARCH_CONVERT);
484
485		xhdrs = h_size / XLOG_HEADER_CYCLE_SIZE;
486		if (h_size % XLOG_HEADER_CYCLE_SIZE)
487			xhdrs++;
488	} else {
489		xhdrs = 1;
490	}
491
492	if (*last_blk - i + extra_bblks
493			!= BTOBB(INT_GET(head->h_len, ARCH_CONVERT)) + xhdrs)
494		*last_blk = i;
495
496out:
497	xlog_put_bp(bp);
498	return error;
499}
500
/*
 * Head is defined to be the point of the log where the next log write
 * could go.  This means that incomplete LR writes at the end are
 * eliminated when calculating the head.  We aren't guaranteed that previous
 * LR writes have complete transactions.  We only know that a cycle number of
 * current cycle number -1 won't be present in the log if we start writing
 * from our current block number.
 *
 * last_blk contains the block number of the first block with a given
 * cycle number.
 *
 * Return: zero if normal, non-zero if error.
 */
514STATIC int
515xlog_find_head(
516	xlog_t 		*log,
517	xfs_daddr_t	*return_head_blk)
518{
519	xfs_buf_t	*bp;
520	xfs_caddr_t	offset;
521	xfs_daddr_t	new_blk, first_blk, start_blk, last_blk, head_blk;
522	int		num_scan_bblks;
523	uint		first_half_cycle, last_half_cycle;
524	uint		stop_on_cycle;
525	int		error, log_bbnum = log->l_logBBsize;
526
527	/* Is the end of the log device zeroed? */
528	if ((error = xlog_find_zeroed(log, &first_blk)) == -1) {
529		*return_head_blk = first_blk;
530
531		/* Is the whole lot zeroed? */
532		if (!first_blk) {
533			/* Linux XFS shouldn't generate totally zeroed logs -
534			 * mkfs etc write a dummy unmount record to a fresh
535			 * log so we can store the uuid in there
536			 */
537			xlog_warn("XFS: totally zeroed log");
538		}
539
540		return 0;
541	} else if (error) {
542		xlog_warn("XFS: empty log check failed");
543		return error;
544	}
545
546	first_blk = 0;			/* get cycle # of 1st block */
547	bp = xlog_get_bp(log, 1);
548	if (!bp)
549		return ENOMEM;
550	if ((error = xlog_bread(log, 0, 1, bp)))
551		goto bp_err;
552	offset = xlog_align(log, 0, 1, bp);
553	first_half_cycle = GET_CYCLE(offset, ARCH_CONVERT);
554
555	last_blk = head_blk = log_bbnum - 1;	/* get cycle # of last block */
556	if ((error = xlog_bread(log, last_blk, 1, bp)))
557		goto bp_err;
558	offset = xlog_align(log, last_blk, 1, bp);
559	last_half_cycle = GET_CYCLE(offset, ARCH_CONVERT);
560	ASSERT(last_half_cycle != 0);
561
	/*
	 * If the 1st half cycle number is equal to the last half cycle number,
	 * then the entire log is stamped with the same cycle number.  In this
	 * case, head_blk can't be set to zero (which makes sense).  The below
	 * math doesn't work out properly with head_blk equal to zero.  Instead,
	 * we set it to log_bbnum which is an invalid block number, but this
	 * value makes the math correct.  If head_blk doesn't change through
	 * all the tests below, *head_blk is set to zero at the very end rather
	 * than log_bbnum.  In a sense, log_bbnum and zero are the same block
	 * in a circular file.
	 */
573	if (first_half_cycle == last_half_cycle) {
574		/*
575		 * In this case we believe that the entire log should have
576		 * cycle number last_half_cycle.  We need to scan backwards
577		 * from the end verifying that there are no holes still
578		 * containing last_half_cycle - 1.  If we find such a hole,
579		 * then the start of that hole will be the new head.  The
580		 * simple case looks like
581		 *        x | x ... | x - 1 | x
582		 * Another case that fits this picture would be
583		 *        x | x + 1 | x ... | x
584		 * In this case the head really is somewhere at the end of the
585		 * log, as one of the latest writes at the beginning was
586		 * incomplete.
587		 * One more case is
588		 *        x | x + 1 | x ... | x - 1 | x
589		 * This is really the combination of the above two cases, and
590		 * the head has to end up at the start of the x-1 hole at the
591		 * end of the log.
592		 *
593		 * In the 256k log case, we will read from the beginning to the
594		 * end of the log and search for cycle numbers equal to x-1.
595		 * We don't worry about the x+1 blocks that we encounter,
596		 * because we know that they cannot be the head since the log
597		 * started with x.
598		 */
599		head_blk = log_bbnum;
600		stop_on_cycle = last_half_cycle - 1;
601	} else {
602		/*
603		 * In this case we want to find the first block with cycle
604		 * number matching last_half_cycle.  We expect the log to be
605		 * some variation on
606		 *        x + 1 ... | x ...
607		 * The first block with cycle number x (last_half_cycle) will
608		 * be where the new head belongs.  First we do a binary search
609		 * for the first occurrence of last_half_cycle.  The binary
610		 * search may not be totally accurate, so then we scan back
611		 * from there looking for occurrences of last_half_cycle before
612		 * us.  If that backwards scan wraps around the beginning of
613		 * the log, then we look for occurrences of last_half_cycle - 1
614		 * at the end of the log.  The cases we're looking for look
615		 * like
616		 *        x + 1 ... | x | x + 1 | x ...
617		 *                               ^ binary search stopped here
618		 * or
619		 *        x + 1 ... | x ... | x - 1 | x
620		 *        <---------> less than scan distance
621		 */
622		stop_on_cycle = last_half_cycle;
623		if ((error = xlog_find_cycle_start(log, bp, first_blk,
624						&head_blk, last_half_cycle)))
625			goto bp_err;
626	}
627
628	/*
629	 * Now validate the answer.  Scan back some number of maximum possible
630	 * blocks and make sure each one has the expected cycle number.  The
631	 * maximum is determined by the total possible amount of buffering
632	 * in the in-core log.  The following number can be made tighter if
633	 * we actually look at the block size of the filesystem.
634	 */
635	num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
636	if (head_blk >= num_scan_bblks) {
637		/*
638		 * We are guaranteed that the entire check can be performed
639		 * in one buffer.
640		 */
641		start_blk = head_blk - num_scan_bblks;
642		if ((error = xlog_find_verify_cycle(log,
643						start_blk, num_scan_bblks,
644						stop_on_cycle, &new_blk)))
645			goto bp_err;
646		if (new_blk != -1)
647			head_blk = new_blk;
648	} else {		/* need to read 2 parts of log */
649		/*
650		 * We are going to scan backwards in the log in two parts.
651		 * First we scan the physical end of the log.  In this part
652		 * of the log, we are looking for blocks with cycle number
653		 * last_half_cycle - 1.
654		 * If we find one, then we know that the log starts there, as
655		 * we've found a hole that didn't get written in going around
656		 * the end of the physical log.  The simple case for this is
657		 *        x + 1 ... | x ... | x - 1 | x
658		 *        <---------> less than scan distance
659		 * If all of the blocks at the end of the log have cycle number
660		 * last_half_cycle, then we check the blocks at the start of
661		 * the log looking for occurrences of last_half_cycle.  If we
662		 * find one, then our current estimate for the location of the
663		 * first occurrence of last_half_cycle is wrong and we move
664		 * back to the hole we've found.  This case looks like
665		 *        x + 1 ... | x | x + 1 | x ...
666		 *                               ^ binary search stopped here
667		 * Another case we need to handle that only occurs in 256k
668		 * logs is
669		 *        x + 1 ... | x ... | x+1 | x ...
670		 *                   ^ binary search stops here
671		 * In a 256k log, the scan at the end of the log will see the
672		 * x + 1 blocks.  We need to skip past those since that is
673		 * certainly not the head of the log.  By searching for
674		 * last_half_cycle-1 we accomplish that.
675		 */
676		start_blk = log_bbnum - num_scan_bblks + head_blk;
677		ASSERT(head_blk <= INT_MAX &&
678			(xfs_daddr_t) num_scan_bblks - head_blk >= 0);
679		if ((error = xlog_find_verify_cycle(log, start_blk,
680					num_scan_bblks - (int)head_blk,
681					(stop_on_cycle - 1), &new_blk)))
682			goto bp_err;
683		if (new_blk != -1) {
684			head_blk = new_blk;
685			goto bad_blk;
686		}
687
688		/*
689		 * Scan beginning of log now.  The last part of the physical
690		 * log is good.  This scan needs to verify that it doesn't find
691		 * the last_half_cycle.
692		 */
693		start_blk = 0;
694		ASSERT(head_blk <= INT_MAX);
695		if ((error = xlog_find_verify_cycle(log,
696					start_blk, (int)head_blk,
697					stop_on_cycle, &new_blk)))
698			goto bp_err;
699		if (new_blk != -1)
700			head_blk = new_blk;
701	}
702
703 bad_blk:
704	/*
705	 * Now we need to make sure head_blk is not pointing to a block in
706	 * the middle of a log record.
707	 */
708	num_scan_bblks = XLOG_REC_SHIFT(log);
709	if (head_blk >= num_scan_bblks) {
710		start_blk = head_blk - num_scan_bblks; /* don't read head_blk */
711
712		/* start ptr at last block ptr before head_blk */
713		if ((error = xlog_find_verify_log_record(log, start_blk,
714							&head_blk, 0)) == -1) {
715			error = XFS_ERROR(EIO);
716			goto bp_err;
717		} else if (error)
718			goto bp_err;
719	} else {
720		start_blk = 0;
721		ASSERT(head_blk <= INT_MAX);
722		if ((error = xlog_find_verify_log_record(log, start_blk,
723							&head_blk, 0)) == -1) {
724			/* We hit the beginning of the log during our search */
725			start_blk = log_bbnum - num_scan_bblks + head_blk;
726			new_blk = log_bbnum;
727			ASSERT(start_blk <= INT_MAX &&
728				(xfs_daddr_t) log_bbnum-start_blk >= 0);
729			ASSERT(head_blk <= INT_MAX);
730			if ((error = xlog_find_verify_log_record(log,
731							start_blk, &new_blk,
732							(int)head_blk)) == -1) {
733				error = XFS_ERROR(EIO);
734				goto bp_err;
735			} else if (error)
736				goto bp_err;
737			if (new_blk != log_bbnum)
738				head_blk = new_blk;
739		} else if (error)
740			goto bp_err;
741	}
742
743	xlog_put_bp(bp);
744	if (head_blk == log_bbnum)
745		*return_head_blk = 0;
746	else
747		*return_head_blk = head_blk;
748	/*
749	 * When returning here, we have a good block number.  Bad block
750	 * means that during a previous crash, we didn't have a clean break
751	 * from cycle number N to cycle number N-1.  In this case, we need
752	 * to find the first block with cycle number N-1.
753	 */
754	return 0;
755
756 bp_err:
757	xlog_put_bp(bp);
758
	if (error)
		xlog_warn("XFS: failed to find log head");
761	return error;
762}
763
764/*
765 * Find the sync block number or the tail of the log.
766 *
767 * This will be the block number of the last record to have its
768 * associated buffers synced to disk.  Every log record header has
769 * a sync lsn embedded in it.  LSNs hold block numbers, so it is easy
770 * to get a sync block number.  The only concern is to figure out which
771 * log record header to believe.
772 *
773 * The following algorithm uses the log record header with the largest
774 * lsn.  The entire log record does not need to be valid.  We only care
775 * that the header is valid.
776 *
777 * We could speed up search by using current head_blk buffer, but it is not
778 * available.
779 */
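/*
 * For reference: an LSN packs the cycle number in its upper 32 bits and a
 * basic block number in its lower 32 bits, so CYCLE_LSN() and BLOCK_LSN()
 * are simple shifts/masks.  That is why *tail_blk can be pulled straight
 * out of h_tail_lsn with BLOCK_LSN() below.
 */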
780int
781xlog_find_tail(
782	xlog_t			*log,
783	xfs_daddr_t		*head_blk,
784	xfs_daddr_t		*tail_blk)
785{
786	xlog_rec_header_t	*rhead;
787	xlog_op_header_t	*op_head;
788	xfs_caddr_t		offset = NULL;
789	xfs_buf_t		*bp;
790	int			error, i, found;
791	xfs_daddr_t		umount_data_blk;
792	xfs_daddr_t		after_umount_blk;
793	xfs_lsn_t		tail_lsn;
794	int			hblks;
795
796	found = 0;
797
798	/*
799	 * Find previous log record
800	 */
801	if ((error = xlog_find_head(log, head_blk)))
802		return error;
803
804	bp = xlog_get_bp(log, 1);
805	if (!bp)
806		return ENOMEM;
807	if (*head_blk == 0) {				/* special case */
808		if ((error = xlog_bread(log, 0, 1, bp)))
809			goto bread_err;
810		offset = xlog_align(log, 0, 1, bp);
811		if (GET_CYCLE(offset, ARCH_CONVERT) == 0) {
812			*tail_blk = 0;
813			/* leave all other log inited values alone */
814			goto exit;
815		}
816	}
817
818	/*
819	 * Search backwards looking for log record header block
820	 */
821	ASSERT(*head_blk < INT_MAX);
822	for (i = (int)(*head_blk) - 1; i >= 0; i--) {
823		if ((error = xlog_bread(log, i, 1, bp)))
824			goto bread_err;
825		offset = xlog_align(log, i, 1, bp);
826		if (XLOG_HEADER_MAGIC_NUM ==
827		    INT_GET(*(uint *)offset, ARCH_CONVERT)) {
828			found = 1;
829			break;
830		}
831	}
832	/*
833	 * If we haven't found the log record header block, start looking
834	 * again from the end of the physical log.  XXXmiken: There should be
835	 * a check here to make sure we didn't search more than N blocks in
836	 * the previous code.
837	 */
838	if (!found) {
839		for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) {
840			if ((error = xlog_bread(log, i, 1, bp)))
841				goto bread_err;
842			offset = xlog_align(log, i, 1, bp);
843			if (XLOG_HEADER_MAGIC_NUM ==
844			    INT_GET(*(uint*)offset, ARCH_CONVERT)) {
845				found = 2;
846				break;
847			}
848		}
849	}
850	if (!found) {
851		xlog_warn("XFS: xlog_find_tail: couldn't find sync record");
852		ASSERT(0);
853		return XFS_ERROR(EIO);
854	}
855
856	/* find blk_no of tail of log */
857	rhead = (xlog_rec_header_t *)offset;
858	*tail_blk = BLOCK_LSN(INT_GET(rhead->h_tail_lsn, ARCH_CONVERT));
859
860	/*
861	 * Reset log values according to the state of the log when we
862	 * crashed.  In the case where head_blk == 0, we bump curr_cycle
863	 * one because the next write starts a new cycle rather than
864	 * continuing the cycle of the last good log record.  At this
865	 * point we have guaranteed that all partial log records have been
866	 * accounted for.  Therefore, we know that the last good log record
867	 * written was complete and ended exactly on the end boundary
868	 * of the physical log.
869	 */
870	log->l_prev_block = i;
871	log->l_curr_block = (int)*head_blk;
872	log->l_curr_cycle = INT_GET(rhead->h_cycle, ARCH_CONVERT);
873	if (found == 2)
874		log->l_curr_cycle++;
875	log->l_tail_lsn = INT_GET(rhead->h_tail_lsn, ARCH_CONVERT);
876	log->l_last_sync_lsn = INT_GET(rhead->h_lsn, ARCH_CONVERT);
877	log->l_grant_reserve_cycle = log->l_curr_cycle;
878	log->l_grant_reserve_bytes = BBTOB(log->l_curr_block);
879	log->l_grant_write_cycle = log->l_curr_cycle;
880	log->l_grant_write_bytes = BBTOB(log->l_curr_block);
881
882	/*
883	 * Look for unmount record.  If we find it, then we know there
884	 * was a clean unmount.  Since 'i' could be the last block in
885	 * the physical log, we convert to a log block before comparing
886	 * to the head_blk.
887	 *
	 * Save the current tail lsn to pass to xlog_clear_stale_blocks()
	 * below.  We don't want to clear the unmount record if there is
	 * one, so we pass the lsn of the unmount record rather than the
	 * block after it.
892	 */
893	if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) {
894		int	h_size = INT_GET(rhead->h_size, ARCH_CONVERT);
895		int	h_version = INT_GET(rhead->h_version, ARCH_CONVERT);
896
897		if ((h_version & XLOG_VERSION_2) &&
898		    (h_size > XLOG_HEADER_CYCLE_SIZE)) {
899			hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
900			if (h_size % XLOG_HEADER_CYCLE_SIZE)
901				hblks++;
902		} else {
903			hblks = 1;
904		}
905	} else {
906		hblks = 1;
907	}
908	after_umount_blk = (i + hblks + (int)
909		BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT))) % log->l_logBBsize;
910	tail_lsn = log->l_tail_lsn;
911	if (*head_blk == after_umount_blk &&
912	    INT_GET(rhead->h_num_logops, ARCH_CONVERT) == 1) {
913		umount_data_blk = (i + hblks) % log->l_logBBsize;
914		if ((error = xlog_bread(log, umount_data_blk, 1, bp))) {
915			goto bread_err;
916		}
917		offset = xlog_align(log, umount_data_blk, 1, bp);
918		op_head = (xlog_op_header_t *)offset;
919		if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
920			/*
921			 * Set tail and last sync so that newly written
922			 * log records will point recovery to after the
923			 * current unmount record.
924			 */
925			ASSIGN_ANY_LSN_HOST(log->l_tail_lsn, log->l_curr_cycle,
926					after_umount_blk);
927			ASSIGN_ANY_LSN_HOST(log->l_last_sync_lsn, log->l_curr_cycle,
928					after_umount_blk);
929			*tail_blk = after_umount_blk;
930		}
931	}
932
933	/*
934	 * Make sure that there are no blocks in front of the head
935	 * with the same cycle number as the head.  This can happen
936	 * because we allow multiple outstanding log writes concurrently,
937	 * and the later writes might make it out before earlier ones.
938	 *
939	 * We use the lsn from before modifying it so that we'll never
940	 * overwrite the unmount record after a clean unmount.
941	 *
942	 * Do this only if we are going to recover the filesystem
943	 *
944	 * NOTE: This used to say "if (!readonly)"
945	 * However on Linux, we can & do recover a read-only filesystem.
946	 * We only skip recovery if NORECOVERY is specified on mount,
947	 * in which case we would not be here.
948	 *
949	 * But... if the -device- itself is readonly, just skip this.
950	 * We can't recover this device anyway, so it won't matter.
951	 */
952	if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) {
953		error = xlog_clear_stale_blocks(log, tail_lsn);
954	}
955
956bread_err:
957exit:
958	xlog_put_bp(bp);
959
960	if (error)
961		xlog_warn("XFS: failed to locate log tail");
962	return error;
963}
964
965/*
966 * Is the log zeroed at all?
967 *
968 * The last binary search should be changed to perform an X block read
969 * once X becomes small enough.  You can then search linearly through
970 * the X blocks.  This will cut down on the number of reads we need to do.
971 *
972 * If the log is partially zeroed, this routine will pass back the blkno
973 * of the first block with cycle number 0.  It won't have a complete LR
974 * preceding it.
975 *
976 * Return:
977 *	0  => the log is completely written to
978 *	-1 => use *blk_no as the first block of the log
979 *	>0 => error has occurred
980 */
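/*
 * A minimal caller sketch, mirroring the use in xlog_find_head() above:
 *
 *	error = xlog_find_zeroed(log, &first_blk);
 *	if (error == -1)
 *		*return_head_blk = first_blk;	(log partially zeroed)
 *	else if (error)
 *		return error;			(real I/O or format error)
 *	else
 *		do the normal head search;	(log fully written)
 */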
981int
982xlog_find_zeroed(
983	xlog_t		*log,
984	xfs_daddr_t	*blk_no)
985{
986	xfs_buf_t	*bp;
987	xfs_caddr_t	offset;
988	uint	        first_cycle, last_cycle;
989	xfs_daddr_t	new_blk, last_blk, start_blk;
990	xfs_daddr_t     num_scan_bblks;
991	int	        error, log_bbnum = log->l_logBBsize;
992
993	*blk_no = 0;
994
995	/* check totally zeroed log */
996	bp = xlog_get_bp(log, 1);
997	if (!bp)
998		return ENOMEM;
999	if ((error = xlog_bread(log, 0, 1, bp)))
1000		goto bp_err;
1001	offset = xlog_align(log, 0, 1, bp);
1002	first_cycle = GET_CYCLE(offset, ARCH_CONVERT);
1003	if (first_cycle == 0) {		/* completely zeroed log */
1004		*blk_no = 0;
1005		xlog_put_bp(bp);
1006		return -1;
1007	}
1008
1009	/* check partially zeroed log */
1010	if ((error = xlog_bread(log, log_bbnum-1, 1, bp)))
1011		goto bp_err;
1012	offset = xlog_align(log, log_bbnum-1, 1, bp);
1013	last_cycle = GET_CYCLE(offset, ARCH_CONVERT);
1014	if (last_cycle != 0) {		/* log completely written to */
1015		xlog_put_bp(bp);
1016		return 0;
	} else if (first_cycle != 1) {
		/*
		 * If the cycle of the last block is zero, the cycle of
		 * the first block must be 1. If it's not, maybe we're
		 * not looking at a log... Bail out.  Go through bp_err
		 * so the buffer isn't leaked on this path.
		 */
		xlog_warn("XFS: Log inconsistent or not a log (last==0, first!=1)");
		error = XFS_ERROR(EINVAL);
		goto bp_err;
	}
1026
1027	/* we have a partially zeroed log */
1028	last_blk = log_bbnum-1;
1029	if ((error = xlog_find_cycle_start(log, bp, 0, &last_blk, 0)))
1030		goto bp_err;
1031
1032	/*
1033	 * Validate the answer.  Because there is no way to guarantee that
1034	 * the entire log is made up of log records which are the same size,
1035	 * we scan over the defined maximum blocks.  At this point, the maximum
1036	 * is not chosen to mean anything special.   XXXmiken
1037	 */
1038	num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
1039	ASSERT(num_scan_bblks <= INT_MAX);
1040
1041	if (last_blk < num_scan_bblks)
1042		num_scan_bblks = last_blk;
1043	start_blk = last_blk - num_scan_bblks;
1044
1045	/*
1046	 * We search for any instances of cycle number 0 that occur before
1047	 * our current estimate of the head.  What we're trying to detect is
1048	 *        1 ... | 0 | 1 | 0...
1049	 *                       ^ binary search ends here
1050	 */
1051	if ((error = xlog_find_verify_cycle(log, start_blk,
1052					 (int)num_scan_bblks, 0, &new_blk)))
1053		goto bp_err;
1054	if (new_blk != -1)
1055		last_blk = new_blk;
1056
1057	/*
1058	 * Potentially backup over partial log record write.  We don't need
1059	 * to search the end of the log because we know it is zero.
1060	 */
	if ((error = xlog_find_verify_log_record(log, start_blk,
				&last_blk, 0)) == -1) {
		error = XFS_ERROR(EIO);
		goto bp_err;
	} else if (error)
		goto bp_err;
1067
1068	*blk_no = last_blk;
1069bp_err:
1070	xlog_put_bp(bp);
1071	if (error)
1072		return error;
1073	return -1;
1074}
1075
1076/*
1077 * These are simple subroutines used by xlog_clear_stale_blocks() below
1078 * to initialize a buffer full of empty log record headers and write
1079 * them into the log.
1080 */
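/*
 * "Empty" here means a bare xlog_rec_header_t: because of the memset()
 * below, h_len and h_num_logops stay zero, so the record carries no
 * operations and only serves to stamp its block with the wanted cycle,
 * lsn and tail lsn.
 */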
1081STATIC void
1082xlog_add_record(
1083	xlog_t			*log,
1084	xfs_caddr_t		buf,
1085	int			cycle,
1086	int			block,
1087	int			tail_cycle,
1088	int			tail_block)
1089{
1090	xlog_rec_header_t	*recp = (xlog_rec_header_t *)buf;
1091
1092	memset(buf, 0, BBSIZE);
1093	INT_SET(recp->h_magicno, ARCH_CONVERT, XLOG_HEADER_MAGIC_NUM);
1094	INT_SET(recp->h_cycle, ARCH_CONVERT, cycle);
1095	INT_SET(recp->h_version, ARCH_CONVERT,
1096			XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb) ? 2 : 1);
1097	ASSIGN_ANY_LSN_DISK(recp->h_lsn, cycle, block);
1098	ASSIGN_ANY_LSN_DISK(recp->h_tail_lsn, tail_cycle, tail_block);
1099	INT_SET(recp->h_fmt, ARCH_CONVERT, XLOG_FMT);
1100	memcpy(&recp->h_fs_uuid, &log->l_mp->m_sb.sb_uuid, sizeof(uuid_t));
1101}
1102
1103STATIC int
1104xlog_write_log_records(
1105	xlog_t		*log,
1106	int		cycle,
1107	int		start_block,
1108	int		blocks,
1109	int		tail_cycle,
1110	int		tail_block)
1111{
1112	xfs_caddr_t	offset;
1113	xfs_buf_t	*bp;
1114	int		balign, ealign;
1115	int		sectbb = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1);
1116	int		end_block = start_block + blocks;
1117	int		bufblks;
1118	int		error = 0;
1119	int		i, j = 0;
1120
1121	bufblks = 1 << ffs(blocks);
1122	while (!(bp = xlog_get_bp(log, bufblks))) {
1123		bufblks >>= 1;
1124		if (bufblks <= log->l_sectbb_log)
1125			return ENOMEM;
1126	}
1127
1128	/* We may need to do a read at the start to fill in part of
1129	 * the buffer in the starting sector not covered by the first
1130	 * write below.
1131	 */
1132	balign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, start_block);
1133	if (balign != start_block) {
1134		if ((error = xlog_bread(log, start_block, 1, bp))) {
1135			xlog_put_bp(bp);
1136			return error;
1137		}
1138		j = start_block - balign;
1139	}
1140
1141	for (i = start_block; i < end_block; i += bufblks) {
1142		int		bcount, endcount;
1143
1144		bcount = min(bufblks, end_block - start_block);
1145		endcount = bcount - j;
1146
1147		/* We may need to do a read at the end to fill in part of
1148		 * the buffer in the final sector not covered by the write.
1149		 * If this is the same sector as the above read, skip it.
1150		 */
1151		ealign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, end_block);
1152		if (j == 0 && (start_block + endcount > ealign)) {
1153			offset = XFS_BUF_PTR(bp);
1154			balign = BBTOB(ealign - start_block);
1155			XFS_BUF_SET_PTR(bp, offset + balign, BBTOB(sectbb));
1156			if ((error = xlog_bread(log, ealign, sectbb, bp)))
1157				break;
1158			XFS_BUF_SET_PTR(bp, offset, bufblks);
1159		}
1160
1161		offset = xlog_align(log, start_block, endcount, bp);
1162		for (; j < endcount; j++) {
1163			xlog_add_record(log, offset, cycle, i+j,
1164					tail_cycle, tail_block);
1165			offset += BBSIZE;
1166		}
1167		error = xlog_bwrite(log, start_block, endcount, bp);
1168		if (error)
1169			break;
1170		start_block += endcount;
1171		j = 0;
1172	}
1173	xlog_put_bp(bp);
1174	return error;
1175}
1176
1177/*
1178 * This routine is called to blow away any incomplete log writes out
1179 * in front of the log head.  We do this so that we won't become confused
1180 * if we come up, write only a little bit more, and then crash again.
1181 * If we leave the partial log records out there, this situation could
1182 * cause us to think those partial writes are valid blocks since they
1183 * have the current cycle number.  We get rid of them by overwriting them
1184 * with empty log records with the old cycle number rather than the
1185 * current one.
1186 *
1187 * The tail lsn is passed in rather than taken from
1188 * the log so that we will not write over the unmount record after a
1189 * clean unmount in a 512 block log.  Doing so would leave the log without
1190 * any valid log records in it until a new one was written.  If we crashed
1191 * during that time we would not be able to recover.
1192 */
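/*
 * A small worked example (hypothetical numbers): with a 100 block log,
 * head_cycle == 5, head_block == 90, tail_cycle == 5 and tail_block == 20,
 * tail_distance is 20 + (100 - 90) == 30.  Assuming the maximum outstanding
 * I/O is at least that large, we stomp blocks 90..99 with cycle 4 headers
 * and blocks 0..19 with cycle 5 headers, stopping just short of the tail.
 */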
1193STATIC int
1194xlog_clear_stale_blocks(
1195	xlog_t		*log,
1196	xfs_lsn_t	tail_lsn)
1197{
1198	int		tail_cycle, head_cycle;
1199	int		tail_block, head_block;
1200	int		tail_distance, max_distance;
1201	int		distance;
1202	int		error;
1203
1204	tail_cycle = CYCLE_LSN(tail_lsn);
1205	tail_block = BLOCK_LSN(tail_lsn);
1206	head_cycle = log->l_curr_cycle;
1207	head_block = log->l_curr_block;
1208
1209	/*
1210	 * Figure out the distance between the new head of the log
1211	 * and the tail.  We want to write over any blocks beyond the
1212	 * head that we may have written just before the crash, but
1213	 * we don't want to overwrite the tail of the log.
1214	 */
1215	if (head_cycle == tail_cycle) {
1216		/*
1217		 * The tail is behind the head in the physical log,
1218		 * so the distance from the head to the tail is the
1219		 * distance from the head to the end of the log plus
1220		 * the distance from the beginning of the log to the
1221		 * tail.
1222		 */
1223		if (unlikely(head_block < tail_block || head_block >= log->l_logBBsize)) {
1224			XFS_ERROR_REPORT("xlog_clear_stale_blocks(1)",
1225					 XFS_ERRLEVEL_LOW, log->l_mp);
1226			return XFS_ERROR(EFSCORRUPTED);
1227		}
1228		tail_distance = tail_block + (log->l_logBBsize - head_block);
1229	} else {
1230		/*
1231		 * The head is behind the tail in the physical log,
1232		 * so the distance from the head to the tail is just
1233		 * the tail block minus the head block.
1234		 */
1235		if (unlikely(head_block >= tail_block || head_cycle != (tail_cycle + 1))){
1236			XFS_ERROR_REPORT("xlog_clear_stale_blocks(2)",
1237					 XFS_ERRLEVEL_LOW, log->l_mp);
1238			return XFS_ERROR(EFSCORRUPTED);
1239		}
1240		tail_distance = tail_block - head_block;
1241	}
1242
1243	/*
1244	 * If the head is right up against the tail, we can't clear
1245	 * anything.
1246	 */
1247	if (tail_distance <= 0) {
1248		ASSERT(tail_distance == 0);
1249		return 0;
1250	}
1251
1252	max_distance = XLOG_TOTAL_REC_SHIFT(log);
1253	/*
1254	 * Take the smaller of the maximum amount of outstanding I/O
1255	 * we could have and the distance to the tail to clear out.
1256	 * We take the smaller so that we don't overwrite the tail and
1257	 * we don't waste all day writing from the head to the tail
1258	 * for no reason.
1259	 */
1260	max_distance = MIN(max_distance, tail_distance);
1261
1262	if ((head_block + max_distance) <= log->l_logBBsize) {
1263		/*
1264		 * We can stomp all the blocks we need to without
1265		 * wrapping around the end of the log.  Just do it
1266		 * in a single write.  Use the cycle number of the
1267		 * current cycle minus one so that the log will look like:
1268		 *     n ... | n - 1 ...
1269		 */
1270		error = xlog_write_log_records(log, (head_cycle - 1),
1271				head_block, max_distance, tail_cycle,
1272				tail_block);
1273		if (error)
1274			return error;
1275	} else {
1276		/*
1277		 * We need to wrap around the end of the physical log in
1278		 * order to clear all the blocks.  Do it in two separate
1279		 * I/Os.  The first write should be from the head to the
1280		 * end of the physical log, and it should use the current
1281		 * cycle number minus one just like above.
1282		 */
1283		distance = log->l_logBBsize - head_block;
1284		error = xlog_write_log_records(log, (head_cycle - 1),
1285				head_block, distance, tail_cycle,
1286				tail_block);
1287
1288		if (error)
1289			return error;
1290
1291		/*
1292		 * Now write the blocks at the start of the physical log.
1293		 * This writes the remainder of the blocks we want to clear.
1294		 * It uses the current cycle number since we're now on the
1295		 * same cycle as the head so that we get:
1296		 *    n ... n ... | n - 1 ...
1297		 *    ^^^^^ blocks we're writing
1298		 */
1299		distance = max_distance - (log->l_logBBsize - head_block);
1300		error = xlog_write_log_records(log, head_cycle, 0, distance,
1301				tail_cycle, tail_block);
1302		if (error)
1303			return error;
1304	}
1305
1306	return 0;
1307}
1308
1309/******************************************************************************
1310 *
1311 *		Log recover routines
1312 *
1313 ******************************************************************************
1314 */
1315
1316STATIC xlog_recover_t *
1317xlog_recover_find_tid(
1318	xlog_recover_t		*q,
1319	xlog_tid_t		tid)
1320{
1321	xlog_recover_t		*p = q;
1322
1323	while (p != NULL) {
1324		if (p->r_log_tid == tid)
1325		    break;
1326		p = p->r_next;
1327	}
1328	return p;
1329}
1330
1331STATIC void
1332xlog_recover_put_hashq(
1333	xlog_recover_t		**q,
1334	xlog_recover_t		*trans)
1335{
1336	trans->r_next = *q;
1337	*q = trans;
1338}
1339
1340STATIC void
1341xlog_recover_add_item(
1342	xlog_recover_item_t	**itemq)
1343{
1344	xlog_recover_item_t	*item;
1345
1346	item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP);
1347	xlog_recover_insert_item_backq(itemq, item);
1348}
1349
1350STATIC int
1351xlog_recover_add_to_cont_trans(
1352	xlog_recover_t		*trans,
1353	xfs_caddr_t		dp,
1354	int			len)
1355{
1356	xlog_recover_item_t	*item;
1357	xfs_caddr_t		ptr, old_ptr;
1358	int			old_len;
1359
1360	item = trans->r_itemq;
1361	if (item == 0) {
1362		/* finish copying rest of trans header */
1363		xlog_recover_add_item(&trans->r_itemq);
1364		ptr = (xfs_caddr_t) &trans->r_theader +
1365				sizeof(xfs_trans_header_t) - len;
1366		memcpy(ptr, dp, len); /* d, s, l */
1367		return 0;
1368	}
1369	item = item->ri_prev;
1370
1371	old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
1372	old_len = item->ri_buf[item->ri_cnt-1].i_len;
1373
1374	ptr = kmem_realloc(old_ptr, len+old_len, old_len, 0u);
1375	memcpy(&ptr[old_len], dp, len); /* d, s, l */
1376	item->ri_buf[item->ri_cnt-1].i_len += len;
1377	item->ri_buf[item->ri_cnt-1].i_addr = ptr;
1378	return 0;
1379}
1380
1381/*
1382 * The next region to add is the start of a new region.  It could be
1383 * a whole region or it could be the first part of a new region.  Because
1384 * of this, the assumption here is that the type and size fields of all
1385 * format structures fit into the first 32 bits of the structure.
1386 *
1387 * This works because all regions must be 32 bit aligned.  Therefore, we
1388 * either have both fields or we have neither field.  In the case we have
1389 * neither field, the data part of the region is zero length.  We only have
1390 * a log_op_header and can throw away the header since a new one will appear
1391 * later.  If we have at least 4 bytes, then we can determine how many regions
1392 * will appear in the current log item.
1393 */
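/*
 * As an illustration (relying on the shared layout of the *_log_format
 * structures): xfs_inode_log_format_t starts with 16 bit ilf_type and
 * ilf_size fields, so the first 32 bits of a region are enough to read
 * in_f->ilf_size below and size the ri_buf array for the new item.
 */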
1394STATIC int
1395xlog_recover_add_to_trans(
1396	xlog_recover_t		*trans,
1397	xfs_caddr_t		dp,
1398	int			len)
1399{
1400	xfs_inode_log_format_t	*in_f;			/* any will do */
1401	xlog_recover_item_t	*item;
1402	xfs_caddr_t		ptr;
1403
1404	if (!len)
1405		return 0;
1406	item = trans->r_itemq;
1407	if (item == 0) {
1408		ASSERT(*(uint *)dp == XFS_TRANS_HEADER_MAGIC);
1409		if (len == sizeof(xfs_trans_header_t))
1410			xlog_recover_add_item(&trans->r_itemq);
1411		memcpy(&trans->r_theader, dp, len); /* d, s, l */
1412		return 0;
1413	}
1414
1415	ptr = kmem_alloc(len, KM_SLEEP);
1416	memcpy(ptr, dp, len);
1417	in_f = (xfs_inode_log_format_t *)ptr;
1418
1419	if (item->ri_prev->ri_total != 0 &&
1420	     item->ri_prev->ri_total == item->ri_prev->ri_cnt) {
1421		xlog_recover_add_item(&trans->r_itemq);
1422	}
1423	item = trans->r_itemq;
1424	item = item->ri_prev;
1425
1426	if (item->ri_total == 0) {		/* first region to be added */
1427		item->ri_total	= in_f->ilf_size;
1428		ASSERT(item->ri_total <= XLOG_MAX_REGIONS_IN_ITEM);
1429		item->ri_buf = kmem_zalloc((item->ri_total *
1430					    sizeof(xfs_log_iovec_t)), KM_SLEEP);
1431	}
1432	ASSERT(item->ri_total > item->ri_cnt);
1433	/* Description region is ri_buf[0] */
1434	item->ri_buf[item->ri_cnt].i_addr = ptr;
1435	item->ri_buf[item->ri_cnt].i_len  = len;
1436	item->ri_cnt++;
1437	return 0;
1438}
1439
1440STATIC void
1441xlog_recover_new_tid(
1442	xlog_recover_t		**q,
1443	xlog_tid_t		tid,
1444	xfs_lsn_t		lsn)
1445{
1446	xlog_recover_t		*trans;
1447
1448	trans = kmem_zalloc(sizeof(xlog_recover_t), KM_SLEEP);
1449	trans->r_log_tid   = tid;
1450	trans->r_lsn	   = lsn;
1451	xlog_recover_put_hashq(q, trans);
1452}
1453
1454STATIC int
1455xlog_recover_unlink_tid(
1456	xlog_recover_t		**q,
1457	xlog_recover_t		*trans)
1458{
1459	xlog_recover_t		*tp;
1460	int			found = 0;
1461
1462	ASSERT(trans != 0);
1463	if (trans == *q) {
1464		*q = (*q)->r_next;
1465	} else {
1466		tp = *q;
1467		while (tp != 0) {
1468			if (tp->r_next == trans) {
1469				found = 1;
1470				break;
1471			}
1472			tp = tp->r_next;
1473		}
1474		if (!found) {
1475			xlog_warn(
1476			     "XFS: xlog_recover_unlink_tid: trans not found");
1477			ASSERT(0);
1478			return XFS_ERROR(EIO);
1479		}
1480		tp->r_next = tp->r_next->r_next;
1481	}
1482	return 0;
1483}
1484
1485STATIC void
1486xlog_recover_insert_item_backq(
1487	xlog_recover_item_t	**q,
1488	xlog_recover_item_t	*item)
1489{
1490	if (*q == 0) {
1491		item->ri_prev = item->ri_next = item;
1492		*q = item;
1493	} else {
1494		item->ri_next		= *q;
1495		item->ri_prev		= (*q)->ri_prev;
1496		(*q)->ri_prev		= item;
1497		item->ri_prev->ri_next	= item;
1498	}
1499}
1500
1501STATIC void
1502xlog_recover_insert_item_frontq(
1503	xlog_recover_item_t	**q,
1504	xlog_recover_item_t	*item)
1505{
1506	xlog_recover_insert_item_backq(q, item);
1507	*q = item;
1508}
1509
1510STATIC int
1511xlog_recover_reorder_trans(
1512	xlog_recover_t		*trans)
1513{
1514	xlog_recover_item_t	*first_item, *itemq, *itemq_next;
1515	xfs_buf_log_format_t	*buf_f;
1516	ushort			flags = 0;
1517
1518	first_item = itemq = trans->r_itemq;
1519	trans->r_itemq = NULL;
1520	do {
1521		itemq_next = itemq->ri_next;
1522		buf_f = (xfs_buf_log_format_t *)itemq->ri_buf[0].i_addr;
1523
1524		switch (ITEM_TYPE(itemq)) {
1525		case XFS_LI_BUF:
1526			flags = buf_f->blf_flags;
1527			if (!(flags & XFS_BLI_CANCEL)) {
1528				xlog_recover_insert_item_frontq(&trans->r_itemq,
1529								itemq);
1530				break;
1531			}
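			/* cancelled buffers fall through to the common insert */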
1532		case XFS_LI_INODE:
1533		case XFS_LI_DQUOT:
1534		case XFS_LI_QUOTAOFF:
1535		case XFS_LI_EFD:
1536		case XFS_LI_EFI:
1537			xlog_recover_insert_item_backq(&trans->r_itemq, itemq);
1538			break;
1539		default:
1540			xlog_warn(
1541	"XFS: xlog_recover_reorder_trans: unrecognized type of log operation");
1542			ASSERT(0);
1543			return XFS_ERROR(EIO);
1544		}
1545		itemq = itemq_next;
1546	} while (first_item != itemq);
1547	return 0;
1548}
1549
1550/*
1551 * Build up the table of buf cancel records so that we don't replay
1552 * cancelled data in the second pass.  For buffer records that are
1553 * not cancel records, there is nothing to do here so we just return.
1554 *
1555 * If we get a cancel record which is already in the table, this indicates
1556 * that the buffer was cancelled multiple times.  In order to ensure
1557 * that during pass 2 we keep the record in the table until we reach its
1558 * last occurrence in the log, we keep a reference count in the cancel
1559 * record in the table to tell us how many times we expect to see this
1560 * record during the second pass.
1561 */
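/*
 * For example (hypothetical history): if the buffer at blkno 64 is
 * cancelled twice within the portion of the log being replayed, pass 1
 * ends up with a single xfs_buf_cancel_t for it with bc_refcount == 2,
 * and pass 2 only drops that entry once the second cancel record has
 * been seen.
 */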
1562STATIC void
1563xlog_recover_do_buffer_pass1(
1564	xlog_t			*log,
1565	xfs_buf_log_format_t	*buf_f)
1566{
1567	xfs_buf_cancel_t	*bcp;
1568	xfs_buf_cancel_t	*nextp;
1569	xfs_buf_cancel_t	*prevp;
1570	xfs_buf_cancel_t	**bucket;
1571	xfs_daddr_t		blkno = 0;
1572	uint			len = 0;
1573	ushort			flags = 0;
1574
1575	switch (buf_f->blf_type) {
1576	case XFS_LI_BUF:
1577		blkno = buf_f->blf_blkno;
1578		len = buf_f->blf_len;
1579		flags = buf_f->blf_flags;
1580		break;
1581	}
1582
1583	/*
1584	 * If this isn't a cancel buffer item, then just return.
1585	 */
1586	if (!(flags & XFS_BLI_CANCEL))
1587		return;
1588
1589	/*
1590	 * Insert an xfs_buf_cancel record into the hash table of
1591	 * them.  If there is already an identical record, bump
1592	 * its reference count.
1593	 */
1594	bucket = &log->l_buf_cancel_table[(__uint64_t)blkno %
1595					  XLOG_BC_TABLE_SIZE];
1596	/*
1597	 * If the hash bucket is empty then just insert a new record into
1598	 * the bucket.
1599	 */
1600	if (*bucket == NULL) {
1601		bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t),
1602						     KM_SLEEP);
1603		bcp->bc_blkno = blkno;
1604		bcp->bc_len = len;
1605		bcp->bc_refcount = 1;
1606		bcp->bc_next = NULL;
1607		*bucket = bcp;
1608		return;
1609	}
1610
	/*
	 * The hash bucket is not empty, so search for duplicates of our
	 * record.  If we find one then just bump its refcount.  If not
	 * then add us at the end of the list.
	 */
1616	prevp = NULL;
1617	nextp = *bucket;
1618	while (nextp != NULL) {
1619		if (nextp->bc_blkno == blkno && nextp->bc_len == len) {
1620			nextp->bc_refcount++;
1621			return;
1622		}
1623		prevp = nextp;
1624		nextp = nextp->bc_next;
1625	}
1626	ASSERT(prevp != NULL);
1627	bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t),
1628					     KM_SLEEP);
1629	bcp->bc_blkno = blkno;
1630	bcp->bc_len = len;
1631	bcp->bc_refcount = 1;
1632	bcp->bc_next = NULL;
1633	prevp->bc_next = bcp;
1634}
1635
1636/*
1637 * Check to see whether the buffer being recovered has a corresponding
1638 * entry in the buffer cancel record table.  If it does then return 1
1639 * so that it will be cancelled, otherwise return 0.  If the buffer is
1640 * actually a buffer cancel item (XFS_BLI_CANCEL is set), then decrement
1641 * the refcount on the entry in the table and remove it from the table
1642 * if this is the last reference.
1643 *
1644 * We remove the cancel record from the table when we encounter its
1645 * last occurrence in the log so that if the same buffer is re-used
1646 * again after its last cancellation we actually replay the changes
1647 * made at that point.
1648 */
1649STATIC int
1650xlog_check_buffer_cancelled(
1651	xlog_t			*log,
1652	xfs_daddr_t		blkno,
1653	uint			len,
1654	ushort			flags)
1655{
1656	xfs_buf_cancel_t	*bcp;
1657	xfs_buf_cancel_t	*prevp;
1658	xfs_buf_cancel_t	**bucket;
1659
1660	if (log->l_buf_cancel_table == NULL) {
1661		/*
1662		 * There is nothing in the table built in pass one,
1663		 * so this buffer must not be cancelled.
1664		 */
1665		ASSERT(!(flags & XFS_BLI_CANCEL));
1666		return 0;
1667	}
1668
1669	bucket = &log->l_buf_cancel_table[(__uint64_t)blkno %
1670					  XLOG_BC_TABLE_SIZE];
1671	bcp = *bucket;
1672	if (bcp == NULL) {
1673		/*
1674		 * There is no corresponding entry in the table built
1675		 * in pass one, so this buffer has not been cancelled.
1676		 */
1677		ASSERT(!(flags & XFS_BLI_CANCEL));
1678		return 0;
1679	}
1680
1681	/*
1682	 * Search for an entry in the buffer cancel table that
1683	 * matches our buffer.
1684	 */
1685	prevp = NULL;
1686	while (bcp != NULL) {
1687		if (bcp->bc_blkno == blkno && bcp->bc_len == len) {
1688			/*
			 * We've got a match, so return 1 so that the
1690			 * recovery of this buffer is cancelled.
1691			 * If this buffer is actually a buffer cancel
1692			 * log item, then decrement the refcount on the
1693			 * one in the table and remove it if this is the
1694			 * last reference.
1695			 */
1696			if (flags & XFS_BLI_CANCEL) {
1697				bcp->bc_refcount--;
1698				if (bcp->bc_refcount == 0) {
1699					if (prevp == NULL) {
1700						*bucket = bcp->bc_next;
1701					} else {
1702						prevp->bc_next = bcp->bc_next;
1703					}
1704					kmem_free(bcp,
1705						  sizeof(xfs_buf_cancel_t));
1706				}
1707			}
1708			return 1;
1709		}
1710		prevp = bcp;
1711		bcp = bcp->bc_next;
1712	}
1713	/*
1714	 * We didn't find a corresponding entry in the table, so
1715	 * return 0 so that the buffer is NOT cancelled.
1716	 */
1717	ASSERT(!(flags & XFS_BLI_CANCEL));
1718	return 0;
1719}
1720
1721STATIC int
1722xlog_recover_do_buffer_pass2(
1723	xlog_t			*log,
1724	xfs_buf_log_format_t	*buf_f)
1725{
1726	xfs_daddr_t		blkno = 0;
1727	ushort			flags = 0;
1728	uint			len = 0;
1729
1730	switch (buf_f->blf_type) {
1731	case XFS_LI_BUF:
1732		blkno = buf_f->blf_blkno;
1733		flags = buf_f->blf_flags;
1734		len = buf_f->blf_len;
1735		break;
1736	}
1737
1738	return xlog_check_buffer_cancelled(log, blkno, len, flags);
1739}
1740
1741/*
1742 * Perform recovery for a buffer full of inodes.  In these buffers,
1743 * the only data which should be recovered is that which corresponds
1744 * to the di_next_unlinked pointers in the on disk inode structures.
1745 * The rest of the data for the inodes is always logged through the
1746 * inodes themselves rather than the inode buffer and is recovered
1747 * in xlog_recover_do_inode_trans().
1748 *
1749 * The only time when buffers full of inodes are fully recovered is
1750 * when the buffer is full of newly allocated inodes.  In this case
1751 * the buffer will not be marked as an inode buffer and so will be
1752 * sent to xlog_recover_do_reg_buffer() below during recovery.
1753 */
1754STATIC int
1755xlog_recover_do_inode_buffer(
1756	xfs_mount_t		*mp,
1757	xlog_recover_item_t	*item,
1758	xfs_buf_t		*bp,
1759	xfs_buf_log_format_t	*buf_f)
1760{
1761	int			i;
1762	int			item_index;
1763	int			bit;
1764	int			nbits;
1765	int			reg_buf_offset;
1766	int			reg_buf_bytes;
1767	int			next_unlinked_offset;
1768	int			inodes_per_buf;
1769	xfs_agino_t		*logged_nextp;
1770	xfs_agino_t		*buffer_nextp;
1771	unsigned int		*data_map = NULL;
1772	unsigned int		map_size = 0;
1773
1774	switch (buf_f->blf_type) {
1775	case XFS_LI_BUF:
1776		data_map = buf_f->blf_data_map;
1777		map_size = buf_f->blf_map_size;
1778		break;
1779	}
1780	/*
1781	 * Set the variables corresponding to the current region to
1782	 * 0 so that we'll initialize them on the first pass through
1783	 * the loop.
1784	 */
1785	reg_buf_offset = 0;
1786	reg_buf_bytes = 0;
1787	bit = 0;
1788	nbits = 0;
1789	item_index = 0;
1790	inodes_per_buf = XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog;
1791	for (i = 0; i < inodes_per_buf; i++) {
1792		next_unlinked_offset = (i * mp->m_sb.sb_inodesize) +
1793			offsetof(xfs_dinode_t, di_next_unlinked);
1794
1795		while (next_unlinked_offset >=
1796		       (reg_buf_offset + reg_buf_bytes)) {
1797			/*
1798			 * The next di_next_unlinked field is beyond
1799			 * the current logged region.  Find the next
1800			 * logged region that contains or is beyond
1801			 * the current di_next_unlinked field.
1802			 */
1803			bit += nbits;
1804			bit = xfs_next_bit(data_map, map_size, bit);
1805
1806			/*
1807			 * If there are no more logged regions in the
1808			 * buffer, then we're done.
1809			 */
1810			if (bit == -1) {
1811				return 0;
1812			}
1813
1814			nbits = xfs_contig_bits(data_map, map_size,
1815							 bit);
1816			ASSERT(nbits > 0);
1817			reg_buf_offset = bit << XFS_BLI_SHIFT;
1818			reg_buf_bytes = nbits << XFS_BLI_SHIFT;
1819			item_index++;
1820		}
1821
1822		/*
1823		 * If the current logged region starts after the current
1824		 * di_next_unlinked field, then move on to the next
1825		 * di_next_unlinked field.
1826		 */
1827		if (next_unlinked_offset < reg_buf_offset) {
1828			continue;
1829		}
1830
1831		ASSERT(item->ri_buf[item_index].i_addr != NULL);
1832		ASSERT((item->ri_buf[item_index].i_len % XFS_BLI_CHUNK) == 0);
1833		ASSERT((reg_buf_offset + reg_buf_bytes) <= XFS_BUF_COUNT(bp));
1834
1835		/*
1836		 * The current logged region contains a copy of the
1837		 * current di_next_unlinked field.  Extract its value
1838		 * and copy it to the buffer copy.
1839		 */
1840		logged_nextp = (xfs_agino_t *)
1841			       ((char *)(item->ri_buf[item_index].i_addr) +
1842				(next_unlinked_offset - reg_buf_offset));
1843		if (unlikely(*logged_nextp == 0)) {
1844			xfs_fs_cmn_err(CE_ALERT, mp,
1845				"bad inode buffer log record (ptr = 0x%p, bp = 0x%p).  XFS trying to replay bad (0) inode di_next_unlinked field",
1846				item, bp);
1847			XFS_ERROR_REPORT("xlog_recover_do_inode_buf",
1848					 XFS_ERRLEVEL_LOW, mp);
1849			return XFS_ERROR(EFSCORRUPTED);
1850		}
1851
1852		buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp,
1853					      next_unlinked_offset);
1854		*buffer_nextp = *logged_nextp;
1855	}
1856
1857	return 0;
1858}
1859
1860/*
1861 * Perform a 'normal' buffer recovery.  Each logged region of the
1862 * buffer should be copied over the corresponding region in the
1863 * given buffer.  The bitmap in the buf log format structure indicates
1864 * where to place the logged data.
1865 */
1866/*ARGSUSED*/
1867STATIC void
1868xlog_recover_do_reg_buffer(
1869	xlog_recover_item_t	*item,
1870	xfs_buf_t		*bp,
1871	xfs_buf_log_format_t	*buf_f)
1872{
1873	int			i;
1874	int			bit;
1875	int			nbits;
1876	unsigned int		*data_map = NULL;
1877	unsigned int		map_size = 0;
1878	int                     error;
1879
1880	switch (buf_f->blf_type) {
1881	case XFS_LI_BUF:
1882		data_map = buf_f->blf_data_map;
1883		map_size = buf_f->blf_map_size;
1884		break;
1885	}
1886	bit = 0;
1887	i = 1;  /* 0 is the buf format structure */
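	/*
	 * Walk the dirty bitmap; each run of set bits describes one logged
	 * region (in XFS_BLI_CHUNK-sized units) to copy into the buffer.
	 */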
1888	while (1) {
1889		bit = xfs_next_bit(data_map, map_size, bit);
1890		if (bit == -1)
1891			break;
1892		nbits = xfs_contig_bits(data_map, map_size, bit);
1893		ASSERT(nbits > 0);
1894		ASSERT(item->ri_buf[i].i_addr != NULL);
1895		ASSERT(item->ri_buf[i].i_len % XFS_BLI_CHUNK == 0);
1896		ASSERT(XFS_BUF_COUNT(bp) >=
1897		       ((uint)bit << XFS_BLI_SHIFT)+(nbits<<XFS_BLI_SHIFT));
1898
1899		/*
1900		 * Do a sanity check if this is a dquot buffer. Just checking
1901		 * the first dquot in the buffer should do. XXX This is
1902		 * probably a good thing to do for other buf types also.
1903		 */
1904		error = 0;
1905		if (buf_f->blf_flags &
1906		   (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) {
1907			error = xfs_qm_dqcheck((xfs_disk_dquot_t *)
1908					       item->ri_buf[i].i_addr,
1909					       -1, 0, XFS_QMOPT_DOWARN,
1910					       "dquot_buf_recover");
1911		}
1912		if (!error)
1913			memcpy(xfs_buf_offset(bp,
1914				(uint)bit << XFS_BLI_SHIFT),	/* dest */
1915				item->ri_buf[i].i_addr,		/* source */
1916				nbits<<XFS_BLI_SHIFT);		/* length */
1917		i++;
1918		bit += nbits;
1919	}
1920
1921	/* Shouldn't be any more regions */
1922	ASSERT(i == item->ri_total);
1923}
1924
1925/*
1926 * Do some primitive error checking on on-disk dquot data structures.
1927 */
1928int
1929xfs_qm_dqcheck(
1930	xfs_disk_dquot_t *ddq,
1931	xfs_dqid_t	 id,
1932	uint		 type,	  /* used only when XFS_QMOPT_DQREPAIR is set */
1933	uint		 flags,
1934	char		 *str)
1935{
1936	xfs_dqblk_t	 *d = (xfs_dqblk_t *)ddq;
1937	int		errs = 0;
1938
1939	/*
1940	 * We can encounter an uninitialized dquot buffer for 2 reasons:
1941	 * 1. If we crash while deleting the quotainode(s), and those blks got
1942	 *    used for user data. This is because we take the path of regular
1943	 *    file deletion; however, the size field of quotainodes is never
1944	 *    updated, so all the tricks that we play in itruncate_finish
1945	 *    don't quite matter.
1946	 *
1947	 * 2. We don't replay the quota buffers when there's a quotaoff logitem.
1948	 *    But the allocation will be replayed so we'll end up with an
1949	 *    uninitialized quota block.
1950	 *
1951	 * This is all fine; things are still consistent, and we haven't lost
1952	 * any quota information. Just don't complain about bad dquot blks.
1953	 */
1954	if (be16_to_cpu(ddq->d_magic) != XFS_DQUOT_MAGIC) {
1955		if (flags & XFS_QMOPT_DOWARN)
1956			cmn_err(CE_ALERT,
1957			"%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x",
1958			str, id, be16_to_cpu(ddq->d_magic), XFS_DQUOT_MAGIC);
1959		errs++;
1960	}
1961	if (ddq->d_version != XFS_DQUOT_VERSION) {
1962		if (flags & XFS_QMOPT_DOWARN)
1963			cmn_err(CE_ALERT,
1964			"%s : XFS dquot ID 0x%x, version 0x%x != 0x%x",
1965			str, id, ddq->d_version, XFS_DQUOT_VERSION);
1966		errs++;
1967	}
1968
1969	if (ddq->d_flags != XFS_DQ_USER &&
1970	    ddq->d_flags != XFS_DQ_PROJ &&
1971	    ddq->d_flags != XFS_DQ_GROUP) {
1972		if (flags & XFS_QMOPT_DOWARN)
1973			cmn_err(CE_ALERT,
1974			"%s : XFS dquot ID 0x%x, unknown flags 0x%x",
1975			str, id, ddq->d_flags);
1976		errs++;
1977	}
1978
1979	if (id != -1 && id != be32_to_cpu(ddq->d_id)) {
1980		if (flags & XFS_QMOPT_DOWARN)
1981			cmn_err(CE_ALERT,
1982			"%s : ondisk-dquot 0x%p, ID mismatch: "
1983			"0x%x expected, found id 0x%x",
1984			str, ddq, id, be32_to_cpu(ddq->d_id));
1985		errs++;
1986	}
1987
1988	if (!errs && ddq->d_id) {
1989		if (ddq->d_blk_softlimit &&
1990		    be64_to_cpu(ddq->d_bcount) >=
1991				be64_to_cpu(ddq->d_blk_softlimit)) {
1992			if (!ddq->d_btimer) {
1993				if (flags & XFS_QMOPT_DOWARN)
1994					cmn_err(CE_ALERT,
1995					"%s : Dquot ID 0x%x (0x%p) "
1996					"BLK TIMER NOT STARTED",
1997					str, (int)be32_to_cpu(ddq->d_id), ddq);
1998				errs++;
1999			}
2000		}
2001		if (ddq->d_ino_softlimit &&
2002		    be64_to_cpu(ddq->d_icount) >=
2003				be64_to_cpu(ddq->d_ino_softlimit)) {
2004			if (!ddq->d_itimer) {
2005				if (flags & XFS_QMOPT_DOWARN)
2006					cmn_err(CE_ALERT,
2007					"%s : Dquot ID 0x%x (0x%p) "
2008					"INODE TIMER NOT STARTED",
2009					str, (int)be32_to_cpu(ddq->d_id), ddq);
2010				errs++;
2011			}
2012		}
2013		if (ddq->d_rtb_softlimit &&
2014		    be64_to_cpu(ddq->d_rtbcount) >=
2015				be64_to_cpu(ddq->d_rtb_softlimit)) {
2016			if (!ddq->d_rtbtimer) {
2017				if (flags & XFS_QMOPT_DOWARN)
2018					cmn_err(CE_ALERT,
2019					"%s : Dquot ID 0x%x (0x%p) "
2020					"RTBLK TIMER NOT STARTED",
2021					str, (int)be32_to_cpu(ddq->d_id), ddq);
2022				errs++;
2023			}
2024		}
2025	}
2026
2027	if (!errs || !(flags & XFS_QMOPT_DQREPAIR))
2028		return errs;
2029
2030	if (flags & XFS_QMOPT_DOWARN)
2031		cmn_err(CE_NOTE, "Re-initializing dquot ID 0x%x", id);
2032
2033	/*
2034	 * Typically, a repair is only requested by quotacheck.
2035	 */
2036	ASSERT(id != -1);
2037	ASSERT(flags & XFS_QMOPT_DQREPAIR);
2038	memset(d, 0, sizeof(xfs_dqblk_t));
2039
2040	d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
2041	d->dd_diskdq.d_version = XFS_DQUOT_VERSION;
2042	d->dd_diskdq.d_flags = type;
2043	d->dd_diskdq.d_id = cpu_to_be32(id);
2044
2045	return errs;
2046}
2047
2048/*
2049 * Perform a dquot buffer recovery.
2050 * Simple algorithm: if we have found a QUOTAOFF logitem of the same type
2051 * (i.e. USR, PRJ or GRP), then just toss this buffer away; don't recover it.
2052 * Else, treat it as a regular buffer and do recovery.
2053 */
2054STATIC void
2055xlog_recover_do_dquot_buffer(
2056	xfs_mount_t		*mp,
2057	xlog_t			*log,
2058	xlog_recover_item_t	*item,
2059	xfs_buf_t		*bp,
2060	xfs_buf_log_format_t	*buf_f)
2061{
2062	uint			type;
2063
2064	/*
2065	 * Filesystems are required to send in quota flags at mount time.
2066	 */
2067	if (mp->m_qflags == 0) {
2068		return;
2069	}
2070
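	/*
	 * Work out which quota types this buffer carries from the buf log
	 * format flags so we can compare against any quotaoff we noted.
	 */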
2071	type = 0;
2072	if (buf_f->blf_flags & XFS_BLI_UDQUOT_BUF)
2073		type |= XFS_DQ_USER;
2074	if (buf_f->blf_flags & XFS_BLI_PDQUOT_BUF)
2075		type |= XFS_DQ_PROJ;
2076	if (buf_f->blf_flags & XFS_BLI_GDQUOT_BUF)
2077		type |= XFS_DQ_GROUP;
2078	/*
2079	 * This type of quota was turned off, so ignore this buffer
2080	 */
2081	if (log->l_quotaoffs_flag & type)
2082		return;
2083
2084	xlog_recover_do_reg_buffer(item, bp, buf_f);
2085}
2086
2087/*
2088 * This routine replays a modification made to a buffer at runtime.
2089 * There are actually two types of buffer, regular and inode, and
2090 * they are handled differently.  From inode buffers we recover only
2091 * a specific set of data, namely the inode di_next_unlinked fields.
2092 * This is because all other inode data is logged via inode records,
2093 * and any data we replay here which overlaps that data may be
2094 * stale.
2095 *
2096 * When meta-data buffers are freed at run time we log a buffer item
2097 * with the XFS_BLI_CANCEL bit set to indicate that previous copies
2098 * of the buffer in the log should not be replayed at recovery time.
2099 * This is so that if the blocks covered by the buffer are reused for
2100 * file data before we crash we don't end up replaying old, freed
2101 * meta-data into a user's file.
2102 *
2103 * To handle the cancellation of buffer log items, we make two passes
2104 * over the log during recovery.  During the first we build a table of
2105 * those buffers which have been cancelled, and during the second we
2106 * only replay those buffers which do not have corresponding cancel
2107 * records in the table.  See xlog_recover_do_buffer_pass[1,2] above
2108 * for more details on the implementation of the table of cancel records.
2109 */
2110STATIC int
2111xlog_recover_do_buffer_trans(
2112	xlog_t			*log,
2113	xlog_recover_item_t	*item,
2114	int			pass)
2115{
2116	xfs_buf_log_format_t	*buf_f;
2117	xfs_mount_t		*mp;
2118	xfs_buf_t		*bp;
2119	int			error;
2120	int			cancel;
2121	xfs_daddr_t		blkno;
2122	int			len;
2123	ushort			flags;
2124
2125	buf_f = (xfs_buf_log_format_t *)item->ri_buf[0].i_addr;
2126
2127	if (pass == XLOG_RECOVER_PASS1) {
2128		/*
2129		 * In this pass we're only looking for buf items
2130		 * with the XFS_BLI_CANCEL bit set.
2131		 */
2132		xlog_recover_do_buffer_pass1(log, buf_f);
2133		return 0;
2134	} else {
2135		/*
2136		 * In this pass we want to recover all the buffers
2137		 * which have not been cancelled and are not
2138		 * cancellation buffers themselves.  The routine
2139		 * we call here will tell us whether or not to
2140		 * continue with the replay of this buffer.
2141		 */
2142		cancel = xlog_recover_do_buffer_pass2(log, buf_f);
2143		if (cancel) {
2144			return 0;
2145		}
2146	}
2147	switch (buf_f->blf_type) {
2148	case XFS_LI_BUF:
2149		blkno = buf_f->blf_blkno;
2150		len = buf_f->blf_len;
2151		flags = buf_f->blf_flags;
2152		break;
2153	default:
2154		xfs_fs_cmn_err(CE_ALERT, log->l_mp,
2155			"xfs_log_recover: unknown buffer type 0x%x, logdev %s",
2156			buf_f->blf_type, log->l_mp->m_logname ?
2157			log->l_mp->m_logname : "internal");
2158		XFS_ERROR_REPORT("xlog_recover_do_buffer_trans",
2159				 XFS_ERRLEVEL_LOW, log->l_mp);
2160		return XFS_ERROR(EFSCORRUPTED);
2161	}
2162
2163	mp = log->l_mp;
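	/*
	 * Read in the buffer that the log item describes so the logged
	 * regions can be copied over it below.
	 */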
2164	if (flags & XFS_BLI_INODE_BUF) {
2165		bp = xfs_buf_read_flags(mp->m_ddev_targp, blkno, len,
2166								XFS_BUF_LOCK);
2167	} else {
2168		bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, 0);
2169	}
2170	if (XFS_BUF_ISERROR(bp)) {
2171		xfs_ioerror_alert("xlog_recover_do..(read#1)", log->l_mp,
2172				  bp, blkno);
2173		error = XFS_BUF_GETERROR(bp);
2174		xfs_buf_relse(bp);
2175		return error;
2176	}
2177
2178	error = 0;
2179	if (flags & XFS_BLI_INODE_BUF) {
2180		error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
2181	} else if (flags &
2182		  (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) {
2183		xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
2184	} else {
2185		xlog_recover_do_reg_buffer(item, bp, buf_f);
2186	}
2187	if (error)
2188		return XFS_ERROR(error);
2189
2190	/*
2191	 * Perform delayed write on the buffer.  Asynchronous writes will be
2192	 * slower when taking into account all the buffers to be flushed.
2193	 *
2194	 * Also make sure that only inode buffers with good sizes stay in
2195	 * the buffer cache.  The kernel moves inodes in buffers of 1 block
2196	 * or XFS_INODE_CLUSTER_SIZE bytes, whichever is bigger.  The inode
2197	 * buffers in the log can be a different size if the log was generated
2198	 * by an older kernel using unclustered inode buffers or a newer kernel
2199	 * running with a different inode cluster size.  Regardless, if the
2200	 * inode buffer size isn't MAX(blocksize, XFS_INODE_CLUSTER_SIZE)
2201	 * for *our* value of XFS_INODE_CLUSTER_SIZE, then we need to keep
2202	 * the buffer out of the buffer cache so that the buffer won't
2203	 * overlap with future reads of those inodes.
2204	 */
2205	if (XFS_DINODE_MAGIC ==
2206	    INT_GET(*((__uint16_t *)(xfs_buf_offset(bp, 0))), ARCH_CONVERT) &&
2207	    (XFS_BUF_COUNT(bp) != MAX(log->l_mp->m_sb.sb_blocksize,
2208			(__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) {
2209		XFS_BUF_STALE(bp);
2210		error = xfs_bwrite(mp, bp);
2211	} else {
2212		ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL ||
2213		       XFS_BUF_FSPRIVATE(bp, xfs_mount_t *) == mp);
2214		XFS_BUF_SET_FSPRIVATE(bp, mp);
2215		XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
2216		xfs_bdwrite(mp, bp);
2217	}
2218
2219	return (error);
2220}
2221
2222STATIC int
2223xlog_recover_do_inode_trans(
2224	xlog_t			*log,
2225	xlog_recover_item_t	*item,
2226	int			pass)
2227{
2228	xfs_inode_log_format_t	*in_f;
2229	xfs_mount_t		*mp;
2230	xfs_buf_t		*bp;
2231	xfs_imap_t		imap;
2232	xfs_dinode_t		*dip;
2233	xfs_ino_t		ino;
2234	int			len;
2235	xfs_caddr_t		src;
2236	xfs_caddr_t		dest;
2237	int			error;
2238	int			attr_index;
2239	uint			fields;
2240	xfs_dinode_core_t	*dicp;
2241	int			need_free = 0;
2242
2243	if (pass == XLOG_RECOVER_PASS1) {
2244		return 0;
2245	}
2246
2247	if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) {
2248		in_f = (xfs_inode_log_format_t *)item->ri_buf[0].i_addr;
2249	} else {
2250		in_f = (xfs_inode_log_format_t *)kmem_alloc(
2251			sizeof(xfs_inode_log_format_t), KM_SLEEP);
2252		need_free = 1;
2253		error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f);
2254		if (error)
2255			goto error;
2256	}
2257	ino = in_f->ilf_ino;
2258	mp = log->l_mp;
2259	if (ITEM_TYPE(item) == XFS_LI_INODE) {
2260		imap.im_blkno = (xfs_daddr_t)in_f->ilf_blkno;
2261		imap.im_len = in_f->ilf_len;
2262		imap.im_boffset = in_f->ilf_boffset;
2263	} else {
2264		/*
2265		 * It's an old inode format record.  We don't know where
2266		 * its cluster is located on disk, and we can't allow
2267		 * xfs_imap() to figure it out because the inode btrees
2268		 * are not ready to be used.  Therefore do not pass the
2269		 * XFS_IMAP_LOOKUP flag to xfs_imap().  This will give
2270		 * us only the single block in which the inode lives
2271		 * rather than its cluster, so we must make sure to
2272		 * invalidate the buffer when we write it out below.
2273		 */
2274		imap.im_blkno = 0;
2275		xfs_imap(log->l_mp, NULL, ino, &imap, 0);
2276	}
2277
2278	/*
2279	 * Inode buffers can be freed; look out for that case
2280	 * and do not replay the inode.
2281	 */
2282	if (xlog_check_buffer_cancelled(log, imap.im_blkno, imap.im_len, 0)) {
2283		error = 0;
2284		goto error;
2285	}
2286
2287	bp = xfs_buf_read_flags(mp->m_ddev_targp, imap.im_blkno, imap.im_len,
2288								XFS_BUF_LOCK);
2289	if (XFS_BUF_ISERROR(bp)) {
2290		xfs_ioerror_alert("xlog_recover_do..(read#2)", mp,
2291				  bp, imap.im_blkno);
2292		error = XFS_BUF_GETERROR(bp);
2293		xfs_buf_relse(bp);
2294		goto error;
2295	}
2296	error = 0;
2297	ASSERT(in_f->ilf_fields & XFS_ILOG_CORE);
2298	dip = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset);
2299
2300	/*
2301	 * Make sure the place we're flushing out to really looks
2302	 * like an inode!
2303	 */
2304	if (unlikely(INT_GET(dip->di_core.di_magic, ARCH_CONVERT) != XFS_DINODE_MAGIC)) {
2305		xfs_buf_relse(bp);
2306		xfs_fs_cmn_err(CE_ALERT, mp,
2307			"xfs_inode_recover: Bad inode magic number, dino ptr = 0x%p, dino bp = 0x%p, ino = %Ld",
2308			dip, bp, ino);
2309		XFS_ERROR_REPORT("xlog_recover_do_inode_trans(1)",
2310				 XFS_ERRLEVEL_LOW, mp);
2311		error = EFSCORRUPTED;
2312		goto error;
2313	}
2314	dicp = (xfs_dinode_core_t*)(item->ri_buf[1].i_addr);
2315	if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) {
2316		xfs_buf_relse(bp);
2317		xfs_fs_cmn_err(CE_ALERT, mp,
2318			"xfs_inode_recover: Bad inode log record, rec ptr 0x%p, ino %Ld",
2319			item, ino);
2320		XFS_ERROR_REPORT("xlog_recover_do_inode_trans(2)",
2321				 XFS_ERRLEVEL_LOW, mp);
2322		error = EFSCORRUPTED;
2323		goto error;
2324	}
2325
2326	/* Skip replay when the on-disk inode is newer than the log one */
2327	if (dicp->di_flushiter <
2328	    INT_GET(dip->di_core.di_flushiter, ARCH_CONVERT)) {
2329		/*
2330		 * Deal with the wrap case: di_flushiter wraps after
2331		 * DI_MAX_FLUSH, so a small log value may be newer.
2332		 */
2333		if ((INT_GET(dip->di_core.di_flushiter, ARCH_CONVERT)
2334							== DI_MAX_FLUSH) &&
2335		    (dicp->di_flushiter < (DI_MAX_FLUSH>>1))) {
2336			/* do nothing */
2337		} else {
2338			xfs_buf_relse(bp);
2339			error = 0;
2340			goto error;
2341		}
2342	}
2343	/* Take the opportunity to reset the flush iteration count */
2344	dicp->di_flushiter = 0;
2345
2346	if (unlikely((dicp->di_mode & S_IFMT) == S_IFREG)) {
2347		if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
2348		    (dicp->di_format != XFS_DINODE_FMT_BTREE)) {
2349			XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(3)",
2350					 XFS_ERRLEVEL_LOW, mp, dicp);
2351			xfs_buf_relse(bp);
2352			xfs_fs_cmn_err(CE_ALERT, mp,
2353				"xfs_inode_recover: Bad regular inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2354				item, dip, bp, ino);
2355			error = EFSCORRUPTED;
2356			goto error;
2357		}
2358	} else if (unlikely((dicp->di_mode & S_IFMT) == S_IFDIR)) {
2359		if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
2360		    (dicp->di_format != XFS_DINODE_FMT_BTREE) &&
2361		    (dicp->di_format != XFS_DINODE_FMT_LOCAL)) {
2362			XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(4)",
2363					     XFS_ERRLEVEL_LOW, mp, dicp);
2364			xfs_buf_relse(bp);
2365			xfs_fs_cmn_err(CE_ALERT, mp,
2366				"xfs_inode_recover: Bad dir inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2367				item, dip, bp, ino);
2368			error = EFSCORRUPTED;
2369			goto error;
2370		}
2371	}
2372	if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){
2373		XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(5)",
2374				     XFS_ERRLEVEL_LOW, mp, dicp);
2375		xfs_buf_relse(bp);
2376		xfs_fs_cmn_err(CE_ALERT, mp,
2377			"xfs_inode_recover: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld",
2378			item, dip, bp, ino,
2379			dicp->di_nextents + dicp->di_anextents,
2380			dicp->di_nblocks);
2381		error = EFSCORRUPTED;
2382		goto error;
2383	}
2384	if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) {
2385		XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(6)",
2386				     XFS_ERRLEVEL_LOW, mp, dicp);
2387		xfs_buf_relse(bp);
2388		xfs_fs_cmn_err(CE_ALERT, mp,
2389			"xfs_inode_recover: Bad inode log rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, forkoff 0x%x",
2390			item, dip, bp, ino, dicp->di_forkoff);
2391		error = EFSCORRUPTED;
2392		goto error;
2393	}
2394	if (unlikely(item->ri_buf[1].i_len > sizeof(xfs_dinode_core_t))) {
2395		XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(7)",
2396				     XFS_ERRLEVEL_LOW, mp, dicp);
2397		xfs_buf_relse(bp);
2398		xfs_fs_cmn_err(CE_ALERT, mp,
2399			"xfs_inode_recover: Bad inode log record length %d, rec ptr 0x%p",
2400			item->ri_buf[1].i_len, item);
2401		error = EFSCORRUPTED;
2402		goto error;
2403	}
2404
2405	/* The core is in in-core format */
2406	xfs_xlate_dinode_core((xfs_caddr_t)&dip->di_core,
2407			      (xfs_dinode_core_t*)item->ri_buf[1].i_addr, -1);
2408
2409	/* the rest is in on-disk format */
2410	if (item->ri_buf[1].i_len > sizeof(xfs_dinode_core_t)) {
2411		memcpy((xfs_caddr_t) dip + sizeof(xfs_dinode_core_t),
2412			item->ri_buf[1].i_addr + sizeof(xfs_dinode_core_t),
2413			item->ri_buf[1].i_len  - sizeof(xfs_dinode_core_t));
2414	}
2415
2416	fields = in_f->ilf_fields;
2417	switch (fields & (XFS_ILOG_DEV | XFS_ILOG_UUID)) {
2418	case XFS_ILOG_DEV:
2419		INT_SET(dip->di_u.di_dev, ARCH_CONVERT, in_f->ilf_u.ilfu_rdev);
2420
2421		break;
2422	case XFS_ILOG_UUID:
2423		dip->di_u.di_muuid = in_f->ilf_u.ilfu_uuid;
2424		break;
2425	}
2426
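	/*
	 * Region 0 of the item is the inode log format and region 1 the
	 * inode core; an ilf_size of 2 means no data or attr fork data was
	 * logged, so the inode buffer can be written out now.
	 */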
2427	if (in_f->ilf_size == 2)
2428		goto write_inode_buffer;
2429	len = item->ri_buf[2].i_len;
2430	src = item->ri_buf[2].i_addr;
2431	ASSERT(in_f->ilf_size <= 4);
2432	ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK));
2433	ASSERT(!(fields & XFS_ILOG_DFORK) ||
2434	       (len == in_f->ilf_dsize));
2435
2436	switch (fields & XFS_ILOG_DFORK) {
2437	case XFS_ILOG_DDATA:
2438	case XFS_ILOG_DEXT:
2439		memcpy(&dip->di_u, src, len);
2440		break;
2441
2442	case XFS_ILOG_DBROOT:
2443		xfs_bmbt_to_bmdr((xfs_bmbt_block_t *)src, len,
2444				 &(dip->di_u.di_bmbt),
2445				 XFS_DFORK_DSIZE(dip, mp));
2446		break;
2447
2448	default:
2449		/*
2450		 * There are no data fork flags set.
2451		 */
2452		ASSERT((fields & XFS_ILOG_DFORK) == 0);
2453		break;
2454	}
2455
2456	/*
2457	 * If we logged any attribute data, recover it.  There may or
2458	 * may not have been any other non-core data logged in this
2459	 * transaction.
2460	 */
2461	if (in_f->ilf_fields & XFS_ILOG_AFORK) {
2462		if (in_f->ilf_fields & XFS_ILOG_DFORK) {
2463			attr_index = 3;
2464		} else {
2465			attr_index = 2;
2466		}
2467		len = item->ri_buf[attr_index].i_len;
2468		src = item->ri_buf[attr_index].i_addr;
2469		ASSERT(len == in_f->ilf_asize);
2470
2471		switch (in_f->ilf_fields & XFS_ILOG_AFORK) {
2472		case XFS_ILOG_ADATA:
2473		case XFS_ILOG_AEXT:
2474			dest = XFS_DFORK_APTR(dip);
2475			ASSERT(len <= XFS_DFORK_ASIZE(dip, mp));
2476			memcpy(dest, src, len);
2477			break;
2478
2479		case XFS_ILOG_ABROOT:
2480			dest = XFS_DFORK_APTR(dip);
2481			xfs_bmbt_to_bmdr((xfs_bmbt_block_t *)src, len,
2482					 (xfs_bmdr_block_t*)dest,
2483					 XFS_DFORK_ASIZE(dip, mp));
2484			break;
2485
2486		default:
2487			xlog_warn("XFS: xlog_recover_do_inode_trans: Invalid flag");
2488			ASSERT(0);
2489			xfs_buf_relse(bp);
2490			error = EIO;
2491			goto error;
2492		}
2493	}
2494
2495write_inode_buffer:
2496	if (ITEM_TYPE(item) == XFS_LI_INODE) {
2497		ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL ||
2498		       XFS_BUF_FSPRIVATE(bp, xfs_mount_t *) == mp);
2499		XFS_BUF_SET_FSPRIVATE(bp, mp);
2500		XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
2501		xfs_bdwrite(mp, bp);
2502	} else {
2503		XFS_BUF_STALE(bp);
2504		error = xfs_bwrite(mp, bp);
2505	}
2506
2507error:
2508	if (need_free)
2509		kmem_free(in_f, sizeof(*in_f));
2510	return XFS_ERROR(error);
2511}
2512
2513/*
2514 * Recover QUOTAOFF records. We simply make a note of it in the xlog_t
2515 * structure, so that we know not to do any dquot item or dquot buffer
2516 * recovery of that type.
2517 */
2518STATIC int
2519xlog_recover_do_quotaoff_trans(
2520	xlog_t			*log,
2521	xlog_recover_item_t	*item,
2522	int			pass)
2523{
2524	xfs_qoff_logformat_t	*qoff_f;
2525
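	/*
	 * Quotaoff records are noted during pass 1 so that pass 2 dquot
	 * and dquot buffer recovery can be skipped for the affected quota
	 * types; there is nothing left to do here in pass 2.
	 */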
2526	if (pass == XLOG_RECOVER_PASS2) {
2527		return (0);
2528	}
2529
2530	qoff_f = (xfs_qoff_logformat_t *)item->ri_buf[0].i_addr;
2531	ASSERT(qoff_f);
2532
2533	/*
2534	 * The logitem format's flag tells us if this was user quotaoff,
2535	 * group/project quotaoff or both.
2536	 */
2537	if (qoff_f->qf_flags & XFS_UQUOTA_ACCT)
2538		log->l_quotaoffs_flag |= XFS_DQ_USER;
2539	if (qoff_f->qf_flags & XFS_PQUOTA_ACCT)
2540		log->l_quotaoffs_flag |= XFS_DQ_PROJ;
2541	if (qoff_f->qf_flags & XFS_GQUOTA_ACCT)
2542		log->l_quotaoffs_flag |= XFS_DQ_GROUP;
2543
2544	return (0);
2545}
2546
2547/*
2548 * Recover a dquot record
2549 */
2550STATIC int
2551xlog_recover_do_dquot_trans(
2552	xlog_t			*log,
2553	xlog_recover_item_t	*item,
2554	int			pass)
2555{
2556	xfs_mount_t		*mp;
2557	xfs_buf_t		*bp;
2558	struct xfs_disk_dquot	*ddq, *recddq;
2559	int			error;
2560	xfs_dq_logformat_t	*dq_f;
2561	uint			type;
2562
2563	if (pass == XLOG_RECOVER_PASS1) {
2564		return 0;
2565	}
2566	mp = log->l_mp;
2567
2568	/*
2569	 * Filesystems are required to send in quota flags at mount time.
2570	 */
2571	if (mp->m_qflags == 0)
2572		return (0);
2573
2574	recddq = (xfs_disk_dquot_t *)item->ri_buf[1].i_addr;
2575	ASSERT(recddq);
2576	/*
2577	 * This type of quota was turned off, so ignore this record.
2578	 */
2579	type = INT_GET(recddq->d_flags, ARCH_CONVERT) &
2580			(XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP);
2581	ASSERT(type);
2582	if (log->l_quotaoffs_flag & type)
2583		return (0);
2584
2585	/*
2586	 * At this point we know that quota was _not_ turned off.
2587	 * Since the mount flags are not indicating to us otherwise, this
2588	 * must mean that quota is on, and the dquot needs to be replayed.
2589	 * Remember that we may not have fully recovered the superblock yet,
2590	 * so we can't do the usual trick of looking at the SB quota bits.
2591	 *
2592	 * The other possibility, of course, is that the quota subsystem was
2593	 * removed since the last mount - ENOSYS.
2594	 */
2595	dq_f = (xfs_dq_logformat_t *)item->ri_buf[0].i_addr;
2596	ASSERT(dq_f);
2597	if ((error = xfs_qm_dqcheck(recddq,
2598			   dq_f->qlf_id,
2599			   0, XFS_QMOPT_DOWARN,
2600			   "xlog_recover_do_dquot_trans (log copy)"))) {
2601		return XFS_ERROR(EIO);
2602	}
2603	ASSERT(dq_f->qlf_len == 1);
2604
2605	error = xfs_read_buf(mp, mp->m_ddev_targp,
2606			     dq_f->qlf_blkno,
2607			     XFS_FSB_TO_BB(mp, dq_f->qlf_len),
2608			     0, &bp);
2609	if (error) {
2610		xfs_ioerror_alert("xlog_recover_do..(read#3)", mp,
2611				  bp, dq_f->qlf_blkno);
2612		return error;
2613	}
2614	ASSERT(bp);
2615	ddq = (xfs_disk_dquot_t *)xfs_buf_offset(bp, dq_f->qlf_boffset);
2616
2617	/*
2618	 * At least the magic num portion should be on disk because this
2619	 * was among a chunk of dquots created earlier, and we did some
2620	 * minimal initialization then.
2621	 */
2622	if (xfs_qm_dqcheck(ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
2623			   "xlog_recover_do_dquot_trans")) {
2624		xfs_buf_relse(bp);
2625		return XFS_ERROR(EIO);
2626	}
2627
2628	memcpy(ddq, recddq, item->ri_buf[1].i_len);
2629
2630	ASSERT(dq_f->qlf_size == 2);
2631	ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL ||
2632	       XFS_BUF_FSPRIVATE(bp, xfs_mount_t *) == mp);
2633	XFS_BUF_SET_FSPRIVATE(bp, mp);
2634	XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
2635	xfs_bdwrite(mp, bp);
2636
2637	return (0);
2638}
2639
2640/*
2641 * This routine is called to create an in-core extent free intent
2642 * item from the efi format structure which was logged on disk.
2643 * It allocates an in-core efi, copies the extents from the format
2644 * structure into it, and adds the efi to the AIL with the given
2645 * LSN.
2646 */
2647STATIC int
2648xlog_recover_do_efi_trans(
2649	xlog_t			*log,
2650	xlog_recover_item_t	*item,
2651	xfs_lsn_t		lsn,
2652	int			pass)
2653{
2654	int			error;
2655	xfs_mount_t		*mp;
2656	xfs_efi_log_item_t	*efip;
2657	xfs_efi_log_format_t	*efi_formatp;
2658	SPLDECL(s);
2659
2660	if (pass == XLOG_RECOVER_PASS1) {
2661		return 0;
2662	}
2663
2664	efi_formatp = (xfs_efi_log_format_t *)item->ri_buf[0].i_addr;
2665
2666	mp = log->l_mp;
2667	efip = xfs_efi_init(mp, efi_formatp->efi_nextents);
2668	if ((error = xfs_efi_copy_format(&(item->ri_buf[0]),
2669					 &(efip->efi_format)))) {
2670		xfs_efi_item_free(efip);
2671		return error;
2672	}
2673	efip->efi_next_extent = efi_formatp->efi_nextents;
2674	efip->efi_flags |= XFS_EFI_COMMITTED;
2675
2676	AIL_LOCK(mp,s);
2677	/*
2678	 * xfs_trans_update_ail() drops the AIL lock.
2679	 */
2680	xfs_trans_update_ail(mp, (xfs_log_item_t *)efip, lsn, s);
2681	return 0;
2682}
2683
2684
2685/*
2686 * This routine is called when an efd format structure is found in
2687 * a committed transaction in the log.  Its purpose is to cancel
2688 * the corresponding efi if it was still in the log.  To do this
2689 * it searches the AIL for the efi with an id equal to that in the
2690 * efd format structure.  If we find it, we remove the efi from the
2691 * AIL and free it.
2692 */
2693STATIC void
2694xlog_recover_do_efd_trans(
2695	xlog_t			*log,
2696	xlog_recover_item_t	*item,
2697	int			pass)
2698{
2699	xfs_mount_t		*mp;
2700	xfs_efd_log_format_t	*efd_formatp;
2701	xfs_efi_log_item_t	*efip = NULL;
2702	xfs_log_item_t		*lip;
2703	int			gen;
2704	__uint64_t		efi_id;
2705	SPLDECL(s);
2706
2707	if (pass == XLOG_RECOVER_PASS1) {
2708		return;
2709	}
2710
2711	efd_formatp = (xfs_efd_log_format_t *)item->ri_buf[0].i_addr;
2712	ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) +
2713		((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) ||
2714	       (item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_64_t) +
2715		((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_64_t)))));
2716	efi_id = efd_formatp->efd_efi_id;
2717
2718	/*
2719	 * Search for the efi with the id in the efd format structure
2720	 * in the AIL.
2721	 */
2722	mp = log->l_mp;
2723	AIL_LOCK(mp,s);
2724	lip = xfs_trans_first_ail(mp, &gen);
2725	while (lip != NULL) {
2726		if (lip->li_type == XFS_LI_EFI) {
2727			efip = (xfs_efi_log_item_t *)lip;
2728			if (efip->efi_format.efi_id == efi_id) {
2729				/*
2730				 * xfs_trans_delete_ail() drops the
2731				 * AIL lock.
2732				 */
2733				xfs_trans_delete_ail(mp, lip, s);
2734				break;
2735			}
2736		}
2737		lip = xfs_trans_next_ail(mp, lip, &gen, NULL);
2738	}
2739
2740	/*
2741	 * If we found it, then free it up.  If it wasn't there, it
2742	 * must have been overwritten in the log.  Oh well.
2743	 */
2744	if (lip != NULL) {
2745		xfs_efi_item_free(efip);
2746	} else {
2747		AIL_UNLOCK(mp, s);
2748	}
2749}
2750
2751/*
2752 * Perform the transaction
2753 *
2754 * If the transaction modifies a buffer or inode, do it now.  Otherwise,
2755 * EFIs and EFDs get queued up by adding entries into the AIL for them.
2756 */
2757STATIC int
2758xlog_recover_do_trans(
2759	xlog_t			*log,
2760	xlog_recover_t		*trans,
2761	int			pass)
2762{
2763	int			error = 0;
2764	xlog_recover_item_t	*item, *first_item;
2765
2766	if ((error = xlog_recover_reorder_trans(trans)))
2767		return error;
2768	first_item = item = trans->r_itemq;
2769	do {
2770		/*
2771		 * We don't need to worry about the block number being
2772		 * truncated in > 1 TB buffers because, in user-land,
2773		 * we're now n32 or 64-bit, so xfs_daddr_t is 64 bits and
2774		 * the blknos will get through the user-mode buffer
2775		 * cache properly.  The only bad case is o32 kernels
2776		 * where xfs_daddr_t is 32 bits, but mount will warn us
2777		 * off a > 1 TB filesystem before we get here.
2778		 */
2779		if ((ITEM_TYPE(item) == XFS_LI_BUF)) {
2780			if  ((error = xlog_recover_do_buffer_trans(log, item,
2781								 pass)))
2782				break;
2783		} else if ((ITEM_TYPE(item) == XFS_LI_INODE)) {
2784			if ((error = xlog_recover_do_inode_trans(log, item,
2785								pass)))
2786				break;
2787		} else if (ITEM_TYPE(item) == XFS_LI_EFI) {
2788			if ((error = xlog_recover_do_efi_trans(log, item, trans->r_lsn,
2789						  pass)))
2790				break;
2791		} else if (ITEM_TYPE(item) == XFS_LI_EFD) {
2792			xlog_recover_do_efd_trans(log, item, pass);
2793		} else if (ITEM_TYPE(item) == XFS_LI_DQUOT) {
2794			if ((error = xlog_recover_do_dquot_trans(log, item,
2795								   pass)))
2796					break;
2797		} else if ((ITEM_TYPE(item) == XFS_LI_QUOTAOFF)) {
2798			if ((error = xlog_recover_do_quotaoff_trans(log, item,
2799								   pass)))
2800					break;
2801		} else {
2802			xlog_warn("XFS: xlog_recover_do_trans");
2803			ASSERT(0);
2804			error = XFS_ERROR(EIO);
2805			break;
2806		}
2807		item = item->ri_next;
2808	} while (first_item != item);
2809
2810	return error;
2811}
2812
2813/*
2814 * Free up any resources allocated by the transaction
2815 *
2816 * Remember that EFIs, EFDs, and IUNLINKs are handled later.
2817 */
2818STATIC void
2819xlog_recover_free_trans(
2820	xlog_recover_t		*trans)
2821{
2822	xlog_recover_item_t	*first_item, *item, *free_item;
2823	int			i;
2824
2825	item = first_item = trans->r_itemq;
2826	do {
2827		free_item = item;
2828		item = item->ri_next;
2829		 /* Free the regions in the item. */
2830		for (i = 0; i < free_item->ri_cnt; i++) {
2831			kmem_free(free_item->ri_buf[i].i_addr,
2832				  free_item->ri_buf[i].i_len);
2833		}
2834		/* Free the item itself */
2835		kmem_free(free_item->ri_buf,
2836			  (free_item->ri_total * sizeof(xfs_log_iovec_t)));
2837		kmem_free(free_item, sizeof(xlog_recover_item_t));
2838	} while (first_item != item);
2839	/* Free the transaction recover structure */
2840	kmem_free(trans, sizeof(xlog_recover_t));
2841}
2842
2843STATIC int
2844xlog_recover_commit_trans(
2845	xlog_t			*log,
2846	xlog_recover_t		**q,
2847	xlog_recover_t		*trans,
2848	int			pass)
2849{
2850	int			error;
2851
2852	if ((error = xlog_recover_unlink_tid(q, trans)))
2853		return error;
2854	if ((error = xlog_recover_do_trans(log, trans, pass)))
2855		return error;
2856	xlog_recover_free_trans(trans);			/* no error */
2857	return 0;
2858}
2859
2860STATIC int
2861xlog_recover_unmount_trans(
2862	xlog_recover_t		*trans)
2863{
2864	/* Do nothing now */
2865	xlog_warn("XFS: xlog_recover_unmount_trans: Unmount LR");
2866	return 0;
2867}
2868
2869/*
2870 * There are two valid states of the r_state field.  0 indicates that the
2871 * transaction structure is in a normal state.  We have either seen the
2872 * start of the transaction or the last operation we added was not a partial
2873 * operation.  If the last operation we added to the transaction was a
2874 * partial operation, we need to mark r_state with XLOG_WAS_CONT_TRANS.
2875 *
2876 * NOTE: skip LRs with 0 data length.
2877 */
2878STATIC int
2879xlog_recover_process_data(
2880	xlog_t			*log,
2881	xlog_recover_t		*rhash[],
2882	xlog_rec_header_t	*rhead,
2883	xfs_caddr_t		dp,
2884	int			pass)
2885{
2886	xfs_caddr_t		lp;
2887	int			num_logops;
2888	xlog_op_header_t	*ohead;
2889	xlog_recover_t		*trans;
2890	xlog_tid_t		tid;
2891	int			error;
2892	unsigned long		hash;
2893	uint			flags;
2894
2895	lp = dp + INT_GET(rhead->h_len, ARCH_CONVERT);
2896	num_logops = INT_GET(rhead->h_num_logops, ARCH_CONVERT);
2897
2898	/* check the log format matches our own - else we can't recover */
2899	if (xlog_header_check_recover(log->l_mp, rhead))
2900		return (XFS_ERROR(EIO));
2901
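	/*
	 * Walk each log operation header in this record, collecting ops
	 * into per-tid transactions and processing each transaction when
	 * its commit (or unmount) flag is seen.
	 */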
2902	while ((dp < lp) && num_logops) {
2903		ASSERT(dp + sizeof(xlog_op_header_t) <= lp);
2904		ohead = (xlog_op_header_t *)dp;
2905		dp += sizeof(xlog_op_header_t);
2906		if (ohead->oh_clientid != XFS_TRANSACTION &&
2907		    ohead->oh_clientid != XFS_LOG) {
2908			xlog_warn(
2909		"XFS: xlog_recover_process_data: bad clientid");
2910			ASSERT(0);
2911			return (XFS_ERROR(EIO));
2912		}
2913		tid = INT_GET(ohead->oh_tid, ARCH_CONVERT);
2914		hash = XLOG_RHASH(tid);
2915		trans = xlog_recover_find_tid(rhash[hash], tid);
2916		if (trans == NULL) {		   /* not found; add new tid */
2917			if (ohead->oh_flags & XLOG_START_TRANS)
2918				xlog_recover_new_tid(&rhash[hash], tid,
2919					INT_GET(rhead->h_lsn, ARCH_CONVERT));
2920		} else {
2921			ASSERT(dp+INT_GET(ohead->oh_len, ARCH_CONVERT) <= lp);
2922			flags = ohead->oh_flags & ~XLOG_END_TRANS;
2923			if (flags & XLOG_WAS_CONT_TRANS)
2924				flags &= ~XLOG_CONTINUE_TRANS;
2925			switch (flags) {
2926			case XLOG_COMMIT_TRANS:
2927				error = xlog_recover_commit_trans(log,
2928						&rhash[hash], trans, pass);
2929				break;
2930			case XLOG_UNMOUNT_TRANS:
2931				error = xlog_recover_unmount_trans(trans);
2932				break;
2933			case XLOG_WAS_CONT_TRANS:
2934				error = xlog_recover_add_to_cont_trans(trans,
2935						dp, INT_GET(ohead->oh_len,
2936							ARCH_CONVERT));
2937				break;
2938			case XLOG_START_TRANS:
2939				xlog_warn(
2940			"XFS: xlog_recover_process_data: bad transaction");
2941				ASSERT(0);
2942				error = XFS_ERROR(EIO);
2943				break;
2944			case 0:
2945			case XLOG_CONTINUE_TRANS:
2946				error = xlog_recover_add_to_trans(trans,
2947						dp, INT_GET(ohead->oh_len,
2948							ARCH_CONVERT));
2949				break;
2950			default:
2951				xlog_warn(
2952			"XFS: xlog_recover_process_data: bad flag");
2953				ASSERT(0);
2954				error = XFS_ERROR(EIO);
2955				break;
2956			}
2957			if (error)
2958				return error;
2959		}
2960		dp += INT_GET(ohead->oh_len, ARCH_CONVERT);
2961		num_logops--;
2962	}
2963	return 0;
2964}
2965
2966/*
2967 * Process an extent free intent item that was recovered from
2968 * the log.  We need to free the extents that it describes.
2969 */
2970STATIC void
2971xlog_recover_process_efi(
2972	xfs_mount_t		*mp,
2973	xfs_efi_log_item_t	*efip)
2974{
2975	xfs_efd_log_item_t	*efdp;
2976	xfs_trans_t		*tp;
2977	int			i;
2978	xfs_extent_t		*extp;
2979	xfs_fsblock_t		startblock_fsb;
2980
2981	ASSERT(!(efip->efi_flags & XFS_EFI_RECOVERED));
2982
2983	/*
2984	 * First check the validity of the extents described by the
2985	 * EFI.  If any are bad, then assume that all are bad and
2986	 * just toss the EFI.
2987	 */
2988	for (i = 0; i < efip->efi_format.efi_nextents; i++) {
2989		extp = &(efip->efi_format.efi_extents[i]);
2990		startblock_fsb = XFS_BB_TO_FSB(mp,
2991				   XFS_FSB_TO_DADDR(mp, extp->ext_start));
2992		if ((startblock_fsb == 0) ||
2993		    (extp->ext_len == 0) ||
2994		    (startblock_fsb >= mp->m_sb.sb_dblocks) ||
2995		    (extp->ext_len >= mp->m_sb.sb_agblocks)) {
2996			/*
2997			 * This will pull the EFI from the AIL and
2998			 * free the memory associated with it.
2999			 */
3000			xfs_efi_release(efip, efip->efi_format.efi_nextents);
3001			return;
3002		}
3003	}
3004
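	/*
	 * The extents look sane: free each one and log a matching EFD
	 * extent in a single transaction, then mark the EFI recovered.
	 */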
3005	tp = xfs_trans_alloc(mp, 0);
3006	xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, 0, 0);
3007	efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents);
3008
3009	for (i = 0; i < efip->efi_format.efi_nextents; i++) {
3010		extp = &(efip->efi_format.efi_extents[i]);
3011		xfs_free_extent(tp, extp->ext_start, extp->ext_len);
3012		xfs_trans_log_efd_extent(tp, efdp, extp->ext_start,
3013					 extp->ext_len);
3014	}
3015
3016	efip->efi_flags |= XFS_EFI_RECOVERED;
3017	xfs_trans_commit(tp, 0);
3018}
3019
3020/*
3021 * Verify that once we've encountered something other than an EFI
3022 * in the AIL, there are no more EFIs in the AIL.
3023 */
3024#if defined(DEBUG)
3025STATIC void
3026xlog_recover_check_ail(
3027	xfs_mount_t		*mp,
3028	xfs_log_item_t		*lip,
3029	int			gen)
3030{
3031	int			orig_gen = gen;
3032
3033	do {
3034		ASSERT(lip->li_type != XFS_LI_EFI);
3035		lip = xfs_trans_next_ail(mp, lip, &gen, NULL);
3036		/*
3037		 * The check will be bogus if we restart from the
3038		 * beginning of the AIL, so ASSERT that we don't.
3039		 * We never should since we're holding the AIL lock
3040		 * the entire time.
3041		 */
3042		ASSERT(gen == orig_gen);
3043	} while (lip != NULL);
3044}
3045#endif	/* DEBUG */
3046
3047/*
3048 * When this is called, all of the EFIs which did not have
3049 * corresponding EFDs should be in the AIL.  What we do now
3050 * is free the extents associated with each one.
3051 *
3052 * Since we process the EFIs in normal transactions, they
3053 * will be removed at some point after the commit.  This prevents
3054 * us from just walking down the list processing each one.
3055 * We'll use a flag in the EFI to skip those that we've already
3056 * processed and use the AIL iteration mechanism's generation
3057 * count to try to speed this up at least a bit.
3058 *
3059 * When we start, we know that the EFIs are the only things in
3060 * the AIL.  As we process them, however, other items are added
3061 * to the AIL.  Since everything added to the AIL must come after
3062 * everything already in the AIL, we stop processing as soon as
3063 * we see something other than an EFI in the AIL.
3064 */
3065STATIC void
3066xlog_recover_process_efis(
3067	xlog_t			*log)
3068{
3069	xfs_log_item_t		*lip;
3070	xfs_efi_log_item_t	*efip;
3071	int			gen;
3072	xfs_mount_t		*mp;
3073	SPLDECL(s);
3074
3075	mp = log->l_mp;
3076	AIL_LOCK(mp,s);
3077
3078	lip = xfs_trans_first_ail(mp, &gen);
3079	while (lip != NULL) {
3080		/*
3081		 * We're done when we see something other than an EFI.
3082		 */
3083		if (lip->li_type != XFS_LI_EFI) {
3084			xlog_recover_check_ail(mp, lip, gen);
3085			break;
3086		}
3087
3088		/*
3089		 * Skip EFIs that we've already processed.
3090		 */
3091		efip = (xfs_efi_log_item_t *)lip;
3092		if (efip->efi_flags & XFS_EFI_RECOVERED) {
3093			lip = xfs_trans_next_ail(mp, lip, &gen, NULL);
3094			continue;
3095		}
3096
3097		AIL_UNLOCK(mp, s);
3098		xlog_recover_process_efi(mp, efip);
3099		AIL_LOCK(mp,s);
3100		lip = xfs_trans_next_ail(mp, lip, &gen, NULL);
3101	}
3102	AIL_UNLOCK(mp, s);
3103}
3104
3105/*
3106 * This routine performs a transaction to null out a bad inode pointer
3107 * in an AGI unlinked inode hash bucket.
3108 */
3109STATIC void
3110xlog_recover_clear_agi_bucket(
3111	xfs_mount_t	*mp,
3112	xfs_agnumber_t	agno,
3113	int		bucket)
3114{
3115	xfs_trans_t	*tp;
3116	xfs_agi_t	*agi;
3117	xfs_buf_t	*agibp;
3118	int		offset;
3119	int		error;
3120
3121	tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET);
3122	xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp), 0, 0, 0);
3123
3124	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
3125				   XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
3126				   XFS_FSS_TO_BB(mp, 1), 0, &agibp);
3127	if (error) {
3128		xfs_trans_cancel(tp, XFS_TRANS_ABORT);
3129		return;
3130	}
3131
3132	agi = XFS_BUF_TO_AGI(agibp);
3133	if (be32_to_cpu(agi->agi_magicnum) != XFS_AGI_MAGIC) {
3134		xfs_trans_cancel(tp, XFS_TRANS_ABORT);
3135		return;
3136	}
3137
3138	agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
3139	offset = offsetof(xfs_agi_t, agi_unlinked) +
3140		 (sizeof(xfs_agino_t) * bucket);
3141	xfs_trans_log_buf(tp, agibp, offset,
3142			  (offset + sizeof(xfs_agino_t) - 1));
3143
3144	(void) xfs_trans_commit(tp, 0);
3145}
3146
3147/*
3148 * xlog_recover_process_iunlinks
3149 *
3150 * This is called during recovery to process any inodes which
3151 * we unlinked but not freed when the system crashed.  These
3152 * inodes will be on the lists in the AGI blocks.  What we do
3153 * here is scan all the AGIs and fully truncate and free any
3154 * inodes found on the lists.  Each inode is removed from the
3155 * lists when it has been fully truncated and is freed.  The
3156 * freeing of the inode and its removal from the list must be
3157 * atomic.
3158 */
3159void
3160xlog_recover_process_iunlinks(
3161	xlog_t		*log)
3162{
3163	xfs_mount_t	*mp;
3164	xfs_agnumber_t	agno;
3165	xfs_agi_t	*agi;
3166	xfs_buf_t	*agibp;
3167	xfs_buf_t	*ibp;
3168	xfs_dinode_t	*dip;
3169	xfs_inode_t	*ip;
3170	xfs_agino_t	agino;
3171	xfs_ino_t	ino;
3172	int		bucket;
3173	int		error;
3174	uint		mp_dmevmask;
3175
3176	mp = log->l_mp;
3177
3178	/*
3179	 * Prevent any DMAPI event from being sent while in this function.
3180	 */
3181	mp_dmevmask = mp->m_dmevmask;
3182	mp->m_dmevmask = 0;
3183
3184	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
3185		/*
3186		 * Find the agi for this ag.
3187		 */
3188		agibp = xfs_buf_read(mp->m_ddev_targp,
3189				XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
3190				XFS_FSS_TO_BB(mp, 1), 0);
3191		if (XFS_BUF_ISERROR(agibp)) {
3192			xfs_ioerror_alert("xlog_recover_process_iunlinks(#1)",
3193				log->l_mp, agibp,
3194				XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)));
3195		}
3196		agi = XFS_BUF_TO_AGI(agibp);
3197		ASSERT(XFS_AGI_MAGIC == be32_to_cpu(agi->agi_magicnum));
3198
3199		for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) {
3200
3201			agino = be32_to_cpu(agi->agi_unlinked[bucket]);
3202			while (agino != NULLAGINO) {
3203
3204				/*
3205				 * Release the agi buffer so that it can
3206				 * be acquired in the normal course of the
3207				 * transaction to truncate and free the inode.
3208				 */
3209				xfs_buf_relse(agibp);
3210
3211				ino = XFS_AGINO_TO_INO(mp, agno, agino);
3212				error = xfs_iget(mp, NULL, ino, 0, 0, &ip, 0);
3213				ASSERT(error || (ip != NULL));
3214
3215				if (!error) {
3216					/*
3217					 * Get the on disk inode to find the
3218					 * next inode in the bucket.
3219					 */
3220					error = xfs_itobp(mp, NULL, ip, &dip,
3221							&ibp, 0, 0);
3222					ASSERT(error || (dip != NULL));
3223				}
3224
3225				if (!error) {
3226					ASSERT(ip->i_d.di_nlink == 0);
3227
3228					/* setup for the next pass */
3229					agino = INT_GET(dip->di_next_unlinked,
3230							ARCH_CONVERT);
3231					xfs_buf_relse(ibp);
3232					/*
3233					 * Prevent any DMAPI event from
3234					 * being sent when the
3235					 * reference on the inode is
3236					 * dropped.
3237					 */
3238					ip->i_d.di_dmevmask = 0;
3239
3240					/*
3241					 * If this is a new inode, handle
3242					 * it specially.  Otherwise,
3243					 * just drop our reference to the
3244					 * inode.  If there are no
3245					 * other references, this will
3246					 * send the inode to
3247					 * xfs_inactive() which will
3248					 * truncate the file and free
3249					 * the inode.
3250					 */
3251					if (ip->i_d.di_mode == 0)
3252						xfs_iput_new(ip, 0);
3253					else
3254						VN_RELE(XFS_ITOV(ip));
3255				} else {
3256					/*
3257					 * We can't read in the inode
3258					 * this bucket points to, or
3259					 * this inode is messed up.  Just
3260					 * ditch this bucket of inodes.  We
3261					 * will lose some inodes and space,
3262					 * but at least we won't hang.  Call
3263					 * xlog_recover_clear_agi_bucket()
3264					 * to perform a transaction to clear
3265					 * the inode pointer in the bucket.
3266					 */
3267					xlog_recover_clear_agi_bucket(mp, agno,
3268							bucket);
3269
3270					agino = NULLAGINO;
3271				}
3272
3273				/*
3274				 * Reacquire the AGI buffer and continue around
3275				 * the loop.
3276				 */
3277				agibp = xfs_buf_read(mp->m_ddev_targp,
3278						XFS_AG_DADDR(mp, agno,
3279							XFS_AGI_DADDR(mp)),
3280						XFS_FSS_TO_BB(mp, 1), 0);
3281				if (XFS_BUF_ISERROR(agibp)) {
3282					xfs_ioerror_alert(
3283				"xlog_recover_process_iunlinks(#2)",
3284						log->l_mp, agibp,
3285						XFS_AG_DADDR(mp, agno,
3286							XFS_AGI_DADDR(mp)));
3287				}
3288				agi = XFS_BUF_TO_AGI(agibp);
3289				ASSERT(XFS_AGI_MAGIC == be32_to_cpu(
3290					agi->agi_magicnum));
3291			}
3292		}
3293
3294		/*
3295		 * Release the buffer for the current agi so we can
3296		 * go on to the next one.
3297		 */
3298		xfs_buf_relse(agibp);
3299	}
3300
3301	mp->m_dmevmask = mp_dmevmask;
3302}
3303
3304
3305#ifdef DEBUG
3306STATIC void
3307xlog_pack_data_checksum(
3308	xlog_t		*log,
3309	xlog_in_core_t	*iclog,
3310	int		size)
3311{
3312	int		i;
3313	uint		*up;
3314	uint		chksum = 0;
3315
3316	up = (uint *)iclog->ic_datap;
3317	/* divide length by 4 to get # words */
3318	for (i = 0; i < (size >> 2); i++) {
3319		chksum ^= INT_GET(*up, ARCH_CONVERT);
3320		up++;
3321	}
3322	INT_SET(iclog->ic_header.h_chksum, ARCH_CONVERT, chksum);
3323}
3324#else
3325#define xlog_pack_data_checksum(log, iclog, size)
3326#endif
3327
3328/*
3329 * Stamp cycle number in every block
3330 */
3331void
3332xlog_pack_data(
3333	xlog_t			*log,
3334	xlog_in_core_t		*iclog,
3335	int			roundoff)
3336{
3337	int			i, j, k;
3338	int			size = iclog->ic_offset + roundoff;
3339	uint			cycle_lsn;
3340	xfs_caddr_t		dp;
3341	xlog_in_core_2_t	*xhdr;
3342
3343	xlog_pack_data_checksum(log, iclog, size);
3344
3345	cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn);
3346
3347	dp = iclog->ic_datap;
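	/*
	 * Save the first word of each 512-byte block of log data in the
	 * record header's cycle data array and overwrite it with the cycle
	 * number, so recovery can spot a torn (partially written) record.
	 */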
3348	for (i = 0; i < BTOBB(size) &&
3349		i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
3350		iclog->ic_header.h_cycle_data[i] = *(uint *)dp;
3351		*(uint *)dp = cycle_lsn;
3352		dp += BBSIZE;
3353	}
3354
3355	if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) {
3356		xhdr = (xlog_in_core_2_t *)&iclog->ic_header;
3357		for ( ; i < BTOBB(size); i++) {
3358			j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3359			k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3360			xhdr[j].hic_xheader.xh_cycle_data[k] = *(uint *)dp;
3361			*(uint *)dp = cycle_lsn;
3362			dp += BBSIZE;
3363		}
3364
3365		for (i = 1; i < log->l_iclog_heads; i++) {
3366			xhdr[i].hic_xheader.xh_cycle = cycle_lsn;
3367		}
3368	}
3369}
3370
3371#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
3372STATIC void
3373xlog_unpack_data_checksum(
3374	xlog_rec_header_t	*rhead,
3375	xfs_caddr_t		dp,
3376	xlog_t			*log)
3377{
3378	uint			*up = (uint *)dp;
3379	uint			chksum = 0;
3380	int			i;
3381
3382	/* divide length by 4 to get # words */
3383	for (i=0; i < INT_GET(rhead->h_len, ARCH_CONVERT) >> 2; i++) {
3384		chksum ^= INT_GET(*up, ARCH_CONVERT);
3385		up++;
3386	}
3387	if (chksum != INT_GET(rhead->h_chksum, ARCH_CONVERT)) {
3388	    if (rhead->h_chksum ||
3389		((log->l_flags & XLOG_CHKSUM_MISMATCH) == 0)) {
3390		    cmn_err(CE_DEBUG,
3391			"XFS: LogR chksum mismatch: was (0x%x) is (0x%x)\n",
3392			    INT_GET(rhead->h_chksum, ARCH_CONVERT), chksum);
3393		    cmn_err(CE_DEBUG,
3394"XFS: Disregard message if filesystem was created with non-DEBUG kernel");
3395		    if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) {
3396			    cmn_err(CE_DEBUG,
3397				"XFS: LogR this is a LogV2 filesystem\n");
3398		    }
3399		    log->l_flags |= XLOG_CHKSUM_MISMATCH;
3400	    }
3401	}
3402}
3403#else
3404#define xlog_unpack_data_checksum(rhead, dp, log)
3405#endif
3406
3407STATIC void
3408xlog_unpack_data(
3409	xlog_rec_header_t	*rhead,
3410	xfs_caddr_t		dp,
3411	xlog_t			*log)
3412{
3413	int			i, j, k;
3414	xlog_in_core_2_t	*xhdr;
3415
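	/*
	 * Undo xlog_pack_data(): restore the data words that were saved in
	 * the record header's cycle data arrays back into each block.
	 */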
3416	for (i = 0; i < BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT)) &&
3417		  i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
3418		*(uint *)dp = *(uint *)&rhead->h_cycle_data[i];
3419		dp += BBSIZE;
3420	}
3421
3422	if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) {
3423		xhdr = (xlog_in_core_2_t *)rhead;
3424		for ( ; i < BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT)); i++) {
3425			j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3426			k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3427			*(uint *)dp = xhdr[j].hic_xheader.xh_cycle_data[k];
3428			dp += BBSIZE;
3429		}
3430	}
3431
3432	xlog_unpack_data_checksum(rhead, dp, log);
3433}
3434
3435STATIC int
3436xlog_valid_rec_header(
3437	xlog_t			*log,
3438	xlog_rec_header_t	*rhead,
3439	xfs_daddr_t		blkno)
3440{
3441	int			hlen;
3442
3443	if (unlikely(
3444	    (INT_GET(rhead->h_magicno, ARCH_CONVERT) !=
3445			XLOG_HEADER_MAGIC_NUM))) {
3446		XFS_ERROR_REPORT("xlog_valid_rec_header(1)",
3447				XFS_ERRLEVEL_LOW, log->l_mp);
3448		return XFS_ERROR(EFSCORRUPTED);
3449	}
3450	if (unlikely(
3451	    (!rhead->h_version ||
3452	    (INT_GET(rhead->h_version, ARCH_CONVERT) &
3453			(~XLOG_VERSION_OKBITS)) != 0))) {
3454		xlog_warn("XFS: %s: unrecognised log version (%d).",
3455			__FUNCTION__, INT_GET(rhead->h_version, ARCH_CONVERT));
3456		return XFS_ERROR(EIO);
3457	}
3458
3459	/* LR body must have data or it wouldn't have been written */
3460	hlen = INT_GET(rhead->h_len, ARCH_CONVERT);
3461	if (unlikely( hlen <= 0 || hlen > INT_MAX )) {
3462		XFS_ERROR_REPORT("xlog_valid_rec_header(2)",
3463				XFS_ERRLEVEL_LOW, log->l_mp);
3464		return XFS_ERROR(EFSCORRUPTED);
3465	}
3466	if (unlikely( blkno > log->l_logBBsize || blkno > INT_MAX )) {
3467		XFS_ERROR_REPORT("xlog_valid_rec_header(3)",
3468				XFS_ERRLEVEL_LOW, log->l_mp);
3469		return XFS_ERROR(EFSCORRUPTED);
3470	}
3471	return 0;
3472}
3473
3474/*
3475 * Read the log from tail to head and process the log records found.
3476 * Handle the two cases where the tail and head are in the same cycle
3477 * and where the active portion of the log wraps around the end of
3478 * the physical log separately.  The pass parameter is passed through
3479 * to the routines called to process the data and is not looked at
3480 * here.
3481 */
3482STATIC int
3483xlog_do_recovery_pass(
3484	xlog_t			*log,
3485	xfs_daddr_t		head_blk,
3486	xfs_daddr_t		tail_blk,
3487	int			pass)
3488{
3489	xlog_rec_header_t	*rhead;
3490	xfs_daddr_t		blk_no;
3491	xfs_caddr_t		bufaddr, offset;
3492	xfs_buf_t		*hbp, *dbp;
3493	int			error = 0, h_size;
3494	int			bblks, split_bblks;
3495	int			hblks, split_hblks, wrapped_hblks;
3496	xlog_recover_t		*rhash[XLOG_RHASH_SIZE];
3497
3498	ASSERT(head_blk != tail_blk);
3499
3500	/*
3501	 * Read the header of the tail block and get the iclog buffer size from
3502	 * h_size.  Use this to tell how many sectors make up the log header.
3503	 */
3504	if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) {
3505		/*
3506		 * When using variable length iclogs, read first sector of
3507		 * iclog header and extract the header size from it.  Get a
3508		 * new hbp that is the correct size.
3509		 */
3510		hbp = xlog_get_bp(log, 1);
3511		if (!hbp)
3512			return ENOMEM;
3513		if ((error = xlog_bread(log, tail_blk, 1, hbp)))
3514			goto bread_err1;
3515		offset = xlog_align(log, tail_blk, 1, hbp);
3516		rhead = (xlog_rec_header_t *)offset;
3517		error = xlog_valid_rec_header(log, rhead, tail_blk);
3518		if (error)
3519			goto bread_err1;
3520		h_size = INT_GET(rhead->h_size, ARCH_CONVERT);
3521		if ((INT_GET(rhead->h_version, ARCH_CONVERT)
3522				& XLOG_VERSION_2) &&
3523		    (h_size > XLOG_HEADER_CYCLE_SIZE)) {
3524			hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
3525			if (h_size % XLOG_HEADER_CYCLE_SIZE)
3526				hblks++;
3527			xlog_put_bp(hbp);
3528			hbp = xlog_get_bp(log, hblks);
3529		} else {
3530			hblks = 1;
3531		}
3532	} else {
3533		ASSERT(log->l_sectbb_log == 0);
3534		hblks = 1;
3535		hbp = xlog_get_bp(log, 1);
3536		h_size = XLOG_BIG_RECORD_BSIZE;
3537	}
3538
3539	if (!hbp)
3540		return ENOMEM;
3541	dbp = xlog_get_bp(log, BTOBB(h_size));
3542	if (!dbp) {
3543		xlog_put_bp(hbp);
3544		return ENOMEM;
3545	}
3546
3547	memset(rhash, 0, sizeof(rhash));
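	/*
	 * When the tail is at or before the head the active region does not
	 * wrap, so one sequential pass covers it; otherwise the records
	 * that wrap the physical end of the log are handled below.
	 */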
3548	if (tail_blk <= head_blk) {
3549		for (blk_no = tail_blk; blk_no < head_blk; ) {
3550			if ((error = xlog_bread(log, blk_no, hblks, hbp)))
3551				goto bread_err2;
3552			offset = xlog_align(log, blk_no, hblks, hbp);
3553			rhead = (xlog_rec_header_t *)offset;
3554			error = xlog_valid_rec_header(log, rhead, blk_no);
3555			if (error)
3556				goto bread_err2;
3557
3558			/* blocks in data section */
3559			bblks = (int)BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT));
3560			error = xlog_bread(log, blk_no + hblks, bblks, dbp);
3561			if (error)
3562				goto bread_err2;
3563			offset = xlog_align(log, blk_no + hblks, bblks, dbp);
3564			xlog_unpack_data(rhead, offset, log);
3565			if ((error = xlog_recover_process_data(log,
3566						rhash, rhead, offset, pass)))
3567				goto bread_err2;
3568			blk_no += bblks + hblks;
3569		}
3570	} else {
3571		/*
3572		 * Perform recovery around the end of the physical log.
3573		 * When the head is not on the same cycle number as the tail,
3574		 * we can't do a sequential recovery as above.
3575		 */
3576		blk_no = tail_blk;
3577		while (blk_no < log->l_logBBsize) {
3578			/*
3579			 * Check for header wrapping around physical end-of-log
3580			 */
3581			offset = NULL;
3582			split_hblks = 0;
3583			wrapped_hblks = 0;
3584			if (blk_no + hblks <= log->l_logBBsize) {
3585				/* Read header in one read */
3586				error = xlog_bread(log, blk_no, hblks, hbp);
3587				if (error)
3588					goto bread_err2;
3589				offset = xlog_align(log, blk_no, hblks, hbp);
3590			} else {
3591				/* This LR is split across physical log end */
3592				if (blk_no != log->l_logBBsize) {
3593					/* some data before physical log end */
3594					ASSERT(blk_no <= INT_MAX);
3595					split_hblks = log->l_logBBsize - (int)blk_no;
3596					ASSERT(split_hblks > 0);
3597					if ((error = xlog_bread(log, blk_no,
3598							split_hblks, hbp)))
3599						goto bread_err2;
3600					offset = xlog_align(log, blk_no,
3601							split_hblks, hbp);
3602				}
3603				/*
3604				 * Note: this black magic still works with
3605				 * large sector sizes (non-512) only because:
3606				 * - we increased the buffer size originally
3607				 *   by 1 sector giving us enough extra space
3608				 *   for the second read;
3609				 * - the log start is guaranteed to be sector
3610				 *   aligned;
3611				 * - we read the log end (LR header start)
3612				 *   _first_, then the log start (LR header end)
3613				 *   - order is important.
3614				 */
3615				bufaddr = XFS_BUF_PTR(hbp);
3616				XFS_BUF_SET_PTR(hbp,
3617						bufaddr + BBTOB(split_hblks),
3618						BBTOB(hblks - split_hblks));
3619				wrapped_hblks = hblks - split_hblks;
3620				error = xlog_bread(log, 0, wrapped_hblks, hbp);
3621				if (error)
3622					goto bread_err2;
3623				XFS_BUF_SET_PTR(hbp, bufaddr, BBTOB(hblks));
3624				if (!offset)
3625					offset = xlog_align(log, 0,
3626							wrapped_hblks, hbp);
3627			}
3628			rhead = (xlog_rec_header_t *)offset;
3629			error = xlog_valid_rec_header(log, rhead,
3630						split_hblks ? blk_no : 0);
3631			if (error)
3632				goto bread_err2;
3633
3634			bblks = (int)BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT));
3635			blk_no += hblks;
3636
3637			/* Read in data for log record */
3638			if (blk_no + bblks <= log->l_logBBsize) {
3639				error = xlog_bread(log, blk_no, bblks, dbp);
3640				if (error)
3641					goto bread_err2;
3642				offset = xlog_align(log, blk_no, bblks, dbp);
3643			} else {
3644				/* This log record is split across the
3645				 * physical end of log */
3646				offset = NULL;
3647				split_bblks = 0;
3648				if (blk_no != log->l_logBBsize) {
3649					/* some data is before the physical
3650					 * end of log */
3651					ASSERT(!wrapped_hblks);
3652					ASSERT(blk_no <= INT_MAX);
3653					split_bblks =
3654						log->l_logBBsize - (int)blk_no;
3655					ASSERT(split_bblks > 0);
3656					if ((error = xlog_bread(log, blk_no,
3657							split_bblks, dbp)))
3658						goto bread_err2;
3659					offset = xlog_align(log, blk_no,
3660							split_bblks, dbp);
3661				}
3662				/*
3663				 * Note: this black magic still works with
3664				 * large sector sizes (non-512) only because:
3665				 * - we increased the buffer size originally
3666				 *   by 1 sector giving us enough extra space
3667				 *   for the second read;
3668				 * - the log start is guaranteed to be sector
3669				 *   aligned;
3670				 * - we read the log end (LR header start)
3671				 *   _first_, then the log start (LR header end)
3672				 *   - order is important.
3673				 */
3674				bufaddr = XFS_BUF_PTR(dbp);
3675				XFS_BUF_SET_PTR(dbp,
3676						bufaddr + BBTOB(split_bblks),
3677						BBTOB(bblks - split_bblks));
3678				if ((error = xlog_bread(log, wrapped_hblks,
3679						bblks - split_bblks, dbp)))
3680					goto bread_err2;
3681				XFS_BUF_SET_PTR(dbp, bufaddr, h_size);
3682				if (!offset)
3683					offset = xlog_align(log, wrapped_hblks,
3684						bblks - split_bblks, dbp);
3685			}
3686			xlog_unpack_data(rhead, offset, log);
3687			if ((error = xlog_recover_process_data(log, rhash,
3688							rhead, offset, pass)))
3689				goto bread_err2;
3690			blk_no += bblks;
3691		}
3692
3693		ASSERT(blk_no >= log->l_logBBsize);
3694		blk_no -= log->l_logBBsize;
3695
3696		/* read first part of physical log */
3697		while (blk_no < head_blk) {
3698			if ((error = xlog_bread(log, blk_no, hblks, hbp)))
3699				goto bread_err2;
3700			offset = xlog_align(log, blk_no, hblks, hbp);
3701			rhead = (xlog_rec_header_t *)offset;
3702			error = xlog_valid_rec_header(log, rhead, blk_no);
3703			if (error)
3704				goto bread_err2;
3705			bblks = (int)BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT));
3706			if ((error = xlog_bread(log, blk_no+hblks, bblks, dbp)))
3707				goto bread_err2;
3708			offset = xlog_align(log, blk_no+hblks, bblks, dbp);
3709			xlog_unpack_data(rhead, offset, log);
3710			if ((error = xlog_recover_process_data(log, rhash,
3711							rhead, offset, pass)))
3712				goto bread_err2;
3713			blk_no += bblks + hblks;
3714		}
3715	}
3716
3717 bread_err2:
3718	xlog_put_bp(dbp);
3719 bread_err1:
3720	xlog_put_bp(hbp);
3721	return error;
3722}
3723
/*
 * Do the recovery of the log.  We actually do this in two passes.
 * The two passes are necessary in order to implement the function
 * of cancelling a record written into the log.  The first pass
 * determines which records have been cancelled, and the second
 * pass replays log items normally except for those which have
 * been cancelled.  The handling of the replay and the cancellations
 * takes place in the log item type specific routines.
 *
 * The table of items which have cancel records in the log is allocated
 * and freed at this level, since only here do we know when all of
 * the log recovery has been completed.
 */
3737STATIC int
3738xlog_do_log_recovery(
3739	xlog_t		*log,
3740	xfs_daddr_t	head_blk,
3741	xfs_daddr_t	tail_blk)
3742{
3743	int		error;
3744
3745	ASSERT(head_blk != tail_blk);
3746
3747	/*
3748	 * First do a pass to find all of the cancelled buf log items.
3749	 * Store them in the buf_cancel_table for use in the second pass.
3750	 */
3751	log->l_buf_cancel_table =
3752		(xfs_buf_cancel_t **)kmem_zalloc(XLOG_BC_TABLE_SIZE *
3753						 sizeof(xfs_buf_cancel_t*),
3754						 KM_SLEEP);
3755	error = xlog_do_recovery_pass(log, head_blk, tail_blk,
3756				      XLOG_RECOVER_PASS1);
3757	if (error != 0) {
3758		kmem_free(log->l_buf_cancel_table,
3759			  XLOG_BC_TABLE_SIZE * sizeof(xfs_buf_cancel_t*));
3760		log->l_buf_cancel_table = NULL;
3761		return error;
3762	}
3763	/*
3764	 * Then do a second pass to actually recover the items in the log.
3765	 * When it is complete free the table of buf cancel items.
3766	 */
3767	error = xlog_do_recovery_pass(log, head_blk, tail_blk,
3768				      XLOG_RECOVER_PASS2);
3769#ifdef DEBUG
3770	if (!error) {
3771		int	i;
3772
3773		for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
3774			ASSERT(log->l_buf_cancel_table[i] == NULL);
3775	}
3776#endif	/* DEBUG */
3777
3778	kmem_free(log->l_buf_cancel_table,
3779		  XLOG_BC_TABLE_SIZE * sizeof(xfs_buf_cancel_t*));
3780	log->l_buf_cancel_table = NULL;
3781
3782	return error;
3783}
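
/*
 * For illustration only -- the real consumer of the cancel table is the
 * buffer replay code earlier in this file, nothing added here.  In rough
 * terms, pass 2 is expected to consult the table before replaying a
 * buffer image (xlog_check_buffer_cancelled() is the helper used for the
 * lookup; treat the exact call below as a sketch rather than a quote):
 *
 *	if (xlog_check_buffer_cancelled(log, blkno, len, flags))
 *		return;		skip buffers cancelled during pass 1
 *	... otherwise write the logged image back to disk ...
 */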
3784
3785/*
3786 * Do the actual recovery
3787 */
3788STATIC int
3789xlog_do_recover(
3790	xlog_t		*log,
3791	xfs_daddr_t	head_blk,
3792	xfs_daddr_t	tail_blk)
3793{
3794	int		error;
3795	xfs_buf_t	*bp;
3796	xfs_sb_t	*sbp;
3797
3798	/*
3799	 * First replay the images in the log.
3800	 */
3801	error = xlog_do_log_recovery(log, head_blk, tail_blk);
3802	if (error) {
3803		return error;
3804	}
3805
3806	XFS_bflush(log->l_mp->m_ddev_targp);
3807
3808	/*
3809	 * If IO errors happened during recovery, bail out.
3810	 */
3811	if (XFS_FORCED_SHUTDOWN(log->l_mp)) {
3812		return (EIO);
3813	}
3814
	/*
	 * We now update the tail_lsn since much of the recovery has completed
	 * and there may be space available to use.  If there were no extent
	 * frees or iunlinks, we can free up the entire log and set the
	 * tail_lsn to be the last_sync_lsn.  This was set in xlog_find_tail
	 * to be the lsn of the last known good LR on disk.  If there are
	 * extent frees or iunlinks they will have some entries in the AIL;
	 * so we look at the AIL to determine how to set the tail_lsn.
	 */
3824	xlog_assign_tail_lsn(log->l_mp);
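
	/*
	 * Roughly speaking (a paraphrase of xlog_assign_tail_lsn() in
	 * xfs_log.c, not a quotation of it), the updated tail becomes:
	 *
	 *	tail_lsn = lsn of the oldest item still in the AIL, or
	 *	tail_lsn = log->l_last_sync_lsn if the AIL is empty
	 */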
3825
3826	/*
3827	 * Now that we've finished replaying all buffer and inode
3828	 * updates, re-read in the superblock.
3829	 */
3830	bp = xfs_getsb(log->l_mp, 0);
3831	XFS_BUF_UNDONE(bp);
3832	XFS_BUF_READ(bp);
3833	xfsbdstrat(log->l_mp, bp);
3834	if ((error = xfs_iowait(bp))) {
3835		xfs_ioerror_alert("xlog_do_recover",
3836				  log->l_mp, bp, XFS_BUF_ADDR(bp));
3837		ASSERT(0);
3838		xfs_buf_relse(bp);
3839		return error;
3840	}
3841
3842	/* Convert superblock from on-disk format */
3843	sbp = &log->l_mp->m_sb;
3844	xfs_xlatesb(XFS_BUF_TO_SBP(bp), sbp, 1, XFS_SB_ALL_BITS);
3845	ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC);
3846	ASSERT(XFS_SB_GOOD_VERSION(sbp));
3847	xfs_buf_relse(bp);
3848
3849	/* We've re-read the superblock so re-initialize per-cpu counters */
3850	xfs_icsb_reinit_counters(log->l_mp);
3851
3852	xlog_recover_check_summary(log);
3853
3854	/* Normal transactions can now occur */
3855	log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
3856	return 0;
3857}
3858
/*
 * Perform recovery of the log.  Some log variables are re-initialized
 * along the way, in xlog_find_tail.
 *
 * Return error or zero.
 */
3864int
3865xlog_recover(
3866	xlog_t		*log)
3867{
3868	xfs_daddr_t	head_blk, tail_blk;
3869	int		error;
3870
3871	/* find the tail of the log */
3872	if ((error = xlog_find_tail(log, &head_blk, &tail_blk)))
3873		return error;
3874
3875	if (tail_blk != head_blk) {
		/* There used to be a comment here disallowing recovery
		 * on read-only mounts (with a note that the mount code
		 * checks for ENOSPC and turns it into an intelligent
		 * error message), but that is no longer true.  Now,
		 * unless NORECOVERY is specified (in which case this
		 * function is never called), we simply go ahead and
		 * recover.  We do this all under the vfs layer, so we
		 * can get away with it unless the device itself is
		 * read-only, in which case we fail.
		 */
3887		if ((error = xfs_dev_is_read_only(log->l_mp, "recovery"))) {
3888			return error;
3889		}
3890
3891		cmn_err(CE_NOTE,
3892			"Starting XFS recovery on filesystem: %s (logdev: %s)",
3893			log->l_mp->m_fsname, log->l_mp->m_logname ?
3894			log->l_mp->m_logname : "internal");
3895
3896		error = xlog_do_recover(log, head_blk, tail_blk);
3897		log->l_flags |= XLOG_RECOVERY_NEEDED;
3898	}
3899	return error;
3900}
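
/*
 * Usage sketch, for orientation only: xlog_recover() is expected to be
 * driven from the log mount path (xfs_log_mount() in xfs_log.c -- an
 * assumption about the caller, which lives outside this file), roughly:
 *
 *	if (!(mp->m_flags & XFS_MOUNT_NORECOVERY))
 *		error = xlog_recover(mp->m_log);
 *
 * The second stage, xlog_recover_finish() below, runs later, once the
 * root and realtime bitmap inodes have been read in.
 */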
3901
/*
 * In the first part of recovery we replay inodes and buffers and build
 * up the list of extent free items which need to be processed.  Here
 * we process the extent free items and clean up the on-disk unlinked
 * inode lists.  This is separated from the first part of recovery so
 * that the root and real-time bitmap inodes can be read in from disk in
 * between the two stages.  This is necessary so that we can free space
 * in the real-time portion of the file system.
 */
3911int
3912xlog_recover_finish(
3913	xlog_t		*log,
3914	int		mfsi_flags)
3915{
3916	/*
3917	 * Now we're ready to do the transactions needed for the
3918	 * rest of recovery.  Start with completing all the extent
3919	 * free intent records and then process the unlinked inode
3920	 * lists.  At this point, we essentially run in normal mode
3921	 * except that we're still performing recovery actions
3922	 * rather than accepting new requests.
3923	 */
3924	if (log->l_flags & XLOG_RECOVERY_NEEDED) {
3925		xlog_recover_process_efis(log);
3926		/*
3927		 * Sync the log to get all the EFIs out of the AIL.
3928		 * This isn't absolutely necessary, but it helps in
3929		 * case the unlink transactions would have problems
3930		 * pushing the EFIs out of the way.
3931		 */
3932		xfs_log_force(log->l_mp, (xfs_lsn_t)0,
3933			      (XFS_LOG_FORCE | XFS_LOG_SYNC));
3934
		if ((mfsi_flags & XFS_MFSI_NOUNLINK) == 0)
			xlog_recover_process_iunlinks(log);
3938
3939		xlog_recover_check_summary(log);
3940
3941		cmn_err(CE_NOTE,
3942			"Ending XFS recovery on filesystem: %s (logdev: %s)",
3943			log->l_mp->m_fsname, log->l_mp->m_logname ?
3944			log->l_mp->m_logname : "internal");
3945		log->l_flags &= ~XLOG_RECOVERY_NEEDED;
3946	} else {
3947		cmn_err(CE_DEBUG,
3948			"!Ending clean XFS mount for filesystem: %s\n",
3949			log->l_mp->m_fsname);
3950	}
3951	return 0;
3952}
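
/*
 * And the matching second-stage sketch (again an assumption about callers
 * that live outside this file): once xfs_mountfs() has read in the root
 * and realtime bitmap inodes, it is expected to reach this code via
 * xfs_log_mount_finish(), roughly:
 *
 *	error = xlog_recover_finish(mp->m_log, mfsi_flags);
 *
 * which completes recovery by processing the EFIs and the unlinked inode
 * lists noted above.
 */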
3953
3954
3955#if defined(DEBUG)
/*
 * Read all of the agf and agi counters, sum them up, and (when
 * XFS_LOUD_RECOVERY is defined) report the totals alongside the
 * superblock counters so their consistency can be checked.
 */
3960void
3961xlog_recover_check_summary(
3962	xlog_t		*log)
3963{
3964	xfs_mount_t	*mp;
3965	xfs_agf_t	*agfp;
3966	xfs_agi_t	*agip;
3967	xfs_buf_t	*agfbp;
3968	xfs_buf_t	*agibp;
3969	xfs_daddr_t	agfdaddr;
3970	xfs_daddr_t	agidaddr;
3971	xfs_buf_t	*sbbp;
3972#ifdef XFS_LOUD_RECOVERY
3973	xfs_sb_t	*sbp;
3974#endif
3975	xfs_agnumber_t	agno;
3976	__uint64_t	freeblks;
3977	__uint64_t	itotal;
3978	__uint64_t	ifree;
3979
3980	mp = log->l_mp;
3981
3982	freeblks = 0LL;
3983	itotal = 0LL;
3984	ifree = 0LL;
3985	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
3986		agfdaddr = XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp));
3987		agfbp = xfs_buf_read(mp->m_ddev_targp, agfdaddr,
3988				XFS_FSS_TO_BB(mp, 1), 0);
3989		if (XFS_BUF_ISERROR(agfbp)) {
3990			xfs_ioerror_alert("xlog_recover_check_summary(agf)",
3991						mp, agfbp, agfdaddr);
3992		}
3993		agfp = XFS_BUF_TO_AGF(agfbp);
3994		ASSERT(XFS_AGF_MAGIC == be32_to_cpu(agfp->agf_magicnum));
3995		ASSERT(XFS_AGF_GOOD_VERSION(be32_to_cpu(agfp->agf_versionnum)));
3996		ASSERT(be32_to_cpu(agfp->agf_seqno) == agno);
3997
3998		freeblks += be32_to_cpu(agfp->agf_freeblks) +
3999			    be32_to_cpu(agfp->agf_flcount);
4000		xfs_buf_relse(agfbp);
4001
4002		agidaddr = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp));
4003		agibp = xfs_buf_read(mp->m_ddev_targp, agidaddr,
4004				XFS_FSS_TO_BB(mp, 1), 0);
4005		if (XFS_BUF_ISERROR(agibp)) {
4006			xfs_ioerror_alert("xlog_recover_check_summary(agi)",
4007					  mp, agibp, agidaddr);
4008		}
4009		agip = XFS_BUF_TO_AGI(agibp);
4010		ASSERT(XFS_AGI_MAGIC == be32_to_cpu(agip->agi_magicnum));
4011		ASSERT(XFS_AGI_GOOD_VERSION(be32_to_cpu(agip->agi_versionnum)));
4012		ASSERT(be32_to_cpu(agip->agi_seqno) == agno);
4013
4014		itotal += be32_to_cpu(agip->agi_count);
4015		ifree += be32_to_cpu(agip->agi_freecount);
4016		xfs_buf_relse(agibp);
4017	}
4018
4019	sbbp = xfs_getsb(mp, 0);
4020#ifdef XFS_LOUD_RECOVERY
4021	sbp = &mp->m_sb;
4022	xfs_xlatesb(XFS_BUF_TO_SBP(sbbp), sbp, 1, XFS_SB_ALL_BITS);
4023	cmn_err(CE_NOTE,
4024		"xlog_recover_check_summary: sb_icount %Lu itotal %Lu",
4025		sbp->sb_icount, itotal);
	cmn_err(CE_NOTE,
		"xlog_recover_check_summary: sb_ifree %Lu ifree %Lu",
		sbp->sb_ifree, ifree);
4029	cmn_err(CE_NOTE,
4030		"xlog_recover_check_summary: sb_fdblocks %Lu freeblks %Lu",
4031		sbp->sb_fdblocks, freeblks);
4032#endif
4033	xfs_buf_relse(sbbp);
4034}
4035#endif /* DEBUG */
4036