1/*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_types.h"
21#include "xfs_bit.h"
22#include "xfs_log.h"
23#include "xfs_inum.h"
24#include "xfs_trans.h"
25#include "xfs_sb.h"
26#include "xfs_dmapi.h"
27#include "xfs_mount.h"
28#include "xfs_buf_item.h"
29#include "xfs_trans_priv.h"
30#include "xfs_error.h"
31
32
/*
 * Allocation zone for buf log items: xfs_buf_item_init() allocates
 * zeroed items from it and xfs_buf_item_relse() frees them back to it.
 */
kmem_zone_t	*xfs_buf_item_zone;
34
35#ifdef XFS_TRANS_DEBUG
36/*
37 * This function uses an alternate strategy for tracking the bytes
38 * that the user requests to be logged.  This can then be used
39 * in conjunction with the bli_orig array in the buf log item to
40 * catch bugs in our callers' code.
41 *
42 * We also double check the bits set in xfs_buf_item_log using a
43 * simple algorithm to check that every byte is accounted for.
44 */
45STATIC void
46xfs_buf_item_log_debug(
47	xfs_buf_log_item_t	*bip,
48	uint			first,
49	uint			last)
50{
51	uint	x;
52	uint	byte;
53	uint	nbytes;
54	uint	chunk_num;
55	uint	word_num;
56	uint	bit_num;
57	uint	bit_set;
58	uint	*wordp;
59
60	ASSERT(bip->bli_logged != NULL);
61	byte = first;
62	nbytes = last - first + 1;
63	bfset(bip->bli_logged, first, nbytes);
64	for (x = 0; x < nbytes; x++) {
65		chunk_num = byte >> XFS_BLI_SHIFT;
66		word_num = chunk_num >> BIT_TO_WORD_SHIFT;
67		bit_num = chunk_num & (NBWORD - 1);
68		wordp = &(bip->bli_format.blf_data_map[word_num]);
69		bit_set = *wordp & (1 << bit_num);
70		ASSERT(bit_set);
71		byte++;
72	}
73}
74
75/*
76 * This function is called when we flush something into a buffer without
77 * logging it.  This happens for things like inodes which are logged
78 * separately from the buffer.
79 */
80void
81xfs_buf_item_flush_log_debug(
82	xfs_buf_t	*bp,
83	uint		first,
84	uint		last)
85{
86	xfs_buf_log_item_t	*bip;
87	uint			nbytes;
88
89	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*);
90	if ((bip == NULL) || (bip->bli_item.li_type != XFS_LI_BUF)) {
91		return;
92	}
93
94	ASSERT(bip->bli_logged != NULL);
95	nbytes = last - first + 1;
96	bfset(bip->bli_logged, first, nbytes);
97}
98
99/*
100 * This function is called to verify that our callers have logged
101 * all the bytes that they changed.
102 *
103 * It does this by comparing the original copy of the buffer stored in
104 * the buf log item's bli_orig array to the current copy of the buffer
105 * and ensuring that all bytes which mismatch are set in the bli_logged
106 * array of the buf log item.
107 */
108STATIC void
109xfs_buf_item_log_check(
110	xfs_buf_log_item_t	*bip)
111{
112	char		*orig;
113	char		*buffer;
114	int		x;
115	xfs_buf_t	*bp;
116
117	ASSERT(bip->bli_orig != NULL);
118	ASSERT(bip->bli_logged != NULL);
119
120	bp = bip->bli_buf;
121	ASSERT(XFS_BUF_COUNT(bp) > 0);
122	ASSERT(XFS_BUF_PTR(bp) != NULL);
123	orig = bip->bli_orig;
124	buffer = XFS_BUF_PTR(bp);
125	for (x = 0; x < XFS_BUF_COUNT(bp); x++) {
126		if (orig[x] != buffer[x] && !btst(bip->bli_logged, x))
127			cmn_err(CE_PANIC,
128	"xfs_buf_item_log_check bip %x buffer %x orig %x index %d",
129				bip, bp, orig, x);
130	}
131}
132#else
133#define		xfs_buf_item_log_debug(x,y,z)
134#define		xfs_buf_item_log_check(x)
135#endif
136
137STATIC void	xfs_buf_error_relse(xfs_buf_t *bp);
138STATIC void	xfs_buf_do_callbacks(xfs_buf_t *bp, xfs_log_item_t *lip);
139
140/*
141 * This returns the number of log iovecs needed to log the
142 * given buf log item.
143 *
144 * It calculates this as 1 iovec for the buf log format structure
145 * and 1 for each stretch of non-contiguous chunks to be logged.
146 * Contiguous chunks are logged in a single iovec.
147 *
148 * If the XFS_BLI_STALE flag has been set, then log nothing.
149 */
150STATIC uint
151xfs_buf_item_size(
152	xfs_buf_log_item_t	*bip)
153{
154	uint		nvecs;
155	int		next_bit;
156	int		last_bit;
157	xfs_buf_t	*bp;
158
159	ASSERT(atomic_read(&bip->bli_refcount) > 0);
160	if (bip->bli_flags & XFS_BLI_STALE) {
161		/*
162		 * The buffer is stale, so all we need to log
163		 * is the buf log format structure with the
164		 * cancel flag in it.
165		 */
166		xfs_buf_item_trace("SIZE STALE", bip);
167		ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
168		return 1;
169	}
170
171	bp = bip->bli_buf;
172	ASSERT(bip->bli_flags & XFS_BLI_LOGGED);
173	nvecs = 1;
174	last_bit = xfs_next_bit(bip->bli_format.blf_data_map,
175					 bip->bli_format.blf_map_size, 0);
176	ASSERT(last_bit != -1);
177	nvecs++;
178	while (last_bit != -1) {
179		/*
180		 * This takes the bit number to start looking from and
181		 * returns the next set bit from there.  It returns -1
182		 * if there are no more bits set or the start bit is
183		 * beyond the end of the bitmap.
184		 */
185		next_bit = xfs_next_bit(bip->bli_format.blf_data_map,
186						 bip->bli_format.blf_map_size,
187						 last_bit + 1);
188		/*
189		 * If we run out of bits, leave the loop,
190		 * else if we find a new set of bits bump the number of vecs,
191		 * else keep scanning the current set of bits.
192		 */
193		if (next_bit == -1) {
194			last_bit = -1;
195		} else if (next_bit != last_bit + 1) {
196			last_bit = next_bit;
197			nvecs++;
198		} else if (xfs_buf_offset(bp, next_bit * XFS_BLI_CHUNK) !=
199			   (xfs_buf_offset(bp, last_bit * XFS_BLI_CHUNK) +
200			    XFS_BLI_CHUNK)) {
201			last_bit = next_bit;
202			nvecs++;
203		} else {
204			last_bit++;
205		}
206	}
207
208	xfs_buf_item_trace("SIZE NORM", bip);
209	return nvecs;
210}
211
212/*
213 * This is called to fill in the vector of log iovecs for the
214 * given log buf item.  It fills the first entry with a buf log
215 * format structure, and the rest point to contiguous chunks
216 * within the buffer.
217 */
218STATIC void
219xfs_buf_item_format(
220	xfs_buf_log_item_t	*bip,
221	xfs_log_iovec_t		*log_vector)
222{
223	uint		base_size;
224	uint		nvecs;
225	xfs_log_iovec_t	*vecp;
226	xfs_buf_t	*bp;
227	int		first_bit;
228	int		last_bit;
229	int		next_bit;
230	uint		nbits;
231	uint		buffer_offset;
232
233	ASSERT(atomic_read(&bip->bli_refcount) > 0);
234	ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
235	       (bip->bli_flags & XFS_BLI_STALE));
236	bp = bip->bli_buf;
237	vecp = log_vector;
238
239	/*
240	 * The size of the base structure is the size of the
241	 * declared structure plus the space for the extra words
242	 * of the bitmap.  We subtract one from the map size, because
243	 * the first element of the bitmap is accounted for in the
244	 * size of the base structure.
245	 */
246	base_size =
247		(uint)(sizeof(xfs_buf_log_format_t) +
248		       ((bip->bli_format.blf_map_size - 1) * sizeof(uint)));
249	vecp->i_addr = (xfs_caddr_t)&bip->bli_format;
250	vecp->i_len = base_size;
251	XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BFORMAT);
252	vecp++;
253	nvecs = 1;
254
255	if (bip->bli_flags & XFS_BLI_STALE) {
256		/*
257		 * The buffer is stale, so all we need to log
258		 * is the buf log format structure with the
259		 * cancel flag in it.
260		 */
261		xfs_buf_item_trace("FORMAT STALE", bip);
262		ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
263		bip->bli_format.blf_size = nvecs;
264		return;
265	}
266
267	/*
268	 * Fill in an iovec for each set of contiguous chunks.
269	 */
270	first_bit = xfs_next_bit(bip->bli_format.blf_data_map,
271					 bip->bli_format.blf_map_size, 0);
272	ASSERT(first_bit != -1);
273	last_bit = first_bit;
274	nbits = 1;
275	for (;;) {
276		/*
277		 * This takes the bit number to start looking from and
278		 * returns the next set bit from there.  It returns -1
279		 * if there are no more bits set or the start bit is
280		 * beyond the end of the bitmap.
281		 */
282		next_bit = xfs_next_bit(bip->bli_format.blf_data_map,
283						 bip->bli_format.blf_map_size,
284						 (uint)last_bit + 1);
285		/*
286		 * If we run out of bits fill in the last iovec and get
287		 * out of the loop.
288		 * Else if we start a new set of bits then fill in the
289		 * iovec for the series we were looking at and start
290		 * counting the bits in the new one.
291		 * Else we're still in the same set of bits so just
292		 * keep counting and scanning.
293		 */
294		if (next_bit == -1) {
295			buffer_offset = first_bit * XFS_BLI_CHUNK;
296			vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
297			vecp->i_len = nbits * XFS_BLI_CHUNK;
298			XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BCHUNK);
299			nvecs++;
300			break;
301		} else if (next_bit != last_bit + 1) {
302			buffer_offset = first_bit * XFS_BLI_CHUNK;
303			vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
304			vecp->i_len = nbits * XFS_BLI_CHUNK;
305			XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BCHUNK);
306			nvecs++;
307			vecp++;
308			first_bit = next_bit;
309			last_bit = next_bit;
310			nbits = 1;
311		} else if (xfs_buf_offset(bp, next_bit << XFS_BLI_SHIFT) !=
312			   (xfs_buf_offset(bp, last_bit << XFS_BLI_SHIFT) +
313			    XFS_BLI_CHUNK)) {
314			buffer_offset = first_bit * XFS_BLI_CHUNK;
315			vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
316			vecp->i_len = nbits * XFS_BLI_CHUNK;
317			XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BCHUNK);
318/* You would think we need to bump the nvecs here too, but we do not
319 * this number is used by recovery, and it gets confused by the boundary
320 * split here
321 *			nvecs++;
322 */
323			vecp++;
324			first_bit = next_bit;
325			last_bit = next_bit;
326			nbits = 1;
327		} else {
328			last_bit++;
329			nbits++;
330		}
331	}
332	bip->bli_format.blf_size = nvecs;
333
334	/*
335	 * Check to make sure everything is consistent.
336	 */
337	xfs_buf_item_trace("FORMAT NORM", bip);
338	xfs_buf_item_log_check(bip);
339}
340
341/*
342 * This is called to pin the buffer associated with the buf log
343 * item in memory so it cannot be written out.  Simply call bpin()
344 * on the buffer to do this.
345 */
346STATIC void
347xfs_buf_item_pin(
348	xfs_buf_log_item_t	*bip)
349{
350	xfs_buf_t	*bp;
351
352	bp = bip->bli_buf;
353	ASSERT(XFS_BUF_ISBUSY(bp));
354	ASSERT(atomic_read(&bip->bli_refcount) > 0);
355	ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
356	       (bip->bli_flags & XFS_BLI_STALE));
357	xfs_buf_item_trace("PIN", bip);
358	xfs_buftrace("XFS_PIN", bp);
359	xfs_bpin(bp);
360}
361
362
363/*
364 * This is called to unpin the buffer associated with the buf log
365 * item which was previously pinned with a call to xfs_buf_item_pin().
366 * Just call bunpin() on the buffer to do this.
367 *
368 * Also drop the reference to the buf item for the current transaction.
369 * If the XFS_BLI_STALE flag is set and we are the last reference,
370 * then free up the buf log item and unlock the buffer.
371 */
372STATIC void
373xfs_buf_item_unpin(
374	xfs_buf_log_item_t	*bip,
375	int			stale)
376{
377	xfs_mount_t	*mp;
378	xfs_buf_t	*bp;
379	int		freed;
380	SPLDECL(s);
381
382	bp = bip->bli_buf;
383	ASSERT(bp != NULL);
384	ASSERT(XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *) == bip);
385	ASSERT(atomic_read(&bip->bli_refcount) > 0);
386	xfs_buf_item_trace("UNPIN", bip);
387	xfs_buftrace("XFS_UNPIN", bp);
388
389	freed = atomic_dec_and_test(&bip->bli_refcount);
390	mp = bip->bli_item.li_mountp;
391	xfs_bunpin(bp);
392	if (freed && stale) {
393		ASSERT(bip->bli_flags & XFS_BLI_STALE);
394		ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
395		ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
396		ASSERT(XFS_BUF_ISSTALE(bp));
397		ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
398		xfs_buf_item_trace("UNPIN STALE", bip);
399		xfs_buftrace("XFS_UNPIN STALE", bp);
400		/*
401		 * If we get called here because of an IO error, we may
402		 * or may not have the item on the AIL. xfs_trans_delete_ail()
403		 * will take care of that situation.
404		 * xfs_trans_delete_ail() drops the AIL lock.
405		 */
406		if (bip->bli_flags & XFS_BLI_STALE_INODE) {
407			xfs_buf_do_callbacks(bp, (xfs_log_item_t *)bip);
408			XFS_BUF_SET_FSPRIVATE(bp, NULL);
409			XFS_BUF_CLR_IODONE_FUNC(bp);
410		} else {
411			AIL_LOCK(mp,s);
412			xfs_trans_delete_ail(mp, (xfs_log_item_t *)bip, s);
413			xfs_buf_item_relse(bp);
414			ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL);
415		}
416		xfs_buf_relse(bp);
417	}
418}
419
420/*
421 * this is called from uncommit in the forced-shutdown path.
422 * we need to check to see if the reference count on the log item
423 * is going to drop to zero.  If so, unpin will free the log item
424 * so we need to free the item's descriptor (that points to the item)
425 * in the transaction.
426 */
427STATIC void
428xfs_buf_item_unpin_remove(
429	xfs_buf_log_item_t	*bip,
430	xfs_trans_t		*tp)
431{
432	xfs_buf_t		*bp;
433	xfs_log_item_desc_t	*lidp;
434	int			stale = 0;
435
436	bp = bip->bli_buf;
437	/*
438	 * will xfs_buf_item_unpin() call xfs_buf_item_relse()?
439	 */
440	if ((atomic_read(&bip->bli_refcount) == 1) &&
441	    (bip->bli_flags & XFS_BLI_STALE)) {
442		ASSERT(XFS_BUF_VALUSEMA(bip->bli_buf) <= 0);
443		xfs_buf_item_trace("UNPIN REMOVE", bip);
444		xfs_buftrace("XFS_UNPIN_REMOVE", bp);
445		/*
446		 * yes -- clear the xaction descriptor in-use flag
447		 * and free the chunk if required.  We can safely
448		 * do some work here and then call buf_item_unpin
449		 * to do the rest because if the if is true, then
450		 * we are holding the buffer locked so no one else
451		 * will be able to bump up the refcount.
452		 */
453		lidp = xfs_trans_find_item(tp, (xfs_log_item_t *) bip);
454		stale = lidp->lid_flags & XFS_LID_BUF_STALE;
455		xfs_trans_free_item(tp, lidp);
456		/*
457		 * Since the transaction no longer refers to the buffer,
458		 * the buffer should no longer refer to the transaction.
459		 */
460		XFS_BUF_SET_FSPRIVATE2(bp, NULL);
461	}
462
463	xfs_buf_item_unpin(bip, stale);
464
465	return;
466}
467
468/*
469 * This is called to attempt to lock the buffer associated with this
470 * buf log item.  Don't sleep on the buffer lock.  If we can't get
471 * the lock right away, return 0.  If we can get the lock, pull the
472 * buffer from the free list, mark it busy, and return 1.
473 */
474STATIC uint
475xfs_buf_item_trylock(
476	xfs_buf_log_item_t	*bip)
477{
478	xfs_buf_t	*bp;
479
480	bp = bip->bli_buf;
481
482	if (XFS_BUF_ISPINNED(bp)) {
483		return XFS_ITEM_PINNED;
484	}
485
486	if (!XFS_BUF_CPSEMA(bp)) {
487		return XFS_ITEM_LOCKED;
488	}
489
490	/*
491	 * Remove the buffer from the free list.  Only do this
492	 * if it's on the free list.  Private buffers like the
493	 * superblock buffer are not.
494	 */
495	XFS_BUF_HOLD(bp);
496
497	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
498	xfs_buf_item_trace("TRYLOCK SUCCESS", bip);
499	return XFS_ITEM_SUCCESS;
500}
501
502/*
503 * Release the buffer associated with the buf log item.
504 * If there is no dirty logged data associated with the
505 * buffer recorded in the buf log item, then free the
506 * buf log item and remove the reference to it in the
507 * buffer.
508 *
509 * This call ignores the recursion count.  It is only called
510 * when the buffer should REALLY be unlocked, regardless
511 * of the recursion count.
512 *
513 * If the XFS_BLI_HOLD flag is set in the buf log item, then
514 * free the log item if necessary but do not unlock the buffer.
515 * This is for support of xfs_trans_bhold(). Make sure the
516 * XFS_BLI_HOLD field is cleared if we don't free the item.
517 */
518STATIC void
519xfs_buf_item_unlock(
520	xfs_buf_log_item_t	*bip)
521{
522	int		aborted;
523	xfs_buf_t	*bp;
524	uint		hold;
525
526	bp = bip->bli_buf;
527	xfs_buftrace("XFS_UNLOCK", bp);
528
529	/*
530	 * Clear the buffer's association with this transaction.
531	 */
532	XFS_BUF_SET_FSPRIVATE2(bp, NULL);
533
534	/*
535	 * If this is a transaction abort, don't return early.
536	 * Instead, allow the brelse to happen.
537	 * Normally it would be done for stale (cancelled) buffers
538	 * at unpin time, but we'll never go through the pin/unpin
539	 * cycle if we abort inside commit.
540	 */
541	aborted = (bip->bli_item.li_flags & XFS_LI_ABORTED) != 0;
542
543	/*
544	 * If the buf item is marked stale, then don't do anything.
545	 * We'll unlock the buffer and free the buf item when the
546	 * buffer is unpinned for the last time.
547	 */
548	if (bip->bli_flags & XFS_BLI_STALE) {
549		bip->bli_flags &= ~XFS_BLI_LOGGED;
550		xfs_buf_item_trace("UNLOCK STALE", bip);
551		ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
552		if (!aborted)
553			return;
554	}
555
556	/*
557	 * Drop the transaction's reference to the log item if
558	 * it was not logged as part of the transaction.  Otherwise
559	 * we'll drop the reference in xfs_buf_item_unpin() when
560	 * the transaction is really through with the buffer.
561	 */
562	if (!(bip->bli_flags & XFS_BLI_LOGGED)) {
563		atomic_dec(&bip->bli_refcount);
564	} else {
565		/*
566		 * Clear the logged flag since this is per
567		 * transaction state.
568		 */
569		bip->bli_flags &= ~XFS_BLI_LOGGED;
570	}
571
572	/*
573	 * Before possibly freeing the buf item, determine if we should
574	 * release the buffer at the end of this routine.
575	 */
576	hold = bip->bli_flags & XFS_BLI_HOLD;
577	xfs_buf_item_trace("UNLOCK", bip);
578
579	/*
580	 * If the buf item isn't tracking any data, free it.
581	 * Otherwise, if XFS_BLI_HOLD is set clear it.
582	 */
583	if (xfs_count_bits(bip->bli_format.blf_data_map,
584			      bip->bli_format.blf_map_size, 0) == 0) {
585		xfs_buf_item_relse(bp);
586	} else if (hold) {
587		bip->bli_flags &= ~XFS_BLI_HOLD;
588	}
589
590	/*
591	 * Release the buffer if XFS_BLI_HOLD was not set.
592	 */
593	if (!hold) {
594		xfs_buf_relse(bp);
595	}
596}
597
598/*
599 * This is called to find out where the oldest active copy of the
600 * buf log item in the on disk log resides now that the last log
601 * write of it completed at the given lsn.
602 * We always re-log all the dirty data in a buffer, so usually the
603 * latest copy in the on disk log is the only one that matters.  For
604 * those cases we simply return the given lsn.
605 *
606 * The one exception to this is for buffers full of newly allocated
607 * inodes.  These buffers are only relogged with the XFS_BLI_INODE_BUF
608 * flag set, indicating that only the di_next_unlinked fields from the
609 * inodes in the buffers will be replayed during recovery.  If the
610 * original newly allocated inode images have not yet been flushed
611 * when the buffer is so relogged, then we need to make sure that we
612 * keep the old images in the 'active' portion of the log.  We do this
613 * by returning the original lsn of that transaction here rather than
614 * the current one.
615 */
616STATIC xfs_lsn_t
617xfs_buf_item_committed(
618	xfs_buf_log_item_t	*bip,
619	xfs_lsn_t		lsn)
620{
621	xfs_buf_item_trace("COMMITTED", bip);
622	if ((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
623	    (bip->bli_item.li_lsn != 0)) {
624		return bip->bli_item.li_lsn;
625	}
626	return (lsn);
627}
628
629/*
630 * This is called to asynchronously write the buffer associated with this
631 * buf log item out to disk. The buffer will already have been locked by
632 * a successful call to xfs_buf_item_trylock().  If the buffer still has
633 * B_DELWRI set, then get it going out to disk with a call to bawrite().
634 * If not, then just release the buffer.
635 */
636STATIC void
637xfs_buf_item_push(
638	xfs_buf_log_item_t	*bip)
639{
640	xfs_buf_t	*bp;
641
642	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
643	xfs_buf_item_trace("PUSH", bip);
644
645	bp = bip->bli_buf;
646
647	if (XFS_BUF_ISDELAYWRITE(bp)) {
648		xfs_bawrite(bip->bli_item.li_mountp, bp);
649	} else {
650		xfs_buf_relse(bp);
651	}
652}
653
654/* ARGSUSED */
655STATIC void
656xfs_buf_item_committing(xfs_buf_log_item_t *bip, xfs_lsn_t commit_lsn)
657{
658}
659
660/*
661 * This is the ops vector shared by all buf log items.
662 */
663static struct xfs_item_ops xfs_buf_item_ops = {
664	.iop_size	= (uint(*)(xfs_log_item_t*))xfs_buf_item_size,
665	.iop_format	= (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
666					xfs_buf_item_format,
667	.iop_pin	= (void(*)(xfs_log_item_t*))xfs_buf_item_pin,
668	.iop_unpin	= (void(*)(xfs_log_item_t*, int))xfs_buf_item_unpin,
669	.iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *))
670					xfs_buf_item_unpin_remove,
671	.iop_trylock	= (uint(*)(xfs_log_item_t*))xfs_buf_item_trylock,
672	.iop_unlock	= (void(*)(xfs_log_item_t*))xfs_buf_item_unlock,
673	.iop_committed	= (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
674					xfs_buf_item_committed,
675	.iop_push	= (void(*)(xfs_log_item_t*))xfs_buf_item_push,
676	.iop_pushbuf	= NULL,
677	.iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
678					xfs_buf_item_committing
679};
680
681
682/*
683 * Allocate a new buf log item to go with the given buffer.
684 * Set the buffer's b_fsprivate field to point to the new
685 * buf log item.  If there are other item's attached to the
686 * buffer (see xfs_buf_attach_iodone() below), then put the
687 * buf log item at the front.
688 */
689void
690xfs_buf_item_init(
691	xfs_buf_t	*bp,
692	xfs_mount_t	*mp)
693{
694	xfs_log_item_t		*lip;
695	xfs_buf_log_item_t	*bip;
696	int			chunks;
697	int			map_size;
698
699	/*
700	 * Check to see if there is already a buf log item for
701	 * this buffer.  If there is, it is guaranteed to be
702	 * the first.  If we do already have one, there is
703	 * nothing to do here so return.
704	 */
705	if (XFS_BUF_FSPRIVATE3(bp, xfs_mount_t *) != mp)
706		XFS_BUF_SET_FSPRIVATE3(bp, mp);
707	XFS_BUF_SET_BDSTRAT_FUNC(bp, xfs_bdstrat_cb);
708	if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) {
709		lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
710		if (lip->li_type == XFS_LI_BUF) {
711			return;
712		}
713	}
714
715	/*
716	 * chunks is the number of XFS_BLI_CHUNK size pieces
717	 * the buffer can be divided into. Make sure not to
718	 * truncate any pieces.  map_size is the size of the
719	 * bitmap needed to describe the chunks of the buffer.
720	 */
721	chunks = (int)((XFS_BUF_COUNT(bp) + (XFS_BLI_CHUNK - 1)) >> XFS_BLI_SHIFT);
722	map_size = (int)((chunks + NBWORD) >> BIT_TO_WORD_SHIFT);
723
724	bip = (xfs_buf_log_item_t*)kmem_zone_zalloc(xfs_buf_item_zone,
725						    KM_SLEEP);
726	bip->bli_item.li_type = XFS_LI_BUF;
727	bip->bli_item.li_ops = &xfs_buf_item_ops;
728	bip->bli_item.li_mountp = mp;
729	bip->bli_buf = bp;
730	bip->bli_format.blf_type = XFS_LI_BUF;
731	bip->bli_format.blf_blkno = (__int64_t)XFS_BUF_ADDR(bp);
732	bip->bli_format.blf_len = (ushort)BTOBB(XFS_BUF_COUNT(bp));
733	bip->bli_format.blf_map_size = map_size;
734#ifdef XFS_BLI_TRACE
735	bip->bli_trace = ktrace_alloc(XFS_BLI_TRACE_SIZE, KM_SLEEP);
736#endif
737
738#ifdef XFS_TRANS_DEBUG
739	/*
740	 * Allocate the arrays for tracking what needs to be logged
741	 * and what our callers request to be logged.  bli_orig
742	 * holds a copy of the original, clean buffer for comparison
743	 * against, and bli_logged keeps a 1 bit flag per byte in
744	 * the buffer to indicate which bytes the callers have asked
745	 * to have logged.
746	 */
747	bip->bli_orig = (char *)kmem_alloc(XFS_BUF_COUNT(bp), KM_SLEEP);
748	memcpy(bip->bli_orig, XFS_BUF_PTR(bp), XFS_BUF_COUNT(bp));
749	bip->bli_logged = (char *)kmem_zalloc(XFS_BUF_COUNT(bp) / NBBY, KM_SLEEP);
750#endif
751
752	/*
753	 * Put the buf item into the list of items attached to the
754	 * buffer at the front.
755	 */
756	if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) {
757		bip->bli_item.li_bio_list =
758				XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
759	}
760	XFS_BUF_SET_FSPRIVATE(bp, bip);
761}
762
763
764/*
765 * Mark bytes first through last inclusive as dirty in the buf
766 * item's bitmap.
767 */
768void
769xfs_buf_item_log(
770	xfs_buf_log_item_t	*bip,
771	uint			first,
772	uint			last)
773{
774	uint		first_bit;
775	uint		last_bit;
776	uint		bits_to_set;
777	uint		bits_set;
778	uint		word_num;
779	uint		*wordp;
780	uint		bit;
781	uint		end_bit;
782	uint		mask;
783
784	/*
785	 * Mark the item as having some dirty data for
786	 * quick reference in xfs_buf_item_dirty.
787	 */
788	bip->bli_flags |= XFS_BLI_DIRTY;
789
790	/*
791	 * Convert byte offsets to bit numbers.
792	 */
793	first_bit = first >> XFS_BLI_SHIFT;
794	last_bit = last >> XFS_BLI_SHIFT;
795
796	/*
797	 * Calculate the total number of bits to be set.
798	 */
799	bits_to_set = last_bit - first_bit + 1;
800
801	/*
802	 * Get a pointer to the first word in the bitmap
803	 * to set a bit in.
804	 */
805	word_num = first_bit >> BIT_TO_WORD_SHIFT;
806	wordp = &(bip->bli_format.blf_data_map[word_num]);
807
808	/*
809	 * Calculate the starting bit in the first word.
810	 */
811	bit = first_bit & (uint)(NBWORD - 1);
812
813	/*
814	 * First set any bits in the first word of our range.
815	 * If it starts at bit 0 of the word, it will be
816	 * set below rather than here.  That is what the variable
817	 * bit tells us. The variable bits_set tracks the number
818	 * of bits that have been set so far.  End_bit is the number
819	 * of the last bit to be set in this word plus one.
820	 */
821	if (bit) {
822		end_bit = MIN(bit + bits_to_set, (uint)NBWORD);
823		mask = ((1 << (end_bit - bit)) - 1) << bit;
824		*wordp |= mask;
825		wordp++;
826		bits_set = end_bit - bit;
827	} else {
828		bits_set = 0;
829	}
830
831	/*
832	 * Now set bits a whole word at a time that are between
833	 * first_bit and last_bit.
834	 */
835	while ((bits_to_set - bits_set) >= NBWORD) {
836		*wordp |= 0xffffffff;
837		bits_set += NBWORD;
838		wordp++;
839	}
840
841	/*
842	 * Finally, set any bits left to be set in one last partial word.
843	 */
844	end_bit = bits_to_set - bits_set;
845	if (end_bit) {
846		mask = (1 << end_bit) - 1;
847		*wordp |= mask;
848	}
849
850	xfs_buf_item_log_debug(bip, first, last);
851}
852
853
854/*
855 * Return 1 if the buffer has some data that has been logged (at any
856 * point, not just the current transaction) and 0 if not.
857 */
858uint
859xfs_buf_item_dirty(
860	xfs_buf_log_item_t	*bip)
861{
862	return (bip->bli_flags & XFS_BLI_DIRTY);
863}
864
865/*
866 * This is called when the buf log item is no longer needed.  It should
867 * free the buf log item associated with the given buffer and clear
868 * the buffer's pointer to the buf log item.  If there are no more
869 * items in the list, clear the b_iodone field of the buffer (see
870 * xfs_buf_attach_iodone() below).
871 */
872void
873xfs_buf_item_relse(
874	xfs_buf_t	*bp)
875{
876	xfs_buf_log_item_t	*bip;
877
878	xfs_buftrace("XFS_RELSE", bp);
879	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*);
880	XFS_BUF_SET_FSPRIVATE(bp, bip->bli_item.li_bio_list);
881	if ((XFS_BUF_FSPRIVATE(bp, void *) == NULL) &&
882	    (XFS_BUF_IODONE_FUNC(bp) != NULL)) {
883		XFS_BUF_CLR_IODONE_FUNC(bp);
884	}
885
886#ifdef XFS_TRANS_DEBUG
887	kmem_free(bip->bli_orig, XFS_BUF_COUNT(bp));
888	bip->bli_orig = NULL;
889	kmem_free(bip->bli_logged, XFS_BUF_COUNT(bp) / NBBY);
890	bip->bli_logged = NULL;
891#endif /* XFS_TRANS_DEBUG */
892
893#ifdef XFS_BLI_TRACE
894	ktrace_free(bip->bli_trace);
895#endif
896	kmem_zone_free(xfs_buf_item_zone, bip);
897}
898
899
900/*
901 * Add the given log item with its callback to the list of callbacks
902 * to be called when the buffer's I/O completes.  If it is not set
903 * already, set the buffer's b_iodone() routine to be
904 * xfs_buf_iodone_callbacks() and link the log item into the list of
905 * items rooted at b_fsprivate.  Items are always added as the second
906 * entry in the list if there is a first, because the buf item code
907 * assumes that the buf log item is first.
908 */
909void
910xfs_buf_attach_iodone(
911	xfs_buf_t	*bp,
912	void		(*cb)(xfs_buf_t *, xfs_log_item_t *),
913	xfs_log_item_t	*lip)
914{
915	xfs_log_item_t	*head_lip;
916
917	ASSERT(XFS_BUF_ISBUSY(bp));
918	ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
919
920	lip->li_cb = cb;
921	if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) {
922		head_lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
923		lip->li_bio_list = head_lip->li_bio_list;
924		head_lip->li_bio_list = lip;
925	} else {
926		XFS_BUF_SET_FSPRIVATE(bp, lip);
927	}
928
929	ASSERT((XFS_BUF_IODONE_FUNC(bp) == xfs_buf_iodone_callbacks) ||
930	       (XFS_BUF_IODONE_FUNC(bp) == NULL));
931	XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks);
932}
933
934STATIC void
935xfs_buf_do_callbacks(
936	xfs_buf_t	*bp,
937	xfs_log_item_t	*lip)
938{
939	xfs_log_item_t	*nlip;
940
941	while (lip != NULL) {
942		nlip = lip->li_bio_list;
943		ASSERT(lip->li_cb != NULL);
944		/*
945		 * Clear the next pointer so we don't have any
946		 * confusion if the item is added to another buf.
947		 * Don't touch the log item after calling its
948		 * callback, because it could have freed itself.
949		 */
950		lip->li_bio_list = NULL;
951		lip->li_cb(bp, lip);
952		lip = nlip;
953	}
954}
955
956/*
957 * This is the iodone() function for buffers which have had callbacks
958 * attached to them by xfs_buf_attach_iodone().  It should remove each
959 * log item from the buffer's list and call the callback of each in turn.
960 * When done, the buffer's fsprivate field is set to NULL and the buffer
961 * is unlocked with a call to iodone().
962 */
963void
964xfs_buf_iodone_callbacks(
965	xfs_buf_t	*bp)
966{
967	xfs_log_item_t	*lip;
968	static ulong	lasttime;
969	static xfs_buftarg_t *lasttarg;
970	xfs_mount_t	*mp;
971
972	ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
973	lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
974
975	if (XFS_BUF_GETERROR(bp) != 0) {
976		/*
977		 * If we've already decided to shutdown the filesystem
978		 * because of IO errors, there's no point in giving this
979		 * a retry.
980		 */
981		mp = lip->li_mountp;
982		if (XFS_FORCED_SHUTDOWN(mp)) {
983			ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp);
984			XFS_BUF_SUPER_STALE(bp);
985			xfs_buftrace("BUF_IODONE_CB", bp);
986			xfs_buf_do_callbacks(bp, lip);
987			XFS_BUF_SET_FSPRIVATE(bp, NULL);
988			XFS_BUF_CLR_IODONE_FUNC(bp);
989
990			/*
991			 * XFS_SHUT flag gets set when we go thru the
992			 * entire buffer cache and deliberately start
993			 * throwing away delayed write buffers.
994			 * Since there's no biowait done on those,
995			 * we should just brelse them.
996			 */
997			if (XFS_BUF_ISSHUT(bp)) {
998			    XFS_BUF_UNSHUT(bp);
999				xfs_buf_relse(bp);
1000			} else {
1001				xfs_biodone(bp);
1002			}
1003
1004			return;
1005		}
1006
1007		if ((XFS_BUF_TARGET(bp) != lasttarg) ||
1008		    (time_after(jiffies, (lasttime + 5*HZ)))) {
1009			lasttime = jiffies;
1010			cmn_err(CE_ALERT, "Device %s, XFS metadata write error"
1011					" block 0x%llx in %s",
1012				XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)),
1013			      (__uint64_t)XFS_BUF_ADDR(bp), mp->m_fsname);
1014		}
1015		lasttarg = XFS_BUF_TARGET(bp);
1016
1017		if (XFS_BUF_ISASYNC(bp)) {
1018			/*
1019			 * If the write was asynchronous then noone will be
1020			 * looking for the error.  Clear the error state
1021			 * and write the buffer out again delayed write.
1022			 *
1023			 * XXXsup This is OK, so long as we catch these
1024			 * before we start the umount; we don't want these
1025			 * DELWRI metadata bufs to be hanging around.
1026			 */
1027			XFS_BUF_ERROR(bp,0); /* errno of 0 unsets the flag */
1028
1029			if (!(XFS_BUF_ISSTALE(bp))) {
1030				XFS_BUF_DELAYWRITE(bp);
1031				XFS_BUF_DONE(bp);
1032				XFS_BUF_SET_START(bp);
1033			}
1034			ASSERT(XFS_BUF_IODONE_FUNC(bp));
1035			xfs_buftrace("BUF_IODONE ASYNC", bp);
1036			xfs_buf_relse(bp);
1037		} else {
1038			/*
1039			 * If the write of the buffer was not asynchronous,
1040			 * then we want to make sure to return the error
1041			 * to the caller of bwrite().  Because of this we
1042			 * cannot clear the B_ERROR state at this point.
1043			 * Instead we install a callback function that
1044			 * will be called when the buffer is released, and
1045			 * that routine will clear the error state and
1046			 * set the buffer to be written out again after
1047			 * some delay.
1048			 */
1049			/* We actually overwrite the existing b-relse
1050			   function at times, but we're gonna be shutting down
1051			   anyway. */
1052			XFS_BUF_SET_BRELSE_FUNC(bp,xfs_buf_error_relse);
1053			XFS_BUF_DONE(bp);
1054			XFS_BUF_V_IODONESEMA(bp);
1055		}
1056		return;
1057	}
1058#ifdef XFSERRORDEBUG
1059	xfs_buftrace("XFS BUFCB NOERR", bp);
1060#endif
1061	xfs_buf_do_callbacks(bp, lip);
1062	XFS_BUF_SET_FSPRIVATE(bp, NULL);
1063	XFS_BUF_CLR_IODONE_FUNC(bp);
1064	xfs_biodone(bp);
1065}
1066
1067/*
1068 * This is a callback routine attached to a buffer which gets an error
1069 * when being written out synchronously.
1070 */
1071STATIC void
1072xfs_buf_error_relse(
1073	xfs_buf_t	*bp)
1074{
1075	xfs_log_item_t	*lip;
1076	xfs_mount_t	*mp;
1077
1078	lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
1079	mp = (xfs_mount_t *)lip->li_mountp;
1080	ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp);
1081
1082	XFS_BUF_STALE(bp);
1083	XFS_BUF_DONE(bp);
1084	XFS_BUF_UNDELAYWRITE(bp);
1085	XFS_BUF_ERROR(bp,0);
1086	xfs_buftrace("BUF_ERROR_RELSE", bp);
1087	if (! XFS_FORCED_SHUTDOWN(mp))
1088		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1089	/*
1090	 * We have to unpin the pinned buffers so do the
1091	 * callbacks.
1092	 */
1093	xfs_buf_do_callbacks(bp, lip);
1094	XFS_BUF_SET_FSPRIVATE(bp, NULL);
1095	XFS_BUF_CLR_IODONE_FUNC(bp);
1096	XFS_BUF_SET_BRELSE_FUNC(bp,NULL);
1097	xfs_buf_relse(bp);
1098}
1099
1100
1101/*
1102 * This is the iodone() function for buffers which have been
1103 * logged.  It is called when they are eventually flushed out.
1104 * It should remove the buf item from the AIL, and free the buf item.
1105 * It is called by xfs_buf_iodone_callbacks() above which will take
1106 * care of cleaning up the buffer itself.
1107 */
1108/* ARGSUSED */
1109void
1110xfs_buf_iodone(
1111	xfs_buf_t		*bp,
1112	xfs_buf_log_item_t	*bip)
1113{
1114	struct xfs_mount	*mp;
1115	SPLDECL(s);
1116
1117	ASSERT(bip->bli_buf == bp);
1118
1119	mp = bip->bli_item.li_mountp;
1120
1121	/*
1122	 * If we are forcibly shutting down, this may well be
1123	 * off the AIL already. That's because we simulate the
1124	 * log-committed callbacks to unpin these buffers. Or we may never
1125	 * have put this item on AIL because of the transaction was
1126	 * aborted forcibly. xfs_trans_delete_ail() takes care of these.
1127	 *
1128	 * Either way, AIL is useless if we're forcing a shutdown.
1129	 */
1130	AIL_LOCK(mp,s);
1131	/*
1132	 * xfs_trans_delete_ail() drops the AIL lock.
1133	 */
1134	xfs_trans_delete_ail(mp, (xfs_log_item_t *)bip, s);
1135
1136#ifdef XFS_TRANS_DEBUG
1137	kmem_free(bip->bli_orig, XFS_BUF_COUNT(bp));
1138	bip->bli_orig = NULL;
1139	kmem_free(bip->bli_logged, XFS_BUF_COUNT(bp) / NBBY);
1140	bip->bli_logged = NULL;
1141#endif /* XFS_TRANS_DEBUG */
1142
1143#ifdef XFS_BLI_TRACE
1144	ktrace_free(bip->bli_trace);
1145#endif
1146	kmem_zone_free(xfs_buf_item_zone, bip);
1147}
1148
#if defined(XFS_BLI_TRACE)
/*
 * Add one entry to a buf log item's trace buffer.
 *
 * The entry holds the caller-supplied tag 'id' followed by a snapshot
 * of the log item (flags, recursion count, reference count, descriptor
 * and item flags) and of the buffer it is attached to (address split
 * into two 32-bit halves, byte count, buffer flags, private pointers,
 * pin state, iodone function and semaphore value).
 */
void
xfs_buf_item_trace(
	char			*id,
	xfs_buf_log_item_t	*bip)
{
	xfs_buf_t		*bp;
	unsigned long		daddr_hi;
	unsigned long		daddr_lo;

	ASSERT(bip->bli_trace != NULL);

	bp = bip->bli_buf;

	/* Split the buffer address so each half fits in a trace slot. */
	daddr_hi = (unsigned long)(0xFFFFFFFF & (XFS_BUF_ADDR(bp) >> 32));
	daddr_lo = (unsigned long)(0xFFFFFFFF & XFS_BUF_ADDR(bp));

	ktrace_enter(bip->bli_trace,
		     (void *)id,
		     (void *)bp,
		     (void *)((unsigned long)bip->bli_flags),
		     (void *)((unsigned long)bip->bli_recur),
		     (void *)((unsigned long)atomic_read(&bip->bli_refcount)),
		     (void *)daddr_hi,
		     (void *)daddr_lo,
		     (void *)((unsigned long)XFS_BUF_COUNT(bp)),
		     (void *)((unsigned long)XFS_BUF_BFLAGS(bp)),
		     XFS_BUF_FSPRIVATE(bp, void *),
		     XFS_BUF_FSPRIVATE2(bp, void *),
		     (void *)(unsigned long)XFS_BUF_ISPINNED(bp),
		     (void *)XFS_BUF_IODONE_FUNC(bp),
		     (void *)((unsigned long)(XFS_BUF_VALUSEMA(bp))),
		     (void *)bip->bli_item.li_desc,
		     (void *)((unsigned long)bip->bli_item.li_flags));
}
#endif /* XFS_BLI_TRACE */
1179