1/*
2 * linux/fs/transaction.c
3 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
5 *
6 * Copyright 1998 Red Hat corp --- All Rights Reserved
7 *
8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference.
11 *
12 * Generic filesystem transaction handling code; part of the ext2fs
13 * journaling system.
14 *
15 * This file manages transactions (compound commits managed by the
16 * journaling code) and handles (individual atomic operations by the
17 * filesystem).
18 */
19
20#include <linux/sched.h>
21#include <linux/fs.h>
22#include <linux/jbd.h>
23#include <linux/errno.h>
24#include <linux/slab.h>
25#include <linux/locks.h>
26#include <linux/timer.h>
27#include <linux/smp_lock.h>
28#include <linux/mm.h>
29
30extern spinlock_t journal_datalist_lock;
31
32/*
33 * get_transaction: obtain a new transaction_t object.
34 *
35 * Simply allocate and initialise a new transaction.  Create it in
36 * RUNNING state and add it to the current journal (which should not
37 * have an existing running transaction: we only make a new transaction
38 * once we have started to commit the old one).
39 *
40 * Preconditions:
41 *	The journal MUST be locked.  We don't perform atomic mallocs on the
42 *	new transaction	and we can't block without protecting against other
43 *	processes trying to touch the journal while it is in transition.
44 */
45
46static transaction_t * get_transaction (journal_t * journal, int is_try)
47{
48	transaction_t * transaction;
49
50	transaction = jbd_kmalloc (sizeof (transaction_t), GFP_NOFS);
51	if (!transaction)
52		return NULL;
53
54	memset (transaction, 0, sizeof (transaction_t));
55
56	transaction->t_journal = journal;
57	transaction->t_state = T_RUNNING;
58	transaction->t_tid = journal->j_transaction_sequence++;
59	transaction->t_expires = jiffies + journal->j_commit_interval;
60	INIT_LIST_HEAD(&transaction->t_jcb);
61
62	/* Set up the commit timer for the new transaction. */
63	J_ASSERT (!journal->j_commit_timer_active);
64	journal->j_commit_timer_active = 1;
65	journal->j_commit_timer->expires = transaction->t_expires;
66	add_timer(journal->j_commit_timer);
67
68	J_ASSERT (journal->j_running_transaction == NULL);
69	journal->j_running_transaction = transaction;
70
71	return transaction;
72}
73
74/*
75 * Handle management.
76 *
77 * A handle_t is an object which represents a single atomic update to a
78 * filesystem, and which tracks all of the modifications which form part
79 * of that one update.
80 */
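
/*
 * Illustrative sketch (not part of the original source; the buffer and
 * the credit count are assumptions made for the example): a filesystem
 * brackets one atomic update with a handle roughly like this:
 *
 *	handle_t *handle = journal_start(journal, 1);
 *	if (IS_ERR(handle))
 *		return PTR_ERR(handle);
 *	err = journal_get_write_access(handle, bh);
 *	... modify the buffer contents ...
 *	err = journal_dirty_metadata(handle, bh);
 *	err = journal_stop(handle);
 *
 * Everything dirtied under the handle then commits atomically as part
 * of whichever compound transaction the handle was attached to.
 */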
81
82/*
83 * start_this_handle: Given a handle, deal with any locking or stalling
84 * needed to make sure that there is enough journal space for the handle
85 * to begin.  Attach the handle to a transaction and set up the
86 * transaction's buffer credits.
87 */
88
89static int start_this_handle(journal_t *journal, handle_t *handle)
90{
91	transaction_t *transaction;
92	int needed;
93	int nblocks = handle->h_buffer_credits;
94
95	if (nblocks > journal->j_max_transaction_buffers) {
96		jbd_debug(1, "JBD: %s wants too many credits (%d > %d)\n",
97		       current->comm, nblocks,
98		       journal->j_max_transaction_buffers);
99		return -ENOSPC;
100	}
101
102	jbd_debug(3, "New handle %p going live.\n", handle);
103
104repeat:
105
106	lock_journal(journal);
107
108repeat_locked:
109
110	if (is_journal_aborted(journal) ||
111	    (journal->j_errno != 0 && !(journal->j_flags & JFS_ACK_ERR))) {
112		unlock_journal(journal);
113		return -EROFS;
114	}
115
116	/* Wait on the journal's transaction barrier if necessary */
117	if (journal->j_barrier_count) {
118		unlock_journal(journal);
119		sleep_on(&journal->j_wait_transaction_locked);
120		goto repeat;
121	}
122
123	if (!journal->j_running_transaction)
124		get_transaction(journal, 0);
125	/* @@@ Error? */
126	J_ASSERT(journal->j_running_transaction);
127
128	transaction = journal->j_running_transaction;
129
130	/* If the current transaction is locked down for commit, wait
131	 * for the lock to be released. */
132
133	if (transaction->t_state == T_LOCKED) {
134		unlock_journal(journal);
135		jbd_debug(3, "Handle %p stalling...\n", handle);
136		sleep_on(&journal->j_wait_transaction_locked);
137		goto repeat;
138	}
139
140	/* If there is not enough space left in the log to write all
141	 * potential buffers requested by this operation, we need to
142	 * stall pending a log checkpoint to free some more log
143	 * space. */
144
145	needed = transaction->t_outstanding_credits + nblocks;
146
147	if (needed > journal->j_max_transaction_buffers) {
148		/* If the current transaction is already too large, then
149		 * start to commit it: we can then go back and attach
150		 * this handle to a new transaction. */
151
152		jbd_debug(2, "Handle %p starting new commit...\n", handle);
153		log_start_commit(journal, transaction);
154		unlock_journal(journal);
155		sleep_on(&journal->j_wait_transaction_locked);
156		lock_journal(journal);
157		goto repeat_locked;
158	}
159
160	/*
161	 * The commit code assumes that it can get enough log space
162	 * without forcing a checkpoint.  This is *critical* for
163	 * correctness: a checkpoint of a buffer which is also
164	 * associated with a committing transaction creates a deadlock,
165	 * so commit simply cannot force through checkpoints.
166	 *
167	 * We must therefore ensure the necessary space in the journal
168	 * *before* starting to dirty potentially checkpointed buffers
169	 * in the new transaction.
170	 *
171	 * The worst part is, any transaction currently committing can
172	 * reduce the free space arbitrarily.  Be careful to account for
173	 * those buffers when checkpointing.
174	 */
175
176	/*
177	 * @@@ AKPM: This seems rather over-defensive.  We're giving commit
178	 * a _lot_ of headroom: 1/4 of the journal plus the size of
179	 * the committing transaction.  Really, we only need to give it
180	 * committing_transaction->t_outstanding_credits plus "enough" for
181	 * the log control blocks.
 * Also, this test is inconsistent with the matching one in
183	 * journal_extend().
184	 */
185	needed = journal->j_max_transaction_buffers;
186	if (journal->j_committing_transaction)
187		needed += journal->j_committing_transaction->
188					t_outstanding_credits;
189
190	if (log_space_left(journal) < needed) {
191		jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle);
192		log_wait_for_space(journal, needed);
193		goto repeat_locked;
194	}
195
196	/* OK, account for the buffers that this operation expects to
197	 * use and add the handle to the running transaction. */
198
199	handle->h_transaction = transaction;
200	transaction->t_outstanding_credits += nblocks;
201	transaction->t_updates++;
202	transaction->t_handle_count++;
203	jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n",
204		  handle, nblocks, transaction->t_outstanding_credits,
205		  log_space_left(journal));
206
207	unlock_journal(journal);
208
209	return 0;
210}
211
212/* Allocate a new handle.  This should probably be in a slab... */
213static handle_t *new_handle(int nblocks)
214{
215	handle_t *handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS);
216	if (!handle)
217		return NULL;
218	memset(handle, 0, sizeof (handle_t));
219	handle->h_buffer_credits = nblocks;
220	handle->h_ref = 1;
221	INIT_LIST_HEAD(&handle->h_jcb);
222
223	return handle;
224}
225
226/*
227 * Obtain a new handle.
228 *
229 * We make sure that the transaction can guarantee at least nblocks of
230 * modified buffers in the log.  We block until the log can guarantee
231 * that much space.
232 *
233 * This function is visible to journal users (like ext2fs), so is not
234 * called with the journal already locked.
235 *
 * Return a pointer to a newly allocated handle, or an ERR_PTR() value
 * on failure.
237 */
238
239handle_t *journal_start(journal_t *journal, int nblocks)
240{
241	handle_t *handle = journal_current_handle();
242	int err;
243
244	if (!journal)
245		return ERR_PTR(-EROFS);
246
247	if (handle) {
248		J_ASSERT(handle->h_transaction->t_journal == journal);
249		handle->h_ref++;
250		return handle;
251	}
252
253	handle = new_handle(nblocks);
254	if (!handle)
255		return ERR_PTR(-ENOMEM);
256
257	current->journal_info = handle;
258
259	err = start_this_handle(journal, handle);
260	if (err < 0) {
261		kfree(handle);
262		current->journal_info = NULL;
263		return ERR_PTR(err);
264	}
265
266	return handle;
267}
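
/*
 * Note on nesting (illustrative, not part of the original source): if
 * the current task already owns a live handle, journal_start() simply
 * bumps h_ref and returns that same handle (the nblocks argument is
 * ignored in this case), so nested start/stop pairs must balance:
 *
 *	handle_t *h1 = journal_start(journal, 4);
 *	handle_t *h2 = journal_start(journal, 2);	(h2 == h1, h_ref is 2)
 *	journal_stop(h2);				(h_ref drops to 1)
 *	journal_stop(h1);				(handle released)
 */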
268
269/*
270 * Return zero on success
271 */
272static int try_start_this_handle(journal_t *journal, handle_t *handle)
273{
274	transaction_t *transaction;
275	int needed;
276	int nblocks = handle->h_buffer_credits;
277	int ret = 0;
278
279	jbd_debug(3, "New handle %p maybe going live.\n", handle);
280
281	lock_journal(journal);
282
283	if (is_journal_aborted(journal) ||
284	    (journal->j_errno != 0 && !(journal->j_flags & JFS_ACK_ERR))) {
285		ret = -EROFS;
286		goto fail_unlock;
287	}
288
289	if (journal->j_barrier_count)
290		goto fail_unlock;
291
292	if (!journal->j_running_transaction && get_transaction(journal, 1) == 0)
293		goto fail_unlock;
294
295	transaction = journal->j_running_transaction;
296	if (transaction->t_state == T_LOCKED)
297		goto fail_unlock;
298
299	needed = transaction->t_outstanding_credits + nblocks;
300	/* We could run log_start_commit here */
301	if (needed > journal->j_max_transaction_buffers)
302		goto fail_unlock;
303
304	needed = journal->j_max_transaction_buffers;
305	if (journal->j_committing_transaction)
306		needed += journal->j_committing_transaction->
307						t_outstanding_credits;
308
309	if (log_space_left(journal) < needed)
310		goto fail_unlock;
311
312	handle->h_transaction = transaction;
313	transaction->t_outstanding_credits += nblocks;
314	transaction->t_updates++;
315	jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n",
316		  handle, nblocks, transaction->t_outstanding_credits,
317		  log_space_left(journal));
318	unlock_journal(journal);
319	return 0;
320
321fail_unlock:
322	unlock_journal(journal);
323	if (ret >= 0)
324		ret = -1;
325	return ret;
326}
327
328/*
 * Try to start a handle, but without blocking.  If we weren't able
 * to, return an ERR_PTR value.
331 */
332handle_t *journal_try_start(journal_t *journal, int nblocks)
333{
334	handle_t *handle = journal_current_handle();
335	int err;
336
337	if (!journal)
338		return ERR_PTR(-EROFS);
339
340	if (handle) {
341		jbd_debug(4, "h_ref %d -> %d\n",
342				handle->h_ref,
343				handle->h_ref + 1);
344		J_ASSERT(handle->h_transaction->t_journal == journal);
345		if (is_handle_aborted(handle))
346			return ERR_PTR(-EIO);
347		handle->h_ref++;
348		return handle;
349	} else {
350		jbd_debug(4, "no current transaction\n");
351	}
352
353	if (is_journal_aborted(journal))
354		return ERR_PTR(-EIO);
355
356	handle = new_handle(nblocks);
357	if (!handle)
358		return ERR_PTR(-ENOMEM);
359
360	current->journal_info = handle;
361
362	err = try_start_this_handle(journal, handle);
363	if (err < 0) {
364		kfree(handle);
365		current->journal_info = NULL;
366		return ERR_PTR(err);
367	}
368
369	return handle;
370}
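
/*
 * Illustrative sketch (an assumption, not taken from any real caller):
 * code running where it must not sleep can attempt a non-blocking start
 * and defer the update if the journal is busy:
 *
 *	handle = journal_try_start(journal, 1);
 *	if (IS_ERR(handle)) {
 *		... queue the update and retry later, or fall back to
 *		... journal_start() from a context which may block
 *	}
 */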
371
372/*
373 * journal_extend: extend buffer credits.
374 *
375 * Some transactions, such as large extends and truncates, can be done
376 * atomically all at once or in several stages.  The operation requests
 * a credit for a number of buffer modifications in advance, but can
378 * extend its credit if it needs more.
379 *
380 * journal_extend tries to give the running handle more buffer credits.
 * It does not guarantee the allocation: this is best-effort only.
382 * The calling process MUST be able to deal cleanly with a failure to
383 * extend here.
384 *
385 * Return 0 on success, non-zero on failure.
386 *
387 * return code < 0 implies an error
388 * return code > 0 implies normal transaction-full status.
389 */
390
391int journal_extend (handle_t *handle, int nblocks)
392{
393	transaction_t *transaction = handle->h_transaction;
394	journal_t *journal = transaction->t_journal;
395	int result;
396	int wanted;
397
398	lock_journal (journal);
399
400	result = -EIO;
401	if (is_handle_aborted(handle))
402		goto error_out;
403
404	result = 1;
405
406	/* Don't extend a locked-down transaction! */
407	if (handle->h_transaction->t_state != T_RUNNING) {
408		jbd_debug(3, "denied handle %p %d blocks: "
409			  "transaction not running\n", handle, nblocks);
410		goto error_out;
411	}
412
413	wanted = transaction->t_outstanding_credits + nblocks;
414
415	if (wanted > journal->j_max_transaction_buffers) {
416		jbd_debug(3, "denied handle %p %d blocks: "
417			  "transaction too large\n", handle, nblocks);
418		goto error_out;
419	}
420
421	if (wanted > log_space_left(journal)) {
422		jbd_debug(3, "denied handle %p %d blocks: "
423			  "insufficient log space\n", handle, nblocks);
424		goto error_out;
425	}
426
427	handle->h_buffer_credits += nblocks;
428	transaction->t_outstanding_credits += nblocks;
429	result = 0;
430
431	jbd_debug(3, "extended handle %p by %d\n", handle, nblocks);
432
433error_out:
434	unlock_journal (journal);
435	return result;
436}
437
438
439/*
440 * journal_restart: restart a handle for a multi-transaction filesystem
441 * operation.
442 *
443 * If the journal_extend() call above fails to grant new buffer credits
444 * to a running handle, a call to journal_restart will commit the
445 * handle's transaction so far and reattach the handle to a new
 * transaction capable of guaranteeing the requested number of
447 * credits.
448 */
449
450int journal_restart(handle_t *handle, int nblocks)
451{
452	transaction_t *transaction = handle->h_transaction;
453	journal_t *journal = transaction->t_journal;
454	int ret;
455
456	/* If we've had an abort of any type, don't even think about
457	 * actually doing the restart! */
458	if (is_handle_aborted(handle))
459		return 0;
460
461	/* First unlink the handle from its current transaction, and
462	 * start the commit on that. */
463
464	J_ASSERT (transaction->t_updates > 0);
465	J_ASSERT (journal_current_handle() == handle);
466
467	transaction->t_outstanding_credits -= handle->h_buffer_credits;
468	transaction->t_updates--;
469
470	if (!transaction->t_updates)
471		wake_up(&journal->j_wait_updates);
472
473	jbd_debug(2, "restarting handle %p\n", handle);
474	log_start_commit(journal, transaction);
475
476	handle->h_buffer_credits = nblocks;
477	ret = start_this_handle(journal, handle);
478	return ret;
479}
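
/*
 * Illustrative sketch of the extend/restart pattern (the credit counts
 * are assumptions for the example): a long-running operation such as a
 * truncate asks for more credits and, failing that, restarts against a
 * fresh transaction:
 *
 *	if (journal_extend(handle, 8) != 0) {
 *		err = journal_restart(handle, 8);
 *		if (err)
 *			goto fail;
 *	}
 *
 * After journal_restart() the caller should re-obtain write access to
 * any buffers it still intends to modify, since those buffers may now
 * belong to the committing transaction.
 */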
480
481
482/*
483 * Barrier operation: establish a transaction barrier.
484 *
485 * This locks out any further updates from being started, and blocks
486 * until all existing updates have completed, returning only once the
487 * journal is in a quiescent state with no updates running.
488 *
489 * The journal lock should not be held on entry.
490 */
491
492void journal_lock_updates (journal_t *journal)
493{
494	lock_journal(journal);
495	++journal->j_barrier_count;
496
497	/* Wait until there are no running updates */
498	while (1) {
499		transaction_t *transaction = journal->j_running_transaction;
500		if (!transaction)
501			break;
502		if (!transaction->t_updates)
503			break;
504
505		unlock_journal(journal);
506		sleep_on(&journal->j_wait_updates);
507		lock_journal(journal);
508	}
509
510	unlock_journal(journal);
511
512	/* We have now established a barrier against other normal
513	 * updates, but we also need to barrier against other
514	 * journal_lock_updates() calls to make sure that we serialise
515	 * special journal-locked operations too. */
516	down(&journal->j_barrier);
517}
518
519/*
520 * Release a transaction barrier obtained with journal_lock_updates().
521 *
522 * Should be called without the journal lock held.
523 */
524
525void journal_unlock_updates (journal_t *journal)
526{
527	lock_journal(journal);
528
529	J_ASSERT (journal->j_barrier_count != 0);
530
531	up(&journal->j_barrier);
532	--journal->j_barrier_count;
533	wake_up(&journal->j_wait_transaction_locked);
534	unlock_journal(journal);
535}
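
/*
 * Illustrative sketch (not part of the original source; journal_flush()
 * is defined elsewhere in JBD): the barrier pair is used around
 * operations which need a quiescent journal, such as making a
 * filesystem-wide snapshot consistent:
 *
 *	journal_lock_updates(journal);
 *	journal_flush(journal);
 *	... perform the special operation ...
 *	journal_unlock_updates(journal);
 */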
536
537/*
538 * journal_get_write_access: notify intent to modify a buffer for metadata
539 * (not data) update.
540 *
541 * If the buffer is already part of the current transaction, then there
542 * is nothing we need to do.  If it is already part of a prior
543 * transaction which we are still committing to disk, then we need to
544 * make sure that we do not overwrite the old copy: we do copy-out to
545 * preserve the copy going to disk.  We also account the buffer against
546 * the handle's metadata buffer credits (unless the buffer is already
547 * part of the transaction, that is).
548 *
549 * Returns an error code or 0 on success.
550 *
551 * In full data journalling mode the buffer may be of type BJ_AsyncData,
552 * because we're write()ing a buffer which is also part of a shared mapping.
553 */
554
555static int
556do_get_write_access(handle_t *handle, struct journal_head *jh, int force_copy)
557{
558	struct buffer_head *bh;
559	transaction_t *transaction = handle->h_transaction;
560	journal_t *journal = transaction->t_journal;
561	int error;
562	char *frozen_buffer = NULL;
563	int need_copy = 0;
564	int locked;
565
566	jbd_debug(5, "buffer_head %p, force_copy %d\n", jh, force_copy);
567
568	JBUFFER_TRACE(jh, "entry");
569repeat:
570	bh = jh2bh(jh);
571
572	/* @@@ Need to check for errors here at some point. */
573
574	/*
575	 * AKPM: we have replaced all the lock_journal_bh_wait() stuff with a
576	 * simple lock_journal().  This code here will care for locked buffers.
577	 */
578	locked = test_and_set_bit(BH_Lock, &bh->b_state);
579	if (locked) {
580		/* We can't reliably test the buffer state if we found
581		 * it already locked, so just wait for the lock and
582		 * retry. */
583		unlock_journal(journal);
584		__wait_on_buffer(bh);
585		lock_journal(journal);
586		goto repeat;
587	}
588
589	/* We now hold the buffer lock so it is safe to query the buffer
590	 * state.  Is the buffer dirty?
591	 *
592	 * If so, there are two possibilities.  The buffer may be
593	 * non-journaled, and undergoing a quite legitimate writeback.
594	 * Otherwise, it is journaled, and we don't expect dirty buffers
 * in that state (the buffers should be marked BH_JBDDirty
596	 * instead.)  So either the IO is being done under our own
597	 * control and this is a bug, or it's a third party IO such as
598	 * dump(8) (which may leave the buffer scheduled for read ---
599	 * ie. locked but not dirty) or tune2fs (which may actually have
600	 * the buffer dirtied, ugh.)  */
601
602	if (buffer_dirty(bh)) {
603		spin_lock(&journal_datalist_lock);
604		/* First question: is this buffer already part of the
605		 * current transaction or the existing committing
606		 * transaction? */
607		if (jh->b_transaction) {
608			J_ASSERT_JH(jh, jh->b_transaction == transaction ||
609				    jh->b_transaction == journal->j_committing_transaction);
610			if (jh->b_next_transaction)
611				J_ASSERT_JH(jh, jh->b_next_transaction == transaction);
612			JBUFFER_TRACE(jh, "Unexpected dirty buffer");
613			jbd_unexpected_dirty_buffer(jh);
614		}
615		spin_unlock(&journal_datalist_lock);
616	}
617
618	unlock_buffer(bh);
619
620	error = -EROFS;
621	if (is_handle_aborted(handle))
622		goto out_unlocked;
623	error = 0;
624
625	spin_lock(&journal_datalist_lock);
626
627	/* The buffer is already part of this transaction if
628	 * b_transaction or b_next_transaction points to it. */
629
630	if (jh->b_transaction == transaction ||
631	    jh->b_next_transaction == transaction)
632		goto done_locked;
633
634	/* If there is already a copy-out version of this buffer, then
635	 * we don't need to make another one. */
636
637	if (jh->b_frozen_data) {
638		JBUFFER_TRACE(jh, "has frozen data");
639		J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
640		jh->b_next_transaction = transaction;
641
642		J_ASSERT_JH(jh, handle->h_buffer_credits > 0);
643		handle->h_buffer_credits--;
644		goto done_locked;
645	}
646
647	/* Is there data here we need to preserve? */
648
649	if (jh->b_transaction && jh->b_transaction != transaction) {
650		JBUFFER_TRACE(jh, "owned by older transaction");
651		J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
652		J_ASSERT_JH(jh, jh->b_transaction ==
653					journal->j_committing_transaction);
654
655		/* There is one case we have to be very careful about.
656		 * If the committing transaction is currently writing
657		 * this buffer out to disk and has NOT made a copy-out,
658		 * then we cannot modify the buffer contents at all
659		 * right now.  The essence of copy-out is that it is the
660		 * extra copy, not the primary copy, which gets
661		 * journaled.  If the primary copy is already going to
662		 * disk then we cannot do copy-out here. */
663
664		if (jh->b_jlist == BJ_Shadow) {
665			JBUFFER_TRACE(jh, "on shadow: sleep");
666			spin_unlock(&journal_datalist_lock);
667			unlock_journal(journal);
668			/* commit wakes up all shadow buffers after IO */
669			sleep_on(&jh2bh(jh)->b_wait);
670			lock_journal(journal);
671			goto repeat;
672		}
673
674		/* Only do the copy if the currently-owning transaction
675		 * still needs it.  If it is on the Forget list, the
676		 * committing transaction is past that stage.  The
677		 * buffer had better remain locked during the kmalloc,
678		 * but that should be true --- we hold the journal lock
679		 * still and the buffer is already on the BUF_JOURNAL
680		 * list so won't be flushed.
681		 *
682		 * Subtle point, though: if this is a get_undo_access,
683		 * then we will be relying on the frozen_data to contain
684		 * the new value of the committed_data record after the
685		 * transaction, so we HAVE to force the frozen_data copy
686		 * in that case. */
687
688		if (jh->b_jlist != BJ_Forget || force_copy) {
689			JBUFFER_TRACE(jh, "generate frozen data");
690			if (!frozen_buffer) {
691				JBUFFER_TRACE(jh, "allocate memory for buffer");
692				spin_unlock(&journal_datalist_lock);
693				unlock_journal(journal);
694				frozen_buffer = jbd_kmalloc(jh2bh(jh)->b_size,
695							    GFP_NOFS);
696				lock_journal(journal);
697				if (!frozen_buffer) {
698					printk(KERN_EMERG
699						"%s: OOM for frozen_buffer\n",
700						__FUNCTION__);
701					JBUFFER_TRACE(jh, "oom!");
702					error = -ENOMEM;
703					spin_lock(&journal_datalist_lock);
704					goto done_locked;
705				}
706				goto repeat;
707			}
708
709			jh->b_frozen_data = frozen_buffer;
710			frozen_buffer = NULL;
711			need_copy = 1;
712		}
713		jh->b_next_transaction = transaction;
714	}
715
716	J_ASSERT(handle->h_buffer_credits > 0);
717	handle->h_buffer_credits--;
718
719	/* Finally, if the buffer is not journaled right now, we need to
720	 * make sure it doesn't get written to disk before the caller
721	 * actually commits the new data. */
722
723	if (!jh->b_transaction) {
724		JBUFFER_TRACE(jh, "no transaction");
725		J_ASSERT_JH(jh, !jh->b_next_transaction);
726		jh->b_transaction = transaction;
727		JBUFFER_TRACE(jh, "file as BJ_Reserved");
728		__journal_file_buffer(jh, transaction, BJ_Reserved);
729	}
730
731done_locked:
732	spin_unlock(&journal_datalist_lock);
733	if (need_copy) {
734		struct page *page;
735		int offset;
736		char *source;
737
738		J_ASSERT_JH(jh, buffer_uptodate(jh2bh(jh)));
739		page = jh2bh(jh)->b_page;
740		offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK;
741		source = kmap(page);
742		memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
743		kunmap(page);
744	}
745
746
747	/* If we are about to journal a buffer, then any revoke pending
748           on it is no longer valid. */
749	journal_cancel_revoke(handle, jh);
750
751out_unlocked:
752	if (frozen_buffer)
753		kfree(frozen_buffer);
754
755	JBUFFER_TRACE(jh, "exit");
756	return error;
757}
758
759int journal_get_write_access (handle_t *handle, struct buffer_head *bh)
760{
761	transaction_t *transaction = handle->h_transaction;
762	journal_t *journal = transaction->t_journal;
763	struct journal_head *jh = journal_add_journal_head(bh);
764	int rc;
765
766	/* We do not want to get caught playing with fields which the
767	 * log thread also manipulates.  Make sure that the buffer
768	 * completes any outstanding IO before proceeding. */
769	lock_journal(journal);
770	rc = do_get_write_access(handle, jh, 0);
771	journal_unlock_journal_head(jh);
772	unlock_journal(journal);
773	return rc;
774}
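
/*
 * Illustrative sketch (assumption: "bh" holds a metadata block the
 * caller has already read): write access must be requested *before*
 * the buffer contents are modified, so that any copy-out happens while
 * the old contents are still intact:
 *
 *	err = journal_get_write_access(handle, bh);
 *	if (err)
 *		goto fail;
 *	... modify bh->b_data ...
 *	err = journal_dirty_metadata(handle, bh);
 */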
775
776
777/*
778 * When the user wants to journal a newly created buffer_head
779 * (ie. getblk() returned a new buffer and we are going to populate it
780 * manually rather than reading off disk), then we need to keep the
781 * buffer_head locked until it has been completely filled with new
782 * data.  In this case, we should be able to make the assertion that
783 * the bh is not already part of an existing transaction.
784 *
785 * The buffer should already be locked by the caller by this point.
786 * There is no lock ranking violation: it was a newly created,
787 * unlocked buffer beforehand. */
788
789int journal_get_create_access (handle_t *handle, struct buffer_head *bh)
790{
791	transaction_t *transaction = handle->h_transaction;
792	journal_t *journal = transaction->t_journal;
793	struct journal_head *jh = journal_add_journal_head(bh);
794	int err;
795
796	jbd_debug(5, "journal_head %p\n", jh);
797	lock_journal(journal);
798	err = -EROFS;
799	if (is_handle_aborted(handle))
800		goto out;
801	err = 0;
802
803	JBUFFER_TRACE(jh, "entry");
804	/* The buffer may already belong to this transaction due to
805	 * pre-zeroing in the filesystem's new_block code.  It may also
806	 * be on the previous, committing transaction's lists, but it
807	 * HAS to be in Forget state in that case: the transaction must
808	 * have deleted the buffer for it to be reused here. */
809	J_ASSERT_JH(jh, (jh->b_transaction == transaction ||
810			 jh->b_transaction == NULL ||
811			 (jh->b_transaction == journal->j_committing_transaction &&
812			  jh->b_jlist == BJ_Forget)));
813
814	J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
815	J_ASSERT_JH(jh, buffer_locked(jh2bh(jh)));
816
817	J_ASSERT_JH(jh, handle->h_buffer_credits > 0);
818	handle->h_buffer_credits--;
819
820	spin_lock(&journal_datalist_lock);
821	if (jh->b_transaction == NULL) {
822		jh->b_transaction = transaction;
823		JBUFFER_TRACE(jh, "file as BJ_Reserved");
824		__journal_file_buffer(jh, transaction, BJ_Reserved);
825		JBUFFER_TRACE(jh, "refile");
826		refile_buffer(jh2bh(jh));
827	} else if (jh->b_transaction == journal->j_committing_transaction) {
828		JBUFFER_TRACE(jh, "set next transaction");
829		jh->b_next_transaction = transaction;
830	}
831	spin_unlock(&journal_datalist_lock);
832
833	/*
834	 * akpm: I added this.  ext3_alloc_branch can pick up new indirect
835	 * blocks which contain freed but then revoked metadata.  We need
836	 * to cancel the revoke in case we end up freeing it yet again
 * and then reallocating it as data - this would cause a second revoke,
838	 * which hits an assertion error.
839	 */
840	JBUFFER_TRACE(jh, "cancelling revoke");
841	journal_cancel_revoke(handle, jh);
842	journal_unlock_journal_head(jh);
843out:
844	unlock_journal(journal);
845	return err;
846}
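
/*
 * Illustrative sketch (the getblk() parameters are assumptions): a
 * newly allocated metadata block is obtained, locked, declared to the
 * journal, filled in, and only then unlocked:
 *
 *	bh = getblk(dev, blocknr, blocksize);
 *	lock_buffer(bh);
 *	err = journal_get_create_access(handle, bh);
 *	memset(bh->b_data, 0, bh->b_size);
 *	mark_buffer_uptodate(bh, 1);
 *	unlock_buffer(bh);
 *	err = journal_dirty_metadata(handle, bh);
 */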
847
848
849
850/*
851 * journal_get_undo_access: Notify intent to modify metadata with non-
852 * rewindable consequences
853 *
854 * Sometimes there is a need to distinguish between metadata which has
855 * been committed to disk and that which has not.  The ext3fs code uses
856 * this for freeing and allocating space: we have to make sure that we
857 * do not reuse freed space until the deallocation has been committed,
858 * since if we overwrote that space we would make the delete
859 * un-rewindable in case of a crash.
860 *
861 * To deal with that, journal_get_undo_access requests write access to a
862 * buffer for parts of non-rewindable operations such as delete
863 * operations on the bitmaps.  The journaling code must keep a copy of
864 * the buffer's contents prior to the undo_access call until such time
865 * as we know that the buffer has definitely been committed to disk.
866 *
867 * We never need to know which transaction the committed data is part
868 * of: buffers touched here are guaranteed to be dirtied later and so
869 * will be committed to a new transaction in due course, at which point
870 * we can discard the old committed data pointer.
871 *
872 * Returns error number or 0 on success.
873 */
874
875int journal_get_undo_access (handle_t *handle, struct buffer_head *bh)
876{
877	journal_t *journal = handle->h_transaction->t_journal;
878	int err;
879	struct journal_head *jh = journal_add_journal_head(bh);
880
881	JBUFFER_TRACE(jh, "entry");
882	lock_journal(journal);
883
884	/* Do this first --- it can drop the journal lock, so we want to
885	 * make sure that obtaining the committed_data is done
886	 * atomically wrt. completion of any outstanding commits. */
887	err = do_get_write_access (handle, jh, 1);
888	if (err)
889		goto out;
890
891	if (!jh->b_committed_data) {
892		/* Copy out the current buffer contents into the
893		 * preserved, committed copy. */
894		JBUFFER_TRACE(jh, "generate b_committed data");
895		jh->b_committed_data = jbd_kmalloc(jh2bh(jh)->b_size,
896						   GFP_NOFS);
897		if (!jh->b_committed_data) {
898			printk(KERN_EMERG "%s: No memory for committed data!\n",
899				__FUNCTION__);
900			err = -ENOMEM;
901			goto out;
902		}
903
904		memcpy (jh->b_committed_data, jh2bh(jh)->b_data,
905				jh2bh(jh)->b_size);
906	}
907
908out:
909	if (!err)
910		J_ASSERT_JH(jh, jh->b_committed_data);
911	journal_unlock_journal_head(jh);
912	unlock_journal(journal);
913	return err;
914}
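
/*
 * Illustrative sketch (the bitmap buffer and bit number are
 * assumptions): when freeing disk blocks, the bitmap is accessed
 * through the undo interface so that a committed_data copy survives
 * and the allocator can avoid reusing the block before the delete
 * commits:
 *
 *	err = journal_get_undo_access(handle, bitmap_bh);
 *	if (err)
 *		goto fail;
 *	clear_bit(bit, bitmap_bh->b_data);
 *	err = journal_dirty_metadata(handle, bitmap_bh);
 */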
915
916/*
917 * journal_dirty_data: mark a buffer as containing dirty data which
918 * needs to be flushed before we can commit the current transaction.
919 *
920 * The buffer is placed on the transaction's data list and is marked as
921 * belonging to the transaction.
922 *
 * If `async' is set then the writeback will be initiated by the caller
924 * using submit_bh -> end_buffer_io_async.  We put the buffer onto
925 * t_async_datalist.
926 *
927 * Returns error number or 0 on success.
928 *
929 * journal_dirty_data() can be called via page_launder->ext3_writepage
930 * by kswapd.  So it cannot block.  Happily, there's nothing here
931 * which needs lock_journal if `async' is set.
932 *
933 * When the buffer is on the current transaction we freely move it
934 * between BJ_AsyncData and BJ_SyncData according to who tried to
935 * change its state last.
936 */
937
938int journal_dirty_data (handle_t *handle, struct buffer_head *bh, int async)
939{
940	journal_t *journal = handle->h_transaction->t_journal;
941	int need_brelse = 0;
942	int wanted_jlist = async ? BJ_AsyncData : BJ_SyncData;
943	struct journal_head *jh;
944
945	if (is_handle_aborted(handle))
946		return 0;
947
948	jh = journal_add_journal_head(bh);
949	JBUFFER_TRACE(jh, "entry");
950
951	/*
952	 * The buffer could *already* be dirty.  Writeout can start
953	 * at any time.
954	 */
955	jbd_debug(4, "jh: %p, tid:%d\n", jh, handle->h_transaction->t_tid);
956
957	/*
958	 * What if the buffer is already part of a running transaction?
959	 *
960	 * There are two cases:
961	 * 1) It is part of the current running transaction.  Refile it,
962	 *    just in case we have allocated it as metadata, deallocated
963	 *    it, then reallocated it as data.
964	 * 2) It is part of the previous, still-committing transaction.
965	 *    If all we want to do is to guarantee that the buffer will be
966	 *    written to disk before this new transaction commits, then
967	 *    being sure that the *previous* transaction has this same
968	 *    property is sufficient for us!  Just leave it on its old
969	 *    transaction.
970	 *
971	 * In case (2), the buffer must not already exist as metadata
972	 * --- that would violate write ordering (a transaction is free
973	 * to write its data at any point, even before the previous
974	 * committing transaction has committed).  The caller must
975	 * never, ever allow this to happen: there's nothing we can do
976	 * about it in this layer.
977	 */
978	spin_lock(&journal_datalist_lock);
979	if (jh->b_transaction) {
980		JBUFFER_TRACE(jh, "has transaction");
981		if (jh->b_transaction != handle->h_transaction) {
982			JBUFFER_TRACE(jh, "belongs to older transaction");
983			J_ASSERT_JH(jh, jh->b_transaction ==
984					journal->j_committing_transaction);
985
986			/* @@@ IS THIS TRUE  ? */
987			/*
988			 * Not any more.  Scenario: someone does a write()
989			 * in data=journal mode.  The buffer's transaction has
990			 * moved into commit.  Then someone does another
991			 * write() to the file.  We do the frozen data copyout
992			 * and set b_next_transaction to point to j_running_t.
993			 * And while we're in that state, someone does a
994			 * writepage() in an attempt to pageout the same area
995			 * of the file via a shared mapping.  At present that
996			 * calls journal_dirty_data(), and we get right here.
997			 * It may be too late to journal the data.  Simply
998			 * falling through to the next test will suffice: the
			 * data will be dirty and will be checkpointed.  The
1000			 * ordering comments in the next comment block still
1001			 * apply.
1002			 */
1003			//J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
1004
1005			/*
1006			 * If we're journalling data, and this buffer was
1007			 * subject to a write(), it could be metadata, forget
1008			 * or shadow against the committing transaction.  Now,
1009			 * someone has dirtied the same darn page via a mapping
1010			 * and it is being writepage()'d.
1011			 * We *could* just steal the page from commit, with some
1012			 * fancy locking there.  Instead, we just skip it -
1013			 * don't tie the page's buffers to the new transaction
1014			 * at all.
1015			 * Implication: if we crash before the writepage() data
1016			 * is written into the filesystem, recovery will replay
1017			 * the write() data.
1018			 */
1019			if (jh->b_jlist != BJ_None &&
1020					jh->b_jlist != BJ_SyncData &&
1021					jh->b_jlist != BJ_AsyncData) {
1022				JBUFFER_TRACE(jh, "Not stealing");
1023				goto no_journal;
1024			}
1025
1026			/*
1027			 * This buffer may be undergoing writeout in commit.  We
1028			 * can't return from here and let the caller dirty it
1029			 * again because that can cause the write-out loop in
1030			 * commit to never terminate.
1031			 */
1032			if (!async && buffer_dirty(bh)) {
1033				atomic_inc(&bh->b_count);
1034				spin_unlock(&journal_datalist_lock);
1035				need_brelse = 1;
1036				ll_rw_block(WRITE, 1, &bh);
1037				wait_on_buffer(bh);
1038				spin_lock(&journal_datalist_lock);
1039				/* The buffer may become locked again at any
1040				   time if it is redirtied */
1041			}
1042
1043			/* journal_clean_data_list() may have got there first */
1044			if (jh->b_transaction != NULL) {
1045				JBUFFER_TRACE(jh, "unfile from commit");
1046				__journal_unfile_buffer(jh);
1047				jh->b_transaction = NULL;
1048			}
1049			/* The buffer will be refiled below */
1050
1051		}
1052		/*
1053		 * Special case --- the buffer might actually have been
1054		 * allocated and then immediately deallocated in the previous,
1055		 * committing transaction, so might still be left on that
1056		 * transaction's metadata lists.
1057		 */
1058		if (jh->b_jlist != wanted_jlist) {
1059			JBUFFER_TRACE(jh, "not on correct data list: unfile");
1060			J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow);
1061			__journal_unfile_buffer(jh);
1062			jh->b_transaction = NULL;
1063			JBUFFER_TRACE(jh, "file as data");
1064			__journal_file_buffer(jh, handle->h_transaction,
1065						wanted_jlist);
1066		}
1067	} else {
1068		JBUFFER_TRACE(jh, "not on a transaction");
1069		__journal_file_buffer(jh, handle->h_transaction, wanted_jlist);
1070	}
1071no_journal:
1072	spin_unlock(&journal_datalist_lock);
1073	if (need_brelse) {
1074		BUFFER_TRACE(bh, "brelse");
1075		__brelse(bh);
1076	}
1077	JBUFFER_TRACE(jh, "exit");
1078	journal_unlock_journal_head(jh);
1079	return 0;
1080}
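
/*
 * Illustrative sketch (assumption: an ordered-data write path with the
 * buffer already mapped to disk): the data buffer is tied to the
 * running transaction so that commit flushes it before the commit
 * block is written:
 *
 *	err = journal_dirty_data(handle, bh, 0);	(files as BJ_SyncData)
 *	err = journal_dirty_data(handle, bh, 1);	(files as BJ_AsyncData;
 *							 the caller then does
 *							 the submit_bh() itself)
 */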
1081
1082/*
1083 * journal_dirty_metadata: mark a buffer as containing dirty metadata
1084 * which needs to be journaled as part of the current transaction.
1085 *
1086 * The buffer is placed on the transaction's metadata list and is marked
1087 * as belonging to the transaction.
1088 *
1089 * Special care needs to be taken if the buffer already belongs to the
1090 * current committing transaction (in which case we should have frozen
1091 * data present for that commit).  In that case, we don't relink the
1092 * buffer: that only gets done when the old transaction finally
1093 * completes its commit.
1094 *
1095 * Returns error number or 0 on success.
1096 */
1097
1098int journal_dirty_metadata (handle_t *handle, struct buffer_head *bh)
1099{
1100	transaction_t *transaction = handle->h_transaction;
1101	journal_t *journal = transaction->t_journal;
1102	struct journal_head *jh = bh2jh(bh);
1103
1104	jbd_debug(5, "journal_head %p\n", jh);
1105	JBUFFER_TRACE(jh, "entry");
1106	lock_journal(journal);
1107	if (is_handle_aborted(handle))
1108		goto out_unlock;
1109
1110	spin_lock(&journal_datalist_lock);
1111	set_bit(BH_JBDDirty, &bh->b_state);
1112	set_buffer_flushtime(bh);
1113
1114	J_ASSERT_JH(jh, jh->b_transaction != NULL);
1115
1116	/*
1117	 * Metadata already on the current transaction list doesn't
1118	 * need to be filed.  Metadata on another transaction's list must
1119	 * be committing, and will be refiled once the commit completes:
1120	 * leave it alone for now.
1121	 */
1122
1123	if (jh->b_transaction != transaction) {
1124		JBUFFER_TRACE(jh, "already on other transaction");
1125		J_ASSERT_JH(jh, jh->b_transaction ==
1126					journal->j_committing_transaction);
1127		J_ASSERT_JH(jh, jh->b_next_transaction == transaction);
1128		/* And this case is illegal: we can't reuse another
1129		 * transaction's data buffer, ever. */
1130		J_ASSERT_JH(jh, jh->b_jlist != BJ_SyncData);
1131		goto done_locked;
1132	}
1133
1134	/* That test should have eliminated the following case: */
1135	J_ASSERT_JH(jh, jh->b_frozen_data == 0);
1136
1137	JBUFFER_TRACE(jh, "file as BJ_Metadata");
1138	__journal_file_buffer(jh, handle->h_transaction, BJ_Metadata);
1139
1140done_locked:
1141	spin_unlock(&journal_datalist_lock);
1142	JBUFFER_TRACE(jh, "exit");
1143out_unlock:
1144	unlock_journal(journal);
1145	return 0;
1146}
1147
1148
1149/*
1150 * journal_forget: bforget() for potentially-journaled buffers.  We can
1151 * only do the bforget if there are no commits pending against the
1152 * buffer.  If the buffer is dirty in the current running transaction we
1153 * can safely unlink it.
1154 *
1155 * bh may not be a journalled buffer at all - it may be a non-JBD
1156 * buffer which came off the hashtable.  Check for this.
1157 *
1158 * Decrements bh->b_count by one.
1159 *
1160 * Allow this call even if the handle has aborted --- it may be part of
1161 * the caller's cleanup after an abort.
1162 */
1163
1164void journal_forget (handle_t *handle, struct buffer_head *bh)
1165{
1166	transaction_t *transaction = handle->h_transaction;
1167	journal_t *journal = transaction->t_journal;
1168	struct journal_head *jh;
1169
1170	BUFFER_TRACE(bh, "entry");
1171
1172	lock_journal(journal);
1173	spin_lock(&journal_datalist_lock);
1174
1175	if (!buffer_jbd(bh))
1176		goto not_jbd;
1177	jh = bh2jh(bh);
1178
1179	if (jh->b_transaction == handle->h_transaction) {
1180		J_ASSERT_JH(jh, !jh->b_frozen_data);
1181
1182		/* If we are forgetting a buffer which is already part
1183		 * of this transaction, then we can just drop it from
1184		 * the transaction immediately. */
1185		clear_bit(BH_Dirty, &bh->b_state);
1186		clear_bit(BH_JBDDirty, &bh->b_state);
1187
1188		JBUFFER_TRACE(jh, "belongs to current transaction: unfile");
1189		J_ASSERT_JH(jh, !jh->b_committed_data);
1190
1191		__journal_unfile_buffer(jh);
1192		jh->b_transaction = 0;
1193
1194		/*
1195		 * We are no longer going to journal this buffer.
1196		 * However, the commit of this transaction is still
1197		 * important to the buffer: the delete that we are now
1198		 * processing might obsolete an old log entry, so by
1199		 * committing, we can satisfy the buffer's checkpoint.
1200		 *
1201		 * So, if we have a checkpoint on the buffer, we should
1202		 * now refile the buffer on our BJ_Forget list so that
1203		 * we know to remove the checkpoint after we commit.
1204		 */
1205
1206		if (jh->b_cp_transaction) {
1207			__journal_file_buffer(jh, transaction, BJ_Forget);
1208		} else {
1209			__journal_remove_journal_head(bh);
1210			__brelse(bh);
1211			if (!buffer_jbd(bh)) {
1212				spin_unlock(&journal_datalist_lock);
1213				unlock_journal(journal);
1214				__bforget(bh);
1215				return;
1216			}
1217		}
1218
1219	} else if (jh->b_transaction) {
1220		J_ASSERT_JH(jh, (jh->b_transaction ==
1221				 journal->j_committing_transaction));
1222		/* However, if the buffer is still owned by a prior
1223		 * (committing) transaction, we can't drop it yet... */
1224		JBUFFER_TRACE(jh, "belongs to older transaction");
1225		/* ... but we CAN drop it from the new transaction if we
1226		 * have also modified it since the original commit. */
1227
1228		if (jh->b_next_transaction) {
1229			J_ASSERT(jh->b_next_transaction == transaction);
1230			jh->b_next_transaction = NULL;
1231		}
1232	}
1233
1234not_jbd:
1235	spin_unlock(&journal_datalist_lock);
1236	unlock_journal(journal);
1237	__brelse(bh);
1238	return;
1239}
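
/*
 * Illustrative sketch (an assumption about the caller): in a delete
 * path, once a journaled block's contents are known to be obsolete,
 * journal_forget() is used where plain bforget() would otherwise be
 * called, and it consumes the caller's buffer reference:
 *
 *	journal_forget(handle, bh);	(instead of bforget(bh))
 */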
1240
1241
1242/*
1243 * Register a callback function for this handle.  The function will be
1244 * called when the transaction that this handle is part of has been
1245 * committed to disk with the original callback data struct and the
1246 * error status of the journal as parameters.  There is no guarantee of
1247 * ordering between handles within a single transaction, nor between
1248 * callbacks registered on the same handle.
1249 *
1250 * The caller is responsible for allocating the journal_callback struct.
1251 * This is to allow the caller to add as much extra data to the callback
1252 * as needed, but reduce the overhead of multiple allocations.  The caller
1253 * allocated struct must start with a struct journal_callback at offset 0,
 * with the caller-specific data following it.
1255 */
1256void journal_callback_set(handle_t *handle,
1257			  void (*func)(struct journal_callback *jcb, int error),
1258			  struct journal_callback *jcb)
1259{
1260	list_add_tail(&jcb->jcb_list, &handle->h_jcb);
1261	jcb->jcb_func = func;
1262}
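
/*
 * Illustrative sketch (the wrapper struct and its fields are
 * assumptions): a caller embeds struct journal_callback at offset 0 of
 * its own structure and passes that in:
 *
 *	struct my_done_cb {
 *		struct journal_callback jcb;	(must be first)
 *		struct inode *inode;
 *	};
 *
 *	static void my_done(struct journal_callback *jcb, int error)
 *	{
 *		struct my_done_cb *cb = (struct my_done_cb *)jcb;
 *		... use cb->inode, check error ...
 *	}
 *
 *	journal_callback_set(handle, my_done, &cb->jcb);
 */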
1263
1264/*
1265 * All done for a particular handle.
1266 *
1267 * There is not much action needed here.  We just return any remaining
1268 * buffer credits to the transaction and remove the handle.  The only
1269 * complication is that we need to start a commit operation if the
1270 * filesystem is marked for synchronous update.
1271 *
1272 * journal_stop itself will not usually return an error, but it may
1273 * do so in unusual circumstances.  In particular, expect it to
1274 * return -EIO if a journal_abort has been executed since the
1275 * transaction began.
1276 */
1277
int journal_stop(handle_t *handle)
{
	transaction_t *transaction;
	journal_t *journal;
	int old_handle_count, err;

	/* Check for a NULL handle before touching its fields. */
	if (!handle)
		return 0;

	transaction = handle->h_transaction;
	journal = transaction->t_journal;
1286
1287	J_ASSERT (transaction->t_updates > 0);
1288	J_ASSERT (journal_current_handle() == handle);
1289
1290	if (is_handle_aborted(handle))
1291		err = -EIO;
1292	else
1293		err = 0;
1294
1295	if (--handle->h_ref > 0) {
1296		jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
1297			  handle->h_ref);
1298		return err;
1299	}
1300
1301	jbd_debug(4, "Handle %p going down\n", handle);
1302
1303	/*
1304	 * Implement synchronous transaction batching.  If the handle
1305	 * was synchronous, don't force a commit immediately.  Let's
1306	 * yield and let another thread piggyback onto this transaction.
1307	 * Keep doing that while new threads continue to arrive.
1308	 * It doesn't cost much - we're about to run a commit and sleep
1309	 * on IO anyway.  Speeds up many-threaded, many-dir operations
1310	 * by 30x or more...
1311	 */
1312	if (handle->h_sync) {
1313		do {
1314			old_handle_count = transaction->t_handle_count;
1315			yield();
1316		} while (old_handle_count != transaction->t_handle_count);
1317	}
1318
1319	current->journal_info = NULL;
1320	transaction->t_outstanding_credits -= handle->h_buffer_credits;
1321	transaction->t_updates--;
1322	if (!transaction->t_updates) {
1323		wake_up(&journal->j_wait_updates);
1324		if (journal->j_barrier_count)
1325			wake_up(&journal->j_wait_transaction_locked);
1326	}
1327
1328	/* Move callbacks from the handle to the transaction. */
1329	list_splice(&handle->h_jcb, &transaction->t_jcb);
1330
1331	/*
1332	 * If the handle is marked SYNC, we need to set another commit
1333	 * going!  We also want to force a commit if the current
1334	 * transaction is occupying too much of the log, or if the
1335	 * transaction is too old now.
1336	 */
1337	if (handle->h_sync ||
1338			transaction->t_outstanding_credits >
1339				journal->j_max_transaction_buffers ||
1340	    		time_after_eq(jiffies, transaction->t_expires)) {
1341		/* Do this even for aborted journals: an abort still
1342		 * completes the commit thread, it just doesn't write
1343		 * anything to disk. */
1344		tid_t tid = transaction->t_tid;
1345
1346		jbd_debug(2, "transaction too old, requesting commit for "
1347					"handle %p\n", handle);
1348		/* This is non-blocking */
1349		log_start_commit(journal, transaction);
1350
1351		/*
1352		 * Special case: JFS_SYNC synchronous updates require us
1353		 * to wait for the commit to complete.
1354		 */
1355		if (handle->h_sync && !(current->flags & PF_MEMALLOC))
1356			log_wait_commit(journal, tid);
1357	}
1358	kfree(handle);
1359	return err;
1360}
1361
1362/*
 * For synchronous operations: force any uncommitted transactions
1364 * to disk.  May seem kludgy, but it reuses all the handle batching
1365 * code in a very simple manner.
1366 */
1367int journal_force_commit(journal_t *journal)
1368{
1369	handle_t *handle;
1370	int ret = 0;
1371
1372	lock_kernel();
1373	handle = journal_start(journal, 1);
1374	if (IS_ERR(handle)) {
1375		ret = PTR_ERR(handle);
1376		goto out;
1377	}
1378	handle->h_sync = 1;
1379	journal_stop(handle);
1380out:
1381	unlock_kernel();
1382	return ret;
1383}
1384
1385/*
1386 *
1387 * List management code snippets: various functions for manipulating the
1388 * transaction buffer lists.
1389 *
1390 */
1391
1392/*
1393 * Append a buffer to a transaction list, given the transaction's list head
1394 * pointer.
1395 * journal_datalist_lock is held.
1396 */
1397
1398static inline void
1399__blist_add_buffer(struct journal_head **list, struct journal_head *jh)
1400{
1401	if (!*list) {
1402		jh->b_tnext = jh->b_tprev = jh;
1403		*list = jh;
1404	} else {
1405		/* Insert at the tail of the list to preserve order */
1406		struct journal_head *first = *list, *last = first->b_tprev;
1407		jh->b_tprev = last;
1408		jh->b_tnext = first;
1409		last->b_tnext = first->b_tprev = jh;
1410	}
1411}
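
/*
 * Illustrative note (not part of the original source): the transaction
 * buffer lists are circular and doubly linked through b_tnext/b_tprev,
 * with the list head pointing at the oldest element.  Filing A, B, C
 * in that order gives
 *
 *	*list -> A <-> B <-> C	(C->b_tnext == A, A->b_tprev == C)
 *
 * so commit walks the buffers in the order in which they were filed.
 */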
1412
1413/*
1414 * Remove a buffer from a transaction list, given the transaction's list
1415 * head pointer.
1416 *
1417 * Called with journal_datalist_lock held, and the journal may not
1418 * be locked.
1419 */
1420
1421static inline void
1422__blist_del_buffer(struct journal_head **list, struct journal_head *jh)
1423{
1424	if (*list == jh) {
1425		*list = jh->b_tnext;
1426		if (*list == jh)
1427			*list = 0;
1428	}
1429	jh->b_tprev->b_tnext = jh->b_tnext;
1430	jh->b_tnext->b_tprev = jh->b_tprev;
1431}
1432
1433/*
1434 * Remove a buffer from the appropriate transaction list.
1435 *
1436 * Note that this function can *change* the value of
1437 * bh->b_transaction->t_sync_datalist, t_async_datalist, t_buffers, t_forget,
1438 * t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list.  If the caller
 * is holding onto a copy of one of these pointers, it could go bad.
1440 * Generally the caller needs to re-read the pointer from the transaction_t.
1441 *
1442 * If bh->b_jlist is BJ_SyncData or BJ_AsyncData then we may have been called
1443 * via journal_try_to_free_buffer() or journal_clean_data_list().  In that
1444 * case, journal_datalist_lock will be held, and the journal may not be locked.
1445 */
1446void __journal_unfile_buffer(struct journal_head *jh)
1447{
1448	struct journal_head **list = 0;
1449	transaction_t * transaction;
1450
1451	assert_spin_locked(&journal_datalist_lock);
1452	transaction = jh->b_transaction;
1453
1454#ifdef __SMP__
1455	J_ASSERT (current->lock_depth >= 0);
1456#endif
1457	J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
1458
1459	if (jh->b_jlist != BJ_None)
1460		J_ASSERT_JH(jh, transaction != 0);
1461
1462	switch (jh->b_jlist) {
1463	case BJ_None:
1464		return;
1465	case BJ_SyncData:
1466		list = &transaction->t_sync_datalist;
1467		break;
1468	case BJ_AsyncData:
1469		list = &transaction->t_async_datalist;
1470		break;
1471	case BJ_Metadata:
1472		transaction->t_nr_buffers--;
1473		J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0);
1474		list = &transaction->t_buffers;
1475		break;
1476	case BJ_Forget:
1477		list = &transaction->t_forget;
1478		break;
1479	case BJ_IO:
1480		list = &transaction->t_iobuf_list;
1481		break;
1482	case BJ_Shadow:
1483		list = &transaction->t_shadow_list;
1484		break;
1485	case BJ_LogCtl:
1486		list = &transaction->t_log_list;
1487		break;
1488	case BJ_Reserved:
1489		list = &transaction->t_reserved_list;
1490		break;
1491	}
1492
1493	__blist_del_buffer(list, jh);
1494	jh->b_jlist = BJ_None;
1495	if (test_and_clear_bit(BH_JBDDirty, &jh2bh(jh)->b_state)) {
1496		set_bit(BH_Dirty, &jh2bh(jh)->b_state);
1497	}
1498}
1499
1500void journal_unfile_buffer(struct journal_head *jh)
1501{
1502	spin_lock(&journal_datalist_lock);
1503	__journal_unfile_buffer(jh);
1504	spin_unlock(&journal_datalist_lock);
1505}
1506
1507/*
1508 * Called from journal_try_to_free_buffers().  The journal is not
1509 * locked. lru_list_lock is not held.
1510 *
1511 * Here we see why journal_datalist_lock is global and not per-journal.
1512 * We cannot get back to this buffer's journal pointer without locking
1513 * out journal_clean_data_list() in some manner.
1514 *
1515 * One could use journal_datalist_lock to get unracy access to a
1516 * per-journal lock.
1517 *
1518 * Called with journal_datalist_lock held.
1519 *
1520 * Returns non-zero iff we were able to free the journal_head.
1521 */
1522static int __journal_try_to_free_buffer(struct buffer_head *bh,
1523					int *locked_or_dirty)
1524{
1525	struct journal_head *jh;
1526
1527	assert_spin_locked(&journal_datalist_lock);
1528
1529	jh = bh2jh(bh);
1530
1531	if (buffer_locked(bh) || buffer_dirty(bh)) {
1532		*locked_or_dirty = 1;
1533		goto out;
1534	}
1535
1536	if (!buffer_uptodate(bh))
1537		goto out;
1538
1539	if (jh->b_next_transaction != 0)
1540		goto out;
1541
1542	if (jh->b_transaction != 0 && jh->b_cp_transaction == 0) {
1543		if (jh->b_jlist == BJ_SyncData || jh->b_jlist==BJ_AsyncData) {
1544			/* A written-back ordered data buffer */
1545			JBUFFER_TRACE(jh, "release data");
1546			__journal_unfile_buffer(jh);
1547			jh->b_transaction = 0;
1548			__journal_remove_journal_head(bh);
1549			__brelse(bh);
1550		}
1551	}
1552	else if (jh->b_cp_transaction != 0 && jh->b_transaction == 0) {
1553		/* written-back checkpointed metadata buffer */
1554		if (jh->b_jlist == BJ_None) {
1555			JBUFFER_TRACE(jh, "remove from checkpoint list");
1556			__journal_remove_checkpoint(jh);
1557			__journal_remove_journal_head(bh);
1558			__brelse(bh);
1559		}
1560	}
1561	return !buffer_jbd(bh);
1562
1563out:
1564	return 0;
1565}
1566
1567/*
1568 * journal_try_to_free_buffers().  For all the buffers on this page,
1569 * if they are fully written out ordered data, move them onto BUF_CLEAN
1570 * so try_to_free_buffers() can reap them.  Called with lru_list_lock
1571 * not held.  Does its own locking.
1572 *
1573 * This complicates JBD locking somewhat.  We aren't protected by the
1574 * BKL here.  We wish to remove the buffer from its committing or
1575 * running transaction's ->t_datalist via __journal_unfile_buffer.
1576 *
1577 * This may *change* the value of transaction_t->t_datalist, so anyone
1578 * who looks at t_datalist needs to lock against this function.
1579 *
1580 * Even worse, someone may be doing a journal_dirty_data on this
1581 * buffer.  So we need to lock against that.  journal_dirty_data()
1582 * will come out of the lock with the buffer dirty, which makes it
1583 * ineligible for release here.
1584 *
1585 * Who else is affected by this?  hmm...  Really the only contender
1586 * is do_get_write_access() - it could be looking at the buffer while
1587 * journal_try_to_free_buffer() is changing its state.  But that
1588 * cannot happen because we never reallocate freed data as metadata
1589 * while the data is part of a transaction.  Yes?
1590 *
1591 * This function returns non-zero if we wish try_to_free_buffers()
 * to be called. We do this if the page is releasable by try_to_free_buffers().
1593 * We also do it if the page has locked or dirty buffers and the caller wants
1594 * us to perform sync or async writeout.
1595 */
1596int journal_try_to_free_buffers(journal_t *journal,
1597				struct page *page, int gfp_mask)
1598{
1599	struct buffer_head *bh;
1600	struct buffer_head *tmp;
1601	int locked_or_dirty = 0;
1602	int call_ttfb = 1;
1603
1604	J_ASSERT(PageLocked(page));
1605
1606	bh = page->buffers;
1607	tmp = bh;
1608	spin_lock(&journal_datalist_lock);
1609	do {
1610		struct buffer_head *p = tmp;
1611
1612		tmp = tmp->b_this_page;
1613		if (buffer_jbd(p))
1614			if (!__journal_try_to_free_buffer(p, &locked_or_dirty))
1615				call_ttfb = 0;
1616	} while (tmp != bh);
1617	spin_unlock(&journal_datalist_lock);
1618
1619	if (!(gfp_mask & (__GFP_IO|__GFP_WAIT)))
1620		goto out;
1621	if (!locked_or_dirty)
1622		goto out;
1623	/*
1624	 * The VM wants us to do writeout, or to block on IO, or both.
1625	 * So we allow try_to_free_buffers to be called even if the page
1626	 * still has journalled buffers.
1627	 */
1628	call_ttfb = 1;
1629out:
1630	return call_ttfb;
1631}
1632
1633/*
1634 * This buffer is no longer needed.  If it is on an older transaction's
1635 * checkpoint list we need to record it on this transaction's forget list
1636 * to pin this buffer (and hence its checkpointing transaction) down until
1637 * this transaction commits.  If the buffer isn't on a checkpoint list, we
1638 * release it.
1639 * Returns non-zero if JBD no longer has an interest in the buffer.
1640 */
1641static int dispose_buffer(struct journal_head *jh,
1642		transaction_t *transaction)
1643{
1644	int may_free = 1;
1645	struct buffer_head *bh = jh2bh(jh);
1646
1647	spin_lock(&journal_datalist_lock);
1648	__journal_unfile_buffer(jh);
1649	jh->b_transaction = 0;
1650
1651	if (jh->b_cp_transaction) {
1652		JBUFFER_TRACE(jh, "on running+cp transaction");
1653		__journal_file_buffer(jh, transaction, BJ_Forget);
1654		clear_bit(BH_JBDDirty, &bh->b_state);
1655		may_free = 0;
1656	} else {
1657		JBUFFER_TRACE(jh, "on running transaction");
1658		__journal_remove_journal_head(bh);
1659		__brelse(bh);
1660	}
1661	spin_unlock(&journal_datalist_lock);
1662	return may_free;
1663}
1664
1665/*
1666 * journal_flushpage
1667 *
1668 * This code is tricky.  It has a number of cases to deal with.
1669 *
1670 * There are two invariants which this code relies on:
1671 *
1672 * i_size must be updated on disk before we start calling flushpage on the
1673 * data.
1674 *
1675 *  This is done in ext3 by defining an ext3_setattr method which
1676 *  updates i_size before truncate gets going.  By maintaining this
1677 *  invariant, we can be sure that it is safe to throw away any buffers
1678 *  attached to the current transaction: once the transaction commits,
1679 *  we know that the data will not be needed.
1680 *
1681 *  Note however that we can *not* throw away data belonging to the
1682 *  previous, committing transaction!
1683 *
1684 * Any disk blocks which *are* part of the previous, committing
1685 * transaction (and which therefore cannot be discarded immediately) are
 * not going to be reused in the new running transaction.
1687 *
1688 *  The bitmap committed_data images guarantee this: any block which is
1689 *  allocated in one transaction and removed in the next will be marked
1690 *  as in-use in the committed_data bitmap, so cannot be reused until
1691 *  the next transaction to delete the block commits.  This means that
1692 *  leaving committing buffers dirty is quite safe: the disk blocks
1693 *  cannot be reallocated to a different file and so buffer aliasing is
1694 *  not possible.
1695 *
1696 *
1697 * The above applies mainly to ordered data mode.  In writeback mode we
1698 * don't make guarantees about the order in which data hits disk --- in
1699 * particular we don't guarantee that new dirty data is flushed before
1700 * transaction commit --- so it is always safe just to discard data
1701 * immediately in that mode.  --sct
1702 */
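
/*
 * Illustrative sketch only (not part of this file): the i_size-first
 * invariant described above might be maintained from a filesystem's
 * setattr path roughly as follows.  fs_record_new_size() is a
 * hypothetical helper which journals the inode's new size as metadata,
 * and "credits" is a placeholder for a suitable buffer-credit count.
 *
 *	handle = journal_start(journal, credits);
 *	if (IS_ERR(handle))
 *		return PTR_ERR(handle);
 *	fs_record_new_size(handle, inode, attr->ia_size);
 *	journal_stop(handle);
 *
 *	vmtruncate(inode, attr->ia_size);
 *
 * Only after the new size has been journalled does truncation (and hence
 * journal_flushpage) begin.
 */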
1703
1704/*
1705 * The journal_unmap_buffer helper function returns zero if the buffer
1706 * concerned remains pinned as an anonymous buffer belonging to an older
1707 * transaction.
1708 *
1709 * We're outside-transaction here.  Either or both of j_running_transaction
1710 * and j_committing_transaction may be NULL.
1711 */
1712static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
1713{
1714	transaction_t *transaction;
1715	struct journal_head *jh;
1716	int may_free = 1;
1717
1718	BUFFER_TRACE(bh, "entry");
1719
1720	if (!buffer_mapped(bh))
1721		return 1;
1722
1723	/* It is safe to proceed here without the
1724	 * journal_datalist_lock because the buffers cannot be
1725	 * stolen by try_to_free_buffers as long as we are holding the
1726	 * page lock. --sct */
1727
1728	if (!buffer_jbd(bh))
1729		goto zap_buffer;
1730
1731	jh = bh2jh(bh);
1732	transaction = jh->b_transaction;
1733	if (transaction == NULL) {
1734		/* First case: not on any transaction.  If it
1735		 * has no checkpoint link, then we can zap it:
1736		 * it's a writeback-mode buffer so we don't care
1737		 * if it hits disk safely. */
1738		if (!jh->b_cp_transaction) {
1739			JBUFFER_TRACE(jh, "not on any transaction: zap");
1740			goto zap_buffer;
1741		}
1742
1743		if (!buffer_dirty(bh)) {
1744			/* bdflush has written it.  We can drop it now */
1745			goto zap_buffer;
1746		}
1747
1748		/* OK, it must be in the journal but still not
1749		 * written fully to disk: it's metadata or
1750		 * journaled data... */
1751
1752		if (journal->j_running_transaction) {
1753			/* ... and once the current transaction has
1754			 * committed, the buffer won't be needed any
1755			 * longer. */
1756			JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget");
1757			return dispose_buffer(jh,
1758					journal->j_running_transaction);
1759		} else {
1760			/* There is no currently-running transaction. So the
1761			 * orphan record which we wrote for this file must have
1762			 * passed into commit.  We must attach this buffer to
1763			 * the committing transaction, if it exists. */
1764			if (journal->j_committing_transaction) {
1765				JBUFFER_TRACE(jh, "give to committing trans");
1766				return dispose_buffer(jh,
1767					journal->j_committing_transaction);
1768			} else {
1769				/* The orphan record's transaction has
1770				 * committed.  We can cleanse this buffer */
1771				clear_bit(BH_JBDDirty, &bh->b_state);
1772				goto zap_buffer;
1773			}
1774		}
1775	} else if (transaction == journal->j_committing_transaction) {
1776		/* If it is committing, we simply cannot touch it.  We
1777		 * can clear its next_transaction pointer (which, if set,
1778		 * must point at the running transaction), but nothing
1779		 * else. */
1780		JBUFFER_TRACE(jh, "on committing transaction");
1781		set_bit(BH_Freed, &bh->b_state);
1782		if (jh->b_next_transaction) {
1783			J_ASSERT(jh->b_next_transaction ==
1784					journal->j_running_transaction);
1785			jh->b_next_transaction = NULL;
1786		}
1787		return 0;
1788	} else {
1789		/* Good, the buffer belongs to the running transaction.
1790		 * We are writing our own transaction's data, not any
1791		 * previous one's, so it is safe to throw it away
1792		 * (remember that we expect the filesystem to have set
1793		 * i_size already for this truncate so recovery will not
1794		 * expose the disk blocks we are discarding here.) */
1795		J_ASSERT_JH(jh, transaction == journal->j_running_transaction);
1796		may_free = dispose_buffer(jh, transaction);
1797	}
1798
1799zap_buffer:
1800	if (buffer_dirty(bh))
1801		mark_buffer_clean(bh);
1802	J_ASSERT_BH(bh, !buffer_jdirty(bh));
1803	clear_bit(BH_Uptodate, &bh->b_state);
1804	clear_bit(BH_Mapped, &bh->b_state);
1805	clear_bit(BH_Req, &bh->b_state);
1806	clear_bit(BH_New, &bh->b_state);
1807	return may_free;
1808}
1809
1810/*
1811 * Return non-zero if the page's buffers were successfully reaped
1812 */
1813int journal_flushpage(journal_t *journal,
1814		      struct page *page,
1815		      unsigned long offset)
1816{
1817	struct buffer_head *head, *bh, *next;
1818	unsigned int curr_off = 0;
1819	int may_free = 1;
1820
1821	if (!PageLocked(page))
1822		BUG();
1823	if (!page->buffers)
1824		return 1;
1825
1826	/* We will potentially be playing with lists other than just the
1827	 * data lists (especially for journaled data mode), so be
1828	 * cautious in our locking. */
1829	lock_journal(journal);
1830
1831	head = bh = page->buffers;
1832	do {
1833		unsigned int next_off = curr_off + bh->b_size;
1834		next = bh->b_this_page;
1835
1836		/* AKPM: doing lock_buffer here may be overly paranoid */
1837		if (offset <= curr_off) {
1838		 	/* This block is wholly outside the truncation point */
1839			lock_buffer(bh);
1840			may_free &= journal_unmap_buffer(journal, bh);
1841			unlock_buffer(bh);
1842		}
1843		curr_off = next_off;
1844		bh = next;
1845
1846	} while (bh != head);
1847
1848	unlock_journal(journal);
1849
1850	if (!offset) {
1851		if (!may_free || !try_to_free_buffers(page, 0))
1852			return 0;
1853		J_ASSERT(page->buffers == NULL);
1854	}
1855	return 1;
1856}
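
/*
 * Illustrative sketch only (not part of this file): a journalling
 * filesystem's flushpage address_space operation would typically be a
 * thin wrapper around journal_flushpage().  The wrapper name and the
 * inode_to_journal() accessor are hypothetical.
 *
 *	static int fs_flushpage(struct page *page, unsigned long offset)
 *	{
 *		journal_t *journal = inode_to_journal(page->mapping->host);
 *
 *		return journal_flushpage(journal, page, offset);
 *	}
 */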
1857
1858/*
1859 * File a buffer on the given transaction list.
1860 */
1861void __journal_file_buffer(struct journal_head *jh,
1862			transaction_t *transaction, int jlist)
1863{
1864	struct journal_head **list = NULL;
1865	int was_dirty = 0;
1866
1867	assert_spin_locked(&journal_datalist_lock);
1868
1869#ifdef __SMP__
1870	J_ASSERT (current->lock_depth >= 0);
1871#endif
1872	J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
1873	J_ASSERT_JH(jh, jh->b_transaction == transaction ||
1874				jh->b_transaction == NULL);
1875
1876	if (jh->b_transaction && jh->b_jlist == jlist)
1877		return;
1878
1879	/* The following list of buffer states needs to be consistent
1880	 * with __jbd_unexpected_dirty_buffer()'s handling of dirty
1881	 * state. */
1882
1883	if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
1884	    jlist == BJ_Shadow || jlist == BJ_Forget) {
1885		if (atomic_set_buffer_clean(jh2bh(jh)) ||
1886		    test_and_clear_bit(BH_JBDDirty, &jh2bh(jh)->b_state))
1887			was_dirty = 1;
1888	}
1889
1890	if (jh->b_transaction)
1891		__journal_unfile_buffer(jh);
1892	else
1893		jh->b_transaction = transaction;
1894
1895	switch (jlist) {
1896	case BJ_None:
1897		J_ASSERT_JH(jh, !jh->b_committed_data);
1898		J_ASSERT_JH(jh, !jh->b_frozen_data);
1899		return;
1900	case BJ_SyncData:
1901		list = &transaction->t_sync_datalist;
1902		break;
1903	case BJ_AsyncData:
1904		list = &transaction->t_async_datalist;
1905		break;
1906	case BJ_Metadata:
1907		transaction->t_nr_buffers++;
1908		list = &transaction->t_buffers;
1909		break;
1910	case BJ_Forget:
1911		list = &transaction->t_forget;
1912		break;
1913	case BJ_IO:
1914		list = &transaction->t_iobuf_list;
1915		break;
1916	case BJ_Shadow:
1917		list = &transaction->t_shadow_list;
1918		break;
1919	case BJ_LogCtl:
1920		list = &transaction->t_log_list;
1921		break;
1922	case BJ_Reserved:
1923		list = &transaction->t_reserved_list;
1924		break;
1925	}
1926
1927	__blist_add_buffer(list, jh);
1928	jh->b_jlist = jlist;
1929
1930	if (was_dirty)
1931		set_bit(BH_JBDDirty, &jh2bh(jh)->b_state);
1932}
1933
1934void journal_file_buffer(struct journal_head *jh,
1935				transaction_t *transaction, int jlist)
1936{
1937	spin_lock(&journal_datalist_lock);
1938	__journal_file_buffer(jh, transaction, jlist);
1939	spin_unlock(&journal_datalist_lock);
1940}
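
/*
 * Illustrative usage (not part of this file): callers select the
 * destination list with one of the BJ_* constants handled above.  For
 * example, code which has just dirtied a metadata buffer under a handle
 * might file it with:
 *
 *	journal_file_buffer(jh, handle->h_transaction, BJ_Metadata);
 *
 * whereas ordered-mode data buffers would be filed on BJ_SyncData
 * instead.
 */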
1941
1942/*
1943 * Remove a buffer from its current buffer list in preparation for
1944 * dropping it from its current transaction entirely.  If the buffer has
1945 * already started to be used by a subsequent transaction, refile the
1946 * buffer on that transaction's metadata list.
1947 */
1948
1949void __journal_refile_buffer(struct journal_head *jh)
1950{
1951	int was_dirty = 0;
1952
1953	assert_spin_locked(&journal_datalist_lock);
1954#ifdef __SMP__
1955	J_ASSERT_JH(jh, current->lock_depth >= 0);
1956#endif
1957	/* If the buffer is now unused, just drop it. */
1958	if (jh->b_next_transaction == NULL) {
1959		__journal_unfile_buffer(jh);
1960		jh->b_transaction = NULL;
1961		/* Onto BUF_DIRTY for writeback */
1962		refile_buffer(jh2bh(jh));
1963		return;
1964	}
1965
1966	/* It has been modified by a later transaction: add it to the
1967	 * new transaction's metadata list. */
1968
1969	if (test_and_clear_bit(BH_JBDDirty, &jh2bh(jh)->b_state))
1970		was_dirty = 1;
1971
1972	__journal_unfile_buffer(jh);
1973	jh->b_transaction = jh->b_next_transaction;
1974	jh->b_next_transaction = NULL;
1975	__journal_file_buffer(jh, jh->b_transaction, BJ_Metadata);
1976	J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING);
1977
1978	if (was_dirty)
1979		set_bit(BH_JBDDirty, &jh2bh(jh)->b_state);
1981}
1982
1983/*
1984 * For the unlocked version of this call, also make sure that any
1985 * hanging journal_head is cleaned up if necessary.
1986 *
1987 * __journal_refile_buffer is usually called as part of a single locked
1988 * operation on a buffer_head, in which the caller is probably going to
1989 * be hooking the journal_head onto other lists.  In that case it is up
1990 * to the caller to remove the journal_head if necessary.  For the
1991 * unlocked journal_refile_buffer call, the caller isn't going to be
1992 * doing anything else to the buffer so we need to do the cleanup
1993 * ourselves to avoid a jh leak.
1994 *
1995 * *** The journal_head may be freed by this call! ***
1996 */
1997void journal_refile_buffer(struct journal_head *jh)
1998{
1999	struct buffer_head *bh;
2000
2001	spin_lock(&journal_datalist_lock);
2002	bh = jh2bh(jh);
2003
2004	__journal_refile_buffer(jh);
2005	__journal_remove_journal_head(bh);
2006
2007	spin_unlock(&journal_datalist_lock);
2008	__brelse(bh);
2009}
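
/*
 * Illustrative usage (not part of this file): the commit code calls
 * journal_refile_buffer() once it has finished with a buffer, for
 * instance while draining a committing transaction's forget list.  The
 * loop below is a simplified sketch; the real commit path does extra
 * checkpoint bookkeeping.
 *
 *	while ((jh = commit_transaction->t_forget) != NULL)
 *		journal_refile_buffer(jh);
 */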
2010