1/*
2 * Copyright 2001-2010, Axel D��rfler, axeld@pinc-software.de.
3 * This file may be used under the terms of the MIT License.
4 */
5
6
7//! Transaction and logging
8
9
10#include "Journal.h"
11
12#include "Debug.h"
13#include "Inode.h"
14
15
16struct run_array {
17	int32		count;
18	int32		max_runs;
19	block_run	runs[0];
20
21	void Init(int32 blockSize);
22	void Insert(block_run& run);
23
24	int32 CountRuns() const { return BFS_ENDIAN_TO_HOST_INT32(count); }
25	int32 MaxRuns() const { return BFS_ENDIAN_TO_HOST_INT32(max_runs) - 1; }
26		// that -1 accounts for an off-by-one error in Be's BFS implementation
27	const block_run& RunAt(int32 i) const { return runs[i]; }
28
29	static int32 MaxRuns(int32 blockSize);
30
31private:
32	static int _Compare(block_run& a, block_run& b);
33	int32 _FindInsertionIndex(block_run& run);
34};
35
36class RunArrays {
37public:
38							RunArrays(Journal* journal);
39							~RunArrays();
40
41			status_t		Insert(off_t blockNumber);
42
43			run_array*		ArrayAt(int32 i) { return fArrays.Array()[i]; }
44			int32			CountArrays() const { return fArrays.CountItems(); }
45
46			uint32			CountBlocks() const { return fBlockCount; }
47			uint32			LogEntryLength() const
48								{ return CountBlocks() + CountArrays(); }
49
50			int32			MaxArrayLength();
51
52private:
53			status_t		_AddArray();
54			bool			_ContainsRun(block_run& run);
55			bool			_AddRun(block_run& run);
56
57			Journal*		fJournal;
58			uint32			fBlockCount;
59			Stack<run_array*> fArrays;
60			run_array*		fLastArray;
61};
62
63class LogEntry : public DoublyLinkedListLinkImpl<LogEntry> {
64public:
65							LogEntry(Journal* journal, uint32 logStart,
66								uint32 length);
67							~LogEntry();
68
69			uint32			Start() const { return fStart; }
70			uint32			Length() const { return fLength; }
71
72#ifdef BFS_DEBUGGER_COMMANDS
73			void			SetTransactionID(int32 id) { fTransactionID = id; }
74			int32			TransactionID() const { return fTransactionID; }
75#endif
76
77			Journal*		GetJournal() { return fJournal; }
78
79private:
80			Journal*		fJournal;
81			uint32			fStart;
82			uint32			fLength;
83#ifdef BFS_DEBUGGER_COMMANDS
84			int32			fTransactionID;
85#endif
86};
87
88
89#if BFS_TRACING && !defined(BFS_SHELL) && !defined(_BOOT_MODE)
90namespace BFSJournalTracing {
91
92class LogEntry : public AbstractTraceEntry {
93public:
94	LogEntry(::LogEntry* entry, off_t logPosition, bool started)
95		:
96		fEntry(entry),
97#ifdef BFS_DEBUGGER_COMMANDS
98		fTransactionID(entry->TransactionID()),
99#endif
100		fStart(entry->Start()),
101		fLength(entry->Length()),
102		fLogPosition(logPosition),
103		fStarted(started)
104	{
105		Initialized();
106	}
107
108	virtual void AddDump(TraceOutput& out)
109	{
110#ifdef BFS_DEBUGGER_COMMANDS
111		out.Print("bfs:j:%s entry %p id %ld, start %lu, length %lu, log %s "
112			"%lu\n", fStarted ? "Started" : "Written", fEntry,
113			fTransactionID, fStart, fLength,
114			fStarted ? "end" : "start", fLogPosition);
115#else
116		out.Print("bfs:j:%s entry %p start %lu, length %lu, log %s %lu\n",
117			fStarted ? "Started" : "Written", fEntry, fStart, fLength,
118			fStarted ? "end" : "start", fLogPosition);
119#endif
120	}
121
122private:
123	::LogEntry*	fEntry;
124#ifdef BFS_DEBUGGER_COMMANDS
125	int32		fTransactionID;
126#endif
127	uint32		fStart;
128	uint32		fLength;
129	uint32		fLogPosition;
130	bool		fStarted;
131};
132
133}	// namespace BFSJournalTracing
134
135#	define T(x) new(std::nothrow) BFSJournalTracing::x;
136#else
137#	define T(x) ;
138#endif
139
140
141//	#pragma mark -
142
143
144static void
145add_to_iovec(iovec* vecs, int32& index, int32 max, const void* address,
146	size_t size)
147{
148	if (index > 0 && (addr_t)vecs[index - 1].iov_base
149			+ vecs[index - 1].iov_len == (addr_t)address) {
150		// the iovec can be combined with the previous one
151		vecs[index - 1].iov_len += size;
152		return;
153	}
154
155	if (index == max)
156		panic("no more space for iovecs!");
157
158	// we need to start a new iovec
159	vecs[index].iov_base = const_cast<void*>(address);
160	vecs[index].iov_len = size;
161	index++;
162}
163
164
165//	#pragma mark - LogEntry
166
167
168LogEntry::LogEntry(Journal* journal, uint32 start, uint32 length)
169	:
170	fJournal(journal),
171	fStart(start),
172	fLength(length)
173{
174}
175
176
177LogEntry::~LogEntry()
178{
179}
180
181
182//	#pragma mark - run_array
183
184
185/*!	The run_array's size equals the block size of the BFS volume, so we
186	cannot use a (non-overridden) new.
187	This makes a freshly allocated run_array ready to run.
188*/
189void
190run_array::Init(int32 blockSize)
191{
192	memset(this, 0, blockSize);
193	count = 0;
194	max_runs = HOST_ENDIAN_TO_BFS_INT32(MaxRuns(blockSize));
195}
196
197
198/*!	Inserts the block_run into the array. You will have to make sure the
199	array is large enough to contain the entry before calling this function.
200*/
201void
202run_array::Insert(block_run& run)
203{
204	int32 index = _FindInsertionIndex(run);
205	if (index == -1) {
206		// add to the end
207		runs[CountRuns()] = run;
208	} else {
209		// insert at index
210		memmove(&runs[index + 1], &runs[index],
211			(CountRuns() - index) * sizeof(off_t));
212		runs[index] = run;
213	}
214
215	count = HOST_ENDIAN_TO_BFS_INT32(CountRuns() + 1);
216}
217
218
219/*static*/ int32
220run_array::MaxRuns(int32 blockSize)
221{
222	// For whatever reason, BFS restricts the maximum array size
223	uint32 maxCount = (blockSize - sizeof(run_array)) / sizeof(block_run);
224	if (maxCount < 128)
225		return maxCount;
226
227	return 127;
228}
229
230
231/*static*/ int
232run_array::_Compare(block_run& a, block_run& b)
233{
234	int cmp = a.AllocationGroup() - b.AllocationGroup();
235	if (cmp == 0)
236		return a.Start() - b.Start();
237
238	return cmp;
239}
240
241
242int32
243run_array::_FindInsertionIndex(block_run& run)
244{
245	int32 min = 0, max = CountRuns() - 1;
246	int32 i = 0;
247	if (max >= 8) {
248		while (min <= max) {
249			i = (min + max) / 2;
250
251			int cmp = _Compare(runs[i], run);
252			if (cmp < 0)
253				min = i + 1;
254			else if (cmp > 0)
255				max = i - 1;
256			else
257				return -1;
258		}
259
260		if (_Compare(runs[i], run) < 0)
261			i++;
262	} else {
263		for (; i <= max; i++) {
264			if (_Compare(runs[i], run) > 0)
265				break;
266		}
267		if (i == count)
268			return -1;
269	}
270
271	return i;
272}
273
274
275//	#pragma mark - RunArrays
276
277
278RunArrays::RunArrays(Journal* journal)
279	:
280	fJournal(journal),
281	fBlockCount(0),
282	fArrays(),
283	fLastArray(NULL)
284{
285}
286
287
288RunArrays::~RunArrays()
289{
290	run_array* array;
291	while (fArrays.Pop(&array))
292		free(array);
293}
294
295
296bool
297RunArrays::_ContainsRun(block_run& run)
298{
299	for (int32 i = 0; i < CountArrays(); i++) {
300		run_array* array = ArrayAt(i);
301
302		for (int32 j = 0; j < array->CountRuns(); j++) {
303			block_run& arrayRun = array->runs[j];
304			if (run.AllocationGroup() != arrayRun.AllocationGroup())
305				continue;
306
307			if (run.Start() >= arrayRun.Start()
308				&& run.Start() + run.Length()
309					<= arrayRun.Start() + arrayRun.Length())
310				return true;
311		}
312	}
313
314	return false;
315}
316
317
318/*!	Adds the specified block_run into the array.
319	Note: it doesn't support overlapping - it must only be used
320	with block_runs of length 1!
321*/
322bool
323RunArrays::_AddRun(block_run& run)
324{
325	ASSERT(run.length == 1);
326
327	// Be's BFS log replay routine can only deal with block_runs of size 1
328	// A pity, isn't it? Too sad we have to be compatible.
329
330	if (fLastArray == NULL || fLastArray->CountRuns() == fLastArray->MaxRuns())
331		return false;
332
333	fLastArray->Insert(run);
334	fBlockCount++;
335	return true;
336}
337
338
339status_t
340RunArrays::_AddArray()
341{
342	int32 blockSize = fJournal->GetVolume()->BlockSize();
343
344	run_array* array = (run_array*)malloc(blockSize);
345	if (array == NULL)
346		return B_NO_MEMORY;
347
348	if (fArrays.Push(array) != B_OK) {
349		free(array);
350		return B_NO_MEMORY;
351	}
352
353	array->Init(blockSize);
354	fLastArray = array;
355	return B_OK;
356}
357
358
359status_t
360RunArrays::Insert(off_t blockNumber)
361{
362	Volume* volume = fJournal->GetVolume();
363	block_run run = volume->ToBlockRun(blockNumber);
364
365	if (fLastArray != NULL) {
366		// check if the block is already in the array
367		if (_ContainsRun(run))
368			return B_OK;
369	}
370
371	// insert block into array
372
373	if (!_AddRun(run)) {
374		// array is full
375		if (_AddArray() != B_OK || !_AddRun(run))
376			return B_NO_MEMORY;
377	}
378
379	return B_OK;
380}
381
382
383int32
384RunArrays::MaxArrayLength()
385{
386	int32 max = 0;
387	for (int32 i = 0; i < CountArrays(); i++) {
388		if (ArrayAt(i)->CountRuns() > max)
389			max = ArrayAt(i)->CountRuns();
390	}
391
392	return max;
393}
394
395
396//	#pragma mark - Journal
397
398
399Journal::Journal(Volume* volume)
400	:
401	fVolume(volume),
402	fOwner(NULL),
403	fLogSize(volume->Log().Length()),
404	fMaxTransactionSize(fLogSize / 2 - 5),
405	fUsed(0),
406	fUnwrittenTransactions(0),
407	fHasSubtransaction(false),
408	fSeparateSubTransactions(false)
409{
410	recursive_lock_init(&fLock, "bfs journal");
411	mutex_init(&fEntriesLock, "bfs journal entries");
412}
413
414
415Journal::~Journal()
416{
417	FlushLogAndBlocks();
418
419	recursive_lock_destroy(&fLock);
420	mutex_destroy(&fEntriesLock);
421}
422
423
424status_t
425Journal::InitCheck()
426{
427	return B_OK;
428}
429
430
431/*!	\brief Does a very basic consistency check of the run array.
432	It will check the maximum run count as well as if all of the runs fall
433	within a the volume.
434*/
435status_t
436Journal::_CheckRunArray(const run_array* array)
437{
438	int32 maxRuns = run_array::MaxRuns(fVolume->BlockSize()) - 1;
439		// the -1 works around an off-by-one bug in Be's BFS implementation,
440		// same as in run_array::MaxRuns()
441	if (array->MaxRuns() != maxRuns
442		|| array->CountRuns() > maxRuns
443		|| array->CountRuns() <= 0) {
444		dprintf("run count: %d, array max: %d, max runs: %d\n",
445			(int)array->CountRuns(), (int)array->MaxRuns(), (int)maxRuns);
446		FATAL(("Log entry has broken header!\n"));
447		return B_ERROR;
448	}
449
450	for (int32 i = 0; i < array->CountRuns(); i++) {
451		if (fVolume->ValidateBlockRun(array->RunAt(i)) != B_OK)
452			return B_ERROR;
453	}
454
455	PRINT(("Log entry has %ld entries\n", array->CountRuns()));
456	return B_OK;
457}
458
459
460/*!	Replays an entry in the log.
461	\a _start points to the entry in the log, and will be bumped to the next
462	one if replaying succeeded.
463*/
464status_t
465Journal::_ReplayRunArray(int32* _start)
466{
467	PRINT(("ReplayRunArray(start = %ld)\n", *_start));
468
469	off_t logOffset = fVolume->ToBlock(fVolume->Log());
470	off_t firstBlockNumber = *_start % fLogSize;
471
472	CachedBlock cachedArray(fVolume);
473
474	const run_array* array = (const run_array*)cachedArray.SetTo(logOffset
475		+ firstBlockNumber);
476	if (array == NULL)
477		return B_IO_ERROR;
478
479	if (_CheckRunArray(array) < B_OK)
480		return B_BAD_DATA;
481
482	// First pass: check integrity of the blocks in the run array
483
484	CachedBlock cached(fVolume);
485
486	firstBlockNumber = (firstBlockNumber + 1) % fLogSize;
487	off_t blockNumber = firstBlockNumber;
488	int32 blockSize = fVolume->BlockSize();
489
490	for (int32 index = 0; index < array->CountRuns(); index++) {
491		const block_run& run = array->RunAt(index);
492
493		off_t offset = fVolume->ToOffset(run);
494		for (int32 i = 0; i < run.Length(); i++) {
495			const uint8* data = cached.SetTo(logOffset + blockNumber);
496			if (data == NULL)
497				RETURN_ERROR(B_IO_ERROR);
498
499			// TODO: eventually check other well known offsets, like the
500			// root and index dirs
501			if (offset == 0) {
502				// This log entry writes over the superblock - check if
503				// it's valid!
504				if (Volume::CheckSuperBlock(data) != B_OK) {
505					FATAL(("Log contains invalid superblock!\n"));
506					RETURN_ERROR(B_BAD_DATA);
507				}
508			}
509
510			blockNumber = (blockNumber + 1) % fLogSize;
511			offset += blockSize;
512		}
513	}
514
515	// Second pass: write back its blocks
516
517	blockNumber = firstBlockNumber;
518	int32 count = 1;
519
520	for (int32 index = 0; index < array->CountRuns(); index++) {
521		const block_run& run = array->RunAt(index);
522		INFORM(("replay block run %u:%u:%u in log at %" B_PRIdOFF "!\n",
523			(int)run.AllocationGroup(), run.Start(), run.Length(), blockNumber));
524
525		off_t offset = fVolume->ToOffset(run);
526		for (int32 i = 0; i < run.Length(); i++) {
527			const uint8* data = cached.SetTo(logOffset + blockNumber);
528			if (data == NULL)
529				RETURN_ERROR(B_IO_ERROR);
530
531			ssize_t written = write_pos(fVolume->Device(), offset, data,
532				blockSize);
533			if (written != blockSize)
534				RETURN_ERROR(B_IO_ERROR);
535
536			blockNumber = (blockNumber + 1) % fLogSize;
537			offset += blockSize;
538			count++;
539		}
540	}
541
542	*_start += count;
543	return B_OK;
544}
545
546
547/*!	Replays all log entries - this will put the disk into a
548	consistent and clean state, if it was not correctly unmounted
549	before.
550	This method is called by Journal::InitCheck() if the log start
551	and end pointer don't match.
552*/
553status_t
554Journal::ReplayLog()
555{
556	// TODO: this logic won't work whenever the size of the pending transaction
557	//	equals the size of the log (happens with the original BFS only)
558	if (fVolume->LogStart() == fVolume->LogEnd())
559		return B_OK;
560
561	INFORM(("Replay log, disk was not correctly unmounted...\n"));
562
563	if (fVolume->SuperBlock().flags != SUPER_BLOCK_DISK_DIRTY) {
564		INFORM(("log_start and log_end differ, but disk is marked clean - "
565			"trying to replay log...\n"));
566	}
567
568	if (fVolume->IsReadOnly())
569		return B_READ_ONLY_DEVICE;
570
571	int32 start = fVolume->LogStart();
572	int32 lastStart = -1;
573	while (true) {
574		// stop if the log is completely flushed
575		if (start == fVolume->LogEnd())
576			break;
577
578		if (start == lastStart) {
579			// strange, flushing the log hasn't changed the log_start pointer
580			return B_ERROR;
581		}
582		lastStart = start;
583
584		status_t status = _ReplayRunArray(&start);
585		if (status != B_OK) {
586			FATAL(("replaying log entry from %d failed: %s\n", (int)start,
587				strerror(status)));
588			return B_ERROR;
589		}
590		start = start % fLogSize;
591	}
592
593	PRINT(("replaying worked fine!\n"));
594	fVolume->SuperBlock().log_start = HOST_ENDIAN_TO_BFS_INT64(
595		fVolume->LogEnd());
596	fVolume->LogStart() = HOST_ENDIAN_TO_BFS_INT64(fVolume->LogEnd());
597	fVolume->SuperBlock().flags = SUPER_BLOCK_DISK_CLEAN;
598
599	return fVolume->WriteSuperBlock();
600}
601
602
603size_t
604Journal::CurrentTransactionSize() const
605{
606	if (_HasSubTransaction()) {
607		return cache_blocks_in_sub_transaction(fVolume->BlockCache(),
608			fTransactionID);
609	}
610
611	return cache_blocks_in_main_transaction(fVolume->BlockCache(),
612		fTransactionID);
613}
614
615
616bool
617Journal::CurrentTransactionTooLarge() const
618{
619	return CurrentTransactionSize() > fLogSize;
620}
621
622
623/*!	This is a callback function that is called by the cache, whenever
624	all blocks of a transaction have been flushed to disk.
625	This lets us keep track of completed transactions, and update
626	the log start pointer as needed. Note, the transactions may not be
627	completed in the order they were written.
628*/
629/*static*/ void
630Journal::_TransactionWritten(int32 transactionID, int32 event, void* _logEntry)
631{
632	LogEntry* logEntry = (LogEntry*)_logEntry;
633
634	PRINT(("Log entry %p has been finished, transaction ID = %ld\n", logEntry,
635		transactionID));
636
637	Journal* journal = logEntry->GetJournal();
638	disk_super_block& superBlock = journal->fVolume->SuperBlock();
639	bool update = false;
640
641	// Set log_start pointer if possible...
642
643	mutex_lock(&journal->fEntriesLock);
644
645	if (logEntry == journal->fEntries.First()) {
646		LogEntry* next = journal->fEntries.GetNext(logEntry);
647		if (next != NULL) {
648			superBlock.log_start = HOST_ENDIAN_TO_BFS_INT64(next->Start()
649				% journal->fLogSize);
650		} else {
651			superBlock.log_start = HOST_ENDIAN_TO_BFS_INT64(
652				journal->fVolume->LogEnd());
653		}
654
655		update = true;
656	}
657
658	T(LogEntry(logEntry, superBlock.LogStart(), false));
659
660	journal->fUsed -= logEntry->Length();
661	journal->fEntries.Remove(logEntry);
662	mutex_unlock(&journal->fEntriesLock);
663
664	delete logEntry;
665
666	// update the superblock, and change the disk's state, if necessary
667
668	if (update) {
669		if (superBlock.log_start == superBlock.log_end)
670			superBlock.flags = SUPER_BLOCK_DISK_CLEAN;
671
672		status_t status = journal->fVolume->WriteSuperBlock();
673		if (status != B_OK) {
674			FATAL(("_TransactionWritten: could not write back superblock: %s\n",
675				strerror(status)));
676		}
677
678		journal->fVolume->LogStart() = superBlock.LogStart();
679	}
680}
681
682
683/*!	Listens to TRANSACTION_IDLE events, and flushes the log when that happens */
684/*static*/ void
685Journal::_TransactionIdle(int32 transactionID, int32 event, void* _journal)
686{
687	// The current transaction seems to be idle - flush it. We can't do this
688	// in this thread, as flushing the log can produce new transaction events.
689	thread_id id = spawn_kernel_thread(&Journal::_FlushLog, "bfs log flusher",
690		B_NORMAL_PRIORITY, _journal);
691	if (id > 0)
692		resume_thread(id);
693}
694
695
696/*static*/ status_t
697Journal::_FlushLog(void* _journal)
698{
699	Journal* journal = (Journal*)_journal;
700	return journal->_FlushLog(false, false);
701}
702
703
704/*!	Writes the blocks that are part of current transaction into the log,
705	and ends the current transaction.
706	If the current transaction is too large to fit into the log, it will
707	try to detach an existing sub-transaction.
708*/
709status_t
710Journal::_WriteTransactionToLog()
711{
712	// TODO: in case of a failure, we need a backup plan like writing all
713	//	changed blocks back to disk immediately (hello disk corruption!)
714
715	bool detached = false;
716
717	if (_TransactionSize() > fLogSize) {
718		// The current transaction won't fit into the log anymore, try to
719		// detach the current sub-transaction
720		if (_HasSubTransaction() && cache_blocks_in_main_transaction(
721				fVolume->BlockCache(), fTransactionID) < (int32)fLogSize) {
722			detached = true;
723		} else {
724			// We created a transaction larger than one we can write back to
725			// disk - the only option we have (besides risking disk corruption
726			// by writing it back anyway), is to let it fail.
727			dprintf("transaction too large (%d blocks, log size %d)!\n",
728				(int)_TransactionSize(), (int)fLogSize);
729			return B_BUFFER_OVERFLOW;
730		}
731	}
732
733	fHasSubtransaction = false;
734
735	int32 blockShift = fVolume->BlockShift();
736	off_t logOffset = fVolume->ToBlock(fVolume->Log()) << blockShift;
737	off_t logStart = fVolume->LogEnd() % fLogSize;
738	off_t logPosition = logStart;
739	status_t status;
740
741	// create run_array structures for all changed blocks
742
743	RunArrays runArrays(this);
744
745	off_t blockNumber;
746	long cookie = 0;
747	while (cache_next_block_in_transaction(fVolume->BlockCache(),
748			fTransactionID, detached, &cookie, &blockNumber, NULL,
749			NULL) == B_OK) {
750		status = runArrays.Insert(blockNumber);
751		if (status < B_OK) {
752			FATAL(("filling log entry failed!"));
753			return status;
754		}
755	}
756
757	if (runArrays.CountBlocks() == 0) {
758		// nothing has changed during this transaction
759		if (detached) {
760			fTransactionID = cache_detach_sub_transaction(fVolume->BlockCache(),
761				fTransactionID, NULL, NULL);
762			fUnwrittenTransactions = 1;
763		} else {
764			cache_end_transaction(fVolume->BlockCache(), fTransactionID, NULL,
765				NULL);
766			fUnwrittenTransactions = 0;
767		}
768		return B_OK;
769	}
770
771	// If necessary, flush the log, so that we have enough space for this
772	// transaction
773	if (runArrays.LogEntryLength() > FreeLogBlocks()) {
774		cache_sync_transaction(fVolume->BlockCache(), fTransactionID);
775		if (runArrays.LogEntryLength() > FreeLogBlocks()) {
776			panic("no space in log after sync (%ld for %ld blocks)!",
777				(long)FreeLogBlocks(), (long)runArrays.LogEntryLength());
778		}
779	}
780
781	// Write log entries to disk
782
783	int32 maxVecs = runArrays.MaxArrayLength() + 1;
784		// one extra for the index block
785
786	iovec* vecs = (iovec*)malloc(sizeof(iovec) * maxVecs);
787	if (vecs == NULL) {
788		// TODO: write back log entries directly?
789		return B_NO_MEMORY;
790	}
791
792	for (int32 k = 0; k < runArrays.CountArrays(); k++) {
793		run_array* array = runArrays.ArrayAt(k);
794		int32 index = 0, count = 1;
795		int32 wrap = fLogSize - logStart;
796
797		add_to_iovec(vecs, index, maxVecs, (void*)array, fVolume->BlockSize());
798
799		// add block runs
800
801		for (int32 i = 0; i < array->CountRuns(); i++) {
802			const block_run& run = array->RunAt(i);
803			off_t blockNumber = fVolume->ToBlock(run);
804
805			for (int32 j = 0; j < run.Length(); j++) {
806				if (count >= wrap) {
807					// We need to write back the first half of the entry
808					// directly as the log wraps around
809					if (writev_pos(fVolume->Device(), logOffset
810						+ (logStart << blockShift), vecs, index) < 0)
811						FATAL(("could not write log area!\n"));
812
813					logPosition = logStart + count;
814					logStart = 0;
815					wrap = fLogSize;
816					count = 0;
817					index = 0;
818				}
819
820				// make blocks available in the cache
821				const void* data = block_cache_get(fVolume->BlockCache(),
822					blockNumber + j);
823				if (data == NULL) {
824					free(vecs);
825					return B_IO_ERROR;
826				}
827
828				add_to_iovec(vecs, index, maxVecs, data, fVolume->BlockSize());
829				count++;
830			}
831		}
832
833		// write back the rest of the log entry
834		if (count > 0) {
835			logPosition = logStart + count;
836			if (writev_pos(fVolume->Device(), logOffset
837					+ (logStart << blockShift), vecs, index) < 0)
838				FATAL(("could not write log area: %s!\n", strerror(errno)));
839		}
840
841		// release blocks again
842		for (int32 i = 0; i < array->CountRuns(); i++) {
843			const block_run& run = array->RunAt(i);
844			off_t blockNumber = fVolume->ToBlock(run);
845
846			for (int32 j = 0; j < run.Length(); j++) {
847				block_cache_put(fVolume->BlockCache(), blockNumber + j);
848			}
849		}
850
851		logStart = logPosition % fLogSize;
852	}
853
854	free(vecs);
855
856	LogEntry* logEntry = new LogEntry(this, fVolume->LogEnd(),
857		runArrays.LogEntryLength());
858	if (logEntry == NULL) {
859		FATAL(("no memory to allocate log entries!"));
860		return B_NO_MEMORY;
861	}
862
863#ifdef BFS_DEBUGGER_COMMANDS
864	logEntry->SetTransactionID(fTransactionID);
865#endif
866
867	// Update the log end pointer in the superblock
868
869	fVolume->SuperBlock().flags = SUPER_BLOCK_DISK_DIRTY;
870	fVolume->SuperBlock().log_end = HOST_ENDIAN_TO_BFS_INT64(logPosition);
871
872	status = fVolume->WriteSuperBlock();
873
874	fVolume->LogEnd() = logPosition;
875	T(LogEntry(logEntry, fVolume->LogEnd(), true));
876
877	// We need to flush the drives own cache here to ensure
878	// disk consistency.
879	// If that call fails, we can't do anything about it anyway
880	ioctl(fVolume->Device(), B_FLUSH_DRIVE_CACHE);
881
882	// at this point, we can finally end the transaction - we're in
883	// a guaranteed valid state
884
885	mutex_lock(&fEntriesLock);
886	fEntries.Add(logEntry);
887	fUsed += logEntry->Length();
888	mutex_unlock(&fEntriesLock);
889
890	if (detached) {
891		fTransactionID = cache_detach_sub_transaction(fVolume->BlockCache(),
892			fTransactionID, _TransactionWritten, logEntry);
893		fUnwrittenTransactions = 1;
894
895		if (status == B_OK && _TransactionSize() > fLogSize) {
896			// If the transaction is too large after writing, there is no way to
897			// recover, so let this transaction fail.
898			dprintf("transaction too large (%d blocks, log size %d)!\n",
899				(int)_TransactionSize(), (int)fLogSize);
900			return B_BUFFER_OVERFLOW;
901		}
902	} else {
903		cache_end_transaction(fVolume->BlockCache(), fTransactionID,
904			_TransactionWritten, logEntry);
905		fUnwrittenTransactions = 0;
906	}
907
908	return status;
909}
910
911
912/*!	Flushes the current log entry to disk. If \a flushBlocks is \c true it will
913	also write back all dirty blocks for this volume.
914*/
915status_t
916Journal::_FlushLog(bool canWait, bool flushBlocks)
917{
918	status_t status = canWait ? recursive_lock_lock(&fLock)
919		: recursive_lock_trylock(&fLock);
920	if (status != B_OK)
921		return status;
922
923	if (recursive_lock_get_recursion(&fLock) > 1) {
924		// whoa, FlushLogAndBlocks() was called from inside a transaction
925		recursive_lock_unlock(&fLock);
926		return B_OK;
927	}
928
929	// write the current log entry to disk
930
931	if (fUnwrittenTransactions != 0 && _TransactionSize() != 0) {
932		status = _WriteTransactionToLog();
933		if (status < B_OK)
934			FATAL(("writing current log entry failed: %s\n", strerror(status)));
935	}
936
937	if (flushBlocks)
938		status = fVolume->FlushDevice();
939
940	recursive_lock_unlock(&fLock);
941	return status;
942}
943
944
945/*!	Flushes the current log entry to disk, and also writes back all dirty
946	blocks for this volume (completing all open transactions).
947*/
948status_t
949Journal::FlushLogAndBlocks()
950{
951	return _FlushLog(true, true);
952}
953
954
955status_t
956Journal::Lock(Transaction* owner, bool separateSubTransactions)
957{
958	status_t status = recursive_lock_lock(&fLock);
959	if (status != B_OK)
960		return status;
961
962	if (!fSeparateSubTransactions && recursive_lock_get_recursion(&fLock) > 1) {
963		// we'll just use the current transaction again
964		return B_OK;
965	}
966
967	if (separateSubTransactions)
968		fSeparateSubTransactions = true;
969
970	if (owner != NULL)
971		owner->SetParent(fOwner);
972
973	fOwner = owner;
974
975	// TODO: we need a way to find out how big the current transaction is;
976	//	we need to be able to either detach the latest sub transaction on
977	//	demand, as well as having some kind of fall back plan in case the
978	//	sub transaction itself grows bigger than the log.
979	//	For that, it would be nice to have some call-back interface in the
980	//	cache transaction API...
981
982	if (fOwner != NULL) {
983		if (fUnwrittenTransactions > 0) {
984			// start a sub transaction
985			cache_start_sub_transaction(fVolume->BlockCache(), fTransactionID);
986			fHasSubtransaction = true;
987		} else
988			fTransactionID = cache_start_transaction(fVolume->BlockCache());
989
990		if (fTransactionID < B_OK) {
991			recursive_lock_unlock(&fLock);
992			return fTransactionID;
993		}
994
995		cache_add_transaction_listener(fVolume->BlockCache(), fTransactionID,
996			TRANSACTION_IDLE, _TransactionIdle, this);
997	}
998	return B_OK;
999}
1000
1001
1002status_t
1003Journal::Unlock(Transaction* owner, bool success)
1004{
1005	if (fSeparateSubTransactions || recursive_lock_get_recursion(&fLock) == 1) {
1006		// we only end the transaction if we would really unlock it
1007		// TODO: what about failing transactions that do not unlock?
1008		// (they must make the parent fail, too)
1009		if (owner != NULL) {
1010			status_t status = _TransactionDone(success);
1011			if (status != B_OK)
1012				return status;
1013
1014			// Unlocking the inodes might trigger new transactions, but we
1015			// cannot reuse the current one anymore, as this one is already
1016			// closed.
1017			bool separateSubTransactions = fSeparateSubTransactions;
1018			fSeparateSubTransactions = true;
1019			owner->NotifyListeners(success);
1020			fSeparateSubTransactions = separateSubTransactions;
1021
1022			fOwner = owner->Parent();
1023		} else
1024			fOwner = NULL;
1025
1026		fTimestamp = system_time();
1027
1028		if (fSeparateSubTransactions
1029			&& recursive_lock_get_recursion(&fLock) == 1)
1030			fSeparateSubTransactions = false;
1031	} else
1032		owner->MoveListenersTo(fOwner);
1033
1034	recursive_lock_unlock(&fLock);
1035	return B_OK;
1036}
1037
1038
1039uint32
1040Journal::_TransactionSize() const
1041{
1042	int32 count = cache_blocks_in_transaction(fVolume->BlockCache(),
1043		fTransactionID);
1044	if (count <= 0)
1045		return 0;
1046
1047	// take the number of array blocks in this transaction into account
1048	uint32 maxRuns = run_array::MaxRuns(fVolume->BlockSize());
1049	uint32 arrayBlocks = (count + maxRuns - 1) / maxRuns;
1050	return count + arrayBlocks;
1051}
1052
1053
1054status_t
1055Journal::_TransactionDone(bool success)
1056{
1057	if (!success) {
1058		if (_HasSubTransaction()) {
1059			cache_abort_sub_transaction(fVolume->BlockCache(), fTransactionID);
1060			// We can continue to use the parent transaction afterwards
1061		} else {
1062			cache_abort_transaction(fVolume->BlockCache(), fTransactionID);
1063			fUnwrittenTransactions = 0;
1064		}
1065
1066		return B_OK;
1067	}
1068
1069	// Up to a maximum size, we will just batch several
1070	// transactions together to improve speed
1071	uint32 size = _TransactionSize();
1072	if (size < fMaxTransactionSize) {
1073		// Flush the log from time to time, so that we have enough space
1074		// for this transaction
1075		if (size > FreeLogBlocks())
1076			cache_sync_transaction(fVolume->BlockCache(), fTransactionID);
1077
1078		fUnwrittenTransactions++;
1079		return B_OK;
1080	}
1081
1082	return _WriteTransactionToLog();
1083}
1084
1085
1086//	#pragma mark - debugger commands
1087
1088
1089#ifdef BFS_DEBUGGER_COMMANDS
1090
1091
1092void
1093Journal::Dump()
1094{
1095	kprintf("Journal %p\n", this);
1096	kprintf("  log start:            %" B_PRId32 "\n", fVolume->LogStart());
1097	kprintf("  log end:              %" B_PRId32 "\n", fVolume->LogEnd());
1098	kprintf("  owner:                %p\n", fOwner);
1099	kprintf("  log size:             %" B_PRIu32 "\n", fLogSize);
1100	kprintf("  max transaction size: %" B_PRIu32 "\n", fMaxTransactionSize);
1101	kprintf("  used:                 %" B_PRIu32 "\n", fUsed);
1102	kprintf("  unwritten:            %" B_PRId32 "\n", fUnwrittenTransactions);
1103	kprintf("  timestamp:            %" B_PRId64 "\n", fTimestamp);
1104	kprintf("  transaction ID:       %" B_PRId32 "\n", fTransactionID);
1105	kprintf("  has subtransaction:   %d\n", fHasSubtransaction);
1106	kprintf("  separate sub-trans.:  %d\n", fSeparateSubTransactions);
1107	kprintf("entries:\n");
1108	kprintf("  address        id  start length\n");
1109
1110	LogEntryList::Iterator iterator = fEntries.GetIterator();
1111
1112	while (iterator.HasNext()) {
1113		LogEntry* entry = iterator.Next();
1114
1115		kprintf("  %p %6" B_PRId32 " %6" B_PRIu32 " %6" B_PRIu32 "\n", entry,
1116			entry->TransactionID(), entry->Start(), entry->Length());
1117	}
1118}
1119
1120
1121int
1122dump_journal(int argc, char** argv)
1123{
1124	if (argc != 2 || !strcmp(argv[1], "--help")) {
1125		kprintf("usage: %s <ptr-to-volume>\n", argv[0]);
1126		return 0;
1127	}
1128
1129	Volume* volume = (Volume*)parse_expression(argv[1]);
1130	Journal* journal = volume->GetJournal(0);
1131
1132	journal->Dump();
1133	return 0;
1134}
1135
1136
1137#endif	// BFS_DEBUGGER_COMMANDS
1138
1139
1140//	#pragma mark - TransactionListener
1141
1142
1143TransactionListener::TransactionListener()
1144{
1145}
1146
1147
1148TransactionListener::~TransactionListener()
1149{
1150}
1151
1152
1153//	#pragma mark - Transaction
1154
1155
1156status_t
1157Transaction::Start(Volume* volume, off_t refBlock)
1158{
1159	// has it already been started?
1160	if (fJournal != NULL)
1161		return B_OK;
1162
1163	fJournal = volume->GetJournal(refBlock);
1164	if (fJournal != NULL && fJournal->Lock(this, false) == B_OK)
1165		return B_OK;
1166
1167	fJournal = NULL;
1168	return B_ERROR;
1169}
1170
1171
1172void
1173Transaction::AddListener(TransactionListener* listener)
1174{
1175	if (fJournal == NULL)
1176		panic("Transaction is not running!");
1177
1178	fListeners.Add(listener);
1179}
1180
1181
1182void
1183Transaction::RemoveListener(TransactionListener* listener)
1184{
1185	if (fJournal == NULL)
1186		panic("Transaction is not running!");
1187
1188	fListeners.Remove(listener);
1189	listener->RemovedFromTransaction();
1190}
1191
1192
1193void
1194Transaction::NotifyListeners(bool success)
1195{
1196	while (TransactionListener* listener = fListeners.RemoveHead()) {
1197		listener->TransactionDone(success);
1198		listener->RemovedFromTransaction();
1199	}
1200}
1201
1202
1203/*!	Move the inodes into the parent transaction. This is needed only to make
1204	sure they will still be reverted in case the transaction is aborted.
1205*/
1206void
1207Transaction::MoveListenersTo(Transaction* transaction)
1208{
1209	while (TransactionListener* listener = fListeners.RemoveHead()) {
1210		transaction->fListeners.Add(listener);
1211	}
1212}
1213