1/*
 * Copyright 2001-2020, Axel Dörfler, axeld@pinc-software.de.
3 * This file may be used under the terms of the MIT License.
4 */
5
6
7//! Transaction and logging
8
9
10#include <StackOrHeapArray.h>
11
12#include "Journal.h"
13
14#include "Debug.h"
15#include "Inode.h"
16
17
// On-disk layout of a log entry's index block: a sorted list of all
// block_runs making up the entry. The structure occupies one full volume
// block; "runs" extends to the end of that block.
struct run_array {
	int32		count;		// number of used runs, stored in BFS endianness
	int32		max_runs;	// capacity, stored in BFS endianness
	block_run	runs[0];	// flexible array, fills the rest of the block

	// Prepares a freshly malloc()'d, block sized array for use.
	void Init(int32 blockSize);
	// Inserts "run" keeping the array sorted; the caller must have checked
	// that there is room left.
	void Insert(block_run& run);

	int32 CountRuns() const { return BFS_ENDIAN_TO_HOST_INT32(count); }
	int32 MaxRuns() const { return BFS_ENDIAN_TO_HOST_INT32(max_runs) - 1; }
		// that -1 accounts for an off-by-one error in Be's BFS implementation
	const block_run& RunAt(int32 i) const { return runs[i]; }

	// Number of runs that fit into an array of the given block size.
	static int32 MaxRuns(int32 blockSize);

private:
	static int _Compare(block_run& a, block_run& b);
	int32 _FindInsertionIndex(block_run& run);
};
37
// Collects the block numbers of a transaction into one or more run_arrays,
// each of which occupies a full volume block in the log.
class RunArrays {
public:
							RunArrays(Journal* journal);
							~RunArrays();

			// Adds a block to the arrays unless it is already contained.
			status_t		Insert(off_t blockNumber);

			run_array*		ArrayAt(int32 i) { return fArrays.Array()[i]; }
			int32			CountArrays() const { return fArrays.CountItems(); }

			// Total number of data blocks inserted so far.
			uint32			CountBlocks() const { return fBlockCount; }
			// Log blocks needed: data blocks plus one index block per array.
			uint32			LogEntryLength() const
								{ return CountBlocks() + CountArrays(); }

			int32			MaxArrayLength();

private:
			status_t		_AddArray();
			bool			_ContainsRun(block_run& run);
			bool			_AddRun(block_run& run);

			Journal*		fJournal;
			uint32			fBlockCount;	// total data blocks inserted
			Stack<run_array*> fArrays;
			run_array*		fLastArray;		// array currently being filled
};
64
// Bookkeeping for a log entry that has been written to the log on disk.
// Entries are kept in Journal::fEntries until the block cache reports the
// corresponding transaction as completely written back (see
// Journal::_TransactionWritten()).
class LogEntry : public DoublyLinkedListLinkImpl<LogEntry> {
public:
							LogEntry(Journal* journal, uint32 logStart,
								uint32 length);
							~LogEntry();

			// Start position within the log, and length in blocks.
			uint32			Start() const { return fStart; }
			uint32			Length() const { return fLength; }

#ifdef BFS_DEBUGGER_COMMANDS
			void			SetTransactionID(int32 id) { fTransactionID = id; }
			int32			TransactionID() const { return fTransactionID; }
#endif

			Journal*		GetJournal() { return fJournal; }

private:
			Journal*		fJournal;
			uint32			fStart;
			uint32			fLength;
#ifdef BFS_DEBUGGER_COMMANDS
			int32			fTransactionID;
#endif
};
89
90
#if BFS_TRACING && !defined(FS_SHELL) && !defined(_BOOT_MODE)
namespace BFSJournalTracing {

// Trace entry recording when a ::LogEntry is started or has been fully
// written; used by the kernel tracing facility for post-mortem analysis.
class LogEntry : public AbstractTraceEntry {
public:
	LogEntry(::LogEntry* entry, off_t logPosition, bool started)
		:
		fEntry(entry),
#ifdef BFS_DEBUGGER_COMMANDS
		fTransactionID(entry->TransactionID()),
#endif
		fStart(entry->Start()),
		fLength(entry->Length()),
		fLogPosition(logPosition),
		fStarted(started)
	{
		Initialized();
	}

	virtual void AddDump(TraceOutput& out)
	{
#ifdef BFS_DEBUGGER_COMMANDS
		out.Print("bfs:j:%s entry %p id %ld, start %lu, length %lu, log %s "
			"%lu\n", fStarted ? "Started" : "Written", fEntry,
			fTransactionID, fStart, fLength,
			fStarted ? "end" : "start", fLogPosition);
#else
		out.Print("bfs:j:%s entry %p start %lu, length %lu, log %s %lu\n",
			fStarted ? "Started" : "Written", fEntry, fStart, fLength,
			fStarted ? "end" : "start", fLogPosition);
#endif
	}

private:
	::LogEntry*	fEntry;
#ifdef BFS_DEBUGGER_COMMANDS
	int32		fTransactionID;
#endif
	uint32		fStart;
	uint32		fLength;
	uint32		fLogPosition;
	bool		fStarted;
};

}	// namespace BFSJournalTracing

// T() records a trace entry when tracing is enabled, and is a no-op else.
#	define T(x) new(std::nothrow) BFSJournalTracing::x;
#else
#	define T(x) ;
#endif
141
142
143//	#pragma mark -
144
145
146static void
147add_to_iovec(iovec* vecs, int32& index, int32 max, const void* address,
148	size_t size)
149{
150	if (index > 0 && (addr_t)vecs[index - 1].iov_base
151			+ vecs[index - 1].iov_len == (addr_t)address) {
152		// the iovec can be combined with the previous one
153		vecs[index - 1].iov_len += size;
154		return;
155	}
156
157	if (index == max)
158		panic("no more space for iovecs!");
159
160	// we need to start a new iovec
161	vecs[index].iov_base = const_cast<void*>(address);
162	vecs[index].iov_len = size;
163	index++;
164}
165
166
167//	#pragma mark - LogEntry
168
169
// Creates an entry covering "length" log blocks beginning at log
// position "start".
LogEntry::LogEntry(Journal* journal, uint32 start, uint32 length)
	:
	fJournal(journal),
	fStart(start),
	fLength(length)
{
}
177
178
// The entry owns no resources; nothing to release here.
LogEntry::~LogEntry()
{
}
182
183
184//	#pragma mark - run_array
185
186
187/*!	The run_array's size equals the block size of the BFS volume, so we
188	cannot use a (non-overridden) new.
189	This makes a freshly allocated run_array ready to run.
190*/
void
run_array::Init(int32 blockSize)
{
	// clear the whole block, including the trailing "runs" area
	memset(this, 0, blockSize);
	count = 0;
	max_runs = HOST_ENDIAN_TO_BFS_INT32(MaxRuns(blockSize));
}
198
199
200/*!	Inserts the block_run into the array. You will have to make sure the
201	array is large enough to contain the entry before calling this function.
202*/
203void
204run_array::Insert(block_run& run)
205{
206	int32 index = _FindInsertionIndex(run);
207	if (index == -1) {
208		// add to the end
209		runs[CountRuns()] = run;
210	} else {
211		// insert at index
212		memmove(&runs[index + 1], &runs[index],
213			(CountRuns() - index) * sizeof(off_t));
214		runs[index] = run;
215	}
216
217	count = HOST_ENDIAN_TO_BFS_INT32(CountRuns() + 1);
218}
219
220
221/*static*/ int32
222run_array::MaxRuns(int32 blockSize)
223{
224	// For whatever reason, BFS restricts the maximum array size
225	uint32 maxCount = (blockSize - sizeof(run_array)) / sizeof(block_run);
226	if (maxCount < 128)
227		return maxCount;
228
229	return 127;
230}
231
232
233/*static*/ int
234run_array::_Compare(block_run& a, block_run& b)
235{
236	int cmp = a.AllocationGroup() - b.AllocationGroup();
237	if (cmp == 0)
238		return a.Start() - b.Start();
239
240	return cmp;
241}
242
243
244int32
245run_array::_FindInsertionIndex(block_run& run)
246{
247	int32 min = 0, max = CountRuns() - 1;
248	int32 i = 0;
249	if (max >= 8) {
250		while (min <= max) {
251			i = (min + max) / 2;
252
253			int cmp = _Compare(runs[i], run);
254			if (cmp < 0)
255				min = i + 1;
256			else if (cmp > 0)
257				max = i - 1;
258			else
259				return -1;
260		}
261
262		if (_Compare(runs[i], run) < 0)
263			i++;
264	} else {
265		for (; i <= max; i++) {
266			if (_Compare(runs[i], run) > 0)
267				break;
268		}
269		if (i == count)
270			return -1;
271	}
272
273	return i;
274}
275
276
277//	#pragma mark - RunArrays
278
279
// Starts out with no arrays; the first _AddRun() will fail until
// _AddArray() has allocated one (see RunArrays::Insert()).
RunArrays::RunArrays(Journal* journal)
	:
	fJournal(journal),
	fBlockCount(0),
	fArrays(),
	fLastArray(NULL)
{
}
288
289
290RunArrays::~RunArrays()
291{
292	run_array* array;
293	while (fArrays.Pop(&array))
294		free(array);
295}
296
297
298bool
299RunArrays::_ContainsRun(block_run& run)
300{
301	for (int32 i = 0; i < CountArrays(); i++) {
302		run_array* array = ArrayAt(i);
303
304		for (int32 j = 0; j < array->CountRuns(); j++) {
305			block_run& arrayRun = array->runs[j];
306			if (run.AllocationGroup() != arrayRun.AllocationGroup())
307				continue;
308
309			if (run.Start() >= arrayRun.Start()
310				&& run.Start() + run.Length()
311					<= arrayRun.Start() + arrayRun.Length())
312				return true;
313		}
314	}
315
316	return false;
317}
318
319
320/*!	Adds the specified block_run into the array.
321	Note: it doesn't support overlapping - it must only be used
322	with block_runs of length 1!
323*/
324bool
325RunArrays::_AddRun(block_run& run)
326{
327	ASSERT(run.length == 1);
328
329	// Be's BFS log replay routine can only deal with block_runs of size 1
330	// A pity, isn't it? Too sad we have to be compatible.
331
332	if (fLastArray == NULL || fLastArray->CountRuns() == fLastArray->MaxRuns())
333		return false;
334
335	fLastArray->Insert(run);
336	fBlockCount++;
337	return true;
338}
339
340
341status_t
342RunArrays::_AddArray()
343{
344	int32 blockSize = fJournal->GetVolume()->BlockSize();
345
346	run_array* array = (run_array*)malloc(blockSize);
347	if (array == NULL)
348		return B_NO_MEMORY;
349
350	if (fArrays.Push(array) != B_OK) {
351		free(array);
352		return B_NO_MEMORY;
353	}
354
355	array->Init(blockSize);
356	fLastArray = array;
357	return B_OK;
358}
359
360
361status_t
362RunArrays::Insert(off_t blockNumber)
363{
364	Volume* volume = fJournal->GetVolume();
365	block_run run = volume->ToBlockRun(blockNumber);
366
367	if (fLastArray != NULL) {
368		// check if the block is already in the array
369		if (_ContainsRun(run))
370			return B_OK;
371	}
372
373	// insert block into array
374
375	if (!_AddRun(run)) {
376		// array is full
377		if (_AddArray() != B_OK || !_AddRun(run))
378			return B_NO_MEMORY;
379	}
380
381	return B_OK;
382}
383
384
385int32
386RunArrays::MaxArrayLength()
387{
388	int32 max = 0;
389	for (int32 i = 0; i < CountArrays(); i++) {
390		if (ArrayAt(i)->CountRuns() > max)
391			max = ArrayAt(i)->CountRuns();
392	}
393
394	return max;
395}
396
397
398//	#pragma mark - Journal
399
400
// Sets up the journal for the given volume and spawns the log flusher
// thread that writes the log back on TRANSACTION_IDLE events (see
// _TransactionIdle()/_LogFlusher()).
Journal::Journal(Volume* volume)
	:
	fVolume(volume),
	fOwner(NULL),
	fLogSize(volume->Log().Length()),
	fMaxTransactionSize(fLogSize / 2 - 5),
		// batched transactions are flushed once they grow beyond this
	fUsed(0),
	fUnwrittenTransactions(0),
	fHasSubtransaction(false),
	fSeparateSubTransactions(false)
{
	recursive_lock_init(&fLock, "bfs journal");
	mutex_init(&fEntriesLock, "bfs journal entries");

	fLogFlusherSem = create_sem(0, "bfs log flusher");
	fLogFlusher = spawn_kernel_thread(&Journal::_LogFlusher, "bfs log flusher",
		B_NORMAL_PRIORITY, this);
	if (fLogFlusher > 0)
		resume_thread(fLogFlusher);
}
421
422
Journal::~Journal()
{
	FlushLogAndBlocks();

	recursive_lock_destroy(&fLock);
	mutex_destroy(&fEntriesLock);

	// Invalidate the semaphore first, then delete it: deleting it wakes up
	// _LogFlusher(), which then sees the invalid fLogFlusherSem and exits
	// its loop, so the wait_for_thread() below can finish.
	sem_id logFlusher = fLogFlusherSem;
	fLogFlusherSem = -1;
	delete_sem(logFlusher);
	wait_for_thread(fLogFlusher, NULL);
}
435
436
status_t
Journal::InitCheck()
{
	// currently there is nothing that could have failed during construction
	return B_OK;
}
442
443
444/*!	\brief Does a very basic consistency check of the run array.
445	It will check the maximum run count as well as if all of the runs fall
	within the volume.
447*/
status_t
Journal::_CheckRunArray(const run_array* array)
{
	int32 maxRuns = run_array::MaxRuns(fVolume->BlockSize()) - 1;
		// the -1 works around an off-by-one bug in Be's BFS implementation,
		// same as in run_array::MaxRuns()
	if (array->MaxRuns() != maxRuns
		|| array->CountRuns() > maxRuns
		|| array->CountRuns() <= 0) {
		dprintf("run count: %d, array max: %d, max runs: %d\n",
			(int)array->CountRuns(), (int)array->MaxRuns(), (int)maxRuns);
		FATAL(("Log entry has broken header!\n"));
		return B_ERROR;
	}

	// make sure all runs lie within the volume
	for (int32 i = 0; i < array->CountRuns(); i++) {
		if (fVolume->ValidateBlockRun(array->RunAt(i)) != B_OK)
			return B_ERROR;
	}

	PRINT(("Log entry has %" B_PRId32 " entries\n", array->CountRuns()));
	return B_OK;
}
471
472
473/*!	Replays an entry in the log.
474	\a _start points to the entry in the log, and will be bumped to the next
475	one if replaying succeeded.
476*/
status_t
Journal::_ReplayRunArray(int32* _start)
{
	PRINT(("ReplayRunArray(start = %" B_PRId32 ")\n", *_start));

	off_t logOffset = fVolume->ToBlock(fVolume->Log());
	off_t firstBlockNumber = *_start % fLogSize;

	// the first log block of the entry holds the run_array index
	CachedBlock cachedArray(fVolume);

	status_t status = cachedArray.SetTo(logOffset + firstBlockNumber);
	if (status != B_OK)
		return status;

	const run_array* array = (const run_array*)cachedArray.Block();
	if (_CheckRunArray(array) < B_OK)
		return B_BAD_DATA;

	// First pass: check integrity of the blocks in the run array

	CachedBlock cached(fVolume);

	// the data blocks follow the index block, wrapping around the log
	firstBlockNumber = (firstBlockNumber + 1) % fLogSize;
	off_t blockNumber = firstBlockNumber;
	int32 blockSize = fVolume->BlockSize();

	for (int32 index = 0; index < array->CountRuns(); index++) {
		const block_run& run = array->RunAt(index);

		off_t offset = fVolume->ToOffset(run);
		for (int32 i = 0; i < run.Length(); i++) {
			status = cached.SetTo(logOffset + blockNumber);
			if (status != B_OK)
				RETURN_ERROR(status);

			// TODO: eventually check other well known offsets, like the
			// root and index dirs
			if (offset == 0) {
				// This log entry writes over the superblock - check if
				// it's valid!
				if (Volume::CheckSuperBlock(cached.Block()) != B_OK) {
					FATAL(("Log contains invalid superblock!\n"));
					RETURN_ERROR(B_BAD_DATA);
				}
			}

			blockNumber = (blockNumber + 1) % fLogSize;
			offset += blockSize;
		}
	}

	// Second pass: write back its blocks

	blockNumber = firstBlockNumber;
	int32 count = 1;
		// starts at 1 to account for the run_array index block itself

	for (int32 index = 0; index < array->CountRuns(); index++) {
		const block_run& run = array->RunAt(index);
		INFORM(("replay block run %u:%u:%u in log at %" B_PRIdOFF "!\n",
			(int)run.AllocationGroup(), run.Start(), run.Length(), blockNumber));

		off_t offset = fVolume->ToOffset(run);
		for (int32 i = 0; i < run.Length(); i++) {
			status = cached.SetTo(logOffset + blockNumber);
			if (status != B_OK)
				RETURN_ERROR(status);

			// copy the logged block to its final location on disk
			ssize_t written = write_pos(fVolume->Device(), offset,
				cached.Block(), blockSize);
			if (written != blockSize)
				RETURN_ERROR(B_IO_ERROR);

			blockNumber = (blockNumber + 1) % fLogSize;
			offset += blockSize;
			count++;
		}
	}

	// advance the caller's position past this whole entry
	*_start += count;
	return B_OK;
}
558
559
560/*!	Replays all log entries - this will put the disk into a
561	consistent and clean state, if it was not correctly unmounted
562	before.
563	This method is called by Journal::InitCheck() if the log start
564	and end pointer don't match.
565*/
status_t
Journal::ReplayLog()
{
	// TODO: this logic won't work whenever the size of the pending transaction
	//	equals the size of the log (happens with the original BFS only)
	if (fVolume->LogStart() == fVolume->LogEnd())
		return B_OK;

	INFORM(("Replay log, disk was not correctly unmounted...\n"));

	if (fVolume->SuperBlock().flags != SUPER_BLOCK_DISK_DIRTY) {
		INFORM(("log_start and log_end differ, but disk is marked clean - "
			"trying to replay log...\n"));
	}

	if (fVolume->IsReadOnly())
		return B_READ_ONLY_DEVICE;

	// replay one log entry at a time until the log is fully consumed
	int32 start = fVolume->LogStart();
	int32 lastStart = -1;
	while (true) {
		// stop if the log is completely flushed
		if (start == fVolume->LogEnd())
			break;

		if (start == lastStart) {
			// strange, flushing the log hasn't changed the log_start pointer
			return B_ERROR;
		}
		lastStart = start;

		status_t status = _ReplayRunArray(&start);
		if (status != B_OK) {
			FATAL(("replaying log entry from %d failed: %s\n", (int)start,
				strerror(status)));
			return B_ERROR;
		}
		// _ReplayRunArray() advances "start" without wrapping; do that here
		start = start % fLogSize;
	}

	PRINT(("replaying worked fine!\n"));
	// the log is now empty: make both pointers equal and mark the disk clean
	fVolume->SuperBlock().log_start = HOST_ENDIAN_TO_BFS_INT64(
		fVolume->LogEnd());
	fVolume->LogStart() = HOST_ENDIAN_TO_BFS_INT64(fVolume->LogEnd());
	fVolume->SuperBlock().flags = HOST_ENDIAN_TO_BFS_INT32(
		SUPER_BLOCK_DISK_CLEAN);

	return fVolume->WriteSuperBlock();
}
615
616
617size_t
618Journal::CurrentTransactionSize() const
619{
620	if (_HasSubTransaction()) {
621		return cache_blocks_in_sub_transaction(fVolume->BlockCache(),
622			fTransactionID);
623	}
624
625	return cache_blocks_in_main_transaction(fVolume->BlockCache(),
626		fTransactionID);
627}
628
629
630bool
631Journal::CurrentTransactionTooLarge() const
632{
633	return CurrentTransactionSize() > fLogSize;
634}
635
636
637/*!	This is a callback function that is called by the cache, whenever
638	all blocks of a transaction have been flushed to disk.
639	This lets us keep track of completed transactions, and update
640	the log start pointer as needed. Note, the transactions may not be
641	completed in the order they were written.
642*/
/*static*/ void
Journal::_TransactionWritten(int32 transactionID, int32 event, void* _logEntry)
{
	LogEntry* logEntry = (LogEntry*)_logEntry;

	PRINT(("Log entry %p has been finished, transaction ID = %" B_PRId32 "\n",
		logEntry, transactionID));

	Journal* journal = logEntry->GetJournal();
	disk_super_block& superBlock = journal->fVolume->SuperBlock();
	bool update = false;

	// Set log_start pointer if possible...

	mutex_lock(&journal->fEntriesLock);

	// only the oldest entry can move the log start pointer forward; later
	// entries have to wait until all entries before them have completed
	if (logEntry == journal->fEntries.First()) {
		LogEntry* next = journal->fEntries.GetNext(logEntry);
		if (next != NULL) {
			superBlock.log_start = HOST_ENDIAN_TO_BFS_INT64(next->Start()
				% journal->fLogSize);
		} else {
			// this was the last entry - the log is now empty
			superBlock.log_start = HOST_ENDIAN_TO_BFS_INT64(
				journal->fVolume->LogEnd());
		}

		update = true;
	}

	T(LogEntry(logEntry, superBlock.LogStart(), false));

	journal->fUsed -= logEntry->Length();
	journal->fEntries.Remove(logEntry);
	mutex_unlock(&journal->fEntriesLock);

	delete logEntry;

	// update the superblock, and change the disk's state, if necessary

	if (update) {
		if (superBlock.log_start == superBlock.log_end)
			superBlock.flags = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_DISK_CLEAN);

		status_t status = journal->fVolume->WriteSuperBlock();
		if (status != B_OK) {
			FATAL(("_TransactionWritten: could not write back superblock: %s\n",
				strerror(status)));
		}

		journal->fVolume->LogStart() = superBlock.LogStart();
	}
}
695
696
697/*!	Listens to TRANSACTION_IDLE events, and flushes the log when that happens */
/*static*/ void
Journal::_TransactionIdle(int32 transactionID, int32 event, void* _journal)
{
	// The current transaction seems to be idle - flush it. (We can't do this
	// in this thread, as flushing the log can produce new transaction events.)
	// Waking the semaphore defers the actual work to _LogFlusher().
	Journal* journal = (Journal*)_journal;
	release_sem(journal->fLogFlusherSem);
}
706
707
// Entry point of the log flusher thread: flushes the log each time the
// semaphore is released by _TransactionIdle().
/*static*/ status_t
Journal::_LogFlusher(void* _journal)
{
	Journal* journal = (Journal*)_journal;
	// runs until ~Journal() sets fLogFlusherSem to -1 and deletes the
	// semaphore (which makes acquire_sem() fail and re-checks the loop
	// condition)
	while (journal->fLogFlusherSem >= 0) {
		if (acquire_sem(journal->fLogFlusherSem) != B_OK)
			continue;

		journal->_FlushLog(false, false);
	}
	return B_OK;
}
720
721
722/*!	Writes the blocks that are part of current transaction into the log,
723	and ends the current transaction.
724	If the current transaction is too large to fit into the log, it will
725	try to detach an existing sub-transaction.
726*/
status_t
Journal::_WriteTransactionToLog()
{
	// TODO: in case of a failure, we need a backup plan like writing all
	//	changed blocks back to disk immediately (hello disk corruption!)

	bool detached = false;

	if (_TransactionSize() > fLogSize) {
		// The current transaction won't fit into the log anymore, try to
		// detach the current sub-transaction
		if (_HasSubTransaction() && cache_blocks_in_main_transaction(
				fVolume->BlockCache(), fTransactionID) < (int32)fLogSize) {
			detached = true;
		} else {
			// We created a transaction larger than one we can write back to
			// disk - the only option we have (besides risking disk corruption
			// by writing it back anyway), is to let it fail.
			dprintf("transaction too large (%d blocks, log size %d)!\n",
				(int)_TransactionSize(), (int)fLogSize);
			return B_BUFFER_OVERFLOW;
		}
	}

	fHasSubtransaction = false;

	int32 blockShift = fVolume->BlockShift();
	off_t logOffset = fVolume->ToBlock(fVolume->Log()) << blockShift;
	off_t logStart = fVolume->LogEnd() % fLogSize;
	off_t logPosition = logStart;
	status_t status;

	// create run_array structures for all changed blocks

	RunArrays runArrays(this);

	// iterate over all blocks of the (possibly detached) transaction
	off_t blockNumber;
	long cookie = 0;
	while (cache_next_block_in_transaction(fVolume->BlockCache(),
			fTransactionID, detached, &cookie, &blockNumber, NULL,
			NULL) == B_OK) {
		status = runArrays.Insert(blockNumber);
		if (status < B_OK) {
			FATAL(("filling log entry failed!"));
			return status;
		}
	}

	if (runArrays.CountBlocks() == 0) {
		// nothing has changed during this transaction
		if (detached) {
			fTransactionID = cache_detach_sub_transaction(fVolume->BlockCache(),
				fTransactionID, NULL, NULL);
			fUnwrittenTransactions = 1;
		} else {
			cache_end_transaction(fVolume->BlockCache(), fTransactionID, NULL,
				NULL);
			fUnwrittenTransactions = 0;
		}
		return B_OK;
	}

	// If necessary, flush the log, so that we have enough space for this
	// transaction
	if (runArrays.LogEntryLength() > FreeLogBlocks()) {
		cache_sync_transaction(fVolume->BlockCache(), fTransactionID);
		if (runArrays.LogEntryLength() > FreeLogBlocks()) {
			panic("no space in log after sync (%ld for %ld blocks)!",
				(long)FreeLogBlocks(), (long)runArrays.LogEntryLength());
		}
	}

	// Write log entries to disk

	int32 maxVecs = runArrays.MaxArrayLength() + 1;
		// one extra for the index block

	BStackOrHeapArray<iovec, 8> vecs(maxVecs);
	if (!vecs.IsValid()) {
		// TODO: write back log entries directly?
		return B_NO_MEMORY;
	}

	// each run_array is written as one index block followed by its data
	// blocks, gathered into a single writev_pos() where possible
	for (int32 k = 0; k < runArrays.CountArrays(); k++) {
		run_array* array = runArrays.ArrayAt(k);
		int32 index = 0, count = 1;
		int32 wrap = fLogSize - logStart;

		add_to_iovec(vecs, index, maxVecs, (void*)array, fVolume->BlockSize());

		// add block runs

		for (int32 i = 0; i < array->CountRuns(); i++) {
			const block_run& run = array->RunAt(i);
			off_t blockNumber = fVolume->ToBlock(run);

			for (int32 j = 0; j < run.Length(); j++) {
				if (count >= wrap) {
					// We need to write back the first half of the entry
					// directly as the log wraps around
					if (writev_pos(fVolume->Device(), logOffset
						+ (logStart << blockShift), vecs, index) < 0)
						FATAL(("could not write log area!\n"));

					logPosition = logStart + count;
					logStart = 0;
					wrap = fLogSize;
					count = 0;
					index = 0;
				}

				// make blocks available in the cache
				const void* data = block_cache_get(fVolume->BlockCache(),
					blockNumber + j);
				if (data == NULL)
					return B_IO_ERROR;

				add_to_iovec(vecs, index, maxVecs, data, fVolume->BlockSize());
				count++;
			}
		}

		// write back the rest of the log entry
		if (count > 0) {
			logPosition = logStart + count;
			if (writev_pos(fVolume->Device(), logOffset
					+ (logStart << blockShift), vecs, index) < 0)
				FATAL(("could not write log area: %s!\n", strerror(errno)));
		}

		// release blocks again
		for (int32 i = 0; i < array->CountRuns(); i++) {
			const block_run& run = array->RunAt(i);
			off_t blockNumber = fVolume->ToBlock(run);

			for (int32 j = 0; j < run.Length(); j++) {
				block_cache_put(fVolume->BlockCache(), blockNumber + j);
			}
		}

		logStart = logPosition % fLogSize;
	}

	LogEntry* logEntry = new(std::nothrow) LogEntry(this, fVolume->LogEnd(),
		runArrays.LogEntryLength());
	if (logEntry == NULL) {
		FATAL(("no memory to allocate log entries!"));
		return B_NO_MEMORY;
	}

#ifdef BFS_DEBUGGER_COMMANDS
	logEntry->SetTransactionID(fTransactionID);
#endif

	// Update the log end pointer in the superblock

	fVolume->SuperBlock().flags = SUPER_BLOCK_DISK_DIRTY;
	fVolume->SuperBlock().log_end = HOST_ENDIAN_TO_BFS_INT64(logPosition);

	status = fVolume->WriteSuperBlock();

	fVolume->LogEnd() = logPosition;
	T(LogEntry(logEntry, fVolume->LogEnd(), true));

	// We need to flush the drives own cache here to ensure
	// disk consistency.
	// If that call fails, we can't do anything about it anyway
	ioctl(fVolume->Device(), B_FLUSH_DRIVE_CACHE);

	// at this point, we can finally end the transaction - we're in
	// a guaranteed valid state

	mutex_lock(&fEntriesLock);
	fEntries.Add(logEntry);
	fUsed += logEntry->Length();
	mutex_unlock(&fEntriesLock);

	if (detached) {
		fTransactionID = cache_detach_sub_transaction(fVolume->BlockCache(),
			fTransactionID, _TransactionWritten, logEntry);
		fUnwrittenTransactions = 1;

		if (status == B_OK && _TransactionSize() > fLogSize) {
			// If the transaction is too large after writing, there is no way to
			// recover, so let this transaction fail.
			dprintf("transaction too large (%d blocks, log size %d)!\n",
				(int)_TransactionSize(), (int)fLogSize);
			return B_BUFFER_OVERFLOW;
		}
	} else {
		cache_end_transaction(fVolume->BlockCache(), fTransactionID,
			_TransactionWritten, logEntry);
		fUnwrittenTransactions = 0;
	}

	return status;
}
924
925
926/*!	Flushes the current log entry to disk. If \a flushBlocks is \c true it will
927	also write back all dirty blocks for this volume.
928*/
status_t
Journal::_FlushLog(bool canWait, bool flushBlocks)
{
	status_t status = canWait ? recursive_lock_lock(&fLock)
		: recursive_lock_trylock(&fLock);
	if (status != B_OK)
		return status;

	if (recursive_lock_get_recursion(&fLock) > 1) {
		// whoa, FlushLogAndBlocks() was called from inside a transaction
		recursive_lock_unlock(&fLock);
		return B_OK;
	}

	// write the current log entry to disk

	if (fUnwrittenTransactions != 0) {
		status = _WriteTransactionToLog();
		if (status < B_OK)
			FATAL(("writing current log entry failed: %s\n", strerror(status)));
	}

	// optionally also write back all dirty blocks of this volume
	if (flushBlocks)
		status = fVolume->FlushDevice();

	recursive_lock_unlock(&fLock);
	return status;
}
957
958
959/*!	Flushes the current log entry to disk, and also writes back all dirty
960	blocks for this volume (completing all open transactions).
961*/
status_t
Journal::FlushLogAndBlocks()
{
	// may block until the journal lock is available
	return _FlushLog(true, true);
}
967
968
// Locks the journal for the given transaction, starting a cache
// transaction (or sub-transaction) when \a owner is given.
status_t
Journal::Lock(Transaction* owner, bool separateSubTransactions)
{
	status_t status = recursive_lock_lock(&fLock);
	if (status != B_OK)
		return status;

	if (!fSeparateSubTransactions && recursive_lock_get_recursion(&fLock) > 1) {
		// we'll just use the current transaction again
		return B_OK;
	}

	if (separateSubTransactions)
		fSeparateSubTransactions = true;

	// transactions nest via the owner's parent pointer
	if (owner != NULL)
		owner->SetParent(fOwner);

	fOwner = owner;

	// TODO: we need a way to find out how big the current transaction is;
	//	we need to be able to either detach the latest sub transaction on
	//	demand, as well as having some kind of fall back plan in case the
	//	sub transaction itself grows bigger than the log.
	//	For that, it would be nice to have some call-back interface in the
	//	cache transaction API...

	if (fOwner != NULL) {
		if (fUnwrittenTransactions > 0) {
			// start a sub transaction
			cache_start_sub_transaction(fVolume->BlockCache(), fTransactionID);
			fHasSubtransaction = true;
		} else
			fTransactionID = cache_start_transaction(fVolume->BlockCache());

		if (fTransactionID < B_OK) {
			recursive_lock_unlock(&fLock);
			return fTransactionID;
		}

		// get notified when the transaction goes idle, so it can be flushed
		cache_add_transaction_listener(fVolume->BlockCache(), fTransactionID,
			TRANSACTION_IDLE, _TransactionIdle, this);
	}
	return B_OK;
}
1014
1015
// Releases the journal lock, ending the transaction when this is the
// outermost unlock (or when sub-transactions are kept separate).
status_t
Journal::Unlock(Transaction* owner, bool success)
{
	if (fSeparateSubTransactions || recursive_lock_get_recursion(&fLock) == 1) {
		// we only end the transaction if we would really unlock it
		// TODO: what about failing transactions that do not unlock?
		// (they must make the parent fail, too)
		if (owner != NULL) {
			status_t status = _TransactionDone(success);
			if (status != B_OK)
				return status;

			// Unlocking the inodes might trigger new transactions, but we
			// cannot reuse the current one anymore, as this one is already
			// closed.
			bool separateSubTransactions = fSeparateSubTransactions;
			fSeparateSubTransactions = true;
			owner->NotifyListeners(success);
			fSeparateSubTransactions = separateSubTransactions;

			fOwner = owner->Parent();
		} else
			fOwner = NULL;

		fTimestamp = system_time();

		if (fSeparateSubTransactions
			&& recursive_lock_get_recursion(&fLock) == 1)
			fSeparateSubTransactions = false;
	} else
		owner->MoveListenersTo(fOwner);
			// the nested transaction stays open; hand its listeners to the
			// enclosing one so they are still notified/reverted later

	recursive_lock_unlock(&fLock);
	return B_OK;
}
1051
1052
1053uint32
1054Journal::_TransactionSize() const
1055{
1056	int32 count = cache_blocks_in_transaction(fVolume->BlockCache(),
1057		fTransactionID);
1058	if (count <= 0)
1059		return 0;
1060
1061	// take the number of array blocks in this transaction into account
1062	uint32 maxRuns = run_array::MaxRuns(fVolume->BlockSize());
1063	uint32 arrayBlocks = (count + maxRuns - 1) / maxRuns;
1064	return count + arrayBlocks;
1065}
1066
1067
// Ends the current transaction: aborts it on failure, otherwise either
// batches it with later transactions or writes it to the log right away.
status_t
Journal::_TransactionDone(bool success)
{
	if (!success) {
		if (_HasSubTransaction()) {
			cache_abort_sub_transaction(fVolume->BlockCache(), fTransactionID);
			// We can continue to use the parent transaction afterwards
		} else {
			cache_abort_transaction(fVolume->BlockCache(), fTransactionID);
			fUnwrittenTransactions = 0;
		}

		return B_OK;
	}

	// Up to a maximum size, we will just batch several
	// transactions together to improve speed
	uint32 size = _TransactionSize();
	if (size < fMaxTransactionSize) {
		// Flush the log from time to time, so that we have enough space
		// for this transaction
		if (size > FreeLogBlocks())
			cache_sync_transaction(fVolume->BlockCache(), fTransactionID);

		fUnwrittenTransactions++;
		return B_OK;
	}

	return _WriteTransactionToLog();
}
1098
1099
1100//	#pragma mark - debugger commands
1101
1102
1103#ifdef BFS_DEBUGGER_COMMANDS
1104
1105
// Kernel debugger command helper: prints the journal state and all
// pending log entries (no locks are taken - debugger context only).
void
Journal::Dump()
{
	kprintf("Journal %p\n", this);
	kprintf("  log start:            %" B_PRId32 "\n", fVolume->LogStart());
	kprintf("  log end:              %" B_PRId32 "\n", fVolume->LogEnd());
	kprintf("  owner:                %p\n", fOwner);
	kprintf("  log size:             %" B_PRIu32 "\n", fLogSize);
	kprintf("  max transaction size: %" B_PRIu32 "\n", fMaxTransactionSize);
	kprintf("  used:                 %" B_PRIu32 "\n", fUsed);
	kprintf("  unwritten:            %" B_PRId32 "\n", fUnwrittenTransactions);
	kprintf("  timestamp:            %" B_PRId64 "\n", fTimestamp);
	kprintf("  transaction ID:       %" B_PRId32 "\n", fTransactionID);
	kprintf("  has subtransaction:   %d\n", fHasSubtransaction);
	kprintf("  separate sub-trans.:  %d\n", fSeparateSubTransactions);
	kprintf("entries:\n");
	kprintf("  address        id  start length\n");

	LogEntryList::Iterator iterator = fEntries.GetIterator();

	while (iterator.HasNext()) {
		LogEntry* entry = iterator.Next();

		kprintf("  %p %6" B_PRId32 " %6" B_PRIu32 " %6" B_PRIu32 "\n", entry,
			entry->TransactionID(), entry->Start(), entry->Length());
	}
}
1133
1134
// Kernel debugger command: dumps the journal of the given volume.
int
dump_journal(int argc, char** argv)
{
	if (argc != 2 || !strcmp(argv[1], "--help")) {
		kprintf("usage: %s <ptr-to-volume>\n", argv[0]);
		return 0;
	}

	Volume* volume = (Volume*)parse_expression(argv[1]);
	Journal* journal = volume->GetJournal(0);

	journal->Dump();
	return 0;
}
1149
1150
1151#endif	// BFS_DEBUGGER_COMMANDS
1152
1153
1154//	#pragma mark - TransactionListener
1155
1156
// Nothing to initialize; listeners register via Transaction::AddListener().
TransactionListener::TransactionListener()
{
}
1160
1161
// Virtual destructor for the listener base class; nothing to clean up.
TransactionListener::~TransactionListener()
{
}
1165
1166
1167//	#pragma mark - Transaction
1168
1169
1170status_t
1171Transaction::Start(Volume* volume, off_t refBlock)
1172{
1173	// has it already been started?
1174	if (fJournal != NULL)
1175		return B_OK;
1176
1177	fJournal = volume->GetJournal(refBlock);
1178	if (fJournal != NULL && fJournal->Lock(this, false) == B_OK)
1179		return B_OK;
1180
1181	fJournal = NULL;
1182	return B_ERROR;
1183}
1184
1185
// Registers \a listener to be notified when this transaction ends.
void
Transaction::AddListener(TransactionListener* listener)
{
	if (fJournal == NULL)
		panic("Transaction is not running!");

	fListeners.Add(listener);
}
1194
1195
// Unregisters \a listener and tells it that it left the transaction.
void
Transaction::RemoveListener(TransactionListener* listener)
{
	if (fJournal == NULL)
		panic("Transaction is not running!");

	fListeners.Remove(listener);
	listener->RemovedFromTransaction();
}
1205
1206
1207void
1208Transaction::NotifyListeners(bool success)
1209{
1210	while (TransactionListener* listener = fListeners.RemoveHead()) {
1211		listener->TransactionDone(success);
1212		listener->RemovedFromTransaction();
1213	}
1214}
1215
1216
1217/*!	Move the inodes into the parent transaction. This is needed only to make
1218	sure they will still be reverted in case the transaction is aborted.
1219*/
1220void
1221Transaction::MoveListenersTo(Transaction* transaction)
1222{
1223	while (TransactionListener* listener = fListeners.RemoveHead()) {
1224		transaction->fListeners.Add(listener);
1225	}
1226}
1227