1/*
2 * Copyright 2004-2020, Axel D��rfler, axeld@pinc-software.de.
3 * Distributed under the terms of the MIT License.
4 */
5
6
7#include <block_cache.h>
8
9#include <unistd.h>
10#include <stdlib.h>
11#include <string.h>
12#include <errno.h>
13
14#include <KernelExport.h>
15#include <fs_cache.h>
16
17#include <condition_variable.h>
18#include <lock.h>
19#include <low_resource_manager.h>
20#include <slab/Slab.h>
21#include <tracing.h>
22#include <util/kernel_cpp.h>
23#include <util/DoublyLinkedList.h>
24#include <util/AutoLock.h>
25#include <vm/vm_page.h>
26
27#include "kernel_debug_config.h"
28
29
30// TODO: this is a naive but growing implementation to test the API:
31//	block reading/writing is not at all optimized for speed, it will
32//	just read and write single blocks.
33// TODO: the retrieval/copy of the original data could be delayed until the
34//		new data must be written, ie. in low memory situations.
35
36//#define TRACE_BLOCK_CACHE
37#ifdef TRACE_BLOCK_CACHE
38#	define TRACE(x)	dprintf x
39#else
40#	define TRACE(x) ;
41#endif
42
43#define TRACE_ALWAYS(x) dprintf x
44
45// This macro is used for fatal situations that are acceptable in a running
46// system, like out of memory situations - should only panic for debugging.
47#define FATAL(x) panic x
48
49static const bigtime_t kTransactionIdleTime = 2000000LL;
50	// a transaction is considered idle after 2 seconds of inactivity
51
52
53namespace {
54
55struct cache_transaction;
56struct cached_block;
57struct block_cache;
58typedef DoublyLinkedListLink<cached_block> block_link;
59
60struct cached_block {
61	cached_block*	next;			// next in hash
62	cached_block*	transaction_next;
63	block_link		link;
64	off_t			block_number;
65	void*			current_data;
66		// The data that is seen by everyone using the API; this one is always
67		// present.
68	void*			original_data;
69		// When in a transaction, this contains the original data from before
70		// the transaction.
71	void*			parent_data;
72		// This is a lazily alloced buffer that represents the contents of the
73		// block in the parent transaction. It may point to current_data if the
74		// contents have been changed only in the parent transaction, or, if the
75		// block has been changed in the current sub transaction already, to a
76		// new block containing the contents changed in the parent transaction.
77		// If this is NULL, the block has not been changed in the parent
78		// transaction at all.
79#if BLOCK_CACHE_DEBUG_CHANGED
80	void*			compare;
81#endif
82	int32			ref_count;
83	int32			last_accessed;
84	bool			busy_reading : 1;
85	bool			busy_writing : 1;
86	bool			is_writing : 1;
87		// Block has been checked out for writing without transactions, and
88		// cannot be written back if set
89	bool			is_dirty : 1;
90	bool			unused : 1;
91	bool			discard : 1;
92	bool			busy_reading_waiters : 1;
93	bool			busy_writing_waiters : 1;
94	cache_transaction* transaction;
95		// This is the current active transaction, if any, the block is
96		// currently in (meaning was changed as a part of it).
97	cache_transaction* previous_transaction;
98		// This is set to the last transaction that was ended containing this
99		// block. In this case, the block has not yet written back yet, and
100		// the changed data is either in current_data, or original_data -- the
101		// latter if the block is already being part of another transaction.
102		// There can only be one previous transaction, so when the active
103		// transaction ends, the changes of the previous transaction have to
104		// be written back before that transaction becomes the next previous
105		// transaction.
106
107	bool CanBeWritten() const;
108	int32 LastAccess() const
109		{ return system_time() / 1000000L - last_accessed; }
110};
111
112typedef DoublyLinkedList<cached_block,
113	DoublyLinkedListMemberGetLink<cached_block,
114		&cached_block::link> > block_list;
115
116struct cache_notification : DoublyLinkedListLinkImpl<cache_notification> {
117	static inline void* operator new(size_t size);
118	static inline void operator delete(void* block);
119
120	int32			transaction_id;
121	int32			events_pending;
122	int32			events;
123	transaction_notification_hook hook;
124	void*			data;
125	bool			delete_after_event;
126};
127
128typedef DoublyLinkedList<cache_notification> NotificationList;
129
130static object_cache* sCacheNotificationCache;
131
132struct cache_listener;
133typedef DoublyLinkedListLink<cache_listener> listener_link;
134
135struct cache_listener : cache_notification {
136	listener_link	link;
137};
138
139typedef DoublyLinkedList<cache_listener,
140	DoublyLinkedListMemberGetLink<cache_listener,
141		&cache_listener::link> > ListenerList;
142
143void*
144cache_notification::operator new(size_t size)
145{
146	// We can't really know whether something is a cache_notification or a
147	// cache_listener at runtime, so we just use one object_cache for both
148	// with the size set to that of the (slightly larger) cache_listener.
149	// In practice, the vast majority of cache_notifications are really
150	// cache_listeners, so this is a more than acceptable trade-off.
151	ASSERT(size <= sizeof(cache_listener));
152	return object_cache_alloc(sCacheNotificationCache, 0);
153}
154
155void
156cache_notification::operator delete(void* block)
157{
158	object_cache_free(sCacheNotificationCache, block, 0);
159}
160
161
162struct BlockHash {
163	typedef off_t			KeyType;
164	typedef	cached_block	ValueType;
165
166	size_t HashKey(KeyType key) const
167	{
168		return key;
169	}
170
171	size_t Hash(ValueType* block) const
172	{
173		return block->block_number;
174	}
175
176	bool Compare(KeyType key, ValueType* block) const
177	{
178		return block->block_number == key;
179	}
180
181	ValueType*& GetLink(ValueType* value) const
182	{
183		return value->next;
184	}
185};
186
187typedef BOpenHashTable<BlockHash> BlockTable;
188
189
190struct TransactionHash {
191	typedef int32				KeyType;
192	typedef	cache_transaction	ValueType;
193
194	size_t HashKey(KeyType key) const
195	{
196		return key;
197	}
198
199	size_t Hash(ValueType* transaction) const;
200	bool Compare(KeyType key, ValueType* transaction) const;
201	ValueType*& GetLink(ValueType* value) const;
202};
203
204typedef BOpenHashTable<TransactionHash> TransactionTable;
205
206
207struct block_cache : DoublyLinkedListLinkImpl<block_cache> {
208	BlockTable*		hash;
209	mutex			lock;
210	int				fd;
211	off_t			max_blocks;
212	size_t			block_size;
213	int32			next_transaction_id;
214	cache_transaction* last_transaction;
215	TransactionTable* transaction_hash;
216
217	object_cache*	buffer_cache;
218	block_list		unused_blocks;
219	uint32			unused_block_count;
220
221	ConditionVariable busy_reading_condition;
222	uint32			busy_reading_count;
223	bool			busy_reading_waiters;
224
225	ConditionVariable busy_writing_condition;
226	uint32			busy_writing_count;
227	bool			busy_writing_waiters;
228
229	bigtime_t		last_block_write;
230	bigtime_t		last_block_write_duration;
231
232	uint32			num_dirty_blocks;
233	bool			read_only;
234
235	NotificationList pending_notifications;
236	ConditionVariable condition_variable;
237
238					block_cache(int fd, off_t numBlocks, size_t blockSize,
239						bool readOnly);
240					~block_cache();
241
242	status_t		Init();
243
244	void			Free(void* buffer);
245	void*			Allocate();
246	void			FreeBlock(cached_block* block);
247	cached_block*	NewBlock(off_t blockNumber);
248	void			FreeBlockParentData(cached_block* block);
249
250	void			RemoveUnusedBlocks(int32 count, int32 minSecondsOld = 0);
251	void			RemoveBlock(cached_block* block);
252	void			DiscardBlock(cached_block* block);
253
254private:
255	static void		_LowMemoryHandler(void* data, uint32 resources,
256						int32 level);
257	cached_block*	_GetUnusedBlock();
258};
259
260struct cache_transaction {
261	cache_transaction();
262
263	cache_transaction* next;
264	int32			id;
265	int32			num_blocks;
266	int32			main_num_blocks;
267	int32			sub_num_blocks;
268	cached_block*	first_block;
269	block_list		blocks;
270	ListenerList	listeners;
271	bool			open;
272	bool			has_sub_transaction;
273	bigtime_t		last_used;
274	int32			busy_writing_count;
275};
276
277
278class BlockWriter {
279public:
280								BlockWriter(block_cache* cache,
281									size_t max = SIZE_MAX);
282								~BlockWriter();
283
284			bool				Add(cached_block* block,
285									cache_transaction* transaction = NULL);
286			bool				Add(cache_transaction* transaction,
287									bool& hasLeftOvers);
288
289			status_t			Write(cache_transaction* transaction = NULL,
290									bool canUnlock = true);
291
292			bool				DeletedTransaction() const
293									{ return fDeletedTransaction; }
294
295	static	status_t			WriteBlock(block_cache* cache,
296									cached_block* block);
297
298private:
299			void*				_Data(cached_block* block) const;
300			status_t			_WriteBlock(cached_block* block);
301			void				_BlockDone(cached_block* block,
302									cache_transaction* transaction);
303			void				_UnmarkWriting(cached_block* block);
304
305	static	int					_CompareBlocks(const void* _blockA,
306									const void* _blockB);
307
308private:
309	static	const size_t		kBufferSize = 64;
310
311			block_cache*		fCache;
312			cached_block*		fBuffer[kBufferSize];
313			cached_block**		fBlocks;
314			size_t				fCount;
315			size_t				fTotal;
316			size_t				fCapacity;
317			size_t				fMax;
318			status_t			fStatus;
319			bool				fDeletedTransaction;
320};
321
322
323class TransactionLocking {
324public:
325	inline bool Lock(block_cache* cache)
326	{
327		mutex_lock(&cache->lock);
328
329		while (cache->busy_writing_count != 0) {
330			// wait for all blocks to be written
331			ConditionVariableEntry entry;
332			cache->busy_writing_condition.Add(&entry);
333			cache->busy_writing_waiters = true;
334
335			mutex_unlock(&cache->lock);
336
337			entry.Wait();
338
339			mutex_lock(&cache->lock);
340		}
341
342		return true;
343	}
344
345	inline void Unlock(block_cache* cache)
346	{
347		mutex_unlock(&cache->lock);
348	}
349};
350
351typedef AutoLocker<block_cache, TransactionLocking> TransactionLocker;
352
353} // namespace
354
355
356#if BLOCK_CACHE_BLOCK_TRACING && !defined(BUILDING_USERLAND_FS_SERVER)
357namespace BlockTracing {
358
359class Action : public AbstractTraceEntry {
360public:
361	Action(block_cache* cache, cached_block* block)
362		:
363		fCache(cache),
364		fBlockNumber(block->block_number),
365		fIsDirty(block->is_dirty),
366		fHasOriginal(block->original_data != NULL),
367		fHasParent(block->parent_data != NULL),
368		fTransactionID(-1),
369		fPreviousID(-1)
370	{
371		if (block->transaction != NULL)
372			fTransactionID = block->transaction->id;
373		if (block->previous_transaction != NULL)
374			fPreviousID = block->previous_transaction->id;
375	}
376
377	virtual void AddDump(TraceOutput& out)
378	{
379		out.Print("block cache %p, %s %" B_PRIu64 ", %c%c%c transaction %" B_PRId32
380			" (previous id %" B_PRId32 ")\n", fCache, _Action(), fBlockNumber,
381			fIsDirty ? 'd' : '-', fHasOriginal ? 'o' : '-',
382			fHasParent ? 'p' : '-', fTransactionID, fPreviousID);
383	}
384
385	virtual const char* _Action() const = 0;
386
387private:
388	block_cache*		fCache;
389	uint64				fBlockNumber;
390	bool				fIsDirty;
391	bool				fHasOriginal;
392	bool				fHasParent;
393	int32				fTransactionID;
394	int32				fPreviousID;
395};
396
397class Get : public Action {
398public:
399	Get(block_cache* cache, cached_block* block)
400		:
401		Action(cache, block)
402	{
403		Initialized();
404	}
405
406	virtual const char* _Action() const { return "get"; }
407};
408
409class Put : public Action {
410public:
411	Put(block_cache* cache, cached_block* block)
412		:
413		Action(cache, block)
414	{
415		Initialized();
416	}
417
418	virtual const char* _Action() const { return "put"; }
419};
420
421class Read : public Action {
422public:
423	Read(block_cache* cache, cached_block* block)
424		:
425		Action(cache, block)
426	{
427		Initialized();
428	}
429
430	virtual const char* _Action() const { return "read"; }
431};
432
433class Write : public Action {
434public:
435	Write(block_cache* cache, cached_block* block)
436		:
437		Action(cache, block)
438	{
439		Initialized();
440	}
441
442	virtual const char* _Action() const { return "write"; }
443};
444
445class Flush : public Action {
446public:
447	Flush(block_cache* cache, cached_block* block, bool getUnused = false)
448		:
449		Action(cache, block),
450		fGetUnused(getUnused)
451	{
452		Initialized();
453	}
454
455	virtual const char* _Action() const
456		{ return fGetUnused ? "get-unused" : "flush"; }
457
458private:
459	bool	fGetUnused;
460};
461
462class Error : public AbstractTraceEntry {
463public:
464	Error(block_cache* cache, uint64 blockNumber, const char* message,
465			status_t status = B_OK)
466		:
467		fCache(cache),
468		fBlockNumber(blockNumber),
469		fMessage(message),
470		fStatus(status)
471	{
472		Initialized();
473	}
474
475	virtual void AddDump(TraceOutput& out)
476	{
477		out.Print("block cache %p, error %" B_PRIu64 ", %s%s%s",
478			fCache, fBlockNumber, fMessage, fStatus != B_OK ? ": " : "",
479			fStatus != B_OK ? strerror(fStatus) : "");
480	}
481
482private:
483	block_cache*	fCache;
484	uint64			fBlockNumber;
485	const char*		fMessage;
486	status_t		fStatus;
487};
488
489#if BLOCK_CACHE_BLOCK_TRACING >= 2
490class BlockData : public AbstractTraceEntry {
491public:
492	enum {
493		kCurrent	= 0x01,
494		kParent		= 0x02,
495		kOriginal	= 0x04
496	};
497
498	BlockData(block_cache* cache, cached_block* block, const char* message)
499		:
500		fCache(cache),
501		fSize(cache->block_size),
502		fBlockNumber(block->block_number),
503		fMessage(message)
504	{
505		_Allocate(fCurrent, block->current_data);
506		_Allocate(fParent, block->parent_data);
507		_Allocate(fOriginal, block->original_data);
508
509#if KTRACE_PRINTF_STACK_TRACE
510		fStackTrace = capture_tracing_stack_trace(KTRACE_PRINTF_STACK_TRACE, 1,
511			false);
512#endif
513
514		Initialized();
515	}
516
517	virtual void AddDump(TraceOutput& out)
518	{
519		out.Print("block cache %p, block %" B_PRIu64 ", data %c%c%c: %s",
520			fCache, fBlockNumber, fCurrent != NULL ? 'c' : '-',
521			fParent != NULL ? 'p' : '-', fOriginal != NULL ? 'o' : '-',
522			fMessage);
523	}
524
525#if KTRACE_PRINTF_STACK_TRACE
526	virtual void DumpStackTrace(TraceOutput& out)
527	{
528		out.PrintStackTrace(fStackTrace);
529	}
530#endif
531
532	void DumpBlocks(uint32 which, uint32 offset, uint32 size)
533	{
534		if ((which & kCurrent) != 0)
535			DumpBlock(kCurrent, offset, size);
536		if ((which & kParent) != 0)
537			DumpBlock(kParent, offset, size);
538		if ((which & kOriginal) != 0)
539			DumpBlock(kOriginal, offset, size);
540	}
541
542	void DumpBlock(uint32 which, uint32 offset, uint32 size)
543	{
544		if (offset > fSize) {
545			kprintf("invalid offset (block size %" B_PRIu32 ")\n", fSize);
546			return;
547		}
548		if (offset + size > fSize)
549			size = fSize - offset;
550
551		const char* label;
552		uint8* data;
553
554		if ((which & kCurrent) != 0) {
555			label = "current";
556			data = fCurrent;
557		} else if ((which & kParent) != 0) {
558			label = "parent";
559			data = fParent;
560		} else if ((which & kOriginal) != 0) {
561			label = "original";
562			data = fOriginal;
563		} else
564			return;
565
566		kprintf("%s: offset %" B_PRIu32 ", %" B_PRIu32 " bytes\n", label, offset, size);
567
568		static const uint32 kBlockSize = 16;
569		data += offset;
570
571		for (uint32 i = 0; i < size;) {
572			int start = i;
573
574			kprintf("  %04" B_PRIx32 " ", i);
575			for (; i < start + kBlockSize; i++) {
576				if (!(i % 4))
577					kprintf(" ");
578
579				if (i >= size)
580					kprintf("  ");
581				else
582					kprintf("%02x", data[i]);
583			}
584
585			kprintf("\n");
586		}
587	}
588
589private:
590	void _Allocate(uint8*& target, void* source)
591	{
592		if (source == NULL) {
593			target = NULL;
594			return;
595		}
596
597		target = alloc_tracing_buffer_memcpy(source, fSize, false);
598	}
599
600	block_cache*	fCache;
601	uint32			fSize;
602	uint64			fBlockNumber;
603	const char*		fMessage;
604	uint8*			fCurrent;
605	uint8*			fParent;
606	uint8*			fOriginal;
607#if KTRACE_PRINTF_STACK_TRACE
608	tracing_stack_trace* fStackTrace;
609#endif
610};
611#endif	// BLOCK_CACHE_BLOCK_TRACING >= 2
612
613}	// namespace BlockTracing
614
615#	define TB(x) new(std::nothrow) BlockTracing::x;
616#else
617#	define TB(x) ;
618#endif
619
620#if BLOCK_CACHE_BLOCK_TRACING >= 2
621#	define TB2(x) new(std::nothrow) BlockTracing::x;
622#else
623#	define TB2(x) ;
624#endif
625
626
627#if BLOCK_CACHE_TRANSACTION_TRACING && !defined(BUILDING_USERLAND_FS_SERVER)
628namespace TransactionTracing {
629
630class Action : public AbstractTraceEntry {
631public:
632	Action(const char* label, block_cache* cache,
633			cache_transaction* transaction)
634		:
635		fCache(cache),
636		fTransaction(transaction),
637		fID(transaction->id),
638		fSub(transaction->has_sub_transaction),
639		fNumBlocks(transaction->num_blocks),
640		fSubNumBlocks(transaction->sub_num_blocks)
641	{
642		strlcpy(fLabel, label, sizeof(fLabel));
643		Initialized();
644	}
645
646	virtual void AddDump(TraceOutput& out)
647	{
648		out.Print("block cache %p, %s transaction %p (id %" B_PRId32 ")%s"
649			", %" B_PRId32 "/%" B_PRId32 " blocks", fCache, fLabel, fTransaction,
650			fID, fSub ? " sub" : "", fNumBlocks, fSubNumBlocks);
651	}
652
653private:
654	char				fLabel[12];
655	block_cache*		fCache;
656	cache_transaction*	fTransaction;
657	int32				fID;
658	bool				fSub;
659	int32				fNumBlocks;
660	int32				fSubNumBlocks;
661};
662
663class Detach : public AbstractTraceEntry {
664public:
665	Detach(block_cache* cache, cache_transaction* transaction,
666			cache_transaction* newTransaction)
667		:
668		fCache(cache),
669		fTransaction(transaction),
670		fID(transaction->id),
671		fSub(transaction->has_sub_transaction),
672		fNewTransaction(newTransaction),
673		fNewID(newTransaction->id)
674	{
675		Initialized();
676	}
677
678	virtual void AddDump(TraceOutput& out)
679	{
680		out.Print("block cache %p, detach transaction %p (id %" B_PRId32 ")"
681			"from transaction %p (id %" B_PRId32 ")%s",
682			fCache, fNewTransaction, fNewID, fTransaction, fID,
683			fSub ? " sub" : "");
684	}
685
686private:
687	block_cache*		fCache;
688	cache_transaction*	fTransaction;
689	int32				fID;
690	bool				fSub;
691	cache_transaction*	fNewTransaction;
692	int32				fNewID;
693};
694
695class Abort : public AbstractTraceEntry {
696public:
697	Abort(block_cache* cache, cache_transaction* transaction)
698		:
699		fCache(cache),
700		fTransaction(transaction),
701		fID(transaction->id),
702		fNumBlocks(0)
703	{
704		bool isSub = transaction->has_sub_transaction;
705		fNumBlocks = isSub ? transaction->sub_num_blocks
706			: transaction->num_blocks;
707		fBlocks = (off_t*)alloc_tracing_buffer(fNumBlocks * sizeof(off_t));
708		if (fBlocks != NULL) {
709			cached_block* block = transaction->first_block;
710			for (int32 i = 0; block != NULL && i < fNumBlocks;
711					block = block->transaction_next) {
712				fBlocks[i++] = block->block_number;
713			}
714		} else
715			fNumBlocks = 0;
716
717#if KTRACE_PRINTF_STACK_TRACE
718		fStackTrace = capture_tracing_stack_trace(KTRACE_PRINTF_STACK_TRACE, 1,
719			false);
720#endif
721
722		Initialized();
723	}
724
725	virtual void AddDump(TraceOutput& out)
726	{
727		out.Print("block cache %p, abort transaction "
728			"%p (id %" B_PRId32 "), blocks", fCache, fTransaction, fID);
729		for (int32 i = 0; i < fNumBlocks && !out.IsFull(); i++)
730			out.Print(" %" B_PRIdOFF, fBlocks[i]);
731	}
732
733#if KTRACE_PRINTF_STACK_TRACE
734	virtual void DumpStackTrace(TraceOutput& out)
735	{
736		out.PrintStackTrace(fStackTrace);
737	}
738#endif
739
740private:
741	block_cache*		fCache;
742	cache_transaction*	fTransaction;
743	int32				fID;
744	off_t*				fBlocks;
745	int32				fNumBlocks;
746#if KTRACE_PRINTF_STACK_TRACE
747	tracing_stack_trace* fStackTrace;
748#endif
749};
750
751}	// namespace TransactionTracing
752
753#	define T(x) new(std::nothrow) TransactionTracing::x;
754#else
755#	define T(x) ;
756#endif
757
758
759static DoublyLinkedList<block_cache> sCaches;
760static mutex sCachesLock = MUTEX_INITIALIZER("block caches");
761static mutex sCachesMemoryUseLock
762	= MUTEX_INITIALIZER("block caches memory use");
763static size_t sUsedMemory;
764static sem_id sEventSemaphore;
765static mutex sNotificationsLock
766	= MUTEX_INITIALIZER("block cache notifications");
767static thread_id sNotifierWriterThread;
768static DoublyLinkedListLink<block_cache> sMarkCache;
769	// TODO: this only works if the link is the first entry of block_cache
770static object_cache* sBlockCache;
771
772
773//	#pragma mark - notifications/listener
774
775
776/*!	Checks whether or not this is an event that closes a transaction. */
777static inline bool
778is_closing_event(int32 event)
779{
780	return (event & (TRANSACTION_ABORTED | TRANSACTION_ENDED)) != 0;
781}
782
783
784static inline bool
785is_written_event(int32 event)
786{
787	return (event & TRANSACTION_WRITTEN) != 0;
788}
789
790
791/*!	From the specified \a notification, it will remove the lowest pending
792	event, and return that one in \a _event.
793	If there is no pending event anymore, it will return \c false.
794*/
795static bool
796get_next_pending_event(cache_notification* notification, int32* _event)
797{
798	for (int32 eventMask = 1; eventMask <= TRANSACTION_IDLE; eventMask <<= 1) {
799		int32 pending = atomic_and(&notification->events_pending,
800			~eventMask);
801
802		bool more = (pending & ~eventMask) != 0;
803
804		if ((pending & eventMask) != 0) {
805			*_event = eventMask;
806			return more;
807		}
808	}
809
810	return false;
811}
812
813
814static void
815flush_pending_notifications(block_cache* cache)
816{
817	ASSERT_LOCKED_MUTEX(&sCachesLock);
818
819	while (true) {
820		MutexLocker locker(sNotificationsLock);
821
822		cache_notification* notification = cache->pending_notifications.Head();
823		if (notification == NULL)
824			return;
825
826		bool deleteAfterEvent = false;
827		int32 event = -1;
828		if (!get_next_pending_event(notification, &event)) {
829			// remove the notification if this was the last pending event
830			cache->pending_notifications.Remove(notification);
831			deleteAfterEvent = notification->delete_after_event;
832		}
833
834		if (event >= 0) {
835			// Notify listener, we need to copy the notification, as it might
836			// be removed when we unlock the list.
837			cache_notification copy = *notification;
838			locker.Unlock();
839
840			copy.hook(copy.transaction_id, event, copy.data);
841
842			locker.Lock();
843		}
844
845		if (deleteAfterEvent)
846			delete notification;
847	}
848}
849
850
851/*!	Flushes all pending notifications by calling the appropriate hook
852	functions.
853	Must not be called with a cache lock held.
854*/
855static void
856flush_pending_notifications()
857{
858	MutexLocker _(sCachesLock);
859
860	DoublyLinkedList<block_cache>::Iterator iterator = sCaches.GetIterator();
861	while (iterator.HasNext()) {
862		block_cache* cache = iterator.Next();
863
864		flush_pending_notifications(cache);
865	}
866}
867
868
869/*!	Initializes the \a notification as specified. */
870static void
871set_notification(cache_transaction* transaction,
872	cache_notification &notification, int32 events,
873	transaction_notification_hook hook, void* data)
874{
875	notification.transaction_id = transaction != NULL ? transaction->id : -1;
876	notification.events_pending = 0;
877	notification.events = events;
878	notification.hook = hook;
879	notification.data = data;
880	notification.delete_after_event = false;
881}
882
883
884/*!	Makes sure the notification is deleted. It either deletes it directly,
885	when possible, or marks it for deletion if the notification is pending.
886*/
887static void
888delete_notification(cache_notification* notification)
889{
890	MutexLocker locker(sNotificationsLock);
891
892	if (notification->events_pending != 0)
893		notification->delete_after_event = true;
894	else
895		delete notification;
896}
897
898
899/*!	Adds the notification to the pending notifications list, or, if it's
900	already part of it, updates its events_pending field.
901	Also marks the notification to be deleted if \a deleteNotification
902	is \c true.
903	Triggers the notifier thread to run.
904*/
905static void
906add_notification(block_cache* cache, cache_notification* notification,
907	int32 event, bool deleteNotification)
908{
909	if (notification->hook == NULL)
910		return;
911
912	int32 pending = atomic_or(&notification->events_pending, event);
913	if (pending == 0) {
914		// not yet part of the notification list
915		MutexLocker locker(sNotificationsLock);
916		if (deleteNotification)
917			notification->delete_after_event = true;
918		cache->pending_notifications.Add(notification);
919	} else if (deleteNotification) {
920		// we might need to delete it ourselves if we're late
921		delete_notification(notification);
922	}
923
924	release_sem_etc(sEventSemaphore, 1, B_DO_NOT_RESCHEDULE);
925		// We're probably still holding some locks that makes rescheduling
926		// not a good idea at this point.
927}
928
929
930/*!	Notifies all interested listeners of this transaction about the \a event.
931	If \a event is a closing event (ie. TRANSACTION_ENDED, and
932	TRANSACTION_ABORTED), all listeners except those listening to
933	TRANSACTION_WRITTEN will be removed.
934*/
935static void
936notify_transaction_listeners(block_cache* cache, cache_transaction* transaction,
937	int32 event)
938{
939	T(Action("notify", cache, transaction));
940
941	bool isClosing = is_closing_event(event);
942	bool isWritten = is_written_event(event);
943
944	ListenerList::Iterator iterator = transaction->listeners.GetIterator();
945	while (iterator.HasNext()) {
946		cache_listener* listener = iterator.Next();
947
948		bool remove = (isClosing && !is_written_event(listener->events))
949			|| (isWritten && is_written_event(listener->events));
950		if (remove)
951			iterator.Remove();
952
953		if ((listener->events & event) != 0)
954			add_notification(cache, listener, event, remove);
955		else if (remove)
956			delete_notification(listener);
957	}
958}
959
960
961/*!	Removes and deletes all listeners that are still monitoring this
962	transaction.
963*/
964static void
965remove_transaction_listeners(block_cache* cache, cache_transaction* transaction)
966{
967	ListenerList::Iterator iterator = transaction->listeners.GetIterator();
968	while (iterator.HasNext()) {
969		cache_listener* listener = iterator.Next();
970		iterator.Remove();
971
972		delete_notification(listener);
973	}
974}
975
976
977static status_t
978add_transaction_listener(block_cache* cache, cache_transaction* transaction,
979	int32 events, transaction_notification_hook hookFunction, void* data)
980{
981	ListenerList::Iterator iterator = transaction->listeners.GetIterator();
982	while (iterator.HasNext()) {
983		cache_listener* listener = iterator.Next();
984
985		if (listener->data == data && listener->hook == hookFunction) {
986			// this listener already exists, just update it
987			listener->events |= events;
988			return B_OK;
989		}
990	}
991
992	cache_listener* listener = new cache_listener;
993	if (listener == NULL)
994		return B_NO_MEMORY;
995
996	set_notification(transaction, *listener, events, hookFunction, data);
997	transaction->listeners.Add(listener);
998	return B_OK;
999}
1000
1001
1002//	#pragma mark - private transaction
1003
1004
1005cache_transaction::cache_transaction()
1006{
1007	num_blocks = 0;
1008	main_num_blocks = 0;
1009	sub_num_blocks = 0;
1010	first_block = NULL;
1011	open = true;
1012	has_sub_transaction = false;
1013	last_used = system_time();
1014	busy_writing_count = 0;
1015}
1016
1017
1018static void
1019delete_transaction(block_cache* cache, cache_transaction* transaction)
1020{
1021	if (cache->last_transaction == transaction)
1022		cache->last_transaction = NULL;
1023
1024	remove_transaction_listeners(cache, transaction);
1025	delete transaction;
1026}
1027
1028
1029static cache_transaction*
1030lookup_transaction(block_cache* cache, int32 id)
1031{
1032	return cache->transaction_hash->Lookup(id);
1033}
1034
1035
1036size_t TransactionHash::Hash(cache_transaction* transaction) const
1037{
1038	return transaction->id;
1039}
1040
1041
1042bool TransactionHash::Compare(int32 key, cache_transaction* transaction) const
1043{
1044	return transaction->id == key;
1045}
1046
1047
1048cache_transaction*& TransactionHash::GetLink(cache_transaction* value) const
1049{
1050	return value->next;
1051}
1052
1053
1054/*!	Writes back any changes made to blocks in \a transaction that are still
1055	part of a previous transacton.
1056*/
1057static status_t
1058write_blocks_in_previous_transaction(block_cache* cache,
1059	cache_transaction* transaction)
1060{
1061	BlockWriter writer(cache);
1062
1063	cached_block* block = transaction->first_block;
1064	for (; block != NULL; block = block->transaction_next) {
1065		if (block->previous_transaction != NULL) {
1066			// need to write back pending changes
1067			writer.Add(block);
1068		}
1069	}
1070
1071	return writer.Write();
1072}
1073
1074
1075//	#pragma mark - cached_block
1076
1077
1078bool
1079cached_block::CanBeWritten() const
1080{
1081	return !busy_writing && !busy_reading
1082		&& (previous_transaction != NULL
1083			|| (transaction == NULL && is_dirty && !is_writing));
1084}
1085
1086
1087//	#pragma mark - BlockWriter
1088
1089
1090BlockWriter::BlockWriter(block_cache* cache, size_t max)
1091	:
1092	fCache(cache),
1093	fBlocks(fBuffer),
1094	fCount(0),
1095	fTotal(0),
1096	fCapacity(kBufferSize),
1097	fMax(max),
1098	fStatus(B_OK),
1099	fDeletedTransaction(false)
1100{
1101}
1102
1103
1104BlockWriter::~BlockWriter()
1105{
1106	if (fBlocks != fBuffer)
1107		free(fBlocks);
1108}
1109
1110
1111/*!	Adds the specified block to the to be written array. If no more blocks can
1112	be added, false is returned, otherwise true.
1113*/
1114bool
1115BlockWriter::Add(cached_block* block, cache_transaction* transaction)
1116{
1117	ASSERT(block->CanBeWritten());
1118
1119	if (fTotal == fMax)
1120		return false;
1121
1122	if (fCount >= fCapacity) {
1123		// Enlarge array if necessary
1124		cached_block** newBlocks;
1125		size_t newCapacity = max_c(256, fCapacity * 2);
1126		if (fBlocks == fBuffer)
1127			newBlocks = (cached_block**)malloc(newCapacity * sizeof(void*));
1128		else {
1129			newBlocks = (cached_block**)realloc(fBlocks,
1130				newCapacity * sizeof(void*));
1131		}
1132
1133		if (newBlocks == NULL) {
1134			// Allocating a larger array failed - we need to write back what
1135			// we have synchronously now (this will also clear the array)
1136			Write(transaction, false);
1137		} else {
1138			if (fBlocks == fBuffer)
1139				memcpy(newBlocks, fBuffer, kBufferSize * sizeof(void*));
1140
1141			fBlocks = newBlocks;
1142			fCapacity = newCapacity;
1143		}
1144	}
1145
1146	fBlocks[fCount++] = block;
1147	fTotal++;
1148	block->busy_writing = true;
1149	fCache->busy_writing_count++;
1150	if (block->previous_transaction != NULL)
1151		block->previous_transaction->busy_writing_count++;
1152
1153	return true;
1154}
1155
1156
1157/*!	Adds all blocks of the specified transaction to the to be written array.
1158	If no more blocks can be added, false is returned, otherwise true.
1159*/
1160bool
1161BlockWriter::Add(cache_transaction* transaction, bool& hasLeftOvers)
1162{
1163	ASSERT(!transaction->open);
1164
1165	if (transaction->busy_writing_count != 0) {
1166		hasLeftOvers = true;
1167		return true;
1168	}
1169
1170	hasLeftOvers = false;
1171
1172	block_list::Iterator blockIterator = transaction->blocks.GetIterator();
1173	while (cached_block* block = blockIterator.Next()) {
1174		if (!block->CanBeWritten()) {
1175			// This block was already part of a previous transaction within this
1176			// writer
1177			hasLeftOvers = true;
1178			continue;
1179		}
1180		if (!Add(block, transaction))
1181			return false;
1182
1183		if (DeletedTransaction())
1184			break;
1185	}
1186
1187	return true;
1188}
1189
1190
1191/*! Cache must be locked when calling this method, but it will be unlocked
1192	while the blocks are written back.
1193*/
1194status_t
1195BlockWriter::Write(cache_transaction* transaction, bool canUnlock)
1196{
1197	if (fCount == 0)
1198		return B_OK;
1199
1200	if (canUnlock)
1201		mutex_unlock(&fCache->lock);
1202
1203	// Sort blocks in their on-disk order
1204	// TODO: ideally, this should be handled by the I/O scheduler
1205
1206	qsort(fBlocks, fCount, sizeof(void*), &_CompareBlocks);
1207	fDeletedTransaction = false;
1208
1209	bigtime_t start = system_time();
1210
1211	for (uint32 i = 0; i < fCount; i++) {
1212		status_t status = _WriteBlock(fBlocks[i]);
1213		if (status != B_OK) {
1214			// propagate to global error handling
1215			if (fStatus == B_OK)
1216				fStatus = status;
1217
1218			_UnmarkWriting(fBlocks[i]);
1219			fBlocks[i] = NULL;
1220				// This block will not be marked clean
1221		}
1222	}
1223
1224	bigtime_t finish = system_time();
1225
1226	if (canUnlock)
1227		mutex_lock(&fCache->lock);
1228
1229	if (fStatus == B_OK && fCount >= 8) {
1230		fCache->last_block_write = finish;
1231		fCache->last_block_write_duration = (fCache->last_block_write - start)
1232			/ fCount;
1233	}
1234
1235	for (uint32 i = 0; i < fCount; i++)
1236		_BlockDone(fBlocks[i], transaction);
1237
1238	fCount = 0;
1239	return fStatus;
1240}
1241
1242
1243/*!	Writes the specified \a block back to disk. It will always only write back
1244	the oldest change of the block if it is part of more than one transaction.
1245	It will automatically send out TRANSACTION_WRITTEN notices, as well as
1246	delete transactions when they are no longer used, and \a deleteTransaction
1247	is \c true.
1248*/
1249/*static*/ status_t
1250BlockWriter::WriteBlock(block_cache* cache, cached_block* block)
1251{
1252	BlockWriter writer(cache);
1253
1254	writer.Add(block);
1255	return writer.Write();
1256}
1257
1258
1259void*
1260BlockWriter::_Data(cached_block* block) const
1261{
1262	return block->previous_transaction != NULL && block->original_data != NULL
1263		? block->original_data : block->current_data;
1264		// We first need to write back changes from previous transactions
1265}
1266
1267
1268status_t
1269BlockWriter::_WriteBlock(cached_block* block)
1270{
1271	ASSERT(block->busy_writing);
1272
1273	TRACE(("BlockWriter::_WriteBlock(block %" B_PRIdOFF ")\n", block->block_number));
1274	TB(Write(fCache, block));
1275	TB2(BlockData(fCache, block, "before write"));
1276
1277	size_t blockSize = fCache->block_size;
1278
1279	ssize_t written = write_pos(fCache->fd,
1280		block->block_number * blockSize, _Data(block), blockSize);
1281
1282	if (written != (ssize_t)blockSize) {
1283		TB(Error(fCache, block->block_number, "write failed", written));
1284		TRACE_ALWAYS(("could not write back block %" B_PRIdOFF " (%s)\n", block->block_number,
1285			strerror(errno)));
1286		if (written < 0)
1287			return errno;
1288
1289		return B_IO_ERROR;
1290	}
1291
1292	return B_OK;
1293}
1294
1295
1296void
1297BlockWriter::_BlockDone(cached_block* block,
1298	cache_transaction* transaction)
1299{
1300	if (block == NULL) {
1301		// An error occured when trying to write this block
1302		return;
1303	}
1304
1305	if (fCache->num_dirty_blocks > 0)
1306		fCache->num_dirty_blocks--;
1307
1308	if (_Data(block) == block->current_data)
1309		block->is_dirty = false;
1310
1311	_UnmarkWriting(block);
1312
1313	cache_transaction* previous = block->previous_transaction;
1314	if (previous != NULL) {
1315		previous->blocks.Remove(block);
1316		block->previous_transaction = NULL;
1317
1318		if (block->original_data != NULL && block->transaction == NULL) {
1319			// This block is not part of a transaction, so it does not need
1320			// its original pointer anymore.
1321			fCache->Free(block->original_data);
1322			block->original_data = NULL;
1323		}
1324
1325		// Has the previous transaction been finished with that write?
1326		if (--previous->num_blocks == 0) {
1327			TRACE(("cache transaction %" B_PRId32 " finished!\n", previous->id));
1328			T(Action("written", fCache, previous));
1329
1330			notify_transaction_listeners(fCache, previous,
1331				TRANSACTION_WRITTEN);
1332
1333			if (transaction != NULL) {
1334				// This function is called while iterating transaction_hash. We
1335				// use RemoveUnchecked so the iterator is still valid. A regular
1336				// Remove can trigger a resize of the hash table which would
1337				// result in the linked items in the table changing order.
1338				fCache->transaction_hash->RemoveUnchecked(transaction);
1339			} else
1340				fCache->transaction_hash->Remove(previous);
1341
1342			delete_transaction(fCache, previous);
1343			fDeletedTransaction = true;
1344		}
1345	}
1346	if (block->transaction == NULL && block->ref_count == 0 && !block->unused) {
1347		// the block is no longer used
1348		ASSERT(block->original_data == NULL && block->parent_data == NULL);
1349		block->unused = true;
1350		fCache->unused_blocks.Add(block);
1351		fCache->unused_block_count++;
1352	}
1353
1354	TB2(BlockData(fCache, block, "after write"));
1355}
1356
1357
1358void
1359BlockWriter::_UnmarkWriting(cached_block* block)
1360{
1361	block->busy_writing = false;
1362	if (block->previous_transaction != NULL)
1363		block->previous_transaction->busy_writing_count--;
1364	fCache->busy_writing_count--;
1365
1366	if ((fCache->busy_writing_waiters && fCache->busy_writing_count == 0)
1367		|| block->busy_writing_waiters) {
1368		fCache->busy_writing_waiters = false;
1369		block->busy_writing_waiters = false;
1370		fCache->busy_writing_condition.NotifyAll();
1371	}
1372}
1373
1374
1375/*static*/ int
1376BlockWriter::_CompareBlocks(const void* _blockA, const void* _blockB)
1377{
1378	cached_block* blockA = *(cached_block**)_blockA;
1379	cached_block* blockB = *(cached_block**)_blockB;
1380
1381	off_t diff = blockA->block_number - blockB->block_number;
1382	if (diff > 0)
1383		return 1;
1384
1385	return diff < 0 ? -1 : 0;
1386}
1387
1388
1389//	#pragma mark - block_cache
1390
1391
1392block_cache::block_cache(int _fd, off_t numBlocks, size_t blockSize,
1393		bool readOnly)
1394	:
1395	hash(NULL),
1396	fd(_fd),
1397	max_blocks(numBlocks),
1398	block_size(blockSize),
1399	next_transaction_id(1),
1400	last_transaction(NULL),
1401	transaction_hash(NULL),
1402	buffer_cache(NULL),
1403	unused_block_count(0),
1404	busy_reading_count(0),
1405	busy_reading_waiters(false),
1406	busy_writing_count(0),
1407	busy_writing_waiters(0),
1408	last_block_write(0),
1409	last_block_write_duration(0),
1410	num_dirty_blocks(0),
1411	read_only(readOnly)
1412{
1413}
1414
1415
1416/*! Should be called with the cache's lock held. */
1417block_cache::~block_cache()
1418{
1419	unregister_low_resource_handler(&_LowMemoryHandler, this);
1420
1421	delete transaction_hash;
1422	delete hash;
1423
1424	delete_object_cache(buffer_cache);
1425
1426	mutex_destroy(&lock);
1427}
1428
1429
1430status_t
1431block_cache::Init()
1432{
1433	busy_reading_condition.Init(this, "cache block busy_reading");
1434	busy_writing_condition.Init(this, "cache block busy writing");
1435	condition_variable.Init(this, "cache transaction sync");
1436	mutex_init(&lock, "block cache");
1437
1438	buffer_cache = create_object_cache_etc("block cache buffers", block_size,
1439		8, 0, 0, 0, CACHE_LARGE_SLAB, NULL, NULL, NULL, NULL);
1440	if (buffer_cache == NULL)
1441		return B_NO_MEMORY;
1442
1443	hash = new BlockTable();
1444	if (hash == NULL || hash->Init(1024) != B_OK)
1445		return B_NO_MEMORY;
1446
1447	transaction_hash = new(std::nothrow) TransactionTable();
1448	if (transaction_hash == NULL || transaction_hash->Init(16) != B_OK)
1449		return B_NO_MEMORY;
1450
1451	return register_low_resource_handler(&_LowMemoryHandler, this,
1452		B_KERNEL_RESOURCE_PAGES | B_KERNEL_RESOURCE_MEMORY
1453			| B_KERNEL_RESOURCE_ADDRESS_SPACE, 0);
1454}
1455
1456
1457void
1458block_cache::Free(void* buffer)
1459{
1460	if (buffer != NULL)
1461		object_cache_free(buffer_cache, buffer, 0);
1462}
1463
1464
1465void*
1466block_cache::Allocate()
1467{
1468	void* block = object_cache_alloc(buffer_cache, 0);
1469	if (block != NULL)
1470		return block;
1471
1472	// recycle existing before allocating a new one
1473	RemoveUnusedBlocks(100);
1474
1475	return object_cache_alloc(buffer_cache, 0);
1476}
1477
1478
1479void
1480block_cache::FreeBlock(cached_block* block)
1481{
1482	Free(block->current_data);
1483
1484	if (block->original_data != NULL || block->parent_data != NULL) {
1485		panic("block_cache::FreeBlock(): %" B_PRIdOFF ", original %p, parent %p\n",
1486			block->block_number, block->original_data, block->parent_data);
1487	}
1488
1489#if BLOCK_CACHE_DEBUG_CHANGED
1490	Free(block->compare);
1491#endif
1492
1493	object_cache_free(sBlockCache, block, 0);
1494}
1495
1496
1497/*! Allocates a new block for \a blockNumber, ready for use */
1498cached_block*
1499block_cache::NewBlock(off_t blockNumber)
1500{
1501	cached_block* block = NULL;
1502
1503	if (low_resource_state(B_KERNEL_RESOURCE_PAGES | B_KERNEL_RESOURCE_MEMORY
1504			| B_KERNEL_RESOURCE_ADDRESS_SPACE) != B_NO_LOW_RESOURCE) {
1505		// recycle existing instead of allocating a new one
1506		block = _GetUnusedBlock();
1507	}
1508	if (block == NULL) {
1509		block = (cached_block*)object_cache_alloc(sBlockCache, 0);
1510		if (block != NULL) {
1511			block->current_data = Allocate();
1512			if (block->current_data == NULL) {
1513				object_cache_free(sBlockCache, block, 0);
1514				return NULL;
1515			}
1516		} else {
1517			TB(Error(this, blockNumber, "allocation failed"));
1518			dprintf("block allocation failed, unused list is %sempty.\n",
1519				unused_blocks.IsEmpty() ? "" : "not ");
1520
1521			// allocation failed, try to reuse an unused block
1522			block = _GetUnusedBlock();
1523			if (block == NULL) {
1524				TB(Error(this, blockNumber, "get unused failed"));
1525				FATAL(("could not allocate block!\n"));
1526				return NULL;
1527			}
1528		}
1529	}
1530
1531	block->block_number = blockNumber;
1532	block->ref_count = 0;
1533	block->last_accessed = 0;
1534	block->transaction_next = NULL;
1535	block->transaction = block->previous_transaction = NULL;
1536	block->original_data = NULL;
1537	block->parent_data = NULL;
1538	block->busy_reading = false;
1539	block->busy_writing = false;
1540	block->is_writing = false;
1541	block->is_dirty = false;
1542	block->unused = false;
1543	block->discard = false;
1544	block->busy_reading_waiters = false;
1545	block->busy_writing_waiters = false;
1546#if BLOCK_CACHE_DEBUG_CHANGED
1547	block->compare = NULL;
1548#endif
1549
1550	return block;
1551}
1552
1553
1554void
1555block_cache::FreeBlockParentData(cached_block* block)
1556{
1557	ASSERT(block->parent_data != NULL);
1558	if (block->parent_data != block->current_data)
1559		Free(block->parent_data);
1560	block->parent_data = NULL;
1561}
1562
1563
1564void
1565block_cache::RemoveUnusedBlocks(int32 count, int32 minSecondsOld)
1566{
1567	TRACE(("block_cache: remove up to %" B_PRId32 " unused blocks\n", count));
1568
1569	for (block_list::Iterator iterator = unused_blocks.GetIterator();
1570			cached_block* block = iterator.Next();) {
1571		if (minSecondsOld >= block->LastAccess()) {
1572			// The list is sorted by last access
1573			break;
1574		}
1575		if (block->busy_reading || block->busy_writing)
1576			continue;
1577
1578		TB(Flush(this, block));
1579		TRACE(("  remove block %" B_PRIdOFF ", last accessed %" B_PRId32 "\n",
1580			block->block_number, block->last_accessed));
1581
1582		// this can only happen if no transactions are used
1583		if (block->is_dirty && !block->discard) {
1584			if (block->busy_writing)
1585				continue;
1586
1587			BlockWriter::WriteBlock(this, block);
1588		}
1589
1590		// remove block from lists
1591		iterator.Remove();
1592		unused_block_count--;
1593		RemoveBlock(block);
1594
1595		if (--count <= 0)
1596			break;
1597	}
1598}
1599
1600
1601void
1602block_cache::RemoveBlock(cached_block* block)
1603{
1604	hash->Remove(block);
1605	FreeBlock(block);
1606}
1607
1608
1609/*!	Discards the block from a transaction (this method must not be called
1610	for blocks not part of a transaction).
1611*/
1612void
1613block_cache::DiscardBlock(cached_block* block)
1614{
1615	ASSERT(block->discard);
1616	ASSERT(block->previous_transaction == NULL);
1617
1618	if (block->parent_data != NULL)
1619		FreeBlockParentData(block);
1620
1621	if (block->original_data != NULL) {
1622		Free(block->original_data);
1623		block->original_data = NULL;
1624	}
1625
1626	RemoveBlock(block);
1627}
1628
1629
1630void
1631block_cache::_LowMemoryHandler(void* data, uint32 resources, int32 level)
1632{
1633	TRACE(("block_cache: low memory handler called with level %" B_PRId32 "\n", level));
1634
1635	// free some blocks according to the low memory state
1636	// (if there is enough memory left, we don't free any)
1637
1638	block_cache* cache = (block_cache*)data;
1639	if (cache->unused_block_count <= 1)
1640		return;
1641
1642	int32 free = 0;
1643	int32 secondsOld = 0;
1644	switch (level) {
1645		case B_NO_LOW_RESOURCE:
1646			return;
1647		case B_LOW_RESOURCE_NOTE:
1648			free = cache->unused_block_count / 4;
1649			secondsOld = 120;
1650			break;
1651		case B_LOW_RESOURCE_WARNING:
1652			free = cache->unused_block_count / 2;
1653			secondsOld = 10;
1654			break;
1655		case B_LOW_RESOURCE_CRITICAL:
1656			free = cache->unused_block_count - 1;
1657			secondsOld = 0;
1658			break;
1659	}
1660
1661	MutexLocker locker(&cache->lock);
1662
1663	if (!locker.IsLocked()) {
1664		// If our block_cache were deleted, it could be that we had
1665		// been called before that deletion went through, therefore,
1666		// acquiring its lock might fail.
1667		return;
1668	}
1669
1670#ifdef TRACE_BLOCK_CACHE
1671	uint32 oldUnused = cache->unused_block_count;
1672#endif
1673
1674	cache->RemoveUnusedBlocks(free, secondsOld);
1675
1676	TRACE(("block_cache::_LowMemoryHandler(): %p: unused: %" B_PRIu32 " -> %" B_PRIu32 "\n",
1677		cache, oldUnused, cache->unused_block_count));
1678}
1679
1680
1681cached_block*
1682block_cache::_GetUnusedBlock()
1683{
1684	TRACE(("block_cache: get unused block\n"));
1685
1686	for (block_list::Iterator iterator = unused_blocks.GetIterator();
1687			cached_block* block = iterator.Next();) {
1688		TB(Flush(this, block, true));
1689		// this can only happen if no transactions are used
1690		if (block->is_dirty && !block->busy_writing && !block->discard)
1691			BlockWriter::WriteBlock(this, block);
1692
1693		// remove block from lists
1694		iterator.Remove();
1695		unused_block_count--;
1696		hash->Remove(block);
1697
1698		ASSERT(block->original_data == NULL && block->parent_data == NULL);
1699		block->unused = false;
1700
1701		// TODO: see if compare data is handled correctly here!
1702#if BLOCK_CACHE_DEBUG_CHANGED
1703		if (block->compare != NULL)
1704			Free(block->compare);
1705#endif
1706		return block;
1707	}
1708
1709	return NULL;
1710}
1711
1712
1713//	#pragma mark - private block functions
1714
1715
1716/*!	Cache must be locked.
1717*/
1718static void
1719mark_block_busy_reading(block_cache* cache, cached_block* block)
1720{
1721	block->busy_reading = true;
1722	cache->busy_reading_count++;
1723}
1724
1725
1726/*!	Cache must be locked.
1727*/
1728static void
1729mark_block_unbusy_reading(block_cache* cache, cached_block* block)
1730{
1731	block->busy_reading = false;
1732	cache->busy_reading_count--;
1733
1734	if ((cache->busy_reading_waiters && cache->busy_reading_count == 0)
1735		|| block->busy_reading_waiters) {
1736		cache->busy_reading_waiters = false;
1737		block->busy_reading_waiters = false;
1738		cache->busy_reading_condition.NotifyAll();
1739	}
1740}
1741
1742
1743/*!	Cache must be locked.
1744*/
1745static void
1746wait_for_busy_reading_block(block_cache* cache, cached_block* block)
1747{
1748	while (block->busy_reading) {
1749		// wait for at least the specified block to be read in
1750		ConditionVariableEntry entry;
1751		cache->busy_reading_condition.Add(&entry);
1752		block->busy_reading_waiters = true;
1753
1754		mutex_unlock(&cache->lock);
1755
1756		entry.Wait();
1757
1758		mutex_lock(&cache->lock);
1759	}
1760}
1761
1762
1763/*!	Cache must be locked.
1764*/
1765static void
1766wait_for_busy_reading_blocks(block_cache* cache)
1767{
1768	while (cache->busy_reading_count != 0) {
1769		// wait for all blocks to be read in
1770		ConditionVariableEntry entry;
1771		cache->busy_reading_condition.Add(&entry);
1772		cache->busy_reading_waiters = true;
1773
1774		mutex_unlock(&cache->lock);
1775
1776		entry.Wait();
1777
1778		mutex_lock(&cache->lock);
1779	}
1780}
1781
1782
1783/*!	Cache must be locked.
1784*/
1785static void
1786wait_for_busy_writing_block(block_cache* cache, cached_block* block)
1787{
1788	while (block->busy_writing) {
1789		// wait for all blocks to be written back
1790		ConditionVariableEntry entry;
1791		cache->busy_writing_condition.Add(&entry);
1792		block->busy_writing_waiters = true;
1793
1794		mutex_unlock(&cache->lock);
1795
1796		entry.Wait();
1797
1798		mutex_lock(&cache->lock);
1799	}
1800}
1801
1802
1803/*!	Cache must be locked.
1804*/
1805static void
1806wait_for_busy_writing_blocks(block_cache* cache)
1807{
1808	while (cache->busy_writing_count != 0) {
1809		// wait for all blocks to be written back
1810		ConditionVariableEntry entry;
1811		cache->busy_writing_condition.Add(&entry);
1812		cache->busy_writing_waiters = true;
1813
1814		mutex_unlock(&cache->lock);
1815
1816		entry.Wait();
1817
1818		mutex_lock(&cache->lock);
1819	}
1820}
1821
1822
1823/*!	Removes a reference from the specified \a block. If this was the last
1824	reference, the block is moved into the unused list.
1825	In low memory situations, it will also free some blocks from that list,
1826	but not necessarily the \a block it just released.
1827*/
1828static void
1829put_cached_block(block_cache* cache, cached_block* block)
1830{
1831#if BLOCK_CACHE_DEBUG_CHANGED
1832	if (!block->is_dirty && block->compare != NULL
1833		&& memcmp(block->current_data, block->compare, cache->block_size)) {
1834		dprintf("new block:\n");
1835		dump_block((const char*)block->current_data, 256, "  ");
1836		dprintf("unchanged block:\n");
1837		dump_block((const char*)block->compare, 256, "  ");
1838		BlockWriter::WriteBlock(cache, block);
1839		panic("block_cache: supposed to be clean block was changed!\n");
1840
1841		cache->Free(block->compare);
1842		block->compare = NULL;
1843	}
1844#endif
1845	TB(Put(cache, block));
1846
1847	if (block->ref_count < 1) {
1848		panic("Invalid ref_count for block %p, cache %p\n", block, cache);
1849		return;
1850	}
1851
1852	if (--block->ref_count == 0
1853		&& block->transaction == NULL && block->previous_transaction == NULL) {
1854		// This block is not used anymore, and not part of any transaction
1855		block->is_writing = false;
1856
1857		if (block->discard) {
1858			cache->RemoveBlock(block);
1859		} else {
1860			// put this block in the list of unused blocks
1861			ASSERT(!block->unused);
1862			block->unused = true;
1863
1864			ASSERT(block->original_data == NULL && block->parent_data == NULL);
1865			cache->unused_blocks.Add(block);
1866			cache->unused_block_count++;
1867		}
1868	}
1869}
1870
1871
1872static void
1873put_cached_block(block_cache* cache, off_t blockNumber)
1874{
1875	if (blockNumber < 0 || blockNumber >= cache->max_blocks) {
1876		panic("put_cached_block: invalid block number %" B_PRIdOFF " (max %" B_PRIdOFF ")",
1877			blockNumber, cache->max_blocks - 1);
1878	}
1879
1880	cached_block* block = cache->hash->Lookup(blockNumber);
1881	if (block != NULL)
1882		put_cached_block(cache, block);
1883	else {
1884		TB(Error(cache, blockNumber, "put unknown"));
1885	}
1886}
1887
1888
1889/*!	Retrieves the block \a blockNumber from the hash table, if it's already
1890	there, or reads it from the disk.
1891	You need to have the cache locked when calling this function.
1892
1893	\param _allocated tells you whether or not a new block has been allocated
1894		to satisfy your request.
1895	\param readBlock if \c false, the block will not be read in case it was
1896		not already in the cache. The block you retrieve may contain random
1897		data. If \c true, the cache will be temporarily unlocked while the
1898		block is read in.
1899*/
1900static status_t
1901get_cached_block(block_cache* cache, off_t blockNumber, bool* _allocated,
1902	bool readBlock, cached_block** _block)
1903{
1904	ASSERT_LOCKED_MUTEX(&cache->lock);
1905
1906	if (blockNumber < 0 || blockNumber >= cache->max_blocks) {
1907		panic("get_cached_block: invalid block number %" B_PRIdOFF " (max %" B_PRIdOFF ")",
1908			blockNumber, cache->max_blocks - 1);
1909		return B_BAD_VALUE;
1910	}
1911
1912retry:
1913	cached_block* block = cache->hash->Lookup(blockNumber);
1914	*_allocated = false;
1915
1916	if (block == NULL) {
1917		// put block into cache
1918		block = cache->NewBlock(blockNumber);
1919		if (block == NULL)
1920			return B_NO_MEMORY;
1921
1922		cache->hash->Insert(block);
1923		*_allocated = true;
1924	} else if (block->busy_reading) {
1925		// The block is currently busy_reading - wait and try again later
1926		wait_for_busy_reading_block(cache, block);
1927		goto retry;
1928	}
1929
1930	if (block->unused) {
1931		//TRACE(("remove block %" B_PRIdOFF " from unused\n", blockNumber));
1932		block->unused = false;
1933		cache->unused_blocks.Remove(block);
1934		cache->unused_block_count--;
1935	}
1936
1937	if (*_allocated && readBlock) {
1938		// read block into cache
1939		int32 blockSize = cache->block_size;
1940
1941		mark_block_busy_reading(cache, block);
1942		mutex_unlock(&cache->lock);
1943
1944		ssize_t bytesRead = read_pos(cache->fd, blockNumber * blockSize,
1945			block->current_data, blockSize);
1946
1947		mutex_lock(&cache->lock);
1948		if (bytesRead < blockSize) {
1949			cache->RemoveBlock(block);
1950			TB(Error(cache, blockNumber, "read failed", bytesRead));
1951
1952			TRACE_ALWAYS(("could not read block %" B_PRIdOFF ": bytesRead: %zd, error: %s\n",
1953				blockNumber, bytesRead, strerror(errno)));
1954			return errno;
1955		}
1956		TB(Read(cache, block));
1957
1958		mark_block_unbusy_reading(cache, block);
1959	}
1960
1961	block->ref_count++;
1962	block->last_accessed = system_time() / 1000000L;
1963
1964	*_block = block;
1965	return B_OK;
1966}
1967
1968
1969/*!	Returns the writable block data for the requested blockNumber.
1970	If \a cleared is true, the block is not read from disk; an empty block
1971	is returned.
1972
1973	This is the only method to insert a block into a transaction. It makes
1974	sure that the previous block contents are preserved in that case.
1975*/
1976static status_t
1977get_writable_cached_block(block_cache* cache, off_t blockNumber, off_t base,
1978	off_t length, int32 transactionID, bool cleared, void** _block)
1979{
1980	TRACE(("get_writable_cached_block(blockNumber = %" B_PRIdOFF ", transaction = %" B_PRId32 ")\n",
1981		blockNumber, transactionID));
1982
1983	if (blockNumber < 0 || blockNumber >= cache->max_blocks) {
1984		panic("get_writable_cached_block: invalid block number %" B_PRIdOFF " (max %" B_PRIdOFF ")",
1985			blockNumber, cache->max_blocks - 1);
1986		return B_BAD_VALUE;
1987	}
1988
1989	bool allocated;
1990	cached_block* block;
1991	status_t status = get_cached_block(cache, blockNumber, &allocated,
1992		!cleared, &block);
1993	if (status != B_OK)
1994		return status;
1995
1996	if (block->busy_writing)
1997		wait_for_busy_writing_block(cache, block);
1998
1999	block->discard = false;
2000
2001	// if there is no transaction support, we just return the current block
2002	if (transactionID == -1) {
2003		if (cleared) {
2004			mark_block_busy_reading(cache, block);
2005			mutex_unlock(&cache->lock);
2006
2007			memset(block->current_data, 0, cache->block_size);
2008
2009			mutex_lock(&cache->lock);
2010			mark_block_unbusy_reading(cache, block);
2011		}
2012
2013		block->is_writing = true;
2014
2015		if (!block->is_dirty) {
2016			cache->num_dirty_blocks++;
2017			block->is_dirty = true;
2018				// mark the block as dirty
2019		}
2020
2021		TB(Get(cache, block));
2022		*_block = block->current_data;
2023		return B_OK;
2024	}
2025
2026	cache_transaction* transaction = block->transaction;
2027
2028	if (transaction != NULL && transaction->id != transactionID) {
2029		// TODO: we have to wait here until the other transaction is done.
2030		//	Maybe we should even panic, since we can't prevent any deadlocks.
2031		panic("get_writable_cached_block(): asked to get busy writable block "
2032			"(transaction %" B_PRId32 ")\n", block->transaction->id);
2033		put_cached_block(cache, block);
2034		return B_BAD_VALUE;
2035	}
2036	if (transaction == NULL && transactionID != -1) {
2037		// get new transaction
2038		transaction = lookup_transaction(cache, transactionID);
2039		if (transaction == NULL) {
2040			panic("get_writable_cached_block(): invalid transaction %" B_PRId32 "!\n",
2041				transactionID);
2042			put_cached_block(cache, block);
2043			return B_BAD_VALUE;
2044		}
2045		if (!transaction->open) {
2046			panic("get_writable_cached_block(): transaction already done!\n");
2047			put_cached_block(cache, block);
2048			return B_BAD_VALUE;
2049		}
2050
2051		block->transaction = transaction;
2052
2053		// attach the block to the transaction block list
2054		block->transaction_next = transaction->first_block;
2055		transaction->first_block = block;
2056		transaction->num_blocks++;
2057	}
2058	if (transaction != NULL)
2059		transaction->last_used = system_time();
2060
2061	bool wasUnchanged = block->original_data == NULL
2062		|| block->previous_transaction != NULL;
2063
2064	if (!(allocated && cleared) && block->original_data == NULL) {
2065		// we already have data, so we need to preserve it
2066		block->original_data = cache->Allocate();
2067		if (block->original_data == NULL) {
2068			TB(Error(cache, blockNumber, "allocate original failed"));
2069			FATAL(("could not allocate original_data\n"));
2070			put_cached_block(cache, block);
2071			return B_NO_MEMORY;
2072		}
2073
2074		mark_block_busy_reading(cache, block);
2075		mutex_unlock(&cache->lock);
2076
2077		memcpy(block->original_data, block->current_data, cache->block_size);
2078
2079		mutex_lock(&cache->lock);
2080		mark_block_unbusy_reading(cache, block);
2081	}
2082	if (block->parent_data == block->current_data) {
2083		// remember any previous contents for the parent transaction
2084		block->parent_data = cache->Allocate();
2085		if (block->parent_data == NULL) {
2086			// TODO: maybe we should just continue the current transaction in
2087			// this case...
2088			TB(Error(cache, blockNumber, "allocate parent failed"));
2089			FATAL(("could not allocate parent\n"));
2090			put_cached_block(cache, block);
2091			return B_NO_MEMORY;
2092		}
2093
2094		mark_block_busy_reading(cache, block);
2095		mutex_unlock(&cache->lock);
2096
2097		memcpy(block->parent_data, block->current_data, cache->block_size);
2098
2099		mutex_lock(&cache->lock);
2100		mark_block_unbusy_reading(cache, block);
2101
2102		transaction->sub_num_blocks++;
2103	} else if (transaction != NULL && transaction->has_sub_transaction
2104		&& block->parent_data == NULL && wasUnchanged)
2105		transaction->sub_num_blocks++;
2106
2107	if (cleared) {
2108		mark_block_busy_reading(cache, block);
2109		mutex_unlock(&cache->lock);
2110
2111		memset(block->current_data, 0, cache->block_size);
2112
2113		mutex_lock(&cache->lock);
2114		mark_block_unbusy_reading(cache, block);
2115	}
2116
2117	block->is_dirty = true;
2118	TB(Get(cache, block));
2119	TB2(BlockData(cache, block, "get writable"));
2120
2121	*_block = block->current_data;
2122	return B_OK;
2123}
2124
2125
2126#if DEBUG_BLOCK_CACHE
2127
2128
2129static void
2130dump_block(cached_block* block)
2131{
2132	kprintf("%08lx %9" B_PRIdOFF " %08lx %08lx %08lx %5" B_PRId32 " %6" B_PRId32
2133		" %c%c%c%c%c%c %08lx %08lx\n",
2134		(addr_t)block, block->block_number,
2135		(addr_t)block->current_data, (addr_t)block->original_data,
2136		(addr_t)block->parent_data, block->ref_count, block->LastAccess(),
2137		block->busy_reading ? 'r' : '-', block->busy_writing ? 'w' : '-',
2138		block->is_writing ? 'W' : '-', block->is_dirty ? 'D' : '-',
2139		block->unused ? 'U' : '-', block->discard ? 'D' : '-',
2140		(addr_t)block->transaction,
2141		(addr_t)block->previous_transaction);
2142}
2143
2144
2145static void
2146dump_block_long(cached_block* block)
2147{
2148	kprintf("BLOCK %p\n", block);
2149	kprintf(" current data:  %p\n", block->current_data);
2150	kprintf(" original data: %p\n", block->original_data);
2151	kprintf(" parent data:   %p\n", block->parent_data);
2152#if BLOCK_CACHE_DEBUG_CHANGED
2153	kprintf(" compare data:  %p\n", block->compare);
2154#endif
2155	kprintf(" ref_count:     %" B_PRId32 "\n", block->ref_count);
2156	kprintf(" accessed:      %" B_PRId32 "\n", block->LastAccess());
2157	kprintf(" flags:        ");
2158	if (block->busy_reading)
2159		kprintf(" busy_reading");
2160	if (block->busy_writing)
2161		kprintf(" busy_writing");
2162	if (block->is_writing)
2163		kprintf(" is-writing");
2164	if (block->is_dirty)
2165		kprintf(" is-dirty");
2166	if (block->unused)
2167		kprintf(" unused");
2168	if (block->discard)
2169		kprintf(" discard");
2170	kprintf("\n");
2171	if (block->transaction != NULL) {
2172		kprintf(" transaction:   %p (%" B_PRId32 ")\n", block->transaction,
2173			block->transaction->id);
2174		if (block->transaction_next != NULL) {
2175			kprintf(" next in transaction: %" B_PRIdOFF "\n",
2176				block->transaction_next->block_number);
2177		}
2178	}
2179	if (block->previous_transaction != NULL) {
2180		kprintf(" previous transaction: %p (%" B_PRId32 ")\n",
2181			block->previous_transaction,
2182			block->previous_transaction->id);
2183	}
2184
2185	set_debug_variable("_current", (addr_t)block->current_data);
2186	set_debug_variable("_original", (addr_t)block->original_data);
2187	set_debug_variable("_parent", (addr_t)block->parent_data);
2188}
2189
2190
2191static int
2192dump_cached_block(int argc, char** argv)
2193{
2194	if (argc != 2) {
2195		kprintf("usage: %s <block-address>\n", argv[0]);
2196		return 0;
2197	}
2198
2199	dump_block_long((struct cached_block*)(addr_t)parse_expression(argv[1]));
2200	return 0;
2201}
2202
2203
2204static int
2205dump_cache(int argc, char** argv)
2206{
2207	bool showTransactions = false;
2208	bool showBlocks = false;
2209	int32 i = 1;
2210	while (argv[i] != NULL && argv[i][0] == '-') {
2211		for (char* arg = &argv[i][1]; arg[0]; arg++) {
2212			switch (arg[0]) {
2213				case 'b':
2214					showBlocks = true;
2215					break;
2216				case 't':
2217					showTransactions = true;
2218					break;
2219				default:
2220					print_debugger_command_usage(argv[0]);
2221					return 0;
2222			}
2223		}
2224		i++;
2225	}
2226
2227	if (i >= argc) {
2228		print_debugger_command_usage(argv[0]);
2229		return 0;
2230	}
2231
2232	block_cache* cache = (struct block_cache*)(addr_t)parse_expression(argv[i]);
2233	if (cache == NULL) {
2234		kprintf("invalid cache address\n");
2235		return 0;
2236	}
2237
2238	off_t blockNumber = -1;
2239	if (i + 1 < argc) {
2240		blockNumber = parse_expression(argv[i + 1]);
2241		cached_block* block = cache->hash->Lookup(blockNumber);
2242		if (block != NULL)
2243			dump_block_long(block);
2244		else
2245			kprintf("block %" B_PRIdOFF " not found\n", blockNumber);
2246		return 0;
2247	}
2248
2249	kprintf("BLOCK CACHE: %p\n", cache);
2250
2251	kprintf(" fd:           %d\n", cache->fd);
2252	kprintf(" max_blocks:   %" B_PRIdOFF "\n", cache->max_blocks);
2253	kprintf(" block_size:   %zu\n", cache->block_size);
2254	kprintf(" next_transaction_id: %" B_PRId32 "\n", cache->next_transaction_id);
2255	kprintf(" buffer_cache: %p\n", cache->buffer_cache);
2256	kprintf(" busy_reading: %" B_PRIu32 ", %s waiters\n", cache->busy_reading_count,
2257		cache->busy_reading_waiters ? "has" : "no");
2258	kprintf(" busy_writing: %" B_PRIu32 ", %s waiters\n", cache->busy_writing_count,
2259		cache->busy_writing_waiters ? "has" : "no");
2260
2261	if (!cache->pending_notifications.IsEmpty()) {
2262		kprintf(" pending notifications:\n");
2263
2264		NotificationList::Iterator iterator
2265			= cache->pending_notifications.GetIterator();
2266		while (iterator.HasNext()) {
2267			cache_notification* notification = iterator.Next();
2268
2269			kprintf("  %p %5" B_PRIx32 " %p - %p\n", notification,
2270				notification->events_pending, notification->hook,
2271				notification->data);
2272		}
2273	}
2274
2275	if (showTransactions) {
2276		kprintf(" transactions:\n");
2277		kprintf("address       id state  blocks  main   sub\n");
2278
2279		TransactionTable::Iterator iterator(cache->transaction_hash);
2280
2281		while (iterator.HasNext()) {
2282			cache_transaction* transaction = iterator.Next();
2283			kprintf("%p %5" B_PRId32 " %-7s %5" B_PRId32 " %5" B_PRId32 " %5"
2284				B_PRId32 "\n", transaction, transaction->id,
2285				transaction->open ? "open" : "closed",
2286				transaction->num_blocks, transaction->main_num_blocks,
2287				transaction->sub_num_blocks);
2288		}
2289	}
2290
2291	if (showBlocks) {
2292		kprintf(" blocks:\n");
2293		kprintf("address  block no. current  original parent    refs access "
2294			"flags transact prev. trans\n");
2295	}
2296
2297	uint32 referenced = 0;
2298	uint32 count = 0;
2299	uint32 dirty = 0;
2300	uint32 discarded = 0;
2301	BlockTable::Iterator iterator(cache->hash);
2302	while (iterator.HasNext()) {
2303		cached_block* block = iterator.Next();
2304		if (showBlocks)
2305			dump_block(block);
2306
2307		if (block->is_dirty)
2308			dirty++;
2309		if (block->discard)
2310			discarded++;
2311		if (block->ref_count)
2312			referenced++;
2313		count++;
2314	}
2315
2316	kprintf(" %" B_PRIu32 " blocks total, %" B_PRIu32 " dirty, %" B_PRIu32
2317		" discarded, %" B_PRIu32 " referenced, %" B_PRIu32 " busy, %" B_PRIu32
2318		" in unused.\n",
2319		count, dirty, discarded, referenced, cache->busy_reading_count,
2320		cache->unused_block_count);
2321	return 0;
2322}
2323
2324
2325static int
2326dump_transaction(int argc, char** argv)
2327{
2328	bool showBlocks = false;
2329	int i = 1;
2330	if (argc > 1 && !strcmp(argv[1], "-b")) {
2331		showBlocks = true;
2332		i++;
2333	}
2334
2335	if (argc - i < 1 || argc - i > 2) {
2336		print_debugger_command_usage(argv[0]);
2337		return 0;
2338	}
2339
2340	cache_transaction* transaction = NULL;
2341
2342	if (argc - i == 1) {
2343		transaction = (cache_transaction*)(addr_t)parse_expression(argv[i]);
2344	} else {
2345		block_cache* cache = (block_cache*)(addr_t)parse_expression(argv[i]);
2346		int32 id = parse_expression(argv[i + 1]);
2347		transaction = lookup_transaction(cache, id);
2348		if (transaction == NULL) {
2349			kprintf("No transaction with ID %" B_PRId32 " found.\n", id);
2350			return 0;
2351		}
2352	}
2353
2354	kprintf("TRANSACTION %p\n", transaction);
2355
2356	kprintf(" id:             %" B_PRId32 "\n", transaction->id);
2357	kprintf(" num block:      %" B_PRId32 "\n", transaction->num_blocks);
2358	kprintf(" main num block: %" B_PRId32 "\n", transaction->main_num_blocks);
2359	kprintf(" sub num block:  %" B_PRId32 "\n", transaction->sub_num_blocks);
2360	kprintf(" has sub:        %d\n", transaction->has_sub_transaction);
2361	kprintf(" state:          %s\n", transaction->open ? "open" : "closed");
2362	kprintf(" idle:           %" B_PRId64 " secs\n",
2363		(system_time() - transaction->last_used) / 1000000);
2364
2365	kprintf(" listeners:\n");
2366
2367	ListenerList::Iterator iterator = transaction->listeners.GetIterator();
2368	while (iterator.HasNext()) {
2369		cache_listener* listener = iterator.Next();
2370
2371		kprintf("  %p %5" B_PRIx32 " %p - %p\n", listener, listener->events_pending,
2372			listener->hook, listener->data);
2373	}
2374
2375	if (!showBlocks)
2376		return 0;
2377
2378	kprintf(" blocks:\n");
2379	kprintf("address  block no. current  original parent    refs access "
2380		"flags transact prev. trans\n");
2381
2382	cached_block* block = transaction->first_block;
2383	while (block != NULL) {
2384		dump_block(block);
2385		block = block->transaction_next;
2386	}
2387
2388	kprintf("--\n");
2389
2390	block_list::Iterator blockIterator = transaction->blocks.GetIterator();
2391	while (blockIterator.HasNext()) {
2392		block = blockIterator.Next();
2393		dump_block(block);
2394	}
2395
2396	return 0;
2397}
2398
2399
2400static int
2401dump_caches(int argc, char** argv)
2402{
2403	kprintf("Block caches:\n");
2404	DoublyLinkedList<block_cache>::Iterator i = sCaches.GetIterator();
2405	while (i.HasNext()) {
2406		block_cache* cache = i.Next();
2407		if (cache == (block_cache*)&sMarkCache)
2408			continue;
2409
2410		kprintf("  %p\n", cache);
2411	}
2412
2413	return 0;
2414}
2415
2416
2417#if BLOCK_CACHE_BLOCK_TRACING >= 2
2418static int
2419dump_block_data(int argc, char** argv)
2420{
2421	using namespace BlockTracing;
2422
2423	// Determine which blocks to show
2424
2425	bool printStackTrace = true;
2426	uint32 which = 0;
2427	int32 i = 1;
2428	while (i < argc && argv[i][0] == '-') {
2429		char* arg = &argv[i][1];
2430		while (arg[0]) {
2431			switch (arg[0]) {
2432				case 'c':
2433					which |= BlockData::kCurrent;
2434					break;
2435				case 'p':
2436					which |= BlockData::kParent;
2437					break;
2438				case 'o':
2439					which |= BlockData::kOriginal;
2440					break;
2441
2442				default:
2443					kprintf("invalid block specifier (only o/c/p are "
2444						"allowed).\n");
2445					return 0;
2446			}
2447			arg++;
2448		}
2449
2450		i++;
2451	}
2452	if (which == 0)
2453		which = BlockData::kCurrent | BlockData::kParent | BlockData::kOriginal;
2454
2455	if (i == argc) {
2456		print_debugger_command_usage(argv[0]);
2457		return 0;
2458	}
2459
2460	// Get the range of blocks to print
2461
2462	int64 from = parse_expression(argv[i]);
2463	int64 to = from;
2464	if (argc > i + 1)
2465		to = parse_expression(argv[i + 1]);
2466	if (to < from)
2467		to = from;
2468
2469	uint32 offset = 0;
2470	uint32 size = LONG_MAX;
2471	if (argc > i + 2)
2472		offset = parse_expression(argv[i + 2]);
2473	if (argc > i + 3)
2474		size = parse_expression(argv[i + 3]);
2475
2476	TraceEntryIterator iterator;
2477	iterator.MoveTo(from - 1);
2478
2479	static char sBuffer[1024];
2480	LazyTraceOutput out(sBuffer, sizeof(sBuffer), TRACE_OUTPUT_TEAM_ID);
2481
2482	while (TraceEntry* entry = iterator.Next()) {
2483		int32 index = iterator.Index();
2484		if (index > to)
2485			break;
2486
2487		Action* action = dynamic_cast<Action*>(entry);
2488		if (action != NULL) {
2489			out.Clear();
2490			out.DumpEntry(action);
2491			continue;
2492		}
2493
2494		BlockData* blockData = dynamic_cast<BlockData*>(entry);
2495		if (blockData == NULL)
2496			continue;
2497
2498		out.Clear();
2499
2500		const char* dump = out.DumpEntry(entry);
2501		int length = strlen(dump);
2502		if (length > 0 && dump[length - 1] == '\n')
2503			length--;
2504
2505		kprintf("%5" B_PRId32 ". %.*s\n", index, length, dump);
2506
2507		if (printStackTrace) {
2508			out.Clear();
2509			entry->DumpStackTrace(out);
2510			if (out.Size() > 0)
2511				kputs(out.Buffer());
2512		}
2513
2514		blockData->DumpBlocks(which, offset, size);
2515	}
2516
2517	return 0;
2518}
2519#endif	// BLOCK_CACHE_BLOCK_TRACING >= 2
2520
2521
2522#endif	// DEBUG_BLOCK_CACHE
2523
2524
2525/*!	Traverses through the block_cache list, and returns one cache after the
2526	other. The cache returned is automatically locked when you get it, and
2527	unlocked with the next call to this function. Ignores caches that are in
2528	deletion state.
2529	Returns \c NULL when the end of the list is reached.
2530*/
2531static block_cache*
2532get_next_locked_block_cache(block_cache* last)
2533{
2534	MutexLocker _(sCachesLock);
2535
2536	block_cache* cache;
2537	if (last != NULL) {
2538		mutex_unlock(&last->lock);
2539
2540		cache = sCaches.GetNext((block_cache*)&sMarkCache);
2541		sCaches.Remove((block_cache*)&sMarkCache);
2542	} else
2543		cache = sCaches.Head();
2544
2545	if (cache != NULL) {
2546		mutex_lock(&cache->lock);
2547		sCaches.Insert(sCaches.GetNext(cache), (block_cache*)&sMarkCache);
2548	}
2549
2550	return cache;
2551}
2552
2553
2554/*!	Background thread that continuously checks for pending notifications of
2555	all caches.
2556	Every two seconds, it will also write back up to 64 blocks per cache.
2557*/
2558static status_t
2559block_notifier_and_writer(void* /*data*/)
2560{
2561	const bigtime_t kDefaultTimeout = 2000000LL;
2562	bigtime_t timeout = kDefaultTimeout;
2563
2564	while (true) {
2565		bigtime_t start = system_time();
2566
2567		status_t status = acquire_sem_etc(sEventSemaphore, 1,
2568			B_RELATIVE_TIMEOUT, timeout);
2569		if (status == B_OK) {
2570			flush_pending_notifications();
2571			timeout -= system_time() - start;
2572			continue;
2573		}
2574
2575		// Write 64 blocks of each block_cache roughly every 2 seconds,
2576		// potentially more or less depending on congestion and drive speeds
2577		// (usually much less.) We do not want to queue everything at once
2578		// because a future transaction might then get held up waiting for
2579		// a specific block to be written.
2580		timeout = kDefaultTimeout;
2581		size_t usedMemory;
2582		object_cache_get_usage(sBlockCache, &usedMemory);
2583
2584		block_cache* cache = NULL;
2585		while ((cache = get_next_locked_block_cache(cache)) != NULL) {
2586			// Give some breathing room: wait 2x the length of the potential
2587			// maximum block count-sized write between writes, and also skip
2588			// if there are more than 16 blocks currently being written.
2589			const bigtime_t next = cache->last_block_write
2590					+ cache->last_block_write_duration * 2 * 64;
2591			if (cache->busy_writing_count > 16 || system_time() < next) {
2592				if (cache->last_block_write_duration > 0) {
2593					timeout = min_c(timeout,
2594						cache->last_block_write_duration * 2 * 64);
2595				}
2596				continue;
2597			}
2598
2599			BlockWriter writer(cache, 64);
2600			bool hasMoreBlocks = false;
2601
2602			size_t cacheUsedMemory;
2603			object_cache_get_usage(cache->buffer_cache, &cacheUsedMemory);
2604			usedMemory += cacheUsedMemory;
2605
2606			if (cache->num_dirty_blocks) {
2607				// This cache is not using transactions, we'll scan the blocks
2608				// directly
2609				BlockTable::Iterator iterator(cache->hash);
2610
2611				while (iterator.HasNext()) {
2612					cached_block* block = iterator.Next();
2613					if (block->CanBeWritten() && !writer.Add(block)) {
2614						hasMoreBlocks = true;
2615						break;
2616					}
2617				}
2618			} else {
2619				TransactionTable::Iterator iterator(cache->transaction_hash);
2620
2621				while (iterator.HasNext()) {
2622					cache_transaction* transaction = iterator.Next();
2623					if (transaction->open) {
2624						if (system_time() > transaction->last_used
2625								+ kTransactionIdleTime) {
2626							// Transaction is open but idle
2627							notify_transaction_listeners(cache, transaction,
2628								TRANSACTION_IDLE);
2629						}
2630						continue;
2631					}
2632
2633					bool hasLeftOvers;
2634						// we ignore this one
2635					if (!writer.Add(transaction, hasLeftOvers)) {
2636						hasMoreBlocks = true;
2637						break;
2638					}
2639				}
2640			}
2641
2642			writer.Write();
2643
2644			if (hasMoreBlocks && cache->last_block_write_duration > 0) {
2645				// There are probably still more blocks that we could write, so
2646				// see if we can decrease the timeout.
2647				timeout = min_c(timeout,
2648					cache->last_block_write_duration * 2 * 64);
2649			}
2650
2651			if ((block_cache_used_memory() / B_PAGE_SIZE)
2652					> vm_page_num_pages() / 2) {
2653				// Try to reduce memory usage to half of the available
2654				// RAM at maximum
2655				cache->RemoveUnusedBlocks(1000, 10);
2656			}
2657		}
2658
2659		MutexLocker _(sCachesMemoryUseLock);
2660		sUsedMemory = usedMemory;
2661	}
2662
2663	// never can get here
2664	return B_OK;
2665}
2666
2667
2668/*!	Notify function for wait_for_notifications(). */
2669static void
2670notify_sync(int32 transactionID, int32 event, void* _cache)
2671{
2672	block_cache* cache = (block_cache*)_cache;
2673
2674	cache->condition_variable.NotifyOne();
2675}
2676
2677
2678/*!	Must be called with the sCachesLock held. */
2679static bool
2680is_valid_cache(block_cache* cache)
2681{
2682	ASSERT_LOCKED_MUTEX(&sCachesLock);
2683
2684	DoublyLinkedList<block_cache>::Iterator iterator = sCaches.GetIterator();
2685	while (iterator.HasNext()) {
2686		if (cache == iterator.Next())
2687			return true;
2688	}
2689
2690	return false;
2691}
2692
2693
2694/*!	Waits until all pending notifications are carried out.
2695	Safe to be called from the block writer/notifier thread.
2696	You must not hold the \a cache lock when calling this function.
2697*/
2698static void
2699wait_for_notifications(block_cache* cache)
2700{
2701	MutexLocker locker(sCachesLock);
2702
2703	if (find_thread(NULL) == sNotifierWriterThread) {
2704		// We're the notifier thread, don't wait, but flush all pending
2705		// notifications directly.
2706		if (is_valid_cache(cache))
2707			flush_pending_notifications(cache);
2708		return;
2709	}
2710
2711	// add sync notification
2712	cache_notification notification;
2713	set_notification(NULL, notification, TRANSACTION_WRITTEN, notify_sync,
2714		cache);
2715
2716	ConditionVariableEntry entry;
2717	cache->condition_variable.Add(&entry);
2718
2719	add_notification(cache, &notification, TRANSACTION_WRITTEN, false);
2720	locker.Unlock();
2721
2722	// wait for notification hook to be called
2723	entry.Wait();
2724}
2725
2726
2727status_t
2728block_cache_init(void)
2729{
2730	sBlockCache = create_object_cache_etc("cached blocks", sizeof(cached_block),
2731		8, 0, 0, 0, CACHE_LARGE_SLAB, NULL, NULL, NULL, NULL);
2732	if (sBlockCache == NULL)
2733		return B_NO_MEMORY;
2734
2735	sCacheNotificationCache = create_object_cache("cache notifications",
2736		sizeof(cache_listener), 8, NULL, NULL, NULL);
2737	if (sCacheNotificationCache == NULL)
2738		return B_NO_MEMORY;
2739
2740	new (&sCaches) DoublyLinkedList<block_cache>;
2741		// manually call constructor
2742
2743	sEventSemaphore = create_sem(0, "block cache event");
2744	if (sEventSemaphore < B_OK)
2745		return sEventSemaphore;
2746
2747	sNotifierWriterThread = spawn_kernel_thread(&block_notifier_and_writer,
2748		"block notifier/writer", B_LOW_PRIORITY, NULL);
2749	if (sNotifierWriterThread >= B_OK)
2750		resume_thread(sNotifierWriterThread);
2751
2752#if DEBUG_BLOCK_CACHE
2753	add_debugger_command_etc("block_caches", &dump_caches,
2754		"dumps all block caches", "\n", 0);
2755	add_debugger_command_etc("block_cache", &dump_cache,
2756		"dumps a specific block cache",
2757		"[-bt] <cache-address> [block-number]\n"
2758		"  -t lists the transactions\n"
2759		"  -b lists all blocks\n", 0);
2760	add_debugger_command("cached_block", &dump_cached_block,
2761		"dumps the specified cached block");
2762	add_debugger_command_etc("transaction", &dump_transaction,
2763		"dumps a specific transaction", "[-b] ((<cache> <id>) | <transaction>)\n"
2764		"Either use a block cache pointer and an ID or a pointer to the transaction.\n"
2765		"  -b lists all blocks that are part of this transaction\n", 0);
2766#	if BLOCK_CACHE_BLOCK_TRACING >= 2
2767	add_debugger_command_etc("block_cache_data", &dump_block_data,
2768		"dumps the data blocks logged for the actions",
2769		"[-cpo] <from> [<to> [<offset> [<size>]]]\n"
2770		"If no data specifier is used, all blocks are shown by default.\n"
2771		" -c       the current data is shown, if available.\n"
2772		" -p       the parent data is shown, if available.\n"
2773		" -o       the original data is shown, if available.\n"
2774		" <from>   first index of tracing entries to show.\n"
2775		" <to>     if given, the last entry. If not, only <from> is shown.\n"
2776		" <offset> the offset of the block data.\n"
2777		" <from>   the size of the block data that is dumped\n", 0);
2778#	endif
2779#endif	// DEBUG_BLOCK_CACHE
2780
2781	return B_OK;
2782}
2783
2784
2785size_t
2786block_cache_used_memory(void)
2787{
2788	MutexLocker _(sCachesMemoryUseLock);
2789	return sUsedMemory;
2790}
2791
2792
2793//	#pragma mark - public transaction API
2794
2795
2796int32
2797cache_start_transaction(void* _cache)
2798{
2799	block_cache* cache = (block_cache*)_cache;
2800	TransactionLocker locker(cache);
2801
2802	if (cache->last_transaction && cache->last_transaction->open) {
2803		panic("last transaction (%" B_PRId32 ") still open!\n",
2804			cache->last_transaction->id);
2805	}
2806
2807	cache_transaction* transaction = new(std::nothrow) cache_transaction;
2808	if (transaction == NULL)
2809		return B_NO_MEMORY;
2810
2811	transaction->id = atomic_add(&cache->next_transaction_id, 1);
2812	cache->last_transaction = transaction;
2813
2814	TRACE(("cache_start_transaction(): id %" B_PRId32 " started\n", transaction->id));
2815	T(Action("start", cache, transaction));
2816
2817	cache->transaction_hash->Insert(transaction);
2818
2819	return transaction->id;
2820}
2821
2822
2823status_t
2824cache_sync_transaction(void* _cache, int32 id)
2825{
2826	block_cache* cache = (block_cache*)_cache;
2827	bool hadBusy;
2828
2829	TRACE(("cache_sync_transaction(id %" B_PRId32 ")\n", id));
2830
2831	do {
2832		TransactionLocker locker(cache);
2833		hadBusy = false;
2834
2835		BlockWriter writer(cache);
2836		TransactionTable::Iterator iterator(cache->transaction_hash);
2837
2838		while (iterator.HasNext()) {
2839			// close all earlier transactions which haven't been closed yet
2840			cache_transaction* transaction = iterator.Next();
2841
2842			if (transaction->busy_writing_count != 0) {
2843				hadBusy = true;
2844				continue;
2845			}
2846			if (transaction->id <= id && !transaction->open) {
2847				// write back all of their remaining dirty blocks
2848				T(Action("sync", cache, transaction));
2849
2850				bool hasLeftOvers;
2851				writer.Add(transaction, hasLeftOvers);
2852
2853				if (hasLeftOvers) {
2854					// This transaction contains blocks that a previous
2855					// transaction is trying to write back in this write run
2856					hadBusy = true;
2857				}
2858			}
2859		}
2860
2861		status_t status = writer.Write();
2862		if (status != B_OK)
2863			return status;
2864	} while (hadBusy);
2865
2866	wait_for_notifications(cache);
2867		// make sure that all pending TRANSACTION_WRITTEN notifications
2868		// are handled after we return
2869	return B_OK;
2870}
2871
2872
2873status_t
2874cache_end_transaction(void* _cache, int32 id,
2875	transaction_notification_hook hook, void* data)
2876{
2877	block_cache* cache = (block_cache*)_cache;
2878	TransactionLocker locker(cache);
2879
2880	TRACE(("cache_end_transaction(id = %" B_PRId32 ")\n", id));
2881
2882	cache_transaction* transaction = lookup_transaction(cache, id);
2883	if (transaction == NULL) {
2884		panic("cache_end_transaction(): invalid transaction ID\n");
2885		return B_BAD_VALUE;
2886	}
2887
2888	// Write back all pending transaction blocks
2889	status_t status = write_blocks_in_previous_transaction(cache, transaction);
2890	if (status != B_OK)
2891		return status;
2892
2893	notify_transaction_listeners(cache, transaction, TRANSACTION_ENDED);
2894
2895	if (hook != NULL
2896		&& add_transaction_listener(cache, transaction, TRANSACTION_WRITTEN,
2897			hook, data) != B_OK) {
2898		return B_NO_MEMORY;
2899	}
2900
2901	T(Action("end", cache, transaction));
2902
2903	// iterate through all blocks and free the unchanged original contents
2904
2905	cached_block* next;
2906	for (cached_block* block = transaction->first_block; block != NULL;
2907			block = next) {
2908		next = block->transaction_next;
2909		ASSERT(block->previous_transaction == NULL);
2910
2911		if (block->discard) {
2912			// This block has been discarded in the transaction
2913			cache->DiscardBlock(block);
2914			transaction->num_blocks--;
2915			continue;
2916		}
2917
2918		if (block->original_data != NULL) {
2919			cache->Free(block->original_data);
2920			block->original_data = NULL;
2921		}
2922		if (block->parent_data != NULL) {
2923			ASSERT(transaction->has_sub_transaction);
2924			cache->FreeBlockParentData(block);
2925		}
2926
2927		// move the block to the previous transaction list
2928		transaction->blocks.Add(block);
2929
2930		block->previous_transaction = transaction;
2931		block->transaction_next = NULL;
2932		block->transaction = NULL;
2933	}
2934
2935	transaction->open = false;
2936	return B_OK;
2937}
2938
2939
2940status_t
2941cache_abort_transaction(void* _cache, int32 id)
2942{
2943	block_cache* cache = (block_cache*)_cache;
2944	TransactionLocker locker(cache);
2945
2946	TRACE(("cache_abort_transaction(id = %" B_PRId32 ")\n", id));
2947
2948	cache_transaction* transaction = lookup_transaction(cache, id);
2949	if (transaction == NULL) {
2950		panic("cache_abort_transaction(): invalid transaction ID\n");
2951		return B_BAD_VALUE;
2952	}
2953
2954	T(Abort(cache, transaction));
2955	notify_transaction_listeners(cache, transaction, TRANSACTION_ABORTED);
2956
2957	// iterate through all blocks and restore their original contents
2958
2959	cached_block* block = transaction->first_block;
2960	cached_block* next;
2961	for (; block != NULL; block = next) {
2962		next = block->transaction_next;
2963
2964		if (block->original_data != NULL) {
2965			TRACE(("cache_abort_transaction(id = %" B_PRId32 "): restored contents of "
2966				"block %" B_PRIdOFF "\n", transaction->id, block->block_number));
2967			memcpy(block->current_data, block->original_data,
2968				cache->block_size);
2969			cache->Free(block->original_data);
2970			block->original_data = NULL;
2971		}
2972		if (transaction->has_sub_transaction && block->parent_data != NULL)
2973			cache->FreeBlockParentData(block);
2974
2975		block->transaction_next = NULL;
2976		block->transaction = NULL;
2977		block->discard = false;
2978		if (block->previous_transaction == NULL)
2979			block->is_dirty = false;
2980	}
2981
2982	cache->transaction_hash->Remove(transaction);
2983	delete_transaction(cache, transaction);
2984	return B_OK;
2985}
2986
2987
2988/*!	Acknowledges the current parent transaction, and starts a new transaction
2989	from its sub transaction.
2990	The new transaction also gets a new transaction ID.
2991*/
2992int32
2993cache_detach_sub_transaction(void* _cache, int32 id,
2994	transaction_notification_hook hook, void* data)
2995{
2996	block_cache* cache = (block_cache*)_cache;
2997	TransactionLocker locker(cache);
2998
2999	TRACE(("cache_detach_sub_transaction(id = %" B_PRId32 ")\n", id));
3000
3001	cache_transaction* transaction = lookup_transaction(cache, id);
3002	if (transaction == NULL) {
3003		panic("cache_detach_sub_transaction(): invalid transaction ID\n");
3004		return B_BAD_VALUE;
3005	}
3006	if (!transaction->has_sub_transaction)
3007		return B_BAD_VALUE;
3008
3009	// iterate through all blocks and free the unchanged original contents
3010
3011	status_t status = write_blocks_in_previous_transaction(cache, transaction);
3012	if (status != B_OK)
3013		return status;
3014
3015	// create a new transaction for the sub transaction
3016	cache_transaction* newTransaction = new(std::nothrow) cache_transaction;
3017	if (newTransaction == NULL)
3018		return B_NO_MEMORY;
3019
3020	newTransaction->id = atomic_add(&cache->next_transaction_id, 1);
3021	T(Detach(cache, transaction, newTransaction));
3022
3023	notify_transaction_listeners(cache, transaction, TRANSACTION_ENDED);
3024
3025	if (add_transaction_listener(cache, transaction, TRANSACTION_WRITTEN, hook,
3026			data) != B_OK) {
3027		delete newTransaction;
3028		return B_NO_MEMORY;
3029	}
3030
3031	cached_block* last = NULL;
3032	cached_block* next;
3033	for (cached_block* block = transaction->first_block; block != NULL;
3034			block = next) {
3035		next = block->transaction_next;
3036		ASSERT(block->previous_transaction == NULL);
3037
3038		if (block->discard) {
3039			cache->DiscardBlock(block);
3040			transaction->main_num_blocks--;
3041			continue;
3042		}
3043
3044		if (block->parent_data != NULL) {
3045			// The block changed in the parent - free the original data, since
3046			// they will be replaced by what is in current.
3047			ASSERT(block->original_data != NULL);
3048			cache->Free(block->original_data);
3049
3050			if (block->parent_data != block->current_data) {
3051				// The block had been changed in both transactions
3052				block->original_data = block->parent_data;
3053			} else {
3054				// The block has only been changed in the parent
3055				block->original_data = NULL;
3056			}
3057
3058			// move the block to the previous transaction list
3059			transaction->blocks.Add(block);
3060			block->previous_transaction = transaction;
3061			block->parent_data = NULL;
3062		}
3063
3064		if (block->original_data != NULL) {
3065			// This block had been changed in the current sub transaction,
3066			// we need to move this block over to the new transaction.
3067			ASSERT(block->parent_data == NULL);
3068
3069			if (last == NULL)
3070				newTransaction->first_block = block;
3071			else
3072				last->transaction_next = block;
3073
3074			block->transaction = newTransaction;
3075			last = block;
3076		} else
3077			block->transaction = NULL;
3078
3079		block->transaction_next = NULL;
3080	}
3081
3082	newTransaction->num_blocks = transaction->sub_num_blocks;
3083
3084	transaction->open = false;
3085	transaction->has_sub_transaction = false;
3086	transaction->num_blocks = transaction->main_num_blocks;
3087	transaction->sub_num_blocks = 0;
3088
3089	cache->transaction_hash->Insert(newTransaction);
3090	cache->last_transaction = newTransaction;
3091
3092	return newTransaction->id;
3093}
3094
3095
3096status_t
3097cache_abort_sub_transaction(void* _cache, int32 id)
3098{
3099	block_cache* cache = (block_cache*)_cache;
3100	TransactionLocker locker(cache);
3101
3102	TRACE(("cache_abort_sub_transaction(id = %" B_PRId32 ")\n", id));
3103
3104	cache_transaction* transaction = lookup_transaction(cache, id);
3105	if (transaction == NULL) {
3106		panic("cache_abort_sub_transaction(): invalid transaction ID\n");
3107		return B_BAD_VALUE;
3108	}
3109	if (!transaction->has_sub_transaction)
3110		return B_BAD_VALUE;
3111
3112	T(Abort(cache, transaction));
3113	notify_transaction_listeners(cache, transaction, TRANSACTION_ABORTED);
3114
3115	// revert all changes back to the version of the parent
3116
3117	cached_block* block = transaction->first_block;
3118	cached_block* last = NULL;
3119	cached_block* next;
3120	for (; block != NULL; block = next) {
3121		next = block->transaction_next;
3122
3123		if (block->parent_data == NULL) {
3124			// The parent transaction didn't change the block, but the sub
3125			// transaction did - we need to revert to the original data.
3126			// The block is no longer part of the transaction
3127			if (block->original_data != NULL) {
3128				// The block might not have original data if was empty
3129				memcpy(block->current_data, block->original_data,
3130					cache->block_size);
3131			}
3132
3133			if (last != NULL)
3134				last->transaction_next = next;
3135			else
3136				transaction->first_block = next;
3137
3138			block->transaction_next = NULL;
3139			block->transaction = NULL;
3140			transaction->num_blocks--;
3141
3142			if (block->previous_transaction == NULL) {
3143				cache->Free(block->original_data);
3144				block->original_data = NULL;
3145				block->is_dirty = false;
3146
3147				if (block->ref_count == 0) {
3148					// Move the block into the unused list if possible
3149					block->unused = true;
3150					cache->unused_blocks.Add(block);
3151					cache->unused_block_count++;
3152				}
3153			}
3154		} else {
3155			if (block->parent_data != block->current_data) {
3156				// The block has been changed and must be restored - the block
3157				// is still dirty and part of the transaction
3158				TRACE(("cache_abort_sub_transaction(id = %" B_PRId32 "): "
3159					"restored contents of block %" B_PRIdOFF "\n",
3160					transaction->id, block->block_number));
3161				memcpy(block->current_data, block->parent_data,
3162					cache->block_size);
3163				cache->Free(block->parent_data);
3164				// The block stays dirty
3165			}
3166			block->parent_data = NULL;
3167			last = block;
3168		}
3169
3170		block->discard = false;
3171	}
3172
3173	// all subsequent changes will go into the main transaction
3174	transaction->has_sub_transaction = false;
3175	transaction->sub_num_blocks = 0;
3176
3177	return B_OK;
3178}
3179
3180
3181status_t
3182cache_start_sub_transaction(void* _cache, int32 id)
3183{
3184	block_cache* cache = (block_cache*)_cache;
3185	TransactionLocker locker(cache);
3186
3187	TRACE(("cache_start_sub_transaction(id = %" B_PRId32 ")\n", id));
3188
3189	cache_transaction* transaction = lookup_transaction(cache, id);
3190	if (transaction == NULL) {
3191		panic("cache_start_sub_transaction(): invalid transaction ID %" B_PRId32 "\n",
3192			id);
3193		return B_BAD_VALUE;
3194	}
3195
3196	notify_transaction_listeners(cache, transaction, TRANSACTION_ENDED);
3197
3198	// move all changed blocks up to the parent
3199
3200	cached_block* block = transaction->first_block;
3201	cached_block* next;
3202	for (; block != NULL; block = next) {
3203		next = block->transaction_next;
3204
3205		if (block->parent_data != NULL) {
3206			// There already is an older sub transaction - we acknowledge
3207			// its changes and move its blocks up to the parent
3208			ASSERT(transaction->has_sub_transaction);
3209			cache->FreeBlockParentData(block);
3210		}
3211		if (block->discard) {
3212			// This block has been discarded in the parent transaction.
3213			// Just throw away any changes made in this transaction, so that
3214			// it can still be reverted to its original contents if needed
3215			ASSERT(block->previous_transaction == NULL);
3216			if (block->original_data != NULL) {
3217				memcpy(block->current_data, block->original_data,
3218					cache->block_size);
3219				block->original_data = NULL;
3220			}
3221			continue;
3222		}
3223
3224		// we "allocate" the parent data lazily, that means, we don't copy
3225		// the data (and allocate memory for it) until we need to
3226		block->parent_data = block->current_data;
3227	}
3228
3229	// all subsequent changes will go into the sub transaction
3230	transaction->has_sub_transaction = true;
3231	transaction->main_num_blocks = transaction->num_blocks;
3232	transaction->sub_num_blocks = 0;
3233	T(Action("start-sub", cache, transaction));
3234
3235	return B_OK;
3236}
3237
3238
3239/*!	Adds a transaction listener that gets notified when the transaction
3240	is ended, aborted, written, or idle as specified by \a events.
3241	The listener gets automatically removed when the transaction ends.
3242*/
3243status_t
3244cache_add_transaction_listener(void* _cache, int32 id, int32 events,
3245	transaction_notification_hook hook, void* data)
3246{
3247	block_cache* cache = (block_cache*)_cache;
3248	TransactionLocker locker(cache);
3249
3250	cache_transaction* transaction = lookup_transaction(cache, id);
3251	if (transaction == NULL)
3252		return B_BAD_VALUE;
3253
3254	return add_transaction_listener(cache, transaction, events, hook, data);
3255}
3256
3257
3258status_t
3259cache_remove_transaction_listener(void* _cache, int32 id,
3260	transaction_notification_hook hookFunction, void* data)
3261{
3262	block_cache* cache = (block_cache*)_cache;
3263	TransactionLocker locker(cache);
3264
3265	cache_transaction* transaction = lookup_transaction(cache, id);
3266	if (transaction == NULL)
3267		return B_BAD_VALUE;
3268
3269	ListenerList::Iterator iterator = transaction->listeners.GetIterator();
3270	while (iterator.HasNext()) {
3271		cache_listener* listener = iterator.Next();
3272		if (listener->data == data && listener->hook == hookFunction) {
3273			iterator.Remove();
3274
3275			if (listener->events_pending != 0) {
3276				MutexLocker _(sNotificationsLock);
3277				if (listener->events_pending != 0)
3278					cache->pending_notifications.Remove(listener);
3279			}
3280			delete listener;
3281			return B_OK;
3282		}
3283	}
3284
3285	return B_ENTRY_NOT_FOUND;
3286}
3287
3288
3289status_t
3290cache_next_block_in_transaction(void* _cache, int32 id, bool mainOnly,
3291	long* _cookie, off_t* _blockNumber, void** _data, void** _unchangedData)
3292{
3293	cached_block* block = (cached_block*)*_cookie;
3294	block_cache* cache = (block_cache*)_cache;
3295	TransactionLocker locker(cache);
3296
3297	cache_transaction* transaction = lookup_transaction(cache, id);
3298	if (transaction == NULL || !transaction->open)
3299		return B_BAD_VALUE;
3300
3301	if (block == NULL)
3302		block = transaction->first_block;
3303	else
3304		block = block->transaction_next;
3305
3306	if (transaction->has_sub_transaction) {
3307		if (mainOnly) {
3308			// find next block that the parent changed
3309			while (block != NULL && block->parent_data == NULL)
3310				block = block->transaction_next;
3311		} else {
3312			// find next non-discarded block
3313			while (block != NULL && block->discard)
3314				block = block->transaction_next;
3315		}
3316	}
3317
3318	if (block == NULL)
3319		return B_ENTRY_NOT_FOUND;
3320
3321	if (_blockNumber)
3322		*_blockNumber = block->block_number;
3323	if (_data)
3324		*_data = mainOnly ? block->parent_data : block->current_data;
3325	if (_unchangedData)
3326		*_unchangedData = block->original_data;
3327
3328	*_cookie = (addr_t)block;
3329	return B_OK;
3330}
3331
3332
3333int32
3334cache_blocks_in_transaction(void* _cache, int32 id)
3335{
3336	block_cache* cache = (block_cache*)_cache;
3337	TransactionLocker locker(cache);
3338
3339	cache_transaction* transaction = lookup_transaction(cache, id);
3340	if (transaction == NULL)
3341		return B_BAD_VALUE;
3342
3343	return transaction->num_blocks;
3344}
3345
3346
3347/*!	Returns the number of blocks that are part of the main transaction. If this
3348	transaction does not have a sub transaction yet, this is the same value as
3349	cache_blocks_in_transaction() would return.
3350*/
3351int32
3352cache_blocks_in_main_transaction(void* _cache, int32 id)
3353{
3354	block_cache* cache = (block_cache*)_cache;
3355	TransactionLocker locker(cache);
3356
3357	cache_transaction* transaction = lookup_transaction(cache, id);
3358	if (transaction == NULL)
3359		return B_BAD_VALUE;
3360
3361	if (transaction->has_sub_transaction)
3362		return transaction->main_num_blocks;
3363
3364	return transaction->num_blocks;
3365}
3366
3367
3368int32
3369cache_blocks_in_sub_transaction(void* _cache, int32 id)
3370{
3371	block_cache* cache = (block_cache*)_cache;
3372	TransactionLocker locker(cache);
3373
3374	cache_transaction* transaction = lookup_transaction(cache, id);
3375	if (transaction == NULL)
3376		return B_BAD_VALUE;
3377
3378	return transaction->sub_num_blocks;
3379}
3380
3381
3382/*!	Check if block is in transaction
3383*/
3384bool
3385cache_has_block_in_transaction(void* _cache, int32 id, off_t blockNumber)
3386{
3387	block_cache* cache = (block_cache*)_cache;
3388	TransactionLocker locker(cache);
3389
3390	cached_block* block = cache->hash->Lookup(blockNumber);
3391
3392	return (block != NULL && block->transaction != NULL
3393		&& block->transaction->id == id);
3394}
3395
3396
3397//	#pragma mark - public block cache API
3398
3399
3400void
3401block_cache_delete(void* _cache, bool allowWrites)
3402{
3403	block_cache* cache = (block_cache*)_cache;
3404
3405	if (allowWrites)
3406		block_cache_sync(cache);
3407
3408	mutex_lock(&sCachesLock);
3409	sCaches.Remove(cache);
3410	mutex_unlock(&sCachesLock);
3411
3412	mutex_lock(&cache->lock);
3413
3414	// wait for all blocks to become unbusy
3415	wait_for_busy_reading_blocks(cache);
3416	wait_for_busy_writing_blocks(cache);
3417
3418	// free all blocks
3419
3420	cached_block* block = cache->hash->Clear(true);
3421	while (block != NULL) {
3422		cached_block* next = block->next;
3423		cache->FreeBlock(block);
3424		block = next;
3425	}
3426
3427	// free all transactions (they will all be aborted)
3428
3429	cache_transaction* transaction = cache->transaction_hash->Clear(true);
3430	while (transaction != NULL) {
3431		cache_transaction* next = transaction->next;
3432		delete transaction;
3433		transaction = next;
3434	}
3435
3436	delete cache;
3437}
3438
3439
3440void*
3441block_cache_create(int fd, off_t numBlocks, size_t blockSize, bool readOnly)
3442{
3443	block_cache* cache = new(std::nothrow) block_cache(fd, numBlocks, blockSize,
3444		readOnly);
3445	if (cache == NULL)
3446		return NULL;
3447
3448	if (cache->Init() != B_OK) {
3449		delete cache;
3450		return NULL;
3451	}
3452
3453	MutexLocker _(sCachesLock);
3454	sCaches.Add(cache);
3455
3456	return cache;
3457}
3458
3459
3460status_t
3461block_cache_sync(void* _cache)
3462{
3463	block_cache* cache = (block_cache*)_cache;
3464
3465	// We will sync all dirty blocks to disk that have a completed
3466	// transaction or no transaction only
3467
3468	MutexLocker locker(&cache->lock);
3469
3470	BlockWriter writer(cache);
3471	BlockTable::Iterator iterator(cache->hash);
3472
3473	while (iterator.HasNext()) {
3474		cached_block* block = iterator.Next();
3475		if (block->CanBeWritten())
3476			writer.Add(block);
3477	}
3478
3479	status_t status = writer.Write();
3480
3481	locker.Unlock();
3482
3483	wait_for_notifications(cache);
3484		// make sure that all pending TRANSACTION_WRITTEN notifications
3485		// are handled after we return
3486	return status;
3487}
3488
3489
3490status_t
3491block_cache_sync_etc(void* _cache, off_t blockNumber, size_t numBlocks)
3492{
3493	block_cache* cache = (block_cache*)_cache;
3494
3495	// We will sync all dirty blocks to disk that have a completed
3496	// transaction or no transaction only
3497
3498	if (blockNumber < 0 || blockNumber >= cache->max_blocks) {
3499		panic("block_cache_sync_etc: invalid block number %" B_PRIdOFF
3500			" (max %" B_PRIdOFF ")",
3501			blockNumber, cache->max_blocks - 1);
3502		return B_BAD_VALUE;
3503	}
3504
3505	MutexLocker locker(&cache->lock);
3506	BlockWriter writer(cache);
3507
3508	for (; numBlocks > 0; numBlocks--, blockNumber++) {
3509		cached_block* block = cache->hash->Lookup(blockNumber);
3510		if (block == NULL)
3511			continue;
3512
3513		if (block->CanBeWritten())
3514			writer.Add(block);
3515	}
3516
3517	status_t status = writer.Write();
3518
3519	locker.Unlock();
3520
3521	wait_for_notifications(cache);
3522		// make sure that all pending TRANSACTION_WRITTEN notifications
3523		// are handled after we return
3524	return status;
3525}
3526
3527
3528/*!	Discards a block from the current transaction or from the cache.
3529	You have to call this function when you no longer use a block, ie. when it
3530	might be reclaimed by the file cache in order to make sure they won't
3531	interfere.
3532*/
3533void
3534block_cache_discard(void* _cache, off_t blockNumber, size_t numBlocks)
3535{
3536	// TODO: this could be a nice place to issue the ATA trim command
3537	block_cache* cache = (block_cache*)_cache;
3538	TransactionLocker locker(cache);
3539
3540	BlockWriter writer(cache);
3541
3542	for (size_t i = 0; i < numBlocks; i++, blockNumber++) {
3543		cached_block* block = cache->hash->Lookup(blockNumber);
3544		if (block != NULL && block->previous_transaction != NULL)
3545			writer.Add(block);
3546	}
3547
3548	writer.Write();
3549		// TODO: this can fail, too!
3550
3551	blockNumber -= numBlocks;
3552		// reset blockNumber to its original value
3553
3554	for (size_t i = 0; i < numBlocks; i++, blockNumber++) {
3555		cached_block* block = cache->hash->Lookup(blockNumber);
3556		if (block == NULL)
3557			continue;
3558
3559		ASSERT(block->previous_transaction == NULL);
3560
3561		if (block->unused) {
3562			cache->unused_blocks.Remove(block);
3563			cache->unused_block_count--;
3564			cache->RemoveBlock(block);
3565		} else {
3566			if (block->transaction != NULL && block->parent_data != NULL
3567				&& block->parent_data != block->current_data) {
3568				panic("Discarded block %" B_PRIdOFF " has already been changed in this "
3569					"transaction!", blockNumber);
3570			}
3571
3572			// mark it as discarded (in the current transaction only, if any)
3573			block->discard = true;
3574		}
3575	}
3576}
3577
3578
3579status_t
3580block_cache_make_writable(void* _cache, off_t blockNumber, int32 transaction)
3581{
3582	block_cache* cache = (block_cache*)_cache;
3583	MutexLocker locker(&cache->lock);
3584
3585	if (cache->read_only) {
3586		panic("tried to make block writable on a read-only cache!");
3587		return B_ERROR;
3588	}
3589
3590	// TODO: this can be done better!
3591	void* block;
3592	status_t status = get_writable_cached_block(cache, blockNumber,
3593		blockNumber, 1, transaction, false, &block);
3594	if (status == B_OK) {
3595		put_cached_block((block_cache*)_cache, blockNumber);
3596		return B_OK;
3597	}
3598
3599	return status;
3600}
3601
3602
3603status_t
3604block_cache_get_writable_etc(void* _cache, off_t blockNumber, off_t base,
3605	off_t length, int32 transaction, void** _block)
3606{
3607	block_cache* cache = (block_cache*)_cache;
3608	MutexLocker locker(&cache->lock);
3609
3610	TRACE(("block_cache_get_writable_etc(block = %" B_PRIdOFF ", transaction = %" B_PRId32 ")\n",
3611		blockNumber, transaction));
3612	if (cache->read_only)
3613		panic("tried to get writable block on a read-only cache!");
3614
3615	return get_writable_cached_block(cache, blockNumber, base, length,
3616		transaction, false, _block);
3617}
3618
3619
3620void*
3621block_cache_get_writable(void* _cache, off_t blockNumber, int32 transaction)
3622{
3623	void* block;
3624	if (block_cache_get_writable_etc(_cache, blockNumber,
3625			blockNumber, 1, transaction, &block) == B_OK)
3626		return block;
3627
3628	return NULL;
3629}
3630
3631
3632void*
3633block_cache_get_empty(void* _cache, off_t blockNumber, int32 transaction)
3634{
3635	block_cache* cache = (block_cache*)_cache;
3636	MutexLocker locker(&cache->lock);
3637
3638	TRACE(("block_cache_get_empty(block = %" B_PRIdOFF ", transaction = %" B_PRId32 ")\n",
3639		blockNumber, transaction));
3640	if (cache->read_only)
3641		panic("tried to get empty writable block on a read-only cache!");
3642
3643	void* block;
3644	if (get_writable_cached_block((block_cache*)_cache, blockNumber,
3645			blockNumber, 1, transaction, true, &block) == B_OK)
3646		return block;
3647
3648	return NULL;
3649}
3650
3651
3652status_t
3653block_cache_get_etc(void* _cache, off_t blockNumber, off_t base, off_t length,
3654	const void** _block)
3655{
3656	block_cache* cache = (block_cache*)_cache;
3657	MutexLocker locker(&cache->lock);
3658	bool allocated;
3659
3660	cached_block* block;
3661	status_t status = get_cached_block(cache, blockNumber, &allocated, true,
3662		&block);
3663	if (status != B_OK)
3664		return status;
3665
3666#if BLOCK_CACHE_DEBUG_CHANGED
3667	if (block->compare == NULL)
3668		block->compare = cache->Allocate();
3669	if (block->compare != NULL)
3670		memcpy(block->compare, block->current_data, cache->block_size);
3671#endif
3672	TB(Get(cache, block));
3673
3674	*_block = block->current_data;
3675	return B_OK;
3676}
3677
3678
3679const void*
3680block_cache_get(void* _cache, off_t blockNumber)
3681{
3682	const void* block;
3683	if (block_cache_get_etc(_cache, blockNumber, blockNumber, 1, &block)
3684			== B_OK)
3685		return block;
3686
3687	return NULL;
3688}
3689
3690
3691/*!	Changes the internal status of a writable block to \a dirty. This can be
3692	helpful in case you realize you don't need to change that block anymore
3693	for whatever reason.
3694
3695	Note, you must only use this function on blocks that were acquired
3696	writable!
3697*/
3698status_t
3699block_cache_set_dirty(void* _cache, off_t blockNumber, bool dirty,
3700	int32 transaction)
3701{
3702	block_cache* cache = (block_cache*)_cache;
3703	MutexLocker locker(&cache->lock);
3704
3705	cached_block* block = cache->hash->Lookup(blockNumber);
3706	if (block == NULL)
3707		return B_BAD_VALUE;
3708	if (block->is_dirty == dirty) {
3709		// there is nothing to do for us
3710		return B_OK;
3711	}
3712
3713	// TODO: not yet implemented
3714	if (dirty)
3715		panic("block_cache_set_dirty(): not yet implemented that way!\n");
3716
3717	return B_OK;
3718}
3719
3720
3721void
3722block_cache_put(void* _cache, off_t blockNumber)
3723{
3724	block_cache* cache = (block_cache*)_cache;
3725	MutexLocker locker(&cache->lock);
3726
3727	put_cached_block(cache, blockNumber);
3728}
3729
3730