1/*
2 * Copyright 2004-2020, Axel D��rfler, axeld@pinc-software.de.
3 * Distributed under the terms of the MIT License.
4 */
5
6
7#include <block_cache.h>
8
9#include <unistd.h>
10#include <stdlib.h>
11#include <string.h>
12#include <errno.h>
13
14#include <KernelExport.h>
15#include <fs_cache.h>
16
17#include <condition_variable.h>
18#include <lock.h>
19#include <low_resource_manager.h>
20#include <slab/Slab.h>
21#include <tracing.h>
22#include <util/kernel_cpp.h>
23#include <util/DoublyLinkedList.h>
24#include <util/AutoLock.h>
25#include <vm/vm_page.h>
26
27#include "kernel_debug_config.h"
28
29
30// TODO: this is a naive but growing implementation to test the API:
31//	block reading/writing is not at all optimized for speed, it will
32//	just read and write single blocks.
33// TODO: the retrieval/copy of the original data could be delayed until the
34//		new data must be written, ie. in low memory situations.
35
36#ifdef _KERNEL_MODE
37#	define TRACE_ALWAYS(x...) dprintf(x)
38#else
39#	define TRACE_ALWAYS(x...) printf(x)
40#endif
41
42//#define TRACE_BLOCK_CACHE
43#ifdef TRACE_BLOCK_CACHE
44#	define TRACE(x)	TRACE_ALWAYS(x)
45#else
46#	define TRACE(x) ;
47#endif
48
49
50// This macro is used for fatal situations that are acceptable in a running
51// system, like out of memory situations - should only panic for debugging.
52#define FATAL(x) panic x
53
54static const bigtime_t kTransactionIdleTime = 2000000LL;
55	// a transaction is considered idle after 2 seconds of inactivity
56
57
58namespace {
59
60struct cache_transaction;
61struct cached_block;
62struct block_cache;
63typedef DoublyLinkedListLink<cached_block> block_link;
64
65struct cached_block {
66	cached_block*	next;			// next in hash
67	cached_block*	transaction_next;
68	block_link		link;
69	off_t			block_number;
70	void*			current_data;
71		// The data that is seen by everyone using the API; this one is always
72		// present.
73	void*			original_data;
74		// When in a transaction, this contains the original data from before
75		// the transaction.
76	void*			parent_data;
77		// This is a lazily alloced buffer that represents the contents of the
78		// block in the parent transaction. It may point to current_data if the
79		// contents have been changed only in the parent transaction, or, if the
80		// block has been changed in the current sub transaction already, to a
81		// new block containing the contents changed in the parent transaction.
82		// If this is NULL, the block has not been changed in the parent
83		// transaction at all.
84#if BLOCK_CACHE_DEBUG_CHANGED
85	void*			compare;
86#endif
87	int32			ref_count;
88	int32			last_accessed;
89	bool			busy_reading : 1;
90	bool			busy_writing : 1;
91	bool			is_writing : 1;
92		// Block has been checked out for writing without transactions, and
93		// cannot be written back if set
94	bool			is_dirty : 1;
95	bool			unused : 1;
96	bool			discard : 1;
97	bool			busy_reading_waiters : 1;
98	bool			busy_writing_waiters : 1;
99	cache_transaction* transaction;
100		// This is the current active transaction, if any, the block is
101		// currently in (meaning was changed as a part of it).
102	cache_transaction* previous_transaction;
103		// This is set to the last transaction that was ended containing this
104		// block. In this case, the block has not yet written back yet, and
105		// the changed data is either in current_data, or original_data -- the
106		// latter if the block is already being part of another transaction.
107		// There can only be one previous transaction, so when the active
108		// transaction ends, the changes of the previous transaction have to
109		// be written back before that transaction becomes the next previous
110		// transaction.
111
112	bool CanBeWritten() const;
113	int32 LastAccess() const
114		{ return system_time() / 1000000L - last_accessed; }
115};
116
117typedef DoublyLinkedList<cached_block,
118	DoublyLinkedListMemberGetLink<cached_block,
119		&cached_block::link> > block_list;
120
121struct cache_notification : DoublyLinkedListLinkImpl<cache_notification> {
122	static inline void* operator new(size_t size);
123	static inline void operator delete(void* block);
124
125	int32			transaction_id;
126	int32			events_pending;
127	int32			events;
128	transaction_notification_hook hook;
129	void*			data;
130	bool			delete_after_event;
131};
132
133typedef DoublyLinkedList<cache_notification> NotificationList;
134
135static object_cache* sCacheNotificationCache;
136
137struct cache_listener;
138typedef DoublyLinkedListLink<cache_listener> listener_link;
139
140struct cache_listener : cache_notification {
141	listener_link	link;
142};
143
144typedef DoublyLinkedList<cache_listener,
145	DoublyLinkedListMemberGetLink<cache_listener,
146		&cache_listener::link> > ListenerList;
147
148void*
149cache_notification::operator new(size_t size)
150{
151	// We can't really know whether something is a cache_notification or a
152	// cache_listener at runtime, so we just use one object_cache for both
153	// with the size set to that of the (slightly larger) cache_listener.
154	// In practice, the vast majority of cache_notifications are really
155	// cache_listeners, so this is a more than acceptable trade-off.
156	ASSERT(size <= sizeof(cache_listener));
157	return object_cache_alloc(sCacheNotificationCache, 0);
158}
159
160void
161cache_notification::operator delete(void* block)
162{
163	object_cache_free(sCacheNotificationCache, block, 0);
164}
165
166
167struct BlockHash {
168	typedef off_t			KeyType;
169	typedef	cached_block	ValueType;
170
171	size_t HashKey(KeyType key) const
172	{
173		return key;
174	}
175
176	size_t Hash(ValueType* block) const
177	{
178		return block->block_number;
179	}
180
181	bool Compare(KeyType key, ValueType* block) const
182	{
183		return block->block_number == key;
184	}
185
186	ValueType*& GetLink(ValueType* value) const
187	{
188		return value->next;
189	}
190};
191
192typedef BOpenHashTable<BlockHash> BlockTable;
193
194
195struct TransactionHash {
196	typedef int32				KeyType;
197	typedef	cache_transaction	ValueType;
198
199	size_t HashKey(KeyType key) const
200	{
201		return key;
202	}
203
204	size_t Hash(ValueType* transaction) const;
205	bool Compare(KeyType key, ValueType* transaction) const;
206	ValueType*& GetLink(ValueType* value) const;
207};
208
209typedef BOpenHashTable<TransactionHash> TransactionTable;
210
211
212struct block_cache : DoublyLinkedListLinkImpl<block_cache> {
213	BlockTable*		hash;
214	mutex			lock;
215	int				fd;
216	off_t			max_blocks;
217	size_t			block_size;
218	int32			next_transaction_id;
219	cache_transaction* last_transaction;
220	TransactionTable* transaction_hash;
221
222	object_cache*	buffer_cache;
223	block_list		unused_blocks;
224	uint32			unused_block_count;
225
226	ConditionVariable busy_reading_condition;
227	uint32			busy_reading_count;
228	bool			busy_reading_waiters;
229
230	ConditionVariable busy_writing_condition;
231	uint32			busy_writing_count;
232	bool			busy_writing_waiters;
233
234	bigtime_t		last_block_write;
235	bigtime_t		last_block_write_duration;
236
237	uint32			num_dirty_blocks;
238	bool			read_only;
239
240	NotificationList pending_notifications;
241	ConditionVariable condition_variable;
242
243					block_cache(int fd, off_t numBlocks, size_t blockSize,
244						bool readOnly);
245					~block_cache();
246
247	status_t		Init();
248
249	void			Free(void* buffer);
250	void*			Allocate();
251	void			FreeBlock(cached_block* block);
252	cached_block*	NewBlock(off_t blockNumber);
253	void			FreeBlockParentData(cached_block* block);
254
255	void			RemoveUnusedBlocks(int32 count, int32 minSecondsOld = 0);
256	void			RemoveBlock(cached_block* block);
257	void			DiscardBlock(cached_block* block);
258
259private:
260	static void		_LowMemoryHandler(void* data, uint32 resources,
261						int32 level);
262	cached_block*	_GetUnusedBlock();
263};
264
265struct cache_transaction {
266	cache_transaction();
267
268	cache_transaction* next;
269	int32			id;
270	int32			num_blocks;
271	int32			main_num_blocks;
272	int32			sub_num_blocks;
273	cached_block*	first_block;
274	block_list		blocks;
275	ListenerList	listeners;
276	bool			open;
277	bool			has_sub_transaction;
278	bigtime_t		last_used;
279	int32			busy_writing_count;
280};
281
282
283class BlockWriter {
284public:
285								BlockWriter(block_cache* cache,
286									size_t max = SIZE_MAX);
287								~BlockWriter();
288
289			bool				Add(cached_block* block,
290									cache_transaction* transaction = NULL);
291			bool				Add(cache_transaction* transaction,
292									bool& hasLeftOvers);
293
294			status_t			Write(cache_transaction* transaction = NULL,
295									bool canUnlock = true);
296
297			bool				DeletedTransaction() const
298									{ return fDeletedTransaction; }
299
300	static	status_t			WriteBlock(block_cache* cache,
301									cached_block* block);
302
303private:
304			void*				_Data(cached_block* block) const;
305			status_t			_WriteBlock(cached_block* block);
306			void				_BlockDone(cached_block* block,
307									cache_transaction* transaction);
308			void				_UnmarkWriting(cached_block* block);
309
310	static	int					_CompareBlocks(const void* _blockA,
311									const void* _blockB);
312
313private:
314	static	const size_t		kBufferSize = 64;
315
316			block_cache*		fCache;
317			cached_block*		fBuffer[kBufferSize];
318			cached_block**		fBlocks;
319			size_t				fCount;
320			size_t				fTotal;
321			size_t				fCapacity;
322			size_t				fMax;
323			status_t			fStatus;
324			bool				fDeletedTransaction;
325};
326
327
328class TransactionLocking {
329public:
330	inline bool Lock(block_cache* cache)
331	{
332		mutex_lock(&cache->lock);
333
334		while (cache->busy_writing_count != 0) {
335			// wait for all blocks to be written
336			ConditionVariableEntry entry;
337			cache->busy_writing_condition.Add(&entry);
338			cache->busy_writing_waiters = true;
339
340			mutex_unlock(&cache->lock);
341
342			entry.Wait();
343
344			mutex_lock(&cache->lock);
345		}
346
347		return true;
348	}
349
350	inline void Unlock(block_cache* cache)
351	{
352		mutex_unlock(&cache->lock);
353	}
354};
355
356typedef AutoLocker<block_cache, TransactionLocking> TransactionLocker;
357
358} // namespace
359
360
361#if BLOCK_CACHE_BLOCK_TRACING && !defined(BUILDING_USERLAND_FS_SERVER)
362namespace BlockTracing {
363
364class Action : public AbstractTraceEntry {
365public:
366	Action(block_cache* cache, cached_block* block)
367		:
368		fCache(cache),
369		fBlockNumber(block->block_number),
370		fIsDirty(block->is_dirty),
371		fHasOriginal(block->original_data != NULL),
372		fHasParent(block->parent_data != NULL),
373		fTransactionID(-1),
374		fPreviousID(-1)
375	{
376		if (block->transaction != NULL)
377			fTransactionID = block->transaction->id;
378		if (block->previous_transaction != NULL)
379			fPreviousID = block->previous_transaction->id;
380	}
381
382	virtual void AddDump(TraceOutput& out)
383	{
384		out.Print("block cache %p, %s %" B_PRIu64 ", %c%c%c transaction %" B_PRId32
385			" (previous id %" B_PRId32 ")\n", fCache, _Action(), fBlockNumber,
386			fIsDirty ? 'd' : '-', fHasOriginal ? 'o' : '-',
387			fHasParent ? 'p' : '-', fTransactionID, fPreviousID);
388	}
389
390	virtual const char* _Action() const = 0;
391
392private:
393	block_cache*		fCache;
394	uint64				fBlockNumber;
395	bool				fIsDirty;
396	bool				fHasOriginal;
397	bool				fHasParent;
398	int32				fTransactionID;
399	int32				fPreviousID;
400};
401
402class Get : public Action {
403public:
404	Get(block_cache* cache, cached_block* block)
405		:
406		Action(cache, block)
407	{
408		Initialized();
409	}
410
411	virtual const char* _Action() const { return "get"; }
412};
413
414class Put : public Action {
415public:
416	Put(block_cache* cache, cached_block* block)
417		:
418		Action(cache, block)
419	{
420		Initialized();
421	}
422
423	virtual const char* _Action() const { return "put"; }
424};
425
426class Read : public Action {
427public:
428	Read(block_cache* cache, cached_block* block)
429		:
430		Action(cache, block)
431	{
432		Initialized();
433	}
434
435	virtual const char* _Action() const { return "read"; }
436};
437
438class Write : public Action {
439public:
440	Write(block_cache* cache, cached_block* block)
441		:
442		Action(cache, block)
443	{
444		Initialized();
445	}
446
447	virtual const char* _Action() const { return "write"; }
448};
449
450class Flush : public Action {
451public:
452	Flush(block_cache* cache, cached_block* block, bool getUnused = false)
453		:
454		Action(cache, block),
455		fGetUnused(getUnused)
456	{
457		Initialized();
458	}
459
460	virtual const char* _Action() const
461		{ return fGetUnused ? "get-unused" : "flush"; }
462
463private:
464	bool	fGetUnused;
465};
466
467class Error : public AbstractTraceEntry {
468public:
469	Error(block_cache* cache, uint64 blockNumber, const char* message,
470			status_t status = B_OK)
471		:
472		fCache(cache),
473		fBlockNumber(blockNumber),
474		fMessage(message),
475		fStatus(status)
476	{
477		Initialized();
478	}
479
480	virtual void AddDump(TraceOutput& out)
481	{
482		out.Print("block cache %p, error %" B_PRIu64 ", %s%s%s",
483			fCache, fBlockNumber, fMessage, fStatus != B_OK ? ": " : "",
484			fStatus != B_OK ? strerror(fStatus) : "");
485	}
486
487private:
488	block_cache*	fCache;
489	uint64			fBlockNumber;
490	const char*		fMessage;
491	status_t		fStatus;
492};
493
494#if BLOCK_CACHE_BLOCK_TRACING >= 2
495class BlockData : public AbstractTraceEntry {
496public:
497	enum {
498		kCurrent	= 0x01,
499		kParent		= 0x02,
500		kOriginal	= 0x04
501	};
502
503	BlockData(block_cache* cache, cached_block* block, const char* message)
504		:
505		fCache(cache),
506		fSize(cache->block_size),
507		fBlockNumber(block->block_number),
508		fMessage(message)
509	{
510		_Allocate(fCurrent, block->current_data);
511		_Allocate(fParent, block->parent_data);
512		_Allocate(fOriginal, block->original_data);
513
514#if KTRACE_PRINTF_STACK_TRACE
515		fStackTrace = capture_tracing_stack_trace(KTRACE_PRINTF_STACK_TRACE, 1,
516			false);
517#endif
518
519		Initialized();
520	}
521
522	virtual void AddDump(TraceOutput& out)
523	{
524		out.Print("block cache %p, block %" B_PRIu64 ", data %c%c%c: %s",
525			fCache, fBlockNumber, fCurrent != NULL ? 'c' : '-',
526			fParent != NULL ? 'p' : '-', fOriginal != NULL ? 'o' : '-',
527			fMessage);
528	}
529
530#if KTRACE_PRINTF_STACK_TRACE
531	virtual void DumpStackTrace(TraceOutput& out)
532	{
533		out.PrintStackTrace(fStackTrace);
534	}
535#endif
536
537	void DumpBlocks(uint32 which, uint32 offset, uint32 size)
538	{
539		if ((which & kCurrent) != 0)
540			DumpBlock(kCurrent, offset, size);
541		if ((which & kParent) != 0)
542			DumpBlock(kParent, offset, size);
543		if ((which & kOriginal) != 0)
544			DumpBlock(kOriginal, offset, size);
545	}
546
547	void DumpBlock(uint32 which, uint32 offset, uint32 size)
548	{
549		if (offset > fSize) {
550			kprintf("invalid offset (block size %" B_PRIu32 ")\n", fSize);
551			return;
552		}
553		if (offset + size > fSize)
554			size = fSize - offset;
555
556		const char* label;
557		uint8* data;
558
559		if ((which & kCurrent) != 0) {
560			label = "current";
561			data = fCurrent;
562		} else if ((which & kParent) != 0) {
563			label = "parent";
564			data = fParent;
565		} else if ((which & kOriginal) != 0) {
566			label = "original";
567			data = fOriginal;
568		} else
569			return;
570
571		kprintf("%s: offset %" B_PRIu32 ", %" B_PRIu32 " bytes\n", label, offset, size);
572
573		static const uint32 kBlockSize = 16;
574		data += offset;
575
576		for (uint32 i = 0; i < size;) {
577			int start = i;
578
579			kprintf("  %04" B_PRIx32 " ", i);
580			for (; i < start + kBlockSize; i++) {
581				if (!(i % 4))
582					kprintf(" ");
583
584				if (i >= size)
585					kprintf("  ");
586				else
587					kprintf("%02x", data[i]);
588			}
589
590			kprintf("\n");
591		}
592	}
593
594private:
595	void _Allocate(uint8*& target, void* source)
596	{
597		if (source == NULL) {
598			target = NULL;
599			return;
600		}
601
602		target = alloc_tracing_buffer_memcpy(source, fSize, false);
603	}
604
605	block_cache*	fCache;
606	uint32			fSize;
607	uint64			fBlockNumber;
608	const char*		fMessage;
609	uint8*			fCurrent;
610	uint8*			fParent;
611	uint8*			fOriginal;
612#if KTRACE_PRINTF_STACK_TRACE
613	tracing_stack_trace* fStackTrace;
614#endif
615};
616#endif	// BLOCK_CACHE_BLOCK_TRACING >= 2
617
618}	// namespace BlockTracing
619
620#	define TB(x) new(std::nothrow) BlockTracing::x;
621#else
622#	define TB(x) ;
623#endif
624
625#if BLOCK_CACHE_BLOCK_TRACING >= 2
626#	define TB2(x) new(std::nothrow) BlockTracing::x;
627#else
628#	define TB2(x) ;
629#endif
630
631
632#if BLOCK_CACHE_TRANSACTION_TRACING && !defined(BUILDING_USERLAND_FS_SERVER)
633namespace TransactionTracing {
634
635class Action : public AbstractTraceEntry {
636public:
637	Action(const char* label, block_cache* cache,
638			cache_transaction* transaction)
639		:
640		fCache(cache),
641		fTransaction(transaction),
642		fID(transaction->id),
643		fSub(transaction->has_sub_transaction),
644		fNumBlocks(transaction->num_blocks),
645		fSubNumBlocks(transaction->sub_num_blocks)
646	{
647		strlcpy(fLabel, label, sizeof(fLabel));
648		Initialized();
649	}
650
651	virtual void AddDump(TraceOutput& out)
652	{
653		out.Print("block cache %p, %s transaction %p (id %" B_PRId32 ")%s"
654			", %" B_PRId32 "/%" B_PRId32 " blocks", fCache, fLabel, fTransaction,
655			fID, fSub ? " sub" : "", fNumBlocks, fSubNumBlocks);
656	}
657
658private:
659	char				fLabel[12];
660	block_cache*		fCache;
661	cache_transaction*	fTransaction;
662	int32				fID;
663	bool				fSub;
664	int32				fNumBlocks;
665	int32				fSubNumBlocks;
666};
667
668class Detach : public AbstractTraceEntry {
669public:
670	Detach(block_cache* cache, cache_transaction* transaction,
671			cache_transaction* newTransaction)
672		:
673		fCache(cache),
674		fTransaction(transaction),
675		fID(transaction->id),
676		fSub(transaction->has_sub_transaction),
677		fNewTransaction(newTransaction),
678		fNewID(newTransaction->id)
679	{
680		Initialized();
681	}
682
683	virtual void AddDump(TraceOutput& out)
684	{
685		out.Print("block cache %p, detach transaction %p (id %" B_PRId32 ")"
686			"from transaction %p (id %" B_PRId32 ")%s",
687			fCache, fNewTransaction, fNewID, fTransaction, fID,
688			fSub ? " sub" : "");
689	}
690
691private:
692	block_cache*		fCache;
693	cache_transaction*	fTransaction;
694	int32				fID;
695	bool				fSub;
696	cache_transaction*	fNewTransaction;
697	int32				fNewID;
698};
699
700class Abort : public AbstractTraceEntry {
701public:
702	Abort(block_cache* cache, cache_transaction* transaction)
703		:
704		fCache(cache),
705		fTransaction(transaction),
706		fID(transaction->id),
707		fNumBlocks(0)
708	{
709		bool isSub = transaction->has_sub_transaction;
710		fNumBlocks = isSub ? transaction->sub_num_blocks
711			: transaction->num_blocks;
712		fBlocks = (off_t*)alloc_tracing_buffer(fNumBlocks * sizeof(off_t));
713		if (fBlocks != NULL) {
714			cached_block* block = transaction->first_block;
715			for (int32 i = 0; block != NULL && i < fNumBlocks;
716					block = block->transaction_next) {
717				fBlocks[i++] = block->block_number;
718			}
719		} else
720			fNumBlocks = 0;
721
722#if KTRACE_PRINTF_STACK_TRACE
723		fStackTrace = capture_tracing_stack_trace(KTRACE_PRINTF_STACK_TRACE, 1,
724			false);
725#endif
726
727		Initialized();
728	}
729
730	virtual void AddDump(TraceOutput& out)
731	{
732		out.Print("block cache %p, abort transaction "
733			"%p (id %" B_PRId32 "), blocks", fCache, fTransaction, fID);
734		for (int32 i = 0; i < fNumBlocks && !out.IsFull(); i++)
735			out.Print(" %" B_PRIdOFF, fBlocks[i]);
736	}
737
738#if KTRACE_PRINTF_STACK_TRACE
739	virtual void DumpStackTrace(TraceOutput& out)
740	{
741		out.PrintStackTrace(fStackTrace);
742	}
743#endif
744
745private:
746	block_cache*		fCache;
747	cache_transaction*	fTransaction;
748	int32				fID;
749	off_t*				fBlocks;
750	int32				fNumBlocks;
751#if KTRACE_PRINTF_STACK_TRACE
752	tracing_stack_trace* fStackTrace;
753#endif
754};
755
756}	// namespace TransactionTracing
757
758#	define T(x) new(std::nothrow) TransactionTracing::x;
759#else
760#	define T(x) ;
761#endif
762
763
764static DoublyLinkedList<block_cache> sCaches;
765static mutex sCachesLock = MUTEX_INITIALIZER("block caches");
766static mutex sCachesMemoryUseLock
767	= MUTEX_INITIALIZER("block caches memory use");
768static size_t sUsedMemory;
769static sem_id sEventSemaphore;
770static mutex sNotificationsLock
771	= MUTEX_INITIALIZER("block cache notifications");
772static thread_id sNotifierWriterThread;
773static DoublyLinkedListLink<block_cache> sMarkCache;
774	// TODO: this only works if the link is the first entry of block_cache
775static object_cache* sBlockCache;
776
777
778//	#pragma mark - notifications/listener
779
780
781/*!	Checks whether or not this is an event that closes a transaction. */
782static inline bool
783is_closing_event(int32 event)
784{
785	return (event & (TRANSACTION_ABORTED | TRANSACTION_ENDED)) != 0;
786}
787
788
789static inline bool
790is_written_event(int32 event)
791{
792	return (event & TRANSACTION_WRITTEN) != 0;
793}
794
795
796/*!	From the specified \a notification, it will remove the lowest pending
797	event, and return that one in \a _event.
798	If there is no pending event anymore, it will return \c false.
799*/
800static bool
801get_next_pending_event(cache_notification* notification, int32* _event)
802{
803	for (int32 eventMask = 1; eventMask <= TRANSACTION_IDLE; eventMask <<= 1) {
804		int32 pending = atomic_and(&notification->events_pending,
805			~eventMask);
806
807		bool more = (pending & ~eventMask) != 0;
808
809		if ((pending & eventMask) != 0) {
810			*_event = eventMask;
811			return more;
812		}
813	}
814
815	return false;
816}
817
818
819static void
820flush_pending_notifications(block_cache* cache)
821{
822	ASSERT_LOCKED_MUTEX(&sCachesLock);
823
824	while (true) {
825		MutexLocker locker(sNotificationsLock);
826
827		cache_notification* notification = cache->pending_notifications.Head();
828		if (notification == NULL)
829			return;
830
831		bool deleteAfterEvent = false;
832		int32 event = -1;
833		if (!get_next_pending_event(notification, &event)) {
834			// remove the notification if this was the last pending event
835			cache->pending_notifications.Remove(notification);
836			deleteAfterEvent = notification->delete_after_event;
837		}
838
839		if (event >= 0) {
840			// Notify listener, we need to copy the notification, as it might
841			// be removed when we unlock the list.
842			cache_notification copy = *notification;
843			locker.Unlock();
844
845			copy.hook(copy.transaction_id, event, copy.data);
846
847			locker.Lock();
848		}
849
850		if (deleteAfterEvent)
851			delete notification;
852	}
853}
854
855
856/*!	Flushes all pending notifications by calling the appropriate hook
857	functions.
858	Must not be called with a cache lock held.
859*/
860static void
861flush_pending_notifications()
862{
863	MutexLocker _(sCachesLock);
864
865	DoublyLinkedList<block_cache>::Iterator iterator = sCaches.GetIterator();
866	while (iterator.HasNext()) {
867		block_cache* cache = iterator.Next();
868
869		flush_pending_notifications(cache);
870	}
871}
872
873
874/*!	Initializes the \a notification as specified. */
875static void
876set_notification(cache_transaction* transaction,
877	cache_notification &notification, int32 events,
878	transaction_notification_hook hook, void* data)
879{
880	notification.transaction_id = transaction != NULL ? transaction->id : -1;
881	notification.events_pending = 0;
882	notification.events = events;
883	notification.hook = hook;
884	notification.data = data;
885	notification.delete_after_event = false;
886}
887
888
889/*!	Makes sure the notification is deleted. It either deletes it directly,
890	when possible, or marks it for deletion if the notification is pending.
891*/
892static void
893delete_notification(cache_notification* notification)
894{
895	MutexLocker locker(sNotificationsLock);
896
897	if (notification->events_pending != 0)
898		notification->delete_after_event = true;
899	else
900		delete notification;
901}
902
903
904/*!	Adds the notification to the pending notifications list, or, if it's
905	already part of it, updates its events_pending field.
906	Also marks the notification to be deleted if \a deleteNotification
907	is \c true.
908	Triggers the notifier thread to run.
909*/
910static void
911add_notification(block_cache* cache, cache_notification* notification,
912	int32 event, bool deleteNotification)
913{
914	if (notification->hook == NULL)
915		return;
916
917	int32 pending = atomic_or(&notification->events_pending, event);
918	if (pending == 0) {
919		// not yet part of the notification list
920		MutexLocker locker(sNotificationsLock);
921		if (deleteNotification)
922			notification->delete_after_event = true;
923		cache->pending_notifications.Add(notification);
924	} else if (deleteNotification) {
925		// we might need to delete it ourselves if we're late
926		delete_notification(notification);
927	}
928
929	release_sem_etc(sEventSemaphore, 1, B_DO_NOT_RESCHEDULE);
930		// We're probably still holding some locks that makes rescheduling
931		// not a good idea at this point.
932}
933
934
935/*!	Notifies all interested listeners of this transaction about the \a event.
936	If \a event is a closing event (ie. TRANSACTION_ENDED, and
937	TRANSACTION_ABORTED), all listeners except those listening to
938	TRANSACTION_WRITTEN will be removed.
939*/
940static void
941notify_transaction_listeners(block_cache* cache, cache_transaction* transaction,
942	int32 event)
943{
944	T(Action("notify", cache, transaction));
945
946	bool isClosing = is_closing_event(event);
947	bool isWritten = is_written_event(event);
948
949	ListenerList::Iterator iterator = transaction->listeners.GetIterator();
950	while (iterator.HasNext()) {
951		cache_listener* listener = iterator.Next();
952
953		bool remove = (isClosing && !is_written_event(listener->events))
954			|| (isWritten && is_written_event(listener->events));
955		if (remove)
956			iterator.Remove();
957
958		if ((listener->events & event) != 0)
959			add_notification(cache, listener, event, remove);
960		else if (remove)
961			delete_notification(listener);
962	}
963}
964
965
966/*!	Removes and deletes all listeners that are still monitoring this
967	transaction.
968*/
969static void
970remove_transaction_listeners(block_cache* cache, cache_transaction* transaction)
971{
972	ListenerList::Iterator iterator = transaction->listeners.GetIterator();
973	while (iterator.HasNext()) {
974		cache_listener* listener = iterator.Next();
975		iterator.Remove();
976
977		delete_notification(listener);
978	}
979}
980
981
982static status_t
983add_transaction_listener(block_cache* cache, cache_transaction* transaction,
984	int32 events, transaction_notification_hook hookFunction, void* data)
985{
986	ListenerList::Iterator iterator = transaction->listeners.GetIterator();
987	while (iterator.HasNext()) {
988		cache_listener* listener = iterator.Next();
989
990		if (listener->data == data && listener->hook == hookFunction) {
991			// this listener already exists, just update it
992			listener->events |= events;
993			return B_OK;
994		}
995	}
996
997	cache_listener* listener = new cache_listener;
998	if (listener == NULL)
999		return B_NO_MEMORY;
1000
1001	set_notification(transaction, *listener, events, hookFunction, data);
1002	transaction->listeners.Add(listener);
1003	return B_OK;
1004}
1005
1006
1007//	#pragma mark - private transaction
1008
1009
1010cache_transaction::cache_transaction()
1011{
1012	num_blocks = 0;
1013	main_num_blocks = 0;
1014	sub_num_blocks = 0;
1015	first_block = NULL;
1016	open = true;
1017	has_sub_transaction = false;
1018	last_used = system_time();
1019	busy_writing_count = 0;
1020}
1021
1022
1023static void
1024delete_transaction(block_cache* cache, cache_transaction* transaction)
1025{
1026	if (cache->last_transaction == transaction)
1027		cache->last_transaction = NULL;
1028
1029	remove_transaction_listeners(cache, transaction);
1030	delete transaction;
1031}
1032
1033
1034static cache_transaction*
1035lookup_transaction(block_cache* cache, int32 id)
1036{
1037	return cache->transaction_hash->Lookup(id);
1038}
1039
1040
1041size_t TransactionHash::Hash(cache_transaction* transaction) const
1042{
1043	return transaction->id;
1044}
1045
1046
1047bool TransactionHash::Compare(int32 key, cache_transaction* transaction) const
1048{
1049	return transaction->id == key;
1050}
1051
1052
1053cache_transaction*& TransactionHash::GetLink(cache_transaction* value) const
1054{
1055	return value->next;
1056}
1057
1058
1059/*!	Writes back any changes made to blocks in \a transaction that are still
1060	part of a previous transacton.
1061*/
1062static status_t
1063write_blocks_in_previous_transaction(block_cache* cache,
1064	cache_transaction* transaction)
1065{
1066	BlockWriter writer(cache);
1067
1068	cached_block* block = transaction->first_block;
1069	for (; block != NULL; block = block->transaction_next) {
1070		if (block->previous_transaction != NULL) {
1071			// need to write back pending changes
1072			writer.Add(block);
1073		}
1074	}
1075
1076	return writer.Write();
1077}
1078
1079
1080//	#pragma mark - cached_block
1081
1082
1083bool
1084cached_block::CanBeWritten() const
1085{
1086	return !busy_writing && !busy_reading
1087		&& (previous_transaction != NULL
1088			|| (transaction == NULL && is_dirty && !is_writing));
1089}
1090
1091
1092//	#pragma mark - BlockWriter
1093
1094
1095BlockWriter::BlockWriter(block_cache* cache, size_t max)
1096	:
1097	fCache(cache),
1098	fBlocks(fBuffer),
1099	fCount(0),
1100	fTotal(0),
1101	fCapacity(kBufferSize),
1102	fMax(max),
1103	fStatus(B_OK),
1104	fDeletedTransaction(false)
1105{
1106}
1107
1108
1109BlockWriter::~BlockWriter()
1110{
1111	if (fBlocks != fBuffer)
1112		free(fBlocks);
1113}
1114
1115
1116/*!	Adds the specified block to the to be written array. If no more blocks can
1117	be added, false is returned, otherwise true.
1118*/
1119bool
1120BlockWriter::Add(cached_block* block, cache_transaction* transaction)
1121{
1122	ASSERT(block->CanBeWritten());
1123
1124	if (fTotal == fMax)
1125		return false;
1126
1127	if (fCount >= fCapacity) {
1128		// Enlarge array if necessary
1129		cached_block** newBlocks;
1130		size_t newCapacity = max_c(256, fCapacity * 2);
1131		if (fBlocks == fBuffer)
1132			newBlocks = (cached_block**)malloc(newCapacity * sizeof(void*));
1133		else {
1134			newBlocks = (cached_block**)realloc(fBlocks,
1135				newCapacity * sizeof(void*));
1136		}
1137
1138		if (newBlocks == NULL) {
1139			// Allocating a larger array failed - we need to write back what
1140			// we have synchronously now (this will also clear the array)
1141			Write(transaction, false);
1142		} else {
1143			if (fBlocks == fBuffer)
1144				memcpy(newBlocks, fBuffer, kBufferSize * sizeof(void*));
1145
1146			fBlocks = newBlocks;
1147			fCapacity = newCapacity;
1148		}
1149	}
1150
1151	fBlocks[fCount++] = block;
1152	fTotal++;
1153	block->busy_writing = true;
1154	fCache->busy_writing_count++;
1155	if (block->previous_transaction != NULL)
1156		block->previous_transaction->busy_writing_count++;
1157
1158	return true;
1159}
1160
1161
1162/*!	Adds all blocks of the specified transaction to the to be written array.
1163	If no more blocks can be added, false is returned, otherwise true.
1164*/
1165bool
1166BlockWriter::Add(cache_transaction* transaction, bool& hasLeftOvers)
1167{
1168	ASSERT(!transaction->open);
1169
1170	if (transaction->busy_writing_count != 0) {
1171		hasLeftOvers = true;
1172		return true;
1173	}
1174
1175	hasLeftOvers = false;
1176
1177	block_list::Iterator blockIterator = transaction->blocks.GetIterator();
1178	while (cached_block* block = blockIterator.Next()) {
1179		if (!block->CanBeWritten()) {
1180			// This block was already part of a previous transaction within this
1181			// writer
1182			hasLeftOvers = true;
1183			continue;
1184		}
1185		if (!Add(block, transaction))
1186			return false;
1187
1188		if (DeletedTransaction())
1189			break;
1190	}
1191
1192	return true;
1193}
1194
1195
1196/*! Cache must be locked when calling this method, but it will be unlocked
1197	while the blocks are written back.
1198*/
1199status_t
1200BlockWriter::Write(cache_transaction* transaction, bool canUnlock)
1201{
1202	if (fCount == 0)
1203		return B_OK;
1204
1205	if (canUnlock)
1206		mutex_unlock(&fCache->lock);
1207
1208	// Sort blocks in their on-disk order
1209	// TODO: ideally, this should be handled by the I/O scheduler
1210
1211	qsort(fBlocks, fCount, sizeof(void*), &_CompareBlocks);
1212	fDeletedTransaction = false;
1213
1214	bigtime_t start = system_time();
1215
1216	for (uint32 i = 0; i < fCount; i++) {
1217		status_t status = _WriteBlock(fBlocks[i]);
1218		if (status != B_OK) {
1219			// propagate to global error handling
1220			if (fStatus == B_OK)
1221				fStatus = status;
1222
1223			_UnmarkWriting(fBlocks[i]);
1224			fBlocks[i] = NULL;
1225				// This block will not be marked clean
1226		}
1227	}
1228
1229	bigtime_t finish = system_time();
1230
1231	if (canUnlock)
1232		mutex_lock(&fCache->lock);
1233
1234	if (fStatus == B_OK && fCount >= 8) {
1235		fCache->last_block_write = finish;
1236		fCache->last_block_write_duration = (fCache->last_block_write - start)
1237			/ fCount;
1238	}
1239
1240	for (uint32 i = 0; i < fCount; i++)
1241		_BlockDone(fBlocks[i], transaction);
1242
1243	fCount = 0;
1244	return fStatus;
1245}
1246
1247
1248/*!	Writes the specified \a block back to disk. It will always only write back
1249	the oldest change of the block if it is part of more than one transaction.
1250	It will automatically send out TRANSACTION_WRITTEN notices, as well as
1251	delete transactions when they are no longer used, and \a deleteTransaction
1252	is \c true.
1253*/
1254/*static*/ status_t
1255BlockWriter::WriteBlock(block_cache* cache, cached_block* block)
1256{
1257	BlockWriter writer(cache);
1258
1259	writer.Add(block);
1260	return writer.Write();
1261}
1262
1263
1264void*
1265BlockWriter::_Data(cached_block* block) const
1266{
1267	return block->previous_transaction != NULL && block->original_data != NULL
1268		? block->original_data : block->current_data;
1269		// We first need to write back changes from previous transactions
1270}
1271
1272
1273status_t
1274BlockWriter::_WriteBlock(cached_block* block)
1275{
1276	ASSERT(block->busy_writing);
1277
1278	TRACE(("BlockWriter::_WriteBlock(block %" B_PRIdOFF ")\n", block->block_number));
1279	TB(Write(fCache, block));
1280	TB2(BlockData(fCache, block, "before write"));
1281
1282	size_t blockSize = fCache->block_size;
1283
1284	ssize_t written = write_pos(fCache->fd,
1285		block->block_number * blockSize, _Data(block), blockSize);
1286
1287	if (written != (ssize_t)blockSize) {
1288		TB(Error(fCache, block->block_number, "write failed", written));
1289		TRACE_ALWAYS("could not write back block %" B_PRIdOFF " (%s)\n",
1290			block->block_number, strerror(errno));
1291		if (written < 0)
1292			return errno;
1293
1294		return B_IO_ERROR;
1295	}
1296
1297	return B_OK;
1298}
1299
1300
1301void
1302BlockWriter::_BlockDone(cached_block* block,
1303	cache_transaction* transaction)
1304{
1305	if (block == NULL) {
1306		// An error occured when trying to write this block
1307		return;
1308	}
1309
1310	if (fCache->num_dirty_blocks > 0)
1311		fCache->num_dirty_blocks--;
1312
1313	if (_Data(block) == block->current_data)
1314		block->is_dirty = false;
1315
1316	_UnmarkWriting(block);
1317
1318	cache_transaction* previous = block->previous_transaction;
1319	if (previous != NULL) {
1320		previous->blocks.Remove(block);
1321		block->previous_transaction = NULL;
1322
1323		if (block->original_data != NULL && block->transaction == NULL) {
1324			// This block is not part of a transaction, so it does not need
1325			// its original pointer anymore.
1326			fCache->Free(block->original_data);
1327			block->original_data = NULL;
1328		}
1329
1330		// Has the previous transaction been finished with that write?
1331		if (--previous->num_blocks == 0) {
1332			TRACE(("cache transaction %" B_PRId32 " finished!\n", previous->id));
1333			T(Action("written", fCache, previous));
1334
1335			notify_transaction_listeners(fCache, previous,
1336				TRANSACTION_WRITTEN);
1337
1338			if (transaction != NULL) {
1339				// This function is called while iterating transaction_hash. We
1340				// use RemoveUnchecked so the iterator is still valid. A regular
1341				// Remove can trigger a resize of the hash table which would
1342				// result in the linked items in the table changing order.
1343				fCache->transaction_hash->RemoveUnchecked(transaction);
1344			} else
1345				fCache->transaction_hash->Remove(previous);
1346
1347			delete_transaction(fCache, previous);
1348			fDeletedTransaction = true;
1349		}
1350	}
1351	if (block->transaction == NULL && block->ref_count == 0 && !block->unused) {
1352		// the block is no longer used
1353		ASSERT(block->original_data == NULL && block->parent_data == NULL);
1354		block->unused = true;
1355		fCache->unused_blocks.Add(block);
1356		fCache->unused_block_count++;
1357	}
1358
1359	TB2(BlockData(fCache, block, "after write"));
1360}
1361
1362
1363void
1364BlockWriter::_UnmarkWriting(cached_block* block)
1365{
1366	block->busy_writing = false;
1367	if (block->previous_transaction != NULL)
1368		block->previous_transaction->busy_writing_count--;
1369	fCache->busy_writing_count--;
1370
1371	if ((fCache->busy_writing_waiters && fCache->busy_writing_count == 0)
1372		|| block->busy_writing_waiters) {
1373		fCache->busy_writing_waiters = false;
1374		block->busy_writing_waiters = false;
1375		fCache->busy_writing_condition.NotifyAll();
1376	}
1377}
1378
1379
1380/*static*/ int
1381BlockWriter::_CompareBlocks(const void* _blockA, const void* _blockB)
1382{
1383	cached_block* blockA = *(cached_block**)_blockA;
1384	cached_block* blockB = *(cached_block**)_blockB;
1385
1386	off_t diff = blockA->block_number - blockB->block_number;
1387	if (diff > 0)
1388		return 1;
1389
1390	return diff < 0 ? -1 : 0;
1391}
1392
1393
1394//	#pragma mark - block_cache
1395
1396
1397block_cache::block_cache(int _fd, off_t numBlocks, size_t blockSize,
1398		bool readOnly)
1399	:
1400	hash(NULL),
1401	fd(_fd),
1402	max_blocks(numBlocks),
1403	block_size(blockSize),
1404	next_transaction_id(1),
1405	last_transaction(NULL),
1406	transaction_hash(NULL),
1407	buffer_cache(NULL),
1408	unused_block_count(0),
1409	busy_reading_count(0),
1410	busy_reading_waiters(false),
1411	busy_writing_count(0),
1412	busy_writing_waiters(0),
1413	last_block_write(0),
1414	last_block_write_duration(0),
1415	num_dirty_blocks(0),
1416	read_only(readOnly)
1417{
1418}
1419
1420
1421/*! Should be called with the cache's lock held. */
1422block_cache::~block_cache()
1423{
1424	unregister_low_resource_handler(&_LowMemoryHandler, this);
1425
1426	delete transaction_hash;
1427	delete hash;
1428
1429	delete_object_cache(buffer_cache);
1430
1431	mutex_destroy(&lock);
1432}
1433
1434
1435status_t
1436block_cache::Init()
1437{
1438	busy_reading_condition.Init(this, "cache block busy_reading");
1439	busy_writing_condition.Init(this, "cache block busy writing");
1440	condition_variable.Init(this, "cache transaction sync");
1441	mutex_init(&lock, "block cache");
1442
1443	buffer_cache = create_object_cache_etc("block cache buffers", block_size,
1444		8, 0, 0, 0, CACHE_LARGE_SLAB, NULL, NULL, NULL, NULL);
1445	if (buffer_cache == NULL)
1446		return B_NO_MEMORY;
1447
1448	hash = new BlockTable();
1449	if (hash == NULL || hash->Init(1024) != B_OK)
1450		return B_NO_MEMORY;
1451
1452	transaction_hash = new(std::nothrow) TransactionTable();
1453	if (transaction_hash == NULL || transaction_hash->Init(16) != B_OK)
1454		return B_NO_MEMORY;
1455
1456	return register_low_resource_handler(&_LowMemoryHandler, this,
1457		B_KERNEL_RESOURCE_PAGES | B_KERNEL_RESOURCE_MEMORY
1458			| B_KERNEL_RESOURCE_ADDRESS_SPACE, 0);
1459}
1460
1461
1462void
1463block_cache::Free(void* buffer)
1464{
1465	if (buffer != NULL)
1466		object_cache_free(buffer_cache, buffer, 0);
1467}
1468
1469
1470void*
1471block_cache::Allocate()
1472{
1473	void* block = object_cache_alloc(buffer_cache, 0);
1474	if (block != NULL)
1475		return block;
1476
1477	// recycle existing before allocating a new one
1478	RemoveUnusedBlocks(100);
1479
1480	return object_cache_alloc(buffer_cache, 0);
1481}
1482
1483
1484void
1485block_cache::FreeBlock(cached_block* block)
1486{
1487	Free(block->current_data);
1488
1489	if (block->original_data != NULL || block->parent_data != NULL) {
1490		panic("block_cache::FreeBlock(): %" B_PRIdOFF ", original %p, parent %p\n",
1491			block->block_number, block->original_data, block->parent_data);
1492	}
1493
1494#if BLOCK_CACHE_DEBUG_CHANGED
1495	Free(block->compare);
1496#endif
1497
1498	object_cache_free(sBlockCache, block, 0);
1499}
1500
1501
1502/*! Allocates a new block for \a blockNumber, ready for use */
1503cached_block*
1504block_cache::NewBlock(off_t blockNumber)
1505{
1506	cached_block* block = NULL;
1507
1508	if (low_resource_state(B_KERNEL_RESOURCE_PAGES | B_KERNEL_RESOURCE_MEMORY
1509			| B_KERNEL_RESOURCE_ADDRESS_SPACE) != B_NO_LOW_RESOURCE) {
1510		// recycle existing instead of allocating a new one
1511		block = _GetUnusedBlock();
1512	}
1513	if (block == NULL) {
1514		block = (cached_block*)object_cache_alloc(sBlockCache, 0);
1515		if (block != NULL) {
1516			block->current_data = Allocate();
1517			if (block->current_data == NULL) {
1518				object_cache_free(sBlockCache, block, 0);
1519				return NULL;
1520			}
1521		} else {
1522			TB(Error(this, blockNumber, "allocation failed"));
1523			TRACE_ALWAYS("block allocation failed, unused list is %sempty.\n",
1524				unused_blocks.IsEmpty() ? "" : "not ");
1525
1526			// allocation failed, try to reuse an unused block
1527			block = _GetUnusedBlock();
1528			if (block == NULL) {
1529				TB(Error(this, blockNumber, "get unused failed"));
1530				FATAL(("could not allocate block!\n"));
1531				return NULL;
1532			}
1533		}
1534	}
1535
1536	block->block_number = blockNumber;
1537	block->ref_count = 0;
1538	block->last_accessed = 0;
1539	block->transaction_next = NULL;
1540	block->transaction = block->previous_transaction = NULL;
1541	block->original_data = NULL;
1542	block->parent_data = NULL;
1543	block->busy_reading = false;
1544	block->busy_writing = false;
1545	block->is_writing = false;
1546	block->is_dirty = false;
1547	block->unused = false;
1548	block->discard = false;
1549	block->busy_reading_waiters = false;
1550	block->busy_writing_waiters = false;
1551#if BLOCK_CACHE_DEBUG_CHANGED
1552	block->compare = NULL;
1553#endif
1554
1555	return block;
1556}
1557
1558
1559void
1560block_cache::FreeBlockParentData(cached_block* block)
1561{
1562	ASSERT(block->parent_data != NULL);
1563	if (block->parent_data != block->current_data)
1564		Free(block->parent_data);
1565	block->parent_data = NULL;
1566}
1567
1568
1569void
1570block_cache::RemoveUnusedBlocks(int32 count, int32 minSecondsOld)
1571{
1572	TRACE(("block_cache: remove up to %" B_PRId32 " unused blocks\n", count));
1573
1574	for (block_list::Iterator iterator = unused_blocks.GetIterator();
1575			cached_block* block = iterator.Next();) {
1576		if (minSecondsOld >= block->LastAccess()) {
1577			// The list is sorted by last access
1578			break;
1579		}
1580		if (block->busy_reading || block->busy_writing)
1581			continue;
1582
1583		TB(Flush(this, block));
1584		TRACE(("  remove block %" B_PRIdOFF ", last accessed %" B_PRId32 "\n",
1585			block->block_number, block->last_accessed));
1586
1587		// this can only happen if no transactions are used
1588		if (block->is_dirty && !block->discard) {
1589			if (block->busy_writing)
1590				continue;
1591
1592			BlockWriter::WriteBlock(this, block);
1593		}
1594
1595		// remove block from lists
1596		iterator.Remove();
1597		unused_block_count--;
1598		RemoveBlock(block);
1599
1600		if (--count <= 0)
1601			break;
1602	}
1603}
1604
1605
1606void
1607block_cache::RemoveBlock(cached_block* block)
1608{
1609	hash->Remove(block);
1610	FreeBlock(block);
1611}
1612
1613
1614/*!	Discards the block from a transaction (this method must not be called
1615	for blocks not part of a transaction).
1616*/
1617void
1618block_cache::DiscardBlock(cached_block* block)
1619{
1620	ASSERT(block->discard);
1621	ASSERT(block->previous_transaction == NULL);
1622
1623	if (block->parent_data != NULL)
1624		FreeBlockParentData(block);
1625
1626	if (block->original_data != NULL) {
1627		Free(block->original_data);
1628		block->original_data = NULL;
1629	}
1630
1631	RemoveBlock(block);
1632}
1633
1634
1635void
1636block_cache::_LowMemoryHandler(void* data, uint32 resources, int32 level)
1637{
1638	TRACE(("block_cache: low memory handler called with level %" B_PRId32 "\n", level));
1639
1640	// free some blocks according to the low memory state
1641	// (if there is enough memory left, we don't free any)
1642
1643	block_cache* cache = (block_cache*)data;
1644	if (cache->unused_block_count <= 1)
1645		return;
1646
1647	int32 free = 0;
1648	int32 secondsOld = 0;
1649	switch (level) {
1650		case B_NO_LOW_RESOURCE:
1651			return;
1652		case B_LOW_RESOURCE_NOTE:
1653			free = cache->unused_block_count / 4;
1654			secondsOld = 120;
1655			break;
1656		case B_LOW_RESOURCE_WARNING:
1657			free = cache->unused_block_count / 2;
1658			secondsOld = 10;
1659			break;
1660		case B_LOW_RESOURCE_CRITICAL:
1661			free = cache->unused_block_count - 1;
1662			secondsOld = 0;
1663			break;
1664	}
1665
1666	MutexLocker locker(&cache->lock);
1667
1668	if (!locker.IsLocked()) {
1669		// If our block_cache were deleted, it could be that we had
1670		// been called before that deletion went through, therefore,
1671		// acquiring its lock might fail.
1672		return;
1673	}
1674
1675#ifdef TRACE_BLOCK_CACHE
1676	uint32 oldUnused = cache->unused_block_count;
1677#endif
1678
1679	cache->RemoveUnusedBlocks(free, secondsOld);
1680
1681	TRACE(("block_cache::_LowMemoryHandler(): %p: unused: %" B_PRIu32 " -> %" B_PRIu32 "\n",
1682		cache, oldUnused, cache->unused_block_count));
1683}
1684
1685
1686cached_block*
1687block_cache::_GetUnusedBlock()
1688{
1689	TRACE(("block_cache: get unused block\n"));
1690
1691	for (block_list::Iterator iterator = unused_blocks.GetIterator();
1692			cached_block* block = iterator.Next();) {
1693		TB(Flush(this, block, true));
1694		// this can only happen if no transactions are used
1695		if (block->is_dirty && !block->busy_writing && !block->discard)
1696			BlockWriter::WriteBlock(this, block);
1697
1698		// remove block from lists
1699		iterator.Remove();
1700		unused_block_count--;
1701		hash->Remove(block);
1702
1703		ASSERT(block->original_data == NULL && block->parent_data == NULL);
1704		block->unused = false;
1705
1706		// TODO: see if compare data is handled correctly here!
1707#if BLOCK_CACHE_DEBUG_CHANGED
1708		if (block->compare != NULL)
1709			Free(block->compare);
1710#endif
1711		return block;
1712	}
1713
1714	return NULL;
1715}
1716
1717
1718//	#pragma mark - private block functions
1719
1720
1721/*!	Cache must be locked.
1722*/
1723static void
1724mark_block_busy_reading(block_cache* cache, cached_block* block)
1725{
1726	block->busy_reading = true;
1727	cache->busy_reading_count++;
1728}
1729
1730
1731/*!	Cache must be locked.
1732*/
1733static void
1734mark_block_unbusy_reading(block_cache* cache, cached_block* block)
1735{
1736	block->busy_reading = false;
1737	cache->busy_reading_count--;
1738
1739	if ((cache->busy_reading_waiters && cache->busy_reading_count == 0)
1740		|| block->busy_reading_waiters) {
1741		cache->busy_reading_waiters = false;
1742		block->busy_reading_waiters = false;
1743		cache->busy_reading_condition.NotifyAll();
1744	}
1745}
1746
1747
1748/*!	Cache must be locked.
1749*/
1750static void
1751wait_for_busy_reading_block(block_cache* cache, cached_block* block)
1752{
1753	while (block->busy_reading) {
1754		// wait for at least the specified block to be read in
1755		ConditionVariableEntry entry;
1756		cache->busy_reading_condition.Add(&entry);
1757		block->busy_reading_waiters = true;
1758
1759		mutex_unlock(&cache->lock);
1760
1761		entry.Wait();
1762
1763		mutex_lock(&cache->lock);
1764	}
1765}
1766
1767
1768/*!	Cache must be locked.
1769*/
1770static void
1771wait_for_busy_reading_blocks(block_cache* cache)
1772{
1773	while (cache->busy_reading_count != 0) {
1774		// wait for all blocks to be read in
1775		ConditionVariableEntry entry;
1776		cache->busy_reading_condition.Add(&entry);
1777		cache->busy_reading_waiters = true;
1778
1779		mutex_unlock(&cache->lock);
1780
1781		entry.Wait();
1782
1783		mutex_lock(&cache->lock);
1784	}
1785}
1786
1787
1788/*!	Cache must be locked.
1789*/
1790static void
1791wait_for_busy_writing_block(block_cache* cache, cached_block* block)
1792{
1793	while (block->busy_writing) {
1794		// wait for all blocks to be written back
1795		ConditionVariableEntry entry;
1796		cache->busy_writing_condition.Add(&entry);
1797		block->busy_writing_waiters = true;
1798
1799		mutex_unlock(&cache->lock);
1800
1801		entry.Wait();
1802
1803		mutex_lock(&cache->lock);
1804	}
1805}
1806
1807
1808/*!	Cache must be locked.
1809*/
1810static void
1811wait_for_busy_writing_blocks(block_cache* cache)
1812{
1813	while (cache->busy_writing_count != 0) {
1814		// wait for all blocks to be written back
1815		ConditionVariableEntry entry;
1816		cache->busy_writing_condition.Add(&entry);
1817		cache->busy_writing_waiters = true;
1818
1819		mutex_unlock(&cache->lock);
1820
1821		entry.Wait();
1822
1823		mutex_lock(&cache->lock);
1824	}
1825}
1826
1827
1828/*!	Removes a reference from the specified \a block. If this was the last
1829	reference, the block is moved into the unused list.
1830	In low memory situations, it will also free some blocks from that list,
1831	but not necessarily the \a block it just released.
1832*/
1833static void
1834put_cached_block(block_cache* cache, cached_block* block)
1835{
1836#if BLOCK_CACHE_DEBUG_CHANGED
1837	if (!block->is_dirty && block->compare != NULL
1838		&& memcmp(block->current_data, block->compare, cache->block_size)) {
1839		TRACE_ALWAYS("new block:\n");
1840		dump_block((const char*)block->current_data, 256, "  ");
1841		TRACE_ALWAYS("unchanged block:\n");
1842		dump_block((const char*)block->compare, 256, "  ");
1843		BlockWriter::WriteBlock(cache, block);
1844		panic("block_cache: supposed to be clean block was changed!\n");
1845
1846		cache->Free(block->compare);
1847		block->compare = NULL;
1848	}
1849#endif
1850	TB(Put(cache, block));
1851
1852	if (block->ref_count < 1) {
1853		panic("Invalid ref_count for block %p, cache %p\n", block, cache);
1854		return;
1855	}
1856
1857	if (--block->ref_count == 0
1858		&& block->transaction == NULL && block->previous_transaction == NULL) {
1859		// This block is not used anymore, and not part of any transaction
1860		block->is_writing = false;
1861
1862		if (block->discard) {
1863			cache->RemoveBlock(block);
1864		} else {
1865			// put this block in the list of unused blocks
1866			ASSERT(!block->unused);
1867			block->unused = true;
1868
1869			ASSERT(block->original_data == NULL && block->parent_data == NULL);
1870			cache->unused_blocks.Add(block);
1871			cache->unused_block_count++;
1872		}
1873	}
1874}
1875
1876
1877static void
1878put_cached_block(block_cache* cache, off_t blockNumber)
1879{
1880	if (blockNumber < 0 || blockNumber >= cache->max_blocks) {
1881		panic("put_cached_block: invalid block number %" B_PRIdOFF " (max %" B_PRIdOFF ")",
1882			blockNumber, cache->max_blocks - 1);
1883	}
1884
1885	cached_block* block = cache->hash->Lookup(blockNumber);
1886	if (block != NULL)
1887		put_cached_block(cache, block);
1888	else {
1889		TB(Error(cache, blockNumber, "put unknown"));
1890	}
1891}
1892
1893
1894/*!	Retrieves the block \a blockNumber from the hash table, if it's already
1895	there, or reads it from the disk.
1896	You need to have the cache locked when calling this function.
1897
1898	\param _allocated tells you whether or not a new block has been allocated
1899		to satisfy your request.
1900	\param readBlock if \c false, the block will not be read in case it was
1901		not already in the cache. The block you retrieve may contain random
1902		data. If \c true, the cache will be temporarily unlocked while the
1903		block is read in.
1904*/
1905static status_t
1906get_cached_block(block_cache* cache, off_t blockNumber, bool* _allocated,
1907	bool readBlock, cached_block** _block)
1908{
1909	ASSERT_LOCKED_MUTEX(&cache->lock);
1910
1911	if (blockNumber < 0 || blockNumber >= cache->max_blocks) {
1912		panic("get_cached_block: invalid block number %" B_PRIdOFF " (max %" B_PRIdOFF ")",
1913			blockNumber, cache->max_blocks - 1);
1914		return B_BAD_VALUE;
1915	}
1916
1917retry:
1918	cached_block* block = cache->hash->Lookup(blockNumber);
1919	*_allocated = false;
1920
1921	if (block == NULL) {
1922		// put block into cache
1923		block = cache->NewBlock(blockNumber);
1924		if (block == NULL)
1925			return B_NO_MEMORY;
1926
1927		cache->hash->Insert(block);
1928		*_allocated = true;
1929	} else if (block->busy_reading) {
1930		// The block is currently busy_reading - wait and try again later
1931		wait_for_busy_reading_block(cache, block);
1932		goto retry;
1933	}
1934
1935	if (block->unused) {
1936		//TRACE(("remove block %" B_PRIdOFF " from unused\n", blockNumber));
1937		block->unused = false;
1938		cache->unused_blocks.Remove(block);
1939		cache->unused_block_count--;
1940	}
1941
1942	if (*_allocated && readBlock) {
1943		// read block into cache
1944		int32 blockSize = cache->block_size;
1945
1946		mark_block_busy_reading(cache, block);
1947		mutex_unlock(&cache->lock);
1948
1949		ssize_t bytesRead = read_pos(cache->fd, blockNumber * blockSize,
1950			block->current_data, blockSize);
1951
1952		mutex_lock(&cache->lock);
1953		if (bytesRead < blockSize) {
1954			cache->RemoveBlock(block);
1955			TB(Error(cache, blockNumber, "read failed", bytesRead));
1956
1957			TRACE_ALWAYS("could not read block %" B_PRIdOFF ": bytesRead: %zd,"
1958				" error: %s\n", blockNumber, bytesRead, strerror(errno));
1959			return errno;
1960		}
1961		TB(Read(cache, block));
1962
1963		mark_block_unbusy_reading(cache, block);
1964	}
1965
1966	block->ref_count++;
1967	block->last_accessed = system_time() / 1000000L;
1968
1969	*_block = block;
1970	return B_OK;
1971}
1972
1973
1974/*!	Returns the writable block data for the requested blockNumber.
1975	If \a cleared is true, the block is not read from disk; an empty block
1976	is returned.
1977
1978	This is the only method to insert a block into a transaction. It makes
1979	sure that the previous block contents are preserved in that case.
1980*/
1981static status_t
1982get_writable_cached_block(block_cache* cache, off_t blockNumber, off_t base,
1983	off_t length, int32 transactionID, bool cleared, void** _block)
1984{
1985	TRACE(("get_writable_cached_block(blockNumber = %" B_PRIdOFF ", transaction = %" B_PRId32 ")\n",
1986		blockNumber, transactionID));
1987
1988	if (blockNumber < 0 || blockNumber >= cache->max_blocks) {
1989		panic("get_writable_cached_block: invalid block number %" B_PRIdOFF " (max %" B_PRIdOFF ")",
1990			blockNumber, cache->max_blocks - 1);
1991		return B_BAD_VALUE;
1992	}
1993
1994	bool allocated;
1995	cached_block* block;
1996	status_t status = get_cached_block(cache, blockNumber, &allocated,
1997		!cleared, &block);
1998	if (status != B_OK)
1999		return status;
2000
2001	if (block->busy_writing)
2002		wait_for_busy_writing_block(cache, block);
2003
2004	block->discard = false;
2005
2006	// if there is no transaction support, we just return the current block
2007	if (transactionID == -1) {
2008		if (cleared) {
2009			mark_block_busy_reading(cache, block);
2010			mutex_unlock(&cache->lock);
2011
2012			memset(block->current_data, 0, cache->block_size);
2013
2014			mutex_lock(&cache->lock);
2015			mark_block_unbusy_reading(cache, block);
2016		}
2017
2018		block->is_writing = true;
2019
2020		if (!block->is_dirty) {
2021			cache->num_dirty_blocks++;
2022			block->is_dirty = true;
2023				// mark the block as dirty
2024		}
2025
2026		TB(Get(cache, block));
2027		*_block = block->current_data;
2028		return B_OK;
2029	}
2030
2031	cache_transaction* transaction = block->transaction;
2032
2033	if (transaction != NULL && transaction->id != transactionID) {
2034		// TODO: we have to wait here until the other transaction is done.
2035		//	Maybe we should even panic, since we can't prevent any deadlocks.
2036		panic("get_writable_cached_block(): asked to get busy writable block "
2037			"(transaction %" B_PRId32 ")\n", block->transaction->id);
2038		put_cached_block(cache, block);
2039		return B_BAD_VALUE;
2040	}
2041	if (transaction == NULL && transactionID != -1) {
2042		// get new transaction
2043		transaction = lookup_transaction(cache, transactionID);
2044		if (transaction == NULL) {
2045			panic("get_writable_cached_block(): invalid transaction %" B_PRId32 "!\n",
2046				transactionID);
2047			put_cached_block(cache, block);
2048			return B_BAD_VALUE;
2049		}
2050		if (!transaction->open) {
2051			panic("get_writable_cached_block(): transaction already done!\n");
2052			put_cached_block(cache, block);
2053			return B_BAD_VALUE;
2054		}
2055
2056		block->transaction = transaction;
2057
2058		// attach the block to the transaction block list
2059		block->transaction_next = transaction->first_block;
2060		transaction->first_block = block;
2061		transaction->num_blocks++;
2062	}
2063	if (transaction != NULL)
2064		transaction->last_used = system_time();
2065
2066	bool wasUnchanged = block->original_data == NULL
2067		|| block->previous_transaction != NULL;
2068
2069	if (!(allocated && cleared) && block->original_data == NULL) {
2070		// we already have data, so we need to preserve it
2071		block->original_data = cache->Allocate();
2072		if (block->original_data == NULL) {
2073			TB(Error(cache, blockNumber, "allocate original failed"));
2074			FATAL(("could not allocate original_data\n"));
2075			put_cached_block(cache, block);
2076			return B_NO_MEMORY;
2077		}
2078
2079		mark_block_busy_reading(cache, block);
2080		mutex_unlock(&cache->lock);
2081
2082		memcpy(block->original_data, block->current_data, cache->block_size);
2083
2084		mutex_lock(&cache->lock);
2085		mark_block_unbusy_reading(cache, block);
2086	}
2087	if (block->parent_data == block->current_data) {
2088		// remember any previous contents for the parent transaction
2089		block->parent_data = cache->Allocate();
2090		if (block->parent_data == NULL) {
2091			// TODO: maybe we should just continue the current transaction in
2092			// this case...
2093			TB(Error(cache, blockNumber, "allocate parent failed"));
2094			FATAL(("could not allocate parent\n"));
2095			put_cached_block(cache, block);
2096			return B_NO_MEMORY;
2097		}
2098
2099		mark_block_busy_reading(cache, block);
2100		mutex_unlock(&cache->lock);
2101
2102		memcpy(block->parent_data, block->current_data, cache->block_size);
2103
2104		mutex_lock(&cache->lock);
2105		mark_block_unbusy_reading(cache, block);
2106
2107		transaction->sub_num_blocks++;
2108	} else if (transaction != NULL && transaction->has_sub_transaction
2109		&& block->parent_data == NULL && wasUnchanged)
2110		transaction->sub_num_blocks++;
2111
2112	if (cleared) {
2113		mark_block_busy_reading(cache, block);
2114		mutex_unlock(&cache->lock);
2115
2116		memset(block->current_data, 0, cache->block_size);
2117
2118		mutex_lock(&cache->lock);
2119		mark_block_unbusy_reading(cache, block);
2120	}
2121
2122	block->is_dirty = true;
2123	TB(Get(cache, block));
2124	TB2(BlockData(cache, block, "get writable"));
2125
2126	*_block = block->current_data;
2127	return B_OK;
2128}
2129
2130
2131#if DEBUG_BLOCK_CACHE
2132
2133
2134static void
2135dump_block(cached_block* block)
2136{
2137	kprintf("%08lx %9" B_PRIdOFF " %08lx %08lx %08lx %5" B_PRId32 " %6" B_PRId32
2138		" %c%c%c%c%c%c %08lx %08lx\n",
2139		(addr_t)block, block->block_number,
2140		(addr_t)block->current_data, (addr_t)block->original_data,
2141		(addr_t)block->parent_data, block->ref_count, block->LastAccess(),
2142		block->busy_reading ? 'r' : '-', block->busy_writing ? 'w' : '-',
2143		block->is_writing ? 'W' : '-', block->is_dirty ? 'D' : '-',
2144		block->unused ? 'U' : '-', block->discard ? 'D' : '-',
2145		(addr_t)block->transaction,
2146		(addr_t)block->previous_transaction);
2147}
2148
2149
2150static void
2151dump_block_long(cached_block* block)
2152{
2153	kprintf("BLOCK %p\n", block);
2154	kprintf(" current data:  %p\n", block->current_data);
2155	kprintf(" original data: %p\n", block->original_data);
2156	kprintf(" parent data:   %p\n", block->parent_data);
2157#if BLOCK_CACHE_DEBUG_CHANGED
2158	kprintf(" compare data:  %p\n", block->compare);
2159#endif
2160	kprintf(" ref_count:     %" B_PRId32 "\n", block->ref_count);
2161	kprintf(" accessed:      %" B_PRId32 "\n", block->LastAccess());
2162	kprintf(" flags:        ");
2163	if (block->busy_reading)
2164		kprintf(" busy_reading");
2165	if (block->busy_writing)
2166		kprintf(" busy_writing");
2167	if (block->is_writing)
2168		kprintf(" is-writing");
2169	if (block->is_dirty)
2170		kprintf(" is-dirty");
2171	if (block->unused)
2172		kprintf(" unused");
2173	if (block->discard)
2174		kprintf(" discard");
2175	kprintf("\n");
2176	if (block->transaction != NULL) {
2177		kprintf(" transaction:   %p (%" B_PRId32 ")\n", block->transaction,
2178			block->transaction->id);
2179		if (block->transaction_next != NULL) {
2180			kprintf(" next in transaction: %" B_PRIdOFF "\n",
2181				block->transaction_next->block_number);
2182		}
2183	}
2184	if (block->previous_transaction != NULL) {
2185		kprintf(" previous transaction: %p (%" B_PRId32 ")\n",
2186			block->previous_transaction,
2187			block->previous_transaction->id);
2188	}
2189
2190	set_debug_variable("_current", (addr_t)block->current_data);
2191	set_debug_variable("_original", (addr_t)block->original_data);
2192	set_debug_variable("_parent", (addr_t)block->parent_data);
2193}
2194
2195
2196static int
2197dump_cached_block(int argc, char** argv)
2198{
2199	if (argc != 2) {
2200		kprintf("usage: %s <block-address>\n", argv[0]);
2201		return 0;
2202	}
2203
2204	dump_block_long((struct cached_block*)(addr_t)parse_expression(argv[1]));
2205	return 0;
2206}
2207
2208
2209static int
2210dump_cache(int argc, char** argv)
2211{
2212	bool showTransactions = false;
2213	bool showBlocks = false;
2214	int32 i = 1;
2215	while (argv[i] != NULL && argv[i][0] == '-') {
2216		for (char* arg = &argv[i][1]; arg[0]; arg++) {
2217			switch (arg[0]) {
2218				case 'b':
2219					showBlocks = true;
2220					break;
2221				case 't':
2222					showTransactions = true;
2223					break;
2224				default:
2225					print_debugger_command_usage(argv[0]);
2226					return 0;
2227			}
2228		}
2229		i++;
2230	}
2231
2232	if (i >= argc) {
2233		print_debugger_command_usage(argv[0]);
2234		return 0;
2235	}
2236
2237	block_cache* cache = (struct block_cache*)(addr_t)parse_expression(argv[i]);
2238	if (cache == NULL) {
2239		kprintf("invalid cache address\n");
2240		return 0;
2241	}
2242
2243	off_t blockNumber = -1;
2244	if (i + 1 < argc) {
2245		blockNumber = parse_expression(argv[i + 1]);
2246		cached_block* block = cache->hash->Lookup(blockNumber);
2247		if (block != NULL)
2248			dump_block_long(block);
2249		else
2250			kprintf("block %" B_PRIdOFF " not found\n", blockNumber);
2251		return 0;
2252	}
2253
2254	kprintf("BLOCK CACHE: %p\n", cache);
2255
2256	kprintf(" fd:           %d\n", cache->fd);
2257	kprintf(" max_blocks:   %" B_PRIdOFF "\n", cache->max_blocks);
2258	kprintf(" block_size:   %zu\n", cache->block_size);
2259	kprintf(" next_transaction_id: %" B_PRId32 "\n", cache->next_transaction_id);
2260	kprintf(" buffer_cache: %p\n", cache->buffer_cache);
2261	kprintf(" busy_reading: %" B_PRIu32 ", %s waiters\n", cache->busy_reading_count,
2262		cache->busy_reading_waiters ? "has" : "no");
2263	kprintf(" busy_writing: %" B_PRIu32 ", %s waiters\n", cache->busy_writing_count,
2264		cache->busy_writing_waiters ? "has" : "no");
2265
2266	if (!cache->pending_notifications.IsEmpty()) {
2267		kprintf(" pending notifications:\n");
2268
2269		NotificationList::Iterator iterator
2270			= cache->pending_notifications.GetIterator();
2271		while (iterator.HasNext()) {
2272			cache_notification* notification = iterator.Next();
2273
2274			kprintf("  %p %5" B_PRIx32 " %p - %p\n", notification,
2275				notification->events_pending, notification->hook,
2276				notification->data);
2277		}
2278	}
2279
2280	if (showTransactions) {
2281		kprintf(" transactions:\n");
2282		kprintf("address       id state  blocks  main   sub\n");
2283
2284		TransactionTable::Iterator iterator(cache->transaction_hash);
2285
2286		while (iterator.HasNext()) {
2287			cache_transaction* transaction = iterator.Next();
2288			kprintf("%p %5" B_PRId32 " %-7s %5" B_PRId32 " %5" B_PRId32 " %5"
2289				B_PRId32 "\n", transaction, transaction->id,
2290				transaction->open ? "open" : "closed",
2291				transaction->num_blocks, transaction->main_num_blocks,
2292				transaction->sub_num_blocks);
2293		}
2294	}
2295
2296	if (showBlocks) {
2297		kprintf(" blocks:\n");
2298		kprintf("address  block no. current  original parent    refs access "
2299			"flags transact prev. trans\n");
2300	}
2301
2302	uint32 referenced = 0;
2303	uint32 count = 0;
2304	uint32 dirty = 0;
2305	uint32 discarded = 0;
2306	BlockTable::Iterator iterator(cache->hash);
2307	while (iterator.HasNext()) {
2308		cached_block* block = iterator.Next();
2309		if (showBlocks)
2310			dump_block(block);
2311
2312		if (block->is_dirty)
2313			dirty++;
2314		if (block->discard)
2315			discarded++;
2316		if (block->ref_count)
2317			referenced++;
2318		count++;
2319	}
2320
2321	kprintf(" %" B_PRIu32 " blocks total, %" B_PRIu32 " dirty, %" B_PRIu32
2322		" discarded, %" B_PRIu32 " referenced, %" B_PRIu32 " busy, %" B_PRIu32
2323		" in unused.\n",
2324		count, dirty, discarded, referenced, cache->busy_reading_count,
2325		cache->unused_block_count);
2326	return 0;
2327}
2328
2329
2330static int
2331dump_transaction(int argc, char** argv)
2332{
2333	bool showBlocks = false;
2334	int i = 1;
2335	if (argc > 1 && !strcmp(argv[1], "-b")) {
2336		showBlocks = true;
2337		i++;
2338	}
2339
2340	if (argc - i < 1 || argc - i > 2) {
2341		print_debugger_command_usage(argv[0]);
2342		return 0;
2343	}
2344
2345	cache_transaction* transaction = NULL;
2346
2347	if (argc - i == 1) {
2348		transaction = (cache_transaction*)(addr_t)parse_expression(argv[i]);
2349	} else {
2350		block_cache* cache = (block_cache*)(addr_t)parse_expression(argv[i]);
2351		int32 id = parse_expression(argv[i + 1]);
2352		transaction = lookup_transaction(cache, id);
2353		if (transaction == NULL) {
2354			kprintf("No transaction with ID %" B_PRId32 " found.\n", id);
2355			return 0;
2356		}
2357	}
2358
2359	kprintf("TRANSACTION %p\n", transaction);
2360
2361	kprintf(" id:             %" B_PRId32 "\n", transaction->id);
2362	kprintf(" num block:      %" B_PRId32 "\n", transaction->num_blocks);
2363	kprintf(" main num block: %" B_PRId32 "\n", transaction->main_num_blocks);
2364	kprintf(" sub num block:  %" B_PRId32 "\n", transaction->sub_num_blocks);
2365	kprintf(" has sub:        %d\n", transaction->has_sub_transaction);
2366	kprintf(" state:          %s\n", transaction->open ? "open" : "closed");
2367	kprintf(" idle:           %" B_PRId64 " secs\n",
2368		(system_time() - transaction->last_used) / 1000000);
2369
2370	kprintf(" listeners:\n");
2371
2372	ListenerList::Iterator iterator = transaction->listeners.GetIterator();
2373	while (iterator.HasNext()) {
2374		cache_listener* listener = iterator.Next();
2375
2376		kprintf("  %p %5" B_PRIx32 " %p - %p\n", listener, listener->events_pending,
2377			listener->hook, listener->data);
2378	}
2379
2380	if (!showBlocks)
2381		return 0;
2382
2383	kprintf(" blocks:\n");
2384	kprintf("address  block no. current  original parent    refs access "
2385		"flags transact prev. trans\n");
2386
2387	cached_block* block = transaction->first_block;
2388	while (block != NULL) {
2389		dump_block(block);
2390		block = block->transaction_next;
2391	}
2392
2393	kprintf("--\n");
2394
2395	block_list::Iterator blockIterator = transaction->blocks.GetIterator();
2396	while (blockIterator.HasNext()) {
2397		block = blockIterator.Next();
2398		dump_block(block);
2399	}
2400
2401	return 0;
2402}
2403
2404
2405static int
2406dump_caches(int argc, char** argv)
2407{
2408	kprintf("Block caches:\n");
2409	DoublyLinkedList<block_cache>::Iterator i = sCaches.GetIterator();
2410	while (i.HasNext()) {
2411		block_cache* cache = i.Next();
2412		if (cache == (block_cache*)&sMarkCache)
2413			continue;
2414
2415		kprintf("  %p\n", cache);
2416	}
2417
2418	return 0;
2419}
2420
2421
2422#if BLOCK_CACHE_BLOCK_TRACING >= 2
2423static int
2424dump_block_data(int argc, char** argv)
2425{
2426	using namespace BlockTracing;
2427
2428	// Determine which blocks to show
2429
2430	bool printStackTrace = true;
2431	uint32 which = 0;
2432	int32 i = 1;
2433	while (i < argc && argv[i][0] == '-') {
2434		char* arg = &argv[i][1];
2435		while (arg[0]) {
2436			switch (arg[0]) {
2437				case 'c':
2438					which |= BlockData::kCurrent;
2439					break;
2440				case 'p':
2441					which |= BlockData::kParent;
2442					break;
2443				case 'o':
2444					which |= BlockData::kOriginal;
2445					break;
2446
2447				default:
2448					kprintf("invalid block specifier (only o/c/p are "
2449						"allowed).\n");
2450					return 0;
2451			}
2452			arg++;
2453		}
2454
2455		i++;
2456	}
2457	if (which == 0)
2458		which = BlockData::kCurrent | BlockData::kParent | BlockData::kOriginal;
2459
2460	if (i == argc) {
2461		print_debugger_command_usage(argv[0]);
2462		return 0;
2463	}
2464
2465	// Get the range of blocks to print
2466
2467	int64 from = parse_expression(argv[i]);
2468	int64 to = from;
2469	if (argc > i + 1)
2470		to = parse_expression(argv[i + 1]);
2471	if (to < from)
2472		to = from;
2473
2474	uint32 offset = 0;
2475	uint32 size = LONG_MAX;
2476	if (argc > i + 2)
2477		offset = parse_expression(argv[i + 2]);
2478	if (argc > i + 3)
2479		size = parse_expression(argv[i + 3]);
2480
2481	TraceEntryIterator iterator;
2482	iterator.MoveTo(from - 1);
2483
2484	static char sBuffer[1024];
2485	LazyTraceOutput out(sBuffer, sizeof(sBuffer), TRACE_OUTPUT_TEAM_ID);
2486
2487	while (TraceEntry* entry = iterator.Next()) {
2488		int32 index = iterator.Index();
2489		if (index > to)
2490			break;
2491
2492		Action* action = dynamic_cast<Action*>(entry);
2493		if (action != NULL) {
2494			out.Clear();
2495			out.DumpEntry(action);
2496			continue;
2497		}
2498
2499		BlockData* blockData = dynamic_cast<BlockData*>(entry);
2500		if (blockData == NULL)
2501			continue;
2502
2503		out.Clear();
2504
2505		const char* dump = out.DumpEntry(entry);
2506		int length = strlen(dump);
2507		if (length > 0 && dump[length - 1] == '\n')
2508			length--;
2509
2510		kprintf("%5" B_PRId32 ". %.*s\n", index, length, dump);
2511
2512		if (printStackTrace) {
2513			out.Clear();
2514			entry->DumpStackTrace(out);
2515			if (out.Size() > 0)
2516				kputs(out.Buffer());
2517		}
2518
2519		blockData->DumpBlocks(which, offset, size);
2520	}
2521
2522	return 0;
2523}
2524#endif	// BLOCK_CACHE_BLOCK_TRACING >= 2
2525
2526
2527#endif	// DEBUG_BLOCK_CACHE
2528
2529
2530/*!	Traverses through the block_cache list, and returns one cache after the
2531	other. The cache returned is automatically locked when you get it, and
2532	unlocked with the next call to this function. Ignores caches that are in
2533	deletion state.
2534	Returns \c NULL when the end of the list is reached.
2535*/
2536static block_cache*
2537get_next_locked_block_cache(block_cache* last)
2538{
2539	MutexLocker _(sCachesLock);
2540
2541	block_cache* cache;
2542	if (last != NULL) {
2543		mutex_unlock(&last->lock);
2544
2545		cache = sCaches.GetNext((block_cache*)&sMarkCache);
2546		sCaches.Remove((block_cache*)&sMarkCache);
2547	} else
2548		cache = sCaches.Head();
2549
2550	if (cache != NULL) {
2551		mutex_lock(&cache->lock);
2552		sCaches.InsertBefore(sCaches.GetNext(cache), (block_cache*)&sMarkCache);
2553	}
2554
2555	return cache;
2556}
2557
2558
2559/*!	Background thread that continuously checks for pending notifications of
2560	all caches.
2561	Every two seconds, it will also write back up to 64 blocks per cache.
2562*/
2563static status_t
2564block_notifier_and_writer(void* /*data*/)
2565{
2566	const bigtime_t kDefaultTimeout = 2000000LL;
2567	bigtime_t timeout = kDefaultTimeout;
2568
2569	while (true) {
2570		bigtime_t start = system_time();
2571
2572		status_t status = acquire_sem_etc(sEventSemaphore, 1,
2573			B_RELATIVE_TIMEOUT, timeout);
2574		if (status == B_OK) {
2575			flush_pending_notifications();
2576			timeout -= system_time() - start;
2577			continue;
2578		}
2579
2580		// Write 64 blocks of each block_cache roughly every 2 seconds,
2581		// potentially more or less depending on congestion and drive speeds
2582		// (usually much less.) We do not want to queue everything at once
2583		// because a future transaction might then get held up waiting for
2584		// a specific block to be written.
2585		timeout = kDefaultTimeout;
2586		size_t usedMemory;
2587		object_cache_get_usage(sBlockCache, &usedMemory);
2588
2589		block_cache* cache = NULL;
2590		while ((cache = get_next_locked_block_cache(cache)) != NULL) {
2591			// Give some breathing room: wait 2x the length of the potential
2592			// maximum block count-sized write between writes, and also skip
2593			// if there are more than 16 blocks currently being written.
2594			const bigtime_t next = cache->last_block_write
2595					+ cache->last_block_write_duration * 2 * 64;
2596			if (cache->busy_writing_count > 16 || system_time() < next) {
2597				if (cache->last_block_write_duration > 0) {
2598					timeout = min_c(timeout,
2599						cache->last_block_write_duration * 2 * 64);
2600				}
2601				continue;
2602			}
2603
2604			BlockWriter writer(cache, 64);
2605			bool hasMoreBlocks = false;
2606
2607			size_t cacheUsedMemory;
2608			object_cache_get_usage(cache->buffer_cache, &cacheUsedMemory);
2609			usedMemory += cacheUsedMemory;
2610
2611			if (cache->num_dirty_blocks) {
2612				// This cache is not using transactions, we'll scan the blocks
2613				// directly
2614				BlockTable::Iterator iterator(cache->hash);
2615
2616				while (iterator.HasNext()) {
2617					cached_block* block = iterator.Next();
2618					if (block->CanBeWritten() && !writer.Add(block)) {
2619						hasMoreBlocks = true;
2620						break;
2621					}
2622				}
2623			} else {
2624				TransactionTable::Iterator iterator(cache->transaction_hash);
2625
2626				while (iterator.HasNext()) {
2627					cache_transaction* transaction = iterator.Next();
2628					if (transaction->open) {
2629						if (system_time() > transaction->last_used
2630								+ kTransactionIdleTime) {
2631							// Transaction is open but idle
2632							notify_transaction_listeners(cache, transaction,
2633								TRANSACTION_IDLE);
2634						}
2635						continue;
2636					}
2637
2638					bool hasLeftOvers;
2639						// we ignore this one
2640					if (!writer.Add(transaction, hasLeftOvers)) {
2641						hasMoreBlocks = true;
2642						break;
2643					}
2644				}
2645			}
2646
2647			writer.Write();
2648
2649			if (hasMoreBlocks && cache->last_block_write_duration > 0) {
2650				// There are probably still more blocks that we could write, so
2651				// see if we can decrease the timeout.
2652				timeout = min_c(timeout,
2653					cache->last_block_write_duration * 2 * 64);
2654			}
2655
2656			if ((block_cache_used_memory() / B_PAGE_SIZE)
2657					> vm_page_num_pages() / 2) {
2658				// Try to reduce memory usage to half of the available
2659				// RAM at maximum
2660				cache->RemoveUnusedBlocks(1000, 10);
2661			}
2662		}
2663
2664		MutexLocker _(sCachesMemoryUseLock);
2665		sUsedMemory = usedMemory;
2666	}
2667
2668	// never can get here
2669	return B_OK;
2670}
2671
2672
2673/*!	Notify function for wait_for_notifications(). */
2674static void
2675notify_sync(int32 transactionID, int32 event, void* _cache)
2676{
2677	block_cache* cache = (block_cache*)_cache;
2678
2679	cache->condition_variable.NotifyOne();
2680}
2681
2682
2683/*!	Must be called with the sCachesLock held. */
2684static bool
2685is_valid_cache(block_cache* cache)
2686{
2687	ASSERT_LOCKED_MUTEX(&sCachesLock);
2688
2689	DoublyLinkedList<block_cache>::Iterator iterator = sCaches.GetIterator();
2690	while (iterator.HasNext()) {
2691		if (cache == iterator.Next())
2692			return true;
2693	}
2694
2695	return false;
2696}
2697
2698
2699/*!	Waits until all pending notifications are carried out.
2700	Safe to be called from the block writer/notifier thread.
2701	You must not hold the \a cache lock when calling this function.
2702*/
2703static void
2704wait_for_notifications(block_cache* cache)
2705{
2706	MutexLocker locker(sCachesLock);
2707
2708	if (find_thread(NULL) == sNotifierWriterThread) {
2709		// We're the notifier thread, don't wait, but flush all pending
2710		// notifications directly.
2711		if (is_valid_cache(cache))
2712			flush_pending_notifications(cache);
2713		return;
2714	}
2715
2716	// add sync notification
2717	cache_notification notification;
2718	set_notification(NULL, notification, TRANSACTION_WRITTEN, notify_sync,
2719		cache);
2720
2721	ConditionVariableEntry entry;
2722	cache->condition_variable.Add(&entry);
2723
2724	add_notification(cache, &notification, TRANSACTION_WRITTEN, false);
2725	locker.Unlock();
2726
2727	// wait for notification hook to be called
2728	entry.Wait();
2729}
2730
2731
2732status_t
2733block_cache_init(void)
2734{
2735	sBlockCache = create_object_cache_etc("cached blocks", sizeof(cached_block),
2736		8, 0, 0, 0, CACHE_LARGE_SLAB, NULL, NULL, NULL, NULL);
2737	if (sBlockCache == NULL)
2738		return B_NO_MEMORY;
2739
2740	sCacheNotificationCache = create_object_cache("cache notifications",
2741		sizeof(cache_listener), 8, NULL, NULL, NULL);
2742	if (sCacheNotificationCache == NULL)
2743		return B_NO_MEMORY;
2744
2745	new (&sCaches) DoublyLinkedList<block_cache>;
2746		// manually call constructor
2747
2748	sEventSemaphore = create_sem(0, "block cache event");
2749	if (sEventSemaphore < B_OK)
2750		return sEventSemaphore;
2751
2752	sNotifierWriterThread = spawn_kernel_thread(&block_notifier_and_writer,
2753		"block notifier/writer", B_LOW_PRIORITY, NULL);
2754	if (sNotifierWriterThread >= B_OK)
2755		resume_thread(sNotifierWriterThread);
2756
2757#if DEBUG_BLOCK_CACHE
2758	add_debugger_command_etc("block_caches", &dump_caches,
2759		"dumps all block caches", "\n", 0);
2760	add_debugger_command_etc("block_cache", &dump_cache,
2761		"dumps a specific block cache",
2762		"[-bt] <cache-address> [block-number]\n"
2763		"  -t lists the transactions\n"
2764		"  -b lists all blocks\n", 0);
2765	add_debugger_command("cached_block", &dump_cached_block,
2766		"dumps the specified cached block");
2767	add_debugger_command_etc("transaction", &dump_transaction,
2768		"dumps a specific transaction", "[-b] ((<cache> <id>) | <transaction>)\n"
2769		"Either use a block cache pointer and an ID or a pointer to the transaction.\n"
2770		"  -b lists all blocks that are part of this transaction\n", 0);
2771#	if BLOCK_CACHE_BLOCK_TRACING >= 2
2772	add_debugger_command_etc("block_cache_data", &dump_block_data,
2773		"dumps the data blocks logged for the actions",
2774		"[-cpo] <from> [<to> [<offset> [<size>]]]\n"
2775		"If no data specifier is used, all blocks are shown by default.\n"
2776		" -c       the current data is shown, if available.\n"
2777		" -p       the parent data is shown, if available.\n"
2778		" -o       the original data is shown, if available.\n"
2779		" <from>   first index of tracing entries to show.\n"
2780		" <to>     if given, the last entry. If not, only <from> is shown.\n"
2781		" <offset> the offset of the block data.\n"
2782		" <from>   the size of the block data that is dumped\n", 0);
2783#	endif
2784#endif	// DEBUG_BLOCK_CACHE
2785
2786	return B_OK;
2787}
2788
2789
2790size_t
2791block_cache_used_memory(void)
2792{
2793	MutexLocker _(sCachesMemoryUseLock);
2794	return sUsedMemory;
2795}
2796
2797
2798//	#pragma mark - public transaction API
2799
2800
2801int32
2802cache_start_transaction(void* _cache)
2803{
2804	block_cache* cache = (block_cache*)_cache;
2805	TransactionLocker locker(cache);
2806
2807	if (cache->last_transaction && cache->last_transaction->open) {
2808		panic("last transaction (%" B_PRId32 ") still open!\n",
2809			cache->last_transaction->id);
2810	}
2811
2812	cache_transaction* transaction = new(std::nothrow) cache_transaction;
2813	if (transaction == NULL)
2814		return B_NO_MEMORY;
2815
2816	transaction->id = atomic_add(&cache->next_transaction_id, 1);
2817	cache->last_transaction = transaction;
2818
2819	TRACE(("cache_start_transaction(): id %" B_PRId32 " started\n", transaction->id));
2820	T(Action("start", cache, transaction));
2821
2822	cache->transaction_hash->Insert(transaction);
2823
2824	return transaction->id;
2825}
2826
2827
2828status_t
2829cache_sync_transaction(void* _cache, int32 id)
2830{
2831	block_cache* cache = (block_cache*)_cache;
2832	bool hadBusy;
2833
2834	TRACE(("cache_sync_transaction(id %" B_PRId32 ")\n", id));
2835
2836	do {
2837		TransactionLocker locker(cache);
2838		hadBusy = false;
2839
2840		BlockWriter writer(cache);
2841		TransactionTable::Iterator iterator(cache->transaction_hash);
2842
2843		while (iterator.HasNext()) {
2844			// close all earlier transactions which haven't been closed yet
2845			cache_transaction* transaction = iterator.Next();
2846
2847			if (transaction->busy_writing_count != 0) {
2848				hadBusy = true;
2849				continue;
2850			}
2851			if (transaction->id <= id && !transaction->open) {
2852				// write back all of their remaining dirty blocks
2853				T(Action("sync", cache, transaction));
2854
2855				bool hasLeftOvers;
2856				writer.Add(transaction, hasLeftOvers);
2857
2858				if (hasLeftOvers) {
2859					// This transaction contains blocks that a previous
2860					// transaction is trying to write back in this write run
2861					hadBusy = true;
2862				}
2863			}
2864		}
2865
2866		status_t status = writer.Write();
2867		if (status != B_OK)
2868			return status;
2869	} while (hadBusy);
2870
2871	wait_for_notifications(cache);
2872		// make sure that all pending TRANSACTION_WRITTEN notifications
2873		// are handled after we return
2874	return B_OK;
2875}
2876
2877
2878status_t
2879cache_end_transaction(void* _cache, int32 id,
2880	transaction_notification_hook hook, void* data)
2881{
2882	block_cache* cache = (block_cache*)_cache;
2883	TransactionLocker locker(cache);
2884
2885	TRACE(("cache_end_transaction(id = %" B_PRId32 ")\n", id));
2886
2887	cache_transaction* transaction = lookup_transaction(cache, id);
2888	if (transaction == NULL) {
2889		panic("cache_end_transaction(): invalid transaction ID\n");
2890		return B_BAD_VALUE;
2891	}
2892
2893	// Write back all pending transaction blocks
2894	status_t status = write_blocks_in_previous_transaction(cache, transaction);
2895	if (status != B_OK)
2896		return status;
2897
2898	notify_transaction_listeners(cache, transaction, TRANSACTION_ENDED);
2899
2900	if (hook != NULL
2901		&& add_transaction_listener(cache, transaction, TRANSACTION_WRITTEN,
2902			hook, data) != B_OK) {
2903		return B_NO_MEMORY;
2904	}
2905
2906	T(Action("end", cache, transaction));
2907
2908	// iterate through all blocks and free the unchanged original contents
2909
2910	cached_block* next;
2911	for (cached_block* block = transaction->first_block; block != NULL;
2912			block = next) {
2913		next = block->transaction_next;
2914		ASSERT(block->previous_transaction == NULL);
2915
2916		if (block->discard) {
2917			// This block has been discarded in the transaction
2918			cache->DiscardBlock(block);
2919			transaction->num_blocks--;
2920			continue;
2921		}
2922
2923		if (block->original_data != NULL) {
2924			cache->Free(block->original_data);
2925			block->original_data = NULL;
2926		}
2927		if (block->parent_data != NULL) {
2928			ASSERT(transaction->has_sub_transaction);
2929			cache->FreeBlockParentData(block);
2930		}
2931
2932		// move the block to the previous transaction list
2933		transaction->blocks.Add(block);
2934
2935		block->previous_transaction = transaction;
2936		block->transaction_next = NULL;
2937		block->transaction = NULL;
2938	}
2939
2940	transaction->open = false;
2941	return B_OK;
2942}
2943
2944
2945status_t
2946cache_abort_transaction(void* _cache, int32 id)
2947{
2948	block_cache* cache = (block_cache*)_cache;
2949	TransactionLocker locker(cache);
2950
2951	TRACE(("cache_abort_transaction(id = %" B_PRId32 ")\n", id));
2952
2953	cache_transaction* transaction = lookup_transaction(cache, id);
2954	if (transaction == NULL) {
2955		panic("cache_abort_transaction(): invalid transaction ID\n");
2956		return B_BAD_VALUE;
2957	}
2958
2959	T(Abort(cache, transaction));
2960	notify_transaction_listeners(cache, transaction, TRANSACTION_ABORTED);
2961
2962	// iterate through all blocks and restore their original contents
2963
2964	cached_block* block = transaction->first_block;
2965	cached_block* next;
2966	for (; block != NULL; block = next) {
2967		next = block->transaction_next;
2968
2969		if (block->original_data != NULL) {
2970			TRACE(("cache_abort_transaction(id = %" B_PRId32 "): restored contents of "
2971				"block %" B_PRIdOFF "\n", transaction->id, block->block_number));
2972			memcpy(block->current_data, block->original_data,
2973				cache->block_size);
2974			cache->Free(block->original_data);
2975			block->original_data = NULL;
2976		}
2977		if (transaction->has_sub_transaction && block->parent_data != NULL)
2978			cache->FreeBlockParentData(block);
2979
2980		block->transaction_next = NULL;
2981		block->transaction = NULL;
2982		block->discard = false;
2983		if (block->previous_transaction == NULL)
2984			block->is_dirty = false;
2985	}
2986
2987	cache->transaction_hash->Remove(transaction);
2988	delete_transaction(cache, transaction);
2989	return B_OK;
2990}
2991
2992
2993/*!	Acknowledges the current parent transaction, and starts a new transaction
2994	from its sub transaction.
2995	The new transaction also gets a new transaction ID.
2996*/
2997int32
2998cache_detach_sub_transaction(void* _cache, int32 id,
2999	transaction_notification_hook hook, void* data)
3000{
3001	block_cache* cache = (block_cache*)_cache;
3002	TransactionLocker locker(cache);
3003
3004	TRACE(("cache_detach_sub_transaction(id = %" B_PRId32 ")\n", id));
3005
3006	cache_transaction* transaction = lookup_transaction(cache, id);
3007	if (transaction == NULL) {
3008		panic("cache_detach_sub_transaction(): invalid transaction ID\n");
3009		return B_BAD_VALUE;
3010	}
3011	if (!transaction->has_sub_transaction)
3012		return B_BAD_VALUE;
3013
3014	// iterate through all blocks and free the unchanged original contents
3015
3016	status_t status = write_blocks_in_previous_transaction(cache, transaction);
3017	if (status != B_OK)
3018		return status;
3019
3020	// create a new transaction for the sub transaction
3021	cache_transaction* newTransaction = new(std::nothrow) cache_transaction;
3022	if (newTransaction == NULL)
3023		return B_NO_MEMORY;
3024
3025	newTransaction->id = atomic_add(&cache->next_transaction_id, 1);
3026	T(Detach(cache, transaction, newTransaction));
3027
3028	notify_transaction_listeners(cache, transaction, TRANSACTION_ENDED);
3029
3030	if (add_transaction_listener(cache, transaction, TRANSACTION_WRITTEN, hook,
3031			data) != B_OK) {
3032		delete newTransaction;
3033		return B_NO_MEMORY;
3034	}
3035
3036	cached_block* last = NULL;
3037	cached_block* next;
3038	for (cached_block* block = transaction->first_block; block != NULL;
3039			block = next) {
3040		next = block->transaction_next;
3041		ASSERT(block->previous_transaction == NULL);
3042
3043		if (block->discard) {
3044			cache->DiscardBlock(block);
3045			transaction->main_num_blocks--;
3046			continue;
3047		}
3048
3049		if (block->parent_data != NULL) {
3050			// The block changed in the parent - free the original data, since
3051			// they will be replaced by what is in current.
3052			ASSERT(block->original_data != NULL);
3053			cache->Free(block->original_data);
3054
3055			if (block->parent_data != block->current_data) {
3056				// The block had been changed in both transactions
3057				block->original_data = block->parent_data;
3058			} else {
3059				// The block has only been changed in the parent
3060				block->original_data = NULL;
3061			}
3062			block->parent_data = NULL;
3063
3064			// move the block to the previous transaction list
3065			transaction->blocks.Add(block);
3066			block->previous_transaction = transaction;
3067		}
3068
3069		if (block->original_data != NULL) {
3070			// This block had been changed in the current sub transaction,
3071			// we need to move this block over to the new transaction.
3072			ASSERT(block->parent_data == NULL);
3073
3074			if (last == NULL)
3075				newTransaction->first_block = block;
3076			else
3077				last->transaction_next = block;
3078
3079			block->transaction = newTransaction;
3080			last = block;
3081		} else
3082			block->transaction = NULL;
3083
3084		block->transaction_next = NULL;
3085	}
3086
3087	newTransaction->num_blocks = transaction->sub_num_blocks;
3088
3089	transaction->open = false;
3090	transaction->has_sub_transaction = false;
3091	transaction->num_blocks = transaction->main_num_blocks;
3092	transaction->sub_num_blocks = 0;
3093
3094	cache->transaction_hash->Insert(newTransaction);
3095	cache->last_transaction = newTransaction;
3096
3097	return newTransaction->id;
3098}
3099
3100
3101status_t
3102cache_abort_sub_transaction(void* _cache, int32 id)
3103{
3104	block_cache* cache = (block_cache*)_cache;
3105	TransactionLocker locker(cache);
3106
3107	TRACE(("cache_abort_sub_transaction(id = %" B_PRId32 ")\n", id));
3108
3109	cache_transaction* transaction = lookup_transaction(cache, id);
3110	if (transaction == NULL) {
3111		panic("cache_abort_sub_transaction(): invalid transaction ID\n");
3112		return B_BAD_VALUE;
3113	}
3114	if (!transaction->has_sub_transaction)
3115		return B_BAD_VALUE;
3116
3117	T(Abort(cache, transaction));
3118	notify_transaction_listeners(cache, transaction, TRANSACTION_ABORTED);
3119
3120	// revert all changes back to the version of the parent
3121
3122	cached_block* block = transaction->first_block;
3123	cached_block* last = NULL;
3124	cached_block* next;
3125	for (; block != NULL; block = next) {
3126		next = block->transaction_next;
3127
3128		if (block->parent_data == NULL) {
3129			// The parent transaction didn't change the block, but the sub
3130			// transaction did - we need to revert to the original data.
3131			// The block is no longer part of the transaction
3132			if (block->original_data != NULL) {
3133				// The block might not have original data if was empty
3134				memcpy(block->current_data, block->original_data,
3135					cache->block_size);
3136			}
3137
3138			if (last != NULL)
3139				last->transaction_next = next;
3140			else
3141				transaction->first_block = next;
3142
3143			block->transaction_next = NULL;
3144			block->transaction = NULL;
3145			transaction->num_blocks--;
3146
3147			if (block->previous_transaction == NULL) {
3148				cache->Free(block->original_data);
3149				block->original_data = NULL;
3150				block->is_dirty = false;
3151
3152				if (block->ref_count == 0) {
3153					// Move the block into the unused list if possible
3154					block->unused = true;
3155					cache->unused_blocks.Add(block);
3156					cache->unused_block_count++;
3157				}
3158			}
3159		} else {
3160			if (block->parent_data != block->current_data) {
3161				// The block has been changed and must be restored - the block
3162				// is still dirty and part of the transaction
3163				TRACE(("cache_abort_sub_transaction(id = %" B_PRId32 "): "
3164					"restored contents of block %" B_PRIdOFF "\n",
3165					transaction->id, block->block_number));
3166				memcpy(block->current_data, block->parent_data,
3167					cache->block_size);
3168				cache->Free(block->parent_data);
3169				// The block stays dirty
3170			}
3171			block->parent_data = NULL;
3172			last = block;
3173		}
3174
3175		block->discard = false;
3176	}
3177
3178	// all subsequent changes will go into the main transaction
3179	transaction->has_sub_transaction = false;
3180	transaction->sub_num_blocks = 0;
3181
3182	return B_OK;
3183}
3184
3185
3186status_t
3187cache_start_sub_transaction(void* _cache, int32 id)
3188{
3189	block_cache* cache = (block_cache*)_cache;
3190	TransactionLocker locker(cache);
3191
3192	TRACE(("cache_start_sub_transaction(id = %" B_PRId32 ")\n", id));
3193
3194	cache_transaction* transaction = lookup_transaction(cache, id);
3195	if (transaction == NULL) {
3196		panic("cache_start_sub_transaction(): invalid transaction ID %" B_PRId32 "\n",
3197			id);
3198		return B_BAD_VALUE;
3199	}
3200
3201	notify_transaction_listeners(cache, transaction, TRANSACTION_ENDED);
3202
3203	// move all changed blocks up to the parent
3204
3205	cached_block* block = transaction->first_block;
3206	cached_block* next;
3207	for (; block != NULL; block = next) {
3208		next = block->transaction_next;
3209
3210		if (block->parent_data != NULL) {
3211			// There already is an older sub transaction - we acknowledge
3212			// its changes and move its blocks up to the parent
3213			ASSERT(transaction->has_sub_transaction);
3214			cache->FreeBlockParentData(block);
3215		}
3216		if (block->discard) {
3217			// This block has been discarded in the parent transaction.
3218			// Just throw away any changes made in this transaction, so that
3219			// it can still be reverted to its original contents if needed
3220			ASSERT(block->previous_transaction == NULL);
3221			if (block->original_data != NULL) {
3222				memcpy(block->current_data, block->original_data,
3223					cache->block_size);
3224
3225				cache->Free(block->original_data);
3226				block->original_data = NULL;
3227			}
3228			continue;
3229		}
3230
3231		// we "allocate" the parent data lazily, that means, we don't copy
3232		// the data (and allocate memory for it) until we need to
3233		block->parent_data = block->current_data;
3234	}
3235
3236	// all subsequent changes will go into the sub transaction
3237	transaction->has_sub_transaction = true;
3238	transaction->main_num_blocks = transaction->num_blocks;
3239	transaction->sub_num_blocks = 0;
3240	T(Action("start-sub", cache, transaction));
3241
3242	return B_OK;
3243}
3244
3245
3246/*!	Adds a transaction listener that gets notified when the transaction
3247	is ended, aborted, written, or idle as specified by \a events.
3248	The listener gets automatically removed when the transaction ends.
3249*/
3250status_t
3251cache_add_transaction_listener(void* _cache, int32 id, int32 events,
3252	transaction_notification_hook hook, void* data)
3253{
3254	block_cache* cache = (block_cache*)_cache;
3255	TransactionLocker locker(cache);
3256
3257	cache_transaction* transaction = lookup_transaction(cache, id);
3258	if (transaction == NULL)
3259		return B_BAD_VALUE;
3260
3261	return add_transaction_listener(cache, transaction, events, hook, data);
3262}
3263
3264
3265status_t
3266cache_remove_transaction_listener(void* _cache, int32 id,
3267	transaction_notification_hook hookFunction, void* data)
3268{
3269	block_cache* cache = (block_cache*)_cache;
3270	TransactionLocker locker(cache);
3271
3272	cache_transaction* transaction = lookup_transaction(cache, id);
3273	if (transaction == NULL)
3274		return B_BAD_VALUE;
3275
3276	ListenerList::Iterator iterator = transaction->listeners.GetIterator();
3277	while (iterator.HasNext()) {
3278		cache_listener* listener = iterator.Next();
3279		if (listener->data == data && listener->hook == hookFunction) {
3280			iterator.Remove();
3281
3282			if (listener->events_pending != 0) {
3283				MutexLocker _(sNotificationsLock);
3284				if (listener->events_pending != 0)
3285					cache->pending_notifications.Remove(listener);
3286			}
3287			delete listener;
3288			return B_OK;
3289		}
3290	}
3291
3292	return B_ENTRY_NOT_FOUND;
3293}
3294
3295
3296status_t
3297cache_next_block_in_transaction(void* _cache, int32 id, bool mainOnly,
3298	long* _cookie, off_t* _blockNumber, void** _data, void** _unchangedData)
3299{
3300	cached_block* block = (cached_block*)*_cookie;
3301	block_cache* cache = (block_cache*)_cache;
3302	TransactionLocker locker(cache);
3303
3304	cache_transaction* transaction = lookup_transaction(cache, id);
3305	if (transaction == NULL || !transaction->open)
3306		return B_BAD_VALUE;
3307
3308	if (block == NULL)
3309		block = transaction->first_block;
3310	else
3311		block = block->transaction_next;
3312
3313	if (transaction->has_sub_transaction) {
3314		if (mainOnly) {
3315			// find next block that the parent changed
3316			while (block != NULL && block->parent_data == NULL)
3317				block = block->transaction_next;
3318		} else {
3319			// find next non-discarded block
3320			while (block != NULL && block->discard)
3321				block = block->transaction_next;
3322		}
3323	}
3324
3325	if (block == NULL)
3326		return B_ENTRY_NOT_FOUND;
3327
3328	if (_blockNumber)
3329		*_blockNumber = block->block_number;
3330	if (_data)
3331		*_data = mainOnly ? block->parent_data : block->current_data;
3332	if (_unchangedData)
3333		*_unchangedData = block->original_data;
3334
3335	*_cookie = (addr_t)block;
3336	return B_OK;
3337}
3338
3339
3340int32
3341cache_blocks_in_transaction(void* _cache, int32 id)
3342{
3343	block_cache* cache = (block_cache*)_cache;
3344	TransactionLocker locker(cache);
3345
3346	cache_transaction* transaction = lookup_transaction(cache, id);
3347	if (transaction == NULL)
3348		return B_BAD_VALUE;
3349
3350	return transaction->num_blocks;
3351}
3352
3353
3354/*!	Returns the number of blocks that are part of the main transaction. If this
3355	transaction does not have a sub transaction yet, this is the same value as
3356	cache_blocks_in_transaction() would return.
3357*/
3358int32
3359cache_blocks_in_main_transaction(void* _cache, int32 id)
3360{
3361	block_cache* cache = (block_cache*)_cache;
3362	TransactionLocker locker(cache);
3363
3364	cache_transaction* transaction = lookup_transaction(cache, id);
3365	if (transaction == NULL)
3366		return B_BAD_VALUE;
3367
3368	if (transaction->has_sub_transaction)
3369		return transaction->main_num_blocks;
3370
3371	return transaction->num_blocks;
3372}
3373
3374
3375int32
3376cache_blocks_in_sub_transaction(void* _cache, int32 id)
3377{
3378	block_cache* cache = (block_cache*)_cache;
3379	TransactionLocker locker(cache);
3380
3381	cache_transaction* transaction = lookup_transaction(cache, id);
3382	if (transaction == NULL)
3383		return B_BAD_VALUE;
3384
3385	return transaction->sub_num_blocks;
3386}
3387
3388
3389/*!	Check if block is in transaction
3390*/
3391bool
3392cache_has_block_in_transaction(void* _cache, int32 id, off_t blockNumber)
3393{
3394	block_cache* cache = (block_cache*)_cache;
3395	TransactionLocker locker(cache);
3396
3397	cached_block* block = cache->hash->Lookup(blockNumber);
3398
3399	return (block != NULL && block->transaction != NULL
3400		&& block->transaction->id == id);
3401}
3402
3403
3404//	#pragma mark - public block cache API
3405
3406
3407void
3408block_cache_delete(void* _cache, bool allowWrites)
3409{
3410	block_cache* cache = (block_cache*)_cache;
3411
3412	if (allowWrites)
3413		block_cache_sync(cache);
3414
3415	mutex_lock(&sCachesLock);
3416	sCaches.Remove(cache);
3417	mutex_unlock(&sCachesLock);
3418
3419	mutex_lock(&cache->lock);
3420
3421	// wait for all blocks to become unbusy
3422	wait_for_busy_reading_blocks(cache);
3423	wait_for_busy_writing_blocks(cache);
3424
3425	// free all blocks
3426
3427	cached_block* block = cache->hash->Clear(true);
3428	while (block != NULL) {
3429		cached_block* next = block->next;
3430		cache->FreeBlock(block);
3431		block = next;
3432	}
3433
3434	// free all transactions (they will all be aborted)
3435
3436	cache_transaction* transaction = cache->transaction_hash->Clear(true);
3437	while (transaction != NULL) {
3438		cache_transaction* next = transaction->next;
3439		delete transaction;
3440		transaction = next;
3441	}
3442
3443	delete cache;
3444}
3445
3446
3447void*
3448block_cache_create(int fd, off_t numBlocks, size_t blockSize, bool readOnly)
3449{
3450	block_cache* cache = new(std::nothrow) block_cache(fd, numBlocks, blockSize,
3451		readOnly);
3452	if (cache == NULL)
3453		return NULL;
3454
3455	if (cache->Init() != B_OK) {
3456		delete cache;
3457		return NULL;
3458	}
3459
3460	MutexLocker _(sCachesLock);
3461	sCaches.Add(cache);
3462
3463	return cache;
3464}
3465
3466
3467status_t
3468block_cache_sync(void* _cache)
3469{
3470	block_cache* cache = (block_cache*)_cache;
3471
3472	// We will sync all dirty blocks to disk that have a completed
3473	// transaction or no transaction only
3474
3475	MutexLocker locker(&cache->lock);
3476
3477	BlockWriter writer(cache);
3478	BlockTable::Iterator iterator(cache->hash);
3479
3480	while (iterator.HasNext()) {
3481		cached_block* block = iterator.Next();
3482		if (block->CanBeWritten())
3483			writer.Add(block);
3484	}
3485
3486	status_t status = writer.Write();
3487
3488	locker.Unlock();
3489
3490	wait_for_notifications(cache);
3491		// make sure that all pending TRANSACTION_WRITTEN notifications
3492		// are handled after we return
3493	return status;
3494}
3495
3496
3497status_t
3498block_cache_sync_etc(void* _cache, off_t blockNumber, size_t numBlocks)
3499{
3500	block_cache* cache = (block_cache*)_cache;
3501
3502	// We will sync all dirty blocks to disk that have a completed
3503	// transaction or no transaction only
3504
3505	if (blockNumber < 0 || blockNumber >= cache->max_blocks) {
3506		panic("block_cache_sync_etc: invalid block number %" B_PRIdOFF
3507			" (max %" B_PRIdOFF ")",
3508			blockNumber, cache->max_blocks - 1);
3509		return B_BAD_VALUE;
3510	}
3511
3512	MutexLocker locker(&cache->lock);
3513	BlockWriter writer(cache);
3514
3515	for (; numBlocks > 0; numBlocks--, blockNumber++) {
3516		cached_block* block = cache->hash->Lookup(blockNumber);
3517		if (block == NULL)
3518			continue;
3519
3520		if (block->CanBeWritten())
3521			writer.Add(block);
3522	}
3523
3524	status_t status = writer.Write();
3525
3526	locker.Unlock();
3527
3528	wait_for_notifications(cache);
3529		// make sure that all pending TRANSACTION_WRITTEN notifications
3530		// are handled after we return
3531	return status;
3532}
3533
3534
3535/*!	Discards a block from the current transaction or from the cache.
3536	You have to call this function when you no longer use a block, ie. when it
3537	might be reclaimed by the file cache in order to make sure they won't
3538	interfere.
3539*/
3540void
3541block_cache_discard(void* _cache, off_t blockNumber, size_t numBlocks)
3542{
3543	// TODO: this could be a nice place to issue the ATA trim command
3544	block_cache* cache = (block_cache*)_cache;
3545	TransactionLocker locker(cache);
3546
3547	BlockWriter writer(cache);
3548
3549	for (size_t i = 0; i < numBlocks; i++, blockNumber++) {
3550		cached_block* block = cache->hash->Lookup(blockNumber);
3551		if (block != NULL && block->previous_transaction != NULL)
3552			writer.Add(block);
3553	}
3554
3555	writer.Write();
3556		// TODO: this can fail, too!
3557
3558	blockNumber -= numBlocks;
3559		// reset blockNumber to its original value
3560
3561	for (size_t i = 0; i < numBlocks; i++, blockNumber++) {
3562		cached_block* block = cache->hash->Lookup(blockNumber);
3563		if (block == NULL)
3564			continue;
3565
3566		ASSERT(block->previous_transaction == NULL);
3567
3568		if (block->unused) {
3569			cache->unused_blocks.Remove(block);
3570			cache->unused_block_count--;
3571			cache->RemoveBlock(block);
3572		} else {
3573			if (block->transaction != NULL && block->parent_data != NULL
3574				&& block->parent_data != block->current_data) {
3575				panic("Discarded block %" B_PRIdOFF " has already been changed in this "
3576					"transaction!", blockNumber);
3577			}
3578
3579			// mark it as discarded (in the current transaction only, if any)
3580			block->discard = true;
3581		}
3582	}
3583}
3584
3585
3586status_t
3587block_cache_make_writable(void* _cache, off_t blockNumber, int32 transaction)
3588{
3589	block_cache* cache = (block_cache*)_cache;
3590	MutexLocker locker(&cache->lock);
3591
3592	if (cache->read_only) {
3593		panic("tried to make block writable on a read-only cache!");
3594		return B_ERROR;
3595	}
3596
3597	// TODO: this can be done better!
3598	void* block;
3599	status_t status = get_writable_cached_block(cache, blockNumber,
3600		blockNumber, 1, transaction, false, &block);
3601	if (status == B_OK) {
3602		put_cached_block((block_cache*)_cache, blockNumber);
3603		return B_OK;
3604	}
3605
3606	return status;
3607}
3608
3609
3610status_t
3611block_cache_get_writable_etc(void* _cache, off_t blockNumber, off_t base,
3612	off_t length, int32 transaction, void** _block)
3613{
3614	block_cache* cache = (block_cache*)_cache;
3615	MutexLocker locker(&cache->lock);
3616
3617	TRACE(("block_cache_get_writable_etc(block = %" B_PRIdOFF ", transaction = %" B_PRId32 ")\n",
3618		blockNumber, transaction));
3619	if (cache->read_only)
3620		panic("tried to get writable block on a read-only cache!");
3621
3622	return get_writable_cached_block(cache, blockNumber, base, length,
3623		transaction, false, _block);
3624}
3625
3626
3627void*
3628block_cache_get_writable(void* _cache, off_t blockNumber, int32 transaction)
3629{
3630	void* block;
3631	if (block_cache_get_writable_etc(_cache, blockNumber,
3632			blockNumber, 1, transaction, &block) == B_OK)
3633		return block;
3634
3635	return NULL;
3636}
3637
3638
3639void*
3640block_cache_get_empty(void* _cache, off_t blockNumber, int32 transaction)
3641{
3642	block_cache* cache = (block_cache*)_cache;
3643	MutexLocker locker(&cache->lock);
3644
3645	TRACE(("block_cache_get_empty(block = %" B_PRIdOFF ", transaction = %" B_PRId32 ")\n",
3646		blockNumber, transaction));
3647	if (cache->read_only)
3648		panic("tried to get empty writable block on a read-only cache!");
3649
3650	void* block;
3651	if (get_writable_cached_block((block_cache*)_cache, blockNumber,
3652			blockNumber, 1, transaction, true, &block) == B_OK)
3653		return block;
3654
3655	return NULL;
3656}
3657
3658
3659status_t
3660block_cache_get_etc(void* _cache, off_t blockNumber, off_t base, off_t length,
3661	const void** _block)
3662{
3663	block_cache* cache = (block_cache*)_cache;
3664	MutexLocker locker(&cache->lock);
3665	bool allocated;
3666
3667	cached_block* block;
3668	status_t status = get_cached_block(cache, blockNumber, &allocated, true,
3669		&block);
3670	if (status != B_OK)
3671		return status;
3672
3673#if BLOCK_CACHE_DEBUG_CHANGED
3674	if (block->compare == NULL)
3675		block->compare = cache->Allocate();
3676	if (block->compare != NULL)
3677		memcpy(block->compare, block->current_data, cache->block_size);
3678#endif
3679	TB(Get(cache, block));
3680
3681	*_block = block->current_data;
3682	return B_OK;
3683}
3684
3685
3686const void*
3687block_cache_get(void* _cache, off_t blockNumber)
3688{
3689	const void* block;
3690	if (block_cache_get_etc(_cache, blockNumber, blockNumber, 1, &block)
3691			== B_OK)
3692		return block;
3693
3694	return NULL;
3695}
3696
3697
3698/*!	Changes the internal status of a writable block to \a dirty. This can be
3699	helpful in case you realize you don't need to change that block anymore
3700	for whatever reason.
3701
3702	Note, you must only use this function on blocks that were acquired
3703	writable!
3704*/
3705status_t
3706block_cache_set_dirty(void* _cache, off_t blockNumber, bool dirty,
3707	int32 transaction)
3708{
3709	block_cache* cache = (block_cache*)_cache;
3710	MutexLocker locker(&cache->lock);
3711
3712	cached_block* block = cache->hash->Lookup(blockNumber);
3713	if (block == NULL)
3714		return B_BAD_VALUE;
3715	if (block->is_dirty == dirty) {
3716		// there is nothing to do for us
3717		return B_OK;
3718	}
3719
3720	// TODO: not yet implemented
3721	if (dirty)
3722		panic("block_cache_set_dirty(): not yet implemented that way!\n");
3723
3724	return B_OK;
3725}
3726
3727
3728void
3729block_cache_put(void* _cache, off_t blockNumber)
3730{
3731	block_cache* cache = (block_cache*)_cache;
3732	MutexLocker locker(&cache->lock);
3733
3734	put_cached_block(cache, blockNumber);
3735}
3736
3737