1/*
2 * Copyright 2001-2012, Axel D��rfler, axeld@pinc-software.de.
3 * This file may be used under the terms of the MIT License.
4 *
5 * Roughly based on 'btlib' written by Marcus J. Ranum - it shares
6 * no code but achieves binary compatibility with the on disk format.
7 */
8
9
10//! B+Tree implementation
11
12
13#include "Debug.h"
14#include "BPlusTree.h"
15#include "Inode.h"
16#include "Utility.h"
17
18
19/*!	Simple array used for the duplicate handling in the B+Tree. This is an
20	on disk structure.
21*/
22struct duplicate_array {
23	off_t	count;
24	off_t	values[0];
25
26	inline bool IsEmpty() const
27	{
28		return count == 0;
29	}
30
31	inline int32 Count() const
32	{
33		return (int32)BFS_ENDIAN_TO_HOST_INT64(count);
34	}
35
36	inline off_t ValueAt(uint32 index) const
37	{
38		return BFS_ENDIAN_TO_HOST_INT64(values[index]);
39	}
40
41	inline void SetValueAt(uint32 index, off_t value)
42	{
43		values[index] = HOST_ENDIAN_TO_BFS_INT64(value);
44	}
45
46	inline int32 Find(off_t value) const
47	{
48		int32 i;
49		return _FindInternal(value, i) ? i : -1;
50	}
51
52	void Insert(off_t value);
53	bool Remove(off_t value);
54
55private:
56	bool _FindInternal(off_t value, int32& index) const;
57} _PACKED;
58
59
60#ifdef DEBUG
61class NodeChecker {
62public:
63	NodeChecker(const bplustree_node* node, int32 nodeSize, const char* text)
64		:
65		fNode(node),
66		fSize(nodeSize),
67		fText(text)
68	{
69		Check("integrity check failed on construction.");
70	}
71
72	~NodeChecker()
73	{
74		Check("integrity check failed on destruction.");
75	}
76
77	void
78	Check(const char* message)
79	{
80		if (fNode->CheckIntegrity(fSize) != B_OK) {
81			dprintf("%s: %s\n", fText, message);
82			DEBUGGER(("NodeChecker integrity check failed!"));
83		}
84	}
85
86private:
87	const bplustree_node*	fNode;
88	int32					fSize;
89	const char*				fText;
90};
91#endif
92
93
94class BitmapArray {
95public:
96								BitmapArray(size_t numBits);
97								~BitmapArray();
98
99			status_t			InitCheck() const;
100
101			bool				IsSet(size_t index) const;
102			void				Set(size_t index, bool set);
103
104			size_t				CountSet() const { return fCountSet; }
105
106private:
107			uint8*				fBitmap;
108			size_t				fSize;
109			size_t				fCountSet;
110};
111
112
113struct TreeCheck {
114	TreeCheck(BPlusTree* tree)
115		:
116		fLevelCount(0),
117		fFreeCount(0),
118		fNodeSize(tree->NodeSize()),
119		fMaxLevels(tree->fHeader.MaxNumberOfLevels()),
120		fFoundErrors(0),
121		fVisited(tree->Stream()->Size() / tree->NodeSize()),
122		fVisitedFragment(tree->Stream()->Size() / tree->NodeSize())
123	{
124		fPreviousOffsets = (off_t*)malloc(
125			sizeof(off_t) * tree->fHeader.MaxNumberOfLevels());
126		if (fPreviousOffsets != NULL) {
127			for (size_t i = 0; i < fMaxLevels; i++)
128				fPreviousOffsets[i] = BPLUSTREE_NULL;
129		}
130	}
131
132	~TreeCheck()
133	{
134		free(fPreviousOffsets);
135	}
136
137	status_t InitCheck() const
138	{
139		if (fPreviousOffsets == NULL)
140			return B_NO_MEMORY;
141
142		status_t status = fVisited.InitCheck();
143		if (status != B_OK)
144			return status;
145
146		return fVisitedFragment.InitCheck();
147	}
148
149	bool Visited(off_t offset) const
150	{
151		return fVisited.IsSet(offset / fNodeSize);
152	}
153
154	void SetVisited(off_t offset)
155	{
156		fVisited.Set(offset / fNodeSize, true);
157	}
158
159	size_t VisitedCount() const
160	{
161		return fVisited.CountSet();
162	}
163
164	bool VisitedFragment(off_t offset) const
165	{
166		return fVisitedFragment.IsSet(offset / fNodeSize);
167	}
168
169	void SetVisitedFragment(off_t offset)
170	{
171		fVisitedFragment.Set(offset / fNodeSize, true);
172	}
173
174	uint32 MaxLevels() const
175	{
176		return fLevelCount;
177	}
178
179	void SetLevel(uint32 level)
180	{
181		if (fLevelCount < level)
182			fLevelCount = level;
183	}
184
185	off_t PreviousOffset(uint32 level)
186	{
187		return fPreviousOffsets[level];
188	}
189
190	void SetPreviousOffset(uint32 level, off_t offset)
191	{
192		fPreviousOffsets[level] = offset;
193	}
194
195	void FoundError()
196	{
197		fFoundErrors++;
198	}
199
200	bool ErrorsFound()
201	{
202		return fFoundErrors != 0;
203	}
204
205private:
206			uint32				fLevelCount;
207			uint32				fFreeCount;
208			uint32				fNodeSize;
209			uint32				fMaxLevels;
210			uint32				fFoundErrors;
211			BitmapArray			fVisited;
212			BitmapArray			fVisitedFragment;
213			off_t*				fPreviousOffsets;
214};
215
216
217// #pragma mark -
218
219
220// Node Caching for the BPlusTree class
221//
222// With write support, there is the need for a function that allocates new
223// nodes by either returning empty nodes, or by growing the file's data stream
224//
225// !! The CachedNode class assumes that you have properly locked the stream
226// !! before asking for nodes.
227//
228// Note: This code will fail if the block size is smaller than the node size!
229// Since BFS supports block sizes of 1024 bytes or greater, and the node size
230// is hard-coded to 1024 bytes, that's not an issue now.
231
232void
233CachedNode::UnsetUnchanged(Transaction& transaction)
234{
235	if (fTree == NULL || fTree->fStream == NULL)
236		return;
237
238	if (fNode != NULL) {
239		void* cache = fTree->fStream->GetVolume()->BlockCache();
240
241		block_cache_set_dirty(cache, fBlockNumber, false, transaction.ID());
242		block_cache_put(cache, fBlockNumber);
243		fNode = NULL;
244	}
245}
246
247
248void
249CachedNode::Unset()
250{
251	if (fTree == NULL || fTree->fStream == NULL)
252		return;
253
254	if (fNode != NULL) {
255		if (fWritable && fOffset == 0) {
256			// The B+tree header has been updated - we need to update the
257			// BPlusTrees copy of it, as well.
258			memcpy(&fTree->fHeader, fNode, sizeof(bplustree_header));
259		}
260
261		block_cache_put(fTree->fStream->GetVolume()->BlockCache(),
262			fBlockNumber);
263		fNode = NULL;
264	}
265}
266
267
268const bplustree_node*
269CachedNode::SetTo(off_t offset, bool check)
270{
271	const bplustree_node* node;
272	if (SetTo(offset, &node, check) == B_OK)
273		return node;
274
275	return NULL;
276}
277
278
279status_t
280CachedNode::SetTo(off_t offset, const bplustree_node** _node, bool check)
281{
282	if (fTree == NULL || fTree->fStream == NULL)
283		RETURN_ERROR(B_BAD_VALUE);
284
285	Unset();
286
287	// You can only ask for nodes at valid positions - you can't
288	// even access the b+tree header with this method (use SetToHeader()
289	// instead)
290	if (offset > fTree->fHeader.MaximumSize() - fTree->fNodeSize
291		|| offset <= 0
292		|| (offset % fTree->fNodeSize) != 0)
293		RETURN_ERROR(B_BAD_VALUE);
294
295	if (InternalSetTo(NULL, offset) != NULL && check) {
296		// sanity checks (links, all_key_count)
297		if (!fTree->fHeader.CheckNode(fNode)) {
298			FATAL(("invalid node [%p] read from offset %" B_PRIdOFF " (block %"
299				B_PRIdOFF "), inode at %" B_PRIdINO "\n", fNode, offset,
300				fBlockNumber, fTree->fStream->ID()));
301			return B_BAD_DATA;
302		}
303	}
304
305	*_node = fNode;
306	return B_OK;
307}
308
309
310bplustree_node*
311CachedNode::SetToWritable(Transaction& transaction, off_t offset, bool check)
312{
313	if (fTree == NULL || fTree->fStream == NULL) {
314		REPORT_ERROR(B_BAD_VALUE);
315		return NULL;
316	}
317
318	Unset();
319
320	// You can only ask for nodes at valid positions - you can't
321	// even access the b+tree header with this method (use SetToHeader()
322	// instead)
323	if (offset > fTree->fHeader.MaximumSize() - fTree->fNodeSize
324		|| offset <= 0
325		|| (offset % fTree->fNodeSize) != 0)
326		return NULL;
327
328	if (InternalSetTo(&transaction, offset) != NULL && check) {
329		// sanity checks (links, all_key_count)
330		if (!fTree->fHeader.CheckNode(fNode)) {
331			FATAL(("invalid node [%p] read from offset %" B_PRIdOFF " (block %"
332				B_PRIdOFF "), inode at %" B_PRIdINO "\n", fNode, offset,
333				fBlockNumber, fTree->fStream->ID()));
334			return NULL;
335		}
336	}
337	return fNode;
338}
339
340
341bplustree_node*
342CachedNode::MakeWritable(Transaction& transaction)
343{
344	if (fNode == NULL)
345		return NULL;
346
347	if (block_cache_make_writable(transaction.GetVolume()->BlockCache(),
348			fBlockNumber, transaction.ID()) == B_OK)
349		return fNode;
350
351	return NULL;
352}
353
354
355const bplustree_header*
356CachedNode::SetToHeader()
357{
358	if (fTree == NULL || fTree->fStream == NULL) {
359		REPORT_ERROR(B_BAD_VALUE);
360		return NULL;
361	}
362
363	Unset();
364
365	InternalSetTo(NULL, 0LL);
366	return (bplustree_header*)fNode;
367}
368
369
370bplustree_header*
371CachedNode::SetToWritableHeader(Transaction& transaction)
372{
373	if (fTree == NULL || fTree->fStream == NULL) {
374		REPORT_ERROR(B_BAD_VALUE);
375		return NULL;
376	}
377
378	Unset();
379
380	InternalSetTo(&transaction, 0LL);
381
382	if (fNode != NULL && !fTree->fInTransaction) {
383		transaction.AddListener(fTree);
384		fTree->fInTransaction = true;
385
386		if (!transaction.GetVolume()->IsInitializing()) {
387			acquire_vnode(transaction.GetVolume()->FSVolume(),
388				fTree->fStream->ID());
389		}
390	}
391
392	return (bplustree_header*)fNode;
393}
394
395
396bplustree_node*
397CachedNode::InternalSetTo(Transaction* transaction, off_t offset)
398{
399	fNode = NULL;
400	fOffset = offset;
401
402	off_t fileOffset;
403	block_run run;
404	if (offset < fTree->fStream->Size()
405		&& fTree->fStream->FindBlockRun(offset, run, fileOffset) == B_OK) {
406		Volume* volume = fTree->fStream->GetVolume();
407
408		int32 blockOffset = (offset - fileOffset) / volume->BlockSize();
409		fBlockNumber = volume->ToBlock(run) + blockOffset;
410		uint8* block;
411
412		if (transaction != NULL) {
413			block = (uint8*)block_cache_get_writable(volume->BlockCache(),
414				fBlockNumber, transaction->ID());
415			fWritable = true;
416		} else {
417			block = (uint8*)block_cache_get(volume->BlockCache(), fBlockNumber);
418			fWritable = false;
419		}
420
421		if (block != NULL) {
422			// The node is somewhere in that block...
423			// (confusing offset calculation)
424			fNode = (bplustree_node*)(block + offset
425				- (fileOffset + (blockOffset << volume->BlockShift())));
426		} else
427			REPORT_ERROR(B_IO_ERROR);
428	}
429	return fNode;
430}
431
432
433status_t
434CachedNode::Free(Transaction& transaction, off_t offset)
435{
436	if (fTree == NULL || fTree->fStream == NULL || offset == BPLUSTREE_NULL)
437		RETURN_ERROR(B_BAD_VALUE);
438
439	// TODO: scan the free nodes list and remove all nodes at the end
440	// of the tree - perhaps that shouldn't be done everytime that
441	// function is called, perhaps it should be done when the directory
442	// inode is closed or based on some calculation or whatever...
443
444	CachedNode cached(fTree);
445	bplustree_header* header = cached.SetToWritableHeader(transaction);
446	if (header == NULL)
447		return B_IO_ERROR;
448
449#if 0
450	// TODO: temporarily disabled because CheckNode() doesn't like this...
451	// 		Also, it's such an edge case that it's almost useless, anyway.
452	// if the node is the last one in the tree, we shrink
453	// the tree and file size by one node
454	off_t lastOffset = header->MaximumSize() - fTree->fNodeSize;
455	if (offset == lastOffset) {
456		status_t status = fTree->fStream->SetFileSize(transaction, lastOffset);
457		if (status != B_OK)
458			return status;
459
460		header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(lastOffset);
461		return B_OK;
462	}
463#endif
464
465	// add the node to the free nodes list
466	fNode->left_link = header->free_node_pointer;
467	fNode->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_FREE);
468
469	header->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64(offset);
470	return B_OK;
471}
472
473
474status_t
475CachedNode::Allocate(Transaction& transaction, bplustree_node** _node,
476	off_t* _offset)
477{
478	if (fTree == NULL || fTree->fStream == NULL)
479		RETURN_ERROR(B_BAD_VALUE);
480
481	Unset();
482
483	bplustree_header* header;
484	status_t status;
485
486	// if there are any free nodes, recycle them
487	if (SetToWritable(transaction, fTree->fHeader.FreeNode(), false) != NULL) {
488		CachedNode cached(fTree);
489		header = cached.SetToWritableHeader(transaction);
490		if (header == NULL)
491			return B_IO_ERROR;
492
493		*_offset = header->FreeNode();
494		*_node = fNode;
495
496		// set new free node pointer
497		header->free_node_pointer = fNode->left_link;
498		fNode->Initialize();
499		return B_OK;
500	}
501
502	// allocate space for a new node
503	Inode* stream = fTree->fStream;
504	if ((status = stream->Append(transaction, fTree->fNodeSize)) != B_OK)
505		return status;
506
507	CachedNode cached(fTree);
508	header = cached.SetToWritableHeader(transaction);
509	if (header == NULL)
510		return B_IO_ERROR;
511
512	// the maximum_size has to be changed before the call to SetTo() - or
513	// else it will fail because the requested node is out of bounds
514	off_t offset = header->MaximumSize();
515	header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(header->MaximumSize()
516		+ fTree->fNodeSize);
517
518	cached.Unset();
519		// SetToWritable() below needs the new values in the tree's header
520
521	if (SetToWritable(transaction, offset, false) == NULL)
522		RETURN_ERROR(B_ERROR);
523
524	fNode->Initialize();
525
526	*_offset = offset;
527	*_node = fNode;
528	return B_OK;
529}
530
531
532//	#pragma mark -
533
534
535BPlusTree::BPlusTree(Transaction& transaction, Inode* stream, int32 nodeSize)
536	:
537	fStream(NULL),
538	fInTransaction(false)
539{
540	mutex_init(&fIteratorLock, "bfs b+tree iterator");
541	SetTo(transaction, stream);
542}
543
544
545BPlusTree::BPlusTree(Inode* stream)
546	:
547	fStream(NULL),
548	fInTransaction(false)
549{
550	mutex_init(&fIteratorLock, "bfs b+tree iterator");
551	SetTo(stream);
552}
553
554
555BPlusTree::BPlusTree()
556	:
557	fStream(NULL),
558	fNodeSize(BPLUSTREE_NODE_SIZE),
559	fAllowDuplicates(true),
560	fInTransaction(false),
561	fStatus(B_NO_INIT)
562{
563	mutex_init(&fIteratorLock, "bfs b+tree iterator");
564}
565
566
567BPlusTree::~BPlusTree()
568{
569	// if there are any TreeIterators left, we need to stop them
570	// (can happen when the tree's inode gets deleted while
571	// traversing the tree - a TreeIterator doesn't lock the inode)
572	mutex_lock(&fIteratorLock);
573
574	SinglyLinkedList<TreeIterator>::Iterator iterator
575		= fIterators.GetIterator();
576	while (iterator.HasNext())
577		iterator.Next()->Stop();
578
579	mutex_destroy(&fIteratorLock);
580
581	ASSERT(!fInTransaction);
582}
583
584
585/*! Create a new B+Tree on the specified stream */
586status_t
587BPlusTree::SetTo(Transaction& transaction, Inode* stream, int32 nodeSize)
588{
589	// initializes in-memory B+Tree
590
591	fStream = stream;
592
593	CachedNode cached(this);
594	bplustree_header* header = cached.SetToWritableHeader(transaction);
595	if (header == NULL) {
596		// allocate space for new header + node!
597		fStatus = stream->SetFileSize(transaction, nodeSize * 2);
598		if (fStatus != B_OK)
599			RETURN_ERROR(fStatus);
600
601		header = cached.SetToWritableHeader(transaction);
602		if (header == NULL)
603			RETURN_ERROR(fStatus = B_ERROR);
604	}
605
606	fAllowDuplicates = stream->IsIndex()
607		|| (stream->Mode() & S_ALLOW_DUPS) != 0;
608
609	fNodeSize = nodeSize;
610
611	// initialize b+tree header
612 	header->magic = HOST_ENDIAN_TO_BFS_INT32(BPLUSTREE_MAGIC);
613 	header->node_size = HOST_ENDIAN_TO_BFS_INT32(fNodeSize);
614 	header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1);
615 	header->data_type = HOST_ENDIAN_TO_BFS_INT32(ModeToKeyType(stream->Mode()));
616 	header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(nodeSize);
617 	header->free_node_pointer
618 		= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
619 	header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(nodeSize * 2);
620
621	cached.Unset();
622
623	// initialize b+tree root node
624	cached.SetToWritable(transaction, fHeader.RootNode(), false);
625	if (cached.Node() == NULL)
626		RETURN_ERROR(B_IO_ERROR);
627
628	cached.Node()->Initialize();
629
630	return fStatus = B_OK;
631}
632
633
634status_t
635BPlusTree::SetTo(Inode* stream)
636{
637	if (stream == NULL)
638		RETURN_ERROR(fStatus = B_BAD_VALUE);
639
640	fStream = stream;
641
642	// get on-disk B+Tree header
643
644	CachedNode cached(this);
645	const bplustree_header* header = cached.SetToHeader();
646	if (header != NULL)
647		memcpy(&fHeader, header, sizeof(bplustree_header));
648	else
649		RETURN_ERROR(fStatus = B_IO_ERROR);
650
651	// is header valid?
652
653	if (fHeader.MaximumSize() != stream->Size()) {
654		dprintf("B+tree header size %" B_PRIdOFF " doesn't fit file size %"
655			B_PRIdOFF "!\n", fHeader.MaximumSize(), stream->Size());
656		// we can't change the header since we don't have a transaction
657		//fHeader.maximum_size = HOST_ENDIAN_TO_BFS_INT64(stream->Size());
658	}
659	if (!fHeader.IsValid()) {
660#ifdef DEBUG
661		dump_bplustree_header(&fHeader);
662		dump_block((const char*)&fHeader, 128);
663#endif
664		RETURN_ERROR(fStatus = B_BAD_DATA);
665	}
666
667	fNodeSize = fHeader.NodeSize();
668
669	// validity check
670	static const uint32 kToMode[] = {S_STR_INDEX, S_INT_INDEX, S_UINT_INDEX,
671		S_LONG_LONG_INDEX, S_ULONG_LONG_INDEX, S_FLOAT_INDEX,
672		S_DOUBLE_INDEX};
673	uint32 mode = stream->Mode() & (S_STR_INDEX | S_INT_INDEX
674		| S_UINT_INDEX | S_LONG_LONG_INDEX | S_ULONG_LONG_INDEX
675		| S_FLOAT_INDEX | S_DOUBLE_INDEX);
676
677	if (fHeader.DataType() > BPLUSTREE_DOUBLE_TYPE
678		|| ((stream->Mode() & S_INDEX_DIR) != 0
679			&& kToMode[fHeader.DataType()] != mode)
680		|| !stream->IsContainer()) {
681		D(	dump_bplustree_header(&fHeader);
682			dump_inode(&stream->Node());
683		);
684		RETURN_ERROR(fStatus = B_BAD_TYPE);
685	}
686
687	// although it's in stat.h, the S_ALLOW_DUPS flag is obviously unused
688	// in the original BFS code - we will honour it nevertheless
689	fAllowDuplicates = stream->IsIndex()
690		|| (stream->Mode() & S_ALLOW_DUPS) != 0;
691
692	cached.SetTo(fHeader.RootNode());
693	RETURN_ERROR(fStatus = cached.Node() ? B_OK : B_BAD_DATA);
694}
695
696
697status_t
698BPlusTree::InitCheck()
699{
700	return fStatus;
701}
702
703
704status_t
705BPlusTree::Validate(bool repair, bool& _errorsFound)
706{
707	TreeCheck check(this);
708	if (check.InitCheck() != B_OK)
709		return B_NO_MEMORY;
710
711	check.SetVisited(0);
712
713	// Walk the free nodes
714
715	CachedNode cached(this);
716	off_t freeOffset = fHeader.FreeNode();
717	while (freeOffset > 0) {
718		const bplustree_node* node;
719		status_t status = cached.SetTo(freeOffset, &node, false);
720		if (status != B_OK) {
721			if (status == B_IO_ERROR)
722				return B_IO_ERROR;
723
724			dprintf("inode %" B_PRIdOFF ": free node at %" B_PRIdOFF " could "
725				"not be read: %s\n", fStream->ID(), freeOffset,
726				strerror(status));
727			check.FoundError();
728			break;
729		}
730
731		if (check.Visited(freeOffset)) {
732			dprintf("inode %" B_PRIdOFF ": free node at %" B_PRIdOFF
733				" circular!\n", fStream->ID(), freeOffset);
734			// TODO: if 'repair' is true, we could collect all unvisited nodes
735			// at the end, and put the into the free list
736			check.FoundError();
737			break;
738		}
739
740		check.SetVisited(freeOffset);
741
742		if (node->OverflowLink() != BPLUSTREE_FREE) {
743			dprintf("inode %" B_PRIdOFF ": free node at %" B_PRIdOFF
744				" misses free mark!\n", fStream->ID(), freeOffset);
745		}
746		freeOffset = node->LeftLink();
747	}
748
749	// Iterate over the complete tree recursively
750
751	const bplustree_node* root;
752	status_t status = cached.SetTo(fHeader.RootNode(), &root, true);
753	if (status != B_OK)
754		return status;
755
756	status = _ValidateChildren(check, 0, fHeader.RootNode(), NULL, 0, root);
757
758	if (check.ErrorsFound())
759		_errorsFound = true;
760
761	if (status != B_OK)
762		return status;
763
764	if (check.MaxLevels() + 1 != fHeader.MaxNumberOfLevels()) {
765		dprintf("inode %" B_PRIdOFF ": found %" B_PRIu32 " max levels, "
766			"declared %" B_PRIu32 "!\n", fStream->ID(), check.MaxLevels(),
767			fHeader.MaxNumberOfLevels());
768	}
769
770	if ((off_t)check.VisitedCount() != fHeader.MaximumSize() / fNodeSize) {
771		dprintf("inode %" B_PRIdOFF ": visited %" B_PRIuSIZE " from %" B_PRIdOFF
772			" nodes.\n", fStream->ID(), check.VisitedCount(),
773			fHeader.MaximumSize() / fNodeSize);
774	}
775
776	return B_OK;
777}
778
779
780status_t
781BPlusTree::MakeEmpty()
782{
783	// Put all nodes into the free list in order
784	Transaction transaction(fStream->GetVolume(), fStream->BlockNumber());
785
786	// Reset the header, and root node
787	CachedNode cached(this);
788	bplustree_header* header = cached.SetToWritableHeader(transaction);
789	if (header == NULL)
790		return B_IO_ERROR;
791
792	header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1);
793	header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(NodeSize());
794	if (fStream->Size() > (off_t)NodeSize() * 2)
795		header->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64(2 * NodeSize());
796	else {
797		header->free_node_pointer
798			= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
799	}
800
801	bplustree_node* node = cached.SetToWritable(transaction, NodeSize(), false);
802	if (node == NULL)
803		return B_IO_ERROR;
804
805	node->left_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
806	node->right_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
807	node->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
808	node->all_key_count = 0;
809	node->all_key_length = 0;
810
811	for (off_t offset = 2 * NodeSize(); offset < fStream->Size();
812			offset += NodeSize()) {
813		bplustree_node* node = cached.SetToWritable(transaction, offset, false);
814		if (node == NULL) {
815			dprintf("--> could not open %" B_PRIdOFF "\n", offset);
816			return B_IO_ERROR;
817		}
818		if (offset < fStream->Size() - (off_t)NodeSize())
819			node->left_link = HOST_ENDIAN_TO_BFS_INT64(offset + NodeSize());
820		else
821			node->left_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
822
823		node->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_FREE);
824
825		// It's not important to write it out in a single transaction; just
826		// make sure it doesn't get too large
827		if (offset % (1024 * 1024) == 0) {
828			transaction.Done();
829			transaction.Start(fStream->GetVolume(), fStream->BlockNumber());
830		}
831	}
832
833	return transaction.Done();
834}
835
836
837int32
838BPlusTree::TypeCodeToKeyType(type_code code)
839{
840	switch (code) {
841		case B_STRING_TYPE:
842			return BPLUSTREE_STRING_TYPE;
843		case B_SSIZE_T_TYPE:
844		case B_INT32_TYPE:
845			return BPLUSTREE_INT32_TYPE;
846		case B_SIZE_T_TYPE:
847		case B_UINT32_TYPE:
848			return BPLUSTREE_UINT32_TYPE;
849		case B_OFF_T_TYPE:
850		case B_INT64_TYPE:
851			return BPLUSTREE_INT64_TYPE;
852		case B_UINT64_TYPE:
853			return BPLUSTREE_UINT64_TYPE;
854		case B_FLOAT_TYPE:
855			return BPLUSTREE_FLOAT_TYPE;
856		case B_DOUBLE_TYPE:
857			return BPLUSTREE_DOUBLE_TYPE;
858	}
859	return -1;
860}
861
862
863int32
864BPlusTree::ModeToKeyType(mode_t mode)
865{
866	switch (mode & (S_STR_INDEX | S_INT_INDEX | S_UINT_INDEX | S_LONG_LONG_INDEX
867			| S_ULONG_LONG_INDEX | S_FLOAT_INDEX | S_DOUBLE_INDEX)) {
868		case S_INT_INDEX:
869			return BPLUSTREE_INT32_TYPE;
870		case S_UINT_INDEX:
871			return BPLUSTREE_UINT32_TYPE;
872		case S_LONG_LONG_INDEX:
873			return BPLUSTREE_INT64_TYPE;
874		case S_ULONG_LONG_INDEX:
875			return BPLUSTREE_UINT64_TYPE;
876		case S_FLOAT_INDEX:
877			return BPLUSTREE_FLOAT_TYPE;
878		case S_DOUBLE_INDEX:
879			return BPLUSTREE_DOUBLE_TYPE;
880		case S_STR_INDEX:
881		default:
882			// default is for standard directories
883			return BPLUSTREE_STRING_TYPE;
884	}
885}
886
887
888//	#pragma mark - TransactionListener implementation
889
890
891void
892BPlusTree::TransactionDone(bool success)
893{
894	if (!success) {
895		// update header from disk
896		CachedNode cached(this);
897		const bplustree_header* header = cached.SetToHeader();
898		if (header != NULL)
899			memcpy(&fHeader, header, sizeof(bplustree_header));
900	}
901}
902
903
904void
905BPlusTree::RemovedFromTransaction()
906{
907	fInTransaction = false;
908
909	if (!fStream->GetVolume()->IsInitializing())
910		put_vnode(fStream->GetVolume()->FSVolume(), fStream->ID());
911}
912
913
914//	#pragma mark -
915
916
917void
918BPlusTree::_UpdateIterators(off_t offset, off_t nextOffset, uint16 keyIndex,
919	uint16 splitAt, int8 change)
920{
921	// Although every iterator which is affected by this update currently
922	// waits on a semaphore, other iterators could be added/removed at
923	// any time, so we need to protect this loop
924	MutexLocker _(fIteratorLock);
925
926	SinglyLinkedList<TreeIterator>::Iterator iterator
927		= fIterators.GetIterator();
928	while (iterator.HasNext())
929		iterator.Next()->Update(offset, nextOffset, keyIndex, splitAt, change);
930}
931
932
933void
934BPlusTree::_AddIterator(TreeIterator* iterator)
935{
936	MutexLocker _(fIteratorLock);
937	fIterators.Add(iterator);
938}
939
940
941void
942BPlusTree::_RemoveIterator(TreeIterator* iterator)
943{
944	MutexLocker _(fIteratorLock);
945	fIterators.Remove(iterator);
946}
947
948
949int32
950BPlusTree::_CompareKeys(const void* key1, int keyLength1, const void* key2,
951	int keyLength2)
952{
953	type_code type = 0;
954	switch (fHeader.DataType()) {
955	    case BPLUSTREE_STRING_TYPE:
956	    	type = B_STRING_TYPE;
957	    	break;
958		case BPLUSTREE_INT32_TYPE:
959	    	type = B_INT32_TYPE;
960	    	break;
961		case BPLUSTREE_UINT32_TYPE:
962	    	type = B_UINT32_TYPE;
963	    	break;
964		case BPLUSTREE_INT64_TYPE:
965	    	type = B_INT64_TYPE;
966	    	break;
967		case BPLUSTREE_UINT64_TYPE:
968	    	type = B_UINT64_TYPE;
969	    	break;
970		case BPLUSTREE_FLOAT_TYPE:
971	    	type = B_FLOAT_TYPE;
972	    	break;
973		case BPLUSTREE_DOUBLE_TYPE:
974	    	type = B_DOUBLE_TYPE;
975	    	break;
976	}
977   	return compareKeys(type, key1, keyLength1, key2, keyLength2);
978}
979
980
981status_t
982BPlusTree::_FindKey(const bplustree_node* node, const uint8* key,
983	uint16 keyLength, uint16* _index, off_t* _next)
984{
985#ifdef DEBUG
986	NodeChecker checker(node, fNodeSize, "find");
987#endif
988
989	if (node->all_key_count == 0) {
990		if (_index)
991			*_index = 0;
992		if (_next)
993			*_next = node->OverflowLink();
994		return B_ENTRY_NOT_FOUND;
995	}
996
997	off_t* values = node->Values();
998	int16 saveIndex = -1;
999
1000	// binary search in the key array
1001	for (int16 first = 0, last = node->NumKeys() - 1; first <= last;) {
1002		uint16 i = (first + last) >> 1;
1003
1004		uint16 searchLength;
1005		uint8* searchKey = node->KeyAt(i, &searchLength);
1006		if (searchKey + searchLength + sizeof(off_t) + sizeof(uint16)
1007				> (uint8*)node + fNodeSize
1008			|| searchLength > BPLUSTREE_MAX_KEY_LENGTH) {
1009			fStream->GetVolume()->Panic();
1010			RETURN_ERROR(B_BAD_DATA);
1011		}
1012
1013		int32 cmp = _CompareKeys(key, keyLength, searchKey, searchLength);
1014		if (cmp < 0) {
1015			last = i - 1;
1016			saveIndex = i;
1017		} else if (cmp > 0) {
1018			saveIndex = first = i + 1;
1019		} else {
1020			if (_index)
1021				*_index = i;
1022			if (_next)
1023				*_next = BFS_ENDIAN_TO_HOST_INT64(values[i]);
1024			return B_OK;
1025		}
1026	}
1027
1028	if (_index)
1029		*_index = saveIndex;
1030	if (_next) {
1031		if (saveIndex == node->NumKeys())
1032			*_next = node->OverflowLink();
1033		else
1034			*_next = BFS_ENDIAN_TO_HOST_INT64(values[saveIndex]);
1035	}
1036	return B_ENTRY_NOT_FOUND;
1037}
1038
1039
1040/*!	Prepares the stack to contain all nodes that were passed while
1041	following the key, from the root node to the leaf node that could
1042	or should contain that key.
1043*/
1044status_t
1045BPlusTree::_SeekDown(Stack<node_and_key>& stack, const uint8* key,
1046	uint16 keyLength)
1047{
1048	// set the root node to begin with
1049	node_and_key nodeAndKey;
1050	nodeAndKey.nodeOffset = fHeader.RootNode();
1051
1052	CachedNode cached(this);
1053	const bplustree_node* node;
1054	while ((node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
1055		// if we are already on leaf level, we're done
1056		if (node->OverflowLink() == BPLUSTREE_NULL) {
1057			// node that the keyIndex is not properly set here (but it's not
1058			// needed in the calling functions anyway)!
1059			nodeAndKey.keyIndex = 0;
1060			stack.Push(nodeAndKey);
1061			return B_OK;
1062		}
1063
1064		off_t nextOffset;
1065		status_t status = _FindKey(node, key, keyLength, &nodeAndKey.keyIndex,
1066			&nextOffset);
1067
1068		if (status == B_ENTRY_NOT_FOUND && nextOffset == nodeAndKey.nodeOffset)
1069			RETURN_ERROR(B_ERROR);
1070
1071		if ((uint32)stack.CountItems() > fHeader.MaxNumberOfLevels()) {
1072			dprintf("BPlusTree::_SeekDown() node walked too deep.\n");
1073			break;
1074		}
1075
1076		// put the node offset & the correct keyIndex on the stack
1077		stack.Push(nodeAndKey);
1078
1079		nodeAndKey.nodeOffset = nextOffset;
1080	}
1081
1082	FATAL(("BPlusTree::_SeekDown() could not open node %" B_PRIdOFF ", inode %"
1083		B_PRIdOFF "\n", nodeAndKey.nodeOffset, fStream->ID()));
1084	return B_ERROR;
1085}
1086
1087
1088/*!	This will find a free duplicate fragment in the given bplustree_node.
1089	The CachedNode will be set to the writable fragment on success.
1090*/
1091status_t
1092BPlusTree::_FindFreeDuplicateFragment(Transaction& transaction,
1093	const bplustree_node* node, CachedNode& cached,
1094	off_t* _offset, bplustree_node** _fragment, uint32* _index)
1095{
1096	off_t* values = node->Values();
1097	for (int32 i = 0; i < node->NumKeys(); i++) {
1098		off_t value = BFS_ENDIAN_TO_HOST_INT64(values[i]);
1099
1100		// does the value link to a duplicate fragment?
1101		if (bplustree_node::LinkType(value) != BPLUSTREE_DUPLICATE_FRAGMENT)
1102			continue;
1103
1104		const bplustree_node* fragment = cached.SetTo(
1105			bplustree_node::FragmentOffset(value), false);
1106		if (fragment == NULL) {
1107			FATAL(("Could not get duplicate fragment at %" B_PRIdOFF ", inode %"
1108				B_PRIdOFF "\n", value, fStream->ID()));
1109			continue;
1110		}
1111
1112		// see if there is some space left for us
1113		uint32 num = bplustree_node::MaxFragments(fNodeSize);
1114		for (uint32 j = 0; j < num; j++) {
1115			duplicate_array* array = fragment->FragmentAt(j);
1116
1117			if (array->IsEmpty()) {
1118				// found an unused fragment
1119				*_fragment = cached.MakeWritable(transaction);
1120				if (*_fragment == NULL)
1121					return B_IO_ERROR;
1122
1123				*_offset = bplustree_node::FragmentOffset(value);
1124				*_index = j;
1125				return B_OK;
1126			}
1127		}
1128	}
1129	return B_ENTRY_NOT_FOUND;
1130}
1131
1132
1133status_t
1134BPlusTree::_InsertDuplicate(Transaction& transaction, CachedNode& cached,
1135	const bplustree_node* node, uint16 index, off_t value)
1136{
1137	CachedNode cachedDuplicate(this);
1138	off_t* values = node->Values();
1139	off_t oldValue = BFS_ENDIAN_TO_HOST_INT64(values[index]);
1140	status_t status;
1141	off_t offset;
1142
1143	if (bplustree_node::IsDuplicate(oldValue)) {
1144		// If it's a duplicate fragment, try to insert it into that, or if it
1145		// doesn't fit anymore, create a new duplicate node
1146
1147		if (bplustree_node::LinkType(oldValue)
1148				== BPLUSTREE_DUPLICATE_FRAGMENT) {
1149			bplustree_node* duplicate = cachedDuplicate.SetToWritable(
1150				transaction, bplustree_node::FragmentOffset(oldValue), false);
1151			if (duplicate == NULL)
1152				return B_IO_ERROR;
1153
1154			duplicate_array* array = duplicate->FragmentAt(
1155				bplustree_node::FragmentIndex(oldValue));
1156			int32 arrayCount = array->Count();
1157			if (arrayCount > NUM_FRAGMENT_VALUES || arrayCount < 1) {
1158				FATAL(("_InsertDuplicate: Invalid array[%d] size in fragment "
1159					"%" B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n",
1160					(int)bplustree_node::FragmentIndex(oldValue),
1161					bplustree_node::FragmentOffset(oldValue), arrayCount,
1162					fStream->ID()));
1163				return B_BAD_DATA;
1164			}
1165
1166			if (arrayCount < NUM_FRAGMENT_VALUES) {
1167				array->Insert(value);
1168			} else {
1169				// Test if the fragment will be empty if we remove this key's
1170				// values
1171				if (duplicate->FragmentsUsed(fNodeSize) < 2) {
1172					// The node will be empty without our values, so let us
1173					// reuse it as a duplicate node
1174					offset = bplustree_node::FragmentOffset(oldValue);
1175
1176					memmove(duplicate->DuplicateArray(), array,
1177						(NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
1178					duplicate->left_link = duplicate->right_link
1179						= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
1180
1181					array = duplicate->DuplicateArray();
1182					array->Insert(value);
1183				} else {
1184					// Create a new duplicate node
1185
1186					cachedDuplicate.UnsetUnchanged(transaction);
1187						// The old duplicate has not been touched, so we can
1188						// reuse it
1189
1190					bplustree_node* newDuplicate;
1191					status = cachedDuplicate.Allocate(transaction,
1192						&newDuplicate, &offset);
1193					if (status != B_OK)
1194						RETURN_ERROR(status);
1195
1196					// Copy the array from the fragment node to the duplicate
1197					// node and free the old entry (by zero'ing all values)
1198					newDuplicate->overflow_link = array->count;
1199					memcpy(&newDuplicate->all_key_count, &array->values[0],
1200						array->Count() * sizeof(off_t));
1201					memset(array, 0, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
1202
1203					array = newDuplicate->DuplicateArray();
1204					array->Insert(value);
1205				}
1206
1207				// Update the main pointer to link to a duplicate node
1208				if (cached.MakeWritable(transaction) == NULL)
1209					return B_IO_ERROR;
1210
1211				values[index]
1212					= HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
1213						BPLUSTREE_DUPLICATE_NODE, offset));
1214			}
1215
1216			return B_OK;
1217		}
1218
1219		// Put the value into a dedicated duplicate node
1220
1221		// search for free space in the duplicate nodes of that key
1222		duplicate_array* array;
1223		int32 arrayCount;
1224		const bplustree_node* duplicate;
1225		off_t duplicateOffset;
1226		do {
1227			duplicateOffset = bplustree_node::FragmentOffset(oldValue);
1228			duplicate = cachedDuplicate.SetTo(duplicateOffset, false);
1229			if (duplicate == NULL)
1230				return B_IO_ERROR;
1231
1232			array = duplicate->DuplicateArray();
1233			arrayCount =array->Count();
1234			if (arrayCount > NUM_DUPLICATE_VALUES || arrayCount < 0) {
1235				FATAL(("_InsertDuplicate: Invalid array size in duplicate %"
1236					B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n",
1237					duplicateOffset, arrayCount, fStream->ID()));
1238				return B_BAD_DATA;
1239			}
1240		} while (arrayCount >= NUM_DUPLICATE_VALUES
1241				&& (oldValue = duplicate->RightLink()) != BPLUSTREE_NULL);
1242
1243		bplustree_node* writableDuplicate
1244			= cachedDuplicate.MakeWritable(transaction);
1245		if (writableDuplicate == NULL)
1246			return B_IO_ERROR;
1247
1248		if (arrayCount < NUM_DUPLICATE_VALUES) {
1249			array = writableDuplicate->DuplicateArray();
1250			array->Insert(value);
1251		} else {
1252			// no space left - add a new duplicate node
1253
1254			bplustree_node* newDuplicate;
1255			status = cachedDuplicate.Allocate(transaction, &newDuplicate,
1256				&offset);
1257			if (status != B_OK)
1258				RETURN_ERROR(status);
1259
1260			// link the two nodes together
1261			writableDuplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(offset);
1262			newDuplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(duplicateOffset);
1263
1264			array = newDuplicate->DuplicateArray();
1265			array->count = 0;
1266			array->Insert(value);
1267		}
1268		return B_OK;
1269	}
1270
1271	// Search for a free duplicate fragment or create a new one
1272	// to insert the duplicate value into
1273
1274	uint32 fragmentIndex = 0;
1275	bplustree_node* fragment;
1276	if (_FindFreeDuplicateFragment(transaction, node, cachedDuplicate,
1277			&offset, &fragment, &fragmentIndex) != B_OK) {
1278		// allocate a new duplicate fragment node
1279		status = cachedDuplicate.Allocate(transaction, &fragment, &offset);
1280		if (status != B_OK)
1281			RETURN_ERROR(status);
1282
1283		memset(fragment, 0, fNodeSize);
1284	}
1285
1286	duplicate_array* array = fragment->FragmentAt(fragmentIndex);
1287	array->Insert(oldValue);
1288	array->Insert(value);
1289
1290	if (cached.MakeWritable(transaction) == NULL)
1291		return B_IO_ERROR;
1292
1293	values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
1294		BPLUSTREE_DUPLICATE_FRAGMENT, offset, fragmentIndex));
1295
1296	return B_OK;
1297}
1298
1299
1300void
1301BPlusTree::_InsertKey(bplustree_node* node, uint16 index, uint8* key,
1302	uint16 keyLength, off_t value)
1303{
1304	// should never happen, but who knows?
1305	if (index > node->NumKeys())
1306		return;
1307
1308	off_t* values = node->Values();
1309	uint16* keyLengths = node->KeyLengths();
1310	uint8* keys = node->Keys();
1311
1312	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() + 1);
1313	node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(node->AllKeyLength()
1314		+ keyLength);
1315
1316	off_t* newValues = node->Values();
1317	uint16* newKeyLengths = node->KeyLengths();
1318
1319	// move values and copy new value into them
1320	memmove(newValues + index + 1, values + index,
1321		sizeof(off_t) * (node->NumKeys() - 1 - index));
1322	memmove(newValues, values, sizeof(off_t) * index);
1323
1324	newValues[index] = HOST_ENDIAN_TO_BFS_INT64(value);
1325
1326	// move and update key length index
1327	for (uint16 i = node->NumKeys(); i-- > index + 1;) {
1328		newKeyLengths[i] = HOST_ENDIAN_TO_BFS_INT16(
1329			BFS_ENDIAN_TO_HOST_INT16(keyLengths[i - 1]) + keyLength);
1330	}
1331	memmove(newKeyLengths, keyLengths, sizeof(uint16) * index);
1332
1333	int32 keyStart;
1334	newKeyLengths[index] = HOST_ENDIAN_TO_BFS_INT16(keyLength
1335		+ (keyStart = index > 0
1336			? BFS_ENDIAN_TO_HOST_INT16(newKeyLengths[index - 1]) : 0));
1337
1338	// move keys and copy new key into them
1339	uint16 length = BFS_ENDIAN_TO_HOST_INT16(newKeyLengths[index]);
1340	int32 size = node->AllKeyLength() - length;
1341	if (size > 0)
1342		memmove(keys + length, keys + length - keyLength, size);
1343
1344	memcpy(keys + keyStart, key, keyLength);
1345}
1346
1347
1348/*!	Splits the \a node into two halves - the other half will be put into
1349	\a other. It also takes care to create a new overflow link if the node
1350	to split is an index node.
1351*/
1352status_t
1353BPlusTree::_SplitNode(bplustree_node* node, off_t nodeOffset,
1354	bplustree_node* other, off_t otherOffset, uint16* _keyIndex, uint8* key,
1355	uint16* _keyLength, off_t* _value)
1356{
1357	if (*_keyIndex > node->NumKeys() + 1)
1358		return B_BAD_VALUE;
1359
1360	uint16* inKeyLengths = node->KeyLengths();
1361	off_t* inKeyValues = node->Values();
1362	uint8* inKeys = node->Keys();
1363	uint8* outKeys = other->Keys();
1364	int32 keyIndex = *_keyIndex;	// can become less than zero!
1365
1366	if (keyIndex > node->NumKeys()) {
1367		FATAL(("key index out of bounds: %d, num keys: %u, inode %" B_PRIdOFF
1368			"\n", (int)keyIndex, node->NumKeys(), fStream->ID()));
1369		return B_BAD_VALUE;
1370	}
1371
1372	// how many keys will fit in one (half) page?
1373	// that loop will find the answer to this question and
1374	// change the key lengths indices for their new home
1375
1376	// "bytes" is the number of bytes written for the new key,
1377	// "bytesBefore" are the bytes before that key
1378	// "bytesAfter" are the bytes after the new key, if any
1379	int32 bytes = 0, bytesBefore = 0, bytesAfter = 0;
1380
1381	size_t size = fNodeSize >> 1;
1382	int32 out, in;
1383	for (in = out = 0; in < node->NumKeys() + 1;) {
1384		if (!bytes) {
1385			bytesBefore = in > 0
1386				? BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]) : 0;
1387		}
1388
1389		if (in == keyIndex && !bytes) {
1390			bytes = *_keyLength;
1391		} else {
1392			if (keyIndex < out) {
1393				bytesAfter = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in])
1394					- bytesBefore;
1395			}
1396
1397			in++;
1398		}
1399		out++;
1400
1401		if (key_align(sizeof(bplustree_node) + bytesBefore + bytesAfter + bytes)
1402				+ out * (sizeof(uint16) + sizeof(off_t)) >= size) {
1403			// we have found the number of keys in the new node!
1404			break;
1405		}
1406	}
1407
1408	// if the new key was not inserted, set the length of the keys
1409	// that can be copied directly
1410	if (keyIndex >= out && in > 0)
1411		bytesBefore = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]);
1412
1413	if (bytesBefore < 0 || bytesAfter < 0)
1414		return B_BAD_DATA;
1415
1416	other->left_link = node->left_link;
1417	other->right_link = HOST_ENDIAN_TO_BFS_INT64(nodeOffset);
1418	other->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore
1419		+ bytesAfter);
1420	other->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out);
1421
1422	uint16* outKeyLengths = other->KeyLengths();
1423	off_t* outKeyValues = other->Values();
1424	int32 keys = out > keyIndex ? keyIndex : out;
1425
1426	if (bytesBefore) {
1427		// copy the keys
1428		memcpy(outKeys, inKeys, bytesBefore);
1429		memcpy(outKeyLengths, inKeyLengths, keys * sizeof(uint16));
1430		memcpy(outKeyValues, inKeyValues, keys * sizeof(off_t));
1431	}
1432	if (bytes) {
1433		// copy the newly inserted key
1434		memcpy(outKeys + bytesBefore, key, bytes);
1435		outKeyLengths[keyIndex] = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore);
1436		outKeyValues[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(*_value);
1437
1438		if (bytesAfter) {
1439			// copy the keys after the new key
1440			memcpy(outKeys + bytesBefore + bytes, inKeys + bytesBefore,
1441				bytesAfter);
1442			keys = out - keyIndex - 1;
1443			for (int32 i = 0;i < keys;i++) {
1444				outKeyLengths[keyIndex + i + 1] = HOST_ENDIAN_TO_BFS_INT16(
1445					BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[keyIndex + i])
1446						+ bytes);
1447			}
1448			memcpy(outKeyValues + keyIndex + 1, inKeyValues + keyIndex,
1449				keys * sizeof(off_t));
1450		}
1451	}
1452
1453	// if the new key was already inserted, we shouldn't use it again
1454	if (in != out)
1455		keyIndex--;
1456
1457	int32 total = bytesBefore + bytesAfter;
1458
1459	// these variables are for the key that will be returned
1460	// to the parent node
1461	uint8* newKey = NULL;
1462	uint16 newLength;
1463	bool newAllocated = false;
1464
1465	// If we have split an index node, we have to drop the first key
1466	// of the next node (which can also be the new key to insert).
1467	// The dropped key is also the one which has to be inserted in
1468	// the parent node, so we will set the "newKey" already here.
1469	if (node->OverflowLink() != BPLUSTREE_NULL) {
1470		if (in == keyIndex) {
1471			newKey = key;
1472			newLength = *_keyLength;
1473
1474			other->overflow_link = HOST_ENDIAN_TO_BFS_INT64(*_value);
1475			keyIndex--;
1476		} else {
1477			// If a key is dropped (is not the new key), we have to copy
1478			// it, because it would be lost if not.
1479			uint8* droppedKey = node->KeyAt(in, &newLength);
1480			if (droppedKey + newLength + sizeof(off_t) + sizeof(uint16)
1481					> (uint8*)node + fNodeSize
1482				|| newLength > BPLUSTREE_MAX_KEY_LENGTH) {
1483				fStream->GetVolume()->Panic();
1484				RETURN_ERROR(B_BAD_DATA);
1485			}
1486			newKey = (uint8*)malloc(newLength);
1487			if (newKey == NULL)
1488				return B_NO_MEMORY;
1489
1490			newAllocated = true;
1491			memcpy(newKey, droppedKey, newLength);
1492
1493			other->overflow_link = inKeyValues[in];
1494			total = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in++]);
1495		}
1496	}
1497
1498	// and now the same game for the other page and the rest of the keys
1499	// (but with memmove() instead of memcpy(), because they may overlap)
1500
1501	bytesBefore = bytesAfter = bytes = 0;
1502	out = 0;
1503	int32 skip = in;
1504	while (in < node->NumKeys() + 1) {
1505		if (in == keyIndex && !bytes) {
1506			// it's enough to set bytesBefore once here, because we do
1507			// not need to know the exact length of all keys in this
1508			// loop
1509			bytesBefore = in > skip
1510				? BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]) : 0;
1511			bytes = *_keyLength;
1512			out++;
1513		} else {
1514			if (in < node->NumKeys()) {
1515				inKeyLengths[in] = HOST_ENDIAN_TO_BFS_INT16(
1516					BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) - total);
1517
1518				if (bytes) {
1519					inKeyLengths[in] = HOST_ENDIAN_TO_BFS_INT16(
1520						BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) + bytes);
1521
1522					bytesAfter = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in])
1523						- bytesBefore - bytes;
1524				}
1525				out++;
1526			}
1527			in++;
1528		}
1529	}
1530
1531	// adjust the byte counts (since we were a bit lazy in the loop)
1532	if (keyIndex < skip)
1533		bytesBefore = node->AllKeyLength() - total;
1534
1535	if (bytesBefore < 0 || bytesAfter < 0) {
1536		if (newAllocated)
1537			free(newKey);
1538		return B_BAD_DATA;
1539	}
1540
1541	node->left_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset);
1542		// right link, and overflow link can stay the same
1543	node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore
1544		+ bytesAfter);
1545	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out);
1546
1547	// array positions have changed
1548	outKeyLengths = node->KeyLengths();
1549	outKeyValues = node->Values();
1550
1551	// move the keys in the old node: the order is important here,
1552	// because we don't want to overwrite any contents
1553
1554	keys = keyIndex <= skip ? out : keyIndex - skip;
1555	keyIndex -= skip;
1556	in = out - keyIndex - 1;
1557		// Note: keyIndex and in will contain invalid values when the new key
1558		// went to the other node. But in this case bytes and bytesAfter are
1559		// 0 and subsequently we never use keyIndex and in.
1560
1561	if (bytesBefore)
1562		memmove(inKeys, inKeys + total, bytesBefore);
1563	if (bytesAfter) {
1564		memmove(inKeys + bytesBefore + bytes, inKeys + total + bytesBefore,
1565			bytesAfter);
1566	}
1567
1568	if (bytesBefore)
1569		memmove(outKeyLengths, inKeyLengths + skip, keys * sizeof(uint16));
1570	if (bytesAfter) {
1571		// if byteAfter is > 0, keyIndex is larger than skip
1572		memmove(outKeyLengths + keyIndex + 1, inKeyLengths + skip + keyIndex,
1573			in * sizeof(uint16));
1574	}
1575
1576	if (bytesBefore)
1577		memmove(outKeyValues, inKeyValues + skip, keys * sizeof(off_t));
1578	if (bytesAfter) {
1579		memmove(outKeyValues + keyIndex + 1, inKeyValues + skip + keyIndex,
1580			in * sizeof(off_t));
1581	}
1582
1583	if (bytes) {
1584		// finally, copy the newly inserted key (don't overwrite anything)
1585		memcpy(inKeys + bytesBefore, key, bytes);
1586		outKeyLengths[keyIndex] = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore);
1587		outKeyValues[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(*_value);
1588	}
1589
1590	// Prepare the key that will be inserted in the parent node which
1591	// is either the dropped key or the last of the other node.
1592	// If it's the dropped key, "newKey" was already set earlier.
1593
1594	if (newKey == NULL)
1595		newKey = other->KeyAt(other->NumKeys() - 1, &newLength);
1596
1597	memcpy(key, newKey, newLength);
1598	*_keyLength = newLength;
1599	*_value = otherOffset;
1600
1601	if (newAllocated)
1602		free(newKey);
1603
1604	return B_OK;
1605}
1606
1607
1608/*!	This inserts a key into the tree. The changes made to the tree will
1609	all be part of the \a transaction.
1610	You need to have the inode write locked.
1611*/
1612status_t
1613BPlusTree::Insert(Transaction& transaction, const uint8* key, uint16 keyLength,
1614	off_t value)
1615{
1616	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
1617		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH)
1618		RETURN_ERROR(B_BAD_VALUE);
1619#ifdef DEBUG
1620	if (value < 0)
1621		panic("tried to insert invalid value %Ld!\n", value);
1622#endif
1623
1624	ASSERT_WRITE_LOCKED_INODE(fStream);
1625
1626	Stack<node_and_key> stack;
1627	if (_SeekDown(stack, key, keyLength) != B_OK)
1628		RETURN_ERROR(B_ERROR);
1629
1630	uint8 keyBuffer[BPLUSTREE_MAX_KEY_LENGTH + 1];
1631
1632	memcpy(keyBuffer, key, keyLength);
1633	keyBuffer[keyLength] = 0;
1634
1635	node_and_key nodeAndKey;
1636	const bplustree_node* node;
1637
1638	CachedNode cached(this);
1639	while (stack.Pop(&nodeAndKey)
1640		&& (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
1641#ifdef DEBUG
1642		NodeChecker checker(node, fNodeSize, "insert");
1643#endif
1644		if (node->IsLeaf()) {
1645			// first round, check for duplicate entries
1646			status_t status = _FindKey(node, key, keyLength,
1647				&nodeAndKey.keyIndex);
1648
1649			// is this a duplicate entry?
1650			if (status == B_OK) {
1651				if (fAllowDuplicates) {
1652					status = _InsertDuplicate(transaction, cached, node,
1653						nodeAndKey.keyIndex, value);
1654					if (status != B_OK)
1655						RETURN_ERROR(status);
1656					return B_OK;
1657				}
1658
1659				return B_NAME_IN_USE;
1660			}
1661		}
1662
1663		bplustree_node* writableNode = cached.MakeWritable(transaction);
1664		if (writableNode == NULL)
1665			return B_IO_ERROR;
1666
1667		// is the node big enough to hold the pair?
1668		if (int32(key_align(sizeof(bplustree_node)
1669				+ writableNode->AllKeyLength() + keyLength)
1670				+ (writableNode->NumKeys() + 1) * (sizeof(uint16)
1671				+ sizeof(off_t))) < fNodeSize) {
1672			_InsertKey(writableNode, nodeAndKey.keyIndex,
1673				keyBuffer, keyLength, value);
1674			_UpdateIterators(nodeAndKey.nodeOffset, BPLUSTREE_NULL,
1675				nodeAndKey.keyIndex, 0, 1);
1676
1677			return B_OK;
1678		} else {
1679			CachedNode cachedNewRoot(this);
1680			CachedNode cachedOther(this);
1681
1682			// do we need to allocate a new root node? if so, then do
1683			// it now
1684			off_t newRoot = BPLUSTREE_NULL;
1685			if (nodeAndKey.nodeOffset == fHeader.RootNode()) {
1686				bplustree_node* root;
1687				status_t status = cachedNewRoot.Allocate(transaction, &root,
1688					&newRoot);
1689				if (status != B_OK) {
1690					// The tree is most likely corrupted!
1691					// But it's still sane at leaf level - we could set
1692					// a flag in the header that forces the tree to be
1693					// rebuild next time...
1694					// But since we will have journaling, that's not a big
1695					// problem anyway.
1696					RETURN_ERROR(status);
1697				}
1698			}
1699
1700			// reserve space for the other node
1701			bplustree_node* other;
1702			off_t otherOffset;
1703			status_t status = cachedOther.Allocate(transaction, &other,
1704				&otherOffset);
1705			if (status != B_OK) {
1706				cachedNewRoot.Free(transaction, newRoot);
1707				RETURN_ERROR(status);
1708			}
1709
1710			if (_SplitNode(writableNode, nodeAndKey.nodeOffset, other,
1711					otherOffset, &nodeAndKey.keyIndex, keyBuffer, &keyLength,
1712					&value) != B_OK) {
1713				// free root node & other node here
1714				cachedOther.Free(transaction, otherOffset);
1715				cachedNewRoot.Free(transaction, newRoot);
1716
1717				RETURN_ERROR(B_ERROR);
1718			}
1719#ifdef DEBUG
1720			checker.Check("insert split");
1721			NodeChecker otherChecker(other, fNodeSize, "insert split other");
1722#endif
1723
1724			_UpdateIterators(nodeAndKey.nodeOffset, otherOffset,
1725				nodeAndKey.keyIndex, writableNode->NumKeys(), 1);
1726
1727			// update the right link of the node in the left of the new node
1728			if ((other = cachedOther.SetToWritable(transaction,
1729					other->LeftLink())) != NULL) {
1730				other->right_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset);
1731			}
1732
1733			// create a new root if necessary
1734			if (newRoot != BPLUSTREE_NULL) {
1735				bplustree_node* root = cachedNewRoot.Node();
1736
1737				_InsertKey(root, 0, keyBuffer, keyLength,
1738					writableNode->LeftLink());
1739				root->overflow_link = HOST_ENDIAN_TO_BFS_INT64(
1740					nodeAndKey.nodeOffset);
1741
1742				CachedNode cached(this);
1743				bplustree_header* header
1744					= cached.SetToWritableHeader(transaction);
1745				if (header == NULL)
1746					return B_IO_ERROR;
1747
1748				// finally, update header to point to the new root
1749				header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(newRoot);
1750				header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(
1751					header->MaxNumberOfLevels() + 1);
1752
1753				return B_OK;
1754			}
1755		}
1756	}
1757	RETURN_ERROR(B_ERROR);
1758}
1759
1760
1761/*!	Removes the duplicate index/value pair from the tree.
1762	It's part of the private tree interface.
1763*/
1764status_t
1765BPlusTree::_RemoveDuplicate(Transaction& transaction,
1766	const bplustree_node* node, CachedNode& cached, uint16 index,
1767	off_t value)
1768{
1769	off_t* values = node->Values();
1770	off_t oldValue = BFS_ENDIAN_TO_HOST_INT64(values[index]);
1771
1772	CachedNode cachedDuplicate(this);
1773	off_t duplicateOffset = bplustree_node::FragmentOffset(oldValue);
1774	bplustree_node* duplicate = cachedDuplicate.SetToWritable(transaction,
1775		duplicateOffset, false);
1776	if (duplicate == NULL)
1777		return B_IO_ERROR;
1778
1779	// if it's a duplicate fragment, remove the entry from there
1780	if (bplustree_node::LinkType(oldValue) == BPLUSTREE_DUPLICATE_FRAGMENT) {
1781		duplicate_array* array = duplicate->FragmentAt(
1782			bplustree_node::FragmentIndex(oldValue));
1783		int32 arrayCount = array->Count();
1784
1785		if (arrayCount > NUM_FRAGMENT_VALUES || arrayCount <= 1) {
1786			FATAL(("_RemoveDuplicate: Invalid array[%d] size in fragment %"
1787				B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n",
1788				(int)bplustree_node::FragmentIndex(oldValue), duplicateOffset,
1789				arrayCount, fStream->ID()));
1790			return B_BAD_DATA;
1791		}
1792		if (!array->Remove(value)) {
1793			FATAL(("Oh no, value %" B_PRIdOFF " not found in fragments of node "
1794				"%" B_PRIdOFF "..., inode %" B_PRIdOFF "\n", value,
1795				duplicateOffset, fStream->ID()));
1796			return B_ENTRY_NOT_FOUND;
1797		}
1798
1799		// remove the array from the fragment node if it is empty
1800		if (--arrayCount == 1) {
1801			// set the link to the remaining value
1802			if (cached.MakeWritable(transaction) == NULL)
1803				return B_IO_ERROR;
1804
1805			values[index] = array->values[0];
1806
1807			// Remove the whole fragment node, if this was the only array,
1808			// otherwise free just the array
1809			if (duplicate->FragmentsUsed(fNodeSize) == 1) {
1810				status_t status = cachedDuplicate.Free(transaction,
1811					duplicateOffset);
1812				if (status != B_OK)
1813					return status;
1814			} else
1815				array->count = 0;
1816		}
1817		return B_OK;
1818	}
1819
1820	// Remove value from a duplicate node!
1821
1822	duplicate_array* array = NULL;
1823	int32 arrayCount = 0;
1824
1825	if (duplicate->LeftLink() != BPLUSTREE_NULL) {
1826		FATAL(("invalid duplicate node: first left link points to %" B_PRIdOFF
1827			", inode %" B_PRIdOFF "!\n", duplicate->LeftLink(), fStream->ID()));
1828		return B_BAD_DATA;
1829	}
1830
1831	// Search the duplicate nodes until the entry could be found (and removed)
1832	while (duplicate != NULL) {
1833		array = duplicate->DuplicateArray();
1834		arrayCount = array->Count();
1835
1836		if (arrayCount > NUM_DUPLICATE_VALUES || arrayCount < 0) {
1837			FATAL(("_RemoveDuplicate: Invalid array size in duplicate %"
1838				B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n",
1839				duplicateOffset, arrayCount, fStream->ID()));
1840			return B_BAD_DATA;
1841		}
1842
1843		if (array->Remove(value)) {
1844			arrayCount--;
1845			break;
1846		}
1847
1848		if ((duplicateOffset = duplicate->RightLink()) == BPLUSTREE_NULL)
1849			RETURN_ERROR(B_ENTRY_NOT_FOUND);
1850
1851		cachedDuplicate.UnsetUnchanged(transaction);
1852		duplicate = cachedDuplicate.SetToWritable(transaction, duplicateOffset,
1853			false);
1854	}
1855	if (duplicate == NULL)
1856		RETURN_ERROR(B_IO_ERROR);
1857
1858	// The entry got removed from the duplicate node, but we might want to free
1859	// it now in case it's empty
1860
1861	while (true) {
1862		off_t left = duplicate->LeftLink();
1863		off_t right = duplicate->RightLink();
1864		bool isLast = left == BPLUSTREE_NULL && right == BPLUSTREE_NULL;
1865
1866		if ((isLast && arrayCount == 1) || arrayCount == 0) {
1867			// Free empty duplicate page, link their siblings together, and
1868			// update the duplicate link if needed (ie. when we either remove
1869			// the last duplicate node or have a new first one)
1870
1871			if (left == BPLUSTREE_NULL) {
1872				// the duplicate link points to us
1873				if (cached.MakeWritable(transaction) == NULL)
1874					return B_IO_ERROR;
1875
1876				if (arrayCount == 1) {
1877					// This is the last node, and there is only one value left;
1878					// replace the duplicate link with that value, it's no
1879					// duplicate anymore
1880					values[index] = array->values[0];
1881				} else {
1882					// Move the duplicate link to the next node
1883					values[index] = HOST_ENDIAN_TO_BFS_INT64(
1884						bplustree_node::MakeLink(
1885							BPLUSTREE_DUPLICATE_NODE, right));
1886				}
1887			}
1888
1889			status_t status = cachedDuplicate.Free(transaction,
1890				duplicateOffset);
1891			if (status != B_OK)
1892				return status;
1893
1894			if (left != BPLUSTREE_NULL
1895				&& (duplicate = cachedDuplicate.SetToWritable(transaction, left,
1896						false)) != NULL) {
1897				duplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(right);
1898
1899				// If the next node is the last node, we need to free that node
1900				// and convert the duplicate entry back into a normal entry
1901				array = duplicate->DuplicateArray();
1902				arrayCount = array->Count();
1903				if (right == BPLUSTREE_NULL
1904					&& duplicate->LeftLink() == BPLUSTREE_NULL
1905					&& arrayCount <= NUM_FRAGMENT_VALUES) {
1906					duplicateOffset = left;
1907					continue;
1908				}
1909			}
1910			if (right != BPLUSTREE_NULL
1911				&& (duplicate = cachedDuplicate.SetToWritable(transaction,
1912						right, false)) != NULL) {
1913				duplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(left);
1914
1915				// Again, we may need to turn the duplicate entry back into a
1916				// normal entry
1917				array = duplicate->DuplicateArray();
1918				arrayCount = array->Count();
1919				if (left == BPLUSTREE_NULL
1920					&& duplicate->RightLink() == BPLUSTREE_NULL
1921					&& arrayCount <= NUM_FRAGMENT_VALUES) {
1922					duplicateOffset = right;
1923					continue;
1924				}
1925			}
1926			return B_OK;
1927		}
1928		if (isLast && arrayCount <= NUM_FRAGMENT_VALUES) {
1929			// If the number of entries fits in a duplicate fragment, then
1930			// either find a free fragment node, or convert this node to a
1931			// fragment node.
1932			CachedNode cachedOther(this);
1933
1934			bplustree_node* fragment = NULL;
1935			uint32 fragmentIndex = 0;
1936			off_t offset;
1937			if (_FindFreeDuplicateFragment(transaction, node, cachedOther,
1938					&offset, &fragment, &fragmentIndex) == B_OK) {
1939				// move to other node
1940				duplicate_array* target = fragment->FragmentAt(fragmentIndex);
1941				memcpy(target, array,
1942					(NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
1943
1944				cachedDuplicate.Free(transaction, duplicateOffset);
1945				duplicateOffset = offset;
1946			} else {
1947				// convert node
1948				memmove(duplicate, array,
1949					(NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
1950				memset((off_t*)duplicate + NUM_FRAGMENT_VALUES + 1, 0,
1951					fNodeSize - (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
1952			}
1953
1954			if (cached.MakeWritable(transaction) == NULL)
1955				return B_IO_ERROR;
1956
1957			values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
1958				BPLUSTREE_DUPLICATE_FRAGMENT, duplicateOffset, fragmentIndex));
1959		}
1960		return B_OK;
1961	}
1962}
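

/*	For reference, a value stored in a leaf node is either a plain inode
	block number or an encoded duplicate link. A minimal sketch of how such
	a link is taken apart, using only the bplustree_node helpers that appear
	above (illustration only, not called anywhere):

		off_t link = BFS_ENDIAN_TO_HOST_INT64(node->Values()[index]);
		if (bplustree_node::IsDuplicate(link)) {
			off_t nodeOffset = bplustree_node::FragmentOffset(link);
			if (bplustree_node::LinkType(link)
					== BPLUSTREE_DUPLICATE_FRAGMENT) {
				// The node at "nodeOffset" holds several small arrays;
				// FragmentIndex(link) selects the right duplicate_array.
			} else {
				// BPLUSTREE_DUPLICATE_NODE: the whole node is a single
				// duplicate_array, chained to further nodes via right_link.
			}
		}
*/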
1963
1964
/*!	Removes the key with the given index from the specified node.
	Since it has to get the key from the node anyway (to obtain its
	pointer), there is no need to pass the key and its length, although
	the calling method (BPlusTree::Remove()) has this data.
*/
1970void
1971BPlusTree::_RemoveKey(bplustree_node* node, uint16 index)
1972{
1973	// should never happen, but who knows?
1974	if (index > node->NumKeys() && node->NumKeys() > 0) {
		FATAL(("Asked to remove key outside of limits: %u, inode %" B_PRIdOFF
			"\n", index, fStream->ID()));
1977		return;
1978	}
1979
1980	off_t* values = node->Values();
1981
1982	// if we would have to drop the overflow link, drop
1983	// the last key instead and update the overflow link
1984	// to the value of that one
1985	if (!node->IsLeaf() && index == node->NumKeys())
1986		node->overflow_link = values[--index];
1987
1988	uint16 length;
1989	uint8* key = node->KeyAt(index, &length);
1990	if (key + length + sizeof(off_t) + sizeof(uint16) > (uint8*)node + fNodeSize
1991		|| length > BPLUSTREE_MAX_KEY_LENGTH) {
		FATAL(("Key length too long: %s, %u, inode %" B_PRIdOFF "\n", key,
			length, fStream->ID()));
1994		fStream->GetVolume()->Panic();
1995		return;
1996	}
1997
1998	uint16* keyLengths = node->KeyLengths();
1999	uint8* keys = node->Keys();
2000
2001	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() - 1);
	node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(
		node->AllKeyLength() - length);
2004
2005	off_t* newValues = node->Values();
2006	uint16* newKeyLengths = node->KeyLengths();
2007
2008	// move key data
2009	memmove(key, key + length, node->AllKeyLength() - (key - keys));
2010
2011	// move and update key lengths
2012	if (index > 0 && newKeyLengths != keyLengths)
2013		memmove(newKeyLengths, keyLengths, index * sizeof(uint16));
2014	for (uint16 i = index; i < node->NumKeys(); i++) {
2015		newKeyLengths[i] = HOST_ENDIAN_TO_BFS_INT16(
2016			BFS_ENDIAN_TO_HOST_INT16(keyLengths[i + 1]) - length);
2017	}
2018
2019	// move values
2020	if (index > 0)
2021		memmove(newValues, values, index * sizeof(off_t));
2022	if (node->NumKeys() > index) {
2023		memmove(newValues + index, values + index + 1,
2024			(node->NumKeys() - index) * sizeof(off_t));
2025	}
2026}
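

/*	For reference, this is the node layout that _RemoveKey() shuffles around;
	the key lengths array stores cumulative end offsets (see
	bplustree_node::KeyAt() below). "i" is a hypothetical key index:

		uint8* keys = node->Keys();					// packed key bytes
		uint16* keyLengths = node->KeyLengths();	// cumulative end offsets
		off_t* values = node->Values();				// one value per key

		uint16 end = BFS_ENDIAN_TO_HOST_INT16(keyLengths[i]);
		uint16 start = i == 0
			? 0 : BFS_ENDIAN_TO_HOST_INT16(keyLengths[i - 1]);
		// Key i occupies keys[start .. end - 1] and its value is values[i];
		// removing it means closing both gaps and rewriting the cumulative
		// lengths of all following keys.
*/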
2027
2028
/*!	Removes the specified key from the tree. The "value" parameter identifies
	the entry to be removed in trees that allow duplicates; for all other
	trees it still has to match the value stored for the key.
	You need to have the inode write locked.
*/
2034status_t
2035BPlusTree::Remove(Transaction& transaction, const uint8* key, uint16 keyLength,
2036	off_t value)
2037{
2038	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
2039		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH)
2040		RETURN_ERROR(B_BAD_VALUE);
2041
2042	ASSERT_WRITE_LOCKED_INODE(fStream);
2043
2044	Stack<node_and_key> stack;
2045	if (_SeekDown(stack, key, keyLength) != B_OK)
2046		RETURN_ERROR(B_ERROR);
2047
2048	node_and_key nodeAndKey;
2049	const bplustree_node* node;
2050
2051	CachedNode cached(this);
2052	while (stack.Pop(&nodeAndKey)
2053		&& (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
2054#ifdef DEBUG
2055		NodeChecker checker(node, fNodeSize, "remove");
2056#endif
2057		if (node->IsLeaf()) {
2058			// first round, check for duplicate entries
2059			status_t status = _FindKey(node, key, keyLength,
2060				&nodeAndKey.keyIndex);
2061			if (status != B_OK)
2062				RETURN_ERROR(status);
2063
2064			// Is this a duplicate entry?
2065			if (bplustree_node::IsDuplicate(BFS_ENDIAN_TO_HOST_INT64(
2066					node->Values()[nodeAndKey.keyIndex]))) {
2067				if (fAllowDuplicates) {
2068					return _RemoveDuplicate(transaction, node, cached,
2069						nodeAndKey.keyIndex, value);
2070				}
2071
				FATAL(("duplicate node found where no duplicates are "
2073					"allowed, inode %" B_PRIdOFF "!\n", fStream->ID()));
2074				RETURN_ERROR(B_ERROR);
2075			} else {
				if (BFS_ENDIAN_TO_HOST_INT64(
						node->Values()[nodeAndKey.keyIndex]) != value)
					return B_ENTRY_NOT_FOUND;
2078
				// If we remove the last key, the iterators will be set to the
				// node after the current one - if there are no more nodes, we
				// need a way to prevent the TreeIterators from touching the
				// old node again; we use BPLUSTREE_FREE for this
				off_t next = node->RightLink() == BPLUSTREE_NULL
					? BPLUSTREE_FREE : node->RightLink();
				_UpdateIterators(nodeAndKey.nodeOffset, node->NumKeys() == 1
					? next : BPLUSTREE_NULL, nodeAndKey.keyIndex, 0, -1);
2087			}
2088		}
2089
2090		bplustree_node* writableNode = cached.MakeWritable(transaction);
2091		if (writableNode == NULL)
2092			return B_IO_ERROR;
2093
2094		// if it's an empty root node, we have to convert it
2095		// to a leaf node by dropping the overflow link, or,
2096		// if it's already a leaf node, just empty it
2097		if (nodeAndKey.nodeOffset == fHeader.RootNode()
2098			&& (node->NumKeys() == 0
2099				|| (node->NumKeys() == 1 && node->IsLeaf()))) {
2100			writableNode->overflow_link
2101				= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
2102			writableNode->all_key_count = 0;
2103			writableNode->all_key_length = 0;
2104
2105			// if we've made a leaf node out of the root node, we need
2106			// to reset the maximum number of levels in the header
2107			if (fHeader.MaxNumberOfLevels() != 1) {
2108				CachedNode cached(this);
2109				bplustree_header* header
2110					= cached.SetToWritableHeader(transaction);
2111				if (header == NULL)
2112					return B_IO_ERROR;
2113
2114				header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1);
2115			}
2116			return B_OK;
2117		}
2118
		// if there is only one key left, we don't have to remove
		// it; we can just dump the node (index nodes still have
		// the overflow link, so we have to drop the last key)
2122		if (writableNode->NumKeys() > 1
2123			|| (!writableNode->IsLeaf() && writableNode->NumKeys() == 1)) {
2124			_RemoveKey(writableNode, nodeAndKey.keyIndex);
2125			return B_OK;
2126		}
2127
		// at this point we can just free the node, but we
		// have to update the left and right links of its
		// siblings first
2131		CachedNode otherCached(this);
2132		bplustree_node* other = otherCached.SetToWritable(transaction,
2133			writableNode->LeftLink());
2134		if (other != NULL)
2135			other->right_link = writableNode->right_link;
2136
2137		if ((other = otherCached.SetToWritable(transaction, node->RightLink()))
2138				!= NULL) {
2139			other->left_link = writableNode->left_link;
2140		}
2141
2142		cached.Free(transaction, nodeAndKey.nodeOffset);
2143	}
2144	RETURN_ERROR(B_ERROR);
2145}
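

/*	A minimal usage sketch for Remove(), assuming an existing "tree" and
	"transaction" and a write locked inode, as required above ("name" and
	"id" are hypothetical):

		status_t status = tree->Remove(transaction, (const uint8*)name,
			(uint16)strlen(name), id);
		if (status == B_ENTRY_NOT_FOUND) {
			// the key (or, with duplicates, this key/value pair) was not
			// in the tree
		}
*/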
2146
2147
2148/*!	Replaces the value for the key in the tree.
2149	Returns B_OK if the key could be found and its value replaced,
2150	B_ENTRY_NOT_FOUND if the key couldn't be found, and other errors
2151	to indicate that something went terribly wrong.
2152	Note that this doesn't work with duplicates - it will just
2153	return B_BAD_TYPE if you call this function on a tree where
2154	duplicates are allowed.
2155	You need to have the inode write locked.
2156*/
2157status_t
2158BPlusTree::Replace(Transaction& transaction, const uint8* key, uint16 keyLength,
2159	off_t value)
2160{
2161	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
2162		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH
2163		|| key == NULL)
2164		RETURN_ERROR(B_BAD_VALUE);
2165
2166	if (fAllowDuplicates)
2167		RETURN_ERROR(B_BAD_TYPE);
2168
2169	ASSERT_WRITE_LOCKED_INODE(fStream);
2170
2171	off_t nodeOffset = fHeader.RootNode();
2172	CachedNode cached(this);
2173	const bplustree_node* node;
2174
2175	while ((node = cached.SetTo(nodeOffset)) != NULL) {
2176		uint16 keyIndex = 0;
2177		off_t nextOffset;
2178		status_t status = _FindKey(node, key, keyLength, &keyIndex,
2179			&nextOffset);
2180
2181		if (node->OverflowLink() == BPLUSTREE_NULL) {
2182			if (status == B_OK) {
2183				bplustree_node* writableNode = cached.MakeWritable(transaction);
2184				if (writableNode != NULL) {
2185					writableNode->Values()[keyIndex]
2186						= HOST_ENDIAN_TO_BFS_INT64(value);
2187				} else
2188					status = B_IO_ERROR;
2189			}
2190
2191			return status;
2192		} else if (nextOffset == nodeOffset)
2193			RETURN_ERROR(B_ERROR);
2194
2195		nodeOffset = nextOffset;
2196	}
2197	RETURN_ERROR(B_ERROR);
2198}
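

/*	A minimal usage sketch for Replace(), assuming an existing "tree" and
	"transaction" and a write locked inode ("name" and "newValue" are
	hypothetical):

		status_t status = tree->Replace(transaction, (const uint8*)name,
			(uint16)strlen(name), newValue);
		// B_OK: value replaced, B_ENTRY_NOT_FOUND: no such key,
		// B_BAD_TYPE: the tree allows duplicates
*/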
2199
2200
2201/*!	Searches the key in the tree, and stores the offset found in _value,
2202	if successful.
	It's very similar to BPlusTree::_SeekDown(), but doesn't fill a stack
2204	while it descends the tree.
2205	Returns B_OK when the key could be found, B_ENTRY_NOT_FOUND if not.
2206	It can also return other errors to indicate that something went wrong.
2207	Note that this doesn't work with duplicates - it will just return
2208	B_BAD_TYPE if you call this function on a tree where duplicates are
2209	allowed.
2210	You need to have the inode read or write locked.
2211*/
2212status_t
2213BPlusTree::Find(const uint8* key, uint16 keyLength, off_t* _value)
2214{
2215	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
2216		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH
2217		|| key == NULL)
2218		RETURN_ERROR(B_BAD_VALUE);
2219
2220	if (fAllowDuplicates)
2221		RETURN_ERROR(B_BAD_TYPE);
2222
2223	ASSERT_READ_LOCKED_INODE(fStream);
2224
2225	off_t nodeOffset = fHeader.RootNode();
2226	CachedNode cached(this);
2227	const bplustree_node* node;
2228
2229#ifdef DEBUG
2230	int32 levels = 0;
2231#endif
2232
2233	while ((node = cached.SetTo(nodeOffset)) != NULL) {
2234		uint16 keyIndex = 0;
2235		off_t nextOffset;
2236		status_t status = _FindKey(node, key, keyLength, &keyIndex,
2237			&nextOffset);
2238
2239#ifdef DEBUG
2240		levels++;
2241#endif
2242		if (node->OverflowLink() == BPLUSTREE_NULL) {
2243			if (status == B_OK && _value != NULL)
2244				*_value = BFS_ENDIAN_TO_HOST_INT64(node->Values()[keyIndex]);
2245
2246#ifdef DEBUG
2247			if (levels != (int32)fHeader.MaxNumberOfLevels())
2248				DEBUGGER(("levels don't match"));
2249#endif
2250			return status;
2251		} else if (nextOffset == nodeOffset)
2252			RETURN_ERROR(B_ERROR);
2253
2254		nodeOffset = nextOffset;
2255	}
2256	FATAL(("b+tree node at %" B_PRIdOFF " could not be loaded, inode %"
2257		B_PRIdOFF "\n", nodeOffset, fStream->ID()));
2258	RETURN_ERROR(B_ERROR);
2259}
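

/*	A minimal usage sketch for Find(), assuming an existing "tree" and a
	read locked inode ("name" is hypothetical):

		off_t value;
		if (tree->Find((const uint8*)name, (uint16)strlen(name),
				&value) == B_OK) {
			// "value" now holds the block number stored for this key
		}
*/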
2260
2261
2262status_t
2263BPlusTree::_ValidateChildren(TreeCheck& check, uint32 level, off_t offset,
2264	const uint8* largestKey, uint16 largestKeyLength,
2265	const bplustree_node* parent)
2266{
2267	if (parent->CheckIntegrity(fNodeSize) != B_OK) {
2268		dprintf("inode %" B_PRIdOFF ": node %" B_PRIdOFF " integrity check "
2269			"failed!\n", fStream->ID(), offset);
2270		check.FoundError();
2271		return B_OK;
2272	}
2273	if (level >= fHeader.MaxNumberOfLevels()) {
2274		dprintf("inode %" B_PRIdOFF ": maximum level surpassed at %" B_PRIdOFF
2275			"!\n", fStream->ID(), offset);
2276		check.FoundError();
2277		return B_OK;
2278	}
2279
2280	check.SetLevel(level);
2281
2282	if (check.Visited(offset)) {
2283		dprintf("inode %" B_PRIdOFF ": node %" B_PRIdOFF " already visited!\n",
2284			fStream->ID(), offset);
2285		check.FoundError();
2286		return B_OK;
2287	}
2288
2289	check.SetVisited(offset);
2290
2291	uint32 count = parent->NumKeys();
2292	off_t* values = parent->Values();
2293	off_t lastOffset = check.PreviousOffset(level);
2294	CachedNode cached(this);
2295
2296	for (uint32 i = 0; i < count; i++) {
2297		uint16 keyLength;
2298		uint8* key = parent->KeyAt(i, &keyLength);
2299		if (largestKey != NULL) {
2300			int result = compareKeys(fHeader.DataType(), key, keyLength,
2301				largestKey, largestKeyLength);
2302			if (result >= 0) {
2303				dprintf("inode %" B_PRIdOFF ": node %" B_PRIdOFF " key %"
					B_PRIu32 " is larger than it should be!\n",
2305					fStream->ID(), offset, i);
2306				check.FoundError();
2307			}
2308		}
2309
2310		off_t childOffset = BFS_ENDIAN_TO_HOST_INT64(values[i]);
2311		if (bplustree_node::IsDuplicate(childOffset)) {
2312			// Walk the duplicate nodes
2313			off_t duplicateOffset = bplustree_node::FragmentOffset(childOffset);
2314			off_t lastDuplicateOffset = BPLUSTREE_NULL;
2315
2316			while (duplicateOffset != BPLUSTREE_NULL) {
2317				const bplustree_node* node;
2318				status_t status = cached.SetTo(duplicateOffset, &node, false);
2319				if (status != B_OK) {
2320					if (status == B_IO_ERROR)
2321						return B_IO_ERROR;
2322
2323					dprintf("inode %" B_PRIdOFF ": duplicate node at %"
2324						B_PRIdOFF " could not be read: %s\n", fStream->ID(),
2325						duplicateOffset, strerror(status));
2326					check.FoundError();
2327					break;
2328				}
2329
2330				bool isFragmentNode = bplustree_node::LinkType(childOffset)
2331					== BPLUSTREE_DUPLICATE_FRAGMENT;
2332				bool isKnownFragment = isFragmentNode
2333					&& check.VisitedFragment(duplicateOffset);
2334
2335				if (!isKnownFragment && check.Visited(duplicateOffset)) {
2336					dprintf("inode %" B_PRIdOFF ": %s node at %"
2337						B_PRIdOFF " already visited, referenced from %"
2338						B_PRIdOFF "!\n", fStream->ID(),
2339						isFragmentNode ? "fragment" : "duplicate",
2340						duplicateOffset, offset);
2341					check.FoundError();
2342					break;
2343				}
2344
2345				// Fragment nodes may be visited more than once from different
2346				// places
2347				if (!check.Visited(duplicateOffset))
2348					check.SetVisited(duplicateOffset);
2349				if (!isKnownFragment && isFragmentNode)
2350					check.SetVisitedFragment(duplicateOffset);
2351
2352				duplicate_array* array;
2353				int32 minSize;
2354				int32 maxSize;
2355				if (isFragmentNode) {
2356					array = node->FragmentAt(
2357						bplustree_node::FragmentIndex(childOffset));
2358					minSize = 2;
2359					maxSize = NUM_FRAGMENT_VALUES;
2360				} else {
2361					array = node->DuplicateArray();
2362					minSize = 1;
2363					maxSize = NUM_DUPLICATE_VALUES;
2364				}
2365				int32 arrayCount = array->Count();
2366
2367				if (arrayCount < minSize || arrayCount > maxSize) {
2368					dprintf("inode %" B_PRIdOFF ": duplicate at %" B_PRIdOFF
2369						" has invalid array size %" B_PRId32 "!\n",
2370						fStream->ID(), duplicateOffset, arrayCount);
2371					check.FoundError();
2372				} else {
2373					// Simple check if the values in the array may be valid
2374					for (int32 j = 0; j < arrayCount; j++) {
2375						if (!fStream->GetVolume()->IsValidInodeBlock(
2376								array->ValueAt(j))) {
2377							dprintf("inode %" B_PRIdOFF ": duplicate at %"
2378								B_PRIdOFF " contains invalid block %" B_PRIdOFF
2379								" at %" B_PRId32 "!\n", fStream->ID(),
2380								duplicateOffset, array->ValueAt(j), j);
2381							check.FoundError();
2382							break;
2383						}
2384					}
2385				}
2386
2387				// A fragment node is not linked (and does not have valid links)
2388				if (isFragmentNode)
2389					break;
2390
2391				if (node->LeftLink() != lastDuplicateOffset) {
2392					dprintf("inode %" B_PRIdOFF ": duplicate at %" B_PRIdOFF
2393						" has wrong left link %" B_PRIdOFF ", expected %"
2394						B_PRIdOFF "!\n", fStream->ID(), duplicateOffset,
2395						node->LeftLink(), lastDuplicateOffset);
2396					check.FoundError();
2397				}
2398
2399				lastDuplicateOffset = duplicateOffset;
2400				duplicateOffset = node->RightLink();
2401			}
2402		} else if (!parent->IsLeaf()) {
2403			// Test a regular child node recursively
2404			off_t nextOffset = parent->OverflowLink();
2405			if (i < count - 1)
2406				nextOffset = BFS_ENDIAN_TO_HOST_INT64(values[i + 1]);
2407
2408			if (i == 0 && lastOffset != BPLUSTREE_NULL) {
2409				// Test right link of the previous node
2410				const bplustree_node* previous = cached.SetTo(lastOffset, true);
2411				if (previous == NULL)
2412					return B_IO_ERROR;
2413
2414				if (previous->RightLink() != childOffset) {
2415					dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " has "
2416						"wrong right link %" B_PRIdOFF ", expected %" B_PRIdOFF
2417						"!\n", fStream->ID(), lastOffset, previous->RightLink(),
2418						childOffset);
2419					check.FoundError();
2420				}
2421			}
2422
2423			status_t status = _ValidateChild(check, cached, level, childOffset,
2424				lastOffset, nextOffset, key, keyLength);
2425			if (status != B_OK)
2426				return status;
2427		} else if (!fStream->GetVolume()->IsValidInodeBlock(childOffset)) {
2428			dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " contains "
2429				"invalid block %" B_PRIdOFF " at %" B_PRId32 "!\n",
2430				fStream->ID(), offset, childOffset, i);
2431			check.FoundError();
2432		}
2433
2434		lastOffset = childOffset;
2435	}
2436
2437	if (parent->OverflowLink() != BPLUSTREE_NULL) {
2438		off_t childOffset = parent->OverflowLink();
2439		status_t status = _ValidateChild(check, cached, level, childOffset,
2440			lastOffset, 0, NULL, 0);
2441		if (status != B_OK)
2442			return status;
2443
2444		lastOffset = childOffset;
2445	}
2446
2447	check.SetPreviousOffset(level, lastOffset);
2448	return B_OK;
2449}
2450
2451
2452status_t
2453BPlusTree::_ValidateChild(TreeCheck& check, CachedNode& cached, uint32 level,
2454	off_t offset, off_t lastOffset, off_t nextOffset,
2455	const uint8* key, uint16 keyLength)
2456{
2457	const bplustree_node* node;
2458	status_t status = cached.SetTo(offset, &node, true);
2459	if (status != B_OK) {
2460		if (status == B_IO_ERROR)
2461			return B_IO_ERROR;
2462
2463		dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " could not be "
2464			"read: %s\n", fStream->ID(), offset, strerror(status));
2465		check.FoundError();
2466		return B_OK;
2467	}
2468
2469	if (node->LeftLink() != lastOffset) {
2470		dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " has "
2471			"wrong left link %" B_PRIdOFF ", expected %" B_PRIdOFF
2472			"!\n", fStream->ID(), offset, node->LeftLink(), lastOffset);
2473		check.FoundError();
2474	}
2475
2476	if (nextOffset != 0 && node->RightLink() != nextOffset) {
2477		dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " has "
2478			"wrong right link %" B_PRIdOFF ", expected %" B_PRIdOFF
2479			"!\n", fStream->ID(), offset, node->RightLink(), nextOffset);
2480		check.FoundError();
2481	}
2482
2483	return _ValidateChildren(check, level + 1, offset, key, keyLength, node);
2484}
2485
2486
2487//	#pragma mark -
2488
2489
2490TreeIterator::TreeIterator(BPlusTree* tree)
2491	:
2492	fTree(tree),
2493	fCurrentNodeOffset(BPLUSTREE_NULL)
2494{
2495	tree->_AddIterator(this);
2496}
2497
2498
2499TreeIterator::~TreeIterator()
2500{
2501	if (fTree)
2502		fTree->_RemoveIterator(this);
2503}
2504
2505
2506status_t
2507TreeIterator::Goto(int8 to)
2508{
2509	if (fTree == NULL || fTree->fStream == NULL)
2510		RETURN_ERROR(B_BAD_VALUE);
2511
2512	// lock access to stream
2513	InodeReadLocker locker(fTree->fStream);
2514
2515	off_t nodeOffset = fTree->fHeader.RootNode();
2516	CachedNode cached(fTree);
2517	const bplustree_node* node;
2518
2519	while ((node = cached.SetTo(nodeOffset)) != NULL) {
2520		// is the node a leaf node?
2521		if (node->OverflowLink() == BPLUSTREE_NULL) {
2522			fCurrentNodeOffset = nodeOffset;
2523			fCurrentKey = to == BPLUSTREE_BEGIN ? -1 : node->NumKeys();
2524			fDuplicateNode = BPLUSTREE_NULL;
2525
2526			return B_OK;
2527		}
2528
2529		// get the next node offset depending on the direction (and if there
2530		// are any keys in that node at all)
2531		off_t nextOffset;
2532		if (to == BPLUSTREE_END || node->all_key_count == 0)
2533			nextOffset = node->OverflowLink();
2534		else {
2535			if (node->AllKeyLength() > fTree->fNodeSize
2536				|| (addr_t)node->Values() > (addr_t)node + fTree->fNodeSize
2537					- 8 * node->NumKeys())
2538				RETURN_ERROR(B_ERROR);
2539
2540			nextOffset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[0]);
2541		}
2542		if (nextOffset == nodeOffset)
2543			break;
2544
2545		nodeOffset = nextOffset;
2546	}
2547	FATAL(("%s fails\n", __FUNCTION__));
2548
2549	RETURN_ERROR(B_ERROR);
2550}
2551
2552
2553/*!	Iterates through the tree in the specified direction.
2554	When it iterates through duplicates, the "key" is only updated for the
2555	first entry - if you need to know when this happens, use the "duplicate"
2556	parameter which is 0 for no duplicate, 1 for the first, and 2 for all
2557	the other duplicates.
2558	That's not too nice, but saves the 256 bytes that would be needed to
2559	store the last key - if this will ever become an issue, it will be
2560	easy to change.
	The other advantage of this is that queries can skip all duplicates
	at once when they are not relevant to them.
2563*/
2564status_t
2565TreeIterator::Traverse(int8 direction, void* key, uint16* keyLength,
2566	uint16 maxLength, off_t* value, uint16* duplicate)
2567{
2568	if (fTree == NULL)
2569		return B_INTERRUPTED;
2570
2571	bool forward = direction == BPLUSTREE_FORWARD;
2572
2573	if (fCurrentNodeOffset == BPLUSTREE_NULL
2574		&& Goto(forward ? BPLUSTREE_BEGIN : BPLUSTREE_END) != B_OK)
2575		RETURN_ERROR(B_ERROR);
2576
2577	// if the tree was emptied since the last call
2578	if (fCurrentNodeOffset == BPLUSTREE_FREE)
2579		return B_ENTRY_NOT_FOUND;
2580
2581	// lock access to stream
2582	InodeReadLocker locker(fTree->fStream);
2583
2584	CachedNode cached(fTree);
2585	const bplustree_node* node;
2586
2587	if (fDuplicateNode != BPLUSTREE_NULL) {
		// Regardless of the traverse direction, the duplicates are always
		// presented in the same order; since they are all considered equal,
		// this shouldn't cause any problems
2591
2592		if (!fIsFragment || fDuplicate < fNumDuplicates) {
2593			node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode),
2594				false);
2595		} else
2596			node = NULL;
2597
2598		if (node != NULL) {
2599			if (!fIsFragment && fDuplicate >= fNumDuplicates) {
2600				// If the node is out of duplicates, we go directly to the next
2601				// one
2602				fDuplicateNode = node->RightLink();
2603				if (fDuplicateNode != BPLUSTREE_NULL
2604					&& (node = cached.SetTo(fDuplicateNode, false)) != NULL) {
2605					fNumDuplicates = node->CountDuplicates(fDuplicateNode,
2606						false);
2607					fDuplicate = 0;
2608				}
2609			}
2610			if (fDuplicate < fNumDuplicates) {
2611				*value = node->DuplicateAt(fDuplicateNode, fIsFragment,
2612					fDuplicate++);
2613				if (duplicate)
2614					*duplicate = 2;
2615				return B_OK;
2616			}
2617		}
2618		fDuplicateNode = BPLUSTREE_NULL;
2619	}
2620
2621	off_t savedNodeOffset = fCurrentNodeOffset;
2622	int32 savedKey = fCurrentKey;
2623
2624	if ((node = cached.SetTo(fCurrentNodeOffset)) == NULL)
2625		RETURN_ERROR(B_ERROR);
2626
2627	if (duplicate)
2628		*duplicate = 0;
2629
2630	fCurrentKey += direction;
2631
2632	// is the current key in the current node?
2633	while ((forward && fCurrentKey >= node->NumKeys())
2634			|| (!forward && fCurrentKey < 0)) {
2635		fCurrentNodeOffset = forward ? node->RightLink() : node->LeftLink();
2636
2637		// are there any more nodes?
2638		if (fCurrentNodeOffset != BPLUSTREE_NULL) {
2639			node = cached.SetTo(fCurrentNodeOffset);
2640			if (!node)
2641				RETURN_ERROR(B_ERROR);
2642
2643			// reset current key
2644			fCurrentKey = forward ? 0 : node->NumKeys();
2645		} else {
2646			// there are no nodes left, so turn back to the last key
2647			fCurrentNodeOffset = savedNodeOffset;
2648			fCurrentKey = savedKey;
2649
2650			return B_ENTRY_NOT_FOUND;
2651		}
2652	}
2653
2654	if (node->all_key_count == 0)
2655		RETURN_ERROR(B_ERROR);	// B_ENTRY_NOT_FOUND ?
2656
2657	uint16 length;
2658	uint8* keyStart = node->KeyAt(fCurrentKey, &length);
2659	if (keyStart + length + sizeof(off_t) + sizeof(uint16)
2660			> (uint8*)node + fTree->fNodeSize
2661		|| length > BPLUSTREE_MAX_KEY_LENGTH) {
2662		fTree->fStream->GetVolume()->Panic();
2663		RETURN_ERROR(B_BAD_DATA);
2664	}
2665
2666	length = min_c(length, maxLength);
2667	memcpy(key, keyStart, length);
2668
2669	if (fTree->fHeader.DataType() == BPLUSTREE_STRING_TYPE)	{
2670		// terminate string type
2671		if (length == maxLength)
2672			length--;
2673		((char*)key)[length] = '\0';
2674	}
2675	*keyLength = length;
2676
2677	off_t offset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[fCurrentKey]);
2678
2679	// duplicate fragments?
2680	uint8 type = bplustree_node::LinkType(offset);
2681	if (type == BPLUSTREE_DUPLICATE_FRAGMENT
2682		|| type == BPLUSTREE_DUPLICATE_NODE) {
2683		fDuplicateNode = offset;
2684
2685		node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode),
2686			false);
2687		if (node == NULL)
2688			RETURN_ERROR(B_ERROR);
2689
2690		fIsFragment = type == BPLUSTREE_DUPLICATE_FRAGMENT;
2691
2692		fNumDuplicates = node->CountDuplicates(offset, fIsFragment);
2693		if (fNumDuplicates) {
2694			offset = node->DuplicateAt(offset, fIsFragment, 0);
2695			fDuplicate = 1;
2696			if (duplicate)
2697				*duplicate = 1;
2698		} else {
2699			// Shouldn't happen, but we're dealing here with potentially
2700			// corrupt disks...
2701			fDuplicateNode = BPLUSTREE_NULL;
2702			offset = 0;
2703		}
2704	}
2705	*value = offset;
2706
2707	return B_OK;
2708}
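

/*	A minimal sketch of iterating over a whole tree with the "duplicate"
	parameter described above, assuming an existing "tree":

		TreeIterator iterator(tree);
		char name[BPLUSTREE_MAX_KEY_LENGTH + 1];
		uint16 length;
		uint16 duplicate;
		off_t value;
		while (iterator.Traverse(BPLUSTREE_FORWARD, name, &length,
				sizeof(name), &value, &duplicate) == B_OK) {
			// duplicate == 0: unique key, 1: first duplicate ("name" was
			// updated), 2: further duplicate ("name" still holds the first
			// key of this run)
		}
*/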
2709
2710
2711/*!	This is more or less a copy of BPlusTree::Find() - but it just
	sets the current position in the iterator, regardless of whether
	the key could be found or not.
2714*/
2715status_t
2716TreeIterator::Find(const uint8* key, uint16 keyLength)
2717{
2718	if (fTree == NULL)
2719		return B_INTERRUPTED;
2720	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
2721		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH
2722		|| key == NULL)
2723		RETURN_ERROR(B_BAD_VALUE);
2724
2725	// lock access to stream
2726	InodeReadLocker locker(fTree->fStream);
2727
2728	off_t nodeOffset = fTree->fHeader.RootNode();
2729
2730	CachedNode cached(fTree);
2731	const bplustree_node* node;
2732	while ((node = cached.SetTo(nodeOffset)) != NULL) {
2733		uint16 keyIndex = 0;
2734		off_t nextOffset;
2735		status_t status = fTree->_FindKey(node, key, keyLength, &keyIndex,
2736			&nextOffset);
2737
2738		if (node->OverflowLink() == BPLUSTREE_NULL) {
2739			fCurrentNodeOffset = nodeOffset;
2740			fCurrentKey = keyIndex - 1;
2741			fDuplicateNode = BPLUSTREE_NULL;
2742
2743			return status;
2744		} else if (nextOffset == nodeOffset)
2745			RETURN_ERROR(B_ERROR);
2746
2747		nodeOffset = nextOffset;
2748	}
2749	RETURN_ERROR(B_ERROR);
2750}
2751
2752
2753void
2754TreeIterator::SkipDuplicates()
2755{
2756	fDuplicateNode = BPLUSTREE_NULL;
2757}
2758
2759
2760void
2761TreeIterator::Update(off_t offset, off_t nextOffset, uint16 keyIndex,
2762	uint16 splitAt, int8 change)
2763{
2764	if (offset != fCurrentNodeOffset)
2765		return;
2766
2767	if (nextOffset != BPLUSTREE_NULL) {
2768		fCurrentNodeOffset = nextOffset;
2769		if (splitAt <= fCurrentKey) {
2770			fCurrentKey -= splitAt;
2771			keyIndex -= splitAt;
2772		}
2773	}
2774
	// Adjust fCurrentKey to point to the same key as before.
	// Note that if a key is inserted at the current position,
	// it won't be included in this tree transition.
2778	if (keyIndex <= fCurrentKey)
2779		fCurrentKey += change;
2780
2781	// TODO: duplicate handling!
2782}
2783
2784
2785void
2786TreeIterator::Stop()
2787{
2788	fTree = NULL;
2789}
2790
2791
2792#ifdef DEBUG
2793void
2794TreeIterator::Dump()
2795{
2796	__out("TreeIterator at %p:\n", this);
2797	__out("\tfTree = %p\n", fTree);
2798	__out("\tfCurrentNodeOffset = %Ld\n", fCurrentNodeOffset);
2799	__out("\tfCurrentKey = %d\n", (int)fCurrentKey);
2800	__out("\tfDuplicateNode = %Ld (%Ld, 0x%Lx)\n",
2801		bplustree_node::FragmentOffset(fDuplicateNode), fDuplicateNode,
2802		fDuplicateNode);
2803	__out("\tfDuplicate = %u\n", fDuplicate);
2804	__out("\tfNumDuplicates = %u\n", fNumDuplicates);
2805	__out("\tfIsFragment = %s\n", fIsFragment ? "true" : "false");
2806}
2807#endif
2808
2809
2810// #pragma mark -
2811
2812
2813bool
2814bplustree_header::IsValid() const
2815{
2816	return Magic() == BPLUSTREE_MAGIC
2817		&& (RootNode() % NodeSize()) == 0
2818		&& IsValidLink(RootNode())
2819		&& IsValidLink(FreeNode());
2820}
2821
2822
2823// #pragma mark -
2824
2825
2826void
2827bplustree_node::Initialize()
2828{
2829	left_link = right_link = overflow_link
2830		= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
2831	all_key_count = 0;
2832	all_key_length = 0;
2833}
2834
2835
2836uint8*
2837bplustree_node::KeyAt(int32 index, uint16* keyLength) const
2838{
2839	if (index < 0 || index > NumKeys())
2840		return NULL;
2841
2842	uint8* keyStart = Keys();
2843	uint16* keyLengths = KeyLengths();
2844
2845	*keyLength = BFS_ENDIAN_TO_HOST_INT16(keyLengths[index])
2846		- (index != 0 ? BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]) : 0);
2847	if (index > 0)
2848		keyStart += BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]);
2849
2850	return keyStart;
2851}
2852
2853
2854uint8
2855bplustree_node::CountDuplicates(off_t offset, bool isFragment) const
2856{
2857	// the duplicate fragment handling is currently hard-coded to a node size
2858	// of 1024 bytes - with future versions of BFS, this may be a problem
2859
2860	if (isFragment) {
2861		uint32 fragment = (NUM_FRAGMENT_VALUES + 1) * ((uint64)offset & 0x3ff);
2862
2863		return ((off_t*)this)[fragment];
2864	}
2865	return OverflowLink();
2866}
2867
2868
2869off_t
2870bplustree_node::DuplicateAt(off_t offset, bool isFragment, int8 index) const
2871{
2872	uint32 start;
2873	if (isFragment)
2874		start = 8 * ((uint64)offset & 0x3ff);
2875	else
2876		start = 2;
2877
2878	return ((off_t*)this)[start + 1 + index];
2879}
2880
2881
/*!	Although the name suggests it, this function doesn't return the real
	number of used fragments; it only counts up to two: it returns 0 if no
	fragment is used, 1 if exactly one fragment is used, and 2 if at least
	two fragments are used.
*/
2887uint32
2888bplustree_node::FragmentsUsed(uint32 nodeSize) const
2889{
2890	uint32 used = 0;
2891	for (uint32 i = 0; i < MaxFragments(nodeSize); i++) {
2892		duplicate_array* array = FragmentAt(i);
2893		if (array->Count() > 0 && ++used > 1)
2894			return used;
2895	}
2896	return used;
2897}
2898
2899
2900status_t
2901bplustree_node::CheckIntegrity(uint32 nodeSize) const
2902{
2903	if (NumKeys() > nodeSize || AllKeyLength() > nodeSize)
2904		DEBUGGER(("invalid node: key/length count"));
2905
2906	for (int32 i = 0; i < NumKeys(); i++) {
2907		uint16 length;
2908		uint8* key = KeyAt(i, &length);
2909		if (key + length + sizeof(off_t) + sizeof(uint16)
2910				> (uint8*)this + nodeSize
2911			|| length > BPLUSTREE_MAX_KEY_LENGTH) {
2912			dprintf("invalid node %p, key %d: keys corrupted\n", this, (int)i);
2913			return B_BAD_DATA;
2914		}
2915		if (Values()[i] == -1) {
2916			dprintf("invalid node %p, value %d: %" B_PRIdOFF ": values "
2917				"corrupted\n", this, (int)i, Values()[i]);
2918			return B_BAD_DATA;
2919		}
2920	}
2921	return B_OK;
2922}
2923
2924
2925// #pragma mark -
2926
2927
2928BitmapArray::BitmapArray(size_t numBits)
2929{
2930	fSize = (numBits + 7) / 8;
2931	fBitmap = (uint8*)calloc(fSize, 1);
2932	fCountSet = 0;
2933}
2934
2935
2936BitmapArray::~BitmapArray()
2937{
2938	free(fBitmap);
2939}
2940
2941
2942status_t
2943BitmapArray::InitCheck() const
2944{
2945	return fBitmap != NULL ? B_OK : B_NO_MEMORY;
2946}
2947
2948
2949bool
2950BitmapArray::IsSet(size_t index) const
2951{
2952	uint32 byteIndex = index / 8;
2953	if (byteIndex >= fSize)
2954		return false;
2955
2956	return (fBitmap[byteIndex] & (1UL << (index & 0x7))) != 0;
2957}
2958
2959
2960void
2961BitmapArray::Set(size_t index, bool set)
2962{
2963	uint32 byteIndex = index / 8;
2964	if (byteIndex >= fSize)
2965		return;
2966
2967	if (set) {
2968		fBitmap[byteIndex] |= 1UL << (index & 0x7);
2969		fCountSet++;
2970	} else {
2971		fBitmap[byteIndex] &= ~(1UL << (index & 0x7));
2972		fCountSet--;
2973	}
2974}
2975
2976
2977// #pragma mark -
2978
2979
2980bool
2981duplicate_array::_FindInternal(off_t value, int32& index) const
2982{
2983	int32 min = 0, max = Count() - 1;
2984	off_t cmp;
2985	while (min <= max) {
2986		index = (min + max) / 2;
2987
2988		cmp = ValueAt(index) - value;
2989		if (cmp < 0)
2990			min = index + 1;
2991		else if (cmp > 0)
2992			max = index - 1;
2993		else
2994			return true;
2995	}
2996	return false;
2997}
2998
2999
3000void
3001duplicate_array::Insert(off_t value)
3002{
	// if there are more than 8 values in this array, use a
	// binary search; if not, just iterate linearly to find
	// the insertion point
	int32 size = Count();
	int32 i;
	if (size > 8) {
3009		if (!_FindInternal(value, i) && ValueAt(i) <= value)
3010			i++;
3011	} else {
3012		for (i = 0; i < size; i++) {
3013			if (ValueAt(i) > value)
3014				break;
3015		}
3016	}
3017
3018	memmove(&values[i + 1], &values[i], (size - i) * sizeof(off_t));
3019	values[i] = HOST_ENDIAN_TO_BFS_INT64(value);
3020	count = HOST_ENDIAN_TO_BFS_INT64(size + 1);
3021}
3022
3023
3024bool
3025duplicate_array::Remove(off_t value)
3026{
3027	int32 index = Find(value);
3028	if (index == -1)
3029		return false;
3030
3031	int32 newSize = Count() - 1;
3032	memmove(&values[index], &values[index + 1],
3033		(newSize - index) * sizeof(off_t));
3034	count = HOST_ENDIAN_TO_BFS_INT64(newSize);
3035
3036	return true;
3037}
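

/*	A small sketch of the duplicate_array semantics above; the array keeps
	its values sorted. The backing storage here is only an illustrative
	buffer - the real arrays live inside fragment and duplicate nodes:

		char buffer[1024] = {};
		duplicate_array* array = (duplicate_array*)buffer;
		array->Insert(42);
		array->Insert(17);					// values stay sorted: 17, 42
		bool found = array->Find(42) >= 0;	// true
		bool removed = array->Remove(17);	// true, Count() drops to 1
*/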
3038
3039
3040// #pragma mark -
3041
3042
3043int32
3044compareKeys(type_code type, const void* key1, int keyLength1,
3045	const void* key2, int keyLength2)
3046{
3047	// if one of the keys is NULL, bail out gracefully
3048	if (key1 == NULL || key2 == NULL) {
3049		// even if it's most probably a bug in the calling code, we try to
3050		// give a meaningful result
3051		if (key1 == NULL && key2 != NULL)
3052			return 1;
3053		if (key2 == NULL && key1 != NULL)
3054			return -1;
3055
3056		return 0;
3057	}
3058
3059	switch (type) {
		case B_STRING_TYPE:
		{
3062			int result = memcmp(key1, key2, min_c(keyLength1, keyLength2));
3063			if (result == 0) {
3064				// ignore trailing null bytes
3065				if ((keyLength1 == keyLength2 + 1
3066						&& ((uint8*)key1)[keyLength2] == '\0')
3067					|| (keyLength2 == keyLength1 + 1
3068						&& ((uint8*)key2)[keyLength1] == '\0'))
3069					return 0;
3070
3071				result = keyLength1 - keyLength2;
3072			}
3073
3074			return result;
3075		}
3076
3077		case B_SSIZE_T_TYPE:
3078		case B_INT32_TYPE:
3079			return *(int32*)key1 - *(int32*)key2;
3080
3081		case B_SIZE_T_TYPE:
3082		case B_UINT32_TYPE:
3083			if (*(uint32*)key1 == *(uint32*)key2)
3084				return 0;
3085			if (*(uint32*)key1 > *(uint32*)key2)
3086				return 1;
3087
3088			return -1;
3089
3090		case B_OFF_T_TYPE:
3091		case B_INT64_TYPE:
3092			if (*(int64*)key1 == *(int64*)key2)
3093				return 0;
3094			if (*(int64*)key1 > *(int64*)key2)
3095				return 1;
3096
3097			return -1;
3098
3099		case B_UINT64_TYPE:
3100			if (*(uint64*)key1 == *(uint64*)key2)
3101				return 0;
3102			if (*(uint64*)key1 > *(uint64*)key2)
3103				return 1;
3104
3105			return -1;
3106
3107		case B_FLOAT_TYPE:
3108		{
3109			float result = *(float*)key1 - *(float*)key2;
3110			if (result == 0.0f)
3111				return 0;
3112
3113			return (result < 0.0f) ? -1 : 1;
3114		}
3115
3116		case B_DOUBLE_TYPE:
3117		{
3118			double result = *(double*)key1 - *(double*)key2;
3119			if (result == 0.0)
3120				return 0;
3121
3122			return (result < 0.0) ? -1 : 1;
3123		}
3124	}
3125
3126	// if the type is unknown, the entries don't match...
3127	return -1;
3128}
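

/*	Example of the trailing NUL handling above: a string key stored with and
	without its terminating NUL byte compares as equal; otherwise, equal
	prefixes are ordered by length:

		compareKeys(B_STRING_TYPE, "foo", 3, "foo", 4);		// == 0
		compareKeys(B_STRING_TYPE, "foo", 3, "foobar", 6);	// < 0
*/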
3129
3130