1/*
2 * Copyright 2001-2017, Axel D��rfler, axeld@pinc-software.de.
3 * This file may be used under the terms of the MIT License.
4 *
5 * Roughly based on 'btlib' written by Marcus J. Ranum - it shares
6 * no code but achieves binary compatibility with the on disk format.
7 */
8
9
10//! B+Tree implementation
11
12
13#include "BPlusTree.h"
14
15#include <file_systems/QueryParserUtils.h>
16
17#include "Debug.h"
18#include "Utility.h"
19
20#if !_BOOT_MODE
21#	include "Inode.h"
22#else
23#	include "Stream.h"
24
25// BFS::Stream from the bootloader has the same API as Inode.
26#	define Inode BFS::Stream
27
28#	define strerror(x)		"error message unavailable"
29
30namespace BFS {
31#endif
32
33
34/*!	Simple array used for the duplicate handling in the B+Tree. This is an
35	on disk structure.
36*/
37struct duplicate_array {
38	off_t	count;
39	off_t	values[0];
40
41	inline bool IsEmpty() const
42	{
43		return count == 0;
44	}
45
46	inline int32 Count() const
47	{
48		return (int32)BFS_ENDIAN_TO_HOST_INT64(count);
49	}
50
51	inline off_t ValueAt(uint32 index) const
52	{
53		return BFS_ENDIAN_TO_HOST_INT64(values[index]);
54	}
55
56	inline void SetValueAt(uint32 index, off_t value)
57	{
58		values[index] = HOST_ENDIAN_TO_BFS_INT64(value);
59	}
60
61	inline int32 Find(off_t value) const
62	{
63		int32 i;
64		return _FindInternal(value, i) ? i : -1;
65	}
66
67	void Insert(off_t value);
68	bool Remove(off_t value);
69
70private:
71	bool _FindInternal(off_t value, int32& index) const;
72} _PACKED;
73
74
75#ifdef DEBUG
76class NodeChecker {
77public:
78	NodeChecker(const bplustree_node* node, int32 nodeSize, const char* text)
79		:
80		fNode(node),
81		fSize(nodeSize),
82		fText(text)
83	{
84		Check("integrity check failed on construction.");
85	}
86
87	~NodeChecker()
88	{
89		Check("integrity check failed on destruction.");
90	}
91
92	void
93	Check(const char* message)
94	{
95		if (fNode->CheckIntegrity(fSize) != B_OK) {
96			dprintf("%s: %s\n", fText, message);
97			DEBUGGER(("NodeChecker integrity check failed!"));
98		}
99	}
100
101private:
102	const bplustree_node*	fNode;
103	int32					fSize;
104	const char*				fText;
105};
106#endif // DEBUG
107
108
109#if !_BOOT_MODE
110class BitmapArray {
111public:
112								BitmapArray(size_t numBits);
113								~BitmapArray();
114
115			status_t			InitCheck() const;
116
117			bool				IsSet(size_t index) const;
118			void				Set(size_t index, bool set);
119
120			size_t				CountSet() const { return fCountSet; }
121
122private:
123			uint8*				fBitmap;
124			size_t				fSize;
125			size_t				fCountSet;
126};
127
128
129struct TreeCheck {
130	TreeCheck(BPlusTree* tree)
131		:
132		fLevelCount(0),
133		fFreeCount(0),
134		fNodeSize(tree->NodeSize()),
135		fMaxLevels(tree->fHeader.MaxNumberOfLevels()),
136		fFoundErrors(0),
137		fVisited(tree->Stream()->Size() / tree->NodeSize()),
138		fVisitedFragment(tree->Stream()->Size() / tree->NodeSize())
139	{
140		fPreviousOffsets = (off_t*)malloc(
141			sizeof(off_t) * tree->fHeader.MaxNumberOfLevels());
142		if (fPreviousOffsets != NULL) {
143			for (size_t i = 0; i < fMaxLevels; i++)
144				fPreviousOffsets[i] = BPLUSTREE_NULL;
145		}
146	}
147
148	~TreeCheck()
149	{
150		free(fPreviousOffsets);
151	}
152
153	status_t InitCheck() const
154	{
155		if (fPreviousOffsets == NULL)
156			return B_NO_MEMORY;
157
158		status_t status = fVisited.InitCheck();
159		if (status != B_OK)
160			return status;
161
162		return fVisitedFragment.InitCheck();
163	}
164
165	bool Visited(off_t offset) const
166	{
167		return fVisited.IsSet(offset / fNodeSize);
168	}
169
170	void SetVisited(off_t offset)
171	{
172		fVisited.Set(offset / fNodeSize, true);
173	}
174
175	size_t VisitedCount() const
176	{
177		return fVisited.CountSet();
178	}
179
180	bool VisitedFragment(off_t offset) const
181	{
182		return fVisitedFragment.IsSet(offset / fNodeSize);
183	}
184
185	void SetVisitedFragment(off_t offset)
186	{
187		fVisitedFragment.Set(offset / fNodeSize, true);
188	}
189
190	uint32 MaxLevels() const
191	{
192		return fLevelCount;
193	}
194
195	void SetLevel(uint32 level)
196	{
197		if (fLevelCount < level)
198			fLevelCount = level;
199	}
200
201	off_t PreviousOffset(uint32 level)
202	{
203		return fPreviousOffsets[level];
204	}
205
206	void SetPreviousOffset(uint32 level, off_t offset)
207	{
208		fPreviousOffsets[level] = offset;
209	}
210
211	void FoundError()
212	{
213		fFoundErrors++;
214	}
215
216	bool ErrorsFound()
217	{
218		return fFoundErrors != 0;
219	}
220
221private:
222			uint32				fLevelCount;
223			uint32				fFreeCount;
224			uint32				fNodeSize;
225			uint32				fMaxLevels;
226			uint32				fFoundErrors;
227			BitmapArray			fVisited;
228			BitmapArray			fVisitedFragment;
229			off_t*				fPreviousOffsets;
230};
231
232
233// #pragma mark -
234
235
236// Node Caching for the BPlusTree class
237//
238// With write support, there is the need for a function that allocates new
239// nodes by either returning empty nodes, or by growing the file's data stream
240//
241// !! The CachedNode class assumes that you have properly locked the stream
242// !! before asking for nodes.
243//
244// Note: This code will fail if the block size is smaller than the node size!
245// Since BFS supports block sizes of 1024 bytes or greater, and the node size
246// is hard-coded to 1024 bytes, that's not an issue now.
247
248void
249CachedNode::UnsetUnchanged(Transaction& transaction)
250{
251	if (fTree == NULL || fTree->fStream == NULL)
252		return;
253
254	if (fNode != NULL) {
255		void* cache = fTree->fStream->GetVolume()->BlockCache();
256
257		block_cache_set_dirty(cache, fBlockNumber, false, transaction.ID());
258		block_cache_put(cache, fBlockNumber);
259		fNode = NULL;
260	}
261}
262#endif // !_BOOT_MODE
263
264
265void
266CachedNode::Unset()
267{
268	if (fTree == NULL || fTree->fStream == NULL)
269		return;
270
271	if (fNode != NULL) {
272#if !_BOOT_MODE
273		if (fWritable && fOffset == 0) {
274			// The B+tree header has been updated - we need to update the
275			// BPlusTrees copy of it, as well.
276			memcpy(&fTree->fHeader, fNode, sizeof(bplustree_header));
277		}
278
279		block_cache_put(fTree->fStream->GetVolume()->BlockCache(),
280			fBlockNumber);
281#endif // !_BOOT_MODE
282
283		fNode = NULL;
284	}
285}
286
287
288const bplustree_node*
289CachedNode::SetTo(off_t offset, bool check)
290{
291	const bplustree_node* node;
292	if (SetTo(offset, &node, check) == B_OK)
293		return node;
294
295	return NULL;
296}
297
298
299status_t
300CachedNode::SetTo(off_t offset, const bplustree_node** _node, bool check)
301{
302	if (fTree == NULL || fTree->fStream == NULL)
303		RETURN_ERROR(B_BAD_VALUE);
304
305	Unset();
306
307	// You can only ask for nodes at valid positions - you can't
308	// even access the b+tree header with this method (use SetToHeader()
309	// instead)
310	if (offset > fTree->fHeader.MaximumSize() - fTree->fNodeSize
311		|| offset <= 0
312		|| (offset % fTree->fNodeSize) != 0) {
313		RETURN_ERROR(B_BAD_VALUE);
314	}
315
316	if (InternalSetTo(NULL, offset) != NULL && check) {
317		// sanity checks (links, all_key_count)
318		if (!fTree->fHeader.CheckNode(fNode)) {
319			FATAL(("invalid node [%p] read from offset %" B_PRIdOFF " (block %"
320				B_PRIdOFF "), inode at %" B_PRIdINO "\n", fNode, offset,
321				fBlockNumber, fTree->fStream->ID()));
322			return B_BAD_DATA;
323		}
324	}
325
326	*_node = fNode;
327	return B_OK;
328}
329
330
331#if !_BOOT_MODE
332bplustree_node*
333CachedNode::SetToWritable(Transaction& transaction, off_t offset, bool check)
334{
335	if (fTree == NULL || fTree->fStream == NULL) {
336		REPORT_ERROR(B_BAD_VALUE);
337		return NULL;
338	}
339
340	Unset();
341
342	// You can only ask for nodes at valid positions - you can't
343	// even access the b+tree header with this method (use SetToHeader()
344	// instead)
345	if (offset > fTree->fHeader.MaximumSize() - fTree->fNodeSize
346		|| offset <= 0
347		|| (offset % fTree->fNodeSize) != 0)
348		return NULL;
349
350	if (InternalSetTo(&transaction, offset) != NULL && check) {
351		// sanity checks (links, all_key_count)
352		if (!fTree->fHeader.CheckNode(fNode)) {
353			FATAL(("invalid node [%p] read from offset %" B_PRIdOFF " (block %"
354				B_PRIdOFF "), inode at %" B_PRIdINO "\n", fNode, offset,
355				fBlockNumber, fTree->fStream->ID()));
356			return NULL;
357		}
358	}
359	return fNode;
360}
361
362
363bplustree_node*
364CachedNode::MakeWritable(Transaction& transaction)
365{
366	if (fNode == NULL)
367		return NULL;
368
369	if (block_cache_make_writable(transaction.GetVolume()->BlockCache(),
370			fBlockNumber, transaction.ID()) == B_OK) {
371		return fNode;
372	}
373
374	return NULL;
375}
376#endif // !_BOOT_MODE
377
378
379const bplustree_header*
380CachedNode::SetToHeader()
381{
382	if (fTree == NULL || fTree->fStream == NULL) {
383		REPORT_ERROR(B_BAD_VALUE);
384		return NULL;
385	}
386
387	Unset();
388
389	InternalSetTo(NULL, 0LL);
390	return (bplustree_header*)fNode;
391}
392
393
394#if !_BOOT_MODE
395bplustree_header*
396CachedNode::SetToWritableHeader(Transaction& transaction)
397{
398	if (fTree == NULL || fTree->fStream == NULL) {
399		REPORT_ERROR(B_BAD_VALUE);
400		return NULL;
401	}
402
403	Unset();
404
405	InternalSetTo(&transaction, 0LL);
406
407	if (fNode != NULL && !fTree->fInTransaction) {
408		transaction.AddListener(fTree);
409		fTree->fInTransaction = true;
410
411		if (!transaction.GetVolume()->IsInitializing()) {
412			acquire_vnode(transaction.GetVolume()->FSVolume(),
413				fTree->fStream->ID());
414		}
415	}
416
417	return (bplustree_header*)fNode;
418}
419#endif // !_BOOT_MODE
420
421
422bplustree_node*
423CachedNode::InternalSetTo(Transaction* transaction, off_t offset)
424{
425	fNode = NULL;
426	fOffset = offset;
427
428	off_t fileOffset;
429	block_run run;
430	if (offset < fTree->fStream->Size()
431		&& fTree->fStream->FindBlockRun(offset, run, fileOffset) == B_OK) {
432
433#if !_BOOT_MODE
434		Volume* volume = fTree->fStream->GetVolume();
435#else
436		Volume* volume = &fTree->fStream->GetVolume();
437#endif
438
439		int32 blockOffset = (offset - fileOffset) / volume->BlockSize();
440		fBlockNumber = volume->ToBlock(run) + blockOffset;
441		uint8* block = NULL;
442
443#if !_BOOT_MODE
444		if (transaction != NULL) {
445			block = (uint8*)block_cache_get_writable(volume->BlockCache(),
446				fBlockNumber, transaction->ID());
447			fWritable = true;
448		} else {
449			block = (uint8*)block_cache_get(volume->BlockCache(), fBlockNumber);
450			fWritable = false;
451		}
452#else // !_BOOT_MODE
453		if (fBlock == NULL) {
454			fBlock = (uint8*)malloc(volume->BlockSize());
455			if (fBlock == NULL)
456				return NULL;
457		}
458
459		if (read_pos(volume->Device(), fBlockNumber << volume->BlockShift(),
460				fBlock, volume->BlockSize()) == (ssize_t)volume->BlockSize()) {
461			block = fBlock;
462		}
463
464		fWritable = false;
465#endif // _BOOT_MODE
466
467		if (block != NULL) {
468			// The node is somewhere in that block...
469			// (confusing offset calculation)
470			fNode = (bplustree_node*)(block + offset
471				- (fileOffset + (blockOffset << volume->BlockShift())));
472		} else
473			REPORT_ERROR(B_IO_ERROR);
474	}
475	return fNode;
476}
477
478
479#if !_BOOT_MODE
480status_t
481CachedNode::Free(Transaction& transaction, off_t offset)
482{
483	if (fTree == NULL || fTree->fStream == NULL || offset == BPLUSTREE_NULL)
484		RETURN_ERROR(B_BAD_VALUE);
485
486	// TODO: scan the free nodes list and remove all nodes at the end
487	// of the tree - perhaps that shouldn't be done everytime that
488	// function is called, perhaps it should be done when the directory
489	// inode is closed or based on some calculation or whatever...
490
491	CachedNode cached(fTree);
492	bplustree_header* header = cached.SetToWritableHeader(transaction);
493	if (header == NULL)
494		return B_IO_ERROR;
495
496#if 0
497	// TODO: temporarily disabled because CheckNode() doesn't like this...
498	// 		Also, it's such an edge case that it's almost useless, anyway.
499	// if the node is the last one in the tree, we shrink
500	// the tree and file size by one node
501	off_t lastOffset = header->MaximumSize() - fTree->fNodeSize;
502	if (offset == lastOffset) {
503		status_t status = fTree->fStream->SetFileSize(transaction, lastOffset);
504		if (status != B_OK)
505			return status;
506
507		header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(lastOffset);
508		return B_OK;
509	}
510#endif
511
512	// add the node to the free nodes list
513	fNode->left_link = header->free_node_pointer;
514	fNode->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_FREE);
515
516	header->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64(offset);
517	return B_OK;
518}
519
520
521status_t
522CachedNode::Allocate(Transaction& transaction, bplustree_node** _node,
523	off_t* _offset)
524{
525	if (fTree == NULL || fTree->fStream == NULL)
526		RETURN_ERROR(B_BAD_VALUE);
527
528	Unset();
529
530	bplustree_header* header;
531	status_t status;
532
533	// if there are any free nodes, recycle them
534	if (SetToWritable(transaction, fTree->fHeader.FreeNode(), false) != NULL) {
535		CachedNode cached(fTree);
536		header = cached.SetToWritableHeader(transaction);
537		if (header == NULL)
538			return B_IO_ERROR;
539
540		*_offset = header->FreeNode();
541		*_node = fNode;
542
543		// set new free node pointer
544		header->free_node_pointer = fNode->left_link;
545		fNode->Initialize();
546		return B_OK;
547	}
548
549	// allocate space for a new node
550	Inode* stream = fTree->fStream;
551	if ((status = stream->Append(transaction, fTree->fNodeSize)) != B_OK)
552		return status;
553
554	CachedNode cached(fTree);
555	header = cached.SetToWritableHeader(transaction);
556	if (header == NULL)
557		return B_IO_ERROR;
558
559	// the maximum_size has to be changed before the call to SetTo() - or
560	// else it will fail because the requested node is out of bounds
561	off_t offset = header->MaximumSize();
562	header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(header->MaximumSize()
563		+ fTree->fNodeSize);
564
565	cached.Unset();
566		// SetToWritable() below needs the new values in the tree's header
567
568	if (SetToWritable(transaction, offset, false) == NULL)
569		RETURN_ERROR(B_ERROR);
570
571	fNode->Initialize();
572
573	*_offset = offset;
574	*_node = fNode;
575	return B_OK;
576}
577
578
579//	#pragma mark -
580
581
582BPlusTree::BPlusTree(Transaction& transaction, Inode* stream, int32 nodeSize)
583	:
584	fStream(NULL),
585	fInTransaction(false)
586{
587	mutex_init(&fIteratorLock, "bfs b+tree iterator");
588	SetTo(transaction, stream);
589}
590#endif // !_BOOT_MODE
591
592
593BPlusTree::BPlusTree(Inode* stream)
594	:
595	fStream(NULL),
596	fInTransaction(false)
597{
598#if !_BOOT_MODE
599	mutex_init(&fIteratorLock, "bfs b+tree iterator");
600#endif
601
602	SetTo(stream);
603}
604
605
606BPlusTree::BPlusTree()
607	:
608	fStream(NULL),
609	fNodeSize(BPLUSTREE_NODE_SIZE),
610	fAllowDuplicates(true),
611	fInTransaction(false),
612	fStatus(B_NO_INIT)
613{
614#if !_BOOT_MODE
615	mutex_init(&fIteratorLock, "bfs b+tree iterator");
616#endif
617}
618
619
620BPlusTree::~BPlusTree()
621{
622#if !_BOOT_MODE
623	// if there are any TreeIterators left, we need to stop them
624	// (can happen when the tree's inode gets deleted while
625	// traversing the tree - a TreeIterator doesn't lock the inode)
626	mutex_lock(&fIteratorLock);
627
628	SinglyLinkedList<TreeIterator>::Iterator iterator
629		= fIterators.GetIterator();
630	while (iterator.HasNext())
631		iterator.Next()->Stop();
632
633	mutex_destroy(&fIteratorLock);
634
635	ASSERT(!fInTransaction);
636#endif // !_BOOT_MODE
637}
638
639
640#if !_BOOT_MODE
641/*! Create a new B+Tree on the specified stream */
642status_t
643BPlusTree::SetTo(Transaction& transaction, Inode* stream, int32 nodeSize)
644{
645	// initializes in-memory B+Tree
646
647	fStream = stream;
648
649	CachedNode cached(this);
650	bplustree_header* header = cached.SetToWritableHeader(transaction);
651	if (header == NULL) {
652		// allocate space for new header + node!
653		fStatus = stream->SetFileSize(transaction, nodeSize * 2);
654		if (fStatus != B_OK)
655			RETURN_ERROR(fStatus);
656
657		header = cached.SetToWritableHeader(transaction);
658		if (header == NULL)
659			RETURN_ERROR(fStatus = B_ERROR);
660	}
661
662	fAllowDuplicates = stream->IsIndex()
663		|| (stream->Mode() & S_ALLOW_DUPS) != 0;
664
665	fNodeSize = nodeSize;
666
667	// initialize b+tree header
668 	header->magic = HOST_ENDIAN_TO_BFS_INT32(BPLUSTREE_MAGIC);
669 	header->node_size = HOST_ENDIAN_TO_BFS_INT32(fNodeSize);
670 	header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1);
671 	header->data_type = HOST_ENDIAN_TO_BFS_INT32(ModeToKeyType(stream->Mode()));
672 	header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(nodeSize);
673 	header->free_node_pointer
674 		= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
675 	header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(nodeSize * 2);
676
677	cached.Unset();
678
679	// initialize b+tree root node
680	cached.SetToWritable(transaction, fHeader.RootNode(), false);
681	if (cached.Node() == NULL)
682		RETURN_ERROR(B_IO_ERROR);
683
684	cached.Node()->Initialize();
685
686	return fStatus = B_OK;
687}
688#endif // !_BOOT_MODE
689
690
691status_t
692BPlusTree::SetTo(Inode* stream)
693{
694	if (stream == NULL)
695		RETURN_ERROR(fStatus = B_BAD_VALUE);
696
697	fStream = stream;
698
699	// get on-disk B+Tree header
700
701	CachedNode cached(this);
702	const bplustree_header* header = cached.SetToHeader();
703	if (header != NULL)
704		memcpy(&fHeader, header, sizeof(bplustree_header));
705	else
706		RETURN_ERROR(fStatus = B_IO_ERROR);
707
708	// is header valid?
709
710	if (fHeader.MaximumSize() != stream->Size()) {
711		dprintf("B+tree header size %" B_PRIdOFF " doesn't fit file size %"
712			B_PRIdOFF "!\n", fHeader.MaximumSize(), stream->Size());
713		// we can't change the header since we don't have a transaction
714		//fHeader.maximum_size = HOST_ENDIAN_TO_BFS_INT64(stream->Size());
715	}
716	if (!fHeader.IsValid()) {
717#ifdef DEBUG
718		dump_bplustree_header(&fHeader);
719		dump_block((const char*)&fHeader, 128);
720#endif
721		RETURN_ERROR(fStatus = B_BAD_DATA);
722	}
723
724	fNodeSize = fHeader.NodeSize();
725
726	// validity check
727	static const uint32 kToMode[] = {S_STR_INDEX, S_INT_INDEX, S_UINT_INDEX,
728		S_LONG_LONG_INDEX, S_ULONG_LONG_INDEX, S_FLOAT_INDEX,
729		S_DOUBLE_INDEX};
730	uint32 mode = stream->Mode() & (S_STR_INDEX | S_INT_INDEX
731		| S_UINT_INDEX | S_LONG_LONG_INDEX | S_ULONG_LONG_INDEX
732		| S_FLOAT_INDEX | S_DOUBLE_INDEX);
733
734	if (fHeader.DataType() > BPLUSTREE_DOUBLE_TYPE
735		|| ((stream->Mode() & S_INDEX_DIR) != 0
736			&& kToMode[fHeader.DataType()] != mode)
737		|| !stream->IsContainer()) {
738		D(	dump_bplustree_header(&fHeader);
739			dump_inode(&stream->Node());
740		);
741		RETURN_ERROR(fStatus = B_BAD_TYPE);
742	}
743
744	// although it's in stat.h, the S_ALLOW_DUPS flag is obviously unused
745	// in the original BFS code - we will honour it nevertheless
746	fAllowDuplicates = stream->IsIndex()
747		|| (stream->Mode() & S_ALLOW_DUPS) != 0;
748
749	cached.SetTo(fHeader.RootNode());
750	RETURN_ERROR(fStatus = cached.Node() ? B_OK : B_BAD_DATA);
751}
752
753
754status_t
755BPlusTree::InitCheck()
756{
757	return fStatus;
758}
759
760
761#if !_BOOT_MODE
762status_t
763BPlusTree::Validate(bool repair, bool& _errorsFound)
764{
765	TreeCheck check(this);
766	if (check.InitCheck() != B_OK)
767		return B_NO_MEMORY;
768
769	check.SetVisited(0);
770
771	// Walk the free nodes
772
773	CachedNode cached(this);
774	off_t freeOffset = fHeader.FreeNode();
775	while (freeOffset > 0) {
776		const bplustree_node* node;
777		status_t status = cached.SetTo(freeOffset, &node, false);
778		if (status != B_OK) {
779			if (status == B_IO_ERROR)
780				return B_IO_ERROR;
781
782			dprintf("inode %" B_PRIdOFF ": free node at %" B_PRIdOFF " could "
783				"not be read: %s\n", fStream->ID(), freeOffset,
784				strerror(status));
785			check.FoundError();
786			break;
787		}
788
789		if (check.Visited(freeOffset)) {
790			dprintf("inode %" B_PRIdOFF ": free node at %" B_PRIdOFF
791				" circular!\n", fStream->ID(), freeOffset);
792			// TODO: if 'repair' is true, we could collect all unvisited nodes
793			// at the end, and put the into the free list
794			check.FoundError();
795			break;
796		}
797
798		check.SetVisited(freeOffset);
799
800		if (node->OverflowLink() != BPLUSTREE_FREE) {
801			dprintf("inode %" B_PRIdOFF ": free node at %" B_PRIdOFF
802				" misses free mark!\n", fStream->ID(), freeOffset);
803		}
804		freeOffset = node->LeftLink();
805	}
806
807	// Iterate over the complete tree recursively
808
809	const bplustree_node* root;
810	status_t status = cached.SetTo(fHeader.RootNode(), &root, true);
811	if (status != B_OK)
812		return status;
813
814	status = _ValidateChildren(check, 0, fHeader.RootNode(), NULL, 0, root);
815
816	if (check.ErrorsFound())
817		_errorsFound = true;
818
819	if (status != B_OK)
820		return status;
821
822	if (check.MaxLevels() + 1 != fHeader.MaxNumberOfLevels()) {
823		dprintf("inode %" B_PRIdOFF ": found %" B_PRIu32 " max levels, "
824			"declared %" B_PRIu32 "!\n", fStream->ID(), check.MaxLevels(),
825			fHeader.MaxNumberOfLevels());
826	}
827
828	if ((off_t)check.VisitedCount() != fHeader.MaximumSize() / fNodeSize) {
829		dprintf("inode %" B_PRIdOFF ": visited %" B_PRIuSIZE " from %" B_PRIdOFF
830			" nodes.\n", fStream->ID(), check.VisitedCount(),
831			fHeader.MaximumSize() / fNodeSize);
832	}
833
834	return B_OK;
835}
836
837
838status_t
839BPlusTree::MakeEmpty()
840{
841	// Put all nodes into the free list in order
842	Transaction transaction(fStream->GetVolume(), fStream->BlockNumber());
843
844	// Reset the header, and root node
845	CachedNode cached(this);
846	bplustree_header* header = cached.SetToWritableHeader(transaction);
847	if (header == NULL)
848		return B_IO_ERROR;
849
850	header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1);
851	header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(NodeSize());
852	if (fStream->Size() > (off_t)NodeSize() * 2)
853		header->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64(2 * NodeSize());
854	else {
855		header->free_node_pointer
856			= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
857	}
858
859	bplustree_node* node = cached.SetToWritable(transaction, NodeSize(), false);
860	if (node == NULL)
861		return B_IO_ERROR;
862
863	node->left_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
864	node->right_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
865	node->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
866	node->all_key_count = 0;
867	node->all_key_length = 0;
868
869	for (off_t offset = 2 * NodeSize(); offset < fStream->Size();
870			offset += NodeSize()) {
871		bplustree_node* node = cached.SetToWritable(transaction, offset, false);
872		if (node == NULL) {
873			dprintf("--> could not open %" B_PRIdOFF "\n", offset);
874			return B_IO_ERROR;
875		}
876		if (offset < fStream->Size() - (off_t)NodeSize())
877			node->left_link = HOST_ENDIAN_TO_BFS_INT64(offset + NodeSize());
878		else
879			node->left_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
880
881		node->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_FREE);
882
883		// It's not important to write it out in a single transaction; just
884		// make sure it doesn't get too large
885		if (offset % (1024 * 1024) == 0) {
886			transaction.Done();
887			transaction.Start(fStream->GetVolume(), fStream->BlockNumber());
888		}
889	}
890
891	return transaction.Done();
892}
893
894
895int32
896BPlusTree::TypeCodeToKeyType(type_code code)
897{
898	switch (code) {
899		case B_STRING_TYPE:
900			return BPLUSTREE_STRING_TYPE;
901		case B_SSIZE_T_TYPE:
902		case B_INT32_TYPE:
903			return BPLUSTREE_INT32_TYPE;
904		case B_SIZE_T_TYPE:
905		case B_UINT32_TYPE:
906			return BPLUSTREE_UINT32_TYPE;
907		case B_OFF_T_TYPE:
908		case B_INT64_TYPE:
909			return BPLUSTREE_INT64_TYPE;
910		case B_UINT64_TYPE:
911			return BPLUSTREE_UINT64_TYPE;
912		case B_FLOAT_TYPE:
913			return BPLUSTREE_FLOAT_TYPE;
914		case B_DOUBLE_TYPE:
915			return BPLUSTREE_DOUBLE_TYPE;
916	}
917	return -1;
918}
919
920
921int32
922BPlusTree::ModeToKeyType(mode_t mode)
923{
924	switch (mode & (S_STR_INDEX | S_INT_INDEX | S_UINT_INDEX | S_LONG_LONG_INDEX
925			| S_ULONG_LONG_INDEX | S_FLOAT_INDEX | S_DOUBLE_INDEX)) {
926		case S_INT_INDEX:
927			return BPLUSTREE_INT32_TYPE;
928		case S_UINT_INDEX:
929			return BPLUSTREE_UINT32_TYPE;
930		case S_LONG_LONG_INDEX:
931			return BPLUSTREE_INT64_TYPE;
932		case S_ULONG_LONG_INDEX:
933			return BPLUSTREE_UINT64_TYPE;
934		case S_FLOAT_INDEX:
935			return BPLUSTREE_FLOAT_TYPE;
936		case S_DOUBLE_INDEX:
937			return BPLUSTREE_DOUBLE_TYPE;
938		case S_STR_INDEX:
939		default:
940			// default is for standard directories
941			return BPLUSTREE_STRING_TYPE;
942	}
943}
944#endif // !_BOOT_MODE
945
946
947//	#pragma mark - TransactionListener implementation
948
949
950#if !_BOOT_MODE
951void
952BPlusTree::TransactionDone(bool success)
953{
954	if (!success) {
955		// update header from disk
956		CachedNode cached(this);
957		const bplustree_header* header = cached.SetToHeader();
958		if (header != NULL)
959			memcpy(&fHeader, header, sizeof(bplustree_header));
960	}
961}
962
963
964void
965BPlusTree::RemovedFromTransaction()
966{
967	fInTransaction = false;
968
969	if (!fStream->GetVolume()->IsInitializing())
970		put_vnode(fStream->GetVolume()->FSVolume(), fStream->ID());
971}
972
973
974//	#pragma mark -
975
976
977void
978BPlusTree::_UpdateIterators(off_t offset, off_t nextOffset, uint16 keyIndex,
979	uint16 splitAt, int8 change)
980{
981	// Although every iterator which is affected by this update currently
982	// waits on a semaphore, other iterators could be added/removed at
983	// any time, so we need to protect this loop
984	MutexLocker _(fIteratorLock);
985
986	SinglyLinkedList<TreeIterator>::Iterator iterator
987		= fIterators.GetIterator();
988	while (iterator.HasNext())
989		iterator.Next()->Update(offset, nextOffset, keyIndex, splitAt, change);
990}
991
992
993void
994BPlusTree::_AddIterator(TreeIterator* iterator)
995{
996	MutexLocker _(fIteratorLock);
997	fIterators.Add(iterator);
998}
999
1000
1001void
1002BPlusTree::_RemoveIterator(TreeIterator* iterator)
1003{
1004	MutexLocker _(fIteratorLock);
1005	fIterators.Remove(iterator);
1006}
1007#endif // !_BOOT_MODE
1008
1009
1010int32
1011BPlusTree::_CompareKeys(const void* key1, int keyLength1, const void* key2,
1012	int keyLength2)
1013{
1014	type_code type = 0;
1015	switch (fHeader.DataType()) {
1016	    case BPLUSTREE_STRING_TYPE:
1017	    	type = B_STRING_TYPE;
1018	    	break;
1019		case BPLUSTREE_INT32_TYPE:
1020	    	type = B_INT32_TYPE;
1021	    	break;
1022		case BPLUSTREE_UINT32_TYPE:
1023	    	type = B_UINT32_TYPE;
1024	    	break;
1025		case BPLUSTREE_INT64_TYPE:
1026	    	type = B_INT64_TYPE;
1027	    	break;
1028		case BPLUSTREE_UINT64_TYPE:
1029	    	type = B_UINT64_TYPE;
1030	    	break;
1031		case BPLUSTREE_FLOAT_TYPE:
1032	    	type = B_FLOAT_TYPE;
1033	    	break;
1034		case BPLUSTREE_DOUBLE_TYPE:
1035	    	type = B_DOUBLE_TYPE;
1036	    	break;
1037	}
1038	return QueryParser::compareKeys(type, key1, keyLength1, key2, keyLength2);
1039}
1040
1041
1042status_t
1043BPlusTree::_FindKey(const bplustree_node* node, const uint8* key,
1044	uint16 keyLength, uint16* _index, off_t* _next)
1045{
1046#ifdef DEBUG
1047	NodeChecker checker(node, fNodeSize, "find");
1048#endif
1049
1050	if (node->all_key_count == 0) {
1051		if (_index)
1052			*_index = 0;
1053		if (_next)
1054			*_next = node->OverflowLink();
1055		return B_ENTRY_NOT_FOUND;
1056	}
1057
1058	Unaligned<off_t>* values = node->Values();
1059	int16 saveIndex = -1;
1060
1061	// binary search in the key array
1062	for (int16 first = 0, last = node->NumKeys() - 1; first <= last;) {
1063		uint16 i = (first + last) >> 1;
1064
1065		uint16 searchLength = 0;
1066		uint8* searchKey = node->KeyAt(i, &searchLength);
1067		if (searchKey + searchLength + sizeof(off_t) + sizeof(uint16)
1068				> (uint8*)node + fNodeSize
1069			|| searchLength > BPLUSTREE_MAX_KEY_LENGTH) {
1070#if !_BOOT_MODE
1071			fStream->GetVolume()->Panic();
1072#endif
1073			RETURN_ERROR(B_BAD_DATA);
1074		}
1075
1076		int32 cmp = _CompareKeys(key, keyLength, searchKey, searchLength);
1077		if (cmp < 0) {
1078			last = i - 1;
1079			saveIndex = i;
1080		} else if (cmp > 0) {
1081			saveIndex = first = i + 1;
1082		} else {
1083			if (_index)
1084				*_index = i;
1085			if (_next)
1086				*_next = BFS_ENDIAN_TO_HOST_INT64(values[i]);
1087			return B_OK;
1088		}
1089	}
1090
1091	if (_index)
1092		*_index = saveIndex;
1093	if (_next) {
1094		if (saveIndex == node->NumKeys())
1095			*_next = node->OverflowLink();
1096		else
1097			*_next = BFS_ENDIAN_TO_HOST_INT64(values[saveIndex]);
1098	}
1099	return B_ENTRY_NOT_FOUND;
1100}
1101
1102
1103#if !_BOOT_MODE
1104/*!	Prepares the stack to contain all nodes that were passed while
1105	following the key, from the root node to the leaf node that could
1106	or should contain that key.
1107*/
1108status_t
1109BPlusTree::_SeekDown(Stack<node_and_key>& stack, const uint8* key,
1110	uint16 keyLength)
1111{
1112	// set the root node to begin with
1113	node_and_key nodeAndKey;
1114	nodeAndKey.nodeOffset = fHeader.RootNode();
1115
1116	CachedNode cached(this);
1117	const bplustree_node* node;
1118	while ((node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
1119		// if we are already on leaf level, we're done
1120		if (node->OverflowLink() == BPLUSTREE_NULL) {
1121			// node that the keyIndex is not properly set here (but it's not
1122			// needed in the calling functions anyway)!
1123			nodeAndKey.keyIndex = 0;
1124			stack.Push(nodeAndKey);
1125			return B_OK;
1126		}
1127
1128		off_t nextOffset;
1129		status_t status = _FindKey(node, key, keyLength, &nodeAndKey.keyIndex,
1130			&nextOffset);
1131
1132		if (status == B_ENTRY_NOT_FOUND && nextOffset == nodeAndKey.nodeOffset)
1133			RETURN_ERROR(B_ERROR);
1134
1135		if ((uint32)stack.CountItems() > fHeader.MaxNumberOfLevels()) {
1136			dprintf("BPlusTree::_SeekDown() node walked too deep.\n");
1137			break;
1138		}
1139
1140		// put the node offset & the correct keyIndex on the stack
1141		stack.Push(nodeAndKey);
1142
1143		nodeAndKey.nodeOffset = nextOffset;
1144	}
1145
1146	FATAL(("BPlusTree::_SeekDown() could not open node %" B_PRIdOFF ", inode %"
1147		B_PRIdOFF "\n", nodeAndKey.nodeOffset, fStream->ID()));
1148	return B_ERROR;
1149}
1150
1151
1152/*!	This will find a free duplicate fragment in the given bplustree_node.
1153	The CachedNode will be set to the writable fragment on success.
1154*/
1155status_t
1156BPlusTree::_FindFreeDuplicateFragment(Transaction& transaction,
1157	const bplustree_node* node, CachedNode& cached,
1158	off_t* _offset, bplustree_node** _fragment, uint32* _index)
1159{
1160	Unaligned<off_t>* values = node->Values();
1161	for (int32 i = 0; i < node->NumKeys(); i++) {
1162		off_t value = BFS_ENDIAN_TO_HOST_INT64(values[i]);
1163
1164		// does the value link to a duplicate fragment?
1165		if (bplustree_node::LinkType(value) != BPLUSTREE_DUPLICATE_FRAGMENT)
1166			continue;
1167
1168		const bplustree_node* fragment = cached.SetTo(
1169			bplustree_node::FragmentOffset(value), false);
1170		if (fragment == NULL) {
1171			FATAL(("Could not get duplicate fragment at %" B_PRIdOFF ", inode %"
1172				B_PRIdOFF "\n", value, fStream->ID()));
1173			continue;
1174		}
1175
1176		// see if there is some space left for us
1177		uint32 num = bplustree_node::MaxFragments(fNodeSize);
1178		for (uint32 j = 0; j < num; j++) {
1179			duplicate_array* array = fragment->FragmentAt(j);
1180
1181			if (array->IsEmpty()) {
1182				// found an unused fragment
1183				*_fragment = cached.MakeWritable(transaction);
1184				if (*_fragment == NULL)
1185					return B_IO_ERROR;
1186
1187				*_offset = bplustree_node::FragmentOffset(value);
1188				*_index = j;
1189				return B_OK;
1190			}
1191		}
1192	}
1193	return B_ENTRY_NOT_FOUND;
1194}
1195
1196
1197status_t
1198BPlusTree::_InsertDuplicate(Transaction& transaction, CachedNode& cached,
1199	const bplustree_node* node, uint16 index, off_t value)
1200{
1201	CachedNode cachedDuplicate(this);
1202	Unaligned<off_t>* values = node->Values();
1203	off_t oldValue = BFS_ENDIAN_TO_HOST_INT64(values[index]);
1204	status_t status;
1205	off_t offset;
1206
1207	if (bplustree_node::IsDuplicate(oldValue)) {
1208		// If it's a duplicate fragment, try to insert it into that, or if it
1209		// doesn't fit anymore, create a new duplicate node
1210
1211		if (bplustree_node::LinkType(oldValue)
1212				== BPLUSTREE_DUPLICATE_FRAGMENT) {
1213			bplustree_node* duplicate = cachedDuplicate.SetToWritable(
1214				transaction, bplustree_node::FragmentOffset(oldValue), false);
1215			if (duplicate == NULL)
1216				return B_IO_ERROR;
1217
1218			duplicate_array* array = duplicate->FragmentAt(
1219				bplustree_node::FragmentIndex(oldValue));
1220			int32 arrayCount = array->Count();
1221			if (arrayCount > NUM_FRAGMENT_VALUES || arrayCount < 1) {
1222				FATAL(("_InsertDuplicate: Invalid array[%d] size in fragment "
1223					"%" B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n",
1224					(int)bplustree_node::FragmentIndex(oldValue),
1225					bplustree_node::FragmentOffset(oldValue), arrayCount,
1226					fStream->ID()));
1227				return B_BAD_DATA;
1228			}
1229
1230			if (arrayCount < NUM_FRAGMENT_VALUES) {
1231				array->Insert(value);
1232			} else {
1233				// Test if the fragment will be empty if we remove this key's
1234				// values
1235				if (duplicate->FragmentsUsed(fNodeSize) < 2) {
1236					// The node will be empty without our values, so let us
1237					// reuse it as a duplicate node
1238					offset = bplustree_node::FragmentOffset(oldValue);
1239
1240					memmove(duplicate->DuplicateArray(), array,
1241						(NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
1242					duplicate->left_link = duplicate->right_link
1243						= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
1244
1245					array = duplicate->DuplicateArray();
1246					array->Insert(value);
1247				} else {
1248					// Create a new duplicate node
1249
1250					cachedDuplicate.UnsetUnchanged(transaction);
1251						// The old duplicate has not been touched, so we can
1252						// reuse it
1253
1254					bplustree_node* newDuplicate;
1255					status = cachedDuplicate.Allocate(transaction,
1256						&newDuplicate, &offset);
1257					if (status != B_OK)
1258						RETURN_ERROR(status);
1259
1260					// Copy the array from the fragment node to the duplicate
1261					// node and free the old entry (by zero'ing all values)
1262					newDuplicate->overflow_link = array->count;
1263					memcpy(&newDuplicate->all_key_count, &array->values[0],
1264						array->Count() * sizeof(off_t));
1265					memset(array, 0, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
1266
1267					array = newDuplicate->DuplicateArray();
1268					array->Insert(value);
1269				}
1270
1271				// Update the main pointer to link to a duplicate node
1272				if (cached.MakeWritable(transaction) == NULL)
1273					return B_IO_ERROR;
1274
1275				values[index]
1276					= HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
1277						BPLUSTREE_DUPLICATE_NODE, offset));
1278			}
1279
1280			return B_OK;
1281		}
1282
1283		// Put the value into a dedicated duplicate node
1284
1285		// search for free space in the duplicate nodes of that key
1286		duplicate_array* array;
1287		int32 arrayCount;
1288		const bplustree_node* duplicate;
1289		off_t duplicateOffset;
1290		do {
1291			duplicateOffset = bplustree_node::FragmentOffset(oldValue);
1292			duplicate = cachedDuplicate.SetTo(duplicateOffset, false);
1293			if (duplicate == NULL)
1294				return B_IO_ERROR;
1295
1296			array = duplicate->DuplicateArray();
1297			arrayCount =array->Count();
1298			if (arrayCount > NUM_DUPLICATE_VALUES || arrayCount < 0) {
1299				FATAL(("_InsertDuplicate: Invalid array size in duplicate %"
1300					B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n",
1301					duplicateOffset, arrayCount, fStream->ID()));
1302				return B_BAD_DATA;
1303			}
1304		} while (arrayCount >= NUM_DUPLICATE_VALUES
1305				&& (oldValue = duplicate->RightLink()) != BPLUSTREE_NULL);
1306
1307		bplustree_node* writableDuplicate
1308			= cachedDuplicate.MakeWritable(transaction);
1309		if (writableDuplicate == NULL)
1310			return B_IO_ERROR;
1311
1312		if (arrayCount < NUM_DUPLICATE_VALUES) {
1313			array = writableDuplicate->DuplicateArray();
1314			array->Insert(value);
1315		} else {
1316			// no space left - add a new duplicate node
1317
1318			bplustree_node* newDuplicate;
1319			status = cachedDuplicate.Allocate(transaction, &newDuplicate,
1320				&offset);
1321			if (status != B_OK)
1322				RETURN_ERROR(status);
1323
1324			// link the two nodes together
1325			writableDuplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(offset);
1326			newDuplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(duplicateOffset);
1327
1328			array = newDuplicate->DuplicateArray();
1329			array->count = 0;
1330			array->Insert(value);
1331		}
1332		return B_OK;
1333	}
1334
1335	// Search for a free duplicate fragment or create a new one
1336	// to insert the duplicate value into
1337
1338	uint32 fragmentIndex = 0;
1339	bplustree_node* fragment;
1340	if (_FindFreeDuplicateFragment(transaction, node, cachedDuplicate,
1341			&offset, &fragment, &fragmentIndex) != B_OK) {
1342		// allocate a new duplicate fragment node
1343		status = cachedDuplicate.Allocate(transaction, &fragment, &offset);
1344		if (status != B_OK)
1345			RETURN_ERROR(status);
1346
1347		memset(fragment, 0, fNodeSize);
1348	}
1349
1350	duplicate_array* array = fragment->FragmentAt(fragmentIndex);
1351	array->Insert(oldValue);
1352	array->Insert(value);
1353
1354	if (cached.MakeWritable(transaction) == NULL)
1355		return B_IO_ERROR;
1356
1357	values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
1358		BPLUSTREE_DUPLICATE_FRAGMENT, offset, fragmentIndex));
1359
1360	return B_OK;
1361}
1362
1363
1364void
1365BPlusTree::_InsertKey(bplustree_node* node, uint16 index, uint8* key,
1366	uint16 keyLength, off_t value)
1367{
1368	// should never happen, but who knows?
1369	if (index > node->NumKeys())
1370		return;
1371
1372	Unaligned<off_t>* values = node->Values();
1373	Unaligned<uint16>* keyLengths = node->KeyLengths();
1374	uint8* keys = node->Keys();
1375
1376	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() + 1);
1377	node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(node->AllKeyLength()
1378		+ keyLength);
1379
1380	Unaligned<off_t>* newValues = node->Values();
1381	Unaligned<uint16>* newKeyLengths = node->KeyLengths();
1382
1383	// move values and copy new value into them
1384	memmove(newValues + index + 1, values + index,
1385		sizeof(off_t) * (node->NumKeys() - 1 - index));
1386	memmove(newValues, values, sizeof(off_t) * index);
1387
1388	newValues[index] = HOST_ENDIAN_TO_BFS_INT64(value);
1389
1390	// move and update key length index
1391	for (uint16 i = node->NumKeys(); i-- > index + 1;) {
1392		newKeyLengths[i] = HOST_ENDIAN_TO_BFS_INT16(
1393			BFS_ENDIAN_TO_HOST_INT16(keyLengths[i - 1]) + keyLength);
1394	}
1395	memmove(newKeyLengths, keyLengths, sizeof(uint16) * index);
1396
1397	int32 keyStart;
1398	newKeyLengths[index] = HOST_ENDIAN_TO_BFS_INT16(keyLength
1399		+ (keyStart = index > 0
1400			? BFS_ENDIAN_TO_HOST_INT16(newKeyLengths[index - 1]) : 0));
1401
1402	// move keys and copy new key into them
1403	uint16 length = BFS_ENDIAN_TO_HOST_INT16(newKeyLengths[index]);
1404	int32 size = node->AllKeyLength() - length;
1405	if (size > 0)
1406		memmove(keys + length, keys + length - keyLength, size);
1407
1408	memcpy(keys + keyStart, key, keyLength);
1409}
1410
1411
1412/*!	Splits the \a node into two halves - the other half will be put into
1413	\a other. It also takes care to create a new overflow link if the node
1414	to split is an index node.
1415*/
1416status_t
1417BPlusTree::_SplitNode(bplustree_node* node, off_t nodeOffset,
1418	bplustree_node* other, off_t otherOffset, uint16* _keyIndex, uint8* key,
1419	uint16* _keyLength, off_t* _value)
1420{
1421	if (*_keyIndex > node->NumKeys() + 1)
1422		return B_BAD_VALUE;
1423
1424	Unaligned<uint16>* inKeyLengths = node->KeyLengths();
1425	Unaligned<off_t>* inKeyValues = node->Values();
1426	uint8* inKeys = node->Keys();
1427	uint8* outKeys = other->Keys();
1428	int32 keyIndex = *_keyIndex;	// can become less than zero!
1429
1430	if (keyIndex > node->NumKeys()) {
1431		FATAL(("key index out of bounds: %d, num keys: %u, inode %" B_PRIdOFF
1432			"\n", (int)keyIndex, node->NumKeys(), fStream->ID()));
1433		return B_BAD_VALUE;
1434	}
1435
1436	// How many keys will fit in one (half) page?
1437	// The following loop will find the answer to this question and
1438	// change the key lengths indices for their new home
1439
1440	// "bytes" is the number of bytes written for the new key,
1441	// "bytesBefore" are the bytes before that key
1442	// "bytesAfter" are the bytes after the new key, if any
1443	int32 bytes = 0, bytesBefore = 0, bytesAfter = 0;
1444
1445	size_t size = fNodeSize >> 1;
1446	int32 out, in;
1447	size_t keyLengths = 0;
1448	for (in = out = 0; in < node->NumKeys() + 1;) {
1449		keyLengths = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]);
1450
1451		if (in == keyIndex && !bytes) {
1452			bytes = *_keyLength;
1453			bytesBefore = in > 0
1454				? BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]) : 0;
1455		} else {
1456			if (keyIndex < out)
1457				bytesAfter = keyLengths - bytesBefore;
1458
1459			in++;
1460		}
1461		out++;
1462
1463		if (key_align(sizeof(bplustree_node) + bytes + keyLengths)
1464				+ out * (sizeof(uint16) + sizeof(off_t)) >= size) {
1465			// we have found the number of keys in the new node!
1466			break;
1467		}
1468	}
1469
1470	// if the new key was not inserted, set the length of the keys
1471	// that can be copied directly
1472
1473	if (keyIndex >= out && in > 0)
1474		bytesBefore = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]);
1475	else if (keyIndex + 1 < out)
1476		bytesAfter = keyLengths - bytesBefore;
1477
1478	if (bytesBefore < 0 || bytesAfter < 0)
1479		return B_BAD_DATA;
1480
1481	other->left_link = node->left_link;
1482	other->right_link = HOST_ENDIAN_TO_BFS_INT64(nodeOffset);
1483	other->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore
1484		+ bytesAfter);
1485	other->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out);
1486
1487	Unaligned<uint16>* outKeyLengths = other->KeyLengths();
1488	Unaligned<off_t>* outKeyValues = other->Values();
1489	int32 keys = out > keyIndex ? keyIndex : out;
1490
1491	if (bytesBefore) {
1492		// copy the keys
1493		memcpy(outKeys, inKeys, bytesBefore);
1494		memcpy(outKeyLengths, inKeyLengths, keys * sizeof(uint16));
1495		memcpy(outKeyValues, inKeyValues, keys * sizeof(off_t));
1496	}
1497	if (bytes) {
1498		// copy the newly inserted key
1499		memcpy(outKeys + bytesBefore, key, bytes);
1500		outKeyLengths[keyIndex] = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore);
1501		outKeyValues[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(*_value);
1502
1503		if (bytesAfter) {
1504			// copy the keys after the new key
1505			memcpy(outKeys + bytesBefore + bytes, inKeys + bytesBefore,
1506				bytesAfter);
1507			keys = out - keyIndex - 1;
1508			for (int32 i = 0;i < keys;i++) {
1509				outKeyLengths[keyIndex + i + 1] = HOST_ENDIAN_TO_BFS_INT16(
1510					BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[keyIndex + i])
1511						+ bytes);
1512			}
1513			memcpy(outKeyValues + keyIndex + 1, inKeyValues + keyIndex,
1514				keys * sizeof(off_t));
1515		}
1516	}
1517
1518	// if the new key was already inserted, we shouldn't use it again
1519	if (in != out)
1520		keyIndex--;
1521
1522	int32 total = bytesBefore + bytesAfter;
1523
1524	// these variables are for the key that will be returned
1525	// to the parent node
1526	uint8* newKey = NULL;
1527	uint16 newLength = 0;
1528	bool newAllocated = false;
1529
1530	// If we have split an index node, we have to drop the first key
1531	// of the next node (which can also be the new key to insert).
1532	// The dropped key is also the one which has to be inserted in
1533	// the parent node, so we will set the "newKey" already here.
1534	if (node->OverflowLink() != BPLUSTREE_NULL) {
1535		if (in == keyIndex) {
1536			newKey = key;
1537			newLength = *_keyLength;
1538
1539			other->overflow_link = HOST_ENDIAN_TO_BFS_INT64(*_value);
1540			keyIndex--;
1541		} else {
1542			// If a key is dropped (is not the new key), we have to copy
1543			// it, because it would be lost if not.
1544			uint8* droppedKey = node->KeyAt(in, &newLength);
1545			if (droppedKey + newLength + sizeof(off_t) + sizeof(uint16)
1546					> (uint8*)node + fNodeSize
1547				|| newLength > BPLUSTREE_MAX_KEY_LENGTH) {
1548				fStream->GetVolume()->Panic();
1549				RETURN_ERROR(B_BAD_DATA);
1550			}
1551			newKey = (uint8*)malloc(newLength);
1552			if (newKey == NULL)
1553				return B_NO_MEMORY;
1554
1555			newAllocated = true;
1556			memcpy(newKey, droppedKey, newLength);
1557
1558			other->overflow_link = inKeyValues[in];
1559			total = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in++]);
1560		}
1561	}
1562
1563	// and now the same game for the other page and the rest of the keys
1564	// (but with memmove() instead of memcpy(), because they may overlap)
1565
1566	bytesBefore = bytesAfter = bytes = 0;
1567	out = 0;
1568	int32 skip = in;
1569	while (in < node->NumKeys() + 1) {
1570		if (in == keyIndex && !bytes) {
1571			// it's enough to set bytesBefore once here, because we do
1572			// not need to know the exact length of all keys in this
1573			// loop
1574			bytesBefore = in > skip
1575				? BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]) : 0;
1576			bytes = *_keyLength;
1577			out++;
1578		} else {
1579			if (in < node->NumKeys()) {
1580				inKeyLengths[in] = HOST_ENDIAN_TO_BFS_INT16(
1581					BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) - total);
1582
1583				if (bytes) {
1584					inKeyLengths[in] = HOST_ENDIAN_TO_BFS_INT16(
1585						BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) + bytes);
1586
1587					bytesAfter = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in])
1588						- bytesBefore - bytes;
1589				}
1590				out++;
1591			}
1592			in++;
1593		}
1594	}
1595
1596	// adjust the byte counts (since we were a bit lazy in the loop)
1597	if (keyIndex < skip)
1598		bytesBefore = node->AllKeyLength() - total;
1599
1600	if (bytesBefore < 0 || bytesAfter < 0) {
1601		if (newAllocated)
1602			free(newKey);
1603		return B_BAD_DATA;
1604	}
1605
1606	node->left_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset);
1607		// right link, and overflow link can stay the same
1608	node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore
1609		+ bytesAfter);
1610	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out);
1611
1612	// array positions have changed
1613	outKeyLengths = node->KeyLengths();
1614	outKeyValues = node->Values();
1615
1616	// move the keys in the old node: the order is important here,
1617	// because we don't want to overwrite any contents
1618
1619	keys = keyIndex <= skip ? out : keyIndex - skip;
1620	keyIndex -= skip;
1621	in = out - keyIndex - 1;
1622		// Note: keyIndex and in will contain invalid values when the new key
1623		// went to the other node. But in this case bytes and bytesAfter are
1624		// 0 and subsequently we never use keyIndex and in.
1625
1626	if (bytesBefore)
1627		memmove(inKeys, inKeys + total, bytesBefore);
1628	if (bytesAfter) {
1629		memmove(inKeys + bytesBefore + bytes, inKeys + total + bytesBefore,
1630			bytesAfter);
1631	}
1632
1633	if (bytesBefore)
1634		memmove(outKeyLengths, inKeyLengths + skip, keys * sizeof(uint16));
1635	if (bytesAfter) {
1636		// if byteAfter is > 0, keyIndex is larger than skip
1637		memmove(outKeyLengths + keyIndex + 1, inKeyLengths + skip + keyIndex,
1638			in * sizeof(uint16));
1639	}
1640
1641	if (bytesBefore)
1642		memmove(outKeyValues, inKeyValues + skip, keys * sizeof(off_t));
1643	if (bytesAfter) {
1644		memmove(outKeyValues + keyIndex + 1, inKeyValues + skip + keyIndex,
1645			in * sizeof(off_t));
1646	}
1647
1648	if (bytes) {
1649		// finally, copy the newly inserted key (don't overwrite anything)
1650		memcpy(inKeys + bytesBefore, key, bytes);
1651		outKeyLengths[keyIndex] = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore);
1652		outKeyValues[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(*_value);
1653	}
1654
1655	// Prepare the key that will be inserted in the parent node which
1656	// is either the dropped key or the last of the other node.
1657	// If it's the dropped key, "newKey" was already set earlier.
1658
1659	if (newKey == NULL)
1660		newKey = other->KeyAt(other->NumKeys() - 1, &newLength);
1661
1662	memcpy(key, newKey, newLength);
1663	*_keyLength = newLength;
1664	*_value = otherOffset;
1665
1666	if (newAllocated)
1667		free(newKey);
1668
1669	return B_OK;
1670}
1671
1672
1673/*!	This inserts a key into the tree. The changes made to the tree will
1674	all be part of the \a transaction.
1675	You need to have the inode write locked.
1676*/
1677status_t
1678BPlusTree::Insert(Transaction& transaction, const uint8* key, uint16 keyLength,
1679	off_t value)
1680{
1681	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
1682		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH)
1683		RETURN_ERROR(B_BAD_VALUE);
1684#ifdef DEBUG
1685	if (value < 0)
1686		panic("tried to insert invalid value %" B_PRId64 "!\n", value);
1687#endif
1688
1689	ASSERT_WRITE_LOCKED_INODE(fStream);
1690
1691	Stack<node_and_key> stack;
1692	if (_SeekDown(stack, key, keyLength) != B_OK)
1693		RETURN_ERROR(B_ERROR);
1694
1695	uint8 keyBuffer[BPLUSTREE_MAX_KEY_LENGTH];
1696	memcpy(keyBuffer, key, keyLength);
1697
1698	node_and_key nodeAndKey;
1699	const bplustree_node* node;
1700
1701	CachedNode cached(this);
1702	while (stack.Pop(&nodeAndKey)
1703		&& (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
1704#ifdef DEBUG
1705		NodeChecker checker(node, fNodeSize, "insert");
1706#endif
1707		if (node->IsLeaf()) {
1708			// first round, check for duplicate entries
1709			status_t status = _FindKey(node, key, keyLength,
1710				&nodeAndKey.keyIndex);
1711
1712			// is this a duplicate entry?
1713			if (status == B_OK) {
1714				if (fAllowDuplicates) {
1715					status = _InsertDuplicate(transaction, cached, node,
1716						nodeAndKey.keyIndex, value);
1717					if (status != B_OK)
1718						RETURN_ERROR(status);
1719					return B_OK;
1720				}
1721
1722				return B_NAME_IN_USE;
1723			}
1724		}
1725
1726		bplustree_node* writableNode = cached.MakeWritable(transaction);
1727		if (writableNode == NULL)
1728			return B_IO_ERROR;
1729
1730		// is the node big enough to hold the pair?
1731		if (int32(key_align(sizeof(bplustree_node)
1732				+ writableNode->AllKeyLength() + keyLength)
1733				+ (writableNode->NumKeys() + 1) * (sizeof(uint16)
1734				+ sizeof(off_t))) < fNodeSize) {
1735			_InsertKey(writableNode, nodeAndKey.keyIndex,
1736				keyBuffer, keyLength, value);
1737			_UpdateIterators(nodeAndKey.nodeOffset, BPLUSTREE_NULL,
1738				nodeAndKey.keyIndex, 0, 1);
1739
1740			return B_OK;
1741		} else {
1742			CachedNode cachedNewRoot(this);
1743			CachedNode cachedOther(this);
1744
1745			// do we need to allocate a new root node? if so, then do
1746			// it now
1747			off_t newRoot = BPLUSTREE_NULL;
1748			if (nodeAndKey.nodeOffset == fHeader.RootNode()) {
1749				bplustree_node* root;
1750				status_t status = cachedNewRoot.Allocate(transaction, &root,
1751					&newRoot);
1752				if (status != B_OK) {
1753					// The tree is most likely corrupted!
1754					// But it's still sane at leaf level - we could set
1755					// a flag in the header that forces the tree to be
1756					// rebuild next time...
1757					// But since we will have journaling, that's not a big
1758					// problem anyway.
1759					RETURN_ERROR(status);
1760				}
1761			}
1762
1763			// reserve space for the other node
1764			bplustree_node* other;
1765			off_t otherOffset;
1766			status_t status = cachedOther.Allocate(transaction, &other,
1767				&otherOffset);
1768			if (status != B_OK) {
1769				cachedNewRoot.Free(transaction, newRoot);
1770				RETURN_ERROR(status);
1771			}
1772
1773			if (_SplitNode(writableNode, nodeAndKey.nodeOffset, other,
1774					otherOffset, &nodeAndKey.keyIndex, keyBuffer, &keyLength,
1775					&value) != B_OK) {
1776				// free root node & other node here
1777				cachedOther.Free(transaction, otherOffset);
1778				cachedNewRoot.Free(transaction, newRoot);
1779
1780				RETURN_ERROR(B_ERROR);
1781			}
1782#ifdef DEBUG
1783			checker.Check("insert split");
1784			NodeChecker otherChecker(other, fNodeSize, "insert split other");
1785#endif
1786
1787			_UpdateIterators(nodeAndKey.nodeOffset, otherOffset,
1788				nodeAndKey.keyIndex, writableNode->NumKeys(), 1);
1789
1790			// update the right link of the node in the left of the new node
1791			if ((other = cachedOther.SetToWritable(transaction,
1792					other->LeftLink())) != NULL) {
1793				other->right_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset);
1794			}
1795
1796			// create a new root if necessary
1797			if (newRoot != BPLUSTREE_NULL) {
1798				bplustree_node* root = cachedNewRoot.Node();
1799
1800				_InsertKey(root, 0, keyBuffer, keyLength,
1801					writableNode->LeftLink());
1802				root->overflow_link = HOST_ENDIAN_TO_BFS_INT64(
1803					nodeAndKey.nodeOffset);
1804
1805				CachedNode cached(this);
1806				bplustree_header* header
1807					= cached.SetToWritableHeader(transaction);
1808				if (header == NULL)
1809					return B_IO_ERROR;
1810
1811				// finally, update header to point to the new root
1812				header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(newRoot);
1813				header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(
1814					header->MaxNumberOfLevels() + 1);
1815
1816				return B_OK;
1817			}
1818		}
1819	}
1820	RETURN_ERROR(B_ERROR);
1821}
1822
1823
1824/*!	Removes the duplicate index/value pair from the tree.
1825	It's part of the private tree interface.
1826*/
1827status_t
1828BPlusTree::_RemoveDuplicate(Transaction& transaction,
1829	const bplustree_node* node, CachedNode& cached, uint16 index,
1830	off_t value)
1831{
1832	Unaligned<off_t>* values = node->Values();
1833	off_t oldValue = BFS_ENDIAN_TO_HOST_INT64(values[index]);
1834
1835	CachedNode cachedDuplicate(this);
1836	off_t duplicateOffset = bplustree_node::FragmentOffset(oldValue);
1837	bplustree_node* duplicate = cachedDuplicate.SetToWritable(transaction,
1838		duplicateOffset, false);
1839	if (duplicate == NULL)
1840		return B_IO_ERROR;
1841
1842	// if it's a duplicate fragment, remove the entry from there
1843	if (bplustree_node::LinkType(oldValue) == BPLUSTREE_DUPLICATE_FRAGMENT) {
1844		duplicate_array* array = duplicate->FragmentAt(
1845			bplustree_node::FragmentIndex(oldValue));
1846		int32 arrayCount = array->Count();
1847
1848		if (arrayCount > NUM_FRAGMENT_VALUES || arrayCount <= 1) {
1849			FATAL(("_RemoveDuplicate: Invalid array[%d] size in fragment %"
1850				B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n",
1851				(int)bplustree_node::FragmentIndex(oldValue), duplicateOffset,
1852				arrayCount, fStream->ID()));
1853			return B_BAD_DATA;
1854		}
1855		if (!array->Remove(value)) {
1856			FATAL(("Oh no, value %" B_PRIdOFF " not found in fragments of node "
1857				"%" B_PRIdOFF "..., inode %" B_PRIdOFF "\n", value,
1858				duplicateOffset, fStream->ID()));
1859			return B_ENTRY_NOT_FOUND;
1860		}
1861
1862		// remove the array from the fragment node if it is empty
1863		if (--arrayCount == 1) {
1864			// set the link to the remaining value
1865			if (cached.MakeWritable(transaction) == NULL)
1866				return B_IO_ERROR;
1867
1868			values[index] = array->values[0];
1869
1870			// Remove the whole fragment node, if this was the only array,
1871			// otherwise free just the array
1872			if (duplicate->FragmentsUsed(fNodeSize) == 1) {
1873				status_t status = cachedDuplicate.Free(transaction,
1874					duplicateOffset);
1875				if (status != B_OK)
1876					return status;
1877			} else
1878				array->count = 0;
1879		}
1880		return B_OK;
1881	}
1882
1883	// Remove value from a duplicate node!
1884
1885	duplicate_array* array = NULL;
1886	int32 arrayCount = 0;
1887
1888	if (duplicate->LeftLink() != BPLUSTREE_NULL) {
1889		FATAL(("invalid duplicate node: first left link points to %" B_PRIdOFF
1890			", inode %" B_PRIdOFF "!\n", duplicate->LeftLink(), fStream->ID()));
1891		return B_BAD_DATA;
1892	}
1893
1894	// Search the duplicate nodes until the entry could be found (and removed)
1895	while (duplicate != NULL) {
1896		array = duplicate->DuplicateArray();
1897		arrayCount = array->Count();
1898
1899		if (arrayCount > NUM_DUPLICATE_VALUES || arrayCount < 0) {
1900			FATAL(("_RemoveDuplicate: Invalid array size in duplicate %"
1901				B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n",
1902				duplicateOffset, arrayCount, fStream->ID()));
1903			return B_BAD_DATA;
1904		}
1905
1906		if (array->Remove(value)) {
1907			arrayCount--;
1908			break;
1909		}
1910
1911		if ((duplicateOffset = duplicate->RightLink()) == BPLUSTREE_NULL)
1912			RETURN_ERROR(B_ENTRY_NOT_FOUND);
1913
1914		cachedDuplicate.UnsetUnchanged(transaction);
1915		duplicate = cachedDuplicate.SetToWritable(transaction, duplicateOffset,
1916			false);
1917	}
1918	if (duplicate == NULL)
1919		RETURN_ERROR(B_IO_ERROR);
1920
1921	// The entry got removed from the duplicate node, but we might want to free
1922	// it now in case it's empty
1923
1924	while (true) {
1925		off_t left = duplicate->LeftLink();
1926		off_t right = duplicate->RightLink();
1927		bool isLast = left == BPLUSTREE_NULL && right == BPLUSTREE_NULL;
1928
1929		if ((isLast && arrayCount == 1) || arrayCount == 0) {
			// Free the empty duplicate page, link its siblings together, and
			// update the duplicate link if needed (i.e. when we either remove
			// the last duplicate node or have a new first one)
1933
1934			if (left == BPLUSTREE_NULL) {
1935				// the duplicate link points to us
1936				if (cached.MakeWritable(transaction) == NULL)
1937					return B_IO_ERROR;
1938
1939				if (arrayCount == 1) {
1940					// This is the last node, and there is only one value left;
					// replace the duplicate link with that value, as it's not
					// a duplicate anymore
1943					values[index] = array->values[0];
1944				} else {
1945					// Move the duplicate link to the next node
1946					values[index] = HOST_ENDIAN_TO_BFS_INT64(
1947						bplustree_node::MakeLink(
1948							BPLUSTREE_DUPLICATE_NODE, right));
1949				}
1950			}
1951
1952			status_t status = cachedDuplicate.Free(transaction,
1953				duplicateOffset);
1954			if (status != B_OK)
1955				return status;
1956
1957			if (left != BPLUSTREE_NULL
1958				&& (duplicate = cachedDuplicate.SetToWritable(transaction, left,
1959						false)) != NULL) {
1960				duplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(right);
1961
1962				// If the next node is the last node, we need to free that node
1963				// and convert the duplicate entry back into a normal entry
1964				array = duplicate->DuplicateArray();
1965				arrayCount = array->Count();
1966				if (right == BPLUSTREE_NULL
1967					&& duplicate->LeftLink() == BPLUSTREE_NULL
1968					&& arrayCount <= NUM_FRAGMENT_VALUES) {
1969					duplicateOffset = left;
1970					continue;
1971				}
1972			}
1973			if (right != BPLUSTREE_NULL
1974				&& (duplicate = cachedDuplicate.SetToWritable(transaction,
1975						right, false)) != NULL) {
1976				duplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(left);
1977
1978				// Again, we may need to turn the duplicate entry back into a
1979				// normal entry
1980				array = duplicate->DuplicateArray();
1981				arrayCount = array->Count();
1982				if (left == BPLUSTREE_NULL
1983					&& duplicate->RightLink() == BPLUSTREE_NULL
1984					&& arrayCount <= NUM_FRAGMENT_VALUES) {
1985					duplicateOffset = right;
1986					continue;
1987				}
1988			}
1989			return B_OK;
1990		}
1991		if (isLast && arrayCount <= NUM_FRAGMENT_VALUES) {
1992			// If the number of entries fits in a duplicate fragment, then
1993			// either find a free fragment node, or convert this node to a
1994			// fragment node.
1995			CachedNode cachedOther(this);
1996
1997			bplustree_node* fragment = NULL;
1998			uint32 fragmentIndex = 0;
1999			off_t offset;
2000			if (_FindFreeDuplicateFragment(transaction, node, cachedOther,
2001					&offset, &fragment, &fragmentIndex) == B_OK) {
2002				// move to other node
2003				duplicate_array* target = fragment->FragmentAt(fragmentIndex);
2004				memcpy(target, array,
2005					(NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
2006
2007				cachedDuplicate.Free(transaction, duplicateOffset);
2008				duplicateOffset = offset;
2009			} else {
2010				// convert node
2011				memmove(duplicate, array,
2012					(NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
2013				memset((off_t*)duplicate + NUM_FRAGMENT_VALUES + 1, 0,
2014					fNodeSize - (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
2015			}
2016
2017			if (cached.MakeWritable(transaction) == NULL)
2018				return B_IO_ERROR;
2019
2020			values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
2021				BPLUSTREE_DUPLICATE_FRAGMENT, duplicateOffset, fragmentIndex));
2022		}
2023		return B_OK;
2024	}
2025}
2026
2027
2028/*!	Removes the key with the given index from the specified node.
	Since it has to get the key from the node anyway (to obtain its
	pointer), there is no need to pass the key and its length, although
	the calling method (BPlusTree::Remove()) has this data.
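	Removing a key compacts the node in place: the key bytes, the running
	key lengths, and the values are each moved down by one slot, and
	all_key_count/all_key_length are updated accordingly. On an index node,
	removing the key at position NumKeys() drops the last key instead and
	moves its value into the overflow link (see below).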
2032*/
2033void
2034BPlusTree::_RemoveKey(bplustree_node* node, uint16 index)
2035{
2036	// should never happen, but who knows?
2037	if (index > node->NumKeys() && node->NumKeys() > 0) {
2038		FATAL(("Asked me to remove key outer limits: %u, inode %" B_PRIdOFF
2039			"\n", index, fStream->ID()));
2040		return;
2041	}
2042
2043	Unaligned<off_t>* values = node->Values();
2044
2045	// if we would have to drop the overflow link, drop
2046	// the last key instead and update the overflow link
2047	// to the value of that one
2048	if (!node->IsLeaf() && index == node->NumKeys())
2049		node->overflow_link = values[--index];
2050
2051	uint16 length = 0;
2052	uint8* key = node->KeyAt(index, &length);
2053	if (key + length + sizeof(off_t) + sizeof(uint16) > (uint8*)node + fNodeSize
2054		|| length > BPLUSTREE_MAX_KEY_LENGTH) {
2055		FATAL(("Key length to long: %s, %u inode %" B_PRIdOFF "\n", key, length,
2056			fStream->ID()));
2057		fStream->GetVolume()->Panic();
2058		return;
2059	}
2060
2061	Unaligned<uint16>* keyLengths = node->KeyLengths();
2062	uint8* keys = node->Keys();
2063
2064	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() - 1);
2065	node->all_key_length = HOST_ENDIAN_TO_BFS_INT64(
2066		node->AllKeyLength() - length);
2067
2068	Unaligned<off_t>* newValues = node->Values();
2069	Unaligned<uint16>* newKeyLengths = node->KeyLengths();
2070
2071	// move key data
2072	memmove(key, key + length, node->AllKeyLength() - (key - keys));
2073
2074	// move and update key lengths
2075	if (index > 0 && newKeyLengths != keyLengths)
2076		memmove(newKeyLengths, keyLengths, index * sizeof(uint16));
2077	for (uint16 i = index; i < node->NumKeys(); i++) {
2078		newKeyLengths[i] = HOST_ENDIAN_TO_BFS_INT16(
2079			BFS_ENDIAN_TO_HOST_INT16(keyLengths[i + 1]) - length);
2080	}
2081
2082	// move values
2083	if (index > 0)
2084		memmove(newValues, values, index * sizeof(off_t));
2085	if (node->NumKeys() > index) {
2086		memmove(newValues + index, values + index + 1,
2087			(node->NumKeys() - index) * sizeof(off_t));
2088	}
2089}
2090
2091
2092/*!	Removes the specified key from the tree. The "value" parameter is only used
2093	for trees which allow duplicates, so you may safely ignore it.
2094	It's not an optional parameter, so at least you have to think about it.
2095	You need to have the inode write locked.
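	A minimal usage sketch (hedged - "transaction", "tree", "name", and
	"inodeBlock" stand in for caller-provided values, they are not defined
	here):

		// remove the entry "name" -> inodeBlock from an index tree
		status_t status = tree->Remove(transaction, (const uint8*)name,
			(uint16)strlen(name), inodeBlock);
		if (status == B_ENTRY_NOT_FOUND)
			; // the key (or this key/value pair) was not in the tree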
2096*/
2097status_t
2098BPlusTree::Remove(Transaction& transaction, const uint8* key, uint16 keyLength,
2099	off_t value)
2100{
2101	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
2102		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH)
2103		RETURN_ERROR(B_BAD_VALUE);
2104
2105	ASSERT_WRITE_LOCKED_INODE(fStream);
2106
2107	Stack<node_and_key> stack;
2108	if (_SeekDown(stack, key, keyLength) != B_OK)
2109		RETURN_ERROR(B_ERROR);
2110
2111	node_and_key nodeAndKey;
2112	const bplustree_node* node;
2113
2114	CachedNode cached(this);
2115	while (stack.Pop(&nodeAndKey)
2116		&& (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
2117#ifdef DEBUG
2118		NodeChecker checker(node, fNodeSize, "remove");
2119#endif
2120		if (node->IsLeaf()) {
2121			// first round, check for duplicate entries
2122			status_t status = _FindKey(node, key, keyLength,
2123				&nodeAndKey.keyIndex);
2124			if (status != B_OK)
2125				RETURN_ERROR(status);
2126
2127			// Is this a duplicate entry?
2128			if (bplustree_node::IsDuplicate(BFS_ENDIAN_TO_HOST_INT64(
2129					node->Values()[nodeAndKey.keyIndex]))) {
2130				if (fAllowDuplicates) {
2131					return _RemoveDuplicate(transaction, node, cached,
2132						nodeAndKey.keyIndex, value);
2133				}
2134
2135				FATAL(("dupliate node found where no duplicates are "
2136					"allowed, inode %" B_PRIdOFF "!\n", fStream->ID()));
2137				RETURN_ERROR(B_ERROR);
2138			} else {
2139				if (node->Values()[nodeAndKey.keyIndex] != value)
2140					return B_ENTRY_NOT_FOUND;
2141
				// If we remove the last key, the iterator will be set to
				// the node after the current one - if there aren't any
				// more nodes, we need a way to prevent the TreeIterators
				// from touching the old node again; we use BPLUSTREE_FREE
				// for this
2146				off_t next = node->RightLink() == BPLUSTREE_NULL
2147					? BPLUSTREE_FREE : node->RightLink();
2148				_UpdateIterators(nodeAndKey.nodeOffset, node->NumKeys() == 1
					? next : BPLUSTREE_NULL, nodeAndKey.keyIndex, 0, -1);
2150			}
2151		}
2152
2153		bplustree_node* writableNode = cached.MakeWritable(transaction);
2154		if (writableNode == NULL)
2155			return B_IO_ERROR;
2156
2157		// if it's an empty root node, we have to convert it
2158		// to a leaf node by dropping the overflow link, or,
2159		// if it's already a leaf node, just empty it
2160		if (nodeAndKey.nodeOffset == fHeader.RootNode()
2161			&& (node->NumKeys() == 0
2162				|| (node->NumKeys() == 1 && node->IsLeaf()))) {
2163			writableNode->overflow_link
2164				= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
2165			writableNode->all_key_count = 0;
2166			writableNode->all_key_length = 0;
2167
2168			// if we've made a leaf node out of the root node, we need
2169			// to reset the maximum number of levels in the header
2170			if (fHeader.MaxNumberOfLevels() != 1) {
2171				CachedNode cached(this);
2172				bplustree_header* header
2173					= cached.SetToWritableHeader(transaction);
2174				if (header == NULL)
2175					return B_IO_ERROR;
2176
2177				header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1);
2178			}
2179			return B_OK;
2180		}
2181
		// if there is only one key left, we don't have to remove it;
		// we can just dump the node (index nodes still have the
		// overflow link, so we have to drop the last key)
2185		if (writableNode->NumKeys() > 1
2186			|| (!writableNode->IsLeaf() && writableNode->NumKeys() == 1)) {
2187			_RemoveKey(writableNode, nodeAndKey.keyIndex);
2188			return B_OK;
2189		}
2190
2191		// when we are here, we can just free the node, but
2192		// we have to update the right/left link of the
2193		// siblings first
2194		CachedNode otherCached(this);
2195		bplustree_node* other = otherCached.SetToWritable(transaction,
2196			writableNode->LeftLink());
2197		if (other != NULL)
2198			other->right_link = writableNode->right_link;
2199
2200		if ((other = otherCached.SetToWritable(transaction, node->RightLink()))
2201				!= NULL) {
2202			other->left_link = writableNode->left_link;
2203		}
2204
2205		cached.Free(transaction, nodeAndKey.nodeOffset);
2206	}
2207	RETURN_ERROR(B_ERROR);
2208}
2209
2210
2211/*!	Replaces the value for the key in the tree.
2212	Returns B_OK if the key could be found and its value replaced,
2213	B_ENTRY_NOT_FOUND if the key couldn't be found, and other errors
2214	to indicate that something went terribly wrong.
2215	Note that this doesn't work with duplicates - it will just
2216	return B_BAD_TYPE if you call this function on a tree where
2217	duplicates are allowed.
2218	You need to have the inode write locked.
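	A usage sketch, under the same assumptions as the Remove() example above
	("transaction", "tree", "name", and "newValue" are caller-provided):

		status_t status = tree->Replace(transaction, (const uint8*)name,
			(uint16)strlen(name), newValue);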
2219*/
2220status_t
2221BPlusTree::Replace(Transaction& transaction, const uint8* key, uint16 keyLength,
2222	off_t value)
2223{
2224	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
2225		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH
2226		|| key == NULL)
2227		RETURN_ERROR(B_BAD_VALUE);
2228
2229	if (fAllowDuplicates)
2230		RETURN_ERROR(B_BAD_TYPE);
2231
2232	ASSERT_WRITE_LOCKED_INODE(fStream);
2233
2234	off_t nodeOffset = fHeader.RootNode();
2235	CachedNode cached(this);
2236	const bplustree_node* node;
2237
2238	while ((node = cached.SetTo(nodeOffset)) != NULL) {
2239		uint16 keyIndex = 0;
2240		off_t nextOffset;
2241		status_t status = _FindKey(node, key, keyLength, &keyIndex,
2242			&nextOffset);
2243
2244		if (node->OverflowLink() == BPLUSTREE_NULL) {
2245			if (status == B_OK) {
2246				bplustree_node* writableNode = cached.MakeWritable(transaction);
2247				if (writableNode != NULL) {
2248					writableNode->Values()[keyIndex]
2249						= HOST_ENDIAN_TO_BFS_INT64(value);
2250				} else
2251					status = B_IO_ERROR;
2252			}
2253
2254			return status;
2255		} else if (nextOffset == nodeOffset)
2256			RETURN_ERROR(B_ERROR);
2257
2258		nodeOffset = nextOffset;
2259	}
2260	RETURN_ERROR(B_ERROR);
2261}
2262#endif // !_BOOT_MODE
2263
2264
2265/*!	Searches the key in the tree, and stores the offset found in _value,
2266	if successful.
	It's very similar to BPlusTree::_SeekDown(), but doesn't fill a stack
2268	while it descends the tree.
2269	Returns B_OK when the key could be found, B_ENTRY_NOT_FOUND if not.
2270	It can also return other errors to indicate that something went wrong.
2271	Note that this doesn't work with duplicates - it will just return
2272	B_BAD_TYPE if you call this function on a tree where duplicates are
2273	allowed.
2274	You need to have the inode read or write locked.
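	A minimal lookup sketch (hedged - "tree" and "name" are caller-provided):

		off_t value;
		if (tree->Find((const uint8*)name, (uint16)strlen(name),
				&value) == B_OK)
			; // "value" now holds the offset stored for this key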
2275*/
2276status_t
2277BPlusTree::Find(const uint8* key, uint16 keyLength, off_t* _value)
2278{
2279	if (key == NULL || keyLength < BPLUSTREE_MIN_KEY_LENGTH)
2280		RETURN_ERROR(B_BAD_VALUE);
2281
2282	if (keyLength > BPLUSTREE_MAX_KEY_LENGTH)
2283		RETURN_ERROR(B_NAME_TOO_LONG);
2284
2285	if (fAllowDuplicates)
2286		RETURN_ERROR(B_BAD_TYPE);
2287
2288#if !_BOOT_MODE
2289	ASSERT_READ_LOCKED_INODE(fStream);
2290#endif
2291
2292	off_t nodeOffset = fHeader.RootNode();
2293	CachedNode cached(this);
2294	const bplustree_node* node;
2295
2296#ifdef DEBUG
2297	int32 levels = 0;
2298#endif
2299
2300	while ((node = cached.SetTo(nodeOffset)) != NULL) {
2301		uint16 keyIndex = 0;
2302		off_t nextOffset;
2303		status_t status = _FindKey(node, key, keyLength, &keyIndex,
2304			&nextOffset);
2305
2306#ifdef DEBUG
2307		levels++;
2308#endif
2309		if (node->OverflowLink() == BPLUSTREE_NULL) {
2310			if (status == B_OK && _value != NULL)
2311				*_value = BFS_ENDIAN_TO_HOST_INT64(node->Values()[keyIndex]);
2312
2313#ifdef DEBUG
2314			if (levels != (int32)fHeader.MaxNumberOfLevels())
2315				DEBUGGER(("levels don't match"));
2316#endif
2317			return status;
2318		} else if (nextOffset == nodeOffset)
2319			RETURN_ERROR(B_ERROR);
2320
2321		nodeOffset = nextOffset;
2322	}
2323	FATAL(("b+tree node at %" B_PRIdOFF " could not be loaded, inode %"
2324		B_PRIdOFF "\n", nodeOffset, fStream->ID()));
2325	RETURN_ERROR(B_ERROR);
2326}
2327
2328
2329#if !_BOOT_MODE
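/*!	Checks the integrity of the given parent node, walks any duplicate or
	fragment nodes referenced by its values, and recursively validates its
	regular children via _ValidateChild(). Inconsistencies are reported
	through the TreeCheck object (check.FoundError()) rather than aborting,
	so a single run can collect as many problems as possible; only I/O
	errors stop the traversal early.
*/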
2330status_t
2331BPlusTree::_ValidateChildren(TreeCheck& check, uint32 level, off_t offset,
2332	const uint8* largestKey, uint16 largestKeyLength,
2333	const bplustree_node* parent)
2334{
2335	if (parent->CheckIntegrity(fNodeSize) != B_OK) {
2336		dprintf("inode %" B_PRIdOFF ": node %" B_PRIdOFF " integrity check "
2337			"failed!\n", fStream->ID(), offset);
2338		check.FoundError();
2339		return B_OK;
2340	}
2341	if (level >= fHeader.MaxNumberOfLevels()) {
2342		dprintf("inode %" B_PRIdOFF ": maximum level surpassed at %" B_PRIdOFF
2343			"!\n", fStream->ID(), offset);
2344		check.FoundError();
2345		return B_OK;
2346	}
2347
2348	check.SetLevel(level);
2349
2350	if (check.Visited(offset)) {
2351		dprintf("inode %" B_PRIdOFF ": node %" B_PRIdOFF " already visited!\n",
2352			fStream->ID(), offset);
2353		check.FoundError();
2354		return B_OK;
2355	}
2356
2357	check.SetVisited(offset);
2358
2359	uint32 count = parent->NumKeys();
2360	Unaligned<off_t>* values = parent->Values();
2361	off_t lastOffset = check.PreviousOffset(level);
2362	CachedNode cached(this);
2363
2364	for (uint32 i = 0; i < count; i++) {
2365		uint16 keyLength;
2366		uint8* key = parent->KeyAt(i, &keyLength);
2367		if (largestKey != NULL) {
2368			int result = _CompareKeys(key, keyLength, largestKey,
2369				largestKeyLength);
2370			if (result > 0 || (result == 0 && i != count - 1)) {
2371				dprintf("inode %" B_PRIdOFF ": node %" B_PRIdOFF " key %"
2372					B_PRIu32 " larger than it should!\n",
2373					fStream->ID(), offset, i);
2374				check.FoundError();
2375			}
2376		}
2377
2378		off_t childOffset = BFS_ENDIAN_TO_HOST_INT64(values[i]);
2379		if (bplustree_node::IsDuplicate(childOffset)) {
2380			// Walk the duplicate nodes
2381			off_t duplicateOffset = bplustree_node::FragmentOffset(childOffset);
2382			off_t lastDuplicateOffset = BPLUSTREE_NULL;
2383
2384			while (duplicateOffset != BPLUSTREE_NULL) {
2385				const bplustree_node* node;
2386				status_t status = cached.SetTo(duplicateOffset, &node, false);
2387				if (status != B_OK) {
2388					if (status == B_IO_ERROR)
2389						return B_IO_ERROR;
2390
2391					dprintf("inode %" B_PRIdOFF ": duplicate node at %"
2392						B_PRIdOFF " could not be read: %s\n", fStream->ID(),
2393						duplicateOffset, strerror(status));
2394					check.FoundError();
2395					break;
2396				}
2397
2398				bool isFragmentNode = bplustree_node::LinkType(childOffset)
2399					== BPLUSTREE_DUPLICATE_FRAGMENT;
2400				bool isKnownFragment = isFragmentNode
2401					&& check.VisitedFragment(duplicateOffset);
2402
2403				if (!isKnownFragment && check.Visited(duplicateOffset)) {
2404					dprintf("inode %" B_PRIdOFF ": %s node at %"
2405						B_PRIdOFF " already visited, referenced from %"
2406						B_PRIdOFF "!\n", fStream->ID(),
2407						isFragmentNode ? "fragment" : "duplicate",
2408						duplicateOffset, offset);
2409					check.FoundError();
2410					break;
2411				}
2412
2413				// Fragment nodes may be visited more than once from different
2414				// places
2415				if (!check.Visited(duplicateOffset))
2416					check.SetVisited(duplicateOffset);
2417				if (!isKnownFragment && isFragmentNode)
2418					check.SetVisitedFragment(duplicateOffset);
2419
2420				duplicate_array* array;
2421				int32 minSize;
2422				int32 maxSize;
2423				if (isFragmentNode) {
2424					array = node->FragmentAt(
2425						bplustree_node::FragmentIndex(childOffset));
2426					minSize = 2;
2427					maxSize = NUM_FRAGMENT_VALUES;
2428				} else {
2429					array = node->DuplicateArray();
2430					minSize = 1;
2431					maxSize = NUM_DUPLICATE_VALUES;
2432				}
2433				int32 arrayCount = array->Count();
2434
2435				if (arrayCount < minSize || arrayCount > maxSize) {
2436					dprintf("inode %" B_PRIdOFF ": duplicate at %" B_PRIdOFF
2437						" has invalid array size %" B_PRId32 "!\n",
2438						fStream->ID(), duplicateOffset, arrayCount);
2439					check.FoundError();
2440				} else {
2441					// Simple check if the values in the array may be valid
2442					for (int32 j = 0; j < arrayCount; j++) {
2443						if (!fStream->GetVolume()->IsValidInodeBlock(
2444								array->ValueAt(j))) {
2445							dprintf("inode %" B_PRIdOFF ": duplicate at %"
2446								B_PRIdOFF " contains invalid block %" B_PRIdOFF
2447								" at %" B_PRId32 "!\n", fStream->ID(),
2448								duplicateOffset, array->ValueAt(j), j);
2449							check.FoundError();
2450							break;
2451						}
2452					}
2453				}
2454
2455				// A fragment node is not linked (and does not have valid links)
2456				if (isFragmentNode)
2457					break;
2458
2459				if (node->LeftLink() != lastDuplicateOffset) {
2460					dprintf("inode %" B_PRIdOFF ": duplicate at %" B_PRIdOFF
2461						" has wrong left link %" B_PRIdOFF ", expected %"
2462						B_PRIdOFF "!\n", fStream->ID(), duplicateOffset,
2463						node->LeftLink(), lastDuplicateOffset);
2464					check.FoundError();
2465				}
2466
2467				lastDuplicateOffset = duplicateOffset;
2468				duplicateOffset = node->RightLink();
2469			}
2470		} else if (!parent->IsLeaf()) {
2471			// Test a regular child node recursively
2472			off_t nextOffset = parent->OverflowLink();
2473			if (i < count - 1)
2474				nextOffset = BFS_ENDIAN_TO_HOST_INT64(values[i + 1]);
2475
2476			if (i == 0 && lastOffset != BPLUSTREE_NULL) {
2477				// Test right link of the previous node
2478				const bplustree_node* previous = cached.SetTo(lastOffset, true);
2479				if (previous == NULL)
2480					return B_IO_ERROR;
2481
2482				if (previous->RightLink() != childOffset) {
2483					dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " has "
2484						"wrong right link %" B_PRIdOFF ", expected %" B_PRIdOFF
2485						"!\n", fStream->ID(), lastOffset, previous->RightLink(),
2486						childOffset);
2487					check.FoundError();
2488				}
2489			}
2490
2491			status_t status = _ValidateChild(check, cached, level, childOffset,
2492				lastOffset, nextOffset, key, keyLength);
2493			if (status != B_OK)
2494				return status;
2495		} else if (!fStream->GetVolume()->IsValidInodeBlock(childOffset)) {
2496			dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " contains "
2497				"invalid block %" B_PRIdOFF " at %" B_PRId32 "!\n",
2498				fStream->ID(), offset, childOffset, i);
2499			check.FoundError();
2500		}
2501
2502		lastOffset = childOffset;
2503	}
2504
2505	if (parent->OverflowLink() != BPLUSTREE_NULL) {
2506		off_t childOffset = parent->OverflowLink();
2507		status_t status = _ValidateChild(check, cached, level, childOffset,
2508			lastOffset, 0, NULL, 0);
2509		if (status != B_OK)
2510			return status;
2511
2512		lastOffset = childOffset;
2513	}
2514
2515	check.SetPreviousOffset(level, lastOffset);
2516	return B_OK;
2517}
2518
2519
2520status_t
2521BPlusTree::_ValidateChild(TreeCheck& check, CachedNode& cached, uint32 level,
2522	off_t offset, off_t lastOffset, off_t nextOffset,
2523	const uint8* key, uint16 keyLength)
2524{
2525	const bplustree_node* node;
2526	status_t status = cached.SetTo(offset, &node, true);
2527	if (status != B_OK) {
2528		if (status == B_IO_ERROR)
2529			return B_IO_ERROR;
2530
2531		dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " could not be "
2532			"read: %s\n", fStream->ID(), offset, strerror(status));
2533		check.FoundError();
2534		return B_OK;
2535	}
2536
2537	if (node->LeftLink() != lastOffset) {
2538		dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " has "
2539			"wrong left link %" B_PRIdOFF ", expected %" B_PRIdOFF
2540			"!\n", fStream->ID(), offset, node->LeftLink(), lastOffset);
2541		check.FoundError();
2542	}
2543
2544	if (nextOffset != 0 && node->RightLink() != nextOffset) {
2545		dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " has "
2546			"wrong right link %" B_PRIdOFF ", expected %" B_PRIdOFF
2547			"!\n", fStream->ID(), offset, node->RightLink(), nextOffset);
2548		check.FoundError();
2549	}
2550
2551	return _ValidateChildren(check, level + 1, offset, key, keyLength, node);
2552}
2553#endif // !_BOOT_MODE
2554
2555
2556//	#pragma mark -
2557
2558
2559TreeIterator::TreeIterator(BPlusTree* tree)
2560	:
2561	fTree(tree),
2562	fCurrentNodeOffset(BPLUSTREE_NULL)
2563{
2564#if !_BOOT_MODE
2565	tree->_AddIterator(this);
2566#endif
2567}
2568
2569
2570TreeIterator::~TreeIterator()
2571{
2572#if !_BOOT_MODE
2573	if (fTree)
2574		fTree->_RemoveIterator(this);
2575#endif
2576}
2577
2578
2579status_t
2580TreeIterator::Goto(int8 to)
2581{
2582	if (fTree == NULL || fTree->fStream == NULL)
2583		RETURN_ERROR(B_BAD_VALUE);
2584
2585#if !_BOOT_MODE
2586	// lock access to stream
2587	InodeReadLocker locker(fTree->fStream);
2588#endif
2589
2590	off_t nodeOffset = fTree->fHeader.RootNode();
2591	CachedNode cached(fTree);
2592	const bplustree_node* node;
2593
2594	while ((node = cached.SetTo(nodeOffset)) != NULL) {
2595		// is the node a leaf node?
2596		if (node->OverflowLink() == BPLUSTREE_NULL) {
2597			fCurrentNodeOffset = nodeOffset;
2598			fCurrentKey = to == BPLUSTREE_BEGIN ? -1 : node->NumKeys();
2599			fDuplicateNode = BPLUSTREE_NULL;
2600
2601			return B_OK;
2602		}
2603
2604		// get the next node offset depending on the direction (and if there
2605		// are any keys in that node at all)
2606		off_t nextOffset;
2607		if (to == BPLUSTREE_END || node->all_key_count == 0)
2608			nextOffset = node->OverflowLink();
2609		else {
2610			if (node->AllKeyLength() > fTree->fNodeSize
2611				|| (addr_t)node->Values() > (addr_t)node + fTree->fNodeSize
2612					- 8 * node->NumKeys())
2613				RETURN_ERROR(B_ERROR);
2614
2615			nextOffset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[0]);
2616		}
2617		if (nextOffset == nodeOffset)
2618			break;
2619
2620		nodeOffset = nextOffset;
2621	}
2622	FATAL(("%s fails\n", __FUNCTION__));
2623
2624	RETURN_ERROR(B_ERROR);
2625}
2626
2627
2628/*!	Iterates through the tree in the specified direction.
2629	When it iterates through duplicates, the "key" is only updated for the
2630	first entry - if you need to know when this happens, use the "duplicate"
2631	parameter which is 0 for no duplicate, 1 for the first, and 2 for all
2632	the other duplicates.
	That's not too nice, but it saves the 256 bytes that would be needed to
	store the last key - if this ever becomes an issue, it will be easy to
	change.
	The other advantage of this is that queries can skip all duplicates at
	once when they are not relevant to them.
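	A forward iteration sketch (hedged - "tree" is a caller-provided
	BPlusTree pointer; the buffer mirrors BPLUSTREE_MAX_KEY_LENGTH):

		TreeIterator iterator(tree);
		char name[BPLUSTREE_MAX_KEY_LENGTH + 1];
		uint16 length;
		off_t value;
		while (iterator.Traverse(BPLUSTREE_FORWARD, name, &length,
				(uint16)sizeof(name), &value, NULL) == B_OK)
			; // "name"/"length"/"value" describe the current entry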
2638*/
2639status_t
2640TreeIterator::Traverse(int8 direction, void* key, uint16* keyLength,
2641	uint16 maxLength, off_t* value, uint16* duplicate)
2642{
2643	if (fTree == NULL)
2644		return B_INTERRUPTED;
2645
2646	bool forward = direction == BPLUSTREE_FORWARD;
2647
2648	if (fCurrentNodeOffset == BPLUSTREE_NULL
2649		&& Goto(forward ? BPLUSTREE_BEGIN : BPLUSTREE_END) != B_OK)
2650		RETURN_ERROR(B_ERROR);
2651
2652	// if the tree was emptied since the last call
2653	if (fCurrentNodeOffset == BPLUSTREE_FREE)
2654		return B_ENTRY_NOT_FOUND;
2655
2656#if !_BOOT_MODE
2657	// lock access to stream
2658	InodeReadLocker locker(fTree->fStream);
2659#endif
2660
2661	CachedNode cached(fTree);
2662	const bplustree_node* node;
2663
2664	if (fDuplicateNode != BPLUSTREE_NULL) {
		// regardless of the traverse direction, the duplicates are always
		// presented in the same order; since they are all considered equal,
		// this shouldn't cause any problems
2668
2669		if (!fIsFragment || fDuplicate < fNumDuplicates) {
2670			node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode),
2671				false);
2672		} else
2673			node = NULL;
2674
2675		if (node != NULL) {
2676			if (!fIsFragment && fDuplicate >= fNumDuplicates) {
2677				// If the node is out of duplicates, we go directly to the next
2678				// one
2679				fDuplicateNode = node->RightLink();
2680				if (fDuplicateNode != BPLUSTREE_NULL
2681					&& (node = cached.SetTo(fDuplicateNode, false)) != NULL) {
2682					fNumDuplicates = node->CountDuplicates(fDuplicateNode,
2683						false);
2684					fDuplicate = 0;
2685				}
2686			}
2687			if (fDuplicate < fNumDuplicates) {
2688				*value = node->DuplicateAt(fDuplicateNode, fIsFragment,
2689					fDuplicate++);
2690				if (duplicate)
2691					*duplicate = 2;
2692				return B_OK;
2693			}
2694		}
2695		fDuplicateNode = BPLUSTREE_NULL;
2696	}
2697
2698	off_t savedNodeOffset = fCurrentNodeOffset;
2699	int32 savedKey = fCurrentKey;
2700
2701	if ((node = cached.SetTo(fCurrentNodeOffset)) == NULL)
2702		RETURN_ERROR(B_ERROR);
2703
2704	if (duplicate)
2705		*duplicate = 0;
2706
2707	fCurrentKey += direction;
2708
2709	// is the current key in the current node?
2710	while ((forward && fCurrentKey >= node->NumKeys())
2711			|| (!forward && fCurrentKey < 0)) {
2712		fCurrentNodeOffset = forward ? node->RightLink() : node->LeftLink();
2713
2714		// are there any more nodes?
2715		if (fCurrentNodeOffset != BPLUSTREE_NULL) {
2716			node = cached.SetTo(fCurrentNodeOffset);
2717			if (!node)
2718				RETURN_ERROR(B_ERROR);
2719
2720			// reset current key
2721			fCurrentKey = forward ? 0 : node->NumKeys() - 1;
2722		} else {
2723			// there are no nodes left, so turn back to the last key
2724			fCurrentNodeOffset = savedNodeOffset;
2725			fCurrentKey = savedKey;
2726
2727			return B_ENTRY_NOT_FOUND;
2728		}
2729	}
2730
2731	if (node->all_key_count == 0)
2732		RETURN_ERROR(B_ERROR);	// B_ENTRY_NOT_FOUND ?
2733
2734	uint16 length = 0;
2735	uint8* keyStart = node->KeyAt(fCurrentKey, &length);
2736	if (keyStart + length + sizeof(off_t) + sizeof(uint16)
2737			> (uint8*)node + fTree->fNodeSize
2738		|| length > BPLUSTREE_MAX_KEY_LENGTH) {
2739#if !_BOOT_MODE
2740		fTree->fStream->GetVolume()->Panic();
2741#endif
2742		RETURN_ERROR(B_BAD_DATA);
2743	}
2744
2745	// include the termination for string types
2746	bool needsTermination = fTree->fHeader.DataType() == BPLUSTREE_STRING_TYPE;
2747	if (length + (needsTermination ? 1 : 0) > maxLength) {
2748		if (!needsTermination || maxLength < INODE_FILE_NAME_LENGTH) {
2749			// The buffer is too small, restore the last key and return
2750			// an error
2751			fCurrentNodeOffset = savedNodeOffset;
2752			fCurrentKey = savedKey;
2753			return B_BUFFER_OVERFLOW;
2754		}
2755
2756		// Always cut off strings at the maximum buffer size, and leave
2757		// room for a terminating null byte.
		// This allows handling larger key sizes gracefully.
2759		length = maxLength - 1;
2760	}
2761
2762	memcpy(key, keyStart, length);
2763
2764	if (needsTermination)
2765		((char*)key)[length] = '\0';
2766
2767	*keyLength = length;
2768
2769	off_t offset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[fCurrentKey]);
2770
2771	// duplicate fragments?
2772	uint8 type = bplustree_node::LinkType(offset);
2773	if (type == BPLUSTREE_DUPLICATE_FRAGMENT
2774		|| type == BPLUSTREE_DUPLICATE_NODE) {
2775		fDuplicateNode = offset;
2776
2777		node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode),
2778			false);
2779		if (node == NULL)
2780			RETURN_ERROR(B_ERROR);
2781
2782		fIsFragment = type == BPLUSTREE_DUPLICATE_FRAGMENT;
2783
2784		fNumDuplicates = node->CountDuplicates(offset, fIsFragment);
2785		if (fNumDuplicates) {
2786			offset = node->DuplicateAt(offset, fIsFragment, 0);
2787			fDuplicate = 1;
2788			if (duplicate)
2789				*duplicate = 1;
2790		} else {
2791			// Shouldn't happen, but we're dealing here with potentially
2792			// corrupt disks...
2793			fDuplicateNode = BPLUSTREE_NULL;
2794			offset = 0;
2795		}
2796	}
2797	*value = offset;
2798
2799	return B_OK;
2800}
2801
2802
2803/*!	This is more or less a copy of BPlusTree::Find() - but it just
	sets the current position in the iterator, regardless of whether the
	key could be found or not.
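	A typical pattern (hedged sketch; "iterator" and "name" are
	caller-provided) is to position the iterator first and then walk
	forward from there:

		iterator.Find((const uint8*)name, (uint16)strlen(name));
		// subsequent Traverse(BPLUSTREE_FORWARD, ...) calls start at the
		// position of "name" (or where it would have been)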
2806*/
2807status_t
2808TreeIterator::Find(const uint8* key, uint16 keyLength)
2809{
2810	if (fTree == NULL)
2811		return B_INTERRUPTED;
2812	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
2813		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH
2814		|| key == NULL)
2815		RETURN_ERROR(B_BAD_VALUE);
2816
2817#if !_BOOT_MODE
2818	// lock access to stream
2819	InodeReadLocker locker(fTree->fStream);
2820#endif
2821
2822	off_t nodeOffset = fTree->fHeader.RootNode();
2823
2824	CachedNode cached(fTree);
2825	const bplustree_node* node;
2826	while ((node = cached.SetTo(nodeOffset)) != NULL) {
2827		uint16 keyIndex = 0;
2828		off_t nextOffset = 0;
2829		status_t status = fTree->_FindKey(node, key, keyLength, &keyIndex,
2830			&nextOffset);
2831
2832		if (node->OverflowLink() == BPLUSTREE_NULL) {
2833			fCurrentNodeOffset = nodeOffset;
2834			fCurrentKey = keyIndex - 1;
2835			fDuplicateNode = BPLUSTREE_NULL;
2836
2837			return status;
2838		} else if (nextOffset == nodeOffset)
2839			RETURN_ERROR(B_ERROR);
2840
2841		nodeOffset = nextOffset;
2842	}
2843	RETURN_ERROR(B_ERROR);
2844}
2845
2846
2847void
2848TreeIterator::SkipDuplicates()
2849{
2850	fDuplicateNode = BPLUSTREE_NULL;
2851}
2852
2853
2854void
2855TreeIterator::Update(off_t offset, off_t nextOffset, uint16 keyIndex,
2856	uint16 splitAt, int8 change)
2857{
2858	if (offset != fCurrentNodeOffset)
2859		return;
2860
2861	if (nextOffset != BPLUSTREE_NULL) {
2862		fCurrentNodeOffset = nextOffset;
2863		if (splitAt <= fCurrentKey) {
2864			fCurrentKey -= splitAt;
2865			keyIndex -= splitAt;
2866		}
2867	}
2868
2869	// Adjust fCurrentKey to point to the same key as before.
	// Note that if a key is inserted at the current position,
	// it won't be included in this tree transition.
2872	if (keyIndex <= fCurrentKey)
2873		fCurrentKey += change;
2874
2875	// TODO: duplicate handling!
2876}
2877
2878
2879void
2880TreeIterator::Stop()
2881{
2882	fTree = NULL;
2883}
2884
2885
2886#ifdef DEBUG
2887void
2888TreeIterator::Dump()
2889{
2890	__out("TreeIterator at %p:\n", this);
2891	__out("\tfTree = %p\n", fTree);
2892	__out("\tfCurrentNodeOffset = %" B_PRId64 "\n", fCurrentNodeOffset);
2893	__out("\tfCurrentKey = %d\n", (int)fCurrentKey);
2894	__out("\tfDuplicateNode = %" B_PRId64 " (%" B_PRId64 ", 0x%" B_PRIx64 ")\n",
2895		bplustree_node::FragmentOffset(fDuplicateNode), fDuplicateNode,
2896		fDuplicateNode);
2897	__out("\tfDuplicate = %u\n", fDuplicate);
2898	__out("\tfNumDuplicates = %u\n", fNumDuplicates);
2899	__out("\tfIsFragment = %s\n", fIsFragment ? "true" : "false");
2900}
2901#endif
2902
2903
2904// #pragma mark -
2905
2906
2907bool
2908bplustree_header::IsValid() const
2909{
2910	return Magic() == BPLUSTREE_MAGIC
2911		&& (RootNode() % NodeSize()) == 0
2912		&& IsValidLink(RootNode())
2913		&& IsValidLink(FreeNode());
2914}
2915
2916
2917// #pragma mark -
2918
2919
2920void
2921bplustree_node::Initialize()
2922{
2923	left_link = right_link = overflow_link
2924		= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
2925	all_key_count = 0;
2926	all_key_length = 0;
2927}
2928
2929
2930uint8*
2931bplustree_node::KeyAt(int32 index, uint16* keyLength) const
2932{
2933	if (index < 0 || index > NumKeys())
2934		return NULL;
2935
2936	uint8* keyStart = Keys();
2937	Unaligned<uint16>* keyLengths = KeyLengths();
2938
2939	*keyLength = BFS_ENDIAN_TO_HOST_INT16(keyLengths[index])
2940		- (index != 0 ? BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]) : 0);
2941	if (index > 0)
2942		keyStart += BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]);
2943
2944	return keyStart;
2945}
2946
2947
2948uint8
2949bplustree_node::CountDuplicates(off_t offset, bool isFragment) const
2950{
2951	// the duplicate fragment handling is currently hard-coded to a node size
2952	// of 1024 bytes - with future versions of BFS, this may be a problem
2953
2954	if (isFragment) {
2955		uint32 fragment = (NUM_FRAGMENT_VALUES + 1) * ((uint64)offset & 0x3ff);
2956
2957		return ((off_t*)this)[fragment];
2958	}
2959	return OverflowLink();
2960}
2961
2962
2963off_t
2964bplustree_node::DuplicateAt(off_t offset, bool isFragment, int8 index) const
2965{
2966	uint32 start;
2967	if (isFragment)
2968		start = 8 * ((uint64)offset & 0x3ff);
2969	else
2970		start = 2;
2971
2972	return ((off_t*)this)[start + 1 + index];
2973}
2974
2975
2976/*!	Although the name suggests it, this function doesn't return the real
2977	used fragment count; at least, it can only count to two: it returns
2978	0, if there is no fragment used, 1 if there is only one fragment
2979	used, and 2 if there are at least 2 fragments used.
2980*/
2981uint32
2982bplustree_node::FragmentsUsed(uint32 nodeSize) const
2983{
2984	uint32 used = 0;
2985	for (uint32 i = 0; i < MaxFragments(nodeSize); i++) {
2986		duplicate_array* array = FragmentAt(i);
2987		if (array->Count() > 0 && ++used > 1)
2988			return used;
2989	}
2990	return used;
2991}
2992
2993
2994status_t
2995bplustree_node::CheckIntegrity(uint32 nodeSize) const
2996{
2997	if (NumKeys() > nodeSize || AllKeyLength() > nodeSize)
2998		DEBUGGER(("invalid node: key/length count"));
2999
3000	for (int32 i = 0; i < NumKeys(); i++) {
3001		uint16 length = 0;
3002		uint8* key = KeyAt(i, &length);
3003		if (key + length + sizeof(off_t) + sizeof(uint16)
3004				> (uint8*)this + nodeSize
3005			|| length > BPLUSTREE_MAX_KEY_LENGTH) {
3006			dprintf("invalid node %p, key %d: keys corrupted\n", this, (int)i);
3007			return B_BAD_DATA;
3008		}
3009		if (Values()[i] == -1) {
3010			dprintf("invalid node %p, value %d: %" B_PRIdOFF ": values "
3011				"corrupted\n", this, (int)i, Values()[i].value);
3012			return B_BAD_DATA;
3013		}
3014	}
3015	return B_OK;
3016}
3017
3018
3019// #pragma mark -
3020
3021
3022#if !_BOOT_MODE
3023BitmapArray::BitmapArray(size_t numBits)
3024{
3025	fSize = (numBits + 7) / 8;
3026	fBitmap = (uint8*)calloc(fSize, 1);
3027	fCountSet = 0;
3028}
3029
3030
3031BitmapArray::~BitmapArray()
3032{
3033	free(fBitmap);
3034}
3035
3036
3037status_t
3038BitmapArray::InitCheck() const
3039{
3040	return fBitmap != NULL ? B_OK : B_NO_MEMORY;
3041}
3042
3043
3044bool
3045BitmapArray::IsSet(size_t index) const
3046{
3047	uint32 byteIndex = index / 8;
3048	if (byteIndex >= fSize)
3049		return false;
3050
3051	return (fBitmap[byteIndex] & (1UL << (index & 0x7))) != 0;
3052}
3053
3054
3055void
3056BitmapArray::Set(size_t index, bool set)
3057{
3058	uint32 byteIndex = index / 8;
3059	if (byteIndex >= fSize)
3060		return;
3061
3062	if (set) {
3063		fBitmap[byteIndex] |= 1UL << (index & 0x7);
3064		fCountSet++;
3065	} else {
3066		fBitmap[byteIndex] &= ~(1UL << (index & 0x7));
3067		fCountSet--;
3068	}
3069}
3070#endif // !_BOOT_MODE
3071
3072
3073// #pragma mark -
3074
3075
3076bool
3077duplicate_array::_FindInternal(off_t value, int32& index) const
3078{
3079	int32 min = 0, max = Count() - 1;
3080	off_t cmp;
3081	while (min <= max) {
3082		index = (min + max) / 2;
3083
3084		cmp = ValueAt(index) - value;
3085		if (cmp < 0)
3086			min = index + 1;
3087		else if (cmp > 0)
3088			max = index - 1;
3089		else
3090			return true;
3091	}
3092	return false;
3093}
3094
3095
3096void
3097duplicate_array::Insert(off_t value)
3098{
	// if there are more than 8 values in this array, use a
	// binary search; if not, just iterate linearly to find
	// the insertion point
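	// (e.g. inserting 5 into the sorted array [1, 3, 7] yields [1, 3, 5, 7])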
3102	int32 size = Count();
3103	int32 i = 0;
	if (size > 8) {
3105		if (!_FindInternal(value, i) && ValueAt(i) <= value)
3106			i++;
3107	} else {
3108		for (i = 0; i < size; i++) {
3109			if (ValueAt(i) > value)
3110				break;
3111		}
3112	}
3113
3114	memmove(&values[i + 1], &values[i], (size - i) * sizeof(off_t));
3115	values[i] = HOST_ENDIAN_TO_BFS_INT64(value);
3116	count = HOST_ENDIAN_TO_BFS_INT64(size + 1);
3117}
3118
3119
3120bool
3121duplicate_array::Remove(off_t value)
3122{
3123	int32 index = Find(value);
3124	if (index == -1)
3125		return false;
3126
3127	int32 newSize = Count() - 1;
3128	memmove(&values[index], &values[index + 1],
3129		(newSize - index) * sizeof(off_t));
3130	count = HOST_ENDIAN_TO_BFS_INT64(newSize);
3131
3132	return true;
3133}
3134
3135
3136#if _BOOT_MODE
3137} // namespace BFS
3138#endif
3139