1/* BPlusTree - BFS B+Tree implementation
2**
3** Copyright 2001-2002 pinc Software. All Rights Reserved.
4** Released under the terms of the MIT license.
5*/
6
7
8#include "BPlusTree.h"
9#include "Inode.h"
10#include "Stack.h"
11#include "dump.h"
12
13#include <Debug.h>
14
15#include <string.h>
16#include <stdlib.h>
17#include <stdio.h>
18
19
20#define MAX_NODES_IN_CACHE 20
21
22class CacheableNode : public NodeCache::Cacheable
23{
24	public:
25		CacheableNode(off_t offset,bplustree_node *node)
26			:
27			fOffset(offset),
28			fNode(node)
29		{
30		}
31
32		virtual ~CacheableNode()
33		{
34			if (fNode)
35				free(fNode);
36		}
37
38		virtual bool Equals(off_t offset)
39		{
40			return offset == fOffset;
41		}
42
43		off_t			fOffset;
44		bplustree_node	*fNode;
45};
46
47
48NodeCache::NodeCache(BPlusTree *tree)
49	:	Cache<off_t>(),
50	fTree(tree)
51{
52}
53
54
55Cache<off_t>::Cacheable *
56NodeCache::NewCacheable(off_t offset)
57{
58	return new CacheableNode(offset,fTree->Node(offset,false));
59}
60
61
62bplustree_node *
63NodeCache::Get(off_t offset)
64{
65	CacheableNode *node = (CacheableNode *)Cache<off_t>::Get(offset);
66	return node->fNode;
67}
68
69
70//	#pragma mark -
71
72
73BPlusTree::BPlusTree(int32 keyType,int32 nodeSize,bool allowDuplicates)
74	:
75	fStream(NULL),
76	fHeader(NULL),
77	fCache(this),
78	fCurrentNodeOffset(BPLUSTREE_NULL)
79{
80	SetTo(keyType,nodeSize,allowDuplicates);
81}
82
83
84BPlusTree::BPlusTree(BPositionIO *stream,bool allowDuplicates)
85	:
86	fStream(NULL),
87	fHeader(NULL),
88	fCache(this),
89	fCurrentNodeOffset(BPLUSTREE_NULL)
90{
91	SetTo(stream,allowDuplicates);
92}
93
94
95BPlusTree::BPlusTree()
96	:
97	fStream(NULL),
98	fHeader(NULL),
99	fNodeSize(BPLUSTREE_NODE_SIZE),
100	fAllowDuplicates(true),
101	fStatus(B_NO_INIT),
102	fCache(this),
103	fCurrentNodeOffset(BPLUSTREE_NULL)
104{
105}
106
107
108BPlusTree::~BPlusTree()
109{
110	fCache.Clear();
111}
112
113
114void BPlusTree::Initialize(int32 nodeSize)
115{
116	// free old data
117	fCache.Clear(0,true);
118
119	if (fHeader)
120		free(fHeader);
121
122	fStream = NULL;
123
124	fNodeSize = nodeSize;
125	fHeader = (bplustree_header *)malloc(fNodeSize);
126 	memset(fHeader,0,fNodeSize);
127
128 	fCurrentNodeOffset = BPLUSTREE_NULL;
129}
130
131
132status_t BPlusTree::SetTo(int32 keyType,int32 nodeSize,bool allowDuplicates)
133{
134	// initializes in-memory B+Tree
135
136	Initialize(nodeSize);
137
138	fAllowDuplicates = allowDuplicates;
139	fCache.SetHoldChanges(true);
140
141	// initialize b+tree header
142 	fHeader->magic = BPLUSTREE_MAGIC;
143 	fHeader->node_size = fNodeSize;
144 	fHeader->max_number_of_levels = 1;
145 	fHeader->data_type = keyType;
146 	fHeader->root_node_pointer = fNodeSize;
147 	fHeader->free_node_pointer = BPLUSTREE_NULL;
148 	fHeader->maximum_size = fNodeSize * 2;
149
150	return fStatus = B_OK;
151}
152
153
154status_t BPlusTree::SetTo(BPositionIO *stream,bool allowDuplicates)
155{
156	// initializes on-disk B+Tree
157
158	bplustree_header header;
159	ssize_t read = stream->ReadAt(0,&header,sizeof(bplustree_header));
160	if (read < 0)
161		return fStatus = read;
162
163	// is header valid?
164
165	stream->Seek(0,SEEK_END);
166	off_t size = stream->Position();
167	stream->Seek(0,SEEK_SET);
168
169	//dump_bplustree_header(&header);
170
171	if (header.magic != BPLUSTREE_MAGIC
172		|| header.maximum_size != size
173		|| (header.root_node_pointer % header.node_size) != 0
174		|| !header.IsValidLink(header.root_node_pointer)
175		|| !header.IsValidLink(header.free_node_pointer))
176		return fStatus = B_BAD_DATA;
177
178	fAllowDuplicates = allowDuplicates;
179
180	if (DataStream *dataStream = dynamic_cast<DataStream *>(stream))
181	{
182		uint32 toMode[] = {S_STR_INDEX, S_INT_INDEX, S_UINT_INDEX, S_LONG_LONG_INDEX,
183						   S_ULONG_LONG_INDEX, S_FLOAT_INDEX, S_DOUBLE_INDEX};
184		uint32 mode = dataStream->Mode() & (S_STR_INDEX | S_INT_INDEX | S_UINT_INDEX | S_LONG_LONG_INDEX
185						   | S_ULONG_LONG_INDEX | S_FLOAT_INDEX | S_DOUBLE_INDEX);
186
187		if (header.data_type > BPLUSTREE_DOUBLE_TYPE
188			|| (dataStream->Mode() & S_INDEX_DIR) && toMode[header.data_type] != mode
189			|| !dataStream->IsDirectory())
190			return fStatus = B_BAD_TYPE;
191
192		 // although it's in stat.h, the S_ALLOW_DUPS flag is obviously unused
193		fAllowDuplicates = (dataStream->Mode() & (S_INDEX_DIR | 0777)) == S_INDEX_DIR;
194
195		//printf("allows duplicates? %s\n",fAllowDuplicates ? "yes" : "no");
196	}
197
198	Initialize(header.node_size);
199
200	memcpy(fHeader,&header,sizeof(bplustree_header));
201
202	fStream = stream;
203
204	bplustree_node *node = fCache.Get(header.root_node_pointer);
205	//if (node)
206	//	dump_bplustree_node(node);
207
208	return fStatus = node && CheckNode(node) ? B_OK : B_BAD_DATA;
209}
210
211
212status_t BPlusTree::InitCheck()
213{
214	return fStatus;
215}
216
217
218struct validate_info {
219	off_t	offset;
220	off_t	from;
221	bool	free;
222};
223
224
225status_t
226BPlusTree::Validate(bool verbose)
227{
228	// validates the on-disk b+tree
229	// (for now only the node integrity is checked)
230
231	if (!fStream)
232		return B_OK;
233
234	struct validate_info info;
235
236	Stack<validate_info> stack;
237	info.offset = fHeader->root_node_pointer;
238	info.from = BPLUSTREE_NULL;
239	info.free = false;
240	stack.Push(info);
241
242	if (fHeader->free_node_pointer != BPLUSTREE_NULL) {
243		info.offset = fHeader->free_node_pointer;
244		info.free = true;
245		stack.Push(info);
246	}
247
248	char escape[3] = {0x1b, '[', 0};
249	int32 count = 0,freeCount = 0;
250
251	while (true) {
252		if (!stack.Pop(&info)) {
253			if (verbose) {
254				printf("  %" B_PRId32 " B+tree node(s) processed (%" B_PRId32
255					" free node(s)).\n", count, freeCount);
256			}
257			return B_OK;
258		}
259
260		if (!info.free && verbose && ++count % 10 == 0)
261			printf("  %" B_PRId32 "%s1A\n", count, escape);
262		else if (info.free)
263			freeCount++;
264
265		bplustree_node *node;
266		if ((node = fCache.Get(info.offset)) == NULL
267			|| !info.free && !CheckNode(node)) {
268			if (verbose) {
269				fprintf(stderr,"  B+Tree: Could not get data at position %"
270					B_PRIdOFF " (referenced by node at %" B_PRIdOFF ")\n",
271					info.offset, info.from);
272				if ((node = fCache.Get(info.from)) != NULL)
273					dump_bplustree_node(node,fHeader);
274			}
275			return B_BAD_DATA;
276		}
277
278		info.from = info.offset;
279
280		if (node->overflow_link != BPLUSTREE_NULL && !info.free) {
281			info.offset = node->overflow_link;
282			stack.Push(info);
283
284			//dump_bplustree_node(node,fHeader);
285			off_t *values = node->Values();
286			for (int32 i = 0;i < node->all_key_count;i++) {
287				info.offset = values[i];
288				stack.Push(info);
289			}
290		} else if (node->left_link != BPLUSTREE_NULL && info.free) {
291			info.offset = node->left_link;
292			stack.Push(info);
293		}
294	}
295}
296
297
298status_t
299BPlusTree::WriteTo(BPositionIO *stream)
300{
301	ssize_t written = stream->WriteAt(0,fHeader,fNodeSize);
302	if (written < fNodeSize)
303		return written < B_OK ? written : B_IO_ERROR;
304
305	for (off_t offset = fNodeSize;offset < fHeader->maximum_size;offset += fNodeSize)
306	{
307		bplustree_node *node = fCache.Get(offset);
308		if (node == NULL)
309			return B_ERROR;
310
311		written = stream->WriteAt(offset,node,fNodeSize);
312		if (written < fNodeSize)
313			return written < B_OK ? written : B_IO_ERROR;
314	}
315	return stream->SetSize(fHeader->maximum_size);
316}
317
318
319//	#pragma mark -
320
321
322void BPlusTree::SetCurrentNode(bplustree_node *node,off_t offset,int8 to)
323{
324	fCurrentNodeOffset = offset;
325	fCurrentKey = to == BPLUSTREE_BEGIN ? -1 : node->all_key_count;
326	fDuplicateNode = BPLUSTREE_NULL;
327}
328
329
330status_t BPlusTree::Goto(int8 to)
331{
332	if (fHeader == NULL)
333		return B_BAD_VALUE;
334
335	Stack<off_t> stack;
336	if (stack.Push(fHeader->root_node_pointer) < B_OK)
337		return B_NO_MEMORY;
338
339	bplustree_node *node;
340	off_t pos;
341	while (stack.Pop(&pos) && (node = fCache.Get(pos)) != NULL && CheckNode(node))
342	{
343		// is the node a leaf node?
344		if (node->overflow_link == BPLUSTREE_NULL)
345		{
346			SetCurrentNode(node,pos,to);
347
348			return B_OK;
349		}
350
351		if (to == BPLUSTREE_END || node->all_key_count == 0)
352			pos = node->overflow_link;
353		else
354		{
355			if (node->all_key_length > fNodeSize
356				|| (addr_t)node->Values() > (addr_t)node + fNodeSize
357					- 8 * node->all_key_count)
358				return B_ERROR;
359
360			pos = *node->Values();
361		}
362
363		if (stack.Push(pos) < B_OK)
364			break;
365	}
366	return B_ERROR;
367}
368
369
370status_t BPlusTree::Traverse(int8 direction,void *key,uint16 *keyLength,uint16 maxLength,off_t *value)
371{
372	if (fCurrentNodeOffset == BPLUSTREE_NULL
373		&& Goto(direction == BPLUSTREE_FORWARD ? BPLUSTREE_BEGIN : BPLUSTREE_END) < B_OK)
374		return B_ERROR;
375
376	bplustree_node *node;
377
378	if (fDuplicateNode != BPLUSTREE_NULL)
379	{
380		// regardless of traverse direction the duplicates are always presented in
381		// the same order; since they are all considered as equal, this shouldn't
382		// cause any problems
383
384		if (!fIsFragment || fDuplicate < fNumDuplicates)
385			node = fCache.Get(bplustree_node::FragmentOffset(fDuplicateNode));
386		else
387			node = NULL;
388
389		if (node != NULL)
390		{
391			if (!fIsFragment && fDuplicate >= fNumDuplicates)
392			{
393				// if the node is out of duplicates, we go directly to the next one
394				fDuplicateNode = node->right_link;
395				if (fDuplicateNode != BPLUSTREE_NULL
396					&& (node = fCache.Get(fDuplicateNode)) != NULL)
397				{
398					fNumDuplicates = node->CountDuplicates(fDuplicateNode,false);
399					fDuplicate = 0;
400				}
401			}
402			if (fDuplicate < fNumDuplicates)
403			{
404				*value = node->DuplicateAt(fDuplicateNode,fIsFragment,fDuplicate++);
405				return B_OK;
406			}
407		}
408		fDuplicateNode = BPLUSTREE_NULL;
409	}
410
411	off_t savedNodeOffset = fCurrentNodeOffset;
412	if ((node = fCache.Get(fCurrentNodeOffset)) == NULL || !CheckNode(node))
413		return B_ERROR;
414
415	fCurrentKey += direction;
416
417	// is the current key in the current node?
418	while ((direction == BPLUSTREE_FORWARD && fCurrentKey >= node->all_key_count)
419		   || (direction == BPLUSTREE_BACKWARD && fCurrentKey < 0))
420	{
421		fCurrentNodeOffset = direction == BPLUSTREE_FORWARD ? node->right_link : node->left_link;
422
423		// are there any more nodes?
424		if (fCurrentNodeOffset != BPLUSTREE_NULL)
425		{
426			node = fCache.Get(fCurrentNodeOffset);
427			if (!node || !CheckNode(node))
428				return B_ERROR;
429
430			// reset current key
431			fCurrentKey = direction == BPLUSTREE_FORWARD ? 0 : node->all_key_count;
432		}
433		else
434		{
435			// there are no nodes left, so turn back to the last key
436			fCurrentNodeOffset = savedNodeOffset;
437			fCurrentKey = direction == BPLUSTREE_FORWARD ? node->all_key_count : -1;
438
439			return B_ENTRY_NOT_FOUND;
440		}
441	}
442
443	if (node->all_key_count == 0)
444		return B_ERROR; //B_ENTRY_NOT_FOUND;
445
446	uint16 length;
447	uint8 *keyStart = node->KeyAt(fCurrentKey,&length);
448
449	length = min_c(length,maxLength);
450	memcpy(key,keyStart,length);
451
452	if (fHeader->data_type == BPLUSTREE_STRING_TYPE)	// terminate string type
453	{
454		if (length == maxLength)
455			length--;
456		((char *)key)[length] = '\0';
457	}
458	*keyLength = length;
459
460	off_t offset = node->Values()[fCurrentKey];
461
462	// duplicate fragments?
463	uint8 type = bplustree_node::LinkType(offset);
464	if (type == BPLUSTREE_DUPLICATE_FRAGMENT || type == BPLUSTREE_DUPLICATE_NODE)
465	{
466		fDuplicateNode = offset;
467
468		node = fCache.Get(bplustree_node::FragmentOffset(fDuplicateNode));
469		if (node == NULL)
470			return B_ERROR;
471
472		fIsFragment = type == BPLUSTREE_DUPLICATE_FRAGMENT;
473
474		fNumDuplicates = node->CountDuplicates(offset,fIsFragment);
475		if (fNumDuplicates)
476		{
477			// give back first duplicate
478			fDuplicate = 1;
479			offset = node->DuplicateAt(offset,fIsFragment,0);
480		}
481		else
482		{
483			// shouldn't happen, but we're dealing here with corrupt disks...
484			fDuplicateNode = BPLUSTREE_NULL;
485			offset = 0;
486		}
487	}
488	*value = offset;
489
490	return B_OK;
491}
492
493
494int32 BPlusTree::CompareKeys(const void *key1, int keyLength1, const void *key2, int keyLength2)
495{
496	switch (fHeader->data_type)
497	{
498	    case BPLUSTREE_STRING_TYPE:
499    	{
500			int len = min_c(keyLength1,keyLength2);
501			int result = strncmp((const char *)key1,(const char *)key2,len);
502
503			if (result == 0)
504				result = keyLength1 - keyLength2;
505
506			return result;
507		}
508
509		case BPLUSTREE_INT32_TYPE:
510			return *(int32 *)key1 - *(int32 *)key2;
511
512		case BPLUSTREE_UINT32_TYPE:
513		{
514			if (*(uint32 *)key1 == *(uint32 *)key2)
515				return 0;
516			else if (*(uint32 *)key1 > *(uint32 *)key2)
517				return 1;
518
519			return -1;
520		}
521
522		case BPLUSTREE_INT64_TYPE:
523		{
524			if (*(int64 *)key1 == *(int64 *)key2)
525				return 0;
526			else if (*(int64 *)key1 > *(int64 *)key2)
527				return 1;
528
529			return -1;
530		}
531
532		case BPLUSTREE_UINT64_TYPE:
533		{
534			if (*(uint64 *)key1 == *(uint64 *)key2)
535				return 0;
536			else if (*(uint64 *)key1 > *(uint64 *)key2)
537				return 1;
538
539			return -1;
540		}
541
542		case BPLUSTREE_FLOAT_TYPE:
543		{
544			float result = *(float *)key1 - *(float *)key2;
545			if (result == 0.0f)
546				return 0;
547
548			return (result < 0.0f) ? -1 : 1;
549		}
550
551		case BPLUSTREE_DOUBLE_TYPE:
552		{
553			double result = *(double *)key1 - *(double *)key2;
554			if (result == 0.0)
555				return 0;
556
557			return (result < 0.0) ? -1 : 1;
558		}
559	}
560	return 0;
561}
562
563
564status_t BPlusTree::FindKey(bplustree_node *node,uint8 *key,uint16 keyLength,uint16 *index,off_t *next)
565{
566	if (node->all_key_count == 0)
567	{
568		if (index)
569			*index = 0;
570		if (next)
571			*next = node->overflow_link;
572		return B_ENTRY_NOT_FOUND;
573	}
574
575	off_t *values = node->Values();
576	int16 saveIndex = 0;
577
578	// binary search in the key array
579	for (int16 first = 0,last = node->all_key_count - 1;first <= last;)
580	{
581		uint16 i = (first + last) >> 1;
582
583		uint16 searchLength;
584		uint8 *searchKey = node->KeyAt(i,&searchLength);
585
586		int32 cmp = CompareKeys(key,keyLength,searchKey,searchLength);
587		if (cmp < 0)
588		{
589			last = i - 1;
590			saveIndex = i;
591		}
592		else if (cmp > 0)
593		{
594			saveIndex = first = i + 1;
595		}
596		else
597		{
598			if (index)
599				*index = i;
600			if (next)
601				*next = values[i];
602			return B_OK;
603		}
604	}
605
606	if (index)
607		*index = saveIndex;
608	if (next)
609	{
610		if (saveIndex == node->all_key_count)
611			*next = node->overflow_link;
612		else
613			*next = values[saveIndex];
614	}
615	return B_ENTRY_NOT_FOUND;
616}
617
618
619status_t BPlusTree::SeekDown(Stack<node_and_key> &stack,uint8 *key,uint16 keyLength)
620{
621	node_and_key nodeAndKey;
622	nodeAndKey.nodeOffset = fHeader->root_node_pointer;
623	nodeAndKey.keyIndex = 0;
624
625	bplustree_node *node;
626	while ((node = fCache.Get(nodeAndKey.nodeOffset)) != NULL && CheckNode(node)) {
627		// if we are already on leaf level, we're done
628		if (node->overflow_link == BPLUSTREE_NULL) {
629			// put the node on the stack
630			// node that the keyIndex is not properly set here!
631			nodeAndKey.keyIndex = 0;
632			stack.Push(nodeAndKey);
633			return B_OK;
634		}
635
636		off_t nextOffset;
637		status_t status = FindKey(node,key,keyLength,&nodeAndKey.keyIndex,&nextOffset);
638
639		if (status == B_ENTRY_NOT_FOUND && nextOffset == nodeAndKey.nodeOffset)
640			return B_ERROR;
641
642		// put the node & the correct keyIndex on the stack
643		stack.Push(nodeAndKey);
644
645		nodeAndKey.nodeOffset = nextOffset;
646	}
647	return B_ERROR;
648}
649
650
651void BPlusTree::InsertKey(bplustree_node *node,uint8 *key,uint16 keyLength,off_t value,uint16 index)
652{
653	// should never happen, but who knows?
654	if (index > node->all_key_count)
655		return;
656
657	off_t *values = node->Values();
658	uint16 *keyLengths = node->KeyLengths();
659	uint8 *keys = node->Keys();
660
661	node->all_key_count++;
662	node->all_key_length += keyLength;
663
664	off_t *newValues = node->Values();
665	uint16 *newKeyLengths = node->KeyLengths();
666
667	// move values and copy new value into them
668	memmove(newValues + index + 1,values + index,sizeof(off_t) * (node->all_key_count - 1 - index));
669	memmove(newValues,values,sizeof(off_t) * index);
670
671	newValues[index] = value;
672
673	// move and update key length index
674	for (uint16 i = node->all_key_count;i-- > index + 1;)
675		newKeyLengths[i] = keyLengths[i - 1] + keyLength;
676	memmove(newKeyLengths,keyLengths,sizeof(uint16) * index);
677
678	int32 keyStart;
679	newKeyLengths[index] = keyLength + (keyStart = index > 0 ? newKeyLengths[index - 1] : 0);
680
681	// move keys and copy new key into them
682	int32 size = node->all_key_length - newKeyLengths[index];
683	if (size > 0)
684		memmove(keys + newKeyLengths[index],keys + newKeyLengths[index] - keyLength,size);
685
686	memcpy(keys + keyStart,key,keyLength);
687}
688
689
690status_t BPlusTree::InsertDuplicate(bplustree_node */*node*/,uint16 /*index*/)
691{
692	printf("DUPLICATE ENTRY!!\n");
693
694//		/* handle dup keys */
695//		if (dupflg == 0)
696//		{
697//			bt_errno(b) = BT_DUPKEY;
698//			goto bombout;
699//		}
700//		else
701//		{
702//			/* paste new data ptr in page */
703//			/* and write it out again. */
704//			off_t	*p;
705//			p = (off_t *)KEYCLD(op->p);
706//			*(p + keyat) = rrn;
707//			op->flags = BT_CHE_DIRTY;
708//			if(bt_wpage(b,op) == BT_ERR ||
709//				bt_wpage(b,kp) == BT_ERR)
710//				return(BT_ERR);
711//
712//			/* mark all as well with tree */
713//			bt_clearerr(b);
714//			return(BT_OK);
715//		}
716
717	return B_OK;
718}
719
720
721status_t BPlusTree::SplitNode(bplustree_node *node,off_t nodeOffset,uint16 *_keyIndex,uint8 *key,uint16 *_keyLength,off_t *_value)
722{
723	if (*_keyIndex > node->all_key_count + 1)
724		return B_BAD_VALUE;
725
726	//printf("before (insert \"%s\" (%d bytes) at %d):\n\n",key,*_keyLength,*_keyIndex);
727	//dump_bplustree_node(node,fHeader);
728	//hexdump(node,fNodeSize);
729
730	off_t otherOffset;
731	bplustree_node *other = fCache.Get(otherOffset = fHeader->maximum_size); //Node(otherOffset = fHeader->maximum_size/*,false*/);
732	if (other == NULL)
733		return B_NO_MEMORY;
734
735	uint16 *inKeyLengths = node->KeyLengths();
736	off_t *inKeyValues = node->Values();
737	uint8 *inKeys = node->Keys();
738	uint8 *outKeys = other->Keys();
739	uint16 keyIndex = *_keyIndex;
740
741	// how many keys will fit in one (half) page?
742
743	// "bytes" is the number of bytes written for the new key,
744	// "bytesBefore" are the bytes before that key
745	// "bytesAfter" are the bytes after the new key, if any
746	int32 bytes = 0,bytesBefore = 0,bytesAfter = 0;
747
748	size_t size = fNodeSize >> 1;
749	int32 out,in;
750	for (in = out = 0;in < node->all_key_count + 1;) {
751		if (!bytes)
752			bytesBefore = in > 0 ? inKeyLengths[in - 1] : 0;
753		if (in == keyIndex && !bytes) {
754			bytes = *_keyLength;
755		} else {
756			if (keyIndex < out) {
757				bytesAfter = inKeyLengths[in] - bytesBefore;
758				// fix the key lengths for the new node
759				inKeyLengths[in] = bytesAfter + bytesBefore + bytes;
760			} //else
761			in++;
762		}
763		out++;
764
765		if (round_up(sizeof(bplustree_node) + bytesBefore + bytesAfter + bytes) +
766						out * (sizeof(uint16) + sizeof(off_t)) >= size) {
767			// we have found the number of keys in the new node!
768			break;
769		}
770	}
771
772	// if the new key was not inserted, set the length of the keys
773	// that can be copied directly
774	if (keyIndex >= out && in > 0)
775		bytesBefore = inKeyLengths[in - 1];
776
777	if (bytesBefore < 0 || bytesAfter < 0)
778		return B_BAD_DATA;
779
780	//printf("put %ld keys in the other node (%ld, %ld, %ld) (in = %ld)\n",out,bytesBefore,bytes,bytesAfter,in);
781
782	other->left_link = node->left_link;
783	other->right_link = nodeOffset;
784	other->all_key_length = bytes + bytesBefore + bytesAfter;
785	other->all_key_count = out;
786	//printf("-> used = %ld (bplustree_node = %ld bytes)\n",other->Used(),sizeof(bplustree_node));
787
788	uint16 *outKeyLengths = other->KeyLengths();
789	off_t *outKeyValues = other->Values();
790	int32 keys = out > keyIndex ? keyIndex : out;
791
792	if (bytesBefore) {
793		// copy the keys
794		memcpy(outKeys,inKeys,bytesBefore);
795		memcpy(outKeyLengths,inKeyLengths,keys * sizeof(uint16));
796		memcpy(outKeyValues,inKeyValues,keys * sizeof(off_t));
797	}
798	if (bytes) {
799		// copy the newly inserted key
800		memcpy(outKeys + bytesBefore,key,bytes);
801		outKeyLengths[keyIndex] = bytes + bytesBefore;
802		outKeyValues[keyIndex] = *_value;
803
804		if (bytesAfter) {
805			// copy the keys after the new key
806			memcpy(outKeys + bytesBefore + bytes,inKeys + bytesBefore,bytesAfter);
807			keys = out - keyIndex - 1;
808			memcpy(outKeyLengths + keyIndex + 1,inKeyLengths + keyIndex,keys * sizeof(uint16));
809			memcpy(outKeyValues + keyIndex + 1,inKeyValues + keyIndex,keys * sizeof(off_t));
810		}
811	}
812
813	// if the new key was already inserted, we shouldn't use it again
814	if (in != out)
815		keyIndex--;
816
817	int32 total = bytesBefore + bytesAfter;
818
819	// if we have split an index node, we have to drop the first key
820	// of the next node (which can also be the new key to insert)
821	if (node->overflow_link != BPLUSTREE_NULL) {
822		if (in == keyIndex) {
823			other->overflow_link = *_value;
824			keyIndex--;
825		} else {
826			other->overflow_link = inKeyValues[in];
827			total = inKeyLengths[in++];
828		}
829	}
830
831	// and now the same game for the other page and the rest of the keys
832	// (but with memmove() instead of memcpy(), because they may overlap)
833
834	bytesBefore = bytesAfter = bytes = 0;
835	out = 0;
836	int32 skip = in;
837	while (in < node->all_key_count + 1) {
838		//printf("in = %ld, keyIndex = %d, bytes = %ld\n",in,keyIndex,bytes);
839		if (in == keyIndex && !bytes) {
840			bytesBefore = in > skip ? inKeyLengths[in - 1] : 0;
841			//printf("bytesBefore = %ld\n",bytesBefore);
842			bytes = *_keyLength;
843		} else if (in < node->all_key_count) {
844			//printf("1.inKeyLength[%ld] = %u\n",in,inKeyLengths[in]);
845			inKeyLengths[in] -= total;
846			if (bytes) {
847				inKeyLengths[in] += bytes;
848				bytesAfter = inKeyLengths[in] - bytesBefore - bytes;
849				//printf("2.inKeyLength[%ld] = %u, bytesAfter = %ld, bytesBefore = %ld\n",in,inKeyLengths[in],bytesAfter,bytesBefore);
850			}
851
852			in++;
853		} else
854			in++;
855
856		out++;
857
858		//printf("-> out = %ld, keylen = %ld, %ld bytes total\n",out,bytesBefore,round_up(sizeof(bplustree_node) + bytesBefore + bytesAfter + bytes) +
859		//				out * (sizeof(uint16) + sizeof(off_t)));
860		if (in > node->all_key_count && keyIndex < in)
861			break;
862	}
863
864	//printf("bytes = (%ld, %ld, %ld), in = %ld, total = %ld\n",bytesBefore,bytes,bytesAfter,in,total);
865
866	if (keyIndex >= in && keyIndex - skip < out) {
867		bytesAfter = inKeyLengths[in] - bytesBefore - total;
868	} else if (keyIndex < skip)
869		bytesBefore = node->all_key_length - total;
870
871	//printf("bytes = (%ld, %ld, %ld), in = %ld, total = %ld\n",bytesBefore,bytes,bytesAfter,in,total);
872
873	if (bytesBefore < 0 || bytesAfter < 0)
874		return B_BAD_DATA;
875
876	node->left_link = otherOffset;
877		// right link, and overflow link can stay the same
878	node->all_key_length = bytes + bytesBefore + bytesAfter;
879	node->all_key_count = out - 1;
880
881	// array positions have changed
882	outKeyLengths = node->KeyLengths();
883	outKeyValues = node->Values();
884
885	//printf("put %ld keys in the first node (%ld, %ld, %ld) (in = %ld)\n",out,bytesBefore,bytes,bytesAfter,in);
886
887	// move the keys in the old node: the order is important here,
888	// because we don't want to overwrite any contents
889
890	keys = keyIndex <= skip ? out : keyIndex - skip;
891	keyIndex -= skip;
892
893	if (bytesBefore)
894		memmove(inKeys,inKeys + total,bytesBefore);
895	if (bytesAfter)
896		memmove(inKeys + bytesBefore + bytes,inKeys + total + bytesBefore,bytesAfter);
897
898	if (bytesBefore)
899		memmove(outKeyLengths,inKeyLengths + skip,keys * sizeof(uint16));
900	in = out - keyIndex - 1;
901	if (bytesAfter)
902		memmove(outKeyLengths + keyIndex + 1,inKeyLengths + skip + keyIndex,in * sizeof(uint16));
903
904	if (bytesBefore)
905		memmove(outKeyValues,inKeyValues + skip,keys * sizeof(off_t));
906	if (bytesAfter)
907		memmove(outKeyValues + keyIndex + 1,inKeyValues + skip + keyIndex,in * sizeof(off_t));
908
909	if (bytes) {
910		// finally, copy the newly inserted key (don't overwrite anything)
911		memcpy(inKeys + bytesBefore,key,bytes);
912		outKeyLengths[keyIndex] = bytes + bytesBefore;
913		outKeyValues[keyIndex] = *_value;
914	}
915
916	//puts("\n!!!!!!!!!!!!!!! after: !!!!!!!!!!!!!!!\n");
917	//dump_bplustree_node(other,fHeader);
918	//hexdump(other,fNodeSize);
919	//puts("\n");
920	//dump_bplustree_node(node,fHeader);
921	//hexdump(node,fNodeSize);
922
923	// write the updated nodes back
924
925	fCache.SetDirty(otherOffset,true);
926	fCache.SetDirty(nodeOffset,true);
927
928	// update the right link of the node in the left of the new node
929	if (other->left_link != BPLUSTREE_NULL) {
930		bplustree_node *left = fCache.Get(other->left_link);
931		if (left != NULL) {
932			left->right_link = otherOffset;
933			fCache.SetDirty(other->left_link,true);
934		}
935	}
936
937	// prepare key to insert in the parent node
938
939	//printf("skip: %ld, count-1: %u\n",skip,other->all_key_count - 1);
940	uint16 length;
941	uint8 *lastKey = other->KeyAt(other->all_key_count - 1,&length);
942	memcpy(key,lastKey,length);
943	*_keyLength = length;
944	*_value = otherOffset;
945
946	return B_OK;
947}
948
949
950status_t BPlusTree::Insert(uint8 *key,uint16 keyLength,off_t value)
951{
952	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH)
953		return B_BAD_VALUE;
954
955	Stack<node_and_key> stack;
956	if (SeekDown(stack,key,keyLength) != B_OK)
957		return B_ERROR;
958
959	uint8 keyBuffer[BPLUSTREE_MAX_KEY_LENGTH + 1];
960
961	memcpy(keyBuffer,key,keyLength);
962	keyBuffer[keyLength] = 0;
963
964	off_t valueToInsert = value;
965
966	fCurrentNodeOffset = BPLUSTREE_NULL;
967
968	node_and_key nodeAndKey;
969	bplustree_node *node;
970	uint32 count = 0;
971
972	while (stack.Pop(&nodeAndKey) && (node = fCache.Get(nodeAndKey.nodeOffset)) != NULL && CheckNode(node))
973	{
974		if (count++ == 0)	// first round, check for duplicate entries
975		{
976			status_t status = FindKey(node,key,keyLength,&nodeAndKey.keyIndex);
977			if (status == B_ERROR)
978				return B_ERROR;
979
980			// is this a duplicate entry?
981			if (status == B_OK && node->overflow_link == BPLUSTREE_NULL)
982			{
983				if (fAllowDuplicates)
984					return InsertDuplicate(node,nodeAndKey.keyIndex);
985				else
986					return B_NAME_IN_USE;
987			}
988		}
989
990		// is the node big enough to hold the pair?
991		if (int32(round_up(sizeof(bplustree_node) + node->all_key_length + keyLength)
992			+ (node->all_key_count + 1) * (sizeof(uint16) + sizeof(off_t))) < fNodeSize)
993		{
994			InsertKey(node,keyBuffer,keyLength,valueToInsert,nodeAndKey.keyIndex);
995			fCache.SetDirty(nodeAndKey.nodeOffset,true);
996
997			return B_OK;
998		}
999		else
1000		{
1001			// do we need to allocate a new root node? if so, then do
1002			// it now
1003			bplustree_node *rootNode = NULL;
1004			off_t newRoot = BPLUSTREE_NULL;
1005			if (nodeAndKey.nodeOffset == fHeader->root_node_pointer) {
1006				rootNode = fCache.Get(newRoot = fHeader->maximum_size);
1007				if (rootNode == NULL) {
1008					// the tree is most likely dead
1009					return B_NO_MEMORY;
1010				}
1011			}
1012
1013			if (SplitNode(node,nodeAndKey.nodeOffset,&nodeAndKey.keyIndex,keyBuffer,&keyLength,&valueToInsert) < B_OK) {
1014				if (newRoot != BPLUSTREE_NULL) {
1015					// free root node
1016				}
1017				return B_ERROR;
1018			}
1019
1020			if (newRoot != BPLUSTREE_NULL) {
1021				rootNode->overflow_link = nodeAndKey.nodeOffset;
1022
1023				InsertKey(rootNode,keyBuffer,keyLength,node->left_link,0);
1024
1025				fHeader->root_node_pointer = newRoot;
1026				fHeader->max_number_of_levels++;
1027				// write header
1028
1029				fCache.SetDirty(newRoot,true);
1030
1031				return B_OK;
1032			}
1033		}
1034	}
1035
1036	return B_ERROR;
1037}
1038
1039
1040status_t BPlusTree::Find(uint8 *key,uint16 keyLength,off_t *value)
1041{
1042	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH)
1043		return B_BAD_VALUE;
1044
1045	Stack<node_and_key> stack;
1046	if (SeekDown(stack,key,keyLength) != B_OK)
1047		return B_ERROR;
1048
1049	fCurrentNodeOffset = BPLUSTREE_NULL;
1050
1051	node_and_key nodeAndKey;
1052	bplustree_node *node;
1053
1054	if (stack.Pop(&nodeAndKey) && (node = fCache.Get(nodeAndKey.nodeOffset)) != NULL && CheckNode(node))
1055	{
1056		status_t status = FindKey(node,key,keyLength,&nodeAndKey.keyIndex);
1057		if (status == B_ERROR)
1058			return B_ERROR;
1059
1060		SetCurrentNode(node,nodeAndKey.nodeOffset);
1061
1062		if (status == B_OK && node->overflow_link == BPLUSTREE_NULL)
1063		{
1064			*value = node->Values()[nodeAndKey.keyIndex];
1065			return B_OK;
1066		}
1067	}
1068	return B_ENTRY_NOT_FOUND;
1069}
1070
1071
1072//	#pragma mark -
1073
1074
1075bool
1076BPlusTree::CheckNode(bplustree_node *node)
1077{
1078	if (!fHeader->IsValidLink(node->left_link)
1079		|| !fHeader->IsValidLink(node->right_link)
1080		|| !fHeader->IsValidLink(node->overflow_link)
1081		|| (int8 *)node->Values() + node->all_key_count * sizeof(off_t) > (int8 *)node + fNodeSize)
1082		return false;
1083
1084	return true;
1085}
1086
1087
1088bplustree_node *BPlusTree::Node(off_t nodeOffset,bool check)
1089{
1090/*printf("1: %d,%d,%d\n",
1091	nodeOffset > fHeader->maximum_size - fNodeSize,
1092	nodeOffset < 0 && nodeOffset != BPLUSTREE_NULL,
1093	(nodeOffset % fNodeSize) != 0);
1094*/
1095	// the super node is always in memory, and shouldn't
1096	// never be taken out of the cache
1097	if (nodeOffset > fHeader->maximum_size /*- fNodeSize*/
1098		|| nodeOffset <= 0 && nodeOffset != BPLUSTREE_NULL
1099		|| (nodeOffset % fNodeSize) != 0)
1100		return NULL;
1101
1102	bplustree_node *node = (bplustree_node *)malloc(fNodeSize);
1103	if (node == NULL)
1104		return NULL;
1105
1106	if (nodeOffset == BPLUSTREE_NULL || !fStream)
1107	{
1108	 	node->left_link = BPLUSTREE_NULL;
1109	 	node->right_link = BPLUSTREE_NULL;
1110	 	node->overflow_link = BPLUSTREE_NULL;
1111	 	node->all_key_count = 0;
1112	 	node->all_key_length = 0;
1113	}
1114	else if (fStream && fStream->ReadAt(nodeOffset,node,fNodeSize) < fNodeSize)
1115	{
1116		free(node);
1117		return NULL;
1118	}
1119
1120	if (check && node != NULL)
1121	{
1122		// sanity checks (links, all_key_count)
1123		if (!fHeader->IsValidLink(node->left_link)
1124			|| !fHeader->IsValidLink(node->right_link)
1125			|| !fHeader->IsValidLink(node->overflow_link)
1126			|| (int8 *)node->Values() + node->all_key_count * sizeof(off_t) > (int8 *)node + fNodeSize)
1127			return NULL;
1128	}
1129
1130	if (!fStream && nodeOffset > fHeader->maximum_size - fNodeSize) {
1131		// do some hacks here
1132		fHeader->maximum_size += fNodeSize;
1133	}
1134
1135	return node;
1136}
1137
1138
1139void BPlusTree::SetHoldChanges(bool hold)
1140{
1141	fCache.SetHoldChanges(hold);
1142}
1143
1144
1145//	#pragma mark -
1146
1147
1148uint8 *bplustree_node::KeyAt(int32 index,uint16 *keyLength) const
1149{
1150	if (index < 0 || index > all_key_count)
1151		return NULL;
1152
1153	uint8 *keyStart = Keys();
1154	uint16 *keyLengths = KeyLengths();
1155
1156	*keyLength = keyLengths[index] - (index != 0 ? keyLengths[index - 1] : 0);
1157	if (index > 0)
1158		keyStart += keyLengths[index - 1];
1159
1160	return keyStart;
1161}
1162
1163
1164uint8 bplustree_node::CountDuplicates(off_t offset,bool isFragment) const
1165{
1166	// the duplicate fragment handling is currently hard-coded to a node size
1167	// of 1024 bytes - with future versions of BFS, this may be a problem
1168
1169	if (isFragment)
1170	{
1171		uint32 fragment = 8 * ((uint64)offset & 0x3ff);
1172
1173		return ((off_t *)this)[fragment];
1174	}
1175	return overflow_link;
1176}
1177
1178
1179off_t bplustree_node::DuplicateAt(off_t offset,bool isFragment,int8 index) const
1180{
1181	uint32 start;
1182	if (isFragment)
1183		start = 8 * ((uint64)offset & 0x3ff);
1184	else
1185		start = 2;
1186
1187	return ((off_t *)this)[start + 1 + index];
1188}
1189
1190