1/* BPlusTree - BFS B+Tree implementation
2**
3** Copyright 2001-2002 pinc Software. All Rights Reserved.
4*/
5
6
7#include "BPlusTree.h"
8#include "Inode.h"
9#include "Stack.h"
10#include "dump.h"
11
12#include <Debug.h>
13
14#include <string.h>
15#include <stdlib.h>
16#include <stdio.h>
17
18
19#define MAX_NODES_IN_CACHE 20
20
21class CacheableNode : public NodeCache::Cacheable
22{
23	public:
24		CacheableNode(off_t offset,bplustree_node *node)
25			:
26			fOffset(offset),
27			fNode(node)
28		{
29		}
30
31		virtual ~CacheableNode()
32		{
33			if (fNode)
34				free(fNode);
35		}
36
37		virtual bool Equals(off_t offset)
38		{
39			return offset == fOffset;
40		}
41
42		off_t			fOffset;
43		bplustree_node	*fNode;
44};
45
46
47NodeCache::NodeCache(BPlusTree *tree)
48	:	Cache<off_t>(),
49	fTree(tree)
50{
51}
52
53
54Cache<off_t>::Cacheable *
55NodeCache::NewCacheable(off_t offset)
56{
57	return new CacheableNode(offset,fTree->Node(offset,false));
58}
59
60
61bplustree_node *
62NodeCache::Get(off_t offset)
63{
64	CacheableNode *node = (CacheableNode *)Cache<off_t>::Get(offset);
65	return node->fNode;
66}
67
68
69//	#pragma mark -
70
71
72BPlusTree::BPlusTree(int32 keyType,int32 nodeSize,bool allowDuplicates)
73	:
74	fStream(NULL),
75	fHeader(NULL),
76	fCache(this),
77	fCurrentNodeOffset(BPLUSTREE_NULL)
78{
79	SetTo(keyType,nodeSize,allowDuplicates);
80}
81
82
83BPlusTree::BPlusTree(BPositionIO *stream,bool allowDuplicates)
84	:
85	fStream(NULL),
86	fHeader(NULL),
87	fCache(this),
88	fCurrentNodeOffset(BPLUSTREE_NULL)
89{
90	SetTo(stream,allowDuplicates);
91}
92
93
94BPlusTree::BPlusTree()
95	:
96	fStream(NULL),
97	fHeader(NULL),
98	fNodeSize(BPLUSTREE_NODE_SIZE),
99	fAllowDuplicates(true),
100	fStatus(B_NO_INIT),
101	fCache(this),
102	fCurrentNodeOffset(BPLUSTREE_NULL)
103{
104}
105
106
107BPlusTree::~BPlusTree()
108{
109	fCache.Clear();
110}
111
112
113void BPlusTree::Initialize(int32 nodeSize)
114{
115	// free old data
116	fCache.Clear(0,true);
117
118	if (fHeader)
119		free(fHeader);
120
121	fStream = NULL;
122
123	fNodeSize = nodeSize;
124	fHeader = (bplustree_header *)malloc(fNodeSize);
125 	memset(fHeader,0,fNodeSize);
126
127 	fCurrentNodeOffset = BPLUSTREE_NULL;
128}
129
130
131status_t BPlusTree::SetTo(int32 keyType,int32 nodeSize,bool allowDuplicates)
132{
133	// initializes in-memory B+Tree
134
135	Initialize(nodeSize);
136
137	fAllowDuplicates = allowDuplicates;
138	fCache.SetHoldChanges(true);
139
140	// initialize b+tree header
141 	fHeader->magic = BPLUSTREE_MAGIC;
142 	fHeader->node_size = fNodeSize;
143 	fHeader->max_number_of_levels = 1;
144 	fHeader->data_type = keyType;
145 	fHeader->root_node_pointer = fNodeSize;
146 	fHeader->free_node_pointer = BPLUSTREE_NULL;
147 	fHeader->maximum_size = fNodeSize * 2;
148
149	return fStatus = B_OK;
150}
151
152
153status_t BPlusTree::SetTo(BPositionIO *stream,bool allowDuplicates)
154{
155	// initializes on-disk B+Tree
156
157	bplustree_header header;
158	ssize_t read = stream->ReadAt(0,&header,sizeof(bplustree_header));
159	if (read < 0)
160		return fStatus = read;
161
162	// is header valid?
163
164	stream->Seek(0,SEEK_END);
165	off_t size = stream->Position();
166	stream->Seek(0,SEEK_SET);
167
168	//dump_bplustree_header(&header);
169
170	if (header.magic != BPLUSTREE_MAGIC
171		|| header.maximum_size != size
172		|| (header.root_node_pointer % header.node_size) != 0
173		|| !header.IsValidLink(header.root_node_pointer)
174		|| !header.IsValidLink(header.free_node_pointer))
175		return fStatus = B_BAD_DATA;
176
177	fAllowDuplicates = allowDuplicates;
178
179	if (DataStream *dataStream = dynamic_cast<DataStream *>(stream))
180	{
181		uint32 toMode[] = {S_STR_INDEX, S_INT_INDEX, S_UINT_INDEX, S_LONG_LONG_INDEX,
182						   S_ULONG_LONG_INDEX, S_FLOAT_INDEX, S_DOUBLE_INDEX};
183		uint32 mode = dataStream->Mode() & (S_STR_INDEX | S_INT_INDEX | S_UINT_INDEX | S_LONG_LONG_INDEX
184						   | S_ULONG_LONG_INDEX | S_FLOAT_INDEX | S_DOUBLE_INDEX);
185
186		if (header.data_type > BPLUSTREE_DOUBLE_TYPE
187			|| (dataStream->Mode() & S_INDEX_DIR) && toMode[header.data_type] != mode
188			|| !dataStream->IsDirectory())
189			return fStatus = B_BAD_TYPE;
190
191		 // although it's in stat.h, the S_ALLOW_DUPS flag is obviously unused
192		fAllowDuplicates = (dataStream->Mode() & (S_INDEX_DIR | 0777)) == S_INDEX_DIR;
193
194		//printf("allows duplicates? %s\n",fAllowDuplicates ? "yes" : "no");
195	}
196
197	Initialize(header.node_size);
198
199	memcpy(fHeader,&header,sizeof(bplustree_header));
200
201	fStream = stream;
202
203	bplustree_node *node = fCache.Get(header.root_node_pointer);
204	//if (node)
205	//	dump_bplustree_node(node);
206
207	return fStatus = node && CheckNode(node) ? B_OK : B_BAD_DATA;
208}
209
210
211status_t BPlusTree::InitCheck()
212{
213	return fStatus;
214}
215
216
217struct validate_info {
218	off_t	offset;
219	off_t	from;
220	bool	free;
221};
222
223
224status_t
225BPlusTree::Validate(bool verbose)
226{
227	// validates the on-disk b+tree
228	// (for now only the node integrity is checked)
229
230	if (!fStream)
231		return B_OK;
232
233	struct validate_info info;
234
235	Stack<validate_info> stack;
236	info.offset = fHeader->root_node_pointer;
237	info.from = BPLUSTREE_NULL;
238	info.free = false;
239	stack.Push(info);
240
241	if (fHeader->free_node_pointer != BPLUSTREE_NULL) {
242		info.offset = fHeader->free_node_pointer;
243		info.free = true;
244		stack.Push(info);
245	}
246
247	char escape[3] = {0x1b, '[', 0};
248	int32 count = 0,freeCount = 0;
249
250	while (true) {
251		if (!stack.Pop(&info)) {
252			if (verbose) {
253				printf("  %ld B+tree node(s) processed (%ld free node(s)).\n",
254					count, freeCount);
255			}
256			return B_OK;
257		}
258
259		if (!info.free && verbose && ++count % 10 == 0)
260			printf("  %ld%s1A\n", count, escape);
261		else if (info.free)
262			freeCount++;
263
264		bplustree_node *node;
265		if ((node = fCache.Get(info.offset)) == NULL
266			|| !info.free && !CheckNode(node)) {
267			if (verbose) {
268				fprintf(stderr,"  B+Tree: Could not get data at position %Ld (referenced by node at %Ld)\n",info.offset,info.from);
269				if ((node = fCache.Get(info.from)) != NULL)
270					dump_bplustree_node(node,fHeader);
271			}
272			return B_BAD_DATA;
273		}
274
275		info.from = info.offset;
276
277		if (node->overflow_link != BPLUSTREE_NULL && !info.free) {
278			info.offset = node->overflow_link;
279			stack.Push(info);
280
281			//dump_bplustree_node(node,fHeader);
282			off_t *values = node->Values();
283			for (int32 i = 0;i < node->all_key_count;i++) {
284				info.offset = values[i];
285				stack.Push(info);
286			}
287		} else if (node->left_link != BPLUSTREE_NULL && info.free) {
288			info.offset = node->left_link;
289			stack.Push(info);
290		}
291	}
292}
293
294
295status_t
296BPlusTree::WriteTo(BPositionIO *stream)
297{
298	ssize_t written = stream->WriteAt(0,fHeader,fNodeSize);
299	if (written < fNodeSize)
300		return written < B_OK ? written : B_IO_ERROR;
301
302	for (off_t offset = fNodeSize;offset < fHeader->maximum_size;offset += fNodeSize)
303	{
304		bplustree_node *node = fCache.Get(offset);
305		if (node == NULL)
306			return B_ERROR;
307
308		written = stream->WriteAt(offset,node,fNodeSize);
309		if (written < fNodeSize)
310			return written < B_OK ? written : B_IO_ERROR;
311	}
312	return stream->SetSize(fHeader->maximum_size);
313}
314
315
316//	#pragma mark -
317
318
319void BPlusTree::SetCurrentNode(bplustree_node *node,off_t offset,int8 to)
320{
321	fCurrentNodeOffset = offset;
322	fCurrentKey = to == BPLUSTREE_BEGIN ? -1 : node->all_key_count;
323	fDuplicateNode = BPLUSTREE_NULL;
324}
325
326
327status_t BPlusTree::Goto(int8 to)
328{
329	if (fHeader == NULL)
330		return B_BAD_VALUE;
331
332	Stack<off_t> stack;
333	if (stack.Push(fHeader->root_node_pointer) < B_OK)
334		return B_NO_MEMORY;
335
336	bplustree_node *node;
337	off_t pos;
338	while (stack.Pop(&pos) && (node = fCache.Get(pos)) != NULL && CheckNode(node))
339	{
340		// is the node a leaf node?
341		if (node->overflow_link == BPLUSTREE_NULL)
342		{
343			SetCurrentNode(node,pos,to);
344
345			return B_OK;
346		}
347
348		if (to == BPLUSTREE_END || node->all_key_count == 0)
349			pos = node->overflow_link;
350		else
351		{
352			if (node->all_key_length > fNodeSize
353				|| (uint32)node->Values() > (uint32)node + fNodeSize - 8 * node->all_key_count)
354				return B_ERROR;
355
356			pos = *node->Values();
357		}
358
359		if (stack.Push(pos) < B_OK)
360			break;
361	}
362	return B_ERROR;
363}
364
365
366status_t BPlusTree::Traverse(int8 direction,void *key,uint16 *keyLength,uint16 maxLength,off_t *value)
367{
368	if (fCurrentNodeOffset == BPLUSTREE_NULL
369		&& Goto(direction == BPLUSTREE_FORWARD ? BPLUSTREE_BEGIN : BPLUSTREE_END) < B_OK)
370		return B_ERROR;
371
372	bplustree_node *node;
373
374	if (fDuplicateNode != BPLUSTREE_NULL)
375	{
376		// regardless of traverse direction the duplicates are always presented in
377		// the same order; since they are all considered as equal, this shouldn't
378		// cause any problems
379
380		if (!fIsFragment || fDuplicate < fNumDuplicates)
381			node = fCache.Get(bplustree_node::FragmentOffset(fDuplicateNode));
382		else
383			node = NULL;
384
385		if (node != NULL)
386		{
387			if (!fIsFragment && fDuplicate >= fNumDuplicates)
388			{
389				// if the node is out of duplicates, we go directly to the next one
390				fDuplicateNode = node->right_link;
391				if (fDuplicateNode != BPLUSTREE_NULL
392					&& (node = fCache.Get(fDuplicateNode)) != NULL)
393				{
394					fNumDuplicates = node->CountDuplicates(fDuplicateNode,false);
395					fDuplicate = 0;
396				}
397			}
398			if (fDuplicate < fNumDuplicates)
399			{
400				*value = node->DuplicateAt(fDuplicateNode,fIsFragment,fDuplicate++);
401				return B_OK;
402			}
403		}
404		fDuplicateNode = BPLUSTREE_NULL;
405	}
406
407	off_t savedNodeOffset = fCurrentNodeOffset;
408	if ((node = fCache.Get(fCurrentNodeOffset)) == NULL || !CheckNode(node))
409		return B_ERROR;
410
411	fCurrentKey += direction;
412
413	// is the current key in the current node?
414	while ((direction == BPLUSTREE_FORWARD && fCurrentKey >= node->all_key_count)
415		   || (direction == BPLUSTREE_BACKWARD && fCurrentKey < 0))
416	{
417		fCurrentNodeOffset = direction == BPLUSTREE_FORWARD ? node->right_link : node->left_link;
418
419		// are there any more nodes?
420		if (fCurrentNodeOffset != BPLUSTREE_NULL)
421		{
422			node = fCache.Get(fCurrentNodeOffset);
423			if (!node || !CheckNode(node))
424				return B_ERROR;
425
426			// reset current key
427			fCurrentKey = direction == BPLUSTREE_FORWARD ? 0 : node->all_key_count;
428		}
429		else
430		{
431			// there are no nodes left, so turn back to the last key
432			fCurrentNodeOffset = savedNodeOffset;
433			fCurrentKey = direction == BPLUSTREE_FORWARD ? node->all_key_count : -1;
434
435			return B_ENTRY_NOT_FOUND;
436		}
437	}
438
439	if (node->all_key_count == 0)
440		return B_ERROR; //B_ENTRY_NOT_FOUND;
441
442	uint16 length;
443	uint8 *keyStart = node->KeyAt(fCurrentKey,&length);
444
445	length = min_c(length,maxLength);
446	memcpy(key,keyStart,length);
447
448	if (fHeader->data_type == BPLUSTREE_STRING_TYPE)	// terminate string type
449	{
450		if (length == maxLength)
451			length--;
452		((char *)key)[length] = '\0';
453	}
454	*keyLength = length;
455
456	off_t offset = node->Values()[fCurrentKey];
457
458	// duplicate fragments?
459	uint8 type = bplustree_node::LinkType(offset);
460	if (type == BPLUSTREE_DUPLICATE_FRAGMENT || type == BPLUSTREE_DUPLICATE_NODE)
461	{
462		fDuplicateNode = offset;
463
464		node = fCache.Get(bplustree_node::FragmentOffset(fDuplicateNode));
465		if (node == NULL)
466			return B_ERROR;
467
468		fIsFragment = type == BPLUSTREE_DUPLICATE_FRAGMENT;
469
470		fNumDuplicates = node->CountDuplicates(offset,fIsFragment);
471		if (fNumDuplicates)
472		{
473			// give back first duplicate
474			fDuplicate = 1;
475			offset = node->DuplicateAt(offset,fIsFragment,0);
476		}
477		else
478		{
479			// shouldn't happen, but we're dealing here with corrupt disks...
480			fDuplicateNode = BPLUSTREE_NULL;
481			offset = 0;
482		}
483	}
484	*value = offset;
485
486	return B_OK;
487}
488
489
490int32 BPlusTree::CompareKeys(const void *key1, int keyLength1, const void *key2, int keyLength2)
491{
492	switch (fHeader->data_type)
493	{
494	    case BPLUSTREE_STRING_TYPE:
495    	{
496			int len = min_c(keyLength1,keyLength2);
497			int result = strncmp((const char *)key1,(const char *)key2,len);
498
499			if (result == 0)
500				result = keyLength1 - keyLength2;
501
502			return result;
503		}
504
505		case BPLUSTREE_INT32_TYPE:
506			return *(int32 *)key1 - *(int32 *)key2;
507
508		case BPLUSTREE_UINT32_TYPE:
509		{
510			if (*(uint32 *)key1 == *(uint32 *)key2)
511				return 0;
512			else if (*(uint32 *)key1 > *(uint32 *)key2)
513				return 1;
514
515			return -1;
516		}
517
518		case BPLUSTREE_INT64_TYPE:
519		{
520			if (*(int64 *)key1 == *(int64 *)key2)
521				return 0;
522			else if (*(int64 *)key1 > *(int64 *)key2)
523				return 1;
524
525			return -1;
526		}
527
528		case BPLUSTREE_UINT64_TYPE:
529		{
530			if (*(uint64 *)key1 == *(uint64 *)key2)
531				return 0;
532			else if (*(uint64 *)key1 > *(uint64 *)key2)
533				return 1;
534
535			return -1;
536		}
537
538		case BPLUSTREE_FLOAT_TYPE:
539		{
540			float result = *(float *)key1 - *(float *)key2;
541			if (result == 0.0f)
542				return 0;
543
544			return (result < 0.0f) ? -1 : 1;
545		}
546
547		case BPLUSTREE_DOUBLE_TYPE:
548		{
549			double result = *(double *)key1 - *(double *)key2;
550			if (result == 0.0)
551				return 0;
552
553			return (result < 0.0) ? -1 : 1;
554		}
555	}
556	return 0;
557}
558
559
560status_t BPlusTree::FindKey(bplustree_node *node,uint8 *key,uint16 keyLength,uint16 *index,off_t *next)
561{
562	if (node->all_key_count == 0)
563	{
564		if (index)
565			*index = 0;
566		if (next)
567			*next = node->overflow_link;
568		return B_ENTRY_NOT_FOUND;
569	}
570
571	off_t *values = node->Values();
572	int16 saveIndex = 0;
573
574	// binary search in the key array
575	for (int16 first = 0,last = node->all_key_count - 1;first <= last;)
576	{
577		uint16 i = (first + last) >> 1;
578
579		uint16 searchLength;
580		uint8 *searchKey = node->KeyAt(i,&searchLength);
581
582		int32 cmp = CompareKeys(key,keyLength,searchKey,searchLength);
583		if (cmp < 0)
584		{
585			last = i - 1;
586			saveIndex = i;
587		}
588		else if (cmp > 0)
589		{
590			saveIndex = first = i + 1;
591		}
592		else
593		{
594			if (index)
595				*index = i;
596			if (next)
597				*next = values[i];
598			return B_OK;
599		}
600	}
601
602	if (index)
603		*index = saveIndex;
604	if (next)
605	{
606		if (saveIndex == node->all_key_count)
607			*next = node->overflow_link;
608		else
609			*next = values[saveIndex];
610	}
611	return B_ENTRY_NOT_FOUND;
612}
613
614
615status_t BPlusTree::SeekDown(Stack<node_and_key> &stack,uint8 *key,uint16 keyLength)
616{
617	node_and_key nodeAndKey;
618	nodeAndKey.nodeOffset = fHeader->root_node_pointer;
619	nodeAndKey.keyIndex = 0;
620
621	bplustree_node *node;
622	while ((node = fCache.Get(nodeAndKey.nodeOffset)) != NULL && CheckNode(node)) {
623		// if we are already on leaf level, we're done
624		if (node->overflow_link == BPLUSTREE_NULL) {
625			// put the node on the stack
626			// node that the keyIndex is not properly set here!
627			nodeAndKey.keyIndex = 0;
628			stack.Push(nodeAndKey);
629			return B_OK;
630		}
631
632		off_t nextOffset;
633		status_t status = FindKey(node,key,keyLength,&nodeAndKey.keyIndex,&nextOffset);
634
635		if (status == B_ENTRY_NOT_FOUND && nextOffset == nodeAndKey.nodeOffset)
636			return B_ERROR;
637
638		// put the node & the correct keyIndex on the stack
639		stack.Push(nodeAndKey);
640
641		nodeAndKey.nodeOffset = nextOffset;
642	}
643	return B_ERROR;
644}
645
646
647void BPlusTree::InsertKey(bplustree_node *node,uint8 *key,uint16 keyLength,off_t value,uint16 index)
648{
649	// should never happen, but who knows?
650	if (index > node->all_key_count)
651		return;
652
653	off_t *values = node->Values();
654	uint16 *keyLengths = node->KeyLengths();
655	uint8 *keys = node->Keys();
656
657	node->all_key_count++;
658	node->all_key_length += keyLength;
659
660	off_t *newValues = node->Values();
661	uint16 *newKeyLengths = node->KeyLengths();
662
663	// move values and copy new value into them
664	memmove(newValues + index + 1,values + index,sizeof(off_t) * (node->all_key_count - 1 - index));
665	memmove(newValues,values,sizeof(off_t) * index);
666
667	newValues[index] = value;
668
669	// move and update key length index
670	for (uint16 i = node->all_key_count;i-- > index + 1;)
671		newKeyLengths[i] = keyLengths[i - 1] + keyLength;
672	memmove(newKeyLengths,keyLengths,sizeof(uint16) * index);
673
674	int32 keyStart;
675	newKeyLengths[index] = keyLength + (keyStart = index > 0 ? newKeyLengths[index - 1] : 0);
676
677	// move keys and copy new key into them
678	int32 size = node->all_key_length - newKeyLengths[index];
679	if (size > 0)
680		memmove(keys + newKeyLengths[index],keys + newKeyLengths[index] - keyLength,size);
681
682	memcpy(keys + keyStart,key,keyLength);
683}
684
685
686status_t BPlusTree::InsertDuplicate(bplustree_node */*node*/,uint16 /*index*/)
687{
688	printf("DUPLICATE ENTRY!!\n");
689
690//		/* handle dup keys */
691//		if (dupflg == 0)
692//		{
693//			bt_errno(b) = BT_DUPKEY;
694//			goto bombout;
695//		}
696//		else
697//		{
698//			/* paste new data ptr in page */
699//			/* and write it out again. */
700//			off_t	*p;
701//			p = (off_t *)KEYCLD(op->p);
702//			*(p + keyat) = rrn;
703//			op->flags = BT_CHE_DIRTY;
704//			if(bt_wpage(b,op) == BT_ERR ||
705//				bt_wpage(b,kp) == BT_ERR)
706//				return(BT_ERR);
707//
708//			/* mark all as well with tree */
709//			bt_clearerr(b);
710//			return(BT_OK);
711//		}
712
713	return B_OK;
714}
715
716
717status_t BPlusTree::SplitNode(bplustree_node *node,off_t nodeOffset,uint16 *_keyIndex,uint8 *key,uint16 *_keyLength,off_t *_value)
718{
719	if (*_keyIndex > node->all_key_count + 1)
720		return B_BAD_VALUE;
721
722	//printf("before (insert \"%s\" (%d bytes) at %d):\n\n",key,*_keyLength,*_keyIndex);
723	//dump_bplustree_node(node,fHeader);
724	//hexdump(node,fNodeSize);
725
726	off_t otherOffset;
727	bplustree_node *other = fCache.Get(otherOffset = fHeader->maximum_size); //Node(otherOffset = fHeader->maximum_size/*,false*/);
728	if (other == NULL)
729		return B_NO_MEMORY;
730
731	uint16 *inKeyLengths = node->KeyLengths();
732	off_t *inKeyValues = node->Values();
733	uint8 *inKeys = node->Keys();
734	uint8 *outKeys = other->Keys();
735	uint16 keyIndex = *_keyIndex;
736
737	// how many keys will fit in one (half) page?
738
739	// "bytes" is the number of bytes written for the new key,
740	// "bytesBefore" are the bytes before that key
741	// "bytesAfter" are the bytes after the new key, if any
742	int32 bytes = 0,bytesBefore = 0,bytesAfter = 0;
743
744	size_t size = fNodeSize >> 1;
745	int32 out,in;
746	for (in = out = 0;in < node->all_key_count + 1;) {
747		if (!bytes)
748			bytesBefore = in > 0 ? inKeyLengths[in - 1] : 0;
749		if (in == keyIndex && !bytes) {
750			bytes = *_keyLength;
751		} else {
752			if (keyIndex < out) {
753				bytesAfter = inKeyLengths[in] - bytesBefore;
754				// fix the key lengths for the new node
755				inKeyLengths[in] = bytesAfter + bytesBefore + bytes;
756			} //else
757			in++;
758		}
759		out++;
760
761		if (round_up(sizeof(bplustree_node) + bytesBefore + bytesAfter + bytes) +
762						out * (sizeof(uint16) + sizeof(off_t)) >= size) {
763			// we have found the number of keys in the new node!
764			break;
765		}
766	}
767
768	// if the new key was not inserted, set the length of the keys
769	// that can be copied directly
770	if (keyIndex >= out && in > 0)
771		bytesBefore = inKeyLengths[in - 1];
772
773	if (bytesBefore < 0 || bytesAfter < 0)
774		return B_BAD_DATA;
775
776	//printf("put %ld keys in the other node (%ld, %ld, %ld) (in = %ld)\n",out,bytesBefore,bytes,bytesAfter,in);
777
778	other->left_link = node->left_link;
779	other->right_link = nodeOffset;
780	other->all_key_length = bytes + bytesBefore + bytesAfter;
781	other->all_key_count = out;
782	//printf("-> used = %ld (bplustree_node = %ld bytes)\n",other->Used(),sizeof(bplustree_node));
783
784	uint16 *outKeyLengths = other->KeyLengths();
785	off_t *outKeyValues = other->Values();
786	int32 keys = out > keyIndex ? keyIndex : out;
787
788	if (bytesBefore) {
789		// copy the keys
790		memcpy(outKeys,inKeys,bytesBefore);
791		memcpy(outKeyLengths,inKeyLengths,keys * sizeof(uint16));
792		memcpy(outKeyValues,inKeyValues,keys * sizeof(off_t));
793	}
794	if (bytes) {
795		// copy the newly inserted key
796		memcpy(outKeys + bytesBefore,key,bytes);
797		outKeyLengths[keyIndex] = bytes + bytesBefore;
798		outKeyValues[keyIndex] = *_value;
799
800		if (bytesAfter) {
801			// copy the keys after the new key
802			memcpy(outKeys + bytesBefore + bytes,inKeys + bytesBefore,bytesAfter);
803			keys = out - keyIndex - 1;
804			memcpy(outKeyLengths + keyIndex + 1,inKeyLengths + keyIndex,keys * sizeof(uint16));
805			memcpy(outKeyValues + keyIndex + 1,inKeyValues + keyIndex,keys * sizeof(off_t));
806		}
807	}
808
809	// if the new key was already inserted, we shouldn't use it again
810	if (in != out)
811		keyIndex--;
812
813	int32 total = bytesBefore + bytesAfter;
814
815	// if we have split an index node, we have to drop the first key
816	// of the next node (which can also be the new key to insert)
817	if (node->overflow_link != BPLUSTREE_NULL) {
818		if (in == keyIndex) {
819			other->overflow_link = *_value;
820			keyIndex--;
821		} else {
822			other->overflow_link = inKeyValues[in];
823			total = inKeyLengths[in++];
824		}
825	}
826
827	// and now the same game for the other page and the rest of the keys
828	// (but with memmove() instead of memcpy(), because they may overlap)
829
830	bytesBefore = bytesAfter = bytes = 0;
831	out = 0;
832	int32 skip = in;
833	while (in < node->all_key_count + 1) {
834		//printf("in = %ld, keyIndex = %d, bytes = %ld\n",in,keyIndex,bytes);
835		if (in == keyIndex && !bytes) {
836			bytesBefore = in > skip ? inKeyLengths[in - 1] : 0;
837			//printf("bytesBefore = %ld\n",bytesBefore);
838			bytes = *_keyLength;
839		} else if (in < node->all_key_count) {
840			//printf("1.inKeyLength[%ld] = %u\n",in,inKeyLengths[in]);
841			inKeyLengths[in] -= total;
842			if (bytes) {
843				inKeyLengths[in] += bytes;
844				bytesAfter = inKeyLengths[in] - bytesBefore - bytes;
845				//printf("2.inKeyLength[%ld] = %u, bytesAfter = %ld, bytesBefore = %ld\n",in,inKeyLengths[in],bytesAfter,bytesBefore);
846			}
847
848			in++;
849		} else
850			in++;
851
852		out++;
853
854		//printf("-> out = %ld, keylen = %ld, %ld bytes total\n",out,bytesBefore,round_up(sizeof(bplustree_node) + bytesBefore + bytesAfter + bytes) +
855		//				out * (sizeof(uint16) + sizeof(off_t)));
856		if (in > node->all_key_count && keyIndex < in)
857			break;
858	}
859
860	//printf("bytes = (%ld, %ld, %ld), in = %ld, total = %ld\n",bytesBefore,bytes,bytesAfter,in,total);
861
862	if (keyIndex >= in && keyIndex - skip < out) {
863		bytesAfter = inKeyLengths[in] - bytesBefore - total;
864	} else if (keyIndex < skip)
865		bytesBefore = node->all_key_length - total;
866
867	//printf("bytes = (%ld, %ld, %ld), in = %ld, total = %ld\n",bytesBefore,bytes,bytesAfter,in,total);
868
869	if (bytesBefore < 0 || bytesAfter < 0)
870		return B_BAD_DATA;
871
872	node->left_link = otherOffset;
873		// right link, and overflow link can stay the same
874	node->all_key_length = bytes + bytesBefore + bytesAfter;
875	node->all_key_count = out - 1;
876
877	// array positions have changed
878	outKeyLengths = node->KeyLengths();
879	outKeyValues = node->Values();
880
881	//printf("put %ld keys in the first node (%ld, %ld, %ld) (in = %ld)\n",out,bytesBefore,bytes,bytesAfter,in);
882
883	// move the keys in the old node: the order is important here,
884	// because we don't want to overwrite any contents
885
886	keys = keyIndex <= skip ? out : keyIndex - skip;
887	keyIndex -= skip;
888
889	if (bytesBefore)
890		memmove(inKeys,inKeys + total,bytesBefore);
891	if (bytesAfter)
892		memmove(inKeys + bytesBefore + bytes,inKeys + total + bytesBefore,bytesAfter);
893
894	if (bytesBefore)
895		memmove(outKeyLengths,inKeyLengths + skip,keys * sizeof(uint16));
896	in = out - keyIndex - 1;
897	if (bytesAfter)
898		memmove(outKeyLengths + keyIndex + 1,inKeyLengths + skip + keyIndex,in * sizeof(uint16));
899
900	if (bytesBefore)
901		memmove(outKeyValues,inKeyValues + skip,keys * sizeof(off_t));
902	if (bytesAfter)
903		memmove(outKeyValues + keyIndex + 1,inKeyValues + skip + keyIndex,in * sizeof(off_t));
904
905	if (bytes) {
906		// finally, copy the newly inserted key (don't overwrite anything)
907		memcpy(inKeys + bytesBefore,key,bytes);
908		outKeyLengths[keyIndex] = bytes + bytesBefore;
909		outKeyValues[keyIndex] = *_value;
910	}
911
912	//puts("\n!!!!!!!!!!!!!!! after: !!!!!!!!!!!!!!!\n");
913	//dump_bplustree_node(other,fHeader);
914	//hexdump(other,fNodeSize);
915	//puts("\n");
916	//dump_bplustree_node(node,fHeader);
917	//hexdump(node,fNodeSize);
918
919	// write the updated nodes back
920
921	fCache.SetDirty(otherOffset,true);
922	fCache.SetDirty(nodeOffset,true);
923
924	// update the right link of the node in the left of the new node
925	if (other->left_link != BPLUSTREE_NULL) {
926		bplustree_node *left = fCache.Get(other->left_link);
927		if (left != NULL) {
928			left->right_link = otherOffset;
929			fCache.SetDirty(other->left_link,true);
930		}
931	}
932
933	// prepare key to insert in the parent node
934
935	//printf("skip: %ld, count-1: %u\n",skip,other->all_key_count - 1);
936	uint16 length;
937	uint8 *lastKey = other->KeyAt(other->all_key_count - 1,&length);
938	memcpy(key,lastKey,length);
939	*_keyLength = length;
940	*_value = otherOffset;
941
942	return B_OK;
943}
944
945
946status_t BPlusTree::Insert(uint8 *key,uint16 keyLength,off_t value)
947{
948	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH)
949		return B_BAD_VALUE;
950
951	Stack<node_and_key> stack;
952	if (SeekDown(stack,key,keyLength) != B_OK)
953		return B_ERROR;
954
955	uint8 keyBuffer[BPLUSTREE_MAX_KEY_LENGTH + 1];
956
957	memcpy(keyBuffer,key,keyLength);
958	keyBuffer[keyLength] = 0;
959
960	off_t valueToInsert = value;
961
962	fCurrentNodeOffset = BPLUSTREE_NULL;
963
964	node_and_key nodeAndKey;
965	bplustree_node *node;
966	uint32 count = 0;
967
968	while (stack.Pop(&nodeAndKey) && (node = fCache.Get(nodeAndKey.nodeOffset)) != NULL && CheckNode(node))
969	{
970		if (count++ == 0)	// first round, check for duplicate entries
971		{
972			status_t status = FindKey(node,key,keyLength,&nodeAndKey.keyIndex);
973			if (status == B_ERROR)
974				return B_ERROR;
975
976			// is this a duplicate entry?
977			if (status == B_OK && node->overflow_link == BPLUSTREE_NULL)
978			{
979				if (fAllowDuplicates)
980					return InsertDuplicate(node,nodeAndKey.keyIndex);
981				else
982					return B_NAME_IN_USE;
983			}
984		}
985
986		// is the node big enough to hold the pair?
987		if (int32(round_up(sizeof(bplustree_node) + node->all_key_length + keyLength)
988			+ (node->all_key_count + 1) * (sizeof(uint16) + sizeof(off_t))) < fNodeSize)
989		{
990			InsertKey(node,keyBuffer,keyLength,valueToInsert,nodeAndKey.keyIndex);
991			fCache.SetDirty(nodeAndKey.nodeOffset,true);
992
993			return B_OK;
994		}
995		else
996		{
997			// do we need to allocate a new root node? if so, then do
998			// it now
999			bplustree_node *rootNode = NULL;
1000			off_t newRoot = BPLUSTREE_NULL;
1001			if (nodeAndKey.nodeOffset == fHeader->root_node_pointer) {
1002				rootNode = fCache.Get(newRoot = fHeader->maximum_size);
1003				if (rootNode == NULL) {
1004					// the tree is most likely dead
1005					return B_NO_MEMORY;
1006				}
1007			}
1008
1009			if (SplitNode(node,nodeAndKey.nodeOffset,&nodeAndKey.keyIndex,keyBuffer,&keyLength,&valueToInsert) < B_OK) {
1010				if (newRoot != BPLUSTREE_NULL) {
1011					// free root node
1012				}
1013				return B_ERROR;
1014			}
1015
1016			if (newRoot != BPLUSTREE_NULL) {
1017				rootNode->overflow_link = nodeAndKey.nodeOffset;
1018
1019				InsertKey(rootNode,keyBuffer,keyLength,node->left_link,0);
1020
1021				fHeader->root_node_pointer = newRoot;
1022				fHeader->max_number_of_levels++;
1023				// write header
1024
1025				fCache.SetDirty(newRoot,true);
1026
1027				return B_OK;
1028			}
1029		}
1030	}
1031
1032	return B_ERROR;
1033}
1034
1035
1036status_t BPlusTree::Find(uint8 *key,uint16 keyLength,off_t *value)
1037{
1038	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH)
1039		return B_BAD_VALUE;
1040
1041	Stack<node_and_key> stack;
1042	if (SeekDown(stack,key,keyLength) != B_OK)
1043		return B_ERROR;
1044
1045	fCurrentNodeOffset = BPLUSTREE_NULL;
1046
1047	node_and_key nodeAndKey;
1048	bplustree_node *node;
1049
1050	if (stack.Pop(&nodeAndKey) && (node = fCache.Get(nodeAndKey.nodeOffset)) != NULL && CheckNode(node))
1051	{
1052		status_t status = FindKey(node,key,keyLength,&nodeAndKey.keyIndex);
1053		if (status == B_ERROR)
1054			return B_ERROR;
1055
1056		SetCurrentNode(node,nodeAndKey.nodeOffset);
1057
1058		if (status == B_OK && node->overflow_link == BPLUSTREE_NULL)
1059		{
1060			*value = node->Values()[nodeAndKey.keyIndex];
1061			return B_OK;
1062		}
1063	}
1064	return B_ENTRY_NOT_FOUND;
1065}
1066
1067
1068//	#pragma mark -
1069
1070
1071bool
1072BPlusTree::CheckNode(bplustree_node *node)
1073{
1074	if (!fHeader->IsValidLink(node->left_link)
1075		|| !fHeader->IsValidLink(node->right_link)
1076		|| !fHeader->IsValidLink(node->overflow_link)
1077		|| (int8 *)node->Values() + node->all_key_count * sizeof(off_t) > (int8 *)node + fNodeSize)
1078		return false;
1079
1080	return true;
1081}
1082
1083
1084bplustree_node *BPlusTree::Node(off_t nodeOffset,bool check)
1085{
1086/*printf("1: %d,%d,%d\n",
1087	nodeOffset > fHeader->maximum_size - fNodeSize,
1088	nodeOffset < 0 && nodeOffset != BPLUSTREE_NULL,
1089	(nodeOffset % fNodeSize) != 0);
1090*/
1091	// the super node is always in memory, and shouldn't
1092	// never be taken out of the cache
1093	if (nodeOffset > fHeader->maximum_size /*- fNodeSize*/
1094		|| nodeOffset <= 0 && nodeOffset != BPLUSTREE_NULL
1095		|| (nodeOffset % fNodeSize) != 0)
1096		return NULL;
1097
1098	bplustree_node *node = (bplustree_node *)malloc(fNodeSize);
1099	if (node == NULL)
1100		return NULL;
1101
1102	if (nodeOffset == BPLUSTREE_NULL || !fStream)
1103	{
1104	 	node->left_link = BPLUSTREE_NULL;
1105	 	node->right_link = BPLUSTREE_NULL;
1106	 	node->overflow_link = BPLUSTREE_NULL;
1107	 	node->all_key_count = 0;
1108	 	node->all_key_length = 0;
1109	}
1110	else if (fStream && fStream->ReadAt(nodeOffset,node,fNodeSize) < fNodeSize)
1111	{
1112		free(node);
1113		return NULL;
1114	}
1115
1116	if (check && node != NULL)
1117	{
1118		// sanity checks (links, all_key_count)
1119		if (!fHeader->IsValidLink(node->left_link)
1120			|| !fHeader->IsValidLink(node->right_link)
1121			|| !fHeader->IsValidLink(node->overflow_link)
1122			|| (int8 *)node->Values() + node->all_key_count * sizeof(off_t) > (int8 *)node + fNodeSize)
1123			return NULL;
1124	}
1125
1126	if (!fStream && nodeOffset > fHeader->maximum_size - fNodeSize) {
1127		// do some hacks here
1128		fHeader->maximum_size += fNodeSize;
1129	}
1130
1131	return node;
1132}
1133
1134
1135void BPlusTree::SetHoldChanges(bool hold)
1136{
1137	fCache.SetHoldChanges(hold);
1138}
1139
1140
1141//	#pragma mark -
1142
1143
1144uint8 *bplustree_node::KeyAt(int32 index,uint16 *keyLength) const
1145{
1146	if (index < 0 || index > all_key_count)
1147		return NULL;
1148
1149	uint8 *keyStart = Keys();
1150	uint16 *keyLengths = KeyLengths();
1151
1152	*keyLength = keyLengths[index] - (index != 0 ? keyLengths[index - 1] : 0);
1153	if (index > 0)
1154		keyStart += keyLengths[index - 1];
1155
1156	return keyStart;
1157}
1158
1159
1160uint8 bplustree_node::CountDuplicates(off_t offset,bool isFragment) const
1161{
1162	// the duplicate fragment handling is currently hard-coded to a node size
1163	// of 1024 bytes - with future versions of BFS, this may be a problem
1164
1165	if (isFragment)
1166	{
1167		uint32 fragment = 8 * ((uint64)offset & 0x3ff);
1168
1169		return ((off_t *)this)[fragment];
1170	}
1171	return overflow_link;
1172}
1173
1174
1175off_t bplustree_node::DuplicateAt(off_t offset,bool isFragment,int8 index) const
1176{
1177	uint32 start;
1178	if (isFragment)
1179		start = 8 * ((uint64)offset & 0x3ff);
1180	else
1181		start = 2;
1182
1183	return ((off_t *)this)[start + 1 + index];
1184}
1185
1186