1/*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1990, 1993, 1994
5 *	The Regents of the University of California.  All rights reserved.
6 *
7 * This code is derived from software contributed to Berkeley by
8 * Mike Olson.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its contributors
19 *    may be used to endorse or promote products derived from this software
20 *    without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 */
34
35#if defined(LIBC_SCCS) && !defined(lint)
36static char sccsid[] = "@(#)bt_split.c	8.10 (Berkeley) 1/9/95";
37#endif /* LIBC_SCCS and not lint */
38#include <sys/cdefs.h>
39__FBSDID("$FreeBSD$");
40
41#include <sys/param.h>
42
43#include <limits.h>
44#include <stdio.h>
45#include <stdlib.h>
46#include <string.h>
47
48#include <db.h>
49#include "btree.h"
50
51static int	 bt_broot(BTREE *, PAGE *, PAGE *, PAGE *);
52static PAGE	*bt_page(BTREE *, PAGE *, PAGE **, PAGE **, indx_t *, size_t);
53static int	 bt_preserve(BTREE *, pgno_t);
54static PAGE	*bt_psplit(BTREE *, PAGE *, PAGE *, PAGE *, indx_t *, size_t);
55static PAGE	*bt_root(BTREE *, PAGE *, PAGE **, PAGE **, indx_t *, size_t);
56static int	 bt_rroot(BTREE *, PAGE *, PAGE *, PAGE *);
57static recno_t	 rec_total(PAGE *);
58
59#ifdef STATISTICS
60u_long	bt_rootsplit, bt_split, bt_sortsplit, bt_pfxsaved;
61#endif
62
63/*
64 * __BT_SPLIT -- Split the tree.
65 *
66 * Parameters:
67 *	t:	tree
68 *	sp:	page to split
69 *	key:	key to insert
70 *	data:	data to insert
71 *	flags:	BIGKEY/BIGDATA flags
72 *	ilen:	insert length
73 *	skip:	index to leave open
74 *
75 * Returns:
76 *	RET_ERROR, RET_SUCCESS
77 */
78int
79__bt_split(BTREE *t, PAGE *sp, const DBT *key, const DBT *data, int flags,
80    size_t ilen, u_int32_t argskip)
81{
82	BINTERNAL *bi;
83	BLEAF *bl, *tbl;
84	DBT a, b;
85	EPGNO *parent;
86	PAGE *h, *l, *r, *lchild, *rchild;
87	indx_t nxtindex;
88	u_int16_t skip;
89	u_int32_t n, nbytes, nksize;
90	int parentsplit;
91	char *dest;
92
93	/*
94	 * Split the page into two pages, l and r.  The split routines return
95	 * a pointer to the page into which the key should be inserted and with
96	 * skip set to the offset which should be used.  Additionally, l and r
97	 * are pinned.
98	 */
99	skip = argskip;
100	h = sp->pgno == P_ROOT ?
101	    bt_root(t, sp, &l, &r, &skip, ilen) :
102	    bt_page(t, sp, &l, &r, &skip, ilen);
103	if (h == NULL)
104		return (RET_ERROR);
105
106	/*
107	 * Insert the new key/data pair into the leaf page.  (Key inserts
108	 * always cause a leaf page to split first.)
109	 */
110	h->linp[skip] = h->upper -= ilen;
111	dest = (char *)h + h->upper;
112	if (F_ISSET(t, R_RECNO))
113		WR_RLEAF(dest, data, flags)
114	else
115		WR_BLEAF(dest, key, data, flags)
116
117	/* If the root page was split, make it look right. */
118	if (sp->pgno == P_ROOT &&
119	    (F_ISSET(t, R_RECNO) ?
120	    bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR)
121		goto err2;
122
123	/*
124	 * Now we walk the parent page stack -- a LIFO stack of the pages that
125	 * were traversed when we searched for the page that split.  Each stack
126	 * entry is a page number and a page index offset.  The offset is for
127	 * the page traversed on the search.  We've just split a page, so we
128	 * have to insert a new key into the parent page.
129	 *
130	 * If the insert into the parent page causes it to split, may have to
131	 * continue splitting all the way up the tree.  We stop if the root
132	 * splits or the page inserted into didn't have to split to hold the
133	 * new key.  Some algorithms replace the key for the old page as well
134	 * as the new page.  We don't, as there's no reason to believe that the
135	 * first key on the old page is any better than the key we have, and,
136	 * in the case of a key being placed at index 0 causing the split, the
137	 * key is unavailable.
138	 *
139	 * There are a maximum of 5 pages pinned at any time.  We keep the left
140	 * and right pages pinned while working on the parent.   The 5 are the
141	 * two children, left parent and right parent (when the parent splits)
142	 * and the root page or the overflow key page when calling bt_preserve.
143	 * This code must make sure that all pins are released other than the
144	 * root page or overflow page which is unlocked elsewhere.
145	 */
146	while ((parent = BT_POP(t)) != NULL) {
147		lchild = l;
148		rchild = r;
149
150		/* Get the parent page. */
151		if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL)
152			goto err2;
153
154		/*
155		 * The new key goes ONE AFTER the index, because the split
156		 * was to the right.
157		 */
158		skip = parent->index + 1;
159
160		/*
161		 * Calculate the space needed on the parent page.
162		 *
163		 * Prefix trees: space hack when inserting into BINTERNAL
164		 * pages.  Retain only what's needed to distinguish between
165		 * the new entry and the LAST entry on the page to its left.
166		 * If the keys compare equal, retain the entire key.  Note,
167		 * we don't touch overflow keys, and the entire key must be
168		 * retained for the next-to-left most key on the leftmost
169		 * page of each level, or the search will fail.  Applicable
170		 * ONLY to internal pages that have leaf pages as children.
171		 * Further reduction of the key between pairs of internal
172		 * pages loses too much information.
173		 */
174		switch (rchild->flags & P_TYPE) {
175		case P_BINTERNAL:
176			bi = GETBINTERNAL(rchild, 0);
177			nbytes = NBINTERNAL(bi->ksize);
178			break;
179		case P_BLEAF:
180			bl = GETBLEAF(rchild, 0);
181			nbytes = NBINTERNAL(bl->ksize);
182			if (t->bt_pfx && !(bl->flags & P_BIGKEY) &&
183			    (h->prevpg != P_INVALID || skip > 1)) {
184				tbl = GETBLEAF(lchild, NEXTINDEX(lchild) - 1);
185				a.size = tbl->ksize;
186				a.data = tbl->bytes;
187				b.size = bl->ksize;
188				b.data = bl->bytes;
189				nksize = t->bt_pfx(&a, &b);
190				n = NBINTERNAL(nksize);
191				if (n < nbytes) {
192#ifdef STATISTICS
193					bt_pfxsaved += nbytes - n;
194#endif
195					nbytes = n;
196				} else
197					nksize = 0;
198			} else
199				nksize = 0;
200			break;
201		case P_RINTERNAL:
202		case P_RLEAF:
203			nbytes = NRINTERNAL;
204			break;
205		default:
206			abort();
207		}
208
209		/* Split the parent page if necessary or shift the indices. */
210		if ((u_int32_t)(h->upper - h->lower) < nbytes + sizeof(indx_t)) {
211			sp = h;
212			h = h->pgno == P_ROOT ?
213			    bt_root(t, h, &l, &r, &skip, nbytes) :
214			    bt_page(t, h, &l, &r, &skip, nbytes);
215			if (h == NULL)
216				goto err1;
217			parentsplit = 1;
218		} else {
219			if (skip < (nxtindex = NEXTINDEX(h)))
220				memmove(h->linp + skip + 1, h->linp + skip,
221				    (nxtindex - skip) * sizeof(indx_t));
222			h->lower += sizeof(indx_t);
223			parentsplit = 0;
224		}
225
226		/* Insert the key into the parent page. */
227		switch (rchild->flags & P_TYPE) {
228		case P_BINTERNAL:
229			h->linp[skip] = h->upper -= nbytes;
230			dest = (char *)h + h->linp[skip];
231			memmove(dest, bi, nbytes);
232			((BINTERNAL *)dest)->pgno = rchild->pgno;
233			break;
234		case P_BLEAF:
235			h->linp[skip] = h->upper -= nbytes;
236			dest = (char *)h + h->linp[skip];
237			WR_BINTERNAL(dest, nksize ? nksize : bl->ksize,
238			    rchild->pgno, bl->flags & P_BIGKEY);
239			memmove(dest, bl->bytes, nksize ? nksize : bl->ksize);
240			if (bl->flags & P_BIGKEY) {
241				pgno_t pgno;
242				memcpy(&pgno, bl->bytes, sizeof(pgno));
243				if (bt_preserve(t, pgno) == RET_ERROR)
244					goto err1;
245			}
246			break;
247		case P_RINTERNAL:
248			/*
249			 * Update the left page count.  If split
250			 * added at index 0, fix the correct page.
251			 */
252			if (skip > 0)
253				dest = (char *)h + h->linp[skip - 1];
254			else
255				dest = (char *)l + l->linp[NEXTINDEX(l) - 1];
256			((RINTERNAL *)dest)->nrecs = rec_total(lchild);
257			((RINTERNAL *)dest)->pgno = lchild->pgno;
258
259			/* Update the right page count. */
260			h->linp[skip] = h->upper -= nbytes;
261			dest = (char *)h + h->linp[skip];
262			((RINTERNAL *)dest)->nrecs = rec_total(rchild);
263			((RINTERNAL *)dest)->pgno = rchild->pgno;
264			break;
265		case P_RLEAF:
266			/*
267			 * Update the left page count.  If split
268			 * added at index 0, fix the correct page.
269			 */
270			if (skip > 0)
271				dest = (char *)h + h->linp[skip - 1];
272			else
273				dest = (char *)l + l->linp[NEXTINDEX(l) - 1];
274			((RINTERNAL *)dest)->nrecs = NEXTINDEX(lchild);
275			((RINTERNAL *)dest)->pgno = lchild->pgno;
276
277			/* Update the right page count. */
278			h->linp[skip] = h->upper -= nbytes;
279			dest = (char *)h + h->linp[skip];
280			((RINTERNAL *)dest)->nrecs = NEXTINDEX(rchild);
281			((RINTERNAL *)dest)->pgno = rchild->pgno;
282			break;
283		default:
284			abort();
285		}
286
287		/* Unpin the held pages. */
288		if (!parentsplit) {
289			mpool_put(t->bt_mp, h, MPOOL_DIRTY);
290			break;
291		}
292
293		/* If the root page was split, make it look right. */
294		if (sp->pgno == P_ROOT &&
295		    (F_ISSET(t, R_RECNO) ?
296		    bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR)
297			goto err1;
298
299		mpool_put(t->bt_mp, lchild, MPOOL_DIRTY);
300		mpool_put(t->bt_mp, rchild, MPOOL_DIRTY);
301	}
302
303	/* Unpin the held pages. */
304	mpool_put(t->bt_mp, l, MPOOL_DIRTY);
305	mpool_put(t->bt_mp, r, MPOOL_DIRTY);
306
307	/* Clear any pages left on the stack. */
308	return (RET_SUCCESS);
309
310	/*
311	 * If something fails in the above loop we were already walking back
312	 * up the tree and the tree is now inconsistent.  Nothing much we can
313	 * do about it but release any memory we're holding.
314	 */
315err1:	mpool_put(t->bt_mp, lchild, MPOOL_DIRTY);
316	mpool_put(t->bt_mp, rchild, MPOOL_DIRTY);
317
318err2:	mpool_put(t->bt_mp, l, 0);
319	mpool_put(t->bt_mp, r, 0);
320	__dbpanic(t->bt_dbp);
321	return (RET_ERROR);
322}
323
324/*
325 * BT_PAGE -- Split a non-root page of a btree.
326 *
327 * Parameters:
328 *	t:	tree
329 *	h:	root page
330 *	lp:	pointer to left page pointer
331 *	rp:	pointer to right page pointer
332 *	skip:	pointer to index to leave open
333 *	ilen:	insert length
334 *
335 * Returns:
336 *	Pointer to page in which to insert or NULL on error.
337 */
338static PAGE *
339bt_page(BTREE *t, PAGE *h, PAGE **lp, PAGE **rp, indx_t *skip, size_t ilen)
340{
341	PAGE *l, *r, *tp;
342	pgno_t npg;
343
344#ifdef STATISTICS
345	++bt_split;
346#endif
347	/* Put the new right page for the split into place. */
348	if ((r = __bt_new(t, &npg)) == NULL)
349		return (NULL);
350	r->pgno = npg;
351	r->lower = BTDATAOFF;
352	r->upper = t->bt_psize;
353	r->nextpg = h->nextpg;
354	r->prevpg = h->pgno;
355	r->flags = h->flags & P_TYPE;
356
357	/*
358	 * If we're splitting the last page on a level because we're appending
359	 * a key to it (skip is NEXTINDEX()), it's likely that the data is
360	 * sorted.  Adding an empty page on the side of the level is less work
361	 * and can push the fill factor much higher than normal.  If we're
362	 * wrong it's no big deal, we'll just do the split the right way next
363	 * time.  It may look like it's equally easy to do a similar hack for
364	 * reverse sorted data, that is, split the tree left, but it's not.
365	 * Don't even try.
366	 */
367	if (h->nextpg == P_INVALID && *skip == NEXTINDEX(h)) {
368#ifdef STATISTICS
369		++bt_sortsplit;
370#endif
371		h->nextpg = r->pgno;
372		r->lower = BTDATAOFF + sizeof(indx_t);
373		*skip = 0;
374		*lp = h;
375		*rp = r;
376		return (r);
377	}
378
379	/* Put the new left page for the split into place. */
380	if ((l = (PAGE *)calloc(1, t->bt_psize)) == NULL) {
381		mpool_put(t->bt_mp, r, 0);
382		return (NULL);
383	}
384	l->pgno = h->pgno;
385	l->nextpg = r->pgno;
386	l->prevpg = h->prevpg;
387	l->lower = BTDATAOFF;
388	l->upper = t->bt_psize;
389	l->flags = h->flags & P_TYPE;
390
391	/* Fix up the previous pointer of the page after the split page. */
392	if (h->nextpg != P_INVALID) {
393		if ((tp = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL) {
394			free(l);
395			/* XXX mpool_free(t->bt_mp, r->pgno); */
396			return (NULL);
397		}
398		tp->prevpg = r->pgno;
399		mpool_put(t->bt_mp, tp, MPOOL_DIRTY);
400	}
401
402	/*
403	 * Split right.  The key/data pairs aren't sorted in the btree page so
404	 * it's simpler to copy the data from the split page onto two new pages
405	 * instead of copying half the data to the right page and compacting
406	 * the left page in place.  Since the left page can't change, we have
407	 * to swap the original and the allocated left page after the split.
408	 */
409	tp = bt_psplit(t, h, l, r, skip, ilen);
410
411	/* Move the new left page onto the old left page. */
412	memmove(h, l, t->bt_psize);
413	if (tp == l)
414		tp = h;
415	free(l);
416
417	*lp = h;
418	*rp = r;
419	return (tp);
420}
421
422/*
423 * BT_ROOT -- Split the root page of a btree.
424 *
425 * Parameters:
426 *	t:	tree
427 *	h:	root page
428 *	lp:	pointer to left page pointer
429 *	rp:	pointer to right page pointer
430 *	skip:	pointer to index to leave open
431 *	ilen:	insert length
432 *
433 * Returns:
434 *	Pointer to page in which to insert or NULL on error.
435 */
436static PAGE *
437bt_root(BTREE *t, PAGE *h, PAGE **lp, PAGE **rp, indx_t *skip, size_t ilen)
438{
439	PAGE *l, *r, *tp;
440	pgno_t lnpg, rnpg;
441
442#ifdef STATISTICS
443	++bt_split;
444	++bt_rootsplit;
445#endif
446	/* Put the new left and right pages for the split into place. */
447	if ((l = __bt_new(t, &lnpg)) == NULL ||
448	    (r = __bt_new(t, &rnpg)) == NULL)
449		return (NULL);
450	l->pgno = lnpg;
451	r->pgno = rnpg;
452	l->nextpg = r->pgno;
453	r->prevpg = l->pgno;
454	l->prevpg = r->nextpg = P_INVALID;
455	l->lower = r->lower = BTDATAOFF;
456	l->upper = r->upper = t->bt_psize;
457	l->flags = r->flags = h->flags & P_TYPE;
458
459	/* Split the root page. */
460	tp = bt_psplit(t, h, l, r, skip, ilen);
461
462	*lp = l;
463	*rp = r;
464	return (tp);
465}
466
467/*
468 * BT_RROOT -- Fix up the recno root page after it has been split.
469 *
470 * Parameters:
471 *	t:	tree
472 *	h:	root page
473 *	l:	left page
474 *	r:	right page
475 *
476 * Returns:
477 *	RET_ERROR, RET_SUCCESS
478 */
479static int
480bt_rroot(BTREE *t, PAGE *h, PAGE *l, PAGE *r)
481{
482	char *dest;
483
484	/* Insert the left and right keys, set the header information. */
485	h->linp[0] = h->upper = t->bt_psize - NRINTERNAL;
486	dest = (char *)h + h->upper;
487	WR_RINTERNAL(dest,
488	    l->flags & P_RLEAF ? NEXTINDEX(l) : rec_total(l), l->pgno);
489
490	__PAST_END(h->linp, 1) = h->upper -= NRINTERNAL;
491	dest = (char *)h + h->upper;
492	WR_RINTERNAL(dest,
493	    r->flags & P_RLEAF ? NEXTINDEX(r) : rec_total(r), r->pgno);
494
495	h->lower = BTDATAOFF + 2 * sizeof(indx_t);
496
497	/* Unpin the root page, set to recno internal page. */
498	h->flags &= ~P_TYPE;
499	h->flags |= P_RINTERNAL;
500	mpool_put(t->bt_mp, h, MPOOL_DIRTY);
501
502	return (RET_SUCCESS);
503}
504
505/*
506 * BT_BROOT -- Fix up the btree root page after it has been split.
507 *
508 * Parameters:
509 *	t:	tree
510 *	h:	root page
511 *	l:	left page
512 *	r:	right page
513 *
514 * Returns:
515 *	RET_ERROR, RET_SUCCESS
516 */
517static int
518bt_broot(BTREE *t, PAGE *h, PAGE *l, PAGE *r)
519{
520	BINTERNAL *bi;
521	BLEAF *bl;
522	u_int32_t nbytes;
523	char *dest;
524
525	/*
526	 * If the root page was a leaf page, change it into an internal page.
527	 * We copy the key we split on (but not the key's data, in the case of
528	 * a leaf page) to the new root page.
529	 *
530	 * The btree comparison code guarantees that the left-most key on any
531	 * level of the tree is never used, so it doesn't need to be filled in.
532	 */
533	nbytes = NBINTERNAL(0);
534	h->linp[0] = h->upper = t->bt_psize - nbytes;
535	dest = (char *)h + h->upper;
536	WR_BINTERNAL(dest, 0, l->pgno, 0);
537
538	switch (h->flags & P_TYPE) {
539	case P_BLEAF:
540		bl = GETBLEAF(r, 0);
541		nbytes = NBINTERNAL(bl->ksize);
542		__PAST_END(h->linp, 1) = h->upper -= nbytes;
543		dest = (char *)h + h->upper;
544		WR_BINTERNAL(dest, bl->ksize, r->pgno, 0);
545		memmove(dest, bl->bytes, bl->ksize);
546
547		/*
548		 * If the key is on an overflow page, mark the overflow chain
549		 * so it isn't deleted when the leaf copy of the key is deleted.
550		 */
551	if (bl->flags & P_BIGKEY) {
552			pgno_t pgno;
553			memcpy(&pgno, bl->bytes, sizeof(pgno));
554			if (bt_preserve(t, pgno) == RET_ERROR)
555				return (RET_ERROR);
556		}
557		break;
558	case P_BINTERNAL:
559		bi = GETBINTERNAL(r, 0);
560		nbytes = NBINTERNAL(bi->ksize);
561		__PAST_END(h->linp, 1) = h->upper -= nbytes;
562		dest = (char *)h + h->upper;
563		memmove(dest, bi, nbytes);
564		((BINTERNAL *)dest)->pgno = r->pgno;
565		break;
566	default:
567		abort();
568	}
569
570	/* There are two keys on the page. */
571	h->lower = BTDATAOFF + 2 * sizeof(indx_t);
572
573	/* Unpin the root page, set to btree internal page. */
574	h->flags &= ~P_TYPE;
575	h->flags |= P_BINTERNAL;
576	mpool_put(t->bt_mp, h, MPOOL_DIRTY);
577
578	return (RET_SUCCESS);
579}
580
581/*
582 * BT_PSPLIT -- Do the real work of splitting the page.
583 *
584 * Parameters:
585 *	t:	tree
586 *	h:	page to be split
587 *	l:	page to put lower half of data
588 *	r:	page to put upper half of data
589 *	pskip:	pointer to index to leave open
590 *	ilen:	insert length
591 *
592 * Returns:
593 *	Pointer to page in which to insert.
594 */
595static PAGE *
596bt_psplit(BTREE *t, PAGE *h, PAGE *l, PAGE *r, indx_t *pskip, size_t ilen)
597{
598	BINTERNAL *bi;
599	BLEAF *bl;
600	CURSOR *c;
601	RLEAF *rl;
602	PAGE *rval;
603	void *src;
604	indx_t full, half, nxt, off, skip, top, used;
605	u_int32_t nbytes;
606	int bigkeycnt, isbigkey;
607
608	/*
609	 * Split the data to the left and right pages.  Leave the skip index
610	 * open.  Additionally, make some effort not to split on an overflow
611	 * key.  This makes internal page processing faster and can save
612	 * space as overflow keys used by internal pages are never deleted.
613	 */
614	bigkeycnt = 0;
615	skip = *pskip;
616	full = t->bt_psize - BTDATAOFF;
617	half = full / 2;
618	used = 0;
619	for (nxt = off = 0, top = NEXTINDEX(h); nxt < top; ++off) {
620		if (skip == off) {
621			nbytes = ilen;
622			isbigkey = 0;		/* XXX: not really known. */
623		} else
624			switch (h->flags & P_TYPE) {
625			case P_BINTERNAL:
626				src = bi = GETBINTERNAL(h, nxt);
627				nbytes = NBINTERNAL(bi->ksize);
628				isbigkey = bi->flags & P_BIGKEY;
629				break;
630			case P_BLEAF:
631				src = bl = GETBLEAF(h, nxt);
632				nbytes = NBLEAF(bl);
633				isbigkey = bl->flags & P_BIGKEY;
634				break;
635			case P_RINTERNAL:
636				src = GETRINTERNAL(h, nxt);
637				nbytes = NRINTERNAL;
638				isbigkey = 0;
639				break;
640			case P_RLEAF:
641				src = rl = GETRLEAF(h, nxt);
642				nbytes = NRLEAF(rl);
643				isbigkey = 0;
644				break;
645			default:
646				abort();
647			}
648
649		/*
650		 * If the key/data pairs are substantial fractions of the max
651		 * possible size for the page, it's possible to get situations
652		 * where we decide to try and copy too much onto the left page.
653		 * Make sure that doesn't happen.
654		 */
655		if ((skip <= off && used + nbytes + sizeof(indx_t) >= full) ||
656		    nxt == top - 1) {
657			--off;
658			break;
659		}
660
661		/* Copy the key/data pair, if not the skipped index. */
662		if (skip != off) {
663			++nxt;
664
665			l->linp[off] = l->upper -= nbytes;
666			memmove((char *)l + l->upper, src, nbytes);
667		}
668
669		used += nbytes + sizeof(indx_t);
670		if (used >= half) {
671			if (!isbigkey || bigkeycnt == 3)
672				break;
673			else
674				++bigkeycnt;
675		}
676	}
677
678	/*
679	 * Off is the last offset that's valid for the left page.
680	 * Nxt is the first offset to be placed on the right page.
681	 */
682	l->lower += (off + 1) * sizeof(indx_t);
683
684	/*
685	 * If splitting the page that the cursor was on, the cursor has to be
686	 * adjusted to point to the same record as before the split.  If the
687	 * cursor is at or past the skipped slot, the cursor is incremented by
688	 * one.  If the cursor is on the right page, it is decremented by the
689	 * number of records split to the left page.
690	 */
691	c = &t->bt_cursor;
692	if (F_ISSET(c, CURS_INIT) && c->pg.pgno == h->pgno) {
693		if (c->pg.index >= skip)
694			++c->pg.index;
695		if (c->pg.index < nxt)			/* Left page. */
696			c->pg.pgno = l->pgno;
697		else {					/* Right page. */
698			c->pg.pgno = r->pgno;
699			c->pg.index -= nxt;
700		}
701	}
702
703	/*
704	 * If the skipped index was on the left page, just return that page.
705	 * Otherwise, adjust the skip index to reflect the new position on
706	 * the right page.
707	 */
708	if (skip <= off) {
709		skip = MAX_PAGE_OFFSET;
710		rval = l;
711	} else {
712		rval = r;
713		*pskip -= nxt;
714	}
715
716	for (off = 0; nxt < top; ++off) {
717		if (skip == nxt) {
718			++off;
719			skip = MAX_PAGE_OFFSET;
720		}
721		switch (h->flags & P_TYPE) {
722		case P_BINTERNAL:
723			src = bi = GETBINTERNAL(h, nxt);
724			nbytes = NBINTERNAL(bi->ksize);
725			break;
726		case P_BLEAF:
727			src = bl = GETBLEAF(h, nxt);
728			nbytes = NBLEAF(bl);
729			break;
730		case P_RINTERNAL:
731			src = GETRINTERNAL(h, nxt);
732			nbytes = NRINTERNAL;
733			break;
734		case P_RLEAF:
735			src = rl = GETRLEAF(h, nxt);
736			nbytes = NRLEAF(rl);
737			break;
738		default:
739			abort();
740		}
741		++nxt;
742		r->linp[off] = r->upper -= nbytes;
743		memmove((char *)r + r->upper, src, nbytes);
744	}
745	r->lower += off * sizeof(indx_t);
746
747	/* If the key is being appended to the page, adjust the index. */
748	if (skip == top)
749		r->lower += sizeof(indx_t);
750
751	return (rval);
752}
753
754/*
755 * BT_PRESERVE -- Mark a chain of pages as used by an internal node.
756 *
757 * Chains of indirect blocks pointed to by leaf nodes get reclaimed when the
758 * record that references them gets deleted.  Chains pointed to by internal
759 * pages never get deleted.  This routine marks a chain as pointed to by an
760 * internal page.
761 *
762 * Parameters:
763 *	t:	tree
764 *	pg:	page number of first page in the chain.
765 *
766 * Returns:
767 *	RET_SUCCESS, RET_ERROR.
768 */
769static int
770bt_preserve(BTREE *t, pgno_t pg)
771{
772	PAGE *h;
773
774	if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
775		return (RET_ERROR);
776	h->flags |= P_PRESERVE;
777	mpool_put(t->bt_mp, h, MPOOL_DIRTY);
778	return (RET_SUCCESS);
779}
780
781/*
782 * REC_TOTAL -- Return the number of recno entries below a page.
783 *
784 * Parameters:
785 *	h:	page
786 *
787 * Returns:
788 *	The number of recno entries below a page.
789 *
790 * XXX
791 * These values could be set by the bt_psplit routine.  The problem is that the
792 * entry has to be popped off of the stack etc. or the values have to be passed
793 * all the way back to bt_split/bt_rroot and it's not very clean.
794 */
795static recno_t
796rec_total(PAGE *h)
797{
798	recno_t recs;
799	indx_t nxt, top;
800
801	for (recs = 0, nxt = 0, top = NEXTINDEX(h); nxt < top; ++nxt)
802		recs += GETRINTERNAL(h, nxt)->nrecs;
803	return (recs);
804}
805