bt_split.c revision 14272
11573Srgrimes/*-
214272Spst * Copyright (c) 1990, 1993, 1994
31573Srgrimes *	The Regents of the University of California.  All rights reserved.
41573Srgrimes *
51573Srgrimes * This code is derived from software contributed to Berkeley by
61573Srgrimes * Mike Olson.
71573Srgrimes *
81573Srgrimes * Redistribution and use in source and binary forms, with or without
91573Srgrimes * modification, are permitted provided that the following conditions
101573Srgrimes * are met:
111573Srgrimes * 1. Redistributions of source code must retain the above copyright
121573Srgrimes *    notice, this list of conditions and the following disclaimer.
131573Srgrimes * 2. Redistributions in binary form must reproduce the above copyright
141573Srgrimes *    notice, this list of conditions and the following disclaimer in the
151573Srgrimes *    documentation and/or other materials provided with the distribution.
161573Srgrimes * 3. All advertising materials mentioning features or use of this software
171573Srgrimes *    must display the following acknowledgement:
181573Srgrimes *	This product includes software developed by the University of
191573Srgrimes *	California, Berkeley and its contributors.
201573Srgrimes * 4. Neither the name of the University nor the names of its contributors
211573Srgrimes *    may be used to endorse or promote products derived from this software
221573Srgrimes *    without specific prior written permission.
231573Srgrimes *
241573Srgrimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
251573Srgrimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
261573Srgrimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
271573Srgrimes * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
281573Srgrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
291573Srgrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
301573Srgrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
311573Srgrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
321573Srgrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
331573Srgrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
341573Srgrimes * SUCH DAMAGE.
351573Srgrimes */
361573Srgrimes
371573Srgrimes#if defined(LIBC_SCCS) && !defined(lint)
3814272Spststatic char sccsid[] = "@(#)bt_split.c	8.9 (Berkeley) 7/26/94";
391573Srgrimes#endif /* LIBC_SCCS and not lint */
401573Srgrimes
411573Srgrimes#include <sys/types.h>
421573Srgrimes
431573Srgrimes#include <limits.h>
441573Srgrimes#include <stdio.h>
451573Srgrimes#include <stdlib.h>
461573Srgrimes#include <string.h>
471573Srgrimes
481573Srgrimes#include <db.h>
491573Srgrimes#include "btree.h"
501573Srgrimes
511573Srgrimesstatic int	 bt_broot __P((BTREE *, PAGE *, PAGE *, PAGE *));
521573Srgrimesstatic PAGE	*bt_page
531573Srgrimes		    __P((BTREE *, PAGE *, PAGE **, PAGE **, indx_t *, size_t));
541573Srgrimesstatic int	 bt_preserve __P((BTREE *, pgno_t));
551573Srgrimesstatic PAGE	*bt_psplit
561573Srgrimes		    __P((BTREE *, PAGE *, PAGE *, PAGE *, indx_t *, size_t));
571573Srgrimesstatic PAGE	*bt_root
581573Srgrimes		    __P((BTREE *, PAGE *, PAGE **, PAGE **, indx_t *, size_t));
591573Srgrimesstatic int	 bt_rroot __P((BTREE *, PAGE *, PAGE *, PAGE *));
601573Srgrimesstatic recno_t	 rec_total __P((PAGE *));
611573Srgrimes
621573Srgrimes#ifdef STATISTICS
631573Srgrimesu_long	bt_rootsplit, bt_split, bt_sortsplit, bt_pfxsaved;
641573Srgrimes#endif
651573Srgrimes
661573Srgrimes/*
671573Srgrimes * __BT_SPLIT -- Split the tree.
681573Srgrimes *
691573Srgrimes * Parameters:
701573Srgrimes *	t:	tree
711573Srgrimes *	sp:	page to split
721573Srgrimes *	key:	key to insert
731573Srgrimes *	data:	data to insert
741573Srgrimes *	flags:	BIGKEY/BIGDATA flags
751573Srgrimes *	ilen:	insert length
761573Srgrimes *	skip:	index to leave open
771573Srgrimes *
781573Srgrimes * Returns:
791573Srgrimes *	RET_ERROR, RET_SUCCESS
801573Srgrimes */
811573Srgrimesint
8214272Spst__bt_split(t, sp, key, data, flags, ilen, argskip)
831573Srgrimes	BTREE *t;
841573Srgrimes	PAGE *sp;
851573Srgrimes	const DBT *key, *data;
861573Srgrimes	int flags;
871573Srgrimes	size_t ilen;
8814272Spst	u_int32_t argskip;
891573Srgrimes{
901573Srgrimes	BINTERNAL *bi;
911573Srgrimes	BLEAF *bl, *tbl;
921573Srgrimes	DBT a, b;
931573Srgrimes	EPGNO *parent;
941573Srgrimes	PAGE *h, *l, *r, *lchild, *rchild;
951573Srgrimes	indx_t nxtindex;
9614272Spst	u_int16_t skip;
9714272Spst	u_int32_t n, nbytes, nksize;
981573Srgrimes	int parentsplit;
991573Srgrimes	char *dest;
1001573Srgrimes
1011573Srgrimes	/*
1021573Srgrimes	 * Split the page into two pages, l and r.  The split routines return
1031573Srgrimes	 * a pointer to the page into which the key should be inserted and with
1041573Srgrimes	 * skip set to the offset which should be used.  Additionally, l and r
1051573Srgrimes	 * are pinned.
1061573Srgrimes	 */
10714272Spst	skip = argskip;
1081573Srgrimes	h = sp->pgno == P_ROOT ?
1091573Srgrimes	    bt_root(t, sp, &l, &r, &skip, ilen) :
1101573Srgrimes	    bt_page(t, sp, &l, &r, &skip, ilen);
1111573Srgrimes	if (h == NULL)
1121573Srgrimes		return (RET_ERROR);
1131573Srgrimes
1141573Srgrimes	/*
1151573Srgrimes	 * Insert the new key/data pair into the leaf page.  (Key inserts
1161573Srgrimes	 * always cause a leaf page to split first.)
1171573Srgrimes	 */
1181573Srgrimes	h->linp[skip] = h->upper -= ilen;
1191573Srgrimes	dest = (char *)h + h->upper;
12014272Spst	if (F_ISSET(t, R_RECNO))
1211573Srgrimes		WR_RLEAF(dest, data, flags)
1221573Srgrimes	else
1231573Srgrimes		WR_BLEAF(dest, key, data, flags)
1241573Srgrimes
1251573Srgrimes	/* If the root page was split, make it look right. */
1261573Srgrimes	if (sp->pgno == P_ROOT &&
12714272Spst	    (F_ISSET(t, R_RECNO) ?
1281573Srgrimes	    bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR)
1291573Srgrimes		goto err2;
1301573Srgrimes
1311573Srgrimes	/*
1321573Srgrimes	 * Now we walk the parent page stack -- a LIFO stack of the pages that
1331573Srgrimes	 * were traversed when we searched for the page that split.  Each stack
1341573Srgrimes	 * entry is a page number and a page index offset.  The offset is for
1351573Srgrimes	 * the page traversed on the search.  We've just split a page, so we
1361573Srgrimes	 * have to insert a new key into the parent page.
1371573Srgrimes	 *
1381573Srgrimes	 * If the insert into the parent page causes it to split, may have to
1391573Srgrimes	 * continue splitting all the way up the tree.  We stop if the root
1401573Srgrimes	 * splits or the page inserted into didn't have to split to hold the
1411573Srgrimes	 * new key.  Some algorithms replace the key for the old page as well
1421573Srgrimes	 * as the new page.  We don't, as there's no reason to believe that the
1431573Srgrimes	 * first key on the old page is any better than the key we have, and,
1441573Srgrimes	 * in the case of a key being placed at index 0 causing the split, the
1451573Srgrimes	 * key is unavailable.
1461573Srgrimes	 *
1471573Srgrimes	 * There are a maximum of 5 pages pinned at any time.  We keep the left
1481573Srgrimes	 * and right pages pinned while working on the parent.   The 5 are the
1491573Srgrimes	 * two children, left parent and right parent (when the parent splits)
1501573Srgrimes	 * and the root page or the overflow key page when calling bt_preserve.
1511573Srgrimes	 * This code must make sure that all pins are released other than the
1521573Srgrimes	 * root page or overflow page which is unlocked elsewhere.
1531573Srgrimes	 */
1541573Srgrimes	while ((parent = BT_POP(t)) != NULL) {
1551573Srgrimes		lchild = l;
1561573Srgrimes		rchild = r;
1571573Srgrimes
1581573Srgrimes		/* Get the parent page. */
1591573Srgrimes		if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL)
1601573Srgrimes			goto err2;
1611573Srgrimes
1621573Srgrimes	 	/*
1631573Srgrimes		 * The new key goes ONE AFTER the index, because the split
1641573Srgrimes		 * was to the right.
1651573Srgrimes		 */
1661573Srgrimes		skip = parent->index + 1;
1671573Srgrimes
1681573Srgrimes		/*
1691573Srgrimes		 * Calculate the space needed on the parent page.
1701573Srgrimes		 *
1711573Srgrimes		 * Prefix trees: space hack when inserting into BINTERNAL
1721573Srgrimes		 * pages.  Retain only what's needed to distinguish between
1731573Srgrimes		 * the new entry and the LAST entry on the page to its left.
1741573Srgrimes		 * If the keys compare equal, retain the entire key.  Note,
1751573Srgrimes		 * we don't touch overflow keys, and the entire key must be
1761573Srgrimes		 * retained for the next-to-left most key on the leftmost
1771573Srgrimes		 * page of each level, or the search will fail.  Applicable
1781573Srgrimes		 * ONLY to internal pages that have leaf pages as children.
1791573Srgrimes		 * Further reduction of the key between pairs of internal
1801573Srgrimes		 * pages loses too much information.
1811573Srgrimes		 */
1821573Srgrimes		switch (rchild->flags & P_TYPE) {
1831573Srgrimes		case P_BINTERNAL:
1841573Srgrimes			bi = GETBINTERNAL(rchild, 0);
1851573Srgrimes			nbytes = NBINTERNAL(bi->ksize);
1861573Srgrimes			break;
1871573Srgrimes		case P_BLEAF:
1881573Srgrimes			bl = GETBLEAF(rchild, 0);
1891573Srgrimes			nbytes = NBINTERNAL(bl->ksize);
1901573Srgrimes			if (t->bt_pfx && !(bl->flags & P_BIGKEY) &&
1911573Srgrimes			    (h->prevpg != P_INVALID || skip > 1)) {
1921573Srgrimes				tbl = GETBLEAF(lchild, NEXTINDEX(lchild) - 1);
1931573Srgrimes				a.size = tbl->ksize;
1941573Srgrimes				a.data = tbl->bytes;
1951573Srgrimes				b.size = bl->ksize;
1961573Srgrimes				b.data = bl->bytes;
1971573Srgrimes				nksize = t->bt_pfx(&a, &b);
1981573Srgrimes				n = NBINTERNAL(nksize);
1991573Srgrimes				if (n < nbytes) {
2001573Srgrimes#ifdef STATISTICS
2011573Srgrimes					bt_pfxsaved += nbytes - n;
2021573Srgrimes#endif
2031573Srgrimes					nbytes = n;
2041573Srgrimes				} else
2051573Srgrimes					nksize = 0;
2061573Srgrimes			} else
2071573Srgrimes				nksize = 0;
2081573Srgrimes			break;
2091573Srgrimes		case P_RINTERNAL:
2101573Srgrimes		case P_RLEAF:
2111573Srgrimes			nbytes = NRINTERNAL;
2121573Srgrimes			break;
2131573Srgrimes		default:
2141573Srgrimes			abort();
2151573Srgrimes		}
2161573Srgrimes
2171573Srgrimes		/* Split the parent page if necessary or shift the indices. */
2181573Srgrimes		if (h->upper - h->lower < nbytes + sizeof(indx_t)) {
2191573Srgrimes			sp = h;
2201573Srgrimes			h = h->pgno == P_ROOT ?
2211573Srgrimes			    bt_root(t, h, &l, &r, &skip, nbytes) :
2221573Srgrimes			    bt_page(t, h, &l, &r, &skip, nbytes);
2231573Srgrimes			if (h == NULL)
2241573Srgrimes				goto err1;
2251573Srgrimes			parentsplit = 1;
2261573Srgrimes		} else {
2271573Srgrimes			if (skip < (nxtindex = NEXTINDEX(h)))
2281573Srgrimes				memmove(h->linp + skip + 1, h->linp + skip,
2291573Srgrimes				    (nxtindex - skip) * sizeof(indx_t));
2301573Srgrimes			h->lower += sizeof(indx_t);
2311573Srgrimes			parentsplit = 0;
2321573Srgrimes		}
2331573Srgrimes
2341573Srgrimes		/* Insert the key into the parent page. */
23514272Spst		switch (rchild->flags & P_TYPE) {
2361573Srgrimes		case P_BINTERNAL:
2371573Srgrimes			h->linp[skip] = h->upper -= nbytes;
2381573Srgrimes			dest = (char *)h + h->linp[skip];
2391573Srgrimes			memmove(dest, bi, nbytes);
2401573Srgrimes			((BINTERNAL *)dest)->pgno = rchild->pgno;
2411573Srgrimes			break;
2421573Srgrimes		case P_BLEAF:
2431573Srgrimes			h->linp[skip] = h->upper -= nbytes;
2441573Srgrimes			dest = (char *)h + h->linp[skip];
2451573Srgrimes			WR_BINTERNAL(dest, nksize ? nksize : bl->ksize,
2461573Srgrimes			    rchild->pgno, bl->flags & P_BIGKEY);
2471573Srgrimes			memmove(dest, bl->bytes, nksize ? nksize : bl->ksize);
2481573Srgrimes			if (bl->flags & P_BIGKEY &&
2491573Srgrimes			    bt_preserve(t, *(pgno_t *)bl->bytes) == RET_ERROR)
2501573Srgrimes				goto err1;
2511573Srgrimes			break;
2521573Srgrimes		case P_RINTERNAL:
2531573Srgrimes			/*
2541573Srgrimes			 * Update the left page count.  If split
2551573Srgrimes			 * added at index 0, fix the correct page.
2561573Srgrimes			 */
2571573Srgrimes			if (skip > 0)
2581573Srgrimes				dest = (char *)h + h->linp[skip - 1];
2591573Srgrimes			else
2601573Srgrimes				dest = (char *)l + l->linp[NEXTINDEX(l) - 1];
2611573Srgrimes			((RINTERNAL *)dest)->nrecs = rec_total(lchild);
2621573Srgrimes			((RINTERNAL *)dest)->pgno = lchild->pgno;
2631573Srgrimes
2641573Srgrimes			/* Update the right page count. */
2651573Srgrimes			h->linp[skip] = h->upper -= nbytes;
2661573Srgrimes			dest = (char *)h + h->linp[skip];
2671573Srgrimes			((RINTERNAL *)dest)->nrecs = rec_total(rchild);
2681573Srgrimes			((RINTERNAL *)dest)->pgno = rchild->pgno;
2691573Srgrimes			break;
2701573Srgrimes		case P_RLEAF:
2711573Srgrimes			/*
2721573Srgrimes			 * Update the left page count.  If split
2731573Srgrimes			 * added at index 0, fix the correct page.
2741573Srgrimes			 */
2751573Srgrimes			if (skip > 0)
2761573Srgrimes				dest = (char *)h + h->linp[skip - 1];
2771573Srgrimes			else
2781573Srgrimes				dest = (char *)l + l->linp[NEXTINDEX(l) - 1];
2791573Srgrimes			((RINTERNAL *)dest)->nrecs = NEXTINDEX(lchild);
2801573Srgrimes			((RINTERNAL *)dest)->pgno = lchild->pgno;
2811573Srgrimes
2821573Srgrimes			/* Update the right page count. */
2831573Srgrimes			h->linp[skip] = h->upper -= nbytes;
2841573Srgrimes			dest = (char *)h + h->linp[skip];
2851573Srgrimes			((RINTERNAL *)dest)->nrecs = NEXTINDEX(rchild);
2861573Srgrimes			((RINTERNAL *)dest)->pgno = rchild->pgno;
2871573Srgrimes			break;
2881573Srgrimes		default:
2891573Srgrimes			abort();
2901573Srgrimes		}
2911573Srgrimes
2921573Srgrimes		/* Unpin the held pages. */
2931573Srgrimes		if (!parentsplit) {
2941573Srgrimes			mpool_put(t->bt_mp, h, MPOOL_DIRTY);
2951573Srgrimes			break;
2961573Srgrimes		}
2971573Srgrimes
2981573Srgrimes		/* If the root page was split, make it look right. */
2991573Srgrimes		if (sp->pgno == P_ROOT &&
30014272Spst		    (F_ISSET(t, R_RECNO) ?
3011573Srgrimes		    bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR)
3021573Srgrimes			goto err1;
3031573Srgrimes
3041573Srgrimes		mpool_put(t->bt_mp, lchild, MPOOL_DIRTY);
3051573Srgrimes		mpool_put(t->bt_mp, rchild, MPOOL_DIRTY);
3061573Srgrimes	}
3071573Srgrimes
3081573Srgrimes	/* Unpin the held pages. */
3091573Srgrimes	mpool_put(t->bt_mp, l, MPOOL_DIRTY);
3101573Srgrimes	mpool_put(t->bt_mp, r, MPOOL_DIRTY);
3111573Srgrimes
3121573Srgrimes	/* Clear any pages left on the stack. */
3131573Srgrimes	return (RET_SUCCESS);
3141573Srgrimes
3151573Srgrimes	/*
3161573Srgrimes	 * If something fails in the above loop we were already walking back
3171573Srgrimes	 * up the tree and the tree is now inconsistent.  Nothing much we can
3181573Srgrimes	 * do about it but release any memory we're holding.
3191573Srgrimes	 */
3201573Srgrimeserr1:	mpool_put(t->bt_mp, lchild, MPOOL_DIRTY);
3211573Srgrimes	mpool_put(t->bt_mp, rchild, MPOOL_DIRTY);
3221573Srgrimes
3231573Srgrimeserr2:	mpool_put(t->bt_mp, l, 0);
3241573Srgrimes	mpool_put(t->bt_mp, r, 0);
3251573Srgrimes	__dbpanic(t->bt_dbp);
3261573Srgrimes	return (RET_ERROR);
3271573Srgrimes}
3281573Srgrimes
3291573Srgrimes/*
3301573Srgrimes * BT_PAGE -- Split a non-root page of a btree.
3311573Srgrimes *
3321573Srgrimes * Parameters:
3331573Srgrimes *	t:	tree
3341573Srgrimes *	h:	root page
3351573Srgrimes *	lp:	pointer to left page pointer
3361573Srgrimes *	rp:	pointer to right page pointer
3371573Srgrimes *	skip:	pointer to index to leave open
3381573Srgrimes *	ilen:	insert length
3391573Srgrimes *
3401573Srgrimes * Returns:
3411573Srgrimes *	Pointer to page in which to insert or NULL on error.
3421573Srgrimes */
3431573Srgrimesstatic PAGE *
3441573Srgrimesbt_page(t, h, lp, rp, skip, ilen)
3451573Srgrimes	BTREE *t;
3461573Srgrimes	PAGE *h, **lp, **rp;
3471573Srgrimes	indx_t *skip;
3481573Srgrimes	size_t ilen;
3491573Srgrimes{
3501573Srgrimes	PAGE *l, *r, *tp;
3511573Srgrimes	pgno_t npg;
3521573Srgrimes
3531573Srgrimes#ifdef STATISTICS
3541573Srgrimes	++bt_split;
3551573Srgrimes#endif
3561573Srgrimes	/* Put the new right page for the split into place. */
3571573Srgrimes	if ((r = __bt_new(t, &npg)) == NULL)
3581573Srgrimes		return (NULL);
3591573Srgrimes	r->pgno = npg;
3601573Srgrimes	r->lower = BTDATAOFF;
3611573Srgrimes	r->upper = t->bt_psize;
3621573Srgrimes	r->nextpg = h->nextpg;
3631573Srgrimes	r->prevpg = h->pgno;
3641573Srgrimes	r->flags = h->flags & P_TYPE;
3651573Srgrimes
3661573Srgrimes	/*
3671573Srgrimes	 * If we're splitting the last page on a level because we're appending
3681573Srgrimes	 * a key to it (skip is NEXTINDEX()), it's likely that the data is
3691573Srgrimes	 * sorted.  Adding an empty page on the side of the level is less work
3701573Srgrimes	 * and can push the fill factor much higher than normal.  If we're
3711573Srgrimes	 * wrong it's no big deal, we'll just do the split the right way next
3721573Srgrimes	 * time.  It may look like it's equally easy to do a similar hack for
3731573Srgrimes	 * reverse sorted data, that is, split the tree left, but it's not.
3741573Srgrimes	 * Don't even try.
3751573Srgrimes	 */
3761573Srgrimes	if (h->nextpg == P_INVALID && *skip == NEXTINDEX(h)) {
3771573Srgrimes#ifdef STATISTICS
3781573Srgrimes		++bt_sortsplit;
3791573Srgrimes#endif
3801573Srgrimes		h->nextpg = r->pgno;
3811573Srgrimes		r->lower = BTDATAOFF + sizeof(indx_t);
3821573Srgrimes		*skip = 0;
3831573Srgrimes		*lp = h;
3841573Srgrimes		*rp = r;
3851573Srgrimes		return (r);
3861573Srgrimes	}
3871573Srgrimes
3881573Srgrimes	/* Put the new left page for the split into place. */
3891573Srgrimes	if ((l = (PAGE *)malloc(t->bt_psize)) == NULL) {
3901573Srgrimes		mpool_put(t->bt_mp, r, 0);
3911573Srgrimes		return (NULL);
3921573Srgrimes	}
39314272Spst#ifdef PURIFY
39414272Spst	memset(l, 0xff, t->bt_psize);
39514272Spst#endif
3961573Srgrimes	l->pgno = h->pgno;
3971573Srgrimes	l->nextpg = r->pgno;
3981573Srgrimes	l->prevpg = h->prevpg;
3991573Srgrimes	l->lower = BTDATAOFF;
4001573Srgrimes	l->upper = t->bt_psize;
4011573Srgrimes	l->flags = h->flags & P_TYPE;
4021573Srgrimes
4031573Srgrimes	/* Fix up the previous pointer of the page after the split page. */
4041573Srgrimes	if (h->nextpg != P_INVALID) {
4051573Srgrimes		if ((tp = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL) {
4061573Srgrimes			free(l);
4071573Srgrimes			/* XXX mpool_free(t->bt_mp, r->pgno); */
4081573Srgrimes			return (NULL);
4091573Srgrimes		}
4101573Srgrimes		tp->prevpg = r->pgno;
41114272Spst		mpool_put(t->bt_mp, tp, MPOOL_DIRTY);
4121573Srgrimes	}
4131573Srgrimes
4141573Srgrimes	/*
4151573Srgrimes	 * Split right.  The key/data pairs aren't sorted in the btree page so
4161573Srgrimes	 * it's simpler to copy the data from the split page onto two new pages
4171573Srgrimes	 * instead of copying half the data to the right page and compacting
4181573Srgrimes	 * the left page in place.  Since the left page can't change, we have
4191573Srgrimes	 * to swap the original and the allocated left page after the split.
4201573Srgrimes	 */
4211573Srgrimes	tp = bt_psplit(t, h, l, r, skip, ilen);
4221573Srgrimes
4231573Srgrimes	/* Move the new left page onto the old left page. */
4241573Srgrimes	memmove(h, l, t->bt_psize);
4251573Srgrimes	if (tp == l)
4261573Srgrimes		tp = h;
4271573Srgrimes	free(l);
4281573Srgrimes
4291573Srgrimes	*lp = h;
4301573Srgrimes	*rp = r;
4311573Srgrimes	return (tp);
4321573Srgrimes}
4331573Srgrimes
4341573Srgrimes/*
4351573Srgrimes * BT_ROOT -- Split the root page of a btree.
4361573Srgrimes *
4371573Srgrimes * Parameters:
4381573Srgrimes *	t:	tree
4391573Srgrimes *	h:	root page
4401573Srgrimes *	lp:	pointer to left page pointer
4411573Srgrimes *	rp:	pointer to right page pointer
4421573Srgrimes *	skip:	pointer to index to leave open
4431573Srgrimes *	ilen:	insert length
4441573Srgrimes *
4451573Srgrimes * Returns:
4461573Srgrimes *	Pointer to page in which to insert or NULL on error.
4471573Srgrimes */
4481573Srgrimesstatic PAGE *
4491573Srgrimesbt_root(t, h, lp, rp, skip, ilen)
4501573Srgrimes	BTREE *t;
4511573Srgrimes	PAGE *h, **lp, **rp;
4521573Srgrimes	indx_t *skip;
4531573Srgrimes	size_t ilen;
4541573Srgrimes{
4551573Srgrimes	PAGE *l, *r, *tp;
4561573Srgrimes	pgno_t lnpg, rnpg;
4571573Srgrimes
4581573Srgrimes#ifdef STATISTICS
4591573Srgrimes	++bt_split;
4601573Srgrimes	++bt_rootsplit;
4611573Srgrimes#endif
4621573Srgrimes	/* Put the new left and right pages for the split into place. */
4631573Srgrimes	if ((l = __bt_new(t, &lnpg)) == NULL ||
4641573Srgrimes	    (r = __bt_new(t, &rnpg)) == NULL)
4651573Srgrimes		return (NULL);
4661573Srgrimes	l->pgno = lnpg;
4671573Srgrimes	r->pgno = rnpg;
4681573Srgrimes	l->nextpg = r->pgno;
4691573Srgrimes	r->prevpg = l->pgno;
4701573Srgrimes	l->prevpg = r->nextpg = P_INVALID;
4711573Srgrimes	l->lower = r->lower = BTDATAOFF;
4721573Srgrimes	l->upper = r->upper = t->bt_psize;
4731573Srgrimes	l->flags = r->flags = h->flags & P_TYPE;
4741573Srgrimes
4751573Srgrimes	/* Split the root page. */
4761573Srgrimes	tp = bt_psplit(t, h, l, r, skip, ilen);
4771573Srgrimes
4781573Srgrimes	*lp = l;
4791573Srgrimes	*rp = r;
4801573Srgrimes	return (tp);
4811573Srgrimes}
4821573Srgrimes
4831573Srgrimes/*
4841573Srgrimes * BT_RROOT -- Fix up the recno root page after it has been split.
4851573Srgrimes *
4861573Srgrimes * Parameters:
4871573Srgrimes *	t:	tree
4881573Srgrimes *	h:	root page
4891573Srgrimes *	l:	left page
4901573Srgrimes *	r:	right page
4911573Srgrimes *
4921573Srgrimes * Returns:
4931573Srgrimes *	RET_ERROR, RET_SUCCESS
4941573Srgrimes */
4951573Srgrimesstatic int
4961573Srgrimesbt_rroot(t, h, l, r)
4971573Srgrimes	BTREE *t;
4981573Srgrimes	PAGE *h, *l, *r;
4991573Srgrimes{
5001573Srgrimes	char *dest;
5011573Srgrimes
5021573Srgrimes	/* Insert the left and right keys, set the header information. */
5031573Srgrimes	h->linp[0] = h->upper = t->bt_psize - NRINTERNAL;
5041573Srgrimes	dest = (char *)h + h->upper;
5051573Srgrimes	WR_RINTERNAL(dest,
5061573Srgrimes	    l->flags & P_RLEAF ? NEXTINDEX(l) : rec_total(l), l->pgno);
5071573Srgrimes
5081573Srgrimes	h->linp[1] = h->upper -= NRINTERNAL;
5091573Srgrimes	dest = (char *)h + h->upper;
5101573Srgrimes	WR_RINTERNAL(dest,
5111573Srgrimes	    r->flags & P_RLEAF ? NEXTINDEX(r) : rec_total(r), r->pgno);
5121573Srgrimes
5131573Srgrimes	h->lower = BTDATAOFF + 2 * sizeof(indx_t);
5141573Srgrimes
5151573Srgrimes	/* Unpin the root page, set to recno internal page. */
5161573Srgrimes	h->flags &= ~P_TYPE;
5171573Srgrimes	h->flags |= P_RINTERNAL;
5181573Srgrimes	mpool_put(t->bt_mp, h, MPOOL_DIRTY);
5191573Srgrimes
5201573Srgrimes	return (RET_SUCCESS);
5211573Srgrimes}
5221573Srgrimes
5231573Srgrimes/*
5241573Srgrimes * BT_BROOT -- Fix up the btree root page after it has been split.
5251573Srgrimes *
5261573Srgrimes * Parameters:
5271573Srgrimes *	t:	tree
5281573Srgrimes *	h:	root page
5291573Srgrimes *	l:	left page
5301573Srgrimes *	r:	right page
5311573Srgrimes *
5321573Srgrimes * Returns:
5331573Srgrimes *	RET_ERROR, RET_SUCCESS
5341573Srgrimes */
5351573Srgrimesstatic int
5361573Srgrimesbt_broot(t, h, l, r)
5371573Srgrimes	BTREE *t;
5381573Srgrimes	PAGE *h, *l, *r;
5391573Srgrimes{
5401573Srgrimes	BINTERNAL *bi;
5411573Srgrimes	BLEAF *bl;
54214272Spst	u_int32_t nbytes;
5431573Srgrimes	char *dest;
5441573Srgrimes
5451573Srgrimes	/*
5461573Srgrimes	 * If the root page was a leaf page, change it into an internal page.
5471573Srgrimes	 * We copy the key we split on (but not the key's data, in the case of
5481573Srgrimes	 * a leaf page) to the new root page.
5491573Srgrimes	 *
5501573Srgrimes	 * The btree comparison code guarantees that the left-most key on any
5511573Srgrimes	 * level of the tree is never used, so it doesn't need to be filled in.
5521573Srgrimes	 */
5531573Srgrimes	nbytes = NBINTERNAL(0);
5541573Srgrimes	h->linp[0] = h->upper = t->bt_psize - nbytes;
5551573Srgrimes	dest = (char *)h + h->upper;
5561573Srgrimes	WR_BINTERNAL(dest, 0, l->pgno, 0);
5571573Srgrimes
55814272Spst	switch (h->flags & P_TYPE) {
5591573Srgrimes	case P_BLEAF:
5601573Srgrimes		bl = GETBLEAF(r, 0);
5611573Srgrimes		nbytes = NBINTERNAL(bl->ksize);
5621573Srgrimes		h->linp[1] = h->upper -= nbytes;
5631573Srgrimes		dest = (char *)h + h->upper;
5641573Srgrimes		WR_BINTERNAL(dest, bl->ksize, r->pgno, 0);
5651573Srgrimes		memmove(dest, bl->bytes, bl->ksize);
5661573Srgrimes
5671573Srgrimes		/*
5681573Srgrimes		 * If the key is on an overflow page, mark the overflow chain
5691573Srgrimes		 * so it isn't deleted when the leaf copy of the key is deleted.
5701573Srgrimes		 */
5711573Srgrimes		if (bl->flags & P_BIGKEY &&
5721573Srgrimes		    bt_preserve(t, *(pgno_t *)bl->bytes) == RET_ERROR)
5731573Srgrimes			return (RET_ERROR);
5741573Srgrimes		break;
5751573Srgrimes	case P_BINTERNAL:
5761573Srgrimes		bi = GETBINTERNAL(r, 0);
5771573Srgrimes		nbytes = NBINTERNAL(bi->ksize);
5781573Srgrimes		h->linp[1] = h->upper -= nbytes;
5791573Srgrimes		dest = (char *)h + h->upper;
5801573Srgrimes		memmove(dest, bi, nbytes);
5811573Srgrimes		((BINTERNAL *)dest)->pgno = r->pgno;
5821573Srgrimes		break;
5831573Srgrimes	default:
5841573Srgrimes		abort();
5851573Srgrimes	}
5861573Srgrimes
5871573Srgrimes	/* There are two keys on the page. */
5881573Srgrimes	h->lower = BTDATAOFF + 2 * sizeof(indx_t);
5891573Srgrimes
5901573Srgrimes	/* Unpin the root page, set to btree internal page. */
5911573Srgrimes	h->flags &= ~P_TYPE;
5921573Srgrimes	h->flags |= P_BINTERNAL;
5931573Srgrimes	mpool_put(t->bt_mp, h, MPOOL_DIRTY);
5941573Srgrimes
5951573Srgrimes	return (RET_SUCCESS);
5961573Srgrimes}
5971573Srgrimes
5981573Srgrimes/*
5991573Srgrimes * BT_PSPLIT -- Do the real work of splitting the page.
6001573Srgrimes *
6011573Srgrimes * Parameters:
6021573Srgrimes *	t:	tree
6031573Srgrimes *	h:	page to be split
6041573Srgrimes *	l:	page to put lower half of data
6051573Srgrimes *	r:	page to put upper half of data
6061573Srgrimes *	pskip:	pointer to index to leave open
6071573Srgrimes *	ilen:	insert length
6081573Srgrimes *
6091573Srgrimes * Returns:
6101573Srgrimes *	Pointer to page in which to insert.
6111573Srgrimes */
6121573Srgrimesstatic PAGE *
6131573Srgrimesbt_psplit(t, h, l, r, pskip, ilen)
6141573Srgrimes	BTREE *t;
6151573Srgrimes	PAGE *h, *l, *r;
6161573Srgrimes	indx_t *pskip;
6171573Srgrimes	size_t ilen;
6181573Srgrimes{
6191573Srgrimes	BINTERNAL *bi;
6201573Srgrimes	BLEAF *bl;
62114272Spst	CURSOR *c;
6221573Srgrimes	RLEAF *rl;
6231573Srgrimes	PAGE *rval;
6241573Srgrimes	void *src;
6251573Srgrimes	indx_t full, half, nxt, off, skip, top, used;
62614272Spst	u_int32_t nbytes;
6271573Srgrimes	int bigkeycnt, isbigkey;
6281573Srgrimes
6291573Srgrimes	/*
6301573Srgrimes	 * Split the data to the left and right pages.  Leave the skip index
6311573Srgrimes	 * open.  Additionally, make some effort not to split on an overflow
6321573Srgrimes	 * key.  This makes internal page processing faster and can save
6331573Srgrimes	 * space as overflow keys used by internal pages are never deleted.
6341573Srgrimes	 */
6351573Srgrimes	bigkeycnt = 0;
6361573Srgrimes	skip = *pskip;
6371573Srgrimes	full = t->bt_psize - BTDATAOFF;
6381573Srgrimes	half = full / 2;
6391573Srgrimes	used = 0;
6401573Srgrimes	for (nxt = off = 0, top = NEXTINDEX(h); nxt < top; ++off) {
6411573Srgrimes		if (skip == off) {
6421573Srgrimes			nbytes = ilen;
6431573Srgrimes			isbigkey = 0;		/* XXX: not really known. */
6441573Srgrimes		} else
6451573Srgrimes			switch (h->flags & P_TYPE) {
6461573Srgrimes			case P_BINTERNAL:
6471573Srgrimes				src = bi = GETBINTERNAL(h, nxt);
6481573Srgrimes				nbytes = NBINTERNAL(bi->ksize);
6491573Srgrimes				isbigkey = bi->flags & P_BIGKEY;
6501573Srgrimes				break;
6511573Srgrimes			case P_BLEAF:
6521573Srgrimes				src = bl = GETBLEAF(h, nxt);
6531573Srgrimes				nbytes = NBLEAF(bl);
6541573Srgrimes				isbigkey = bl->flags & P_BIGKEY;
6551573Srgrimes				break;
6561573Srgrimes			case P_RINTERNAL:
6571573Srgrimes				src = GETRINTERNAL(h, nxt);
6581573Srgrimes				nbytes = NRINTERNAL;
6591573Srgrimes				isbigkey = 0;
6601573Srgrimes				break;
6611573Srgrimes			case P_RLEAF:
6621573Srgrimes				src = rl = GETRLEAF(h, nxt);
6631573Srgrimes				nbytes = NRLEAF(rl);
6641573Srgrimes				isbigkey = 0;
6651573Srgrimes				break;
6661573Srgrimes			default:
6671573Srgrimes				abort();
6681573Srgrimes			}
6691573Srgrimes
6701573Srgrimes		/*
6711573Srgrimes		 * If the key/data pairs are substantial fractions of the max
6721573Srgrimes		 * possible size for the page, it's possible to get situations
6731573Srgrimes		 * where we decide to try and copy too much onto the left page.
6741573Srgrimes		 * Make sure that doesn't happen.
6751573Srgrimes		 */
6761573Srgrimes		if (skip <= off && used + nbytes >= full) {
6771573Srgrimes			--off;
6781573Srgrimes			break;
6791573Srgrimes		}
6801573Srgrimes
6811573Srgrimes		/* Copy the key/data pair, if not the skipped index. */
6821573Srgrimes		if (skip != off) {
6831573Srgrimes			++nxt;
6841573Srgrimes
6851573Srgrimes			l->linp[off] = l->upper -= nbytes;
6861573Srgrimes			memmove((char *)l + l->upper, src, nbytes);
6871573Srgrimes		}
6881573Srgrimes
6891573Srgrimes		used += nbytes;
6901573Srgrimes		if (used >= half) {
6911573Srgrimes			if (!isbigkey || bigkeycnt == 3)
6921573Srgrimes				break;
6931573Srgrimes			else
6941573Srgrimes				++bigkeycnt;
6951573Srgrimes		}
6961573Srgrimes	}
6971573Srgrimes
6981573Srgrimes	/*
6991573Srgrimes	 * Off is the last offset that's valid for the left page.
7001573Srgrimes	 * Nxt is the first offset to be placed on the right page.
7011573Srgrimes	 */
7021573Srgrimes	l->lower += (off + 1) * sizeof(indx_t);
7031573Srgrimes
7041573Srgrimes	/*
7051573Srgrimes	 * If splitting the page that the cursor was on, the cursor has to be
7061573Srgrimes	 * adjusted to point to the same record as before the split.  If the
7071573Srgrimes	 * cursor is at or past the skipped slot, the cursor is incremented by
7081573Srgrimes	 * one.  If the cursor is on the right page, it is decremented by the
7091573Srgrimes	 * number of records split to the left page.
7101573Srgrimes	 */
71114272Spst	c = &t->bt_cursor;
71214272Spst	if (F_ISSET(c, CURS_INIT) && c->pg.pgno == h->pgno) {
71314272Spst		if (c->pg.index >= skip)
71414272Spst			++c->pg.index;
71514272Spst		if (c->pg.index < nxt)			/* Left page. */
71614272Spst			c->pg.pgno = l->pgno;
7171573Srgrimes		else {					/* Right page. */
71814272Spst			c->pg.pgno = r->pgno;
71914272Spst			c->pg.index -= nxt;
7201573Srgrimes		}
7211573Srgrimes	}
7221573Srgrimes
7231573Srgrimes	/*
7241573Srgrimes	 * If the skipped index was on the left page, just return that page.
7251573Srgrimes	 * Otherwise, adjust the skip index to reflect the new position on
7261573Srgrimes	 * the right page.
7271573Srgrimes	 */
7281573Srgrimes	if (skip <= off) {
7291573Srgrimes		skip = 0;
7301573Srgrimes		rval = l;
7311573Srgrimes	} else {
7321573Srgrimes		rval = r;
7331573Srgrimes		*pskip -= nxt;
7341573Srgrimes	}
7351573Srgrimes
7361573Srgrimes	for (off = 0; nxt < top; ++off) {
7371573Srgrimes		if (skip == nxt) {
7381573Srgrimes			++off;
7391573Srgrimes			skip = 0;
7401573Srgrimes		}
7411573Srgrimes		switch (h->flags & P_TYPE) {
7421573Srgrimes		case P_BINTERNAL:
7431573Srgrimes			src = bi = GETBINTERNAL(h, nxt);
7441573Srgrimes			nbytes = NBINTERNAL(bi->ksize);
7451573Srgrimes			break;
7461573Srgrimes		case P_BLEAF:
7471573Srgrimes			src = bl = GETBLEAF(h, nxt);
7481573Srgrimes			nbytes = NBLEAF(bl);
7491573Srgrimes			break;
7501573Srgrimes		case P_RINTERNAL:
7511573Srgrimes			src = GETRINTERNAL(h, nxt);
7521573Srgrimes			nbytes = NRINTERNAL;
7531573Srgrimes			break;
7541573Srgrimes		case P_RLEAF:
7551573Srgrimes			src = rl = GETRLEAF(h, nxt);
7561573Srgrimes			nbytes = NRLEAF(rl);
7571573Srgrimes			break;
7581573Srgrimes		default:
7591573Srgrimes			abort();
7601573Srgrimes		}
7611573Srgrimes		++nxt;
7621573Srgrimes		r->linp[off] = r->upper -= nbytes;
7631573Srgrimes		memmove((char *)r + r->upper, src, nbytes);
7641573Srgrimes	}
7651573Srgrimes	r->lower += off * sizeof(indx_t);
7661573Srgrimes
7671573Srgrimes	/* If the key is being appended to the page, adjust the index. */
7681573Srgrimes	if (skip == top)
7691573Srgrimes		r->lower += sizeof(indx_t);
7701573Srgrimes
7711573Srgrimes	return (rval);
7721573Srgrimes}
7731573Srgrimes
7741573Srgrimes/*
7751573Srgrimes * BT_PRESERVE -- Mark a chain of pages as used by an internal node.
7761573Srgrimes *
7771573Srgrimes * Chains of indirect blocks pointed to by leaf nodes get reclaimed when the
7781573Srgrimes * record that references them gets deleted.  Chains pointed to by internal
7791573Srgrimes * pages never get deleted.  This routine marks a chain as pointed to by an
7801573Srgrimes * internal page.
7811573Srgrimes *
7821573Srgrimes * Parameters:
7831573Srgrimes *	t:	tree
7841573Srgrimes *	pg:	page number of first page in the chain.
7851573Srgrimes *
7861573Srgrimes * Returns:
7871573Srgrimes *	RET_SUCCESS, RET_ERROR.
7881573Srgrimes */
7891573Srgrimesstatic int
7901573Srgrimesbt_preserve(t, pg)
7911573Srgrimes	BTREE *t;
7921573Srgrimes	pgno_t pg;
7931573Srgrimes{
7941573Srgrimes	PAGE *h;
7951573Srgrimes
7961573Srgrimes	if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
7971573Srgrimes		return (RET_ERROR);
7981573Srgrimes	h->flags |= P_PRESERVE;
7991573Srgrimes	mpool_put(t->bt_mp, h, MPOOL_DIRTY);
8001573Srgrimes	return (RET_SUCCESS);
8011573Srgrimes}
8021573Srgrimes
8031573Srgrimes/*
8041573Srgrimes * REC_TOTAL -- Return the number of recno entries below a page.
8051573Srgrimes *
8061573Srgrimes * Parameters:
8071573Srgrimes *	h:	page
8081573Srgrimes *
8091573Srgrimes * Returns:
8101573Srgrimes *	The number of recno entries below a page.
8111573Srgrimes *
8121573Srgrimes * XXX
8131573Srgrimes * These values could be set by the bt_psplit routine.  The problem is that the
8141573Srgrimes * entry has to be popped off of the stack etc. or the values have to be passed
8151573Srgrimes * all the way back to bt_split/bt_rroot and it's not very clean.
8161573Srgrimes */
8171573Srgrimesstatic recno_t
8181573Srgrimesrec_total(h)
8191573Srgrimes	PAGE *h;
8201573Srgrimes{
8211573Srgrimes	recno_t recs;
8221573Srgrimes	indx_t nxt, top;
8231573Srgrimes
8241573Srgrimes	for (recs = 0, nxt = 0, top = NEXTINDEX(h); nxt < top; ++nxt)
8251573Srgrimes		recs += GETRINTERNAL(h, nxt)->nrecs;
8261573Srgrimes	return (recs);
8271573Srgrimes}
828