11573Srgrimes/*-
214272Spst * Copyright (c) 1990, 1993, 1994
31573Srgrimes *	The Regents of the University of California.  All rights reserved.
41573Srgrimes *
51573Srgrimes * This code is derived from software contributed to Berkeley by
61573Srgrimes * Mike Olson.
71573Srgrimes *
81573Srgrimes * Redistribution and use in source and binary forms, with or without
91573Srgrimes * modification, are permitted provided that the following conditions
101573Srgrimes * are met:
111573Srgrimes * 1. Redistributions of source code must retain the above copyright
121573Srgrimes *    notice, this list of conditions and the following disclaimer.
131573Srgrimes * 2. Redistributions in binary form must reproduce the above copyright
141573Srgrimes *    notice, this list of conditions and the following disclaimer in the
151573Srgrimes *    documentation and/or other materials provided with the distribution.
161573Srgrimes * 4. Neither the name of the University nor the names of its contributors
171573Srgrimes *    may be used to endorse or promote products derived from this software
181573Srgrimes *    without specific prior written permission.
191573Srgrimes *
201573Srgrimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
211573Srgrimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
221573Srgrimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
231573Srgrimes * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
241573Srgrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
251573Srgrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
261573Srgrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
271573Srgrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
281573Srgrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
291573Srgrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
301573Srgrimes * SUCH DAMAGE.
311573Srgrimes */
321573Srgrimes
331573Srgrimes#if defined(LIBC_SCCS) && !defined(lint)
34194804Sdelphijstatic char sccsid[] = "@(#)bt_split.c	8.10 (Berkeley) 1/9/95";
351573Srgrimes#endif /* LIBC_SCCS and not lint */
3692905Sobrien#include <sys/cdefs.h>
3792905Sobrien__FBSDID("$FreeBSD$");
381573Srgrimes
391573Srgrimes#include <sys/types.h>
40223262Sbenl#include <sys/param.h>
411573Srgrimes
421573Srgrimes#include <limits.h>
431573Srgrimes#include <stdio.h>
441573Srgrimes#include <stdlib.h>
451573Srgrimes#include <string.h>
461573Srgrimes
471573Srgrimes#include <db.h>
481573Srgrimes#include "btree.h"
491573Srgrimes
5092905Sobrienstatic int	 bt_broot(BTREE *, PAGE *, PAGE *, PAGE *);
51189291Sdelphijstatic PAGE	*bt_page(BTREE *, PAGE *, PAGE **, PAGE **, indx_t *, size_t);
5292905Sobrienstatic int	 bt_preserve(BTREE *, pgno_t);
53189291Sdelphijstatic PAGE	*bt_psplit(BTREE *, PAGE *, PAGE *, PAGE *, indx_t *, size_t);
54189291Sdelphijstatic PAGE	*bt_root(BTREE *, PAGE *, PAGE **, PAGE **, indx_t *, size_t);
5592905Sobrienstatic int	 bt_rroot(BTREE *, PAGE *, PAGE *, PAGE *);
5692905Sobrienstatic recno_t	 rec_total(PAGE *);
571573Srgrimes
581573Srgrimes#ifdef STATISTICS
591573Srgrimesu_long	bt_rootsplit, bt_split, bt_sortsplit, bt_pfxsaved;
601573Srgrimes#endif
611573Srgrimes
621573Srgrimes/*
631573Srgrimes * __BT_SPLIT -- Split the tree.
641573Srgrimes *
651573Srgrimes * Parameters:
661573Srgrimes *	t:	tree
671573Srgrimes *	sp:	page to split
681573Srgrimes *	key:	key to insert
691573Srgrimes *	data:	data to insert
701573Srgrimes *	flags:	BIGKEY/BIGDATA flags
711573Srgrimes *	ilen:	insert length
721573Srgrimes *	skip:	index to leave open
731573Srgrimes *
741573Srgrimes * Returns:
751573Srgrimes *	RET_ERROR, RET_SUCCESS
761573Srgrimes */
771573Srgrimesint
78189291Sdelphij__bt_split(BTREE *t, PAGE *sp, const DBT *key, const DBT *data, int flags,
79189291Sdelphij    size_t ilen, u_int32_t argskip)
801573Srgrimes{
811573Srgrimes	BINTERNAL *bi;
821573Srgrimes	BLEAF *bl, *tbl;
831573Srgrimes	DBT a, b;
841573Srgrimes	EPGNO *parent;
851573Srgrimes	PAGE *h, *l, *r, *lchild, *rchild;
861573Srgrimes	indx_t nxtindex;
8714272Spst	u_int16_t skip;
8814272Spst	u_int32_t n, nbytes, nksize;
891573Srgrimes	int parentsplit;
901573Srgrimes	char *dest;
911573Srgrimes
921573Srgrimes	/*
931573Srgrimes	 * Split the page into two pages, l and r.  The split routines return
941573Srgrimes	 * a pointer to the page into which the key should be inserted and with
951573Srgrimes	 * skip set to the offset which should be used.  Additionally, l and r
961573Srgrimes	 * are pinned.
971573Srgrimes	 */
9814272Spst	skip = argskip;
991573Srgrimes	h = sp->pgno == P_ROOT ?
1001573Srgrimes	    bt_root(t, sp, &l, &r, &skip, ilen) :
1011573Srgrimes	    bt_page(t, sp, &l, &r, &skip, ilen);
1021573Srgrimes	if (h == NULL)
1031573Srgrimes		return (RET_ERROR);
1041573Srgrimes
1051573Srgrimes	/*
1061573Srgrimes	 * Insert the new key/data pair into the leaf page.  (Key inserts
1071573Srgrimes	 * always cause a leaf page to split first.)
1081573Srgrimes	 */
1091573Srgrimes	h->linp[skip] = h->upper -= ilen;
1101573Srgrimes	dest = (char *)h + h->upper;
11114272Spst	if (F_ISSET(t, R_RECNO))
1121573Srgrimes		WR_RLEAF(dest, data, flags)
1131573Srgrimes	else
1141573Srgrimes		WR_BLEAF(dest, key, data, flags)
1151573Srgrimes
1161573Srgrimes	/* If the root page was split, make it look right. */
1171573Srgrimes	if (sp->pgno == P_ROOT &&
11814272Spst	    (F_ISSET(t, R_RECNO) ?
1191573Srgrimes	    bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR)
1201573Srgrimes		goto err2;
1211573Srgrimes
1221573Srgrimes	/*
1231573Srgrimes	 * Now we walk the parent page stack -- a LIFO stack of the pages that
1241573Srgrimes	 * were traversed when we searched for the page that split.  Each stack
1251573Srgrimes	 * entry is a page number and a page index offset.  The offset is for
1261573Srgrimes	 * the page traversed on the search.  We've just split a page, so we
1271573Srgrimes	 * have to insert a new key into the parent page.
1281573Srgrimes	 *
1291573Srgrimes	 * If the insert into the parent page causes it to split, may have to
1301573Srgrimes	 * continue splitting all the way up the tree.  We stop if the root
1311573Srgrimes	 * splits or the page inserted into didn't have to split to hold the
1321573Srgrimes	 * new key.  Some algorithms replace the key for the old page as well
1331573Srgrimes	 * as the new page.  We don't, as there's no reason to believe that the
1341573Srgrimes	 * first key on the old page is any better than the key we have, and,
1351573Srgrimes	 * in the case of a key being placed at index 0 causing the split, the
1361573Srgrimes	 * key is unavailable.
1371573Srgrimes	 *
1381573Srgrimes	 * There are a maximum of 5 pages pinned at any time.  We keep the left
1391573Srgrimes	 * and right pages pinned while working on the parent.   The 5 are the
1401573Srgrimes	 * two children, left parent and right parent (when the parent splits)
1411573Srgrimes	 * and the root page or the overflow key page when calling bt_preserve.
1421573Srgrimes	 * This code must make sure that all pins are released other than the
1431573Srgrimes	 * root page or overflow page which is unlocked elsewhere.
1441573Srgrimes	 */
1451573Srgrimes	while ((parent = BT_POP(t)) != NULL) {
1461573Srgrimes		lchild = l;
1471573Srgrimes		rchild = r;
1481573Srgrimes
1491573Srgrimes		/* Get the parent page. */
1501573Srgrimes		if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL)
1511573Srgrimes			goto err2;
1521573Srgrimes
153189327Sdelphij		/*
1541573Srgrimes		 * The new key goes ONE AFTER the index, because the split
1551573Srgrimes		 * was to the right.
1561573Srgrimes		 */
1571573Srgrimes		skip = parent->index + 1;
1581573Srgrimes
1591573Srgrimes		/*
1601573Srgrimes		 * Calculate the space needed on the parent page.
1611573Srgrimes		 *
1621573Srgrimes		 * Prefix trees: space hack when inserting into BINTERNAL
1631573Srgrimes		 * pages.  Retain only what's needed to distinguish between
1641573Srgrimes		 * the new entry and the LAST entry on the page to its left.
1651573Srgrimes		 * If the keys compare equal, retain the entire key.  Note,
1661573Srgrimes		 * we don't touch overflow keys, and the entire key must be
1671573Srgrimes		 * retained for the next-to-left most key on the leftmost
1681573Srgrimes		 * page of each level, or the search will fail.  Applicable
1691573Srgrimes		 * ONLY to internal pages that have leaf pages as children.
1701573Srgrimes		 * Further reduction of the key between pairs of internal
1711573Srgrimes		 * pages loses too much information.
1721573Srgrimes		 */
1731573Srgrimes		switch (rchild->flags & P_TYPE) {
1741573Srgrimes		case P_BINTERNAL:
1751573Srgrimes			bi = GETBINTERNAL(rchild, 0);
1761573Srgrimes			nbytes = NBINTERNAL(bi->ksize);
1771573Srgrimes			break;
1781573Srgrimes		case P_BLEAF:
1791573Srgrimes			bl = GETBLEAF(rchild, 0);
1801573Srgrimes			nbytes = NBINTERNAL(bl->ksize);
1811573Srgrimes			if (t->bt_pfx && !(bl->flags & P_BIGKEY) &&
1821573Srgrimes			    (h->prevpg != P_INVALID || skip > 1)) {
1831573Srgrimes				tbl = GETBLEAF(lchild, NEXTINDEX(lchild) - 1);
1841573Srgrimes				a.size = tbl->ksize;
1851573Srgrimes				a.data = tbl->bytes;
1861573Srgrimes				b.size = bl->ksize;
1871573Srgrimes				b.data = bl->bytes;
1881573Srgrimes				nksize = t->bt_pfx(&a, &b);
1891573Srgrimes				n = NBINTERNAL(nksize);
1901573Srgrimes				if (n < nbytes) {
1911573Srgrimes#ifdef STATISTICS
1921573Srgrimes					bt_pfxsaved += nbytes - n;
1931573Srgrimes#endif
1941573Srgrimes					nbytes = n;
1951573Srgrimes				} else
1961573Srgrimes					nksize = 0;
1971573Srgrimes			} else
1981573Srgrimes				nksize = 0;
1991573Srgrimes			break;
2001573Srgrimes		case P_RINTERNAL:
2011573Srgrimes		case P_RLEAF:
2021573Srgrimes			nbytes = NRINTERNAL;
2031573Srgrimes			break;
2041573Srgrimes		default:
2051573Srgrimes			abort();
2061573Srgrimes		}
2071573Srgrimes
2081573Srgrimes		/* Split the parent page if necessary or shift the indices. */
209190484Sdelphij		if ((u_int32_t)(h->upper - h->lower) < nbytes + sizeof(indx_t)) {
2101573Srgrimes			sp = h;
2111573Srgrimes			h = h->pgno == P_ROOT ?
2121573Srgrimes			    bt_root(t, h, &l, &r, &skip, nbytes) :
2131573Srgrimes			    bt_page(t, h, &l, &r, &skip, nbytes);
2141573Srgrimes			if (h == NULL)
2151573Srgrimes				goto err1;
2161573Srgrimes			parentsplit = 1;
2171573Srgrimes		} else {
2181573Srgrimes			if (skip < (nxtindex = NEXTINDEX(h)))
2191573Srgrimes				memmove(h->linp + skip + 1, h->linp + skip,
2201573Srgrimes				    (nxtindex - skip) * sizeof(indx_t));
2211573Srgrimes			h->lower += sizeof(indx_t);
2221573Srgrimes			parentsplit = 0;
2231573Srgrimes		}
2241573Srgrimes
2251573Srgrimes		/* Insert the key into the parent page. */
22614272Spst		switch (rchild->flags & P_TYPE) {
2271573Srgrimes		case P_BINTERNAL:
2281573Srgrimes			h->linp[skip] = h->upper -= nbytes;
2291573Srgrimes			dest = (char *)h + h->linp[skip];
2301573Srgrimes			memmove(dest, bi, nbytes);
2311573Srgrimes			((BINTERNAL *)dest)->pgno = rchild->pgno;
2321573Srgrimes			break;
2331573Srgrimes		case P_BLEAF:
2341573Srgrimes			h->linp[skip] = h->upper -= nbytes;
2351573Srgrimes			dest = (char *)h + h->linp[skip];
2361573Srgrimes			WR_BINTERNAL(dest, nksize ? nksize : bl->ksize,
2371573Srgrimes			    rchild->pgno, bl->flags & P_BIGKEY);
2381573Srgrimes			memmove(dest, bl->bytes, nksize ? nksize : bl->ksize);
2391573Srgrimes			if (bl->flags & P_BIGKEY &&
2401573Srgrimes			    bt_preserve(t, *(pgno_t *)bl->bytes) == RET_ERROR)
2411573Srgrimes				goto err1;
2421573Srgrimes			break;
2431573Srgrimes		case P_RINTERNAL:
2441573Srgrimes			/*
2451573Srgrimes			 * Update the left page count.  If split
2461573Srgrimes			 * added at index 0, fix the correct page.
2471573Srgrimes			 */
2481573Srgrimes			if (skip > 0)
2491573Srgrimes				dest = (char *)h + h->linp[skip - 1];
2501573Srgrimes			else
2511573Srgrimes				dest = (char *)l + l->linp[NEXTINDEX(l) - 1];
2521573Srgrimes			((RINTERNAL *)dest)->nrecs = rec_total(lchild);
2531573Srgrimes			((RINTERNAL *)dest)->pgno = lchild->pgno;
2541573Srgrimes
2551573Srgrimes			/* Update the right page count. */
2561573Srgrimes			h->linp[skip] = h->upper -= nbytes;
2571573Srgrimes			dest = (char *)h + h->linp[skip];
2581573Srgrimes			((RINTERNAL *)dest)->nrecs = rec_total(rchild);
2591573Srgrimes			((RINTERNAL *)dest)->pgno = rchild->pgno;
2601573Srgrimes			break;
2611573Srgrimes		case P_RLEAF:
2621573Srgrimes			/*
2631573Srgrimes			 * Update the left page count.  If split
2641573Srgrimes			 * added at index 0, fix the correct page.
2651573Srgrimes			 */
2661573Srgrimes			if (skip > 0)
2671573Srgrimes				dest = (char *)h + h->linp[skip - 1];
2681573Srgrimes			else
2691573Srgrimes				dest = (char *)l + l->linp[NEXTINDEX(l) - 1];
2701573Srgrimes			((RINTERNAL *)dest)->nrecs = NEXTINDEX(lchild);
2711573Srgrimes			((RINTERNAL *)dest)->pgno = lchild->pgno;
2721573Srgrimes
2731573Srgrimes			/* Update the right page count. */
2741573Srgrimes			h->linp[skip] = h->upper -= nbytes;
2751573Srgrimes			dest = (char *)h + h->linp[skip];
2761573Srgrimes			((RINTERNAL *)dest)->nrecs = NEXTINDEX(rchild);
2771573Srgrimes			((RINTERNAL *)dest)->pgno = rchild->pgno;
2781573Srgrimes			break;
2791573Srgrimes		default:
2801573Srgrimes			abort();
2811573Srgrimes		}
2821573Srgrimes
2831573Srgrimes		/* Unpin the held pages. */
2841573Srgrimes		if (!parentsplit) {
2851573Srgrimes			mpool_put(t->bt_mp, h, MPOOL_DIRTY);
2861573Srgrimes			break;
2871573Srgrimes		}
2881573Srgrimes
2891573Srgrimes		/* If the root page was split, make it look right. */
2901573Srgrimes		if (sp->pgno == P_ROOT &&
29114272Spst		    (F_ISSET(t, R_RECNO) ?
2921573Srgrimes		    bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR)
2931573Srgrimes			goto err1;
2941573Srgrimes
2951573Srgrimes		mpool_put(t->bt_mp, lchild, MPOOL_DIRTY);
2961573Srgrimes		mpool_put(t->bt_mp, rchild, MPOOL_DIRTY);
2971573Srgrimes	}
2981573Srgrimes
2991573Srgrimes	/* Unpin the held pages. */
3001573Srgrimes	mpool_put(t->bt_mp, l, MPOOL_DIRTY);
3011573Srgrimes	mpool_put(t->bt_mp, r, MPOOL_DIRTY);
3021573Srgrimes
3031573Srgrimes	/* Clear any pages left on the stack. */
3041573Srgrimes	return (RET_SUCCESS);
3051573Srgrimes
3061573Srgrimes	/*
3071573Srgrimes	 * If something fails in the above loop we were already walking back
3081573Srgrimes	 * up the tree and the tree is now inconsistent.  Nothing much we can
3091573Srgrimes	 * do about it but release any memory we're holding.
3101573Srgrimes	 */
3111573Srgrimeserr1:	mpool_put(t->bt_mp, lchild, MPOOL_DIRTY);
3121573Srgrimes	mpool_put(t->bt_mp, rchild, MPOOL_DIRTY);
3131573Srgrimes
3141573Srgrimeserr2:	mpool_put(t->bt_mp, l, 0);
3151573Srgrimes	mpool_put(t->bt_mp, r, 0);
3161573Srgrimes	__dbpanic(t->bt_dbp);
3171573Srgrimes	return (RET_ERROR);
3181573Srgrimes}
3191573Srgrimes
3201573Srgrimes/*
3211573Srgrimes * BT_PAGE -- Split a non-root page of a btree.
3221573Srgrimes *
3231573Srgrimes * Parameters:
3241573Srgrimes *	t:	tree
3251573Srgrimes *	h:	root page
3261573Srgrimes *	lp:	pointer to left page pointer
3271573Srgrimes *	rp:	pointer to right page pointer
3281573Srgrimes *	skip:	pointer to index to leave open
3291573Srgrimes *	ilen:	insert length
3301573Srgrimes *
3311573Srgrimes * Returns:
3321573Srgrimes *	Pointer to page in which to insert or NULL on error.
3331573Srgrimes */
3341573Srgrimesstatic PAGE *
335189291Sdelphijbt_page(BTREE *t, PAGE *h, PAGE **lp, PAGE **rp, indx_t *skip, size_t ilen)
3361573Srgrimes{
3371573Srgrimes	PAGE *l, *r, *tp;
3381573Srgrimes	pgno_t npg;
3391573Srgrimes
3401573Srgrimes#ifdef STATISTICS
3411573Srgrimes	++bt_split;
3421573Srgrimes#endif
3431573Srgrimes	/* Put the new right page for the split into place. */
3441573Srgrimes	if ((r = __bt_new(t, &npg)) == NULL)
3451573Srgrimes		return (NULL);
3461573Srgrimes	r->pgno = npg;
3471573Srgrimes	r->lower = BTDATAOFF;
3481573Srgrimes	r->upper = t->bt_psize;
3491573Srgrimes	r->nextpg = h->nextpg;
3501573Srgrimes	r->prevpg = h->pgno;
3511573Srgrimes	r->flags = h->flags & P_TYPE;
3521573Srgrimes
3531573Srgrimes	/*
3541573Srgrimes	 * If we're splitting the last page on a level because we're appending
3551573Srgrimes	 * a key to it (skip is NEXTINDEX()), it's likely that the data is
3561573Srgrimes	 * sorted.  Adding an empty page on the side of the level is less work
3571573Srgrimes	 * and can push the fill factor much higher than normal.  If we're
3581573Srgrimes	 * wrong it's no big deal, we'll just do the split the right way next
3591573Srgrimes	 * time.  It may look like it's equally easy to do a similar hack for
3601573Srgrimes	 * reverse sorted data, that is, split the tree left, but it's not.
3611573Srgrimes	 * Don't even try.
3621573Srgrimes	 */
3631573Srgrimes	if (h->nextpg == P_INVALID && *skip == NEXTINDEX(h)) {
3641573Srgrimes#ifdef STATISTICS
3651573Srgrimes		++bt_sortsplit;
3661573Srgrimes#endif
3671573Srgrimes		h->nextpg = r->pgno;
3681573Srgrimes		r->lower = BTDATAOFF + sizeof(indx_t);
3691573Srgrimes		*skip = 0;
3701573Srgrimes		*lp = h;
3711573Srgrimes		*rp = r;
3721573Srgrimes		return (r);
3731573Srgrimes	}
3741573Srgrimes
3751573Srgrimes	/* Put the new left page for the split into place. */
376190482Sdelphij	if ((l = (PAGE *)calloc(1, t->bt_psize)) == NULL) {
3771573Srgrimes		mpool_put(t->bt_mp, r, 0);
3781573Srgrimes		return (NULL);
3791573Srgrimes	}
3801573Srgrimes	l->pgno = h->pgno;
3811573Srgrimes	l->nextpg = r->pgno;
3821573Srgrimes	l->prevpg = h->prevpg;
3831573Srgrimes	l->lower = BTDATAOFF;
3841573Srgrimes	l->upper = t->bt_psize;
3851573Srgrimes	l->flags = h->flags & P_TYPE;
3861573Srgrimes
3871573Srgrimes	/* Fix up the previous pointer of the page after the split page. */
3881573Srgrimes	if (h->nextpg != P_INVALID) {
3891573Srgrimes		if ((tp = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL) {
3901573Srgrimes			free(l);
3911573Srgrimes			/* XXX mpool_free(t->bt_mp, r->pgno); */
3921573Srgrimes			return (NULL);
3931573Srgrimes		}
3941573Srgrimes		tp->prevpg = r->pgno;
39514272Spst		mpool_put(t->bt_mp, tp, MPOOL_DIRTY);
3961573Srgrimes	}
3971573Srgrimes
3981573Srgrimes	/*
3991573Srgrimes	 * Split right.  The key/data pairs aren't sorted in the btree page so
4001573Srgrimes	 * it's simpler to copy the data from the split page onto two new pages
4011573Srgrimes	 * instead of copying half the data to the right page and compacting
4021573Srgrimes	 * the left page in place.  Since the left page can't change, we have
4031573Srgrimes	 * to swap the original and the allocated left page after the split.
4041573Srgrimes	 */
4051573Srgrimes	tp = bt_psplit(t, h, l, r, skip, ilen);
4061573Srgrimes
4071573Srgrimes	/* Move the new left page onto the old left page. */
4081573Srgrimes	memmove(h, l, t->bt_psize);
4091573Srgrimes	if (tp == l)
4101573Srgrimes		tp = h;
4111573Srgrimes	free(l);
4121573Srgrimes
4131573Srgrimes	*lp = h;
4141573Srgrimes	*rp = r;
4151573Srgrimes	return (tp);
4161573Srgrimes}
4171573Srgrimes
4181573Srgrimes/*
4191573Srgrimes * BT_ROOT -- Split the root page of a btree.
4201573Srgrimes *
4211573Srgrimes * Parameters:
4221573Srgrimes *	t:	tree
4231573Srgrimes *	h:	root page
4241573Srgrimes *	lp:	pointer to left page pointer
4251573Srgrimes *	rp:	pointer to right page pointer
4261573Srgrimes *	skip:	pointer to index to leave open
4271573Srgrimes *	ilen:	insert length
4281573Srgrimes *
4291573Srgrimes * Returns:
4301573Srgrimes *	Pointer to page in which to insert or NULL on error.
4311573Srgrimes */
4321573Srgrimesstatic PAGE *
433189291Sdelphijbt_root(BTREE *t, PAGE *h, PAGE **lp, PAGE **rp, indx_t *skip, size_t ilen)
4341573Srgrimes{
4351573Srgrimes	PAGE *l, *r, *tp;
4361573Srgrimes	pgno_t lnpg, rnpg;
4371573Srgrimes
4381573Srgrimes#ifdef STATISTICS
4391573Srgrimes	++bt_split;
4401573Srgrimes	++bt_rootsplit;
4411573Srgrimes#endif
4421573Srgrimes	/* Put the new left and right pages for the split into place. */
4431573Srgrimes	if ((l = __bt_new(t, &lnpg)) == NULL ||
4441573Srgrimes	    (r = __bt_new(t, &rnpg)) == NULL)
4451573Srgrimes		return (NULL);
4461573Srgrimes	l->pgno = lnpg;
4471573Srgrimes	r->pgno = rnpg;
4481573Srgrimes	l->nextpg = r->pgno;
4491573Srgrimes	r->prevpg = l->pgno;
4501573Srgrimes	l->prevpg = r->nextpg = P_INVALID;
4511573Srgrimes	l->lower = r->lower = BTDATAOFF;
4521573Srgrimes	l->upper = r->upper = t->bt_psize;
4531573Srgrimes	l->flags = r->flags = h->flags & P_TYPE;
4541573Srgrimes
4551573Srgrimes	/* Split the root page. */
4561573Srgrimes	tp = bt_psplit(t, h, l, r, skip, ilen);
4571573Srgrimes
4581573Srgrimes	*lp = l;
4591573Srgrimes	*rp = r;
4601573Srgrimes	return (tp);
4611573Srgrimes}
4621573Srgrimes
4631573Srgrimes/*
4641573Srgrimes * BT_RROOT -- Fix up the recno root page after it has been split.
4651573Srgrimes *
4661573Srgrimes * Parameters:
4671573Srgrimes *	t:	tree
4681573Srgrimes *	h:	root page
4691573Srgrimes *	l:	left page
4701573Srgrimes *	r:	right page
4711573Srgrimes *
4721573Srgrimes * Returns:
4731573Srgrimes *	RET_ERROR, RET_SUCCESS
4741573Srgrimes */
4751573Srgrimesstatic int
476189291Sdelphijbt_rroot(BTREE *t, PAGE *h, PAGE *l, PAGE *r)
4771573Srgrimes{
4781573Srgrimes	char *dest;
4791573Srgrimes
4801573Srgrimes	/* Insert the left and right keys, set the header information. */
4811573Srgrimes	h->linp[0] = h->upper = t->bt_psize - NRINTERNAL;
4821573Srgrimes	dest = (char *)h + h->upper;
4831573Srgrimes	WR_RINTERNAL(dest,
4841573Srgrimes	    l->flags & P_RLEAF ? NEXTINDEX(l) : rec_total(l), l->pgno);
4851573Srgrimes
486223262Sbenl	__PAST_END(h->linp, 1) = h->upper -= NRINTERNAL;
4871573Srgrimes	dest = (char *)h + h->upper;
4881573Srgrimes	WR_RINTERNAL(dest,
4891573Srgrimes	    r->flags & P_RLEAF ? NEXTINDEX(r) : rec_total(r), r->pgno);
4901573Srgrimes
4911573Srgrimes	h->lower = BTDATAOFF + 2 * sizeof(indx_t);
4921573Srgrimes
4931573Srgrimes	/* Unpin the root page, set to recno internal page. */
4941573Srgrimes	h->flags &= ~P_TYPE;
4951573Srgrimes	h->flags |= P_RINTERNAL;
4961573Srgrimes	mpool_put(t->bt_mp, h, MPOOL_DIRTY);
4971573Srgrimes
4981573Srgrimes	return (RET_SUCCESS);
4991573Srgrimes}
5001573Srgrimes
5011573Srgrimes/*
5021573Srgrimes * BT_BROOT -- Fix up the btree root page after it has been split.
5031573Srgrimes *
5041573Srgrimes * Parameters:
5051573Srgrimes *	t:	tree
5061573Srgrimes *	h:	root page
5071573Srgrimes *	l:	left page
5081573Srgrimes *	r:	right page
5091573Srgrimes *
5101573Srgrimes * Returns:
5111573Srgrimes *	RET_ERROR, RET_SUCCESS
5121573Srgrimes */
5131573Srgrimesstatic int
514189291Sdelphijbt_broot(BTREE *t, PAGE *h, PAGE *l, PAGE *r)
5151573Srgrimes{
5161573Srgrimes	BINTERNAL *bi;
5171573Srgrimes	BLEAF *bl;
51814272Spst	u_int32_t nbytes;
5191573Srgrimes	char *dest;
5201573Srgrimes
5211573Srgrimes	/*
5221573Srgrimes	 * If the root page was a leaf page, change it into an internal page.
5231573Srgrimes	 * We copy the key we split on (but not the key's data, in the case of
5241573Srgrimes	 * a leaf page) to the new root page.
5251573Srgrimes	 *
5261573Srgrimes	 * The btree comparison code guarantees that the left-most key on any
5271573Srgrimes	 * level of the tree is never used, so it doesn't need to be filled in.
5281573Srgrimes	 */
5291573Srgrimes	nbytes = NBINTERNAL(0);
5301573Srgrimes	h->linp[0] = h->upper = t->bt_psize - nbytes;
5311573Srgrimes	dest = (char *)h + h->upper;
5321573Srgrimes	WR_BINTERNAL(dest, 0, l->pgno, 0);
5331573Srgrimes
53414272Spst	switch (h->flags & P_TYPE) {
5351573Srgrimes	case P_BLEAF:
5361573Srgrimes		bl = GETBLEAF(r, 0);
5371573Srgrimes		nbytes = NBINTERNAL(bl->ksize);
538223262Sbenl		__PAST_END(h->linp, 1) = h->upper -= nbytes;
5391573Srgrimes		dest = (char *)h + h->upper;
5401573Srgrimes		WR_BINTERNAL(dest, bl->ksize, r->pgno, 0);
5411573Srgrimes		memmove(dest, bl->bytes, bl->ksize);
5421573Srgrimes
5431573Srgrimes		/*
5441573Srgrimes		 * If the key is on an overflow page, mark the overflow chain
5451573Srgrimes		 * so it isn't deleted when the leaf copy of the key is deleted.
5461573Srgrimes		 */
5471573Srgrimes		if (bl->flags & P_BIGKEY &&
5481573Srgrimes		    bt_preserve(t, *(pgno_t *)bl->bytes) == RET_ERROR)
5491573Srgrimes			return (RET_ERROR);
5501573Srgrimes		break;
5511573Srgrimes	case P_BINTERNAL:
5521573Srgrimes		bi = GETBINTERNAL(r, 0);
5531573Srgrimes		nbytes = NBINTERNAL(bi->ksize);
554223262Sbenl		__PAST_END(h->linp, 1) = h->upper -= nbytes;
5551573Srgrimes		dest = (char *)h + h->upper;
5561573Srgrimes		memmove(dest, bi, nbytes);
5571573Srgrimes		((BINTERNAL *)dest)->pgno = r->pgno;
5581573Srgrimes		break;
5591573Srgrimes	default:
5601573Srgrimes		abort();
5611573Srgrimes	}
5621573Srgrimes
5631573Srgrimes	/* There are two keys on the page. */
5641573Srgrimes	h->lower = BTDATAOFF + 2 * sizeof(indx_t);
5651573Srgrimes
5661573Srgrimes	/* Unpin the root page, set to btree internal page. */
5671573Srgrimes	h->flags &= ~P_TYPE;
5681573Srgrimes	h->flags |= P_BINTERNAL;
5691573Srgrimes	mpool_put(t->bt_mp, h, MPOOL_DIRTY);
5701573Srgrimes
5711573Srgrimes	return (RET_SUCCESS);
5721573Srgrimes}
5731573Srgrimes
5741573Srgrimes/*
5751573Srgrimes * BT_PSPLIT -- Do the real work of splitting the page.
5761573Srgrimes *
5771573Srgrimes * Parameters:
5781573Srgrimes *	t:	tree
5791573Srgrimes *	h:	page to be split
5801573Srgrimes *	l:	page to put lower half of data
5811573Srgrimes *	r:	page to put upper half of data
5821573Srgrimes *	pskip:	pointer to index to leave open
5831573Srgrimes *	ilen:	insert length
5841573Srgrimes *
5851573Srgrimes * Returns:
5861573Srgrimes *	Pointer to page in which to insert.
5871573Srgrimes */
5881573Srgrimesstatic PAGE *
589189291Sdelphijbt_psplit(BTREE *t, PAGE *h, PAGE *l, PAGE *r, indx_t *pskip, size_t ilen)
5901573Srgrimes{
5911573Srgrimes	BINTERNAL *bi;
5921573Srgrimes	BLEAF *bl;
59314272Spst	CURSOR *c;
5941573Srgrimes	RLEAF *rl;
5951573Srgrimes	PAGE *rval;
5961573Srgrimes	void *src;
5971573Srgrimes	indx_t full, half, nxt, off, skip, top, used;
59814272Spst	u_int32_t nbytes;
5991573Srgrimes	int bigkeycnt, isbigkey;
6001573Srgrimes
6011573Srgrimes	/*
6021573Srgrimes	 * Split the data to the left and right pages.  Leave the skip index
6031573Srgrimes	 * open.  Additionally, make some effort not to split on an overflow
6041573Srgrimes	 * key.  This makes internal page processing faster and can save
6051573Srgrimes	 * space as overflow keys used by internal pages are never deleted.
6061573Srgrimes	 */
6071573Srgrimes	bigkeycnt = 0;
6081573Srgrimes	skip = *pskip;
6091573Srgrimes	full = t->bt_psize - BTDATAOFF;
6101573Srgrimes	half = full / 2;
6111573Srgrimes	used = 0;
6121573Srgrimes	for (nxt = off = 0, top = NEXTINDEX(h); nxt < top; ++off) {
6131573Srgrimes		if (skip == off) {
6141573Srgrimes			nbytes = ilen;
6151573Srgrimes			isbigkey = 0;		/* XXX: not really known. */
6161573Srgrimes		} else
6171573Srgrimes			switch (h->flags & P_TYPE) {
6181573Srgrimes			case P_BINTERNAL:
6191573Srgrimes				src = bi = GETBINTERNAL(h, nxt);
6201573Srgrimes				nbytes = NBINTERNAL(bi->ksize);
6211573Srgrimes				isbigkey = bi->flags & P_BIGKEY;
6221573Srgrimes				break;
6231573Srgrimes			case P_BLEAF:
6241573Srgrimes				src = bl = GETBLEAF(h, nxt);
6251573Srgrimes				nbytes = NBLEAF(bl);
6261573Srgrimes				isbigkey = bl->flags & P_BIGKEY;
6271573Srgrimes				break;
6281573Srgrimes			case P_RINTERNAL:
6291573Srgrimes				src = GETRINTERNAL(h, nxt);
6301573Srgrimes				nbytes = NRINTERNAL;
6311573Srgrimes				isbigkey = 0;
6321573Srgrimes				break;
6331573Srgrimes			case P_RLEAF:
6341573Srgrimes				src = rl = GETRLEAF(h, nxt);
6351573Srgrimes				nbytes = NRLEAF(rl);
6361573Srgrimes				isbigkey = 0;
6371573Srgrimes				break;
6381573Srgrimes			default:
6391573Srgrimes				abort();
6401573Srgrimes			}
6411573Srgrimes
6421573Srgrimes		/*
6431573Srgrimes		 * If the key/data pairs are substantial fractions of the max
6441573Srgrimes		 * possible size for the page, it's possible to get situations
6451573Srgrimes		 * where we decide to try and copy too much onto the left page.
6461573Srgrimes		 * Make sure that doesn't happen.
6471573Srgrimes		 */
648194803Sdelphij		if ((skip <= off && used + nbytes + sizeof(indx_t) >= full) ||
649194803Sdelphij		    nxt == top - 1) {
6501573Srgrimes			--off;
6511573Srgrimes			break;
6521573Srgrimes		}
6531573Srgrimes
6541573Srgrimes		/* Copy the key/data pair, if not the skipped index. */
6551573Srgrimes		if (skip != off) {
6561573Srgrimes			++nxt;
6571573Srgrimes
6581573Srgrimes			l->linp[off] = l->upper -= nbytes;
6591573Srgrimes			memmove((char *)l + l->upper, src, nbytes);
6601573Srgrimes		}
6611573Srgrimes
66237155Sguido		used += nbytes + sizeof(indx_t);
6631573Srgrimes		if (used >= half) {
6641573Srgrimes			if (!isbigkey || bigkeycnt == 3)
6651573Srgrimes				break;
6661573Srgrimes			else
6671573Srgrimes				++bigkeycnt;
6681573Srgrimes		}
6691573Srgrimes	}
6701573Srgrimes
6711573Srgrimes	/*
6721573Srgrimes	 * Off is the last offset that's valid for the left page.
6731573Srgrimes	 * Nxt is the first offset to be placed on the right page.
6741573Srgrimes	 */
6751573Srgrimes	l->lower += (off + 1) * sizeof(indx_t);
6761573Srgrimes
6771573Srgrimes	/*
6781573Srgrimes	 * If splitting the page that the cursor was on, the cursor has to be
6791573Srgrimes	 * adjusted to point to the same record as before the split.  If the
6801573Srgrimes	 * cursor is at or past the skipped slot, the cursor is incremented by
6811573Srgrimes	 * one.  If the cursor is on the right page, it is decremented by the
6821573Srgrimes	 * number of records split to the left page.
6831573Srgrimes	 */
68414272Spst	c = &t->bt_cursor;
68514272Spst	if (F_ISSET(c, CURS_INIT) && c->pg.pgno == h->pgno) {
68614272Spst		if (c->pg.index >= skip)
68714272Spst			++c->pg.index;
68814272Spst		if (c->pg.index < nxt)			/* Left page. */
68914272Spst			c->pg.pgno = l->pgno;
6901573Srgrimes		else {					/* Right page. */
69114272Spst			c->pg.pgno = r->pgno;
69214272Spst			c->pg.index -= nxt;
6931573Srgrimes		}
6941573Srgrimes	}
6951573Srgrimes
6961573Srgrimes	/*
6971573Srgrimes	 * If the skipped index was on the left page, just return that page.
6981573Srgrimes	 * Otherwise, adjust the skip index to reflect the new position on
6991573Srgrimes	 * the right page.
7001573Srgrimes	 */
7011573Srgrimes	if (skip <= off) {
702135178Skuriyama		skip = MAX_PAGE_OFFSET;
7031573Srgrimes		rval = l;
7041573Srgrimes	} else {
7051573Srgrimes		rval = r;
7061573Srgrimes		*pskip -= nxt;
7071573Srgrimes	}
7081573Srgrimes
7091573Srgrimes	for (off = 0; nxt < top; ++off) {
7101573Srgrimes		if (skip == nxt) {
7111573Srgrimes			++off;
712135178Skuriyama			skip = MAX_PAGE_OFFSET;
7131573Srgrimes		}
7141573Srgrimes		switch (h->flags & P_TYPE) {
7151573Srgrimes		case P_BINTERNAL:
7161573Srgrimes			src = bi = GETBINTERNAL(h, nxt);
7171573Srgrimes			nbytes = NBINTERNAL(bi->ksize);
7181573Srgrimes			break;
7191573Srgrimes		case P_BLEAF:
7201573Srgrimes			src = bl = GETBLEAF(h, nxt);
7211573Srgrimes			nbytes = NBLEAF(bl);
7221573Srgrimes			break;
7231573Srgrimes		case P_RINTERNAL:
7241573Srgrimes			src = GETRINTERNAL(h, nxt);
7251573Srgrimes			nbytes = NRINTERNAL;
7261573Srgrimes			break;
7271573Srgrimes		case P_RLEAF:
7281573Srgrimes			src = rl = GETRLEAF(h, nxt);
7291573Srgrimes			nbytes = NRLEAF(rl);
7301573Srgrimes			break;
7311573Srgrimes		default:
7321573Srgrimes			abort();
7331573Srgrimes		}
7341573Srgrimes		++nxt;
7351573Srgrimes		r->linp[off] = r->upper -= nbytes;
7361573Srgrimes		memmove((char *)r + r->upper, src, nbytes);
7371573Srgrimes	}
7381573Srgrimes	r->lower += off * sizeof(indx_t);
7391573Srgrimes
7401573Srgrimes	/* If the key is being appended to the page, adjust the index. */
7411573Srgrimes	if (skip == top)
7421573Srgrimes		r->lower += sizeof(indx_t);
7431573Srgrimes
7441573Srgrimes	return (rval);
7451573Srgrimes}
7461573Srgrimes
7471573Srgrimes/*
7481573Srgrimes * BT_PRESERVE -- Mark a chain of pages as used by an internal node.
7491573Srgrimes *
7501573Srgrimes * Chains of indirect blocks pointed to by leaf nodes get reclaimed when the
7511573Srgrimes * record that references them gets deleted.  Chains pointed to by internal
7521573Srgrimes * pages never get deleted.  This routine marks a chain as pointed to by an
7531573Srgrimes * internal page.
7541573Srgrimes *
7551573Srgrimes * Parameters:
7561573Srgrimes *	t:	tree
7571573Srgrimes *	pg:	page number of first page in the chain.
7581573Srgrimes *
7591573Srgrimes * Returns:
7601573Srgrimes *	RET_SUCCESS, RET_ERROR.
7611573Srgrimes */
7621573Srgrimesstatic int
763189291Sdelphijbt_preserve(BTREE *t, pgno_t pg)
7641573Srgrimes{
7651573Srgrimes	PAGE *h;
7661573Srgrimes
7671573Srgrimes	if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
7681573Srgrimes		return (RET_ERROR);
7691573Srgrimes	h->flags |= P_PRESERVE;
7701573Srgrimes	mpool_put(t->bt_mp, h, MPOOL_DIRTY);
7711573Srgrimes	return (RET_SUCCESS);
7721573Srgrimes}
7731573Srgrimes
7741573Srgrimes/*
7751573Srgrimes * REC_TOTAL -- Return the number of recno entries below a page.
7761573Srgrimes *
7771573Srgrimes * Parameters:
7781573Srgrimes *	h:	page
7791573Srgrimes *
7801573Srgrimes * Returns:
7811573Srgrimes *	The number of recno entries below a page.
7821573Srgrimes *
7831573Srgrimes * XXX
7841573Srgrimes * These values could be set by the bt_psplit routine.  The problem is that the
7851573Srgrimes * entry has to be popped off of the stack etc. or the values have to be passed
7861573Srgrimes * all the way back to bt_split/bt_rroot and it's not very clean.
7871573Srgrimes */
7881573Srgrimesstatic recno_t
789189291Sdelphijrec_total(PAGE *h)
7901573Srgrimes{
7911573Srgrimes	recno_t recs;
7921573Srgrimes	indx_t nxt, top;
7931573Srgrimes
7941573Srgrimes	for (recs = 0, nxt = 0, top = NEXTINDEX(h); nxt < top; ++nxt)
7951573Srgrimes		recs += GETRINTERNAL(h, nxt)->nrecs;
7961573Srgrimes	return (recs);
7971573Srgrimes}
798