11573Srgrimes/*-
214272Spst * Copyright (c) 1990, 1993, 1994
31573Srgrimes *	The Regents of the University of California.  All rights reserved.
41573Srgrimes *
51573Srgrimes * This code is derived from software contributed to Berkeley by
61573Srgrimes * Mike Olson.
71573Srgrimes *
81573Srgrimes * Redistribution and use in source and binary forms, with or without
91573Srgrimes * modification, are permitted provided that the following conditions
101573Srgrimes * are met:
111573Srgrimes * 1. Redistributions of source code must retain the above copyright
121573Srgrimes *    notice, this list of conditions and the following disclaimer.
131573Srgrimes * 2. Redistributions in binary form must reproduce the above copyright
141573Srgrimes *    notice, this list of conditions and the following disclaimer in the
151573Srgrimes *    documentation and/or other materials provided with the distribution.
161573Srgrimes * 4. Neither the name of the University nor the names of its contributors
171573Srgrimes *    may be used to endorse or promote products derived from this software
181573Srgrimes *    without specific prior written permission.
191573Srgrimes *
201573Srgrimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
211573Srgrimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
221573Srgrimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
231573Srgrimes * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
241573Srgrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
251573Srgrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
261573Srgrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
271573Srgrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
281573Srgrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
291573Srgrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
301573Srgrimes * SUCH DAMAGE.
311573Srgrimes */
321573Srgrimes
331573Srgrimes#if defined(LIBC_SCCS) && !defined(lint)
34194804Sdelphijstatic char sccsid[] = "@(#)bt_split.c	8.10 (Berkeley) 1/9/95";
351573Srgrimes#endif /* LIBC_SCCS and not lint */
3692905Sobrien#include <sys/cdefs.h>
3792905Sobrien__FBSDID("$FreeBSD$");
381573Srgrimes
39223262Sbenl#include <sys/param.h>
401573Srgrimes
411573Srgrimes#include <limits.h>
421573Srgrimes#include <stdio.h>
431573Srgrimes#include <stdlib.h>
441573Srgrimes#include <string.h>
451573Srgrimes
461573Srgrimes#include <db.h>
471573Srgrimes#include "btree.h"
481573Srgrimes
4992905Sobrienstatic int	 bt_broot(BTREE *, PAGE *, PAGE *, PAGE *);
50189291Sdelphijstatic PAGE	*bt_page(BTREE *, PAGE *, PAGE **, PAGE **, indx_t *, size_t);
5192905Sobrienstatic int	 bt_preserve(BTREE *, pgno_t);
52189291Sdelphijstatic PAGE	*bt_psplit(BTREE *, PAGE *, PAGE *, PAGE *, indx_t *, size_t);
53189291Sdelphijstatic PAGE	*bt_root(BTREE *, PAGE *, PAGE **, PAGE **, indx_t *, size_t);
5492905Sobrienstatic int	 bt_rroot(BTREE *, PAGE *, PAGE *, PAGE *);
5592905Sobrienstatic recno_t	 rec_total(PAGE *);
561573Srgrimes
571573Srgrimes#ifdef STATISTICS
581573Srgrimesu_long	bt_rootsplit, bt_split, bt_sortsplit, bt_pfxsaved;
591573Srgrimes#endif
601573Srgrimes
611573Srgrimes/*
621573Srgrimes * __BT_SPLIT -- Split the tree.
631573Srgrimes *
641573Srgrimes * Parameters:
651573Srgrimes *	t:	tree
661573Srgrimes *	sp:	page to split
671573Srgrimes *	key:	key to insert
681573Srgrimes *	data:	data to insert
691573Srgrimes *	flags:	BIGKEY/BIGDATA flags
701573Srgrimes *	ilen:	insert length
711573Srgrimes *	skip:	index to leave open
721573Srgrimes *
731573Srgrimes * Returns:
741573Srgrimes *	RET_ERROR, RET_SUCCESS
751573Srgrimes */
761573Srgrimesint
77189291Sdelphij__bt_split(BTREE *t, PAGE *sp, const DBT *key, const DBT *data, int flags,
78189291Sdelphij    size_t ilen, u_int32_t argskip)
791573Srgrimes{
801573Srgrimes	BINTERNAL *bi;
811573Srgrimes	BLEAF *bl, *tbl;
821573Srgrimes	DBT a, b;
831573Srgrimes	EPGNO *parent;
841573Srgrimes	PAGE *h, *l, *r, *lchild, *rchild;
851573Srgrimes	indx_t nxtindex;
8614272Spst	u_int16_t skip;
8714272Spst	u_int32_t n, nbytes, nksize;
881573Srgrimes	int parentsplit;
891573Srgrimes	char *dest;
901573Srgrimes
911573Srgrimes	/*
921573Srgrimes	 * Split the page into two pages, l and r.  The split routines return
931573Srgrimes	 * a pointer to the page into which the key should be inserted and with
941573Srgrimes	 * skip set to the offset which should be used.  Additionally, l and r
951573Srgrimes	 * are pinned.
961573Srgrimes	 */
9714272Spst	skip = argskip;
981573Srgrimes	h = sp->pgno == P_ROOT ?
991573Srgrimes	    bt_root(t, sp, &l, &r, &skip, ilen) :
1001573Srgrimes	    bt_page(t, sp, &l, &r, &skip, ilen);
1011573Srgrimes	if (h == NULL)
1021573Srgrimes		return (RET_ERROR);
1031573Srgrimes
1041573Srgrimes	/*
1051573Srgrimes	 * Insert the new key/data pair into the leaf page.  (Key inserts
1061573Srgrimes	 * always cause a leaf page to split first.)
1071573Srgrimes	 */
1081573Srgrimes	h->linp[skip] = h->upper -= ilen;
1091573Srgrimes	dest = (char *)h + h->upper;
11014272Spst	if (F_ISSET(t, R_RECNO))
1111573Srgrimes		WR_RLEAF(dest, data, flags)
1121573Srgrimes	else
1131573Srgrimes		WR_BLEAF(dest, key, data, flags)
1141573Srgrimes
1151573Srgrimes	/* If the root page was split, make it look right. */
1161573Srgrimes	if (sp->pgno == P_ROOT &&
11714272Spst	    (F_ISSET(t, R_RECNO) ?
1181573Srgrimes	    bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR)
1191573Srgrimes		goto err2;
1201573Srgrimes
1211573Srgrimes	/*
1221573Srgrimes	 * Now we walk the parent page stack -- a LIFO stack of the pages that
1231573Srgrimes	 * were traversed when we searched for the page that split.  Each stack
1241573Srgrimes	 * entry is a page number and a page index offset.  The offset is for
1251573Srgrimes	 * the page traversed on the search.  We've just split a page, so we
1261573Srgrimes	 * have to insert a new key into the parent page.
1271573Srgrimes	 *
1281573Srgrimes	 * If the insert into the parent page causes it to split, may have to
1291573Srgrimes	 * continue splitting all the way up the tree.  We stop if the root
1301573Srgrimes	 * splits or the page inserted into didn't have to split to hold the
1311573Srgrimes	 * new key.  Some algorithms replace the key for the old page as well
1321573Srgrimes	 * as the new page.  We don't, as there's no reason to believe that the
1331573Srgrimes	 * first key on the old page is any better than the key we have, and,
1341573Srgrimes	 * in the case of a key being placed at index 0 causing the split, the
1351573Srgrimes	 * key is unavailable.
1361573Srgrimes	 *
1371573Srgrimes	 * There are a maximum of 5 pages pinned at any time.  We keep the left
1381573Srgrimes	 * and right pages pinned while working on the parent.   The 5 are the
1391573Srgrimes	 * two children, left parent and right parent (when the parent splits)
1401573Srgrimes	 * and the root page or the overflow key page when calling bt_preserve.
1411573Srgrimes	 * This code must make sure that all pins are released other than the
1421573Srgrimes	 * root page or overflow page which is unlocked elsewhere.
1431573Srgrimes	 */
1441573Srgrimes	while ((parent = BT_POP(t)) != NULL) {
1451573Srgrimes		lchild = l;
1461573Srgrimes		rchild = r;
1471573Srgrimes
1481573Srgrimes		/* Get the parent page. */
1491573Srgrimes		if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL)
1501573Srgrimes			goto err2;
1511573Srgrimes
152189327Sdelphij		/*
1531573Srgrimes		 * The new key goes ONE AFTER the index, because the split
1541573Srgrimes		 * was to the right.
1551573Srgrimes		 */
1561573Srgrimes		skip = parent->index + 1;
1571573Srgrimes
1581573Srgrimes		/*
1591573Srgrimes		 * Calculate the space needed on the parent page.
1601573Srgrimes		 *
1611573Srgrimes		 * Prefix trees: space hack when inserting into BINTERNAL
1621573Srgrimes		 * pages.  Retain only what's needed to distinguish between
1631573Srgrimes		 * the new entry and the LAST entry on the page to its left.
1641573Srgrimes		 * If the keys compare equal, retain the entire key.  Note,
1651573Srgrimes		 * we don't touch overflow keys, and the entire key must be
1661573Srgrimes		 * retained for the next-to-left most key on the leftmost
1671573Srgrimes		 * page of each level, or the search will fail.  Applicable
1681573Srgrimes		 * ONLY to internal pages that have leaf pages as children.
1691573Srgrimes		 * Further reduction of the key between pairs of internal
1701573Srgrimes		 * pages loses too much information.
1711573Srgrimes		 */
1721573Srgrimes		switch (rchild->flags & P_TYPE) {
1731573Srgrimes		case P_BINTERNAL:
1741573Srgrimes			bi = GETBINTERNAL(rchild, 0);
1751573Srgrimes			nbytes = NBINTERNAL(bi->ksize);
1761573Srgrimes			break;
1771573Srgrimes		case P_BLEAF:
1781573Srgrimes			bl = GETBLEAF(rchild, 0);
1791573Srgrimes			nbytes = NBINTERNAL(bl->ksize);
1801573Srgrimes			if (t->bt_pfx && !(bl->flags & P_BIGKEY) &&
1811573Srgrimes			    (h->prevpg != P_INVALID || skip > 1)) {
1821573Srgrimes				tbl = GETBLEAF(lchild, NEXTINDEX(lchild) - 1);
1831573Srgrimes				a.size = tbl->ksize;
1841573Srgrimes				a.data = tbl->bytes;
1851573Srgrimes				b.size = bl->ksize;
1861573Srgrimes				b.data = bl->bytes;
1871573Srgrimes				nksize = t->bt_pfx(&a, &b);
1881573Srgrimes				n = NBINTERNAL(nksize);
1891573Srgrimes				if (n < nbytes) {
1901573Srgrimes#ifdef STATISTICS
1911573Srgrimes					bt_pfxsaved += nbytes - n;
1921573Srgrimes#endif
1931573Srgrimes					nbytes = n;
1941573Srgrimes				} else
1951573Srgrimes					nksize = 0;
1961573Srgrimes			} else
1971573Srgrimes				nksize = 0;
1981573Srgrimes			break;
1991573Srgrimes		case P_RINTERNAL:
2001573Srgrimes		case P_RLEAF:
2011573Srgrimes			nbytes = NRINTERNAL;
2021573Srgrimes			break;
2031573Srgrimes		default:
2041573Srgrimes			abort();
2051573Srgrimes		}
2061573Srgrimes
2071573Srgrimes		/* Split the parent page if necessary or shift the indices. */
208190484Sdelphij		if ((u_int32_t)(h->upper - h->lower) < nbytes + sizeof(indx_t)) {
2091573Srgrimes			sp = h;
2101573Srgrimes			h = h->pgno == P_ROOT ?
2111573Srgrimes			    bt_root(t, h, &l, &r, &skip, nbytes) :
2121573Srgrimes			    bt_page(t, h, &l, &r, &skip, nbytes);
2131573Srgrimes			if (h == NULL)
2141573Srgrimes				goto err1;
2151573Srgrimes			parentsplit = 1;
2161573Srgrimes		} else {
2171573Srgrimes			if (skip < (nxtindex = NEXTINDEX(h)))
2181573Srgrimes				memmove(h->linp + skip + 1, h->linp + skip,
2191573Srgrimes				    (nxtindex - skip) * sizeof(indx_t));
2201573Srgrimes			h->lower += sizeof(indx_t);
2211573Srgrimes			parentsplit = 0;
2221573Srgrimes		}
2231573Srgrimes
2241573Srgrimes		/* Insert the key into the parent page. */
22514272Spst		switch (rchild->flags & P_TYPE) {
2261573Srgrimes		case P_BINTERNAL:
2271573Srgrimes			h->linp[skip] = h->upper -= nbytes;
2281573Srgrimes			dest = (char *)h + h->linp[skip];
2291573Srgrimes			memmove(dest, bi, nbytes);
2301573Srgrimes			((BINTERNAL *)dest)->pgno = rchild->pgno;
2311573Srgrimes			break;
2321573Srgrimes		case P_BLEAF:
2331573Srgrimes			h->linp[skip] = h->upper -= nbytes;
2341573Srgrimes			dest = (char *)h + h->linp[skip];
2351573Srgrimes			WR_BINTERNAL(dest, nksize ? nksize : bl->ksize,
2361573Srgrimes			    rchild->pgno, bl->flags & P_BIGKEY);
2371573Srgrimes			memmove(dest, bl->bytes, nksize ? nksize : bl->ksize);
238295973Spfg			if (bl->flags & P_BIGKEY) {
239295973Spfg				pgno_t pgno;
240295973Spfg				memcpy(&pgno, bl->bytes, sizeof(pgno));
241295973Spfg				if (bt_preserve(t, pgno) == RET_ERROR)
242295973Spfg					goto err1;
243295973Spfg			}
2441573Srgrimes			break;
2451573Srgrimes		case P_RINTERNAL:
2461573Srgrimes			/*
2471573Srgrimes			 * Update the left page count.  If split
2481573Srgrimes			 * added at index 0, fix the correct page.
2491573Srgrimes			 */
2501573Srgrimes			if (skip > 0)
2511573Srgrimes				dest = (char *)h + h->linp[skip - 1];
2521573Srgrimes			else
2531573Srgrimes				dest = (char *)l + l->linp[NEXTINDEX(l) - 1];
2541573Srgrimes			((RINTERNAL *)dest)->nrecs = rec_total(lchild);
2551573Srgrimes			((RINTERNAL *)dest)->pgno = lchild->pgno;
2561573Srgrimes
2571573Srgrimes			/* Update the right page count. */
2581573Srgrimes			h->linp[skip] = h->upper -= nbytes;
2591573Srgrimes			dest = (char *)h + h->linp[skip];
2601573Srgrimes			((RINTERNAL *)dest)->nrecs = rec_total(rchild);
2611573Srgrimes			((RINTERNAL *)dest)->pgno = rchild->pgno;
2621573Srgrimes			break;
2631573Srgrimes		case P_RLEAF:
2641573Srgrimes			/*
2651573Srgrimes			 * Update the left page count.  If split
2661573Srgrimes			 * added at index 0, fix the correct page.
2671573Srgrimes			 */
2681573Srgrimes			if (skip > 0)
2691573Srgrimes				dest = (char *)h + h->linp[skip - 1];
2701573Srgrimes			else
2711573Srgrimes				dest = (char *)l + l->linp[NEXTINDEX(l) - 1];
2721573Srgrimes			((RINTERNAL *)dest)->nrecs = NEXTINDEX(lchild);
2731573Srgrimes			((RINTERNAL *)dest)->pgno = lchild->pgno;
2741573Srgrimes
2751573Srgrimes			/* Update the right page count. */
2761573Srgrimes			h->linp[skip] = h->upper -= nbytes;
2771573Srgrimes			dest = (char *)h + h->linp[skip];
2781573Srgrimes			((RINTERNAL *)dest)->nrecs = NEXTINDEX(rchild);
2791573Srgrimes			((RINTERNAL *)dest)->pgno = rchild->pgno;
2801573Srgrimes			break;
2811573Srgrimes		default:
2821573Srgrimes			abort();
2831573Srgrimes		}
2841573Srgrimes
2851573Srgrimes		/* Unpin the held pages. */
2861573Srgrimes		if (!parentsplit) {
2871573Srgrimes			mpool_put(t->bt_mp, h, MPOOL_DIRTY);
2881573Srgrimes			break;
2891573Srgrimes		}
2901573Srgrimes
2911573Srgrimes		/* If the root page was split, make it look right. */
2921573Srgrimes		if (sp->pgno == P_ROOT &&
29314272Spst		    (F_ISSET(t, R_RECNO) ?
2941573Srgrimes		    bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR)
2951573Srgrimes			goto err1;
2961573Srgrimes
2971573Srgrimes		mpool_put(t->bt_mp, lchild, MPOOL_DIRTY);
2981573Srgrimes		mpool_put(t->bt_mp, rchild, MPOOL_DIRTY);
2991573Srgrimes	}
3001573Srgrimes
3011573Srgrimes	/* Unpin the held pages. */
3021573Srgrimes	mpool_put(t->bt_mp, l, MPOOL_DIRTY);
3031573Srgrimes	mpool_put(t->bt_mp, r, MPOOL_DIRTY);
3041573Srgrimes
3051573Srgrimes	/* Clear any pages left on the stack. */
3061573Srgrimes	return (RET_SUCCESS);
3071573Srgrimes
3081573Srgrimes	/*
3091573Srgrimes	 * If something fails in the above loop we were already walking back
3101573Srgrimes	 * up the tree and the tree is now inconsistent.  Nothing much we can
3111573Srgrimes	 * do about it but release any memory we're holding.
3121573Srgrimes	 */
3131573Srgrimeserr1:	mpool_put(t->bt_mp, lchild, MPOOL_DIRTY);
3141573Srgrimes	mpool_put(t->bt_mp, rchild, MPOOL_DIRTY);
3151573Srgrimes
3161573Srgrimeserr2:	mpool_put(t->bt_mp, l, 0);
3171573Srgrimes	mpool_put(t->bt_mp, r, 0);
3181573Srgrimes	__dbpanic(t->bt_dbp);
3191573Srgrimes	return (RET_ERROR);
3201573Srgrimes}
3211573Srgrimes
3221573Srgrimes/*
3231573Srgrimes * BT_PAGE -- Split a non-root page of a btree.
3241573Srgrimes *
3251573Srgrimes * Parameters:
3261573Srgrimes *	t:	tree
3271573Srgrimes *	h:	root page
3281573Srgrimes *	lp:	pointer to left page pointer
3291573Srgrimes *	rp:	pointer to right page pointer
3301573Srgrimes *	skip:	pointer to index to leave open
3311573Srgrimes *	ilen:	insert length
3321573Srgrimes *
3331573Srgrimes * Returns:
3341573Srgrimes *	Pointer to page in which to insert or NULL on error.
3351573Srgrimes */
3361573Srgrimesstatic PAGE *
337189291Sdelphijbt_page(BTREE *t, PAGE *h, PAGE **lp, PAGE **rp, indx_t *skip, size_t ilen)
3381573Srgrimes{
3391573Srgrimes	PAGE *l, *r, *tp;
3401573Srgrimes	pgno_t npg;
3411573Srgrimes
3421573Srgrimes#ifdef STATISTICS
3431573Srgrimes	++bt_split;
3441573Srgrimes#endif
3451573Srgrimes	/* Put the new right page for the split into place. */
3461573Srgrimes	if ((r = __bt_new(t, &npg)) == NULL)
3471573Srgrimes		return (NULL);
3481573Srgrimes	r->pgno = npg;
3491573Srgrimes	r->lower = BTDATAOFF;
3501573Srgrimes	r->upper = t->bt_psize;
3511573Srgrimes	r->nextpg = h->nextpg;
3521573Srgrimes	r->prevpg = h->pgno;
3531573Srgrimes	r->flags = h->flags & P_TYPE;
3541573Srgrimes
3551573Srgrimes	/*
3561573Srgrimes	 * If we're splitting the last page on a level because we're appending
3571573Srgrimes	 * a key to it (skip is NEXTINDEX()), it's likely that the data is
3581573Srgrimes	 * sorted.  Adding an empty page on the side of the level is less work
3591573Srgrimes	 * and can push the fill factor much higher than normal.  If we're
3601573Srgrimes	 * wrong it's no big deal, we'll just do the split the right way next
3611573Srgrimes	 * time.  It may look like it's equally easy to do a similar hack for
3621573Srgrimes	 * reverse sorted data, that is, split the tree left, but it's not.
3631573Srgrimes	 * Don't even try.
3641573Srgrimes	 */
3651573Srgrimes	if (h->nextpg == P_INVALID && *skip == NEXTINDEX(h)) {
3661573Srgrimes#ifdef STATISTICS
3671573Srgrimes		++bt_sortsplit;
3681573Srgrimes#endif
3691573Srgrimes		h->nextpg = r->pgno;
3701573Srgrimes		r->lower = BTDATAOFF + sizeof(indx_t);
3711573Srgrimes		*skip = 0;
3721573Srgrimes		*lp = h;
3731573Srgrimes		*rp = r;
3741573Srgrimes		return (r);
3751573Srgrimes	}
3761573Srgrimes
3771573Srgrimes	/* Put the new left page for the split into place. */
378190482Sdelphij	if ((l = (PAGE *)calloc(1, t->bt_psize)) == NULL) {
3791573Srgrimes		mpool_put(t->bt_mp, r, 0);
3801573Srgrimes		return (NULL);
3811573Srgrimes	}
3821573Srgrimes	l->pgno = h->pgno;
3831573Srgrimes	l->nextpg = r->pgno;
3841573Srgrimes	l->prevpg = h->prevpg;
3851573Srgrimes	l->lower = BTDATAOFF;
3861573Srgrimes	l->upper = t->bt_psize;
3871573Srgrimes	l->flags = h->flags & P_TYPE;
3881573Srgrimes
3891573Srgrimes	/* Fix up the previous pointer of the page after the split page. */
3901573Srgrimes	if (h->nextpg != P_INVALID) {
3911573Srgrimes		if ((tp = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL) {
3921573Srgrimes			free(l);
3931573Srgrimes			/* XXX mpool_free(t->bt_mp, r->pgno); */
3941573Srgrimes			return (NULL);
3951573Srgrimes		}
3961573Srgrimes		tp->prevpg = r->pgno;
39714272Spst		mpool_put(t->bt_mp, tp, MPOOL_DIRTY);
3981573Srgrimes	}
3991573Srgrimes
4001573Srgrimes	/*
4011573Srgrimes	 * Split right.  The key/data pairs aren't sorted in the btree page so
4021573Srgrimes	 * it's simpler to copy the data from the split page onto two new pages
4031573Srgrimes	 * instead of copying half the data to the right page and compacting
4041573Srgrimes	 * the left page in place.  Since the left page can't change, we have
4051573Srgrimes	 * to swap the original and the allocated left page after the split.
4061573Srgrimes	 */
4071573Srgrimes	tp = bt_psplit(t, h, l, r, skip, ilen);
4081573Srgrimes
4091573Srgrimes	/* Move the new left page onto the old left page. */
4101573Srgrimes	memmove(h, l, t->bt_psize);
4111573Srgrimes	if (tp == l)
4121573Srgrimes		tp = h;
4131573Srgrimes	free(l);
4141573Srgrimes
4151573Srgrimes	*lp = h;
4161573Srgrimes	*rp = r;
4171573Srgrimes	return (tp);
4181573Srgrimes}
4191573Srgrimes
4201573Srgrimes/*
4211573Srgrimes * BT_ROOT -- Split the root page of a btree.
4221573Srgrimes *
4231573Srgrimes * Parameters:
4241573Srgrimes *	t:	tree
4251573Srgrimes *	h:	root page
4261573Srgrimes *	lp:	pointer to left page pointer
4271573Srgrimes *	rp:	pointer to right page pointer
4281573Srgrimes *	skip:	pointer to index to leave open
4291573Srgrimes *	ilen:	insert length
4301573Srgrimes *
4311573Srgrimes * Returns:
4321573Srgrimes *	Pointer to page in which to insert or NULL on error.
4331573Srgrimes */
4341573Srgrimesstatic PAGE *
435189291Sdelphijbt_root(BTREE *t, PAGE *h, PAGE **lp, PAGE **rp, indx_t *skip, size_t ilen)
4361573Srgrimes{
4371573Srgrimes	PAGE *l, *r, *tp;
4381573Srgrimes	pgno_t lnpg, rnpg;
4391573Srgrimes
4401573Srgrimes#ifdef STATISTICS
4411573Srgrimes	++bt_split;
4421573Srgrimes	++bt_rootsplit;
4431573Srgrimes#endif
4441573Srgrimes	/* Put the new left and right pages for the split into place. */
4451573Srgrimes	if ((l = __bt_new(t, &lnpg)) == NULL ||
4461573Srgrimes	    (r = __bt_new(t, &rnpg)) == NULL)
4471573Srgrimes		return (NULL);
4481573Srgrimes	l->pgno = lnpg;
4491573Srgrimes	r->pgno = rnpg;
4501573Srgrimes	l->nextpg = r->pgno;
4511573Srgrimes	r->prevpg = l->pgno;
4521573Srgrimes	l->prevpg = r->nextpg = P_INVALID;
4531573Srgrimes	l->lower = r->lower = BTDATAOFF;
4541573Srgrimes	l->upper = r->upper = t->bt_psize;
4551573Srgrimes	l->flags = r->flags = h->flags & P_TYPE;
4561573Srgrimes
4571573Srgrimes	/* Split the root page. */
4581573Srgrimes	tp = bt_psplit(t, h, l, r, skip, ilen);
4591573Srgrimes
4601573Srgrimes	*lp = l;
4611573Srgrimes	*rp = r;
4621573Srgrimes	return (tp);
4631573Srgrimes}
4641573Srgrimes
4651573Srgrimes/*
4661573Srgrimes * BT_RROOT -- Fix up the recno root page after it has been split.
4671573Srgrimes *
4681573Srgrimes * Parameters:
4691573Srgrimes *	t:	tree
4701573Srgrimes *	h:	root page
4711573Srgrimes *	l:	left page
4721573Srgrimes *	r:	right page
4731573Srgrimes *
4741573Srgrimes * Returns:
4751573Srgrimes *	RET_ERROR, RET_SUCCESS
4761573Srgrimes */
4771573Srgrimesstatic int
478189291Sdelphijbt_rroot(BTREE *t, PAGE *h, PAGE *l, PAGE *r)
4791573Srgrimes{
4801573Srgrimes	char *dest;
4811573Srgrimes
4821573Srgrimes	/* Insert the left and right keys, set the header information. */
4831573Srgrimes	h->linp[0] = h->upper = t->bt_psize - NRINTERNAL;
4841573Srgrimes	dest = (char *)h + h->upper;
4851573Srgrimes	WR_RINTERNAL(dest,
4861573Srgrimes	    l->flags & P_RLEAF ? NEXTINDEX(l) : rec_total(l), l->pgno);
4871573Srgrimes
488223262Sbenl	__PAST_END(h->linp, 1) = h->upper -= NRINTERNAL;
4891573Srgrimes	dest = (char *)h + h->upper;
4901573Srgrimes	WR_RINTERNAL(dest,
4911573Srgrimes	    r->flags & P_RLEAF ? NEXTINDEX(r) : rec_total(r), r->pgno);
4921573Srgrimes
4931573Srgrimes	h->lower = BTDATAOFF + 2 * sizeof(indx_t);
4941573Srgrimes
4951573Srgrimes	/* Unpin the root page, set to recno internal page. */
4961573Srgrimes	h->flags &= ~P_TYPE;
4971573Srgrimes	h->flags |= P_RINTERNAL;
4981573Srgrimes	mpool_put(t->bt_mp, h, MPOOL_DIRTY);
4991573Srgrimes
5001573Srgrimes	return (RET_SUCCESS);
5011573Srgrimes}
5021573Srgrimes
5031573Srgrimes/*
5041573Srgrimes * BT_BROOT -- Fix up the btree root page after it has been split.
5051573Srgrimes *
5061573Srgrimes * Parameters:
5071573Srgrimes *	t:	tree
5081573Srgrimes *	h:	root page
5091573Srgrimes *	l:	left page
5101573Srgrimes *	r:	right page
5111573Srgrimes *
5121573Srgrimes * Returns:
5131573Srgrimes *	RET_ERROR, RET_SUCCESS
5141573Srgrimes */
5151573Srgrimesstatic int
516189291Sdelphijbt_broot(BTREE *t, PAGE *h, PAGE *l, PAGE *r)
5171573Srgrimes{
5181573Srgrimes	BINTERNAL *bi;
5191573Srgrimes	BLEAF *bl;
52014272Spst	u_int32_t nbytes;
5211573Srgrimes	char *dest;
5221573Srgrimes
5231573Srgrimes	/*
5241573Srgrimes	 * If the root page was a leaf page, change it into an internal page.
5251573Srgrimes	 * We copy the key we split on (but not the key's data, in the case of
5261573Srgrimes	 * a leaf page) to the new root page.
5271573Srgrimes	 *
5281573Srgrimes	 * The btree comparison code guarantees that the left-most key on any
5291573Srgrimes	 * level of the tree is never used, so it doesn't need to be filled in.
5301573Srgrimes	 */
5311573Srgrimes	nbytes = NBINTERNAL(0);
5321573Srgrimes	h->linp[0] = h->upper = t->bt_psize - nbytes;
5331573Srgrimes	dest = (char *)h + h->upper;
5341573Srgrimes	WR_BINTERNAL(dest, 0, l->pgno, 0);
5351573Srgrimes
53614272Spst	switch (h->flags & P_TYPE) {
5371573Srgrimes	case P_BLEAF:
5381573Srgrimes		bl = GETBLEAF(r, 0);
5391573Srgrimes		nbytes = NBINTERNAL(bl->ksize);
540223262Sbenl		__PAST_END(h->linp, 1) = h->upper -= nbytes;
5411573Srgrimes		dest = (char *)h + h->upper;
5421573Srgrimes		WR_BINTERNAL(dest, bl->ksize, r->pgno, 0);
5431573Srgrimes		memmove(dest, bl->bytes, bl->ksize);
5441573Srgrimes
5451573Srgrimes		/*
5461573Srgrimes		 * If the key is on an overflow page, mark the overflow chain
5471573Srgrimes		 * so it isn't deleted when the leaf copy of the key is deleted.
5481573Srgrimes		 */
549295973Spfg	if (bl->flags & P_BIGKEY) {
550295973Spfg			pgno_t pgno;
551295973Spfg			memcpy(&pgno, bl->bytes, sizeof(pgno));
552295973Spfg			if (bt_preserve(t, pgno) == RET_ERROR)
553295973Spfg				return (RET_ERROR);
554295973Spfg		}
5551573Srgrimes		break;
5561573Srgrimes	case P_BINTERNAL:
5571573Srgrimes		bi = GETBINTERNAL(r, 0);
5581573Srgrimes		nbytes = NBINTERNAL(bi->ksize);
559223262Sbenl		__PAST_END(h->linp, 1) = h->upper -= nbytes;
5601573Srgrimes		dest = (char *)h + h->upper;
5611573Srgrimes		memmove(dest, bi, nbytes);
5621573Srgrimes		((BINTERNAL *)dest)->pgno = r->pgno;
5631573Srgrimes		break;
5641573Srgrimes	default:
5651573Srgrimes		abort();
5661573Srgrimes	}
5671573Srgrimes
5681573Srgrimes	/* There are two keys on the page. */
5691573Srgrimes	h->lower = BTDATAOFF + 2 * sizeof(indx_t);
5701573Srgrimes
5711573Srgrimes	/* Unpin the root page, set to btree internal page. */
5721573Srgrimes	h->flags &= ~P_TYPE;
5731573Srgrimes	h->flags |= P_BINTERNAL;
5741573Srgrimes	mpool_put(t->bt_mp, h, MPOOL_DIRTY);
5751573Srgrimes
5761573Srgrimes	return (RET_SUCCESS);
5771573Srgrimes}
5781573Srgrimes
5791573Srgrimes/*
5801573Srgrimes * BT_PSPLIT -- Do the real work of splitting the page.
5811573Srgrimes *
5821573Srgrimes * Parameters:
5831573Srgrimes *	t:	tree
5841573Srgrimes *	h:	page to be split
5851573Srgrimes *	l:	page to put lower half of data
5861573Srgrimes *	r:	page to put upper half of data
5871573Srgrimes *	pskip:	pointer to index to leave open
5881573Srgrimes *	ilen:	insert length
5891573Srgrimes *
5901573Srgrimes * Returns:
5911573Srgrimes *	Pointer to page in which to insert.
5921573Srgrimes */
5931573Srgrimesstatic PAGE *
594189291Sdelphijbt_psplit(BTREE *t, PAGE *h, PAGE *l, PAGE *r, indx_t *pskip, size_t ilen)
5951573Srgrimes{
5961573Srgrimes	BINTERNAL *bi;
5971573Srgrimes	BLEAF *bl;
59814272Spst	CURSOR *c;
5991573Srgrimes	RLEAF *rl;
6001573Srgrimes	PAGE *rval;
6011573Srgrimes	void *src;
6021573Srgrimes	indx_t full, half, nxt, off, skip, top, used;
60314272Spst	u_int32_t nbytes;
6041573Srgrimes	int bigkeycnt, isbigkey;
6051573Srgrimes
6061573Srgrimes	/*
6071573Srgrimes	 * Split the data to the left and right pages.  Leave the skip index
6081573Srgrimes	 * open.  Additionally, make some effort not to split on an overflow
6091573Srgrimes	 * key.  This makes internal page processing faster and can save
6101573Srgrimes	 * space as overflow keys used by internal pages are never deleted.
6111573Srgrimes	 */
6121573Srgrimes	bigkeycnt = 0;
6131573Srgrimes	skip = *pskip;
6141573Srgrimes	full = t->bt_psize - BTDATAOFF;
6151573Srgrimes	half = full / 2;
6161573Srgrimes	used = 0;
6171573Srgrimes	for (nxt = off = 0, top = NEXTINDEX(h); nxt < top; ++off) {
6181573Srgrimes		if (skip == off) {
6191573Srgrimes			nbytes = ilen;
6201573Srgrimes			isbigkey = 0;		/* XXX: not really known. */
6211573Srgrimes		} else
6221573Srgrimes			switch (h->flags & P_TYPE) {
6231573Srgrimes			case P_BINTERNAL:
6241573Srgrimes				src = bi = GETBINTERNAL(h, nxt);
6251573Srgrimes				nbytes = NBINTERNAL(bi->ksize);
6261573Srgrimes				isbigkey = bi->flags & P_BIGKEY;
6271573Srgrimes				break;
6281573Srgrimes			case P_BLEAF:
6291573Srgrimes				src = bl = GETBLEAF(h, nxt);
6301573Srgrimes				nbytes = NBLEAF(bl);
6311573Srgrimes				isbigkey = bl->flags & P_BIGKEY;
6321573Srgrimes				break;
6331573Srgrimes			case P_RINTERNAL:
6341573Srgrimes				src = GETRINTERNAL(h, nxt);
6351573Srgrimes				nbytes = NRINTERNAL;
6361573Srgrimes				isbigkey = 0;
6371573Srgrimes				break;
6381573Srgrimes			case P_RLEAF:
6391573Srgrimes				src = rl = GETRLEAF(h, nxt);
6401573Srgrimes				nbytes = NRLEAF(rl);
6411573Srgrimes				isbigkey = 0;
6421573Srgrimes				break;
6431573Srgrimes			default:
6441573Srgrimes				abort();
6451573Srgrimes			}
6461573Srgrimes
6471573Srgrimes		/*
6481573Srgrimes		 * If the key/data pairs are substantial fractions of the max
6491573Srgrimes		 * possible size for the page, it's possible to get situations
6501573Srgrimes		 * where we decide to try and copy too much onto the left page.
6511573Srgrimes		 * Make sure that doesn't happen.
6521573Srgrimes		 */
653194803Sdelphij		if ((skip <= off && used + nbytes + sizeof(indx_t) >= full) ||
654194803Sdelphij		    nxt == top - 1) {
6551573Srgrimes			--off;
6561573Srgrimes			break;
6571573Srgrimes		}
6581573Srgrimes
6591573Srgrimes		/* Copy the key/data pair, if not the skipped index. */
6601573Srgrimes		if (skip != off) {
6611573Srgrimes			++nxt;
6621573Srgrimes
6631573Srgrimes			l->linp[off] = l->upper -= nbytes;
6641573Srgrimes			memmove((char *)l + l->upper, src, nbytes);
6651573Srgrimes		}
6661573Srgrimes
66737155Sguido		used += nbytes + sizeof(indx_t);
6681573Srgrimes		if (used >= half) {
6691573Srgrimes			if (!isbigkey || bigkeycnt == 3)
6701573Srgrimes				break;
6711573Srgrimes			else
6721573Srgrimes				++bigkeycnt;
6731573Srgrimes		}
6741573Srgrimes	}
6751573Srgrimes
6761573Srgrimes	/*
6771573Srgrimes	 * Off is the last offset that's valid for the left page.
6781573Srgrimes	 * Nxt is the first offset to be placed on the right page.
6791573Srgrimes	 */
6801573Srgrimes	l->lower += (off + 1) * sizeof(indx_t);
6811573Srgrimes
6821573Srgrimes	/*
6831573Srgrimes	 * If splitting the page that the cursor was on, the cursor has to be
6841573Srgrimes	 * adjusted to point to the same record as before the split.  If the
6851573Srgrimes	 * cursor is at or past the skipped slot, the cursor is incremented by
6861573Srgrimes	 * one.  If the cursor is on the right page, it is decremented by the
6871573Srgrimes	 * number of records split to the left page.
6881573Srgrimes	 */
68914272Spst	c = &t->bt_cursor;
69014272Spst	if (F_ISSET(c, CURS_INIT) && c->pg.pgno == h->pgno) {
69114272Spst		if (c->pg.index >= skip)
69214272Spst			++c->pg.index;
69314272Spst		if (c->pg.index < nxt)			/* Left page. */
69414272Spst			c->pg.pgno = l->pgno;
6951573Srgrimes		else {					/* Right page. */
69614272Spst			c->pg.pgno = r->pgno;
69714272Spst			c->pg.index -= nxt;
6981573Srgrimes		}
6991573Srgrimes	}
7001573Srgrimes
7011573Srgrimes	/*
7021573Srgrimes	 * If the skipped index was on the left page, just return that page.
7031573Srgrimes	 * Otherwise, adjust the skip index to reflect the new position on
7041573Srgrimes	 * the right page.
7051573Srgrimes	 */
7061573Srgrimes	if (skip <= off) {
707135178Skuriyama		skip = MAX_PAGE_OFFSET;
7081573Srgrimes		rval = l;
7091573Srgrimes	} else {
7101573Srgrimes		rval = r;
7111573Srgrimes		*pskip -= nxt;
7121573Srgrimes	}
7131573Srgrimes
7141573Srgrimes	for (off = 0; nxt < top; ++off) {
7151573Srgrimes		if (skip == nxt) {
7161573Srgrimes			++off;
717135178Skuriyama			skip = MAX_PAGE_OFFSET;
7181573Srgrimes		}
7191573Srgrimes		switch (h->flags & P_TYPE) {
7201573Srgrimes		case P_BINTERNAL:
7211573Srgrimes			src = bi = GETBINTERNAL(h, nxt);
7221573Srgrimes			nbytes = NBINTERNAL(bi->ksize);
7231573Srgrimes			break;
7241573Srgrimes		case P_BLEAF:
7251573Srgrimes			src = bl = GETBLEAF(h, nxt);
7261573Srgrimes			nbytes = NBLEAF(bl);
7271573Srgrimes			break;
7281573Srgrimes		case P_RINTERNAL:
7291573Srgrimes			src = GETRINTERNAL(h, nxt);
7301573Srgrimes			nbytes = NRINTERNAL;
7311573Srgrimes			break;
7321573Srgrimes		case P_RLEAF:
7331573Srgrimes			src = rl = GETRLEAF(h, nxt);
7341573Srgrimes			nbytes = NRLEAF(rl);
7351573Srgrimes			break;
7361573Srgrimes		default:
7371573Srgrimes			abort();
7381573Srgrimes		}
7391573Srgrimes		++nxt;
7401573Srgrimes		r->linp[off] = r->upper -= nbytes;
7411573Srgrimes		memmove((char *)r + r->upper, src, nbytes);
7421573Srgrimes	}
7431573Srgrimes	r->lower += off * sizeof(indx_t);
7441573Srgrimes
7451573Srgrimes	/* If the key is being appended to the page, adjust the index. */
7461573Srgrimes	if (skip == top)
7471573Srgrimes		r->lower += sizeof(indx_t);
7481573Srgrimes
7491573Srgrimes	return (rval);
7501573Srgrimes}
7511573Srgrimes
7521573Srgrimes/*
7531573Srgrimes * BT_PRESERVE -- Mark a chain of pages as used by an internal node.
7541573Srgrimes *
7551573Srgrimes * Chains of indirect blocks pointed to by leaf nodes get reclaimed when the
7561573Srgrimes * record that references them gets deleted.  Chains pointed to by internal
7571573Srgrimes * pages never get deleted.  This routine marks a chain as pointed to by an
7581573Srgrimes * internal page.
7591573Srgrimes *
7601573Srgrimes * Parameters:
7611573Srgrimes *	t:	tree
7621573Srgrimes *	pg:	page number of first page in the chain.
7631573Srgrimes *
7641573Srgrimes * Returns:
7651573Srgrimes *	RET_SUCCESS, RET_ERROR.
7661573Srgrimes */
7671573Srgrimesstatic int
768189291Sdelphijbt_preserve(BTREE *t, pgno_t pg)
7691573Srgrimes{
7701573Srgrimes	PAGE *h;
7711573Srgrimes
7721573Srgrimes	if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
7731573Srgrimes		return (RET_ERROR);
7741573Srgrimes	h->flags |= P_PRESERVE;
7751573Srgrimes	mpool_put(t->bt_mp, h, MPOOL_DIRTY);
7761573Srgrimes	return (RET_SUCCESS);
7771573Srgrimes}
7781573Srgrimes
7791573Srgrimes/*
7801573Srgrimes * REC_TOTAL -- Return the number of recno entries below a page.
7811573Srgrimes *
7821573Srgrimes * Parameters:
7831573Srgrimes *	h:	page
7841573Srgrimes *
7851573Srgrimes * Returns:
7861573Srgrimes *	The number of recno entries below a page.
7871573Srgrimes *
7881573Srgrimes * XXX
7891573Srgrimes * These values could be set by the bt_psplit routine.  The problem is that the
7901573Srgrimes * entry has to be popped off of the stack etc. or the values have to be passed
7911573Srgrimes * all the way back to bt_split/bt_rroot and it's not very clean.
7921573Srgrimes */
7931573Srgrimesstatic recno_t
794189291Sdelphijrec_total(PAGE *h)
7951573Srgrimes{
7961573Srgrimes	recno_t recs;
7971573Srgrimes	indx_t nxt, top;
7981573Srgrimes
7991573Srgrimes	for (recs = 0, nxt = 0, top = NEXTINDEX(h); nxt < top; ++nxt)
8001573Srgrimes		recs += GETRINTERNAL(h, nxt)->nrecs;
8011573Srgrimes	return (recs);
8021573Srgrimes}
803