11573Srgrimes/*- 214272Spst * Copyright (c) 1990, 1993, 1994 31573Srgrimes * The Regents of the University of California. All rights reserved. 41573Srgrimes * 51573Srgrimes * This code is derived from software contributed to Berkeley by 61573Srgrimes * Mike Olson. 71573Srgrimes * 81573Srgrimes * Redistribution and use in source and binary forms, with or without 91573Srgrimes * modification, are permitted provided that the following conditions 101573Srgrimes * are met: 111573Srgrimes * 1. Redistributions of source code must retain the above copyright 121573Srgrimes * notice, this list of conditions and the following disclaimer. 131573Srgrimes * 2. Redistributions in binary form must reproduce the above copyright 141573Srgrimes * notice, this list of conditions and the following disclaimer in the 151573Srgrimes * documentation and/or other materials provided with the distribution. 161573Srgrimes * 4. Neither the name of the University nor the names of its contributors 171573Srgrimes * may be used to endorse or promote products derived from this software 181573Srgrimes * without specific prior written permission. 191573Srgrimes * 201573Srgrimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 211573Srgrimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 221573Srgrimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 231573Srgrimes * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 241573Srgrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 251573Srgrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 261573Srgrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 271573Srgrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 281573Srgrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 291573Srgrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 301573Srgrimes * SUCH DAMAGE. 311573Srgrimes */ 321573Srgrimes 331573Srgrimes#if defined(LIBC_SCCS) && !defined(lint) 34194804Sdelphijstatic char sccsid[] = "@(#)bt_split.c 8.10 (Berkeley) 1/9/95"; 351573Srgrimes#endif /* LIBC_SCCS and not lint */ 3692905Sobrien#include <sys/cdefs.h> 3792905Sobrien__FBSDID("$FreeBSD$"); 381573Srgrimes 39223262Sbenl#include <sys/param.h> 401573Srgrimes 411573Srgrimes#include <limits.h> 421573Srgrimes#include <stdio.h> 431573Srgrimes#include <stdlib.h> 441573Srgrimes#include <string.h> 451573Srgrimes 461573Srgrimes#include <db.h> 471573Srgrimes#include "btree.h" 481573Srgrimes 4992905Sobrienstatic int bt_broot(BTREE *, PAGE *, PAGE *, PAGE *); 50189291Sdelphijstatic PAGE *bt_page(BTREE *, PAGE *, PAGE **, PAGE **, indx_t *, size_t); 5192905Sobrienstatic int bt_preserve(BTREE *, pgno_t); 52189291Sdelphijstatic PAGE *bt_psplit(BTREE *, PAGE *, PAGE *, PAGE *, indx_t *, size_t); 53189291Sdelphijstatic PAGE *bt_root(BTREE *, PAGE *, PAGE **, PAGE **, indx_t *, size_t); 5492905Sobrienstatic int bt_rroot(BTREE *, PAGE *, PAGE *, PAGE *); 5592905Sobrienstatic recno_t rec_total(PAGE *); 561573Srgrimes 571573Srgrimes#ifdef STATISTICS 581573Srgrimesu_long bt_rootsplit, bt_split, bt_sortsplit, bt_pfxsaved; 591573Srgrimes#endif 601573Srgrimes 611573Srgrimes/* 621573Srgrimes * __BT_SPLIT -- Split the tree. 631573Srgrimes * 641573Srgrimes * Parameters: 651573Srgrimes * t: tree 661573Srgrimes * sp: page to split 671573Srgrimes * key: key to insert 681573Srgrimes * data: data to insert 691573Srgrimes * flags: BIGKEY/BIGDATA flags 701573Srgrimes * ilen: insert length 711573Srgrimes * skip: index to leave open 721573Srgrimes * 731573Srgrimes * Returns: 741573Srgrimes * RET_ERROR, RET_SUCCESS 751573Srgrimes */ 761573Srgrimesint 77189291Sdelphij__bt_split(BTREE *t, PAGE *sp, const DBT *key, const DBT *data, int flags, 78189291Sdelphij size_t ilen, u_int32_t argskip) 791573Srgrimes{ 801573Srgrimes BINTERNAL *bi; 811573Srgrimes BLEAF *bl, *tbl; 821573Srgrimes DBT a, b; 831573Srgrimes EPGNO *parent; 841573Srgrimes PAGE *h, *l, *r, *lchild, *rchild; 851573Srgrimes indx_t nxtindex; 8614272Spst u_int16_t skip; 8714272Spst u_int32_t n, nbytes, nksize; 881573Srgrimes int parentsplit; 891573Srgrimes char *dest; 901573Srgrimes 911573Srgrimes /* 921573Srgrimes * Split the page into two pages, l and r. The split routines return 931573Srgrimes * a pointer to the page into which the key should be inserted and with 941573Srgrimes * skip set to the offset which should be used. Additionally, l and r 951573Srgrimes * are pinned. 961573Srgrimes */ 9714272Spst skip = argskip; 981573Srgrimes h = sp->pgno == P_ROOT ? 991573Srgrimes bt_root(t, sp, &l, &r, &skip, ilen) : 1001573Srgrimes bt_page(t, sp, &l, &r, &skip, ilen); 1011573Srgrimes if (h == NULL) 1021573Srgrimes return (RET_ERROR); 1031573Srgrimes 1041573Srgrimes /* 1051573Srgrimes * Insert the new key/data pair into the leaf page. (Key inserts 1061573Srgrimes * always cause a leaf page to split first.) 1071573Srgrimes */ 1081573Srgrimes h->linp[skip] = h->upper -= ilen; 1091573Srgrimes dest = (char *)h + h->upper; 11014272Spst if (F_ISSET(t, R_RECNO)) 1111573Srgrimes WR_RLEAF(dest, data, flags) 1121573Srgrimes else 1131573Srgrimes WR_BLEAF(dest, key, data, flags) 1141573Srgrimes 1151573Srgrimes /* If the root page was split, make it look right. */ 1161573Srgrimes if (sp->pgno == P_ROOT && 11714272Spst (F_ISSET(t, R_RECNO) ? 1181573Srgrimes bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR) 1191573Srgrimes goto err2; 1201573Srgrimes 1211573Srgrimes /* 1221573Srgrimes * Now we walk the parent page stack -- a LIFO stack of the pages that 1231573Srgrimes * were traversed when we searched for the page that split. Each stack 1241573Srgrimes * entry is a page number and a page index offset. The offset is for 1251573Srgrimes * the page traversed on the search. We've just split a page, so we 1261573Srgrimes * have to insert a new key into the parent page. 1271573Srgrimes * 1281573Srgrimes * If the insert into the parent page causes it to split, may have to 1291573Srgrimes * continue splitting all the way up the tree. We stop if the root 1301573Srgrimes * splits or the page inserted into didn't have to split to hold the 1311573Srgrimes * new key. Some algorithms replace the key for the old page as well 1321573Srgrimes * as the new page. We don't, as there's no reason to believe that the 1331573Srgrimes * first key on the old page is any better than the key we have, and, 1341573Srgrimes * in the case of a key being placed at index 0 causing the split, the 1351573Srgrimes * key is unavailable. 1361573Srgrimes * 1371573Srgrimes * There are a maximum of 5 pages pinned at any time. We keep the left 1381573Srgrimes * and right pages pinned while working on the parent. The 5 are the 1391573Srgrimes * two children, left parent and right parent (when the parent splits) 1401573Srgrimes * and the root page or the overflow key page when calling bt_preserve. 1411573Srgrimes * This code must make sure that all pins are released other than the 1421573Srgrimes * root page or overflow page which is unlocked elsewhere. 1431573Srgrimes */ 1441573Srgrimes while ((parent = BT_POP(t)) != NULL) { 1451573Srgrimes lchild = l; 1461573Srgrimes rchild = r; 1471573Srgrimes 1481573Srgrimes /* Get the parent page. */ 1491573Srgrimes if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL) 1501573Srgrimes goto err2; 1511573Srgrimes 152189327Sdelphij /* 1531573Srgrimes * The new key goes ONE AFTER the index, because the split 1541573Srgrimes * was to the right. 1551573Srgrimes */ 1561573Srgrimes skip = parent->index + 1; 1571573Srgrimes 1581573Srgrimes /* 1591573Srgrimes * Calculate the space needed on the parent page. 1601573Srgrimes * 1611573Srgrimes * Prefix trees: space hack when inserting into BINTERNAL 1621573Srgrimes * pages. Retain only what's needed to distinguish between 1631573Srgrimes * the new entry and the LAST entry on the page to its left. 1641573Srgrimes * If the keys compare equal, retain the entire key. Note, 1651573Srgrimes * we don't touch overflow keys, and the entire key must be 1661573Srgrimes * retained for the next-to-left most key on the leftmost 1671573Srgrimes * page of each level, or the search will fail. Applicable 1681573Srgrimes * ONLY to internal pages that have leaf pages as children. 1691573Srgrimes * Further reduction of the key between pairs of internal 1701573Srgrimes * pages loses too much information. 1711573Srgrimes */ 1721573Srgrimes switch (rchild->flags & P_TYPE) { 1731573Srgrimes case P_BINTERNAL: 1741573Srgrimes bi = GETBINTERNAL(rchild, 0); 1751573Srgrimes nbytes = NBINTERNAL(bi->ksize); 1761573Srgrimes break; 1771573Srgrimes case P_BLEAF: 1781573Srgrimes bl = GETBLEAF(rchild, 0); 1791573Srgrimes nbytes = NBINTERNAL(bl->ksize); 1801573Srgrimes if (t->bt_pfx && !(bl->flags & P_BIGKEY) && 1811573Srgrimes (h->prevpg != P_INVALID || skip > 1)) { 1821573Srgrimes tbl = GETBLEAF(lchild, NEXTINDEX(lchild) - 1); 1831573Srgrimes a.size = tbl->ksize; 1841573Srgrimes a.data = tbl->bytes; 1851573Srgrimes b.size = bl->ksize; 1861573Srgrimes b.data = bl->bytes; 1871573Srgrimes nksize = t->bt_pfx(&a, &b); 1881573Srgrimes n = NBINTERNAL(nksize); 1891573Srgrimes if (n < nbytes) { 1901573Srgrimes#ifdef STATISTICS 1911573Srgrimes bt_pfxsaved += nbytes - n; 1921573Srgrimes#endif 1931573Srgrimes nbytes = n; 1941573Srgrimes } else 1951573Srgrimes nksize = 0; 1961573Srgrimes } else 1971573Srgrimes nksize = 0; 1981573Srgrimes break; 1991573Srgrimes case P_RINTERNAL: 2001573Srgrimes case P_RLEAF: 2011573Srgrimes nbytes = NRINTERNAL; 2021573Srgrimes break; 2031573Srgrimes default: 2041573Srgrimes abort(); 2051573Srgrimes } 2061573Srgrimes 2071573Srgrimes /* Split the parent page if necessary or shift the indices. */ 208190484Sdelphij if ((u_int32_t)(h->upper - h->lower) < nbytes + sizeof(indx_t)) { 2091573Srgrimes sp = h; 2101573Srgrimes h = h->pgno == P_ROOT ? 2111573Srgrimes bt_root(t, h, &l, &r, &skip, nbytes) : 2121573Srgrimes bt_page(t, h, &l, &r, &skip, nbytes); 2131573Srgrimes if (h == NULL) 2141573Srgrimes goto err1; 2151573Srgrimes parentsplit = 1; 2161573Srgrimes } else { 2171573Srgrimes if (skip < (nxtindex = NEXTINDEX(h))) 2181573Srgrimes memmove(h->linp + skip + 1, h->linp + skip, 2191573Srgrimes (nxtindex - skip) * sizeof(indx_t)); 2201573Srgrimes h->lower += sizeof(indx_t); 2211573Srgrimes parentsplit = 0; 2221573Srgrimes } 2231573Srgrimes 2241573Srgrimes /* Insert the key into the parent page. */ 22514272Spst switch (rchild->flags & P_TYPE) { 2261573Srgrimes case P_BINTERNAL: 2271573Srgrimes h->linp[skip] = h->upper -= nbytes; 2281573Srgrimes dest = (char *)h + h->linp[skip]; 2291573Srgrimes memmove(dest, bi, nbytes); 2301573Srgrimes ((BINTERNAL *)dest)->pgno = rchild->pgno; 2311573Srgrimes break; 2321573Srgrimes case P_BLEAF: 2331573Srgrimes h->linp[skip] = h->upper -= nbytes; 2341573Srgrimes dest = (char *)h + h->linp[skip]; 2351573Srgrimes WR_BINTERNAL(dest, nksize ? nksize : bl->ksize, 2361573Srgrimes rchild->pgno, bl->flags & P_BIGKEY); 2371573Srgrimes memmove(dest, bl->bytes, nksize ? nksize : bl->ksize); 238295973Spfg if (bl->flags & P_BIGKEY) { 239295973Spfg pgno_t pgno; 240295973Spfg memcpy(&pgno, bl->bytes, sizeof(pgno)); 241295973Spfg if (bt_preserve(t, pgno) == RET_ERROR) 242295973Spfg goto err1; 243295973Spfg } 2441573Srgrimes break; 2451573Srgrimes case P_RINTERNAL: 2461573Srgrimes /* 2471573Srgrimes * Update the left page count. If split 2481573Srgrimes * added at index 0, fix the correct page. 2491573Srgrimes */ 2501573Srgrimes if (skip > 0) 2511573Srgrimes dest = (char *)h + h->linp[skip - 1]; 2521573Srgrimes else 2531573Srgrimes dest = (char *)l + l->linp[NEXTINDEX(l) - 1]; 2541573Srgrimes ((RINTERNAL *)dest)->nrecs = rec_total(lchild); 2551573Srgrimes ((RINTERNAL *)dest)->pgno = lchild->pgno; 2561573Srgrimes 2571573Srgrimes /* Update the right page count. */ 2581573Srgrimes h->linp[skip] = h->upper -= nbytes; 2591573Srgrimes dest = (char *)h + h->linp[skip]; 2601573Srgrimes ((RINTERNAL *)dest)->nrecs = rec_total(rchild); 2611573Srgrimes ((RINTERNAL *)dest)->pgno = rchild->pgno; 2621573Srgrimes break; 2631573Srgrimes case P_RLEAF: 2641573Srgrimes /* 2651573Srgrimes * Update the left page count. If split 2661573Srgrimes * added at index 0, fix the correct page. 2671573Srgrimes */ 2681573Srgrimes if (skip > 0) 2691573Srgrimes dest = (char *)h + h->linp[skip - 1]; 2701573Srgrimes else 2711573Srgrimes dest = (char *)l + l->linp[NEXTINDEX(l) - 1]; 2721573Srgrimes ((RINTERNAL *)dest)->nrecs = NEXTINDEX(lchild); 2731573Srgrimes ((RINTERNAL *)dest)->pgno = lchild->pgno; 2741573Srgrimes 2751573Srgrimes /* Update the right page count. */ 2761573Srgrimes h->linp[skip] = h->upper -= nbytes; 2771573Srgrimes dest = (char *)h + h->linp[skip]; 2781573Srgrimes ((RINTERNAL *)dest)->nrecs = NEXTINDEX(rchild); 2791573Srgrimes ((RINTERNAL *)dest)->pgno = rchild->pgno; 2801573Srgrimes break; 2811573Srgrimes default: 2821573Srgrimes abort(); 2831573Srgrimes } 2841573Srgrimes 2851573Srgrimes /* Unpin the held pages. */ 2861573Srgrimes if (!parentsplit) { 2871573Srgrimes mpool_put(t->bt_mp, h, MPOOL_DIRTY); 2881573Srgrimes break; 2891573Srgrimes } 2901573Srgrimes 2911573Srgrimes /* If the root page was split, make it look right. */ 2921573Srgrimes if (sp->pgno == P_ROOT && 29314272Spst (F_ISSET(t, R_RECNO) ? 2941573Srgrimes bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR) 2951573Srgrimes goto err1; 2961573Srgrimes 2971573Srgrimes mpool_put(t->bt_mp, lchild, MPOOL_DIRTY); 2981573Srgrimes mpool_put(t->bt_mp, rchild, MPOOL_DIRTY); 2991573Srgrimes } 3001573Srgrimes 3011573Srgrimes /* Unpin the held pages. */ 3021573Srgrimes mpool_put(t->bt_mp, l, MPOOL_DIRTY); 3031573Srgrimes mpool_put(t->bt_mp, r, MPOOL_DIRTY); 3041573Srgrimes 3051573Srgrimes /* Clear any pages left on the stack. */ 3061573Srgrimes return (RET_SUCCESS); 3071573Srgrimes 3081573Srgrimes /* 3091573Srgrimes * If something fails in the above loop we were already walking back 3101573Srgrimes * up the tree and the tree is now inconsistent. Nothing much we can 3111573Srgrimes * do about it but release any memory we're holding. 3121573Srgrimes */ 3131573Srgrimeserr1: mpool_put(t->bt_mp, lchild, MPOOL_DIRTY); 3141573Srgrimes mpool_put(t->bt_mp, rchild, MPOOL_DIRTY); 3151573Srgrimes 3161573Srgrimeserr2: mpool_put(t->bt_mp, l, 0); 3171573Srgrimes mpool_put(t->bt_mp, r, 0); 3181573Srgrimes __dbpanic(t->bt_dbp); 3191573Srgrimes return (RET_ERROR); 3201573Srgrimes} 3211573Srgrimes 3221573Srgrimes/* 3231573Srgrimes * BT_PAGE -- Split a non-root page of a btree. 3241573Srgrimes * 3251573Srgrimes * Parameters: 3261573Srgrimes * t: tree 3271573Srgrimes * h: root page 3281573Srgrimes * lp: pointer to left page pointer 3291573Srgrimes * rp: pointer to right page pointer 3301573Srgrimes * skip: pointer to index to leave open 3311573Srgrimes * ilen: insert length 3321573Srgrimes * 3331573Srgrimes * Returns: 3341573Srgrimes * Pointer to page in which to insert or NULL on error. 3351573Srgrimes */ 3361573Srgrimesstatic PAGE * 337189291Sdelphijbt_page(BTREE *t, PAGE *h, PAGE **lp, PAGE **rp, indx_t *skip, size_t ilen) 3381573Srgrimes{ 3391573Srgrimes PAGE *l, *r, *tp; 3401573Srgrimes pgno_t npg; 3411573Srgrimes 3421573Srgrimes#ifdef STATISTICS 3431573Srgrimes ++bt_split; 3441573Srgrimes#endif 3451573Srgrimes /* Put the new right page for the split into place. */ 3461573Srgrimes if ((r = __bt_new(t, &npg)) == NULL) 3471573Srgrimes return (NULL); 3481573Srgrimes r->pgno = npg; 3491573Srgrimes r->lower = BTDATAOFF; 3501573Srgrimes r->upper = t->bt_psize; 3511573Srgrimes r->nextpg = h->nextpg; 3521573Srgrimes r->prevpg = h->pgno; 3531573Srgrimes r->flags = h->flags & P_TYPE; 3541573Srgrimes 3551573Srgrimes /* 3561573Srgrimes * If we're splitting the last page on a level because we're appending 3571573Srgrimes * a key to it (skip is NEXTINDEX()), it's likely that the data is 3581573Srgrimes * sorted. Adding an empty page on the side of the level is less work 3591573Srgrimes * and can push the fill factor much higher than normal. If we're 3601573Srgrimes * wrong it's no big deal, we'll just do the split the right way next 3611573Srgrimes * time. It may look like it's equally easy to do a similar hack for 3621573Srgrimes * reverse sorted data, that is, split the tree left, but it's not. 3631573Srgrimes * Don't even try. 3641573Srgrimes */ 3651573Srgrimes if (h->nextpg == P_INVALID && *skip == NEXTINDEX(h)) { 3661573Srgrimes#ifdef STATISTICS 3671573Srgrimes ++bt_sortsplit; 3681573Srgrimes#endif 3691573Srgrimes h->nextpg = r->pgno; 3701573Srgrimes r->lower = BTDATAOFF + sizeof(indx_t); 3711573Srgrimes *skip = 0; 3721573Srgrimes *lp = h; 3731573Srgrimes *rp = r; 3741573Srgrimes return (r); 3751573Srgrimes } 3761573Srgrimes 3771573Srgrimes /* Put the new left page for the split into place. */ 378190482Sdelphij if ((l = (PAGE *)calloc(1, t->bt_psize)) == NULL) { 3791573Srgrimes mpool_put(t->bt_mp, r, 0); 3801573Srgrimes return (NULL); 3811573Srgrimes } 3821573Srgrimes l->pgno = h->pgno; 3831573Srgrimes l->nextpg = r->pgno; 3841573Srgrimes l->prevpg = h->prevpg; 3851573Srgrimes l->lower = BTDATAOFF; 3861573Srgrimes l->upper = t->bt_psize; 3871573Srgrimes l->flags = h->flags & P_TYPE; 3881573Srgrimes 3891573Srgrimes /* Fix up the previous pointer of the page after the split page. */ 3901573Srgrimes if (h->nextpg != P_INVALID) { 3911573Srgrimes if ((tp = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL) { 3921573Srgrimes free(l); 3931573Srgrimes /* XXX mpool_free(t->bt_mp, r->pgno); */ 3941573Srgrimes return (NULL); 3951573Srgrimes } 3961573Srgrimes tp->prevpg = r->pgno; 39714272Spst mpool_put(t->bt_mp, tp, MPOOL_DIRTY); 3981573Srgrimes } 3991573Srgrimes 4001573Srgrimes /* 4011573Srgrimes * Split right. The key/data pairs aren't sorted in the btree page so 4021573Srgrimes * it's simpler to copy the data from the split page onto two new pages 4031573Srgrimes * instead of copying half the data to the right page and compacting 4041573Srgrimes * the left page in place. Since the left page can't change, we have 4051573Srgrimes * to swap the original and the allocated left page after the split. 4061573Srgrimes */ 4071573Srgrimes tp = bt_psplit(t, h, l, r, skip, ilen); 4081573Srgrimes 4091573Srgrimes /* Move the new left page onto the old left page. */ 4101573Srgrimes memmove(h, l, t->bt_psize); 4111573Srgrimes if (tp == l) 4121573Srgrimes tp = h; 4131573Srgrimes free(l); 4141573Srgrimes 4151573Srgrimes *lp = h; 4161573Srgrimes *rp = r; 4171573Srgrimes return (tp); 4181573Srgrimes} 4191573Srgrimes 4201573Srgrimes/* 4211573Srgrimes * BT_ROOT -- Split the root page of a btree. 4221573Srgrimes * 4231573Srgrimes * Parameters: 4241573Srgrimes * t: tree 4251573Srgrimes * h: root page 4261573Srgrimes * lp: pointer to left page pointer 4271573Srgrimes * rp: pointer to right page pointer 4281573Srgrimes * skip: pointer to index to leave open 4291573Srgrimes * ilen: insert length 4301573Srgrimes * 4311573Srgrimes * Returns: 4321573Srgrimes * Pointer to page in which to insert or NULL on error. 4331573Srgrimes */ 4341573Srgrimesstatic PAGE * 435189291Sdelphijbt_root(BTREE *t, PAGE *h, PAGE **lp, PAGE **rp, indx_t *skip, size_t ilen) 4361573Srgrimes{ 4371573Srgrimes PAGE *l, *r, *tp; 4381573Srgrimes pgno_t lnpg, rnpg; 4391573Srgrimes 4401573Srgrimes#ifdef STATISTICS 4411573Srgrimes ++bt_split; 4421573Srgrimes ++bt_rootsplit; 4431573Srgrimes#endif 4441573Srgrimes /* Put the new left and right pages for the split into place. */ 4451573Srgrimes if ((l = __bt_new(t, &lnpg)) == NULL || 4461573Srgrimes (r = __bt_new(t, &rnpg)) == NULL) 4471573Srgrimes return (NULL); 4481573Srgrimes l->pgno = lnpg; 4491573Srgrimes r->pgno = rnpg; 4501573Srgrimes l->nextpg = r->pgno; 4511573Srgrimes r->prevpg = l->pgno; 4521573Srgrimes l->prevpg = r->nextpg = P_INVALID; 4531573Srgrimes l->lower = r->lower = BTDATAOFF; 4541573Srgrimes l->upper = r->upper = t->bt_psize; 4551573Srgrimes l->flags = r->flags = h->flags & P_TYPE; 4561573Srgrimes 4571573Srgrimes /* Split the root page. */ 4581573Srgrimes tp = bt_psplit(t, h, l, r, skip, ilen); 4591573Srgrimes 4601573Srgrimes *lp = l; 4611573Srgrimes *rp = r; 4621573Srgrimes return (tp); 4631573Srgrimes} 4641573Srgrimes 4651573Srgrimes/* 4661573Srgrimes * BT_RROOT -- Fix up the recno root page after it has been split. 4671573Srgrimes * 4681573Srgrimes * Parameters: 4691573Srgrimes * t: tree 4701573Srgrimes * h: root page 4711573Srgrimes * l: left page 4721573Srgrimes * r: right page 4731573Srgrimes * 4741573Srgrimes * Returns: 4751573Srgrimes * RET_ERROR, RET_SUCCESS 4761573Srgrimes */ 4771573Srgrimesstatic int 478189291Sdelphijbt_rroot(BTREE *t, PAGE *h, PAGE *l, PAGE *r) 4791573Srgrimes{ 4801573Srgrimes char *dest; 4811573Srgrimes 4821573Srgrimes /* Insert the left and right keys, set the header information. */ 4831573Srgrimes h->linp[0] = h->upper = t->bt_psize - NRINTERNAL; 4841573Srgrimes dest = (char *)h + h->upper; 4851573Srgrimes WR_RINTERNAL(dest, 4861573Srgrimes l->flags & P_RLEAF ? NEXTINDEX(l) : rec_total(l), l->pgno); 4871573Srgrimes 488223262Sbenl __PAST_END(h->linp, 1) = h->upper -= NRINTERNAL; 4891573Srgrimes dest = (char *)h + h->upper; 4901573Srgrimes WR_RINTERNAL(dest, 4911573Srgrimes r->flags & P_RLEAF ? NEXTINDEX(r) : rec_total(r), r->pgno); 4921573Srgrimes 4931573Srgrimes h->lower = BTDATAOFF + 2 * sizeof(indx_t); 4941573Srgrimes 4951573Srgrimes /* Unpin the root page, set to recno internal page. */ 4961573Srgrimes h->flags &= ~P_TYPE; 4971573Srgrimes h->flags |= P_RINTERNAL; 4981573Srgrimes mpool_put(t->bt_mp, h, MPOOL_DIRTY); 4991573Srgrimes 5001573Srgrimes return (RET_SUCCESS); 5011573Srgrimes} 5021573Srgrimes 5031573Srgrimes/* 5041573Srgrimes * BT_BROOT -- Fix up the btree root page after it has been split. 5051573Srgrimes * 5061573Srgrimes * Parameters: 5071573Srgrimes * t: tree 5081573Srgrimes * h: root page 5091573Srgrimes * l: left page 5101573Srgrimes * r: right page 5111573Srgrimes * 5121573Srgrimes * Returns: 5131573Srgrimes * RET_ERROR, RET_SUCCESS 5141573Srgrimes */ 5151573Srgrimesstatic int 516189291Sdelphijbt_broot(BTREE *t, PAGE *h, PAGE *l, PAGE *r) 5171573Srgrimes{ 5181573Srgrimes BINTERNAL *bi; 5191573Srgrimes BLEAF *bl; 52014272Spst u_int32_t nbytes; 5211573Srgrimes char *dest; 5221573Srgrimes 5231573Srgrimes /* 5241573Srgrimes * If the root page was a leaf page, change it into an internal page. 5251573Srgrimes * We copy the key we split on (but not the key's data, in the case of 5261573Srgrimes * a leaf page) to the new root page. 5271573Srgrimes * 5281573Srgrimes * The btree comparison code guarantees that the left-most key on any 5291573Srgrimes * level of the tree is never used, so it doesn't need to be filled in. 5301573Srgrimes */ 5311573Srgrimes nbytes = NBINTERNAL(0); 5321573Srgrimes h->linp[0] = h->upper = t->bt_psize - nbytes; 5331573Srgrimes dest = (char *)h + h->upper; 5341573Srgrimes WR_BINTERNAL(dest, 0, l->pgno, 0); 5351573Srgrimes 53614272Spst switch (h->flags & P_TYPE) { 5371573Srgrimes case P_BLEAF: 5381573Srgrimes bl = GETBLEAF(r, 0); 5391573Srgrimes nbytes = NBINTERNAL(bl->ksize); 540223262Sbenl __PAST_END(h->linp, 1) = h->upper -= nbytes; 5411573Srgrimes dest = (char *)h + h->upper; 5421573Srgrimes WR_BINTERNAL(dest, bl->ksize, r->pgno, 0); 5431573Srgrimes memmove(dest, bl->bytes, bl->ksize); 5441573Srgrimes 5451573Srgrimes /* 5461573Srgrimes * If the key is on an overflow page, mark the overflow chain 5471573Srgrimes * so it isn't deleted when the leaf copy of the key is deleted. 5481573Srgrimes */ 549295973Spfg if (bl->flags & P_BIGKEY) { 550295973Spfg pgno_t pgno; 551295973Spfg memcpy(&pgno, bl->bytes, sizeof(pgno)); 552295973Spfg if (bt_preserve(t, pgno) == RET_ERROR) 553295973Spfg return (RET_ERROR); 554295973Spfg } 5551573Srgrimes break; 5561573Srgrimes case P_BINTERNAL: 5571573Srgrimes bi = GETBINTERNAL(r, 0); 5581573Srgrimes nbytes = NBINTERNAL(bi->ksize); 559223262Sbenl __PAST_END(h->linp, 1) = h->upper -= nbytes; 5601573Srgrimes dest = (char *)h + h->upper; 5611573Srgrimes memmove(dest, bi, nbytes); 5621573Srgrimes ((BINTERNAL *)dest)->pgno = r->pgno; 5631573Srgrimes break; 5641573Srgrimes default: 5651573Srgrimes abort(); 5661573Srgrimes } 5671573Srgrimes 5681573Srgrimes /* There are two keys on the page. */ 5691573Srgrimes h->lower = BTDATAOFF + 2 * sizeof(indx_t); 5701573Srgrimes 5711573Srgrimes /* Unpin the root page, set to btree internal page. */ 5721573Srgrimes h->flags &= ~P_TYPE; 5731573Srgrimes h->flags |= P_BINTERNAL; 5741573Srgrimes mpool_put(t->bt_mp, h, MPOOL_DIRTY); 5751573Srgrimes 5761573Srgrimes return (RET_SUCCESS); 5771573Srgrimes} 5781573Srgrimes 5791573Srgrimes/* 5801573Srgrimes * BT_PSPLIT -- Do the real work of splitting the page. 5811573Srgrimes * 5821573Srgrimes * Parameters: 5831573Srgrimes * t: tree 5841573Srgrimes * h: page to be split 5851573Srgrimes * l: page to put lower half of data 5861573Srgrimes * r: page to put upper half of data 5871573Srgrimes * pskip: pointer to index to leave open 5881573Srgrimes * ilen: insert length 5891573Srgrimes * 5901573Srgrimes * Returns: 5911573Srgrimes * Pointer to page in which to insert. 5921573Srgrimes */ 5931573Srgrimesstatic PAGE * 594189291Sdelphijbt_psplit(BTREE *t, PAGE *h, PAGE *l, PAGE *r, indx_t *pskip, size_t ilen) 5951573Srgrimes{ 5961573Srgrimes BINTERNAL *bi; 5971573Srgrimes BLEAF *bl; 59814272Spst CURSOR *c; 5991573Srgrimes RLEAF *rl; 6001573Srgrimes PAGE *rval; 6011573Srgrimes void *src; 6021573Srgrimes indx_t full, half, nxt, off, skip, top, used; 60314272Spst u_int32_t nbytes; 6041573Srgrimes int bigkeycnt, isbigkey; 6051573Srgrimes 6061573Srgrimes /* 6071573Srgrimes * Split the data to the left and right pages. Leave the skip index 6081573Srgrimes * open. Additionally, make some effort not to split on an overflow 6091573Srgrimes * key. This makes internal page processing faster and can save 6101573Srgrimes * space as overflow keys used by internal pages are never deleted. 6111573Srgrimes */ 6121573Srgrimes bigkeycnt = 0; 6131573Srgrimes skip = *pskip; 6141573Srgrimes full = t->bt_psize - BTDATAOFF; 6151573Srgrimes half = full / 2; 6161573Srgrimes used = 0; 6171573Srgrimes for (nxt = off = 0, top = NEXTINDEX(h); nxt < top; ++off) { 6181573Srgrimes if (skip == off) { 6191573Srgrimes nbytes = ilen; 6201573Srgrimes isbigkey = 0; /* XXX: not really known. */ 6211573Srgrimes } else 6221573Srgrimes switch (h->flags & P_TYPE) { 6231573Srgrimes case P_BINTERNAL: 6241573Srgrimes src = bi = GETBINTERNAL(h, nxt); 6251573Srgrimes nbytes = NBINTERNAL(bi->ksize); 6261573Srgrimes isbigkey = bi->flags & P_BIGKEY; 6271573Srgrimes break; 6281573Srgrimes case P_BLEAF: 6291573Srgrimes src = bl = GETBLEAF(h, nxt); 6301573Srgrimes nbytes = NBLEAF(bl); 6311573Srgrimes isbigkey = bl->flags & P_BIGKEY; 6321573Srgrimes break; 6331573Srgrimes case P_RINTERNAL: 6341573Srgrimes src = GETRINTERNAL(h, nxt); 6351573Srgrimes nbytes = NRINTERNAL; 6361573Srgrimes isbigkey = 0; 6371573Srgrimes break; 6381573Srgrimes case P_RLEAF: 6391573Srgrimes src = rl = GETRLEAF(h, nxt); 6401573Srgrimes nbytes = NRLEAF(rl); 6411573Srgrimes isbigkey = 0; 6421573Srgrimes break; 6431573Srgrimes default: 6441573Srgrimes abort(); 6451573Srgrimes } 6461573Srgrimes 6471573Srgrimes /* 6481573Srgrimes * If the key/data pairs are substantial fractions of the max 6491573Srgrimes * possible size for the page, it's possible to get situations 6501573Srgrimes * where we decide to try and copy too much onto the left page. 6511573Srgrimes * Make sure that doesn't happen. 6521573Srgrimes */ 653194803Sdelphij if ((skip <= off && used + nbytes + sizeof(indx_t) >= full) || 654194803Sdelphij nxt == top - 1) { 6551573Srgrimes --off; 6561573Srgrimes break; 6571573Srgrimes } 6581573Srgrimes 6591573Srgrimes /* Copy the key/data pair, if not the skipped index. */ 6601573Srgrimes if (skip != off) { 6611573Srgrimes ++nxt; 6621573Srgrimes 6631573Srgrimes l->linp[off] = l->upper -= nbytes; 6641573Srgrimes memmove((char *)l + l->upper, src, nbytes); 6651573Srgrimes } 6661573Srgrimes 66737155Sguido used += nbytes + sizeof(indx_t); 6681573Srgrimes if (used >= half) { 6691573Srgrimes if (!isbigkey || bigkeycnt == 3) 6701573Srgrimes break; 6711573Srgrimes else 6721573Srgrimes ++bigkeycnt; 6731573Srgrimes } 6741573Srgrimes } 6751573Srgrimes 6761573Srgrimes /* 6771573Srgrimes * Off is the last offset that's valid for the left page. 6781573Srgrimes * Nxt is the first offset to be placed on the right page. 6791573Srgrimes */ 6801573Srgrimes l->lower += (off + 1) * sizeof(indx_t); 6811573Srgrimes 6821573Srgrimes /* 6831573Srgrimes * If splitting the page that the cursor was on, the cursor has to be 6841573Srgrimes * adjusted to point to the same record as before the split. If the 6851573Srgrimes * cursor is at or past the skipped slot, the cursor is incremented by 6861573Srgrimes * one. If the cursor is on the right page, it is decremented by the 6871573Srgrimes * number of records split to the left page. 6881573Srgrimes */ 68914272Spst c = &t->bt_cursor; 69014272Spst if (F_ISSET(c, CURS_INIT) && c->pg.pgno == h->pgno) { 69114272Spst if (c->pg.index >= skip) 69214272Spst ++c->pg.index; 69314272Spst if (c->pg.index < nxt) /* Left page. */ 69414272Spst c->pg.pgno = l->pgno; 6951573Srgrimes else { /* Right page. */ 69614272Spst c->pg.pgno = r->pgno; 69714272Spst c->pg.index -= nxt; 6981573Srgrimes } 6991573Srgrimes } 7001573Srgrimes 7011573Srgrimes /* 7021573Srgrimes * If the skipped index was on the left page, just return that page. 7031573Srgrimes * Otherwise, adjust the skip index to reflect the new position on 7041573Srgrimes * the right page. 7051573Srgrimes */ 7061573Srgrimes if (skip <= off) { 707135178Skuriyama skip = MAX_PAGE_OFFSET; 7081573Srgrimes rval = l; 7091573Srgrimes } else { 7101573Srgrimes rval = r; 7111573Srgrimes *pskip -= nxt; 7121573Srgrimes } 7131573Srgrimes 7141573Srgrimes for (off = 0; nxt < top; ++off) { 7151573Srgrimes if (skip == nxt) { 7161573Srgrimes ++off; 717135178Skuriyama skip = MAX_PAGE_OFFSET; 7181573Srgrimes } 7191573Srgrimes switch (h->flags & P_TYPE) { 7201573Srgrimes case P_BINTERNAL: 7211573Srgrimes src = bi = GETBINTERNAL(h, nxt); 7221573Srgrimes nbytes = NBINTERNAL(bi->ksize); 7231573Srgrimes break; 7241573Srgrimes case P_BLEAF: 7251573Srgrimes src = bl = GETBLEAF(h, nxt); 7261573Srgrimes nbytes = NBLEAF(bl); 7271573Srgrimes break; 7281573Srgrimes case P_RINTERNAL: 7291573Srgrimes src = GETRINTERNAL(h, nxt); 7301573Srgrimes nbytes = NRINTERNAL; 7311573Srgrimes break; 7321573Srgrimes case P_RLEAF: 7331573Srgrimes src = rl = GETRLEAF(h, nxt); 7341573Srgrimes nbytes = NRLEAF(rl); 7351573Srgrimes break; 7361573Srgrimes default: 7371573Srgrimes abort(); 7381573Srgrimes } 7391573Srgrimes ++nxt; 7401573Srgrimes r->linp[off] = r->upper -= nbytes; 7411573Srgrimes memmove((char *)r + r->upper, src, nbytes); 7421573Srgrimes } 7431573Srgrimes r->lower += off * sizeof(indx_t); 7441573Srgrimes 7451573Srgrimes /* If the key is being appended to the page, adjust the index. */ 7461573Srgrimes if (skip == top) 7471573Srgrimes r->lower += sizeof(indx_t); 7481573Srgrimes 7491573Srgrimes return (rval); 7501573Srgrimes} 7511573Srgrimes 7521573Srgrimes/* 7531573Srgrimes * BT_PRESERVE -- Mark a chain of pages as used by an internal node. 7541573Srgrimes * 7551573Srgrimes * Chains of indirect blocks pointed to by leaf nodes get reclaimed when the 7561573Srgrimes * record that references them gets deleted. Chains pointed to by internal 7571573Srgrimes * pages never get deleted. This routine marks a chain as pointed to by an 7581573Srgrimes * internal page. 7591573Srgrimes * 7601573Srgrimes * Parameters: 7611573Srgrimes * t: tree 7621573Srgrimes * pg: page number of first page in the chain. 7631573Srgrimes * 7641573Srgrimes * Returns: 7651573Srgrimes * RET_SUCCESS, RET_ERROR. 7661573Srgrimes */ 7671573Srgrimesstatic int 768189291Sdelphijbt_preserve(BTREE *t, pgno_t pg) 7691573Srgrimes{ 7701573Srgrimes PAGE *h; 7711573Srgrimes 7721573Srgrimes if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) 7731573Srgrimes return (RET_ERROR); 7741573Srgrimes h->flags |= P_PRESERVE; 7751573Srgrimes mpool_put(t->bt_mp, h, MPOOL_DIRTY); 7761573Srgrimes return (RET_SUCCESS); 7771573Srgrimes} 7781573Srgrimes 7791573Srgrimes/* 7801573Srgrimes * REC_TOTAL -- Return the number of recno entries below a page. 7811573Srgrimes * 7821573Srgrimes * Parameters: 7831573Srgrimes * h: page 7841573Srgrimes * 7851573Srgrimes * Returns: 7861573Srgrimes * The number of recno entries below a page. 7871573Srgrimes * 7881573Srgrimes * XXX 7891573Srgrimes * These values could be set by the bt_psplit routine. The problem is that the 7901573Srgrimes * entry has to be popped off of the stack etc. or the values have to be passed 7911573Srgrimes * all the way back to bt_split/bt_rroot and it's not very clean. 7921573Srgrimes */ 7931573Srgrimesstatic recno_t 794189291Sdelphijrec_total(PAGE *h) 7951573Srgrimes{ 7961573Srgrimes recno_t recs; 7971573Srgrimes indx_t nxt, top; 7981573Srgrimes 7991573Srgrimes for (recs = 0, nxt = 0, top = NEXTINDEX(h); nxt < top; ++nxt) 8001573Srgrimes recs += GETRINTERNAL(h, nxt)->nrecs; 8011573Srgrimes return (recs); 8021573Srgrimes} 803