11573Srgrimes/*- 214272Spst * Copyright (c) 1990, 1993, 1994 31573Srgrimes * The Regents of the University of California. All rights reserved. 41573Srgrimes * 51573Srgrimes * This code is derived from software contributed to Berkeley by 61573Srgrimes * Mike Olson. 71573Srgrimes * 81573Srgrimes * Redistribution and use in source and binary forms, with or without 91573Srgrimes * modification, are permitted provided that the following conditions 101573Srgrimes * are met: 111573Srgrimes * 1. Redistributions of source code must retain the above copyright 121573Srgrimes * notice, this list of conditions and the following disclaimer. 131573Srgrimes * 2. Redistributions in binary form must reproduce the above copyright 141573Srgrimes * notice, this list of conditions and the following disclaimer in the 151573Srgrimes * documentation and/or other materials provided with the distribution. 161573Srgrimes * 4. Neither the name of the University nor the names of its contributors 171573Srgrimes * may be used to endorse or promote products derived from this software 181573Srgrimes * without specific prior written permission. 191573Srgrimes * 201573Srgrimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 211573Srgrimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 221573Srgrimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 231573Srgrimes * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 241573Srgrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 251573Srgrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 261573Srgrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 271573Srgrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 281573Srgrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 291573Srgrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 301573Srgrimes * SUCH DAMAGE. 311573Srgrimes */ 321573Srgrimes 331573Srgrimes#if defined(LIBC_SCCS) && !defined(lint) 34194804Sdelphijstatic char sccsid[] = "@(#)bt_split.c 8.10 (Berkeley) 1/9/95"; 351573Srgrimes#endif /* LIBC_SCCS and not lint */ 3692905Sobrien#include <sys/cdefs.h> 3792905Sobrien__FBSDID("$FreeBSD$"); 381573Srgrimes 391573Srgrimes#include <sys/types.h> 40223262Sbenl#include <sys/param.h> 411573Srgrimes 421573Srgrimes#include <limits.h> 431573Srgrimes#include <stdio.h> 441573Srgrimes#include <stdlib.h> 451573Srgrimes#include <string.h> 461573Srgrimes 471573Srgrimes#include <db.h> 481573Srgrimes#include "btree.h" 491573Srgrimes 5092905Sobrienstatic int bt_broot(BTREE *, PAGE *, PAGE *, PAGE *); 51189291Sdelphijstatic PAGE *bt_page(BTREE *, PAGE *, PAGE **, PAGE **, indx_t *, size_t); 5292905Sobrienstatic int bt_preserve(BTREE *, pgno_t); 53189291Sdelphijstatic PAGE *bt_psplit(BTREE *, PAGE *, PAGE *, PAGE *, indx_t *, size_t); 54189291Sdelphijstatic PAGE *bt_root(BTREE *, PAGE *, PAGE **, PAGE **, indx_t *, size_t); 5592905Sobrienstatic int bt_rroot(BTREE *, PAGE *, PAGE *, PAGE *); 5692905Sobrienstatic recno_t rec_total(PAGE *); 571573Srgrimes 581573Srgrimes#ifdef STATISTICS 591573Srgrimesu_long bt_rootsplit, bt_split, bt_sortsplit, bt_pfxsaved; 601573Srgrimes#endif 611573Srgrimes 621573Srgrimes/* 631573Srgrimes * __BT_SPLIT -- Split the tree. 641573Srgrimes * 651573Srgrimes * Parameters: 661573Srgrimes * t: tree 671573Srgrimes * sp: page to split 681573Srgrimes * key: key to insert 691573Srgrimes * data: data to insert 701573Srgrimes * flags: BIGKEY/BIGDATA flags 711573Srgrimes * ilen: insert length 721573Srgrimes * skip: index to leave open 731573Srgrimes * 741573Srgrimes * Returns: 751573Srgrimes * RET_ERROR, RET_SUCCESS 761573Srgrimes */ 771573Srgrimesint 78189291Sdelphij__bt_split(BTREE *t, PAGE *sp, const DBT *key, const DBT *data, int flags, 79189291Sdelphij size_t ilen, u_int32_t argskip) 801573Srgrimes{ 811573Srgrimes BINTERNAL *bi; 821573Srgrimes BLEAF *bl, *tbl; 831573Srgrimes DBT a, b; 841573Srgrimes EPGNO *parent; 851573Srgrimes PAGE *h, *l, *r, *lchild, *rchild; 861573Srgrimes indx_t nxtindex; 8714272Spst u_int16_t skip; 8814272Spst u_int32_t n, nbytes, nksize; 891573Srgrimes int parentsplit; 901573Srgrimes char *dest; 911573Srgrimes 921573Srgrimes /* 931573Srgrimes * Split the page into two pages, l and r. The split routines return 941573Srgrimes * a pointer to the page into which the key should be inserted and with 951573Srgrimes * skip set to the offset which should be used. Additionally, l and r 961573Srgrimes * are pinned. 971573Srgrimes */ 9814272Spst skip = argskip; 991573Srgrimes h = sp->pgno == P_ROOT ? 1001573Srgrimes bt_root(t, sp, &l, &r, &skip, ilen) : 1011573Srgrimes bt_page(t, sp, &l, &r, &skip, ilen); 1021573Srgrimes if (h == NULL) 1031573Srgrimes return (RET_ERROR); 1041573Srgrimes 1051573Srgrimes /* 1061573Srgrimes * Insert the new key/data pair into the leaf page. (Key inserts 1071573Srgrimes * always cause a leaf page to split first.) 1081573Srgrimes */ 1091573Srgrimes h->linp[skip] = h->upper -= ilen; 1101573Srgrimes dest = (char *)h + h->upper; 11114272Spst if (F_ISSET(t, R_RECNO)) 1121573Srgrimes WR_RLEAF(dest, data, flags) 1131573Srgrimes else 1141573Srgrimes WR_BLEAF(dest, key, data, flags) 1151573Srgrimes 1161573Srgrimes /* If the root page was split, make it look right. */ 1171573Srgrimes if (sp->pgno == P_ROOT && 11814272Spst (F_ISSET(t, R_RECNO) ? 1191573Srgrimes bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR) 1201573Srgrimes goto err2; 1211573Srgrimes 1221573Srgrimes /* 1231573Srgrimes * Now we walk the parent page stack -- a LIFO stack of the pages that 1241573Srgrimes * were traversed when we searched for the page that split. Each stack 1251573Srgrimes * entry is a page number and a page index offset. The offset is for 1261573Srgrimes * the page traversed on the search. We've just split a page, so we 1271573Srgrimes * have to insert a new key into the parent page. 1281573Srgrimes * 1291573Srgrimes * If the insert into the parent page causes it to split, may have to 1301573Srgrimes * continue splitting all the way up the tree. We stop if the root 1311573Srgrimes * splits or the page inserted into didn't have to split to hold the 1321573Srgrimes * new key. Some algorithms replace the key for the old page as well 1331573Srgrimes * as the new page. We don't, as there's no reason to believe that the 1341573Srgrimes * first key on the old page is any better than the key we have, and, 1351573Srgrimes * in the case of a key being placed at index 0 causing the split, the 1361573Srgrimes * key is unavailable. 1371573Srgrimes * 1381573Srgrimes * There are a maximum of 5 pages pinned at any time. We keep the left 1391573Srgrimes * and right pages pinned while working on the parent. The 5 are the 1401573Srgrimes * two children, left parent and right parent (when the parent splits) 1411573Srgrimes * and the root page or the overflow key page when calling bt_preserve. 1421573Srgrimes * This code must make sure that all pins are released other than the 1431573Srgrimes * root page or overflow page which is unlocked elsewhere. 1441573Srgrimes */ 1451573Srgrimes while ((parent = BT_POP(t)) != NULL) { 1461573Srgrimes lchild = l; 1471573Srgrimes rchild = r; 1481573Srgrimes 1491573Srgrimes /* Get the parent page. */ 1501573Srgrimes if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL) 1511573Srgrimes goto err2; 1521573Srgrimes 153189327Sdelphij /* 1541573Srgrimes * The new key goes ONE AFTER the index, because the split 1551573Srgrimes * was to the right. 1561573Srgrimes */ 1571573Srgrimes skip = parent->index + 1; 1581573Srgrimes 1591573Srgrimes /* 1601573Srgrimes * Calculate the space needed on the parent page. 1611573Srgrimes * 1621573Srgrimes * Prefix trees: space hack when inserting into BINTERNAL 1631573Srgrimes * pages. Retain only what's needed to distinguish between 1641573Srgrimes * the new entry and the LAST entry on the page to its left. 1651573Srgrimes * If the keys compare equal, retain the entire key. Note, 1661573Srgrimes * we don't touch overflow keys, and the entire key must be 1671573Srgrimes * retained for the next-to-left most key on the leftmost 1681573Srgrimes * page of each level, or the search will fail. Applicable 1691573Srgrimes * ONLY to internal pages that have leaf pages as children. 1701573Srgrimes * Further reduction of the key between pairs of internal 1711573Srgrimes * pages loses too much information. 1721573Srgrimes */ 1731573Srgrimes switch (rchild->flags & P_TYPE) { 1741573Srgrimes case P_BINTERNAL: 1751573Srgrimes bi = GETBINTERNAL(rchild, 0); 1761573Srgrimes nbytes = NBINTERNAL(bi->ksize); 1771573Srgrimes break; 1781573Srgrimes case P_BLEAF: 1791573Srgrimes bl = GETBLEAF(rchild, 0); 1801573Srgrimes nbytes = NBINTERNAL(bl->ksize); 1811573Srgrimes if (t->bt_pfx && !(bl->flags & P_BIGKEY) && 1821573Srgrimes (h->prevpg != P_INVALID || skip > 1)) { 1831573Srgrimes tbl = GETBLEAF(lchild, NEXTINDEX(lchild) - 1); 1841573Srgrimes a.size = tbl->ksize; 1851573Srgrimes a.data = tbl->bytes; 1861573Srgrimes b.size = bl->ksize; 1871573Srgrimes b.data = bl->bytes; 1881573Srgrimes nksize = t->bt_pfx(&a, &b); 1891573Srgrimes n = NBINTERNAL(nksize); 1901573Srgrimes if (n < nbytes) { 1911573Srgrimes#ifdef STATISTICS 1921573Srgrimes bt_pfxsaved += nbytes - n; 1931573Srgrimes#endif 1941573Srgrimes nbytes = n; 1951573Srgrimes } else 1961573Srgrimes nksize = 0; 1971573Srgrimes } else 1981573Srgrimes nksize = 0; 1991573Srgrimes break; 2001573Srgrimes case P_RINTERNAL: 2011573Srgrimes case P_RLEAF: 2021573Srgrimes nbytes = NRINTERNAL; 2031573Srgrimes break; 2041573Srgrimes default: 2051573Srgrimes abort(); 2061573Srgrimes } 2071573Srgrimes 2081573Srgrimes /* Split the parent page if necessary or shift the indices. */ 209190484Sdelphij if ((u_int32_t)(h->upper - h->lower) < nbytes + sizeof(indx_t)) { 2101573Srgrimes sp = h; 2111573Srgrimes h = h->pgno == P_ROOT ? 2121573Srgrimes bt_root(t, h, &l, &r, &skip, nbytes) : 2131573Srgrimes bt_page(t, h, &l, &r, &skip, nbytes); 2141573Srgrimes if (h == NULL) 2151573Srgrimes goto err1; 2161573Srgrimes parentsplit = 1; 2171573Srgrimes } else { 2181573Srgrimes if (skip < (nxtindex = NEXTINDEX(h))) 2191573Srgrimes memmove(h->linp + skip + 1, h->linp + skip, 2201573Srgrimes (nxtindex - skip) * sizeof(indx_t)); 2211573Srgrimes h->lower += sizeof(indx_t); 2221573Srgrimes parentsplit = 0; 2231573Srgrimes } 2241573Srgrimes 2251573Srgrimes /* Insert the key into the parent page. */ 22614272Spst switch (rchild->flags & P_TYPE) { 2271573Srgrimes case P_BINTERNAL: 2281573Srgrimes h->linp[skip] = h->upper -= nbytes; 2291573Srgrimes dest = (char *)h + h->linp[skip]; 2301573Srgrimes memmove(dest, bi, nbytes); 2311573Srgrimes ((BINTERNAL *)dest)->pgno = rchild->pgno; 2321573Srgrimes break; 2331573Srgrimes case P_BLEAF: 2341573Srgrimes h->linp[skip] = h->upper -= nbytes; 2351573Srgrimes dest = (char *)h + h->linp[skip]; 2361573Srgrimes WR_BINTERNAL(dest, nksize ? nksize : bl->ksize, 2371573Srgrimes rchild->pgno, bl->flags & P_BIGKEY); 2381573Srgrimes memmove(dest, bl->bytes, nksize ? nksize : bl->ksize); 2391573Srgrimes if (bl->flags & P_BIGKEY && 2401573Srgrimes bt_preserve(t, *(pgno_t *)bl->bytes) == RET_ERROR) 2411573Srgrimes goto err1; 2421573Srgrimes break; 2431573Srgrimes case P_RINTERNAL: 2441573Srgrimes /* 2451573Srgrimes * Update the left page count. If split 2461573Srgrimes * added at index 0, fix the correct page. 2471573Srgrimes */ 2481573Srgrimes if (skip > 0) 2491573Srgrimes dest = (char *)h + h->linp[skip - 1]; 2501573Srgrimes else 2511573Srgrimes dest = (char *)l + l->linp[NEXTINDEX(l) - 1]; 2521573Srgrimes ((RINTERNAL *)dest)->nrecs = rec_total(lchild); 2531573Srgrimes ((RINTERNAL *)dest)->pgno = lchild->pgno; 2541573Srgrimes 2551573Srgrimes /* Update the right page count. */ 2561573Srgrimes h->linp[skip] = h->upper -= nbytes; 2571573Srgrimes dest = (char *)h + h->linp[skip]; 2581573Srgrimes ((RINTERNAL *)dest)->nrecs = rec_total(rchild); 2591573Srgrimes ((RINTERNAL *)dest)->pgno = rchild->pgno; 2601573Srgrimes break; 2611573Srgrimes case P_RLEAF: 2621573Srgrimes /* 2631573Srgrimes * Update the left page count. If split 2641573Srgrimes * added at index 0, fix the correct page. 2651573Srgrimes */ 2661573Srgrimes if (skip > 0) 2671573Srgrimes dest = (char *)h + h->linp[skip - 1]; 2681573Srgrimes else 2691573Srgrimes dest = (char *)l + l->linp[NEXTINDEX(l) - 1]; 2701573Srgrimes ((RINTERNAL *)dest)->nrecs = NEXTINDEX(lchild); 2711573Srgrimes ((RINTERNAL *)dest)->pgno = lchild->pgno; 2721573Srgrimes 2731573Srgrimes /* Update the right page count. */ 2741573Srgrimes h->linp[skip] = h->upper -= nbytes; 2751573Srgrimes dest = (char *)h + h->linp[skip]; 2761573Srgrimes ((RINTERNAL *)dest)->nrecs = NEXTINDEX(rchild); 2771573Srgrimes ((RINTERNAL *)dest)->pgno = rchild->pgno; 2781573Srgrimes break; 2791573Srgrimes default: 2801573Srgrimes abort(); 2811573Srgrimes } 2821573Srgrimes 2831573Srgrimes /* Unpin the held pages. */ 2841573Srgrimes if (!parentsplit) { 2851573Srgrimes mpool_put(t->bt_mp, h, MPOOL_DIRTY); 2861573Srgrimes break; 2871573Srgrimes } 2881573Srgrimes 2891573Srgrimes /* If the root page was split, make it look right. */ 2901573Srgrimes if (sp->pgno == P_ROOT && 29114272Spst (F_ISSET(t, R_RECNO) ? 2921573Srgrimes bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR) 2931573Srgrimes goto err1; 2941573Srgrimes 2951573Srgrimes mpool_put(t->bt_mp, lchild, MPOOL_DIRTY); 2961573Srgrimes mpool_put(t->bt_mp, rchild, MPOOL_DIRTY); 2971573Srgrimes } 2981573Srgrimes 2991573Srgrimes /* Unpin the held pages. */ 3001573Srgrimes mpool_put(t->bt_mp, l, MPOOL_DIRTY); 3011573Srgrimes mpool_put(t->bt_mp, r, MPOOL_DIRTY); 3021573Srgrimes 3031573Srgrimes /* Clear any pages left on the stack. */ 3041573Srgrimes return (RET_SUCCESS); 3051573Srgrimes 3061573Srgrimes /* 3071573Srgrimes * If something fails in the above loop we were already walking back 3081573Srgrimes * up the tree and the tree is now inconsistent. Nothing much we can 3091573Srgrimes * do about it but release any memory we're holding. 3101573Srgrimes */ 3111573Srgrimeserr1: mpool_put(t->bt_mp, lchild, MPOOL_DIRTY); 3121573Srgrimes mpool_put(t->bt_mp, rchild, MPOOL_DIRTY); 3131573Srgrimes 3141573Srgrimeserr2: mpool_put(t->bt_mp, l, 0); 3151573Srgrimes mpool_put(t->bt_mp, r, 0); 3161573Srgrimes __dbpanic(t->bt_dbp); 3171573Srgrimes return (RET_ERROR); 3181573Srgrimes} 3191573Srgrimes 3201573Srgrimes/* 3211573Srgrimes * BT_PAGE -- Split a non-root page of a btree. 3221573Srgrimes * 3231573Srgrimes * Parameters: 3241573Srgrimes * t: tree 3251573Srgrimes * h: root page 3261573Srgrimes * lp: pointer to left page pointer 3271573Srgrimes * rp: pointer to right page pointer 3281573Srgrimes * skip: pointer to index to leave open 3291573Srgrimes * ilen: insert length 3301573Srgrimes * 3311573Srgrimes * Returns: 3321573Srgrimes * Pointer to page in which to insert or NULL on error. 3331573Srgrimes */ 3341573Srgrimesstatic PAGE * 335189291Sdelphijbt_page(BTREE *t, PAGE *h, PAGE **lp, PAGE **rp, indx_t *skip, size_t ilen) 3361573Srgrimes{ 3371573Srgrimes PAGE *l, *r, *tp; 3381573Srgrimes pgno_t npg; 3391573Srgrimes 3401573Srgrimes#ifdef STATISTICS 3411573Srgrimes ++bt_split; 3421573Srgrimes#endif 3431573Srgrimes /* Put the new right page for the split into place. */ 3441573Srgrimes if ((r = __bt_new(t, &npg)) == NULL) 3451573Srgrimes return (NULL); 3461573Srgrimes r->pgno = npg; 3471573Srgrimes r->lower = BTDATAOFF; 3481573Srgrimes r->upper = t->bt_psize; 3491573Srgrimes r->nextpg = h->nextpg; 3501573Srgrimes r->prevpg = h->pgno; 3511573Srgrimes r->flags = h->flags & P_TYPE; 3521573Srgrimes 3531573Srgrimes /* 3541573Srgrimes * If we're splitting the last page on a level because we're appending 3551573Srgrimes * a key to it (skip is NEXTINDEX()), it's likely that the data is 3561573Srgrimes * sorted. Adding an empty page on the side of the level is less work 3571573Srgrimes * and can push the fill factor much higher than normal. If we're 3581573Srgrimes * wrong it's no big deal, we'll just do the split the right way next 3591573Srgrimes * time. It may look like it's equally easy to do a similar hack for 3601573Srgrimes * reverse sorted data, that is, split the tree left, but it's not. 3611573Srgrimes * Don't even try. 3621573Srgrimes */ 3631573Srgrimes if (h->nextpg == P_INVALID && *skip == NEXTINDEX(h)) { 3641573Srgrimes#ifdef STATISTICS 3651573Srgrimes ++bt_sortsplit; 3661573Srgrimes#endif 3671573Srgrimes h->nextpg = r->pgno; 3681573Srgrimes r->lower = BTDATAOFF + sizeof(indx_t); 3691573Srgrimes *skip = 0; 3701573Srgrimes *lp = h; 3711573Srgrimes *rp = r; 3721573Srgrimes return (r); 3731573Srgrimes } 3741573Srgrimes 3751573Srgrimes /* Put the new left page for the split into place. */ 376190482Sdelphij if ((l = (PAGE *)calloc(1, t->bt_psize)) == NULL) { 3771573Srgrimes mpool_put(t->bt_mp, r, 0); 3781573Srgrimes return (NULL); 3791573Srgrimes } 3801573Srgrimes l->pgno = h->pgno; 3811573Srgrimes l->nextpg = r->pgno; 3821573Srgrimes l->prevpg = h->prevpg; 3831573Srgrimes l->lower = BTDATAOFF; 3841573Srgrimes l->upper = t->bt_psize; 3851573Srgrimes l->flags = h->flags & P_TYPE; 3861573Srgrimes 3871573Srgrimes /* Fix up the previous pointer of the page after the split page. */ 3881573Srgrimes if (h->nextpg != P_INVALID) { 3891573Srgrimes if ((tp = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL) { 3901573Srgrimes free(l); 3911573Srgrimes /* XXX mpool_free(t->bt_mp, r->pgno); */ 3921573Srgrimes return (NULL); 3931573Srgrimes } 3941573Srgrimes tp->prevpg = r->pgno; 39514272Spst mpool_put(t->bt_mp, tp, MPOOL_DIRTY); 3961573Srgrimes } 3971573Srgrimes 3981573Srgrimes /* 3991573Srgrimes * Split right. The key/data pairs aren't sorted in the btree page so 4001573Srgrimes * it's simpler to copy the data from the split page onto two new pages 4011573Srgrimes * instead of copying half the data to the right page and compacting 4021573Srgrimes * the left page in place. Since the left page can't change, we have 4031573Srgrimes * to swap the original and the allocated left page after the split. 4041573Srgrimes */ 4051573Srgrimes tp = bt_psplit(t, h, l, r, skip, ilen); 4061573Srgrimes 4071573Srgrimes /* Move the new left page onto the old left page. */ 4081573Srgrimes memmove(h, l, t->bt_psize); 4091573Srgrimes if (tp == l) 4101573Srgrimes tp = h; 4111573Srgrimes free(l); 4121573Srgrimes 4131573Srgrimes *lp = h; 4141573Srgrimes *rp = r; 4151573Srgrimes return (tp); 4161573Srgrimes} 4171573Srgrimes 4181573Srgrimes/* 4191573Srgrimes * BT_ROOT -- Split the root page of a btree. 4201573Srgrimes * 4211573Srgrimes * Parameters: 4221573Srgrimes * t: tree 4231573Srgrimes * h: root page 4241573Srgrimes * lp: pointer to left page pointer 4251573Srgrimes * rp: pointer to right page pointer 4261573Srgrimes * skip: pointer to index to leave open 4271573Srgrimes * ilen: insert length 4281573Srgrimes * 4291573Srgrimes * Returns: 4301573Srgrimes * Pointer to page in which to insert or NULL on error. 4311573Srgrimes */ 4321573Srgrimesstatic PAGE * 433189291Sdelphijbt_root(BTREE *t, PAGE *h, PAGE **lp, PAGE **rp, indx_t *skip, size_t ilen) 4341573Srgrimes{ 4351573Srgrimes PAGE *l, *r, *tp; 4361573Srgrimes pgno_t lnpg, rnpg; 4371573Srgrimes 4381573Srgrimes#ifdef STATISTICS 4391573Srgrimes ++bt_split; 4401573Srgrimes ++bt_rootsplit; 4411573Srgrimes#endif 4421573Srgrimes /* Put the new left and right pages for the split into place. */ 4431573Srgrimes if ((l = __bt_new(t, &lnpg)) == NULL || 4441573Srgrimes (r = __bt_new(t, &rnpg)) == NULL) 4451573Srgrimes return (NULL); 4461573Srgrimes l->pgno = lnpg; 4471573Srgrimes r->pgno = rnpg; 4481573Srgrimes l->nextpg = r->pgno; 4491573Srgrimes r->prevpg = l->pgno; 4501573Srgrimes l->prevpg = r->nextpg = P_INVALID; 4511573Srgrimes l->lower = r->lower = BTDATAOFF; 4521573Srgrimes l->upper = r->upper = t->bt_psize; 4531573Srgrimes l->flags = r->flags = h->flags & P_TYPE; 4541573Srgrimes 4551573Srgrimes /* Split the root page. */ 4561573Srgrimes tp = bt_psplit(t, h, l, r, skip, ilen); 4571573Srgrimes 4581573Srgrimes *lp = l; 4591573Srgrimes *rp = r; 4601573Srgrimes return (tp); 4611573Srgrimes} 4621573Srgrimes 4631573Srgrimes/* 4641573Srgrimes * BT_RROOT -- Fix up the recno root page after it has been split. 4651573Srgrimes * 4661573Srgrimes * Parameters: 4671573Srgrimes * t: tree 4681573Srgrimes * h: root page 4691573Srgrimes * l: left page 4701573Srgrimes * r: right page 4711573Srgrimes * 4721573Srgrimes * Returns: 4731573Srgrimes * RET_ERROR, RET_SUCCESS 4741573Srgrimes */ 4751573Srgrimesstatic int 476189291Sdelphijbt_rroot(BTREE *t, PAGE *h, PAGE *l, PAGE *r) 4771573Srgrimes{ 4781573Srgrimes char *dest; 4791573Srgrimes 4801573Srgrimes /* Insert the left and right keys, set the header information. */ 4811573Srgrimes h->linp[0] = h->upper = t->bt_psize - NRINTERNAL; 4821573Srgrimes dest = (char *)h + h->upper; 4831573Srgrimes WR_RINTERNAL(dest, 4841573Srgrimes l->flags & P_RLEAF ? NEXTINDEX(l) : rec_total(l), l->pgno); 4851573Srgrimes 486223262Sbenl __PAST_END(h->linp, 1) = h->upper -= NRINTERNAL; 4871573Srgrimes dest = (char *)h + h->upper; 4881573Srgrimes WR_RINTERNAL(dest, 4891573Srgrimes r->flags & P_RLEAF ? NEXTINDEX(r) : rec_total(r), r->pgno); 4901573Srgrimes 4911573Srgrimes h->lower = BTDATAOFF + 2 * sizeof(indx_t); 4921573Srgrimes 4931573Srgrimes /* Unpin the root page, set to recno internal page. */ 4941573Srgrimes h->flags &= ~P_TYPE; 4951573Srgrimes h->flags |= P_RINTERNAL; 4961573Srgrimes mpool_put(t->bt_mp, h, MPOOL_DIRTY); 4971573Srgrimes 4981573Srgrimes return (RET_SUCCESS); 4991573Srgrimes} 5001573Srgrimes 5011573Srgrimes/* 5021573Srgrimes * BT_BROOT -- Fix up the btree root page after it has been split. 5031573Srgrimes * 5041573Srgrimes * Parameters: 5051573Srgrimes * t: tree 5061573Srgrimes * h: root page 5071573Srgrimes * l: left page 5081573Srgrimes * r: right page 5091573Srgrimes * 5101573Srgrimes * Returns: 5111573Srgrimes * RET_ERROR, RET_SUCCESS 5121573Srgrimes */ 5131573Srgrimesstatic int 514189291Sdelphijbt_broot(BTREE *t, PAGE *h, PAGE *l, PAGE *r) 5151573Srgrimes{ 5161573Srgrimes BINTERNAL *bi; 5171573Srgrimes BLEAF *bl; 51814272Spst u_int32_t nbytes; 5191573Srgrimes char *dest; 5201573Srgrimes 5211573Srgrimes /* 5221573Srgrimes * If the root page was a leaf page, change it into an internal page. 5231573Srgrimes * We copy the key we split on (but not the key's data, in the case of 5241573Srgrimes * a leaf page) to the new root page. 5251573Srgrimes * 5261573Srgrimes * The btree comparison code guarantees that the left-most key on any 5271573Srgrimes * level of the tree is never used, so it doesn't need to be filled in. 5281573Srgrimes */ 5291573Srgrimes nbytes = NBINTERNAL(0); 5301573Srgrimes h->linp[0] = h->upper = t->bt_psize - nbytes; 5311573Srgrimes dest = (char *)h + h->upper; 5321573Srgrimes WR_BINTERNAL(dest, 0, l->pgno, 0); 5331573Srgrimes 53414272Spst switch (h->flags & P_TYPE) { 5351573Srgrimes case P_BLEAF: 5361573Srgrimes bl = GETBLEAF(r, 0); 5371573Srgrimes nbytes = NBINTERNAL(bl->ksize); 538223262Sbenl __PAST_END(h->linp, 1) = h->upper -= nbytes; 5391573Srgrimes dest = (char *)h + h->upper; 5401573Srgrimes WR_BINTERNAL(dest, bl->ksize, r->pgno, 0); 5411573Srgrimes memmove(dest, bl->bytes, bl->ksize); 5421573Srgrimes 5431573Srgrimes /* 5441573Srgrimes * If the key is on an overflow page, mark the overflow chain 5451573Srgrimes * so it isn't deleted when the leaf copy of the key is deleted. 5461573Srgrimes */ 5471573Srgrimes if (bl->flags & P_BIGKEY && 5481573Srgrimes bt_preserve(t, *(pgno_t *)bl->bytes) == RET_ERROR) 5491573Srgrimes return (RET_ERROR); 5501573Srgrimes break; 5511573Srgrimes case P_BINTERNAL: 5521573Srgrimes bi = GETBINTERNAL(r, 0); 5531573Srgrimes nbytes = NBINTERNAL(bi->ksize); 554223262Sbenl __PAST_END(h->linp, 1) = h->upper -= nbytes; 5551573Srgrimes dest = (char *)h + h->upper; 5561573Srgrimes memmove(dest, bi, nbytes); 5571573Srgrimes ((BINTERNAL *)dest)->pgno = r->pgno; 5581573Srgrimes break; 5591573Srgrimes default: 5601573Srgrimes abort(); 5611573Srgrimes } 5621573Srgrimes 5631573Srgrimes /* There are two keys on the page. */ 5641573Srgrimes h->lower = BTDATAOFF + 2 * sizeof(indx_t); 5651573Srgrimes 5661573Srgrimes /* Unpin the root page, set to btree internal page. */ 5671573Srgrimes h->flags &= ~P_TYPE; 5681573Srgrimes h->flags |= P_BINTERNAL; 5691573Srgrimes mpool_put(t->bt_mp, h, MPOOL_DIRTY); 5701573Srgrimes 5711573Srgrimes return (RET_SUCCESS); 5721573Srgrimes} 5731573Srgrimes 5741573Srgrimes/* 5751573Srgrimes * BT_PSPLIT -- Do the real work of splitting the page. 5761573Srgrimes * 5771573Srgrimes * Parameters: 5781573Srgrimes * t: tree 5791573Srgrimes * h: page to be split 5801573Srgrimes * l: page to put lower half of data 5811573Srgrimes * r: page to put upper half of data 5821573Srgrimes * pskip: pointer to index to leave open 5831573Srgrimes * ilen: insert length 5841573Srgrimes * 5851573Srgrimes * Returns: 5861573Srgrimes * Pointer to page in which to insert. 5871573Srgrimes */ 5881573Srgrimesstatic PAGE * 589189291Sdelphijbt_psplit(BTREE *t, PAGE *h, PAGE *l, PAGE *r, indx_t *pskip, size_t ilen) 5901573Srgrimes{ 5911573Srgrimes BINTERNAL *bi; 5921573Srgrimes BLEAF *bl; 59314272Spst CURSOR *c; 5941573Srgrimes RLEAF *rl; 5951573Srgrimes PAGE *rval; 5961573Srgrimes void *src; 5971573Srgrimes indx_t full, half, nxt, off, skip, top, used; 59814272Spst u_int32_t nbytes; 5991573Srgrimes int bigkeycnt, isbigkey; 6001573Srgrimes 6011573Srgrimes /* 6021573Srgrimes * Split the data to the left and right pages. Leave the skip index 6031573Srgrimes * open. Additionally, make some effort not to split on an overflow 6041573Srgrimes * key. This makes internal page processing faster and can save 6051573Srgrimes * space as overflow keys used by internal pages are never deleted. 6061573Srgrimes */ 6071573Srgrimes bigkeycnt = 0; 6081573Srgrimes skip = *pskip; 6091573Srgrimes full = t->bt_psize - BTDATAOFF; 6101573Srgrimes half = full / 2; 6111573Srgrimes used = 0; 6121573Srgrimes for (nxt = off = 0, top = NEXTINDEX(h); nxt < top; ++off) { 6131573Srgrimes if (skip == off) { 6141573Srgrimes nbytes = ilen; 6151573Srgrimes isbigkey = 0; /* XXX: not really known. */ 6161573Srgrimes } else 6171573Srgrimes switch (h->flags & P_TYPE) { 6181573Srgrimes case P_BINTERNAL: 6191573Srgrimes src = bi = GETBINTERNAL(h, nxt); 6201573Srgrimes nbytes = NBINTERNAL(bi->ksize); 6211573Srgrimes isbigkey = bi->flags & P_BIGKEY; 6221573Srgrimes break; 6231573Srgrimes case P_BLEAF: 6241573Srgrimes src = bl = GETBLEAF(h, nxt); 6251573Srgrimes nbytes = NBLEAF(bl); 6261573Srgrimes isbigkey = bl->flags & P_BIGKEY; 6271573Srgrimes break; 6281573Srgrimes case P_RINTERNAL: 6291573Srgrimes src = GETRINTERNAL(h, nxt); 6301573Srgrimes nbytes = NRINTERNAL; 6311573Srgrimes isbigkey = 0; 6321573Srgrimes break; 6331573Srgrimes case P_RLEAF: 6341573Srgrimes src = rl = GETRLEAF(h, nxt); 6351573Srgrimes nbytes = NRLEAF(rl); 6361573Srgrimes isbigkey = 0; 6371573Srgrimes break; 6381573Srgrimes default: 6391573Srgrimes abort(); 6401573Srgrimes } 6411573Srgrimes 6421573Srgrimes /* 6431573Srgrimes * If the key/data pairs are substantial fractions of the max 6441573Srgrimes * possible size for the page, it's possible to get situations 6451573Srgrimes * where we decide to try and copy too much onto the left page. 6461573Srgrimes * Make sure that doesn't happen. 6471573Srgrimes */ 648194803Sdelphij if ((skip <= off && used + nbytes + sizeof(indx_t) >= full) || 649194803Sdelphij nxt == top - 1) { 6501573Srgrimes --off; 6511573Srgrimes break; 6521573Srgrimes } 6531573Srgrimes 6541573Srgrimes /* Copy the key/data pair, if not the skipped index. */ 6551573Srgrimes if (skip != off) { 6561573Srgrimes ++nxt; 6571573Srgrimes 6581573Srgrimes l->linp[off] = l->upper -= nbytes; 6591573Srgrimes memmove((char *)l + l->upper, src, nbytes); 6601573Srgrimes } 6611573Srgrimes 66237155Sguido used += nbytes + sizeof(indx_t); 6631573Srgrimes if (used >= half) { 6641573Srgrimes if (!isbigkey || bigkeycnt == 3) 6651573Srgrimes break; 6661573Srgrimes else 6671573Srgrimes ++bigkeycnt; 6681573Srgrimes } 6691573Srgrimes } 6701573Srgrimes 6711573Srgrimes /* 6721573Srgrimes * Off is the last offset that's valid for the left page. 6731573Srgrimes * Nxt is the first offset to be placed on the right page. 6741573Srgrimes */ 6751573Srgrimes l->lower += (off + 1) * sizeof(indx_t); 6761573Srgrimes 6771573Srgrimes /* 6781573Srgrimes * If splitting the page that the cursor was on, the cursor has to be 6791573Srgrimes * adjusted to point to the same record as before the split. If the 6801573Srgrimes * cursor is at or past the skipped slot, the cursor is incremented by 6811573Srgrimes * one. If the cursor is on the right page, it is decremented by the 6821573Srgrimes * number of records split to the left page. 6831573Srgrimes */ 68414272Spst c = &t->bt_cursor; 68514272Spst if (F_ISSET(c, CURS_INIT) && c->pg.pgno == h->pgno) { 68614272Spst if (c->pg.index >= skip) 68714272Spst ++c->pg.index; 68814272Spst if (c->pg.index < nxt) /* Left page. */ 68914272Spst c->pg.pgno = l->pgno; 6901573Srgrimes else { /* Right page. */ 69114272Spst c->pg.pgno = r->pgno; 69214272Spst c->pg.index -= nxt; 6931573Srgrimes } 6941573Srgrimes } 6951573Srgrimes 6961573Srgrimes /* 6971573Srgrimes * If the skipped index was on the left page, just return that page. 6981573Srgrimes * Otherwise, adjust the skip index to reflect the new position on 6991573Srgrimes * the right page. 7001573Srgrimes */ 7011573Srgrimes if (skip <= off) { 702135178Skuriyama skip = MAX_PAGE_OFFSET; 7031573Srgrimes rval = l; 7041573Srgrimes } else { 7051573Srgrimes rval = r; 7061573Srgrimes *pskip -= nxt; 7071573Srgrimes } 7081573Srgrimes 7091573Srgrimes for (off = 0; nxt < top; ++off) { 7101573Srgrimes if (skip == nxt) { 7111573Srgrimes ++off; 712135178Skuriyama skip = MAX_PAGE_OFFSET; 7131573Srgrimes } 7141573Srgrimes switch (h->flags & P_TYPE) { 7151573Srgrimes case P_BINTERNAL: 7161573Srgrimes src = bi = GETBINTERNAL(h, nxt); 7171573Srgrimes nbytes = NBINTERNAL(bi->ksize); 7181573Srgrimes break; 7191573Srgrimes case P_BLEAF: 7201573Srgrimes src = bl = GETBLEAF(h, nxt); 7211573Srgrimes nbytes = NBLEAF(bl); 7221573Srgrimes break; 7231573Srgrimes case P_RINTERNAL: 7241573Srgrimes src = GETRINTERNAL(h, nxt); 7251573Srgrimes nbytes = NRINTERNAL; 7261573Srgrimes break; 7271573Srgrimes case P_RLEAF: 7281573Srgrimes src = rl = GETRLEAF(h, nxt); 7291573Srgrimes nbytes = NRLEAF(rl); 7301573Srgrimes break; 7311573Srgrimes default: 7321573Srgrimes abort(); 7331573Srgrimes } 7341573Srgrimes ++nxt; 7351573Srgrimes r->linp[off] = r->upper -= nbytes; 7361573Srgrimes memmove((char *)r + r->upper, src, nbytes); 7371573Srgrimes } 7381573Srgrimes r->lower += off * sizeof(indx_t); 7391573Srgrimes 7401573Srgrimes /* If the key is being appended to the page, adjust the index. */ 7411573Srgrimes if (skip == top) 7421573Srgrimes r->lower += sizeof(indx_t); 7431573Srgrimes 7441573Srgrimes return (rval); 7451573Srgrimes} 7461573Srgrimes 7471573Srgrimes/* 7481573Srgrimes * BT_PRESERVE -- Mark a chain of pages as used by an internal node. 7491573Srgrimes * 7501573Srgrimes * Chains of indirect blocks pointed to by leaf nodes get reclaimed when the 7511573Srgrimes * record that references them gets deleted. Chains pointed to by internal 7521573Srgrimes * pages never get deleted. This routine marks a chain as pointed to by an 7531573Srgrimes * internal page. 7541573Srgrimes * 7551573Srgrimes * Parameters: 7561573Srgrimes * t: tree 7571573Srgrimes * pg: page number of first page in the chain. 7581573Srgrimes * 7591573Srgrimes * Returns: 7601573Srgrimes * RET_SUCCESS, RET_ERROR. 7611573Srgrimes */ 7621573Srgrimesstatic int 763189291Sdelphijbt_preserve(BTREE *t, pgno_t pg) 7641573Srgrimes{ 7651573Srgrimes PAGE *h; 7661573Srgrimes 7671573Srgrimes if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) 7681573Srgrimes return (RET_ERROR); 7691573Srgrimes h->flags |= P_PRESERVE; 7701573Srgrimes mpool_put(t->bt_mp, h, MPOOL_DIRTY); 7711573Srgrimes return (RET_SUCCESS); 7721573Srgrimes} 7731573Srgrimes 7741573Srgrimes/* 7751573Srgrimes * REC_TOTAL -- Return the number of recno entries below a page. 7761573Srgrimes * 7771573Srgrimes * Parameters: 7781573Srgrimes * h: page 7791573Srgrimes * 7801573Srgrimes * Returns: 7811573Srgrimes * The number of recno entries below a page. 7821573Srgrimes * 7831573Srgrimes * XXX 7841573Srgrimes * These values could be set by the bt_psplit routine. The problem is that the 7851573Srgrimes * entry has to be popped off of the stack etc. or the values have to be passed 7861573Srgrimes * all the way back to bt_split/bt_rroot and it's not very clean. 7871573Srgrimes */ 7881573Srgrimesstatic recno_t 789189291Sdelphijrec_total(PAGE *h) 7901573Srgrimes{ 7911573Srgrimes recno_t recs; 7921573Srgrimes indx_t nxt, top; 7931573Srgrimes 7941573Srgrimes for (recs = 0, nxt = 0, top = NEXTINDEX(h); nxt < top; ++nxt) 7951573Srgrimes recs += GETRINTERNAL(h, nxt)->nrecs; 7961573Srgrimes return (recs); 7971573Srgrimes} 798