1/*- 2 * See the file LICENSE for redistribution information. 3 * 4 * Copyright (c) 1996,2008 Oracle. All rights reserved. 5 */ 6/* 7 * Copyright (c) 1990, 1993, 1994, 1995, 1996 8 * Keith Bostic. All rights reserved. 9 */ 10/* 11 * Copyright (c) 1990, 1993, 1994, 1995 12 * The Regents of the University of California. All rights reserved. 13 * 14 * This code is derived from software contributed to Berkeley by 15 * Mike Olson. 16 * 17 * Redistribution and use in source and binary forms, with or without 18 * modification, are permitted provided that the following conditions 19 * are met: 20 * 1. Redistributions of source code must retain the above copyright 21 * notice, this list of conditions and the following disclaimer. 22 * 2. Redistributions in binary form must reproduce the above copyright 23 * notice, this list of conditions and the following disclaimer in the 24 * documentation and/or other materials provided with the distribution. 25 * 3. Neither the name of the University nor the names of its contributors 26 * may be used to endorse or promote products derived from this software 27 * without specific prior written permission. 28 * 29 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 30 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 31 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 32 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 33 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 34 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 35 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 36 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 37 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 38 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 39 * SUCH DAMAGE. 40 * 41 * $Id: bt_delete.c,v 12.30 2008/01/28 21:40:57 ubell Exp $ 42 */ 43 44#include "db_config.h" 45 46#include "db_int.h" 47#include "dbinc/db_page.h" 48#include "dbinc/btree.h" 49#include "dbinc/lock.h" 50#include "dbinc/mp.h" 51 52/* 53 * __bam_ditem -- 54 * Delete one or more entries from a page. 55 * 56 * PUBLIC: int __bam_ditem __P((DBC *, PAGE *, u_int32_t)); 57 */ 58int 59__bam_ditem(dbc, h, indx) 60 DBC *dbc; 61 PAGE *h; 62 u_int32_t indx; 63{ 64 BINTERNAL *bi; 65 BKEYDATA *bk; 66 DB *dbp; 67 u_int32_t nbytes; 68 int ret; 69 db_indx_t *inp; 70 71 dbp = dbc->dbp; 72 inp = P_INP(dbp, h); 73 74 /* The page should already have been dirtied by our caller. */ 75 DB_ASSERT(dbp->env, IS_DIRTY(h)); 76 77 switch (TYPE(h)) { 78 case P_IBTREE: 79 bi = GET_BINTERNAL(dbp, h, indx); 80 switch (B_TYPE(bi->type)) { 81 case B_DUPLICATE: 82 case B_KEYDATA: 83 nbytes = BINTERNAL_SIZE(bi->len); 84 break; 85 case B_OVERFLOW: 86 nbytes = BINTERNAL_SIZE(bi->len); 87 if ((ret = 88 __db_doff(dbc, ((BOVERFLOW *)bi->data)->pgno)) != 0) 89 return (ret); 90 break; 91 default: 92 return (__db_pgfmt(dbp->env, PGNO(h))); 93 } 94 break; 95 case P_IRECNO: 96 nbytes = RINTERNAL_SIZE; 97 break; 98 case P_LBTREE: 99 /* 100 * If it's a duplicate key, discard the index and don't touch 101 * the actual page item. 102 * 103 * !!! 104 * This works because no data item can have an index matching 105 * any other index so even if the data item is in a key "slot", 106 * it won't match any other index. 107 */ 108 if ((indx % 2) == 0) { 109 /* 110 * Check for a duplicate after us on the page. NOTE: 111 * we have to delete the key item before deleting the 112 * data item, otherwise the "indx + P_INDX" calculation 113 * won't work! 114 */ 115 if (indx + P_INDX < (u_int32_t)NUM_ENT(h) && 116 inp[indx] == inp[indx + P_INDX]) 117 return (__bam_adjindx(dbc, 118 h, indx, indx + O_INDX, 0)); 119 /* 120 * Check for a duplicate before us on the page. It 121 * doesn't matter if we delete the key item before or 122 * after the data item for the purposes of this one. 123 */ 124 if (indx > 0 && inp[indx] == inp[indx - P_INDX]) 125 return (__bam_adjindx(dbc, 126 h, indx, indx - P_INDX, 0)); 127 } 128 /* FALLTHROUGH */ 129 case P_LDUP: 130 case P_LRECNO: 131 bk = GET_BKEYDATA(dbp, h, indx); 132 switch (B_TYPE(bk->type)) { 133 case B_DUPLICATE: 134 nbytes = BOVERFLOW_SIZE; 135 break; 136 case B_OVERFLOW: 137 nbytes = BOVERFLOW_SIZE; 138 if ((ret = __db_doff( 139 dbc, (GET_BOVERFLOW(dbp, h, indx))->pgno)) != 0) 140 return (ret); 141 break; 142 case B_KEYDATA: 143 nbytes = BKEYDATA_SIZE(bk->len); 144 break; 145 default: 146 return (__db_pgfmt(dbp->env, PGNO(h))); 147 } 148 break; 149 default: 150 return (__db_pgfmt(dbp->env, PGNO(h))); 151 } 152 153 /* Delete the item and mark the page dirty. */ 154 if ((ret = __db_ditem(dbc, h, indx, nbytes)) != 0) 155 return (ret); 156 157 return (0); 158} 159 160/* 161 * __bam_adjindx -- 162 * Adjust an index on the page. 163 * 164 * PUBLIC: int __bam_adjindx __P((DBC *, PAGE *, u_int32_t, u_int32_t, int)); 165 */ 166int 167__bam_adjindx(dbc, h, indx, indx_copy, is_insert) 168 DBC *dbc; 169 PAGE *h; 170 u_int32_t indx, indx_copy; 171 int is_insert; 172{ 173 DB *dbp; 174 db_indx_t copy, *inp; 175 int ret; 176 177 dbp = dbc->dbp; 178 inp = P_INP(dbp, h); 179 180 /* Log the change. */ 181 if (DBC_LOGGING(dbc)) { 182 if ((ret = __bam_adj_log(dbp, dbc->txn, &LSN(h), 0, 183 PGNO(h), &LSN(h), indx, indx_copy, (u_int32_t)is_insert)) != 0) 184 return (ret); 185 } else 186 LSN_NOT_LOGGED(LSN(h)); 187 188 /* Shuffle the indices and mark the page dirty. */ 189 if (is_insert) { 190 copy = inp[indx_copy]; 191 if (indx != NUM_ENT(h)) 192 memmove(&inp[indx + O_INDX], &inp[indx], 193 sizeof(db_indx_t) * (NUM_ENT(h) - indx)); 194 inp[indx] = copy; 195 ++NUM_ENT(h); 196 } else { 197 --NUM_ENT(h); 198 if (indx != NUM_ENT(h)) 199 memmove(&inp[indx], &inp[indx + O_INDX], 200 sizeof(db_indx_t) * (NUM_ENT(h) - indx)); 201 } 202 203 return (0); 204} 205 206/* 207 * __bam_dpages -- 208 * Delete a set of locked pages. 209 * 210 * PUBLIC: int __bam_dpages __P((DBC *, int, int)); 211 */ 212int 213__bam_dpages(dbc, use_top, update) 214 DBC *dbc; 215 int use_top; 216 int update; 217{ 218 BINTERNAL *bi; 219 BTREE_CURSOR *cp; 220 DB *dbp; 221 DBT a, b; 222 DB_LOCK c_lock, p_lock; 223 DB_MPOOLFILE *mpf; 224 EPG *epg, *save_sp, *stack_epg; 225 PAGE *child, *parent; 226 db_indx_t nitems; 227 db_pgno_t pgno, root_pgno; 228 db_recno_t rcnt; 229 int done, ret, t_ret; 230 231 dbp = dbc->dbp; 232 mpf = dbp->mpf; 233 cp = (BTREE_CURSOR *)dbc->internal; 234 nitems = 0; 235 pgno = PGNO_INVALID; 236 237 /* 238 * We have the entire stack of deletable pages locked. 239 * 240 * Btree calls us with the first page in the stack is to have a 241 * single item deleted, and the rest of the pages are to be removed. 242 * 243 * Recno always has a stack to the root and __bam_merge operations 244 * may have unneeded items in the sack. We find the lowest page 245 * in the stack that has more than one record in it and start there. 246 */ 247 ret = 0; 248 if (use_top) 249 stack_epg = cp->sp; 250 else 251 for (stack_epg = cp->csp; stack_epg > cp->sp; --stack_epg) 252 if (NUM_ENT(stack_epg->page) > 1) 253 break; 254 epg = stack_epg; 255 /* 256 * !!! 257 * There is an interesting deadlock situation here. We have to relink 258 * the leaf page chain around the leaf page being deleted. Consider 259 * a cursor walking through the leaf pages, that has the previous page 260 * read-locked and is waiting on a lock for the page we're deleting. 261 * It will deadlock here. Before we unlink the subtree, we relink the 262 * leaf page chain. 263 */ 264 if (LEVEL(cp->csp->page) == 1 && 265 (ret = __bam_relink(dbc, cp->csp->page, PGNO_INVALID)) != 0) 266 goto discard; 267 268 /* 269 * Delete the last item that references the underlying pages that are 270 * to be deleted, and adjust cursors that reference that page. Then, 271 * save that page's page number and item count and release it. If 272 * the application isn't retaining locks because it's running without 273 * transactions, this lets the rest of the tree get back to business 274 * immediately. 275 */ 276 if ((ret = __memp_dirty(mpf, 277 &epg->page, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0) 278 goto discard; 279 if ((ret = __bam_ditem(dbc, epg->page, epg->indx)) != 0) 280 goto discard; 281 if ((ret = __bam_ca_di(dbc, PGNO(epg->page), epg->indx, -1)) != 0) 282 goto discard; 283 284 if (update && epg->indx == 0) { 285 save_sp = cp->csp; 286 cp->csp = epg; 287 ret = __bam_pupdate(dbc, epg->page); 288 cp->csp = save_sp; 289 if (ret != 0) 290 goto discard; 291 } 292 293 pgno = PGNO(epg->page); 294 nitems = NUM_ENT(epg->page); 295 296 ret = __memp_fput(mpf, dbc->thread_info, epg->page, dbc->priority); 297 epg->page = NULL; 298 if ((t_ret = __TLPUT(dbc, epg->lock)) != 0 && ret == 0) 299 ret = t_ret; 300 if (ret != 0) 301 goto err_inc; 302 303 /* Then, discard any pages that we don't care about. */ 304discard: for (epg = cp->sp; epg < stack_epg; ++epg) { 305 if ((t_ret = __memp_fput(mpf, dbc->thread_info, 306 epg->page, dbc->priority)) != 0 && ret == 0) 307 ret = t_ret; 308 epg->page = NULL; 309 if ((t_ret = __TLPUT(dbc, epg->lock)) != 0 && ret == 0) 310 ret = t_ret; 311 } 312 if (ret != 0) 313 goto err; 314 315 /* Free the rest of the pages in the stack. */ 316 while (++epg <= cp->csp) { 317 if ((ret = __memp_dirty(mpf, &epg->page, 318 dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0) 319 goto err; 320 /* 321 * Delete page entries so they will be restored as part of 322 * recovery. We don't need to do cursor adjustment here as 323 * the pages are being emptied by definition and so cannot 324 * be referenced by a cursor. 325 */ 326 if (NUM_ENT(epg->page) != 0) { 327 DB_ASSERT(dbp->env, LEVEL(epg->page) != 1); 328 329 if ((ret = __bam_ditem(dbc, epg->page, epg->indx)) != 0) 330 goto err; 331 /* 332 * Sheer paranoia: if we find any pages that aren't 333 * emptied by the delete, someone else added an item 334 * while we were walking the tree, and we discontinue 335 * the delete. Shouldn't be possible, but we check 336 * regardless. 337 */ 338 if (NUM_ENT(epg->page) != 0) 339 goto err; 340 } 341 342 ret = __db_free(dbc, epg->page); 343 if (cp->page == epg->page) 344 cp->page = NULL; 345 epg->page = NULL; 346 if ((t_ret = __TLPUT(dbc, epg->lock)) != 0 && ret == 0) 347 ret = t_ret; 348 if (ret != 0) 349 goto err_inc; 350 } 351 352 if (0) { 353err_inc: ++epg; 354err: for (; epg <= cp->csp; ++epg) { 355 if (epg->page != NULL) { 356 (void)__memp_fput(mpf, dbc->thread_info, 357 epg->page, dbc->priority); 358 epg->page = NULL; 359 } 360 (void)__TLPUT(dbc, epg->lock); 361 } 362 BT_STK_CLR(cp); 363 return (ret); 364 } 365 BT_STK_CLR(cp); 366 367 /* 368 * If we just deleted the next-to-last item from the root page, the 369 * tree can collapse one or more levels. While there remains only a 370 * single item on the root page, write lock the last page referenced 371 * by the root page and copy it over the root page. 372 */ 373 root_pgno = cp->root; 374 if (pgno != root_pgno || nitems != 1) 375 return (0); 376 377 for (done = 0; !done;) { 378 /* Initialize. */ 379 parent = child = NULL; 380 LOCK_INIT(p_lock); 381 LOCK_INIT(c_lock); 382 383 /* Lock the root. */ 384 pgno = root_pgno; 385 if ((ret = 386 __db_lget(dbc, 0, pgno, DB_LOCK_WRITE, 0, &p_lock)) != 0) 387 goto stop; 388 if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn, 389 DB_MPOOL_DIRTY, &parent)) != 0) 390 goto stop; 391 392 if (NUM_ENT(parent) != 1) 393 goto stop; 394 395 switch (TYPE(parent)) { 396 case P_IBTREE: 397 /* 398 * If this is overflow, then try to delete it. 399 * The child may or may not still point at it. 400 */ 401 bi = GET_BINTERNAL(dbp, parent, 0); 402 if (B_TYPE(bi->type) == B_OVERFLOW) 403 if ((ret = __db_doff(dbc, 404 ((BOVERFLOW *)bi->data)->pgno)) != 0) 405 goto stop; 406 pgno = bi->pgno; 407 break; 408 case P_IRECNO: 409 pgno = GET_RINTERNAL(dbp, parent, 0)->pgno; 410 break; 411 default: 412 goto stop; 413 } 414 415 /* Lock the child page. */ 416 if ((ret = 417 __db_lget(dbc, 0, pgno, DB_LOCK_WRITE, 0, &c_lock)) != 0) 418 goto stop; 419 if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn, 420 DB_MPOOL_DIRTY, &child)) != 0) 421 goto stop; 422 423 /* Log the change. */ 424 if (DBC_LOGGING(dbc)) { 425 memset(&a, 0, sizeof(a)); 426 a.data = child; 427 a.size = dbp->pgsize; 428 memset(&b, 0, sizeof(b)); 429 b.data = P_ENTRY(dbp, parent, 0); 430 b.size = TYPE(parent) == P_IRECNO ? RINTERNAL_SIZE : 431 BINTERNAL_SIZE(((BINTERNAL *)b.data)->len); 432 if ((ret = __bam_rsplit_log(dbp, dbc->txn, 433 &child->lsn, 0, PGNO(child), &a, PGNO(parent), 434 RE_NREC(parent), &b, &parent->lsn)) != 0) 435 goto stop; 436 } else 437 LSN_NOT_LOGGED(child->lsn); 438 439 /* 440 * Make the switch. 441 * 442 * One fixup -- internal pages below the top level do not store 443 * a record count, so we have to preserve it if we're not 444 * converting to a leaf page. Note also that we are about to 445 * overwrite the parent page, including its LSN. This is OK 446 * because the log message we wrote describing this update 447 * stores its LSN on the child page. When the child is copied 448 * onto the parent, the correct LSN is copied into place. 449 */ 450 COMPQUIET(rcnt, 0); 451 if (F_ISSET(cp, C_RECNUM) && LEVEL(child) > LEAFLEVEL) 452 rcnt = RE_NREC(parent); 453 memcpy(parent, child, dbp->pgsize); 454 PGNO(parent) = root_pgno; 455 if (F_ISSET(cp, C_RECNUM) && LEVEL(child) > LEAFLEVEL) 456 RE_NREC_SET(parent, rcnt); 457 458 /* Adjust the cursors. */ 459 if ((ret = __bam_ca_rsplit(dbc, PGNO(child), root_pgno)) != 0) 460 goto stop; 461 462 /* 463 * Free the page copied onto the root page and discard its 464 * lock. (The call to __db_free() discards our reference 465 * to the page.) 466 */ 467 if ((ret = __db_free(dbc, child)) != 0) { 468 child = NULL; 469 goto stop; 470 } 471 child = NULL; 472 473 if (0) { 474stop: done = 1; 475 } 476 if ((t_ret = __TLPUT(dbc, p_lock)) != 0 && ret == 0) 477 ret = t_ret; 478 if (parent != NULL && 479 (t_ret = __memp_fput(mpf, dbc->thread_info, 480 parent, dbc->priority)) != 0 && ret == 0) 481 ret = t_ret; 482 if ((t_ret = __TLPUT(dbc, c_lock)) != 0 && ret == 0) 483 ret = t_ret; 484 if (child != NULL && 485 (t_ret = __memp_fput(mpf, dbc->thread_info, 486 child, dbc->priority)) != 0 && ret == 0) 487 ret = t_ret; 488 } 489 490 return (ret); 491} 492 493/* 494 * __bam_relink -- 495 * Relink around a deleted page. 496 * 497 * PUBLIC: int __bam_relink __P((DBC *, PAGE *, db_pgno_t)); 498 */ 499int 500__bam_relink(dbc, pagep, new_pgno) 501 DBC *dbc; 502 PAGE *pagep; 503 db_pgno_t new_pgno; 504{ 505 DB *dbp; 506 DB_LOCK npl, ppl; 507 DB_LSN *nlsnp, *plsnp, ret_lsn; 508 DB_MPOOLFILE *mpf; 509 PAGE *np, *pp; 510 int ret, t_ret; 511 512 dbp = dbc->dbp; 513 np = pp = NULL; 514 LOCK_INIT(npl); 515 LOCK_INIT(ppl); 516 nlsnp = plsnp = NULL; 517 mpf = dbp->mpf; 518 ret = 0; 519 520 /* 521 * Retrieve and lock the one/two pages. For a remove, we may need 522 * two pages (the before and after). For an add, we only need one 523 * because, the split took care of the prev. 524 */ 525 if (pagep->next_pgno != PGNO_INVALID) { 526 if ((ret = __db_lget(dbc, 527 0, pagep->next_pgno, DB_LOCK_WRITE, 0, &npl)) != 0) 528 goto err; 529 if ((ret = __memp_fget(mpf, &pagep->next_pgno, 530 dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &np)) != 0) { 531 ret = __db_pgerr(dbp, pagep->next_pgno, ret); 532 goto err; 533 } 534 nlsnp = &np->lsn; 535 } 536 if (pagep->prev_pgno != PGNO_INVALID) { 537 if ((ret = __db_lget(dbc, 538 0, pagep->prev_pgno, DB_LOCK_WRITE, 0, &ppl)) != 0) 539 goto err; 540 if ((ret = __memp_fget(mpf, &pagep->prev_pgno, 541 dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &pp)) != 0) { 542 ret = __db_pgerr(dbp, pagep->prev_pgno, ret); 543 goto err; 544 } 545 plsnp = &pp->lsn; 546 } 547 548 /* Log the change. */ 549 if (DBC_LOGGING(dbc)) { 550 if ((ret = __bam_relink_log(dbp, dbc->txn, &ret_lsn, 0, 551 pagep->pgno, new_pgno, pagep->prev_pgno, plsnp, 552 pagep->next_pgno, nlsnp)) != 0) 553 goto err; 554 } else 555 LSN_NOT_LOGGED(ret_lsn); 556 if (np != NULL) 557 np->lsn = ret_lsn; 558 if (pp != NULL) 559 pp->lsn = ret_lsn; 560 561 /* 562 * Modify and release the two pages. 563 */ 564 if (np != NULL) { 565 if (new_pgno == PGNO_INVALID) 566 np->prev_pgno = pagep->prev_pgno; 567 else 568 np->prev_pgno = new_pgno; 569 ret = __memp_fput(mpf, dbc->thread_info, np, dbc->priority); 570 if ((t_ret = __TLPUT(dbc, npl)) != 0 && ret == 0) 571 ret = t_ret; 572 if (ret != 0) 573 goto err; 574 } 575 576 if (pp != NULL) { 577 if (new_pgno == PGNO_INVALID) 578 pp->next_pgno = pagep->next_pgno; 579 else 580 pp->next_pgno = new_pgno; 581 ret = __memp_fput(mpf, dbc->thread_info, pp, dbc->priority); 582 if ((t_ret = __TLPUT(dbc, ppl)) != 0 && ret == 0) 583 ret = t_ret; 584 if (ret != 0) 585 goto err; 586 } 587 return (0); 588 589err: if (np != NULL) 590 (void)__memp_fput(mpf, dbc->thread_info, np, dbc->priority); 591 (void)__TLPUT(dbc, npl); 592 if (pp != NULL) 593 (void)__memp_fput(mpf, dbc->thread_info, pp, dbc->priority); 594 (void)__TLPUT(dbc, ppl); 595 return (ret); 596} 597 598/* 599 * __bam_pupdate -- 600 * Update parent key pointers up the tree. 601 * 602 * PUBLIC: int __bam_pupdate __P((DBC *, PAGE *)); 603 */ 604int 605__bam_pupdate(dbc, lpg) 606 DBC *dbc; 607 PAGE *lpg; 608{ 609 BTREE_CURSOR *cp; 610 ENV *env; 611 EPG *epg; 612 int ret; 613 614 env = dbc->env; 615 cp = (BTREE_CURSOR *)dbc->internal; 616 ret = 0; 617 618 /* 619 * Update the parents up the tree. __bam_pinsert only looks at the 620 * left child if is a leaf page, so we don't need to change it. We 621 * just do a delete and insert; a replace is possible but reusing 622 * pinsert is better. 623 */ 624 for (epg = &cp->csp[-1]; epg >= cp->sp; epg--) { 625 if ((ret = __memp_dirty(dbc->dbp->mpf, &epg->page, 626 dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0) 627 return (ret); 628 if ((ret = __bam_ditem(dbc, epg->page, epg->indx)) != 0) 629 return (ret); 630 epg->indx--; 631 if ((ret = __bam_pinsert(dbc, epg, 0, 632 lpg, epg[1].page, BPI_NORECNUM)) != 0) { 633 if (ret == DB_NEEDSPLIT) { 634 /* This should not happen. */ 635 __db_errx(env, 636 "Not enough room in parent: %s: page %lu", 637 dbc->dbp->fname, (u_long)PGNO(epg->page)); 638 ret = __env_panic(env, EINVAL); 639 } 640 return (ret); 641 } 642 } 643 return (ret); 644} 645