bt_delete.c revision 1.11
1/* $NetBSD: bt_delete.c,v 1.11 1999/03/16 18:17:56 christos Exp $ */ 2 3/*- 4 * Copyright (c) 1990, 1993, 1994 5 * The Regents of the University of California. All rights reserved. 6 * 7 * This code is derived from software contributed to Berkeley by 8 * Mike Olson. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. All advertising materials mentioning features or use of this software 19 * must display the following acknowledgement: 20 * This product includes software developed by the University of 21 * California, Berkeley and its contributors. 22 * 4. Neither the name of the University nor the names of its contributors 23 * may be used to endorse or promote products derived from this software 24 * without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36 * SUCH DAMAGE. 37 */ 38 39#include <sys/cdefs.h> 40#if defined(LIBC_SCCS) && !defined(lint) 41#if 0 42static char sccsid[] = "@(#)bt_delete.c 8.13 (Berkeley) 7/28/94"; 43#else 44__RCSID("$NetBSD: bt_delete.c,v 1.11 1999/03/16 18:17:56 christos Exp $"); 45#endif 46#endif /* LIBC_SCCS and not lint */ 47 48#include "namespace.h" 49#include <sys/types.h> 50 51#include <errno.h> 52#include <stdio.h> 53#include <string.h> 54 55#include <db.h> 56#include "btree.h" 57 58static int __bt_bdelete __P((BTREE *, const DBT *)); 59static int __bt_curdel __P((BTREE *, const DBT *, PAGE *, u_int)); 60static int __bt_pdelete __P((BTREE *, PAGE *)); 61static int __bt_relink __P((BTREE *, PAGE *)); 62static int __bt_stkacq __P((BTREE *, PAGE **, CURSOR *)); 63 64/* 65 * __bt_delete 66 * Delete the item(s) referenced by a key. 67 * 68 * Return RET_SPECIAL if the key is not found. 69 */ 70int 71__bt_delete(dbp, key, flags) 72 const DB *dbp; 73 const DBT *key; 74 u_int flags; 75{ 76 BTREE *t; 77 CURSOR *c; 78 PAGE *h; 79 int status; 80 81 t = dbp->internal; 82 83 /* Toss any page pinned across calls. */ 84 if (t->bt_pinned != NULL) { 85 mpool_put(t->bt_mp, t->bt_pinned, 0); 86 t->bt_pinned = NULL; 87 } 88 89 /* Check for change to a read-only tree. */ 90 if (F_ISSET(t, B_RDONLY)) { 91 errno = EPERM; 92 return (RET_ERROR); 93 } 94 95 switch (flags) { 96 case 0: 97 status = __bt_bdelete(t, key); 98 break; 99 case R_CURSOR: 100 /* 101 * If flags is R_CURSOR, delete the cursor. Must already 102 * have started a scan and not have already deleted it. 103 */ 104 c = &t->bt_cursor; 105 if (F_ISSET(c, CURS_INIT)) { 106 if (F_ISSET(c, CURS_ACQUIRE | CURS_AFTER | CURS_BEFORE)) 107 return (RET_SPECIAL); 108 if ((h = mpool_get(t->bt_mp, c->pg.pgno, 0)) == NULL) 109 return (RET_ERROR); 110 111 /* 112 * If the page is about to be emptied, we'll need to 113 * delete it, which means we have to acquire a stack. 114 */ 115 if (NEXTINDEX(h) == 1) 116 if (__bt_stkacq(t, &h, &t->bt_cursor)) 117 return (RET_ERROR); 118 119 status = __bt_dleaf(t, NULL, h, (u_int)c->pg.index); 120 121 if (NEXTINDEX(h) == 0 && status == RET_SUCCESS) { 122 if (__bt_pdelete(t, h)) 123 return (RET_ERROR); 124 } else 125 mpool_put(t->bt_mp, h, 126 (u_int)(status == RET_SUCCESS ? 127 MPOOL_DIRTY : 0)); 128 break; 129 } 130 /* FALLTHROUGH */ 131 default: 132 errno = EINVAL; 133 return (RET_ERROR); 134 } 135 if (status == RET_SUCCESS) 136 F_SET(t, B_MODIFIED); 137 return (status); 138} 139 140/* 141 * __bt_stkacq -- 142 * Acquire a stack so we can delete a cursor entry. 143 * 144 * Parameters: 145 * t: tree 146 * hp: pointer to current, pinned PAGE pointer 147 * c: pointer to the cursor 148 * 149 * Returns: 150 * 0 on success, 1 on failure 151 */ 152static int 153__bt_stkacq(t, hp, c) 154 BTREE *t; 155 PAGE **hp; 156 CURSOR *c; 157{ 158 BINTERNAL *bi; 159 EPG *e; 160 EPGNO *parent; 161 PAGE *h; 162 indx_t idx = 0; /* Pacify gcc */ 163 pgno_t pgno; 164 recno_t nextpg, prevpg; 165 int exact, level; 166 167 /* 168 * Find the first occurrence of the key in the tree. Toss the 169 * currently locked page so we don't hit an already-locked page. 170 */ 171 h = *hp; 172 mpool_put(t->bt_mp, h, 0); 173 if ((e = __bt_search(t, &c->key, &exact)) == NULL) 174 return (1); 175 h = e->page; 176 177 /* See if we got it in one shot. */ 178 if (h->pgno == c->pg.pgno) 179 goto ret; 180 181 /* 182 * Move right, looking for the page. At each move we have to move 183 * up the stack until we don't have to move to the next page. If 184 * we have to change pages at an internal level, we have to fix the 185 * stack back up. 186 */ 187 while (h->pgno != c->pg.pgno) { 188 if ((nextpg = h->nextpg) == P_INVALID) 189 break; 190 mpool_put(t->bt_mp, h, 0); 191 192 /* Move up the stack. */ 193 for (level = 0; (parent = BT_POP(t)) != NULL; ++level) { 194 /* Get the parent page. */ 195 if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL) 196 return (1); 197 198 /* Move to the next index. */ 199 if (parent->index != NEXTINDEX(h) - 1) { 200 idx = parent->index + 1; 201 BT_PUSH(t, h->pgno, idx); 202 break; 203 } 204 mpool_put(t->bt_mp, h, 0); 205 } 206 207 /* Restore the stack. */ 208 while (level--) { 209 /* Push the next level down onto the stack. */ 210 bi = GETBINTERNAL(h, idx); 211 pgno = bi->pgno; 212 BT_PUSH(t, pgno, 0); 213 214 /* Lose the currently pinned page. */ 215 mpool_put(t->bt_mp, h, 0); 216 217 /* Get the next level down. */ 218 if ((h = mpool_get(t->bt_mp, pgno, 0)) == NULL) 219 return (1); 220 idx = 0; 221 } 222 mpool_put(t->bt_mp, h, 0); 223 if ((h = mpool_get(t->bt_mp, nextpg, 0)) == NULL) 224 return (1); 225 } 226 227 if (h->pgno == c->pg.pgno) 228 goto ret; 229 230 /* Reacquire the original stack. */ 231 mpool_put(t->bt_mp, h, 0); 232 if ((e = __bt_search(t, &c->key, &exact)) == NULL) 233 return (1); 234 h = e->page; 235 236 /* 237 * Move left, looking for the page. At each move we have to move 238 * up the stack until we don't have to change pages to move to the 239 * next page. If we have to change pages at an internal level, we 240 * have to fix the stack back up. 241 */ 242 while (h->pgno != c->pg.pgno) { 243 if ((prevpg = h->prevpg) == P_INVALID) 244 break; 245 mpool_put(t->bt_mp, h, 0); 246 247 /* Move up the stack. */ 248 for (level = 0; (parent = BT_POP(t)) != NULL; ++level) { 249 /* Get the parent page. */ 250 if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL) 251 return (1); 252 253 /* Move to the next index. */ 254 if (parent->index != 0) { 255 idx = parent->index - 1; 256 BT_PUSH(t, h->pgno, idx); 257 break; 258 } 259 mpool_put(t->bt_mp, h, 0); 260 } 261 262 /* Restore the stack. */ 263 while (level--) { 264 /* Push the next level down onto the stack. */ 265 bi = GETBINTERNAL(h, idx); 266 pgno = bi->pgno; 267 268 /* Lose the currently pinned page. */ 269 mpool_put(t->bt_mp, h, 0); 270 271 /* Get the next level down. */ 272 if ((h = mpool_get(t->bt_mp, pgno, 0)) == NULL) 273 return (1); 274 275 idx = NEXTINDEX(h) - 1; 276 BT_PUSH(t, pgno, idx); 277 } 278 mpool_put(t->bt_mp, h, 0); 279 if ((h = mpool_get(t->bt_mp, prevpg, 0)) == NULL) 280 return (1); 281 } 282 283 284ret: mpool_put(t->bt_mp, h, 0); 285 return ((*hp = mpool_get(t->bt_mp, c->pg.pgno, 0)) == NULL); 286} 287 288/* 289 * __bt_bdelete -- 290 * Delete all key/data pairs matching the specified key. 291 * 292 * Parameters: 293 * t: tree 294 * key: key to delete 295 * 296 * Returns: 297 * RET_ERROR, RET_SUCCESS and RET_SPECIAL if the key not found. 298 */ 299static int 300__bt_bdelete(t, key) 301 BTREE *t; 302 const DBT *key; 303{ 304 EPG *e; 305 PAGE *h; 306 int deleted, exact, redo; 307 308 deleted = 0; 309 310 /* Find any matching record; __bt_search pins the page. */ 311loop: if ((e = __bt_search(t, key, &exact)) == NULL) 312 return (deleted ? RET_SUCCESS : RET_ERROR); 313 if (!exact) { 314 mpool_put(t->bt_mp, e->page, 0); 315 return (deleted ? RET_SUCCESS : RET_SPECIAL); 316 } 317 318 /* 319 * Delete forward, then delete backward, from the found key. If 320 * there are duplicates and we reach either side of the page, do 321 * the key search again, so that we get them all. 322 */ 323 redo = 0; 324 h = e->page; 325 do { 326 if (__bt_dleaf(t, key, h, (u_int)e->index)) { 327 mpool_put(t->bt_mp, h, 0); 328 return (RET_ERROR); 329 } 330 if (F_ISSET(t, B_NODUPS)) { 331 if (NEXTINDEX(h) == 0) { 332 if (__bt_pdelete(t, h)) 333 return (RET_ERROR); 334 } else 335 mpool_put(t->bt_mp, h, MPOOL_DIRTY); 336 return (RET_SUCCESS); 337 } 338 deleted = 1; 339 } while (e->index < NEXTINDEX(h) && __bt_cmp(t, key, e) == 0); 340 341 /* Check for right-hand edge of the page. */ 342 if (e->index == NEXTINDEX(h)) 343 redo = 1; 344 345 /* Delete from the key to the beginning of the page. */ 346 while (e->index-- > 0) { 347 if (__bt_cmp(t, key, e) != 0) 348 break; 349 if (__bt_dleaf(t, key, h, (u_int)e->index) == RET_ERROR) { 350 mpool_put(t->bt_mp, h, 0); 351 return (RET_ERROR); 352 } 353 if (e->index == 0) 354 redo = 1; 355 } 356 357 /* Check for an empty page. */ 358 if (NEXTINDEX(h) == 0) { 359 if (__bt_pdelete(t, h)) 360 return (RET_ERROR); 361 goto loop; 362 } 363 364 /* Put the page. */ 365 mpool_put(t->bt_mp, h, MPOOL_DIRTY); 366 367 if (redo) 368 goto loop; 369 return (RET_SUCCESS); 370} 371 372/* 373 * __bt_pdelete -- 374 * Delete a single page from the tree. 375 * 376 * Parameters: 377 * t: tree 378 * h: leaf page 379 * 380 * Returns: 381 * RET_SUCCESS, RET_ERROR. 382 * 383 * Side-effects: 384 * mpool_put's the page 385 */ 386static int 387__bt_pdelete(t, h) 388 BTREE *t; 389 PAGE *h; 390{ 391 BINTERNAL *bi; 392 PAGE *pg; 393 EPGNO *parent; 394 indx_t cnt, idx, *ip, offset; 395 u_int32_t nksize; 396 char *from; 397 398 /* 399 * Walk the parent page stack -- a LIFO stack of the pages that were 400 * traversed when we searched for the page where the delete occurred. 401 * Each stack entry is a page number and a page index offset. The 402 * offset is for the page traversed on the search. We've just deleted 403 * a page, so we have to delete the key from the parent page. 404 * 405 * If the delete from the parent page makes it empty, this process may 406 * continue all the way up the tree. We stop if we reach the root page 407 * (which is never deleted, it's just not worth the effort) or if the 408 * delete does not empty the page. 409 */ 410 while ((parent = BT_POP(t)) != NULL) { 411 /* Get the parent page. */ 412 if ((pg = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL) 413 return (RET_ERROR); 414 415 idx = parent->index; 416 bi = GETBINTERNAL(pg, idx); 417 418 /* Free any overflow pages. */ 419 if (bi->flags & P_BIGKEY && 420 __ovfl_delete(t, bi->bytes) == RET_ERROR) { 421 mpool_put(t->bt_mp, pg, 0); 422 return (RET_ERROR); 423 } 424 425 /* 426 * Free the parent if it has only the one key and it's not the 427 * root page. If it's the rootpage, turn it back into an empty 428 * leaf page. 429 */ 430 if (NEXTINDEX(pg) == 1) { 431 if (pg->pgno == P_ROOT) { 432 pg->lower = BTDATAOFF; 433 pg->upper = t->bt_psize; 434 pg->flags = P_BLEAF; 435 } else { 436 if (__bt_relink(t, pg) || __bt_free(t, pg)) 437 return (RET_ERROR); 438 continue; 439 } 440 } else { 441 /* Pack remaining key items at the end of the page. */ 442 nksize = NBINTERNAL(bi->ksize); 443 from = (char *)(void *)pg + pg->upper; 444 memmove(from + nksize, from, 445 (size_t)((char *)(void *)bi - from)); 446 pg->upper += nksize; 447 448 /* Adjust indices' offsets, shift the indices down. */ 449 offset = pg->linp[idx]; 450 for (cnt = idx, ip = &pg->linp[0]; cnt--; ++ip) 451 if (ip[0] < offset) 452 ip[0] += nksize; 453 for (cnt = NEXTINDEX(pg) - idx; --cnt; ++ip) 454 ip[0] = ip[1] < offset ? ip[1] + nksize : ip[1]; 455 pg->lower -= sizeof(indx_t); 456 } 457 458 mpool_put(t->bt_mp, pg, MPOOL_DIRTY); 459 break; 460 } 461 462 /* Free the leaf page, as long as it wasn't the root. */ 463 if (h->pgno == P_ROOT) { 464 mpool_put(t->bt_mp, h, MPOOL_DIRTY); 465 return (RET_SUCCESS); 466 } 467 return (__bt_relink(t, h) || __bt_free(t, h)); 468} 469 470/* 471 * __bt_dleaf -- 472 * Delete a single record from a leaf page. 473 * 474 * Parameters: 475 * t: tree 476 * key: referenced key 477 * h: page 478 * idx: index on page to delete 479 * 480 * Returns: 481 * RET_SUCCESS, RET_ERROR. 482 */ 483int 484__bt_dleaf(t, key, h, idx) 485 BTREE *t; 486 const DBT *key; 487 PAGE *h; 488 u_int idx; 489{ 490 BLEAF *bl; 491 indx_t cnt, *ip, offset; 492 u_int32_t nbytes; 493 void *to; 494 char *from; 495 496 /* If this record is referenced by the cursor, delete the cursor. */ 497 if (F_ISSET(&t->bt_cursor, CURS_INIT) && 498 !F_ISSET(&t->bt_cursor, CURS_ACQUIRE) && 499 t->bt_cursor.pg.pgno == h->pgno && t->bt_cursor.pg.index == idx && 500 __bt_curdel(t, key, h, idx)) 501 return (RET_ERROR); 502 503 /* If the entry uses overflow pages, make them available for reuse. */ 504 to = bl = GETBLEAF(h, idx); 505 if (bl->flags & P_BIGKEY && __ovfl_delete(t, bl->bytes) == RET_ERROR) 506 return (RET_ERROR); 507 if (bl->flags & P_BIGDATA && 508 __ovfl_delete(t, bl->bytes + bl->ksize) == RET_ERROR) 509 return (RET_ERROR); 510 511 /* Pack the remaining key/data items at the end of the page. */ 512 nbytes = NBLEAF(bl); 513 from = (char *)(void *)h + h->upper; 514 memmove(from + nbytes, from, (size_t)((char *)(void *)to - from)); 515 h->upper += nbytes; 516 517 /* Adjust the indices' offsets, shift the indices down. */ 518 offset = h->linp[idx]; 519 for (cnt = idx, ip = &h->linp[0]; cnt--; ++ip) 520 if (ip[0] < offset) 521 ip[0] += nbytes; 522 for (cnt = NEXTINDEX(h) - idx; --cnt; ++ip) 523 ip[0] = ip[1] < offset ? ip[1] + nbytes : ip[1]; 524 h->lower -= sizeof(indx_t); 525 526 /* If the cursor is on this page, adjust it as necessary. */ 527 if (F_ISSET(&t->bt_cursor, CURS_INIT) && 528 !F_ISSET(&t->bt_cursor, CURS_ACQUIRE) && 529 t->bt_cursor.pg.pgno == h->pgno && t->bt_cursor.pg.index > idx) 530 --t->bt_cursor.pg.index; 531 532 return (RET_SUCCESS); 533} 534 535/* 536 * __bt_curdel -- 537 * Delete the cursor. 538 * 539 * Parameters: 540 * t: tree 541 * key: referenced key (or NULL) 542 * h: page 543 * idx: index on page to delete 544 * 545 * Returns: 546 * RET_SUCCESS, RET_ERROR. 547 */ 548static int 549__bt_curdel(t, key, h, idx) 550 BTREE *t; 551 const DBT *key; 552 PAGE *h; 553 u_int idx; 554{ 555 CURSOR *c; 556 EPG e; 557 PAGE *pg; 558 int curcopy, status; 559 560 /* 561 * If there are duplicates, move forward or backward to one. 562 * Otherwise, copy the key into the cursor area. 563 */ 564 c = &t->bt_cursor; 565 F_CLR(c, CURS_AFTER | CURS_BEFORE | CURS_ACQUIRE); 566 567 curcopy = 0; 568 if (!F_ISSET(t, B_NODUPS)) { 569 /* 570 * We're going to have to do comparisons. If we weren't 571 * provided a copy of the key, i.e. the user is deleting 572 * the current cursor position, get one. 573 */ 574 if (key == NULL) { 575 e.page = h; 576 e.index = idx; 577 if ((status = __bt_ret(t, &e, 578 &c->key, &c->key, NULL, NULL, 1)) != RET_SUCCESS) 579 return (status); 580 curcopy = 1; 581 key = &c->key; 582 } 583 /* Check previous key, if not at the beginning of the page. */ 584 if (idx > 0) { 585 e.page = h; 586 e.index = idx - 1; 587 if (__bt_cmp(t, key, &e) == 0) { 588 F_SET(c, CURS_BEFORE); 589 goto dup2; 590 } 591 } 592 /* Check next key, if not at the end of the page. */ 593 if (idx < NEXTINDEX(h) - 1) { 594 e.page = h; 595 e.index = idx + 1; 596 if (__bt_cmp(t, key, &e) == 0) { 597 F_SET(c, CURS_AFTER); 598 goto dup2; 599 } 600 } 601 /* Check previous key if at the beginning of the page. */ 602 if (idx == 0 && h->prevpg != P_INVALID) { 603 if ((pg = mpool_get(t->bt_mp, h->prevpg, 0)) == NULL) 604 return (RET_ERROR); 605 e.page = pg; 606 e.index = NEXTINDEX(pg) - 1; 607 if (__bt_cmp(t, key, &e) == 0) { 608 F_SET(c, CURS_BEFORE); 609 goto dup1; 610 } 611 mpool_put(t->bt_mp, pg, 0); 612 } 613 /* Check next key if at the end of the page. */ 614 if (idx == NEXTINDEX(h) - 1 && h->nextpg != P_INVALID) { 615 if ((pg = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL) 616 return (RET_ERROR); 617 e.page = pg; 618 e.index = 0; 619 if (__bt_cmp(t, key, &e) == 0) { 620 F_SET(c, CURS_AFTER); 621dup1: mpool_put(t->bt_mp, pg, 0); 622dup2: c->pg.pgno = e.page->pgno; 623 c->pg.index = e.index; 624 return (RET_SUCCESS); 625 } 626 mpool_put(t->bt_mp, pg, 0); 627 } 628 } 629 e.page = h; 630 e.index = idx; 631 if (curcopy || (status = 632 __bt_ret(t, &e, &c->key, &c->key, NULL, NULL, 1)) == RET_SUCCESS) { 633 F_SET(c, CURS_ACQUIRE); 634 return (RET_SUCCESS); 635 } 636 return (status); 637} 638 639/* 640 * __bt_relink -- 641 * Link around a deleted page. 642 * 643 * Parameters: 644 * t: tree 645 * h: page to be deleted 646 */ 647static int 648__bt_relink(t, h) 649 BTREE *t; 650 PAGE *h; 651{ 652 PAGE *pg; 653 654 if (h->nextpg != P_INVALID) { 655 if ((pg = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL) 656 return (RET_ERROR); 657 pg->prevpg = h->prevpg; 658 mpool_put(t->bt_mp, pg, MPOOL_DIRTY); 659 } 660 if (h->prevpg != P_INVALID) { 661 if ((pg = mpool_get(t->bt_mp, h->prevpg, 0)) == NULL) 662 return (RET_ERROR); 663 pg->nextpg = h->nextpg; 664 mpool_put(t->bt_mp, pg, MPOOL_DIRTY); 665 } 666 return (0); 667} 668