1/* $NetBSD: bt_seq.c,v 1.20 2016/09/24 21:31:25 christos Exp $ */ 2 3/*- 4 * Copyright (c) 1990, 1993, 1994 5 * The Regents of the University of California. All rights reserved. 6 * 7 * This code is derived from software contributed to Berkeley by 8 * Mike Olson. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 35#if HAVE_NBTOOL_CONFIG_H 36#include "nbtool_config.h" 37#endif 38 39#include <sys/cdefs.h> 40__RCSID("$NetBSD: bt_seq.c,v 1.20 2016/09/24 21:31:25 christos Exp $"); 41 42#include "namespace.h" 43#include <sys/types.h> 44 45#include <assert.h> 46#include <errno.h> 47#include <stddef.h> 48#include <stdio.h> 49#include <stdlib.h> 50 51#include <db.h> 52#include "btree.h" 53 54static int __bt_first(BTREE *, const DBT *, EPG *, int *); 55static int __bt_seqadv(BTREE *, EPG *, int); 56static int __bt_seqset(BTREE *, EPG *, DBT *, int); 57static int __bt_rseq_next(BTREE *, EPG *); 58static int __bt_rseq_prev(BTREE *, EPG *); 59 60/* 61 * Sequential scan support. 62 * 63 * The tree can be scanned sequentially, starting from either end of the 64 * tree or from any specific key. A scan request before any scanning is 65 * done is initialized as starting from the least node. 66 */ 67 68/* 69 * __bt_seq -- 70 * Btree sequential scan interface. 71 * 72 * Parameters: 73 * dbp: pointer to access method 74 * key: key for positioning and return value 75 * data: data return value 76 * flags: R_CURSOR, R_FIRST, R_LAST, R_NEXT, R_PREV, R_RNEXT, R_RPREV. 77 * 78 * Returns: 79 * RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key. 80 */ 81int 82__bt_seq(const DB *dbp, DBT *key, DBT *data, u_int flags) 83{ 84 BTREE *t; 85 EPG e; 86 int status; 87 88 t = dbp->internal; 89 90 /* Toss any page pinned across calls. */ 91 if (t->bt_pinned != NULL) { 92 mpool_put(t->bt_mp, t->bt_pinned, 0); 93 t->bt_pinned = NULL; 94 } 95 96 /* 97 * If scan uninitialized as yet, or starting at a specific record, set 98 * the scan to a specific key. Both __bt_seqset and __bt_seqadv pin 99 * the page the cursor references if they're successful. 100 */ 101 switch (flags) { 102 case R_NEXT: 103 case R_PREV: 104 case R_RNEXT: 105 case R_RPREV: 106 if (F_ISSET(&t->bt_cursor, CURS_INIT)) { 107 status = __bt_seqadv(t, &e, (int)flags); 108 break; 109 } 110 /* FALLTHROUGH */ 111 case R_FIRST: 112 case R_LAST: 113 case R_CURSOR: 114 status = __bt_seqset(t, &e, key, (int)flags); 115 break; 116 default: 117 errno = EINVAL; 118 return (RET_ERROR); 119 } 120 121 if (status == RET_SUCCESS) { 122 __bt_setcur(t, e.page->pgno, (u_int)e.index); 123 124 status = 125 __bt_ret(t, &e, key, &t->bt_rkey, data, &t->bt_rdata, 0); 126 127 /* 128 * If the user is doing concurrent access, we copied the 129 * key/data, toss the page. 130 */ 131 if (F_ISSET(t, B_DB_LOCK)) 132 mpool_put(t->bt_mp, e.page, 0); 133 else 134 t->bt_pinned = e.page; 135 } 136 return (status); 137} 138 139/* 140 * __bt_seqset -- 141 * Set the sequential scan to a specific key. 142 * 143 * Parameters: 144 * t: tree 145 * ep: storage for returned key 146 * key: key for initial scan position 147 * flags: R_CURSOR, R_FIRST, R_LAST, R_NEXT, R_PREV, R_RNEXT, R_RPREV. 148 * 149 * Side effects: 150 * Pins the page the cursor references. 151 * 152 * Returns: 153 * RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key. 154 */ 155static int 156__bt_seqset(BTREE *t, EPG *ep, DBT *key, int flags) 157{ 158 PAGE *h; 159 pgno_t pg; 160 int exact; 161 162 /* 163 * Find the first, last or specific key in the tree and point the 164 * cursor at it. The cursor may not be moved until a new key has 165 * been found. 166 */ 167 switch (flags) { 168 case R_CURSOR: /* Keyed scan. */ 169 /* 170 * Find the first instance of the key or the smallest key 171 * which is greater than or equal to the specified key. 172 */ 173 if (key->data == NULL || key->size == 0) { 174 errno = EINVAL; 175 return (RET_ERROR); 176 } 177 return (__bt_first(t, key, ep, &exact)); 178 case R_FIRST: /* First record. */ 179 case R_NEXT: 180 case R_RNEXT: 181 BT_CLR(t); 182 /* Walk down the left-hand side of the tree. */ 183 for (pg = P_ROOT;;) { 184 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) 185 return (RET_ERROR); 186 187 /* Check for an empty tree. */ 188 if (NEXTINDEX(h) == 0) { 189 mpool_put(t->bt_mp, h, 0); 190 return (RET_SPECIAL); 191 } 192 193 if (h->flags & (P_BLEAF | P_RLEAF)) 194 break; 195 pg = GETBINTERNAL(h, 0)->pgno; 196 BT_PUSH(t, h->pgno, 0); 197 mpool_put(t->bt_mp, h, 0); 198 } 199 ep->page = h; 200 ep->index = 0; 201 break; 202 case R_LAST: /* Last record. */ 203 case R_PREV: 204 case R_RPREV: 205 BT_CLR(t); 206 /* Walk down the right-hand side of the tree. */ 207 for (pg = P_ROOT;;) { 208 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) 209 return (RET_ERROR); 210 211 /* Check for an empty tree. */ 212 if (NEXTINDEX(h) == 0) { 213 mpool_put(t->bt_mp, h, 0); 214 return (RET_SPECIAL); 215 } 216 217 if (h->flags & (P_BLEAF | P_RLEAF)) 218 break; 219 pg = GETBINTERNAL(h, NEXTINDEX(h) - 1)->pgno; 220 BT_PUSH(t, h->pgno, NEXTINDEX(h) - 1); 221 mpool_put(t->bt_mp, h, 0); 222 } 223 224 ep->page = h; 225 ep->index = NEXTINDEX(h) - 1; 226 break; 227 } 228 return (RET_SUCCESS); 229} 230 231/* 232 * __bt_seqadvance -- 233 * Advance the sequential scan. 234 * 235 * Parameters: 236 * t: tree 237 * flags: R_NEXT, R_PREV, R_RNEXT, R_RPREV 238 * 239 * Side effects: 240 * Pins the page the new key/data record is on. 241 * 242 * Returns: 243 * RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key. 244 */ 245static int 246__bt_seqadv(BTREE *t, EPG *ep, int flags) 247{ 248 CURSOR *c; 249 PAGE *h; 250 indx_t idx = 0; /* pacify gcc */ 251 pgno_t pg; 252 int exact, rval; 253 254 /* 255 * There are a couple of states that we can be in. The cursor has 256 * been initialized by the time we get here, but that's all we know. 257 */ 258 c = &t->bt_cursor; 259 260 /* 261 * The cursor was deleted and there weren't any duplicate records, 262 * so the cursor's key was saved. Find out where that key would 263 * be in the current tree. If the returned key is an exact match, 264 * it means that a key/data pair was inserted into the tree after 265 * the delete. We could reasonably return the key, but the problem 266 * is that this is the access pattern we'll see if the user is 267 * doing seq(..., R_NEXT)/put(..., 0) pairs, i.e. the put deletes 268 * the cursor record and then replaces it, so the cursor was saved, 269 * and we'll simply return the same "new" record until the user 270 * notices and doesn't do a put() of it. Since the key is an exact 271 * match, we could as easily put the new record before the cursor, 272 * and we've made no guarantee to return it. So, move forward or 273 * back a record if it's an exact match. 274 * 275 * XXX 276 * In the current implementation, put's to the cursor are done with 277 * delete/add pairs. This has two consequences. First, it means 278 * that seq(..., R_NEXT)/put(..., R_CURSOR) pairs are going to exhibit 279 * the same behavior as above. Second, you can return the same key 280 * twice if you have duplicate records. The scenario is that the 281 * cursor record is deleted, moving the cursor forward or backward 282 * to a duplicate. The add then inserts the new record at a location 283 * ahead of the cursor because duplicates aren't sorted in any way, 284 * and the new record is later returned. This has to be fixed at some 285 * point. 286 */ 287 if (F_ISSET(c, CURS_ACQUIRE)) { 288 if ((rval = __bt_first(t, &c->key, ep, &exact)) == RET_ERROR) 289 return RET_ERROR; 290 if (!exact) 291 return rval; 292 /* 293 * XXX 294 * Kluge -- get, release, get the page. 295 */ 296 c->pg.pgno = ep->page->pgno; 297 c->pg.index = ep->index; 298 mpool_put(t->bt_mp, ep->page, 0); 299 } 300 301 /* Get the page referenced by the cursor. */ 302 if ((h = mpool_get(t->bt_mp, c->pg.pgno, 0)) == NULL) 303 return (RET_ERROR); 304 305 /* 306 * Find the next/previous record in the tree and point the cursor at 307 * it. The cursor may not be moved until a new key has been found. 308 */ 309 switch (flags) { 310 case R_NEXT: /* Next record. */ 311 case R_RNEXT: 312 /* 313 * The cursor was deleted in duplicate records, and moved 314 * forward to a record that has yet to be returned. Clear 315 * that flag, and return the record. 316 */ 317 if (F_ISSET(c, CURS_AFTER)) 318 goto usecurrent; 319 idx = c->pg.index; 320 if (++idx == NEXTINDEX(h)) { 321 if (flags == R_RNEXT) { 322 ep->page = h; 323 ep->index = idx; 324 return __bt_rseq_next(t, ep); 325 } 326 pg = h->nextpg; 327 mpool_put(t->bt_mp, h, 0); 328 if (pg == P_INVALID) 329 return RET_SPECIAL; 330 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) 331 return RET_ERROR; 332 idx = 0; 333 } 334 break; 335 case R_PREV: /* Previous record. */ 336 case R_RPREV: 337 /* 338 * The cursor was deleted in duplicate records, and moved 339 * backward to a record that has yet to be returned. Clear 340 * that flag, and return the record. 341 */ 342 if (F_ISSET(c, CURS_BEFORE)) { 343usecurrent: F_CLR(c, CURS_AFTER | CURS_BEFORE); 344 ep->page = h; 345 ep->index = c->pg.index; 346 return (RET_SUCCESS); 347 } 348 idx = c->pg.index; 349 if (idx == 0) { 350 if (flags == R_RPREV) { 351 ep->page = h; 352 ep->index = idx; 353 return __bt_rseq_prev(t, ep); 354 } 355 pg = h->prevpg; 356 mpool_put(t->bt_mp, h, 0); 357 if (pg == P_INVALID) 358 return RET_SPECIAL; 359 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) 360 return RET_ERROR; 361 idx = NEXTINDEX(h) - 1; 362 } else 363 --idx; 364 break; 365 } 366 367 ep->page = h; 368 ep->index = idx; 369 return (RET_SUCCESS); 370} 371/* 372 * Get the first item on the next page, but by going up and down the tree. 373 */ 374static int 375__bt_rseq_next(BTREE *t, EPG *ep) 376{ 377 PAGE *h; 378 indx_t idx; 379 EPGNO *up; 380 pgno_t pg; 381 382 h = ep->page; 383 idx = ep->index; 384 do { 385 /* Move up the tree. */ 386 up = BT_POP(t); 387 mpool_put(t->bt_mp, h, 0); 388 /* Did we hit the right edge of the root? */ 389 if (up == NULL) 390 return RET_SPECIAL; 391 if ((h = mpool_get(t->bt_mp, up->pgno, 0)) == NULL) 392 return RET_ERROR; 393 idx = up->index; 394 } while (++idx == NEXTINDEX(h)); 395 396 while (!(h->flags & (P_BLEAF | P_RLEAF))) { 397 /* Move back down the tree. */ 398 BT_PUSH(t, h->pgno, idx); 399 pg = GETBINTERNAL(h, idx)->pgno; 400 mpool_put(t->bt_mp, h, 0); 401 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) 402 return RET_ERROR; 403 idx = 0; 404 } 405 ep->page = h; 406 ep->index = idx; 407 return RET_SUCCESS; 408} 409 410/* 411 * Get the last item on the previous page, but by going up and down the tree. 412 */ 413static int 414__bt_rseq_prev(BTREE *t, EPG *ep) 415{ 416 PAGE *h; 417 indx_t idx; 418 EPGNO *up; 419 pgno_t pg; 420 421 h = ep->page; 422 idx = ep->index; 423 do { 424 /* Move up the tree. */ 425 up = BT_POP(t); 426 mpool_put(t->bt_mp, h, 0); 427 /* Did we hit the left edge of the root? */ 428 if (up == NULL) 429 return RET_SPECIAL; 430 if ((h = mpool_get(t->bt_mp, up->pgno, 0)) == NULL) 431 return RET_ERROR; 432 idx = up->index; 433 } while (idx == 0); 434 --idx; 435 while (!(h->flags & (P_BLEAF | P_RLEAF))) { 436 /* Move back down the tree. */ 437 BT_PUSH(t, h->pgno, idx); 438 pg = GETBINTERNAL(h, idx)->pgno; 439 mpool_put(t->bt_mp, h, 0); 440 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) 441 return RET_ERROR; 442 idx = NEXTINDEX(h) - 1; 443 } 444 ep->page = h; 445 ep->index = idx; 446 return RET_SUCCESS; 447} 448 449/* 450 * __bt_first -- 451 * Find the first entry. 452 * 453 * Parameters: 454 * t: the tree 455 * key: the key 456 * erval: return EPG 457 * exactp: pointer to exact match flag 458 * 459 * Returns: 460 * The first entry in the tree greater than or equal to key, 461 * or RET_SPECIAL if no such key exists. 462 */ 463static int 464__bt_first(BTREE *t, const DBT *key, EPG *erval, int *exactp) 465{ 466 PAGE *h, *hprev; 467 EPG *ep, save; 468 pgno_t pg; 469 470 /* 471 * Find any matching record; __bt_search pins the page. 472 * 473 * If it's an exact match and duplicates are possible, walk backwards 474 * in the tree until we find the first one. Otherwise, make sure it's 475 * a valid key (__bt_search may return an index just past the end of a 476 * page) and return it. 477 */ 478 if ((ep = __bt_search(t, key, exactp)) == NULL) 479 return RET_SPECIAL; 480 if (*exactp) { 481 if (F_ISSET(t, B_NODUPS)) { 482 *erval = *ep; 483 return (RET_SUCCESS); 484 } 485 486 /* 487 * Walk backwards, as long as the entry matches and there are 488 * keys left in the tree. Save a copy of each match in case 489 * we go too far. 490 */ 491 save = *ep; 492 h = ep->page; 493 do { 494 if (save.page->pgno != ep->page->pgno) { 495 mpool_put(t->bt_mp, save.page, 0); 496 save = *ep; 497 } else 498 save.index = ep->index; 499 500 /* 501 * Don't unpin the page the last (or original) match 502 * was on, but make sure it's unpinned if an error 503 * occurs. 504 */ 505 if (ep->index == 0) { 506 if (h->prevpg == P_INVALID) 507 break; 508 if (h->pgno != save.page->pgno) 509 mpool_put(t->bt_mp, h, 0); 510 if ((hprev = mpool_get(t->bt_mp, 511 h->prevpg, 0)) == NULL) { 512 if (h->pgno == save.page->pgno) 513 mpool_put(t->bt_mp, 514 save.page, 0); 515 return RET_ERROR; 516 } 517 ep->page = h = hprev; 518 ep->index = NEXTINDEX(h); 519 } 520 --ep->index; 521 } while (__bt_cmp(t, key, ep) == 0); 522 523 /* 524 * Reach here with the last page that was looked at pinned, 525 * which may or may not be the same as the last (or original) 526 * match page. If it's not useful, release it. 527 */ 528 if (h->pgno != save.page->pgno) 529 mpool_put(t->bt_mp, h, 0); 530 531 *erval = save; 532 return (RET_SUCCESS); 533 } 534 535 /* If at the end of a page, find the next entry. */ 536 if (ep->index == NEXTINDEX(ep->page)) { 537 h = ep->page; 538 pg = h->nextpg; 539 mpool_put(t->bt_mp, h, 0); 540 if (pg == P_INVALID) 541 return (RET_SPECIAL); 542 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) 543 return (RET_ERROR); 544 ep->index = 0; 545 ep->page = h; 546 } 547 *erval = *ep; 548 return (RET_SUCCESS); 549} 550 551/* 552 * __bt_setcur -- 553 * Set the cursor to an entry in the tree. 554 * 555 * Parameters: 556 * t: the tree 557 * pgno: page number 558 * idx: page index 559 */ 560void 561__bt_setcur(BTREE *t, pgno_t pgno, u_int idx) 562{ 563 /* Lose any already deleted key. */ 564 if (t->bt_cursor.key.data != NULL) { 565 free(t->bt_cursor.key.data); 566 t->bt_cursor.key.size = 0; 567 t->bt_cursor.key.data = NULL; 568 } 569 F_CLR(&t->bt_cursor, CURS_ACQUIRE | CURS_AFTER | CURS_BEFORE); 570 571 /* Update the cursor. */ 572 t->bt_cursor.pg.pgno = pgno; 573 t->bt_cursor.pg.index = idx; 574 F_SET(&t->bt_cursor, CURS_INIT); 575} 576