1/*- 2 * See the file LICENSE for redistribution information. 3 * 4 * Copyright (c) 1996-2009 Oracle. All rights reserved. 5 */ 6/* 7 * Copyright (c) 1990, 1993, 1994, 1995, 1996 8 * Keith Bostic. All rights reserved. 9 */ 10/* 11 * Copyright (c) 1990, 1993 12 * The Regents of the University of California. All rights reserved. 13 * 14 * Redistribution and use in source and binary forms, with or without 15 * modification, are permitted provided that the following conditions 16 * are met: 17 * 1. Redistributions of source code must retain the above copyright 18 * notice, this list of conditions and the following disclaimer. 19 * 2. Redistributions in binary form must reproduce the above copyright 20 * notice, this list of conditions and the following disclaimer in the 21 * documentation and/or other materials provided with the distribution. 22 * 3. Neither the name of the University nor the names of its contributors 23 * may be used to endorse or promote products derived from this software 24 * without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36 * SUCH DAMAGE. 37 * 38 * $Id$ 39 */ 40 41#include "db_config.h" 42 43#include "db_int.h" 44#include "dbinc/db_page.h" 45#include "dbinc/btree.h" 46#include "dbinc/lock.h" 47#include "dbinc/mp.h" 48 49/* 50 * __bam_rsearch -- 51 * Search a btree for a record number. 52 * 53 * PUBLIC: int __bam_rsearch __P((DBC *, db_recno_t *, u_int32_t, int, int *)); 54 */ 55int 56__bam_rsearch(dbc, recnop, flags, stop, exactp) 57 DBC *dbc; 58 db_recno_t *recnop; 59 u_int32_t flags; 60 int stop, *exactp; 61{ 62 BINTERNAL *bi; 63 BTREE_CURSOR *cp; 64 DB *dbp; 65 DB_LOCK lock; 66 DB_MPOOLFILE *mpf; 67 ENV *env; 68 PAGE *h; 69 RINTERNAL *ri; 70 db_indx_t adjust, deloffset, indx, top; 71 db_lockmode_t lock_mode; 72 db_pgno_t pg; 73 db_recno_t recno, t_recno, total; 74 u_int32_t get_mode; 75 int ret, stack, t_ret; 76 77 dbp = dbc->dbp; 78 env = dbp->env; 79 mpf = dbp->mpf; 80 cp = (BTREE_CURSOR *)dbc->internal; 81 h = NULL; 82 83 BT_STK_CLR(cp); 84 85 /* 86 * There are several ways we search a btree tree. The flags argument 87 * specifies if we're acquiring read or write locks and if we are 88 * locking pairs of pages. In addition, if we're adding or deleting 89 * an item, we have to lock the entire tree, regardless. See btree.h 90 * for more details. 91 * 92 * If write-locking pages, we need to know whether or not to acquire a 93 * write lock on a page before getting it. This depends on how deep it 94 * is in tree, which we don't know until we acquire the root page. So, 95 * if we need to lock the root page we may have to upgrade it later, 96 * because we won't get the correct lock initially. 97 * 98 * Retrieve the root page. 99 */ 100 101 if ((ret = __bam_get_root(dbc, cp->root, stop, flags, &stack)) != 0) 102 return (ret); 103 lock_mode = cp->csp->lock_mode; 104 get_mode = lock_mode == DB_LOCK_WRITE ? DB_MPOOL_DIRTY : 0; 105 lock = cp->csp->lock; 106 h = cp->csp->page; 107 108 BT_STK_CLR(cp); 109 /* 110 * If appending to the tree, set the record number now -- we have the 111 * root page locked. 112 * 113 * Delete only deletes exact matches, read only returns exact matches. 114 * Note, this is different from __bam_search(), which returns non-exact 115 * matches for read. 116 * 117 * The record may not exist. We can only return the correct location 118 * for the record immediately after the last record in the tree, so do 119 * a fast check now. 120 */ 121 total = RE_NREC(h); 122 if (LF_ISSET(SR_APPEND)) { 123 *exactp = 0; 124 *recnop = recno = total + 1; 125 } else { 126 recno = *recnop; 127 if (recno <= total) 128 *exactp = 1; 129 else { 130 *exactp = 0; 131 if (!LF_ISSET(SR_PAST_EOF) || recno > total + 1) { 132 /* 133 * Keep the page locked for serializability. 134 * 135 * XXX 136 * This leaves the root page locked, which will 137 * eliminate any concurrency. A possible fix 138 * would be to lock the last leaf page instead. 139 */ 140 ret = __memp_fput(mpf, 141 dbc->thread_info, h, dbc->priority); 142 if ((t_ret = 143 __TLPUT(dbc, lock)) != 0 && ret == 0) 144 ret = t_ret; 145 return (ret == 0 ? DB_NOTFOUND : ret); 146 } 147 } 148 } 149 150 /* 151 * !!! 152 * Record numbers in the tree are 0-based, but the recno is 153 * 1-based. All of the calculations below have to take this 154 * into account. 155 */ 156 for (total = 0;;) { 157 switch (TYPE(h)) { 158 case P_LBTREE: 159 if (LF_ISSET(SR_MAX)) { 160 indx = NUM_ENT(h) - 2; 161 goto enter; 162 } 163 /* FALLTHROUGH */ 164 case P_LDUP: 165 if (LF_ISSET(SR_MAX)) { 166 indx = NUM_ENT(h) - 1; 167 goto enter; 168 } 169 recno -= total; 170 /* 171 * There may be logically deleted records on the page. 172 * If there are enough, the record may not exist. 173 */ 174 if (TYPE(h) == P_LBTREE) { 175 adjust = P_INDX; 176 deloffset = O_INDX; 177 } else { 178 adjust = O_INDX; 179 deloffset = 0; 180 } 181 for (t_recno = 0, indx = 0;; indx += adjust) { 182 if (indx >= NUM_ENT(h)) { 183 *exactp = 0; 184 if (!LF_ISSET(SR_PAST_EOF) || 185 recno > t_recno + 1) { 186 ret = __memp_fput(mpf, 187 dbc->thread_info, 188 h, dbc->priority); 189 h = NULL; 190 if ((t_ret = __TLPUT(dbc, 191 lock)) != 0 && ret == 0) 192 ret = t_ret; 193 if (ret == 0) 194 ret = DB_NOTFOUND; 195 goto err; 196 } 197 } 198 if (!B_DISSET(GET_BKEYDATA(dbp, h, 199 indx + deloffset)->type) && 200 ++t_recno == recno) 201 break; 202 } 203 204 BT_STK_ENTER(env, cp, h, indx, lock, lock_mode, ret); 205 if (ret != 0) 206 goto err; 207 if (LF_ISSET(SR_BOTH)) 208 goto get_prev; 209 return (0); 210 case P_IBTREE: 211 if (LF_ISSET(SR_MAX)) { 212 indx = NUM_ENT(h); 213 bi = GET_BINTERNAL(dbp, h, indx - 1); 214 } else for (indx = 0, top = NUM_ENT(h);;) { 215 bi = GET_BINTERNAL(dbp, h, indx); 216 if (++indx == top || total + bi->nrecs >= recno) 217 break; 218 total += bi->nrecs; 219 } 220 pg = bi->pgno; 221 break; 222 case P_LRECNO: 223 if (LF_ISSET(SR_MAX)) 224 recno = NUM_ENT(h); 225 else 226 recno -= total; 227 228 /* Correct from 1-based to 0-based for a page offset. */ 229 --recno; 230enter: BT_STK_ENTER(env, cp, h, recno, lock, lock_mode, ret); 231 if (ret != 0) 232 goto err; 233 if (LF_ISSET(SR_BOTH)) { 234get_prev: DB_ASSERT(env, LF_ISSET(SR_NEXT)); 235 /* 236 * We have a NEXT tree, now add the sub tree 237 * that points gets to the previous page. 238 */ 239 cp->csp++; 240 indx = cp->sp->indx - 1; 241 h = cp->sp->page; 242 if (TYPE(h) == P_IRECNO) { 243 ri = GET_RINTERNAL(dbp, h, indx); 244 pg = ri->pgno; 245 } else { 246 DB_ASSERT(env, TYPE(h) == P_IBTREE); 247 bi = GET_BINTERNAL(dbp, h, indx); 248 pg = bi->pgno; 249 } 250 LF_CLR(SR_NEXT | SR_BOTH); 251 LF_SET(SR_MAX); 252 stack = 1; 253 h = NULL; 254 goto lock_next; 255 } 256 return (0); 257 case P_IRECNO: 258 if (LF_ISSET(SR_MAX)) { 259 indx = NUM_ENT(h); 260 ri = GET_RINTERNAL(dbp, h, indx - 1); 261 } else for (indx = 0, top = NUM_ENT(h);;) { 262 ri = GET_RINTERNAL(dbp, h, indx); 263 if (++indx == top || total + ri->nrecs >= recno) 264 break; 265 total += ri->nrecs; 266 } 267 pg = ri->pgno; 268 break; 269 default: 270 return (__db_pgfmt(env, h->pgno)); 271 } 272 --indx; 273 274 /* Return if this is the lowest page wanted. */ 275 if (stop == LEVEL(h)) { 276 BT_STK_ENTER(env, cp, h, indx, lock, lock_mode, ret); 277 if (ret != 0) 278 goto err; 279 return (0); 280 } 281 if (stack) { 282 BT_STK_PUSH(env, cp, h, indx, lock, lock_mode, ret); 283 if (ret != 0) 284 goto err; 285 h = NULL; 286 287 lock_mode = DB_LOCK_WRITE; 288 get_mode = DB_MPOOL_DIRTY; 289 if ((ret = 290 __db_lget(dbc, 0, pg, lock_mode, 0, &lock)) != 0) 291 goto err; 292 } else if (LF_ISSET(SR_NEXT)) { 293 /* 294 * For RECNO if we are doing a NEXT search the 295 * search recno is the one we are looking for 296 * but we want to keep the stack from the spanning 297 * node on down. We only know we have the spanning 298 * node when its child's index is 0, so save 299 * each node and discard the tree when we find out 300 * its not needed. 301 */ 302 if (indx != 0 && cp->sp->page != NULL) { 303 BT_STK_POP(cp); 304 if ((ret = __bam_stkrel(dbc, STK_NOLOCK)) != 0) 305 goto err; 306 } 307 308 BT_STK_PUSH(env, cp, h, indx, lock, lock_mode, ret); 309 h = NULL; 310 if (ret != 0) 311 goto err; 312lock_next: if ((ret = 313 __db_lget(dbc, 0, pg, lock_mode, 0, &lock)) != 0) 314 goto err; 315 } else { 316 /* 317 * Decide if we want to return a pointer to the next 318 * page in the stack. If we do, write lock it and 319 * never unlock it. 320 */ 321 if ((LF_ISSET(SR_PARENT) && 322 (u_int8_t)(stop + 1) >= (u_int8_t)(LEVEL(h) - 1)) || 323 (LEVEL(h) - 1) == LEAFLEVEL) 324 stack = 1; 325 326 if ((ret = __memp_fput(mpf, 327 dbc->thread_info, h, dbc->priority)) != 0) 328 goto err; 329 h = NULL; 330 331 lock_mode = stack && 332 LF_ISSET(SR_WRITE) ? DB_LOCK_WRITE : DB_LOCK_READ; 333 if (lock_mode == DB_LOCK_WRITE) 334 get_mode = DB_MPOOL_DIRTY; 335 if ((ret = __db_lget(dbc, 336 LCK_COUPLE_ALWAYS, pg, lock_mode, 0, &lock)) != 0) { 337 /* 338 * If we fail, discard the lock we held. This 339 * is OK because this only happens when we are 340 * descending the tree holding read-locks. 341 */ 342 (void)__LPUT(dbc, lock); 343 goto err; 344 } 345 } 346 347 if ((ret = __memp_fget(mpf, &pg, 348 dbc->thread_info, dbc->txn, get_mode, &h)) != 0) 349 goto err; 350 } 351 /* NOTREACHED */ 352 353err: if (h != NULL && (t_ret = __memp_fput(mpf, 354 dbc->thread_info, h, dbc->priority)) != 0 && ret == 0) 355 ret = t_ret; 356 357 BT_STK_POP(cp); 358 (void)__bam_stkrel(dbc, 0); 359 360 return (ret); 361} 362 363/* 364 * __bam_adjust -- 365 * Adjust the tree after adding or deleting a record. 366 * 367 * PUBLIC: int __bam_adjust __P((DBC *, int32_t)); 368 */ 369int 370__bam_adjust(dbc, adjust) 371 DBC *dbc; 372 int32_t adjust; 373{ 374 BTREE_CURSOR *cp; 375 DB *dbp; 376 DB_MPOOLFILE *mpf; 377 EPG *epg; 378 PAGE *h; 379 db_pgno_t root_pgno; 380 int ret; 381 382 dbp = dbc->dbp; 383 mpf = dbp->mpf; 384 cp = (BTREE_CURSOR *)dbc->internal; 385 root_pgno = cp->root; 386 387 /* Update the record counts for the tree. */ 388 for (epg = cp->sp; epg <= cp->csp; ++epg) { 389 h = epg->page; 390 if (TYPE(h) == P_IBTREE || TYPE(h) == P_IRECNO) { 391 ret = __memp_dirty(mpf, &h, 392 dbc->thread_info, dbc->txn, dbc->priority, 0); 393 epg->page = h; 394 if (ret != 0) 395 return (ret); 396 if (DBC_LOGGING(dbc)) { 397 if ((ret = __bam_cadjust_log(dbp, dbc->txn, 398 &LSN(h), 0, PGNO(h), &LSN(h), 399 (u_int32_t)epg->indx, adjust, 400 PGNO(h) == root_pgno ? 401 CAD_UPDATEROOT : 0)) != 0) 402 return (ret); 403 } else 404 LSN_NOT_LOGGED(LSN(h)); 405 406 if (TYPE(h) == P_IBTREE) 407 GET_BINTERNAL(dbp, h, epg->indx)->nrecs += 408 adjust; 409 else 410 GET_RINTERNAL(dbp, h, epg->indx)->nrecs += 411 adjust; 412 413 if (PGNO(h) == root_pgno) 414 RE_NREC_ADJ(h, adjust); 415 } 416 } 417 return (0); 418} 419 420/* 421 * __bam_nrecs -- 422 * Return the number of records in the tree. 423 * 424 * PUBLIC: int __bam_nrecs __P((DBC *, db_recno_t *)); 425 */ 426int 427__bam_nrecs(dbc, rep) 428 DBC *dbc; 429 db_recno_t *rep; 430{ 431 DB *dbp; 432 DB_LOCK lock; 433 DB_MPOOLFILE *mpf; 434 PAGE *h; 435 db_pgno_t pgno; 436 int ret, t_ret; 437 438 dbp = dbc->dbp; 439 mpf = dbp->mpf; 440 441 pgno = dbc->internal->root; 442 if ((ret = __db_lget(dbc, 0, pgno, DB_LOCK_READ, 0, &lock)) != 0) 443 return (ret); 444 if ((ret = __memp_fget(mpf, &pgno, 445 dbc->thread_info, dbc->txn, 0, &h)) != 0) 446 return (ret); 447 448 *rep = RE_NREC(h); 449 450 ret = __memp_fput(mpf, dbc->thread_info, h, dbc->priority); 451 if ((t_ret = __TLPUT(dbc, lock)) != 0 && ret == 0) 452 ret = t_ret; 453 454 return (ret); 455} 456 457/* 458 * __bam_total -- 459 * Return the number of records below a page. 460 * 461 * PUBLIC: db_recno_t __bam_total __P((DB *, PAGE *)); 462 */ 463db_recno_t 464__bam_total(dbp, h) 465 DB *dbp; 466 PAGE *h; 467{ 468 db_recno_t nrecs; 469 db_indx_t indx, top; 470 471 nrecs = 0; 472 top = NUM_ENT(h); 473 474 switch (TYPE(h)) { 475 case P_LBTREE: 476 /* Check for logically deleted records. */ 477 for (indx = 0; indx < top; indx += P_INDX) 478 if (!B_DISSET( 479 GET_BKEYDATA(dbp, h, indx + O_INDX)->type)) 480 ++nrecs; 481 break; 482 case P_LDUP: 483 /* Check for logically deleted records. */ 484 for (indx = 0; indx < top; indx += O_INDX) 485 if (!B_DISSET(GET_BKEYDATA(dbp, h, indx)->type)) 486 ++nrecs; 487 break; 488 case P_IBTREE: 489 for (indx = 0; indx < top; indx += O_INDX) 490 nrecs += GET_BINTERNAL(dbp, h, indx)->nrecs; 491 break; 492 case P_LRECNO: 493 nrecs = NUM_ENT(h); 494 break; 495 case P_IRECNO: 496 for (indx = 0; indx < top; indx += O_INDX) 497 nrecs += GET_RINTERNAL(dbp, h, indx)->nrecs; 498 break; 499 } 500 501 return (nrecs); 502} 503