1/*- 2 * See the file LICENSE for redistribution information. 3 * 4 * Copyright (c) 1996,2008 Oracle. All rights reserved. 5 */ 6/* 7 * Copyright (c) 1990, 1993, 1994, 1995, 1996 8 * Keith Bostic. All rights reserved. 9 */ 10/* 11 * Copyright (c) 1990, 1993 12 * The Regents of the University of California. All rights reserved. 13 * 14 * Redistribution and use in source and binary forms, with or without 15 * modification, are permitted provided that the following conditions 16 * are met: 17 * 1. Redistributions of source code must retain the above copyright 18 * notice, this list of conditions and the following disclaimer. 19 * 2. Redistributions in binary form must reproduce the above copyright 20 * notice, this list of conditions and the following disclaimer in the 21 * documentation and/or other materials provided with the distribution. 22 * 3. Neither the name of the University nor the names of its contributors 23 * may be used to endorse or promote products derived from this software 24 * without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36 * SUCH DAMAGE. 37 * 38 * $Id: bt_rsearch.c,v 12.17 2008/01/08 20:57:59 bostic Exp $ 39 */ 40 41#include "db_config.h" 42 43#include "db_int.h" 44#include "dbinc/db_page.h" 45#include "dbinc/btree.h" 46#include "dbinc/lock.h" 47#include "dbinc/mp.h" 48 49/* 50 * __bam_rsearch -- 51 * Search a btree for a record number. 52 * 53 * PUBLIC: int __bam_rsearch __P((DBC *, db_recno_t *, u_int32_t, int, int *)); 54 */ 55int 56__bam_rsearch(dbc, recnop, flags, stop, exactp) 57 DBC *dbc; 58 db_recno_t *recnop; 59 u_int32_t flags; 60 int stop, *exactp; 61{ 62 BINTERNAL *bi; 63 BTREE_CURSOR *cp; 64 DB *dbp; 65 DB_LOCK lock; 66 DB_MPOOLFILE *mpf; 67 PAGE *h; 68 RINTERNAL *ri; 69 db_indx_t adjust, deloffset, indx, top; 70 db_lockmode_t lock_mode; 71 db_pgno_t pg; 72 db_recno_t recno, t_recno, total; 73 int ret, stack, t_ret; 74 75 dbp = dbc->dbp; 76 mpf = dbp->mpf; 77 cp = (BTREE_CURSOR *)dbc->internal; 78 h = NULL; 79 80 BT_STK_CLR(cp); 81 82 /* 83 * There are several ways we search a btree tree. The flags argument 84 * specifies if we're acquiring read or write locks and if we are 85 * locking pairs of pages. In addition, if we're adding or deleting 86 * an item, we have to lock the entire tree, regardless. See btree.h 87 * for more details. 88 * 89 * If write-locking pages, we need to know whether or not to acquire a 90 * write lock on a page before getting it. This depends on how deep it 91 * is in tree, which we don't know until we acquire the root page. So, 92 * if we need to lock the root page we may have to upgrade it later, 93 * because we won't get the correct lock initially. 94 * 95 * Retrieve the root page. 96 */ 97 98 if ((ret = __bam_get_root(dbc, cp->root, stop, flags, &stack)) != 0) 99 return (ret); 100 lock_mode = cp->csp->lock_mode; 101 lock = cp->csp->lock; 102 h = cp->csp->page; 103 104 BT_STK_CLR(cp); 105 /* 106 * If appending to the tree, set the record number now -- we have the 107 * root page locked. 108 * 109 * Delete only deletes exact matches, read only returns exact matches. 110 * Note, this is different from __bam_search(), which returns non-exact 111 * matches for read. 112 * 113 * The record may not exist. We can only return the correct location 114 * for the record immediately after the last record in the tree, so do 115 * a fast check now. 116 */ 117 total = RE_NREC(h); 118 if (LF_ISSET(SR_APPEND)) { 119 *exactp = 0; 120 *recnop = recno = total + 1; 121 } else { 122 recno = *recnop; 123 if (recno <= total) 124 *exactp = 1; 125 else { 126 *exactp = 0; 127 if (!LF_ISSET(SR_PAST_EOF) || recno > total + 1) { 128 /* 129 * Keep the page locked for serializability. 130 * 131 * XXX 132 * This leaves the root page locked, which will 133 * eliminate any concurrency. A possible fix 134 * would be to lock the last leaf page instead. 135 */ 136 ret = __memp_fput(mpf, 137 dbc->thread_info, h, dbc->priority); 138 if ((t_ret = 139 __TLPUT(dbc, lock)) != 0 && ret == 0) 140 ret = t_ret; 141 return (ret == 0 ? DB_NOTFOUND : ret); 142 } 143 } 144 } 145 146 /* 147 * !!! 148 * Record numbers in the tree are 0-based, but the recno is 149 * 1-based. All of the calculations below have to take this 150 * into account. 151 */ 152 for (total = 0;;) { 153 switch (TYPE(h)) { 154 case P_LBTREE: 155 case P_LDUP: 156 recno -= total; 157 /* 158 * There may be logically deleted records on the page. 159 * If there are enough, the record may not exist. 160 */ 161 if (TYPE(h) == P_LBTREE) { 162 adjust = P_INDX; 163 deloffset = O_INDX; 164 } else { 165 adjust = O_INDX; 166 deloffset = 0; 167 } 168 for (t_recno = 0, indx = 0;; indx += adjust) { 169 if (indx >= NUM_ENT(h)) { 170 *exactp = 0; 171 if (!LF_ISSET(SR_PAST_EOF) || 172 recno > t_recno + 1) { 173 ret = __memp_fput(mpf, 174 dbc->thread_info, 175 h, dbc->priority); 176 h = NULL; 177 if ((t_ret = __TLPUT(dbc, 178 lock)) != 0 && ret == 0) 179 ret = t_ret; 180 if (ret == 0) 181 ret = DB_NOTFOUND; 182 goto err; 183 } 184 } 185 if (!B_DISSET(GET_BKEYDATA(dbp, h, 186 indx + deloffset)->type) && 187 ++t_recno == recno) 188 break; 189 } 190 191 /* Correct from 1-based to 0-based for a page offset. */ 192 BT_STK_ENTER(dbp->env, 193 cp, h, indx, lock, lock_mode, ret); 194 if (ret != 0) 195 goto err; 196 return (0); 197 case P_IBTREE: 198 for (indx = 0, top = NUM_ENT(h);;) { 199 bi = GET_BINTERNAL(dbp, h, indx); 200 if (++indx == top || total + bi->nrecs >= recno) 201 break; 202 total += bi->nrecs; 203 } 204 pg = bi->pgno; 205 break; 206 case P_LRECNO: 207 recno -= total; 208 209 /* Correct from 1-based to 0-based for a page offset. */ 210 --recno; 211 BT_STK_ENTER(dbp->env, 212 cp, h, recno, lock, lock_mode, ret); 213 if (ret != 0) 214 goto err; 215 return (0); 216 case P_IRECNO: 217 for (indx = 0, top = NUM_ENT(h);;) { 218 ri = GET_RINTERNAL(dbp, h, indx); 219 if (++indx == top || total + ri->nrecs >= recno) 220 break; 221 total += ri->nrecs; 222 } 223 pg = ri->pgno; 224 break; 225 default: 226 return (__db_pgfmt(dbp->env, h->pgno)); 227 } 228 --indx; 229 230 /* Return if this is the lowest page wanted. */ 231 if (stop == LEVEL(h)) { 232 BT_STK_ENTER(dbp->env, 233 cp, h, indx, lock, lock_mode, ret); 234 if (ret != 0) 235 goto err; 236 return (0); 237 } 238 if (stack) { 239 BT_STK_PUSH(dbp->env, 240 cp, h, indx, lock, lock_mode, ret); 241 if (ret != 0) 242 goto err; 243 h = NULL; 244 245 lock_mode = DB_LOCK_WRITE; 246 if ((ret = 247 __db_lget(dbc, 0, pg, lock_mode, 0, &lock)) != 0) 248 goto err; 249 } else { 250 /* 251 * Decide if we want to return a pointer to the next 252 * page in the stack. If we do, write lock it and 253 * never unlock it. 254 */ 255 if ((LF_ISSET(SR_PARENT) && 256 (u_int8_t)(stop + 1) >= (u_int8_t)(LEVEL(h) - 1)) || 257 (LEVEL(h) - 1) == LEAFLEVEL) 258 stack = 1; 259 260 if ((ret = __memp_fput(mpf, 261 dbc->thread_info, h, dbc->priority)) != 0) 262 goto err; 263 h = NULL; 264 265 lock_mode = stack && 266 LF_ISSET(SR_WRITE) ? DB_LOCK_WRITE : DB_LOCK_READ; 267 if ((ret = __db_lget(dbc, 268 LCK_COUPLE_ALWAYS, pg, lock_mode, 0, &lock)) != 0) { 269 /* 270 * If we fail, discard the lock we held. This 271 * is OK because this only happens when we are 272 * descending the tree holding read-locks. 273 */ 274 (void)__LPUT(dbc, lock); 275 goto err; 276 } 277 } 278 279 if ((ret = __memp_fget(mpf, &pg, 280 dbc->thread_info, dbc->txn, 0, &h)) != 0) 281 goto err; 282 } 283 /* NOTREACHED */ 284 285err: if (h != NULL && (t_ret = __memp_fput(mpf, 286 dbc->thread_info, h, dbc->priority)) != 0 && ret == 0) 287 ret = t_ret; 288 289 BT_STK_POP(cp); 290 __bam_stkrel(dbc, 0); 291 292 return (ret); 293} 294 295/* 296 * __bam_adjust -- 297 * Adjust the tree after adding or deleting a record. 298 * 299 * PUBLIC: int __bam_adjust __P((DBC *, int32_t)); 300 */ 301int 302__bam_adjust(dbc, adjust) 303 DBC *dbc; 304 int32_t adjust; 305{ 306 BTREE_CURSOR *cp; 307 DB *dbp; 308 DB_MPOOLFILE *mpf; 309 EPG *epg; 310 PAGE *h; 311 db_pgno_t root_pgno; 312 int ret; 313 314 dbp = dbc->dbp; 315 mpf = dbp->mpf; 316 cp = (BTREE_CURSOR *)dbc->internal; 317 root_pgno = cp->root; 318 319 /* Update the record counts for the tree. */ 320 for (epg = cp->sp; epg <= cp->csp; ++epg) { 321 h = epg->page; 322 if (TYPE(h) == P_IBTREE || TYPE(h) == P_IRECNO) { 323 if ((ret = __memp_dirty(mpf, &h, 324 dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0) 325 return (ret); 326 epg->page = h; 327 if (DBC_LOGGING(dbc)) { 328 if ((ret = __bam_cadjust_log(dbp, dbc->txn, 329 &LSN(h), 0, PGNO(h), &LSN(h), 330 (u_int32_t)epg->indx, adjust, 331 PGNO(h) == root_pgno ? 332 CAD_UPDATEROOT : 0)) != 0) 333 return (ret); 334 } else 335 LSN_NOT_LOGGED(LSN(h)); 336 337 if (TYPE(h) == P_IBTREE) 338 GET_BINTERNAL(dbp, h, epg->indx)->nrecs += 339 adjust; 340 else 341 GET_RINTERNAL(dbp, h, epg->indx)->nrecs += 342 adjust; 343 344 if (PGNO(h) == root_pgno) 345 RE_NREC_ADJ(h, adjust); 346 } 347 } 348 return (0); 349} 350 351/* 352 * __bam_nrecs -- 353 * Return the number of records in the tree. 354 * 355 * PUBLIC: int __bam_nrecs __P((DBC *, db_recno_t *)); 356 */ 357int 358__bam_nrecs(dbc, rep) 359 DBC *dbc; 360 db_recno_t *rep; 361{ 362 DB *dbp; 363 DB_LOCK lock; 364 DB_MPOOLFILE *mpf; 365 PAGE *h; 366 db_pgno_t pgno; 367 int ret, t_ret; 368 369 dbp = dbc->dbp; 370 mpf = dbp->mpf; 371 372 pgno = dbc->internal->root; 373 if ((ret = __db_lget(dbc, 0, pgno, DB_LOCK_READ, 0, &lock)) != 0) 374 return (ret); 375 if ((ret = __memp_fget(mpf, &pgno, 376 dbc->thread_info, dbc->txn, 0, &h)) != 0) 377 return (ret); 378 379 *rep = RE_NREC(h); 380 381 ret = __memp_fput(mpf, dbc->thread_info, h, dbc->priority); 382 if ((t_ret = __TLPUT(dbc, lock)) != 0 && ret == 0) 383 ret = t_ret; 384 385 return (ret); 386} 387 388/* 389 * __bam_total -- 390 * Return the number of records below a page. 391 * 392 * PUBLIC: db_recno_t __bam_total __P((DB *, PAGE *)); 393 */ 394db_recno_t 395__bam_total(dbp, h) 396 DB *dbp; 397 PAGE *h; 398{ 399 db_recno_t nrecs; 400 db_indx_t indx, top; 401 402 nrecs = 0; 403 top = NUM_ENT(h); 404 405 switch (TYPE(h)) { 406 case P_LBTREE: 407 /* Check for logically deleted records. */ 408 for (indx = 0; indx < top; indx += P_INDX) 409 if (!B_DISSET( 410 GET_BKEYDATA(dbp, h, indx + O_INDX)->type)) 411 ++nrecs; 412 break; 413 case P_LDUP: 414 /* Check for logically deleted records. */ 415 for (indx = 0; indx < top; indx += O_INDX) 416 if (!B_DISSET(GET_BKEYDATA(dbp, h, indx)->type)) 417 ++nrecs; 418 break; 419 case P_IBTREE: 420 for (indx = 0; indx < top; indx += O_INDX) 421 nrecs += GET_BINTERNAL(dbp, h, indx)->nrecs; 422 break; 423 case P_LRECNO: 424 nrecs = NUM_ENT(h); 425 break; 426 case P_IRECNO: 427 for (indx = 0; indx < top; indx += O_INDX) 428 nrecs += GET_RINTERNAL(dbp, h, indx)->nrecs; 429 break; 430 } 431 432 return (nrecs); 433} 434