1/*- 2 * See the file LICENSE for redistribution information. 3 * 4 * Copyright (c) 1999,2008 Oracle. All rights reserved. 5 * 6 * $Id: bt_verify.c,v 12.39 2008/03/11 21:07:01 mbrey Exp $ 7 */ 8 9#include "db_config.h" 10 11#include "db_int.h" 12#include "dbinc/db_page.h" 13#include "dbinc/db_verify.h" 14#include "dbinc/btree.h" 15#include "dbinc/mp.h" 16 17static int __bam_safe_getdata __P((DB *, DB_THREAD_INFO *, 18 PAGE *, u_int32_t, int, DBT *, int *)); 19static int __bam_vrfy_inp __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t, 20 db_indx_t *, u_int32_t)); 21static int __bam_vrfy_treeorder __P((DB *, DB_THREAD_INFO *, PAGE *, 22 BINTERNAL *, BINTERNAL *, int (*)(DB *, const DBT *, const DBT *), 23 u_int32_t)); 24static int __ram_vrfy_inp __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t, 25 db_indx_t *, u_int32_t)); 26 27/* 28 * __bam_vrfy_meta -- 29 * Verify the btree-specific part of a metadata page. 30 * 31 * PUBLIC: int __bam_vrfy_meta __P((DB *, VRFY_DBINFO *, BTMETA *, 32 * PUBLIC: db_pgno_t, u_int32_t)); 33 */ 34int 35__bam_vrfy_meta(dbp, vdp, meta, pgno, flags) 36 DB *dbp; 37 VRFY_DBINFO *vdp; 38 BTMETA *meta; 39 db_pgno_t pgno; 40 u_int32_t flags; 41{ 42 ENV *env; 43 VRFY_PAGEINFO *pip; 44 int isbad, t_ret, ret; 45 db_indx_t ovflsize; 46 47 env = dbp->env; 48 isbad = 0; 49 50 if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0) 51 return (ret); 52 53 /* 54 * If VRFY_INCOMPLETE is not set, then we didn't come through 55 * __db_vrfy_pagezero and didn't incompletely 56 * check this page--we haven't checked it at all. 57 * Thus we need to call __db_vrfy_meta and check the common fields. 58 * 59 * If VRFY_INCOMPLETE is set, we've already done all the same work 60 * in __db_vrfy_pagezero, so skip the check. 61 */ 62 if (!F_ISSET(pip, VRFY_INCOMPLETE) && 63 (ret = __db_vrfy_meta(dbp, vdp, &meta->dbmeta, pgno, flags)) != 0) { 64 if (ret == DB_VERIFY_BAD) 65 isbad = 1; 66 else 67 goto err; 68 } 69 70 /* bt_minkey: must be >= 2; must produce sensible ovflsize */ 71 72 /* avoid division by zero */ 73 ovflsize = meta->minkey > 0 ? 74 B_MINKEY_TO_OVFLSIZE(dbp, meta->minkey, dbp->pgsize) : 0; 75 76 if (meta->minkey < 2 || 77 ovflsize > B_MINKEY_TO_OVFLSIZE(dbp, DEFMINKEYPAGE, dbp->pgsize)) { 78 pip->bt_minkey = 0; 79 isbad = 1; 80 EPRINT((env, 81 "Page %lu: nonsensical bt_minkey value %lu on metadata page", 82 (u_long)pgno, (u_long)meta->minkey)); 83 } else 84 pip->bt_minkey = meta->minkey; 85 86 /* re_len: no constraints on this (may be zero or huge--we make rope) */ 87 pip->re_pad = meta->re_pad; 88 pip->re_len = meta->re_len; 89 90 /* 91 * The root must not be current page or 0 and it must be within 92 * database. If this metadata page is the master meta data page 93 * of the file, then the root page had better be page 1. 94 */ 95 pip->root = 0; 96 if (meta->root == PGNO_INVALID || 97 meta->root == pgno || !IS_VALID_PGNO(meta->root) || 98 (pgno == PGNO_BASE_MD && meta->root != 1)) { 99 isbad = 1; 100 EPRINT((env, 101 "Page %lu: nonsensical root page %lu on metadata page", 102 (u_long)pgno, (u_long)meta->root)); 103 } else 104 pip->root = meta->root; 105 106 /* Flags. */ 107 if (F_ISSET(&meta->dbmeta, BTM_RENUMBER)) 108 F_SET(pip, VRFY_IS_RRECNO); 109 110 if (F_ISSET(&meta->dbmeta, BTM_SUBDB)) { 111 /* 112 * If this is a master db meta page, it had better not have 113 * duplicates. 114 */ 115 if (F_ISSET(&meta->dbmeta, BTM_DUP) && pgno == PGNO_BASE_MD) { 116 isbad = 1; 117 EPRINT((env, 118"Page %lu: Btree metadata page has both duplicates and multiple databases", 119 (u_long)pgno)); 120 } 121 F_SET(pip, VRFY_HAS_SUBDBS); 122 } 123 124 if (F_ISSET(&meta->dbmeta, BTM_DUP)) 125 F_SET(pip, VRFY_HAS_DUPS); 126 if (F_ISSET(&meta->dbmeta, BTM_DUPSORT)) 127 F_SET(pip, VRFY_HAS_DUPSORT); 128 if (F_ISSET(&meta->dbmeta, BTM_RECNUM)) 129 F_SET(pip, VRFY_HAS_RECNUMS); 130 if (F_ISSET(pip, VRFY_HAS_RECNUMS) && F_ISSET(pip, VRFY_HAS_DUPS)) { 131 EPRINT((env, 132 "Page %lu: Btree metadata page illegally has both recnums and dups", 133 (u_long)pgno)); 134 isbad = 1; 135 } 136 137 if (F_ISSET(&meta->dbmeta, BTM_RECNO)) { 138 F_SET(pip, VRFY_IS_RECNO); 139 dbp->type = DB_RECNO; 140 } else if (F_ISSET(pip, VRFY_IS_RRECNO)) { 141 isbad = 1; 142 EPRINT((env, 143 "Page %lu: metadata page has renumber flag set but is not recno", 144 (u_long)pgno)); 145 } 146 147 if (F_ISSET(pip, VRFY_IS_RECNO) && F_ISSET(pip, VRFY_HAS_DUPS)) { 148 EPRINT((env, 149 "Page %lu: recno metadata page specifies duplicates", 150 (u_long)pgno)); 151 isbad = 1; 152 } 153 154 if (F_ISSET(&meta->dbmeta, BTM_FIXEDLEN)) 155 F_SET(pip, VRFY_IS_FIXEDLEN); 156 else if (pip->re_len > 0) { 157 /* 158 * It's wrong to have an re_len if it's not a fixed-length 159 * database 160 */ 161 isbad = 1; 162 EPRINT((env, 163 "Page %lu: re_len of %lu in non-fixed-length database", 164 (u_long)pgno, (u_long)pip->re_len)); 165 } 166 167 /* 168 * We do not check that the rest of the page is 0, because it may 169 * not be and may still be correct. 170 */ 171 172err: if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0) 173 ret = t_ret; 174 if (LF_ISSET(DB_SALVAGE) && 175 (t_ret = __db_salvage_markdone(vdp, pgno)) != 0 && ret == 0) 176 ret = t_ret; 177 return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret); 178} 179 180/* 181 * __ram_vrfy_leaf -- 182 * Verify a recno leaf page. 183 * 184 * PUBLIC: int __ram_vrfy_leaf __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t, 185 * PUBLIC: u_int32_t)); 186 */ 187int 188__ram_vrfy_leaf(dbp, vdp, h, pgno, flags) 189 DB *dbp; 190 VRFY_DBINFO *vdp; 191 PAGE *h; 192 db_pgno_t pgno; 193 u_int32_t flags; 194{ 195 BKEYDATA *bk; 196 ENV *env; 197 VRFY_PAGEINFO *pip; 198 db_indx_t i; 199 int ret, t_ret, isbad; 200 u_int32_t re_len_guess, len; 201 202 env = dbp->env; 203 isbad = 0; 204 205 if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0) 206 return (ret); 207 208 if (TYPE(h) != P_LRECNO) { 209 ret = __db_unknown_path(env, "__ram_vrfy_leaf"); 210 goto err; 211 } 212 213 /* 214 * Verify (and, if relevant, save off) page fields common to 215 * all PAGEs. 216 */ 217 if ((ret = __db_vrfy_datapage(dbp, vdp, h, pgno, flags)) != 0) { 218 if (ret == DB_VERIFY_BAD) 219 isbad = 1; 220 else 221 goto err; 222 } 223 224 /* 225 * Verify inp[]. Return immediately if it returns DB_VERIFY_BAD; 226 * further checks are dangerous. 227 */ 228 if ((ret = __bam_vrfy_inp(dbp, 229 vdp, h, pgno, &pip->entries, flags)) != 0) 230 goto err; 231 232 if (F_ISSET(pip, VRFY_HAS_DUPS)) { 233 EPRINT((env, 234 "Page %lu: Recno database has dups", (u_long)pgno)); 235 ret = DB_VERIFY_BAD; 236 goto err; 237 } 238 239 /* 240 * Walk through inp and see if the lengths of all the records are the 241 * same--if so, this may be a fixed-length database, and we want to 242 * save off this value. We know inp to be safe if we've gotten this 243 * far. 244 */ 245 re_len_guess = 0; 246 for (i = 0; i < NUM_ENT(h); i++) { 247 bk = GET_BKEYDATA(dbp, h, i); 248 /* KEYEMPTY. Go on. */ 249 if (B_DISSET(bk->type)) 250 continue; 251 if (bk->type == B_OVERFLOW) 252 len = ((BOVERFLOW *)bk)->tlen; 253 else if (bk->type == B_KEYDATA) 254 len = bk->len; 255 else { 256 isbad = 1; 257 EPRINT((env, 258 "Page %lu: nonsensical type for item %lu", 259 (u_long)pgno, (u_long)i)); 260 continue; 261 } 262 if (re_len_guess == 0) 263 re_len_guess = len; 264 265 /* 266 * Is this item's len the same as the last one's? If not, 267 * reset to 0 and break--we don't have a single re_len. 268 * Otherwise, go on to the next item. 269 */ 270 if (re_len_guess != len) { 271 re_len_guess = 0; 272 break; 273 } 274 } 275 pip->re_len = re_len_guess; 276 277 /* Save off record count. */ 278 pip->rec_cnt = NUM_ENT(h); 279 280err: if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0) 281 ret = t_ret; 282 return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret); 283} 284 285/* 286 * __bam_vrfy -- 287 * Verify a btree leaf or internal page. 288 * 289 * PUBLIC: int __bam_vrfy __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t, 290 * PUBLIC: u_int32_t)); 291 */ 292int 293__bam_vrfy(dbp, vdp, h, pgno, flags) 294 DB *dbp; 295 VRFY_DBINFO *vdp; 296 PAGE *h; 297 db_pgno_t pgno; 298 u_int32_t flags; 299{ 300 ENV *env; 301 VRFY_PAGEINFO *pip; 302 int ret, t_ret, isbad; 303 304 env = dbp->env; 305 isbad = 0; 306 307 if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0) 308 return (ret); 309 310 switch (TYPE(h)) { 311 case P_IBTREE: 312 case P_IRECNO: 313 case P_LBTREE: 314 case P_LDUP: 315 break; 316 default: 317 ret = __db_unknown_path(env, "__bam_vrfy"); 318 goto err; 319 } 320 321 /* 322 * Verify (and, if relevant, save off) page fields common to 323 * all PAGEs. 324 */ 325 if ((ret = __db_vrfy_datapage(dbp, vdp, h, pgno, flags)) != 0) { 326 if (ret == DB_VERIFY_BAD) 327 isbad = 1; 328 else 329 goto err; 330 } 331 332 /* 333 * The record count is, on internal pages, stored in an overloaded 334 * next_pgno field. Save it off; we'll verify it when we check 335 * overall database structure. We could overload the field 336 * in VRFY_PAGEINFO, too, but this seems gross, and space 337 * is not at such a premium. 338 */ 339 pip->rec_cnt = RE_NREC(h); 340 341 /* 342 * Verify inp[]. 343 */ 344 if (TYPE(h) == P_IRECNO) { 345 if ((ret = __ram_vrfy_inp(dbp, 346 vdp, h, pgno, &pip->entries, flags)) != 0) 347 goto err; 348 } else if ((ret = __bam_vrfy_inp(dbp, 349 vdp, h, pgno, &pip->entries, flags)) != 0) { 350 if (ret == DB_VERIFY_BAD) 351 isbad = 1; 352 else 353 goto err; 354 EPRINT((env, 355 "Page %lu: item order check unsafe: skipping", 356 (u_long)pgno)); 357 } else if (!LF_ISSET(DB_NOORDERCHK) && (ret = 358 __bam_vrfy_itemorder(dbp, 359 vdp, vdp->thread_info, h, pgno, 0, 0, 0, flags)) != 0) { 360 /* 361 * We know that the elements of inp are reasonable. 362 * 363 * Check that elements fall in the proper order. 364 */ 365 if (ret == DB_VERIFY_BAD) 366 isbad = 1; 367 else 368 goto err; 369 } 370 371err: if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0) 372 ret = t_ret; 373 return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret); 374} 375 376/* 377 * __ram_vrfy_inp -- 378 * Verify that all entries in a P_IRECNO inp[] array are reasonable, 379 * and count them. Note that P_LRECNO uses __bam_vrfy_inp; 380 * P_IRECNOs are a special, and simpler, case, since they have 381 * RINTERNALs rather than BKEYDATA/BINTERNALs. 382 */ 383static int 384__ram_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags) 385 DB *dbp; 386 VRFY_DBINFO *vdp; 387 PAGE *h; 388 db_pgno_t pgno; 389 db_indx_t *nentriesp; 390 u_int32_t flags; 391{ 392 ENV *env; 393 RINTERNAL *ri; 394 VRFY_CHILDINFO child; 395 VRFY_PAGEINFO *pip; 396 int ret, t_ret, isbad; 397 u_int32_t himark, i, offset, nentries; 398 db_indx_t *inp; 399 u_int8_t *pagelayout, *p; 400 401 env = dbp->env; 402 isbad = 0; 403 memset(&child, 0, sizeof(VRFY_CHILDINFO)); 404 nentries = 0; 405 pagelayout = NULL; 406 407 if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0) 408 return (ret); 409 410 if (TYPE(h) != P_IRECNO) { 411 ret = __db_unknown_path(env, "__ram_vrfy_inp"); 412 goto err; 413 } 414 415 himark = dbp->pgsize; 416 if ((ret = __os_malloc(env, dbp->pgsize, &pagelayout)) != 0) 417 goto err; 418 memset(pagelayout, 0, dbp->pgsize); 419 inp = P_INP(dbp, h); 420 for (i = 0; i < NUM_ENT(h); i++) { 421 if ((u_int8_t *)inp + i >= (u_int8_t *)h + himark) { 422 EPRINT((env, 423 "Page %lu: entries listing %lu overlaps data", 424 (u_long)pgno, (u_long)i)); 425 ret = DB_VERIFY_BAD; 426 goto err; 427 } 428 offset = inp[i]; 429 /* 430 * Check that the item offset is reasonable: it points 431 * somewhere after the inp array and before the end of the 432 * page. 433 */ 434 if (offset <= (u_int32_t)((u_int8_t *)inp + i - 435 (u_int8_t *)h) || 436 offset > (u_int32_t)(dbp->pgsize - RINTERNAL_SIZE)) { 437 isbad = 1; 438 EPRINT((env, 439 "Page %lu: bad offset %lu at index %lu", 440 (u_long)pgno, (u_long)offset, (u_long)i)); 441 continue; 442 } 443 444 /* Update the high-water mark (what HOFFSET should be) */ 445 if (offset < himark) 446 himark = offset; 447 448 nentries++; 449 450 /* Make sure this RINTERNAL is not multiply referenced. */ 451 ri = GET_RINTERNAL(dbp, h, i); 452 if (pagelayout[offset] == 0) { 453 pagelayout[offset] = 1; 454 child.pgno = ri->pgno; 455 child.type = V_RECNO; 456 child.nrecs = ri->nrecs; 457 if ((ret = __db_vrfy_childput(vdp, pgno, &child)) != 0) 458 goto err; 459 } else { 460 EPRINT((env, 461 "Page %lu: RINTERNAL structure at offset %lu referenced twice", 462 (u_long)pgno, (u_long)offset)); 463 isbad = 1; 464 } 465 } 466 467 for (p = pagelayout + himark; 468 p < pagelayout + dbp->pgsize; 469 p += RINTERNAL_SIZE) 470 if (*p != 1) { 471 EPRINT((env, 472 "Page %lu: gap between items at offset %lu", 473 (u_long)pgno, (u_long)(p - pagelayout))); 474 isbad = 1; 475 } 476 477 if ((db_indx_t)himark != HOFFSET(h)) { 478 EPRINT((env, 479 "Page %lu: bad HOFFSET %lu, appears to be %lu", 480 (u_long)pgno, (u_long)(HOFFSET(h)), (u_long)himark)); 481 isbad = 1; 482 } 483 484 *nentriesp = nentries; 485 486err: if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0) 487 ret = t_ret; 488 if (pagelayout != NULL) 489 __os_free(env, pagelayout); 490 return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret); 491} 492 493typedef enum { VRFY_ITEM_NOTSET=0, VRFY_ITEM_BEGIN, VRFY_ITEM_END } VRFY_ITEM; 494 495/* 496 * __bam_vrfy_inp -- 497 * Verify that all entries in inp[] array are reasonable; 498 * count them. 499 */ 500static int 501__bam_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags) 502 DB *dbp; 503 VRFY_DBINFO *vdp; 504 PAGE *h; 505 db_pgno_t pgno; 506 db_indx_t *nentriesp; 507 u_int32_t flags; 508{ 509 BKEYDATA *bk; 510 BOVERFLOW *bo; 511 ENV *env; 512 VRFY_CHILDINFO child; 513 VRFY_ITEM *pagelayout; 514 VRFY_PAGEINFO *pip; 515 u_int32_t himark, offset; /* 516 * These would be db_indx_ts 517 * but for alignment. 518 */ 519 u_int32_t i, endoff, nentries; 520 int isbad, initem, isdupitem, ret, t_ret; 521 522 env = dbp->env; 523 isbad = isdupitem = 0; 524 nentries = 0; 525 memset(&child, 0, sizeof(VRFY_CHILDINFO)); 526 if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0) 527 return (ret); 528 529 switch (TYPE(h)) { 530 case P_IBTREE: 531 case P_LBTREE: 532 case P_LDUP: 533 case P_LRECNO: 534 break; 535 default: 536 /* 537 * In the salvager, we might call this from a page which 538 * we merely suspect is a btree page. Otherwise, it 539 * shouldn't get called--if it is, that's a verifier bug. 540 */ 541 if (LF_ISSET(DB_SALVAGE)) 542 break; 543 ret = __db_unknown_path(env, "__bam_vrfy_inp"); 544 goto err; 545 } 546 547 /* 548 * Loop through inp[], the array of items, until we either 549 * run out of entries or collide with the data. Keep track 550 * of h_offset in himark. 551 * 552 * For each element in inp[i], make sure it references a region 553 * that starts after the end of the inp array (as defined by 554 * NUM_ENT(h)), ends before the beginning of the page, doesn't 555 * overlap any other regions, and doesn't have a gap between 556 * it and the region immediately after it. 557 */ 558 himark = dbp->pgsize; 559 if ((ret = __os_calloc( 560 env, dbp->pgsize, sizeof(pagelayout[0]), &pagelayout)) != 0) 561 goto err; 562 for (i = 0; i < NUM_ENT(h); i++) { 563 switch (ret = __db_vrfy_inpitem(dbp, 564 h, pgno, i, 1, flags, &himark, &offset)) { 565 case 0: 566 break; 567 case DB_VERIFY_BAD: 568 isbad = 1; 569 continue; 570 case DB_VERIFY_FATAL: 571 isbad = 1; 572 goto err; 573 default: 574 DB_ASSERT(env, ret != 0); 575 break; 576 } 577 578 /* 579 * We now have a plausible beginning for the item, and we know 580 * its length is safe. 581 * 582 * Mark the beginning and end in pagelayout so we can make sure 583 * items have no overlaps or gaps. 584 */ 585 bk = GET_BKEYDATA(dbp, h, i); 586 if (pagelayout[offset] == VRFY_ITEM_NOTSET) 587 pagelayout[offset] = VRFY_ITEM_BEGIN; 588 else if (pagelayout[offset] == VRFY_ITEM_BEGIN) { 589 /* 590 * Having two inp entries that point at the same patch 591 * of page is legal if and only if the page is 592 * a btree leaf and they're onpage duplicate keys-- 593 * that is, if (i % P_INDX) == 0. 594 */ 595 if ((i % P_INDX == 0) && (TYPE(h) == P_LBTREE)) { 596 /* Flag for later. */ 597 F_SET(pip, VRFY_HAS_DUPS); 598 599 /* Bump up nentries so we don't undercount. */ 600 nentries++; 601 602 /* 603 * We'll check to make sure the end is 604 * equal, too. 605 */ 606 isdupitem = 1; 607 } else { 608 isbad = 1; 609 EPRINT((env, "Page %lu: duplicated item %lu", 610 (u_long)pgno, (u_long)i)); 611 } 612 } 613 614 /* 615 * Mark the end. Its location varies with the page type 616 * and the item type. 617 * 618 * If the end already has a sign other than 0, do nothing-- 619 * it's an overlap that we'll catch later. 620 */ 621 switch (B_TYPE(bk->type)) { 622 case B_KEYDATA: 623 if (TYPE(h) == P_IBTREE) 624 /* It's a BINTERNAL. */ 625 endoff = offset + BINTERNAL_SIZE(bk->len) - 1; 626 else 627 endoff = offset + BKEYDATA_SIZE(bk->len) - 1; 628 break; 629 case B_DUPLICATE: 630 /* 631 * Flag that we have dups; we'll check whether 632 * that's okay during the structure check. 633 */ 634 F_SET(pip, VRFY_HAS_DUPS); 635 /* FALLTHROUGH */ 636 case B_OVERFLOW: 637 /* 638 * Overflow entries on internal pages are stored 639 * as the _data_ of a BINTERNAL; overflow entries 640 * on leaf pages are stored as the entire entry. 641 */ 642 endoff = offset + 643 ((TYPE(h) == P_IBTREE) ? 644 BINTERNAL_SIZE(BOVERFLOW_SIZE) : 645 BOVERFLOW_SIZE) - 1; 646 break; 647 default: 648 /* 649 * We'll complain later; for now, just mark 650 * a minimum. 651 */ 652 endoff = offset + BKEYDATA_SIZE(0) - 1; 653 break; 654 } 655 656 /* 657 * If this is an onpage duplicate key we've seen before, 658 * the end had better coincide too. 659 */ 660 if (isdupitem && pagelayout[endoff] != VRFY_ITEM_END) { 661 EPRINT((env, "Page %lu: duplicated item %lu", 662 (u_long)pgno, (u_long)i)); 663 isbad = 1; 664 } else if (pagelayout[endoff] == VRFY_ITEM_NOTSET) 665 pagelayout[endoff] = VRFY_ITEM_END; 666 isdupitem = 0; 667 668 /* 669 * There should be no deleted items in a quiescent tree, 670 * except in recno. 671 */ 672 if (B_DISSET(bk->type) && TYPE(h) != P_LRECNO) { 673 isbad = 1; 674 EPRINT((env, "Page %lu: item %lu marked deleted", 675 (u_long)pgno, (u_long)i)); 676 } 677 678 /* 679 * Check the type and such of bk--make sure it's reasonable 680 * for the pagetype. 681 */ 682 switch (B_TYPE(bk->type)) { 683 case B_KEYDATA: 684 /* 685 * This is a normal, non-overflow BKEYDATA or BINTERNAL. 686 * The only thing to check is the len, and that's 687 * already been done. 688 */ 689 break; 690 case B_DUPLICATE: 691 if (TYPE(h) == P_IBTREE) { 692 isbad = 1; 693 EPRINT((env, 694 "Page %lu: duplicate page referenced by internal btree page at item %lu", 695 (u_long)pgno, (u_long)i)); 696 break; 697 } else if (TYPE(h) == P_LRECNO) { 698 isbad = 1; 699 EPRINT((env, 700 "Page %lu: duplicate page referenced by recno page at item %lu", 701 (u_long)pgno, (u_long)i)); 702 break; 703 } 704 /* FALLTHROUGH */ 705 case B_OVERFLOW: 706 bo = (TYPE(h) == P_IBTREE) ? 707 (BOVERFLOW *)(((BINTERNAL *)bk)->data) : 708 (BOVERFLOW *)bk; 709 710 if (B_TYPE(bk->type) == B_OVERFLOW) 711 /* Make sure tlen is reasonable. */ 712 if (bo->tlen > dbp->pgsize * vdp->last_pgno) { 713 isbad = 1; 714 EPRINT((env, 715 "Page %lu: impossible tlen %lu, item %lu", 716 (u_long)pgno, 717 (u_long)bo->tlen, (u_long)i)); 718 /* Don't save as a child. */ 719 break; 720 } 721 722 if (!IS_VALID_PGNO(bo->pgno) || bo->pgno == pgno || 723 bo->pgno == PGNO_INVALID) { 724 isbad = 1; 725 EPRINT((env, 726 "Page %lu: offpage item %lu has bad pgno %lu", 727 (u_long)pgno, (u_long)i, (u_long)bo->pgno)); 728 /* Don't save as a child. */ 729 break; 730 } 731 732 child.pgno = bo->pgno; 733 child.type = (B_TYPE(bk->type) == B_OVERFLOW ? 734 V_OVERFLOW : V_DUPLICATE); 735 child.tlen = bo->tlen; 736 if ((ret = __db_vrfy_childput(vdp, pgno, &child)) != 0) 737 goto err; 738 break; 739 default: 740 isbad = 1; 741 EPRINT((env, "Page %lu: item %lu of invalid type %lu", 742 (u_long)pgno, (u_long)i, (u_long)B_TYPE(bk->type))); 743 break; 744 } 745 } 746 747 /* 748 * Now, loop through and make sure the items are contiguous and 749 * non-overlapping. 750 */ 751 initem = 0; 752 for (i = himark; i < dbp->pgsize; i++) 753 if (initem == 0) 754 switch (pagelayout[i]) { 755 case VRFY_ITEM_NOTSET: 756 /* May be just for alignment. */ 757 if (i != DB_ALIGN(i, sizeof(u_int32_t))) 758 continue; 759 760 isbad = 1; 761 EPRINT((env, 762 "Page %lu: gap between items at offset %lu", 763 (u_long)pgno, (u_long)i)); 764 /* Find the end of the gap */ 765 for (; pagelayout[i + 1] == VRFY_ITEM_NOTSET && 766 (size_t)(i + 1) < dbp->pgsize; i++) 767 ; 768 break; 769 case VRFY_ITEM_BEGIN: 770 /* We've found an item. Check its alignment. */ 771 if (i != DB_ALIGN(i, sizeof(u_int32_t))) { 772 isbad = 1; 773 EPRINT((env, 774 "Page %lu: offset %lu unaligned", 775 (u_long)pgno, (u_long)i)); 776 } 777 initem = 1; 778 nentries++; 779 break; 780 case VRFY_ITEM_END: 781 /* 782 * We've hit the end of an item even though 783 * we don't think we're in one; must 784 * be an overlap. 785 */ 786 isbad = 1; 787 EPRINT((env, 788 "Page %lu: overlapping items at offset %lu", 789 (u_long)pgno, (u_long)i)); 790 break; 791 } 792 else 793 switch (pagelayout[i]) { 794 case VRFY_ITEM_NOTSET: 795 /* In the middle of an item somewhere. Okay. */ 796 break; 797 case VRFY_ITEM_END: 798 /* End of an item; switch to out-of-item mode.*/ 799 initem = 0; 800 break; 801 case VRFY_ITEM_BEGIN: 802 /* 803 * Hit a second item beginning without an 804 * end. Overlap. 805 */ 806 isbad = 1; 807 EPRINT((env, 808 "Page %lu: overlapping items at offset %lu", 809 (u_long)pgno, (u_long)i)); 810 break; 811 } 812 813 __os_free(env, pagelayout); 814 815 /* Verify HOFFSET. */ 816 if ((db_indx_t)himark != HOFFSET(h)) { 817 EPRINT((env, "Page %lu: bad HOFFSET %lu, appears to be %lu", 818 (u_long)pgno, (u_long)HOFFSET(h), (u_long)himark)); 819 isbad = 1; 820 } 821 822err: if (nentriesp != NULL) 823 *nentriesp = nentries; 824 825 if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0) 826 ret = t_ret; 827 828 return ((isbad == 1 && ret == 0) ? DB_VERIFY_BAD : ret); 829} 830 831/* 832 * __bam_vrfy_itemorder -- 833 * Make sure the items on a page sort correctly. 834 * 835 * Assumes that NUM_ENT(h) and inp[0]..inp[NUM_ENT(h) - 1] are 836 * reasonable; be sure that __bam_vrfy_inp has been called first. 837 * 838 * If ovflok is set, it also assumes that overflow page chains 839 * hanging off the current page have been sanity-checked, and so we 840 * can use __bam_cmp to verify their ordering. If it is not set, 841 * and we run into an overflow page, carp and return DB_VERIFY_BAD; 842 * we shouldn't be called if any exist. 843 * 844 * PUBLIC: int __bam_vrfy_itemorder __P((DB *, VRFY_DBINFO *, DB_THREAD_INFO *, 845 * PUBLIC: PAGE *, db_pgno_t, u_int32_t, int, int, u_int32_t)); 846 */ 847int 848__bam_vrfy_itemorder(dbp, vdp, ip, h, pgno, nentries, ovflok, hasdups, flags) 849 DB *dbp; 850 VRFY_DBINFO *vdp; 851 DB_THREAD_INFO *ip; 852 PAGE *h; 853 db_pgno_t pgno; 854 u_int32_t nentries; 855 int ovflok, hasdups; 856 u_int32_t flags; 857{ 858 BINTERNAL *bi; 859 BKEYDATA *bk; 860 BOVERFLOW *bo; 861 BTREE *bt; 862 DBT dbta, dbtb, dup_1, dup_2, *p1, *p2, *tmp; 863 ENV *env; 864 VRFY_PAGEINFO *pip; 865 db_indx_t i, *inp; 866 int adj, cmp, freedup_1, freedup_2, isbad, ret, t_ret; 867 int (*dupfunc) __P((DB *, const DBT *, const DBT *)); 868 int (*func) __P((DB *, const DBT *, const DBT *)); 869 void *buf1, *buf2, *tmpbuf; 870 871 /* 872 * We need to work in the ORDERCHKONLY environment where we might 873 * not have a pip, but we also may need to work in contexts where 874 * NUM_ENT isn't safe. 875 */ 876 if (vdp != NULL) { 877 if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0) 878 return (ret); 879 nentries = pip->entries; 880 } else 881 pip = NULL; 882 883 env = dbp->env; 884 ret = isbad = 0; 885 bo = NULL; /* Shut up compiler. */ 886 887 memset(&dbta, 0, sizeof(DBT)); 888 F_SET(&dbta, DB_DBT_REALLOC); 889 890 memset(&dbtb, 0, sizeof(DBT)); 891 F_SET(&dbtb, DB_DBT_REALLOC); 892 893 buf1 = buf2 = NULL; 894 895 DB_ASSERT(env, !LF_ISSET(DB_NOORDERCHK)); 896 897 dupfunc = (dbp->dup_compare == NULL) ? __bam_defcmp : dbp->dup_compare; 898 if (TYPE(h) == P_LDUP) 899 func = dupfunc; 900 else { 901 func = __bam_defcmp; 902 if (dbp->bt_internal != NULL) { 903 bt = (BTREE *)dbp->bt_internal; 904 if (bt->bt_compare != NULL) 905 func = bt->bt_compare; 906 } 907 } 908 909 /* 910 * We alternate our use of dbta and dbtb so that we can walk 911 * through the page key-by-key without copying a dbt twice. 912 * p1 is always the dbt for index i - 1, and p2 for index i. 913 */ 914 p1 = &dbta; 915 p2 = &dbtb; 916 917 /* 918 * Loop through the entries. nentries ought to contain the 919 * actual count, and so is a safe way to terminate the loop; whether 920 * we inc. by one or two depends on whether we're a leaf page-- 921 * on a leaf page, we care only about keys. On internal pages 922 * and LDUP pages, we want to check the order of all entries. 923 * 924 * Note that on IBTREE pages, we start with item 1, since item 925 * 0 doesn't get looked at by __bam_cmp. 926 */ 927 inp = P_INP(dbp, h); 928 adj = (TYPE(h) == P_LBTREE) ? P_INDX : O_INDX; 929 for (i = (TYPE(h) == P_IBTREE) ? 1 : 0; i < nentries; i += adj) { 930 /* 931 * Put key i-1, now in p2, into p1, by swapping DBTs and bufs. 932 */ 933 tmp = p1; 934 p1 = p2; 935 p2 = tmp; 936 tmpbuf = buf1; 937 buf1 = buf2; 938 buf2 = tmpbuf; 939 940 /* 941 * Get key i into p2. 942 */ 943 switch (TYPE(h)) { 944 case P_IBTREE: 945 bi = GET_BINTERNAL(dbp, h, i); 946 if (B_TYPE(bi->type) == B_OVERFLOW) { 947 bo = (BOVERFLOW *)(bi->data); 948 goto overflow; 949 } else { 950 p2->data = bi->data; 951 p2->size = bi->len; 952 } 953 954 /* 955 * The leftmost key on an internal page must be 956 * len 0, since it's just a placeholder and 957 * automatically sorts less than all keys. 958 * 959 * XXX 960 * This criterion does not currently hold! 961 * See todo list item #1686. Meanwhile, it's harmless 962 * to just not check for it. 963 */ 964#if 0 965 if (i == 0 && bi->len != 0) { 966 isbad = 1; 967 EPRINT((env, 968 "Page %lu: lowest key on internal page of nonzero length", 969 (u_long)pgno)); 970 } 971#endif 972 break; 973 case P_LBTREE: 974 case P_LDUP: 975 bk = GET_BKEYDATA(dbp, h, i); 976 if (B_TYPE(bk->type) == B_OVERFLOW) { 977 bo = (BOVERFLOW *)bk; 978 goto overflow; 979 } else { 980 p2->data = bk->data; 981 p2->size = bk->len; 982 } 983 break; 984 default: 985 /* 986 * This means our caller screwed up and sent us 987 * an inappropriate page. 988 */ 989 ret = __db_unknown_path(env, "__bam_vrfy_itemorder"); 990 goto err; 991 } 992 993 if (0) { 994 /* 995 * If ovflok != 1, we can't safely go chasing 996 * overflow pages with the normal routines now; 997 * they might be unsafe or nonexistent. Mark this 998 * page as incomplete and return. 999 * 1000 * Note that we don't need to worry about freeing 1001 * buffers, since they can't have been allocated 1002 * if overflow items are unsafe. 1003 */ 1004overflow: if (!ovflok) { 1005 F_SET(pip, VRFY_INCOMPLETE); 1006 goto err; 1007 } 1008 1009 /* 1010 * Overflow items are safe to chase. Do so. 1011 * Fetch the overflow item into p2->data, 1012 * NULLing it or reallocing it as appropriate. 1013 * 1014 * (We set p2->data to buf2 before the call 1015 * so we're sure to realloc if we can and if p2 1016 * was just pointing at a non-overflow item.) 1017 */ 1018 p2->data = buf2; 1019 if ((ret = __db_goff(dbp, ip, NULL, 1020 p2, bo->tlen, bo->pgno, NULL, NULL)) != 0) { 1021 isbad = 1; 1022 EPRINT((env, 1023 "Page %lu: error %lu in fetching overflow item %lu", 1024 (u_long)pgno, (u_long)ret, (u_long)i)); 1025 } 1026 /* In case it got realloc'ed and thus changed. */ 1027 buf2 = p2->data; 1028 } 1029 1030 /* Compare with the last key. */ 1031 if (p1->data != NULL && p2->data != NULL) { 1032 cmp = inp[i] == inp[i - adj] ? 0 : func(dbp, p1, p2); 1033 1034 /* comparison succeeded */ 1035 if (cmp > 0) { 1036 isbad = 1; 1037 EPRINT((env, 1038 "Page %lu: out-of-order key at entry %lu", 1039 (u_long)pgno, (u_long)i)); 1040 /* proceed */ 1041 } else if (cmp == 0) { 1042 if (inp[i] != inp[i - adj]) { 1043 EPRINT((env, 1044 "Page %lu: non-dup dup key at entry %lu", 1045 (u_long)pgno, (u_long)i)); 1046 isbad = 1; 1047 } 1048 /* 1049 * If they compared equally, this 1050 * had better be a (sub)database with dups. 1051 * Mark it so we can check during the 1052 * structure check. 1053 */ 1054 if (pip != NULL) 1055 F_SET(pip, VRFY_HAS_DUPS); 1056 else if (hasdups == 0) { 1057 isbad = 1; 1058 EPRINT((env, 1059 "Page %lu: database with no duplicates has duplicated keys", 1060 (u_long)pgno)); 1061 } 1062 1063 /* 1064 * If we're a btree leaf, check to see 1065 * if the data items of these on-page dups are 1066 * in sorted order. If not, flag this, so 1067 * that we can make sure during the 1068 * structure checks that the DUPSORT flag 1069 * is unset. 1070 * 1071 * At this point i points to a duplicate key. 1072 * Compare the datum before it (same key) 1073 * to the datum after it, i.e. i-1 to i+1. 1074 */ 1075 if (TYPE(h) == P_LBTREE) { 1076 /* 1077 * Unsafe; continue and we'll pick 1078 * up the bogus nentries later. 1079 */ 1080 if (i + 1 >= (db_indx_t)nentries) 1081 continue; 1082 1083 /* 1084 * We don't bother with clever memory 1085 * management with on-page dups, 1086 * as it's only really a big win 1087 * in the overflow case, and overflow 1088 * dups are probably (?) rare. 1089 */ 1090 if (((ret = __bam_safe_getdata(dbp, 1091 ip, h, i - 1, ovflok, 1092 &dup_1, &freedup_1)) != 0) || 1093 ((ret = __bam_safe_getdata(dbp, 1094 ip, h, i + 1, ovflok, 1095 &dup_2, &freedup_2)) != 0)) 1096 goto err; 1097 1098 /* 1099 * If either of the data are NULL, 1100 * it's because they're overflows and 1101 * it's not safe to chase them now. 1102 * Mark an incomplete and return. 1103 */ 1104 if (dup_1.data == NULL || 1105 dup_2.data == NULL) { 1106 DB_ASSERT(env, !ovflok); 1107 F_SET(pip, VRFY_INCOMPLETE); 1108 goto err; 1109 } 1110 1111 /* 1112 * If the dups are out of order, 1113 * flag this. It's not an error 1114 * until we do the structure check 1115 * and see whether DUPSORT is set. 1116 */ 1117 if (dupfunc(dbp, &dup_1, &dup_2) > 0) 1118 F_SET(pip, VRFY_DUPS_UNSORTED); 1119 1120 if (freedup_1) 1121 __os_ufree(env, dup_1.data); 1122 if (freedup_2) 1123 __os_ufree(env, dup_2.data); 1124 } 1125 } 1126 } 1127 } 1128 1129err: if (pip != NULL && ((t_ret = 1130 __db_vrfy_putpageinfo(env, vdp, pip)) != 0) && ret == 0) 1131 ret = t_ret; 1132 1133 if (buf1 != NULL) 1134 __os_ufree(env, buf1); 1135 if (buf2 != NULL) 1136 __os_ufree(env, buf2); 1137 1138 return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret); 1139} 1140 1141/* 1142 * __bam_vrfy_structure -- 1143 * Verify the tree structure of a btree database (including the master 1144 * database containing subdbs). 1145 * 1146 * PUBLIC: int __bam_vrfy_structure __P((DB *, VRFY_DBINFO *, db_pgno_t, 1147 * PUBLIC: u_int32_t)); 1148 */ 1149int 1150__bam_vrfy_structure(dbp, vdp, meta_pgno, flags) 1151 DB *dbp; 1152 VRFY_DBINFO *vdp; 1153 db_pgno_t meta_pgno; 1154 u_int32_t flags; 1155{ 1156 DB *pgset; 1157 ENV *env; 1158 VRFY_PAGEINFO *mip, *rip; 1159 db_pgno_t root, p; 1160 int t_ret, ret; 1161 u_int32_t nrecs, level, relen, stflags; 1162 1163 env = dbp->env; 1164 mip = rip = 0; 1165 pgset = vdp->pgset; 1166 1167 if ((ret = __db_vrfy_getpageinfo(vdp, meta_pgno, &mip)) != 0) 1168 return (ret); 1169 1170 if ((ret = __db_vrfy_pgset_get(pgset, 1171 vdp->thread_info, meta_pgno, (int *)&p)) != 0) 1172 goto err; 1173 if (p != 0) { 1174 EPRINT((env, 1175 "Page %lu: btree metadata page observed twice", 1176 (u_long)meta_pgno)); 1177 ret = DB_VERIFY_BAD; 1178 goto err; 1179 } 1180 if ((ret = 1181 __db_vrfy_pgset_inc(pgset, vdp->thread_info, meta_pgno)) != 0) 1182 goto err; 1183 1184 root = mip->root; 1185 1186 if (root == 0) { 1187 EPRINT((env, 1188 "Page %lu: btree metadata page has no root", 1189 (u_long)meta_pgno)); 1190 ret = DB_VERIFY_BAD; 1191 goto err; 1192 } 1193 1194 if ((ret = __db_vrfy_getpageinfo(vdp, root, &rip)) != 0) 1195 goto err; 1196 1197 switch (rip->type) { 1198 case P_IBTREE: 1199 case P_LBTREE: 1200 stflags = flags | DB_ST_TOPLEVEL; 1201 if (F_ISSET(mip, VRFY_HAS_DUPS)) 1202 stflags |= DB_ST_DUPOK; 1203 if (F_ISSET(mip, VRFY_HAS_DUPSORT)) 1204 stflags |= DB_ST_DUPSORT; 1205 if (F_ISSET(mip, VRFY_HAS_RECNUMS)) 1206 stflags |= DB_ST_RECNUM; 1207 ret = __bam_vrfy_subtree(dbp, 1208 vdp, root, NULL, NULL, stflags, NULL, NULL, NULL); 1209 break; 1210 case P_IRECNO: 1211 case P_LRECNO: 1212 stflags = 1213 flags | DB_ST_RECNUM | DB_ST_IS_RECNO | DB_ST_TOPLEVEL; 1214 if (mip->re_len > 0) 1215 stflags |= DB_ST_RELEN; 1216 if ((ret = __bam_vrfy_subtree(dbp, vdp, 1217 root, NULL, NULL, stflags, &level, &nrecs, &relen)) != 0) 1218 goto err; 1219 /* 1220 * Even if mip->re_len > 0, re_len may come back zero if the 1221 * tree is empty. It should be okay to just skip the check in 1222 * this case, as if there are any non-deleted keys at all, 1223 * that should never happen. 1224 */ 1225 if (mip->re_len > 0 && relen > 0 && mip->re_len != relen) { 1226 EPRINT((env, 1227 "Page %lu: recno database has bad re_len %lu", 1228 (u_long)meta_pgno, (u_long)relen)); 1229 ret = DB_VERIFY_BAD; 1230 goto err; 1231 } 1232 ret = 0; 1233 break; 1234 case P_LDUP: 1235 EPRINT((env, 1236 "Page %lu: duplicate tree referenced from metadata page", 1237 (u_long)meta_pgno)); 1238 ret = DB_VERIFY_BAD; 1239 break; 1240 default: 1241 EPRINT((env, 1242 "Page %lu: btree root of incorrect type %lu on metadata page", 1243 (u_long)meta_pgno, (u_long)rip->type)); 1244 ret = DB_VERIFY_BAD; 1245 break; 1246 } 1247 1248err: if (mip != NULL && ((t_ret = 1249 __db_vrfy_putpageinfo(env, vdp, mip)) != 0) && ret == 0) 1250 ret = t_ret; 1251 if (rip != NULL && ((t_ret = 1252 __db_vrfy_putpageinfo(env, vdp, rip)) != 0) && ret == 0) 1253 ret = t_ret; 1254 return (ret); 1255} 1256 1257/* 1258 * __bam_vrfy_subtree-- 1259 * Verify a subtree (or entire) btree with specified root. 1260 * 1261 * Note that this is public because it must be called to verify 1262 * offpage dup trees, including from hash. 1263 * 1264 * PUBLIC: int __bam_vrfy_subtree __P((DB *, VRFY_DBINFO *, db_pgno_t, void *, 1265 * PUBLIC: void *, u_int32_t, u_int32_t *, u_int32_t *, u_int32_t *)); 1266 */ 1267int 1268__bam_vrfy_subtree(dbp, vdp, pgno, l, r, flags, levelp, nrecsp, relenp) 1269 DB *dbp; 1270 VRFY_DBINFO *vdp; 1271 db_pgno_t pgno; 1272 void *l, *r; 1273 u_int32_t flags, *levelp, *nrecsp, *relenp; 1274{ 1275 BINTERNAL *li, *ri; 1276 DB *pgset; 1277 DBC *cc; 1278 DB_MPOOLFILE *mpf; 1279 ENV *env; 1280 PAGE *h; 1281 VRFY_CHILDINFO *child; 1282 VRFY_PAGEINFO *pip; 1283 db_indx_t i; 1284 db_pgno_t next_pgno, prev_pgno; 1285 db_recno_t child_nrecs, nrecs; 1286 u_int32_t child_level, child_relen, j, level, relen, stflags; 1287 u_int8_t leaf_type; 1288 int (*func) __P((DB *, const DBT *, const DBT *)); 1289 int isbad, p, ret, t_ret, toplevel; 1290 1291 if (levelp != NULL) /* Don't leave uninitialized on error. */ 1292 *levelp = 0; 1293 if (nrecsp != NULL) 1294 *nrecsp = 0; 1295 1296 env = dbp->env; 1297 mpf = dbp->mpf; 1298 h = NULL; 1299 next_pgno = prev_pgno = PGNO_INVALID; 1300 nrecs = 0; 1301 relen = 0; 1302 leaf_type = P_INVALID; 1303 isbad = ret = 0; 1304 1305 /* Provide feedback on our progress to the application. */ 1306 if (!LF_ISSET(DB_SALVAGE)) 1307 __db_vrfy_struct_feedback(dbp, vdp); 1308 1309 if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0) 1310 return (ret); 1311 1312 cc = NULL; 1313 level = pip->bt_level; 1314 1315 toplevel = LF_ISSET(DB_ST_TOPLEVEL) ? 1 : 0; 1316 LF_CLR(DB_ST_TOPLEVEL); 1317 1318 /* 1319 * If this is the root, initialize the vdp's prev- and next-pgno 1320 * accounting. 1321 * 1322 * For each leaf page we hit, we'll want to make sure that 1323 * vdp->prev_pgno is the same as pip->prev_pgno and vdp->next_pgno is 1324 * our page number. Then, we'll set vdp->next_pgno to pip->next_pgno 1325 * and vdp->prev_pgno to our page number, and the next leaf page in 1326 * line should be able to do the same verification. 1327 */ 1328 if (toplevel) { 1329 /* 1330 * Cache the values stored in the vdp so that if we're an 1331 * auxiliary tree such as an off-page duplicate set, our 1332 * caller's leaf page chain doesn't get lost. 1333 */ 1334 prev_pgno = vdp->prev_pgno; 1335 next_pgno = vdp->next_pgno; 1336 leaf_type = vdp->leaf_type; 1337 vdp->next_pgno = vdp->prev_pgno = PGNO_INVALID; 1338 vdp->leaf_type = P_INVALID; 1339 } 1340 1341 /* 1342 * We are recursively descending a btree, starting from the root 1343 * and working our way out to the leaves. 1344 * 1345 * There are four cases we need to deal with: 1346 * 1. pgno is a recno leaf page. Any children are overflows. 1347 * 2. pgno is a duplicate leaf page. Any children 1348 * are overflow pages; traverse them, and then return 1349 * level and nrecs. 1350 * 3. pgno is an ordinary leaf page. Check whether dups are 1351 * allowed, and if so, traverse any off-page dups or 1352 * overflows. Then return nrecs and level. 1353 * 4. pgno is a recno internal page. Recursively check any 1354 * child pages, making sure their levels are one lower 1355 * and their nrecs sum to ours. 1356 * 5. pgno is a btree internal page. Same as #4, plus we 1357 * must verify that for each pair of BINTERNAL entries 1358 * N and N+1, the leftmost item on N's child sorts 1359 * greater than N, and the rightmost item on N's child 1360 * sorts less than N+1. 1361 * 1362 * Furthermore, in any sorted page type (P_LDUP, P_LBTREE, P_IBTREE), 1363 * we need to verify the internal sort order is correct if, 1364 * due to overflow items, we were not able to do so earlier. 1365 */ 1366 switch (pip->type) { 1367 case P_LRECNO: 1368 case P_LDUP: 1369 case P_LBTREE: 1370 /* 1371 * Cases 1, 2 and 3. 1372 * 1373 * We're some sort of leaf page; verify 1374 * that our linked list of leaves is consistent. 1375 */ 1376 if (vdp->leaf_type == P_INVALID) { 1377 /* 1378 * First leaf page. Set the type that all its 1379 * successors should be, and verify that our prev_pgno 1380 * is PGNO_INVALID. 1381 */ 1382 vdp->leaf_type = pip->type; 1383 if (pip->prev_pgno != PGNO_INVALID) 1384 goto bad_prev; 1385 } else { 1386 /* 1387 * Successor leaf page. Check our type, the previous 1388 * page's next_pgno, and our prev_pgno. 1389 */ 1390 if (pip->type != vdp->leaf_type) { 1391 isbad = 1; 1392 EPRINT((env, 1393 "Page %lu: unexpected page type %lu found in leaf chain (expected %lu)", 1394 (u_long)pip->pgno, (u_long)pip->type, 1395 (u_long)vdp->leaf_type)); 1396 } 1397 1398 /* 1399 * Don't do the prev/next_pgno checks if we've lost 1400 * leaf pages due to another corruption. 1401 */ 1402 if (!F_ISSET(vdp, VRFY_LEAFCHAIN_BROKEN)) { 1403 if (pip->pgno != vdp->next_pgno) { 1404 isbad = 1; 1405 EPRINT((env, 1406 "Page %lu: incorrect next_pgno %lu found in leaf chain (should be %lu)", 1407 (u_long)vdp->prev_pgno, 1408 (u_long)vdp->next_pgno, 1409 (u_long)pip->pgno)); 1410 } 1411 if (pip->prev_pgno != vdp->prev_pgno) { 1412bad_prev: isbad = 1; 1413 EPRINT((env, 1414 "Page %lu: incorrect prev_pgno %lu found in leaf chain (should be %lu)", 1415 (u_long)pip->pgno, 1416 (u_long)pip->prev_pgno, 1417 (u_long)vdp->prev_pgno)); 1418 } 1419 } 1420 } 1421 vdp->prev_pgno = pip->pgno; 1422 vdp->next_pgno = pip->next_pgno; 1423 F_CLR(vdp, VRFY_LEAFCHAIN_BROKEN); 1424 1425 /* 1426 * Overflow pages are common to all three leaf types; 1427 * traverse the child list, looking for overflows. 1428 */ 1429 if ((ret = __db_vrfy_childcursor(vdp, &cc)) != 0) 1430 goto err; 1431 for (ret = __db_vrfy_ccset(cc, pgno, &child); ret == 0; 1432 ret = __db_vrfy_ccnext(cc, &child)) 1433 if (child->type == V_OVERFLOW && 1434 (ret = __db_vrfy_ovfl_structure(dbp, vdp, 1435 child->pgno, child->tlen, 1436 flags | DB_ST_OVFL_LEAF)) != 0) { 1437 if (ret == DB_VERIFY_BAD) 1438 isbad = 1; 1439 else 1440 goto done; 1441 } 1442 1443 if ((ret = __db_vrfy_ccclose(cc)) != 0) 1444 goto err; 1445 cc = NULL; 1446 1447 /* Case 1 */ 1448 if (pip->type == P_LRECNO) { 1449 if (!LF_ISSET(DB_ST_IS_RECNO) && 1450 !(LF_ISSET(DB_ST_DUPOK) && 1451 !LF_ISSET(DB_ST_DUPSORT))) { 1452 isbad = 1; 1453 EPRINT((env, 1454 "Page %lu: recno leaf page non-recno tree", 1455 (u_long)pgno)); 1456 goto done; 1457 } 1458 goto leaf; 1459 } else if (LF_ISSET(DB_ST_IS_RECNO)) { 1460 /* 1461 * It's a non-recno leaf. Had better not be a recno 1462 * subtree. 1463 */ 1464 isbad = 1; 1465 EPRINT((env, 1466 "Page %lu: non-recno leaf page in recno tree", 1467 (u_long)pgno)); 1468 goto done; 1469 } 1470 1471 /* Case 2--no more work. */ 1472 if (pip->type == P_LDUP) 1473 goto leaf; 1474 1475 /* Case 3 */ 1476 1477 /* Check if we have any dups. */ 1478 if (F_ISSET(pip, VRFY_HAS_DUPS)) { 1479 /* If dups aren't allowed in this btree, trouble. */ 1480 if (!LF_ISSET(DB_ST_DUPOK)) { 1481 isbad = 1; 1482 EPRINT((env, 1483 "Page %lu: duplicates in non-dup btree", 1484 (u_long)pgno)); 1485 } else { 1486 /* 1487 * We correctly have dups. If any are off-page, 1488 * traverse those btrees recursively. 1489 */ 1490 if ((ret = 1491 __db_vrfy_childcursor(vdp, &cc)) != 0) 1492 goto err; 1493 for (ret = __db_vrfy_ccset(cc, pgno, &child); 1494 ret == 0; 1495 ret = __db_vrfy_ccnext(cc, &child)) { 1496 stflags = 1497 flags | DB_ST_RECNUM | DB_ST_DUPSET; 1498 /* Skip any overflow entries. */ 1499 if (child->type == V_DUPLICATE) { 1500 if ((ret = __db_vrfy_duptype( 1501 dbp, vdp, child->pgno, 1502 stflags)) != 0) { 1503 isbad = 1; 1504 /* Next child. */ 1505 continue; 1506 } 1507 if ((ret = __bam_vrfy_subtree( 1508 dbp, vdp, child->pgno, 1509 NULL, NULL, 1510 stflags | DB_ST_TOPLEVEL, 1511 NULL, NULL, NULL)) != 0) { 1512 if (ret == 1513 DB_VERIFY_BAD) 1514 isbad = 1; 1515 else 1516 goto err; 1517 } 1518 } 1519 } 1520 1521 if ((ret = __db_vrfy_ccclose(cc)) != 0) 1522 goto err; 1523 cc = NULL; 1524 1525 /* 1526 * If VRFY_DUPS_UNSORTED is set, 1527 * DB_ST_DUPSORT had better not be. 1528 */ 1529 if (F_ISSET(pip, VRFY_DUPS_UNSORTED) && 1530 LF_ISSET(DB_ST_DUPSORT)) { 1531 isbad = 1; 1532 EPRINT((env, 1533 "Page %lu: unsorted duplicate set in sorted-dup database", 1534 (u_long)pgno)); 1535 } 1536 } 1537 } 1538 goto leaf; 1539 case P_IBTREE: 1540 case P_IRECNO: 1541 /* We handle these below. */ 1542 break; 1543 default: 1544 /* 1545 * If a P_IBTREE or P_IRECNO contains a reference to an 1546 * invalid page, we'll wind up here; handle it gracefully. 1547 * Note that the code at the "done" label assumes that the 1548 * current page is a btree/recno one of some sort; this 1549 * is not the case here, so we goto err. 1550 * 1551 * If the page is entirely zeroed, its pip->type will be a lie 1552 * (we assumed it was a hash page, as they're allowed to be 1553 * zeroed); handle this case specially. 1554 */ 1555 if (F_ISSET(pip, VRFY_IS_ALLZEROES)) 1556 ZEROPG_ERR_PRINT(env, pgno, "btree or recno page"); 1557 else 1558 EPRINT((env, 1559 "Page %lu: btree or recno page is of inappropriate type %lu", 1560 (u_long)pgno, (u_long)pip->type)); 1561 1562 /* 1563 * We probably lost a leaf page (or more if this was an 1564 * internal page) from our prev/next_pgno chain. Flag 1565 * that this is expected; we don't want or need to 1566 * spew error messages about erroneous prev/next_pgnos, 1567 * since that's probably not the real problem. 1568 */ 1569 F_SET(vdp, VRFY_LEAFCHAIN_BROKEN); 1570 1571 ret = DB_VERIFY_BAD; 1572 goto err; 1573 } 1574 1575 /* 1576 * Cases 4 & 5: This is a btree or recno internal page. For each child, 1577 * recurse, keeping a running count of nrecs and making sure the level 1578 * is always reasonable. 1579 */ 1580 if ((ret = __db_vrfy_childcursor(vdp, &cc)) != 0) 1581 goto err; 1582 for (ret = __db_vrfy_ccset(cc, pgno, &child); ret == 0; 1583 ret = __db_vrfy_ccnext(cc, &child)) 1584 if (child->type == V_RECNO) { 1585 if (pip->type != P_IRECNO) { 1586 ret = __db_unknown_path( 1587 env, "__bam_vrfy_subtree"); 1588 goto err; 1589 } 1590 if ((ret = __bam_vrfy_subtree(dbp, vdp, child->pgno, 1591 NULL, NULL, flags, &child_level, &child_nrecs, 1592 &child_relen)) != 0) { 1593 if (ret == DB_VERIFY_BAD) 1594 isbad = 1; 1595 else 1596 goto done; 1597 } 1598 1599 if (LF_ISSET(DB_ST_RELEN)) { 1600 if (relen == 0) 1601 relen = child_relen; 1602 /* 1603 * child_relen may be zero if the child subtree 1604 * is empty. 1605 */ 1606 else if (child_relen > 0 && 1607 relen != child_relen) { 1608 isbad = 1; 1609 EPRINT((env, 1610 "Page %lu: recno page returned bad re_len %lu", 1611 (u_long)child->pgno, 1612 (u_long)child_relen)); 1613 } 1614 if (relenp) 1615 *relenp = relen; 1616 } 1617 if (LF_ISSET(DB_ST_RECNUM)) { 1618 if (child->nrecs != child_nrecs) { 1619 isbad = 1; 1620 EPRINT((env, 1621 "Page %lu: record count incorrect: actual %lu, in record %lu", 1622 (u_long)child->pgno, 1623 (u_long)child_nrecs, 1624 (u_long)child->nrecs)); 1625 } 1626 nrecs += child_nrecs; 1627 } 1628 if (isbad == 0 && level != child_level + 1) { 1629 isbad = 1; 1630 EPRINT((env, 1631 "Page %lu: recno level incorrect: got %lu, expected %lu", 1632 (u_long)child->pgno, (u_long)child_level, 1633 (u_long)(level - 1))); 1634 } 1635 } else if (child->type == V_OVERFLOW) { 1636 /* 1637 * It is possible for one internal page to reference 1638 * a single overflow page twice, if all the items 1639 * in the subtree referenced by slot 0 are deleted, 1640 * then a similar number of items are put back 1641 * before the key that formerly had been in slot 1. 1642 * 1643 * (Btree doesn't look at the key in slot 0, so the 1644 * fact that the key formerly at slot 1 is the "wrong" 1645 * parent of the stuff in the slot 0 subtree isn't 1646 * really incorrect.) 1647 * 1648 * __db_vrfy_ovfl_structure is designed to be 1649 * efficiently called multiple times for multiple 1650 * references; call it here as many times as is 1651 * appropriate. 1652 */ 1653 1654 /* Otherwise, __db_vrfy_childput would be broken. */ 1655 DB_ASSERT(env, child->refcnt >= 1); 1656 1657 /* 1658 * An overflow referenced more than twice here 1659 * shouldn't happen. 1660 */ 1661 if (child->refcnt > 2) { 1662 isbad = 1; 1663 EPRINT((env, 1664 "Page %lu: overflow page %lu referenced more than twice from internal page", 1665 (u_long)pgno, (u_long)child->pgno)); 1666 } else 1667 for (j = 0; j < child->refcnt; j++) 1668 if ((ret = __db_vrfy_ovfl_structure(dbp, 1669 vdp, child->pgno, child->tlen, 1670 flags)) != 0) { 1671 if (ret == DB_VERIFY_BAD) 1672 isbad = 1; 1673 else 1674 goto done; 1675 } 1676 } 1677 1678 if ((ret = __db_vrfy_ccclose(cc)) != 0) 1679 goto err; 1680 cc = NULL; 1681 1682 /* We're done with case 4. */ 1683 if (pip->type == P_IRECNO) 1684 goto done; 1685 1686 /* 1687 * Case 5. Btree internal pages. 1688 * As described above, we need to iterate through all the 1689 * items on the page and make sure that our children sort appropriately 1690 * with respect to them. 1691 * 1692 * For each entry, li will be the "left-hand" key for the entry 1693 * itself, which must sort lower than all entries on its child; 1694 * ri will be the key to its right, which must sort greater. 1695 */ 1696 if (h == NULL && 1697 (ret = __memp_fget(mpf, &pgno, vdp->thread_info, NULL, 0, &h)) != 0) 1698 goto err; 1699 for (i = 0; i < pip->entries; i += O_INDX) { 1700 li = GET_BINTERNAL(dbp, h, i); 1701 ri = (i + O_INDX < pip->entries) ? 1702 GET_BINTERNAL(dbp, h, i + O_INDX) : r; 1703 1704 /* 1705 * The leftmost key is forcibly sorted less than all entries, 1706 * so don't bother passing it. 1707 */ 1708 if ((ret = __bam_vrfy_subtree(dbp, vdp, li->pgno, 1709 i == 0 ? NULL : li, ri, flags, &child_level, 1710 &child_nrecs, NULL)) != 0) { 1711 if (ret == DB_VERIFY_BAD) 1712 isbad = 1; 1713 else 1714 goto done; 1715 } 1716 1717 if (LF_ISSET(DB_ST_RECNUM)) { 1718 /* 1719 * Keep a running tally on the actual record count so 1720 * we can return it to our parent (if we have one) or 1721 * compare it to the NRECS field if we're a root page. 1722 */ 1723 nrecs += child_nrecs; 1724 1725 /* 1726 * Make sure the actual record count of the child 1727 * is equal to the value in the BINTERNAL structure. 1728 */ 1729 if (li->nrecs != child_nrecs) { 1730 isbad = 1; 1731 EPRINT((env, 1732 "Page %lu: item %lu has incorrect record count of %lu, should be %lu", 1733 (u_long)pgno, (u_long)i, (u_long)li->nrecs, 1734 (u_long)child_nrecs)); 1735 } 1736 } 1737 1738 if (level != child_level + 1) { 1739 isbad = 1; 1740 EPRINT((env, 1741 "Page %lu: Btree level incorrect: got %lu, expected %lu", 1742 (u_long)li->pgno, 1743 (u_long)child_level, (u_long)(level - 1))); 1744 } 1745 } 1746 1747 if (0) { 1748leaf: level = LEAFLEVEL; 1749 if (LF_ISSET(DB_ST_RECNUM)) 1750 nrecs = pip->rec_cnt; 1751 1752 /* XXX 1753 * We should verify that the record count on a leaf page 1754 * is the sum of the number of keys and the number of 1755 * records in its off-page dups. This requires looking 1756 * at the page again, however, and it may all be changing 1757 * soon, so for now we don't bother. 1758 */ 1759 1760 if (LF_ISSET(DB_ST_RELEN) && relenp) 1761 *relenp = pip->re_len; 1762 } 1763done: if (F_ISSET(pip, VRFY_INCOMPLETE) && isbad == 0 && ret == 0) { 1764 /* 1765 * During the page-by-page pass, item order verification was 1766 * not finished due to the presence of overflow items. If 1767 * isbad == 0, though, it's now safe to do so, as we've 1768 * traversed any child overflow pages. Do it. 1769 */ 1770 if (h == NULL && (ret = __memp_fget(mpf, &pgno, 1771 vdp->thread_info, NULL, 0, &h)) != 0) 1772 goto err; 1773 if ((ret = __bam_vrfy_itemorder(dbp, 1774 vdp, vdp->thread_info, h, pgno, 0, 1, 0, flags)) != 0) 1775 goto err; 1776 F_CLR(pip, VRFY_INCOMPLETE); 1777 } 1778 1779 /* 1780 * It's possible to get to this point with a page that has no 1781 * items, but without having detected any sort of failure yet. 1782 * Having zero items is legal if it's a leaf--it may be the 1783 * root page in an empty tree, or the tree may have been 1784 * modified with the DB_REVSPLITOFF flag set (there's no way 1785 * to tell from what's on disk). For an internal page, 1786 * though, having no items is a problem (all internal pages 1787 * must have children). 1788 */ 1789 if (isbad == 0 && ret == 0) { 1790 if (h == NULL && (ret = __memp_fget(mpf, &pgno, 1791 vdp->thread_info, NULL, 0, &h)) != 0) 1792 goto err; 1793 1794 if (NUM_ENT(h) == 0 && ISINTERNAL(h)) { 1795 isbad = 1; 1796 EPRINT((env, 1797 "Page %lu: internal page is empty and should not be", 1798 (u_long)pgno)); 1799 goto err; 1800 } 1801 } 1802 1803 /* 1804 * Our parent has sent us BINTERNAL pointers to parent records 1805 * so that we can verify our place with respect to them. If it's 1806 * appropriate--we have a default sort function--verify this. 1807 */ 1808 if (isbad == 0 && ret == 0 && !LF_ISSET(DB_NOORDERCHK) && 1809 pip->type != P_IRECNO && pip->type != P_LRECNO) { 1810 if (h == NULL && (ret = __memp_fget(mpf, &pgno, 1811 vdp->thread_info, NULL, 0, &h)) != 0) 1812 goto err; 1813 1814 /* 1815 * __bam_vrfy_treeorder needs to know what comparison function 1816 * to use. If DB_ST_DUPSET is set, we're in a duplicate tree 1817 * and we use the duplicate comparison function; otherwise, 1818 * use the btree one. If unset, use the default, of course. 1819 */ 1820 func = LF_ISSET(DB_ST_DUPSET) ? dbp->dup_compare : 1821 ((BTREE *)dbp->bt_internal)->bt_compare; 1822 if (func == NULL) 1823 func = __bam_defcmp; 1824 1825 if ((ret = __bam_vrfy_treeorder(dbp, 1826 vdp->thread_info, h, l, r, func, flags)) != 0) { 1827 if (ret == DB_VERIFY_BAD) 1828 isbad = 1; 1829 else 1830 goto err; 1831 } 1832 } 1833 1834 /* 1835 * This is guaranteed to succeed for leaf pages, but no harm done. 1836 * 1837 * Internal pages below the top level do not store their own 1838 * record numbers, so we skip them. 1839 */ 1840 if (LF_ISSET(DB_ST_RECNUM) && nrecs != pip->rec_cnt && toplevel) { 1841 isbad = 1; 1842 EPRINT((env, 1843 "Page %lu: bad record count: has %lu records, claims %lu", 1844 (u_long)pgno, (u_long)nrecs, (u_long)pip->rec_cnt)); 1845 } 1846 1847 if (levelp) 1848 *levelp = level; 1849 if (nrecsp) 1850 *nrecsp = nrecs; 1851 1852 pgset = vdp->pgset; 1853 if ((ret = __db_vrfy_pgset_get(pgset, 1854 vdp->thread_info, pgno, &p)) != 0) 1855 goto err; 1856 if (p != 0) { 1857 isbad = 1; 1858 EPRINT((env, "Page %lu: linked twice", (u_long)pgno)); 1859 } else if ((ret = 1860 __db_vrfy_pgset_inc(pgset, vdp->thread_info, pgno)) != 0) 1861 goto err; 1862 1863 if (toplevel) 1864 /* 1865 * The last page's next_pgno in the leaf chain should have been 1866 * PGNO_INVALID. 1867 */ 1868 if (vdp->next_pgno != PGNO_INVALID) { 1869 isbad = 1; 1870 EPRINT((env, "Page %lu: unterminated leaf chain", 1871 (u_long)vdp->prev_pgno)); 1872 } 1873 1874err: if (toplevel) { 1875 /* Restore our caller's settings. */ 1876 vdp->next_pgno = next_pgno; 1877 vdp->prev_pgno = prev_pgno; 1878 vdp->leaf_type = leaf_type; 1879 } 1880 1881 if (h != NULL && (t_ret = __memp_fput(mpf, 1882 vdp->thread_info, h, DB_PRIORITY_UNCHANGED)) != 0 && ret == 0) 1883 ret = t_ret; 1884 if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0) 1885 ret = t_ret; 1886 if (cc != NULL && ((t_ret = __db_vrfy_ccclose(cc)) != 0) && ret == 0) 1887 ret = t_ret; 1888 return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret); 1889} 1890 1891/* 1892 * __bam_vrfy_treeorder -- 1893 * Verify that the lowest key on a page sorts greater than the 1894 * BINTERNAL which points to it (lp), and the highest key 1895 * sorts less than the BINTERNAL above that (rp). 1896 * 1897 * If lp is NULL, this means that it was the leftmost key on the 1898 * parent, which (regardless of sort function) sorts less than 1899 * all keys. No need to check it. 1900 * 1901 * If rp is NULL, lp was the highest key on the parent, so there's 1902 * no higher key we must sort less than. 1903 */ 1904static int 1905__bam_vrfy_treeorder(dbp, ip, h, lp, rp, func, flags) 1906 DB *dbp; 1907 DB_THREAD_INFO *ip; 1908 PAGE *h; 1909 BINTERNAL *lp, *rp; 1910 int (*func) __P((DB *, const DBT *, const DBT *)); 1911 u_int32_t flags; 1912{ 1913 BOVERFLOW *bo; 1914 DBT dbt; 1915 ENV *env; 1916 db_indx_t last; 1917 int ret, cmp; 1918 1919 env = dbp->env; 1920 memset(&dbt, 0, sizeof(DBT)); 1921 F_SET(&dbt, DB_DBT_MALLOC); 1922 ret = 0; 1923 1924 /* 1925 * Empty pages are sorted correctly by definition. We check 1926 * to see whether they ought to be empty elsewhere; leaf 1927 * pages legally may be. 1928 */ 1929 if (NUM_ENT(h) == 0) 1930 return (0); 1931 1932 switch (TYPE(h)) { 1933 case P_IBTREE: 1934 case P_LDUP: 1935 last = NUM_ENT(h) - O_INDX; 1936 break; 1937 case P_LBTREE: 1938 last = NUM_ENT(h) - P_INDX; 1939 break; 1940 default: 1941 return (__db_unknown_path(env, "__bam_vrfy_treeorder")); 1942 } 1943 1944 /* 1945 * The key on page h, the child page, is more likely to be 1946 * an overflow page, so we pass its offset, rather than lp/rp's, 1947 * into __bam_cmp. This will take advantage of __db_moff. 1948 */ 1949 1950 /* 1951 * Skip first-item check if we're an internal page--the first 1952 * entry on an internal page is treated specially by __bam_cmp, 1953 * so what's on the page shouldn't matter. (Plus, since we're passing 1954 * our page and item 0 as to __bam_cmp, we'll sort before our 1955 * parent and falsely report a failure.) 1956 */ 1957 if (lp != NULL && TYPE(h) != P_IBTREE) { 1958 if (lp->type == B_KEYDATA) { 1959 dbt.data = lp->data; 1960 dbt.size = lp->len; 1961 } else if (lp->type == B_OVERFLOW) { 1962 bo = (BOVERFLOW *)lp->data; 1963 if ((ret = __db_goff(dbp, ip, NULL, &dbt, 1964 bo->tlen, bo->pgno, NULL, NULL)) != 0) 1965 return (ret); 1966 } else 1967 return ( 1968 __db_unknown_path(env, "__bam_vrfy_treeorder")); 1969 1970 /* On error, fall through, free if needed, and return. */ 1971 if ((ret = __bam_cmp(dbp, ip, 1972 NULL, &dbt, h, 0, func, &cmp)) == 0) { 1973 if (cmp > 0) { 1974 EPRINT((env, 1975 "Page %lu: first item on page sorted greater than parent entry", 1976 (u_long)PGNO(h))); 1977 ret = DB_VERIFY_BAD; 1978 } 1979 } else 1980 EPRINT((env, 1981 "Page %lu: first item on page had comparison error", 1982 (u_long)PGNO(h))); 1983 1984 if (dbt.data != lp->data) 1985 __os_ufree(env, dbt.data); 1986 if (ret != 0) 1987 return (ret); 1988 } 1989 1990 if (rp != NULL) { 1991 if (rp->type == B_KEYDATA) { 1992 dbt.data = rp->data; 1993 dbt.size = rp->len; 1994 } else if (rp->type == B_OVERFLOW) { 1995 bo = (BOVERFLOW *)rp->data; 1996 if ((ret = __db_goff(dbp, ip, NULL, &dbt, 1997 bo->tlen, bo->pgno, NULL, NULL)) != 0) 1998 return (ret); 1999 } else 2000 return ( 2001 __db_unknown_path(env, "__bam_vrfy_treeorder")); 2002 2003 /* On error, fall through, free if needed, and return. */ 2004 if ((ret = __bam_cmp(dbp, ip, 2005 NULL, &dbt, h, last, func, &cmp)) == 0) { 2006 if (cmp < 0) { 2007 EPRINT((env, 2008 "Page %lu: last item on page sorted greater than parent entry", 2009 (u_long)PGNO(h))); 2010 ret = DB_VERIFY_BAD; 2011 } 2012 } else 2013 EPRINT((env, 2014 "Page %lu: last item on page had comparison error", 2015 (u_long)PGNO(h))); 2016 2017 if (dbt.data != rp->data) 2018 __os_ufree(env, dbt.data); 2019 } 2020 2021 return (ret); 2022} 2023 2024/* 2025 * __bam_salvage -- 2026 * Safely dump out anything that looks like a key on an alleged 2027 * btree leaf page, also mark overflow pages as seen. For internal btree 2028 * pages, just mark any overflow pages as seen. 2029 * 2030 * PUBLIC: int __bam_salvage __P((DB *, VRFY_DBINFO *, 2031 * PUBLIC: db_pgno_t, u_int32_t, PAGE *, void *, 2032 * PUBLIC: int (*)(void *, const void *), DBT *, u_int32_t)); 2033 */ 2034int 2035__bam_salvage(dbp, vdp, pgno, pgtype, h, handle, callback, key, flags) 2036 DB *dbp; 2037 VRFY_DBINFO *vdp; 2038 db_pgno_t pgno; 2039 u_int32_t pgtype; 2040 PAGE *h; 2041 void *handle; 2042 int (*callback) __P((void *, const void *)); 2043 DBT *key; 2044 u_int32_t flags; 2045{ 2046 BKEYDATA *bk; 2047 BOVERFLOW *bo; 2048 DBT dbt, repldbt, unknown_key, unknown_data; 2049 ENV *env; 2050 VRFY_ITEM *pgmap; 2051 db_indx_t i, last, beg, end, *inp; 2052 db_pgno_t ovflpg; 2053 u_int32_t himark; 2054 void *ovflbuf; 2055 int adj, ret, t_ret, t2_ret; 2056 2057 env = dbp->env; 2058 ovflbuf = pgmap = NULL; 2059 inp = P_INP(dbp, h); 2060 2061 memset(&dbt, 0, sizeof(DBT)); 2062 dbt.flags = DB_DBT_REALLOC; 2063 memset(&repldbt, 0, sizeof(DBT)); 2064 2065 DB_INIT_DBT(unknown_key, "UNKNOWN_KEY", sizeof("UNKNOWN_KEY") - 1); 2066 DB_INIT_DBT(unknown_data, "UNKNOWN_DATA", sizeof("UNKNOWN_DATA") - 1); 2067 2068 /* 2069 * Allocate a buffer for overflow items. Start at one page; 2070 * __db_safe_goff will realloc as needed. 2071 */ 2072 if ((ret = __os_malloc(env, dbp->pgsize, &ovflbuf)) != 0) 2073 goto err; 2074 2075 if (LF_ISSET(DB_AGGRESSIVE) && (ret = 2076 __os_calloc(env, dbp->pgsize, sizeof(pgmap[0]), &pgmap)) != 0) 2077 goto err; 2078 2079 /* 2080 * Loop through the inp array, spitting out key/data pairs. 2081 * 2082 * If we're salvaging normally, loop from 0 through NUM_ENT(h). If 2083 * we're being aggressive, loop until we hit the end of the page -- 2084 * NUM_ENT() may be bogus. 2085 */ 2086 himark = dbp->pgsize; 2087 for (i = 0, last = UINT16_MAX;; i += O_INDX) { 2088 /* If we're not aggressive, break when we hit NUM_ENT(h). */ 2089 if (!LF_ISSET(DB_AGGRESSIVE) && i >= NUM_ENT(h)) 2090 break; 2091 2092 /* Verify the current item. */ 2093 t_ret = 2094 __db_vrfy_inpitem(dbp, h, pgno, i, 1, flags, &himark, NULL); 2095 2096 if (t_ret != 0) { 2097 /* 2098 * If this is a btree leaf and we've printed out a key 2099 * but not its associated data item, fix this imbalance 2100 * by printing an "UNKNOWN_DATA". 2101 */ 2102 if (pgtype == P_LBTREE && i % P_INDX == 1 && 2103 last == i - 1 && (t2_ret = __db_vrfy_prdbt( 2104 &unknown_data, 2105 0, " ", handle, callback, 0, vdp)) != 0) { 2106 if (ret == 0) 2107 ret = t2_ret; 2108 goto err; 2109 } 2110 2111 /* 2112 * Don't return DB_VERIFY_FATAL; it's private and means 2113 * only that we can't go on with this page, not with 2114 * the whole database. It's not even an error if we've 2115 * run into it after NUM_ENT(h). 2116 */ 2117 if (t_ret == DB_VERIFY_FATAL) { 2118 if (i < NUM_ENT(h) && ret == 0) 2119 ret = DB_VERIFY_BAD; 2120 break; 2121 } 2122 continue; 2123 } 2124 2125 /* 2126 * If this returned 0, it's safe to print or (carefully) 2127 * try to fetch. 2128 * 2129 * We only print deleted items if DB_AGGRESSIVE is set. 2130 */ 2131 bk = GET_BKEYDATA(dbp, h, i); 2132 if (!LF_ISSET(DB_AGGRESSIVE) && B_DISSET(bk->type)) 2133 continue; 2134 2135 /* 2136 * If this is a btree leaf and we're about to print out a data 2137 * item for which we didn't print out a key, fix this imbalance 2138 * by printing an "UNKNOWN_KEY". 2139 */ 2140 if (pgtype == P_LBTREE && i % P_INDX == 1 && 2141 last != i - 1 && (t_ret = __db_vrfy_prdbt( 2142 &unknown_key, 0, " ", handle, callback, 0, vdp)) != 0) { 2143 if (ret == 0) 2144 ret = t_ret; 2145 goto err; 2146 } 2147 last = i; 2148 2149 /* 2150 * We're going to go try to print the next item. If key is 2151 * non-NULL, we're a dup page, so we've got to print the key 2152 * first, unless DB_SA_SKIPFIRSTKEY is set and we're on the 2153 * first entry. 2154 */ 2155 if (key != NULL && (i != 0 || !LF_ISSET(DB_SA_SKIPFIRSTKEY))) 2156 if ((t_ret = __db_vrfy_prdbt(key, 2157 0, " ", handle, callback, 0, vdp)) != 0) { 2158 if (ret == 0) 2159 ret = t_ret; 2160 goto err; 2161 } 2162 2163 beg = end = inp[i]; 2164 switch (B_TYPE(bk->type)) { 2165 case B_DUPLICATE: 2166 if (pgtype == P_IBTREE) 2167 break; 2168 2169 end = beg + BOVERFLOW_SIZE - 1; 2170 /* 2171 * If we're not on a normal btree leaf page, there 2172 * shouldn't be off-page dup sets. Something's 2173 * confused; just drop it, and the code to pick up 2174 * unlinked offpage dup sets will print it out 2175 * with key "UNKNOWN" later. 2176 */ 2177 if (pgtype != P_LBTREE) 2178 break; 2179 2180 bo = (BOVERFLOW *)bk; 2181 2182 /* 2183 * If the page number is unreasonable, or if this is 2184 * supposed to be a key item, output "UNKNOWN_KEY" -- 2185 * the best we can do is run into the data items in 2186 * the unlinked offpage dup pass. 2187 */ 2188 if (!IS_VALID_PGNO(bo->pgno) || (i % P_INDX == 0)) { 2189 /* Not much to do on failure. */ 2190 if ((t_ret = __db_vrfy_prdbt(&unknown_key, 2191 0, " ", handle, callback, 0, vdp)) != 0) { 2192 if (ret == 0) 2193 ret = t_ret; 2194 goto err; 2195 } 2196 break; 2197 } 2198 2199 /* Don't stop on error. */ 2200 if ((t_ret = __db_salvage_duptree(dbp, 2201 vdp, bo->pgno, &dbt, handle, callback, 2202 flags | DB_SA_SKIPFIRSTKEY)) != 0 && ret == 0) 2203 ret = t_ret; 2204 2205 break; 2206 case B_KEYDATA: 2207 if (pgtype == P_IBTREE) 2208 break; 2209 2210 end = (db_indx_t)DB_ALIGN( 2211 beg + bk->len, sizeof(u_int32_t)) - 1; 2212 2213 dbt.data = bk->data; 2214 dbt.size = bk->len; 2215 if ((t_ret = __db_vrfy_prdbt(&dbt, 2216 0, " ", handle, callback, 0, vdp)) != 0) { 2217 if (ret == 0) 2218 ret = t_ret; 2219 goto err; 2220 } 2221 break; 2222 case B_OVERFLOW: 2223 if (pgtype != P_IBTREE) 2224 end = beg + BOVERFLOW_SIZE - 1; 2225 bo = (BOVERFLOW *)bk; 2226 2227 /* 2228 * Check for replicated overflow keys, so that we only 2229 * call __db_safe_goff once per overflow page. If we 2230 * get the same offset as the previous key just re-use 2231 * the previous dbt. 2232 * 2233 * P_IBTREE pages will never have replicated overflow 2234 * keys and will never pass this test. 2235 */ 2236 adj = pgtype == P_IBTREE ? O_INDX : P_INDX; 2237 if (i > adj - 1 && i % adj == 0 && 2238 inp[i] == inp[i - adj]) { 2239 DB_ASSERT(env, pgtype != P_IBTREE); 2240 dbt = repldbt; 2241 } else { 2242 if (pgtype == P_IBTREE) { 2243 /* 2244 * If we're looking at a P_IBTREE, we 2245 * just want to mark the overflow page 2246 * as seen. P_IBTREE entries will 2247 * always fail the above test, we'll 2248 * always wind up here. 2249 * 2250 * Note that this call to __db_safe_goff 2251 * differs from the non-P_IBTREE call. 2252 * 2253 * Only call __db_safe_goff if the 2254 * overflow page hasn't been seen. 2255 */ 2256 ovflpg = ((BOVERFLOW *) 2257 ((BINTERNAL *)bk)->data)->pgno; 2258 if (__db_salvage_isdone(vdp, 2259 ovflpg) == 0 && (t_ret = 2260 __db_safe_goff(dbp, vdp, ovflpg, 2261 &dbt, &ovflbuf, flags)) != 0 && 2262 ret == 0) 2263 ret = t_ret; 2264 break; 2265 } 2266 2267 /* Don't stop on error. */ 2268 if ((t_ret = __db_safe_goff(dbp, vdp, bo->pgno, 2269 &dbt, &ovflbuf, flags)) != 0 && ret == 0) 2270 ret = t_ret; 2271 2272 /* 2273 * If this is a key, save it in case the next 2274 * key is a replicated overflow, so we don't 2275 * call __db_safe_goff again. Copy out dbt.data 2276 * in case that pointer gets realloc'd when 2277 * getting a data item. 2278 */ 2279 if (i % P_INDX == 0) { 2280 if ((ret = __os_realloc(env, 2281 dbt.size, &repldbt.data)) != 0) 2282 goto err; 2283 memcpy(repldbt.data, dbt.data, 2284 dbt.size); 2285 repldbt.size = dbt.size; 2286 } 2287 2288 } 2289 2290 if ((t_ret = __db_vrfy_prdbt( 2291 t_ret == 0 ? &dbt : &unknown_key, 2292 0, " ", handle, callback, 0, vdp)) != 0 && ret == 0) 2293 ret = t_ret; 2294 break; 2295 default: 2296 /* 2297 * We should never get here; __db_vrfy_inpitem should 2298 * not be returning 0 if bk->type is unrecognizable. 2299 */ 2300 t_ret = __db_unknown_path(env, "__bam_salvage"); 2301 if (ret == 0) 2302 ret = t_ret; 2303 goto err; 2304 } 2305 2306 /* 2307 * If we're being aggressive, mark the beginning and end of 2308 * the item; we'll come back and print whatever "junk" is in 2309 * the gaps in case we had any bogus inp elements and thereby 2310 * missed stuff. 2311 */ 2312 if (LF_ISSET(DB_AGGRESSIVE) && pgtype != P_IBTREE) { 2313 pgmap[beg] = VRFY_ITEM_BEGIN; 2314 pgmap[end] = VRFY_ITEM_END; 2315 } 2316 } 2317 2318err: if (pgmap != NULL) 2319 __os_free(env, pgmap); 2320 if (ovflbuf != NULL) 2321 __os_free(env, ovflbuf); 2322 if (repldbt.data != NULL) 2323 __os_free(env, repldbt.data); 2324 2325 /* Mark this page as done. */ 2326 if ((t_ret = __db_salvage_markdone(vdp, pgno)) != 0 && ret == 0) 2327 ret = t_ret; 2328 2329 return (ret); 2330} 2331 2332/* 2333 * __bam_salvage_walkdupint -- 2334 * Walk a known-good btree or recno internal page which is part of 2335 * a dup tree, calling __db_salvage_duptree on each child page. 2336 * 2337 * PUBLIC: int __bam_salvage_walkdupint __P((DB *, VRFY_DBINFO *, PAGE *, 2338 * PUBLIC: DBT *, void *, int (*)(void *, const void *), u_int32_t)); 2339 */ 2340int 2341__bam_salvage_walkdupint(dbp, vdp, h, key, handle, callback, flags) 2342 DB *dbp; 2343 VRFY_DBINFO *vdp; 2344 PAGE *h; 2345 DBT *key; 2346 void *handle; 2347 int (*callback) __P((void *, const void *)); 2348 u_int32_t flags; 2349{ 2350 BINTERNAL *bi; 2351 ENV *env; 2352 RINTERNAL *ri; 2353 int ret, t_ret; 2354 db_indx_t i; 2355 2356 env = dbp->env; 2357 ret = 0; 2358 2359 for (i = 0; i < NUM_ENT(h); i++) { 2360 switch (TYPE(h)) { 2361 case P_IBTREE: 2362 bi = GET_BINTERNAL(dbp, h, i); 2363 if ((t_ret = __db_salvage_duptree(dbp, 2364 vdp, bi->pgno, key, handle, callback, flags)) != 0) 2365 ret = t_ret; 2366 break; 2367 case P_IRECNO: 2368 ri = GET_RINTERNAL(dbp, h, i); 2369 if ((t_ret = __db_salvage_duptree(dbp, 2370 vdp, ri->pgno, key, handle, callback, flags)) != 0) 2371 ret = t_ret; 2372 break; 2373 default: 2374 return (__db_unknown_path( 2375 env, "__bam_salvage_walkdupint")); 2376 } 2377 /* Pass DB_SA_SKIPFIRSTKEY, if set, on to the 0th child only. */ 2378 flags &= ~LF_ISSET(DB_SA_SKIPFIRSTKEY); 2379 } 2380 2381 return (ret); 2382} 2383 2384/* 2385 * __bam_meta2pgset -- 2386 * Given a known-good meta page, return in pgsetp a 0-terminated list of 2387 * db_pgno_t's corresponding to the pages in the btree. 2388 * 2389 * We do this by a somewhat sleazy method, to avoid having to traverse the 2390 * btree structure neatly: we walk down the left side to the very 2391 * first leaf page, then we mark all the pages in the chain of 2392 * NEXT_PGNOs (being wary of cycles and invalid ones), then we 2393 * consolidate our scratch array into a nice list, and return. This 2394 * avoids the memory management hassles of recursion and the 2395 * trouble of walking internal pages--they just don't matter, except 2396 * for the left branch. 2397 * 2398 * PUBLIC: int __bam_meta2pgset __P((DB *, VRFY_DBINFO *, BTMETA *, 2399 * PUBLIC: u_int32_t, DB *)); 2400 */ 2401int 2402__bam_meta2pgset(dbp, vdp, btmeta, flags, pgset) 2403 DB *dbp; 2404 VRFY_DBINFO *vdp; 2405 BTMETA *btmeta; 2406 u_int32_t flags; 2407 DB *pgset; 2408{ 2409 BINTERNAL *bi; 2410 DB_MPOOLFILE *mpf; 2411 PAGE *h; 2412 RINTERNAL *ri; 2413 db_pgno_t current, p; 2414 int err_ret, ret; 2415 2416 DB_ASSERT(dbp->env, pgset != NULL); 2417 2418 mpf = dbp->mpf; 2419 h = NULL; 2420 ret = err_ret = 0; 2421 2422 for (current = btmeta->root;;) { 2423 if (!IS_VALID_PGNO(current) || current == PGNO(btmeta)) { 2424 err_ret = DB_VERIFY_BAD; 2425 goto err; 2426 } 2427 if ((ret = __memp_fget(mpf, ¤t, 2428 vdp->thread_info, NULL, 0, &h)) != 0) { 2429 err_ret = ret; 2430 goto err; 2431 } 2432 2433 switch (TYPE(h)) { 2434 case P_IBTREE: 2435 case P_IRECNO: 2436 if ((ret = __bam_vrfy(dbp, 2437 vdp, h, current, flags | DB_NOORDERCHK)) != 0) { 2438 err_ret = ret; 2439 goto err; 2440 } 2441 if (TYPE(h) == P_IBTREE) { 2442 bi = GET_BINTERNAL(dbp, h, 0); 2443 current = bi->pgno; 2444 } else { /* P_IRECNO */ 2445 ri = GET_RINTERNAL(dbp, h, 0); 2446 current = ri->pgno; 2447 } 2448 break; 2449 case P_LBTREE: 2450 case P_LRECNO: 2451 goto traverse; 2452 default: 2453 err_ret = DB_VERIFY_BAD; 2454 goto err; 2455 } 2456 2457 if ((ret = __memp_fput(mpf, 2458 vdp->thread_info, h, DB_PRIORITY_UNCHANGED)) != 0) 2459 err_ret = ret; 2460 h = NULL; 2461 } 2462 2463 /* 2464 * At this point, current is the pgno of leaf page h, the 0th in the 2465 * tree we're concerned with. 2466 */ 2467traverse: 2468 while (IS_VALID_PGNO(current) && current != PGNO_INVALID) { 2469 if (h == NULL && (ret = __memp_fget(mpf, 2470 ¤t, vdp->thread_info, NULL, 0, &h)) != 0) { 2471 err_ret = ret; 2472 break; 2473 } 2474 2475 if ((ret = __db_vrfy_pgset_get(pgset, 2476 vdp->thread_info, current, (int *)&p)) != 0) 2477 goto err; 2478 2479 if (p != 0) { 2480 /* 2481 * We've found a cycle. Return success anyway-- 2482 * our caller may as well use however much of 2483 * the pgset we've come up with. 2484 */ 2485 break; 2486 } 2487 if ((ret = 2488 __db_vrfy_pgset_inc(pgset, vdp->thread_info, current)) != 0) 2489 goto err; 2490 2491 current = NEXT_PGNO(h); 2492 if ((ret = __memp_fput(mpf, 2493 vdp->thread_info, h, DB_PRIORITY_UNCHANGED)) != 0) 2494 err_ret = ret; 2495 h = NULL; 2496 } 2497 2498err: if (h != NULL) 2499 (void)__memp_fput(mpf, 2500 vdp->thread_info, h, DB_PRIORITY_UNCHANGED); 2501 2502 return (ret == 0 ? err_ret : ret); 2503} 2504 2505/* 2506 * __bam_safe_getdata -- 2507 * 2508 * Utility function for __bam_vrfy_itemorder. Safely gets the datum at 2509 * index i, page h, and sticks it in DBT dbt. If ovflok is 1 and i's an 2510 * overflow item, we do a safe_goff to get the item and signal that we need 2511 * to free dbt->data; if ovflok is 0, we leaves the DBT zeroed. 2512 */ 2513static int 2514__bam_safe_getdata(dbp, ip, h, i, ovflok, dbt, freedbtp) 2515 DB *dbp; 2516 DB_THREAD_INFO *ip; 2517 PAGE *h; 2518 u_int32_t i; 2519 int ovflok; 2520 DBT *dbt; 2521 int *freedbtp; 2522{ 2523 BKEYDATA *bk; 2524 BOVERFLOW *bo; 2525 2526 memset(dbt, 0, sizeof(DBT)); 2527 *freedbtp = 0; 2528 2529 bk = GET_BKEYDATA(dbp, h, i); 2530 if (B_TYPE(bk->type) == B_OVERFLOW) { 2531 if (!ovflok) 2532 return (0); 2533 2534 bo = (BOVERFLOW *)bk; 2535 F_SET(dbt, DB_DBT_MALLOC); 2536 2537 *freedbtp = 1; 2538 return (__db_goff(dbp, ip, NULL, dbt, 2539 bo->tlen, bo->pgno, NULL, NULL)); 2540 } else { 2541 dbt->data = bk->data; 2542 dbt->size = bk->len; 2543 } 2544 2545 return (0); 2546} 2547