1/*- 2 * See the file LICENSE for redistribution information. 3 * 4 * Copyright (c) 1996,2008 Oracle. All rights reserved. 5 * 6 * $Id: bt_stat.c,v 12.22 2008/03/11 21:07:19 mbrey Exp $ 7 */ 8 9#include "db_config.h" 10 11#include "db_int.h" 12#include "dbinc/db_page.h" 13#include "dbinc/btree.h" 14#include "dbinc/lock.h" 15#include "dbinc/mp.h" 16 17#ifdef HAVE_STATISTICS 18/* 19 * __bam_stat -- 20 * Gather/print the btree statistics 21 * 22 * PUBLIC: int __bam_stat __P((DBC *, void *, u_int32_t)); 23 */ 24int 25__bam_stat(dbc, spp, flags) 26 DBC *dbc; 27 void *spp; 28 u_int32_t flags; 29{ 30 BTMETA *meta; 31 BTREE *t; 32 BTREE_CURSOR *cp; 33 DB *dbp; 34 DB_BTREE_STAT *sp; 35 DB_LOCK lock, metalock; 36 DB_MPOOLFILE *mpf; 37 ENV *env; 38 PAGE *h; 39 db_pgno_t pgno; 40 int ret, t_ret, write_meta; 41 42 dbp = dbc->dbp; 43 env = dbp->env; 44 45 meta = NULL; 46 t = dbp->bt_internal; 47 sp = NULL; 48 LOCK_INIT(metalock); 49 LOCK_INIT(lock); 50 mpf = dbp->mpf; 51 h = NULL; 52 ret = write_meta = 0; 53 54 cp = (BTREE_CURSOR *)dbc->internal; 55 56 /* Allocate and clear the structure. */ 57 if ((ret = __os_umalloc(env, sizeof(*sp), &sp)) != 0) 58 goto err; 59 memset(sp, 0, sizeof(*sp)); 60 61 /* Get the metadata page for the entire database. */ 62 pgno = PGNO_BASE_MD; 63 if ((ret = __db_lget(dbc, 0, pgno, DB_LOCK_READ, 0, &metalock)) != 0) 64 goto err; 65 if ((ret = __memp_fget(mpf, &pgno, 66 dbc->thread_info, dbc->txn, 0, &meta)) != 0) 67 goto err; 68 69 if (flags == DB_FAST_STAT) 70 goto meta_only; 71 72 /* Walk the metadata free list, counting pages. */ 73 for (sp->bt_free = 0, pgno = meta->dbmeta.free; pgno != PGNO_INVALID;) { 74 ++sp->bt_free; 75 76 if ((ret = __memp_fget(mpf, &pgno, 77 dbc->thread_info, dbc->txn, 0, &h)) != 0) 78 goto err; 79 80 pgno = h->next_pgno; 81 if ((ret = __memp_fput(mpf, 82 dbc->thread_info, h, dbc->priority)) != 0) 83 goto err; 84 h = NULL; 85 } 86 87 /* Get the root page. */ 88 pgno = cp->root; 89 if ((ret = __db_lget(dbc, 0, pgno, DB_LOCK_READ, 0, &lock)) != 0) 90 goto err; 91 if ((ret = __memp_fget(mpf, &pgno, 92 dbc->thread_info, dbc->txn, 0, &h)) != 0) 93 goto err; 94 95 /* Get the levels from the root page. */ 96 sp->bt_levels = h->level; 97 98 /* Discard the root page. */ 99 ret = __memp_fput(mpf, dbc->thread_info, h, dbc->priority); 100 h = NULL; 101 if ((t_ret = __LPUT(dbc, lock)) != 0 && ret == 0) 102 ret = t_ret; 103 if (ret != 0) 104 goto err; 105 106 /* Walk the tree. */ 107 if ((ret = __bam_traverse(dbc, 108 DB_LOCK_READ, cp->root, __bam_stat_callback, sp)) != 0) 109 goto err; 110 111 /* 112 * Get the subdatabase metadata page if it's not the same as the 113 * one we already have. 114 */ 115 write_meta = !F_ISSET(dbp, DB_AM_RDONLY) && 116 (!MULTIVERSION(dbp) || dbc->txn != NULL); 117meta_only: 118 if (t->bt_meta != PGNO_BASE_MD || write_meta) { 119 ret = __memp_fput(mpf, dbc->thread_info, meta, dbc->priority); 120 meta = NULL; 121 if ((t_ret = __LPUT(dbc, metalock)) != 0 && ret == 0) 122 ret = t_ret; 123 if (ret != 0) 124 goto err; 125 126 if ((ret = __db_lget(dbc, 127 0, t->bt_meta, write_meta ? DB_LOCK_WRITE : DB_LOCK_READ, 128 0, &metalock)) != 0) 129 goto err; 130 if ((ret = __memp_fget(mpf, &t->bt_meta, 131 dbc->thread_info, dbc->txn, 132 write_meta ? DB_MPOOL_DIRTY : 0, &meta)) != 0) 133 goto err; 134 } 135 if (flags == DB_FAST_STAT) { 136 if (dbp->type == DB_RECNO || 137 (dbp->type == DB_BTREE && F_ISSET(dbp, DB_AM_RECNUM))) { 138 if ((ret = __db_lget(dbc, 0, 139 cp->root, DB_LOCK_READ, 0, &lock)) != 0) 140 goto err; 141 if ((ret = __memp_fget(mpf, &cp->root, 142 dbc->thread_info, dbc->txn, 0, &h)) != 0) 143 goto err; 144 145 sp->bt_nkeys = RE_NREC(h); 146 } else 147 sp->bt_nkeys = meta->dbmeta.key_count; 148 149 sp->bt_ndata = dbp->type == DB_RECNO ? 150 sp->bt_nkeys : meta->dbmeta.record_count; 151 } 152 153 /* Get metadata page statistics. */ 154 sp->bt_metaflags = meta->dbmeta.flags; 155 sp->bt_minkey = meta->minkey; 156 sp->bt_re_len = meta->re_len; 157 sp->bt_re_pad = meta->re_pad; 158 /* 159 * Don't take the page number from the meta-data page -- that value is 160 * only maintained in the primary database, we may have been called on 161 * a subdatabase. (Yes, I read the primary database meta-data page 162 * earlier in this function, but I'm asking the underlying cache so the 163 * code for the Hash and Btree methods is the same.) 164 */ 165 if ((ret = __memp_get_last_pgno(dbp->mpf, &pgno)) != 0) 166 goto err; 167 sp->bt_pagecnt = pgno + 1; 168 sp->bt_pagesize = meta->dbmeta.pagesize; 169 sp->bt_magic = meta->dbmeta.magic; 170 sp->bt_version = meta->dbmeta.version; 171 172 if (write_meta != 0) { 173 meta->dbmeta.key_count = sp->bt_nkeys; 174 meta->dbmeta.record_count = sp->bt_ndata; 175 } 176 177 *(DB_BTREE_STAT **)spp = sp; 178 179err: /* Discard the second page. */ 180 if ((t_ret = __LPUT(dbc, lock)) != 0 && ret == 0) 181 ret = t_ret; 182 if (h != NULL && (t_ret = __memp_fput(mpf, 183 dbc->thread_info, h, dbc->priority)) != 0 && ret == 0) 184 ret = t_ret; 185 186 /* Discard the metadata page. */ 187 if ((t_ret = __LPUT(dbc, metalock)) != 0 && ret == 0) 188 ret = t_ret; 189 if (meta != NULL && (t_ret = __memp_fput(mpf, 190 dbc->thread_info, meta, dbc->priority)) != 0 && ret == 0) 191 ret = t_ret; 192 193 if (ret != 0 && sp != NULL) { 194 __os_ufree(env, sp); 195 *(DB_BTREE_STAT **)spp = NULL; 196 } 197 198 return (ret); 199} 200 201/* 202 * __bam_stat_print -- 203 * Display btree/recno statistics. 204 * 205 * PUBLIC: int __bam_stat_print __P((DBC *, u_int32_t)); 206 */ 207int 208__bam_stat_print(dbc, flags) 209 DBC *dbc; 210 u_int32_t flags; 211{ 212 static const FN fn[] = { 213 { BTM_DUP, "duplicates" }, 214 { BTM_RECNO, "recno" }, 215 { BTM_RECNUM, "record-numbers" }, 216 { BTM_FIXEDLEN, "fixed-length" }, 217 { BTM_RENUMBER, "renumber" }, 218 { BTM_SUBDB, "multiple-databases" }, 219 { BTM_DUPSORT, "sorted duplicates" }, 220 { 0, NULL } 221 }; 222 DB *dbp; 223 DB_BTREE_STAT *sp; 224 ENV *env; 225 int lorder, ret; 226 const char *s; 227 228 dbp = dbc->dbp; 229 env = dbp->env; 230 231 if ((ret = __bam_stat(dbc, &sp, LF_ISSET(DB_FAST_STAT))) != 0) 232 return (ret); 233 234 if (LF_ISSET(DB_STAT_ALL)) { 235 __db_msg(env, "%s", DB_GLOBAL(db_line)); 236 __db_msg(env, "Default Btree/Recno database information:"); 237 } 238 239 __db_msg(env, "%lx\tBtree magic number", (u_long)sp->bt_magic); 240 __db_msg(env, "%lu\tBtree version number", (u_long)sp->bt_version); 241 242 (void)__db_get_lorder(dbp, &lorder); 243 switch (lorder) { 244 case 1234: 245 s = "Little-endian"; 246 break; 247 case 4321: 248 s = "Big-endian"; 249 break; 250 default: 251 s = "Unrecognized byte order"; 252 break; 253 } 254 __db_msg(env, "%s\tByte order", s); 255 __db_prflags(env, NULL, sp->bt_metaflags, fn, NULL, "\tFlags"); 256 if (dbp->type == DB_BTREE) 257 __db_dl(env, "Minimum keys per-page", (u_long)sp->bt_minkey); 258 if (dbp->type == DB_RECNO) { 259 __db_dl(env, 260 "Fixed-length record size", (u_long)sp->bt_re_len); 261 __db_msg(env, 262 "%#x\tFixed-length record pad", (u_int)sp->bt_re_pad); 263 } 264 __db_dl(env, 265 "Underlying database page size", (u_long)sp->bt_pagesize); 266 if (dbp->type == DB_BTREE) 267 __db_dl(env, "Overflow key/data size", 268 ((BTREE_CURSOR *)dbc->internal)->ovflsize); 269 __db_dl(env, "Number of levels in the tree", (u_long)sp->bt_levels); 270 __db_dl(env, dbp->type == DB_BTREE ? 271 "Number of unique keys in the tree" : 272 "Number of records in the tree", (u_long)sp->bt_nkeys); 273 __db_dl(env, 274 "Number of data items in the tree", (u_long)sp->bt_ndata); 275 276 __db_dl(env, 277 "Number of tree internal pages", (u_long)sp->bt_int_pg); 278 __db_dl_pct(env, 279 "Number of bytes free in tree internal pages", 280 (u_long)sp->bt_int_pgfree, 281 DB_PCT_PG(sp->bt_int_pgfree, sp->bt_int_pg, sp->bt_pagesize), "ff"); 282 283 __db_dl(env, 284 "Number of tree leaf pages", (u_long)sp->bt_leaf_pg); 285 __db_dl_pct(env, "Number of bytes free in tree leaf pages", 286 (u_long)sp->bt_leaf_pgfree, DB_PCT_PG( 287 sp->bt_leaf_pgfree, sp->bt_leaf_pg, sp->bt_pagesize), "ff"); 288 289 __db_dl(env, 290 "Number of tree duplicate pages", (u_long)sp->bt_dup_pg); 291 __db_dl_pct(env, 292 "Number of bytes free in tree duplicate pages", 293 (u_long)sp->bt_dup_pgfree, 294 DB_PCT_PG(sp->bt_dup_pgfree, sp->bt_dup_pg, sp->bt_pagesize), "ff"); 295 296 __db_dl(env, 297 "Number of tree overflow pages", (u_long)sp->bt_over_pg); 298 __db_dl_pct(env, "Number of bytes free in tree overflow pages", 299 (u_long)sp->bt_over_pgfree, DB_PCT_PG( 300 sp->bt_over_pgfree, sp->bt_over_pg, sp->bt_pagesize), "ff"); 301 __db_dl(env, "Number of empty pages", (u_long)sp->bt_empty_pg); 302 303 __db_dl(env, "Number of pages on the free list", (u_long)sp->bt_free); 304 305 __os_ufree(env, sp); 306 307 return (0); 308} 309 310/* 311 * __bam_stat_callback -- 312 * Statistics callback. 313 * 314 * PUBLIC: int __bam_stat_callback __P((DBC *, PAGE *, void *, int *)); 315 */ 316int 317__bam_stat_callback(dbc, h, cookie, putp) 318 DBC *dbc; 319 PAGE *h; 320 void *cookie; 321 int *putp; 322{ 323 DB *dbp; 324 DB_BTREE_STAT *sp; 325 db_indx_t indx, *inp, top; 326 u_int8_t type; 327 328 dbp = dbc->dbp; 329 sp = cookie; 330 *putp = 0; 331 top = NUM_ENT(h); 332 inp = P_INP(dbp, h); 333 334 switch (TYPE(h)) { 335 case P_IBTREE: 336 case P_IRECNO: 337 ++sp->bt_int_pg; 338 sp->bt_int_pgfree += P_FREESPACE(dbp, h); 339 break; 340 case P_LBTREE: 341 if (top == 0) 342 ++sp->bt_empty_pg; 343 344 /* Correct for on-page duplicates and deleted items. */ 345 for (indx = 0; indx < top; indx += P_INDX) { 346 type = GET_BKEYDATA(dbp, h, indx + O_INDX)->type; 347 /* Ignore deleted items. */ 348 if (B_DISSET(type)) 349 continue; 350 351 /* Ignore duplicate keys. */ 352 if (indx + P_INDX >= top || 353 inp[indx] != inp[indx + P_INDX]) 354 ++sp->bt_nkeys; 355 356 /* Ignore off-page duplicates. */ 357 if (B_TYPE(type) != B_DUPLICATE) 358 ++sp->bt_ndata; 359 } 360 361 ++sp->bt_leaf_pg; 362 sp->bt_leaf_pgfree += P_FREESPACE(dbp, h); 363 break; 364 case P_LRECNO: 365 if (top == 0) 366 ++sp->bt_empty_pg; 367 368 /* 369 * If walking a recno tree, then each of these items is a key. 370 * Otherwise, we're walking an off-page duplicate set. 371 */ 372 if (dbp->type == DB_RECNO) { 373 /* 374 * Correct for deleted items in non-renumbering Recno 375 * databases. 376 */ 377 if (F_ISSET(dbp, DB_AM_RENUMBER)) { 378 sp->bt_nkeys += top; 379 sp->bt_ndata += top; 380 } else 381 for (indx = 0; indx < top; indx += O_INDX) { 382 type = GET_BKEYDATA(dbp, h, indx)->type; 383 if (!B_DISSET(type)) { 384 ++sp->bt_ndata; 385 ++sp->bt_nkeys; 386 } 387 } 388 389 ++sp->bt_leaf_pg; 390 sp->bt_leaf_pgfree += P_FREESPACE(dbp, h); 391 } else { 392 sp->bt_ndata += top; 393 394 ++sp->bt_dup_pg; 395 sp->bt_dup_pgfree += P_FREESPACE(dbp, h); 396 } 397 break; 398 case P_LDUP: 399 if (top == 0) 400 ++sp->bt_empty_pg; 401 402 /* Correct for deleted items. */ 403 for (indx = 0; indx < top; indx += O_INDX) 404 if (!B_DISSET(GET_BKEYDATA(dbp, h, indx)->type)) 405 ++sp->bt_ndata; 406 407 ++sp->bt_dup_pg; 408 sp->bt_dup_pgfree += P_FREESPACE(dbp, h); 409 break; 410 case P_OVERFLOW: 411 ++sp->bt_over_pg; 412 sp->bt_over_pgfree += P_OVFLSPACE(dbp, dbp->pgsize, h); 413 break; 414 default: 415 return (__db_pgfmt(dbp->env, h->pgno)); 416 } 417 return (0); 418} 419 420/* 421 * __bam_print_cursor -- 422 * Display the current internal cursor. 423 * 424 * PUBLIC: void __bam_print_cursor __P((DBC *)); 425 */ 426void 427__bam_print_cursor(dbc) 428 DBC *dbc; 429{ 430 static const FN fn[] = { 431 { C_DELETED, "C_DELETED" }, 432 { C_RECNUM, "C_RECNUM" }, 433 { C_RENUMBER, "C_RENUMBER" }, 434 { 0, NULL } 435 }; 436 ENV *env; 437 BTREE_CURSOR *cp; 438 439 env = dbc->env; 440 cp = (BTREE_CURSOR *)dbc->internal; 441 442 STAT_ULONG("Overflow size", cp->ovflsize); 443 if (dbc->dbtype == DB_RECNO) 444 STAT_ULONG("Recno", cp->recno); 445 STAT_ULONG("Order", cp->order); 446 __db_prflags(env, NULL, cp->flags, fn, NULL, "\tInternal Flags"); 447} 448 449#else /* !HAVE_STATISTICS */ 450 451int 452__bam_stat(dbc, spp, flags) 453 DBC *dbc; 454 void *spp; 455 u_int32_t flags; 456{ 457 COMPQUIET(spp, NULL); 458 COMPQUIET(flags, 0); 459 460 return (__db_stat_not_built(dbc->env)); 461} 462 463int 464__bam_stat_print(dbc, flags) 465 DBC *dbc; 466 u_int32_t flags; 467{ 468 COMPQUIET(flags, 0); 469 470 return (__db_stat_not_built(dbc->env)); 471} 472#endif 473 474#ifndef HAVE_BREW 475/* 476 * __bam_key_range -- 477 * Return proportion of keys relative to given key. The numbers are 478 * slightly skewed due to on page duplicates. 479 * 480 * PUBLIC: int __bam_key_range __P((DBC *, DBT *, DB_KEY_RANGE *, u_int32_t)); 481 */ 482int 483__bam_key_range(dbc, dbt, kp, flags) 484 DBC *dbc; 485 DBT *dbt; 486 DB_KEY_RANGE *kp; 487 u_int32_t flags; 488{ 489 BTREE_CURSOR *cp; 490 EPG *sp; 491 double factor; 492 int exact, ret; 493 494 COMPQUIET(flags, 0); 495 496 if ((ret = __bam_search(dbc, PGNO_INVALID, 497 dbt, SR_STK_ONLY, 1, NULL, &exact)) != 0) 498 return (ret); 499 500 cp = (BTREE_CURSOR *)dbc->internal; 501 kp->less = kp->greater = 0.0; 502 503 factor = 1.0; 504 505 /* Correct the leaf page. */ 506 cp->csp->entries /= 2; 507 cp->csp->indx /= 2; 508 for (sp = cp->sp; sp <= cp->csp; ++sp) { 509 /* 510 * At each level we know that pages greater than indx contain 511 * keys greater than what we are looking for and those less 512 * than indx are less than. The one pointed to by indx may 513 * have some less, some greater or even equal. If indx is 514 * equal to the number of entries, then the key is out of range 515 * and everything is less. 516 */ 517 if (sp->indx == 0) 518 kp->greater += factor * (sp->entries - 1)/sp->entries; 519 else if (sp->indx == sp->entries) 520 kp->less += factor; 521 else { 522 kp->less += factor * sp->indx / sp->entries; 523 kp->greater += factor * 524 ((sp->entries - sp->indx) - 1) / sp->entries; 525 } 526 factor *= 1.0/sp->entries; 527 } 528 529 /* 530 * If there was an exact match then assign 1 n'th to the key itself. 531 * Otherwise that factor belongs to those greater than the key, unless 532 * the key was out of range. 533 */ 534 if (exact) 535 kp->equal = factor; 536 else { 537 if (kp->less != 1) 538 kp->greater += factor; 539 kp->equal = 0; 540 } 541 542 BT_STK_CLR(cp); 543 544 return (0); 545} 546#endif 547 548/* 549 * __bam_traverse -- 550 * Walk a Btree database. 551 * 552 * PUBLIC: int __bam_traverse __P((DBC *, db_lockmode_t, 553 * PUBLIC: db_pgno_t, int (*)(DBC *, PAGE *, void *, int *), void *)); 554 */ 555int 556__bam_traverse(dbc, mode, root_pgno, callback, cookie) 557 DBC *dbc; 558 db_lockmode_t mode; 559 db_pgno_t root_pgno; 560 int (*callback)__P((DBC *, PAGE *, void *, int *)); 561 void *cookie; 562{ 563 BINTERNAL *bi; 564 BKEYDATA *bk; 565 DB *dbp; 566 DB_LOCK lock; 567 DB_MPOOLFILE *mpf; 568 PAGE *h; 569 RINTERNAL *ri; 570 db_indx_t indx, *inp; 571 int already_put, ret, t_ret; 572 573 dbp = dbc->dbp; 574 mpf = dbp->mpf; 575 already_put = 0; 576 577 if ((ret = __db_lget(dbc, 0, root_pgno, mode, 0, &lock)) != 0) 578 return (ret); 579 if ((ret = __memp_fget(mpf, &root_pgno, 580 dbc->thread_info, dbc->txn, 0, &h)) != 0) { 581 (void)__TLPUT(dbc, lock); 582 return (ret); 583 } 584 585 switch (TYPE(h)) { 586 case P_IBTREE: 587 for (indx = 0; indx < NUM_ENT(h); indx += O_INDX) { 588 bi = GET_BINTERNAL(dbp, h, indx); 589 if (B_TYPE(bi->type) == B_OVERFLOW && 590 (ret = __db_traverse_big(dbc, 591 ((BOVERFLOW *)bi->data)->pgno, 592 callback, cookie)) != 0) 593 goto err; 594 if ((ret = __bam_traverse( 595 dbc, mode, bi->pgno, callback, cookie)) != 0) 596 goto err; 597 } 598 break; 599 case P_IRECNO: 600 for (indx = 0; indx < NUM_ENT(h); indx += O_INDX) { 601 ri = GET_RINTERNAL(dbp, h, indx); 602 if ((ret = __bam_traverse( 603 dbc, mode, ri->pgno, callback, cookie)) != 0) 604 goto err; 605 } 606 break; 607 case P_LBTREE: 608 inp = P_INP(dbp, h); 609 for (indx = 0; indx < NUM_ENT(h); indx += P_INDX) { 610 bk = GET_BKEYDATA(dbp, h, indx); 611 if (B_TYPE(bk->type) == B_OVERFLOW && 612 (indx + P_INDX >= NUM_ENT(h) || 613 inp[indx] != inp[indx + P_INDX])) { 614 if ((ret = __db_traverse_big(dbc, 615 GET_BOVERFLOW(dbp, h, indx)->pgno, 616 callback, cookie)) != 0) 617 goto err; 618 } 619 bk = GET_BKEYDATA(dbp, h, indx + O_INDX); 620 if (B_TYPE(bk->type) == B_DUPLICATE && 621 (ret = __bam_traverse(dbc, mode, 622 GET_BOVERFLOW(dbp, h, indx + O_INDX)->pgno, 623 callback, cookie)) != 0) 624 goto err; 625 if (B_TYPE(bk->type) == B_OVERFLOW && 626 (ret = __db_traverse_big(dbc, 627 GET_BOVERFLOW(dbp, h, indx + O_INDX)->pgno, 628 callback, cookie)) != 0) 629 goto err; 630 } 631 break; 632 case P_LDUP: 633 case P_LRECNO: 634 for (indx = 0; indx < NUM_ENT(h); indx += O_INDX) { 635 bk = GET_BKEYDATA(dbp, h, indx); 636 if (B_TYPE(bk->type) == B_OVERFLOW && 637 (ret = __db_traverse_big(dbc, 638 GET_BOVERFLOW(dbp, h, indx)->pgno, 639 callback, cookie)) != 0) 640 goto err; 641 } 642 break; 643 default: 644 return (__db_pgfmt(dbp->env, h->pgno)); 645 } 646 647 ret = callback(dbc, h, cookie, &already_put); 648 649err: if (!already_put && (t_ret = __memp_fput(mpf, 650 dbc->thread_info, h, dbc->priority)) != 0 && ret == 0) 651 ret = t_ret; 652 if ((t_ret = __TLPUT(dbc, lock)) != 0 && ret == 0) 653 ret = t_ret; 654 655 return (ret); 656} 657