1/*- 2 * See the file LICENSE for redistribution information. 3 * 4 * Copyright (c) 1996,2008 Oracle. All rights reserved. 5 */ 6/* 7 * Copyright (c) 1990, 1993, 1994, 1995, 1996 8 * Keith Bostic. All rights reserved. 9 */ 10/* 11 * Copyright (c) 1990, 1993, 1994, 1995 12 * The Regents of the University of California. All rights reserved. 13 * 14 * This code is derived from software contributed to Berkeley by 15 * Mike Olson. 16 * 17 * Redistribution and use in source and binary forms, with or without 18 * modification, are permitted provided that the following conditions 19 * are met: 20 * 1. Redistributions of source code must retain the above copyright 21 * notice, this list of conditions and the following disclaimer. 22 * 2. Redistributions in binary form must reproduce the above copyright 23 * notice, this list of conditions and the following disclaimer in the 24 * documentation and/or other materials provided with the distribution. 25 * 3. Neither the name of the University nor the names of its contributors 26 * may be used to endorse or promote products derived from this software 27 * without specific prior written permission. 28 * 29 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 30 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 31 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 32 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 33 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 34 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 35 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 36 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 37 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 38 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 39 * SUCH DAMAGE. 
 *
 * $Id: db_meta.c,v 12.58 2008/05/07 12:27:32 bschmeck Exp $
 */

#include "db_config.h"

#include "db_int.h"
#include "dbinc/db_page.h"
#include "dbinc/lock.h"
#include "dbinc/mp.h"
#include "dbinc/txn.h"
#include "dbinc/db_am.h"

/* Local helpers; the freelist-sorting ones exist only with ftruncate. */
static void __db_init_meta __P((DB *, void *, db_pgno_t, u_int32_t));
#ifdef HAVE_FTRUNCATE
static void __db_freelist_sort __P((db_pglist_t *, u_int32_t));
static int __db_pglistcmp __P((const void *, const void *));
static int __db_truncate_freelist __P((DBC *, DBMETA *,
	      PAGE *, db_pgno_t *, u_int32_t, u_int32_t));
#endif

/*
 * __db_init_meta --
 *	Helper function for __db_new that initializes the important fields in
 * a meta-data page (used instead of P_INIT).  We need to make sure that we
 * retain the page number and LSN of the existing page.
 *
 *	dbp    -- database handle (supplies page size and checksum flag)
 *	p      -- in-memory page to re-initialize as a DBMETA
 *	pgno   -- page number to store back into the cleared header
 *	pgtype -- page type to store (e.g. P_BTREEMETA)
 */
static void
__db_init_meta(dbp, p, pgno, pgtype)
	DB *dbp;
	void *p;
	db_pgno_t pgno;
	u_int32_t pgtype;
{
	DBMETA *meta;
	DB_LSN save_lsn;

	meta = (DBMETA *)p;
	/* Preserve the LSN across the memset; recovery depends on it. */
	save_lsn = meta->lsn;
	memset(meta, 0, sizeof(DBMETA));
	meta->lsn = save_lsn;
	meta->pagesize = dbp->pgsize;
	if (F_ISSET(dbp, DB_AM_CHKSUM))
		FLD_SET(meta->metaflags, DBMETA_CHKSUM);
	meta->pgno = pgno;
	meta->type = (u_int8_t)pgtype;
}

/*
 * __db_new --
 *	Get a new page, preferably from the freelist.
 *
 * PUBLIC: int __db_new __P((DBC *, u_int32_t, PAGE **));
 */
int
__db_new(dbc, type, pagepp)
	DBC *dbc;
	u_int32_t type;
	PAGE **pagepp;
{
	DB *dbp;
	DBMETA *meta;
	DB_LOCK metalock;
	DB_LSN lsn;
	DB_MPOOLFILE *mpf;
	ENV *env;
	PAGE *h;
	db_pgno_t last, *list, pgno, newnext;
	int extend, ret, t_ret;

	meta = NULL;
	dbp = dbc->dbp;
	env = dbp->env;
	mpf = dbp->mpf;
	h = NULL;
	newnext = PGNO_INVALID;

	/* Lock and fetch the base metadata page, which owns the freelist. */
	pgno = PGNO_BASE_MD;
	if ((ret = __db_lget(dbc,
	    LCK_ALWAYS, pgno, DB_LOCK_WRITE, 0, &metalock)) != 0)
		goto err;
	if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn,
	    DB_MPOOL_DIRTY, &meta)) != 0)
		goto err;
	last = meta->last_pgno;
	if (meta->free == PGNO_INVALID) {
		/*
		 * Freelist is empty.  If the caller asked us not to grow
		 * the file, return success with *pagepp == NULL (ret is 0
		 * here); otherwise plan to extend by one page.
		 */
		if (FLD_ISSET(type, P_DONTEXTEND)) {
			*pagepp = NULL;
			goto err;
		}
		last = pgno = meta->last_pgno + 1;
		ZERO_LSN(lsn);
		extend = 1;
	} else {
		pgno = meta->free;
		if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn,
		    DB_MPOOL_DIRTY, &h)) != 0)
			goto err;

		/*
		 * We want to take the first page off the free list and
		 * then set meta->free to the that page's next_pgno, but
		 * we need to log the change first.
		 */
		newnext = h->next_pgno;
		lsn = h->lsn;
		extend = 0;
		DB_ASSERT(env, TYPE(h) == P_INVALID);

		/* A non-free page on the freelist means the file is corrupt. */
		if (TYPE(h) != P_INVALID) {
			__db_errx(env,
			    "%s page %lu is on free list with type %lu",
				dbp->fname, (u_long)PGNO(h), (u_long)TYPE(h));
			return (__env_panic(env, EINVAL));
		}

	}

	FLD_CLR(type, P_DONTEXTEND);

	/*
	 * Log the allocation before fetching the new page.  If we
	 * don't have room in the log then we don't want to tell
	 * mpool to extend the file.
	 */
	if (DBC_LOGGING(dbc)) {
		if ((ret = __db_pg_alloc_log(dbp, dbc->txn, &LSN(meta), 0,
		    &LSN(meta), PGNO_BASE_MD, &lsn,
		    pgno, (u_int32_t)type, newnext, meta->last_pgno)) != 0)
			goto err;
	} else
		LSN_NOT_LOGGED(LSN(meta));

	meta->free = newnext;

	if (extend == 1) {
		/* Ask mpool to create the brand-new page at the file end. */
		if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn,
		    DB_MPOOL_NEW, &h)) != 0)
			goto err;
		DB_ASSERT(env, last == pgno);
		meta->last_pgno = pgno;
		ZERO_LSN(h->lsn);
		h->pgno = pgno;
	}
	LSN(h) = LSN(meta);

	/* Release the metadata page/lock before initializing the new page. */
	ret = __memp_fput(mpf, dbc->thread_info, meta, dbc->priority);
	meta = NULL;
	if ((t_ret = __TLPUT(dbc, metalock)) != 0 && ret == 0)
		ret = t_ret;
	if (ret != 0)
		goto err;

	switch (type) {
	case P_BTREEMETA:
	case P_HASHMETA:
	case P_QAMMETA:
		__db_init_meta(dbp, h, h->pgno, type);
		break;
	default:
		P_INIT(h, dbp->pgsize,
		    h->pgno, PGNO_INVALID, PGNO_INVALID, 0, type);
		break;
	}

	/* Fix up the sorted free list if necessary. */
#ifdef HAVE_FTRUNCATE
	if (extend == 0) {
		u_int32_t nelems = 0;

		if ((ret = __memp_get_freelist(dbp->mpf, &nelems, &list)) != 0)
			goto err;
		if (nelems != 0) {
			/* We always allocate the head of the sorted list. */
			DB_ASSERT(env, h->pgno == list[0]);
			memmove(list, &list[1], (nelems - 1) * sizeof(*list));
			if ((ret = __memp_extend_freelist(
			    dbp->mpf, nelems - 1, &list)) != 0)
				goto err;
		}
	}
#else
	COMPQUIET(list, NULL);
#endif

	/*
	 * If dirty reads are enabled and we are in a transaction, we could
	 * abort this allocation after the page(s) pointing to this
	 * one have their locks downgraded.  This would permit dirty readers
	 * to access this page which is ok, but they must be off the
	 * page when we abort.  We never lock overflow pages or off page
	 * duplicate trees.
	 */
	if (type != P_OVERFLOW && !F_ISSET(dbc, DBC_OPD) &&
	    F_ISSET(dbc->dbp, DB_AM_READ_UNCOMMITTED) && dbc->txn != NULL) {
		if ((ret = __db_lget(dbc, 0,
		    h->pgno, DB_LOCK_WWRITE, 0, &metalock)) != 0)
			goto err;
	}

	/* Success: hand the pinned, initialized page back to the caller. */
	*pagepp = h;
	return (0);

	/* Error (and P_DONTEXTEND) exit: drop any pinned pages and the lock. */
err:	if (h != NULL)
		(void)__memp_fput(mpf, dbc->thread_info, h, dbc->priority);
	if (meta != NULL)
		(void)__memp_fput(mpf, dbc->thread_info, meta, dbc->priority);
	(void)__TLPUT(dbc, metalock);
	return (ret);
}

/*
 * __db_free --
 *	Add a page to the head of the freelist.
 *
 * PUBLIC: int __db_free __P((DBC *, PAGE *));
 */
int
__db_free(dbc, h)
	DBC *dbc;
	PAGE *h;
{
	DB *dbp;
	DBMETA *meta;
	DBT ddbt, ldbt;
	DB_LOCK metalock;
	DB_LSN *lsnp;
	DB_MPOOLFILE *mpf;
	PAGE *prev;
	db_pgno_t last_pgno, next_pgno, pgno, prev_pgno;
	u_int32_t lflag;
	int ret, t_ret;
#ifdef HAVE_FTRUNCATE
	db_pgno_t *list, *lp;
	u_int32_t nelem, position, start;
	int do_truncate;
#endif

	dbp = dbc->dbp;
	mpf = dbp->mpf;
	prev_pgno = PGNO_INVALID;
	meta = NULL;
	prev = NULL;
#ifdef HAVE_FTRUNCATE
	lp = NULL;
	nelem = 0;
	do_truncate = 0;
#endif

	/*
	 * Retrieve the metadata page.  If we are not keeping a sorted
	 * free list put the page at the head of the the free list.
	 * If we are keeping a sorted free list, for truncation,
	 * then figure out where this page belongs and either
	 * link it in or truncate the file as much as possible.
	 * If either the lock get or page get routines
	 * fail, then we need to put the page with which we were called
	 * back because our caller assumes we take care of it.
	 */
	pgno = PGNO_BASE_MD;
	if ((ret = __db_lget(dbc,
	    LCK_ALWAYS, pgno, DB_LOCK_WRITE, 0, &metalock)) != 0)
		goto err;

	/* If we support truncate, we might not dirty the meta page.
*/ 304 if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn, 305#ifdef HAVE_FTRUNCATE 306 0, 307#else 308 DB_MPOOL_DIRTY, 309#endif 310 &meta)) != 0) 311 goto err1; 312 313 last_pgno = meta->last_pgno; 314 next_pgno = meta->free; 315 /* 316 * Assign lsnp here so it always initialized when 317 * HAVE_FTRUNCATE is not defined. 318 */ 319 lsnp = &LSN(meta); 320 321 DB_ASSERT(dbp->env, h->pgno != next_pgno); 322 323#ifdef HAVE_FTRUNCATE 324 /* 325 * If we are maintaining a sorted free list see if we either have a 326 * new truncation point or the page goes somewhere in the middle of 327 * the list. If it goes in the middle of the list, we will drop the 328 * meta page and get the previous page. 329 */ 330 if ((ret = __memp_get_freelist(mpf, &nelem, &list)) != 0) 331 goto err1; 332 if (list == NULL) 333 goto no_sort; 334 335 if (h->pgno != last_pgno) { 336 /* 337 * Put the page number in the sorted list. 338 * Finds its position and the previous page, 339 * extend the list, make room and insert. 340 */ 341 position = 0; 342 if (nelem != 0) { 343 __db_freelist_pos(h->pgno, list, nelem, &position); 344 345 DB_ASSERT(dbp->env, h->pgno != list[position]); 346 347 /* Get the previous page if this is not the smallest. */ 348 if (position != 0 || h->pgno > list[0]) 349 prev_pgno = list[position]; 350 } 351 352 /* Put the page number into the list. */ 353 if ((ret = __memp_extend_freelist(mpf, nelem + 1, &list)) != 0) 354 return (ret); 355 if (prev_pgno != PGNO_INVALID) 356 lp = &list[position + 1]; 357 else 358 lp = list; 359 if (nelem != 0 && position != nelem) 360 memmove(lp + 1, lp, 361 (size_t)((u_int8_t*)&list[nelem] - (u_int8_t*)lp)); 362 *lp = h->pgno; 363 } else if (nelem != 0) { 364 /* Find the truncation point. 
*/ 365 for (lp = &list[nelem - 1]; lp >= list; lp--) 366 if (--last_pgno != *lp) 367 break; 368 if (lp < list || last_pgno < h->pgno - 1) 369 do_truncate = 1; 370 last_pgno = meta->last_pgno; 371 } 372 373no_sort: 374 if (prev_pgno == PGNO_INVALID) { 375 if ((ret = __memp_dirty(mpf, 376 &meta, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0) 377 goto err1; 378 lsnp = &LSN(meta); 379 } else { 380 pgno = prev_pgno; 381 if ((ret = __memp_fget(mpf, &pgno, 382 dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &prev)) != 0) 383 goto err1; 384 next_pgno = NEXT_PGNO(prev); 385 lsnp = &LSN(prev); 386 } 387#endif 388 389 /* 390 * Log the change. 391 * We are either logging an update to the metapage or to the 392 * previous page in the sorted list. 393 */ 394 if (DBC_LOGGING(dbc)) { 395 memset(&ldbt, 0, sizeof(ldbt)); 396 ldbt.data = h; 397 ldbt.size = P_OVERHEAD(dbp); 398 /* 399 * If we are truncating the file, we need to make sure 400 * the logging happens before the truncation. If we 401 * are truncating multiple pages we don't need to flush the 402 * log here as it will be flushed by __db_truncate_freelist. 403 * If we are zeroing pages rather than truncating we still 404 * need to flush since they will not have valid LSNs. 
405 */ 406 lflag = 0; 407 408 if (h->pgno == last_pgno 409#ifdef HAVE_FTRUNCATE 410 && do_truncate == 0 411#endif 412 ) 413 lflag = DB_FLUSH; 414 switch (h->type) { 415 case P_HASH: 416 case P_IBTREE: 417 case P_IRECNO: 418 case P_LBTREE: 419 case P_LRECNO: 420 case P_LDUP: 421 if (h->entries > 0) { 422 ldbt.size += h->entries * sizeof(db_indx_t); 423 ddbt.data = (u_int8_t *)h + HOFFSET(h); 424 ddbt.size = dbp->pgsize - HOFFSET(h); 425 if ((ret = __db_pg_freedata_log(dbp, dbc->txn, 426 lsnp, lflag, 427 h->pgno, lsnp, pgno, 428 &ldbt, next_pgno, last_pgno, &ddbt)) != 0) 429 goto err1; 430 goto logged; 431 } 432 break; 433 case P_HASHMETA: 434 ldbt.size = sizeof(HMETA); 435 break; 436 case P_BTREEMETA: 437 ldbt.size = sizeof(BTMETA); 438 break; 439 case P_OVERFLOW: 440 ldbt.size += OV_LEN(h); 441 break; 442 default: 443 DB_ASSERT(dbp->env, h->type != P_QAMDATA); 444 } 445 446 if ((ret = __db_pg_free_log(dbp, 447 dbc->txn, lsnp, lflag, h->pgno, 448 lsnp, pgno, &ldbt, next_pgno, last_pgno)) != 0) 449 goto err1; 450 } else 451 LSN_NOT_LOGGED(*lsnp); 452 453logged: 454#ifdef HAVE_FTRUNCATE 455 if (do_truncate) { 456 start = (u_int32_t) (lp - list) + 1; 457 meta->last_pgno--; 458 ret = __db_truncate_freelist( 459 dbc, meta, h, list, start, nelem); 460 h = NULL; 461 } else 462#endif 463 if (h->pgno == last_pgno) { 464 LSN(h) = *lsnp; 465 P_INIT(h, dbp->pgsize, 466 h->pgno, PGNO_INVALID, next_pgno, 0, P_INVALID); 467 if ((ret = __memp_fput(mpf, 468 dbc->thread_info, h, DB_PRIORITY_VERY_LOW)) != 0) 469 goto err1; 470 h = NULL; 471 /* Give the page back to the OS. */ 472 if ((ret = 473 __memp_ftruncate(mpf, dbc->thread_info, last_pgno, 0)) != 0) 474 goto err1; 475 DB_ASSERT(dbp->env, meta->pgno == PGNO_BASE_MD); 476 meta->last_pgno--; 477 h = NULL; 478 } else { 479 /* 480 * If we are not truncating the page then we 481 * reinitialize it and put it at the head of 482 * the free list. 
483 */ 484 if ((ret = __memp_dirty(mpf, 485 &h, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0) 486 goto err1; 487 LSN(h) = *lsnp; 488 P_INIT(h, dbp->pgsize, 489 h->pgno, PGNO_INVALID, next_pgno, 0, P_INVALID); 490#ifdef DIAGNOSTIC 491 memset((u_int8_t *) h + P_OVERHEAD(dbp), 492 CLEAR_BYTE, dbp->pgsize - P_OVERHEAD(dbp)); 493#endif 494 if (prev_pgno == PGNO_INVALID) 495 meta->free = h->pgno; 496 else 497 NEXT_PGNO(prev) = h->pgno; 498 } 499 500 /* Discard the metadata or previous page. */ 501err1: if (meta != NULL && (t_ret = __memp_fput(mpf, 502 dbc->thread_info, (PAGE *)meta, dbc->priority)) != 0 && ret == 0) 503 ret = t_ret; 504 if ((t_ret = __TLPUT(dbc, metalock)) != 0 && ret == 0) 505 ret = t_ret; 506 if (prev != (PAGE*) meta && prev != NULL && (t_ret = __memp_fput(mpf, 507 dbc->thread_info, prev, dbc->priority)) != 0 && ret == 0) 508 ret = t_ret; 509 510 /* Discard the caller's page reference. */ 511err: if (h != NULL && (t_ret = __memp_fput(mpf, 512 dbc->thread_info, h, dbc->priority)) != 0 && ret == 0) 513 ret = t_ret; 514 515 /* 516 * XXX 517 * We have to unlock the caller's page in the caller! 518 */ 519 return (ret); 520} 521 522#ifdef HAVE_FTRUNCATE 523/* 524 * __db_freelist_pos -- find the position of a page in the freelist. 525 * The list is sorted, we do a binary search. 
 *
 * PUBLIC: #ifdef HAVE_FTRUNCATE
 * PUBLIC: void __db_freelist_pos __P((db_pgno_t,
 * PUBLIC:       db_pgno_t *, u_int32_t, u_int32_t *));
 * PUBLIC: #endif
 */
void
__db_freelist_pos(pgno, list, nelem, posp)
	db_pgno_t pgno;
	db_pgno_t *list;
	u_int32_t nelem;
	u_int32_t *posp;
{
	u_int32_t base, indx, lim;

	/*
	 * Standard binary search; on a miss, back up so *posp is the
	 * index of the largest element less than pgno (0 if pgno is
	 * smaller than everything in the list).
	 */
	indx = 0;
	for (base = 0, lim = nelem; lim != 0; lim >>= 1) {
		indx = base + (lim >> 1);
		if (pgno == list[indx]) {
			*posp = indx;
			return;
		}
		if (pgno > list[indx]) {
			base = indx + 1;
			--lim;
		}
	}
	if (base != 0)
		base--;
	*posp = base;
	return;
}

/*
 * __db_pglistcmp --
 *	qsort comparison callback: order db_pglist_t entries by page number.
 */
static int
__db_pglistcmp(a, b)
	const void *a, *b;
{
	db_pglist_t *ap, *bp;

	ap = (db_pglist_t *)a;
	bp = (db_pglist_t *)b;

	return ((ap->pgno > bp->pgno) ? 1 : (ap->pgno < bp->pgno) ? -1: 0);
}

/*
 * __db_freelist_sort -- sort a list of free pages.
 */
static void
__db_freelist_sort(list, nelems)
	db_pglist_t *list;
	u_int32_t nelems;
{
	qsort(list, (size_t)nelems, sizeof(db_pglist_t), __db_pglistcmp);
}

/*
 * __db_pg_truncate -- sort the freelist and find the truncation point.
 *
 * PUBLIC: #ifdef HAVE_FTRUNCATE
 * PUBLIC: int __db_pg_truncate __P((DBC *, DB_TXN *,
 * PUBLIC:    db_pglist_t *list, DB_COMPACT *, u_int32_t *, db_pgno_t *,
 * PUBLIC:    DB_LSN *, int));
 * PUBLIC: #endif
 */
int
__db_pg_truncate(dbc, txn, list, c_data, nelemp, last_pgno, lsnp, in_recovery)
	DBC *dbc;
	DB_TXN *txn;
	db_pglist_t *list;
	DB_COMPACT *c_data;
	u_int32_t *nelemp;
	db_pgno_t *last_pgno;
	DB_LSN *lsnp;
	int in_recovery;
{
	DB *dbp;
	DB_MPOOLFILE *mpf;
	PAGE *h;
	db_pglist_t *lp;
	db_pgno_t pgno;
	u_int32_t nelems;
	int ret;

	ret = 0;

	dbp = dbc->dbp;
	mpf = dbp->mpf;
	nelems = *nelemp;
	/* Sort the list */
	__db_freelist_sort(list, nelems);

	/*
	 * Find the truncation point: walk backwards from the end while the
	 * list entries form a contiguous run ending at *last_pgno.
	 */
	pgno = *last_pgno;
	lp = &list[nelems - 1];
	while (nelems != 0) {
		if (lp->pgno != pgno)
			break;
		pgno--;
		nelems--;
		lp--;
	}

	/*
	 * Figure out what (if any) pages can be truncated immediately and
	 * record the place from which we can truncate, so we can do the
	 * memp_ftruncate below.  We also use this to avoid ever putting
	 * these pages on the freelist, which we are about to relink.
	 */
	for (lp = list; lp < &list[nelems]; lp++) {
		if ((ret = __memp_fget(
		    mpf, &lp->pgno, dbc->thread_info, txn, 0, &h)) != 0) {
			/* Page may have been truncated later. */
			if (in_recovery && ret == DB_PAGE_NOTFOUND) {
				ret = 0;
				continue;
			}
			goto err;
		}
		/* During recovery only relink pages whose LSN matches. */
		if (!in_recovery || LOG_COMPARE(&LSN(h), &lp->lsn) == 0) {
			if ((ret = __memp_dirty(mpf, &h,
			    dbc->thread_info, txn, dbp->priority, 0)) != 0) {
				(void)__memp_fput(mpf,
				    dbc->thread_info, h, dbp->priority);
				goto err;
			}
			if (lp == &list[nelems - 1])
				NEXT_PGNO(h) = PGNO_INVALID;
			else
				NEXT_PGNO(h) = lp[1].pgno;
			DB_ASSERT(mpf->env, NEXT_PGNO(h) < *last_pgno);

			LSN(h) = *lsnp;
		}
		if ((ret = __memp_fput(mpf,
		    dbc->thread_info, h, dbp->priority)) != 0)
			goto err;
	}

	if (pgno != *last_pgno) {
		/* Actually shrink the file past the truncation point. */
		if ((ret = __memp_ftruncate(mpf, dbc->thread_info,
		    pgno + 1, in_recovery ? MP_TRUNC_RECOVER : 0)) != 0)
			goto err;
		if (c_data)
			c_data->compact_pages_truncated += *last_pgno - pgno;
		*last_pgno = pgno;
	}
	*nelemp = nelems;

err:	return (ret);
}

/*
 * __db_free_truncate --
 *	Truncate free pages at the end of the file.
 *
 * PUBLIC: #ifdef HAVE_FTRUNCATE
 * PUBLIC: int __db_free_truncate __P((DB *, DB_THREAD_INFO *, DB_TXN *,
 * PUBLIC:    u_int32_t, DB_COMPACT *, db_pglist_t **, u_int32_t *,
 * PUBLIC:    db_pgno_t *));
 * PUBLIC: #endif
 */
int
__db_free_truncate(dbp, ip, txn, flags, c_data, listp, nelemp, last_pgnop)
	DB *dbp;
	DB_THREAD_INFO *ip;
	DB_TXN *txn;
	u_int32_t flags;
	DB_COMPACT *c_data;
	db_pglist_t **listp;
	u_int32_t *nelemp;
	db_pgno_t *last_pgnop;
{
	DBC *dbc;
	DBMETA *meta;
	DBT ddbt;
	DB_LOCK metalock;
	DB_LSN null_lsn;
	DB_MPOOLFILE *mpf;
	ENV *env;
	PAGE *h;
	db_pglist_t *list, *lp;
	db_pgno_t pgno;
	u_int32_t nelems;
	int ret, t_ret;
	size_t size;

	COMPQUIET(flags, 0);
	list = NULL;
	meta = NULL;
	env = dbp->env;
	mpf = dbp->mpf;
	h = NULL;
	nelems = 0;
	if (listp != NULL) {
		*listp = NULL;
		DB_ASSERT(env, nelemp != NULL);
		*nelemp = 0;
	}

	if ((ret = __db_cursor(dbp, ip, txn, &dbc, DB_WRITELOCK)) != 0)
		return (ret);

	pgno = PGNO_BASE_MD;
	if ((ret = __db_lget(dbc,
	    LCK_ALWAYS, pgno, DB_LOCK_WRITE, 0, &metalock)) != 0)
		goto err;
	if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn, 0,
	    &meta)) != 0)
		goto err;

	if (last_pgnop != NULL)
		*last_pgnop = meta->last_pgno;
	if ((pgno = meta->free) == PGNO_INVALID)
		goto done;

	/* Walk the free chain, collecting (pgno, lsn) pairs; grow by doubling. */
	size = 128;
	if ((ret = __os_malloc(env, size * sizeof(*list), &list)) != 0)
		goto err;
	lp = list;

	do {
		if (lp == &list[size]) {
			size *= 2;
			if ((ret = __os_realloc(env,
			    size * sizeof(*list), &list)) != 0)
				goto err;
			lp = &list[size / 2];
		}
		if ((ret = __memp_fget(mpf, &pgno,
		     dbc->thread_info, dbc->txn, 0, &h)) != 0)
			goto err;

		lp->pgno = pgno;
		lp->lsn = LSN(h);
		pgno = NEXT_PGNO(h);
		if ((ret = __memp_fput(mpf,
		    dbc->thread_info, h, dbc->priority)) != 0)
			goto err;

		lp++;
	} while (pgno != PGNO_INVALID);
	nelems = (u_int32_t)(lp - list);

	if ((ret = __memp_dirty(mpf,
	    &meta, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
		goto err;

	/* Log the current state of the free list */
	if (DBC_LOGGING(dbc)) {
		ddbt.data = list;
		ddbt.size = nelems * sizeof(*lp);
		ZERO_LSN(null_lsn);
		if ((ret = __db_pg_sort_log(dbp,
		     dbc->txn, &LSN(meta), DB_FLUSH, PGNO_BASE_MD, &LSN(meta),
		     PGNO_INVALID, &null_lsn, meta->last_pgno, &ddbt)) != 0)
			goto err;
	} else
		LSN_NOT_LOGGED(LSN(meta));

	/* Sort the list and truncate any contiguous run at the file end. */
	if ((ret = __db_pg_truncate(dbc, txn, list, c_data,
	    &nelems, &meta->last_pgno, &LSN(meta), 0)) != 0)
		goto err;

	if (nelems == 0)
		meta->free = PGNO_INVALID;
	else
		meta->free = list[0].pgno;

done:	if (last_pgnop != NULL)
		*last_pgnop = meta->last_pgno;

	/*
	 * The truncate point is the number of pages in the free
	 * list back from the last page.  The number of pages
	 * in the free list are the number that we can swap in.
	 */
	if (c_data)
		c_data->compact_truncate = (u_int32_t)meta->last_pgno - nelems;

	/* Hand ownership of the collected list to the caller on request. */
	if (nelems != 0 && listp != NULL) {
		*listp = list;
		*nelemp = nelems;
		list = NULL;
	}

err:	if (list != NULL)
		__os_free(env, list);
	if (meta != NULL && (t_ret = __memp_fput(mpf,
	    dbc->thread_info, (PAGE *)meta, dbc->priority)) != 0 && ret == 0)
		ret = t_ret;
	if ((t_ret = __TLPUT(dbc, metalock)) != 0 && ret == 0)
		ret = t_ret;
	if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
		ret = t_ret;
	return (ret);
}

/*
 * __db_truncate_freelist --
 *	Truncate the sorted freelist from entry "start" to the end of the
 * file, logging the affected pages first; h is consumed (put) here.
 */
static int
__db_truncate_freelist(dbc, meta, h, list, start, nelem)
	DBC *dbc;
	DBMETA *meta;
	PAGE *h;
	db_pgno_t *list;
	u_int32_t start, nelem;
{
	DB *dbp;
	DBT ddbt;
	DB_LSN null_lsn;
	DB_MPOOLFILE *mpf;
	PAGE *last_free, *pg;
	db_pgno_t *lp;
	db_pglist_t *plist, *pp;
	int ret;

	dbp = dbc->dbp;
	mpf = dbp->mpf;
	plist = NULL;
	last_free = NULL;

	/* The entry before "start" becomes the new tail of the freelist. */
	if (start != 0 &&
	    (ret = __memp_fget(mpf, &list[start - 1],
	    dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &last_free)) != 0)
		goto err;

	if (DBC_LOGGING(dbc)) {
		/* Record the LSN of each page to be truncated, then log. */
		if ((ret = __os_malloc(dbp->env,
		     (nelem - start) * sizeof(*pp), &plist)) != 0)
			goto err;

		pp = plist;
		for (lp = &list[start]; lp < &list[nelem]; lp++) {
			pp->pgno = *lp;
			if ((ret = __memp_fget(mpf, lp,
			    dbc->thread_info, dbc->txn, 0, &pg)) != 0)
				goto err;
			pp->lsn = LSN(pg);
			if ((ret = __memp_fput(mpf,
			    dbc->thread_info, pg, DB_PRIORITY_VERY_LOW)) != 0)
				goto err;
			pp++;
		}
		ddbt.data = plist;
		ddbt.size = (nelem - start) * sizeof(*pp);
		ZERO_LSN(null_lsn);
		/* DB_FLUSH: the log must reach disk before the truncate. */
		if (last_free != NULL) {
			if ((ret = __db_pg_sort_log(dbp, dbc->txn, &LSN(meta),
			     DB_FLUSH, PGNO(meta), &LSN(meta), PGNO(last_free),
			     &LSN(last_free), meta->last_pgno, &ddbt)) != 0)
				goto err;
		} else if ((ret = __db_pg_sort_log(dbp, dbc->txn,
		     &LSN(meta), DB_FLUSH, PGNO(meta), &LSN(meta),
		     PGNO_INVALID, &null_lsn, meta->last_pgno, &ddbt)) != 0)
			goto err;
	} else
		LSN_NOT_LOGGED(LSN(meta));
	if (last_free != NULL)
		LSN(last_free) = LSN(meta);

	if ((ret = __memp_fput(mpf,
	    dbc->thread_info, h, DB_PRIORITY_VERY_LOW)) != 0)
		goto err;
	h = NULL;
	if ((ret =
	    __memp_ftruncate(mpf, dbc->thread_info, list[start], 0)) != 0)
		goto err;
	meta->last_pgno = list[start] - 1;

	if (start == 0)
		meta->free = PGNO_INVALID;
	else {
		/* Terminate the freelist at the new tail page. */
		NEXT_PGNO(last_free) = PGNO_INVALID;
		if ((ret = __memp_fput(mpf,
		    dbc->thread_info, last_free, dbc->priority)) != 0)
			goto err;
		last_free = NULL;
	}

	/* Shrink the number of elements in the list. */
	ret = __memp_extend_freelist(mpf, start, &list);

err:	if (plist != NULL)
		__os_free(dbp->env, plist);

	/* We need to put the page on error. */
	if (h != NULL)
		(void)__memp_fput(mpf, dbc->thread_info, h, dbc->priority);
	if (last_free != NULL)
		(void)__memp_fput(mpf,
		    dbc->thread_info, last_free, dbc->priority);

	return (ret);
}
#endif

#ifdef DEBUG
/*
 * __db_lprint --
 *	Print out the list of locks currently held by a cursor.
 *
 * PUBLIC: int __db_lprint __P((DBC *));
 */
int
__db_lprint(dbc)
	DBC *dbc;
{
	DB *dbp;
	DB_LOCKREQ req;
	ENV *env;

	dbp = dbc->dbp;
	env = dbp->env;

	if (LOCKING_ON(env)) {
		req.op = DB_LOCK_DUMP;
		(void)__lock_vec(env, dbc->locker, 0, &req, 1, NULL);
	}
	return (0);
}
#endif

/*
 * __db_lget --
 *	The standard lock get call.
 *
 * PUBLIC: int __db_lget __P((DBC *,
 * PUBLIC:     int, db_pgno_t, db_lockmode_t, u_int32_t, DB_LOCK *));
 */
int
__db_lget(dbc, action, pgno, mode, lkflags, lockp)
	DBC *dbc;
	int action;
	db_pgno_t pgno;
	db_lockmode_t mode;
	u_int32_t lkflags;
	DB_LOCK *lockp;
{
	DB *dbp;
	DB_LOCKREQ couple[3], *reqp;
	DB_TXN *txn;
	ENV *env;
	int has_timeout, i, ret;

	dbp = dbc->dbp;
	env = dbp->env;
	txn = dbc->txn;

	/*
	 * We do not always check if we're configured for locking before
	 * calling __db_lget to acquire the lock.
	 */
	if (CDB_LOCKING(env) || !LOCKING_ON(env) ||
	    (MULTIVERSION(dbp) && mode == DB_LOCK_READ &&
	    dbc->txn != NULL && F_ISSET(dbc->txn, TXN_SNAPSHOT)) ||
	    F_ISSET(dbc, DBC_DONTLOCK) || (F_ISSET(dbc, DBC_RECOVER) &&
	    (action != LCK_ROLLBACK || IS_REP_CLIENT(env))) ||
	    (action != LCK_ALWAYS && F_ISSET(dbc, DBC_OPD))) {
		/* Locking is not needed: hand back an unset lock. */
		LOCK_INIT(*lockp);
		return (0);
	}

	/* Build the lock object from the cursor's template. */
	dbc->lock.pgno = pgno;
	if (lkflags & DB_LOCK_RECORD)
		dbc->lock.type = DB_RECORD_LOCK;
	else
		dbc->lock.type = DB_PAGE_LOCK;
	lkflags &= ~DB_LOCK_RECORD;

	/*
	 * If the transaction enclosing this cursor has DB_LOCK_NOWAIT set,
	 * pass that along to the lock call.
	 */
	if (DB_NONBLOCK(dbc))
		lkflags |= DB_LOCK_NOWAIT;

	if (F_ISSET(dbc, DBC_READ_UNCOMMITTED) && mode == DB_LOCK_READ)
		mode = DB_LOCK_READ_UNCOMMITTED;

	has_timeout = F_ISSET(dbc, DBC_RECOVER) ||
	    (txn != NULL && F_ISSET(txn, TXN_LOCKTIMEOUT));

	/*
	 * Transactional locking.
	 * Hold on to the previous read lock only if we are in full isolation.
	 * COUPLE_ALWAYS indicates we are holding an interior node which need
	 * not be isolated.
	 * Downgrade write locks if we are supporting dirty readers.
	 */
	if ((action != LCK_COUPLE && action != LCK_COUPLE_ALWAYS) ||
	    !LOCK_ISSET(*lockp))
		action = 0;
	else if (dbc->txn == NULL || action == LCK_COUPLE_ALWAYS)
		action = LCK_COUPLE;
	else if (F_ISSET(dbc,
	    DBC_READ_COMMITTED) && lockp->mode == DB_LOCK_READ)
		action = LCK_COUPLE;
	else if (F_ISSET(dbc,
	    DBC_READ_UNCOMMITTED) && lockp->mode == DB_LOCK_READ_UNCOMMITTED)
		action = LCK_COUPLE;
	else if (F_ISSET(dbc->dbp,
	    DB_AM_READ_UNCOMMITTED) && lockp->mode == DB_LOCK_WRITE)
		action = LCK_DOWNGRADE;
	else
		action = 0;

	/*
	 * Issue the request(s).  LCK_DOWNGRADE adds a WWRITE re-get of the
	 * old lock before the new get; LCK_COUPLE appends a put of the old
	 * lock after the new get, so at most three requests are queued.
	 */
	i = 0;
	switch (action) {
	default:
		if (has_timeout)
			goto do_couple;
		ret = __lock_get(env,
		    dbc->locker, lkflags, &dbc->lock_dbt, mode, lockp);
		break;

	case LCK_DOWNGRADE:
		couple[0].op = DB_LOCK_GET;
		couple[0].obj = NULL;
		couple[0].lock = *lockp;
		couple[0].mode = DB_LOCK_WWRITE;
		UMRW_SET(couple[0].timeout);
		i++;
		/* FALLTHROUGH */
	case LCK_COUPLE:
do_couple:	couple[i].op = has_timeout? DB_LOCK_GET_TIMEOUT : DB_LOCK_GET;
		couple[i].obj = &dbc->lock_dbt;
		couple[i].mode = mode;
		UMRW_SET(couple[i].timeout);
		i++;
		if (has_timeout)
			couple[0].timeout =
			     F_ISSET(dbc, DBC_RECOVER) ? 0 : txn->lock_timeout;
		if (action == LCK_COUPLE || action == LCK_DOWNGRADE) {
			couple[i].op = DB_LOCK_PUT;
			couple[i].lock = *lockp;
			i++;
		}

		ret = __lock_vec(env,
		    dbc->locker, lkflags, couple, i, &reqp);
		/* On success (or failure past the get) return the new lock. */
		if (ret == 0 || reqp == &couple[i - 1])
			*lockp = i == 1 ? couple[0].lock : couple[i - 2].lock;
		break;
	}

	if (txn != NULL && ret == DB_LOCK_DEADLOCK)
		F_SET(txn, TXN_DEADLOCK);
	return ((ret == DB_LOCK_NOTGRANTED && !F_ISSET(env->dbenv,
	    DB_ENV_TIME_NOTGRANTED)) ? DB_LOCK_DEADLOCK : ret);
}

/*
 * __db_lput --
 *	The standard lock put call.
 *
 * PUBLIC: int __db_lput __P((DBC *, DB_LOCK *));
 */
int
__db_lput(dbc, lockp)
	DBC *dbc;
	DB_LOCK *lockp;
{
	DB_LOCKREQ couple[2], *reqp;
	ENV *env;
	int action, ret;

	/*
	 * Transactional locking.
	 * Hold on to the read locks only if we are in full isolation.
	 * Downgrade write locks if we are supporting dirty readers.
	 */
	if (F_ISSET(dbc->dbp,
	    DB_AM_READ_UNCOMMITTED) && lockp->mode == DB_LOCK_WRITE)
		action = LCK_DOWNGRADE;
	else if (dbc->txn == NULL)
		action = LCK_COUPLE;
	else if (F_ISSET(dbc,
	    DBC_READ_COMMITTED) && lockp->mode == DB_LOCK_READ)
		action = LCK_COUPLE;
	else if (F_ISSET(dbc,
	    DBC_READ_UNCOMMITTED) && lockp->mode == DB_LOCK_READ_UNCOMMITTED)
		action = LCK_COUPLE;
	else
		action = 0;

	env = dbc->env;
	switch (action) {
	case LCK_COUPLE:
		ret = __lock_put(env, lockp);
		break;
	case LCK_DOWNGRADE:
		/* Atomically re-get the lock at WWRITE and drop the WRITE. */
		couple[0].op = DB_LOCK_GET;
		couple[0].obj = NULL;
		couple[0].mode = DB_LOCK_WWRITE;
		couple[0].lock = *lockp;
		UMRW_SET(couple[0].timeout);
		couple[1].op = DB_LOCK_PUT;
		couple[1].lock = *lockp;
		ret = __lock_vec(env, dbc->locker, 0, couple, 2, &reqp);
		if (ret == 0 || reqp == &couple[1])
			*lockp = couple[0].lock;
		break;
	default:
		/* Full isolation: keep the lock until transaction end. */
		ret = 0;
		break;
	}

	return (ret);
}