1/*- 2 * See the file LICENSE for redistribution information. 3 * 4 * Copyright (c) 1996,2008 Oracle. All rights reserved. 5 */ 6/* 7 * Copyright (c) 1995, 1996 8 * The President and Fellows of Harvard University. All rights reserved. 9 * 10 * This code is derived from software contributed to Berkeley by 11 * Margo Seltzer. 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 1. Redistributions of source code must retain the above copyright 17 * notice, this list of conditions and the following disclaimer. 18 * 2. Redistributions in binary form must reproduce the above copyright 19 * notice, this list of conditions and the following disclaimer in the 20 * documentation and/or other materials provided with the distribution. 21 * 3. Neither the name of the University nor the names of its contributors 22 * may be used to endorse or promote products derived from this software 23 * without specific prior written permission. 24 * 25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 28 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 35 * SUCH DAMAGE. 36 * 37 * $Id: db_dispatch.c,v 12.45 2008/03/12 20:28:09 mbrey Exp $ 38 */ 39 40#include "db_config.h" 41 42#include "db_int.h" 43#include "dbinc/db_page.h" 44#include "dbinc/hash.h" 45#include "dbinc/fop.h" 46#include "dbinc/lock.h" 47#include "dbinc/log.h" 48#include "dbinc/mp.h" 49#include "dbinc/txn.h" 50 51static int __db_txnlist_find_internal __P((ENV *, DB_TXNHEAD *, 52 db_txnlist_type, u_int32_t, DB_TXNLIST **, 53 int, u_int32_t *)); 54 55/* 56 * __db_dispatch -- 57 * 58 * This is the transaction dispatch function used by the db access methods. 59 * It is designed to handle the record format used by all the access 60 * methods (the one automatically generated by the db_{h,log,read}.sh 61 * scripts in the tools directory). An application using a different 62 * recovery paradigm will supply a different dispatch function to txn_open. 63 * 64 * PUBLIC: int __db_dispatch __P((ENV *, 65 * PUBLIC: DB_DISTAB *, DBT *, DB_LSN *, db_recops, DB_TXNHEAD *)); 66 */ 67int 68__db_dispatch(env, dtab, db, lsnp, redo, info) 69 ENV *env; /* The environment. */ 70 DB_DISTAB *dtab; 71 DBT *db; /* The log record upon which to dispatch. */ 72 DB_LSN *lsnp; /* The lsn of the record being dispatched. */ 73 db_recops redo; /* Redo this op (or undo it). */ 74 DB_TXNHEAD *info; /* Transaction list. */ 75{ 76 DB_ENV *dbenv; 77 DB_LSN prev_lsn; 78 u_int32_t rectype, status, txnid, urectype; 79 int make_call, ret; 80 81 dbenv = env->dbenv; 82 LOGCOPY_32(env, &rectype, db->data); 83 LOGCOPY_32(env, &txnid, (u_int8_t *)db->data + sizeof(rectype)); 84 85 make_call = ret = 0; 86 87 /* If we don't have a dispatch table, it's hard to dispatch. */ 88 DB_ASSERT(env, dtab != NULL); 89 90 /* 91 * If we find a record that is in the user's number space and they 92 * have specified a recovery routine, let them handle it. If they 93 * didn't specify a recovery routine, then we expect that they've 94 * followed all our rules and registered new recovery functions. 95 */ 96 switch (redo) { 97 case DB_TXN_ABORT: 98 case DB_TXN_APPLY: 99 case DB_TXN_PRINT: 100 make_call = 1; 101 break; 102 case DB_TXN_OPENFILES: 103 /* 104 * We collect all the transactions that have 105 * "begin" records, those with no previous LSN, 106 * so that we do not abort partial transactions. 107 * These are known to be undone, otherwise the 108 * log would not have been freeable. 109 */ 110 LOGCOPY_TOLSN(env, &prev_lsn, (u_int8_t *)db->data + 111 sizeof(rectype) + sizeof(txnid)); 112 if (txnid != 0 && prev_lsn.file == 0 && (ret = 113 __db_txnlist_add(env, info, txnid, TXN_OK, NULL)) != 0) 114 return (ret); 115 116 /* FALLTHROUGH */ 117 case DB_TXN_POPENFILES: 118 if (rectype == DB___dbreg_register || 119 rectype == DB___txn_child || 120 rectype == DB___txn_ckp || rectype == DB___txn_recycle) 121 return ((dtab->int_dispatch[rectype])(env, 122 db, lsnp, redo, info)); 123 break; 124 case DB_TXN_BACKWARD_ROLL: 125 /* 126 * Running full recovery in the backward pass. In general, 127 * we only process records during this pass that belong 128 * to aborted transactions. Unfortunately, there are several 129 * exceptions: 130 * 1. If this is a meta-record, one not associated with 131 * a transaction, then we must always process it. 132 * 2. If this is a transaction commit/abort, we must 133 * always process it, so that we know the status of 134 * every transaction. 135 * 3. If this is a child commit, we need to process it 136 * because the outcome of the child transaction depends 137 * on the outcome of the parent. 138 * 4. If this is a dbreg_register record, we must always 139 * process is because they contain non-transactional 140 * closes that must be properly handled. 141 * 5. If this is a noop, we must always undo it so that we 142 * properly handle any aborts before a file was closed. 143 * 6. If this a file remove, we need to process it to 144 * determine if the on-disk file is the same as the 145 * one being described. 146 */ 147 switch (rectype) { 148 /* 149 * These either do not belong to a transaction or (regop) 150 * must be processed regardless of the status of the 151 * transaction. 152 */ 153 case DB___txn_regop: 154 case DB___txn_recycle: 155 case DB___txn_ckp: 156 make_call = 1; 157 break; 158 /* 159 * These belong to a transaction whose status must be 160 * checked. 161 */ 162 case DB___txn_child: 163 case DB___db_noop: 164 case DB___fop_file_remove: 165 case DB___dbreg_register: 166 make_call = 1; 167 168 /* FALLTHROUGH */ 169 default: 170 if (txnid == 0) 171 break; 172 173 ret = __db_txnlist_find(env, info, txnid, &status); 174 175 /* If not found, this is an incomplete abort. */ 176 if (ret == DB_NOTFOUND) 177 return (__db_txnlist_add(env, 178 info, txnid, TXN_IGNORE, lsnp)); 179 if (ret != 0) 180 return (ret); 181 182 /* 183 * If we ignore the transaction, ignore the operation 184 * UNLESS this is a child commit in which case we need 185 * to make sure that the child also gets marked as 186 * ignore. 187 */ 188 if (status == TXN_IGNORE && rectype != DB___txn_child) { 189 make_call = 0; 190 break; 191 } 192 if (status == TXN_COMMIT) 193 break; 194 195 /* Set make_call in case we came through default */ 196 make_call = 1; 197 if (status == TXN_OK && 198 (ret = __db_txnlist_update(env, 199 info, txnid, rectype == DB___txn_xa_regop ? 200 TXN_PREPARE : TXN_ABORT, NULL, &status, 0)) != 0) 201 return (ret); 202 } 203 break; 204 case DB_TXN_FORWARD_ROLL: 205 /* 206 * In the forward pass, if we haven't seen the transaction, 207 * do nothing, else recover it. 208 * 209 * We need to always redo DB___db_noop records, so that we 210 * properly handle any commits after the file was closed. 211 */ 212 switch (rectype) { 213 case DB___txn_recycle: 214 case DB___txn_ckp: 215 case DB___db_noop: 216 case DB___dbreg_register: 217 make_call = 1; 218 break; 219 220 default: 221 if (txnid == 0) 222 status = 0; 223 else { 224 ret = __db_txnlist_find(env, 225 info, txnid, &status); 226 227 if (ret == DB_NOTFOUND) 228 /* Break out out of if clause. */ 229 ; 230 else if (ret != 0) 231 return (ret); 232 else if (status == TXN_COMMIT) { 233 make_call = 1; 234 break; 235 } 236 } 237 238 } 239 break; 240 default: 241 return (__db_unknown_flag( 242 env, "__db_dispatch", (u_int32_t)redo)); 243 } 244 245 if (make_call) { 246 /* 247 * If the debug flag is set then we are logging 248 * records for a non-durable update so that they 249 * may be examined for diagnostic purposes. 250 * So only make the call if we are printing, 251 * otherwise we need to extract the previous 252 * lsn so undo will work properly. 253 */ 254 if (rectype & DB_debug_FLAG) { 255 if (redo == DB_TXN_PRINT) 256 rectype &= ~DB_debug_FLAG; 257 else { 258 LOGCOPY_TOLSN(env, lsnp, 259 (u_int8_t *)db->data + 260 sizeof(rectype) + 261 sizeof(txnid)); 262 return (0); 263 } 264 } 265 if (rectype >= DB_user_BEGIN) { 266 if (dbenv->app_dispatch != NULL) 267 return (dbenv->app_dispatch(dbenv, 268 db, lsnp, redo)); 269 270 /* No application-specific dispatch */ 271 urectype = rectype - DB_user_BEGIN; 272 if (urectype > dtab->ext_size || 273 dtab->ext_dispatch[urectype] == NULL) { 274 __db_errx(env, 275 "Illegal application-specific record type %lu in log", 276 (u_long)rectype); 277 return (EINVAL); 278 } 279 return ((dtab->ext_dispatch[urectype])(dbenv, 280 db, lsnp, redo)); 281 } else { 282 if (rectype > dtab->int_size || 283 dtab->int_dispatch[rectype] == NULL) { 284 __db_errx(env, 285 "Illegal record type %lu in log", 286 (u_long)rectype); 287 return (EINVAL); 288 } 289 return ((dtab->int_dispatch[rectype])(env, 290 db, lsnp, redo, info)); 291 } 292 } 293 294 return (0); 295} 296 297/* 298 * __db_add_recovery -- Add recovery functions to the dispatch table. 299 * 300 * We have two versions of this, an external one and an internal one, 301 * because application-specific functions take different arguments 302 * for dispatch (ENV versus DB_ENV). 303 * 304 * This is the external version. 305 * 306 * PUBLIC: int __db_add_recovery __P((DB_ENV *, DB_DISTAB *, 307 * PUBLIC: int (*)(DB_ENV *, DBT *, DB_LSN *, db_recops), u_int32_t)); 308 */ 309int 310__db_add_recovery(dbenv, dtab, func, ndx) 311 DB_ENV *dbenv; 312 DB_DISTAB *dtab; 313 int (*func) __P((DB_ENV *, DBT *, DB_LSN *, db_recops)); 314 u_int32_t ndx; 315{ 316 size_t i, nsize; 317 int ret; 318 319 /* Make sure this is an application-specific record. */ 320 if (ndx < DB_user_BEGIN) { 321 __db_errx(dbenv->env, 322 "Attempting to add application-specific record with invalid type %lu", 323 (u_long)ndx); 324 return (EINVAL); 325 } 326 ndx -= DB_user_BEGIN; 327 328 /* Check if we have to grow the table. */ 329 if (ndx >= dtab->ext_size) { 330 nsize = ndx + 40; 331 if ((ret = 332 __os_realloc(dbenv->env, nsize * 333 sizeof((dtab->ext_dispatch)[0]), &dtab->ext_dispatch)) 334 != 0) 335 return (ret); 336 for (i = dtab->ext_size; i < nsize; ++i) 337 (dtab->ext_dispatch)[i] = NULL; 338 dtab->ext_size = nsize; 339 } 340 341 (dtab->ext_dispatch)[ndx] = func; 342 return (0); 343} 344 345/* 346 * __db_add_recovery_int -- 347 * 348 * Internal version of dispatch addition function. 349 * 350 * 351 * PUBLIC: int __db_add_recovery_int __P((ENV *, DB_DISTAB *, 352 * PUBLIC: int (*)(ENV *, DBT *, DB_LSN *, db_recops, void *), u_int32_t)); 353 */ 354int 355__db_add_recovery_int(env, dtab, func, ndx) 356 ENV *env; 357 DB_DISTAB *dtab; 358 int (*func) __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); 359 u_int32_t ndx; 360{ 361 size_t i, nsize; 362 int ret; 363 364 if (ndx >= DB_user_BEGIN) { 365 __db_errx(env, 366 "Attempting to add internal record with invalid type %lu", 367 (u_long)ndx); 368 return (EINVAL); 369 } 370 371 /* Check if we have to grow the table. */ 372 if (ndx >= dtab->int_size) { 373 nsize = ndx + 40; 374 if ((ret = 375 __os_realloc(env, nsize * sizeof((dtab->int_dispatch)[0]), 376 &dtab->int_dispatch)) != 0) 377 return (ret); 378 for (i = dtab->int_size; i < nsize; ++i) 379 (dtab->int_dispatch)[i] = NULL; 380 dtab->int_size = nsize; 381 } 382 383 (dtab->int_dispatch)[ndx] = func; 384 return (0); 385} 386 387/* 388 * __db_txnlist_init -- 389 * Initialize transaction linked list. 390 * 391 * PUBLIC: int __db_txnlist_init __P((ENV *, DB_THREAD_INFO *, 392 * PUBLIC: u_int32_t, u_int32_t, DB_LSN *, DB_TXNHEAD **)); 393 */ 394int 395__db_txnlist_init(env, ip, low_txn, hi_txn, trunc_lsn, retp) 396 ENV *env; 397 DB_THREAD_INFO *ip; 398 u_int32_t low_txn, hi_txn; 399 DB_LSN *trunc_lsn; 400 DB_TXNHEAD **retp; 401{ 402 DB_TXNHEAD *headp; 403 u_int32_t size, tmp; 404 int ret; 405 406 /* 407 * Size a hash table. 408 * If low is zero then we are being called during rollback 409 * and we need only one slot. 410 * Hi maybe lower than low if we have recycled txnid's. 411 * The numbers here are guesses about txn density, we can afford 412 * to look at a few entries in each slot. 413 */ 414 if (low_txn == 0) 415 size = 1; 416 else { 417 if (hi_txn < low_txn) { 418 tmp = hi_txn; 419 hi_txn = low_txn; 420 low_txn = tmp; 421 } 422 tmp = hi_txn - low_txn; 423 /* See if we wrapped around. */ 424 if (tmp > (TXN_MAXIMUM - TXN_MINIMUM) / 2) 425 tmp = (low_txn - TXN_MINIMUM) + (TXN_MAXIMUM - hi_txn); 426 size = tmp / 5; 427 if (size < 100) 428 size = 100; 429 } 430 if ((ret = __os_malloc(env, 431 sizeof(DB_TXNHEAD) + size * sizeof(headp->head), &headp)) != 0) 432 return (ret); 433 434 memset(headp, 0, sizeof(DB_TXNHEAD) + size * sizeof(headp->head)); 435 headp->maxid = hi_txn; 436 headp->generation = 0; 437 headp->nslots = size; 438 headp->gen_alloc = 8; 439 headp->thread_info = ip; 440 if ((ret = __os_malloc(env, headp->gen_alloc * 441 sizeof(headp->gen_array[0]), &headp->gen_array)) != 0) { 442 __os_free(env, headp); 443 return (ret); 444 } 445 headp->gen_array[0].generation = 0; 446 headp->gen_array[0].txn_min = TXN_MINIMUM; 447 headp->gen_array[0].txn_max = TXN_MAXIMUM; 448 if (trunc_lsn != NULL) { 449 headp->trunc_lsn = *trunc_lsn; 450 headp->maxlsn = *trunc_lsn; 451 } else { 452 ZERO_LSN(headp->trunc_lsn); 453 ZERO_LSN(headp->maxlsn); 454 } 455 ZERO_LSN(headp->ckplsn); 456 457 *retp = headp; 458 return (0); 459} 460 461#define FIND_GENERATION(hp, txnid, gen) do { \ 462 u_int32_t __i; \ 463 for (__i = 0; __i <= (hp)->generation; __i++) \ 464 /* The range may wrap around the end. */ \ 465 if ((hp)->gen_array[__i].txn_min < \ 466 (hp)->gen_array[__i].txn_max ? \ 467 ((txnid) >= (hp)->gen_array[__i].txn_min && \ 468 (txnid) <= (hp)->gen_array[__i].txn_max) : \ 469 ((txnid) >= (hp)->gen_array[__i].txn_min || \ 470 (txnid) <= (hp)->gen_array[__i].txn_max)) \ 471 break; \ 472 DB_ASSERT(env, __i <= (hp)->generation); \ 473 gen = (hp)->gen_array[__i].generation; \ 474} while (0) 475 476/* 477 * __db_txnlist_add -- 478 * Add an element to our transaction linked list. 479 * 480 * PUBLIC: int __db_txnlist_add __P((ENV *, 481 * PUBLIC: DB_TXNHEAD *, u_int32_t, u_int32_t, DB_LSN *)); 482 */ 483int 484__db_txnlist_add(env, hp, txnid, status, lsn) 485 ENV *env; 486 DB_TXNHEAD *hp; 487 u_int32_t txnid, status; 488 DB_LSN *lsn; 489{ 490 DB_TXNLIST *elp; 491 int ret; 492 493 if ((ret = __os_malloc(env, sizeof(DB_TXNLIST), &elp)) != 0) 494 return (ret); 495 496 LIST_INSERT_HEAD(&hp->head[DB_TXNLIST_MASK(hp, txnid)], elp, links); 497 498 /* Find the most recent generation containing this ID */ 499 FIND_GENERATION(hp, txnid, elp->u.t.generation); 500 elp->type = TXNLIST_TXNID; 501 elp->u.t.txnid = txnid; 502 elp->u.t.status = status; 503 if (txnid > hp->maxid) 504 hp->maxid = txnid; 505 if (lsn != NULL && IS_ZERO_LSN(hp->maxlsn) && status == TXN_COMMIT) 506 hp->maxlsn = *lsn; 507 508 DB_ASSERT(env, lsn == NULL || 509 status != TXN_COMMIT || LOG_COMPARE(&hp->maxlsn, lsn) >= 0); 510 511 return (0); 512} 513 514/* 515 * __db_txnlist_remove -- 516 * Remove an element from our transaction linked list. 517 * 518 * PUBLIC: int __db_txnlist_remove __P((ENV *, DB_TXNHEAD *, u_int32_t)); 519 */ 520int 521__db_txnlist_remove(env, hp, txnid) 522 ENV *env; 523 DB_TXNHEAD *hp; 524 u_int32_t txnid; 525{ 526 DB_TXNLIST *entry; 527 u_int32_t status; 528 529 return (__db_txnlist_find_internal(env, 530 hp, TXNLIST_TXNID, txnid, &entry, 1, &status)); 531} 532 533/* 534 * __db_txnlist_ckp -- 535 * Used to record the maximum checkpoint that will be retained 536 * after recovery. Typically this is simply the max checkpoint, but 537 * if we are doing client replication recovery or timestamp-based 538 * recovery, we are going to virtually truncate the log and we need 539 * to retain the last checkpoint before the truncation point. 540 * 541 * PUBLIC: void __db_txnlist_ckp __P((ENV *, DB_TXNHEAD *, DB_LSN *)); 542 */ 543void 544__db_txnlist_ckp(env, hp, ckp_lsn) 545 ENV *env; 546 DB_TXNHEAD *hp; 547 DB_LSN *ckp_lsn; 548{ 549 550 COMPQUIET(env, NULL); 551 552 if (IS_ZERO_LSN(hp->ckplsn) && !IS_ZERO_LSN(hp->maxlsn) && 553 LOG_COMPARE(&hp->maxlsn, ckp_lsn) >= 0) 554 hp->ckplsn = *ckp_lsn; 555} 556 557/* 558 * __db_txnlist_end -- 559 * Discard transaction linked list. 560 * 561 * PUBLIC: void __db_txnlist_end __P((ENV *, DB_TXNHEAD *)); 562 */ 563void 564__db_txnlist_end(env, hp) 565 ENV *env; 566 DB_TXNHEAD *hp; 567{ 568 u_int32_t i; 569 DB_TXNLIST *p; 570 571 if (hp == NULL) 572 return; 573 574 for (i = 0; i < hp->nslots; i++) 575 while (hp != NULL && (p = LIST_FIRST(&hp->head[i])) != NULL) { 576 switch (p->type) { 577 case TXNLIST_LSN: 578 __os_free(env, p->u.l.lsn_stack); 579 break; 580 case TXNLIST_DELETE: 581 case TXNLIST_TXNID: 582 default: 583 /* 584 * Possibly an incomplete DB_TXNLIST; just 585 * free it. 586 */ 587 break; 588 } 589 LIST_REMOVE(p, links); 590 __os_free(env, p); 591 } 592 593 if (hp->gen_array != NULL) 594 __os_free(env, hp->gen_array); 595 __os_free(env, hp); 596} 597 598/* 599 * __db_txnlist_find -- 600 * Checks to see if a txnid with the current generation is in the 601 * txnid list. This returns DB_NOTFOUND if the item isn't in the 602 * list otherwise it returns (like __db_txnlist_find_internal) 603 * the status of the transaction. A txnid of 0 means the record 604 * was generated while not in a transaction. 605 * 606 * PUBLIC: int __db_txnlist_find __P((ENV *, 607 * PUBLIC: DB_TXNHEAD *, u_int32_t, u_int32_t *)); 608 */ 609int 610__db_txnlist_find(env, hp, txnid, statusp) 611 ENV *env; 612 DB_TXNHEAD *hp; 613 u_int32_t txnid, *statusp; 614{ 615 DB_TXNLIST *entry; 616 617 if (txnid == 0) 618 return (DB_NOTFOUND); 619 620 return (__db_txnlist_find_internal(env, hp, 621 TXNLIST_TXNID, txnid, &entry, 0, statusp)); 622} 623 624/* 625 * __db_txnlist_update -- 626 * Change the status of an existing transaction entry. 627 * Returns DB_NOTFOUND if no such entry exists. 628 * 629 * PUBLIC: int __db_txnlist_update __P((ENV *, DB_TXNHEAD *, 630 * PUBLIC: u_int32_t, u_int32_t, DB_LSN *, u_int32_t *, int)); 631 */ 632int 633__db_txnlist_update(env, hp, txnid, status, lsn, ret_status, add_ok) 634 ENV *env; 635 DB_TXNHEAD *hp; 636 u_int32_t txnid, status; 637 DB_LSN *lsn; 638 u_int32_t *ret_status; 639 int add_ok; 640{ 641 DB_TXNLIST *elp; 642 int ret; 643 644 if (txnid == 0) 645 return (DB_NOTFOUND); 646 647 ret = __db_txnlist_find_internal(env, 648 hp, TXNLIST_TXNID, txnid, &elp, 0, ret_status); 649 650 if (ret == DB_NOTFOUND && add_ok) { 651 *ret_status = status; 652 return (__db_txnlist_add(env, hp, txnid, status, lsn)); 653 } 654 if (ret != 0) 655 return (ret); 656 657 if (*ret_status == TXN_IGNORE) 658 return (0); 659 660 elp->u.t.status = status; 661 662 if (lsn != NULL && IS_ZERO_LSN(hp->maxlsn) && status == TXN_COMMIT) 663 hp->maxlsn = *lsn; 664 665 return (ret); 666} 667 668/* 669 * __db_txnlist_find_internal -- 670 * Find an entry on the transaction list. If the entry is not there or 671 * the list pointer is not initialized we return DB_NOTFOUND. If the 672 * item is found, we return the status. Currently we always call this 673 * with an initialized list pointer but checking for NULL keeps it general. 674 */ 675static int 676__db_txnlist_find_internal(env, 677 hp, type, txnid, txnlistp, delete, statusp) 678 ENV *env; 679 DB_TXNHEAD *hp; 680 db_txnlist_type type; 681 u_int32_t txnid; 682 DB_TXNLIST **txnlistp; 683 int delete; 684 u_int32_t *statusp; 685{ 686 struct __db_headlink *head; 687 DB_TXNLIST *p; 688 u_int32_t generation, hash; 689 int ret; 690 691 ret = 0; 692 693 if (hp == NULL) 694 return (DB_NOTFOUND); 695 696 switch (type) { 697 case TXNLIST_TXNID: 698 hash = txnid; 699 FIND_GENERATION(hp, txnid, generation); 700 break; 701 case TXNLIST_DELETE: 702 case TXNLIST_LSN: 703 default: 704 return (__env_panic(env, EINVAL)); 705 } 706 707 head = &hp->head[DB_TXNLIST_MASK(hp, hash)]; 708 LIST_FOREACH(p, head, links) { 709 if (p->type != type) 710 continue; 711 switch (type) { 712 case TXNLIST_TXNID: 713 if (p->u.t.txnid != txnid || 714 generation != p->u.t.generation) 715 continue; 716 *statusp = p->u.t.status; 717 break; 718 719 case TXNLIST_DELETE: 720 case TXNLIST_LSN: 721 default: 722 return (__env_panic(env, EINVAL)); 723 } 724 if (delete == 1) { 725 LIST_REMOVE(p, links); 726 __os_free(env, p); 727 *txnlistp = NULL; 728 } else if (p != LIST_FIRST(head)) { 729 /* Move it to head of list. */ 730 LIST_REMOVE(p, links); 731 LIST_INSERT_HEAD(head, p, links); 732 *txnlistp = p; 733 } else 734 *txnlistp = p; 735 return (ret); 736 } 737 738 return (DB_NOTFOUND); 739} 740 741/* 742 * __db_txnlist_gen -- 743 * Change the current generation number. 744 * 745 * PUBLIC: int __db_txnlist_gen __P((ENV *, 746 * PUBLIC: DB_TXNHEAD *, int, u_int32_t, u_int32_t)); 747 */ 748int 749__db_txnlist_gen(env, hp, incr, min, max) 750 ENV *env; 751 DB_TXNHEAD *hp; 752 int incr; 753 u_int32_t min, max; 754{ 755 int ret; 756 757 /* 758 * During recovery generation numbers keep track of "restart" 759 * checkpoints and recycle records. Restart checkpoints occur 760 * whenever we take a checkpoint and there are no outstanding 761 * transactions. When that happens, we can reset transaction IDs 762 * back to TXNID_MINIMUM. Currently we only do the reset 763 * at then end of recovery. Recycle records occur when txnids 764 * are exhausted during runtime. A free range of ids is identified 765 * and logged. This code maintains a stack of ranges. A txnid 766 * is given the generation number of the first range it falls into 767 * in the stack. 768 */ 769 if (incr < 0) { 770 --hp->generation; 771 memmove(hp->gen_array, &hp->gen_array[1], 772 (hp->generation + 1) * sizeof(hp->gen_array[0])); 773 } else { 774 ++hp->generation; 775 if (hp->generation >= hp->gen_alloc) { 776 hp->gen_alloc *= 2; 777 if ((ret = __os_realloc(env, hp->gen_alloc * 778 sizeof(hp->gen_array[0]), &hp->gen_array)) != 0) 779 return (ret); 780 } 781 memmove(&hp->gen_array[1], &hp->gen_array[0], 782 hp->generation * sizeof(hp->gen_array[0])); 783 hp->gen_array[0].generation = hp->generation; 784 hp->gen_array[0].txn_min = min; 785 hp->gen_array[0].txn_max = max; 786 } 787 return (0); 788} 789 790/* 791 * __db_txnlist_lsnadd -- 792 * Save the prev_lsn from a txn_child record. 793 * 794 * PUBLIC: int __db_txnlist_lsnadd __P((ENV *, DB_TXNHEAD *, DB_LSN *)); 795 */ 796int 797__db_txnlist_lsnadd(env, hp, lsnp) 798 ENV *env; 799 DB_TXNHEAD *hp; 800 DB_LSN *lsnp; 801{ 802 DB_TXNLIST *elp; 803 int ret; 804 805 if (IS_ZERO_LSN(*lsnp)) 806 return (0); 807 808 LIST_FOREACH(elp, &hp->head[0], links) 809 if (elp->type == TXNLIST_LSN) 810 break; 811 812 if (elp == NULL) { 813 if ((ret = __db_txnlist_lsninit(env, hp, lsnp)) != 0) 814 return (ret); 815 return (DB_SURPRISE_KID); 816 } 817 818 if (elp->u.l.stack_indx == elp->u.l.stack_size) { 819 elp->u.l.stack_size <<= 1; 820 if ((ret = __os_realloc(env, sizeof(DB_LSN) * 821 elp->u.l.stack_size, &elp->u.l.lsn_stack)) != 0) { 822 __db_txnlist_end(env, hp); 823 return (ret); 824 } 825 } 826 elp->u.l.lsn_stack[elp->u.l.stack_indx++] = *lsnp; 827 828 return (0); 829} 830 831/* 832 * __db_txnlist_lsnget -- 833 * 834 * PUBLIC: int __db_txnlist_lsnget __P((ENV *, 835 * PUBLIC: DB_TXNHEAD *, DB_LSN *, u_int32_t)); 836 * Get the lsn saved from a txn_child record. 837 */ 838int 839__db_txnlist_lsnget(env, hp, lsnp, flags) 840 ENV *env; 841 DB_TXNHEAD *hp; 842 DB_LSN *lsnp; 843 u_int32_t flags; 844{ 845 DB_TXNLIST *elp; 846 847 COMPQUIET(env, NULL); 848 COMPQUIET(flags, 0); 849 850 LIST_FOREACH(elp, &hp->head[0], links) 851 if (elp->type == TXNLIST_LSN) 852 break; 853 854 if (elp == NULL || elp->u.l.stack_indx == 0) { 855 ZERO_LSN(*lsnp); 856 return (0); 857 } 858 859 *lsnp = elp->u.l.lsn_stack[--elp->u.l.stack_indx]; 860 861 return (0); 862} 863 864/* 865 * __db_txnlist_lsninit -- 866 * Initialize a transaction list with an lsn array entry. 867 * 868 * PUBLIC: int __db_txnlist_lsninit __P((ENV *, DB_TXNHEAD *, DB_LSN *)); 869 */ 870int 871__db_txnlist_lsninit(env, hp, lsnp) 872 ENV *env; 873 DB_TXNHEAD *hp; 874 DB_LSN *lsnp; 875{ 876 DB_TXNLIST *elp; 877 int ret; 878 879 elp = NULL; 880 881 if ((ret = __os_malloc(env, sizeof(DB_TXNLIST), &elp)) != 0) 882 goto err; 883 LIST_INSERT_HEAD(&hp->head[0], elp, links); 884 885 elp->type = TXNLIST_LSN; 886 if ((ret = __os_malloc(env, 887 sizeof(DB_LSN) * DB_LSN_STACK_SIZE, &elp->u.l.lsn_stack)) != 0) 888 goto err; 889 elp->u.l.stack_indx = 1; 890 elp->u.l.stack_size = DB_LSN_STACK_SIZE; 891 elp->u.l.lsn_stack[0] = *lsnp; 892 893 return (0); 894 895err: __db_txnlist_end(env, hp); 896 return (ret); 897} 898 899#ifdef DEBUG 900/* 901 * __db_txnlist_print -- 902 * Print out the transaction list. 903 * 904 * PUBLIC: void __db_txnlist_print __P((DB_TXNHEAD *)); 905 */ 906void 907__db_txnlist_print(hp) 908 DB_TXNHEAD *hp; 909{ 910 DB_TXNLIST *p; 911 u_int32_t i; 912 char *txntype; 913 914 printf("Maxid: %lu Generation: %lu\n", 915 (u_long)hp->maxid, (u_long)hp->generation); 916 for (i = 0; i < hp->nslots; i++) 917 LIST_FOREACH(p, &hp->head[i], links) { 918 if (p->type != TXNLIST_TXNID) { 919 printf("Unrecognized type: %d\n", p->type); 920 continue; 921 } 922 switch (p->u.t.status) { 923 case TXN_OK: 924 txntype = "OK"; 925 break; 926 case TXN_COMMIT: 927 txntype = "commit"; 928 break; 929 case TXN_PREPARE: 930 txntype = "prepare"; 931 break; 932 case TXN_ABORT: 933 txntype = "abort"; 934 break; 935 case TXN_IGNORE: 936 txntype = "ignore"; 937 break; 938 case TXN_EXPECTED: 939 txntype = "expected"; 940 break; 941 case TXN_UNEXPECTED: 942 txntype = "unexpected"; 943 break; 944 default: 945 txntype = "UNKNOWN"; 946 break; 947 } 948 printf("TXNID: %lx(%lu): %s\n", 949 (u_long)p->u.t.txnid, 950 (u_long)p->u.t.generation, txntype); 951 } 952} 953#endif 954