/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1996,2008 Oracle.  All rights reserved.
 */
/*
 * Copyright (c) 1995, 1996
 *	The President and Fellows of Harvard University.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Margo Seltzer.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
36 * 37 * $Id: txn.c,v 12.89 2008/04/19 15:47:42 mjc Exp $ 38 */ 39 40#include "db_config.h" 41 42#include "db_int.h" 43#include "dbinc/crypto.h" 44#include "dbinc/hmac.h" 45#include "dbinc/db_page.h" 46#include "dbinc/hash.h" 47#include "dbinc/lock.h" 48#include "dbinc/log.h" 49#include "dbinc/mp.h" 50#include "dbinc/txn.h" 51 52#define LOG_FLAGS(txn) \ 53 (DB_LOG_COMMIT | (F_ISSET(txn, TXN_SYNC) ? \ 54 DB_FLUSH : (F_ISSET(txn, TXN_WRITE_NOSYNC) ? \ 55 DB_LOG_WRNOSYNC : 0))) 56 57/* 58 * __txn_isvalid enumerated types. We cannot simply use the transaction 59 * statuses, because different statuses need to be handled differently 60 * depending on the caller. 61 */ 62typedef enum { 63 TXN_OP_ABORT, 64 TXN_OP_COMMIT, 65 TXN_OP_DISCARD, 66 TXN_OP_PREPARE 67} txnop_t; 68 69static int __txn_abort_pp __P((DB_TXN *)); 70static int __txn_begin_int __P((DB_TXN *)); 71static int __txn_commit_pp __P((DB_TXN *, u_int32_t)); 72static int __txn_discard __P((DB_TXN *, u_int32_t)); 73static int __txn_dispatch_undo 74 __P((ENV *, DB_TXN *, DBT *, DB_LSN *, DB_TXNHEAD *)); 75static int __txn_end __P((DB_TXN *, int)); 76static int __txn_isvalid __P((const DB_TXN *, txnop_t)); 77static int __txn_undo __P((DB_TXN *)); 78static void __txn_set_txn_lsnp __P((DB_TXN *, DB_LSN **, DB_LSN **)); 79 80/* 81 * __txn_begin_pp -- 82 * ENV->txn_begin pre/post processing. 
83 * 84 * PUBLIC: int __txn_begin_pp __P((DB_ENV *, DB_TXN *, DB_TXN **, u_int32_t)); 85 */ 86int 87__txn_begin_pp(dbenv, parent, txnpp, flags) 88 DB_ENV *dbenv; 89 DB_TXN *parent, **txnpp; 90 u_int32_t flags; 91{ 92 DB_THREAD_INFO *ip; 93 ENV *env; 94 int rep_check, ret; 95 96 env = dbenv->env; 97 98 ENV_REQUIRES_CONFIG(env, env->tx_handle, "txn_begin", DB_INIT_TXN); 99 100 if ((ret = __db_fchk(env, 101 "txn_begin", flags, 102 DB_READ_COMMITTED | DB_READ_UNCOMMITTED | 103 DB_TXN_NOSYNC | DB_TXN_SNAPSHOT | DB_TXN_SYNC | 104 DB_TXN_WAIT | DB_TXN_WRITE_NOSYNC | DB_TXN_NOWAIT)) != 0) 105 return (ret); 106 if ((ret = __db_fcchk(env, "txn_begin", flags, 107 DB_TXN_WRITE_NOSYNC | DB_TXN_NOSYNC, DB_TXN_SYNC)) != 0) 108 return (ret); 109 if ((ret = __db_fcchk(env, "txn_begin", 110 flags, DB_TXN_WRITE_NOSYNC, DB_TXN_NOSYNC)) != 0) 111 return (ret); 112 if (parent != NULL && !F_ISSET(parent, TXN_SNAPSHOT) && 113 LF_ISSET(DB_TXN_SNAPSHOT)) { 114 __db_errx(env, 115 "Child transaction snapshot setting must match parent"); 116 return (EINVAL); 117 } 118 119 ENV_ENTER(env, ip); 120 121 if (parent == NULL) { 122 rep_check = IS_ENV_REPLICATED(env) ? 1 : 0; 123 if (rep_check && (ret = __op_rep_enter(env)) != 0) 124 goto err; 125 } else 126 rep_check = 0; 127 ret = __txn_begin(env, ip, parent, txnpp, flags); 128 /* 129 * We only decrement the count if the operation fails. 130 * Otherwise the count will be decremented when the 131 * txn is resolved by txn_commit, txn_abort, etc. 132 */ 133 if (ret != 0 && rep_check) 134 (void)__op_rep_exit(env); 135 136err: ENV_LEAVE(env, ip); 137 return (ret); 138} 139 140/* 141 * __txn_begin -- 142 * ENV->txn_begin. 143 * 144 * This is a wrapper to the actual begin process. Normal transaction begin 145 * allocates a DB_TXN structure for the caller, while XA transaction begin 146 * does not. Other than that, both call into common __txn_begin_int code. 
147 * 148 * Internally, we use TXN_DETAIL structures, but the DB_TXN structure 149 * provides access to the transaction ID and the offset in the transaction 150 * region of the TXN_DETAIL structure. 151 * 152 * PUBLIC: int __txn_begin __P((ENV *, 153 * PUBLIC: DB_THREAD_INFO *, DB_TXN *, DB_TXN **, u_int32_t)); 154 */ 155int 156__txn_begin(env, ip, parent, txnpp, flags) 157 ENV *env; 158 DB_THREAD_INFO *ip; 159 DB_TXN *parent, **txnpp; 160 u_int32_t flags; 161{ 162 DB_ENV *dbenv; 163 DB_LOCKREGION *region; 164 DB_TXN *txn; 165 TXN_DETAIL *ptd, *td; 166 int ret; 167 168 *txnpp = NULL; 169 if ((ret = __os_calloc(env, 1, sizeof(DB_TXN), &txn)) != 0) 170 return (ret); 171 172 dbenv = env->dbenv; 173 txn->mgrp = env->tx_handle; 174 txn->parent = parent; 175 TAILQ_INIT(&txn->kids); 176 TAILQ_INIT(&txn->events); 177 STAILQ_INIT(&txn->logs); 178 txn->flags = TXN_MALLOC; 179 txn->thread_info = 180 ip != NULL ? ip : (parent != NULL ? parent->thread_info : NULL); 181 182 /* 183 * Set the sync mode for commit. Any local bits override those 184 * in the environment. SYNC is the default. 
185 */ 186 if (LF_ISSET(DB_TXN_SYNC)) 187 F_SET(txn, TXN_SYNC); 188 else if (LF_ISSET(DB_TXN_NOSYNC)) 189 F_SET(txn, TXN_NOSYNC); 190 else if (LF_ISSET(DB_TXN_WRITE_NOSYNC)) 191 F_SET(txn, TXN_WRITE_NOSYNC); 192 else if (F_ISSET(dbenv, DB_ENV_TXN_NOSYNC)) 193 F_SET(txn, TXN_NOSYNC); 194 else if (F_ISSET(dbenv, DB_ENV_TXN_WRITE_NOSYNC)) 195 F_SET(txn, TXN_WRITE_NOSYNC); 196 else 197 F_SET(txn, TXN_SYNC); 198 199 if (LF_ISSET(DB_TXN_NOWAIT) || 200 (F_ISSET(dbenv, DB_ENV_TXN_NOWAIT) && !LF_ISSET(DB_TXN_WAIT))) 201 F_SET(txn, TXN_NOWAIT); 202 if (LF_ISSET(DB_READ_COMMITTED)) 203 F_SET(txn, TXN_READ_COMMITTED); 204 if (LF_ISSET(DB_READ_UNCOMMITTED)) 205 F_SET(txn, TXN_READ_UNCOMMITTED); 206 if (LF_ISSET(DB_TXN_SNAPSHOT) || F_ISSET(dbenv, DB_ENV_TXN_SNAPSHOT) || 207 (parent != NULL && F_ISSET(parent, TXN_SNAPSHOT))) 208 F_SET(txn, TXN_SNAPSHOT); 209 210 if ((ret = __txn_begin_int(txn)) != 0) 211 goto err; 212 td = txn->td; 213 214 if (parent != NULL) { 215 ptd = parent->td; 216 TAILQ_INSERT_HEAD(&parent->kids, txn, klinks); 217 SH_TAILQ_INSERT_HEAD(&ptd->kids, td, klinks, __txn_detail); 218 } 219 220 if (LOCKING_ON(env)) { 221 region = env->lk_handle->reginfo.primary; 222 if (parent != NULL) { 223 ret = __lock_inherit_timeout(env, 224 parent->locker, txn->locker); 225 /* No parent locker set yet. */ 226 if (ret == EINVAL) { 227 parent = NULL; 228 ret = 0; 229 } 230 if (ret != 0) 231 goto err; 232 } 233 234 /* 235 * Parent is NULL if we have no parent 236 * or it has no timeouts set. 237 */ 238 if (parent == NULL && region->tx_timeout != 0) 239 if ((ret = __lock_set_timeout(env, txn->locker, 240 region->tx_timeout, DB_SET_TXN_TIMEOUT)) != 0) 241 goto err; 242 } 243 244 *txnpp = txn; 245 return (0); 246 247err: 248 __os_free(env, txn); 249 return (ret); 250} 251 252/* 253 * __txn_xa_begin -- 254 * XA version of txn_begin. 
255 * 256 * PUBLIC: int __txn_xa_begin __P((ENV *, DB_TXN *)); 257 */ 258int 259__txn_xa_begin(env, txn) 260 ENV *env; 261 DB_TXN *txn; 262{ 263 /* 264 * We need to initialize the transaction structure, but must be careful 265 * not to smash the links. We manually initialize the structure. 266 */ 267 txn->mgrp = env->tx_handle; 268 TAILQ_INIT(&txn->kids); 269 TAILQ_INIT(&txn->events); 270 STAILQ_INIT(&txn->logs); 271 txn->parent = NULL; 272 txn->txnid = TXN_INVALID; 273 txn->cursors = 0; 274 memset(&txn->lock_timeout, 0, sizeof(db_timeout_t)); 275 memset(&txn->expire, 0, sizeof(db_timeout_t)); 276 277 return (__txn_begin_int(txn)); 278} 279 280/* 281 * __txn_recycle_id -- 282 * Find a range of useable transaction ids. 283 * 284 * PUBLIC: int __txn_recycle_id __P((ENV *)); 285 */ 286int 287__txn_recycle_id(env) 288 ENV *env; 289{ 290 DB_LSN null_lsn; 291 DB_TXNMGR *mgr; 292 DB_TXNREGION *region; 293 TXN_DETAIL *td; 294 u_int32_t *ids; 295 int nids, ret; 296 297 mgr = env->tx_handle; 298 region = mgr->reginfo.primary; 299 300 if ((ret = __os_malloc(env, 301 sizeof(u_int32_t) * region->maxtxns, &ids)) != 0) 302 return (ret); 303 nids = 0; 304 SH_TAILQ_FOREACH(td, ®ion->active_txn, links, __txn_detail) 305 ids[nids++] = td->txnid; 306 region->last_txnid = TXN_MINIMUM - 1; 307 region->cur_maxid = TXN_MAXIMUM; 308 if (nids != 0) 309 __db_idspace(ids, nids, 310 ®ion->last_txnid, ®ion->cur_maxid); 311 __os_free(env, ids); 312 313 /* 314 * Check LOGGING_ON rather than DBENV_LOGGING as we want to emit this 315 * record at the end of recovery. 316 */ 317 if (LOGGING_ON(env)) 318 ret = __txn_recycle_log(env, NULL, &null_lsn, 319 0, region->last_txnid + 1, region->cur_maxid); 320 321 return (ret); 322} 323 324/* 325 * __txn_compensate_begin 326 * Begin an compensation transaction. This is a special interface 327 * that is used only for transactions that must be started to compensate 328 * for actions during an abort. Currently only used for allocations. 
329 * 330 * PUBLIC: int __txn_compensate_begin __P((ENV *, DB_TXN **)); 331 */ 332int 333__txn_compensate_begin(env, txnpp) 334 ENV *env; 335 DB_TXN **txnpp; 336{ 337 DB_TXN *txn; 338 int ret; 339 340 if ((ret = __os_calloc(env, 1, sizeof(DB_TXN), &txn)) != 0) 341 return (ret); 342 343 txn->mgrp = env->tx_handle; 344 TAILQ_INIT(&txn->kids); 345 TAILQ_INIT(&txn->events); 346 STAILQ_INIT(&txn->logs); 347 txn->flags = TXN_COMPENSATE | TXN_MALLOC; 348 349 *txnpp = txn; 350 return (__txn_begin_int(txn)); 351} 352 353/* 354 * __txn_begin_int -- 355 * Normal DB version of txn_begin. 356 */ 357static int 358__txn_begin_int(txn) 359 DB_TXN *txn; 360{ 361 DB_ENV *dbenv; 362 DB_TXNMGR *mgr; 363 DB_TXNREGION *region; 364 ENV *env; 365 TXN_DETAIL *td; 366 u_int32_t id; 367 int ret; 368 369 mgr = txn->mgrp; 370 env = mgr->env; 371 dbenv = env->dbenv; 372 region = mgr->reginfo.primary; 373 374 TXN_SYSTEM_LOCK(env); 375 if (!F_ISSET(txn, TXN_COMPENSATE) && F_ISSET(region, TXN_IN_RECOVERY)) { 376 __db_errx(env, "operation not permitted during recovery"); 377 ret = EINVAL; 378 goto err; 379 } 380 381 /* 382 * Allocate a new transaction id. Our current valid range can span 383 * the maximum valid value, so check for it and wrap manually. 384 */ 385 if (region->last_txnid == TXN_MAXIMUM && 386 region->cur_maxid != TXN_MAXIMUM) 387 region->last_txnid = TXN_MINIMUM - 1; 388 389 if (region->last_txnid == region->cur_maxid && 390 (ret = __txn_recycle_id(env)) != 0) 391 goto err; 392 393 /* Allocate a new transaction detail structure. */ 394 if ((ret = 395 __env_alloc(&mgr->reginfo, sizeof(TXN_DETAIL), &td)) != 0) { 396 __db_errx(env, 397 "Unable to allocate memory for transaction detail"); 398 goto err; 399 } 400 401 /* Place transaction on active transaction list. 
*/ 402 SH_TAILQ_INSERT_HEAD(®ion->active_txn, td, links, __txn_detail); 403 404 id = ++region->last_txnid; 405 406#ifdef HAVE_STATISTICS 407 ++region->stat.st_nbegins; 408 if (++region->stat.st_nactive > region->stat.st_maxnactive) 409 region->stat.st_maxnactive = region->stat.st_nactive; 410#endif 411 412 td->txnid = id; 413 dbenv->thread_id(dbenv, &td->pid, &td->tid); 414 415 /* allocate a locker for this txn */ 416 if (LOCKING_ON(env) && (ret = 417 __lock_getlocker(env->lk_handle, id, 1, &txn->locker)) != 0) 418 goto err; 419 420 ZERO_LSN(td->last_lsn); 421 ZERO_LSN(td->begin_lsn); 422 SH_TAILQ_INIT(&td->kids); 423 if (txn->parent != NULL) 424 td->parent = R_OFFSET(&mgr->reginfo, txn->parent->td); 425 else 426 td->parent = INVALID_ROFF; 427 td->name = INVALID_ROFF; 428 MAX_LSN(td->read_lsn); 429 MAX_LSN(td->visible_lsn); 430 td->mvcc_ref = 0; 431 td->mvcc_mtx = MUTEX_INVALID; 432 td->status = TXN_RUNNING; 433 td->flags = 0; 434 td->xa_status = 0; 435 td->nlog_dbs = 0; 436 td->nlog_slots = TXN_NSLOTS; 437 td->log_dbs = R_OFFSET(&mgr->reginfo, td->slots); 438 439 TXN_SYSTEM_UNLOCK(env); 440 441 txn->txnid = id; 442 txn->td = td; 443 444 txn->abort = __txn_abort_pp; 445 txn->commit = __txn_commit_pp; 446 txn->discard = __txn_discard; 447 txn->get_name = __txn_get_name; 448 txn->id = __txn_id; 449 txn->prepare = __txn_prepare; 450 txn->set_txn_lsnp = __txn_set_txn_lsnp; 451 txn->set_name = __txn_set_name; 452 txn->set_timeout = __txn_set_timeout; 453 454 /* 455 * If this is a transaction family, we must link the child to the 456 * maximal grandparent in the lock table for deadlock detection. 
457 */ 458 if (txn->parent != NULL && LOCKING_ON(env)) 459 if ((ret = __lock_addfamilylocker(env, 460 txn->parent->txnid, txn->txnid)) != 0) 461 return (ret); 462 463 if (F_ISSET(txn, TXN_MALLOC)) { 464 MUTEX_LOCK(env, mgr->mutex); 465 TAILQ_INSERT_TAIL(&mgr->txn_chain, txn, links); 466 MUTEX_UNLOCK(env, mgr->mutex); 467 } 468 469 return (0); 470 471err: TXN_SYSTEM_UNLOCK(env); 472 return (ret); 473} 474 475/* 476 * __txn_continue 477 * Fill in the fields of the local transaction structure given 478 * the detail transaction structure. 479 * 480 * PUBLIC: int __txn_continue __P((ENV *, DB_TXN *, TXN_DETAIL *)); 481 */ 482int 483__txn_continue(env, txn, td) 484 ENV *env; 485 DB_TXN *txn; 486 TXN_DETAIL *td; 487{ 488 int ret; 489 490 ret = 0; 491 492 txn->mgrp = env->tx_handle; 493 txn->parent = NULL; 494 txn->txnid = td->txnid; 495 txn->td = td; 496 497 txn->abort = __txn_abort_pp; 498 txn->commit = __txn_commit_pp; 499 txn->discard = __txn_discard; 500 txn->get_name = __txn_get_name; 501 txn->id = __txn_id; 502 txn->prepare = __txn_prepare; 503 txn->set_name = __txn_set_name; 504 505 txn->flags = 0; 506 /* 507 * If this is a restored transaction, we need to propagate that fact 508 * to the process-local structure. However, if it's not a restored 509 * transaction, then we're running in XA and we need to make sure 510 * that we have a locker associated with this transaction. 511 */ 512 if (F_ISSET(td, TXN_DTL_RESTORED)) 513 F_SET(txn, TXN_RESTORED); 514 else 515 ret = __lock_getlocker(env->lk_handle, 516 txn->txnid, 0, &txn->locker); 517 518 return (ret); 519} 520 521/* 522 * __txn_commit_pp -- 523 * Interface routine to TXN->commit. 
524 */ 525static int 526__txn_commit_pp(txn, flags) 527 DB_TXN *txn; 528 u_int32_t flags; 529{ 530 DB_THREAD_INFO *ip; 531 ENV *env; 532 int not_child, ret, t_ret; 533 534 env = txn->mgrp->env; 535 not_child = txn->parent == NULL; 536 537 ENV_ENTER(env, ip); 538 539 ret = __txn_commit(txn, flags); 540 if (not_child && IS_ENV_REPLICATED(env) && 541 (t_ret = __op_rep_exit(env)) != 0 && ret == 0) 542 ret = t_ret; 543 ENV_LEAVE(env, ip); 544 return (ret); 545} 546 547/* 548 * __txn_commit -- 549 * Commit a transaction. 550 * 551 * PUBLIC: int __txn_commit __P((DB_TXN *, u_int32_t)); 552 */ 553int 554__txn_commit(txn, flags) 555 DB_TXN *txn; 556 u_int32_t flags; 557{ 558 DBT list_dbt; 559 DB_LOCKREQ request; 560 DB_TXN *kid; 561 ENV *env; 562 REGENV *renv; 563 REGINFO *infop; 564 TXN_DETAIL *td; 565 u_int32_t id; 566 int ret, t_ret; 567 568 env = txn->mgrp->env; 569 td = txn->td; 570 571 /* 572 * A common mistake in Berkeley DB programs is to mis-handle deadlock 573 * return. If the transaction deadlocked, they want abort, not commit. 574 */ 575 if (F_ISSET(txn, TXN_DEADLOCK)) { 576 ret = __db_txn_deadlock_err(env, txn); 577 goto err; 578 } 579 580 if ((ret = __txn_isvalid(txn, TXN_OP_COMMIT)) != 0) 581 return (ret); 582 583 /* 584 * Check for master leases at the beginning. If we are a 585 * master and cannot have valid leases now, we error and 586 * abort this txn. Leases are granted on PERM records, 587 * and since this is the beginning of txn_commit, there 588 * might not be *any* in the log yet. If that is the case, 589 * then __rep_lease_check (from __rep_lease_refresh and 590 * lower, log_c_get) will return DB_NOTFOUND. If we get 591 * that here, allow the operation to continue because leases 592 * will be checked after the commit completes again anyway. 
593 */ 594 if (txn->parent == NULL && IS_REP_MASTER(env) && 595 IS_USING_LEASES(env) && (ret = __rep_lease_check(env, 1)) != 0) { 596 if (ret == DB_NOTFOUND) { 597 ret = 0; 598 } else 599 goto err; 600 } 601 602 infop = env->reginfo; 603 renv = infop->primary; 604 /* 605 * No mutex is needed as envid is read-only once it is set. 606 */ 607 id = renv->envid; 608 609 /* 610 * We clear flags that are incorrect, ignoring any flag errors, and 611 * default to synchronous operations. By definition, transaction 612 * handles are dead when we return, and this error should never 613 * happen, but we don't want to fail in the field 'cause the app is 614 * specifying the wrong flag for some reason. 615 */ 616 if (__db_fchk(env, "DB_TXN->commit", flags, 617 DB_TXN_NOSYNC | DB_TXN_SYNC | DB_TXN_WRITE_NOSYNC) != 0) 618 flags = DB_TXN_SYNC; 619 if (__db_fcchk(env, "DB_TXN->commit", flags, 620 DB_TXN_SYNC, DB_TXN_NOSYNC | DB_TXN_WRITE_NOSYNC) != 0) 621 flags = DB_TXN_SYNC; 622 623 if (LF_ISSET(DB_TXN_WRITE_NOSYNC)) { 624 F_CLR(txn, TXN_SYNC_FLAGS); 625 F_SET(txn, TXN_WRITE_NOSYNC); 626 } 627 if (LF_ISSET(DB_TXN_NOSYNC)) { 628 F_CLR(txn, TXN_SYNC_FLAGS); 629 F_SET(txn, TXN_NOSYNC); 630 } 631 if (LF_ISSET(DB_TXN_SYNC)) { 632 F_CLR(txn, TXN_SYNC_FLAGS); 633 F_SET(txn, TXN_SYNC); 634 } 635 636 DB_ASSERT(env, F_ISSET(txn, TXN_SYNC_FLAGS)); 637 638 /* 639 * Commit any unresolved children. If anyone fails to commit, 640 * then try to abort the rest of the kids and then abort the parent. 641 * Abort should never fail; if it does, we bail out immediately. 642 */ 643 while ((kid = TAILQ_FIRST(&txn->kids)) != NULL) 644 if ((ret = __txn_commit(kid, flags)) != 0) 645 while ((kid = TAILQ_FIRST(&txn->kids)) != NULL) 646 if ((t_ret = __txn_abort(kid)) != 0) 647 return (__env_panic(env, t_ret)); 648 649 /* 650 * If there are any log records, write a log record and sync the log, 651 * else do no log writes. 
If the commit is for a child transaction, 652 * we do not need to commit the child synchronously since it may still 653 * abort (if its parent aborts), and otherwise its parent or ultimate 654 * ancestor will write synchronously. 655 */ 656 if (DBENV_LOGGING(env) && (!IS_ZERO_LSN(td->last_lsn) || 657 STAILQ_FIRST(&txn->logs) != NULL)) { 658 if (txn->parent == NULL) { 659 /* 660 * We are about to free all the read locks for this 661 * transaction below. Some of those locks might be 662 * handle locks which should not be freed, because 663 * they will be freed when the handle is closed. Check 664 * the events and preprocess any trades now so we don't 665 * release the locks below. 666 */ 667 if ((ret = 668 __txn_doevents(env, txn, TXN_PREPARE, 1)) != 0) 669 goto err; 670 671 memset(&request, 0, sizeof(request)); 672 if (LOCKING_ON(env)) { 673 request.op = DB_LOCK_PUT_READ; 674 if (IS_REP_MASTER(env) && 675 !IS_ZERO_LSN(td->last_lsn)) { 676 memset(&list_dbt, 0, sizeof(list_dbt)); 677 request.obj = &list_dbt; 678 } 679 ret = __lock_vec(env, 680 txn->locker, 0, &request, 1, NULL); 681 } 682 683 if (ret == 0 && !IS_ZERO_LSN(td->last_lsn)) { 684 ret = __txn_regop_log(env, txn, 685 &td->visible_lsn, LOG_FLAGS(txn), 686 TXN_COMMIT, 687 (int32_t)time(NULL), id, request.obj); 688 if (ret == 0) 689 td->last_lsn = td->visible_lsn; 690#ifdef DIAGNOSTIC 691 if (ret == 0) { 692 DB_LSN s_lsn; 693 694 DB_ASSERT(env, __log_current_lsn( 695 env, &s_lsn, NULL, NULL) == 0); 696 DB_ASSERT(env, LOG_COMPARE( 697 &td->visible_lsn, &s_lsn) <= 0); 698 COMPQUIET(s_lsn.file, 0); 699 } 700#endif 701 } 702 703 if (request.obj != NULL && request.obj->data != NULL) 704 __os_free(env, request.obj->data); 705 if (ret != 0) 706 goto err; 707 } else { 708 /* Log the commit in the parent! 
*/ 709 if (!IS_ZERO_LSN(td->last_lsn) && 710 (ret = __txn_child_log(env, txn->parent, 711 &((TXN_DETAIL *)txn->parent->td)->last_lsn, 712 0, txn->txnid, &td->last_lsn)) != 0) { 713 goto err; 714 } 715 if (STAILQ_FIRST(&txn->logs) != NULL) { 716 /* 717 * Put the child first so we back it out first. 718 * All records are undone in reverse order. 719 */ 720 STAILQ_CONCAT(&txn->logs, &txn->parent->logs); 721 txn->parent->logs = txn->logs; 722 STAILQ_INIT(&txn->logs); 723 } 724 725 F_SET(txn->parent, TXN_CHILDCOMMIT); 726 } 727 } 728 729 if (txn->txn_list != NULL) { 730 __db_txnlist_end(env, txn->txn_list); 731 txn->txn_list = NULL; 732 } 733 734 if (ret != 0) 735 goto err; 736 737 /* 738 * Check for master leases at the end of only a normal commit. 739 * If we're a child, that is not a perm record. If we are a 740 * master and cannot get valid leases now, something happened 741 * during the commit. The only thing to do is panic. 742 */ 743 if (txn->parent == NULL && IS_REP_MASTER(env) && IS_USING_LEASES(env) && 744 (ret = __rep_lease_check(env, 1)) != 0) { 745 return (__env_panic(env, ret)); 746 } 747 748 /* This is OK because __txn_end can only fail with a panic. */ 749 return (__txn_end(txn, 1)); 750 751err: /* 752 * If we are prepared, then we "must" be able to commit. We panic here 753 * because even though the coordinator might be able to retry it is not 754 * clear it would know to do that. Otherwise we'll try to abort. If 755 * that is successful, then we return whatever was in ret (that is, the 756 * reason we failed). If the abort was unsuccessful, abort probably 757 * returned DB_RUNRECOVERY and we need to propagate that up. 758 */ 759 if (td->status == TXN_PREPARED) 760 return (__env_panic(env, ret)); 761 762 if ((t_ret = __txn_abort(txn)) != 0) 763 ret = t_ret; 764 return (ret); 765} 766 767/* 768 * __txn_abort_pp -- 769 * Interface routine to TXN->abort. 
770 */ 771static int 772__txn_abort_pp(txn) 773 DB_TXN *txn; 774{ 775 DB_THREAD_INFO *ip; 776 ENV *env; 777 int not_child, ret, t_ret; 778 779 env = txn->mgrp->env; 780 not_child = txn->parent == NULL; 781 782 ENV_ENTER(env, ip); 783 784 ret = __txn_abort(txn); 785 if (not_child && IS_ENV_REPLICATED(env) && 786 (t_ret = __op_rep_exit(env)) != 0 && ret == 0) 787 ret = t_ret; 788 ENV_LEAVE(env, ip); 789 return (ret); 790} 791 792/* 793 * __txn_abort -- 794 * Abort a transaction. 795 * 796 * PUBLIC: int __txn_abort __P((DB_TXN *)); 797 */ 798int 799__txn_abort(txn) 800 DB_TXN *txn; 801{ 802 DB_LOCKREQ request; 803 DB_TXN *kid; 804 ENV *env; 805 REGENV *renv; 806 REGINFO *infop; 807 TXN_DETAIL *td; 808 u_int32_t id; 809 int ret; 810 811 env = txn->mgrp->env; 812 td = txn->td; 813 814 /* Ensure that abort always fails fatally. */ 815 if ((ret = __txn_isvalid(txn, TXN_OP_ABORT)) != 0) 816 return (__env_panic(env, ret)); 817 818 /* 819 * Try to abort any unresolved children. 820 * 821 * Abort either succeeds or panics the region. As soon as we 822 * see any failure, we just get out of here and return the panic 823 * up. 824 */ 825 while ((kid = TAILQ_FIRST(&txn->kids)) != NULL) 826 if ((ret = __txn_abort(kid)) != 0) 827 return (ret); 828 829 infop = env->reginfo; 830 renv = infop->primary; 831 /* 832 * No mutex is needed as envid is read-only once it is set. 833 */ 834 id = renv->envid; 835 836 /* 837 * Fast path -- no need to do anything fancy if there were no 838 * modifications (e.g., log records) for this transaction. 839 * We still call txn_undo to cleanup the txn_list from our 840 * children. 841 */ 842 if (IS_ZERO_LSN(td->last_lsn) && STAILQ_FIRST(&txn->logs) == NULL) { 843 if (txn->txn_list == NULL) 844 goto done; 845 else 846 goto undo; 847 } 848 849 if (LOCKING_ON(env)) { 850 /* Allocate a locker for this restored txn if necessary. 
*/ 851 if (txn->locker == NULL && 852 (ret = __lock_getlocker(env->lk_handle, 853 txn->txnid, 1, &txn->locker)) != 0) 854 return (__env_panic(env, ret)); 855 /* 856 * We are about to free all the read locks for this transaction 857 * below. Some of those locks might be handle locks which 858 * should not be freed, because they will be freed when the 859 * handle is closed. Check the events and preprocess any 860 * trades now so that we don't release the locks below. 861 */ 862 if ((ret = __txn_doevents(env, txn, TXN_ABORT, 1)) != 0) 863 return (__env_panic(env, ret)); 864 865 /* Turn off timeouts. */ 866 if ((ret = __lock_set_timeout(env, 867 txn->locker, 0, DB_SET_TXN_TIMEOUT)) != 0) 868 return (__env_panic(env, ret)); 869 870 if ((ret = __lock_set_timeout(env, 871 txn->locker, 0, DB_SET_LOCK_TIMEOUT)) != 0) 872 return (__env_panic(env, ret)); 873 874 request.op = DB_LOCK_UPGRADE_WRITE; 875 request.obj = NULL; 876 if ((ret = __lock_vec( 877 env, txn->locker, 0, &request, 1, NULL)) != 0) 878 return (__env_panic(env, ret)); 879 } 880undo: if ((ret = __txn_undo(txn)) != 0) 881 return (__env_panic(env, ret)); 882 883 /* 884 * Normally, we do not need to log aborts. However, if we 885 * are a distributed transaction (i.e., we have a prepare), 886 * then we log the abort so we know that this transaction 887 * was actually completed. 888 */ 889done: if (DBENV_LOGGING(env) && td->status == TXN_PREPARED && 890 (ret = __txn_regop_log(env, txn, &td->last_lsn, 891 LOG_FLAGS(txn), TXN_ABORT, (int32_t)time(NULL), id, NULL)) != 0) 892 return (__env_panic(env, ret)); 893 894 /* __txn_end always panics if it errors, so pass the return along. */ 895 return (__txn_end(txn, 0)); 896} 897 898/* 899 * __txn_discard -- 900 * Interface routine to TXN->discard. 
901 */ 902static int 903__txn_discard(txn, flags) 904 DB_TXN *txn; 905 u_int32_t flags; 906{ 907 DB_THREAD_INFO *ip; 908 ENV *env; 909 int ret, t_ret; 910 911 env = txn->mgrp->env; 912 913 ENV_ENTER(env, ip); 914 ret = __txn_discard_int(txn, flags); 915 if (IS_ENV_REPLICATED(env) && 916 (t_ret = __op_rep_exit(env)) != 0 && ret == 0) 917 ret = t_ret; 918 ENV_LEAVE(env, ip); 919 return (ret); 920} 921 922/* 923 * __txn_discard -- 924 * Free the per-process resources associated with this txn handle. 925 * 926 * PUBLIC: int __txn_discard_int __P((DB_TXN *, u_int32_t flags)); 927 */ 928int 929__txn_discard_int(txn, flags) 930 DB_TXN *txn; 931 u_int32_t flags; 932{ 933 DB_TXN *freep; 934 DB_TXNMGR *mgr; 935 ENV *env; 936 int ret; 937 938 COMPQUIET(flags, 0); 939 940 mgr = txn->mgrp; 941 env = mgr->env; 942 freep = NULL; 943 944 if ((ret = __txn_isvalid(txn, TXN_OP_DISCARD)) != 0) 945 return (ret); 946 947 /* Should be no children. */ 948 DB_ASSERT(env, TAILQ_FIRST(&txn->kids) == NULL); 949 950 /* Free the space. */ 951 MUTEX_LOCK(env, mgr->mutex); 952 mgr->n_discards++; 953 if (F_ISSET(txn, TXN_MALLOC)) { 954 TAILQ_REMOVE(&mgr->txn_chain, txn, links); 955 freep = txn; 956 } 957 MUTEX_UNLOCK(env, mgr->mutex); 958 if (freep != NULL) 959 __os_free(env, freep); 960 961 return (0); 962} 963 964/* 965 * __txn_prepare -- 966 * Flush the log so a future commit is guaranteed to succeed. 967 * 968 * PUBLIC: int __txn_prepare __P((DB_TXN *, u_int8_t *)); 969 */ 970int 971__txn_prepare(txn, gid) 972 DB_TXN *txn; 973 u_int8_t *gid; 974{ 975 DBT list_dbt, xid; 976 DB_LOCKREQ request; 977 DB_THREAD_INFO *ip; 978 DB_TXN *kid; 979 ENV *env; 980 TXN_DETAIL *td; 981 u_int32_t lflags; 982 int ret; 983 984 env = txn->mgrp->env; 985 td = txn->td; 986 987 if ((ret = __txn_isvalid(txn, TXN_OP_PREPARE)) != 0) 988 return (ret); 989 if (F_ISSET(txn, TXN_DEADLOCK)) 990 return (__db_txn_deadlock_err(env, txn)); 991 992 ENV_ENTER(env, ip); 993 994 /* Commit any unresolved children. 
*/ 995 while ((kid = TAILQ_FIRST(&txn->kids)) != NULL) 996 if ((ret = __txn_commit(kid, DB_TXN_NOSYNC)) != 0) 997 goto err; 998 999 /* 1000 * In XA, the global transaction ID in the txn_detail structure is 1001 * already set; in a non-XA environment, we must set it here. XA 1002 * requires that the transaction be either ENDED or SUSPENDED when 1003 * prepare is called, so we know that if the xa_status isn't in one 1004 * of those states, then we are calling prepare directly and we need 1005 * to fill in the td->xid. 1006 */ 1007 if ((ret = __txn_doevents(env, txn, TXN_PREPARE, 1)) != 0) 1008 goto err; 1009 memset(&request, 0, sizeof(request)); 1010 if (LOCKING_ON(env)) { 1011 request.op = DB_LOCK_PUT_READ; 1012 if (!IS_ZERO_LSN(td->last_lsn)) { 1013 memset(&list_dbt, 0, sizeof(list_dbt)); 1014 request.obj = &list_dbt; 1015 } 1016 if ((ret = __lock_vec(env, 1017 txn->locker, 0, &request, 1, NULL)) != 0) 1018 goto err; 1019 1020 } 1021 if (DBENV_LOGGING(env)) { 1022 memset(&xid, 0, sizeof(xid)); 1023 if (td->xa_status != TXN_XA_ENDED && 1024 td->xa_status != TXN_XA_SUSPENDED) 1025 /* Regular prepare; fill in the gid. */ 1026 memcpy(td->xid, gid, sizeof(td->xid)); 1027 1028 xid.size = sizeof(td->xid); 1029 xid.data = td->xid; 1030 1031 lflags = DB_LOG_COMMIT | DB_FLUSH; 1032 if ((ret = __txn_xa_regop_log(env, txn, &td->last_lsn, 1033 lflags, TXN_PREPARE, &xid, td->format, td->gtrid, td->bqual, 1034 &td->begin_lsn, request.obj)) != 0) 1035 __db_err( 1036 env, ret, "DB_TXN->prepare: log_write failed"); 1037 1038 if (request.obj != NULL && request.obj->data != NULL) 1039 __os_free(env, request.obj->data); 1040 if (ret != 0) 1041 goto err; 1042 1043 } 1044 1045 MUTEX_LOCK(env, txn->mgrp->mutex); 1046 td->status = TXN_PREPARED; 1047 MUTEX_UNLOCK(env, txn->mgrp->mutex); 1048err: ENV_LEAVE(env, ip); 1049 return (ret); 1050} 1051 1052/* 1053 * __txn_id -- 1054 * Return the transaction ID. 
1055 * 1056 * PUBLIC: u_int32_t __txn_id __P((DB_TXN *)); 1057 */ 1058u_int32_t 1059__txn_id(txn) 1060 DB_TXN *txn; 1061{ 1062 return (txn->txnid); 1063} 1064 1065/* 1066 * __txn_get_name -- 1067 * Get a descriptive string from a transaction. 1068 * 1069 * PUBLIC: int __txn_get_name __P((DB_TXN *, const char **)); 1070 */ 1071int 1072__txn_get_name(txn, namep) 1073 DB_TXN *txn; 1074 const char **namep; 1075{ 1076 *namep = txn->name; 1077 1078 return (0); 1079} 1080 1081/* 1082 * __txn_set_name -- 1083 * Set a descriptive string for a transaction. 1084 * 1085 * PUBLIC: int __txn_set_name __P((DB_TXN *, const char *)); 1086 */ 1087int 1088__txn_set_name(txn, name) 1089 DB_TXN *txn; 1090 const char *name; 1091{ 1092 DB_THREAD_INFO *ip; 1093 DB_TXNMGR *mgr; 1094 ENV *env; 1095 TXN_DETAIL *td; 1096 size_t len; 1097 int ret; 1098 char *p; 1099 1100 mgr = txn->mgrp; 1101 env = mgr->env; 1102 td = txn->td; 1103 len = strlen(name) + 1; 1104 1105 if ((ret = __os_realloc(env, len, &txn->name)) != 0) 1106 return (ret); 1107 memcpy(txn->name, name, len); 1108 1109 ENV_ENTER(env, ip); 1110 TXN_SYSTEM_LOCK(env); 1111 if (td->name != INVALID_ROFF) { 1112 __env_alloc_free( 1113 &mgr->reginfo, R_ADDR(&mgr->reginfo, td->name)); 1114 td->name = INVALID_ROFF; 1115 } 1116 if ((ret = __env_alloc(&mgr->reginfo, len, &p)) != 0) { 1117 TXN_SYSTEM_UNLOCK(env); 1118 __db_errx(env, 1119 "Unable to allocate memory for transaction name"); 1120 1121 __os_free(env, txn->name); 1122 txn->name = NULL; 1123 1124 ENV_LEAVE(env, ip); 1125 return (ret); 1126 } 1127 TXN_SYSTEM_UNLOCK(env); 1128 td->name = R_OFFSET(&mgr->reginfo, p); 1129 memcpy(p, name, len); 1130 1131#ifdef DIAGNOSTIC 1132 /* 1133 * If DIAGNOSTIC is set, map the name into the log so users can track 1134 * operations through the log. 
1135 */ 1136 if (DBENV_LOGGING(env)) 1137 (void)__log_printf(env, txn, 1138 "transaction %#lx named %s", (u_long)txn->txnid, name); 1139#endif 1140 1141 ENV_LEAVE(env, ip); 1142 return (0); 1143} 1144 1145/* 1146 * __txn_set_timeout -- 1147 * ENV->set_txn_timeout. 1148 * PUBLIC: int __txn_set_timeout __P((DB_TXN *, db_timeout_t, u_int32_t)); 1149 */ 1150int 1151__txn_set_timeout(txn, timeout, op) 1152 DB_TXN *txn; 1153 db_timeout_t timeout; 1154 u_int32_t op; 1155{ 1156 DB_THREAD_INFO *ip; 1157 ENV *env; 1158 int ret; 1159 1160 env = txn->mgrp->env; 1161 1162 if (op != DB_SET_TXN_TIMEOUT && op != DB_SET_LOCK_TIMEOUT) 1163 return (__db_ferr(env, "DB_TXN->set_timeout", 0)); 1164 1165 ENV_ENTER(env, ip); 1166 ret = __lock_set_timeout( env, txn->locker, timeout, op); 1167 ENV_LEAVE(txn->mgrp->env, ip); 1168 return (ret); 1169} 1170 1171/* 1172 * __txn_isvalid -- 1173 * Return 0 if the DB_TXN is reasonable, otherwise panic. 1174 */ 1175static int 1176__txn_isvalid(txn, op) 1177 const DB_TXN *txn; 1178 txnop_t op; 1179{ 1180 DB_TXNMGR *mgr; 1181 DB_TXNREGION *region; 1182 ENV *env; 1183 TXN_DETAIL *td; 1184 1185 mgr = txn->mgrp; 1186 env = mgr->env; 1187 region = mgr->reginfo.primary; 1188 1189 /* Check for recovery. */ 1190 if (!F_ISSET(txn, TXN_COMPENSATE) && 1191 F_ISSET(region, TXN_IN_RECOVERY)) { 1192 __db_errx(env, "operation not permitted during recovery"); 1193 goto err; 1194 } 1195 1196 /* Check for live cursors. */ 1197 if (txn->cursors != 0) { 1198 __db_errx(env, "transaction has active cursors"); 1199 goto err; 1200 } 1201 1202 /* Check transaction's state. */ 1203 td = txn->td; 1204 1205 /* Handle any operation specific checks. */ 1206 switch (op) { 1207 case TXN_OP_DISCARD: 1208 /* 1209 * Since we're just tossing the per-process space; there are 1210 * a lot of problems with the transaction that we can tolerate. 1211 */ 1212 1213 /* Transaction is already been reused. 
*/ 1214 if (txn->txnid != td->txnid) 1215 return (0); 1216 1217 /* 1218 * What we've got had better be either a prepared or 1219 * restored transaction. 1220 */ 1221 if (td->status != TXN_PREPARED && 1222 !F_ISSET(td, TXN_DTL_RESTORED)) { 1223 __db_errx(env, "not a restored transaction"); 1224 return (__env_panic(env, EINVAL)); 1225 } 1226 1227 return (0); 1228 case TXN_OP_PREPARE: 1229 if (txn->parent != NULL) { 1230 /* 1231 * This is not fatal, because you could imagine an 1232 * application that simply prepares everybody because 1233 * it doesn't distinguish between children and parents. 1234 * I'm not arguing this is good, but I could imagine 1235 * someone doing it. 1236 */ 1237 __db_errx(env, 1238 "Prepare disallowed on child transactions"); 1239 return (EINVAL); 1240 } 1241 break; 1242 case TXN_OP_ABORT: 1243 case TXN_OP_COMMIT: 1244 default: 1245 break; 1246 } 1247 1248 switch (td->status) { 1249 case TXN_PREPARED: 1250 if (op == TXN_OP_PREPARE) { 1251 __db_errx(env, "transaction already prepared"); 1252 /* 1253 * Txn_prepare doesn't blow away the user handle, so 1254 * in this case, give the user the opportunity to 1255 * abort or commit. 1256 */ 1257 return (EINVAL); 1258 } 1259 break; 1260 case TXN_RUNNING: 1261 break; 1262 case TXN_ABORTED: 1263 case TXN_COMMITTED: 1264 default: 1265 __db_errx(env, "transaction already %s", 1266 td->status == TXN_COMMITTED ? "committed" : "aborted"); 1267 goto err; 1268 } 1269 1270 return (0); 1271 1272err: /* 1273 * If there's a serious problem with the transaction, panic. TXN 1274 * handles are dead by definition when we return, and if you use 1275 * a cursor you forgot to close, we have no idea what will happen. 1276 */ 1277 return (__env_panic(env, EINVAL)); 1278} 1279 1280/* 1281 * __txn_end -- 1282 * Internal transaction end routine. 
1283 */ 1284static int 1285__txn_end(txn, is_commit) 1286 DB_TXN *txn; 1287 int is_commit; 1288{ 1289 DB_LOCKREQ request; 1290 DB_TXNLOGREC *lr; 1291 DB_TXNMGR *mgr; 1292 DB_TXNREGION *region; 1293 ENV *env; 1294 TXN_DETAIL *ptd, *td; 1295 db_mutex_t mvcc_mtx; 1296 int do_closefiles, ret; 1297 1298 mgr = txn->mgrp; 1299 env = mgr->env; 1300 region = mgr->reginfo.primary; 1301 do_closefiles = 0; 1302 1303 /* Process commit events. */ 1304 if ((ret = __txn_doevents(env, 1305 txn, is_commit ? TXN_COMMIT : TXN_ABORT, 0)) != 0) 1306 return (__env_panic(env, ret)); 1307 1308 /* 1309 * Release the locks. 1310 * 1311 * __txn_end cannot return an simple error, we MUST return 1312 * success/failure from commit or abort, ignoring any internal 1313 * errors. So, we panic if something goes wrong. We can't 1314 * deadlock here because we're not acquiring any new locks, 1315 * so DB_LOCK_DEADLOCK is just as fatal as any other error. 1316 */ 1317 if (LOCKING_ON(env)) { 1318 /* Allocate a locker for this restored txn if necessary. */ 1319 if (txn->locker == NULL && 1320 (ret = __lock_getlocker(env->lk_handle, 1321 txn->txnid, 1, &txn->locker)) != 0) 1322 return (__env_panic(env, ret)); 1323 request.op = txn->parent == NULL || 1324 is_commit == 0 ? DB_LOCK_PUT_ALL : DB_LOCK_INHERIT; 1325 request.obj = NULL; 1326 if ((ret = __lock_vec(env, 1327 txn->locker, 0, &request, 1, NULL)) != 0) 1328 return (__env_panic(env, ret)); 1329 } 1330 1331 /* End the transaction. */ 1332 td = txn->td; 1333 if (td->nlog_dbs != 0 && (ret = __txn_dref_fname(env, txn)) != 0) 1334 return (__env_panic(env, ret)); 1335 1336 if (td->mvcc_ref != 0 && IS_MAX_LSN(td->visible_lsn)) { 1337 DB_ASSERT(env, !is_commit); 1338 1339 /* 1340 * In the abort path, we need to make sure that the versions 1341 * become visible to future transactions. We need to set 1342 * visible_lsn before setting td->status to ensure safe reads 1343 * of visible_lsn in __memp_fget. 
1344 */ 1345 if ((ret = __log_current_lsn(env, &td->visible_lsn, 1346 NULL, NULL)) != 0) 1347 return (__env_panic(env, ret)); 1348 } 1349 1350 TXN_SYSTEM_LOCK(env); 1351 td->status = is_commit ? TXN_COMMITTED : TXN_ABORTED; 1352 SH_TAILQ_REMOVE(®ion->active_txn, td, links, __txn_detail); 1353 if (F_ISSET(td, TXN_DTL_RESTORED)) { 1354 region->stat.st_nrestores--; 1355 do_closefiles = region->stat.st_nrestores == 0; 1356 } 1357 1358 if (td->name != INVALID_ROFF) { 1359 __env_alloc_free( 1360 &mgr->reginfo, R_ADDR(&mgr->reginfo, td->name)); 1361 td->name = INVALID_ROFF; 1362 } 1363 if (txn->parent != NULL) { 1364 ptd = txn->parent->td; 1365 SH_TAILQ_REMOVE(&ptd->kids, td, klinks, __txn_detail); 1366 } else if ((mvcc_mtx = td->mvcc_mtx) != MUTEX_INVALID) { 1367 MUTEX_LOCK(env, mvcc_mtx); 1368 if (td->mvcc_ref != 0) { 1369 SH_TAILQ_INSERT_HEAD(®ion->mvcc_txn, 1370 td, links, __txn_detail); 1371#ifdef HAVE_STATISTICS 1372 if (++region->stat.st_nsnapshot > 1373 region->stat.st_maxnsnapshot) 1374 region->stat.st_maxnsnapshot = 1375 region->stat.st_nsnapshot; 1376#endif 1377 td = NULL; 1378 } 1379 MUTEX_UNLOCK(env, mvcc_mtx); 1380 if (td != NULL) 1381 if ((ret = __mutex_free(env, &td->mvcc_mtx)) != 0) 1382 return (__env_panic(env, ret)); 1383 } 1384 1385 if (td != NULL) { 1386 if (td->nlog_slots != TXN_NSLOTS) 1387 __env_alloc_free(&mgr->reginfo, 1388 R_ADDR(&mgr->reginfo, td->log_dbs)); 1389 __env_alloc_free(&mgr->reginfo, td); 1390 } 1391 1392#ifdef HAVE_STATISTICS 1393 if (is_commit) 1394 region->stat.st_ncommits++; 1395 else 1396 region->stat.st_naborts++; 1397 --region->stat.st_nactive; 1398#endif 1399 1400 TXN_SYSTEM_UNLOCK(env); 1401 1402 /* 1403 * The transaction cannot get more locks, remove its locker info, 1404 * if any. 
1405 */ 1406 if (LOCKING_ON(env) && (ret = 1407 __lock_freefamilylocker(env->lk_handle, txn->locker)) != 0) 1408 return (__env_panic(env, ret)); 1409 if (txn->parent != NULL) 1410 TAILQ_REMOVE(&txn->parent->kids, txn, klinks); 1411 1412 /* Free the space. */ 1413 while ((lr = STAILQ_FIRST(&txn->logs)) != NULL) { 1414 STAILQ_REMOVE(&txn->logs, lr, __txn_logrec, links); 1415 __os_free(env, lr); 1416 } 1417 if (txn->name != NULL) { 1418 __os_free(env, txn->name); 1419 txn->name = NULL; 1420 } 1421 if (F_ISSET(txn, TXN_MALLOC)) { 1422 MUTEX_LOCK(env, mgr->mutex); 1423 TAILQ_REMOVE(&mgr->txn_chain, txn, links); 1424 MUTEX_UNLOCK(env, mgr->mutex); 1425 1426 __os_free(env, txn); 1427 } 1428 1429 if (do_closefiles) { 1430 /* 1431 * Otherwise, we have resolved the last outstanding prepared 1432 * txn and need to invalidate the fileids that were left 1433 * open for those txns and then close them. 1434 */ 1435 (void)__dbreg_invalidate_files(env, 1); 1436 (void)__dbreg_close_files(env, 1); 1437 if (IS_REP_MASTER(env)) 1438 F_CLR(env->rep_handle, DBREP_OPENFILES); 1439 F_CLR(env->lg_handle, DBLOG_OPENFILES); 1440 mgr->n_discards = 0; 1441 (void)__txn_checkpoint(env, 0, 0, 1442 DB_CKP_INTERNAL | DB_FORCE); 1443 } 1444 1445 return (0); 1446} 1447 1448static int 1449__txn_dispatch_undo(env, txn, rdbt, key_lsn, txnlist) 1450 ENV *env; 1451 DB_TXN *txn; 1452 DBT *rdbt; 1453 DB_LSN *key_lsn; 1454 DB_TXNHEAD *txnlist; 1455{ 1456 int ret; 1457 1458 txnlist->td = txn->td; 1459 ret = __db_dispatch(env, &env->recover_dtab, 1460 rdbt, key_lsn, DB_TXN_ABORT, txnlist); 1461 if (ret == DB_SURPRISE_KID) { 1462 F_SET(txn, TXN_CHILDCOMMIT); 1463 ret = 0; 1464 } 1465 if (ret == 0 && F_ISSET(txn, TXN_CHILDCOMMIT) && IS_ZERO_LSN(*key_lsn)) 1466 ret = __db_txnlist_lsnget(env, txnlist, key_lsn, 0); 1467 1468 return (ret); 1469} 1470 1471/* 1472 * __txn_undo -- 1473 * Undo the transaction with id txnid. 
1474 */ 1475static int 1476__txn_undo(txn) 1477 DB_TXN *txn; 1478{ 1479 DBT rdbt; 1480 DB_LOGC *logc; 1481 DB_LSN key_lsn; 1482 DB_TXN *ptxn; 1483 DB_TXNHEAD *txnlist; 1484 DB_TXNLOGREC *lr; 1485 DB_TXNMGR *mgr; 1486 ENV *env; 1487 int ret, t_ret; 1488 1489 mgr = txn->mgrp; 1490 env = mgr->env; 1491 logc = NULL; 1492 txnlist = NULL; 1493 ret = 0; 1494 1495 if (!LOGGING_ON(env)) 1496 return (0); 1497 1498 /* 1499 * This is the simplest way to code this, but if the mallocs during 1500 * recovery turn out to be a performance issue, we can do the 1501 * allocation here and use DB_DBT_USERMEM. 1502 */ 1503 memset(&rdbt, 0, sizeof(rdbt)); 1504 1505 /* 1506 * Allocate a txnlist for children and aborted page allocs. 1507 * We need to associate the list with the maximal parent 1508 * so that aborted pages are recovered when that transaction 1509 * is committed or aborted. 1510 */ 1511 for (ptxn = txn->parent; ptxn != NULL && ptxn->parent != NULL;) 1512 ptxn = ptxn->parent; 1513 1514 if (ptxn != NULL && ptxn->txn_list != NULL) 1515 txnlist = ptxn->txn_list; 1516 else if (txn->txn_list != NULL) 1517 txnlist = txn->txn_list; 1518 else if ((ret = __db_txnlist_init(env, 1519 txn->thread_info, 0, 0, NULL, &txnlist)) != 0) 1520 return (ret); 1521 else if (ptxn != NULL) 1522 ptxn->txn_list = txnlist; 1523 1524 /* 1525 * Take log records from the linked list stored in the transaction, 1526 * then from the log. 
1527 */ 1528 STAILQ_FOREACH(lr, &txn->logs, links) { 1529 rdbt.data = lr->data; 1530 rdbt.size = 0; 1531 LSN_NOT_LOGGED(key_lsn); 1532 ret = 1533 __txn_dispatch_undo(env, txn, &rdbt, &key_lsn, txnlist); 1534 if (ret != 0) { 1535 __db_err(env, ret, 1536 "DB_TXN->abort: in-memory log undo failed"); 1537 goto err; 1538 } 1539 } 1540 1541 key_lsn = ((TXN_DETAIL *)txn->td)->last_lsn; 1542 1543 if (!IS_ZERO_LSN(key_lsn) && 1544 (ret = __log_cursor(env, &logc)) != 0) 1545 goto err; 1546 1547 while (!IS_ZERO_LSN(key_lsn)) { 1548 /* 1549 * The dispatch routine returns the lsn of the record 1550 * before the current one in the key_lsn argument. 1551 */ 1552 if ((ret = __logc_get(logc, &key_lsn, &rdbt, DB_SET)) == 0) { 1553 ret = __txn_dispatch_undo(env, 1554 txn, &rdbt, &key_lsn, txnlist); 1555 } 1556 1557 if (ret != 0) { 1558 __db_err(env, ret, 1559 "DB_TXN->abort: log undo failed for LSN: %lu %lu", 1560 (u_long)key_lsn.file, (u_long)key_lsn.offset); 1561 goto err; 1562 } 1563 } 1564 1565err: if (logc != NULL && (t_ret = __logc_close(logc)) != 0 && ret == 0) 1566 ret = t_ret; 1567 1568 if (ptxn == NULL && txnlist != NULL) 1569 __db_txnlist_end(env, txnlist); 1570 return (ret); 1571} 1572 1573/* 1574 * __txn_activekids -- 1575 * Return if this transaction has any active children. 1576 * 1577 * PUBLIC: int __txn_activekids __P((ENV *, u_int32_t, DB_TXN *)); 1578 */ 1579int 1580__txn_activekids(env, rectype, txn) 1581 ENV *env; 1582 u_int32_t rectype; 1583 DB_TXN *txn; 1584{ 1585 /* 1586 * On a child commit, we know that there are children (i.e., the 1587 * committing child at the least. In that case, skip this check. 
1588 */ 1589 if (F_ISSET(txn, TXN_COMPENSATE) || rectype == DB___txn_child) 1590 return (0); 1591 1592 if (TAILQ_FIRST(&txn->kids) != NULL) { 1593 __db_errx(env, "Child transaction is active"); 1594 return (EPERM); 1595 } 1596 return (0); 1597} 1598 1599/* 1600 * __txn_force_abort -- 1601 * Force an abort record into the log if the commit record 1602 * failed to get to disk. 1603 * 1604 * PUBLIC: int __txn_force_abort __P((ENV *, u_int8_t *)); 1605 */ 1606int 1607__txn_force_abort(env, buffer) 1608 ENV *env; 1609 u_int8_t *buffer; 1610{ 1611 DB_CIPHER *db_cipher; 1612 HDR hdr, *hdrp; 1613 u_int32_t offset, opcode, sum_len; 1614 u_int8_t *bp, *key, chksum[DB_MAC_KEY]; 1615 size_t hdrsize, rec_len; 1616 int ret; 1617 1618 db_cipher = env->crypto_handle; 1619 1620 /* 1621 * This routine depends on the layout of HDR and the __txn_regop 1622 * __txn_xa_regop records in txn.src. We are passed the beginning 1623 * of the commit record in the log buffer and overwrite the 1624 * commit with an abort and recalculate the checksum. 1625 */ 1626 hdrsize = CRYPTO_ON(env) ? 
HDR_CRYPTO_SZ : HDR_NORMAL_SZ; 1627 1628 hdrp = (HDR *)buffer; 1629 memcpy(&hdr.prev, buffer + SSZ(HDR, prev), sizeof(hdr.prev)); 1630 memcpy(&hdr.len, buffer + SSZ(HDR, len), sizeof(hdr.len)); 1631 rec_len = hdr.len - hdrsize; 1632 1633 offset = sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(DB_LSN); 1634 if (CRYPTO_ON(env)) { 1635 key = db_cipher->mac_key; 1636 sum_len = DB_MAC_KEY; 1637 if ((ret = db_cipher->decrypt(env, db_cipher->data, 1638 &hdrp->iv[0], buffer + hdrsize, rec_len)) != 0) 1639 return (__env_panic(env, ret)); 1640 } else { 1641 key = NULL; 1642 sum_len = sizeof(u_int32_t); 1643 } 1644 bp = buffer + hdrsize + offset; 1645 opcode = TXN_ABORT; 1646 memcpy(bp, &opcode, sizeof(opcode)); 1647 1648 if (CRYPTO_ON(env) && 1649 (ret = db_cipher->encrypt(env, 1650 db_cipher->data, &hdrp->iv[0], buffer + hdrsize, rec_len)) != 0) 1651 return (__env_panic(env, ret)); 1652 1653 __db_chksum(&hdr, buffer + hdrsize, rec_len, key, chksum); 1654 memcpy(buffer + SSZA(HDR, chksum), chksum, sum_len); 1655 1656 return (0); 1657} 1658 1659/* 1660 * __txn_preclose -- 1661 * Before we can close an environment, we need to check if we were in the 1662 * middle of taking care of restored transactions. If so, close the files 1663 * we opened. 1664 * 1665 * PUBLIC: int __txn_preclose __P((ENV *)); 1666 */ 1667int 1668__txn_preclose(env) 1669 ENV *env; 1670{ 1671 DB_TXNMGR *mgr; 1672 DB_TXNREGION *region; 1673 int do_closefiles, ret; 1674 1675 mgr = env->tx_handle; 1676 region = mgr->reginfo.primary; 1677 do_closefiles = 0; 1678 1679 TXN_SYSTEM_LOCK(env); 1680 if (region != NULL && 1681 region->stat.st_nrestores <= mgr->n_discards && 1682 mgr->n_discards != 0) 1683 do_closefiles = 1; 1684 TXN_SYSTEM_UNLOCK(env); 1685 1686 if (do_closefiles) { 1687 /* 1688 * Set the DBLOG_RECOVER flag while closing these files so they 1689 * do not create additional log records that will confuse future 1690 * recoveries. 
1691 */ 1692 F_SET(env->lg_handle, DBLOG_RECOVER); 1693 ret = __dbreg_close_files(env, 0); 1694 F_CLR(env->lg_handle, DBLOG_RECOVER); 1695 } else 1696 ret = 0; 1697 1698 return (ret); 1699} 1700 1701/* 1702 * __txn_reset -- 1703 * Reset the last txnid to its minimum value, and log the reset. 1704 * 1705 * PUBLIC: int __txn_reset __P((ENV *)); 1706 */ 1707int 1708__txn_reset(env) 1709 ENV *env; 1710{ 1711 DB_LSN scrap; 1712 DB_TXNREGION *region; 1713 1714 region = env->tx_handle->reginfo.primary; 1715 region->last_txnid = TXN_MINIMUM; 1716 1717 DB_ASSERT(env, LOGGING_ON(env)); 1718 return (__txn_recycle_log(env, 1719 NULL, &scrap, 0, TXN_MINIMUM, TXN_MAXIMUM)); 1720} 1721 1722/* 1723 * txn_set_txn_lsnp -- 1724 * Set the pointer to the begin_lsn field if that field is zero. 1725 * Set the pointer to the last_lsn field. 1726 */ 1727static void 1728__txn_set_txn_lsnp(txn, blsnp, llsnp) 1729 DB_TXN *txn; 1730 DB_LSN **blsnp, **llsnp; 1731{ 1732 TXN_DETAIL *td; 1733 1734 td = txn->td; 1735 *llsnp = &td->last_lsn; 1736 1737 while (txn->parent != NULL) 1738 txn = txn->parent; 1739 1740 td = txn->td; 1741 if (IS_ZERO_LSN(td->begin_lsn)) 1742 *blsnp = &td->begin_lsn; 1743} 1744