1/*-
2 * See the file LICENSE for redistribution information.
3 *
4 * Copyright (c) 1996,2008 Oracle.  All rights reserved.
5 */
6/*
7 * Copyright (c) 1996
8 *	The President and Fellows of Harvard University.  All rights reserved.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its contributors
19 *    may be used to endorse or promote products derived from this software
20 *    without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 * $Id: txn_rec.c,v 12.29 2008/03/13 20:48:48 mbrey Exp $
35 */
36
37#include "db_config.h"
38
39#include "db_int.h"
40#include "dbinc/db_page.h"
41#include "dbinc/lock.h"
42#include "dbinc/txn.h"
43#include "dbinc/db_am.h"
44
45/*
46 * PUBLIC: int __txn_regop_recover
47 * PUBLIC:    __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
48 *
49 * These records are only ever written for commits.  Normally, we redo any
50 * committed transaction, however if we are doing recovery to a timestamp, then
51 * we may treat transactions that committed after the timestamp as aborted.
52 */
53int
54__txn_regop_recover(env, dbtp, lsnp, op, info)
55	ENV *env;
56	DBT *dbtp;
57	DB_LSN *lsnp;
58	db_recops op;
59	void *info;
60{
61	__txn_regop_args *argp;
62	DB_TXNHEAD *headp;
63	int ret;
64	u_int32_t status;
65
66#ifdef DEBUG_RECOVER
67	(void)__txn_regop_print(env, dbtp, lsnp, op, info);
68#endif
69
70	if ((ret = __txn_regop_read(env, dbtp->data, &argp)) != 0)
71		return (ret);
72
73	headp = info;
74	/*
75	 * We are only ever called during FORWARD_ROLL or BACKWARD_ROLL.
76	 * We check for the former explicitly and the last two clauses
77	 * apply to the BACKWARD_ROLL case.
78	 */
79
80	if (op == DB_TXN_FORWARD_ROLL) {
81		/*
82		 * If this was a 2-phase-commit transaction, then it
83		 * might already have been removed from the list, and
84		 * that's OK.  Ignore the return code from remove.
85		 */
86		if ((ret = __db_txnlist_remove(env,
87		    info, argp->txnp->txnid)) != DB_NOTFOUND && ret != 0)
88			goto err;
89	} else if ((env->dbenv->tx_timestamp != 0 &&
90	    argp->timestamp > (int32_t)env->dbenv->tx_timestamp) ||
91	    (!IS_ZERO_LSN(headp->trunc_lsn) &&
92	    LOG_COMPARE(&headp->trunc_lsn, lsnp) < 0)) {
93		/*
94		 * We failed either the timestamp check or the trunc_lsn check,
95		 * so we treat this as an abort even if it was a commit record.
96		 */
97		if ((ret = __db_txnlist_update(env, info,
98		    argp->txnp->txnid, TXN_ABORT, NULL, &status, 1)) != 0)
99			goto err;
100		else if (status != TXN_IGNORE && status != TXN_OK)
101			goto err;
102	} else {
103		/* This is a normal commit; mark it appropriately. */
104		if ((ret = __db_txnlist_update(env,
105		    info, argp->txnp->txnid, argp->opcode, lsnp,
106		    &status, 0)) == DB_NOTFOUND) {
107			if ((ret = __db_txnlist_add(env,
108			    info, argp->txnp->txnid,
109			    argp->opcode == TXN_ABORT ?
110			    TXN_IGNORE : argp->opcode, lsnp)) != 0)
111				goto err;
112		} else if (ret != 0 ||
113		    (status != TXN_IGNORE && status != TXN_OK))
114			goto err;
115	}
116
117	if (ret == 0)
118		*lsnp = argp->prev_lsn;
119
120	if (0) {
121err:		__db_errx(env,
122		    "txnid %lx commit record found, already on commit list",
123		    (u_long)argp->txnp->txnid);
124		ret = EINVAL;
125	}
126	__os_free(env, argp);
127
128	return (ret);
129}
130
131/*
132 * PUBLIC: int __txn_xa_regop_recover
133 * PUBLIC:    __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
134 *
135 * These records are only ever written for prepares.
136 */
137int
138__txn_xa_regop_recover(env, dbtp, lsnp, op, info)
139	ENV *env;
140	DBT *dbtp;
141	DB_LSN *lsnp;
142	db_recops op;
143	void *info;
144{
145	__txn_xa_regop_args *argp;
146	DBT *lock_dbt;
147	DB_TXNHEAD *headp;
148	DB_LOCKTAB *lt;
149	u_int32_t status;
150	int ret;
151
152#ifdef DEBUG_RECOVER
153	(void)__txn_xa_regop_print(env, dbtp, lsnp, op, info);
154#endif
155
156	if ((ret = __txn_xa_regop_read(env, dbtp->data, &argp)) != 0)
157		return (ret);
158
159	if (argp->opcode != TXN_PREPARE && argp->opcode != TXN_ABORT) {
160		ret = EINVAL;
161		goto err;
162	}
163	headp = info;
164
165	/*
166	 * The return value here is either a DB_NOTFOUND or it is
167	 * the transaction status from the list.  It is not a normal
168	 * error return, so we must make sure that in each of the
169	 * cases below, we overwrite the ret value so we return
170	 * appropriately.
171	 */
172	ret = __db_txnlist_find(env, info, argp->txnp->txnid, &status);
173
174	/*
175	 * If we are rolling forward, then an aborted prepare
176	 * indicates that this may be the last record we'll see for
177	 * this transaction ID, so we should remove it from the list.
178	 */
179
180	if (op == DB_TXN_FORWARD_ROLL) {
181		if ((ret = __db_txnlist_remove(env,
182		    info, argp->txnp->txnid)) != 0)
183			goto txn_err;
184	} else if (op == DB_TXN_BACKWARD_ROLL && status == TXN_PREPARE) {
185		/*
186		 * On the backward pass, we have four possibilities:
187		 * 1. The transaction is already committed, no-op.
188		 * 2. The transaction is already aborted, no-op.
189		 * 3. The prepare failed and was aborted, mark as abort.
190		 * 4. The transaction is neither committed nor aborted.
191		 *	 Treat this like a commit and roll forward so that
192		 *	 the transaction can be resurrected in the region.
193		 * We handle cases 3 and 4 here; cases 1 and 2
194		 * are the final clause below.
195		 */
196		if (argp->opcode == TXN_ABORT) {
197			if ((ret = __db_txnlist_update(env,
198			     info, argp->txnp->txnid,
199			     TXN_ABORT, NULL, &status, 0)) != 0 &&
200			     status != TXN_PREPARE)
201				goto txn_err;
202			ret = 0;
203		}
204		/*
205		 * This is prepared, but not yet committed transaction.  We
206		 * need to add it to the transaction list, so that it gets
207		 * rolled forward. We also have to add it to the region's
208		 * internal state so it can be properly aborted or committed
209		 * after recovery (see txn_recover).
210		 */
211		else if ((ret = __db_txnlist_remove(env,
212		    info, argp->txnp->txnid)) != 0) {
213txn_err:		__db_errx(env,
214			    "transaction not in list %lx",
215			    (u_long)argp->txnp->txnid);
216			ret = DB_NOTFOUND;
217		} else if (IS_ZERO_LSN(headp->trunc_lsn) ||
218		    LOG_COMPARE(&headp->trunc_lsn, lsnp) >= 0) {
219			if ((ret = __db_txnlist_add(env,
220			   info, argp->txnp->txnid, TXN_COMMIT, lsnp)) == 0) {
221				/* Re-acquire the locks for this transaction. */
222				lock_dbt = &argp->locks;
223				if (LOCKING_ON(env)) {
224					lt = env->lk_handle;
225					if ((ret = __lock_getlocker(lt,
226						argp->txnp->txnid, 1,
227						&argp->txnp->locker)) != 0)
228						goto err;
229					if ((ret = __lock_get_list(env,
230					    argp->txnp->locker, 0,
231					    DB_LOCK_WRITE, lock_dbt)) != 0)
232						goto err;
233				}
234
235				ret = __txn_restore_txn(env, lsnp, argp);
236			}
237		}
238	} else
239		ret = 0;
240
241	if (ret == 0)
242		*lsnp = argp->prev_lsn;
243
244err:	__os_free(env, argp);
245
246	return (ret);
247}
248
249/*
250 * PUBLIC: int __txn_ckp_recover
251 * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
252 */
253int
254__txn_ckp_recover(env, dbtp, lsnp, op, info)
255	ENV *env;
256	DBT *dbtp;
257	DB_LSN *lsnp;
258	db_recops op;
259	void *info;
260{
261	__txn_ckp_args *argp;
262	int ret;
263
264#ifdef DEBUG_RECOVER
265	__txn_ckp_print(env, dbtp, lsnp, op, info);
266#endif
267	if ((ret = __txn_ckp_read(env, dbtp->data, &argp)) != 0)
268		return (ret);
269
270	if (op == DB_TXN_BACKWARD_ROLL)
271		__db_txnlist_ckp(env, info, lsnp);
272
273	*lsnp = argp->last_ckp;
274	__os_free(env, argp);
275	return (DB_TXN_CKP);
276}
277
278/*
279 * __txn_child_recover
280 *	Recover a commit record for a child transaction.
281 *
282 * PUBLIC: int __txn_child_recover
283 * PUBLIC:    __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
284 */
285int
286__txn_child_recover(env, dbtp, lsnp, op, info)
287	ENV *env;
288	DBT *dbtp;
289	DB_LSN *lsnp;
290	db_recops op;
291	void *info;
292{
293	__txn_child_args *argp;
294	u_int32_t c_stat, p_stat, tmpstat;
295	int ret, t_ret;
296
297#ifdef DEBUG_RECOVER
298	(void)__txn_child_print(env, dbtp, lsnp, op, info);
299#endif
300	if ((ret = __txn_child_read(env, dbtp->data, &argp)) != 0)
301		return (ret);
302
303	/*
304	 * This is a record in a PARENT's log trail indicating that a
305	 * child committed.  If we are aborting, return the childs last
306	 * record's LSN.  If we are in recovery, then if the
307	 * parent is committing, we set ourselves up to commit, else
308	 * we do nothing.
309	 */
310	if (op == DB_TXN_ABORT) {
311		*lsnp = argp->c_lsn;
312		ret = __db_txnlist_lsnadd(env, info, &argp->prev_lsn);
313		goto out;
314	} else if (op == DB_TXN_BACKWARD_ROLL) {
315		/* Child might exist -- look for it. */
316		ret = __db_txnlist_find(env, info, argp->child, &c_stat);
317		t_ret =
318		    __db_txnlist_find(env, info, argp->txnp->txnid, &p_stat);
319		if (ret != 0 && ret != DB_NOTFOUND)
320			goto out;
321		if (t_ret != 0 && t_ret != DB_NOTFOUND) {
322			ret = t_ret;
323			goto out;
324		}
325		/*
326		 * If the parent is in state COMMIT or IGNORE, then we apply
327		 * that to the child, else we need to abort the child.
328		 */
329
330		if (ret == DB_NOTFOUND  ||
331		    c_stat == TXN_OK || c_stat == TXN_COMMIT) {
332			if (t_ret == DB_NOTFOUND ||
333			     (p_stat != TXN_COMMIT  && p_stat != TXN_IGNORE))
334				c_stat = TXN_ABORT;
335			else
336				c_stat = p_stat;
337
338			if (ret == DB_NOTFOUND)
339				ret = __db_txnlist_add(env,
340				     info, argp->child, c_stat, NULL);
341			else
342				ret = __db_txnlist_update(env, info,
343				     argp->child, c_stat, NULL, &tmpstat, 0);
344		} else if (c_stat == TXN_EXPECTED) {
345			/*
346			 * The open after this create succeeded.  If the
347			 * parent succeeded, we don't want to redo; if the
348			 * parent aborted, we do want to undo.
349			 */
350			switch (p_stat) {
351			case TXN_COMMIT:
352			case TXN_IGNORE:
353				c_stat = TXN_IGNORE;
354				break;
355			default:
356				c_stat = TXN_ABORT;
357			}
358			ret = __db_txnlist_update(env,
359			    info, argp->child, c_stat, NULL, &tmpstat, 0);
360		} else if (c_stat == TXN_UNEXPECTED) {
361			/*
362			 * The open after this create failed.  If the parent
363			 * is rolling forward, we need to roll forward.  If
364			 * the parent failed, then we do not want to abort
365			 * (because the file may not be the one in which we
366			 * are interested).
367			 */
368			ret = __db_txnlist_update(env, info, argp->child,
369			    p_stat == TXN_COMMIT ? TXN_COMMIT : TXN_IGNORE,
370			    NULL, &tmpstat, 0);
371		}
372	} else if (op == DB_TXN_OPENFILES) {
373		/*
374		 * If we have a partial subtransaction, then the whole
375		 * transaction should be ignored.
376		 */
377		if ((ret = __db_txnlist_find(env,
378		    info, argp->child, &c_stat)) == DB_NOTFOUND)
379			ret = __db_txnlist_update(env, info,
380			     argp->txnp->txnid, TXN_IGNORE,
381			     NULL, &p_stat, 1);
382	} else if (DB_REDO(op)) {
383		/* Forward Roll */
384		if ((ret =
385		    __db_txnlist_remove(env, info, argp->child)) != 0)
386			__db_errx(env,
387			    "Transaction not in list %x", argp->child);
388	}
389
390	if (ret == 0)
391		*lsnp = argp->prev_lsn;
392
393out:	__os_free(env, argp);
394
395	return (ret);
396}
397
398/*
399 * __txn_restore_txn --
400 *	Using only during XA recovery.  If we find any transactions that are
401 * prepared, but not yet committed, then we need to restore the transaction's
402 * state into the shared region, because the TM is going to issue an abort
403 * or commit and we need to respond correctly.
404 *
405 * lsnp is the LSN of the returned LSN
406 * argp is the prepare record (in an appropriate structure)
407 *
408 * PUBLIC: int __txn_restore_txn __P((ENV *, DB_LSN *, __txn_xa_regop_args *));
409 */
410int
411__txn_restore_txn(env, lsnp, argp)
412	ENV *env;
413	DB_LSN *lsnp;
414	__txn_xa_regop_args *argp;
415{
416	DB_TXNMGR *mgr;
417	DB_TXNREGION *region;
418	TXN_DETAIL *td;
419	int ret;
420
421	if (argp->xid.size == 0)
422		return (0);
423
424	mgr = env->tx_handle;
425	region = mgr->reginfo.primary;
426	TXN_SYSTEM_LOCK(env);
427
428	/* Allocate a new transaction detail structure. */
429	if ((ret = __env_alloc(&mgr->reginfo, sizeof(TXN_DETAIL), &td)) != 0) {
430		TXN_SYSTEM_UNLOCK(env);
431		return (ret);
432	}
433
434	/* Place transaction on active transaction list. */
435	SH_TAILQ_INSERT_HEAD(&region->active_txn, td, links, __txn_detail);
436
437	td->txnid = argp->txnp->txnid;
438	__os_id(env->dbenv, &td->pid, &td->tid);
439	td->last_lsn = *lsnp;
440	td->begin_lsn = argp->begin_lsn;
441	td->parent = INVALID_ROFF;
442	td->name = INVALID_ROFF;
443	SH_TAILQ_INIT(&td->kids);
444	MAX_LSN(td->read_lsn);
445	MAX_LSN(td->visible_lsn);
446	td->mvcc_ref = 0;
447	td->mvcc_mtx = MUTEX_INVALID;
448	td->status = TXN_PREPARED;
449	td->flags = TXN_DTL_RESTORED;
450	td->xa_status = TXN_XA_PREPARED;
451	memcpy(td->xid, argp->xid.data, argp->xid.size);
452	td->bqual = argp->bqual;
453	td->gtrid = argp->gtrid;
454	td->format = argp->formatID;
455	td->nlog_dbs = 0;
456	td->nlog_slots = TXN_NSLOTS;
457	td->log_dbs = R_OFFSET(&mgr->reginfo, td->slots);
458
459	region->stat.st_nrestores++;
460#ifdef HAVE_STATISTICS
461	region->stat.st_nactive++;
462	if (region->stat.st_nactive > region->stat.st_maxnactive)
463		region->stat.st_maxnactive = region->stat.st_nactive;
464#endif
465	TXN_SYSTEM_UNLOCK(env);
466	return (0);
467}
468
469/*
470 * __txn_recycle_recover --
471 *	Recovery function for recycle.
472 *
473 * PUBLIC: int __txn_recycle_recover
474 * PUBLIC:   __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
475 */
476int
477__txn_recycle_recover(env, dbtp, lsnp, op, info)
478	ENV *env;
479	DBT *dbtp;
480	DB_LSN *lsnp;
481	db_recops op;
482	void *info;
483{
484	__txn_recycle_args *argp;
485	int ret;
486
487#ifdef DEBUG_RECOVER
488	(void)__txn_child_print(env, dbtp, lsnp, op, info);
489#endif
490	if ((ret = __txn_recycle_read(env, dbtp->data, &argp)) != 0)
491		return (ret);
492
493	COMPQUIET(lsnp, NULL);
494
495	if ((ret = __db_txnlist_gen(env, info,
496	    DB_UNDO(op) ? -1 : 1, argp->min, argp->max)) != 0)
497		return (ret);
498
499	__os_free(env, argp);
500
501	return (0);
502}
503
504/*
505 * PUBLIC: int __txn_regop_42_recover
506 * PUBLIC:    __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
507 *
508 * These records are only ever written for commits.  Normally, we redo any
509 * committed transaction, however if we are doing recovery to a timestamp, then
510 * we may treat transactions that committed after the timestamp as aborted.
511 */
512int
513__txn_regop_42_recover(env, dbtp, lsnp, op, info)
514	ENV *env;
515	DBT *dbtp;
516	DB_LSN *lsnp;
517	db_recops op;
518	void *info;
519{
520	__txn_regop_42_args *argp;
521	DB_TXNHEAD *headp;
522	u_int32_t status;
523	int ret;
524
525#ifdef DEBUG_RECOVER
526	(void)__txn_regop_42_print(env, dbtp, lsnp, op, info);
527#endif
528
529	if ((ret = __txn_regop_42_read(env, dbtp->data, &argp)) != 0)
530		return (ret);
531
532	headp = info;
533	/*
534	 * We are only ever called during FORWARD_ROLL or BACKWARD_ROLL.
535	 * We check for the former explicitly and the last two clauses
536	 * apply to the BACKWARD_ROLL case.
537	 */
538
539	if (op == DB_TXN_FORWARD_ROLL) {
540		/*
541		 * If this was a 2-phase-commit transaction, then it
542		 * might already have been removed from the list, and
543		 * that's OK.  Ignore the return code from remove.
544		 */
545		if ((ret = __db_txnlist_remove(env,
546		    info, argp->txnp->txnid)) != DB_NOTFOUND && ret != 0)
547			goto err;
548	} else if ((env->dbenv->tx_timestamp != 0 &&
549	    argp->timestamp > (int32_t)env->dbenv->tx_timestamp) ||
550	    (!IS_ZERO_LSN(headp->trunc_lsn) &&
551	    LOG_COMPARE(&headp->trunc_lsn, lsnp) < 0)) {
552		/*
553		 * We failed either the timestamp check or the trunc_lsn check,
554		 * so we treat this as an abort even if it was a commit record.
555		 */
556		if ((ret = __db_txnlist_update(env, info,
557		    argp->txnp->txnid, TXN_ABORT, NULL, &status, 1)) != 0)
558			goto err;
559		else if (status != TXN_IGNORE && status != TXN_OK)
560			goto err;
561	} else {
562		/* This is a normal commit; mark it appropriately. */
563		if ((ret = __db_txnlist_update(env,
564		    info, argp->txnp->txnid, argp->opcode, lsnp,
565		    &status, 0)) == DB_NOTFOUND) {
566			if ((ret = __db_txnlist_add(env,
567			    info, argp->txnp->txnid,
568			    argp->opcode == TXN_ABORT ?
569			    TXN_IGNORE : argp->opcode, lsnp)) != 0)
570				goto err;
571		} else if (ret != 0 ||
572		    (status != TXN_IGNORE && status != TXN_OK))
573			goto err;
574	}
575
576	if (ret == 0)
577		*lsnp = argp->prev_lsn;
578
579	if (0) {
580err:		__db_errx(env,
581		    "txnid %lx commit record found, already on commit list",
582		    (u_long)argp->txnp->txnid);
583		ret = EINVAL;
584	}
585	__os_free(env, argp);
586
587	return (ret);
588}
589
590/*
591 * PUBLIC: int __txn_ckp_42_recover
592 * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
593 */
594int
595__txn_ckp_42_recover(env, dbtp, lsnp, op, info)
596	ENV *env;
597	DBT *dbtp;
598	DB_LSN *lsnp;
599	db_recops op;
600	void *info;
601{
602	__txn_ckp_42_args *argp;
603	int ret;
604
605#ifdef DEBUG_RECOVER
606	__txn_ckp_42_print(env, dbtp, lsnp, op, info);
607#endif
608	if ((ret = __txn_ckp_42_read(env, dbtp->data, &argp)) != 0)
609		return (ret);
610
611	if (op == DB_TXN_BACKWARD_ROLL)
612		__db_txnlist_ckp(env, info, lsnp);
613
614	*lsnp = argp->last_ckp;
615	__os_free(env, argp);
616	return (DB_TXN_CKP);
617}
618