1/*-
2 * See the file LICENSE for redistribution information.
3 *
4 * Copyright (c) 2001,2008 Oracle.  All rights reserved.
5 *
6 * $Id: txn_recover.c,v 12.37 2008/04/19 15:47:42 mjc Exp $
7 */
8
9#include "db_config.h"
10
11#include "db_int.h"
12#include "dbinc/txn.h"
13#include "dbinc/db_page.h"
14#include "dbinc/db_dispatch.h"
15#include "dbinc/log.h"
16#include "dbinc_auto/db_auto.h"
17#include "dbinc_auto/crdel_auto.h"
18#include "dbinc_auto/db_ext.h"
19
20/*
21 * __txn_map_gid
22 *	Return the txn that corresponds to this global ID.
23 *
24 * PUBLIC: int __txn_map_gid __P((ENV *,
25 * PUBLIC:     u_int8_t *, TXN_DETAIL **, roff_t *));
26 */
27int
28__txn_map_gid(env, gid, tdp, offp)
29	ENV *env;
30	u_int8_t *gid;
31	TXN_DETAIL **tdp;
32	roff_t *offp;
33{
34	DB_TXNMGR *mgr;
35	DB_TXNREGION *region;
36
37	mgr = env->tx_handle;
38	region = mgr->reginfo.primary;
39
40	/*
41	 * Search the internal active transaction table to find the
42	 * matching xid.  If this is a performance hit, then we
43	 * can create a hash table, but I doubt it's worth it.
44	 */
45	TXN_SYSTEM_LOCK(env);
46	SH_TAILQ_FOREACH(*tdp, &region->active_txn, links, __txn_detail)
47		if (memcmp(gid, (*tdp)->xid, sizeof((*tdp)->xid)) == 0)
48			break;
49	TXN_SYSTEM_UNLOCK(env);
50
51	if (*tdp == NULL)
52		return (EINVAL);
53
54	*offp = R_OFFSET(&mgr->reginfo, *tdp);
55	return (0);
56}
57
58/*
59 * __txn_recover_pp --
60 *	ENV->txn_recover pre/post processing.
61 *
62 * PUBLIC: int __txn_recover_pp
63 * PUBLIC:     __P((DB_ENV *, DB_PREPLIST *, long, long *, u_int32_t));
64 */
65int
66__txn_recover_pp(dbenv, preplist, count, retp, flags)
67	DB_ENV *dbenv;
68	DB_PREPLIST *preplist;
69	long count, *retp;
70	u_int32_t flags;
71{
72	DB_THREAD_INFO *ip;
73	ENV *env;
74	int ret;
75
76	env = dbenv->env;
77
78	ENV_REQUIRES_CONFIG(
79	    env, env->tx_handle, "txn_recover", DB_INIT_TXN);
80
81	if (F_ISSET((DB_TXNREGION *)env->tx_handle->reginfo.primary,
82	    TXN_IN_RECOVERY)) {
83		__db_errx(env, "operation not permitted while in recovery");
84		return (EINVAL);
85	}
86
87	if (flags != DB_FIRST && flags != DB_NEXT)
88		return (__db_ferr(env, "DB_ENV->txn_recover", 0));
89
90	ENV_ENTER(env, ip);
91	REPLICATION_WRAP(env,
92	    (__txn_recover(env, preplist, count, retp, flags)), 0, ret);
93	ENV_LEAVE(env, ip);
94	return (ret);
95}
96
97/*
98 * __txn_recover --
99 *	ENV->txn_recover.
100 *
101 * PUBLIC: int __txn_recover __P((ENV *,
102 * PUBLIC:         DB_PREPLIST *, long, long *, u_int32_t));
103 */
104int
105__txn_recover(env, preplist, count, retp, flags)
106	ENV *env;
107	DB_PREPLIST *preplist;
108	long count, *retp;
109	u_int32_t flags;
110{
111	/*
112	 * Public API to retrieve the list of prepared, but not yet committed
113	 * transactions.  See __txn_get_prepared for details.  This function
114	 * and __db_xa_recover both wrap that one.
115	 */
116	return (__txn_get_prepared(env, NULL, preplist, count, retp, flags));
117}
118
119/*
120 * __txn_get_prepared --
121 *	Returns a list of prepared (and for XA, heuristically completed)
122 *	transactions (less than or equal to the count parameter).  One of
123 *	xids or txns must be set to point to an array of the appropriate type.
124 *	The count parameter indicates the number of entries in the xids and/or
125 *	txns array. The retp parameter will be set to indicate the number of
126 *	entries	returned in the xids/txns array.  Flags indicates the operation,
127 *	one of DB_FIRST or DB_NEXT.
128 *
129 * PUBLIC: int __txn_get_prepared __P((ENV *,
130 * PUBLIC:     XID *, DB_PREPLIST *, long, long *, u_int32_t));
131 */
132int
133__txn_get_prepared(env, xids, txns, count, retp, flags)
134	ENV *env;
135	XID *xids;
136	DB_PREPLIST *txns;
137	long count;		/* This is long for XA compatibility. */
138	long *retp;
139	u_int32_t flags;
140{
141	DB_LSN min;
142	DB_PREPLIST *prepp;
143	DB_THREAD_INFO *ip;
144	DB_TXNMGR *mgr;
145	DB_TXNREGION *region;
146	TXN_DETAIL *td;
147	XID *xidp;
148	long i;
149	int restored, ret;
150
151	*retp = 0;
152
153	MAX_LSN(min);
154	prepp = txns;
155	xidp = xids;
156	restored = ret = 0;
157
158	/*
159	 * If we are starting a scan, then we traverse the active transaction
160	 * list once making sure that all transactions are marked as not having
161	 * been collected.  Then on each pass, we mark the ones we collected
162	 * so that if we cannot collect them all at once, we can finish up
163	 * next time with a continue.
164	 */
165
166	mgr = env->tx_handle;
167	region = mgr->reginfo.primary;
168
169	/*
170	 * During this pass we need to figure out if we are going to need
171	 * to open files.  We need to open files if we've never collected
172	 * before (in which case, none of the COLLECTED bits will be set)
173	 * and the ones that we are collecting are restored (if they aren't
174	 * restored, then we never crashed; just the main server did).
175	 */
176	TXN_SYSTEM_LOCK(env);
177
178	/* Now begin collecting active transactions. */
179	for (td = SH_TAILQ_FIRST(&region->active_txn, __txn_detail);
180	    td != NULL && *retp < count;
181	    td = SH_TAILQ_NEXT(td, links, __txn_detail)) {
182		if (td->status != TXN_PREPARED ||
183		    (flags != DB_FIRST && F_ISSET(td, TXN_DTL_COLLECTED)))
184			continue;
185
186		if (F_ISSET(td, TXN_DTL_RESTORED))
187			restored = 1;
188
189		if (xids != NULL) {
190			xidp->formatID = td->format;
191			/*
192			 * XID structure uses longs; we use u_int32_t's as we
193			 * log them to disk.  Cast them to make the conversion
194			 * explicit.
195			 */
196			xidp->gtrid_length = (long)td->gtrid;
197			xidp->bqual_length = (long)td->bqual;
198			memcpy(xidp->data, td->xid, sizeof(td->xid));
199			xidp++;
200		}
201
202		if (txns != NULL) {
203			if ((ret = __os_calloc(env,
204			    1, sizeof(DB_TXN), &prepp->txn)) != 0) {
205				TXN_SYSTEM_UNLOCK(env);
206				goto err;
207			}
208			if ((ret = __txn_continue(env, prepp->txn, td)) != 0)
209				goto err;
210			F_SET(prepp->txn, TXN_MALLOC);
211			if (F_ISSET(env->dbenv, DB_ENV_TXN_NOSYNC))
212				F_SET(prepp->txn, TXN_NOSYNC);
213			else if (F_ISSET(env->dbenv, DB_ENV_TXN_WRITE_NOSYNC))
214				F_SET(prepp->txn, TXN_WRITE_NOSYNC);
215			else
216				F_SET(prepp->txn, TXN_SYNC);
217			memcpy(prepp->gid, td->xid, sizeof(td->xid));
218			prepp++;
219		}
220
221		if (!IS_ZERO_LSN(td->begin_lsn) &&
222		    LOG_COMPARE(&td->begin_lsn, &min) < 0)
223			min = td->begin_lsn;
224
225		(*retp)++;
226		F_SET(td, TXN_DTL_COLLECTED);
227	}
228	if (flags == DB_FIRST)
229		for (; td != NULL; td = SH_TAILQ_NEXT(td, links, __txn_detail))
230			F_CLR(td, TXN_DTL_COLLECTED);
231	TXN_SYSTEM_UNLOCK(env);
232
233	/*
234	 * Now link all the transactions into the transaction manager's list.
235	 */
236	if (txns != NULL && *retp != 0) {
237		MUTEX_LOCK(env, mgr->mutex);
238		for (i = 0; i < *retp; i++)
239			TAILQ_INSERT_TAIL(&mgr->txn_chain, txns[i].txn, links);
240		MUTEX_UNLOCK(env, mgr->mutex);
241
242		/*
243		 * If we are restoring, update our count of outstanding
244		 * transactions.
245		 */
246		if (REP_ON(env)) {
247			REP_SYSTEM_LOCK(env);
248			env->rep_handle->region->op_cnt += (u_long)*retp;
249			REP_SYSTEM_UNLOCK(env);
250		}
251
252	}
253	/*
254	 * If recovery already opened the files for us, don't
255	 * do it here.
256	 */
257	if (restored != 0 && flags == DB_FIRST &&
258	    !F_ISSET(env->lg_handle, DBLOG_OPENFILES)) {
259		ENV_GET_THREAD_INFO(env, ip);
260		ret = __txn_openfiles(env, ip, &min, 0);
261	}
262
263	if (0) {
264err:		TXN_SYSTEM_UNLOCK(env);
265	}
266	return (ret);
267}
268
269/*
270 * __txn_openfiles --
271 *	Call env_openfiles.
272 *
273 * PUBLIC: int __txn_openfiles __P((ENV *, DB_THREAD_INFO *, DB_LSN *, int));
274 */
275int
276__txn_openfiles(env, ip, min, force)
277	ENV *env;
278	DB_THREAD_INFO *ip;
279	DB_LSN *min;
280	int force;
281{
282	DBT data;
283	DB_LOGC *logc;
284	DB_LSN open_lsn;
285	DB_TXNHEAD *txninfo;
286	__txn_ckp_args *ckp_args;
287	int ret, t_ret;
288
289	/*
290	 * Figure out the last checkpoint before the smallest
291	 * start_lsn in the region.
292	 */
293	logc = NULL;
294	if ((ret = __log_cursor(env, &logc)) != 0)
295		goto err;
296
297	memset(&data, 0, sizeof(data));
298	if ((ret = __txn_getckp(env, &open_lsn)) == 0)
299		while (!IS_ZERO_LSN(open_lsn) && (ret =
300		    __logc_get(logc, &open_lsn, &data, DB_SET)) == 0 &&
301		    (force ||
302		    (min != NULL && LOG_COMPARE(min, &open_lsn) < 0))) {
303			/* Format the log record. */
304			if ((ret = __txn_ckp_read(
305			    env, data.data, &ckp_args)) != 0) {
306				__db_errx(env,
307			    "Invalid checkpoint record at [%lu][%lu]",
308				    (u_long)open_lsn.file,
309				    (u_long)open_lsn.offset);
310				goto err;
311			}
312			/*
313			 * If force is set, then we're forcing ourselves
314			 * to go back far enough to open files.
315			 * Use ckp_lsn and then break out of the loop.
316			 */
317			open_lsn = force ? ckp_args->ckp_lsn :
318			    ckp_args->last_ckp;
319			__os_free(env, ckp_args);
320			if (force) {
321				if ((ret = __logc_get(logc, &open_lsn,
322				    &data, DB_SET)) != 0)
323					goto err;
324				break;
325			}
326		}
327
328	/*
329	 * There are several ways by which we may have gotten here.
330	 * - We got a DB_NOTFOUND -- we need to read the first
331	 *	log record.
332	 * - We found a checkpoint before min.  We're done.
333	 * - We found a checkpoint after min who's last_ckp is 0.  We
334	 *	need to start at the beginning of the log.
335	 * - We are forcing an openfiles and we have our ckp_lsn.
336	 */
337	if ((ret == DB_NOTFOUND || IS_ZERO_LSN(open_lsn)) && (ret =
338	    __logc_get(logc, &open_lsn, &data, DB_FIRST)) != 0) {
339		__db_errx(env, "No log records");
340		goto err;
341	}
342
343	if ((ret = __db_txnlist_init(env, ip, 0, 0, NULL, &txninfo)) != 0)
344		goto err;
345	ret = __env_openfiles(
346	    env, logc, txninfo, &data, &open_lsn, NULL, (double)0, 0);
347	if (txninfo != NULL)
348		__db_txnlist_end(env, txninfo);
349
350err:
351	if (logc != NULL && (t_ret = __logc_close(logc)) != 0 && ret == 0)
352		ret = t_ret;
353	return (ret);
354}
355