1/*-
2 * See the file LICENSE for redistribution information.
3 *
4 * Copyright (c) 1996,2008 Oracle.  All rights reserved.
5 *
6 * $Id: env_recover.c,v 12.60 2008/03/12 20:52:53 mbrey Exp $
7 */
8
9#include "db_config.h"
10
11#include "db_int.h"
12#include "dbinc/db_page.h"
13#include "dbinc/fop.h"
14#include "dbinc/btree.h"
15#include "dbinc/hash.h"
16#include "dbinc/log.h"
17#include "dbinc/mp.h"
18#include "dbinc/qam.h"
19#include "dbinc/txn.h"
20
21#ifndef lint
22static const char copyright[] =
23    "Copyright (c) 1996,2008 Oracle.  All rights reserved.\n";
24#endif
25
26static int	__db_log_corrupt __P((ENV *, DB_LSN *));
27static int	__env_init_rec_42 __P((ENV *));
28static int	__env_init_rec_43 __P((ENV *));
29static int	__env_init_rec_46 __P((ENV *));
30static int	__env_init_rec_47 __P((ENV *));
31static int	__log_earliest __P((ENV *, DB_LOGC *, int32_t *, DB_LSN *));
32
33#ifndef HAVE_BREW
34static double	__lsn_diff __P((DB_LSN *, DB_LSN *, DB_LSN *, u_int32_t, int));
35#endif
36
37/*
38 * __db_apprec --
39 *	Perform recovery.  If max_lsn is non-NULL, then we are trying
40 * to synchronize this system up with another system that has a max
41 * LSN of max_lsn, so we need to roll back sufficiently far for that
42 * to work.  See __log_backup for details.
43 *
44 * PUBLIC: int __db_apprec __P((ENV *,
45 * PUBLIC:     DB_THREAD_INFO *, DB_LSN *, DB_LSN *, int, u_int32_t));
46 */
47int
48__db_apprec(env, ip, max_lsn, trunclsn, update, flags)
49	ENV *env;
50	DB_THREAD_INFO *ip;
51	DB_LSN *max_lsn, *trunclsn;
52	int update;
53	u_int32_t flags;
54{
55	DBT data;
56	DB_ENV *dbenv;
57	DB_LOGC *logc;
58	DB_LSN ckp_lsn, first_lsn, last_lsn, lowlsn, lsn, stop_lsn, tlsn;
59	DB_TXNHEAD *txninfo;
60	DB_TXNREGION *region;
61	REGENV *renv;
62	REGINFO *infop;
63	__txn_ckp_args *ckp_args;
64	time_t now, tlow;
65	double nfiles;
66	u_int32_t hi_txn, log_size, txnid;
67	int32_t low;
68	int have_rec, progress, ret, t_ret;
69	char *p, *pass;
70	char t1[CTIME_BUFLEN], t2[CTIME_BUFLEN], time_buf[CTIME_BUFLEN];
71
72	COMPQUIET(nfiles, (double)0.001);
73
74	dbenv = env->dbenv;
75	logc = NULL;
76	ckp_args = NULL;
77	hi_txn = TXN_MAXIMUM;
78	txninfo = NULL;
79	pass = "initial";
80	ZERO_LSN(lsn);
81
82	/*
83	 * XXX
84	 * Get the log size.  No locking required because we're single-threaded
85	 * during recovery.
86	 */
87	log_size = ((LOG *)env->lg_handle->reginfo.primary)->log_size;
88
89	/*
90	 * If we need to, update the env handle timestamp.
91	 */
92	if (update && REP_ON(env)) {
93		infop = env->reginfo;
94		renv = infop->primary;
95		(void)time(&renv->rep_timestamp);
96	}
97
98	/* Set in-recovery flags. */
99	F_SET(env->lg_handle, DBLOG_RECOVER);
100	region = env->tx_handle->reginfo.primary;
101	F_SET(region, TXN_IN_RECOVERY);
102
103	/* Allocate a cursor for the log. */
104	if ((ret = __log_cursor(env, &logc)) != 0)
105		goto err;
106
107	/*
108	 * If the user is specifying recovery to a particular point in time
109	 * or to a particular LSN, find the point to start recovery from.
110	 */
111	ZERO_LSN(lowlsn);
112	if (max_lsn != NULL) {
113		if ((ret = __log_backup(env, logc, max_lsn, &lowlsn,
114		    CKPLSN_CMP)) != 0)
115			goto err;
116	} else if (dbenv->tx_timestamp != 0) {
117		if ((ret = __log_earliest(env, logc, &low, &lowlsn)) != 0)
118			goto err;
119		if ((int32_t)dbenv->tx_timestamp < low) {
120			t1[sizeof(t1) - 1] = '\0';
121			(void)strncpy(t1, __os_ctime(
122			    &dbenv->tx_timestamp, time_buf), sizeof(t1) - 1);
123			if ((p = strchr(t1, '\n')) != NULL)
124				*p = '\0';
125
126			t2[sizeof(t2) - 1] = '\0';
127			tlow = (time_t)low;
128			(void)strncpy(t2, __os_ctime(
129			    &tlow, time_buf), sizeof(t2) - 1);
130			if ((p = strchr(t2, '\n')) != NULL)
131				*p = '\0';
132
133			__db_errx(env,
134		    "Invalid recovery timestamp %s; earliest time is %s",
135			    t1, t2);
136			ret = EINVAL;
137			goto err;
138		}
139	}
140
141	/*
142	 * Recovery is done in three passes:
143	 * Pass #0:
144	 *	We need to find the position from which we will open files.
145	 *	We need to open files beginning with the earlier of the
146	 *	most recent checkpoint LSN and a checkpoint LSN before the
147	 *	recovery timestamp, if specified.  We need to be before the
148	 *	most recent checkpoint LSN because we are going to collect
149	 *	information about which transactions were begun before we
150	 *	start rolling forward.  Those that were should never be undone
151	 *	because queue cannot use LSNs to determine what operations can
152	 *	safely be aborted and it cannot rollback operations in
153	 *	transactions for which there may be records not processed
154	 *	during recovery.  We need to consider earlier points in time
155	 *	in case we are recovering to a particular timestamp.
156	 *
157	 * Pass #1:
158	 *	Read forward through the log from the position found in pass 0
159	 *	opening and closing files, and recording transactions for which
160	 *	we've seen their first record (the transaction's prev_lsn is
161	 *	0,0).  At the end of this pass, we know all transactions for
162	 *	which we've seen begins and we have the "current" set of files
163	 *	open.
164	 *
165	 * Pass #2:
166	 *	Read backward through the log undoing any uncompleted TXNs.
167	 *	There are four cases:
168	 *	    1.  If doing catastrophic recovery, we read to the
169	 *		beginning of the log
170	 *	    2.  If we are doing normal reovery, then we have to roll
171	 *		back to the most recent checkpoint LSN.
172	 *	    3.  If we are recovering to a point in time, then we have
173	 *		to roll back to the checkpoint whose ckp_lsn is earlier
174	 *		than the specified time.  __log_earliest will figure
175	 *		this out for us.
176	 *	    4.	If we are recovering back to a particular LSN, then
177	 *		we have to roll back to the checkpoint whose ckp_lsn
178	 *		is earlier than the max_lsn.  __log_backup will figure
179	 *		that out for us.
180	 *	In case 2, "uncompleted TXNs" include all those who committed
181	 *	after the user's specified timestamp.
182	 *
183	 * Pass #3:
184	 *	Read forward through the log from the LSN found in pass #2,
185	 *	redoing any committed TXNs (which committed after any user-
186	 *	specified rollback point).  During this pass, checkpoint
187	 *	file information is ignored, and file openings and closings
188	 *	are redone.
189	 *
190	 * ckp_lsn   -- lsn of the last checkpoint or the first in the log.
191	 * first_lsn -- the lsn where the forward passes begin.
192	 * last_lsn  -- the last lsn in the log, used for feedback
193	 * lowlsn    -- the lsn we are rolling back to, if we are recovering
194	 *		to a point in time.
195	 * lsn       -- temporary use lsn.
196	 * stop_lsn  -- the point at which forward roll should stop
197	 */
198
199	/*
200	 * Find out the last lsn, so that we can estimate how far along we
201	 * are in recovery.  This will help us determine how much log there
202	 * is between the first LSN that we're going to be working with and
203	 * the last one.  We assume that each of the three phases takes the
204	 * same amount of time (a false assumption) and then use the %-age
205	 * of the amount of log traversed to figure out how much of the
206	 * pass we've accomplished.
207	 *
208	 * If we can't find any log records, we're kind of done.
209	 */
210#ifdef UMRW
211	ZERO_LSN(last_lsn);
212#endif
213	memset(&data, 0, sizeof(data));
214	if ((ret = __logc_get(logc, &last_lsn, &data, DB_LAST)) != 0) {
215		if (ret == DB_NOTFOUND)
216			ret = 0;
217		else
218			__db_errx(env, "Last log record not found");
219		goto err;
220	}
221
222	do {
223		/* txnid is after rectype, which is a u_int32. */
224		LOGCOPY_32(env, &txnid,
225		    (u_int8_t *)data.data + sizeof(u_int32_t));
226
227		if (txnid != 0)
228			break;
229	} while ((ret = __logc_get(logc, &lsn, &data, DB_PREV)) == 0);
230
231	/*
232	 * There are no transactions, so there is nothing to do unless
233	 * we're recovering to an LSN.  If we are, we need to proceed since
234	 * we'll still need to do a vtruncate based on information we haven't
235	 * yet collected.
236	 */
237	if (ret == DB_NOTFOUND)
238		ret = 0;
239	else if (ret != 0)
240		goto err;
241
242	hi_txn = txnid;
243
244	/*
245	 * Pass #0
246	 * Find the LSN from which we begin OPENFILES.
247	 *
248	 * If this is a catastrophic recovery, or if no checkpoint exists
249	 * in the log, the LSN is the first LSN in the log.
250	 *
251	 * Otherwise, it is the minimum of (1) the LSN in the last checkpoint
252	 * and (2) the LSN in the checkpoint before any specified recovery
253	 * timestamp or max_lsn.
254	 */
255	/*
256	 * Get the first LSN in the log; it's an initial default
257	 * even if this is not a catastrophic recovery.
258	 */
259	if ((ret = __logc_get(logc, &ckp_lsn, &data, DB_FIRST)) != 0) {
260		if (ret == DB_NOTFOUND)
261			ret = 0;
262		else
263			__db_errx(env, "First log record not found");
264		goto err;
265	}
266	first_lsn = ckp_lsn;
267	have_rec = 1;
268
269	if (!LF_ISSET(DB_RECOVER_FATAL)) {
270		if ((ret = __txn_getckp(env, &ckp_lsn)) == 0 &&
271		    (ret = __logc_get(logc, &ckp_lsn, &data, DB_SET)) == 0) {
272			/* We have a recent checkpoint.  This is LSN (1). */
273			if ((ret = __txn_ckp_read(env,
274			    data.data, &ckp_args)) != 0) {
275				__db_errx(env,
276			    "Invalid checkpoint record at [%ld][%ld]",
277				    (u_long)ckp_lsn.file,
278				    (u_long)ckp_lsn.offset);
279				goto err;
280			}
281			first_lsn = ckp_args->ckp_lsn;
282			__os_free(env, ckp_args);
283			have_rec = 0;
284		}
285
286		/*
287		 * If LSN (2) exists, use it if it's before LSN (1).
288		 * (If LSN (1) doesn't exist, first_lsn is the
289		 * beginning of the log, so will "win" this check.)
290		 *
291		 * XXX
292		 * In the recovery-to-a-timestamp case, lowlsn is chosen by
293		 * __log_earliest, and is the checkpoint LSN of the
294		 * *earliest* checkpoint in the unreclaimed log.  I
295		 * (krinsky) believe that we could optimize this by looking
296		 * instead for the LSN of the *latest* checkpoint before
297		 * the timestamp of interest, but I'm not sure that this
298		 * is worth doing right now.  (We have to look for lowlsn
299		 * and low anyway, to make sure the requested timestamp is
300		 * somewhere in the logs we have, and all that's required
301		 * is that we pick *some* checkpoint after the beginning of
302		 * the logs and before the timestamp.
303		 */
304		if ((dbenv->tx_timestamp != 0 || max_lsn != NULL) &&
305		    LOG_COMPARE(&lowlsn, &first_lsn) < 0) {
306			DB_ASSERT(env, have_rec == 0);
307			first_lsn = lowlsn;
308		}
309	}
310
311	/* Get the record at first_lsn if we don't have it already. */
312	if (!have_rec &&
313	    (ret = __logc_get(logc, &first_lsn, &data, DB_SET)) != 0) {
314		__db_errx(env, "Checkpoint LSN record [%ld][%ld] not found",
315		    (u_long)first_lsn.file, (u_long)first_lsn.offset);
316		goto err;
317	}
318
319#ifndef HAVE_BREW
320	if (dbenv->db_feedback != NULL) {
321		if (last_lsn.file == first_lsn.file)
322			nfiles = (double)
323			    (last_lsn.offset - first_lsn.offset) / log_size;
324		else
325			nfiles = (double)(last_lsn.file - first_lsn.file) +
326			    (double)((log_size - first_lsn.offset) +
327			    last_lsn.offset) / log_size;
328		/* We are going to divide by nfiles; make sure it isn't 0. */
329		if (nfiles < 0.001)
330			nfiles = 0.001;
331	}
332#endif
333
334	/* Find a low txnid. */
335	ret = 0;
336	if (hi_txn != 0) do {
337		/* txnid is after rectype, which is a u_int32. */
338		LOGCOPY_32(env, &txnid,
339		    (u_int8_t *)data.data + sizeof(u_int32_t));
340
341		if (txnid != 0)
342			break;
343	} while ((ret = __logc_get(logc, &lsn, &data, DB_NEXT)) == 0);
344
345	/*
346	 * There are no transactions and we're not recovering to an LSN (see
347	 * above), so there is nothing to do.
348	 */
349	if (ret == DB_NOTFOUND) {
350		if (LOG_COMPARE(&lsn, &last_lsn) != 0)
351			ret = __db_log_corrupt(env, &lsn);
352		else
353			ret = 0;
354	}
355
356	/* Reset to the first lsn. */
357	if (ret != 0 ||
358	    (ret = __logc_get(logc, &first_lsn, &data, DB_SET)) != 0)
359		goto err;
360
361	/* Initialize the transaction list. */
362	if ((ret = __db_txnlist_init(env, ip,
363	    txnid, hi_txn, max_lsn, &txninfo)) != 0)
364		goto err;
365
366	/*
367	 * Pass #1
368	 * Run forward through the log starting at the first relevant lsn.
369	 */
370	if ((ret = __env_openfiles(env, logc,
371	    txninfo, &data, &first_lsn, &last_lsn, nfiles, 1)) != 0)
372		goto err;
373
374	/* If there were no transactions, then we can bail out early. */
375	if (hi_txn == 0 && max_lsn == NULL)
376		goto done;
377
378	/*
379	 * Pass #2.
380	 *
381	 * We used first_lsn to tell us how far back we need to recover,
382	 * use it here.
383	 */
384	if (FLD_ISSET(dbenv->verbose, DB_VERB_RECOVERY))
385		__db_msg(env, "Recovery starting from [%lu][%lu]",
386		    (u_long)first_lsn.file, (u_long)first_lsn.offset);
387
388	pass = "backward";
389	for (ret = __logc_get(logc, &lsn, &data, DB_LAST);
390	    ret == 0 && LOG_COMPARE(&lsn, &first_lsn) >= 0;
391	    ret = __logc_get(logc, &lsn, &data, DB_PREV)) {
392#ifdef HAVE_BREW
393		COMPQUIET(progress, 0);
394#else
395		if (dbenv->db_feedback != NULL) {
396			progress = 34 + (int)(33 * (__lsn_diff(&first_lsn,
397			    &last_lsn, &lsn, log_size, 0) / nfiles));
398			dbenv->db_feedback(dbenv, DB_RECOVER, progress);
399		}
400#endif
401		tlsn = lsn;
402		ret = __db_dispatch(env, &env->recover_dtab,
403		    &data, &tlsn, DB_TXN_BACKWARD_ROLL, txninfo);
404		if (ret != 0) {
405			if (ret != DB_TXN_CKP)
406				goto msgerr;
407			else
408				ret = 0;
409		}
410	}
411	if (ret == DB_NOTFOUND) {
412		if (LOG_COMPARE(&lsn, &first_lsn) > 0)
413			ret = __db_log_corrupt(env, &lsn);
414		else
415			ret = 0;
416	}
417	if (ret != 0)
418		goto err;
419
420	/*
421	 * Pass #3.  If we are recovering to a timestamp or to an LSN,
422	 * we need to make sure that we don't roll-forward beyond that
423	 * point because there may be non-transactional operations (e.g.,
424	 * closes that would fail).  The last_lsn variable is used for
425	 * feedback calculations, but use it to set an initial stopping
426	 * point for the forward pass, and then reset appropriately to
427	 * derive a real stop_lsn that tells how far the forward pass
428	 * should go.
429	 */
430	pass = "forward";
431	stop_lsn = last_lsn;
432	if (max_lsn != NULL || dbenv->tx_timestamp != 0)
433		stop_lsn = ((DB_TXNHEAD *)txninfo)->maxlsn;
434
435	for (ret = __logc_get(logc, &lsn, &data, DB_NEXT);
436	    ret == 0; ret = __logc_get(logc, &lsn, &data, DB_NEXT)) {
437#ifndef HAVE_BREW
438		if (dbenv->db_feedback != NULL) {
439			progress = 67 + (int)(33 * (__lsn_diff(&first_lsn,
440			    &last_lsn, &lsn, log_size, 1) / nfiles));
441			dbenv->db_feedback(dbenv, DB_RECOVER, progress);
442		}
443#endif
444		tlsn = lsn;
445		ret = __db_dispatch(env, &env->recover_dtab,
446		    &data, &tlsn, DB_TXN_FORWARD_ROLL, txninfo);
447		if (ret != 0) {
448			if (ret != DB_TXN_CKP)
449				goto msgerr;
450			else
451				ret = 0;
452		}
453		/*
454		 * If we are recovering to a timestamp or an LSN,
455		 * we need to make sure that we don't try to roll
456		 * forward beyond the soon-to-be end of log.
457		 */
458		if (LOG_COMPARE(&lsn, &stop_lsn) >= 0)
459			break;
460
461	}
462	if (ret == DB_NOTFOUND)
463		ret = __db_log_corrupt(env, &lsn);
464	if (ret != 0)
465		goto err;
466
467	if (max_lsn == NULL)
468		region->last_txnid = ((DB_TXNHEAD *)txninfo)->maxid;
469
470	if (dbenv->tx_timestamp != 0) {
471		/* We are going to truncate, so we'd best close the cursor. */
472		if (logc != NULL) {
473			if ((ret = __logc_close(logc)) != 0)
474				goto err;
475			logc = NULL;
476		}
477
478		/*
479		 * Flush everything to disk, we are losing the log.  It's
480		 * recovery, ignore any application max-write configuration.
481		 */
482		if ((ret = __memp_sync_int(env, NULL, 0,
483		    DB_SYNC_CACHE | DB_SYNC_SUPPRESS_WRITE, NULL, NULL)) != 0)
484			goto err;
485		region->last_ckp = ((DB_TXNHEAD *)txninfo)->ckplsn;
486		if ((ret = __log_vtruncate(env,
487		    &((DB_TXNHEAD *)txninfo)->maxlsn,
488		    &((DB_TXNHEAD *)txninfo)->ckplsn, trunclsn)) != 0)
489			goto err;
490	}
491
492done:
493	/* Take a checkpoint here to force any dirty data pages to disk. */
494	if (!IS_REP_CLIENT(env) && (ret = __txn_checkpoint(env, 0, 0,
495	    DB_CKP_INTERNAL | DB_FORCE)) != 0) {
496		/*
497		 * If there was no space for the checkpoint we can
498		 * still bring the environment up.  No updates will
499		 * be able to commit either, but the environment can
500		 * be used read only.
501		 */
502		if (max_lsn == NULL && ret == ENOSPC)
503			ret = 0;
504		else
505			goto err;
506	}
507
508	if (region->stat.st_nrestores == 0) {
509		/* Close all the db files that are open. */
510		if ((ret = __dbreg_close_files(env, 0)) != 0)
511			goto err;
512	} else {
513		if ((ret = __dbreg_mark_restored(env)) != 0)
514			goto err;
515		F_SET(env->lg_handle, DBLOG_OPENFILES);
516	}
517
518	if (max_lsn != NULL) {
519		if (!IS_ZERO_LSN(((DB_TXNHEAD *)txninfo)->ckplsn))
520			region->last_ckp = ((DB_TXNHEAD *)txninfo)->ckplsn;
521		else if ((ret =
522		    __txn_findlastckp(env, &region->last_ckp, max_lsn)) != 0)
523			goto err;
524
525		/* We are going to truncate, so we'd best close the cursor. */
526		if (logc != NULL && (ret = __logc_close(logc)) != 0)
527			goto err;
528		logc = NULL;
529		if ((ret = __log_vtruncate(env,
530		    max_lsn, &((DB_TXNHEAD *)txninfo)->ckplsn, trunclsn)) != 0)
531			goto err;
532
533		/*
534		 * Now we need to open files that should be open in order for
535		 * client processing to continue.  However, since we've
536		 * truncated the log, we need to recompute from where the
537		 * openfiles pass should begin.
538		 */
539		if ((ret = __log_cursor(env, &logc)) != 0)
540			goto err;
541		if ((ret =
542		    __logc_get(logc, &first_lsn, &data, DB_FIRST)) != 0) {
543			if (ret == DB_NOTFOUND)
544				ret = 0;
545			else
546				__db_errx(env, "First log record not found");
547			goto err;
548		}
549		if ((ret = __txn_getckp(env, &first_lsn)) == 0 &&
550		    (ret = __logc_get(logc, &first_lsn, &data, DB_SET)) == 0) {
551			/* We have a recent checkpoint.  This is LSN (1). */
552			if ((ret = __txn_ckp_read(env,
553			    data.data, &ckp_args)) != 0) {
554				__db_errx(env,
555			    "Invalid checkpoint record at [%ld][%ld]",
556				    (u_long)first_lsn.file,
557				    (u_long)first_lsn.offset);
558				goto err;
559			}
560			first_lsn = ckp_args->ckp_lsn;
561			__os_free(env, ckp_args);
562		}
563		if ((ret = __logc_get(logc, &first_lsn, &data, DB_SET)) != 0)
564			goto err;
565		if ((ret = __env_openfiles(env, logc,
566		    txninfo, &data, &first_lsn, max_lsn, nfiles, 1)) != 0)
567			goto err;
568	} else if (region->stat.st_nrestores == 0) {
569		/*
570		 * If there are no prepared transactions that need resolution,
571		 * we need to reset the transaction ID space and log this fact.
572		 */
573		if ((ret = __txn_reset(env)) != 0)
574			goto err;
575	} else {
576		if ((ret = __txn_recycle_id(env)) != 0)
577			goto err;
578	}
579
580	/*
581	 * We must be sure to zero the tail of the log.  Otherwise a partial
582	 * record may be at the end of the log and it may never be fully
583	 * overwritten.
584	 */
585	if (max_lsn == NULL && dbenv->tx_timestamp == 0) {
586		/* We are going to truncate, so we'd best close the cursor. */
587		if (logc != NULL && (ret = __logc_close(logc)) != 0)
588			goto err;
589		logc = NULL;
590
591		/* Truncate from beyond the last record in the log. */
592		if ((ret =
593		    __log_current_lsn(env, &last_lsn, NULL, NULL)) != 0)
594			goto err;
595		if ((ret = __log_vtruncate(env,
596		    &last_lsn, &region->last_ckp, NULL)) != 0)
597			goto err;
598	}
599
600	if (FLD_ISSET(dbenv->verbose, DB_VERB_RECOVERY)) {
601		(void)time(&now);
602		__db_msg(env,
603		    "Recovery complete at %.24s", __os_ctime(&now, time_buf));
604		__db_msg(env, "%s %lx %s [%lu][%lu]",
605		    "Maximum transaction ID",
606		    (u_long)(txninfo == NULL ?
607			TXN_MINIMUM : ((DB_TXNHEAD *)txninfo)->maxid),
608		    "Recovery checkpoint",
609		    (u_long)region->last_ckp.file,
610		    (u_long)region->last_ckp.offset);
611	}
612
613	if (0) {
614msgerr:		__db_errx(env,
615		    "Recovery function for LSN %lu %lu failed on %s pass",
616		    (u_long)lsn.file, (u_long)lsn.offset, pass);
617	}
618
619err:	if (logc != NULL && (t_ret = __logc_close(logc)) != 0 && ret == 0)
620		ret = t_ret;
621
622	if (txninfo != NULL)
623		__db_txnlist_end(env, txninfo);
624
625	dbenv->tx_timestamp = 0;
626
627	F_CLR(env->lg_handle, DBLOG_RECOVER);
628	F_CLR(region, TXN_IN_RECOVERY);
629
630	return (ret);
631}
632
633#ifndef HAVE_BREW
634/*
635 * Figure out how many logfiles we have processed.  If we are moving
636 * forward (is_forward != 0), then we're computing current - low.  If
637 * we are moving backward, we are computing high - current.  max is
638 * the number of bytes per logfile.
639 */
640static double
641__lsn_diff(low, high, current, max, is_forward)
642	DB_LSN *low, *high, *current;
643	u_int32_t max;
644	int is_forward;
645{
646	double nf;
647
648	/*
649	 * There are three cases in each direction.  If you are in the
650	 * same file, then all you need worry about is the difference in
651	 * offsets.  If you are in different files, then either your offsets
652	 * put you either more or less than the integral difference in the
653	 * number of files -- we need to handle both of these.
654	 */
655	if (is_forward) {
656		if (current->file == low->file)
657			nf = (double)(current->offset - low->offset) / max;
658		else if (current->offset < low->offset)
659			nf = (double)((current->file - low->file) - 1) +
660			    (double)((max - low->offset) + current->offset) /
661			    max;
662		else
663			nf = (double)(current->file - low->file) +
664			    (double)(current->offset - low->offset) / max;
665	} else {
666		if (current->file == high->file)
667			nf = (double)(high->offset - current->offset) / max;
668		else if (current->offset > high->offset)
669			nf = (double)((high->file - current->file) - 1) +
670			    (double)
671			    ((max - current->offset) + high->offset) / max;
672		else
673			nf = (double)(high->file - current->file) +
674			    (double)(high->offset - current->offset) / max;
675	}
676	return (nf);
677}
678#endif
679
680/*
681 * __log_backup --
682 *
683 * This is used to find the earliest log record to process when a client
684 * is trying to sync up with a master whose max LSN is less than this
685 * client's max lsn; we want to roll back everything after that.
686 * Also used in the verify phase to walk back via checkpoints.
687 *
688 * Find the latest checkpoint whose ckp_lsn is less than the max lsn.
689 * PUBLIC: int    __log_backup __P((ENV *, DB_LOGC *, DB_LSN *,
690 * PUBLIC:    DB_LSN *, u_int32_t));
691 */
692int
693__log_backup(env, logc, max_lsn, start_lsn, cmp)
694	ENV *env;
695	DB_LOGC *logc;
696	DB_LSN *max_lsn, *start_lsn;
697	u_int32_t cmp;
698{
699	DBT data;
700	DB_LSN cmp_lsn, lsn;
701	__txn_ckp_args *ckp_args;
702	int lcmp, ret;
703
704	memset(&data, 0, sizeof(data));
705	ckp_args = NULL;
706
707	if (cmp != CKPLSN_CMP && cmp != LASTCKP_CMP)
708		return (EINVAL);
709
710	if ((ret = __txn_getckp(env, &lsn)) != 0)
711		goto err;
712	/*
713	 * Cmp tells us whether to check the ckp_lsn or the last_ckp
714	 * fields in the checkpoint record.
715	 */
716	while ((ret = __logc_get(logc, &lsn, &data, DB_SET)) == 0) {
717		if ((ret = __txn_ckp_read(
718		    env, data.data, &ckp_args)) != 0)
719			return (ret);
720		if (cmp == CKPLSN_CMP) {
721			/*
722			 * Follow checkpoints through the log until
723			 * we find one with a ckp_lsn less than
724			 * or equal max_lsn.
725			 */
726			cmp_lsn = ckp_args->ckp_lsn;
727			lcmp = (LOG_COMPARE(&cmp_lsn, max_lsn) <= 0);
728		} else {
729			/*
730			 * When we're walking back through the checkpoints
731			 * we want the LSN of this checkpoint strictly less
732			 * than the max_lsn (also a ckp LSN).
733			 */
734			cmp_lsn = lsn;
735			lcmp = (LOG_COMPARE(&cmp_lsn, max_lsn) < 0);
736		}
737		if (lcmp) {
738			*start_lsn = cmp_lsn;
739			break;
740		}
741
742		lsn = ckp_args->last_ckp;
743		/*
744		 * If there are no more checkpoints behind us, we're
745		 * done.  Break with DB_NOTFOUND.
746		 */
747		if (IS_ZERO_LSN(lsn)) {
748			ret = DB_NOTFOUND;
749			break;
750		}
751		__os_free(env, ckp_args);
752		ckp_args = NULL;
753	}
754
755	if (ckp_args != NULL)
756		__os_free(env, ckp_args);
757	/*
758	 * For CKPLSN_CMP if we walked back through all the checkpoints,
759	 * set the cursor on the first log record.  For LASTCKP_CMP
760	 * we want to return 0,0 in start_lsn.
761	 */
762err:	if (IS_ZERO_LSN(*start_lsn) && cmp == CKPLSN_CMP &&
763	    (ret == 0 || ret == DB_NOTFOUND))
764		ret = __logc_get(logc, start_lsn, &data, DB_FIRST);
765	return (ret);
766}
767
768/*
769 * __log_earliest --
770 *
771 * Return the earliest recovery point for the log files present.  The
772 * earliest recovery time is the time stamp of the first checkpoint record
773 * whose checkpoint LSN is greater than the first LSN we process.
774 */
775static int
776__log_earliest(env, logc, lowtime, lowlsn)
777	ENV *env;
778	DB_LOGC *logc;
779	int32_t *lowtime;
780	DB_LSN *lowlsn;
781{
782	__txn_ckp_args *ckpargs;
783	DB_LSN first_lsn, lsn;
784	DBT data;
785	u_int32_t rectype;
786	int cmp, ret;
787
788	memset(&data, 0, sizeof(data));
789
790	/*
791	 * Read forward through the log looking for the first checkpoint
792	 * record whose ckp_lsn is greater than first_lsn.
793	 */
794	for (ret = __logc_get(logc, &first_lsn, &data, DB_FIRST);
795	    ret == 0; ret = __logc_get(logc, &lsn, &data, DB_NEXT)) {
796		LOGCOPY_32(env, &rectype, data.data);
797		if (rectype != DB___txn_ckp)
798			continue;
799		if ((ret =
800		    __txn_ckp_read(env, data.data, &ckpargs)) == 0) {
801			cmp = LOG_COMPARE(&ckpargs->ckp_lsn, &first_lsn);
802			*lowlsn = ckpargs->ckp_lsn;
803			*lowtime = ckpargs->timestamp;
804
805			__os_free(env, ckpargs);
806			if (cmp >= 0)
807				break;
808		}
809	}
810
811	return (ret);
812}
813
814/*
815 * __env_openfiles --
816 * Perform the pass of recovery that opens files.  This is used
817 * both during regular recovery and an initial call to txn_recover (since
818 * we need files open in order to abort prepared, but not yet committed
819 * transactions).
820 *
821 * See the comments in db_apprec for a detailed description of the
822 * various recovery passes.
823 *
824 * If we are not doing feedback processing (i.e., we are doing txn_recover
825 * processing and in_recovery is zero), then last_lsn can be NULL.
826 *
827 * PUBLIC: int __env_openfiles __P((ENV *,
828 * PUBLIC:     DB_LOGC *, void *, DBT *, DB_LSN *, DB_LSN *, double, int));
829 */
830int
831__env_openfiles(env, logc, txninfo,
832    data, open_lsn, last_lsn, nfiles, in_recovery)
833	ENV *env;
834	DB_LOGC *logc;
835	void *txninfo;
836	DBT *data;
837	DB_LSN *open_lsn, *last_lsn;
838	double nfiles;
839	int in_recovery;
840{
841	DB_ENV *dbenv;
842	DB_LSN lsn, tlsn;
843	u_int32_t log_size;
844	int progress, ret;
845
846	dbenv = env->dbenv;
847
848	/*
849	 * XXX
850	 * Get the log size.  No locking required because we're single-threaded
851	 * during recovery.
852	 */
853	log_size = ((LOG *)env->lg_handle->reginfo.primary)->log_size;
854
855	lsn = *open_lsn;
856	for (;;) {
857#ifdef HAVE_BREW
858		COMPQUIET(nfiles, (double)0.001);
859		COMPQUIET(progress, 0);
860#else
861		if (in_recovery && dbenv->db_feedback != NULL) {
862			DB_ASSERT(env, last_lsn != NULL);
863			progress = (int)(33 * (__lsn_diff(open_lsn,
864			   last_lsn, &lsn, log_size, 1) / nfiles));
865			dbenv->db_feedback(dbenv, DB_RECOVER, progress);
866		}
867#endif
868		tlsn = lsn;
869		ret = __db_dispatch(env, &env->recover_dtab, data, &tlsn,
870		    in_recovery ? DB_TXN_OPENFILES : DB_TXN_POPENFILES,
871		    txninfo);
872		if (ret != 0 && ret != DB_TXN_CKP) {
873			__db_errx(env,
874			    "Recovery function for LSN %lu %lu failed",
875			    (u_long)lsn.file, (u_long)lsn.offset);
876			break;
877		}
878		if ((ret = __logc_get(logc, &lsn, data, DB_NEXT)) != 0) {
879			if (ret == DB_NOTFOUND) {
880				if (last_lsn != NULL &&
881				   LOG_COMPARE(&lsn, last_lsn) != 0)
882					ret = __db_log_corrupt(env, &lsn);
883				else
884					ret = 0;
885			}
886			break;
887		}
888	}
889
890	return (ret);
891}
892
893static int
894__db_log_corrupt(env, lsnp)
895	ENV *env;
896	DB_LSN *lsnp;
897{
898	__db_errx(env, "Log file corrupt at LSN: [%lu][%lu]",
899	     (u_long)lsnp->file, (u_long)lsnp->offset);
900	return (EINVAL);
901}
902
903/*
904 * __env_init_rec --
905 *
906 * PUBLIC: int __env_init_rec __P((ENV *, u_int32_t));
907 */
908int
909__env_init_rec(env, version)
910	ENV *env;
911	u_int32_t version;
912{
913	int ret;
914
915	/*
916	 * We need to prime the recovery table with the current recovery
917	 * functions.  Then we overwrite only specific entries based on
918	 * each previous version we support.
919	 */
920	if ((ret = __env_init_rec_47(env)) != 0)
921		return (ret);
922	ret = 0;
923	switch (version) {
924	case DB_LOGVERSION_47:
925		break;
926	/*
927	 * There are no log record/recovery differences between 4.4 and 4.5.
928	 * The log version changed due to checksum.  There are no log recovery
929	 * differences between 4.5 and 4.6.  The name of the rep_gen in
930	 * txn_checkpoint changed (to spare, since we don't use it anymore).
931	 */
932	case DB_LOGVERSION_46:
933	case DB_LOGVERSION_45:
934	case DB_LOGVERSION_44:
935		ret = __env_init_rec_46(env);
936		break;
937	case DB_LOGVERSION_43:
938		ret = __env_init_rec_43(env);
939		break;
940	case DB_LOGVERSION_42:
941		ret = __env_init_rec_42(env);
942		break;
943	default:
944		__db_errx(env, "Unknown version %lu", (u_long)version);
945		ret = EINVAL;
946		break;
947	}
948	return (ret);
949}
950
951static int
952__env_init_rec_42(env)
953	ENV *env;
954{
955	int ret;
956
957	if ((ret = __db_add_recovery_int(env, &env->recover_dtab,
958	    __db_relink_42_recover, DB___db_relink_42)) != 0)
959		goto err;
960	if ((ret = __db_add_recovery_int(env, &env->recover_dtab,
961	    __db_pg_alloc_42_recover, DB___db_pg_alloc_42)) != 0)
962		goto err;
963	if ((ret = __db_add_recovery_int(env, &env->recover_dtab,
964	    __db_pg_free_42_recover, DB___db_pg_free_42)) != 0)
965		goto err;
966	if ((ret = __db_add_recovery_int(env, &env->recover_dtab,
967	    __db_pg_freedata_42_recover, DB___db_pg_freedata_42)) != 0)
968		goto err;
969	if ((ret = __db_add_recovery_int(env, &env->recover_dtab,
970	    __ham_metagroup_42_recover, DB___ham_metagroup_42)) != 0)
971		goto err;
972	if ((ret = __db_add_recovery_int(env, &env->recover_dtab,
973	    __ham_groupalloc_42_recover, DB___ham_groupalloc_42)) != 0)
974		goto err;
975	if ((ret = __db_add_recovery_int(env, &env->recover_dtab,
976	    __txn_ckp_42_recover, DB___txn_ckp_42)) != 0)
977		goto err;
978	if ((ret = __db_add_recovery_int(env, &env->recover_dtab,
979	    __txn_regop_42_recover, DB___txn_regop_42)) != 0)
980		goto err;
981err:
982	return (ret);
983}
984
985static int
986__env_init_rec_43(env)
987	ENV *env;
988{
989	int ret;
990
991	if ((ret = __db_add_recovery_int(env, &env->recover_dtab,
992	    __bam_relink_43_recover, DB___bam_relink_43)) != 0)
993		goto err;
994	/*
995	 * We want to use the 4.2-based txn_regop record.
996	 */
997	if ((ret = __db_add_recovery_int(env, &env->recover_dtab,
998	    __txn_regop_42_recover, DB___txn_regop_42)) != 0)
999		goto err;
1000err:
1001	return (ret);
1002}
1003
1004static int
1005__env_init_rec_46(env)
1006	ENV *env;
1007{
1008	int ret;
1009
1010	if ((ret = __db_add_recovery_int(env, &env->recover_dtab,
1011	    __bam_merge_44_recover, DB___bam_merge_44)) != 0)
1012		goto err;
1013
1014err:	return (ret);
1015}
1016
1017static int
1018__env_init_rec_47(env)
1019	ENV *env;
1020{
1021	int ret;
1022
1023	if ((ret = __bam_init_recover(env, &env->recover_dtab)) != 0)
1024		goto err;
1025	if ((ret = __crdel_init_recover(env, &env->recover_dtab)) != 0)
1026		goto err;
1027	if ((ret = __db_init_recover(env, &env->recover_dtab)) != 0)
1028		goto err;
1029	if ((ret = __dbreg_init_recover(env, &env->recover_dtab)) != 0)
1030		goto err;
1031	if ((ret = __fop_init_recover(env, &env->recover_dtab)) != 0)
1032		goto err;
1033	if ((ret = __ham_init_recover(env, &env->recover_dtab)) != 0)
1034		goto err;
1035	if ((ret = __qam_init_recover(env, &env->recover_dtab)) != 0)
1036		goto err;
1037	if ((ret = __txn_init_recover(env, &env->recover_dtab)) != 0)
1038		goto err;
1039err:
1040	return (ret);
1041}
1042