1/*-
2 * See the file LICENSE for redistribution information.
3 *
4 * Copyright (c) 2004,2008 Oracle.  All rights reserved.
5 *
6 * $Id: rep_verify.c,v 12.69 2008/03/13 16:21:05 mbrey Exp $
7 */
8
9#include "db_config.h"
10
11#include "db_int.h"
12#include "dbinc/db_page.h"
13#include "dbinc/db_am.h"
14#include "dbinc/log.h"
15#include "dbinc/txn.h"
16
17static int __rep_dorecovery __P((ENV *, DB_LSN *, DB_LSN *));
18
19/*
20 * __rep_verify --
21 *	Handle a REP_VERIFY message.
22 *
23 * PUBLIC: int __rep_verify __P((ENV *, __rep_control_args *, DBT *,
24 * PUBLIC:     int, time_t));
25 */
26int
27__rep_verify(env, rp, rec, eid, savetime)
28	ENV *env;
29	__rep_control_args *rp;
30	DBT *rec;
31	int eid;
32	time_t savetime;
33{
34	DBT mylog;
35	DB_LOG *dblp;
36	DB_LOGC *logc;
37	DB_LSN lsn;
38	DB_REP *db_rep;
39	LOG *lp;
40	REP *rep;
41	u_int32_t rectype, logflag;
42	int match, ret, t_ret;
43
44	ret = 0;
45	db_rep = env->rep_handle;
46	rep = db_rep->region;
47	dblp = env->lg_handle;
48	lp = dblp->reginfo.primary;
49
50	/* Do nothing if VERIFY flag is not set. */
51	if (!F_ISSET(rep, REP_F_RECOVER_VERIFY))
52		return (ret);
53
54#ifdef DIAGNOSTIC
55	/*
56	 * We should not ever be in internal init with a lease granted.
57	 */
58	if (IS_USING_LEASES(env)) {
59		REP_SYSTEM_LOCK(env);
60		DB_ASSERT(env, __rep_islease_granted(env) == 0);
61		REP_SYSTEM_UNLOCK(env);
62	}
63#endif
64
65	if ((ret = __log_cursor(env, &logc)) != 0)
66		return (ret);
67	memset(&mylog, 0, sizeof(mylog));
68	/* If verify_lsn of ZERO is passed in, get last log. */
69	MUTEX_LOCK(env, rep->mtx_clientdb);
70	logflag = IS_ZERO_LSN(lp->verify_lsn) ? DB_LAST : DB_SET;
71	MUTEX_UNLOCK(env, rep->mtx_clientdb);
72	if ((ret = __logc_get(logc, &rp->lsn, &mylog, logflag)) != 0)
73		goto err;
74	match = 0;
75	LOGCOPY_32(env, &rectype, mylog.data);
76	if (mylog.size == rec->size &&
77	    memcmp(mylog.data, rec->data, rec->size) == 0)
78		match = 1;
79	/*
80	 * If we don't have a match, backup to the previous
81	 * identification record and try again.
82	 */
83	if (match == 0) {
84		ZERO_LSN(lsn);
85		if ((ret = __rep_log_backup(env, rep, logc, &lsn)) == 0) {
86			MUTEX_LOCK(env, rep->mtx_clientdb);
87			lp->verify_lsn = lsn;
88			__os_gettime(env, &lp->rcvd_ts, 1);
89			lp->wait_ts = rep->request_gap;
90			MUTEX_UNLOCK(env, rep->mtx_clientdb);
91			(void)__rep_send_message(env, eid, REP_VERIFY_REQ,
92			    &lsn, NULL, 0, DB_REP_ANYWHERE);
93		} else if (ret == DB_NOTFOUND) {
94			/*
95			 * We've either run out of records because
96			 * logs have been removed or we've rolled back
97			 * all the way to the beginning.
98			 */
99			STAT(rep->stat.st_outdated++);
100			REP_SYSTEM_LOCK(env);
101			if (FLD_ISSET(rep->config, REP_C_NOAUTOINIT))
102				ret = DB_REP_JOIN_FAILURE;
103			else {
104				F_CLR(rep, REP_F_RECOVER_VERIFY);
105				F_SET(rep, REP_F_RECOVER_UPDATE);
106				ZERO_LSN(rep->first_lsn);
107				ZERO_LSN(rep->ckp_lsn);
108				ret = 0;
109			}
110			REP_SYSTEM_UNLOCK(env);
111			if (ret == 0)
112				(void)__rep_send_message(env,
113				    eid, REP_UPDATE_REQ, NULL,
114				    NULL, 0, 0);
115		}
116	} else
117		ret = __rep_verify_match(env, &rp->lsn, savetime);
118
119err:	if ((t_ret = __logc_close(logc)) != 0 && ret == 0)
120		ret = t_ret;
121	return (ret);
122}
123
/*
 * __rep_verify_fail --
 *	Handle a REP_VERIFY_FAIL message.
 *
 *	rp->lsn is the LSN carried by the message; eid is where any
 *	resulting REP_UPDATE_REQ is sent.  The message is dropped without
 *	taking any lock if recovery flags are set but neither LOG nor
 *	VERIFY is among them.  Otherwise, with both mtx_clientdb and the
 *	replication system lock held, the function:
 *	  1. bumps the st_outdated statistic;
 *	  2. if we are recovering LOG and the LSN is within
 *	     [first_lsn, last_lsn] (and REP_C_NOAUTOINIT is not set),
 *	     cleans up an internal init that is in progress;
 *	  3. if the LSN is one we care about, either returns
 *	     DB_REP_JOIN_FAILURE (REP_C_NOAUTOINIT) or moves to the UPDATE
 *	     state and sends a REP_UPDATE_REQ.
 *
 *	Returns 0 when the message is ignored or an update request was
 *	sent, DB_REP_JOIN_FAILURE as above, or the error returned by
 *	__rep_lockout_msg / __rep_init_cleanup.
 *
 * PUBLIC: int __rep_verify_fail __P((ENV *, __rep_control_args *, int));
 */
int
__rep_verify_fail(env, rp, eid)
	ENV *env;
	__rep_control_args *rp;
	int eid;
{
	DB_LOG *dblp;
	DB_REP *db_rep;
	LOG *lp;
	REP *rep;
	int lockout, ret;

	/* lockout is 1 only while we own the message lockout taken below. */
	lockout = 0;
	ret = 0;
	db_rep = env->rep_handle;
	rep = db_rep->region;
	dblp = env->lg_handle;
	lp = dblp->reginfo.primary;

	/*
	 * If any recovery flags are set, but not LOG or VERIFY,
	 * then we ignore this message.  We are already
	 * in the middle of updating.
	 */
	if (F_ISSET(rep, REP_F_RECOVER_MASK) &&
	    !F_ISSET(rep, REP_F_RECOVER_LOG | REP_F_RECOVER_VERIFY))
		return (0);
	/*
	 * Lock order: mtx_clientdb first, then the system lock.  Every exit
	 * path below releases them in the reverse order.
	 */
	MUTEX_LOCK(env, rep->mtx_clientdb);
	REP_SYSTEM_LOCK(env);
	/*
	 * We should not ever be in internal init with a lease granted.
	 */
	DB_ASSERT(env,
	    !IS_USING_LEASES(env) || __rep_islease_granted(env) == 0);

	/*
	 * Update stats.  This is counted for every message that gets this
	 * far, including ones that end up being ignored.
	 */
	STAT(rep->stat.st_outdated++);

	/*
	 * Clean up old internal init in progress if:
	 * REP_C_NOAUTOINIT is not configured and
	 * we are recovering LOG and this LSN is in the range we need.
	 */
	if (!FLD_ISSET(rep->config, REP_C_NOAUTOINIT) &&
	    (F_ISSET(rep, REP_F_RECOVER_LOG) &&
	    LOG_COMPARE(&rep->first_lsn, &rp->lsn) <= 0 &&
	    LOG_COMPARE(&rep->last_lsn, &rp->lsn) >= 0)) {
		/*
		 * Already locking out messages, give up.
		 * (ret is still 0 here, so the message is silently dropped.)
		 */
		if (F_ISSET(rep, REP_F_READY_MSG))
		    goto unlock;

		/*
		 * Lock out other messages to prevent race conditions.
		 */
		if ((ret = __rep_lockout_msg(env, rep, 1)) != 0)
		    goto unlock;
		lockout = 1;

		/*
		 * Clean up internal init if one was in progress.
		 */
		if (F_ISSET(rep, REP_F_READY_API | REP_F_READY_OP)) {
			RPRINT(env, DB_VERB_REP_SYNC, (env,
    "VERIFY_FAIL is cleaning up old internal init for missing log"));
			if ((ret =
			    __rep_init_cleanup(env, rep, DB_FORCE)) != 0) {
				RPRINT(env, DB_VERB_REP_SYNC, (env,
    "VERIFY_FAIL error cleaning up internal init for missing log: %d", ret));
				/* msglck drops the message lockout first. */
				goto msglck;
			}
			F_CLR(rep, REP_F_RECOVER_MASK);
		}
		/* Release the message lockout taken above. */
		F_CLR(rep, REP_F_READY_MSG);
		lockout = 0;
	}

	/*
	 * Commence an internal init if:
	 * We are in VERIFY state and the failing LSN is the one we
	 * were verifying or
	 * we're recovering LOG and this LSN is in the range we need or
	 * we are in normal state (no recovery flags set) and
	 * the failing LSN is at or beyond the one we're ready for
	 * (lp->ready_lsn).
	 */
	if (((F_ISSET(rep, REP_F_RECOVER_VERIFY)) &&
	    LOG_COMPARE(&rp->lsn, &lp->verify_lsn) == 0) ||
	    (F_ISSET(rep, REP_F_RECOVER_LOG) &&
	    LOG_COMPARE(&rep->first_lsn, &rp->lsn) <= 0 &&
	    LOG_COMPARE(&rep->last_lsn, &rp->lsn) >= 0) ||
	    (F_ISSET(rep, REP_F_RECOVER_MASK) == 0 &&
	    LOG_COMPARE(&rp->lsn, &lp->ready_lsn) >= 0)) {
		/*
		 * We don't want an old or delayed VERIFY_FAIL
		 * message to throw us into internal initialization
		 * when we shouldn't be. If REP_C_NOAUTOINIT is configured,
		 * return DB_REP_JOIN_FAILURE instead of doing internal init.
		 */
		if (FLD_ISSET(rep->config, REP_C_NOAUTOINIT)) {
			ret = DB_REP_JOIN_FAILURE;
			goto unlock;
		}

		/*
		 * Do the internal init.  State is switched to UPDATE while
		 * both locks are held; the request itself is sent only
		 * after they have been released.
		 */
		F_CLR(rep, REP_F_RECOVER_VERIFY);
		F_SET(rep, REP_F_RECOVER_UPDATE);
		ZERO_LSN(rep->first_lsn);
		ZERO_LSN(rep->ckp_lsn);
		lp->wait_ts = rep->request_gap;
		REP_SYSTEM_UNLOCK(env);
		MUTEX_UNLOCK(env, rep->mtx_clientdb);
		(void)__rep_send_message(env,
		    eid, REP_UPDATE_REQ, NULL, NULL, 0, 0);
	} else {
		/*
		 * Otherwise ignore this message.
		 *
		 * The two labels below are also the shared exit for the
		 * gotos above; all of them arrive with both locks held.
		 */
msglck:		if (lockout)
		    F_CLR(rep, REP_F_READY_MSG);
unlock:		REP_SYSTEM_UNLOCK(env);
		MUTEX_UNLOCK(env, rep->mtx_clientdb);
	}
	return (ret);
}
259
260/*
261 * __rep_verify_req --
262 *	Handle a REP_VERIFY_REQ message.
263 *
264 * PUBLIC: int __rep_verify_req __P((ENV *, __rep_control_args *, int));
265 */
266int
267__rep_verify_req(env, rp, eid)
268	ENV *env;
269	__rep_control_args *rp;
270	int eid;
271{
272	DBT *d, data_dbt;
273	DB_LOGC *logc;
274	DB_REP *db_rep;
275	REP *rep;
276	u_int32_t type;
277	int old, ret;
278
279	ret = 0;
280	db_rep = env->rep_handle;
281	rep = db_rep->region;
282
283	type = REP_VERIFY;
284	if ((ret = __log_cursor(env, &logc)) != 0)
285		return (ret);
286	d = &data_dbt;
287	memset(d, 0, sizeof(data_dbt));
288	F_SET(logc, DB_LOG_SILENT_ERR);
289	ret = __logc_get(logc, &rp->lsn, d, DB_SET);
290	/*
291	 * If the LSN was invalid, then we might get a DB_NOTFOUND
292	 * we might get an EIO, we could get anything.
293	 * If we get a DB_NOTFOUND, then there is a chance that
294	 * the LSN comes before the first file present in which
295	 * case we need to return a fail so that the client can
296	 * perform an internal init or return a REP_JOIN_FAILURE.
297	 *
298	 * If we're a client servicing this request and we get a
299	 * NOTFOUND, return it so the caller can rerequest from
300	 * a better source.
301	 */
302	if (ret == DB_NOTFOUND) {
303		if (F_ISSET(rep, REP_F_CLIENT)) {
304			(void)__logc_close(logc);
305			return (DB_NOTFOUND);
306		}
307		if (__log_is_outdated(env, rp->lsn.file, &old) == 0 &&
308		    old != 0)
309			type = REP_VERIFY_FAIL;
310	}
311
312	if (ret != 0)
313		d = NULL;
314
315	(void)__rep_send_message(env, eid, type, &rp->lsn, d, 0, 0);
316	return (__logc_close(logc));
317}
318
319static int
320__rep_dorecovery(env, lsnp, trunclsnp)
321	ENV *env;
322	DB_LSN *lsnp, *trunclsnp;
323{
324	DBT mylog;
325	DB_LOGC *logc;
326	DB_LSN last_ckp, lsn;
327	DB_REP *db_rep;
328	DB_THREAD_INFO *ip;
329	REP *rep;
330	int ret, skip_rec, t_ret, update;
331	u_int32_t rectype, opcode;
332	__txn_regop_args *txnrec;
333	__txn_regop_42_args *txn42rec;
334
335	db_rep = env->rep_handle;
336	rep = db_rep->region;
337	ENV_GET_THREAD_INFO(env, ip);
338
339	/* Figure out if we are backing out any committed transactions. */
340	if ((ret = __log_cursor(env, &logc)) != 0)
341		return (ret);
342
343	memset(&mylog, 0, sizeof(mylog));
344	if (F_ISSET(rep, REP_F_RECOVER_LOG)) {
345		/*
346		 * Internal init can never skip recovery.
347		 * Internal init must always update the timestamp and
348		 * force dead handles.
349		 */
350		skip_rec = 0;
351		update = 1;
352	} else {
353		skip_rec = 1;
354		update = 0;
355	}
356	while (update == 0 &&
357	    (ret = __logc_get(logc, &lsn, &mylog, DB_PREV)) == 0 &&
358	    LOG_COMPARE(&lsn, lsnp) > 0) {
359		LOGCOPY_32(env, &rectype, mylog.data);
360		/*
361		 * Find out if we can skip recovery completely.  If we
362		 * are backing up over any record a client usually
363		 * cares about, we must run recovery.
364		 *
365		 * Skipping sync-up recovery can be pretty scary!
366		 * Here's why we can do it:
367		 * If a master downgraded to client and is now running
368		 * sync-up to a new master, that old master must have
369		 * waited for any outstanding txns to resolve before
370		 * becoming a client.  Also we are in lockout so there
371		 * can be no other operations right now.
372		 *
373		 * If the client wrote a commit record to the log, but
374		 * was descheduled before processing the txn, and then
375		 * a new master was found, we must've let the txn get
376		 * processed because right now we are the only message
377		 * thread allowed to be running.
378		 */
379		DB_ASSERT(env, rep->op_cnt == 0);
380		DB_ASSERT(env, rep->msg_th == 1);
381		if (rectype == DB___txn_regop || rectype == DB___txn_ckp ||
382		    rectype == DB___dbreg_register)
383			skip_rec = 0;
384		if (rectype == DB___txn_regop) {
385			if (rep->version >= DB_REPVERSION_44) {
386				if ((ret = __txn_regop_read(
387				    env, mylog.data, &txnrec)) != 0)
388					goto err;
389				opcode = txnrec->opcode;
390				__os_free(env, txnrec);
391			} else {
392				if ((ret = __txn_regop_42_read(
393				    env, mylog.data, &txn42rec)) != 0)
394					goto err;
395				opcode = txn42rec->opcode;
396				__os_free(env, txn42rec);
397			}
398			if (opcode != TXN_ABORT)
399				update = 1;
400		}
401	}
402	/*
403	 * Handle if the logc_get fails.
404	 */
405	if (ret != 0)
406		goto err;
407
408	/*
409	 * If we successfully run recovery, we've opened all the necessary
410	 * files.  We are guaranteed to be single-threaded here, so no mutex
411	 * is necessary.
412	 */
413	if (skip_rec) {
414		if ((ret = __log_get_stable_lsn(env, &last_ckp)) != 0) {
415			if (ret != DB_NOTFOUND)
416				goto err;
417			ZERO_LSN(last_ckp);
418		}
419		RPRINT(env, DB_VERB_REP_SYNC, (env,
420    "Skip sync-up rec.  Truncate log to [%lu][%lu], ckp [%lu][%lu]",
421    (u_long)lsnp->file, (u_long)lsnp->offset,
422    (u_long)last_ckp.file, (u_long)last_ckp.offset));
423		ret = __log_vtruncate(env, lsnp, &last_ckp, trunclsnp);
424	} else
425		ret = __db_apprec(env, ip, lsnp, trunclsnp, update, 0);
426
427	if (ret != 0)
428		goto err;
429	F_SET(db_rep, DBREP_OPENFILES);
430
431err:	if ((t_ret = __logc_close(logc)) != 0 && ret == 0)
432		ret = t_ret;
433
434	return (ret);
435}
436
437/*
438 * __rep_verify_match --
439 *	We have just received a matching log record during verification.
440 * Figure out if we're going to need to run recovery. If so, wait until
441 * everything else has exited the library.  If not, set up the world
442 * correctly and move forward.
443 *
444 * PUBLIC: int __rep_verify_match __P((ENV *, DB_LSN *, time_t));
445 */
446int
447__rep_verify_match(env, reclsnp, savetime)
448	ENV *env;
449	DB_LSN *reclsnp;
450	time_t savetime;
451{
452	DB_LOG *dblp;
453	DB_LSN trunclsn;
454	DB_REP *db_rep;
455	DB_THREAD_INFO *ip;
456	LOG *lp;
457	REGENV *renv;
458	REGINFO *infop;
459	REP *rep;
460	int done, master, ret;
461	u_int32_t unused;
462
463	dblp = env->lg_handle;
464	db_rep = env->rep_handle;
465	rep = db_rep->region;
466	lp = dblp->reginfo.primary;
467	ret = 0;
468	infop = env->reginfo;
469	renv = infop->primary;
470	ENV_GET_THREAD_INFO(env, ip);
471
472	/*
473	 * Check if the savetime is different than our current time stamp.
474	 * If it is, then we're racing with another thread trying to recover
475	 * and we lost.  We must give up.
476	 */
477	MUTEX_LOCK(env, rep->mtx_clientdb);
478	done = savetime != renv->rep_timestamp;
479	if (done) {
480		MUTEX_UNLOCK(env, rep->mtx_clientdb);
481		return (0);
482	}
483	ZERO_LSN(lp->verify_lsn);
484	MUTEX_UNLOCK(env, rep->mtx_clientdb);
485
486	/*
487	 * Make sure the world hasn't changed while we tried to get
488	 * the lock.  If it hasn't then it's time for us to kick all
489	 * operations out of DB and run recovery.
490	 */
491	REP_SYSTEM_LOCK(env);
492	if (F_ISSET(rep, REP_F_READY_MSG) ||
493	    (!F_ISSET(rep, REP_F_RECOVER_LOG) &&
494	    F_ISSET(rep, REP_F_READY_API | REP_F_READY_OP))) {
495		/*
496		 * We lost.  The world changed and we should do nothing.
497		 */
498		STAT(rep->stat.st_msgs_recover++);
499		goto errunlock;
500	}
501
502	/*
503	 * Lockout all message threads but ourselves.
504	 */
505	if ((ret = __rep_lockout_msg(env, rep, 1)) != 0)
506		goto errunlock;
507
508	/*
509	 * Lockout the API and wait for operations to complete.
510	 */
511	if ((ret = __rep_lockout_api(env, rep)) != 0)
512		goto errunlock;
513
514	/* OK, everyone is out, we can now run recovery. */
515	REP_SYSTEM_UNLOCK(env);
516
517	if ((ret = __rep_dorecovery(env, reclsnp, &trunclsn)) != 0 ||
518	    (ret = __rep_remove_init_file(env)) != 0) {
519		REP_SYSTEM_LOCK(env);
520		F_CLR(rep, REP_F_READY_API | REP_F_READY_MSG | REP_F_READY_OP);
521		goto errunlock;
522	}
523
524	/*
525	 * The log has been truncated (either directly by us or by __db_apprec)
526	 * We want to make sure we're waiting for the LSN at the new end-of-log,
527	 * not some later point.
528	 */
529	MUTEX_LOCK(env, rep->mtx_clientdb);
530	lp->ready_lsn = trunclsn;
531	ZERO_LSN(lp->waiting_lsn);
532	ZERO_LSN(lp->max_wait_lsn);
533	lp->max_perm_lsn = *reclsnp;
534	lp->wait_ts = rep->request_gap;
535	__os_gettime(env, &lp->rcvd_ts, 1);
536	ZERO_LSN(lp->verify_lsn);
537
538	/*
539	 * Discard any log records we have queued;  we're about to re-request
540	 * them, and can't trust the ones in the queue.  We need to set the
541	 * DB_AM_RECOVER bit in this handle, so that the operation doesn't
542	 * deadlock.
543	 */
544	if (db_rep->rep_db == NULL &&
545	    (ret = __rep_client_dbinit(env, 0, REP_DB)) != 0) {
546		MUTEX_UNLOCK(env, rep->mtx_clientdb);
547		goto out;
548	}
549
550	F_SET(db_rep->rep_db, DB_AM_RECOVER);
551	MUTEX_UNLOCK(env, rep->mtx_clientdb);
552	ret = __db_truncate(db_rep->rep_db, ip, NULL, &unused);
553	MUTEX_LOCK(env, rep->mtx_clientdb);
554	F_CLR(db_rep->rep_db, DB_AM_RECOVER);
555
556	REP_SYSTEM_LOCK(env);
557	rep->stat.st_log_queued = 0;
558	F_CLR(rep, REP_F_NOARCHIVE | REP_F_RECOVER_MASK | REP_F_READY_MSG);
559	if (ret != 0)
560		goto errunlock2;
561
562	/*
563	 * If the master_id is invalid, this means that since
564	 * the last record was sent, something happened to the
565	 * master and we may not have a master to request
566	 * things of.
567	 *
568	 * This is not an error;  when we find a new master,
569	 * we'll re-negotiate where the end of the log is and
570	 * try to bring ourselves up to date again anyway.
571	 */
572	master = rep->master_id;
573	REP_SYSTEM_UNLOCK(env);
574	if (master == DB_EID_INVALID) {
575		MUTEX_UNLOCK(env, rep->mtx_clientdb);
576		ret = 0;
577	} else {
578		/*
579		 * We're making an ALL_REQ.  But now that we've
580		 * cleared the flags, we're likely receiving new
581		 * log records from the master, resulting in a gap
582		 * immediately.  So to avoid multiple data streams,
583		 * set the wait_ts value high now to give the master
584		 * a chance to start sending us these records before
585		 * the gap code re-requests the same gap.  Wait_recs
586		 * will get reset once we start receiving these
587		 * records.
588		 */
589		lp->wait_ts = rep->max_gap;
590		MUTEX_UNLOCK(env, rep->mtx_clientdb);
591		(void)__rep_send_message(env,
592		    master, REP_ALL_REQ, reclsnp, NULL, 0, DB_REP_ANYWHERE);
593	}
594	if (0) {
595errunlock2:	MUTEX_UNLOCK(env, rep->mtx_clientdb);
596errunlock:	REP_SYSTEM_UNLOCK(env);
597	}
598out:	return (ret);
599}
600
601/*
602 * __rep_log_backup --
603 *
604 * In the verify handshake, we walk backward looking for
605 * identification records.  Those are the only record types
606 * we verify and match on.
607 *
608 * PUBLIC: int __rep_log_backup __P((ENV *, REP *, DB_LOGC *, DB_LSN *));
609 */
610int
611__rep_log_backup(env, rep, logc, lsn)
612	ENV *env;
613	REP *rep;
614	DB_LOGC *logc;
615	DB_LSN *lsn;
616{
617	DBT mylog;
618	u_int32_t rectype;
619	int ret;
620
621	ret = 0;
622	memset(&mylog, 0, sizeof(mylog));
623	while ((ret = __logc_get(logc, lsn, &mylog, DB_PREV)) == 0) {
624		/*
625		 * Determine what we look for based on version number.
626		 * Due to the contents of records changing between
627		 * versions we have to match based on criteria of that
628		 * particular version.
629		 */
630		LOGCOPY_32(env, &rectype, mylog.data);
631		/*
632		 * In 4.4 and beyond we match checkpoint and commit.
633		 */
634		if (rep->version >= DB_REPVERSION_44 &&
635		    (rectype == DB___txn_ckp || rectype == DB___txn_regop))
636			break;
637	}
638	return (ret);
639}
640