1/*-
2 * See the file LICENSE for redistribution information.
3 *
4 * Copyright (c) 2005,2008 Oracle.  All rights reserved.
5 *
6 * $Id: repmgr_elect.c,v 1.41 2008/03/13 17:31:28 mbrey Exp $
7 */
8
9#include "db_config.h"
10
11#define	__INCLUDE_NETWORKING	1
12#include "db_int.h"
13
14static int __repmgr_is_ready __P((ENV *));
15static int __repmgr_elect_main __P((ENV *));
16static void *__repmgr_elect_thread __P((void *));
17static int start_election_thread __P((ENV *));
18
19/*
20 * Starts the election thread, or wakes up an existing one, starting off with
21 * the specified operation (an election, or a call to rep_start(CLIENT), or
22 * nothing).  Avoid multiple concurrent elections.
23 *
24 * PUBLIC: int __repmgr_init_election __P((ENV *, int));
25 *
26 * !!!
27 * Caller must hold mutex.
28 */
29int
30__repmgr_init_election(env, initial_operation)
31	ENV *env;
32	int initial_operation;
33{
34	DB_REP *db_rep;
35	int ret;
36
37	db_rep = env->rep_handle;
38	if (db_rep->finished) {
39		RPRINT(env, DB_VERB_REPMGR_MISC, (env,
40		    "ignoring elect thread request %d; repmgr is finished",
41		    initial_operation));
42		return (0);
43	}
44
45	db_rep->operation_needed = initial_operation;
46	if (db_rep->elect_thread == NULL)
47		ret = start_election_thread(env);
48	else if (db_rep->elect_thread->finished) {
49		RPRINT(env, DB_VERB_REPMGR_MISC,
50		    (env, "join dead elect thread"));
51		if ((ret = __repmgr_thread_join(db_rep->elect_thread)) != 0)
52			return (ret);
53		__os_free(env, db_rep->elect_thread);
54		db_rep->elect_thread = NULL;
55		ret = start_election_thread(env);
56	} else {
57		RPRINT(env, DB_VERB_REPMGR_MISC,
58		    (env, "reusing existing elect thread"));
59		if ((ret = __repmgr_signal(&db_rep->check_election)) != 0)
60			__db_err(env, ret, "can't signal election thread");
61	}
62	return (ret);
63}
64
65/*
66 * !!!
67 * Caller holds mutex.
68 */
69static int
70start_election_thread(env)
71	ENV *env;
72{
73	DB_REP *db_rep;
74	REPMGR_RUNNABLE *elector;
75	int ret;
76
77	db_rep = env->rep_handle;
78
79	if ((ret = __os_malloc(env, sizeof(REPMGR_RUNNABLE), &elector))
80	    != 0)
81		return (ret);
82	elector->env = env;
83	elector->run = __repmgr_elect_thread;
84
85	if ((ret = __repmgr_thread_start(env, elector)) == 0)
86		db_rep->elect_thread = elector;
87	else
88		__os_free(env, elector);
89
90	return (ret);
91}
92
93static void *
94__repmgr_elect_thread(args)
95	void *args;
96{
97	ENV *env = args;
98	int ret;
99
100	RPRINT(env, DB_VERB_REPMGR_MISC, (env, "starting election thread"));
101
102	if ((ret = __repmgr_elect_main(env)) != 0) {
103		__db_err(env, ret, "election thread failed");
104		__repmgr_thread_failure(env, ret);
105	}
106
107	RPRINT(env, DB_VERB_REPMGR_MISC, (env, "election thread is exiting"));
108	return (NULL);
109}
110
111static int
112__repmgr_elect_main(env)
113	ENV *env;
114{
115	DBT my_addr;
116	DB_ENV *dbenv;
117	DB_REP *db_rep;
118#ifdef DB_WIN32
119	DWORD duration;
120#else
121	struct timespec deadline;
122#endif
123	u_int32_t nsites, nvotes;
124	int done, failure_recovery, last_op;
125	int need_success, ret, succeeded, to_do;
126
127	COMPQUIET(need_success, TRUE);
128
129	dbenv = env->dbenv;
130	db_rep = env->rep_handle;
131	last_op = 0;
132	failure_recovery = succeeded = FALSE;
133
134	/*
135	 * db_rep->operation_needed is the mechanism by which the outside world
136	 * (running in a different thread) tells us what it wants us to do.  It
137	 * is obviously relevant when we're just starting up.  But it can also
138	 * be set if a subsequent request for us to do something occurs while
139	 * we're still looping.
140	 *
141	 * ELECT_FAILURE_ELECTION asks us to start by doing an election, but to
142	 * do so in failure recovery mode.  This failure recovery mode may
143	 * persist through several loop iterations: as long as it takes us to
144	 * succeed in finding a master, or until we get asked to perform a new
145	 * request.  Thus the time for mapping ELECT_FAILURE_ELECTION to the
146	 * internal ELECT_ELECTION, as well as the setting of the failure
147	 * recovery flag, is at the point we receive the new request from
148	 * operation_needed (either here, or within the loop below).
149	 */
150	LOCK_MUTEX(db_rep->mutex);
151	if (db_rep->finished) {
152		db_rep->elect_thread->finished = TRUE;
153		UNLOCK_MUTEX(db_rep->mutex);
154		return (0);
155	}
156	to_do = db_rep->operation_needed;
157	db_rep->operation_needed = 0;
158	UNLOCK_MUTEX(db_rep->mutex);
159
160	/*
161	 * The way we are invoked determines the criterion for completion (which
162	 * is represented as "need_success"): if we've been asked to do an
163	 * election, we're only "done" when an election has actually succeeded.
164	 * If we're just here trying to find the master initially, then merely
165	 * getting a valid master_eid suffices.
166	 */
167	switch (to_do) {
168	case ELECT_FAILURE_ELECTION:
169		failure_recovery = TRUE;
170		to_do = ELECT_ELECTION;
171		/* FALLTHROUGH */
172	case ELECT_ELECTION:
173		need_success = TRUE;
174		break;
175	case ELECT_SEEK_MASTER:
176		to_do = 0;	/* Caller has already called rep_start. */
177		/* FALLTHROUGH */
178	case ELECT_REPSTART:
179		need_success = FALSE;
180		break;
181	default:
182		DB_ASSERT(env, FALSE);
183	}
184	/* Here, need_success has been initialized. */
185
186	for (;;) {
187		RPRINT(env, DB_VERB_REPMGR_MISC,
188		    (env, "elect thread to do: %d", to_do));
189		switch (to_do) {
190		case ELECT_ELECTION:
191			nsites = __repmgr_get_nsites(db_rep);
192			/*
193			 * With only 2 sites in the group, even a single failure
194			 * could make it impossible to get a majority.  So,
195			 * fudge a little, unless the user really wants strict
196			 * safety.
197			 */
198			if (nsites == 2 &&
199			    !FLD_ISSET(db_rep->region->config,
200			    REP_C_2SITE_STRICT))
201				nvotes = 1;
202			else
203				nvotes = ELECTION_MAJORITY(nsites);
204
205			/*
206			 * If we're doing an election because we noticed that
207			 * the master failed, it's reasonable to expect that the
208			 * master won't participate.  By not waiting for its
209			 * vote, we can probably complete the election faster.
210			 * But note that we shouldn't allow this to affect
211			 * nvotes calculation.
212			 *
213			 * However, if we have 2 sites, and strict majority is
214			 * turned on, now nvotes would be 2, and it doesn't make
215			 * sense to rep_elect to see nsites of 1 in that case.
216			 * So only decrement nsites if it currently exceeds
217			 * nvotes.
218			 */
219			if (failure_recovery && nsites > nvotes)
220				nsites--;
221
222			switch (ret =
223			    __rep_elect(dbenv, nsites, nvotes, 0)) {
224			case DB_REP_UNAVAIL:
225				break;
226
227			case 0:
228				succeeded = TRUE;
229				if (db_rep->takeover_pending) {
230					db_rep->takeover_pending = FALSE;
231					if ((ret =
232					    __repmgr_become_master(env)) != 0)
233						return (ret);
234				}
235				break;
236
237			default:
238				__db_err(
239				    env, ret, "unexpected election failure");
240				return (ret);
241			}
242			last_op = ELECT_ELECTION;
243			break;
244		case ELECT_REPSTART:
245			if ((ret =
246			    __repmgr_prepare_my_addr(env, &my_addr)) != 0)
247				return (ret);
248			ret = __rep_start(dbenv, &my_addr, DB_REP_CLIENT);
249			__os_free(env, my_addr.data);
250			if (ret != 0) {
251				__db_err(env, ret, "rep_start");
252				return (ret);
253			}
254			last_op = ELECT_REPSTART;
255			break;
256		case 0:
257			/*
258			 * Nothing to do: this can happen the first time
259			 * through, on initialization.
260			 */
261			last_op = 0;
262			break;
263		default:
264			DB_ASSERT(env, FALSE);
265		}
266
267		/*
268		 * Only the first election after a crashed master should be
269		 * "fast".  If that election fails and we have to retry, the
270		 * crashed master may have rebooted in the interim.
271		 */
272		failure_recovery = FALSE;
273
274		LOCK_MUTEX(db_rep->mutex);
275		while (!succeeded && !__repmgr_is_ready(env)) {
276#ifdef DB_WIN32
277			duration = db_rep->election_retry_wait / US_PER_MS;
278			ret = SignalObjectAndWait(db_rep->mutex,
279			    db_rep->check_election, duration, FALSE);
280			LOCK_MUTEX(db_rep->mutex);
281			if (ret == WAIT_TIMEOUT)
282				break;
283			DB_ASSERT(env, ret == WAIT_OBJECT_0);
284#else
285			__repmgr_compute_wait_deadline(env, &deadline,
286			    db_rep->election_retry_wait);
287			if ((ret = pthread_cond_timedwait(
288			    &db_rep->check_election, &db_rep->mutex, &deadline))
289			    == ETIMEDOUT)
290				break;
291			DB_ASSERT(env, ret == 0);
292#endif
293		}
294
295		/*
296		 * Ways we can get here: election succeeded, sleep duration
297		 * expired, "operation needed", or thread shut-down command.
298		 *
299		 * If we're not yet done, figure out what to do next (which may
300		 * be trivially easy if we've been told explicitly, via the
301		 * "operation needed" flag).  We must first check if we've been
302		 * told to do a specific operation, because that could make our
303		 * completion criterion more stringent.  Note that we never
304		 * lessen our completion criterion (i.e., unlike the initial
305		 * case, we may leave need_success untouched here).
306		 */
307		done = FALSE;
308		if ((to_do = db_rep->operation_needed) != 0) {
309			db_rep->operation_needed = 0;
310			switch (to_do) {
311			case ELECT_FAILURE_ELECTION:
312				failure_recovery = TRUE;
313				to_do = ELECT_ELECTION;
314				/* FALLTHROUGH */
315			case ELECT_ELECTION:
316				need_success = TRUE;
317				break;
318			case ELECT_SEEK_MASTER:
319				to_do = 0;
320				break;
321			default:
322				break;
323			}
324		} else if ((done = (succeeded ||
325		    (!need_success && IS_VALID_EID(db_rep->master_eid)) ||
326		    db_rep->finished)))
327			db_rep->elect_thread->finished = TRUE;
328		else {
329			if (last_op == ELECT_ELECTION)
330				to_do = ELECT_REPSTART;
331			else {
332				/*
333				 * Generally, if what we previously did is a
334				 * rep_start (or nothing, which really just
335				 * means another thread did the rep_start before
336				 * turning us on), then we next do an election.
337				 * However, with the REP_CLIENT init policy we
338				 * never do an initial election.
339				 */
340				to_do = ELECT_ELECTION;
341				if (db_rep->init_policy == DB_REP_CLIENT &&
342				    !db_rep->found_master)
343					to_do = ELECT_REPSTART;
344			}
345		}
346
347		UNLOCK_MUTEX(db_rep->mutex);
348		if (done)
349			return (0);
350	}
351}
352
353/*
354 * Tests whether another thread has signalled for our attention.
355 */
356static int
357__repmgr_is_ready(env)
358	ENV *env;
359{
360	DB_REP *db_rep;
361
362	db_rep = env->rep_handle;
363
364	RPRINT(env, DB_VERB_REPMGR_MISC, (env,
365	    "repmgr elect: opcode %d, finished %d, master %d",
366	    db_rep->operation_needed, db_rep->finished, db_rep->master_eid));
367
368	return (db_rep->operation_needed || db_rep->finished);
369}
370
371/*
372 * PUBLIC: int __repmgr_become_master __P((ENV *));
373 */
374int
375__repmgr_become_master(env)
376	ENV *env;
377{
378	DBT my_addr;
379	DB_ENV *dbenv;
380	DB_REP *db_rep;
381	int ret;
382
383	dbenv = env->dbenv;
384	db_rep = env->rep_handle;
385	db_rep->master_eid = SELF_EID;
386	db_rep->found_master = TRUE;
387
388	/*
389	 * At the moment, it's useless to pass my address to rep_start here,
390	 * because rep_start ignores it in the case of MASTER.  So we could
391	 * avoid the trouble of allocating and freeing this memory.  But might
392	 * this conceivably change in the future?
393	 */
394	if ((ret = __repmgr_prepare_my_addr(env, &my_addr)) != 0)
395		return (ret);
396	ret = __rep_start(dbenv, &my_addr, DB_REP_MASTER);
397	__os_free(env, my_addr.data);
398	if (ret == 0)
399		__repmgr_stash_generation(env);
400
401	return (ret);
402}
403