1/*-
2 * See the file LICENSE for redistribution information.
3 *
4 * Copyright (c) 2005-2009 Oracle.  All rights reserved.
5 *
6 * $Id$
7 */
8
9#include "db_config.h"
10
11#define	__INCLUDE_NETWORKING	1
12#include "db_int.h"
13
14static int __repmgr_is_ready __P((ENV *));
15static int __repmgr_elect_main __P((ENV *));
16static void *__repmgr_elect_thread __P((void *));
17static int start_election_thread __P((ENV *));
18
19/*
20 * Starts the election thread, or wakes up an existing one, starting off with
21 * the specified operation (an election, or a call to rep_start(CLIENT), or
22 * nothing).  Avoid multiple concurrent elections.
23 *
24 * PUBLIC: int __repmgr_init_election __P((ENV *, int));
25 *
26 * !!!
27 * Caller must hold mutex.
28 */
29int
30__repmgr_init_election(env, initial_operation)
31	ENV *env;
32	int initial_operation;
33{
34	DB_REP *db_rep;
35	int ret;
36
37	db_rep = env->rep_handle;
38	if (db_rep->finished) {
39		RPRINT(env, DB_VERB_REPMGR_MISC, (env,
40		    "ignoring elect thread request %d; repmgr is finished",
41		    initial_operation));
42		return (0);
43	}
44
45	db_rep->operation_needed = initial_operation;
46	if (db_rep->elect_thread == NULL)
47		ret = start_election_thread(env);
48	else if (db_rep->elect_thread->finished) {
49		RPRINT(env, DB_VERB_REPMGR_MISC,
50		    (env, "join dead elect thread"));
51		if ((ret = __repmgr_thread_join(db_rep->elect_thread)) != 0)
52			return (ret);
53		__os_free(env, db_rep->elect_thread);
54		db_rep->elect_thread = NULL;
55		ret = start_election_thread(env);
56	} else {
57		RPRINT(env, DB_VERB_REPMGR_MISC,
58		    (env, "reusing existing elect thread"));
59		if ((ret = __repmgr_signal(&db_rep->check_election)) != 0)
60			__db_err(env, ret, "can't signal election thread");
61	}
62	return (ret);
63}
64
65/*
66 * !!!
67 * Caller holds mutex.
68 */
69static int
70start_election_thread(env)
71	ENV *env;
72{
73	DB_REP *db_rep;
74	REPMGR_RUNNABLE *elector;
75	int ret;
76
77	db_rep = env->rep_handle;
78
79	if ((ret = __os_malloc(env, sizeof(REPMGR_RUNNABLE), &elector))
80	    != 0)
81		return (ret);
82	elector->env = env;
83	elector->run = __repmgr_elect_thread;
84
85	if ((ret = __repmgr_thread_start(env, elector)) == 0)
86		db_rep->elect_thread = elector;
87	else
88		__os_free(env, elector);
89
90	return (ret);
91}
92
93static void *
94__repmgr_elect_thread(args)
95	void *args;
96{
97	ENV *env = args;
98	int ret;
99
100	RPRINT(env, DB_VERB_REPMGR_MISC, (env, "starting election thread"));
101
102	if ((ret = __repmgr_elect_main(env)) != 0) {
103		__db_err(env, ret, "election thread failed");
104		__repmgr_thread_failure(env, ret);
105	}
106
107	RPRINT(env, DB_VERB_REPMGR_MISC, (env, "election thread is exiting"));
108	return (NULL);
109}
110
111static int
112__repmgr_elect_main(env)
113	ENV *env;
114{
115	DBT my_addr;
116	DB_REP *db_rep;
117#ifdef DB_WIN32
118	DWORD duration;
119#else
120	struct timespec deadline;
121#endif
122	u_int32_t nsites, nvotes;
123	int done, failure_recovery, last_op;
124	int need_success, ret, succeeded, to_do;
125
126	COMPQUIET(need_success, TRUE);
127
128	db_rep = env->rep_handle;
129	last_op = 0;
130	failure_recovery = succeeded = FALSE;
131
132	/*
133	 * db_rep->operation_needed is the mechanism by which the outside world
134	 * (running in a different thread) tells us what it wants us to do.  It
135	 * is obviously relevant when we're just starting up.  But it can also
136	 * be set if a subsequent request for us to do something occurs while
137	 * we're still looping.
138	 *
139	 * ELECT_FAILURE_ELECTION asks us to start by doing an election, but to
140	 * do so in failure recovery mode.  This failure recovery mode may
141	 * persist through several loop iterations: as long as it takes us to
142	 * succeed in finding a master, or until we get asked to perform a new
143	 * request.  Thus the time for mapping ELECT_FAILURE_ELECTION to the
144	 * internal ELECT_ELECTION, as well as the setting of the failure
145	 * recovery flag, is at the point we receive the new request from
146	 * operation_needed (either here, or within the loop below).
147	 */
148	LOCK_MUTEX(db_rep->mutex);
149	if (db_rep->finished) {
150		db_rep->elect_thread->finished = TRUE;
151		UNLOCK_MUTEX(db_rep->mutex);
152		return (0);
153	}
154	to_do = db_rep->operation_needed;
155	db_rep->operation_needed = 0;
156	UNLOCK_MUTEX(db_rep->mutex);
157
158	/*
159	 * The way we are invoked determines the criterion for completion (which
160	 * is represented as "need_success"): if we've been asked to do an
161	 * election, we're only "done" when an election has actually succeeded.
162	 * If we're just here trying to find the master initially, then merely
163	 * getting a valid master_eid suffices.
164	 */
165	switch (to_do) {
166	case ELECT_FAILURE_ELECTION:
167		failure_recovery = TRUE;
168		to_do = ELECT_ELECTION;
169		/* FALLTHROUGH */
170	case ELECT_ELECTION:
171		need_success = TRUE;
172		break;
173	case ELECT_REPSTART:
174		need_success = FALSE;
175		break;
176	default:
177		DB_ASSERT(env, FALSE);
178	}
179	/* Here, need_success has been initialized. */
180
181	for (;;) {
182		RPRINT(env, DB_VERB_REPMGR_MISC,
183		    (env, "elect thread to do: %d", to_do));
184		switch (to_do) {
185		case ELECT_ELECTION:
186			nsites = __repmgr_get_nsites(db_rep);
187			/*
188			 * With only 2 sites in the group, even a single failure
189			 * could make it impossible to get a majority.  So,
190			 * fudge a little, unless the user really wants strict
191			 * safety.
192			 */
193			if (nsites == 2 &&
194			    !FLD_ISSET(db_rep->region->config,
195			    REP_C_2SITE_STRICT))
196				nvotes = 1;
197			else
198				nvotes = ELECTION_MAJORITY(nsites);
199
200			/*
201			 * If we're doing an election because we noticed that
202			 * the master failed, it's reasonable to expect that the
203			 * master won't participate.  By not waiting for its
204			 * vote, we can probably complete the election faster.
205			 * But note that we shouldn't allow this to affect
206			 * nvotes calculation.
207			 *
208			 * However, if we have 2 sites, and strict majority is
209			 * turned on, now nvotes would be 2, and it doesn't make
210			 * sense to rep_elect to see nsites of 1 in that case.
211			 * So only decrement nsites if it currently exceeds
212			 * nvotes.
213			 */
214			if (failure_recovery && nsites > nvotes)
215				nsites--;
216
217			if (IS_USING_LEASES(env))
218				nsites = 0;
219
220			switch (ret =
221			    __rep_elect_int(env, nsites, nvotes, 0)) {
222			case DB_REP_UNAVAIL:
223				break;
224
225			case 0:
226				succeeded = TRUE;
227				if (db_rep->takeover_pending) {
228					db_rep->takeover_pending = FALSE;
229					if ((ret =
230					    __repmgr_become_master(env)) != 0)
231						return (ret);
232				}
233				break;
234
235			default:
236				__db_err(
237				    env, ret, "unexpected election failure");
238				return (ret);
239			}
240			last_op = ELECT_ELECTION;
241			break;
242		case ELECT_REPSTART:
243			if ((ret =
244			    __repmgr_prepare_my_addr(env, &my_addr)) != 0)
245				return (ret);
246			ret = __rep_start_int(env, &my_addr, DB_REP_CLIENT);
247			__os_free(env, my_addr.data);
248			if (ret != 0) {
249				__db_err(env, ret, "rep_start");
250				return (ret);
251			}
252			last_op = ELECT_REPSTART;
253			break;
254		case 0:
255			/*
256			 * Nothing to do: this can happen the first time
257			 * through, on initialization.
258			 */
259			last_op = 0;
260			break;
261		default:
262			DB_ASSERT(env, FALSE);
263		}
264
265		/*
266		 * Only the first election after a crashed master should be
267		 * "fast".  If that election fails and we have to retry, the
268		 * crashed master may have rebooted in the interim.
269		 */
270		failure_recovery = FALSE;
271
272		LOCK_MUTEX(db_rep->mutex);
273		while (!succeeded && !__repmgr_is_ready(env)) {
274#ifdef DB_WIN32
275			duration = db_rep->election_retry_wait / US_PER_MS;
276			ret = SignalObjectAndWait(*db_rep->mutex,
277			    db_rep->check_election, duration, FALSE);
278			LOCK_MUTEX(db_rep->mutex);
279			if (ret == WAIT_TIMEOUT)
280				break;
281			DB_ASSERT(env, ret == WAIT_OBJECT_0);
282#else
283			__repmgr_compute_wait_deadline(env, &deadline,
284			    db_rep->election_retry_wait);
285			if ((ret = pthread_cond_timedwait(
286			    &db_rep->check_election, db_rep->mutex, &deadline))
287			    == ETIMEDOUT)
288				break;
289			DB_ASSERT(env, ret == 0);
290#endif
291		}
292
293		/*
294		 * Ways we can get here: election succeeded, sleep duration
295		 * expired, "operation needed", or thread shut-down command.
296		 *
297		 * If we're not yet done, figure out what to do next (which may
298		 * be trivially easy if we've been told explicitly, via the
299		 * "operation needed" flag).  We must first check if we've been
300		 * told to do a specific operation, because that could make our
301		 * completion criterion more stringent.  Note that we never
302		 * lessen our completion criterion (i.e., unlike the initial
303		 * case, we may leave need_success untouched here).
304		 */
305		done = FALSE;
306		if ((to_do = db_rep->operation_needed) != 0) {
307			db_rep->operation_needed = 0;
308			switch (to_do) {
309			case ELECT_FAILURE_ELECTION:
310				failure_recovery = TRUE;
311				to_do = ELECT_ELECTION;
312				/* FALLTHROUGH */
313			case ELECT_ELECTION:
314				need_success = TRUE;
315				break;
316			default:
317				break;
318			}
319		} else if ((done = (succeeded ||
320		    (!need_success && IS_VALID_EID(db_rep->master_eid)) ||
321		    db_rep->finished)))
322			db_rep->elect_thread->finished = TRUE;
323		else {
324			if (last_op == ELECT_ELECTION)
325				to_do = ELECT_REPSTART;
326			else {
327				/*
328				 * Generally, if what we previously did is a
329				 * rep_start (or nothing, which really just
330				 * means another thread did the rep_start before
331				 * turning us on), then we next do an election.
332				 * However, with the REP_CLIENT init policy we
333				 * never do an initial election.
334				 */
335				to_do = ELECT_ELECTION;
336				if (db_rep->init_policy == DB_REP_CLIENT &&
337				    !db_rep->found_master)
338					to_do = ELECT_REPSTART;
339			}
340		}
341
342		UNLOCK_MUTEX(db_rep->mutex);
343		if (done)
344			return (0);
345	}
346}
347
348/*
349 * Tests whether another thread has signalled for our attention.
350 */
351static int
352__repmgr_is_ready(env)
353	ENV *env;
354{
355	DB_REP *db_rep;
356
357	db_rep = env->rep_handle;
358
359	RPRINT(env, DB_VERB_REPMGR_MISC, (env,
360	    "repmgr elect: opcode %d, finished %d, master %d",
361	    db_rep->operation_needed, db_rep->finished, db_rep->master_eid));
362
363	return (db_rep->operation_needed || db_rep->finished);
364}
365
366/*
367 * PUBLIC: int __repmgr_become_master __P((ENV *));
368 */
369int
370__repmgr_become_master(env)
371	ENV *env;
372{
373	DBT my_addr;
374	DB_REP *db_rep;
375	int ret;
376
377	db_rep = env->rep_handle;
378
379	/*
380	 * At the moment, it's useless to pass my address to rep_start here,
381	 * because rep_start ignores it in the case of MASTER.  So we could
382	 * avoid the trouble of allocating and freeing this memory.  But might
383	 * this conceivably change in the future?
384	 */
385	if ((ret = __repmgr_prepare_my_addr(env, &my_addr)) != 0)
386		return (ret);
387	ret = __rep_start_int(env, &my_addr, DB_REP_MASTER);
388	__os_free(env, my_addr.data);
389
390	if (ret == 0) {
391		db_rep->master_eid = SELF_EID;
392		db_rep->found_master = TRUE;
393	}
394	return (ret);
395}
396