/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 2001,2008 Oracle.  All rights reserved.
 *
 * $Id: rep.h,v 12.111 2008/05/02 15:19:43 sue Exp $
 */
8
9#ifndef _DB_REP_H_
10#define	_DB_REP_H_
11
12#include "dbinc_auto/rep_auto.h"
13
14#if defined(__cplusplus)
15extern "C" {
16#endif
17
/*
 * Names of client temp databases.
 */
#define	REPDBNAME	"__db.rep.db"
#define	REPPAGENAME	"__db.reppg.db"
23
/*
 * Message types
 *
 * The values are contiguous, starting at 0 and ending at REP_VOTE2.
 */
#define	REP_INVALID	0	/* Invalid message type. */
#define	REP_ALIVE	1	/* I am alive message. */
#define	REP_ALIVE_REQ	2	/* Request for alive messages. */
#define	REP_ALL_REQ	3	/* Request all log records greater than LSN. */
#define	REP_BULK_LOG	4	/* Bulk transfer of log records. */
#define	REP_BULK_PAGE	5	/* Bulk transfer of pages. */
#define	REP_DUPMASTER	6	/* Duplicate master detected; propagate. */
#define	REP_FILE	7	/* Page of a database file. NOTUSED */
#define	REP_FILE_FAIL	8	/* File requested does not exist. */
#define	REP_FILE_REQ	9	/* Request for a database file. NOTUSED */
#define	REP_LEASE_GRANT	10	/* Client grants a lease to a master. */
#define	REP_LOG		11	/* Log record. */
#define	REP_LOG_MORE	12	/* There are more log records to request. */
#define	REP_LOG_REQ	13	/* Request for a log record. */
#define	REP_MASTER_REQ	14	/* Who is the master? */
#define	REP_NEWCLIENT	15	/* Announces the presence of a new client. */
#define	REP_NEWFILE	16	/* Announce a log file change. */
#define	REP_NEWMASTER	17	/* Announces who the master is. */
#define	REP_NEWSITE	18	/* Announces that a site has heard from a new
				 * site; like NEWCLIENT, but indirect.  A
				 * NEWCLIENT message comes directly from the new
				 * client while a NEWSITE comes indirectly from
				 * someone who heard about a NEWSITE.
				 */
#define	REP_PAGE	19	/* Database page. */
#define	REP_PAGE_FAIL	20	/* Requested page does not exist. */
#define	REP_PAGE_MORE	21	/* There are more pages to request. */
#define	REP_PAGE_REQ	22	/* Request for a database page. */
#define	REP_REREQUEST	23	/* Force rerequest. */
#define	REP_START_SYNC	24	/* Tell client to begin syncing a ckp. */
#define	REP_UPDATE	25	/* Environment hotcopy information. */
#define	REP_UPDATE_REQ	26	/* Request for hotcopy information. */
#define	REP_VERIFY	27	/* A log record for verification. */
#define	REP_VERIFY_FAIL	28	/* The client is outdated. */
#define	REP_VERIFY_REQ	29	/* Request for a log record to verify. */
#define	REP_VOTE1	30	/* Send out your information for an election. */
#define	REP_VOTE2	31	/* Send a "you are master" vote. */
/*
 * Maximum message number for conversion tables.  Update this
 * value as the largest message number above increases.
 *
 * !!!
 * NOTE: When changing messages above, the two tables for upgrade support
 * need adjusting.  They are in rep_util.c.
 */
#define	REP_MAX_MSG	31
73
/*
 * This is the list of client-to-client request messages.
 * We use this to decide if we're doing client-to-client and
 * might need to send a rerequest.
 *
 * Evaluates to non-zero when rectype is one of REP_ALL_REQ, REP_LOG_REQ,
 * REP_PAGE_REQ or REP_VERIFY_REQ.  The argument is parenthesized so that
 * an expression argument (e.g. "a | b") is compared as a whole; note that
 * it may still be evaluated up to four times, so it must be free of side
 * effects.
 */
#define	REP_MSG_REQ(rectype)			\
    ((rectype) == REP_ALL_REQ ||		\
    (rectype) == REP_LOG_REQ ||			\
    (rectype) == REP_PAGE_REQ ||		\
    (rectype) == REP_VERIFY_REQ)
84
/*
 * Note that the version information should be at the beginning of the
 * structure, so that we can rearrange the rest of it while letting the
 * version checks continue to work.  DB_REPVERSION should be revved any time
 * the rest of the structure changes or when the message numbers change.
 *
 * Define also, the corresponding log versions that are tied to the
 * replication/release versions.  These are only used in replication
 * and that is why they're defined here.
 */
#define	DB_LOGVERSION_42	8
#define	DB_LOGVERSION_43	10
#define	DB_LOGVERSION_44	11
#define	DB_LOGVERSION_45	12
#define	DB_LOGVERSION_46	13
#define	DB_LOGVERSION_47	14
#define	DB_LOGVERSION_MIN	DB_LOGVERSION_44
#define	DB_REPVERSION_INVALID	0
#define	DB_REPVERSION_44	3
#define	DB_REPVERSION_45	3
#define	DB_REPVERSION_46	4
#define	DB_REPVERSION_47	5
#define	DB_REPVERSION		DB_REPVERSION_47
#define	DB_REPVERSION_MIN	DB_REPVERSION_44
109
/*
 * RPRINT
 * REP_PRINT_MESSAGE
 *	Macros for verbose replication messages.
 *
 * RPRINT(env, verbose_category, x):
 *	Tests (env)->dbenv->verbose with FLD_ISSET against verbose_category
 *	combined with DB_VERB_REPLICATION and, if the test succeeds, invokes
 *	__rep_print.  The argument x is pasted directly after the function
 *	name, so it must be a complete, parenthesized argument list.
 */
#define	RPRINT(env, verbose_category, x) do {				\
	if (FLD_ISSET((env)->dbenv->verbose,				\
	    (verbose_category) | DB_VERB_REPLICATION)) {		\
		__rep_print x;						\
	}								\
} while (0)
/*
 * REP_PRINT_MESSAGE(env, eid, rp, str, fl):
 *	Tests (env)->dbenv->verbose with FLD_ISSET against
 *	DB_VERB_REP_MSGS | DB_VERB_REPLICATION and, if the test succeeds,
 *	forwards all five arguments unchanged to __rep_print_message.
 */
#define	REP_PRINT_MESSAGE(env, eid, rp, str, fl) do {			\
	if (FLD_ISSET((env)->dbenv->verbose,				\
	    DB_VERB_REP_MSGS | DB_VERB_REPLICATION)) {			\
		__rep_print_message(env, eid, rp, str, fl);		\
	}								\
} while (0)
127
/*
 * Election gen file name
 * The file contains an egen number for an election this client has NOT
 * participated in.  I.e. it is the number of a future election.  We
 * create it when we create the rep region, if it doesn't already exist
 * and initialize egen to 1.  If it does exist, we read it when we create
 * the rep region.  We write it immediately before sending our VOTE1 in
 * an election.  That way, if a client has ever sent a vote for any
 * election, the file is already going to be updated to reflect a future
 * election, should it crash.
 */
#define	REP_EGENNAME	"__db.rep.egen"
#define	REP_GENNAME	"__db.rep.gen"
141
/*
 * Internal init flag file name:
 * The existence of this file serves as an indication that the client is in the
 * process of Internal Initialization, in case it crashes before completing.
 * During internal init the client's partially reconstructed database pages and
 * logs may be in an inconsistent state, so much so that running recovery must
 * be avoided.  Furthermore, there is no other way to reliably recognize this
 * condition.  Therefore, when we open an environment, and we're just about to
 * run recovery, we check for this file first.  If it exists we must discard all
 * logs and databases.  This avoids the recovery problems, and leads to a fresh
 * attempt at internal init if the environment becomes a replication client and
 * finds a master.  The list of databases which may need to be removed is stored
 * in this file.
 *
 * REP_INITVERSION is the current file version; it is expressed in terms of
 * the per-release constant, in the same way DB_REPVERSION is.
 */
#define	REP_INITNAME	"__db.rep.init"
#define	REP_INITVERSION_46	1
#define	REP_INITVERSION_47	2
#define	REP_INITVERSION		REP_INITVERSION_47
160
161
/*
 * Database types for __rep_client_dbinit
 */
typedef enum {
	REP_DB,		/* Log record database. */
	REP_PG		/* Pg database. */
} repdb_t;
169
/*
 * Macros to lock/unlock the replication region as a whole.  Both operate on
 * the mtx_region mutex of (env)->rep_handle->region.
 */
#define	REP_SYSTEM_LOCK(env)						\
	MUTEX_LOCK(env, (env)->rep_handle->region->mtx_region)
#define	REP_SYSTEM_UNLOCK(env)						\
	MUTEX_UNLOCK(env, (env)->rep_handle->region->mtx_region)
175
/*
 * Macros for manipulating the event synchronization.  We use a separate mutex
 * so that an application's call-back function can be invoked without locking
 * the whole region.  Both operate on the mtx_event mutex of
 * (env)->rep_handle->region.
 */
#define	REP_EVENT_LOCK(env)						\
	MUTEX_LOCK(env, (env)->rep_handle->region->mtx_event)
#define	REP_EVENT_UNLOCK(env)						\
	MUTEX_UNLOCK(env, (env)->rep_handle->region->mtx_event)
185
186/*
187 * REP --
188 * Shared replication structure.
189 */
190typedef struct __rep {
191	db_mutex_t	mtx_region;	/* Region mutex. */
192	db_mutex_t	mtx_clientdb;	/* Client database mutex. */
193	db_mutex_t	mtx_ckp;	/* Checkpoint mutex. */
194	roff_t		lease_off;	/* Offset of the lease table. */
195	roff_t		tally_off;	/* Offset of the tally region. */
196	roff_t		v2tally_off;	/* Offset of the vote2 tally region. */
197	int		eid;		/* Environment id. */
198	int		master_id;	/* ID of the master site. */
199	u_int32_t	version;	/* Current replication version. */
200	u_int32_t	egen;		/* Replication election generation. */
201	u_int32_t	gen;		/* Replication generation number. */
202	u_int32_t	recover_gen;	/* Last generation number in log. */
203	u_int32_t	asites;		/* Space allocated for sites. */
204	u_int32_t	nsites;		/* Number of sites in group. */
205	u_int32_t	nvotes;		/* Number of votes needed. */
206	u_int32_t	priority;	/* My priority in an election. */
207	u_int32_t	config_nsites;
208
209	db_timeout_t	elect_timeout;	/* Normal/full election timeouts. */
210	db_timeout_t	full_elect_timeout;
211
212	db_timeout_t	chkpt_delay;	/* Master checkpoint delay. */
213
214#define	REP_DEFAULT_THROTTLE	(10 * MEGABYTE) /* Default value is < 1Gig. */
215	u_int32_t	gbytes;		/* Limit on data sent in single... */
216	u_int32_t	bytes;		/* __rep_process_message call. */
217#define	DB_REP_REQUEST_GAP	40000	/* 40 msecs */
218#define	DB_REP_MAX_GAP		1280000	/* 1.28 seconds */
219	db_timespec	request_gap;	/* Minimum time to wait before we
220					 * request a missing log record. */
221	db_timespec	max_gap;	/* Maximum time to wait before
222					 * requesting a missing log record. */
223	/* Status change information */
224	u_int32_t	apply_th;	/* Number of callers in rep_apply. */
225	u_int32_t	msg_th;		/* Number of callers in rep_proc_msg.*/
226	u_int32_t	handle_cnt;	/* Count of handles in library. */
227	u_int32_t	op_cnt;		/* Multi-step operation count.*/
228	DB_LSN		ckp_lsn;	/* LSN for syncing a checkpoint. */
229	DB_LSN		max_prep_lsn;	/* Max LSN of txn_prepare record. */
230
231	/*
232	 * Event notification synchronization: the mtx_event and associate
233	 * fields which it protects govern event notification to the
234	 * application.  They form a guarantee that no matter how crazy the
235	 * thread scheduling gets, the application sees a sensible, orderly
236	 * progression of events.
237	 */
238	db_mutex_t	mtx_event;	/* Serializes event notification. */
239	/*
240	 * Latest generation whose NEWMASTER event the application has been
241	 * notified of.  Also serves to force STARTUPDONE to occur after
242	 * NEWMASTER.
243	 */
244	u_int32_t	newmaster_event_gen;
245	/*
246	 * Latest local victory of an election that the application has been
247	 * notified of, expressed as the election generation number.  This
248	 * ensures we notify the application exactly once when it wins an
249	 * election.
250	 */
251	u_int32_t	notified_egen;
252
253	/* Backup information. */
254	u_int32_t	nfiles;		/* Number of files we have info on. */
255	u_int32_t	curfile;	/* Cur file we're getting (0-based). */
256	__rep_fileinfo_args	*curinfo;	/* Current file info ptr. */
257	u_int8_t	*finfo;		/* Current file info buffer. */
258	u_int8_t	*nextinfo;	/* Next file info buffer. */
259	u_int8_t	*originfo;	/* Original file info buffer. */
260	u_int32_t	infolen;	/* Remaining length file info buffer. */
261	u_int32_t	originfolen;	/* Original length file info buffer. */
262	u_int32_t	infoversion;	/* Original file info version. */
263	DB_LSN		first_lsn;	/* Earliest LSN we need. */
264	u_int32_t	first_vers;	/* Log version of first log file. */
265	DB_LSN		last_lsn;	/* Latest LSN we need. */
266	db_pgno_t	ready_pg;	/* Next pg expected. */
267	db_pgno_t	waiting_pg;	/* First pg after gap. */
268	db_pgno_t	max_wait_pg;	/* Maximum pg requested. */
269	u_int32_t	npages;		/* Num of pages rcvd for this file. */
270	DB_MPOOLFILE	*file_mpf;	/* Mpoolfile for current database. */
271	DB		*file_dbp;	/* This file's page info. */
272	DBC		*queue_dbc;	/* Dbc for a queue file. */
273
274	/* Vote tallying information. */
275	u_int32_t	sites;		/* Sites heard from. */
276	int		winner;		/* Current winner EID. */
277	u_int32_t	w_priority;	/* Winner priority. */
278	u_int32_t	w_gen;		/* Winner generation. */
279	DB_LSN		w_lsn;		/* Winner LSN. */
280	u_int32_t	w_tiebreaker;	/* Winner tiebreaking value. */
281	u_int32_t	votes;		/* Number of votes for this site. */
282
283	db_timespec	etime;		/* Election start timestamp. */
284
285	/* Leases. */
286	db_timeout_t	lease_timeout;	/* Lease timeout. */
287	db_timespec	lease_duration;	/* Lease timeout with clock skew. */
288	u_int32_t	clock_skew;	/* Clock skew. */
289	u_int32_t	clock_base;	/* Clock scale factor base. */
290	db_timespec	grant_expire;	/* Local grant expiration time. */
291
292	/* Statistics. */
293	DB_REP_STAT	stat;
294#if defined(HAVE_REPLICATION_THREADS) && defined(HAVE_STATISTICS)
295	DB_REPMGR_STAT	mstat;
296#endif
297
298	/* Configuration. */
299#define	REP_C_2SITE_STRICT	0x00001		/* Don't cheat on elections. */
300#define	REP_C_BULK		0x00002		/* Bulk transfer. */
301#define	REP_C_DELAYCLIENT	0x00004		/* Delay client sync-up. */
302#define	REP_C_LEASE		0x00008		/* Leases configured. */
303#define	REP_C_NOAUTOINIT	0x00010		/* No auto initialization. */
304#define	REP_C_NOWAIT		0x00020		/* Immediate error return. */
305	u_int32_t	config;		/* Configuration flags. */
306
307#define	REP_F_CLIENT		0x00000001	/* Client replica. */
308#define	REP_F_DELAY		0x00000002	/* Delaying client sync-up. */
309#define	REP_F_EGENUPDATE	0x00000004	/* Egen updated by ALIVE msg. */
310#define	REP_F_EPHASE0		0x00000008	/* In phase 0 of election. */
311#define	REP_F_EPHASE1		0x00000010	/* In phase 1 of election. */
312#define	REP_F_EPHASE2		0x00000020	/* In phase 2 of election. */
313#define	REP_F_GROUP_ESTD	0x00000040	/* Rep group is established. */
314#define	REP_F_INREPELECT	0x00000080	/* Thread in rep_elect. */
315#define	REP_F_MASTER		0x00000100	/* Master replica. */
316#define	REP_F_MASTERELECT	0x00000200	/* Master elect. */
317#define	REP_F_NEWFILE		0x00000400	/* Newfile in progress. */
318#define	REP_F_NOARCHIVE		0x00000800	/* Rep blocks log_archive. */
319#define	REP_F_READY_API		0x00001000	/* Need handle_cnt to be 0. */
320#define	REP_F_READY_APPLY	0x00002000	/* Need apply_th to be 0. */
321#define	REP_F_READY_MSG		0x00004000	/* Need msg_th to be 0. */
322#define	REP_F_READY_OP		0x00008000	/* Need op_cnt to be 0. */
323#define	REP_F_RECOVER_LOG	0x00010000	/* In recovery - log. */
324#define	REP_F_RECOVER_PAGE	0x00020000	/* In recovery - pages. */
325#define	REP_F_RECOVER_UPDATE	0x00040000	/* In recovery - files. */
326#define	REP_F_RECOVER_VERIFY	0x00080000	/* In recovery - verify. */
327#define	REP_F_SKIPPED_APPLY	0x00100000	/* Skipped applying a record. */
328#define	REP_F_START_CALLED	0x00200000	/* Rep_start called. */
329#define	REP_F_TALLY		0x00400000	/* Tallied vote before elect. */
330	u_int32_t	flags;
331} REP;
332
/*
 * Recovery flag mask to easily check any/all recovery bits.  That is
 * REP_F_READY_{API|OP} and all REP_F_RECOVER*.  This must change if the values
 * of the flags change.  NOTE:  We do not include REP_F_READY_MSG in
 * this mask because it is used frequently in non-recovery related
 * areas and we want to manipulate it separately (see especially
 * in __rep_new_master).
 */
#define	REP_F_RECOVER_MASK						\
    (REP_F_READY_API | REP_F_READY_OP |					\
     REP_F_RECOVER_LOG | REP_F_RECOVER_PAGE |				\
     REP_F_RECOVER_UPDATE | REP_F_RECOVER_VERIFY)
345
/*
 * REP_F_EPHASE0 is not a *real* election phase.  It is used for
 * master leases and allowing the client to find the master or
 * expire its lease.  However, EPHASE0 is cleared by __rep_elect_done.
 *
 * IN_ELECTION tests the EPHASE1/EPHASE2 flags of R via F_ISSET;
 * IN_ELECTION_TALLY additionally includes REP_F_TALLY in the tested bits.
 */
#define	IN_ELECTION(R)							\
	F_ISSET((R), REP_F_EPHASE1 | REP_F_EPHASE2)
#define	IN_ELECTION_TALLY(R) \
	F_ISSET((R), REP_F_EPHASE1 | REP_F_EPHASE2 | REP_F_TALLY)

/* Smallest count that is strictly more than half of n (integer division). */
#define	ELECTION_MAJORITY(n) (((n) / 2) + 1)

/* Flag bits set while pages or logs are being recovered. */
#define	REP_F_INTERNAL_INIT_MASK (REP_F_RECOVER_PAGE | REP_F_RECOVER_LOG)
358
/*
 * Role/configuration predicates on an environment.  Each one first checks
 * REP_ON(env) and only then dereferences (env)->rep_handle->region.
 */

/* Replication is on and REP_F_MASTER is set in the region flags. */
#define	IS_REP_MASTER(env)						\
	(REP_ON(env) &&							\
	    F_ISSET(((REP *)(env)->rep_handle->region), REP_F_MASTER))

/* Replication is on and REP_F_CLIENT is set in the region flags. */
#define	IS_REP_CLIENT(env)						\
	(REP_ON(env) &&							\
	    F_ISSET(((REP *)(env)->rep_handle->region), REP_F_CLIENT))

/* Replication is on and REP_C_LEASE is set in the region config. */
#define	IS_USING_LEASES(env)						\
	(REP_ON(env) &&							\
	    FLD_ISSET(((REP *)(env)->rep_handle->region)->config,	\
	    REP_C_LEASE))

/* IS_REP_CLIENT(env) holds and REP_F_RECOVER_PAGE is set in the flags. */
#define	IS_CLIENT_PGRECOVER(env)					\
	(IS_REP_CLIENT(env) &&						\
	    F_ISSET(((REP *)(env)->rep_handle->region), REP_F_RECOVER_PAGE))
375
/*
 * Macros to figure out if we need to do replication pre/post-amble processing.
 * Skip for specific DB handles owned by the replication layer, either because
 * replication is running recovery or because it's a handle entirely owned by
 * the replication code (replication opens its own databases to track state).
 *
 * IS_ENV_REPLICATED is true when REP_ON(env) holds and at least one bit is
 * set in the region's flags field.
 */
#define	IS_ENV_REPLICATED(env)						\
	(REP_ON(env) && (env)->rep_handle->region->flags != 0)
384
/*
 * Gap processing flags.  These provide control over the basic
 * gap processing algorithm for some special cases.
 */
#define	REP_GAP_FORCE		0x001	/* Force a request for a gap. */
#define	REP_GAP_REREQUEST	0x002	/* Gap request is a forced rerequest. */
					/* REREQUEST is a superset of FORCE. */
392
/*
 * Basic pre/post-amble processing.
 *
 * REPLICATION_WRAP(env, func_call, checklock, ret):
 *	If IS_ENV_REPLICATED(env), call __env_rep_enter(env, checklock) first
 *	and store its result in ret; otherwise ret starts at 0.  When ret is
 *	0, func_call is evaluated and its result stored in ret.  In the
 *	replicated case __env_db_rep_exit(env) is then called, and its
 *	non-zero result replaces ret only if func_call itself returned 0.
 *	If the enter call fails, func_call is not evaluated and ret keeps the
 *	enter error.
 *
 *	ret must be an assignable int expression.  The macro declares the
 *	locals __rep_check and __t_ret in its own block.
 */
#define	REPLICATION_WRAP(env, func_call, checklock, ret) do {		\
	int __rep_check, __t_ret;					\
	__rep_check = IS_ENV_REPLICATED(env) ? 1 : 0;			\
	(ret) = __rep_check ? __env_rep_enter(env, checklock) : 0;	\
	if ((ret) == 0) {						\
		(ret) = (func_call);					\
		if (__rep_check && (__t_ret =				\
		    __env_db_rep_exit(env)) != 0 && (ret) == 0) {	\
			(ret) = __t_ret;				\
		}							\
	}								\
} while (0)
407
408/*
409 * Per-process replication structure.
410 *
411 * There are 2 mutexes used in replication.
412 * 1.  mtx_region - This protects the fields of the rep region above.
413 * 2.  mtx_clientdb - This protects the per-process flags, and bookkeeping
414 * database and all of the components that maintain it.  Those
415 * components include the following fields in the log region (see log.h):
416 *	a. ready_lsn
417 *	b. waiting_lsn
418 *	c. verify_lsn
419 *	d. wait_recs
420 *	e. rcvd_recs
421 *	f. max_wait_lsn
422 * These fields in the log region are NOT protected by the log region lock at
423 * all.
424 *
425 * Note that the per-process flags should truly be protected by a special
426 * per-process thread mutex, but it is currently set in so isolated a manner
427 * that it didn't make sense to do so and in most case we're already holding
428 * the mtx_clientdb anyway.
429 *
430 * The lock ordering protocol is that mtx_clientdb must be acquired first and
431 * then either REP->mtx_region, or the LOG->mtx_region mutex may be acquired if
432 * necessary.
433 */
434struct __db_rep {
435	/*
436	 * Shared configuration information -- copied to and maintained in the
437	 * shared region as soon as the shared region is created.
438	 */
439	int		eid;		/* Environment ID. */
440
441	u_int32_t	gbytes;		/* Limit on data sent in single... */
442	u_int32_t	bytes;		/* __rep_process_message call. */
443
444	db_timespec	request_gap;	/* Minimum time to wait before we
445					 * request a missing log record. */
446	db_timespec	max_gap;	/* Maximum time to wait before
447					 * requesting a missing log record. */
448
449	u_int32_t	clock_skew;	/* Clock skew factor. */
450	u_int32_t	clock_base;	/* Clock skew base. */
451	u_int32_t	config;		/* Configuration flags. */
452	u_int32_t	config_nsites;
453
454	db_timeout_t	elect_timeout;	/* Normal/full election timeouts. */
455	db_timeout_t	full_elect_timeout;
456
457	db_timeout_t	chkpt_delay;	/* Master checkpoint delay. */
458
459	u_int32_t	my_priority;
460	db_timeout_t	lease_timeout;	/* Master leases. */
461	/*
462	 * End of shared configuration information.
463	 */
464	int		(*send)		/* Send function. */
465			    __P((DB_ENV *, const DBT *, const DBT *,
466			    const DB_LSN *, int, u_int32_t));
467
468	DB		*rep_db;	/* Bookkeeping database. */
469
470	REP		*region;	/* In memory structure. */
471	u_int8_t	*bulk;		/* Shared memory bulk area. */
472
473#define	DBREP_OPENFILES		0x0001	/* This handle has opened files. */
474	u_int32_t	flags;		/* per-process flags. */
475
476#ifdef HAVE_REPLICATION_THREADS
477	/*
478	 * Replication Framework (repmgr) information.
479	 */
480	int		nthreads;
481	u_int32_t	init_policy;
482	int		perm_policy;
483	int		peer;	/* Site to use for C2C sync. */
484	db_timeout_t	ack_timeout;
485	db_timeout_t	election_retry_wait;
486	db_timeout_t	connection_retry_wait;
487	db_timeout_t	heartbeat_frequency; /* Max period between msgs. */
488	db_timeout_t	heartbeat_monitor_timeout;
489
490	/* Repmgr's copies of rep stuff. */
491	int		master_eid;
492	u_int32_t	generation;
493
494	/* Thread synchronization. */
495	REPMGR_RUNNABLE *selector, **messengers, *elect_thread;
496	mgr_mutex_t	mutex;
497	cond_var_t	queue_nonempty, check_election;
498#ifdef DB_WIN32
499	ACK_WAITERS_TABLE *waiters;
500	HANDLE		signaler;
501	int		wsa_inited;
502#else
503	pthread_cond_t	ack_condition;
504	int		read_pipe, write_pipe;
505	int		chg_sig_handler;
506#endif
507
508	/* Operational stuff. */
509	REPMGR_SITE	*sites;		/* Array of known sites. */
510	u_int		site_cnt;	/* Array slots in use. */
511	u_int		site_max;	/* Total array slots allocated. */
512
513	CONNECTION_LIST	connections;
514	RETRY_Q_HEADER	retries;	/* Sites needing connection retry. */
515	REPMGR_QUEUE	*input_queue;
516
517	socket_t	listen_fd;
518	repmgr_netaddr_t my_addr;
519	db_timespec	last_bcast;	/* Time of last broadcast msg. */
520
521	int		finished; /* Repmgr threads should shut down. */
522	int		done_one; /* TODO: rename */
523	int		found_master;
524	int		takeover_pending; /* We've been elected master. */
525
526/* Operations we can ask election thread to perform (OOB value is 0): */
527#define	ELECT_ELECTION		1 /* Call for an election. */
528#define	ELECT_FAILURE_ELECTION	2 /* Do election, adjusting nsites to account
529				     for a failed master. */
530#define	ELECT_REPSTART		3 /* Call rep_start(CLIENT). */
531#define	ELECT_SEEK_MASTER	4 /* Alternate rep_start to find master. */
532	int		operation_needed; /* Next op for election thread. */
533
534#endif  /* HAVE_REPLICATION_THREADS */
535};
536
/*
 * Control structure flags for replication communication infrastructure.
 */
/*
 * Define old DB_LOG_ values that we must support here.  For reasons of
 * compatibility with old versions, these values must be reserved explicitly in
 * the list of flag values (below)
 */
#define	DB_LOG_PERM_42_44	0x20
#define	DB_LOG_RESEND_42_44	0x40
#define	REPCTL_INIT_45		0x02	/* Back compatible flag value. */

#define	REPCTL_ELECTABLE	0x01	/* Upgraded client is electable. */
#define	REPCTL_FLUSH		0x02	/* Record should be flushed. */
#define	REPCTL_GROUP_ESTD	0x04	/* Message from site in a group. */
#define	REPCTL_INIT		0x08	/* Internal init message. */
#define	REPCTL_LEASE		0x10	/* Lease related message. */
			/*
			 * Skip over reserved values 0x20
			 * and 0x40, as explained above.
			 */
#define	REPCTL_LOG_END		0x80	/* Approximate end of group-wide log. */
#define	REPCTL_PERM		DB_LOG_PERM_42_44
#define	REPCTL_RESEND		DB_LOG_RESEND_42_44
561
/*
 * File info flags for internal init.  The per-database (i.e., file) flag
 * represents the on-disk format of the file, and is conveyed from the master to
 * the initializing client in the UPDATE message, so that the client can know
 * how to create the file.  The per-page flag is conveyed along with each PAGE
 * message, describing the format of the page image being transmitted; it is of
 * course set by the site serving the PAGE_REQ.  The serving site gets the page
 * image from its own mpool, and thus the page is in the native format of the
 * serving site.  This format may be different (i.e., opposite) from the on-disk
 * format, and in fact can vary per-page, since with client-to-client sync it is
 * possible for various different sites to serve the various PAGE_REQ requests.
 */
#define	REPINFO_DB_LITTLEENDIAN	0x0001	/* File is little-endian lorder. */
#define	REPINFO_PG_LITTLEENDIAN	0x0002	/* Page is little-endian lorder. */
576
577/*
578 * Control message format for 4.6 release.  The db_timespec_t is
579 * not a portable structure.  Therefore, in 4.6, replication among
580 * mixed OSs such as Linux and Windows, which have different time_t
581 * sizes, does not work.
582 */
583typedef struct {
584	u_int32_t	rep_version;	/* Replication version number. */
585	u_int32_t	log_version;	/* Log version number. */
586
587	DB_LSN		lsn;		/* Log sequence number. */
588	u_int32_t	rectype;	/* Message type. */
589	u_int32_t	gen;		/* Generation number. */
590	db_timespec	msg_time;	/* Timestamp seconds for leases. */
591	u_int32_t	flags;		/* log_put flag value. */
592} REP_46_CONTROL;
593
594/*
595 * Control message format for 4.5 release and earlier.
596 */
597typedef struct {
598	u_int32_t	rep_version;	/* Replication version number. */
599	u_int32_t	log_version;	/* Log version number. */
600
601	DB_LSN		lsn;		/* Log sequence number. */
602	u_int32_t	rectype;	/* Message type. */
603	u_int32_t	gen;		/* Generation number. */
604	u_int32_t	flags;		/* log_put flag value. */
605} REP_OLD_CONTROL;
606
#define	LEASE_REFRESH_TRIES	3	/* Number of times to try refresh. */
608
609/* Master granted lease information. */
610typedef struct __rep_lease_entry {
611	int		eid;		/* EID of client grantor. */
612	db_timespec	start_time;	/* Start time clients echo back. */
613	db_timespec	end_time;	/* Master lease expiration time. */
614	DB_LSN		lease_lsn;	/* Durable LSN lease applies to. */
615} REP_LEASE_ENTRY;
616
/*
 * Old vote info where some fields were not fixed size.
 * (nsites, nvotes and priority are plain int here.)
 */
typedef struct {
	u_int32_t	egen;		/* Election generation. */
	int		nsites;		/* Number of sites I've been in
					 * communication with. */
	int		nvotes;		/* Number of votes needed to win. */
	int		priority;	/* My site's priority. */
	u_int32_t	tiebreaker;	/* Tie-breaking quasi-random value. */
} REP_OLD_VOTE_INFO;
628
/* One tallied vote: the voter's ID and the election generation it voted in. */
typedef struct {
	u_int32_t	egen;		/* Voter's election generation. */
	int		eid;		/* Voter's ID. */
} REP_VTALLY;
633
/*
 * The REP_THROTTLE_ONLY flag is used to do throttle processing only.
 * If set, it will only allow sending the REP_*_MORE message, but not
 * the normal, non-throttled message.  It is used to support throttling
 * with bulk transfer.
 */
/* Flags for __rep_send_throttle. */
#define	REP_THROTTLE_ONLY	0x0001	/* Send _MORE message only. */
642
643/* Throttled message processing information. */
644typedef struct {
645	DB_LSN		lsn;		/* LSN of this record. */
646	DBT		*data_dbt;	/* DBT of this record. */
647	u_int32_t	gbytes;		/* This call's max gbytes sent. */
648	u_int32_t	bytes;		/* This call's max bytes sent. */
649	u_int32_t	type;		/* Record type. */
650} REP_THROTTLE;
651
652/* Bulk processing information. */
653/*
654 * !!!
655 * We use a uintptr_t for the offset.  We'd really like to use a ptrdiff_t
656 * since that really is what it is.  But ptrdiff_t is not portable and
657 * doesn't exist everywhere.
658 */
659typedef struct {
660	u_int8_t	*addr;		/* Address of bulk buffer. */
661	uintptr_t	*offp;		/* Ptr to current offset into buffer. */
662	u_int32_t	len;		/* Bulk buffer length. */
663	u_int32_t	type;		/* Item type in buffer (log, page). */
664	DB_LSN		lsn;		/* First LSN in buffer. */
665	int		eid;		/* ID of potential recipients. */
666#define	BULK_XMIT	0x001		/* Buffer in transit. */
667	u_int32_t	*flagsp;	/* Buffer flags. */
668} REP_BULK;
669
670/*
671 * This structure takes care of representing a transaction.
672 * It holds all the records, sorted by page number so that
673 * we can obtain locks and apply updates in a deadlock free
674 * order.
675 */
676typedef struct {
677	u_int nlsns;
678	u_int nalloc;
679	DB_LSN *array;
680} LSN_COLLECTION;
681
682/*
683 * This is used by the page-prep routines to do the lock_vec call to
684 * apply the updates for a single transaction or a collection of
685 * transactions.
686 */
687typedef struct {
688	int		n;
689	DB_LOCKREQ	*reqs;
690	DBT		*objs;
691} linfo_t;
692
693#if defined(__cplusplus)
694}
695#endif
696
697#include "dbinc_auto/rep_ext.h"
698#endif	/* !_DB_REP_H_ */
699