1/*- 2 * See the file LICENSE for redistribution information. 3 * 4 * Copyright (c) 2001,2008 Oracle. All rights reserved. 5 * 6 * $Id: rep.h,v 12.111 2008/05/02 15:19:43 sue Exp $ 7 */ 8 9#ifndef _DB_REP_H_ 10#define _DB_REP_H_ 11 12#include "dbinc_auto/rep_auto.h" 13 14#if defined(__cplusplus) 15extern "C" { 16#endif 17 18/* 19 * Names of client temp databases. 20 */ 21#define REPDBNAME "__db.rep.db" 22#define REPPAGENAME "__db.reppg.db" 23 24/* 25 * Message types 26 */ 27#define REP_INVALID 0 /* Invalid message type. */ 28#define REP_ALIVE 1 /* I am alive message. */ 29#define REP_ALIVE_REQ 2 /* Request for alive messages. */ 30#define REP_ALL_REQ 3 /* Request all log records greater than LSN. */ 31#define REP_BULK_LOG 4 /* Bulk transfer of log records. */ 32#define REP_BULK_PAGE 5 /* Bulk transfer of pages. */ 33#define REP_DUPMASTER 6 /* Duplicate master detected; propagate. */ 34#define REP_FILE 7 /* Page of a database file. NOTUSED */ 35#define REP_FILE_FAIL 8 /* File requested does not exist. */ 36#define REP_FILE_REQ 9 /* Request for a database file. NOTUSED */ 37#define REP_LEASE_GRANT 10 /* Client grants a lease to a master. */ 38#define REP_LOG 11 /* Log record. */ 39#define REP_LOG_MORE 12 /* There are more log records to request. */ 40#define REP_LOG_REQ 13 /* Request for a log record. */ 41#define REP_MASTER_REQ 14 /* Who is the master */ 42#define REP_NEWCLIENT 15 /* Announces the presence of a new client. */ 43#define REP_NEWFILE 16 /* Announce a log file change. */ 44#define REP_NEWMASTER 17 /* Announces who the master is. */ 45#define REP_NEWSITE 18 /* Announces that a site has heard from a new 46 * site; like NEWCLIENT, but indirect. A 47 * NEWCLIENT message comes directly from the new 48 * client while a NEWSITE comes indirectly from 49 * someone who heard about a NEWSITE. 50 */ 51#define REP_PAGE 19 /* Database page. */ 52#define REP_PAGE_FAIL 20 /* Requested page does not exist. */ 53#define REP_PAGE_MORE 21 /* There are more pages to request. */ 54#define REP_PAGE_REQ 22 /* Request for a database page. */ 55#define REP_REREQUEST 23 /* Force rerequest. */ 56#define REP_START_SYNC 24 /* Tell client to begin syncing a ckp.*/ 57#define REP_UPDATE 25 /* Environment hotcopy information. */ 58#define REP_UPDATE_REQ 26 /* Request for hotcopy information. */ 59#define REP_VERIFY 27 /* A log record for verification. */ 60#define REP_VERIFY_FAIL 28 /* The client is outdated. */ 61#define REP_VERIFY_REQ 29 /* Request for a log record to verify. */ 62#define REP_VOTE1 30 /* Send out your information for an election. */ 63#define REP_VOTE2 31 /* Send a "you are master" vote. */ 64/* 65 * Maximum message number for conversion tables. Update this 66 * value as the largest message number above increases. 67 * 68 * !!! 69 * NOTE: When changing messages above, the two tables for upgrade support 70 * need adjusting. They are in rep_util.c. 71 */ 72#define REP_MAX_MSG 31 73 74/* 75 * This is the list of client-to-client requests messages. 76 * We use this to decide if we're doing client-to-client and 77 * might need to send a rerequest. 78 */ 79#define REP_MSG_REQ(rectype) \ 80 (rectype == REP_ALL_REQ || \ 81 rectype == REP_LOG_REQ || \ 82 rectype == REP_PAGE_REQ || \ 83 rectype == REP_VERIFY_REQ) 84 85/* 86 * Note that the version information should be at the beginning of the 87 * structure, so that we can rearrange the rest of it while letting the 88 * version checks continue to work. DB_REPVERSION should be revved any time 89 * the rest of the structure changes or when the message numbers change. 90 * 91 * Define also, the corresponding log versions that are tied to the 92 * replication/release versions. These are only used in replication 93 * and that is why they're defined here. 94 */ 95#define DB_LOGVERSION_42 8 96#define DB_LOGVERSION_43 10 97#define DB_LOGVERSION_44 11 98#define DB_LOGVERSION_45 12 99#define DB_LOGVERSION_46 13 100#define DB_LOGVERSION_47 14 101#define DB_LOGVERSION_MIN DB_LOGVERSION_44 102#define DB_REPVERSION_INVALID 0 103#define DB_REPVERSION_44 3 104#define DB_REPVERSION_45 3 105#define DB_REPVERSION_46 4 106#define DB_REPVERSION_47 5 107#define DB_REPVERSION DB_REPVERSION_47 108#define DB_REPVERSION_MIN DB_REPVERSION_44 109 110/* 111 * RPRINT 112 * REP_PRINT_MESSAGE 113 * Macros for verbose replication messages. 114 */ 115#define RPRINT(env, verbose_category, x) do { \ 116 if (FLD_ISSET((env)->dbenv->verbose, \ 117 (verbose_category) | DB_VERB_REPLICATION)) { \ 118 __rep_print x; \ 119 } \ 120} while (0) 121#define REP_PRINT_MESSAGE(env, eid, rp, str, fl) do { \ 122 if (FLD_ISSET((env)->dbenv->verbose, \ 123 DB_VERB_REP_MSGS | DB_VERB_REPLICATION)) { \ 124 __rep_print_message(env, eid, rp, str, fl); \ 125 } \ 126} while (0) 127 128/* 129 * Election gen file name 130 * The file contains an egen number for an election this client has NOT 131 * participated in. I.e. it is the number of a future election. We 132 * create it when we create the rep region, if it doesn't already exist 133 * and initialize egen to 1. If it does exist, we read it when we create 134 * the rep region. We write it immediately before sending our VOTE1 in 135 * an election. That way, if a client has ever sent a vote for any 136 * election, the file is already going to be updated to reflect a future 137 * election, should it crash. 138 */ 139#define REP_EGENNAME "__db.rep.egen" 140#define REP_GENNAME "__db.rep.gen" 141 142/* 143 * Internal init flag file name: 144 * The existence of this file serves as an indication that the client is in the 145 * process of Internal Initialization, in case it crashes before completing. 146 * During internal init the client's partially reconstructed database pages and 147 * logs may be in an inconsistent state, so much so that running recovery must 148 * be avoided. Furthermore, there is no other way to reliably recognize this 149 * condition. Therefore, when we open an environment, and we're just about to 150 * run recovery, we check for this file first. If it exists we must discard all 151 * logs and databases. This avoids the recovery problems, and leads to a fresh 152 * attempt at internal init if the environment becomes a replication client and 153 * finds a master. The list of databases which may need to be removed is stored 154 * in this file. 155 */ 156#define REP_INITNAME "__db.rep.init" 157#define REP_INITVERSION_46 1 158#define REP_INITVERSION_47 2 159#define REP_INITVERSION 2 160 161 162/* 163 * Database types for __rep_client_dbinit 164 */ 165typedef enum { 166 REP_DB, /* Log record database. */ 167 REP_PG /* Pg database. */ 168} repdb_t; 169 170/* Macros to lock/unlock the replication region as a whole. */ 171#define REP_SYSTEM_LOCK(env) \ 172 MUTEX_LOCK(env, (env)->rep_handle->region->mtx_region) 173#define REP_SYSTEM_UNLOCK(env) \ 174 MUTEX_UNLOCK(env, (env)->rep_handle->region->mtx_region) 175 176/* 177 * Macros for manipulating the event synchronization. We use a separate mutex 178 * so that an application's call-back function can be invoked without locking 179 * the whole region. 180 */ 181#define REP_EVENT_LOCK(env) \ 182 MUTEX_LOCK(env, (env)->rep_handle->region->mtx_event) 183#define REP_EVENT_UNLOCK(env) \ 184 MUTEX_UNLOCK(env, (env)->rep_handle->region->mtx_event) 185 186/* 187 * REP -- 188 * Shared replication structure. 189 */ 190typedef struct __rep { 191 db_mutex_t mtx_region; /* Region mutex. */ 192 db_mutex_t mtx_clientdb; /* Client database mutex. */ 193 db_mutex_t mtx_ckp; /* Checkpoint mutex. */ 194 roff_t lease_off; /* Offset of the lease table. */ 195 roff_t tally_off; /* Offset of the tally region. */ 196 roff_t v2tally_off; /* Offset of the vote2 tally region. */ 197 int eid; /* Environment id. */ 198 int master_id; /* ID of the master site. */ 199 u_int32_t version; /* Current replication version. */ 200 u_int32_t egen; /* Replication election generation. */ 201 u_int32_t gen; /* Replication generation number. */ 202 u_int32_t recover_gen; /* Last generation number in log. */ 203 u_int32_t asites; /* Space allocated for sites. */ 204 u_int32_t nsites; /* Number of sites in group. */ 205 u_int32_t nvotes; /* Number of votes needed. */ 206 u_int32_t priority; /* My priority in an election. */ 207 u_int32_t config_nsites; 208 209 db_timeout_t elect_timeout; /* Normal/full election timeouts. */ 210 db_timeout_t full_elect_timeout; 211 212 db_timeout_t chkpt_delay; /* Master checkpoint delay. */ 213 214#define REP_DEFAULT_THROTTLE (10 * MEGABYTE) /* Default value is < 1Gig. */ 215 u_int32_t gbytes; /* Limit on data sent in single... */ 216 u_int32_t bytes; /* __rep_process_message call. */ 217#define DB_REP_REQUEST_GAP 40000 /* 40 msecs */ 218#define DB_REP_MAX_GAP 1280000 /* 1.28 seconds */ 219 db_timespec request_gap; /* Minimum time to wait before we 220 * request a missing log record. */ 221 db_timespec max_gap; /* Maximum time to wait before 222 * requesting a missing log record. */ 223 /* Status change information */ 224 u_int32_t apply_th; /* Number of callers in rep_apply. */ 225 u_int32_t msg_th; /* Number of callers in rep_proc_msg.*/ 226 u_int32_t handle_cnt; /* Count of handles in library. */ 227 u_int32_t op_cnt; /* Multi-step operation count.*/ 228 DB_LSN ckp_lsn; /* LSN for syncing a checkpoint. */ 229 DB_LSN max_prep_lsn; /* Max LSN of txn_prepare record. */ 230 231 /* 232 * Event notification synchronization: the mtx_event and associate 233 * fields which it protects govern event notification to the 234 * application. They form a guarantee that no matter how crazy the 235 * thread scheduling gets, the application sees a sensible, orderly 236 * progression of events. 237 */ 238 db_mutex_t mtx_event; /* Serializes event notification. */ 239 /* 240 * Latest generation whose NEWMASTER event the application has been 241 * notified of. Also serves to force STARTUPDONE to occur after 242 * NEWMASTER. 243 */ 244 u_int32_t newmaster_event_gen; 245 /* 246 * Latest local victory of an election that the application has been 247 * notified of, expressed as the election generation number. This 248 * ensures we notify the application exactly once when it wins an 249 * election. 250 */ 251 u_int32_t notified_egen; 252 253 /* Backup information. */ 254 u_int32_t nfiles; /* Number of files we have info on. */ 255 u_int32_t curfile; /* Cur file we're getting (0-based). */ 256 __rep_fileinfo_args *curinfo; /* Current file info ptr. */ 257 u_int8_t *finfo; /* Current file info buffer. */ 258 u_int8_t *nextinfo; /* Next file info buffer. */ 259 u_int8_t *originfo; /* Original file info buffer. */ 260 u_int32_t infolen; /* Remaining length file info buffer. */ 261 u_int32_t originfolen; /* Original length file info buffer. */ 262 u_int32_t infoversion; /* Original file info version. */ 263 DB_LSN first_lsn; /* Earliest LSN we need. */ 264 u_int32_t first_vers; /* Log version of first log file. */ 265 DB_LSN last_lsn; /* Latest LSN we need. */ 266 db_pgno_t ready_pg; /* Next pg expected. */ 267 db_pgno_t waiting_pg; /* First pg after gap. */ 268 db_pgno_t max_wait_pg; /* Maximum pg requested. */ 269 u_int32_t npages; /* Num of pages rcvd for this file. */ 270 DB_MPOOLFILE *file_mpf; /* Mpoolfile for current database. */ 271 DB *file_dbp; /* This file's page info. */ 272 DBC *queue_dbc; /* Dbc for a queue file. */ 273 274 /* Vote tallying information. */ 275 u_int32_t sites; /* Sites heard from. */ 276 int winner; /* Current winner EID. */ 277 u_int32_t w_priority; /* Winner priority. */ 278 u_int32_t w_gen; /* Winner generation. */ 279 DB_LSN w_lsn; /* Winner LSN. */ 280 u_int32_t w_tiebreaker; /* Winner tiebreaking value. */ 281 u_int32_t votes; /* Number of votes for this site. */ 282 283 db_timespec etime; /* Election start timestamp. */ 284 285 /* Leases. */ 286 db_timeout_t lease_timeout; /* Lease timeout. */ 287 db_timespec lease_duration; /* Lease timeout with clock skew. */ 288 u_int32_t clock_skew; /* Clock skew. */ 289 u_int32_t clock_base; /* Clock scale factor base. */ 290 db_timespec grant_expire; /* Local grant expiration time. */ 291 292 /* Statistics. */ 293 DB_REP_STAT stat; 294#if defined(HAVE_REPLICATION_THREADS) && defined(HAVE_STATISTICS) 295 DB_REPMGR_STAT mstat; 296#endif 297 298 /* Configuration. */ 299#define REP_C_2SITE_STRICT 0x00001 /* Don't cheat on elections. */ 300#define REP_C_BULK 0x00002 /* Bulk transfer. */ 301#define REP_C_DELAYCLIENT 0x00004 /* Delay client sync-up. */ 302#define REP_C_LEASE 0x00008 /* Leases configured. */ 303#define REP_C_NOAUTOINIT 0x00010 /* No auto initialization. */ 304#define REP_C_NOWAIT 0x00020 /* Immediate error return. */ 305 u_int32_t config; /* Configuration flags. */ 306 307#define REP_F_CLIENT 0x00000001 /* Client replica. */ 308#define REP_F_DELAY 0x00000002 /* Delaying client sync-up. */ 309#define REP_F_EGENUPDATE 0x00000004 /* Egen updated by ALIVE msg. */ 310#define REP_F_EPHASE0 0x00000008 /* In phase 0 of election. */ 311#define REP_F_EPHASE1 0x00000010 /* In phase 1 of election. */ 312#define REP_F_EPHASE2 0x00000020 /* In phase 2 of election. */ 313#define REP_F_GROUP_ESTD 0x00000040 /* Rep group is established. */ 314#define REP_F_INREPELECT 0x00000080 /* Thread in rep_elect. */ 315#define REP_F_MASTER 0x00000100 /* Master replica. */ 316#define REP_F_MASTERELECT 0x00000200 /* Master elect. */ 317#define REP_F_NEWFILE 0x00000400 /* Newfile in progress. */ 318#define REP_F_NOARCHIVE 0x00000800 /* Rep blocks log_archive. */ 319#define REP_F_READY_API 0x00001000 /* Need handle_cnt to be 0. */ 320#define REP_F_READY_APPLY 0x00002000 /* Need apply_th to be 0. */ 321#define REP_F_READY_MSG 0x00004000 /* Need msg_th to be 0. */ 322#define REP_F_READY_OP 0x00008000 /* Need op_cnt to be 0. */ 323#define REP_F_RECOVER_LOG 0x00010000 /* In recovery - log. */ 324#define REP_F_RECOVER_PAGE 0x00020000 /* In recovery - pages. */ 325#define REP_F_RECOVER_UPDATE 0x00040000 /* In recovery - files. */ 326#define REP_F_RECOVER_VERIFY 0x00080000 /* In recovery - verify. */ 327#define REP_F_SKIPPED_APPLY 0x00100000 /* Skipped applying a record. */ 328#define REP_F_START_CALLED 0x00200000 /* Rep_start called. */ 329#define REP_F_TALLY 0x00400000 /* Tallied vote before elect. */ 330 u_int32_t flags; 331} REP; 332 333/* 334 * Recovery flag mask to easily check any/all recovery bits. That is 335 * REP_F_READY_{API|OP} and all REP_F_RECOVER*. This must change if the values 336 * of the flags change. NOTE: We do not include REP_F_READY_MSG in 337 * this mask because it is used frequently in non-recovery related 338 * areas and we want to manipulate it separately (see especially 339 * in __rep_new_master). 340 */ 341#define REP_F_RECOVER_MASK \ 342 (REP_F_READY_API | REP_F_READY_OP | \ 343 REP_F_RECOVER_LOG | REP_F_RECOVER_PAGE | \ 344 REP_F_RECOVER_UPDATE | REP_F_RECOVER_VERIFY) 345 346/* 347 * REP_F_EPHASE0 is not a *real* election phase. It is used for 348 * master leases and allowing the client to find the master or 349 * expire its lease. However, EPHASE0 is cleared by __rep_elect_done. 350 */ 351#define IN_ELECTION(R) \ 352 F_ISSET((R), REP_F_EPHASE1 | REP_F_EPHASE2) 353#define IN_ELECTION_TALLY(R) \ 354 F_ISSET((R), REP_F_EPHASE1 | REP_F_EPHASE2 | REP_F_TALLY) 355#define ELECTION_MAJORITY(n) (((n) / 2) + 1) 356 357#define REP_F_INTERNAL_INIT_MASK (REP_F_RECOVER_PAGE | REP_F_RECOVER_LOG) 358 359#define IS_REP_MASTER(env) \ 360 (REP_ON(env) && \ 361 F_ISSET(((REP *)(env)->rep_handle->region), REP_F_MASTER)) 362 363#define IS_REP_CLIENT(env) \ 364 (REP_ON(env) && \ 365 F_ISSET(((REP *)(env)->rep_handle->region), REP_F_CLIENT)) 366 367#define IS_USING_LEASES(env) \ 368 (REP_ON(env) && \ 369 FLD_ISSET(((REP *)(env)->rep_handle->region)->config, \ 370 REP_C_LEASE)) 371 372#define IS_CLIENT_PGRECOVER(env) \ 373 (IS_REP_CLIENT(env) && \ 374 F_ISSET(((REP *)(env)->rep_handle->region), REP_F_RECOVER_PAGE)) 375 376/* 377 * Macros to figure out if we need to do replication pre/post-amble processing. 378 * Skip for specific DB handles owned by the replication layer, either because 379 * replication is running recovery or because it's a handle entirely owned by 380 * the replication code (replication opens its own databases to track state). 381 */ 382#define IS_ENV_REPLICATED(env) \ 383 (REP_ON(env) && (env)->rep_handle->region->flags != 0) 384 385/* 386 * Gap processing flags. These provide control over the basic 387 * gap processing algorithm for some special cases. 388 */ 389#define REP_GAP_FORCE 0x001 /* Force a request for a gap. */ 390#define REP_GAP_REREQUEST 0x002 /* Gap request is a forced rerequest. */ 391 /* REREQUEST is a superset of FORCE. */ 392 393/* 394 * Basic pre/post-amble processing. 395 */ 396#define REPLICATION_WRAP(env, func_call, checklock, ret) do { \ 397 int __rep_check, __t_ret; \ 398 __rep_check = IS_ENV_REPLICATED(env) ? 1 : 0; \ 399 (ret) = __rep_check ? __env_rep_enter(env, checklock) : 0; \ 400 if ((ret) == 0) { \ 401 (ret) = func_call; \ 402 if (__rep_check && (__t_ret = \ 403 __env_db_rep_exit(env)) != 0 && (ret) == 0) \ 404 (ret) = __t_ret; \ 405 } \ 406} while (0) 407 408/* 409 * Per-process replication structure. 410 * 411 * There are 2 mutexes used in replication. 412 * 1. mtx_region - This protects the fields of the rep region above. 413 * 2. mtx_clientdb - This protects the per-process flags, and bookkeeping 414 * database and all of the components that maintain it. Those 415 * components include the following fields in the log region (see log.h): 416 * a. ready_lsn 417 * b. waiting_lsn 418 * c. verify_lsn 419 * d. wait_recs 420 * e. rcvd_recs 421 * f. max_wait_lsn 422 * These fields in the log region are NOT protected by the log region lock at 423 * all. 424 * 425 * Note that the per-process flags should truly be protected by a special 426 * per-process thread mutex, but it is currently set in so isolated a manner 427 * that it didn't make sense to do so and in most case we're already holding 428 * the mtx_clientdb anyway. 429 * 430 * The lock ordering protocol is that mtx_clientdb must be acquired first and 431 * then either REP->mtx_region, or the LOG->mtx_region mutex may be acquired if 432 * necessary. 433 */ 434struct __db_rep { 435 /* 436 * Shared configuration information -- copied to and maintained in the 437 * shared region as soon as the shared region is created. 438 */ 439 int eid; /* Environment ID. */ 440 441 u_int32_t gbytes; /* Limit on data sent in single... */ 442 u_int32_t bytes; /* __rep_process_message call. */ 443 444 db_timespec request_gap; /* Minimum time to wait before we 445 * request a missing log record. */ 446 db_timespec max_gap; /* Maximum time to wait before 447 * requesting a missing log record. */ 448 449 u_int32_t clock_skew; /* Clock skew factor. */ 450 u_int32_t clock_base; /* Clock skew base. */ 451 u_int32_t config; /* Configuration flags. */ 452 u_int32_t config_nsites; 453 454 db_timeout_t elect_timeout; /* Normal/full election timeouts. */ 455 db_timeout_t full_elect_timeout; 456 457 db_timeout_t chkpt_delay; /* Master checkpoint delay. */ 458 459 u_int32_t my_priority; 460 db_timeout_t lease_timeout; /* Master leases. */ 461 /* 462 * End of shared configuration information. 463 */ 464 int (*send) /* Send function. */ 465 __P((DB_ENV *, const DBT *, const DBT *, 466 const DB_LSN *, int, u_int32_t)); 467 468 DB *rep_db; /* Bookkeeping database. */ 469 470 REP *region; /* In memory structure. */ 471 u_int8_t *bulk; /* Shared memory bulk area. */ 472 473#define DBREP_OPENFILES 0x0001 /* This handle has opened files. */ 474 u_int32_t flags; /* per-process flags. */ 475 476#ifdef HAVE_REPLICATION_THREADS 477 /* 478 * Replication Framework (repmgr) information. 479 */ 480 int nthreads; 481 u_int32_t init_policy; 482 int perm_policy; 483 int peer; /* Site to use for C2C sync. */ 484 db_timeout_t ack_timeout; 485 db_timeout_t election_retry_wait; 486 db_timeout_t connection_retry_wait; 487 db_timeout_t heartbeat_frequency; /* Max period between msgs. */ 488 db_timeout_t heartbeat_monitor_timeout; 489 490 /* Repmgr's copies of rep stuff. */ 491 int master_eid; 492 u_int32_t generation; 493 494 /* Thread synchronization. */ 495 REPMGR_RUNNABLE *selector, **messengers, *elect_thread; 496 mgr_mutex_t mutex; 497 cond_var_t queue_nonempty, check_election; 498#ifdef DB_WIN32 499 ACK_WAITERS_TABLE *waiters; 500 HANDLE signaler; 501 int wsa_inited; 502#else 503 pthread_cond_t ack_condition; 504 int read_pipe, write_pipe; 505 int chg_sig_handler; 506#endif 507 508 /* Operational stuff. */ 509 REPMGR_SITE *sites; /* Array of known sites. */ 510 u_int site_cnt; /* Array slots in use. */ 511 u_int site_max; /* Total array slots allocated. */ 512 513 CONNECTION_LIST connections; 514 RETRY_Q_HEADER retries; /* Sites needing connection retry. */ 515 REPMGR_QUEUE *input_queue; 516 517 socket_t listen_fd; 518 repmgr_netaddr_t my_addr; 519 db_timespec last_bcast; /* Time of last broadcast msg. */ 520 521 int finished; /* Repmgr threads should shut down. */ 522 int done_one; /* TODO: rename */ 523 int found_master; 524 int takeover_pending; /* We've been elected master. */ 525 526/* Operations we can ask election thread to perform (OOB value is 0): */ 527#define ELECT_ELECTION 1 /* Call for an election. */ 528#define ELECT_FAILURE_ELECTION 2 /* Do election, adjusting nsites to account 529 for a failed master. */ 530#define ELECT_REPSTART 3 /* Call rep_start(CLIENT). */ 531#define ELECT_SEEK_MASTER 4 /* Alternate rep_start to find master. */ 532 int operation_needed; /* Next op for election thread. */ 533 534#endif /* HAVE_REPLICATION_THREADS */ 535}; 536 537/* 538 * Control structure flags for replication communication infrastructure. 539 */ 540/* 541 * Define old DB_LOG_ values that we must support here. For reasons of 542 * compatibility with old versions, these values must be reserved explicitly in 543 * the list of flag values (below) 544 */ 545#define DB_LOG_PERM_42_44 0x20 546#define DB_LOG_RESEND_42_44 0x40 547#define REPCTL_INIT_45 0x02 /* Back compatible flag value. */ 548 549#define REPCTL_ELECTABLE 0x01 /* Upgraded client is electable. */ 550#define REPCTL_FLUSH 0x02 /* Record should be flushed. */ 551#define REPCTL_GROUP_ESTD 0x04 /* Message from site in a group. */ 552#define REPCTL_INIT 0x08 /* Internal init message. */ 553#define REPCTL_LEASE 0x10 /* Lease related message.. */ 554 /* 555 * Skip over reserved values 0x20 556 * and 0x40, as explained above. 557 */ 558#define REPCTL_LOG_END 0x80 /* Approximate end of group-wide log. */ 559#define REPCTL_PERM DB_LOG_PERM_42_44 560#define REPCTL_RESEND DB_LOG_RESEND_42_44 561 562/* 563 * File info flags for internal init. The per-database (i.e., file) flag 564 * represents the on-disk format of the file, and is conveyed from the master to 565 * the initializing client in the UPDATE message, so that the client can know 566 * how to create the file. The per-page flag is conveyed along with each PAGE 567 * message, describing the format of the page image being transmitted; it is of 568 * course set by the site serving the PAGE_REQ. The serving site gets the page 569 * image from its own mpool, and thus the page is in the native format of the 570 * serving site. This format may be different (i.e., opposite) from the on-disk 571 * format, and in fact can vary per-page, since with client-to-client sync it is 572 * possible for various different sites to serve the various PAGE_REQ requests. 573 */ 574#define REPINFO_DB_LITTLEENDIAN 0x0001 /* File is little-endian lorder. */ 575#define REPINFO_PG_LITTLEENDIAN 0x0002 /* Page is little-endian lorder. */ 576 577/* 578 * Control message format for 4.6 release. The db_timespec_t is 579 * not a portable structure. Therefore, in 4.6, replication among 580 * mixed OSs such as Linux and Windows, which have different time_t 581 * sizes, does not work. 582 */ 583typedef struct { 584 u_int32_t rep_version; /* Replication version number. */ 585 u_int32_t log_version; /* Log version number. */ 586 587 DB_LSN lsn; /* Log sequence number. */ 588 u_int32_t rectype; /* Message type. */ 589 u_int32_t gen; /* Generation number. */ 590 db_timespec msg_time; /* Timestamp seconds for leases. */ 591 u_int32_t flags; /* log_put flag value. */ 592} REP_46_CONTROL; 593 594/* 595 * Control message format for 4.5 release and earlier. 596 */ 597typedef struct { 598 u_int32_t rep_version; /* Replication version number. */ 599 u_int32_t log_version; /* Log version number. */ 600 601 DB_LSN lsn; /* Log sequence number. */ 602 u_int32_t rectype; /* Message type. */ 603 u_int32_t gen; /* Generation number. */ 604 u_int32_t flags; /* log_put flag value. */ 605} REP_OLD_CONTROL; 606 607#define LEASE_REFRESH_TRIES 3 /* Number of times to try refresh. */ 608 609/* Master granted lease information. */ 610typedef struct __rep_lease_entry { 611 int eid; /* EID of client grantor. */ 612 db_timespec start_time; /* Start time clients echo back. */ 613 db_timespec end_time; /* Master lease expiration time. */ 614 DB_LSN lease_lsn; /* Durable LSN lease applies to. */ 615} REP_LEASE_ENTRY; 616 617/* 618 * Old vote info where some fields were not fixed size. 619 */ 620typedef struct { 621 u_int32_t egen; /* Election generation. */ 622 int nsites; /* Number of sites I've been in 623 * communication with. */ 624 int nvotes; /* Number of votes needed to win. */ 625 int priority; /* My site's priority. */ 626 u_int32_t tiebreaker; /* Tie-breaking quasi-random value. */ 627} REP_OLD_VOTE_INFO; 628 629typedef struct { 630 u_int32_t egen; /* Voter's election generation. */ 631 int eid; /* Voter's ID. */ 632} REP_VTALLY; 633 634/* 635 * The REP_THROTTLE_ONLY flag is used to do throttle processing only. 636 * If set, it will only allow sending the REP_*_MORE message, but not 637 * the normal, non-throttled message. It is used to support throttling 638 * with bulk transfer. 639 */ 640/* Flags for __rep_send_throttle. */ 641#define REP_THROTTLE_ONLY 0x0001 /* Send _MORE message only. */ 642 643/* Throttled message processing information. */ 644typedef struct { 645 DB_LSN lsn; /* LSN of this record. */ 646 DBT *data_dbt; /* DBT of this record. */ 647 u_int32_t gbytes; /* This call's max gbytes sent. */ 648 u_int32_t bytes; /* This call's max bytes sent. */ 649 u_int32_t type; /* Record type. */ 650} REP_THROTTLE; 651 652/* Bulk processing information. */ 653/* 654 * !!! 655 * We use a uintptr_t for the offset. We'd really like to use a ptrdiff_t 656 * since that really is what it is. But ptrdiff_t is not portable and 657 * doesn't exist everywhere. 658 */ 659typedef struct { 660 u_int8_t *addr; /* Address of bulk buffer. */ 661 uintptr_t *offp; /* Ptr to current offset into buffer. */ 662 u_int32_t len; /* Bulk buffer length. */ 663 u_int32_t type; /* Item type in buffer (log, page). */ 664 DB_LSN lsn; /* First LSN in buffer. */ 665 int eid; /* ID of potential recipients. */ 666#define BULK_XMIT 0x001 /* Buffer in transit. */ 667 u_int32_t *flagsp; /* Buffer flags. */ 668} REP_BULK; 669 670/* 671 * This structure takes care of representing a transaction. 672 * It holds all the records, sorted by page number so that 673 * we can obtain locks and apply updates in a deadlock free 674 * order. 675 */ 676typedef struct { 677 u_int nlsns; 678 u_int nalloc; 679 DB_LSN *array; 680} LSN_COLLECTION; 681 682/* 683 * This is used by the page-prep routines to do the lock_vec call to 684 * apply the updates for a single transaction or a collection of 685 * transactions. 686 */ 687typedef struct { 688 int n; 689 DB_LOCKREQ *reqs; 690 DBT *objs; 691} linfo_t; 692 693#if defined(__cplusplus) 694} 695#endif 696 697#include "dbinc_auto/rep_ext.h" 698#endif /* !_DB_REP_H_ */ 699