1/*- 2 * See the file LICENSE for redistribution information. 3 * 4 * Copyright (c) 1996,2008 Oracle. All rights reserved. 5 * 6 * $Id: log.h,v 12.39 2008/01/30 04:30:37 mjc Exp $ 7 */ 8 9#ifndef _DB_LOG_H_ 10#define _DB_LOG_H_ 11 12#include "dbinc/db_swap.h" 13 14#if defined(__cplusplus) 15extern "C" { 16#endif 17 18/******************************************************* 19 * DBREG: 20 * The DB file register code keeps track of open files. It's stored 21 * in the log subsystem's shared region, and so appears in the log.h 22 * header file, but is logically separate. 23 *******************************************************/ 24/* 25 * The per-process table that maps log file-id's to DB structures. 26 */ 27typedef struct __db_entry { 28 DB *dbp; /* Open dbp for this file id. */ 29 int deleted; /* File was not found during open. */ 30} DB_ENTRY; 31 32/* 33 * FNAME -- 34 * File name and id. 35 */ 36struct __fname { 37 SH_TAILQ_ENTRY q; /* File name queue. */ 38 39 pid_t pid; /* Process that owns this. */ 40 int32_t id; /* Logging file id. */ 41 int32_t old_id; /* Saved logging file id. */ 42 DBTYPE s_type; /* Saved DB type. */ 43 44 roff_t fname_off; /* File name offset. */ 45 roff_t dname_off; /* Database name offset. */ 46 db_pgno_t meta_pgno; /* Page number of the meta page. */ 47 u_int8_t ufid[DB_FILE_ID_LEN]; /* Unique file id. */ 48 49 u_int32_t create_txnid; /* 50 * Txn ID of the DB create, stored so 51 * we can log it at register time. 52 */ 53 db_mutex_t mutex; /* mutex from db handle. */ 54 /* number of txn referencing + 1 for the db handle. */ 55 u_int32_t txn_ref; 56 57#define DB_FNAME_CLOSED 0x01 /* DBP was closed. */ 58#define DB_FNAME_DURABLE 0x02 /* File is durable. */ 59#define DB_FNAME_INMEM 0x04 /* File is in memory. */ 60#define DB_FNAME_NOTLOGGED 0x08 /* Log of close failed. */ 61#define DB_FNAME_RECOVER 0x10 /* File was opened by recovery code. */ 62#define DB_FNAME_RESTORED 0x20 /* File may be in restored txn. */ 63 u_int32_t flags; 64}; 65 66/* File open/close register log record opcodes. */ 67#define DBREG_CHKPNT 1 /* Checkpoint: file name/id dump. */ 68#define DBREG_CLOSE 2 /* File close. */ 69#define DBREG_OPEN 3 /* File open. */ 70#define DBREG_PREOPEN 4 /* Open in mpool only. */ 71#define DBREG_RCLOSE 5 /* File close after recovery. */ 72#define DBREG_REOPEN 6 /* Open for in-memory database. */ 73 74/******************************************************* 75 * LOG: 76 * The log subsystem information. 77 *******************************************************/ 78struct __hdr; typedef struct __hdr HDR; 79struct __log; typedef struct __log LOG; 80struct __log_persist; typedef struct __log_persist LOGP; 81 82#define LFPREFIX "log." /* Log file name prefix. */ 83#define LFNAME "log.%010d" /* Log file name template. */ 84#define LFNAME_V1 "log.%05d" /* Log file name template, rev 1. */ 85 86#define LG_MAX_DEFAULT (10 * MEGABYTE) /* 10 MB. */ 87#define LG_MAX_INMEM (256 * 1024) /* 256 KB. */ 88#define LG_BSIZE_INMEM (1 * MEGABYTE) /* 1 MB. */ 89 90/* 91 * Allocate a few bytes under a power-of-two value. BDB doesn't care if it's 92 * a power-of-two or not, and requesting slightly under a power-of-two allows 93 * stupid allocators to avoid wasting space. 94 */ 95#define LG_BASE_REGION_SIZE (65000) /* 64KB - 536B */ 96#define LG_BSIZE_DEFAULT (32000) /* 32 KB - 768B */ 97#define LG_CURSOR_BUF_SIZE (32000) /* 32KB - 768B */ 98 99/* 100 * DB_LOG 101 * Per-process log structure. 102 */ 103struct __db_log { 104 /* 105 * These fields need to be protected for multi-threaded support. 106 */ 107 db_mutex_t mtx_dbreg; /* Mutex for thread protection. */ 108 109 DB_ENTRY *dbentry; /* Recovery file-id mapping. */ 110#define DB_GROW_SIZE 64 111 int32_t dbentry_cnt; /* Entries. Grows by DB_GROW_SIZE. */ 112 113 /* 114 * These fields are only accessed when the region lock is held, so 115 * they do not have to be protected by the thread lock as well. 116 */ 117 u_int32_t lfname; /* Log file "name". */ 118 DB_FH *lfhp; /* Log file handle. */ 119 time_t lf_timestamp; /* Log file timestamp. */ 120 121 u_int8_t *bufp; /* Region buffer. */ 122 123 /* These fields are not thread protected. */ 124 ENV *env; /* Environment */ 125 REGINFO reginfo; /* Region information. */ 126 127#define DBLOG_AUTOREMOVE 0x01 /* Autoremove log files. */ 128#define DBLOG_DIRECT 0x02 /* Do direct I/O on the log. */ 129#define DBLOG_DSYNC 0x04 /* Set OS_DSYNC on the log. */ 130#define DBLOG_FORCE_OPEN 0x08 /* Force the DB open even if it appears 131 * to be deleted. */ 132#define DBLOG_INMEMORY 0x10 /* Logging is in memory. */ 133#define DBLOG_OPENFILES 0x20 /* Prepared files need to be open. */ 134#define DBLOG_RECOVER 0x40 /* We are in recovery. */ 135#define DBLOG_ZERO 0x80 /* Zero fill the log. */ 136 u_int32_t flags; 137}; 138 139/* 140 * HDR -- 141 * Log record header. 142 */ 143struct __hdr { 144 u_int32_t prev; /* Previous offset. */ 145 u_int32_t len; /* Current length. */ 146 u_int8_t chksum[DB_MAC_KEY]; /* Current checksum. */ 147 u_int8_t iv[DB_IV_BYTES]; /* IV */ 148 u_int32_t orig_size; /* Original size of log record */ 149 /* !!! - 'size' is not written to log, must be last in hdr */ 150 size_t size; /* Size of header to use */ 151}; 152 153/* 154 * LOG_HDR_SUM -- XOR in prev and len 155 * This helps avoids the race misreading the log while it 156 * it is being updated. 157 */ 158#define LOG_HDR_SUM(crypto, hdr, sum) do { \ 159 if (crypto) { \ 160 ((u_int32_t *)sum)[0] ^= ((HDR *)hdr)->prev; \ 161 ((u_int32_t *)sum)[1] ^= ((HDR *)hdr)->len; \ 162 } else { \ 163 ((u_int32_t *)sum)[0] ^= \ 164 ((HDR *)hdr)->prev ^ ((HDR *)hdr)->len; \ 165 } \ 166} while (0) 167 168/* 169 * We use HDR internally, and then when we write out, we write out 170 * prev, len, and then a 4-byte checksum if normal operation or 171 * a crypto-checksum and IV and original size if running in crypto 172 * mode. We must store the original size in case we pad. Set the 173 * size when we set up the header. We compute a DB_MAC_KEY size 174 * checksum regardless, but we can safely just use the first 4 bytes. 175 */ 176#define HDR_NORMAL_SZ 12 177#define HDR_CRYPTO_SZ 12 + DB_MAC_KEY + DB_IV_BYTES 178 179struct __log_persist { 180 u_int32_t magic; /* DB_LOGMAGIC */ 181 u_int32_t version; /* DB_LOGVERSION */ 182 183 u_int32_t log_size; /* Log file size. */ 184 u_int32_t notused; /* Historically the log file mode. */ 185}; 186 187/* Macros to lock/unlock the log region as a whole. */ 188#define LOG_SYSTEM_LOCK(env) \ 189 MUTEX_LOCK(env, ((LOG *) \ 190 (env)->lg_handle->reginfo.primary)->mtx_region) 191#define LOG_SYSTEM_UNLOCK(env) \ 192 MUTEX_UNLOCK(env, ((LOG *) \ 193 (env)->lg_handle->reginfo.primary)->mtx_region) 194 195/* 196 * LOG -- 197 * Shared log region. One of these is allocated in shared memory, 198 * and describes the log. 199 */ 200struct __log { 201 db_mutex_t mtx_region; /* Region mutex. */ 202 203 db_mutex_t mtx_filelist; /* Mutex guarding file name list. */ 204 205 LOGP persist; /* Persistent information. */ 206 207 SH_TAILQ_HEAD(__fq1) fq; /* List of file names. */ 208 int32_t fid_max; /* Max fid allocated. */ 209 roff_t free_fid_stack; /* Stack of free file ids. */ 210 u_int free_fids; /* Height of free fid stack. */ 211 u_int free_fids_alloced; /* N free fid slots allocated. */ 212 213 /* 214 * The lsn LSN is the file offset that we're about to write and which 215 * we will return to the user. 216 */ 217 DB_LSN lsn; /* LSN at current file offset. */ 218 219 /* 220 * The f_lsn LSN is the LSN (returned to the user) that "owns" the 221 * first byte of the buffer. If the record associated with the LSN 222 * spans buffers, it may not reflect the physical file location of 223 * the first byte of the buffer. 224 */ 225 DB_LSN f_lsn; /* LSN of first byte in the buffer. */ 226 size_t b_off; /* Current offset in the buffer. */ 227 u_int32_t w_off; /* Current write offset in the file. */ 228 u_int32_t len; /* Length of the last record. */ 229 230 DB_LSN active_lsn; /* Oldest active LSN in the buffer. */ 231 size_t a_off; /* Offset in the buffer of first active 232 file. */ 233 234 /* 235 * The s_lsn LSN is the last LSN that we know is on disk, not just 236 * written, but synced. This field is protected by the flush mutex 237 * rather than by the region mutex. 238 */ 239 db_mutex_t mtx_flush; /* Mutex guarding flushing. */ 240 int in_flush; /* Log flush in progress. */ 241 DB_LSN s_lsn; /* LSN of the last sync. */ 242 243 DB_LOG_STAT stat; /* Log statistics. */ 244 245 /* 246 * This timestamp is updated anytime someone unlinks log 247 * files. This can happen when calling __log_vtruncate 248 * or replication internal init when it unlinks log files. 249 * 250 * The timestamp is used so that other processes that might 251 * have file handles to log files know to close/reopen them 252 * so they're not potentially writing to now-removed files. 253 */ 254 time_t timestamp; /* Log trunc timestamp. */ 255 256 /* 257 * !!! 258 * NOTE: the next group of fields are NOT protected by the log 259 * region lock. They are protected by REP->mtx_clientdb. If you 260 * need access to both, you must acquire REP->mtx_clientdb 261 * before acquiring the log region lock. 262 * 263 * The waiting_lsn is used by the replication system. It is the 264 * first LSN that we are holding without putting in the log, because 265 * we received one or more log records out of order. Associated with 266 * the waiting_lsn is the number of log records that we still have to 267 * receive before we decide that we should request it again. 268 * 269 * The max_wait_lsn is used to control retransmission in the face 270 * of dropped messages. If we are requesting all records from the 271 * current gap (i.e., chunk of the log that we are missing), then 272 * the max_wait_lsn contains the first LSN that we are known to have 273 * in the __db.rep.db. If we requested only a single record, then 274 * the max_wait_lsn has the LSN of that record we requested. 275 */ 276 /* BEGIN fields protected by rep->mtx_clientdb. */ 277 DB_LSN waiting_lsn; /* First log record after a gap. */ 278 DB_LSN verify_lsn; /* LSN we are waiting to verify. */ 279 DB_LSN max_wait_lsn; /* Maximum LSN requested. */ 280 DB_LSN max_perm_lsn; /* Maximum PERMANENT LSN processed. */ 281 db_timespec max_lease_ts; /* Maximum Lease timestamp seen. */ 282 db_timespec wait_ts; /* Time to wait before requesting. */ 283 db_timespec rcvd_ts; /* Initial received time to wait. */ 284 db_timespec last_ts; /* Last time of insert in temp db. */ 285 /* 286 * The ready_lsn is also used by the replication system. It is the 287 * next LSN we expect to receive. It's normally equal to "lsn", 288 * except at the beginning of a log file, at which point it's set 289 * to the LSN of the first record of the new file (after the 290 * header), rather than to 0. 291 */ 292 DB_LSN ready_lsn; 293 /* 294 * The bulk_buf is used by replication for bulk transfer. While this 295 * is protected by REP->mtx_clientdb, this doesn't contend with the 296 * above fields because the above are used by clients and the bulk 297 * fields below are used by a master. 298 */ 299 roff_t bulk_buf; /* Bulk transfer buffer in region. */ 300 uintptr_t bulk_off; /* Current offset into bulk buffer. */ 301 u_int32_t bulk_len; /* Length of buffer. */ 302 u_int32_t bulk_flags; /* Bulk buffer flags. */ 303 /* END fields protected by rep->mtx_clientdb. */ 304 305 /* 306 * During initialization, the log system walks forward through the 307 * last log file to find its end. If it runs into a checkpoint 308 * while it's doing so, it caches it here so that the transaction 309 * system doesn't need to walk through the file again on its 310 * initialization. 311 */ 312 DB_LSN cached_ckp_lsn; 313 314 u_int32_t regionmax; /* Configured size of the region. */ 315 316 roff_t buffer_off; /* Log buffer offset in the region. */ 317 u_int32_t buffer_size; /* Log buffer size. */ 318 319 u_int32_t log_size; /* Log file's size. */ 320 u_int32_t log_nsize; /* Next log file's size. */ 321 322 int filemode; /* Log file permissions mode. */ 323 324 /* 325 * DB_LOG_AUTOREMOVE and DB_LOG_INMEMORY: not protected by a mutex, 326 * all we care about is if they're zero or non-zero. 327 */ 328 int db_log_autoremove; 329 int db_log_inmemory; 330 331 u_int32_t ncommit; /* Number of txns waiting to commit. */ 332 DB_LSN t_lsn; /* LSN of first commit */ 333 SH_TAILQ_HEAD(__commit) commits;/* list of txns waiting to commit. */ 334 SH_TAILQ_HEAD(__free) free_commits;/* free list of commit structs. */ 335 336 /* 337 * In-memory logs maintain a list of the start positions of all log 338 * files currently active in the in-memory buffer. This is to make the 339 * lookup from LSN to log buffer offset efficient. 340 */ 341 SH_TAILQ_HEAD(__logfile) logfiles; 342 SH_TAILQ_HEAD(__free_logfile) free_logfiles; 343}; 344 345/* 346 * __db_commit structure -- 347 * One of these is allocated for each transaction waiting to commit. 348 */ 349struct __db_commit { 350 db_mutex_t mtx_txnwait; /* Mutex for txn to wait on. */ 351 DB_LSN lsn; /* LSN of commit record. */ 352 SH_TAILQ_ENTRY links; /* Either on free or waiting list. */ 353 354#define DB_COMMIT_FLUSH 0x0001 /* Flush the log when you wake up. */ 355 u_int32_t flags; 356}; 357 358/* 359 * Check for the proper progression of Log Sequence Numbers. 360 * If we are rolling forward the LSN on the page must be greater 361 * than or equal to the previous LSN in log record. 362 * We ignore NOT LOGGED LSNs. The user did an unlogged update. 363 * We should eventually see a log record that matches and continue 364 * forward. 365 * A ZERO LSN implies a page that was allocated prior to the recovery 366 * start point and then truncated later in the log. An allocation of a 367 * page after this page will extend the file, leaving a hole. We want to 368 * ignore this page until it is truncated again. 369 * 370 */ 371 372#define CHECK_LSN(e, redo, cmp, lsn, prev) \ 373 if (DB_REDO(redo) && (cmp) < 0 && \ 374 ((!IS_NOT_LOGGED_LSN(*(lsn)) && !IS_ZERO_LSN(*(lsn))) || \ 375 IS_REP_CLIENT(e))) { \ 376 ret = __db_check_lsn(e, lsn, prev); \ 377 goto out; \ 378 } 379 380/* 381 * Helper for in-memory logs -- check whether an offset is in range 382 * in a ring buffer (inclusive of start, exclusive of end). 383 */ 384struct __db_filestart { 385 u_int32_t file; 386 size_t b_off; 387 388 SH_TAILQ_ENTRY links; /* Either on free or waiting list. */ 389}; 390 391#define RINGBUF_LEN(lp, start, end) \ 392 ((start) < (end) ? \ 393 (end) - (start) : (lp)->buffer_size - ((start) - (end))) 394 395/* 396 * Internal macro to set pointer to the begin_lsn for generated 397 * logging routines. If begin_lsn is already set then do nothing. 398 * Return a pointer to the last lsn too. 399 */ 400#undef DB_SET_TXN_LSNP 401#define DB_SET_TXN_LSNP(txn, blsnp, llsnp) do { \ 402 DB_LSN *__lsnp; \ 403 TXN_DETAIL *__td; \ 404 __td = (txn)->td; \ 405 *(llsnp) = &__td->last_lsn; \ 406 while (__td->parent != INVALID_ROFF) \ 407 __td = R_ADDR(&(txn)->mgrp->reginfo, __td->parent); \ 408 __lsnp = &__td->begin_lsn; \ 409 if (IS_ZERO_LSN(*__lsnp)) \ 410 *(blsnp) = __lsnp; \ 411} while (0) 412 413/* 414 * These are used in __log_backup to determine which LSN in the 415 * checkpoint record to compare and return. 416 */ 417#define CKPLSN_CMP 0 418#define LASTCKP_CMP 1 419 420/* 421 * Status codes indicating the validity of a log file examined by 422 * __log_valid(). 423 */ 424typedef enum { 425 DB_LV_INCOMPLETE, 426 DB_LV_NONEXISTENT, 427 DB_LV_NORMAL, 428 DB_LV_OLD_READABLE, 429 DB_LV_OLD_UNREADABLE 430} logfile_validity; 431 432#if defined(__cplusplus) 433} 434#endif 435 436#include "dbinc_auto/dbreg_auto.h" 437#include "dbinc_auto/dbreg_ext.h" 438#include "dbinc_auto/log_ext.h" 439#endif /* !_DB_LOG_H_ */ 440