/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1996,2008 Oracle.  All rights reserved.
 *
 * $Id: mp.h,v 12.45 2008/03/10 13:28:01 mjc Exp $
 */

#ifndef	_DB_MP_H_
#define	_DB_MP_H_

#if defined(__cplusplus)
extern "C" {
#endif

struct __bh;		typedef struct __bh BH;
struct __bh_frozen_p;	typedef struct __bh_frozen_p BH_FROZEN_PAGE;
struct __bh_frozen_a;	typedef struct __bh_frozen_a BH_FROZEN_ALLOC;
struct __db_mpool_hash; typedef struct __db_mpool_hash DB_MPOOL_HASH;
struct __db_mpreg;	typedef struct __db_mpreg DB_MPREG;
struct __mpool;		typedef struct __mpool MPOOL;

				/* We require at least 20KB of cache. */
#define	DB_CACHESIZE_MIN	(20 * 1024)

/*
 * DB_MPOOLFILE initialization methods cannot be called after open is called,
 * other methods cannot be called before open is called
 */
#define	MPF_ILLEGAL_AFTER_OPEN(dbmfp, name)				\
	if (F_ISSET(dbmfp, MP_OPEN_CALLED))				\
		return (__db_mi_open((dbmfp)->env, name, 1));
#define	MPF_ILLEGAL_BEFORE_OPEN(dbmfp, name)				\
	if (!F_ISSET(dbmfp, MP_OPEN_CALLED))				\
		return (__db_mi_open((dbmfp)->env, name, 0));

/*
 * Cache flush operations, plus modifiers.
 */
#define	DB_SYNC_ALLOC		0x0001	/* Flush for allocation. */
#define	DB_SYNC_CACHE		0x0002	/* Flush entire cache. */
#define	DB_SYNC_CHECKPOINT	0x0004	/* Checkpoint. */
#define	DB_SYNC_FILE		0x0008	/* Flush file. */
#define	DB_SYNC_INTERRUPT_OK	0x0010	/* Allow interrupt and return OK. */
#define	DB_SYNC_QUEUE_EXTENT	0x0020	/* Flush a queue file with extents. */
#define	DB_SYNC_SUPPRESS_WRITE	0x0040	/* Ignore max-write configuration. */
#define	DB_SYNC_TRICKLE		0x0080	/* Trickle sync. */

/*
 * DB_MPOOL --
 *	Per-process memory pool structure.
 */
struct __db_mpool {
	/* These fields need to be protected for multi-threaded support. */
	db_mutex_t mutex;		/* Thread mutex. */

	/*
	 * DB_MPREG structure for the DB pgin/pgout routines.
	 *
	 * Linked list of application-specified pgin/pgout routines.
	 */
	DB_MPREG *pg_inout;
	LIST_HEAD(__db_mpregh, __db_mpreg) dbregq;

	/* List of DB_MPOOLFILE's. */
	TAILQ_HEAD(__db_mpoolfileh, __db_mpoolfile) dbmfq;

	/*
	 * The env and reginfo fields are not thread protected, as they are
	 * initialized during mpool creation, and not modified again.
	 */
	ENV *env;			/* Enclosing environment. */
	REGINFO *reginfo;		/* Underlying cache regions. */
};

/*
 * DB_MPREG --
 *	DB_MPOOL registry of pgin/pgout functions.
 */
struct __db_mpreg {
	LIST_ENTRY(__db_mpreg) q;	/* Linked list. */

	int32_t ftype;			/* File type. */
					/* Pgin, pgout routines. */
	int (*pgin) __P((DB_ENV *, db_pgno_t, void *, DBT *));
	int (*pgout) __P((DB_ENV *, db_pgno_t, void *, DBT *));
};

/*
 * File hashing --
 *	We hash each file to hash bucket based on its fileid
 *	or, in the case of in memory files, its name.
 */

/* Number of file hash buckets, a small prime number */
#define	MPOOL_FILE_BUCKETS	17

#define	FHASH(id, len)	__ham_func5(NULL, id, (u_int32_t)(len))

#define	FNBUCKET(id, len)						\
	(FHASH(id, len) % MPOOL_FILE_BUCKETS)

/* Macros to lock/unlock the mpool region as a whole. */
#define	MPOOL_SYSTEM_LOCK(env)						\
	MUTEX_LOCK(env, ((MPOOL *)					\
	    (env)->mp_handle->reginfo[0].primary)->mtx_region)
#define	MPOOL_SYSTEM_UNLOCK(env)					\
	MUTEX_UNLOCK(env, ((MPOOL *)					\
	    (env)->mp_handle->reginfo[0].primary)->mtx_region)

/* Macros to lock/unlock a specific mpool region. */
#define	MPOOL_REGION_LOCK(env, infop)					\
	MUTEX_LOCK(env, ((MPOOL *)(infop)->primary)->mtx_region)
#define	MPOOL_REGION_UNLOCK(env, infop)					\
	MUTEX_UNLOCK(env, ((MPOOL *)(infop)->primary)->mtx_region)

/*
 * MPOOL --
 *	Shared memory pool region.
 */
struct __mpool {
	/*
	 * The memory pool can be broken up into individual pieces/files.
	 * There are two reasons for this: firstly, on Solaris you can
	 * allocate only a little more than 2GB of memory in a contiguous
	 * chunk, and I expect to see more systems with similar issues.
	 * Secondly, applications can add / remove pieces to dynamically
	 * resize the cache.
	 *
	 * While this structure is duplicated in each piece of the cache,
	 * the first of these pieces/files describes the entire pool, the
	 * second only describe a piece of the cache.
	 */
	db_mutex_t mtx_region;		/* Region mutex. */
	db_mutex_t mtx_resize;		/* Resizing mutex. */

	/*
	 * The lsn field and list of underlying MPOOLFILEs are thread
	 * protected by the region lock.
	 */
	DB_LSN lsn;			/* Maximum checkpoint LSN. */

	/* Configuration information: protected by the region lock. */
	u_int32_t max_nreg;		/* Maximum number of regions. */
	size_t mp_mmapsize;		/* Maximum file size for mmap. */
	int mp_maxopenfd;		/* Maximum open file descriptors. */
	int mp_maxwrite;		/* Maximum buffers to write. */
	db_timeout_t mp_maxwrite_sleep;	/* Sleep after writing max buffers. */

	/*
	 * The number of regions and the total number of hash buckets across
	 * all regions.
	 * These fields are not protected by a mutex because we assume that we
	 * can read a 32-bit value atomically.  They are only modified by cache
	 * resizing which holds the mpool resizing mutex to ensure that
	 * resizing is single-threaded.  See the comment in mp_resize.c for
	 * more information.
	 */
	u_int32_t nreg;			/* Number of underlying REGIONS. */
	u_int32_t nbuckets;		/* Total number of hash buckets. */

	/*
	 * The regid field is protected by the resize mutex.
	 */
	roff_t regids;			/* Array of underlying REGION Ids. */

	roff_t ftab;			/* Hash table of files. */

	/*
	 * The following fields describe the per-cache portion of the region.
	 *
	 * The htab and htab_buckets fields are not thread protected as they
	 * are initialized during mpool creation, and not modified again.
	 *
	 * The last_checked and lru_count fields are thread protected by
	 * the region lock.
	 */
	roff_t htab;			/* Hash table offset. */
	u_int32_t htab_buckets;		/* Number of hash table entries. */
	u_int32_t last_checked;		/* Last bucket checked for free. */
	u_int32_t lru_count;		/* Counter for buffer LRU. */
	int32_t lru_reset;		/* Hash bucket lru reset point. */

	/*
	 * The stat fields are generally not thread protected, and cannot be
	 * trusted.  Note that st_pages is an exception, and is always updated
	 * inside a region lock (although it is sometimes read outside of the
	 * region lock).
	 */
	DB_MPOOL_STAT stat;		/* Per-cache mpool statistics. */

	/*
	 * We track page puts so that we can decide when allocation is never
	 * going to succeed.  We don't lock the field, all we care about is
	 * if it changes.
	 */
	u_int32_t put_counter;		/* Count of page put calls. */

	/*
	 * Cache flush operations take a long time...
	 *
	 * Some cache flush operations want to ignore the app's configured
	 * max-write parameters (they are trying to quickly shut down an
	 * environment, for example).  We can't specify that as an argument
	 * to the cache region functions, because we may decide to ignore
	 * the max-write configuration after the cache operation has begun.
	 * If the variable suppress_maxwrite is set, ignore the application
	 * max-write config.
	 *
	 * We may want to interrupt cache flush operations in high-availability
	 * configurations.
	 */
#define	DB_MEMP_SUPPRESS_WRITE	0x01
#define	DB_MEMP_SYNC_INTERRUPT	0x02
	u_int32_t config_flags;

	/* Free frozen buffer headers, protected by the region lock. */
	SH_TAILQ_HEAD(__free_frozen) free_frozen;

	/* Allocated blocks of frozen buffer headers. */
	SH_TAILQ_HEAD(__alloc_frozen) alloc_frozen;
};

/*
 * NREGION --
 *	Select a cache region given the bucket number.
 */
#define	NREGION(mp, bucket)						\
	((bucket) / (mp)->htab_buckets)

/*
 * MP_HASH --
 *	We make the assumption that early pages of the file are more likely
 *	to be retrieved than the later pages, which means the top bits will
 *	be more interesting for hashing as they're less likely to collide.
 *	That said, as 512 8K pages represents a 4MB file, so only reasonably
 *	large files will have page numbers with any other than the bottom 9
 *	bits set.  We XOR in the MPOOL offset of the MPOOLFILE that backs the
 *	page, since that should also be unique for the page.  We don't want
 *	to do anything very fancy -- speed is more important to us than using
 *	good hashing.
 *
 * Since moving to a dynamic hash, which boils down to using some of the
 * least significant bits of the hash value, we no longer want to use a
 * simple shift here, because it's likely with a bit shift that mf_offset
 * will be ignored, and pages from different files end up in the same
 * hash bucket.  Use a nearby prime instead.
 */
#define	MP_HASH(mf_offset, pgno)					\
	((((pgno) << 8) ^ (pgno)) ^ ((mf_offset) * 509))

/*
 * Inline the calculation of the mask, since we can't reliably store the mask
 * with the number of buckets in the region.
 *
 * This is equivalent to:
 *	mask = (1 << __db_log2(nbuckets)) - 1;
 */
#define	MP_MASK(nbuckets, mask) do {					\
	for (mask = 1; mask < (nbuckets); mask = (mask << 1) | 1)	\
		;							\
} while (0)

#define	MP_HASH_BUCKET(hash, nbuckets, mask, bucket) do {		\
	(bucket) = (hash) & (mask);					\
	if ((bucket) >= (nbuckets))					\
		(bucket) &= ((mask) >> 1);				\
} while (0)

#define	MP_BUCKET(mf_offset, pgno, nbuckets, bucket) do {		\
	u_int32_t __mask;						\
	MP_MASK(nbuckets, __mask);					\
	MP_HASH_BUCKET(MP_HASH(mf_offset, pgno), nbuckets,		\
	    __mask, bucket);						\
} while (0)

/*
 * MP_GET_REGION --
 *	Select the region for a given page.
 */
#define	MP_GET_REGION(dbmfp, pgno, infopp, ret) do {			\
	DB_MPOOL *__t_dbmp;						\
	MPOOL *__t_mp;							\
									\
	__t_dbmp = dbmfp->env->mp_handle;				\
	__t_mp = __t_dbmp->reginfo[0].primary;				\
	if (__t_mp->max_nreg == 1) {					\
		*(infopp) = &__t_dbmp->reginfo[0];			\
	} else								\
		ret = __memp_get_bucket((dbmfp)->env,			\
		    (dbmfp)->mfp, (pgno), (infopp), NULL);		\
} while (0)

/*
 * MP_GET_BUCKET --
 *	Select and lock the bucket for a given page.
 */
#define	MP_GET_BUCKET(env, mfp, pgno, infopp, hp, ret) do {		\
	DB_MPOOL *__t_dbmp;						\
	MPOOL *__t_mp;							\
	roff_t __t_mf_offset;						\
	u_int32_t __t_bucket;						\
									\
	__t_dbmp = (env)->mp_handle;					\
	__t_mp = __t_dbmp->reginfo[0].primary;				\
	if (__t_mp->max_nreg == 1) {					\
		*(infopp) = &__t_dbmp->reginfo[0];			\
		__t_mf_offset = R_OFFSET(*(infopp), (mfp));		\
		MP_BUCKET(__t_mf_offset,				\
		    (pgno), __t_mp->nbuckets, __t_bucket);		\
		(hp) = R_ADDR(*(infopp), __t_mp->htab);			\
		(hp) = &(hp)[__t_bucket];				\
		MUTEX_LOCK(env, (hp)->mtx_hash);			\
		ret = 0;						\
	} else								\
		ret = __memp_get_bucket((env),				\
		    (mfp), (pgno), (infopp), &(hp));			\
} while (0)

struct __db_mpool_hash {
	db_mutex_t mtx_hash;		/* Per-bucket mutex. */
	db_mutex_t mtx_io;		/* Buffer I/O mutex. */

	DB_HASHTAB hash_bucket;		/* Head of bucket. */

	u_int32_t hash_page_dirty;	/* Count of dirty pages. */

#ifndef __TEST_DB_NO_STATISTICS
	u_int32_t hash_io_wait;		/* Count of I/O waits. */
	u_int32_t hash_frozen;		/* Count of frozen buffers. */
	u_int32_t hash_thawed;		/* Count of thawed buffers. */
	u_int32_t hash_frozen_freed;	/* Count of freed frozen buffers. */
#endif

	DB_LSN old_reader;		/* Oldest snapshot reader (cached). */

#define	IO_WAITER	0x001		/* Thread is waiting on page. */
	u_int32_t flags;
};

/*
 * The base mpool priority is 1/4th of the name space, or just under 2^30.
 * When the LRU counter wraps, we shift everybody down to a base-relative
 * value.
 */
#define	MPOOL_BASE_DECREMENT	(UINT32_MAX - (UINT32_MAX / 4))

/*
 * Mpool priorities from low to high.  Defined in terms of fractions of the
 * buffers in the pool.
 */
#define	MPOOL_PRI_VERY_LOW	-1	/* Dead duck.  Check and set to 0. */
#define	MPOOL_PRI_LOW		-2	/* Low. */
#define	MPOOL_PRI_DEFAULT	0	/* No adjustment -- special case.*/
#define	MPOOL_PRI_HIGH		10	/* With the dirty buffers. */
#define	MPOOL_PRI_DIRTY		10	/* Dirty gets a 10% boost. */
#define	MPOOL_PRI_VERY_HIGH	1	/* Add number of buffers in pool. */

/*
 * MPOOLFILE --
 *	Shared DB_MPOOLFILE information.
 */
struct __mpoolfile {
	db_mutex_t mutex;		/* MPOOLFILE mutex. */

	/* Protected by MPOOLFILE mutex. */
	u_int32_t mpf_cnt;		/* Ref count: DB_MPOOLFILEs. */
	u_int32_t block_cnt;		/* Ref count: blocks in cache. */
	db_pgno_t last_pgno;		/* Last page in the file. */
	db_pgno_t last_flushed_pgno;	/* Last page flushed to disk. */
	db_pgno_t orig_last_pgno;	/* Original last page in the file. */
	db_pgno_t maxpgno;		/* Maximum page number. */

	roff_t path_off;		/* File name location. */

	/* Protected by hash bucket mutex. */
	SH_TAILQ_ENTRY q;		/* List of MPOOLFILEs */

	/*
	 * The following are used for file compaction processing.
	 * They are only used when a thread is in the process
	 * of trying to move free pages to the end of the file.
	 * Other threads may look here when freeing a page.
	 * Protected by a lock on the metapage.
	 */
	u_int32_t free_ref;		/* Refcount to freelist. */
	u_int32_t free_cnt;		/* Count of free pages. */
	size_t free_size;		/* Allocated size of free list. */
	roff_t free_list;		/* Offset to free list. */

	/*
	 * We normally don't lock the deadfile field when we read it since we
	 * only care if the field is zero or non-zero.  We do lock on read when
	 * searching for a matching MPOOLFILE -- see that code for more detail.
	 */
	int32_t deadfile;		/* Dirty pages can be discarded. */

	u_int32_t bucket;		/* hash bucket for this file. */

	/*
	 * None of the following fields are thread protected.
	 *
	 * There are potential races with the ftype field because it's read
	 * without holding a lock.  However, it has to be set before adding
	 * any buffers to the cache that depend on it being set, so there
	 * would need to be incorrect operation ordering to have a problem.
	 */
	int32_t ftype;			/* File type. */

	/*
	 * There are potential races with the priority field because it's read
	 * without holding a lock.  However, a collision is unlikely and if it
	 * happens is of little consequence.
	 */
	int32_t priority;		/* Priority when unpinning buffer. */

	/*
	 * There are potential races with the file_written field (many threads
	 * may be writing blocks at the same time), and with no_backing_file
	 * and unlink_on_close fields, as they may be set while other threads
	 * are reading them.  However, we only care if the field value is zero
	 * or non-zero, so don't lock the memory.
	 *
	 * !!!
	 * Theoretically, a 64-bit architecture could put two of these fields
	 * in a single memory operation and we could race.  I have never seen
	 * an architecture where that's a problem, and I believe Java requires
	 * that to never be the case.
	 *
	 * File_written is set whenever a buffer is marked dirty in the cache.
	 * It can be cleared in some cases, after all dirty buffers have been
	 * written AND the file has been flushed to disk.
	 */
	int32_t file_written;		/* File was written. */
	int32_t no_backing_file;	/* Never open a backing file. */
	int32_t unlink_on_close;	/* Unlink file on last close. */
	int32_t multiversion;		/* Number of DB_MULTIVERSION handles. */

	/*
	 * We do not protect the statistics in "stat" because of the cost of
	 * the mutex in the get/put routines.  There is a chance that a count
	 * will get lost.
	 */
	DB_MPOOL_FSTAT stat;		/* Per-file mpool statistics. */

	/*
	 * The remaining fields are initialized at open and never subsequently
	 * modified.
	 */
	int32_t lsn_off;		/* Page's LSN offset. */
	u_int32_t clear_len;		/* Bytes to clear on page create. */

	roff_t fileid_off;		/* File ID string location. */

	roff_t pgcookie_len;		/* Pgin/pgout cookie length. */
	roff_t pgcookie_off;		/* Pgin/pgout cookie location. */

	/*
	 * The flags are initialized at open and never subsequently modified.
	 */
#define	MP_CAN_MMAP		0x001	/* If the file can be mmap'd. */
#define	MP_DIRECT		0x002	/* No OS buffering. */
#define	MP_DURABLE_UNKNOWN	0x004	/* We don't care about durability. */
#define	MP_EXTENT		0x008	/* Extent file. */
#define	MP_FAKE_DEADFILE	0x010	/* Deadfile field: fake flag. */
#define	MP_FAKE_FILEWRITTEN	0x020	/* File_written field: fake flag. */
#define	MP_FAKE_NB		0x040	/* No_backing_file field: fake flag. */
#define	MP_FAKE_UOC		0x080	/* Unlink_on_close field: fake flag. */
#define	MP_NOT_DURABLE		0x100	/* File is not durable. */
#define	MP_TEMP			0x200	/* Backing file is a temporary. */
	u_int32_t flags;
};

/*
 * Flags to __memp_bh_free.
 */
#define	BH_FREE_FREEMEM		0x01
#define	BH_FREE_REUSE		0x02
#define	BH_FREE_UNLOCKED	0x04

/*
 * BH --
 *	Buffer header.
 */
struct __bh {
	u_int16_t ref;			/* Reference count. */
	u_int16_t ref_sync;		/* Sync wait-for reference count. */

#define	BH_CALLPGIN	0x001		/* Convert the page before use. */
#define	BH_DIRTY	0x002		/* Page is modified. */
#define	BH_DIRTY_CREATE	0x004		/* Page is modified. */
#define	BH_DISCARD	0x008		/* Page is useless. */
#define	BH_FREED	0x010		/* Page was freed. */
#define	BH_FROZEN	0x020		/* Frozen buffer: allocate & re-read. */
#define	BH_LOCKED	0x040		/* Page is locked (I/O in progress). */
#define	BH_TRASH	0x080		/* Page is garbage. */
#define	BH_THAWED	0x100		/* Page was thawed. */
	u_int16_t flags;

	u_int32_t priority;		/* Priority. */
	SH_TAILQ_ENTRY hq;		/* MPOOL hash bucket queue. */

	db_pgno_t pgno;			/* Underlying MPOOLFILE page number. */
	roff_t mf_offset;		/* Associated MPOOLFILE offset. */

	roff_t td_off;			/* MVCC: creating TXN_DETAIL offset. */
	SH_CHAIN_ENTRY vc;		/* MVCC: version chain. */
#ifdef DIAG_MVCC
	u_int16_t align_off;		/* Alignment offset for diagnostics.*/
#endif

	/*
	 * !!!
	 * This array must be at least size_t aligned -- the DB access methods
	 * put PAGE and other structures into it, and then access them
	 * directly.  (We guarantee size_t alignment to applications in the
	 * documentation, too.)
	 */
	u_int8_t buf[1];		/* Variable length data. */
};

/*
 * BH_FROZEN_PAGE --
 *	Data used to find a frozen buffer header.
 */
struct __bh_frozen_p {
	BH header;
	db_pgno_t spgno;		/* Page number in freezer file. */
};

/*
 * BH_FROZEN_ALLOC --
 *	Frozen buffer headers are allocated a page at a time in general.  This
 *	structure is allocated at the beginning of the page so that the
 *	allocation chunks can be tracked and freed (for private environments).
 */
struct __bh_frozen_a {
	SH_TAILQ_ENTRY links;
};

#define	MULTIVERSION(dbp)	((dbp)->mpf->mfp->multiversion)
#define	IS_DIRTY(p)							\
    F_ISSET((BH *)((u_int8_t *)(p) - SSZA(BH, buf)), BH_DIRTY)

#define	BH_OWNER(env, bhp)						\
    ((TXN_DETAIL *)R_ADDR(&env->tx_handle->reginfo, bhp->td_off))

#define	BH_OWNED_BY(env, bhp, txn)	((txn) != NULL &&		\
    (bhp)->td_off != INVALID_ROFF &&					\
    (txn)->td == BH_OWNER(env, bhp))

#define	BH_PRIORITY(bhp)						\
    (SH_CHAIN_SINGLETON(bhp, vc) ? (bhp)->priority :			\
    __memp_bh_priority(bhp))

#define	VISIBLE_LSN(env, bhp)						\
    (&BH_OWNER(env, bhp)->visible_lsn)

/*
 * Make a copy of the buffer's visible LSN, one field at a time.  We rely on
 * the 32-bit operations being atomic.  The visible_lsn starts at MAX_LSN and
 * is set during commit or abort to the current LSN.
 *
 * If we race with a commit / abort, we may see either the file or the offset
 * still at UINT32_MAX, so vlsn is guaranteed to be in the future.  That's OK,
 * since we had to take the log region lock to allocate the read LSN so we were
 * never going to see this buffer anyway.
 */
#define	BH_VISIBLE(env, bhp, read_lsnp, vlsn)				\
    (bhp->td_off == INVALID_ROFF ||					\
    ((vlsn).file = VISIBLE_LSN(env, bhp)->file,				\
    (vlsn).offset = VISIBLE_LSN(env, bhp)->offset,			\
    LOG_COMPARE((read_lsnp), &(vlsn)) >= 0))

#define	BH_OBSOLETE(bhp, old_lsn, vlsn)	(SH_CHAIN_HASNEXT(bhp, vc) ?	\
    BH_VISIBLE(env, SH_CHAIN_NEXTP(bhp, vc, __bh), &(old_lsn), vlsn) :	\
    BH_VISIBLE(env, bhp, &(old_lsn), vlsn))

#define	MVCC_SKIP_CURADJ(dbc, pgno)					\
    (dbc->txn != NULL && F_ISSET(dbc->txn, TXN_SNAPSHOT) &&		\
    dbc->txn->td != NULL && __memp_skip_curadj(dbc, pgno))

#if defined(DIAG_MVCC) && defined(HAVE_MPROTECT)
#define	VM_PAGESIZE 4096
#define	MVCC_BHSIZE(mfp, sz) do {					\
	sz += VM_PAGESIZE + sizeof(BH);					\
	if (mfp->stat.st_pagesize < VM_PAGESIZE)			\
		sz += VM_PAGESIZE - mfp->stat.st_pagesize;		\
} while (0)

#define	MVCC_BHALIGN(mfp, p) do {					\
	if (mfp != NULL) {						\
		BH *__bhp;						\
		void *__orig = (p);					\
		p = ALIGNP_INC(p, VM_PAGESIZE);				\
		if ((u_int8_t *)p < (u_int8_t *)__orig + sizeof(BH))	\
			p = (u_int8_t *)p + VM_PAGESIZE;		\
		__bhp = (BH *)((u_int8_t *)p - SSZA(BH, buf));		\
		DB_ASSERT(env,						\
		    ((uintptr_t)__bhp->buf & (VM_PAGESIZE - 1)) == 0);	\
		DB_ASSERT(env,						\
		    (u_int8_t *)__bhp >= (u_int8_t *)__orig);		\
		DB_ASSERT(env, (u_int8_t *)p + mfp->stat.st_pagesize <	\
		    (u_int8_t *)__orig + len);				\
		__bhp->align_off =					\
		    (u_int16_t)((u_int8_t *)__bhp - (u_int8_t *)__orig);\
		p = __bhp;						\
	}								\
} while (0)

#define	MVCC_BHUNALIGN(mfp, p) do {					\
	if ((mfp) != NULL) {						\
		BH *bhp = (BH *)(p);					\
		(p) = ((u_int8_t *)bhp - bhp->align_off);		\
	}								\
} while (0)

#ifdef linux
#define	MVCC_MPROTECT(buf, sz, mode) do {				\
	int __ret = mprotect((buf), (sz), (mode));			\
	DB_ASSERT(env, __ret == 0);					\
} while (0)
#else
#define	MVCC_MPROTECT(buf, sz, mode) do {				\
	if (!F_ISSET(env, ENV_PRIVATE | ENV_SYSTEM_MEM)) {		\
		int __ret = mprotect((buf), (sz), (mode));		\
		DB_ASSERT(env, __ret == 0);				\
	}								\
} while (0)
#endif /* linux */

#else /* defined(DIAG_MVCC) && defined(HAVE_MPROTECT) */
#define	MVCC_BHSIZE(mfp, sz) do {} while (0)
#define	MVCC_BHALIGN(mfp, p) do {} while (0)
#define	MVCC_BHUNALIGN(mfp, p) do {} while (0)
#define	MVCC_MPROTECT(buf, size, mode) do {} while (0)
#endif

/*
 * Flags to __memp_ftruncate.
 */
#define	MP_TRUNC_RECOVER	0x01

#if defined(__cplusplus)
}
#endif

#include "dbinc_auto/mp_ext.h"
#endif /* !_DB_MP_H_ */