/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1996,2008 Oracle.  All rights reserved.
 *
 * $Id: mp.h,v 12.45 2008/03/10 13:28:01 mjc Exp $
 */

#ifndef	_DB_MP_H_
#define	_DB_MP_H_

#if defined(__cplusplus)
extern "C" {
#endif

struct __bh;		typedef struct __bh BH;
struct __bh_frozen_p;	typedef struct __bh_frozen_p BH_FROZEN_PAGE;
struct __bh_frozen_a;	typedef struct __bh_frozen_a BH_FROZEN_ALLOC;
struct __db_mpool_hash; typedef struct __db_mpool_hash DB_MPOOL_HASH;
struct __db_mpreg;	typedef struct __db_mpreg DB_MPREG;
struct __mpool;		typedef struct __mpool MPOOL;

				/* We require at least 20KB of cache. */
#define	DB_CACHESIZE_MIN	(20 * 1024)

/*
 * DB_MPOOLFILE initialization methods cannot be called after open is called;
 * other methods cannot be called before open is called.
 */
#define	MPF_ILLEGAL_AFTER_OPEN(dbmfp, name)				\
	if (F_ISSET(dbmfp, MP_OPEN_CALLED))				\
		return (__db_mi_open((dbmfp)->env, name, 1));
#define	MPF_ILLEGAL_BEFORE_OPEN(dbmfp, name)				\
	if (!F_ISSET(dbmfp, MP_OPEN_CALLED))				\
		return (__db_mi_open((dbmfp)->env, name, 0));
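
/*
 * For example (a sketch, not a prototype from this file): a DB_MPOOLFILE
 * configuration method typically opens with one of these guards, so that a
 * call such as set_clear_len fails cleanly once open has been called:
 *
 *	int
 *	__memp_set_clear_len(dbmfp, clear_len)
 *		DB_MPOOLFILE *dbmfp;
 *		u_int32_t clear_len;
 *	{
 *		MPF_ILLEGAL_AFTER_OPEN(dbmfp, "DB_MPOOLFILE->set_clear_len");
 *		dbmfp->clear_len = clear_len;
 *		return (0);
 *	}
 */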

/*
 * Cache flush operations, plus modifiers.
 */
#define	DB_SYNC_ALLOC		0x0001	/* Flush for allocation. */
#define	DB_SYNC_CACHE		0x0002	/* Flush entire cache. */
#define	DB_SYNC_CHECKPOINT	0x0004	/* Checkpoint. */
#define	DB_SYNC_FILE		0x0008	/* Flush file. */
#define	DB_SYNC_INTERRUPT_OK	0x0010	/* Allow interrupt and return OK. */
#define	DB_SYNC_QUEUE_EXTENT	0x0020	/* Flush a queue file with extents. */
#define	DB_SYNC_SUPPRESS_WRITE	0x0040	/* Ignore max-write configuration. */
#define	DB_SYNC_TRICKLE		0x0080	/* Trickle sync. */

/*
 * DB_MPOOL --
 *	Per-process memory pool structure.
 */
struct __db_mpool {
	/* These fields need to be protected for multi-threaded support. */
	db_mutex_t mutex;		/* Thread mutex. */

	/*
	 * DB_MPREG structure for the DB pgin/pgout routines.
	 *
	 * Linked list of application-specified pgin/pgout routines.
	 */
	DB_MPREG *pg_inout;
	LIST_HEAD(__db_mpregh, __db_mpreg) dbregq;

					/* List of DB_MPOOLFILE's. */
	TAILQ_HEAD(__db_mpoolfileh, __db_mpoolfile) dbmfq;

	/*
	 * The env and reginfo fields are not thread protected, as they are
	 * initialized during mpool creation, and not modified again.
	 */
	ENV	   *env;		/* Enclosing environment. */
	REGINFO	   *reginfo;		/* Underlying cache regions. */
};

/*
 * DB_MPREG --
 *	DB_MPOOL registry of pgin/pgout functions.
 */
struct __db_mpreg {
	LIST_ENTRY(__db_mpreg) q;	/* Linked list. */

	int32_t ftype;			/* File type. */
					/* Pgin, pgout routines. */
	int (*pgin) __P((DB_ENV *, db_pgno_t, void *, DBT *));
	int (*pgout) __P((DB_ENV *, db_pgno_t, void *, DBT *));
};

/*
 * File hashing --
 *	We hash each file to a hash bucket based on its fileid
 *	or, in the case of in-memory files, its name.
 */

/* Number of file hash buckets, a small prime number. */
#define	MPOOL_FILE_BUCKETS	17

#define	FHASH(id, len)	__ham_func5(NULL, id, (u_int32_t)(len))

#define	FNBUCKET(id, len)						\
	(FHASH(id, len) % MPOOL_FILE_BUCKETS)
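
/*
 * For example (an illustrative sketch): a file with a backing store hashes
 * by its file ID, while an in-memory file hashes by name; DB_FILE_ID_LEN
 * is the standard file-ID length used elsewhere in DB:
 *
 *	bucket = FNBUCKET(fileid, DB_FILE_ID_LEN);
 *	bucket = FNBUCKET(name, strlen(name));
 */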

/* Macros to lock/unlock the mpool region as a whole. */
#define	MPOOL_SYSTEM_LOCK(env)						\
	MUTEX_LOCK(env, ((MPOOL *)					\
	    (env)->mp_handle->reginfo[0].primary)->mtx_region)
#define	MPOOL_SYSTEM_UNLOCK(env)					\
	MUTEX_UNLOCK(env, ((MPOOL *)					\
	    (env)->mp_handle->reginfo[0].primary)->mtx_region)

/* Macros to lock/unlock a specific mpool region. */
#define	MPOOL_REGION_LOCK(env, infop)					\
	MUTEX_LOCK(env, ((MPOOL *)(infop)->primary)->mtx_region)
#define	MPOOL_REGION_UNLOCK(env, infop)					\
	MUTEX_UNLOCK(env, ((MPOOL *)(infop)->primary)->mtx_region)
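
/*
 * A minimal usage sketch (the adjust variable is hypothetical): any
 * read-modify-write of a field the region lock protects, such as the
 * lru_count field below, is bracketed by the per-region mutex:
 *
 *	MPOOL *c_mp;
 *
 *	MPOOL_REGION_LOCK(env, infop);
 *	c_mp = infop->primary;
 *	c_mp->lru_count += adjust;
 *	MPOOL_REGION_UNLOCK(env, infop);
 */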

/*
 * MPOOL --
 *	Shared memory pool region.
 */
struct __mpool {
	/*
	 * The memory pool can be broken up into individual pieces/files.
	 * There are two reasons for this: first, on Solaris you can allocate
	 * only a little more than 2GB of memory in a contiguous chunk,
	 * and I expect to see more systems with similar issues.  Second,
	 * applications can add/remove pieces to dynamically resize the
	 * cache.
	 *
	 * While this structure is duplicated in each piece of the cache,
	 * the first of these pieces/files describes the entire pool, and
	 * each subsequent one describes only its own piece of the cache.
	 */
	db_mutex_t	mtx_region;	/* Region mutex. */
	db_mutex_t	mtx_resize;	/* Resizing mutex. */

	/*
	 * The lsn field and list of underlying MPOOLFILEs are thread protected
	 * by the region lock.
	 */
	DB_LSN	  lsn;			/* Maximum checkpoint LSN. */

	/* Configuration information: protected by the region lock. */
	u_int32_t max_nreg;		/* Maximum number of regions. */
	size_t    mp_mmapsize;		/* Maximum file size for mmap. */
	int       mp_maxopenfd;		/* Maximum open file descriptors. */
	int       mp_maxwrite;		/* Maximum buffers to write. */
	db_timeout_t mp_maxwrite_sleep;	/* Sleep after writing max buffers. */

	/*
	 * The number of regions and the total number of hash buckets across
	 * all regions.
	 * These fields are not protected by a mutex because we assume that we
	 * can read a 32-bit value atomically.  They are only modified by cache
	 * resizing which holds the mpool resizing mutex to ensure that
	 * resizing is single-threaded.  See the comment in mp_resize.c for
	 * more information.
	 */
	u_int32_t nreg;			/* Number of underlying REGIONS. */
	u_int32_t nbuckets;		/* Total number of hash buckets. */

	/*
	 * The regids field is protected by the resize mutex.
	 */
	roff_t	  regids;		/* Array of underlying REGION IDs. */

	roff_t	  ftab;			/* Hash table of files. */

	/*
	 * The following fields describe the per-cache portion of the region.
	 *
	 * The htab and htab_buckets fields are not thread protected as they
	 * are initialized during mpool creation, and not modified again.
	 *
	 * The last_checked and lru_count fields are thread protected by
	 * the region lock.
	 */
	roff_t	  htab;			/* Hash table offset. */
	u_int32_t htab_buckets;		/* Number of hash table entries. */
	u_int32_t last_checked;		/* Last bucket checked for free. */
	u_int32_t lru_count;		/* Counter for buffer LRU. */
	int32_t   lru_reset;		/* Hash bucket lru reset point. */

	/*
	 * The stat fields are generally not thread protected, and cannot be
	 * trusted.  Note that st_pages is an exception, and is always updated
	 * inside a region lock (although it is sometimes read outside of the
	 * region lock).
	 */
	DB_MPOOL_STAT stat;		/* Per-cache mpool statistics. */

	/*
	 * We track page puts so that we can decide when allocation is never
	 * going to succeed.  We don't lock the field, all we care about is
	 * if it changes.
	 */
	u_int32_t  put_counter;		/* Count of page put calls. */

	/*
	 * Cache flush operations take a long time...
	 *
	 * Some cache flush operations want to ignore the app's configured
	 * max-write parameters (they are trying to quickly shut down an
	 * environment, for example).  We can't specify that as an argument
	 * to the cache region functions, because we may decide to ignore
	 * the max-write configuration after the cache operation has begun.
	 * If DB_MEMP_SUPPRESS_WRITE is set in config_flags, ignore the
	 * application's max-write configuration.
	 *
	 * We may want to interrupt cache flush operations in high-availability
	 * configurations.
	 */
#define	DB_MEMP_SUPPRESS_WRITE	0x01
#define	DB_MEMP_SYNC_INTERRUPT	0x02
	u_int32_t config_flags;

	/* Free frozen buffer headers, protected by the region lock. */
	SH_TAILQ_HEAD(__free_frozen) free_frozen;

	/* Allocated blocks of frozen buffer headers. */
	SH_TAILQ_HEAD(__alloc_frozen) alloc_frozen;
};

/*
 * NREGION --
 *	Select a cache region given the bucket number.
 */
#define	NREGION(mp, bucket)						\
	((bucket) / (mp)->htab_buckets)
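
/*
 * For example (numbers chosen for illustration): with 37 hash buckets per
 * region, bucket 80 belongs to region 80 / 37 == 2, that is, the third
 * underlying cache region.
 */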

/*
 * MP_HASH --
 *	 We make the assumption that early pages of the file are more likely
 *	 to be retrieved than the later pages, which means the top bits will
 *	 be more interesting for hashing as they're less likely to collide.
 *	 That said, 512 8KB pages represent a 4MB file, so only reasonably
 *	 large files will have page numbers with anything other than the
 *	 bottom 9 bits set.  We XOR in the MPOOL offset of the MPOOLFILE
 *	 that backs the page, since that should also be unique for the
 *	 page.  We don't want to do anything very fancy -- speed is more
 *	 important to us than using good hashing.
 *
 *	 Since moving to a dynamic hash, which boils down to using some of the
 *	 least significant bits of the hash value, we no longer want to use a
 *	 simple shift here, because it's likely with a bit shift that mf_offset
 *	 will be ignored, and pages from different files end up in the same
 *	 hash bucket.  Use a nearby prime instead.
 */
#define	MP_HASH(mf_offset, pgno)					\
	((((pgno) << 8) ^ (pgno)) ^ ((mf_offset) * 509))

/*
 * Inline the calculation of the mask, since we can't reliably store the mask
 * with the number of buckets in the region.
 *
 * This is equivalent to:
 *     mask = (1 << __db_log2(nbuckets)) - 1;
 */
#define	MP_MASK(nbuckets, mask) do {					\
	for (mask = 1; mask < (nbuckets); mask = (mask << 1) | 1)	\
		;							\
} while (0)

#define	MP_HASH_BUCKET(hash, nbuckets, mask, bucket) do {		\
	(bucket) = (hash) & (mask);					\
	if ((bucket) >= (nbuckets))					\
		(bucket) &= ((mask) >> 1);				\
} while (0)

#define	MP_BUCKET(mf_offset, pgno, nbuckets, bucket) do {		\
	u_int32_t __mask;						\
	MP_MASK(nbuckets, __mask);					\
	MP_HASH_BUCKET(MP_HASH(mf_offset, pgno), nbuckets,		\
	    __mask, bucket);						\
} while (0)
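
/*
 * Worked example (values chosen for illustration): with nbuckets == 37,
 * MP_MASK steps the mask through 1, 3, 7, 15, 31, 63 and stops at 63,
 * matching (1 << __db_log2(37)) - 1.  A hash value of 45 gives
 * 45 & 63 == 45, which is >= 37 and so out of range; MP_HASH_BUCKET then
 * folds it with the next smaller mask, 45 & 31 == 13.  The folded value
 * is always in range, because the next smaller mask is strictly less
 * than nbuckets.
 */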

/*
 * MP_GET_REGION --
 *	Select the region for a given page.
 */
#define	MP_GET_REGION(dbmfp, pgno, infopp, ret) do {			\
	DB_MPOOL *__t_dbmp;						\
	MPOOL *__t_mp;							\
									\
	__t_dbmp = (dbmfp)->env->mp_handle;				\
	__t_mp = __t_dbmp->reginfo[0].primary;				\
	if (__t_mp->max_nreg == 1) {					\
		*(infopp) = &__t_dbmp->reginfo[0];			\
		ret = 0;						\
	} else								\
		ret = __memp_get_bucket((dbmfp)->env,			\
		    (dbmfp)->mfp, (pgno), (infopp), NULL);		\
} while (0)

/*
 * MP_GET_BUCKET --
 *	Select and lock the bucket for a given page.
 */
#define	MP_GET_BUCKET(env, mfp, pgno, infopp, hp, ret) do {		\
	DB_MPOOL *__t_dbmp;						\
	MPOOL *__t_mp;							\
	roff_t __t_mf_offset;						\
	u_int32_t __t_bucket;						\
									\
	__t_dbmp = (env)->mp_handle;					\
	__t_mp = __t_dbmp->reginfo[0].primary;				\
	if (__t_mp->max_nreg == 1) {					\
		*(infopp) = &__t_dbmp->reginfo[0];			\
		__t_mf_offset = R_OFFSET(*(infopp), (mfp));		\
		MP_BUCKET(__t_mf_offset,				\
		    (pgno), __t_mp->nbuckets, __t_bucket);		\
		(hp) = R_ADDR(*(infopp), __t_mp->htab);			\
		(hp) = &(hp)[__t_bucket];				\
		MUTEX_LOCK(env, (hp)->mtx_hash);			\
		ret = 0;						\
	} else								\
		ret = __memp_get_bucket((env),				\
		    (mfp), (pgno), (infopp), &(hp));			\
} while (0)
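
/*
 * Illustrative caller pattern (a sketch; bhp and mf_offset are caller
 * variables, not part of this macro): the bucket comes back locked, so
 * the caller searches its chain and then drops the mutex:
 *
 *	MP_GET_BUCKET(env, mfp, pgno, &infop, hp, ret);
 *	if (ret != 0)
 *		return (ret);
 *	SH_TAILQ_FOREACH(bhp, &hp->hash_bucket, hq, __bh)
 *		if (bhp->pgno == pgno && bhp->mf_offset == mf_offset)
 *			break;
 *	MUTEX_UNLOCK(env, hp->mtx_hash);
 */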

struct __db_mpool_hash {
	db_mutex_t	mtx_hash;	/* Per-bucket mutex. */
	db_mutex_t	mtx_io;		/* Buffer I/O mutex. */

	DB_HASHTAB	hash_bucket;	/* Head of bucket. */

	u_int32_t	hash_page_dirty;/* Count of dirty pages. */

#ifndef __TEST_DB_NO_STATISTICS
	u_int32_t	hash_io_wait;	/* Count of I/O waits. */
	u_int32_t	hash_frozen;	/* Count of frozen buffers. */
	u_int32_t	hash_thawed;	/* Count of thawed buffers. */
	u_int32_t	hash_frozen_freed;/* Count of freed frozen buffers. */
#endif

	DB_LSN		old_reader;	/* Oldest snapshot reader (cached). */

#define	IO_WAITER	0x001		/* Thread is waiting on page. */
	u_int32_t	flags;
};

/*
 * The base mpool priority is 1/4 of the name space, or just under 2^30.
 * When the LRU counter wraps, we shift everybody down to a base-relative
 * value.
 */
#define	MPOOL_BASE_DECREMENT	(UINT32_MAX - (UINT32_MAX / 4))

/*
 * Mpool priorities from low to high.  Defined in terms of fractions of the
 * buffers in the pool.
 */
#define	MPOOL_PRI_VERY_LOW	-1	/* Dead duck.  Check and set to 0. */
#define	MPOOL_PRI_LOW		-2	/* Low. */
#define	MPOOL_PRI_DEFAULT	0	/* No adjustment -- special case. */
#define	MPOOL_PRI_HIGH		10	/* With the dirty buffers. */
#define	MPOOL_PRI_DIRTY		10	/* Dirty gets a 10% boost. */
#define	MPOOL_PRI_VERY_HIGH	1	/* Add number of buffers in pool. */

/*
 * MPOOLFILE --
 *	Shared DB_MPOOLFILE information.
 */
struct __mpoolfile {
	db_mutex_t mutex;		/* MPOOLFILE mutex. */

	/* Protected by MPOOLFILE mutex. */
	u_int32_t mpf_cnt;		/* Ref count: DB_MPOOLFILEs. */
	u_int32_t block_cnt;		/* Ref count: blocks in cache. */
	db_pgno_t last_pgno;		/* Last page in the file. */
	db_pgno_t last_flushed_pgno;	/* Last page flushed to disk. */
	db_pgno_t orig_last_pgno;	/* Original last page in the file. */
	db_pgno_t maxpgno;		/* Maximum page number. */

	roff_t	  path_off;		/* File name location. */

	/* Protected by hash bucket mutex. */
	SH_TAILQ_ENTRY q;		/* List of MPOOLFILEs. */

	/*
	 * The following are used for file compaction processing.
	 * They are only used when a thread is in the process
	 * of trying to move free pages to the end of the file.
	 * Other threads may look here when freeing a page.
	 * Protected by a lock on the metapage.
	 */
	u_int32_t free_ref;		/* Refcount to freelist. */
	u_int32_t free_cnt;		/* Count of free pages. */
	size_t	  free_size;		/* Allocated size of free list. */
	roff_t	  free_list;		/* Offset to free list. */

	/*
	 * We normally don't lock the deadfile field when we read it since we
	 * only care if the field is zero or non-zero.  We do lock on read when
	 * searching for a matching MPOOLFILE -- see that code for more detail.
	 */
	int32_t	  deadfile;		/* Dirty pages can be discarded. */

	u_int32_t bucket;		/* Hash bucket for this file. */

	/*
	 * None of the following fields are thread protected.
	 *
	 * There are potential races with the ftype field because it's read
	 * without holding a lock.  However, it has to be set before adding
	 * any buffers to the cache that depend on it being set, so there
	 * would need to be incorrect operation ordering to have a problem.
	 */
	int32_t	  ftype;		/* File type. */

	/*
	 * There are potential races with the priority field because it's read
	 * without holding a lock.  However, a collision is unlikely and if it
	 * happens is of little consequence.
	 */
	int32_t   priority;		/* Priority when unpinning buffer. */

	/*
	 * There are potential races with the file_written field (many threads
	 * may be writing blocks at the same time), and with no_backing_file
	 * and unlink_on_close fields, as they may be set while other threads
	 * are reading them.  However, we only care if the field value is zero
	 * or non-zero, so don't lock the memory.
	 *
	 * !!!
	 * Theoretically, a 64-bit architecture could put two of these fields
	 * in a single memory operation and we could race.  I have never seen
	 * an architecture where that's a problem, and I believe Java requires
	 * that to never be the case.
	 *
	 * File_written is set whenever a buffer is marked dirty in the cache.
	 * It can be cleared in some cases, after all dirty buffers have been
	 * written AND the file has been flushed to disk.
	 */
	int32_t	  file_written;		/* File was written. */
	int32_t	  no_backing_file;	/* Never open a backing file. */
	int32_t	  unlink_on_close;	/* Unlink file on last close. */
	int32_t	  multiversion;		/* Number of DB_MULTIVERSION handles. */

	/*
	 * We do not protect the statistics in "stat" because of the cost of
	 * the mutex in the get/put routines.  There is a chance that a count
	 * will get lost.
	 */
	DB_MPOOL_FSTAT stat;		/* Per-file mpool statistics. */

	/*
	 * The remaining fields are initialized at open and never subsequently
	 * modified.
	 */
	int32_t	  lsn_off;		/* Page's LSN offset. */
	u_int32_t clear_len;		/* Bytes to clear on page create. */

	roff_t	  fileid_off;		/* File ID string location. */

	roff_t	  pgcookie_len;		/* Pgin/pgout cookie length. */
	roff_t	  pgcookie_off;		/* Pgin/pgout cookie location. */

	/*
	 * The flags are initialized at open and never subsequently modified.
	 */
#define	MP_CAN_MMAP		0x001	/* If the file can be mmap'd. */
#define	MP_DIRECT		0x002	/* No OS buffering. */
#define	MP_DURABLE_UNKNOWN	0x004	/* We don't care about durability. */
#define	MP_EXTENT		0x008	/* Extent file. */
#define	MP_FAKE_DEADFILE	0x010	/* Deadfile field: fake flag. */
#define	MP_FAKE_FILEWRITTEN	0x020	/* File_written field: fake flag. */
#define	MP_FAKE_NB		0x040	/* No_backing_file field: fake flag. */
#define	MP_FAKE_UOC		0x080	/* Unlink_on_close field: fake flag. */
#define	MP_NOT_DURABLE		0x100	/* File is not durable. */
#define	MP_TEMP			0x200	/* Backing file is a temporary. */
	u_int32_t  flags;
};

/*
 * Flags to __memp_bh_free.
 */
#define	BH_FREE_FREEMEM		0x01
#define	BH_FREE_REUSE		0x02
#define	BH_FREE_UNLOCKED	0x04

/*
 * BH --
 *	Buffer header.
 */
struct __bh {
	u_int16_t	ref;		/* Reference count. */
	u_int16_t	ref_sync;	/* Sync wait-for reference count. */

#define	BH_CALLPGIN	0x001		/* Convert the page before use. */
#define	BH_DIRTY	0x002		/* Page is modified. */
#define	BH_DIRTY_CREATE	0x004		/* Page was created dirty. */
#define	BH_DISCARD	0x008		/* Page is useless. */
#define	BH_FREED	0x010		/* Page was freed. */
#define	BH_FROZEN	0x020		/* Frozen buffer: allocate & re-read. */
#define	BH_LOCKED	0x040		/* Page is locked (I/O in progress). */
#define	BH_TRASH	0x080		/* Page is garbage. */
#define	BH_THAWED	0x100		/* Page was thawed. */
	u_int16_t	flags;

	u_int32_t	priority;	/* Priority. */
	SH_TAILQ_ENTRY	hq;		/* MPOOL hash bucket queue. */

	db_pgno_t	pgno;		/* Underlying MPOOLFILE page number. */
	roff_t		mf_offset;	/* Associated MPOOLFILE offset. */

	roff_t		td_off;		/* MVCC: creating TXN_DETAIL offset. */
	SH_CHAIN_ENTRY	vc;		/* MVCC: version chain. */
#ifdef DIAG_MVCC
	u_int16_t	align_off;	/* Alignment offset for diagnostics. */
#endif

	/*
	 * !!!
	 * This array must be at least size_t aligned -- the DB access methods
	 * put PAGE and other structures into it, and then access them directly.
	 * (We guarantee size_t alignment to applications in the documentation,
	 * too.)
	 */
	u_int8_t   buf[1];		/* Variable length data. */
};

/*
 * BH_FROZEN_PAGE --
 *	Data used to find a frozen buffer header.
 */
struct __bh_frozen_p {
	BH header;
	db_pgno_t	spgno;		/* Page number in freezer file. */
};

/*
 * BH_FROZEN_ALLOC --
 *	Frozen buffer headers are allocated a page at a time in general.  This
 *	structure is allocated at the beginning of the page so that the
 *	allocation chunks can be tracked and freed (for private environments).
 */
struct __bh_frozen_a {
	SH_TAILQ_ENTRY links;
};

#define	MULTIVERSION(dbp)	((dbp)->mpf->mfp->multiversion)
#define	IS_DIRTY(p)							\
    F_ISSET((BH *)((u_int8_t *)(p) - SSZA(BH, buf)), BH_DIRTY)
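
/*
 * IS_DIRTY works because the page image handed to callers is the buf
 * array at the end of the BH: subtracting the offset of buf recovers the
 * enclosing header.  For example (a sketch; h is any page pointer the
 * cache returned), IS_DIRTY(h) expands to a BH_DIRTY flag test on
 *
 *	(BH *)((u_int8_t *)h - SSZA(BH, buf))
 *
 * which is the buffer header immediately preceding the page in memory.
 */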

#define	BH_OWNER(env, bhp)						\
    ((TXN_DETAIL *)R_ADDR(&env->tx_handle->reginfo, bhp->td_off))

#define	BH_OWNED_BY(env, bhp, txn)	((txn) != NULL &&		\
    (bhp)->td_off != INVALID_ROFF &&					\
    (txn)->td == BH_OWNER(env, bhp))

#define	BH_PRIORITY(bhp)						\
    (SH_CHAIN_SINGLETON(bhp, vc) ? (bhp)->priority :			\
     __memp_bh_priority(bhp))

#define	VISIBLE_LSN(env, bhp)						\
    (&BH_OWNER(env, bhp)->visible_lsn)

/*
 * Make a copy of the buffer's visible LSN, one field at a time.  We rely
 * on the 32-bit operations being atomic.  The visible_lsn starts at MAX_LSN
 * and is set during commit or abort to the current LSN.
 *
 * If we race with a commit / abort, we may see either the file or the offset
 * still at UINT32_MAX, so vlsn is guaranteed to be in the future.  That's OK,
 * since we had to take the log region lock to allocate the read LSN so we were
 * never going to see this buffer anyway.
 */
#define	BH_VISIBLE(env, bhp, read_lsnp, vlsn)				\
    (bhp->td_off == INVALID_ROFF ||					\
    ((vlsn).file = VISIBLE_LSN(env, bhp)->file,				\
    (vlsn).offset = VISIBLE_LSN(env, bhp)->offset,			\
    LOG_COMPARE((read_lsnp), &(vlsn)) >= 0))
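
/*
 * For example (LSN values chosen for illustration): a snapshot reader
 * with read LSN [1][500] sees a version whose creator committed with
 * visible_lsn [1][400], since LOG_COMPARE returns >= 0 there; a version
 * whose creator is still active has visible_lsn at MAX_LSN, which
 * compares in the future and so remains invisible to that reader.
 */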

#define	BH_OBSOLETE(bhp, old_lsn, vlsn)	(SH_CHAIN_HASNEXT(bhp, vc) ?	\
    BH_VISIBLE(env, SH_CHAIN_NEXTP(bhp, vc, __bh), &(old_lsn), vlsn) :	\
    BH_VISIBLE(env, bhp, &(old_lsn), vlsn))

#define	MVCC_SKIP_CURADJ(dbc, pgno)					\
    (dbc->txn != NULL && F_ISSET(dbc->txn, TXN_SNAPSHOT) &&		\
    dbc->txn->td != NULL && __memp_skip_curadj(dbc, pgno))

#if defined(DIAG_MVCC) && defined(HAVE_MPROTECT)
#define	VM_PAGESIZE 4096
#define	MVCC_BHSIZE(mfp, sz) do {					\
	sz += VM_PAGESIZE + sizeof(BH);					\
	if (mfp->stat.st_pagesize < VM_PAGESIZE)			\
		sz += VM_PAGESIZE - mfp->stat.st_pagesize;		\
} while (0)

#define	MVCC_BHALIGN(mfp, p) do {					\
	if (mfp != NULL) {						\
		BH *__bhp;						\
		void *__orig = (p);					\
		p = ALIGNP_INC(p, VM_PAGESIZE);				\
		if ((u_int8_t *)p < (u_int8_t *)__orig + sizeof(BH))	\
			p = (u_int8_t *)p + VM_PAGESIZE;		\
		__bhp = (BH *)((u_int8_t *)p - SSZA(BH, buf));		\
		DB_ASSERT(env,						\
		    ((uintptr_t)__bhp->buf & (VM_PAGESIZE - 1)) == 0);	\
		DB_ASSERT(env,						\
		    (u_int8_t *)__bhp >= (u_int8_t *)__orig);		\
		DB_ASSERT(env, (u_int8_t *)p + mfp->stat.st_pagesize <	\
		    (u_int8_t *)__orig + len);				\
		__bhp->align_off =					\
		    (u_int16_t)((u_int8_t *)__bhp - (u_int8_t *)__orig);\
		p = __bhp;						\
	}								\
} while (0)

#define	MVCC_BHUNALIGN(mfp, p) do {					\
	if ((mfp) != NULL) {						\
		BH *bhp = (BH *)(p);					\
		(p) = ((u_int8_t *)bhp - bhp->align_off);		\
	}								\
} while (0)

#ifdef linux
#define	MVCC_MPROTECT(buf, sz, mode) do {				\
	int __ret = mprotect((buf), (sz), (mode));			\
	DB_ASSERT(env, __ret == 0);					\
} while (0)
#else
#define	MVCC_MPROTECT(buf, sz, mode) do {				\
	if (!F_ISSET(env, ENV_PRIVATE | ENV_SYSTEM_MEM)) {		\
		int __ret = mprotect((buf), (sz), (mode));		\
		DB_ASSERT(env, __ret == 0);				\
	}								\
} while (0)
#endif /* linux */

#else /* defined(DIAG_MVCC) && defined(HAVE_MPROTECT) */
#define	MVCC_BHSIZE(mfp, sz) do {} while (0)
#define	MVCC_BHALIGN(mfp, p) do {} while (0)
#define	MVCC_BHUNALIGN(mfp, p) do {} while (0)
#define	MVCC_MPROTECT(buf, size, mode) do {} while (0)
#endif

/*
 * Flags to __memp_ftruncate.
 */
#define	MP_TRUNC_RECOVER	0x01

#if defined(__cplusplus)
}
#endif

#include "dbinc_auto/mp_ext.h"
#endif /* !_DB_MP_H_ */