1/*-
2 * See the file LICENSE for redistribution information.
3 *
4 * Copyright (c) 1996-2009 Oracle.  All rights reserved.
5 *
6 * $Id$
7 */
8
9#ifndef _DB_INT_H_
10#define	_DB_INT_H_
11
12/*******************************************************
13 * Berkeley DB ANSI/POSIX include files.
14 *******************************************************/
15#ifdef HAVE_SYSTEM_INCLUDE_FILES
16#include <sys/types.h>
17#ifdef DIAG_MVCC
18#include <sys/mman.h>
19#endif
20#include <sys/stat.h>
21
22#if defined(__INCLUDE_SELECT_H)
23#ifdef HAVE_SYS_SELECT_H
24#include <sys/select.h>
25#endif
26#ifdef HAVE_VXWORKS
27#include <selectLib.h>
28#endif
29#endif
30
31#if TIME_WITH_SYS_TIME
32#include <sys/time.h>
33#include <time.h>
34#else
35#if HAVE_SYS_TIME_H
36#include <sys/time.h>
37#else
38#include <time.h>
39#endif
40#endif
41
42#ifdef HAVE_VXWORKS
43#include <net/uio.h>
44#else
45#include <sys/uio.h>
46#endif
47
48#if defined(__INCLUDE_NETWORKING)
49#ifdef HAVE_SYS_SOCKET_H
50#include <sys/socket.h>
51#endif
52#include <netinet/in.h>
53#include <netdb.h>
54#include <arpa/inet.h>
55#endif
56
57#if defined(STDC_HEADERS) || defined(__cplusplus)
58#include <stdarg.h>
59#else
60#include <varargs.h>
61#endif
62
63#include <ctype.h>
64#include <errno.h>
65#include <fcntl.h>
66#include <limits.h>
67#include <signal.h>
68#include <stddef.h>
69#include <stdio.h>
70#include <stdlib.h>
71#include <string.h>
72#include <unistd.h>
73
74#if defined(__INCLUDE_DIRECTORY)
75#if HAVE_DIRENT_H
76# include <dirent.h>
77# define NAMLEN(dirent) strlen((dirent)->d_name)
78#else
79# define dirent direct
80# define NAMLEN(dirent) (dirent)->d_namlen
81# if HAVE_SYS_NDIR_H
82#  include <sys/ndir.h>
83# endif
84# if HAVE_SYS_DIR_H
85#  include <sys/dir.h>
86# endif
87# if HAVE_NDIR_H
88#  include <ndir.h>
89# endif
90#endif
91#endif /* __INCLUDE_DIRECTORY */
92
93#endif /* !HAVE_SYSTEM_INCLUDE_FILES */
94
95#ifdef DB_WIN32
96#include "dbinc/win_db.h"
97#endif
98
99#include "db.h"
100#include "clib_port.h"
101
102#include "dbinc/queue.h"
103#include "dbinc/shqueue.h"
104
105#if defined(__cplusplus)
106extern "C" {
107#endif
108
109/*******************************************************
110 * Forward structure declarations.
111 *******************************************************/
/*
 * Each line declares an incomplete structure type together with its typedef
 * name, so the rest of this header can refer to these types through pointers
 * without seeing the full structure definitions.
 */
struct __db_reginfo_t;	typedef struct __db_reginfo_t REGINFO;
struct __db_txnhead;	typedef struct __db_txnhead DB_TXNHEAD;
struct __db_txnlist;	typedef struct __db_txnlist DB_TXNLIST;
struct __vrfy_childinfo;typedef struct __vrfy_childinfo VRFY_CHILDINFO;
struct __vrfy_dbinfo;   typedef struct __vrfy_dbinfo VRFY_DBINFO;
struct __vrfy_pageinfo; typedef struct __vrfy_pageinfo VRFY_PAGEINFO;

/*
 * Queue-head type built with SH_TAILQ_HEAD (presumably supplied by
 * dbinc/shqueue.h, included above -- confirm).  ENV->thr_hashtab below is a
 * pointer to this type.
 */
typedef SH_TAILQ_HEAD(__hash_head) DB_HASHTAB;
120
121/*******************************************************
122 * General purpose constants and macros.
123 *******************************************************/
/*
 * Boolean constants.  Any FALSE/TRUE inherited from an earlier header is
 * dropped first so the library always sees 0 and !0.
 */
#undef	FALSE
#define	FALSE		0
#undef	TRUE
#define	TRUE		(!FALSE)

/* Byte counts: 2^20 and 2^30. */
#define	MEGABYTE	1048576
#define	GIGABYTE	1073741824

/* Conversion factors between time units. */
#define	NS_PER_MS	1000000		/* Nanoseconds in a millisecond */
#define	NS_PER_US	1000		/* Nanoseconds in a microsecond */
#define	NS_PER_SEC	1000000000	/* Nanoseconds in a second */
#define	US_PER_MS	1000		/* Microseconds in a millisecond */
#define	US_PER_SEC	1000000		/* Microseconds in a second */
#define	MS_PER_SEC	1000		/* Milliseconds in a second */

#define	RECNO_OOB	0		/* Illegal record number. */
140
/*
 * POWER_OF_TWO --
 *	True when at most one bit of x is set.  Zero passes the test as well,
 *	which is harmless for the callers here.  x is evaluated twice.
 */
#define	POWER_OF_TWO(x)	(0 == ((x) & ((x) - 1)))

/*
 * Page size limits, and the check applied to a candidate page size: it must
 * be a power of two within [DB_MIN_PGSIZE, DB_MAX_PGSIZE].
 */
#define	DB_MIN_PGSIZE	0x000200	/* 512 bytes. */
#define	DB_MAX_PGSIZE	0x010000	/* 65536 bytes. */
#define	IS_VALID_PAGESIZE(x)						\
	(POWER_OF_TWO(x) && DB_MIN_PGSIZE <= (x) && DB_MAX_PGSIZE >= (x))

/* Default lower bound on the number of pages kept in the cache. */
#define	DB_MINPAGECACHE	16
152
/*
 * Fallback I/O size, used when the filesystem block size cannot be
 * determined.  8K is chosen because most systems use a VM page size
 * smaller than that.
 */
#define	DB_DEF_IOSIZE	(8 * 1024)

/*
 * DB_ALIGN --
 *	Round the integer v up to the next multiple of bound.  The mask
 *	arithmetic is only meaningful when bound is a power of two.
 */
#undef	DB_ALIGN
#define	DB_ALIGN(v, bound)						\
	(((v) + (bound) - 1) & ~((uintmax_t)(bound) - 1))

/*
 * ALIGNP_INC --
 *	Pointer flavor of DB_ALIGN: advance p to the next bound-aligned
 *	address and hand it back as a void *.
 */
#undef	ALIGNP_INC
#define	ALIGNP_INC(p, bound)						\
	(void *)(((uintptr_t)(p) + (bound) - 1) & ~((uintptr_t)(bound) - 1))
168
/*
 * Print an address as a u_long (a u_long is the largest type we can print
 * portably).  Most 64-bit systems have made longs 64-bits, so this should
 * work.
 */
#define	P_TO_ULONG(p)	((u_long)(uintptr_t)(p))

/*
 * Convert a pointer to a small integral value.
 *
 * The double cast avoids warnings: the inner (uintptr_t) cast converts the
 * pointer to an integral type, and the outer (u_int32_t) or (u_int16_t) cast
 * narrows it so we don't get complaints when we assign the final result to
 * an integral type smaller than uintptr_t.  Any address bits that do not fit
 * in the narrower type are discarded.
 */
#define	P_TO_UINT32(p)	((u_int32_t)(uintptr_t)(p))
#define	P_TO_UINT16(p)	((u_int16_t)(uintptr_t)(p))
186
/*
 * There are several on-page structures that are declared to have a number of
 * fields followed by a variable length array of items.  The structure size
 * without including the variable length array or the address of the first of
 * those elements can be found using SSZ.
 *
 * This macro can also be used to find the offset of a structure element in a
 * structure.  This is used in various places to copy structure elements from
 * unaligned memory references, e.g., pointers into a packed page.
 *
 * There are two versions because compilers object if you take the address of
 * an array.
 *
 * Both forms take the address of the field in an object placed at address
 * zero and convert it with P_TO_UINT16, so the result is a u_int16_t.
 */
#undef	SSZ
#define	SSZ(name, field)  P_TO_UINT16(&(((name *)0)->field))

/* As SSZ, for array fields: the offset of the array's first element. */
#undef	SSZA
#define	SSZA(name, field) P_TO_UINT16(&(((name *)0)->field[0]))
205
/*
 * Structure used to print flag values: pairs one flag mask with the name
 * shown for it.
 */
typedef struct __fn {
	u_int32_t mask;			/* Flag value. */
	const char *name;		/* Flag name. */
} FN;
211
/*
 * Set, clear and test flags.
 *
 *	FLD_*	operate on the flag word given as the first argument.
 *	F_*	operate on the "flags" member of the structure p points to.
 *	LF_*	operate on a variable named "flags" that must be in scope
 *		where the macro is used.
 *
 * Every expansion is wrapped in parentheses so each macro is a single
 * expression no matter what surrounds it.  Without them, something like
 * "1 + FLD_SET(w, f)" would parse as "(1 + (w)) |= (f)".
 */
#define	FLD_CLR(fld, f)		((fld) &= ~(f))
#define	FLD_ISSET(fld, f)	((fld) & (f))
#define	FLD_SET(fld, f)		((fld) |= (f))
#define	F_CLR(p, f)		((p)->flags &= ~(f))
#define	F_ISSET(p, f)		((p)->flags & (f))
#define	F_SET(p, f)		((p)->flags |= (f))
#define	LF_CLR(f)		((flags) &= ~(f))
#define	LF_ISSET(f)		((flags) & (f))
#define	LF_SET(f)		((flags) |= (f))
222
/*
 * Calculate a percentage.  The values can overflow 32-bit integer arithmetic
 * so we use floating point.
 *
 * When calculating a bytes-vs-page size percentage, we're getting the inverse
 * of the percentage in all cases, that is, we want 100 minus the percentage we
 * calculate.
 *
 * DB_PCT(v, total)
 *	v as a percentage of total, truncated to int; 0 when total is 0.
 * DB_PCT_PG(v, total, pgsize)
 *	100 minus v as a percentage of (total * pgsize), truncated to int;
 *	0 when total is 0.
 *
 * Every argument is parenthesized before it is cast or multiplied, so an
 * expression argument such as "a / b" is evaluated as a whole first.
 */
#define	DB_PCT(v, total)						\
	((int)((total) == 0 ? 0 : ((double)(v) * 100) / (total)))
#define	DB_PCT_PG(v, total, pgsize)					\
	((int)((total) == 0 ? 0 :					\
	    100 - ((double)(v) * 100) / ((double)(total) * (pgsize))))
236
/*
 * Statistics update shared memory and so are expensive -- don't update the
 * values unless we're going to display the results.
 *
 * STAT(x) expands to x when HAVE_STATISTICS is defined and to nothing at all
 * otherwise, so the wrapped code disappears from builds without statistics.
 */
#undef	STAT
#ifdef	HAVE_STATISTICS
#define	STAT(x)	x
#else
#define	STAT(x)
#endif
247
/*
 * Structure used for callback message aggregation.
 *
 * Display values in XXX_stat_print calls.
 */
typedef struct __db_msgbuf {
	char *buf;			/* Heap allocated buffer. */
	char *cur;			/* Current end of message. */
	size_t len;			/* Allocated length of buffer. */
} DB_MSGBUF;

/* Put a DB_MSGBUF into its empty state: no buffer, zero length. */
#define	DB_MSGBUF_INIT(a) do {						\
	(a)->buf = (a)->cur = NULL;					\
	(a)->len = 0;							\
} while (0)

/*
 * If a buffer is attached: emit its contents with __db_msg (only when
 * something has been added, i.e. cur has moved past buf), release it with
 * __os_free, and reset the DB_MSGBUF to empty.  Does nothing when no buffer
 * is attached.
 */
#define	DB_MSGBUF_FLUSH(env, a) do {					\
	if ((a)->buf != NULL) {						\
		if ((a)->cur != (a)->buf)				\
			__db_msg(env, "%s", (a)->buf);		\
		__os_free(env, (a)->buf);				\
		DB_MSGBUF_INIT(a);					\
	}								\
} while (0)
/*
 * Statistic display helpers.  Each prints one value followed by a tab and
 * the descriptive text msg.
 *
 * None of these take the environment as an argument: they all reference a
 * variable named "env", which must be in scope wherever they are used.
 */

/* Print (type)(v) using the caller-supplied format fmt, then "\t" msg. */
#define	STAT_FMT(msg, fmt, type, v) do {				\
	DB_MSGBUF __mb;							\
	DB_MSGBUF_INIT(&__mb);						\
	__db_msgadd(env, &__mb, fmt, (type)(v));			\
	__db_msgadd(env, &__mb, "\t%s", msg);				\
	DB_MSGBUF_FLUSH(env, &__mb);					\
} while (0)
/* Print v in hexadecimal with a 0x prefix. */
#define	STAT_HEX(msg, v)						\
	__db_msg(env, "%#lx\t%s", (u_long)(v), msg)
/* Print " Set" when p is non-NULL, "!Set" when it is NULL. */
#define	STAT_ISSET(msg, p)						\
	__db_msg(env, "%sSet\t%s", (p) == NULL ? "!" : " ", msg)
/* Print v as a signed long. */
#define	STAT_LONG(msg, v)						\
	__db_msg(env, "%ld\t%s", (long)(v), msg)
/* Print the LSN lsnp points to as file/offset. */
#define	STAT_LSN(msg, lsnp)						\
	__db_msg(env, "%lu/%lu\t%s",					\
	    (u_long)(lsnp)->file, (u_long)(lsnp)->offset, msg)
/* Print the pointer v as a hexadecimal address. */
#define	STAT_POINTER(msg, v)						\
	__db_msg(env, "%#lx\t%s", P_TO_ULONG(v), msg)
/* Print the string p, or "!Set" when p is NULL; p is evaluated once. */
#define	STAT_STRING(msg, p) do {					\
	const char *__p = p;	/* p may be a function call. */		\
	__db_msg(env, "%s\t%s", __p == NULL ? "!Set" : __p, msg);	\
} while (0)
/* Print v as an unsigned long. */
#define	STAT_ULONG(msg, v)						\
	__db_msg(env, "%lu\t%s", (u_long)(v), msg)
294
/*
 * DBTs are initialized all over Berkeley DB from strings and other pointer
 * types, usually with a size_t length.  These two macros hide the casts.
 *
 * DB_SET_DBT --
 *	Store the data pointer and size; every other field of the DBT is
 *	left as it was.
 * DB_INIT_DBT --
 *	Zero the whole DBT first, then store the data pointer and size.
 *
 * dbt is the DBT object itself, not a pointer to it.
 */
#define	DB_SET_DBT(dbt, d, s)  do {					\
	(dbt).data = (void *)(d);					\
	(dbt).size = (u_int32_t)(s);					\
} while (0)
#define	DB_INIT_DBT(dbt, d, s)  do {					\
	memset(&(dbt), 0, sizeof(dbt));					\
	(dbt).data = (void *)(d);					\
	(dbt).size = (u_int32_t)(s);					\
} while (0)
309
310/*******************************************************
311 * API return values
312 *******************************************************/
/*
 * Per-call "is this return value acceptable?" predicates.  For most calls
 * only 0 is acceptable, but some calls have additional non-error returns;
 * DB->get, for example, may return DB_NOTFOUND without that being a failure.
 *
 * Every predicate starts from DB_RETOK_STD and adds the extra values that
 * are acceptable for the call in question.
 */
#define	DB_RETOK_STD(ret)	((ret) == 0)
#define	DB_RETOK_DBCDEL(ret)						\
	(DB_RETOK_STD(ret) ||						\
	    (ret) == DB_KEYEMPTY || (ret) == DB_NOTFOUND)
#define	DB_RETOK_DBCGET(ret)						\
	(DB_RETOK_STD(ret) ||						\
	    (ret) == DB_KEYEMPTY || (ret) == DB_NOTFOUND)
#define	DB_RETOK_DBCPUT(ret)						\
	(DB_RETOK_STD(ret) ||						\
	    (ret) == DB_KEYEXIST || (ret) == DB_NOTFOUND)
#define	DB_RETOK_DBDEL(ret)	DB_RETOK_DBCDEL(ret)
#define	DB_RETOK_DBGET(ret)	DB_RETOK_DBCGET(ret)
#define	DB_RETOK_DBPUT(ret)	(DB_RETOK_STD(ret) || (ret) == DB_KEYEXIST)
#define	DB_RETOK_EXISTS(ret)	DB_RETOK_DBCGET(ret)
#define	DB_RETOK_LGGET(ret)	(DB_RETOK_STD(ret) || (ret) == DB_NOTFOUND)
#define	DB_RETOK_MPGET(ret)						\
	(DB_RETOK_STD(ret) || (ret) == DB_PAGE_NOTFOUND)
#define	DB_RETOK_REPPMSG(ret)						\
	(DB_RETOK_STD(ret) ||						\
	    (ret) == DB_REP_IGNORE ||					\
	    (ret) == DB_REP_ISPERM ||					\
	    (ret) == DB_REP_NEWMASTER ||				\
	    (ret) == DB_REP_NEWSITE ||					\
	    (ret) == DB_REP_NOTPERM)
#define	DB_RETOK_REPMGR_START(ret)					\
	(DB_RETOK_STD(ret) || (ret) == DB_REP_IGNORE)
338
/*
 * DB_OPNOTSUP --
 *	The errno value returned for unsupported operations: EOPNOTSUPP when
 *	the platform defines it, otherwise ENOTSUP, otherwise EINVAL.
 */
#if defined(EOPNOTSUPP)
#define	DB_OPNOTSUP	EOPNOTSUPP
#elif defined(ENOTSUP)
#define	DB_OPNOTSUP	ENOTSUP
#else
#define	DB_OPNOTSUP	EINVAL
#endif
349
350/*******************************************************
351 * Files.
352 *******************************************************/
/*
 * We use 1024 as the maximum path length.  It's too hard to figure out what
 * the real path length is, as it was traditionally stored in <sys/param.h>,
 * and that file isn't always available.
 */
#define	DB_MAXPATHLEN	1024

#define	PATH_DOT	"."	/* Current working directory. */
				/* Path separator character(s). */
/*
 * NOTE(review): "@PATH_SEPARATOR@" looks like a placeholder that is replaced
 * when this template is turned into the real header -- confirm against the
 * build scripts.
 */
#define	PATH_SEPARATOR	"@PATH_SEPARATOR@"
363
364/*******************************************************
365 * Environment.
366 *******************************************************/
/*
 * Type passed to __db_appname().  The enumerators are numbered from 0
 * (DB_APP_NONE) upward in the order listed.
 */
typedef enum {
	DB_APP_NONE=0,			/* No type (region). */
	DB_APP_DATA,			/* Data file. */
	DB_APP_LOG,			/* Log file. */
	DB_APP_TMP,			/* Temporary file. */
	DB_APP_RECOVER			/* We are in recovery. */
} APPNAME;
375
/*
 * A set of macros to check if various functionality has been configured.
 *
 * ALIVE_ON	The is_alive function is configured.
 * CDB_LOCKING	CDB product locking.
 * CRYPTO_ON	Security has been configured.
 * LOCKING_ON	Locking has been configured.
 * LOGGING_ON	Logging has been configured.
 * MPOOL_ON	Memory pool has been configured.
 * MUTEX_ON	Mutexes have been configured.
 * REP_ON	Replication has been configured.
 * RPC_ON	RPC has been configured.
 * TXN_ON	Transactions have been configured.
 *
 * All of these take the ENV handle except RPC_ON, whose argument is the
 * handle that carries the cl_handle field (named dbenv here).  ALIVE_ON
 * reaches the is_alive callback through env->dbenv.
 *
 * REP_ON is more complex than most: if the BDB library was compiled without
 * replication support, ENV->rep_handle will be NULL; if the BDB library has
 * replication support, but it was not configured, the region reference will
 * be NULL.
 */
#define	ALIVE_ON(env)		((env)->dbenv->is_alive != NULL)
#define	CDB_LOCKING(env)	F_ISSET(env, ENV_CDB)
#define	CRYPTO_ON(env)		((env)->crypto_handle != NULL)
#define	LOCKING_ON(env)		((env)->lk_handle != NULL)
#define	LOGGING_ON(env)		((env)->lg_handle != NULL)
#define	MPOOL_ON(env)		((env)->mp_handle != NULL)
#define	MUTEX_ON(env)		((env)->mutex_handle != NULL)
#define	REP_ON(env)							\
	((env)->rep_handle != NULL && (env)->rep_handle->region != NULL)
#define	RPC_ON(dbenv)		((dbenv)->cl_handle != NULL)
#define	TXN_ON(env)		((env)->tx_handle != NULL)

/*
 * STD_LOCKING	Standard locking, that is, locking was configured and CDB
 *		was not.  We do not do locking in off-page duplicate trees,
 *		so we check for that in the cursor first.
 *
 * The argument is a cursor: the macro tests its DBC_OPD flag and then the
 * environment reached through dbc->env.
 */
#define	STD_LOCKING(dbc)						\
	(!F_ISSET(dbc, DBC_OPD) &&					\
	    !CDB_LOCKING((dbc)->env) && LOCKING_ON((dbc)->env))

/*
 * IS_RECOVERING: The system is running recovery.
 *
 * True only when logging is configured and the log handle has the
 * DBLOG_RECOVER flag set; the LOGGING_ON test comes first so lg_handle is
 * never dereferenced when it is NULL.
 */
#define	IS_RECOVERING(env)						\
	(LOGGING_ON(env) && F_ISSET((env)->lg_handle, DBLOG_RECOVER))
421
/*
 * Initialization methods are often illegal before/after open is called.
 *
 * These macros, and the ENV_REQUIRES_CONFIG family below, expand to a bare
 * "if (...) return (...);" statement with its own terminating semicolon.
 * They therefore:
 *	- return from the ENCLOSING function when the check fails, so they
 *	  may only be used in functions that return an int error code;
 *	- must not be used as the unbraced body of an if that has an else,
 *	  because the else would bind to the macro's hidden if.
 */
#define	ENV_ILLEGAL_AFTER_OPEN(env, name)				\
	if (F_ISSET((env), ENV_OPEN_CALLED))				\
		return (__db_mi_open(env, name, 1));
#define	ENV_ILLEGAL_BEFORE_OPEN(env, name)				\
	if (!F_ISSET((env), ENV_OPEN_CALLED))				\
		return (__db_mi_open(env, name, 0));

/*
 * We're not actually user hostile, honest.
 *
 * ENV_REQUIRES_CONFIG --
 *	Return the result of __env_not_config from the enclosing function
 *	when the subsystem handle expression is NULL.  The handle argument is
 *	parenthesized before the comparison so any pointer expression (for
 *	example a conditional expression) is tested as a whole.
 * ENV_REQUIRES_CONFIG_XX --
 *	Same, but "handle" is the NAME of an ENV member and the test is on
 *	that member's region field.
 * ENV_NOT_CONFIGURED --
 *	Apply ENV_REQUIRES_CONFIG only once the environment has been opened.
 */
#define	ENV_REQUIRES_CONFIG(env, handle, i, flags)			\
	if ((handle) == NULL)						\
		return (__env_not_config(env, i, flags));
#define	ENV_REQUIRES_CONFIG_XX(env, handle, i, flags)			\
	if ((env)->handle->region == NULL)				\
		return (__env_not_config(env, i, flags));
#define	ENV_NOT_CONFIGURED(env, handle, i, flags)			\
	if (F_ISSET((env), ENV_OPEN_CALLED))				\
		ENV_REQUIRES_CONFIG(env, handle, i, flags)
440
/*
 * ENV_ENTER --
 *	Mark the calling thread as active inside the library.
 *
 * Runs PANIC_CHECK(env) first.  When thread tracking is off (the
 * environment has no thr_hashtab), ip is simply set to NULL.  Otherwise
 * __env_set_state is called with THREAD_ACTIVE and a pointer to ip; if that
 * call fails its return value is returned from the ENCLOSING function, so
 * this macro may only be used in functions that return an int error code.
 *
 * ip must be a modifiable lvalue: it is assigned to and has its address
 * taken.
 */
#define	ENV_ENTER(env, ip) do {						\
	int __ret;							\
	PANIC_CHECK(env);						\
	if ((env)->thr_hashtab == NULL)					\
		ip = NULL;						\
	else {								\
		if ((__ret =						\
		    __env_set_state(env, &(ip), THREAD_ACTIVE)) != 0)	\
			return (__ret);					\
	}								\
} while (0)

/*
 * FAILCHK_THREAD --
 *	Switch a tracked thread (non-NULL ip) to the THREAD_FAILCHK state.
 *	The env argument is not used by the expansion.
 */
#define	FAILCHK_THREAD(env, ip) do {					\
	if ((ip) != NULL)						\
		(ip)->dbth_state = THREAD_FAILCHK;			\
} while (0)

/*
 * ENV_GET_THREAD_INFO is an alias for ENV_ENTER, so it has the same effects
 * as ENV_ENTER, including the panic check and the state change.
 */
#define	ENV_GET_THREAD_INFO(env, ip) ENV_ENTER(env, ip)
459
/*
 * ENV_LEAVE --
 *	Mark a tracked thread (non-NULL ip) as being outside the library by
 *	setting its state to THREAD_OUT.  DIAGNOSTIC builds first assert that
 *	the thread was in the THREAD_ACTIVE or THREAD_FAILCHK state.
 */
#ifdef DIAGNOSTIC
#define	ENV_LEAVE(env, ip) do {						\
	if ((ip) != NULL) {						\
		DB_ASSERT(env, ((ip)->dbth_state == THREAD_ACTIVE  ||	\
		    (ip)->dbth_state == THREAD_FAILCHK));		\
		(ip)->dbth_state = THREAD_OUT;				\
	}								\
} while (0)
#else
#define	ENV_LEAVE(env, ip) do {						\
	if ((ip) != NULL)						\
		(ip)->dbth_state = THREAD_OUT;				\
} while (0)
#endif

/*
 * CHECK_THREAD --
 *	DIAGNOSTIC builds only: when thread tracking is on, call
 *	__env_set_state with THREAD_VERIFY (the result is ignored).
 * CHECK_MTX_THREAD --
 *	DIAGNOSTIC and HAVE_STATISTICS builds only: run CHECK_THREAD unless
 *	the mutex's alloc_id is MTX_MUTEX_REGION, MTX_ENV_REGION or
 *	MTX_APPLICATION.
 *
 * In every other configuration both macros expand to nothing.
 *
 * The mtx argument is parenthesized before it is dereferenced so that any
 * pointer expression can be passed.
 */
#ifdef DIAGNOSTIC
#define	CHECK_THREAD(env) do {						\
	if ((env)->thr_hashtab != NULL)					\
		(void)__env_set_state(env, NULL, THREAD_VERIFY);	\
} while (0)
#ifdef HAVE_STATISTICS
#define	CHECK_MTX_THREAD(env, mtx) do {					\
	if ((mtx)->alloc_id != MTX_MUTEX_REGION &&			\
	    (mtx)->alloc_id != MTX_ENV_REGION &&			\
	    (mtx)->alloc_id != MTX_APPLICATION)				\
		CHECK_THREAD(env);					\
} while (0)
#else
#define	CHECK_MTX_THREAD(env, mtx)
#endif
#else
#define	CHECK_THREAD(env)
#define	CHECK_MTX_THREAD(env, mtx)
#endif
493
/*
 * States a tracked thread can be in (stored in dbth_state below).  Among the
 * macros in this header: ENV_ENTER requests THREAD_ACTIVE, ENV_LEAVE stores
 * THREAD_OUT, FAILCHK_THREAD stores THREAD_FAILCHK and CHECK_THREAD passes
 * THREAD_VERIFY to __env_set_state.
 */
typedef enum {
	THREAD_SLOT_NOT_IN_USE=0,
	THREAD_OUT,
	THREAD_ACTIVE,
	THREAD_BLOCKED,
	THREAD_BLOCKED_DEAD,
	THREAD_FAILCHK,
	THREAD_VERIFY
} DB_THREAD_STATE;

/* One pinned buffer: where it is and which region holds it. */
typedef struct __pin_list {
	roff_t b_ref;		/* offset to buffer. */
	int region;		/* region containing buffer. */
} PIN_LIST;
/* Number of PIN_LIST slots embedded directly in each thread's record. */
#define	PINMAX 4

/* Per-thread tracking record. */
struct __db_thread_info {
	pid_t		dbth_pid;
	db_threadid_t	dbth_tid;
	DB_THREAD_STATE	dbth_state;
	SH_TAILQ_ENTRY	dbth_links;
	/*
	 * The following fields track which buffers this thread of
	 * control has pinned in the mpool buffer cache.
	 */
	u_int16_t	dbth_pincount;	/* Number of pins for this thread. */
	u_int16_t	dbth_pinmax;	/* Number of slots allocated. */
	roff_t		dbth_pinlist;	/* List of pins. */
	PIN_LIST	dbth_pinarray[PINMAX];	/* Initial array of slots. */
};

/*
 * Thread-tracking bookkeeping.  NOTE(review): the field names suggest
 * current/maximum thread counts, the hash bucket count and the offset of
 * the hash table -- confirm against the code that fills this in.
 */
typedef struct __env_thread_info {
	u_int32_t	thr_count;
	u_int32_t	thr_max;
	u_int32_t	thr_nbucket;
	roff_t		thr_hashoff;
} THREAD_INFO;
531
/*
 * DB_EVENT --
 *	Deliver event e, with its einfo argument, to the db_event_func
 *	callback of the DB_ENV linked to env.  Nothing happens when no
 *	callback is registered.
 */
#define	DB_EVENT(env, e, einfo) do {					\
	DB_ENV *__dbenv = (env)->dbenv;					\
	if (__dbenv->db_event_func != NULL)				\
		__dbenv->db_event_func(__dbenv, e, einfo);		\
} while (0)

/* One entry of a flag translation table: an input flag and its output flag. */
typedef struct __flag_map {
	u_int32_t inflag, outflag;
} FLAG_MAP;
541
/*
 * Internal database environment structure.
 *
 * This is the private database environment handle.  The public environment
 * handle is the DB_ENV structure.   The library owns this structure, the user
 * owns the DB_ENV structure.  The reason there are two structures is because
 * the user's configuration outlives any particular DB_ENV->open call, and
 * separate structures allow us to easily discard internal information without
 * discarding the user's configuration.
 */
struct __env {
	DB_ENV *dbenv;			/* Linked DB_ENV structure */

	/*
	 * The ENV structure can be used concurrently, so field access is
	 * protected.
	 */
	db_mutex_t mtx_env;		/* ENV structure mutex */

	/*
	 * Some fields are included in the ENV structure rather than in the
	 * DB_ENV structure because they are only set as arguments to the
	 * DB_ENV->open method.  In other words, because of the historic API,
	 * not for any rational reason.
	 *
	 * Arguments to DB_ENV->open.
	 */
	char	 *db_home;		/* Database home */
	u_int32_t open_flags;		/* Flags */
	int	  db_mode;		/* Default open permissions */

	pid_t	pid_cache;		/* Cached process ID */

	DB_FH	*lockfhp;		/* fcntl(2) locking file handle */

	DB_LOCKER *env_lref;		/* Locker in non-threaded handles */

	DB_DISTAB   recover_dtab;	/* Dispatch table for recover funcs */

	int dir_mode;			/* Intermediate directory perms. */

	/* Thread tracking */
	u_int32_t	 thr_nbucket;	/* Number of hash buckets */
	DB_HASHTAB	*thr_hashtab;	/* Hash table of DB_THREAD_INFO */

	/* Mutex allocation */
	struct {
		int	  alloc_id;	/* Allocation ID argument */
		u_int32_t flags;	/* Flags argument */
	} *mutex_iq;			/* Initial mutexes queue */
	u_int		mutex_iq_next;	/* Count of initial mutexes */
	u_int		mutex_iq_max;	/* Maximum initial mutexes */

	/*
	 * List of open DB handles for this ENV, used for cursor
	 * adjustment.  Must be protected for multi-threaded support.
	 */
	db_mutex_t mtx_dblist;
	int	   db_ref;		/* DB handle reference count */
	TAILQ_HEAD(__dblist, __db) dblist;

	/*
	 * List of open file handles for this ENV.  Must be protected
	 * for multi-threaded support.
	 */
	TAILQ_HEAD(__fdlist, __fh_t) fdlist;

	db_mutex_t	 mtx_mt;	/* Mersenne Twister mutex */
	int		 mti;		/* Mersenne Twister index */
	u_long		*mt;		/* Mersenne Twister state vector */

	/*
	 * Subsystem handles.  A NULL handle means the subsystem is not
	 * configured; that is what the XXX_ON macros above test.
	 */
	DB_CIPHER	*crypto_handle;	/* Crypto handle */
	DB_LOCKTAB	*lk_handle;	/* Lock handle */
	DB_LOG		*lg_handle;	/* Log handle */
	DB_MPOOL	*mp_handle;	/* Mpool handle */
	DB_MUTEXMGR	*mutex_handle;	/* Mutex handle */
	DB_REP		*rep_handle;	/* Replication handle */
	DB_TXNMGR	*tx_handle;	/* Txn handle */

	/* Application callback to copy data to/from a custom data source */
#define	DB_USERCOPY_GETDATA	0x0001
#define	DB_USERCOPY_SETDATA	0x0002
	int (*dbt_usercopy)
	    __P((DBT *, u_int32_t, void *, u_int32_t, u_int32_t));

	REGINFO	*reginfo;		/* REGINFO structure reference */

	/* Test hook identifiers, numbered consecutively from 1. */
#define	DB_TEST_ELECTINIT	 1	/* after __rep_elect_init */
#define	DB_TEST_ELECTVOTE1	 2	/* after sending VOTE1 */
#define	DB_TEST_POSTDESTROY	 3	/* after destroy op */
#define	DB_TEST_POSTLOG		 4	/* after logging all pages */
#define	DB_TEST_POSTLOGMETA	 5	/* after logging meta in btree */
#define	DB_TEST_POSTOPEN	 6	/* after __os_open */
#define	DB_TEST_POSTSYNC	 7	/* after syncing the log */
#define	DB_TEST_PREDESTROY	 8	/* before destroy op */
#define	DB_TEST_PREOPEN		 9	/* before __os_open */
#define	DB_TEST_SUBDB_LOCKS	 10	/* subdb locking tests */
	int	test_abort;		/* Abort value for testing */
	int	test_check;		/* Checkpoint value for testing */
	int	test_copy;		/* Copy value for testing */

	/* Bit values for the flags field below; each is a distinct bit. */
#define	ENV_CDB			0x00000001 /* DB_INIT_CDB */
#define	ENV_DBLOCAL		0x00000002 /* Environment for a private DB */
#define	ENV_LITTLEENDIAN	0x00000004 /* Little endian system. */
#define	ENV_LOCKDOWN		0x00000008 /* DB_LOCKDOWN set */
#define	ENV_NO_OUTPUT_SET	0x00000010 /* No output channel set */
#define	ENV_OPEN_CALLED		0x00000020 /* DB_ENV->open called */
#define	ENV_PRIVATE		0x00000040 /* DB_PRIVATE set */
#define	ENV_RECOVER_FATAL	0x00000080 /* Doing fatal recovery in env */
#define	ENV_REF_COUNTED		0x00000100 /* Region references this handle */
#define	ENV_SYSTEM_MEM		0x00000200 /* DB_SYSTEM_MEM set */
#define	ENV_THREAD		0x00000400 /* DB_THREAD set */
	u_int32_t flags;
};
656
657/*******************************************************
658 * Database Access Methods.
659 *******************************************************/
/*
 * DB_IS_THREADED --
 *	The database handle is free-threaded (was opened with DB_THREAD).
 *	The test is whether the handle's mutex is something other than
 *	MUTEX_INVALID.
 */
#define	DB_IS_THREADED(dbp)						\
	((dbp)->mutex != MUTEX_INVALID)

/*
 * Initialization methods are often illegal before/after open is called.
 *
 * Like their ENV_ counterparts, these expand to a bare "if (...) return
 * (...);" statement: they return from the ENCLOSING function when the check
 * fails, and must not be used as the unbraced body of an if that has an
 * else.
 */
#define	DB_ILLEGAL_AFTER_OPEN(dbp, name)				\
	if (F_ISSET((dbp), DB_AM_OPEN_CALLED))				\
		return (__db_mi_open((dbp)->env, name, 1));
#define	DB_ILLEGAL_BEFORE_OPEN(dbp, name)				\
	if (!F_ISSET((dbp), DB_AM_OPEN_CALLED))				\
		return (__db_mi_open((dbp)->env, name, 0));
/* Some initialization methods are illegal if environment isn't local. */
#define	DB_ILLEGAL_IN_ENV(dbp, name)					\
	if (!F_ISSET((dbp)->env, ENV_DBLOCAL))				\
		return (__db_mi_env((dbp)->env, name));
/*
 * Return the result of __dbh_am_chk(dbp, flags) from the enclosing function
 * when it is non-zero.  Expands to a braced block with its own __ret.
 */
#define	DB_ILLEGAL_METHOD(dbp, flags) {					\
	int __ret;							\
	if ((__ret = __dbh_am_chk(dbp, flags)) != 0)			\
		return (__ret);						\
}
683
/*
 * Common DBC->internal fields.  Each access method adds additional fields
 * to this list, but the initial fields are common.
 *
 * __DBC_INTERNAL expands to a list of member declarations, so it is only
 * meaningful at the start of a structure definition, as in
 * struct __dbc_internal below.
 */
#define	__DBC_INTERNAL							\
	DBC	 *opd;			/* Off-page duplicate cursor. */\
	DBC	 *pdbc;			/* Pointer to parent cursor. */ \
									\
	void	 *page;			/* Referenced page. */		\
	u_int32_t part;			/* Partition number. */		\
	db_pgno_t root;			/* Tree root. */		\
	db_pgno_t pgno;			/* Referenced page number. */	\
	db_indx_t indx;			/* Referenced key item index. */\
									\
	/* Streaming -- cache last position. */				\
	db_pgno_t stream_start_pgno;	/* Last start pgno. */		\
	u_int32_t stream_off;		/* Current offset. */		\
	db_pgno_t stream_curr_pgno;	/* Current overflow page. */	\
									\
	DB_LOCK		lock;		/* Cursor lock. */		\
	db_lockmode_t	lock_mode;	/* Lock mode. */

/* The common fields on their own, with no access-method additions. */
struct __dbc_internal {
	__DBC_INTERNAL
};

/* Actions that __db_master_update can take. */
typedef enum { MU_REMOVE, MU_RENAME, MU_OPEN } mu_action;
712
/*
 * Access-method-common macro for determining whether a cursor
 * has been initialized.
 *
 * A cursor counts as initialized when the page number in its internal
 * structure is not PGNO_INVALID.  With HAVE_PARTITION, a cursor on a
 * partitioned database is tested through its sub-cursor instead, and is
 * uninitialized when it has no sub-cursor.
 */
#ifdef HAVE_PARTITION
#define	IS_INITIALIZED(dbc)	(DB_IS_PARTITIONED((dbc)->dbp) ?	\
		((PART_CURSOR *)(dbc)->internal)->sub_cursor != NULL && \
		((PART_CURSOR *)(dbc)->internal)->sub_cursor->		\
		    internal->pgno != PGNO_INVALID :			\
		(dbc)->internal->pgno != PGNO_INVALID)
#else
#define	IS_INITIALIZED(dbc)	((dbc)->internal->pgno != PGNO_INVALID)
#endif

/*
 * Free the callback-allocated buffer, if necessary, hanging off of a DBT.
 *
 * dbt is a pointer to the DBT.  When its DB_DBT_APPMALLOC flag is set, the
 * data buffer is released with __os_ufree and the flag is cleared.
 *
 * This expands to a bare if-block rather than a do/while, so it must not be
 * used as the unbraced body of an if that has an else.
 */
#define	FREE_IF_NEEDED(env, dbt)					\
	if (F_ISSET((dbt), DB_DBT_APPMALLOC)) {				\
		__os_ufree((env), (dbt)->data);				\
		F_CLR((dbt), DB_DBT_APPMALLOC);				\
	}
733
/*
 * Return-memory selection for get operations that pass no DBT flags.
 *
 * A cursor's rskey/rkey/rdata pointers name the memory used to return
 * results.  Each macro below repoints all three at once.
 *
 * SET_RET_MEM --
 *	Make cursor "dbc" return results in the my_rskey/my_rkey/my_rdata
 *	members belonging to object "owner".
 */
#define	SET_RET_MEM(dbc, owner) do {					\
	(dbc)->rskey = &(owner)->my_rskey;				\
	(dbc)->rkey = &(owner)->my_rkey;				\
	(dbc)->rdata = &(owner)->my_rdata;				\
} while (0)

/*
 * COPY_RET_MEM --
 *	Make "dest" use whatever return memory "src" is currently using.
 */
#define	COPY_RET_MEM(src, dest) do {					\
	(dest)->rskey = (src)->rskey;					\
	(dest)->rkey = (src)->rkey;					\
	(dest)->rdata = (src)->rdata;					\
} while (0)

/*
 * RESET_RET_MEM --
 *	Point the cursor back at its own return memory, i.e. the cursor is
 *	its own owner.
 */
#define	RESET_RET_MEM(dbc)	SET_RET_MEM(dbc, dbc)
760
761/*******************************************************
762 * Mpool.
763 *******************************************************/
/*
 * File types for DB access methods.  Negative numbers are reserved to DB.
 *
 * The *_NOTSET values are sentinels meaning "no value has been supplied
 * yet" for the corresponding setting.
 */
#define	DB_FTYPE_SET		-1		/* Call pgin/pgout functions. */
#define	DB_FTYPE_NOTSET		 0		/* Don't call... */
#define	DB_LSN_OFF_NOTSET	-1		/* Not yet set. */
#define	DB_CLEARLEN_NOTSET	UINT32_MAX	/* Not yet set. */

/* Structure used as the DB pgin/pgout pgcookie. */
typedef struct __dbpginfo {
	size_t	db_pagesize;		/* Underlying page size. */
	u_int32_t flags;		/* Some DB_AM flags needed. */
	DBTYPE  type;			/* DB type */
} DB_PGINFO;
778
779/*******************************************************
780 * Log.
781 *******************************************************/
/*
 * LSN helpers.  Except for LOG_COMPARE, the argument is the LSN object
 * itself (not a pointer); an LSN is a (file, offset) pair.
 *
 * Distinguished values:
 *	zero		file 0, offset 0
 *	initial		file 1, offset 0
 *	maximum		file UINT32_MAX, offset UINT32_MAX
 *	not logged	file 0, offset 1
 */

/* The 'zero' LSN. */
#define	ZERO_LSN(LSN) do {						\
	(LSN).file = 0;							\
	(LSN).offset = 0;						\
} while (0)
#define	IS_ZERO_LSN(LSN)						\
	((LSN).file == 0 && (LSN).offset == 0)

/* The initial LSN. */
#define	INIT_LSN(LSN) do {						\
	(LSN).file = 1;							\
	(LSN).offset = 0;						\
} while (0)
#define	IS_INIT_LSN(LSN)						\
	((LSN).file == 1 && (LSN).offset == 0)

/* The largest representable LSN. */
#define	MAX_LSN(LSN) do {						\
	(LSN).file = UINT32_MAX;					\
	(LSN).offset = UINT32_MAX;					\
} while (0)
#define	IS_MAX_LSN(LSN)							\
	((LSN).file == UINT32_MAX && (LSN).offset == UINT32_MAX)

/* If logging is turned off, smash the lsn to the "not logged" value. */
#define	LSN_NOT_LOGGED(LSN) do {					\
	(LSN).file = 0;							\
	(LSN).offset = 1;						\
} while (0)
#define	IS_NOT_LOGGED_LSN(LSN)						\
	((LSN).file == 0 && (LSN).offset == 1)

/*
 * LOG_COMPARE --
 *	Compare two LSNs, given as pointers.  Evaluates to -1, 0 or 1 as
 *	lsn0 is less than, equal to or greater than lsn1.  The file numbers
 *	decide unless they are equal, in which case the offsets decide.
 */
#define	LOG_COMPARE(lsn0, lsn1)						\
	((lsn0)->file == (lsn1)->file ?					\
	    ((lsn0)->offset == (lsn1)->offset ? 0 :			\
	    ((lsn0)->offset < (lsn1)->offset ? -1 : 1)) :		\
	    ((lsn0)->file < (lsn1)->file ? -1 : 1))
819
820/*******************************************************
821 * Txn.
822 *******************************************************/
/*
 * Transaction predicates.  All of them accept a NULL transaction and treat
 * it as "no": the NULL test comes first, so the handle is only dereferenced
 * when it is non-NULL.
 *
 * DB_NONBLOCK		C has a transaction with TXN_NOWAIT set.
 * NOWAIT_FLAG		DB_LOCK_NOWAIT if txn has TXN_NOWAIT set, else 0.
 * IS_REAL_TXN		txn exists and does not have TXN_CDSGROUP set.
 * IS_SUBTRANSACTION	txn exists and has a parent.
 */
#define	DB_NONBLOCK(C)	((C)->txn != NULL && F_ISSET((C)->txn, TXN_NOWAIT))
#define	NOWAIT_FLAG(txn) \
	((txn) != NULL && F_ISSET((txn), TXN_NOWAIT) ? DB_LOCK_NOWAIT : 0)
#define	IS_REAL_TXN(txn)						\
	((txn) != NULL && !F_ISSET(txn, TXN_CDSGROUP))
#define	IS_SUBTRANSACTION(txn)						\
	((txn) != NULL && (txn)->parent != NULL)
830
831/*******************************************************
832 * Crypto.
833 *******************************************************/
#define	DB_IV_BYTES     16		/* Bytes per IV */
#define	DB_MAC_KEY	20		/* Bytes per MAC checksum */

/*******************************************************
 * Compression
 *******************************************************/
/*
 * Byte values below CMP_INT_SPARE_VAL may be produced by the integer
 * compression algorithm; this value and those above it are not.
 */
#define CMP_INT_SPARE_VAL	0xFC	/* Smallest byte value that the integer
					   compression algorithm doesn't use */
842
843/*******************************************************
844 * Secondaries over RPC.
845 *******************************************************/
#ifdef CONFIG_TEST
/*
 * These are flags passed to DB->associate calls by the Tcl API if running
 * over RPC.  The RPC server will mask out these flags before making the real
 * DB->associate call.
 *
 * These flags must coexist with the valid flags to DB->associate (currently
 * DB_AUTO_COMMIT and DB_CREATE).  DB_AUTO_COMMIT is in the group of
 * high-order shared flags (0xff000000), and DB_CREATE is in the low-order
 * group (0x00000fff), so we pick a range in between.
 *
 * Note that the values below are consecutive numbers (1 through 9) stored in
 * the four bits selected by DB_RPC2ND_MASK, not independent bits: for
 * example, DB_RPC2ND_CONCATKEYDATA has the same bits as
 * DB_RPC2ND_REVERSEDATA | DB_RPC2ND_NOOP.  Compare the masked value for
 * equality rather than testing individual bits.
 */
#define	DB_RPC2ND_MASK		0x00f00000 /* Reserved bits. */

#define	DB_RPC2ND_REVERSEDATA	0x00100000 /* callback_n(0) _s_reversedata. */
#define	DB_RPC2ND_NOOP		0x00200000 /* callback_n(1) _s_noop */
#define	DB_RPC2ND_CONCATKEYDATA	0x00300000 /* callback_n(2) _s_concatkeydata */
#define	DB_RPC2ND_CONCATDATAKEY 0x00400000 /* callback_n(3) _s_concatdatakey */
#define	DB_RPC2ND_REVERSECONCAT	0x00500000 /* callback_n(4) _s_reverseconcat */
#define	DB_RPC2ND_TRUNCDATA	0x00600000 /* callback_n(5) _s_truncdata */
#define	DB_RPC2ND_CONSTANT	0x00700000 /* callback_n(6) _s_constant */
#define	DB_RPC2ND_GETZIP	0x00800000 /* sj_getzip */
#define	DB_RPC2ND_GETNAME	0x00900000 /* sj_getname */
#endif
869
870#if defined(__cplusplus)
871}
872#endif
873
874/*******************************************************
875 * Remaining general DB includes.
876 *******************************************************/
877@db_int_def@
878
879#include "dbinc/globals.h"
880#include "dbinc/clock.h"
881#include "dbinc/debug.h"
882#include "dbinc/region.h"
883#include "dbinc_auto/env_ext.h"
884#include "dbinc/mutex.h"
885#ifdef HAVE_REPLICATION_THREADS
886#include "dbinc/repmgr.h"
887#endif
888#include "dbinc/rep.h"
889#include "dbinc/os.h"
890#include "dbinc_auto/clib_ext.h"
891#include "dbinc_auto/common_ext.h"
892
893/*******************************************************
894 * Remaining Log.
895 * These need to be defined after the general includes
896 * because they need rep.h from above.
897 *******************************************************/
/*
 * Test if the environment is currently logging changes.  If we're in recovery
 * or we're a replication client, we don't need to log changes because they're
 * already in the log, even though we have a fully functional log system.
 *
 * That is: logging is configured, the environment is not a replication
 * client, and recovery is not running.
 */
#define	DBENV_LOGGING(env)						\
	(LOGGING_ON(env) && !IS_REP_CLIENT(env) && (!IS_RECOVERING(env)))

/*
 * Test if we need to log a change.  By default, we don't log operations without
 * associated transactions, unless DIAGNOSTIC, DEBUG_ROP or DEBUG_WOP are on.
 * This is because we want to get log records for read/write operations, and, if
 * we are trying to debug something, more information is always better.
 *
 * The DBC_RECOVER flag is set when we're in abort, as well as during recovery;
 * thus DBC_LOGGING may be false for a particular dbc even when DBENV_LOGGING
 * is true.
 *
 * We explicitly use LOGGING_ON/IS_REP_CLIENT here because we don't want to pull
 * in the log headers, which IS_RECOVERING (and thus DBENV_LOGGING) rely on, and
 * because DBC_RECOVER should be set anytime IS_RECOVERING would be true.
 *
 * If we're not in recovery (master - doing an abort or a client applying
 * a txn), then a client's only path through here is on an internal
 * operation, and a master's only path through here is a transactional
 * operation.  Detect if either is not the case.
 *
 * In DIAGNOSTIC, DEBUG_ROP and DEBUG_WOP builds the decision is delegated to
 * the function __dbc_logging(); otherwise it is the inline test below, which
 * requires a transaction on the cursor, logging configured, DBC_RECOVER
 * clear, and an environment that is not a replication client.
 */
#if defined(DIAGNOSTIC) || defined(DEBUG_ROP)  || defined(DEBUG_WOP)
#define	DBC_LOGGING(dbc)	__dbc_logging(dbc)
#else
#define	DBC_LOGGING(dbc)						\
	((dbc)->txn != NULL && LOGGING_ON((dbc)->env) &&		\
	    !F_ISSET((dbc), DBC_RECOVER) && !IS_REP_CLIENT((dbc)->env))
#endif
932
933#endif /* !_DB_INT_H_ */
934