1/*-
2 * See the file LICENSE for redistribution information.
3 *
4 * Copyright (c) 1996,2008 Oracle.  All rights reserved.
5 */
6/*
7 * Copyright (c) 1990, 1993, 1994, 1995, 1996
8 *	Keith Bostic.  All rights reserved.
9 */
10/*
11 * Copyright (c) 1990, 1993, 1994, 1995
12 *	The Regents of the University of California.  All rights reserved.
13 *
14 * This code is derived from software contributed to Berkeley by
15 * Mike Olson.
16 *
17 * Redistribution and use in source and binary forms, with or without
18 * modification, are permitted provided that the following conditions
19 * are met:
20 * 1. Redistributions of source code must retain the above copyright
21 *    notice, this list of conditions and the following disclaimer.
22 * 2. Redistributions in binary form must reproduce the above copyright
23 *    notice, this list of conditions and the following disclaimer in the
24 *    documentation and/or other materials provided with the distribution.
25 * 3. Neither the name of the University nor the names of its contributors
26 *    may be used to endorse or promote products derived from this software
27 *    without specific prior written permission.
28 *
29 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
30 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
31 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
32 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
33 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
34 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
35 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
36 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
37 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
38 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
39 * SUCH DAMAGE.
40 *
41 * $Id: btree.h,v 12.17 2008/01/08 20:58:17 bostic Exp $
42 */
43#ifndef	_DB_BTREE_H_
44#define	_DB_BTREE_H_
45
46#if defined(__cplusplus)
47extern "C" {
48#endif
49
50/* Forward structure declarations. */
51struct __btree;		typedef struct __btree BTREE;
52struct __cursor;	typedef struct __cursor BTREE_CURSOR;
53struct __epg;		typedef struct __epg EPG;
54
55#define	DEFMINKEYPAGE	 (2)
56
57/*
58 * A recno order of 0 indicates that we don't have an order, not that we've
59 * an order less than 1.
60 */
61#define	INVALID_ORDER	0
62
63#define	ISINTERNAL(p)	(TYPE(p) == P_IBTREE || TYPE(p) == P_IRECNO)
64#define	ISLEAF(p)	(TYPE(p) == P_LBTREE ||				\
65			    TYPE(p) == P_LRECNO || TYPE(p) == P_LDUP)
66
67/* Flags for __bam_cadjust_log(). */
68#define	CAD_UPDATEROOT	0x01		/* Root page count was updated. */
69
70/* Flags for __bam_split_log(). */
71#define	SPL_NRECS	0x01		/* Split tree has record count. */
72
73/* Flags for __bam_iitem(). */
74#define	BI_DELETED	0x01		/* Key/data pair only placeholder. */
75
76/* Flags for __bam_stkrel(). */
77#define	STK_CLRDBC	0x01		/* Clear dbc->page reference. */
78#define	STK_NOLOCK	0x02		/* Don't retain locks. */
79#define	STK_PGONLY	0x04
80
81/* Flags for __ram_ca(). These get logged, so make the values explicit. */
82typedef enum {
83	CA_DELETE = 0,			/* Delete the current record. */
84	CA_IAFTER = 1,			/* Insert before the current record. */
85	CA_IBEFORE = 2,			/* Insert after the current record. */
86	CA_ICURRENT = 3			/* Overwrite the current record. */
87} ca_recno_arg;
88
89/*
90 * Flags for __bam_search() and __bam_rsearch().
91 *
92 * Note, internal page searches must find the largest record less than key in
93 * the tree so that descents work.  Leaf page searches must find the smallest
94 * record greater than key so that the returned index is the record's correct
95 * position for insertion.
96 *
97 * The flags parameter to the search routines describes three aspects of the
98 * search: the type of locking required (including if we're locking a pair of
99 * pages), the item to return in the presence of duplicates and whether or not
100 * to return deleted entries.  To simplify both the mnemonic representation
101 * and the code that checks for various cases, we construct a set of bitmasks.
102 */
103#define	SR_READ		0x00001		/* Read locks. */
104#define	SR_WRITE	0x00002		/* Write locks. */
105
106#define	SR_APPEND	0x00040		/* Append to the tree. */
107#define	SR_DELNO	0x00080		/* Don't return deleted items. */
108#define	SR_DUPFIRST	0x00100		/* Return first duplicate. */
109#define	SR_DUPLAST	0x00200		/* Return last duplicate. */
110#define	SR_EXACT	0x00400		/* Exact items only. */
111#define	SR_PARENT	0x00800		/* Lock page pair. */
112#define	SR_STACK	0x01000		/* Need a complete stack. */
113#define	SR_PAST_EOF	0x02000		/* If doing insert search (or keyfirst
114					 * or keylast operations), or a split
115					 * on behalf of an insert, it's okay to
116					 * return an entry one past end-of-page.
117					 */
118#define	SR_STK_ONLY	0x04000		/* Just return info in the stack */
119#define	SR_MAX		0x08000		/* Get the right most key */
120#define	SR_MIN		0x10000		/* Get the left most key */
121#define	SR_NEXT		0x20000		/* Get the page after this key */
122#define	SR_DEL		0x40000		/* Get the tree to delete this key. */
123#define	SR_START	0x80000		/* Level to start stack. */
124
125#define	SR_DELETE							\
126	(SR_WRITE | SR_DUPFIRST | SR_DELNO | SR_EXACT | SR_STACK)
127#define	SR_FIND		(SR_READ | SR_DUPFIRST | SR_DELNO)
128#define	SR_FIND_WR	(SR_WRITE | SR_DUPFIRST | SR_DELNO)
129#define	SR_INSERT	(SR_WRITE | SR_DUPLAST | SR_PAST_EOF | SR_STACK)
130#define	SR_KEYFIRST	(SR_WRITE | SR_DUPFIRST | SR_PAST_EOF | SR_STACK)
131#define	SR_KEYLAST	(SR_WRITE | SR_DUPLAST | SR_PAST_EOF | SR_STACK)
132#define	SR_WRPAIR	(SR_WRITE | SR_DUPLAST | SR_PAST_EOF | SR_PARENT)
133
134/*
135 * Various routines pass around page references.  A page reference is
136 * a pointer to the page, and the indx indicates an item on the page.
137 * Each page reference may include a lock.
138 */
139struct __epg {
140	PAGE	     *page;		/* The page. */
141	db_indx_t     indx;		/* The index on the page. */
142	db_indx_t     entries;		/* The number of entries on page */
143	DB_LOCK	      lock;		/* The page's lock. */
144	db_lockmode_t lock_mode;	/* The lock mode. */
145};
146
147/*
148 * We maintain a stack of the pages that we're locking in the tree.  Grow
149 * the stack as necessary.
150 *
151 * XXX
152 * Temporary fix for #3243 -- clear the page and lock from the stack entry.
153 * The correct fix is to never release a stack that doesn't hold items.
154 */
155#define	BT_STK_CLR(c) do {						\
156	(c)->csp = (c)->sp;						\
157	(c)->csp->page = NULL;						\
158	LOCK_INIT((c)->csp->lock);					\
159} while (0)
160
161#define	BT_STK_ENTER(env, c, pagep, page_indx, l, mode, ret) do {	\
162	if ((ret = ((c)->csp == (c)->esp ?				\
163	    __bam_stkgrow(env, c) : 0)) == 0) {			\
164		(c)->csp->page = pagep;					\
165		(c)->csp->indx = (page_indx);				\
166		(c)->csp->entries = NUM_ENT(pagep);			\
167		(c)->csp->lock = l;					\
168		(c)->csp->lock_mode = mode;				\
169	}								\
170} while (0)
171
172#define	BT_STK_PUSH(env, c, pagep, page_indx, lock, mode, ret) do {	\
173	BT_STK_ENTER(env, c, pagep, page_indx, lock, mode, ret);	\
174	++(c)->csp;							\
175} while (0)
176
177#define	BT_STK_NUM(env, c, pagep, page_indx, ret) do {		\
178	if ((ret = ((c)->csp ==						\
179	    (c)->esp ? __bam_stkgrow(env, c) : 0)) == 0) {		\
180		(c)->csp->page = NULL;					\
181		(c)->csp->indx = (page_indx);				\
182		(c)->csp->entries = NUM_ENT(pagep);			\
183		LOCK_INIT((c)->csp->lock);				\
184		(c)->csp->lock_mode = DB_LOCK_NG;			\
185	}								\
186} while (0)
187
188#define	BT_STK_NUMPUSH(env, c, pagep, page_indx, ret) do {		\
189	BT_STK_NUM(env, cp, pagep, page_indx, ret);			\
190	++(c)->csp;							\
191} while (0)
192
193#define	BT_STK_POP(c)							\
194	((c)->csp == (c)->sp ? NULL : --(c)->csp)
195
196/* Btree/Recno cursor. */
197struct __cursor {
198	/* struct __dbc_internal */
199	__DBC_INTERNAL
200
201	/* btree private part */
202	EPG		*sp;		/* Stack pointer. */
203	EPG		*csp;		/* Current stack entry. */
204	EPG		*esp;		/* End stack pointer. */
205	EPG		 stack[5];
206
207	db_indx_t	 ovflsize;	/* Maximum key/data on-page size. */
208
209	db_recno_t	 recno;		/* Current record number. */
210	u_int32_t	 order;		/* Relative order among deleted curs. */
211
212	/*
213	 * Btree:
214	 * We set a flag in the cursor structure if the underlying object has
215	 * been deleted.  It's not strictly necessary, we could get the same
216	 * information by looking at the page itself, but this method doesn't
217	 * require us to retrieve the page on cursor delete.
218	 *
219	 * Recno:
220	 * When renumbering recno databases during deletes, cursors referencing
221	 * "deleted" records end up positioned between two records, and so must
222	 * be specially adjusted on the next operation.
223	 */
224#define	C_DELETED	0x0001		/* Record was deleted. */
225	/*
226	 * There are three tree types that require maintaining record numbers.
227	 * Recno AM trees, Btree AM trees for which the DB_RECNUM flag was set,
228	 * and Btree off-page duplicate trees.
229	 */
230#define	C_RECNUM	0x0002		/* Tree requires record counts. */
231	/*
232	 * Recno trees have immutable record numbers by default, but optionally
233	 * support mutable record numbers.  Off-page duplicate Recno trees have
234	 * mutable record numbers.  All Btrees with record numbers (including
235	 * off-page duplicate trees) are mutable by design, no flag is needed.
236	 */
237#define	C_RENUMBER	0x0004		/* Tree records are mutable. */
238	u_int32_t	 flags;
239};
240
241/*
242 * Threshhold value, as a function of bt_minkey, of the number of
243 * bytes a key/data pair can use before being placed on an overflow
244 * page.  Assume every item requires the maximum alignment for
245 * padding, out of sheer paranoia.
246 */
247#define	B_MINKEY_TO_OVFLSIZE(dbp, minkey, pgsize)			\
248	((u_int16_t)(((pgsize) - P_OVERHEAD(dbp)) / ((minkey) * P_INDX) -\
249	    (BKEYDATA_PSIZE(0) + DB_ALIGN(1, sizeof(int32_t)))))
250
251/*
252 * The maximum space that a single item can ever take up on one page.
253 * Used by __bam_split to determine whether a split is still necessary.
254 */
255#define	B_MAX(a,b)	(((a) > (b)) ? (a) : (b))
256#define	B_MAXSIZEONPAGE(ovflsize)					\
257	(B_MAX(BOVERFLOW_PSIZE, BKEYDATA_PSIZE(ovflsize)))
258
259/*
260 * The in-memory, per-tree btree/recno data structure.
261 */
262struct __btree {			/* Btree access method. */
263	/*
264	 * !!!
265	 * These fields are write-once (when the structure is created) and
266	 * so are ignored as far as multi-threading is concerned.
267	 */
268	db_pgno_t bt_meta;		/* Database meta-data page. */
269	db_pgno_t bt_root;		/* Database root page. */
270
271	u_int32_t bt_minkey;		/* Minimum keys per page. */
272
273					/* Btree comparison function. */
274	int (*bt_compare) __P((DB *, const DBT *, const DBT *));
275					/* Btree prefix function. */
276	size_t (*bt_prefix) __P((DB *, const DBT *, const DBT *));
277
278					/* Recno access method. */
279	int	  re_pad;		/* Fixed-length padding byte. */
280	int	  re_delim;		/* Variable-length delimiting byte. */
281	u_int32_t re_len;		/* Length for fixed-length records. */
282	char	 *re_source;		/* Source file name. */
283
284	/*
285	 * !!!
286	 * The bt_lpgno field is NOT protected by any mutex, and for this
287	 * reason must be advisory only, so, while it is read/written by
288	 * multiple threads, DB is completely indifferent to the quality
289	 * of its information.
290	 */
291	db_pgno_t bt_lpgno;		/* Last insert location. */
292	DB_LSN	  bt_llsn;		/* Last insert LSN. */
293
294	/*
295	 * !!!
296	 * The re_modified field is NOT protected by any mutex, and for this
297	 * reason cannot be anything more complicated than a zero/non-zero
298	 * value.  The actual writing of the backing source file cannot be
299	 * threaded, so clearing the flag isn't a problem.
300	 */
301	int	  re_modified;		/* If the tree was modified. */
302
303	/*
304	 * !!!
305	 * These fields are ignored as far as multi-threading is concerned.
306	 * There are no transaction semantics associated with backing files,
307	 * nor is there any thread protection.
308	 */
309	FILE		*re_fp;		/* Source file handle. */
310	int		 re_eof;	/* Backing source file EOF reached. */
311	db_recno_t	 re_last;	/* Last record number read. */
312
313};
314
315/*
316 * Modes for the __bam_curadj recovery records (btree_curadj).
317 * These appear in log records, so we wire the values and
318 * do not leave it up to the compiler.
319 */
320typedef enum {
321	DB_CA_DI	= 1,
322	DB_CA_DUP	= 2,
323	DB_CA_RSPLIT	= 3,
324	DB_CA_SPLIT	= 4
325} db_ca_mode;
326
327/*
328 * Flags for __bam_pinsert.
329 */
330#define	BPI_SPACEONLY	0x01		/* Only check for space to update. */
331#define	BPI_NORECNUM	0x02		/* Not update the recnum on the left. */
332
333#if defined(__cplusplus)
334}
335#endif
336
337#include "dbinc_auto/btree_auto.h"
338#include "dbinc_auto/btree_ext.h"
339#include "dbinc/db_am.h"
340#endif /* !_DB_BTREE_H_ */
341