1/*- 2 * See the file LICENSE for redistribution information. 3 * 4 * Copyright (c) 1996,2008 Oracle. All rights reserved. 5 */ 6/* 7 * Copyright (c) 1990, 1993, 1994, 1995, 1996 8 * Keith Bostic. All rights reserved. 9 */ 10/* 11 * Copyright (c) 1990, 1993, 1994, 1995 12 * The Regents of the University of California. All rights reserved. 13 * 14 * This code is derived from software contributed to Berkeley by 15 * Mike Olson. 16 * 17 * Redistribution and use in source and binary forms, with or without 18 * modification, are permitted provided that the following conditions 19 * are met: 20 * 1. Redistributions of source code must retain the above copyright 21 * notice, this list of conditions and the following disclaimer. 22 * 2. Redistributions in binary form must reproduce the above copyright 23 * notice, this list of conditions and the following disclaimer in the 24 * documentation and/or other materials provided with the distribution. 25 * 3. Neither the name of the University nor the names of its contributors 26 * may be used to endorse or promote products derived from this software 27 * without specific prior written permission. 28 * 29 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 30 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 31 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 32 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 33 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 34 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 35 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 36 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 37 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 38 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 39 * SUCH DAMAGE. 40 * 41 * $Id: btree.h,v 12.17 2008/01/08 20:58:17 bostic Exp $ 42 */ 43#ifndef _DB_BTREE_H_ 44#define _DB_BTREE_H_ 45 46#if defined(__cplusplus) 47extern "C" { 48#endif 49 50/* Forward structure declarations. */ 51struct __btree; typedef struct __btree BTREE; 52struct __cursor; typedef struct __cursor BTREE_CURSOR; 53struct __epg; typedef struct __epg EPG; 54 55#define DEFMINKEYPAGE (2) 56 57/* 58 * A recno order of 0 indicates that we don't have an order, not that we've 59 * an order less than 1. 60 */ 61#define INVALID_ORDER 0 62 63#define ISINTERNAL(p) (TYPE(p) == P_IBTREE || TYPE(p) == P_IRECNO) 64#define ISLEAF(p) (TYPE(p) == P_LBTREE || \ 65 TYPE(p) == P_LRECNO || TYPE(p) == P_LDUP) 66 67/* Flags for __bam_cadjust_log(). */ 68#define CAD_UPDATEROOT 0x01 /* Root page count was updated. */ 69 70/* Flags for __bam_split_log(). */ 71#define SPL_NRECS 0x01 /* Split tree has record count. */ 72 73/* Flags for __bam_iitem(). */ 74#define BI_DELETED 0x01 /* Key/data pair only placeholder. */ 75 76/* Flags for __bam_stkrel(). */ 77#define STK_CLRDBC 0x01 /* Clear dbc->page reference. */ 78#define STK_NOLOCK 0x02 /* Don't retain locks. */ 79#define STK_PGONLY 0x04 80 81/* Flags for __ram_ca(). These get logged, so make the values explicit. */ 82typedef enum { 83 CA_DELETE = 0, /* Delete the current record. */ 84 CA_IAFTER = 1, /* Insert before the current record. */ 85 CA_IBEFORE = 2, /* Insert after the current record. */ 86 CA_ICURRENT = 3 /* Overwrite the current record. */ 87} ca_recno_arg; 88 89/* 90 * Flags for __bam_search() and __bam_rsearch(). 91 * 92 * Note, internal page searches must find the largest record less than key in 93 * the tree so that descents work. Leaf page searches must find the smallest 94 * record greater than key so that the returned index is the record's correct 95 * position for insertion. 96 * 97 * The flags parameter to the search routines describes three aspects of the 98 * search: the type of locking required (including if we're locking a pair of 99 * pages), the item to return in the presence of duplicates and whether or not 100 * to return deleted entries. To simplify both the mnemonic representation 101 * and the code that checks for various cases, we construct a set of bitmasks. 102 */ 103#define SR_READ 0x00001 /* Read locks. */ 104#define SR_WRITE 0x00002 /* Write locks. */ 105 106#define SR_APPEND 0x00040 /* Append to the tree. */ 107#define SR_DELNO 0x00080 /* Don't return deleted items. */ 108#define SR_DUPFIRST 0x00100 /* Return first duplicate. */ 109#define SR_DUPLAST 0x00200 /* Return last duplicate. */ 110#define SR_EXACT 0x00400 /* Exact items only. */ 111#define SR_PARENT 0x00800 /* Lock page pair. */ 112#define SR_STACK 0x01000 /* Need a complete stack. */ 113#define SR_PAST_EOF 0x02000 /* If doing insert search (or keyfirst 114 * or keylast operations), or a split 115 * on behalf of an insert, it's okay to 116 * return an entry one past end-of-page. 117 */ 118#define SR_STK_ONLY 0x04000 /* Just return info in the stack */ 119#define SR_MAX 0x08000 /* Get the right most key */ 120#define SR_MIN 0x10000 /* Get the left most key */ 121#define SR_NEXT 0x20000 /* Get the page after this key */ 122#define SR_DEL 0x40000 /* Get the tree to delete this key. */ 123#define SR_START 0x80000 /* Level to start stack. */ 124 125#define SR_DELETE \ 126 (SR_WRITE | SR_DUPFIRST | SR_DELNO | SR_EXACT | SR_STACK) 127#define SR_FIND (SR_READ | SR_DUPFIRST | SR_DELNO) 128#define SR_FIND_WR (SR_WRITE | SR_DUPFIRST | SR_DELNO) 129#define SR_INSERT (SR_WRITE | SR_DUPLAST | SR_PAST_EOF | SR_STACK) 130#define SR_KEYFIRST (SR_WRITE | SR_DUPFIRST | SR_PAST_EOF | SR_STACK) 131#define SR_KEYLAST (SR_WRITE | SR_DUPLAST | SR_PAST_EOF | SR_STACK) 132#define SR_WRPAIR (SR_WRITE | SR_DUPLAST | SR_PAST_EOF | SR_PARENT) 133 134/* 135 * Various routines pass around page references. A page reference is 136 * a pointer to the page, and the indx indicates an item on the page. 137 * Each page reference may include a lock. 138 */ 139struct __epg { 140 PAGE *page; /* The page. */ 141 db_indx_t indx; /* The index on the page. */ 142 db_indx_t entries; /* The number of entries on page */ 143 DB_LOCK lock; /* The page's lock. */ 144 db_lockmode_t lock_mode; /* The lock mode. */ 145}; 146 147/* 148 * We maintain a stack of the pages that we're locking in the tree. Grow 149 * the stack as necessary. 150 * 151 * XXX 152 * Temporary fix for #3243 -- clear the page and lock from the stack entry. 153 * The correct fix is to never release a stack that doesn't hold items. 154 */ 155#define BT_STK_CLR(c) do { \ 156 (c)->csp = (c)->sp; \ 157 (c)->csp->page = NULL; \ 158 LOCK_INIT((c)->csp->lock); \ 159} while (0) 160 161#define BT_STK_ENTER(env, c, pagep, page_indx, l, mode, ret) do { \ 162 if ((ret = ((c)->csp == (c)->esp ? \ 163 __bam_stkgrow(env, c) : 0)) == 0) { \ 164 (c)->csp->page = pagep; \ 165 (c)->csp->indx = (page_indx); \ 166 (c)->csp->entries = NUM_ENT(pagep); \ 167 (c)->csp->lock = l; \ 168 (c)->csp->lock_mode = mode; \ 169 } \ 170} while (0) 171 172#define BT_STK_PUSH(env, c, pagep, page_indx, lock, mode, ret) do { \ 173 BT_STK_ENTER(env, c, pagep, page_indx, lock, mode, ret); \ 174 ++(c)->csp; \ 175} while (0) 176 177#define BT_STK_NUM(env, c, pagep, page_indx, ret) do { \ 178 if ((ret = ((c)->csp == \ 179 (c)->esp ? __bam_stkgrow(env, c) : 0)) == 0) { \ 180 (c)->csp->page = NULL; \ 181 (c)->csp->indx = (page_indx); \ 182 (c)->csp->entries = NUM_ENT(pagep); \ 183 LOCK_INIT((c)->csp->lock); \ 184 (c)->csp->lock_mode = DB_LOCK_NG; \ 185 } \ 186} while (0) 187 188#define BT_STK_NUMPUSH(env, c, pagep, page_indx, ret) do { \ 189 BT_STK_NUM(env, cp, pagep, page_indx, ret); \ 190 ++(c)->csp; \ 191} while (0) 192 193#define BT_STK_POP(c) \ 194 ((c)->csp == (c)->sp ? NULL : --(c)->csp) 195 196/* Btree/Recno cursor. */ 197struct __cursor { 198 /* struct __dbc_internal */ 199 __DBC_INTERNAL 200 201 /* btree private part */ 202 EPG *sp; /* Stack pointer. */ 203 EPG *csp; /* Current stack entry. */ 204 EPG *esp; /* End stack pointer. */ 205 EPG stack[5]; 206 207 db_indx_t ovflsize; /* Maximum key/data on-page size. */ 208 209 db_recno_t recno; /* Current record number. */ 210 u_int32_t order; /* Relative order among deleted curs. */ 211 212 /* 213 * Btree: 214 * We set a flag in the cursor structure if the underlying object has 215 * been deleted. It's not strictly necessary, we could get the same 216 * information by looking at the page itself, but this method doesn't 217 * require us to retrieve the page on cursor delete. 218 * 219 * Recno: 220 * When renumbering recno databases during deletes, cursors referencing 221 * "deleted" records end up positioned between two records, and so must 222 * be specially adjusted on the next operation. 223 */ 224#define C_DELETED 0x0001 /* Record was deleted. */ 225 /* 226 * There are three tree types that require maintaining record numbers. 227 * Recno AM trees, Btree AM trees for which the DB_RECNUM flag was set, 228 * and Btree off-page duplicate trees. 229 */ 230#define C_RECNUM 0x0002 /* Tree requires record counts. */ 231 /* 232 * Recno trees have immutable record numbers by default, but optionally 233 * support mutable record numbers. Off-page duplicate Recno trees have 234 * mutable record numbers. All Btrees with record numbers (including 235 * off-page duplicate trees) are mutable by design, no flag is needed. 236 */ 237#define C_RENUMBER 0x0004 /* Tree records are mutable. */ 238 u_int32_t flags; 239}; 240 241/* 242 * Threshhold value, as a function of bt_minkey, of the number of 243 * bytes a key/data pair can use before being placed on an overflow 244 * page. Assume every item requires the maximum alignment for 245 * padding, out of sheer paranoia. 246 */ 247#define B_MINKEY_TO_OVFLSIZE(dbp, minkey, pgsize) \ 248 ((u_int16_t)(((pgsize) - P_OVERHEAD(dbp)) / ((minkey) * P_INDX) -\ 249 (BKEYDATA_PSIZE(0) + DB_ALIGN(1, sizeof(int32_t))))) 250 251/* 252 * The maximum space that a single item can ever take up on one page. 253 * Used by __bam_split to determine whether a split is still necessary. 254 */ 255#define B_MAX(a,b) (((a) > (b)) ? (a) : (b)) 256#define B_MAXSIZEONPAGE(ovflsize) \ 257 (B_MAX(BOVERFLOW_PSIZE, BKEYDATA_PSIZE(ovflsize))) 258 259/* 260 * The in-memory, per-tree btree/recno data structure. 261 */ 262struct __btree { /* Btree access method. */ 263 /* 264 * !!! 265 * These fields are write-once (when the structure is created) and 266 * so are ignored as far as multi-threading is concerned. 267 */ 268 db_pgno_t bt_meta; /* Database meta-data page. */ 269 db_pgno_t bt_root; /* Database root page. */ 270 271 u_int32_t bt_minkey; /* Minimum keys per page. */ 272 273 /* Btree comparison function. */ 274 int (*bt_compare) __P((DB *, const DBT *, const DBT *)); 275 /* Btree prefix function. */ 276 size_t (*bt_prefix) __P((DB *, const DBT *, const DBT *)); 277 278 /* Recno access method. */ 279 int re_pad; /* Fixed-length padding byte. */ 280 int re_delim; /* Variable-length delimiting byte. */ 281 u_int32_t re_len; /* Length for fixed-length records. */ 282 char *re_source; /* Source file name. */ 283 284 /* 285 * !!! 286 * The bt_lpgno field is NOT protected by any mutex, and for this 287 * reason must be advisory only, so, while it is read/written by 288 * multiple threads, DB is completely indifferent to the quality 289 * of its information. 290 */ 291 db_pgno_t bt_lpgno; /* Last insert location. */ 292 DB_LSN bt_llsn; /* Last insert LSN. */ 293 294 /* 295 * !!! 296 * The re_modified field is NOT protected by any mutex, and for this 297 * reason cannot be anything more complicated than a zero/non-zero 298 * value. The actual writing of the backing source file cannot be 299 * threaded, so clearing the flag isn't a problem. 300 */ 301 int re_modified; /* If the tree was modified. */ 302 303 /* 304 * !!! 305 * These fields are ignored as far as multi-threading is concerned. 306 * There are no transaction semantics associated with backing files, 307 * nor is there any thread protection. 308 */ 309 FILE *re_fp; /* Source file handle. */ 310 int re_eof; /* Backing source file EOF reached. */ 311 db_recno_t re_last; /* Last record number read. */ 312 313}; 314 315/* 316 * Modes for the __bam_curadj recovery records (btree_curadj). 317 * These appear in log records, so we wire the values and 318 * do not leave it up to the compiler. 319 */ 320typedef enum { 321 DB_CA_DI = 1, 322 DB_CA_DUP = 2, 323 DB_CA_RSPLIT = 3, 324 DB_CA_SPLIT = 4 325} db_ca_mode; 326 327/* 328 * Flags for __bam_pinsert. 329 */ 330#define BPI_SPACEONLY 0x01 /* Only check for space to update. */ 331#define BPI_NORECNUM 0x02 /* Not update the recnum on the left. */ 332 333#if defined(__cplusplus) 334} 335#endif 336 337#include "dbinc_auto/btree_auto.h" 338#include "dbinc_auto/btree_ext.h" 339#include "dbinc/db_am.h" 340#endif /* !_DB_BTREE_H_ */ 341