1/*- 2 * See the file LICENSE for redistribution information. 3 * 4 * Copyright (c) 1996,2008 Oracle. All rights reserved. 5 * 6 * $Id: db_page.h,v 12.15 2008/01/10 17:44:45 bostic Exp $ 7 */ 8 9#ifndef _DB_PAGE_H_ 10#define _DB_PAGE_H_ 11 12#if defined(__cplusplus) 13extern "C" { 14#endif 15 16/* 17 * DB page formats. 18 * 19 * !!! 20 * This implementation requires that values within the following structures 21 * NOT be padded -- note, ANSI C permits random padding within structures. 22 * If your compiler pads randomly you can just forget ever making DB run on 23 * your system. In addition, no data type can require larger alignment than 24 * its own size, e.g., a 4-byte data element may not require 8-byte alignment. 25 * 26 * Note that key/data lengths are often stored in db_indx_t's -- this is 27 * not accidental, nor does it limit the key/data size. If the key/data 28 * item fits on a page, it's guaranteed to be small enough to fit into a 29 * db_indx_t, and storing it in one saves space. 30 */ 31 32#define PGNO_INVALID 0 /* Invalid page number in any database. */ 33#define PGNO_BASE_MD 0 /* Base database: metadata page number. */ 34 35/* Page types. */ 36#define P_INVALID 0 /* Invalid page type. */ 37#define __P_DUPLICATE 1 /* Duplicate. DEPRECATED in 3.1 */ 38#define P_HASH_UNSORTED 2 /* Hash pages created pre 4.6. DEPRECATED */ 39#define P_IBTREE 3 /* Btree internal. */ 40#define P_IRECNO 4 /* Recno internal. */ 41#define P_LBTREE 5 /* Btree leaf. */ 42#define P_LRECNO 6 /* Recno leaf. */ 43#define P_OVERFLOW 7 /* Overflow. */ 44#define P_HASHMETA 8 /* Hash metadata page. */ 45#define P_BTREEMETA 9 /* Btree metadata page. */ 46#define P_QAMMETA 10 /* Queue metadata page. */ 47#define P_QAMDATA 11 /* Queue data page. */ 48#define P_LDUP 12 /* Off-page duplicate leaf. */ 49#define P_HASH 13 /* Sorted hash page. */ 50#define P_PAGETYPE_MAX 14 51/* Flag to __db_new */ 52#define P_DONTEXTEND 0x8000 /* Don't allocate if there are no free pages. */ 53 54/* 55 * When we create pages in mpool, we ask mpool to clear some number of bytes 56 * in the header. This number must be at least as big as the regular page 57 * headers and cover enough of the btree and hash meta-data pages to obliterate 58 * the page type. 59 */ 60#define DB_PAGE_DB_LEN 32 61#define DB_PAGE_QUEUE_LEN 0 62 63/************************************************************************ 64 GENERIC METADATA PAGE HEADER 65 * 66 * !!! 67 * The magic and version numbers have to be in the same place in all versions 68 * of the metadata page as the application may not have upgraded the database. 69 ************************************************************************/ 70typedef struct _dbmeta33 { 71 DB_LSN lsn; /* 00-07: LSN. */ 72 db_pgno_t pgno; /* 08-11: Current page number. */ 73 u_int32_t magic; /* 12-15: Magic number. */ 74 u_int32_t version; /* 16-19: Version. */ 75 u_int32_t pagesize; /* 20-23: Pagesize. */ 76 u_int8_t encrypt_alg; /* 24: Encryption algorithm. */ 77 u_int8_t type; /* 25: Page type. */ 78#define DBMETA_CHKSUM 0x01 79 u_int8_t metaflags; /* 26: Meta-only flags */ 80 u_int8_t unused1; /* 27: Unused. */ 81 u_int32_t free; /* 28-31: Free list page number. */ 82 db_pgno_t last_pgno; /* 32-35: Page number of last page in db. */ 83 u_int32_t unused3; /* 36-39: Unused. */ 84 u_int32_t key_count; /* 40-43: Cached key count. */ 85 u_int32_t record_count; /* 44-47: Cached record count. */ 86 u_int32_t flags; /* 48-51: Flags: unique to each AM. */ 87 /* 52-71: Unique file ID. */ 88 u_int8_t uid[DB_FILE_ID_LEN]; 89} DBMETA33, DBMETA; 90 91/************************************************************************ 92 BTREE METADATA PAGE LAYOUT 93 ************************************************************************/ 94typedef struct _btmeta33 { 95#define BTM_DUP 0x001 /* Duplicates. */ 96#define BTM_RECNO 0x002 /* Recno tree. */ 97#define BTM_RECNUM 0x004 /* Btree: maintain record count. */ 98#define BTM_FIXEDLEN 0x008 /* Recno: fixed length records. */ 99#define BTM_RENUMBER 0x010 /* Recno: renumber on insert/delete. */ 100#define BTM_SUBDB 0x020 /* Subdatabases. */ 101#define BTM_DUPSORT 0x040 /* Duplicates are sorted. */ 102#define BTM_MASK 0x07f 103 DBMETA dbmeta; /* 00-71: Generic meta-data header. */ 104 105 u_int32_t unused1; /* 72-75: Unused space. */ 106 u_int32_t minkey; /* 76-79: Btree: Minkey. */ 107 u_int32_t re_len; /* 80-83: Recno: fixed-length record length. */ 108 u_int32_t re_pad; /* 84-87: Recno: fixed-length record pad. */ 109 u_int32_t root; /* 88-91: Root page. */ 110 u_int32_t unused2[92]; /* 92-459: Unused space. */ 111 u_int32_t crypto_magic; /* 460-463: Crypto magic number */ 112 u_int32_t trash[3]; /* 464-475: Trash space - Do not use */ 113 u_int8_t iv[DB_IV_BYTES]; /* 476-495: Crypto IV */ 114 u_int8_t chksum[DB_MAC_KEY]; /* 496-511: Page chksum */ 115 116 /* 117 * Minimum page size is 512. 118 */ 119} BTMETA33, BTMETA; 120 121/************************************************************************ 122 HASH METADATA PAGE LAYOUT 123 ************************************************************************/ 124typedef struct _hashmeta33 { 125#define DB_HASH_DUP 0x01 /* Duplicates. */ 126#define DB_HASH_SUBDB 0x02 /* Subdatabases. */ 127#define DB_HASH_DUPSORT 0x04 /* Duplicates are sorted. */ 128 DBMETA dbmeta; /* 00-71: Generic meta-data page header. */ 129 130 u_int32_t max_bucket; /* 72-75: ID of Maximum bucket in use */ 131 u_int32_t high_mask; /* 76-79: Modulo mask into table */ 132 u_int32_t low_mask; /* 80-83: Modulo mask into table lower half */ 133 u_int32_t ffactor; /* 84-87: Fill factor */ 134 u_int32_t nelem; /* 88-91: Number of keys in hash table */ 135 u_int32_t h_charkey; /* 92-95: Value of hash(CHARKEY) */ 136#define NCACHED 32 /* number of spare points */ 137 /* 96-223: Spare pages for overflow */ 138 u_int32_t spares[NCACHED]; 139 u_int32_t unused[59]; /* 224-459: Unused space */ 140 u_int32_t crypto_magic; /* 460-463: Crypto magic number */ 141 u_int32_t trash[3]; /* 464-475: Trash space - Do not use */ 142 u_int8_t iv[DB_IV_BYTES]; /* 476-495: Crypto IV */ 143 u_int8_t chksum[DB_MAC_KEY]; /* 496-511: Page chksum */ 144 145 /* 146 * Minimum page size is 512. 147 */ 148} HMETA33, HMETA; 149 150/************************************************************************ 151 QUEUE METADATA PAGE LAYOUT 152 ************************************************************************/ 153/* 154 * QAM Meta data page structure 155 * 156 */ 157typedef struct _qmeta33 { 158 DBMETA dbmeta; /* 00-71: Generic meta-data header. */ 159 160 u_int32_t first_recno; /* 72-75: First not deleted record. */ 161 u_int32_t cur_recno; /* 76-79: Next recno to be allocated. */ 162 u_int32_t re_len; /* 80-83: Fixed-length record length. */ 163 u_int32_t re_pad; /* 84-87: Fixed-length record pad. */ 164 u_int32_t rec_page; /* 88-91: Records Per Page. */ 165 u_int32_t page_ext; /* 92-95: Pages per extent */ 166 167 u_int32_t unused[91]; /* 96-459: Unused space */ 168 u_int32_t crypto_magic; /* 460-463: Crypto magic number */ 169 u_int32_t trash[3]; /* 464-475: Trash space - Do not use */ 170 u_int8_t iv[DB_IV_BYTES]; /* 476-495: Crypto IV */ 171 u_int8_t chksum[DB_MAC_KEY]; /* 496-511: Page chksum */ 172 /* 173 * Minimum page size is 512. 174 */ 175} QMETA33, QMETA; 176 177/* 178 * DBMETASIZE is a constant used by __db_file_setup and DB->verify 179 * as a buffer which is guaranteed to be larger than any possible 180 * metadata page size and smaller than any disk sector. 181 */ 182#define DBMETASIZE 512 183 184/************************************************************************ 185 BTREE/HASH MAIN PAGE LAYOUT 186 ************************************************************************/ 187/* 188 * +-----------------------------------+ 189 * | lsn | pgno | prev pgno | 190 * +-----------------------------------+ 191 * | next pgno | entries | hf offset | 192 * +-----------------------------------+ 193 * | level | type | chksum | 194 * +-----------------------------------+ 195 * | iv | index | free --> | 196 * +-----------+-----------------------+ 197 * | F R E E A R E A | 198 * +-----------------------------------+ 199 * | <-- free | item | 200 * +-----------------------------------+ 201 * | item | item | item | 202 * +-----------------------------------+ 203 * 204 * sizeof(PAGE) == 26 bytes + possibly 20 bytes of checksum and possibly 205 * 16 bytes of IV (+ 2 bytes for alignment), and the following indices 206 * are guaranteed to be two-byte aligned. If we aren't doing crypto or 207 * checksumming the bytes are reclaimed for data storage. 208 * 209 * For hash and btree leaf pages, index items are paired, e.g., inp[0] is the 210 * key for inp[1]'s data. All other types of pages only contain single items. 211 */ 212typedef struct __pg_chksum { 213 u_int8_t unused[2]; /* 26-27: For alignment */ 214 u_int8_t chksum[4]; /* 28-31: Checksum */ 215} PG_CHKSUM; 216 217typedef struct __pg_crypto { 218 u_int8_t unused[2]; /* 26-27: For alignment */ 219 u_int8_t chksum[DB_MAC_KEY]; /* 28-47: Checksum */ 220 u_int8_t iv[DB_IV_BYTES]; /* 48-63: IV */ 221 /* !!! 222 * Must be 16-byte aligned for crypto 223 */ 224} PG_CRYPTO; 225 226typedef struct _db_page { 227 DB_LSN lsn; /* 00-07: Log sequence number. */ 228 db_pgno_t pgno; /* 08-11: Current page number. */ 229 db_pgno_t prev_pgno; /* 12-15: Previous page number. */ 230 db_pgno_t next_pgno; /* 16-19: Next page number. */ 231 db_indx_t entries; /* 20-21: Number of items on the page. */ 232 db_indx_t hf_offset; /* 22-23: High free byte page offset. */ 233 234 /* 235 * The btree levels are numbered from the leaf to the root, starting 236 * with 1, so the leaf is level 1, its parent is level 2, and so on. 237 * We maintain this level on all btree pages, but the only place that 238 * we actually need it is on the root page. It would not be difficult 239 * to hide the byte on the root page once it becomes an internal page, 240 * so we could get this byte back if we needed it for something else. 241 */ 242#define LEAFLEVEL 1 243#define MAXBTREELEVEL 255 244 u_int8_t level; /* 24: Btree tree level. */ 245 u_int8_t type; /* 25: Page type. */ 246} PAGE; 247 248/* 249 * With many compilers sizeof(PAGE) == 28, while SIZEOF_PAGE == 26. 250 * We add in other things directly after the page header and need 251 * the SIZEOF_PAGE. When giving the sizeof(), many compilers will 252 * pad it out to the next 4-byte boundary. 253 */ 254#define SIZEOF_PAGE 26 255/* 256 * !!! 257 * DB_AM_ENCRYPT always implies DB_AM_CHKSUM so that must come first. 258 */ 259#define P_INP(dbp, pg) \ 260 ((db_indx_t *)((u_int8_t *)(pg) + SIZEOF_PAGE + \ 261 (F_ISSET((dbp), DB_AM_ENCRYPT) ? sizeof(PG_CRYPTO) : \ 262 (F_ISSET((dbp), DB_AM_CHKSUM) ? sizeof(PG_CHKSUM) : 0)))) 263 264#define P_IV(dbp, pg) \ 265 (F_ISSET((dbp), DB_AM_ENCRYPT) ? ((u_int8_t *)(pg) + \ 266 SIZEOF_PAGE + SSZA(PG_CRYPTO, iv)) \ 267 : NULL) 268 269#define P_CHKSUM(dbp, pg) \ 270 (F_ISSET((dbp), DB_AM_ENCRYPT) ? ((u_int8_t *)(pg) + \ 271 SIZEOF_PAGE + SSZA(PG_CRYPTO, chksum)) : \ 272 (F_ISSET((dbp), DB_AM_CHKSUM) ? ((u_int8_t *)(pg) + \ 273 SIZEOF_PAGE + SSZA(PG_CHKSUM, chksum)) \ 274 : NULL)) 275 276/* PAGE element macros. */ 277#define LSN(p) (((PAGE *)p)->lsn) 278#define PGNO(p) (((PAGE *)p)->pgno) 279#define PREV_PGNO(p) (((PAGE *)p)->prev_pgno) 280#define NEXT_PGNO(p) (((PAGE *)p)->next_pgno) 281#define NUM_ENT(p) (((PAGE *)p)->entries) 282#define HOFFSET(p) (((PAGE *)p)->hf_offset) 283#define LEVEL(p) (((PAGE *)p)->level) 284#define TYPE(p) (((PAGE *)p)->type) 285 286/************************************************************************ 287 QUEUE MAIN PAGE LAYOUT 288 ************************************************************************/ 289/* 290 * Sizes of page below. Used to reclaim space if not doing 291 * crypto or checksumming. If you change the QPAGE below you 292 * MUST adjust this too. 293 */ 294#define QPAGE_NORMAL 28 295#define QPAGE_CHKSUM 48 296#define QPAGE_SEC 64 297 298typedef struct _qpage { 299 DB_LSN lsn; /* 00-07: Log sequence number. */ 300 db_pgno_t pgno; /* 08-11: Current page number. */ 301 u_int32_t unused0[3]; /* 12-23: Unused. */ 302 u_int8_t unused1[1]; /* 24: Unused. */ 303 u_int8_t type; /* 25: Page type. */ 304 u_int8_t unused2[2]; /* 26-27: Unused. */ 305 u_int8_t chksum[DB_MAC_KEY]; /* 28-47: Checksum */ 306 u_int8_t iv[DB_IV_BYTES]; /* 48-63: IV */ 307} QPAGE; 308 309#define QPAGE_SZ(dbp) \ 310 (F_ISSET((dbp), DB_AM_ENCRYPT) ? QPAGE_SEC : \ 311 F_ISSET((dbp), DB_AM_CHKSUM) ? QPAGE_CHKSUM : QPAGE_NORMAL) 312/* 313 * !!! 314 * The next_pgno and prev_pgno fields are not maintained for btree and recno 315 * internal pages. Doing so only provides a minor performance improvement, 316 * it's hard to do when deleting internal pages, and it increases the chance 317 * of deadlock during deletes and splits because we have to re-link pages at 318 * more than the leaf level. 319 * 320 * !!! 321 * The btree/recno access method needs db_recno_t bytes of space on the root 322 * page to specify how many records are stored in the tree. (The alternative 323 * is to store the number of records in the meta-data page, which will create 324 * a second hot spot in trees being actively modified, or recalculate it from 325 * the BINTERNAL fields on each access.) Overload the PREV_PGNO field. 326 */ 327#define RE_NREC(p) \ 328 ((TYPE(p) == P_IBTREE || TYPE(p) == P_IRECNO) ? PREV_PGNO(p) : \ 329 (db_pgno_t)(TYPE(p) == P_LBTREE ? NUM_ENT(p) / 2 : NUM_ENT(p))) 330#define RE_NREC_ADJ(p, adj) \ 331 PREV_PGNO(p) += adj; 332#define RE_NREC_SET(p, num) \ 333 PREV_PGNO(p) = (num); 334 335/* 336 * Initialize a page. 337 * 338 * !!! 339 * Don't modify the page's LSN, code depends on it being unchanged after a 340 * P_INIT call. 341 */ 342#define P_INIT(pg, pg_size, n, pg_prev, pg_next, btl, pg_type) do { \ 343 PGNO(pg) = (n); \ 344 PREV_PGNO(pg) = (pg_prev); \ 345 NEXT_PGNO(pg) = (pg_next); \ 346 NUM_ENT(pg) = (0); \ 347 HOFFSET(pg) = (db_indx_t)(pg_size); \ 348 LEVEL(pg) = (btl); \ 349 TYPE(pg) = (pg_type); \ 350} while (0) 351 352/* Page header length (offset to first index). */ 353#define P_OVERHEAD(dbp) P_TO_UINT16(P_INP(dbp, 0)) 354 355/* First free byte. */ 356#define LOFFSET(dbp, pg) \ 357 (P_OVERHEAD(dbp) + NUM_ENT(pg) * sizeof(db_indx_t)) 358 359/* Free space on a regular page. */ 360#define P_FREESPACE(dbp, pg) (HOFFSET(pg) - LOFFSET(dbp, pg)) 361 362/* Get a pointer to the bytes at a specific index. */ 363#define P_ENTRY(dbp, pg, indx) ((u_int8_t *)pg + P_INP(dbp, pg)[indx]) 364 365/************************************************************************ 366 OVERFLOW PAGE LAYOUT 367 ************************************************************************/ 368 369/* 370 * Overflow items are referenced by HOFFPAGE and BOVERFLOW structures, which 371 * store a page number (the first page of the overflow item) and a length 372 * (the total length of the overflow item). The overflow item consists of 373 * some number of overflow pages, linked by the next_pgno field of the page. 374 * A next_pgno field of PGNO_INVALID flags the end of the overflow item. 375 * 376 * Overflow page overloads: 377 * The amount of overflow data stored on each page is stored in the 378 * hf_offset field. 379 * 380 * The implementation reference counts overflow items as it's possible 381 * for them to be promoted onto btree internal pages. The reference 382 * count is stored in the entries field. 383 */ 384#define OV_LEN(p) (((PAGE *)p)->hf_offset) 385#define OV_REF(p) (((PAGE *)p)->entries) 386 387/* Maximum number of bytes that you can put on an overflow page. */ 388#define P_MAXSPACE(dbp, psize) ((psize) - P_OVERHEAD(dbp)) 389 390/* Free space on an overflow page. */ 391#define P_OVFLSPACE(dbp, psize, pg) (P_MAXSPACE(dbp, psize) - HOFFSET(pg)) 392 393/************************************************************************ 394 HASH PAGE LAYOUT 395 ************************************************************************/ 396 397/* Each index references a group of bytes on the page. */ 398#define H_KEYDATA 1 /* Key/data item. */ 399#define H_DUPLICATE 2 /* Duplicate key/data item. */ 400#define H_OFFPAGE 3 /* Overflow key/data item. */ 401#define H_OFFDUP 4 /* Overflow page of duplicates. */ 402 403/* 404 * !!! 405 * Items on hash pages are (potentially) unaligned, so we can never cast the 406 * (page + offset) pointer to an HKEYDATA, HOFFPAGE or HOFFDUP structure, as 407 * we do with B+tree on-page structures. Because we frequently want the type 408 * field, it requires no alignment, and it's in the same location in all three 409 * structures, there's a pair of macros. 410 */ 411#define HPAGE_PTYPE(p) (*(u_int8_t *)p) 412#define HPAGE_TYPE(dbp, pg, indx) (*P_ENTRY(dbp, pg, indx)) 413 414/* 415 * The first and second types are H_KEYDATA and H_DUPLICATE, represented 416 * by the HKEYDATA structure: 417 * 418 * +-----------------------------------+ 419 * | type | key/data ... | 420 * +-----------------------------------+ 421 * 422 * For duplicates, the data field encodes duplicate elements in the data 423 * field: 424 * 425 * +---------------------------------------------------------------+ 426 * | type | len1 | element1 | len1 | len2 | element2 | len2 | 427 * +---------------------------------------------------------------+ 428 * 429 * Thus, by keeping track of the offset in the element, we can do both 430 * backward and forward traversal. 431 */ 432typedef struct _hkeydata { 433 u_int8_t type; /* 00: Page type. */ 434 u_int8_t data[1]; /* Variable length key/data item. */ 435} HKEYDATA; 436#define HKEYDATA_DATA(p) (((u_int8_t *)p) + SSZA(HKEYDATA, data)) 437 438/* 439 * The length of any HKEYDATA item. Note that indx is an element index, 440 * not a PAIR index. 441 */ 442#define LEN_HITEM(dbp, pg, pgsize, indx) \ 443 (((indx) == 0 ? (pgsize) : \ 444 (P_INP(dbp, pg)[(indx) - 1])) - (P_INP(dbp, pg)[indx])) 445 446#define LEN_HKEYDATA(dbp, pg, psize, indx) \ 447 (db_indx_t)(LEN_HITEM(dbp, pg, psize, indx) - HKEYDATA_SIZE(0)) 448 449/* 450 * Page space required to add a new HKEYDATA item to the page, with and 451 * without the index value. 452 */ 453#define HKEYDATA_SIZE(len) \ 454 ((len) + SSZA(HKEYDATA, data)) 455#define HKEYDATA_PSIZE(len) \ 456 (HKEYDATA_SIZE(len) + sizeof(db_indx_t)) 457 458/* Put a HKEYDATA item at the location referenced by a page entry. */ 459#define PUT_HKEYDATA(pe, kd, len, etype) { \ 460 ((HKEYDATA *)(pe))->type = etype; \ 461 memcpy((u_int8_t *)(pe) + sizeof(u_int8_t), kd, len); \ 462} 463 464/* 465 * Macros the describe the page layout in terms of key-data pairs. 466 */ 467#define H_NUMPAIRS(pg) (NUM_ENT(pg) / 2) 468#define H_KEYINDEX(indx) (indx) 469#define H_DATAINDEX(indx) ((indx) + 1) 470#define H_PAIRKEY(dbp, pg, indx) P_ENTRY(dbp, pg, H_KEYINDEX(indx)) 471#define H_PAIRDATA(dbp, pg, indx) P_ENTRY(dbp, pg, H_DATAINDEX(indx)) 472#define H_PAIRSIZE(dbp, pg, psize, indx) \ 473 (LEN_HITEM(dbp, pg, psize, H_KEYINDEX(indx)) + \ 474 LEN_HITEM(dbp, pg, psize, H_DATAINDEX(indx))) 475#define LEN_HDATA(dbp, p, psize, indx) \ 476 LEN_HKEYDATA(dbp, p, psize, H_DATAINDEX(indx)) 477#define LEN_HKEY(dbp, p, psize, indx) \ 478 LEN_HKEYDATA(dbp, p, psize, H_KEYINDEX(indx)) 479 480/* 481 * The third type is the H_OFFPAGE, represented by the HOFFPAGE structure: 482 */ 483typedef struct _hoffpage { 484 u_int8_t type; /* 00: Page type and delete flag. */ 485 u_int8_t unused[3]; /* 01-03: Padding, unused. */ 486 db_pgno_t pgno; /* 04-07: Offpage page number. */ 487 u_int32_t tlen; /* 08-11: Total length of item. */ 488} HOFFPAGE; 489 490#define HOFFPAGE_PGNO(p) (((u_int8_t *)p) + SSZ(HOFFPAGE, pgno)) 491#define HOFFPAGE_TLEN(p) (((u_int8_t *)p) + SSZ(HOFFPAGE, tlen)) 492 493/* 494 * Page space required to add a new HOFFPAGE item to the page, with and 495 * without the index value. 496 */ 497#define HOFFPAGE_SIZE (sizeof(HOFFPAGE)) 498#define HOFFPAGE_PSIZE (HOFFPAGE_SIZE + sizeof(db_indx_t)) 499 500/* 501 * The fourth type is H_OFFDUP represented by the HOFFDUP structure: 502 */ 503typedef struct _hoffdup { 504 u_int8_t type; /* 00: Page type and delete flag. */ 505 u_int8_t unused[3]; /* 01-03: Padding, unused. */ 506 db_pgno_t pgno; /* 04-07: Offpage page number. */ 507} HOFFDUP; 508#define HOFFDUP_PGNO(p) (((u_int8_t *)p) + SSZ(HOFFDUP, pgno)) 509 510/* 511 * Page space required to add a new HOFFDUP item to the page, with and 512 * without the index value. 513 */ 514#define HOFFDUP_SIZE (sizeof(HOFFDUP)) 515 516/************************************************************************ 517 BTREE PAGE LAYOUT 518 ************************************************************************/ 519 520/* Each index references a group of bytes on the page. */ 521#define B_KEYDATA 1 /* Key/data item. */ 522#define B_DUPLICATE 2 /* Duplicate key/data item. */ 523#define B_OVERFLOW 3 /* Overflow key/data item. */ 524 525/* 526 * We have to store a deleted entry flag in the page. The reason is complex, 527 * but the simple version is that we can't delete on-page items referenced by 528 * a cursor -- the return order of subsequent insertions might be wrong. The 529 * delete flag is an overload of the top bit of the type byte. 530 */ 531#define B_DELETE (0x80) 532#define B_DCLR(t) (t) &= ~B_DELETE 533#define B_DSET(t) (t) |= B_DELETE 534#define B_DISSET(t) ((t) & B_DELETE) 535 536#define B_TYPE(t) ((t) & ~B_DELETE) 537#define B_TSET(t, type) ((t) = B_TYPE(type)) 538#define B_TSET_DELETED(t, type) ((t) = (type) | B_DELETE) 539 540/* 541 * The first type is B_KEYDATA, represented by the BKEYDATA structure: 542 */ 543typedef struct _bkeydata { 544 db_indx_t len; /* 00-01: Key/data item length. */ 545 u_int8_t type; /* 02: Page type AND DELETE FLAG. */ 546 u_int8_t data[1]; /* Variable length key/data item. */ 547} BKEYDATA; 548 549/* Get a BKEYDATA item for a specific index. */ 550#define GET_BKEYDATA(dbp, pg, indx) \ 551 ((BKEYDATA *)P_ENTRY(dbp, pg, indx)) 552 553/* 554 * Page space required to add a new BKEYDATA item to the page, with and 555 * without the index value. The (u_int16_t) cast avoids warnings: DB_ALIGN 556 * casts to uintmax_t, the cast converts it to a small integral type so we 557 * don't get complaints when we assign the final result to an integral type 558 * smaller than uintmax_t. 559 */ 560#define BKEYDATA_SIZE(len) \ 561 (u_int16_t)DB_ALIGN((len) + SSZA(BKEYDATA, data), sizeof(u_int32_t)) 562#define BKEYDATA_PSIZE(len) \ 563 (BKEYDATA_SIZE(len) + sizeof(db_indx_t)) 564 565/* 566 * The second and third types are B_DUPLICATE and B_OVERFLOW, represented 567 * by the BOVERFLOW structure. 568 */ 569typedef struct _boverflow { 570 db_indx_t unused1; /* 00-01: Padding, unused. */ 571 u_int8_t type; /* 02: Page type AND DELETE FLAG. */ 572 u_int8_t unused2; /* 03: Padding, unused. */ 573 db_pgno_t pgno; /* 04-07: Next page number. */ 574 u_int32_t tlen; /* 08-11: Total length of item. */ 575} BOVERFLOW; 576 577/* Get a BOVERFLOW item for a specific index. */ 578#define GET_BOVERFLOW(dbp, pg, indx) \ 579 ((BOVERFLOW *)P_ENTRY(dbp, pg, indx)) 580 581/* 582 * Page space required to add a new BOVERFLOW item to the page, with and 583 * without the index value. 584 */ 585#define BOVERFLOW_SIZE \ 586 ((u_int16_t)DB_ALIGN(sizeof(BOVERFLOW), sizeof(u_int32_t))) 587#define BOVERFLOW_PSIZE \ 588 (BOVERFLOW_SIZE + sizeof(db_indx_t)) 589 590#define BITEM_SIZE(bk) \ 591 (B_TYPE((bk)->type) != B_KEYDATA ? BOVERFLOW_SIZE : \ 592 BKEYDATA_SIZE((bk)->len)) 593 594#define BITEM_PSIZE(bk) \ 595 (B_TYPE((bk)->type) != B_KEYDATA ? BOVERFLOW_PSIZE : \ 596 BKEYDATA_PSIZE((bk)->len)) 597 598/* 599 * Btree leaf and hash page layouts group indices in sets of two, one for the 600 * key and one for the data. Everything else does it in sets of one to save 601 * space. Use the following macros so that it's real obvious what's going on. 602 */ 603#define O_INDX 1 604#define P_INDX 2 605 606/************************************************************************ 607 BTREE INTERNAL PAGE LAYOUT 608 ************************************************************************/ 609 610/* 611 * Btree internal entry. 612 */ 613typedef struct _binternal { 614 db_indx_t len; /* 00-01: Key/data item length. */ 615 u_int8_t type; /* 02: Page type AND DELETE FLAG. */ 616 u_int8_t unused; /* 03: Padding, unused. */ 617 db_pgno_t pgno; /* 04-07: Page number of referenced page. */ 618 db_recno_t nrecs; /* 08-11: Subtree record count. */ 619 u_int8_t data[1]; /* Variable length key item. */ 620} BINTERNAL; 621 622/* Get a BINTERNAL item for a specific index. */ 623#define GET_BINTERNAL(dbp, pg, indx) \ 624 ((BINTERNAL *)P_ENTRY(dbp, pg, indx)) 625 626/* 627 * Page space required to add a new BINTERNAL item to the page, with and 628 * without the index value. 629 */ 630#define BINTERNAL_SIZE(len) \ 631 (u_int16_t)DB_ALIGN((len) + SSZA(BINTERNAL, data), sizeof(u_int32_t)) 632#define BINTERNAL_PSIZE(len) \ 633 (BINTERNAL_SIZE(len) + sizeof(db_indx_t)) 634 635/************************************************************************ 636 RECNO INTERNAL PAGE LAYOUT 637 ************************************************************************/ 638 639/* 640 * The recno internal entry. 641 */ 642typedef struct _rinternal { 643 db_pgno_t pgno; /* 00-03: Page number of referenced page. */ 644 db_recno_t nrecs; /* 04-07: Subtree record count. */ 645} RINTERNAL; 646 647/* Get a RINTERNAL item for a specific index. */ 648#define GET_RINTERNAL(dbp, pg, indx) \ 649 ((RINTERNAL *)P_ENTRY(dbp, pg, indx)) 650 651/* 652 * Page space required to add a new RINTERNAL item to the page, with and 653 * without the index value. 654 */ 655#define RINTERNAL_SIZE \ 656 (u_int16_t)DB_ALIGN(sizeof(RINTERNAL), sizeof(u_int32_t)) 657#define RINTERNAL_PSIZE \ 658 (RINTERNAL_SIZE + sizeof(db_indx_t)) 659 660typedef struct __pglist { 661 db_pgno_t pgno; 662 DB_LSN lsn; 663} db_pglist_t; 664 665#if defined(__cplusplus) 666} 667#endif 668 669#endif /* !_DB_PAGE_H_ */ 670