1/*-
2 * See the file LICENSE for redistribution information.
3 *
4 * Copyright (c) 1996,2008 Oracle.  All rights reserved.
5 *
6 * $Id: db_page.h,v 12.15 2008/01/10 17:44:45 bostic Exp $
7 */
8
9#ifndef _DB_PAGE_H_
10#define	_DB_PAGE_H_
11
12#if defined(__cplusplus)
13extern "C" {
14#endif
15
16/*
17 * DB page formats.
18 *
19 * !!!
20 * This implementation requires that values within the following structures
21 * NOT be padded -- note, ANSI C permits random padding within structures.
22 * If your compiler pads randomly you can just forget ever making DB run on
23 * your system.  In addition, no data type can require larger alignment than
24 * its own size, e.g., a 4-byte data element may not require 8-byte alignment.
25 *
26 * Note that key/data lengths are often stored in db_indx_t's -- this is
27 * not accidental, nor does it limit the key/data size.  If the key/data
28 * item fits on a page, it's guaranteed to be small enough to fit into a
29 * db_indx_t, and storing it in one saves space.
30 */
31
32#define	PGNO_INVALID	0	/* Invalid page number in any database. */
33#define	PGNO_BASE_MD	0	/* Base database: metadata page number. */
34
35/* Page types. */
36#define	P_INVALID	0	/* Invalid page type. */
37#define	__P_DUPLICATE	1	/* Duplicate. DEPRECATED in 3.1 */
38#define	P_HASH_UNSORTED	2	/* Hash pages created pre 4.6. DEPRECATED */
39#define	P_IBTREE	3	/* Btree internal. */
40#define	P_IRECNO	4	/* Recno internal. */
41#define	P_LBTREE	5	/* Btree leaf. */
42#define	P_LRECNO	6	/* Recno leaf. */
43#define	P_OVERFLOW	7	/* Overflow. */
44#define	P_HASHMETA	8	/* Hash metadata page. */
45#define	P_BTREEMETA	9	/* Btree metadata page. */
46#define	P_QAMMETA	10	/* Queue metadata page. */
47#define	P_QAMDATA	11	/* Queue data page. */
48#define	P_LDUP		12	/* Off-page duplicate leaf. */
49#define	P_HASH		13	/* Sorted hash page. */
50#define	P_PAGETYPE_MAX	14
51/* Flag to __db_new */
52#define	P_DONTEXTEND	0x8000	/* Don't allocate if there are no free pages. */
53
54/*
55 * When we create pages in mpool, we ask mpool to clear some number of bytes
56 * in the header.  This number must be at least as big as the regular page
57 * headers and cover enough of the btree and hash meta-data pages to obliterate
58 * the page type.
59 */
60#define	DB_PAGE_DB_LEN		32
61#define	DB_PAGE_QUEUE_LEN	0
62
63/************************************************************************
64 GENERIC METADATA PAGE HEADER
65 *
66 * !!!
67 * The magic and version numbers have to be in the same place in all versions
68 * of the metadata page as the application may not have upgraded the database.
69 ************************************************************************/
70typedef struct _dbmeta33 {
71	DB_LSN	  lsn;		/* 00-07: LSN. */
72	db_pgno_t pgno;		/* 08-11: Current page number. */
73	u_int32_t magic;	/* 12-15: Magic number. */
74	u_int32_t version;	/* 16-19: Version. */
75	u_int32_t pagesize;	/* 20-23: Pagesize. */
76	u_int8_t  encrypt_alg;	/*    24: Encryption algorithm. */
77	u_int8_t  type;		/*    25: Page type. */
78#define	DBMETA_CHKSUM		0x01
79	u_int8_t  metaflags;	/* 26: Meta-only flags */
80	u_int8_t  unused1;	/* 27: Unused. */
81	u_int32_t free;		/* 28-31: Free list page number. */
82	db_pgno_t last_pgno;	/* 32-35: Page number of last page in db. */
83	u_int32_t unused3;	/* 36-39: Unused. */
84	u_int32_t key_count;	/* 40-43: Cached key count. */
85	u_int32_t record_count;	/* 44-47: Cached record count. */
86	u_int32_t flags;	/* 48-51: Flags: unique to each AM. */
87				/* 52-71: Unique file ID. */
88	u_int8_t  uid[DB_FILE_ID_LEN];
89} DBMETA33, DBMETA;
90
91/************************************************************************
92 BTREE METADATA PAGE LAYOUT
93 ************************************************************************/
94typedef struct _btmeta33 {
95#define	BTM_DUP		0x001	/*	  Duplicates. */
96#define	BTM_RECNO	0x002	/*	  Recno tree. */
97#define	BTM_RECNUM	0x004	/*	  Btree: maintain record count. */
98#define	BTM_FIXEDLEN	0x008	/*	  Recno: fixed length records. */
99#define	BTM_RENUMBER	0x010	/*	  Recno: renumber on insert/delete. */
100#define	BTM_SUBDB	0x020	/*	  Subdatabases. */
101#define	BTM_DUPSORT	0x040	/*	  Duplicates are sorted. */
102#define	BTM_MASK	0x07f
103	DBMETA	dbmeta;		/* 00-71: Generic meta-data header. */
104
105	u_int32_t unused1;	/* 72-75: Unused space. */
106	u_int32_t minkey;	/* 76-79: Btree: Minkey. */
107	u_int32_t re_len;	/* 80-83: Recno: fixed-length record length. */
108	u_int32_t re_pad;	/* 84-87: Recno: fixed-length record pad. */
109	u_int32_t root;		/* 88-91: Root page. */
110	u_int32_t unused2[92];	/* 92-459: Unused space. */
111	u_int32_t crypto_magic;		/* 460-463: Crypto magic number */
112	u_int32_t trash[3];		/* 464-475: Trash space - Do not use */
113	u_int8_t iv[DB_IV_BYTES];	/* 476-495: Crypto IV */
114	u_int8_t chksum[DB_MAC_KEY];	/* 496-511: Page chksum */
115
116	/*
117	 * Minimum page size is 512.
118	 */
119} BTMETA33, BTMETA;
120
121/************************************************************************
122 HASH METADATA PAGE LAYOUT
123 ************************************************************************/
124typedef struct _hashmeta33 {
125#define	DB_HASH_DUP	0x01	/*	  Duplicates. */
126#define	DB_HASH_SUBDB	0x02	/*	  Subdatabases. */
127#define	DB_HASH_DUPSORT	0x04	/*	  Duplicates are sorted. */
128	DBMETA dbmeta;		/* 00-71: Generic meta-data page header. */
129
130	u_int32_t max_bucket;	/* 72-75: ID of Maximum bucket in use */
131	u_int32_t high_mask;	/* 76-79: Modulo mask into table */
132	u_int32_t low_mask;	/* 80-83: Modulo mask into table lower half */
133	u_int32_t ffactor;	/* 84-87: Fill factor */
134	u_int32_t nelem;	/* 88-91: Number of keys in hash table */
135	u_int32_t h_charkey;	/* 92-95: Value of hash(CHARKEY) */
136#define	NCACHED	32		/* number of spare points */
137				/* 96-223: Spare pages for overflow */
138	u_int32_t spares[NCACHED];
139	u_int32_t unused[59];	/* 224-459: Unused space */
140	u_int32_t crypto_magic;	/* 460-463: Crypto magic number */
141	u_int32_t trash[3];	/* 464-475: Trash space - Do not use */
142	u_int8_t iv[DB_IV_BYTES];	/* 476-495: Crypto IV */
143	u_int8_t chksum[DB_MAC_KEY];	/* 496-511: Page chksum */
144
145	/*
146	 * Minimum page size is 512.
147	 */
148} HMETA33, HMETA;
149
150/************************************************************************
151 QUEUE METADATA PAGE LAYOUT
152 ************************************************************************/
153/*
154 * QAM Meta data page structure
155 *
156 */
157typedef struct _qmeta33 {
158	DBMETA    dbmeta;	/* 00-71: Generic meta-data header. */
159
160	u_int32_t first_recno;	/* 72-75: First not deleted record. */
161	u_int32_t cur_recno;	/* 76-79: Next recno to be allocated. */
162	u_int32_t re_len;	/* 80-83: Fixed-length record length. */
163	u_int32_t re_pad;	/* 84-87: Fixed-length record pad. */
164	u_int32_t rec_page;	/* 88-91: Records Per Page. */
165	u_int32_t page_ext;	/* 92-95: Pages per extent */
166
167	u_int32_t unused[91];	/* 96-459: Unused space */
168	u_int32_t crypto_magic;	/* 460-463: Crypto magic number */
169	u_int32_t trash[3];	/* 464-475: Trash space - Do not use */
170	u_int8_t iv[DB_IV_BYTES];	/* 476-495: Crypto IV */
171	u_int8_t chksum[DB_MAC_KEY];	/* 496-511: Page chksum */
172	/*
173	 * Minimum page size is 512.
174	 */
175} QMETA33, QMETA;
176
177/*
178 * DBMETASIZE is a constant used by __db_file_setup and DB->verify
179 * as a buffer which is guaranteed to be larger than any possible
180 * metadata page size and smaller than any disk sector.
181 */
182#define	DBMETASIZE	512
183
184/************************************************************************
185 BTREE/HASH MAIN PAGE LAYOUT
186 ************************************************************************/
187/*
188 *	+-----------------------------------+
189 *	|    lsn    |   pgno    | prev pgno |
190 *	+-----------------------------------+
191 *	| next pgno |  entries  | hf offset |
192 *	+-----------------------------------+
193 *	|   level   |   type    |   chksum  |
194 *	+-----------------------------------+
195 *	|    iv     |   index   | free -->  |
196 *	+-----------+-----------------------+
197 *	|	 F R E E A R E A            |
198 *	+-----------------------------------+
199 *	|              <-- free |   item    |
200 *	+-----------------------------------+
201 *	|   item    |   item    |   item    |
202 *	+-----------------------------------+
203 *
204 * sizeof(PAGE) == 26 bytes + possibly 20 bytes of checksum and possibly
205 * 16 bytes of IV (+ 2 bytes for alignment), and the following indices
206 * are guaranteed to be two-byte aligned.  If we aren't doing crypto or
207 * checksumming the bytes are reclaimed for data storage.
208 *
209 * For hash and btree leaf pages, index items are paired, e.g., inp[0] is the
210 * key for inp[1]'s data.  All other types of pages only contain single items.
211 */
212typedef struct __pg_chksum {
213	u_int8_t	unused[2];		/* 26-27: For alignment */
214	u_int8_t	chksum[4];		/* 28-31: Checksum */
215} PG_CHKSUM;
216
217typedef struct __pg_crypto {
218	u_int8_t	unused[2];		/* 26-27: For alignment */
219	u_int8_t	chksum[DB_MAC_KEY];	/* 28-47: Checksum */
220	u_int8_t	iv[DB_IV_BYTES];	/* 48-63: IV */
221	/* !!!
222	 * Must be 16-byte aligned for crypto
223	 */
224} PG_CRYPTO;
225
226typedef struct _db_page {
227	DB_LSN	  lsn;		/* 00-07: Log sequence number. */
228	db_pgno_t pgno;		/* 08-11: Current page number. */
229	db_pgno_t prev_pgno;	/* 12-15: Previous page number. */
230	db_pgno_t next_pgno;	/* 16-19: Next page number. */
231	db_indx_t entries;	/* 20-21: Number of items on the page. */
232	db_indx_t hf_offset;	/* 22-23: High free byte page offset. */
233
234	/*
235	 * The btree levels are numbered from the leaf to the root, starting
236	 * with 1, so the leaf is level 1, its parent is level 2, and so on.
237	 * We maintain this level on all btree pages, but the only place that
238	 * we actually need it is on the root page.  It would not be difficult
239	 * to hide the byte on the root page once it becomes an internal page,
240	 * so we could get this byte back if we needed it for something else.
241	 */
242#define	LEAFLEVEL	  1
243#define	MAXBTREELEVEL	255
244	u_int8_t  level;	/*    24: Btree tree level. */
245	u_int8_t  type;		/*    25: Page type. */
246} PAGE;
247
248/*
249 * With many compilers sizeof(PAGE) == 28, while SIZEOF_PAGE == 26.
250 * We add in other things directly after the page header and need
251 * the SIZEOF_PAGE.  When giving the sizeof(), many compilers will
252 * pad it out to the next 4-byte boundary.
253 */
254#define	SIZEOF_PAGE	26
255/*
256 * !!!
257 * DB_AM_ENCRYPT always implies DB_AM_CHKSUM so that must come first.
258 */
259#define	P_INP(dbp, pg)							\
260	((db_indx_t *)((u_int8_t *)(pg) + SIZEOF_PAGE +			\
261	(F_ISSET((dbp), DB_AM_ENCRYPT) ? sizeof(PG_CRYPTO) :		\
262	(F_ISSET((dbp), DB_AM_CHKSUM) ? sizeof(PG_CHKSUM) : 0))))
263
264#define	P_IV(dbp, pg)							\
265	(F_ISSET((dbp), DB_AM_ENCRYPT) ? ((u_int8_t *)(pg) +		\
266	SIZEOF_PAGE + SSZA(PG_CRYPTO, iv))				\
267	: NULL)
268
269#define	P_CHKSUM(dbp, pg)						\
270	(F_ISSET((dbp), DB_AM_ENCRYPT) ? ((u_int8_t *)(pg) +		\
271	SIZEOF_PAGE + SSZA(PG_CRYPTO, chksum)) :			\
272	(F_ISSET((dbp), DB_AM_CHKSUM) ? ((u_int8_t *)(pg) +		\
273	SIZEOF_PAGE + SSZA(PG_CHKSUM, chksum))				\
274	: NULL))
275
276/* PAGE element macros. */
277#define	LSN(p)		(((PAGE *)p)->lsn)
278#define	PGNO(p)		(((PAGE *)p)->pgno)
279#define	PREV_PGNO(p)	(((PAGE *)p)->prev_pgno)
280#define	NEXT_PGNO(p)	(((PAGE *)p)->next_pgno)
281#define	NUM_ENT(p)	(((PAGE *)p)->entries)
282#define	HOFFSET(p)	(((PAGE *)p)->hf_offset)
283#define	LEVEL(p)	(((PAGE *)p)->level)
284#define	TYPE(p)		(((PAGE *)p)->type)
285
286/************************************************************************
287 QUEUE MAIN PAGE LAYOUT
288 ************************************************************************/
289/*
290 * Sizes of page below.  Used to reclaim space if not doing
291 * crypto or checksumming.  If you change the QPAGE below you
292 * MUST adjust this too.
293 */
294#define	QPAGE_NORMAL	28
295#define	QPAGE_CHKSUM	48
296#define	QPAGE_SEC	64
297
298typedef struct _qpage {
299	DB_LSN	  lsn;		/* 00-07: Log sequence number. */
300	db_pgno_t pgno;		/* 08-11: Current page number. */
301	u_int32_t unused0[3];	/* 12-23: Unused. */
302	u_int8_t  unused1[1];	/*    24: Unused. */
303	u_int8_t  type;		/*    25: Page type. */
304	u_int8_t  unused2[2];	/* 26-27: Unused. */
305	u_int8_t  chksum[DB_MAC_KEY]; /* 28-47: Checksum */
306	u_int8_t  iv[DB_IV_BYTES]; /* 48-63: IV */
307} QPAGE;
308
309#define	QPAGE_SZ(dbp)						\
310	(F_ISSET((dbp), DB_AM_ENCRYPT) ? QPAGE_SEC :		\
311	F_ISSET((dbp), DB_AM_CHKSUM) ? QPAGE_CHKSUM : QPAGE_NORMAL)
312/*
313 * !!!
314 * The next_pgno and prev_pgno fields are not maintained for btree and recno
315 * internal pages.  Doing so only provides a minor performance improvement,
316 * it's hard to do when deleting internal pages, and it increases the chance
317 * of deadlock during deletes and splits because we have to re-link pages at
318 * more than the leaf level.
319 *
320 * !!!
321 * The btree/recno access method needs db_recno_t bytes of space on the root
322 * page to specify how many records are stored in the tree.  (The alternative
323 * is to store the number of records in the meta-data page, which will create
324 * a second hot spot in trees being actively modified, or recalculate it from
325 * the BINTERNAL fields on each access.)  Overload the PREV_PGNO field.
326 */
327#define	RE_NREC(p)							\
328	((TYPE(p) == P_IBTREE || TYPE(p) == P_IRECNO) ?	PREV_PGNO(p) :	\
329	(db_pgno_t)(TYPE(p) == P_LBTREE ? NUM_ENT(p) / 2 : NUM_ENT(p)))
330#define	RE_NREC_ADJ(p, adj)						\
331	PREV_PGNO(p) += adj;
332#define	RE_NREC_SET(p, num)						\
333	PREV_PGNO(p) = (num);
334
335/*
336 * Initialize a page.
337 *
338 * !!!
339 * Don't modify the page's LSN, code depends on it being unchanged after a
340 * P_INIT call.
341 */
342#define	P_INIT(pg, pg_size, n, pg_prev, pg_next, btl, pg_type) do {	\
343	PGNO(pg) = (n);							\
344	PREV_PGNO(pg) = (pg_prev);					\
345	NEXT_PGNO(pg) = (pg_next);					\
346	NUM_ENT(pg) = (0);						\
347	HOFFSET(pg) = (db_indx_t)(pg_size);				\
348	LEVEL(pg) = (btl);						\
349	TYPE(pg) = (pg_type);						\
350} while (0)
351
352/* Page header length (offset to first index). */
353#define	P_OVERHEAD(dbp)	P_TO_UINT16(P_INP(dbp, 0))
354
355/* First free byte. */
356#define	LOFFSET(dbp, pg)						\
357    (P_OVERHEAD(dbp) + NUM_ENT(pg) * sizeof(db_indx_t))
358
359/* Free space on a regular page. */
360#define	P_FREESPACE(dbp, pg)	(HOFFSET(pg) - LOFFSET(dbp, pg))
361
362/* Get a pointer to the bytes at a specific index. */
363#define	P_ENTRY(dbp, pg, indx)	((u_int8_t *)pg + P_INP(dbp, pg)[indx])
364
365/************************************************************************
366 OVERFLOW PAGE LAYOUT
367 ************************************************************************/
368
369/*
370 * Overflow items are referenced by HOFFPAGE and BOVERFLOW structures, which
371 * store a page number (the first page of the overflow item) and a length
372 * (the total length of the overflow item).  The overflow item consists of
373 * some number of overflow pages, linked by the next_pgno field of the page.
374 * A next_pgno field of PGNO_INVALID flags the end of the overflow item.
375 *
376 * Overflow page overloads:
377 *	The amount of overflow data stored on each page is stored in the
378 *	hf_offset field.
379 *
380 *	The implementation reference counts overflow items as it's possible
381 *	for them to be promoted onto btree internal pages.  The reference
382 *	count is stored in the entries field.
383 */
384#define	OV_LEN(p)	(((PAGE *)p)->hf_offset)
385#define	OV_REF(p)	(((PAGE *)p)->entries)
386
387/* Maximum number of bytes that you can put on an overflow page. */
388#define	P_MAXSPACE(dbp, psize)	((psize) - P_OVERHEAD(dbp))
389
390/* Free space on an overflow page. */
391#define	P_OVFLSPACE(dbp, psize, pg)	(P_MAXSPACE(dbp, psize) - HOFFSET(pg))
392
393/************************************************************************
394 HASH PAGE LAYOUT
395 ************************************************************************/
396
397/* Each index references a group of bytes on the page. */
398#define	H_KEYDATA	1	/* Key/data item. */
399#define	H_DUPLICATE	2	/* Duplicate key/data item. */
400#define	H_OFFPAGE	3	/* Overflow key/data item. */
401#define	H_OFFDUP	4	/* Overflow page of duplicates. */
402
403/*
404 * !!!
405 * Items on hash pages are (potentially) unaligned, so we can never cast the
406 * (page + offset) pointer to an HKEYDATA, HOFFPAGE or HOFFDUP structure, as
407 * we do with B+tree on-page structures.  Because we frequently want the type
408 * field, it requires no alignment, and it's in the same location in all three
409 * structures, there's a pair of macros.
410 */
411#define	HPAGE_PTYPE(p)		(*(u_int8_t *)p)
412#define	HPAGE_TYPE(dbp, pg, indx)	(*P_ENTRY(dbp, pg, indx))
413
414/*
415 * The first and second types are H_KEYDATA and H_DUPLICATE, represented
416 * by the HKEYDATA structure:
417 *
418 *	+-----------------------------------+
419 *	|    type   | key/data ...          |
420 *	+-----------------------------------+
421 *
422 * For duplicates, the data field encodes duplicate elements in the data
423 * field:
424 *
425 *	+---------------------------------------------------------------+
426 *	|    type   | len1 | element1 | len1 | len2 | element2 | len2   |
427 *	+---------------------------------------------------------------+
428 *
429 * Thus, by keeping track of the offset in the element, we can do both
430 * backward and forward traversal.
431 */
432typedef struct _hkeydata {
433	u_int8_t  type;		/*    00: Page type. */
434	u_int8_t  data[1];	/* Variable length key/data item. */
435} HKEYDATA;
436#define	HKEYDATA_DATA(p)	(((u_int8_t *)p) + SSZA(HKEYDATA, data))
437
438/*
439 * The length of any HKEYDATA item. Note that indx is an element index,
440 * not a PAIR index.
441 */
442#define	LEN_HITEM(dbp, pg, pgsize, indx)				\
443	(((indx) == 0 ? (pgsize) :					\
444	(P_INP(dbp, pg)[(indx) - 1])) - (P_INP(dbp, pg)[indx]))
445
446#define	LEN_HKEYDATA(dbp, pg, psize, indx)				\
447	(db_indx_t)(LEN_HITEM(dbp, pg, psize, indx) - HKEYDATA_SIZE(0))
448
449/*
450 * Page space required to add a new HKEYDATA item to the page, with and
451 * without the index value.
452 */
453#define	HKEYDATA_SIZE(len)						\
454	((len) + SSZA(HKEYDATA, data))
455#define	HKEYDATA_PSIZE(len)						\
456	(HKEYDATA_SIZE(len) + sizeof(db_indx_t))
457
458/* Put a HKEYDATA item at the location referenced by a page entry. */
459#define	PUT_HKEYDATA(pe, kd, len, etype) {				\
460	((HKEYDATA *)(pe))->type = etype;				\
461	memcpy((u_int8_t *)(pe) + sizeof(u_int8_t), kd, len);		\
462}
463
464/*
465 * Macros the describe the page layout in terms of key-data pairs.
466 */
467#define	H_NUMPAIRS(pg)			(NUM_ENT(pg) / 2)
468#define	H_KEYINDEX(indx)		(indx)
469#define	H_DATAINDEX(indx)		((indx) + 1)
470#define	H_PAIRKEY(dbp, pg, indx)	P_ENTRY(dbp, pg, H_KEYINDEX(indx))
471#define	H_PAIRDATA(dbp, pg, indx)	P_ENTRY(dbp, pg, H_DATAINDEX(indx))
472#define	H_PAIRSIZE(dbp, pg, psize, indx)				\
473	(LEN_HITEM(dbp, pg, psize, H_KEYINDEX(indx)) +			\
474	LEN_HITEM(dbp, pg, psize, H_DATAINDEX(indx)))
475#define	LEN_HDATA(dbp, p, psize, indx)					\
476    LEN_HKEYDATA(dbp, p, psize, H_DATAINDEX(indx))
477#define	LEN_HKEY(dbp, p, psize, indx)					\
478    LEN_HKEYDATA(dbp, p, psize, H_KEYINDEX(indx))
479
480/*
481 * The third type is the H_OFFPAGE, represented by the HOFFPAGE structure:
482 */
483typedef struct _hoffpage {
484	u_int8_t  type;		/*    00: Page type and delete flag. */
485	u_int8_t  unused[3];	/* 01-03: Padding, unused. */
486	db_pgno_t pgno;		/* 04-07: Offpage page number. */
487	u_int32_t tlen;		/* 08-11: Total length of item. */
488} HOFFPAGE;
489
490#define	HOFFPAGE_PGNO(p)	(((u_int8_t *)p) + SSZ(HOFFPAGE, pgno))
491#define	HOFFPAGE_TLEN(p)	(((u_int8_t *)p) + SSZ(HOFFPAGE, tlen))
492
493/*
494 * Page space required to add a new HOFFPAGE item to the page, with and
495 * without the index value.
496 */
497#define	HOFFPAGE_SIZE		(sizeof(HOFFPAGE))
498#define	HOFFPAGE_PSIZE		(HOFFPAGE_SIZE + sizeof(db_indx_t))
499
500/*
501 * The fourth type is H_OFFDUP represented by the HOFFDUP structure:
502 */
503typedef struct _hoffdup {
504	u_int8_t  type;		/*    00: Page type and delete flag. */
505	u_int8_t  unused[3];	/* 01-03: Padding, unused. */
506	db_pgno_t pgno;		/* 04-07: Offpage page number. */
507} HOFFDUP;
508#define	HOFFDUP_PGNO(p)		(((u_int8_t *)p) + SSZ(HOFFDUP, pgno))
509
510/*
511 * Page space required to add a new HOFFDUP item to the page, with and
512 * without the index value.
513 */
514#define	HOFFDUP_SIZE		(sizeof(HOFFDUP))
515
516/************************************************************************
517 BTREE PAGE LAYOUT
518 ************************************************************************/
519
520/* Each index references a group of bytes on the page. */
521#define	B_KEYDATA	1	/* Key/data item. */
522#define	B_DUPLICATE	2	/* Duplicate key/data item. */
523#define	B_OVERFLOW	3	/* Overflow key/data item. */
524
525/*
526 * We have to store a deleted entry flag in the page.   The reason is complex,
527 * but the simple version is that we can't delete on-page items referenced by
528 * a cursor -- the return order of subsequent insertions might be wrong.  The
529 * delete flag is an overload of the top bit of the type byte.
530 */
531#define	B_DELETE	(0x80)
532#define	B_DCLR(t)	(t) &= ~B_DELETE
533#define	B_DSET(t)	(t) |= B_DELETE
534#define	B_DISSET(t)	((t) & B_DELETE)
535
536#define	B_TYPE(t)		((t) & ~B_DELETE)
537#define	B_TSET(t, type)	((t) = B_TYPE(type))
538#define	B_TSET_DELETED(t, type) ((t) = (type) | B_DELETE)
539
540/*
541 * The first type is B_KEYDATA, represented by the BKEYDATA structure:
542 */
543typedef struct _bkeydata {
544	db_indx_t len;		/* 00-01: Key/data item length. */
545	u_int8_t  type;		/*    02: Page type AND DELETE FLAG. */
546	u_int8_t  data[1];	/* Variable length key/data item. */
547} BKEYDATA;
548
549/* Get a BKEYDATA item for a specific index. */
550#define	GET_BKEYDATA(dbp, pg, indx)					\
551	((BKEYDATA *)P_ENTRY(dbp, pg, indx))
552
553/*
554 * Page space required to add a new BKEYDATA item to the page, with and
555 * without the index value.  The (u_int16_t) cast avoids warnings: DB_ALIGN
556 * casts to uintmax_t, the cast converts it to a small integral type so we
557 * don't get complaints when we assign the final result to an integral type
558 * smaller than uintmax_t.
559 */
560#define	BKEYDATA_SIZE(len)						\
561	(u_int16_t)DB_ALIGN((len) + SSZA(BKEYDATA, data), sizeof(u_int32_t))
562#define	BKEYDATA_PSIZE(len)						\
563	(BKEYDATA_SIZE(len) + sizeof(db_indx_t))
564
565/*
566 * The second and third types are B_DUPLICATE and B_OVERFLOW, represented
567 * by the BOVERFLOW structure.
568 */
569typedef struct _boverflow {
570	db_indx_t unused1;	/* 00-01: Padding, unused. */
571	u_int8_t  type;		/*    02: Page type AND DELETE FLAG. */
572	u_int8_t  unused2;	/*    03: Padding, unused. */
573	db_pgno_t pgno;		/* 04-07: Next page number. */
574	u_int32_t tlen;		/* 08-11: Total length of item. */
575} BOVERFLOW;
576
577/* Get a BOVERFLOW item for a specific index. */
578#define	GET_BOVERFLOW(dbp, pg, indx)					\
579	((BOVERFLOW *)P_ENTRY(dbp, pg, indx))
580
581/*
582 * Page space required to add a new BOVERFLOW item to the page, with and
583 * without the index value.
584 */
585#define	BOVERFLOW_SIZE							\
586	((u_int16_t)DB_ALIGN(sizeof(BOVERFLOW), sizeof(u_int32_t)))
587#define	BOVERFLOW_PSIZE							\
588	(BOVERFLOW_SIZE + sizeof(db_indx_t))
589
590#define	BITEM_SIZE(bk)							\
591	(B_TYPE((bk)->type) != B_KEYDATA ? BOVERFLOW_SIZE :		\
592	BKEYDATA_SIZE((bk)->len))
593
594#define	BITEM_PSIZE(bk)							\
595	(B_TYPE((bk)->type) != B_KEYDATA ? BOVERFLOW_PSIZE :		\
596	BKEYDATA_PSIZE((bk)->len))
597
598/*
599 * Btree leaf and hash page layouts group indices in sets of two, one for the
600 * key and one for the data.  Everything else does it in sets of one to save
601 * space.  Use the following macros so that it's real obvious what's going on.
602 */
603#define	O_INDX	1
604#define	P_INDX	2
605
606/************************************************************************
607 BTREE INTERNAL PAGE LAYOUT
608 ************************************************************************/
609
610/*
611 * Btree internal entry.
612 */
613typedef struct _binternal {
614	db_indx_t  len;		/* 00-01: Key/data item length. */
615	u_int8_t   type;	/*    02: Page type AND DELETE FLAG. */
616	u_int8_t   unused;	/*    03: Padding, unused. */
617	db_pgno_t  pgno;	/* 04-07: Page number of referenced page. */
618	db_recno_t nrecs;	/* 08-11: Subtree record count. */
619	u_int8_t   data[1];	/* Variable length key item. */
620} BINTERNAL;
621
622/* Get a BINTERNAL item for a specific index. */
623#define	GET_BINTERNAL(dbp, pg, indx)					\
624	((BINTERNAL *)P_ENTRY(dbp, pg, indx))
625
626/*
627 * Page space required to add a new BINTERNAL item to the page, with and
628 * without the index value.
629 */
630#define	BINTERNAL_SIZE(len)						\
631	(u_int16_t)DB_ALIGN((len) + SSZA(BINTERNAL, data), sizeof(u_int32_t))
632#define	BINTERNAL_PSIZE(len)						\
633	(BINTERNAL_SIZE(len) + sizeof(db_indx_t))
634
635/************************************************************************
636 RECNO INTERNAL PAGE LAYOUT
637 ************************************************************************/
638
639/*
640 * The recno internal entry.
641 */
642typedef struct _rinternal {
643	db_pgno_t  pgno;	/* 00-03: Page number of referenced page. */
644	db_recno_t nrecs;	/* 04-07: Subtree record count. */
645} RINTERNAL;
646
647/* Get a RINTERNAL item for a specific index. */
648#define	GET_RINTERNAL(dbp, pg, indx)					\
649	((RINTERNAL *)P_ENTRY(dbp, pg, indx))
650
651/*
652 * Page space required to add a new RINTERNAL item to the page, with and
653 * without the index value.
654 */
655#define	RINTERNAL_SIZE							\
656	(u_int16_t)DB_ALIGN(sizeof(RINTERNAL), sizeof(u_int32_t))
657#define	RINTERNAL_PSIZE							\
658	(RINTERNAL_SIZE + sizeof(db_indx_t))
659
660typedef struct __pglist {
661	db_pgno_t pgno;
662	DB_LSN lsn;
663} db_pglist_t;
664
665#if defined(__cplusplus)
666}
667#endif
668
669#endif /* !_DB_PAGE_H_ */
670