1/*-
2 * See the file LICENSE for redistribution information.
3 *
4 * Copyright (c) 1999,2008 Oracle.  All rights reserved.
5 *
6 * $Id: bt_verify.c,v 12.39 2008/03/11 21:07:01 mbrey Exp $
7 */
8
9#include "db_config.h"
10
11#include "db_int.h"
12#include "dbinc/db_page.h"
13#include "dbinc/db_verify.h"
14#include "dbinc/btree.h"
15#include "dbinc/mp.h"
16
17static int __bam_safe_getdata __P((DB *, DB_THREAD_INFO *,
18    PAGE *, u_int32_t, int, DBT *, int *));
19static int __bam_vrfy_inp __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t,
20    db_indx_t *, u_int32_t));
21static int __bam_vrfy_treeorder __P((DB *, DB_THREAD_INFO *, PAGE *,
22    BINTERNAL *, BINTERNAL *, int (*)(DB *, const DBT *, const DBT *),
23    u_int32_t));
24static int __ram_vrfy_inp __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t,
25    db_indx_t *, u_int32_t));
26
27/*
28 * __bam_vrfy_meta --
29 *	Verify the btree-specific part of a metadata page.
30 *
31 * PUBLIC: int __bam_vrfy_meta __P((DB *, VRFY_DBINFO *, BTMETA *,
32 * PUBLIC:     db_pgno_t, u_int32_t));
33 */
34int
35__bam_vrfy_meta(dbp, vdp, meta, pgno, flags)
36	DB *dbp;
37	VRFY_DBINFO *vdp;
38	BTMETA *meta;
39	db_pgno_t pgno;
40	u_int32_t flags;
41{
42	ENV *env;
43	VRFY_PAGEINFO *pip;
44	int isbad, t_ret, ret;
45	db_indx_t ovflsize;
46
47	env = dbp->env;
48	isbad = 0;
49
50	if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
51		return (ret);
52
53	/*
54	 * If VRFY_INCOMPLETE is not set, then we didn't come through
55	 * __db_vrfy_pagezero and didn't incompletely
56	 * check this page--we haven't checked it at all.
57	 * Thus we need to call __db_vrfy_meta and check the common fields.
58	 *
59	 * If VRFY_INCOMPLETE is set, we've already done all the same work
60	 * in __db_vrfy_pagezero, so skip the check.
61	 */
62	if (!F_ISSET(pip, VRFY_INCOMPLETE) &&
63	    (ret = __db_vrfy_meta(dbp, vdp, &meta->dbmeta, pgno, flags)) != 0) {
64		if (ret == DB_VERIFY_BAD)
65			isbad = 1;
66		else
67			goto err;
68	}
69
70	/* bt_minkey:  must be >= 2; must produce sensible ovflsize */
71
72	/* avoid division by zero */
73	ovflsize = meta->minkey > 0 ?
74	    B_MINKEY_TO_OVFLSIZE(dbp, meta->minkey, dbp->pgsize) : 0;
75
76	if (meta->minkey < 2 ||
77	    ovflsize > B_MINKEY_TO_OVFLSIZE(dbp, DEFMINKEYPAGE, dbp->pgsize)) {
78		pip->bt_minkey = 0;
79		isbad = 1;
80		EPRINT((env,
81	    "Page %lu: nonsensical bt_minkey value %lu on metadata page",
82		    (u_long)pgno, (u_long)meta->minkey));
83	} else
84		pip->bt_minkey = meta->minkey;
85
86	/* re_len: no constraints on this (may be zero or huge--we make rope) */
87	pip->re_pad = meta->re_pad;
88	pip->re_len = meta->re_len;
89
90	/*
91	 * The root must not be current page or 0 and it must be within
92	 * database.  If this metadata page is the master meta data page
93	 * of the file, then the root page had better be page 1.
94	 */
95	pip->root = 0;
96	if (meta->root == PGNO_INVALID ||
97	    meta->root == pgno || !IS_VALID_PGNO(meta->root) ||
98	    (pgno == PGNO_BASE_MD && meta->root != 1)) {
99		isbad = 1;
100		EPRINT((env,
101		    "Page %lu: nonsensical root page %lu on metadata page",
102		    (u_long)pgno, (u_long)meta->root));
103	} else
104		pip->root = meta->root;
105
106	/* Flags. */
107	if (F_ISSET(&meta->dbmeta, BTM_RENUMBER))
108		F_SET(pip, VRFY_IS_RRECNO);
109
110	if (F_ISSET(&meta->dbmeta, BTM_SUBDB)) {
111		/*
112		 * If this is a master db meta page, it had better not have
113		 * duplicates.
114		 */
115		if (F_ISSET(&meta->dbmeta, BTM_DUP) && pgno == PGNO_BASE_MD) {
116			isbad = 1;
117			EPRINT((env,
118"Page %lu: Btree metadata page has both duplicates and multiple databases",
119			    (u_long)pgno));
120		}
121		F_SET(pip, VRFY_HAS_SUBDBS);
122	}
123
124	if (F_ISSET(&meta->dbmeta, BTM_DUP))
125		F_SET(pip, VRFY_HAS_DUPS);
126	if (F_ISSET(&meta->dbmeta, BTM_DUPSORT))
127		F_SET(pip, VRFY_HAS_DUPSORT);
128	if (F_ISSET(&meta->dbmeta, BTM_RECNUM))
129		F_SET(pip, VRFY_HAS_RECNUMS);
130	if (F_ISSET(pip, VRFY_HAS_RECNUMS) && F_ISSET(pip, VRFY_HAS_DUPS)) {
131		EPRINT((env,
132    "Page %lu: Btree metadata page illegally has both recnums and dups",
133		    (u_long)pgno));
134		isbad = 1;
135	}
136
137	if (F_ISSET(&meta->dbmeta, BTM_RECNO)) {
138		F_SET(pip, VRFY_IS_RECNO);
139		dbp->type = DB_RECNO;
140	} else if (F_ISSET(pip, VRFY_IS_RRECNO)) {
141		isbad = 1;
142		EPRINT((env,
143    "Page %lu: metadata page has renumber flag set but is not recno",
144		    (u_long)pgno));
145	}
146
147	if (F_ISSET(pip, VRFY_IS_RECNO) && F_ISSET(pip, VRFY_HAS_DUPS)) {
148		EPRINT((env,
149		    "Page %lu: recno metadata page specifies duplicates",
150		    (u_long)pgno));
151		isbad = 1;
152	}
153
154	if (F_ISSET(&meta->dbmeta, BTM_FIXEDLEN))
155		F_SET(pip, VRFY_IS_FIXEDLEN);
156	else if (pip->re_len > 0) {
157		/*
158		 * It's wrong to have an re_len if it's not a fixed-length
159		 * database
160		 */
161		isbad = 1;
162		EPRINT((env,
163		    "Page %lu: re_len of %lu in non-fixed-length database",
164		    (u_long)pgno, (u_long)pip->re_len));
165	}
166
167	/*
168	 * We do not check that the rest of the page is 0, because it may
169	 * not be and may still be correct.
170	 */
171
172err:	if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
173		ret = t_ret;
174	if (LF_ISSET(DB_SALVAGE) &&
175	   (t_ret = __db_salvage_markdone(vdp, pgno)) != 0 && ret == 0)
176		ret = t_ret;
177	return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
178}
179
180/*
181 * __ram_vrfy_leaf --
182 *	Verify a recno leaf page.
183 *
184 * PUBLIC: int __ram_vrfy_leaf __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t,
185 * PUBLIC:     u_int32_t));
186 */
187int
188__ram_vrfy_leaf(dbp, vdp, h, pgno, flags)
189	DB *dbp;
190	VRFY_DBINFO *vdp;
191	PAGE *h;
192	db_pgno_t pgno;
193	u_int32_t flags;
194{
195	BKEYDATA *bk;
196	ENV *env;
197	VRFY_PAGEINFO *pip;
198	db_indx_t i;
199	int ret, t_ret, isbad;
200	u_int32_t re_len_guess, len;
201
202	env = dbp->env;
203	isbad = 0;
204
205	if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
206		return (ret);
207
208	if (TYPE(h) != P_LRECNO) {
209		ret = __db_unknown_path(env, "__ram_vrfy_leaf");
210		goto err;
211	}
212
213	/*
214	 * Verify (and, if relevant, save off) page fields common to
215	 * all PAGEs.
216	 */
217	if ((ret = __db_vrfy_datapage(dbp, vdp, h, pgno, flags)) != 0) {
218		if (ret == DB_VERIFY_BAD)
219			isbad = 1;
220		else
221			goto err;
222	}
223
224	/*
225	 * Verify inp[].  Return immediately if it returns DB_VERIFY_BAD;
226	 * further checks are dangerous.
227	 */
228	if ((ret = __bam_vrfy_inp(dbp,
229	    vdp, h, pgno, &pip->entries, flags)) != 0)
230		goto err;
231
232	if (F_ISSET(pip, VRFY_HAS_DUPS)) {
233		EPRINT((env,
234		    "Page %lu: Recno database has dups", (u_long)pgno));
235		ret = DB_VERIFY_BAD;
236		goto err;
237	}
238
239	/*
240	 * Walk through inp and see if the lengths of all the records are the
241	 * same--if so, this may be a fixed-length database, and we want to
242	 * save off this value.  We know inp to be safe if we've gotten this
243	 * far.
244	 */
245	re_len_guess = 0;
246	for (i = 0; i < NUM_ENT(h); i++) {
247		bk = GET_BKEYDATA(dbp, h, i);
248		/* KEYEMPTY.  Go on. */
249		if (B_DISSET(bk->type))
250			continue;
251		if (bk->type == B_OVERFLOW)
252			len = ((BOVERFLOW *)bk)->tlen;
253		else if (bk->type == B_KEYDATA)
254			len = bk->len;
255		else {
256			isbad = 1;
257			EPRINT((env,
258			    "Page %lu: nonsensical type for item %lu",
259			    (u_long)pgno, (u_long)i));
260			continue;
261		}
262		if (re_len_guess == 0)
263			re_len_guess = len;
264
265		/*
266		 * Is this item's len the same as the last one's?  If not,
267		 * reset to 0 and break--we don't have a single re_len.
268		 * Otherwise, go on to the next item.
269		 */
270		if (re_len_guess != len) {
271			re_len_guess = 0;
272			break;
273		}
274	}
275	pip->re_len = re_len_guess;
276
277	/* Save off record count. */
278	pip->rec_cnt = NUM_ENT(h);
279
280err:	if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
281		ret = t_ret;
282	return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
283}
284
285/*
286 * __bam_vrfy --
287 *	Verify a btree leaf or internal page.
288 *
289 * PUBLIC: int __bam_vrfy __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t,
290 * PUBLIC:     u_int32_t));
291 */
292int
293__bam_vrfy(dbp, vdp, h, pgno, flags)
294	DB *dbp;
295	VRFY_DBINFO *vdp;
296	PAGE *h;
297	db_pgno_t pgno;
298	u_int32_t flags;
299{
300	ENV *env;
301	VRFY_PAGEINFO *pip;
302	int ret, t_ret, isbad;
303
304	env = dbp->env;
305	isbad = 0;
306
307	if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
308		return (ret);
309
310	switch (TYPE(h)) {
311	case P_IBTREE:
312	case P_IRECNO:
313	case P_LBTREE:
314	case P_LDUP:
315		break;
316	default:
317		ret = __db_unknown_path(env, "__bam_vrfy");
318		goto err;
319	}
320
321	/*
322	 * Verify (and, if relevant, save off) page fields common to
323	 * all PAGEs.
324	 */
325	if ((ret = __db_vrfy_datapage(dbp, vdp, h, pgno, flags)) != 0) {
326		if (ret == DB_VERIFY_BAD)
327			isbad = 1;
328		else
329			goto err;
330	}
331
332	/*
333	 * The record count is, on internal pages, stored in an overloaded
334	 * next_pgno field.  Save it off;  we'll verify it when we check
335	 * overall database structure.  We could overload the field
336	 * in VRFY_PAGEINFO, too, but this seems gross, and space
337	 * is not at such a premium.
338	 */
339	pip->rec_cnt = RE_NREC(h);
340
341	/*
342	 * Verify inp[].
343	 */
344	if (TYPE(h) == P_IRECNO) {
345		if ((ret = __ram_vrfy_inp(dbp,
346		    vdp, h, pgno, &pip->entries, flags)) != 0)
347			goto err;
348	} else if ((ret = __bam_vrfy_inp(dbp,
349	    vdp, h, pgno, &pip->entries, flags)) != 0) {
350		if (ret == DB_VERIFY_BAD)
351			isbad = 1;
352		else
353			goto err;
354		EPRINT((env,
355		    "Page %lu: item order check unsafe: skipping",
356		    (u_long)pgno));
357	} else if (!LF_ISSET(DB_NOORDERCHK) && (ret =
358	    __bam_vrfy_itemorder(dbp,
359	    vdp, vdp->thread_info, h, pgno, 0, 0, 0, flags)) != 0) {
360		/*
361		 * We know that the elements of inp are reasonable.
362		 *
363		 * Check that elements fall in the proper order.
364		 */
365		if (ret == DB_VERIFY_BAD)
366			isbad = 1;
367		else
368			goto err;
369	}
370
371err:	if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
372		ret = t_ret;
373	return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
374}
375
376/*
377 * __ram_vrfy_inp --
378 *	Verify that all entries in a P_IRECNO inp[] array are reasonable,
379 *	and count them.  Note that P_LRECNO uses __bam_vrfy_inp;
380 *	P_IRECNOs are a special, and simpler, case, since they have
381 *	RINTERNALs rather than BKEYDATA/BINTERNALs.
382 */
383static int
384__ram_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags)
385	DB *dbp;
386	VRFY_DBINFO *vdp;
387	PAGE *h;
388	db_pgno_t pgno;
389	db_indx_t *nentriesp;
390	u_int32_t flags;
391{
392	ENV *env;
393	RINTERNAL *ri;
394	VRFY_CHILDINFO child;
395	VRFY_PAGEINFO *pip;
396	int ret, t_ret, isbad;
397	u_int32_t himark, i, offset, nentries;
398	db_indx_t *inp;
399	u_int8_t *pagelayout, *p;
400
401	env = dbp->env;
402	isbad = 0;
403	memset(&child, 0, sizeof(VRFY_CHILDINFO));
404	nentries = 0;
405	pagelayout = NULL;
406
407	if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
408		return (ret);
409
410	if (TYPE(h) != P_IRECNO) {
411		ret = __db_unknown_path(env, "__ram_vrfy_inp");
412		goto err;
413	}
414
415	himark = dbp->pgsize;
416	if ((ret = __os_malloc(env, dbp->pgsize, &pagelayout)) != 0)
417		goto err;
418	memset(pagelayout, 0, dbp->pgsize);
419	inp = P_INP(dbp, h);
420	for (i = 0; i < NUM_ENT(h); i++) {
421		if ((u_int8_t *)inp + i >= (u_int8_t *)h + himark) {
422			EPRINT((env,
423			    "Page %lu: entries listing %lu overlaps data",
424			    (u_long)pgno, (u_long)i));
425			ret = DB_VERIFY_BAD;
426			goto err;
427		}
428		offset = inp[i];
429		/*
430		 * Check that the item offset is reasonable:  it points
431		 * somewhere after the inp array and before the end of the
432		 * page.
433		 */
434		if (offset <= (u_int32_t)((u_int8_t *)inp + i -
435		    (u_int8_t *)h) ||
436		    offset > (u_int32_t)(dbp->pgsize - RINTERNAL_SIZE)) {
437			isbad = 1;
438			EPRINT((env,
439			    "Page %lu: bad offset %lu at index %lu",
440			    (u_long)pgno, (u_long)offset, (u_long)i));
441			continue;
442		}
443
444		/* Update the high-water mark (what HOFFSET should be) */
445		if (offset < himark)
446			himark = offset;
447
448		nentries++;
449
450		/* Make sure this RINTERNAL is not multiply referenced. */
451		ri = GET_RINTERNAL(dbp, h, i);
452		if (pagelayout[offset] == 0) {
453			pagelayout[offset] = 1;
454			child.pgno = ri->pgno;
455			child.type = V_RECNO;
456			child.nrecs = ri->nrecs;
457			if ((ret = __db_vrfy_childput(vdp, pgno, &child)) != 0)
458				goto err;
459		} else {
460			EPRINT((env,
461		"Page %lu: RINTERNAL structure at offset %lu referenced twice",
462			    (u_long)pgno, (u_long)offset));
463			isbad = 1;
464		}
465	}
466
467	for (p = pagelayout + himark;
468	    p < pagelayout + dbp->pgsize;
469	    p += RINTERNAL_SIZE)
470		if (*p != 1) {
471			EPRINT((env,
472			    "Page %lu: gap between items at offset %lu",
473			    (u_long)pgno, (u_long)(p - pagelayout)));
474			isbad = 1;
475		}
476
477	if ((db_indx_t)himark != HOFFSET(h)) {
478		EPRINT((env,
479		    "Page %lu: bad HOFFSET %lu, appears to be %lu",
480		    (u_long)pgno, (u_long)(HOFFSET(h)), (u_long)himark));
481		isbad = 1;
482	}
483
484	*nentriesp = nentries;
485
486err:	if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
487		ret = t_ret;
488	if (pagelayout != NULL)
489		__os_free(env, pagelayout);
490	return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
491}
492
493typedef enum { VRFY_ITEM_NOTSET=0, VRFY_ITEM_BEGIN, VRFY_ITEM_END } VRFY_ITEM;
494
495/*
496 * __bam_vrfy_inp --
497 *	Verify that all entries in inp[] array are reasonable;
498 *	count them.
499 */
500static int
501__bam_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags)
502	DB *dbp;
503	VRFY_DBINFO *vdp;
504	PAGE *h;
505	db_pgno_t pgno;
506	db_indx_t *nentriesp;
507	u_int32_t flags;
508{
509	BKEYDATA *bk;
510	BOVERFLOW *bo;
511	ENV *env;
512	VRFY_CHILDINFO child;
513	VRFY_ITEM *pagelayout;
514	VRFY_PAGEINFO *pip;
515	u_int32_t himark, offset;		/*
516						 * These would be db_indx_ts
517						 * but for alignment.
518						 */
519	u_int32_t i, endoff, nentries;
520	int isbad, initem, isdupitem, ret, t_ret;
521
522	env = dbp->env;
523	isbad = isdupitem = 0;
524	nentries = 0;
525	memset(&child, 0, sizeof(VRFY_CHILDINFO));
526	if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
527		return (ret);
528
529	switch (TYPE(h)) {
530	case P_IBTREE:
531	case P_LBTREE:
532	case P_LDUP:
533	case P_LRECNO:
534		break;
535	default:
536		/*
537		 * In the salvager, we might call this from a page which
538		 * we merely suspect is a btree page.  Otherwise, it
539		 * shouldn't get called--if it is, that's a verifier bug.
540		 */
541		if (LF_ISSET(DB_SALVAGE))
542			break;
543		ret = __db_unknown_path(env, "__bam_vrfy_inp");
544		goto err;
545	}
546
547	/*
548	 * Loop through inp[], the array of items, until we either
549	 * run out of entries or collide with the data.  Keep track
550	 * of h_offset in himark.
551	 *
552	 * For each element in inp[i], make sure it references a region
553	 * that starts after the end of the inp array (as defined by
554	 * NUM_ENT(h)), ends before the beginning of the page, doesn't
555	 * overlap any other regions, and doesn't have a gap between
556	 * it and the region immediately after it.
557	 */
558	himark = dbp->pgsize;
559	if ((ret = __os_calloc(
560	    env, dbp->pgsize, sizeof(pagelayout[0]), &pagelayout)) != 0)
561		goto err;
562	for (i = 0; i < NUM_ENT(h); i++) {
563		switch (ret = __db_vrfy_inpitem(dbp,
564		    h, pgno, i, 1, flags, &himark, &offset)) {
565		case 0:
566			break;
567		case DB_VERIFY_BAD:
568			isbad = 1;
569			continue;
570		case DB_VERIFY_FATAL:
571			isbad = 1;
572			goto err;
573		default:
574			DB_ASSERT(env, ret != 0);
575			break;
576		}
577
578		/*
579		 * We now have a plausible beginning for the item, and we know
580		 * its length is safe.
581		 *
582		 * Mark the beginning and end in pagelayout so we can make sure
583		 * items have no overlaps or gaps.
584		 */
585		bk = GET_BKEYDATA(dbp, h, i);
586		if (pagelayout[offset] == VRFY_ITEM_NOTSET)
587			pagelayout[offset] = VRFY_ITEM_BEGIN;
588		else if (pagelayout[offset] == VRFY_ITEM_BEGIN) {
589			/*
590			 * Having two inp entries that point at the same patch
591			 * of page is legal if and only if the page is
592			 * a btree leaf and they're onpage duplicate keys--
593			 * that is, if (i % P_INDX) == 0.
594			 */
595			if ((i % P_INDX == 0) && (TYPE(h) == P_LBTREE)) {
596				/* Flag for later. */
597				F_SET(pip, VRFY_HAS_DUPS);
598
599				/* Bump up nentries so we don't undercount. */
600				nentries++;
601
602				/*
603				 * We'll check to make sure the end is
604				 * equal, too.
605				 */
606				isdupitem = 1;
607			} else {
608				isbad = 1;
609				EPRINT((env, "Page %lu: duplicated item %lu",
610				    (u_long)pgno, (u_long)i));
611			}
612		}
613
614		/*
615		 * Mark the end.  Its location varies with the page type
616		 * and the item type.
617		 *
618		 * If the end already has a sign other than 0, do nothing--
619		 * it's an overlap that we'll catch later.
620		 */
621		switch (B_TYPE(bk->type)) {
622		case B_KEYDATA:
623			if (TYPE(h) == P_IBTREE)
624				/* It's a BINTERNAL. */
625				endoff = offset + BINTERNAL_SIZE(bk->len) - 1;
626			else
627				endoff = offset + BKEYDATA_SIZE(bk->len) - 1;
628			break;
629		case B_DUPLICATE:
630			/*
631			 * Flag that we have dups; we'll check whether
632			 * that's okay during the structure check.
633			 */
634			F_SET(pip, VRFY_HAS_DUPS);
635			/* FALLTHROUGH */
636		case B_OVERFLOW:
637			/*
638			 * Overflow entries on internal pages are stored
639			 * as the _data_ of a BINTERNAL;  overflow entries
640			 * on leaf pages are stored as the entire entry.
641			 */
642			endoff = offset +
643			    ((TYPE(h) == P_IBTREE) ?
644			    BINTERNAL_SIZE(BOVERFLOW_SIZE) :
645			    BOVERFLOW_SIZE) - 1;
646			break;
647		default:
648			/*
649			 * We'll complain later;  for now, just mark
650			 * a minimum.
651			 */
652			endoff = offset + BKEYDATA_SIZE(0) - 1;
653			break;
654		}
655
656		/*
657		 * If this is an onpage duplicate key we've seen before,
658		 * the end had better coincide too.
659		 */
660		if (isdupitem && pagelayout[endoff] != VRFY_ITEM_END) {
661			EPRINT((env, "Page %lu: duplicated item %lu",
662			    (u_long)pgno, (u_long)i));
663			isbad = 1;
664		} else if (pagelayout[endoff] == VRFY_ITEM_NOTSET)
665			pagelayout[endoff] = VRFY_ITEM_END;
666		isdupitem = 0;
667
668		/*
669		 * There should be no deleted items in a quiescent tree,
670		 * except in recno.
671		 */
672		if (B_DISSET(bk->type) && TYPE(h) != P_LRECNO) {
673			isbad = 1;
674			EPRINT((env, "Page %lu: item %lu marked deleted",
675			    (u_long)pgno, (u_long)i));
676		}
677
678		/*
679		 * Check the type and such of bk--make sure it's reasonable
680		 * for the pagetype.
681		 */
682		switch (B_TYPE(bk->type)) {
683		case B_KEYDATA:
684			/*
685			 * This is a normal, non-overflow BKEYDATA or BINTERNAL.
686			 * The only thing to check is the len, and that's
687			 * already been done.
688			 */
689			break;
690		case B_DUPLICATE:
691			if (TYPE(h) == P_IBTREE) {
692				isbad = 1;
693				EPRINT((env,
694    "Page %lu: duplicate page referenced by internal btree page at item %lu",
695				    (u_long)pgno, (u_long)i));
696				break;
697			} else if (TYPE(h) == P_LRECNO) {
698				isbad = 1;
699				EPRINT((env,
700	"Page %lu: duplicate page referenced by recno page at item %lu",
701				    (u_long)pgno, (u_long)i));
702				break;
703			}
704			/* FALLTHROUGH */
705		case B_OVERFLOW:
706			bo = (TYPE(h) == P_IBTREE) ?
707			    (BOVERFLOW *)(((BINTERNAL *)bk)->data) :
708			    (BOVERFLOW *)bk;
709
710			if (B_TYPE(bk->type) == B_OVERFLOW)
711				/* Make sure tlen is reasonable. */
712				if (bo->tlen > dbp->pgsize * vdp->last_pgno) {
713					isbad = 1;
714					EPRINT((env,
715				"Page %lu: impossible tlen %lu, item %lu",
716					    (u_long)pgno,
717					    (u_long)bo->tlen, (u_long)i));
718					/* Don't save as a child. */
719					break;
720				}
721
722			if (!IS_VALID_PGNO(bo->pgno) || bo->pgno == pgno ||
723			    bo->pgno == PGNO_INVALID) {
724				isbad = 1;
725				EPRINT((env,
726			    "Page %lu: offpage item %lu has bad pgno %lu",
727				    (u_long)pgno, (u_long)i, (u_long)bo->pgno));
728				/* Don't save as a child. */
729				break;
730			}
731
732			child.pgno = bo->pgno;
733			child.type = (B_TYPE(bk->type) == B_OVERFLOW ?
734			    V_OVERFLOW : V_DUPLICATE);
735			child.tlen = bo->tlen;
736			if ((ret = __db_vrfy_childput(vdp, pgno, &child)) != 0)
737				goto err;
738			break;
739		default:
740			isbad = 1;
741			EPRINT((env, "Page %lu: item %lu of invalid type %lu",
742			    (u_long)pgno, (u_long)i, (u_long)B_TYPE(bk->type)));
743			break;
744		}
745	}
746
747	/*
748	 * Now, loop through and make sure the items are contiguous and
749	 * non-overlapping.
750	 */
751	initem = 0;
752	for (i = himark; i < dbp->pgsize; i++)
753		if (initem == 0)
754			switch (pagelayout[i]) {
755			case VRFY_ITEM_NOTSET:
756				/* May be just for alignment. */
757				if (i != DB_ALIGN(i, sizeof(u_int32_t)))
758					continue;
759
760				isbad = 1;
761				EPRINT((env,
762				    "Page %lu: gap between items at offset %lu",
763				    (u_long)pgno, (u_long)i));
764				/* Find the end of the gap */
765				for (; pagelayout[i + 1] == VRFY_ITEM_NOTSET &&
766				    (size_t)(i + 1) < dbp->pgsize; i++)
767					;
768				break;
769			case VRFY_ITEM_BEGIN:
770				/* We've found an item. Check its alignment. */
771				if (i != DB_ALIGN(i, sizeof(u_int32_t))) {
772					isbad = 1;
773					EPRINT((env,
774					    "Page %lu: offset %lu unaligned",
775					    (u_long)pgno, (u_long)i));
776				}
777				initem = 1;
778				nentries++;
779				break;
780			case VRFY_ITEM_END:
781				/*
782				 * We've hit the end of an item even though
783				 * we don't think we're in one;  must
784				 * be an overlap.
785				 */
786				isbad = 1;
787				EPRINT((env,
788				    "Page %lu: overlapping items at offset %lu",
789				    (u_long)pgno, (u_long)i));
790				break;
791			}
792		else
793			switch (pagelayout[i]) {
794			case VRFY_ITEM_NOTSET:
795				/* In the middle of an item somewhere. Okay. */
796				break;
797			case VRFY_ITEM_END:
798				/* End of an item; switch to out-of-item mode.*/
799				initem = 0;
800				break;
801			case VRFY_ITEM_BEGIN:
802				/*
803				 * Hit a second item beginning without an
804				 * end.  Overlap.
805				 */
806				isbad = 1;
807				EPRINT((env,
808				    "Page %lu: overlapping items at offset %lu",
809				    (u_long)pgno, (u_long)i));
810				break;
811			}
812
813	__os_free(env, pagelayout);
814
815	/* Verify HOFFSET. */
816	if ((db_indx_t)himark != HOFFSET(h)) {
817		EPRINT((env, "Page %lu: bad HOFFSET %lu, appears to be %lu",
818		    (u_long)pgno, (u_long)HOFFSET(h), (u_long)himark));
819		isbad = 1;
820	}
821
822err:	if (nentriesp != NULL)
823		*nentriesp = nentries;
824
825	if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
826		ret = t_ret;
827
828	return ((isbad == 1 && ret == 0) ? DB_VERIFY_BAD : ret);
829}
830
831/*
832 * __bam_vrfy_itemorder --
833 *	Make sure the items on a page sort correctly.
834 *
835 *	Assumes that NUM_ENT(h) and inp[0]..inp[NUM_ENT(h) - 1] are
836 *	reasonable;  be sure that __bam_vrfy_inp has been called first.
837 *
838 *	If ovflok is set, it also assumes that overflow page chains
839 *	hanging off the current page have been sanity-checked, and so we
840 *	can use __bam_cmp to verify their ordering.  If it is not set,
841 *	and we run into an overflow page, carp and return DB_VERIFY_BAD;
842 *	we shouldn't be called if any exist.
843 *
844 * PUBLIC: int __bam_vrfy_itemorder __P((DB *, VRFY_DBINFO *, DB_THREAD_INFO *,
845 * PUBLIC:      PAGE *, db_pgno_t, u_int32_t, int, int, u_int32_t));
846 */
847int
848__bam_vrfy_itemorder(dbp, vdp, ip, h, pgno, nentries, ovflok, hasdups, flags)
849	DB *dbp;
850	VRFY_DBINFO *vdp;
851	DB_THREAD_INFO *ip;
852	PAGE *h;
853	db_pgno_t pgno;
854	u_int32_t nentries;
855	int ovflok, hasdups;
856	u_int32_t flags;
857{
858	BINTERNAL *bi;
859	BKEYDATA *bk;
860	BOVERFLOW *bo;
861	BTREE *bt;
862	DBT dbta, dbtb, dup_1, dup_2, *p1, *p2, *tmp;
863	ENV *env;
864	VRFY_PAGEINFO *pip;
865	db_indx_t i, *inp;
866	int adj, cmp, freedup_1, freedup_2, isbad, ret, t_ret;
867	int (*dupfunc) __P((DB *, const DBT *, const DBT *));
868	int (*func) __P((DB *, const DBT *, const DBT *));
869	void *buf1, *buf2, *tmpbuf;
870
871	/*
872	 * We need to work in the ORDERCHKONLY environment where we might
873	 * not have a pip, but we also may need to work in contexts where
874	 * NUM_ENT isn't safe.
875	 */
876	if (vdp != NULL) {
877		if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
878			return (ret);
879		nentries = pip->entries;
880	} else
881		pip = NULL;
882
883	env = dbp->env;
884	ret = isbad = 0;
885	bo = NULL;			/* Shut up compiler. */
886
887	memset(&dbta, 0, sizeof(DBT));
888	F_SET(&dbta, DB_DBT_REALLOC);
889
890	memset(&dbtb, 0, sizeof(DBT));
891	F_SET(&dbtb, DB_DBT_REALLOC);
892
893	buf1 = buf2 = NULL;
894
895	DB_ASSERT(env, !LF_ISSET(DB_NOORDERCHK));
896
897	dupfunc = (dbp->dup_compare == NULL) ? __bam_defcmp : dbp->dup_compare;
898	if (TYPE(h) == P_LDUP)
899		func = dupfunc;
900	else {
901		func = __bam_defcmp;
902		if (dbp->bt_internal != NULL) {
903			bt = (BTREE *)dbp->bt_internal;
904			if (bt->bt_compare != NULL)
905				func = bt->bt_compare;
906		}
907	}
908
909	/*
910	 * We alternate our use of dbta and dbtb so that we can walk
911	 * through the page key-by-key without copying a dbt twice.
912	 * p1 is always the dbt for index i - 1, and p2 for index i.
913	 */
914	p1 = &dbta;
915	p2 = &dbtb;
916
917	/*
918	 * Loop through the entries.  nentries ought to contain the
919	 * actual count, and so is a safe way to terminate the loop;  whether
920	 * we inc. by one or two depends on whether we're a leaf page--
921	 * on a leaf page, we care only about keys.  On internal pages
922	 * and LDUP pages, we want to check the order of all entries.
923	 *
924	 * Note that on IBTREE pages, we start with item 1, since item
925	 * 0 doesn't get looked at by __bam_cmp.
926	 */
927	inp = P_INP(dbp, h);
928	adj = (TYPE(h) == P_LBTREE) ? P_INDX : O_INDX;
929	for (i = (TYPE(h) == P_IBTREE) ? 1 : 0; i < nentries; i += adj) {
930		/*
931		 * Put key i-1, now in p2, into p1, by swapping DBTs and bufs.
932		 */
933		tmp = p1;
934		p1 = p2;
935		p2 = tmp;
936		tmpbuf = buf1;
937		buf1 = buf2;
938		buf2 = tmpbuf;
939
940		/*
941		 * Get key i into p2.
942		 */
943		switch (TYPE(h)) {
944		case P_IBTREE:
945			bi = GET_BINTERNAL(dbp, h, i);
946			if (B_TYPE(bi->type) == B_OVERFLOW) {
947				bo = (BOVERFLOW *)(bi->data);
948				goto overflow;
949			} else {
950				p2->data = bi->data;
951				p2->size = bi->len;
952			}
953
954			/*
955			 * The leftmost key on an internal page must be
956			 * len 0, since it's just a placeholder and
957			 * automatically sorts less than all keys.
958			 *
959			 * XXX
960			 * This criterion does not currently hold!
961			 * See todo list item #1686.  Meanwhile, it's harmless
962			 * to just not check for it.
963			 */
964#if 0
965			if (i == 0 && bi->len != 0) {
966				isbad = 1;
967				EPRINT((env,
968		"Page %lu: lowest key on internal page of nonzero length",
969				    (u_long)pgno));
970			}
971#endif
972			break;
973		case P_LBTREE:
974		case P_LDUP:
975			bk = GET_BKEYDATA(dbp, h, i);
976			if (B_TYPE(bk->type) == B_OVERFLOW) {
977				bo = (BOVERFLOW *)bk;
978				goto overflow;
979			} else {
980				p2->data = bk->data;
981				p2->size = bk->len;
982			}
983			break;
984		default:
985			/*
986			 * This means our caller screwed up and sent us
987			 * an inappropriate page.
988			 */
989			ret = __db_unknown_path(env, "__bam_vrfy_itemorder");
990			goto err;
991		}
992
993		if (0) {
994			/*
995			 * If ovflok != 1, we can't safely go chasing
996			 * overflow pages with the normal routines now;
997			 * they might be unsafe or nonexistent.  Mark this
998			 * page as incomplete and return.
999			 *
1000			 * Note that we don't need to worry about freeing
1001			 * buffers, since they can't have been allocated
1002			 * if overflow items are unsafe.
1003			 */
1004overflow:		if (!ovflok) {
1005				F_SET(pip, VRFY_INCOMPLETE);
1006				goto err;
1007			}
1008
1009			/*
1010			 * Overflow items are safe to chase.  Do so.
1011			 * Fetch the overflow item into p2->data,
1012			 * NULLing it or reallocing it as appropriate.
1013			 *
1014			 * (We set p2->data to buf2 before the call
1015			 * so we're sure to realloc if we can and if p2
1016			 * was just pointing at a non-overflow item.)
1017			 */
1018			p2->data = buf2;
1019			if ((ret = __db_goff(dbp, ip, NULL,
1020			    p2, bo->tlen, bo->pgno, NULL, NULL)) != 0) {
1021				isbad = 1;
1022				EPRINT((env,
1023			    "Page %lu: error %lu in fetching overflow item %lu",
1024				    (u_long)pgno, (u_long)ret, (u_long)i));
1025			}
1026			/* In case it got realloc'ed and thus changed. */
1027			buf2 = p2->data;
1028		}
1029
1030		/* Compare with the last key. */
1031		if (p1->data != NULL && p2->data != NULL) {
1032			cmp = inp[i] == inp[i - adj] ? 0 : func(dbp, p1, p2);
1033
1034			/* comparison succeeded */
1035			if (cmp > 0) {
1036				isbad = 1;
1037				EPRINT((env,
1038				    "Page %lu: out-of-order key at entry %lu",
1039				    (u_long)pgno, (u_long)i));
1040				/* proceed */
1041			} else if (cmp == 0) {
1042				if (inp[i] != inp[i - adj]) {
1043					EPRINT((env,
1044				     "Page %lu: non-dup dup key at entry %lu",
1045					   (u_long)pgno, (u_long)i));
1046					isbad = 1;
1047				}
1048				/*
1049				 * If they compared equally, this
1050				 * had better be a (sub)database with dups.
1051				 * Mark it so we can check during the
1052				 * structure check.
1053				 */
1054				if (pip != NULL)
1055					F_SET(pip, VRFY_HAS_DUPS);
1056				else if (hasdups == 0) {
1057					isbad = 1;
1058					EPRINT((env,
1059	"Page %lu: database with no duplicates has duplicated keys",
1060					    (u_long)pgno));
1061				}
1062
1063				/*
1064				 * If we're a btree leaf, check to see
1065				 * if the data items of these on-page dups are
1066				 * in sorted order.  If not, flag this, so
1067				 * that we can make sure during the
1068				 * structure checks that the DUPSORT flag
1069				 * is unset.
1070				 *
1071				 * At this point i points to a duplicate key.
1072				 * Compare the datum before it (same key)
1073				 * to the datum after it, i.e. i-1 to i+1.
1074				 */
1075				if (TYPE(h) == P_LBTREE) {
1076					/*
1077					 * Unsafe;  continue and we'll pick
1078					 * up the bogus nentries later.
1079					 */
1080					if (i + 1 >= (db_indx_t)nentries)
1081						continue;
1082
1083					/*
1084					 * We don't bother with clever memory
1085					 * management with on-page dups,
1086					 * as it's only really a big win
1087					 * in the overflow case, and overflow
1088					 * dups are probably (?) rare.
1089					 */
1090					if (((ret = __bam_safe_getdata(dbp,
1091					    ip, h, i - 1, ovflok,
1092					    &dup_1, &freedup_1)) != 0) ||
1093					    ((ret = __bam_safe_getdata(dbp,
1094					    ip, h, i + 1, ovflok,
1095					    &dup_2, &freedup_2)) != 0))
1096						goto err;
1097
1098					/*
1099					 * If either of the data are NULL,
1100					 * it's because they're overflows and
1101					 * it's not safe to chase them now.
1102					 * Mark an incomplete and return.
1103					 */
1104					if (dup_1.data == NULL ||
1105					    dup_2.data == NULL) {
1106						DB_ASSERT(env, !ovflok);
1107						F_SET(pip, VRFY_INCOMPLETE);
1108						goto err;
1109					}
1110
1111					/*
1112					 * If the dups are out of order,
1113					 * flag this.  It's not an error
1114					 * until we do the structure check
1115					 * and see whether DUPSORT is set.
1116					 */
1117					if (dupfunc(dbp, &dup_1, &dup_2) > 0)
1118						F_SET(pip, VRFY_DUPS_UNSORTED);
1119
1120					if (freedup_1)
1121						__os_ufree(env, dup_1.data);
1122					if (freedup_2)
1123						__os_ufree(env, dup_2.data);
1124				}
1125			}
1126		}
1127	}
1128
1129err:	if (pip != NULL && ((t_ret =
1130	    __db_vrfy_putpageinfo(env, vdp, pip)) != 0) && ret == 0)
1131		ret = t_ret;
1132
1133	if (buf1 != NULL)
1134		__os_ufree(env, buf1);
1135	if (buf2 != NULL)
1136		__os_ufree(env, buf2);
1137
1138	return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
1139}
1140
1141/*
1142 * __bam_vrfy_structure --
1143 *	Verify the tree structure of a btree database (including the master
1144 *	database containing subdbs).
1145 *
1146 * PUBLIC: int __bam_vrfy_structure __P((DB *, VRFY_DBINFO *, db_pgno_t,
1147 * PUBLIC:     u_int32_t));
1148 */
1149int
1150__bam_vrfy_structure(dbp, vdp, meta_pgno, flags)
1151	DB *dbp;
1152	VRFY_DBINFO *vdp;
1153	db_pgno_t meta_pgno;
1154	u_int32_t flags;
1155{
1156	DB *pgset;
1157	ENV *env;
1158	VRFY_PAGEINFO *mip, *rip;
1159	db_pgno_t root, p;
1160	int t_ret, ret;
1161	u_int32_t nrecs, level, relen, stflags;
1162
1163	env = dbp->env;
1164	mip = rip = 0;
1165	pgset = vdp->pgset;
1166
1167	if ((ret = __db_vrfy_getpageinfo(vdp, meta_pgno, &mip)) != 0)
1168		return (ret);
1169
1170	if ((ret = __db_vrfy_pgset_get(pgset,
1171	    vdp->thread_info, meta_pgno, (int *)&p)) != 0)
1172		goto err;
1173	if (p != 0) {
1174		EPRINT((env,
1175		    "Page %lu: btree metadata page observed twice",
1176		    (u_long)meta_pgno));
1177		ret = DB_VERIFY_BAD;
1178		goto err;
1179	}
1180	if ((ret =
1181	    __db_vrfy_pgset_inc(pgset, vdp->thread_info, meta_pgno)) != 0)
1182		goto err;
1183
1184	root = mip->root;
1185
1186	if (root == 0) {
1187		EPRINT((env,
1188		    "Page %lu: btree metadata page has no root",
1189		    (u_long)meta_pgno));
1190		ret = DB_VERIFY_BAD;
1191		goto err;
1192	}
1193
1194	if ((ret = __db_vrfy_getpageinfo(vdp, root, &rip)) != 0)
1195		goto err;
1196
1197	switch (rip->type) {
1198	case P_IBTREE:
1199	case P_LBTREE:
1200		stflags = flags | DB_ST_TOPLEVEL;
1201		if (F_ISSET(mip, VRFY_HAS_DUPS))
1202			stflags |= DB_ST_DUPOK;
1203		if (F_ISSET(mip, VRFY_HAS_DUPSORT))
1204			stflags |= DB_ST_DUPSORT;
1205		if (F_ISSET(mip, VRFY_HAS_RECNUMS))
1206			stflags |= DB_ST_RECNUM;
1207		ret = __bam_vrfy_subtree(dbp,
1208		    vdp, root, NULL, NULL, stflags, NULL, NULL, NULL);
1209		break;
1210	case P_IRECNO:
1211	case P_LRECNO:
1212		stflags =
1213		    flags | DB_ST_RECNUM | DB_ST_IS_RECNO | DB_ST_TOPLEVEL;
1214		if (mip->re_len > 0)
1215			stflags |= DB_ST_RELEN;
1216		if ((ret = __bam_vrfy_subtree(dbp, vdp,
1217		    root, NULL, NULL, stflags, &level, &nrecs, &relen)) != 0)
1218			goto err;
1219		/*
1220		 * Even if mip->re_len > 0, re_len may come back zero if the
1221		 * tree is empty.  It should be okay to just skip the check in
1222		 * this case, as if there are any non-deleted keys at all,
1223		 * that should never happen.
1224		 */
1225		if (mip->re_len > 0 && relen > 0 && mip->re_len != relen) {
1226			EPRINT((env,
1227			    "Page %lu: recno database has bad re_len %lu",
1228			    (u_long)meta_pgno, (u_long)relen));
1229			ret = DB_VERIFY_BAD;
1230			goto err;
1231		}
1232		ret = 0;
1233		break;
1234	case P_LDUP:
1235		EPRINT((env,
1236		    "Page %lu: duplicate tree referenced from metadata page",
1237		    (u_long)meta_pgno));
1238		ret = DB_VERIFY_BAD;
1239		break;
1240	default:
1241		EPRINT((env,
1242	    "Page %lu: btree root of incorrect type %lu on metadata page",
1243		    (u_long)meta_pgno, (u_long)rip->type));
1244		ret = DB_VERIFY_BAD;
1245		break;
1246	}
1247
1248err:	if (mip != NULL && ((t_ret =
1249	    __db_vrfy_putpageinfo(env, vdp, mip)) != 0) && ret == 0)
1250		ret = t_ret;
1251	if (rip != NULL && ((t_ret =
1252	    __db_vrfy_putpageinfo(env, vdp, rip)) != 0) && ret == 0)
1253		ret = t_ret;
1254	return (ret);
1255}
1256
1257/*
1258 * __bam_vrfy_subtree--
1259 *	Verify a subtree (or entire) btree with specified root.
1260 *
1261 *	Note that this is public because it must be called to verify
1262 *	offpage dup trees, including from hash.
1263 *
1264 * PUBLIC: int __bam_vrfy_subtree __P((DB *, VRFY_DBINFO *, db_pgno_t, void *,
1265 * PUBLIC:     void *, u_int32_t, u_int32_t *, u_int32_t *, u_int32_t *));
1266 */
1267int
1268__bam_vrfy_subtree(dbp, vdp, pgno, l, r, flags, levelp, nrecsp, relenp)
1269	DB *dbp;
1270	VRFY_DBINFO *vdp;
1271	db_pgno_t pgno;
1272	void *l, *r;
1273	u_int32_t flags, *levelp, *nrecsp, *relenp;
1274{
1275	BINTERNAL *li, *ri;
1276	DB *pgset;
1277	DBC *cc;
1278	DB_MPOOLFILE *mpf;
1279	ENV *env;
1280	PAGE *h;
1281	VRFY_CHILDINFO *child;
1282	VRFY_PAGEINFO *pip;
1283	db_indx_t i;
1284	db_pgno_t next_pgno, prev_pgno;
1285	db_recno_t child_nrecs, nrecs;
1286	u_int32_t child_level, child_relen, j, level, relen, stflags;
1287	u_int8_t leaf_type;
1288	int (*func) __P((DB *, const DBT *, const DBT *));
1289	int isbad, p, ret, t_ret, toplevel;
1290
1291	if (levelp != NULL)	/* Don't leave uninitialized on error. */
1292		*levelp = 0;
1293	if (nrecsp != NULL)
1294		*nrecsp = 0;
1295
1296	env = dbp->env;
1297	mpf = dbp->mpf;
1298	h = NULL;
1299	next_pgno = prev_pgno = PGNO_INVALID;
1300	nrecs = 0;
1301	relen = 0;
1302	leaf_type = P_INVALID;
1303	isbad = ret = 0;
1304
1305	/* Provide feedback on our progress to the application. */
1306	if (!LF_ISSET(DB_SALVAGE))
1307		__db_vrfy_struct_feedback(dbp, vdp);
1308
1309	if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
1310		return (ret);
1311
1312	cc = NULL;
1313	level = pip->bt_level;
1314
1315	toplevel = LF_ISSET(DB_ST_TOPLEVEL) ? 1 : 0;
1316	LF_CLR(DB_ST_TOPLEVEL);
1317
1318	/*
1319	 * If this is the root, initialize the vdp's prev- and next-pgno
1320	 * accounting.
1321	 *
1322	 * For each leaf page we hit, we'll want to make sure that
1323	 * vdp->prev_pgno is the same as pip->prev_pgno and vdp->next_pgno is
1324	 * our page number.  Then, we'll set vdp->next_pgno to pip->next_pgno
1325	 * and vdp->prev_pgno to our page number, and the next leaf page in
1326	 * line should be able to do the same verification.
1327	 */
1328	if (toplevel) {
1329		/*
1330		 * Cache the values stored in the vdp so that if we're an
1331		 * auxiliary tree such as an off-page duplicate set, our
1332		 * caller's leaf page chain doesn't get lost.
1333		 */
1334		prev_pgno = vdp->prev_pgno;
1335		next_pgno = vdp->next_pgno;
1336		leaf_type = vdp->leaf_type;
1337		vdp->next_pgno = vdp->prev_pgno = PGNO_INVALID;
1338		vdp->leaf_type = P_INVALID;
1339	}
1340
1341	/*
1342	 * We are recursively descending a btree, starting from the root
1343	 * and working our way out to the leaves.
1344	 *
1345	 * There are four cases we need to deal with:
1346	 *	1. pgno is a recno leaf page.  Any children are overflows.
1347	 *	2. pgno is a duplicate leaf page.  Any children
1348	 *	   are overflow pages;  traverse them, and then return
1349	 *	   level and nrecs.
1350	 *	3. pgno is an ordinary leaf page.  Check whether dups are
1351	 *	   allowed, and if so, traverse any off-page dups or
1352	 *	   overflows.  Then return nrecs and level.
1353	 *	4. pgno is a recno internal page.  Recursively check any
1354	 *	   child pages, making sure their levels are one lower
1355	 *	   and their nrecs sum to ours.
1356	 *	5. pgno is a btree internal page.  Same as #4, plus we
1357	 *	   must verify that for each pair of BINTERNAL entries
1358	 *	   N and N+1, the leftmost item on N's child sorts
1359	 *	   greater than N, and the rightmost item on N's child
1360	 *	   sorts less than N+1.
1361	 *
1362	 * Furthermore, in any sorted page type (P_LDUP, P_LBTREE, P_IBTREE),
1363	 * we need to verify the internal sort order is correct if,
1364	 * due to overflow items, we were not able to do so earlier.
1365	 */
1366	switch (pip->type) {
1367	case P_LRECNO:
1368	case P_LDUP:
1369	case P_LBTREE:
1370		/*
1371		 * Cases 1, 2 and 3.
1372		 *
1373		 * We're some sort of leaf page;  verify
1374		 * that our linked list of leaves is consistent.
1375		 */
1376		if (vdp->leaf_type == P_INVALID) {
1377			/*
1378			 * First leaf page.  Set the type that all its
1379			 * successors should be, and verify that our prev_pgno
1380			 * is PGNO_INVALID.
1381			 */
1382			vdp->leaf_type = pip->type;
1383			if (pip->prev_pgno != PGNO_INVALID)
1384				goto bad_prev;
1385		} else {
1386			/*
1387			 * Successor leaf page. Check our type, the previous
1388			 * page's next_pgno, and our prev_pgno.
1389			 */
1390			if (pip->type != vdp->leaf_type) {
1391				isbad = 1;
1392				EPRINT((env,
1393	"Page %lu: unexpected page type %lu found in leaf chain (expected %lu)",
1394				    (u_long)pip->pgno, (u_long)pip->type,
1395				    (u_long)vdp->leaf_type));
1396			}
1397
1398			/*
1399			 * Don't do the prev/next_pgno checks if we've lost
1400			 * leaf pages due to another corruption.
1401			 */
1402			if (!F_ISSET(vdp, VRFY_LEAFCHAIN_BROKEN)) {
1403				if (pip->pgno != vdp->next_pgno) {
1404					isbad = 1;
1405					EPRINT((env,
1406	"Page %lu: incorrect next_pgno %lu found in leaf chain (should be %lu)",
1407					    (u_long)vdp->prev_pgno,
1408					    (u_long)vdp->next_pgno,
1409					    (u_long)pip->pgno));
1410				}
1411				if (pip->prev_pgno != vdp->prev_pgno) {
1412bad_prev:				isbad = 1;
1413					EPRINT((env,
1414    "Page %lu: incorrect prev_pgno %lu found in leaf chain (should be %lu)",
1415					    (u_long)pip->pgno,
1416					    (u_long)pip->prev_pgno,
1417					    (u_long)vdp->prev_pgno));
1418				}
1419			}
1420		}
1421		vdp->prev_pgno = pip->pgno;
1422		vdp->next_pgno = pip->next_pgno;
1423		F_CLR(vdp, VRFY_LEAFCHAIN_BROKEN);
1424
1425		/*
1426		 * Overflow pages are common to all three leaf types;
1427		 * traverse the child list, looking for overflows.
1428		 */
1429		if ((ret = __db_vrfy_childcursor(vdp, &cc)) != 0)
1430			goto err;
1431		for (ret = __db_vrfy_ccset(cc, pgno, &child); ret == 0;
1432		    ret = __db_vrfy_ccnext(cc, &child))
1433			if (child->type == V_OVERFLOW &&
1434			    (ret = __db_vrfy_ovfl_structure(dbp, vdp,
1435			    child->pgno, child->tlen,
1436			    flags | DB_ST_OVFL_LEAF)) != 0) {
1437				if (ret == DB_VERIFY_BAD)
1438					isbad = 1;
1439				else
1440					goto done;
1441			}
1442
1443		if ((ret = __db_vrfy_ccclose(cc)) != 0)
1444			goto err;
1445		cc = NULL;
1446
1447		/* Case 1 */
1448		if (pip->type == P_LRECNO) {
1449			if (!LF_ISSET(DB_ST_IS_RECNO) &&
1450			    !(LF_ISSET(DB_ST_DUPOK) &&
1451			    !LF_ISSET(DB_ST_DUPSORT))) {
1452				isbad = 1;
1453				EPRINT((env,
1454				    "Page %lu: recno leaf page non-recno tree",
1455				    (u_long)pgno));
1456				goto done;
1457			}
1458			goto leaf;
1459		} else if (LF_ISSET(DB_ST_IS_RECNO)) {
1460			/*
1461			 * It's a non-recno leaf.  Had better not be a recno
1462			 * subtree.
1463			 */
1464			isbad = 1;
1465			EPRINT((env,
1466			    "Page %lu: non-recno leaf page in recno tree",
1467			    (u_long)pgno));
1468			goto done;
1469		}
1470
1471		/* Case 2--no more work. */
1472		if (pip->type == P_LDUP)
1473			goto leaf;
1474
1475		/* Case 3 */
1476
1477		/* Check if we have any dups. */
1478		if (F_ISSET(pip, VRFY_HAS_DUPS)) {
1479			/* If dups aren't allowed in this btree, trouble. */
1480			if (!LF_ISSET(DB_ST_DUPOK)) {
1481				isbad = 1;
1482				EPRINT((env,
1483				    "Page %lu: duplicates in non-dup btree",
1484				    (u_long)pgno));
1485			} else {
1486				/*
1487				 * We correctly have dups.  If any are off-page,
1488				 * traverse those btrees recursively.
1489				 */
1490				if ((ret =
1491				    __db_vrfy_childcursor(vdp, &cc)) != 0)
1492					goto err;
1493				for (ret = __db_vrfy_ccset(cc, pgno, &child);
1494				    ret == 0;
1495				    ret = __db_vrfy_ccnext(cc, &child)) {
1496					stflags =
1497					    flags | DB_ST_RECNUM | DB_ST_DUPSET;
1498					/* Skip any overflow entries. */
1499					if (child->type == V_DUPLICATE) {
1500						if ((ret = __db_vrfy_duptype(
1501						    dbp, vdp, child->pgno,
1502						    stflags)) != 0) {
1503							isbad = 1;
1504							/* Next child. */
1505							continue;
1506						}
1507						if ((ret = __bam_vrfy_subtree(
1508						    dbp, vdp, child->pgno,
1509						    NULL, NULL,
1510						    stflags | DB_ST_TOPLEVEL,
1511						    NULL, NULL, NULL)) != 0) {
1512							if (ret ==
1513							    DB_VERIFY_BAD)
1514								isbad = 1;
1515							else
1516								goto err;
1517						}
1518					}
1519				}
1520
1521				if ((ret = __db_vrfy_ccclose(cc)) != 0)
1522					goto err;
1523				cc = NULL;
1524
1525				/*
1526				 * If VRFY_DUPS_UNSORTED is set,
1527				 * DB_ST_DUPSORT had better not be.
1528				 */
1529				if (F_ISSET(pip, VRFY_DUPS_UNSORTED) &&
1530				    LF_ISSET(DB_ST_DUPSORT)) {
1531					isbad = 1;
1532					EPRINT((env,
1533		    "Page %lu: unsorted duplicate set in sorted-dup database",
1534					    (u_long)pgno));
1535				}
1536			}
1537		}
1538		goto leaf;
1539	case P_IBTREE:
1540	case P_IRECNO:
1541		/* We handle these below. */
1542		break;
1543	default:
1544		/*
1545		 * If a P_IBTREE or P_IRECNO contains a reference to an
1546		 * invalid page, we'll wind up here;  handle it gracefully.
1547		 * Note that the code at the "done" label assumes that the
1548		 * current page is a btree/recno one of some sort;  this
1549		 * is not the case here, so we goto err.
1550		 *
1551		 * If the page is entirely zeroed, its pip->type will be a lie
1552		 * (we assumed it was a hash page, as they're allowed to be
1553		 * zeroed);  handle this case specially.
1554		 */
1555		if (F_ISSET(pip, VRFY_IS_ALLZEROES))
1556			ZEROPG_ERR_PRINT(env, pgno, "btree or recno page");
1557		else
1558			EPRINT((env,
1559	    "Page %lu: btree or recno page is of inappropriate type %lu",
1560			    (u_long)pgno, (u_long)pip->type));
1561
1562		/*
1563		 * We probably lost a leaf page (or more if this was an
1564		 * internal page) from our prev/next_pgno chain.  Flag
1565		 * that this is expected;  we don't want or need to
1566		 * spew error messages about erroneous prev/next_pgnos,
1567		 * since that's probably not the real problem.
1568		 */
1569		F_SET(vdp, VRFY_LEAFCHAIN_BROKEN);
1570
1571		ret = DB_VERIFY_BAD;
1572		goto err;
1573	}
1574
1575	/*
1576	 * Cases 4 & 5: This is a btree or recno internal page.  For each child,
1577	 * recurse, keeping a running count of nrecs and making sure the level
1578	 * is always reasonable.
1579	 */
1580	if ((ret = __db_vrfy_childcursor(vdp, &cc)) != 0)
1581		goto err;
1582	for (ret = __db_vrfy_ccset(cc, pgno, &child); ret == 0;
1583	    ret = __db_vrfy_ccnext(cc, &child))
1584		if (child->type == V_RECNO) {
1585			if (pip->type != P_IRECNO) {
1586				ret = __db_unknown_path(
1587				    env, "__bam_vrfy_subtree");
1588				goto err;
1589			}
1590			if ((ret = __bam_vrfy_subtree(dbp, vdp, child->pgno,
1591			    NULL, NULL, flags, &child_level, &child_nrecs,
1592			    &child_relen)) != 0) {
1593				if (ret == DB_VERIFY_BAD)
1594					isbad = 1;
1595				else
1596					goto done;
1597			}
1598
1599			if (LF_ISSET(DB_ST_RELEN)) {
1600				if (relen == 0)
1601					relen = child_relen;
1602				/*
1603				 * child_relen may be zero if the child subtree
1604				 * is empty.
1605				 */
1606				else if (child_relen > 0 &&
1607				    relen != child_relen) {
1608					isbad = 1;
1609					EPRINT((env,
1610			   "Page %lu: recno page returned bad re_len %lu",
1611					    (u_long)child->pgno,
1612					    (u_long)child_relen));
1613				}
1614				if (relenp)
1615					*relenp = relen;
1616			}
1617			if (LF_ISSET(DB_ST_RECNUM)) {
1618				if (child->nrecs != child_nrecs) {
1619					isbad = 1;
1620					EPRINT((env,
1621		"Page %lu: record count incorrect: actual %lu, in record %lu",
1622					    (u_long)child->pgno,
1623					    (u_long)child_nrecs,
1624					    (u_long)child->nrecs));
1625				}
1626				nrecs += child_nrecs;
1627			}
1628			if (isbad == 0 && level != child_level + 1) {
1629				isbad = 1;
1630				EPRINT((env,
1631		"Page %lu: recno level incorrect: got %lu, expected %lu",
1632				    (u_long)child->pgno, (u_long)child_level,
1633				    (u_long)(level - 1)));
1634			}
1635		} else if (child->type == V_OVERFLOW) {
1636			/*
1637			 * It is possible for one internal page to reference
1638			 * a single overflow page twice, if all the items
1639			 * in the subtree referenced by slot 0 are deleted,
1640			 * then a similar number of items are put back
1641			 * before the key that formerly had been in slot 1.
1642			 *
1643			 * (Btree doesn't look at the key in slot 0, so the
1644			 * fact that the key formerly at slot 1 is the "wrong"
1645			 * parent of the stuff in the slot 0 subtree isn't
1646			 * really incorrect.)
1647			 *
1648			 * __db_vrfy_ovfl_structure is designed to be
1649			 * efficiently called multiple times for multiple
1650			 * references;  call it here as many times as is
1651			 * appropriate.
1652			 */
1653
1654			/* Otherwise, __db_vrfy_childput would be broken. */
1655			DB_ASSERT(env, child->refcnt >= 1);
1656
1657			/*
1658			 * An overflow referenced more than twice here
1659			 * shouldn't happen.
1660			 */
1661			if (child->refcnt > 2) {
1662				isbad = 1;
1663				EPRINT((env,
1664    "Page %lu: overflow page %lu referenced more than twice from internal page",
1665				    (u_long)pgno, (u_long)child->pgno));
1666			} else
1667				for (j = 0; j < child->refcnt; j++)
1668					if ((ret = __db_vrfy_ovfl_structure(dbp,
1669					    vdp, child->pgno, child->tlen,
1670					    flags)) != 0) {
1671						if (ret == DB_VERIFY_BAD)
1672							isbad = 1;
1673						else
1674							goto done;
1675					}
1676		}
1677
1678	if ((ret = __db_vrfy_ccclose(cc)) != 0)
1679		goto err;
1680	cc = NULL;
1681
1682	/* We're done with case 4. */
1683	if (pip->type == P_IRECNO)
1684		goto done;
1685
1686	/*
1687	 * Case 5.  Btree internal pages.
1688	 * As described above, we need to iterate through all the
1689	 * items on the page and make sure that our children sort appropriately
1690	 * with respect to them.
1691	 *
1692	 * For each entry, li will be the "left-hand" key for the entry
1693	 * itself, which must sort lower than all entries on its child;
1694	 * ri will be the key to its right, which must sort greater.
1695	 */
1696	if (h == NULL &&
1697	    (ret = __memp_fget(mpf, &pgno, vdp->thread_info, NULL, 0, &h)) != 0)
1698		goto err;
1699	for (i = 0; i < pip->entries; i += O_INDX) {
1700		li = GET_BINTERNAL(dbp, h, i);
1701		ri = (i + O_INDX < pip->entries) ?
1702		    GET_BINTERNAL(dbp, h, i + O_INDX) : r;
1703
1704		/*
1705		 * The leftmost key is forcibly sorted less than all entries,
1706		 * so don't bother passing it.
1707		 */
1708		if ((ret = __bam_vrfy_subtree(dbp, vdp, li->pgno,
1709		    i == 0 ? NULL : li, ri, flags, &child_level,
1710		    &child_nrecs, NULL)) != 0) {
1711			if (ret == DB_VERIFY_BAD)
1712				isbad = 1;
1713			else
1714				goto done;
1715		}
1716
1717		if (LF_ISSET(DB_ST_RECNUM)) {
1718			/*
1719			 * Keep a running tally on the actual record count so
1720			 * we can return it to our parent (if we have one) or
1721			 * compare it to the NRECS field if we're a root page.
1722			 */
1723			nrecs += child_nrecs;
1724
1725			/*
1726			 * Make sure the actual record count of the child
1727			 * is equal to the value in the BINTERNAL structure.
1728			 */
1729			if (li->nrecs != child_nrecs) {
1730				isbad = 1;
1731				EPRINT((env,
1732	"Page %lu: item %lu has incorrect record count of %lu, should be %lu",
1733				    (u_long)pgno, (u_long)i, (u_long)li->nrecs,
1734				    (u_long)child_nrecs));
1735			}
1736		}
1737
1738		if (level != child_level + 1) {
1739			isbad = 1;
1740			EPRINT((env,
1741		"Page %lu: Btree level incorrect: got %lu, expected %lu",
1742			    (u_long)li->pgno,
1743			    (u_long)child_level, (u_long)(level - 1)));
1744		}
1745	}
1746
1747	if (0) {
1748leaf:		level = LEAFLEVEL;
1749		if (LF_ISSET(DB_ST_RECNUM))
1750			nrecs = pip->rec_cnt;
1751
1752		/* XXX
1753		 * We should verify that the record count on a leaf page
1754		 * is the sum of the number of keys and the number of
1755		 * records in its off-page dups.  This requires looking
1756		 * at the page again, however, and it may all be changing
1757		 * soon, so for now we don't bother.
1758		 */
1759
1760		if (LF_ISSET(DB_ST_RELEN) && relenp)
1761			*relenp = pip->re_len;
1762	}
1763done:	if (F_ISSET(pip, VRFY_INCOMPLETE) && isbad == 0 && ret == 0) {
1764		/*
1765		 * During the page-by-page pass, item order verification was
1766		 * not finished due to the presence of overflow items.  If
1767		 * isbad == 0, though, it's now safe to do so, as we've
1768		 * traversed any child overflow pages.  Do it.
1769		 */
1770		if (h == NULL && (ret = __memp_fget(mpf, &pgno,
1771		    vdp->thread_info, NULL, 0, &h)) != 0)
1772			goto err;
1773		if ((ret = __bam_vrfy_itemorder(dbp,
1774		    vdp, vdp->thread_info, h, pgno, 0, 1, 0, flags)) != 0)
1775			goto err;
1776		F_CLR(pip, VRFY_INCOMPLETE);
1777	}
1778
1779	/*
1780	 * It's possible to get to this point with a page that has no
1781	 * items, but without having detected any sort of failure yet.
1782	 * Having zero items is legal if it's a leaf--it may be the
1783	 * root page in an empty tree, or the tree may have been
1784	 * modified with the DB_REVSPLITOFF flag set (there's no way
1785	 * to tell from what's on disk).  For an internal page,
1786	 * though, having no items is a problem (all internal pages
1787	 * must have children).
1788	 */
1789	if (isbad == 0 && ret == 0) {
1790		if (h == NULL && (ret = __memp_fget(mpf, &pgno,
1791		    vdp->thread_info, NULL, 0, &h)) != 0)
1792			goto err;
1793
1794		if (NUM_ENT(h) == 0 && ISINTERNAL(h)) {
1795			isbad = 1;
1796			EPRINT((env,
1797		    "Page %lu: internal page is empty and should not be",
1798			    (u_long)pgno));
1799			goto err;
1800		}
1801	}
1802
1803	/*
1804	 * Our parent has sent us BINTERNAL pointers to parent records
1805	 * so that we can verify our place with respect to them.  If it's
1806	 * appropriate--we have a default sort function--verify this.
1807	 */
1808	if (isbad == 0 && ret == 0 && !LF_ISSET(DB_NOORDERCHK) &&
1809	    pip->type != P_IRECNO && pip->type != P_LRECNO) {
1810		if (h == NULL && (ret = __memp_fget(mpf, &pgno,
1811		    vdp->thread_info, NULL, 0, &h)) != 0)
1812			goto err;
1813
1814		/*
1815		 * __bam_vrfy_treeorder needs to know what comparison function
1816		 * to use.  If DB_ST_DUPSET is set, we're in a duplicate tree
1817		 * and we use the duplicate comparison function;  otherwise,
1818		 * use the btree one.  If unset, use the default, of course.
1819		 */
1820		func = LF_ISSET(DB_ST_DUPSET) ? dbp->dup_compare :
1821		    ((BTREE *)dbp->bt_internal)->bt_compare;
1822		if (func == NULL)
1823			func = __bam_defcmp;
1824
1825		if ((ret = __bam_vrfy_treeorder(dbp,
1826		    vdp->thread_info, h, l, r, func, flags)) != 0) {
1827			if (ret == DB_VERIFY_BAD)
1828				isbad = 1;
1829			else
1830				goto err;
1831		}
1832	}
1833
1834	/*
1835	 * This is guaranteed to succeed for leaf pages, but no harm done.
1836	 *
1837	 * Internal pages below the top level do not store their own
1838	 * record numbers, so we skip them.
1839	 */
1840	if (LF_ISSET(DB_ST_RECNUM) && nrecs != pip->rec_cnt && toplevel) {
1841		isbad = 1;
1842		EPRINT((env,
1843		    "Page %lu: bad record count: has %lu records, claims %lu",
1844		    (u_long)pgno, (u_long)nrecs, (u_long)pip->rec_cnt));
1845	}
1846
1847	if (levelp)
1848		*levelp = level;
1849	if (nrecsp)
1850		*nrecsp = nrecs;
1851
1852	pgset = vdp->pgset;
1853	if ((ret = __db_vrfy_pgset_get(pgset,
1854	    vdp->thread_info, pgno, &p)) != 0)
1855		goto err;
1856	if (p != 0) {
1857		isbad = 1;
1858		EPRINT((env, "Page %lu: linked twice", (u_long)pgno));
1859	} else if ((ret =
1860	    __db_vrfy_pgset_inc(pgset, vdp->thread_info, pgno)) != 0)
1861		goto err;
1862
1863	if (toplevel)
1864		/*
1865		 * The last page's next_pgno in the leaf chain should have been
1866		 * PGNO_INVALID.
1867		 */
1868		if (vdp->next_pgno != PGNO_INVALID) {
1869			isbad = 1;
1870			EPRINT((env, "Page %lu: unterminated leaf chain",
1871			    (u_long)vdp->prev_pgno));
1872		}
1873
1874err:	if (toplevel) {
1875		/* Restore our caller's settings. */
1876		vdp->next_pgno = next_pgno;
1877		vdp->prev_pgno = prev_pgno;
1878		vdp->leaf_type = leaf_type;
1879	}
1880
1881	if (h != NULL && (t_ret = __memp_fput(mpf,
1882	    vdp->thread_info, h, DB_PRIORITY_UNCHANGED)) != 0 && ret == 0)
1883		ret = t_ret;
1884	if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
1885		ret = t_ret;
1886	if (cc != NULL && ((t_ret = __db_vrfy_ccclose(cc)) != 0) && ret == 0)
1887		ret = t_ret;
1888	return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
1889}
1890
1891/*
1892 * __bam_vrfy_treeorder --
1893 *	Verify that the lowest key on a page sorts greater than the
1894 *	BINTERNAL which points to it (lp), and the highest key
1895 *	sorts less than the BINTERNAL above that (rp).
1896 *
1897 *	If lp is NULL, this means that it was the leftmost key on the
1898 *	parent, which (regardless of sort function) sorts less than
1899 *	all keys.  No need to check it.
1900 *
1901 *	If rp is NULL, lp was the highest key on the parent, so there's
1902 *	no higher key we must sort less than.
1903 */
1904static int
1905__bam_vrfy_treeorder(dbp, ip, h, lp, rp, func, flags)
1906	DB *dbp;
1907	DB_THREAD_INFO *ip;
1908	PAGE *h;
1909	BINTERNAL *lp, *rp;
1910	int (*func) __P((DB *, const DBT *, const DBT *));
1911	u_int32_t flags;
1912{
1913	BOVERFLOW *bo;
1914	DBT dbt;
1915	ENV *env;
1916	db_indx_t last;
1917	int ret, cmp;
1918
1919	env = dbp->env;
1920	memset(&dbt, 0, sizeof(DBT));
1921	F_SET(&dbt, DB_DBT_MALLOC);
1922	ret = 0;
1923
1924	/*
1925	 * Empty pages are sorted correctly by definition.  We check
1926	 * to see whether they ought to be empty elsewhere;  leaf
1927	 * pages legally may be.
1928	 */
1929	if (NUM_ENT(h) == 0)
1930		return (0);
1931
1932	switch (TYPE(h)) {
1933	case P_IBTREE:
1934	case P_LDUP:
1935		last = NUM_ENT(h) - O_INDX;
1936		break;
1937	case P_LBTREE:
1938		last = NUM_ENT(h) - P_INDX;
1939		break;
1940	default:
1941		return (__db_unknown_path(env, "__bam_vrfy_treeorder"));
1942	}
1943
1944	/*
1945	 * The key on page h, the child page, is more likely to be
1946	 * an overflow page, so we pass its offset, rather than lp/rp's,
1947	 * into __bam_cmp.  This will take advantage of __db_moff.
1948	 */
1949
1950	/*
1951	 * Skip first-item check if we're an internal page--the first
1952	 * entry on an internal page is treated specially by __bam_cmp,
1953	 * so what's on the page shouldn't matter.  (Plus, since we're passing
1954	 * our page and item 0 as to __bam_cmp, we'll sort before our
1955	 * parent and falsely report a failure.)
1956	 */
1957	if (lp != NULL && TYPE(h) != P_IBTREE) {
1958		if (lp->type == B_KEYDATA) {
1959			dbt.data = lp->data;
1960			dbt.size = lp->len;
1961		} else if (lp->type == B_OVERFLOW) {
1962			bo = (BOVERFLOW *)lp->data;
1963			if ((ret = __db_goff(dbp, ip, NULL, &dbt,
1964			    bo->tlen, bo->pgno, NULL, NULL)) != 0)
1965				return (ret);
1966		} else
1967			return (
1968			    __db_unknown_path(env, "__bam_vrfy_treeorder"));
1969
1970		/* On error, fall through, free if needed, and return. */
1971		if ((ret = __bam_cmp(dbp, ip,
1972		    NULL, &dbt, h, 0, func, &cmp)) == 0) {
1973			if (cmp > 0) {
1974				EPRINT((env,
1975	    "Page %lu: first item on page sorted greater than parent entry",
1976				    (u_long)PGNO(h)));
1977				ret = DB_VERIFY_BAD;
1978			}
1979		} else
1980			EPRINT((env,
1981			    "Page %lu: first item on page had comparison error",
1982			    (u_long)PGNO(h)));
1983
1984		if (dbt.data != lp->data)
1985			__os_ufree(env, dbt.data);
1986		if (ret != 0)
1987			return (ret);
1988	}
1989
1990	if (rp != NULL) {
1991		if (rp->type == B_KEYDATA) {
1992			dbt.data = rp->data;
1993			dbt.size = rp->len;
1994		} else if (rp->type == B_OVERFLOW) {
1995			bo = (BOVERFLOW *)rp->data;
1996			if ((ret = __db_goff(dbp, ip, NULL, &dbt,
1997			    bo->tlen, bo->pgno, NULL, NULL)) != 0)
1998				return (ret);
1999		} else
2000			return (
2001			    __db_unknown_path(env, "__bam_vrfy_treeorder"));
2002
2003		/* On error, fall through, free if needed, and return. */
2004		if ((ret = __bam_cmp(dbp, ip,
2005		    NULL, &dbt, h, last, func, &cmp)) == 0) {
2006			if (cmp < 0) {
2007				EPRINT((env,
2008	    "Page %lu: last item on page sorted greater than parent entry",
2009				    (u_long)PGNO(h)));
2010				ret = DB_VERIFY_BAD;
2011			}
2012		} else
2013			EPRINT((env,
2014			    "Page %lu: last item on page had comparison error",
2015			    (u_long)PGNO(h)));
2016
2017		if (dbt.data != rp->data)
2018			__os_ufree(env, dbt.data);
2019	}
2020
2021	return (ret);
2022}
2023
2024/*
2025 * __bam_salvage --
2026 *	Safely dump out anything that looks like a key on an alleged
2027 *	btree leaf page, also mark overflow pages as seen.  For internal btree
2028 *	pages, just mark any overflow pages as seen.
2029 *
2030 * PUBLIC: int __bam_salvage __P((DB *, VRFY_DBINFO *,
2031 * PUBLIC:     db_pgno_t, u_int32_t, PAGE *, void *,
2032 * PUBLIC:     int (*)(void *, const void *), DBT *, u_int32_t));
2033 */
2034int
2035__bam_salvage(dbp, vdp, pgno, pgtype, h, handle, callback, key, flags)
2036	DB *dbp;
2037	VRFY_DBINFO *vdp;
2038	db_pgno_t pgno;
2039	u_int32_t pgtype;
2040	PAGE *h;
2041	void *handle;
2042	int (*callback) __P((void *, const void *));
2043	DBT *key;
2044	u_int32_t flags;
2045{
2046	BKEYDATA *bk;
2047	BOVERFLOW *bo;
2048	DBT dbt, repldbt, unknown_key, unknown_data;
2049	ENV *env;
2050	VRFY_ITEM *pgmap;
2051	db_indx_t i, last, beg, end, *inp;
2052	db_pgno_t ovflpg;
2053	u_int32_t himark;
2054	void *ovflbuf;
2055	int adj, ret, t_ret, t2_ret;
2056
2057	env = dbp->env;
2058	ovflbuf = pgmap = NULL;
2059	inp = P_INP(dbp, h);
2060
2061	memset(&dbt, 0, sizeof(DBT));
2062	dbt.flags = DB_DBT_REALLOC;
2063	memset(&repldbt, 0, sizeof(DBT));
2064
2065	DB_INIT_DBT(unknown_key, "UNKNOWN_KEY", sizeof("UNKNOWN_KEY") - 1);
2066	DB_INIT_DBT(unknown_data, "UNKNOWN_DATA", sizeof("UNKNOWN_DATA") - 1);
2067
2068	/*
2069	 * Allocate a buffer for overflow items.  Start at one page;
2070	 * __db_safe_goff will realloc as needed.
2071	 */
2072	if ((ret = __os_malloc(env, dbp->pgsize, &ovflbuf)) != 0)
2073		goto err;
2074
2075	if (LF_ISSET(DB_AGGRESSIVE) && (ret =
2076	    __os_calloc(env, dbp->pgsize, sizeof(pgmap[0]), &pgmap)) != 0)
2077		goto err;
2078
2079	/*
2080	 * Loop through the inp array, spitting out key/data pairs.
2081	 *
2082	 * If we're salvaging normally, loop from 0 through NUM_ENT(h).  If
2083	 * we're being aggressive, loop until we hit the end of the page --
2084	 * NUM_ENT() may be bogus.
2085	 */
2086	himark = dbp->pgsize;
2087	for (i = 0, last = UINT16_MAX;; i += O_INDX) {
2088		/* If we're not aggressive, break when we hit NUM_ENT(h). */
2089		if (!LF_ISSET(DB_AGGRESSIVE) && i >= NUM_ENT(h))
2090			break;
2091
2092		/* Verify the current item. */
2093		t_ret =
2094		    __db_vrfy_inpitem(dbp, h, pgno, i, 1, flags, &himark, NULL);
2095
2096		if (t_ret != 0) {
2097			/*
2098			 * If this is a btree leaf and we've printed out a key
2099			 * but not its associated data item, fix this imbalance
2100			 * by printing an "UNKNOWN_DATA".
2101			 */
2102			if (pgtype == P_LBTREE && i % P_INDX == 1 &&
2103			    last == i - 1 && (t2_ret = __db_vrfy_prdbt(
2104			    &unknown_data,
2105			    0, " ", handle, callback, 0, vdp)) != 0) {
2106				if (ret == 0)
2107					ret = t2_ret;
2108				goto err;
2109			}
2110
2111			/*
2112			 * Don't return DB_VERIFY_FATAL; it's private and means
2113			 * only that we can't go on with this page, not with
2114			 * the whole database.  It's not even an error if we've
2115			 * run into it after NUM_ENT(h).
2116			 */
2117			if (t_ret == DB_VERIFY_FATAL) {
2118				if (i < NUM_ENT(h) && ret == 0)
2119					ret = DB_VERIFY_BAD;
2120				break;
2121			}
2122			continue;
2123		}
2124
2125		/*
2126		 * If this returned 0, it's safe to print or (carefully)
2127		 * try to fetch.
2128		 *
2129		 * We only print deleted items if DB_AGGRESSIVE is set.
2130		 */
2131		bk = GET_BKEYDATA(dbp, h, i);
2132		if (!LF_ISSET(DB_AGGRESSIVE) && B_DISSET(bk->type))
2133			continue;
2134
2135		/*
2136		 * If this is a btree leaf and we're about to print out a data
2137		 * item for which we didn't print out a key, fix this imbalance
2138		 * by printing an "UNKNOWN_KEY".
2139		 */
2140		if (pgtype == P_LBTREE && i % P_INDX == 1 &&
2141		    last != i - 1 && (t_ret = __db_vrfy_prdbt(
2142		    &unknown_key, 0, " ", handle, callback, 0, vdp)) != 0) {
2143			if (ret == 0)
2144				ret = t_ret;
2145			goto err;
2146		}
2147		last = i;
2148
2149		/*
2150		 * We're going to go try to print the next item.  If key is
2151		 * non-NULL, we're a dup page, so we've got to print the key
2152		 * first, unless DB_SA_SKIPFIRSTKEY is set and we're on the
2153		 * first entry.
2154		 */
2155		if (key != NULL && (i != 0 || !LF_ISSET(DB_SA_SKIPFIRSTKEY)))
2156			if ((t_ret = __db_vrfy_prdbt(key,
2157			    0, " ", handle, callback, 0, vdp)) != 0) {
2158				if (ret == 0)
2159					ret = t_ret;
2160				goto err;
2161			}
2162
2163		beg = end = inp[i];
2164		switch (B_TYPE(bk->type)) {
2165		case B_DUPLICATE:
2166			if (pgtype == P_IBTREE)
2167				break;
2168
2169			end = beg + BOVERFLOW_SIZE - 1;
2170			/*
2171			 * If we're not on a normal btree leaf page, there
2172			 * shouldn't be off-page dup sets.  Something's
2173			 * confused; just drop it, and the code to pick up
2174			 * unlinked offpage dup sets will print it out
2175			 * with key "UNKNOWN" later.
2176			 */
2177			if (pgtype != P_LBTREE)
2178				break;
2179
2180			bo = (BOVERFLOW *)bk;
2181
2182			/*
2183			 * If the page number is unreasonable, or if this is
2184			 * supposed to be a key item, output "UNKNOWN_KEY" --
2185			 * the best we can do is run into the data items in
2186			 * the unlinked offpage dup pass.
2187			 */
2188			if (!IS_VALID_PGNO(bo->pgno) || (i % P_INDX == 0)) {
2189				/* Not much to do on failure. */
2190				if ((t_ret = __db_vrfy_prdbt(&unknown_key,
2191				    0, " ", handle, callback, 0, vdp)) != 0) {
2192					if (ret == 0)
2193						ret = t_ret;
2194					goto err;
2195				}
2196				break;
2197			}
2198
2199			/* Don't stop on error. */
2200			if ((t_ret = __db_salvage_duptree(dbp,
2201			    vdp, bo->pgno, &dbt, handle, callback,
2202			    flags | DB_SA_SKIPFIRSTKEY)) != 0 && ret == 0)
2203				ret = t_ret;
2204
2205			break;
2206		case B_KEYDATA:
2207			if (pgtype == P_IBTREE)
2208				break;
2209
2210			end = (db_indx_t)DB_ALIGN(
2211			    beg + bk->len, sizeof(u_int32_t)) - 1;
2212
2213			dbt.data = bk->data;
2214			dbt.size = bk->len;
2215			if ((t_ret = __db_vrfy_prdbt(&dbt,
2216			    0, " ", handle, callback, 0, vdp)) != 0) {
2217				if (ret == 0)
2218					ret = t_ret;
2219				goto err;
2220			}
2221			break;
2222		case B_OVERFLOW:
2223			if (pgtype != P_IBTREE)
2224				end = beg + BOVERFLOW_SIZE - 1;
2225			bo = (BOVERFLOW *)bk;
2226
2227			/*
2228			 * Check for replicated overflow keys, so that we only
2229			 * call __db_safe_goff once per overflow page.  If we
2230			 * get the same offset as the previous key just re-use
2231			 * the previous dbt.
2232			 *
2233			 * P_IBTREE pages will never have replicated overflow
2234			 * keys and will never pass this test.
2235			 */
2236			adj = pgtype == P_IBTREE ? O_INDX : P_INDX;
2237			if (i > adj - 1 && i % adj == 0 &&
2238			    inp[i] == inp[i - adj]) {
2239				DB_ASSERT(env, pgtype != P_IBTREE);
2240				dbt = repldbt;
2241			} else {
2242				if (pgtype == P_IBTREE) {
2243					/*
2244					 * If we're looking at a P_IBTREE, we
2245					 * just want to mark the overflow page
2246					 * as seen.  P_IBTREE entries will
2247					 * always fail the above test, we'll
2248					 * always wind up here.
2249					 *
2250					 * Note that this call to __db_safe_goff
2251					 * differs from the non-P_IBTREE call.
2252					 *
2253					 * Only call __db_safe_goff if the
2254					 * overflow page hasn't been seen.
2255					 */
2256					ovflpg = ((BOVERFLOW *)
2257					    ((BINTERNAL *)bk)->data)->pgno;
2258					if (__db_salvage_isdone(vdp,
2259					    ovflpg) == 0 && (t_ret =
2260					    __db_safe_goff(dbp, vdp, ovflpg,
2261					    &dbt, &ovflbuf, flags)) != 0 &&
2262					    ret == 0)
2263						ret = t_ret;
2264					break;
2265				}
2266
2267				/* Don't stop on error. */
2268				if ((t_ret = __db_safe_goff(dbp, vdp, bo->pgno,
2269				    &dbt, &ovflbuf, flags)) != 0 && ret == 0)
2270					ret = t_ret;
2271
2272				/*
2273				 * If this is a key, save it in case the next
2274				 * key is a replicated overflow, so we don't
2275				 * call __db_safe_goff again.  Copy out dbt.data
2276				 * in case that pointer gets realloc'd when
2277				 * getting a data item.
2278				 */
2279				if (i % P_INDX == 0) {
2280					if ((ret = __os_realloc(env,
2281					    dbt.size, &repldbt.data)) != 0)
2282						goto err;
2283					memcpy(repldbt.data, dbt.data,
2284					    dbt.size);
2285					repldbt.size = dbt.size;
2286				}
2287
2288			}
2289
2290			if ((t_ret = __db_vrfy_prdbt(
2291			    t_ret == 0 ? &dbt : &unknown_key,
2292			    0, " ", handle, callback, 0, vdp)) != 0 && ret == 0)
2293				ret = t_ret;
2294			break;
2295		default:
2296			/*
2297			 * We should never get here; __db_vrfy_inpitem should
2298			 * not be returning 0 if bk->type is unrecognizable.
2299			 */
2300			t_ret = __db_unknown_path(env, "__bam_salvage");
2301			if (ret == 0)
2302				ret = t_ret;
2303			goto err;
2304		}
2305
2306		/*
2307		 * If we're being aggressive, mark the beginning and end of
2308		 * the item; we'll come back and print whatever "junk" is in
2309		 * the gaps in case we had any bogus inp elements and thereby
2310		 * missed stuff.
2311		 */
2312		if (LF_ISSET(DB_AGGRESSIVE) && pgtype != P_IBTREE) {
2313			pgmap[beg] = VRFY_ITEM_BEGIN;
2314			pgmap[end] = VRFY_ITEM_END;
2315		}
2316	}
2317
2318err:	if (pgmap != NULL)
2319		__os_free(env, pgmap);
2320	if (ovflbuf != NULL)
2321		__os_free(env, ovflbuf);
2322	if (repldbt.data != NULL)
2323		__os_free(env, repldbt.data);
2324
2325	/* Mark this page as done. */
2326	if ((t_ret = __db_salvage_markdone(vdp, pgno)) != 0 && ret == 0)
2327		ret = t_ret;
2328
2329	return (ret);
2330}
2331
2332/*
2333 * __bam_salvage_walkdupint --
2334 *	Walk a known-good btree or recno internal page which is part of
2335 *	a dup tree, calling __db_salvage_duptree on each child page.
2336 *
2337 * PUBLIC: int __bam_salvage_walkdupint __P((DB *, VRFY_DBINFO *, PAGE *,
2338 * PUBLIC:     DBT *, void *, int (*)(void *, const void *), u_int32_t));
2339 */
2340int
2341__bam_salvage_walkdupint(dbp, vdp, h, key, handle, callback, flags)
2342	DB *dbp;
2343	VRFY_DBINFO *vdp;
2344	PAGE *h;
2345	DBT *key;
2346	void *handle;
2347	int (*callback) __P((void *, const void *));
2348	u_int32_t flags;
2349{
2350	BINTERNAL *bi;
2351	ENV *env;
2352	RINTERNAL *ri;
2353	int ret, t_ret;
2354	db_indx_t i;
2355
2356	env = dbp->env;
2357	ret = 0;
2358
2359	for (i = 0; i < NUM_ENT(h); i++) {
2360		switch (TYPE(h)) {
2361		case P_IBTREE:
2362			bi = GET_BINTERNAL(dbp, h, i);
2363			if ((t_ret = __db_salvage_duptree(dbp,
2364			    vdp, bi->pgno, key, handle, callback, flags)) != 0)
2365				ret = t_ret;
2366			break;
2367		case P_IRECNO:
2368			ri = GET_RINTERNAL(dbp, h, i);
2369			if ((t_ret = __db_salvage_duptree(dbp,
2370			    vdp, ri->pgno, key, handle, callback, flags)) != 0)
2371				ret = t_ret;
2372			break;
2373		default:
2374			return (__db_unknown_path(
2375			    env, "__bam_salvage_walkdupint"));
2376		}
2377		/* Pass DB_SA_SKIPFIRSTKEY, if set, on to the 0th child only. */
2378		flags &= ~LF_ISSET(DB_SA_SKIPFIRSTKEY);
2379	}
2380
2381	return (ret);
2382}
2383
2384/*
2385 * __bam_meta2pgset --
2386 *	Given a known-good meta page, return in pgsetp a 0-terminated list of
2387 *	db_pgno_t's corresponding to the pages in the btree.
2388 *
2389 *	We do this by a somewhat sleazy method, to avoid having to traverse the
2390 *	btree structure neatly:  we walk down the left side to the very
2391 *	first leaf page, then we mark all the pages in the chain of
2392 *	NEXT_PGNOs (being wary of cycles and invalid ones), then we
2393 *	consolidate our scratch array into a nice list, and return.  This
2394 *	avoids the memory management hassles of recursion and the
2395 *	trouble of walking internal pages--they just don't matter, except
2396 *	for the left branch.
2397 *
2398 * PUBLIC: int __bam_meta2pgset __P((DB *, VRFY_DBINFO *, BTMETA *,
2399 * PUBLIC:     u_int32_t, DB *));
2400 */
2401int
2402__bam_meta2pgset(dbp, vdp, btmeta, flags, pgset)
2403	DB *dbp;
2404	VRFY_DBINFO *vdp;
2405	BTMETA *btmeta;
2406	u_int32_t flags;
2407	DB *pgset;
2408{
2409	BINTERNAL *bi;
2410	DB_MPOOLFILE *mpf;
2411	PAGE *h;
2412	RINTERNAL *ri;
2413	db_pgno_t current, p;
2414	int err_ret, ret;
2415
2416	DB_ASSERT(dbp->env, pgset != NULL);
2417
2418	mpf = dbp->mpf;
2419	h = NULL;
2420	ret = err_ret = 0;
2421
2422	for (current = btmeta->root;;) {
2423		if (!IS_VALID_PGNO(current) || current == PGNO(btmeta)) {
2424			err_ret = DB_VERIFY_BAD;
2425			goto err;
2426		}
2427		if ((ret = __memp_fget(mpf, &current,
2428		     vdp->thread_info, NULL, 0, &h)) != 0) {
2429			err_ret = ret;
2430			goto err;
2431		}
2432
2433		switch (TYPE(h)) {
2434		case P_IBTREE:
2435		case P_IRECNO:
2436			if ((ret = __bam_vrfy(dbp,
2437			    vdp, h, current, flags | DB_NOORDERCHK)) != 0) {
2438				err_ret = ret;
2439				goto err;
2440			}
2441			if (TYPE(h) == P_IBTREE) {
2442				bi = GET_BINTERNAL(dbp, h, 0);
2443				current = bi->pgno;
2444			} else {	/* P_IRECNO */
2445				ri = GET_RINTERNAL(dbp, h, 0);
2446				current = ri->pgno;
2447			}
2448			break;
2449		case P_LBTREE:
2450		case P_LRECNO:
2451			goto traverse;
2452		default:
2453			err_ret = DB_VERIFY_BAD;
2454			goto err;
2455		}
2456
2457		if ((ret = __memp_fput(mpf,
2458		     vdp->thread_info, h, DB_PRIORITY_UNCHANGED)) != 0)
2459			err_ret = ret;
2460		h = NULL;
2461	}
2462
2463	/*
2464	 * At this point, current is the pgno of leaf page h, the 0th in the
2465	 * tree we're concerned with.
2466	 */
2467traverse:
2468	while (IS_VALID_PGNO(current) && current != PGNO_INVALID) {
2469		if (h == NULL && (ret = __memp_fget(mpf,
2470		    &current, vdp->thread_info, NULL, 0, &h)) != 0) {
2471			err_ret = ret;
2472			break;
2473		}
2474
2475		if ((ret = __db_vrfy_pgset_get(pgset,
2476		    vdp->thread_info, current, (int *)&p)) != 0)
2477			goto err;
2478
2479		if (p != 0) {
2480			/*
2481			 * We've found a cycle.  Return success anyway--
2482			 * our caller may as well use however much of
2483			 * the pgset we've come up with.
2484			 */
2485			break;
2486		}
2487		if ((ret =
2488		    __db_vrfy_pgset_inc(pgset, vdp->thread_info, current)) != 0)
2489			goto err;
2490
2491		current = NEXT_PGNO(h);
2492		if ((ret = __memp_fput(mpf,
2493		     vdp->thread_info, h, DB_PRIORITY_UNCHANGED)) != 0)
2494			err_ret = ret;
2495		h = NULL;
2496	}
2497
2498err:	if (h != NULL)
2499		(void)__memp_fput(mpf,
2500		    vdp->thread_info, h, DB_PRIORITY_UNCHANGED);
2501
2502	return (ret == 0 ? err_ret : ret);
2503}
2504
2505/*
2506 * __bam_safe_getdata --
2507 *
2508 *	Utility function for __bam_vrfy_itemorder.  Safely gets the datum at
2509 *	index i, page h, and sticks it in DBT dbt.  If ovflok is 1 and i's an
2510 *	overflow item, we do a safe_goff to get the item and signal that we need
2511 *	to free dbt->data;  if ovflok is 0, we leaves the DBT zeroed.
2512 */
2513static int
2514__bam_safe_getdata(dbp, ip, h, i, ovflok, dbt, freedbtp)
2515	DB *dbp;
2516	DB_THREAD_INFO *ip;
2517	PAGE *h;
2518	u_int32_t i;
2519	int ovflok;
2520	DBT *dbt;
2521	int *freedbtp;
2522{
2523	BKEYDATA *bk;
2524	BOVERFLOW *bo;
2525
2526	memset(dbt, 0, sizeof(DBT));
2527	*freedbtp = 0;
2528
2529	bk = GET_BKEYDATA(dbp, h, i);
2530	if (B_TYPE(bk->type) == B_OVERFLOW) {
2531		if (!ovflok)
2532			return (0);
2533
2534		bo = (BOVERFLOW *)bk;
2535		F_SET(dbt, DB_DBT_MALLOC);
2536
2537		*freedbtp = 1;
2538		return (__db_goff(dbp, ip, NULL, dbt,
2539		    bo->tlen, bo->pgno, NULL, NULL));
2540	} else {
2541		dbt->data = bk->data;
2542		dbt->size = bk->len;
2543	}
2544
2545	return (0);
2546}
2547