1/*-
2 * See the file LICENSE for redistribution information.
3 *
4 * Copyright (c) 1999-2009 Oracle.  All rights reserved.
5 *
6 * $Id$
7 */
8
9#include "db_config.h"
10
11#include "db_int.h"
12#include "dbinc/db_page.h"
13#include "dbinc/db_verify.h"
14#include "dbinc/btree.h"
15#include "dbinc/hash.h"
16#include "dbinc/lock.h"
17#include "dbinc/mp.h"
18
19static int __ham_dups_unsorted __P((DB *, u_int8_t *, u_int32_t));
20static int __ham_vrfy_bucket __P((DB *, VRFY_DBINFO *, HMETA *, u_int32_t,
21    u_int32_t));
22static int __ham_vrfy_item __P((DB *,
23    VRFY_DBINFO *, db_pgno_t, PAGE *, u_int32_t, u_int32_t));
24
25/*
26 * __ham_vrfy_meta --
27 *	Verify the hash-specific part of a metadata page.
28 *
29 *	Note that unlike btree, we don't save things off, because we
30 *	will need most everything again to verify each page and the
31 *	amount of state here is significant.
32 *
33 * PUBLIC: int __ham_vrfy_meta __P((DB *, VRFY_DBINFO *, HMETA *,
34 * PUBLIC:     db_pgno_t, u_int32_t));
35 */
36int
37__ham_vrfy_meta(dbp, vdp, m, pgno, flags)
38	DB *dbp;
39	VRFY_DBINFO *vdp;
40	HMETA *m;
41	db_pgno_t pgno;
42	u_int32_t flags;
43{
44	ENV *env;
45	HASH *hashp;
46	VRFY_PAGEINFO *pip;
47	int i, ret, t_ret, isbad;
48	u_int32_t pwr, mbucket;
49	u_int32_t (*hfunc) __P((DB *, const void *, u_int32_t));
50
51	env = dbp->env;
52	isbad = 0;
53
54	if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
55		return (ret);
56
57	hashp = dbp->h_internal;
58
59	if (hashp != NULL && hashp->h_hash != NULL)
60		hfunc = hashp->h_hash;
61	else
62		hfunc = __ham_func5;
63
64	/*
65	 * If we haven't already checked the common fields in pagezero,
66	 * check them.
67	 */
68	if (!F_ISSET(pip, VRFY_INCOMPLETE) &&
69	    (ret = __db_vrfy_meta(dbp, vdp, &m->dbmeta, pgno, flags)) != 0) {
70		if (ret == DB_VERIFY_BAD)
71			isbad = 1;
72		else
73			goto err;
74	}
75
76	/* h_charkey */
77	if (!LF_ISSET(DB_NOORDERCHK))
78		if (m->h_charkey != hfunc(dbp, CHARKEY, sizeof(CHARKEY))) {
79			EPRINT((env,
80"Page %lu: database has custom hash function; reverify with DB_NOORDERCHK set",
81			    (u_long)pgno));
82			/*
83			 * Return immediately;  this is probably a sign of user
84			 * error rather than database corruption, so we want to
85			 * avoid extraneous errors.
86			 */
87			isbad = 1;
88			goto err;
89		}
90
91	/* max_bucket must be less than the last pgno. */
92	if (m->max_bucket > vdp->last_pgno) {
93		EPRINT((env,
94		    "Page %lu: Impossible max_bucket %lu on meta page",
95		    (u_long)pgno, (u_long)m->max_bucket));
96		/*
97		 * Most other fields depend somehow on max_bucket, so
98		 * we just return--there will be lots of extraneous
99		 * errors.
100		 */
101		isbad = 1;
102		goto err;
103	}
104
105	/*
106	 * max_bucket, high_mask and low_mask: high_mask must be one
107	 * less than the next power of two above max_bucket, and
108	 * low_mask must be one less than the power of two below it.
109	 */
110	pwr = (m->max_bucket == 0) ? 1 : 1 << __db_log2(m->max_bucket + 1);
111	if (m->high_mask != pwr - 1) {
112		EPRINT((env,
113		    "Page %lu: incorrect high_mask %lu, should be %lu",
114		    (u_long)pgno, (u_long)m->high_mask, (u_long)pwr - 1));
115		isbad = 1;
116	}
117	pwr >>= 1;
118	if (m->low_mask != pwr - 1) {
119		EPRINT((env,
120		    "Page %lu: incorrect low_mask %lu, should be %lu",
121		    (u_long)pgno, (u_long)m->low_mask, (u_long)pwr - 1));
122		isbad = 1;
123	}
124
125	/* ffactor: no check possible. */
126	pip->h_ffactor = m->ffactor;
127
128	/*
129	 * nelem: just make sure it's not astronomical for now. This is the
130	 * same check that hash_upgrade does, since there was a bug in 2.X
131	 * which could make nelem go "negative".
132	 */
133	if (m->nelem > 0x80000000) {
134		EPRINT((env,
135		    "Page %lu: suspiciously high nelem of %lu",
136		    (u_long)pgno, (u_long)m->nelem));
137		isbad = 1;
138		pip->h_nelem = 0;
139	} else
140		pip->h_nelem = m->nelem;
141
142	/* flags */
143	if (F_ISSET(&m->dbmeta, DB_HASH_DUP))
144		F_SET(pip, VRFY_HAS_DUPS);
145	if (F_ISSET(&m->dbmeta, DB_HASH_DUPSORT))
146		F_SET(pip, VRFY_HAS_DUPSORT);
147	/* XXX: Why is the DB_HASH_SUBDB flag necessary? */
148
149	/* spares array */
150	for (i = 0; m->spares[i] != 0 && i < NCACHED; i++) {
151		/*
152		 * We set mbucket to the maximum bucket that would use a given
153		 * spares entry;  we want to ensure that it's always less
154		 * than last_pgno.
155		 */
156		mbucket = (1 << i) - 1;
157		if (BS_TO_PAGE(mbucket, m->spares) > vdp->last_pgno) {
158			EPRINT((env,
159			    "Page %lu: spares array entry %d is invalid",
160			    (u_long)pgno, i));
161			isbad = 1;
162		}
163	}
164
165err:	if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
166		ret = t_ret;
167	if (LF_ISSET(DB_SALVAGE) &&
168	   (t_ret = __db_salvage_markdone(vdp, pgno)) != 0 && ret == 0)
169		ret = t_ret;
170	return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
171}
172
173/*
174 * __ham_vrfy --
175 *	Verify hash page.
176 *
177 * PUBLIC: int __ham_vrfy __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t,
178 * PUBLIC:     u_int32_t));
179 */
180int
181__ham_vrfy(dbp, vdp, h, pgno, flags)
182	DB *dbp;
183	VRFY_DBINFO *vdp;
184	PAGE *h;
185	db_pgno_t pgno;
186	u_int32_t flags;
187{
188	DBC *dbc;
189	ENV *env;
190	VRFY_PAGEINFO *pip;
191	u_int32_t ent, himark, inpend;
192	db_indx_t *inp;
193	int isbad, ret, t_ret;
194
195	env = dbp->env;
196	isbad = 0;
197
198	if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
199		return (ret);
200
201	if (TYPE(h) != P_HASH && TYPE(h) != P_HASH_UNSORTED) {
202		ret = __db_unknown_path(env, "__ham_vrfy");
203		goto err;
204	}
205
206	/* Verify and save off fields common to all PAGEs. */
207	if ((ret = __db_vrfy_datapage(dbp, vdp, h, pgno, flags)) != 0) {
208		if (ret == DB_VERIFY_BAD)
209			isbad = 1;
210		else
211			goto err;
212	}
213
214	/*
215	 * Verify inp[].  Each offset from 0 to NUM_ENT(h) must be lower
216	 * than the previous one, higher than the current end of the inp array,
217	 * and lower than the page size.
218	 *
219	 * In any case, we return immediately if things are bad, as it would
220	 * be unsafe to proceed.
221	 */
222	inp = P_INP(dbp, h);
223	for (ent = 0, himark = dbp->pgsize,
224	    inpend = (u_int32_t)((u_int8_t *)inp - (u_int8_t *)h);
225	    ent < NUM_ENT(h); ent++)
226		if (inp[ent] >= himark) {
227			EPRINT((env,
228			    "Page %lu: item %lu is out of order or nonsensical",
229			    (u_long)pgno, (u_long)ent));
230			isbad = 1;
231			goto err;
232		} else if (inpend >= himark) {
233			EPRINT((env,
234			    "Page %lu: entries array collided with data",
235			    (u_long)pgno));
236			isbad = 1;
237			goto err;
238
239		} else {
240			himark = inp[ent];
241			inpend += sizeof(db_indx_t);
242			if ((ret = __ham_vrfy_item(
243			    dbp, vdp, pgno, h, ent, flags)) != 0)
244				goto err;
245		}
246
247	if ((ret = __db_cursor_int(dbp, vdp->thread_info, NULL, DB_HASH,
248	    PGNO_INVALID, 0, DB_LOCK_INVALIDID, &dbc)) != 0)
249		return (ret);
250	if (!LF_ISSET(DB_NOORDERCHK) && TYPE(h) == P_HASH &&
251	    (ret = __ham_verify_sorted_page(dbc, h)) != 0)
252		isbad = 1;
253
254err:	if ((t_ret =
255	    __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
256		ret = t_ret;
257	return (ret == 0 && isbad == 1 ? DB_VERIFY_BAD : ret);
258}
259
260/*
261 * __ham_vrfy_item --
262 *	Given a hash page and an offset, sanity-check the item itself,
263 *	and save off any overflow items or off-page dup children as necessary.
264 */
static int
__ham_vrfy_item(dbp, vdp, pgno, h, i, flags)
	DB *dbp;
	VRFY_DBINFO *vdp;
	db_pgno_t pgno;
	PAGE *h;
	u_int32_t i, flags;
{
	HOFFDUP hod;
	HOFFPAGE hop;
	VRFY_CHILDINFO child;
	VRFY_PAGEINFO *pip;
	db_indx_t offset, len, dlen, elen;
	int ret, t_ret;
	u_int8_t *databuf;

	if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
		return (ret);

	/* Dispatch on the item's on-page type byte. */
	switch (HPAGE_TYPE(dbp, h, i)) {
	case H_KEYDATA:
		/* Nothing to do here--everything but the type field is data */
		break;
	case H_DUPLICATE:
		/* Are we a datum or a key?  Better be the former. */
		if (i % 2 == 0) {
			/*
			 * NOTE(review): this mismatch is only reported;
			 * neither ret nor a VRFY flag is set here -- confirm
			 * that leaving the return value untouched is intended.
			 */
			EPRINT((dbp->env,
			    "Page %lu: hash key stored as duplicate item %lu",
			    (u_long)pip->pgno, (u_long)i));
		}
		/*
		 * Dups are encoded as a series within a single HKEYDATA,
		 * in which each dup is surrounded by a copy of its length
		 * on either side (so that the series can be walked in either
		 * direction).  We loop through this series and make sure
		 * each dup is reasonable.
		 *
		 * Note that at this point, we've verified item i-1, so
		 * it's safe to use LEN_HKEYDATA (which looks at inp[i-1]).
		 */
		len = LEN_HKEYDATA(dbp, h, dbp->pgsize, i);
		databuf = HKEYDATA_DATA(P_ENTRY(dbp, h, i));
		for (offset = 0; offset < len; offset += DUP_SIZE(dlen)) {
			/* Leading copy of this dup's length. */
			memcpy(&dlen, databuf + offset, sizeof(db_indx_t));

			/* Make sure the length is plausible. */
			if (offset + DUP_SIZE(dlen) > len) {
				EPRINT((dbp->env,
			    "Page %lu: duplicate item %lu has bad length",
				    (u_long)pip->pgno, (u_long)i));
				ret = DB_VERIFY_BAD;
				goto err;
			}

			/*
			 * Make sure the second copy of the length is the
			 * same as the first.
			 */
			memcpy(&elen,
			    databuf + offset + dlen + sizeof(db_indx_t),
			    sizeof(db_indx_t));
			if (elen != dlen) {
				EPRINT((dbp->env,
		"Page %lu: duplicate item %lu has two different lengths",
				    (u_long)pip->pgno, (u_long)i));
				ret = DB_VERIFY_BAD;
				goto err;
			}
		}
		F_SET(pip, VRFY_HAS_DUPS);
		/* Flag out-of-order dups so the sorted-dup check can fire. */
		if (!LF_ISSET(DB_NOORDERCHK) &&
		    __ham_dups_unsorted(dbp, databuf, len))
			F_SET(pip, VRFY_DUPS_UNSORTED);
		break;
	case H_OFFPAGE:
		/* Offpage item.  Make sure pgno is sane, save off. */
		memcpy(&hop, P_ENTRY(dbp, h, i), HOFFPAGE_SIZE);
		if (!IS_VALID_PGNO(hop.pgno) || hop.pgno == pip->pgno ||
		    hop.pgno == PGNO_INVALID) {
			EPRINT((dbp->env,
			    "Page %lu: offpage item %lu has bad pgno %lu",
			    (u_long)pip->pgno, (u_long)i, (u_long)hop.pgno));
			ret = DB_VERIFY_BAD;
			goto err;
		}
		/* Record the overflow chain for the later structure pass. */
		memset(&child, 0, sizeof(VRFY_CHILDINFO));
		child.pgno = hop.pgno;
		child.type = V_OVERFLOW;
		child.tlen = hop.tlen; /* This will get checked later. */
		if ((ret = __db_vrfy_childput(vdp, pip->pgno, &child)) != 0)
			goto err;
		break;
	case H_OFFDUP:
		/* Offpage duplicate item.  Same drill. */
		memcpy(&hod, P_ENTRY(dbp, h, i), HOFFDUP_SIZE);
		if (!IS_VALID_PGNO(hod.pgno) || hod.pgno == pip->pgno ||
		    hod.pgno == PGNO_INVALID) {
			EPRINT((dbp->env,
			    "Page %lu: offpage item %lu has bad page number",
			    (u_long)pip->pgno, (u_long)i));
			ret = DB_VERIFY_BAD;
			goto err;
		}
		/* Record the off-page duplicate tree for the structure pass. */
		memset(&child, 0, sizeof(VRFY_CHILDINFO));
		child.pgno = hod.pgno;
		child.type = V_DUPLICATE;
		if ((ret = __db_vrfy_childput(vdp, pip->pgno, &child)) != 0)
			goto err;
		F_SET(pip, VRFY_HAS_DUPS);
		break;
	default:
		EPRINT((dbp->env,
		    "Page %lu: item %lu has bad type",
		    (u_long)pip->pgno, (u_long)i));
		ret = DB_VERIFY_BAD;
		break;
	}

	/* Release pip; preserve the first error seen. */
err:	if ((t_ret =
	    __db_vrfy_putpageinfo(dbp->env, vdp, pip)) != 0 && ret == 0)
		ret = t_ret;
	return (ret);
}
388
389/*
390 * __ham_vrfy_structure --
391 *	Verify the structure of a hash database.
392 *
393 * PUBLIC: int __ham_vrfy_structure __P((DB *, VRFY_DBINFO *, db_pgno_t,
394 * PUBLIC:     u_int32_t));
395 */
int
__ham_vrfy_structure(dbp, vdp, meta_pgno, flags)
	DB *dbp;
	VRFY_DBINFO *vdp;
	db_pgno_t meta_pgno;
	u_int32_t flags;
{
	DB *pgset;
	DB_MPOOLFILE *mpf;
	HMETA *m;
	PAGE *h;
	VRFY_PAGEINFO *pip;
	int isbad, p, ret, t_ret;
	db_pgno_t pgno;
	u_int32_t bucket, spares_entry;

	mpf = dbp->mpf;
	pgset = vdp->pgset;
	h = NULL;
	ret = isbad = 0;

	/* The meta page must not already appear in the page set. */
	if ((ret = __db_vrfy_pgset_get(pgset,
	    vdp->thread_info, meta_pgno, &p)) != 0)
		return (ret);
	if (p != 0) {
		EPRINT((dbp->env,
		    "Page %lu: Hash meta page referenced twice",
		    (u_long)meta_pgno));
		return (DB_VERIFY_BAD);
	}
	/* Claim the meta page in the page set. */
	if ((ret = __db_vrfy_pgset_inc(pgset,
	    vdp->thread_info, meta_pgno)) != 0)
		return (ret);

	/* Get the meta page;  we'll need it frequently. */
	if ((ret = __memp_fget(mpf,
	    &meta_pgno, vdp->thread_info, NULL, 0, &m)) != 0)
		return (ret);

	/* Loop through bucket by bucket. */
	for (bucket = 0; bucket <= m->max_bucket; bucket++)
		if ((ret =
		    __ham_vrfy_bucket(dbp, vdp, m, bucket, flags)) != 0) {
			if (ret == DB_VERIFY_BAD)
				isbad = 1;
			else
				goto err;
		    }

	/*
	 * There may be unused hash pages corresponding to buckets
	 * that have been allocated but not yet used.  These may be
	 * part of the current doubling above max_bucket, or they may
	 * correspond to buckets that were used in a transaction
	 * that then aborted.
	 *
	 * Loop through them, as far as the spares array defines them,
	 * and make sure they're all empty.
	 *
	 * Note that this should be safe, since we've already verified
	 * that the spares array is sane.
	 */
	/*
	 * The comma expression below deliberately recomputes
	 * spares_entry from the new bucket on each iteration before
	 * testing it.
	 */
	for (bucket = m->max_bucket + 1; spares_entry = __db_log2(bucket + 1),
	    spares_entry < NCACHED && m->spares[spares_entry] != 0; bucket++) {
		pgno = BS_TO_PAGE(bucket, m->spares);
		if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
			goto err;

		/* It's okay if these pages are totally zeroed;  unmark it. */
		F_CLR(pip, VRFY_IS_ALLZEROES);

		/* It's also OK if this page is simply invalid. */
		if (pip->type == P_INVALID) {
			if ((ret = __db_vrfy_putpageinfo(dbp->env,
			    vdp, pip)) != 0)
				goto err;
			continue;
		}

		if (pip->type != P_HASH && pip->type != P_HASH_UNSORTED) {
			EPRINT((dbp->env,
			    "Page %lu: hash bucket %lu maps to non-hash page",
			    (u_long)pgno, (u_long)bucket));
			isbad = 1;
		} else if (pip->entries != 0) {
			EPRINT((dbp->env,
		    "Page %lu: non-empty page in unused hash bucket %lu",
			    (u_long)pgno, (u_long)bucket));
			isbad = 1;
		} else {
			/* Empty hash page: claim it, or report a dup ref. */
			if ((ret = __db_vrfy_pgset_get(pgset,
			    vdp->thread_info, pgno, &p)) != 0)
				goto err;
			if (p != 0) {
				EPRINT((dbp->env,
				    "Page %lu: above max_bucket referenced",
				    (u_long)pgno));
				isbad = 1;
			} else {
				if ((ret = __db_vrfy_pgset_inc(pgset,
				    vdp->thread_info, pgno)) != 0)
					goto err;
				if ((ret = __db_vrfy_putpageinfo(dbp->env,
				    vdp, pip)) != 0)
					goto err;
				continue;
			}
		}

		/* If we got here, it's an error. */
		(void)__db_vrfy_putpageinfo(dbp->env, vdp, pip);
		goto err;
	}

	/* m was pinned above; release it (and h, if it were ever set). */
err:	if ((t_ret = __memp_fput(mpf, vdp->thread_info, m, dbp->priority)) != 0)
		return (t_ret);
	if (h != NULL &&
	    (t_ret = __memp_fput(mpf, vdp->thread_info, h, dbp->priority)) != 0)
		return (t_ret);
	return ((isbad == 1 && ret == 0) ? DB_VERIFY_BAD: ret);
}
517
518/*
519 * __ham_vrfy_bucket --
520 *	Verify a given bucket.
521 */
static int
__ham_vrfy_bucket(dbp, vdp, m, bucket, flags)
	DB *dbp;
	VRFY_DBINFO *vdp;
	HMETA *m;
	u_int32_t bucket, flags;
{
	ENV *env;
	HASH *hashp;
	VRFY_CHILDINFO *child;
	VRFY_PAGEINFO *mip, *pip;
	int ret, t_ret, isbad, p;
	db_pgno_t pgno, next_pgno;
	DBC *cc;
	u_int32_t (*hfunc) __P((DB *, const void *, u_int32_t));

	env = dbp->env;
	isbad = 0;
	pip = NULL;
	cc = NULL;

	/* Use the application's hash function if one was configured. */
	hashp = dbp->h_internal;
	if (hashp != NULL && hashp->h_hash != NULL)
		hfunc = hashp->h_hash;
	else
		hfunc = __ham_func5;

	/* Pin the meta page's verifier info for the duration. */
	if ((ret = __db_vrfy_getpageinfo(vdp, PGNO(m), &mip)) != 0)
		return (ret);

	/* Calculate the first pgno for this bucket. */
	pgno = BS_TO_PAGE(bucket, m->spares);

	if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
		goto err;

	/* Make sure we got a plausible page number. */
	if (pgno > vdp->last_pgno ||
	    (pip->type != P_HASH && pip->type != P_HASH_UNSORTED)) {
		EPRINT((env,
		    "Page %lu: impossible first page in bucket %lu",
		    (u_long)pgno, (u_long)bucket));
		/* Unsafe to continue. */
		isbad = 1;
		goto err;
	}

	/* The head of a bucket chain must have no predecessor. */
	if (pip->prev_pgno != PGNO_INVALID) {
		EPRINT((env,
		    "Page %lu: first page in hash bucket %lu has a prev_pgno",
		    (u_long)pgno, (u_long)bucket));
		isbad = 1;
	}

	/*
	 * Set flags for dups and sorted dups.
	 */
	flags |= F_ISSET(mip, VRFY_HAS_DUPS) ? DB_ST_DUPOK : 0;
	flags |= F_ISSET(mip, VRFY_HAS_DUPSORT) ? DB_ST_DUPSORT : 0;

	/* Loop until we find a fatal bug, or until we run out of pages. */
	for (;;) {
		/* Provide feedback on our progress to the application. */
		if (!LF_ISSET(DB_SALVAGE))
			__db_vrfy_struct_feedback(dbp, vdp);

		/* Each page may appear in the database exactly once. */
		if ((ret = __db_vrfy_pgset_get(vdp->pgset,
		    vdp->thread_info, pgno, &p)) != 0)
			goto err;
		if (p != 0) {
			EPRINT((env,
			    "Page %lu: hash page referenced twice",
			    (u_long)pgno));
			isbad = 1;
			/* Unsafe to continue. */
			goto err;
		} else if ((ret = __db_vrfy_pgset_inc(vdp->pgset,
		    vdp->thread_info, pgno)) != 0)
			goto err;

		/*
		 * Hash pages that nothing has ever hashed to may never
		 * have actually come into existence, and may appear to be
		 * entirely zeroed.  This is acceptable, and since there's
		 * no real way for us to know whether this has actually
		 * occurred, we clear the "wholly zeroed" flag on every
		 * hash page.  A wholly zeroed page, by nature, will appear
		 * to have no flags set and zero entries, so should
		 * otherwise verify correctly.
		 */
		F_CLR(pip, VRFY_IS_ALLZEROES);

		/* If we have dups, our meta page had better know about it. */
		if (F_ISSET(pip, VRFY_HAS_DUPS) &&
		    !F_ISSET(mip, VRFY_HAS_DUPS)) {
			EPRINT((env,
		    "Page %lu: duplicates present in non-duplicate database",
			    (u_long)pgno));
			isbad = 1;
		}

		/*
		 * If the database has sorted dups, this page had better
		 * not have unsorted ones.
		 */
		if (F_ISSET(mip, VRFY_HAS_DUPSORT) &&
		    F_ISSET(pip, VRFY_DUPS_UNSORTED)) {
			EPRINT((env,
			    "Page %lu: unsorted dups in sorted-dup database",
			    (u_long)pgno));
			isbad = 1;
		}

		/* Walk overflow chains and offpage dup trees. */
		if ((ret = __db_vrfy_childcursor(vdp, &cc)) != 0)
			goto err;
		for (ret = __db_vrfy_ccset(cc, pip->pgno, &child); ret == 0;
		    ret = __db_vrfy_ccnext(cc, &child))
			if (child->type == V_OVERFLOW) {
				if ((ret = __db_vrfy_ovfl_structure(dbp, vdp,
				    child->pgno, child->tlen,
				    flags | DB_ST_OVFL_LEAF)) != 0) {
					if (ret == DB_VERIFY_BAD)
						isbad = 1;
					else
						goto err;
				}
			} else if (child->type == V_DUPLICATE) {
				if ((ret = __db_vrfy_duptype(dbp,
				    vdp, child->pgno, flags)) != 0) {
					isbad = 1;
					continue;
				}
				if ((ret = __bam_vrfy_subtree(dbp, vdp,
				    child->pgno, NULL, NULL,
		    flags | DB_ST_RECNUM | DB_ST_DUPSET | DB_ST_TOPLEVEL,
				    NULL, NULL, NULL)) != 0) {
					if (ret == DB_VERIFY_BAD)
						isbad = 1;
					else
						goto err;
				}
			}
		/* Close the cursor on vdp, open one on dbp */
		/*
		 * NOTE(review): cc now holds a DB cursor, but the err path
		 * closes it with __db_vrfy_ccclose -- confirm the two close
		 * paths are compatible for both cursor kinds.
		 */
		if ((ret = __db_vrfy_ccclose(cc)) != 0)
			goto err;
		if ((ret = __db_cursor_int(dbp, vdp->thread_info, NULL,
		    DB_HASH, PGNO_INVALID, 0, DB_LOCK_INVALIDID, &cc)) != 0)
			goto err;
		/* If it's safe to check that things hash properly, do so. */
		if (isbad == 0 && !LF_ISSET(DB_NOORDERCHK) &&
		    (ret = __ham_vrfy_hashing(cc, pip->entries,
		    m, bucket, pgno, flags, hfunc)) != 0) {
			if (ret == DB_VERIFY_BAD)
				isbad = 1;
			else
				goto err;
		}

		/* Release this page's info before moving down the chain. */
		next_pgno = pip->next_pgno;
		ret = __db_vrfy_putpageinfo(env, vdp, pip);

		pip = NULL;
		if (ret != 0)
			goto err;

		if (next_pgno == PGNO_INVALID)
			break;		/* End of the bucket. */

		/* We already checked this, but just in case... */
		if (!IS_VALID_PGNO(next_pgno)) {
			EPRINT((env,
			    "Page %lu: hash page has bad next_pgno",
			    (u_long)pgno));
			isbad = 1;
			goto err;
		}

		if ((ret = __db_vrfy_getpageinfo(vdp, next_pgno, &pip)) != 0)
			goto err;

		/* The forward and backward links must agree. */
		if (pip->prev_pgno != pgno) {
			EPRINT((env,
			    "Page %lu: hash page has bad prev_pgno",
			    (u_long)next_pgno));
			isbad = 1;
		}
		pgno = next_pgno;
	}

	/* Release the cursor and any page info still pinned. */
err:	if (cc != NULL && ((t_ret = __db_vrfy_ccclose(cc)) != 0) && ret == 0)
		ret = t_ret;
	if (mip != NULL && ((t_ret =
	    __db_vrfy_putpageinfo(env, vdp, mip)) != 0) && ret == 0)
		ret = t_ret;
	if (pip != NULL && ((t_ret =
	    __db_vrfy_putpageinfo(env, vdp, pip)) != 0) && ret == 0)
		ret = t_ret;
	return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
}
722
723/*
724 * __ham_vrfy_hashing --
725 *	Verify that all items on a given hash page hash correctly.
726 *
727 * PUBLIC: int __ham_vrfy_hashing __P((DBC *,
728 * PUBLIC:     u_int32_t, HMETA *, u_int32_t, db_pgno_t, u_int32_t,
729 * PUBLIC:     u_int32_t (*) __P((DB *, const void *, u_int32_t))));
730 */
731int
732__ham_vrfy_hashing(dbc, nentries, m, thisbucket, pgno, flags, hfunc)
733	DBC *dbc;
734	u_int32_t nentries;
735	HMETA *m;
736	u_int32_t thisbucket;
737	db_pgno_t pgno;
738	u_int32_t flags;
739	u_int32_t (*hfunc) __P((DB *, const void *, u_int32_t));
740{
741	DB *dbp;
742	DBT dbt;
743	DB_MPOOLFILE *mpf;
744	DB_THREAD_INFO *ip;
745	PAGE *h;
746	db_indx_t i;
747	int ret, t_ret, isbad;
748	u_int32_t hval, bucket;
749
750	dbp = dbc->dbp;
751	mpf = dbp->mpf;
752	ret = isbad = 0;
753
754	memset(&dbt, 0, sizeof(DBT));
755	F_SET(&dbt, DB_DBT_REALLOC);
756	ENV_GET_THREAD_INFO(dbp->env, ip);
757
758	if ((ret = __memp_fget(mpf, &pgno, ip, NULL, 0, &h)) != 0)
759		return (ret);
760
761	for (i = 0; i < nentries; i += 2) {
762		/*
763		 * We've already verified the page integrity and that of any
764		 * overflow chains linked off it;  it is therefore safe to use
765		 * __db_ret.  It's also not all that much slower, since we have
766		 * to copy every hash item to deal with alignment anyway;  we
767		 * can tweak this a bit if this proves to be a bottleneck,
768		 * but for now, take the easy route.
769		 */
770		if ((ret = __db_ret(dbc, h, i, &dbt, NULL, NULL)) != 0)
771			goto err;
772		hval = hfunc(dbp, dbt.data, dbt.size);
773
774		bucket = hval & m->high_mask;
775		if (bucket > m->max_bucket)
776			bucket = bucket & m->low_mask;
777
778		if (bucket != thisbucket) {
779			EPRINT((dbp->env,
780			    "Page %lu: item %lu hashes incorrectly",
781			    (u_long)pgno, (u_long)i));
782			isbad = 1;
783		}
784	}
785
786err:	if (dbt.data != NULL)
787		__os_ufree(dbp->env, dbt.data);
788	if ((t_ret = __memp_fput(mpf, ip, h, dbp->priority)) != 0)
789		return (t_ret);
790
791	return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
792}
793
794/*
795 * __ham_salvage --
796 *	Safely dump out anything that looks like a key on an alleged
797 *	hash page.
798 *
799 * PUBLIC: int __ham_salvage __P((DB *, VRFY_DBINFO *, db_pgno_t, PAGE *,
800 * PUBLIC:     void *, int (*)(void *, const void *), u_int32_t));
801 */
int
__ham_salvage(dbp, vdp, pgno, h, handle, callback, flags)
	DB *dbp;
	VRFY_DBINFO *vdp;
	db_pgno_t pgno;
	PAGE *h;
	void *handle;
	int (*callback) __P((void *, const void *));
	u_int32_t flags;
{
	DBT dbt, key_dbt, unkdbt;
	db_pgno_t dpgno;
	int ret, err_ret, t_ret;
	u_int32_t himark, i, ovfl_bufsz;
	u_int8_t *hk, *p;
	void *buf, *key_buf;
	db_indx_t dlen, len, tlen;

	memset(&dbt, 0, sizeof(DBT));
	dbt.flags = DB_DBT_REALLOC;

	/* Placeholder printed when an item can't be recovered. */
	DB_INIT_DBT(unkdbt, "UNKNOWN", sizeof("UNKNOWN") - 1);

	err_ret = 0;

	/*
	 * Allocate a buffer for overflow items.  Start at one page;
	 * __db_safe_goff will realloc as needed.
	 */
	if ((ret = __os_malloc(dbp->env, dbp->pgsize, &buf)) != 0)
		return (ret);
    ovfl_bufsz = dbp->pgsize;

	/* himark tracks the lowest item offset seen so far on the page. */
	himark = dbp->pgsize;
	for (i = 0;; i++) {
		/* If we're not aggressive, break when we hit NUM_ENT(h). */
		if (!LF_ISSET(DB_AGGRESSIVE) && i >= NUM_ENT(h))
			break;

		/*
		 * Verify the current item. If we're beyond NUM_ENT errors are
		 * expected and ignored.
		 */
		ret = __db_vrfy_inpitem(dbp,
		    h, pgno, i, 0, flags, &himark, NULL);
		/* If this returned a fatality, it's time to break. */
		if (ret == DB_VERIFY_FATAL) {
			if (i >= NUM_ENT(h))
				ret = 0;
			break;
		} else if (ret != 0 && i >= NUM_ENT(h)) {
			/* Not a reportable error, but don't salvage the item. */
			ret = 0;
		} else if (ret == 0) {
			/* Set len to total entry length. */
			len = LEN_HITEM(dbp, h, dbp->pgsize, i);
			hk = P_ENTRY(dbp, h, i);
			if (len == 0 || len > dbp->pgsize ||
			    (u_int32_t)(hk + len - (u_int8_t *)h) >
			    dbp->pgsize) {
				/* Item is unsafely large; skip it. */
				err_ret = DB_VERIFY_BAD;
				continue;
			}
			/* Dispatch on the item's type byte. */
			switch (HPAGE_PTYPE(hk)) {
			case H_KEYDATA:
				/* Update len to size of item. */
				len = LEN_HKEYDATA(dbp, h, dbp->pgsize, i);
				/*
				 * keydata is also the aggressive-mode target
				 * for an H_DUPLICATE found at a key index.
				 */
keydata:			memcpy(buf, HKEYDATA_DATA(hk), len);
				dbt.size = len;
				dbt.data = buf;
				if ((ret = __db_vrfy_prdbt(&dbt,
				    0, " ", handle, callback, 0, vdp)) != 0)
					err_ret = ret;
				break;
			case H_OFFPAGE:
				/* Entry too short to hold an HOFFPAGE. */
				if (len < HOFFPAGE_SIZE) {
					err_ret = DB_VERIFY_BAD;
					continue;
				}
				memcpy(&dpgno,
				    HOFFPAGE_PGNO(hk), sizeof(dpgno));
				/* Fetch the overflow data, or print UNKNOWN. */
				if ((ret = __db_safe_goff(dbp, vdp,
				    dpgno, &dbt, &buf, &ovfl_bufsz, flags)) != 0) {
					err_ret = ret;
					(void)__db_vrfy_prdbt(&unkdbt, 0, " ",
					    handle, callback, 0, vdp);
					/* fallthrough to end of case */
				} else if ((ret = __db_vrfy_prdbt(&dbt,
				    0, " ", handle, callback, 0, vdp)) != 0)
					err_ret = ret;
				break;
			case H_OFFDUP:
				/* Entry too short to hold an HOFFDUP. */
				if (len < HOFFDUP_SIZE) {
					err_ret = DB_VERIFY_BAD;
					continue;
				}
				memcpy(&dpgno,
				    HOFFDUP_PGNO(hk), sizeof(dpgno));
				/* UNKNOWN iff pgno is bad or we're a key. */
				if (!IS_VALID_PGNO(dpgno) || (i % 2 == 0)) {
					if ((ret =
					    __db_vrfy_prdbt(&unkdbt, 0, " ",
					    handle, callback, 0, vdp)) != 0)
						err_ret = ret;
				} else if ((ret = __db_salvage_duptree(dbp,
				    vdp, dpgno, &dbt, handle, callback,
				    flags | DB_SA_SKIPFIRSTKEY)) != 0)
					err_ret = ret;
				break;
			case H_DUPLICATE:
				/*
				 * This is an on-page duplicate item, iterate
				 * over the duplicate set, printing out
				 * key/data pairs.
				 */
				len = LEN_HKEYDATA(dbp, h, dbp->pgsize, i);
				/*
				 * If this item is at an even index it must be
				 * a key item and it should never be of type
				 * H_DUPLICATE. If we are in aggressive mode,
				 * print the item out as a normal key, and let
				 * the user resolve the discrepancy.
				 */
				if (i % 2 == 0) {
					err_ret = ret;
					if (LF_ISSET(DB_AGGRESSIVE))
						goto keydata;
					break;
				}

				/*
				 * Check to ensure that the item size is
				 * greater than the smallest possible on page
				 * duplicate.
				 */
				if (len <
				    HKEYDATA_SIZE(2 * sizeof(db_indx_t))) {
					err_ret = DB_VERIFY_BAD;
					continue;
				}

				/*
				 * Copy out the key from the dbt, it is still
				 * present from the previous pass.
				 */
				memset(&key_dbt, 0, sizeof(key_dbt));
				if ((ret = __os_malloc(
				    dbp->env, dbt.size, &key_buf)) != 0)
					return (ret);
				memcpy(key_buf, buf, dbt.size);
				key_dbt.data = key_buf;
				key_dbt.size = dbt.size;
				key_dbt.flags = DB_DBT_USERMEM;

				/* Loop until we hit the total length. */
				for (tlen = 0; tlen + sizeof(db_indx_t) < len;
				    tlen += dlen + 2 * sizeof(db_indx_t)) {
					/*
					 * Print the key for every duplicate
					 * item. Except the first dup, since
					 * the key was already output once by
					 * the previous iteration.
					 */
					if (tlen != 0) {
						if ((ret = __db_vrfy_prdbt(
						    &key_dbt, 0, " ", handle,
						    callback, 0, vdp)) != 0)
							err_ret = ret;
					}
					/* Leading length copy of this dup. */
					p = HKEYDATA_DATA(hk) + tlen;
					memcpy(&dlen, p, sizeof(db_indx_t));
					p += sizeof(db_indx_t);
					/*
					 * If dlen is too long, print all the
					 * rest of the dup set in a chunk.
					 */
					if (dlen + tlen + sizeof(db_indx_t) >
					    len) {
						dlen = len -
						    (tlen + sizeof(db_indx_t));
						err_ret = DB_VERIFY_BAD;
					}
					memcpy(buf, p, dlen);
					dbt.size = dlen;
					dbt.data = buf;
					if ((ret = __db_vrfy_prdbt(&dbt, 0, " ",
					    handle, callback, 0, vdp)) != 0)
						err_ret = ret;
				}
				__os_free(dbp->env, key_buf);
				break;
			default:
				/* Unknown type: only an error if aggressive. */
				if (!LF_ISSET(DB_AGGRESSIVE))
					break;
				err_ret = DB_VERIFY_BAD;
				break;
			}
		}
	}

	__os_free(dbp->env, buf);
	if ((t_ret = __db_salvage_markdone(vdp, pgno)) != 0)
		return (t_ret);
	return ((ret == 0 && err_ret != 0) ? err_ret : ret);
}
1008
1009/*
1010 * __ham_meta2pgset --
1011 *	Return the set of hash pages corresponding to the given
1012 *	known-good meta page.
1013 *
1014 * PUBLIC: int __ham_meta2pgset __P((DB *, VRFY_DBINFO *, HMETA *, u_int32_t,
1015 * PUBLIC:     DB *));
1016 */
1017int
1018__ham_meta2pgset(dbp, vdp, hmeta, flags, pgset)
1019	DB *dbp;
1020	VRFY_DBINFO *vdp;
1021	HMETA *hmeta;
1022	u_int32_t flags;
1023	DB *pgset;
1024{
1025	DB_MPOOLFILE *mpf;
1026	DB_THREAD_INFO *ip;
1027	PAGE *h;
1028	db_pgno_t pgno;
1029	u_int32_t bucket, totpgs;
1030	int ret, val;
1031
1032	/*
1033	 * We don't really need flags, but leave them for consistency with
1034	 * __bam_meta2pgset.
1035	 */
1036	COMPQUIET(flags, 0);
1037	ip = vdp->thread_info;
1038
1039	DB_ASSERT(dbp->env, pgset != NULL);
1040
1041	mpf = dbp->mpf;
1042	totpgs = 0;
1043
1044	/*
1045	 * Loop through all the buckets, pushing onto pgset the corresponding
1046	 * page(s) for each one.
1047	 */
1048	for (bucket = 0; bucket <= hmeta->max_bucket; bucket++) {
1049		pgno = BS_TO_PAGE(bucket, hmeta->spares);
1050
1051		/*
1052		 * We know the initial pgno is safe because the spares array has
1053		 * been verified.
1054		 *
1055		 * Safely walk the list of pages in this bucket.
1056		 */
1057		for (;;) {
1058			if ((ret =
1059			    __memp_fget(mpf, &pgno, ip, NULL, 0, &h)) != 0)
1060				return (ret);
1061			if (TYPE(h) == P_HASH || TYPE(h) == P_HASH_UNSORTED) {
1062
1063				/*
1064				 * Make sure we don't go past the end of
1065				 * pgset.
1066				 */
1067				if (++totpgs > vdp->last_pgno) {
1068					(void)__memp_fput(mpf,
1069					    ip, h, dbp->priority);
1070					return (DB_VERIFY_BAD);
1071				}
1072				if ((ret = __db_vrfy_pgset_inc(pgset,
1073				    vdp->thread_info, pgno)) != 0) {
1074					(void)__memp_fput(mpf,
1075					    ip, h, dbp->priority);
1076					return (ret);
1077				}
1078
1079				pgno = NEXT_PGNO(h);
1080			} else
1081				pgno = PGNO_INVALID;
1082
1083			if ((ret = __memp_fput(mpf, ip, h, dbp->priority)) != 0)
1084				return (ret);
1085
1086			/* If the new pgno is wonky, go onto the next bucket. */
1087			if (!IS_VALID_PGNO(pgno) ||
1088			    pgno == PGNO_INVALID)
1089				break;
1090
1091			/*
1092			 * If we've touched this page before, we have a cycle;
1093			 * go on to the next bucket.
1094			 */
1095			if ((ret = __db_vrfy_pgset_get(pgset,
1096			    vdp->thread_info, pgno, &val)) != 0)
1097				return (ret);
1098			if (val != 0)
1099				break;
1100		}
1101	}
1102	return (0);
1103}
1104
1105/*
1106 * __ham_dups_unsorted --
1107 *	Takes a known-safe hash duplicate set and its total length.
1108 *	Returns 1 if there are out-of-order duplicates in this set,
1109 *	0 if there are not.
1110 */
1111static int
1112__ham_dups_unsorted(dbp, buf, len)
1113	DB *dbp;
1114	u_int8_t *buf;
1115	u_int32_t len;
1116{
1117	DBT a, b;
1118	db_indx_t offset, dlen;
1119	int (*func) __P((DB *, const DBT *, const DBT *));
1120
1121	memset(&a, 0, sizeof(DBT));
1122	memset(&b, 0, sizeof(DBT));
1123
1124	func = (dbp->dup_compare == NULL) ? __bam_defcmp : dbp->dup_compare;
1125
1126	/*
1127	 * Loop through the dup set until we hit the end or we find
1128	 * a pair of dups that's out of order.  b is always the current
1129	 * dup, a the one before it.
1130	 */
1131	for (offset = 0; offset < len; offset += DUP_SIZE(dlen)) {
1132		memcpy(&dlen, buf + offset, sizeof(db_indx_t));
1133		b.data = buf + offset + sizeof(db_indx_t);
1134		b.size = dlen;
1135
1136		if (a.data != NULL && func(dbp, &a, &b) > 0)
1137			return (1);
1138
1139		a.data = b.data;
1140		a.size = b.size;
1141	}
1142
1143	return (0);
1144}
1145