/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1999,2008 Oracle.  All rights reserved.
 *
 * $Id: hash_verify.c,v 12.33 2008/03/12 22:33:03 mbrey Exp $
 */

#include "db_config.h"

#include "db_int.h"
#include "dbinc/db_page.h"
#include "dbinc/db_verify.h"
#include "dbinc/btree.h"
#include "dbinc/hash.h"
#include "dbinc/mp.h"

static int __ham_dups_unsorted __P((DB *, u_int8_t *, u_int32_t));
static int __ham_vrfy_bucket __P((DB *, VRFY_DBINFO *, HMETA *, u_int32_t,
    u_int32_t));
static int __ham_vrfy_item __P((DB *,
    VRFY_DBINFO *, db_pgno_t, PAGE *, u_int32_t, u_int32_t));

/*
 * __ham_vrfy_meta --
 *	Verify the hash-specific part of a metadata page.
 *
 *	Note that unlike btree, we don't save things off, because we
 *	will need most everything again to verify each page and the
 *	amount of state here is significant.
 *
 * PUBLIC: int __ham_vrfy_meta __P((DB *, VRFY_DBINFO *, HMETA *,
 * PUBLIC:     db_pgno_t, u_int32_t));
 */
int
__ham_vrfy_meta(dbp, vdp, m, pgno, flags)
	DB *dbp;
	VRFY_DBINFO *vdp;
	HMETA *m;
	db_pgno_t pgno;
	u_int32_t flags;
{
	ENV *env;
	HASH *hashp;
	VRFY_PAGEINFO *pip;
	int i, ret, t_ret, isbad;
	u_int32_t pwr, mbucket;
	u_int32_t (*hfunc) __P((DB *, const void *, u_int32_t));

	env = dbp->env;
	isbad = 0;

	if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
		return (ret);

	hashp = dbp->h_internal;

	if (hashp != NULL && hashp->h_hash != NULL)
		hfunc = hashp->h_hash;
	else
		hfunc = __ham_func5;

	/*
	 * If we haven't already checked the common fields in pagezero,
	 * check them.
	 */
	if (!F_ISSET(pip, VRFY_INCOMPLETE) &&
	    (ret = __db_vrfy_meta(dbp, vdp, &m->dbmeta, pgno, flags)) != 0) {
		if (ret == DB_VERIFY_BAD)
			isbad = 1;
		else
			goto err;
	}

	/* h_charkey */
	if (!LF_ISSET(DB_NOORDERCHK))
		if (m->h_charkey != hfunc(dbp, CHARKEY, sizeof(CHARKEY))) {
			EPRINT((env,
"Page %lu: database has custom hash function; reverify with DB_NOORDERCHK set",
			    (u_long)pgno));
			/*
			 * Return immediately;  this is probably a sign of user
			 * error rather than database corruption, so we want to
			 * avoid extraneous errors.
			 */
			isbad = 1;
			goto err;
		}

	/* max_bucket must be less than the last pgno. */
	if (m->max_bucket > vdp->last_pgno) {
		EPRINT((env,
		    "Page %lu: Impossible max_bucket %lu on meta page",
		    (u_long)pgno, (u_long)m->max_bucket));
		/*
		 * Most other fields depend somehow on max_bucket, so
		 * we just return--there will be lots of extraneous
		 * errors.
		 */
		isbad = 1;
		goto err;
	}

	/*
	 * max_bucket, high_mask and low_mask: high_mask must be one
	 * less than the next power of two above max_bucket, and
	 * low_mask must be one less than the power of two below it.
	 */
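	/*
	 * Illustrative example (not taken from the on-disk format docs):
	 * if max_bucket is 5, the next power of two is 8, so high_mask
	 * should be 0x7 and low_mask should be 0x3.  Lookups mask the
	 * hash value with high_mask and fold with low_mask when the
	 * result exceeds max_bucket; see __ham_vrfy_hashing below.
	 */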
	pwr = (m->max_bucket == 0) ? 1 : 1 << __db_log2(m->max_bucket + 1);
	if (m->high_mask != pwr - 1) {
		EPRINT((env,
		    "Page %lu: incorrect high_mask %lu, should be %lu",
		    (u_long)pgno, (u_long)m->high_mask, (u_long)pwr - 1));
		isbad = 1;
	}
	pwr >>= 1;
	if (m->low_mask != pwr - 1) {
		EPRINT((env,
		    "Page %lu: incorrect low_mask %lu, should be %lu",
		    (u_long)pgno, (u_long)m->low_mask, (u_long)pwr - 1));
		isbad = 1;
	}

	/* ffactor: no check possible. */
	pip->h_ffactor = m->ffactor;

	/*
	 * nelem: just make sure it's not astronomical for now. This is the
	 * same check that hash_upgrade does, since there was a bug in 2.X
	 * which could make nelem go "negative".
	 */
	if (m->nelem > 0x80000000) {
		EPRINT((env,
		    "Page %lu: suspiciously high nelem of %lu",
		    (u_long)pgno, (u_long)m->nelem));
		isbad = 1;
		pip->h_nelem = 0;
	} else
		pip->h_nelem = m->nelem;

	/* flags */
	if (F_ISSET(&m->dbmeta, DB_HASH_DUP))
		F_SET(pip, VRFY_HAS_DUPS);
	if (F_ISSET(&m->dbmeta, DB_HASH_DUPSORT))
		F_SET(pip, VRFY_HAS_DUPSORT);
	/* XXX: Why is the DB_HASH_SUBDB flag necessary? */

	/* spares array */
	for (i = 0; i < NCACHED && m->spares[i] != 0; i++) {
		/*
		 * We set mbucket to the maximum bucket that would use a given
		 * spares entry;  we want to ensure that it's always less
		 * than last_pgno.
		 */
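		/*
		 * Recall (illustrative summary): spares entry i covers the
		 * doubling that ends at bucket 2^i - 1, and BS_TO_PAGE maps
		 * a bucket to its first page number using the spares entry
		 * for that bucket's doubling.  Checking the largest bucket
		 * in the doubling therefore bounds every page the entry
		 * can produce.
		 */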
		mbucket = (1 << i) - 1;
		if (BS_TO_PAGE(mbucket, m->spares) > vdp->last_pgno) {
			EPRINT((env,
			    "Page %lu: spares array entry %d is invalid",
			    (u_long)pgno, i));
			isbad = 1;
		}
	}

err:	if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
		ret = t_ret;
	if (LF_ISSET(DB_SALVAGE) &&
	   (t_ret = __db_salvage_markdone(vdp, pgno)) != 0 && ret == 0)
		ret = t_ret;
	return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
}

/*
 * __ham_vrfy --
 *	Verify hash page.
 *
 * PUBLIC: int __ham_vrfy __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t,
 * PUBLIC:     u_int32_t));
 */
int
__ham_vrfy(dbp, vdp, h, pgno, flags)
	DB *dbp;
	VRFY_DBINFO *vdp;
	PAGE *h;
	db_pgno_t pgno;
	u_int32_t flags;
{
	ENV *env;
	VRFY_PAGEINFO *pip;
	u_int32_t ent, himark, inpend;
	db_indx_t *inp;
	int isbad, ret, t_ret;

	env = dbp->env;
	isbad = 0;

	if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
		return (ret);

	if (TYPE(h) != P_HASH && TYPE(h) != P_HASH_UNSORTED) {
		ret = __db_unknown_path(env, "__ham_vrfy");
		goto err;
	}

	/* Verify and save off fields common to all PAGEs. */
	if ((ret = __db_vrfy_datapage(dbp, vdp, h, pgno, flags)) != 0) {
		if (ret == DB_VERIFY_BAD)
			isbad = 1;
		else
			goto err;
	}

	/*
	 * Verify inp[].  Each offset from 0 to NUM_ENT(h) must be lower
	 * than the previous one, higher than the current end of the inp array,
	 * and lower than the page size.
	 *
	 * In any case, we return immediately if things are bad, as it would
	 * be unsafe to proceed.
	 */
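	/*
	 * Sketch of the layout being checked (from db_page.h): the inp
	 * array grows upward from the page header, while items are laid
	 * out downward from the end of the page, and inp[ent] holds each
	 * item's byte offset within the page.  himark tracks the lowest
	 * item offset seen so far and inpend tracks the end of the inp
	 * array, so the two regions must never cross.
	 */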
	inp = P_INP(dbp, h);
	for (ent = 0, himark = dbp->pgsize,
	    inpend = (u_int32_t)((u_int8_t *)inp - (u_int8_t *)h);
	    ent < NUM_ENT(h); ent++)
		if (inp[ent] >= himark) {
			EPRINT((env,
			    "Page %lu: item %lu is out of order or nonsensical",
			    (u_long)pgno, (u_long)ent));
			isbad = 1;
			goto err;
		} else if (inpend >= himark) {
			EPRINT((env,
			    "Page %lu: entries array collided with data",
			    (u_long)pgno));
			isbad = 1;
			goto err;

		} else {
			himark = inp[ent];
			inpend += sizeof(db_indx_t);
			if ((ret = __ham_vrfy_item(
			    dbp, vdp, pgno, h, ent, flags)) != 0)
				goto err;
		}

	if (!LF_ISSET(DB_NOORDERCHK) && TYPE(h) == P_HASH && (ret =
	    __ham_verify_sorted_page(dbp, vdp->thread_info, NULL, h)) != 0)
		isbad = 1;

err:	if ((t_ret =
	    __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
		ret = t_ret;
	return (ret == 0 && isbad == 1 ? DB_VERIFY_BAD : ret);
}

/*
 * __ham_vrfy_item --
 *	Given a hash page and an offset, sanity-check the item itself,
 *	and save off any overflow items or off-page dup children as necessary.
 */
static int
__ham_vrfy_item(dbp, vdp, pgno, h, i, flags)
	DB *dbp;
	VRFY_DBINFO *vdp;
	db_pgno_t pgno;
	PAGE *h;
	u_int32_t i, flags;
{
	HOFFDUP hod;
	HOFFPAGE hop;
	VRFY_CHILDINFO child;
	VRFY_PAGEINFO *pip;
	db_indx_t offset, len, dlen, elen;
	int ret, t_ret;
	u_int8_t *databuf;

	if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
		return (ret);

	switch (HPAGE_TYPE(dbp, h, i)) {
	case H_KEYDATA:
		/* Nothing to do here--everything but the type field is data */
		break;
	case H_DUPLICATE:
		/* Are we a datum or a key?  Better be the former. */
		if (i % 2 == 0) {
			EPRINT((dbp->env,
			    "Page %lu: hash key stored as duplicate item %lu",
			    (u_long)pip->pgno, (u_long)i));
		}
		/*
		 * Dups are encoded as a series within a single HKEYDATA,
		 * in which each dup is surrounded by a copy of its length
		 * on either side (so that the series can be walked in either
		 * direction).  We loop through this series and make sure
		 * each dup is reasonable.
		 *
		 * Note that at this point, we've verified item i-1, so
		 * it's safe to use LEN_HKEYDATA (which looks at inp[i-1]).
		 */
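		/*
		 * Rough sketch of the encoding checked below:
		 *
		 *	[len1][data1][len1][len2][data2][len2]...
		 *
		 * where each length is a db_indx_t, so one element
		 * occupies DUP_SIZE(len) == len + 2 * sizeof(db_indx_t)
		 * bytes.
		 */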
		len = LEN_HKEYDATA(dbp, h, dbp->pgsize, i);
		databuf = HKEYDATA_DATA(P_ENTRY(dbp, h, i));
		for (offset = 0; offset < len; offset += DUP_SIZE(dlen)) {
			memcpy(&dlen, databuf + offset, sizeof(db_indx_t));

			/* Make sure the length is plausible. */
			if (offset + DUP_SIZE(dlen) > len) {
				EPRINT((dbp->env,
			    "Page %lu: duplicate item %lu has bad length",
				    (u_long)pip->pgno, (u_long)i));
				ret = DB_VERIFY_BAD;
				goto err;
			}

			/*
			 * Make sure the second copy of the length is the
			 * same as the first.
			 */
			memcpy(&elen,
			    databuf + offset + dlen + sizeof(db_indx_t),
			    sizeof(db_indx_t));
			if (elen != dlen) {
				EPRINT((dbp->env,
		"Page %lu: duplicate item %lu has two different lengths",
				    (u_long)pip->pgno, (u_long)i));
				ret = DB_VERIFY_BAD;
				goto err;
			}
		}
		F_SET(pip, VRFY_HAS_DUPS);
		if (!LF_ISSET(DB_NOORDERCHK) &&
		    __ham_dups_unsorted(dbp, databuf, len))
			F_SET(pip, VRFY_DUPS_UNSORTED);
		break;
	case H_OFFPAGE:
		/* Offpage item.  Make sure pgno is sane, save off. */
		memcpy(&hop, P_ENTRY(dbp, h, i), HOFFPAGE_SIZE);
		if (!IS_VALID_PGNO(hop.pgno) || hop.pgno == pip->pgno ||
		    hop.pgno == PGNO_INVALID) {
			EPRINT((dbp->env,
			    "Page %lu: offpage item %lu has bad pgno %lu",
			    (u_long)pip->pgno, (u_long)i, (u_long)hop.pgno));
			ret = DB_VERIFY_BAD;
			goto err;
		}
		memset(&child, 0, sizeof(VRFY_CHILDINFO));
		child.pgno = hop.pgno;
		child.type = V_OVERFLOW;
		child.tlen = hop.tlen; /* This will get checked later. */
		if ((ret = __db_vrfy_childput(vdp, pip->pgno, &child)) != 0)
			goto err;
		break;
	case H_OFFDUP:
		/* Offpage duplicate item.  Same drill. */
		memcpy(&hod, P_ENTRY(dbp, h, i), HOFFDUP_SIZE);
		if (!IS_VALID_PGNO(hod.pgno) || hod.pgno == pip->pgno ||
		    hod.pgno == PGNO_INVALID) {
			EPRINT((dbp->env,
			    "Page %lu: offpage item %lu has bad page number",
			    (u_long)pip->pgno, (u_long)i));
			ret = DB_VERIFY_BAD;
			goto err;
		}
		memset(&child, 0, sizeof(VRFY_CHILDINFO));
		child.pgno = hod.pgno;
		child.type = V_DUPLICATE;
		if ((ret = __db_vrfy_childput(vdp, pip->pgno, &child)) != 0)
			goto err;
		F_SET(pip, VRFY_HAS_DUPS);
		break;
	default:
		EPRINT((dbp->env,
		    "Page %lu: item %lu has bad type",
		    (u_long)pip->pgno, (u_long)i));
		ret = DB_VERIFY_BAD;
		break;
	}

err:	if ((t_ret =
	    __db_vrfy_putpageinfo(dbp->env, vdp, pip)) != 0 && ret == 0)
		ret = t_ret;
	return (ret);
}

/*
 * __ham_vrfy_structure --
 *	Verify the structure of a hash database.
 *
 * PUBLIC: int __ham_vrfy_structure __P((DB *, VRFY_DBINFO *, db_pgno_t,
 * PUBLIC:     u_int32_t));
 */
int
__ham_vrfy_structure(dbp, vdp, meta_pgno, flags)
	DB *dbp;
	VRFY_DBINFO *vdp;
	db_pgno_t meta_pgno;
	u_int32_t flags;
{
	DB *pgset;
	DB_MPOOLFILE *mpf;
	HMETA *m;
	PAGE *h;
	VRFY_PAGEINFO *pip;
	int isbad, p, ret, t_ret;
	db_pgno_t pgno;
	u_int32_t bucket, spares_entry;

	mpf = dbp->mpf;
	pgset = vdp->pgset;
	h = NULL;
	ret = isbad = 0;

	if ((ret = __db_vrfy_pgset_get(pgset,
	    vdp->thread_info, meta_pgno, &p)) != 0)
		return (ret);
	if (p != 0) {
		EPRINT((dbp->env,
		    "Page %lu: Hash meta page referenced twice",
		    (u_long)meta_pgno));
		return (DB_VERIFY_BAD);
	}
	if ((ret = __db_vrfy_pgset_inc(pgset,
	    vdp->thread_info, meta_pgno)) != 0)
		return (ret);

	/* Get the meta page;  we'll need it frequently. */
	if ((ret = __memp_fget(mpf,
	    &meta_pgno, vdp->thread_info, NULL, 0, &m)) != 0)
		return (ret);

	/* Loop through bucket by bucket. */
	for (bucket = 0; bucket <= m->max_bucket; bucket++)
		if ((ret =
		    __ham_vrfy_bucket(dbp, vdp, m, bucket, flags)) != 0) {
			if (ret == DB_VERIFY_BAD)
				isbad = 1;
			else
				goto err;
		    }

	/*
	 * There may be unused hash pages corresponding to buckets
	 * that have been allocated but not yet used.  These may be
	 * part of the current doubling above max_bucket, or they may
	 * correspond to buckets that were used in a transaction
	 * that then aborted.
	 *
	 * Loop through them, as far as the spares array defines them,
	 * and make sure they're all empty.
	 *
	 * Note that this should be safe, since we've already verified
	 * that the spares array is sane.
	 */
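	/*
	 * For example (illustrative only): with max_bucket == 5 the
	 * current doubling runs through bucket 7, so buckets 6 and 7
	 * may already have pages reserved via the spares array even
	 * though nothing has hashed to them yet.
	 */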
	for (bucket = m->max_bucket + 1; spares_entry = __db_log2(bucket + 1),
	    spares_entry < NCACHED && m->spares[spares_entry] != 0; bucket++) {
		pgno = BS_TO_PAGE(bucket, m->spares);
		if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
			goto err;

		/* It's okay if this page is totally zeroed;  unmark it. */
		F_CLR(pip, VRFY_IS_ALLZEROES);

		/* It's also OK if this page is simply invalid. */
		if (pip->type == P_INVALID) {
			if ((ret = __db_vrfy_putpageinfo(dbp->env,
			    vdp, pip)) != 0)
				goto err;
			continue;
		}

		if (pip->type != P_HASH && pip->type != P_HASH_UNSORTED) {
			EPRINT((dbp->env,
			    "Page %lu: hash bucket %lu maps to non-hash page",
			    (u_long)pgno, (u_long)bucket));
			isbad = 1;
		} else if (pip->entries != 0) {
			EPRINT((dbp->env,
		    "Page %lu: non-empty page in unused hash bucket %lu",
			    (u_long)pgno, (u_long)bucket));
			isbad = 1;
		} else {
			if ((ret = __db_vrfy_pgset_get(pgset,
			    vdp->thread_info, pgno, &p)) != 0)
				goto err;
			if (p != 0) {
				EPRINT((dbp->env,
				    "Page %lu: above max_bucket referenced",
				    (u_long)pgno));
				isbad = 1;
			} else {
				if ((ret = __db_vrfy_pgset_inc(pgset,
				    vdp->thread_info, pgno)) != 0)
					goto err;
				if ((ret = __db_vrfy_putpageinfo(dbp->env,
				    vdp, pip)) != 0)
					goto err;
				continue;
			}
		}

		/* If we got here, it's an error. */
		(void)__db_vrfy_putpageinfo(dbp->env, vdp, pip);
		goto err;
	}

err:	if ((t_ret = __memp_fput(mpf, vdp->thread_info, m, dbp->priority)) != 0)
		return (t_ret);
	if (h != NULL &&
	    (t_ret = __memp_fput(mpf, vdp->thread_info, h, dbp->priority)) != 0)
		return (t_ret);
	return ((isbad == 1 && ret == 0) ? DB_VERIFY_BAD: ret);
}

/*
 * __ham_vrfy_bucket --
 *	Verify a given bucket.
 */
static int
__ham_vrfy_bucket(dbp, vdp, m, bucket, flags)
	DB *dbp;
	VRFY_DBINFO *vdp;
	HMETA *m;
	u_int32_t bucket, flags;
{
	ENV *env;
	HASH *hashp;
	VRFY_CHILDINFO *child;
	VRFY_PAGEINFO *mip, *pip;
	int ret, t_ret, isbad, p;
	db_pgno_t pgno, next_pgno;
	DBC *cc;
	u_int32_t (*hfunc) __P((DB *, const void *, u_int32_t));

	env = dbp->env;
	isbad = 0;
	pip = NULL;
	cc = NULL;

	hashp = dbp->h_internal;
	if (hashp != NULL && hashp->h_hash != NULL)
		hfunc = hashp->h_hash;
	else
		hfunc = __ham_func5;

	if ((ret = __db_vrfy_getpageinfo(vdp, PGNO(m), &mip)) != 0)
		return (ret);

	/* Calculate the first pgno for this bucket. */
	pgno = BS_TO_PAGE(bucket, m->spares);

	if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
		goto err;

	/* Make sure we got a plausible page number. */
	if (pgno > vdp->last_pgno ||
	    (pip->type != P_HASH && pip->type != P_HASH_UNSORTED)) {
		EPRINT((env,
		    "Page %lu: impossible first page in bucket %lu",
		    (u_long)pgno, (u_long)bucket));
		/* Unsafe to continue. */
		isbad = 1;
		goto err;
	}

	if (pip->prev_pgno != PGNO_INVALID) {
		EPRINT((env,
		    "Page %lu: first page in hash bucket %lu has a prev_pgno",
		    (u_long)pgno, (u_long)bucket));
		isbad = 1;
	}

	/*
	 * Set flags for dups and sorted dups.
	 */
	flags |= F_ISSET(mip, VRFY_HAS_DUPS) ? DB_ST_DUPOK : 0;
	flags |= F_ISSET(mip, VRFY_HAS_DUPSORT) ? DB_ST_DUPSORT : 0;

	/* Loop until we find a fatal bug, or until we run out of pages. */
	for (;;) {
		/* Provide feedback on our progress to the application. */
		if (!LF_ISSET(DB_SALVAGE))
			__db_vrfy_struct_feedback(dbp, vdp);

		if ((ret = __db_vrfy_pgset_get(vdp->pgset,
		    vdp->thread_info, pgno, &p)) != 0)
			goto err;
		if (p != 0) {
			EPRINT((env,
			    "Page %lu: hash page referenced twice",
			    (u_long)pgno));
			isbad = 1;
			/* Unsafe to continue. */
			goto err;
		} else if ((ret = __db_vrfy_pgset_inc(vdp->pgset,
		    vdp->thread_info, pgno)) != 0)
			goto err;

		/*
		 * Hash pages that nothing has ever hashed to may never
		 * have actually come into existence, and may appear to be
		 * entirely zeroed.  This is acceptable, and since there's
		 * no real way for us to know whether this has actually
		 * occurred, we clear the "wholly zeroed" flag on every
		 * hash page.  A wholly zeroed page, by nature, will appear
		 * to have no flags set and zero entries, so should
		 * otherwise verify correctly.
		 */
		F_CLR(pip, VRFY_IS_ALLZEROES);

		/* If we have dups, our meta page had better know about it. */
		if (F_ISSET(pip, VRFY_HAS_DUPS) &&
		    !F_ISSET(mip, VRFY_HAS_DUPS)) {
			EPRINT((env,
		    "Page %lu: duplicates present in non-duplicate database",
			    (u_long)pgno));
			isbad = 1;
		}

		/*
		 * If the database has sorted dups, this page had better
		 * not have unsorted ones.
		 */
		if (F_ISSET(mip, VRFY_HAS_DUPSORT) &&
		    F_ISSET(pip, VRFY_DUPS_UNSORTED)) {
			EPRINT((env,
			    "Page %lu: unsorted dups in sorted-dup database",
			    (u_long)pgno));
			isbad = 1;
		}

		/* Walk overflow chains and offpage dup trees. */
		if ((ret = __db_vrfy_childcursor(vdp, &cc)) != 0)
			goto err;
		for (ret = __db_vrfy_ccset(cc, pip->pgno, &child); ret == 0;
		    ret = __db_vrfy_ccnext(cc, &child))
			if (child->type == V_OVERFLOW) {
				if ((ret = __db_vrfy_ovfl_structure(dbp, vdp,
				    child->pgno, child->tlen,
				    flags | DB_ST_OVFL_LEAF)) != 0) {
					if (ret == DB_VERIFY_BAD)
						isbad = 1;
					else
						goto err;
				}
			} else if (child->type == V_DUPLICATE) {
				if ((ret = __db_vrfy_duptype(dbp,
				    vdp, child->pgno, flags)) != 0) {
					isbad = 1;
					continue;
				}
				if ((ret = __bam_vrfy_subtree(dbp, vdp,
				    child->pgno, NULL, NULL,
		    flags | DB_ST_RECNUM | DB_ST_DUPSET | DB_ST_TOPLEVEL,
				    NULL, NULL, NULL)) != 0) {
					if (ret == DB_VERIFY_BAD)
						isbad = 1;
					else
						goto err;
				}
			}
		if ((ret = __db_vrfy_ccclose(cc)) != 0)
			goto err;
		cc = NULL;

		/* If it's safe to check that things hash properly, do so. */
		if (isbad == 0 && !LF_ISSET(DB_NOORDERCHK) &&
		    (ret = __ham_vrfy_hashing(dbp, pip->entries,
		    m, bucket, pgno, flags, hfunc)) != 0) {
			if (ret == DB_VERIFY_BAD)
				isbad = 1;
			else
				goto err;
		}

		next_pgno = pip->next_pgno;
		ret = __db_vrfy_putpageinfo(env, vdp, pip);

		pip = NULL;
		if (ret != 0)
			goto err;

		if (next_pgno == PGNO_INVALID)
			break;		/* End of the bucket. */

		/* We already checked this, but just in case... */
		if (!IS_VALID_PGNO(next_pgno)) {
			EPRINT((env,
			    "Page %lu: hash page has bad next_pgno",
			    (u_long)pgno));
			isbad = 1;
			goto err;
		}

		if ((ret = __db_vrfy_getpageinfo(vdp, next_pgno, &pip)) != 0)
			goto err;

		if (pip->prev_pgno != pgno) {
			EPRINT((env,
			    "Page %lu: hash page has bad prev_pgno",
			    (u_long)next_pgno));
			isbad = 1;
		}
		pgno = next_pgno;
	}

err:	if (cc != NULL && ((t_ret = __db_vrfy_ccclose(cc)) != 0) && ret == 0)
		ret = t_ret;
	if (mip != NULL && ((t_ret =
	    __db_vrfy_putpageinfo(env, vdp, mip)) != 0) && ret == 0)
		ret = t_ret;
	if (pip != NULL && ((t_ret =
	    __db_vrfy_putpageinfo(env, vdp, pip)) != 0) && ret == 0)
		ret = t_ret;
	return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
}

/*
 * __ham_vrfy_hashing --
 *	Verify that all items on a given hash page hash correctly.
 *
 * PUBLIC: int __ham_vrfy_hashing __P((DB *,
 * PUBLIC:     u_int32_t, HMETA *, u_int32_t, db_pgno_t, u_int32_t,
 * PUBLIC:     u_int32_t (*) __P((DB *, const void *, u_int32_t))));
 */
int
__ham_vrfy_hashing(dbp, nentries, m, thisbucket, pgno, flags, hfunc)
	DB *dbp;
	u_int32_t nentries;
	HMETA *m;
	u_int32_t thisbucket;
	db_pgno_t pgno;
	u_int32_t flags;
	u_int32_t (*hfunc) __P((DB *, const void *, u_int32_t));
{
	DBT dbt;
	DB_MPOOLFILE *mpf;
	DB_THREAD_INFO *ip;
	PAGE *h;
	db_indx_t i;
	int ret, t_ret, isbad;
	u_int32_t hval, bucket;

	mpf = dbp->mpf;
	ret = isbad = 0;

	memset(&dbt, 0, sizeof(DBT));
	F_SET(&dbt, DB_DBT_REALLOC);
	ENV_GET_THREAD_INFO(dbp->env, ip);

	if ((ret = __memp_fget(mpf, &pgno, ip, NULL, 0, &h)) != 0)
		return (ret);

	for (i = 0; i < nentries; i += 2) {
		/*
		 * We've already verified the page integrity and that of any
		 * overflow chains linked off it;  it is therefore safe to use
		 * __db_ret.  It's also not all that much slower, since we have
		 * to copy every hash item to deal with alignment anyway;  we
		 * can tweak this a bit if this proves to be a bottleneck,
		 * but for now, take the easy route.
		 */
		if ((ret = __db_ret(dbp, ip,
		     NULL, h, i, &dbt, NULL, NULL)) != 0)
			goto err;
		hval = hfunc(dbp, dbt.data, dbt.size);

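		/*
		 * Illustrative example: with max_bucket == 5 (high_mask
		 * 0x7, low_mask 0x3), a hash value of 14 masks to bucket
		 * 6; since 6 > max_bucket, that bucket has not been split
		 * into yet, so the item folds down to bucket 6 & 0x3 == 2.
		 */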
		bucket = hval & m->high_mask;
		if (bucket > m->max_bucket)
			bucket = bucket & m->low_mask;

		if (bucket != thisbucket) {
			EPRINT((dbp->env,
			    "Page %lu: item %lu hashes incorrectly",
			    (u_long)pgno, (u_long)i));
			isbad = 1;
		}
	}

err:	if (dbt.data != NULL)
		__os_ufree(dbp->env, dbt.data);
	if ((t_ret = __memp_fput(mpf, ip, h, dbp->priority)) != 0)
		return (t_ret);

	return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
}

/*
 * __ham_salvage --
 *	Safely dump out anything that looks like a key on an alleged
 *	hash page.
 *
 * PUBLIC: int __ham_salvage __P((DB *, VRFY_DBINFO *, db_pgno_t, PAGE *,
 * PUBLIC:     void *, int (*)(void *, const void *), u_int32_t));
 */
int
__ham_salvage(dbp, vdp, pgno, h, handle, callback, flags)
	DB *dbp;
	VRFY_DBINFO *vdp;
	db_pgno_t pgno;
	PAGE *h;
	void *handle;
	int (*callback) __P((void *, const void *));
	u_int32_t flags;
{
	DBT dbt, key_dbt, unkdbt;
	db_pgno_t dpgno;
	int ret, err_ret, t_ret;
	u_int32_t himark, i;
	u_int8_t *hk, *p;
	void *buf, *key_buf;
	db_indx_t dlen, len, tlen;

	memset(&dbt, 0, sizeof(DBT));
	dbt.flags = DB_DBT_REALLOC;

	DB_INIT_DBT(unkdbt, "UNKNOWN", sizeof("UNKNOWN") - 1);

	err_ret = 0;

	/*
	 * Allocate a buffer for overflow items.  Start at one page;
	 * __db_safe_goff will realloc as needed.
	 */
	if ((ret = __os_malloc(dbp->env, dbp->pgsize, &buf)) != 0)
		return (ret);

	himark = dbp->pgsize;
	for (i = 0;; i++) {
		/* If we're not aggressive, break when we hit NUM_ENT(h). */
		if (!LF_ISSET(DB_AGGRESSIVE) && i >= NUM_ENT(h))
			break;

		/* Verify the current item. */
		ret = __db_vrfy_inpitem(dbp,
		    h, pgno, i, 0, flags, &himark, NULL);
		/* If this returned a fatality, it's time to break. */
		if (ret == DB_VERIFY_FATAL)
			break;

		if (ret == 0) {
			/* Set len to total entry length. */
			len = LEN_HITEM(dbp, h, dbp->pgsize, i);
			hk = P_ENTRY(dbp, h, i);
			if (len == 0 || len > dbp->pgsize ||
			    (u_int32_t)(hk + len - (u_int8_t *)h) >
			    dbp->pgsize) {
				/* Item is unsafely large; skip it. */
				err_ret = DB_VERIFY_BAD;
				continue;
			}
			switch (HPAGE_PTYPE(hk)) {
			case H_KEYDATA:
				/* Update len to size of item. */
				len = LEN_HKEYDATA(dbp, h, dbp->pgsize, i);
keydata:			memcpy(buf, HKEYDATA_DATA(hk), len);
				dbt.size = len;
				dbt.data = buf;
				if ((ret = __db_vrfy_prdbt(&dbt,
				    0, " ", handle, callback, 0, vdp)) != 0)
					err_ret = ret;
				break;
			case H_OFFPAGE:
				if (len < HOFFPAGE_SIZE) {
					err_ret = DB_VERIFY_BAD;
					continue;
				}
				memcpy(&dpgno,
				    HOFFPAGE_PGNO(hk), sizeof(dpgno));
				if ((ret = __db_safe_goff(dbp, vdp,
				    dpgno, &dbt, &buf, flags)) != 0) {
					err_ret = ret;
					(void)__db_vrfy_prdbt(&unkdbt, 0, " ",
					    handle, callback, 0, vdp);
					/* fallthrough to end of case */
				} else if ((ret = __db_vrfy_prdbt(&dbt,
				    0, " ", handle, callback, 0, vdp)) != 0)
					err_ret = ret;
				break;
			case H_OFFDUP:
				if (len < HOFFDUP_SIZE) {
					err_ret = DB_VERIFY_BAD;
					continue;
				}
				memcpy(&dpgno,
				    HOFFDUP_PGNO(hk), sizeof(dpgno));
				/* UNKNOWN iff pgno is bad or we're a key. */
				if (!IS_VALID_PGNO(dpgno) || (i % 2 == 0)) {
					if ((ret =
					    __db_vrfy_prdbt(&unkdbt, 0, " ",
					    handle, callback, 0, vdp)) != 0)
						err_ret = ret;
				} else if ((ret = __db_salvage_duptree(dbp,
				    vdp, dpgno, &dbt, handle, callback,
				    flags | DB_SA_SKIPFIRSTKEY)) != 0)
					err_ret = ret;
				break;
			case H_DUPLICATE:
				/*
				 * This is an on-page duplicate item; iterate
				 * over the duplicate set, printing out
				 * key/data pairs.
				 */
				len = LEN_HKEYDATA(dbp, h, dbp->pgsize, i);
				/*
				 * If this item is at an even index it must be
				 * a key item and it should never be of type
				 * H_DUPLICATE. If we are in aggressive mode,
				 * print the item out as a normal key, and let
				 * the user resolve the discrepancy.
				 */
				if (i % 2 == 0) {
					err_ret = ret;
					if (LF_ISSET(DB_AGGRESSIVE))
						goto keydata;
					break;
				}

				/*
				 * Check to ensure that the item size is
				 * greater than the smallest possible on page
				 * duplicate.
				 */
				if (len <
				    HKEYDATA_SIZE(2 * sizeof(db_indx_t))) {
					err_ret = DB_VERIFY_BAD;
					continue;
				}

				/*
				 * Copy out the key from the dbt; it is still
				 * present from the previous pass.
				 */
				memset(&key_dbt, 0, sizeof(key_dbt));
				if ((ret = __os_malloc(
				    dbp->env, dbt.size, &key_buf)) != 0)
					return (ret);
				memcpy(key_buf, buf, dbt.size);
				key_dbt.data = key_buf;
				key_dbt.size = dbt.size;
				key_dbt.flags = DB_DBT_USERMEM;

				/* Loop until we hit the total length. */
				for (tlen = 0; tlen + sizeof(db_indx_t) < len;
				    tlen += dlen + 2 * sizeof(db_indx_t)) {
					/*
					 * Print the key for every duplicate
					 * item. Except the first dup, since
					 * the key was already output once by
					 * the previous iteration.
					 */
					if (tlen != 0) {
						if ((ret = __db_vrfy_prdbt(
						    &key_dbt, 0, " ", handle,
						    callback, 0, vdp)) != 0)
							err_ret = ret;
					}
					p = HKEYDATA_DATA(hk) + tlen;
					memcpy(&dlen, p, sizeof(db_indx_t));
					p += sizeof(db_indx_t);
					/*
					 * If dlen is too long, print all the
					 * rest of the dup set in a chunk.
					 */
					if (dlen + tlen + sizeof(db_indx_t) >
					    len) {
						dlen = len -
						    (tlen + sizeof(db_indx_t));
						err_ret = DB_VERIFY_BAD;
					}
					memcpy(buf, p, dlen);
					dbt.size = dlen;
					dbt.data = buf;
					if ((ret = __db_vrfy_prdbt(&dbt, 0, " ",
					    handle, callback, 0, vdp)) != 0)
						err_ret = ret;
				}
				__os_free(dbp->env, key_buf);
				break;
			default:
				if (!LF_ISSET(DB_AGGRESSIVE))
					break;
				err_ret = DB_VERIFY_BAD;
				break;
			}
		}
	}

	__os_free(dbp->env, buf);
	if ((t_ret = __db_salvage_markdone(vdp, pgno)) != 0)
		return (t_ret);
	return ((ret == 0 && err_ret != 0) ? err_ret : ret);
}

/*
 * __ham_meta2pgset --
 *	Return the set of hash pages corresponding to the given
 *	known-good meta page.
 *
 * PUBLIC: int __ham_meta2pgset __P((DB *, VRFY_DBINFO *, HMETA *, u_int32_t,
 * PUBLIC:     DB *));
 */
int
__ham_meta2pgset(dbp, vdp, hmeta, flags, pgset)
	DB *dbp;
	VRFY_DBINFO *vdp;
	HMETA *hmeta;
	u_int32_t flags;
	DB *pgset;
{
	DB_MPOOLFILE *mpf;
	DB_THREAD_INFO *ip;
	PAGE *h;
	db_pgno_t pgno;
	u_int32_t bucket, totpgs;
	int ret, val;

	/*
	 * We don't really need flags, but leave them for consistency with
	 * __bam_meta2pgset.
	 */
	COMPQUIET(flags, 0);
	ip = vdp->thread_info;

	DB_ASSERT(dbp->env, pgset != NULL);

	mpf = dbp->mpf;
	totpgs = 0;

	/*
	 * Loop through all the buckets, pushing onto pgset the corresponding
	 * page(s) for each one.
	 */
	for (bucket = 0; bucket <= hmeta->max_bucket; bucket++) {
		pgno = BS_TO_PAGE(bucket, hmeta->spares);

		/*
		 * We know the initial pgno is safe because the spares array has
		 * been verified.
		 *
		 * Safely walk the list of pages in this bucket.
		 */
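		/*
		 * Summary of the walk below: follow NEXT_PGNO links from
		 * the bucket's first page, adding each page to pgset.
		 * pgset doubles as the visited set, and capping totpgs at
		 * last_pgno keeps a corrupt chain from looping forever.
		 */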
		for (;;) {
			if ((ret =
			    __memp_fget(mpf, &pgno, ip, NULL, 0, &h)) != 0)
				return (ret);
			if (TYPE(h) == P_HASH || TYPE(h) == P_HASH_UNSORTED) {

				/*
				 * Make sure we don't go past the end of
				 * pgset.
				 */
				if (++totpgs > vdp->last_pgno) {
					(void)__memp_fput(mpf,
					    ip, h, dbp->priority);
					return (DB_VERIFY_BAD);
				}
				if ((ret = __db_vrfy_pgset_inc(pgset,
				    vdp->thread_info, pgno)) != 0) {
					(void)__memp_fput(mpf,
					    ip, h, dbp->priority);
					return (ret);
				}

				pgno = NEXT_PGNO(h);
			} else
				pgno = PGNO_INVALID;

			if ((ret = __memp_fput(mpf, ip, h, dbp->priority)) != 0)
				return (ret);

			/* If the new pgno is wonky, go onto the next bucket. */
			if (!IS_VALID_PGNO(pgno) ||
			    pgno == PGNO_INVALID)
				break;

			/*
			 * If we've touched this page before, we have a cycle;
			 * go on to the next bucket.
			 */
			if ((ret = __db_vrfy_pgset_get(pgset,
			    vdp->thread_info, pgno, &val)) != 0)
				return (ret);
			if (val != 0)
				break;
		}
	}
	return (0);
}

/*
 * __ham_dups_unsorted --
 *	Takes a known-safe hash duplicate set and its total length.
 *	Returns 1 if there are out-of-order duplicates in this set,
 *	0 if there are not.
 */
static int
__ham_dups_unsorted(dbp, buf, len)
	DB *dbp;
	u_int8_t *buf;
	u_int32_t len;
{
	DBT a, b;
	db_indx_t offset, dlen;
	int (*func) __P((DB *, const DBT *, const DBT *));

	memset(&a, 0, sizeof(DBT));
	memset(&b, 0, sizeof(DBT));

	func = (dbp->dup_compare == NULL) ? __bam_defcmp : dbp->dup_compare;

	/*
	 * Loop through the dup set until we hit the end or we find
	 * a pair of dups that's out of order.  b is always the current
	 * dup, a the one before it.
	 */
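	/*
	 * As above, each element is laid out as [len][data][len], so b.data
	 * points just past the leading length word and DUP_SIZE(dlen)
	 * advances past the datum and both copies of its length.
	 */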
	for (offset = 0; offset < len; offset += DUP_SIZE(dlen)) {
		memcpy(&dlen, buf + offset, sizeof(db_indx_t));
		b.data = buf + offset + sizeof(db_indx_t);
		b.size = dlen;

		if (a.data != NULL && func(dbp, &a, &b) > 0)
			return (1);

		a.data = b.data;
		a.size = b.size;
	}

	return (0);
}