1/*-
2 * See the file LICENSE for redistribution information.
3 *
4 * Copyright (c) 1996,2008 Oracle.  All rights reserved.
5 */
6/*
7 * Copyright (c) 1995, 1996
8 *	Margo Seltzer.  All rights reserved.
9 */
10/*
11 * Copyright (c) 1995, 1996
12 *	The President and Fellows of Harvard University.  All rights reserved.
13 *
14 * This code is derived from software contributed to Berkeley by
15 * Margo Seltzer.
16 *
17 * Redistribution and use in source and binary forms, with or without
18 * modification, are permitted provided that the following conditions
19 * are met:
20 * 1. Redistributions of source code must retain the above copyright
21 *    notice, this list of conditions and the following disclaimer.
22 * 2. Redistributions in binary form must reproduce the above copyright
23 *    notice, this list of conditions and the following disclaimer in the
24 *    documentation and/or other materials provided with the distribution.
25 * 3. Neither the name of the University nor the names of its contributors
26 *    may be used to endorse or promote products derived from this software
27 *    without specific prior written permission.
28 *
29 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
30 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
31 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
32 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
33 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
34 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
35 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
36 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
37 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
38 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
39 * SUCH DAMAGE.
40 *
41 * $Id: hash_rec.c,v 12.44 2008/02/18 04:46:43 mjc Exp $
42 */
43
44#include "db_config.h"
45
46#include "db_int.h"
47#include "dbinc/db_page.h"
48#include "dbinc/btree.h"
49#include "dbinc/hash.h"
50#include "dbinc/log.h"
51#include "dbinc/mp.h"
52
/* Forward declarations for the file-local group-allocation page helpers. */
static int __ham_alloc_pages __P((DBC *, __ham_groupalloc_args *, DB_LSN *));
static int __ham_alloc_pages_42
    __P((DBC *, __ham_groupalloc_42_args *, DB_LSN *));
56
/*
 * __ham_insdel_recover --
 *	Recovery function for hash key/data pair insert and delete log
 *	records (opcodes PUTPAIR and DELPAIR).  Depending on the opcode,
 *	on whether the record is being redone or undone, and on how the
 *	page's LSN compares with the LSNs carried in the record, the pair
 *	is either put back on the page or removed from it.  On the success
 *	path *lsnp is set to the record's prev_lsn and ret to 0.
 *
 * PUBLIC: int __ham_insdel_recover
 * PUBLIC:     __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
 */
int
__ham_insdel_recover(env, dbtp, lsnp, op, info)
	ENV *env;
	DBT *dbtp;
	DB_LSN *lsnp;
	db_recops op;
	void *info;
{
	__ham_insdel_args *argp;
	DB_THREAD_INFO *ip;
	DB *file_dbp;
	DBC *dbc;
	DB_MPOOLFILE *mpf;
	PAGE *pagep;
	db_indx_t dindx;
	u_int32_t opcode;
	int cmp_n, cmp_p, dtype, ktype, ret;

	/* info is treated as a DB_TXNHEAD; pick up its thread info. */
	ip = ((DB_TXNHEAD *)info)->thread_info;
	/* Keep pagep NULL until a page is pinned; "out" tests it. */
	pagep = NULL;
	REC_PRINT(__ham_insdel_print);
	/*
	 * argp, file_dbp, dbc and mpf are not assigned anywhere else in this
	 * function, so they are expected to be set up by REC_INTRO.
	 */
	REC_INTRO(__ham_insdel_read, ip, 1);

	if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL,
	    0, &pagep)) != 0) {
		if (DB_UNDO(op)) {
			/* Undo: a missing page means there is nothing to do. */
			if (ret == DB_PAGE_NOTFOUND)
				goto done;
			else {
				ret = __db_pgerr(file_dbp, argp->pgno, ret);
				goto out;
			}
		}
		/* If the page is not here then it was later truncated. */
		if (!IS_ZERO_LSN(argp->pagelsn))
			goto done;
		/*
		 * This page was created by a group allocation and
		 * the file may not have been extended yet.
		 * Create the page if necessary.
		 */
		if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL,
		    DB_MPOOL_CREATE, &pagep)) != 0) {
			ret = __db_pgerr(file_dbp, argp->pgno, ret);
			goto out;
		}
	}

	/*
	 * cmp_n compares this record's LSN with the page LSN; cmp_p compares
	 * the page LSN with the page LSN saved in the record.  The branches
	 * below only act when the relevant comparison is 0.
	 */
	cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
	cmp_p = LOG_COMPARE(&LSN(pagep), &argp->pagelsn);
	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->pagelsn);

	/*
	 * Two possible things going on:
	 * redo a delete/undo a put: delete the item from the page.
	 * redo a put/undo a delete: add the item to the page.
	 * If we are undoing a delete, then the information logged is the
	 * entire entry off the page, not just the data of a dbt.  In
	 * this case, we want to copy it back onto the page verbatim.
	 * We do this by calling __insertpair with the type H_OFFPAGE instead
	 * of H_KEYDATA.
	 */
	opcode = OPCODE_OF(argp->opcode);
	if ((opcode == DELPAIR && cmp_n == 0 && DB_UNDO(op)) ||
	    (opcode == PUTPAIR && cmp_p == 0 && DB_REDO(op))) {
		/*
		 * Need to redo a PUT or undo a delete.
		 */
		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
		/* Key type: H_OFFPAGE on undo or for a big key. */
		ktype = DB_UNDO(op) || PAIR_ISKEYBIG(argp->opcode) ?
		    H_OFFPAGE : H_KEYDATA;
		/* Data type: the duplicate flag wins, then undo/big data. */
		if (PAIR_ISDATADUP(argp->opcode))
			dtype = H_DUPLICATE;
		else if (DB_UNDO(op) || PAIR_ISDATABIG(argp->opcode))
			dtype = H_OFFPAGE;
		else
			dtype = H_KEYDATA;
		dindx = (db_indx_t)argp->ndx;
		if ((ret = __ham_insertpair(dbc, pagep, &dindx,
		    &argp->key, &argp->data, ktype, dtype)) != 0)
			goto out;
		/* Redo stamps this record's LSN, undo restores the old one. */
		LSN(pagep) = DB_REDO(op) ? *lsnp : argp->pagelsn;
	} else if ((opcode == DELPAIR && cmp_p == 0 && DB_REDO(op)) ||
	    (opcode == PUTPAIR && cmp_n == 0 && DB_UNDO(op))) {
		/* Need to undo a put or redo a delete. */
		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
		__ham_dpair(file_dbp, pagep, argp->ndx);
		LSN(pagep) = DB_REDO(op) ? *lsnp : argp->pagelsn;
	}

	if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
		goto out;
	pagep = NULL;

	/* Return the previous LSN. */
done:	*lsnp = argp->prev_lsn;
	ret = 0;

	/* Release the page if an error path left it pinned. */
out:	if (pagep != NULL)
		(void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
	REC_CLOSE;
}
165
/*
 * __ham_newpage_recover --
 *	This log message is used when we add/remove overflow pages.  This
 *	message takes care of the pointer chains, not the data on the pages.
 *
 *	Up to three pages are visited in turn: the overflow page itself
 *	(new_pgno), its predecessor (prev_pgno) and its successor
 *	(next_pgno).  Each is changed only when its own LSN comparison says
 *	the change applies, and its LSN is then moved to *lsnp (redo) or to
 *	the LSN saved in the record (undo).
 *
 * PUBLIC: int __ham_newpage_recover
 * PUBLIC:     __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
 */
int
__ham_newpage_recover(env, dbtp, lsnp, op, info)
	ENV *env;
	DBT *dbtp;
	DB_LSN *lsnp;
	db_recops op;
	void *info;
{
	__ham_newpage_args *argp;
	DB_THREAD_INFO *ip;
	DB *file_dbp;
	DBC *dbc;
	DB_MPOOLFILE *mpf;
	PAGE *pagep;
	int change, cmp_n, cmp_p, ret;

	ip = ((DB_TXNHEAD *)info)->thread_info;
	/* Keep pagep NULL until a page is pinned; "out" tests it. */
	pagep = NULL;
	REC_PRINT(__ham_newpage_print);
	REC_INTRO(__ham_newpage_read, ip, 0);

	/*
	 * REC_FGET is handed the label of the next step (ppage); presumably
	 * it continues there when the page cannot be fetched -- confirm
	 * against the macro definition.
	 */
	REC_FGET(mpf, ip, argp->new_pgno, &pagep, ppage);
	change = 0;

	/*
	 * There are potentially three pages we need to check: the one
	 * that we created/deleted, the one before it and the one after
	 * it.
	 */

	cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
	cmp_p = LOG_COMPARE(&LSN(pagep), &argp->pagelsn);
	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->pagelsn);

	if ((cmp_p == 0 && DB_REDO(op) && argp->opcode == PUTOVFL) ||
	    (cmp_n == 0 && DB_UNDO(op) && argp->opcode == DELOVFL)) {
		/* Redo a create new page or undo a delete new page. */
		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
		P_INIT(pagep, file_dbp->pgsize, argp->new_pgno,
		    argp->prev_pgno, argp->next_pgno, 0, P_HASH);
		change = 1;
	} else if ((cmp_p == 0 && DB_REDO(op) && argp->opcode == DELOVFL) ||
	    (cmp_n == 0 && DB_UNDO(op) && argp->opcode == PUTOVFL)) {
		/*
		 * Redo a delete or undo a create new page.  All we
		 * really need to do is change the LSN.
		 */
		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
		change = 1;
	}

	/* Only touch the LSN when one of the branches above fired. */
	if (change)
		LSN(pagep) = DB_REDO(op) ? *lsnp : argp->pagelsn;

	if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
		goto out;
	pagep = NULL;

	/* Now do the prev page. */
ppage:	if (argp->prev_pgno != PGNO_INVALID) {
		REC_FGET(mpf, ip, argp->prev_pgno, &pagep, npage);

		cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
		cmp_p = LOG_COMPARE(&LSN(pagep), &argp->prevlsn);
		CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->prevlsn);
		change = 0;

		if ((cmp_p == 0 && DB_REDO(op) && argp->opcode == PUTOVFL) ||
		    (cmp_n == 0 && DB_UNDO(op) && argp->opcode == DELOVFL)) {
			/* Redo a create new page or undo a delete new page. */
			REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
			/* Link the predecessor forward to the overflow page. */
			pagep->next_pgno = argp->new_pgno;
			change = 1;
		} else if ((cmp_p == 0 &&
		    DB_REDO(op) && argp->opcode == DELOVFL) ||
		    (cmp_n == 0 && DB_UNDO(op) && argp->opcode == PUTOVFL)) {
			/* Redo a delete or undo a create new page. */
			REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
			/* Bypass the overflow page in the chain. */
			pagep->next_pgno = argp->next_pgno;
			change = 1;
		}

		if (change)
			LSN(pagep) = DB_REDO(op) ? *lsnp : argp->prevlsn;

		if ((ret = __memp_fput(mpf,
		    ip, pagep, file_dbp->priority)) != 0)
			goto out;
		pagep = NULL;
	}

	/* Now time to do the next page */
npage:	if (argp->next_pgno != PGNO_INVALID) {
		REC_FGET(mpf, ip, argp->next_pgno, &pagep, done);

		cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
		cmp_p = LOG_COMPARE(&LSN(pagep), &argp->nextlsn);
		CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->nextlsn);
		change = 0;

		if ((cmp_p == 0 && DB_REDO(op) && argp->opcode == PUTOVFL) ||
		    (cmp_n == 0 && DB_UNDO(op) && argp->opcode == DELOVFL)) {
			/* Redo a create new page or undo a delete new page. */
			REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
			/* Link the successor back to the overflow page. */
			pagep->prev_pgno = argp->new_pgno;
			change = 1;
		} else if ((cmp_p == 0 &&
		    DB_REDO(op) && argp->opcode == DELOVFL) ||
		    (cmp_n == 0 && DB_UNDO(op) && argp->opcode == PUTOVFL)) {
			/* Redo a delete or undo a create new page. */
			REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
			/* Bypass the overflow page in the chain. */
			pagep->prev_pgno = argp->prev_pgno;
			change = 1;
		}

		if (change)
			LSN(pagep) = DB_REDO(op) ? *lsnp : argp->nextlsn;

		if ((ret = __memp_fput(mpf,
		    ip, pagep, file_dbp->priority)) != 0)
			goto out;
		pagep = NULL;
	}
	/* Success: hand back the previous LSN from the record. */
done:	*lsnp = argp->prev_lsn;
	ret = 0;

	/* Release the page if an error path left it pinned. */
out:	if (pagep != NULL)
		(void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
	REC_CLOSE;
}
304
/*
 * __ham_replace_recover --
 *	This log message refers to partial puts that are local to a single
 *	page.  You can think of them as special cases of the more general
 *	insdel log message.
 *
 *	Redo reapplies newitem, undo puts olditem back; in both cases the
 *	size difference between the two items and its sign are passed to
 *	__ham_onpage_replace.  If the record has makedup set, the entry's
 *	page type is switched to H_DUPLICATE (redo) or H_KEYDATA (undo).
 *
 * PUBLIC: int __ham_replace_recover
 * PUBLIC:    __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
 */
int
__ham_replace_recover(env, dbtp, lsnp, op, info)
	ENV *env;
	DBT *dbtp;
	DB_LSN *lsnp;
	db_recops op;
	void *info;
{
	__ham_replace_args *argp;
	DB_THREAD_INFO *ip;
	DB *file_dbp;
	DBC *dbc;
	DB_MPOOLFILE *mpf;
	DBT dbt;
	PAGE *pagep;
	u_int32_t change;
	int cmp_n, cmp_p, is_plus, modified, ret;
	u_int8_t *hk;

	ip = ((DB_TXNHEAD *)info)->thread_info;
	/* Keep pagep NULL until a page is pinned; "out" tests it. */
	pagep = NULL;
	REC_PRINT(__ham_replace_print);
	REC_INTRO(__ham_replace_read, ip, 0);

	REC_FGET(mpf, ip, argp->pgno, &pagep, done);

	cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
	cmp_p = LOG_COMPARE(&LSN(pagep), &argp->pagelsn);
	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->pagelsn);

	/* dbt will describe whichever item is to be placed on the page. */
	memset(&dbt, 0, sizeof(dbt));
	modified = 0;

	/*
	 * Before we know the direction of the transformation we will
	 * determine the size differential; then once we know if we are
	 * redoing or undoing, we'll adjust the sign (is_plus) appropriately.
	 */
	if (argp->newitem.size > argp->olditem.size) {
		change = argp->newitem.size - argp->olditem.size;
		is_plus = 1;
	} else {
		change = argp->olditem.size - argp->newitem.size;
		is_plus = 0;
	}
	if (cmp_p == 0 && DB_REDO(op)) {
		/* Reapply the change as specified. */
		dbt.data = argp->newitem.data;
		dbt.size = argp->newitem.size;
		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
		LSN(pagep) = *lsnp;
		/*
		 * The is_plus flag is set properly to reflect
		 * newitem.size - olditem.size.
		 */
		modified = 1;
	} else if (cmp_n == 0 && DB_UNDO(op)) {
		/* Undo the already applied change. */
		dbt.data = argp->olditem.data;
		dbt.size = argp->olditem.size;
		/*
		 * Invert is_plus to reflect sign of
		 * olditem.size - newitem.size.
		 */
		is_plus = !is_plus;
		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
		LSN(pagep) = argp->pagelsn;
		modified = 1;
	}

	if (modified) {
		/* Rewrite the item in place with the chosen contents. */
		__ham_onpage_replace(file_dbp, pagep,
		    argp->ndx, argp->off, change, is_plus, &dbt);
		if (argp->makedup) {
			/* Flip the entry's type to match the direction. */
			hk = P_ENTRY(file_dbp, pagep, argp->ndx);
			if (DB_REDO(op))
				HPAGE_PTYPE(hk) = H_DUPLICATE;
			else
				HPAGE_PTYPE(hk) = H_KEYDATA;
		}
	}

	if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
		goto out;
	pagep = NULL;

	/* Success: hand back the previous LSN from the record. */
done:	*lsnp = argp->prev_lsn;
	ret = 0;

	/* Release the page if an error path left it pinned. */
out:	if (pagep != NULL)
		(void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
	REC_CLOSE;
}
407
/*
 * __ham_splitdata_recover --
 *	Recovery function for the page-image records written by bucket
 *	splits (SPLITOLD, SPLITNEW) and page sorts (SORTPAGE).  Redo either
 *	copies the logged image onto the page (SPLITNEW) or re-sorts the
 *	page (SORTPAGE); undo either restores the logged image (SPLITOLD,
 *	SORTPAGE) or reinitializes the page as an empty hash page.
 *
 * PUBLIC: int __ham_splitdata_recover
 * PUBLIC:    __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
 */
int
__ham_splitdata_recover(env, dbtp, lsnp, op, info)
	ENV *env;
	DBT *dbtp;
	DB_LSN *lsnp;
	db_recops op;
	void *info;
{
	__ham_splitdata_args *argp;
	DB_THREAD_INFO *ip;
	DB *file_dbp;
	DBC *dbc;
	DB_MPOOLFILE *mpf;
	PAGE *pagep;
	int cmp_n, cmp_p, ret;

	ip = ((DB_TXNHEAD *)info)->thread_info;
	/* Keep pagep NULL until a page is pinned; "out" tests it. */
	pagep = NULL;
	REC_PRINT(__ham_splitdata_print);
	REC_INTRO(__ham_splitdata_read, ip, 1);

	if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) {
		if (DB_UNDO(op)) {
			/* Undo: a missing page means there is nothing to do. */
			if (ret == DB_PAGE_NOTFOUND)
				goto done;
			else {
				ret = __db_pgerr(file_dbp, argp->pgno, ret);
				goto out;
			}
		}
		/* If the page is not here then it was later truncated. */
		if (!IS_ZERO_LSN(argp->pagelsn))
			goto done;
		/*
		 * This page was created by a group allocation and
		 * the file may not have been extended yet.
		 * Create the page if necessary.
		 */
		if ((ret = __memp_fget(mpf, &argp->pgno,
		    ip, NULL, DB_MPOOL_CREATE, &pagep)) != 0) {
			ret = __db_pgerr(file_dbp, argp->pgno, ret);
			goto out;
		}
	}

	cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
	cmp_p = LOG_COMPARE(&LSN(pagep), &argp->pagelsn);
	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->pagelsn);

	/*
	 * There are three types of log messages here. Two are related
	 * to an actual page split operation, one for the old page
	 * and one for the new pages created.  The original image in the
	 * SPLITOLD record is used for undo.  The image in the SPLITNEW
	 * is used for redo.  We should never have a case where there is
	 * a redo operation and the SPLITOLD record is on disk, but not
	 * the SPLITNEW record.  Therefore, we only have work to do when
	 * redo NEW messages and undo OLD messages, but we have to update
	 * LSNs in both cases.
	 *
	 * The third message is generated when a page is sorted (SORTPAGE). In
	 * an undo the original image in the SORTPAGE is used. In a redo we
	 * recreate the sort operation by calling __ham_sort_page.
	 */
	if (cmp_p == 0 && DB_REDO(op)) {
		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
		if (argp->opcode == SPLITNEW)
			/* Need to redo the split described. */
			memcpy(pagep, argp->pageimage.data,
			    argp->pageimage.size);
		else if (argp->opcode == SORTPAGE) {
			if ((ret = __ham_sort_page(dbc, NULL, pagep)) != 0)
				goto out;
		}
		/* The LSN moves forward for every opcode, SPLITOLD included. */
		LSN(pagep) = *lsnp;
	} else if (cmp_n == 0 && DB_UNDO(op)) {
		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
		if (argp->opcode == SPLITOLD || argp->opcode == SORTPAGE) {
			/* Put back the old image. */
			memcpy(pagep, argp->pageimage.data,
			    argp->pageimage.size);
		} else
			/* Any other opcode: reset to an empty hash page. */
			P_INIT(pagep, file_dbp->pgsize, argp->pgno,
			    PGNO_INVALID, PGNO_INVALID, 0, P_HASH);
		LSN(pagep) = argp->pagelsn;
	}
	if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
		goto out;
	pagep = NULL;

	/* Success: hand back the previous LSN from the record. */
done:	*lsnp = argp->prev_lsn;
	ret = 0;

	/* Release the page if an error path left it pinned. */
out:	if (pagep != NULL)
		(void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
	REC_CLOSE;
}
511
/*
 * __ham_copypage_recover --
 *	Recovery function for copypage.
 *
 *	Three pages are handled in sequence: the bucket page (pgno), the
 *	page that followed it (next_pgno) and the page after that
 *	(nnext_pgno).  On redo the logged page image is copied onto the
 *	bucket page and nnext's prev pointer is set to pgno; on undo the
 *	bucket page is reinitialized, the image is copied back onto the
 *	next page and nnext's prev pointer is set back to next_pgno.
 *
 * PUBLIC: int __ham_copypage_recover
 * PUBLIC:   __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
 */
int
__ham_copypage_recover(env, dbtp, lsnp, op, info)
	ENV *env;
	DBT *dbtp;
	DB_LSN *lsnp;
	db_recops op;
	void *info;
{
	__ham_copypage_args *argp;
	DB_THREAD_INFO *ip;
	DB *file_dbp;
	DBC *dbc;
	DB_MPOOLFILE *mpf;
	PAGE *pagep;
	int cmp_n, cmp_p, ret;

	ip = ((DB_TXNHEAD *)info)->thread_info;
	/* Keep pagep NULL until a page is pinned; "out" tests it. */
	pagep = NULL;
	REC_PRINT(__ham_copypage_print);
	REC_INTRO(__ham_copypage_read, ip, 0);

	/* This is the bucket page. */
	REC_FGET(mpf, ip, argp->pgno, &pagep, donext);

	cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
	cmp_p = LOG_COMPARE(&LSN(pagep), &argp->pagelsn);
	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->pagelsn);

	if (cmp_p == 0 && DB_REDO(op)) {
		/* Need to redo update described. */
		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
		memcpy(pagep, argp->page.data, argp->page.size);
		/* Fix up the header fields the copied image got wrong. */
		PGNO(pagep) = argp->pgno;
		PREV_PGNO(pagep) = PGNO_INVALID;
		LSN(pagep) = *lsnp;
	} else if (cmp_n == 0 && DB_UNDO(op)) {
		/* Need to undo update described. */
		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
		P_INIT(pagep, file_dbp->pgsize, argp->pgno, PGNO_INVALID,
		    argp->next_pgno, 0, P_HASH);
		LSN(pagep) = argp->pagelsn;
	}
	if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
		goto out;
	pagep = NULL;

donext:	/* Now fix up the "next" page. */
	REC_FGET(mpf, ip, argp->next_pgno, &pagep, do_nn);

	/* For REDO just update the LSN. For UNDO copy page back. */
	cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
	cmp_p = LOG_COMPARE(&LSN(pagep), &argp->nextlsn);
	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->nextlsn);
	if (cmp_p == 0 && DB_REDO(op)) {
		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
		LSN(pagep) = *lsnp;
	} else if (cmp_n == 0 && DB_UNDO(op)) {
		/* Need to undo update described. */
		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
		/*
		 * The whole page is overwritten from the logged image; no
		 * separate LSN assignment is made here.
		 */
		memcpy(pagep, argp->page.data, argp->page.size);
	}
	if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
		goto out;
	pagep = NULL;

	/* Now fix up the next's next page. */
do_nn:	if (argp->nnext_pgno == PGNO_INVALID)
		goto done;

	REC_FGET(mpf, ip, argp->nnext_pgno, &pagep, done);

	cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
	cmp_p = LOG_COMPARE(&LSN(pagep), &argp->nnextlsn);
	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->nnextlsn);

	if (cmp_p == 0 && DB_REDO(op)) {
		/* Need to redo update described. */
		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
		PREV_PGNO(pagep) = argp->pgno;
		LSN(pagep) = *lsnp;
	} else if (cmp_n == 0 && DB_UNDO(op)) {
		/* Need to undo update described. */
		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
		PREV_PGNO(pagep) = argp->next_pgno;
		LSN(pagep) = argp->nnextlsn;
	}
	if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
		goto out;
	pagep = NULL;

	/* Success: hand back the previous LSN from the record. */
done:	*lsnp = argp->prev_lsn;
	ret = 0;

	/* Release the page if an error path left it pinned. */
out:	if (pagep != NULL)
		(void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
	REC_CLOSE;
}
616
617/*
618 * __ham_metagroup_recover --
619 *	Recovery function for metagroup.
620 *
621 * PUBLIC: int __ham_metagroup_recover
622 * PUBLIC:   __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
623 */
624int
625__ham_metagroup_recover(env, dbtp, lsnp, op, info)
626	ENV *env;
627	DBT *dbtp;
628	DB_LSN *lsnp;
629	db_recops op;
630	void *info;
631{
632	__ham_metagroup_args *argp;
633	DB_THREAD_INFO *ip;
634	HASH_CURSOR *hcp;
635	DB *file_dbp;
636	DBMETA *mmeta;
637	DBC *dbc;
638	DB_MPOOLFILE *mpf;
639	PAGE *pagep;
640	db_pgno_t pgno;
641	int cmp_n, cmp_p, did_alloc, groupgrow, ret;
642
643	ip = ((DB_TXNHEAD *)info)->thread_info;
644	mmeta = NULL;
645	did_alloc = 0;
646	REC_PRINT(__ham_metagroup_print);
647	REC_INTRO(__ham_metagroup_read, ip, 1);
648
649	/*
650	 * This logs the virtual create of pages pgno to pgno + bucket.
651	 * The log record contains:
652	 * bucket: old maximum bucket
653	 * pgno: page number of the new bucket.
654	 * We round up on log calculations, so we can figure out if we are
655	 * about to double the hash table if argp->bucket+1 is a power of 2.
656	 * If it is, then we are allocating an entire doubling of pages,
657	 * otherwise, we are simply allocated one new page.
658	 */
659	groupgrow =
660	    (u_int32_t)(1 << __db_log2(argp->bucket + 1)) == argp->bucket + 1;
661	pgno = argp->pgno;
662	if (argp->newalloc)
663		pgno += argp->bucket;
664
665	pagep = NULL;
666	ret = __memp_fget(mpf, &pgno, ip, NULL, 0, &pagep);
667
668	/* If we are undoing, then we don't want to create the page. */
669	if (ret != 0 && DB_REDO(op))
670		ret = __memp_fget(mpf,
671		    &pgno, ip, NULL, DB_MPOOL_CREATE, &pagep);
672	else if (ret == DB_PAGE_NOTFOUND)
673		goto do_meta;
674	if (ret != 0) {
675		if (ret != ENOSPC)
676			goto out;
677		pgno = 0;
678		goto do_meta;
679	}
680
681	/*
682	 * When we get here then either we did not grow the file
683	 * (groupgrow == 0) or we did grow the file and the allocation
684	 * of those new pages succeeded.
685	 */
686	did_alloc = groupgrow;
687
688	cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
689	cmp_p = LOG_COMPARE(&LSN(pagep), &argp->pagelsn);
690	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->pagelsn);
691
692	if (cmp_p == 0 && DB_REDO(op)) {
693		REC_DIRTY(mpf, ip, dbc->priority, &pagep);
694		pagep->lsn = *lsnp;
695	} else if (cmp_n == 0 && DB_UNDO(op)) {
696		/* If this record allocated the pages give them back. */
697		if (argp->newalloc) {
698			if (pagep != NULL && (ret = __memp_fput(mpf,
699			    ip, pagep, DB_PRIORITY_VERY_LOW)) != 0)
700				goto out;
701			pagep = NULL;
702			if ((ret =
703			    __memp_ftruncate(mpf, ip, argp->pgno, 0)) != 0)
704				goto out;
705		} else {
706			/*
707			 * Otherwise just roll the page back to its
708			 * previous state.
709			 */
710			REC_DIRTY(mpf, ip, dbc->priority, &pagep);
711			pagep->lsn = argp->pagelsn;
712		}
713	}
714	if (pagep != NULL &&
715	    (ret = __memp_fput(mpf, ip, pagep, dbc->priority)) != 0)
716		goto out;
717
718do_meta:
719	/* Now we have to update the meta-data page. */
720	hcp = (HASH_CURSOR *)dbc->internal;
721	if ((ret = __ham_get_meta(dbc)) != 0)
722		goto out;
723	cmp_n = LOG_COMPARE(lsnp, &hcp->hdr->dbmeta.lsn);
724	cmp_p = LOG_COMPARE(&hcp->hdr->dbmeta.lsn, &argp->metalsn);
725	CHECK_LSN(env, op, cmp_p, &hcp->hdr->dbmeta.lsn, &argp->metalsn);
726	if (cmp_p == 0 && DB_REDO(op)) {
727		/* Redo the actual updating of bucket counts. */
728		REC_DIRTY(mpf, ip, dbc->priority, &hcp->hdr);
729		++hcp->hdr->max_bucket;
730		if (groupgrow) {
731			hcp->hdr->low_mask = hcp->hdr->high_mask;
732			hcp->hdr->high_mask =
733			    (argp->bucket + 1) | hcp->hdr->low_mask;
734		}
735		hcp->hdr->dbmeta.lsn = *lsnp;
736	} else if (cmp_n == 0 && DB_UNDO(op)) {
737		/* Undo the actual updating of bucket counts. */
738		REC_DIRTY(mpf, ip, dbc->priority, &hcp->hdr);
739		hcp->hdr->max_bucket = argp->bucket;
740		if (groupgrow) {
741			hcp->hdr->high_mask = argp->bucket;
742			hcp->hdr->low_mask = hcp->hdr->high_mask >> 1;
743		}
744		hcp->hdr->dbmeta.lsn = argp->metalsn;
745	}
746
747	/*
748	 * Now we need to fix up the spares array.  Each entry in the
749	 * spares array indicates the beginning page number for the
750	 * indicated doubling.  We need to fill this in whenever the
751	 * spares array is invalid, if we never reclaim pages then
752	 * we have to allocate the pages to the spares array in both
753	 * the redo and undo cases.
754	 */
755	if (did_alloc && !DB_UNDO(op) &&
756	    hcp->hdr->spares[__db_log2(argp->bucket + 1) + 1] == PGNO_INVALID) {
757		REC_DIRTY(mpf, ip, dbc->priority, &hcp->hdr);
758		hcp->hdr->spares[__db_log2(argp->bucket + 1) + 1] =
759		    (argp->pgno - argp->bucket) - 1;
760	}
761	if (cmp_n == 0 && groupgrow && DB_UNDO(op)) {
762		REC_DIRTY(mpf, ip, dbc->priority, &hcp->hdr);
763		hcp->hdr->spares[
764		    __db_log2(argp->bucket + 1) + 1] = PGNO_INVALID;
765	}
766
767	/*
768	 * Finally, we need to potentially fix up the last_pgno field
769	 * in the master meta-data page (which may or may not be the
770	 * same as the hash header page).
771	 */
772	if (argp->mmpgno != argp->mpgno) {
773		if ((ret = __memp_fget(mpf,
774		    &argp->mmpgno, ip,  NULL, DB_MPOOL_EDIT, &mmeta)) != 0) {
775			if (DB_UNDO(op) && ret == DB_PAGE_NOTFOUND)
776				ret = 0;
777			goto out;
778		}
779		cmp_n = LOG_COMPARE(lsnp, &mmeta->lsn);
780		cmp_p = LOG_COMPARE(&mmeta->lsn, &argp->mmetalsn);
781		if (cmp_p == 0 && DB_REDO(op)) {
782			REC_DIRTY(mpf, ip, dbc->priority, &mmeta);
783			mmeta->lsn = *lsnp;
784		} else if (cmp_n == 0 && DB_UNDO(op)) {
785			REC_DIRTY(mpf, ip, dbc->priority, &mmeta);
786			mmeta->lsn = argp->mmetalsn;
787		}
788	} else {
789		mmeta = (DBMETA *)hcp->hdr;
790		REC_DIRTY(mpf, ip, dbc->priority, &mmeta);
791	}
792
793	if (cmp_n == 0 && DB_UNDO(op))
794		mmeta->last_pgno = argp->last_pgno;
795	else if (DB_REDO(op) && mmeta->last_pgno < pgno)
796		mmeta->last_pgno = pgno;
797
798	if (argp->mmpgno != argp->mpgno &&
799	    (ret = __memp_fput(mpf, ip, mmeta, dbc->priority)) != 0)
800		goto out;
801	mmeta = NULL;
802
803done:	*lsnp = argp->prev_lsn;
804	ret = 0;
805
806out:	if (mmeta != NULL)
807		(void)__memp_fput(mpf, ip, mmeta, dbc->priority);
808	if (dbc != NULL)
809		(void)__ham_release_meta(dbc);
810
811	REC_CLOSE;
812}
813
/*
 * __ham_groupalloc_recover --
 *	Recover the batch creation of a set of pages for a new database.
 *
 *	Works against the meta-data page at PGNO_BASE_MD.  Redo makes sure
 *	the last page of the allocation exists (via __ham_alloc_pages) and
 *	moves the meta-data LSN forward; undo truncates the file back to
 *	start_pgno when the last page still carries this record's LSN and
 *	restores the meta-data page's last_pgno and LSN.
 *
 * PUBLIC: int __ham_groupalloc_recover
 * PUBLIC:   __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
 */
int
__ham_groupalloc_recover(env, dbtp, lsnp, op, info)
	ENV *env;
	DBT *dbtp;
	DB_LSN *lsnp;
	db_recops op;
	void *info;
{
	__ham_groupalloc_args *argp;
	DB_THREAD_INFO *ip;
	DBMETA *mmeta;
	DB_MPOOLFILE *mpf;
	DB *file_dbp;
	DBC *dbc;
	PAGE *pagep;
	db_pgno_t pgno;
	int cmp_n, cmp_p, ret;

	ip = ((DB_TXNHEAD *)info)->thread_info;
	/* Keep mmeta NULL until the page is pinned; "out" tests it. */
	mmeta = NULL;
	REC_PRINT(__ham_groupalloc_print);
	REC_INTRO(__ham_groupalloc_read, ip, 1);

	pgno = PGNO_BASE_MD;
	if ((ret = __memp_fget(mpf, &pgno, ip, NULL, 0, &mmeta)) != 0) {
		/* Redo needs the meta-data page; undo just gives up. */
		if (DB_REDO(op)) {
			ret = __db_pgerr(file_dbp, pgno, ret);
			goto out;
		} else
			goto done;
	}

	cmp_n = LOG_COMPARE(lsnp, &LSN(mmeta));
	cmp_p = LOG_COMPARE(&LSN(mmeta), &argp->meta_lsn);
	CHECK_LSN(env, op, cmp_p, &LSN(mmeta), &argp->meta_lsn);

	/*
	 * Basically, we used mpool to allocate a chunk of pages.
	 * We need to either add those to a free list (in the undo
	 * case) or initialize them (in the redo case).
	 *
	 * If we are redoing and this is a hash subdatabase, it's possible
	 * that the pages were never allocated, so we'd better check for
	 * that and handle it here.
	 */
	/* pgno now names the last page of the allocated group. */
	pgno = argp->start_pgno + argp->num - 1;
	if (DB_REDO(op)) {
		if ((ret = __ham_alloc_pages(dbc, argp, lsnp)) != 0)
			goto out;
		if (cmp_p == 0) {
			REC_DIRTY(mpf, ip, file_dbp->priority, &mmeta);
			LSN(mmeta) = *lsnp;
		}
	} else if (DB_UNDO(op)) {
		/*
		 * Fetch the last page and determine if it is in
		 * the post allocation state.
		 */
		pagep = NULL;
		if ((ret = __memp_fget(mpf, &pgno,
		     ip,  NULL, DB_MPOOL_EDIT, &pagep)) == 0) {
			/* A different LSN: the page is not ours to remove. */
			if (LOG_COMPARE(&pagep->lsn, lsnp) != 0) {
				if ((ret = __memp_fput(mpf, ip,
				    pagep, DB_PRIORITY_VERY_LOW)) != 0)
					goto out;
				pagep = NULL;
			}
		} else if (ret != DB_PAGE_NOTFOUND)
			goto out;
		/*
		 * If the last page was allocated then truncate back
		 * to the first page.
		 */
		if (pagep != NULL) {
			if ((ret = __memp_fput(mpf, ip,
			    pagep, DB_PRIORITY_VERY_LOW)) != 0)
				goto out;
			if ((ret = __memp_ftruncate(mpf,
			     ip, argp->start_pgno, 0)) != 0)
				goto out;
		}

		/*
		 * If we are rolling back the metapage, then make
		 * sure it reflects the correct last_pgno.
		 */
		if (cmp_n == 0) {
			REC_DIRTY(mpf, ip, file_dbp->priority, &mmeta);
			mmeta->last_pgno = argp->last_pgno;
		}
		/* With pgno at 0 the last_pgno test below never fires. */
		pgno = 0;
		if (cmp_n == 0) {
			REC_DIRTY(mpf, ip, file_dbp->priority, &mmeta);
			LSN(mmeta) = argp->meta_lsn;
		}
	}

	/*
	 * Set the last page number to the current value.
	 */
	if (pgno > mmeta->last_pgno) {
		REC_DIRTY(mpf, ip, file_dbp->priority, &mmeta);
		mmeta->last_pgno = pgno;
	}

	/*
	 * *lsnp is only advanced when ret is 0 on arrival; ret is then
	 * forced to 0 in every case.
	 */
done:	if (ret == 0)
		*lsnp = argp->prev_lsn;
	ret = 0;

	/* Release the meta-data page if it is pinned. */
out:	if (mmeta != NULL)
		(void)__memp_fput(mpf, ip, mmeta, file_dbp->priority);

	REC_CLOSE;
}
935
936/*
937 * __ham_alloc_pages --
938 *
939 * Called during redo of a file create.  We create new pages in the file
940 * using the MPOOL_NEW_GROUP flag.  We then log the meta-data page with a
941 * __crdel_metasub message.  If we manage to crash without the newly written
942 * pages getting to disk (I'm not sure this can happen anywhere except our
943 * test suite?!), then we need to go through a recreate the final pages.
944 * Hash normally has holes in its files and handles them appropriately.
945 */
946static int
947__ham_alloc_pages(dbc, argp, lsnp)
948	DBC *dbc;
949	__ham_groupalloc_args *argp;
950	DB_LSN *lsnp;
951{
952	DB *file_dbp;
953	DB_MPOOLFILE *mpf;
954	DB_THREAD_INFO *ip;
955	PAGE *pagep;
956	db_pgno_t pgno;
957	int ret;
958
959	file_dbp = dbc->dbp;
960	mpf = file_dbp->mpf;
961	ip = dbc->thread_info;
962
963	/* Read the last page of the allocation. */
964	pgno = argp->start_pgno + argp->num - 1;
965
966	/* If the page exists, and it has been initialized, then we're done. */
967	if ((ret =
968	    __memp_fget(mpf, &pgno, ip, NULL, 0, &pagep)) == 0) {
969		if (NUM_ENT(pagep) == 0 && IS_ZERO_LSN(pagep->lsn))
970			goto reinit_page;
971		return (__memp_fput(mpf, ip, pagep, dbc->priority));
972	}
973
974	/* Had to create the page. */
975	if ((ret = __memp_fget(mpf, &pgno,
976	    ip, NULL, DB_MPOOL_CREATE, &pagep)) != 0)
977		return (__db_pgerr(dbc->dbp, pgno, ret));
978
979reinit_page:
980	/* Initialize the newly allocated page. */
981	REC_DIRTY(mpf, ip, dbc->priority, &pagep);
982	P_INIT(pagep, dbc->dbp->pgsize,
983	    pgno, PGNO_INVALID, PGNO_INVALID, 0, P_HASH);
984	pagep->lsn = *lsnp;
985
986out:	return (__memp_fput(mpf, ip, pagep, dbc->priority));
987}
988
989/*
990 * __ham_curadj_recover --
991 *	Undo cursor adjustments if a subtransaction fails.
992 *
993 * PUBLIC: int __ham_curadj_recover
994 * PUBLIC:   __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
995 */
996int
997__ham_curadj_recover(env, dbtp, lsnp, op, info)
998	ENV *env;
999	DBT *dbtp;
1000	DB_LSN *lsnp;
1001	db_recops op;
1002	void *info;
1003{
1004	__ham_curadj_args *argp;
1005	db_ham_curadj mode, hamc_mode;
1006	DB_THREAD_INFO *ip;
1007	DB_MPOOLFILE *mpf;
1008	DB *file_dbp;
1009	DBC *dbc;
1010	HASH_CURSOR *hcp;
1011	int ret;
1012
1013	ip = ((DB_TXNHEAD *)info)->thread_info;
1014	REC_PRINT(__ham_curadj_print);
1015	REC_INTRO(__ham_curadj_read, ip, 1);
1016
1017	if (op != DB_TXN_ABORT)
1018		goto done;
1019
1020	mode = (db_ham_curadj)argp->add;
1021
1022	/*
1023	 * Reverse the logged operation, so that the consequences are reversed
1024	 * by the __hamc_update code.
1025	 */
1026	switch (mode) {
1027	case DB_HAM_CURADJ_DEL:
1028		hamc_mode = DB_HAM_CURADJ_ADD;
1029		break;
1030	case DB_HAM_CURADJ_ADD:
1031		hamc_mode = DB_HAM_CURADJ_DEL;
1032		break;
1033	case DB_HAM_CURADJ_ADDMOD:
1034		hamc_mode = DB_HAM_CURADJ_DELMOD;
1035		break;
1036	case DB_HAM_CURADJ_DELMOD:
1037		hamc_mode = DB_HAM_CURADJ_ADDMOD;
1038		break;
1039	default:
1040		__db_errx(env,
1041		    "Invalid flag in __ham_curadj_recover");
1042		ret = EINVAL;
1043		goto out;
1044	}
1045
1046	/*
1047	 * Undo the adjustment by reinitializing the the cursor to look like
1048	 * the one that was used to do the adjustment, then we invert the
1049	 * add so that undo the adjustment.
1050	 */
1051	hcp = (HASH_CURSOR *)dbc->internal;
1052	hcp->pgno = argp->pgno;
1053	hcp->indx = argp->indx;
1054	hcp->dup_off = argp->dup_off;
1055	hcp->order = argp->order;
1056	if (mode == DB_HAM_CURADJ_DEL)
1057		F_SET(hcp, H_DELETED);
1058	(void)__hamc_update(dbc, argp->len, hamc_mode, argp->is_dup);
1059
1060done:	*lsnp = argp->prev_lsn;
1061out:	REC_CLOSE;
1062}
1063
1064/*
1065 * __ham_chgpg_recover --
1066 *	Undo cursor adjustments if a subtransaction fails.
1067 *
1068 * PUBLIC: int __ham_chgpg_recover
1069 * PUBLIC:   __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
1070 */
1071int
1072__ham_chgpg_recover(env, dbtp, lsnp, op, info)
1073	ENV *env;
1074	DBT *dbtp;
1075	DB_LSN *lsnp;
1076	db_recops op;
1077	void *info;
1078{
1079	__ham_chgpg_args *argp;
1080	DB_THREAD_INFO *ip;
1081	BTREE_CURSOR *opdcp;
1082	DB_MPOOLFILE *mpf;
1083	DB *file_dbp, *ldbp;
1084	DBC *dbc;
1085	DBC *cp;
1086	HASH_CURSOR *lcp;
1087	u_int32_t order, indx;
1088	int ret;
1089
1090	ip = ((DB_TXNHEAD *)info)->thread_info;
1091	REC_PRINT(__ham_chgpg_print);
1092	REC_INTRO(__ham_chgpg_read, ip, 0);
1093
1094	if (op != DB_TXN_ABORT)
1095		goto done;
1096
1097	/* Overloaded fields for DB_HAM_DEL*PG */
1098	indx = argp->old_indx;
1099	order = argp->new_indx;
1100
1101	MUTEX_LOCK(env, env->mtx_dblist);
1102	FIND_FIRST_DB_MATCH(env, file_dbp, ldbp);
1103	for (;
1104	    ldbp != NULL && ldbp->adj_fileid == file_dbp->adj_fileid;
1105	    ldbp = TAILQ_NEXT(ldbp, dblistlinks)) {
1106		MUTEX_LOCK(env, file_dbp->mutex);
1107		TAILQ_FOREACH(cp, &ldbp->active_queue, links) {
1108			lcp = (HASH_CURSOR *)cp->internal;
1109
1110			switch (argp->mode) {
1111			case DB_HAM_DELFIRSTPG:
1112				if (lcp->pgno != argp->new_pgno ||
1113				    MVCC_SKIP_CURADJ(cp, lcp->pgno))
1114					break;
1115				if (lcp->indx != indx ||
1116				    !F_ISSET(lcp, H_DELETED) ||
1117				    lcp->order >= order) {
1118					lcp->pgno = argp->old_pgno;
1119					if (lcp->indx == indx)
1120						lcp->order -= order;
1121				}
1122				break;
1123			case DB_HAM_DELMIDPG:
1124			case DB_HAM_DELLASTPG:
1125				if (lcp->pgno == argp->new_pgno &&
1126				    lcp->indx == indx &&
1127				    F_ISSET(lcp, H_DELETED) &&
1128				    lcp->order >= order &&
1129				    !MVCC_SKIP_CURADJ(cp, lcp->pgno)) {
1130					lcp->pgno = argp->old_pgno;
1131					lcp->order -= order;
1132					lcp->indx = 0;
1133				}
1134				break;
1135			case DB_HAM_CHGPG:
1136				/*
1137				 * If we're doing a CHGPG, we're undoing
1138				 * the move of a non-deleted item to a
1139				 * new page.  Any cursors with the deleted
1140				 * flag set do not belong to this item;
1141				 * don't touch them.
1142				 */
1143				if (F_ISSET(lcp, H_DELETED))
1144					break;
1145				/* FALLTHROUGH */
1146			case DB_HAM_SPLIT:
1147				if (lcp->pgno == argp->new_pgno &&
1148				    lcp->indx == argp->new_indx &&
1149				    !MVCC_SKIP_CURADJ(cp, lcp->pgno)) {
1150					lcp->indx = argp->old_indx;
1151					lcp->pgno = argp->old_pgno;
1152				}
1153				break;
1154			case DB_HAM_DUP:
1155				if (lcp->opd == NULL)
1156					break;
1157				opdcp = (BTREE_CURSOR *)lcp->opd->internal;
1158				if (opdcp->pgno != argp->new_pgno ||
1159				    opdcp->indx != argp->new_indx ||
1160				    MVCC_SKIP_CURADJ(lcp->opd, opdcp->pgno))
1161					break;
1162
1163				if (F_ISSET(opdcp, C_DELETED))
1164					F_SET(lcp, H_DELETED);
1165				/*
1166				 * We can't close a cursor while we have the
1167				 * dbp mutex locked, since c_close reacquires
1168				 * it.  It should be safe to drop the mutex
1169				 * here, though, since newly opened cursors
1170				 * are put only at the end of the tailq and
1171				 * the cursor we're adjusting can't be closed
1172				 * under us.
1173				 */
1174				MUTEX_UNLOCK(env, file_dbp->mutex);
1175				if ((ret = __dbc_close(lcp->opd)) != 0)
1176					goto out;
1177				MUTEX_LOCK(env, file_dbp->mutex);
1178				lcp->opd = NULL;
1179				break;
1180			}
1181		}
1182		MUTEX_UNLOCK(env, file_dbp->mutex);
1183	}
1184	MUTEX_UNLOCK(env, env->mtx_dblist);
1185
1186done:	*lsnp = argp->prev_lsn;
1187out:	REC_CLOSE;
1188}
1189
/*
 * __ham_metagroup_42_recover --
 *	Recovery function for the 4.2-format metagroup log record.
 *
 *	Works in three steps: (1) fetch, creating if necessary, the page
 *	named by the record and move its LSN forward (redo) or back (undo);
 *	(2) apply or revert the bucket-count and mask changes on the hash
 *	meta-data page and fill in the spares entry if it is still invalid;
 *	(3) make sure last_pgno on the master meta-data page is at least the
 *	page number touched in step (1).
 *
 *	On the normal path the function stores argp->prev_lsn in *lsnp and
 *	returns 0; on a failure it returns the error in ret.
 *
 * PUBLIC: int __ham_metagroup_42_recover
 * PUBLIC:   __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
 */
int
__ham_metagroup_42_recover(env, dbtp, lsnp, op, info)
	ENV *env;
	DBT *dbtp;
	DB_LSN *lsnp;
	db_recops op;
	void *info;
{
	__ham_metagroup_42_args *argp;
	DB_THREAD_INFO *ip;
	HASH_CURSOR *hcp;
	DB *file_dbp;
	DBMETA *mmeta;
	DBC *dbc;
	DB_MPOOLFILE *mpf;
	PAGE *pagep;
	db_pgno_t pgno;
	u_int32_t flags;
	int cmp_n, cmp_p, did_alloc, groupgrow, ret;

	ip = ((DB_TXNHEAD *)info)->thread_info;
	/* Set before REC_INTRO so the cleanup code at "out" sees sane values. */
	mmeta = NULL;
	did_alloc = 0;
	REC_PRINT(__ham_metagroup_42_print);
	REC_INTRO(__ham_metagroup_42_read, ip, 1);

	/*
	 * This logs the virtual create of pages pgno to pgno + bucket
	 * If HAVE_FTRUNCATE is not supported the mpool page-allocation is not
	 * transaction protected, we can never undo it.  Even in an abort,
	 * we have to allocate these pages to the hash table if they
	 * were actually created.  In particular, during disaster
	 * recovery the metapage may be before this point if we
	 * are rolling backward.  If the file has not been extended
	 * then the metapage could not have been updated.
	 * The log record contains:
	 * bucket: old maximum bucket
	 * pgno: page number of the new bucket.
	 * We round up on log calculations, so we can figure out if we are
	 * about to double the hash table if argp->bucket+1 is a power of 2.
	 * If it is, then we are allocating an entire doubling of pages,
	 * otherwise, we are simply allocating one new page.
	 */
	groupgrow =
	    (u_int32_t)(1 << __db_log2(argp->bucket + 1)) == argp->bucket + 1;
	/* For a new allocation, look at the page argp->bucket past argp->pgno. */
	pgno = argp->pgno;
	if (argp->newalloc)
		pgno += argp->bucket;

	flags = 0;
	pagep = NULL;
	LF_SET(DB_MPOOL_CREATE);
	ret = __memp_fget(mpf, &pgno, ip,  NULL, flags, &pagep);

	if (ret != 0) {
		/*
		 * Only ENOSPC is tolerated: skip the page update, clear pgno
		 * so the last_pgno fix-up below cannot raise last_pgno, and
		 * go on to the meta-data page.
		 */
		if (ret != ENOSPC)
			goto out;
		pgno = 0;
		goto do_meta;
	}

	/*
	 * When we get here then either we did not grow the file
	 * (groupgrow == 0) or we did grow the file and the allocation
	 * of those new pages succeeded.
	 */
	did_alloc = groupgrow;

	/*
	 * cmp_n == 0: the page carries this record's LSN (candidate for undo).
	 * cmp_p == 0: the page carries the pre-update LSN (candidate for redo).
	 */
	cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
	cmp_p = LOG_COMPARE(&LSN(pagep), &argp->pagelsn);
	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->pagelsn);

	if (cmp_p == 0 && DB_REDO(op)) {
		/* Redo: stamp the page with this record's LSN. */
		REC_DIRTY(mpf, ip, dbc->priority, &pagep);
		pagep->lsn = *lsnp;
	} else if (cmp_n == 0 && DB_UNDO(op)) {
		/*
		 * Otherwise just roll the page back to its
		 * previous state.
		 */
		REC_DIRTY(mpf, ip, dbc->priority, &pagep);
		pagep->lsn = argp->pagelsn;
	}
	if (pagep != NULL &&
	    (ret = __memp_fput(mpf, ip, pagep, dbc->priority)) != 0)
		goto out;

do_meta:
	/* Now we have to update the meta-data page. */
	hcp = (HASH_CURSOR *)dbc->internal;
	if ((ret = __ham_get_meta(dbc)) != 0)
		goto out;
	cmp_n = LOG_COMPARE(lsnp, &hcp->hdr->dbmeta.lsn);
	cmp_p = LOG_COMPARE(&hcp->hdr->dbmeta.lsn, &argp->metalsn);
	CHECK_LSN(env, op, cmp_p, &hcp->hdr->dbmeta.lsn, &argp->metalsn);
	if (cmp_p == 0 && DB_REDO(op)) {
		/* Redo the actual updating of bucket counts. */
		REC_DIRTY(mpf, ip, dbc->priority, &hcp->hdr);
		++hcp->hdr->max_bucket;
		if (groupgrow) {
			/* A doubling: the old high mask becomes the low mask. */
			hcp->hdr->low_mask = hcp->hdr->high_mask;
			hcp->hdr->high_mask =
			    (argp->bucket + 1) | hcp->hdr->low_mask;
		}
		hcp->hdr->dbmeta.lsn = *lsnp;
	} else if (cmp_n == 0 && DB_UNDO(op)) {
		/* Undo the actual updating of bucket counts. */
		REC_DIRTY(mpf, ip, dbc->priority, &hcp->hdr);
		hcp->hdr->max_bucket = argp->bucket;
		if (groupgrow) {
			/* Restore the masks from the logged old max bucket. */
			hcp->hdr->high_mask = argp->bucket;
			hcp->hdr->low_mask = hcp->hdr->high_mask >> 1;
		}
		hcp->hdr->dbmeta.lsn = argp->metalsn;
	}

	/*
	 * Now we need to fix up the spares array.  Each entry in the
	 * spares array indicates the beginning page number for the
	 * indicated doubling.  We need to fill this in whenever the
	 * spares array is invalid, if we never reclaim pages then
	 * we have to allocate the pages to the spares array in both
	 * the redo and undo cases.
	 */
	if (did_alloc &&
	    hcp->hdr->spares[__db_log2(argp->bucket + 1) + 1] == PGNO_INVALID) {
		REC_DIRTY(mpf, ip, dbc->priority, &hcp->hdr);
		hcp->hdr->spares[__db_log2(argp->bucket + 1) + 1] =
		    (argp->pgno - argp->bucket) - 1;
	}

	/*
	 * Finally, we need to potentially fix up the last_pgno field
	 * in the master meta-data page (which may or may not be the
	 * same as the hash header page).
	 */
	if (argp->mmpgno != argp->mpgno) {
		/* The master meta-data page is a separate page: fetch it. */
		if ((ret = __memp_fget(mpf, &argp->mmpgno, ip, NULL,
		    DB_MPOOL_EDIT, &mmeta)) != 0) {
			/* A missing page is not an error when undoing. */
			if (DB_UNDO(op) && ret == DB_PAGE_NOTFOUND)
				ret = 0;
			goto out;
		}
		cmp_n = LOG_COMPARE(lsnp, &mmeta->lsn);
		cmp_p = LOG_COMPARE(&mmeta->lsn, &argp->mmetalsn);
		if (cmp_p == 0 && DB_REDO(op)) {
			REC_DIRTY(mpf, ip, dbc->priority, &mmeta);
			mmeta->lsn = *lsnp;
		} else if (cmp_n == 0 && DB_UNDO(op)) {
			REC_DIRTY(mpf, ip, dbc->priority, &mmeta);
			mmeta->lsn = argp->mmetalsn;
		}
	} else {
		/* The hash header page is also the master meta-data page. */
		mmeta = (DBMETA *)hcp->hdr;
		REC_DIRTY(mpf, ip, dbc->priority, &mmeta);
	}

	/* last_pgno is only ever raised here, never lowered. */
	if (mmeta->last_pgno < pgno)
		mmeta->last_pgno = pgno;

	/* Only a separately fetched master page is released here. */
	if (argp->mmpgno != argp->mpgno &&
	    (ret = __memp_fput(mpf, ip, mmeta, dbc->priority)) != 0)
		goto out;
	/* Cleared so the cleanup code at "out" does not release it again. */
	mmeta = NULL;

done:	*lsnp = argp->prev_lsn;
	ret = 0;

out:	if (mmeta != NULL)
		(void)__memp_fput(mpf, ip, mmeta, dbc->priority);
	if (dbc != NULL)
		(void)__ham_release_meta(dbc);

	REC_CLOSE;
}
1372
1373/*
1374 * __ham_groupalloc_42_recover --
1375 *	Recover the batch creation of a set of pages for a new database.
1376 *
1377 * PUBLIC: int __ham_groupalloc_42_recover
1378 * PUBLIC:   __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
1379 */
1380int
1381__ham_groupalloc_42_recover(env, dbtp, lsnp, op, info)
1382	ENV *env;
1383	DBT *dbtp;
1384	DB_LSN *lsnp;
1385	db_recops op;
1386	void *info;
1387{
1388	__ham_groupalloc_42_args *argp;
1389	DB_THREAD_INFO *ip;
1390	DBMETA *mmeta;
1391	DB_MPOOLFILE *mpf;
1392	DB *file_dbp;
1393	DBC *dbc;
1394	db_pgno_t pgno;
1395	int cmp_p, ret;
1396
1397	ip = ((DB_TXNHEAD *)info)->thread_info;
1398	mmeta = NULL;
1399	REC_PRINT(__ham_groupalloc_42_print);
1400	REC_INTRO(__ham_groupalloc_42_read, ip, 1);
1401
1402	pgno = PGNO_BASE_MD;
1403	if ((ret = __memp_fget(mpf, &pgno, ip, NULL, 0, &mmeta)) != 0) {
1404		if (DB_REDO(op)) {
1405			ret = __db_pgerr(file_dbp, pgno, ret);
1406			goto out;
1407		} else
1408			goto done;
1409	}
1410
1411	cmp_p = LOG_COMPARE(&LSN(mmeta), &argp->meta_lsn);
1412	CHECK_LSN(env, op, cmp_p, &LSN(mmeta), &argp->meta_lsn);
1413
1414	/*
1415	 * Basically, we used mpool to allocate a chunk of pages.
1416	 * We need to either add those to a free list (in the undo
1417	 * case) or initialize them (in the redo case).
1418	 *
1419	 * If we are redoing and this is a hash subdatabase, it's possible
1420	 * that the pages were never allocated, so we'd better check for
1421	 * that and handle it here.
1422	 */
1423	pgno = argp->start_pgno + argp->num - 1;
1424	if (DB_REDO(op)) {
1425		if ((ret = __ham_alloc_pages_42(dbc, argp, lsnp)) != 0)
1426			goto out;
1427		if (cmp_p == 0) {
1428			REC_DIRTY(mpf, ip, dbc->priority, &mmeta);
1429			LSN(mmeta) = *lsnp;
1430		}
1431	} else if (DB_UNDO(op)) {
1432		/*
1433		 * We cannot roll back 4.2 style allocations.
1434		 */
1435		__db_errx(env,
1436"Cannot replicate prepared transactions from master running release 4.2.");
1437		ret = __env_panic(env, EINVAL);
1438		goto out;
1439	}
1440
1441	/*
1442	 * In both REDO and UNDO, we have grown the file and need to make
1443	 * sure that last_pgno is correct.  If we HAVE_FTRUNCATE pgno
1444	 * will only be valid on REDO.
1445	 */
1446	if (pgno > mmeta->last_pgno) {
1447		REC_DIRTY(mpf, ip, dbc->priority, &mmeta);
1448		mmeta->last_pgno = pgno;
1449	}
1450
1451done:	if (ret == 0)
1452		*lsnp = argp->prev_lsn;
1453	ret = 0;
1454
1455out:	if (mmeta != NULL)
1456		(void)__memp_fput(mpf, ip, mmeta, dbc->priority);
1457
1458	REC_CLOSE;
1459}
1460
1461/*
1462 * __ham_alloc_pages_42 --
1463 *
1464 * Called during redo of a file create.  We create new pages in the file
1465 * using the MPOOL_NEW_GROUP flag.  We then log the meta-data page with a
1466 * __crdel_metasub message.  If we manage to crash without the newly written
1467 * pages getting to disk (I'm not sure this can happen anywhere except our
1468 * test suite?!), then we need to go through a recreate the final pages.
1469 * Hash normally has holes in its files and handles them appropriately.
1470 */
1471static int
1472__ham_alloc_pages_42(dbc, argp, lsnp)
1473	DBC *dbc;
1474	__ham_groupalloc_42_args *argp;
1475	DB_LSN *lsnp;
1476{
1477	DB_MPOOLFILE *mpf;
1478	DB_THREAD_INFO *ip;
1479	PAGE *pagep;
1480	db_pgno_t pgno;
1481	int ret;
1482
1483	mpf = dbc->dbp->mpf;
1484	ip = dbc->thread_info;
1485
1486	/* Read the last page of the allocation. */
1487	pgno = argp->start_pgno + argp->num - 1;
1488
1489	/* If the page exists, and it has been initialized, then we're done. */
1490	if ((ret = __memp_fget(mpf,
1491	    &pgno, ip, NULL, 0, &pagep)) == 0) {
1492		if (NUM_ENT(pagep) == 0 && IS_ZERO_LSN(pagep->lsn))
1493			goto reinit_page;
1494		if ((ret = __memp_fput(mpf,
1495		    ip, pagep, dbc->priority)) != 0)
1496			return (ret);
1497		return (0);
1498	}
1499
1500	/* Had to create the page. */
1501	if ((ret = __memp_fget(mpf, &pgno, ip, NULL,
1502	    DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &pagep)) != 0)
1503		return (__db_pgerr(dbc->dbp, pgno, ret));
1504
1505reinit_page:
1506	/* Initialize the newly allocated page. */
1507	P_INIT(pagep,
1508	    dbc->dbp->pgsize, pgno, PGNO_INVALID, PGNO_INVALID, 0, P_HASH);
1509	pagep->lsn = *lsnp;
1510
1511	if ((ret = __memp_fput(mpf, ip, pagep, dbc->priority)) != 0)
1512		return (ret);
1513
1514	return (0);
1515}
1516