1/*-
2 * See the file LICENSE for redistribution information.
3 *
4 * Copyright (c) 1996-2009 Oracle.  All rights reserved.
5 */
6/*
7 * Copyright (c) 1995, 1996
8 *	Margo Seltzer.  All rights reserved.
9 */
10/*
11 * Copyright (c) 1995, 1996
12 *	The President and Fellows of Harvard University.  All rights reserved.
13 *
14 * This code is derived from software contributed to Berkeley by
15 * Margo Seltzer.
16 *
17 * Redistribution and use in source and binary forms, with or without
18 * modification, are permitted provided that the following conditions
19 * are met:
20 * 1. Redistributions of source code must retain the above copyright
21 *    notice, this list of conditions and the following disclaimer.
22 * 2. Redistributions in binary form must reproduce the above copyright
23 *    notice, this list of conditions and the following disclaimer in the
24 *    documentation and/or other materials provided with the distribution.
25 * 3. Neither the name of the University nor the names of its contributors
26 *    may be used to endorse or promote products derived from this software
27 *    without specific prior written permission.
28 *
29 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
30 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
31 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
32 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
33 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
34 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
35 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
36 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
37 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
38 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
39 * SUCH DAMAGE.
40 *
41 * $Id$
42 */
43
44#include "db_config.h"
45
46#include "db_int.h"
47#include "dbinc/db_page.h"
48#include "dbinc/btree.h"
49#include "dbinc/hash.h"
50#include "dbinc/log.h"
51#include "dbinc/mp.h"
52
53static int __ham_alloc_pages __P((DBC *, __ham_groupalloc_args *, DB_LSN *));
54static int __ham_alloc_pages_42
55    __P((DBC *, __ham_groupalloc_42_args *, DB_LSN *));
56
57/*
58 * __ham_insdel_recover --
59 *
60 * PUBLIC: int __ham_insdel_recover
61 * PUBLIC:     __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
62 */
63int
64__ham_insdel_recover(env, dbtp, lsnp, op, info)
65	ENV *env;
66	DBT *dbtp;
67	DB_LSN *lsnp;
68	db_recops op;
69	void *info;
70{
71	__ham_insdel_args *argp;
72	DB_THREAD_INFO *ip;
73	DB *file_dbp;
74	DBC *dbc;
75	DB_MPOOLFILE *mpf;
76	PAGE *pagep;
77	db_indx_t dindx;
78	u_int32_t opcode;
79	int cmp_n, cmp_p, dtype, ktype, ret;
80
81	ip = ((DB_TXNHEAD *)info)->thread_info;
82	pagep = NULL;
83	REC_PRINT(__ham_insdel_print);
84	REC_INTRO(__ham_insdel_read, ip, 1);
85
86	if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL,
87	    0, &pagep)) != 0) {
88		if (DB_UNDO(op)) {
89			if (ret == DB_PAGE_NOTFOUND)
90				goto done;
91			else {
92				ret = __db_pgerr(file_dbp, argp->pgno, ret);
93				goto out;
94			}
95		}
96		/* If the page is not here then it was later truncated. */
97		if (!IS_ZERO_LSN(argp->pagelsn))
98			goto done;
99		/*
100		 * This page was created by a group allocation and
101		 * the file may not have been extend yet.
102		 * Create the page if necessary.
103		 */
104		if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL,
105		    DB_MPOOL_CREATE, &pagep)) != 0) {
106			ret = __db_pgerr(file_dbp, argp->pgno, ret);
107			goto out;
108		}
109	}
110
111	cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
112	cmp_p = LOG_COMPARE(&LSN(pagep), &argp->pagelsn);
113	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->pagelsn);
114
115	/*
116	 * Two possible things going on:
117	 * redo a delete/undo a put: delete the item from the page.
118	 * redo a put/undo a delete: add the item to the page.
119	 * If we are undoing a delete, then the information logged is the
120	 * entire entry off the page, not just the data of a dbt.  In
121	 * this case, we want to copy it back onto the page verbatim.
122	 * We do this by calling __insertpair with the type H_OFFPAGE instead
123	 * of H_KEYDATA.
124	 */
125	opcode = OPCODE_OF(argp->opcode);
126	if ((opcode == DELPAIR && cmp_n == 0 && DB_UNDO(op)) ||
127	    (opcode == PUTPAIR && cmp_p == 0 && DB_REDO(op))) {
128		/*
129		 * Need to redo a PUT or undo a delete.
130		 */
131		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
132		ktype = DB_UNDO(op) || PAIR_ISKEYBIG(argp->opcode) ?
133		    H_OFFPAGE : H_KEYDATA;
134		if (PAIR_ISDATADUP(argp->opcode))
135			dtype = H_DUPLICATE;
136		else if (DB_UNDO(op) || PAIR_ISDATABIG(argp->opcode))
137			dtype = H_OFFPAGE;
138		else
139			dtype = H_KEYDATA;
140		dindx = (db_indx_t)argp->ndx;
141		if ((ret = __ham_insertpair(dbc, pagep, &dindx,
142		    &argp->key, &argp->data, ktype, dtype)) != 0)
143			goto out;
144		LSN(pagep) = DB_REDO(op) ? *lsnp : argp->pagelsn;
145	} else if ((opcode == DELPAIR && cmp_p == 0 && DB_REDO(op)) ||
146	    (opcode == PUTPAIR && cmp_n == 0 && DB_UNDO(op))) {
147		/* Need to undo a put or redo a delete. */
148		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
149		__ham_dpair(file_dbp, pagep, argp->ndx);
150		LSN(pagep) = DB_REDO(op) ? *lsnp : argp->pagelsn;
151	}
152
153	if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
154		goto out;
155	pagep = NULL;
156
157	/* Return the previous LSN. */
158done:	*lsnp = argp->prev_lsn;
159	ret = 0;
160
161out:	if (pagep != NULL)
162		(void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
163	REC_CLOSE;
164}
165
166/*
167 * __ham_newpage_recover --
168 *	This log message is used when we add/remove overflow pages.  This
169 *	message takes care of the pointer chains, not the data on the pages.
170 *
171 * PUBLIC: int __ham_newpage_recover
172 * PUBLIC:     __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
173 */
174int
175__ham_newpage_recover(env, dbtp, lsnp, op, info)
176	ENV *env;
177	DBT *dbtp;
178	DB_LSN *lsnp;
179	db_recops op;
180	void *info;
181{
182	__ham_newpage_args *argp;
183	DB_THREAD_INFO *ip;
184	DB *file_dbp;
185	DBC *dbc;
186	DB_MPOOLFILE *mpf;
187	PAGE *pagep;
188	int change, cmp_n, cmp_p, ret;
189
190	ip = ((DB_TXNHEAD *)info)->thread_info;
191	pagep = NULL;
192	REC_PRINT(__ham_newpage_print);
193	REC_INTRO(__ham_newpage_read, ip, 0);
194
195	REC_FGET(mpf, ip, argp->new_pgno, &pagep, ppage);
196	change = 0;
197
198	/*
199	 * There are potentially three pages we need to check: the one
200	 * that we created/deleted, the one before it and the one after
201	 * it.
202	 */
203
204	cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
205	cmp_p = LOG_COMPARE(&LSN(pagep), &argp->pagelsn);
206	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->pagelsn);
207	CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
208
209	if ((cmp_p == 0 && DB_REDO(op) && argp->opcode == PUTOVFL) ||
210	    (cmp_n == 0 && DB_UNDO(op) && argp->opcode == DELOVFL)) {
211		/* Redo a create new page or undo a delete new page. */
212		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
213		P_INIT(pagep, file_dbp->pgsize, argp->new_pgno,
214		    argp->prev_pgno, argp->next_pgno, 0, P_HASH);
215		change = 1;
216	} else if ((cmp_p == 0 && DB_REDO(op) && argp->opcode == DELOVFL) ||
217	    (cmp_n == 0 && DB_UNDO(op) && argp->opcode == PUTOVFL)) {
218		/*
219		 * Redo a delete or undo a create new page.  All we
220		 * really need to do is change the LSN.
221		 */
222		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
223		change = 1;
224	}
225
226	if (change)
227		LSN(pagep) = DB_REDO(op) ? *lsnp : argp->pagelsn;
228
229	if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
230		goto out;
231	pagep = NULL;
232
233	/* Now do the prev page. */
234ppage:	if (argp->prev_pgno != PGNO_INVALID) {
235		REC_FGET(mpf, ip, argp->prev_pgno, &pagep, npage);
236
237		cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
238		cmp_p = LOG_COMPARE(&LSN(pagep), &argp->prevlsn);
239		CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->prevlsn);
240		CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
241		change = 0;
242
243		if ((cmp_p == 0 && DB_REDO(op) && argp->opcode == PUTOVFL) ||
244		    (cmp_n == 0 && DB_UNDO(op) && argp->opcode == DELOVFL)) {
245			/* Redo a create new page or undo a delete new page. */
246			REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
247			pagep->next_pgno = argp->new_pgno;
248			change = 1;
249		} else if ((cmp_p == 0 &&
250		    DB_REDO(op) && argp->opcode == DELOVFL) ||
251		    (cmp_n == 0 && DB_UNDO(op) && argp->opcode == PUTOVFL)) {
252			/* Redo a delete or undo a create new page. */
253			REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
254			pagep->next_pgno = argp->next_pgno;
255			change = 1;
256		}
257
258		if (change)
259			LSN(pagep) = DB_REDO(op) ? *lsnp : argp->prevlsn;
260
261		if ((ret = __memp_fput(mpf,
262		    ip, pagep, file_dbp->priority)) != 0)
263			goto out;
264		pagep = NULL;
265	}
266
267	/* Now time to do the next page */
268npage:	if (argp->next_pgno != PGNO_INVALID) {
269		REC_FGET(mpf, ip, argp->next_pgno, &pagep, done);
270
271		cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
272		cmp_p = LOG_COMPARE(&LSN(pagep), &argp->nextlsn);
273		CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->nextlsn);
274		CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
275		change = 0;
276
277		if ((cmp_p == 0 && DB_REDO(op) && argp->opcode == PUTOVFL) ||
278		    (cmp_n == 0 && DB_UNDO(op) && argp->opcode == DELOVFL)) {
279			/* Redo a create new page or undo a delete new page. */
280			REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
281			pagep->prev_pgno = argp->new_pgno;
282			change = 1;
283		} else if ((cmp_p == 0 &&
284		    DB_REDO(op) && argp->opcode == DELOVFL) ||
285		    (cmp_n == 0 && DB_UNDO(op) && argp->opcode == PUTOVFL)) {
286			/* Redo a delete or undo a create new page. */
287			REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
288			pagep->prev_pgno = argp->prev_pgno;
289			change = 1;
290		}
291
292		if (change)
293			LSN(pagep) = DB_REDO(op) ? *lsnp : argp->nextlsn;
294
295		if ((ret = __memp_fput(mpf,
296		    ip, pagep, file_dbp->priority)) != 0)
297			goto out;
298		pagep = NULL;
299	}
300done:	*lsnp = argp->prev_lsn;
301	ret = 0;
302
303out:	if (pagep != NULL)
304		(void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
305	REC_CLOSE;
306}
307
308/*
309 * __ham_replace_recover --
310 *	This log message refers to partial puts that are local to a single
311 *	page.  You can think of them as special cases of the more general
312 *	insdel log message.
313 *
314 * PUBLIC: int __ham_replace_recover
315 * PUBLIC:    __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
316 */
317int
318__ham_replace_recover(env, dbtp, lsnp, op, info)
319	ENV *env;
320	DBT *dbtp;
321	DB_LSN *lsnp;
322	db_recops op;
323	void *info;
324{
325	__ham_replace_args *argp;
326	DB_THREAD_INFO *ip;
327	DB *file_dbp;
328	DBC *dbc;
329	DB_MPOOLFILE *mpf;
330	DBT dbt;
331	PAGE *pagep;
332	u_int32_t change;
333	int cmp_n, cmp_p, is_plus, modified, ret;
334	u_int8_t *hk;
335
336	ip = ((DB_TXNHEAD *)info)->thread_info;
337	pagep = NULL;
338	REC_PRINT(__ham_replace_print);
339	REC_INTRO(__ham_replace_read, ip, 0);
340
341	REC_FGET(mpf, ip, argp->pgno, &pagep, done);
342
343	cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
344	cmp_p = LOG_COMPARE(&LSN(pagep), &argp->pagelsn);
345	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->pagelsn);
346	CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
347
348	memset(&dbt, 0, sizeof(dbt));
349	modified = 0;
350
351	/*
352	 * Before we know the direction of the transformation we will
353	 * determine the size differential; then once we know if we are
354	 * redoing or undoing, we'll adjust the sign (is_plus) appropriately.
355	 */
356	if (argp->newitem.size > argp->olditem.size) {
357		change = argp->newitem.size - argp->olditem.size;
358		is_plus = 1;
359	} else {
360		change = argp->olditem.size - argp->newitem.size;
361		is_plus = 0;
362	}
363	if (cmp_p == 0 && DB_REDO(op)) {
364		/* Reapply the change as specified. */
365		dbt.data = argp->newitem.data;
366		dbt.size = argp->newitem.size;
367		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
368		LSN(pagep) = *lsnp;
369		/*
370		 * The is_plus flag is set properly to reflect
371		 * newitem.size - olditem.size.
372		 */
373		modified = 1;
374	} else if (cmp_n == 0 && DB_UNDO(op)) {
375		/* Undo the already applied change. */
376		dbt.data = argp->olditem.data;
377		dbt.size = argp->olditem.size;
378		/*
379		 * Invert is_plus to reflect sign of
380		 * olditem.size - newitem.size.
381		 */
382		is_plus = !is_plus;
383		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
384		LSN(pagep) = argp->pagelsn;
385		modified = 1;
386	}
387
388	if (modified) {
389		__ham_onpage_replace(file_dbp, pagep,
390		    argp->ndx, argp->off, change, is_plus, &dbt);
391		if (argp->makedup) {
392			hk = P_ENTRY(file_dbp, pagep, argp->ndx);
393			if (DB_REDO(op))
394				HPAGE_PTYPE(hk) = H_DUPLICATE;
395			else
396				HPAGE_PTYPE(hk) = H_KEYDATA;
397		}
398	}
399
400	if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
401		goto out;
402	pagep = NULL;
403
404done:	*lsnp = argp->prev_lsn;
405	ret = 0;
406
407out:	if (pagep != NULL)
408		(void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
409	REC_CLOSE;
410}
411
412/*
413 * __ham_splitdata_recover --
414 *
415 * PUBLIC: int __ham_splitdata_recover
416 * PUBLIC:    __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
417 */
418int
419__ham_splitdata_recover(env, dbtp, lsnp, op, info)
420	ENV *env;
421	DBT *dbtp;
422	DB_LSN *lsnp;
423	db_recops op;
424	void *info;
425{
426	__ham_splitdata_args *argp;
427	DB_THREAD_INFO *ip;
428	DB *file_dbp;
429	DBC *dbc;
430	DB_MPOOLFILE *mpf;
431	PAGE *pagep;
432	int cmp_n, cmp_p, ret;
433
434	ip = ((DB_TXNHEAD *)info)->thread_info;
435	pagep = NULL;
436	REC_PRINT(__ham_splitdata_print);
437	REC_INTRO(__ham_splitdata_read, ip, 1);
438
439	if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) {
440		if (DB_UNDO(op)) {
441			if (ret == DB_PAGE_NOTFOUND)
442				goto done;
443			else {
444				ret = __db_pgerr(file_dbp, argp->pgno, ret);
445				goto out;
446			}
447		}
448		/* If the page is not here then it was later truncated. */
449		if (!IS_ZERO_LSN(argp->pagelsn))
450			goto done;
451		/*
452		 * This page was created by a group allocation and
453		 * the file may not have been extend yet.
454		 * Create the page if necessary.
455		 */
456		if ((ret = __memp_fget(mpf, &argp->pgno,
457		    ip, NULL, DB_MPOOL_CREATE, &pagep)) != 0) {
458			ret = __db_pgerr(file_dbp, argp->pgno, ret);
459			goto out;
460		}
461	}
462
463	cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
464	cmp_p = LOG_COMPARE(&LSN(pagep), &argp->pagelsn);
465	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->pagelsn);
466	CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
467
468	/*
469	 * There are three types of log messages here. Two are related
470	 * to an actual page split operation, one for the old page
471	 * and one for the new pages created.  The original image in the
472	 * SPLITOLD record is used for undo.  The image in the SPLITNEW
473	 * is used for redo.  We should never have a case where there is
474	 * a redo operation and the SPLITOLD record is on disk, but not
475	 * the SPLITNEW record.  Therefore, we only have work to do when
476	 * redo NEW messages and undo OLD messages, but we have to update
477	 * LSNs in both cases.
478	 *
479	 * The third message is generated when a page is sorted (SORTPAGE). In
480	 * an undo the original image in the SORTPAGE is used. In a redo we
481	 * recreate the sort operation by calling __ham_sort_page.
482	 */
483	if (cmp_p == 0 && DB_REDO(op)) {
484		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
485		if (argp->opcode == SPLITNEW)
486			/* Need to redo the split described. */
487			memcpy(pagep, argp->pageimage.data,
488			    argp->pageimage.size);
489		else if (argp->opcode == SORTPAGE) {
490			if ((ret = __ham_sort_page(dbc, NULL, pagep)) != 0)
491				goto out;
492		}
493		LSN(pagep) = *lsnp;
494	} else if (cmp_n == 0 && DB_UNDO(op)) {
495		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
496		if (argp->opcode == SPLITOLD || argp->opcode == SORTPAGE) {
497			/* Put back the old image. */
498			memcpy(pagep, argp->pageimage.data,
499			    argp->pageimage.size);
500		} else
501			P_INIT(pagep, file_dbp->pgsize, argp->pgno,
502			    PGNO_INVALID, PGNO_INVALID, 0, P_HASH);
503		LSN(pagep) = argp->pagelsn;
504	}
505	if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
506		goto out;
507	pagep = NULL;
508
509done:	*lsnp = argp->prev_lsn;
510	ret = 0;
511
512out:	if (pagep != NULL)
513		(void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
514	REC_CLOSE;
515}
516
517/*
518 * __ham_copypage_recover --
519 *	Recovery function for copypage.
520 *
521 * PUBLIC: int __ham_copypage_recover
522 * PUBLIC:   __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
523 */
524int
525__ham_copypage_recover(env, dbtp, lsnp, op, info)
526	ENV *env;
527	DBT *dbtp;
528	DB_LSN *lsnp;
529	db_recops op;
530	void *info;
531{
532	__ham_copypage_args *argp;
533	DB_THREAD_INFO *ip;
534	DB *file_dbp;
535	DBC *dbc;
536	DB_MPOOLFILE *mpf;
537	PAGE *pagep;
538	int cmp_n, cmp_p, ret;
539
540	ip = ((DB_TXNHEAD *)info)->thread_info;
541	pagep = NULL;
542	REC_PRINT(__ham_copypage_print);
543	REC_INTRO(__ham_copypage_read, ip, 0);
544
545	/* This is the bucket page. */
546	REC_FGET(mpf, ip, argp->pgno, &pagep, donext);
547
548	cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
549	cmp_p = LOG_COMPARE(&LSN(pagep), &argp->pagelsn);
550	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->pagelsn);
551
552	if (cmp_p == 0 && DB_REDO(op)) {
553		/* Need to redo update described. */
554		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
555		memcpy(pagep, argp->page.data, argp->page.size);
556		PGNO(pagep) = argp->pgno;
557		PREV_PGNO(pagep) = PGNO_INVALID;
558		LSN(pagep) = *lsnp;
559	} else if (cmp_n == 0 && DB_UNDO(op)) {
560		/* Need to undo update described. */
561		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
562		P_INIT(pagep, file_dbp->pgsize, argp->pgno, PGNO_INVALID,
563		    argp->next_pgno, 0, P_HASH);
564		LSN(pagep) = argp->pagelsn;
565	}
566	if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
567		goto out;
568	pagep = NULL;
569
570donext:	/* Now fix up the "next" page. */
571	REC_FGET(mpf, ip, argp->next_pgno, &pagep, do_nn);
572
573	/* For REDO just update the LSN. For UNDO copy page back. */
574	cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
575	cmp_p = LOG_COMPARE(&LSN(pagep), &argp->nextlsn);
576	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->nextlsn);
577	CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
578	if (cmp_p == 0 && DB_REDO(op)) {
579		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
580		LSN(pagep) = *lsnp;
581	} else if (cmp_n == 0 && DB_UNDO(op)) {
582		/* Need to undo update described. */
583		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
584		memcpy(pagep, argp->page.data, argp->page.size);
585	}
586	if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
587		goto out;
588	pagep = NULL;
589
590	/* Now fix up the next's next page. */
591do_nn:	if (argp->nnext_pgno == PGNO_INVALID)
592		goto done;
593
594	REC_FGET(mpf, ip, argp->nnext_pgno, &pagep, done);
595
596	cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
597	cmp_p = LOG_COMPARE(&LSN(pagep), &argp->nnextlsn);
598	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->nnextlsn);
599	CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
600
601	if (cmp_p == 0 && DB_REDO(op)) {
602		/* Need to redo update described. */
603		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
604		PREV_PGNO(pagep) = argp->pgno;
605		LSN(pagep) = *lsnp;
606	} else if (cmp_n == 0 && DB_UNDO(op)) {
607		/* Need to undo update described. */
608		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
609		PREV_PGNO(pagep) = argp->next_pgno;
610		LSN(pagep) = argp->nnextlsn;
611	}
612	if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
613		goto out;
614	pagep = NULL;
615
616done:	*lsnp = argp->prev_lsn;
617	ret = 0;
618
619out:	if (pagep != NULL)
620		(void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
621	REC_CLOSE;
622}
623
624/*
625 * __ham_metagroup_recover --
626 *	Recovery function for metagroup.
627 *
628 * PUBLIC: int __ham_metagroup_recover
629 * PUBLIC:   __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
630 */
631int
632__ham_metagroup_recover(env, dbtp, lsnp, op, info)
633	ENV *env;
634	DBT *dbtp;
635	DB_LSN *lsnp;
636	db_recops op;
637	void *info;
638{
639	__ham_metagroup_args *argp;
640	DB_THREAD_INFO *ip;
641	HASH_CURSOR *hcp;
642	DB *file_dbp;
643	DBMETA *mmeta;
644	DBC *dbc;
645	DB_MPOOLFILE *mpf;
646	PAGE *pagep;
647	db_pgno_t pgno;
648	int cmp_n, cmp_p, did_alloc, groupgrow, ret;
649
650	ip = ((DB_TXNHEAD *)info)->thread_info;
651	mmeta = NULL;
652	did_alloc = 0;
653	REC_PRINT(__ham_metagroup_print);
654	REC_INTRO(__ham_metagroup_read, ip, 1);
655
656	/*
657	 * This logs the virtual create of pages pgno to pgno + bucket.
658	 * The log record contains:
659	 * bucket: old maximum bucket
660	 * pgno: page number of the new bucket.
661	 * We round up on log calculations, so we can figure out if we are
662	 * about to double the hash table if argp->bucket+1 is a power of 2.
663	 * If it is, then we are allocating an entire doubling of pages,
664	 * otherwise, we are simply allocated one new page.
665	 */
666	groupgrow =
667	    (u_int32_t)(1 << __db_log2(argp->bucket + 1)) == argp->bucket + 1;
668	pgno = argp->pgno;
669	if (argp->newalloc)
670		pgno += argp->bucket;
671
672	pagep = NULL;
673	ret = __memp_fget(mpf, &pgno, ip, NULL, 0, &pagep);
674
675	/* If we are undoing, then we don't want to create the page. */
676	if (ret != 0 && DB_REDO(op))
677		ret = __memp_fget(mpf,
678		    &pgno, ip, NULL, DB_MPOOL_CREATE, &pagep);
679	else if (ret == DB_PAGE_NOTFOUND)
680		goto do_meta;
681	if (ret != 0) {
682		if (ret != ENOSPC)
683			goto out;
684		pgno = 0;
685		goto do_meta;
686	}
687
688	/*
689	 * When we get here then either we did not grow the file
690	 * (groupgrow == 0) or we did grow the file and the allocation
691	 * of those new pages succeeded.
692	 */
693	did_alloc = groupgrow;
694
695	cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
696	cmp_p = LOG_COMPARE(&LSN(pagep), &argp->pagelsn);
697	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->pagelsn);
698
699	if (cmp_p == 0 && DB_REDO(op)) {
700		REC_DIRTY(mpf, ip, dbc->priority, &pagep);
701		pagep->lsn = *lsnp;
702	} else if (cmp_n == 0 && DB_UNDO(op)) {
703		/* If this record allocated the pages give them back. */
704		if (argp->newalloc) {
705			if (pagep != NULL && (ret = __memp_fput(mpf,
706			    ip, pagep, DB_PRIORITY_VERY_LOW)) != 0)
707				goto out;
708			pagep = NULL;
709			if ((ret = __memp_ftruncate(mpf, NULL, ip,
710			    argp->pgno, 0)) != 0)
711				goto out;
712		} else {
713			/*
714			 * Otherwise just roll the page back to its
715			 * previous state.
716			 */
717			REC_DIRTY(mpf, ip, dbc->priority, &pagep);
718			pagep->lsn = argp->pagelsn;
719		}
720	}
721	if (pagep != NULL &&
722	    (ret = __memp_fput(mpf, ip, pagep, dbc->priority)) != 0)
723		goto out;
724
725	/*
726	 * If a earlier aborted allocation used one of our pages it may
727	 * be in the wrong state, read all the pages in the group and init
728	 * them to be empty.
729	 */
730	if (DB_REDO(op) && argp->newalloc) {
731		for (pgno = argp->pgno;
732		    pgno < argp->pgno + argp->bucket; pgno++) {
733			if ((ret = __memp_fget(mpf,
734			    &pgno, ip, NULL, DB_MPOOL_CREATE, &pagep)) != 0)
735				goto out;
736
737			if (IS_ZERO_LSN(LSN(pagep)))
738				P_INIT(pagep, file_dbp->pgsize,
739				    PGNO_INVALID, PGNO_INVALID, PGNO_INVALID,
740				    0, P_HASH);
741			if ((ret =
742			    __memp_fput(mpf, ip, pagep, dbc->priority)) != 0)
743				goto out;
744		}
745	}
746
747do_meta:
748	/* Now we have to update the meta-data page. */
749	hcp = (HASH_CURSOR *)dbc->internal;
750	if ((ret = __ham_get_meta(dbc)) != 0)
751		goto out;
752	cmp_n = LOG_COMPARE(lsnp, &hcp->hdr->dbmeta.lsn);
753	cmp_p = LOG_COMPARE(&hcp->hdr->dbmeta.lsn, &argp->metalsn);
754	CHECK_LSN(env, op, cmp_p, &hcp->hdr->dbmeta.lsn, &argp->metalsn);
755	CHECK_ABORT(env, op, cmp_n, &hcp->hdr->dbmeta.lsn, lsnp);
756	if (cmp_p == 0 && DB_REDO(op)) {
757		/* Redo the actual updating of bucket counts. */
758		REC_DIRTY(mpf, ip, dbc->priority, &hcp->hdr);
759		++hcp->hdr->max_bucket;
760		if (groupgrow) {
761			hcp->hdr->low_mask = hcp->hdr->high_mask;
762			hcp->hdr->high_mask =
763			    (argp->bucket + 1) | hcp->hdr->low_mask;
764		}
765		hcp->hdr->dbmeta.lsn = *lsnp;
766	} else if (cmp_n == 0 && DB_UNDO(op)) {
767		/* Undo the actual updating of bucket counts. */
768		REC_DIRTY(mpf, ip, dbc->priority, &hcp->hdr);
769		hcp->hdr->max_bucket = argp->bucket;
770		if (groupgrow) {
771			hcp->hdr->high_mask = argp->bucket;
772			hcp->hdr->low_mask = hcp->hdr->high_mask >> 1;
773		}
774		hcp->hdr->dbmeta.lsn = argp->metalsn;
775	}
776
777	/*
778	 * Now we need to fix up the spares array.  Each entry in the
779	 * spares array indicates the beginning page number for the
780	 * indicated doubling.  We need to fill this in whenever the
781	 * spares array is invalid, if we never reclaim pages then
782	 * we have to allocate the pages to the spares array in both
783	 * the redo and undo cases.
784	 */
785	if (did_alloc && !DB_UNDO(op) &&
786	    hcp->hdr->spares[__db_log2(argp->bucket + 1) + 1] == PGNO_INVALID) {
787		REC_DIRTY(mpf, ip, dbc->priority, &hcp->hdr);
788		hcp->hdr->spares[__db_log2(argp->bucket + 1) + 1] =
789		    (argp->pgno - argp->bucket) - 1;
790	}
791	if (cmp_n == 0 && groupgrow && DB_UNDO(op)) {
792		REC_DIRTY(mpf, ip, dbc->priority, &hcp->hdr);
793		hcp->hdr->spares[
794		    __db_log2(argp->bucket + 1) + 1] = PGNO_INVALID;
795	}
796
797	/*
798	 * Finally, we need to potentially fix up the last_pgno field
799	 * in the master meta-data page (which may or may not be the
800	 * same as the hash header page).
801	 */
802	if (argp->mmpgno != argp->mpgno) {
803		if ((ret = __memp_fget(mpf,
804		    &argp->mmpgno, ip,  NULL, DB_MPOOL_EDIT, &mmeta)) != 0) {
805			if (DB_UNDO(op) && ret == DB_PAGE_NOTFOUND)
806				ret = 0;
807			goto out;
808		}
809		cmp_n = LOG_COMPARE(lsnp, &mmeta->lsn);
810		cmp_p = LOG_COMPARE(&mmeta->lsn, &argp->mmetalsn);
811		if (cmp_p == 0 && DB_REDO(op)) {
812			REC_DIRTY(mpf, ip, dbc->priority, &mmeta);
813			mmeta->lsn = *lsnp;
814		} else if (cmp_n == 0 && DB_UNDO(op)) {
815			REC_DIRTY(mpf, ip, dbc->priority, &mmeta);
816			mmeta->lsn = argp->mmetalsn;
817		}
818	} else {
819		mmeta = (DBMETA *)hcp->hdr;
820		REC_DIRTY(mpf, ip, dbc->priority, &mmeta);
821	}
822
823	if (cmp_n == 0 && DB_UNDO(op))
824		mmeta->last_pgno = argp->last_pgno;
825	else if (DB_REDO(op) && mmeta->last_pgno < pgno)
826		mmeta->last_pgno = pgno;
827
828	if (argp->mmpgno != argp->mpgno &&
829	    (ret = __memp_fput(mpf, ip, mmeta, dbc->priority)) != 0)
830		goto out;
831	mmeta = NULL;
832
833done:	*lsnp = argp->prev_lsn;
834	ret = 0;
835
836out:	if (mmeta != NULL)
837		(void)__memp_fput(mpf, ip, mmeta, dbc->priority);
838	if (dbc != NULL)
839		(void)__ham_release_meta(dbc);
840
841	REC_CLOSE;
842}
843
844/*
845 * __ham_groupalloc_recover --
846 *	Recover the batch creation of a set of pages for a new database.
847 *
848 * PUBLIC: int __ham_groupalloc_recover
849 * PUBLIC:   __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
850 */
851int
852__ham_groupalloc_recover(env, dbtp, lsnp, op, info)
853	ENV *env;
854	DBT *dbtp;
855	DB_LSN *lsnp;
856	db_recops op;
857	void *info;
858{
859	__ham_groupalloc_args *argp;
860	DB_THREAD_INFO *ip;
861	DBMETA *mmeta;
862	DB_MPOOLFILE *mpf;
863	DB *file_dbp;
864	DBC *dbc;
865	PAGE *pagep;
866	db_pgno_t pgno;
867	int cmp_n, cmp_p, ret;
868
869	ip = ((DB_TXNHEAD *)info)->thread_info;
870	mmeta = NULL;
871	REC_PRINT(__ham_groupalloc_print);
872	REC_INTRO(__ham_groupalloc_read, ip, 1);
873
874	pgno = PGNO_BASE_MD;
875	if ((ret = __memp_fget(mpf, &pgno, ip, NULL, 0, &mmeta)) != 0) {
876		if (DB_REDO(op)) {
877			ret = __db_pgerr(file_dbp, pgno, ret);
878			goto out;
879		} else
880			goto done;
881	}
882
883	cmp_n = LOG_COMPARE(lsnp, &LSN(mmeta));
884	cmp_p = LOG_COMPARE(&LSN(mmeta), &argp->meta_lsn);
885	CHECK_LSN(env, op, cmp_p, &LSN(mmeta), &argp->meta_lsn);
886	CHECK_ABORT(env, op, cmp_n, &LSN(mmeta), lsnp);
887
888	/*
889	 * Basically, we used mpool to allocate a chunk of pages.
890	 * We need to either add those to a free list (in the undo
891	 * case) or initialize them (in the redo case).
892	 *
893	 * If we are redoing and this is a hash subdatabase, it's possible
894	 * that the pages were never allocated, so we'd better check for
895	 * that and handle it here.
896	 */
897	pgno = argp->start_pgno + argp->num - 1;
898	if (DB_REDO(op)) {
899		if ((ret = __ham_alloc_pages(dbc, argp, lsnp)) != 0)
900			goto out;
901		if (cmp_p == 0) {
902			REC_DIRTY(mpf, ip, file_dbp->priority, &mmeta);
903			LSN(mmeta) = *lsnp;
904		}
905	} else if (DB_UNDO(op)) {
906		/*
907		 * Fetch the last page and determine if it is in
908		 * the post allocation state.
909		 */
910		pagep = NULL;
911		if ((ret = __memp_fget(mpf, &pgno,
912		     ip,  NULL, DB_MPOOL_EDIT, &pagep)) == 0) {
913			if (LOG_COMPARE(&pagep->lsn, lsnp) != 0) {
914				if ((ret = __memp_fput(mpf, ip,
915				    pagep, DB_PRIORITY_VERY_LOW)) != 0)
916					goto out;
917				pagep = NULL;
918			}
919		} else if (ret != DB_PAGE_NOTFOUND)
920			goto out;
921		/*
922		 * If the last page was allocated then truncate back
923		 * to the first page.
924		 */
925		if (pagep != NULL) {
926			if ((ret = __memp_fput(mpf, ip,
927			    pagep, DB_PRIORITY_VERY_LOW)) != 0)
928				goto out;
929			if ((ret = __memp_ftruncate(mpf, NULL,
930			     ip, argp->start_pgno, 0)) != 0)
931				goto out;
932		}
933
934		/*
935		 * If we are rolling back the metapage, then make
936		 * sure it reflects the the correct last_pgno.
937		 */
938		if (cmp_n == 0) {
939			REC_DIRTY(mpf, ip, file_dbp->priority, &mmeta);
940			mmeta->last_pgno = argp->last_pgno;
941		}
942		pgno = 0;
943		if (cmp_n == 0) {
944			REC_DIRTY(mpf, ip, file_dbp->priority, &mmeta);
945			LSN(mmeta) = argp->meta_lsn;
946		}
947	}
948
949	/*
950	 * Set the last page number to the current value.
951	 */
952	if (pgno > mmeta->last_pgno) {
953		REC_DIRTY(mpf, ip, file_dbp->priority, &mmeta);
954		mmeta->last_pgno = pgno;
955	}
956
957done:	if (ret == 0)
958		*lsnp = argp->prev_lsn;
959	ret = 0;
960
961out:	if (mmeta != NULL)
962		(void)__memp_fput(mpf, ip, mmeta, file_dbp->priority);
963
964	REC_CLOSE;
965}
966
967/*
968 * __ham_alloc_pages --
969 *
970 * Called during redo of a file create.  We create new pages in the file
971 * using the MPOOL_NEW_GROUP flag.  We then log the meta-data page with a
972 * __crdel_metasub message.  If we manage to crash without the newly written
973 * pages getting to disk (I'm not sure this can happen anywhere except our
974 * test suite?!), then we need to go through a recreate the final pages.
975 * Hash normally has holes in its files and handles them appropriately.
976 */
977static int
978__ham_alloc_pages(dbc, argp, lsnp)
979	DBC *dbc;
980	__ham_groupalloc_args *argp;
981	DB_LSN *lsnp;
982{
983	DB *file_dbp;
984	DB_MPOOLFILE *mpf;
985	DB_THREAD_INFO *ip;
986	PAGE *pagep;
987	db_pgno_t pgno;
988	int ret;
989
990	file_dbp = dbc->dbp;
991	mpf = file_dbp->mpf;
992	ip = dbc->thread_info;
993
994	/* Read the last page of the allocation. */
995	pgno = argp->start_pgno + argp->num - 1;
996
997	/* If the page exists, and it has been initialized, then we're done. */
998	if ((ret =
999	    __memp_fget(mpf, &pgno, ip, NULL, 0, &pagep)) == 0) {
1000		if (NUM_ENT(pagep) == 0 && IS_ZERO_LSN(pagep->lsn))
1001			goto reinit_page;
1002		return (__memp_fput(mpf, ip, pagep, dbc->priority));
1003	}
1004
1005	/* Had to create the page. */
1006	if ((ret = __memp_fget(mpf, &pgno,
1007	    ip, NULL, DB_MPOOL_CREATE, &pagep)) != 0)
1008		return (__db_pgerr(dbc->dbp, pgno, ret));
1009
1010reinit_page:
1011	/* Initialize the newly allocated page. */
1012	REC_DIRTY(mpf, ip, dbc->priority, &pagep);
1013	P_INIT(pagep, dbc->dbp->pgsize,
1014	    pgno, PGNO_INVALID, PGNO_INVALID, 0, P_HASH);
1015	pagep->lsn = *lsnp;
1016
1017out:	return (__memp_fput(mpf, ip, pagep, dbc->priority));
1018}
1019
1020/*
1021 * __ham_curadj_recover --
1022 *	Undo cursor adjustments if a subtransaction fails.
1023 *
1024 * PUBLIC: int __ham_curadj_recover
1025 * PUBLIC:   __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
1026 */
1027int
1028__ham_curadj_recover(env, dbtp, lsnp, op, info)
1029	ENV *env;
1030	DBT *dbtp;
1031	DB_LSN *lsnp;
1032	db_recops op;
1033	void *info;
1034{
1035	__ham_curadj_args *argp;
1036	db_ham_curadj mode, hamc_mode;
1037	DB_THREAD_INFO *ip;
1038	DB_MPOOLFILE *mpf;
1039	DB *file_dbp;
1040	DBC *dbc;
1041	HASH_CURSOR *hcp;
1042	int ret;
1043
1044	ip = ((DB_TXNHEAD *)info)->thread_info;
1045	REC_PRINT(__ham_curadj_print);
1046	REC_INTRO(__ham_curadj_read, ip, 1);
1047
1048	if (op != DB_TXN_ABORT)
1049		goto done;
1050
1051	mode = (db_ham_curadj)argp->add;
1052
1053	/*
1054	 * Reverse the logged operation, so that the consequences are reversed
1055	 * by the __hamc_update code.
1056	 */
1057	switch (mode) {
1058	case DB_HAM_CURADJ_DEL:
1059		hamc_mode = DB_HAM_CURADJ_ADD;
1060		break;
1061	case DB_HAM_CURADJ_ADD:
1062		hamc_mode = DB_HAM_CURADJ_DEL;
1063		break;
1064	case DB_HAM_CURADJ_ADDMOD:
1065		hamc_mode = DB_HAM_CURADJ_DELMOD;
1066		break;
1067	case DB_HAM_CURADJ_DELMOD:
1068		hamc_mode = DB_HAM_CURADJ_ADDMOD;
1069		break;
1070	default:
1071		__db_errx(env,
1072		    "Invalid flag in __ham_curadj_recover");
1073		ret = EINVAL;
1074		goto out;
1075	}
1076
1077	/*
1078	 * Undo the adjustment by reinitializing the the cursor to look like
1079	 * the one that was used to do the adjustment, then we invert the
1080	 * add so that undo the adjustment.
1081	 */
1082	hcp = (HASH_CURSOR *)dbc->internal;
1083	hcp->pgno = argp->pgno;
1084	hcp->indx = argp->indx;
1085	hcp->dup_off = argp->dup_off;
1086	hcp->order = argp->order;
1087	if (mode == DB_HAM_CURADJ_DEL)
1088		F_SET(hcp, H_DELETED);
1089	(void)__hamc_update(dbc, argp->len, hamc_mode, argp->is_dup);
1090
1091done:	*lsnp = argp->prev_lsn;
1092out:	REC_CLOSE;
1093}
1094
1095/*
1096 * __ham_chgpg_recover --
1097 *	Undo cursor adjustments if a subtransaction fails.
1098 *
1099 * PUBLIC: int __ham_chgpg_recover
1100 * PUBLIC:   __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
1101 */
1102int
1103__ham_chgpg_recover(env, dbtp, lsnp, op, info)
1104	ENV *env;
1105	DBT *dbtp;
1106	DB_LSN *lsnp;
1107	db_recops op;
1108	void *info;
1109{
1110	__ham_chgpg_args *argp;
1111	DB_THREAD_INFO *ip;
1112	BTREE_CURSOR *opdcp;
1113	DB_MPOOLFILE *mpf;
1114	DB *file_dbp, *ldbp;
1115	DBC *dbc;
1116	DBC *cp;
1117	HASH_CURSOR *lcp;
1118	u_int32_t order, indx;
1119	int ret;
1120
1121	ip = ((DB_TXNHEAD *)info)->thread_info;
1122	REC_PRINT(__ham_chgpg_print);
1123	REC_INTRO(__ham_chgpg_read, ip, 0);
1124
1125	if (op != DB_TXN_ABORT)
1126		goto done;
1127
1128	/* Overloaded fields for DB_HAM_DEL*PG */
1129	indx = argp->old_indx;
1130	order = argp->new_indx;
1131
1132	MUTEX_LOCK(env, env->mtx_dblist);
1133	FIND_FIRST_DB_MATCH(env, file_dbp, ldbp);
1134	for (;
1135	    ldbp != NULL && ldbp->adj_fileid == file_dbp->adj_fileid;
1136	    ldbp = TAILQ_NEXT(ldbp, dblistlinks)) {
1137		MUTEX_LOCK(env, file_dbp->mutex);
1138		TAILQ_FOREACH(cp, &ldbp->active_queue, links) {
1139			lcp = (HASH_CURSOR *)cp->internal;
1140
1141			switch (argp->mode) {
1142			case DB_HAM_DELFIRSTPG:
1143				if (lcp->pgno != argp->new_pgno ||
1144				    MVCC_SKIP_CURADJ(cp, lcp->pgno))
1145					break;
1146				if (lcp->indx != indx ||
1147				    !F_ISSET(lcp, H_DELETED) ||
1148				    lcp->order >= order) {
1149					lcp->pgno = argp->old_pgno;
1150					if (lcp->indx == indx)
1151						lcp->order -= order;
1152				}
1153				break;
1154			case DB_HAM_DELMIDPG:
1155			case DB_HAM_DELLASTPG:
1156				if (lcp->pgno == argp->new_pgno &&
1157				    lcp->indx == indx &&
1158				    F_ISSET(lcp, H_DELETED) &&
1159				    lcp->order >= order &&
1160				    !MVCC_SKIP_CURADJ(cp, lcp->pgno)) {
1161					lcp->pgno = argp->old_pgno;
1162					lcp->order -= order;
1163					lcp->indx = 0;
1164				}
1165				break;
1166			case DB_HAM_CHGPG:
1167				/*
1168				 * If we're doing a CHGPG, we're undoing
1169				 * the move of a non-deleted item to a
1170				 * new page.  Any cursors with the deleted
1171				 * flag set do not belong to this item;
1172				 * don't touch them.
1173				 */
1174				if (F_ISSET(lcp, H_DELETED))
1175					break;
1176				/* FALLTHROUGH */
1177			case DB_HAM_SPLIT:
1178				if (lcp->pgno == argp->new_pgno &&
1179				    lcp->indx == argp->new_indx &&
1180				    !MVCC_SKIP_CURADJ(cp, lcp->pgno)) {
1181					lcp->indx = argp->old_indx;
1182					lcp->pgno = argp->old_pgno;
1183				}
1184				break;
1185			case DB_HAM_DUP:
1186				if (lcp->opd == NULL)
1187					break;
1188				opdcp = (BTREE_CURSOR *)lcp->opd->internal;
1189				if (opdcp->pgno != argp->new_pgno ||
1190				    opdcp->indx != argp->new_indx ||
1191				    MVCC_SKIP_CURADJ(lcp->opd, opdcp->pgno))
1192					break;
1193
1194				if (F_ISSET(opdcp, C_DELETED))
1195					F_SET(lcp, H_DELETED);
1196				/*
1197				 * We can't close a cursor while we have the
1198				 * dbp mutex locked, since c_close reacquires
1199				 * it.  It should be safe to drop the mutex
1200				 * here, though, since newly opened cursors
1201				 * are put only at the end of the tailq and
1202				 * the cursor we're adjusting can't be closed
1203				 * under us.
1204				 */
1205				MUTEX_UNLOCK(env, file_dbp->mutex);
1206				if ((ret = __dbc_close(lcp->opd)) != 0)
1207					goto out;
1208				MUTEX_LOCK(env, file_dbp->mutex);
1209				lcp->opd = NULL;
1210				break;
1211			}
1212		}
1213		MUTEX_UNLOCK(env, file_dbp->mutex);
1214	}
1215	MUTEX_UNLOCK(env, env->mtx_dblist);
1216
1217done:	*lsnp = argp->prev_lsn;
1218out:	REC_CLOSE;
1219}
1220
1221/*
1222 * __ham_metagroup_recover --
1223 *	Recovery function for metagroup.
1224 *
1225 * PUBLIC: int __ham_metagroup_42_recover
1226 * PUBLIC:   __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
1227 */
1228int
1229__ham_metagroup_42_recover(env, dbtp, lsnp, op, info)
1230	ENV *env;
1231	DBT *dbtp;
1232	DB_LSN *lsnp;
1233	db_recops op;
1234	void *info;
1235{
1236	__ham_metagroup_42_args *argp;
1237	DB_THREAD_INFO *ip;
1238	HASH_CURSOR *hcp;
1239	DB *file_dbp;
1240	DBMETA *mmeta;
1241	DBC *dbc;
1242	DB_MPOOLFILE *mpf;
1243	PAGE *pagep;
1244	db_pgno_t pgno;
1245	u_int32_t flags;
1246	int cmp_n, cmp_p, did_alloc, groupgrow, ret;
1247
1248	ip = ((DB_TXNHEAD *)info)->thread_info;
1249	mmeta = NULL;
1250	did_alloc = 0;
1251	REC_PRINT(__ham_metagroup_42_print);
1252	REC_INTRO(__ham_metagroup_42_read, ip, 1);
1253
1254	/*
1255	 * This logs the virtual create of pages pgno to pgno + bucket
1256	 * If HAVE_FTRUNCATE is not supported the mpool page-allocation is not
1257	 * transaction protected, we can never undo it.  Even in an abort,
1258	 * we have to allocate these pages to the hash table if they
1259	 * were actually created.  In particular, during disaster
1260	 * recovery the metapage may be before this point if we
1261	 * are rolling backward.  If the file has not been extended
1262	 * then the metapage could not have been updated.
1263	 * The log record contains:
1264	 * bucket: old maximum bucket
1265	 * pgno: page number of the new bucket.
1266	 * We round up on log calculations, so we can figure out if we are
1267	 * about to double the hash table if argp->bucket+1 is a power of 2.
1268	 * If it is, then we are allocating an entire doubling of pages,
1269	 * otherwise, we are simply allocated one new page.
1270	 */
1271	groupgrow =
1272	    (u_int32_t)(1 << __db_log2(argp->bucket + 1)) == argp->bucket + 1;
1273	pgno = argp->pgno;
1274	if (argp->newalloc)
1275		pgno += argp->bucket;
1276
1277	flags = 0;
1278	pagep = NULL;
1279	LF_SET(DB_MPOOL_CREATE);
1280	ret = __memp_fget(mpf, &pgno, ip,  NULL, flags, &pagep);
1281
1282	if (ret != 0) {
1283		if (ret != ENOSPC)
1284			goto out;
1285		pgno = 0;
1286		goto do_meta;
1287	}
1288
1289	/*
1290	 * When we get here then either we did not grow the file
1291	 * (groupgrow == 0) or we did grow the file and the allocation
1292	 * of those new pages succeeded.
1293	 */
1294	did_alloc = groupgrow;
1295
1296	cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
1297	cmp_p = LOG_COMPARE(&LSN(pagep), &argp->pagelsn);
1298	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->pagelsn);
1299
1300	if (cmp_p == 0 && DB_REDO(op)) {
1301		REC_DIRTY(mpf, ip, dbc->priority, &pagep);
1302		pagep->lsn = *lsnp;
1303	} else if (cmp_n == 0 && DB_UNDO(op)) {
1304		/*
1305		 * Otherwise just roll the page back to its
1306		 * previous state.
1307		 */
1308		REC_DIRTY(mpf, ip, dbc->priority, &pagep);
1309		pagep->lsn = argp->pagelsn;
1310	}
1311	if (pagep != NULL &&
1312	    (ret = __memp_fput(mpf, ip, pagep, dbc->priority)) != 0)
1313		goto out;
1314
1315do_meta:
1316	/* Now we have to update the meta-data page. */
1317	hcp = (HASH_CURSOR *)dbc->internal;
1318	if ((ret = __ham_get_meta(dbc)) != 0)
1319		goto out;
1320	cmp_n = LOG_COMPARE(lsnp, &hcp->hdr->dbmeta.lsn);
1321	cmp_p = LOG_COMPARE(&hcp->hdr->dbmeta.lsn, &argp->metalsn);
1322	CHECK_LSN(env, op, cmp_p, &hcp->hdr->dbmeta.lsn, &argp->metalsn);
1323	if (cmp_p == 0 && DB_REDO(op)) {
1324		/* Redo the actual updating of bucket counts. */
1325		REC_DIRTY(mpf, ip, dbc->priority, &hcp->hdr);
1326		++hcp->hdr->max_bucket;
1327		if (groupgrow) {
1328			hcp->hdr->low_mask = hcp->hdr->high_mask;
1329			hcp->hdr->high_mask =
1330			    (argp->bucket + 1) | hcp->hdr->low_mask;
1331		}
1332		hcp->hdr->dbmeta.lsn = *lsnp;
1333	} else if (cmp_n == 0 && DB_UNDO(op)) {
1334		/* Undo the actual updating of bucket counts. */
1335		REC_DIRTY(mpf, ip, dbc->priority, &hcp->hdr);
1336		hcp->hdr->max_bucket = argp->bucket;
1337		if (groupgrow) {
1338			hcp->hdr->high_mask = argp->bucket;
1339			hcp->hdr->low_mask = hcp->hdr->high_mask >> 1;
1340		}
1341		hcp->hdr->dbmeta.lsn = argp->metalsn;
1342	}
1343
1344	/*
1345	 * Now we need to fix up the spares array.  Each entry in the
1346	 * spares array indicates the beginning page number for the
1347	 * indicated doubling.  We need to fill this in whenever the
1348	 * spares array is invalid, if we never reclaim pages then
1349	 * we have to allocate the pages to the spares array in both
1350	 * the redo and undo cases.
1351	 */
1352	if (did_alloc &&
1353	    hcp->hdr->spares[__db_log2(argp->bucket + 1) + 1] == PGNO_INVALID) {
1354		REC_DIRTY(mpf, ip, dbc->priority, &hcp->hdr);
1355		hcp->hdr->spares[__db_log2(argp->bucket + 1) + 1] =
1356		    (argp->pgno - argp->bucket) - 1;
1357	}
1358
1359	/*
1360	 * Finally, we need to potentially fix up the last_pgno field
1361	 * in the master meta-data page (which may or may not be the
1362	 * same as the hash header page).
1363	 */
1364	if (argp->mmpgno != argp->mpgno) {
1365		if ((ret = __memp_fget(mpf, &argp->mmpgno, ip, NULL,
1366		    DB_MPOOL_EDIT, &mmeta)) != 0) {
1367			if (DB_UNDO(op) && ret == DB_PAGE_NOTFOUND)
1368				ret = 0;
1369			goto out;
1370		}
1371		cmp_n = LOG_COMPARE(lsnp, &mmeta->lsn);
1372		cmp_p = LOG_COMPARE(&mmeta->lsn, &argp->mmetalsn);
1373		if (cmp_p == 0 && DB_REDO(op)) {
1374			REC_DIRTY(mpf, ip, dbc->priority, &mmeta);
1375			mmeta->lsn = *lsnp;
1376		} else if (cmp_n == 0 && DB_UNDO(op)) {
1377			REC_DIRTY(mpf, ip, dbc->priority, &mmeta);
1378			mmeta->lsn = argp->mmetalsn;
1379		}
1380	} else {
1381		mmeta = (DBMETA *)hcp->hdr;
1382		REC_DIRTY(mpf, ip, dbc->priority, &mmeta);
1383	}
1384
1385	if (mmeta->last_pgno < pgno)
1386		mmeta->last_pgno = pgno;
1387
1388	if (argp->mmpgno != argp->mpgno &&
1389	    (ret = __memp_fput(mpf, ip, mmeta, dbc->priority)) != 0)
1390		goto out;
1391	mmeta = NULL;
1392
1393done:	*lsnp = argp->prev_lsn;
1394	ret = 0;
1395
1396out:	if (mmeta != NULL)
1397		(void)__memp_fput(mpf, ip, mmeta, dbc->priority);
1398	if (dbc != NULL)
1399		(void)__ham_release_meta(dbc);
1400
1401	REC_CLOSE;
1402}
1403
1404/*
1405 * __ham_groupalloc_42_recover --
1406 *	Recover the batch creation of a set of pages for a new database.
1407 *
1408 * PUBLIC: int __ham_groupalloc_42_recover
1409 * PUBLIC:   __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
1410 */
1411int
1412__ham_groupalloc_42_recover(env, dbtp, lsnp, op, info)
1413	ENV *env;
1414	DBT *dbtp;
1415	DB_LSN *lsnp;
1416	db_recops op;
1417	void *info;
1418{
1419	__ham_groupalloc_42_args *argp;
1420	DB_THREAD_INFO *ip;
1421	DBMETA *mmeta;
1422	DB_MPOOLFILE *mpf;
1423	DB *file_dbp;
1424	DBC *dbc;
1425	db_pgno_t pgno;
1426	int cmp_p, ret;
1427
1428	ip = ((DB_TXNHEAD *)info)->thread_info;
1429	mmeta = NULL;
1430	REC_PRINT(__ham_groupalloc_42_print);
1431	REC_INTRO(__ham_groupalloc_42_read, ip, 1);
1432
1433	pgno = PGNO_BASE_MD;
1434	if ((ret = __memp_fget(mpf, &pgno, ip, NULL, 0, &mmeta)) != 0) {
1435		if (DB_REDO(op)) {
1436			ret = __db_pgerr(file_dbp, pgno, ret);
1437			goto out;
1438		} else
1439			goto done;
1440	}
1441
1442	cmp_p = LOG_COMPARE(&LSN(mmeta), &argp->meta_lsn);
1443	CHECK_LSN(env, op, cmp_p, &LSN(mmeta), &argp->meta_lsn);
1444
1445	/*
1446	 * Basically, we used mpool to allocate a chunk of pages.
1447	 * We need to either add those to a free list (in the undo
1448	 * case) or initialize them (in the redo case).
1449	 *
1450	 * If we are redoing and this is a hash subdatabase, it's possible
1451	 * that the pages were never allocated, so we'd better check for
1452	 * that and handle it here.
1453	 */
1454	pgno = argp->start_pgno + argp->num - 1;
1455	if (DB_REDO(op)) {
1456		if ((ret = __ham_alloc_pages_42(dbc, argp, lsnp)) != 0)
1457			goto out;
1458		if (cmp_p == 0) {
1459			REC_DIRTY(mpf, ip, dbc->priority, &mmeta);
1460			LSN(mmeta) = *lsnp;
1461		}
1462	} else if (DB_UNDO(op)) {
1463		/*
1464		 * We cannot roll back 4.2 style allocations.
1465		 */
1466		__db_errx(env,
1467"Cannot replicate prepared transactions from master running release 4.2.");
1468		ret = __env_panic(env, EINVAL);
1469		goto out;
1470	}
1471
1472	/*
1473	 * In both REDO and UNDO, we have grown the file and need to make
1474	 * sure that last_pgno is correct.  If we HAVE_FTRUNCATE pgno
1475	 * will only be valid on REDO.
1476	 */
1477	if (pgno > mmeta->last_pgno) {
1478		REC_DIRTY(mpf, ip, dbc->priority, &mmeta);
1479		mmeta->last_pgno = pgno;
1480	}
1481
1482done:	if (ret == 0)
1483		*lsnp = argp->prev_lsn;
1484	ret = 0;
1485
1486out:	if (mmeta != NULL)
1487		(void)__memp_fput(mpf, ip, mmeta, dbc->priority);
1488
1489	REC_CLOSE;
1490}
1491
1492/*
1493 * __ham_alloc_pages_42 --
1494 *
1495 * Called during redo of a file create.  We create new pages in the file
1496 * using the MPOOL_NEW_GROUP flag.  We then log the meta-data page with a
1497 * __crdel_metasub message.  If we manage to crash without the newly written
1498 * pages getting to disk (I'm not sure this can happen anywhere except our
1499 * test suite?!), then we need to go through a recreate the final pages.
1500 * Hash normally has holes in its files and handles them appropriately.
1501 */
1502static int
1503__ham_alloc_pages_42(dbc, argp, lsnp)
1504	DBC *dbc;
1505	__ham_groupalloc_42_args *argp;
1506	DB_LSN *lsnp;
1507{
1508	DB_MPOOLFILE *mpf;
1509	DB_THREAD_INFO *ip;
1510	PAGE *pagep;
1511	db_pgno_t pgno;
1512	int ret;
1513
1514	mpf = dbc->dbp->mpf;
1515	ip = dbc->thread_info;
1516
1517	/* Read the last page of the allocation. */
1518	pgno = argp->start_pgno + argp->num - 1;
1519
1520	/* If the page exists, and it has been initialized, then we're done. */
1521	if ((ret = __memp_fget(mpf,
1522	    &pgno, ip, NULL, 0, &pagep)) == 0) {
1523		if (NUM_ENT(pagep) == 0 && IS_ZERO_LSN(pagep->lsn))
1524			goto reinit_page;
1525		if ((ret = __memp_fput(mpf,
1526		    ip, pagep, dbc->priority)) != 0)
1527			return (ret);
1528		return (0);
1529	}
1530
1531	/* Had to create the page. */
1532	if ((ret = __memp_fget(mpf, &pgno, ip, NULL,
1533	    DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &pagep)) != 0)
1534		return (__db_pgerr(dbc->dbp, pgno, ret));
1535
1536reinit_page:
1537	/* Initialize the newly allocated page. */
1538	P_INIT(pagep,
1539	    dbc->dbp->pgsize, pgno, PGNO_INVALID, PGNO_INVALID, 0, P_HASH);
1540	pagep->lsn = *lsnp;
1541
1542	if ((ret = __memp_fput(mpf, ip, pagep, dbc->priority)) != 0)
1543		return (ret);
1544
1545	return (0);
1546}
1547