1/*-
2 * See the file LICENSE for redistribution information.
3 *
4 * Copyright (c) 1996,2008 Oracle.  All rights reserved.
5 *
6 * $Id: db_rec.c,v 12.53 2008/03/12 20:33:03 mbrey Exp $
7 */
8
9#include "db_config.h"
10
11#include "db_int.h"
12#include "dbinc/db_page.h"
13#include "dbinc/log.h"
14#include "dbinc/mp.h"
15#include "dbinc/hash.h"
16
/*
 * Prototypes for file-local (static) helpers.
 *
 * __db_pg_free_recover_int is the shared worker called by
 * __db_pg_free_recover and __db_pg_freedata_recover; its final int argument
 * says whether the record carries page data to restore.
 * __db_pg_free_recover_42_int takes the "_42" record layout with the same
 * argument order.
 */
static int __db_pg_free_recover_int __P((ENV *, DB_THREAD_INFO *,
    __db_pg_freedata_args *, DB *, DB_LSN *, DB_MPOOLFILE *, db_recops, int));
static int __db_pg_free_recover_42_int __P((ENV *, DB_THREAD_INFO *,
    __db_pg_freedata_42_args *,
    DB *, DB_LSN *, DB_MPOOLFILE *, db_recops, int));
22
23/*
24 * PUBLIC: int __db_addrem_recover
25 * PUBLIC:    __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
26 *
27 * This log message is generated whenever we add or remove a duplicate
28 * to/from a duplicate page.  On recover, we just do the opposite.
29 */
30int
31__db_addrem_recover(env, dbtp, lsnp, op, info)
32	ENV *env;
33	DBT *dbtp;
34	DB_LSN *lsnp;
35	db_recops op;
36	void *info;
37{
38	__db_addrem_args *argp;
39	DB_THREAD_INFO *ip;
40	DB *file_dbp;
41	DBC *dbc;
42	DB_MPOOLFILE *mpf;
43	PAGE *pagep;
44	int cmp_n, cmp_p, modified, ret;
45
46	ip = ((DB_TXNHEAD *)info)->thread_info;
47	pagep = NULL;
48	REC_PRINT(__db_addrem_print);
49	REC_INTRO(__db_addrem_read, ip, 1);
50
51	REC_FGET(mpf, ip, argp->pgno, &pagep, done);
52	modified = 0;
53
54	cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
55	cmp_p = LOG_COMPARE(&LSN(pagep), &argp->pagelsn);
56	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->pagelsn);
57	if ((cmp_p == 0 && DB_REDO(op) && argp->opcode == DB_ADD_DUP) ||
58	    (cmp_n == 0 && DB_UNDO(op) && argp->opcode == DB_REM_DUP)) {
59		/* Need to redo an add, or undo a delete. */
60		REC_DIRTY(mpf, ip, dbc->priority, &pagep);
61		if ((ret = __db_pitem(dbc, pagep, argp->indx, argp->nbytes,
62		    argp->hdr.size == 0 ? NULL : &argp->hdr,
63		    argp->dbt.size == 0 ? NULL : &argp->dbt)) != 0)
64			goto out;
65		modified = 1;
66
67	} else if ((cmp_n == 0 && DB_UNDO(op) && argp->opcode == DB_ADD_DUP) ||
68	    (cmp_p == 0 && DB_REDO(op) && argp->opcode == DB_REM_DUP)) {
69		/* Need to undo an add, or redo a delete. */
70		REC_DIRTY(mpf, ip, dbc->priority, &pagep);
71		if ((ret = __db_ditem(dbc,
72		    pagep, argp->indx, argp->nbytes)) != 0)
73			goto out;
74		modified = 1;
75	}
76
77	if (modified) {
78		if (DB_REDO(op))
79			LSN(pagep) = *lsnp;
80		else
81			LSN(pagep) = argp->pagelsn;
82	}
83
84	if ((ret = __memp_fput(mpf, ip, pagep, dbc->priority)) != 0)
85		goto out;
86	pagep = NULL;
87
88done:	*lsnp = argp->prev_lsn;
89	ret = 0;
90
91out:	if (pagep != NULL)
92		(void)__memp_fput(mpf, ip, pagep, dbc->priority);
93	REC_CLOSE;
94}
95
96/*
97 * PUBLIC: int __db_big_recover
98 * PUBLIC:     __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
99 */
100int
101__db_big_recover(env, dbtp, lsnp, op, info)
102	ENV *env;
103	DBT *dbtp;
104	DB_LSN *lsnp;
105	db_recops op;
106	void *info;
107{
108	__db_big_args *argp;
109	DB_THREAD_INFO *ip;
110	DB *file_dbp;
111	DBC *dbc;
112	DB_MPOOLFILE *mpf;
113	PAGE *pagep;
114	int cmp_n, cmp_p, modified, ret;
115
116	ip = ((DB_TXNHEAD *)info)->thread_info;
117	pagep = NULL;
118	REC_PRINT(__db_big_print);
119	REC_INTRO(__db_big_read, ip, 0);
120
121	REC_FGET(mpf, ip, argp->pgno, &pagep, ppage);
122	modified = 0;
123
124	/*
125	 * There are three pages we need to check.  The one on which we are
126	 * adding data, the previous one whose next_pointer may have
127	 * been updated, and the next one whose prev_pointer may have
128	 * been updated.
129	 */
130	cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
131	cmp_p = LOG_COMPARE(&LSN(pagep), &argp->pagelsn);
132	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->pagelsn);
133	if ((cmp_p == 0 && DB_REDO(op) && argp->opcode == DB_ADD_BIG) ||
134	    (cmp_n == 0 && DB_UNDO(op) && argp->opcode == DB_REM_BIG)) {
135		/* We are either redo-ing an add, or undoing a delete. */
136		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
137		P_INIT(pagep, file_dbp->pgsize, argp->pgno, argp->prev_pgno,
138			argp->next_pgno, 0, P_OVERFLOW);
139		OV_LEN(pagep) = argp->dbt.size;
140		OV_REF(pagep) = 1;
141		memcpy((u_int8_t *)pagep + P_OVERHEAD(file_dbp), argp->dbt.data,
142		    argp->dbt.size);
143		PREV_PGNO(pagep) = argp->prev_pgno;
144		modified = 1;
145	} else if ((cmp_n == 0 && DB_UNDO(op) && argp->opcode == DB_ADD_BIG) ||
146	    (cmp_p == 0 && DB_REDO(op) && argp->opcode == DB_REM_BIG)) {
147		/*
148		 * We are either undo-ing an add or redo-ing a delete.
149		 * The page is about to be reclaimed in either case, so
150		 * there really isn't anything to do here.
151		 */
152		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
153		modified = 1;
154	}
155	if (modified)
156		LSN(pagep) = DB_REDO(op) ? *lsnp : argp->pagelsn;
157
158	ret = __memp_fput(mpf, ip, pagep, file_dbp->priority);
159	pagep = NULL;
160	if (ret != 0)
161		goto out;
162
163	/*
164	 * We only delete a whole chain of overflow.
165	 * Each page is handled individually
166	 */
167	if (argp->opcode == DB_REM_BIG)
168		goto done;
169
170	/* Now check the previous page. */
171ppage:	if (argp->prev_pgno != PGNO_INVALID) {
172		REC_FGET(mpf, ip, argp->prev_pgno, &pagep, npage);
173		modified = 0;
174
175		cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
176		cmp_p = LOG_COMPARE(&LSN(pagep), &argp->prevlsn);
177		CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->prevlsn);
178
179		if (cmp_p == 0 && DB_REDO(op) && argp->opcode == DB_ADD_BIG) {
180			/* Redo add, undo delete. */
181			REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
182			NEXT_PGNO(pagep) = argp->pgno;
183			modified = 1;
184		} else if (cmp_n == 0 &&
185		    DB_UNDO(op) && argp->opcode == DB_ADD_BIG) {
186			/* Redo delete, undo add. */
187			REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
188			NEXT_PGNO(pagep) = argp->next_pgno;
189			modified = 1;
190		}
191		if (modified)
192			LSN(pagep) = DB_REDO(op) ? *lsnp : argp->prevlsn;
193		ret = __memp_fput(mpf, ip, pagep, file_dbp->priority);
194		pagep = NULL;
195		if (ret != 0)
196			goto out;
197	}
198	pagep = NULL;
199
200	/* Now check the next page.  Can only be set on a delete. */
201npage:	if (argp->next_pgno != PGNO_INVALID) {
202		REC_FGET(mpf, ip, argp->next_pgno, &pagep, done);
203		modified = 0;
204
205		cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
206		cmp_p = LOG_COMPARE(&LSN(pagep), &argp->nextlsn);
207		CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->nextlsn);
208		if (cmp_p == 0 && DB_REDO(op)) {
209			REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
210			PREV_PGNO(pagep) = PGNO_INVALID;
211			modified = 1;
212		} else if (cmp_n == 0 && DB_UNDO(op)) {
213			REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
214			PREV_PGNO(pagep) = argp->pgno;
215			modified = 1;
216		}
217		if (modified)
218			LSN(pagep) = DB_REDO(op) ? *lsnp : argp->nextlsn;
219		ret = __memp_fput(mpf, ip, pagep, file_dbp->priority);
220		pagep = NULL;
221		if (ret != 0)
222			goto out;
223	}
224	pagep = NULL;
225
226done:	*lsnp = argp->prev_lsn;
227	ret = 0;
228
229out:	if (pagep != NULL)
230		(void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
231	REC_CLOSE;
232}
233
234/*
235 * __db_ovref_recover --
236 *	Recovery function for __db_ovref().
237 *
238 * PUBLIC: int __db_ovref_recover
239 * PUBLIC:     __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
240 */
241int
242__db_ovref_recover(env, dbtp, lsnp, op, info)
243	ENV *env;
244	DBT *dbtp;
245	DB_LSN *lsnp;
246	db_recops op;
247	void *info;
248{
249	__db_ovref_args *argp;
250	DB_THREAD_INFO *ip;
251	DB *file_dbp;
252	DBC *dbc;
253	DB_MPOOLFILE *mpf;
254	PAGE *pagep;
255	int cmp, ret;
256
257	ip = ((DB_TXNHEAD *)info)->thread_info;
258	pagep = NULL;
259	REC_PRINT(__db_ovref_print);
260	REC_INTRO(__db_ovref_read, ip, 0);
261
262	REC_FGET(mpf, ip, argp->pgno, &pagep, done);
263
264	cmp = LOG_COMPARE(&LSN(pagep), &argp->lsn);
265	CHECK_LSN(env, op, cmp, &LSN(pagep), &argp->lsn);
266	if (cmp == 0 && DB_REDO(op)) {
267		/* Need to redo update described. */
268		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
269		OV_REF(pagep) += argp->adjust;
270		pagep->lsn = *lsnp;
271	} else if (LOG_COMPARE(lsnp, &LSN(pagep)) == 0 && DB_UNDO(op)) {
272		/* Need to undo update described. */
273		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
274		OV_REF(pagep) -= argp->adjust;
275		pagep->lsn = argp->lsn;
276	}
277	ret = __memp_fput(mpf, ip, pagep, file_dbp->priority);
278	pagep = NULL;
279	if (ret != 0)
280		goto out;
281	pagep = NULL;
282
283done:	*lsnp = argp->prev_lsn;
284	ret = 0;
285
286out:	if (pagep != NULL)
287		(void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
288	REC_CLOSE;
289}
290
291/*
292 * __db_debug_recover --
293 *	Recovery function for debug.
294 *
295 * PUBLIC: int __db_debug_recover __P((ENV *,
296 * PUBLIC:     DBT *, DB_LSN *, db_recops, void *));
297 */
298int
299__db_debug_recover(env, dbtp, lsnp, op, info)
300	ENV *env;
301	DBT *dbtp;
302	DB_LSN *lsnp;
303	db_recops op;
304	void *info;
305{
306	__db_debug_args *argp;
307	int ret;
308
309	COMPQUIET(op, DB_TXN_ABORT);
310	COMPQUIET(info, NULL);
311
312	REC_PRINT(__db_debug_print);
313	REC_NOOP_INTRO(__db_debug_read);
314
315	*lsnp = argp->prev_lsn;
316	ret = 0;
317
318	REC_NOOP_CLOSE;
319}
320
321/*
322 * __db_noop_recover --
323 *	Recovery function for noop.
324 *
325 * PUBLIC: int __db_noop_recover __P((ENV *,
326 * PUBLIC:      DBT *, DB_LSN *, db_recops, void *));
327 */
328int
329__db_noop_recover(env, dbtp, lsnp, op, info)
330	ENV *env;
331	DBT *dbtp;
332	DB_LSN *lsnp;
333	db_recops op;
334	void *info;
335{
336	__db_noop_args *argp;
337	DB_THREAD_INFO *ip;
338	DB *file_dbp;
339	DBC *dbc;
340	DB_MPOOLFILE *mpf;
341	PAGE *pagep;
342	int cmp_n, cmp_p, ret;
343
344	ip = ((DB_TXNHEAD *)info)->thread_info;
345	pagep = NULL;
346	REC_PRINT(__db_noop_print);
347	REC_INTRO(__db_noop_read, ip, 0);
348
349	REC_FGET(mpf, ip, argp->pgno, &pagep, done);
350
351	cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
352	cmp_p = LOG_COMPARE(&LSN(pagep), &argp->prevlsn);
353	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->prevlsn);
354	if (cmp_p == 0 && DB_REDO(op)) {
355		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
356		LSN(pagep) = *lsnp;
357	} else if (cmp_n == 0 && DB_UNDO(op)) {
358		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
359		LSN(pagep) = argp->prevlsn;
360	}
361	ret = __memp_fput(mpf, ip, pagep, file_dbp->priority);
362	pagep = NULL;
363
364done:	*lsnp = argp->prev_lsn;
365out:	if (pagep != NULL)
366		(void)__memp_fput(mpf,
367		    ip, pagep, file_dbp->priority);
368	REC_CLOSE;
369}
370
371/*
372 * __db_pg_alloc_recover --
373 *	Recovery function for pg_alloc.
374 *
375 * PUBLIC: int __db_pg_alloc_recover
376 * PUBLIC:   __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
377 */
378int
379__db_pg_alloc_recover(env, dbtp, lsnp, op, info)
380	ENV *env;
381	DBT *dbtp;
382	DB_LSN *lsnp;
383	db_recops op;
384	void *info;
385{
386	__db_pg_alloc_args *argp;
387	DB_THREAD_INFO *ip;
388	DB *file_dbp;
389	DBC *dbc;
390	DBMETA *meta;
391	DB_MPOOLFILE *mpf;
392	PAGE *pagep;
393	db_pgno_t pgno;
394	int cmp_n, cmp_p, created, level, ret;
395
396	ip = ((DB_TXNHEAD *)info)->thread_info;
397	meta = NULL;
398	pagep = NULL;
399	created = 0;
400	REC_PRINT(__db_pg_alloc_print);
401	REC_INTRO(__db_pg_alloc_read, ip, 0);
402
403	/*
404	 * Fix up the metadata page.  If we're redoing the operation, we have
405	 * to get the metadata page and update its LSN and its free pointer.
406	 * If we're undoing the operation and the page was ever created, we put
407	 * it on the freelist.
408	 */
409	pgno = PGNO_BASE_MD;
410	if ((ret = __memp_fget(mpf, &pgno, ip, NULL, 0, &meta)) != 0) {
411		/* The metadata page must always exist on redo. */
412		if (DB_REDO(op)) {
413			ret = __db_pgerr(file_dbp, pgno, ret);
414			goto out;
415		} else
416			goto done;
417	}
418	cmp_n = LOG_COMPARE(lsnp, &LSN(meta));
419	cmp_p = LOG_COMPARE(&LSN(meta), &argp->meta_lsn);
420	CHECK_LSN(env, op, cmp_p, &LSN(meta), &argp->meta_lsn);
421	if (cmp_p == 0 && DB_REDO(op)) {
422		/* Need to redo update described. */
423		REC_DIRTY(mpf, ip, file_dbp->priority, &meta);
424		LSN(meta) = *lsnp;
425		meta->free = argp->next;
426		if (argp->pgno > meta->last_pgno)
427			meta->last_pgno = argp->pgno;
428	} else if (cmp_n == 0 && DB_UNDO(op)) {
429		/* Need to undo update described. */
430		REC_DIRTY(mpf, ip, file_dbp->priority, &meta);
431		LSN(meta) = argp->meta_lsn;
432		/*
433		 * If the page has a zero LSN then its newly created and
434		 * will be truncated rather than go on the free list.
435		 */
436		if (!IS_ZERO_LSN(argp->page_lsn))
437			meta->free = argp->pgno;
438		meta->last_pgno = argp->last_pgno;
439	}
440
441#ifdef HAVE_FTRUNCATE
442	/*
443	 * Check to see if we are keeping a sorted
444	 * freelist, if so put this back in the in
445	 * memory list.  It must be the first element.
446	 */
447	if (op == DB_TXN_ABORT && !IS_ZERO_LSN(argp->page_lsn)) {
448		db_pgno_t *list;
449		u_int32_t nelem;
450
451		if ((ret = __memp_get_freelist(mpf, &nelem, &list)) != 0)
452			goto out;
453		if (list != NULL) {
454			if ((ret =
455			    __memp_extend_freelist(mpf, nelem + 1, &list)) != 0)
456				goto out;
457			if (nelem != 0)
458				memmove(list + 1, list, nelem * sizeof(list));
459			*list = argp->pgno;
460		}
461	}
462#endif
463
464	/*
465	 * Fix up the allocated page. If the page does not exist
466	 * and we can truncate it then don't create it.
467	 * Otherwise if we're redoing the operation, we have
468	 * to get the page (creating it if it doesn't exist), and update its
469	 * LSN.  If we're undoing the operation, we have to reset the page's
470	 * LSN and put it on the free list.
471	 */
472	if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) {
473		/*
474		 * We have to be able to identify if a page was newly
475		 * created so we can recover it properly.  We cannot simply
476		 * look for an empty header, because hash uses a pgin
477		 * function that will set the header.  Instead, we explicitly
478		 * try for the page without CREATE and if that fails, then
479		 * create it.
480		 */
481		if (DB_UNDO(op))
482			goto do_truncate;
483		if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL,
484		    DB_MPOOL_CREATE, &pagep)) != 0) {
485			if (DB_UNDO(op) && ret == ENOSPC)
486				goto do_truncate;
487			ret = __db_pgerr(file_dbp, argp->pgno, ret);
488			goto out;
489		}
490		created = 1;
491	}
492
493	/* Fix up the allocated page. */
494	cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
495	cmp_p = LOG_COMPARE(&LSN(pagep), &argp->page_lsn);
496
497	/*
498	 * If an initial allocation is aborted and then reallocated during
499	 * an archival restore the log record will have an LSN for the page
500	 * but the page will be empty.
501	 */
502	if (IS_ZERO_LSN(LSN(pagep)))
503		cmp_p = 0;
504
505	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->page_lsn);
506	/*
507	 * Another special case we have to handle is if we ended up with a
508	 * page of all 0's which can happen if we abort between allocating a
509	 * page in mpool and initializing it.  In that case, even if we're
510	 * undoing, we need to re-initialize the page.
511	 */
512	if (DB_REDO(op) && cmp_p == 0) {
513		/* Need to redo update described. */
514		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
515		switch (argp->ptype) {
516		case P_LBTREE:
517		case P_LRECNO:
518		case P_LDUP:
519			level = LEAFLEVEL;
520			break;
521		default:
522			level = 0;
523			break;
524		}
525		P_INIT(pagep, file_dbp->pgsize,
526		    argp->pgno, PGNO_INVALID, PGNO_INVALID, level, argp->ptype);
527
528		pagep->lsn = *lsnp;
529	} else if (DB_UNDO(op) && (cmp_n == 0 || created)) {
530		/*
531		 * This is where we handle the case of a 0'd page (pagep->pgno
532		 * is equal to PGNO_INVALID).
533		 * Undo the allocation, reinitialize the page and
534		 * link its next pointer to the free list.
535		 */
536		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
537		P_INIT(pagep, file_dbp->pgsize,
538		    argp->pgno, PGNO_INVALID, argp->next, 0, P_INVALID);
539
540		pagep->lsn = argp->page_lsn;
541	}
542
543do_truncate:
544	/*
545	 * If the page was newly created, give it back.
546	 */
547	if ((pagep == NULL || IS_ZERO_LSN(LSN(pagep))) &&
548	    IS_ZERO_LSN(argp->page_lsn) && DB_UNDO(op)) {
549		/* Discard the page. */
550		if (pagep != NULL) {
551			if ((ret = __memp_fput(mpf, ip,
552			    pagep, DB_PRIORITY_VERY_LOW)) != 0)
553				goto out;
554			pagep = NULL;
555		}
556		/* Give the page back to the OS. */
557		if (meta->last_pgno <= argp->pgno && (ret = __memp_ftruncate(
558		    mpf, ip, argp->pgno, MP_TRUNC_RECOVER)) != 0)
559			goto out;
560	}
561
562	if (pagep != NULL) {
563		ret = __memp_fput(mpf, ip, pagep, file_dbp->priority);
564		pagep = NULL;
565		if (ret != 0)
566			goto out;
567	}
568
569	ret = __memp_fput(mpf, ip, meta, file_dbp->priority);
570	meta = NULL;
571	if (ret != 0)
572		goto out;
573
574done:	*lsnp = argp->prev_lsn;
575	ret = 0;
576
577out:	if (pagep != NULL)
578		(void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
579	if (meta != NULL)
580		(void)__memp_fput(mpf, ip, meta, file_dbp->priority);
581	REC_CLOSE;
582}
583
/*
 * __db_pg_free_recover_int --
 *	Shared worker for the pg_free and pg_freedata recovery functions.
 *
 *	argp	decoded log record; the pg_free caller passes its record
 *		cast to the freedata layout
 *	lsnp	LSN of the record being processed (read only here)
 *	data	non-zero when argp->data holds page contents that must be
 *		copied back onto the page when the free is undone
 *
 *	Two pages are processed: the "metapage" named by argp->meta_pgno
 *	and the freed page argp->pgno.  Returns 0 on success, otherwise the
 *	error from the failing call.
 */
static int
__db_pg_free_recover_int(env, ip, argp, file_dbp, lsnp, mpf, op, data)
	ENV *env;
	DB_THREAD_INFO *ip;
	__db_pg_freedata_args *argp;
	DB *file_dbp;
	DB_LSN *lsnp;
	DB_MPOOLFILE *mpf;
	db_recops op;
	int data;
{
	DBMETA *meta;
	DB_LSN copy_lsn;
	PAGE *pagep, *prevp;
	int cmp_n, cmp_p, is_meta, ret;

	meta = NULL;
	pagep = prevp = NULL;

	/*
	 * Get the "metapage".  This will either be the metapage
	 * or the previous page in the free list if we are doing
	 * sorted allocations.  If it's a previous page then
	 * we will not be truncating.
	 */
	is_meta = argp->meta_pgno == PGNO_BASE_MD;

	/* If the page cannot be fetched, skip ahead to check_meta. */
	REC_FGET(mpf, ip, argp->meta_pgno, &meta, check_meta);

	/* A free-list predecessor is an ordinary page; view it as one. */
	if (argp->meta_pgno != PGNO_BASE_MD)
		prevp = (PAGE *)meta;

	cmp_n = LOG_COMPARE(lsnp, &LSN(meta));
	cmp_p = LOG_COMPARE(&LSN(meta), &argp->meta_lsn);
	CHECK_LSN(env, op, cmp_p, &LSN(meta), &argp->meta_lsn);

	/*
	 * Fix up the metadata page.  If we're redoing or undoing the operation
	 * we get the page and update its LSN, last and free pointer.
	 */
	if (cmp_p == 0 && DB_REDO(op)) {
		REC_DIRTY(mpf, ip, file_dbp->priority, &meta);
		/*
		 * If we are at the end of the file truncate, otherwise
		 * put on the free list.
		 */
		if (argp->pgno == argp->last_pgno)
			meta->last_pgno = argp->pgno - 1;
		else if (is_meta)
			meta->free = argp->pgno;
		else
			NEXT_PGNO(prevp) = argp->pgno;
		LSN(meta) = *lsnp;
	} else if (cmp_n == 0 && DB_UNDO(op)) {
		/* Need to undo the deallocation. */
		REC_DIRTY(mpf, ip, file_dbp->priority, &meta);
		if (is_meta) {
			if (meta->last_pgno < argp->pgno)
				meta->last_pgno = argp->pgno;
			meta->free = argp->next;
		} else
			NEXT_PGNO(prevp) = argp->next;
		LSN(meta) = argp->meta_lsn;
	}

check_meta:
	/*
	 * ret is non-zero here only when the fetch above failed.  A missing
	 * free-list predecessor is tolerated (meta stays NULL and every use
	 * of meta below is guarded by is_meta); a missing metadata page is
	 * an error.
	 */
	if (ret != 0 && is_meta) {
		/* The metadata page must always exist. */
		ret = __db_pgerr(file_dbp, argp->meta_pgno, ret);
		goto out;
	}

	/*
	 * Get the freed page.  Don't create the page if we are going to
	 * free it.  If we're redoing the operation we get the page and
	 * explicitly discard its contents, then update its LSN. If we're
	 * undoing the operation, we get the page and restore its header.
	 */
	if (DB_REDO(op) || (is_meta && meta->last_pgno < argp->pgno)) {
		if ((ret = __memp_fget(mpf, &argp->pgno,
		    ip, NULL, 0, &pagep)) != 0) {
			if (ret != DB_PAGE_NOTFOUND)
				goto out;
			/*
			 * The page is not there.  On redo with the page at
			 * or past the metapage's last page, jump straight to
			 * the truncate call; otherwise there is nothing to
			 * do for this page.
			 */
			if (is_meta &&
			    DB_REDO(op) && meta->last_pgno <= argp->pgno)
				goto trunc;
			goto done;
		}
	} else if ((ret = __memp_fget(mpf, &argp->pgno,
	   ip, NULL, DB_MPOOL_CREATE, &pagep)) != 0)
		goto out;

	/*
	 * The LSN the page had before it was freed is taken from the page
	 * header saved in the record; it is copied out with __ua_memcpy
	 * rather than read in place.
	 */
	(void)__ua_memcpy(&copy_lsn, &LSN(argp->header.data), sizeof(DB_LSN));
	cmp_n = IS_ZERO_LSN(LSN(pagep)) ? 0 : LOG_COMPARE(lsnp, &LSN(pagep));
	cmp_p = LOG_COMPARE(&LSN(pagep), &copy_lsn);

	/*
	 * This page got extended by a later allocation,
	 * but its allocation was not in the scope of this
	 * recovery pass.
	 */
	if (IS_ZERO_LSN(LSN(pagep)))
		cmp_p = 0;

	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &copy_lsn);
	if (DB_REDO(op) &&
	    (cmp_p == 0 ||
	    (IS_ZERO_LSN(copy_lsn) &&
	    LOG_COMPARE(&LSN(pagep), &argp->meta_lsn) <= 0))) {
		/* Need to redo the deallocation. */
		/*
		 * The page can be truncated if it was truncated at runtime
		 * and the current metapage reflects the truncation.
		 */
		if (is_meta && meta->last_pgno <= argp->pgno &&
		    argp->last_pgno <= argp->pgno) {
			if ((ret = __memp_fput(mpf, ip,
			    pagep, DB_PRIORITY_VERY_LOW)) != 0)
				goto out;
			pagep = NULL;
			/*
			 * "trunc" is also entered directly from the failed
			 * fetch above, in which case pagep is still NULL.
			 */
trunc:			if ((ret = __memp_ftruncate(mpf, ip,
			    argp->pgno, MP_TRUNC_RECOVER)) != 0)
				goto out;
		} else if (argp->last_pgno == argp->pgno) {
			/* The page was truncated at runtime, zero it out. */
			REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
			P_INIT(pagep, 0, PGNO_INVALID,
			    PGNO_INVALID, PGNO_INVALID, 0, P_INVALID);
			ZERO_LSN(pagep->lsn);
		} else {
			/* Turn it into a free-list page linked to argp->next. */
			REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
			P_INIT(pagep, file_dbp->pgsize,
			    argp->pgno, PGNO_INVALID, argp->next, 0, P_INVALID);
			pagep->lsn = *lsnp;

		}
	} else if (cmp_n == 0 && DB_UNDO(op)) {
		/* Need to reallocate the page. */
		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
		/* Restore the saved header first; HOFFSET below reads it. */
		memcpy(pagep, argp->header.data, argp->header.size);
		if (data)
			memcpy((u_int8_t*)pagep + HOFFSET(pagep),
			     argp->data.data, argp->data.size);
	}
	/*
	 * NOTE(review): if this put fails, pagep is still set when control
	 * reaches "out" and the page is put a second time there -- confirm
	 * whether that is intended.
	 */
	if (pagep != NULL &&
	    (ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
		goto out;

	pagep = NULL;
#ifdef HAVE_FTRUNCATE
	/*
	 * If we are keeping an in memory free list remove this
	 * element from the list.
	 */
	if (op == DB_TXN_ABORT && argp->pgno != argp->last_pgno) {
		db_pgno_t *lp;
		u_int32_t nelem, pos;

		if ((ret = __memp_get_freelist(mpf, &nelem, &lp)) != 0)
			goto out;
		if (lp != NULL) {
			/*
			 * When the predecessor is the metadata page the
			 * element removed is the first one; otherwise look
			 * up the page's position in the list.
			 */
			pos = 0;
			if (!is_meta) {
				__db_freelist_pos(argp->pgno, lp, nelem, &pos);

				DB_ASSERT(env, argp->pgno == lp[pos]);
				DB_ASSERT(env,
				    argp->meta_pgno == lp[pos - 1]);
			}

			/* Close the gap left by the removed element. */
			if (pos < nelem)
				memmove(&lp[pos], &lp[pos + 1],
				    ((nelem - pos) - 1) * sizeof(*lp));

			/* Shrink the list */
			if ((ret =
			    __memp_extend_freelist(mpf, nelem - 1, &lp)) != 0)
				goto out;
		}
	}
#endif
done:
	if (meta != NULL &&
	     (ret = __memp_fput(mpf, ip,  meta, file_dbp->priority)) != 0)
		goto out;
	meta = NULL;
	ret = 0;

	/* Error exit: release whatever is still held and return ret. */
out:	if (pagep != NULL)
		(void)__memp_fput(mpf, ip,  pagep, file_dbp->priority);
	if (meta != NULL)
		(void)__memp_fput(mpf, ip,  meta, file_dbp->priority);

	return (ret);
}
782
783/*
784 * __db_pg_free_recover --
785 *	Recovery function for pg_free.
786 *
787 * PUBLIC: int __db_pg_free_recover
788 * PUBLIC:   __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
789 */
790int
791__db_pg_free_recover(env, dbtp, lsnp, op, info)
792	ENV *env;
793	DBT *dbtp;
794	DB_LSN *lsnp;
795	db_recops op;
796	void *info;
797{
798	__db_pg_free_args *argp;
799	DB *file_dbp;
800	DBC *dbc;
801	DB_MPOOLFILE *mpf;
802	DB_THREAD_INFO *ip;
803	int ret;
804
805	ip = ((DB_TXNHEAD *)info)->thread_info;
806	REC_PRINT(__db_pg_free_print);
807	REC_INTRO(__db_pg_free_read, ip, 0);
808
809	ret = __db_pg_free_recover_int(env, ip,
810	     (__db_pg_freedata_args *)argp, file_dbp, lsnp, mpf, op, 0);
811
812done:	*lsnp = argp->prev_lsn;
813out:
814	REC_CLOSE;
815}
816
817/*
818 * __db_pg_freedata_recover --
819 *	Recovery function for pg_freedata.
820 *
821 * PUBLIC: int __db_pg_freedata_recover
822 * PUBLIC:   __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
823 */
824int
825__db_pg_freedata_recover(env, dbtp, lsnp, op, info)
826	ENV *env;
827	DBT *dbtp;
828	DB_LSN *lsnp;
829	db_recops op;
830	void *info;
831{
832	__db_pg_freedata_args *argp;
833	DB *file_dbp;
834	DBC *dbc;
835	DB_MPOOLFILE *mpf;
836	DB_THREAD_INFO *ip;
837	int ret;
838
839	ip = ((DB_TXNHEAD *)info)->thread_info;
840	REC_PRINT(__db_pg_freedata_print);
841	REC_INTRO(__db_pg_freedata_read, ip, 0);
842
843	ret = __db_pg_free_recover_int(env,
844	    ip, argp, file_dbp, lsnp, mpf, op, 1);
845
846done:	*lsnp = argp->prev_lsn;
847out:
848	REC_CLOSE;
849}
850
851/*
852 * __db_cksum_recover --
853 *	Recovery function for checksum failure log record.
854 *
855 * PUBLIC: int __db_cksum_recover __P((ENV *,
856 * PUBLIC:      DBT *, DB_LSN *, db_recops, void *));
857 */
858int
859__db_cksum_recover(env, dbtp, lsnp, op, info)
860	ENV *env;
861	DBT *dbtp;
862	DB_LSN *lsnp;
863	db_recops op;
864	void *info;
865{
866	__db_cksum_args *argp;
867	int ret;
868
869	COMPQUIET(info, NULL);
870	COMPQUIET(lsnp, NULL);
871	COMPQUIET(op, DB_TXN_ABORT);
872
873	REC_PRINT(__db_cksum_print);
874
875	if ((ret = __db_cksum_read(env, dbtp->data, &argp)) != 0)
876		return (ret);
877
878	/*
879	 * We had a checksum failure -- the only option is to run catastrophic
880	 * recovery.
881	 */
882	if (F_ISSET(env, ENV_RECOVER_FATAL))
883		ret = 0;
884	else {
885		__db_errx(env,
886		    "Checksum failure requires catastrophic recovery");
887		ret = __env_panic(env, DB_RUNRECOVERY);
888	}
889
890	__os_free(env, argp);
891	return (ret);
892}
893
894/*
895 * __db_pg_init_recover --
896 *	Recovery function to reinit pages after truncation.
897 *
898 * PUBLIC: int __db_pg_init_recover
899 * PUBLIC:   __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
900 */
901int
902__db_pg_init_recover(env, dbtp, lsnp, op, info)
903	ENV *env;
904	DBT *dbtp;
905	DB_LSN *lsnp;
906	db_recops op;
907	void *info;
908{
909	__db_pg_init_args *argp;
910	DB_THREAD_INFO *ip;
911	DB *file_dbp;
912	DBC *dbc;
913	DB_LSN copy_lsn;
914	DB_MPOOLFILE *mpf;
915	PAGE *pagep;
916	int cmp_n, cmp_p, ret, type;
917
918	ip = ((DB_TXNHEAD *)info)->thread_info;
919	REC_PRINT(__db_pg_init_print);
920	REC_INTRO(__db_pg_init_read, ip, 0);
921
922	mpf = file_dbp->mpf;
923	if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) {
924		if (DB_UNDO(op)) {
925			if (ret == DB_PAGE_NOTFOUND)
926				goto done;
927			else {
928				ret = __db_pgerr(file_dbp, argp->pgno, ret);
929				goto out;
930			}
931		}
932
933		/*
934		 * This page was truncated and may simply not have
935		 * had an item written to it yet.  This should only
936		 * happen on hash databases, so confirm that.
937		 */
938		DB_ASSERT(env, file_dbp->type == DB_HASH);
939		if ((ret = __memp_fget(mpf, &argp->pgno,
940		    ip, NULL, DB_MPOOL_CREATE, &pagep)) != 0) {
941			ret = __db_pgerr(file_dbp, argp->pgno, ret);
942			goto out;
943		}
944	}
945
946	(void)__ua_memcpy(&copy_lsn, &LSN(argp->header.data), sizeof(DB_LSN));
947	cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
948	cmp_p = LOG_COMPARE(&LSN(pagep), &copy_lsn);
949	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &copy_lsn);
950
951	if (cmp_p == 0 && DB_REDO(op)) {
952		if (TYPE(pagep) == P_HASH)
953			type = P_HASH;
954		else
955			type = file_dbp->type == DB_RECNO ? P_LRECNO : P_LBTREE;
956		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
957		P_INIT(pagep, file_dbp->pgsize, PGNO(pagep), PGNO_INVALID,
958		    PGNO_INVALID, TYPE(pagep) == P_HASH ? 0 : 1, type);
959		pagep->lsn = *lsnp;
960	} else if (cmp_n == 0 && DB_UNDO(op)) {
961		/* Put the data back on the page. */
962		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
963		memcpy(pagep, argp->header.data, argp->header.size);
964		if (argp->data.size > 0)
965			memcpy((u_int8_t*)pagep + HOFFSET(pagep),
966			     argp->data.data, argp->data.size);
967	}
968	if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
969		goto out;
970
971done:	*lsnp = argp->prev_lsn;
972out:
973	REC_CLOSE;
974}
975
976/*
977 * __db_pg_sort_recover --
978 *	Recovery function for pg_sort.
979 *
980 * PUBLIC: int __db_pg_sort_recover
981 * PUBLIC:   __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
982 */
983int
984__db_pg_sort_recover(env, dbtp, lsnp, op, info)
985	ENV *env;
986	DBT *dbtp;
987	DB_LSN *lsnp;
988	db_recops op;
989	void *info;
990{
991#ifdef HAVE_FTRUNCATE
992	__db_pg_sort_args *argp;
993	DB_THREAD_INFO *ip;
994	DB *file_dbp;
995	DBC *dbc;
996	DBMETA *meta;
997	DB_MPOOLFILE *mpf;
998	PAGE *pagep;
999	db_pglist_t *pglist, *lp;
1000	db_pgno_t pgno, *list;
1001	u_int32_t felem, nelem;
1002	int ret;
1003
1004	ip = ((DB_TXNHEAD *)info)->thread_info;
1005	REC_PRINT(__db_pg_sort_print);
1006	REC_INTRO(__db_pg_sort_read, ip, 1);
1007
1008	pglist = (db_pglist_t *) argp->list.data;
1009	nelem = argp->list.size / sizeof(db_pglist_t);
1010	if (DB_REDO(op)) {
1011		pgno = argp->last_pgno;
1012		if ((ret = __db_pg_truncate(dbc, NULL,
1013		    pglist, NULL, &nelem, &pgno, lsnp, 1)) != 0)
1014			goto out;
1015
1016		if (argp->last_free != PGNO_INVALID) {
1017			if ((ret = __memp_fget(mpf,
1018			    &argp->last_free, ip, NULL, 0, &meta)) == 0) {
1019				if (LOG_COMPARE(&LSN(meta),
1020				     &argp->last_lsn) == 0) {
1021					REC_DIRTY(mpf,
1022					    ip, dbc->priority, &meta);
1023					NEXT_PGNO(meta) = PGNO_INVALID;
1024					LSN(meta) = *lsnp;
1025				}
1026				if ((ret = __memp_fput(mpf, ip,
1027				    meta, file_dbp->priority)) != 0)
1028					goto out;
1029				meta = NULL;
1030			} else if (ret != DB_PAGE_NOTFOUND)
1031				goto out;
1032		}
1033		if ((ret = __memp_fget(mpf, &argp->meta, ip, NULL,
1034		    0, &meta)) != 0)
1035			goto out;
1036		if (LOG_COMPARE(&LSN(meta), &argp->meta_lsn) == 0) {
1037			REC_DIRTY(mpf, ip, dbc->priority, &meta);
1038			if (argp->last_free == PGNO_INVALID) {
1039				if (nelem == 0)
1040					meta->free = PGNO_INVALID;
1041				else
1042					meta->free = pglist->pgno;
1043			}
1044			meta->last_pgno = pgno;
1045			LSN(meta) = *lsnp;
1046		}
1047	} else {
1048		/* Put the free list back in its original order. */
1049		for (lp = pglist; lp < &pglist[nelem]; lp++) {
1050			if ((ret = __memp_fget(mpf, &lp->pgno, ip,
1051			    NULL, DB_MPOOL_CREATE, &pagep)) != 0)
1052				goto out;
1053			if (IS_ZERO_LSN(LSN(pagep)) ||
1054			     LOG_COMPARE(&LSN(pagep), lsnp) == 0) {
1055				REC_DIRTY(mpf, ip, dbc->priority, &pagep);
1056				if (lp == &pglist[nelem - 1])
1057					pgno = PGNO_INVALID;
1058				else
1059					pgno = lp[1].pgno;
1060
1061				P_INIT(pagep, file_dbp->pgsize,
1062				    lp->pgno, PGNO_INVALID, pgno, 0, P_INVALID);
1063				LSN(pagep) = lp->lsn;
1064			}
1065			if ((ret = __memp_fput(mpf,
1066			    ip, pagep, file_dbp->priority)) != 0)
1067				goto out;
1068		}
1069		if (argp->last_free != PGNO_INVALID) {
1070			if ((ret = __memp_fget(mpf, &argp->last_free,
1071			    ip, NULL, DB_MPOOL_EDIT, &meta)) == 0) {
1072				if (LOG_COMPARE(&LSN(meta), lsnp) == 0) {
1073					REC_DIRTY(mpf,
1074					    ip, dbc->priority, &pagep);
1075					NEXT_PGNO(meta) = pglist->pgno;
1076					LSN(meta) = argp->last_lsn;
1077				}
1078				if ((ret = __memp_fput(mpf, ip,
1079				    meta, file_dbp->priority)) != 0)
1080					goto out;
1081			} else if (ret != DB_PAGE_NOTFOUND)
1082				goto out;
1083			meta = NULL;
1084		}
1085		if ((ret = __memp_fget(mpf, &argp->meta,
1086		    ip, NULL, DB_MPOOL_EDIT, &meta)) != 0)
1087			goto out;
1088		if (LOG_COMPARE(&LSN(meta), lsnp) == 0) {
1089			REC_DIRTY(mpf, ip, dbc->priority, &meta);
1090			meta->last_pgno = argp->last_pgno;
1091			if (argp->last_free == PGNO_INVALID)
1092				meta->free = pglist->pgno;
1093			LSN(meta) = argp->meta_lsn;
1094		}
1095	}
1096	if (op == DB_TXN_ABORT) {
1097		if ((ret = __memp_get_freelist(mpf, &felem, &list)) != 0)
1098			goto out;
1099		if (list != NULL) {
1100			DB_ASSERT(env, felem == 0 ||
1101			    argp->last_free == list[felem - 1]);
1102			if ((ret = __memp_extend_freelist(
1103			    mpf, felem + nelem, &list)) != 0)
1104				goto out;
1105			for (lp = pglist; lp < &pglist[nelem]; lp++)
1106				list[felem++] = lp->pgno;
1107		}
1108	}
1109
1110	if ((ret = __memp_fput(mpf, ip, meta, file_dbp->priority)) != 0)
1111		goto out;
1112
1113done:	*lsnp = argp->prev_lsn;
1114	ret = 0;
1115
1116out:	REC_CLOSE;
1117#else
1118	/*
1119	 * If HAVE_FTRUNCATE is not defined, we'll never see pg_sort records
1120	 * to recover.
1121	 */
1122	COMPQUIET(env, NULL);
1123	COMPQUIET(dbtp, NULL);
1124	COMPQUIET(lsnp, NULL);
1125	COMPQUIET(op,  DB_TXN_ABORT);
1126	COMPQUIET(info, NULL);
1127	return (EINVAL);
1128#endif
1129}
1130
1131/*
1132 * __db_pg_alloc_42_recover --
1133 *	Recovery function for pg_alloc.
1134 *
1135 * PUBLIC: int __db_pg_alloc_42_recover
1136 * PUBLIC:   __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
1137 */
1138int
1139__db_pg_alloc_42_recover(env, dbtp, lsnp, op, info)
1140	ENV *env;
1141	DBT *dbtp;
1142	DB_LSN *lsnp;
1143	db_recops op;
1144	void *info;
1145{
1146	__db_pg_alloc_42_args *argp;
1147	DB_THREAD_INFO *ip;
1148	DB *file_dbp;
1149	DBC *dbc;
1150	DBMETA *meta;
1151	DB_MPOOLFILE *mpf;
1152	PAGE *pagep;
1153	db_pgno_t pgno;
1154	int cmp_n, cmp_p, created, level, ret;
1155
1156	ip = ((DB_TXNHEAD *)info)->thread_info;
1157	meta = NULL;
1158	pagep = NULL;
1159	created = 0;
1160	REC_PRINT(__db_pg_alloc_42_print);
1161	REC_INTRO(__db_pg_alloc_42_read, ip, 0);
1162
1163	/*
1164	 * Fix up the metadata page.  If we're redoing the operation, we have
1165	 * to get the metadata page and update its LSN and its free pointer.
1166	 * If we're undoing the operation and the page was ever created, we put
1167	 * it on the freelist.
1168	 */
1169	pgno = PGNO_BASE_MD;
1170	if ((ret = __memp_fget(mpf, &pgno, ip, NULL, 0, &meta)) != 0) {
1171		/* The metadata page must always exist on redo. */
1172		if (DB_REDO(op)) {
1173			ret = __db_pgerr(file_dbp, pgno, ret);
1174			goto out;
1175		} else
1176			goto done;
1177	}
1178	cmp_n = LOG_COMPARE(lsnp, &LSN(meta));
1179	cmp_p = LOG_COMPARE(&LSN(meta), &argp->meta_lsn);
1180	CHECK_LSN(env, op, cmp_p, &LSN(meta), &argp->meta_lsn);
1181	if (cmp_p == 0 && DB_REDO(op)) {
1182		/* Need to redo update described. */
1183		REC_DIRTY(mpf, ip, file_dbp->priority, &meta);
1184		LSN(meta) = *lsnp;
1185		meta->free = argp->next;
1186		if (argp->pgno > meta->last_pgno)
1187			meta->last_pgno = argp->pgno;
1188	} else if (cmp_n == 0 && DB_UNDO(op)) {
1189		goto no_rollback;
1190	}
1191
1192	/*
1193	 * Fix up the allocated page. If the page does not exist
1194	 * and we can truncate it then don't create it.
1195	 * Otherwise if we're redoing the operation, we have
1196	 * to get the page (creating it if it doesn't exist), and update its
1197	 * LSN.  If we're undoing the operation, we have to reset the page's
1198	 * LSN and put it on the free list, or truncate it.
1199	 */
1200	if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) {
1201		/*
1202		 * We have to be able to identify if a page was newly
1203		 * created so we can recover it properly.  We cannot simply
1204		 * look for an empty header, because hash uses a pgin
1205		 * function that will set the header.  Instead, we explicitly
1206		 * try for the page without CREATE and if that fails, then
1207		 * create it.
1208		 */
1209		if ((ret = __memp_fget(mpf, &argp->pgno,
1210		    ip, NULL, DB_MPOOL_CREATE, &pagep)) != 0) {
1211			if (DB_UNDO(op) && ret == ENOSPC)
1212				goto do_truncate;
1213			ret = __db_pgerr(file_dbp, argp->pgno, ret);
1214			goto out;
1215		}
1216		created = 1;
1217	}
1218
1219	/* Fix up the allocated page. */
1220	cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
1221	cmp_p = LOG_COMPARE(&LSN(pagep), &argp->page_lsn);
1222
1223	/*
1224	 * If an initial allocation is aborted and then reallocated during
1225	 * an archival restore the log record will have an LSN for the page
1226	 * but the page will be empty.
1227	 */
1228	if (IS_ZERO_LSN(LSN(pagep)) ||
1229	    (IS_ZERO_LSN(argp->page_lsn) && IS_INIT_LSN(LSN(pagep))))
1230		cmp_p = 0;
1231
1232	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->page_lsn);
1233	/*
1234	 * Another special case we have to handle is if we ended up with a
1235	 * page of all 0's which can happen if we abort between allocating a
1236	 * page in mpool and initializing it.  In that case, even if we're
1237	 * undoing, we need to re-initialize the page.
1238	 */
1239	if (DB_REDO(op) && cmp_p == 0) {
1240		/* Need to redo update described. */
1241		switch (argp->ptype) {
1242		case P_LBTREE:
1243		case P_LRECNO:
1244		case P_LDUP:
1245			level = LEAFLEVEL;
1246			break;
1247		default:
1248			level = 0;
1249			break;
1250		}
1251		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
1252		P_INIT(pagep, file_dbp->pgsize,
1253		    argp->pgno, PGNO_INVALID, PGNO_INVALID, level, argp->ptype);
1254
1255		pagep->lsn = *lsnp;
1256	} else if (DB_UNDO(op) && (cmp_n == 0 || created)) {
1257		/*
1258		 * This is where we handle the case of a 0'd page (pagep->pgno
1259		 * is equal to PGNO_INVALID).
1260		 * Undo the allocation, reinitialize the page and
1261		 * link its next pointer to the free list.
1262		 */
1263		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
1264		P_INIT(pagep, file_dbp->pgsize,
1265		    argp->pgno, PGNO_INVALID, argp->next, 0, P_INVALID);
1266
1267		pagep->lsn = argp->page_lsn;
1268	}
1269
1270do_truncate:
1271	/*
1272	 * We cannot undo things from 4.2 land, because we nolonger
1273	 * have limbo processing.
1274	 */
1275	if ((pagep == NULL || IS_ZERO_LSN(LSN(pagep))) &&
1276	    IS_ZERO_LSN(argp->page_lsn) && DB_UNDO(op)) {
1277no_rollback:	__db_errx(env,
1278"Cannot replicate prepared transactions from master running release 4.2 ");
1279		ret = __env_panic(env, EINVAL);
1280	}
1281
1282	if (pagep != NULL &&
1283	    (ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
1284		goto out;
1285	pagep = NULL;
1286
1287	if ((ret = __memp_fput(mpf, ip, meta, file_dbp->priority)) != 0)
1288		goto out;
1289	meta = NULL;
1290
1291done:	*lsnp = argp->prev_lsn;
1292	ret = 0;
1293
1294out:	if (pagep != NULL)
1295		(void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
1296	if (meta != NULL)
1297		(void)__memp_fput(mpf, ip, meta, file_dbp->priority);
1298	REC_CLOSE;
1299}
1300
1301/*
1302 * __db_pg_free_recover_42_int --
1303 */
1304static int
1305__db_pg_free_recover_42_int(env, ip, argp, file_dbp, lsnp, mpf, op, data)
1306	ENV *env;
1307	DB_THREAD_INFO *ip;
1308	__db_pg_freedata_42_args *argp;
1309	DB *file_dbp;
1310	DB_LSN *lsnp;
1311	DB_MPOOLFILE *mpf;
1312	db_recops op;
1313	int data;
1314{
1315	DBMETA *meta;
1316	DB_LSN copy_lsn;
1317	PAGE *pagep, *prevp;
1318	int cmp_n, cmp_p, is_meta, ret;
1319
1320	meta = NULL;
1321	pagep = NULL;
1322	prevp = NULL;
1323
1324	/*
1325	 * Get the "metapage".  This will either be the metapage
1326	 * or the previous page in the free list if we are doing
1327	 * sorted allocations.  If its a previous page then
1328	 * we will not be truncating.
1329	 */
1330	is_meta = argp->meta_pgno == PGNO_BASE_MD;
1331
1332	REC_FGET(mpf, ip, argp->meta_pgno, &meta, check_meta);
1333
1334	if (argp->meta_pgno != PGNO_BASE_MD)
1335		prevp = (PAGE *)meta;
1336
1337	cmp_n = LOG_COMPARE(lsnp, &LSN(meta));
1338	cmp_p = LOG_COMPARE(&LSN(meta), &argp->meta_lsn);
1339	CHECK_LSN(env, op, cmp_p, &LSN(meta), &argp->meta_lsn);
1340
1341	/*
1342	 * Fix up the metadata page.  If we're redoing or undoing the operation
1343	 * we get the page and update its LSN, last and free pointer.
1344	 */
1345	if (cmp_p == 0 && DB_REDO(op)) {
1346		/* Need to redo the deallocation. */
1347		REC_DIRTY(mpf, ip, file_dbp->priority, &meta);
1348		if (prevp == NULL)
1349			meta->free = argp->pgno;
1350		else
1351			NEXT_PGNO(prevp) = argp->pgno;
1352		/*
1353		 * If this was a compensating transaction and
1354		 * we are a replica, then we never executed the
1355		 * original allocation which incremented meta->free.
1356		 */
1357		if (prevp == NULL && meta->last_pgno < meta->free)
1358			meta->last_pgno = meta->free;
1359		LSN(meta) = *lsnp;
1360	} else if (cmp_n == 0 && DB_UNDO(op)) {
1361		/* Need to undo the deallocation. */
1362		REC_DIRTY(mpf, ip, file_dbp->priority, &meta);
1363		if (prevp == NULL)
1364			meta->free = argp->next;
1365		else
1366			NEXT_PGNO(prevp) = argp->next;
1367		LSN(meta) = argp->meta_lsn;
1368		if (prevp == NULL && meta->last_pgno < argp->pgno)
1369			meta->last_pgno = argp->pgno;
1370	}
1371
1372check_meta:
1373	if (ret != 0 && is_meta) {
1374		/* The metadata page must always exist. */
1375		ret = __db_pgerr(file_dbp, argp->meta_pgno, ret);
1376		goto out;
1377	}
1378
1379	/*
1380	 * Get the freed page.  If we support truncate then don't
1381	 * create the page if we are going to free it.  If we're
1382	 * redoing the operation we get the page and explicitly discard
1383	 * its contents, then update its LSN.  If we're undoing the
1384	 * operation, we get the page and restore its header.
1385	 * If we don't support truncate, then we must create the page
1386	 * and roll it back.
1387	 */
1388	if ((ret = __memp_fget(mpf, &argp->pgno,
1389	    ip, NULL, DB_MPOOL_CREATE, &pagep)) != 0)
1390		goto out;
1391
1392	(void)__ua_memcpy(&copy_lsn, &LSN(argp->header.data), sizeof(DB_LSN));
1393	cmp_n = IS_ZERO_LSN(LSN(pagep)) ? 0 : LOG_COMPARE(lsnp, &LSN(pagep));
1394	cmp_p = LOG_COMPARE(&LSN(pagep), &copy_lsn);
1395
1396	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &copy_lsn);
1397	if (DB_REDO(op) &&
1398	    (cmp_p == 0 ||
1399	    (IS_ZERO_LSN(copy_lsn) &&
1400	    LOG_COMPARE(&LSN(pagep), &argp->meta_lsn) <= 0))) {
1401		/* Need to redo the deallocation. */
1402		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
1403		P_INIT(pagep, file_dbp->pgsize,
1404		    argp->pgno, PGNO_INVALID, argp->next, 0, P_INVALID);
1405		pagep->lsn = *lsnp;
1406	} else if (cmp_n == 0 && DB_UNDO(op)) {
1407		/* Need to reallocate the page. */
1408		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
1409		memcpy(pagep, argp->header.data, argp->header.size);
1410		if (data)
1411			memcpy((u_int8_t*)pagep + HOFFSET(pagep),
1412			     argp->data.data, argp->data.size);
1413	}
1414	if (pagep != NULL &&
1415	    (ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
1416		goto out;
1417
1418	pagep = NULL;
1419	if (meta != NULL &&
1420	    (ret = __memp_fput(mpf, ip, meta, file_dbp->priority)) != 0)
1421		goto out;
1422	meta = NULL;
1423
1424	ret = 0;
1425
1426out:	if (pagep != NULL)
1427		(void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
1428	if (meta != NULL)
1429		(void)__memp_fput(mpf, ip, meta, file_dbp->priority);
1430
1431	return (ret);
1432}
1433
1434/*
1435 * __db_pg_free_42_recover --
1436 *	Recovery function for pg_free.
1437 *
1438 * PUBLIC: int __db_pg_free_42_recover
1439 * PUBLIC:   __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
1440 */
1441int
1442__db_pg_free_42_recover(env, dbtp, lsnp, op, info)
1443	ENV *env;
1444	DBT *dbtp;
1445	DB_LSN *lsnp;
1446	db_recops op;
1447	void *info;
1448{
1449	__db_pg_free_42_args *argp;
1450	DB *file_dbp;
1451	DBC *dbc;
1452	DB_MPOOLFILE *mpf;
1453	DB_THREAD_INFO *ip;
1454	int ret;
1455
1456	ip = ((DB_TXNHEAD *)info)->thread_info;
1457	REC_PRINT(__db_pg_free_42_print);
1458	REC_INTRO(__db_pg_free_42_read, ip, 0);
1459
1460	ret = __db_pg_free_recover_42_int(env, ip,
1461	     (__db_pg_freedata_42_args *)argp, file_dbp, lsnp, mpf, op, 0);
1462
1463done:	*lsnp = argp->prev_lsn;
1464out:
1465	REC_CLOSE;
1466}
1467
1468/*
1469 * __db_pg_freedata_42_recover --
1470 *	Recovery function for pg_freedata.
1471 *
1472 * PUBLIC: int __db_pg_freedata_42_recover
1473 * PUBLIC:   __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
1474 */
1475int
1476__db_pg_freedata_42_recover(env, dbtp, lsnp, op, info)
1477	ENV *env;
1478	DBT *dbtp;
1479	DB_LSN *lsnp;
1480	db_recops op;
1481	void *info;
1482{
1483	__db_pg_freedata_42_args *argp;
1484	DB *file_dbp;
1485	DBC *dbc;
1486	DB_MPOOLFILE *mpf;
1487	DB_THREAD_INFO *ip;
1488	int ret;
1489
1490	ip = ((DB_TXNHEAD *)info)->thread_info;
1491	REC_PRINT(__db_pg_freedata_42_print);
1492	REC_INTRO(__db_pg_freedata_42_read, ip, 0);
1493
1494	ret = __db_pg_free_recover_42_int(
1495	    env, ip, argp, file_dbp, lsnp, mpf, op, 1);
1496
1497done:	*lsnp = argp->prev_lsn;
1498out:
1499	REC_CLOSE;
1500}
1501
1502/*
1503 * __db_relink_42_recover --
1504 *	Recovery function for relink.
1505 *
1506 * PUBLIC: int __db_relink_42_recover
1507 * PUBLIC:   __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
1508 */
1509int
1510__db_relink_42_recover(env, dbtp, lsnp, op, info)
1511	ENV *env;
1512	DBT *dbtp;
1513	DB_LSN *lsnp;
1514	db_recops op;
1515	void *info;
1516{
1517	__db_relink_42_args *argp;
1518	DB_THREAD_INFO *ip;
1519	DB *file_dbp;
1520	DBC *dbc;
1521	DB_MPOOLFILE *mpf;
1522	PAGE *pagep;
1523	int cmp_n, cmp_p, modified, ret;
1524
1525	ip = ((DB_TXNHEAD *)info)->thread_info;
1526	pagep = NULL;
1527	REC_PRINT(__db_relink_42_print);
1528	REC_INTRO(__db_relink_42_read, ip, 0);
1529
1530	/*
1531	 * There are up to three pages we need to check -- the page, and the
1532	 * previous and next pages, if they existed.  For a page add operation,
1533	 * the current page is the result of a split and is being recovered
1534	 * elsewhere, so all we need do is recover the next page.
1535	 */
1536	if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) {
1537		if (DB_REDO(op)) {
1538			ret = __db_pgerr(file_dbp, argp->pgno, ret);
1539			goto out;
1540		}
1541		goto next2;
1542	}
1543	if (argp->opcode == DB_ADD_PAGE_COMPAT)
1544		goto next1;
1545
1546	cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn);
1547	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn);
1548	if (cmp_p == 0 && DB_REDO(op)) {
1549		/* Redo the relink. */
1550		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
1551		pagep->lsn = *lsnp;
1552	} else if (LOG_COMPARE(lsnp, &LSN(pagep)) == 0 && DB_UNDO(op)) {
1553		/* Undo the relink. */
1554		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
1555		pagep->next_pgno = argp->next;
1556		pagep->prev_pgno = argp->prev;
1557		pagep->lsn = argp->lsn;
1558	}
1559next1:	if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
1560		goto out;
1561	pagep = NULL;
1562
1563next2:	if ((ret = __memp_fget(mpf, &argp->next, ip, NULL, 0, &pagep)) != 0) {
1564		if (DB_REDO(op)) {
1565			ret = __db_pgerr(file_dbp, argp->next, ret);
1566			goto out;
1567		}
1568		goto prev;
1569	}
1570	modified = 0;
1571	cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
1572	cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn_next);
1573	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn_next);
1574	if ((argp->opcode == DB_REM_PAGE_COMPAT && cmp_p == 0 && DB_REDO(op)) ||
1575	    (argp->opcode == DB_ADD_PAGE_COMPAT && cmp_n == 0 && DB_UNDO(op))) {
1576		/* Redo the remove or undo the add. */
1577		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
1578		pagep->prev_pgno = argp->prev;
1579		modified = 1;
1580	} else if ((argp->opcode == DB_REM_PAGE_COMPAT &&
1581	    cmp_n == 0 && DB_UNDO(op)) ||
1582	    (argp->opcode == DB_ADD_PAGE_COMPAT && cmp_p == 0 && DB_REDO(op))) {
1583		/* Undo the remove or redo the add. */
1584		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
1585		pagep->prev_pgno = argp->pgno;
1586		modified = 1;
1587	}
1588	if (modified) {
1589		if (DB_UNDO(op))
1590			pagep->lsn = argp->lsn_next;
1591		else
1592			pagep->lsn = *lsnp;
1593	}
1594	if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
1595		goto out;
1596	pagep = NULL;
1597	if (argp->opcode == DB_ADD_PAGE_COMPAT)
1598		goto done;
1599
1600prev:	if ((ret = __memp_fget(mpf, &argp->prev, ip, NULL, 0, &pagep)) != 0) {
1601		if (DB_REDO(op)) {
1602			ret = __db_pgerr(file_dbp, argp->prev, ret);
1603			goto out;
1604		}
1605		goto done;
1606	}
1607	modified = 0;
1608	cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn_prev);
1609	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn_prev);
1610	if (cmp_p == 0 && DB_REDO(op)) {
1611		/* Redo the relink. */
1612		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
1613		pagep->next_pgno = argp->next;
1614		modified = 1;
1615	} else if (LOG_COMPARE(lsnp, &LSN(pagep)) == 0 && DB_UNDO(op)) {
1616		/* Undo the relink. */
1617		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
1618		pagep->next_pgno = argp->pgno;
1619		modified = 1;
1620	}
1621	if (modified) {
1622		if (DB_UNDO(op))
1623			pagep->lsn = argp->lsn_prev;
1624		else
1625			pagep->lsn = *lsnp;
1626	}
1627	if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
1628		goto out;
1629	pagep = NULL;
1630
1631done:	*lsnp = argp->prev_lsn;
1632	ret = 0;
1633
1634out:	if (pagep != NULL)
1635		(void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
1636	REC_CLOSE;
1637}
1638