1/*-
2 * See the file LICENSE for redistribution information.
3 *
4 * Copyright (c) 1996,2008 Oracle.  All rights reserved.
5 *
6 * $Id: bt_rec.c,v 12.39 2008/02/18 06:14:08 mjc Exp $
7 */
8
9#include "db_config.h"
10
11#include "db_int.h"
12#include "dbinc/db_page.h"
13#include "dbinc/btree.h"
14#include "dbinc/lock.h"
15#include "dbinc/log.h"
16#include "dbinc/mp.h"
17
/*
 * IS_BTREE_PAGE --
 *	Non-zero when the page's TYPE is P_IBTREE, P_LBTREE or P_LDUP.  The
 *	recovery routines in this file use it to choose between the btree
 *	and recno variants of an operation.  The argument is expanded up to
 *	three times, so it must not have side effects.
 */
#define	IS_BTREE_PAGE(pagep)						\
	(TYPE(pagep) == P_IBTREE ||					\
	 TYPE(pagep) == P_LBTREE || TYPE(pagep) == P_LDUP)
21
22/*
23 * __bam_split_recover --
24 *	Recovery function for split.
25 *
26 * PUBLIC: int __bam_split_recover
27 * PUBLIC:   __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
28 */
29int
30__bam_split_recover(env, dbtp, lsnp, op, info)
31	ENV *env;
32	DBT *dbtp;
33	DB_LSN *lsnp;
34	db_recops op;
35	void *info;
36{
37	__bam_split_args *argp;
38	DB_THREAD_INFO *ip;
39	DB *file_dbp;
40	DBC *dbc;
41	DB_MPOOLFILE *mpf;
42	PAGE *_lp, *lp, *np, *pp, *_rp, *rp, *sp;
43	db_pgno_t pgno, root_pgno;
44	u_int32_t ptype;
45	int cmp, l_update, p_update, r_update, rc, ret, rootsplit, t_ret;
46
47	ip = ((DB_TXNHEAD *)info)->thread_info;
48	REC_PRINT(__bam_split_print);
49
50	_lp = lp = np = pp = _rp = rp = NULL;
51	sp = NULL;
52
53	REC_INTRO(__bam_split_read, ip, 0);
54
55	/*
56	 * There are two kinds of splits that we have to recover from.  The
57	 * first is a root-page split, where the root page is split from a
58	 * leaf page into an internal page and two new leaf pages are created.
59	 * The second is where a page is split into two pages, and a new key
60	 * is inserted into the parent page.
61	 *
62	 * DBTs are not aligned in log records, so we need to copy the page
63	 * so that we can access fields within it throughout this routine.
64	 * Although we could hardcode the unaligned copies in this routine,
65	 * we will be calling into regular btree functions with this page,
66	 * so it's got to be aligned.  Copying it into allocated memory is
67	 * the only way to guarantee this.
68	 */
69	if ((ret = __os_malloc(env, argp->pg.size, &sp)) != 0)
70		goto out;
71	memcpy(sp, argp->pg.data, argp->pg.size);
72
73	pgno = PGNO(sp);
74	root_pgno = argp->root_pgno;
75	rootsplit = root_pgno != PGNO_INVALID;
76	REC_FGET(mpf, ip, argp->left, &lp, right);
77right:	REC_FGET(mpf, ip, argp->right, &rp, redo);
78
79redo:	if (DB_REDO(op)) {
80		l_update = r_update = p_update = 0;
81		/*
82		 * Decide if we need to resplit the page.
83		 *
84		 * If this is a root split, then the root has to exist unless
85		 * we have truncated it due to a future deallocation.
86		 */
87		if (rootsplit) {
88			REC_FGET(mpf, ip, root_pgno, &pp, do_left);
89			cmp = LOG_COMPARE(&LSN(pp), &LSN(argp->pg.data));
90			CHECK_LSN(env, op,
91			    cmp, &LSN(pp), &LSN(argp->pg.data));
92			p_update = cmp  == 0;
93		}
94
95do_left:	if (lp != NULL) {
96			cmp = LOG_COMPARE(&LSN(lp), &argp->llsn);
97			CHECK_LSN(env, op, cmp, &LSN(lp), &argp->llsn);
98			if (cmp == 0)
99				l_update = 1;
100		}
101
102		if (rp != NULL) {
103			cmp = LOG_COMPARE(&LSN(rp), &argp->rlsn);
104			CHECK_LSN(env, op, cmp, &LSN(rp), &argp->rlsn);
105			if (cmp == 0)
106				r_update = 1;
107		}
108
109		if (!p_update && !l_update && !r_update)
110			goto check_next;
111
112		/* Allocate and initialize new left/right child pages. */
113		if ((ret = __os_malloc(env, file_dbp->pgsize, &_lp)) != 0 ||
114		    (ret = __os_malloc(env, file_dbp->pgsize, &_rp)) != 0)
115			goto out;
116		if (rootsplit) {
117			P_INIT(_lp, file_dbp->pgsize, argp->left,
118			    PGNO_INVALID,
119			    ISINTERNAL(sp) ? PGNO_INVALID : argp->right,
120			    LEVEL(sp), TYPE(sp));
121			P_INIT(_rp, file_dbp->pgsize, argp->right,
122			    ISINTERNAL(sp) ?  PGNO_INVALID : argp->left,
123			    PGNO_INVALID, LEVEL(sp), TYPE(sp));
124		} else {
125			P_INIT(_lp, file_dbp->pgsize, PGNO(sp),
126			    ISINTERNAL(sp) ? PGNO_INVALID : PREV_PGNO(sp),
127			    ISINTERNAL(sp) ? PGNO_INVALID : argp->right,
128			    LEVEL(sp), TYPE(sp));
129			P_INIT(_rp, file_dbp->pgsize, argp->right,
130			    ISINTERNAL(sp) ? PGNO_INVALID : sp->pgno,
131			    ISINTERNAL(sp) ? PGNO_INVALID : NEXT_PGNO(sp),
132			    LEVEL(sp), TYPE(sp));
133		}
134
135		/* Split the page. */
136		if ((ret = __bam_copy(file_dbp, sp, _lp, 0, argp->indx)) != 0 ||
137		    (ret = __bam_copy(file_dbp, sp, _rp, argp->indx,
138		    NUM_ENT(sp))) != 0)
139			goto out;
140
141		if (l_update) {
142			REC_DIRTY(mpf, ip, file_dbp->priority, &lp);
143			memcpy(lp, _lp, file_dbp->pgsize);
144			lp->lsn = *lsnp;
145			if ((ret = __memp_fput(mpf,
146			     ip, lp, file_dbp->priority)) != 0)
147				goto out;
148			lp = NULL;
149		}
150
151		if (r_update) {
152			REC_DIRTY(mpf, ip, file_dbp->priority, &rp);
153			memcpy(rp, _rp, file_dbp->pgsize);
154			rp->lsn = *lsnp;
155			if ((ret = __memp_fput(mpf,
156			    ip, rp, file_dbp->priority)) != 0)
157				goto out;
158			rp = NULL;
159		}
160
161		/*
162		 * If the parent page is wrong, update it.  This is of interest
163		 * only if it was a root split, since root splits create parent
164		 * pages.  All other splits modify a parent page, but those are
165		 * separately logged and recovered.
166		 */
167		if (rootsplit && p_update) {
168			if (IS_BTREE_PAGE(sp)) {
169				ptype = P_IBTREE;
170				rc = argp->opflags & SPL_NRECS ? 1 : 0;
171			} else {
172				ptype = P_IRECNO;
173				rc = 1;
174			}
175
176			REC_DIRTY(mpf, ip, file_dbp->priority, &pp);
177			P_INIT(pp, file_dbp->pgsize, root_pgno,
178			    PGNO_INVALID, PGNO_INVALID, _lp->level + 1, ptype);
179			RE_NREC_SET(pp, rc ? __bam_total(file_dbp, _lp) +
180			    __bam_total(file_dbp, _rp) : 0);
181
182			pp->lsn = *lsnp;
183			if ((ret = __memp_fput(mpf,
184			     ip, pp, file_dbp->priority)) != 0)
185				goto out;
186			pp = NULL;
187		}
188
189check_next:	/*
190		 * Finally, redo the next-page link if necessary.  This is of
191		 * interest only if it wasn't a root split -- inserting a new
192		 * page in the tree requires that any following page have its
193		 * previous-page pointer updated to our new page.  The next
194		 * page must exist because we're redoing the operation.
195		 */
196		if (!rootsplit && argp->npgno != PGNO_INVALID) {
197			if ((ret = __memp_fget(mpf, &argp->npgno,
198			    ip, NULL, 0, &np)) != 0) {
199				if (ret != DB_PAGE_NOTFOUND) {
200					ret = __db_pgerr(
201					    file_dbp, argp->npgno, ret);
202					goto out;
203				} else
204					goto done;
205			}
206			cmp = LOG_COMPARE(&LSN(np), &argp->nlsn);
207			CHECK_LSN(env, op, cmp, &LSN(np), &argp->nlsn);
208			if (cmp == 0) {
209				REC_DIRTY(mpf, ip, file_dbp->priority, &np);
210				PREV_PGNO(np) = argp->right;
211				np->lsn = *lsnp;
212				if ((ret = __memp_fput(mpf, ip,
213				    np, file_dbp->priority)) != 0)
214					goto out;
215				np = NULL;
216			}
217		}
218	} else {
219		/*
220		 * If the split page is wrong, replace its contents with the
221		 * logged page contents.  If the page doesn't exist, it means
222		 * that the create of the page never happened, nor did any of
223		 * the adds onto the page that caused the split, and there's
224		 * really no undo-ing to be done.
225		 */
226		if ((ret = __memp_fget(mpf, &pgno, ip, NULL,
227		    DB_MPOOL_EDIT, &pp)) != 0) {
228			pp = NULL;
229			goto lrundo;
230		}
231		if (LOG_COMPARE(lsnp, &LSN(pp)) == 0) {
232			REC_DIRTY(mpf, ip, file_dbp->priority, &pp);
233			memcpy(pp, argp->pg.data, argp->pg.size);
234			if ((ret = __memp_fput(mpf,
235			     ip, pp, file_dbp->priority)) != 0)
236				goto out;
237			pp = NULL;
238		}
239
240		/*
241		 * If it's a root split and the left child ever existed, update
242		 * its LSN.  (If it's not a root split, we've updated the left
243		 * page already -- it's the same as the split page.) If the
244		 * right child ever existed, root split or not, update its LSN.
245		 * The undo of the page allocation(s) will restore them to the
246		 * free list.
247		 */
248lrundo:		if ((rootsplit && lp != NULL) || rp != NULL) {
249			if (rootsplit && lp != NULL &&
250			    LOG_COMPARE(lsnp, &LSN(lp)) == 0) {
251				REC_DIRTY(mpf, ip, file_dbp->priority, &lp);
252				lp->lsn = argp->llsn;
253				if ((ret = __memp_fput(mpf, ip,
254				    lp, file_dbp->priority)) != 0)
255					goto out;
256				lp = NULL;
257			}
258			if (rp != NULL &&
259			    LOG_COMPARE(lsnp, &LSN(rp)) == 0) {
260				REC_DIRTY(mpf, ip, file_dbp->priority, &rp);
261				rp->lsn = argp->rlsn;
262				if ((ret = __memp_fput(mpf, ip,
263				     rp, file_dbp->priority)) != 0)
264					goto out;
265				rp = NULL;
266			}
267		}
268
269		/*
270		 * Finally, undo the next-page link if necessary.  This is of
271		 * interest only if it wasn't a root split -- inserting a new
272		 * page in the tree requires that any following page have its
273		 * previous-page pointer updated to our new page.  Since it's
274		 * possible that the next-page never existed, we ignore it as
275		 * if there's nothing to undo.
276		 */
277		if (!rootsplit && argp->npgno != PGNO_INVALID) {
278			if ((ret = __memp_fget(mpf, &argp->npgno,
279			    ip, NULL, DB_MPOOL_EDIT, &np)) != 0) {
280				np = NULL;
281				goto done;
282			}
283			if (LOG_COMPARE(lsnp, &LSN(np)) == 0) {
284				REC_DIRTY(mpf, ip, file_dbp->priority, &np);
285				PREV_PGNO(np) = argp->left;
286				np->lsn = argp->nlsn;
287				if (__memp_fput(mpf,
288				     ip, np, file_dbp->priority))
289					goto out;
290				np = NULL;
291			}
292		}
293	}
294
295done:	*lsnp = argp->prev_lsn;
296	ret = 0;
297
298out:	/* Free any pages that weren't dirtied. */
299	if (pp != NULL && (t_ret = __memp_fput(mpf,
300	    ip, pp, file_dbp->priority)) != 0 && ret == 0)
301		ret = t_ret;
302	if (lp != NULL && (t_ret = __memp_fput(mpf,
303	    ip, lp, file_dbp->priority)) != 0 && ret == 0)
304		ret = t_ret;
305	if (np != NULL && (t_ret = __memp_fput(mpf,
306	    ip, np, file_dbp->priority)) != 0 && ret == 0)
307		ret = t_ret;
308	if (rp != NULL && (t_ret = __memp_fput(mpf,
309	     ip, rp, file_dbp->priority)) != 0 && ret == 0)
310		ret = t_ret;
311
312	/* Free any allocated space. */
313	if (_lp != NULL)
314		__os_free(env, _lp);
315	if (_rp != NULL)
316		__os_free(env, _rp);
317	if (sp != NULL)
318		__os_free(env, sp);
319
320	REC_CLOSE;
321}
322
323/*
324 * __bam_rsplit_recover --
325 *	Recovery function for a reverse split.
326 *
327 * PUBLIC: int __bam_rsplit_recover
328 * PUBLIC:   __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
329 */
330int
331__bam_rsplit_recover(env, dbtp, lsnp, op, info)
332	ENV *env;
333	DBT *dbtp;
334	DB_LSN *lsnp;
335	db_recops op;
336	void *info;
337{
338	__bam_rsplit_args *argp;
339	DB_THREAD_INFO *ip;
340	DB *file_dbp;
341	DBC *dbc;
342	DB_LSN copy_lsn;
343	DB_MPOOLFILE *mpf;
344	PAGE *pagep;
345	db_pgno_t pgno, root_pgno;
346	db_recno_t rcnt;
347	int cmp_n, cmp_p, ret;
348
349	ip = ((DB_TXNHEAD *)info)->thread_info;
350	pagep = NULL;
351	REC_PRINT(__bam_rsplit_print);
352	REC_INTRO(__bam_rsplit_read, ip, 1);
353
354	/* Fix the root page. */
355	pgno = root_pgno = argp->root_pgno;
356	if ((ret = __memp_fget(mpf, &pgno, ip, NULL, 0, &pagep)) != 0) {
357		if (ret != DB_PAGE_NOTFOUND) {
358			ret = __db_pgerr(file_dbp, pgno, ret);
359			goto out;
360		} else
361			goto do_page;
362	}
363
364	cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
365	cmp_p = LOG_COMPARE(&LSN(pagep), &argp->rootlsn);
366	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->rootlsn);
367	if (cmp_p == 0 && DB_REDO(op)) {
368		/*
369		 * Copy the new data to the root page.  If it is not now a
370		 * leaf page we need to restore the record number.  We could
371		 * try to determine if C_RECNUM was set in the btree, but
372		 * that's not really necessary since the field is not used
373		 * otherwise.
374		 */
375		REC_DIRTY(mpf, ip, dbc->priority, &pagep);
376		rcnt = RE_NREC(pagep);
377		memcpy(pagep, argp->pgdbt.data, argp->pgdbt.size);
378		if (LEVEL(pagep) > LEAFLEVEL)
379			RE_NREC_SET(pagep, rcnt);
380		pagep->pgno = root_pgno;
381		pagep->lsn = *lsnp;
382	} else if (cmp_n == 0 && DB_UNDO(op)) {
383		/* Need to undo update described. */
384		REC_DIRTY(mpf, ip, dbc->priority, &pagep);
385		P_INIT(pagep, file_dbp->pgsize, root_pgno,
386		    argp->nrec, PGNO_INVALID, pagep->level + 1,
387		    IS_BTREE_PAGE(pagep) ? P_IBTREE : P_IRECNO);
388		if ((ret = __db_pitem(dbc, pagep, 0,
389		    argp->rootent.size, &argp->rootent, NULL)) != 0)
390			goto out;
391		pagep->lsn = argp->rootlsn;
392	}
393	if ((ret = __memp_fput(mpf, ip, pagep, dbc->priority)) != 0)
394		goto out;
395
396do_page:
397	/*
398	 * Fix the page copied over the root page.  It's possible that the
399	 * page never made it to disk, or was truncated so if the page
400	 * doesn't exist, it's okay and there's nothing further to do.
401	 */
402	if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) {
403		if (ret != DB_PAGE_NOTFOUND) {
404			ret = __db_pgerr(file_dbp, argp->pgno, ret);
405			goto out;
406		} else
407			goto done;
408	}
409	(void)__ua_memcpy(&copy_lsn, &LSN(argp->pgdbt.data), sizeof(DB_LSN));
410	cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
411	cmp_p = LOG_COMPARE(&LSN(pagep), &copy_lsn);
412	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &copy_lsn);
413	if (cmp_p == 0 && DB_REDO(op)) {
414		/* Need to redo update described. */
415		REC_DIRTY(mpf, ip, dbc->priority, &pagep);
416		pagep->lsn = *lsnp;
417	} else if (cmp_n == 0 && DB_UNDO(op)) {
418		/* Need to undo update described. */
419		REC_DIRTY(mpf, ip, dbc->priority, &pagep);
420		memcpy(pagep, argp->pgdbt.data, argp->pgdbt.size);
421	}
422	if ((ret = __memp_fput(mpf, ip, pagep, dbc->priority)) != 0)
423		goto out;
424	pagep = NULL;
425
426done:	*lsnp = argp->prev_lsn;
427	ret = 0;
428
429out:	if (pagep != NULL)
430		(void)__memp_fput(mpf, ip, pagep, dbc->priority);
431	REC_CLOSE;
432}
433
434/*
435 * __bam_adj_recover --
436 *	Recovery function for adj.
437 *
438 * PUBLIC: int __bam_adj_recover
439 * PUBLIC:   __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
440 */
441int
442__bam_adj_recover(env, dbtp, lsnp, op, info)
443	ENV *env;
444	DBT *dbtp;
445	DB_LSN *lsnp;
446	db_recops op;
447	void *info;
448{
449	__bam_adj_args *argp;
450	DB_THREAD_INFO *ip;
451	DB *file_dbp;
452	DBC *dbc;
453	DB_MPOOLFILE *mpf;
454	PAGE *pagep;
455	int cmp_n, cmp_p, ret;
456
457	ip = ((DB_TXNHEAD *)info)->thread_info;
458	pagep = NULL;
459	REC_PRINT(__bam_adj_print);
460	REC_INTRO(__bam_adj_read, ip, 1);
461
462	/* Get the page; if it never existed and we're undoing, we're done. */
463	if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) {
464		if (ret != DB_PAGE_NOTFOUND) {
465			ret = __db_pgerr(file_dbp, argp->pgno, ret);
466			goto out;
467		} else
468			goto done;
469	}
470
471	cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
472	cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn);
473	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn);
474	if (cmp_p == 0 && DB_REDO(op)) {
475		/* Need to redo update described. */
476		REC_DIRTY(mpf, ip, dbc->priority, &pagep);
477		if ((ret = __bam_adjindx(dbc,
478		    pagep, argp->indx, argp->indx_copy, argp->is_insert)) != 0)
479			goto out;
480
481		LSN(pagep) = *lsnp;
482	} else if (cmp_n == 0 && DB_UNDO(op)) {
483		/* Need to undo update described. */
484		REC_DIRTY(mpf, ip, dbc->priority, &pagep);
485		if ((ret = __bam_adjindx(dbc,
486		    pagep, argp->indx, argp->indx_copy, !argp->is_insert)) != 0)
487			goto out;
488
489		LSN(pagep) = argp->lsn;
490	}
491	if ((ret = __memp_fput(mpf, ip, pagep, dbc->priority)) != 0)
492		goto out;
493	pagep = NULL;
494
495done:	*lsnp = argp->prev_lsn;
496	ret = 0;
497
498out:	if (pagep != NULL)
499		(void)__memp_fput(mpf, ip, pagep, dbc->priority);
500	REC_CLOSE;
501}
502
503/*
504 * __bam_cadjust_recover --
505 *	Recovery function for the adjust of a count change in an internal
506 *	page.
507 *
508 * PUBLIC: int __bam_cadjust_recover
509 * PUBLIC:   __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
510 */
511int
512__bam_cadjust_recover(env, dbtp, lsnp, op, info)
513	ENV *env;
514	DBT *dbtp;
515	DB_LSN *lsnp;
516	db_recops op;
517	void *info;
518{
519	__bam_cadjust_args *argp;
520	DB_THREAD_INFO *ip;
521	DB *file_dbp;
522	DBC *dbc;
523	DB_MPOOLFILE *mpf;
524	PAGE *pagep;
525	int cmp_n, cmp_p, ret;
526
527	ip = ((DB_TXNHEAD *)info)->thread_info;
528	pagep = NULL;
529	REC_PRINT(__bam_cadjust_print);
530	REC_INTRO(__bam_cadjust_read, ip, 0);
531
532	/* Get the page; if it never existed and we're undoing, we're done. */
533	if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) {
534		if (ret != DB_PAGE_NOTFOUND) {
535			ret = __db_pgerr(file_dbp, argp->pgno, ret);
536			goto out;
537		} else
538			goto done;
539	}
540
541	cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
542	cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn);
543	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn);
544	if (cmp_p == 0 && DB_REDO(op)) {
545		/* Need to redo update described. */
546		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
547		if (IS_BTREE_PAGE(pagep)) {
548			GET_BINTERNAL(file_dbp, pagep, argp->indx)->nrecs +=
549			    argp->adjust;
550			if (argp->opflags & CAD_UPDATEROOT)
551				RE_NREC_ADJ(pagep, argp->adjust);
552		} else {
553			GET_RINTERNAL(file_dbp, pagep, argp->indx)->nrecs +=
554			    argp->adjust;
555			if (argp->opflags & CAD_UPDATEROOT)
556				RE_NREC_ADJ(pagep, argp->adjust);
557		}
558
559		LSN(pagep) = *lsnp;
560	} else if (cmp_n == 0 && DB_UNDO(op)) {
561		/* Need to undo update described. */
562		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
563		if (IS_BTREE_PAGE(pagep)) {
564			GET_BINTERNAL(file_dbp, pagep, argp->indx)->nrecs -=
565			    argp->adjust;
566			if (argp->opflags & CAD_UPDATEROOT)
567				RE_NREC_ADJ(pagep, -(argp->adjust));
568		} else {
569			GET_RINTERNAL(file_dbp, pagep, argp->indx)->nrecs -=
570			    argp->adjust;
571			if (argp->opflags & CAD_UPDATEROOT)
572				RE_NREC_ADJ(pagep, -(argp->adjust));
573		}
574		LSN(pagep) = argp->lsn;
575	}
576	if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
577		goto out;
578	pagep = NULL;
579
580done:	*lsnp = argp->prev_lsn;
581	ret = 0;
582
583out:	if (pagep != NULL)
584		(void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
585	REC_CLOSE;
586}
587
588/*
589 * __bam_cdel_recover --
590 *	Recovery function for the intent-to-delete of a cursor record.
591 *
592 * PUBLIC: int __bam_cdel_recover
593 * PUBLIC:   __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
594 */
595int
596__bam_cdel_recover(env, dbtp, lsnp, op, info)
597	ENV *env;
598	DBT *dbtp;
599	DB_LSN *lsnp;
600	db_recops op;
601	void *info;
602{
603	__bam_cdel_args *argp;
604	DB_THREAD_INFO *ip;
605	DB *file_dbp;
606	DBC *dbc;
607	DB_MPOOLFILE *mpf;
608	PAGE *pagep;
609	u_int32_t indx;
610	int cmp_n, cmp_p, ret;
611
612	ip = ((DB_TXNHEAD *)info)->thread_info;
613	pagep = NULL;
614	REC_PRINT(__bam_cdel_print);
615	REC_INTRO(__bam_cdel_read, ip, 0);
616
617	/* Get the page; if it never existed and we're undoing, we're done. */
618	if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) {
619		if (ret != DB_PAGE_NOTFOUND) {
620			ret = __db_pgerr(file_dbp, argp->pgno, ret);
621			goto out;
622		} else
623			goto done;
624	}
625
626	cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
627	cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn);
628	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn);
629	if (cmp_p == 0 && DB_REDO(op)) {
630		/* Need to redo update described. */
631		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
632		indx = argp->indx + (TYPE(pagep) == P_LBTREE ? O_INDX : 0);
633		B_DSET(GET_BKEYDATA(file_dbp, pagep, indx)->type);
634
635		LSN(pagep) = *lsnp;
636	} else if (cmp_n == 0 && DB_UNDO(op)) {
637		/* Need to undo update described. */
638		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
639		indx = argp->indx + (TYPE(pagep) == P_LBTREE ? O_INDX : 0);
640		B_DCLR(GET_BKEYDATA(file_dbp, pagep, indx)->type);
641
642		if ((ret = __bam_ca_delete(
643		    file_dbp, argp->pgno, argp->indx, 0, NULL)) != 0)
644			goto out;
645
646		LSN(pagep) = argp->lsn;
647	}
648	if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
649		goto out;
650	pagep = NULL;
651
652done:	*lsnp = argp->prev_lsn;
653	ret = 0;
654
655out:	if (pagep != NULL)
656		(void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
657	REC_CLOSE;
658}
659
660/*
661 * __bam_repl_recover --
662 *	Recovery function for page item replacement.
663 *
664 * PUBLIC: int __bam_repl_recover
665 * PUBLIC:   __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
666 */
667int
668__bam_repl_recover(env, dbtp, lsnp, op, info)
669	ENV *env;
670	DBT *dbtp;
671	DB_LSN *lsnp;
672	db_recops op;
673	void *info;
674{
675	__bam_repl_args *argp;
676	DB_THREAD_INFO *ip;
677	BKEYDATA *bk;
678	DB *file_dbp;
679	DBC *dbc;
680	DBT dbt;
681	DB_MPOOLFILE *mpf;
682	PAGE *pagep;
683	int cmp_n, cmp_p, ret;
684	u_int8_t *p;
685
686	ip = ((DB_TXNHEAD *)info)->thread_info;
687	pagep = NULL;
688	REC_PRINT(__bam_repl_print);
689	REC_INTRO(__bam_repl_read, ip, 1);
690
691	/* Get the page; if it never existed and we're undoing, we're done. */
692	if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) {
693		if (ret != DB_PAGE_NOTFOUND) {
694			ret = __db_pgerr(file_dbp, argp->pgno, ret);
695			goto out;
696		} else
697			goto done;
698	}
699	bk = GET_BKEYDATA(file_dbp, pagep, argp->indx);
700
701	cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
702	cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn);
703	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn);
704	if (cmp_p == 0 && DB_REDO(op)) {
705		/*
706		 * Need to redo update described.
707		 *
708		 * Re-build the replacement item.
709		 */
710		REC_DIRTY(mpf, ip, dbc->priority, &pagep);
711		memset(&dbt, 0, sizeof(dbt));
712		dbt.size = argp->prefix + argp->suffix + argp->repl.size;
713		if ((ret = __os_malloc(env, dbt.size, &dbt.data)) != 0)
714			goto out;
715		p = dbt.data;
716		memcpy(p, bk->data, argp->prefix);
717		p += argp->prefix;
718		memcpy(p, argp->repl.data, argp->repl.size);
719		p += argp->repl.size;
720		memcpy(p, bk->data + (bk->len - argp->suffix), argp->suffix);
721
722		ret = __bam_ritem(dbc, pagep, argp->indx, &dbt);
723		__os_free(env, dbt.data);
724		if (ret != 0)
725			goto out;
726
727		LSN(pagep) = *lsnp;
728	} else if (cmp_n == 0 && DB_UNDO(op)) {
729		/*
730		 * Need to undo update described.
731		 *
732		 * Re-build the original item.
733		 */
734		REC_DIRTY(mpf, ip, dbc->priority, &pagep);
735		memset(&dbt, 0, sizeof(dbt));
736		dbt.size = argp->prefix + argp->suffix + argp->orig.size;
737		if ((ret = __os_malloc(env, dbt.size, &dbt.data)) != 0)
738			goto out;
739		p = dbt.data;
740		memcpy(p, bk->data, argp->prefix);
741		p += argp->prefix;
742		memcpy(p, argp->orig.data, argp->orig.size);
743		p += argp->orig.size;
744		memcpy(p, bk->data + (bk->len - argp->suffix), argp->suffix);
745
746		ret = __bam_ritem(dbc, pagep, argp->indx, &dbt);
747		__os_free(env, dbt.data);
748		if (ret != 0)
749			goto out;
750
751		/* Reset the deleted flag, if necessary. */
752		if (argp->isdeleted)
753			B_DSET(GET_BKEYDATA(file_dbp, pagep, argp->indx)->type);
754
755		LSN(pagep) = argp->lsn;
756	}
757	if ((ret = __memp_fput(mpf, ip, pagep, dbc->priority)) != 0)
758		goto out;
759	pagep = NULL;
760
761done:	*lsnp = argp->prev_lsn;
762	ret = 0;
763
764out:	if (pagep != NULL)
765		(void)__memp_fput(mpf, ip, pagep, dbc->priority);
766	REC_CLOSE;
767}
768
769/*
770 * __bam_root_recover --
771 *	Recovery function for setting the root page on the meta-data page.
772 *
773 * PUBLIC: int __bam_root_recover
774 * PUBLIC:   __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
775 */
776int
777__bam_root_recover(env, dbtp, lsnp, op, info)
778	ENV *env;
779	DBT *dbtp;
780	DB_LSN *lsnp;
781	db_recops op;
782	void *info;
783{
784	__bam_root_args *argp;
785	DB_THREAD_INFO *ip;
786	BTMETA *meta;
787	DB *file_dbp;
788	DBC *dbc;
789	DB_MPOOLFILE *mpf;
790	int cmp_n, cmp_p, ret;
791
792	ip = ((DB_TXNHEAD *)info)->thread_info;
793	meta = NULL;
794	REC_PRINT(__bam_root_print);
795	REC_INTRO(__bam_root_read, ip, 0);
796
797	if ((ret = __memp_fget(mpf, &argp->meta_pgno, ip, NULL,
798	    0, &meta)) != 0) {
799		if (ret != DB_PAGE_NOTFOUND) {
800			ret = __db_pgerr(file_dbp, argp->meta_pgno, ret);
801			goto out;
802		} else
803			goto done;
804	}
805
806	cmp_n = LOG_COMPARE(lsnp, &LSN(meta));
807	cmp_p = LOG_COMPARE(&LSN(meta), &argp->meta_lsn);
808	CHECK_LSN(env, op, cmp_p, &LSN(meta), &argp->meta_lsn);
809	if (cmp_p == 0 && DB_REDO(op)) {
810		/* Need to redo update described. */
811		REC_DIRTY(mpf, ip, file_dbp->priority, &meta);
812		meta->root = argp->root_pgno;
813		meta->dbmeta.lsn = *lsnp;
814		((BTREE *)file_dbp->bt_internal)->bt_root = meta->root;
815	} else if (cmp_n == 0 && DB_UNDO(op)) {
816		/* Nothing to undo except lsn. */
817		REC_DIRTY(mpf, ip, file_dbp->priority, &meta);
818		meta->dbmeta.lsn = argp->meta_lsn;
819	}
820	if ((ret = __memp_fput(mpf, ip, meta, file_dbp->priority)) != 0)
821		goto out;
822	meta = NULL;
823
824done:	*lsnp = argp->prev_lsn;
825	ret = 0;
826
827out:	if (meta != NULL)
828		(void)__memp_fput(mpf, ip, meta, file_dbp->priority);
829	REC_CLOSE;
830}
831
832/*
833 * __bam_curadj_recover --
834 *	Transaction abort function to undo cursor adjustments.
835 *	This should only be triggered by subtransaction aborts.
836 *
837 * PUBLIC: int __bam_curadj_recover
838 * PUBLIC:   __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
839 */
840int
841__bam_curadj_recover(env, dbtp, lsnp, op, info)
842	ENV *env;
843	DBT *dbtp;
844	DB_LSN *lsnp;
845	db_recops op;
846	void *info;
847{
848	__bam_curadj_args *argp;
849	DB_THREAD_INFO *ip;
850	DB *file_dbp;
851	DBC *dbc;
852	DB_MPOOLFILE *mpf;
853	int ret;
854
855	COMPQUIET(mpf, NULL);
856
857	ip = ((DB_TXNHEAD *)info)->thread_info;
858	REC_PRINT(__bam_curadj_print);
859	REC_INTRO(__bam_curadj_read, ip, 1);
860
861	ret = 0;
862	if (op != DB_TXN_ABORT)
863		goto done;
864
865	switch (argp->mode) {
866	case DB_CA_DI:
867		if ((ret = __bam_ca_di(dbc, argp->from_pgno,
868		    argp->from_indx, -(int)argp->first_indx)) != 0)
869			goto out;
870		break;
871	case DB_CA_DUP:
872		if ((ret = __bam_ca_undodup(file_dbp, argp->first_indx,
873		    argp->from_pgno, argp->from_indx, argp->to_indx)) != 0)
874			goto out;
875		break;
876
877	case DB_CA_RSPLIT:
878		if ((ret =
879		    __bam_ca_rsplit(dbc, argp->to_pgno, argp->from_pgno)) != 0)
880			goto out;
881		break;
882
883	case DB_CA_SPLIT:
884		if ((ret = __bam_ca_undosplit(file_dbp, argp->from_pgno,
885		    argp->to_pgno, argp->left_pgno, argp->from_indx)) != 0)
886			goto out;
887		break;
888	}
889
890done:	*lsnp = argp->prev_lsn;
891out:	REC_CLOSE;
892}
893
894/*
895 * __bam_rcuradj_recover --
896 *	Transaction abort function to undo cursor adjustments in rrecno.
897 *	This should only be triggered by subtransaction aborts.
898 *
899 * PUBLIC: int __bam_rcuradj_recover
900 * PUBLIC:   __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
901 */
902int
903__bam_rcuradj_recover(env, dbtp, lsnp, op, info)
904	ENV *env;
905	DBT *dbtp;
906	DB_LSN *lsnp;
907	db_recops op;
908	void *info;
909{
910	__bam_rcuradj_args *argp;
911	DB_THREAD_INFO *ip;
912	BTREE_CURSOR *cp;
913	DB *file_dbp;
914	DBC *dbc, *rdbc;
915	DB_MPOOLFILE *mpf;
916	int ret, t_ret;
917
918	COMPQUIET(mpf, NULL);
919
920	ip = ((DB_TXNHEAD *)info)->thread_info;
921	rdbc = NULL;
922	REC_PRINT(__bam_rcuradj_print);
923	REC_INTRO(__bam_rcuradj_read, ip, 1);
924
925	ret = t_ret = 0;
926
927	if (op != DB_TXN_ABORT)
928		goto done;
929
930	/*
931	 * We don't know whether we're in an offpage dup set, and
932	 * thus don't know whether the dbc REC_INTRO has handed us is
933	 * of a reasonable type.  It's certainly unset, so if this is
934	 * an offpage dup set, we don't have an OPD cursor.  The
935	 * simplest solution is just to allocate a whole new cursor
936	 * for our use;  we're only really using it to hold pass some
937	 * state into __ram_ca, and this way we don't need to make
938	 * this function know anything about how offpage dups work.
939	 */
940	if ((ret = __db_cursor_int(file_dbp, NULL,
941		NULL, DB_RECNO, argp->root, 0, NULL, &rdbc)) != 0)
942		goto out;
943
944	cp = (BTREE_CURSOR *)rdbc->internal;
945	F_SET(cp, C_RENUMBER);
946	cp->recno = argp->recno;
947
948	switch (argp->mode) {
949	case CA_DELETE:
950		/*
951		 * The way to undo a delete is with an insert.  Since
952		 * we're undoing it, the delete flag must be set.
953		 */
954		F_SET(cp, C_DELETED);
955		F_SET(cp, C_RENUMBER);	/* Just in case. */
956		cp->order = argp->order;
957		if ((ret = __ram_ca(rdbc, CA_ICURRENT, NULL)) != 0)
958			goto out;
959		break;
960	case CA_IAFTER:
961	case CA_IBEFORE:
962	case CA_ICURRENT:
963		/*
964		 * The way to undo an insert is with a delete.  The delete
965		 * flag is unset to start with.
966		 */
967		F_CLR(cp, C_DELETED);
968		cp->order = INVALID_ORDER;
969		if ((ret = __ram_ca(rdbc, CA_DELETE, NULL)) != 0)
970			goto out;
971		break;
972	}
973
974done:	*lsnp = argp->prev_lsn;
975out:	if (rdbc != NULL && (t_ret = __dbc_close(rdbc)) != 0 && ret == 0)
976		ret = t_ret;
977	REC_CLOSE;
978}
979
980/*
981 * __bam_relink_recover --
982 *	Recovery function for relink.
983 *
984 * PUBLIC: int __bam_relink_recover
985 * PUBLIC:   __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
986 */
987int
988__bam_relink_recover(env, dbtp, lsnp, op, info)
989	ENV *env;
990	DBT *dbtp;
991	DB_LSN *lsnp;
992	db_recops op;
993	void *info;
994{
995	__bam_relink_args *argp;
996	DB_THREAD_INFO *ip;
997	DB *file_dbp;
998	DBC *dbc;
999	DB_MPOOLFILE *mpf;
1000	PAGE *pagep;
1001	int cmp_n, cmp_p, ret;
1002
1003	ip = ((DB_TXNHEAD *)info)->thread_info;
1004	pagep = NULL;
1005	REC_PRINT(__bam_relink_print);
1006	REC_INTRO(__bam_relink_read, ip, 0);
1007
1008	/*
1009	 * There are up to three pages we need to check -- the page, and the
1010	 * previous and next pages, if they existed.  For a page add operation,
1011	 * the current page is the result of a split and is being recovered
1012	 * elsewhere, so all we need do is recover the next page.
1013	 */
1014	if ((ret = __memp_fget(mpf, &argp->next, ip, NULL, 0, &pagep)) != 0) {
1015		if (ret != DB_PAGE_NOTFOUND) {
1016			ret = __db_pgerr(file_dbp, argp->next, ret);
1017			goto out;
1018		} else
1019			goto prev;
1020	}
1021
1022	cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
1023	cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn_next);
1024	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn_next);
1025	if (cmp_p == 0 && DB_REDO(op)) {
1026		/* Redo the remove or replace. */
1027		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
1028		if (argp->new_pgno == PGNO_INVALID)
1029			pagep->prev_pgno = argp->prev;
1030		else
1031			pagep->prev_pgno = argp->new_pgno;
1032
1033		pagep->lsn = *lsnp;
1034	} else if (cmp_n == 0 && DB_UNDO(op)) {
1035		/* Undo the remove or replace. */
1036		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
1037		pagep->prev_pgno = argp->pgno;
1038
1039		pagep->lsn = argp->lsn_next;
1040	}
1041
1042	if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
1043		goto out;
1044	pagep = NULL;
1045
1046prev:	if ((ret = __memp_fget(mpf, &argp->prev, ip, NULL, 0, &pagep)) != 0) {
1047		if (ret != DB_PAGE_NOTFOUND) {
1048			ret = __db_pgerr(file_dbp, argp->prev, ret);
1049			goto out;
1050		} else
1051			goto done;
1052	}
1053
1054	cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn_prev);
1055	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn_prev);
1056	if (cmp_p == 0 && DB_REDO(op)) {
1057		/* Redo the relink. */
1058		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
1059		if (argp->new_pgno == PGNO_INVALID)
1060			pagep->next_pgno = argp->next;
1061		else
1062			pagep->next_pgno = argp->new_pgno;
1063
1064		pagep->lsn = *lsnp;
1065	} else if (LOG_COMPARE(lsnp, &LSN(pagep)) == 0 && DB_UNDO(op)) {
1066		/* Undo the relink. */
1067		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
1068		pagep->next_pgno = argp->pgno;
1069		pagep->lsn = argp->lsn_prev;
1070	}
1071
1072	if ((ret = __memp_fput(mpf,
1073	     ip, pagep, file_dbp->priority)) != 0)
1074		goto out;
1075	pagep = NULL;
1076
1077done:	*lsnp = argp->prev_lsn;
1078	ret = 0;
1079
1080out:	if (pagep != NULL)
1081		(void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
1082	REC_CLOSE;
1083}
1084
1085/*
1086 * __bam_merge_44_recover --
1087 *	Recovery function for merge.
1088 *
1089 * PUBLIC: int __bam_merge_44_recover
1090 * PUBLIC:   __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
1091 */
1092int
1093__bam_merge_44_recover(env, dbtp, lsnp, op, info)
1094	ENV *env;
1095	DBT *dbtp;
1096	DB_LSN *lsnp;
1097	db_recops op;
1098	void *info;
1099{
1100	__bam_merge_44_args *argp;
1101	DB_THREAD_INFO *ip;
1102	BKEYDATA *bk;
1103	DB *file_dbp;
1104	DBC *dbc;
1105	DB_MPOOLFILE *mpf;
1106	PAGE *pagep;
1107	db_indx_t indx, *ninp, *pinp;
1108	u_int32_t size;
1109	u_int8_t *bp;
1110	int cmp_n, cmp_p, i, ret;
1111
1112	ip = ((DB_TXNHEAD *)info)->thread_info;
1113	REC_PRINT(__bam_merge_44_print);
1114	REC_INTRO(__bam_merge_44_read, ip, 1);
1115
1116	if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) {
1117		if (ret != DB_PAGE_NOTFOUND) {
1118			ret = __db_pgerr(file_dbp, argp->pgno, ret);
1119			goto out;
1120		} else
1121			goto next;
1122	}
1123
1124	cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
1125	cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn);
1126	CHECK_LSN(file_dbp->env, op, cmp_p, &LSN(pagep), &argp->lsn);
1127
1128	if (cmp_p == 0 && DB_REDO(op)) {
1129		/*
1130		 * If the header is provided the page is empty, copy the
1131		 * needed data.
1132		 */
1133		DB_ASSERT(env, argp->hdr.size == 0 || NUM_ENT(pagep) == 0);
1134		REC_DIRTY(mpf, ip, dbc->priority, &pagep);
1135		if (argp->hdr.size != 0) {
1136			P_INIT(pagep, file_dbp->pgsize, pagep->pgno,
1137			     PREV_PGNO(argp->hdr.data),
1138			     NEXT_PGNO(argp->hdr.data),
1139			     LEVEL(argp->hdr.data), TYPE(argp->hdr.data));
1140		}
1141		if (TYPE(pagep) == P_OVERFLOW) {
1142			OV_REF(pagep) = OV_REF(argp->hdr.data);
1143			OV_LEN(pagep) = OV_LEN(argp->hdr.data);
1144			bp = (u_int8_t *) pagep + P_OVERHEAD(file_dbp);
1145			memcpy(bp, argp->data.data, argp->data.size);
1146		} else {
1147			/* Copy the data segment. */
1148			bp = (u_int8_t *)pagep +
1149			     (db_indx_t)(HOFFSET(pagep) - argp->data.size);
1150			memcpy(bp, argp->data.data, argp->data.size);
1151
1152			/* Copy index table offset past the current entries. */
1153			pinp = P_INP(file_dbp, pagep) + NUM_ENT(pagep);
1154			ninp = argp->ind.data;
1155			for (i = 0;
1156			     i < (int)(argp->ind.size / sizeof(*ninp)); i++)
1157				*pinp++ = *ninp++
1158				      - (file_dbp->pgsize - HOFFSET(pagep));
1159			HOFFSET(pagep) -= argp->data.size;
1160			NUM_ENT(pagep) += i;
1161		}
1162		pagep->lsn = *lsnp;
1163	} else if (cmp_n == 0 && !DB_REDO(op)) {
1164		/*
1165		 * Since logging is logical at the page level
1166		 * we cannot just truncate the data space.  Delete
1167		 * the proper number of items from the logical end
1168		 * of the page.
1169		 */
1170		REC_DIRTY(mpf, ip, dbc->priority, &pagep);
1171		for (i = 0; i < (int)(argp->ind.size / sizeof(*ninp)); i++) {
1172			indx = NUM_ENT(pagep) - 1;
1173			if (P_INP(file_dbp, pagep)[indx] ==
1174			     P_INP(file_dbp, pagep)[indx - P_INDX]) {
1175				NUM_ENT(pagep)--;
1176				continue;
1177			}
1178			switch (TYPE(pagep)) {
1179			case P_LBTREE:
1180			case P_LRECNO:
1181			case P_LDUP:
1182				bk = GET_BKEYDATA(file_dbp, pagep, indx);
1183				size = BITEM_SIZE(bk);
1184				break;
1185
1186			case P_IBTREE:
1187				size = BINTERNAL_SIZE(
1188				     GET_BINTERNAL(file_dbp, pagep, indx)->len);
1189				break;
1190			case P_IRECNO:
1191				size = RINTERNAL_SIZE;
1192				break;
1193
1194			default:
1195				ret = __db_pgfmt(env, PGNO(pagep));
1196				goto out;
1197			}
1198			if ((ret =
1199			     __db_ditem(dbc, pagep, indx, size)) != 0)
1200				goto out;
1201		}
1202		if (argp->ind.size == 0)
1203			HOFFSET(pagep) = file_dbp->pgsize;
1204		pagep->lsn = argp->lsn;
1205	}
1206
1207	if ((ret = __memp_fput(mpf, ip, pagep, dbc->priority)) != 0)
1208		goto out;
1209
1210next:	if ((ret = __memp_fget(mpf, &argp->npgno, ip, NULL, 0, &pagep)) != 0) {
1211		if (ret != DB_PAGE_NOTFOUND) {
1212			ret = __db_pgerr(file_dbp, argp->pgno, ret);
1213			goto out;
1214		} else
1215			goto done;
1216	}
1217
1218	cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
1219	cmp_p = LOG_COMPARE(&LSN(pagep), &argp->nlsn);
1220	CHECK_LSN(file_dbp->env, op, cmp_p, &LSN(pagep), &argp->nlsn);
1221
1222	if (cmp_p == 0 && DB_REDO(op)) {
1223		/* Need to truncate the page. */
1224		REC_DIRTY(mpf, ip, dbc->priority, &pagep);
1225		HOFFSET(pagep) = file_dbp->pgsize;
1226		NUM_ENT(pagep) = 0;
1227		pagep->lsn = *lsnp;
1228	} else if (cmp_n == 0 && !DB_REDO(op)) {
1229		/* Need to put the data back on the page. */
1230		REC_DIRTY(mpf, ip, dbc->priority, &pagep);
1231		if (TYPE(pagep) == P_OVERFLOW) {
1232			OV_REF(pagep) = OV_REF(argp->hdr.data);
1233			OV_LEN(pagep) = OV_LEN(argp->hdr.data);
1234			bp = (u_int8_t *) pagep + P_OVERHEAD(file_dbp);
1235			memcpy(bp, argp->data.data, argp->data.size);
1236		} else {
1237			bp = (u_int8_t *)pagep +
1238			     (db_indx_t)(HOFFSET(pagep) - argp->data.size);
1239			memcpy(bp, argp->data.data, argp->data.size);
1240
1241			/* Copy index table. */
1242			pinp = P_INP(file_dbp, pagep) + NUM_ENT(pagep);
1243			ninp = argp->ind.data;
1244			for (i = 0;
1245			    i < (int)(argp->ind.size / sizeof(*ninp)); i++)
1246				*pinp++ = *ninp++;
1247			HOFFSET(pagep) -= argp->data.size;
1248			NUM_ENT(pagep) = i;
1249		}
1250		pagep->lsn = argp->nlsn;
1251	}
1252
1253	if ((ret = __memp_fput(mpf,
1254	     ip, pagep, dbc->priority)) != 0)
1255		goto out;
1256done:
1257	*lsnp = argp->prev_lsn;
1258	ret = 0;
1259
1260out:	REC_CLOSE;
1261}
1262
1263/*
1264 * __bam_merge_recover --
1265 *	Recovery function for merge.
1266 *
1267 * PUBLIC: int __bam_merge_recover
1268 * PUBLIC:   __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
1269 */
1270int
1271__bam_merge_recover(env, dbtp, lsnp, op, info)
1272	ENV *env;
1273	DBT *dbtp;
1274	DB_LSN *lsnp;
1275	db_recops op;
1276	void *info;
1277{
1278	__bam_merge_args *argp;
1279	DB_THREAD_INFO *ip;
1280	BKEYDATA *bk;
1281	DB *file_dbp;
1282	DBC *dbc;
1283	DB_MPOOLFILE *mpf;
1284	PAGE *pagep;
1285	db_indx_t indx, *ninp, *pinp;
1286	u_int32_t size;
1287	u_int8_t *bp;
1288	int cmp_n, cmp_p, i, ret;
1289
1290	ip = ((DB_TXNHEAD *)info)->thread_info;
1291	REC_PRINT(__bam_merge_print);
1292	REC_INTRO(__bam_merge_read, ip, 1);
1293
1294	if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) {
1295		if (ret != DB_PAGE_NOTFOUND) {
1296			ret = __db_pgerr(file_dbp, argp->pgno, ret);
1297			goto out;
1298		} else
1299			goto next;
1300	}
1301
1302	cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
1303	cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn);
1304	CHECK_LSN(file_dbp->env, op, cmp_p, &LSN(pagep), &argp->lsn);
1305
1306	if (cmp_p == 0 && DB_REDO(op)) {
1307		/*
1308		 * When pg_copy is set, we are copying onto a new page.
1309		 */
1310		DB_ASSERT(env, !argp->pg_copy || NUM_ENT(pagep) == 0);
1311		REC_DIRTY(mpf, ip, dbc->priority, &pagep);
1312		if (argp->pg_copy) {
1313			P_INIT(pagep, file_dbp->pgsize, pagep->pgno,
1314			     PREV_PGNO(argp->hdr.data),
1315			     NEXT_PGNO(argp->hdr.data),
1316			     LEVEL(argp->hdr.data), TYPE(argp->hdr.data));
1317		}
1318		if (TYPE(pagep) == P_OVERFLOW) {
1319			OV_REF(pagep) = OV_REF(argp->hdr.data);
1320			OV_LEN(pagep) = OV_LEN(argp->hdr.data);
1321			bp = (u_int8_t *)pagep + P_OVERHEAD(file_dbp);
1322			memcpy(bp, argp->data.data, argp->data.size);
1323		} else {
1324			/* Copy the data segment. */
1325			bp = (u_int8_t *)pagep +
1326			     (db_indx_t)(HOFFSET(pagep) - argp->data.size);
1327			memcpy(bp, argp->data.data, argp->data.size);
1328
1329			/* Copy index table offset past the current entries. */
1330			pinp = P_INP(file_dbp, pagep) + NUM_ENT(pagep);
1331			ninp = P_INP(file_dbp, argp->hdr.data);
1332			for (i = 0; i < NUM_ENT(argp->hdr.data); i++)
1333				*pinp++ = *ninp++
1334				      - (file_dbp->pgsize - HOFFSET(pagep));
1335			HOFFSET(pagep) -= argp->data.size;
1336			NUM_ENT(pagep) += i;
1337		}
1338		pagep->lsn = *lsnp;
1339	} else if (cmp_n == 0 && !DB_REDO(op)) {
1340		REC_DIRTY(mpf, ip, dbc->priority, &pagep);
1341		if (TYPE(pagep) == P_OVERFLOW) {
1342			HOFFSET(pagep) = file_dbp->pgsize;
1343			goto setlsn;
1344		}
1345
1346		/*
1347		 * Since logging is logical at the page level we cannot just
1348		 * truncate the data space.  Delete the proper number of items
1349		 * from the logical end of the page.
1350		 */
1351		for (i = 0; i < NUM_ENT(argp->hdr.data); i++) {
1352			indx = NUM_ENT(pagep) - 1;
1353			if (P_INP(file_dbp, pagep)[indx] ==
1354			     P_INP(file_dbp, pagep)[indx - P_INDX]) {
1355				NUM_ENT(pagep)--;
1356				continue;
1357			}
1358			switch (TYPE(pagep)) {
1359			case P_LBTREE:
1360			case P_LRECNO:
1361			case P_LDUP:
1362				bk = GET_BKEYDATA(file_dbp, pagep, indx);
1363				size = BITEM_SIZE(bk);
1364				break;
1365
1366			case P_IBTREE:
1367				size = BINTERNAL_SIZE(
1368				     GET_BINTERNAL(file_dbp, pagep, indx)->len);
1369				break;
1370			case P_IRECNO:
1371				size = RINTERNAL_SIZE;
1372				break;
1373
1374			default:
1375				ret = __db_pgfmt(env, PGNO(pagep));
1376				goto out;
1377			}
1378			if ((ret = __db_ditem(dbc, pagep, indx, size)) != 0)
1379				goto out;
1380		}
1381setlsn:		pagep->lsn = argp->lsn;
1382	}
1383
1384	if ((ret = __memp_fput(mpf, ip, pagep, dbc->priority)) != 0)
1385		goto out;
1386
1387next:	if ((ret = __memp_fget(mpf, &argp->npgno, ip, NULL, 0, &pagep)) != 0) {
1388		if (ret != DB_PAGE_NOTFOUND) {
1389			ret = __db_pgerr(file_dbp, argp->pgno, ret);
1390			goto out;
1391		} else
1392			goto done;
1393	}
1394
1395	cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
1396	cmp_p = LOG_COMPARE(&LSN(pagep), &argp->nlsn);
1397	CHECK_LSN(file_dbp->env, op, cmp_p, &LSN(pagep), &argp->nlsn);
1398
1399	if (cmp_p == 0 && DB_REDO(op)) {
1400		/* Need to truncate the page. */
1401		REC_DIRTY(mpf, ip, dbc->priority, &pagep);
1402		HOFFSET(pagep) = file_dbp->pgsize;
1403		NUM_ENT(pagep) = 0;
1404		pagep->lsn = *lsnp;
1405	} else if (cmp_n == 0 && !DB_REDO(op)) {
1406		/* Need to put the data back on the page. */
1407		REC_DIRTY(mpf, ip, dbc->priority, &pagep);
1408		if (TYPE(pagep) == P_OVERFLOW) {
1409			OV_REF(pagep) = OV_REF(argp->hdr.data);
1410			OV_LEN(pagep) = OV_LEN(argp->hdr.data);
1411			bp = (u_int8_t *)pagep + P_OVERHEAD(file_dbp);
1412			memcpy(bp, argp->data.data, argp->data.size);
1413		} else {
1414			bp = (u_int8_t *)pagep +
1415			     (db_indx_t)(HOFFSET(pagep) - argp->data.size);
1416			memcpy(bp, argp->data.data, argp->data.size);
1417
1418			/* Copy index table. */
1419			pinp = P_INP(file_dbp, pagep) + NUM_ENT(pagep);
1420			ninp = P_INP(file_dbp, argp->hdr.data);
1421			for (i = 0; i < NUM_ENT(argp->hdr.data); i++)
1422				*pinp++ = *ninp++;
1423			HOFFSET(pagep) -= argp->data.size;
1424			NUM_ENT(pagep) += i;
1425		}
1426		pagep->lsn = argp->nlsn;
1427	}
1428
1429	if ((ret = __memp_fput(mpf,
1430	     ip, pagep, dbc->priority)) != 0)
1431		goto out;
1432done:
1433	*lsnp = argp->prev_lsn;
1434	ret = 0;
1435
1436out:	REC_CLOSE;
1437}
1438
1439/*
1440 * __bam_pgno_recover --
1441 *	Recovery function for page number replacment.
1442 *
1443 * PUBLIC: int __bam_pgno_recover
1444 * PUBLIC:   __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
1445 */
1446int
1447__bam_pgno_recover(env, dbtp, lsnp, op, info)
1448	ENV *env;
1449	DBT *dbtp;
1450	DB_LSN *lsnp;
1451	db_recops op;
1452	void *info;
1453{
1454	BINTERNAL *bi;
1455	__bam_pgno_args *argp;
1456	DB_THREAD_INFO *ip;
1457	DB *file_dbp;
1458	DBC *dbc;
1459	DB_MPOOLFILE *mpf;
1460	PAGE *pagep, *npagep;
1461	db_pgno_t *pgnop;
1462	int cmp_n, cmp_p, ret;
1463
1464	ip = ((DB_TXNHEAD *)info)->thread_info;
1465	REC_PRINT(__bam_pgno_print);
1466	REC_INTRO(__bam_pgno_read, ip, 0);
1467
1468	REC_FGET(mpf, ip, argp->pgno, &pagep, done);
1469
1470	cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
1471	cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn);
1472	CHECK_LSN(file_dbp->env, op, cmp_p, &LSN(pagep), &argp->lsn);
1473
1474	if ((cmp_p == 0 && DB_REDO(op)) || (cmp_n == 0 && !DB_REDO(op))) {
1475		switch (TYPE(pagep)) {
1476		case P_IBTREE:
1477			/*
1478			 * An internal record can have both a overflow
1479			 * and child pointer.  Fetch the page to see
1480			 * which it is.
1481			 */
1482			bi = GET_BINTERNAL(file_dbp, pagep, argp->indx);
1483			if (B_TYPE(bi->type) == B_OVERFLOW) {
1484				REC_FGET(mpf, ip, argp->npgno, &npagep, out);
1485
1486				if (TYPE(npagep) == P_OVERFLOW)
1487					pgnop =
1488					     &((BOVERFLOW *)(bi->data))->pgno;
1489				else
1490					pgnop = &bi->pgno;
1491				if ((ret = __memp_fput(mpf, ip,
1492				    npagep, file_dbp->priority)) != 0)
1493					goto out;
1494				break;
1495			}
1496			pgnop = &bi->pgno;
1497			break;
1498		case P_IRECNO:
1499			pgnop =
1500			     &GET_RINTERNAL(file_dbp, pagep, argp->indx)->pgno;
1501			break;
1502		default:
1503			pgnop =
1504			     &GET_BOVERFLOW(file_dbp, pagep, argp->indx)->pgno;
1505			break;
1506		}
1507
1508		if (DB_REDO(op)) {
1509			/* Need to redo update described. */
1510			REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
1511			*pgnop = argp->npgno;
1512			pagep->lsn = *lsnp;
1513		} else {
1514			REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
1515			*pgnop = argp->opgno;
1516			pagep->lsn = argp->lsn;
1517		}
1518	}
1519
1520	if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
1521		goto out;
1522
1523done:
1524	*lsnp = argp->prev_lsn;
1525	ret = 0;
1526
1527out:	REC_CLOSE;
1528}
1529
1530/*
1531 * __bam_relink_43_recover --
1532 *	Recovery function for relink.
1533 *
1534 * PUBLIC: int __bam_relink_43_recover
1535 * PUBLIC:   __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
1536 */
1537int
1538__bam_relink_43_recover(env, dbtp, lsnp, op, info)
1539	ENV *env;
1540	DBT *dbtp;
1541	DB_LSN *lsnp;
1542	db_recops op;
1543	void *info;
1544{
1545	__bam_relink_43_args *argp;
1546	DB_THREAD_INFO *ip;
1547	DB *file_dbp;
1548	DBC *dbc;
1549	DB_MPOOLFILE *mpf;
1550	PAGE *pagep;
1551	int cmp_n, cmp_p, modified, ret;
1552
1553	ip = ((DB_TXNHEAD *)info)->thread_info;
1554	pagep = NULL;
1555	REC_PRINT(__bam_relink_43_print);
1556	REC_INTRO(__bam_relink_43_read, ip, 0);
1557
1558	/*
1559	 * There are up to three pages we need to check -- the page, and the
1560	 * previous and next pages, if they existed.  For a page add operation,
1561	 * the current page is the result of a split and is being recovered
1562	 * elsewhere, so all we need do is recover the next page.
1563	 */
1564	if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) {
1565		if (ret != DB_PAGE_NOTFOUND) {
1566			ret = __db_pgerr(file_dbp, argp->pgno, ret);
1567			goto out;
1568		} else
1569			goto next2;
1570	}
1571
1572	cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn);
1573	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn);
1574	if (cmp_p == 0 && DB_REDO(op)) {
1575		/* Redo the relink. */
1576		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
1577		pagep->lsn = *lsnp;
1578	} else if (LOG_COMPARE(lsnp, &LSN(pagep)) == 0 && DB_UNDO(op)) {
1579		/* Undo the relink. */
1580		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
1581		pagep->next_pgno = argp->next;
1582		pagep->prev_pgno = argp->prev;
1583		pagep->lsn = argp->lsn;
1584	}
1585	if ((ret = __memp_fput(mpf,
1586	     ip, pagep, file_dbp->priority)) != 0)
1587		goto out;
1588	pagep = NULL;
1589
1590next2: if ((ret = __memp_fget(mpf, &argp->next, ip, NULL, 0, &pagep)) != 0) {
1591		if (ret != DB_PAGE_NOTFOUND) {
1592			ret = __db_pgerr(file_dbp, argp->next, ret);
1593			goto out;
1594		} else
1595			goto prev;
1596	}
1597
1598	modified = 0;
1599	cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
1600	cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn_next);
1601	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn_next);
1602	if (cmp_p == 0 && DB_REDO(op)) {
1603		/* Redo the remove or undo the add. */
1604		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
1605		pagep->prev_pgno = argp->prev;
1606		modified = 1;
1607	} else if (cmp_n == 0 && DB_UNDO(op)) {
1608		/* Undo the remove or redo the add. */
1609		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
1610		pagep->prev_pgno = argp->pgno;
1611		modified = 1;
1612	}
1613	if (modified) {
1614		if (DB_UNDO(op))
1615			pagep->lsn = argp->lsn_next;
1616		else
1617			pagep->lsn = *lsnp;
1618	}
1619	if ((ret = __memp_fput(mpf,
1620	     ip, pagep, file_dbp->priority)) != 0)
1621		goto out;
1622	pagep = NULL;
1623
1624prev: if ((ret = __memp_fget(mpf, &argp->prev, ip, NULL, 0, &pagep)) != 0) {
1625		if (ret != DB_PAGE_NOTFOUND) {
1626			ret = __db_pgerr(file_dbp, argp->prev, ret);
1627			goto out;
1628		} else
1629			goto done;
1630	}
1631
1632	modified = 0;
1633	cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn_prev);
1634	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn_prev);
1635	if (cmp_p == 0 && DB_REDO(op)) {
1636		/* Redo the relink. */
1637		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
1638		pagep->next_pgno = argp->next;
1639		modified = 1;
1640	} else if (LOG_COMPARE(lsnp, &LSN(pagep)) == 0 && DB_UNDO(op)) {
1641		/* Undo the relink. */
1642		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
1643		pagep->next_pgno = argp->pgno;
1644		modified = 1;
1645	}
1646	if (modified) {
1647		if (DB_UNDO(op))
1648			pagep->lsn = argp->lsn_prev;
1649		else
1650			pagep->lsn = *lsnp;
1651	}
1652	if ((ret = __memp_fput(mpf,
1653	     ip, pagep, file_dbp->priority)) != 0)
1654		goto out;
1655	pagep = NULL;
1656
1657done:	*lsnp = argp->prev_lsn;
1658	ret = 0;
1659
1660out:	if (pagep != NULL)
1661		(void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
1662	REC_CLOSE;
1663}
1664