1/*-
2 * See the file LICENSE for redistribution information.
3 *
4 * Copyright (c) 1996,2008 Oracle.  All rights reserved.
5 */
6/*
7 * Copyright (c) 1995, 1996
8 *	The President and Fellows of Harvard University.  All rights reserved.
9 *
10 * This code is derived from software contributed to Berkeley by
11 * Margo Seltzer.
12 *
13 * Redistribution and use in source and binary forms, with or without
14 * modification, are permitted provided that the following conditions
15 * are met:
16 * 1. Redistributions of source code must retain the above copyright
17 *    notice, this list of conditions and the following disclaimer.
18 * 2. Redistributions in binary form must reproduce the above copyright
19 *    notice, this list of conditions and the following disclaimer in the
20 *    documentation and/or other materials provided with the distribution.
21 * 3. Neither the name of the University nor the names of its contributors
22 *    may be used to endorse or promote products derived from this software
23 *    without specific prior written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 * SUCH DAMAGE.
36 *
37 * $Id: db_dispatch.c,v 12.45 2008/03/12 20:28:09 mbrey Exp $
38 */
39
40#include "db_config.h"
41
42#include "db_int.h"
43#include "dbinc/db_page.h"
44#include "dbinc/hash.h"
45#include "dbinc/fop.h"
46#include "dbinc/lock.h"
47#include "dbinc/log.h"
48#include "dbinc/mp.h"
49#include "dbinc/txn.h"
50
51static int __db_txnlist_find_internal __P((ENV *, DB_TXNHEAD *,
52		db_txnlist_type, u_int32_t,  DB_TXNLIST **,
53		int, u_int32_t *));
54
55/*
56 * __db_dispatch --
57 *
58 * This is the transaction dispatch function used by the db access methods.
59 * It is designed to handle the record format used by all the access
60 * methods (the one automatically generated by the db_{h,log,read}.sh
61 * scripts in the tools directory).  An application using a different
62 * recovery paradigm will supply a different dispatch function to txn_open.
63 *
64 * PUBLIC: int __db_dispatch __P((ENV *,
65 * PUBLIC:     DB_DISTAB *, DBT *, DB_LSN *, db_recops, DB_TXNHEAD *));
66 */
67int
68__db_dispatch(env, dtab, db, lsnp, redo, info)
69	ENV *env;		/* The environment. */
70	DB_DISTAB *dtab;
71	DBT *db;		/* The log record upon which to dispatch. */
72	DB_LSN *lsnp;		/* The lsn of the record being dispatched. */
73	db_recops redo;		/* Redo this op (or undo it). */
74	DB_TXNHEAD *info;	/* Transaction list. */
75{
76	DB_ENV *dbenv;
77	DB_LSN prev_lsn;
78	u_int32_t rectype, status, txnid, urectype;
79	int make_call, ret;
80
81	dbenv = env->dbenv;
82	LOGCOPY_32(env, &rectype, db->data);
83	LOGCOPY_32(env, &txnid, (u_int8_t *)db->data + sizeof(rectype));
84
85	make_call = ret = 0;
86
87	/* If we don't have a dispatch table, it's hard to dispatch. */
88	DB_ASSERT(env, dtab != NULL);
89
90	/*
91	 * If we find a record that is in the user's number space and they
92	 * have specified a recovery routine, let them handle it.  If they
93	 * didn't specify a recovery routine, then we expect that they've
94	 * followed all our rules and registered new recovery functions.
95	 */
96	switch (redo) {
97	case DB_TXN_ABORT:
98	case DB_TXN_APPLY:
99	case DB_TXN_PRINT:
100		make_call = 1;
101		break;
102	case DB_TXN_OPENFILES:
103		/*
104		 * We collect all the transactions that have
105		 * "begin" records, those with no previous LSN,
106		 * so that we do not abort partial transactions.
107		 * These are known to be undone, otherwise the
108		 * log would not have been freeable.
109		 */
110		LOGCOPY_TOLSN(env, &prev_lsn, (u_int8_t *)db->data +
111		    sizeof(rectype) + sizeof(txnid));
112		if (txnid != 0 && prev_lsn.file == 0 && (ret =
113		    __db_txnlist_add(env, info, txnid, TXN_OK, NULL)) != 0)
114			return (ret);
115
116		/* FALLTHROUGH */
117	case DB_TXN_POPENFILES:
118		if (rectype == DB___dbreg_register ||
119		    rectype == DB___txn_child ||
120		    rectype == DB___txn_ckp || rectype == DB___txn_recycle)
121			return ((dtab->int_dispatch[rectype])(env,
122			    db, lsnp, redo, info));
123		break;
124	case DB_TXN_BACKWARD_ROLL:
125		/*
126		 * Running full recovery in the backward pass. In general,
127		 * we only process records during this pass that belong
128		 * to aborted transactions.  Unfortunately, there are several
129		 * exceptions:
130		 * 1. If this is a meta-record, one not associated with
131		 *    a transaction, then we must always process it.
132		 * 2. If this is a transaction commit/abort, we must
133		 *    always process it, so that we know the status of
134		 *    every transaction.
135		 * 3. If this is a child commit, we need to process it
136		 *    because the outcome of the child transaction depends
137		 *    on the outcome of the parent.
138		 * 4. If this is a dbreg_register record, we must always
139		 *    process is because they contain non-transactional
140		 *    closes that must be properly handled.
141		 * 5. If this is a noop, we must always undo it so that we
142		 *    properly handle any aborts before a file was closed.
143		 * 6. If this a file remove, we need to process it to
144		 *    determine if the on-disk file is the same as the
145		 *    one being described.
146		 */
147		switch (rectype) {
148		/*
149		 * These either do not belong to a transaction or (regop)
150		 * must be processed regardless of the status of the
151		 * transaction.
152		 */
153		case DB___txn_regop:
154		case DB___txn_recycle:
155		case DB___txn_ckp:
156			make_call = 1;
157			break;
158		/*
159		 * These belong to a transaction whose status must be
160		 * checked.
161		 */
162		case DB___txn_child:
163		case DB___db_noop:
164		case DB___fop_file_remove:
165		case DB___dbreg_register:
166			make_call = 1;
167
168			/* FALLTHROUGH */
169		default:
170			if (txnid == 0)
171				break;
172
173			ret = __db_txnlist_find(env, info, txnid, &status);
174
175			/* If not found, this is an incomplete abort.  */
176			if (ret == DB_NOTFOUND)
177				return (__db_txnlist_add(env,
178				    info, txnid, TXN_IGNORE, lsnp));
179			if (ret != 0)
180				return (ret);
181
182			/*
183			 * If we ignore the transaction, ignore the operation
184			 * UNLESS this is a child commit in which case we need
185			 * to make sure that the child also gets marked as
186			 * ignore.
187			 */
188			if (status == TXN_IGNORE && rectype != DB___txn_child) {
189				make_call = 0;
190				break;
191			}
192			if (status == TXN_COMMIT)
193				break;
194
195			/* Set make_call in case we came through default */
196			make_call = 1;
197			if (status == TXN_OK &&
198			    (ret = __db_txnlist_update(env,
199			    info, txnid, rectype == DB___txn_xa_regop ?
200			    TXN_PREPARE : TXN_ABORT, NULL, &status, 0)) != 0)
201				return (ret);
202		}
203		break;
204	case DB_TXN_FORWARD_ROLL:
205		/*
206		 * In the forward pass, if we haven't seen the transaction,
207		 * do nothing, else recover it.
208		 *
209		 * We need to always redo DB___db_noop records, so that we
210		 * properly handle any commits after the file was closed.
211		 */
212		switch (rectype) {
213		case DB___txn_recycle:
214		case DB___txn_ckp:
215		case DB___db_noop:
216		case DB___dbreg_register:
217			make_call = 1;
218			break;
219
220		default:
221			if (txnid == 0)
222				status = 0;
223			else {
224				ret = __db_txnlist_find(env,
225				    info, txnid, &status);
226
227				if (ret == DB_NOTFOUND)
228					/* Break out out of if clause. */
229					;
230				else if (ret != 0)
231					return (ret);
232				else if (status == TXN_COMMIT) {
233					make_call = 1;
234					break;
235				}
236			}
237
238		}
239		break;
240	default:
241		return (__db_unknown_flag(
242		    env, "__db_dispatch", (u_int32_t)redo));
243	}
244
245	if (make_call) {
246		/*
247		 * If the debug flag is set then we are logging
248		 * records for a non-durable update so that they
249		 * may be examined for diagnostic purposes.
250		 * So only make the call if we are printing,
251		 * otherwise we need to extract the previous
252		 * lsn so undo will work properly.
253		 */
254		if (rectype & DB_debug_FLAG) {
255			if (redo == DB_TXN_PRINT)
256				rectype &= ~DB_debug_FLAG;
257			else {
258				LOGCOPY_TOLSN(env, lsnp,
259				    (u_int8_t *)db->data +
260				    sizeof(rectype) +
261				    sizeof(txnid));
262				return (0);
263			}
264		}
265		if (rectype >= DB_user_BEGIN) {
266			if (dbenv->app_dispatch != NULL)
267				return (dbenv->app_dispatch(dbenv,
268				    db, lsnp, redo));
269
270			/* No application-specific dispatch */
271			urectype = rectype - DB_user_BEGIN;
272			if (urectype > dtab->ext_size ||
273			    dtab->ext_dispatch[urectype] == NULL) {
274				__db_errx(env,
275	    "Illegal application-specific record type %lu in log",
276				    (u_long)rectype);
277				return (EINVAL);
278			}
279			return ((dtab->ext_dispatch[urectype])(dbenv,
280			    db, lsnp, redo));
281		} else {
282			if (rectype > dtab->int_size ||
283			    dtab->int_dispatch[rectype] == NULL) {
284				__db_errx(env,
285				    "Illegal record type %lu in log",
286				    (u_long)rectype);
287				return (EINVAL);
288			}
289			return ((dtab->int_dispatch[rectype])(env,
290			    db, lsnp, redo, info));
291		}
292	}
293
294	return (0);
295}
296
297/*
298 * __db_add_recovery -- Add recovery functions to the dispatch table.
299 *
300 * We have two versions of this, an external one and an internal one,
301 * because application-specific functions take different arguments
302 * for dispatch (ENV versus DB_ENV).
303 *
304 * This is the external version.
305 *
306 * PUBLIC: int __db_add_recovery __P((DB_ENV *, DB_DISTAB *,
307 * PUBLIC:   int (*)(DB_ENV *, DBT *, DB_LSN *, db_recops), u_int32_t));
308 */
309int
310__db_add_recovery(dbenv, dtab, func, ndx)
311	DB_ENV *dbenv;
312	DB_DISTAB *dtab;
313	int (*func) __P((DB_ENV *, DBT *, DB_LSN *, db_recops));
314	u_int32_t ndx;
315{
316	size_t i, nsize;
317	int ret;
318
319	/* Make sure this is an application-specific record. */
320	if (ndx < DB_user_BEGIN) {
321		__db_errx(dbenv->env,
322	 "Attempting to add application-specific record with invalid type %lu",
323		    (u_long)ndx);
324		return (EINVAL);
325	}
326	ndx -= DB_user_BEGIN;
327
328	/* Check if we have to grow the table. */
329	if (ndx >= dtab->ext_size) {
330		nsize = ndx + 40;
331		if ((ret =
332		    __os_realloc(dbenv->env, nsize *
333		    sizeof((dtab->ext_dispatch)[0]), &dtab->ext_dispatch))
334		    != 0)
335			return (ret);
336		for (i = dtab->ext_size; i < nsize; ++i)
337			(dtab->ext_dispatch)[i] = NULL;
338		dtab->ext_size = nsize;
339	}
340
341	(dtab->ext_dispatch)[ndx] = func;
342	return (0);
343}
344
345/*
346 * __db_add_recovery_int --
347 *
348 * Internal version of dispatch addition function.
349 *
350 *
351 * PUBLIC: int __db_add_recovery_int __P((ENV *, DB_DISTAB *,
352 * PUBLIC:   int (*)(ENV *, DBT *, DB_LSN *, db_recops, void *), u_int32_t));
353 */
354int
355__db_add_recovery_int(env, dtab, func, ndx)
356	ENV *env;
357	DB_DISTAB *dtab;
358	int (*func) __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
359	u_int32_t ndx;
360{
361	size_t i, nsize;
362	int ret;
363
364	if (ndx >= DB_user_BEGIN) {
365		__db_errx(env,
366		    "Attempting to add internal record with invalid type %lu",
367		    (u_long)ndx);
368		return (EINVAL);
369	}
370
371	/* Check if we have to grow the table. */
372	if (ndx >= dtab->int_size) {
373		nsize = ndx + 40;
374		if ((ret =
375		    __os_realloc(env, nsize * sizeof((dtab->int_dispatch)[0]),
376		    &dtab->int_dispatch)) != 0)
377			return (ret);
378		for (i = dtab->int_size; i < nsize; ++i)
379			(dtab->int_dispatch)[i] = NULL;
380		dtab->int_size = nsize;
381	}
382
383	(dtab->int_dispatch)[ndx] = func;
384	return (0);
385}
386
387/*
388 * __db_txnlist_init --
389 *	Initialize transaction linked list.
390 *
391 * PUBLIC: int __db_txnlist_init __P((ENV *, DB_THREAD_INFO *,
392 * PUBLIC:     u_int32_t, u_int32_t, DB_LSN *, DB_TXNHEAD **));
393 */
394int
395__db_txnlist_init(env, ip, low_txn, hi_txn, trunc_lsn, retp)
396	ENV *env;
397	DB_THREAD_INFO *ip;
398	u_int32_t low_txn, hi_txn;
399	DB_LSN *trunc_lsn;
400	DB_TXNHEAD **retp;
401{
402	DB_TXNHEAD *headp;
403	u_int32_t size, tmp;
404	int ret;
405
406	/*
407	 * Size a hash table.
408	 *	If low is zero then we are being called during rollback
409	 * and we need only one slot.
410	 *	Hi maybe lower than low if we have recycled txnid's.
411	 *	The numbers here are guesses about txn density, we can afford
412	 * to look at a few entries in each slot.
413	 */
414	if (low_txn == 0)
415		size = 1;
416	else {
417		if (hi_txn < low_txn) {
418			tmp = hi_txn;
419			hi_txn = low_txn;
420			low_txn = tmp;
421		}
422		tmp = hi_txn - low_txn;
423		/* See if we wrapped around. */
424		if (tmp > (TXN_MAXIMUM - TXN_MINIMUM) / 2)
425			tmp = (low_txn - TXN_MINIMUM) + (TXN_MAXIMUM - hi_txn);
426		size = tmp / 5;
427		if (size < 100)
428			size = 100;
429	}
430	if ((ret = __os_malloc(env,
431	    sizeof(DB_TXNHEAD) + size * sizeof(headp->head), &headp)) != 0)
432		return (ret);
433
434	memset(headp, 0, sizeof(DB_TXNHEAD) + size * sizeof(headp->head));
435	headp->maxid = hi_txn;
436	headp->generation = 0;
437	headp->nslots = size;
438	headp->gen_alloc = 8;
439	headp->thread_info = ip;
440	if ((ret = __os_malloc(env, headp->gen_alloc *
441	    sizeof(headp->gen_array[0]), &headp->gen_array)) != 0) {
442		__os_free(env, headp);
443		return (ret);
444	}
445	headp->gen_array[0].generation = 0;
446	headp->gen_array[0].txn_min = TXN_MINIMUM;
447	headp->gen_array[0].txn_max = TXN_MAXIMUM;
448	if (trunc_lsn != NULL) {
449		headp->trunc_lsn = *trunc_lsn;
450		headp->maxlsn = *trunc_lsn;
451	} else {
452		ZERO_LSN(headp->trunc_lsn);
453		ZERO_LSN(headp->maxlsn);
454	}
455	ZERO_LSN(headp->ckplsn);
456
457	*retp = headp;
458	return (0);
459}
460
461#define	FIND_GENERATION(hp, txnid, gen) do {				\
462	u_int32_t __i;							\
463	for (__i = 0; __i <= (hp)->generation; __i++)			\
464		/* The range may wrap around the end. */		\
465		if ((hp)->gen_array[__i].txn_min <			\
466		    (hp)->gen_array[__i].txn_max ?			\
467		    ((txnid) >= (hp)->gen_array[__i].txn_min &&		\
468		    (txnid) <= (hp)->gen_array[__i].txn_max) :		\
469		    ((txnid) >= (hp)->gen_array[__i].txn_min ||		\
470		    (txnid) <= (hp)->gen_array[__i].txn_max))		\
471			break;						\
472	DB_ASSERT(env, __i <= (hp)->generation);			\
473	gen = (hp)->gen_array[__i].generation;				\
474} while (0)
475
476/*
477 * __db_txnlist_add --
478 *	Add an element to our transaction linked list.
479 *
480 * PUBLIC: int __db_txnlist_add __P((ENV *,
481 * PUBLIC:     DB_TXNHEAD *, u_int32_t, u_int32_t, DB_LSN *));
482 */
483int
484__db_txnlist_add(env, hp, txnid, status, lsn)
485	ENV *env;
486	DB_TXNHEAD *hp;
487	u_int32_t txnid, status;
488	DB_LSN *lsn;
489{
490	DB_TXNLIST *elp;
491	int ret;
492
493	if ((ret = __os_malloc(env, sizeof(DB_TXNLIST), &elp)) != 0)
494		return (ret);
495
496	LIST_INSERT_HEAD(&hp->head[DB_TXNLIST_MASK(hp, txnid)], elp, links);
497
498	/* Find the most recent generation containing this ID */
499	FIND_GENERATION(hp, txnid, elp->u.t.generation);
500	elp->type = TXNLIST_TXNID;
501	elp->u.t.txnid = txnid;
502	elp->u.t.status = status;
503	if (txnid > hp->maxid)
504		hp->maxid = txnid;
505	if (lsn != NULL && IS_ZERO_LSN(hp->maxlsn) && status == TXN_COMMIT)
506		hp->maxlsn = *lsn;
507
508	DB_ASSERT(env, lsn == NULL ||
509	    status != TXN_COMMIT || LOG_COMPARE(&hp->maxlsn, lsn) >= 0);
510
511	return (0);
512}
513
514/*
515 * __db_txnlist_remove --
516 *	Remove an element from our transaction linked list.
517 *
518 * PUBLIC: int __db_txnlist_remove __P((ENV *, DB_TXNHEAD *, u_int32_t));
519 */
520int
521__db_txnlist_remove(env, hp, txnid)
522	ENV *env;
523	DB_TXNHEAD *hp;
524	u_int32_t txnid;
525{
526	DB_TXNLIST *entry;
527	u_int32_t status;
528
529	return (__db_txnlist_find_internal(env,
530	    hp, TXNLIST_TXNID, txnid, &entry, 1, &status));
531}
532
533/*
534 * __db_txnlist_ckp --
535 *	Used to record the maximum checkpoint that will be retained
536 * after recovery.  Typically this is simply the max checkpoint, but
537 * if we are doing client replication recovery or timestamp-based
538 * recovery, we are going to virtually truncate the log and we need
539 * to retain the last checkpoint before the truncation point.
540 *
541 * PUBLIC: void __db_txnlist_ckp __P((ENV *, DB_TXNHEAD *, DB_LSN *));
542 */
543void
544__db_txnlist_ckp(env, hp, ckp_lsn)
545	ENV *env;
546	DB_TXNHEAD *hp;
547	DB_LSN *ckp_lsn;
548{
549
550	COMPQUIET(env, NULL);
551
552	if (IS_ZERO_LSN(hp->ckplsn) && !IS_ZERO_LSN(hp->maxlsn) &&
553	    LOG_COMPARE(&hp->maxlsn, ckp_lsn) >= 0)
554		hp->ckplsn = *ckp_lsn;
555}
556
557/*
558 * __db_txnlist_end --
559 *	Discard transaction linked list.
560 *
561 * PUBLIC: void __db_txnlist_end __P((ENV *, DB_TXNHEAD *));
562 */
563void
564__db_txnlist_end(env, hp)
565	ENV *env;
566	DB_TXNHEAD *hp;
567{
568	u_int32_t i;
569	DB_TXNLIST *p;
570
571	if (hp == NULL)
572		return;
573
574	for (i = 0; i < hp->nslots; i++)
575		while (hp != NULL && (p = LIST_FIRST(&hp->head[i])) != NULL) {
576			switch (p->type) {
577			case TXNLIST_LSN:
578				__os_free(env, p->u.l.lsn_stack);
579				break;
580			case TXNLIST_DELETE:
581			case TXNLIST_TXNID:
582			default:
583				/*
584				 * Possibly an incomplete DB_TXNLIST; just
585				 * free it.
586				 */
587				break;
588			}
589			LIST_REMOVE(p, links);
590			__os_free(env, p);
591		}
592
593	if (hp->gen_array != NULL)
594		__os_free(env, hp->gen_array);
595	__os_free(env, hp);
596}
597
598/*
599 * __db_txnlist_find --
600 *	Checks to see if a txnid with the current generation is in the
601 *	txnid list.  This returns DB_NOTFOUND if the item isn't in the
602 *	list otherwise it returns (like __db_txnlist_find_internal)
603 *	the status of the transaction.  A txnid of 0 means the record
604 *	was generated while not in a transaction.
605 *
606 * PUBLIC: int __db_txnlist_find __P((ENV *,
607 * PUBLIC:     DB_TXNHEAD *, u_int32_t, u_int32_t *));
608 */
609int
610__db_txnlist_find(env, hp, txnid, statusp)
611	ENV *env;
612	DB_TXNHEAD *hp;
613	u_int32_t txnid, *statusp;
614{
615	DB_TXNLIST *entry;
616
617	if (txnid == 0)
618		return (DB_NOTFOUND);
619
620	return (__db_txnlist_find_internal(env, hp,
621	    TXNLIST_TXNID, txnid, &entry, 0, statusp));
622}
623
624/*
625 * __db_txnlist_update --
626 *	Change the status of an existing transaction entry.
627 *	Returns DB_NOTFOUND if no such entry exists.
628 *
629 * PUBLIC: int __db_txnlist_update __P((ENV *, DB_TXNHEAD *,
630 * PUBLIC:     u_int32_t, u_int32_t, DB_LSN *, u_int32_t *, int));
631 */
632int
633__db_txnlist_update(env, hp, txnid, status, lsn, ret_status, add_ok)
634	ENV *env;
635	DB_TXNHEAD *hp;
636	u_int32_t txnid, status;
637	DB_LSN *lsn;
638	u_int32_t *ret_status;
639	int add_ok;
640{
641	DB_TXNLIST *elp;
642	int ret;
643
644	if (txnid == 0)
645		return (DB_NOTFOUND);
646
647	ret = __db_txnlist_find_internal(env,
648	    hp, TXNLIST_TXNID, txnid, &elp, 0, ret_status);
649
650	if (ret == DB_NOTFOUND && add_ok) {
651		*ret_status = status;
652		return (__db_txnlist_add(env, hp, txnid, status, lsn));
653	}
654	if (ret != 0)
655		return (ret);
656
657	if (*ret_status == TXN_IGNORE)
658		return (0);
659
660	elp->u.t.status = status;
661
662	if (lsn != NULL && IS_ZERO_LSN(hp->maxlsn) && status == TXN_COMMIT)
663		hp->maxlsn = *lsn;
664
665	return (ret);
666}
667
668/*
669 * __db_txnlist_find_internal --
670 *	Find an entry on the transaction list.  If the entry is not there or
671 *	the list pointer is not initialized we return DB_NOTFOUND.  If the
672 *	item is found, we return the status.  Currently we always call this
673 *	with an initialized list pointer but checking for NULL keeps it general.
674 */
675static int
676__db_txnlist_find_internal(env,
677    hp, type, txnid, txnlistp, delete, statusp)
678	ENV *env;
679	DB_TXNHEAD *hp;
680	db_txnlist_type type;
681	u_int32_t  txnid;
682	DB_TXNLIST **txnlistp;
683	int delete;
684	u_int32_t *statusp;
685{
686	struct __db_headlink *head;
687	DB_TXNLIST *p;
688	u_int32_t generation, hash;
689	int ret;
690
691	ret = 0;
692
693	if (hp == NULL)
694		return (DB_NOTFOUND);
695
696	switch (type) {
697	case TXNLIST_TXNID:
698		hash = txnid;
699		FIND_GENERATION(hp, txnid, generation);
700		break;
701	case TXNLIST_DELETE:
702	case TXNLIST_LSN:
703	default:
704		return (__env_panic(env, EINVAL));
705	}
706
707	head = &hp->head[DB_TXNLIST_MASK(hp, hash)];
708	LIST_FOREACH(p, head, links) {
709		if (p->type != type)
710			continue;
711		switch (type) {
712		case TXNLIST_TXNID:
713			if (p->u.t.txnid != txnid ||
714			    generation != p->u.t.generation)
715				continue;
716			*statusp = p->u.t.status;
717			break;
718
719		case TXNLIST_DELETE:
720		case TXNLIST_LSN:
721		default:
722			return (__env_panic(env, EINVAL));
723		}
724		if (delete == 1) {
725			LIST_REMOVE(p, links);
726			__os_free(env, p);
727			*txnlistp = NULL;
728		} else if (p != LIST_FIRST(head)) {
729			/* Move it to head of list. */
730			LIST_REMOVE(p, links);
731			LIST_INSERT_HEAD(head, p, links);
732			*txnlistp = p;
733		} else
734			*txnlistp = p;
735		return (ret);
736	}
737
738	return (DB_NOTFOUND);
739}
740
741/*
742 * __db_txnlist_gen --
743 *	Change the current generation number.
744 *
745 * PUBLIC: int __db_txnlist_gen __P((ENV *,
746 * PUBLIC:       DB_TXNHEAD *, int, u_int32_t, u_int32_t));
747 */
748int
749__db_txnlist_gen(env, hp, incr, min, max)
750	ENV *env;
751	DB_TXNHEAD *hp;
752	int incr;
753	u_int32_t min, max;
754{
755	int ret;
756
757	/*
758	 * During recovery generation numbers keep track of "restart"
759	 * checkpoints and recycle records.  Restart checkpoints occur
760	 * whenever we take a checkpoint and there are no outstanding
761	 * transactions.  When that happens, we can reset transaction IDs
762	 * back to TXNID_MINIMUM.  Currently we only do the reset
763	 * at then end of recovery.  Recycle records occur when txnids
764	 * are exhausted during runtime.  A free range of ids is identified
765	 * and logged.  This code maintains a stack of ranges.  A txnid
766	 * is given the generation number of the first range it falls into
767	 * in the stack.
768	 */
769	if (incr < 0) {
770		--hp->generation;
771		memmove(hp->gen_array, &hp->gen_array[1],
772		    (hp->generation + 1) * sizeof(hp->gen_array[0]));
773	} else {
774		++hp->generation;
775		if (hp->generation >= hp->gen_alloc) {
776			hp->gen_alloc *= 2;
777			if ((ret = __os_realloc(env, hp->gen_alloc *
778			    sizeof(hp->gen_array[0]), &hp->gen_array)) != 0)
779				return (ret);
780		}
781		memmove(&hp->gen_array[1], &hp->gen_array[0],
782		    hp->generation * sizeof(hp->gen_array[0]));
783		hp->gen_array[0].generation = hp->generation;
784		hp->gen_array[0].txn_min = min;
785		hp->gen_array[0].txn_max = max;
786	}
787	return (0);
788}
789
790/*
791 * __db_txnlist_lsnadd --
792 *	Save the prev_lsn from a txn_child record.
793 *
794 * PUBLIC: int __db_txnlist_lsnadd __P((ENV *, DB_TXNHEAD *, DB_LSN *));
795 */
796int
797__db_txnlist_lsnadd(env, hp, lsnp)
798	ENV *env;
799	DB_TXNHEAD *hp;
800	DB_LSN *lsnp;
801{
802	DB_TXNLIST *elp;
803	int ret;
804
805	if (IS_ZERO_LSN(*lsnp))
806		return (0);
807
808	LIST_FOREACH(elp, &hp->head[0], links)
809		if (elp->type == TXNLIST_LSN)
810			break;
811
812	if (elp == NULL) {
813		if ((ret = __db_txnlist_lsninit(env, hp, lsnp)) != 0)
814			return (ret);
815		return (DB_SURPRISE_KID);
816	}
817
818	if (elp->u.l.stack_indx == elp->u.l.stack_size) {
819		elp->u.l.stack_size <<= 1;
820		if ((ret = __os_realloc(env, sizeof(DB_LSN) *
821		     elp->u.l.stack_size, &elp->u.l.lsn_stack)) != 0) {
822			__db_txnlist_end(env, hp);
823			return (ret);
824		}
825	}
826	elp->u.l.lsn_stack[elp->u.l.stack_indx++] = *lsnp;
827
828	return (0);
829}
830
831/*
832 * __db_txnlist_lsnget --
833 *
834 * PUBLIC: int __db_txnlist_lsnget __P((ENV *,
835 * PUBLIC:     DB_TXNHEAD *, DB_LSN *, u_int32_t));
836 *	Get the lsn saved from a txn_child record.
837 */
838int
839__db_txnlist_lsnget(env, hp, lsnp, flags)
840	ENV *env;
841	DB_TXNHEAD *hp;
842	DB_LSN *lsnp;
843	u_int32_t flags;
844{
845	DB_TXNLIST *elp;
846
847	COMPQUIET(env, NULL);
848	COMPQUIET(flags, 0);
849
850	LIST_FOREACH(elp, &hp->head[0], links)
851		if (elp->type == TXNLIST_LSN)
852			break;
853
854	if (elp == NULL || elp->u.l.stack_indx == 0) {
855		ZERO_LSN(*lsnp);
856		return (0);
857	}
858
859	*lsnp = elp->u.l.lsn_stack[--elp->u.l.stack_indx];
860
861	return (0);
862}
863
864/*
865 * __db_txnlist_lsninit --
866 *	Initialize a transaction list with an lsn array entry.
867 *
868 * PUBLIC: int __db_txnlist_lsninit __P((ENV *, DB_TXNHEAD *, DB_LSN *));
869 */
870int
871__db_txnlist_lsninit(env, hp, lsnp)
872	ENV *env;
873	DB_TXNHEAD *hp;
874	DB_LSN *lsnp;
875{
876	DB_TXNLIST *elp;
877	int ret;
878
879	elp = NULL;
880
881	if ((ret = __os_malloc(env, sizeof(DB_TXNLIST), &elp)) != 0)
882		goto err;
883	LIST_INSERT_HEAD(&hp->head[0], elp, links);
884
885	elp->type = TXNLIST_LSN;
886	if ((ret = __os_malloc(env,
887	    sizeof(DB_LSN) * DB_LSN_STACK_SIZE, &elp->u.l.lsn_stack)) != 0)
888		goto err;
889	elp->u.l.stack_indx = 1;
890	elp->u.l.stack_size = DB_LSN_STACK_SIZE;
891	elp->u.l.lsn_stack[0] = *lsnp;
892
893	return (0);
894
895err:	__db_txnlist_end(env, hp);
896	return (ret);
897}
898
899#ifdef DEBUG
900/*
901 * __db_txnlist_print --
902 *	Print out the transaction list.
903 *
904 * PUBLIC: void __db_txnlist_print __P((DB_TXNHEAD *));
905 */
906void
907__db_txnlist_print(hp)
908	DB_TXNHEAD *hp;
909{
910	DB_TXNLIST *p;
911	u_int32_t i;
912	char *txntype;
913
914	printf("Maxid: %lu Generation: %lu\n",
915	    (u_long)hp->maxid, (u_long)hp->generation);
916	for (i = 0; i < hp->nslots; i++)
917		LIST_FOREACH(p, &hp->head[i], links) {
918			if (p->type != TXNLIST_TXNID) {
919				printf("Unrecognized type: %d\n", p->type);
920				continue;
921			}
922			switch (p->u.t.status) {
923			case TXN_OK:
924				txntype = "OK";
925				break;
926			case TXN_COMMIT:
927				txntype = "commit";
928				break;
929			case TXN_PREPARE:
930				txntype = "prepare";
931				break;
932			case TXN_ABORT:
933				txntype = "abort";
934				break;
935			case TXN_IGNORE:
936				txntype = "ignore";
937				break;
938			case TXN_EXPECTED:
939				txntype = "expected";
940				break;
941			case TXN_UNEXPECTED:
942				txntype = "unexpected";
943				break;
944			default:
945				txntype = "UNKNOWN";
946				break;
947			}
948			printf("TXNID: %lx(%lu): %s\n",
949			    (u_long)p->u.t.txnid,
950			    (u_long)p->u.t.generation, txntype);
951		}
952}
953#endif
954