1/*-
2 * See the file LICENSE for redistribution information.
3 *
4 * Copyright (c) 1996,2008 Oracle.  All rights reserved.
5 *
6 * $Id: log.c,v 12.68 2008/05/05 01:59:52 mjc Exp $
7 */
8
9#include "db_config.h"
10
11#include "db_int.h"
12#include "dbinc/crypto.h"
13#include "dbinc/hmac.h"
14#include "dbinc/log.h"
15#include "dbinc/txn.h"
16
/*
 * Forward declarations for the file-local helpers used by __log_open.
 */
static int	__log_init __P((ENV *, DB_LOG *));
static int	__log_recover __P((DB_LOG *));
static size_t	__log_region_size __P((ENV *));
20
/*
 * __log_open --
 *	Internal version of log_open: only called from ENV->open.
 *
 *	Allocates the per-process DB_LOG handle, attaches to the log region
 *	(REGION_CREATE_OK is added when create_ok is non-zero) and publishes
 *	the handle in env->lg_handle.  If this call created the region it is
 *	initialized by __log_init and __log_recover is run; otherwise the only
 *	region update is to record a newly configured log file size, under
 *	the log system lock.
 *
 *	Returns 0 on success.  On failure the region (if attached) is
 *	detached, the DBREG mutex and the handle are freed, env->lg_handle
 *	is reset to NULL and the error code is returned.
 *
 * PUBLIC: int __log_open __P((ENV *, int));
 */
int
__log_open(env, create_ok)
	ENV *env;
	int create_ok;
{
	DB_ENV *dbenv;
	DB_LOG *dblp;
	LOG *lp;
	u_int8_t *bulk;
	int region_locked, ret;

	dbenv = env->dbenv;
	/* Tracks whether the error path must drop the log system lock. */
	region_locked = 0;

	/* Create/initialize the DB_LOG structure. */
	if ((ret = __os_calloc(env, 1, sizeof(DB_LOG), &dblp)) != 0)
		return (ret);
	dblp->env = env;

	/* Set the default buffer size, if not otherwise configured. */
	if (dbenv->lg_bsize == 0)
		dbenv->lg_bsize = FLD_ISSET(dbenv->lg_flags, DB_LOG_IN_MEMORY) ?
		    LG_BSIZE_INMEM : LG_BSIZE_DEFAULT;

	/* Join/create the log region. */
	dblp->reginfo.env = env;
	dblp->reginfo.type = REGION_TYPE_LOG;
	dblp->reginfo.id = INVALID_REGION_ID;
	dblp->reginfo.flags = REGION_JOIN_OK;

	if (create_ok)
		F_SET(&dblp->reginfo, REGION_CREATE_OK);
	if ((ret = __env_region_attach(
	    env, &dblp->reginfo, __log_region_size(env))) != 0)
		goto err;

	/* If we created the region, initialize it. */
	if (F_ISSET(&dblp->reginfo, REGION_CREATE))
		if ((ret = __log_init(env, dblp)) != 0)
			goto err;

	/* Set the local addresses. */
	lp = dblp->reginfo.primary =
	    R_ADDR(&dblp->reginfo, dblp->reginfo.rp->primary);
	dblp->bufp = R_ADDR(&dblp->reginfo, lp->buffer_off);

	/*
	 * If the region is threaded, we have to lock the DBREG list, and we
	 * need to allocate a mutex for that purpose.
	 */
	if ((ret = __mutex_alloc(env,
	    MTX_LOG_REGION, DB_MUTEX_PROCESS_ONLY, &dblp->mtx_dbreg)) != 0)
		goto err;

	/*
	 * Set the handle -- we may be about to run recovery, which allocates
	 * log cursors.  Log cursors require logging be already configured,
	 * and the handle being set is what demonstrates that.
	 *
	 * If we created the region, run recovery.  If that fails, make sure
	 * we reset the log handle before cleaning up, otherwise we will try
	 * and clean up again in the mainline ENV initialization code.
	 */
	env->lg_handle = dblp;

	if (F_ISSET(&dblp->reginfo, REGION_CREATE)) {
		/*
		 * We first take the log file size from the environment, if
		 * specified.  If that wasn't set, default it.  Regardless,
		 * recovery may set it from the persistent information in a
		 * log file header.
		 */
		if (lp->log_size == 0)
			lp->log_size =
			    FLD_ISSET(dbenv->lg_flags, DB_LOG_IN_MEMORY) ?
			    LG_MAX_INMEM : LG_MAX_DEFAULT;

		if ((ret = __log_recover(dblp)) != 0)
			goto err;

		/*
		 * If the next log file size hasn't been set yet, default it
		 * to the current log file size.
		 */
		if (lp->log_nsize == 0)
			lp->log_nsize = lp->log_size;

		/*
		 * If we haven't written any log files, write the first one
		 * so that checkpoint gets a valid ckp_lsn value.
		 */
		if (IS_INIT_LSN(lp->lsn) &&
		    (ret = __log_newfile(dblp, NULL, 0, 0)) != 0)
			goto err;

		/*
		 * Initialize replication's next-expected LSN value
		 * and replication's bulk buffer.  In __env_open, we
		 * always create/open the replication region before
		 * the log region so we're assured that our rep_handle
		 * is valid at this point, if replication is being used.
		 */
		lp->ready_lsn = lp->lsn;
		if (IS_ENV_REPLICATED(env)) {
			/* The bulk buffer is stored as a region offset. */
			if ((ret =
			    __env_alloc(&dblp->reginfo, MEGABYTE, &bulk)) != 0)
				goto err;
			lp->bulk_buf = R_OFFSET(&dblp->reginfo, bulk);
			lp->bulk_len = MEGABYTE;
			lp->bulk_off = 0;
			lp->wait_ts = env->rep_handle->request_gap;
			__os_gettime(env, &lp->rcvd_ts, 1);
		} else {
			lp->bulk_buf = INVALID_ROFF;
			lp->bulk_len = 0;
			lp->bulk_off = 0;
		}
	} else {
		/*
		 * A process joining the region may have reset the log file
		 * size, too.  If so, it only affects the next log file we
		 * create.  We need to check that the size is reasonable given
		 * the buffer size in the region.
		 */
		LOG_SYSTEM_LOCK(env);
		region_locked = 1;

		 if (dbenv->lg_size != 0) {
			if ((ret =
			    __log_check_sizes(env, dbenv->lg_size, 0)) != 0)
				goto err;

			lp->log_nsize = dbenv->lg_size;
		 }

		LOG_SYSTEM_UNLOCK(env);
		region_locked = 0;
	}

	return (0);

	/*
	 * Error cleanup.  The unlock and detach are only attempted when
	 * reginfo.addr is set (presumably by a successful attach).  The
	 * unlock is done before env->lg_handle is cleared below.
	 */
err:	if (dblp->reginfo.addr != NULL) {
		if (region_locked)
			LOG_SYSTEM_UNLOCK(env);
		(void)__env_region_detach(env, &dblp->reginfo, 0);
	}
	env->lg_handle = NULL;

	(void)__mutex_free(env, &dblp->mtx_dbreg);
	__os_free(env, dblp);

	return (ret);
}
180
/*
 * __log_init --
 *	Initialize a log region in shared memory.
 *
 *	Called from __log_open when this process created the region.
 *	Validates the configured file and buffer sizes, allocates the LOG
 *	structure, its mutexes and the log buffer from the region, and fills
 *	in the initial state from the DB_ENV configuration.  As a side effect
 *	env->lg_handle is set to dblp before the configured flags are
 *	migrated into the region.
 *
 *	Returns 0 on success or the error from the failing call; nothing
 *	allocated here is released on failure.
 */
static int
__log_init(env, dblp)
	ENV *env;
	DB_LOG *dblp;
{
	DB_ENV *dbenv;
	LOG *lp;
	int ret;
	void *p;

	dbenv = env->dbenv;

	/*
	 * This is the first point where we can validate the buffer size,
	 * because we know all three settings have been configured (file size,
	 * buffer size and the in-memory flag).
	 */
	if ((ret =
	   __log_check_sizes(env, dbenv->lg_size, dbenv->lg_bsize)) != 0)
		return (ret);

	/*
	 * Allocate the LOG structure and record its offset in the region
	 * so that __log_open can locate it.  A failure here shares the
	 * mem_err message with the buffer allocation further down.
	 */
	if ((ret = __env_alloc(&dblp->reginfo,
	    sizeof(*lp), &dblp->reginfo.primary)) != 0)
		goto mem_err;
	dblp->reginfo.rp->primary =
	    R_OFFSET(&dblp->reginfo, dblp->reginfo.primary);
	lp = dblp->reginfo.primary;
	memset(lp, 0, sizeof(*lp));

	if ((ret =
	    __mutex_alloc(env, MTX_LOG_REGION, 0, &lp->mtx_region)) != 0)
		return (ret);

	/* Start with an empty file-name queue and no recycled file IDs. */
	lp->fid_max = 0;
	SH_TAILQ_INIT(&lp->fq);
	lp->free_fid_stack = INVALID_ROFF;
	lp->free_fids = lp->free_fids_alloced = 0;

	/* Initialize LOG LSNs. */
	INIT_LSN(lp->lsn);
	INIT_LSN(lp->t_lsn);

	/*
	 * It's possible to be waiting for an LSN of [1][0], if a replication
	 * client gets the first log record out of order.  An LSN of [0][0]
	 * signifies that we're not waiting.
	 */
	ZERO_LSN(lp->waiting_lsn);

	/*
	 * Log makes note of the fact that it ran into a checkpoint on
	 * startup if it did so, as a recovery optimization.  A zero
	 * LSN signifies that it hasn't found one [yet].
	 */
	ZERO_LSN(lp->cached_ckp_lsn);

	if ((ret =
	    __mutex_alloc(env, MTX_LOG_FILENAME, 0, &lp->mtx_filelist)) != 0)
		return (ret);
	if ((ret = __mutex_alloc(env, MTX_LOG_FLUSH, 0, &lp->mtx_flush)) != 0)
		return (ret);

	/* Initialize the buffer. */
	if ((ret = __env_alloc(&dblp->reginfo, dbenv->lg_bsize, &p)) != 0) {
mem_err:	__db_errx( env, "unable to allocate log region memory");
		return (ret);
	}
	lp->regionmax = dbenv->lg_regionmax;
	lp->buffer_off = R_OFFSET(&dblp->reginfo, p);
	lp->buffer_size = dbenv->lg_bsize;
	lp->filemode = dbenv->lg_filemode;
	lp->log_size = lp->log_nsize = dbenv->lg_size;

	/* Initialize the commit Queue. */
	SH_TAILQ_INIT(&lp->free_commits);
	SH_TAILQ_INIT(&lp->commits);
	lp->ncommit = 0;

	/* Initialize the logfiles list for in-memory logs. */
	SH_TAILQ_INIT(&lp->logfiles);
	SH_TAILQ_INIT(&lp->free_logfiles);

	/*
	 * Fill in the log's persistent header.  Don't fill in the log file
	 * sizes, as they may change at any time and so have to be filled in
	 * as each log file is created.
	 */
	lp->persist.magic = DB_LOGMAGIC;
	/*
	 * Don't use __log_set_version because env->dblp isn't set up yet.
	 */
	lp->persist.version = DB_LOGVERSION;
	lp->persist.notused = 0;
	/*
	 * Publish the handle before the call below; presumably
	 * __log_set_config_int reaches the region through env->lg_handle.
	 */
	env->lg_handle = dblp;

	/* Migrate persistent flags from the ENV into the region. */
	if (dbenv->lg_flags != 0 &&
	    (ret = __log_set_config_int(dbenv, dbenv->lg_flags, 1, 1)) != 0)
		return (ret);

	(void)time(&lp->timestamp);
	return (0);
}
288
289/*
290 * __log_recover --
291 *	Recover a log.
292 */
293static int
294__log_recover(dblp)
295	DB_LOG *dblp;
296{
297	DBT dbt;
298	DB_ENV *dbenv;
299	DB_LOGC *logc;
300	DB_LSN lsn;
301	ENV *env;
302	LOG *lp;
303	u_int32_t cnt, rectype;
304	int ret;
305	logfile_validity status;
306
307	env = dblp->env;
308	dbenv = env->dbenv;
309	logc = NULL;
310	lp = dblp->reginfo.primary;
311
312	/*
313	 * Find a log file.  If none exist, we simply return, leaving
314	 * everything initialized to a new log.
315	 */
316	if ((ret = __log_find(dblp, 0, &cnt, &status)) != 0)
317		return (ret);
318	if (cnt == 0)
319		return (0);
320
321	/*
322	 * If the last file is an old, unreadable version, start a new
323	 * file.  Don't bother finding the end of the last log file;
324	 * we assume that it's valid in its entirety, since the user
325	 * should have shut down cleanly or run recovery before upgrading.
326	 */
327	if (status == DB_LV_OLD_UNREADABLE) {
328		lp->lsn.file = lp->s_lsn.file = cnt + 1;
329		lp->lsn.offset = lp->s_lsn.offset = 0;
330		goto skipsearch;
331	}
332	DB_ASSERT(env,
333	    (status == DB_LV_NORMAL || status == DB_LV_OLD_READABLE));
334
335	/*
336	 * We have the last useful log file and we've loaded any persistent
337	 * information.  Set the end point of the log past the end of the last
338	 * file. Read the last file, looking for the last checkpoint and
339	 * the log's end.
340	 */
341	lp->lsn.file = cnt + 1;
342	lp->lsn.offset = 0;
343	lsn.file = cnt;
344	lsn.offset = 0;
345
346	/*
347	 * Allocate a cursor and set it to the first record.  This shouldn't
348	 * fail, leave error messages on.
349	 */
350	if ((ret = __log_cursor(env, &logc)) != 0)
351		return (ret);
352	F_SET(logc, DB_LOG_LOCKED);
353	memset(&dbt, 0, sizeof(dbt));
354	if ((ret = __logc_get(logc, &lsn, &dbt, DB_SET)) != 0)
355		goto err;
356
357	/*
358	 * Read to the end of the file.  This may fail at some point, so
359	 * turn off error messages.
360	 */
361	F_SET(logc, DB_LOG_SILENT_ERR);
362	while (__logc_get(logc, &lsn, &dbt, DB_NEXT) == 0) {
363		if (dbt.size < sizeof(u_int32_t))
364			continue;
365		LOGCOPY_32(env, &rectype, dbt.data);
366		if (rectype == DB___txn_ckp)
367			/*
368			 * If we happen to run into a checkpoint, cache its
369			 * LSN so that the transaction system doesn't have
370			 * to walk this log file again looking for it.
371			 */
372			lp->cached_ckp_lsn = lsn;
373	}
374	F_CLR(logc, DB_LOG_SILENT_ERR);
375
376	/*
377	 * We now know where the end of the log is.  Set the first LSN that
378	 * we want to return to an application and the LSN of the last known
379	 * record on disk.
380	 */
381	lp->lsn = lsn;
382	lp->s_lsn = lsn;
383	lp->lsn.offset += logc->len;
384	lp->s_lsn.offset += logc->len;
385
386	/* Set up the current buffer information, too. */
387	lp->len = logc->len;
388	lp->a_off = 0;
389	lp->b_off = 0;
390	lp->w_off = lp->lsn.offset;
391
392skipsearch:
393	if (FLD_ISSET(dbenv->verbose, DB_VERB_RECOVERY))
394		__db_msg(env,
395		    "Finding last valid log LSN: file: %lu offset %lu",
396		    (u_long)lp->lsn.file, (u_long)lp->lsn.offset);
397
398err:	if (logc != NULL)
399		(void)__logc_close(logc);
400
401	return (ret);
402}
403
404/*
405 * __log_find --
406 *	Try to find a log file.  If find_first is set, valp will contain
407 * the number of the first readable log file, else it will contain the number
408 * of the last log file (which may be too old to read).
409 *
410 * PUBLIC: int __log_find __P((DB_LOG *, int, u_int32_t *, logfile_validity *));
411 */
412int
413__log_find(dblp, find_first, valp, statusp)
414	DB_LOG *dblp;
415	int find_first;
416	u_int32_t *valp;
417	logfile_validity *statusp;
418{
419	ENV *env;
420	LOG *lp;
421	logfile_validity logval_status, status;
422	struct __db_filestart *filestart;
423	u_int32_t clv, logval;
424	int cnt, fcnt, ret;
425	const char *dir;
426	char *c, **names, *p, *q;
427
428	env = dblp->env;
429	lp = dblp->reginfo.primary;
430	logval_status = status = DB_LV_NONEXISTENT;
431
432	/* Return a value of 0 as the log file number on failure. */
433	*valp = 0;
434
435	if (lp->db_log_inmemory) {
436		filestart = find_first ?
437		    SH_TAILQ_FIRST(&lp->logfiles, __db_filestart) :
438		    SH_TAILQ_LAST(&lp->logfiles, links, __db_filestart);
439		if (filestart != NULL) {
440			*valp = filestart->file;
441			logval_status = DB_LV_NORMAL;
442		}
443		*statusp = logval_status;
444		return (0);
445	}
446
447	/* Find the directory name. */
448	if ((ret = __log_name(dblp, 1, &p, NULL, 0)) != 0) {
449		__os_free(env, p);
450		return (ret);
451	}
452	if ((q = __db_rpath(p)) == NULL)
453		dir = PATH_DOT;
454	else {
455		*q = '\0';
456		dir = p;
457	}
458
459	/* Get the list of file names. */
460	if ((ret = __os_dirlist(env, dir, 0, &names, &fcnt)) != 0) {
461		__db_err(env, ret, "%s", dir);
462		__os_free(env, p);
463		return (ret);
464	}
465
466	/* Search for a valid log file name. */
467	for (cnt = fcnt, clv = logval = 0; --cnt >= 0;) {
468		if (strncmp(names[cnt], LFPREFIX, sizeof(LFPREFIX) - 1) != 0)
469			continue;
470
471		/*
472		 * Names of the form log\.[0-9]* are reserved for DB.  Other
473		 * names sharing LFPREFIX, such as "log.db", are legal.
474		 */
475		for (c = names[cnt] + sizeof(LFPREFIX) - 1; *c != '\0'; c++)
476			if (!isdigit((int)*c))
477				break;
478		if (*c != '\0')
479			continue;
480
481		/*
482		 * Use atol, not atoi; if an "int" is 16-bits, the largest
483		 * log file name won't fit.
484		 */
485		clv = (u_int32_t)atol(names[cnt] + (sizeof(LFPREFIX) - 1));
486
487		/*
488		 * If searching for the first log file, we want to return the
489		 * oldest log file we can read, or, if no readable log files
490		 * exist, the newest log file we can't read (the crossover
491		 * point between the old and new versions of the log file).
492		 *
493		 * If we're searching for the last log file, we want to return
494		 * the newest log file, period.
495		 *
496		 * Readable log files should never precede unreadable log
497		 * files, that would mean the admin seriously screwed up.
498		 */
499		if (find_first) {
500			if (logval != 0 &&
501			    status != DB_LV_OLD_UNREADABLE && clv > logval)
502				continue;
503		} else
504			if (logval != 0 && clv < logval)
505				continue;
506
507		if ((ret = __log_valid(dblp, clv, 1, NULL, 0,
508		    &status, NULL)) != 0) {
509			__db_err(
510			    env, ret, "Invalid log file: %s", names[cnt]);
511			goto err;
512		}
513		switch (status) {
514		case DB_LV_NONEXISTENT:
515			/* __log_valid never returns DB_LV_NONEXISTENT. */
516			DB_ASSERT(env, 0);
517			break;
518		case DB_LV_INCOMPLETE:
519			/*
520			 * The last log file may not have been initialized --
521			 * it's possible to create a log file but not write
522			 * anything to it.  If performing recovery (that is,
523			 * if find_first isn't set), ignore the file, it's
524			 * not interesting.  If we're searching for the first
525			 * log record, return the file (assuming we don't find
526			 * something better), as the "real" first log record
527			 * is likely to be in the log buffer, and we want to
528			 * set the file LSN for our return.
529			 */
530			if (find_first)
531				goto found;
532			break;
533		case DB_LV_OLD_UNREADABLE:
534			/*
535			 * If we're searching for the first log file, then we
536			 * only want this file if we don't yet have a file or
537			 * already have an unreadable file and this one is
538			 * newer than that one.  If we're searching for the
539			 * last log file, we always want this file because we
540			 * wouldn't be here if it wasn't newer than our current
541			 * choice.
542			 */
543			if (!find_first || logval == 0 ||
544			    (status == DB_LV_OLD_UNREADABLE && clv > logval))
545				goto found;
546			break;
547		case DB_LV_NORMAL:
548		case DB_LV_OLD_READABLE:
549found:			logval = clv;
550			logval_status = status;
551			break;
552		}
553	}
554
555	*valp = logval;
556
557err:	__os_dirfree(env, names, fcnt);
558	__os_free(env, p);
559	*statusp = logval_status;
560
561	return (ret);
562}
563
564/*
565 * log_valid --
566 *	Validate a log file.  Returns an error code in the event of
567 *	a fatal flaw in a the specified log file;  returns success with
568 *	a code indicating the currentness and completeness of the specified
569 *	log file if it is not unexpectedly flawed (that is, if it's perfectly
570 *	normal, if it's zero-length, or if it's an old version).
571 *
572 * PUBLIC: int __log_valid __P((DB_LOG *, u_int32_t, int,
573 * PUBLIC:     DB_FH **, u_int32_t, logfile_validity *, u_int32_t *));
574 */
575int
576__log_valid(dblp, number, set_persist, fhpp, flags, statusp, versionp)
577	DB_LOG *dblp;
578	u_int32_t number;
579	int set_persist;
580	DB_FH **fhpp;
581	u_int32_t flags;
582	logfile_validity *statusp;
583	u_int32_t *versionp;
584{
585	DB_CIPHER *db_cipher;
586	DB_FH *fhp;
587	ENV *env;
588	HDR *hdr;
589	LOG *lp;
590	LOGP *persist;
591	logfile_validity status;
592	size_t hdrsize, nr, recsize;
593	int is_hmac, ret;
594	u_int8_t *tmp;
595	char *fname;
596
597	env = dblp->env;
598	db_cipher = env->crypto_handle;
599	fhp = NULL;
600	persist = NULL;
601	status = DB_LV_NORMAL;
602	tmp = NULL;
603
604	/* Return the file handle to our caller, on request */
605	if (fhpp != NULL)
606		*fhpp = NULL;
607
608	if (flags == 0)
609		flags = DB_OSO_RDONLY | DB_OSO_SEQ;
610	/* Try to open the log file. */
611	if ((ret = __log_name(dblp, number, &fname, &fhp, flags)) != 0) {
612		__os_free(env, fname);
613		return (ret);
614	}
615
616	hdrsize = HDR_NORMAL_SZ;
617	is_hmac = 0;
618	recsize = sizeof(LOGP);
619	if (CRYPTO_ON(env)) {
620		hdrsize = HDR_CRYPTO_SZ;
621		recsize = sizeof(LOGP);
622		recsize += db_cipher->adj_size(recsize);
623		is_hmac = 1;
624	}
625	if ((ret = __os_calloc(env, 1, recsize + hdrsize, &tmp)) != 0)
626		goto err;
627
628	hdr = (HDR *)tmp;
629	persist = (LOGP *)(tmp + hdrsize);
630
631	/*
632	 * Try to read the header.  This can fail if the log is truncated, or
633	 * if we find a preallocated log file where the header has not yet been
634	 * written, so we need to check whether the header is zero-filled.
635	 */
636	if ((ret = __os_read(env, fhp, tmp, recsize + hdrsize, &nr)) != 0 ||
637	    nr != recsize + hdrsize ||
638	    (hdr->len == 0 && persist->magic == 0 && persist->log_size == 0)) {
639		if (ret == 0)
640			status = DB_LV_INCOMPLETE;
641		else
642			/*
643			 * The error was a fatal read error, not just an
644			 * incompletely initialized log file.
645			 */
646			__db_err(env, ret, "ignoring log file: %s", fname);
647		goto err;
648	}
649
650	if (LOG_SWAPPED(env))
651		__log_hdrswap(hdr, CRYPTO_ON(env));
652
653	/*
654	 * Now we have to validate the persistent record.  We have
655	 * several scenarios we have to deal with:
656	 *
657	 * 1.  User has crypto turned on:
658	 *	- They're reading an old, unencrypted log file
659	 *	  .  We will fail the record size match check below.
660	 *	- They're reading a current, unencrypted log file
661	 *	  .  We will fail the record size match check below.
662	 *	- They're reading an old, encrypted log file [NOT YET]
663	 *	  .  After decryption we'll fail the version check.  [NOT YET]
664	 *	- They're reading a current, encrypted log file
665	 *	  .  We should proceed as usual.
666	 * 2.  User has crypto turned off:
667	 *	- They're reading an old, unencrypted log file
668	 *	  .  We will fail the version check.
669	 *	- They're reading a current, unencrypted log file
670	 *	  .  We should proceed as usual.
671	 *	- They're reading an old, encrypted log file [NOT YET]
672	 *	  .  We'll fail the magic number check (it is encrypted).
673	 *	- They're reading a current, encrypted log file
674	 *	  .  We'll fail the magic number check (it is encrypted).
675	 */
676	if (CRYPTO_ON(env)) {
677		/*
678		 * If we are trying to decrypt an unencrypted log
679		 * we can only detect that by having an unreasonable
680		 * data length for our persistent data.
681		 */
682		if ((hdr->len - hdrsize) != sizeof(LOGP)) {
683			__db_errx(env, "log record size mismatch");
684			goto err;
685		}
686		/* Check the checksum and decrypt. */
687		if ((ret = __db_check_chksum(env, hdr, db_cipher,
688		    &hdr->chksum[0], (u_int8_t *)persist,
689		    hdr->len - hdrsize, is_hmac)) != 0) {
690			__db_errx(env, "log record checksum mismatch");
691			goto err;
692		}
693
694		if ((ret = db_cipher->decrypt(env, db_cipher->data,
695		    &hdr->iv[0], (u_int8_t *)persist, hdr->len - hdrsize)) != 0)
696			goto err;
697	}
698
699	if (LOG_SWAPPED(env))
700		__log_persistswap(persist);
701
702	/* Validate the header. */
703	if (persist->magic != DB_LOGMAGIC) {
704		__db_errx(env,
705		    "Ignoring log file: %s: magic number %lx, not %lx",
706		    fname, (u_long)persist->magic, (u_long)DB_LOGMAGIC);
707		ret = EINVAL;
708		goto err;
709	}
710
711	/*
712	 * Set our status code to indicate whether the log file belongs to an
713	 * unreadable or readable old version; leave it alone if and only if
714	 * the log file version is the current one.
715	 */
716	if (persist->version > DB_LOGVERSION) {
717		/* This is a fatal error--the log file is newer than DB. */
718		__db_errx(env,
719		    "Unacceptable log file %s: unsupported log version %lu",
720		    fname, (u_long)persist->version);
721		ret = EINVAL;
722		goto err;
723	} else if (persist->version < DB_LOGOLDVER) {
724		status = DB_LV_OLD_UNREADABLE;
725		/* This is a non-fatal error, but give some feedback. */
726		__db_errx(env,
727		    "Skipping log file %s: historic log version %lu",
728		    fname, (u_long)persist->version);
729		/*
730		 * We don't want to set persistent info based on an unreadable
731		 * region, so jump to "err".
732		 */
733		goto err;
734	} else if (persist->version < DB_LOGVERSION)
735		status = DB_LV_OLD_READABLE;
736
737	/*
738	 * Only if we have a current log do we verify the checksum.  We could
739	 * not check the checksum before checking the magic and version because
740	 * old log headers put the length and checksum in a different location.
741	 * The checksum was calculated with the swapped byte order, so we need
742	 * to check it with the same bytes.
743	 */
744	if (!CRYPTO_ON(env)) {
745		if (LOG_SWAPPED(env))
746			__log_persistswap(persist);
747
748		if ((ret = __db_check_chksum(env,
749		    hdr, db_cipher, &hdr->chksum[0], (u_int8_t *)persist,
750		    hdr->len - hdrsize, is_hmac)) != 0) {
751			__db_errx(env, "log record checksum mismatch");
752			goto err;
753		}
754
755		if (LOG_SWAPPED(env))
756			__log_persistswap(persist);
757	}
758
759	/*
760	 * If the log is readable so far and we're doing system initialization,
761	 * set the region's persistent information based on the headers.
762	 *
763	 * Override the current log file size.
764	 */
765	if (set_persist) {
766		lp = dblp->reginfo.primary;
767		lp->log_size = persist->log_size;
768		lp->persist.version = persist->version;
769	}
770	if (versionp != NULL)
771		*versionp = persist->version;
772
773err:	if (fname != NULL)
774		__os_free(env, fname);
775	if (ret == 0 && fhpp != NULL)
776		*fhpp = fhp;
777	else
778		/* Must close on error or if we only used it locally. */
779		(void)__os_closehandle(env, fhp);
780	if (tmp != NULL)
781		__os_free(env, tmp);
782
783	if (statusp != NULL)
784		*statusp = status;
785
786	return (ret);
787}
788
/*
 * __log_env_refresh --
 *	Clean up after the log system on a close or failed open.
 *
 *	Flushes the log for private environments, closes registered files,
 *	tries to log any closes still marked as unlogged, returns region
 *	memory to the heap for private environments, then detaches from
 *	the region and frees the DB_LOG handle.  env->lg_handle is NULL on
 *	return.
 *
 *	Every step is attempted even if an earlier one failed.  The first
 *	error is the one returned, except that a failure while logging an
 *	unlogged close replaces any earlier error.
 *
 * PUBLIC: int __log_env_refresh __P((ENV *));
 */
int
__log_env_refresh(env)
	ENV *env;
{
	DB_LOG *dblp;
	LOG *lp;
	REGINFO *reginfo;
	struct __fname *fnp;
	struct __db_commit *commit;
	struct __db_filestart *filestart;
	int ret, t_ret;

	dblp = env->lg_handle;
	reginfo = &dblp->reginfo;
	lp = reginfo->primary;
	ret = 0;

	/*
	 * Flush the log if it's private -- there's no Berkeley DB guarantee
	 * that this gets done, but in case the application has forgotten to
	 * flush for durability, it's the polite thing to do.
	 */
	if (F_ISSET(env, ENV_PRIVATE) &&
	    (t_ret = __log_flush(env, NULL)) != 0 && ret == 0)
		ret = t_ret;

	/* We may have opened files as part of XA; if so, close them. */
	if ((t_ret = __dbreg_close_files(env, 0)) != 0 && ret == 0)
		ret = t_ret;

	/*
	 * After we close the files, check for any unlogged closes left in
	 * the shared memory queue.  If we find any, try to log it, otherwise
	 * return the error.  We cannot say the environment was closed
	 * cleanly.
	 *
	 * Unlike the other steps, a failure here overwrites ret even if an
	 * earlier error was already recorded.
	 */
	MUTEX_LOCK(env, lp->mtx_filelist);
	SH_TAILQ_FOREACH(fnp, &lp->fq, q, __fname)
		if (F_ISSET(fnp, DB_FNAME_NOTLOGGED) &&
		    (t_ret = __dbreg_close_id_int(
		    env, fnp, DBREG_CLOSE, 1)) != 0)
			ret = t_ret;
	MUTEX_UNLOCK(env, lp->mtx_filelist);

	/*
	 * If a private region, return the memory to the heap.  Not needed for
	 * filesystem-backed or system shared memory regions, that memory isn't
	 * owned by any particular process.
	 */
	if (F_ISSET(env, ENV_PRIVATE)) {
		/* Discard the flush mutex. */
		if ((t_ret =
		    __mutex_free(env, &lp->mtx_flush)) != 0 && ret == 0)
			ret = t_ret;

		/* Discard the buffer. */
		__env_alloc_free(reginfo, R_ADDR(reginfo, lp->buffer_off));

		/* Discard stack of free file IDs. */
		if (lp->free_fid_stack != INVALID_ROFF)
			__env_alloc_free(reginfo,
			    R_ADDR(reginfo, lp->free_fid_stack));

		/* Discard the list of in-memory log file markers. */
		while ((filestart = SH_TAILQ_FIRST(&lp->logfiles,
		    __db_filestart)) != NULL) {
			SH_TAILQ_REMOVE(&lp->logfiles, filestart, links,
			    __db_filestart);
			__env_alloc_free(reginfo, filestart);
		}

		/* Discard the recycled in-memory log file markers as well. */
		while ((filestart = SH_TAILQ_FIRST(&lp->free_logfiles,
		    __db_filestart)) != NULL) {
			SH_TAILQ_REMOVE(&lp->free_logfiles, filestart, links,
			    __db_filestart);
			__env_alloc_free(reginfo, filestart);
		}

		/* Discard the free commit queue elements. */
		while ((commit = SH_TAILQ_FIRST(&lp->free_commits,
		    __db_commit)) != NULL) {
			SH_TAILQ_REMOVE(&lp->free_commits, commit, links,
			    __db_commit);
			__env_alloc_free(reginfo, commit);
		}

		/* Discard replication bulk buffer. */
		if (lp->bulk_buf != INVALID_ROFF) {
			__env_alloc_free(reginfo,
			    R_ADDR(reginfo, lp->bulk_buf));
			lp->bulk_buf = INVALID_ROFF;
		}
	}

	/* Discard the per-thread DBREG mutex. */
	if ((t_ret = __mutex_free(env, &dblp->mtx_dbreg)) != 0 && ret == 0)
		ret = t_ret;

	/* Detach from the region. */
	if ((t_ret = __env_region_detach(env, reginfo, 0)) != 0 && ret == 0)
		ret = t_ret;

	/* Close open files, release allocated memory. */
	if (dblp->lfhp != NULL) {
		if ((t_ret =
		    __os_closehandle(env, dblp->lfhp)) != 0 && ret == 0)
			ret = t_ret;
		dblp->lfhp = NULL;
	}
	if (dblp->dbentry != NULL)
		__os_free(env, dblp->dbentry);

	__os_free(env, dblp);

	env->lg_handle = NULL;
	return (ret);
}
912
913/*
914 * __log_get_cached_ckp_lsn --
915 *	Retrieve any last checkpoint LSN that we may have found on startup.
916 *
917 * PUBLIC: int __log_get_cached_ckp_lsn __P((ENV *, DB_LSN *));
918 */
919int
920__log_get_cached_ckp_lsn(env, ckp_lsnp)
921	ENV *env;
922	DB_LSN *ckp_lsnp;
923{
924	DB_LOG *dblp;
925	LOG *lp;
926
927	dblp = env->lg_handle;
928	lp = (LOG *)dblp->reginfo.primary;
929
930	LOG_SYSTEM_LOCK(env);
931	*ckp_lsnp = lp->cached_ckp_lsn;
932	LOG_SYSTEM_UNLOCK(env);
933
934	return (0);
935}
936
937/*
938 * __log_region_mutex_count --
939 *	Return the number of mutexes the log region will need.
940 *
941 * PUBLIC: u_int32_t __log_region_mutex_count __P((ENV *));
942 */
943u_int32_t
944__log_region_mutex_count(env)
945	ENV *env;
946{
947	/*
948	 * We need a few assorted mutexes, and one per transaction waiting
949	 * on the group commit list.  We can't know how many that will be,
950	 * but it should be bounded by the maximum active transactions.
951	 */
952	return (env->dbenv->tx_max + 5);
953}
954
955/*
956 * __log_region_size --
957 *	Return the amount of space needed for the log region.
958 *	Make the region large enough to hold txn_max transaction
959 *	detail structures  plus some space to hold thread handles
960 *	and the beginning of the alloc region and anything we
961 *	need for mutex system resource recording.
962 */
963static size_t
964__log_region_size(env)
965	ENV *env;
966{
967	DB_ENV *dbenv;
968	size_t s;
969
970	dbenv = env->dbenv;
971
972	s = dbenv->lg_regionmax + dbenv->lg_bsize;
973
974	/*
975	 * If running with replication, add in space for bulk buffer.
976	 * Allocate a megabyte and a little bit more space.
977	 */
978	if (IS_ENV_REPLICATED(env))
979		s += MEGABYTE;
980
981	return (s);
982}
983
984/*
985 * __log_vtruncate
986 *	This is a virtual truncate.  We set up the log indicators to
987 * make everyone believe that the given record is the last one in the
988 * log.  Returns with the next valid LSN (i.e., the LSN of the next
989 * record to be written). This is used in replication to discard records
990 * in the log file that do not agree with the master.
991 *
992 * PUBLIC: int __log_vtruncate __P((ENV *, DB_LSN *, DB_LSN *, DB_LSN *));
993 */
994int
995__log_vtruncate(env, lsn, ckplsn, trunclsn)
996	ENV *env;
997	DB_LSN *lsn, *ckplsn, *trunclsn;
998{
999	DBT log_dbt;
1000	DB_LOG *dblp;
1001	DB_LOGC *logc;
1002	LOG *lp;
1003	u_int32_t bytes, len;
1004	int ret, t_ret;
1005
1006	/* Need to find out the length of this soon-to-be-last record. */
1007	if ((ret = __log_cursor(env, &logc)) != 0)
1008		return (ret);
1009	memset(&log_dbt, 0, sizeof(log_dbt));
1010	ret = __logc_get(logc, lsn, &log_dbt, DB_SET);
1011	len = logc->len;
1012	if ((t_ret = __logc_close(logc)) != 0 && ret == 0)
1013		ret = t_ret;
1014	if (ret != 0)
1015		return (ret);
1016
1017	/* Now do the truncate. */
1018	dblp = env->lg_handle;
1019	lp = (LOG *)dblp->reginfo.primary;
1020
1021	LOG_SYSTEM_LOCK(env);
1022
1023	/*
1024	 * Flush the log so we can simply initialize the in-memory buffer
1025	 * after the truncate.
1026	 */
1027	if ((ret = __log_flush_int(dblp, NULL, 0)) != 0)
1028		goto err;
1029
1030	lp->lsn = *lsn;
1031	lp->len = len;
1032	lp->lsn.offset += lp->len;
1033
1034	if (lp->db_log_inmemory &&
1035	    (ret = __log_inmem_lsnoff(dblp, &lp->lsn, &lp->b_off)) != 0)
1036		goto err;
1037
1038	/*
1039	 * I am going to assume that the number of bytes written since
1040	 * the last checkpoint doesn't exceed a 32-bit number.
1041	 */
1042	DB_ASSERT(env, lp->lsn.file >= ckplsn->file);
1043	bytes = 0;
1044	if (ckplsn->file != lp->lsn.file) {
1045		bytes = lp->log_size - ckplsn->offset;
1046		if (lp->lsn.file > ckplsn->file + 1)
1047			bytes += lp->log_size *
1048			    ((lp->lsn.file - ckplsn->file) - 1);
1049		bytes += lp->lsn.offset;
1050	} else
1051		bytes = lp->lsn.offset - ckplsn->offset;
1052
1053	lp->stat.st_wc_mbytes += bytes / MEGABYTE;
1054	lp->stat.st_wc_bytes += bytes % MEGABYTE;
1055
1056	/*
1057	 * If the synced lsn is greater than our new end of log, reset it
1058	 * to our current end of log.
1059	 */
1060	MUTEX_LOCK(env, lp->mtx_flush);
1061	if (LOG_COMPARE(&lp->s_lsn, lsn) > 0)
1062		lp->s_lsn = lp->lsn;
1063	MUTEX_UNLOCK(env, lp->mtx_flush);
1064
1065	/* Initialize the in-region buffer to a pristine state. */
1066	ZERO_LSN(lp->f_lsn);
1067	lp->w_off = lp->lsn.offset;
1068
1069	if (trunclsn != NULL)
1070		*trunclsn = lp->lsn;
1071
1072	/* Truncate the log to the new point. */
1073	if ((ret = __log_zero(env, &lp->lsn)) != 0)
1074		goto err;
1075
1076err:	LOG_SYSTEM_UNLOCK(env);
1077	return (ret);
1078}
1079
1080/*
1081 * __log_is_outdated --
1082 *	Used by the replication system to identify if a client's logs are too
1083 *	old.
1084 *
1085 * PUBLIC: int __log_is_outdated __P((ENV *, u_int32_t, int *));
1086 */
1087int
1088__log_is_outdated(env, fnum, outdatedp)
1089	ENV *env;
1090	u_int32_t fnum;
1091	int *outdatedp;
1092{
1093	DB_LOG *dblp;
1094	LOG *lp;
1095	char *name;
1096	int ret;
1097	u_int32_t cfile;
1098	struct __db_filestart *filestart;
1099
1100	dblp = env->lg_handle;
1101
1102	/*
1103	 * The log represented by env is compared to the file number passed
1104	 * in fnum.  If the log file fnum does not exist and is lower-numbered
1105	 * than the current logs, return *outdatedp non-zero, else we return 0.
1106	 */
1107	if (FLD_ISSET(env->dbenv->lg_flags, DB_LOG_IN_MEMORY)) {
1108		LOG_SYSTEM_LOCK(env);
1109		lp = (LOG *)dblp->reginfo.primary;
1110		filestart = SH_TAILQ_FIRST(&lp->logfiles, __db_filestart);
1111		*outdatedp = filestart == NULL ? 0 : (fnum < filestart->file);
1112		LOG_SYSTEM_UNLOCK(env);
1113		return (0);
1114	}
1115
1116	*outdatedp = 0;
1117	if ((ret = __log_name(dblp, fnum, &name, NULL, 0)) != 0) {
1118		__os_free(env, name);
1119		return (ret);
1120	}
1121
1122	/* If the file exists, we're just fine. */
1123	if (__os_exists(env, name, NULL) == 0)
1124		goto out;
1125
1126	/*
1127	 * It didn't exist, decide if the file number is too big or
1128	 * too little.  If it's too little, then we need to indicate
1129	 * that the LSN is outdated.
1130	 */
1131	LOG_SYSTEM_LOCK(env);
1132	lp = (LOG *)dblp->reginfo.primary;
1133	cfile = lp->lsn.file;
1134	LOG_SYSTEM_UNLOCK(env);
1135
1136	if (cfile > fnum)
1137		*outdatedp = 1;
1138out:	__os_free(env, name);
1139	return (ret);
1140}
1141
1142/*
1143 * __log_zero --
1144 *	Zero out the tail of a log after a truncate.
1145 *
1146 * PUBLIC: int __log_zero __P((ENV *, DB_LSN *));
1147 */
1148int
1149__log_zero(env, from_lsn)
1150	ENV *env;
1151	DB_LSN *from_lsn;
1152{
1153	DB_FH *fhp;
1154	DB_LOG *dblp;
1155	LOG *lp;
1156	struct __db_filestart *filestart, *nextstart;
1157	size_t nbytes, len, nw;
1158	u_int32_t fn, mbytes, bytes;
1159	u_int8_t buf[4096];
1160	int ret;
1161	char *fname;
1162
1163	dblp = env->lg_handle;
1164	lp = (LOG *)dblp->reginfo.primary;
1165	DB_ASSERT(env, LOG_COMPARE(from_lsn, &lp->lsn) <= 0);
1166	if (LOG_COMPARE(from_lsn, &lp->lsn) > 0) {
1167		__db_errx(env,
1168		    "Warning: truncating to point beyond end of log");
1169		return (0);
1170	}
1171
1172	if (lp->db_log_inmemory) {
1173		/*
1174		 * Remove the files that are invalidated by this truncate.
1175		 */
1176		for (filestart = SH_TAILQ_FIRST(&lp->logfiles, __db_filestart);
1177		    filestart != NULL; filestart = nextstart) {
1178			nextstart = SH_TAILQ_NEXT(filestart,
1179			    links, __db_filestart);
1180			if (filestart->file > from_lsn->file) {
1181				SH_TAILQ_REMOVE(&lp->logfiles,
1182				    filestart, links, __db_filestart);
1183				SH_TAILQ_INSERT_HEAD(&lp->free_logfiles,
1184				    filestart, links, __db_filestart);
1185			}
1186		}
1187
1188		return (0);
1189	}
1190
1191	/* Close any open file handles so unlinks don't fail. */
1192	if (dblp->lfhp != NULL) {
1193		(void)__os_closehandle(env, dblp->lfhp);
1194		dblp->lfhp = NULL;
1195	}
1196
1197	/* Throw away any extra log files that we have around. */
1198	for (fn = from_lsn->file + 1;; fn++) {
1199		if (__log_name(dblp, fn, &fname, &fhp, DB_OSO_RDONLY) != 0) {
1200			__os_free(env, fname);
1201			break;
1202		}
1203		(void)__os_closehandle(env, fhp);
1204		(void)time(&lp->timestamp);
1205		ret = __os_unlink(env, fname, 0);
1206		__os_free(env, fname);
1207		if (ret != 0)
1208			return (ret);
1209	}
1210
1211	/* We removed some log files; have to 0 to end of file. */
1212	if ((ret =
1213	    __log_name(dblp, from_lsn->file, &fname, &dblp->lfhp, 0)) != 0) {
1214		__os_free(env, fname);
1215		return (ret);
1216	}
1217	__os_free(env, fname);
1218	if ((ret = __os_ioinfo(env,
1219	    NULL, dblp->lfhp, &mbytes, &bytes, NULL)) != 0)
1220		goto err;
1221	DB_ASSERT(env, (mbytes * MEGABYTE + bytes) >= from_lsn->offset);
1222	len = (mbytes * MEGABYTE + bytes) - from_lsn->offset;
1223
1224	memset(buf, 0, sizeof(buf));
1225
1226	/* Initialize the write position. */
1227	if ((ret = __os_seek(env, dblp->lfhp, 0, 0, from_lsn->offset)) != 0)
1228		goto err;
1229
1230	while (len > 0) {
1231		nbytes = len > sizeof(buf) ? sizeof(buf) : len;
1232		if ((ret =
1233		    __os_write(env, dblp->lfhp, buf, nbytes, &nw)) != 0)
1234			goto err;
1235		len -= nbytes;
1236	}
1237
1238err:	(void)__os_closehandle(env, dblp->lfhp);
1239	dblp->lfhp = NULL;
1240
1241	return (ret);
1242}
1243
1244/*
1245 * __log_inmem_lsnoff --
1246 *	Find the offset in the buffer of a given LSN.
1247 *
1248 * PUBLIC: int __log_inmem_lsnoff __P((DB_LOG *, DB_LSN *, size_t *));
1249 */
1250int
1251__log_inmem_lsnoff(dblp, lsnp, offsetp)
1252	DB_LOG *dblp;
1253	DB_LSN *lsnp;
1254	size_t *offsetp;
1255{
1256	LOG *lp;
1257	struct __db_filestart *filestart;
1258
1259	lp = (LOG *)dblp->reginfo.primary;
1260
1261	SH_TAILQ_FOREACH(filestart, &lp->logfiles, links, __db_filestart)
1262		if (filestart->file == lsnp->file) {
1263			*offsetp =
1264			    (filestart->b_off + lsnp->offset) % lp->buffer_size;
1265			return (0);
1266		}
1267
1268	return (DB_NOTFOUND);
1269}
1270
1271/*
1272 * __log_inmem_newfile --
1273 *	Records the offset of the beginning of a new file in the in-memory
1274 *	buffer.
1275 *
1276 * PUBLIC: int __log_inmem_newfile __P((DB_LOG *, u_int32_t));
1277 */
1278int
1279__log_inmem_newfile(dblp, file)
1280	DB_LOG *dblp;
1281	u_int32_t file;
1282{
1283	HDR hdr;
1284	LOG *lp;
1285	struct __db_filestart *filestart;
1286	int ret;
1287#ifdef DIAGNOSTIC
1288	struct __db_filestart *first, *last;
1289#endif
1290
1291	lp = (LOG *)dblp->reginfo.primary;
1292
1293	/*
1294	 * If the log buffer is empty, reuse the filestart entry.
1295	 */
1296	filestart = SH_TAILQ_FIRST(&lp->logfiles, __db_filestart);
1297	if (filestart != NULL &&
1298	    RINGBUF_LEN(lp, filestart->b_off, lp->b_off) <=
1299	    sizeof(HDR) + sizeof(LOGP)) {
1300		filestart->file = file;
1301		filestart->b_off = lp->b_off;
1302		return (0);
1303	}
1304
1305	/*
1306	 * We write an empty header at the end of every in-memory log file.
1307	 * This is used during cursor traversal to indicate when to switch the
1308	 * LSN to the next file.
1309	 */
1310	if (file > 1) {
1311		memset(&hdr, 0, sizeof(HDR));
1312		__log_inmem_copyin(dblp, lp->b_off, &hdr, sizeof(HDR));
1313		lp->b_off = (lp->b_off + sizeof(HDR)) % lp->buffer_size;
1314	}
1315
1316	filestart = SH_TAILQ_FIRST(&lp->free_logfiles, __db_filestart);
1317	if (filestart == NULL) {
1318		if ((ret = __env_alloc(&dblp->reginfo,
1319		    sizeof(struct __db_filestart), &filestart)) != 0)
1320			return (ret);
1321		memset(filestart, 0, sizeof(*filestart));
1322	} else
1323		SH_TAILQ_REMOVE(&lp->free_logfiles, filestart,
1324		    links, __db_filestart);
1325
1326	filestart->file = file;
1327	filestart->b_off = lp->b_off;
1328
1329#ifdef DIAGNOSTIC
1330	first = SH_TAILQ_FIRST(&lp->logfiles, __db_filestart);
1331	last = SH_TAILQ_LAST(&(lp)->logfiles, links, __db_filestart);
1332
1333	/* Check that we don't wrap. */
1334	DB_ASSERT(dblp->env, !first || first == last ||
1335	    RINGBUF_LEN(lp, first->b_off, lp->b_off) ==
1336	    RINGBUF_LEN(lp, first->b_off, last->b_off) +
1337	    RINGBUF_LEN(lp, last->b_off, lp->b_off));
1338#endif
1339
1340	SH_TAILQ_INSERT_TAIL(&lp->logfiles, filestart, links);
1341	return (0);
1342}
1343
1344/*
1345 * __log_inmem_chkspace --
1346 *	Ensure that the requested amount of space is available in the buffer,
1347 *	and invalidate the region.
1348 *      Note: assumes that the region lock is held on entry.
1349 *
1350 * PUBLIC: int __log_inmem_chkspace __P((DB_LOG *, size_t));
1351 */
1352int
1353__log_inmem_chkspace(dblp, len)
1354	DB_LOG *dblp;
1355	size_t len;
1356{
1357	DB_LSN active_lsn, old_active_lsn;
1358	ENV *env;
1359	LOG *lp;
1360	struct __db_filestart *filestart;
1361	int ret;
1362
1363	env = dblp->env;
1364	lp = dblp->reginfo.primary;
1365
1366	DB_ASSERT(env, lp->db_log_inmemory);
1367
1368	/*
1369	 * Allow room for an extra header so that we don't need to check for
1370	 * space when switching files.
1371	 */
1372	len += sizeof(HDR);
1373
1374	/*
1375	 * If transactions are enabled and we're about to fill available space,
1376	 * update the active LSN and recheck.  If transactions aren't enabled,
1377	 * don't even bother checking: in that case we can always overwrite old
1378	 * log records, because we're never going to abort.
1379	 */
1380	while (TXN_ON(env) &&
1381	    RINGBUF_LEN(lp, lp->b_off, lp->a_off) <= len) {
1382		old_active_lsn = lp->active_lsn;
1383		active_lsn = lp->lsn;
1384
1385		/*
1386		 * Drop the log region lock so we don't hold it while
1387		 * taking the transaction region lock.
1388		 */
1389		LOG_SYSTEM_UNLOCK(env);
1390		ret = __txn_getactive(env, &active_lsn);
1391		LOG_SYSTEM_LOCK(env);
1392		if (ret != 0)
1393			return (ret);
1394		active_lsn.offset = 0;
1395
1396		/* If we didn't make any progress, give up. */
1397		if (LOG_COMPARE(&active_lsn, &old_active_lsn) == 0) {
1398			__db_errx(env,
1399      "In-memory log buffer is full (an active transaction spans the buffer)");
1400			return (DB_LOG_BUFFER_FULL);
1401		}
1402
1403		/* Make sure we're moving the region LSN forwards. */
1404		if (LOG_COMPARE(&active_lsn, &lp->active_lsn) > 0) {
1405			lp->active_lsn = active_lsn;
1406			(void)__log_inmem_lsnoff(dblp, &active_lsn,
1407			    &lp->a_off);
1408		}
1409	}
1410
1411	/*
1412	 * Remove the first file if it is invalidated by this write.
1413	 * Log records can't be bigger than a file, so we only need to
1414	 * check the first file.
1415	 */
1416	filestart = SH_TAILQ_FIRST(&lp->logfiles, __db_filestart);
1417	if (filestart != NULL &&
1418	    RINGBUF_LEN(lp, lp->b_off, filestart->b_off) <= len) {
1419		SH_TAILQ_REMOVE(&lp->logfiles, filestart,
1420		    links, __db_filestart);
1421		SH_TAILQ_INSERT_HEAD(&lp->free_logfiles, filestart,
1422		    links, __db_filestart);
1423		lp->f_lsn.file = filestart->file + 1;
1424	}
1425
1426	return (0);
1427}
1428
1429/*
1430 * __log_inmem_copyout --
1431 *	Copies the given number of bytes from the buffer -- no checking.
1432 *      Note: assumes that the region lock is held on entry.
1433 *
1434 * PUBLIC: void __log_inmem_copyout __P((DB_LOG *, size_t, void *, size_t));
1435 */
1436void
1437__log_inmem_copyout(dblp, offset, buf, size)
1438	DB_LOG *dblp;
1439	size_t offset;
1440	void *buf;
1441	size_t size;
1442{
1443	LOG *lp;
1444	size_t nbytes;
1445
1446	lp = (LOG *)dblp->reginfo.primary;
1447	nbytes = (offset + size < lp->buffer_size) ?
1448	    size : lp->buffer_size - offset;
1449	memcpy(buf, dblp->bufp + offset, nbytes);
1450	if (nbytes < size)
1451		memcpy((u_int8_t *)buf + nbytes, dblp->bufp, size - nbytes);
1452}
1453
1454/*
1455 * __log_inmem_copyin --
1456 *	Copies the given number of bytes into the buffer -- no checking.
1457 *      Note: assumes that the region lock is held on entry.
1458 *
1459 * PUBLIC: void __log_inmem_copyin __P((DB_LOG *, size_t, void *, size_t));
1460 */
1461void
1462__log_inmem_copyin(dblp, offset, buf, size)
1463	DB_LOG *dblp;
1464	size_t offset;
1465	void *buf;
1466	size_t size;
1467{
1468	LOG *lp;
1469	size_t nbytes;
1470
1471	lp = (LOG *)dblp->reginfo.primary;
1472	nbytes = (offset + size < lp->buffer_size) ?
1473	    size : lp->buffer_size - offset;
1474	memcpy(dblp->bufp + offset, buf, nbytes);
1475	if (nbytes < size)
1476		memcpy(dblp->bufp, (u_int8_t *)buf + nbytes, size - nbytes);
1477}
1478
1479/*
1480 * __log_set_version --
1481 *	Sets the current version of the log subsystem to the given version.
1482 *	Essentially this modifies the lp->persist.version field in the
1483 *	shared memory region.  Called when region is initially created
1484 *	and when replication is starting up or finds a new master.
1485 *
1486 * PUBLIC: void __log_set_version __P((ENV *, u_int32_t));
1487 */
1488void
1489__log_set_version(env, newver)
1490	ENV *env;
1491	u_int32_t newver;
1492{
1493	DB_LOG *dblp;
1494	LOG *lp;
1495
1496	dblp = env->lg_handle;
1497	lp = (LOG *)dblp->reginfo.primary;
1498	/*
1499	 * We should be able to update this atomically without locking.
1500	 */
1501	lp->persist.version = newver;
1502}
1503
1504/*
1505 * __log_get_oldversion --
1506 *	Returns the last version of log that this environment was working
1507 *	with.  Since there could be several versions of log files, if
1508 *	the user upgraded and didn't log archive, we check the version
1509 *	of the first log file, compare it to the last log file.  If those
1510 *	are different, then there is an older log existing, and we then
1511 *	walk backward in the log files looking for the version of the
1512 *	most recent older log file.
1513 *
1514 * PUBLIC: int __log_get_oldversion __P((ENV *, u_int32_t *));
1515 */
1516int
1517__log_get_oldversion(env, ver)
1518	ENV *env;
1519	u_int32_t *ver;
1520{
1521	DBT rec;
1522	DB_LOG *dblp;
1523	DB_LOGC *logc;
1524	DB_LSN lsn;
1525	LOG *lp;
1526	u_int32_t firstfnum, fnum, lastver, oldver;
1527	int ret, t_ret;
1528
1529	dblp = env->lg_handle;
1530	lp = dblp->reginfo.primary;
1531
1532	logc = NULL;
1533	ret = 0;
1534	oldver = DB_LOGVERSION;
1535	/*
1536	 * If we're in-memory logs we're always the current version.
1537	 */
1538	if (lp->db_log_inmemory) {
1539		*ver = oldver;
1540		return (0);
1541	}
1542	memset(&rec, 0, sizeof(rec));
1543	if ((ret = __log_cursor(env, &logc)) != 0)
1544		goto err;
1545	/*
1546	 * Get the version numbers of the first and last log files.
1547	 */
1548	if ((ret = __logc_get(logc, &lsn, &rec, DB_FIRST)) != 0) {
1549		/*
1550		 * If there is no log file, we'll get DB_NOTFOUND.
1551		 * If we get that, set the version to the current.
1552		 */
1553		if (ret == DB_NOTFOUND)
1554			ret = 0;
1555		goto err;
1556	}
1557	firstfnum = lsn.file;
1558	if ((ret = __logc_get(logc, &lsn, &rec, DB_LAST)) != 0)
1559		goto err;
1560	if ((ret = __log_valid(dblp, firstfnum, 0, NULL, 0,
1561	    NULL, &oldver)) != 0)
1562		goto err;
1563	/*
1564	 * If the first and last LSN are in the same file, then we
1565	 * already have the version in oldver.  Return it.
1566	 */
1567	if (firstfnum == lsn.file)
1568		goto err;
1569
1570	/*
1571	 * Otherwise they're in different files and we call __log_valid
1572	 * to get the version numbers in both files.
1573	 */
1574	if ((ret = __log_valid(dblp, lsn.file, 0, NULL, 0,
1575	    NULL, &lastver)) != 0)
1576		goto err;
1577	/*
1578	 * If the version numbers are different, walk backward getting
1579	 * the version of each log file until we find one that is
1580	 * different than the last.
1581	 */
1582	if (oldver != lastver) {
1583		for (fnum = lsn.file - 1; fnum >= firstfnum; fnum--) {
1584			if ((ret = __log_valid(dblp, fnum, 0, NULL, 0,
1585			    NULL, &oldver)) != 0)
1586				goto err;
1587			if (oldver != lastver)
1588				break;
1589		}
1590	}
1591err:	if (logc != NULL && ((t_ret = __logc_close(logc)) != 0) && ret == 0)
1592		ret = t_ret;
1593	if (ret == 0 && ver != NULL)
1594		*ver = oldver;
1595	return (ret);
1596}
1597