1/*-
2 * See the file LICENSE for redistribution information.
3 *
4 * Copyright (c) 1996-2009 Oracle.  All rights reserved.
5 *
6 * $Id$
7 */
8
9#include "db_config.h"
10
11#include "db_int.h"
12#include "dbinc/crypto.h"
13#include "dbinc/db_page.h"
14#include "dbinc/hmac.h"
15#include "dbinc/log.h"
16#include "dbinc/hash.h"
17
/*
 * RLOCK --
 *	Records the state of the log region lock while a record is being
 *	retrieved.
 *
 *	L_ALREADY:	the cursor has DB_LOG_LOCKED set; the retrieval code
 *			neither locks nor unlocks in this state (presumably
 *			the caller already holds the region lock -- confirm).
 *	L_ACQUIRED:	the retrieval code took the lock with LOG_SYSTEM_LOCK
 *			and must release it with LOG_SYSTEM_UNLOCK.
 *	L_NONE:		the lock is not held by the retrieval code.
 */
typedef enum { L_ALREADY, L_ACQUIRED, L_NONE } RLOCK;

/* Prototypes for the file-local routines. */
static int __logc_close_pp __P((DB_LOGC *, u_int32_t));
static int __logc_get_pp __P((DB_LOGC *, DB_LSN *, DBT *, u_int32_t));
static int __logc_get_int __P((DB_LOGC *, DB_LSN *, DBT *, u_int32_t));
static int __logc_hdrchk __P((DB_LOGC *, DB_LSN *, HDR *, int *));
static int __logc_incursor __P((DB_LOGC *, DB_LSN *, HDR *, u_int8_t **));
static int __logc_inregion __P((DB_LOGC *,
	       DB_LSN *, RLOCK *, DB_LSN *, HDR *, u_int8_t **, int *));
static int __logc_io __P((DB_LOGC *,
	       u_int32_t, u_int32_t, void *, size_t *, int *));
static int __logc_ondisk __P((DB_LOGC *,
	       DB_LSN *, DB_LSN *, u_int32_t, HDR *, u_int8_t **, int *));
static int __logc_set_maxrec __P((DB_LOGC *, char *));
static int __logc_shortread __P((DB_LOGC *, DB_LSN *, int));
static int __logc_version_pp __P((DB_LOGC *, u_int32_t *, u_int32_t));
34
35/*
36 * __log_cursor_pp --
37 *	ENV->log_cursor
38 *
39 * PUBLIC: int __log_cursor_pp __P((DB_ENV *, DB_LOGC **, u_int32_t));
40 */
41int
42__log_cursor_pp(dbenv, logcp, flags)
43	DB_ENV *dbenv;
44	DB_LOGC **logcp;
45	u_int32_t flags;
46{
47	DB_THREAD_INFO *ip;
48	ENV *env;
49	int ret;
50
51	env = dbenv->env;
52
53	ENV_REQUIRES_CONFIG(env,
54	    env->lg_handle, "DB_ENV->log_cursor", DB_INIT_LOG);
55
56	/* Validate arguments. */
57	if ((ret = __db_fchk(env, "DB_ENV->log_cursor", flags, 0)) != 0)
58		return (ret);
59
60	ENV_ENTER(env, ip);
61	REPLICATION_WRAP(env, (__log_cursor(env, logcp)), 0, ret);
62	ENV_LEAVE(env, ip);
63	return (ret);
64}
65
66/*
67 * __log_cursor --
68 *	Create a log cursor.
69 *
70 * PUBLIC: int __log_cursor __P((ENV *, DB_LOGC **));
71 */
72int
73__log_cursor(env, logcp)
74	ENV *env;
75	DB_LOGC **logcp;
76{
77	DB_LOGC *logc;
78	int ret;
79
80	*logcp = NULL;
81
82	/* Allocate memory for the cursor. */
83	if ((ret = __os_calloc(env, 1, sizeof(DB_LOGC), &logc)) != 0)
84		return (ret);
85
86	logc->bp_size = LG_CURSOR_BUF_SIZE;
87	/*
88	 * Set this to something positive.
89	 */
90	logc->bp_maxrec = MEGABYTE;
91	if ((ret = __os_malloc(env, logc->bp_size, &logc->bp)) != 0) {
92		__os_free(env, logc);
93		return (ret);
94	}
95
96	logc->env = env;
97	logc->close = __logc_close_pp;
98	logc->get = __logc_get_pp;
99	logc->version = __logc_version_pp;
100
101	*logcp = logc;
102	return (0);
103}
104
105/*
106 * __logc_close_pp --
107 *	DB_LOGC->close pre/post processing.
108 */
109static int
110__logc_close_pp(logc, flags)
111	DB_LOGC *logc;
112	u_int32_t flags;
113{
114	DB_THREAD_INFO *ip;
115	ENV *env;
116	int ret;
117
118	env = logc->env;
119
120	if ((ret = __db_fchk(env, "DB_LOGC->close", flags, 0)) != 0)
121		return (ret);
122
123	ENV_ENTER(env, ip);
124	REPLICATION_WRAP(env, (__logc_close(logc)), 0, ret);
125	ENV_LEAVE(env, ip);
126	return (ret);
127}
128
129/*
130 * __logc_close --
131 *	DB_LOGC->close.
132 *
133 * PUBLIC: int __logc_close __P((DB_LOGC *));
134 */
135int
136__logc_close(logc)
137	DB_LOGC *logc;
138{
139	ENV *env;
140
141	env = logc->env;
142
143	if (logc->fhp != NULL) {
144		(void)__os_closehandle(env, logc->fhp);
145		logc->fhp = NULL;
146	}
147
148	if (logc->dbt.data != NULL)
149		__os_free(env, logc->dbt.data);
150
151	__os_free(env, logc->bp);
152	__os_free(env, logc);
153
154	return (0);
155}
156
157/*
158 * __logc_version_pp --
159 *	DB_LOGC->version.
160 */
161static int
162__logc_version_pp(logc, versionp, flags)
163	DB_LOGC *logc;
164	u_int32_t *versionp;
165	u_int32_t flags;
166{
167	DB_THREAD_INFO *ip;
168	ENV *env;
169	int ret;
170
171	env = logc->env;
172
173	if ((ret = __db_fchk(env, "DB_LOGC->version", flags, 0)) != 0)
174		return (ret);
175
176	ENV_ENTER(env, ip);
177	REPLICATION_WRAP(env, (__logc_version(logc, versionp)), 0, ret);
178	ENV_LEAVE(env, ip);
179	return (ret);
180}
181
182/*
183 * __logc_version --
184 *	DB_LOGC->version.
185 *
186 * PUBLIC: int __logc_version __P((DB_LOGC *, u_int32_t *));
187 */
188int
189__logc_version(logc, versionp)
190	DB_LOGC *logc;
191	u_int32_t *versionp;
192{
193	DBT hdrdbt;
194	DB_LOGC *plogc;
195	DB_LSN plsn;
196	ENV *env;
197	LOGP *persist;
198	int ret, t_ret;
199
200	env = logc->env;
201	if (IS_ZERO_LSN(logc->lsn)) {
202		__db_errx(env, "DB_LOGC->get: unset cursor");
203		return (EINVAL);
204	}
205	ret = 0;
206	/*
207	 * Check if the persist info we have is for the same file
208	 * as the current cursor position.  If we already have the
209	 * information, then we're done.  If not, we open a new
210	 * log cursor and get the header.
211	 *
212	 * Since most users walk forward through the log when
213	 * using this feature (i.e. printlog) we're likely to
214	 * have the information we need.
215	 */
216	if (logc->lsn.file != logc->p_lsn.file) {
217		if ((ret = __log_cursor(env, &plogc)) != 0)
218			return (ret);
219		plsn.file = logc->lsn.file;
220		plsn.offset = 0;
221		plogc->lsn = plsn;
222		memset(&hdrdbt, 0, sizeof(DBT));
223		if ((ret = __logc_get_int(plogc,
224		    &plsn, &hdrdbt, DB_SET)) == 0) {
225			persist = (LOGP *)hdrdbt.data;
226			if (LOG_SWAPPED(env))
227				__log_persistswap(persist);
228			logc->p_lsn = logc->lsn;
229			logc->p_version = persist->version;
230		}
231		if ((t_ret = __logc_close(plogc)) != 0 && ret == 0)
232			ret = t_ret;
233	}
234	/* Return the version. */
235	if (ret == 0)
236		*versionp = logc->p_version;
237	return (ret);
238}
239
240/*
241 * __logc_get_pp --
242 *	DB_LOGC->get pre/post processing.
243 */
244static int
245__logc_get_pp(logc, alsn, dbt, flags)
246	DB_LOGC *logc;
247	DB_LSN *alsn;
248	DBT *dbt;
249	u_int32_t flags;
250{
251	DB_THREAD_INFO *ip;
252	ENV *env;
253	int ret;
254
255	env = logc->env;
256
257	/* Validate arguments. */
258	switch (flags) {
259	case DB_CURRENT:
260	case DB_FIRST:
261	case DB_LAST:
262	case DB_NEXT:
263	case DB_PREV:
264		break;
265	case DB_SET:
266		if (IS_ZERO_LSN(*alsn)) {
267			__db_errx(env, "DB_LOGC->get: invalid LSN: %lu/%lu",
268			    (u_long)alsn->file, (u_long)alsn->offset);
269			return (EINVAL);
270		}
271		break;
272	default:
273		return (__db_ferr(env, "DB_LOGC->get", 1));
274	}
275
276	ENV_ENTER(env, ip);
277	REPLICATION_WRAP(env, (__logc_get(logc, alsn, dbt, flags)), 0, ret);
278	ENV_LEAVE(env, ip);
279	return (ret);
280}
281
282/*
283 * __logc_get --
284 *	DB_LOGC->get.
285 *
286 * PUBLIC: int __logc_get __P((DB_LOGC *, DB_LSN *, DBT *, u_int32_t));
287 */
288int
289__logc_get(logc, alsn, dbt, flags)
290	DB_LOGC *logc;
291	DB_LSN *alsn;
292	DBT *dbt;
293	u_int32_t flags;
294{
295	DB_LSN saved_lsn;
296	ENV *env;
297	LOGP *persist;
298	int ret;
299
300	env = logc->env;
301
302	/*
303	 * On error, we take care not to overwrite the caller's LSN.  This
304	 * is because callers looking for the end of the log loop using the
305	 * DB_NEXT flag, and expect to take the last successful lsn out of
306	 * the passed-in structure after DB_LOGC->get fails with DB_NOTFOUND.
307	 *
308	 * !!!
309	 * This line is often flagged an uninitialized memory read during a
310	 * Purify or similar tool run, as the application didn't initialize
311	 * *alsn.  If the application isn't setting the DB_SET flag, there is
312	 * no reason it should have initialized *alsn, but we can't know that
313	 * and we want to make sure we never overwrite whatever the application
314	 * put in there.
315	 */
316	saved_lsn = *alsn;
317	/*
318	 * If we get one of the log's header records as a result of doing a
319	 * DB_FIRST, DB_NEXT, DB_LAST or DB_PREV, repeat the operation, log
320	 * file header records aren't useful to applications.
321	 */
322	if ((ret = __logc_get_int(logc, alsn, dbt, flags)) != 0) {
323		*alsn = saved_lsn;
324		return (ret);
325	}
326	/*
327	 * The DBT was populated by the call to __logc_get_int, copy the data
328	 * out of DB_DBT_USERMEM space if it is there.
329	 */
330	if ((ret = __dbt_usercopy(env, dbt)) != 0)
331		return (ret);
332
333	if (alsn->offset == 0 && (flags == DB_FIRST ||
334	    flags == DB_NEXT || flags == DB_LAST || flags == DB_PREV)) {
335		switch (flags) {
336		case DB_FIRST:
337			flags = DB_NEXT;
338			break;
339		case DB_LAST:
340			flags = DB_PREV;
341			break;
342		case DB_NEXT:
343		case DB_PREV:
344		default:
345			break;
346		}
347		/*
348		 * If we're walking the log and we find a persist header
349		 * then store so that we may use it later if needed.
350		 */
351		persist = (LOGP *)dbt->data;
352		if (LOG_SWAPPED(env))
353			__log_persistswap(persist);
354		logc->p_lsn = *alsn;
355		logc->p_version = persist->version;
356		if (F_ISSET(dbt, DB_DBT_MALLOC)) {
357			__os_free(env, dbt->data);
358			dbt->data = NULL;
359		}
360		if ((ret = __logc_get_int(logc, alsn, dbt, flags)) != 0) {
361			*alsn = saved_lsn;
362			goto err;
363		}
364	}
365
366err:	__dbt_userfree(env, dbt, NULL, NULL);
367	return (ret);
368}
369
/*
 * __logc_get_int --
 *	Get a log record; internal version.
 *
 *	Works out the LSN of the requested record from the cursor's position
 *	and flags (DB_FIRST, DB_LAST, DB_NEXT, DB_PREV, DB_CURRENT or DB_SET),
 *	then looks for the record in the cursor's buffer, in the log region's
 *	buffer and finally on disk.  Unless the record was taken entirely
 *	from the region's buffer it is checksummed.  The record body (without
 *	its HDR) is copied into dbt, and decrypted there when encryption is
 *	configured.
 *
 *	On success returns 0 after storing the record's LSN in *alsn and
 *	updating the cursor's lsn, len and prev fields.  On failure returns
 *	non-zero (DB_NOTFOUND when there is no such record) and *alsn is not
 *	written.  A region lock acquired here is always released before
 *	returning.
 */
static int
__logc_get_int(logc, alsn, dbt, flags)
	DB_LOGC *logc;
	DB_LSN *alsn;
	DBT *dbt;
	u_int32_t flags;
{
	DB_CIPHER *db_cipher;
	DB_LOG *dblp;
	DB_LSN last_lsn, nlsn;
	ENV *env;
	HDR hdr;
	LOG *lp;
	RLOCK rlock;
	logfile_validity status;
	u_int32_t cnt, version;
	u_int8_t *rp;
	int eof, is_hmac, need_cksum, ret;

	env = logc->env;
	db_cipher = env->crypto_handle;
	dblp = env->lg_handle;
	lp = dblp->reginfo.primary;
	is_hmac = 0;

	/*
	 * We don't acquire the log region lock until we need it, and we
	 * release it as soon as we're done.
	 */
	rlock = F_ISSET(logc, DB_LOG_LOCKED) ? L_ALREADY : L_NONE;

	/* Start from the cursor's current position. */
	nlsn = logc->lsn;
	switch (flags) {
	case DB_NEXT:				/* Next log record. */
		if (!IS_ZERO_LSN(nlsn)) {
			/* Increment the cursor by the cursor record size. */
			nlsn.offset += logc->len;
			break;
		}
		/* An unset cursor: DB_NEXT behaves as DB_FIRST. */
		flags = DB_FIRST;
		/* FALLTHROUGH */
	case DB_FIRST:				/* First log record. */
		/* Find the first log file. */
		if ((ret = __log_find(dblp, 1, &cnt, &status)) != 0)
			goto err;

		/*
		 * DB_LV_INCOMPLETE:
		 *	Theoretically, the log file we want could be created
		 *	but not yet written, the "first" log record must be
		 *	in the log buffer.
		 * DB_LV_NORMAL:
		 * DB_LV_OLD_READABLE:
		 *	We found a log file we can read.
		 * DB_LV_NONEXISTENT:
		 *	No log files exist, the "first" log record must be in
		 *	the log buffer.
		 * DB_LV_OLD_UNREADABLE:
		 *	No readable log files exist, we're at the cross-over
		 *	point between two versions.  The "first" log record
		 *	must be in the log buffer.
		 */
		switch (status) {
		case DB_LV_INCOMPLETE:
			DB_ASSERT(env, lp->lsn.file == cnt);
			/* FALLTHROUGH */
		case DB_LV_NORMAL:
		case DB_LV_OLD_READABLE:
			nlsn.file = cnt;
			break;
		case DB_LV_NONEXISTENT:
			nlsn.file = 1;
			DB_ASSERT(env, lp->lsn.file == nlsn.file);
			break;
		case DB_LV_OLD_UNREADABLE:
			nlsn.file = cnt + 1;
			DB_ASSERT(env, lp->lsn.file == nlsn.file);
			break;
		}
		nlsn.offset = 0;
		break;
	case DB_CURRENT:			/* Current log record. */
		break;
	case DB_PREV:				/* Previous log record. */
		if (!IS_ZERO_LSN(nlsn)) {
			/* If at start-of-file, move to the previous file. */
			if (nlsn.offset == 0) {
				if (nlsn.file == 1) {
					ret = DB_NOTFOUND;
					goto err;
				}
				/*
				 * For on-disk logs, the previous file must
				 * exist and be readable.
				 */
				if ((!lp->db_log_inmemory &&
				    (__log_valid(dblp, nlsn.file - 1, 0, NULL,
				    0, &status, NULL) != 0 ||
				    (status != DB_LV_NORMAL &&
				    status != DB_LV_OLD_READABLE)))) {
					ret = DB_NOTFOUND;
					goto err;
				}

				--nlsn.file;
			}
			/*
			 * The prev offset was saved from the header of the
			 * record the cursor last returned.
			 */
			nlsn.offset = logc->prev;
			break;
		}
		/* An unset cursor: DB_PREV behaves as DB_LAST. */
		/* FALLTHROUGH */
	case DB_LAST:				/* Last log record. */
		/* Reading the region's end-of-log requires the lock. */
		if (rlock == L_NONE) {
			rlock = L_ACQUIRED;
			LOG_SYSTEM_LOCK(env);
		}
		nlsn.file = lp->lsn.file;
		nlsn.offset = lp->lsn.offset - lp->len;
		break;
	case DB_SET:				/* Set log record. */
		nlsn = *alsn;
		break;
	default:
		ret = __db_unknown_path(env, "__logc_get_int");
		goto err;
	}

	/*
	 * This block is only ever entered through the next_file label, from
	 * the end-of-file handling below.
	 */
	if (0) {				/* Move to the next file. */
next_file:	++nlsn.file;
		nlsn.offset = 0;
	}

	/*
	 * The above switch statement should have set nlsn to the lsn of
	 * the requested record.
	 */

	/* The size of the on-disk HDR depends on whether crypto is on. */
	if (CRYPTO_ON(env)) {
		hdr.size = HDR_CRYPTO_SZ;
		is_hmac = 1;
	} else {
		hdr.size = HDR_NORMAL_SZ;
		is_hmac = 0;
	}

	/*
	 * Check to see if the record is in the cursor's buffer -- if so,
	 * we'll need to checksum it.
	 */
	if ((ret = __logc_incursor(logc, &nlsn, &hdr, &rp)) != 0)
		goto err;
	if (rp != NULL)
		goto cksum;

	/*
	 * Look to see if we're moving backward in the log with the last record
	 * coming from the disk -- it means the record can't be in the region's
	 * buffer.  Else, check the region's buffer.
	 *
	 * If the record isn't in the region's buffer, then either logs are
	 * in-memory, and we're done, or we're going to have to read the
	 * record from disk.  We want to make a point of not reading past the
	 * end of the logical log (after recovery, there may be data after the
	 * end of the logical log, not to mention the log file may have been
	 * pre-allocated).  So, zero out last_lsn, and initialize it inside
	 * __logc_inregion -- if it's still zero when we check it in
	 * __logc_ondisk, that's OK, it just means the logical end of the log
	 * isn't an issue for this request.
	 */
	ZERO_LSN(last_lsn);
	if (!F_ISSET(logc, DB_LOG_DISK) ||
	    LOG_COMPARE(&nlsn, &logc->lsn) > 0) {
		F_CLR(logc, DB_LOG_DISK);

		if ((ret = __logc_inregion(logc,
		    &nlsn, &rlock, &last_lsn, &hdr, &rp, &need_cksum)) != 0)
			goto err;
		if (rp != NULL) {
			/*
			 * If we read the entire record from the in-memory log
			 * buffer, we don't need to checksum it, nor do we need
			 * to worry about vtruncate issues.
			 */
			if (need_cksum)
				goto cksum;
			goto from_memory;
		}
		/* In-memory logs have no disk copy to fall back on. */
		if (lp->db_log_inmemory)
			goto nohdr;
	}

	/*
	 * We have to read from an on-disk file to retrieve the record.
	 * If we ever can't retrieve the record at offset 0, we're done,
	 * return EOF/DB_NOTFOUND.
	 *
	 * Discard the region lock if we're still holding it, the on-disk
	 * reading routines don't need it.
	 */
	if (rlock == L_ACQUIRED) {
		rlock = L_NONE;
		LOG_SYSTEM_UNLOCK(env);
	}
	if ((ret = __logc_ondisk(
	    logc, &nlsn, &last_lsn, flags, &hdr, &rp, &eof)) != 0)
		goto err;

	/*
	 * If we got a 0-length record, that means we're in the midst of some
	 * bytes that got 0'd as the result of a vtruncate.  In that case or at
	 * the end of a file, with DB_NEXT we're going to have to retry.
	 */
	if (eof || hdr.len == 0) {
		/*
		 * The nohdr label is also the target for in-memory logs when
		 * the record was not found in the region's buffer.
		 */
nohdr:		switch (flags) {
		case DB_LAST:
		case DB_PREV:
			/*
			 * We should never get here.  If we recover a log
			 * file with 0's at the end, we'll treat the 0'd
			 * headers as the end of log and ignore them.  If
			 * we're reading backwards from another file, then
			 * the first record in that new file should have its
			 * prev field set correctly.
			 */
			__db_errx(env,
		"Encountered zero length records while traversing backwards");
			ret = __env_panic(env, DB_RUNRECOVERY);
			goto err;
		case DB_FIRST:
		case DB_NEXT:
			/*
			 * Zero'd records always indicate the end of a file,
			 * but only go to the next file once.
			 */
			if (nlsn.offset != 0)
				goto next_file;
			/* FALLTHROUGH */
		case DB_SET:
		default:
			ret = DB_NOTFOUND;
			goto err;
		}
	}

	/* Remember that the cursor's last record came from disk. */
	F_SET(logc, DB_LOG_DISK);

cksum:	/*
	 * Discard the region lock if we're still holding it.  (The path to
	 * get here is we acquired the region lock because of the caller's
	 * flag argument, but we found the record in the in-memory or cursor
	 * buffers.  Improbable, but it's easy to avoid.)
	 */
	if (rlock == L_ACQUIRED) {
		rlock = L_NONE;
		LOG_SYSTEM_UNLOCK(env);
	}

	/*
	 * Checksum: there are two types of errors -- a configuration error
	 * or a checksum mismatch.  The former is always bad.  The latter is
	 * OK if we're searching for the end of the log, and very, very bad
	 * if we're reading random log records.
	 */
	if ((ret = __db_check_chksum(env, &hdr, db_cipher,
	    hdr.chksum, rp + hdr.size, hdr.len - hdr.size, is_hmac)) != 0) {
		/*
		 * We may be dealing with a version that does not
		 * checksum the header.  Try again without the header.
		 * Set the cursor to the LSN we are trying to look at.
		 *
		 * last_lsn is reused here to hold the cursor's real
		 * position, which is restored on every path below.
		 */
		last_lsn = logc->lsn;
		logc->lsn = nlsn;
		if (__logc_version(logc, &version) == 0  &&
		    version < DB_LOGCHKSUM &&
		    __db_check_chksum(env, NULL,  db_cipher, hdr.chksum,
		    rp + hdr.size, hdr.len - hdr.size, is_hmac) == 0) {
			logc->lsn = last_lsn;
			goto from_memory;
		}

		/*
		 * A return of -1 is treated as a checksum mismatch: it is
		 * reported quietly as EIO when DB_LOG_SILENT_ERR is set,
		 * and otherwise panics the environment.
		 */
		if (F_ISSET(logc, DB_LOG_SILENT_ERR)) {
			if (ret == -1)
				ret = EIO;
		} else if (ret == -1) {
			__db_errx(env,
		    "DB_LOGC->get: log record LSN %lu/%lu: checksum mismatch",
			    (u_long)nlsn.file, (u_long)nlsn.offset);
			__db_errx(env,
		    "DB_LOGC->get: catastrophic recovery may be required");
			ret = __env_panic(env, DB_RUNRECOVERY);
		}
		logc->lsn = last_lsn;
		goto err;
	}

from_memory:
	/*
	 * Discard the region lock if we're still holding it.  (The path to
	 * get here is we acquired the region lock because of the caller's
	 * flag argument, but we found the record in the in-memory or cursor
	 * buffers.  Improbable, but it's easy to avoid.)
	 */
	if (rlock == L_ACQUIRED) {
		rlock = L_NONE;
		LOG_SYSTEM_UNLOCK(env);
	}

	/* Copy the record into the user's DBT. */
	if ((ret = __db_retcopy(env, dbt, rp + hdr.size,
	    (u_int32_t)(hdr.len - hdr.size),
	    &logc->dbt.data, &logc->dbt.ulen)) != 0)
		goto err;

	if (CRYPTO_ON(env)) {
		/* Any decryption failure is reported as EAGAIN. */
		if ((ret = db_cipher->decrypt(env, db_cipher->data,
		    hdr.iv, dbt->data, hdr.len - hdr.size)) != 0) {
			ret = EAGAIN;
			goto err;
		}
		/*
		 * Return the original log record size to the user,
		 * even though we've allocated more than that, possibly.
		 * The log record is decrypted in the user dbt, not in
		 * the buffer, so we must do this here after decryption,
		 * not adjust the len passed to the __db_retcopy call.
		 */
		dbt->size = hdr.orig_size;
	}

	/* Update the cursor and the returned LSN. */
	*alsn = nlsn;
	logc->lsn = nlsn;
	logc->len = hdr.len;
	logc->prev = hdr.prev;

	/* Common exit: drop the region lock if we took it ourselves. */
err:	if (rlock == L_ACQUIRED)
		LOG_SYSTEM_UNLOCK(env);

	return (ret);
}
709
710/*
711 * __logc_incursor --
712 *	Check to see if the requested record is in the cursor's buffer.
713 */
714static int
715__logc_incursor(logc, lsn, hdr, pp)
716	DB_LOGC *logc;
717	DB_LSN *lsn;
718	HDR *hdr;
719	u_int8_t **pp;
720{
721	ENV *env;
722	u_int8_t *p;
723	int eof;
724
725	env = logc->env;
726	*pp = NULL;
727
728	/*
729	 * Test to see if the requested LSN could be part of the cursor's
730	 * buffer.
731	 *
732	 * The record must be part of the same file as the cursor's buffer.
733	 * The record must start at a byte offset equal to or greater than
734	 * the cursor buffer.
735	 * The record must not start at a byte offset after the cursor
736	 * buffer's end.
737	 */
738	if (logc->bp_lsn.file != lsn->file)
739		return (0);
740	if (logc->bp_lsn.offset > lsn->offset)
741		return (0);
742	if (logc->bp_lsn.offset + logc->bp_rlen <= lsn->offset + hdr->size)
743		return (0);
744
745	/*
746	 * Read the record's header and check if the record is entirely held
747	 * in the buffer.  If the record is not entirely held, get it again.
748	 * (The only advantage in having part of the record locally is that
749	 * we might avoid a system call because we already have the HDR in
750	 * memory.)
751	 *
752	 * If the header check fails for any reason, it must be because the
753	 * LSN is bogus.  Fail hard.
754	 */
755	p = logc->bp + (lsn->offset - logc->bp_lsn.offset);
756	memcpy(hdr, p, hdr->size);
757	if (LOG_SWAPPED(env))
758		__log_hdrswap(hdr, CRYPTO_ON(env));
759	if (__logc_hdrchk(logc, lsn, hdr, &eof))
760		return (DB_NOTFOUND);
761	if (eof || logc->bp_lsn.offset + logc->bp_rlen < lsn->offset + hdr->len)
762		return (0);
763
764	*pp = p;				/* Success. */
765
766	return (0);
767}
768
/*
 * __logc_inregion --
 *	Check to see if the requested record is in the region's buffer.
 *
 *	Acquires the log region lock if *rlockp is L_NONE.  *last_lsn is set
 *	to the region's current LSN, with the offset limited to lp->w_off
 *	for on-disk logs, so that the disk-reading code knows where to stop.
 *
 *	If the record is found, it is copied into the cursor's buffer, hdr is
 *	filled in and *pp points at the copy; otherwise *pp is NULL.  When a
 *	record straddles the disk and the region buffer, the region lock is
 *	released here, the leading bytes are read from disk and *need_cksump
 *	is set to 1.  On the other paths a lock acquired here is still held
 *	on return and *rlockp tells the caller so.
 *
 *	Returns 0 (found or not), DB_NOTFOUND if the LSN is at or past the
 *	end of the log or the header is unusable, or an error from the
 *	allocation and I/O routines.
 */
static int
__logc_inregion(logc, lsn, rlockp, last_lsn, hdr, pp, need_cksump)
	DB_LOGC *logc;
	DB_LSN *lsn, *last_lsn;
	RLOCK *rlockp;
	HDR *hdr;
	u_int8_t **pp;
	int *need_cksump;
{
	DB_LOG *dblp;
	ENV *env;
	LOG *lp;
	size_t b_region, len, nr;
	u_int32_t b_disk;
	int eof, ret;
	u_int8_t *p;

	env = logc->env;
	dblp = env->lg_handle;
	lp = env->lg_handle->reginfo.primary;

	ret = 0;
	b_region = 0;
	*pp = NULL;
	*need_cksump = 0;

	/* If we haven't yet acquired the log region lock, do so. */
	if (*rlockp == L_NONE) {
		*rlockp = L_ACQUIRED;
		LOG_SYSTEM_LOCK(env);
	}

	/*
	 * The routines to read from disk must avoid reading past the logical
	 * end of the log, so pass that information back to it.
	 *
	 * Since they're reading directly from the disk, they must also avoid
	 * reading past the offset we've written out.  If the log was
	 * truncated, it's possible that there are zeroes or garbage on
	 * disk after this offset, and the logical end of the log can
	 * come later than this point if the log buffer isn't empty.
	 */
	*last_lsn = lp->lsn;
	if (!lp->db_log_inmemory && last_lsn->offset > lp->w_off)
		last_lsn->offset = lp->w_off;

	/*
	 * Test to see if the requested LSN could be part of the region's
	 * buffer.
	 *
	 * During recovery, we read the log files getting the information to
	 * initialize the region.  In that case, the region's lsn field will
	 * not yet have been filled in, use only the disk.
	 *
	 * The record must not start at a byte offset after the region buffer's
	 * end, since that means the request is for a record after the end of
	 * the log.  Do this test even if the region's buffer is empty -- after
	 * recovery, the log files may continue past the declared end-of-log,
	 * and the disk reading routine will incorrectly attempt to read the
	 * remainder of the log.
	 *
	 * Otherwise, test to see if the region's buffer actually has what we
	 * want:
	 *
	 * The buffer must have some useful content.
	 * The record must be in the same file as the region's buffer and must
	 * start at a byte offset equal to or greater than the region's buffer.
	 */
	if (IS_ZERO_LSN(lp->lsn))
		return (0);
	if (LOG_COMPARE(lsn, &lp->lsn) >= 0)
		return (DB_NOTFOUND);
	else if (lp->db_log_inmemory) {
		/* Translate the LSN into an offset in the in-memory log. */
		if ((ret = __log_inmem_lsnoff(dblp, lsn, &b_region)) != 0)
			return (ret);
	} else if (lp->b_off == 0 || LOG_COMPARE(lsn, &lp->f_lsn) < 0)
		return (0);

	/*
	 * The current contents of the cursor's buffer will be useless for a
	 * future call, we're about to overwrite it -- trash it rather than
	 * try and make it look correct.
	 */
	logc->bp_rlen = 0;

	/*
	 * If the requested LSN is greater than the region buffer's first
	 * byte, we know the entire record is in the buffer on a good LSN.
	 *
	 * If we're given a bad LSN, the "entire" record might not be in
	 * our buffer in order to fail at the chksum.  __logc_hdrchk made
	 * sure our dest buffer fits, via bp_maxrec, but we also need to
	 * make sure we don't run off the end of this buffer, the src.
	 *
	 * There is one case where the header check can fail: on a scan through
	 * in-memory logs, when we reach the end of a file we can read an empty
	 * header.  In that case, it's safe to return zero, here: it will be
	 * caught in our caller.  Otherwise, the LSN is bogus.  Fail hard.
	 */
	if (lp->db_log_inmemory || LOG_COMPARE(lsn, &lp->f_lsn) > 0) {
		/* For on-disk logs, the buffer starts at file offset w_off. */
		if (!lp->db_log_inmemory)
			b_region = lsn->offset - lp->w_off;
		__log_inmem_copyout(dblp, b_region, hdr, hdr->size);
		if (LOG_SWAPPED(env))
			__log_hdrswap(hdr, CRYPTO_ON(env));
		if (__logc_hdrchk(logc, lsn, hdr, &eof) != 0)
			return (DB_NOTFOUND);
		if (eof)
			return (0);
		/* The whole record must lie within the source buffer. */
		if (lp->db_log_inmemory) {
			if (RINGBUF_LEN(lp, b_region, lp->b_off) < hdr->len)
				return (DB_NOTFOUND);
		} else if (lsn->offset + hdr->len > lp->w_off + lp->buffer_size)
			return (DB_NOTFOUND);
		/* Grow the cursor's buffer to twice the record size. */
		if (logc->bp_size <= hdr->len) {
			len = (size_t)DB_ALIGN((uintmax_t)hdr->len * 2, 128);
			if ((ret =
			    __os_realloc(logc->env, len, &logc->bp)) != 0)
				 return (ret);
			logc->bp_size = (u_int32_t)len;
		}
		__log_inmem_copyout(dblp, b_region, logc->bp, hdr->len);
		*pp = logc->bp;
		return (0);
	}

	DB_ASSERT(env, !lp->db_log_inmemory);

	/*
	 * There's a partial record, that is, the requested record starts
	 * in a log file and finishes in the region buffer.  We have to
	 * find out how many bytes of the record are in the region buffer
	 * so we can copy them out into the cursor buffer.  First, check
	 * to see if the requested record is the only record in the region
	 * buffer, in which case we should copy the entire region buffer.
	 *
	 * Else, walk back through the region's buffer to find the first LSN
	 * after the record that crosses the buffer boundary -- we can detect
	 * that LSN, because its "prev" field will reference the record we
	 * want.  The bytes we need to copy from the region buffer are the
	 * bytes up to the record we find.  The bytes we'll need to allocate
	 * to hold the log record are the bytes between the two offsets.
	 */
	b_disk = lp->w_off - lsn->offset;
	if (lp->b_off <= lp->len)
		b_region = (u_int32_t)lp->b_off;
	else
		/*
		 * Start at the last record in the buffer and follow the
		 * prev offsets backward.
		 */
		for (p = dblp->bufp + (lp->b_off - lp->len);;) {
			memcpy(hdr, p, hdr->size);
			if (LOG_SWAPPED(env))
				__log_hdrswap(hdr, CRYPTO_ON(env));
			if (hdr->prev == lsn->offset) {
				b_region = (u_int32_t)(p - dblp->bufp);
				break;
			}
			p = dblp->bufp + (hdr->prev - lp->w_off);
		}

	/*
	 * If we don't have enough room for the record, we have to allocate
	 * space.  We have to do it while holding the region lock, which is
	 * truly annoying, but there's no way around it.  This call is why
	 * we allocate cursor buffer space when allocating the cursor instead
	 * of waiting.
	 */
	if (logc->bp_size <= b_region + b_disk) {
		len = (size_t)DB_ALIGN((uintmax_t)(b_region + b_disk) * 2, 128);
		if ((ret = __os_realloc(logc->env, len, &logc->bp)) != 0)
			return (ret);
		logc->bp_size = (u_int32_t)len;
	}

	/* Copy the region's bytes to the end of the cursor's buffer. */
	p = (logc->bp + logc->bp_size) - b_region;
	memcpy(p, dblp->bufp, b_region);

	/* Release the region lock. */
	if (*rlockp == L_ACQUIRED) {
		*rlockp = L_NONE;
		LOG_SYSTEM_UNLOCK(env);
	}

	/*
	 * Read the rest of the information from disk.  Neither short reads
	 * or EOF are acceptable, the bytes we want had better be there.
	 */
	if (b_disk != 0) {
		/* The disk bytes go immediately before the region's bytes. */
		p -= b_disk;
		nr = b_disk;
		if ((ret = __logc_io(
		    logc, lsn->file, lsn->offset, p, &nr, NULL)) != 0)
			return (ret);
		if (nr < b_disk)
			return (__logc_shortread(logc, lsn, 0));

		/* We read bytes from the disk, we'll need to checksum them. */
		*need_cksump = 1;
	}

	/* Copy the header information into the caller's structure. */
	memcpy(hdr, p, hdr->size);
	if (LOG_SWAPPED(env))
		__log_hdrswap(hdr, CRYPTO_ON(env));

	*pp = p;
	return (0);
}
980
981/*
982 * __log_hdrswap --
983 *	Swap the bytes in a log header from machines with different endianness.
984 *
985 * PUBLIC: void __log_hdrswap __P((HDR *, int));
986 */
987void
988__log_hdrswap(hdr, is_hmac)
989	HDR *hdr;
990	int is_hmac;
991{
992	M_32_SWAP(hdr->prev);
993	M_32_SWAP(hdr->len);
994	if (!is_hmac)
995		P_32_SWAP(hdr->chksum);
996}
997
998/*
999 * __log_persistswap --
1000 *	Swap the bytes in a log file persistent header from machines with
1001 *	different endianness.
1002 *
1003 * PUBLIC: void __log_persistswap __P((LOGP *));
1004 */
1005void
1006__log_persistswap(persist)
1007	LOGP *persist;
1008{
1009	M_32_SWAP(persist->magic);
1010	M_32_SWAP(persist->version);
1011	M_32_SWAP(persist->log_size);
1012	M_32_SWAP(persist->notused);
1013}
1014
1015/*
1016 * __logc_ondisk --
1017 *	Read a record off disk.
1018 */
1019static int
1020__logc_ondisk(logc, lsn, last_lsn, flags, hdr, pp, eofp)
1021	DB_LOGC *logc;
1022	DB_LSN *lsn, *last_lsn;
1023	u_int32_t flags;
1024	int *eofp;
1025	HDR *hdr;
1026	u_int8_t **pp;
1027{
1028	ENV *env;
1029	size_t len, nr;
1030	u_int32_t offset;
1031	int ret;
1032
1033	env = logc->env;
1034	*eofp = 0;
1035
1036	nr = hdr->size;
1037	if ((ret =
1038	    __logc_io(logc, lsn->file, lsn->offset, hdr, &nr, eofp)) != 0)
1039		return (ret);
1040	if (*eofp)
1041		return (0);
1042
1043	if (LOG_SWAPPED(env))
1044		__log_hdrswap(hdr, CRYPTO_ON(env));
1045
1046	/*
1047	 * If the read was successful, but we can't read a full header, assume
1048	 * we've hit EOF.  We can't check that the header has been partially
1049	 * zeroed out, but it's unlikely that this is caused by a write failure
1050	 * since the header is written as a single write call and it's less
1051	 * than sector.
1052	 */
1053	if (nr < hdr->size) {
1054		*eofp = 1;
1055		return (0);
1056	}
1057
1058	/* Check the HDR. */
1059	if ((ret = __logc_hdrchk(logc, lsn, hdr, eofp)) != 0)
1060		return (ret);
1061	if (*eofp)
1062		return (0);
1063
1064	/*
1065	 * Regardless of how we return, the previous contents of the cursor's
1066	 * buffer are useless -- trash it.
1067	 */
1068	logc->bp_rlen = 0;
1069
1070	/*
1071	 * Otherwise, we now (finally!) know how big the record is.  (Maybe
1072	 * we should have just stuck the length of the record into the LSN!?)
1073	 * Make sure we have enough space.
1074	 */
1075	if (logc->bp_size <= hdr->len) {
1076		len = (size_t)DB_ALIGN((uintmax_t)hdr->len * 2, 128);
1077		if ((ret = __os_realloc(env, len, &logc->bp)) != 0)
1078			return (ret);
1079		logc->bp_size = (u_int32_t)len;
1080	}
1081
1082	/*
1083	 * If we're moving forward in the log file, read this record in at the
1084	 * beginning of the buffer.  Otherwise, read this record in at the end
1085	 * of the buffer, making sure we don't try and read before the start
1086	 * of the file.  (We prefer positioning at the end because transaction
1087	 * aborts use DB_SET to move backward through the log and we might get
1088	 * lucky.)
1089	 *
1090	 * Read a buffer's worth, without reading past the logical EOF.  The
1091	 * last_lsn may be a zero LSN, but that's OK, the test works anyway.
1092	 */
1093	if (flags == DB_FIRST || flags == DB_NEXT)
1094		offset = lsn->offset;
1095	else if (lsn->offset + hdr->len < logc->bp_size)
1096		offset = 0;
1097	else
1098		offset = (lsn->offset + hdr->len) - logc->bp_size;
1099
1100	nr = logc->bp_size;
1101	if (lsn->file == last_lsn->file && offset + nr >= last_lsn->offset)
1102		nr = last_lsn->offset - offset;
1103
1104	if ((ret =
1105	    __logc_io(logc, lsn->file, offset, logc->bp, &nr, eofp)) != 0)
1106		return (ret);
1107
1108	/*
1109	 * We should have at least gotten the bytes up-to-and-including the
1110	 * record we're reading.
1111	 */
1112	if (nr < (lsn->offset + hdr->len) - offset)
1113		return (__logc_shortread(logc, lsn, 1));
1114
1115	/*
1116	 * Set up the return information.
1117	 *
1118	 * !!!
1119	 * No need to set the bp_lsn.file field, __logc_io set it for us.
1120	 */
1121	logc->bp_rlen = (u_int32_t)nr;
1122	logc->bp_lsn.offset = offset;
1123
1124	*pp = logc->bp + (lsn->offset - offset);
1125
1126	return (0);
1127}
1128
1129/*
1130 * __logc_hdrchk --
1131 *
1132 * Check for corrupted HDRs before we use them to allocate memory or find
1133 * records.
1134 *
1135 * If the log files were pre-allocated, a zero-filled HDR structure is the
1136 * logical file end.  However, we can see buffers filled with 0's during
1137 * recovery, too (because multiple log buffers were written asynchronously,
1138 * and one made it to disk before a different one that logically precedes
1139 * it in the log file.
1140 *
1141 * Check for impossibly large records.  The malloc should fail later, but we
1142 * have customers that run mallocs that treat all allocation failures as fatal
1143 * errors.
1144 *
1145 * Note that none of this is necessarily something awful happening.  We let
1146 * the application hand us any LSN they want, and it could be a pointer into
1147 * the middle of a log record, there's no way to tell.
1148 */
1149static int
1150__logc_hdrchk(logc, lsn, hdr, eofp)
1151	DB_LOGC *logc;
1152	DB_LSN *lsn;
1153	HDR *hdr;
1154	int *eofp;
1155{
1156	ENV *env;
1157	int ret;
1158
1159	env = logc->env;
1160
1161	/*
1162	 * Check EOF before we do any other processing.
1163	 */
1164	if (eofp != NULL) {
1165		if (hdr->prev == 0 && hdr->chksum[0] == 0 && hdr->len == 0) {
1166			*eofp = 1;
1167			return (0);
1168		}
1169		*eofp = 0;
1170	}
1171
1172	/*
1173	 * Sanity check the log record's size.
1174	 * We must check it after "virtual" EOF above.
1175	 */
1176	if (hdr->len <= hdr->size)
1177		goto err;
1178
1179	/*
1180	 * If the cursor's max-record value isn't yet set, it means we aren't
1181	 * reading these records from a log file and no check is necessary.
1182	 */
1183	if (logc->bp_maxrec != 0 && hdr->len > logc->bp_maxrec) {
1184		/*
1185		 * If we fail the check, there's the pathological case that
1186		 * we're reading the last file, it's growing, and our initial
1187		 * check information was wrong.  Get it again, to be sure.
1188		 */
1189		if ((ret = __logc_set_maxrec(logc, NULL)) != 0) {
1190			__db_err(env, ret, "DB_LOGC->get");
1191			return (ret);
1192		}
1193		if (logc->bp_maxrec != 0 && hdr->len > logc->bp_maxrec)
1194			goto err;
1195	}
1196	return (0);
1197
1198err:	if (!F_ISSET(logc, DB_LOG_SILENT_ERR))
1199		__db_errx(env,
1200		    "DB_LOGC->get: LSN %lu/%lu: invalid log record header",
1201		    (u_long)lsn->file, (u_long)lsn->offset);
1202	return (EIO);
1203}
1204
1205/*
1206 * __logc_io --
1207 *	Read records from a log file.
1208 */
1209static int
1210__logc_io(logc, fnum, offset, p, nrp, eofp)
1211	DB_LOGC *logc;
1212	u_int32_t fnum, offset;
1213	void *p;
1214	size_t *nrp;
1215	int *eofp;
1216{
1217	DB_LOG *dblp;
1218	ENV *env;
1219	LOG *lp;
1220	int ret;
1221	char *np;
1222
1223	env = logc->env;
1224	dblp = env->lg_handle;
1225	lp = dblp->reginfo.primary;
1226
1227	/*
1228	 * If we've switched files, discard the current file handle and acquire
1229	 * a new one.
1230	 */
1231	if (logc->fhp != NULL && logc->bp_lsn.file != fnum) {
1232		ret = __os_closehandle(env, logc->fhp);
1233		logc->fhp = NULL;
1234		logc->bp_lsn.file = 0;
1235
1236		if (ret != 0)
1237			return (ret);
1238	}
1239	if (logc->fhp == NULL) {
1240		if ((ret = __log_name(dblp, fnum,
1241		    &np, &logc->fhp, DB_OSO_RDONLY | DB_OSO_SEQ)) != 0) {
1242			/*
1243			 * If we're allowed to return EOF, assume that's the
1244			 * problem, set the EOF status flag and return 0.
1245			 */
1246			if (eofp != NULL) {
1247				*eofp = 1;
1248				ret = 0;
1249			} else if (!F_ISSET(logc, DB_LOG_SILENT_ERR))
1250				__db_err(env, ret, "DB_LOGC->get: %s",
1251				    np == NULL ? "__log_name failed" : np);
1252			__os_free(env, np);
1253			return (ret);
1254		}
1255
1256		if ((ret = __logc_set_maxrec(logc, np)) != 0) {
1257			__db_err(env, ret, "DB_LOGC->get: %s", np);
1258			__os_free(env, np);
1259			return (ret);
1260		}
1261		__os_free(env, np);
1262
1263		logc->bp_lsn.file = fnum;
1264	}
1265
1266	STAT(++lp->stat.st_rcount);
1267	/* Seek to the record's offset and read the data. */
1268	if ((ret = __os_io(env, DB_IO_READ,
1269	    logc->fhp, 0, 0, offset, (u_int32_t)*nrp, p, nrp)) != 0) {
1270		if (!F_ISSET(logc, DB_LOG_SILENT_ERR))
1271			__db_err(env, ret,
1272			    "DB_LOGC->get: LSN: %lu/%lu: read",
1273			    (u_long)fnum, (u_long)offset);
1274		return (ret);
1275	}
1276
1277	return (0);
1278}
1279
1280/*
1281 * __logc_shortread --
1282 *	Read was short -- return a consistent error message and error.
1283 */
1284static int
1285__logc_shortread(logc, lsn, check_silent)
1286	DB_LOGC *logc;
1287	DB_LSN *lsn;
1288	int check_silent;
1289{
1290	if (!check_silent || !F_ISSET(logc, DB_LOG_SILENT_ERR))
1291		__db_errx(logc->env, "DB_LOGC->get: LSN: %lu/%lu: short read",
1292		    (u_long)lsn->file, (u_long)lsn->offset);
1293	return (EIO);
1294}
1295
1296/*
1297 * __logc_set_maxrec --
1298 *	Bound the maximum log record size in a log file.
1299 */
1300static int
1301__logc_set_maxrec(logc, np)
1302	DB_LOGC *logc;
1303	char *np;
1304{
1305	DB_LOG *dblp;
1306	ENV *env;
1307	LOG *lp;
1308	u_int32_t mbytes, bytes;
1309	int ret;
1310
1311	env = logc->env;
1312	dblp = env->lg_handle;
1313
1314	/*
1315	 * We don't want to try and allocate huge chunks of memory because
1316	 * applications with error-checking malloc's often consider that a
1317	 * hard failure.  If we're about to look at a corrupted record with
1318	 * a bizarre size, we need to know before trying to allocate space
1319	 * to hold it.  We could read the persistent data at the beginning
1320	 * of the file but that's hard -- we may have to decrypt it, checksum
1321	 * it and so on.  Stat the file instead.
1322	 */
1323	if (logc->fhp != NULL) {
1324		if ((ret = __os_ioinfo(env, np, logc->fhp,
1325		    &mbytes, &bytes, NULL)) != 0)
1326			return (ret);
1327		if (logc->bp_maxrec < (mbytes * MEGABYTE + bytes))
1328			logc->bp_maxrec = mbytes * MEGABYTE + bytes;
1329	}
1330
1331	/*
1332	 * If reading from the log file currently being written, we could get
1333	 * an incorrect size, that is, if the cursor was opened on the file
1334	 * when it had only a few hundred bytes, and then the cursor used to
1335	 * move forward in the file, after more log records were written, the
1336	 * original stat value would be wrong.  Use the maximum of the current
1337	 * log file size and the size of the buffer -- that should represent
1338	 * the max of any log record currently in the file.
1339	 *
1340	 * The log buffer size is set when the environment is opened and never
1341	 * changed, we don't need a lock on it.
1342	 */
1343	lp = dblp->reginfo.primary;
1344	if (logc->bp_maxrec < lp->buffer_size)
1345		logc->bp_maxrec = lp->buffer_size;
1346
1347	return (0);
1348}
1349
1350#ifdef HAVE_REPLICATION
1351/*
1352 * __log_rep_split --
1353 *	- Split a log buffer into individual records.
1354 *
1355 * This is used by a replication client to process a bulk log message from the
1356 * master and convert it into individual __rep_apply requests.
1357 *
1358 * PUBLIC: int __log_rep_split __P((ENV *, DB_THREAD_INFO *,
1359 * PUBLIC:     __rep_control_args *, DBT *, DB_LSN *, DB_LSN *));
1360 */
1361int
1362__log_rep_split(env, ip, rp, rec, ret_lsnp, last_lsnp)
1363	ENV *env;
1364	DB_THREAD_INFO *ip;
1365	__rep_control_args *rp;
1366	DBT *rec;
1367	DB_LSN *ret_lsnp;
1368	DB_LSN *last_lsnp;
1369{
1370	DBT logrec;
1371	DB_LSN save_lsn, tmp_lsn;
1372	__rep_control_args tmprp;
1373	__rep_bulk_args b_args;
1374	int ret, save_ret;
1375	u_int32_t save_flags;
1376	u_int8_t *p, *ep;
1377
1378	memset(&logrec, 0, sizeof(logrec));
1379	memset(&save_lsn, 0, sizeof(save_lsn));
1380	memset(&tmp_lsn, 0, sizeof(tmp_lsn));
1381	/*
1382	 * We're going to be modifying the rp LSN contents so make
1383	 * our own private copy to play with.
1384	 */
1385	memcpy(&tmprp, rp, sizeof(tmprp));
1386	/*
1387	 * We send the bulk buffer on a PERM record, so often we will have
1388	 * DB_LOG_PERM set.  However, we only want to mark the last LSN
1389	 * we have as a PERM record.  So clear it here, and when we're on
1390	 * the last record below, set it.  The same applies if the sender
1391	 * set REPCTL_LOG_END on this message.  We want the end of the
1392	 * bulk buffer to be marked as the end.
1393	 */
1394	save_flags = F_ISSET(rp, REPCTL_LOG_END | REPCTL_PERM);
1395	F_CLR(&tmprp, REPCTL_LOG_END | REPCTL_PERM);
1396	ret = save_ret = 0;
1397	for (ep = (u_int8_t *)rec->data + rec->size, p = (u_int8_t *)rec->data;
1398	    p < ep; ) {
1399		/*
1400		 * First thing in the buffer is the length.  Then the LSN
1401		 * of this record, then the record itself.
1402		 */
1403		if (rp->rep_version < DB_REPVERSION_47) {
1404			memcpy(&b_args.len, p, sizeof(b_args.len));
1405			p += sizeof(b_args.len);
1406			memcpy(&tmprp.lsn, p, sizeof(DB_LSN));
1407			p += sizeof(DB_LSN);
1408			logrec.data = p;
1409			logrec.size = b_args.len;
1410			p += b_args.len;
1411		} else {
1412			if ((ret = __rep_bulk_unmarshal(env,
1413			    &b_args, p, rec->size, &p)) != 0)
1414				return (ret);
1415			tmprp.lsn = b_args.lsn;
1416			logrec.data = b_args.bulkdata.data;
1417			logrec.size = b_args.len;
1418		}
1419		RPRINT(env, DB_VERB_REP_MISC, (env,
1420		    "log_rep_split: Processing LSN [%lu][%lu]",
1421		    (u_long)tmprp.lsn.file, (u_long)tmprp.lsn.offset));
1422		RPRINT(env, DB_VERB_REP_MISC, (env,
1423    "log_rep_split: p %#lx ep %#lx logrec data %#lx, size %lu (%#lx)",
1424		    P_TO_ULONG(p), P_TO_ULONG(ep), P_TO_ULONG(logrec.data),
1425		    (u_long)logrec.size, (u_long)logrec.size));
1426		if (p >= ep && save_flags)
1427			F_SET(&tmprp, save_flags);
1428		ret = __rep_apply(env, ip,
1429		    &tmprp, &logrec, &tmp_lsn, NULL, last_lsnp);
1430		RPRINT(env, DB_VERB_REP_MISC, (env,
1431		    "log_split: rep_apply ret %d, tmp_lsn [%lu][%lu]",
1432		    ret, (u_long)tmp_lsn.file, (u_long)tmp_lsn.offset));
1433		switch (ret) {
1434		/*
1435		 * If we received the pieces we need for running recovery,
1436		 * short-circuit because recovery will truncate the log to
1437		 * the LSN we want anyway.
1438		 */
1439		case DB_REP_LOGREADY:
1440			goto out;
1441		/*
1442		 * If we just handled a special record, retain that information.
1443		 */
1444		case DB_REP_ISPERM:
1445		case DB_REP_NOTPERM:
1446			save_ret = ret;
1447			save_lsn = tmp_lsn;
1448			ret = 0;
1449			break;
1450		/*
1451		 * Normal processing, do nothing, just continue.
1452		 */
1453		case 0:
1454			break;
1455		/*
1456		 * If we get an error, then stop immediately.
1457		 */
1458		default:
1459			goto out;
1460		}
1461	}
1462out:
1463	/*
1464	 * If we finish processing successfully, set our return values
1465	 * based on what we saw.
1466	 */
1467	if (ret == 0) {
1468		ret = save_ret;
1469		*ret_lsnp = save_lsn;
1470	}
1471	return (ret);
1472}
1473#endif
1474