1/*-
2 * See the file LICENSE for redistribution information.
3 *
4 * Copyright (c) 1996,2008 Oracle.  All rights reserved.
5 *
6 * $Id: log_put.c,v 12.70 2008/05/13 00:33:27 alexg Exp $
7 */
8
9#include "db_config.h"
10
11#include "db_int.h"
12#include "dbinc/crypto.h"
13#include "dbinc/hmac.h"
14#include "dbinc/log.h"
15#include "dbinc/txn.h"
16
17static int __log_encrypt_record __P((ENV *, DBT *, HDR *, u_int32_t));
18static int __log_file __P((ENV *, const DB_LSN *, char *, size_t));
19static int __log_fill __P((DB_LOG *, DB_LSN *, void *, u_int32_t));
20static int __log_flush_commit __P((ENV *, const DB_LSN *, u_int32_t));
21static int __log_newfh __P((DB_LOG *, int));
22static int __log_put_next __P((ENV *,
23    DB_LSN *, const DBT *, HDR *, DB_LSN *));
24static int __log_putr __P((DB_LOG *,
25    DB_LSN *, const DBT *, u_int32_t, HDR *));
26static int __log_write __P((DB_LOG *, void *, u_int32_t));
27
28/*
29 * __log_put_pp --
30 *	ENV->log_put pre/post processing.
31 *
32 * PUBLIC: int __log_put_pp __P((DB_ENV *, DB_LSN *, const DBT *, u_int32_t));
33 */
34int
35__log_put_pp(dbenv, lsnp, udbt, flags)
36	DB_ENV *dbenv;
37	DB_LSN *lsnp;
38	const DBT *udbt;
39	u_int32_t flags;
40{
41	DB_THREAD_INFO *ip;
42	ENV *env;
43	int ret;
44
45	env = dbenv->env;
46
47	ENV_REQUIRES_CONFIG(env,
48	    env->lg_handle, "DB_ENV->log_put", DB_INIT_LOG);
49
50	/* Validate arguments: check for allowed flags. */
51	if ((ret = __db_fchk(env, "DB_ENV->log_put", flags,
52	    DB_LOG_CHKPNT | DB_LOG_COMMIT |
53	    DB_FLUSH | DB_LOG_NOCOPY | DB_LOG_WRNOSYNC)) != 0)
54		return (ret);
55
56	/* DB_LOG_WRNOSYNC and DB_FLUSH are mutually exclusive. */
57	if (LF_ISSET(DB_LOG_WRNOSYNC) && LF_ISSET(DB_FLUSH))
58		return (__db_ferr(env, "DB_ENV->log_put", 1));
59
60	/* Replication clients should never write log records. */
61	if (IS_REP_CLIENT(env)) {
62		__db_errx(env,
63		    "DB_ENV->log_put is illegal on replication clients");
64		return (EINVAL);
65	}
66
67	ENV_ENTER(env, ip);
68	REPLICATION_WRAP(env, (__log_put(env, lsnp, udbt, flags)), 0, ret);
69	ENV_LEAVE(env, ip);
70	return (ret);
71}
72
73/*
74 * __log_put --
75 *	ENV->log_put.
76 *
77 * PUBLIC: int __log_put __P((ENV *, DB_LSN *, const DBT *, u_int32_t));
78 */
79int
80__log_put(env, lsnp, udbt, flags)
81	ENV *env;
82	DB_LSN *lsnp;
83	const DBT *udbt;
84	u_int32_t flags;
85{
86	DBT *dbt, t;
87	DB_CIPHER *db_cipher;
88	DB_LOG *dblp;
89	DB_LSN lsn, old_lsn;
90	DB_REP *db_rep;
91	HDR hdr;
92	LOG *lp;
93	REP *rep;
94	int lock_held, need_free, ret;
95	u_int8_t *key;
96
97	dblp = env->lg_handle;
98	lp = dblp->reginfo.primary;
99	db_cipher = env->crypto_handle;
100	db_rep = env->rep_handle;
101	if (db_rep != NULL)
102		rep = db_rep->region;
103	else
104		rep = NULL;
105
106	dbt = &t;
107	t = *udbt;
108	lock_held = need_free = 0;
109	ZERO_LSN(old_lsn);
110	hdr.len = hdr.prev = 0;
111
112#if !defined(DEBUG_ROP) && !defined(DEBUG_WOP)
113	/*
114	 * If we are not a rep application, but are sharing a master rep env,
115	 * we should not be writing log records.
116	 */
117	if (IS_REP_MASTER(env) && db_rep->send == NULL) {
118		__db_errx(env, "%s %s",
119		    "Non-replication DB_ENV handle attempting",
120		    "to modify a replicated environment");
121		return (EINVAL);
122	}
123#endif
124	DB_ASSERT(env, !IS_REP_CLIENT(env));
125
126	/*
127	 * If we are coming from the logging code, we use an internal flag,
128	 * DB_LOG_NOCOPY, because we know we can overwrite/encrypt the log
129	 * record in place.  Otherwise, if a user called log_put then we
130	 * must copy it to new memory so that we know we can write it.
131	 *
132	 * We also must copy it to new memory if we are a replication master
133	 * so that we retain an unencrypted copy of the log record to send
134	 * to clients.
135	 */
136	if (!LF_ISSET(DB_LOG_NOCOPY) || IS_REP_MASTER(env)) {
137		if (CRYPTO_ON(env))
138			t.size += db_cipher->adj_size(udbt->size);
139		if ((ret = __os_calloc(env, 1, t.size, &t.data)) != 0)
140			goto err;
141		need_free = 1;
142		memcpy(t.data, udbt->data, udbt->size);
143	}
144	if ((ret = __log_encrypt_record(env, dbt, &hdr, udbt->size)) != 0)
145		goto err;
146	if (CRYPTO_ON(env))
147		key = db_cipher->mac_key;
148	else
149		key = NULL;
150
151	/* Before we grab the region lock, calculate the record's checksum. */
152	if (lp->persist.version != DB_LOGVERSION)
153		__db_chksum(NULL, dbt->data, dbt->size, key, hdr.chksum);
154	else
155		__db_chksum(&hdr, dbt->data, dbt->size, key, hdr.chksum);
156
157	LOG_SYSTEM_LOCK(env);
158	lock_held = 1;
159
160	if ((ret = __log_put_next(env, &lsn, dbt, &hdr, &old_lsn)) != 0)
161		goto panic_check;
162
163	/*
164	 * Assign the return LSN before dropping the region lock.  Necessary
165	 * in case the lsn is a begin_lsn from a TXN_DETAIL structure passed in
166	 * by the logging routines.  We use atomic 32-bit operations because
167	 * during commit this will be a TXN_DETAIL visible_lsn field, and MVCC
168	 * relies on reading the fields atomically.
169	 */
170	lsnp->file = lsn.file;
171	lsnp->offset = lsn.offset;
172
173#ifdef HAVE_REPLICATION
174	if (IS_REP_MASTER(env)) {
175		__rep_newfile_args nf_args;
176		DBT newfiledbt;
177		REP_BULK bulk;
178		size_t len;
179		u_int32_t ctlflags;
180		u_int8_t buf[__REP_NEWFILE_SIZE];
181
182		/*
183		 * Replication masters need to drop the lock to send messages,
184		 * but want to drop and reacquire it a minimal number of times.
185		 */
186		ctlflags = LF_ISSET(DB_LOG_COMMIT | DB_LOG_CHKPNT) ?
187		    REPCTL_PERM : 0;
188		/*
189		 * If using leases, keep track of our last PERM lsn.
190		 * Set this on a master under the log lock.
191		 */
192		if (IS_USING_LEASES(env) &&
193		    FLD_ISSET(ctlflags, REPCTL_PERM))
194			lp->max_perm_lsn = lsn;
195		LOG_SYSTEM_UNLOCK(env);
196		lock_held = 0;
197		if (LF_ISSET(DB_FLUSH))
198			ctlflags |= REPCTL_FLUSH;
199
200		/*
201		 * If we changed files and we're in a replicated environment,
202		 * we need to inform our clients now that we've dropped the
203		 * region lock.
204		 *
205		 * Note that a failed NEWFILE send is a dropped message that
206		 * our client can handle, so we can ignore it.  It's possible
207		 * that the record we already put is a commit, so we don't just
208		 * want to return failure.
209		 */
210		if (!IS_ZERO_LSN(old_lsn)) {
211			memset(&newfiledbt, 0, sizeof(newfiledbt));
212			nf_args.version = lp->persist.version;
213			(void)__rep_newfile_marshal(env, &nf_args,
214			    buf, __REP_NEWFILE_SIZE, &len);
215			DB_INIT_DBT(newfiledbt, buf, len);
216			(void)__rep_send_message(env, DB_EID_BROADCAST,
217			    REP_NEWFILE, &old_lsn, &newfiledbt, 0, 0);
218		}
219
220		/*
221		 * If we're doing bulk processing put it in the bulk buffer.
222		 */
223		ret = 0;
224		if (FLD_ISSET(rep->config, REP_C_BULK)) {
225			/*
226			 * Bulk could have been turned on by another process.
227			 * If so, set the address into the bulk region now.
228			 */
229			if (db_rep->bulk == NULL)
230				db_rep->bulk = R_ADDR(&dblp->reginfo,
231				    lp->bulk_buf);
232			memset(&bulk, 0, sizeof(bulk));
233			bulk.addr = db_rep->bulk;
234			bulk.offp = &lp->bulk_off;
235			bulk.len = lp->bulk_len;
236			bulk.lsn = lsn;
237			bulk.type = REP_BULK_LOG;
238			bulk.eid = DB_EID_BROADCAST;
239			bulk.flagsp = &lp->bulk_flags;
240			ret = __rep_bulk_message(env, &bulk, NULL,
241			    &lsn, udbt, ctlflags);
242		}
243		if (!FLD_ISSET(rep->config, REP_C_BULK) ||
244		    ret == DB_REP_BULKOVF) {
245			/*
246			 * Then send the log record itself on to our clients.
247			 */
248			/*
249			 * !!!
250			 * In the crypto case, we MUST send the udbt, not the
251			 * now-encrypted dbt.  Clients have no way to decrypt
252			 * without the header.
253			 */
254			ret = __rep_send_message(env, DB_EID_BROADCAST,
255			    REP_LOG, &lsn, udbt, ctlflags, 0);
256		}
257		/*
258		 * If the send fails and we're a commit or checkpoint,
259		 * there's nothing we can do;  the record's in the log.
260		 * Flush it, even if we're running with TXN_NOSYNC,
261		 * on the grounds that it should be in durable
262		 * form somewhere.
263		 *
264		 * If the send fails with this perm record and leases
265		 * are in use, we need to forcibly expire all lease
266		 * grants to prevent authoritative reads.
267		 */
268		if (ret != 0 && FLD_ISSET(ctlflags, REPCTL_PERM)) {
269			LF_SET(DB_FLUSH);
270			if (IS_USING_LEASES(env))
271				(void)__rep_lease_expire(env, 0);
272		}
273		/*
274		 * We ignore send failures so reset 'ret' to 0 here.
275		 * We needed to check special return values from
276		 * bulk transfer and errors from either bulk or normal
277		 * message sending need flushing on perm records.  But
278		 * otherwise we need to ignore it and reset it now.
279		 */
280		ret = 0;
281	}
282#endif
283
284	/*
285	 * If needed, do a flush.  Note that failures at this point
286	 * are only permissible if we know we haven't written a commit
287	 * record;  __log_flush_commit is responsible for enforcing this.
288	 *
289	 * If a flush is not needed, see if WRITE_NOSYNC was set and we
290	 * need to write out the log buffer.
291	 */
292	if (LF_ISSET(DB_FLUSH | DB_LOG_WRNOSYNC)) {
293		if (!lock_held) {
294			LOG_SYSTEM_LOCK(env);
295			lock_held = 1;
296		}
297		if ((ret = __log_flush_commit(env, &lsn, flags)) != 0)
298			goto panic_check;
299	}
300
301	/*
302	 * If flushed a checkpoint record, reset the "bytes since the last
303	 * checkpoint" counters.
304	 */
305	if (LF_ISSET(DB_LOG_CHKPNT))
306		lp->stat.st_wc_bytes = lp->stat.st_wc_mbytes = 0;
307
308	/* Increment count of records added to the log. */
309	STAT(++lp->stat.st_record);
310
311	if (0) {
312panic_check:	/*
313		 * Writing log records cannot fail if we're a replication
314		 * master.  The reason is that once we send the record to
315		 * replication clients, the transaction can no longer
316		 * abort, otherwise the master would be out of sync with
317		 * the rest of the replication group.  Panic the system.
318		 */
319		if (ret != 0 && IS_REP_MASTER(env))
320			ret = __env_panic(env, ret);
321	}
322
323err:	if (lock_held)
324		LOG_SYSTEM_UNLOCK(env);
325	if (need_free)
326		__os_free(env, dbt->data);
327
328	/*
329	 * If auto-remove is set and we switched files, remove unnecessary
330	 * log files.
331	 */
332	if (ret == 0 && !IS_ZERO_LSN(old_lsn) && lp->db_log_autoremove)
333		__log_autoremove(env);
334
335	return (ret);
336}
337
338/*
339 * __log_current_lsn --
340 *	Return the current LSN.
341 *
342 * PUBLIC: int __log_current_lsn
343 * PUBLIC:     __P((ENV *, DB_LSN *, u_int32_t *, u_int32_t *));
344 */
345int
346__log_current_lsn(env, lsnp, mbytesp, bytesp)
347	ENV *env;
348	DB_LSN *lsnp;
349	u_int32_t *mbytesp, *bytesp;
350{
351	DB_LOG *dblp;
352	LOG *lp;
353
354	dblp = env->lg_handle;
355	lp = dblp->reginfo.primary;
356
357	LOG_SYSTEM_LOCK(env);
358
359	/*
360	 * We need the LSN of the last entry in the log.
361	 *
362	 * Typically, it's easy to get the last written LSN, you simply look
363	 * at the current log pointer and back up the number of bytes of the
364	 * last log record.  However, if the last thing we did was write the
365	 * log header of a new log file, then, this doesn't work, so we return
366	 * the first log record that will be written in this new file.
367	 */
368	*lsnp = lp->lsn;
369	if (lp->lsn.offset > lp->len)
370		lsnp->offset -= lp->len;
371
372	/*
373	 * Since we're holding the log region lock, return the bytes put into
374	 * the log since the last checkpoint, transaction checkpoint needs it.
375	 *
376	 * We add the current buffer offset so as to count bytes that have not
377	 * yet been written, but are sitting in the log buffer.
378	 */
379	if (mbytesp != NULL) {
380		*mbytesp = lp->stat.st_wc_mbytes;
381		*bytesp = (u_int32_t)(lp->stat.st_wc_bytes + lp->b_off);
382	}
383
384	LOG_SYSTEM_UNLOCK(env);
385
386	return (0);
387}
388
389/*
390 * __log_put_next --
391 *	Put the given record as the next in the log, wherever that may
392 * turn out to be.
393 */
394static int
395__log_put_next(env, lsn, dbt, hdr, old_lsnp)
396	ENV *env;
397	DB_LSN *lsn;
398	const DBT *dbt;
399	HDR *hdr;
400	DB_LSN *old_lsnp;
401{
402	DB_LOG *dblp;
403	DB_LSN old_lsn;
404	LOG *lp;
405	int adv_file, newfile, ret;
406
407	dblp = env->lg_handle;
408	lp = dblp->reginfo.primary;
409
410	/*
411	 * Save a copy of lp->lsn before we might decide to switch log
412	 * files and change it.  If we do switch log files, and we're
413	 * doing replication, we'll need to tell our clients about the
414	 * switch, and they need to receive a NEWFILE message
415	 * with this "would-be" LSN in order to know they're not
416	 * missing any log records.
417	 */
418	old_lsn = lp->lsn;
419	newfile = 0;
420	adv_file = 0;
421	/*
422	 * If our current log is at an older version and we want to write
423	 * a record then we need to advance the log.
424	 */
425	if (lp->persist.version != DB_LOGVERSION) {
426		__log_set_version(env, DB_LOGVERSION);
427		adv_file = 1;
428	}
429
430	/*
431	 * If this information won't fit in the file, or if we're a
432	 * replication client environment and have been told to do so,
433	 * swap files.
434	 */
435	if (adv_file || lp->lsn.offset == 0 ||
436	    lp->lsn.offset + hdr->size + dbt->size > lp->log_size) {
437		if (hdr->size + sizeof(LOGP) + dbt->size > lp->log_size) {
438			__db_errx(env,
439	    "DB_ENV->log_put: record larger than maximum file size (%lu > %lu)",
440			    (u_long)hdr->size + sizeof(LOGP) + dbt->size,
441			    (u_long)lp->log_size);
442			return (EINVAL);
443		}
444
445		if ((ret = __log_newfile(dblp, NULL, 0, 0)) != 0)
446			return (ret);
447
448		/*
449		 * Flag that we switched files, in case we're a master
450		 * and need to send this information to our clients.
451		 * We postpone doing the actual send until we can
452		 * safely release the log region lock and are doing so
453		 * anyway.
454		 */
455		newfile = 1;
456	}
457
458	/* If we switched log files, let our caller know where. */
459	if (newfile)
460		*old_lsnp = old_lsn;
461
462	/* Actually put the record. */
463	return (__log_putr(dblp, lsn, dbt, lp->lsn.offset - lp->len, hdr));
464}
465
466/*
467 * __log_flush_commit --
468 *	Flush a record.
469 */
470static int
471__log_flush_commit(env, lsnp, flags)
472	ENV *env;
473	const DB_LSN *lsnp;
474	u_int32_t flags;
475{
476	DB_LOG *dblp;
477	DB_LSN flush_lsn;
478	LOG *lp;
479	int ret;
480
481	dblp = env->lg_handle;
482	lp = dblp->reginfo.primary;
483	flush_lsn = *lsnp;
484
485	ret = 0;
486
487	/*
488	 * DB_FLUSH:
489	 *	Flush a record for which the DB_FLUSH flag to log_put was set.
490	 *
491	 * DB_LOG_WRNOSYNC:
492	 *	If there's anything in the current log buffer, write it out.
493	 */
494	if (LF_ISSET(DB_FLUSH))
495		ret = __log_flush_int(dblp, &flush_lsn, 1);
496	else if (!lp->db_log_inmemory && lp->b_off != 0)
497		if ((ret = __log_write(dblp,
498		    dblp->bufp, (u_int32_t)lp->b_off)) == 0)
499			lp->b_off = 0;
500
501	/*
502	 * If a flush supporting a transaction commit fails, we must abort the
503	 * transaction.  (If we aren't doing a commit, return the failure; if
504	 * if the commit we care about made it to disk successfully, we just
505	 * ignore the failure, because there's no way to undo the commit.)
506	 */
507	if (ret == 0 || !LF_ISSET(DB_LOG_COMMIT))
508		return (ret);
509
510	if (flush_lsn.file != lp->lsn.file || flush_lsn.offset < lp->w_off)
511		return (0);
512
513	/*
514	 * Else, make sure that the commit record does not get out after we
515	 * abort the transaction.  Do this by overwriting the commit record
516	 * in the buffer.  (Note that other commits in this buffer will wait
517	 * until a successful write happens, we do not wake them.)  We point
518	 * at the right part of the buffer and write an abort record over the
519	 * commit.  We must then try and flush the buffer again, since the
520	 * interesting part of the buffer may have actually made it out to
521	 * disk before there was a failure, we can't know for sure.
522	 */
523	if (__txn_force_abort(env,
524	    dblp->bufp + flush_lsn.offset - lp->w_off) == 0)
525		(void)__log_flush_int(dblp, &flush_lsn, 0);
526
527	return (ret);
528}
529
530/*
531 * __log_newfile --
532 *	Initialize and switch to a new log file.  (Note that this is
533 * called both when no log yet exists and when we fill a log file.)
534 *
535 * PUBLIC: int __log_newfile __P((DB_LOG *, DB_LSN *, u_int32_t, u_int32_t));
536 */
537int
538__log_newfile(dblp, lsnp, logfile, version)
539	DB_LOG *dblp;
540	DB_LSN *lsnp;
541	u_int32_t logfile;
542	u_int32_t version;
543{
544	DBT t;
545	DB_CIPHER *db_cipher;
546	DB_LSN lsn;
547	ENV *env;
548	HDR hdr;
549	LOG *lp;
550	LOGP *tpersist;
551	int need_free, ret;
552	u_int32_t lastoff;
553	size_t tsize;
554
555	env = dblp->env;
556	lp = dblp->reginfo.primary;
557
558	/*
559	 * If we're not specifying a specific log file number and we're
560	 * not at the beginning of a file already, start a new one.
561	 */
562	if (logfile == 0 && lp->lsn.offset != 0) {
563		/*
564		 * Flush the log so this file is out and can be closed.  We
565		 * cannot release the region lock here because we need to
566		 * protect the end of the file while we switch.  In
567		 * particular, a thread with a smaller record than ours
568		 * could detect that there is space in the log. Even
569		 * blocking that event by declaring the file full would
570		 * require all threads to wait here so that the lsn.file
571		 * can be moved ahead after the flush completes.  This
572		 * probably can be changed if we had an lsn for the
573		 * previous file and one for the current, but it does not
574		 * seem like this would get much more throughput, if any.
575		 */
576		if ((ret = __log_flush_int(dblp, NULL, 0)) != 0)
577			return (ret);
578
579		/*
580		 * Save the last known offset from the previous file, we'll
581		 * need it to initialize the persistent header information.
582		 */
583		lastoff = lp->lsn.offset;
584
585		/* Point the current LSN to the new file. */
586		++lp->lsn.file;
587		lp->lsn.offset = 0;
588
589		/* Reset the file write offset. */
590		lp->w_off = 0;
591	} else
592		lastoff = 0;
593
594	/*
595	 * Replication may require we reset the log file name space entirely.
596	 * In that case we also force a file switch so that replication can
597	 * clean up old files.
598	 */
599	if (logfile != 0) {
600		lp->lsn.file = logfile;
601		lp->lsn.offset = 0;
602		lp->w_off = 0;
603		if (lp->db_log_inmemory) {
604			lsn = lp->lsn;
605			(void)__log_zero(env, &lsn);
606		} else {
607			lp->s_lsn = lp->lsn;
608			if ((ret = __log_newfh(dblp, 1)) != 0)
609				return (ret);
610		}
611	}
612
613	DB_ASSERT(env, lp->db_log_inmemory || lp->b_off == 0);
614	if (lp->db_log_inmemory &&
615	    (ret = __log_inmem_newfile(dblp, lp->lsn.file)) != 0)
616		return (ret);
617
618	/*
619	 * Insert persistent information as the first record in every file.
620	 * Note that the previous length is wrong for the very first record
621	 * of the log, but that's okay, we check for it during retrieval.
622	 */
623	memset(&t, 0, sizeof(t));
624	memset(&hdr, 0, sizeof(HDR));
625
626	need_free = 0;
627	tsize = sizeof(LOGP);
628	db_cipher = env->crypto_handle;
629	if (CRYPTO_ON(env))
630		tsize += db_cipher->adj_size(tsize);
631	if ((ret = __os_calloc(env, 1, tsize, &tpersist)) != 0)
632		return (ret);
633	need_free = 1;
634	/*
635	 * If we're told what version to make this file, then we
636	 * need to be at that version.  Update here.
637	 */
638	if (version != 0) {
639		__log_set_version(env, version);
640		if ((ret = __env_init_rec(env, version)) != 0)
641			goto err;
642	}
643	lp->persist.log_size = lp->log_size = lp->log_nsize;
644	memcpy(tpersist, &lp->persist, sizeof(LOGP));
645	DB_SET_DBT(t, tpersist, tsize);
646	if (LOG_SWAPPED(env))
647		__log_persistswap(tpersist);
648
649	if ((ret =
650	    __log_encrypt_record(env, &t, &hdr, (u_int32_t)tsize)) != 0)
651		goto err;
652	if (lp->persist.version != DB_LOGVERSION)
653		__db_chksum(NULL, t.data, t.size,
654		    (CRYPTO_ON(env)) ? db_cipher->mac_key : NULL, hdr.chksum);
655	else
656		__db_chksum(&hdr, t.data, t.size,
657		    (CRYPTO_ON(env)) ? db_cipher->mac_key : NULL, hdr.chksum);
658
659	if ((ret = __log_putr(dblp, &lsn,
660	    &t, lastoff == 0 ? 0 : lastoff - lp->len, &hdr)) != 0)
661		goto err;
662
663	/* Update the LSN information returned to the caller. */
664	if (lsnp != NULL)
665		*lsnp = lp->lsn;
666
667err:	if (need_free)
668		__os_free(env, tpersist);
669	return (ret);
670}
671
672/*
673 * __log_putr --
674 *	Actually put a record into the log.
675 */
676static int
677__log_putr(dblp, lsn, dbt, prev, h)
678	DB_LOG *dblp;
679	DB_LSN *lsn;
680	const DBT *dbt;
681	u_int32_t prev;
682	HDR *h;
683{
684	DB_CIPHER *db_cipher;
685	DB_LSN f_lsn;
686	ENV *env;
687	HDR tmp, *hdr;
688	LOG *lp;
689	int ret, t_ret;
690	size_t b_off, nr;
691	u_int32_t w_off;
692
693	env = dblp->env;
694	lp = dblp->reginfo.primary;
695
696	/*
697	 * If we weren't given a header, use a local one.
698	 */
699	db_cipher = env->crypto_handle;
700	if (h == NULL) {
701		hdr = &tmp;
702		memset(hdr, 0, sizeof(HDR));
703		if (CRYPTO_ON(env))
704			hdr->size = HDR_CRYPTO_SZ;
705		else
706			hdr->size = HDR_NORMAL_SZ;
707	} else
708		hdr = h;
709
710	/* Save our position in case we fail. */
711	b_off = lp->b_off;
712	w_off = lp->w_off;
713	f_lsn = lp->f_lsn;
714
715	/*
716	 * Initialize the header.  If we just switched files, lsn.offset will
717	 * be 0, and what we really want is the offset of the previous record
718	 * in the previous file.  Fortunately, prev holds the value we want.
719	 */
720	hdr->prev = prev;
721	hdr->len = (u_int32_t)hdr->size + dbt->size;
722
723	/*
724	 * If we were passed in a nonzero checksum, our caller calculated
725	 * the checksum before acquiring the log mutex, as an optimization.
726	 *
727	 * If our caller calculated a real checksum of 0, we'll needlessly
728	 * recalculate it.  C'est la vie;  there's no out-of-bounds value
729	 * here.
730	 */
731	if (hdr->chksum[0] == 0)
732		if (lp->persist.version != DB_LOGVERSION)
733			__db_chksum(NULL, dbt->data, dbt->size,
734			    (CRYPTO_ON(env)) ? db_cipher->mac_key : NULL,
735			    hdr->chksum);
736		else
737			__db_chksum(hdr, dbt->data, dbt->size,
738			    (CRYPTO_ON(env)) ? db_cipher->mac_key : NULL,
739			    hdr->chksum);
740	else if (lp->persist.version == DB_LOGVERSION) {
741		/*
742		 * We need to correct for prev and len since they are not
743		 * set before here.
744		 */
745		LOG_HDR_SUM(CRYPTO_ON(env), hdr, hdr->chksum);
746	}
747
748	if (lp->db_log_inmemory && (ret = __log_inmem_chkspace(dblp,
749	    (u_int32_t)hdr->size + dbt->size)) != 0)
750		goto err;
751
752	/*
753	 * The offset into the log file at this point is the LSN where
754	 * we're about to put this record, and is the LSN the caller wants.
755	 */
756	*lsn = lp->lsn;
757
758	nr = hdr->size;
759	if (LOG_SWAPPED(env))
760		__log_hdrswap(hdr, CRYPTO_ON(env));
761
762	 /* nr can't overflow a 32 bit value - header size is internal. */
763	ret = __log_fill(dblp, lsn, hdr, (u_int32_t)nr);
764
765	if (LOG_SWAPPED(env))
766		__log_hdrswap(hdr, CRYPTO_ON(env));
767
768	if (ret != 0)
769		goto err;
770
771	if ((ret = __log_fill(dblp, lsn, dbt->data, dbt->size)) != 0)
772		goto err;
773
774	lp->len = (u_int32_t)(hdr->size + dbt->size);
775	lp->lsn.offset += lp->len;
776	return (0);
777err:
778	/*
779	 * If we wrote more than one buffer before failing, get the
780	 * first one back.  The extra buffers will fail the checksums
781	 * and be ignored.
782	 */
783	if (w_off + lp->buffer_size < lp->w_off) {
784		DB_ASSERT(env, !lp->db_log_inmemory);
785		if ((t_ret = __os_seek(env, dblp->lfhp, 0, 0, w_off)) != 0 ||
786		    (t_ret = __os_read(env, dblp->lfhp, dblp->bufp,
787		    b_off, &nr)) != 0)
788			return (__env_panic(env, t_ret));
789		if (nr != b_off) {
790			__db_errx(env, "Short read while restoring log");
791			return (__env_panic(env, EIO));
792		}
793	}
794
795	/* Reset to where we started. */
796	lp->w_off = w_off;
797	lp->b_off = b_off;
798	lp->f_lsn = f_lsn;
799
800	return (ret);
801}
802
803/*
804 * __log_flush_pp --
805 *	ENV->log_flush pre/post processing.
806 *
807 * PUBLIC: int __log_flush_pp __P((DB_ENV *, const DB_LSN *));
808 */
809int
810__log_flush_pp(dbenv, lsn)
811	DB_ENV *dbenv;
812	const DB_LSN *lsn;
813{
814	DB_THREAD_INFO *ip;
815	ENV *env;
816	int ret;
817
818	env = dbenv->env;
819
820	ENV_REQUIRES_CONFIG(env,
821	    env->lg_handle, "DB_ENV->log_flush", DB_INIT_LOG);
822
823	ENV_ENTER(env, ip);
824	REPLICATION_WRAP(env, (__log_flush(env, lsn)), 0, ret);
825	ENV_LEAVE(env, ip);
826	return (ret);
827}
828
/*
 * ALREADY_FLUSHED --
 *	See if we need to wait.  s_lsn is not locked, so some care is
 * needed.  The sync point can only move forward, and lsnp->file cannot be
 * greater than s_lsn.file.  If the file we want is in the past we are
 * done; if the file numbers are the same, check the offset.  This all
 * assumes a 32-bit quantity is read in one state or the other, never in
 * transition.
 */
#define	ALREADY_FLUSHED(lp, lsnp)					\
	((lp)->s_lsn.file > (lsnp)->file ? 1 :				\
	    ((lp)->s_lsn.file == (lsnp)->file &&			\
	    (lp)->s_lsn.offset > (lsnp)->offset))
841
842/*
843 * __log_flush --
844 *	ENV->log_flush
845 *
846 * PUBLIC: int __log_flush __P((ENV *, const DB_LSN *));
847 */
848int
849__log_flush(env, lsn)
850	ENV *env;
851	const DB_LSN *lsn;
852{
853	DB_LOG *dblp;
854	LOG *lp;
855	int ret;
856
857	dblp = env->lg_handle;
858	lp = dblp->reginfo.primary;
859	if (lsn != NULL && ALREADY_FLUSHED(lp, lsn))
860		return (0);
861	LOG_SYSTEM_LOCK(env);
862	ret = __log_flush_int(dblp, lsn, 1);
863	LOG_SYSTEM_UNLOCK(env);
864	return (ret);
865}
866
867/*
868 * __log_flush_int --
869 *	Write all records less than or equal to the specified LSN; internal
870 *	version.
871 *
872 * PUBLIC: int __log_flush_int __P((DB_LOG *, const DB_LSN *, int));
873 */
874int
875__log_flush_int(dblp, lsnp, release)
876	DB_LOG *dblp;
877	const DB_LSN *lsnp;
878	int release;
879{
880	struct __db_commit *commit;
881	ENV *env;
882	DB_LSN flush_lsn, f_lsn;
883	LOG *lp;
884	size_t b_off;
885	u_int32_t ncommit, w_off;
886	int do_flush, first, ret;
887
888	env = dblp->env;
889	lp = dblp->reginfo.primary;
890	ncommit = 0;
891	ret = 0;
892
893	if (lp->db_log_inmemory) {
894		lp->s_lsn = lp->lsn;
895		STAT(++lp->stat.st_scount);
896		return (0);
897	}
898
899	/*
900	 * If no LSN specified, flush the entire log by setting the flush LSN
901	 * to the last LSN written in the log.  Otherwise, check that the LSN
902	 * isn't a non-existent record for the log.
903	 */
904	if (lsnp == NULL) {
905		flush_lsn.file = lp->lsn.file;
906		flush_lsn.offset = lp->lsn.offset - lp->len;
907	} else if (lsnp->file > lp->lsn.file ||
908	    (lsnp->file == lp->lsn.file &&
909	    lsnp->offset > lp->lsn.offset - lp->len)) {
910		__db_errx(env,
911    "DB_ENV->log_flush: LSN of %lu/%lu past current end-of-log of %lu/%lu",
912		    (u_long)lsnp->file, (u_long)lsnp->offset,
913		    (u_long)lp->lsn.file, (u_long)lp->lsn.offset);
914		__db_errx(env, "%s %s %s",
915		    "Database environment corrupt; the wrong log files may",
916		    "have been removed or incompatible database files imported",
917		    "from another environment");
918		return (__env_panic(env, DB_RUNRECOVERY));
919	} else {
920		if (ALREADY_FLUSHED(lp, lsnp))
921			return (0);
922		flush_lsn = *lsnp;
923	}
924
925	/*
926	 * If a flush is in progress and we're allowed to do so, drop
927	 * the region lock and block waiting for the next flush.
928	 */
929	if (release && lp->in_flush != 0) {
930		if ((commit = SH_TAILQ_FIRST(
931		    &lp->free_commits, __db_commit)) == NULL) {
932			if ((ret = __env_alloc(&dblp->reginfo,
933			    sizeof(struct __db_commit), &commit)) != 0)
934				goto flush;
935			memset(commit, 0, sizeof(*commit));
936			if ((ret = __mutex_alloc(env, MTX_TXN_COMMIT,
937			    DB_MUTEX_SELF_BLOCK, &commit->mtx_txnwait)) != 0) {
938				__env_alloc_free(&dblp->reginfo, commit);
939				return (ret);
940			}
941			MUTEX_LOCK(env, commit->mtx_txnwait);
942		} else
943			SH_TAILQ_REMOVE(
944			    &lp->free_commits, commit, links, __db_commit);
945
946		lp->ncommit++;
947
948		/*
949		 * Flushes may be requested out of LSN order;  be
950		 * sure we only move lp->t_lsn forward.
951		 */
952		if (LOG_COMPARE(&lp->t_lsn, &flush_lsn) < 0)
953			lp->t_lsn = flush_lsn;
954
955		commit->lsn = flush_lsn;
956		SH_TAILQ_INSERT_HEAD(
957		    &lp->commits, commit, links, __db_commit);
958		LOG_SYSTEM_UNLOCK(env);
959		/* Wait here for the in-progress flush to finish. */
960		MUTEX_LOCK(env, commit->mtx_txnwait);
961		LOG_SYSTEM_LOCK(env);
962
963		lp->ncommit--;
964		/*
965		 * Grab the flag before freeing the struct to see if
966		 * we need to flush the log to commit.  If so,
967		 * use the maximal lsn for any committing thread.
968		 */
969		do_flush = F_ISSET(commit, DB_COMMIT_FLUSH);
970		F_CLR(commit, DB_COMMIT_FLUSH);
971		SH_TAILQ_INSERT_HEAD(
972		    &lp->free_commits, commit, links, __db_commit);
973		if (do_flush) {
974			lp->in_flush--;
975			flush_lsn = lp->t_lsn;
976		} else
977			return (0);
978	}
979
980	/*
981	 * Protect flushing with its own mutex so we can release
982	 * the region lock except during file switches.
983	 */
984flush:	MUTEX_LOCK(env, lp->mtx_flush);
985
986	/*
987	 * If the LSN is less than or equal to the last-sync'd LSN, we're done.
988	 * Note, the last-sync LSN saved in s_lsn is the LSN of the first byte
989	 * after the byte we absolutely know was written to disk, so the test
990	 * is <, not <=.
991	 */
992	if (flush_lsn.file < lp->s_lsn.file ||
993	    (flush_lsn.file == lp->s_lsn.file &&
994	    flush_lsn.offset < lp->s_lsn.offset)) {
995		MUTEX_UNLOCK(env, lp->mtx_flush);
996		goto done;
997	}
998
999	/*
1000	 * We may need to write the current buffer.  We have to write the
1001	 * current buffer if the flush LSN is greater than or equal to the
1002	 * buffer's starting LSN.
1003	 *
1004	 * Otherwise, it's still possible that this thread may never have
1005	 * written to this log file.  Acquire a file descriptor if we don't
1006	 * already have one.
1007	 */
1008	if (lp->b_off != 0 && LOG_COMPARE(&flush_lsn, &lp->f_lsn) >= 0) {
1009		if ((ret = __log_write(dblp,
1010		    dblp->bufp, (u_int32_t)lp->b_off)) != 0) {
1011			MUTEX_UNLOCK(env, lp->mtx_flush);
1012			goto done;
1013		}
1014
1015		lp->b_off = 0;
1016	} else if (dblp->lfhp == NULL || dblp->lfname != lp->lsn.file)
1017		if ((ret = __log_newfh(dblp, 0)) != 0) {
1018			MUTEX_UNLOCK(env, lp->mtx_flush);
1019			goto done;
1020		}
1021
1022	/*
1023	 * We are going to flush, release the region.
1024	 * First get the current state of the buffer since
1025	 * another write may come in, but we may not flush it.
1026	 */
1027	b_off = lp->b_off;
1028	w_off = lp->w_off;
1029	f_lsn = lp->f_lsn;
1030	lp->in_flush++;
1031	if (release)
1032		LOG_SYSTEM_UNLOCK(env);
1033
1034	/* Sync all writes to disk. */
1035	if ((ret = __os_fsync(env, dblp->lfhp)) != 0) {
1036		MUTEX_UNLOCK(env, lp->mtx_flush);
1037		if (release)
1038			LOG_SYSTEM_LOCK(env);
1039		ret = __env_panic(env, ret);
1040		return (ret);
1041	}
1042
1043	/*
1044	 * Set the last-synced LSN.
1045	 * This value must be set to the LSN past the last complete
1046	 * record that has been flushed.  This is at least the first
1047	 * lsn, f_lsn.  If the buffer is empty, b_off == 0, then
1048	 * we can move up to write point since the first lsn is not
1049	 * set for the new buffer.
1050	 */
1051	lp->s_lsn = f_lsn;
1052	if (b_off == 0)
1053		lp->s_lsn.offset = w_off;
1054
1055	MUTEX_UNLOCK(env, lp->mtx_flush);
1056	if (release)
1057		LOG_SYSTEM_LOCK(env);
1058
1059	lp->in_flush--;
1060	STAT(++lp->stat.st_scount);
1061
1062	/*
1063	 * How many flush calls (usually commits) did this call actually sync?
1064	 * At least one, if it got here.
1065	 */
1066	ncommit = 1;
1067done:
1068	if (lp->ncommit != 0) {
1069		first = 1;
1070		SH_TAILQ_FOREACH(commit, &lp->commits, links, __db_commit)
1071			if (LOG_COMPARE(&lp->s_lsn, &commit->lsn) > 0) {
1072				MUTEX_UNLOCK(env, commit->mtx_txnwait);
1073				SH_TAILQ_REMOVE(
1074				    &lp->commits, commit, links, __db_commit);
1075				ncommit++;
1076			} else if (first == 1) {
1077				F_SET(commit, DB_COMMIT_FLUSH);
1078				MUTEX_UNLOCK(env, commit->mtx_txnwait);
1079				SH_TAILQ_REMOVE(
1080				    &lp->commits, commit, links, __db_commit);
1081				/*
1082				 * This thread will wake and flush.
1083				 * If another thread commits and flushes
1084				 * first we will waste a trip trough the
1085				 * mutex.
1086				 */
1087				lp->in_flush++;
1088				first = 0;
1089			}
1090	}
1091#ifdef HAVE_STATISTICS
1092	if (lp->stat.st_maxcommitperflush < ncommit)
1093		lp->stat.st_maxcommitperflush = ncommit;
1094	if (lp->stat.st_mincommitperflush > ncommit ||
1095	    lp->stat.st_mincommitperflush == 0)
1096		lp->stat.st_mincommitperflush = ncommit;
1097#endif
1098
1099	return (ret);
1100}
1101
1102/*
1103 * __log_fill --
1104 *	Write information into the log.
1105 */
1106static int
1107__log_fill(dblp, lsn, addr, len)
1108	DB_LOG *dblp;
1109	DB_LSN *lsn;
1110	void *addr;
1111	u_int32_t len;
1112{
1113	LOG *lp;
1114	u_int32_t bsize, nrec;
1115	size_t nw, remain;
1116	int ret;
1117
1118	lp = dblp->reginfo.primary;
1119	bsize = lp->buffer_size;
1120
1121	if (lp->db_log_inmemory) {
1122		__log_inmem_copyin(dblp, lp->b_off, addr, len);
1123		lp->b_off = (lp->b_off + len) % lp->buffer_size;
1124		return (0);
1125	}
1126
1127	while (len > 0) {			/* Copy out the data. */
1128		/*
1129		 * If we're beginning a new buffer, note the user LSN to which
1130		 * the first byte of the buffer belongs.  We have to know this
1131		 * when flushing the buffer so that we know if the in-memory
1132		 * buffer needs to be flushed.
1133		 */
1134		if (lp->b_off == 0)
1135			lp->f_lsn = *lsn;
1136
1137		/*
1138		 * If we're on a buffer boundary and the data is big enough,
1139		 * copy as many records as we can directly from the data.
1140		 */
1141		if (lp->b_off == 0 && len >= bsize) {
1142			nrec = len / bsize;
1143			if ((ret = __log_write(dblp, addr, nrec * bsize)) != 0)
1144				return (ret);
1145			addr = (u_int8_t *)addr + nrec * bsize;
1146			len -= nrec * bsize;
1147			STAT(++lp->stat.st_wcount_fill);
1148			continue;
1149		}
1150
1151		/* Figure out how many bytes we can copy this time. */
1152		remain = bsize - lp->b_off;
1153		nw = remain > len ? len : remain;
1154		memcpy(dblp->bufp + lp->b_off, addr, nw);
1155		addr = (u_int8_t *)addr + nw;
1156		len -= (u_int32_t)nw;
1157		lp->b_off += nw;
1158
1159		/* If we fill the buffer, flush it. */
1160		if (lp->b_off == bsize) {
1161			if ((ret = __log_write(dblp, dblp->bufp, bsize)) != 0)
1162				return (ret);
1163			lp->b_off = 0;
1164			STAT(++lp->stat.st_wcount_fill);
1165		}
1166	}
1167	return (0);
1168}
1169
1170/*
1171 * __log_write --
1172 *	Write the log buffer to disk.
1173 */
1174static int
1175__log_write(dblp, addr, len)
1176	DB_LOG *dblp;
1177	void *addr;
1178	u_int32_t len;
1179{
1180	ENV *env;
1181	LOG *lp;
1182	size_t nw;
1183	int ret;
1184
1185	env = dblp->env;
1186	lp = dblp->reginfo.primary;
1187
1188	DB_ASSERT(env, !lp->db_log_inmemory);
1189
1190	/*
1191	 * If we haven't opened the log file yet or the current one has
1192	 * changed, acquire a new log file.  We are creating the file if we're
1193	 * about to write to the start of it, in other words, if the write
1194	 * offset is zero.
1195	 */
1196	if (dblp->lfhp == NULL || dblp->lfname != lp->lsn.file ||
1197	    dblp->lf_timestamp != lp->timestamp)
1198		if ((ret = __log_newfh(dblp, lp->w_off == 0)) != 0)
1199			return (ret);
1200
1201	/*
1202	 * If we're writing the first block in a log file on a filesystem that
1203	 * guarantees unwritten blocks are zero-filled, we set the size of the
1204	 * file in advance.  This increases sync performance on some systems,
1205	 * because they don't need to update metadata on every sync.
1206	 *
1207	 * Ignore any error -- we may have run out of disk space, but that's no
1208	 * reason to quit.
1209	 */
1210#ifdef HAVE_FILESYSTEM_NOTZERO
1211	if (lp->w_off == 0 && !__os_fs_notzero()) {
1212#else
1213	if (lp->w_off == 0) {
1214#endif
1215		(void)__db_file_extend(env, dblp->lfhp, lp->log_size);
1216		if (F_ISSET(dblp, DBLOG_ZERO))
1217			(void)__db_zero_extend(env, dblp->lfhp,
1218			     0, lp->log_size/lp->buffer_size, lp->buffer_size);
1219
1220	}
1221
1222	/*
1223	 * Seek to the offset in the file (someone may have written it
1224	 * since we last did).
1225	 */
1226	if ((ret = __os_io(env, DB_IO_WRITE,
1227	    dblp->lfhp, 0, 0, lp->w_off, len, addr, &nw)) != 0)
1228		return (ret);
1229
1230	/* Reset the buffer offset and update the seek offset. */
1231	lp->w_off += len;
1232
1233	/* Update written statistics. */
1234	if ((lp->stat.st_wc_bytes += len) >= MEGABYTE) {
1235		lp->stat.st_wc_bytes -= MEGABYTE;
1236		++lp->stat.st_wc_mbytes;
1237	}
1238#ifdef HAVE_STATISTICS
1239	if ((lp->stat.st_w_bytes += len) >= MEGABYTE) {
1240		lp->stat.st_w_bytes -= MEGABYTE;
1241		++lp->stat.st_w_mbytes;
1242	}
1243	++lp->stat.st_wcount;
1244#endif
1245
1246	return (0);
1247}
1248
1249/*
1250 * __log_file_pp --
1251 *	ENV->log_file pre/post processing.
1252 *
1253 * PUBLIC: int __log_file_pp __P((DB_ENV *, const DB_LSN *, char *, size_t));
1254 */
1255int
1256__log_file_pp(dbenv, lsn, namep, len)
1257	DB_ENV *dbenv;
1258	const DB_LSN *lsn;
1259	char *namep;
1260	size_t len;
1261{
1262	DB_THREAD_INFO *ip;
1263	ENV *env;
1264	int ret, set;
1265
1266	env = dbenv->env;
1267
1268	ENV_REQUIRES_CONFIG(env,
1269	    env->lg_handle, "DB_ENV->log_file", DB_INIT_LOG);
1270
1271	if ((ret = __log_get_config(dbenv, DB_LOG_IN_MEMORY, &set)) != 0)
1272		return (ret);
1273	if (set) {
1274		__db_errx(env,
1275		    "DB_ENV->log_file is illegal with in-memory logs");
1276		return (EINVAL);
1277	}
1278
1279	ENV_ENTER(env, ip);
1280	REPLICATION_WRAP(env, (__log_file(env, lsn, namep, len)), 0, ret);
1281	ENV_LEAVE(env, ip);
1282	return (ret);
1283}
1284
1285/*
1286 * __log_file --
1287 *	ENV->log_file.
1288 */
1289static int
1290__log_file(env, lsn, namep, len)
1291	ENV *env;
1292	const DB_LSN *lsn;
1293	char *namep;
1294	size_t len;
1295{
1296	DB_LOG *dblp;
1297	int ret;
1298	char *name;
1299
1300	dblp = env->lg_handle;
1301	LOG_SYSTEM_LOCK(env);
1302	ret = __log_name(dblp, lsn->file, &name, NULL, 0);
1303	LOG_SYSTEM_UNLOCK(env);
1304	if (ret != 0)
1305		return (ret);
1306
1307	/* Check to make sure there's enough room and copy the name. */
1308	if (len < strlen(name) + 1) {
1309		*namep = '\0';
1310		__db_errx(env, "DB_ENV->log_file: name buffer is too short");
1311		return (EINVAL);
1312	}
1313	(void)strcpy(namep, name);
1314	__os_free(env, name);
1315
1316	return (0);
1317}
1318
1319/*
1320 * __log_newfh --
1321 *	Acquire a file handle for the current log file.
1322 */
1323static int
1324__log_newfh(dblp, create)
1325	DB_LOG *dblp;
1326	int create;
1327{
1328	ENV *env;
1329	LOG *lp;
1330	u_int32_t flags;
1331	int ret;
1332	logfile_validity status;
1333
1334	env = dblp->env;
1335	lp = dblp->reginfo.primary;
1336
1337	/* Close any previous file descriptor. */
1338	if (dblp->lfhp != NULL) {
1339		(void)__os_closehandle(env, dblp->lfhp);
1340		dblp->lfhp = NULL;
1341	}
1342
1343	flags = DB_OSO_SEQ |
1344	    (create ? DB_OSO_CREATE : 0) |
1345	    (F_ISSET(dblp, DBLOG_DIRECT) ? DB_OSO_DIRECT : 0) |
1346	    (F_ISSET(dblp, DBLOG_DSYNC) ? DB_OSO_DSYNC : 0);
1347
1348	/* Get the path of the new file and open it. */
1349	dblp->lfname = lp->lsn.file;
1350	if ((ret = __log_valid(dblp, dblp->lfname, 0, &dblp->lfhp,
1351	    flags, &status, NULL)) != 0)
1352		__db_err(env, ret,
1353		    "DB_ENV->log_newfh: %lu", (u_long)lp->lsn.file);
1354	else if (status != DB_LV_NORMAL && status != DB_LV_INCOMPLETE &&
1355	    status != DB_LV_OLD_READABLE)
1356		ret = DB_NOTFOUND;
1357
1358	return (ret);
1359}
1360
1361/*
1362 * __log_name --
1363 *	Return the log name for a particular file, and optionally open it.
1364 *
1365 * PUBLIC: int __log_name __P((DB_LOG *,
1366 * PUBLIC:     u_int32_t, char **, DB_FH **, u_int32_t));
1367 */
1368int
1369__log_name(dblp, filenumber, namep, fhpp, flags)
1370	DB_LOG *dblp;
1371	u_int32_t filenumber, flags;
1372	char **namep;
1373	DB_FH **fhpp;
1374{
1375	ENV *env;
1376	LOG *lp;
1377	int mode, ret;
1378	char *oname;
1379	char old[sizeof(LFPREFIX) + 5 + 20], new[sizeof(LFPREFIX) + 10 + 20];
1380
1381	env = dblp->env;
1382	lp = dblp->reginfo.primary;
1383
1384	DB_ASSERT(env, !lp->db_log_inmemory);
1385
1386	/*
1387	 * !!!
1388	 * The semantics of this routine are bizarre.
1389	 *
1390	 * The reason for all of this is that we need a place where we can
1391	 * intercept requests for log files, and, if appropriate, check for
1392	 * both the old-style and new-style log file names.  The trick is
1393	 * that all callers of this routine that are opening the log file
1394	 * read-only want to use an old-style file name if they can't find
1395	 * a match using a new-style name.  The only down-side is that some
1396	 * callers may check for the old-style when they really don't need
1397	 * to, but that shouldn't mess up anything, and we only check for
1398	 * the old-style name when we've already failed to find a new-style
1399	 * one.
1400	 *
1401	 * Create a new-style file name, and if we're not going to open the
1402	 * file, return regardless.
1403	 */
1404	(void)snprintf(new, sizeof(new), LFNAME, filenumber);
1405	if ((ret = __db_appname(env,
1406	    DB_APP_LOG, new, 0, NULL, namep)) != 0 || fhpp == NULL)
1407		return (ret);
1408
1409	/* The application may have specified an absolute file mode. */
1410	if (lp->filemode == 0)
1411		mode = env->db_mode;
1412	else {
1413		LF_SET(DB_OSO_ABSMODE);
1414		mode = lp->filemode;
1415	}
1416
1417	/* Open the new-style file -- if we succeed, we're done. */
1418	dblp->lf_timestamp = lp->timestamp;
1419	if ((ret = __os_open(env, *namep, 0, flags, mode, fhpp)) == 0)
1420		return (0);
1421
1422	/*
1423	 * If the open failed for reason other than the file
1424	 * not being there, complain loudly, the wrong user
1425	 * probably started up the application.
1426	 */
1427	if (ret != ENOENT) {
1428		__db_err(env, ret, "%s: log file unreadable", *namep);
1429		return (__env_panic(env, ret));
1430	}
1431
1432	/*
1433	 * The open failed... if the DB_RDONLY flag isn't set, we're done,
1434	 * the caller isn't interested in old-style files.
1435	 */
1436	if (!LF_ISSET(DB_OSO_RDONLY)) {
1437		__db_err(env, ret, "%s: log file open failed", *namep);
1438		return (__env_panic(env, ret));
1439	}
1440
1441	/* Create an old-style file name. */
1442	(void)snprintf(old, sizeof(old), LFNAME_V1, filenumber);
1443	if ((ret = __db_appname(env, DB_APP_LOG, old, 0, NULL, &oname)) != 0)
1444		goto err;
1445
1446	/*
1447	 * Open the old-style file -- if we succeed, we're done.  Free the
1448	 * space allocated for the new-style name and return the old-style
1449	 * name to the caller.
1450	 */
1451	if ((ret = __os_open(env, oname, 0, flags, mode, fhpp)) == 0) {
1452		__os_free(env, *namep);
1453		*namep = oname;
1454		return (0);
1455	}
1456
1457	/*
1458	 * Couldn't find either style of name -- return the new-style name
1459	 * for the caller's error message.  If it's an old-style name that's
1460	 * actually missing we're going to confuse the user with the error
1461	 * message, but that implies that not only were we looking for an
1462	 * old-style name, but we expected it to exist and we weren't just
1463	 * looking for any log file.  That's not a likely error.
1464	 */
1465err:	__os_free(env, oname);
1466	return (ret);
1467}
1468
1469/*
1470 * __log_rep_put --
1471 *	Short-circuit way for replication clients to put records into the
1472 * log.  Replication clients' logs need to be laid out exactly as their masters'
1473 * are, so we let replication take responsibility for when the log gets
1474 * flushed, when log switches files, etc.  This is just a thin PUBLIC wrapper
1475 * for __log_putr with a slightly prettier interface.
1476 *
1477 * Note that the REP->mtx_clientdb should be held when this is called.
1478 * Note that we acquire the log region mutex while holding mtx_clientdb.
1479 *
1480 * PUBLIC: int __log_rep_put __P((ENV *, DB_LSN *, const DBT *, u_int32_t));
1481 */
1482int
1483__log_rep_put(env, lsnp, rec, flags)
1484	ENV *env;
1485	DB_LSN *lsnp;
1486	const DBT *rec;
1487	u_int32_t flags;
1488{
1489	DBT *dbt, t;
1490	DB_CIPHER *db_cipher;
1491	DB_LOG *dblp;
1492	HDR hdr;
1493	LOG *lp;
1494	int need_free, ret;
1495
1496	dblp = env->lg_handle;
1497	lp = dblp->reginfo.primary;
1498
1499	LOG_SYSTEM_LOCK(env);
1500	memset(&hdr, 0, sizeof(HDR));
1501	t = *rec;
1502	dbt = &t;
1503	need_free = 0;
1504	db_cipher = env->crypto_handle;
1505	if (CRYPTO_ON(env))
1506		t.size += db_cipher->adj_size(rec->size);
1507	if ((ret = __os_calloc(env, 1, t.size, &t.data)) != 0)
1508		goto err;
1509	need_free = 1;
1510	memcpy(t.data, rec->data, rec->size);
1511
1512	if ((ret = __log_encrypt_record(env, dbt, &hdr, rec->size)) != 0)
1513		goto err;
1514	__db_chksum(&hdr, t.data, t.size,
1515	    (CRYPTO_ON(env)) ? db_cipher->mac_key : NULL, hdr.chksum);
1516
1517	DB_ASSERT(env, LOG_COMPARE(lsnp, &lp->lsn) == 0);
1518	ret = __log_putr(dblp, lsnp, dbt, lp->lsn.offset - lp->len, &hdr);
1519err:
1520	/*
1521	 * !!! Assume caller holds REP->mtx_clientdb to modify ready_lsn.
1522	 */
1523	lp->ready_lsn = lp->lsn;
1524
1525	if (LF_ISSET(DB_LOG_CHKPNT))
1526		lp->stat.st_wc_bytes = lp->stat.st_wc_mbytes = 0;
1527
1528	/* Increment count of records added to the log. */
1529	STAT(++lp->stat.st_record);
1530	LOG_SYSTEM_UNLOCK(env);
1531	if (need_free)
1532		__os_free(env, t.data);
1533	return (ret);
1534}
1535
1536static int
1537__log_encrypt_record(env, dbt, hdr, orig)
1538	ENV *env;
1539	DBT *dbt;
1540	HDR *hdr;
1541	u_int32_t orig;
1542{
1543	DB_CIPHER *db_cipher;
1544	int ret;
1545
1546	if (CRYPTO_ON(env)) {
1547		db_cipher = env->crypto_handle;
1548		hdr->size = HDR_CRYPTO_SZ;
1549		hdr->orig_size = orig;
1550		if ((ret = db_cipher->encrypt(env, db_cipher->data,
1551		    hdr->iv, dbt->data, dbt->size)) != 0)
1552			return (ret);
1553	} else {
1554		hdr->size = HDR_NORMAL_SZ;
1555	}
1556	return (0);
1557}
1558