1/*-
2 * See the file LICENSE for redistribution information.
3 *
4 * Copyright (c) 1996, 1997, 1998
5 *	Sleepycat Software.  All rights reserved.
6 */
7#include "config.h"
8
9#ifndef lint
10static const char sccsid[] = "@(#)log_put.c	10.44 (Sleepycat) 11/3/98";
11#endif /* not lint */
12
13#ifndef NO_SYSTEM_INCLUDES
14#include <sys/types.h>
15
16#include <errno.h>
17#include <stdio.h>
18#include <string.h>
19#include <time.h>
20#include <unistd.h>
21#endif
22
23#include "db_int.h"
24#include "shqueue.h"
25#include "db_page.h"
26#include "log.h"
27#include "hash.h"
28#include "clib_ext.h"
29#include "common_ext.h"
30
31static int __log_fill __P((DB_LOG *, DB_LSN *, void *, u_int32_t));
32static int __log_flush __P((DB_LOG *, const DB_LSN *));
33static int __log_newfd __P((DB_LOG *));
34static int __log_putr __P((DB_LOG *, DB_LSN *, const DBT *, u_int32_t));
35static int __log_write __P((DB_LOG *, void *, u_int32_t));
36
37/*
38 * log_put --
39 *	Write a log record.
40 */
41int
42log_put(dblp, lsn, dbt, flags)
43	DB_LOG *dblp;
44	DB_LSN *lsn;
45	const DBT *dbt;
46	u_int32_t flags;
47{
48	int ret;
49
50	LOG_PANIC_CHECK(dblp);
51
52	/* Validate arguments. */
53	if (flags != 0 && flags != DB_CHECKPOINT &&
54	    flags != DB_CURLSN && flags != DB_FLUSH)
55		return (__db_ferr(dblp->dbenv, "log_put", 0));
56
57	LOCK_LOGREGION(dblp);
58	ret = __log_put(dblp, lsn, dbt, flags);
59	UNLOCK_LOGREGION(dblp);
60	return (ret);
61}
62
63/*
64 * __log_put --
65 *	Write a log record; internal version.
66 *
67 * PUBLIC: int __log_put __P((DB_LOG *, DB_LSN *, const DBT *, u_int32_t));
68 */
69int
70__log_put(dblp, lsn, dbt, flags)
71	DB_LOG *dblp;
72	DB_LSN *lsn;
73	const DBT *dbt;
74	u_int32_t flags;
75{
76	DBT fid_dbt, t;
77	DB_LSN r_unused;
78	FNAME *fnp;
79	LOG *lp;
80	u_int32_t lastoff;
81	int ret;
82
83	lp = dblp->lp;
84
85	/*
86	 * If the application just wants to know where we are, fill in
87	 * the information.  Currently used by the transaction manager
88	 * to avoid writing TXN_begin records.
89	 */
90	if (flags == DB_CURLSN) {
91		lsn->file = lp->lsn.file;
92		lsn->offset = lp->lsn.offset;
93		return (0);
94	}
95
96	/* If this information won't fit in the file, swap files. */
97	if (lp->lsn.offset + sizeof(HDR) + dbt->size > lp->persist.lg_max) {
98		if (sizeof(HDR) +
99		    sizeof(LOGP) + dbt->size > lp->persist.lg_max) {
100			__db_err(dblp->dbenv,
101			    "log_put: record larger than maximum file size");
102			return (EINVAL);
103		}
104
105		/* Flush the log. */
106		if ((ret = __log_flush(dblp, NULL)) != 0)
107			return (ret);
108
109		/*
110		 * Save the last known offset from the previous file, we'll
111		 * need it to initialize the persistent header information.
112		 */
113		lastoff = lp->lsn.offset;
114
115		/* Point the current LSN to the new file. */
116		++lp->lsn.file;
117		lp->lsn.offset = 0;
118
119		/* Reset the file write offset. */
120		lp->w_off = 0;
121	} else
122		lastoff = 0;
123
124	/* Initialize the LSN information returned to the user. */
125	lsn->file = lp->lsn.file;
126	lsn->offset = lp->lsn.offset;
127
128	/*
129	 * Insert persistent information as the first record in every file.
130	 * Note that the previous length is wrong for the very first record
131	 * of the log, but that's okay, we check for it during retrieval.
132	 */
133	if (lp->lsn.offset == 0) {
134		t.data = &lp->persist;
135		t.size = sizeof(LOGP);
136		if ((ret = __log_putr(dblp, lsn,
137		    &t, lastoff == 0 ? 0 : lastoff - lp->len)) != 0)
138			return (ret);
139
140		/* Update the LSN information returned to the user. */
141		lsn->file = lp->lsn.file;
142		lsn->offset = lp->lsn.offset;
143	}
144
145	/* Write the application's log record. */
146	if ((ret = __log_putr(dblp, lsn, dbt, lp->lsn.offset - lp->len)) != 0)
147		return (ret);
148
149	/*
150	 * On a checkpoint, we:
151	 *	Put out the checkpoint record (above).
152	 *	Save the LSN of the checkpoint in the shared region.
153	 *	Append the set of file name information into the log.
154	 */
155	if (flags == DB_CHECKPOINT) {
156		lp->chkpt_lsn = *lsn;
157
158		for (fnp = SH_TAILQ_FIRST(&dblp->lp->fq, __fname);
159		    fnp != NULL; fnp = SH_TAILQ_NEXT(fnp, q, __fname)) {
160			if (fnp->ref == 0)	/* Entry not in use. */
161				continue;
162			memset(&t, 0, sizeof(t));
163			t.data = R_ADDR(dblp, fnp->name_off);
164			t.size = strlen(t.data) + 1;
165			memset(&fid_dbt, 0, sizeof(fid_dbt));
166			fid_dbt.data = fnp->ufid;
167			fid_dbt.size = DB_FILE_ID_LEN;
168			if ((ret = __log_register_log(dblp, NULL, &r_unused, 0,
169			    LOG_CHECKPOINT, &t, &fid_dbt, fnp->id, fnp->s_type))
170			    != 0)
171				return (ret);
172		}
173	}
174
175	/*
176	 * On a checkpoint or when flush is requested, we:
177	 *	Flush the current buffer contents to disk.
178	 *	Sync the log to disk.
179	 */
180	if (flags == DB_FLUSH || flags == DB_CHECKPOINT)
181		if ((ret = __log_flush(dblp, NULL)) != 0)
182			return (ret);
183
184	/*
185	 * On a checkpoint, we:
186	 *	Save the time the checkpoint was written.
187	 *	Reset the bytes written since the last checkpoint.
188	 */
189	if (flags == DB_CHECKPOINT) {
190		(void)time(&lp->chkpt);
191		lp->stat.st_wc_bytes = lp->stat.st_wc_mbytes = 0;
192	}
193	return (0);
194}
195
196/*
197 * __log_putr --
198 *	Actually put a record into the log.
199 */
200static int
201__log_putr(dblp, lsn, dbt, prev)
202	DB_LOG *dblp;
203	DB_LSN *lsn;
204	const DBT *dbt;
205	u_int32_t prev;
206{
207	HDR hdr;
208	LOG *lp;
209	int ret;
210
211	lp = dblp->lp;
212
213	/*
214	 * Initialize the header.  If we just switched files, lsn.offset will
215	 * be 0, and what we really want is the offset of the previous record
216	 * in the previous file.  Fortunately, prev holds the value we want.
217	 */
218	hdr.prev = prev;
219	hdr.len = sizeof(HDR) + dbt->size;
220	hdr.cksum = __ham_func4(dbt->data, dbt->size);
221
222	if ((ret = __log_fill(dblp, lsn, &hdr, sizeof(HDR))) != 0)
223		return (ret);
224	lp->len = sizeof(HDR);
225	lp->lsn.offset += sizeof(HDR);
226
227	if ((ret = __log_fill(dblp, lsn, dbt->data, dbt->size)) != 0)
228		return (ret);
229	lp->len += dbt->size;
230	lp->lsn.offset += dbt->size;
231	return (0);
232}
233
234/*
235 * log_flush --
236 *	Write all records less than or equal to the specified LSN.
237 */
238int
239log_flush(dblp, lsn)
240	DB_LOG *dblp;
241	const DB_LSN *lsn;
242{
243	int ret;
244
245	LOG_PANIC_CHECK(dblp);
246
247	LOCK_LOGREGION(dblp);
248	ret = __log_flush(dblp, lsn);
249	UNLOCK_LOGREGION(dblp);
250	return (ret);
251}
252
253/*
254 * __log_flush --
255 *	Write all records less than or equal to the specified LSN; internal
256 *	version.
257 */
258static int
259__log_flush(dblp, lsn)
260	DB_LOG *dblp;
261	const DB_LSN *lsn;
262{
263	DB_LSN t_lsn;
264	LOG *lp;
265	int current, ret;
266
267	ret = 0;
268	lp = dblp->lp;
269
270	/*
271	 * If no LSN specified, flush the entire log by setting the flush LSN
272	 * to the last LSN written in the log.  Otherwise, check that the LSN
273	 * isn't a non-existent record for the log.
274	 */
275	if (lsn == NULL) {
276		t_lsn.file = lp->lsn.file;
277		t_lsn.offset = lp->lsn.offset - lp->len;
278		lsn = &t_lsn;
279	} else
280		if (lsn->file > lp->lsn.file ||
281		    (lsn->file == lp->lsn.file &&
282		    lsn->offset > lp->lsn.offset - lp->len)) {
283			__db_err(dblp->dbenv,
284			    "log_flush: LSN past current end-of-log");
285			return (EINVAL);
286		}
287
288	/*
289	 * If the LSN is less than the last-sync'd LSN, we're done.  Note,
290	 * the last-sync LSN saved in s_lsn is the LSN of the first byte
291	 * we absolutely know has been written to disk, so the test is <=.
292	 */
293	if (lsn->file < lp->s_lsn.file ||
294	    (lsn->file == lp->s_lsn.file && lsn->offset <= lp->s_lsn.offset))
295		return (0);
296
297	/*
298	 * We may need to write the current buffer.  We have to write the
299	 * current buffer if the flush LSN is greater than or equal to the
300	 * buffer's starting LSN.
301	 */
302	current = 0;
303	if (lp->b_off != 0 && log_compare(lsn, &lp->f_lsn) >= 0) {
304		if ((ret = __log_write(dblp, lp->buf, lp->b_off)) != 0)
305			return (ret);
306
307		lp->b_off = 0;
308		current = 1;
309	}
310
311	/*
312	 * It's possible that this thread may never have written to this log
313	 * file.  Acquire a file descriptor if we don't already have one.
314	 */
315	if (dblp->lfname != dblp->lp->lsn.file)
316		if ((ret = __log_newfd(dblp)) != 0)
317			return (ret);
318
319	/* Sync all writes to disk. */
320	if ((ret = __os_fsync(dblp->lfd)) != 0) {
321		__db_panic(dblp->dbenv, ret);
322		return (ret);
323	}
324	++lp->stat.st_scount;
325
326	/*
327	 * Set the last-synced LSN, using the LSN of the current buffer.  If
328	 * the current buffer was flushed, we know the LSN of the first byte
329	 * of the buffer is on disk, otherwise, we only know that the LSN of
330	 * the record before the one beginning the current buffer is on disk.
331	 *
332	 * XXX
333	 * Check to make sure that the saved lsn isn't 0 before we go making
334	 * this change.  If DB_CHECKPOINT was called before we actually wrote
335	 * something, you can end up here without ever having written anything
336	 * to a log file, and decrementing either s_lsn.file or s_lsn.offset
337	 * will cause much sadness later on.
338	 */
339	lp->s_lsn = lp->f_lsn;
340	if (!current && lp->s_lsn.file != 0)
341		if (lp->s_lsn.offset == 0) {
342			--lp->s_lsn.file;
343			lp->s_lsn.offset = lp->persist.lg_max;
344		} else
345			--lp->s_lsn.offset;
346
347	return (0);
348}
349
350/*
351 * __log_fill --
352 *	Write information into the log.
353 */
354static int
355__log_fill(dblp, lsn, addr, len)
356	DB_LOG *dblp;
357	DB_LSN *lsn;
358	void *addr;
359	u_int32_t len;
360{
361	LOG *lp;
362	u_int32_t nrec;
363	size_t nw, remain;
364	int ret;
365
366	/* Copy out the data. */
367	for (lp = dblp->lp; len > 0;) {
368		/*
369		 * If we're beginning a new buffer, note the user LSN to which
370		 * the first byte of the buffer belongs.  We have to know this
371		 * when flushing the buffer so that we know if the in-memory
372		 * buffer needs to be flushed.
373		 */
374		if (lp->b_off == 0)
375			lp->f_lsn = *lsn;
376
377		/*
378		 * If we're on a buffer boundary and the data is big enough,
379		 * copy as many records as we can directly from the data.
380		 */
381		if (lp->b_off == 0 && len >= sizeof(lp->buf)) {
382			nrec = len / sizeof(lp->buf);
383			if ((ret = __log_write(dblp,
384			    addr, nrec * sizeof(lp->buf))) != 0)
385				return (ret);
386			addr = (u_int8_t *)addr + nrec * sizeof(lp->buf);
387			len -= nrec * sizeof(lp->buf);
388			continue;
389		}
390
391		/* Figure out how many bytes we can copy this time. */
392		remain = sizeof(lp->buf) - lp->b_off;
393		nw = remain > len ? len : remain;
394		memcpy(lp->buf + lp->b_off, addr, nw);
395		addr = (u_int8_t *)addr + nw;
396		len -= nw;
397		lp->b_off += nw;
398
399		/* If we fill the buffer, flush it. */
400		if (lp->b_off == sizeof(lp->buf)) {
401			if ((ret =
402			    __log_write(dblp, lp->buf, sizeof(lp->buf))) != 0)
403				return (ret);
404			lp->b_off = 0;
405		}
406	}
407	return (0);
408}
409
410/*
411 * __log_write --
412 *	Write the log buffer to disk.
413 */
414static int
415__log_write(dblp, addr, len)
416	DB_LOG *dblp;
417	void *addr;
418	u_int32_t len;
419{
420	LOG *lp;
421	ssize_t nw;
422	int ret;
423
424	/*
425	 * If we haven't opened the log file yet or the current one
426	 * has changed, acquire a new log file.
427	 */
428	lp = dblp->lp;
429	if (dblp->lfd == -1 || dblp->lfname != lp->lsn.file)
430		if ((ret = __log_newfd(dblp)) != 0)
431			return (ret);
432
433	/*
434	 * Seek to the offset in the file (someone may have written it
435	 * since we last did).
436	 */
437	if ((ret = __os_seek(dblp->lfd, 0, 0, lp->w_off, 0, SEEK_SET)) != 0 ||
438	    (ret = __os_write(dblp->lfd, addr, len, &nw)) != 0) {
439		__db_panic(dblp->dbenv, ret);
440		return (ret);
441	}
442	if (nw != (int32_t)len)
443		return (EIO);
444
445	/* Reset the buffer offset and update the seek offset. */
446	lp->w_off += len;
447
448	/* Update written statistics. */
449	if ((lp->stat.st_w_bytes += len) >= MEGABYTE) {
450		lp->stat.st_w_bytes -= MEGABYTE;
451		++lp->stat.st_w_mbytes;
452	}
453	if ((lp->stat.st_wc_bytes += len) >= MEGABYTE) {
454		lp->stat.st_wc_bytes -= MEGABYTE;
455		++lp->stat.st_wc_mbytes;
456	}
457	++lp->stat.st_wcount;
458
459	return (0);
460}
461
462/*
463 * log_file --
464 *	Map a DB_LSN to a file name.
465 */
466int
467log_file(dblp, lsn, namep, len)
468	DB_LOG *dblp;
469	const DB_LSN *lsn;
470	char *namep;
471	size_t len;
472{
473	int ret;
474	char *name;
475
476	LOG_PANIC_CHECK(dblp);
477
478	LOCK_LOGREGION(dblp);
479	ret = __log_name(dblp, lsn->file, &name, NULL, 0);
480	UNLOCK_LOGREGION(dblp);
481	if (ret != 0)
482		return (ret);
483
484	/* Check to make sure there's enough room and copy the name. */
485	if (len < strlen(name) + 1) {
486		*namep = '\0';
487		return (ENOMEM);
488	}
489	(void)strcpy(namep, name);
490	__os_freestr(name);
491
492	return (0);
493}
494
495/*
496 * __log_newfd --
497 *	Acquire a file descriptor for the current log file.
498 */
499static int
500__log_newfd(dblp)
501	DB_LOG *dblp;
502{
503	int ret;
504	char *name;
505
506	/* Close any previous file descriptor. */
507	if (dblp->lfd != -1) {
508		(void)__os_close(dblp->lfd);
509		dblp->lfd = -1;
510	}
511
512	/* Get the path of the new file and open it. */
513	dblp->lfname = dblp->lp->lsn.file;
514	if ((ret = __log_name(dblp,
515	    dblp->lfname, &name, &dblp->lfd, DB_CREATE | DB_SEQUENTIAL)) != 0)
516		__db_err(dblp->dbenv, "log_put: %s: %s", name, strerror(ret));
517
518	__os_freestr(name);
519	return (ret);
520}
521
522/*
523 * __log_name --
524 *	Return the log name for a particular file, and optionally open it.
525 *
526 * PUBLIC: int __log_name __P((DB_LOG *, u_int32_t, char **, int *, u_int32_t));
527 */
528int
529__log_name(dblp, filenumber, namep, fdp, flags)
530	DB_LOG *dblp;
531	u_int32_t filenumber, flags;
532	char **namep;
533	int *fdp;
534{
535	int ret;
536	char *oname;
537	char old[sizeof(LFPREFIX) + 5 + 20], new[sizeof(LFPREFIX) + 10 + 20];
538
539	/*
540	 * !!!
541	 * The semantics of this routine are bizarre.
542	 *
543	 * The reason for all of this is that we need a place where we can
544	 * intercept requests for log files, and, if appropriate, check for
545	 * both the old-style and new-style log file names.  The trick is
546	 * that all callers of this routine that are opening the log file
547	 * read-only want to use an old-style file name if they can't find
548	 * a match using a new-style name.  The only down-side is that some
549	 * callers may check for the old-style when they really don't need
550	 * to, but that shouldn't mess up anything, and we only check for
551	 * the old-style name when we've already failed to find a new-style
552	 * one.
553	 *
554	 * Create a new-style file name, and if we're not going to open the
555	 * file, return regardless.
556	 */
557	(void)snprintf(new, sizeof(new), LFNAME, filenumber);
558	if ((ret = __db_appname(dblp->dbenv,
559	    DB_APP_LOG, dblp->dir, new, 0, NULL, namep)) != 0 || fdp == NULL)
560		return (ret);
561
562	/* Open the new-style file -- if we succeed, we're done. */
563	if ((ret = __db_open(*namep,
564	    flags, flags, dblp->lp->persist.mode, fdp)) == 0)
565		return (0);
566
567	/*
568	 * The open failed... if the DB_RDONLY flag isn't set, we're done,
569	 * the caller isn't interested in old-style files.
570	 */
571	if (!LF_ISSET(DB_RDONLY))
572		return (ret);
573
574	/* Create an old-style file name. */
575	(void)snprintf(old, sizeof(old), LFNAME_V1, filenumber);
576	if ((ret = __db_appname(dblp->dbenv,
577	    DB_APP_LOG, dblp->dir, old, 0, NULL, &oname)) != 0)
578		goto err;
579
580	/*
581	 * Open the old-style file -- if we succeed, we're done.  Free the
582	 * space allocated for the new-style name and return the old-style
583	 * name to the caller.
584	 */
585	if ((ret = __db_open(oname,
586	    flags, flags, dblp->lp->persist.mode, fdp)) == 0) {
587		__os_freestr(*namep);
588		*namep = oname;
589		return (0);
590	}
591
592	/*
593	 * Couldn't find either style of name -- return the new-style name
594	 * for the caller's error message.  If it's an old-style name that's
595	 * actually missing we're going to confuse the user with the error
596	 * message, but that implies that not only were we looking for an
597	 * old-style name, but we expected it to exist and we weren't just
598	 * looking for any log file.  That's not a likely error.
599	 */
600err:	__os_freestr(oname);
601	return (ret);
602}
603